1 Introduction
In this part we package the network layers into reusable modules and then build a new, deeper network architecture on top of them.
2 Modularizing the code in the style of torch.nn
2.1 Modularization
Increase the input context length to 8 characters and refactor the earlier code into module classes.
import torch

# Near copy paste of the layers we have developed in Part 3
# -----------------------------------------------------------------------------------------------
class Linear:

    def __init__(self, fan_in, fan_out, bias=True):
        self.weight = torch.randn((fan_in, fan_out)) / fan_in**0.5  # note: kaiming init
        self.bias = torch.zeros(fan_out) if bias else None

    def __call__(self, x):
        self.out = x @ self.weight
        if self.bias is not None:
            self.out += self.bias
        return self.out

    def parameters(self):
        return [self.weight] + ([] if self.bias is None else [self.bias])

# -----------------------------------------------------------------------------------------------
class BatchNorm1d:

    def __init__(self, dim, eps=1e-5, momentum=0.1):
        self.eps = eps
        self.momentum = momentum
        self.training = True
        # parameters (trained with backprop)
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)
        # buffers (trained with a running 'momentum update')
        self.running_mean = torch.zeros(dim)
        self.running_var = torch.ones(dim)

    def __call__(self, x):
        # calculate the forward pass
        if self.training:
            if x.ndim == 2:
                dim = 0
            elif x.ndim == 3:
                dim = (0, 1)
            xmean = x.mean(dim, keepdim=True)  # batch mean
            xvar = x.var(dim, keepdim=True)    # batch variance
        else:
            xmean = self.running_mean
            xvar = self.running_var
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps)  # normalize to unit variance
        self.out = self.gamma * xhat + self.beta
        # update the buffers
        if self.training:
            with torch.no_grad():
                self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * xmean
                self.running_var = (1 - self.momentum) * self.running_var + self.momentum * xvar
        return self.out

    def parameters(self):
        return [self.gamma, self.beta]

# -----------------------------------------------------------------------------------------------
class Tanh:

    def __call__(self, x):
        self.out = torch.tanh(x)
        return self.out

    def parameters(self):
        return []

# -----------------------------------------------------------------------------------------------
class Embedding:

    def __init__(self, num_embeddings, embedding_dim):
        self.weight = torch.randn((num_embeddings, embedding_dim))

    def __call__(self, IX):
        self.out = self.weight[IX]
        return self.out

    def parameters(self):
        return [self.weight]


class Flatten:

    def __call__(self, x):
        self.out = x.view(x.shape[0], -1)
        return self.out

    def parameters(self):
        return []

# -----------------------------------------------------------------------------------------------
class FlattenConsecutive:

    def __init__(self, n):
        self.n = n

    def __call__(self, x):
        B, T, C = x.shape
        x = x.view(B, T//self.n, C*self.n)
        if x.shape[1] == 1:
            x = x.squeeze(1)
        self.out = x
        return self.out

    def parameters(self):
        return []

# -----------------------------------------------------------------------------------------------
class Sequential:

    def __init__(self, layers):
        self.layers = layers

    def __call__(self, x):
        for layer in self.layers:
            x = layer(x)
        self.out = x
        return self.out

    def parameters(self):
        # get parameters of all layers and stretch them out into one list
        return [p for layer in self.layers for p in layer.parameters()]
Define the network architecture.
block_size = 8
n_emb = 10
n_batch = 32
n_hidden = 200
g = torch.Generator().manual_seed(2147483647)
model = Sequential([
    Embedding(vocab_size, n_emb),
    Flatten(),
    Linear(n_emb * block_size, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
    Linear(n_hidden, vocab_size),
])
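For reference, these hand-rolled classes deliberately mimic the torch.nn API, so the same flat network could also be written directly with torch.nn. A minimal sketch, not used anywhere else in this post (nn_model is a hypothetical name; vocab_size and the hyperparameters above are assumed to be defined):

import torch.nn as nn

# Hypothetical torch.nn equivalent of the hand-rolled model above (sketch only)
nn_model = nn.Sequential(
    nn.Embedding(vocab_size, n_emb),
    nn.Flatten(),                                         # (B, 8, 10) -> (B, 80)
    nn.Linear(n_emb * block_size, n_hidden, bias=False),
    nn.BatchNorm1d(n_hidden),
    nn.Tanh(),
    nn.Linear(n_hidden, vocab_size),
)
print(sum(p.nelement() for p in nn_model.parameters()))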
with torch.no_grad():
    model.layers[-1].weight *= 0.1  # make the last layer less confident at init
print(sum(p.nelement() for p in model.parameters()))
for p in model.parameters():
    p.requires_grad = True

for layer in model.layers:
    if isinstance(layer, BatchNorm1d):
        layer.training = True
Train the network.
import torch.nn.functional as F
max_iter = 200000
lossi = []
ud = []
for i in range(max_iter):
    # minibatch construct
    ix = torch.randint(0, Xtr.shape[0], (n_batch,), generator=g)
    Xb, Yb = Xtr[ix], Ytr[ix]

    # forward pass
    logits = model(Xb)
    loss = F.cross_entropy(logits, Yb)

    # backward pass
    for p in model.parameters():
        p.grad = None
    loss.backward()

    # update: plain SGD with a step learning-rate decay
    lr = 0.1 if i < 100000 else 0.01
    for p in model.parameters():
        p.data -= lr * p.grad.data

    # track stats
    lossi.append(loss.item())
    with torch.no_grad():
        ud.append([((-lr * p.grad).std() / p.data.std()).log10().item() for p in model.parameters()])
    if i % 1000 == 0:
        print(f"Iteration: {i}/{max_iter}, Loss: {loss.item()}")
    # break
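The parameter update above is hand-written SGD. As a side note, the same update could be delegated to torch.optim; a sketch only (optimizer is a hypothetical refactor, not what the loop above runs, and the stat tracking is omitted):

# Hypothetical refactor of the update step using torch.optim (sketch only)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

for i in range(max_iter):
    ix = torch.randint(0, Xtr.shape[0], (n_batch,), generator=g)
    Xb, Yb = Xtr[ix], Ytr[ix]
    loss = F.cross_entropy(model(Xb), Yb)

    optimizer.zero_grad()
    loss.backward()
    if i == 100000:  # same step decay as in the manual loop
        for group in optimizer.param_groups:
            group['lr'] = 0.01
    optimizer.step()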
Plot the loss curve.
import matplotlib.pyplot as plt
# average the loss over consecutive chunks of 1000 iterations to smooth the curve
plt.plot(torch.tensor(lossi).view(-1, 1000).mean(dim=1, keepdim=False))
Compare the training and validation loss.
# put the BatchNorm layers into eval mode before measuring the loss,
# so that the running statistics are used instead of batch statistics
for layer in model.layers:
    if isinstance(layer, BatchNorm1d):
        layer.training = False

@torch.no_grad()
def batch_infer(datasets):
    X, Y = {
        'train': (Xtr, Ytr),
        'val':   (Xdev, Ydev),
        'test':  (Xte, Yte),
    }[datasets]
    logits = model(X)
    loss = F.cross_entropy(logits, Y)
    print(f'{datasets}, loss is: {loss}')
batch_infer('train')
batch_infer('val')
train, loss is: 1.926148533821106
val, loss is: 2.028862237930298
The network is now overfitting slightly: the training loss is noticeably lower than the validation loss.
Sample from the model to see what it generates:
for _ in range(20):
    context = [0] * block_size
    ch = []
    while True:
        X = torch.tensor([context])
        logits = model(X)
        probs = torch.softmax(logits, dim=-1).squeeze(0)
        ix = torch.multinomial(probs, num_samples=1).item()
        context = context[1:] + [ix]
        ch.append(itos[ix])
        if ix == 0:
            break
    print(''.join(ch))
quab.
nomawa.
brenne.
sevanille.
razlyn.
zile.
audaina.
zaralynn.
dawsyn.
wyle.
yalikbi.
zuria.
endrame.
mesty.
nooap.
dangele.
ellania.
bako.
memaisee.
zailan.
2.2 Adding the WaveNet structure
The figure illustrates the idea: two neighbouring positions are mapped by the same parameter matrix C (the weights are shared across positions), so the characters can be fused in pairs, level by level, instead of squashing the whole context in a single step.
First, note how the matrix multiplication behaves: a Linear layer applied to a 3-dimensional tensor of shape (B, T, C) multiplies only along the last dimension, while the leading B and T dimensions are carried along as batch dimensions.
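A minimal check of this behaviour (the tensors x and W are purely illustrative, with shapes chosen to match the network below):

x = torch.randn(4, 4, 20)   # (B, T, C): 4 examples, 4 positions, 20 features each
W = torch.randn(20, 200)    # weight matrix of a Linear(20, 200) layer
print((x @ W).shape)        # torch.Size([4, 4, 200]) -- only the last dim is transformed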
Now back to our problem: instead of flattening all 8 character embeddings into one long vector, we only flatten consecutive groups of 2, so the context is fused pairwise at each level of the network.
In code this is the FlattenConsecutive layer:
class FlattenConsecutive:

    def __init__(self, n):
        self.n = n

    def __call__(self, x):
        B, T, C = x.shape
        x = x.view(B, T//self.n, C*self.n)
        if x.shape[1] == 1:
            x = x.squeeze(1)
        self.out = x
        return self.out

    def parameters(self):
        return []
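To see what this does to the shapes, here is a small trace through one FlattenConsecutive(2) + Linear step, using the classes defined above (the tensors e, h and the layer objects are illustrative only):

e = torch.randn(32, 8, 24)   # (B, T, C): a batch of embedded 8-character contexts
f = FlattenConsecutive(2)
h = f(e)
print(h.shape)               # torch.Size([32, 4, 48]) -- neighbouring pairs concatenated
l = Linear(48, 128, bias=False)
print(l(h).shape)            # torch.Size([32, 4, 128]) -- the same Linear applied at every position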
Define the new, hierarchical network.
block_size = 8
n_emb = 24
n_batch = 32
n_hidden = 128
g = torch.Generator().manual_seed(2147483647)
model = Sequential([
    Embedding(vocab_size, n_emb),
    FlattenConsecutive(2), Linear(n_emb * 2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
    FlattenConsecutive(2), Linear(n_hidden * 2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
    FlattenConsecutive(2), Linear(n_hidden * 2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
    Linear(n_hidden, vocab_size),
])
with torch.no_grad():
    model.layers[-1].weight *= 0.1  # make the last layer less confident at init
print(sum(p.nelement() for p in model.parameters()))
for p in model.parameters():
    p.requires_grad = True

for layer in model.layers:
    if isinstance(layer, BatchNorm1d):
        layer.training = True
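With the model built, a quick sanity check of the shapes flowing through the hierarchy (assuming the training tensors Xtr are already loaded, as in the earlier training loop):

# run one small batch through the model and inspect each layer's output shape
ix = torch.randint(0, Xtr.shape[0], (32,), generator=g)
logits = model(Xtr[ix])
for layer in model.layers:
    print(layer.__class__.__name__, ':', tuple(layer.out.shape))
# e.g. Embedding : (32, 8, 24), FlattenConsecutive : (32, 4, 48), Linear : (32, 4, 128), ...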
One detail to watch out for: because the intermediate activations are now 3-dimensional (B, T, C), BatchNorm1d must compute its statistics over every dimension except the last, i.e. over dims (0, 1) rather than just dim 0; otherwise each position would get its own mean and variance and the running buffers would have the wrong shape.
class BatchNorm1d:

    def __init__(self, dim, eps=1e-5, momentum=0.1):
        self.eps = eps
        self.momentum = momentum
        self.training = True
        # parameters (trained with backprop)
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)
        # buffers (trained with a running 'momentum update')
        self.running_mean = torch.zeros(dim)
        self.running_var = torch.ones(dim)

    def __call__(self, x):
        # calculate the forward pass
        if self.training:
            if x.ndim == 2:
                dim = 0
            elif x.ndim == 3:
                dim = (0, 1)
            xmean = x.mean(dim, keepdim=True)  # batch mean
            xvar = x.var(dim, keepdim=True)    # batch variance
        else:
            xmean = self.running_mean
            xvar = self.running_var
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps)  # normalize to unit variance
        self.out = self.gamma * xhat + self.beta
        # update the buffers
        if self.training:
            with torch.no_grad():
                self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * xmean
                self.running_var = (1 - self.momentum) * self.running_var + self.momentum * xvar
        return self.out

    def parameters(self):
        return [self.gamma, self.beta]
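A quick way to confirm this (an illustrative check, assuming the training loop above has already run so the buffers have been updated): the running statistics of a BatchNorm1d layer inside the hierarchy should hold one value per channel, broadcast over batch and time.

# layers[3] is the first BatchNorm1d in the model defined above
bn = model.layers[3]
print(bn.running_mean.shape)  # torch.Size([1, 1, 128]) -- one statistic per channel
print(bn.running_var.shape)   # torch.Size([1, 1, 128])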
Everything else is the same as for the previous network. The final results:
train, loss is: 1.7904815673828125
val, loss is: 1.9868937730789185
sabris.
lilly.
pryce.
antwling.
lakelyn.
dayre.
theora.
hunna.
michael.
amillia.
zivy.
zuri.
florby.
jairael.
aiyank.
anahit.
madelynn.
briani.
payzleigh.
sola.
2.3 Convolution
Here we have only executed the code corresponding to the black part of the WaveNet figure: one tree per example, rooted at the last position. Sliding that same tree across every position of the input, re-using the shared weights and intermediate results, is exactly what a convolutional neural network does; executing it in full would give a convolutional network.
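As a rough illustration (a sketch using torch.nn, not part of the code above), the same pair-merging hierarchy can be written as dilated 1-D convolutions with kernel size 2, doubling the dilation at every level, which is the WaveNet formulation [1]. The layers here are freshly initialized and only demonstrate the shape arithmetic:

import torch
import torch.nn as nn

# Sketch: the FlattenConsecutive(2) + Linear hierarchy expressed as dilated convolutions
B, C, T = 32, 24, 8                  # batch, embedding channels, context length
x = torch.randn(B, C, T)             # Conv1d expects (batch, channels, time)
conv1 = nn.Conv1d(C,   128, kernel_size=2, dilation=1, bias=False)
conv2 = nn.Conv1d(128, 128, kernel_size=2, dilation=2, bias=False)
conv3 = nn.Conv1d(128, 128, kernel_size=2, dilation=4, bias=False)
h = conv3(conv2(conv1(x)))
print(h.shape)                       # torch.Size([32, 128, 1]) -- one fused vector per 8-character window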
References
[1] van den Oord et al., "WaveNet: A Generative Model for Raw Audio", DeepMind, 2016. https://arxiv.org/abs/1609.03499