Building the Transformer Architecture from Scratch (0 to 1)
Architecture diagram
This article mainly covers:
1) word embedding in the input layer
2) positional encoding in the input layer
3) multi-head attention in the encoder layer
4) the feed-forward network in the encoder layer
1) Word embedding in the input layer
# Shared imports for all code in this article
import copy
import math

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F


class Embeddings(nn.Module):
    """Embedding layer that maps token ids to dense vectors."""

    def __init__(self, d_model, vocab):
        # d_model: word embedding dimension
        # vocab: vocabulary size
        super(Embeddings, self).__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        # Scale the embeddings by sqrt(d_model), as in the original paper
        return self.lut(x) * math.sqrt(self.d_model)
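A minimal usage sketch, assuming the imports above; the vocabulary size, embedding dimension and token ids are made-up values for illustration, not fixed by the article:

vocab_size, d_model = 1000, 512                  # illustrative sizes
emb = Embeddings(d_model, vocab_size)

tokens = torch.randint(0, vocab_size, (2, 10))   # (batch=2, seq_len=10) token ids
print(emb(tokens).shape)                         # torch.Size([2, 10, 512])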
2) Positional encoding in the input layer
class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding."""

    def __init__(self, d_model, pad_size=5000):
        # d_model: word embedding dimension
        # pad_size: maximum sequence length covered by the precomputed table
        super(PositionalEncoding, self).__init__()
        self.d_model = d_model
        self.pad_size = pad_size
        pe = torch.zeros(pad_size, d_model)
        for t in range(pad_size):
            for i in range(d_model // 2):
                angle_rate = 1 / (10000 ** (2 * i / d_model))
                pe[t, 2 * i] = np.sin(t * angle_rate)       # even dimensions use sin
                pe[t, 2 * i + 1] = np.cos(t * angle_rate)   # odd dimensions use cos
        # # Vectorized version equivalent to the double loop above
        # pe = torch.tensor(
        #     [[pad / (10000.0 ** (i // 2 * 2.0 / d_model)) for i in range(d_model)] for pad in range(pad_size)])
        # pe[:, 0::2] = torch.sin(pe[:, 0::2])
        # pe[:, 1::2] = torch.cos(pe[:, 1::2])
        # Add a batch dimension so the table broadcasts over the batch
        pe = pe.unsqueeze(0)
        # Register the table as a buffer: it is not a parameter, so the optimizer never
        # updates it, but it is still saved and reloaded together with the model state.
        self.register_buffer('pe', pe)

    def forward(self, x):
        # The positional encoding is fixed and receives no gradient updates
        return x + self.pe[:, :x.size(1)]
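For position t and dimension pair (2i, 2i+1) the table stores PE(t, 2i) = sin(t / 10000^(2i/d_model)) and PE(t, 2i+1) = cos(t / 10000^(2i/d_model)). A short sketch of how the two input-layer modules chain together; the sizes are illustrative assumptions, and pad_size is kept small so the table builds quickly:

vocab_size, d_model = 1000, 512
emb = Embeddings(d_model, vocab_size)
pos_enc = PositionalEncoding(d_model, pad_size=60)   # small table for the demo

tokens = torch.randint(0, vocab_size, (2, 10))
x = pos_enc(emb(tokens))                             # embeddings + position information
print(x.shape)                                       # torch.Size([2, 10, 512])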
3) Multi-head attention in the encoder layer
Three helpers are needed first: the scaled dot-product attention function, a module clone function, and the sublayer connection (layer normalization + residual) wrapper.
def attention(q, k, v, dropout=None, mask=None):
    # Scaled dot-product attention: Attention(Q, K, V) = softmax(QK^T / sqrt(d_k)) V
    # d_k: dimension of each query/key vector
    d_k = q.shape[-1]
    score = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        # Masked positions get a large negative score, so softmax gives them ~0 weight
        score = score.masked_fill(mask == 0, -1e6)
    score = F.softmax(score, dim=-1)
    if dropout is not None:
        score = dropout(score)
    return torch.matmul(score, v), score


def clones(module, N):
    """
    :param module: the network module to copy
    :param N: number of copies
    """
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])


class SublayerConnection(nn.Module):
    """Sublayer connection: wraps the sublayer passed in (a callable / module instance).
    In the encoder layer the sublayer is multi-head attention or the feed-forward network;
    in the decoder layer it can also be masked multi-head attention.
    Processing order: layer norm -> sublayer (masked multi-head / multi-head / feed-forward) -> residual connection.
    """

    def __init__(self, d_k, dropout=0.1):
        super(SublayerConnection, self).__init__()
        self.norm = nn.LayerNorm(d_k)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, sublayer):
        # Normalize first, then let the concrete sublayer process the result
        out = sublayer(self.norm(x))
        out = self.dropout(out)
        # Residual connection
        return x + out
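A quick sanity check of attention() on random tensors, with shapes chosen only for the example: the output keeps the shape of the values, and positions where the mask is 0 receive (almost) zero attention weight.

q = k = v = torch.rand(2, 5, 64)               # (batch, seq_len, d_k)
mask = torch.ones(2, 5, 5)
mask[:, :, -1] = 0                             # forbid attending to the last position
out, weights = attention(q, k, v, mask=mask)
print(out.shape)                               # torch.Size([2, 5, 64])
print(weights[:, :, -1].max())                 # ~0: masked positions get no weight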
Multi-head attention
class MultiHeadAttention(nn.Module):
    """Multi-head attention."""

    def __init__(self, d_k, head_num, dropout=0.0):
        super(MultiHeadAttention, self).__init__()
        self.d_k = d_k
        self.head_num = head_num
        assert d_k % head_num == 0
        self.head_dim = d_k // head_num
        self.dropout = nn.Dropout(p=dropout)
        # Deep-copy 4 linear layers: 3 project Q, K and V, the last one maps the
        # concatenated heads back to the model dimension
        self.linears = clones(nn.Linear(d_k, d_k), 4)
        self.attn = None

    def forward(self, query, key, value, mask=None):
        if mask is not None:
            # Add a head dimension so the same mask broadcasts over every head
            mask = mask.unsqueeze(1)
        batch_size = query.size(0)
        # The three linear layers project the inputs into the latent space,
        # then each result is split into (batch, head, seq_len, head_dim)
        query, key, value = \
            [model(x).view(batch_size, -1, self.head_num, self.head_dim).transpose(1, 2)
             for model, x in zip(self.linears, (query, key, value))]
        score, self.attn = attention(query, key, value, dropout=self.dropout, mask=mask)
        # Merge the heads back: (batch, seq_len, head_num * head_dim)
        score = score.transpose(1, 2).contiguous().view(batch_size, -1, self.head_dim * self.head_num)
        return self.linears[-1](score)

    # # Another formulation that folds the heads into the batch dimension, which the
    # # original author suggests is easier to follow. Note: as written it reshapes
    # # without transposing first, so it does not split the heads the same way as
    # # forward(); it is kept only as a rough illustration.
    # def forward2(self, query, key, value, mask=None):
    #     if mask is not None:
    #         mask = mask.unsqueeze(0)
    #     batch_size = query.size(0)
    #     query, key, value = \
    #         [model(x).view(batch_size * self.head_num, -1, self.head_dim)
    #          for model, x in zip(self.linears, (query, key, value))]
    #     score, self.attn = attention(query, key, value, dropout=self.dropout, mask=mask)
    #     score = score.view(batch_size, -1, self.head_dim * self.head_num)
    #     return self.linears[-1](score)
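A brief shape check, with the head count and model dimension as illustrative assumptions: multi-head self-attention keeps the (batch, seq_len, d_model) shape of its input.

mha = MultiHeadAttention(d_k=512, head_num=8)
x = torch.rand(2, 10, 512)
mask = torch.ones(2, 10, 10)           # (batch, seq_q, seq_k), 1 = visible
print(mha(x, x, x, mask=mask).shape)   # torch.Size([2, 10, 512])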
4) Feed-forward network in the encoder layer
class PositionalWiseFeedForward(nn.Module):
    """Position-wise feed-forward network."""

    def __init__(self, d_k, hidden_size, dropout=0.1):
        super(PositionalWiseFeedForward, self).__init__()
        self.w1 = nn.Linear(d_k, hidden_size)
        self.w2 = nn.Linear(hidden_size, d_k)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        # Two linear transformations with a ReLU in between: w2(dropout(relu(w1(x))))
        out = self.w1(x)
        out = F.relu(out)
        out = self.dropout(out)
        return self.w2(out)
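A minimal check (the hidden size 2048 is just the common choice from the original paper, taken here as an assumption): the block expands to hidden_size internally but returns the model dimension unchanged.

ffn = PositionalWiseFeedForward(d_k=512, hidden_size=2048)
print(ffn(torch.rand(2, 10, 512)).shape)   # torch.Size([2, 10, 512])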
Encoder layer
class EncoderLayer(nn.Module):
    """Encoder layer: assembles multi-head attention and the feed-forward network
    using two sublayer connections."""

    def __init__(self, d_k, attn, feed_forward, dropout):
        """
        attn: multi-head attention instance
        feed_forward: feed-forward network instance
        dropout: dropout rate
        """
        super(EncoderLayer, self).__init__()
        self.attn = attn
        self.feed_forward = feed_forward
        # Clone 2 sublayer connections; the concrete sublayer (attention / feed-forward)
        # is specified at call time
        self.sublayer = clones(SublayerConnection(d_k, dropout), 2)
        # Keep the embedding dimension around for later use
        self.size = d_k

    def forward(self, x, mask):
        """Multi-head self-attention first, then the feed-forward network:
        the encoding order used by the Transformer."""
        x = self.sublayer[0](x, lambda x: self.attn(x, x, x, mask))
        return self.sublayer[1](x, self.feed_forward)
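A sketch of assembling one encoder layer from the pieces above; all hyperparameters are illustrative assumptions:

d_model, heads, d_ff, dropout = 512, 8, 2048, 0.1
layer = EncoderLayer(d_model,
                     MultiHeadAttention(d_model, heads, dropout),
                     PositionalWiseFeedForward(d_model, d_ff, dropout),
                     dropout)

x = torch.rand(2, 10, d_model)
mask = torch.ones(2, 10, 10)
print(layer(x, mask).shape)            # torch.Size([2, 10, 512])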
Encoder
class Encoder(nn.Module):
    """The encoder: a stack of N EncoderLayer modules."""

    def __init__(self, encoder_layer, N):
        super(Encoder, self).__init__()
        self.layers = clones(encoder_layer, N)
        # Final layer normalization; encoder_layer.size is the embedding dimension
        self.norm = nn.LayerNorm(encoder_layer.size)
        # # A hand-written LayerNorm class (not shown in this article) can be used instead:
        # # self.norm = LayerNorm(encoder_layer.size)

    def forward(self, x, mask=None):
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)
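Finally, an end-to-end sketch wiring the whole encoder side together; the vocabulary size, layer count N and the other hyperparameters are illustrative assumptions, roughly following the base configuration of the original paper:

vocab_size, d_model, heads, d_ff, N, dropout = 1000, 512, 8, 2048, 6, 0.1

emb = Embeddings(d_model, vocab_size)
pos_enc = PositionalEncoding(d_model, pad_size=60)   # small table for the demo
layer = EncoderLayer(d_model,
                     MultiHeadAttention(d_model, heads, dropout),
                     PositionalWiseFeedForward(d_model, d_ff, dropout),
                     dropout)
encoder = Encoder(layer, N)

tokens = torch.randint(0, vocab_size, (2, 10))       # dummy token ids
mask = torch.ones(2, 10, 10)                         # no padding in this toy batch
memory = encoder(pos_enc(emb(tokens)), mask)         # encoder output ("memory")
print(memory.shape)                                  # torch.Size([2, 10, 512])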