Produced by OpenAI. It targets a different task from Google's BERT family: natural language generation (NLG) vs. natural language understanding (NLU).
2. GPT-2
The model source code is available in Hugging Face's transformers library; it can be imported with: from transformers.models.gpt2 import GPT2Model.
Member fields
- wte, the word token embedding
- wpe, the word position embedding
- h, the model body: nn.ModuleList([GPT2Block(config, layer_idx=i) for i in range(config.num_hidden_layers)]) (see the sketch after this list)
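A minimal sketch of how these fields are wired together in GPT2Model.forward (a hypothetical simplification that ignores attention masks, past-key-value caching, and the various output options; drop, the embedding dropout, and ln_f, the final LayerNorm, are further member fields of GPT2Model not listed above):

import torch

def gpt2_forward_sketch(model, input_ids):
    # input_ids: (batch, seq_len) token indices
    seq_len = input_ids.size(1)
    position_ids = torch.arange(seq_len, device=input_ids.device).unsqueeze(0)

    # token embedding + position embedding, then embedding dropout
    hidden_states = model.wte(input_ids) + model.wpe(position_ids)
    hidden_states = model.drop(hidden_states)

    # run the stack of GPT2Block layers; each block returns a tuple whose
    # first element is the updated hidden states
    for block in model.h:
        hidden_states = block(hidden_states)[0]

    # final LayerNorm
    return model.ln_f(hidden_states)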
Conv1D
This is not torch's nn.Conv1d; it is the GPT family's own class of the same name.
class Conv1D(nn.Module):
    """
    1D-convolutional layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2).

    Basically works like a linear layer but the weights are transposed.

    Args:
        nf (`int`): The number of output features.
        nx (`int`): The number of input features.
    """

    def __init__(self, nf, nx):
        super().__init__()
        self.nf = nf
        w = torch.empty(nx, nf)
        nn.init.normal_(w, std=0.02)
        self.weight = nn.Parameter(w)
        self.bias = nn.Parameter(torch.zeros(nf))

    def forward(self, x):
        size_out = x.size()[:-1] + (self.nf,)
        x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight)
        x = x.view(size_out)
        return x
The addmm() call is an optimized version of the expression beta * mat + alpha * (mat1 @ mat2); with the default beta = alpha = 1 it is simply the affine map x @ W + b, i.e. kx + b.
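A quick sanity check (a standalone sketch, not library code) that Conv1D behaves like nn.Linear with a transposed weight matrix, using the Conv1D class shown above:

import torch
from torch import nn

x = torch.randn(2, 5, 768)        # (batch, seq_len, nx)
conv = Conv1D(nf=3072, nx=768)    # projects 768 -> 3072; weight stored as (nx, nf)

linear = nn.Linear(768, 3072)     # nn.Linear stores weight as (out_features, in_features)
with torch.no_grad():
    linear.weight.copy_(conv.weight.T)
    linear.bias.copy_(conv.bias)

# both compute x @ W + b, so the outputs match up to float rounding
assert torch.allclose(conv(x), linear(x), atol=1e-5)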
GPT2Attention
class GPT2Attention(nn.Module):
    # abridged excerpt: pruned-head logic, dropout, and the cross-attention branches are omitted
    def __init__(self, config, is_cross_attention=False, layer_idx=None):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        self.split_size = self.embed_dim
        self.is_cross_attention = is_cross_attention
        # omitted here: self.c_attn = Conv1D(3 * self.embed_dim, self.embed_dim),
        # which produces q, k, v in a single matmul
        self.c_proj = Conv1D(self.embed_dim, self.embed_dim)

    def _attn(self, query, key, value, attention_mask=None, head_mask=None):
        # the real implementation also scales the scores and applies the causal mask here
        attn_weights = torch.matmul(query, key.transpose(-1, -2))
        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
        attn_output = torch.matmul(attn_weights, value)
        return attn_output, attn_weights

    def _split_heads(self, tensor, num_heads, attn_head_size):
        """Splits hidden_size dim into attn_head_size and num_heads"""
        new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
        tensor = tensor.view(new_shape)
        return tensor.permute(0, 2, 1, 3)  # (batch, head, seq_length, head_features)

    def forward(...):  # signature abridged; query/key/value come from splitting the c_attn projection
        query = self._split_heads(query, self.num_heads, self.head_dim)
        key = self._split_heads(key, self.num_heads, self.head_dim)
        value = self._split_heads(value, self.num_heads, self.head_dim)
        attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
        attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)  # inverse of _split_heads
        attn_output = self.c_proj(attn_output)
        outputs = (attn_output, present)  # present caches (key, value) when use_cache is set
        return outputs
Because the attention is multi-head, each head conceptually has its own set of Q/K/V weights. For performance, however, all heads are projected in a single large matrix multiply and then reshaped with helpers such as _split_heads / _merge_heads, which makes the code much harder to read. The sketch below shows what the reshaping amounts to.
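A minimal standalone sketch (hypothetical helper names, batch-first tensors assumed) of the head splitting and merging:

import torch

batch, seq_len, num_heads, head_dim = 2, 5, 12, 64
hidden = num_heads * head_dim  # 768

def split_heads(t, num_heads, head_dim):
    # (batch, seq, hidden) -> (batch, heads, seq, head_dim)
    t = t.view(t.size(0), t.size(1), num_heads, head_dim)
    return t.permute(0, 2, 1, 3)

def merge_heads(t, num_heads, head_dim):
    # (batch, heads, seq, head_dim) -> (batch, seq, hidden)
    t = t.permute(0, 2, 1, 3).contiguous()
    return t.view(t.size(0), t.size(1), num_heads * head_dim)

x = torch.randn(batch, seq_len, hidden)
# splitting and merging are inverse reshapes; no values are changed
assert torch.equal(merge_heads(split_heads(x, num_heads, head_dim), num_heads, head_dim), x)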
GPT2MLP
class GPT2MLP(nn.Module):
    def __init__(self, intermediate_size, config):
        super().__init__()
        embed_dim = config.hidden_size
        self.c_fc = Conv1D(intermediate_size, embed_dim)
        self.c_proj = Conv1D(embed_dim, intermediate_size)
        self.act = ACT2FN[config.activation_function]
        self.dropout = nn.Dropout(config.resid_pdrop)

    def forward(self, hidden_states: Optional[Tuple[torch.FloatTensor]]) -> torch.FloatTensor:
        hidden_states = self.c_fc(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.c_proj(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states
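The MLP is the standard position-wise feed-forward block: expand from hidden_size to intermediate_size (4 * hidden_size by default, see GPT2Block below), apply the activation (a GELU variant for GPT-2), project back, then dropout. A rough functional equivalent in plain PyTorch (a sketch using nn.Linear, not the library code):

import torch
from torch import nn

hidden_size, intermediate_size = 768, 4 * 768

mlp = nn.Sequential(
    nn.Linear(hidden_size, intermediate_size),  # plays the role of c_fc
    nn.GELU(),                                  # GPT-2's default is the "gelu_new" variant
    nn.Linear(intermediate_size, hidden_size),  # plays the role of c_proj
    nn.Dropout(0.1),                            # resid_pdrop
)

x = torch.randn(2, 5, hidden_size)
assert mlp(x).shape == x.shape  # position-wise: the shape is preserved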
GPT2Block
class GPT2Block(nn.Module):
    def __init__(self, config, layer_idx=None):
        super().__init__()
        hidden_size = config.hidden_size
        inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size
        self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
        self.attn = GPT2Attention(config, layer_idx=layer_idx)
        self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
        if config.add_cross_attention:
            self.crossattention = GPT2Attention(config, is_cross_attention=True, layer_idx=layer_idx)
            self.ln_cross_attn = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
        self.mlp = GPT2MLP(inner_dim, config)
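The block's forward pass is not shown in the excerpt; it follows the pre-LayerNorm residual layout. A minimal sketch (a hypothetical simplification that ignores caching, masks, and the cross-attention branch):

def block_forward_sketch(block, hidden_states):
    # pre-LN self-attention with a residual connection
    residual = hidden_states
    attn_output = block.attn(block.ln_1(hidden_states))[0]  # first element of the returned tuple
    hidden_states = residual + attn_output

    # pre-LN MLP with a residual connection
    residual = hidden_states
    hidden_states = residual + block.mlp(block.ln_2(hidden_states))

    return hidden_states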