在上一篇《transformer学习笔记-自注意力机制(1)》中学习了原理,这一篇对其中的几个关键知识点进行代码演示:
1、整体qkv注意力计算
先来个最简单未经变换的QKV处理:
import torch
# Minimal self-attention with no learned projections: the input matrix
# is used directly as query, key and value. Each row of Q is one token.
Q = torch.tensor([[3.0, 3.0, 0.0], [0.5, 4.0, 0.0]])
K = Q.T
V = Q
scores = Q @ K  # pairwise dot products between tokens
# Normalise over the last dimension so that every row (one query token)
# is a probability distribution over the tokens it attends to.
weights = torch.softmax(scores, dim=-1)
print(f"概率分布:{weights}")
newQ = weights @ V  # each output row is a weighted mix of the rows of V
print(f"输出:{newQ}")
再来个输入经过Wq/Wk/Wv变换的:
import torch
# Self-attention where the input is first projected by W_query / W_key /
# W_value. Each row of Q is one input token.
Q = torch.tensor([[3.0, 3.0, 0.0], [0.5, 4.0, 0.0]])
torch.manual_seed(123)
d_q, d_k, d_v = 4, 4, 5  # column counts of W_query, W_key, W_value
d = Q.shape[1]  # row count of each projection = input token dimension

# Randomly initialised projection matrices.
W_query = torch.nn.Parameter(torch.rand(d, d_q))
W_key = torch.nn.Parameter(torch.rand(d, d_k))
W_value = torch.nn.Parameter(torch.rand(d, d_v))
print("W_query:", W_query)
print("W_key:", W_key)
print("W_value:", W_value)

# First, compute the attention of the first token ("apple") alone over
# the whole sentence to inspect the intermediate values.
apple = Q[0]
query_apple = apple @ W_query
keys = Q @ W_key
values = Q @ W_value
print(f"query_apple:{query_apple}")
print(f"keys:{keys}")
print(f"values:{values}")
scores = query_apple @ keys.T
print(f"scores:{scores}")
# scores is 1-D here, so dim=0 is the only (and therefore last) dimension.
weights = torch.softmax(scores, dim=0)
print(f"weights:{weights}")
newQ = weights @ values
print(f"newQ:{newQ}")

# Now the same computation for every token at once.
querys = Q @ W_query
all_scores = querys @ keys.T
print(f"all_scores:{all_scores}")
all_weights = torch.softmax(all_scores, dim=-1)  # one distribution per row
print(f"all_weights:{all_weights}")
output = all_weights @ values
print(f"output:{output}")
最终生成的 output 中每个 token 向量的维度与 W_value 的列数 d_v 一致(这里为 5),行数则等于输入 token 的个数。
2、调换顺序结果不变
import torch


def simple_attention(Q):
    """Run projection-free self-attention on Q and print the results.

    The input is used directly as query, key and value.

    Args:
        Q: 2-D tensor; each row is one token vector.

    Returns:
        The attended token vectors, same shape as ``Q``. Existing callers
        that ignore the return value are unaffected.
    """
    K = Q.T
    V = Q
    scores = Q @ K  # pairwise dot products between tokens
    weights = torch.softmax(scores, dim=-1)  # one distribution per row
    print(f"概率分布:{weights}")
    newQ = weights @ V
    print(f"输出:{newQ}")
    return newQ


# The same two tokens in both orders: swapping the input rows only swaps
# the output rows, the per-token values stay the same.
Q = torch.tensor([[3.0, 3.0, 0.0], [0.5, 4.0, 0.0]])
Q1 = torch.tensor([[0.5, 4.0, 0.0], [3.0, 3.0, 0.0]])
print("模拟‘苹果梨’:")
simple_attention(Q)
print("模拟‘梨苹果’:")
simple_attention(Q1)
可以看到“苹果梨”、“梨苹果”即便换了词token的顺序,并不会影响新的梨和新的苹果的向量数值。这里我们用了softmax函数求概率分布,因此跟上一篇文章的示例数值不一样,不要在意这个细节。
3、softmax:
import numpy as np


def softmax(x):
    """Return the softmax of ``x`` computed along axis 0.

    The maximum is subtracted before exponentiating. This leaves the
    result mathematically unchanged but keeps ``np.exp`` from overflowing
    to ``inf`` (and the output from becoming NaN) on large inputs.

    Args:
        x: array-like of scores.

    Returns:
        ``numpy.ndarray`` of the same shape whose entries along axis 0
        sum to 1. An empty input is returned unchanged.
    """
    x = np.asarray(x, dtype=float)
    if x.size == 0:
        return x
    e_x = np.exp(x - x.max(axis=0, keepdims=True))
    return e_x / e_x.sum(axis=0)


def softmax_with_temperature(x, T):
    """Return the softmax of ``x / T`` computed along axis 0.

    Args:
        x: array-like of scores.
        T: temperature; larger values flatten the distribution.

    Returns:
        ``numpy.ndarray`` of probabilities along axis 0.

    Raises:
        ValueError: if ``T`` is zero (the scaled scores are undefined).
    """
    if T == 0:
        raise ValueError("temperature T must be non-zero")
    return softmax(np.asarray(x, dtype=float) / T)


# Example usage
if __name__ == "__main__":
    input_vector = np.array([2.0, 1.0, 0.1])
    output = softmax(input_vector)
    print("Softmax Output:", output)
    print("Softmax with Temperature 0.5 Output:", softmax_with_temperature(input_vector, 0.5))
    print("Softmax with Temperature 1 Output:", softmax_with_temperature(input_vector, 1))
    print("Softmax with Temperature 5 Output:", softmax_with_temperature(input_vector, 5))
可以看到随着T的不断加大,概率分布不断趋于均匀分布。
4、softmax除以 $\sqrt{d_k}$
还是用上面的softmax函数,演示下除以 $\sqrt{d_k}$ 的效果:
# Continues the previous snippet: relies on `np` and `softmax` defined there.
import matplotlib.pyplot as plt  # `plt` is used below but was never imported

# High-dimensional input: a 100-dimensional standard-normal vector, scaled
# by 10 to mimic the large magnitudes of unscaled dot products.
input_vector_high_dim = np.random.randn(100) * 10
output_high_dim = softmax(input_vector_high_dim)
print("High Dimension Softmax Output:", output_high_dim)

# Range of the resulting probability distribution.
print("Max Probability in High Dimension:", np.max(output_high_dim))
print("Min Probability in High Dimension:", np.min(output_high_dim))

# The same input divided by 10 before the softmax.
input_vector_high_dim_div10 = input_vector_high_dim / 10
output_high_dim_div10 = softmax(input_vector_high_dim_div10)
print("High Dimension Softmax Output (Divided by 10):", output_high_dim_div10)

# Range of the rescaled probability distribution.
print("Max Probability in High Dimension (Divided by 10):", np.max(output_high_dim_div10))
print("Min Probability in High Dimension (Divided by 10):", np.min(output_high_dim_div10))

# Plot both probability distributions for comparison.
plt.figure(figsize=(10, 6))
plt.plot(output_high_dim, label='High Dim')
plt.plot(output_high_dim_div10, label='High Dim Divided by 10')
plt.legend()
plt.title('High Dimension Softmax Output Comparison')
plt.xlabel('Index')
plt.ylabel('Probability')
plt.show()
在除以 $\sqrt{d_k}$ 之前,由于内积变大,导致概率分布变得尖锐,趋近0的位置梯度基本消失:softmax 输出接近 0 时,损失函数对输入的导数也接近零,在反向传播过程中无法有效地更新权重。有兴趣的话可以试试对 softmax 的损失函数求导。
继续上面的代码,来看下softmax的输出的损失函数求梯度:
def test_grad(dim_vertor):
    """Return the gradient of a cross-entropy loss w.r.t. softmax inputs.

    The input vector is treated as logits, passed through a softmax, and
    compared against a one-hot target whose true class is index 3.

    Args:
        dim_vertor: 1-D sequence or array of logits with more than 3
            entries (index 3 is used as the true class).

    Returns:
        Tensor of the same length as the input holding d(loss)/d(logits).
    """
    import numpy as np
    import torch
    import torch.nn.functional as F

    # Hypothetical input logits; gradients are tracked on this tensor.
    z = torch.tensor(dim_vertor, requires_grad=True)
    print(z)

    # Softmax output.
    p = F.softmax(z, dim=0)

    # One-hot target sized to match the input instead of a hard-coded 100,
    # so vectors of other lengths work as well.
    true_label = np.zeros(len(dim_vertor))
    true_label[3] = 1
    y = torch.tensor(true_label)

    # Cross-entropy loss against the one-hot target.
    loss = -torch.sum(y * torch.log(p))

    # Back-propagate and read the gradient accumulated on the input.
    loss.backward()
    return z.grad


# Continues the previous snippet: both input vectors are defined there.
grad_div10 = test_grad(input_vector_high_dim_div10)
grad = test_grad(input_vector_high_dim)
print(f"grad_div10:{grad_div10}")
print(f"grad:{grad}")
明显看出,没有除以 $\sqrt{d_k}$ 求出的梯度,基本为0;上面的代码用的是 torch 已经实现的自动求导。当然也可以根据损失函数自己求导,这里我们只为演示效果,点到即止。
5、多头注意力:
import torch
import torch.nn as nn

torch.manual_seed(123)

# Input matrix Q: one row per token.
Q = torch.tensor([[3.0, 3.0, 0.0], [0.5, 4.0, 0.0]])

# Dimension settings.
d_q, d_k, d_v = 4, 4, 5  # per-head query, key, value dimensions
d_model = Q.shape[1]  # input token dimension
num_heads = 2  # number of heads

# One projection matrix per head. The creation order (all queries, then
# all keys, then all values, then the output matrix) determines which
# random numbers each matrix receives under the fixed seed.
W_query = nn.ParameterList([nn.Parameter(torch.rand(d_model, d_q)) for _ in range(num_heads)])
W_key = nn.ParameterList([nn.Parameter(torch.rand(d_model, d_k)) for _ in range(num_heads)])
W_value = nn.ParameterList([nn.Parameter(torch.rand(d_model, d_v)) for _ in range(num_heads)])

# Output projection: maps the concatenated heads back to d_model.
W_output = nn.Parameter(torch.rand(num_heads * d_v, d_model))

# Print the per-head weight matrices.
for i, (w_q, w_k, w_v) in enumerate(zip(W_query, W_key, W_value)):
    print(f"W_query_{i+1}:\n{w_q}")
    print(f"W_key_{i+1}:\n{w_k}")
    print(f"W_value_{i+1}:\n{w_v}")

# Project the input into per-head queries, keys and values.
queries = [Q @ w_q for w_q in W_query]
keys = [Q @ w_k for w_k in W_key]
values = [Q @ w_v for w_v in W_value]

# Scaled dot-product attention for each head.
outputs = []
for head_q, head_k, head_v in zip(queries, keys, values):
    scores = head_q @ head_k.T / (d_k ** 0.5)
    weights = torch.softmax(scores, dim=-1)
    output = weights @ head_v
    outputs.append(output)

# Concatenate the outputs of all heads along the feature dimension.
concat_output = torch.cat(outputs, dim=-1)
print(f"concat_output:\n{concat_output}")

# Final linear transformation.
final_output = concat_output @ W_output

# Print the result.
print(f"Final Output:\n{final_output}")
6、掩码注意力:
import torch

# Original Q matrix: one row per token.
Q = torch.tensor([[3.0, 3.0, 0.0], [0.5, 4.0, 0.0], [1.0, 2.0, 0.0], [2.0, 1.0, 0.0]])
torch.manual_seed(123)
d_q, d_k, d_v = 4, 4, 5  # query, key, value dimensions
d = Q.shape[1]  # row count of each projection = input token dimension

# Randomly initialised projection matrices.
W_query = torch.nn.Parameter(torch.rand(d, d_q))
W_key = torch.nn.Parameter(torch.rand(d, d_k))
W_value = torch.nn.Parameter(torch.rand(d, d_v))
print("W_query:", W_query)
print("W_key:", W_key)
print("W_value:", W_value)

# Project the input into queries, keys and values.
querys = Q @ W_query
keys = Q @ W_key
values = Q @ W_value
print(f"querys:\n{querys}")
print(f"keys:\n{keys}")
print(f"values:\n{values}")

# Scaled attention scores.
all_scores = querys @ keys.T / (d_k ** 0.5)
print(f"all_scores:\n{all_scores}")

# Build the mask: True strictly above the diagonal, i.e. at positions
# where a token would attend to a later token.
seq_len = Q.shape[0]
mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool()
# -inf scores become exactly zero weight after the softmax.
masked_scores = all_scores.masked_fill(mask, float('-inf'))
print(f"Mask:\n{mask}")
print(f"Masked Scores:\n{masked_scores}")

# Attention weights: one distribution per row.
all_weights = torch.softmax(masked_scores, dim=-1)
print(f"all_weights:\n{all_weights}")

# Weighted sum of the values.
output = all_weights @ values
print(f"output:\n{output}")
主要看下生成的掩码矩阵,和通过掩码矩阵处理的权重分布: