异构图上的连接预测二
- 对整个大图进行transform处理
- 获取批次子图
- 定义GNN网络
- 定义分类器:
- 整合模型。
- 开始训练:
对整个大图进行transform处理
详细描述过程都在代码中体现。
# Split the full graph into train / validation / test edge sets.
# Overall ratio train:val:test = 0.8 : 0.1 : 0.1 over 100836 rating edges.
transform = T.RandomLinkSplit(
    num_val=0.1,                   # 10% of the edges form the validation set
    num_test=0.1,                  # 10% of the edges form the test set
    disjoint_train_ratio=0.3,      # 30% of training edges are reserved for supervision only
    neg_sampling_ratio=2.0,        # two negative (non-existent) edges per positive edge
    add_negative_train_samples=False,  # training negatives are sampled later by the loader
    edge_types=("user", "rates", "movie"),          # edge type to split: user rates movie
    rev_edge_types=("movie", "rev_rates", "user"),  # its reverse: movie rated-by user
)
train_data, val_data, test_data = transform(data)

# Sanity checks on the split sizes.
# Training split: 100836 * 0.8 = 80670 edges; 70% of them (56469) serve as
# message-passing edges (edge_index) and 30% (24201) as supervision edges
# (edge_label / edge_label_index).
assert train_data["user", "rates", "movie"].num_edges == 56469
assert train_data["user", "rates", "movie"].edge_label_index.size(1) == 24201
assert train_data["movie", "rev_rates", "user"].num_edges == 56469
# No negatives were added to the training split, so every label equals 1.
assert train_data["user", "rates", "movie"].edge_label.min() == 1
assert train_data["user", "rates", "movie"].edge_label.max() == 1
# Validation split: message passing uses all 80670 training edges; the
# 100836 * 0.1 ≈ 10083 positive supervision edges plus 2x sampled
# negatives give 10083 * 3 = 30249 labelled edges.
assert val_data["user", "rates", "movie"].num_edges == 80670
assert val_data["user", "rates", "movie"].edge_label_index.size(1) == 30249
assert val_data["movie", "rev_rates", "user"].num_edges == 80670
# neg_sampling_ratio of 2 -> twice as many 0-labels as 1-labels.
assert val_data["user", "rates", "movie"].edge_label.long().bincount().tolist() == [20166, 10083]

# Pull out the supervision edges of the training split: these are the edges
# whose existence the model is asked to predict against known labels.
edge_label_index = train_data['user', 'rates', 'movie'].edge_label_index
edge_label = train_data['user', 'rates', 'movie'].edge_label
获取批次子图
# Mini-batch loader over supervision edges: around every seed edge it
# samples a sub-graph with up to 20 one-hop and 10 two-hop neighbours.
train_loader = LinkNeighborLoader(
    data=train_data,
    num_neighbors=[20, 10],      # neighbour fan-out per hop
    neg_sampling_ratio=2.0,      # two sampled negatives per positive edge
    edge_label_index=(("user", "rates", "movie"), edge_label_index),
    edge_label=edge_label,
    batch_size=128,              # 128 positive seed edges per batch
    shuffle=True,
)

# Inspect a single batch: 128 positives + 2 * 128 negatives = 384 labels.
sampled_data = next(iter(train_loader))
print("Sampled mini-batch:")
print("===================")
assert sampled_data["user", "rates", "movie"].edge_label_index.size(1) == 3 * 128
assert sampled_data["user", "rates", "movie"].edge_label.min() == 0
assert sampled_data["user", "rates", "movie"].edge_label.max() == 1
定义GNN网络
在这里应该注意到这是GNN网络,用于处理同构图的。也就是边和节点类型都一样的图。
class GNN(nn.Module):
    """Two-layer GraphSAGE encoder for a homogeneous graph.

    Both layers map ``hidden_channels -> hidden_channels``; the projection
    of raw node features into ``hidden_channels`` happens before this
    module.  The model is later lifted to the heterogeneous graph schema
    via ``to_hetero``.
    """

    def __init__(self, hidden_channels):
        super().__init__()
        self.conv1 = SAGEConv(hidden_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)

    def forward(self, x: Tensor, edge_index: Tensor) -> Tensor:
        # ReLU between the two convolutions; no activation on the output,
        # so downstream modules receive raw embeddings.
        hidden = F.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)
定义分类器:
你说分类器干嘛的?
假设数据:
x_user = [
[0.1, 0.2, 0.3], # 用户1的嵌入向量
[0.4, 0.5, 0.6], # 用户2的嵌入向量
[0.7, 0.8, 0.9] # 用户3的嵌入向量
]
x_movie = [
[0.1, 0.2, 0.3], # 电影A的嵌入向量
[0.4, 0.5, 0.6], # 电影B的嵌入向量
[0.7, 0.8, 0.9] # 电影C的嵌入向量
]
edge_label_index = [
[0, 1, 2], # 用户的节点ID
[0, 1, 2] # 对应电影的节点ID
]
(0.1 * 0.1) + (0.2 * 0.2) + (0.3 * 0.3) = 0.01 + 0.04 + 0.09 = 0.14
(0.4 * 0.4) + (0.5 * 0.5) + (0.6 * 0.6) = 0.16 + 0.25 + 0.36 = 0.77
(0.7 * 0.7) + (0.8 * 0.8) + (0.9 * 0.9) = 0.49 + 0.64 + 0.81 = 1.94
pred:tensor([0.14, 0.77, 1.94])
用于预测用户对电影的评分。
分类器通过点积操作计算用户和电影嵌入向量的相似度,从而预测用户对电影的评分。
class Classifier(nn.Module):
    """Scores (user, movie) pairs by the dot product of their embeddings.

    A high dot product means the two embeddings are similar, which is
    interpreted as a high likelihood that the edge exists.
    """

    def forward(self, x_user: Tensor, x_movie: Tensor,
                edge_label_index: Tensor) -> Tensor:
        # Gather the endpoint embeddings of every supervision edge:
        # row 0 of edge_label_index holds user ids, row 1 movie ids.
        src = x_user[edge_label_index[0]]
        dst = x_movie[edge_label_index[1]]
        # Reduce each pair to a single logit via an element-wise
        # product summed over the feature dimension (a dot product).
        return (src * dst).sum(dim=-1)
整合模型。
class Model(nn.Module):
    """End-to-end link predictor: node embeddings -> hetero GNN -> dot-product scorer."""

    def __init__(self, hidden_channels):
        super().__init__()
        # Project the 20-dimensional raw movie features to hidden_channels.
        self.movie_lin = nn.Linear(in_features=20, out_features=hidden_channels)
        # Learned id embeddings for every user and every movie node
        # (sizes are taken from the module-level `data` HeteroData object).
        self.user_emb = nn.Embedding(data['user'].num_nodes, hidden_channels)
        self.movie_emb = nn.Embedding(data['movie'].num_nodes, hidden_channels)
        # Homogeneous GraphSAGE encoder, lifted to the heterogeneous
        # schema so it can process every node/edge type of the graph.
        self.gnn = to_hetero(GNN(hidden_channels), metadata=data.metadata())
        self.classifier = Classifier()

    def forward(self, data: HeteroData) -> Tensor:
        # Initial node features per type: users get pure id embeddings;
        # movies combine their projected raw features with an id
        # embedding — summing both tends to improve learning.
        x_dict = {
            'user': self.user_emb(data['user'].node_id),
            'movie': self.movie_lin(data['movie'].x) + self.movie_emb(data['movie'].node_id),
        }
        # Heterogeneous message passing over every edge type.
        x_dict = self.gnn(x_dict, data.edge_index_dict)
        # Score only the supervision edges of this (sub-)graph.
        return self.classifier(
            x_dict["user"],
            x_dict["movie"],
            data["user", "rates", "movie"].edge_label_index,
        )
开始训练:
# Pick the compute device and move the model onto it.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'运行在{device}上')
model = Model(hidden_channels=64).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Five epochs of mini-batch training with binary cross-entropy on the
# raw logits produced by the dot-product classifier.
for epoch in range(1, 6):
    total_loss = total_examples = 0
    for sampled_data in tqdm.tqdm(train_loader):
        sampled_data = sampled_data.to(device)
        optimizer.zero_grad()
        pred = model(sampled_data)  # one logit per labelled edge
        ground_truth = sampled_data['user', 'rates', 'movie'].edge_label
        loss = F.binary_cross_entropy_with_logits(pred, ground_truth)
        loss.backward()
        optimizer.step()
        # Weight each batch loss by its size so the epoch figure is a
        # proper per-example average.
        total_loss += float(loss) * pred.numel()
        total_examples += pred.numel()
    print(f"Epoch: {epoch:03d}, Loss: {total_loss / total_examples:.4f}")