准备工作:
使用 Seealsology 爬虫工具,爬取维基百科词条的 "See also" 关联词条:https://densitydesign.github.io/strumentalia-seealsology/
维基百科网站:https://www.wikipedia.org/
爬取过程:
下载 tsv 文件(保存为 seealsology-data.tsv,与下面的代码放在同一目录):
import networkx as nx # 图数据挖掘包
import numpy as np # 数据分析
import pandas as pd
import random
from tqdm import tqdm # 进度条
# 数据可视化
import matplotlib.pyplot as plt
# Configure matplotlib so Chinese labels and the minus sign display correctly.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],  # font used to render Chinese labels
    'axes.unicode_minus': False,  # needed for the minus sign to display correctly
})

# Load the edge list from the tab-separated export.
df = pd.read_csv("seealsology-data.tsv", sep='\t')

# Build an undirected graph from the "source"/"target" columns; with
# edge_attr=True the remaining columns are attached as edge attributes.
G = nx.from_pandas_edgelist(
    df,
    source="source",
    target="target",
    edge_attr=True,
    create_using=nx.Graph(),
)
# print(len(G))  # number of nodes in the graph


def get_randomwalk(node, path_length):
    """Generate one random walk that never revisits a node.

    Starting from ``node``, the walk repeatedly steps to a randomly chosen
    neighbor (looked up in the module-level graph ``G``) that is not yet on
    the walk.

    :param node: starting node; always the first element of the result
    :param path_length: maximum number of nodes in the returned walk
    :return: list of visited nodes in order; it is shorter than
        ``path_length`` when the walk reaches a node whose neighbors have
        all been visited already
    """
    random_walk = [node]
    visited = {node}  # O(1) membership test instead of rebuilding a set per step
    for _ in range(path_length - 1):
        # Candidate next steps: neighbors that are not already on the walk.
        candidates = [nbr for nbr in G.neighbors(node) if nbr not in visited]
        if not candidates:
            break
        # Pick the next node uniformly at random among the candidates.
        node = random.choice(candidates)
        random_walk.append(node)
        visited.add(node)
    return random_walk


all_nodes = list(G.nodes())
# print(all_nodes)
# Generate the random-walk sequences used as the training corpus.
gamma = 10  # number of walks started from each node
walk_length = 5  # maximum number of nodes in a single walk
# Every node serves as the starting point of `gamma` independent walks.
random_walks = [
    get_randomwalk(start_node, walk_length)
    for start_node in tqdm(all_nodes)
    for _ in range(gamma)
]
# print(len(random_walks))

# NOTE: this import had been fused into the comment above, which left
# `Word2Vec` undefined when the script ran.
from gensim.models import Word2Vec  # natural-language toolkit

# Skip-gram (sg=1) trained with negative sampling (hs=0, negative=10).
model = Word2Vec(
    vector_size=256,
    window=4,
    sg=1,
    hs=0,
    negative=10,
    alpha=0.03,
    min_alpha=0.0007,
    seed=14,
)
# Build the vocabulary from the random-walk sequences.
model.build_vocab(random_walks, progress_per=2)
# Train the embeddings.
model.train(random_walks, total_examples=model.corpus_count, epochs=50, report_delay=1)
# Inspect the embedding of a single node
# print(model.wv.get_vector("deep learning"))
# Find the most similar terms
# print(model.wv.similar_by_word('deep learning'))

# t-SNE dimensionality reduction and visualization
from sklearn.manifold import TSNE

X = model.wv.vectors  # embedding matrix, one row per vocabulary entry
term2index = model.wv.key_to_index  # maps a term to its row index in X

# Reduce the embeddings to 2 dimensions for plotting.
# The original passed n_iter=1000; that keyword was renamed to max_iter in
# newer scikit-learn releases, and 1000 is the default iteration count, so
# the argument is omitted to keep the call working across versions.
tsne = TSNE(n_components=2)
embed_2d = tsne.fit_transform(X)

# Scatter plot of every embedded node.
plt.figure(figsize=(14, 14))
plt.scatter(embed_2d[:, 0], embed_2d[:, 1])

# Rank nodes by PageRank importance, highest first.
pagerank = nx.pagerank(G)
node_importance = sorted(pagerank.items(), key=lambda x: x[1], reverse=True)

n = 30  # number of top-ranked nodes to highlight
terms_chosen = [term for term, _ in node_importance[:n]]

# Highlight the chosen nodes in red and label them with their term.
for item in terms_chosen:
    idx = term2index[item]
    plt.scatter(embed_2d[idx, 0], embed_2d[idx, 1], c='r', s=50)
    plt.annotate(item, xy=(embed_2d[idx, 0], embed_2d[idx, 1]), c='k', fontsize=12)
plt.show()
参考资料:【DeepWalk代码实战-维基百科词条图嵌入可视化】 https://www.bilibili.com/video/BV1et4y187Gd/?share_source=copy_web&vd_source=9a6c606c6f9df7c015effdcaa7e1fa84