看完之后你也可以生成自己的词云
下面提供一个只过滤人名的词性标记(英文缩写)与中文说明的对照表,更多词性对照可以到我的码云仓库查看:
https://gitee.com/billion_lines_of_code/learn-wordcloud
# Part-of-speech tags that denote person names only, mapped to their Chinese labels.
En2Cn_name = {
    'nr': '名词-人名',
    'nr1': '名词-汉语姓氏',
    'nr2': '名词-汉语名字',
    'nrf': '名词-音译人名',
    'nrfg': '名词-人名',
}
代码中的字符串(文件路径、字体、图片、输出文件名等)大部分都可以替换,按自己的需要修改即可哦。
# 时间包,为了计算程序耗时
import datetime
# 规则执行 便于正则使用
import re
# 中文分析库
import jieba
# 获取词性
import jieba.posseg
# 多进程包
import multiprocessing
# 词云库
import wordcloud
# 集合包
import collections
# 绘图
import numpy
# 图片处理
from PIL import Image
# 图片处理,字体
import matplotlib.pylab as plt
# 本地的文件,词性模版
import baseresource.EnToCN as EnToCN


def jieba_doing(text_str, dic):
    """Segment ``text_str`` with jieba (accurate mode, HMM enabled).

    Args:
        text_str: Text to segment.
        dic: Path of a user dictionary to load before cutting. A falsy value
            (such as ``''``) skips loading.

    Returns:
        The iterable of tokens returned by ``jieba.cut``.
    """
    # Dynamic dictionary tuning. The empty string is a placeholder kept from
    # the original; replace it with a real term to force that term to be
    # segmented as one word.
    jieba.suggest_freq('', True)
    # Optionally load a user dictionary. The original passed '' here, so a
    # requested dictionary was never the one actually opened.
    if dic:
        jieba.load_userdict(dic)
    # Accurate mode + HMM for out-of-vocabulary words.
    return jieba.cut(text_str, cut_all=False, HMM=True)


def jieba_doing_paddle(text_str, dic):
    """Segment ``text_str`` with jieba, requesting paddle mode.

    Args:
        text_str: Text to segment.
        dic: Path of a user dictionary to load before cutting. A falsy value
            (such as ``''``) skips loading.

    Returns:
        The iterable of tokens returned by ``jieba.cut``.
    """
    # NOTE(review): jieba's README says paddle mode has to be started with
    # jieba.enable_paddle(); this function does not call it -- confirm whether
    # paddle is really in effect here.
    # Placeholder word kept from the original; see jieba.suggest_freq docs.
    jieba.suggest_freq('', True)
    if dic:
        jieba.load_userdict(dic)
    return jieba.cut(text_str, cut_all=False, HMM=True, use_paddle=True)


def main_process():
    """Run the whole pipeline: read, clean, segment, filter, count, draw.

    Reads the novel text, strips every character that is not an ASCII letter,
    digit or in the range U+4E00-U+9FA5, segments it, keeps the words accepted
    by ``filter_method``, writes the most common ones to a text file and
    renders them as a word cloud.
    """
    # Input document, number of top words to keep, and report file.
    analysis_text = '/Users/mac/Downloads/庆余年.txt'
    number = 100
    output_file = '词频.txt'

    text_str = read_text_file(analysis_text)

    # Regex clean-up: drop punctuation and any other non-word characters.
    pattern = re.compile(u'[^a-zA-Z0-9\u4e00-\u9fa5]')
    text_str = remove_no_need_words(pattern, text_str)

    # Segmentation. jieba_doing(text_str, '') is the variant that does not
    # request paddle mode.
    word_list_analysis = jieba_doing_paddle(text_str, '')
    word_list_analysis = list(word_list_analysis)

    # Part-of-speech filtering is spread over 4 worker processes.
    object_list = multi_process(word_list_analysis, 4)

    # Word-frequency statistics.
    word_count = collections.Counter(object_list)
    word_count_top = word_count.most_common(number)

    print('词语\t词频\t词性\n')
    # Only rows whose tag appears in the person-name table are written.
    file_write(output_file, word_count_top, number, EnToCN.En2Cn_name)

    word_count_top = dict(word_count_top)
    print('\n 开始制作词云')
    mask = numpy.array(Image.open('../photomodel/chinamap.jpeg'))
    do_wordCloud(word_count_top, mask, 5000)


# Build the word cloud
def do_wordCloud(word_count_top, mask, dpi):
    """Render a word cloud from word frequencies, save it and display it.

    Args:
        word_count_top: Mapping of word -> frequency passed to
            ``WordCloud.generate_from_frequencies``.
        mask: Mask passed to ``WordCloud`` (the caller supplies a numpy array
            built from an image).
        dpi: Resolution passed to ``plt.savefig``.
    """
    wc = wordcloud.WordCloud(
        background_color='white',
        font_path='../fontmodel/mashanzhengmaobikaishu.ttf',
        mask=mask,
        max_font_size=150,
    )
    wc.generate_from_frequencies(word_count_top)

    plt.figure('词云')
    plt.imshow(wc)
    plt.axis('off')
    # Save before show(): the image is written while the figure is still open.
    plt.savefig('庆余年人物.png', dpi=dpi)
    plt.show()
    print('制作完成')


# Write the word-frequency report: needs the output file, the top-word list,
# the number of words wanted and the part-of-speech label table
def file_write(output_file, word_count_top, number, EnToCNlist):
    """Write (and print) the top words together with their part-of-speech label.

    Each word is re-tagged with ``jieba.posseg.cut``; a row is emitted for
    every resulting segment whose tag is a key of ``EnToCNlist``.

    Args:
        output_file: Path of the report file; it is overwritten.
        word_count_top: Iterable of ``(word, frequency)`` pairs.
        number: Stop emitting rows once this many have been written.
        EnToCNlist: Dict mapping a part-of-speech tag to its Chinese label.
    """
    # The context manager closes the file even if tagging or writing raises.
    with open(output_file, 'w', encoding='utf-8') as file_out:
        file_out.write('词语\t词频\t词性\n')
        file_out.write('--------\n')
        count = 0
        for TopWord, Frequency in word_count_top:
            # Once the limit is reached no later word can produce a row, so
            # stop here instead of re-tagging every remaining word.
            if count == number:
                break
            for POS in jieba.posseg.cut(TopWord):
                if count == number:
                    break
                if POS.flag in EnToCNlist:
                    # Direct dict lookup replaces the keys()/values() index dance.
                    label = EnToCNlist[POS.flag]
                    print(TopWord + '\t', str(Frequency) + '\t', label)
                    file_out.write(TopWord + '\t' + str(Frequency) + '\t' + label + '\n')
                    count += 1


# Strip unwanted content from the text
def remove_no_need_words(pattern, text_str):
    """Return ``text_str`` with every match of ``pattern`` removed.

    Args:
        pattern: Compiled regex (or pattern string) describing what to drop.
        text_str: Text to clean.

    Returns:
        The cleaned text.
    """
    return re.sub(pattern, '', text_str)


# Read the document to analyse
def read_text_file(analysis_text):
    """Read a UTF-8 text file and return its whole content as one string.

    Args:
        analysis_text: Path of the file to read.

    Returns:
        The file content.
    """
    # The context manager closes the file even if read() raises.
    with open(analysis_text, 'r', encoding='utf-8') as file:
        return file.read()


# Part-of-speech filter, driven by the tag table
def filter_method(word_list_analysis):
    """Keep the words that jieba tags with one of the person-name tags.

    Words shorter than two characters are skipped. Every remaining word is
    tagged with ``jieba.posseg.cut`` and appended once for each resulting
    segment whose tag is a key of ``EnToCN.En2Cn_name``.

    Args:
        word_list_analysis: Iterable of segmented words.

    Returns:
        List of the accepted words, in input order.
    """
    # Dict membership is a hash lookup; no need to rebuild a key list per segment.
    name_flags = EnToCN.En2Cn_name
    object_list = []
    for word in word_list_analysis:
        if len(word) < 2:
            continue
        for speech in jieba.posseg.cut(word):
            if speech.flag in name_flags:
                object_list.append(word)
    return object_list


# Multi-process driver, to speed the filtering up
def multi_process(word_list_analysis, num):
    """Run ``filter_method`` over the word list using ``num`` worker processes.

    The list is cut into at most ``num`` contiguous chunks. The original cut it
    into chunks of ``num`` items, which produced one tiny task for every few
    words. Results are concatenated in chunk order, so the output order matches
    a single sequential call.

    Args:
        word_list_analysis: List of segmented words.
        num: Number of worker processes.

    Returns:
        List of the words accepted by ``filter_method``.
    """
    # The context manager tears the pool down even if a worker raises.
    with multiprocessing.Pool(num) as pool:
        total = len(word_list_analysis)
        # Ceiling division; at least 1 so range() gets a valid step when empty.
        chunk_size = max(1, -(-total // num))
        parts = [word_list_analysis[i:i + chunk_size]
                 for i in range(0, total, chunk_size)]
        results = pool.map(filter_method, parts)
    return [word for result in results for word in result]


if __name__ == '__main__':
    # Time the whole run.
    start = datetime.datetime.now()
    main_process()
    end = datetime.datetime.now()
    print(end - start)
如果 paddlepaddle-tiny 安装报错(安装不成功),可以使用下面的命令从百度镜像源安装:
python3 -m pip install paddlepaddle-tiny -i https://mirror.baidu.com/pypi/simple