本部分记录如何利用Python进行分词工具集成,集成工具可以实现运行无环境要求,同时也更方便。
该文章主要是记录,知识点不是特别多,欢迎访问个人博客:https://blog.jiumoz.top/archives/fen-ci-gong-ju-ji-cheng
成品展示
软件链接:https://cloud.189.cn/web/share?code=BN3yYvIJfUfq(访问码:vvw4)
大家要是想体验的话,就下载了试试吧,有点大,主要是pyqt5太大了,好几百兆…
用的是天翼云,百度云盘有众所周知的原因,阿里云盘不能分享压缩包…
- 软件包含分词、词性标注、自定义停用词表、文件导出等功能,但是也依旧不够智能,比如不能自己设置很多参数、文件保存的格式…
工具介绍
都是python工具包,pip安装就行。
- GUI界面主要构成是PyQt5
- 核心功能是分词,由jieba实现
- 打包有很多方式,这里使用两种方式,一种是利用cx_Freeze;另一种是pyinstaller;
开始简单的试验
首先实现简单的手工输入语句并完成分词与输出
主要的代码编写
- 借Python实现简单GUI程序中相关的内容,我们直接修改相关内容确定最后的窗体页面:
- 关键代码:借用“jieba分词”一文中的内容,导入 `jieba` 包后直接集成,主要函数代码如下:
def cut(self):
    """Segment the text of the input pane and show the tokens in the output pane.

    Reads ``self.first``, runs ``jieba.lcut`` on it and writes the tokens,
    each preceded by a single space, into ``self.equal``. The raw text and
    the token list are also printed.
    """
    self.equal.clear()

    source_text = self.first.toPlainText()
    print(source_text)

    tokens = jieba.lcut(source_text)
    print(tokens)

    # Every token is prefixed with one space, so the result starts with a space.
    self.equal.append("".join(" " + token for token in tokens))
- 测试效果:
- 完整代码:
# -*- coding: utf-8 -*-
# @Time : 2022/5/1 11:52
# @Author : MinChess
# @File : test2.py
# @Software: PyCharm
import sys
from PyQt5 import QtCore, QtGui, QtWidgets
from PyQt5.QtWidgets import *
from PyQt5.QtGui import *
from PyQt5.QtCore import *
import jieba


class test(QWidget):
    """Minimal demo window: free text on the left, jieba tokens on the right."""

    def __init__(self):
        super().__init__()
        self.initUI()

    def initUI(self):
        """Create the widgets and arrange them: a button row above two text panes."""
        self.setWindowTitle("test")
        self.show()
        self.resize(1200, 900)
        self.setMinimumSize(1200, 900)

        # Button that starts the segmentation.
        self.addbtn = QPushButton("开始处理")
        self.addbtn.setFixedHeight(66)
        self.addbtn.clicked.connect(self.cut)
        self.addbtn.setStyleSheet("font-size:36px;")

        # Input pane (editable) and result pane (read-only).
        self.first = QTextEdit()
        self.equal = QTextEdit()
        self.equal.setReadOnly(True)

        button_row = QHBoxLayout()
        button_row.addWidget(self.addbtn)

        text_row = QHBoxLayout()
        text_row.addWidget(self.first)
        text_row.addWidget(self.equal)

        root_layout = QVBoxLayout()
        root_layout.addLayout(button_row)
        root_layout.addLayout(text_row)
        self.setLayout(root_layout)

    def cut(self):
        """Segment the input pane's text with jieba and show the tokens.

        The raw text and the token list are printed as well. Each token is
        written to the result pane preceded by a single space.
        """
        self.equal.clear()

        source_text = self.first.toPlainText()
        print(source_text)

        tokens = jieba.lcut(source_text)
        print(tokens)

        self.equal.append("".join(" " + token for token in tokens))


if __name__ == '__main__':
    app = QApplication(sys.argv)
    window = test()
    window.show()
    sys.exit(app.exec_())
集成为exe
这里主要介绍cx_Freeze集成的方法
pip install cx_freeze
安装打包的库
- 编写配置文件
# -*- coding: utf-8 -*-
# @Time : 2021/12/1 20:44
# @Author : MinChess
# @File : setup.py
# @Software: PyCharm
import sys
from cx_Freeze import setup, Executable
# Options handed to cx_Freeze's "build_exe" command.
build_exe_options = {"packages": ["os"]}

# Use the "Win32GUI" base on Windows (GUI application base in cx_Freeze);
# on every other platform keep the default base.
base = "Win32GUI" if sys.platform == "win32" else None

# "xxx.py" is the entry script that gets frozen into the executable.
executables = [Executable("xxx.py", base=base)]

setup(
    name="九陌斋分词",
    version="0.1",
    description="My GUI application!",
    options={"build_exe": build_exe_options},
    executables=executables,
)
- 在该目录下执行 `python setup.py build`,即可完成集成
完整分词工具的编写
- 首先得确定方案与路线
- 首先得有GUI界面:得有按钮 得有显示字符串的框…
- 核心代码:有了界面就得思考如何实现了,比如如何获取文件 如何输出 如何执行分词…
- 事件通信:事件通信就是思考如何将事件绑定起来,也就是点击对应按钮执行我们想要的功能
- 测试与集成:写完代码就需要不断的测试,直到没有bug再集成成为EXE文件
- 这里不做详细的代码拆分介绍,贴出完整代码如下,资料包点击链接即可获取
代码包(主程序、集成程序、图标文件、停用词表):https://cloud.189.cn/web/share?code=ZBZvqeBBz6Jb(访问码:0wri)
# -*- coding: utf-8 -*-
# @Time : 2021/12/1 20:44
# @Author : MinChess
# @File : jieba_cut.py
# @Software: PyCharm
import sys
from PyQt5 import QtCore, QtGui, QtWidgets
from PyQt5.QtWidgets import *
from PyQt5.QtGui import *
from PyQt5.QtCore import *
import re
import jieba
import jieba.posseg
from collections import Counter


class Jieba_Main_Window(QWidget):
    """Main window of the jieba word-segmentation tool.

    The window lets the user load a text file, edit a stop-word list, run
    jieba segmentation plus part-of-speech tagging on the text, and save the
    results. While the window exists, ``sys.stdout`` is replaced by a
    ``Stream`` so that everything passed to ``print`` is shown in the
    information pane.
    """

    sig = pyqtSignal()

    # Style sheet shared by every push button of the window.
    _BUTTON_STYLE = '''QPushButton{font-size:18px;color:white;font-weight:bold;}QPushButton{background:#2ca9e1;border-radius:5px;}QPushButton:hover{background:#84a2d4;}'''

    # File filter offered by both "save as" dialogs.
    _SAVE_FILTER = "文本文件(*.txt);;Python程序(*.py);;文本文件(*.xlsx);;csv文件(*.csv);;所有文件(*.*)"

    def __init__(self):
        super().__init__()
        self.initUI()
        # From here on print() output is appended to the information pane.
        sys.stdout = Stream(newText=self.onUpdateText)
        # The timer is never started in this file. It is kept as an attribute
        # only; it is deliberately not connected to __openByIODevice, because
        # that method needs a file name that a timeout signal cannot supply.
        self.timer = QTimer(self)

    def onUpdateText(self, text):
        """Append ``text`` to the end of the information pane and scroll to it."""
        cursor = self.info_content.textCursor()
        cursor.movePosition(QTextCursor.End)
        cursor.insertText(text)
        self.info_content.setTextCursor(cursor)
        self.info_content.ensureCursorVisible()

    def _make_button(self, label, slot, height=32):
        """Return a styled push button of fixed ``height`` whose click calls ``slot``."""
        button = QPushButton(label)
        button.setStyleSheet(self._BUTTON_STYLE)
        button.setFixedHeight(height)
        button.clicked.connect(slot)
        return button

    def initUI(self):
        """Create all widgets, lay them out and show the window."""
        self.setWindowTitle('九陌斋-Jieba分词')
        self.setWindowIcon(QIcon('favicon.ico'))
        self.resize(1200, 900)  # width x height
        self.setMinimumSize(1200, 900)

        # --- action buttons -------------------------------------------------
        self.choose_file = self._make_button("文件选择", self.on_actQFile_Open_triggered)
        self.choose_save = self._make_button("文件保存", self.QFile_Save)
        self.start_clean = self._make_button("开始处理", self.jieba_cut)
        self.cut_word_list = self._make_button("默认列表", self.default_cut_list)
        self.default_clean = self._make_button("帮助文档", self.help_info)

        # --- source file name and content -----------------------------------
        self.file_name = QLabel('文件名称:')
        self.file_name_path = QLineEdit()
        self.file_name_path.setReadOnly(True)
        self.lab_rules = QLabel('文件内容:')
        self.rules_content = QTextEdit()

        # --- stop-word list -------------------------------------------------
        self.stop_words_rules = QLabel('停用词列表:')
        self.stop_words_content = QTextEdit()

        # --- segmentation result --------------------------------------------
        self.file_content_name = QLabel('分词结果:')
        self.file_content = QTextEdit()
        self.save_file_fenci = self._make_button(
            "分词结果保存", self.on_actQFile_Save_triggered, height=36)

        # --- information pane (receives print() output) ---------------------
        self.lab_info = QLabel('信息输出(输出系统提示信息):')
        self.info_content = QTextEdit()
        self.info_content.setReadOnly(True)
        self.info_content.setStyleSheet("font-size:18px;color:#003399")

        # --- part-of-speech tagging result ----------------------------------
        self.lab_finish = QLabel('词性标注结果(不建议使用文件保存,各方面问题还在解决中):')
        self.finishi_content = QTextEdit()
        self.save_file = self._make_button(
            "词性标注结果保存", self.on_actQFile_Save_triggered2, height=36)

        # --- layout ---------------------------------------------------------
        # Buttons, file name and file content.
        v00layout = QVBoxLayout()
        for widget in (self.choose_file, self.choose_save, self.start_clean,
                       self.cut_word_list, self.default_clean, self.file_name,
                       self.file_name_path, self.lab_rules, self.rules_content):
            v00layout.addWidget(widget)

        # Stop-word list, placed to the right of the column above.
        vlistlayout = QVBoxLayout()
        vlistlayout.addWidget(self.stop_words_rules)
        vlistlayout.addWidget(self.stop_words_content)

        vlistsum = QHBoxLayout()
        vlistsum.addLayout(v00layout)
        vlistsum.addLayout(vlistlayout)
        vlistsum.setStretchFactor(v00layout, 5)
        vlistsum.setStretchFactor(vlistlayout, 3)

        # Segmentation result.
        v01layout = QVBoxLayout()
        v01layout.addWidget(self.file_content_name)
        v01layout.addWidget(self.file_content)
        v01layout.addWidget(self.save_file_fenci)

        # Information pane.
        v10layout = QVBoxLayout()
        v10layout.addWidget(self.lab_info)
        v10layout.addWidget(self.info_content)

        # Part-of-speech result.
        v11layout = QVBoxLayout()
        v11layout.addWidget(self.lab_finish)
        v11layout.addWidget(self.finishi_content)
        v11layout.addWidget(self.save_file)

        # Left half: inputs above the information pane.
        v0layout = QVBoxLayout()
        v0layout.addLayout(vlistsum)
        v0layout.addLayout(v10layout)
        v0layout.setStretchFactor(vlistsum, 7)
        v0layout.setStretchFactor(v10layout, 3)

        # Right half: both result panes.
        v1layout = QVBoxLayout()
        v1layout.addLayout(v01layout)
        v1layout.addLayout(v11layout)

        alllayout = QHBoxLayout()
        alllayout.addLayout(v0layout)
        alllayout.addLayout(v1layout)
        alllayout.setStretchFactor(v0layout, 4)
        alllayout.setStretchFactor(v1layout, 3)
        self.setLayout(alllayout)

        # Show only after the layout is in place, so the window never appears empty.
        self.show()

    def __openByIODevice(self, fileName):
        """Load the UTF-8 text file ``fileName`` into the file-content pane.

        Every line is stripped of surrounding whitespace before it is added.

        Returns:
            True on success. False when the file does not exist, cannot be
            opened, or is not valid UTF-8; in these cases the pane keeps its
            previous content.
        """
        fileDevice = QFile(fileName)
        if not fileDevice.exists():
            return False
        if not fileDevice.open(QIODevice.ReadOnly | QIODevice.Text):
            return False

        lines = []
        try:
            while not fileDevice.atEnd():
                qtBytes = fileDevice.readLine()   # QByteArray
                pyBytes = bytes(qtBytes.data())   # QByteArray -> bytes
                lines.append(pyBytes.decode("utf-8").strip())
        except UnicodeDecodeError:
            # Report the failure to the caller instead of raising inside a Qt slot.
            return False
        finally:
            fileDevice.close()

        # Touch the widget only once the whole file was decoded successfully.
        self.rules_content.clear()
        for lineStr in lines:
            self.rules_content.append(lineStr)
        return True

    def on_actQFile_Open_triggered(self):
        """Ask the user for a file and load it into the file-content pane."""
        curPath = QDir.currentPath()
        title = "打开一个文件"
        filt = "文本文件(*.txt);;csv文件(*.csv);;程序文件(*.h *.py);;所有文件(*.*)"  # file filter
        fileName, flt = QFileDialog.getOpenFileName(self, title, curPath, filt)
        if fileName == "":
            return  # dialog was cancelled
        if self.__openByIODevice(fileName):
            self.file_name_path.setText(fileName)
            print("文件已打开!")
        else:
            print("错误", "打开文件失败")

    @staticmethod
    def _write_text(fileName, text):
        """Write ``text`` to ``fileName`` as UTF-8 via QFile.

        Returns:
            True when the file could be opened and the write did not report
            an error, otherwise False.
        """
        fileDevice = QFile(fileName)
        if not fileDevice.open(QIODevice.WriteOnly | QIODevice.Text):
            return False
        try:
            # QIODevice.write returns -1 when an error occurred.
            return fileDevice.write(text.encode("utf-8")) != -1
        finally:
            fileDevice.close()

    def __saveByIODevice(self, fileName):
        """Save the file-content pane to ``fileName``; return True on success."""
        return self._write_text(fileName, self.rules_content.toPlainText())

    def saveByIODevice(self, fileName):
        """Save the segmentation result to ``fileName``; return True on success."""
        return self._write_text(fileName, self.file_content.toPlainText())

    def saveByIODevice2(self, fileName):
        """Save the part-of-speech result to ``fileName``; return True on success."""
        return self._write_text(fileName, self.finishi_content.toPlainText())

    def _save_via_dialog(self, writer):
        """Ask for a target file and call ``writer(fileName)``, printing the outcome."""
        curPath = QDir.currentPath()  # current working directory
        title = "另存为一个文件"        # dialog title
        fileName, flt = QFileDialog.getSaveFileName(self, title, curPath, self._SAVE_FILTER)
        if fileName == "":
            return  # dialog was cancelled
        if writer(fileName):
            print("文件保存成功:", fileName)
        else:
            print("错误", "保存文件失败")

    def on_actQFile_Save_triggered(self):
        """Save the segmentation result to a file chosen by the user."""
        self._save_via_dialog(self.saveByIODevice)

    def on_actQFile_Save_triggered2(self):
        """Save the part-of-speech result to a file chosen by the user."""
        self._save_via_dialog(self.saveByIODevice2)

    def QFile_Save(self):
        """Write the file-content pane back to the file that is currently open."""
        fileName = self.file_name_path.text()
        if fileName == "":
            print("你没有打开任何文件......")
            return  # nothing to save to; do not attempt a write with an empty path
        if self.__saveByIODevice(fileName):
            print("保存成功!\n文件路径:", fileName)
        else:
            print("保存文件失败")

    def help_info(self):
        """Show the help dialog."""
        QMessageBox.about(self, "提示信息:", "分词模块主要利用jieba分词工具对文本数据进行分词处理,同时还加入了去停用词功能,即去掉一些无意义的词条,支持自定义停用词词表!这一个大模块的功能非常重要,因为词频统计、LDA主题模型等多个方面都是基于词语展开的!同时分词功能中还加入了多个模式的分词。")

    def jieba_cut(self):
        """Segment the loaded text, drop stop words and tag parts of speech.

        The space-separated segmentation is written to the result pane and
        one ``word,flag`` line per token to the part-of-speech pane. Both the
        text and the stop-word list must be non-empty; otherwise a hint is
        printed and nothing is processed.
        """
        self.finishi_content.clear()
        self.file_content.clear()

        line = self.rules_content.toPlainText()
        stop_text = self.stop_words_content.toPlainText()
        if line == "":
            print("待处理文件为空!!!")
            return
        if stop_text == "":
            print("请设置停用词列表,可点击默认列表使用系统列表!")
            return

        stop_words = set(stop_text.split())

        # Events are processed while tagging (see below), so block a second
        # click from re-entering this method until the current run is done.
        self.start_clean.setEnabled(False)
        try:
            kept = [
                word for word in jieba.cut(line)
                if word not in stop_words and word != '\t' and word != " "
            ]
            # Every kept token is followed by one space.
            cut_re = "".join(word + " " for word in kept)

            # Tagging and the printed segmentation both run on the filtered
            # text with the separating spaces removed again.
            compact = cut_re.replace(" ", "")
            all_words = " ".join(jieba.cut(compact))

            for word, flag in jieba.posseg.cut(compact):
                # Let Qt handle pending events so the window stays responsive.
                QApplication.processEvents()
                self.finishi_content.append(word + ',' + flag)

            print(all_words)
            self.file_content.append(cut_re)
        finally:
            self.start_clean.setEnabled(True)

    def default_cut_list(self):
        """Load the bundled stop-word list into the stop-word pane.

        The file ``NLPIR_stopwords.txt`` is looked up relative to the current
        working directory. If it cannot be read, a message is printed and the
        pane is left unchanged.
        """
        filename = "NLPIR_stopwords.txt"
        try:
            with open(filename, "r", encoding='UTF-8') as pf:
                stop_words = pf.read()
        except (OSError, UnicodeDecodeError):
            print("默认停用词表读取失败:", filename)
            return
        self.stop_words_content.setText(stop_words)
        self.selected = False


class Stream(QObject):
    """Minimal stand-in for ``sys.stdout`` that forwards written text as a Qt signal."""

    newText = pyqtSignal(str)

    def write(self, text):
        """Emit ``text`` through ``newText``."""
        self.newText.emit(str(text))

    def flush(self):
        """Do nothing; present because callers of a stdout object may flush it."""


if __name__ == '__main__':
    app = QApplication(sys.argv)
    ex = Jieba_Main_Window()
    sys.exit(app.exec_())