大家都知道 Python 在数据处理、数据分析和机器学习等方面的强大之处。那么,如何使用 Python 实现 Web 方式的 HTTP 应用和服务呢?其实有很多种方式:比如利用 Gradio、Streamlit 实现 Web 应用,利用 FastAPI 实现 Web 服务等(详见之前的大模型应用文章)。本文采用 Tornado 框架实现 HTTP Web 服务,并结合自然语言处理(NLP)中分词和实体识别的需求,分别说明如何提供相应的 HTTP 服务。具体如下:
运行环境:Python 3.10;第三方库 tornado、jieba;标准库 time、logging 等
运行命令:python httpServer_nlp.py
调用方法(GET 或 POST;search_type=1 返回分词结果,search_type=2 返回实体识别结果):http://localhost:8082/cutsegment?content=油气勘探开发文档的语义分析及提取方法的研究与实现三个方面研究&search_type=1
识别结果(分词):{"cut": ["油气勘探", "开发", "文档", "语义", "分析", "提取", "方法", "研究", "三个", "研究"], "entities": [], "returncode": 0, "message": "ok", "runtime": 0.3878319263458252}
识别结果(实体):{"cut": [], "entities": ["方法", "语义", "分析", "提取", "实现", "文档", "方面"], "returncode": 0, "message": "ok", "runtime": 0.4005763530731201}
import json
import logging
import os
import re
import sys
import time
from collections import OrderedDict

import jieba
import tornado.httpserver
import tornado.ioloop
import tornado.options
import tornado.web
from jieba import analyse
from tornado.options import define, options
# Logging configuration: records at INFO level or above are appended to
# ./log/service.log.
logger = logging.getLogger()  # no name given, so this is the root logger
logger.setLevel(logging.INFO)
# FileHandler opens the file right away (delay=False) and raises
# FileNotFoundError when the directory does not exist, so create it first.
os.makedirs('./log', exist_ok=True)
fileHandler = logging.FileHandler(filename='./log/service.log', mode='a+', encoding='utf-8', delay=False)
# Record layout: timestamp - source file[line] - thread name - level: message
format_option = '%(asctime)s - %(filename)s[line:%(lineno)d] - %(threadName)s - %(levelname)s: %(message)s'
fileHandler.setFormatter(logging.Formatter(format_option))
logger.addHandler(fileHandler)
# Usage example: logger.info("result:{}".format(output))

# Port the HTTP service listens on; override with --port=<n> on the command line.
define("port", default=8082, help="--port", type=int)
# Segmentation resources shared by every call; filled on first use.
_SEGMENT_RESOURCES = {}


def _segment_resources():
    """Load the jieba user dictionary and the stop-word set once, then reuse them."""
    if not _SEGMENT_RESOURCES:
        jieba.load_userdict("./data/StopWord/user_dict.txt")
        # Stored only after both loads succeeded, so a failed first attempt is
        # retried on the next call instead of leaving a half-initialised cache.
        _SEGMENT_RESOURCES["stop_words"] = get_stop_words()
    return _SEGMENT_RESOURCES


def get_kg_result_0(text):
    """Segment *text* into words and return the filtered tokens plus any dates.

    Steps:
      1. Make sure the custom jieba dictionary and the stop words are loaded.
         They are loaded once per process rather than on every request, so
         edits to those files take effect after a restart.
      2. Mask date expressions with ``get_date`` so that they are not split
         apart by the tokenizer.
      3. Cut the masked text with jieba (precise mode) and filter the tokens
         with ``remove_special_tokens``.

    The surviving tokens are echoed to stdout.

    Args:
        text: the raw text to segment.

    Returns:
        A list with the kept tokens followed by the date strings that
        ``get_date`` found in *text*.
    """
    stop_words = _segment_resources()["stop_words"]
    masked_text, dates = get_date(text)
    tokens = remove_special_tokens(jieba.cut(masked_text, cut_all=False), stop_words)
    print('\n\n1.加载自定义分词词典:\n' + "/ ".join(tokens))
    return tokens + dates
def get_entity_0(text):
    """Extract keywords ("entities") from *text* and return them as a list.

    The extraction is done by ``jieba.analyse.textrank`` (TextRank, not
    TF-IDF) called with its default arguments. A marker line and the keyword
    list are echoed to stdout before the list is returned.
    """
    print('抽取前多少的关键词' + '//')
    keywords = analyse.textrank(text)
    print(keywords)
    return keywords
def get_stop_words(path=r'./data/StopWord/NLPIR_stopwords.txt'):
    """Read the stop-word file and return its lines as a set.

    Args:
        path: UTF-8 text file with one stop word per line.

    Returns:
        The set of lines in the file. A trailing newline in the file yields
        an empty-string entry, exactly as splitting on ``'\\n'`` produces.

    Raises:
        OSError: if the file cannot be opened.
    """
    # The context manager closes the handle even if reading fails.
    with open(path, 'r', encoding='utf-8') as stop_file:
        return set(stop_file.read().split('\n'))
def remove_special_tokens(words, stwlist):
    """Filter segmentation output, keeping only meaningful tokens.

    A token is dropped when it
      * is contained in *stwlist* (stop word),
      * is shorter than two characters, or
      * consists only of whitespace (for example a "\\r\\n" token).

    The whitespace rule replaces a former ``== " "`` comparison that could
    never fire, because a single space had already been removed by the
    length rule.

    Args:
        words: iterable of token strings (e.g. the generator from jieba.cut).
        stwlist: container of stop words supporting ``in``.

    Returns:
        A new list with the remaining tokens in their original order.
    """
    # One pass over the tokens; no index-based pop() on a shrinking list.
    return [
        token
        for token in words
        if token not in stwlist and len(token) > 1 and token.strip()
    ]


# Dates are matched as whole expressions, which has to happen before
# segmentation (see get_date).
# Date formats recognised: 2023-5-6, 2023年5月6日 and 2023/5/6.
_DATE_PATTERN = re.compile(
    r'\d{4}-\d{1,2}-\d{1,2}|\d{4}年\d{1,2}月\d{1,2}日|\d{4}/\d{1,2}/\d{1,2}'
)


def get_date(content):
    """Pull date expressions out of *content* and mask them in the text.

    Every match is replaced by the rarely used character "灥", so that a
    date is handled as one unit and is not split by later processing.

    The masking is done with a single regex substitution. Replacing each
    found string with ``str.replace`` would corrupt the text when one date
    is a prefix of another (e.g. "2020-1-1" inside "2020-1-12").

    Args:
        content: the text to scan.

    Returns:
        A tuple ``(masked_content, dates)`` where *dates* lists the matched
        date strings in order of appearance.
    """
    dates = _DATE_PATTERN.findall(content)
    masked_content = _DATE_PATTERN.sub("灥", content)
    return masked_content, dates
# Tornado handler: parse the request, run the NLP step, write a JSON reply.
class MainGetHandler(tornado.web.RequestHandler):
    """Serve word-segmentation and keyword-extraction requests (GET and POST).

    Request parameters (GET arguments, or keys of a JSON object body for POST):
        content:     text to analyse.
        search_type: 1 = word segmentation, 2 = keyword ("entity") extraction.
        uuid:        optional caller id, only used in error logs
                     (default "000000").

    Response: a JSON object with the keys ``cut``, ``entities``,
    ``returncode``, ``message`` and ``runtime`` (seconds spent handling).

    Return codes:
        0      success
        10000  search_type is missing, unparsable or not supported
        10001  search_type is not supported and no content was supplied
        10002  search_type 1: blank content, or processing failed
        10003  search_type 2: blank content, or processing failed
    """

    # Supported search types -> (return code, message used for blank content).
    _ERRORS_BY_TYPE = {
        1: (10002, "when search_type is 1, content not null"),
        2: (10003, "when search_type is 2, content not null"),
    }

    def _read_params(self, mode):
        """Return ``(content, search_type, uuid)`` for a GET or POST request.

        Malformed input never raises: an unparsable JSON body is treated as an
        empty object and an unparsable search_type becomes 0, so that
        ``recog`` can answer with a regular error code instead of an HTTP 500.
        """
        if mode == "get":
            sub = self.get_argument("content", None)
            raw_type = self.get_argument("search_type", 0)
            uid = self.get_argument("uuid", "000000")
        else:
            try:
                data = json.loads(self.request.body.decode())
            except ValueError:  # covers invalid JSON and undecodable bytes
                data = None
            if not isinstance(data, dict):
                data = {}
            sub = data.get("content")
            raw_type = data.get("search_type", 0)
            uid = data.get("uuid", "000000")

        try:
            search_type = int(raw_type)
        except (TypeError, ValueError):
            search_type = 0
        return sub, search_type, uid

    def recog(self, mode="get"):
        """Handle one request and write the JSON result.

        Args:
            mode: "get" to read request arguments, anything else to read a
                JSON body (POST).
        """
        sub, search_type, uid = self._read_params(mode)

        result = OrderedDict()
        returncode = 0
        message = "ok"
        output = {}
        entity = {}
        start = time.time()

        if search_type not in self._ERRORS_BY_TYPE:
            # Only types 1 and 2 are implemented; everything else is an error.
            if sub is None:
                returncode, message = 10001, "data is null"
            else:
                returncode, message = 10000, "search_type is error"
        elif not isinstance(sub, str) or not sub.strip():
            returncode, message = self._ERRORS_BY_TYPE[search_type]
        else:
            try:
                if search_type == 1:
                    output = get_kg_result_0(sub)
                    entity = []
                else:
                    output = []
                    entity = get_entity_0(sub)
            except Exception:
                # Request boundary: record the traceback and report a service
                # error to the caller rather than failing the whole request.
                logger.exception(
                    "search_type=%s uuid=%s: processing failed", search_type, uid
                )
                returncode = self._ERRORS_BY_TYPE[search_type][0]
                message = "service error"

        result["cut"] = output
        result["entities"] = entity
        result["returncode"] = returncode
        result["message"] = message
        result["runtime"] = time.time() - start
        logger.info("result:{}".format(result))

        # write() only sets a JSON content type for dicts; the body here is a
        # pre-serialised string (to keep non-ASCII text readable), so set it.
        self.set_header("Content-Type", "application/json; charset=UTF-8")
        self.write(json.dumps(result, ensure_ascii=False))
        self.finish()

    def get(self):
        """Handle GET requests."""
        self.recog(mode="get")

    def post(self):
        """Handle POST requests."""
        self.recog(mode="post")
# Program entry point: start the HTTP server.
if __name__ == "__main__":
    sys.path.append("../")  # adds the parent directory to the module search path
    # Parse the command line before reading options.port, otherwise a
    # --port=<n> argument is ignored by the start-up message below.
    tornado.options.parse_command_line()
    # The route has to match the location configured in nginx, if one is used.
    application = tornado.web.Application([(r"/cutsegment", MainGetHandler)])
    http_server = tornado.httpserver.HTTPServer(application)
    http_server.listen(options.port)
    print("Server is listening,Port:" + str(options.port) + " ...")
    # IOLoop.instance() is a deprecated alias of IOLoop.current().
    tornado.ioloop.IOLoop.current().start()