import re

def split_text_into_batches(text, max_tokens_per_batch):
    # Regular expression that splits sentences at Chinese sentence-ending punctuation
    sentence_splitter = re.compile(r'(?<=[。!?])')
    # Split the text into sentences
    sentences = [sentence.strip() for sentence in sentence_splitter.split(text) if sentence.strip()]
    # Initialize accumulators
    batches = []
    current_batch = ""
    for sentence in sentences:
        # Character count is used as a rough proxy for token count
        if len(current_batch) + len(sentence) <= max_tokens_per_batch:
            current_batch += sentence + " "
        else:
            # Find the punctuation mark closest to the max_tokens_per_batch limit
            last_punctuation_index = max(current_batch.rfind('。'), current_batch.rfind('!'), current_batch.rfind('?'))
            # If there is no punctuation within the limit, split at the last space
            split_index = last_punctuation_index if last_punctuation_index != -1 else current_batch.rfind(' ')
            if split_index == -1:
                # No punctuation or space at all: flush the whole batch as-is
                batches.append(current_batch.strip())
                current_batch = sentence + " "
            else:
                # Close the batch at the split index, keeping the split character
                batches.append(current_batch[:split_index + 1].strip())
                # Start the new batch with any leftover text plus the current sentence
                current_batch = current_batch[split_index + 1:].lstrip() + sentence + " "
    if current_batch.strip():  # Avoid appending an empty string to the batches
        batches.append(current_batch.strip())
    return batches

text = ""
max_tokens_per_batch = 20
batches = split_text_into_batches(text, max_tokens_per_batch)
print("Batches:", batches)
import re
import string
import nltk
import jieba
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the NLTK resources used below: punkt for word_tokenize,
# stopwords for the English stopword list
nltk.download('punkt')
nltk.download('stopwords')
def clean_html_tags(text):
    # Strip HTML tags such as <p> or <br>
    clean_text = re.sub(r'<.*?>', '', text)
    return clean_text

def remove_links(text):
    # Drop http/https URLs
    clean_text = re.sub(r'http\S+', '', text)
    return clean_text

def remove_special_characters(text):
    # Note: string.punctuation covers ASCII punctuation only;
    # Chinese punctuation such as 。!? is left untouched
    clean_text = ''.join(char for char in text if char not in string.punctuation)
    return clean_text

def remove_extra_whitespace(text):
    # Collapse runs of whitespace into single spaces
    clean_text = ' '.join(text.split())
    return clean_text

def remove_stopwords(text):
    # English stopword removal; on Chinese text this only affects
    # embedded English words
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    clean_text = ' '.join(word for word in word_tokens if word.lower() not in stop_words)
    return clean_text

def clean_chinese_text(text):
    # Strip HTML tags
    cleaned_text = clean_html_tags(text)
    # Remove links
    cleaned_text = remove_links(cleaned_text)
    # Remove special characters
    cleaned_text = remove_special_characters(cleaned_text)
    # Remove extra whitespace
    cleaned_text = remove_extra_whitespace(cleaned_text)
    # Remove stopwords
    cleaned_text = remove_stopwords(cleaned_text)
    # Segment with jieba
    word_list = jieba.lcut(cleaned_text)
    # Join the tokens into the cleaned text
    cleaned_text = ' '.join(word_list)
    return cleaned_text

input_text = ""
cleaned_text = clean_chinese_text(input_text)
print(cleaned_text)
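
# A minimal usage sketch. The sample string below is an assumption for
# illustration, not from the original, which leaves `input_text` empty.
sample_input = '<p>欢迎访问 https://example.com 了解更多!</p>'  # hypothetical input
print(clean_chinese_text(sample_input))
# Expected shape of the output: jieba tokens joined by spaces, with the HTML
# tag, the link, and ASCII punctuation stripped; the full-width ! survives
# because string.punctuation is ASCII-only. Exact spacing may vary, since
# jieba also emits whitespace tokens.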