import re

def split_text_into_batches(text, max_tokens_per_batch):
    # Regular expression that splits sentences at Chinese sentence-ending punctuation
    sentence_splitter = re.compile(r'(?<=[。!?])')
    # Split the text into sentences
    sentences = [sentence.strip() for sentence in sentence_splitter.split(text) if sentence.strip()]
    # Initialize accumulators
    batches = []
    current_batch = ""
    for sentence in sentences:
        # Character count is used as a rough proxy for token count
        if len(current_batch) + len(sentence) <= max_tokens_per_batch:
            current_batch += sentence + " "
        else:
            # Find the punctuation mark closest to the max_tokens_per_batch limit
            last_punctuation_index = max(current_batch.rfind('。'), current_batch.rfind('!'), current_batch.rfind('?'))
            # If there is no punctuation within the limit, split at the last space
            split_index = last_punctuation_index if last_punctuation_index != -1 else current_batch.rfind(' ')
            if split_index == -1:
                # No punctuation or space at all: flush the whole batch as-is
                batches.append(current_batch.strip())
                current_batch = sentence + " "
            else:
                # Close the batch at the split index, keeping the split character
                batches.append(current_batch[:split_index + 1].strip())
                # Start the new batch with any leftover text plus the current sentence
                current_batch = current_batch[split_index + 1:].lstrip() + sentence + " "
    if current_batch.strip():  # Avoid appending an empty string to the batches
        batches.append(current_batch.strip())
    return batches

text = ""
max_tokens_per_batch = 20
batches = split_text_into_batches(text, max_tokens_per_batch)
print("Batches:", batches)
import re
import string
import nltk
import jieba
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the NLTK resources used below: punkt for word_tokenize,
# stopwords for the English stopword list
nltk.download('punkt')
nltk.download('stopwords')
def clean_html_tags(text):
    # Strip HTML tags such as <p> or <br>
    clean_text = re.sub(r'<.*?>', '', text)
    return clean_text

def remove_links(text):
    # Drop http/https URLs
    clean_text = re.sub(r'http\S+', '', text)
    return clean_text

def remove_special_characters(text):
    # Note: string.punctuation covers ASCII punctuation only;
    # Chinese punctuation such as 。!? is left untouched
    clean_text = ''.join(char for char in text if char not in string.punctuation)
    return clean_text

def remove_extra_whitespace(text):
    # Collapse runs of whitespace into single spaces
    clean_text = ' '.join(text.split())
    return clean_text

def remove_stopwords(text):
    # English stopword removal; on Chinese text this only affects
    # embedded English words
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    clean_text = ' '.join(word for word in word_tokens if word.lower() not in stop_words)
    return clean_text

def clean_chinese_text(text):
    # Strip HTML tags
    cleaned_text = clean_html_tags(text)
    # Remove links
    cleaned_text = remove_links(cleaned_text)
    # Remove special characters
    cleaned_text = remove_special_characters(cleaned_text)
    # Remove extra whitespace
    cleaned_text = remove_extra_whitespace(cleaned_text)
    # Remove stopwords
    cleaned_text = remove_stopwords(cleaned_text)
    # Segment with jieba
    word_list = jieba.lcut(cleaned_text)
    # Join the tokens into the cleaned text
    cleaned_text = ' '.join(word_list)
    return cleaned_text

input_text = ""
cleaned_text = clean_chinese_text(input_text)
print(cleaned_text)
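
# A minimal usage sketch. The sample string below is an assumption for
# illustration, not from the original, which leaves `input_text` empty.
sample_input = '<p>欢迎访问 https://example.com 了解更多!</p>'  # hypothetical input
print(clean_chinese_text(sample_input))
# Expected shape of the output: jieba tokens joined by spaces, with the HTML
# tag, the link, and ASCII punctuation stripped; the full-width ! survives
# because string.punctuation is ASCII-only. Exact spacing may vary, since
# jieba also emits whitespace tokens.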