文本预处理(text preprocess)总结

在任何机器学习任务中，清理（cleaning ）或预处理（preprocessing）数据与模型构建同样重要，甚至更重要。当涉及文本等非结构化数据时，这个过程就更加重要。

1. 小写化(Lower Casing)

小写是一种常见的文本预处理技术。这个想法是将输入文本转换为相同的大小写格式，以便以相同的方式处理 'text'、'Text' 和 'TEXT'。

    def lower_casing(self, text):
        """Return a lower-cased copy of ``text``.

        Normalising case lets 'text', 'Text' and 'TEXT' be treated alike.
        """
        lowered = text.lower()
        return lowered

2. 删除标点符号(Removal of Punctuations)

另一种常见的文本预处理技术是从文本数据中删除标点符号。这同样是一个文本标准化过程，有助于以相同的方式处理“hurray”和“hurray！”。

    # The enclosing class is expected to define the characters to strip:
    #     PUNCT_TO_REMOVE = """!"#$%&\'()*+,-./:;<=>?@[\\]^_{|}~`‘"""
    def remove_punctuation(self, text):
        """Return ``text`` with every character of ``self.PUNCT_TO_REMOVE`` deleted."""
        # A translation table that maps each listed character to None removes
        # all of them in a single pass over the string.
        deletion_table = str.maketrans('', '', self.PUNCT_TO_REMOVE)
        return text.translate(deletion_table)

3. 删除停用词(Removal of stopwords)

停用词是语言中常见的单词，如“the”、“a”等。大多数时候它们可以从文本中删除，因为它们不为下游分析提供有价值的信息。在像词性标记这样的情况下，我们不应该删除它们，因为它们提供了有关 POS 的非常有价值的信息。

    def remove_stopwords(self, text):
        """Return ``text`` without the tokens contained in ``self.STOPWORDS``.

        ``text`` is coerced with ``str()`` and split on whitespace; the
        remaining tokens are joined back with single spaces. Tokens are
        compared as-is, so matching is case-sensitive.
        """
        kept = [word for word in str(text).split() if word not in self.STOPWORDS]
        return " ".join(kept)

4. 删除常用词(Removal of Frequent words)

在前面的预处理步骤中，我们根据语言信息删除了停用词。但是，如果我们有一个特定领域的语料库，我们可能还会有一些对我们来说不太重要的频繁出现的单词。

所以这一步就是去除给定语料库中的频繁出现的单词。如果我们使用 tfidf 之类的东西，就会自动解决这个问题。

from collections import Counter
# Count how often each whitespace-separated token occurs across the corpus.
cnt = Counter()
for text in df["text"].values:
    # Counter.update adds one to the count of every token in the iterable.
    cnt.update(text.split())

# Inspect the ten most frequent tokens (displayed when run in a notebook).
cnt.most_common(10)

5. 删除不经常用的词(Removal of Rare words)

这与之前的预处理步骤非常相似，但我们将从语料库中删除稀有单词。

n_rare_words = 10
# most_common() is ordered by descending count, so this reversed slice picks
# the last n_rare_words entries, i.e. the least frequent tokens.
RAREWORDS = {w for (w, wc) in cnt.most_common()[:-n_rare_words - 1:-1]}


def remove_rarewords(text):
    """Return ``text`` without the tokens contained in ``RAREWORDS``.

    ``text`` is coerced with ``str()`` and split on whitespace; the remaining
    tokens are joined back with single spaces.
    """
    return " ".join(word for word in str(text).split() if word not in RAREWORDS)


df["text"] = df["text"].apply(remove_rarewords)

6. 词干提取(Stemming)

词干提取是将词形变化（或有时派生）的单词还原为其词干、基本形式或词根形式的过程。

例如，如果语料库中有两个单词 walks 和 walking，那么词干提取会去除它们的后缀，使它们都变成 walk。但在另一个例子中，我们有两个单词 console 和 consoling，词干分析器将删除后缀并使它们成为 consol，这不是一个正确的英语单词。

有多种类型的词干算法可用，其中最著名的一种是广泛使用的 porter 词干分析器。我们可以使用 nltk 包来实现同样的目的。

    # The enclosing class is expected to create the stemmer in __init__:
    #     self.stemmer = PorterStemmer()
    def stem_words(self, text):
        """Return ``text`` with every token replaced by ``self.stemmer.stem(token)``.

        ``text`` is split on whitespace and the stemmed tokens are joined back
        with single spaces.
        """
        return " ".join(self.stemmer.stem(word) for word in text.split())

7. 词形还原(Lemmatization)

词形还原与词干提取类似，都是将词形变化的单词还原为词根形式，但不同之处在于它确保还原得到的词根（也称为词元，lemma）属于该语言。

因此，这一过程通常比词干提取过程慢。因此，根据速度要求，我们可以选择使用词干提取或词形还原。

让我们使用 nltk 中的 WordNetLemmatizer 来对句子进行词形还原

    # The enclosing class is expected to create the lemmatizer in __init__:
    #     self.lemmatizer = WordNetLemmatizer()
    def lemmatize_words(self, text):
        """Return ``text`` with every token passed through ``self.lemmatizer.lemmatize``.

        ``text`` is split on whitespace and the lemmatized tokens are joined
        back with single spaces. No part-of-speech hint is supplied, so the
        lemmatizer's default is used for every token.
        """
        return " ".join(self.lemmatizer.lemmatize(word) for word in text.split())

8. 删除表情符号(Removal of Emojis)

随着社交媒体平台的使用越来越多，表情符号在我们日常生活中的使用也呈爆炸式增长。也许我们可能需要删除这些表情符号以进行一些文本分析。

感谢下面引用的这段代码，它提供了一个辅助函数，可以从我们的文本中删除表情符号。

# Reference : https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(string):
    """Return ``string`` with emoji and pictographic symbols removed.

    Each run of characters from the Unicode ranges listed below is deleted.
    The referenced snippet used one range from U+24C2 to U+1F251, which also
    covers CJK ideographs, Hangul and most other non-Latin scripts and
    therefore erased ordinary text; it is narrowed here to symbol blocks only.
    """
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"  # dingbats
        u"\U000024C2"             # circled latin capital letter M
        u"\U00002600-\U000026FF"  # miscellaneous symbols
        u"\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
        u"\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumerics/ideographs
        "]+",
        flags=re.UNICODE,
    )
    return emoji_pattern.sub(r'', string)


remove_emoji("game is on 🔥🔥")

9. 删除颜文字(Removal of Emoticons)

https://github.com/NeelShah18/emot/blob/master/emot/emo_unicode.py

def remove_emoticons(text):
    """Return ``text`` with every emoticon listed in ``EMOTICONS`` removed.

    The keys of ``EMOTICONS`` are joined with ``|`` into a single regular
    expression and every match is deleted.
    """
    # NOTE(review): the keys are used verbatim as regex alternatives, so they
    # must already be valid (escaped) patterns. Presumably EMOTICONS is the
    # mapping from the emot project linked above, which ships escaped keys;
    # verify before using a different table.
    emoticon_pattern = re.compile(u'(' + u'|'.join(EMOTICONS) + u')')
    return emoticon_pattern.sub(r'', text)


remove_emoticons("Hello :-)")

10. 替换或删除Http url

     def remove_urls(self, text):
         """Return ``text`` with every http(s):// or www. URL deleted.

         A URL is matched up to the next whitespace character.
         """
         url_pattern = re.compile(r'https?://\S+|www\.\S+')
         return url_pattern.sub(r'', text)

     def replace_http_url(self, text, word="urladd"):
         """Return ``text`` with every http(s):// or www. URL replaced by ``word``.

         A URL is matched up to the next whitespace character.
         """
         return re.sub(r'https?://\S+|www\.\S+', word, text)

11. 替换邮件地址

    def replace_email_id(self, text, word="emailadd"):
        """Return ``text`` with each e-mail address swapped for ``word``."""
        email_pattern = re.compile(r"([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,})")
        return email_pattern.sub(word, text)

12. 替换数字

主要是为了降低输入的维度

    def replace_digit(self, text, word="digitadd"):
        """Return ``text`` with every run of digits replaced by ``word``."""
        # Raw string: '\d' in a normal string literal is an invalid escape
        # sequence and triggers a warning on current Python versions.
        return re.sub(r'\d+', word, text)

13. 删掉多余空格和换行

    def remove_extra_space(self, text):
        """Return ``text`` with each run of space characters collapsed to one space.

        Only the space character is affected; tabs and line breaks are kept.
        """
        return re.sub(' +', ' ', text)

    def remove_line_break_space(self, text):
        """Return ``text`` with all whitespace runs collapsed to single spaces.

        ``str.split()`` without arguments splits on any whitespace (spaces,
        tabs, line breaks) and drops leading/trailing whitespace, so the
        result is a single trimmed line.
        """
        return " ".join(text.split())

14. 提取html标签里内容并删除

    def remove_html(self, text):
        """Return the text content of ``text`` with the HTML markup stripped.

        The markup is parsed by BeautifulSoup using the 'html5lib' parser.
        """
        parsed = BeautifulSoup(text, features='html5lib')
        return parsed.text

15. 缩写还原

# import library
import contractions

# contracted text
text = '''I'll be there within 5 min. Shouldn't you be there too? I'd love to see u there my dear. It's awesome to meet new friends.We've been waiting for this day for so long.'''

# Expand every whitespace-separated token on its own with contractions.fix,
# then join the expanded tokens back together with single spaces.
expanded_words = [contractions.fix(word) for word in text.split()]
expanded_text = ' '.join(expanded_words)

print('Original text: ' + text)
print('Expanded_text: ' + expanded_text)

完整代码

from bs4 import BeautifulSoup
import lxml
import re
from nltk.corpus import stopwords
import nltk
from nltk.stem.porter import PorterStemmer
from spellchecker import SpellChecker
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import contractions


class text_preprocessing():
    """Collection of text-cleaning helpers.

    Every public method takes a string and returns a new string. Creating an
    instance calls ``nltk.download`` for the 'stopwords' and 'wordnet'
    resources and builds the stemmer, lemmatizer and spell checker that the
    methods use.
    """

    # Characters deleted by remove_punctuation().
    PUNCT_TO_REMOVE = """!"#$%&\'()*+,-./:;<=>?@[\\]^_{|}~`‘"""

    def __init__(self):
        nltk.download('stopwords')
        self.STOPWORDS = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()
        nltk.download('wordnet')
        self.lemmatizer = WordNetLemmatizer()
        self.spell = SpellChecker()
        # Maps the first letter of a POS tag to the matching WordNet constant.
        self.wordnet_map = {"N": wordnet.NOUN, "V": wordnet.VERB, "J": wordnet.ADJ, "R": wordnet.ADV}

    def expand_contractions(self, text):
        """Return ``text`` after passing it through ``contractions.fix``."""
        return contractions.fix(text)

    def lemmatize_words(self, text):
        """Return ``text`` with every whitespace-separated token lemmatized.

        No part-of-speech hint is passed to the lemmatizer.
        """
        return " ".join(self.lemmatizer.lemmatize(word) for word in text.split())

    def lemmatize_words_position(self, text):
        """Return ``text`` lemmatized with a part-of-speech hint per token.

        Tokens are tagged with ``nltk.pos_tag``; the first letter of each tag
        is looked up in ``self.wordnet_map`` and tokens with an unmapped tag
        are lemmatized as nouns.
        """
        # NOTE(review): nltk.pos_tag needs a tagger model that __init__ does
        # not download -- confirm it is installed in the target environment.
        pos_tagged_text = nltk.pos_tag(text.split())
        return " ".join(
            self.lemmatizer.lemmatize(word, self.wordnet_map.get(pos[0], wordnet.NOUN))
            for word, pos in pos_tagged_text
        )

    def remove_punctuation(self, text):
        """Return ``text`` with every character of ``PUNCT_TO_REMOVE`` deleted."""
        return text.translate(str.maketrans('', '', self.PUNCT_TO_REMOVE))

    def remove_space(self, text):
        """Return ``text`` with each literal ``_x000D_`` replaced by a space."""
        # Presumably an escaped carriage return left over from an Excel
        # export -- verify against the data source.
        return text.replace("_x000D_", " ")

    def remove_extra_space(self, text):
        """Return ``text`` with each run of space characters collapsed to one."""
        return re.sub(' +', ' ', text)

    def remove_line_break_space(self, text):
        """Return ``text`` with all whitespace runs collapsed to single spaces.

        Leading and trailing whitespace is dropped as well.
        """
        return " ".join(text.split())

    def remove_html(self, text):
        """Return the text content of ``text`` parsed as HTML with 'html5lib'."""
        return BeautifulSoup(text, features='html5lib').text

    def lower_casing(self, text):
        """Return a lower-cased copy of ``text``."""
        return text.lower()

    def remove_urls(self, text):
        """Return ``text`` with every http(s):// or www. URL deleted."""
        url_pattern = re.compile(r'https?://\S+|www\.\S+')
        return url_pattern.sub(r'', text)

    def remove_stopwords(self, text):
        """Return ``text`` without the tokens contained in ``self.STOPWORDS``."""
        return " ".join(word for word in str(text).split() if word not in self.STOPWORDS)

    def stem_words(self, text):
        """Return ``text`` with every whitespace-separated token stemmed."""
        return " ".join(self.stemmer.stem(word) for word in text.split())

    def remove_words(self, text, words):
        """Return ``text`` without the tokens contained in ``words``."""
        return " ".join(word for word in str(text).split() if word not in words)

    def correct_spellings(self, text):
        """Return ``text`` with tokens unknown to the spell checker corrected.

        Tokens the checker has no suggestion for are kept unchanged.
        """
        words = text.split()
        misspelled_words = self.spell.unknown(words)
        corrected_text = []
        for word in words:
            if word in misspelled_words:
                # correction() can return None when no candidate exists; keep
                # the original token instead of emitting the string "None".
                suggestion = self.spell.correction(word)
                corrected_text.append(word if suggestion is None else str(suggestion))
            else:
                corrected_text.append(word)
        return " ".join(corrected_text)

    def replace_http_url(self, text, word="urladd"):
        """Return ``text`` with every http(s):// or www. URL replaced by ``word``."""
        return re.sub(r'https?://\S+|www\.\S+', word, text)

    def replace_email_id(self, text, word="emailadd"):
        """Return ``text`` with every e-mail address replaced by ``word``."""
        return re.sub(r"([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,})", word, text)

    def replace_digit(self, text, word="digitadd"):
        """Return ``text`` with every run of digits replaced by ``word``."""
        # Raw string: '\d' in a normal string literal is an invalid escape.
        return re.sub(r'\d+', word, text)


if __name__ == '__main__':
    # A distinct name keeps the class itself from being shadowed.
    preprocessor = text_preprocessing()
    text = """this text is for test"""
    text = preprocessor.replace_email_id(text)
    text = preprocessor.replace_http_url(text)
    text = preprocessor.replace_digit(text)
    text = preprocessor.expand_contractions(text)
    print(text)
    # Optional further steps to try out:
    # text = preprocessor.remove_extra_space(text)
    # print('after removing extra space:', text)
    # old_text = preprocessor.remove_line_break_space(text)
    # print('old_text:', old_text)
    # text = preprocessor.lemmatize_words(old_text)
    # print("lemmatize_words_position:", text)
    # text = preprocessor.stem_words(old_text)
    # print("stem_words:", text)

Pandas 处理文本的代码

import pandas as pd
from pandas import DataFrame
from tabulate import tabulate
from nlp.text_preprocessing_util.text_preprocessing import *

base_dir = "C:/apps/ml_datasets"
# NOTE: this rebinds the imported class name to one shared instance; every
# helper below goes through that instance.
text_preprocessing = text_preprocessing()


def _apply_to_text(data, transform, target="X_orig"):
    """Run ``transform`` on every value of data["X_orig"] and store it in ``target``."""
    data[target] = data["X_orig"].apply(transform)
    return data


def _count_words(data):
    """Return a Counter of whitespace-separated tokens over data["X_orig"]."""
    from collections import Counter

    cnt = Counter()
    for text in data["X_orig"].values:
        cnt.update(text.split())
    return cnt


def get_dataset():
    """Load the e-mail sheet and return a frame with columns X_orig and y_orig.

    X_orig is the subject and the e-mail body joined by a space, y_orig is the
    priority. The frame is also written to ``raw_data.xlsx`` under ``base_dir``.
    """
    data = pd.read_excel(base_dir + '/Support_email_category.xlsx', sheet_name='Emails')
    # An empty body would otherwise turn the concatenation into NaN, which
    # breaks the string methods used by the later steps.
    X_orig = data['Subject'].astype(str) + " " + data['Email content'].fillna('').astype(str)
    y_orig = data['Priority']
    new_data = pd.DataFrame()
    new_data['X_orig'] = X_orig
    new_data['y_orig'] = y_orig
    new_data.to_excel(base_dir + '/raw_data.xlsx', index=False)
    return new_data


def lower_casing(data: DataFrame):
    """Lower-case every text in data["X_orig"]."""
    return _apply_to_text(data, text_preprocessing.lower_casing)


def remove_space(data: DataFrame):
    """Apply ``text_preprocessing.remove_space`` to data["X_orig"]."""
    return _apply_to_text(data, text_preprocessing.remove_space)


def remove_punctuation(data: DataFrame):
    """Strip punctuation from data["X_orig"]."""
    return _apply_to_text(data, text_preprocessing.remove_punctuation)


def remove_stopwords(data: DataFrame):
    """Drop stop words from data["X_orig"]."""
    return _apply_to_text(data, text_preprocessing.remove_stopwords)


def correct_spellings(data: DataFrame):
    """Spell-correct data["X_orig"]."""
    return _apply_to_text(data, text_preprocessing.correct_spellings)


def remove_freqwords(data: DataFrame):
    """Drop the 10 most frequent tokens of the corpus from data["X_orig"]."""
    cnt = _count_words(data)
    FREQWORDS = {w for (w, wc) in cnt.most_common(10)}
    return _apply_to_text(data, lambda text: text_preprocessing.remove_words(text, FREQWORDS))


def remove_rare_words(data: DataFrame):
    """Drop the 10 least frequent tokens of the corpus from data["X_orig"]."""
    cnt = _count_words(data)
    n_rare_words = 10
    # most_common() is sorted by descending count; the reversed slice takes
    # the n_rare_words entries from the tail.
    RAREWORDS = {w for (w, wc) in cnt.most_common()[:-n_rare_words - 1:-1]}
    print("rarewords:", RAREWORDS)
    return _apply_to_text(data, lambda text: text_preprocessing.remove_words(text, RAREWORDS))


def stem_words(data: DataFrame):
    """Stem every token in data["X_orig"]."""
    return _apply_to_text(data, text_preprocessing.stem_words)


def lemmatize_words(data: DataFrame):
    """Lemmatize every token in data["X_orig"] without POS hints."""
    return _apply_to_text(data, text_preprocessing.lemmatize_words)


def lemmatize_words1(data: DataFrame):
    """Lemmatize every token in data["X_orig"] using POS tags."""
    # The class method shown in the full listing above takes only the text
    # and uses the instance's own lemmatizer and POS map; passing extra
    # arguments raised a TypeError.
    return _apply_to_text(data, text_preprocessing.lemmatize_words_position)


def remove_urls(data: DataFrame):
    """Delete URLs from data["X_orig"]."""
    return _apply_to_text(data, text_preprocessing.remove_urls)


def remove_html(data: DataFrame):
    """Strip HTML markup from data["X_orig"]."""
    return _apply_to_text(data, text_preprocessing.remove_html)


def process_abbreviated_words(data: DataFrame):
    """Store an abbreviation-expanded copy of data["X_orig"] in "after X_orig"."""
    # NOTE(review): this originally called chat_words_conversion, which the
    # text_preprocessing class shown in the full listing above does not
    # define. expand_contractions is the closest method it provides --
    # confirm this is the intended step.
    return _apply_to_text(data, text_preprocessing.expand_contractions, target="after X_orig")


def texts_preprocessing(data: DataFrame):
    """Run the full cleaning pipeline on ``data`` and return it.

    Prints the first three rows before and after lemmatization.
    """
    data = remove_space(data)
    data = remove_urls(data)
    data = remove_html(data)
    data = process_abbreviated_words(data)
    data = lower_casing(data)
    data = remove_punctuation(data)
    data = remove_stopwords(data)
    data = remove_freqwords(data)
    data = remove_rare_words(data)
    # Alternative to lemmatization: data = stem_words(data)
    print('before...')
    print(tabulate(data.head(3)))
    # Alternative without POS tags: data = lemmatize_words(data)
    data = lemmatize_words1(data)
    # Optional: data = correct_spellings(data)
    print('after...')
    print(tabulate(data.head(3)))
    return data


def save_file(data, file_name):
    """Write ``data`` to ``file_name`` under ``base_dir`` without the index."""
    data.to_excel(base_dir + '/' + file_name, index=False)


if __name__ == '__main__':
    data = get_dataset()
    data = texts_preprocessing(data)
    save_file(data, 'after_preprocessing.xlsx')