# Source article:
# https://segmentfault.com/a/1190000021947908
import pandas as pd
import jieba
import re
#逻辑回归建模需要的库
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import numpy as np
from pandas import DataFrame

# Sample records standing in for the review data: df1 holds the reviewer name
# and the short review text, df2 holds the rating cell for the same review.
df1 = [{"name": "整儿钱小姐", "short": "少年的你值得一看"}]
df2 = [{"rating": [('50', '力荐')]}]

# pd.merge only accepts DataFrame/Series objects, and the two inputs share no
# column, so they are wrapped in DataFrames and joined on the row index.
# NOTE(review): this assumes row i of df1 and row i of df2 describe the same
# review -- confirm against the real data source.
data = pd.merge(
    DataFrame(df1),
    DataFrame(df2),
    how='outer',
    left_index=True,
    right_index=True,
)
print(data.shape)
# Next step: map each rating to a numeric level.
def rating(e):
    """Convert a raw rating value into a numeric level from 1 to 5.

    The value is converted to text first, so both plain strings and
    containers such as ``[('50', '力荐')]`` are handled. The codes '50', '40',
    '30', '20' and '10' are looked up in that order and the first one found
    decides the level.

    Args:
        e: Raw rating value (any object; it is matched via ``str(e)``).

    Returns:
        An int from 5 down to 1, or None when no known code is present.
    """
    text = str(e)
    # Highest code first, mirroring the order of the original checks.
    for code, level in (('50', 5), ('40', 4), ('30', 3), ('20', 2), ('10', 1)):
        if code in text:
            return level
    return None


data['new_rating'] = data['rating'].map(rating)
print(data.head())

# Drop neutral reviews (level 3). The explicit copy makes new_data an
# independent frame, so the column assignments below do not write into a
# view of `data`.
# NOTE: rows whose level is missing (NaN) pass this filter and, because
# `NaN > 3` is False, end up labelled -1 below.
new_data = data[data['new_rating'] != 3].copy()

# Binary label: +1 for levels above 3, -1 otherwise.
new_data['sentiment'] = new_data['new_rating'].apply(lambda x: +1 if x > 3 else -1)
print(new_data['sentiment'].value_counts())
# Next step: word segmentation.
def cut_word(text):
    """Segment ``text`` with jieba and return the tokens joined by spaces.

    Args:
        text: Value to segment; it is converted with ``str()`` first.

    Returns:
        A single string with the tokens separated by one space each.
    """
    # cut_all=False disables jieba's full mode.
    tokens = jieba.cut(str(text), cut_all=False)
    return " ".join(tokens)


new_data['new_short'] = new_data['short'].apply(cut_word)
# Next step: remove digits.
def remove_num(new_short):
    """Return ``new_short`` with every run of digits deleted."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', new_short)
# Next step: remove letters.
def remove_word(new_short):
    """Return ``new_short`` with every run of ASCII letters deleted.

    Both lower- and upper-case letters are removed; the earlier pattern only
    matched ``a-z`` and therefore left upper-case words in the text.
    """
    return re.sub(r'[A-Za-z]+', '', new_short)


# Clean the segmented text: strip digits first, then letters.
new_data['new_short'] = new_data['new_short'].apply(remove_num)
new_data['new_short'] = new_data['new_short'].apply(remove_word)
# Logistic-regression analysis and modelling follows.
# Step 1: split the prepared data into a training set and a test set.
# (The keyword is `random_state`; the misspelt `random_stat` raised TypeError.)
train_data, test_data = train_test_split(new_data, train_size=0.8, random_state=0)

# Step 2: extract features from the segmented text. The vectorizer is fitted
# on the training text only and then reused for the test text, producing one
# sparse count matrix per set plus the feature names of the matrix columns.
# NOTE(review): CountVectorizer's default token pattern keeps only tokens of
# two or more characters, so single-character words are dropped -- confirm
# that this is intended for the segmented text.
transfer = CountVectorizer()
train_word = transfer.fit_transform(train_data['new_short'])
test_word = transfer.transform(test_data['new_short'])

# Sparse matrix, densified for display.
print('new_data:\n', train_word.toarray())

# Feature names. Newer scikit-learn releases only provide
# get_feature_names_out(); fall back to the older method when it is absent.
if hasattr(transfer, 'get_feature_names_out'):
    feature_names = transfer.get_feature_names_out()
else:
    feature_names = transfer.get_feature_names()
print('feature_name:\n', feature_names)
# Step 3: logistic-regression modelling -- fit the training features against
# the training labels to obtain a model, then evaluate it on the test set.
# Features are the count matrices built from train_data / test_data, so the
# labels are taken from those same frames. This keeps every label on the same
# row as its feature vector without relying on a second, separately seeded
# split producing an identical partition.
x_train = train_word
x_test = test_word
y_train = train_data['sentiment']
y_test = test_data['sentiment']

model = LogisticRegression()
model.fit(x_train, y_train)

y_predict = model.predict(x_test)
# Element-wise comparison of predicted and true labels.
print('布尔比对:\n', y_predict == y_test)
score = model.score(x_test, y_test)
print('模型准确率:\n', score)

# Show a few test rows (positions 50-54) with their text, level and label.
# A bare expression is discarded when run as a script, so print it.
example = test_data[50:55]
print(example[['short', 'new_rating', 'sentiment']])

# Column 1 of predict_proba; columns follow model.classes_, so with the
# -1 / +1 labels used here this is the probability of the positive class.
possibility = model.predict_proba(test_word)[:, 1]

# assign() returns a new frame, avoiding a write into the frame handed back
# by train_test_split.
test_data = test_data.assign(possibility=possibility)
print(test_data.head())