The goal of this chapter is to identify the target user segments, in order to better serve existing users.
【Key Points】
1. Plotting
- Displaying Chinese characters
plt.rcParams['font.sans-serif'] = ['SimHei']  # Step 1: swap in a font that has CJK glyphs
plt.rcParams['axes.unicode_minus'] = False  # Step 2: fix the minus sign on axes with negative values
2. Database operations
- sqlalchemy engine
engine = create_engine('mysql+pymysql://root:123456@localhost:3306/datascience')
3. Batch-reading files
- Usage of os.walk() and os.path.join()
for root, dirs, files in os.walk(path):
    for file in files:
        rfile = os.path.join(root, file)
        if rfile.split('.')[-1] == 'tsv':
            rdf = pd.read_csv(rfile, sep='\t')
            df = df.append(rdf)
4. Combining groupby() and agg() to apply a different function to each column
- Monthly aggregation
affc = {'payment':'sum', 'log_date':'count'}
dfm = df.groupby(['log_month', 'user_id']).agg(affc).reset_index()
- Renaming columns
renam = {'log_date':'access_days'}
dfm.rename(columns=renam, inplace=True)
5. Using KMeans clustering
- Clustering a single column (reshape it into one column with reshape(-1, 1))
from sklearn.cluster import KMeans
a47 = action['A47'].values.reshape(-1, 1)
kms = KMeans(n_clusters=3).fit(a47)
- The cluster labels, via the labels_ attribute
cluster = kms.labels_
- Append the labels to the source data, then inspect the groups with groupby()
action['cluster'] = cluster
action.groupby(['cluster'])['user_id'].count()
- Visualizing the clusters
snsdf = action[['user_id','A47','cluster']].sort_values(by='A47',ascending=False)
plt.figure(figsize=(8,5))
snsdf1 = snsdf.reset_index()
snsdf1[snsdf1['cluster']==2]['A47'].plot(color='r',label='2:重度用户')
snsdf1[snsdf1['cluster']==1]['A47'].plot(color='g',label='1:中度用户')
snsdf1[snsdf1['cluster']==0]['A47'].plot(color='b',label='0:轻度用户')
plt.legend()
plt.xlabel('用户分布')
plt.ylabel('排行榜得分')
6. Principal component analysis
Data preprocessing
- Select the columns to run PCA on
paction = acc.iloc[:,3:(len(acc.columns)-1)]
- Drop columns that are mostly zeros
cc = paction[paction==0].count(axis=0)/len(paction)
cc.plot()
dd = cc[cc<.9]  # drop any column where 90%+ of the values are zero
paction = paction[dd.index]
paction.head()
- Drop strongly correlated columns
# Data overview
corp = paction.corr()
sns.heatmap(corp)
mask = np.array(corp)
mask[np.tril_indices_from(mask)] = False  # trick for drawing a lower-triangle heatmap
sns.heatmap(corp, mask=mask)
# Use a lower-triangle matrix to drop strongly correlated columns
coll = corp.columns
corp = pd.DataFrame(np.tril(corp, -1))  # np.tril(m, -1) keeps the strict lower triangle and zeroes the rest
corp.columns = coll
pac2 = paction.loc[:, (corp.abs() < .8).all()]  # all(): keep columns whose correlations are all below 0.8
pac2.head()
Running the PCA
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(pac2)
redio = pca.explained_variance_ratio_  # the proportion of variance explained by each component
print(redio)
print(pca.singular_values_)  # the singular values
The cumulative explained-variance curve
recu = redio.cumsum()  # cumsum() gives the running total
plt.plot(recu)
Get the reduced data for the next step
pca.set_params(n_components=10)  # keep 10 components
pac3 = pd.DataFrame(pca.fit_transform(pac2))  # fit_transform() fits and returns the transformed data
pac3.head()
- Apply KMeans again to cluster all users, then take the per-cluster mean of each behaviour column
- Apply the correlation filter again to drop strongly correlated columns, leaving the final key indicators
Display the key indicators on a radar chart
# First, standardize the data
from sklearn.preprocessing import scale
ccccc = pd.DataFrame(scale(cccc))
ccccc.columns = cccc.columns
# Plot
plt.figure(figsize=(8,8))
N = ccccc.shape[1]  # number of polar-axis divisions
angles = np.linspace(0, 2*np.pi, N, endpoint=False)  # angles that split the circle evenly
angles = np.concatenate((angles,[angles[0]]))  # close the radar loop
for i in range(len(ccccc)):
    values = ccccc.loc[i,:]  # build the data
    values = np.concatenate((values,[values[0]]))  # close the loop
    plt.polar(angles, values, 'o-', linewidth=2)  # draw
plt.legend(ccccc.index, loc='lower right')
plt.thetagrids(angles[:-1] * 180/np.pi, labels=list(ccccc.columns))  # polar labels (drop the duplicated closing angle so labels match)
plt.title('重要指标雷达图呈现')
I. Library imports and displaying Chinese in matplotlib
import pandas as pd
import numpy as np
import pymysql
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import os

plt.rcParams['font.sans-serif'] = ['SimHei']  # Step 1: swap in a font that has CJK glyphs
plt.rcParams['axes.unicode_minus'] = False  # Step 2: fix the minus sign on axes with negative values
%matplotlib inline
Database engine
engine = create_engine('mysql+pymysql://root:123456@localhost:3306/datascience')
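As a quick usage sketch (assuming the s8_dau table written further down already exists), the engine plugs straight into pandas:

```python
# A minimal sketch: query through the sqlalchemy engine with pandas
dau_db = pd.read_sql('SELECT * FROM s8_dau LIMIT 5', engine)
```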
II. Batch-reading files
def read_files(path):
    df = pd.DataFrame()
    for root, dirs, files in os.walk(path):
        for file in files:
            rfile = os.path.join(root, file)
            if rfile.split('.')[-1] == 'tsv':
                rdf = pd.read_csv(rfile, sep='\t')
                df = df.append(rdf)
    return df
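DataFrame.append was fine on the pandas version used here, but it is deprecated in recent pandas; a sketch of an equivalent reader built on pd.concat (read_files_concat is a hypothetical name):

```python
def read_files_concat(path):
    # Collect every .tsv under path, then concatenate once
    frames = [pd.read_csv(os.path.join(root, file), sep='\t')
              for root, dirs, files in os.walk(path)
              for file in files
              if file.split('.')[-1] == 'tsv']
    return pd.concat(frames, ignore_index=True)
```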
action_path = 'data/sample-data/section8/daily/action/'
dau_path = 'data/sample-data/section8/daily/dau/'
dpu_path = 'data/sample-data/section8/daily/dpu/'
action = read_files(action_path)
dau = read_files(dau_path)
dpu = read_files(dpu_path)
Check data completeness and preview the first rows
print(action.isnull().sum().sum())
print(action.shape)
# print(action.info())
action.head()
0
(2653, 57)
log_date | app_name | user_id | A1 | A2 | A3 | A4 | A5 | A6 | A7 | ... | A45 | A46 | A47 | A48 | A49 | A50 | A51 | A52 | A53 | A54 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2013-10-31 | game-01 | 654133 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 380 | 25655 | 0 | 0 | 0 | 0 | 0.0 | 46 |
1 | 2013-10-31 | game-01 | 425530 | 0 | 0 | 0 | 0 | 10 | 1 | 233 | ... | 19 | 20 | 180543 | 347 | 36 | 22 | 4 | 0 | 0.0 | 71 |
2 | 2013-10-31 | game-01 | 709596 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 416 | 24817 | 0 | 0 | 0 | 0 | 0.0 | 2 |
3 | 2013-10-31 | game-01 | 525047 | 0 | 2 | 0 | 0 | 9 | 0 | 0 | ... | 22 | 22 | 35200 | 6412 | 21 | 0 | 0 | 0 | 0.0 | 109 |
4 | 2013-10-31 | game-01 | 796908 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 29 | 29 | 388 | 25444 | 1 | 0 | 0 | 0 | 0.0 | 64 |
5 rows × 57 columns
print(dau.isnull().sum().sum())
print(dau.shape)
print(dau.info())
dau.head()
0
(509754, 3)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 509754 entries, 0 to 2410
Data columns (total 3 columns):
log_date 509754 non-null object
app_name 509754 non-null object
user_id 509754 non-null int64
dtypes: int64(1), object(2)
memory usage: 15.6+ MB
None
log_date | app_name | user_id | |
---|---|---|---|
0 | 2013-05-01 | game-01 | 608801 |
1 | 2013-05-01 | game-01 | 712453 |
2 | 2013-05-01 | game-01 | 776853 |
3 | 2013-05-01 | game-01 | 823486 |
4 | 2013-05-01 | game-01 | 113600 |
print(dpu.isnull().sum().sum())
print(dpu.shape)
print(dpu.info())
dpu.head()
0
(3532, 4)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3532 entries, 0 to 7
Data columns (total 4 columns):
log_date 3532 non-null object
app_name 3532 non-null object
user_id 3532 non-null int64
payment 3532 non-null int64
dtypes: int64(2), object(2)
memory usage: 138.0+ KB
None
log_date | app_name | user_id | payment | |
---|---|---|---|---|
0 | 2013-05-01 | game-01 | 804005 | 571 |
1 | 2013-05-01 | game-01 | 793537 | 81 |
2 | 2013-05-01 | game-01 | 317717 | 81 |
3 | 2013-05-01 | game-01 | 317717 | 81 |
4 | 2013-05-01 | game-01 | 426525 | 324 |
# Write to the database
# action.to_sql('s8_action', engine, index=False)
# dau.to_sql('s8_dau', engine, index=False)
# dpu.to_sql('s8_dpu', engine, index=False)
III. Data preprocessing
1. Merge DAU and DPU
df = pd.merge(dau, dpu[['log_date','user_id','payment']], how='left', on=['user_id','log_date'])
df.head()
log_date | app_name | user_id | payment | |
---|---|---|---|---|
0 | 2013-05-01 | game-01 | 608801 | NaN |
1 | 2013-05-01 | game-01 | 712453 | NaN |
2 | 2013-05-01 | game-01 | 776853 | NaN |
3 | 2013-05-01 | game-01 | 823486 | NaN |
4 | 2013-05-01 | game-01 | 113600 | NaN |
# Set payment to 0 where there is no purchase record
print(df.payment.isnull().sum())
df['payment'].fillna(0, inplace=True)
print(df.payment.isnull().sum())
507151
0
# Add a paying-user flag
df['is_pay'] = df['payment'].apply( lambda x: 1 if x>0 else 0 )
df.head()
log_date | app_name | user_id | payment | is_pay | |
---|---|---|---|---|---|
0 | 2013-05-01 | game-01 | 608801 | 0.0 | 0 |
1 | 2013-05-01 | game-01 | 712453 | 0.0 | 0 |
2 | 2013-05-01 | game-01 | 776853 | 0.0 | 0 |
3 | 2013-05-01 | game-01 | 823486 | 0.0 | 0 |
4 | 2013-05-01 | game-01 | 113600 | 0.0 | 0 |
2. Monthly aggregation
# Add a month column
df['log_month'] = df['log_date'].apply(lambda x: x[0:7])
df.head()
log_date | app_name | user_id | payment | is_pay | log_month | |
---|---|---|---|---|---|---|
0 | 2013-05-01 | game-01 | 608801 | 0.0 | 0 | 2013-05 |
1 | 2013-05-01 | game-01 | 712453 | 0.0 | 0 | 2013-05 |
2 | 2013-05-01 | game-01 | 776853 | 0.0 | 0 | 2013-05 |
3 | 2013-05-01 | game-01 | 823486 | 0.0 | 0 | 2013-05 |
4 | 2013-05-01 | game-01 | 113600 | 0.0 | 0 | 2013-05 |
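Since log_date is an ISO-formatted string (YYYY-MM-DD), slicing the first 7 characters yields the month. A slightly more robust sketch (assuming the dates parse cleanly) uses pd.to_datetime:

```python
# Alternative: parse the dates, then format the month explicitly
df['log_month'] = pd.to_datetime(df['log_date']).dt.strftime('%Y-%m')
```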
Using groupby and agg together, compute each user's monthly payment total and number of login days.
# Aggregate by month
affc = {'payment':'sum', 'log_date':'count'}
dfm = df.groupby(['log_month', 'user_id']).agg(affc).reset_index()
# Rename the columns
renam = {'log_date':'access_days'}
dfm.rename(columns=renam, inplace=True)
dfm.head()
log_month | user_id | payment | access_days | |
---|---|---|---|---|
0 | 2013-05 | 65 | 0.0 | 1 |
1 | 2013-05 | 115 | 0.0 | 1 |
2 | 2013-05 | 194 | 0.0 | 1 |
3 | 2013-05 | 426 | 0.0 | 4 |
4 | 2013-05 | 539 | 0.0 | 1 |
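On pandas 0.25 and newer, the aggregate-then-rename steps above can be collapsed into a single named-aggregation call; a sketch (dfm2 is a hypothetical name):

```python
# Named aggregation: aggregate and rename in one step (pandas >= 0.25)
dfm2 = (df.groupby(['log_month', 'user_id'])
          .agg(payment=('payment', 'sum'),
               access_days=('log_date', 'count'))
          .reset_index())
```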
3. Use KMeans to find the top-ranked users, i.e. heavy / medium / light users
Column A47 is the ranking score. The distribution shows that most users score very low, consistent with a power-law curve.
action['A47'].hist(bins=50, figsize=(6,4))
<matplotlib.axes._subplots.AxesSubplot at 0x1c21d894240>
sns.distplot(action['A47'],bins=50,kde=True)
<matplotlib.axes._subplots.AxesSubplot at 0x1c21af07a58>
Cluster column A47 into 3 groups.
from sklearn.cluster import KMeans
a47 = action['A47'].values.reshape(-1, 1)  # .values avoids the deprecated Series.reshape
kms = KMeans(n_clusters=3).fit(a47)
cluster = kms.labels_
kms.cluster_centers_
array([[  9359.84787792],
       [ 69386.11297071],
       [185857.17948718]])
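One caveat worth noting: KMeans numbers its clusters arbitrarily, so 0/1/2 only happen to line up with light/medium/heavy on this run. A hedged sketch that derives the ordering from the cluster centers instead (intensity is a hypothetical helper, not used below):

```python
# Rank clusters by center value so rank 0 is always the lightest segment
order = np.argsort(kms.cluster_centers_.ravel())
rank_of = {int(c): r for r, c in enumerate(order)}  # cluster index -> intensity rank
intensity = [rank_of[c] for c in kms.labels_]       # hypothetical, for illustration only
```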
action['cluster'] = cluster
action.head()
log_date | app_name | user_id | A1 | A2 | A3 | A4 | A5 | A6 | A7 | ... | A46 | A47 | A48 | A49 | A50 | A51 | A52 | A53 | A54 | cluster | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2013-10-31 | game-01 | 654133 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 380 | 25655 | 0 | 0 | 0 | 0 | 0.0 | 46 | 0 |
1 | 2013-10-31 | game-01 | 425530 | 0 | 0 | 0 | 0 | 10 | 1 | 233 | ... | 20 | 180543 | 347 | 36 | 22 | 4 | 0 | 0.0 | 71 | 2 |
2 | 2013-10-31 | game-01 | 709596 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 416 | 24817 | 0 | 0 | 0 | 0 | 0.0 | 2 | 0 |
3 | 2013-10-31 | game-01 | 525047 | 0 | 2 | 0 | 0 | 9 | 0 | 0 | ... | 22 | 35200 | 6412 | 21 | 0 | 0 | 0 | 0.0 | 109 | 0 |
4 | 2013-10-31 | game-01 | 796908 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 29 | 388 | 25444 | 1 | 0 | 0 | 0 | 0.0 | 64 | 0 |
5 rows × 58 columns
action.groupby(['cluster'])['user_id'].count()
cluster
0 2096
1 479
2 78
Name: user_id, dtype: int64
As the figure shows, clustering splits the users into 3 groups: cluster 0 is the light users, with the lowest ranking scores; cluster 1 is the medium users, with mid-range scores; cluster 2 is the heavy users, with high scores and the smallest headcount, which matches expectations.
snsdf = action[['user_id','A47','cluster']].sort_values(by='A47',ascending=False)
snsdf['user'] = range(len(snsdf))
sns.scatterplot(x='user',y='A47',hue='cluster',data=snsdf, palette='rainbow', alpha=.2)
<matplotlib.axes._subplots.AxesSubplot at 0x1c21b9bf898>
snsdf = action[['user_id','A47','cluster']].sort_values(by='A47',ascending=False)
snsdf['user'] = range(len(snsdf))
plt.figure(figsize=(8,5))
snsdf1 = snsdf.reset_index()
snsdf1[snsdf1['cluster']==2]['A47'].plot(color='r',label='2:重度用户')
snsdf1[snsdf1['cluster']==1]['A47'].plot(color='g',label='1:中度用户')
snsdf1[snsdf1['cluster']==0]['A47'].plot(color='b',label='0:轻度用户')
plt.legend()
plt.xlabel('用户分布')
plt.ylabel('排行榜得分')
Text(0,0.5,'排行榜得分')
Restrict the data to the top-ranked users, i.e. the higher-scoring heavy and medium users, for the analysis that follows.
acc = action[action['cluster']>=1]
acc.head()
log_date | app_name | user_id | A1 | A2 | A3 | A4 | A5 | A6 | A7 | ... | A46 | A47 | A48 | A49 | A50 | A51 | A52 | A53 | A54 | cluster | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 2013-10-31 | game-01 | 425530 | 0 | 0 | 0 | 0 | 10 | 1 | 233 | ... | 20 | 180543 | 347 | 36 | 22 | 4 | 0 | 0.0 | 71 | 2 |
5 | 2013-10-31 | game-01 | 776120 | 0 | 0 | 0 | 0 | 9 | 0 | 0 | ... | 38 | 142214 | 684 | 37 | 15 | 0 | 0 | 0.0 | 312 | 2 |
7 | 2013-10-31 | game-01 | 276197 | 0 | 0 | 0 | 0 | 7 | 0 | 58 | ... | 15 | 54602 | 4226 | 15 | 0 | 8 | 0 | 0.0 | 95 | 1 |
8 | 2013-10-31 | game-01 | 221572 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 24 | 39891 | 5792 | 4 | 0 | 0 | 0 | 0.0 | 21 | 1 |
9 | 2013-10-31 | game-01 | 692433 | 0 | 0 | 0 | 0 | 6 | 0 | 0 | ... | 28 | 50706 | 4549 | 16 | 8 | 0 | 0 | 0.0 | 154 | 1 |
5 rows × 58 columns
4. Principal component analysis
Select the key feature columns
paction = acc.iloc[:,3:(len(acc.columns)-1)]
paction.index=acc.user_id
paction.head()
A1 | A2 | A3 | A4 | A5 | A6 | A7 | A8 | A9 | A10 | ... | A45 | A46 | A47 | A48 | A49 | A50 | A51 | A52 | A53 | A54 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
user_id | |||||||||||||||||||||
425530 | 0 | 0 | 0 | 0 | 10 | 1 | 233 | 58.25 | 288 | 230 | ... | 19 | 20 | 180543 | 347 | 36 | 22 | 4 | 0 | 0.0 | 71 |
776120 | 0 | 0 | 0 | 0 | 9 | 0 | 0 | 0.00 | 325 | 195 | ... | 19 | 38 | 142214 | 684 | 37 | 15 | 0 | 0 | 0.0 | 312 |
276197 | 0 | 0 | 0 | 0 | 7 | 0 | 58 | 7.25 | 150 | 100 | ... | 15 | 15 | 54602 | 4226 | 15 | 0 | 8 | 0 | 0.0 | 95 |
221572 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0.00 | 40 | 14 | ... | 24 | 24 | 39891 | 5792 | 4 | 0 | 0 | 0 | 0.0 | 21 |
692433 | 0 | 0 | 0 | 0 | 6 | 0 | 0 | 0.00 | 102 | 95 | ... | 15 | 28 | 50706 | 4549 | 16 | 8 | 0 | 0 | 0.0 | 154 |
5 rows × 54 columns
1. Drop columns that are mostly zeros
cc = paction[paction==0].count(axis=0)/len(paction)
print(cc.head())
cc.plot()
A1 1.000000
A2 0.926391
A3 1.000000
A4 0.994614
A5 0.055655
dtype: float64
<matplotlib.axes._subplots.AxesSubplot at 0x1c21bbb1470>
# cc[cc>.8]
dd = cc[cc<.95]  # keep the columns where fewer than 95% of the values are zero
paction = paction[dd.index]
paction.head()
A2 | A5 | A6 | A7 | A8 | A9 | A10 | A11 | A12 | A13 | ... | A45 | A46 | A47 | A48 | A49 | A50 | A51 | A52 | A53 | A54 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
user_id | |||||||||||||||||||||
425530 | 0 | 10 | 1 | 233 | 58.25 | 288 | 230 | 19 | 2 | 19 | ... | 19 | 20 | 180543 | 347 | 36 | 22 | 4 | 0 | 0.0 | 71 |
776120 | 0 | 9 | 0 | 0 | 0.00 | 325 | 195 | 38 | 8 | 19 | ... | 19 | 38 | 142214 | 684 | 37 | 15 | 0 | 0 | 0.0 | 312 |
276197 | 0 | 7 | 0 | 58 | 7.25 | 150 | 100 | 15 | 3 | 11 | ... | 15 | 15 | 54602 | 4226 | 15 | 0 | 8 | 0 | 0.0 | 95 |
221572 | 0 | 1 | 0 | 0 | 0.00 | 40 | 14 | 0 | 0 | 3 | ... | 24 | 24 | 39891 | 5792 | 4 | 0 | 0 | 0 | 0.0 | 21 |
692433 | 0 | 6 | 0 | 0 | 0.00 | 102 | 95 | 0 | 0 | 2 | ... | 15 | 28 | 50706 | 4549 | 16 | 8 | 0 | 0 | 0.0 | 154 |
5 rows × 32 columns
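The masking trick above works because paction[paction==0] turns every non-zero cell into NaN, and count() skips NaN, so it counts the zeros. An equivalent, more direct sketch:

```python
# (paction == 0) is a boolean frame; its column-wise mean is the fraction of zeros
cc_alt = (paction == 0).mean(axis=0)
```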
2. Drop strongly correlated columns
corp = paction.corr()
plt.figure(figsize=(15,8))
sns.heatmap(corp)
<matplotlib.axes._subplots.AxesSubplot at 0x1c21bc094a8>
Drawing a lower-triangle heatmap: the functions involved
mask = np.array(corp)
mask[np.tril_indices_from(mask)] = False
fig,ax = plt.subplots()
fig.set_size_inches(15,8)
sns.heatmap(corp,mask=mask)
<matplotlib.axes._subplots.AxesSubplot at 0x1c21bc09400>
Take the strict lower triangle of the matrix with np.tril(m, -1); for the strict upper triangle, use np.triu(m, 1) instead.
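For intuition, a tiny demonstration of the two triangle helpers on a 3×3 matrix:

```python
import numpy as np

m = np.arange(1, 10).reshape(3, 3)
print(np.tril(m, -1))  # strict lower triangle
# [[0 0 0]
#  [4 0 0]
#  [7 8 0]]
print(np.triu(m, 1))   # strict upper triangle
# [[0 2 3]
#  [0 0 6]
#  [0 0 0]]
```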
coll = corp.columns
corp = pd.DataFrame(np.tril(corp, -1))
corp.columns = coll
corp.head()
A2 | A5 | A6 | A7 | A8 | A9 | A10 | A11 | A12 | A13 | ... | A45 | A46 | A47 | A48 | A49 | A50 | A51 | A52 | A53 | A54 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 0.069744 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 0.076185 | 0.178833 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 0.158735 | 0.219395 | 0.371360 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | 0.167200 | 0.186124 | 0.242025 | 0.803161 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 32 columns
pac2 = paction.loc[:,(corp.abs()<.7).all()]  # keep columns whose correlations are all below 0.7
pac2.head()
A2 | A11 | A12 | A13 | A20 | A23 | A24 | A43 | A44 | A46 | A48 | A49 | A50 | A51 | A53 | A54 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
user_id | ||||||||||||||||
425530 | 0 | 19 | 2 | 19 | 0 | 0 | 0.5 | 23 | 0.92174 | 20 | 347 | 36 | 22 | 4 | 0.0 | 71 |
776120 | 0 | 38 | 8 | 19 | 0 | 0 | 0.0 | 20 | 0.90256 | 38 | 684 | 37 | 15 | 0 | 0.0 | 312 |
276197 | 0 | 15 | 3 | 11 | 0 | 0 | 0.0 | 10 | 0.92000 | 15 | 4226 | 15 | 0 | 8 | 0.0 | 95 |
221572 | 0 | 0 | 0 | 3 | 0 | 0 | 0.0 | 2 | 0.85714 | 24 | 5792 | 4 | 0 | 0 | 0.0 | 21 |
692433 | 0 | 0 | 0 | 2 | 0 | 0 | 0.0 | 11 | 0.73684 | 28 | 4549 | 16 | 8 | 0 | 0.0 | 154 |
Run the principal component analysis
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(pac2)
PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)
redio = pca.explained_variance_ratio_
print(redio)
print(pca.singular_values_)
[9.97843804e-01 1.92024564e-03 1.20120771e-04 5.57014208e-05
 2.67905481e-05 1.54533752e-05 9.31262940e-06 4.38846214e-06
 3.02317261e-06 8.36725295e-07 1.31874979e-07 9.78197162e-08
 3.86464536e-08 2.94647596e-08 1.82272465e-08 7.54580333e-09]
[3.96183910e+04 1.73797668e+03 4.34684952e+02 2.96004755e+02
 2.05284590e+02 1.55911168e+02 1.21032418e+02 8.30848288e+01
 6.89599635e+01 3.62791414e+01 1.44027941e+01 1.24044853e+01
 7.79687146e+00 6.80796010e+00 5.35458829e+00 3.44523057e+00]
recu = redio.cumsum()
print(recu)
x = np.arange(len(recu))
plt.plot(recu, color='r')
[0.9978438  0.99976405 0.99988417 0.99993987 0.99996666 0.99998212
 0.99999143 0.99999582 0.99999884 0.99999968 0.99999981 0.99999991
 0.99999994 0.99999997 0.99999999 1.        ]
[<matplotlib.lines.Line2D at 0x1c21dadada0>]
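Rather than eyeballing the curve, the number of components can also be chosen programmatically; a sketch with an assumed target of 99.99% explained variance (sklearn's PCA additionally accepts a float n_components to do this selection internally):

```python
# Smallest component count whose cumulative explained variance reaches the target
target = 0.9999  # assumed threshold
n_keep = int(np.searchsorted(recu, target)) + 1
print(n_keep)
```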
Get the reduced data
pca.set_params(n_components=10)
pac3 = pd.DataFrame(pca.fit_transform(pac2))
pacsse = pac3.copy()
pac3.head()
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 2706.266005 | -100.824346 | -1.874787 | -1.577536 | 12.481591 | -2.394320 | 9.770878 | 7.807535 | 0.021273 | -2.169596 |
1 | 2373.811140 | 147.314930 | -16.386795 | -8.428655 | 10.019577 | -3.004725 | 6.009771 | 0.961469 | -1.598531 | 2.144615 |
2 | -1171.733361 | -5.493081 | 0.744995 | 0.542033 | -0.785251 | -5.756412 | -1.012336 | -1.778067 | 7.256884 | 0.343277 |
3 | -2738.903900 | -50.468487 | 2.328491 | 2.965415 | -5.794347 | 11.891289 | 2.965366 | -1.182413 | 0.065619 | 1.245358 |
4 | -1493.642618 | 58.686385 | -10.807612 | 11.777973 | 7.664692 | 9.312968 | 4.376429 | 1.994214 | -1.568050 | 0.426246 |
5. KMeans clustering
from sklearn.cluster import KMeans
km = KMeans(n_clusters=5)
km.fit(pac3)
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=5, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)
clu = km.labels_
pac3['clu'] = clu
pac3.head()
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | clu | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2706.266005 | -100.824346 | -1.874787 | -1.577536 | 12.481591 | -2.394320 | 9.770878 | 7.807535 | 0.021273 | -2.169596 | 0 |
1 | 2373.811140 | 147.314930 | -16.386795 | -8.428655 | 10.019577 | -3.004725 | 6.009771 | 0.961469 | -1.598531 | 2.144615 | 0 |
2 | -1171.733361 | -5.493081 | 0.744995 | 0.542033 | -0.785251 | -5.756412 | -1.012336 | -1.778067 | 7.256884 | 0.343277 | 1 |
3 | -2738.903900 | -50.468487 | 2.328491 | 2.965415 | -5.794347 | 11.891289 | 2.965366 | -1.182413 | 0.065619 | 1.245358 | 4 |
4 | -1493.642618 | 58.686385 | -10.807612 | 11.777973 | 7.664692 | 9.312968 | 4.376429 | 1.994214 | -1.568050 | 0.426246 | 1 |
pac3.groupby('clu')[2].count()
clu
0 90
1 113
2 122
3 109
4 123
Name: 2, dtype: int64
#### Available palette styles:
Accent, Accent_r, Blues, Blues_r, BrBG, BrBG_r, BuGn, BuGn_r, BuPu, BuPu_r, CMRmap, CMRmap_r, Dark2, Dark2_r, GnBu, GnBu_r, Greens, Greens_r, Greys, Greys_r, OrRd, OrRd_r, Oranges, Oranges_r, PRGn, PRGn_r, Paired, Paired_r, Pastel1, Pastel1_r, Pastel2, Pastel2_r, PiYG, PiYG_r, PuBu, PuBuGn, PuBuGn_r, PuBu_r, PuOr, PuOr_r, PuRd, PuRd_r, Purples, Purples_r, RdBu, RdBu_r, RdGy, RdGy_r, RdPu, RdPu_r, RdYlBu, RdYlBu_r, RdYlGn, RdYlGn_r, Reds, Reds_r, Set1, Set1_r, Set2, Set2_r, Set3, Set3_r, Spectral, Spectral_r, Vega10, Vega10_r, Vega20, Vega20_r, Vega20b, Vega20b_r, Vega20c, Vega20c_r, Wistia, Wistia_r, YlGn, YlGnBu, YlGnBu_r, YlGn_r, YlOrBr, YlOrBr_r, YlOrRd, YlOrRd_r, afmhot, afmhot_r, autumn, autumn_r, binary, binary_r, bone, bone_r, brg, brg_r, bwr, bwr_r, cool, cool_r, coolwarm, coolwarm_r, copper, copper_r, cubehelix, cubehelix_r, flag, flag_r, gist_earth, gist_earth_r, gist_gray, gist_gray_r, gist_heat, gist_heat_r, gist_ncar, gist_ncar_r, gist_rainbow, gist_rainbow_r, gist_stern, gist_stern_r, gist_yarg, gist_yarg_r, gnuplot, gnuplot2, gnuplot2_r, gnuplot_r, gray, gray_r, hot, hot_r, hsv, hsv_r, icefire, icefire_r, inferno, inferno_r, jet, jet_r, magma, magma_r, mako, mako_r, nipy_spectral, nipy_spectral_r, ocean, ocean_r, pink, pink_r, plasma, plasma_r, prism, prism_r, rainbow, rainbow_r, rocket, rocket_r, seismic, seismic_r, spectral, spectral_r, spring, spring_r, summer, summer_r, tab10, tab10_r, tab20, tab20_r, tab20b, tab20b_r, tab20c, tab20c_r, terrain, terrain_r, viridis, viridis_r, vlag, vlag_r, winter, winter_r
plt.figure(figsize=(13,7))
sns.scatterplot(x=0, y=1, data=pac3,style='clu',hue='clu', palette='autumn')
<matplotlib.axes._subplots.AxesSubplot at 0x1c21db35438>
Append the cluster labels back to the original data
pac4 = pac2.copy()
pac4['cluster'] = list(pac3.clu)
pac4.head()
A2 | A11 | A12 | A13 | A20 | A23 | A24 | A43 | A44 | A46 | A48 | A49 | A50 | A51 | A53 | A54 | cluster | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
user_id | |||||||||||||||||
425530 | 0 | 19 | 2 | 19 | 0 | 0 | 0.5 | 23 | 0.92174 | 20 | 347 | 36 | 22 | 4 | 0.0 | 71 | 0 |
776120 | 0 | 38 | 8 | 19 | 0 | 0 | 0.0 | 20 | 0.90256 | 38 | 684 | 37 | 15 | 0 | 0.0 | 312 | 0 |
276197 | 0 | 15 | 3 | 11 | 0 | 0 | 0.0 | 10 | 0.92000 | 15 | 4226 | 15 | 0 | 8 | 0.0 | 95 | 1 |
221572 | 0 | 0 | 0 | 3 | 0 | 0 | 0.0 | 2 | 0.85714 | 24 | 5792 | 4 | 0 | 0 | 0.0 | 21 | 4 |
692433 | 0 | 0 | 0 | 2 | 0 | 0 | 0.0 | 11 | 0.73684 | 28 | 4549 | 16 | 8 | 0 | 0.0 | 154 | 1 |
# Compute the per-cluster means
clu5 = pac4.groupby('cluster').mean()
# Drop the highly correlated columns
clu5.drop(columns='A53',inplace=True)
c5cor = clu5.corr()
plt.figure(figsize=(15,8))
sns.heatmap(c5cor,annot=True)
<matplotlib.axes._subplots.AxesSubplot at 0x1c21d92a780>
ccrp = pd.DataFrame(np.tril(c5cor,-1))
ccrp.columns = clu5.columns
cccc = clu5.loc[:,(ccrp.abs()<.95).all()]
cccc
A2 | A20 | A23 | A24 | A44 | A46 | A50 | A51 | A54 | |
---|---|---|---|---|---|---|---|---|---|
cluster | |||||||||
0 | 0.022222 | 0.322222 | 0.655556 | 0.167691 | 0.858193 | 27.600000 | 10.666667 | 2.011111 | 166.711111 |
1 | 0.079646 | 0.274336 | 0.362832 | 0.095231 | 0.844027 | 20.159292 | 3.008850 | 1.469027 | 102.106195 |
2 | 0.073770 | 0.377049 | 0.336066 | 0.070628 | 0.849343 | 24.737705 | 4.286885 | 1.844262 | 121.909836 |
3 | 0.018349 | 0.229358 | 0.284404 | 0.098252 | 0.845981 | 24.119266 | 5.266055 | 1.733945 | 146.871560 |
4 | 0.203252 | 0.292683 | 0.243902 | 0.063686 | 0.775076 | 18.983740 | 2.130081 | 0.975610 | 84.032520 |
from sklearn.preprocessing import scale
ccccc = pd.DataFrame(scale(cccc))
ccccc.columns = cccc.columns
ccccc
A2 | A20 | A23 | A24 | A44 | A46 | A50 | A51 | A54 | |
---|---|---|---|---|---|---|---|---|---|
0 | -0.855590 | 0.468859 | 1.918400 | 1.862020 | 0.785882 | 1.422970 | 1.867773 | 1.118457 | 1.424282 |
1 | 0.002962 | -0.503392 | -0.094337 | -0.104961 | 0.315530 | -0.940402 | -0.688647 | -0.381093 | -0.746672 |
2 | -0.084884 | 1.582038 | -0.278379 | -0.772826 | 0.492038 | 0.513827 | -0.261998 | 0.656909 | -0.081200 |
3 | -0.913505 | -1.416613 | -0.633601 | -0.022944 | 0.380387 | 0.317394 | 0.064879 | 0.351742 | 0.757602 |
4 | 1.851016 | -0.130892 | -0.912083 | -0.961289 | -1.973837 | -1.313789 | -0.982007 | -1.746015 | -1.354012 |
plt.figure(figsize=(8,8))
# number of polar-axis divisions
N = ccccc.shape[1]
# angles that split the circle evenly
angles = np.linspace(0, 2*np.pi, N, endpoint=False)
# close the radar loop
angles = np.concatenate((angles,[angles[0]]))
for i in range(len(ccccc)):
    # build the data
    values = ccccc.loc[i,:]
    # close the loop
    values = np.concatenate((values,[values[0]]))
    # draw
    plt.polar(angles, values, 'o-', linewidth=2)
plt.legend(ccccc.index, loc='lower right')
# add the polar-axis labels (drop the duplicated closing angle so labels and angles match)
plt.thetagrids(angles[:-1] * 180/np.pi, labels=list(ccccc.columns))
plt.title('重要指标雷达图呈现')
Text(0.5,1.05,'重要指标雷达图呈现')
Dimensionality reduction without preprocessing
dfp = acc.iloc[:,3:(len(acc.columns)-1)]
dfp.index=acc.user_id
dfp.head()
A1 | A2 | A3 | A4 | A5 | A6 | A7 | A8 | A9 | A10 | ... | A45 | A46 | A47 | A48 | A49 | A50 | A51 | A52 | A53 | A54 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
user_id | |||||||||||||||||||||
425530 | 0 | 0 | 0 | 0 | 10 | 1 | 233 | 58.25 | 288 | 230 | ... | 19 | 20 | 180543 | 347 | 36 | 22 | 4 | 0 | 0.0 | 71 |
776120 | 0 | 0 | 0 | 0 | 9 | 0 | 0 | 0.00 | 325 | 195 | ... | 19 | 38 | 142214 | 684 | 37 | 15 | 0 | 0 | 0.0 | 312 |
276197 | 0 | 0 | 0 | 0 | 7 | 0 | 58 | 7.25 | 150 | 100 | ... | 15 | 15 | 54602 | 4226 | 15 | 0 | 8 | 0 | 0.0 | 95 |
221572 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0.00 | 40 | 14 | ... | 24 | 24 | 39891 | 5792 | 4 | 0 | 0 | 0 | 0.0 | 21 |
692433 | 0 | 0 | 0 | 0 | 6 | 0 | 0 | 0.00 | 102 | 95 | ... | 15 | 28 | 50706 | 4549 | 16 | 8 | 0 | 0 | 0.0 | 154 |
5 rows × 54 columns
from sklearn.decomposition import PCA
pca = PCA(whiten=False)
pca.fit(dfp)
PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)
retio = pca.explained_variance_ratio_
# print(retio)
# print(pca.singular_values_)
rec = retio.cumsum()
print(rec)
x = np.arange(len(rec))
plt.plot(rec, color='r')
[0.9996008  0.99995245 0.99997489 0.99999016 0.9999933  0.99999564
 0.99999759 0.99999838 0.99999897 0.9999995  0.99999962 0.99999972
 0.99999979 0.99999986 0.9999999  0.99999993 0.99999996 0.99999997
 0.99999997 0.99999998 0.99999998 0.99999999 0.99999999 0.99999999
 0.99999999 1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.        ]
[<matplotlib.lines.Line2D at 0x1c21f406780>]
pca.set_params(n_components=10)
pacsse = pd.DataFrame(pca.fit_transform(dfp))
pacsse.head()
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 94938.293061 | -342.891655 | -161.442878 | -199.616210 | 1.830692 | 73.107938 | 153.124982 | 124.440657 | -34.371612 | 46.548951 |
1 | 56613.313155 | -960.580156 | -38.560364 | -45.836571 | 13.670166 | 90.767620 | -145.846645 | -40.255134 | 10.508203 | 16.287863 |
2 | -31060.195159 | 388.005529 | -6.932692 | -0.948812 | -5.332728 | 18.237293 | 11.393467 | 14.689011 | -7.994909 | 32.398532 |
3 | -45806.252443 | 1579.357883 | -81.812845 | -96.488345 | -18.477649 | -90.059217 | 31.377291 | -22.865193 | -19.724837 | 16.293640 |
4 | -34963.135693 | 611.858506 | -18.187490 | -16.454233 | -5.597209 | -9.722257 | -63.112236 | -3.943266 | 7.222725 | -10.889839 |
Use the elbow method to find the optimal K
from sklearn.cluster import KMeans
df_features = pacsse  # the data to cluster
# Choose k via the SSE
SSE = []  # sum of squared errors for each k
for k in range(1, 9):
    estimator = KMeans(n_clusters=k)  # build the clusterer
    estimator.fit(df_features)
    SSE.append(estimator.inertia_)
X = range(1,9)
plt.xlabel('k')
plt.ylabel('SSE')
plt.plot(X,SSE,'o-')
[<matplotlib.lines.Line2D at 0x1c2211cac50>]
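The elbow heuristic can be cross-checked with silhouette scores (higher is better); a sketch, run on the same unscaled df_features:

```python
from sklearn.metrics import silhouette_score

# Silhouette is undefined for k=1, so start at 2
for k in range(2, 9):
    labels = KMeans(n_clusters=k).fit_predict(df_features)
    print(k, silhouette_score(df_features, labels))
```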
Clearly, standardizing the data first is not a good fit here
# Clearly, standardizing the data first is not a good fit here
df_features = pd.DataFrame(scale(pacsse))
SSE = []
for k in range(1, 9):
    estimator = KMeans(n_clusters=k)
    estimator.fit(df_features)
    SSE.append(estimator.inertia_)
X = range(1,9)
plt.xlabel('k')
plt.ylabel('SSE')
plt.plot(X,SSE,'o-')
[<matplotlib.lines.Line2D at 0x1c2213bc438>]
km = KMeans(n_clusters=4)
km.fit(pacsse)
clu = km.labels_
pacsse['clu'] = clu
pacsse.head()
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | clu | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 94938.293061 | -342.891655 | -161.442878 | -199.616210 | 1.830692 | 73.107938 | 153.124982 | 124.440657 | -34.371612 | 46.548951 | 2 |
1 | 56613.313155 | -960.580156 | -38.560364 | -45.836571 | 13.670166 | 90.767620 | -145.846645 | -40.255134 | 10.508203 | 16.287863 | 0 |
2 | -31060.195159 | 388.005529 | -6.932692 | -0.948812 | -5.332728 | 18.237293 | 11.393467 | 14.689011 | -7.994909 | 32.398532 | 1 |
3 | -45806.252443 | 1579.357883 | -81.812845 | -96.488345 | -18.477649 | -90.059217 | 31.377291 | -22.865193 | -19.724837 | 16.293640 | 1 |
4 | -34963.135693 | 611.858506 | -18.187490 | -16.454233 | -5.597209 | -9.722257 | -63.112236 | -3.943266 | 7.222725 | -10.889839 | 1 |
pacsse.groupby('clu')[2].count()
clu
0 153
1 344
2 54
3 6
Name: 2, dtype: int64
plt.figure(figsize=(13,7))
sns.scatterplot(x=0, y=1, data=pacsse,style='clu',hue='clu', palette='autumn')
<matplotlib.axes._subplots.AxesSubplot at 0x1c22118b668>
Clearly, clustering the data without preprocessing is problematic: the first principal component accounts for almost all of the variance, so, as the scatter plot of the first two components shows, the clusters are split along that single direction.
pac4 = pac2.copy()
pac4['cluster'] = list(pacsse.clu)
pac4.head()
clu5 = pac4.groupby('cluster').mean()
clu5.drop(columns='A53',inplace=True)
c5cor = clu5.corr()
plt.figure(figsize=(15,8))
sns.heatmap(c5cor,annot=True)
<matplotlib.axes._subplots.AxesSubplot at 0x1c22145a4e0>
ccrp = pd.DataFrame(np.tril(c5cor,-1))
ccrp.columns = clu5.columns
cccc = clu5.loc[:,(ccrp.abs()<.95).all()]
cccc
A12 | A20 | A51 | A54 | |
---|---|---|---|---|
cluster | ||||
0 | 3.398693 | 0.228758 | 1.810458 | 146.287582 |
1 | 1.938953 | 0.316860 | 1.433140 | 101.531977 |
2 | 4.592593 | 0.407407 | 1.870370 | 169.777778 |
3 | 2.166667 | 0.166667 | 1.666667 | 213.833333 |
from sklearn.preprocessing import scale
ccccc = pd.DataFrame(scale(cccc))
ccccc.columns = cccc.columns
ccccc
A12 | A20 | A51 | A54 | |
---|---|---|---|---|
0 | 0.352533 | -0.562784 | 0.684599 | -0.285229 |
1 | -1.021705 | 0.406288 | -1.555764 | -1.388557 |
2 | 1.476502 | 1.402249 | 1.040338 | 0.293858 |
3 | -0.807330 | -1.245753 | -0.169173 | 1.379928 |
plt.figure(figsize=(8,8))
# number of polar-axis divisions
N = ccccc.shape[1]
# angles that split the circle evenly
angles = np.linspace(0, 2*np.pi, N, endpoint=False)
# close the radar loop
angles = np.concatenate((angles,[angles[0]]))
for i in range(len(ccccc)):
    # build the data
    values = ccccc.loc[i,:]
    # close the loop
    values = np.concatenate((values,[values[0]]))
    # draw
    plt.polar(angles, values, 'o-', linewidth=2)
plt.legend(ccccc.index, loc='lower right')
# add the polar-axis labels (drop the duplicated closing angle so labels and angles match)
plt.thetagrids(angles[:-1] * 180/np.pi, labels=list(ccccc.columns))
plt.title('重要指标雷达图呈现')
Text(0.5,1.05,'重要指标雷达图呈现')