第一步:统计user和event相关信息
#查看train_csv的数据
import pandas as pd
df_train = pd.read_csv('train.csv')
df_train.head()
user | event | invited | timestamp | interested | not_interested | |
---|---|---|---|---|---|---|
0 | 3044012 | 1918771225 | 0 | 2012-10-02 15:53:05.754000+00:00 | 0 | 0 |
1 | 3044012 | 1502284248 | 0 | 2012-10-02 15:53:05.754000+00:00 | 0 | 0 |
2 | 3044012 | 2529072432 | 0 | 2012-10-02 15:53:05.754000+00:00 | 1 | 0 |
3 | 3044012 | 3072478280 | 0 | 2012-10-02 15:53:05.754000+00:00 | 0 | 0 |
4 | 3044012 | 1390707377 | 0 | 2012-10-02 15:53:05.754000+00:00 | 0 | 0 |
df_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15398 entries, 0 to 15397
Data columns (total 6 columns):
user 15398 non-null int64
event 15398 non-null int64
invited 15398 non-null int64
timestamp 15398 non-null object
interested 15398 non-null int64
not_interested 15398 non-null int64
dtypes: int64(5), object(1)
memory usage: 721.9+ KB
#查看test_csv的数据
df_test = pd.read_csv('test.csv')
df_test.head()
user | event | invited | timestamp | |
---|---|---|---|---|
0 | 1776192 | 2877501688 | 0 | 2012-11-30 11:39:01.230000+00:00 |
1 | 1776192 | 3025444328 | 0 | 2012-11-30 11:39:01.230000+00:00 |
2 | 1776192 | 4078218285 | 0 | 2012-11-30 11:39:01.230000+00:00 |
3 | 1776192 | 1024025121 | 0 | 2012-11-30 11:39:01.230000+00:00 |
4 | 1776192 | 2972428928 | 0 | 2012-11-30 11:39:21.985000+00:00 |
df_test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10237 entries, 0 to 10236
Data columns (total 4 columns):
user 10237 non-null int64
event 10237 non-null int64
invited 10237 non-null int64
timestamp 10237 non-null object
dtypes: int64(3), object(1)
memory usage: 320.0+ KB
- 前两列是用户ID和对应的event ID
- 而test.csv中用户缺少了标签(interested or not_interested)
#第一步的全部程序如下
from collections import defaultdict
import scipy.sparse as ss
import scipy.io as sio
import itertools
#import cPickle
#From Python 3, cPickle has been replaced by _pickle
import _pickle as cPickle#用于分析train和test中用户和事件之间的相关性。
class ProgramEntities:"""我们只关心train和test中出现的user和event,因此重点处理这部分关联数据,经过统计:train和test中总共3391个users和13418个events"""def __init__(self):#统计训练集中有多少独立的用户的eventsuniqueUsers = set()#uniqueUsers保存总共多少个用户:3391个uniqueEvents = set()#uniqueEvents保存总共多少个events:13418个eventsForUser = defaultdict(set)#字典eventsForUser保存了每个user:所对应的eventusersForEvent = defaultdict(set)#字典usersForEvent保存了每个event:哪些user点击for filename in ['train.csv', 'test.csv']:f = open(filename)f.readline()#跳过第一行for line in f:cols = line.strip().split(',')uniqueUsers.add( cols[0] ) #统计所有的用户有哪些uniqueEvents.add( cols[1] ) #统计所有的事件有哪些eventsForUser[cols[0]].add( cols[1] ) #将用户作为键值,保存下每个用户对应的事件usersForEvent[cols[1]].add( cols[0] ) #将事件作为键值,保存下每个事件对应的用户f.close()self.userEventScores = ss.dok_matrix( ( len(uniqueUsers), len(uniqueEvents) ) )self.userIndex = dict()self.eventIndex = dict()for i, u in enumerate(uniqueUsers):self.userIndex[u] = ifor i, e in enumerate(uniqueEvents):self.eventIndex[e] = iftrain = open('train.csv')ftrain.readline()for line in ftrain:cols = line.strip().split(',')i = self.userIndex[ cols[0] ]j = self.eventIndex[ cols[1] ]self.userEventScores[i, j] = int( cols[4] ) - int( cols[5] )ftrain.close()sio.mmwrite('PE_userEventScores', self.userEventScores)#为了防止不必要的计算,我们找出来所有关联的用户或者关联的event#所谓关联用户指的是至少在同一个event上有行为的用户user pair#关联的event指的是至少同一个user有行为的event pairself.uniqueUserPairs = set()self.uniqueEventPairs = set()#查找关联用户for event in uniqueEvents:users = usersForEvent[event]if len(users) > 2:self.uniqueUserPairs.update( itertools.combinations(users, 2) )#查找关联事件for user in uniqueUsers:events = eventsForUser[user]if len(events) > 2:self.uniqueEventPairs.update( itertools.combinations(events, 2) )#rint(self.userIndex)cPickle.dump( self.userIndex, open('PE_userIndex.pkl', 'wb'))cPickle.dump( self.eventIndex, open('PE_eventIndex.pkl', 'wb') )print('第1步:统计user和event相关信息...')
pe = ProgramEntities()
print('第1步完成...\n')
第1步:统计user和event相关信息...
第1步完成...
pe.userEventScores
<3391x13418 sparse matrix of type '<class 'numpy.float64'>'with 4645 stored elements in Dictionary Of Keys format>
说明:
- 其中PE_userEventScores.mtx是所有users和events的矩阵,但是里面的值只有train.csv的值,值是1或者-1
- scipy.sparse.dok_matrix()函数是产生一个稀疏矩阵,这样PE_userEventScores.mtx只保存了非0值
- 针对该步使用的变量作简单介绍:
- uniqueUsers:集合,保存train.csv和test.csv中的所有user ID
- uniqueEvents:集合,保存train.csv和test.csv中的所有event ID
- eventsForUser:字典,key为每个用户,value为该用户对应的event集合
- usersForEvent:字典,key为每个event,value为该event对应的user集合
- userIndex:字典,每个用户有个Index
- eventIndex:字典,每个event有个Index
- userEventScores:稀疏矩阵3391 * 13418,use vs event,矩阵元素为train.csv中
每个user对某个event的兴趣分(1, 0 or -1)即interested - not_interested
import pandas as pd
pd.DataFrame(userEventScores)
userEventScores:每个user对每个event的兴趣分(1, 0 or -1)
import pandas as pd
df_train = pd.read_csv('train.csv')
df_train[df_train['event']==1502284248]
import itertools
for each in itertools.combinations(set([3044012,1302145719,3194014105,3669515588]), 2):print(each)
(3194014105, 3669515588)
(3194014105, 3044012)
(3194014105, 1302145719)
(3669515588, 3044012)
(3669515588, 1302145719)
(3044012, 1302145719)
uniqueUserPairs:集合,如果对于同一个event来说,关联上3个及3个以上users,则该event关联上的users进行两两配对,保存在uniqueUserPairs中,注意保存的是userId,而不是user对应的索引:
import pandas as pd
df_train = pd.read_csv('train.csv')
df_train[df_train['user']==3044012]
user | event | invited | timestamp | interested | not_interested | |
---|---|---|---|---|---|---|
0 | 3044012 | 1918771225 | 0 | 2012-10-02 15:53:05.754000+00:00 | 0 | 0 |
1 | 3044012 | 1502284248 | 0 | 2012-10-02 15:53:05.754000+00:00 | 0 | 0 |
2 | 3044012 | 2529072432 | 0 | 2012-10-02 15:53:05.754000+00:00 | 1 | 0 |
3 | 3044012 | 3072478280 | 0 | 2012-10-02 15:53:05.754000+00:00 | 0 | 0 |
4 | 3044012 | 1390707377 | 0 | 2012-10-02 15:53:05.754000+00:00 | 0 | 0 |
5 | 3044012 | 1532377761 | 0 | 2012-10-02 15:53:05.754000+00:00 | 0 | 0 |
import itertools
for each in itertools.combinations(set([1918771225,1502284248,2529072432, 3072478280, 1390707377, 1532377761 ]), 2):print(each)
(1532377761, 3072478280)
(1532377761, 2529072432)
(1532377761, 1390707377)
(1532377761, 1502284248)
(1532377761, 1918771225)
(3072478280, 2529072432)
(3072478280, 1390707377)
(3072478280, 1502284248)
(3072478280, 1918771225)
(2529072432, 1390707377)
(2529072432, 1502284248)
(2529072432, 1918771225)
(1390707377, 1502284248)
(1390707377, 1918771225)
(1502284248, 1918771225)
第二步:计算用户相似度
由于用到:users.csv,我们先看看其内容(看前10行)
import pandas as pd
df_users = pd.read_csv('users.csv')
df_users.head(10)
user_id | locale | birthyear | gender | joinedAt | location | timezone | |
---|---|---|---|---|---|---|---|
0 | 3197468391 | id_ID | 1993 | male | 2012-10-02T06:40:55.524Z | Medan Indonesia | 480.0 |
1 | 3537982273 | id_ID | 1992 | male | 2012-09-29T18:03:12.111Z | Medan Indonesia | 420.0 |
2 | 823183725 | en_US | 1975 | male | 2012-10-06T03:14:07.149Z | Stratford Ontario | -240.0 |
3 | 1872223848 | en_US | 1991 | female | 2012-11-04T08:59:43.783Z | Tehran Iran | 210.0 |
4 | 3429017717 | id_ID | 1995 | female | 2012-09-10T16:06:53.132Z | NaN | 420.0 |
5 | 627175141 | ka_GE | 1973 | female | 2012-11-01T09:59:17.590Z | Tbilisi Georgia | 240.0 |
6 | 2752000443 | id_ID | 1994 | male | 2012-10-03T05:22:17.637Z | Medan Indonesia | 420.0 |
7 | 3473687777 | id_ID | 1965 | female | 2012-10-03T12:19:29.975Z | Medan Indonesia | 420.0 |
8 | 2966052962 | id_ID | 1979 | male | 2012-10-31T10:11:57.668Z | Medan Indonesia | 420.0 |
9 | 264876277 | id_ID | 1988 | female | 2012-10-02T07:28:09.555Z | Medan Indonesia | 420.0 |
#使用locale和pycountry模块来将字符串转换成数值
import locale
locale.locale_alias
{'a3': 'az_AZ.KOI8-C','a3_az': 'az_AZ.KOI8-C','a3_az.koic': 'az_AZ.KOI8-C','aa_dj': 'aa_DJ.ISO8859-1','aa_er': 'aa_ER.UTF-8','aa_et': 'aa_ET.UTF-8','af': 'af_ZA.ISO8859-1','af_za': 'af_ZA.ISO8859-1','am': 'am_ET.UTF-8','am_et': 'am_ET.UTF-8','american': 'en_US.ISO8859-1','an_es': 'an_ES.ISO8859-15','ar': 'ar_AA.ISO8859-6','ar_aa': 'ar_AA.ISO8859-6','ar_ae': 'ar_AE.ISO8859-6','ar_bh': 'ar_BH.ISO8859-6','ar_dz': 'ar_DZ.ISO8859-6','ar_eg': 'ar_EG.ISO8859-6','ar_in': 'ar_IN.UTF-8','ar_iq': 'ar_IQ.ISO8859-6','ar_jo': 'ar_JO.ISO8859-6','ar_kw': 'ar_KW.ISO8859-6','ar_lb': 'ar_LB.ISO8859-6','ar_ly': 'ar_LY.ISO8859-6','ar_ma': 'ar_MA.ISO8859-6','ar_om': 'ar_OM.ISO8859-6','ar_qa': 'ar_QA.ISO8859-6','ar_sa': 'ar_SA.ISO8859-6','ar_sd': 'ar_SD.ISO8859-6','ar_sy': 'ar_SY.ISO8859-6','ar_tn': 'ar_TN.ISO8859-6','ar_ye': 'ar_YE.ISO8859-6','arabic': 'ar_AA.ISO8859-6','as': 'as_IN.UTF-8','as_in': 'as_IN.UTF-8','ast_es': 'ast_ES.ISO8859-15','ayc_pe': 'ayc_PE.UTF-8','az': 'az_AZ.ISO8859-9E','az_az': 'az_AZ.ISO8859-9E','az_az.iso88599e': 'az_AZ.ISO8859-9E','be': 'be_BY.CP1251','be@latin': 'be_BY.UTF-8@latin','be_bg.utf8': 'bg_BG.UTF-8','be_by': 'be_BY.CP1251','be_by@latin': 'be_BY.UTF-8@latin','bem_zm': 'bem_ZM.UTF-8','ber_dz': 'ber_DZ.UTF-8','ber_ma': 'ber_MA.UTF-8','bg': 'bg_BG.CP1251','bg_bg': 'bg_BG.CP1251','bho_in': 'bho_IN.UTF-8','bn_bd': 'bn_BD.UTF-8','bn_in': 'bn_IN.UTF-8','bo_cn': 'bo_CN.UTF-8','bo_in': 'bo_IN.UTF-8','bokmal': 'nb_NO.ISO8859-1','bokmål': 'nb_NO.ISO8859-1','br': 'br_FR.ISO8859-1','br_fr': 'br_FR.ISO8859-1','brx_in': 'brx_IN.UTF-8','bs': 'bs_BA.ISO8859-2','bs_ba': 'bs_BA.ISO8859-2','bulgarian': 'bg_BG.CP1251','byn_er': 'byn_ER.UTF-8','c': 'C','c-french': 'fr_CA.ISO8859-1','c.ascii': 'C','c.en': 'C','c.iso88591': 'en_US.ISO8859-1','c.utf8': 'en_US.UTF-8','c_c': 'C','c_c.c': 'C','ca': 'ca_ES.ISO8859-1','ca_ad': 'ca_AD.ISO8859-1','ca_es': 'ca_ES.ISO8859-1','ca_es@valencia': 'ca_ES.ISO8859-15@valencia','ca_fr': 'ca_FR.ISO8859-1','ca_it': 'ca_IT.ISO8859-1','catalan': 
'ca_ES.ISO8859-1','cextend': 'en_US.ISO8859-1','chinese-s': 'zh_CN.eucCN','chinese-t': 'zh_TW.eucTW','crh_ua': 'crh_UA.UTF-8','croatian': 'hr_HR.ISO8859-2','cs': 'cs_CZ.ISO8859-2','cs_cs': 'cs_CZ.ISO8859-2','cs_cz': 'cs_CZ.ISO8859-2','csb_pl': 'csb_PL.UTF-8','cv_ru': 'cv_RU.UTF-8','cy': 'cy_GB.ISO8859-1','cy_gb': 'cy_GB.ISO8859-1','cz': 'cs_CZ.ISO8859-2','cz_cz': 'cs_CZ.ISO8859-2','czech': 'cs_CZ.ISO8859-2','da': 'da_DK.ISO8859-1','da_dk': 'da_DK.ISO8859-1','danish': 'da_DK.ISO8859-1','dansk': 'da_DK.ISO8859-1','de': 'de_DE.ISO8859-1','de_at': 'de_AT.ISO8859-1','de_be': 'de_BE.ISO8859-1','de_ch': 'de_CH.ISO8859-1','de_de': 'de_DE.ISO8859-1','de_li.utf8': 'de_LI.UTF-8','de_lu': 'de_LU.ISO8859-1','deutsch': 'de_DE.ISO8859-1','doi_in': 'doi_IN.UTF-8','dutch': 'nl_NL.ISO8859-1','dutch.iso88591': 'nl_BE.ISO8859-1','dv_mv': 'dv_MV.UTF-8','dz_bt': 'dz_BT.UTF-8','ee': 'ee_EE.ISO8859-4','ee_ee': 'ee_EE.ISO8859-4','eesti': 'et_EE.ISO8859-1','el': 'el_GR.ISO8859-7','el_cy': 'el_CY.ISO8859-7','el_gr': 'el_GR.ISO8859-7','el_gr@euro': 'el_GR.ISO8859-15','en': 'en_US.ISO8859-1','en_ag': 'en_AG.UTF-8','en_au': 'en_AU.ISO8859-1','en_be': 'en_BE.ISO8859-1','en_bw': 'en_BW.ISO8859-1','en_ca': 'en_CA.ISO8859-1','en_dk': 'en_DK.ISO8859-1','en_dl.utf8': 'en_DL.UTF-8','en_gb': 'en_GB.ISO8859-1','en_hk': 'en_HK.ISO8859-1','en_ie': 'en_IE.ISO8859-1','en_in': 'en_IN.ISO8859-1','en_ng': 'en_NG.UTF-8','en_nz': 'en_NZ.ISO8859-1','en_ph': 'en_PH.ISO8859-1','en_sg': 'en_SG.ISO8859-1','en_uk': 'en_GB.ISO8859-1','en_us': 'en_US.ISO8859-1','en_us@euro@euro': 'en_US.ISO8859-15','en_za': 'en_ZA.ISO8859-1','en_zm': 'en_ZM.UTF-8','en_zw': 'en_ZW.ISO8859-1','en_zw.utf8': 'en_ZS.UTF-8','eng_gb': 'en_GB.ISO8859-1','english': 'en_EN.ISO8859-1','english_uk': 'en_GB.ISO8859-1','english_united-states': 'en_US.ISO8859-1','english_united-states.437': 'C','english_us': 'en_US.ISO8859-1','eo': 'eo_XX.ISO8859-3','eo.utf8': 'eo.UTF-8','eo_eo': 'eo_EO.ISO8859-3','eo_us.utf8': 'eo_US.UTF-8','eo_xx': 
'eo_XX.ISO8859-3','es': 'es_ES.ISO8859-1','es_ar': 'es_AR.ISO8859-1','es_bo': 'es_BO.ISO8859-1','es_cl': 'es_CL.ISO8859-1','es_co': 'es_CO.ISO8859-1','es_cr': 'es_CR.ISO8859-1','es_cu': 'es_CU.UTF-8','es_do': 'es_DO.ISO8859-1','es_ec': 'es_EC.ISO8859-1','es_es': 'es_ES.ISO8859-1','es_gt': 'es_GT.ISO8859-1','es_hn': 'es_HN.ISO8859-1','es_mx': 'es_MX.ISO8859-1','es_ni': 'es_NI.ISO8859-1','es_pa': 'es_PA.ISO8859-1','es_pe': 'es_PE.ISO8859-1','es_pr': 'es_PR.ISO8859-1','es_py': 'es_PY.ISO8859-1','es_sv': 'es_SV.ISO8859-1','es_us': 'es_US.ISO8859-1','es_uy': 'es_UY.ISO8859-1','es_ve': 'es_VE.ISO8859-1','estonian': 'et_EE.ISO8859-1','et': 'et_EE.ISO8859-15','et_ee': 'et_EE.ISO8859-15','eu': 'eu_ES.ISO8859-1','eu_es': 'eu_ES.ISO8859-1','eu_fr': 'eu_FR.ISO8859-1','fa': 'fa_IR.UTF-8','fa_ir': 'fa_IR.UTF-8','fa_ir.isiri3342': 'fa_IR.ISIRI-3342','ff_sn': 'ff_SN.UTF-8','fi': 'fi_FI.ISO8859-15','fi_fi': 'fi_FI.ISO8859-15','fil_ph': 'fil_PH.UTF-8','finnish': 'fi_FI.ISO8859-1','fo': 'fo_FO.ISO8859-1','fo_fo': 'fo_FO.ISO8859-1','fr': 'fr_FR.ISO8859-1','fr_be': 'fr_BE.ISO8859-1','fr_ca': 'fr_CA.ISO8859-1','fr_ch': 'fr_CH.ISO8859-1','fr_fr': 'fr_FR.ISO8859-1','fr_lu': 'fr_LU.ISO8859-1','français': 'fr_FR.ISO8859-1','fre_fr': 'fr_FR.ISO8859-1','french': 'fr_FR.ISO8859-1','french.iso88591': 'fr_CH.ISO8859-1','french_france': 'fr_FR.ISO8859-1','fur_it': 'fur_IT.UTF-8','fy_de': 'fy_DE.UTF-8','fy_nl': 'fy_NL.UTF-8','ga': 'ga_IE.ISO8859-1','ga_ie': 'ga_IE.ISO8859-1','galego': 'gl_ES.ISO8859-1','galician': 'gl_ES.ISO8859-1','gd': 'gd_GB.ISO8859-1','gd_gb': 'gd_GB.ISO8859-1','ger_de': 'de_DE.ISO8859-1','german': 'de_DE.ISO8859-1','german.iso88591': 'de_CH.ISO8859-1','german_germany': 'de_DE.ISO8859-1','gez_er': 'gez_ER.UTF-8','gez_et': 'gez_ET.UTF-8','gl': 'gl_ES.ISO8859-1','gl_es': 'gl_ES.ISO8859-1','greek': 'el_GR.ISO8859-7','gu_in': 'gu_IN.UTF-8','gv': 'gv_GB.ISO8859-1','gv_gb': 'gv_GB.ISO8859-1','ha_ng': 'ha_NG.UTF-8','he': 'he_IL.ISO8859-8','he_il': 'he_IL.ISO8859-8','hebrew': 
'he_IL.ISO8859-8','hi': 'hi_IN.ISCII-DEV','hi_in': 'hi_IN.ISCII-DEV','hi_in.isciidev': 'hi_IN.ISCII-DEV','hne': 'hne_IN.UTF-8','hne_in': 'hne_IN.UTF-8','hr': 'hr_HR.ISO8859-2','hr_hr': 'hr_HR.ISO8859-2','hrvatski': 'hr_HR.ISO8859-2','hsb_de': 'hsb_DE.ISO8859-2','ht_ht': 'ht_HT.UTF-8','hu': 'hu_HU.ISO8859-2','hu_hu': 'hu_HU.ISO8859-2','hungarian': 'hu_HU.ISO8859-2','hy_am': 'hy_AM.UTF-8','hy_am.armscii8': 'hy_AM.ARMSCII_8','ia': 'ia.UTF-8','ia_fr': 'ia_FR.UTF-8','icelandic': 'is_IS.ISO8859-1','id': 'id_ID.ISO8859-1','id_id': 'id_ID.ISO8859-1','ig_ng': 'ig_NG.UTF-8','ik_ca': 'ik_CA.UTF-8','in': 'id_ID.ISO8859-1','in_id': 'id_ID.ISO8859-1','is': 'is_IS.ISO8859-1','is_is': 'is_IS.ISO8859-1','iso-8859-1': 'en_US.ISO8859-1','iso-8859-15': 'en_US.ISO8859-15','iso8859-1': 'en_US.ISO8859-1','iso8859-15': 'en_US.ISO8859-15','iso_8859_1': 'en_US.ISO8859-1','iso_8859_15': 'en_US.ISO8859-15','it': 'it_IT.ISO8859-1','it_ch': 'it_CH.ISO8859-1','it_it': 'it_IT.ISO8859-1','italian': 'it_IT.ISO8859-1','iu': 'iu_CA.NUNACOM-8','iu_ca': 'iu_CA.NUNACOM-8','iu_ca.nunacom8': 'iu_CA.NUNACOM-8','iw': 'he_IL.ISO8859-8','iw_il': 'he_IL.ISO8859-8','iw_il.utf8': 'iw_IL.UTF-8','ja': 'ja_JP.eucJP','ja_jp': 'ja_JP.eucJP','ja_jp.euc': 'ja_JP.eucJP','ja_jp.mscode': 'ja_JP.SJIS','ja_jp.pck': 'ja_JP.SJIS','japan': 'ja_JP.eucJP','japanese': 'ja_JP.eucJP','japanese-euc': 'ja_JP.eucJP','japanese.euc': 'ja_JP.eucJP','jp_jp': 'ja_JP.eucJP','ka': 'ka_GE.GEORGIAN-ACADEMY','ka_ge': 'ka_GE.GEORGIAN-ACADEMY','ka_ge.georgianacademy': 'ka_GE.GEORGIAN-ACADEMY','ka_ge.georgianps': 'ka_GE.GEORGIAN-PS','ka_ge.georgianrs': 'ka_GE.GEORGIAN-ACADEMY','kk_kz': 'kk_KZ.RK1048','kl': 'kl_GL.ISO8859-1','kl_gl': 'kl_GL.ISO8859-1','km_kh': 'km_KH.UTF-8','kn': 'kn_IN.UTF-8','kn_in': 'kn_IN.UTF-8','ko': 'ko_KR.eucKR','ko_kr': 'ko_KR.eucKR','ko_kr.euc': 'ko_KR.eucKR','kok_in': 'kok_IN.UTF-8','korean': 'ko_KR.eucKR','korean.euc': 'ko_KR.eucKR','ks': 'ks_IN.UTF-8','ks_in': 'ks_IN.UTF-8','ks_in@devanagari.utf8': 
'ks_IN.UTF-8@devanagari','ku_tr': 'ku_TR.ISO8859-9','kw': 'kw_GB.ISO8859-1','kw_gb': 'kw_GB.ISO8859-1','ky': 'ky_KG.UTF-8','ky_kg': 'ky_KG.UTF-8','lb_lu': 'lb_LU.UTF-8','lg_ug': 'lg_UG.ISO8859-10','li_be': 'li_BE.UTF-8','li_nl': 'li_NL.UTF-8','lij_it': 'lij_IT.UTF-8','lithuanian': 'lt_LT.ISO8859-13','lo': 'lo_LA.MULELAO-1','lo_la': 'lo_LA.MULELAO-1','lo_la.cp1133': 'lo_LA.IBM-CP1133','lo_la.ibmcp1133': 'lo_LA.IBM-CP1133','lo_la.mulelao1': 'lo_LA.MULELAO-1','lt': 'lt_LT.ISO8859-13','lt_lt': 'lt_LT.ISO8859-13','lv': 'lv_LV.ISO8859-13','lv_lv': 'lv_LV.ISO8859-13','mag_in': 'mag_IN.UTF-8','mai': 'mai_IN.UTF-8','mai_in': 'mai_IN.UTF-8','mg_mg': 'mg_MG.ISO8859-15','mhr_ru': 'mhr_RU.UTF-8','mi': 'mi_NZ.ISO8859-1','mi_nz': 'mi_NZ.ISO8859-1','mk': 'mk_MK.ISO8859-5','mk_mk': 'mk_MK.ISO8859-5','ml': 'ml_IN.UTF-8','ml_in': 'ml_IN.UTF-8','mn_mn': 'mn_MN.UTF-8','mni_in': 'mni_IN.UTF-8','mr': 'mr_IN.UTF-8','mr_in': 'mr_IN.UTF-8','ms': 'ms_MY.ISO8859-1','ms_my': 'ms_MY.ISO8859-1','mt': 'mt_MT.ISO8859-3','mt_mt': 'mt_MT.ISO8859-3','my_mm': 'my_MM.UTF-8','nan_tw@latin': 'nan_TW.UTF-8@latin','nb': 'nb_NO.ISO8859-1','nb_no': 'nb_NO.ISO8859-1','nds_de': 'nds_DE.UTF-8','nds_nl': 'nds_NL.UTF-8','ne_np': 'ne_NP.UTF-8','nhn_mx': 'nhn_MX.UTF-8','niu_nu': 'niu_NU.UTF-8','niu_nz': 'niu_NZ.UTF-8','nl': 'nl_NL.ISO8859-1','nl_aw': 'nl_AW.UTF-8','nl_be': 'nl_BE.ISO8859-1','nl_nl': 'nl_NL.ISO8859-1','nn': 'nn_NO.ISO8859-1','nn_no': 'nn_NO.ISO8859-1','no': 'no_NO.ISO8859-1','no@nynorsk': 'ny_NO.ISO8859-1','no_no': 'no_NO.ISO8859-1','no_no.iso88591@bokmal': 'no_NO.ISO8859-1','no_no.iso88591@nynorsk': 'no_NO.ISO8859-1','norwegian': 'no_NO.ISO8859-1','nr': 'nr_ZA.ISO8859-1','nr_za': 'nr_ZA.ISO8859-1','nso': 'nso_ZA.ISO8859-15','nso_za': 'nso_ZA.ISO8859-15','ny': 'ny_NO.ISO8859-1','ny_no': 'ny_NO.ISO8859-1','nynorsk': 'nn_NO.ISO8859-1','oc': 'oc_FR.ISO8859-1','oc_fr': 'oc_FR.ISO8859-1','om_et': 'om_ET.UTF-8','om_ke': 'om_KE.ISO8859-1','or': 'or_IN.UTF-8','or_in': 'or_IN.UTF-8','os_ru': 
'os_RU.UTF-8','pa': 'pa_IN.UTF-8','pa_in': 'pa_IN.UTF-8','pa_pk': 'pa_PK.UTF-8','pap_an': 'pap_AN.UTF-8','pd': 'pd_US.ISO8859-1','pd_de': 'pd_DE.ISO8859-1','pd_us': 'pd_US.ISO8859-1','ph': 'ph_PH.ISO8859-1','ph_ph': 'ph_PH.ISO8859-1','pl': 'pl_PL.ISO8859-2','pl_pl': 'pl_PL.ISO8859-2','polish': 'pl_PL.ISO8859-2','portuguese': 'pt_PT.ISO8859-1','portuguese_brazil': 'pt_BR.ISO8859-1','posix': 'C','posix-utf2': 'C','pp': 'pp_AN.ISO8859-1','pp_an': 'pp_AN.ISO8859-1','ps_af': 'ps_AF.UTF-8','pt': 'pt_PT.ISO8859-1','pt_br': 'pt_BR.ISO8859-1','pt_pt': 'pt_PT.ISO8859-1','ro': 'ro_RO.ISO8859-2','ro_ro': 'ro_RO.ISO8859-2','romanian': 'ro_RO.ISO8859-2','ru': 'ru_RU.UTF-8','ru_ru': 'ru_RU.UTF-8','ru_ua': 'ru_UA.KOI8-U','rumanian': 'ro_RO.ISO8859-2','russian': 'ru_RU.ISO8859-5','rw': 'rw_RW.ISO8859-1','rw_rw': 'rw_RW.ISO8859-1','sa_in': 'sa_IN.UTF-8','sat_in': 'sat_IN.UTF-8','sc_it': 'sc_IT.UTF-8','sd': 'sd_IN.UTF-8','sd_in': 'sd_IN.UTF-8','sd_in@devanagari.utf8': 'sd_IN.UTF-8@devanagari','sd_pk': 'sd_PK.UTF-8','se_no': 'se_NO.UTF-8','serbocroatian': 'sr_RS.UTF-8@latin','sh': 'sr_RS.UTF-8@latin','sh_ba.iso88592@bosnia': 'sr_CS.ISO8859-2','sh_hr': 'sh_HR.ISO8859-2','sh_hr.iso88592': 'hr_HR.ISO8859-2','sh_sp': 'sr_CS.ISO8859-2','sh_yu': 'sr_RS.UTF-8@latin','shs_ca': 'shs_CA.UTF-8','si': 'si_LK.UTF-8','si_lk': 'si_LK.UTF-8','sid_et': 'sid_ET.UTF-8','sinhala': 'si_LK.UTF-8','sk': 'sk_SK.ISO8859-2','sk_sk': 'sk_SK.ISO8859-2','sl': 'sl_SI.ISO8859-2','sl_cs': 'sl_CS.ISO8859-2','sl_si': 'sl_SI.ISO8859-2','slovak': 'sk_SK.ISO8859-2','slovene': 'sl_SI.ISO8859-2','slovenian': 'sl_SI.ISO8859-2','so_dj': 'so_DJ.ISO8859-1','so_et': 'so_ET.UTF-8','so_ke': 'so_KE.ISO8859-1','so_so': 'so_SO.ISO8859-1','sp': 'sr_CS.ISO8859-5','sp_yu': 'sr_CS.ISO8859-5','spanish': 'es_ES.ISO8859-1','spanish_spain': 'es_ES.ISO8859-1','sq': 'sq_AL.ISO8859-2','sq_al': 'sq_AL.ISO8859-2','sq_mk': 'sq_MK.UTF-8','sr': 'sr_RS.UTF-8','sr@cyrillic': 'sr_RS.UTF-8','sr@latn': 'sr_CS.UTF-8@latin','sr_cs': 
'sr_CS.UTF-8','sr_cs.iso88592@latn': 'sr_CS.ISO8859-2','sr_cs@latn': 'sr_CS.UTF-8@latin','sr_me': 'sr_ME.UTF-8','sr_rs': 'sr_RS.UTF-8','sr_rs@latn': 'sr_RS.UTF-8@latin','sr_sp': 'sr_CS.ISO8859-2','sr_yu': 'sr_RS.UTF-8@latin','sr_yu.cp1251@cyrillic': 'sr_CS.CP1251','sr_yu.iso88592': 'sr_CS.ISO8859-2','sr_yu.iso88595': 'sr_CS.ISO8859-5','sr_yu.iso88595@cyrillic': 'sr_CS.ISO8859-5','sr_yu.microsoftcp1251@cyrillic': 'sr_CS.CP1251','sr_yu.utf8': 'sr_RS.UTF-8','sr_yu.utf8@cyrillic': 'sr_RS.UTF-8','sr_yu@cyrillic': 'sr_RS.UTF-8','ss': 'ss_ZA.ISO8859-1','ss_za': 'ss_ZA.ISO8859-1','st': 'st_ZA.ISO8859-1','st_za': 'st_ZA.ISO8859-1','sv': 'sv_SE.ISO8859-1','sv_fi': 'sv_FI.ISO8859-1','sv_se': 'sv_SE.ISO8859-1','sw_ke': 'sw_KE.UTF-8','sw_tz': 'sw_TZ.UTF-8','swedish': 'sv_SE.ISO8859-1','szl_pl': 'szl_PL.UTF-8','ta': 'ta_IN.TSCII-0','ta_in': 'ta_IN.TSCII-0','ta_in.tscii': 'ta_IN.TSCII-0','ta_in.tscii0': 'ta_IN.TSCII-0','ta_lk': 'ta_LK.UTF-8','te': 'te_IN.UTF-8','te_in': 'te_IN.UTF-8','tg': 'tg_TJ.KOI8-C','tg_tj': 'tg_TJ.KOI8-C','th': 'th_TH.ISO8859-11','th_th': 'th_TH.ISO8859-11','th_th.tactis': 'th_TH.TIS620','th_th.tis620': 'th_TH.TIS620','thai': 'th_TH.ISO8859-11','ti_er': 'ti_ER.UTF-8','ti_et': 'ti_ET.UTF-8','tig_er': 'tig_ER.UTF-8','tk_tm': 'tk_TM.UTF-8','tl': 'tl_PH.ISO8859-1','tl_ph': 'tl_PH.ISO8859-1','tn': 'tn_ZA.ISO8859-15','tn_za': 'tn_ZA.ISO8859-15','tr': 'tr_TR.ISO8859-9','tr_cy': 'tr_CY.ISO8859-9','tr_tr': 'tr_TR.ISO8859-9','ts': 'ts_ZA.ISO8859-1','ts_za': 'ts_ZA.ISO8859-1','tt': 'tt_RU.TATAR-CYR','tt_ru': 'tt_RU.TATAR-CYR','tt_ru.tatarcyr': 'tt_RU.TATAR-CYR','tt_ru@iqtelif': 'tt_RU.UTF-8@iqtelif','turkish': 'tr_TR.ISO8859-9','ug_cn': 'ug_CN.UTF-8','uk': 'uk_UA.KOI8-U','uk_ua': 'uk_UA.KOI8-U','univ': 'en_US.utf','universal': 'en_US.utf','universal.utf8@ucs4': 'en_US.UTF-8','unm_us': 'unm_US.UTF-8','ur': 'ur_PK.CP1256','ur_in': 'ur_IN.UTF-8','ur_pk': 'ur_PK.CP1256','uz': 'uz_UZ.UTF-8','uz_uz': 'uz_UZ.UTF-8','uz_uz@cyrillic': 'uz_UZ.UTF-8','ve': 've_ZA.UTF-8','ve_za': 
've_ZA.UTF-8','vi': 'vi_VN.TCVN','vi_vn': 'vi_VN.TCVN','vi_vn.tcvn': 'vi_VN.TCVN','vi_vn.tcvn5712': 'vi_VN.TCVN','vi_vn.viscii': 'vi_VN.VISCII','vi_vn.viscii111': 'vi_VN.VISCII','wa': 'wa_BE.ISO8859-1','wa_be': 'wa_BE.ISO8859-1','wae_ch': 'wae_CH.UTF-8','wal_et': 'wal_ET.UTF-8','wo_sn': 'wo_SN.UTF-8','xh': 'xh_ZA.ISO8859-1','xh_za': 'xh_ZA.ISO8859-1','yi': 'yi_US.CP1255','yi_us': 'yi_US.CP1255','yo_ng': 'yo_NG.UTF-8','yue_hk': 'yue_HK.UTF-8','zh': 'zh_CN.eucCN','zh_cn': 'zh_CN.gb2312','zh_cn.big5': 'zh_TW.big5','zh_cn.euc': 'zh_CN.eucCN','zh_hk': 'zh_HK.big5hkscs','zh_hk.big5hk': 'zh_HK.big5hkscs','zh_sg': 'zh_SG.GB2312','zh_sg.gbk': 'zh_SG.GBK','zh_tw': 'zh_TW.big5','zh_tw.euc': 'zh_TW.eucTW','zh_tw.euctw': 'zh_TW.eucTW','zu': 'zu_ZA.ISO8859-1','zu_za': 'zu_ZA.ISO8859-1'}
1.locale列处理
import locale
from collections import defaultdictlocaleIdMap = defaultdict(int)
for i, l in enumerate(locale.locale_alias.keys()):localeIdMap[l] = i + 1
for each in localeIdMap:print(each, '\t', localeIdMap[each])
ee 1
fr_ch 2
fo_fo 3
af_za 4
bn_in 5
mni_in 93
da_dk 8
ar_ma 9
ig_ng 369
fr_be 11
italian 12
he_il 13
aa_dj 15
ml 463
yue_hk 17
pt_br 19
es_mx 280
gu_in 22
sid_et 23
it_it 24
japanese 95
de_de 26
en_ag 523
croatian 27
it 96
cs 29
mn_mn 30
ar_bh 31
ro_ro 481
gv_gb 33
rw 34
bg_bg 35
ar 499
en_us@euro@euro 36
fil_ph 37
fr_fr 466
french 39
de 40
polish 285
kok_in 42
korean.euc 43
sr 44
sr_cs.iso88592@latn 45
pap_an 46
sr_yu.iso88595 47
turkish 51
c.utf8 52
uz_uz 53
lv_lv 429
sr_rs@latn 54
eo_xx 55
ik_ca 57
iso_8859_1 58
no_no.iso88591@bokmal 59
cextend 60
doi_in 225
universal 61
es_cr 62
hne_in 63
gd_gb 64
cy 65
nl_aw 66
yi 67
mt_mt 68
sk_sk 384
si_lk 71
a3_az 72
lt 500
st_za 73
iw 74
te 318
en_nz 528
en_in 76
zh_tw.euc 77
ne_np 49
brx_in 286
no 80
az 81
german.iso88591 475
ky 32
he 85
kn_in 86
id_id 110
mai 88
nb_no 89
czech 90
sq 91
ja 92
tr 6
german_germany 94
shs_ca 265
mr 28
fi_fi 97
wal_et 48
cs_cs 100
sd_in@devanagari.utf8 101
gez_er 102
a3 103
wae_ch 283
iu 106
nl 107
french.iso88591 108
japanese-euc 83
tig_er 98
hne 111
c.iso88591 112
ar_qa 113
chinese-t 114
fo 115
de_li.utf8 117
br_fr 118
mag_in 515
sv_fi 119
russian 120
pp 121
wa_be 123
norwegian 124
fa_ir.isiri3342 126
ky_kg 127
zh_tw.euctw 128
fre_fr 130
english_uk 131
arabic 133
fr_ca 134
ber_ma 135
ml_in 136
li_nl 137
et 138
fur_it 139
om_ke 140
gl 141
bg 142
is_is 143
sr_yu 282
tk_tm 125
en_au 146
fa_ir 147
be_bg.utf8 148
zu 303
sh_hr.iso88592 150
szl_pl 310
ar_ae 152
nynorsk 153
en_bw 154
iso-8859-1 155
tl_ph 518
bulgarian 299
ts 356
kn 159
af 160
wa 161
or_in 162
dansk 163
bs 522
be@latin 164
lij_it 398
ko_kr 167
tr_tr 168
ar_in 169
os_ru 170
sr_yu@cyrillic 171
ta_lk 172
sr_rs 490
es_ec 174
en_be 175
no_no.iso88591@nynorsk 176
zh_cn.big5 177
pt_pt 178
an_es 179
zh_hk 180
es_cl 181
unm_us 312
am 183
as 184
cv_ru 185
ar_aa 186
gd 419
ti_er 187
ar_lb 188
sp 189
ja_jp.euc 190
csb_pl 191
el_gr 192
de_be 193
bokmål 194
danish 195
be_by@latin 196
kw 198
iso_8859_15 301
sr_yu.iso88595@cyrillic 199
cs_cz 200
tn 201
ar_tn 202
or 203
se_no 204
mhr_ru 495
be_by 206
eu_fr 406
de_at 207
tr_cy 104
mai_in 209
zu_za 210
sh_hr 211
ta_in.tscii 212
sr_yu.utf8 213
de_ch 214
dv_mv 236
mk 215
mt 216
fa 217
tt_ru 218
ga_ie 306
iw_il 219
li_be 220
ka_ge.georgianacademy 221
az_az.iso88599e 222
eng_gb 223
en_zw 224
en_dl.utf8 75
estonian 226
es_pa 227
sw_ke 228
es_pe 229
pa_pk 230
hebrew 231
niu_nu 232
lo_la 233
ca_es 309
sq_al 235
ka_ge.georgianrs 305
ca 238
tt_ru.tatarcyr 239
zh_hk.big5hk 240
nb 241
mg_mg 242
eo_eo 510
kl_gl 411
lo 244
iu_ca 245
thai 517
as_in 246
en_ng 313
ar_om 248
ia 249
eo_us.utf8 250
ur_pk 251
vi_vn.tcvn 252
ar_eg 253
es_py 254
ru_ua 255
nn 256
hr 504
chinese-s 258
sc_it 259
ta_in.tscii0 260
korean 261
nr_za 262
si 263
zh_sg 264
portuguese_brazil 440
bokmal 482
ber_dz 266
pa 316
ee_ee 526
american 268
en_za 269
lo_la.cp1133 270
pa_in 271
en_uk 272
sat_in 273
so_so 274
finnish 275
cy_gb 277
mi_nz 278
gez_et 279
german 20
am_et 281
ko_kr.euc 543
es_cu 144
sd 69
ti_et 156
en_ca 506
sr_yu.microsoftcp1251@cyrillic 87
c.ascii 402
lv 287
ka_ge.georgianps 288
pl_pl 237
ar_kw 290
hrvatski 7
bo_in 292
dutch.iso88591 293
pd_de 294
in_id 296
ms 297
hsb_de 298
sr_yu.utf8@cyrillic 157
th_th.tis620 300
lb_lu 315
lg_ug 302
uz_uz@cyrillic 304
sh_sp 314
tg_tj 129
ku_tr 307
deutsch 105
ar_ly 536
nds_nl 390
my_mm 308
fy_nl 234
aa_er 151
kw_gb 311
hy_am 247
romanian 267
wo_sn 122
so_ke 320
sr_yu.iso88592 322
pl 295
sp_yu 324
be 325
et_ee 326
en_ie 328
es_do 329
en_sg 330
it_ch 331
bs_ba 332
el_gr@euro 333
sinhala 334
hu 335
tt_ru@iqtelif 336
ger_de 337
iu_ca.nunacom8 78
ph_ph 339
en_ph 469
rw_rw 393
so_et 340
ka 341
ur_in 205
hr_hr 343
ar_sa 344
french_france 345
sk 346
es_pr 347
galician 349
ff_sn 350
sq_mk 56
ny_no 352
ro 353
zh_cn 354
tt 355
nhn_mx 427
en_dk 372
ar_iq 358
lt_lt 359
dutch 360
slovenian 361
cz 362
nso_za 508
cz_cz 428
ss 364
ar_sy 365
en_gb 366
byn_er 367
ayc_pe 368
en_zw.utf8 338
ug_cn 14
es_ni 371
catalan 84
english_us 373
hi_in.isciidev 374
eu_es 422
ca_fr 375
vi_vn.tcvn5712 376
so_dj 50
nl_nl 378
en_zm 379
posix-utf2 380
el 525
lo_la.ibmcp1133 382
en 383
th_th 70
ka_ge 385
kk_kz 386
a3_az.koic 387
fr 388
de_lu 389
zh 21
es_gt 542
oc_fr 391
ta 392
sv_se 116
st 10
galego 395
eu 158
sr_sp 529
sr_yu.cp1251@cyrillic 166
es_ar 400
mk_mk 401
english_united-states.437 18
dz_bt 351
ga 432
en_us 404
ar_jo 405
es_uy 342
tl 407
c-french 408
english_united-states 409
en_hk 410
br 478
nso 243
spanish_spain 412
xh 413
yi_us 414
ps_af 415
zh_tw 416
bho_in 417
ia_fr 435
ss_za 418
gv 291
es_bo 420
eo 491
gl_es 421
ja_jp 319
tn_za 423
crh_ua 424
sw_tz 425
jp_jp 426
sh_ba.iso88592@bosnia 357
km_kh 363
sv 399
no@nynorsk 16
vi 403
hy_am.armscii8 433
ru_ru 434
univ 276
mr_in 436
ur 437
ht_ht 438
japan 439
sh 377
fr_lu 441
es_hn 442
ast_es 443
ta_in 444
sd_pk 445
portuguese 446
ts_za 447
mi 448
lithuanian 488
c.en 450
zh_cn.euc 321
az_az 452
ko 537
sr@latn 454
es_us 455
ny 456
is 182
iso8859-1 431
fy_de 197
oc 459
icelandic 460
es_es 461
greek 462
pp_an 284
da 464
ha_ng 465
ks_in@devanagari.utf8 38
el_cy 512
pd_us 467
th 468
ja_jp.pck 149
ru 470
c 396
ca_es@valencia 458
uk 472
rumanian 473
français 474
ja_jp.mscode 82
tg 476
es_sv 477
japanese.euc 99
ca_it 479
c_c.c 25
english 480
es_ve 394
kl 483
ve 484
sr_cs@latn 485
ar_dz 486
aa_et 487
bo_cn 109
iw_il.utf8 145
nn_no 489
vi_vn 173
spanish 79
ca_ad 492
vi_vn.viscii111 494
c_c 451
nan_tw@latin 370
ar_sd 498
vi_vn.viscii 496
ms_my 501
es_co 502
posix 503
niu_nz 257
ks 505
id 430
iso-8859-15 507
sd_in 327
es 509
th_th.tactis 41
iso8859-15 471
bn_bd 511
hu_hu 323
nds_de 513
nr 514
slovene 208
sl_si 516
ve_za 317
sh_yu 545
sr@cyrillic 519
slovak 521
pd 497
serbocroatian 132
ph 457
sa_in 381
fi 348
nl_be 527
sr_me 165
swedish 397
sl_cs 530
ar_ye 524
yo_ng 531
eesti 532
hungarian 533
no_no 534
hi 548
uz 535
in 449
om_et 453
sr_cs 538
xh_za 539
pt 541
universal.utf8@ucs4 520
ks_in 493
bem_zm 544
hi_in 289
eo.utf8 546
uk_ua 547
zh_sg.gbk 540
te_in 549
sl 550
lo_la.mulelao1 551
所以传给localeIdMap一个locale的字符串,就可以将其转换成数值型,如果传入的字符串不在localeIdMap的key中,则返回0,这也就体现了defaultdict(int)的作用
print(localeIdMap['en_GB'.lower()])
print(localeIdMap['en_US'.lower()])
print(localeIdMap['id_ID'.lower()])
print(localeIdMap['ka_GE'.lower()])
366
404
110
385
2.birthyear列处理
该列处理比较简单,存在就直接转换成数值,不存在就用0填充
def getBirthYearInt(birthYear):try:return 0 if birthYear=="None" else int(birthYear)except:return 0
print(getBirthYearInt(1992))
print(getBirthYearInt(None))
1992
0
3.gender列处理
male转换为1, female转换为2,空值用0填充
from collections import defaultdict
genderIdMap = defaultdict(int, {'male':1, 'female':2})
print(genderIdMap['male'])
print(genderIdMap['female'])
print(genderIdMap[None])
1
2
0
4.joinedAt列处理
我们发现该列信息有些共性特点:
import pandas as pd
df_users = pd.read_csv('users.csv')
df_users['joinedAt'][:10]
0 2012-10-02T06:40:55.524Z
1 2012-09-29T18:03:12.111Z
2 2012-10-06T03:14:07.149Z
3 2012-11-04T08:59:43.783Z
4 2012-09-10T16:06:53.132Z
5 2012-11-01T09:59:17.590Z
6 2012-10-03T05:22:17.637Z
7 2012-10-03T12:19:29.975Z
8 2012-10-31T10:11:57.668Z
9 2012-10-02T07:28:09.555Z
Name: joinedAt, dtype: object
我们发现该列要么是None,要么是上面的时间字符串,均有T在中间和Z在尾部,根据这个共性我们用datetime模块,提取时间信息:
import datetime
def getJoinedYearMonth(dateString):try:dttm = datetime.datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%S.%fZ")return "".join( [str(dttm.year), str(dttm.month)] )except:return 0
df_users['joinedAt'].map(getJoinedYearMonth)[:10]
0 201210
1 20129
2 201210
3 201211
4 20129
5 201211
6 201210
7 201210
8 201210
9 201210
Name: joinedAt, dtype: object
5.location列处理
我们来看看users.csv中location列信息(前20行):
df_users['location'][:20]
0 Medan Indonesia
1 Medan Indonesia
2 Stratford Ontario
3 Tehran Iran
4 NaN
5 Tbilisi Georgia
6 Medan Indonesia
7 Medan Indonesia
8 Medan Indonesia
9 Medan Indonesia
10 Medan Indonesia
11 Phnom Penh
12 Djokja Yogyakarta Indonesia
13 Triolet Mauritius
14 NaN
15 NaN
16 NaN
17 Surabaya Indonesia
18 Medan Indonesia
19 NaN
Name: location, dtype: object
我们使用pycountry模块来将此列转换为数值型,pycountry.countries是个迭代器:
import pycountry
from collections import defaultdict
countryIdMap = defaultdict(int)
for i, c in enumerate(pycountry.countries):countryIdMap[c.name.lower()] = i + 1
#将地址信息转换为数值型
def getCountryId(location):if (isinstance( location, str)) and len(location.strip()) > 0 and location.rfind(' ') > -1:return countryIdMap[ location[location.rindex(' ') + 2: ].lower() ]else:return 0
print(getCountryId('San Dimas California'))
print(getCountryId('Jogjakarta Indonesia'))
0
103
我们知道许多机器学习模型只能接受数值型的数据作为模型的输入,所以在这里需要将位置信息转换为数值型的数据,常见的做法是对其做one hot处理,但是这样会造成矩阵太稀疏,我们可以使用pycountry库,对位置数据按照pycountry中存储的位置信息进行编码,使用编码来代替原始的位置信息。
6.timezone列处理
比较简单,存在值就转换为int型,不存在用0填充
def getTimezoneInt(timezone):try:return int(timezone)except:return 0
print(getTimezoneInt(-240))#-240
print(getTimezoneInt(240))
print(getTimezoneInt(None))
-240
240
0
7.将上面处理的1-6列进行归一化
self.userMatrix矩阵的处理中归一化使用了sklearn.preprocessing.normalize()函数,归一化后方便计算两个user的相似度
这里只计算Event Recommendation Engine Challenge分步解析第一步中的uniqueUserPairs,他们因为同一个event事件关联起来了,有联系
计算相关性用到了scipy.spatial.distance.correlation(u, v) #计算向量u和v之间的相关系数(pearson correlation coefficient, Centered Cosine)
#第二步全部代码
from collections import defaultdict
import locale, pycountry
import scipy.sparse as ss
import scipy.io as sio
import itertools
#import cPickle
#From python3, cPickle has beed replaced by _pickle
import _pickle as cPickleimport scipy.spatial.distance as ssd
import datetime
from sklearn.preprocessing import normalize#构建用户-事件矩阵类
class ProgramEntities:"""我们只关心train和test中出现的user和event,因此重点处理这部分关联数据,经过统计:train和test中总共3391个users和13418个events"""def __init__(self):#统计训练集中有多少独立的用户的eventsuniqueUsers = set()#uniqueUsers保存总共多少个用户:3391个uniqueEvents = set()#uniqueEvents保存总共多少个events:13418个eventsForUser = defaultdict(set)#字典eventsForUser保存了每个user:所对应的eventusersForEvent = defaultdict(set)#字典usersForEvent保存了每个event:哪些user点击for filename in ['train.csv', 'test.csv']:f = open(filename)f.readline()#跳过第一行for line in f:cols = line.strip().split(',')uniqueUsers.add( cols[0] )uniqueEvents.add( cols[1] )eventsForUser[cols[0]].add( cols[1] )usersForEvent[cols[1]].add( cols[0] )f.close()self.userEventScores = ss.dok_matrix( ( len(uniqueUsers), len(uniqueEvents) ) )self.userIndex = dict()self.eventIndex = dict()for i, u in enumerate(uniqueUsers):self.userIndex[u] = ifor i, e in enumerate(uniqueEvents):self.eventIndex[e] = iftrain = open('train.csv')ftrain.readline()for line in ftrain:cols = line.strip().split(',')i = self.userIndex[ cols[0] ]j = self.eventIndex[ cols[1] ]self.userEventScores[i, j] = int( cols[4] ) - int( cols[5] )ftrain.close()sio.mmwrite('PE_userEventScores', self.userEventScores)#为了防止不必要的计算,我们找出来所有关联的用户或者关联的event#所谓关联用户指的是至少在同一个event上有行为的用户user pair#关联的event指的是至少同一个user有行为的event pairself.uniqueUserPairs = set()self.uniqueEventPairs = set()for event in uniqueEvents:users = usersForEvent[event]if len(users) > 2:self.uniqueUserPairs.update( itertools.combinations(users, 2) )for user in uniqueUsers:events = eventsForUser[user]if len(events) > 2:self.uniqueEventPairs.update( itertools.combinations(events, 2) )#rint(self.userIndex)cPickle.dump( self.userIndex, open('PE_userIndex.pkl', 'wb'))cPickle.dump( self.eventIndex, open('PE_eventIndex.pkl', 'wb') )#数据清洗类
class DataCleaner:
    """Map raw string fields from users.csv to numeric feature values.

    Unknown/missing values consistently map to 0.  Requires the `locale`
    and `pycountry` modules at construction time.
    """
    def __init__(self):
        # locale alias -> small positive id (0 reserved for unknown)
        self.localeIdMap = defaultdict(int)
        for i, l in enumerate(locale.locale_alias.keys()):
            self.localeIdMap[l] = i + 1
        # country name -> id; US/Canada subdivisions share their country's id
        self.countryIdMap = defaultdict(int)
        ctryIdx = defaultdict(int)
        for i, c in enumerate(pycountry.countries):
            self.countryIdMap[c.name.lower()] = i + 1
            # NOTE(review): pycountry names the country 'United States', so
            # the 'usa' comparison may never match — confirm against the
            # installed pycountry data before relying on subdivision ids.
            if c.name.lower() == 'usa':
                ctryIdx['US'] = i
            if c.name.lower() == 'canada':
                ctryIdx['CA'] = i
        for cc in ctryIdx.keys():
            for s in pycountry.subdivisions.get(country_code=cc):
                self.countryIdMap[s.name.lower()] = ctryIdx[cc] + 1
        # gender -> id; anything else falls back to 0
        self.genderIdMap = defaultdict(int, {'male': 1, 'female': 2})

    def getLocaleId(self, locstr):
        """Locale string -> id; unseen locales fall back to the defaultdict's 0."""
        return self.localeIdMap[locstr.lower()]

    def getBirthYearInt(self, birthYear):
        """Birth year -> int; the literal 'None' and unparsable values become 0."""
        try:
            return 0 if birthYear == 'None' else int(birthYear)
        except (ValueError, TypeError):  # narrowed from a bare except
            return 0

    def getGenderId(self, genderStr):
        """male -> 1, female -> 2, anything else -> 0."""
        return self.genderIdMap[genderStr]

    def getJoinedYearMonth(self, dateString):
        """ISO-8601 timestamp -> 'YYYYMM' string.

        BUG FIX: the month is now zero-padded, so Jan 2012 yields '201201'
        instead of the mis-scaled '20121' produced by plain str(month).
        """
        dttm = datetime.datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%S.%fZ")
        return "%04d%02d" % (dttm.year, dttm.month)

    def getCountryId(self, location):
        """Location string -> country id via its trailing token; 0 when absent."""
        # NOTE(review): '+ 2' skips the character right after the last space;
        # it looks aimed at a double-space separator — confirm with the data.
        if isinstance(location, str) and len(location.strip()) > 0 and location.rfind(' ') > -1:
            return self.countryIdMap[location[location.rindex(' ') + 2:].lower()]
        return 0

    def getTimezoneInt(self, timezone):
        """Timezone offset string -> int; 0 when unparsable."""
        try:
            return int(timezone)
        except (ValueError, TypeError):  # narrowed from a bare except
            return 0
class Users:
    """Build the user/user similarity matrix from users.csv.

    Rows follow programEntities.userIndex; `sim` is any SciPy-style distance
    callable taking two dense row vectors (default: ssd.correlation).
    Side effects: writes US_userMatrix.mtx and US_userSimMatrix.mtx.
    """
    def __init__(self, programEntities, sim=ssd.correlation):
        cleaner = DataCleaner()
        nusers = len(programEntities.userIndex.keys())  # 3391 per the author
        fin = open('users.csv')
        colnames = fin.readline().strip().split(',')  # 7 columns in users.csv
        # one feature column per users.csv column, minus the user-id column
        self.userMatrix = ss.dok_matrix( (nusers, len(colnames)-1 ) )
        for line in fin:
            cols = line.strip().split(',')
            # keep only users that appear in train.csv/test.csv
            # (original author's note claimed "train only", but userIndex
            # actually covers both files)
            if cols[0] in programEntities.userIndex:
                i = programEntities.userIndex[ cols[0] ]
                self.userMatrix[i, 0] = cleaner.getLocaleId( cols[1] )       # locale
                self.userMatrix[i, 1] = cleaner.getBirthYearInt( cols[2] )   # birthyear, 0 when missing
                self.userMatrix[i, 2] = cleaner.getGenderId( cols[3] )       # gender
                self.userMatrix[i, 3] = cleaner.getJoinedYearMonth( cols[4] )# joinedAt as yyyymm
                self.userMatrix[i, 4] = cleaner.getCountryId( cols[5] )      # location
                self.userMatrix[i, 5] = cleaner.getTimezoneInt( cols[6] )    # timezone
        fin.close()
        # column-wise L1 normalization before computing similarities
        self.userMatrix = normalize(self.userMatrix, norm='l1', axis=0, copy=False)
        sio.mmwrite('US_userMatrix', self.userMatrix)
        # pairwise similarity, computed only for users that co-occur on an event
        self.userSimMatrix = ss.dok_matrix( (nusers, nusers) )  # (3391, 3391)
        for i in range(0, nusers):
            self.userSimMatrix[i, i] = 1.0  # a user is fully similar to itself
        for u1, u2 in programEntities.uniqueUserPairs:
            i = programEntities.userIndex[u1]
            j = programEntities.userIndex[u2]
            if (i, j) not in self.userSimMatrix:
                # sim() receives two dense 1 x nfeatures row vectors
                usim = sim(self.userMatrix.getrow(i).todense(),self.userMatrix.getrow(j).todense())
                self.userSimMatrix[i, j] = usim  # stored symmetrically
                self.userSimMatrix[j, i] = usim
        sio.mmwrite('US_userSimMatrix', self.userSimMatrix)

print('第1步:统计user和event相关信息...')
# Step 1: collect user/event statistics (writes the PE_* artifacts).
pe = ProgramEntities()
print('第1步完成...\n')

# Step 2: build and store the user/user similarity matrix (writes US_*).
print('第2步:计算用户相似度信息,并用矩阵形式存储...')
Users(pe)
print('第2步完成...\n')
第1步:统计user和event相关信息...
第1步完成...第2步:计算用户相似度信息,并用矩阵形式存储...
第2步完成...
第三步:用户社交关系信息处理
这一步需要user_friends.csv.gz文件,我们先来看看文件内容:
import pandas as pd
# Peek at the social-graph file: one row per user, with friends stored as a
# single space-separated id list.
df_user_friends = pd.read_csv('user_friends.csv.gz', compression='gzip')
df_user_friends.head()
user | friends | |
---|---|---|
0 | 3197468391 | 1346449342 3873244116 4226080662 1222907620 54... |
1 | 3537982273 | 1491560444 395798035 2036380346 899375619 3534... |
2 | 823183725 | 1484954627 1950387873 1652977611 4185960823 42... |
3 | 1872223848 | 83361640 723814682 557944478 1724049724 253059... |
4 | 3429017717 | 4253303705 2130310957 1838389374 3928735761 71... |
- 1)如果你有更多的朋友,可能你性格外向,更容易参加各种活动
- 2)如果你朋友会参加某个活动,可能你也会跟随去参加一下
# 第三步完整代码
from collections import defaultdict
import locale, pycountry
import scipy.sparse as ss
import scipy.io as sio
import itertools
#import cPickle
#From Python 3, cPickle has been replaced by _pickle
import _pickle as cPickleimport scipy.spatial.distance as ssd
import datetime
from sklearn.preprocessing import normalizeimport gzip
import numpy as np#处理user和event关联数据
class ProgramEntities:
    """Collect the users and events that actually appear in train.csv/test.csv.

    Only those entities are processed downstream (per the author's count:
    3391 users and 13418 events).  Side effects: writes PE_userEventScores.mtx,
    PE_userIndex.pkl and PE_eventIndex.pkl to the working directory.
    """
    def __init__(self):
        uniqueUsers = set()                 # all user ids seen in either file
        uniqueEvents = set()                # all event ids seen in either file
        eventsForUser = defaultdict(set)    # user id -> set of event ids acted on
        usersForEvent = defaultdict(set)    # event id -> set of user ids acting on it
        for filename in ['train.csv', 'test.csv']:
            f = open(filename)
            f.readline()  # skip the header row
            for line in f:
                cols = line.strip().split(',')
                uniqueUsers.add( cols[0] )
                uniqueEvents.add( cols[1] )
                eventsForUser[cols[0]].add( cols[1] )
                usersForEvent[cols[1]].add( cols[0] )
            f.close()
        # user x event preference matrix; filled from train.csv only
        self.userEventScores = ss.dok_matrix( ( len(uniqueUsers), len(uniqueEvents) ) )
        self.userIndex = dict()   # user id -> row index
        self.eventIndex = dict()  # event id -> column index
        for i, u in enumerate(uniqueUsers):
            self.userIndex[u] = i
        for i, e in enumerate(uniqueEvents):
            self.eventIndex[e] = i
        ftrain = open('train.csv')
        ftrain.readline()  # skip header
        for line in ftrain:
            cols = line.strip().split(',')
            i = self.userIndex[ cols[0] ]
            j = self.eventIndex[ cols[1] ]
            # cols[4] is 'interested', cols[5] is 'not_interested': +1, 0 or -1
            self.userEventScores[i, j] = int( cols[4] ) - int( cols[5] )
        ftrain.close()
        sio.mmwrite('PE_userEventScores', self.userEventScores)
        # Pre-compute the only pairs worth comparing later: user pairs that
        # co-occur on some event, event pairs that share some user.
        self.uniqueUserPairs = set()
        self.uniqueEventPairs = set()
        for event in uniqueEvents:
            users = usersForEvent[event]
            # NOTE(review): '> 2' skips events with exactly two users, so that
            # pair is never generated; '>= 2' may have been intended — confirm.
            if len(users) > 2:
                self.uniqueUserPairs.update( itertools.combinations(users, 2) )
        for user in uniqueUsers:
            events = eventsForUser[user]
            if len(events) > 2:
                self.uniqueEventPairs.update( itertools.combinations(events, 2) )
        cPickle.dump( self.userIndex, open('PE_userIndex.pkl', 'wb'))
        cPickle.dump( self.eventIndex, open('PE_eventIndex.pkl', 'wb') )
class DataCleaner:
    """Convert raw string columns from users.csv into numeric codes.

    Unknown/missing values consistently map to 0.  Construction requires
    the `locale` and `pycountry` modules.
    """

    def __init__(self):
        # locale alias -> small positive id (0 is reserved for "unknown")
        self.localeIdMap = defaultdict(int)
        for idx, alias in enumerate(locale.locale_alias.keys()):
            self.localeIdMap[alias] = idx + 1
        # country name -> id, with US/Canada subdivisions folded onto their
        # country's id
        self.countryIdMap = defaultdict(int)
        ctryIdx = defaultdict(int)
        for idx, country in enumerate(pycountry.countries):
            name = country.name.lower()
            self.countryIdMap[name] = idx + 1
            if name == 'usa':
                ctryIdx['US'] = idx
            if name == 'canada':
                ctryIdx['CA'] = idx
        for code in ctryIdx.keys():
            for sub in pycountry.subdivisions.get(country_code=code):
                self.countryIdMap[sub.name.lower()] = ctryIdx[code] + 1
        # gender -> id; anything else falls back to 0
        self.genderIdMap = defaultdict(int, {'male': 1, 'female': 2})

    def getLocaleId(self, locstr):
        """Locale string -> id; unseen locales yield the defaultdict's 0."""
        return self.localeIdMap[locstr.lower()]

    def getBirthYearInt(self, birthYear):
        """Birth year -> int; the literal 'None' and unparsable input give 0."""
        try:
            if birthYear == 'None':
                return 0
            return int(birthYear)
        except:
            return 0

    def getGenderId(self, genderStr):
        """male -> 1, female -> 2, anything else -> 0."""
        return self.genderIdMap[genderStr]

    def getJoinedYearMonth(self, dateString):
        """ISO timestamp -> year+month digits (month not zero-padded)."""
        parsed = datetime.datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%S.%fZ")
        return str(parsed.year) + str(parsed.month)

    def getCountryId(self, location):
        """Location string -> country id via its trailing token; 0 when absent."""
        usable = isinstance(location, str) and len(location.strip()) > 0 and location.rfind(' ') > -1
        if not usable:
            return 0
        # NOTE(review): the '+ 2' offset drops the first character after the
        # last space; it looks aimed at a double-space separator — confirm.
        return self.countryIdMap[location[location.rindex(' ') + 2:].lower()]

    def getTimezoneInt(self, timezone):
        """Timezone offset string -> int; 0 when unparsable."""
        try:
            return int(timezone)
        except:
            return 0
class Users:
    """Build the user/user similarity matrix from users.csv.

    Rows follow programEntities.userIndex; `sim` is any SciPy-style distance
    callable taking two dense row vectors (default: ssd.correlation).
    Side effects: writes US_userMatrix.mtx and US_userSimMatrix.mtx.
    """
    def __init__(self, programEntities, sim=ssd.correlation):
        cleaner = DataCleaner()
        nusers = len(programEntities.userIndex.keys())  # 3391 per the author
        fin = open('users.csv')
        colnames = fin.readline().strip().split(',')  # 7 columns in users.csv
        # one feature column per users.csv column, minus the user-id column
        self.userMatrix = ss.dok_matrix( (nusers, len(colnames)-1 ) )
        for line in fin:
            cols = line.strip().split(',')
            # keep only users that appear in train.csv/test.csv
            if cols[0] in programEntities.userIndex:
                i = programEntities.userIndex[ cols[0] ]
                self.userMatrix[i, 0] = cleaner.getLocaleId( cols[1] )       # locale
                self.userMatrix[i, 1] = cleaner.getBirthYearInt( cols[2] )   # birthyear, 0 when missing
                self.userMatrix[i, 2] = cleaner.getGenderId( cols[3] )       # gender
                self.userMatrix[i, 3] = cleaner.getJoinedYearMonth( cols[4] )# joinedAt as yyyymm
                self.userMatrix[i, 4] = cleaner.getCountryId( cols[5] )      # location
                self.userMatrix[i, 5] = cleaner.getTimezoneInt( cols[6] )    # timezone
        fin.close()
        # column-wise L1 normalization before computing similarities
        self.userMatrix = normalize(self.userMatrix, norm='l1', axis=0, copy=False)
        sio.mmwrite('US_userMatrix', self.userMatrix)
        # pairwise similarity, computed only for users that co-occur on an event
        self.userSimMatrix = ss.dok_matrix( (nusers, nusers) )  # (3391, 3391)
        for i in range(0, nusers):
            self.userSimMatrix[i, i] = 1.0  # a user is fully similar to itself
        for u1, u2 in programEntities.uniqueUserPairs:
            i = programEntities.userIndex[u1]
            j = programEntities.userIndex[u2]
            if (i, j) not in self.userSimMatrix:
                # sim() receives two dense 1 x nfeatures row vectors
                usim = sim(self.userMatrix.getrow(i).todense(),self.userMatrix.getrow(j).todense())
                self.userSimMatrix[i, j] = usim  # stored symmetrically
                self.userSimMatrix[j, i] = usim
        sio.mmwrite('US_userSimMatrix', self.userSimMatrix)
class UserFriends:
    """Mine the social graph in user_friends.csv.gz.

    Intuition: (1) users with more friends may be more social and attend more
    events; (2) a user may follow friends to events.  Produces:
      - numFriends: each user's friend count, normalized by the global total
        (e.g. the author's example: 2151 / 3731377 = 0.00057646...)
      - userFriends: user x friend matrix holding the friend's mean train-set
        event score, L1-normalized per column
    Side effects: writes UF_numFriends.mtx and UF_userFriends.mtx.
    """
    def __init__(self, programEntities):
        nusers = len(programEntities.userIndex.keys())  # 3391 users
        self.numFriends = np.zeros( (nusers) )  # friend count per user
        self.userFriends = ss.dok_matrix( (nusers, nusers) )  # friend score accumulator
        fin = gzip.open('user_friends.csv.gz')
        print( 'Header In User_friends.csv.gz:',fin.readline() )
        ln = 0  # data-line counter, used only for progress reporting
        for line in fin:
            if ln % 200 == 0:
                print( 'Loading line:', ln )
            cols = line.decode().strip().split(',')
            user = cols[0]
            # only users appearing in train.csv/test.csv matter
            if user in programEntities.userIndex:
                friends = cols[1].split(' ')  # space-separated friend id list
                i = programEntities.userIndex[user]
                self.numFriends[i] = len(friends)
                for friend in friends:
                    if friend in programEntities.userIndex:
                        j = programEntities.userIndex[friend]
                        # the friend's score row over all events (0, 1 or -1
                        # per event); its mean over the 13418 events estimates
                        # how, and in which direction, this friend could
                        # influence the user's decision
                        eventsForUser = programEntities.userEventScores.getrow(j).todense()
                        score = eventsForUser.sum() / np.shape(eventsForUser)[1]
                        self.userFriends[i, j] += score
                        self.userFriends[j, i] += score
            ln += 1
        fin.close()
        # normalize friend counts by the global number of friendships
        sumNumFriends = self.numFriends.sum(axis=0)
        print(sumNumFriends)
        self.numFriends = self.numFriends / sumNumFriends
        sio.mmwrite('UF_numFriends', np.matrix(self.numFriends) )
        self.userFriends = normalize(self.userFriends, norm='l1', axis=0, copy=False)
        sio.mmwrite('UF_userFriends', self.userFriends)

print('第1步:统计user和event相关信息...')
# Step 1: collect user/event statistics (writes the PE_* artifacts).
pe = ProgramEntities()
print('第1步完成...\n')

# Step 2 is commented out here: the similarity matrix was already written
# by the earlier cell.
print('第2步:计算用户相似度信息,并用矩阵形式存储...')
#Users(pe)
print('第2步完成...\n')

# Step 3: mine the social graph (writes the UF_* artifacts).
print('第3步:计算用户社交关系信息,并存储...')
UserFriends(pe)
print('第3步完成...\n')
第1步:统计user和event相关信息...
第1步完成...第2步:计算用户相似度信息,并用矩阵形式存储...
第2步完成...第3步:计算用户社交关系信息,并存储...
Header In User_friends.csv.gz: b'user,friends\n'
Loading line: 0
Loading line: 200
Loading line: 400
Loading line: 600
Loading line: 800
Loading line: 1000
Loading line: 1200
Loading line: 1400
Loading line: 1600
Loading line: 1800
Loading line: 2000
Loading line: 2200
Loading line: 2400
Loading line: 2600
Loading line: 2800
Loading line: 3000
Loading line: 3200
Loading line: 3400
Loading line: 3600
Loading line: 3800
Loading line: 4000
Loading line: 4200
Loading line: 4400
Loading line: 4600
Loading line: 4800
Loading line: 5000
Loading line: 5200
Loading line: 5400
Loading line: 5600
Loading line: 5800
Loading line: 6000
Loading line: 6200
Loading line: 6400
Loading line: 6600
Loading line: 6800
Loading line: 7000
Loading line: 7200
Loading line: 7400
Loading line: 7600
Loading line: 7800
Loading line: 8000
Loading line: 8200
Loading line: 8400
Loading line: 8600
Loading line: 8800
Loading line: 9000
Loading line: 9200
Loading line: 9400
Loading line: 9600
Loading line: 9800
Loading line: 10000
Loading line: 10200
Loading line: 10400
Loading line: 10600
Loading line: 10800
Loading line: 11000
Loading line: 11200
Loading line: 11400
Loading line: 11600
Loading line: 11800
Loading line: 12000
Loading line: 12200
Loading line: 12400
Loading line: 12600
Loading line: 12800
Loading line: 13000
Loading line: 13200
Loading line: 13400
Loading line: 13600
Loading line: 13800
Loading line: 14000
Loading line: 14200
Loading line: 14400
Loading line: 14600
Loading line: 14800
Loading line: 15000
Loading line: 15200
Loading line: 15400
Loading line: 15600
Loading line: 15800
Loading line: 16000
Loading line: 16200
Loading line: 16400
Loading line: 16600
Loading line: 16800
Loading line: 17000
Loading line: 17200
Loading line: 17400
Loading line: 17600
Loading line: 17800
Loading line: 18000
Loading line: 18200
Loading line: 18400
Loading line: 18600
Loading line: 18800
Loading line: 19000
Loading line: 19200
Loading line: 19400
Loading line: 19600
Loading line: 19800
Loading line: 20000
Loading line: 20200
Loading line: 20400
Loading line: 20600
Loading line: 20800
Loading line: 21000
Loading line: 21200
Loading line: 21400
Loading line: 21600
Loading line: 21800
Loading line: 22000
Loading line: 22200
Loading line: 22400
Loading line: 22600
Loading line: 22800
Loading line: 23000
Loading line: 23200
Loading line: 23400
Loading line: 23600
Loading line: 23800
Loading line: 24000
Loading line: 24200
Loading line: 24400
Loading line: 24600
Loading line: 24800
Loading line: 25000
Loading line: 25200
Loading line: 25400
Loading line: 25600
Loading line: 25800
Loading line: 26000
Loading line: 26200
Loading line: 26400
Loading line: 26600
Loading line: 26800
Loading line: 27000
Loading line: 27200
Loading line: 27400
Loading line: 27600
Loading line: 27800
Loading line: 28000
Loading line: 28200
Loading line: 28400
Loading line: 28600
Loading line: 28800
Loading line: 29000
Loading line: 29200
Loading line: 29400
Loading line: 29600
Loading line: 29800
Loading line: 30000
Loading line: 30200
Loading line: 30400
Loading line: 30600
Loading line: 30800
Loading line: 31000
Loading line: 31200
Loading line: 31400
Loading line: 31600
Loading line: 31800
Loading line: 32000
Loading line: 32200
Loading line: 32400
Loading line: 32600
Loading line: 32800
Loading line: 33000
Loading line: 33200
Loading line: 33400
Loading line: 33600
Loading line: 33800
Loading line: 34000
Loading line: 34200
Loading line: 34400
Loading line: 34600
Loading line: 34800
Loading line: 35000
Loading line: 35200
Loading line: 35400
Loading line: 35600
Loading line: 35800
Loading line: 36000
Loading line: 36200
Loading line: 36400
Loading line: 36600
Loading line: 36800
Loading line: 37000
Loading line: 37200
Loading line: 37400
Loading line: 37600
Loading line: 37800
Loading line: 38000
Loading line: 38200
3731377.0
第3步完成...
第四步:构建event和event相似度数据
我们先看看events.csv.gz:
import pandas as pd
# Peek at the event metadata: id, creator, start_time and location fields,
# then c_1..c_100 word counts plus c_other (110 columns total).
df_events_csv = pd.read_csv('events.csv.gz', compression='gzip')
df_events_csv.head()
event_id | user_id | start_time | city | state | zip | country | lat | lng | c_1 | ... | c_92 | c_93 | c_94 | c_95 | c_96 | c_97 | c_98 | c_99 | c_100 | c_other | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 684921758 | 3647864012 | 2012-10-31T00:00:00.001Z | NaN | NaN | NaN | NaN | NaN | NaN | 2 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 |
1 | 244999119 | 3476440521 | 2012-11-03T00:00:00.001Z | NaN | NaN | NaN | NaN | NaN | NaN | 2 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 |
2 | 3928440935 | 517514445 | 2012-11-05T00:00:00.001Z | NaN | NaN | NaN | NaN | NaN | NaN | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 |
3 | 2582345152 | 781585781 | 2012-10-30T00:00:00.001Z | NaN | NaN | NaN | NaN | NaN | NaN | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 |
4 | 1051165850 | 1016098580 | 2012-09-27T00:00:00.001Z | NaN | NaN | NaN | NaN | NaN | NaN | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 |
5 rows × 110 columns
对上面的信息进行数值转换
1.start_time列的信息使用 datetime库进行处理
2.city,state,zip,country列处理都利用了hashlib包:注意这里处理event信息的时候,只有那些出现在train.csv和test.csv中的event才会进入数值转换程序
import hashlib
def FeatureHash(value):
    """Hash arbitrary text to an int in [0, 65535] via the first four hex
    digits of its SHA-224 digest; blank/whitespace-only input maps to -1."""
    if not value.strip():
        return -1
    digest = hashlib.sha224(value.encode('utf-8')).hexdigest()
    return int(digest[:4], 16)

print(FeatureHash('Muaraenim'))  # 47294
print(FeatureHash('a test demo'))  # 4030
47294
4030
3.lat和lon列处理
空值用0.0填充,其他转换为自身的float型
def getFloatValue(self, value):
    """Parse value as a float; blank/whitespace-only strings map to 0.0."""
    if value.strip():
        return float(value)
    return 0.0
4.c_1之后列(也就是第10列之后)处理
- 这里用了一个矩阵eventContMatrix来保存c_1到c_100列信息,但是没有用的c_other列
5.将eventPropMatrix和eventContMatrix矩阵归一化后进行文件保存
6.使用uniqueEventPairs来计算event pairs相似度
- 利用了scipy.spatial.distance的correlation和cosine方法
## 第四步完整代码
from collections import defaultdict
import locale, pycountry
import scipy.sparse as ss
import scipy.io as sio
import itertools
#import cPickle
#From Python 3, cPickle has been replaced by _pickle
import _pickle as cPickleimport scipy.spatial.distance as ssd
import datetime
from sklearn.preprocessing import normalizeimport gzip
import numpy as npimport hashlib#处理user和event关联数据
class ProgramEntities:
    """Collect the users and events that actually appear in train.csv/test.csv.

    Only those entities are processed downstream (per the author's count:
    3391 users and 13418 events).  Side effects: writes PE_userEventScores.mtx,
    PE_userIndex.pkl and PE_eventIndex.pkl to the working directory.
    """
    def __init__(self):
        uniqueUsers = set()                 # all user ids seen in either file
        uniqueEvents = set()                # all event ids seen in either file
        eventsForUser = defaultdict(set)    # user id -> set of event ids acted on
        usersForEvent = defaultdict(set)    # event id -> set of user ids acting on it
        for filename in ['train.csv', 'test.csv']:
            f = open(filename)
            f.readline()  # skip the header row
            for line in f:
                cols = line.strip().split(',')
                uniqueUsers.add( cols[0] )
                uniqueEvents.add( cols[1] )
                eventsForUser[cols[0]].add( cols[1] )
                usersForEvent[cols[1]].add( cols[0] )
            f.close()
        # user x event preference matrix; filled from train.csv only
        self.userEventScores = ss.dok_matrix( ( len(uniqueUsers), len(uniqueEvents) ) )
        self.userIndex = dict()   # user id -> row index
        self.eventIndex = dict()  # event id -> column index
        for i, u in enumerate(uniqueUsers):
            self.userIndex[u] = i
        for i, e in enumerate(uniqueEvents):
            self.eventIndex[e] = i
        ftrain = open('train.csv')
        ftrain.readline()  # skip header
        for line in ftrain:
            cols = line.strip().split(',')
            i = self.userIndex[ cols[0] ]
            j = self.eventIndex[ cols[1] ]
            # cols[4] is 'interested', cols[5] is 'not_interested': +1, 0 or -1
            self.userEventScores[i, j] = int( cols[4] ) - int( cols[5] )
        ftrain.close()
        sio.mmwrite('PE_userEventScores', self.userEventScores)
        # Pre-compute the only pairs worth comparing later: user pairs that
        # co-occur on some event, event pairs that share some user.
        self.uniqueUserPairs = set()
        self.uniqueEventPairs = set()
        for event in uniqueEvents:
            users = usersForEvent[event]
            # NOTE(review): '> 2' skips events with exactly two users, so that
            # pair is never generated; '>= 2' may have been intended — confirm.
            if len(users) > 2:
                self.uniqueUserPairs.update( itertools.combinations(users, 2) )
        for user in uniqueUsers:
            events = eventsForUser[user]
            if len(events) > 2:
                self.uniqueEventPairs.update( itertools.combinations(events, 2) )
        cPickle.dump( self.userIndex, open('PE_userIndex.pkl', 'wb'))
        cPickle.dump( self.eventIndex, open('PE_eventIndex.pkl', 'wb') )
class DataCleaner:
    """Map raw string fields (locale, birth year, gender, dates, location,
    timezone, free text, floats) to numeric feature values.

    Unknown values consistently map to 0 (or -1 for getFeatureHash).
    Requires the `locale` and `pycountry` modules at construction time.
    """
    def __init__(self):
        # locale alias -> small positive id (0 reserved for unknown)
        self.localeIdMap = defaultdict(int)
        for i, l in enumerate(locale.locale_alias.keys()):
            self.localeIdMap[l] = i + 1
        # country name -> id; US/Canada subdivisions share their country's id
        self.countryIdMap = defaultdict(int)
        ctryIdx = defaultdict(int)
        for i, c in enumerate(pycountry.countries):
            self.countryIdMap[c.name.lower()] = i + 1
            # NOTE(review): pycountry names the country 'United States', so the
            # 'usa' comparison may never match — confirm against installed data.
            if c.name.lower() == 'usa':
                ctryIdx['US'] = i
            if c.name.lower() == 'canada':
                ctryIdx['CA'] = i
        for cc in ctryIdx.keys():
            for s in pycountry.subdivisions.get(country_code=cc):
                self.countryIdMap[s.name.lower()] = ctryIdx[cc] + 1
        # gender -> id; anything else falls back to 0
        self.genderIdMap = defaultdict(int, {'male':1, 'female':2})
    def getLocaleId(self, locstr):
        # defaultdict(int) returns 0 for unseen locales
        return self.localeIdMap[ locstr.lower() ]
    def getBirthYearInt(self, birthYear):
        # the literal 'None' and unparsable values become 0
        try:
            return 0 if birthYear == 'None' else int(birthYear)
        except:
            return 0
    def getGenderId(self, genderStr):
        # male -> 1, female -> 2, anything else -> 0
        return self.genderIdMap[genderStr]
    def getJoinedYearMonth(self, dateString):
        # e.g. '2012-10-02T...' -> '201210'.  NOTE(review): single-digit months
        # are not zero-padded ('20121' for Jan 2012), which skews the feature scale.
        dttm = datetime.datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%S.%fZ")
        return "".join( [str(dttm.year), str(dttm.month) ] )
    def getCountryId(self, location):
        # NOTE(review): '+ 2' skips the character right after the last space;
        # it looks aimed at a double-space separator — confirm with the data.
        if (isinstance( location, str)) and len(location.strip()) > 0 and location.rfind(' ') > -1:
            return self.countryIdMap[ location[location.rindex(' ') + 2: ].lower() ]
        else:
            return 0
    def getTimezoneInt(self, timezone):
        # timezone offset string -> int, 0 when unparsable
        try:
            return int(timezone)
        except:
            return 0
    def getFeatureHash(self, value):
        # hash arbitrary text to a 16-bit int; blank input maps to -1
        # (Python 3 requires encoding the string before hashing)
        if len(value.strip()) == 0:
            return -1
        else:
            return int( hashlib.sha224(value.encode('utf-8')).hexdigest()[0:4], 16)
    def getFloatValue(self, value):
        # blank -> 0.0, otherwise plain float conversion
        if len(value.strip()) == 0:
            return 0.0
        else:
            return float(value)
class Users:
    """Build the user/user similarity matrix from users.csv.

    Rows follow programEntities.userIndex; `sim` is any SciPy-style distance
    callable taking two dense row vectors (default: ssd.correlation).
    Side effects: writes US_userMatrix.mtx and US_userSimMatrix.mtx.
    """
    def __init__(self, programEntities, sim=ssd.correlation):
        cleaner = DataCleaner()
        nusers = len(programEntities.userIndex.keys())  # 3391 per the author
        fin = open('users.csv')
        colnames = fin.readline().strip().split(',')  # 7 columns in users.csv
        # one feature column per users.csv column, minus the user-id column
        self.userMatrix = ss.dok_matrix( (nusers, len(colnames)-1 ) )
        for line in fin:
            cols = line.strip().split(',')
            # keep only users that appear in train.csv/test.csv
            if cols[0] in programEntities.userIndex:
                i = programEntities.userIndex[ cols[0] ]
                self.userMatrix[i, 0] = cleaner.getLocaleId( cols[1] )       # locale
                self.userMatrix[i, 1] = cleaner.getBirthYearInt( cols[2] )   # birthyear, 0 when missing
                self.userMatrix[i, 2] = cleaner.getGenderId( cols[3] )       # gender
                self.userMatrix[i, 3] = cleaner.getJoinedYearMonth( cols[4] )# joinedAt as yyyymm
                self.userMatrix[i, 4] = cleaner.getCountryId( cols[5] )      # location
                self.userMatrix[i, 5] = cleaner.getTimezoneInt( cols[6] )    # timezone
        fin.close()
        # column-wise L1 normalization before computing similarities
        self.userMatrix = normalize(self.userMatrix, norm='l1', axis=0, copy=False)
        sio.mmwrite('US_userMatrix', self.userMatrix)
        # pairwise similarity, computed only for users that co-occur on an event
        self.userSimMatrix = ss.dok_matrix( (nusers, nusers) )  # (3391, 3391)
        for i in range(0, nusers):
            self.userSimMatrix[i, i] = 1.0  # a user is fully similar to itself
        for u1, u2 in programEntities.uniqueUserPairs:
            i = programEntities.userIndex[u1]
            j = programEntities.userIndex[u2]
            if (i, j) not in self.userSimMatrix:
                # sim() receives two dense 1 x nfeatures row vectors
                usim = sim(self.userMatrix.getrow(i).todense(),self.userMatrix.getrow(j).todense())
                self.userSimMatrix[i, j] = usim  # stored symmetrically
                self.userSimMatrix[j, i] = usim
        sio.mmwrite('US_userSimMatrix', self.userSimMatrix)
class UserFriends:
    """Mine the social graph in user_friends.csv.gz.

    Intuition: (1) users with more friends may be more social and attend more
    events; (2) a user may follow friends to events.  Produces:
      - numFriends: each user's friend count, normalized by the global total
        (author's example: 2151 / 3731377 = 0.00057646...)
      - userFriends: user x friend matrix holding the friend's mean train-set
        event score, L1-normalized per column
    Side effects: writes UF_numFriends.mtx and UF_userFriends.mtx.
    """
    def __init__(self, programEntities):
        nusers = len(programEntities.userIndex.keys())  # 3391 users
        self.numFriends = np.zeros( (nusers) )  # friend count per user
        self.userFriends = ss.dok_matrix( (nusers, nusers) )  # friend score accumulator
        fin = gzip.open('user_friends.csv.gz')
        print( 'Header In User_friends.csv.gz:',fin.readline() )
        ln = 0  # data-line counter, used only for progress reporting
        for line in fin:
            if ln % 200 == 0:
                print( 'Loading line:', ln )
            cols = line.decode().strip().split(',')
            user = cols[0]
            # only users appearing in train.csv/test.csv matter
            if user in programEntities.userIndex:
                friends = cols[1].split(' ')  # space-separated friend id list
                i = programEntities.userIndex[user]
                self.numFriends[i] = len(friends)
                for friend in friends:
                    if friend in programEntities.userIndex:
                        j = programEntities.userIndex[friend]
                        # the friend's score row over all events (0, 1 or -1
                        # per event); its mean over the 13418 events estimates
                        # how, and in which direction, this friend could
                        # influence the user's decision
                        eventsForUser = programEntities.userEventScores.getrow(j).todense()
                        score = eventsForUser.sum() / np.shape(eventsForUser)[1]
                        self.userFriends[i, j] += score
                        self.userFriends[j, i] += score
            ln += 1
        fin.close()
        # normalize friend counts by the global number of friendships
        sumNumFriends = self.numFriends.sum(axis=0)
        self.numFriends = self.numFriends / sumNumFriends
        sio.mmwrite('UF_numFriends', np.matrix(self.numFriends) )
        self.userFriends = normalize(self.userFriends, norm='l1', axis=0, copy=False)
        sio.mmwrite('UF_userFriends', self.userFriends)
class Events:
    """Build two event/event similarity matrices.

    1) property similarity via `psim` (default ssd.correlation), from the
       event metadata columns (start_time, city, state, zip, country, lat, lng)
    2) content similarity via `csim` (default ssd.cosine), from the
       c_1..c_100 word-count columns (c_other, column 109, is dropped)

    Only events present in programEntities.eventIndex are converted, and only
    pairs in programEntities.uniqueEventPairs are compared.
    Side effects: writes EV_eventPropMatrix.mtx, EV_eventContMatrix.mtx,
    EV_eventPropSim.mtx and EV_eventContSim.mtx.
    """
    def __init__(self, programEntities, psim=ssd.correlation, csim=ssd.cosine):
        cleaner = DataCleaner()
        nevents = len(programEntities.eventIndex)  # 13418 per the author
        print(nevents)
        self.eventPropMatrix = ss.dok_matrix((nevents, 7))    # metadata features
        self.eventContMatrix = ss.dok_matrix((nevents, 100))  # c_1..c_100 counts
        # FIX: context manager so the gzip handle is closed on any error
        with gzip.open('events.csv.gz') as fin:
            fin.readline()  # skip header
            for line in fin:
                cols = line.decode().strip().split(',')
                eventId = cols[0]
                if eventId in programEntities.eventIndex:
                    i = programEntities.eventIndex[eventId]
                    # FIX: cast every value explicitly before storing it in the
                    # sparse matrix — assigning raw strings relied on implicit
                    # conversion and fails on newer scipy versions.
                    self.eventPropMatrix[i, 0] = float(cleaner.getJoinedYearMonth(cols[2]))  # start_time
                    self.eventPropMatrix[i, 1] = cleaner.getFeatureHash(cols[3])  # city
                    self.eventPropMatrix[i, 2] = cleaner.getFeatureHash(cols[4])  # state
                    self.eventPropMatrix[i, 3] = cleaner.getFeatureHash(cols[5])  # zip
                    self.eventPropMatrix[i, 4] = cleaner.getFeatureHash(cols[6])  # country
                    self.eventPropMatrix[i, 5] = cleaner.getFloatValue(cols[7])   # lat
                    self.eventPropMatrix[i, 6] = cleaner.getFloatValue(cols[8])   # lng
                    # columns 9..108 are c_1..c_100; c_other (109) is dropped
                    for j in range(9, 109):
                        self.eventContMatrix[i, j - 9] = float(cols[j])
        # column-wise L1 normalization of both feature matrices
        self.eventPropMatrix = normalize(self.eventPropMatrix, norm='l1', axis=0, copy=False)
        sio.mmwrite('EV_eventPropMatrix', self.eventPropMatrix)
        self.eventContMatrix = normalize(self.eventContMatrix, norm='l1', axis=0, copy=False)
        sio.mmwrite('EV_eventContMatrix', self.eventContMatrix)
        # pairwise similarities, only for event pairs sharing at least one user
        self.eventPropSim = ss.dok_matrix((nevents, nevents))
        self.eventContSim = ss.dok_matrix((nevents, nevents))
        for e1, e2 in programEntities.uniqueEventPairs:
            i = programEntities.eventIndex[e1]
            j = programEntities.eventIndex[e2]
            if (i, j) not in self.eventPropSim:
                epsim = psim(self.eventPropMatrix.getrow(i).todense(),
                             self.eventPropMatrix.getrow(j).todense())
                self.eventPropSim[i, j] = epsim  # stored symmetrically
                self.eventPropSim[j, i] = epsim
            if (i, j) not in self.eventContSim:
                ecsim = csim(self.eventContMatrix.getrow(i).todense(),
                             self.eventContMatrix.getrow(j).todense())
                self.eventContSim[i, j] = ecsim  # stored symmetrically
                self.eventContSim[j, i] = ecsim
        sio.mmwrite('EV_eventPropSim', self.eventPropSim)
        sio.mmwrite('EV_eventContSim', self.eventContSim)

print('第1步:统计user和event相关信息...')
# Step 1: collect user/event statistics (writes the PE_* artifacts).
pe = ProgramEntities()
print('第1步完成...\n')

# Step 2 is commented out here: the similarity matrix was already written
# by the earlier cell.
print('第2步:计算用户相似度信息,并用矩阵形式存储...')
#Users(pe)
print('第2步完成...\n')

# Step 3: mine the social graph (writes the UF_* artifacts).
print('第3步:计算用户社交关系信息,并存储...')
UserFriends(pe)
print('第3步完成...\n')

# Step 4: build event/event similarity matrices (writes the EV_* artifacts).
print('第4步:计算event相似度信息,并用矩阵形式存储...')
Events(pe)
print('第4步完成...\n')
第五步:活跃度/event热度数据
由于用到event_attendees.csv.gz文件,我们先看看该文件
import pandas as pd
# Peek at attendance: per event, space-separated user id lists for the
# yes / maybe / invited / no responses.
df_events_attendees = pd.read_csv('event_attendees.csv.gz', compression='gzip')
df_events_attendees.head()
event | yes | maybe | invited | no | |
---|---|---|---|---|---|
0 | 1159822043 | 1975964455 252302513 4226086795 3805886383 142... | 2733420590 517546982 1350834692 532087573 5831... | 1723091036 3795873583 4109144917 3560622906 31... | 3575574655 1077296663 |
1 | 686467261 | 2394228942 2686116898 1056558062 3792942231 41... | 1498184352 645689144 3770076778 331335845 4239... | 1788073374 733302094 1830571649 676508092 7081... | NaN |
2 | 1186208412 | NaN | 3320380166 3810793697 | 1379121209 440668682 | 1728988561 2950720854 |
3 | 2621578336 | NaN | NaN | NaN | NaN |
4 | 855842686 | 2406118796 3550897984 294255260 1125817077 109... | 2671721559 1761448345 2356975806 2666669465 10... | 1518670705 880919237 2326414227 2673818347 332... | 3500235232 |
## 第五步全部代码
from collections import defaultdict
import locale, pycountry
import scipy.sparse as ss
import scipy.io as sio
import itertools
#import cPickle
#From Python 3, cPickle has been replaced by _pickle
import _pickle as cPickleimport scipy.spatial.distance as ssd
import datetime
from sklearn.preprocessing import normalizeimport gzip
import numpy as npimport hashlib#处理user和event关联数据
class ProgramEntities:
    """Collect the users and events that actually appear in train.csv/test.csv.

    Only those entities are processed downstream (per the author's count:
    3391 users and 13418 events).  Side effects: writes PE_userEventScores.mtx,
    PE_userIndex.pkl and PE_eventIndex.pkl to the working directory.
    """
    def __init__(self):
        uniqueUsers = set()                 # all user ids seen in either file
        uniqueEvents = set()                # all event ids seen in either file
        eventsForUser = defaultdict(set)    # user id -> set of event ids acted on
        usersForEvent = defaultdict(set)    # event id -> set of user ids acting on it
        for filename in ['train.csv', 'test.csv']:
            f = open(filename)
            f.readline()  # skip the header row
            for line in f:
                cols = line.strip().split(',')
                uniqueUsers.add( cols[0] )
                uniqueEvents.add( cols[1] )
                eventsForUser[cols[0]].add( cols[1] )
                usersForEvent[cols[1]].add( cols[0] )
            f.close()
        # user x event preference matrix; filled from train.csv only
        self.userEventScores = ss.dok_matrix( ( len(uniqueUsers), len(uniqueEvents) ) )
        self.userIndex = dict()   # user id -> row index
        self.eventIndex = dict()  # event id -> column index
        for i, u in enumerate(uniqueUsers):
            self.userIndex[u] = i
        for i, e in enumerate(uniqueEvents):
            self.eventIndex[e] = i
        ftrain = open('train.csv')
        ftrain.readline()  # skip header
        for line in ftrain:
            cols = line.strip().split(',')
            i = self.userIndex[ cols[0] ]
            j = self.eventIndex[ cols[1] ]
            # cols[4] is 'interested', cols[5] is 'not_interested': +1, 0 or -1
            self.userEventScores[i, j] = int( cols[4] ) - int( cols[5] )
        ftrain.close()
        sio.mmwrite('PE_userEventScores', self.userEventScores)
        # Pre-compute the only pairs worth comparing later: user pairs that
        # co-occur on some event, event pairs that share some user.
        self.uniqueUserPairs = set()
        self.uniqueEventPairs = set()
        for event in uniqueEvents:
            users = usersForEvent[event]
            # NOTE(review): '> 2' skips events with exactly two users, so that
            # pair is never generated; '>= 2' may have been intended — confirm.
            if len(users) > 2:
                self.uniqueUserPairs.update( itertools.combinations(users, 2) )
        for user in uniqueUsers:
            events = eventsForUser[user]
            if len(events) > 2:
                self.uniqueEventPairs.update( itertools.combinations(events, 2) )
        cPickle.dump( self.userIndex, open('PE_userIndex.pkl', 'wb'))
        cPickle.dump( self.eventIndex, open('PE_eventIndex.pkl', 'wb') )
class DataCleaner:
    """Map raw string fields (locale, birth year, gender, dates, location,
    timezone, free text, floats) to numeric feature values.

    Unknown values consistently map to 0 (or -1 for getFeatureHash).
    Requires the `locale` and `pycountry` modules at construction time.
    """
    def __init__(self):
        # locale alias -> small positive id (0 reserved for unknown)
        self.localeIdMap = defaultdict(int)
        for i, l in enumerate(locale.locale_alias.keys()):
            self.localeIdMap[l] = i + 1
        # country name -> id; US/Canada subdivisions share their country's id
        self.countryIdMap = defaultdict(int)
        ctryIdx = defaultdict(int)
        for i, c in enumerate(pycountry.countries):
            self.countryIdMap[c.name.lower()] = i + 1
            # NOTE(review): pycountry names the country 'United States', so the
            # 'usa' comparison may never match — confirm against installed data.
            if c.name.lower() == 'usa':
                ctryIdx['US'] = i
            if c.name.lower() == 'canada':
                ctryIdx['CA'] = i
        for cc in ctryIdx.keys():
            for s in pycountry.subdivisions.get(country_code=cc):
                self.countryIdMap[s.name.lower()] = ctryIdx[cc] + 1
        # gender -> id; anything else falls back to 0
        self.genderIdMap = defaultdict(int, {'male':1, 'female':2})
    def getLocaleId(self, locstr):
        # defaultdict(int) returns 0 for unseen locales
        return self.localeIdMap[ locstr.lower() ]
    def getBirthYearInt(self, birthYear):
        # the literal 'None' and unparsable values become 0
        try:
            return 0 if birthYear == 'None' else int(birthYear)
        except:
            return 0
    def getGenderId(self, genderStr):
        # male -> 1, female -> 2, anything else -> 0
        return self.genderIdMap[genderStr]
    def getJoinedYearMonth(self, dateString):
        # e.g. '2012-10-02T...' -> '201210'.  NOTE(review): single-digit months
        # are not zero-padded ('20121' for Jan 2012), which skews the feature scale.
        dttm = datetime.datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%S.%fZ")
        return "".join( [str(dttm.year), str(dttm.month) ] )
    def getCountryId(self, location):
        # NOTE(review): '+ 2' skips the character right after the last space;
        # it looks aimed at a double-space separator — confirm with the data.
        if (isinstance( location, str)) and len(location.strip()) > 0 and location.rfind(' ') > -1:
            return self.countryIdMap[ location[location.rindex(' ') + 2: ].lower() ]
        else:
            return 0
    def getTimezoneInt(self, timezone):
        # timezone offset string -> int, 0 when unparsable
        try:
            return int(timezone)
        except:
            return 0
    def getFeatureHash(self, value):
        # hash arbitrary text to a 16-bit int; blank input maps to -1
        # (Python 3 requires encoding the string before hashing)
        if len(value.strip()) == 0:
            return -1
        else:
            return int( hashlib.sha224(value.encode('utf-8')).hexdigest()[0:4], 16)
    def getFloatValue(self, value):
        # blank -> 0.0, otherwise plain float conversion
        if len(value.strip()) == 0:
            return 0.0
        else:
            return float(value)
class Users:
    """Build the user/user similarity matrix from users.csv features."""
    def __init__(self, programEntities, sim=ssd.correlation):
        # sim: distance function over two feature rows
        # (default scipy.spatial.distance.correlation).
        cleaner = DataCleaner()
        nusers = len(programEntities.userIndex.keys())  # 3391
        with open('users.csv') as fin:
            colnames = fin.readline().strip().split(',')  # 7 columns incl. id
            # Sparse user x feature matrix (id column excluded).
            self.userMatrix = ss.dok_matrix((nusers, len(colnames) - 1))
            for line in fin:
                cols = line.strip().split(',')
                # userIndex contains users from both train and test; rows for
                # users outside it are skipped.
                if cols[0] in programEntities.userIndex:
                    i = programEntities.userIndex[cols[0]]
                    self.userMatrix[i, 0] = cleaner.getLocaleId(cols[1])         # locale
                    self.userMatrix[i, 1] = cleaner.getBirthYearInt(cols[2])     # birthyear (0 if missing)
                    self.userMatrix[i, 2] = cleaner.getGenderId(cols[3])         # gender
                    self.userMatrix[i, 3] = cleaner.getJoinedYearMonth(cols[4])  # joinedAt
                    self.userMatrix[i, 4] = cleaner.getCountryId(cols[5])        # location
                    self.userMatrix[i, 5] = cleaner.getTimezoneInt(cols[6])      # timezone

        # Column-wise L1 normalization before computing similarities.
        self.userMatrix = normalize(self.userMatrix, norm='l1', axis=0, copy=False)
        sio.mmwrite('US_userMatrix', self.userMatrix)

        # User similarity matrix (3391 x 3391); only associated pairs filled.
        self.userSimMatrix = ss.dok_matrix((nusers, nusers))
        for i in range(0, nusers):
            self.userSimMatrix[i, i] = 1.0
        for u1, u2 in programEntities.uniqueUserPairs:
            i = programEntities.userIndex[u1]
            j = programEntities.userIndex[u2]
            if (i, j) not in self.userSimMatrix:
                usim = sim(self.userMatrix.getrow(i).todense(),
                           self.userMatrix.getrow(j).todense())
                # FIX: correlation() returns NaN for a constant row; store 0
                # instead, consistent with the NaN handling in Events.
                # (x != x is True only for NaN.)
                if usim != usim:
                    usim = 0.0
                self.userSimMatrix[i, j] = usim
                self.userSimMatrix[j, i] = usim
        sio.mmwrite('US_userSimMatrix', self.userSimMatrix)


# Mining the user's social relationships
class UserFriends:
    """
    Mine each user's friends list.

    Intuition:
      1) many friends may mean an outgoing user who attends more events;
      2) if a user's friends attend an event, the user may follow them.
    """
    def __init__(self, programEntities):
        nusers = len(programEntities.userIndex.keys())  # 3391
        self.numFriends = np.zeros((nusers))            # friend count per user
        self.userFriends = ss.dok_matrix((nusers, nusers))
        # FIX: use a context manager so the gzip handle is always closed.
        with gzip.open('user_friends.csv.gz') as fin:
            print('Header In User_friends.csv.gz:', fin.readline())
            ln = 0
            # For each user present in userIndex: record the friend count, and
            # for every friend also in userIndex store the friend's mean score
            # over all events (from userEventScores) as an influence weight.
            for line in fin:
                if ln % 200 == 0:
                    print('Loading line:', ln)
                cols = line.decode().strip().split(',')
                user = cols[0]
                if user in programEntities.userIndex:
                    friends = cols[1].split(' ')  # space-separated friend ids
                    i = programEntities.userIndex[user]
                    self.numFriends[i] = len(friends)
                    for friend in friends:
                        if friend in programEntities.userIndex:
                            j = programEntities.userIndex[friend]
                            # The friend's average score across all 13418
                            # events (values in {-1, 0, 1}) approximates the
                            # degree and direction of their influence.
                            eventsForUser = programEntities.userEventScores.getrow(j).todense()
                            score = eventsForUser.sum() / np.shape(eventsForUser)[1]
                            self.userFriends[i, j] += score
                            self.userFriends[j, i] += score
                ln += 1

        # Normalize friend counts to proportions of the global total,
        # e.g. 2151 friends / sum 3731377 = 0.000576...
        sumNumFriends = self.numFriends.sum(axis=0)
        # FIX: guard against division by zero on empty/filtered data.
        if sumNumFriends > 0:
            self.numFriends = self.numFriends / sumNumFriends
        sio.mmwrite('UF_numFriends', np.matrix(self.numFriends))
        self.userFriends = normalize(self.userFriends, norm='l1', axis=0, copy=False)
        sio.mmwrite('UF_userFriends', self.userFriends)


# Build event-event similarity data
class Events:
    """
    Build event-event similarities. Two kinds:
      1) eventPropSim — from event metadata (time, place), via psim;
      2) eventContSim — from event content (the 100 word-count columns), via csim.
    """
    def __init__(self, programEntities, psim=ssd.correlation, csim=ssd.cosine):
        cleaner = DataCleaner()
        nevents = len(programEntities.eventIndex)  # 13418
        print(nevents)
        self.eventPropMatrix = ss.dok_matrix((nevents, 7))
        self.eventContMatrix = ss.dok_matrix((nevents, 100))
        # FIX: use a context manager so the gzip handle is always closed.
        with gzip.open('events.csv.gz') as fin:
            fin.readline()  # skip header
            for line in fin:
                cols = line.decode().strip().split(',')
                eventId = cols[0]
                if eventId in programEntities.eventIndex:
                    i = programEntities.eventIndex[eventId]
                    self.eventPropMatrix[i, 0] = cleaner.getJoinedYearMonth(cols[2])  # start_time
                    self.eventPropMatrix[i, 1] = cleaner.getFeatureHash(cols[3])      # city
                    self.eventPropMatrix[i, 2] = cleaner.getFeatureHash(cols[4])      # state
                    self.eventPropMatrix[i, 3] = cleaner.getFeatureHash(cols[5])      # zip
                    self.eventPropMatrix[i, 4] = cleaner.getFeatureHash(cols[6])      # country
                    self.eventPropMatrix[i, 5] = cleaner.getFloatValue(cols[7])       # lat
                    self.eventPropMatrix[i, 6] = cleaner.getFloatValue(cols[8])       # lon
                    for j in range(9, 109):
                        # 100 word-count content features.
                        self.eventContMatrix[i, j - 9] = cols[j]
        self.eventPropMatrix = normalize(self.eventPropMatrix, norm='l1', axis=0, copy=False)
        sio.mmwrite('EV_eventPropMatrix', self.eventPropMatrix)
        self.eventContMatrix = normalize(self.eventContMatrix, norm='l1', axis=0, copy=False)
        sio.mmwrite('EV_eventContMatrix', self.eventContMatrix)

        # Pairwise similarities for associated event pairs only.
        self.eventPropSim = ss.dok_matrix((nevents, nevents))
        self.eventContSim = ss.dok_matrix((nevents, nevents))
        for e1, e2 in programEntities.uniqueEventPairs:
            i = programEntities.eventIndex[e1]
            j = programEntities.eventIndex[e2]
            if not ((i, j) in self.eventPropSim):
                epsim = psim(self.eventPropMatrix.getrow(i).todense(),
                             self.eventPropMatrix.getrow(j).todense())
                # correlation/cosine return NaN when a row is all zeros
                # (division by a zero norm); treat that as no similarity.
                if np.isnan(epsim):
                    epsim = 0
                self.eventPropSim[i, j] = epsim
                self.eventPropSim[j, i] = epsim
            if not ((i, j) in self.eventContSim):
                ecsim = csim(self.eventContMatrix.getrow(i).todense(),
                             self.eventContMatrix.getrow(j).todense())
                if np.isnan(ecsim):
                    ecsim = 0
                self.eventContSim[i, j] = ecsim
                self.eventContSim[j, i] = ecsim
        sio.mmwrite('EV_eventPropSim', self.eventPropSim)
        sio.mmwrite('EV_eventContSim', self.eventContSim)
# Step 5: event popularity
class EventAttendees:
    """
    Count, per event, the attendees minus the decliners, as a proxy for
    event popularity/activity.
    """
    def __init__(self, programEntities):
        nevents = len(programEntities.eventIndex)  # 13418 events in total
        self.eventPopularity = ss.dok_matrix((nevents, 1))
        # FIX: use a context manager so the gzip handle is always closed.
        with gzip.open('event_attendees.csv.gz') as f:
            f.readline()  # skip header
            for line in f:
                cols = line.decode().strip().split(',')
                eventId = cols[0]
                if eventId in programEntities.eventIndex:
                    i = programEntities.eventIndex[eventId]
                    # popularity = count of "yes" ids minus count of "no" ids
                    self.eventPopularity[i, 0] = \
                        len(cols[1].split(' ')) - len(cols[4].split(' '))
        self.eventPopularity = normalize(self.eventPopularity, norm='l1',
                                         axis=0, copy=False)
        sio.mmwrite('EA_eventPopularity', self.eventPopularity)


def data_prepare():
    """
    Compute and persist all matrices (steps 1-5) used later for feature
    extraction and modeling.
    """
    print('第1步:统计user和event相关信息...')
    pe = ProgramEntities()
    print('第1步完成...\n')
    print('第2步:计算用户相似度信息,并用矩阵形式存储...')
    Users(pe)
    print('第2步完成...\n')
    print('第3步:计算用户社交关系信息,并存储...')
    UserFriends(pe)
    print('第3步完成...\n')
    print('第4步:计算event相似度信息,并用矩阵形式存储...')
    Events(pe)
    print('第4步完成...\n')
    print('第5步:计算event热度信息...')
    EventAttendees(pe)
    print('第5步完成...\n')


# Run data preparation
# Run the full data-preparation pipeline (steps 1-5).
data_prepare()
6. 特征构建 (Feature construction)
# This is the feature-construction part
# import cPickle
# From Python 3, cPickle has been replaced by _pickle
import _pickle as cPickle
import scipy.io as sio
import numpy as np


class DataRewriter:
    """
    Combine the user-based CF, item-based CF, popularity and influence
    signals computed in steps 1-5 into feature rows for the classifier.
    """
    def __init__(self):
        # Load everything persisted by the data-preparation step.
        # FIX: the original passed open(...) directly to load and never
        # closed the handles; use context managers.
        with open('PE_userIndex.pkl', 'rb') as f:
            self.userIndex = cPickle.load(f)
        with open('PE_eventIndex.pkl', 'rb') as f:
            self.eventIndex = cPickle.load(f)
        self.userEventScores = sio.mmread('PE_userEventScores').todense()
        self.userSimMatrix = sio.mmread('US_userSimMatrix').todense()
        self.eventPropSim = sio.mmread('EV_eventPropSim').todense()
        self.eventContSim = sio.mmread('EV_eventContSim').todense()
        self.numFriends = sio.mmread('UF_numFriends')
        self.userFriends = sio.mmread('UF_userFriends').todense()
        self.eventPopularity = sio.mmread('EA_eventPopularity').todense()

    def userReco(self, userId, eventId):
        """
        User-based collaborative-filtering score for (userId, eventId).

        Pseudocode:
          for every other user v with a preference for event i, incorporate
          v's preference weighted by sim(u, v) into a running total.
        """
        i = self.userIndex[userId]
        j = self.eventIndex[eventId]
        vs = self.userEventScores[:, j]
        sims = self.userSimMatrix[i, :]
        prod = sims * vs
        try:
            # Subtract the user's own score so it does not count itself.
            return prod[0, 0] - self.userEventScores[i, j]
        except IndexError:
            return 0

    def eventReco(self, userId, eventId):
        """
        Item-based collaborative-filtering scores for (userId, eventId).

        Returns (property-based score, content-based score): the user's
        event scores weighted by the two event-event similarity matrices.
        """
        i = self.userIndex[userId]
        j = self.eventIndex[eventId]
        js = self.userEventScores[i, :]
        psim = self.eventPropSim[:, j]
        csim = self.eventContSim[:, j]
        pprod = js * psim
        cprod = js * csim
        pscore = 0
        cscore = 0
        try:
            pscore = pprod[0, 0] - self.userEventScores[i, j]
        except IndexError:
            pass
        try:
            cscore = cprod[0, 0] - self.userEventScores[i, j]
        except IndexError:
            pass
        return pscore, cscore

    def userPop(self, userId):
        """
        Social-activity proxy from the user's (normalized) friend count;
        users with many friends may attend more social events.
        """
        if userId in self.userIndex:
            i = self.userIndex[userId]
            try:
                return self.numFriends[0, i]
            except IndexError:
                return 0
        else:
            return 0

    def friendInfluence(self, userId):
        """
        Influence of the user's friends: how much the friends, on average,
        like attending events.
        """
        nusers = np.shape(self.userFriends)[1]
        i = self.userIndex[userId]
        # NOTE(review): summing with axis=0 over a single row then dividing
        # by nusers looks odd — axis=1 (sum of the user's friends' scores)
        # may be intended; confirm before changing feature semantics.
        return (self.userFriends[i, :].sum(axis=0) / nusers)[0, 0]

    def eventPop(self, eventId):
        """Popularity of the event itself (attendees minus decliners)."""
        i = self.eventIndex[eventId]
        return self.eventPopularity[i, 0]

    def rewriteData(self, start=1, train=True, header=True):
        """
        Merge the CF scores, popularity and influence signals into a new
        file (data_train.csv / data_test.csv) for the classifier.

        start: first 1-based line number to process (2 skips the header);
        train: read/write the train files if True, else the test files;
        header: write the output column header row.
        """
        fn = 'train.csv' if train else 'test.csv'
        with open(fn) as fin, open('data_' + fn, 'w') as fout:
            if header:
                ocolnames = ['invited', 'user_reco', 'evt_p_reco',
                             'evt_c_reco', 'user_pop', 'frnd_infl', 'evt_pop']
                if train:
                    ocolnames.append('interested')
                    ocolnames.append('not_interested')
                fout.write(','.join(ocolnames) + '\n')
            ln = 0
            for line in fin:
                ln += 1
                if ln < start:
                    continue
                cols = line.strip().split(',')
                # columns: user,event,invited,timestamp[,interested,not_interested]
                userId = cols[0]
                eventId = cols[1]
                invited = cols[2]
                if ln % 500 == 0:
                    print("%s : %d (userId, eventId) = (%s, %s)" % (fn, ln, userId, eventId))
                user_reco = self.userReco(userId, eventId)
                evt_p_reco, evt_c_reco = self.eventReco(userId, eventId)
                user_pop = self.userPop(userId)
                frnd_infl = self.friendInfluence(userId)
                evt_pop = self.eventPop(eventId)
                ocols = [invited, user_reco, evt_p_reco, evt_c_reco,
                         user_pop, frnd_infl, evt_pop]
                if train:
                    ocols.append(cols[4])  # interested
                    ocols.append(cols[5])  # not_interested
                fout.write(','.join(map(lambda x: str(x), ocols)) + '\n')

    def rewriteTrainingSet(self):
        # BUG FIX: was self.rewriteData(True), which bound True to `start`
        # (not `train`); pass by keyword.
        self.rewriteData(train=True)

    def rewriteTestSet(self):
        # BUG FIX: was self.rewriteData(False), which bound False to `start`
        # and left train=True — it regenerated the TRAINING file instead of
        # the test file; pass by keyword.
        self.rewriteData(train=False)


dr = DataRewriter()
# Generate the feature-augmented training file (data_train.csv),
# skipping the header row of the input (start=2).
print('生成训练数据...\n')
dr.rewriteData(train=True, start=2, header=True)

# Generate the feature-augmented prediction file (data_test.csv).
print('生成预测数据...\n')
dr.rewriteData(train=False, start=2, header=True)

print('done')
第七步:模型构建与预测 (Step 7: model building and prediction)
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings('ignore')


def train():
    """
    Train a classifier on the engineered features; the target is
    `interested`: 1 (interested) or 0 (not interested).
    """
    trainDf = pd.read_csv('data_train.csv')
    X = np.matrix(pd.DataFrame(trainDf, index=None,
                               columns=['invited', 'user_reco', 'evt_p_reco',
                                        'evt_c_reco', 'user_pop', 'frnd_infl',
                                        'evt_pop']))
    y = np.array(trainDf.interested)
    clf = SGDClassifier(loss='log', penalty='l2')
    clf.fit(X, y)
    return clf


def validate():
    """10-fold cross-validation; prints per-fold and average accuracy."""
    trainDf = pd.read_csv('data_train.csv')
    X = np.matrix(pd.DataFrame(trainDf, index=None,
                               columns=['invited', 'user_reco', 'evt_p_reco',
                                        'evt_c_reco', 'user_pop', 'frnd_infl',
                                        'evt_pop']))
    y = np.array(trainDf.interested)
    kfold = KFold(n_splits=10, shuffle=False)
    avgAccuracy = 0
    run = 0
    # FIX: loop variables renamed from (train, test), which shadowed the
    # train()/test() functions defined in this module.
    for train_idx, test_idx in kfold.split(X, y):
        Xtrain, Xtest = X[train_idx], X[test_idx]
        ytrain, ytest = y[train_idx], y[test_idx]
        clf = SGDClassifier(loss='log', penalty='l2')
        clf.fit(Xtrain, ytrain)
        # Vectorized accuracy (was a per-row predict loop).
        accuracy = np.mean(clf.predict(Xtest) == ytest)
        print('accuracy(run %d) : %f' % (run, accuracy))
        # BUG FIX: `run` and `avgAccuracy` were never updated, so every fold
        # printed "run 0" and the promised average was never reported.
        avgAccuracy += accuracy
        run += 1
    if run > 0:
        print('average accuracy : %f' % (avgAccuracy / run))


def test(clf):
    """Read the test features, predict with clf, and write result.csv."""
    origTestDf = pd.read_csv("test.csv")
    users = origTestDf.user
    events = origTestDf.event
    testDf = pd.read_csv("data_test.csv")
    # FIX: use a context manager so the output file is always closed.
    with open("result.csv", 'w') as fout:
        fout.write(",".join(["user", "event", "outcome", "dist"]) + "\n")
        nrows = len(testDf)
        Xp = np.matrix(testDf)
        yp = np.zeros((nrows, 2))
        for i in range(0, nrows):
            xp = Xp[i, :]
            yp[i, 0] = clf.predict(xp)
            yp[i, 1] = clf.decision_function(xp)
            fout.write(",".join(map(lambda x: str(x),
                                    [users[i], events[i], yp[i, 0], yp[i, 1]])) + "\n")


clf = train()
# Report 10-fold cross-validation accuracy on the engineered features.
validate()
# Predict on the test set and write result.csv (user, event, outcome, dist).
test(clf)
print('done')