第一步:统计user和event相关信息
#查看train_csv的数据
import pandas as pd
df_train = pd.read_csv('train.csv')
df_train.head()
user | event | invited | timestamp | interested | not_interested | |
---|---|---|---|---|---|---|
0 | 3044012 | 1918771225 | 0 | 2012-10-02 15:53:05.754000+00:00 | 0 | 0 |
1 | 3044012 | 1502284248 | 0 | 2012-10-02 15:53:05.754000+00:00 | 0 | 0 |
2 | 3044012 | 2529072432 | 0 | 2012-10-02 15:53:05.754000+00:00 | 1 | 0 |
3 | 3044012 | 3072478280 | 0 | 2012-10-02 15:53:05.754000+00:00 | 0 | 0 |
4 | 3044012 | 1390707377 | 0 | 2012-10-02 15:53:05.754000+00:00 | 0 | 0 |
df_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15398 entries, 0 to 15397
Data columns (total 6 columns):
user 15398 non-null int64
event 15398 non-null int64
invited 15398 non-null int64
timestamp 15398 non-null object
interested 15398 non-null int64
not_interested 15398 non-null int64
dtypes: int64(5), object(1)
memory usage: 721.9+ KB
#查看test_csv的数据
df_test = pd.read_csv('test.csv')
df_test.head()
user | event | invited | timestamp | |
---|---|---|---|---|
0 | 1776192 | 2877501688 | 0 | 2012-11-30 11:39:01.230000+00:00 |
1 | 1776192 | 3025444328 | 0 | 2012-11-30 11:39:01.230000+00:00 |
2 | 1776192 | 4078218285 | 0 | 2012-11-30 11:39:01.230000+00:00 |
3 | 1776192 | 1024025121 | 0 | 2012-11-30 11:39:01.230000+00:00 |
4 | 1776192 | 2972428928 | 0 | 2012-11-30 11:39:21.985000+00:00 |
df_test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10237 entries, 0 to 10236
Data columns (total 4 columns):
user 10237 non-null int64
event 10237 non-null int64
invited 10237 non-null int64
timestamp 10237 non-null object
dtypes: int64(3), object(1)
memory usage: 320.0+ KB
- 前两列是用户ID和对应的event ID
- 而test.csv中用户缺少了标签(interested or not_interested)
#第一步的全部程序如下
from collections import defaultdict
import scipy.sparse as ss
import scipy.io as sio
import itertools
#import cPickle
#From Python 3, cPickle has been replaced by _pickle
import _pickle as cPickle#用于分析train和test中用户和事件之间的相关性。
class ProgramEntities:"""我们只关心train和test中出现的user和event,因此重点处理这部分关联数据,经过统计:train和test中总共3391个users和13418个events"""def __init__(self):#统计训练集中有多少独立的用户的eventsuniqueUsers = set()#uniqueUsers保存总共多少个用户:3391个uniqueEvents = set()#uniqueEvents保存总共多少个events:13418个eventsForUser = defaultdict(set)#字典eventsForUser保存了每个user:所对应的eventusersForEvent = defaultdict(set)#字典usersForEvent保存了每个event:哪些user点击for filename in ['train.csv', 'test.csv']:f = open(filename)f.readline()#跳过第一行for line in f:cols = line.strip().split(',')uniqueUsers.add( cols[0] ) #统计所有的用户有哪些uniqueEvents.add( cols[1] ) #统计所有的事件有哪些eventsForUser[cols[0]].add( cols[1] ) #将用户作为键值,保存下每个用户对应的事件usersForEvent[cols[1]].add( cols[0] ) #将事件作为键值,保存下每个事件对应的用户f.close()self.userEventScores = ss.dok_matrix( ( len(uniqueUsers), len(uniqueEvents) ) )self.userIndex = dict()self.eventIndex = dict()for i, u in enumerate(uniqueUsers):self.userIndex[u] = ifor i, e in enumerate(uniqueEvents):self.eventIndex[e] = iftrain = open('train.csv')ftrain.readline()for line in ftrain:cols = line.strip().split(',')i = self.userIndex[ cols[0] ]j = self.eventIndex[ cols[1] ]self.userEventScores[i, j] = int( cols[4] ) - int( cols[5] )ftrain.close()sio.mmwrite('PE_userEventScores', self.userEventScores)#为了防止不必要的计算,我们找出来所有关联的用户或者关联的event#所谓关联用户指的是至少在同一个event上有行为的用户user pair#关联的event指的是至少同一个user有行为的event pairself.uniqueUserPairs = set()self.uniqueEventPairs = set()#查找关联用户for event in uniqueEvents:users = usersForEvent[event]if len(users) > 2:self.uniqueUserPairs.update( itertools.combinations(users, 2) )#查找关联事件for user in uniqueUsers:events = eventsForUser[user]if len(events) > 2:self.uniqueEventPairs.update( itertools.combinations(events, 2) )#rint(self.userIndex)cPickle.dump( self.userIndex, open('PE_userIndex.pkl', 'wb'))cPickle.dump( self.eventIndex, open('PE_eventIndex.pkl', 'wb') )print('第1步:统计user和event相关信息...')
pe = ProgramEntities()
print('第1步完成...\n')
第1步:统计user和event相关信息...
第1步完成...
pe.userEventScores
<3391x13418 sparse matrix of type '<class 'numpy.float64'>'with 4645 stored elements in Dictionary Of Keys format>
说明:
- 其中PE_userEventScores.mtx是所有users和events的矩阵,但是里面的值只有train.csv的值,值是1或者-1
- scipy.sparse.dok_matrix()函数是产生一个稀疏矩阵,这样PE_userEventScores.mtx只保存了非0值
- 针对该步使用的变量作简单介绍:
- uniqueUsers:集合,保存train.csv和test.csv中的所有user ID
- uniqueEvents:集合,保存train.csv和test.csv中的所有event ID
- eventsForUser:字典,key为每个用户,value为该用户对应的event集合
- usersForEvent:字典,key为每个event,value为该event对应的user集合
- userIndex:字典,每个用户有个Index
- eventIndex:字典,每个event有个Index
- userEventScores:稀疏矩阵3391 * 13418,use vs event,矩阵元素为train.csv中
每个user对某个event的兴趣分(1, 0 or -1)即interested - not_interested
import pandas as pd
pd.DataFrame(userEventScores)
userEventScores:每个user对每个event的兴趣分(1, 0 or -1)
import pandas as pd
df_train = pd.read_csv('train.csv')
df_train[df_train['event']==1502284248]
import itertools
for each in itertools.combinations(set([3044012,1302145719,3194014105,3669515588]), 2):print(each)
(3194014105, 3669515588)
(3194014105, 3044012)
(3194014105, 1302145719)
(3669515588, 3044012)
(3669515588, 1302145719)
(3044012, 1302145719)
uniqueUserPairs:集合,如果对于同一个event来说,关联上3个及3个以上users,则该event关联上的users进行两两配对,保存在uniqueUserPairs中,注意保存的是userId,而不是user对应的索引:
import pandas as pd
df_train = pd.read_csv('train.csv')
df_train[df_train['user']==3044012]
user | event | invited | timestamp | interested | not_interested | |
---|---|---|---|---|---|---|
0 | 3044012 | 1918771225 | 0 | 2012-10-02 15:53:05.754000+00:00 | 0 | 0 |
1 | 3044012 | 1502284248 | 0 | 2012-10-02 15:53:05.754000+00:00 | 0 | 0 |
2 | 3044012 | 2529072432 | 0 | 2012-10-02 15:53:05.754000+00:00 | 1 | 0 |
3 | 3044012 | 3072478280 | 0 | 2012-10-02 15:53:05.754000+00:00 | 0 | 0 |
4 | 3044012 | 1390707377 | 0 | 2012-10-02 15:53:05.754000+00:00 | 0 | 0 |
5 | 3044012 | 1532377761 | 0 | 2012-10-02 15:53:05.754000+00:00 | 0 | 0 |
import itertools
for each in itertools.combinations(set([1918771225,1502284248,2529072432, 3072478280, 1390707377, 1532377761 ]), 2):print(each)
(1532377761, 3072478280)
(1532377761, 2529072432)
(1532377761, 1390707377)
(1532377761, 1502284248)
(1532377761, 1918771225)
(3072478280, 2529072432)
(3072478280, 1390707377)
(3072478280, 1502284248)
(3072478280, 1918771225)
(2529072432, 1390707377)
(2529072432, 1502284248)
(2529072432, 1918771225)
(1390707377, 1502284248)
(1390707377, 1918771225)
(1502284248, 1918771225)
第二步:计算用户相似度
由于用到:users.csv,我们先看看其内容(看前10行)
import pandas as pd
df_users = pd.read_csv('users.csv')
df_users.head(10)
user_id | locale | birthyear | gender | joinedAt | location | timezone | |
---|---|---|---|---|---|---|---|
0 | 3197468391 | id_ID | 1993 | male | 2012-10-02T06:40:55.524Z | Medan Indonesia | 480.0 |
1 | 3537982273 | id_ID | 1992 | male | 2012-09-29T18:03:12.111Z | Medan Indonesia | 420.0 |
2 | 823183725 | en_US | 1975 | male | 2012-10-06T03:14:07.149Z | Stratford Ontario | -240.0 |
3 | 1872223848 | en_US | 1991 | female | 2012-11-04T08:59:43.783Z | Tehran Iran | 210.0 |
4 | 3429017717 | id_ID | 1995 | female | 2012-09-10T16:06:53.132Z | NaN | 420.0 |
5 | 627175141 | ka_GE | 1973 | female | 2012-11-01T09:59:17.590Z | Tbilisi Georgia | 240.0 |
6 | 2752000443 | id_ID | 1994 | male | 2012-10-03T05:22:17.637Z | Medan Indonesia | 420.0 |
7 | 3473687777 | id_ID | 1965 | female | 2012-10-03T12:19:29.975Z | Medan Indonesia | 420.0 |
8 | 2966052962 | id_ID | 1979 | male | 2012-10-31T10:11:57.668Z | Medan Indonesia | 420.0 |
9 | 264876277 | id_ID | 1988 | female | 2012-10-02T07:28:09.555Z | Medan Indonesia | 420.0 |
#使用locale和pycountry模块来将字符串转换成数值
import locale
locale.locale_alias
{'a3': 'az_AZ.KOI8-C','a3_az': 'az_AZ.KOI8-C','a3_az.koic': 'az_AZ.KOI8-C','aa_dj': 'aa_DJ.ISO8859-1','aa_er': 'aa_ER.UTF-8','aa_et': 'aa_ET.UTF-8','af': 'af_ZA.ISO8859-1','af_za': 'af_ZA.ISO8859-1','am': 'am_ET.UTF-8','am_et': 'am_ET.UTF-8','american': 'en_US.ISO8859-1','an_es': 'an_ES.ISO8859-15','ar': 'ar_AA.ISO8859-6','ar_aa': 'ar_AA.ISO8859-6','ar_ae': 'ar_AE.ISO8859-6','ar_bh': 'ar_BH.ISO8859-6','ar_dz': 'ar_DZ.ISO8859-6','ar_eg': 'ar_EG.ISO8859-6','ar_in': 'ar_IN.UTF-8','ar_iq': 'ar_IQ.ISO8859-6','ar_jo': 'ar_JO.ISO8859-6','ar_kw': 'ar_KW.ISO8859-6','ar_lb': 'ar_LB.ISO8859-6','ar_ly': 'ar_LY.ISO8859-6','ar_ma': 'ar_MA.ISO8859-6','ar_om': 'ar_OM.ISO8859-6','ar_qa': 'ar_QA.ISO8859-6','ar_sa': 'ar_SA.ISO8859-6','ar_sd': 'ar_SD.ISO8859-6','ar_sy': 'ar_SY.ISO8859-6','ar_tn': 'ar_TN.ISO8859-6','ar_ye': 'ar_YE.ISO8859-6','arabic': 'ar_AA.ISO8859-6','as': 'as_IN.UTF-8','as_in': 'as_IN.UTF-8','ast_es': 'ast_ES.ISO8859-15','ayc_pe': 'ayc_PE.UTF-8','az': 'az_AZ.ISO8859-9E','az_az': 'az_AZ.ISO8859-9E','az_az.iso88599e': 'az_AZ.ISO8859-9E','be': 'be_BY.CP1251','be@latin': 'be_BY.UTF-8@latin','be_bg.utf8': 'bg_BG.UTF-8','be_by': 'be_BY.CP1251','be_by@latin': 'be_BY.UTF-8@latin','bem_zm': 'bem_ZM.UTF-8','ber_dz': 'ber_DZ.UTF-8','ber_ma': 'ber_MA.UTF-8','bg': 'bg_BG.CP1251','bg_bg': 'bg_BG.CP1251','bho_in': 'bho_IN.UTF-8','bn_bd': 'bn_BD.UTF-8','bn_in': 'bn_IN.UTF-8','bo_cn': 'bo_CN.UTF-8','bo_in': 'bo_IN.UTF-8','bokmal': 'nb_NO.ISO8859-1','bokmål': 'nb_NO.ISO8859-1','br': 'br_FR.ISO8859-1','br_fr': 'br_FR.ISO8859-1','brx_in': 'brx_IN.UTF-8','bs': 'bs_BA.ISO8859-2','bs_ba': 'bs_BA.ISO8859-2','bulgarian': 'bg_BG.CP1251','byn_er': 'byn_ER.UTF-8','c': 'C','c-french': 'fr_CA.ISO8859-1','c.ascii': 'C','c.en': 'C','c.iso88591': 'en_US.ISO8859-1','c.utf8': 'en_US.UTF-8','c_c': 'C','c_c.c': 'C','ca': 'ca_ES.ISO8859-1','ca_ad': 'ca_AD.ISO8859-1','ca_es': 'ca_ES.ISO8859-1','ca_es@valencia': 'ca_ES.ISO8859-15@valencia','ca_fr': 'ca_FR.ISO8859-1','ca_it': 'ca_IT.ISO8859-1','catalan': 
'ca_ES.ISO8859-1','cextend': 'en_US.ISO8859-1','chinese-s': 'zh_CN.eucCN','chinese-t': 'zh_TW.eucTW','crh_ua': 'crh_UA.UTF-8','croatian': 'hr_HR.ISO8859-2','cs': 'cs_CZ.ISO8859-2','cs_cs': 'cs_CZ.ISO8859-2','cs_cz': 'cs_CZ.ISO8859-2','csb_pl': 'csb_PL.UTF-8','cv_ru': 'cv_RU.UTF-8','cy': 'cy_GB.ISO8859-1','cy_gb': 'cy_GB.ISO8859-1','cz': 'cs_CZ.ISO8859-2','cz_cz': 'cs_CZ.ISO8859-2','czech': 'cs_CZ.ISO8859-2','da': 'da_DK.ISO8859-1','da_dk': 'da_DK.ISO8859-1','danish': 'da_DK.ISO8859-1','dansk': 'da_DK.ISO8859-1','de': 'de_DE.ISO8859-1','de_at': 'de_AT.ISO8859-1','de_be': 'de_BE.ISO8859-1','de_ch': 'de_CH.ISO8859-1','de_de': 'de_DE.ISO8859-1','de_li.utf8': 'de_LI.UTF-8','de_lu': 'de_LU.ISO8859-1','deutsch': 'de_DE.ISO8859-1','doi_in': 'doi_IN.UTF-8','dutch': 'nl_NL.ISO8859-1','dutch.iso88591': 'nl_BE.ISO8859-1','dv_mv': 'dv_MV.UTF-8','dz_bt': 'dz_BT.UTF-8','ee': 'ee_EE.ISO8859-4','ee_ee': 'ee_EE.ISO8859-4','eesti': 'et_EE.ISO8859-1','el': 'el_GR.ISO8859-7','el_cy': 'el_CY.ISO8859-7','el_gr': 'el_GR.ISO8859-7','el_gr@euro': 'el_GR.ISO8859-15','en': 'en_US.ISO8859-1','en_ag': 'en_AG.UTF-8','en_au': 'en_AU.ISO8859-1','en_be': 'en_BE.ISO8859-1','en_bw': 'en_BW.ISO8859-1','en_ca': 'en_CA.ISO8859-1','en_dk': 'en_DK.ISO8859-1','en_dl.utf8': 'en_DL.UTF-8','en_gb': 'en_GB.ISO8859-1','en_hk': 'en_HK.ISO8859-1','en_ie': 'en_IE.ISO8859-1','en_in': 'en_IN.ISO8859-1','en_ng': 'en_NG.UTF-8','en_nz': 'en_NZ.ISO8859-1','en_ph': 'en_PH.ISO8859-1','en_sg': 'en_SG.ISO8859-1','en_uk': 'en_GB.ISO8859-1','en_us': 'en_US.ISO8859-1','en_us@euro@euro': 'en_US.ISO8859-15','en_za': 'en_ZA.ISO8859-1','en_zm': 'en_ZM.UTF-8','en_zw': 'en_ZW.ISO8859-1','en_zw.utf8': 'en_ZS.UTF-8','eng_gb': 'en_GB.ISO8859-1','english': 'en_EN.ISO8859-1','english_uk': 'en_GB.ISO8859-1','english_united-states': 'en_US.ISO8859-1','english_united-states.437': 'C','english_us': 'en_US.ISO8859-1','eo': 'eo_XX.ISO8859-3','eo.utf8': 'eo.UTF-8','eo_eo': 'eo_EO.ISO8859-3','eo_us.utf8': 'eo_US.UTF-8','eo_xx': 
'eo_XX.ISO8859-3','es': 'es_ES.ISO8859-1','es_ar': 'es_AR.ISO8859-1','es_bo': 'es_BO.ISO8859-1','es_cl': 'es_CL.ISO8859-1','es_co': 'es_CO.ISO8859-1','es_cr': 'es_CR.ISO8859-1','es_cu': 'es_CU.UTF-8','es_do': 'es_DO.ISO8859-1','es_ec': 'es_EC.ISO8859-1','es_es': 'es_ES.ISO8859-1','es_gt': 'es_GT.ISO8859-1','es_hn': 'es_HN.ISO8859-1','es_mx': 'es_MX.ISO8859-1','es_ni': 'es_NI.ISO8859-1','es_pa': 'es_PA.ISO8859-1','es_pe': 'es_PE.ISO8859-1','es_pr': 'es_PR.ISO8859-1','es_py': 'es_PY.ISO8859-1','es_sv': 'es_SV.ISO8859-1','es_us': 'es_US.ISO8859-1','es_uy': 'es_UY.ISO8859-1','es_ve': 'es_VE.ISO8859-1','estonian': 'et_EE.ISO8859-1','et': 'et_EE.ISO8859-15','et_ee': 'et_EE.ISO8859-15','eu': 'eu_ES.ISO8859-1','eu_es': 'eu_ES.ISO8859-1','eu_fr': 'eu_FR.ISO8859-1','fa': 'fa_IR.UTF-8','fa_ir': 'fa_IR.UTF-8','fa_ir.isiri3342': 'fa_IR.ISIRI-3342','ff_sn': 'ff_SN.UTF-8','fi': 'fi_FI.ISO8859-15','fi_fi': 'fi_FI.ISO8859-15','fil_ph': 'fil_PH.UTF-8','finnish': 'fi_FI.ISO8859-1','fo': 'fo_FO.ISO8859-1','fo_fo': 'fo_FO.ISO8859-1','fr': 'fr_FR.ISO8859-1','fr_be': 'fr_BE.ISO8859-1','fr_ca': 'fr_CA.ISO8859-1','fr_ch': 'fr_CH.ISO8859-1','fr_fr': 'fr_FR.ISO8859-1','fr_lu': 'fr_LU.ISO8859-1','français': 'fr_FR.ISO8859-1','fre_fr': 'fr_FR.ISO8859-1','french': 'fr_FR.ISO8859-1','french.iso88591': 'fr_CH.ISO8859-1','french_france': 'fr_FR.ISO8859-1','fur_it': 'fur_IT.UTF-8','fy_de': 'fy_DE.UTF-8','fy_nl': 'fy_NL.UTF-8','ga': 'ga_IE.ISO8859-1','ga_ie': 'ga_IE.ISO8859-1','galego': 'gl_ES.ISO8859-1','galician': 'gl_ES.ISO8859-1','gd': 'gd_GB.ISO8859-1','gd_gb': 'gd_GB.ISO8859-1','ger_de': 'de_DE.ISO8859-1','german': 'de_DE.ISO8859-1','german.iso88591': 'de_CH.ISO8859-1','german_germany': 'de_DE.ISO8859-1','gez_er': 'gez_ER.UTF-8','gez_et': 'gez_ET.UTF-8','gl': 'gl_ES.ISO8859-1','gl_es': 'gl_ES.ISO8859-1','greek': 'el_GR.ISO8859-7','gu_in': 'gu_IN.UTF-8','gv': 'gv_GB.ISO8859-1','gv_gb': 'gv_GB.ISO8859-1','ha_ng': 'ha_NG.UTF-8','he': 'he_IL.ISO8859-8','he_il': 'he_IL.ISO8859-8','hebrew': 
'he_IL.ISO8859-8','hi': 'hi_IN.ISCII-DEV','hi_in': 'hi_IN.ISCII-DEV','hi_in.isciidev': 'hi_IN.ISCII-DEV','hne': 'hne_IN.UTF-8','hne_in': 'hne_IN.UTF-8','hr': 'hr_HR.ISO8859-2','hr_hr': 'hr_HR.ISO8859-2','hrvatski': 'hr_HR.ISO8859-2','hsb_de': 'hsb_DE.ISO8859-2','ht_ht': 'ht_HT.UTF-8','hu': 'hu_HU.ISO8859-2','hu_hu': 'hu_HU.ISO8859-2','hungarian': 'hu_HU.ISO8859-2','hy_am': 'hy_AM.UTF-8','hy_am.armscii8': 'hy_AM.ARMSCII_8','ia': 'ia.UTF-8','ia_fr': 'ia_FR.UTF-8','icelandic': 'is_IS.ISO8859-1','id': 'id_ID.ISO8859-1','id_id': 'id_ID.ISO8859-1','ig_ng': 'ig_NG.UTF-8','ik_ca': 'ik_CA.UTF-8','in': 'id_ID.ISO8859-1','in_id': 'id_ID.ISO8859-1','is': 'is_IS.ISO8859-1','is_is': 'is_IS.ISO8859-1','iso-8859-1': 'en_US.ISO8859-1','iso-8859-15': 'en_US.ISO8859-15','iso8859-1': 'en_US.ISO8859-1','iso8859-15': 'en_US.ISO8859-15','iso_8859_1': 'en_US.ISO8859-1','iso_8859_15': 'en_US.ISO8859-15','it': 'it_IT.ISO8859-1','it_ch': 'it_CH.ISO8859-1','it_it': 'it_IT.ISO8859-1','italian': 'it_IT.ISO8859-1','iu': 'iu_CA.NUNACOM-8','iu_ca': 'iu_CA.NUNACOM-8','iu_ca.nunacom8': 'iu_CA.NUNACOM-8','iw': 'he_IL.ISO8859-8','iw_il': 'he_IL.ISO8859-8','iw_il.utf8': 'iw_IL.UTF-8','ja': 'ja_JP.eucJP','ja_jp': 'ja_JP.eucJP','ja_jp.euc': 'ja_JP.eucJP','ja_jp.mscode': 'ja_JP.SJIS','ja_jp.pck': 'ja_JP.SJIS','japan': 'ja_JP.eucJP','japanese': 'ja_JP.eucJP','japanese-euc': 'ja_JP.eucJP','japanese.euc': 'ja_JP.eucJP','jp_jp': 'ja_JP.eucJP','ka': 'ka_GE.GEORGIAN-ACADEMY','ka_ge': 'ka_GE.GEORGIAN-ACADEMY','ka_ge.georgianacademy': 'ka_GE.GEORGIAN-ACADEMY','ka_ge.georgianps': 'ka_GE.GEORGIAN-PS','ka_ge.georgianrs': 'ka_GE.GEORGIAN-ACADEMY','kk_kz': 'kk_KZ.RK1048','kl': 'kl_GL.ISO8859-1','kl_gl': 'kl_GL.ISO8859-1','km_kh': 'km_KH.UTF-8','kn': 'kn_IN.UTF-8','kn_in': 'kn_IN.UTF-8','ko': 'ko_KR.eucKR','ko_kr': 'ko_KR.eucKR','ko_kr.euc': 'ko_KR.eucKR','kok_in': 'kok_IN.UTF-8','korean': 'ko_KR.eucKR','korean.euc': 'ko_KR.eucKR','ks': 'ks_IN.UTF-8','ks_in': 'ks_IN.UTF-8','ks_in@devanagari.utf8': 
'ks_IN.UTF-8@devanagari','ku_tr': 'ku_TR.ISO8859-9','kw': 'kw_GB.ISO8859-1','kw_gb': 'kw_GB.ISO8859-1','ky': 'ky_KG.UTF-8','ky_kg': 'ky_KG.UTF-8','lb_lu': 'lb_LU.UTF-8','lg_ug': 'lg_UG.ISO8859-10','li_be': 'li_BE.UTF-8','li_nl': 'li_NL.UTF-8','lij_it': 'lij_IT.UTF-8','lithuanian': 'lt_LT.ISO8859-13','lo': 'lo_LA.MULELAO-1','lo_la': 'lo_LA.MULELAO-1','lo_la.cp1133': 'lo_LA.IBM-CP1133','lo_la.ibmcp1133': 'lo_LA.IBM-CP1133','lo_la.mulelao1': 'lo_LA.MULELAO-1','lt': 'lt_LT.ISO8859-13','lt_lt': 'lt_LT.ISO8859-13','lv': 'lv_LV.ISO8859-13','lv_lv': 'lv_LV.ISO8859-13','mag_in': 'mag_IN.UTF-8','mai': 'mai_IN.UTF-8','mai_in': 'mai_IN.UTF-8','mg_mg': 'mg_MG.ISO8859-15','mhr_ru': 'mhr_RU.UTF-8','mi': 'mi_NZ.ISO8859-1','mi_nz': 'mi_NZ.ISO8859-1','mk': 'mk_MK.ISO8859-5','mk_mk': 'mk_MK.ISO8859-5','ml': 'ml_IN.UTF-8','ml_in': 'ml_IN.UTF-8','mn_mn': 'mn_MN.UTF-8','mni_in': 'mni_IN.UTF-8','mr': 'mr_IN.UTF-8','mr_in': 'mr_IN.UTF-8','ms': 'ms_MY.ISO8859-1','ms_my': 'ms_MY.ISO8859-1','mt': 'mt_MT.ISO8859-3','mt_mt': 'mt_MT.ISO8859-3','my_mm': 'my_MM.UTF-8','nan_tw@latin': 'nan_TW.UTF-8@latin','nb': 'nb_NO.ISO8859-1','nb_no': 'nb_NO.ISO8859-1','nds_de': 'nds_DE.UTF-8','nds_nl': 'nds_NL.UTF-8','ne_np': 'ne_NP.UTF-8','nhn_mx': 'nhn_MX.UTF-8','niu_nu': 'niu_NU.UTF-8','niu_nz': 'niu_NZ.UTF-8','nl': 'nl_NL.ISO8859-1','nl_aw': 'nl_AW.UTF-8','nl_be': 'nl_BE.ISO8859-1','nl_nl': 'nl_NL.ISO8859-1','nn': 'nn_NO.ISO8859-1','nn_no': 'nn_NO.ISO8859-1','no': 'no_NO.ISO8859-1','no@nynorsk': 'ny_NO.ISO8859-1','no_no': 'no_NO.ISO8859-1','no_no.iso88591@bokmal': 'no_NO.ISO8859-1','no_no.iso88591@nynorsk': 'no_NO.ISO8859-1','norwegian': 'no_NO.ISO8859-1','nr': 'nr_ZA.ISO8859-1','nr_za': 'nr_ZA.ISO8859-1','nso': 'nso_ZA.ISO8859-15','nso_za': 'nso_ZA.ISO8859-15','ny': 'ny_NO.ISO8859-1','ny_no': 'ny_NO.ISO8859-1','nynorsk': 'nn_NO.ISO8859-1','oc': 'oc_FR.ISO8859-1','oc_fr': 'oc_FR.ISO8859-1','om_et': 'om_ET.UTF-8','om_ke': 'om_KE.ISO8859-1','or': 'or_IN.UTF-8','or_in': 'or_IN.UTF-8','os_ru': 
'os_RU.UTF-8','pa': 'pa_IN.UTF-8','pa_in': 'pa_IN.UTF-8','pa_pk': 'pa_PK.UTF-8','pap_an': 'pap_AN.UTF-8','pd': 'pd_US.ISO8859-1','pd_de': 'pd_DE.ISO8859-1','pd_us': 'pd_US.ISO8859-1','ph': 'ph_PH.ISO8859-1','ph_ph': 'ph_PH.ISO8859-1','pl': 'pl_PL.ISO8859-2','pl_pl': 'pl_PL.ISO8859-2','polish': 'pl_PL.ISO8859-2','portuguese': 'pt_PT.ISO8859-1','portuguese_brazil': 'pt_BR.ISO8859-1','posix': 'C','posix-utf2': 'C','pp': 'pp_AN.ISO8859-1','pp_an': 'pp_AN.ISO8859-1','ps_af': 'ps_AF.UTF-8','pt': 'pt_PT.ISO8859-1','pt_br': 'pt_BR.ISO8859-1','pt_pt': 'pt_PT.ISO8859-1','ro': 'ro_RO.ISO8859-2','ro_ro': 'ro_RO.ISO8859-2','romanian': 'ro_RO.ISO8859-2','ru': 'ru_RU.UTF-8','ru_ru': 'ru_RU.UTF-8','ru_ua': 'ru_UA.KOI8-U','rumanian': 'ro_RO.ISO8859-2','russian': 'ru_RU.ISO8859-5','rw': 'rw_RW.ISO8859-1','rw_rw': 'rw_RW.ISO8859-1','sa_in': 'sa_IN.UTF-8','sat_in': 'sat_IN.UTF-8','sc_it': 'sc_IT.UTF-8','sd': 'sd_IN.UTF-8','sd_in': 'sd_IN.UTF-8','sd_in@devanagari.utf8': 'sd_IN.UTF-8@devanagari','sd_pk': 'sd_PK.UTF-8','se_no': 'se_NO.UTF-8','serbocroatian': 'sr_RS.UTF-8@latin','sh': 'sr_RS.UTF-8@latin','sh_ba.iso88592@bosnia': 'sr_CS.ISO8859-2','sh_hr': 'sh_HR.ISO8859-2','sh_hr.iso88592': 'hr_HR.ISO8859-2','sh_sp': 'sr_CS.ISO8859-2','sh_yu': 'sr_RS.UTF-8@latin','shs_ca': 'shs_CA.UTF-8','si': 'si_LK.UTF-8','si_lk': 'si_LK.UTF-8','sid_et': 'sid_ET.UTF-8','sinhala': 'si_LK.UTF-8','sk': 'sk_SK.ISO8859-2','sk_sk': 'sk_SK.ISO8859-2','sl': 'sl_SI.ISO8859-2','sl_cs': 'sl_CS.ISO8859-2','sl_si': 'sl_SI.ISO8859-2','slovak': 'sk_SK.ISO8859-2','slovene': 'sl_SI.ISO8859-2','slovenian': 'sl_SI.ISO8859-2','so_dj': 'so_DJ.ISO8859-1','so_et': 'so_ET.UTF-8','so_ke': 'so_KE.ISO8859-1','so_so': 'so_SO.ISO8859-1','sp': 'sr_CS.ISO8859-5','sp_yu': 'sr_CS.ISO8859-5','spanish': 'es_ES.ISO8859-1','spanish_spain': 'es_ES.ISO8859-1','sq': 'sq_AL.ISO8859-2','sq_al': 'sq_AL.ISO8859-2','sq_mk': 'sq_MK.UTF-8','sr': 'sr_RS.UTF-8','sr@cyrillic': 'sr_RS.UTF-8','sr@latn': 'sr_CS.UTF-8@latin','sr_cs': 
'sr_CS.UTF-8','sr_cs.iso88592@latn': 'sr_CS.ISO8859-2','sr_cs@latn': 'sr_CS.UTF-8@latin','sr_me': 'sr_ME.UTF-8','sr_rs': 'sr_RS.UTF-8','sr_rs@latn': 'sr_RS.UTF-8@latin','sr_sp': 'sr_CS.ISO8859-2','sr_yu': 'sr_RS.UTF-8@latin','sr_yu.cp1251@cyrillic': 'sr_CS.CP1251','sr_yu.iso88592': 'sr_CS.ISO8859-2','sr_yu.iso88595': 'sr_CS.ISO8859-5','sr_yu.iso88595@cyrillic': 'sr_CS.ISO8859-5','sr_yu.microsoftcp1251@cyrillic': 'sr_CS.CP1251','sr_yu.utf8': 'sr_RS.UTF-8','sr_yu.utf8@cyrillic': 'sr_RS.UTF-8','sr_yu@cyrillic': 'sr_RS.UTF-8','ss': 'ss_ZA.ISO8859-1','ss_za': 'ss_ZA.ISO8859-1','st': 'st_ZA.ISO8859-1','st_za': 'st_ZA.ISO8859-1','sv': 'sv_SE.ISO8859-1','sv_fi': 'sv_FI.ISO8859-1','sv_se': 'sv_SE.ISO8859-1','sw_ke': 'sw_KE.UTF-8','sw_tz': 'sw_TZ.UTF-8','swedish': 'sv_SE.ISO8859-1','szl_pl': 'szl_PL.UTF-8','ta': 'ta_IN.TSCII-0','ta_in': 'ta_IN.TSCII-0','ta_in.tscii': 'ta_IN.TSCII-0','ta_in.tscii0': 'ta_IN.TSCII-0','ta_lk': 'ta_LK.UTF-8','te': 'te_IN.UTF-8','te_in': 'te_IN.UTF-8','tg': 'tg_TJ.KOI8-C','tg_tj': 'tg_TJ.KOI8-C','th': 'th_TH.ISO8859-11','th_th': 'th_TH.ISO8859-11','th_th.tactis': 'th_TH.TIS620','th_th.tis620': 'th_TH.TIS620','thai': 'th_TH.ISO8859-11','ti_er': 'ti_ER.UTF-8','ti_et': 'ti_ET.UTF-8','tig_er': 'tig_ER.UTF-8','tk_tm': 'tk_TM.UTF-8','tl': 'tl_PH.ISO8859-1','tl_ph': 'tl_PH.ISO8859-1','tn': 'tn_ZA.ISO8859-15','tn_za': 'tn_ZA.ISO8859-15','tr': 'tr_TR.ISO8859-9','tr_cy': 'tr_CY.ISO8859-9','tr_tr': 'tr_TR.ISO8859-9','ts': 'ts_ZA.ISO8859-1','ts_za': 'ts_ZA.ISO8859-1','tt': 'tt_RU.TATAR-CYR','tt_ru': 'tt_RU.TATAR-CYR','tt_ru.tatarcyr': 'tt_RU.TATAR-CYR','tt_ru@iqtelif': 'tt_RU.UTF-8@iqtelif','turkish': 'tr_TR.ISO8859-9','ug_cn': 'ug_CN.UTF-8','uk': 'uk_UA.KOI8-U','uk_ua': 'uk_UA.KOI8-U','univ': 'en_US.utf','universal': 'en_US.utf','universal.utf8@ucs4': 'en_US.UTF-8','unm_us': 'unm_US.UTF-8','ur': 'ur_PK.CP1256','ur_in': 'ur_IN.UTF-8','ur_pk': 'ur_PK.CP1256','uz': 'uz_UZ.UTF-8','uz_uz': 'uz_UZ.UTF-8','uz_uz@cyrillic': 'uz_UZ.UTF-8','ve': 've_ZA.UTF-8','ve_za': 
've_ZA.UTF-8','vi': 'vi_VN.TCVN','vi_vn': 'vi_VN.TCVN','vi_vn.tcvn': 'vi_VN.TCVN','vi_vn.tcvn5712': 'vi_VN.TCVN','vi_vn.viscii': 'vi_VN.VISCII','vi_vn.viscii111': 'vi_VN.VISCII','wa': 'wa_BE.ISO8859-1','wa_be': 'wa_BE.ISO8859-1','wae_ch': 'wae_CH.UTF-8','wal_et': 'wal_ET.UTF-8','wo_sn': 'wo_SN.UTF-8','xh': 'xh_ZA.ISO8859-1','xh_za': 'xh_ZA.ISO8859-1','yi': 'yi_US.CP1255','yi_us': 'yi_US.CP1255','yo_ng': 'yo_NG.UTF-8','yue_hk': 'yue_HK.UTF-8','zh': 'zh_CN.eucCN','zh_cn': 'zh_CN.gb2312','zh_cn.big5': 'zh_TW.big5','zh_cn.euc': 'zh_CN.eucCN','zh_hk': 'zh_HK.big5hkscs','zh_hk.big5hk': 'zh_HK.big5hkscs','zh_sg': 'zh_SG.GB2312','zh_sg.gbk': 'zh_SG.GBK','zh_tw': 'zh_TW.big5','zh_tw.euc': 'zh_TW.eucTW','zh_tw.euctw': 'zh_TW.eucTW','zu': 'zu_ZA.ISO8859-1','zu_za': 'zu_ZA.ISO8859-1'}
1.locale列处理
import locale
from collections import defaultdictlocaleIdMap = defaultdict(int)
for i, l in enumerate(locale.locale_alias.keys()):localeIdMap[l] = i + 1
for each in localeIdMap:print(each, '\t', localeIdMap[each])
ee 1
fr_ch 2
fo_fo 3
af_za 4
bn_in 5
mni_in 93
da_dk 8
ar_ma 9
ig_ng 369
fr_be 11
italian 12
he_il 13
aa_dj 15
ml 463
yue_hk 17
pt_br 19
es_mx 280
gu_in 22
sid_et 23
it_it 24
japanese 95
de_de 26
en_ag 523
croatian 27
it 96
cs 29
mn_mn 30
ar_bh 31
ro_ro 481
gv_gb 33
rw 34
bg_bg 35
ar 499
en_us@euro@euro 36
fil_ph 37
fr_fr 466
french 39
de 40
polish 285
kok_in 42
korean.euc 43
sr 44
sr_cs.iso88592@latn 45
pap_an 46
sr_yu.iso88595 47
turkish 51
c.utf8 52
uz_uz 53
lv_lv 429
sr_rs@latn 54
eo_xx 55
ik_ca 57
iso_8859_1 58
no_no.iso88591@bokmal 59
cextend 60
doi_in 225
universal 61
es_cr 62
hne_in 63
gd_gb 64
cy 65
nl_aw 66
yi 67
mt_mt 68
sk_sk 384
si_lk 71
a3_az 72
lt 500
st_za 73
iw 74
te 318
en_nz 528
en_in 76
zh_tw.euc 77
ne_np 49
brx_in 286
no 80
az 81
german.iso88591 475
ky 32
he 85
kn_in 86
id_id 110
mai 88
nb_no 89
czech 90
sq 91
ja 92
tr 6
german_germany 94
shs_ca 265
mr 28
fi_fi 97
wal_et 48
cs_cs 100
sd_in@devanagari.utf8 101
gez_er 102
a3 103
wae_ch 283
iu 106
nl 107
french.iso88591 108
japanese-euc 83
tig_er 98
hne 111
c.iso88591 112
ar_qa 113
chinese-t 114
fo 115
de_li.utf8 117
br_fr 118
mag_in 515
sv_fi 119
russian 120
pp 121
wa_be 123
norwegian 124
fa_ir.isiri3342 126
ky_kg 127
zh_tw.euctw 128
fre_fr 130
english_uk 131
arabic 133
fr_ca 134
ber_ma 135
ml_in 136
li_nl 137
et 138
fur_it 139
om_ke 140
gl 141
bg 142
is_is 143
sr_yu 282
tk_tm 125
en_au 146
fa_ir 147
be_bg.utf8 148
zu 303
sh_hr.iso88592 150
szl_pl 310
ar_ae 152
nynorsk 153
en_bw 154
iso-8859-1 155
tl_ph 518
bulgarian 299
ts 356
kn 159
af 160
wa 161
or_in 162
dansk 163
bs 522
be@latin 164
lij_it 398
ko_kr 167
tr_tr 168
ar_in 169
os_ru 170
sr_yu@cyrillic 171
ta_lk 172
sr_rs 490
es_ec 174
en_be 175
no_no.iso88591@nynorsk 176
zh_cn.big5 177
pt_pt 178
an_es 179
zh_hk 180
es_cl 181
unm_us 312
am 183
as 184
cv_ru 185
ar_aa 186
gd 419
ti_er 187
ar_lb 188
sp 189
ja_jp.euc 190
csb_pl 191
el_gr 192
de_be 193
bokmål 194
danish 195
be_by@latin 196
kw 198
iso_8859_15 301
sr_yu.iso88595@cyrillic 199
cs_cz 200
tn 201
ar_tn 202
or 203
se_no 204
mhr_ru 495
be_by 206
eu_fr 406
de_at 207
tr_cy 104
mai_in 209
zu_za 210
sh_hr 211
ta_in.tscii 212
sr_yu.utf8 213
de_ch 214
dv_mv 236
mk 215
mt 216
fa 217
tt_ru 218
ga_ie 306
iw_il 219
li_be 220
ka_ge.georgianacademy 221
az_az.iso88599e 222
eng_gb 223
en_zw 224
en_dl.utf8 75
estonian 226
es_pa 227
sw_ke 228
es_pe 229
pa_pk 230
hebrew 231
niu_nu 232
lo_la 233
ca_es 309
sq_al 235
ka_ge.georgianrs 305
ca 238
tt_ru.tatarcyr 239
zh_hk.big5hk 240
nb 241
mg_mg 242
eo_eo 510
kl_gl 411
lo 244
iu_ca 245
thai 517
as_in 246
en_ng 313
ar_om 248
ia 249
eo_us.utf8 250
ur_pk 251
vi_vn.tcvn 252
ar_eg 253
es_py 254
ru_ua 255
nn 256
hr 504
chinese-s 258
sc_it 259
ta_in.tscii0 260
korean 261
nr_za 262
si 263
zh_sg 264
portuguese_brazil 440
bokmal 482
ber_dz 266
pa 316
ee_ee 526
american 268
en_za 269
lo_la.cp1133 270
pa_in 271
en_uk 272
sat_in 273
so_so 274
finnish 275
cy_gb 277
mi_nz 278
gez_et 279
german 20
am_et 281
ko_kr.euc 543
es_cu 144
sd 69
ti_et 156
en_ca 506
sr_yu.microsoftcp1251@cyrillic 87
c.ascii 402
lv 287
ka_ge.georgianps 288
pl_pl 237
ar_kw 290
hrvatski 7
bo_in 292
dutch.iso88591 293
pd_de 294
in_id 296
ms 297
hsb_de 298
sr_yu.utf8@cyrillic 157
th_th.tis620 300
lb_lu 315
lg_ug 302
uz_uz@cyrillic 304
sh_sp 314
tg_tj 129
ku_tr 307
deutsch 105
ar_ly 536
nds_nl 390
my_mm 308
fy_nl 234
aa_er 151
kw_gb 311
hy_am 247
romanian 267
wo_sn 122
so_ke 320
sr_yu.iso88592 322
pl 295
sp_yu 324
be 325
et_ee 326
en_ie 328
es_do 329
en_sg 330
it_ch 331
bs_ba 332
el_gr@euro 333
sinhala 334
hu 335
tt_ru@iqtelif 336
ger_de 337
iu_ca.nunacom8 78
ph_ph 339
en_ph 469
rw_rw 393
so_et 340
ka 341
ur_in 205
hr_hr 343
ar_sa 344
french_france 345
sk 346
es_pr 347
galician 349
ff_sn 350
sq_mk 56
ny_no 352
ro 353
zh_cn 354
tt 355
nhn_mx 427
en_dk 372
ar_iq 358
lt_lt 359
dutch 360
slovenian 361
cz 362
nso_za 508
cz_cz 428
ss 364
ar_sy 365
en_gb 366
byn_er 367
ayc_pe 368
en_zw.utf8 338
ug_cn 14
es_ni 371
catalan 84
english_us 373
hi_in.isciidev 374
eu_es 422
ca_fr 375
vi_vn.tcvn5712 376
so_dj 50
nl_nl 378
en_zm 379
posix-utf2 380
el 525
lo_la.ibmcp1133 382
en 383
th_th 70
ka_ge 385
kk_kz 386
a3_az.koic 387
fr 388
de_lu 389
zh 21
es_gt 542
oc_fr 391
ta 392
sv_se 116
st 10
galego 395
eu 158
sr_sp 529
sr_yu.cp1251@cyrillic 166
es_ar 400
mk_mk 401
english_united-states.437 18
dz_bt 351
ga 432
en_us 404
ar_jo 405
es_uy 342
tl 407
c-french 408
english_united-states 409
en_hk 410
br 478
nso 243
spanish_spain 412
xh 413
yi_us 414
ps_af 415
zh_tw 416
bho_in 417
ia_fr 435
ss_za 418
gv 291
es_bo 420
eo 491
gl_es 421
ja_jp 319
tn_za 423
crh_ua 424
sw_tz 425
jp_jp 426
sh_ba.iso88592@bosnia 357
km_kh 363
sv 399
no@nynorsk 16
vi 403
hy_am.armscii8 433
ru_ru 434
univ 276
mr_in 436
ur 437
ht_ht 438
japan 439
sh 377
fr_lu 441
es_hn 442
ast_es 443
ta_in 444
sd_pk 445
portuguese 446
ts_za 447
mi 448
lithuanian 488
c.en 450
zh_cn.euc 321
az_az 452
ko 537
sr@latn 454
es_us 455
ny 456
is 182
iso8859-1 431
fy_de 197
oc 459
icelandic 460
es_es 461
greek 462
pp_an 284
da 464
ha_ng 465
ks_in@devanagari.utf8 38
el_cy 512
pd_us 467
th 468
ja_jp.pck 149
ru 470
c 396
ca_es@valencia 458
uk 472
rumanian 473
français 474
ja_jp.mscode 82
tg 476
es_sv 477
japanese.euc 99
ca_it 479
c_c.c 25
english 480
es_ve 394
kl 483
ve 484
sr_cs@latn 485
ar_dz 486
aa_et 487
bo_cn 109
iw_il.utf8 145
nn_no 489
vi_vn 173
spanish 79
ca_ad 492
vi_vn.viscii111 494
c_c 451
nan_tw@latin 370
ar_sd 498
vi_vn.viscii 496
ms_my 501
es_co 502
posix 503
niu_nz 257
ks 505
id 430
iso-8859-15 507
sd_in 327
es 509
th_th.tactis 41
iso8859-15 471
bn_bd 511
hu_hu 323
nds_de 513
nr 514
slovene 208
sl_si 516
ve_za 317
sh_yu 545
sr@cyrillic 519
slovak 521
pd 497
serbocroatian 132
ph 457
sa_in 381
fi 348
nl_be 527
sr_me 165
swedish 397
sl_cs 530
ar_ye 524
yo_ng 531
eesti 532
hungarian 533
no_no 534
hi 548
uz 535
in 449
om_et 453
sr_cs 538
xh_za 539
pt 541
universal.utf8@ucs4 520
ks_in 493
bem_zm 544
hi_in 289
eo.utf8 546
uk_ua 547
zh_sg.gbk 540
te_in 549
sl 550
lo_la.mulelao1 551
所以传给localeIdMap一个locale的字符串,就可以将其转换成数值型,如果传入的字符串不在localeIdMap的key中,则返回0,这也就体现了defaultdict(int)的作用
print(localeIdMap['en_GB'.lower()])
print(localeIdMap['en_US'.lower()])
print(localeIdMap['id_ID'.lower()])
print(localeIdMap['ka_GE'.lower()])
366
404
110
385
2.birthyear列处理
该列处理比较简单,存在就直接转换成数值,不存在就用0填充
def getBirthYearInt(birthYear):try:return 0 if birthYear=="None" else int(birthYear)except:return 0
print(getBirthYearInt(1992))
print(getBirthYearInt(None))
1992
0
3.gender列处理
male转换为1, female转换为2,空值用0填充
from collections import defaultdict
genderIdMap = defaultdict(int, {'male':1, 'female':2})
print(genderIdMap['male'])
print(genderIdMap['female'])
print(genderIdMap[None])
1
2
0
4.joinedAt列处理
我们发现该列信息有些共性特点:
import pandas as pd
df_users = pd.read_csv('users.csv')
df_users['joinedAt'][:10]
0 2012-10-02T06:40:55.524Z
1 2012-09-29T18:03:12.111Z
2 2012-10-06T03:14:07.149Z
3 2012-11-04T08:59:43.783Z
4 2012-09-10T16:06:53.132Z
5 2012-11-01T09:59:17.590Z
6 2012-10-03T05:22:17.637Z
7 2012-10-03T12:19:29.975Z
8 2012-10-31T10:11:57.668Z
9 2012-10-02T07:28:09.555Z
Name: joinedAt, dtype: object
我们发现该列要么是None,要么是上面的时间字符串,均有T在中间和Z在尾部,根据这个共性我们用datetime模块,提取时间信息:
import datetime
def getJoinedYearMonth(dateString):try:dttm = datetime.datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%S.%fZ")return "".join( [str(dttm.year), str(dttm.month)] )except:return 0
df_users['joinedAt'].map(getJoinedYearMonth)[:10]
0 201210
1 20129
2 201210
3 201211
4 20129
5 201211
6 201210
7 201210
8 201210
9 201210
Name: joinedAt, dtype: object
5.location列处理
我们来看看users.csv中location列信息(前20行):
df_users['location'][:20]
0 Medan Indonesia
1 Medan Indonesia
2 Stratford Ontario
3 Tehran Iran
4 NaN
5 Tbilisi Georgia
6 Medan Indonesia
7 Medan Indonesia
8 Medan Indonesia
9 Medan Indonesia
10 Medan Indonesia
11 Phnom Penh
12 Djokja Yogyakarta Indonesia
13 Triolet Mauritius
14 NaN
15 NaN
16 NaN
17 Surabaya Indonesia
18 Medan Indonesia
19 NaN
Name: location, dtype: object
我们使用pycountry模块来将此列转换为数值型,pycountry.countries是个迭代器:
import pycountry
from collections import defaultdict
countryIdMap = defaultdict(int)
for i, c in enumerate(pycountry.countries):countryIdMap[c.name.lower()] = i + 1
#将地址信息转换为数值型
def getCountryId(location):if (isinstance( location, str)) and len(location.strip()) > 0 and location.rfind(' ') > -1:return countryIdMap[ location[location.rindex(' ') + 2: ].lower() ]else:return 0
print(getCountryId('San Dimas California'))
print(getCountryId('Jogjakarta Indonesia'))
0
103
我们知道许多机器学习模型只能接受数值型的数据作为模型的输入,所以在这里需要将位置信息转换为数值型的数据,常见的做法是对其做one hot处理,但是这样会造成矩阵太稀疏,我们可以使用pycountry库,对位置数据按照pycountry中存储的位置信息进行编码,使用编码来代替原始的位置信息。
6.timezone列处理
比较简单,存在值就转换为int型,不存在用0填充
def getTimezoneInt(timezone):try:return int(timezone)except:return 0
print(getTimezoneInt(-240))#-240
print(getTimezoneInt(240))
print(getTimezoneInt(None))
-240
240
0
7.将上面处理的1-6列进行归一化
self.userMatrix矩阵的处理中归一化使用了sklearn.preprocessing.normalize()函数,归一化后方便计算两个user的相似度
这里只计算Event Recommendation Engine Challenge分步解析第一步中的uniqueUserPairs,他们因为同一个event事件关联起来了,有联系
计算相关性用到了scipy.spatial.distance.correlation(u, v) #计算向量u和v之间的相关系数(pearson correlation coefficient, Centered Cosine)
#第二步全部代码
from collections import defaultdict
import locale, pycountry
import scipy.sparse as ss
import scipy.io as sio
import itertools
#import cPickle
#From python3, cPickle has beed replaced by _pickle
import _pickle as cPickleimport scipy.spatial.distance as ssd
import datetime
from sklearn.preprocessing import normalize#构建用户-事件矩阵类
class ProgramEntities:"""我们只关心train和test中出现的user和event,因此重点处理这部分关联数据,经过统计:train和test中总共3391个users和13418个events"""def __init__(self):#统计训练集中有多少独立的用户的eventsuniqueUsers = set()#uniqueUsers保存总共多少个用户:3391个uniqueEvents = set()#uniqueEvents保存总共多少个events:13418个eventsForUser = defaultdict(set)#字典eventsForUser保存了每个user:所对应的eventusersForEvent = defaultdict(set)#字典usersForEvent保存了每个event:哪些user点击for filename in ['train.csv', 'test.csv']:f = open(filename)f.readline()#跳过第一行for line in f:cols = line.strip().split(',')uniqueUsers.add( cols[0] )uniqueEvents.add( cols[1] )eventsForUser[cols[0]].add( cols[1] )usersForEvent[cols[1]].add( cols[0] )f.close()self.userEventScores = ss.dok_matrix( ( len(uniqueUsers), len(uniqueEvents) ) )self.userIndex = dict()self.eventIndex = dict()for i, u in enumerate(uniqueUsers):self.userIndex[u] = ifor i, e in enumerate(uniqueEvents):self.eventIndex[e] = iftrain = open('train.csv')ftrain.readline()for line in ftrain:cols = line.strip().split(',')i = self.userIndex[ cols[0] ]j = self.eventIndex[ cols[1] ]self.userEventScores[i, j] = int( cols[4] ) - int( cols[5] )ftrain.close()sio.mmwrite('PE_userEventScores', self.userEventScores)#为了防止不必要的计算,我们找出来所有关联的用户或者关联的event#所谓关联用户指的是至少在同一个event上有行为的用户user pair#关联的event指的是至少同一个user有行为的event pairself.uniqueUserPairs = set()self.uniqueEventPairs = set()for event in uniqueEvents:users = usersForEvent[event]if len(users) > 2:self.uniqueUserPairs.update( itertools.combinations(users, 2) )for user in uniqueUsers:events = eventsForUser[user]if len(events) > 2:self.uniqueEventPairs.update( itertools.combinations(events, 2) )#rint(self.userIndex)cPickle.dump( self.userIndex, open('PE_userIndex.pkl', 'wb'))cPickle.dump( self.eventIndex, open('PE_eventIndex.pkl', 'wb') )#数据清洗类
class DataCleaner:
    """Map raw string fields from users.csv to numeric feature values.

    Unknown/missing values consistently map to 0.  Requires the `locale`
    and `pycountry` modules at construction time.
    """
    def __init__(self):
        # locale alias -> small positive id (0 reserved for unknown)
        self.localeIdMap = defaultdict(int)
        for i, l in enumerate(locale.locale_alias.keys()):
            self.localeIdMap[l] = i + 1
        # country name -> id; US/Canada subdivisions share their country's id
        self.countryIdMap = defaultdict(int)
        ctryIdx = defaultdict(int)
        for i, c in enumerate(pycountry.countries):
            self.countryIdMap[c.name.lower()] = i + 1
            # NOTE(review): pycountry names the country 'United States', so
            # the 'usa' comparison may never match — confirm against the
            # installed pycountry data before relying on subdivision ids.
            if c.name.lower() == 'usa':
                ctryIdx['US'] = i
            if c.name.lower() == 'canada':
                ctryIdx['CA'] = i
        for cc in ctryIdx.keys():
            for s in pycountry.subdivisions.get(country_code=cc):
                self.countryIdMap[s.name.lower()] = ctryIdx[cc] + 1
        # gender -> id; anything else falls back to 0
        self.genderIdMap = defaultdict(int, {'male': 1, 'female': 2})

    def getLocaleId(self, locstr):
        """Locale string -> id; unseen locales fall back to the defaultdict's 0."""
        return self.localeIdMap[locstr.lower()]

    def getBirthYearInt(self, birthYear):
        """Birth year -> int; the literal 'None' and unparsable values become 0."""
        try:
            return 0 if birthYear == 'None' else int(birthYear)
        except (ValueError, TypeError):  # narrowed from a bare except
            return 0

    def getGenderId(self, genderStr):
        """male -> 1, female -> 2, anything else -> 0."""
        return self.genderIdMap[genderStr]

    def getJoinedYearMonth(self, dateString):
        """ISO-8601 timestamp -> 'YYYYMM' string.

        BUG FIX: the month is now zero-padded, so Jan 2012 yields '201201'
        instead of the mis-scaled '20121' produced by plain str(month).
        """
        dttm = datetime.datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%S.%fZ")
        return "%04d%02d" % (dttm.year, dttm.month)

    def getCountryId(self, location):
        """Location string -> country id via its trailing token; 0 when absent."""
        # NOTE(review): '+ 2' skips the character right after the last space;
        # it looks aimed at a double-space separator — confirm with the data.
        if isinstance(location, str) and len(location.strip()) > 0 and location.rfind(' ') > -1:
            return self.countryIdMap[location[location.rindex(' ') + 2:].lower()]
        return 0

    def getTimezoneInt(self, timezone):
        """Timezone offset string -> int; 0 when unparsable."""
        try:
            return int(timezone)
        except (ValueError, TypeError):  # narrowed from a bare except
            return 0
class Users:
    """Build the user/user similarity matrix from users.csv.

    Rows follow programEntities.userIndex; `sim` is any SciPy-style distance
    callable taking two dense row vectors (default: ssd.correlation).
    Side effects: writes US_userMatrix.mtx and US_userSimMatrix.mtx.
    """
    def __init__(self, programEntities, sim=ssd.correlation):
        cleaner = DataCleaner()
        nusers = len(programEntities.userIndex.keys())  # 3391 per the author
        fin = open('users.csv')
        colnames = fin.readline().strip().split(',')  # 7 columns in users.csv
        # one feature column per users.csv column, minus the user-id column
        self.userMatrix = ss.dok_matrix( (nusers, len(colnames)-1 ) )
        for line in fin:
            cols = line.strip().split(',')
            # keep only users that appear in train.csv/test.csv
            # (original author's note claimed "train only", but userIndex
            # actually covers both files)
            if cols[0] in programEntities.userIndex:
                i = programEntities.userIndex[ cols[0] ]
                self.userMatrix[i, 0] = cleaner.getLocaleId( cols[1] )       # locale
                self.userMatrix[i, 1] = cleaner.getBirthYearInt( cols[2] )   # birthyear, 0 when missing
                self.userMatrix[i, 2] = cleaner.getGenderId( cols[3] )       # gender
                self.userMatrix[i, 3] = cleaner.getJoinedYearMonth( cols[4] )# joinedAt as yyyymm
                self.userMatrix[i, 4] = cleaner.getCountryId( cols[5] )      # location
                self.userMatrix[i, 5] = cleaner.getTimezoneInt( cols[6] )    # timezone
        fin.close()
        # column-wise L1 normalization before computing similarities
        self.userMatrix = normalize(self.userMatrix, norm='l1', axis=0, copy=False)
        sio.mmwrite('US_userMatrix', self.userMatrix)
        # pairwise similarity, computed only for users that co-occur on an event
        self.userSimMatrix = ss.dok_matrix( (nusers, nusers) )  # (3391, 3391)
        for i in range(0, nusers):
            self.userSimMatrix[i, i] = 1.0  # a user is fully similar to itself
        for u1, u2 in programEntities.uniqueUserPairs:
            i = programEntities.userIndex[u1]
            j = programEntities.userIndex[u2]
            if (i, j) not in self.userSimMatrix:
                # sim() receives two dense 1 x nfeatures row vectors
                usim = sim(self.userMatrix.getrow(i).todense(),self.userMatrix.getrow(j).todense())
                self.userSimMatrix[i, j] = usim  # stored symmetrically
                self.userSimMatrix[j, i] = usim
        sio.mmwrite('US_userSimMatrix', self.userSimMatrix)

print('第1步:统计user和event相关信息...')
# Step 1: collect user/event statistics (writes the PE_* artifacts).
pe = ProgramEntities()
print('第1步完成...\n')

# Step 2: build and store the user/user similarity matrix (writes US_*).
print('第2步:计算用户相似度信息,并用矩阵形式存储...')
Users(pe)
print('第2步完成...\n')
第1步:统计user和event相关信息...
第1步完成...第2步:计算用户相似度信息,并用矩阵形式存储...
第2步完成...
第三步:用户社交关系信息处理
这一步需要user_friends.csv.gz文件,我们先来看看文件内容:
import pandas as pd
# Peek at the social-graph file: one row per user, with friends stored as a
# single space-separated id list.
df_user_friends = pd.read_csv('user_friends.csv.gz', compression='gzip')
df_user_friends.head()
user | friends | |
---|---|---|
0 | 3197468391 | 1346449342 3873244116 4226080662 1222907620 54... |
1 | 3537982273 | 1491560444 395798035 2036380346 899375619 3534... |
2 | 823183725 | 1484954627 1950387873 1652977611 4185960823 42... |
3 | 1872223848 | 83361640 723814682 557944478 1724049724 253059... |
4 | 3429017717 | 4253303705 2130310957 1838389374 3928735761 71... |
- 1)如果你有更多的朋友,可能你性格外向,更容易参加各种活动
- 2)如果你朋友会参加某个活动,可能你也会跟随去参加一下
# 第三步完整代码
from collections import defaultdict
import locale, pycountry
import scipy.sparse as ss
import scipy.io as sio
import itertools
#import cPickle
#From Python 3, cPickle has been replaced by _pickle
import _pickle as cPickleimport scipy.spatial.distance as ssd
import datetime
from sklearn.preprocessing import normalizeimport gzip
import numpy as np#处理user和event关联数据
class ProgramEntities:
    """Collect the users and events that actually appear in train.csv/test.csv.

    Only those entities are processed downstream (per the author's count:
    3391 users and 13418 events).  Side effects: writes PE_userEventScores.mtx,
    PE_userIndex.pkl and PE_eventIndex.pkl to the working directory.
    """
    def __init__(self):
        uniqueUsers = set()                 # all user ids seen in either file
        uniqueEvents = set()                # all event ids seen in either file
        eventsForUser = defaultdict(set)    # user id -> set of event ids acted on
        usersForEvent = defaultdict(set)    # event id -> set of user ids acting on it
        for filename in ['train.csv', 'test.csv']:
            f = open(filename)
            f.readline()  # skip the header row
            for line in f:
                cols = line.strip().split(',')
                uniqueUsers.add( cols[0] )
                uniqueEvents.add( cols[1] )
                eventsForUser[cols[0]].add( cols[1] )
                usersForEvent[cols[1]].add( cols[0] )
            f.close()
        # user x event preference matrix; filled from train.csv only
        self.userEventScores = ss.dok_matrix( ( len(uniqueUsers), len(uniqueEvents) ) )
        self.userIndex = dict()   # user id -> row index
        self.eventIndex = dict()  # event id -> column index
        for i, u in enumerate(uniqueUsers):
            self.userIndex[u] = i
        for i, e in enumerate(uniqueEvents):
            self.eventIndex[e] = i
        ftrain = open('train.csv')
        ftrain.readline()  # skip header
        for line in ftrain:
            cols = line.strip().split(',')
            i = self.userIndex[ cols[0] ]
            j = self.eventIndex[ cols[1] ]
            # cols[4] is 'interested', cols[5] is 'not_interested': +1, 0 or -1
            self.userEventScores[i, j] = int( cols[4] ) - int( cols[5] )
        ftrain.close()
        sio.mmwrite('PE_userEventScores', self.userEventScores)
        # Pre-compute the only pairs worth comparing later: user pairs that
        # co-occur on some event, event pairs that share some user.
        self.uniqueUserPairs = set()
        self.uniqueEventPairs = set()
        for event in uniqueEvents:
            users = usersForEvent[event]
            # NOTE(review): '> 2' skips events with exactly two users, so that
            # pair is never generated; '>= 2' may have been intended — confirm.
            if len(users) > 2:
                self.uniqueUserPairs.update( itertools.combinations(users, 2) )
        for user in uniqueUsers:
            events = eventsForUser[user]
            if len(events) > 2:
                self.uniqueEventPairs.update( itertools.combinations(events, 2) )
        cPickle.dump( self.userIndex, open('PE_userIndex.pkl', 'wb'))
        cPickle.dump( self.eventIndex, open('PE_eventIndex.pkl', 'wb') )
class DataCleaner:
    """Convert raw string columns from users.csv into numeric codes.

    Unknown/missing values consistently map to 0.  Construction requires
    the `locale` and `pycountry` modules.
    """

    def __init__(self):
        # locale alias -> small positive id (0 is reserved for "unknown")
        self.localeIdMap = defaultdict(int)
        for idx, alias in enumerate(locale.locale_alias.keys()):
            self.localeIdMap[alias] = idx + 1
        # country name -> id, with US/Canada subdivisions folded onto their
        # country's id
        self.countryIdMap = defaultdict(int)
        ctryIdx = defaultdict(int)
        for idx, country in enumerate(pycountry.countries):
            name = country.name.lower()
            self.countryIdMap[name] = idx + 1
            if name == 'usa':
                ctryIdx['US'] = idx
            if name == 'canada':
                ctryIdx['CA'] = idx
        for code in ctryIdx.keys():
            for sub in pycountry.subdivisions.get(country_code=code):
                self.countryIdMap[sub.name.lower()] = ctryIdx[code] + 1
        # gender -> id; anything else falls back to 0
        self.genderIdMap = defaultdict(int, {'male': 1, 'female': 2})

    def getLocaleId(self, locstr):
        """Locale string -> id; unseen locales yield the defaultdict's 0."""
        return self.localeIdMap[locstr.lower()]

    def getBirthYearInt(self, birthYear):
        """Birth year -> int; the literal 'None' and unparsable input give 0."""
        try:
            if birthYear == 'None':
                return 0
            return int(birthYear)
        except:
            return 0

    def getGenderId(self, genderStr):
        """male -> 1, female -> 2, anything else -> 0."""
        return self.genderIdMap[genderStr]

    def getJoinedYearMonth(self, dateString):
        """ISO timestamp -> year+month digits (month not zero-padded)."""
        parsed = datetime.datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%S.%fZ")
        return str(parsed.year) + str(parsed.month)

    def getCountryId(self, location):
        """Location string -> country id via its trailing token; 0 when absent."""
        usable = isinstance(location, str) and len(location.strip()) > 0 and location.rfind(' ') > -1
        if not usable:
            return 0
        # NOTE(review): the '+ 2' offset drops the first character after the
        # last space; it looks aimed at a double-space separator — confirm.
        return self.countryIdMap[location[location.rindex(' ') + 2:].lower()]

    def getTimezoneInt(self, timezone):
        """Timezone offset string -> int; 0 when unparsable."""
        try:
            return int(timezone)
        except:
            return 0
class Users:
    """Build the user/user similarity matrix from users.csv.

    Rows follow programEntities.userIndex; `sim` is any SciPy-style distance
    callable taking two dense row vectors (default: ssd.correlation).
    Side effects: writes US_userMatrix.mtx and US_userSimMatrix.mtx.
    """
    def __init__(self, programEntities, sim=ssd.correlation):
        cleaner = DataCleaner()
        nusers = len(programEntities.userIndex.keys())  # 3391 per the author
        fin = open('users.csv')
        colnames = fin.readline().strip().split(',')  # 7 columns in users.csv
        # one feature column per users.csv column, minus the user-id column
        self.userMatrix = ss.dok_matrix( (nusers, len(colnames)-1 ) )
        for line in fin:
            cols = line.strip().split(',')
            # keep only users that appear in train.csv/test.csv
            if cols[0] in programEntities.userIndex:
                i = programEntities.userIndex[ cols[0] ]
                self.userMatrix[i, 0] = cleaner.getLocaleId( cols[1] )       # locale
                self.userMatrix[i, 1] = cleaner.getBirthYearInt( cols[2] )   # birthyear, 0 when missing
                self.userMatrix[i, 2] = cleaner.getGenderId( cols[3] )       # gender
                self.userMatrix[i, 3] = cleaner.getJoinedYearMonth( cols[4] )# joinedAt as yyyymm
                self.userMatrix[i, 4] = cleaner.getCountryId( cols[5] )      # location
                self.userMatrix[i, 5] = cleaner.getTimezoneInt( cols[6] )    # timezone
        fin.close()
        # column-wise L1 normalization before computing similarities
        self.userMatrix = normalize(self.userMatrix, norm='l1', axis=0, copy=False)
        sio.mmwrite('US_userMatrix', self.userMatrix)
        # pairwise similarity, computed only for users that co-occur on an event
        self.userSimMatrix = ss.dok_matrix( (nusers, nusers) )  # (3391, 3391)
        for i in range(0, nusers):
            self.userSimMatrix[i, i] = 1.0  # a user is fully similar to itself
        for u1, u2 in programEntities.uniqueUserPairs:
            i = programEntities.userIndex[u1]
            j = programEntities.userIndex[u2]
            if (i, j) not in self.userSimMatrix:
                # sim() receives two dense 1 x nfeatures row vectors
                usim = sim(self.userMatrix.getrow(i).todense(),self.userMatrix.getrow(j).todense())
                self.userSimMatrix[i, j] = usim  # stored symmetrically
                self.userSimMatrix[j, i] = usim
        sio.mmwrite('US_userSimMatrix', self.userSimMatrix)
class UserFriends:
    """Mine the social graph in user_friends.csv.gz.

    Intuition: (1) users with more friends may be more social and attend more
    events; (2) a user may follow friends to events.  Produces:
      - numFriends: each user's friend count, normalized by the global total
        (e.g. the author's example: 2151 / 3731377 = 0.00057646...)
      - userFriends: user x friend matrix holding the friend's mean train-set
        event score, L1-normalized per column
    Side effects: writes UF_numFriends.mtx and UF_userFriends.mtx.
    """
    def __init__(self, programEntities):
        nusers = len(programEntities.userIndex.keys())  # 3391 users
        self.numFriends = np.zeros( (nusers) )  # friend count per user
        self.userFriends = ss.dok_matrix( (nusers, nusers) )  # friend score accumulator
        fin = gzip.open('user_friends.csv.gz')
        print( 'Header In User_friends.csv.gz:',fin.readline() )
        ln = 0  # data-line counter, used only for progress reporting
        for line in fin:
            if ln % 200 == 0:
                print( 'Loading line:', ln )
            cols = line.decode().strip().split(',')
            user = cols[0]
            # only users appearing in train.csv/test.csv matter
            if user in programEntities.userIndex:
                friends = cols[1].split(' ')  # space-separated friend id list
                i = programEntities.userIndex[user]
                self.numFriends[i] = len(friends)
                for friend in friends:
                    if friend in programEntities.userIndex:
                        j = programEntities.userIndex[friend]
                        # the friend's score row over all events (0, 1 or -1
                        # per event); its mean over the 13418 events estimates
                        # how, and in which direction, this friend could
                        # influence the user's decision
                        eventsForUser = programEntities.userEventScores.getrow(j).todense()
                        score = eventsForUser.sum() / np.shape(eventsForUser)[1]
                        self.userFriends[i, j] += score
                        self.userFriends[j, i] += score
            ln += 1
        fin.close()
        # normalize friend counts by the global number of friendships
        sumNumFriends = self.numFriends.sum(axis=0)
        print(sumNumFriends)
        self.numFriends = self.numFriends / sumNumFriends
        sio.mmwrite('UF_numFriends', np.matrix(self.numFriends) )
        self.userFriends = normalize(self.userFriends, norm='l1', axis=0, copy=False)
        sio.mmwrite('UF_userFriends', self.userFriends)

print('第1步:统计user和event相关信息...')
# Step 1: collect user/event statistics (writes the PE_* artifacts).
pe = ProgramEntities()
print('第1步完成...\n')

# Step 2 is commented out here: the similarity matrix was already written
# by the earlier cell.
print('第2步:计算用户相似度信息,并用矩阵形式存储...')
#Users(pe)
print('第2步完成...\n')

# Step 3: mine the social graph (writes the UF_* artifacts).
print('第3步:计算用户社交关系信息,并存储...')
UserFriends(pe)
print('第3步完成...\n')
第1步:统计user和event相关信息...
第1步完成...第2步:计算用户相似度信息,并用矩阵形式存储...
第2步完成...第3步:计算用户社交关系信息,并存储...
Header In User_friends.csv.gz: b'user,friends\n'
Loading line: 0
Loading line: 200
Loading line: 400
Loading line: 600
Loading line: 800
Loading line: 1000
Loading line: 1200
Loading line: 1400
Loading line: 1600
Loading line: 1800
Loading line: 2000
Loading line: 2200
Loading line: 2400
Loading line: 2600
Loading line: 2800
Loading line: 3000
Loading line: 3200
Loading line: 3400
Loading line: 3600
Loading line: 3800
Loading line: 4000
Loading line: 4200
Loading line: 4400
Loading line: 4600
Loading line: 4800
Loading line: 5000
Loading line: 5200
Loading line: 5400
Loading line: 5600
Loading line: 5800
Loading line: 6000
Loading line: 6200
Loading line: 6400
Loading line: 6600
Loading line: 6800
Loading line: 7000
Loading line: 7200
Loading line: 7400
Loading line: 7600
Loading line: 7800
Loading line: 8000
Loading line: 8200
Loading line: 8400
Loading line: 8600
Loading line: 8800
Loading line: 9000
Loading line: 9200
Loading line: 9400
Loading line: 9600
Loading line: 9800
Loading line: 10000
Loading line: 10200
Loading line: 10400
Loading line: 10600
Loading line: 10800
Loading line: 11000
Loading line: 11200
Loading line: 11400
Loading line: 11600
Loading line: 11800
Loading line: 12000
Loading line: 12200
Loading line: 12400
Loading line: 12600
Loading line: 12800
Loading line: 13000
Loading line: 13200
Loading line: 13400
Loading line: 13600
Loading line: 13800
Loading line: 14000
Loading line: 14200
Loading line: 14400
Loading line: 14600
Loading line: 14800
Loading line: 15000
Loading line: 15200
Loading line: 15400
Loading line: 15600
Loading line: 15800
Loading line: 16000
Loading line: 16200
Loading line: 16400
Loading line: 16600
Loading line: 16800
Loading line: 17000
Loading line: 17200
Loading line: 17400
Loading line: 17600
Loading line: 17800
Loading line: 18000
Loading line: 18200
Loading line: 18400
Loading line: 18600
Loading line: 18800
Loading line: 19000
Loading line: 19200
Loading line: 19400
Loading line: 19600
Loading line: 19800
Loading line: 20000
Loading line: 20200
Loading line: 20400
Loading line: 20600
Loading line: 20800
Loading line: 21000
Loading line: 21200
Loading line: 21400
Loading line: 21600
Loading line: 21800
Loading line: 22000
Loading line: 22200
Loading line: 22400
Loading line: 22600
Loading line: 22800
Loading line: 23000
Loading line: 23200
Loading line: 23400
Loading line: 23600
Loading line: 23800
Loading line: 24000
Loading line: 24200
Loading line: 24400
Loading line: 24600
Loading line: 24800
Loading line: 25000
Loading line: 25200
Loading line: 25400
Loading line: 25600
Loading line: 25800
Loading line: 26000
Loading line: 26200
Loading line: 26400
Loading line: 26600
Loading line: 26800
Loading line: 27000
Loading line: 27200
Loading line: 27400
Loading line: 27600
Loading line: 27800
Loading line: 28000
Loading line: 28200
Loading line: 28400
Loading line: 28600
Loading line: 28800
Loading line: 29000
Loading line: 29200
Loading line: 29400
Loading line: 29600
Loading line: 29800
Loading line: 30000
Loading line: 30200
Loading line: 30400
Loading line: 30600
Loading line: 30800
Loading line: 31000
Loading line: 31200
Loading line: 31400
Loading line: 31600
Loading line: 31800
Loading line: 32000
Loading line: 32200
Loading line: 32400
Loading line: 32600
Loading line: 32800
Loading line: 33000
Loading line: 33200
Loading line: 33400
Loading line: 33600
Loading line: 33800
Loading line: 34000
Loading line: 34200
Loading line: 34400
Loading line: 34600
Loading line: 34800
Loading line: 35000
Loading line: 35200
Loading line: 35400
Loading line: 35600
Loading line: 35800
Loading line: 36000
Loading line: 36200
Loading line: 36400
Loading line: 36600
Loading line: 36800
Loading line: 37000
Loading line: 37200
Loading line: 37400
Loading line: 37600
Loading line: 37800
Loading line: 38000
Loading line: 38200
3731377.0
第3步完成...
第四步:构建event和event相似度数据
我们先看看events.csv.gz:
import pandas as pd
# Peek at the event metadata: id, creator, start_time and location fields,
# then c_1..c_100 word counts plus c_other (110 columns total).
df_events_csv = pd.read_csv('events.csv.gz', compression='gzip')
df_events_csv.head()
event_id | user_id | start_time | city | state | zip | country | lat | lng | c_1 | ... | c_92 | c_93 | c_94 | c_95 | c_96 | c_97 | c_98 | c_99 | c_100 | c_other | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 684921758 | 3647864012 | 2012-10-31T00:00:00.001Z | NaN | NaN | NaN | NaN | NaN | NaN | 2 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 |
1 | 244999119 | 3476440521 | 2012-11-03T00:00:00.001Z | NaN | NaN | NaN | NaN | NaN | NaN | 2 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 |
2 | 3928440935 | 517514445 | 2012-11-05T00:00:00.001Z | NaN | NaN | NaN | NaN | NaN | NaN | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 |
3 | 2582345152 | 781585781 | 2012-10-30T00:00:00.001Z | NaN | NaN | NaN | NaN | NaN | NaN | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 |
4 | 1051165850 | 1016098580 | 2012-09-27T00:00:00.001Z | NaN | NaN | NaN | NaN | NaN | NaN | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 |
5 rows × 110 columns
对上面的信息进行数值转换
1.start_time列的信息使用 datetime库进行处理
2.city,state,zip,country列处理都利用了hashlib包:注意这里处理event信息的时候,只有那些出现在train.csv和test.csv中的event才会进入数值转换程序
import hashlib
def FeatureHash(value):
    """Hash arbitrary text to an int in [0, 65535] via the first four hex
    digits of its SHA-224 digest; blank/whitespace-only input maps to -1."""
    if not value.strip():
        return -1
    digest = hashlib.sha224(value.encode('utf-8')).hexdigest()
    return int(digest[:4], 16)

print(FeatureHash('Muaraenim'))  # 47294
print(FeatureHash('a test demo'))  # 4030
47294
4030
3.lat和lon列处理
空值用0.0填充,其他转换为自身的float型
def getFloatValue(self, value):
    """Parse value as a float; blank/whitespace-only strings map to 0.0."""
    if value.strip():
        return float(value)
    return 0.0
4.c_1之后列(也就是第10列之后)处理
- 这里用了一个矩阵eventContMatrix来保存c_1到c_100列信息,但是没有用的c_other列
5.将eventPropMatrix和eventContMatrix矩阵归一化后进行文件保存
6.使用uniqueEventPairs来计算event pairs相似度
- 利用了scipy.spatial.distance的correlation和cosine方法
## 第四步完整代码
from collections import defaultdict
import locale, pycountry
import scipy.sparse as ss
import scipy.io as sio
import itertools
#import cPickle
#From Python 3, cPickle has been replaced by _pickle
import _pickle as cPickleimport scipy.spatial.distance as ssd
import datetime
from sklearn.preprocessing import normalizeimport gzip
import numpy as npimport hashlib#处理user和event关联数据
class ProgramEntities:
    """Collect the users and events that actually appear in train.csv/test.csv.

    Only those entities are processed downstream (per the author's count:
    3391 users and 13418 events).  Side effects: writes PE_userEventScores.mtx,
    PE_userIndex.pkl and PE_eventIndex.pkl to the working directory.
    """
    def __init__(self):
        uniqueUsers = set()                 # all user ids seen in either file
        uniqueEvents = set()                # all event ids seen in either file
        eventsForUser = defaultdict(set)    # user id -> set of event ids acted on
        usersForEvent = defaultdict(set)    # event id -> set of user ids acting on it
        for filename in ['train.csv', 'test.csv']:
            f = open(filename)
            f.readline()  # skip the header row
            for line in f:
                cols = line.strip().split(',')
                uniqueUsers.add( cols[0] )
                uniqueEvents.add( cols[1] )
                eventsForUser[cols[0]].add( cols[1] )
                usersForEvent[cols[1]].add( cols[0] )
            f.close()
        # user x event preference matrix; filled from train.csv only
        self.userEventScores = ss.dok_matrix( ( len(uniqueUsers), len(uniqueEvents) ) )
        self.userIndex = dict()   # user id -> row index
        self.eventIndex = dict()  # event id -> column index
        for i, u in enumerate(uniqueUsers):
            self.userIndex[u] = i
        for i, e in enumerate(uniqueEvents):
            self.eventIndex[e] = i
        ftrain = open('train.csv')
        ftrain.readline()  # skip header
        for line in ftrain:
            cols = line.strip().split(',')
            i = self.userIndex[ cols[0] ]
            j = self.eventIndex[ cols[1] ]
            # cols[4] is 'interested', cols[5] is 'not_interested': +1, 0 or -1
            self.userEventScores[i, j] = int( cols[4] ) - int( cols[5] )
        ftrain.close()
        sio.mmwrite('PE_userEventScores', self.userEventScores)
        # Pre-compute the only pairs worth comparing later: user pairs that
        # co-occur on some event, event pairs that share some user.
        self.uniqueUserPairs = set()
        self.uniqueEventPairs = set()
        for event in uniqueEvents:
            users = usersForEvent[event]
            # NOTE(review): '> 2' skips events with exactly two users, so that
            # pair is never generated; '>= 2' may have been intended — confirm.
            if len(users) > 2:
                self.uniqueUserPairs.update( itertools.combinations(users, 2) )
        for user in uniqueUsers:
            events = eventsForUser[user]
            if len(events) > 2:
                self.uniqueEventPairs.update( itertools.combinations(events, 2) )
        cPickle.dump( self.userIndex, open('PE_userIndex.pkl', 'wb'))
        cPickle.dump( self.eventIndex, open('PE_eventIndex.pkl', 'wb') )
class DataCleaner:
    """Map raw string fields (locale, birth year, gender, dates, location,
    timezone, free text, floats) to numeric feature values.

    Unknown values consistently map to 0 (or -1 for getFeatureHash).
    Requires the `locale` and `pycountry` modules at construction time.
    """
    def __init__(self):
        # locale alias -> small positive id (0 reserved for unknown)
        self.localeIdMap = defaultdict(int)
        for i, l in enumerate(locale.locale_alias.keys()):
            self.localeIdMap[l] = i + 1
        # country name -> id; US/Canada subdivisions share their country's id
        self.countryIdMap = defaultdict(int)
        ctryIdx = defaultdict(int)
        for i, c in enumerate(pycountry.countries):
            self.countryIdMap[c.name.lower()] = i + 1
            # NOTE(review): pycountry names the country 'United States', so the
            # 'usa' comparison may never match — confirm against installed data.
            if c.name.lower() == 'usa':
                ctryIdx['US'] = i
            if c.name.lower() == 'canada':
                ctryIdx['CA'] = i
        for cc in ctryIdx.keys():
            for s in pycountry.subdivisions.get(country_code=cc):
                self.countryIdMap[s.name.lower()] = ctryIdx[cc] + 1
        # gender -> id; anything else falls back to 0
        self.genderIdMap = defaultdict(int, {'male':1, 'female':2})
    def getLocaleId(self, locstr):
        # defaultdict(int) returns 0 for unseen locales
        return self.localeIdMap[ locstr.lower() ]
    def getBirthYearInt(self, birthYear):
        # the literal 'None' and unparsable values become 0
        try:
            return 0 if birthYear == 'None' else int(birthYear)
        except:
            return 0
    def getGenderId(self, genderStr):
        # male -> 1, female -> 2, anything else -> 0
        return self.genderIdMap[genderStr]
    def getJoinedYearMonth(self, dateString):
        # e.g. '2012-10-02T...' -> '201210'.  NOTE(review): single-digit months
        # are not zero-padded ('20121' for Jan 2012), which skews the feature scale.
        dttm = datetime.datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%S.%fZ")
        return "".join( [str(dttm.year), str(dttm.month) ] )
    def getCountryId(self, location):
        # NOTE(review): '+ 2' skips the character right after the last space;
        # it looks aimed at a double-space separator — confirm with the data.
        if (isinstance( location, str)) and len(location.strip()) > 0 and location.rfind(' ') > -1:
            return self.countryIdMap[ location[location.rindex(' ') + 2: ].lower() ]
        else:
            return 0
    def getTimezoneInt(self, timezone):
        # timezone offset string -> int, 0 when unparsable
        try:
            return int(timezone)
        except:
            return 0
    def getFeatureHash(self, value):
        # hash arbitrary text to a 16-bit int; blank input maps to -1
        # (Python 3 requires encoding the string before hashing)
        if len(value.strip()) == 0:
            return -1
        else:
            return int( hashlib.sha224(value.encode('utf-8')).hexdigest()[0:4], 16)
    def getFloatValue(self, value):
        # blank -> 0.0, otherwise plain float conversion
        if len(value.strip()) == 0:
            return 0.0
        else:
            return float(value)
class Users:
    """Build the user/user similarity matrix from users.csv.

    Rows follow programEntities.userIndex; `sim` is any SciPy-style distance
    callable taking two dense row vectors (default: ssd.correlation).
    Side effects: writes US_userMatrix.mtx and US_userSimMatrix.mtx.
    """
    def __init__(self, programEntities, sim=ssd.correlation):
        cleaner = DataCleaner()
        nusers = len(programEntities.userIndex.keys())  # 3391 per the author
        fin = open('users.csv')
        colnames = fin.readline().strip().split(',')  # 7 columns in users.csv
        # one feature column per users.csv column, minus the user-id column
        self.userMatrix = ss.dok_matrix( (nusers, len(colnames)-1 ) )
        for line in fin:
            cols = line.strip().split(',')
            # keep only users that appear in train.csv/test.csv
            if cols[0] in programEntities.userIndex:
                i = programEntities.userIndex[ cols[0] ]
                self.userMatrix[i, 0] = cleaner.getLocaleId( cols[1] )       # locale
                self.userMatrix[i, 1] = cleaner.getBirthYearInt( cols[2] )   # birthyear, 0 when missing
                self.userMatrix[i, 2] = cleaner.getGenderId( cols[3] )       # gender
                self.userMatrix[i, 3] = cleaner.getJoinedYearMonth( cols[4] )# joinedAt as yyyymm
                self.userMatrix[i, 4] = cleaner.getCountryId( cols[5] )      # location
                self.userMatrix[i, 5] = cleaner.getTimezoneInt( cols[6] )    # timezone
        fin.close()
        # column-wise L1 normalization before computing similarities
        self.userMatrix = normalize(self.userMatrix, norm='l1', axis=0, copy=False)
        sio.mmwrite('US_userMatrix', self.userMatrix)
        # pairwise similarity, computed only for users that co-occur on an event
        self.userSimMatrix = ss.dok_matrix( (nusers, nusers) )  # (3391, 3391)
        for i in range(0, nusers):
            self.userSimMatrix[i, i] = 1.0  # a user is fully similar to itself
        for u1, u2 in programEntities.uniqueUserPairs:
            i = programEntities.userIndex[u1]
            j = programEntities.userIndex[u2]
            if (i, j) not in self.userSimMatrix:
                # sim() receives two dense 1 x nfeatures row vectors
                usim = sim(self.userMatrix.getrow(i).todense(),self.userMatrix.getrow(j).todense())
                self.userSimMatrix[i, j] = usim  # stored symmetrically
                self.userSimMatrix[j, i] = usim
        sio.mmwrite('US_userSimMatrix', self.userSimMatrix)
class UserFriends:
    """Mine the social graph in user_friends.csv.gz.

    Intuition: (1) users with more friends may be more social and attend more
    events; (2) a user may follow friends to events.  Produces:
      - numFriends: each user's friend count, normalized by the global total
        (author's example: 2151 / 3731377 = 0.00057646...)
      - userFriends: user x friend matrix holding the friend's mean train-set
        event score, L1-normalized per column
    Side effects: writes UF_numFriends.mtx and UF_userFriends.mtx.
    """
    def __init__(self, programEntities):
        nusers = len(programEntities.userIndex.keys())  # 3391 users
        self.numFriends = np.zeros( (nusers) )  # friend count per user
        self.userFriends = ss.dok_matrix( (nusers, nusers) )  # friend score accumulator
        fin = gzip.open('user_friends.csv.gz')
        print( 'Header In User_friends.csv.gz:',fin.readline() )
        ln = 0  # data-line counter, used only for progress reporting
        for line in fin:
            if ln % 200 == 0:
                print( 'Loading line:', ln )
            cols = line.decode().strip().split(',')
            user = cols[0]
            # only users appearing in train.csv/test.csv matter
            if user in programEntities.userIndex:
                friends = cols[1].split(' ')  # space-separated friend id list
                i = programEntities.userIndex[user]
                self.numFriends[i] = len(friends)
                for friend in friends:
                    if friend in programEntities.userIndex:
                        j = programEntities.userIndex[friend]
                        # the friend's score row over all events (0, 1 or -1
                        # per event); its mean over the 13418 events estimates
                        # how, and in which direction, this friend could
                        # influence the user's decision
                        eventsForUser = programEntities.userEventScores.getrow(j).todense()
                        score = eventsForUser.sum() / np.shape(eventsForUser)[1]
                        self.userFriends[i, j] += score
                        self.userFriends[j, i] += score
            ln += 1
        fin.close()
        # normalize friend counts by the global number of friendships
        sumNumFriends = self.numFriends.sum(axis=0)
        self.numFriends = self.numFriends / sumNumFriends
        sio.mmwrite('UF_numFriends', np.matrix(self.numFriends) )
        self.userFriends = normalize(self.userFriends, norm='l1', axis=0, copy=False)
        sio.mmwrite('UF_userFriends', self.userFriends)
class Events:
    """Build two event/event similarity matrices.

    1) property similarity via `psim` (default ssd.correlation), from the
       event metadata columns (start_time, city, state, zip, country, lat, lng)
    2) content similarity via `csim` (default ssd.cosine), from the
       c_1..c_100 word-count columns (c_other, column 109, is dropped)

    Only events present in programEntities.eventIndex are converted, and only
    pairs in programEntities.uniqueEventPairs are compared.
    Side effects: writes EV_eventPropMatrix.mtx, EV_eventContMatrix.mtx,
    EV_eventPropSim.mtx and EV_eventContSim.mtx.
    """
    def __init__(self, programEntities, psim=ssd.correlation, csim=ssd.cosine):
        cleaner = DataCleaner()
        nevents = len(programEntities.eventIndex)  # 13418 per the author
        print(nevents)
        self.eventPropMatrix = ss.dok_matrix((nevents, 7))    # metadata features
        self.eventContMatrix = ss.dok_matrix((nevents, 100))  # c_1..c_100 counts
        # FIX: context manager so the gzip handle is closed on any error
        with gzip.open('events.csv.gz') as fin:
            fin.readline()  # skip header
            for line in fin:
                cols = line.decode().strip().split(',')
                eventId = cols[0]
                if eventId in programEntities.eventIndex:
                    i = programEntities.eventIndex[eventId]
                    # FIX: cast every value explicitly before storing it in the
                    # sparse matrix — assigning raw strings relied on implicit
                    # conversion and fails on newer scipy versions.
                    self.eventPropMatrix[i, 0] = float(cleaner.getJoinedYearMonth(cols[2]))  # start_time
                    self.eventPropMatrix[i, 1] = cleaner.getFeatureHash(cols[3])  # city
                    self.eventPropMatrix[i, 2] = cleaner.getFeatureHash(cols[4])  # state
                    self.eventPropMatrix[i, 3] = cleaner.getFeatureHash(cols[5])  # zip
                    self.eventPropMatrix[i, 4] = cleaner.getFeatureHash(cols[6])  # country
                    self.eventPropMatrix[i, 5] = cleaner.getFloatValue(cols[7])   # lat
                    self.eventPropMatrix[i, 6] = cleaner.getFloatValue(cols[8])   # lng
                    # columns 9..108 are c_1..c_100; c_other (109) is dropped
                    for j in range(9, 109):
                        self.eventContMatrix[i, j - 9] = float(cols[j])
        # column-wise L1 normalization of both feature matrices
        self.eventPropMatrix = normalize(self.eventPropMatrix, norm='l1', axis=0, copy=False)
        sio.mmwrite('EV_eventPropMatrix', self.eventPropMatrix)
        self.eventContMatrix = normalize(self.eventContMatrix, norm='l1', axis=0, copy=False)
        sio.mmwrite('EV_eventContMatrix', self.eventContMatrix)
        # pairwise similarities, only for event pairs sharing at least one user
        self.eventPropSim = ss.dok_matrix((nevents, nevents))
        self.eventContSim = ss.dok_matrix((nevents, nevents))
        for e1, e2 in programEntities.uniqueEventPairs:
            i = programEntities.eventIndex[e1]
            j = programEntities.eventIndex[e2]
            if (i, j) not in self.eventPropSim:
                epsim = psim(self.eventPropMatrix.getrow(i).todense(),
                             self.eventPropMatrix.getrow(j).todense())
                self.eventPropSim[i, j] = epsim  # stored symmetrically
                self.eventPropSim[j, i] = epsim
            if (i, j) not in self.eventContSim:
                ecsim = csim(self.eventContMatrix.getrow(i).todense(),
                             self.eventContMatrix.getrow(j).todense())
                self.eventContSim[i, j] = ecsim  # stored symmetrically
                self.eventContSim[j, i] = ecsim
        sio.mmwrite('EV_eventPropSim', self.eventPropSim)
        sio.mmwrite('EV_eventContSim', self.eventContSim)

print('第1步:统计user和event相关信息...')
# Step 1: collect user/event statistics (writes the PE_* artifacts).
pe = ProgramEntities()
print('第1步完成...\n')

# Step 2 is commented out here: the similarity matrix was already written
# by the earlier cell.
print('第2步:计算用户相似度信息,并用矩阵形式存储...')
#Users(pe)
print('第2步完成...\n')

# Step 3: mine the social graph (writes the UF_* artifacts).
print('第3步:计算用户社交关系信息,并存储...')
UserFriends(pe)
print('第3步完成...\n')

# Step 4: build event/event similarity matrices (writes the EV_* artifacts).
print('第4步:计算event相似度信息,并用矩阵形式存储...')
Events(pe)
print('第4步完成...\n')
第五步:活跃度/event热度数据
由于用到event_attendees.csv.gz文件,我们先看看该文件
import pandas as pd
# Peek at attendance: per event, space-separated user id lists for the
# yes / maybe / invited / no responses.
df_events_attendees = pd.read_csv('event_attendees.csv.gz', compression='gzip')
df_events_attendees.head()
event | yes | maybe | invited | no | |
---|---|---|---|---|---|
0 | 1159822043 | 1975964455 252302513 4226086795 3805886383 142... | 2733420590 517546982 1350834692 532087573 5831... | 1723091036 3795873583 4109144917 3560622906 31... | 3575574655 1077296663 |
1 | 686467261 | 2394228942 2686116898 1056558062 3792942231 41... | 1498184352 645689144 3770076778 331335845 4239... | 1788073374 733302094 1830571649 676508092 7081... | NaN |
2 | 1186208412 | NaN | 3320380166 3810793697 | 1379121209 440668682 | 1728988561 2950720854 |
3 | 2621578336 | NaN | NaN | NaN | NaN |
4 | 855842686 | 2406118796 3550897984 294255260 1125817077 109... | 2671721559 1761448345 2356975806 2666669465 10... | 1518670705 880919237 2326414227 2673818347 332... | 3500235232 |
## 第五步全部代码
from collections import defaultdict
import locale, pycountry
import scipy.sparse as ss
import scipy.io as sio
import itertools
#import cPickle
#From Python 3, cPickle has been replaced by _pickle
import _pickle as cPickleimport scipy.spatial.distance as ssd
import datetime
from sklearn.preprocessing import normalizeimport gzip
import numpy as npimport hashlib#处理user和event关联数据
class ProgramEntities:
    """Collect the users and events that actually appear in train.csv/test.csv.

    Only those entities are processed downstream (per the author's count:
    3391 users and 13418 events).  Side effects: writes PE_userEventScores.mtx,
    PE_userIndex.pkl and PE_eventIndex.pkl to the working directory.
    """
    def __init__(self):
        uniqueUsers = set()                 # all user ids seen in either file
        uniqueEvents = set()                # all event ids seen in either file
        eventsForUser = defaultdict(set)    # user id -> set of event ids acted on
        usersForEvent = defaultdict(set)    # event id -> set of user ids acting on it
        for filename in ['train.csv', 'test.csv']:
            f = open(filename)
            f.readline()  # skip the header row
            for line in f:
                cols = line.strip().split(',')
                uniqueUsers.add( cols[0] )
                uniqueEvents.add( cols[1] )
                eventsForUser[cols[0]].add( cols[1] )
                usersForEvent[cols[1]].add( cols[0] )
            f.close()
        # user x event preference matrix; filled from train.csv only
        self.userEventScores = ss.dok_matrix( ( len(uniqueUsers), len(uniqueEvents) ) )
        self.userIndex = dict()   # user id -> row index
        self.eventIndex = dict()  # event id -> column index
        for i, u in enumerate(uniqueUsers):
            self.userIndex[u] = i
        for i, e in enumerate(uniqueEvents):
            self.eventIndex[e] = i
        ftrain = open('train.csv')
        ftrain.readline()  # skip header
        for line in ftrain:
            cols = line.strip().split(',')
            i = self.userIndex[ cols[0] ]
            j = self.eventIndex[ cols[1] ]
            # cols[4] is 'interested', cols[5] is 'not_interested': +1, 0 or -1
            self.userEventScores[i, j] = int( cols[4] ) - int( cols[5] )
        ftrain.close()
        sio.mmwrite('PE_userEventScores', self.userEventScores)
        # Pre-compute the only pairs worth comparing later: user pairs that
        # co-occur on some event, event pairs that share some user.
        self.uniqueUserPairs = set()
        self.uniqueEventPairs = set()
        for event in uniqueEvents:
            users = usersForEvent[event]
            # NOTE(review): '> 2' skips events with exactly two users, so that
            # pair is never generated; '>= 2' may have been intended — confirm.
            if len(users) > 2:
                self.uniqueUserPairs.update( itertools.combinations(users, 2) )
        for user in uniqueUsers:
            events = eventsForUser[user]
            if len(events) > 2:
                self.uniqueEventPairs.update( itertools.combinations(events, 2) )
        cPickle.dump( self.userIndex, open('PE_userIndex.pkl', 'wb'))
        cPickle.dump( self.eventIndex, open('PE_eventIndex.pkl', 'wb') )
class DataCleaner:
    """Map raw string fields (locale, birth year, gender, dates, location,
    timezone, free text, floats) to numeric feature values.

    Unknown values consistently map to 0 (or -1 for getFeatureHash).
    Requires the `locale` and `pycountry` modules at construction time.
    """
    def __init__(self):
        # locale alias -> small positive id (0 reserved for unknown)
        self.localeIdMap = defaultdict(int)
        for i, l in enumerate(locale.locale_alias.keys()):
            self.localeIdMap[l] = i + 1
        # country name -> id; US/Canada subdivisions share their country's id
        self.countryIdMap = defaultdict(int)
        ctryIdx = defaultdict(int)
        for i, c in enumerate(pycountry.countries):
            self.countryIdMap[c.name.lower()] = i + 1
            # NOTE(review): pycountry names the country 'United States', so the
            # 'usa' comparison may never match — confirm against installed data.
            if c.name.lower() == 'usa':
                ctryIdx['US'] = i
            if c.name.lower() == 'canada':
                ctryIdx['CA'] = i
        for cc in ctryIdx.keys():
            for s in pycountry.subdivisions.get(country_code=cc):
                self.countryIdMap[s.name.lower()] = ctryIdx[cc] + 1
        # gender -> id; anything else falls back to 0
        self.genderIdMap = defaultdict(int, {'male':1, 'female':2})
    def getLocaleId(self, locstr):
        # defaultdict(int) returns 0 for unseen locales
        return self.localeIdMap[ locstr.lower() ]
    def getBirthYearInt(self, birthYear):
        # the literal 'None' and unparsable values become 0
        try:
            return 0 if birthYear == 'None' else int(birthYear)
        except:
            return 0
    def getGenderId(self, genderStr):
        # male -> 1, female -> 2, anything else -> 0
        return self.genderIdMap[genderStr]
    def getJoinedYearMonth(self, dateString):
        # e.g. '2012-10-02T...' -> '201210'.  NOTE(review): single-digit months
        # are not zero-padded ('20121' for Jan 2012), which skews the feature scale.
        dttm = datetime.datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%S.%fZ")
        return "".join( [str(dttm.year), str(dttm.month) ] )
    def getCountryId(self, location):
        # NOTE(review): '+ 2' skips the character right after the last space;
        # it looks aimed at a double-space separator — confirm with the data.
        if (isinstance( location, str)) and len(location.strip()) > 0 and location.rfind(' ') > -1:
            return self.countryIdMap[ location[location.rindex(' ') + 2: ].lower() ]
        else:
            return 0
    def getTimezoneInt(self, timezone):
        # timezone offset string -> int, 0 when unparsable
        try:
            return int(timezone)
        except:
            return 0
    def getFeatureHash(self, value):
        # hash arbitrary text to a 16-bit int; blank input maps to -1
        # (Python 3 requires encoding the string before hashing)
        if len(value.strip()) == 0:
            return -1
        else:
            return int( hashlib.sha224(value.encode('utf-8')).hexdigest()[0:4], 16)
    def getFloatValue(self, value):
        # blank -> 0.0, otherwise plain float conversion
        if len(value.strip()) == 0:
            return 0.0
        else:
            return float(value)
class Users:
    """Build the user/user similarity matrix from users.csv features."""
    def __init__(self, programEntities, sim=ssd.correlation):
        # sim: distance function over two feature rows
        # (default scipy.spatial.distance.correlation).
        cleaner = DataCleaner()
        nusers = len(programEntities.userIndex.keys())  # 3391
        with open('users.csv') as fin:
            colnames = fin.readline().strip().split(',')  # 7 columns incl. id
            # Sparse user x feature matrix (id column excluded).
            self.userMatrix = ss.dok_matrix((nusers, len(colnames) - 1))
            for line in fin:
                cols = line.strip().split(',')
                # userIndex contains users from both train and test; rows for
                # users outside it are skipped.
                if cols[0] in programEntities.userIndex:
                    i = programEntities.userIndex[cols[0]]
                    self.userMatrix[i, 0] = cleaner.getLocaleId(cols[1])         # locale
                    self.userMatrix[i, 1] = cleaner.getBirthYearInt(cols[2])     # birthyear (0 if missing)
                    self.userMatrix[i, 2] = cleaner.getGenderId(cols[3])         # gender
                    self.userMatrix[i, 3] = cleaner.getJoinedYearMonth(cols[4])  # joinedAt
                    self.userMatrix[i, 4] = cleaner.getCountryId(cols[5])        # location
                    self.userMatrix[i, 5] = cleaner.getTimezoneInt(cols[6])      # timezone

        # Column-wise L1 normalization before computing similarities.
        self.userMatrix = normalize(self.userMatrix, norm='l1', axis=0, copy=False)
        sio.mmwrite('US_userMatrix', self.userMatrix)

        # User similarity matrix (3391 x 3391); only associated pairs filled.
        self.userSimMatrix = ss.dok_matrix((nusers, nusers))
        for i in range(0, nusers):
            self.userSimMatrix[i, i] = 1.0
        for u1, u2 in programEntities.uniqueUserPairs:
            i = programEntities.userIndex[u1]
            j = programEntities.userIndex[u2]
            if (i, j) not in self.userSimMatrix:
                usim = sim(self.userMatrix.getrow(i).todense(),
                           self.userMatrix.getrow(j).todense())
                # FIX: correlation() returns NaN for a constant row; store 0
                # instead, consistent with the NaN handling in Events.
                # (x != x is True only for NaN.)
                if usim != usim:
                    usim = 0.0
                self.userSimMatrix[i, j] = usim
                self.userSimMatrix[j, i] = usim
        sio.mmwrite('US_userSimMatrix', self.userSimMatrix)


# Mining the user's social relationships
class UserFriends:
    """
    Mine each user's friends list.

    Intuition:
      1) many friends may mean an outgoing user who attends more events;
      2) if a user's friends attend an event, the user may follow them.
    """
    def __init__(self, programEntities):
        nusers = len(programEntities.userIndex.keys())  # 3391
        self.numFriends = np.zeros((nusers))            # friend count per user
        self.userFriends = ss.dok_matrix((nusers, nusers))
        # FIX: use a context manager so the gzip handle is always closed.
        with gzip.open('user_friends.csv.gz') as fin:
            print('Header In User_friends.csv.gz:', fin.readline())
            ln = 0
            # For each user present in userIndex: record the friend count, and
            # for every friend also in userIndex store the friend's mean score
            # over all events (from userEventScores) as an influence weight.
            for line in fin:
                if ln % 200 == 0:
                    print('Loading line:', ln)
                cols = line.decode().strip().split(',')
                user = cols[0]
                if user in programEntities.userIndex:
                    friends = cols[1].split(' ')  # space-separated friend ids
                    i = programEntities.userIndex[user]
                    self.numFriends[i] = len(friends)
                    for friend in friends:
                        if friend in programEntities.userIndex:
                            j = programEntities.userIndex[friend]
                            # The friend's average score across all 13418
                            # events (values in {-1, 0, 1}) approximates the
                            # degree and direction of their influence.
                            eventsForUser = programEntities.userEventScores.getrow(j).todense()
                            score = eventsForUser.sum() / np.shape(eventsForUser)[1]
                            self.userFriends[i, j] += score
                            self.userFriends[j, i] += score
                ln += 1

        # Normalize friend counts to proportions of the global total,
        # e.g. 2151 friends / sum 3731377 = 0.000576...
        sumNumFriends = self.numFriends.sum(axis=0)
        # FIX: guard against division by zero on empty/filtered data.
        if sumNumFriends > 0:
            self.numFriends = self.numFriends / sumNumFriends
        sio.mmwrite('UF_numFriends', np.matrix(self.numFriends))
        self.userFriends = normalize(self.userFriends, norm='l1', axis=0, copy=False)
        sio.mmwrite('UF_userFriends', self.userFriends)


# Build event-event similarity data
class Events:
    """
    Build event-event similarities. Two kinds:
      1) eventPropSim — from event metadata (time, place), via psim;
      2) eventContSim — from event content (the 100 word-count columns), via csim.
    """
    def __init__(self, programEntities, psim=ssd.correlation, csim=ssd.cosine):
        cleaner = DataCleaner()
        nevents = len(programEntities.eventIndex)  # 13418
        print(nevents)
        self.eventPropMatrix = ss.dok_matrix((nevents, 7))
        self.eventContMatrix = ss.dok_matrix((nevents, 100))
        # FIX: use a context manager so the gzip handle is always closed.
        with gzip.open('events.csv.gz') as fin:
            fin.readline()  # skip header
            for line in fin:
                cols = line.decode().strip().split(',')
                eventId = cols[0]
                if eventId in programEntities.eventIndex:
                    i = programEntities.eventIndex[eventId]
                    self.eventPropMatrix[i, 0] = cleaner.getJoinedYearMonth(cols[2])  # start_time
                    self.eventPropMatrix[i, 1] = cleaner.getFeatureHash(cols[3])      # city
                    self.eventPropMatrix[i, 2] = cleaner.getFeatureHash(cols[4])      # state
                    self.eventPropMatrix[i, 3] = cleaner.getFeatureHash(cols[5])      # zip
                    self.eventPropMatrix[i, 4] = cleaner.getFeatureHash(cols[6])      # country
                    self.eventPropMatrix[i, 5] = cleaner.getFloatValue(cols[7])       # lat
                    self.eventPropMatrix[i, 6] = cleaner.getFloatValue(cols[8])       # lon
                    for j in range(9, 109):
                        # 100 word-count content features.
                        self.eventContMatrix[i, j - 9] = cols[j]
        self.eventPropMatrix = normalize(self.eventPropMatrix, norm='l1', axis=0, copy=False)
        sio.mmwrite('EV_eventPropMatrix', self.eventPropMatrix)
        self.eventContMatrix = normalize(self.eventContMatrix, norm='l1', axis=0, copy=False)
        sio.mmwrite('EV_eventContMatrix', self.eventContMatrix)

        # Pairwise similarities for associated event pairs only.
        self.eventPropSim = ss.dok_matrix((nevents, nevents))
        self.eventContSim = ss.dok_matrix((nevents, nevents))
        for e1, e2 in programEntities.uniqueEventPairs:
            i = programEntities.eventIndex[e1]
            j = programEntities.eventIndex[e2]
            if not ((i, j) in self.eventPropSim):
                epsim = psim(self.eventPropMatrix.getrow(i).todense(),
                             self.eventPropMatrix.getrow(j).todense())
                # correlation/cosine return NaN when a row is all zeros
                # (division by a zero norm); treat that as no similarity.
                if np.isnan(epsim):
                    epsim = 0
                self.eventPropSim[i, j] = epsim
                self.eventPropSim[j, i] = epsim
            if not ((i, j) in self.eventContSim):
                ecsim = csim(self.eventContMatrix.getrow(i).todense(),
                             self.eventContMatrix.getrow(j).todense())
                if np.isnan(ecsim):
                    ecsim = 0
                self.eventContSim[i, j] = ecsim
                self.eventContSim[j, i] = ecsim
        sio.mmwrite('EV_eventPropSim', self.eventPropSim)
        sio.mmwrite('EV_eventContSim', self.eventContSim)
# Step 5: event popularity
class EventAttendees:
    """
    Count, per event, the attendees minus the decliners, as a proxy for
    event popularity/activity.
    """
    def __init__(self, programEntities):
        nevents = len(programEntities.eventIndex)  # 13418 events in total
        self.eventPopularity = ss.dok_matrix((nevents, 1))
        # FIX: use a context manager so the gzip handle is always closed.
        with gzip.open('event_attendees.csv.gz') as f:
            f.readline()  # skip header
            for line in f:
                cols = line.decode().strip().split(',')
                eventId = cols[0]
                if eventId in programEntities.eventIndex:
                    i = programEntities.eventIndex[eventId]
                    # popularity = count of "yes" ids minus count of "no" ids
                    self.eventPopularity[i, 0] = \
                        len(cols[1].split(' ')) - len(cols[4].split(' '))
        self.eventPopularity = normalize(self.eventPopularity, norm='l1',
                                         axis=0, copy=False)
        sio.mmwrite('EA_eventPopularity', self.eventPopularity)


def data_prepare():
    """
    Compute and persist all matrices (steps 1-5) used later for feature
    extraction and modeling.
    """
    print('第1步:统计user和event相关信息...')
    pe = ProgramEntities()
    print('第1步完成...\n')
    print('第2步:计算用户相似度信息,并用矩阵形式存储...')
    Users(pe)
    print('第2步完成...\n')
    print('第3步:计算用户社交关系信息,并存储...')
    UserFriends(pe)
    print('第3步完成...\n')
    print('第4步:计算event相似度信息,并用矩阵形式存储...')
    Events(pe)
    print('第4步完成...\n')
    print('第5步:计算event热度信息...')
    EventAttendees(pe)
    print('第5步完成...\n')


# Run data preparation
# Run the full data-preparation pipeline (steps 1-5).
data_prepare()
6. 特征构建 (Feature construction)
# This is the feature-construction part
# import cPickle
# From Python 3, cPickle has been replaced by _pickle
import _pickle as cPickle
import scipy.io as sio
import numpy as np


class DataRewriter:
    """
    Combine the user-based CF, item-based CF, popularity and influence
    signals computed in steps 1-5 into feature rows for the classifier.
    """
    def __init__(self):
        # Load everything persisted by the data-preparation step.
        # FIX: the original passed open(...) directly to load and never
        # closed the handles; use context managers.
        with open('PE_userIndex.pkl', 'rb') as f:
            self.userIndex = cPickle.load(f)
        with open('PE_eventIndex.pkl', 'rb') as f:
            self.eventIndex = cPickle.load(f)
        self.userEventScores = sio.mmread('PE_userEventScores').todense()
        self.userSimMatrix = sio.mmread('US_userSimMatrix').todense()
        self.eventPropSim = sio.mmread('EV_eventPropSim').todense()
        self.eventContSim = sio.mmread('EV_eventContSim').todense()
        self.numFriends = sio.mmread('UF_numFriends')
        self.userFriends = sio.mmread('UF_userFriends').todense()
        self.eventPopularity = sio.mmread('EA_eventPopularity').todense()

    def userReco(self, userId, eventId):
        """
        User-based collaborative-filtering score for (userId, eventId).

        Pseudocode:
          for every other user v with a preference for event i, incorporate
          v's preference weighted by sim(u, v) into a running total.
        """
        i = self.userIndex[userId]
        j = self.eventIndex[eventId]
        vs = self.userEventScores[:, j]
        sims = self.userSimMatrix[i, :]
        prod = sims * vs
        try:
            # Subtract the user's own score so it does not count itself.
            return prod[0, 0] - self.userEventScores[i, j]
        except IndexError:
            return 0

    def eventReco(self, userId, eventId):
        """
        Item-based collaborative-filtering scores for (userId, eventId).

        Returns (property-based score, content-based score): the user's
        event scores weighted by the two event-event similarity matrices.
        """
        i = self.userIndex[userId]
        j = self.eventIndex[eventId]
        js = self.userEventScores[i, :]
        psim = self.eventPropSim[:, j]
        csim = self.eventContSim[:, j]
        pprod = js * psim
        cprod = js * csim
        pscore = 0
        cscore = 0
        try:
            pscore = pprod[0, 0] - self.userEventScores[i, j]
        except IndexError:
            pass
        try:
            cscore = cprod[0, 0] - self.userEventScores[i, j]
        except IndexError:
            pass
        return pscore, cscore

    def userPop(self, userId):
        """
        Social-activity proxy from the user's (normalized) friend count;
        users with many friends may attend more social events.
        """
        if userId in self.userIndex:
            i = self.userIndex[userId]
            try:
                return self.numFriends[0, i]
            except IndexError:
                return 0
        else:
            return 0

    def friendInfluence(self, userId):
        """
        Influence of the user's friends: how much the friends, on average,
        like attending events.
        """
        nusers = np.shape(self.userFriends)[1]
        i = self.userIndex[userId]
        # NOTE(review): summing with axis=0 over a single row then dividing
        # by nusers looks odd — axis=1 (sum of the user's friends' scores)
        # may be intended; confirm before changing feature semantics.
        return (self.userFriends[i, :].sum(axis=0) / nusers)[0, 0]

    def eventPop(self, eventId):
        """Popularity of the event itself (attendees minus decliners)."""
        i = self.eventIndex[eventId]
        return self.eventPopularity[i, 0]

    def rewriteData(self, start=1, train=True, header=True):
        """
        Merge the CF scores, popularity and influence signals into a new
        file (data_train.csv / data_test.csv) for the classifier.

        start: first 1-based line number to process (2 skips the header);
        train: read/write the train files if True, else the test files;
        header: write the output column header row.
        """
        fn = 'train.csv' if train else 'test.csv'
        with open(fn) as fin, open('data_' + fn, 'w') as fout:
            if header:
                ocolnames = ['invited', 'user_reco', 'evt_p_reco',
                             'evt_c_reco', 'user_pop', 'frnd_infl', 'evt_pop']
                if train:
                    ocolnames.append('interested')
                    ocolnames.append('not_interested')
                fout.write(','.join(ocolnames) + '\n')
            ln = 0
            for line in fin:
                ln += 1
                if ln < start:
                    continue
                cols = line.strip().split(',')
                # columns: user,event,invited,timestamp[,interested,not_interested]
                userId = cols[0]
                eventId = cols[1]
                invited = cols[2]
                if ln % 500 == 0:
                    print("%s : %d (userId, eventId) = (%s, %s)" % (fn, ln, userId, eventId))
                user_reco = self.userReco(userId, eventId)
                evt_p_reco, evt_c_reco = self.eventReco(userId, eventId)
                user_pop = self.userPop(userId)
                frnd_infl = self.friendInfluence(userId)
                evt_pop = self.eventPop(eventId)
                ocols = [invited, user_reco, evt_p_reco, evt_c_reco,
                         user_pop, frnd_infl, evt_pop]
                if train:
                    ocols.append(cols[4])  # interested
                    ocols.append(cols[5])  # not_interested
                fout.write(','.join(map(lambda x: str(x), ocols)) + '\n')

    def rewriteTrainingSet(self):
        # BUG FIX: was self.rewriteData(True), which bound True to `start`
        # (not `train`); pass by keyword.
        self.rewriteData(train=True)

    def rewriteTestSet(self):
        # BUG FIX: was self.rewriteData(False), which bound False to `start`
        # and left train=True — it regenerated the TRAINING file instead of
        # the test file; pass by keyword.
        self.rewriteData(train=False)


dr = DataRewriter()
# Generate the feature-augmented training file (data_train.csv),
# skipping the header row of the input (start=2).
print('生成训练数据...\n')
dr.rewriteData(train=True, start=2, header=True)

# Generate the feature-augmented prediction file (data_test.csv).
print('生成预测数据...\n')
dr.rewriteData(train=False, start=2, header=True)

print('done')
第七步:模型构建与预测 (Step 7: model building and prediction)
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings('ignore')


def train():
    """
    Train a classifier on the engineered features; the target is
    `interested`: 1 (interested) or 0 (not interested).
    """
    trainDf = pd.read_csv('data_train.csv')
    X = np.matrix(pd.DataFrame(trainDf, index=None,
                               columns=['invited', 'user_reco', 'evt_p_reco',
                                        'evt_c_reco', 'user_pop', 'frnd_infl',
                                        'evt_pop']))
    y = np.array(trainDf.interested)
    clf = SGDClassifier(loss='log', penalty='l2')
    clf.fit(X, y)
    return clf


def validate():
    """10-fold cross-validation; prints per-fold and average accuracy."""
    trainDf = pd.read_csv('data_train.csv')
    X = np.matrix(pd.DataFrame(trainDf, index=None,
                               columns=['invited', 'user_reco', 'evt_p_reco',
                                        'evt_c_reco', 'user_pop', 'frnd_infl',
                                        'evt_pop']))
    y = np.array(trainDf.interested)
    kfold = KFold(n_splits=10, shuffle=False)
    avgAccuracy = 0
    run = 0
    # FIX: loop variables renamed from (train, test), which shadowed the
    # train()/test() functions defined in this module.
    for train_idx, test_idx in kfold.split(X, y):
        Xtrain, Xtest = X[train_idx], X[test_idx]
        ytrain, ytest = y[train_idx], y[test_idx]
        clf = SGDClassifier(loss='log', penalty='l2')
        clf.fit(Xtrain, ytrain)
        # Vectorized accuracy (was a per-row predict loop).
        accuracy = np.mean(clf.predict(Xtest) == ytest)
        print('accuracy(run %d) : %f' % (run, accuracy))
        # BUG FIX: `run` and `avgAccuracy` were never updated, so every fold
        # printed "run 0" and the promised average was never reported.
        avgAccuracy += accuracy
        run += 1
    if run > 0:
        print('average accuracy : %f' % (avgAccuracy / run))


def test(clf):
    """Read the test features, predict with clf, and write result.csv."""
    origTestDf = pd.read_csv("test.csv")
    users = origTestDf.user
    events = origTestDf.event
    testDf = pd.read_csv("data_test.csv")
    # FIX: use a context manager so the output file is always closed.
    with open("result.csv", 'w') as fout:
        fout.write(",".join(["user", "event", "outcome", "dist"]) + "\n")
        nrows = len(testDf)
        Xp = np.matrix(testDf)
        yp = np.zeros((nrows, 2))
        for i in range(0, nrows):
            xp = Xp[i, :]
            yp[i, 0] = clf.predict(xp)
            yp[i, 1] = clf.decision_function(xp)
            fout.write(",".join(map(lambda x: str(x),
                                    [users[i], events[i], yp[i, 0], yp[i, 1]])) + "\n")


clf = train()
# Report 10-fold cross-validation accuracy on the engineered features.
validate()
# Predict on the test set and write result.csv (user, event, outcome, dist).
test(clf)
print('done')