# 本文补充第二种特征生成方法。第一种方法是:在 A 属性等于 1 的条件下,对同一卡号的 B 属性各个取值求和;第二种方法(即本文的做法)是:按卡号统计各个维度(字段)上的统计指标(求和等)。
# In[86]:
# Read the transaction table from CSV. low_memory=False makes pandas parse the
# file in one pass rather than in chunks, which avoids mixed-type inference.
transaction = pd.read_csv(
    'd:/transaction_d_pre.csv',
    low_memory=False,
)
# In[87]:
# Notebook preview of the first five rows.
transaction.head(n=5)
# In[98]:
# Columns that receive the full set of summary statistics when the
# transactions are aggregated per card_id. Each name appears exactly once:
# the list is used to key a dict, so a repeated entry would add nothing.
numeric_cols = [
    'authorized_flag',
    'category_1',
    'installments',
    'category_3',
    'month_lag',
    'purchase_month',
    'purchase_amount',
    'category_2',
    'purchase_hour_section',
    'most_recent_sales_range',
    'most_recent_purchases_range',
]
# In[99]:
# Columns for which only the number of distinct values per card_id is computed.
categorical_cols = [
    'city_id',
    'merchant_category_id',
    'merchant_id',
    'state_id',
    'subsector_id',
    'category_4',
    'purchase_date',
]
# In[100]:
# Aggregation spec for DataFrameGroupBy.agg, keyed by column name; each value
# is the list of statistics computed for that column.
aggs = {}
# In[101]:
# Numeric columns get the full summary set. A fresh list is created for every
# column so that editing one entry later cannot affect the others.
aggs.update(
    (col, ['nunique', 'mean', 'min', 'max', 'var', 'skew', 'sum'])
    for col in numeric_cols
)
# In[102]:
# Categorical columns only get a distinct-value count.
aggs.update((col, ['nunique']) for col in categorical_cols)
aggs
# In[103]:
# Also aggregate card_id itself (the grouping key) to get per-card row counts.
aggs['card_id'] = ['size', 'count']
# In[104]:
def _aggregate_by_card(frame, spec, names, suffix=''):
    """Summarise *frame* per card_id and flatten the column labels.

    Args:
        frame: transaction rows to summarise; must contain ``card_id`` and
            every column named in *spec*.
        spec: mapping of column name -> list of statistic names, passed
            unchanged to ``DataFrameGroupBy.agg``.
        names: flat output names, ``card_id`` first, followed by one name per
            (column, statistic) pair in the order *spec* lists them.
        suffix: text appended to every name except ``card_id``.

    Returns:
        A DataFrame with one row per card_id whose columns are *names* with
        *suffix* applied.
    """
    out = frame.groupby('card_id').agg(spec).reset_index()
    # agg() with a dict of lists yields two-level column labels; replace them
    # with the flat names, leaving the card_id key unsuffixed so it can be
    # used as the join column.
    out.columns = names[:1] + [name + suffix for name in names[1:]]
    return out


# Flat output names: the key first, then column name and statistic joined
# directly, in the same order as aggs.
# NOTE(review): the two parts are joined without a separator
# (e.g. 'installmentsmean'); confirm that is intended before changing it,
# because these names end up in the headers of the saved CSV files.
cols = ['card_id']
# In[105]:
for key, stats in aggs.items():
    cols.extend(key + stat for stat in stats)

# Rows with month_lag < 0 produce the '_hist' features and rows with
# month_lag >= 0 the '_new' features.
df = _aggregate_by_card(
    transaction[transaction['month_lag'] < 0], aggs, cols, '_hist')
df2 = _aggregate_by_card(
    transaction[transaction['month_lag'] >= 0], aggs, cols, '_new')
# Outer joins keep a card that is absent from one of the subsets; a left join
# anchored on the '_hist' frame would drop the '_new' and overall features of
# any card that has no month_lag < 0 rows. Column order is unaffected.
df = pd.merge(df, df2, how='outer', on='card_id')
# Unsuffixed features computed over all rows of each card.
df2 = _aggregate_by_card(transaction, aggs, cols)
df = pd.merge(df, df2, how='outer', on='card_id')
# The raw transaction rows are not used again; release them before the
# train/test joins.
del transaction
gc.collect()
# Build the training and test sets: attach the per-card features to each
# frame. The left join keeps every existing train/test row; cards without
# aggregated features get NaN in the new columns.
train = pd.merge(train, df, how='left', on='card_id')
test = pd.merge(test, df, how='left', on='card_id')
del df
# Paths are relative to the working directory; to_csv does not create the
# 'preprocess' directory, so it has to exist already.
train.to_csv('preprocess/train_groupby.csv', index=False)
test.to_csv('preprocess/test_groupby.csv', index=False)
gc.collect()