1. 函数库导入
import os, sys, pickleimport numpy as np
import pandas as pdimport matplotlib. pyplot as plt
import matplotlib. dates as mdatesimport seaborn as sns
import datetime as dtfrom datetime import datefrom sklearn. linear_model import SGDClassifier, LogisticRegression
from sklearn. metrics import log_loss, roc_auc_score, auc, roc_curve
% matplotlib inline
% config InlineBackend. figure_format = 'retina'
2. 读取文件数据
# Load the three CSV files used by this notebook into DataFrames.
dfoff = pd.read_csv('./data/ccf_offline_stage1_train.csv')
dftest = pd.read_csv('./data/ccf_offline_stage1_test_revised.csv')
dfon = pd.read_csv('./data/ccf_online_stage1_train.csv')

# Preview the first rows of the offline training data.
dfoff.head()
   User_id  Merchant_id  Coupon_id Discount_rate  Distance  Date_received        Date
0  1439408         2632        NaN           NaN       0.0            NaN  20160217.0
1  1439408         4663    11002.0        150:20       1.0     20160528.0         NaN
2  1439408         2632     8591.0          20:1       0.0     20160217.0         NaN
3  1439408         2632     1078.0          20:1       0.0     20160319.0         NaN
4  1439408         2632     8591.0          20:1       0.0     20160613.0         NaN
3. 数据处理
def getDiscountType(row):
    """Classify a single ``Discount_rate`` value.

    Args:
        row: One cell of the ``Discount_rate`` column (e.g. ``'150:20'``,
            a plain rate, or a missing value).

    Returns:
        ``np.nan`` when the value is null, ``1`` when its text contains a
        ``':'`` separator, and ``0`` otherwise.
    """
    if pd.isnull(row):
        return np.nan
    # str() keeps this consistent with the sibling helpers and avoids a
    # TypeError when the cell holds a non-null number instead of a string.
    if ':' in str(row):
        return 1
    return 0


def convertRate(row):
    """Convert a single ``Discount_rate`` value to a numeric rate.

    Args:
        row: One cell of the ``Discount_rate`` column.

    Returns:
        ``1.0`` for a null value; ``1 - b / a`` for a value of the form
        ``'a:b'``; otherwise the value converted with ``float``.
    """
    if pd.isnull(row):
        return 1.0
    text = str(row)
    if ':' in text:
        parts = text.split(':')
        return 1.0 - float(parts[1]) / float(parts[0])
    return float(row)


def getDiscountMan(row):
    """Return the integer before the ``':'`` in an ``'a:b'`` value, else ``0``."""
    text = str(row)
    if ':' in text:
        return int(text.split(':')[0])
    return 0


def getDiscountJian(row):
    """Return the integer after the ``':'`` in an ``'a:b'`` value, else ``0``."""
    text = str(row)
    if ':' in text:
        return int(text.split(':')[1])
    return 0


def processData(df):
    """Add derived discount and distance columns to ``df``.

    The frame is modified in place and also returned. It must contain the
    ``Discount_rate`` and ``Distance`` columns.

    Args:
        df: DataFrame to augment.

    Returns:
        The same DataFrame with the columns ``discount_rate``,
        ``discount_man``, ``discount_jian``, ``discount_type`` and
        ``distance`` added.
    """
    df['discount_rate'] = df['Discount_rate'].apply(convertRate)
    df['discount_man'] = df['Discount_rate'].apply(getDiscountMan)
    df['discount_jian'] = df['Discount_rate'].apply(getDiscountJian)
    df['discount_type'] = df['Discount_rate'].apply(getDiscountType)
    # Missing distances are encoded as -1 so the column can be cast to int.
    df['distance'] = df['Distance'].fillna(-1).astype(int)
    return df


dfoff = processData(dfoff)
# Apply the same feature processing to the test set.
dftest = processData(dftest)

# Preview the processed frames (in a notebook only the last expression of a
# cell is displayed).
dfoff.head()
dftest.head()
   User_id  Merchant_id  Coupon_id Discount_rate  Distance  Date_received  discount_rate  discount_man  discount_jian  discount_type  distance
0  4129537          450       9983          30:5       1.0       20160712       0.833333            30              5              1         1
1  6949378         1300       3429          30:5       NaN       20160706       0.833333            30              5              1        -1
2  2166529         7113       6928        200:20       5.0       20160727       0.900000           200             20              1         5
3  2166529         7113       1808        100:10       5.0       20160727       0.900000           100             10              1         5
4  6172162         7605       6500          30:1       2.0       20160708       0.966667            30              1              1         2
# Distinct non-null values of the Date_received column, sorted ascending.
date_received = dfoff['Date_received'].unique()
date_received = sorted(date_received[pd.notnull(date_received)])

# Distinct non-null values of the Date column, sorted ascending.
date_buy = dfoff['Date'].unique()
date_buy = sorted(date_buy[pd.notnull(date_buy)])

# Display the sorted list (notebook cell output).
date_buy