1. Modeling Steps
(1) Derive features from the raw data
(2) Impute missing values for categorical and numerical variables
(3) Bin the variables with ChiMerge (chi-square binning)
(4) Single-variable and multivariate analysis after WOE encoding (see the WOE/IV sketch after this list)
(5) Fit a logistic regression model
(6) Scale the predicted probabilities into scores
(7) Evaluate the model's predictive power
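Steps (3) and (4) rely on Weight of Evidence (WOE) and Information Value (IV). The helper that computes them in the code below, CalcWOE, is imported from the author's external scorecard_functions module and is not listed in this post, so the following is only a minimal sketch of the standard WOE/IV definition it is assumed to implement (binary target, 1 = bad; the name woe_iv_sketch is ours and is not called by the pipeline):

import numpy as np
import pandas as pd

def woe_iv_sketch(df: pd.DataFrame, col: str, target: str = 'target'):
    """Illustrative only: standard WOE/IV for one binned variable."""
    total_bad = df[target].sum()
    total_good = df.shape[0] - total_bad
    grouped = df.groupby(col)[target].agg(['sum', 'count'])
    woe, iv = {}, 0.0
    for bin_value, row in grouped.iterrows():
        bad = row['sum']
        good = row['count'] - row['sum']
        # a small floor keeps empty cells from producing log(0)
        good_dist = max(good, 0.5) / total_good
        bad_dist = max(bad, 0.5) / total_bad
        woe[bin_value] = np.log(good_dist / bad_dist)
        iv += (good_dist - bad_dist) * woe[bin_value]
    return {'WOE': woe, 'IV': iv}

With this good-over-bad convention a higher WOE means a safer bin, which is why the logistic regression in step (5) is expected to produce negative coefficients on the WOE-encoded variables; some references define WOE with the ratio inverted, which only flips the sign and leaves IV unchanged.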
2. Code
# -*- coding: utf-8 -*-
import pandas as pd
import datetime
import collections
import numpy as np
import numbers
import random
import sys
import pickle
from itertools import combinations
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
import statsmodels.api as sm
# # In case of display problems with plots:
# import matplotlib
# matplotlib.use('TkAgg')
from matplotlib import pyplot as plt
# If running in Jupyter, uncomment the next line.
#%matplotlib inline
from scorecard_functions import *

#######################################
# UDF: user-defined functions
#######################################

### Count, for each time window, how many records fall inside it ###
def TimeWindowSelection(df, daysCol, time_windows):
    '''
    :param df: the dataset containing the variable of days
    :param daysCol: the column of days
    :param time_windows: the list of time windows
    :return: dict mapping each time window to the number of records within it
    '''
    freq_tw = {}
    for tw in time_windows:
        freq = sum(df[daysCol].apply(lambda x: int(x <= tw)))
        freq_tw[tw] = freq
    # e.g. freq_tw = {30: 499175, 60: 524173, 90: 535223, 120: 542683, 150: 548083, 180: 552009,
    #                 210: 555009, 240: 557393, 270: 559259, 300: 560823, 330: 562458, 360: 563952}
    return freq_tw

def DeivdedByZero(nominator, denominator):
    '''
    Return 0 when the denominator is 0; otherwise return the ordinary quotient.
    '''
    if denominator == 0:
        return 0
    else:
        return nominator * 1.0 / denominator

# Normalize fields that express the same thing in different ways
def ChangeContent(x):
    y = x.upper()
    if y == '_MOBILEPHONE':
        y = '_PHONE'
    return y

def MissingCategorial(df, x):
    '''
    :param df: the dataset
    :param x: the column name
    :return: missing rate of the column
    '''
    missing_vals = df[x].map(lambda x: int(x != x))
    return sum(missing_vals) * 1.0 / df.shape[0]

def MissingContinuous(df, x):
    missing_vals = df[x].map(lambda x: int(np.isnan(x)))
    return sum(missing_vals) * 1.0 / df.shape[0]

def MakeupRandom(x, sampledList):
    if x == x:
        return x
    else:
        randIndex = random.randint(0, len(sampledList) - 1)
        return sampledList[randIndex]

#############################################################
# Step 0: initial work: read the data files, check the consistency of the user Idx, etc.
#############################################################
# F:\chen\download\creditcard\Chimerge\cyc
# folderOfData = '/Users/Code/Data Collections/bank default/'
folderOfData = 'F:/chen/download/creditcard/Chimerge/cyc/'
data1 = pd.read_csv(folderOfData + 'PPD_LogInfo_3_1_Training_Set.csv', header=0)
data2 = pd.read_csv(folderOfData + 'PPD_Training_Master_GBK_3_1_Training_Set.csv', dtype={'target': np.int64}, header=0, encoding='gbk')
data3 = pd.read_csv(folderOfData + 'PPD_Userupdate_Info_3_1_Training_Set.csv', header=0)
# score card:
# data = pd.read_csv('F:/chen/download/creditcard/score card/data_all_values.csv')

##############################################################################################
# Step 1: derive features from PPD_LogInfo_3_1_Training_Set & PPD_Userupdate_Info_3_1_Training_Set
##############################################################################################
# compare whether the four city variables match
data2['city_match'] = data2.apply(lambda x: int(x.UserInfo_2 == x.UserInfo_4 == x.UserInfo_8 == x.UserInfo_20), axis=1)
del data2['UserInfo_2']
del data2['UserInfo_4']
del data2['UserInfo_8']
del data2['UserInfo_20']

### Extract the application date, compute the gap in days and inspect its distribution
data1['logInfo'] = data1['LogInfo3'].map(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d'))
data1['Listinginfo'] = data1['Listinginfo1'].map(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d'))
data1['ListingGap'] = data1[['logInfo','Listinginfo']].apply(lambda x: (x[1] - x[0]).days, axis=1)
plt.hist(data1['ListingGap'], bins=200)
plt.title('Days between login date and listing date')
ListingGap2 = data1['ListingGap'].map(lambda x: min(x, 365))
plt.hist(ListingGap2, bins=200)
# plt.figure()
plt.show()

timeWindows = TimeWindowSelection(data1, 'ListingGap', range(30, 361, 30))

'''
Use 180 days as the largest time window for the new features.
The usable time windows are 7, 30, 60, 90, 120, 150 and 180 days.
Within each time window, compute the total number of logins, the number of distinct
login types, and the average number of logins per type.
'''
time_window = [7, 30, 60, 90, 120, 150, 180]
var_list = ['LogInfo1','LogInfo2']
data1GroupbyIdx = pd.DataFrame({'Idx': data1['Idx'].drop_duplicates()})

for tw in time_window:
    data1['TruncatedLogInfo'] = data1['Listinginfo'].map(lambda x: x + datetime.timedelta(-tw))
    temp = data1.loc[data1['logInfo'] >= data1['TruncatedLogInfo']]
    for var in var_list:
        # count the frequencies of LogInfo1 and LogInfo2
        count_stats = temp.groupby(['Idx'])[var].count().to_dict()
        data1GroupbyIdx[str(var)+'_'+str(tw)+'_count'] = data1GroupbyIdx['Idx'].map(lambda x: count_stats.get(x, 0))
        # count the distinct values of LogInfo1 and LogInfo2
        Idx_UserupdateInfo1 = temp[['Idx', var]].drop_duplicates()
        uniq_stats = Idx_UserupdateInfo1.groupby(['Idx'])[var].count().to_dict()
        data1GroupbyIdx[str(var) + '_' + str(tw) + '_unique'] = data1GroupbyIdx['Idx'].map(lambda x: uniq_stats.get(x, 0))
        # calculate the average count of each value in LogInfo1 and LogInfo2
        # data1GroupbyIdx was built from all Idx values, so the groupby may not contain a given Idx,
        # in which case its count is 0; DeivdedByZero prevents a division by zero in the average.
        data1GroupbyIdx[str(var) + '_' + str(tw) + '_avg_count'] = data1GroupbyIdx[[str(var)+'_'+str(tw)+'_count', str(var)+'_'+str(tw)+'_unique']].\
            apply(lambda x: DeivdedByZero(x[0], x[1]), axis=1)

data3['ListingInfo'] = data3['ListingInfo1'].map(lambda x: datetime.datetime.strptime(x, '%Y/%m/%d'))
data3['UserupdateInfo'] = data3['UserupdateInfo2'].map(lambda x: datetime.datetime.strptime(x, '%Y/%m/%d'))
data3['ListingGap'] = data3[['UserupdateInfo','ListingInfo']].apply(lambda x: (x[1] - x[0]).days, axis=1)
collections.Counter(data3['ListingGap'])
hist_ListingGap = np.histogram(data3['ListingGap'])
hist_ListingGap = pd.DataFrame({'Freq': hist_ListingGap[0], 'gap': hist_ListingGap[1][1:]})
hist_ListingGap['CumFreq'] = hist_ListingGap['Freq'].cumsum()
hist_ListingGap['CumPercent'] = hist_ListingGap['CumFreq'].map(lambda x: x*1.0/hist_ListingGap.iloc[-1]['CumFreq'])

'''
Unify different spellings of the same item, e.g. QQ and qQ, Idnumber and idNumber, MOBILEPHONE and PHONE.
Within each time slice, compute
(1) the update frequency,
(2) the number of distinct updated items,
(3) whether important items such as IDNUMBER, HASBUYCAR, MARRIAGESTATUSID and PHONE were updated.
    An applicant who changes key information such as the ID number, car ownership or marital status
    may well be problematic, so each of these becomes a derived indicator variable.
Note: the original code mapped with item_dict.get(x, x), i.e. it used the Idx itself as the default
when an Idx has no updates in the window, which inflates the feature; the default is corrected to 0 below.
'''
data3['UserupdateInfo1'] = data3['UserupdateInfo1'].map(ChangeContent)
data3GroupbyIdx = pd.DataFrame({'Idx': data3['Idx'].drop_duplicates()})

time_window = [7, 30, 60, 90, 120, 150, 180]
for tw in time_window:
    data3['TruncatedLogInfo'] = data3['ListingInfo'].map(lambda x: x + datetime.timedelta(-tw))
    temp = data3.loc[data3['UserupdateInfo'] >= data3['TruncatedLogInfo']]

    # frequency of updating
    freq_stats = temp.groupby(['Idx'])['UserupdateInfo1'].count().to_dict()
    data3GroupbyIdx['UserupdateInfo_'+str(tw)+'_freq'] = data3GroupbyIdx['Idx'].map(lambda x: freq_stats.get(x, 0))

    # number of updated types
    Idx_UserupdateInfo1 = temp[['Idx','UserupdateInfo1']].drop_duplicates()
    uniq_stats = Idx_UserupdateInfo1.groupby(['Idx'])['UserupdateInfo1'].count().to_dict()
    # default corrected from get(x, x) to get(x, 0): an Idx without updates has 0 distinct types
    data3GroupbyIdx['UserupdateInfo_' + str(tw) + '_unique'] = data3GroupbyIdx['Idx'].map(lambda x: uniq_stats.get(x, 0))

    # average count of each type (DeivdedByZero guards the Idx values with 0 updated types)
    data3GroupbyIdx['UserupdateInfo_' + str(tw) + '_avg_count'] = data3GroupbyIdx[['UserupdateInfo_'+str(tw)+'_freq', 'UserupdateInfo_' + str(tw) + '_unique']]. \
        apply(lambda x: DeivdedByZero(x[0], x[1]), axis=1)

    # whether the applicant changed items like IDNUMBER, HASBUYCAR, MARRIAGESTATUSID, PHONE
    # Idx_UserupdateInfo1 looks like:
    #     Idx    UserupdateInfo1
    #     10002  _PHONE
    #     10006  _EDUCATIONID
    #     10006  _HASBUYCAR
    #     ...
    Idx_UserupdateInfo1['UserupdateInfo1'] = Idx_UserupdateInfo1['UserupdateInfo1'].map(lambda x: [x])
    # After the groupby-sum, each Idx maps to the list of items it updated, e.g.
    #     8   [_PHONE]
    #     18  [_EDUCATIONID, _HASBUYCAR, _IDNUMBER, _LASTUPD...]
    #     ...
    Idx_UserupdateInfo1_V2 = Idx_UserupdateInfo1.groupby(['Idx'])['UserupdateInfo1'].sum()
    for item in ['_IDNUMBER','_HASBUYCAR','_MARRIAGESTATUSID','_PHONE']:
        item_dict = Idx_UserupdateInfo1_V2.map(lambda x: int(item in x)).to_dict()
        # default corrected from get(x, x) to get(x, 0): no update of this item in the window means 0
        data3GroupbyIdx['UserupdateInfo_' + str(tw) + str(item)] = data3GroupbyIdx['Idx'].map(lambda x: item_dict.get(x, 0))

# Combine the above features with the raw features in PPD_Training_Master_GBK_3_1_Training_Set
allData = pd.concat([data2.set_index('Idx'), data3GroupbyIdx.set_index('Idx'), data1GroupbyIdx.set_index('Idx')], axis=1)
allData.to_csv(folderOfData+'allData_0.csv', encoding='gbk')

#######################################
# Step 2: impute missing values for categorical and numerical variables
#######################################
allData = pd.read_csv(folderOfData+'allData_0.csv', header=0, encoding='gbk')
allFeatures = list(allData.columns)
allFeatures.remove('target')
if 'Idx' in allFeatures:
    allFeatures.remove('Idx')
allFeatures.remove('ListingInfo')

# Check for constant variables (drop them if found) and decide whether each variable is categorical or numerical.
"""
Some variables are numerical but take only a few values; these may really be categorical
variables carrying a special meaning.
"""
numerical_var = []
for col in list(allFeatures):   # iterate over a copy because allFeatures is modified inside the loop
    if len(set(allData[col])) == 1:
        print('delete {} from the dataset because it is a constant'.format(col))
        del allData[col]
        allFeatures.remove(col)
    else:
        # exclude missing values
        uniq_valid_vals = [i for i in allData[col] if i == i]
        # set() conveniently collects the distinct values
        uniq_valid_vals = list(set(uniq_valid_vals))
        if len(uniq_valid_vals) >= 10 and isinstance(uniq_valid_vals[0], numbers.Real):
            numerical_var.append(col)
categorical_var = [i for i in allFeatures if i not in numerical_var]

# For each variable, check the share of its most frequent value and record that value.
records_count = allData.shape[0]
# col_most_values, e.g. {'UserInfo_19': 0.909, ...}: share of the most frequent value per column
# col_large_value, e.g. {'UserInfo_19': '广东省', ...}: the most frequent value itself
col_most_values, col_large_value = {}, {}
for col in allFeatures:
    # value_count holds the frequency of each value of the column, e.g. for UserInfo_19:
    #     山西省              945
    #     广东省             2405
    #     广西壮族自治区      1198
    #     新疆维吾尔自治区     204
    value_count = allData[col].groupby(allData[col]).count()
    col_most_values[col] = max(value_count) / records_count
    # Note: the index produced by groupby holds the values themselves (e.g. the province names),
    # not positions, so col_large_value stores the most frequent *value* of the column.
    # value_count[value_count == max(value_count)] only filters out the single row where the
    # condition is True; its index is e.g. Float64Index([5.0]), hence the trailing [0].
    large_value = value_count[value_count == max(value_count)].index[0]
    col_large_value[col] = large_value

col_most_values_df = pd.DataFrame.from_dict(col_most_values, orient='index')
col_most_values_df.columns = ['max percent']
col_most_values_df = col_most_values_df.sort_values(by='max percent', ascending=False)
pcnt = list(col_most_values_df[:500]['max percent'])
vars = list(col_most_values_df[:500].index)
plt.bar(range(len(pcnt)), height=pcnt)
plt.title('Largest Percentage of Single Value in Each Variable')
plt.show()
# Top of the resulting table, for reference:
#                     max percent
# WeblogInfo_13          0.999700
# SocialNetwork_11       0.999600
# WeblogInfo_55          0.999433

# For the columns whose most frequent value covers at least 90% of the records,
# check whether the minority values have a significantly higher bad rate than the majority value.
large_percent_cols = list(col_most_values_df[col_most_values_df['max percent'] >= 0.9].index)
bad_rate_diff = {}
for col in large_percent_cols:
    large_value = col_large_value[col]
    temp = allData[[col, 'target']].copy()
    temp[col] = temp.apply(lambda x: int(x[col] == large_value), axis=1)
    bad_rate = temp.groupby(col).mean()
    if bad_rate.iloc[0]['target'] == 0:
        bad_rate_diff[col] = 0
        continue
    bad_rate_diff[col] = np.log(bad_rate.iloc[0]['target'] / bad_rate.iloc[1]['target'])

bad_rate_diff_sorted = sorted(bad_rate_diff.items(), key=lambda x: x[1], reverse=True)
bad_rate_diff_sorted_values = [x[1] for x in bad_rate_diff_sorted]
plt.bar(x=range(len(bad_rate_diff_sorted_values)), height=bad_rate_diff_sorted_values)
plt.title("log of bad rate ratio in variables in which 90% is a single value")
plt.show()

# None of the minority values has a significantly higher bad rate than the majority value,
# so these variables can simply be dropped.
for col in large_percent_cols:
    if col in numerical_var:
        numerical_var.remove(col)
    else:
        categorical_var.remove(col)
    del allData[col]

'''
For categorical variables: delete the column if more than 80% is missing,
otherwise treat the missing value as a special category.
'''
missing_pcnt_threshould_1 = 0.8
for col in list(categorical_var):   # iterate over a copy because categorical_var is modified inside the loop
    missingRate = MissingCategorial(allData, col)
    print('{0} has missing rate as {1}'.format(col, missingRate))
    if missingRate > missing_pcnt_threshould_1:
        categorical_var.remove(col)
        del allData[col]
    if 0 < missingRate < missing_pcnt_threshould_1:
        uniq_valid_vals = [i for i in allData[col] if i == i]
        uniq_valid_vals = list(set(uniq_valid_vals))
        if isinstance(uniq_valid_vals[0], numbers.Real):
            missing_position = allData.loc[allData[col] != allData[col]][col].index
            not_missing_sample = [-1] * len(missing_position)
            allData.loc[missing_position, col] = not_missing_sample
        else:
            # In this way we convert NaN to the string 'NAN' instead of np.nan
            allData[col] = allData[col].map(lambda x: str(x).upper())

allData_bk = allData.copy()

'''
Check the numerical variables.
'''
missing_pcnt_threshould_2 = 0.8
deleted_var = []
for col in numerical_var:
    missingRate = MissingContinuous(allData, col)
    print('{0} has missing rate as {1}'.format(col, missingRate))
    if missingRate > missing_pcnt_threshould_2:
        deleted_var.append(col)
        print('we delete variable {} because of its high missing rate'.format(col))
    else:
        if missingRate > 0:
            not_missing = allData.loc[allData[col] == allData[col]][col]
            # makeuped = allData[col].map(lambda x: MakeupRandom(x, list(not_missing)))
            # missing_position is the index of the missing entries
            missing_position = allData.loc[allData[col] != allData[col]][col].index
            # random.sample(list, n) draws n elements from the list without replacement
            not_missing_sample = random.sample(list(not_missing), len(missing_position))
            allData.loc[missing_position, col] = not_missing_sample
            # del allData[col]
            # allData[col] = makeuped
            missingRate2 = MissingContinuous(allData, col)
            print('missing rate after making up is:{}'.format(str(missingRate2)))

if deleted_var != []:
    for col in deleted_var:
        numerical_var.remove(col)
        del allData[col]

allData.to_csv(folderOfData+'allData_1.csv', header=True, encoding='gbk', columns=allData.columns, index=False)
allData = pd.read_csv(folderOfData+'allData_1.csv', header=0, encoding='gbk')

###################################
# Step 3: bin the variables with ChiMerge (chi-square binning)
###################################
# For each categorical variable with more than 5 distinct values, ChiMerge is used to merge its levels.
trainData = pd.read_csv(folderOfData+'allData_1.csv', header=0, encoding='gbk', dtype={'target': np.int64})
allFeatures = list(trainData.columns)
allFeatures.remove('ListingInfo')
allFeatures.remove('target')
# allFeatures.remove('Idx')
print("Starting ChiMerge binning of the variables")

# Split the features into numerical and categorical
numerical_var = []
for var in allFeatures:
    uniq_vals = list(set(trainData[var]))
    if np.nan in uniq_vals:
        uniq_vals.remove(np.nan)
    if len(uniq_vals) >= 10 and isinstance(uniq_vals[0], numbers.Real):
        numerical_var.append(var)
categorical_var = [i for i in allFeatures if i not in numerical_var]

for col in categorical_var:
    # for Chinese characters, upper() is not applicable
    if col not in ['UserInfo_7','UserInfo_9','UserInfo_19']:
        trainData[col] = trainData[col].map(lambda x: str(x).upper())

'''
Categorical variables are handled as follows:
1. If a variable has more than 5 distinct values, encode it with the bad rate (see the sketch below).
2. Otherwise, if any level contains only bad samples or only good samples, merge levels.
'''
deleted_features = []   # processed variables are recorded here so they do not interfere with later modelling
encoded_features = {}   # keep the bad-rate encoding scheme; it is needed for test and production data
merged_features = {}    # keep the merge scheme for categorical variables
var_IV = {}             # save the IV values of the binned features
var_WOE = {}            # save the WOE values
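# BadRateEncoding used in the loop below is imported from scorecard_functions, which is not listed
# in this post. A minimal sketch of the idea, for illustration only (the name bad_rate_encoding_sketch
# is ours and the function is not called by the pipeline): each level of a high-cardinality categorical
# variable is replaced by that level's observed bad rate, so the variable can then be binned like a
# numerical one.
def bad_rate_encoding_sketch(df, col, target='target'):
    bad_rate_map = df.groupby(col)[target].mean().to_dict()              # level -> observed bad rate
    return {'encoding': df[col].map(bad_rate_map), 'bad_rate': bad_rate_map}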
for col in list(categorical_var):   # iterate over a copy because categorical_var is modified inside the loop
    print("Processing categorical variable:")
    print('we are processing {}'.format(col))
    if len(set(trainData[col])) > 5:
        print("this variable has more than 5 levels: encode it with the bad rate, then bin it as a numerical variable")
        print('{} is encoded with bad rate'.format(col))
        col0 = str(col) + '_encoding'

        # (1) compute the bad rate per level and encode the variable with it
        encoding_result = BadRateEncoding(trainData, col, 'target')
        # encoding_result['encoding'] is a Series of per-sample bad rates (e.g. 0.0829, 0.0580, ...);
        # encoding_result['bad_rate'] maps each original level to its bad rate, e.g.
        #   {'-1.0': 0.0, '0.0': 0.0, '1.0': 0.0580, '2.0': 0.0189, '3.0': 0.0826, '4.0': 0.0775,
        #    '5.0': 0.0829, '6.0': 0.0820, '7.0': 0.0796}
        trainData[col0], br_encoding = encoding_result['encoding'], encoding_result['bad_rate']

        # (2) add the encoded variable to the numerical list so that it is chi-square binned later
        numerical_var.append(col0)

        # (3) keep the encoding scheme so the same transformation can be applied to test data
        encoded_features[col] = [col0, br_encoding]

        # (4) drop the original variable
        deleted_features.append(col)
    else:
        print("this categorical variable has 5 or fewer levels")
        bad_bin = trainData.groupby([col])['target'].sum()
        # levels with 0 bad samples need special handling
        if min(bad_bin) == 0:
            print("a level with 0 bad samples was found and must be handled")
            print('{} has 0 bad sample!'.format(col))
            col1 = str(col) + '_mergeByBadRate'
            # (1) find the optimal merge so that every bin contains both good and bad samples
            mergeBin = MergeBad0(trainData, col, 'target')
            # (2) merge the levels accordingly
            trainData[col1] = trainData[col].map(mergeBin)
            maxPcnt = MaximumBinPcnt(trainData, col1)
            # if the merge leaves one bin with more than 90% of the samples, drop the variable
            # (one could instead test whether the remaining minority is significant, as the author notes)
            if maxPcnt > 0.9:
                print("after merging, one bin holds more than 90% of the samples, so the variable is dropped")
                print('{} is deleted because of large percentage of single bin'.format(col))
                deleted_features.append(col)
                categorical_var.remove(col)
                del trainData[col]
                continue
            # (3) if the merged variable is acceptable, keep it
            print("after merging, no bin holds more than 90% of the samples; keep the merged variable")
            merged_features[col] = [col1, mergeBin]
            WOE_IV = CalcWOE(trainData, col1, 'target')
            var_WOE[col1] = WOE_IV['WOE']
            var_IV[col1] = WOE_IV['IV']
            # del trainData[col]
            deleted_features.append(col)
        else:
            print("no level with 0 bad samples; apply the standard WOE calculation")
            WOE_IV = CalcWOE(trainData, col, 'target')
            var_WOE[col] = WOE_IV['WOE']
            var_IV[col] = WOE_IV['IV']

print("Categorical variables done. Now processing numerical variables")
var_cutoff = {}
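# ChiMerge used in the loop below is imported from scorecard_functions and is not listed in this post.
# It merges adjacent bins bottom-up, always merging the pair (or group) with the smallest chi-square
# statistic until the desired number of bins remains. A minimal sketch of that pairwise statistic,
# assuming a binary target (1 = bad); the real implementation may differ in details such as tie
# handling and stopping rules, and this sketch is not called by the pipeline:
def chi2_of_adjacent_bins_sketch(bad_counts, total_counts):
    """bad_counts/total_counts: one entry per bin of the candidate merge group."""
    total = sum(total_counts)
    total_bad = sum(bad_counts)
    overall_bad_rate = total_bad * 1.0 / total
    chi2 = 0.0
    for bad, n in zip(bad_counts, total_counts):
        good = n - bad
        expected_bad = n * overall_bad_rate
        expected_good = n * (1 - overall_bad_rate)
        if expected_bad > 0:
            chi2 += (bad - expected_bad) ** 2 / expected_bad
        if expected_good > 0:
            chi2 += (good - expected_good) ** 2 / expected_good
    return chi2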
for col in list(numerical_var):   # iterate over a copy because numerical_var is modified inside the loop
    """
    This is the most important part of the pipeline.
    """
    print("Processing numerical variable:")
    print("{} is in processing".format(col))
    col1 = str(col) + '_Bin'

    # (1) Bin with ChiMerge and save the cut-off points; e.g. cut points [10, 20, 30] split the
    #     variable into x<10, 10<=x<20, 20<=x<30 and x>=30.
    #     Missing values, coded as -1, do not take part in the binning.
    '''
    Missing values are represented by -1. If present, they are excluded from the binning.
    cutOffPoints holds the split points.
    '''
    if -1 in set(trainData[col]):
        special_attribute = [-1]
    else:
        special_attribute = []
    cutOffPoints = ChiMerge(trainData, col, 'target', special_attribute=special_attribute)
    var_cutoff[col] = cutOffPoints
    trainData[col1] = trainData[col].map(lambda x: AssignBin(x, cutOffPoints, special_attribute=special_attribute))

    # (2) check whether the bad rate is monotone across bins
    BRM = BadRateMonotone(trainData, col1, 'target', special_attribute=special_attribute)
    if not BRM:
        if special_attribute == []:
            bin_merged = Monotone_Merge(trainData, 'target', col1)
            removed_index = []
            for bin in bin_merged:
                if len(bin) > 1:
                    indices = [int(b.replace('Bin ', '')) for b in bin]
                    removed_index = removed_index + indices[0:-1]
            # e.g. removed_index = [0], cutOffPoints = [10750.0, 26020.0, 49531.0, 79315.0],
            #      removed_point = [10750.0]
            removed_point = [cutOffPoints[k] for k in removed_index]
            for p in removed_point:
                cutOffPoints.remove(p)
            var_cutoff[col] = cutOffPoints
            trainData[col1] = trainData[col].map(lambda x: AssignBin(x, cutOffPoints, special_attribute=special_attribute))
        else:
            cutOffPoints2 = [i for i in cutOffPoints if i not in special_attribute]
            temp = trainData.loc[~trainData[col].isin(special_attribute)]
            bin_merged = Monotone_Merge(temp, 'target', col1)
            removed_index = []
            for bin in bin_merged:
                if len(bin) > 1:
                    indices = [int(b.replace('Bin ', '')) for b in bin]
                    # mirror the branch above: drop all but the last cut point of each merged group
                    removed_index = removed_index + indices[0:-1]
            removed_point = [cutOffPoints2[k] for k in removed_index]
            for p in removed_point:
                cutOffPoints2.remove(p)
            cutOffPoints2 = cutOffPoints2 + special_attribute
            var_cutoff[col] = cutOffPoints2
            trainData[col1] = trainData[col].map(lambda x: AssignBin(x, cutOffPoints2, special_attribute=special_attribute))

    maxPcnt = MaximumBinPcnt(trainData, col1)
    if maxPcnt > 0.9:
        # del trainData[col1]
        deleted_features.append(col)
        numerical_var.remove(col)
        print('we delete {} because the maximum bin occupies more than 90%'.format(col))
        continue

    WOE_IV = CalcWOE(trainData, col1, 'target')
    var_IV[col] = WOE_IV['IV']
    var_WOE[col] = WOE_IV['WOE']
    # del trainData[col]

print("Numerical variables done")
print("Results of Step 3 (ChiMerge binning):")
print("var_WOE = ")
print(var_WOE)
print("var_IV = ")
print(var_IV)
print("merged_features = ")
print(merged_features)
trainData.to_csv(folderOfData+'allData_2.csv', header=True, encoding='gbk', columns=trainData.columns, index=False)
print("var_cutoff = ")
print(var_cutoff)

with open(folderOfData+'var_WOE.pkl', "wb") as f:
    f.write(pickle.dumps(var_WOE))
with open(folderOfData+'var_IV.pkl', "wb") as f:
    f.write(pickle.dumps(var_IV))
with open(folderOfData+'var_cutoff.pkl', "wb") as f:
    f.write(pickle.dumps(var_cutoff))
with open(folderOfData+'merged_features.pkl', "wb") as f:
    f.write(pickle.dumps(merged_features))

#########################################
# Step 4: single-variable and multivariate analysis after WOE encoding
#########################################
trainData = pd.read_csv(folderOfData+'allData_2.csv', header=0, encoding='gbk')
with open(folderOfData+'var_WOE.pkl', "rb") as f:
    var_WOE = pickle.load(f)
with open(folderOfData+'var_IV.pkl', "rb") as f:
    var_IV = pickle.load(f)
with open(folderOfData+'var_cutoff.pkl', "rb") as f:
    var_cutoff = pickle.load(f)
with open(folderOfData+'merged_features.pkl', "rb") as f:
    merged_features = pickle.load(f)

# Convert fields that look numerical but are in fact categorical into strings
num2str = ['SocialNetwork_13','SocialNetwork_12','UserInfo_6','UserInfo_5','UserInfo_10','UserInfo_11','UserInfo_12','UserInfo_13','UserInfo_17']
for col in num2str:
    trainData[col] = trainData[col].map(lambda x: str(x))
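# AssignBin, used both above and in the loop below, is imported from scorecard_functions and is not
# listed in this post. It maps a raw value onto a bin label such as 'Bin 0', 'Bin 1', ... given the
# ChiMerge cut-off points, with special values (here -1 for missing) going to their own bin.
# A rough sketch of that behaviour, for illustration only; the exact labels and edge handling come
# from the real implementation, and this sketch is not called by the pipeline:
def assign_bin_sketch(x, cutOffPoints, special_attribute=[]):
    if x in special_attribute:
        return 'Bin {}'.format(-(special_attribute.index(x) + 1))
    cut_points = [p for p in cutOffPoints if p not in special_attribute]
    for i, point in enumerate(cut_points):
        if x <= point:
            return 'Bin {}'.format(i)
    return 'Bin {}'.format(len(cut_points))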
for col in var_WOE.keys():
    print(col)
    col2 = str(col) + "_WOE"
    if col in var_cutoff.keys():
        cutOffPoints = var_cutoff[col]
        special_attribute = []
        if -1 in cutOffPoints:
            special_attribute = [-1]
        # numerical variables were binned with the ChiMerge cut points, so each sample
        # first has to be assigned to its bin, then mapped to the bin's WOE
        binValue = trainData[col].map(lambda x: AssignBin(x, cutOffPoints, special_attribute=special_attribute))
        trainData[col2] = binValue.map(lambda x: var_WOE[col][x])
    else:
        trainData[col2] = trainData[col].map(lambda x: var_WOE[col][x])

trainData.to_csv(folderOfData+'allData_3.csv', header=True, encoding='gbk', columns=trainData.columns, index=False)

### (i) keep the variables whose IV exceeds the threshold
trainData = pd.read_csv(folderOfData+'allData_3.csv', header=0, encoding='gbk')
all_IV = list(var_IV.values())
all_IV = sorted(all_IV, reverse=True)
plt.bar(x=range(len(all_IV)), height=all_IV)
plt.title("IV sorted")
plt.show()
iv_threshould = 0.02
varByIV = [k for k, v in var_IV.items() if v > iv_threshould]

### (ii) check the pairwise linear correlations of the WOE-encoded variables
var_IV_selected = {k: var_IV[k] for k in varByIV}
var_IV_sorted = sorted(var_IV_selected.items(), key=lambda d: d[1], reverse=True)
var_IV_sorted = [i[0] for i in var_IV_sorted]

removed_var = []
roh_thresould = 0.6
for i in range(len(var_IV_sorted)-1):
    if var_IV_sorted[i] not in removed_var:
        x1 = var_IV_sorted[i] + "_WOE"
        for j in range(i+1, len(var_IV_sorted)):
            if var_IV_sorted[j] not in removed_var:
                x2 = var_IV_sorted[j] + "_WOE"
                roh = np.corrcoef([trainData[x1], trainData[x2]])[0, 1]
                if abs(roh) >= roh_thresould:
                    print('the correlation coefficient between {0} and {1} is {2}'.format(x1, x2, str(roh)))
                    # keep the variable with the larger IV and drop the other one
                    if var_IV[var_IV_sorted[i]] > var_IV[var_IV_sorted[j]]:
                        removed_var.append(var_IV_sorted[j])
                    else:
                        removed_var.append(var_IV_sorted[i])

var_IV_sortet_2 = [i for i in var_IV_sorted if i not in removed_var]
### (iii) check whether any variable has VIF > 10 against all the other variables
# the DataFrame columns are converted to arrays for the auxiliary regression
for i in range(len(var_IV_sortet_2)):
    x0 = trainData[var_IV_sortet_2[i] + '_WOE']
    x0 = np.array(x0)
    # At this point var_IV_sortet_2 held roughly 30 surviving variables, e.g.
    # 'ThirdParty_Info_Period2_6', 'ThirdParty_Info_Period6_6', 'UserInfo_14_encoding', ...,
    # 'LogInfo1_30_avg_count', 'WeblogInfo_5', 'UserInfo_12';
    # X_Col is the same list with the '_WOE' suffix, excluding the variable currently being tested.
    X_Col = [k + '_WOE' for k in var_IV_sortet_2 if k != var_IV_sortet_2[i]]
    X = trainData[X_Col]
    X = np.array(X)   # np.matrix is deprecated; a plain array works for LinearRegression
    regr = LinearRegression()
    clr = regr.fit(X, x0)
    x_pred = clr.predict(X)
    R2 = 1 - ((x_pred - x0) ** 2).sum() / ((x0 - x0.mean()) ** 2).sum()
    vif = 1 / (1 - R2)
    if vif > 10:
        print("Warning: the vif for {0} is {1}".format(var_IV_sortet_2[i], vif))

#########################
# Step 5: fit the logistic regression model
#########################
# var_IV_sortet_2 holds the names of the selected variables
multi_analysis = [i + '_WOE' for i in var_IV_sortet_2]
y = trainData['target']
X = trainData[multi_analysis].copy()
X['intercept'] = [1] * X.shape[0]

LR = sm.Logit(y, X).fit()
summary = LR.summary2()
pvals = LR.pvalues.to_dict()
params = LR.params.to_dict()

# Some variables are not significant, so their significance is checked individually:
'''
Pull each such variable out on its own and check whether it is significant in a univariate regression.
'''
varLargeP = {k: v for k, v in pvals.items() if v >= 0.1}
varLargeP = sorted(varLargeP.items(), key=lambda d: d[1], reverse=True)
varLargeP = [i[0] for i in varLargeP]
p_value_list = {}
for var in varLargeP:
    X_temp = trainData[var].copy().to_frame()
    X_temp['intercept'] = [1] * X_temp.shape[0]
    LR = sm.Logit(y, X_temp).fit()
    p_value_list[var] = LR.pvalues[var]
for k, v in p_value_list.items():
    print("{0} has p-value of {1} in univariate regression".format(k, v))

# Some coefficients come out positive; with this WOE convention they are expected to be negative,
# so each offending variable is also checked in a univariate regression:
varPositive = [k for k, v in params.items() if v >= 0]
coef_list = {}
for var in varPositive:
    X_temp = trainData[var].copy().to_frame()
    X_temp['intercept'] = [1] * X_temp.shape[0]
    LR = sm.Logit(y, X_temp).fit()
    coef_list[var] = LR.params[var]
for k, v in coef_list.items():
    print("{0} has coefficient of {1} in univariate regression".format(k, v))

# Forward selection: add one variable at a time and keep it only if all p-values stay below 0.1
# and all coefficients stay negative.
selected_var = [multi_analysis[0]]
for var in multi_analysis[1:]:
    try_vars = selected_var + [var]
    X_temp = trainData[try_vars].copy()
    X_temp['intercept'] = [1] * X_temp.shape[0]
    LR = sm.Logit(y, X_temp).fit()
    # summary = LR.summary2()
    pvals, params = LR.pvalues, LR.params
    del params['intercept']
    if max(pvals) < 0.1 and max(params) < 0:
        selected_var.append(var)
# After the loop, 18 WOE variables survive (try_vars has length 18) and X_temp holds those
# variables plus the intercept, i.e. 30000 rows x 19 columns; y is the 30000-row target Series.
print(LR.summary2())
y_pred = LR.predict(X_temp)
y_result = pd.DataFrame({'y_pred': y_pred, 'y_real': list(trainData['target'])})
roc_auc_score_result = roc_auc_score(trainData['target'], y_pred)
print("roc_auc_score_result = ", roc_auc_score_result)

#################
# Step 6: scale the predicted probabilities into scores
#################
plt.show()
scores = Prob2Score(y_pred, 200, 100)
# scores.to_excel("scores.xlsx")
plt.title("Score Distribution")
plt.xlabel("Score")
plt.ylabel("Quantity")
plt.hist(scores, bins=100)
# plt.title("score-transfer")
plt.show()
print("Score conversion finished!")
print("Step 6 done!")

#######################
# Step 7: generate the ROC curve and other evaluation metrics
#######################
print("Starting Step 7")
import warnings
warnings.filterwarnings('ignore')

def proba_conversion(x, threshold3=0.3):
    if x >= threshold3:
        return 1
    else:
        return 0

scores = Prob2Score(y_pred, 200, 100)
trainData['predication'] = scores
scorecard = pd.DataFrame({'y_pred': y_pred, 'y_real': list(trainData['target']), 'score': scores})
ks_result = KS(scorecard, 'score', 'y_real')
print("ks_result = ", ks_result)
# ROC_AUC(df, score, target)
roc_auc_score_result2 = ROC_AUC(trainData, 'predication', 'target')
# roc_auc_score_result2 = ROC_AUC(trainData, 'y_pred', 'target')
# the same value can also be obtained directly from sklearn
roc_auc_score_result_directly = roc_auc_score(trainData['target'], y_pred)
print("roc_auc_score = ", roc_auc_score_result_directly)
# roc_auc_score_result = roc_auc_score(trainData['target'], y_pred)

from sklearn.metrics import confusion_matrix, precision_recall_curve
import itertools
plt.rcParams['font.sans-serif'] = ['SimHei']

def plot_precision_recall():
    # expects the globals `precision` and `recall` (e.g. from precision_recall_curve) to have been computed
    plt.step(recall, precision, color='b', alpha=0.2, where='post')
    plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
    plt.plot(recall, precision, linewidth=2)
    plt.xlim([0.0, 1])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall curve')
    plt.show()

def show_metrics():
    # expects the global confusion matrix `cm` (e.g. from confusion_matrix) to have been computed
    tp = cm[1, 1]
    fn = cm[1, 0]
    fp = cm[0, 1]
    tn = cm[0, 0]
    print('Precision: {:.3f}'.format(tp / (tp + fp)))
    print('Recall: {:.3f}'.format(tp / (tp + fn)))
    print('F1 score: {:.3f}'.format(2 * (((tp/(tp+fp)) * (tp/(tp+fn))) / ((tp/(tp+fp)) + (tp/(tp+fn))))))

def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.figure()
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j], horizontalalignment='center',
                 color='white' if cm[i, j] > thresh else 'black')
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

print("Done")
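Prob2Score and KS above also come from the external scorecard_functions module, which is not listed in this post. For readers who do not have that module, the sketches below show the conventional definitions they are assumed to follow: a log-odds score scale of the form score = basePoint + factor * (-ln(odds)), where the call Prob2Score(y_pred, 200, 100) is assumed to pass a base score and a scaling parameter, and the Kolmogorov–Smirnov statistic as the maximum gap between the cumulative distributions of bads and goods ranked by score. These are sketches under those assumptions, with our own function names, not the author's exact implementations.

import numpy as np
import pandas as pd

def prob_to_score_sketch(prob, basePoint=200, PDO=100):
    """Map predicted bad probability to a score; higher score = lower risk (sketch only)."""
    odds = prob / (1 - prob)                      # odds of being bad
    return basePoint + PDO / np.log(2) * (-np.log(odds))

def ks_stat_sketch(df, score_col, target_col):
    """KS = max |cum% of bads - cum% of goods| when the records are sorted by score (sketch only)."""
    sorted_df = df.sort_values(by=score_col)
    total_bad = sorted_df[target_col].sum()
    total_good = sorted_df.shape[0] - total_bad
    cum_bad = sorted_df[target_col].cumsum() / total_bad
    cum_good = (1 - sorted_df[target_col]).cumsum() / total_good
    return (cum_bad - cum_good).abs().max()

Both sketches can be cross-checked against the AUC that the script already prints via roc_auc_score, since any monotone score transformation leaves AUC and KS unchanged.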
3. Results
The figures below show the output produced when the code is run in a Jupyter Notebook.
4. Detailed Run Walkthrough
For the detailed run walkthrough, follow the account abcgz on Gitee (https://gitee.com/).