1. Introduction to SMOTE (Synthetic Minority Oversampling Technique)
SMOTE (Synthetic Minority Oversampling Technique) is an oversampling method that synthesizes new minority-class samples: it generates additional positive examples by interpolating between the positive examples already present in the training set.
Basic idea:
Analyze the minority-class samples, interpolate between existing minority-class samples to artificially synthesize new ones, and add the synthetic samples to the dataset for training.
SMOTE is currently a standard technique for handling imbalanced data and is widely accepted in both academia and industry.
2. Basic steps of the SMOTE algorithm
1. Use a nearest-neighbor algorithm to compute the K nearest neighbors of each minority-class sample;
2. Randomly pick N of those K neighbors and perform random linear interpolation;
3. Construct new minority-class samples from the interpolated points;
4. Merge the new samples with the original data to produce a new training set (see the sketch below).
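To make the interpolation step concrete, here is a minimal NumPy sketch of the core SMOTE update, x_new = x + λ·(x_nn − x) with λ drawn uniformly from [0, 1). The function name `smote_sketch` and the use of scikit-learn's NearestNeighbors are illustrative assumptions, not part of the original post; the real implementation used later is imblearn's SMOTE.

```python
import numpy as np
from sklearn.neighbors import NearestNeighbors

def smote_sketch(X_min, n_new, k=5, seed=0):
    """Illustrative SMOTE core: interpolate between minority-class samples
    and their k nearest minority-class neighbors. X_min: (n, d) array."""
    rng = np.random.default_rng(seed)
    # k+1 neighbors because each point is its own nearest neighbor
    nn = NearestNeighbors(n_neighbors=k + 1).fit(X_min)
    _, idx = nn.kneighbors(X_min)
    new_samples = []
    for _ in range(n_new):
        i = rng.integers(len(X_min))    # pick a minority sample at random
        j = rng.choice(idx[i][1:])      # pick one of its k neighbors
        lam = rng.random()              # interpolation factor in [0, 1)
        new_samples.append(X_min[i] + lam * (X_min[j] - X_min[i]))
    return np.array(new_samples)
```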
3. A SMOTE case study
Because SMOTE interpolates in the raw sample space, it amplifies any noise and outliers in the dataset, so the training samples must be cleaned first. Here we fit the data with LightGBM and down-weight the samples it predicts poorly, so that they are excluded from the SMOTE interpolation; the utility class in section 4 implements this.
Reference: 不均衡学习和异常检测_正负样本不均衡异常检测 (CSDN blog).
4. A utility class for SMOTE sampling
4.1 lgb: cleaning samples with LightGBM
1. Sample weighting: flag the samples the model cannot classify reliably
```python
import numpy as np
import pandas as pd

class imbalanceData():
    """Utility class for handling imbalanced data."""

    def __init__(self, train, test, mmin, mmax, label, lis=[]):
        # lis: columns excluded from modeling
        self.label = label
        self.train_x = train.drop([label] + lis, axis=1)
        self.train_y = train[label]
        self.test_x = test.drop([label] + lis, axis=1)
        self.test_y = test[label]
        self.columns = list(self.train_x.columns)
        self.keep = self.columns + [self.label]
        self.mmin = mmin  # misclassification share of the low-score band, e.g. 0.1
        self.mmax = mmax  # misclassification share of the high-score band, e.g. 0.7

    # Down-weight samples that are badly misclassified at the head
    # and tail of the score ranking.
    def weight(self, x, y):
        # x: true label; y: rank quantile (0 = highest predicted bad probability)
        if x == 0 and y < self.mmin:
            return 0.1  # good sample scored as very high risk: noise, excluded from oversampling
        elif x == 1 and y > self.mmax:
            return 0.1  # bad sample scored as very low risk: noise, excluded from oversampling
        else:
            return 1    # reliably predicted sample, participates in oversampling
```
2. Sample selection
Use a LightGBM model together with the weight function to select samples, keeping only the accurately predicted portion for the subsequent SMOTE oversampling.
```python
    def data_cleaning(self):
        lgb_model, lgb_auc = self.lgb_test()
        sample = self.train_x.copy()
        sample[self.label] = self.train_y
        sample['pred'] = lgb_model.predict_proba(self.train_x)[:, 1]
        # Rank samples by predicted risk, descending; 'rank' is the quantile position
        sample = sample.sort_values(by=['pred'], ascending=False).reset_index()
        sample['rank'] = np.array(sample.index) / len(sample)
        sample['weight'] = sample.apply(lambda x: self.weight(x[self.label], x['rank']), axis=1)
        smote_sample = sample[sample.weight == 1][self.keep]   # reliable samples, go into SMOTE
        drop_sample = sample[sample.weight < 1][self.keep]     # noisy samples, set aside
        train_x_smote = smote_sample[self.columns]
        train_y_smote = smote_sample[self.label]
        return train_x_smote, train_y_smote, drop_sample
```
3. SMOTE oversampling
Oversample only the cleaned subset of samples; the down-weighted samples are appended back afterwards without being oversampled.
```python
    def apply_smote(self):
        train_x_smote, train_y_smote, drop_sample = self.data_cleaning()
        rex, rey = self.smote(train_x_smote, train_y_smote)
        print('badpctn:', rey.sum() / len(rey))  # bad rate after resampling: 0.5
        df_rex = pd.DataFrame(rex)
        df_rex.columns = self.columns
        df_rex['weight'] = 1
        df_rex[self.label] = rey
        # DataFrame.append was removed in pandas 2.0; use pd.concat
        df_aff_smote = pd.concat([df_rex, drop_sample], ignore_index=True)
        return df_aff_smote, rex, rey
```
4. Defining the LightGBM helper
```python
    def lgb_test(self):
        import lightgbm as lgb
        clf = lgb.LGBMClassifier(boosting_type='gbdt',
                                 objective='binary',
                                 metric='auc',
                                 learning_rate=0.1,
                                 n_estimators=24,
                                 max_depth=4,
                                 num_leaves=25,
                                 max_bin=40,
                                 min_data_in_leaf=5,
                                 bagging_fraction=0.6,
                                 bagging_freq=0,
                                 feature_fraction=0.8)
        clf.fit(self.train_x, self.train_y,
                eval_set=[(self.train_x, self.train_y),
                          (self.test_x, self.test_y)],
                eval_metric='auc')
        # 'valid_1' is the second eval_set entry, i.e. the test set
        return clf, clf.best_score_['valid_1']['auc']
```
5. Calling the SMOTE implementation from imblearn
```python
    def smote(self, train_x_smote, train_y_smote, K=15, random_state=0):
        from imblearn.over_sampling import SMOTE
        smote = SMOTE(k_neighbors=K, n_jobs=1, random_state=random_state)
        rex, rey = smote.fit_resample(train_x_smote, train_y_smote)
        return rex, rey
```
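Putting the pieces together, usage might look like the sketch below. The DataFrame names train_df/test_df, the label column 'bad_ind', and the excluded column 'uid' are illustrative assumptions, not names from the original post.

```python
# Assumed setup: train_df/test_df are pandas DataFrames with a binary
# label column 'bad_ind' and an identifier column 'uid' to exclude.
tool = imbalanceData(train_df, test_df, mmin=0.1, mmax=0.7,
                     label='bad_ind', lis=['uid'])
df_aff_smote, rex, rey = tool.apply_smote()
print(df_aff_smote.shape)  # cleaned + synthetic samples, noisy samples appended back
```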
4.2 lr: logistic regression on the resampled data
```python
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve

# feature_lst: the list of modeling features, assumed defined earlier;
# rex, rey are the resampled features/labels returned by apply_smote above
x_smote = rex[feature_lst]
y_smote = rey
lr_model = LogisticRegression(C=0.1)
lr_model.fit(x_smote, y_smote)

x = train[feature_lst]
y = train['bad_ind']      # label column, kept consistent with the validation set below
val_x = val[feature_lst]
val_y = val['bad_ind']

y_pred = lr_model.predict_proba(x)[:, 1]
fpr_lr_train, tpr_lr_train, _ = roc_curve(y, y_pred)  # TPR and FPR on the training set
train_ks = abs(fpr_lr_train - tpr_lr_train).max()     # training-set KS
print('train_ks : ', train_ks)

y_pred = lr_model.predict_proba(val_x)[:, 1]          # validation-set predictions
fpr_lr, tpr_lr, _ = roc_curve(val_y, y_pred)          # TPR and FPR on the validation set
val_ks = abs(fpr_lr - tpr_lr).max()                   # validation-set KS
print('val_ks : ', val_ks)
```
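To visualize the train/validation gap behind the two KS values, a common follow-up is plotting both ROC curves. This matplotlib sketch is an assumed addition, not part of the original post; it reuses the fpr/tpr arrays computed above.

```python
import matplotlib.pyplot as plt

# Plot both ROC curves and annotate the KS statistics computed above
plt.plot(fpr_lr_train, tpr_lr_train, label='train ROC')
plt.plot(fpr_lr, tpr_lr, label='validation ROC')
plt.plot([0, 1], [0, 1], 'k--')  # random-classifier diagonal
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('train_ks=%.3f  val_ks=%.3f' % (train_ks, val_ks))
plt.legend(loc='lower right')
plt.show()
```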