使用 pandas 读取特征数据,并处理数据中的双引号;
使用 xgboost 训练一版模型。
依赖版本:xgboost==1.6.2
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import csv
import logging
import os

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

model_version = "v101"
root_path = "/home/.../model/xgboost_tool"


class DataProcess(object):
    """Load tab-separated train/test feature files and split them into X / Y.

    Both files are read without a header row. Column 0 is used as the integer
    label, column 1 is not used, and columns 2 onward are used as float
    features.
    """

    def __init__(self, train_path, test_path):
        """Read both files and build numeric copies of them.

        Args:
            train_path: Path of the tab-separated training file.
            test_path: Path of the tab-separated test file.
        """
        self.train_data = self._load_raw(train_path)
        self.train_df = self._to_numeric(self.train_data)
        self.test_data = self._load_raw(test_path)
        self.test_df = self._to_numeric(self.test_data)

    @staticmethod
    def _load_raw(path):
        """Read one header-less TSV file and blank out cells equal to '""'."""
        raw = pd.read_csv(path, header=None, sep="\t",
                          quoting=csv.QUOTE_MINIMAL, escapechar='\\')
        # Cells that still hold a literal pair of double quotes carry no value.
        return raw.replace('""', '')

    @staticmethod
    def _to_numeric(raw):
        """Convert every column to numbers; unparseable cells become NaN."""
        return raw.apply(pd.to_numeric, errors='coerce')

    def data_process(self, mode="train"):
        """Return the arrays for the requested mode.

        Args:
            mode: "train" returns the train/dev split; any other value
                returns the test set.

        Returns:
            (X_train, X_dev, Y_train, Y_dev) when mode is "train",
            otherwise (X_test, Y_test).
        """
        if mode == "train":
            return self.train_data_process()
        return self.test_data_process()

    def train_data_process(self):
        """Split the training frame 80/20 into train and dev parts.

        Returns:
            (X_train, X_dev, Y_train, Y_dev) as produced by train_test_split.

        Raises:
            ValueError: Raised by pandas if a label in column 0 is NaN
                (e.g. it was non-numeric) and cannot be cast to int.
        """
        data_X = self.train_df.iloc[:, 2:].astype(float)
        data_Y = self.train_df.iloc[:, 0].astype(int)
        return train_test_split(data_X, data_Y, test_size=0.2)

    def test_data_process(self):
        """Return (X_test, Y_test) taken from the test frame.

        Raises:
            ValueError: Raised by pandas if a label in column 0 is NaN
                and cannot be cast to int.
        """
        X_test = self.test_df.iloc[:, 2:].astype(float)
        Y_test = self.test_df.iloc[:, 0].astype(int)
        return X_test, Y_test


def xgb_fit_single(X_train, Y_train, X_test, Y_test):
    """Train a binary-logistic XGBClassifier and save it as a JSON file.

    Args:
        X_train: Training features.
        Y_train: Training labels.
        X_test: Features of the evaluation set monitored during training.
        Y_test: Labels of the evaluation set monitored during training.

    Returns:
        The fitted XGBClassifier. The model is also written to
        "<root_path>/saved_model/<rounds>_<depth>_<lr>_<model_version>.json".
    """
    logging.info("Train model start...")
    # Number of boosting rounds (decision trees).
    num_round = 10
    max_depth = 5
    learning_rate = 0.1
    model = XGBClassifier(
        max_depth=max_depth,
        learning_rate=learning_rate,
        n_estimators=num_round,
        # `silent` is no longer an estimator parameter; `verbosity=0` is the
        # supported way to request a quiet booster.
        verbosity=0,
        objective='binary:logistic',
        # Passing eval_metric to fit() is deprecated since xgboost 1.6, so it
        # is configured on the estimator instead.
        eval_metric=["auc", "logloss"],
    )
    model.fit(X_train, Y_train, eval_set=[(X_test, Y_test)], verbose=True)

    model_name = f"{num_round}_{max_depth}_{learning_rate}_{model_version}.json"
    save_dir = f"{root_path}/saved_model"
    # save_model does not create missing directories.
    os.makedirs(save_dir, exist_ok=True)
    model.save_model(f"{save_dir}/{model_name}")
    return model


if __name__ == '__main__':
    # Without a configured handler/level the INFO messages are never shown.
    logging.basicConfig(level=logging.INFO)
    train_file_path = "/home/.../train.csv"
    test_file_path = "/home/.../test.csv"
    dp = DataProcess(train_file_path, test_file_path)
    X_train, X_dev, Y_train, Y_dev = dp.data_process(mode="train")
    print("Start xgboost training")
    model = xgb_fit_single(X_train, Y_train, X_dev, Y_dev)