Kaggle — Prediction of Obesity Risk

使用 LGBMClassifier

# https://www.kaggle.com/code/ddosad/ps4e2-visual-eda-lgbm-obesity-risk
# Tune and train an LGBMClassifier for the Kaggle "Prediction of Obesity
# Risk" competition (playground-series-s4e2) and write submission_lgb.csv.
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV

DATA_DIR = 'D:\\python\\dataset\\PredictionOfObesityRisk\\playground-series-s4e2'

df_train = pd.read_csv(DATA_DIR + '\\train.csv')
original = pd.read_csv(DATA_DIR + '\\ObesityDataSet.csv')  # original dataset, used to augment training
df_test = pd.read_csv(DATA_DIR + '\\test.csv')

# Augment the competition data with the original dataset; 'id' is a synthetic
# key — drop it before de-duplicating so otherwise-identical rows collapse.
df_train = pd.concat([df_train, original]).drop(['id'], axis=1).drop_duplicates()

test_ids = df_test['id']  # kept for the submission file (avoid shadowing builtin `id`)
df_test = df_test.drop(['id'], axis=1)

# Target label <-> integer code mappings (7 obesity classes).
obesityTypeDict = {'Insufficient_Weight': 0, 'Normal_Weight': 1, 'Obesity_Type_I': 2,
                   'Obesity_Type_II': 3, 'Obesity_Type_III': 4,
                   'Overweight_Level_I': 5, 'Overweight_Level_II': 6}
obesityNumDict = {v: k for k, v in obesityTypeDict.items()}  # inverse map for decoding predictions

# Feature engineering: body-mass index (weight / height^2).
# (The original code misspelled the column 'BIM'; it is internal only.)
df_train['BMI'] = df_train['Weight'] / (df_train['Height'] ** 2)
df_test['BMI'] = df_test['Weight'] / (df_test['Height'] ** 2)

Y = df_train['NObeyesdad'].map(obesityTypeDict)
X = pd.get_dummies(df_train.drop(['NObeyesdad'], axis=1), drop_first=False)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# 5-fold cross-validated grid search over LGBM hyper-parameters, scored on accuracy.
model3 = LGBMClassifier()
parameters3 = {"n_estimators": [100, 200, 300, 400, 500],
               "learning_rate": [0.01, 0.05, 0.1, 0.5, 1],
               "random_state": [42],
               "num_leaves": [16, 17, 18]}
grid_search3 = GridSearchCV(model3, parameters3, cv=5, n_jobs=-1, scoring='accuracy')
grid_search3.fit(X_train, y_train)
print(grid_search3.best_score_)
best_parameters3 = grid_search3.best_params_
print(f"best_parameters3: {best_parameters3}")

# Refit with the best parameters and report hold-out accuracy
# (the original left the accuracy_score result unused; print it).
model3 = LGBMClassifier(**best_parameters3)
model3.fit(X_train, y_train)
X_test_pred3 = model3.predict(X_test)
print(accuracy_score(y_test, X_test_pred3))

# One-hot encode the test set and ALIGN its columns with the training matrix:
# calling get_dummies on train and test separately can otherwise produce
# mismatched or mis-ordered columns when a category is missing on one side.
df_test = pd.get_dummies(df_test, drop_first=False).reindex(columns=X.columns, fill_value=0)

df_preds = pd.DataFrame({'id': test_ids, 'NObeyesdad': model3.predict(df_test)})
df_preds['NObeyesdad'] = df_preds['NObeyesdad'].map(obesityNumDict)
df_preds.to_csv('submission_lgb.csv', index=False)

使用 XGBClassifier

# https://www.kaggle.com/code/ddosad/ps4e2-visual-eda-lgbm-obesity-risk
# Tune and train an XGBClassifier for the Kaggle "Prediction of Obesity
# Risk" competition (playground-series-s4e2) and write submission_xgb.csv.
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier

DATA_DIR = 'D:\\python\\dataset\\PredictionOfObesityRisk\\playground-series-s4e2'

df_train = pd.read_csv(DATA_DIR + '\\train.csv')
original = pd.read_csv(DATA_DIR + '\\ObesityDataSet.csv')  # original dataset, used to augment training
df_test = pd.read_csv(DATA_DIR + '\\test.csv')

# Augment the competition data with the original dataset; 'id' is a synthetic
# key — drop it before de-duplicating so otherwise-identical rows collapse.
df_train = pd.concat([df_train, original]).drop(['id'], axis=1).drop_duplicates()

test_ids = df_test['id']  # kept for the submission file (avoid shadowing builtin `id`)
df_test = df_test.drop(['id'], axis=1)

# Target label <-> integer code mappings (7 obesity classes).
obesityTypeDict = {'Insufficient_Weight': 0, 'Normal_Weight': 1, 'Obesity_Type_I': 2,
                   'Obesity_Type_II': 3, 'Obesity_Type_III': 4,
                   'Overweight_Level_I': 5, 'Overweight_Level_II': 6}
obesityNumDict = {v: k for k, v in obesityTypeDict.items()}  # inverse map for decoding predictions

# Feature engineering: body-mass index (weight / height^2).
df_train['BMI'] = df_train['Weight'] / (df_train['Height'] ** 2)
df_test['BMI'] = df_test['Weight'] / (df_test['Height'] ** 2)

Y = df_train['NObeyesdad'].map(obesityTypeDict)
X = pd.get_dummies(df_train.drop(['NObeyesdad'], axis=1), drop_first=False)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

model3 = XGBClassifier()
parameters3 = {
    "n_estimators": [50, 100, 150, 200, 300],  # number of boosting rounds (trees)
    "learning_rate": [0.05, 0.1, 0.2, 0.3],    # BUGFIX: was [0.05, 0.1, 0, 2, 0.3] — "0, 2" is a typo for 0.2
    "max_depth": [3, 4, 5, 6, 7],              # maximum tree depth
    "colsample_bytree": [0.4, 0.6, 0.8, 1],    # fraction of columns sampled per tree
    "min_child_weight": [1, 2, 3, 4],          # minimum sum of instance weight in a leaf
}
# 5-fold cross-validated grid search scored on accuracy.
# (RandomizedSearchCV is a cheaper alternative for a grid this large.)
grid_search3 = GridSearchCV(model3, parameters3, cv=5, n_jobs=-1, scoring='accuracy')
grid_search3.fit(X_train, y_train)
print(grid_search3.best_score_)
best_parameters3 = grid_search3.best_params_
print(best_parameters3)

# Refit with the best parameters and report hold-out accuracy
# (the original left the accuracy_score result unused; print it).
model3 = XGBClassifier(**best_parameters3)
model3.fit(X_train, y_train)
X_test_pred3 = model3.predict(X_test)
print(accuracy_score(y_test, X_test_pred3))

# One-hot encode the test set and ALIGN its columns with the training matrix:
# calling get_dummies on train and test separately can otherwise produce
# mismatched or mis-ordered columns when a category is missing on one side.
df_test = pd.get_dummies(df_test, drop_first=False).reindex(columns=X.columns, fill_value=0)

df_preds = pd.DataFrame({'id': test_ids, 'NObeyesdad': model3.predict(df_test)})
df_preds['NObeyesdad'] = df_preds['NObeyesdad'].map(obesityNumDict)
df_preds.to_csv('submission_xgb.csv', index=False)  # BUGFIX: filename typo was 'submission_xbg.csv'

使用stacking

# https://www.kaggle.com/code/ddosad/ps4e2-visual-eda-lgbm-obesity-risk
# Compare several classifiers on the Kaggle "Prediction of Obesity Risk"
# data, tune LGBM / XGBoost / RandomForest with grid search, combine the
# tuned models with a StackingClassifier, and write submission1.csv.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import ydata_profiling
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

DATA_DIR = 'D:\\python\\dataset\\PredictionOfObesityRisk\\playground-series-s4e2'

df_train = pd.read_csv(DATA_DIR + '\\train.csv')
original = pd.read_csv(DATA_DIR + '\\ObesityDataSet.csv')  # original dataset, used to augment training
df_test = pd.read_csv(DATA_DIR + '\\test.csv')

# Augment the competition data with the original dataset; 'id' is a synthetic
# key — drop it before de-duplicating so otherwise-identical rows collapse.
df_train = pd.concat([df_train, original]).drop(['id'], axis=1).drop_duplicates()

test_ids = df_test['id']  # kept for the submission file (avoid shadowing builtin `id`)
df_test = df_test.drop(['id'], axis=1)

print(f'The Train dataset has {df_train.shape[0]} rows and {df_train.shape[1]} columns')
print(f'The Test dataset has {df_test.shape[0]} rows and {df_test.shape[1]} columns')

# Optional EDA, left disabled (slow):
# df_train.describe().to_csv('output.csv')
# pfr = ydata_profiling.ProfileReport(df_train)
# pfr.to_file("profile.html")

# Show the distinct target classes.
obesityType = df_train[['NObeyesdad']].copy().drop_duplicates(ignore_index=True)
print(obesityType)

# Target label <-> integer code mappings (7 obesity classes).
obesityTypeDict = {'Insufficient_Weight': 0, 'Normal_Weight': 1, 'Obesity_Type_I': 2,
                   'Obesity_Type_II': 3, 'Obesity_Type_III': 4,
                   'Overweight_Level_I': 5, 'Overweight_Level_II': 6}
obesityNumDict = {v: k for k, v in obesityTypeDict.items()}  # inverse map for decoding predictions

# Feature engineering: body-mass index (weight / height^2).
df_train['BMI'] = df_train['Weight'] / (df_train['Height'] ** 2)
df_test['BMI'] = df_test['Weight'] / (df_test['Height'] ** 2)

# NOTE(review): the original also carried a large commented-out experiment that
# re-derived the class label from BMI thresholds to check label consistency;
# removed here as dead code.

Y = df_train['NObeyesdad'].map(obesityTypeDict)
X = pd.get_dummies(df_train.drop(['NObeyesdad'], axis=1), drop_first=False)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

training_score = []
testing_score = []


def model_prediction(model):
    """Fit `model` on the training split, record and print train/test accuracy.

    Appends the accuracies to the module-level `training_score` /
    `testing_score` lists so the models can be compared afterwards.
    """
    model.fit(X_train, y_train)
    train_accuracy = accuracy_score(y_train, model.predict(X_train))
    test_accuracy = accuracy_score(y_test, model.predict(X_test))
    training_score.append(train_accuracy)
    testing_score.append(test_accuracy)
    print(f"Training accuracy: {train_accuracy}")
    print(f"Testing accuracy: {test_accuracy}")


# Baseline comparison of several classifiers with default parameters.
model_prediction(SVC())
model_prediction(RandomForestClassifier())
model_prediction(AdaBoostClassifier())
model_prediction(GradientBoostingClassifier())
model_prediction(LGBMClassifier())
model_prediction(XGBClassifier())
model_prediction(CatBoostClassifier(verbose=False))

models = ["SVC", "RandomForest", "AdaBoost", "GradientBoost", "LGBM", "XGB", "CatBoost"]
df = pd.DataFrame({'Model': models, 'Training Accuracy': training_score, 'Testing Accuracy': testing_score})
print(df)

# Plotting the above results as column-bar chart.
df.plot(x='Model', y=['Training Accuracy', 'Testing Accuracy'], kind='bar', figsize=(10, 8))
plt.title('Training and Testing Accuracy for Each Model')
plt.ylabel('Accuracy')
plt.xlabel('Model')
plt.show()

# --- LGBM grid search (time-consuming) ---
model1 = LGBMClassifier(force_row_wise=True)
parameters1 = {"n_estimators": [200, 300, 400],
               "learning_rate": [0.01, 0.05],
               "random_state": [42],
               "num_leaves": [17, 18]}
grid_search1 = GridSearchCV(model1, parameters1, scoring='accuracy', n_jobs=-1, cv=5)
grid_search1.fit(X_train, y_train)
print(f"grid_search1.best_score_: {grid_search1.best_score_}")
best_parameters1 = grid_search1.best_params_
print(best_parameters1)
model1 = LGBMClassifier(**best_parameters1)
model1.fit(X_train, y_train)
X_test_pred1 = model1.predict(X_test)
print(accuracy_score(y_test, X_test_pred1))

# NOTE(review): a commented-out CatBoost grid search was removed here as dead
# code. It also contained a latent bug: np.arange(0.1, 0.5) yields only [0.1]
# (the default step is 1), so its learning-rate "grid" had a single value.

# --- XGBoost grid search ---
model3 = XGBClassifier()
parameters3 = {"n_estimators": [100, 150],
               "random_state": [42],
               "learning_rate": [0.1, 0.3, 0.5]}
grid_search3 = GridSearchCV(model3, parameters3, cv=5, n_jobs=-1, scoring='accuracy')
grid_search3.fit(X_train, y_train)
print(grid_search3.best_score_)
best_parameters3 = grid_search3.best_params_
model3 = XGBClassifier(**best_parameters3)
model3.fit(X_train, y_train)
X_test_pred3 = model3.predict(X_test)
print(accuracy_score(y_test, X_test_pred3))

# --- RandomForest grid search ---
model4 = RandomForestClassifier()
parameters4 = {'n_estimators': [300, 500, 550],
               'min_samples_split': [8, 9],
               'max_depth': [11, 12],
               'min_samples_leaf': [4, 5]}
grid_search4 = GridSearchCV(model4, parameters4, cv=5, n_jobs=-1, scoring='accuracy')
grid_search4.fit(X_train, y_train)
best_parameters4 = grid_search4.best_params_
model4 = RandomForestClassifier(**best_parameters4)
model4.fit(X_train, y_train)
X_test_pred4 = model4.predict(X_test)
print(accuracy_score(y_test, X_test_pred4))

# --- Stack the three tuned models ---
stacking_model = StackingClassifier(estimators=[('LGBM', model1),
                                                ('XGBoost', model3),
                                                ('RandomForest', model4)])
stacking_model.fit(X_train, y_train)
X_train_pred5 = stacking_model.predict(X_train)
X_test_pred5 = stacking_model.predict(X_test)
print(f'Stacking model training data is {accuracy_score(y_train, X_train_pred5)}')
print(f'Stacking model testing data is {accuracy_score(y_test, X_test_pred5)}')

# One-hot encode the test set and ALIGN its columns with the training matrix:
# calling get_dummies on train and test separately can otherwise produce
# mismatched or mis-ordered columns when a category is missing on one side.
df_test = pd.get_dummies(df_test, drop_first=False).reindex(columns=X.columns, fill_value=0)

df_preds = pd.DataFrame({'id': test_ids, 'NObeyesdad': stacking_model.predict(df_test)})
df_preds['NObeyesdad'] = df_preds['NObeyesdad'].map(obesityNumDict)
df_preds.to_csv('submission1.csv', index=False)

本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若转载,请注明出处:http://www.mzph.cn/news/772480.shtml

如若内容造成侵权/违法违规/事实不符,请联系多彩编程网进行投诉反馈email:809451989@qq.com,一经查实,立即删除!

相关文章

Java中的String字符串练习

目录 Java中的String字符串练习 01-用户登录 02-遍历字符串并统计字符个数 03-字符串拼接 04-字符串反转 注意点 05-金额转化(简单) 代码解释: 06-手机号屏蔽 07-身份证号码查看 易错点: 08-敏感词替换 01-用户登录 package com.xiaonan.exercise06;import java.u…

题解:CF1937B(Binary Path)

题解：CF1937B（Binary Path） 一、理解题意 1. 题目链接 CodeForces；洛谷。 2. 题目翻译 给定一个由 0 和 1 组成的 2 行 n 列的网格，在其上寻找一条路径，使得这条路径上所有的数串联起来形成的0…

[leetcode]118.杨辉三角

前言：剑指offer刷题系列 问题： 给定一个非负整数 numRows，生成「杨辉三角」的前 numRows 行。 在「杨辉三角」中，每个数是它左上方和右上方的数的和。 示例： 输入: numRows = 5 输出: [[1],[1,1],[1,2,1],[1,3,3,…

CKS之镜像漏洞扫描工具:Trivy

目录 Trivy介绍 Trivy安装 Trivy使用命令 容器镜像扫描 打印指定&#xff08;高危、严重&#xff09;漏洞信息 JSON格式输出 HTML格式输出 离线扫描命令 离线更新Trivy数据库 Harbor安装Trivy Trivy介绍 Trivy是一款用于扫描容器镜像、文件系统、Git仓库等的漏洞扫描…

Matlab|基于两阶段鲁棒优化的微网电源储能容量优化配置

目录 主要内容 1.1 目标函数 1.2 约束条件 1.3 不确定变量 部分代码 结果一览 下载链接 主要内容 程序主要复现的是《考虑寿命损耗的微网电池储能容量优化配置》&#xff0c;解决微网中电源/储能容量优化配置的问题&#xff0c;即风电、光伏、储能以及燃气轮机…

LeetCode - 执行子串操作后的字典序最小字符串

题目要求经过操作后的字符串的字典序要比之前小。 在做这道题的之后陷入了一个误区&#xff0c;就是看a的位置&#xff0c;a-1之后z&#xff0c;z的字典序比a大&#xff0c;所以要尽可能的避免a变成z&#xff0c;但是字典序的比较是从前往后比较的&#xff0c;纠结于a变成z&am…

NSCaching: Simple and Efficient NegativeSampling for Knowledge Graph Embedding

摘要 知识图嵌入是数据挖掘研究中的一个基本问题&#xff0c;在现实世界中有着广泛的应用。它的目的是将图中的实体和关系编码到低维向量空间中&#xff0c;以便后续算法使用。负抽样&#xff0c;即从训练数据中未观察到的负三元组中抽取负三元组&#xff0c;是KG嵌入的重要步…

Mybatis-02

Mybatis 1.${} 和 #{}的区别 ${}&#xff1a;表示拼接sql串&#xff0c;可能会发生sql注入 #{}&#xff1a;表示一个占位符号&#xff0c;可以预解析&#xff0c;防止sql注入 2Param注解 当涉及到多个参数传参的时候&#xff0c;我们直接使用变量名会发现控制台有错误提示…

第四百二十六回

文章目录 1. 概念介绍2. 实现方法2.1 原生方式2.1 插件方式 3. 示例代码4. 内容总结 我们在上一章回中介绍了"如何修改程序的桌面图标"相关的内容&#xff0c;本章回中将介绍如何处理ListView中的事件冲突.闲话休提&#xff0c;让我们一起Talk Flutter吧。 1. 概念介…

利用vite创建vue3项目

vue3 项目推荐使用vue官方推荐的vite手脚架创建&#xff0c;vue3项目&#xff0c;使用vue-cli 会存在一些问题 1.node的版本 目前的vue3需要至少需要node18及以上&#xff0c;可以安装nvm node包管理器可以快速切换node版本&#xff0c;因为node的版本的兼容性真是一言难尽。…

第十四届蓝桥杯C++A组(A/B/C/D/E/H)

文章目录 A.幸运数B.有奖问答C.平方差D.更小的数E.颜色平衡树H.异或和之和 A.幸运数 /*纯暴力*/ #include <bits/stdc.h>using namespace std;void solve() {int sum 0;for(int i 1; i < 100000000; i ){int n i;int a[11];int j 1;for(; n ! 0; j ){a[j] n % …

C++ 友元函数

目录 如果觉得有用的话&#xff0c;给小弟点个赞吧&#xff01;哈哈哈哈&#xff0c;谢谢嘞&#xff01; 概念&#xff1a; 如何理解&#xff1f; 概念&#xff1a; 友元&#xff1a;慎用&#xff08;突破封装&#xff09; 友元函数&#xff1a;在函数前加friend的函数称为…

网页代理ip怎么设置的

众所周知&#xff0c;现在网络安全和隐私保护是我们非常关注的问题。为了更好地保护自己的隐私&#xff0c;提高上网的安全性&#xff0c;使用代理IP成为了很多人的首选。 那么&#xff0c;网页代理IP是怎么设置的呢&#xff1f;下面&#xff0c;就让我来一一为大家介绍。 一、…

CMake学习笔记(二)从PROJECT_BINARY_DIR看外部编译和内部编译

目录 外部编译 内部编译 总结 外部编译 看如下例子&#xff1a;我在EXE_OUT_PATH中建立了文件夹build、文件夹src2 和 文件CMakeLists.txt 其中EXE_OUT_PATH/CMakeLists.txt的内容如下&#xff1a; PROJECT(out_path) ADD_SUBDIRECTORY(src2 bin2) MESSAGE(STATUS "m…

uniapp中实现canvas超出屏幕滚动查看(全网唯一可行方案)

亲爱的小伙伴&#xff0c;当你需要在uniapp中使用canvas绘制一个超长图&#xff0c;就类似于横向的流程图时&#xff0c;这个canvas超出屏幕部分拖动屏幕查看会变得十分棘手。我查阅了大量资料&#xff0c;甚至是问了无数遍AI&#xff0c;得到的结果只有很敷衍的监听touch,然后…

(一)whatsapp 语音通话基本流程

经过了一整年的开发测试&#xff0c;终于将whatsapp 语音通话完成&#xff0c;期间主要参考webrtc的源码来实现.下面简要说一下大致的步骤 XMPP 协商 发起或者接受语音通话第一步是发起XMPP 协商&#xff0c;这个协商过程非常重要。下面是协商一个包 <call toxxxs.whatsap…

【大模型基础】什么是KV Cache?

哪里存在KV Cache&#xff1f; KV cache发生在多个token生成的步骤中&#xff0c;并且只发生在decoder中&#xff08;例如&#xff0c;decoder-only模型&#xff0c;如 GPT&#xff0c;或在encoder-decoder模型&#xff0c;如T5的decoder部分&#xff09;&#xff0c;BERT这样…

USB - 通过configfs配置Linux USB Gadget

Linux USB gadget configured through configfs Overview USB Linux 小工具是一种具有 UDC&#xff08;USB 设备控制器&#xff09;的设备&#xff0c;可连接到 USB 主机&#xff0c;以扩展其附加功能&#xff0c;如串行端口或大容量存储功能。 A USB Linux Gadget is a device…

数据分析面试题(21~30)

21、简单说一下说说置信区间、置信度。 ①置信区间是指由样本统计量所构成的总体参数的估计区间。通常以一个样本统计量的估计值为中心&#xff0c;加减一个标准误差的倍数&#xff0c;构成一个区间。 ②置信度是对置信区间的信心程度的度量&#xff0c;通常以百分比的形式表…

Protocol Buffers设计要点

概述 一种开源跨平台的序列化结构化数据的协议。可用于存储数据或在网络上进行数据通信。它提供了用于描述数据结构的接口描述语言&#xff08;IDL&#xff09;&#xff0c;也提供了根据 IDL 产生代码的程序工具。Protocol Buffers的设计目标是简单和性能&#xff0c;所以与 XM…