Machine Learning: Classification Decision Trees (Python)

1. Computing the Various Entropy Measures

entropy_utils.py

import numpy as np  # numerical computation
import math  # scalar math


class EntropyUtils:
    """
    Computes the impurity measures used by decision trees: information
    entropy, information gain, information gain ratio, and the Gini index.
    Convention: the split is always chosen by the maximum information gain,
    maximum information gain ratio, or maximum Gini index gain.
    """

    @staticmethod
    def _set_sample_weight(sample_weight, n_samples):
        """
        Sample-weight setup (kept here with ensemble learning in mind).
        :param sample_weight: weight of each sample
        :param n_samples: number of samples
        """
        if sample_weight is None:
            sample_weight = np.asarray([1.0] * n_samples)
        return sample_weight

    def cal_info_entropy(self, y_labels, sample_weight=None):
        """
        Information entropy of a sample set.
        :param y_labels: class labels (or feature values) of the current recursive subset
        :param sample_weight: weight of each sample
        """
        y = np.asarray(y_labels)
        sample_weight = self._set_sample_weight(sample_weight, len(y))
        y_values = np.unique(y)  # distinct class values in the sample
        ent_y = 0.0
        for val in y_values:
            p_i = len(y[y == val]) * np.mean(sample_weight[y == val]) / len(y)
            ent_y += -p_i * math.log2(p_i)
        return ent_y

    def conditional_entropy(self, feature_x, y_labels, sample_weight=None):
        """
        Conditional entropy: the entropy of y given one feature.
        :param feature_x: one feature column of the samples
        :param y_labels: class labels of the current recursive subset
        :param sample_weight: weight of each sample
        """
        x, y = np.asarray(feature_x), np.asarray(y_labels)
        sample_weight = self._set_sample_weight(sample_weight, len(y))
        cond_ent = 0.0
        for x_val in np.unique(x):
            x_idx = np.where(x == x_val)  # indices of the samples taking this feature value
            sub_x, sub_y = x[x_idx], y[x_idx]
            sub_sample_weight = sample_weight[x_idx]
            p_k = len(sub_y) / len(y)
            cond_ent += p_k * self.cal_info_entropy(sub_y, sub_sample_weight)
        return cond_ent

    def info_gain(self, feature_x, y_labels, sample_weight=None):
        """Information gain: H(y) - H(y | x)."""
        return self.cal_info_entropy(y_labels, sample_weight) - \
            self.conditional_entropy(feature_x, y_labels, sample_weight)

    def info_gain_rate(self, feature_x, y_labels, sample_weight=None):
        """Information gain ratio: the gain divided by the entropy of the feature itself."""
        return self.info_gain(feature_x, y_labels, sample_weight) / \
            self.cal_info_entropy(feature_x, sample_weight)

    def cal_gini(self, y_label, sample_weight=None):
        """
        Gini value of the current class (or feature-value) set.
        :param y_label: class labels (or feature values) of the current recursive subset
        :param sample_weight: weight of each sample
        """
        y = np.asarray(y_label)
        sample_weight = self._set_sample_weight(sample_weight, len(y))
        y_values = np.unique(y)
        gini_val = 1.0
        for val in y_values:
            p_k = len(y[y == val]) * np.mean(sample_weight[y == val]) / len(y)
            gini_val -= p_k ** 2
        return gini_val

    def conditional_gini(self, feature_x, y_labels, sample_weight=None):
        """
        Conditional Gini index of y given one feature.
        """
        x, y = np.asarray(feature_x), np.asarray(y_labels)
        sample_weight = self._set_sample_weight(sample_weight, len(y))
        cond_gini = 0.0
        for x_val in np.unique(x):
            x_idx = np.where(x == x_val)  # indices of the samples taking this feature value
            sub_x, sub_y = x[x_idx], y[x_idx]
            sub_sample_weight = sample_weight[x_idx]
            p_k = len(sub_y) / len(y)
            cond_gini += p_k * self.cal_gini(sub_y, sub_sample_weight)
        return cond_gini

    def gini_gain(self, feature_x, y_labels, sample_weight=None):
        """Gini index gain: Gini(y) - Gini(y | x)."""
        return self.cal_gini(y_labels, sample_weight) - \
            self.conditional_gini(feature_x, y_labels, sample_weight)


# if __name__ == '__main__':
#     y = np.random.randint(0, 2, 50)
#     entropy = EntropyUtils()
#     ent = entropy.cal_info_entropy(y)
#     print(ent)
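
As a quick sanity check (a minimal sketch, assuming the module is importable as utils.entropy_utils, as in the tests below): a perfectly balanced binary label set has entropy exactly 1 bit and Gini value 0.5.

import numpy as np
from utils.entropy_utils import EntropyUtils

ent = EntropyUtils()
y = np.array([0, 0, 1, 1])      # perfectly balanced binary labels
print(ent.cal_info_entropy(y))  # 1.0 = -(0.5 * log2(0.5)) * 2
print(ent.cal_gini(y))          # 0.5 = 1 - (0.5**2 + 0.5**2)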

2. Discretizing Continuous Features into Bins

data_bin_wrapper.py

import numpy as np


class DataBinsWrapper:
    """
    Discretizes (bins) continuous feature data. Given max_bins, quantiles are
    computed and used as bin edges; each feature value is then labeled with the
    index of the interval (bin) it falls into.
    1. fit(x): build the bins from the samples
    2. transform(x): map the data into max_bins categories using the fitted bins
    """

    def __init__(self, max_bins=10):
        self.max_bins = max_bins  # number of bins: 10%, 20%, ..., 90% quantiles
        self.XrangeMap = None  # bins (interval edges)

    def fit(self, x_samples):
        """
        Builds the bins from the samples.
        :param x_samples: samples (2-D array, n * k), or one feature column (n * 1)
        """
        if x_samples.ndim == 1:  # a single feature column: promote to 2-D
            n_features = 1
            x_samples = x_samples[:, np.newaxis]  # add an axis
        else:
            n_features = x_samples.shape[1]
        # build the bins (interval edges)
        self.XrangeMap = [[] for _ in range(n_features)]
        for idx in range(n_features):
            x_sorted = sorted(x_samples[:, idx])  # sort this feature column ascending
            for b in range(1, self.max_bins):
                p = (b / self.max_bins) * 100 // 1
                p_val = np.percentile(x_sorted, p)
                self.XrangeMap[idx].append(p_val)
            self.XrangeMap[idx] = sorted(list(set(self.XrangeMap[idx])))

    def transform(self, x_samples, XrangeMap=None):
        """
        Maps the data into max_bins categories using the fitted bins.
        :param x_samples: samples (2-D array, n * k), or one feature column (n * 1)
        """
        if x_samples.ndim == 1:
            if XrangeMap is not None:
                return np.asarray(np.digitize(x_samples, XrangeMap[0])).reshape(-1)
            else:
                return np.asarray(np.digitize(x_samples, self.XrangeMap[0])).reshape(-1)
        else:
            return np.asarray([np.digitize(x_samples[:, i], self.XrangeMap[i])
                               for i in range(x_samples.shape[1])]).T


# if __name__ == '__main__':
#     x = np.random.randn(10, 5)
#     print(x)
#     dbw = DataBinsWrapper(max_bins=5)
#     dbw.fit(x)
#     print(dbw.XrangeMap)
#     print(dbw.transform(x))
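
A small usage sketch (values chosen only for illustration; import path as in the tests below). With max_bins=4, fit() stores the 25%/50%/75% quantiles as bin edges, and transform() labels each value with the index of its interval via np.digitize:

import numpy as np
from utils.data_bin_wrapper import DataBinsWrapper

x = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0])
dbw = DataBinsWrapper(max_bins=4)
dbw.fit(x)                # edges at the 25%, 50%, 75% quantiles
print(dbw.XrangeMap)      # [[2.75, 4.5, 6.25]]
print(dbw.transform(x))   # [0 0 1 1 2 2 3 3]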

3. Visualizing the Classification Boundary

plt_decision_function.py

import matplotlib.pyplot as plt
import numpy as np


def plot_decision_function(X, y, clf, acc=None, title_info=None, is_show=True, support_vectors=None):
    """
    Visualizes the classification decision boundary.
    :param X, y: test samples and their classes
    :param clf: classification model
    :param acc: classification accuracy of the model
    :param title_info: extra information appended to the plot title
    :param is_show: whether to show the figure now; False lets a caller compose subplots
    :param support_vectors: support-vector indices (SVM extension)
    """
    if is_show:
        plt.figure(figsize=(7, 5))
    # build a 2-D mesh from the min/max of each feature, used for the contour plot
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xi, yi = np.meshgrid(np.arange(x_min, x_max, 0.02),
                         np.arange(y_min, y_max, 0.02))
    y_pred = clf.predict(np.c_[xi.ravel(), yi.ravel()])  # model predictions on the mesh
    y_pred = y_pred.reshape(xi.shape)
    plt.contourf(xi, yi, y_pred, alpha=0.4)
    plt.scatter(X[:, 0], X[:, 1], alpha=0.8, c=y, edgecolors="k")
    plt.xlabel("Feature 1", fontdict={"fontsize": 12})
    plt.ylabel("Feature 2", fontdict={"fontsize": 12})
    if acc:
        if title_info:
            plt.title("Model Classification Boundary %s \n(accuracy = %.5f)"
                      % (title_info, acc), fontdict={"fontsize": 14})
        else:
            plt.title("Model Classification Boundary (accuracy = %.5f)"
                      % acc, fontdict={"fontsize": 14})
    else:
        if title_info:
            plt.title("Model Classification Boundary %s"
                      % title_info, fontdict={"fontsize": 14})
        else:
            plt.title("Model Classification Boundary", fontdict={"fontsize": 14})
    if support_vectors is not None:  # highlight support vectors (for SVMs)
        plt.scatter(X[support_vectors, 0], X[support_vectors, 1],
                    s=50, c="None", alpha=0.7, edgecolors="red")
    if is_show:
        plt.show()
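
A usage sketch of the plotting helper (hypothetical data; sklearn's tree is used here only because it is self-contained, while the tests below pass the custom DecisionTreeClassifier instead):

import numpy as np
from sklearn.datasets import make_blobs
from sklearn.tree import DecisionTreeClassifier  # any clf with predict() works
from utils.plt_decision_function import plot_decision_function

X, y = make_blobs(n_samples=200, centers=2, n_features=2, random_state=0)
clf = DecisionTreeClassifier(max_depth=3).fit(X, y)
plot_decision_function(X, y, clf, acc=clf.score(X, y),
                       title_info="(sklearn tree demo)")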

4. Testing the Entropy Computations

test_entropy.py

import numpy as np
import pandas as pd
from utils.entropy_utils import EntropyUtils
from utils.data_bin_wrapper import DataBinsWrapper

data = pd.read_csv("data/watermelon.csv").iloc[:, 1:]
feat_names = data.columns[:6]
y = data.iloc[:, -1]
ent_obj = EntropyUtils()

print("Information gain of each feature:")
for feat in feat_names:
    print(feat, ":", ent_obj.info_gain(data.loc[:, feat], y))
print("=" * 60)
print("Information gain ratio of each feature:")
for feat in feat_names:
    print(feat, ":", ent_obj.info_gain_rate(data.loc[:, feat], y))
print("=" * 60)
print("Gini index gain of each feature:")
for feat in feat_names:
    print(feat, ":", ent_obj.gini_gain(data.loc[:, feat], y))
print("=" * 60)
x1 = np.asarray(data.loc[:, ["密度", "含糖率"]])  # the two continuous columns: density and sugar content
print(x1)
dbw = DataBinsWrapper(max_bins=8)
dbw.fit(x1)
print(dbw.transform(x1))
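
For intuition, the information gain printed above can be verified by hand on a toy feature (hypothetical values, unrelated to watermelon.csv):

import numpy as np
from utils.entropy_utils import EntropyUtils

# toy example: 4 samples, a binary feature x and binary labels y
x = np.array([0, 0, 1, 1])
y = np.array([0, 1, 1, 1])
ent = EntropyUtils()
# H(y)   = -(1/4)log2(1/4) - (3/4)log2(3/4) ≈ 0.8113
# H(y|x) = 0.5 * H([0,1]) + 0.5 * H([1,1]) = 0.5 * 1 + 0.5 * 0 = 0.5
# gain   = H(y) - H(y|x) ≈ 0.3113
print(ent.info_gain(x, y))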

5. Encapsulating the Tree Node Information

tree_node.py


class TreeNode_C:
    """
    Classification decision tree: encapsulates the information of one tree
    node (an entity class in the setXXX()/getXXX() sense).
    """

    def __init__(self, feature_idx: int = None, feature_val=None, criterion_val: float = None,
                 n_samples: int = None, target_dist: dict = None, weight_dist: dict = None,
                 left_child_Node=None, right_child_Node=None):
        """
        Node information of the decision tree.
        :param feature_idx: feature index; with feature names available, values can be looked up by index
        :param feature_val: feature value of the split
        :param criterion_val: split criterion value: information gain (ratio) or Gini index gain
        :param n_samples: number of samples contained in this node
        :param target_dist: class distribution of this node, e.g. 0: 25%, 1: 50%, 2: 25%
        :param weight_dist: sample-weight distribution of this node
        :param left_child_Node: left subtree
        :param right_child_Node: right subtree
        """
        self.feature_idx = feature_idx
        self.feature_val = feature_val
        self.criterion_val = criterion_val
        self.n_samples = n_samples
        self.target_dist = target_dist
        self.weight_dist = weight_dist
        self.left_child_Node = left_child_Node  # recursive structure
        self.right_child_Node = right_child_Node  # recursive structure

    def level_order(self):
        """
        Level-order (breadth-first) traversal of the subtree rooted at this
        node; returns the nodes level by level, as a list of lists.
        """
        levels, cur = [], [self]
        while cur:
            levels.append(cur)
            nxt = []
            for node in cur:
                if node.left_child_Node:
                    nxt.append(node.left_child_Node)
                if node.right_child_Node:
                    nxt.append(node.right_child_Node)
            cur = nxt
        return levels

    # def get_feature_idx(self):
    #     return self.get_feature_idx()
    #
    # def set_feature_idx(self, feature_idx):
    #     self.feature_idx = feature_idx
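
A tiny sketch of how the node fields are meant to be filled, with hypothetical values mirroring the 0: 25%, 1: 50%, 2: 25% example in the docstring; level_order() then yields the nodes level by level:

from utils.tree_node import TreeNode_C

leaf_l = TreeNode_C(n_samples=4, target_dist={0: 1.0})
leaf_r = TreeNode_C(n_samples=12, target_dist={1: 2 / 3, 2: 1 / 3})
root = TreeNode_C(feature_idx=0, feature_val=1, criterion_val=0.25,
                  n_samples=16, target_dist={0: 0.25, 1: 0.5, 2: 0.25},
                  left_child_Node=leaf_l, right_child_Node=leaf_r)
for depth, nodes in enumerate(root.level_order()):
    print(depth, [node.n_samples for node in nodes])  # 0 [16], then 1 [4, 12]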

6. Implementing the Classification Decision Tree

decision_tree_C.py

import numpy as np
from utils.entropy_utils import EntropyUtils
from utils.tree_node import TreeNode_C
from utils.data_bin_wrapper import DataBinsWrapper


class DecisionTreeClassifier:
    """
    Classification decision tree. Whether ID3, C4.5 or CART, the tree is
    uniformly built as a binary tree:
    1. Split criterion: information gain (ratio) or Gini index gain, always
       choosing the feature/value with the maximum criterion value.
    2. fit() builds the tree recursively; mind the recursion exits.
    3. predict_proba() / predict() search the tree.
    4. Data preprocessing, in particular discretization (binning) of
       continuous features.
    5. Post-pruning.
    """

    def __init__(self, criterion="CART", is_feature_all_R=False, dbw_feature_idx=None,
                 max_depth=None, min_sample_split=2, min_sample_leaf=1,
                 min_impurity_decrease=0, max_bins=10):
        self.utils = EntropyUtils()  # criterion computations for node splitting
        self.criterion = criterion  # node split criterion
        if criterion.lower() == "cart":
            self.criterion_func = self.utils.gini_gain  # Gini index gain
        elif criterion.lower() == "c45":
            self.criterion_func = self.utils.info_gain_rate  # information gain ratio
        elif criterion.lower() == "id3":
            self.criterion_func = self.utils.info_gain  # information gain
        else:
            raise ValueError("criterion must be cart, c45 or id3...")
        self.is_feature_all_R = is_feature_all_R  # True if every feature is continuous
        self.dbw_feature_idx = dbw_feature_idx  # for mixed data: indices of the continuous features
        self.max_depth = max_depth  # maximum tree depth; None means split without limit
        self.min_sample_split = min_sample_split  # minimum samples required to split a node
        self.min_sample_leaf = min_sample_leaf  # minimum samples a leaf must contain; fewer marks a leaf
        self.min_impurity_decrease = min_impurity_decrease  # minimum impurity decrease required to split
        self.max_bins = max_bins  # number of bins for continuous data; larger means finer splits
        self.root_node: TreeNode_C = None  # root node of the classification tree
        self.dbw = DataBinsWrapper(max_bins=max_bins)  # discretization helper for continuous data
        self.dbw_XrangeMap = {}  # bin edges fitted on the continuous training features
        self.class_values = None  # distinct class values of the samples

    def _data_bin_wrapper(self, x_samples):
        """
        Bins only the continuous features given by dbw_feature_idx; test
        samples reuse the XrangeMap fitted on the training samples.
        :param x_samples: samples, either training or test
        """
        self.dbw_feature_idx = np.asarray(self.dbw_feature_idx)
        x_samples_prop = []  # data after binning
        if not self.dbw_XrangeMap:
            # empty: binning performed before building the tree (training phase)
            for i in range(x_samples.shape[1]):
                if i in self.dbw_feature_idx:  # this feature is continuous
                    self.dbw.fit(x_samples[:, i])
                    self.dbw_XrangeMap[i] = self.dbw.XrangeMap
                    x_samples_prop.append(self.dbw.transform(x_samples[:, i]))
                else:
                    x_samples_prop.append(x_samples[:, i])
        else:  # binning of the test samples
            for i in range(x_samples.shape[1]):
                if i in self.dbw_feature_idx:  # this feature is continuous
                    x_samples_prop.append(self.dbw.transform(x_samples[:, i],
                                                             self.dbw_XrangeMap[i]))
                else:
                    x_samples_prop.append(x_samples[:, i])
        return np.asarray(x_samples_prop).T

    def fit(self, x_train, y_train, sample_weight=None):
        """
        Necessary preprocessing before the recursive tree construction.
        :param x_train: training samples, ndarray, n * k
        :param y_train: targets, ndarray, (n,)
        :param sample_weight: per-sample weights, (n,)
        """
        x_train, y_train = np.asarray(x_train), np.asarray(y_train)
        self.class_values = np.unique(y_train)  # distinct class values
        n_samples, n_features = x_train.shape  # number of training samples and features
        if sample_weight is None:
            sample_weight = np.asarray([1.0] * n_samples)
        self.root_node = TreeNode_C()  # create an empty tree
        if self.is_feature_all_R:  # all features are continuous
            self.dbw.fit(x_train)
            x_train = self.dbw.transform(x_train)
        elif self.dbw_feature_idx:
            x_train = self._data_bin_wrapper(x_train)
        self._build_tree(1, self.root_node, x_train, y_train, sample_weight)
        # print(x_train)

    def _build_tree(self, cur_depth, cur_node: TreeNode_C, x_train, y_train, sample_weight):
        """
        Recursive tree construction (pre-order), the core algorithm.
        :param cur_depth: tree depth after the current split
        :param cur_node: current subtree root
        :param x_train: training samples of the current subset
        :param y_train: targets of the current subset
        :param sample_weight: weights of the current subset
        """
        n_samples, n_features = x_train.shape  # sizes of the current subset
        target_dist, weight_dist = {}, {}  # class and weight distributions, e.g. 0: 30%, 1: 70%
        class_labels = np.unique(y_train)  # distinct class values
        for label in class_labels:
            target_dist[label] = len(y_train[y_train == label]) / n_samples
            weight_dist[label] = np.mean(sample_weight[y_train == label])
        cur_node.target_dist = target_dist
        cur_node.weight_dist = weight_dist
        cur_node.n_samples = n_samples
        # recursion exits
        if len(target_dist) <= 1:  # all samples belong to one class (exit 1);
            # a value of 0 would mean an empty subset (exit 3)
            return
        if n_samples < self.min_sample_split:  # too few samples to split (exit 2)
            return
        if self.max_depth is not None and cur_depth > self.max_depth:  # maximum depth reached
            return
        # choose the best split feature and value according to the criterion
        best_idx, best_val, best_criterion_val = None, None, 0.0
        for k in range(n_features):  # evaluate the criterion for every feature
            for f_val in np.unique(x_train[:, k]):  # and for every value it takes
                feat_k_values = (x_train[:, k] == f_val).astype(int)  # 1 where the value is f_val, else 0
                criterion_val = self.criterion_func(feat_k_values, y_train, sample_weight)
                if criterion_val > best_criterion_val:
                    best_criterion_val = criterion_val  # best criterion value so far
                    best_idx, best_val = k, f_val  # current best feature index and value
        # more recursion exits
        if best_idx is None:  # no usable feature, or all samples identical on every feature
            return
        if best_criterion_val <= self.min_impurity_decrease:  # gain below the impurity threshold
            return
        cur_node.criterion_val = best_criterion_val
        cur_node.feature_idx = best_idx
        cur_node.feature_val = best_val
        # print("split feature index:", best_idx, "value:", best_val, "criterion value:", best_criterion_val)
        # print("class distribution of this node:", target_dist)
        # left subtree: the samples with x[best_idx] == best_val
        left_idx = np.where(x_train[:, best_idx] == best_val)[0]  # sample indices of the left child
        if len(left_idx) >= self.min_sample_leaf:  # fewer than min_sample_leaf: stay a leaf
            left_child_node = TreeNode_C()  # empty left child
            cur_node.left_child_Node = left_child_node
            self._build_tree(cur_depth + 1, left_child_node, x_train[left_idx],
                             y_train[left_idx], sample_weight[left_idx])
        # right subtree: the samples with x[best_idx] != best_val
        right_idx = np.where(x_train[:, best_idx] != best_val)[0]  # sample indices of the right child
        if len(right_idx) >= self.min_sample_leaf:  # fewer than min_sample_leaf: stay a leaf
            right_child_node = TreeNode_C()  # empty right child
            cur_node.right_child_Node = right_child_node
            self._build_tree(cur_depth + 1, right_child_node, x_train[right_idx],
                             y_train[right_idx], sample_weight[right_idx])

    def _search_tree_predict(self, cur_node: TreeNode_C, x_test):
        """
        Follows the path from the root down to a leaf for one test sample and
        returns its class-probability vector.
        :param x_test: a single test sample
        """
        if cur_node.left_child_Node and x_test[cur_node.feature_idx] == cur_node.feature_val:
            return self._search_tree_predict(cur_node.left_child_Node, x_test)
        elif cur_node.right_child_Node and x_test[cur_node.feature_idx] != cur_node.feature_val:
            return self._search_tree_predict(cur_node.right_child_Node, x_test)
        else:
            # leaf node: use its class distribution
            # print(cur_node.target_dist)
            class_p = np.zeros(len(self.class_values))  # class probabilities of the test sample
            for i, c in enumerate(self.class_values):
                class_p[i] = cur_node.target_dist.get(c, 0) * cur_node.weight_dist.get(c, 1.0)
            class_p = class_p / np.sum(class_p)  # normalize
            return class_p

    def predict_proba(self, x_test):
        """
        Predicts the class probabilities of the test samples.
        :param x_test: test samples, ndarray for numpy arithmetic
        """
        x_test = np.asarray(x_test)  # guard against DataFrame, list, ...
        if self.is_feature_all_R:
            if self.dbw.XrangeMap is not None:
                x_test = self.dbw.transform(x_test)
            else:
                raise ValueError("fit the decision tree first...")
        elif self.dbw_feature_idx is not None:
            x_test = self._data_bin_wrapper(x_test)
        prob_dist = []  # class-probability distribution of each test sample
        for i in range(x_test.shape[0]):
            prob_dist.append(self._search_tree_predict(self.root_node, x_test[i]))
        return np.asarray(prob_dist)

    def predict(self, x_test):
        """
        Predicts the class of the test samples.
        :param x_test: test samples
        """
        x_test = np.asarray(x_test)  # guard against DataFrame, list, ...
        # map the argmax column index back to the actual class value
        return self.class_values[np.argmax(self.predict_proba(x_test), axis=1)]

    def _prune_node(self, cur_node: TreeNode_C, alpha):
        """
        Recursive pruning of the internal nodes, bottom-up, one by one
        (post-order traversal).
        :param cur_node: current internal node of the tree
        :param alpha: pruning threshold
        """
        # if a left subtree exists, prune it first, recursively
        if cur_node.left_child_Node:
            self._prune_node(cur_node.left_child_Node, alpha)
        # likewise for the right subtree
        if cur_node.right_child_Node:
            self._prune_node(cur_node.right_child_Node, alpha)
        # consider pruning only internal (non-leaf) nodes
        if cur_node.left_child_Node is not None or cur_node.right_child_Node is not None:
            for child_node in [cur_node.left_child_Node, cur_node.right_child_Node]:
                if child_node is None:
                    # one child may be missing when a split left fewer than
                    # min_sample_leaf samples on that side
                    continue
                if child_node.left_child_Node is not None or child_node.right_child_Node is not None:
                    return  # a child is still internal: nothing to prune here
            # loss before pruning; 2 * alpha because the node has two leaf children
            pre_prune_value = 2 * alpha
            for child_node in [cur_node.left_child_Node, cur_node.right_child_Node]:
                # empirical entropy of each leaf child
                if child_node is None:
                    # one child may be missing when a split left fewer than
                    # min_sample_leaf samples on that side
                    continue
                for key, value in child_node.target_dist.items():  # class distribution of the leaf
                    pre_prune_value += -1 * child_node.n_samples * value * np.log(value) * \
                                       child_node.weight_dist.get(key, 1.0)
            # loss after pruning: the current node itself becomes a leaf
            after_prune_value = alpha
            for key, value in cur_node.target_dist.items():  # class distribution of this node
                after_prune_value += -1 * cur_node.n_samples * value * np.log(value) * \
                                     cur_node.weight_dist.get(key, 1.0)
            if after_prune_value <= pre_prune_value:  # prune: cut both children
                cur_node.left_child_Node = None
                cur_node.right_child_Node = None
                cur_node.feature_idx, cur_node.feature_val = None, None

    def prune(self, alpha=0.01):
        """
        Post-pruning (Li Hang): minimizes C(T) + alpha * |T|.
        :param alpha: pruning threshold, trading off fit to the training data
                      against model complexity
        """
        self._prune_node(self.root_node, alpha)
        return self.root_node
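
Worth highlighting: the binary tree is obtained by scoring every (feature, value) pair as a one-vs-rest split via feat_k_values = (x_train[:, k] == f_val).astype(int). A minimal sketch of what the criterion function sees at one node (hypothetical values):

import numpy as np
from utils.entropy_utils import EntropyUtils

x_col = np.array([0, 1, 2, 1, 0])  # one (already discretized) feature column
y = np.array([0, 1, 1, 1, 0])
utils = EntropyUtils()
for f_val in np.unique(x_col):
    feat_k_values = (x_col == f_val).astype(int)  # 1 = goes left, 0 = goes right
    print(f_val, utils.gini_gain(feat_k_values, y, np.ones(len(y))))
# the (feature, value) pair with the largest gain becomes the node's split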

7. Testing the Classification Decision Tree

test_decision_tree_C.py

import pandas as pd
from decision_tree_C import DecisionTreeClassifier
from sklearn.datasets import load_iris, load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

# data = pd.read_csv("data/watermelon.csv").iloc[:, 1:]
# X = data.iloc[:, :-1]
# y = data.iloc[:, -1]

# iris = load_iris()
# X, y = iris.data, iris.target

# bc_data = load_breast_cancer()
# X, y = bc_data.data, bc_data.target

nursery = pd.read_csv("data/nursery.csv").dropna()
X, y = np.asarray(nursery.iloc[:, :-1]), np.asarray(nursery.iloc[:, -1])
y = LabelEncoder().fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

depth = np.linspace(2, 12, 11, dtype=np.int64)
accuracy = []
for d in depth:
    dtc = DecisionTreeClassifier(is_feature_all_R=False, max_depth=d)
    dtc.fit(X_train, y_train)
    y_pred_labels = dtc.predict(X_test)
    acc = accuracy_score(y_test, y_pred_labels)
    # print(acc)
    accuracy.append(acc)

# dtc = DecisionTreeClassifier(dbw_feature_idx=[6, 7], max_bins=8, max_depth=2)
# dtc.fit(X, y)
# y_pred_prob = dtc.predict_proba(X)
# print(y_pred_prob)

# print(classification_report(y_test, y_pred_labels))

plt.figure(figsize=(7, 5))
plt.plot(depth, accuracy, "ko-", lw=1)
plt.show()
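
As a rough cross-check, sklearn's tree can be trained on the same data (a sketch continuing the script above; the nursery features are categorical strings, so they are ordinally encoded first). Note that sklearn then splits on encoded thresholds rather than the one-vs-rest equality splits used here, so the scores will differ:

from sklearn.preprocessing import OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier as SkDTC

X_enc = OrdinalEncoder().fit_transform(X)
Xe_train, Xe_test, ye_train, ye_test = train_test_split(
    X_enc, y, test_size=0.2, random_state=0, stratify=y)
sk_tree = SkDTC(max_depth=10, random_state=0).fit(Xe_train, ye_train)
print("sklearn accuracy:", accuracy_score(ye_test, sk_tree.predict(Xe_test)))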

test_decision_tree_C_2.py

import numpy as np
import matplotlib.pyplot as plt
from decision_tree_C import DecisionTreeClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report, accuracy_score
from utils.plt_decision_function import plot_decision_function

# generate the data
data, target = make_classification(n_samples=100, n_features=2, n_classes=2, n_informative=1, n_redundant=0,
                                   n_clusters_per_class=1, class_sep=0.8, random_state=21)
# print(data)
# print(target)

cart_tree = DecisionTreeClassifier(is_feature_all_R=True)
cart_tree.fit(data, target)
y_test_pred = cart_tree.predict(data)
print(classification_report(target, y_test_pred))
plt.figure(figsize=(14, 10))
plt.subplot(221)
acc = accuracy_score(target, y_test_pred)
plot_decision_function(data, target, cart_tree, acc=acc, is_show=False, title_info="By CART UnPrune")

# pruning
alpha = [1, 3, 5]
for i in range(3):
    cart_tree.prune(alpha=alpha[i])
    y_test_pred = cart_tree.predict(data)
    acc = accuracy_score(target, y_test_pred)
    plt.subplot(222 + i)
    plot_decision_function(data, target, cart_tree, acc=acc, is_show=False,
                           title_info="By CART Prune α = %.1f" % alpha[i])
plt.tight_layout()
plt.show()
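
For reference, prune() implements the cost-complexity criterion from Li Hang's Statistical Learning Methods: the tree cost is the sum over leaves of each leaf's empirical entropy weighted by its sample count, plus a penalty proportional to the number of leaves |T| (the implementation above additionally scales each term by the class's weight_dist, for the sample-weight extension):

C_\alpha(T) = \sum_{t=1}^{|T|} N_t H_t(T) + \alpha |T|, \qquad
H_t(T) = -\sum_{k} \frac{N_{tk}}{N_t} \log \frac{N_{tk}}{N_t}

An internal node whose children are both leaves is collapsed when the post-prune cost (one leaf, penalty α) does not exceed the pre-prune cost (two leaves, penalty 2α), which is exactly the comparison of after_prune_value with pre_prune_value in _prune_node. Larger α therefore prunes more aggressively, as the plots above show.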

test_decision_tree_C_3.py

import copy

import numpy as np
import matplotlib.pyplot as plt
from decision_tree_C import DecisionTreeClassifier
from sklearn.datasets import load_breast_cancer, load_iris
from sklearn.metrics import classification_report, accuracy_score
from utils.plt_decision_function import plot_decision_function
from sklearn.model_selection import StratifiedKFold

bc_data = load_breast_cancer()
X, y = bc_data.data, bc_data.target
alphas = np.linspace(0, 10, 30)
accuracy_scores = []  # mean cross-validation score for each alpha threshold
cart = DecisionTreeClassifier(criterion="cart", is_feature_all_R=True, max_bins=10)
for alpha in alphas:
    scores = []
    k_fold = StratifiedKFold(n_splits=10).split(X, y)
    for train_idx, test_idx in k_fold:
        tree = copy.deepcopy(cart)
        tree.fit(X[train_idx], y[train_idx])
        tree.prune(alpha=alpha)
        y_test_pred = tree.predict(X[test_idx])
        scores.append(accuracy_score(y[test_idx], y_test_pred))
        del tree
    print(alpha, ":", np.mean(scores))
    accuracy_scores.append(np.mean(scores))

plt.figure(figsize=(7, 5))
plt.plot(alphas, accuracy_scores, "ko-", lw=1)
plt.grid(ls=":")
plt.xlabel("Alpha", fontdict={"fontsize": 12})
plt.ylabel("Accuracy Scores", fontdict={"fontsize": 12})
plt.title("Cross Validation Scores under different Prune Alpha", fontdict={"fontsize": 14})
plt.show()
