【头歌】机器学习实训代码
第一关:决策树算法思想
-
1、下列说法正确的是?( AB )
A、训练决策树的过程就是构建决策树的过程
B、ID3算法是根据信息增益来构建决策树
C、C4.5算法是根据基尼系数来构建决策树
D、决策树模型的可理解性不高
2、下列说法错误的是?( B )
A、从树的根节点开始,根据特征的值一步一步走到叶子节点的过程是决策树做决策的过程
B、决策树只能是一棵二叉树
C、根节点所代表的特征是最优特征
第二关:决策树算法原理
import numpy as np
def calcInfoGain(feature, label, index):
    """Compute the information gain of one feature column.

    The gain is the entropy of ``label`` minus the conditional entropy of
    ``label`` given the values found in column ``index`` of ``feature``.

    Args:
        feature: 2-D array-like of samples (rows) by features (columns).
        label: 1-D array-like with one label per row of ``feature``.
        index: Column of ``feature`` to evaluate, e.g. ``0`` for the first
            feature.

    Returns:
        The information gain as a ``float``; ``0.0`` when ``label`` is empty.
    """

    def entropy(values):
        # Shannon entropy (base 2) of the empirical label distribution.
        _, counts = np.unique(values, return_counts=True)
        probs = counts / counts.sum()
        # ``0.0 - x`` keeps a single-class result at +0.0 instead of -0.0.
        return 0.0 - float(np.sum(probs * np.log2(probs)))

    labels = np.asarray(label)
    if labels.size == 0:
        # No samples: there is no uncertainty to reduce.
        return 0.0

    column = np.asarray(feature)[:, index]

    # Conditional entropy: entropy of each sub-dataset weighted by the share
    # of samples that carry the corresponding feature value.
    conditional = 0.0
    for value in np.unique(column):
        mask = column == value
        conditional += mask.mean() * entropy(labels[mask])

    return float(entropy(labels) - conditional)
第三关:动手实现ID3决策树
import numpy as np# 计算熵
def calcInfoEntropy(label):
    """Compute the Shannon entropy (base 2) of a collection of labels.

    Args:
        label: 1-D array-like of sample labels.

    Returns:
        The entropy as a ``float``; ``0.0`` when ``label`` is empty or holds
        a single distinct value.
    """
    labels = np.asarray(label)
    if labels.size == 0:
        # An empty sub-dataset carries no uncertainty.
        return 0.0

    # Count every distinct label in one pass instead of rescanning the
    # whole list once per label.
    _, counts = np.unique(labels, return_counts=True)
    probs = counts / labels.size

    # ``0.0 - x`` keeps a single-class result at +0.0 instead of -0.0.
    return 0.0 - float(np.sum(probs * np.log2(probs)))
def calcHDA(feature, label, index, value):
    """Compute one term of the conditional entropy for a feature value.

    Selects the samples whose column ``index`` equals ``value`` and returns
    the entropy of their labels weighted by the fraction of samples selected.

    Args:
        feature: Sequence of sample rows, each indexable by column.
        label: Sequence of labels aligned with ``feature``.
        index: Column of ``feature`` to inspect.
        value: Feature value that selects the sub-dataset.

    Returns:
        ``P(feature[index] == value) * H(labels of the matching samples)``;
        ``0.0`` when ``feature`` is empty.
    """
    total = len(feature)
    if total == 0:
        # Avoid dividing by zero on an empty dataset.
        return 0.0

    # Labels of the samples that carry the requested feature value.
    sub_label = [lab for row, lab in zip(feature, label) if row[index] == value]

    weight = len(sub_label) / total
    return weight * calcInfoEntropy(sub_label)
def calcInfoGain(feature, label, index):
    """Compute the information gain of one feature column.

    Args:
        feature: 2-D array-like of samples (rows) by features (columns).
        label: Sequence of labels aligned with the rows of ``feature``.
        index: Column of ``feature`` to evaluate, e.g. ``0`` for the first
            feature.

    Returns:
        Entropy of ``label`` minus the conditional entropy given column
        ``index``; ``0.0`` when ``feature`` is empty.
    """
    if len(feature) == 0:
        # Nothing to split: slicing a column out of an empty array would fail.
        return 0.0

    base_entropy = calcInfoEntropy(label)

    # Distinct values taken by the selected feature column.
    column_values = set(np.asarray(feature)[:, index])

    # Conditional entropy is the sum of the weighted per-value entropies.
    conditional = 0
    for value in column_values:
        conditional += calcHDA(feature, label, index, value)

    return base_entropy - conditional
def getBestFeature(feature, label):
    """Return the index of the feature with the highest information gain.

    Args:
        feature: Sequence of sample rows; the first row fixes the number of
            feature columns examined.
        label: Sequence of labels aligned with ``feature``.

    Returns:
        Column index of the best feature. Ties keep the lowest index, and
        ``0`` is returned when no feature has a strictly positive gain.
    """
    best_feature = 0
    max_infogain = 0
    for candidate in range(len(feature[0])):
        infogain = calcInfoGain(feature, label, candidate)
        # Strict comparison: an equal gain never displaces an earlier column.
        if infogain > max_infogain:
            max_infogain = infogain
            best_feature = candidate
    return best_feature
def _majority_label(label):
    """Return the most frequent label; ties go to the one seen first."""
    votes = {}
    for item in label:
        votes[item] = votes.get(item, 0) + 1
    # ``max`` returns the first maximal key in insertion order.
    return max(votes, key=votes.get)


def createTree(feature, label):
    """Recursively build an ID3 decision tree.

    Args:
        feature: Sequence of training sample rows.
        label: Sequence of training labels aligned with ``feature``.

    Returns:
        Either a bare label (leaf) or a nested dict of the form
        ``{feature_index: {feature_value: subtree_or_label, ...}}``.
    """
    # All samples share one label: nothing left to split.
    if len(set(label)) == 1:
        return label[0]

    # Only one feature column, or every sample has identical features:
    # settle the node by majority vote.
    if len(feature[0]) == 1 or len(np.unique(feature, axis=0)) == 1:
        return _majority_label(label)

    rows = np.array(feature)
    best_feature = getBestFeature(feature, label)

    # When no feature has positive gain the selector returns column 0, which
    # may be constant; splitting on it would reproduce the same dataset and
    # recurse forever. The rows are not all identical at this point, so at
    # least one column varies: split on the first such column instead.
    if len(set(rows[:, best_feature])) == 1:
        best_feature = next(
            col for col in range(rows.shape[1]) if len(set(rows[:, col])) > 1
        )

    tree = {best_feature: {}}

    # Build one subtree per distinct value of the chosen feature.
    for value in set(rows[:, best_feature]):
        picked = [i for i, row in enumerate(feature) if row[best_feature] == value]
        sub_feature = [feature[i] for i in picked]
        sub_label = [label[i] for i in picked]
        tree[best_feature][value] = createTree(sub_feature, sub_label)

    return tree
def dt_clf(train_feature, train_label, test_feature):
    """Train a decision tree and predict labels for the test samples.

    Args:
        train_feature: Training sample rows.
        train_label: Training labels aligned with ``train_feature``.
        test_feature: Iterable of test sample rows, each indexable by
            feature column.

    Returns:
        ``numpy.ndarray`` with one predicted label per test sample.

    Raises:
        KeyError: If a test sample has a feature value that has no branch at
            the node where it is looked up.
    """
    tree = createTree(train_feature, train_label)

    def classify(node, sample):
        # Internal nodes are single-entry dicts {feature_index: branches};
        # anything that is not a dict is a leaf label.
        while isinstance(node, dict):
            feature_index, branches = next(iter(node.items()))
            node = branches[sample[feature_index]]
        return node

    predict = np.array([classify(tree, sample) for sample in test_feature])
    return predict