决策树模型python代码实现

计算熵函数（二元熵）

# UNQ_C1
# GRADED FUNCTION: compute_entropy
def compute_entropy(y):
    """Compute the binary entropy of the labels at a node.

    Args:
        y (array like): Labels of the examples at the node; entries equal
            to ``1`` are counted as the positive class (edible), every
            other value as the negative class (poisonous).

    Returns:
        entropy (float): Entropy at that node, in bits. It is 0.0 for an
            empty node and for a pure node (all or none of the labels
            equal to 1).
    """
    y = np.asarray(y)

    # An empty node carries no information; returning early also avoids
    # dividing by zero below.
    if len(y) == 0:
        return 0.0

    # Fraction of examples labelled 1.
    p = len(y[y == 1]) / len(y)

    # log2(0) is undefined, so a pure node is handled explicitly
    # (by convention 0 * log2(0) == 0).
    if p == 0 or p == 1:
        return 0.0

    return float(-p * np.log2(p) - (1 - p) * np.log2(1 - p))

根据特征分裂结点

# UNQ_C2
# GRADED FUNCTION: split_dataset
def split_dataset(X, node_indices, feature):
    """Partition the samples at a node by the value of one feature.

    Args:
        X (ndarray):             Data matrix of shape (n_samples, n_features)
        node_indices (iterable): Indices of the samples being considered
                                 at this step.
        feature (int):           Index of the feature to split on

    Returns:
        left_indices (list):  Indices whose feature value == 1
        right_indices (list): All remaining indices
    """
    left_indices, right_indices = [], []

    # Walk the node once, routing each sample index to the side chosen
    # by its feature value; the original ordering is kept on both sides.
    for sample in node_indices:
        bucket = left_indices if X[sample, feature] == 1 else right_indices
        bucket.append(sample)

    return left_indices, right_indices

计算信息增益

# UNQ_C3
# GRADED FUNCTION: compute_information_gain
def compute_information_gain(X, y, node_indices, feature):
    """Compute the information gain of splitting a node on a given feature.

    The gain is the entropy of the node minus the size-weighted entropy
    of the left and right branches produced by ``split_dataset``.

    Args:
        X (ndarray):            Data matrix of shape (n_samples, n_features)
        y (ndarray):            Target variable, indexable by a list of
                                sample indices
        node_indices (ndarray): List containing the active indices, i.e. the
                                samples being considered in this step.
        feature (int):          Index of the feature to split on

    Returns:
        information_gain (float): Reduction in entropy achieved by the
            split; 0.0 when the node contains no samples.
    """
    left_indices, right_indices = split_dataset(X, node_indices, feature)

    n_total = len(left_indices) + len(right_indices)
    # An empty node cannot be split; returning early avoids a division
    # by zero when computing the branch weights.
    if n_total == 0:
        return 0.0

    node_entropy = compute_entropy(y[node_indices])

    # Weighted entropy of the branches. An empty branch has weight 0 and
    # is skipped, so the entropy helper is never called on empty labels.
    weighted_entropy = 0.0
    for branch in (left_indices, right_indices):
        if len(branch) > 0:
            weight = len(branch) / n_total
            weighted_entropy += weight * compute_entropy(y[branch])

    return node_entropy - weighted_entropy

选择最佳分裂特征

# UNQ_C4
# GRADED FUNCTION: get_best_split
def get_best_split(X, y, node_indices):
    """Pick the feature whose split gives the largest information gain.

    Args:
        X (ndarray):            Data matrix of shape (n_samples, n_features)
        y (array like):         list or ndarray with n_samples containing
                                the target variable
        node_indices (ndarray): List containing the active indices, i.e. the
                                samples being considered in this step.

    Returns:
        best_feature (int): Index of the best feature to split on, or -1
            when no feature has an information gain greater than 0. Ties
            are resolved in favour of the lowest feature index.
    """
    best_feature = -1
    top_gain = 0

    # Gains are produced lazily, one feature at a time, in column order.
    gains = (
        compute_information_gain(X, y, node_indices, column)
        for column in range(X.shape[1])
    )

    for column, gain in enumerate(gains):
        # Strict comparison: a later feature must beat the current best.
        if gain > top_gain:
            top_gain, best_feature = gain, column

    return best_feature

递归构建决策树


def build_tree_recursive(X, y, node_indices, branch_name, max_depth, current_depth):
    """Build a tree recursively, splitting the data into 2 subgroups per node.

    This function only prints the tree; each split it performs is also
    recorded in the module-level list ``tree``.
    NOTE(review): ``tree`` is not defined in the visible part of this file;
    it is assumed to be created by the caller's module before this runs.

    A node becomes a leaf when the maximum depth is reached, when it holds
    no samples, or when no feature offers a positive information gain.

    Args:
        X (ndarray):            Data matrix of shape (n_samples, n_features)
        y (array like):         list or ndarray with n_samples containing
                                the target variable
        node_indices (ndarray): List containing the active indices, i.e. the
                                samples being considered in this step.
        branch_name (string):   Name of the branch. ['Root', 'Left', 'Right']
        max_depth (int):        Max depth of the resulting tree.
        current_depth (int):    Current depth. Parameter used during the
                                recursive call.
    """

    def _print_leaf():
        # Report this node as a leaf together with the samples it holds.
        formatting = " "*current_depth + "-"*current_depth
        print(formatting, "%s leaf node with indices" % branch_name, node_indices)

    # Maximum depth reached, or nothing left to split - stop splitting
    if current_depth == max_depth or len(node_indices) == 0:
        _print_leaf()
        return

    # Otherwise, get the best feature to split on at this node
    best_feature = get_best_split(X, y, node_indices)

    # get_best_split returns -1 when no feature has a positive gain. Using
    # -1 as a column index would silently split on the last feature, so
    # the node is treated as a leaf instead.
    if best_feature == -1:
        _print_leaf()
        return

    tree.append((current_depth, branch_name, best_feature, node_indices))

    formatting = "-"*current_depth
    print("%s Depth %d, %s: Split on feature: %d" % (formatting, current_depth, branch_name, best_feature))

    # Split the dataset at the best feature
    left_indices, right_indices = split_dataset(X, node_indices, best_feature)

    # Continue splitting the left and the right child. Increment current depth
    build_tree_recursive(X, y, left_indices, "Left", max_depth, current_depth+1)
    build_tree_recursive(X, y, right_indices, "Right", max_depth, current_depth+1)