import numpy as np
from math import log2
from collections import Counter

#export
class Utility(object):

    # This method computes entropy for information gain
    def entropy(self, class_y):
        # Input:
        #   class_y : list of class labels (0's and 1's)

        # TODO: Compute the entropy for a list of classes
        #
        # Example:
        #   entropy([0,0,0,1,1,1,1,1,1]) = 0.918 (rounded to three decimal places)

        entropy = 0
        ### Implement your code here
        #############################################
        # Count how often each label occurs, then sum -p * log2(p) over the
        # class proportions. A node with a single label is pure, so the
        # initial value of 0 already covers that case.
        counts = Counter(class_y)
        n = len(class_y)
        if len(counts) > 1:
            entropy = -sum((count / n) * log2(count / n) for count in counts.values())
        #############################################
        return entropy
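
    # Worked arithmetic for the entropy example above (an illustrative note,
    # not part of the assignment template): for [0,0,0,1,1,1,1,1,1] we have
    # p0 = 3/9 and p1 = 6/9, so
    #   entropy = -(1/3)*log2(1/3) - (2/3)*log2(2/3) ≈ 0.918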

    def partition_classes(self, X, y, split_attribute, split_val):
        # Inputs:
        #   X               : data containing all attributes
        #   y               : labels
        #   split_attribute : column index of the attribute to split on
        #   split_val       : a numerical value to divide the split_attribute

        # TODO: Partition the data (X) and labels (y) based on the split value - BINARY SPLIT.
        #
        # split_val should be a numerical value.
        # For example, your split_val could be the mean of the values of split_attribute.
        #
        # You can perform the partition in the following way:
        # Numeric Split Attribute:
        #   Split the data X into two lists (X_left and X_right) where the first list has all
        #   the rows where the split attribute is less than or equal to the split value, and the
        #   second list has all the rows where the split attribute is greater than the split
        #   value. Also create two lists (y_left and y_right) with the corresponding y labels.

        '''
        Example:

        X = [[3, 10],    y = [1,
             [1, 22],         1,
             [2, 28],         0,
             [5, 32],         0,
             [4, 32]]         1]

        Here, columns 0 and 1 represent numeric attributes.

        Consider the case where we call the function with split_attribute = 0 and
        split_val = 3 (mean of column 0). Then we divide X into two lists - X_left,
        where column 0 is <= 3, and X_right, where column 0 is > 3.

        X_left = [[3, 10],    y_left = [1,
                  [1, 22],              1,
                  [2, 28]]              0]

        X_right = [[5, 32],   y_right = [0,
                   [4, 32]]              1]
        '''

        X_left = []
        X_right = []

        y_left = []
        y_right = []
        ### Implement your code here
        #############################################
        # Walk the rows and labels together, sending each row to the left
        # partition when its split-attribute value is <= split_val and to
        # the right partition otherwise.
        for x_elem, y_elem in zip(X, y):
            if x_elem[split_attribute] <= split_val:
                X_left.append(x_elem)
                y_left.append(y_elem)
            else:
                X_right.append(x_elem)
                y_right.append(y_elem)
        #############################################
        return (X_left, X_right, y_left, y_right)
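
    # A small usage sketch (illustrative only; these variable names are not
    # part of the template). As suggested above, a caller might use a column
    # mean as the split value:
    #   col = [row[0] for row in X]       # column 0 of the example X
    #   mean_val = sum(col) / len(col)    # = 3 for [3, 1, 2, 5, 4]
    #   X_l, X_r, y_l, y_r = Utility().partition_classes(X, y, 0, mean_val)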

    def information_gain(self, previous_y, current_y):
        # Inputs:
        #   previous_y : the distribution of original labels (0's and 1's)
        #   current_y  : the distribution of labels after splitting based on a
        #                particular split attribute and split value

        # TODO: Compute and return the information gain from partitioning the
        # previous_y labels into the current_y labels.
        # You will need to use the entropy function above to compute information gain.
        # Reference: http://www.cs.cmu.edu/afs/cs.cmu.edu/academic/class/15381-s06/www/DTs.pdf

        """
        Example:

        previous_y = [0,0,0,1,1,1]
        current_y = [[0,0], [1,1,1,0]]

        info_gain = 0.45915
        """

        info_gain = 0
        ### Implement your code here
        #############################################
        # Information gain is the parent entropy minus the weighted average
        # of the child entropies, each child weighted by the fraction of
        # samples it receives. The weights are kept unrounded so the result
        # stays accurate.
        H = self.entropy(previous_y)
        H_left = self.entropy(current_y[0])
        P_left = len(current_y[0]) / len(previous_y)
        H_right = self.entropy(current_y[1])
        P_right = len(current_y[1]) / len(previous_y)

        info_gain = H - (H_left * P_left + H_right * P_right)
        #############################################
        return info_gain
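
    # Worked arithmetic for the information_gain example above (illustrative,
    # not part of the template):
    #   H([0,0,0,1,1,1]) = 1.0
    #   H([0,0])         = 0.0    with weight 2/6
    #   H([1,1,1,0])     = 0.811  with weight 4/6
    #   info_gain = 1.0 - (0.0*(2/6) + 0.811*(4/6)) ≈ 0.45915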

    def best_split(self, X, y):
        # Inputs:
        #   X : data containing all attributes
        #   y : labels
        # TODO: For each node find the best split criteria and return the split attribute,
        # splitting value along with X_left, X_right, y_left, y_right (using partition_classes)
        # in the dictionary format {'split_attribute':split_attribute, 'split_val':split_val,
        # 'X_left':X_left, 'X_right':X_right, 'y_left':y_left, 'y_right':y_right, 'info_gain':info_gain}

        '''
        Example:

        X = [[3, 10],    y = [1,
             [1, 22],         1,
             [2, 28],         0,
             [5, 32],         0,
             [4, 32]]         1]

        Starting entropy: 0.971

        Calculate information gain at splits: (In this example, we are testing all
        values in an attribute as a potential split value, but you can experiment
        with different values in your implementation.)

        feature 0:  split_val = 1 --> info_gain = 0.17
                    split_val = 2 --> info_gain = 0.01997
                    split_val = 3 --> info_gain = 0.01997
                    split_val = 4 --> info_gain = 0.32
                    split_val = 5 --> info_gain = 0

                    best info_gain = 0.32, best split_val = 4

        feature 1:  split_val = 10 --> info_gain = 0.17
                    split_val = 22 --> info_gain = 0.41997
                    split_val = 28 --> info_gain = 0.01997
                    split_val = 32 --> info_gain = 0

                    best info_gain = 0.41997, best split_val = 22

        best_split_feature: 1
        best_split_val: 22

        'X_left': [[3, 10], [1, 22]]
        'X_right': [[2, 28], [5, 32], [4, 32]]

        'y_left': [1, 1]
        'y_right': [0, 0, 1]
        '''

        split_attribute = 0
        split_val = 0
        feature_vals = {}
        X_left, X_right, y_left, y_right = [], [], [], []
        ### Implement your code here
        #############################################
        # For every feature column, try each distinct value as a candidate
        # split point and record the value that yields the highest
        # information gain for that feature.
        num_features = len(X[0])
        for feature_num in range(num_features):
            info_gains = []
            nested_dict = {}
            distinct_vals = sorted(set(list(zip(*X))[feature_num]))
            for split_value in distinct_vals:
                X_l, X_r, y_l, y_r = self.partition_classes(X, y, feature_num, split_value)
                info_gains.append(self.information_gain(y, [y_l, y_r]))
            index_max_ig = info_gains.index(max(info_gains))
            nested_dict['split_val'] = distinct_vals[index_max_ig]
            nested_dict['information_gain'] = max(info_gains)
            feature_vals[f'feature_{feature_num}'] = nested_dict

        # Pick the feature whose best split has the highest gain, then
        # re-partition the data on that (attribute, value) pair.
        best_feature = max(feature_vals, key=lambda v: feature_vals[v]['information_gain'])
        split_attribute = int(best_feature.split('_')[1])
        split_val = feature_vals[best_feature]['split_val']
        X_left, X_right, y_left, y_right = self.partition_classes(X, y, split_attribute, split_val)
        info_gain = feature_vals[best_feature]['information_gain']

        final_dict = {
            'split_attribute': split_attribute,
            'split_val': split_val,
            'X_left': X_left,
            'X_right': X_right,
            'y_left': y_left,
            'y_right': y_right,
            'info_gain': info_gain}

        return final_dict
        #############################################
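

# A minimal smoke test assembled from the docstring examples above. The
# __main__ guard and print statements are an assumption added for
# convenience; they are not part of the original assignment template.
if __name__ == '__main__':
    util = Utility()

    # entropy([0,0,0,1,1,1,1,1,1]) should round to 0.918
    print(round(util.entropy([0, 0, 0, 1, 1, 1, 1, 1, 1]), 3))

    X = [[3, 10], [1, 22], [2, 28], [5, 32], [4, 32]]
    y = [1, 1, 0, 0, 1]

    # Splitting on column 0 at value 3 should give
    # X_left = [[3, 10], [1, 22], [2, 28]] and y_left = [1, 1, 0]
    X_left, X_right, y_left, y_right = util.partition_classes(X, y, 0, 3)
    print(X_left, y_left)

    # Expected info_gain ≈ 0.45915
    print(round(util.information_gain([0, 0, 0, 1, 1, 1], [[0, 0], [1, 1, 1, 0]]), 5))

    # Expected best split: attribute 1 at value 22, info_gain ≈ 0.41997
    best = util.best_split(X, y)
    print(best['split_attribute'], best['split_val'], round(best['info_gain'], 5))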