In [1]:
import csv
import numpy as np  # http://www.numpy.org
import ast
from datetime import datetime
from math import log, floor, ceil

Modify the Utility class's methods. You can also add additional methods as required but don't change existing methods' arguments.

In [2]:
class Utility(object):
    
    # This method computes entropy for information gain
    def entropy(self, class_y):
        # Input:            
        #   class_y         : list of class labels (0's and 1's)

        # TODO: Compute the entropy for a list of classes
        #
        # Example:
        #    entropy([0,0,0,1,1,1,1,1,1]) = 0.92

        entropy = 0
        ### Implement your code here
        #############################################
        if len(class_y) == 0:
            return entropy
        p = sum(class_y)*1.0/len(class_y)
        for p_val in [p, 1-p]:
            if p > 0:
                entropy -= p_val*np.log2(p_val)
                
        #############################################
        return entropy


    def partition_classes(self, X, y, split_attribute, split_val):
        # Inputs:
        #   X               : data containing all attributes
        #   y               : labels
        #   split_attribute : column index of the attribute to split on
        #   split_val       : either a numerical or categorical value to divide the split_attribute

        # TODO: Partition the data(X) and labels(y) based on the split value - BINARY SPLIT.
        # 
        # You will have to first check if the split attribute is numerical or categorical    
        # If the split attribute is numeric, split_val should be a numerical value
        # For example, your split_val could be the mean of the values of split_attribute
        # If the split attribute is categorical, split_val should be one of the categories.   
        #
        # You can perform the partition in the following way
        # Numeric Split Attribute:
        #   Split the data X into two lists(X_left and X_right) where the first list has all
        #   the rows where the split attribute is less than or equal to the split value, and the 
        #   second list has all the rows where the split attribute is greater than the split 
        #   value. Also create two lists(y_left and y_right) with the corresponding y labels.
        #
        # Categorical Split Attribute:
        #   Split the data X into two lists(X_left and X_right) where the first list has all 
        #   the rows where the split attribute is equal to the split value, and the second list
        #   has all the rows where the split attribute is not equal to the split value.
        #   Also create two lists(y_left and y_right) with the corresponding y labels.

        '''
        Example:

        X = [[3, 'aa', 10],                 y = [1,
             [1, 'bb', 22],                      1,
             [2, 'cc', 28],                      0,
             [5, 'bb', 32],                      0,
             [4, 'cc', 32]]                      1]

        Here, columns 0 and 2 represent numeric attributes, while column 1 is a categorical attribute.

        Consider the case where we call the function with split_attribute = 0 and split_val = 3 (mean of column 0)
        Then we divide X into two lists - X_left, where column 0 is <= 3  and X_right, where column 0 is > 3.

        X_left = [[3, 'aa', 10],                 y_left = [1,
                  [1, 'bb', 22],                           1,
                  [2, 'cc', 28]]                           0]

        X_right = [[5, 'bb', 32],                y_right = [0,
                   [4, 'cc', 32]]                           1]

        Consider another case where we call the function with split_attribute = 1 and split_val = 'bb'
        Then we divide X into two lists, one where column 1 is 'bb', and the other where it is not 'bb'.

        X_left = [[1, 'bb', 22],                 y_left = [1,
                  [5, 'bb', 32]]                           0]

        X_right = [[3, 'aa', 10],                y_right = [1,
                   [2, 'cc', 28],                           0,
                   [4, 'cc', 32]]                           1]

        ''' 

        X_left = []
        X_right = []

        y_left = []
        y_right = []
        ### Implement your code here
        #############################################
        for Xp, yp in zip(X, y):
            if type(Xp[split_attribute]) == str:
                if Xp[split_attribute] == split_val:
                    X_left.append(Xp)
                    y_left.append(yp)
                else:
                    X_right.append(Xp)
                    y_right.append(yp)
            else:
                if Xp[split_attribute] <= split_val:
                    X_left.append(Xp)
                    y_left.append(yp)
                else:
                    X_right.append(Xp)
                    y_right.append(yp)
                
        #############################################
        return (X_left, X_right, y_left, y_right)


    def information_gain(self, previous_y, current_y):
        # Inputs:
        #   previous_y: the distribution of original labels (0's and 1's)
        #   current_y:  the distribution of labels after splitting based on a particular
        #               split attribute and split value

        # TODO: Compute and return the information gain from partitioning the previous_y labels
        # into the current_y labels.
        # You will need to use the entropy function above to compute information gain
        # Reference: http://www.cs.cmu.edu/afs/cs.cmu.edu/academic/class/15381-s06/www/DTs.pdf

        """
        Example:

        previous_y = [0,0,0,1,1,1]
        current_y = [[0,0], [1,1,1,0]]

        info_gain = 0.45915
        """

        info_gain = 0
        ### Implement your code here
        #############################################
        info_gain = self.entropy(previous_y) -\
                sum([self.entropy(y) * len(y)/len(previous_y) for y in current_y])
        #############################################
        return info_gain


    def best_split(self, X, y):
        # Inputs:
        #   X       : Data containing all attributes
        #   y       : labels
        # TODO: For each node find the best split criteria and return the 
        # split attribute, spliting value along with 
        # X_left, X_right, y_left, y_right (using partition_classes)
        '''

        NOTE: Just like taught in class, don't use all the features for a node.
        Repeat the steps:

        1. Select m attributes out of d available attributes
        2. Pick the best variable/split-point among the m attributes
        3. return the split attributes, split point, left and right children nodes data 

        '''
        split_attribute = 0
        split_value = 0
        X_left, X_right, y_left, y_right = [], [], [], []
        ### Implement your code here
        #############################################
        max_gain = -np.inf
        split_val = None
        for i in range(len(X[0])):
            if type(X[0][i]) == str:
                values = np.unique(X[:,i])
                for val in values:
                    X_l, X_r, y_l, y_r = self.partition_classes(X, y, i, val)
                    split_val = val
            else:
                mean = np.mean(X[:, i])
                X_l, X_r, y_l, y_r = self.partition_classes(X, y, i, mean)
                split_val = mean
            gain = self.information_gain(y, [y_l, y_r])
            if gain > max_gain:
                max_gain = gain
                split_attribute = i
                split_value = split_val
                X_left, X_right, y_left, y_right = X_l, X_r, y_l, y_r
        #############################################
        return X_left, X_right, y_left, y_right, split_attribute, split_value

### Define the classes 'DecisionTree' and 'RandomForest'

Please modify the 'DecisionTree' and 'RandomForest' classes below

In [3]:
class DecisionTree(object):
    def __init__(self, max_depth):
        # Initializing the tree as an empty dictionary or list, as preferred
        self.tree = {}
        self.max_depth = max_depth
        
    	
    def learn(self, X, y, par_node = {}, depth=0):
        # TODO: Train the decision tree (self.tree) using the the sample X and labels y
        # You will have to make use of the functions in Utility class to train the tree

        # Use the function best_split in Utility class to get the best split and 
        # data corresponding to left and right child nodes
        
        # One possible way of implementing the tree:
        #    Each node in self.tree could be in the form of a dictionary:
        #       https://docs.python.org/2/library/stdtypes.html#mapping-types-dict
        #    For example, a non-leaf node with two children can have a 'left' key and  a 
        #    'right' key. You can add more keys which might help in classification
        #    (eg. split attribute and split value)
        ### Implement your code here
        #############################################
        # Base case
        if len(y) == 1:
            return y[0]
        if depth >= self.max_depth:
            values, counts = np.unique(y, return_counts=True)
            most_frequent = values[np.argmax(counts)]
            return most_frequent
        
        X_left, X_right, y_left, y_right, split_attribute, split_value = Utility().best_split(X, y) 
        X_left = np.asarray(X_left)
        y_left = np.asarray(y_left)
        X_right = np.asarray(X_right)
        y_right = np.asarray(y_right)
        
        # No information gain
        if len(X_left) == 0 or len(X_right) == 0:
            values, counts = np.unique(y, return_counts=True)
            most_frequent = values[np.argmax(counts)]
            return most_frequent
        
        par_node = {
                        'split_attribute': split_attribute,
                        'split_value': split_value,
                        'left': self.learn(X_left, y_left, depth=depth+1),
                        'right': self.learn(X_right, y_right, depth=depth+1)
                   }
        
        # Final case
        if depth == 0:
            self.tree = par_node
            
        return par_node
        
        #############################################


    def classify(self, record):
        # TODO: classify the record using self.tree and return the predicted label
        ### Implement your code here
        #############################################
        node = self.tree
        while type(node) == dict:
            key = 'left'
            if type(record[node['split_attribute']]) == str:
                if record[node['split_attribute']] != node['split_value']:
                    key = 'right'
            else:
                if record[node['split_attribute']] > node['split_value']: 
                    key = 'right'
            node = node[key]
        return node
        #############################################

In [4]:
# This starter code does not run. You will have to add your changes and
# turn in code that runs properly.

"""
Here, 
1. X is assumed to be a matrix with n rows and d columns where n is the
number of total records and d is the number of features of each record. 
2. y is assumed to be a vector of labels of length n.
3. XX is similar to X, except that XX also contains the data label for each
record.
"""

"""
This skeleton is provided to help you implement the assignment.You must 
implement the existing functions as necessary. You may add new functions
as long as they are called from within the given classes. 

VERY IMPORTANT!
Do NOT change the signature of the given functions.
Do NOT change any part of the main function APART from the forest_size parameter.  
"""


class RandomForest(object):
    num_trees = 0
    decision_trees = []

    # the bootstrapping datasets for trees
    # bootstraps_datasets is a list of lists, where each list in bootstraps_datasets is a bootstrapped dataset.
    bootstraps_datasets = []

    # the true class labels, corresponding to records in the bootstrapping datasets
    # bootstraps_labels is a list of lists, where the 'i'th list contains the labels corresponding to records in
    # the 'i'th bootstrapped dataset.
    bootstraps_labels = []

    def __init__(self, num_trees):
        # Initialization done here
        self.num_trees = num_trees
        self.decision_trees = [DecisionTree(max_depth=10) for i in range(num_trees)]
        self.bootstraps_datasets = []
        self.bootstraps_labels = []
        
    def _bootstrapping(self, XX, n):
        # Reference: https://en.wikipedia.org/wiki/Bootstrapping_(statistics)
        #
        # TODO: Create a sample dataset of size n by sampling with replacement
        #       from the original dataset XX.
        # Note that you would also need to record the corresponding class labels
        # for the sampled records for training purposes.
        XX = np.asarray(XX)
        samples = [] # sampled dataset
        labels = []  # class labels for the sampled records
        ### Implement your code here
        #############################################
        for i in range(len(XX)):    
            row = np.random.randint(0, n-1)
            samples.append(XX[row, :-1])
            labels.append(XX[row, -1])
        #############################################
        return (np.asarray(samples), np.asarray(labels))

    def bootstrapping(self, XX):
        # Initializing the bootstap datasets for each tree
        for i in range(self.num_trees):
            data_sample, data_label = self._bootstrapping(XX, len(XX))
            self.bootstraps_datasets.append(data_sample)
            self.bootstraps_labels.append(data_label)

    def fitting(self):
        # TODO: Train `num_trees` decision trees using the bootstraps datasets
        # and labels by calling the learn function from your DecisionTree class.
        ### Implement your code here
        #############################################
        for dt, X, y in zip(self.decision_trees, self.bootstraps_datasets, self.bootstraps_labels):
            dt.learn(X, y)
        #############################################

    def voting(self, X):
        y = []

        for record in X:
            # Following steps have been performed here:
            #   1. Find the set of trees that consider the record as an
            #      out-of-bag sample.
            #   2. Predict the label using each of the above found trees.
            #   3. Use majority vote to find the final label for this recod.
            votes = []
            
            for i in range(len(self.bootstraps_datasets)):
                dataset = self.bootstraps_datasets[i]
                
                if len(np.where(dataset == record)) > 0:
                    OOB_tree = self.decision_trees[i]
                    effective_vote = OOB_tree.classify(record)
                    votes.append(effective_vote)

            counts = np.bincount(votes)

            if len(counts) == 0:
                # TODO: Special case
                #  Handle the case where the record is not an out-of-bag sample
                #  for any of the trees.
                # NOTE - you can add few lines of codes above (but inside voting) to make this work
                ### Implement your code here
                #############################################
#                 idx = -1
                for ds, lbl in zip(self.bootstraps_datasets, self.bootstraps_labels):
                    indices = np.where(ds == record)
                    if len(indices) > 0:
                        y = np.append(y, lbl[indices[0]])
                #############################################
            else:
                y = np.append(y, np.argmax(counts))
                
        return y

    def user(self):
        """
        :return: string
        your GTUsername, NOT your 9-Digit GTId  
        """
        ### Implement your code here
        #############################################
        return 'psrinivasan48'
        #############################################

In [5]:
# TODO: Initialize according to your implementation
# VERY IMPORTANT: Minimum forest_size should be 10
forest_size = 30

### Do not modify the below cell

In [6]:
# start time 
start = datetime.now()
X = list()
y = list()
XX = list()  # Contains data features and data labels
numerical_cols = set([i for i in range(0, 9)])  # indices of numeric attributes (columns)

# Loading data set
print("reading the data")
with open("pima-indians-diabetes.csv") as f:
    next(f, None)
    for line in csv.reader(f, delimiter=","):
        xline = []
        for i in range(len(line)):
            if i in numerical_cols:
                xline.append(ast.literal_eval(line[i]))
            else:
                xline.append(line[i])

        X.append(xline[:-1])
        y.append(xline[-1])
        XX.append(xline[:])

# Initializing a random forest.
randomForest = RandomForest(forest_size)

# printing the name
print("__Name: " + randomForest.user()+"__")

# Creating the bootstrapping datasets
print("creating the bootstrap datasets")
randomForest.bootstrapping(XX)

# Building trees in the forest
print("fitting the forest")
randomForest.fitting()

# Calculating an unbiased error estimation of the random forest
# based on out-of-bag (OOB) error estimate.
y_predicted = randomForest.voting(X)

# Comparing predicted and true labels
results = [prediction == truth for prediction, truth in zip(y_predicted, y)]

# Accuracy
accuracy = float(results.count(True)) / float(len(results))

print("accuracy: %.4f" % accuracy)
print("OOB estimate: %.4f" % (1 - accuracy))

# end time
print("Execution time: " + str(datetime.now() - start))

reading the data
__Name: psrinivasan48__
creating the bootstrap datasets
fitting the forest
accuracy: 0.9140
OOB estimate: 0.0860
Execution time: 0:00:04.365937
