{ "cells": [ { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "#export\n", "import csv\n", "import numpy as np # http://www.numpy.org\n", "import ast\n", "from datetime import datetime\n", "from math import log, floor, ceil, log2\n", "import random" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Modify the Utility class's methods. You may also add methods as required, but don't change the existing methods' arguments." ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "#export\n", "class Utility(object):\n", " \n", " # This method computes entropy for information gain\n", " def entropy(self, class_y):\n", " # Input: \n", " # class_y : list of class labels (0's and 1's)\n", "\n", " # TODO: Compute the entropy for a list of classes\n", " #\n", " # Example:\n", " # entropy([0,0,0,1,1,1,1,1,1]) = 0.918 (rounded to three decimal places)\n", "\n", " entropy = 0\n", " ### Implement your code here\n", " #############################################\n", " # Count each label once with np.unique, turn the counts into\n", " # probabilities, and apply H(y) = -p0*log2(p0) - p1*log2(p1).\n", " # A list containing a single class has zero entropy.\n", " if len(set(class_y)) > 1:\n", " counts = np.unique(class_y, return_counts=True)[1]\n", " p0 = counts[0]/len(class_y)\n", " p1 = counts[1]/len(class_y)\n", " entropy = -p0*log2(p0) - p1*log2(p1)\n", " #############################################\n", " return entropy\n", "\n", "\n",
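 " # Worked version of the example above, for reference: for\n", " # [0,0,0,1,1,1,1,1,1], p0 = 3/9 and p1 = 6/9, so\n", " # H = -(1/3)*log2(1/3) - (2/3)*log2(2/3) = 0.52832 + 0.38998 = 0.918\n", "\n",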
 " def partition_classes(self, X, y, split_attribute, split_val):\n", " # Inputs:\n", " # X : data containing all attributes\n", " # y : labels\n", " # split_attribute : column index of the attribute to split on\n", " # split_val : a numerical value to divide the split_attribute\n", "\n", " # TODO: Partition the data (X) and labels (y) based on the split value - BINARY SPLIT.\n", " #\n", " # split_val should be a numerical value\n", " # For example, your split_val could be the mean of the values of split_attribute\n", " #\n", " # You can perform the partition in the following way:\n", " # Numeric Split Attribute:\n", " # Split the data X into two lists (X_left and X_right), where the first list has all\n", " # the rows where the split attribute is less than or equal to the split value, and the\n", " # second list has all the rows where the split attribute is greater than the split\n", " # value. Also create two lists (y_left and y_right) with the corresponding y labels.\n", "\n", " '''\n", " Example:\n", "\n", " X = [[3, 10], y = [1,\n", " [1, 22], 1,\n", " [2, 28], 0,\n", " [5, 32], 0,\n", " [4, 32]] 1]\n", "\n", " Here, columns 0 and 1 represent numeric attributes.\n", "\n", " Consider the case where we call the function with split_attribute = 0 and split_val = 3 (mean of column 0).\n", " Then we divide X into two lists - X_left, where column 0 is <= 3, and X_right, where column 0 is > 3.\n", "\n", " X_left = [[3, 10], y_left = [1,\n", " [1, 22], 1,\n", " [2, 28]] 0]\n", "\n", " X_right = [[5, 32], y_right = [0,\n", " [4, 32]] 1]\n", " '''\n", "\n", " X_left = []\n", " X_right = []\n", "\n", " y_left = []\n", " y_right = []\n", " ### Implement your code here\n", " #############################################\n", " # Route each (row, label) pair left or right by comparing the chosen\n", " # attribute against the split value.\n", " for x_elem, y_elem in zip(X, y):\n", " if x_elem[split_attribute] <= split_val:\n", " X_left.append(x_elem)\n", " y_left.append(y_elem)\n", " else:\n", " X_right.append(x_elem)\n", " y_right.append(y_elem)\n", " #############################################\n", " return (X_left, X_right, y_left, y_right)\n", "\n", "\n", " def information_gain(self, previous_y, current_y):\n", " # Inputs:\n", " # previous_y: the distribution of original labels (0's and 1's)\n", " # current_y: the distribution of labels after splitting based on a particular\n", " # split attribute and split value\n", "\n", " # TODO: Compute and return the information gain from partitioning the previous_y labels\n", " # into the current_y labels.\n", " # You will need to use the entropy function above to compute information gain\n", " # Reference: http://www.cs.cmu.edu/afs/cs.cmu.edu/academic/class/15381-s06/www/DTs.pdf\n", "\n", " \"\"\"\n", " Example:\n", "\n", " previous_y = [0,0,0,1,1,1]\n", " current_y = [[0,0], [1,1,1,0]]\n", "\n", " info_gain = 0.45915\n", " \"\"\"\n", "\n", " info_gain = 0\n", " ### Implement your code here\n", " #############################################\n", " # Information gain = parent entropy minus the size-weighted average of\n", " # the child entropies. The weights are left unrounded; rounding them\n", " # would skew the gain (the example above would come out as 0.4589).\n", " H = self.entropy(previous_y)\n", " HL = self.entropy(current_y[0])\n", " PL = len(current_y[0])/len(previous_y)\n", " HR = self.entropy(current_y[1])\n", " PR = len(current_y[1])/len(previous_y)\n", " info_gain = H - (HL*PL + HR*PR)\n", " #############################################\n", " return info_gain\n", "\n", "\n",
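 " # Worked version of the example above, for reference: H(previous_y) = 1.0,\n", " # the children contribute (2/6)*H([0,0]) = 0 and (4/6)*H([1,1,1,0]) = (4/6)*0.81128,\n", " # so info_gain = 1.0 - 0.54085 = 0.45915.\n", "\n",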
 " def best_split(self, X, y):\n", " # Inputs:\n", " # X : Data containing all attributes\n", " # y : labels\n", " # TODO: For each node find the best split criteria and return the split attribute,\n", " # splitting value along with X_left, X_right, y_left, y_right (using partition_classes)\n", " # in the dictionary format {'split_attribute':split_attribute, 'split_val':split_val,\n", " # 'X_left':X_left, 'X_right':X_right, 'y_left':y_left, 'y_right':y_right, 'info_gain':info_gain}\n", " '''\n", " Example:\n", "\n", " X = [[3, 10], y = [1,\n", " [1, 22], 1,\n", " [2, 28], 0,\n", " [5, 32], 0,\n", " [4, 32]] 1]\n", "\n", " Starting entropy: 0.971\n", "\n", " Calculate information gain at splits: (In this example, we are testing all values in an\n", " attribute as a potential split value, but you can experiment with different values in your implementation)\n", "\n", " feature 0: --> split_val = 1 --> info_gain = 0.17\n", " split_val = 2 --> info_gain = 0.01997\n", " split_val = 3 --> info_gain = 0.01997\n", " split_val = 4 --> info_gain = 0.32\n", " split_val = 5 --> info_gain = 0\n", "\n", " best info_gain = 0.32, best split_val = 4\n", "\n", "\n", " feature 1: --> split_val = 10 --> info_gain = 0.17\n", " split_val = 22 --> info_gain = 0.41997\n", " split_val = 28 --> info_gain = 0.01997\n", " split_val = 32 --> info_gain = 0\n", "\n", " best info_gain = 0.41997, best split_val = 22\n", "\n", "\n", " best_split_feature: 1\n", " best_split_val: 22\n", "\n", " 'X_left': [[3, 10], [1, 22]]\n", " 'X_right': [[2, 28], [5, 32], [4, 32]]\n", "\n", " 'y_left': [1, 1]\n", " 'y_right': [0, 0, 1]\n", " '''\n", "\n", " split_attribute = 0\n", " split_val = 0\n", " X_left, X_right, y_left, y_right = [], [], [], []\n", " ### Implement your code here\n", " #############################################\n", " # X holds feature columns only (the labels arrive separately in y), so\n", " # every column is a candidate split attribute. For each feature, try\n", " # every distinct value as a split point and keep the global best.\n", " num_features = len(X[0])\n", " best_feature, best_split_val, best_info_gain = 0, 0, -1\n", " for feature_num in range(num_features):\n", " distinct_vals = sorted(set(row[feature_num] for row in X))\n", " for split_value in distinct_vals:\n", " X_l, X_r, y_l, y_r = self.partition_classes(X, y, feature_num, split_value)\n", " ig = self.information_gain(y, [y_l, y_r])\n", " if ig > best_info_gain:\n", " best_feature, best_split_val, best_info_gain = feature_num, split_value, ig\n", " X_left, X_right, y_left, y_right = self.partition_classes(X, y, best_feature, best_split_val)\n", " return {\n", " 'split_attribute': best_feature,\n", " 'split_val': best_split_val,\n", " 'X_left': X_left,\n", " 'X_right': X_right,\n", " 'y_left': y_left,\n", " 'y_right': y_right,\n", " 'info_gain': best_info_gain}\n", " #############################################\n", " " ] },
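{ "cell_type": "markdown", "metadata": {}, "source": [ "A quick, optional sanity check of the `Utility` methods on the toy example from the `best_split` docstring. This cell is illustrative only: it has no `#export` marker, so it is not included in the generated submission." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Illustrative only (not exported): reproduce the best_split docstring example.\n", "_util = Utility()\n", "_X_toy = [[3, 10], [1, 22], [2, 28], [5, 32], [4, 32]]\n", "_y_toy = [1, 1, 0, 0, 1]\n", "_result = _util.best_split(_X_toy, _y_toy)\n", "# Expected: split_attribute = 1, split_val = 22, info_gain ~= 0.41997\n", "print(_result['split_attribute'], _result['split_val'], round(_result['info_gain'], 5))" ] },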
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Define the classes 'DecisionTree' and 'RandomForest'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Please modify the 'DecisionTree' and 'RandomForest' classes below" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "#export\n", "class DecisionTree(object):\n", " def __init__(self, max_depth):\n", " # Initializing the tree as an empty dictionary or list, as preferred\n", " self.tree = {}\n", " self.max_depth = max_depth\n", "\n", " def learn(self, X, y, par_node = {}, depth=0):\n", " # TODO: Train the decision tree (self.tree) using the sample X and labels y\n", " # You will have to make use of the functions in the Utility class to train the tree\n", "\n", " # par_node is a parameter that is useful for passing additional information\n", " # to recursive calls of the learn method. It's not mandatory to use this parameter.\n", "\n", " # Use the function best_split in the Utility class to get the best split and\n", " # the data corresponding to the left and right child nodes\n", "\n", " # One possible way of implementing the tree:\n", " # Each node in self.tree could be in the form of a dictionary:\n", " # https://docs.python.org/2/library/stdtypes.html#mapping-types-dict\n", " # For example, a non-leaf node with two children can have a 'left' key and a\n", " # 'right' key. You can add more keys which might help in classification\n", " # (eg. split attribute and split value)\n", " ### Implement your code here\n", " #############################################\n", " utils = Utility()\n", " # Majority class at this node (labels are 0/1); leaves predict this value.\n", " num_samples_per_class = [np.sum(np.array(y) == i) for i in range(2)]\n", " value = np.argmax(num_samples_per_class)\n", " node = Node(value=value)\n", "\n", " if depth < self.max_depth:\n", " result_dict = utils.best_split(X, y)\n", " node.feature = result_dict['split_attribute']\n", " node.threshold = result_dict['split_val']\n", " node.entropy = utils.entropy(y)\n", " # The root is registered first, so classify() can start at 'node_1'.\n", " self.tree[f'node_{len(self.tree)+1}'] = node\n", " # Recurse into both child nodes while the labels are still impure\n", " if node.entropy != 0:\n", " new_depth = depth + 1\n", " node.left = self.learn(result_dict['X_left'], result_dict['y_left'], None, new_depth)\n", " node.right = self.learn(result_dict['X_right'], result_dict['y_right'], None, new_depth)\n", "\n", " return node\n", " #############################################\n", "\n", "\n", " def classify(self, record):\n", " # TODO: classify the record using self.tree and return the predicted label\n", " ### Implement your code here\n", " #############################################\n", " # Walk from the root, descending left when the record's split-attribute\n", " # value is <= the threshold (matching the <= used by partition_classes),\n", " # until a leaf (a node with no children) is reached.\n", " node = self.tree['node_1']\n", " while node.left:\n", " if record[node.feature] <= node.threshold:\n", " node = node.left\n", " else:\n", " node = node.right\n", " return node.value" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "#export\n", "# This starter code does not run. You will have to add your changes and\n", "# turn in code that runs properly.\n", "\n", "\"\"\"\n", "Here,\n", "1. X is assumed to be a matrix with n rows and d columns where n is the\n", "number of total records and d is the number of features of each record.\n", "2. y is assumed to be a vector of labels of length n.\n", "3. XX is similar to X, except that XX also contains the data label for each\n", "record.\n", "\"\"\"\n", "\n", "\"\"\"\n", "This skeleton is provided to help you implement the assignment. You must\n", "implement the existing functions as necessary. You may add new functions\n", "as long as they are called from within the given classes.\n", "\n", "VERY IMPORTANT!\n", "Do NOT change the signature of the given functions.\n", "Do NOT change any part of the main function APART from the forest_size parameter.\n", "\"\"\"\n", "\n", "\n",
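"# Background note: a bootstrap sample of size n is drawn with replacement, so any\n", "# given record is left out of it with probability (1 - 1/n)^n, which approaches\n", "# 1/e (about 36.8%) for large n. Those left-out records are the out-of-bag (OOB)\n", "# samples that voting() relies on.\n", "\n", "\n",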
"class RandomForest(object):\n", " num_trees = 0\n", " decision_trees = []\n", "\n", " # the bootstrapping datasets for trees\n", " # bootstraps_datasets is a list of lists, where each list in bootstraps_datasets is a bootstrapped dataset.\n", " bootstraps_datasets = []\n", "\n", " # the true class labels, corresponding to records in the bootstrapping datasets\n", " # bootstraps_labels is a list of lists, where the 'i'th list contains the labels corresponding to records in\n", " # the 'i'th bootstrapped dataset.\n", " bootstraps_labels = []\n", "\n", " def __init__(self, num_trees):\n", " # Initialization done here\n", " self.num_trees = num_trees\n", " self.decision_trees = [DecisionTree(max_depth=10) for i in range(num_trees)]\n", " self.bootstraps_datasets = []\n", " self.bootstraps_labels = []\n", "\n", " def _bootstrapping(self, XX, n):\n", " # Reference: https://en.wikipedia.org/wiki/Bootstrapping_(statistics)\n", " #\n", " # TODO: Create a sample dataset of size n by sampling with replacement\n", " # from the original dataset XX.\n", " # Note that you would also need to record the corresponding class labels\n", " # for the sampled records for training purposes.\n", "\n", " sample = [] # sampled dataset\n", " labels = [] # class labels for the sampled records\n", " ### Implement your code here\n", " #############################################\n", " # Draw n row indices with replacement. np.random (rather than the random\n", " # module) is used so that np.random.seed(get_random_seed()) in run()\n", " # actually makes the bootstrap samples reproducible.\n", " for idx in np.random.randint(0, len(XX), n):\n", " record = XX[idx]\n", " sample.append(record[:-1]) # every column except the label\n", " labels.append(record[-1]) # the label is the last column\n", " #############################################\n", " return (sample, labels)\n", "\n", " def bootstrapping(self, XX):\n", " # Initializing the bootstrap datasets for each tree\n", " for i in range(self.num_trees):\n", " data_sample, data_label = self._bootstrapping(XX, len(XX))\n", " self.bootstraps_datasets.append(data_sample)\n", " self.bootstraps_labels.append(data_label)\n", "\n", " def fitting(self):\n", " # TODO: Train `num_trees` decision trees using the bootstraps datasets\n", " # and labels by calling the learn function from your DecisionTree class.\n", " ### Implement your code here\n", " #############################################\n", " for tree, X, y in zip(self.decision_trees, self.bootstraps_datasets, self.bootstraps_labels):\n", " tree.learn(X, y)\n", " #############################################\n", "\n", " def voting(self, X):\n", " y = []\n", "\n", " for record in X:\n", " # The following steps are performed here:\n", " # 1. Find the set of trees that consider the record as an\n", " # out-of-bag sample.\n", " # 2. Predict the label using each of the above found trees.\n", " # 3. Use majority vote to find the final label for this record.\n",
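 " # (np.bincount tallies the votes per label and np.argmax returns the\n", " # first maximal count, so ties resolve toward the smaller label.)\n",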
 " votes = []\n", "\n", " for i in range(len(self.bootstraps_datasets)):\n", " dataset = self.bootstraps_datasets[i]\n", "\n", " if record not in dataset:\n", " OOB_tree = self.decision_trees[i]\n", " effective_vote = OOB_tree.classify(record)\n", " votes.append(effective_vote)\n", "\n", " counts = np.bincount(votes)\n", "\n", " if len(counts) == 0:\n", " # TODO: Special case\n", " # Handle the case where the record is not an out-of-bag sample\n", " # for any of the trees.\n", " # NOTE - you can add a few lines of code above (but inside voting) to make this work\n", " ### Implement your code here\n", " #############################################\n", " # The record appeared in every bootstrap sample, so no tree treats it\n", " # as out-of-bag; fall back to a majority vote over all the trees.\n", " votes = [tree.classify(record) for tree in self.decision_trees]\n", " y = np.append(y, np.argmax(np.bincount(votes)))\n", " #############################################\n", " else:\n", " y = np.append(y, np.argmax(counts))\n", "\n", " return y\n", "\n", " def user(self):\n", " \"\"\"\n", " :return: string\n", " your GTUsername, NOT your 9-Digit GTId\n", " \"\"\"\n", " ### Implement your code here\n", " #############################################\n", " return 'mpearl3'\n", " #############################################\n", "\n", "class Node:\n", " # A single tree node: 'feature'/'threshold' describe the split, 'left'/'right'\n", " # are the child nodes (None for leaves), 'value' is the majority class, and\n", " # 'entropy' is the label entropy at this node.\n", " def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None, entropy=None):\n", " self.feature = feature\n", " self.threshold = threshold\n", " self.left = left\n", " self.right = right\n", " self.value = value\n", " self.entropy = entropy" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "#export\n", "\n", "# TODO: Determine the forest size according to your implementation.\n", "# This function will be used by the autograder to set your forest size during testing\n", "# VERY IMPORTANT: Minimum forest_size should be 10\n", "def get_forest_size():\n", " forest_size = 10\n", " return forest_size\n", "\n", "# TODO: Determine the random seed to set for reproducibility\n", "# This function will be used by the autograder to set the random seed to obtain the same results you achieve locally\n", "def get_random_seed():\n", " random_seed = 0\n", " return random_seed" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Do not modify the below cell\n", "The cell below is provided to test that your random forest classifier can be successfully built and run. Similar steps will be used to build and run your code in Gradescope. Any additional testing of functions can be done in the cells below the `%run helpers/notebook2script submission` cell, as these will not be parsed by the autograder."
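, "\n\nFor reference, `run()` reads `pima-indians-diabetes.csv`, builds the bootstrap datasets, fits the forest, and reports the resulting accuracy together with the corresponding OOB error estimate."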
] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "def run():\n", " np.random.seed(get_random_seed())\n", " # start time \n", " start = datetime.now()\n", " X = list()\n", " y = list()\n", " XX = list() # Contains data features and data labels\n", " numerical_cols = set([i for i in range(0, 9)]) # indices of numeric attributes (columns)\n", "\n", " # Loading data set\n", " print(\"reading the data\")\n", " with open(\"pima-indians-diabetes.csv\") as f:\n", " next(f, None)\n", " for line in csv.reader(f, delimiter=\",\"):\n", " xline = []\n", " for i in range(len(line)):\n", " if i in numerical_cols:\n", " xline.append(ast.literal_eval(line[i]))\n", " else:\n", " xline.append(line[i])\n", "\n", " X.append(xline[:-1])\n", " y.append(xline[-1])\n", " XX.append(xline[:])\n", "\n", " # Initializing a random forest.\n", " randomForest = RandomForest(get_forest_size())\n", "\n", " # printing the name\n", " print(\"__Name: \" + randomForest.user()+\"__\")\n", "\n", " # Creating the bootstrapping datasets\n", " print(\"creating the bootstrap datasets\")\n", " randomForest.bootstrapping(XX)\n", "\n", " # Building trees in the forest\n", " print(\"fitting the forest\")\n", " randomForest.fitting()\n", "\n", " # Calculating an unbiased error estimation of the random forest\n", " # based on out-of-bag (OOB) error estimate.\n", " y_predicted = randomForest.voting(X)\n", "\n", " # Comparing predicted and true labels\n", " results = [prediction == truth for prediction, truth in zip(y_predicted, y)]\n", "\n", " # Accuracy\n", " accuracy = float(results.count(True)) / float(len(results))\n", "\n", " print(\"accuracy: %.4f\" % accuracy)\n", " print(\"OOB estimate: %.4f\" % (1 - accuracy))\n", "\n", " # end time\n", " print(\"Execution time: \" + str(datetime.now() - start))" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Late Policy:\n", "\n", " \"I have read the late policy for CS6424.\"\n", " \n", "\n", "\n", "Honor Pledge:\n", "\n", " \"I have read the Collaboration and Academic Honesty policy for CS6424.\n", " I certify that I have or will use outside references only in accordance with\n", " this policy, that I have or will cite any such references via code comments,\n", " and that I have not or will not copy any portion of my submission from another\n", " past or current student.\"\n", "\n", " \n", "\n", "\n", "Converted Q2.ipynb to submission\\submission.py\n" ] } ], "source": [ " %run helpers/notebook2script submission" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "reading the data\n", "__Name: mpearl3__\n", "creating the bootstrap datasets\n", "fitting the forest\n", "accuracy: 0.6458\n", "OOB estimate: 0.3542\n", "Execution time: 0:00:41.224284\n" ] } ], "source": [ "# Call the run() function to test your implementation\n", "# Use this cell and any cells below for additional testing\n", "run()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 4 }