{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import csv\n", "import numpy as np # http://www.numpy.org\n", "import ast\n", "from datetime import datetime\n", "from math import log, floor, ceil" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Modify the Utility class's methods. You can also add additional methods as required but don't change existing methods' arguments." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "class Utility(object):\n", " \n", " # This method computes entropy for information gain\n", " def entropy(self, class_y):\n", " # Input: \n", " # class_y : list of class labels (0's and 1's)\n", "\n", " # TODO: Compute the entropy for a list of classes\n", " #\n", " # Example:\n", " # entropy([0,0,0,1,1,1,1,1,1]) = 0.92\n", "\n", " entropy = 0\n", " ### Implement your code here\n", " #############################################\n", " if len(class_y) == 0:\n", " return entropy\n", " p = sum(class_y)*1.0/len(class_y)\n", " for p_val in [p, 1-p]:\n", " if p > 0:\n", " entropy -= p_val*np.log2(p_val)\n", " \n", " #############################################\n", " return entropy\n", "\n", "\n", " def partition_classes(self, X, y, split_attribute, split_val):\n", " # Inputs:\n", " # X : data containing all attributes\n", " # y : labels\n", " # split_attribute : column index of the attribute to split on\n", " # split_val : either a numerical or categorical value to divide the split_attribute\n", "\n", " # TODO: Partition the data(X) and labels(y) based on the split value - BINARY SPLIT.\n", " # \n", " # You will have to first check if the split attribute is numerical or categorical \n", " # If the split attribute is numeric, split_val should be a numerical value\n", " # For example, your split_val could be the mean of the values of split_attribute\n", " # If the split attribute is categorical, split_val should be one of the categories. \n", " #\n", " # You can perform the partition in the following way\n", " # Numeric Split Attribute:\n", " # Split the data X into two lists(X_left and X_right) where the first list has all\n", " # the rows where the split attribute is less than or equal to the split value, and the \n", " # second list has all the rows where the split attribute is greater than the split \n", " # value. Also create two lists(y_left and y_right) with the corresponding y labels.\n", " #\n", " # Categorical Split Attribute:\n", " # Split the data X into two lists(X_left and X_right) where the first list has all \n", " # the rows where the split attribute is equal to the split value, and the second list\n", " # has all the rows where the split attribute is not equal to the split value.\n", " # Also create two lists(y_left and y_right) with the corresponding y labels.\n", "\n", " '''\n", " Example:\n", "\n", " X = [[3, 'aa', 10], y = [1,\n", " [1, 'bb', 22], 1,\n", " [2, 'cc', 28], 0,\n", " [5, 'bb', 32], 0,\n", " [4, 'cc', 32]] 1]\n", "\n", " Here, columns 0 and 2 represent numeric attributes, while column 1 is a categorical attribute.\n", "\n", " Consider the case where we call the function with split_attribute = 0 and split_val = 3 (mean of column 0)\n", " Then we divide X into two lists - X_left, where column 0 is <= 3 and X_right, where column 0 is > 3.\n", "\n", " X_left = [[3, 'aa', 10], y_left = [1,\n", " [1, 'bb', 22], 1,\n", " [2, 'cc', 28]] 0]\n", "\n", " X_right = [[5, 'bb', 32], y_right = [0,\n", " [4, 'cc', 32]] 1]\n", "\n", " Consider another case where we call the function with split_attribute = 1 and split_val = 'bb'\n", " Then we divide X into two lists, one where column 1 is 'bb', and the other where it is not 'bb'.\n", "\n", " X_left = [[1, 'bb', 22], y_left = [1,\n", " [5, 'bb', 32]] 0]\n", "\n", " X_right = [[3, 'aa', 10], y_right = [1,\n", " [2, 'cc', 28], 0,\n", " [4, 'cc', 32]] 1]\n", "\n", " ''' \n", "\n", " X_left = []\n", " X_right = []\n", "\n", " y_left = []\n", " y_right = []\n", " ### Implement your code here\n", " #############################################\n", " for Xp, yp in zip(X, y):\n", " if type(Xp[split_attribute]) == str:\n", " if Xp[split_attribute] == split_val:\n", " X_left.append(Xp)\n", " y_left.append(yp)\n", " else:\n", " X_right.append(Xp)\n", " y_right.append(yp)\n", " else:\n", " if Xp[split_attribute] <= split_val:\n", " X_left.append(Xp)\n", " y_left.append(yp)\n", " else:\n", " X_right.append(Xp)\n", " y_right.append(yp)\n", " \n", " #############################################\n", " return (X_left, X_right, y_left, y_right)\n", "\n", "\n", " def information_gain(self, previous_y, current_y):\n", " # Inputs:\n", " # previous_y: the distribution of original labels (0's and 1's)\n", " # current_y: the distribution of labels after splitting based on a particular\n", " # split attribute and split value\n", "\n", " # TODO: Compute and return the information gain from partitioning the previous_y labels\n", " # into the current_y labels.\n", " # You will need to use the entropy function above to compute information gain\n", " # Reference: http://www.cs.cmu.edu/afs/cs.cmu.edu/academic/class/15381-s06/www/DTs.pdf\n", "\n", " \"\"\"\n", " Example:\n", "\n", " previous_y = [0,0,0,1,1,1]\n", " current_y = [[0,0], [1,1,1,0]]\n", "\n", " info_gain = 0.45915\n", " \"\"\"\n", "\n", " info_gain = 0\n", " ### Implement your code here\n", " #############################################\n", " info_gain = self.entropy(previous_y) -\\\n", " sum([self.entropy(y) * len(y)/len(previous_y) for y in current_y])\n", " #############################################\n", " return info_gain\n", "\n", "\n", " def best_split(self, X, y):\n", " # Inputs:\n", " # X : Data containing all attributes\n", " # y : labels\n", " # TODO: For each node find the best split criteria and return the \n", " # split attribute, spliting value along with \n", " # X_left, X_right, y_left, y_right (using partition_classes)\n", " '''\n", "\n", " NOTE: Just like taught in class, don't use all the features for a node.\n", " Repeat the steps:\n", "\n", " 1. Select m attributes out of d available attributes\n", " 2. Pick the best variable/split-point among the m attributes\n", " 3. return the split attributes, split point, left and right children nodes data \n", "\n", " '''\n", " split_attribute = 0\n", " split_value = 0\n", " X_left, X_right, y_left, y_right = [], [], [], []\n", " ### Implement your code here\n", " #############################################\n", " max_gain = -np.inf\n", " split_val = None\n", " for i in range(len(X[0])):\n", " if type(X[0][i]) == str:\n", " values = np.unique(X[:,i])\n", " for val in values:\n", " X_l, X_r, y_l, y_r = self.partition_classes(X, y, i, val)\n", " split_val = val\n", " else:\n", " mean = np.mean(X[:, i])\n", " X_l, X_r, y_l, y_r = self.partition_classes(X, y, i, mean)\n", " split_val = mean\n", " gain = self.information_gain(y, [y_l, y_r])\n", " if gain > max_gain:\n", " max_gain = gain\n", " split_attribute = i\n", " split_value = split_val\n", " X_left, X_right, y_left, y_right = X_l, X_r, y_l, y_r\n", " #############################################\n", " return X_left, X_right, y_left, y_right, split_attribute, split_value" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Define the classes 'DecisionTree' and 'RandomForest'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Please modify the 'DecisionTree' and 'RandomForest' classes below" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "class DecisionTree(object):\n", " def __init__(self, max_depth):\n", " # Initializing the tree as an empty dictionary or list, as preferred\n", " self.tree = {}\n", " self.max_depth = max_depth\n", " \n", " \t\n", " def learn(self, X, y, par_node = {}, depth=0):\n", " # TODO: Train the decision tree (self.tree) using the the sample X and labels y\n", " # You will have to make use of the functions in Utility class to train the tree\n", "\n", " # Use the function best_split in Utility class to get the best split and \n", " # data corresponding to left and right child nodes\n", " \n", " # One possible way of implementing the tree:\n", " # Each node in self.tree could be in the form of a dictionary:\n", " # https://docs.python.org/2/library/stdtypes.html#mapping-types-dict\n", " # For example, a non-leaf node with two children can have a 'left' key and a \n", " # 'right' key. You can add more keys which might help in classification\n", " # (eg. split attribute and split value)\n", " ### Implement your code here\n", " #############################################\n", " # Base case\n", " if len(y) == 1:\n", " return y[0]\n", " if depth >= self.max_depth:\n", " values, counts = np.unique(y, return_counts=True)\n", " most_frequent = values[np.argmax(counts)]\n", " return most_frequent\n", " \n", " X_left, X_right, y_left, y_right, split_attribute, split_value = Utility().best_split(X, y) \n", " X_left = np.asarray(X_left)\n", " y_left = np.asarray(y_left)\n", " X_right = np.asarray(X_right)\n", " y_right = np.asarray(y_right)\n", " \n", " # No information gain\n", " if len(X_left) == 0 or len(X_right) == 0:\n", " values, counts = np.unique(y, return_counts=True)\n", " most_frequent = values[np.argmax(counts)]\n", " return most_frequent\n", " \n", " par_node = {\n", " 'split_attribute': split_attribute,\n", " 'split_value': split_value,\n", " 'left': self.learn(X_left, y_left, depth=depth+1),\n", " 'right': self.learn(X_right, y_right, depth=depth+1)\n", " }\n", " \n", " # Final case\n", " if depth == 0:\n", " self.tree = par_node\n", " \n", " return par_node\n", " \n", " #############################################\n", "\n", "\n", " def classify(self, record):\n", " # TODO: classify the record using self.tree and return the predicted label\n", " ### Implement your code here\n", " #############################################\n", " node = self.tree\n", " while type(node) == dict:\n", " key = 'left'\n", " if type(record[node['split_attribute']]) == str:\n", " if record[node['split_attribute']] != node['split_value']:\n", " key = 'right'\n", " else:\n", " if record[node['split_attribute']] > node['split_value']: \n", " key = 'right'\n", " node = node[key]\n", " return node\n", " #############################################" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# This starter code does not run. You will have to add your changes and\n", "# turn in code that runs properly.\n", "\n", "\"\"\"\n", "Here, \n", "1. X is assumed to be a matrix with n rows and d columns where n is the\n", "number of total records and d is the number of features of each record. \n", "2. y is assumed to be a vector of labels of length n.\n", "3. XX is similar to X, except that XX also contains the data label for each\n", "record.\n", "\"\"\"\n", "\n", "\"\"\"\n", "This skeleton is provided to help you implement the assignment.You must \n", "implement the existing functions as necessary. You may add new functions\n", "as long as they are called from within the given classes. \n", "\n", "VERY IMPORTANT!\n", "Do NOT change the signature of the given functions.\n", "Do NOT change any part of the main function APART from the forest_size parameter. \n", "\"\"\"\n", "\n", "\n", "class RandomForest(object):\n", " num_trees = 0\n", " decision_trees = []\n", "\n", " # the bootstrapping datasets for trees\n", " # bootstraps_datasets is a list of lists, where each list in bootstraps_datasets is a bootstrapped dataset.\n", " bootstraps_datasets = []\n", "\n", " # the true class labels, corresponding to records in the bootstrapping datasets\n", " # bootstraps_labels is a list of lists, where the 'i'th list contains the labels corresponding to records in\n", " # the 'i'th bootstrapped dataset.\n", " bootstraps_labels = []\n", "\n", " def __init__(self, num_trees):\n", " # Initialization done here\n", " self.num_trees = num_trees\n", " self.decision_trees = [DecisionTree(max_depth=10) for i in range(num_trees)]\n", " self.bootstraps_datasets = []\n", " self.bootstraps_labels = []\n", " \n", " def _bootstrapping(self, XX, n):\n", " # Reference: https://en.wikipedia.org/wiki/Bootstrapping_(statistics)\n", " #\n", " # TODO: Create a sample dataset of size n by sampling with replacement\n", " # from the original dataset XX.\n", " # Note that you would also need to record the corresponding class labels\n", " # for the sampled records for training purposes.\n", " XX = np.asarray(XX)\n", " samples = [] # sampled dataset\n", " labels = [] # class labels for the sampled records\n", " ### Implement your code here\n", " #############################################\n", " for i in range(len(XX)): \n", " row = np.random.randint(0, n-1)\n", " samples.append(XX[row, :-1])\n", " labels.append(XX[row, -1])\n", " #############################################\n", " return (np.asarray(samples), np.asarray(labels))\n", "\n", " def bootstrapping(self, XX):\n", " # Initializing the bootstap datasets for each tree\n", " for i in range(self.num_trees):\n", " data_sample, data_label = self._bootstrapping(XX, len(XX))\n", " self.bootstraps_datasets.append(data_sample)\n", " self.bootstraps_labels.append(data_label)\n", "\n", " def fitting(self):\n", " # TODO: Train `num_trees` decision trees using the bootstraps datasets\n", " # and labels by calling the learn function from your DecisionTree class.\n", " ### Implement your code here\n", " #############################################\n", " for dt, X, y in zip(self.decision_trees, self.bootstraps_datasets, self.bootstraps_labels):\n", " dt.learn(X, y)\n", " #############################################\n", "\n", " def voting(self, X):\n", " y = []\n", "\n", " for record in X:\n", " # Following steps have been performed here:\n", " # 1. Find the set of trees that consider the record as an\n", " # out-of-bag sample.\n", " # 2. Predict the label using each of the above found trees.\n", " # 3. Use majority vote to find the final label for this recod.\n", " votes = []\n", " \n", " for i in range(len(self.bootstraps_datasets)):\n", " dataset = self.bootstraps_datasets[i]\n", " \n", " if len(np.where(dataset == record)) > 0:\n", " OOB_tree = self.decision_trees[i]\n", " effective_vote = OOB_tree.classify(record)\n", " votes.append(effective_vote)\n", "\n", " counts = np.bincount(votes)\n", "\n", " if len(counts) == 0:\n", " # TODO: Special case\n", " # Handle the case where the record is not an out-of-bag sample\n", " # for any of the trees.\n", " # NOTE - you can add few lines of codes above (but inside voting) to make this work\n", " ### Implement your code here\n", " #############################################\n", "# idx = -1\n", " for ds, lbl in zip(self.bootstraps_datasets, self.bootstraps_labels):\n", " indices = np.where(ds == record)\n", " if len(indices) > 0:\n", " y = np.append(y, lbl[indices[0]])\n", " #############################################\n", " else:\n", " y = np.append(y, np.argmax(counts))\n", " \n", " return y\n", "\n", " def user(self):\n", " \"\"\"\n", " :return: string\n", " your GTUsername, NOT your 9-Digit GTId \n", " \"\"\"\n", " ### Implement your code here\n", " #############################################\n", " return 'psrinivasan48'\n", " #############################################" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# TODO: Initialize according to your implementation\n", "# VERY IMPORTANT: Minimum forest_size should be 10\n", "forest_size = 30" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Do not modify the below cell" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "reading the data\n", "__Name: psrinivasan48__\n", "creating the bootstrap datasets\n", "fitting the forest\n", "accuracy: 0.9140\n", "OOB estimate: 0.0860\n", "Execution time: 0:00:04.365937\n" ] } ], "source": [ "# start time \n", "start = datetime.now()\n", "X = list()\n", "y = list()\n", "XX = list() # Contains data features and data labels\n", "numerical_cols = set([i for i in range(0, 9)]) # indices of numeric attributes (columns)\n", "\n", "# Loading data set\n", "print(\"reading the data\")\n", "with open(\"pima-indians-diabetes.csv\") as f:\n", " next(f, None)\n", " for line in csv.reader(f, delimiter=\",\"):\n", " xline = []\n", " for i in range(len(line)):\n", " if i in numerical_cols:\n", " xline.append(ast.literal_eval(line[i]))\n", " else:\n", " xline.append(line[i])\n", "\n", " X.append(xline[:-1])\n", " y.append(xline[-1])\n", " XX.append(xline[:])\n", "\n", "# Initializing a random forest.\n", "randomForest = RandomForest(forest_size)\n", "\n", "# printing the name\n", "print(\"__Name: \" + randomForest.user()+\"__\")\n", "\n", "# Creating the bootstrapping datasets\n", "print(\"creating the bootstrap datasets\")\n", "randomForest.bootstrapping(XX)\n", "\n", "# Building trees in the forest\n", "print(\"fitting the forest\")\n", "randomForest.fitting()\n", "\n", "# Calculating an unbiased error estimation of the random forest\n", "# based on out-of-bag (OOB) error estimate.\n", "y_predicted = randomForest.voting(X)\n", "\n", "# Comparing predicted and true labels\n", "results = [prediction == truth for prediction, truth in zip(y_predicted, y)]\n", "\n", "# Accuracy\n", "accuracy = float(results.count(True)) / float(len(results))\n", "\n", "print(\"accuracy: %.4f\" % accuracy)\n", "print(\"OOB estimate: %.4f\" % (1 - accuracy))\n", "\n", "# end time\n", "print(\"Execution time: \" + str(datetime.now() - start))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.7" } }, "nbformat": 4, "nbformat_minor": 4 }