590 lines
24 KiB
Plaintext
590 lines
24 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import csv\n",
|
|
"import numpy as np # http://www.numpy.org\n",
|
|
"import ast\n",
|
|
"from datetime import datetime\n",
|
|
"from math import log, floor, ceil"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Modify the Utility class's methods. You can also add additional methods as required but don't change existing methods' arguments."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"class Utility(object):\n",
|
|
" \n",
|
|
" # This method computes entropy for information gain\n",
|
|
" def entropy(self, class_y):\n",
|
|
" # Input: \n",
|
|
" # class_y : list of class labels (0's and 1's)\n",
|
|
"\n",
|
|
" # TODO: Compute the entropy for a list of classes\n",
|
|
" #\n",
|
|
" # Example:\n",
|
|
" # entropy([0,0,0,1,1,1,1,1,1]) = 0.92\n",
|
|
"\n",
|
|
" entropy = 0\n",
|
|
" ### Implement your code here\n",
|
|
" #############################################\n",
|
|
" \n",
|
|
" p = np.mean(class_y)\n",
|
|
" if p == 0 or p == 1: return 0\n",
|
|
" entropy = - p * log(p) - (1 - p) * log(1 - p)\n",
|
|
" \n",
|
|
" #############################################\n",
|
|
" return entropy\n",
|
|
"\n",
|
|
"\n",
|
|
" def partition_classes(self, X, y, split_attribute, split_val):\n",
|
|
" # Inputs:\n",
|
|
" # X : data containing all attributes\n",
|
|
" # y : labels\n",
|
|
" # split_attribute : column index of the attribute to split on\n",
|
|
" # split_val : a numerical value to divide the split_attribute\n",
|
|
"\n",
|
|
" \n",
|
|
"\n",
|
|
" # TODO: Partition the data(X) and labels(y) based on the split value - BINARY SPLIT.\n",
|
|
" # \n",
|
|
" # Split_val should be a numerical value\n",
|
|
" # For example, your split_val could be the mean of the values of split_attribute\n",
|
|
" #\n",
|
|
" # You can perform the partition in the following way\n",
|
|
" # Numeric Split Attribute:\n",
|
|
" # Split the data X into two lists(X_left and X_right) where the first list has all\n",
|
|
" # the rows where the split attribute is less than or equal to the split value, and the \n",
|
|
" # second list has all the rows where the split attribute is greater than the split \n",
|
|
" # value. Also create two lists(y_left and y_right) with the corresponding y labels.\n",
|
|
"\n",
|
|
" \n",
|
|
"\n",
|
|
" '''\n",
|
|
" Example:\n",
|
|
"\n",
|
|
" \n",
|
|
"\n",
|
|
" X = [[3, 10], y = [1,\n",
|
|
" [1, 22], 1,\n",
|
|
" [2, 28], 0,\n",
|
|
" [5, 32], 0,\n",
|
|
" [4, 32]] 1]\n",
|
|
"\n",
|
|
" \n",
|
|
"\n",
|
|
" Here, columns 0 and 1 represent numeric attributes.\n",
|
|
"\n",
|
|
" \n",
|
|
"\n",
|
|
" Consider the case where we call the function with split_attribute = 0 and split_val = 3 (mean of column 0)\n",
|
|
" Then we divide X into two lists - X_left, where column 0 is <= 3 and X_right, where column 0 is > 3.\n",
|
|
"\n",
|
|
" \n",
|
|
"\n",
|
|
" X_left = [[3, 10], y_left = [1,\n",
|
|
" [1, 22], 1,\n",
|
|
" [2, 28]] 0]\n",
|
|
"\n",
|
|
" \n",
|
|
"\n",
|
|
" X_right = [[5, 32], y_right = [0,\n",
|
|
" [4, 32]] 1]\n",
|
|
"\n",
|
|
" \n",
|
|
"\n",
|
|
" ''' \n",
|
|
"\n",
|
|
" X_left = []\n",
|
|
" X_right = []\n",
|
|
"\n",
|
|
" y_left = []\n",
|
|
" y_right = []\n",
|
|
" ### Implement your code here\n",
|
|
" #############################################\n",
|
|
" left = X[:, split_attribute] <= split_val\n",
|
|
" right = X[:, split_attribute] > split_val\n",
|
|
" X_left = X[left, :] \n",
|
|
" X_right = X[right, :]\n",
|
|
" y_left = y[left]\n",
|
|
" y_right = y[right]\n",
|
|
" \n",
|
|
" #############################################\n",
|
|
" return (X_left, X_right, y_left, y_right)\n",
|
|
"\n",
|
|
"\n",
|
|
" def information_gain(self, previous_y, current_y):\n",
|
|
" # Inputs:\n",
|
|
" # previous_y: the distribution of original labels (0's and 1's)\n",
|
|
" # current_y: the distribution of labels after splitting based on a particular\n",
|
|
" # split attribute and split value\n",
|
|
"\n",
|
|
" # TODO: Compute and return the information gain from partitioning the previous_y labels\n",
|
|
" # into the current_y labels.\n",
|
|
" # You will need to use the entropy function above to compute information gain\n",
|
|
" # Reference: http://www.cs.cmu.edu/afs/cs.cmu.edu/academic/class/15381-s06/www/DTs.pdf\n",
|
|
"\n",
|
|
" \"\"\"\n",
|
|
" Example:\n",
|
|
"\n",
|
|
" previous_y = [0,0,0,1,1,1]\n",
|
|
" current_y = [[0,0], [1,1,1,0]]\n",
|
|
"\n",
|
|
" info_gain = 0.45915\n",
|
|
" \"\"\"\n",
|
|
"\n",
|
|
" info_gain = 0\n",
|
|
" ### Implement your code here\n",
|
|
" #############################################\n",
|
|
" \n",
|
|
" p = len(current_y[0]) / len(previous_y)\n",
|
|
" if p == 0 or p == 1: return 0\n",
|
|
" info_gain = self.entropy(previous_y) - (p * self.entropy(current_y[0]) + (1 - p) * self.entropy(current_y[1]))\n",
|
|
" \n",
|
|
" #############################################\n",
|
|
" return info_gain\n",
|
|
"\n",
|
|
"\n",
|
|
" def best_split(self, X, y):\n",
|
|
" # Inputs:\n",
|
|
" # X : Data containing all attributes\n",
|
|
" # y : labels\n",
|
|
" # TODO: For each node find the best split criteria and return the \n",
|
|
" # split attribute, splitting value along with \n",
|
|
" # X_left, X_right, y_left, y_right (using partition_classes)\n",
|
|
" '''\n",
|
|
"\n",
|
|
" NOTE: Just like taught in class, don't use all the features for a node.\n",
|
|
" Repeat the steps:\n",
|
|
"\n",
|
|
" 1. Select m attributes out of d available attributes\n",
|
|
" 2. Pick the best variable/split-point among the m attributes\n",
|
|
" 3. return the split attributes, split point, left and right children nodes data \n",
|
|
"\n",
|
|
" '''\n",
|
|
" split_attribute = 0\n",
|
|
" split_value = 0\n",
|
|
" X_left, X_right, y_left, y_right = [], [], [], []\n",
|
|
" ### Implement your code here\n",
|
|
" #############################################\n",
|
|
" '''\n",
|
|
" temp = -1\n",
|
|
" for i in range(X.shape[1]):\n",
|
|
" SplitVal = np.median(X[:, i])\n",
|
|
" X_left, X_right, y_left, y_right = self.partition_classes(X, y, i, SplitVal)\n",
|
|
" IG = self.information_gain(y, [y_left, y_right])\n",
|
|
" if not np.isnan(IG) and IG > temp:\n",
|
|
" temp = IG\n",
|
|
" split_attribute = i\n",
|
|
" split_value = np.median(X[:, split_attribute])\n",
|
|
" \n",
|
|
" '''\n",
|
|
" temp = -1\n",
|
|
" for i in range(len(X[0])):\n",
|
|
" split_attribute = i\n",
|
|
" info_gain = -1\n",
|
|
" best_split_val = None\n",
|
|
" for i in range(len(X)):\n",
|
|
" SplitVal = X[i][split_attribute]\n",
|
|
" X_left, X_right, y_left, y_right = self.partition_classes(X, y, split_attribute, SplitVal)\n",
|
|
" IG = self.information_gain(y, [y_left, y_right])\n",
|
|
" if not np.isnan(IG) and IG > info_gain:\n",
|
|
" info_gain = IG\n",
|
|
" best_split_val = SplitVal\n",
|
|
" temp_val = best_split_val\n",
|
|
" IG = info_gain\n",
|
|
" \n",
|
|
" if not np.isnan(IG) and IG > temp:\n",
|
|
" temp = IG\n",
|
|
" split_attribute = i\n",
|
|
" split_value = temp_val \n",
|
|
" \n",
|
|
" \n",
|
|
" \n",
|
|
" return split_attribute, split_value, X_left, X_right, y_left, y_right\n",
|
|
" #############################################"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Define the classes 'DecisionTree' and 'RandomForest'"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Please modify the 'DecisionTree' and 'RandomForest' classes below"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"class DecisionTree(object):\n",
|
|
" def __init__(self, max_depth):\n",
|
|
" # Initializing the tree as an empty dictionary or list, as preferred\n",
|
|
" #self.tree = np.array([[]])\n",
|
|
" self.max_depth = max_depth\n",
|
|
" \n",
|
|
" \t\n",
|
|
" def learn(self, X, y, par_node = {}, depth=0):\n",
|
|
" # TODO: Train the decision tree (self.tree) using the sample X and labels y\n",
|
|
" # You will have to make use of the functions in Utility class to train the tree\n",
|
|
"\n",
|
|
" # Use the function best_split in Utility class to get the best split and \n",
|
|
" # data corresponding to left and right child nodes\n",
|
|
" \n",
|
|
" # One possible way of implementing the tree:\n",
|
|
" # Each node in self.tree could be in the form of a dictionary:\n",
|
|
" # https://docs.python.org/2/library/stdtypes.html#mapping-types-dict\n",
|
|
" # For example, a non-leaf node with two children can have a 'left' key and a \n",
|
|
" # 'right' key. You can add more keys which might help in classification\n",
|
|
" # (eg. split attribute and split value)\n",
|
|
" ### Implement your code here\n",
|
|
" #############################################\n",
|
|
" x=Utility()\n",
|
|
" '''\n",
|
|
" if depth == self.max_depth: y = [stats.mode(y, axis = None)[0]] * len(y)\n",
|
|
" if len(y) <= 1:\n",
|
|
" self.tree = np.append(self.tree, [[\"Leaf\", np.mean(y, axis = None)[0], np.nan, np.nan]], axis = 1) if len(self.tree[0]) == 0 else np.append(self.tree, [[\"Leaf\", stats.mode(y, axis = None)[0], np.nan, np.nan]], axis = 0)\n",
|
|
" \n",
|
|
" n = self.tree.shape[0]\n",
|
|
" for i in range(n):\n",
|
|
" if self.tree[n-1-i, 3] == None:\n",
|
|
" self.tree[n-1-i, 3] = i + 1\n",
|
|
" else:\n",
|
|
" index, SplitVal,X_left, X_right, y_left, y_right = x.best_split(X, y)\n",
|
|
" if len(X_left) == 0 or len(X_right) == 0:\n",
|
|
" self.tree = np.append(self.tree, [[\"Leaf\", np.mean(y, axis = None)[0], np.nan, np.nan]], axis = 1) if len(self.tree[0]) == 0 else np.append(self.tree, [[\"Leaf\", stats.mode(y, axis = None)[0], np.nan, np.nan]], axis = 0)\n",
|
|
" \n",
|
|
" n = self.tree.shape[0]\n",
|
|
" for i in range(n):\n",
|
|
" if self.tree[n-1-i, 3] == None:\n",
|
|
" self.tree[n-1-i, 3] = i + 1\n",
|
|
" else:\n",
|
|
" self.tree = np.append(self.tree, [[index, SplitVal, 1, None]], axis = 1) if len(self.tree[0]) == 0 else np.append(self.tree, [[index, SplitVal, 1, None]], axis = 0)\n",
|
|
" self.learn(X_left, y_left, depth + 1)\n",
|
|
" self.learn(X_right, y_right, depth + 1)\n",
|
|
"\n",
|
|
" ''' \n",
|
|
" if len(y) <= 10:\n",
|
|
" self.tree = np.append(self.tree, [[\"Leaf\", np.mean(y), np.nan, np.nan]], axis = 0) if hasattr(self, 'tree') else np.array([[\"Leaf\", 1, np.nan, np.nan]])\n",
|
|
" n = self.tree.shape[0]\n",
|
|
" for i in range(n):\n",
|
|
" if self.tree[n-1-i, 3] == None:\n",
|
|
" self.tree[n-1-i, 3] = i + 1\n",
|
|
" \n",
|
|
" elif (y == y[0]).all():\n",
|
|
" self.tree = np.append(self.tree, [[\"Leaf\", y[0], np.nan, np.nan]], axis = 0) if hasattr(self, 'tree') else np.array([[\"Leaf\", y[0], np.nan, np.nan]])\n",
|
|
" n = self.tree.shape[0]\n",
|
|
" for i in range(n):\n",
|
|
" if self.tree[n-1-i, 3] == None:\n",
|
|
" self.tree[n-1-i, 3] = i + 1\n",
|
|
" \n",
|
|
" else:\n",
|
|
" index, SplitVal,X_left, X_right, y_left, y_right = x.best_split(X, y)\n",
|
|
" if len(X_left) == 0 or len(X_right) == 0:\n",
|
|
" self.tree = np.append(self.tree, [[\"Leaf\", np.mean(y), np.nan, np.nan]], axis = 0) if hasattr(self, 'tree') else np.array([[\"Leaf\", np.mean(y), np.nan, np.nan]])\n",
|
|
" n = self.tree.shape[0]\n",
|
|
" for i in range(n):\n",
|
|
" if self.tree[n-1-i, 3] == None:\n",
|
|
" self.tree[n-1-i, 3] = i + 1\n",
|
|
" else:\n",
|
|
" self.tree = np.append(self.tree, [[index, SplitVal, 1, None]], axis = 0) if hasattr(self, 'tree') else np.array([[index, SplitVal, 1, None]])\n",
|
|
" self.learn(X_left, y_left)\n",
|
|
" self.learn(X_right, y_right)\n",
|
|
" \n",
|
|
" \n",
|
|
" #############################################\n",
|
|
"\n",
|
|
" def classify(self, record):\n",
|
|
" # TODO: classify the record using self.tree and return the predicted label\n",
|
|
" ### Implement your code here\n",
|
|
" #############################################\n",
|
|
" curr = 0\n",
|
|
" while True:\n",
|
|
" if self.tree[curr, 0] == \"Leaf\":\n",
|
|
" return int(float(self.tree[curr, 1]) + 0.5)\n",
|
|
" else:\n",
|
|
" curr += self.tree[curr, 2] if record[self.tree[curr, 0]] <= self.tree[curr, 1] else self.tree[curr, 3]\n",
|
|
"\n",
|
|
" #############################################"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# This starter code does not run. You will have to add your changes and\n",
|
|
"# turn in code that runs properly.\n",
|
|
"\n",
|
|
"\"\"\"\n",
|
|
"Here, \n",
|
|
"1. X is assumed to be a matrix with n rows and d columns where n is the\n",
|
|
"number of total records and d is the number of features of each record. \n",
|
|
"2. y is assumed to be a vector of labels of length n.\n",
|
|
"3. XX is similar to X, except that XX also contains the data label for each\n",
|
|
"record.\n",
|
|
"\"\"\"\n",
|
|
"\n",
|
|
"\"\"\"\n",
|
|
"This skeleton is provided to help you implement the assignment. You must \n",
|
|
"implement the existing functions as necessary. You may add new functions\n",
|
|
"as long as they are called from within the given classes. \n",
|
|
"\n",
|
|
"VERY IMPORTANT!\n",
|
|
"Do NOT change the signature of the given functions.\n",
|
|
"Do NOT change any part of the main function APART from the forest_size parameter. \n",
|
|
"\"\"\"\n",
|
|
"\n",
|
|
"\n",
|
|
"class RandomForest(object):\n",
|
|
" num_trees = 0\n",
|
|
" decision_trees = []\n",
|
|
"\n",
|
|
" # the bootstrapping datasets for trees\n",
|
|
" # bootstraps_datasets is a list of lists, where each list in bootstraps_datasets is a bootstrapped dataset.\n",
|
|
" bootstraps_datasets = []\n",
|
|
"\n",
|
|
" # the true class labels, corresponding to records in the bootstrapping datasets\n",
|
|
" # bootstraps_labels is a list of lists, where the 'i'th list contains the labels corresponding to records in\n",
|
|
" # the 'i'th bootstrapped dataset.\n",
|
|
" bootstraps_labels = []\n",
|
|
"\n",
|
|
" def __init__(self, num_trees):\n",
|
|
" # Initialization done here\n",
|
|
" self.num_trees = num_trees\n",
|
|
" self.decision_trees = [DecisionTree(max_depth=10) for i in range(num_trees)]\n",
|
|
" self.bootstraps_datasets = []\n",
|
|
" self.bootstraps_labels = []\n",
|
|
" \n",
|
|
" def _bootstrapping(self, XX, n):\n",
|
|
" # Reference: https://en.wikipedia.org/wiki/Bootstrapping_(statistics)\n",
|
|
" #\n",
|
|
" # TODO: Create a sample dataset of size n by sampling with replacement\n",
|
|
" # from the original dataset XX.\n",
|
|
" # Note that you would also need to record the corresponding class labels\n",
|
|
" # for the sampled records for training purposes.\n",
|
|
"\n",
|
|
" samples = [] # sampled dataset\n",
|
|
" labels = [] # class labels for the sampled records\n",
|
|
" ### Implement your code here\n",
|
|
" #############################################\n",
|
|
" \n",
|
|
" choice = np.random.randint(0, n, size = n)\n",
|
|
" XX = np.array(XX)\n",
|
|
" samples = XX[:, :-1][choice]\n",
|
|
" labels = XX[:, -1][choice]\n",
|
|
" \n",
|
|
" #############################################\n",
|
|
" return (samples, labels)\n",
|
|
"\n",
|
|
" def bootstrapping(self, XX):\n",
|
|
" # Initializing the bootstrap datasets for each tree\n",
|
|
" for i in range(self.num_trees):\n",
|
|
" data_sample, data_label = self._bootstrapping(XX, len(XX))\n",
|
|
" self.bootstraps_datasets.append(data_sample)\n",
|
|
" self.bootstraps_labels.append(data_label)\n",
|
|
"\n",
|
|
" def fitting(self):\n",
|
|
" # TODO: Train `num_trees` decision trees using the bootstraps datasets\n",
|
|
" # and labels by calling the learn function from your DecisionTree class.\n",
|
|
" ### Implement your code here\n",
|
|
" #############################################\n",
|
|
" \n",
|
|
" for i in range(self.num_trees):\n",
|
|
" self.decision_trees[i].learn(self.bootstraps_datasets[i], self.bootstraps_labels[i])\n",
|
|
"\n",
|
|
" #############################################\n",
|
|
"\n",
|
|
" def voting(self, X):\n",
|
|
" y = []\n",
|
|
"\n",
|
|
" for record in X:\n",
|
|
" # Following steps have been performed here:\n",
|
|
" # 1. Find the set of trees that consider the record as an\n",
|
|
" # out-of-bag sample.\n",
|
|
" # 2. Predict the label using each of the above found trees.\n",
|
|
" # 3. Use majority vote to find the final label for this record.\n",
|
|
" votes = []\n",
|
|
" \n",
|
|
" for i in range(len(self.bootstraps_datasets)):\n",
|
|
" dataset = self.bootstraps_datasets[i]\n",
|
|
" \n",
|
|
" if record not in dataset:\n",
|
|
" OOB_tree = self.decision_trees[i]\n",
|
|
" effective_vote = OOB_tree.classify(record)\n",
|
|
" votes.append(effective_vote)\n",
|
|
"\n",
|
|
" counts = np.bincount(votes)\n",
|
|
"\n",
|
|
" if len(counts) == 0:\n",
|
|
" # TODO: Special case\n",
|
|
" # Handle the case where the record is not an out-of-bag sample\n",
|
|
" # for any of the trees.\n",
|
|
" # NOTE - you can add a few lines of code above (but inside voting) to make this work\n",
|
|
" ### Implement your code here\n",
|
|
" #############################################\n",
|
|
" \n",
|
|
" y = np.append(y, self.decision_trees[0].classify(record))\n",
|
|
" \n",
|
|
" #############################################\n",
|
|
" else:\n",
|
|
" y = np.append(y, np.argmax(counts))\n",
|
|
" \n",
|
|
" return y\n",
|
|
"\n",
|
|
" def user(self):\n",
|
|
" \"\"\"\n",
|
|
" :return: string\n",
|
|
" your GTUsername, NOT your 9-Digit GTId \n",
|
|
" \"\"\"\n",
|
|
" ### Implement your code here\n",
|
|
" #############################################\n",
|
|
" return 'helfayoumy3'\n",
|
|
" #############################################"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# TODO: Initialize according to your implementation\n",
|
|
"# VERY IMPORTANT: Minimum forest_size should be 10\n",
|
|
"forest_size = 10"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Do not modify the below cell"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"reading the data\n",
|
|
"__Name: helfayoumy3__\n",
|
|
"creating the bootstrap datasets\n",
|
|
"fitting the forest\n",
|
|
"accuracy: 0.6519\n",
|
|
"OOB estimate: 0.3481\n",
|
|
"Execution time: 0:00:10.655878\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# start time \n",
|
|
"start = datetime.now()\n",
|
|
"X = list()\n",
|
|
"y = list()\n",
|
|
"XX = list() # Contains data features and data labels\n",
|
|
"numerical_cols = set([i for i in range(0, 9)]) # indices of numeric attributes (columns)\n",
|
|
"\n",
|
|
"# Loading data set\n",
|
|
"print(\"reading the data\")\n",
|
|
"with open(\"pima-indians-diabetes.csv\") as f:\n",
|
|
" next(f, None)\n",
|
|
" for line in csv.reader(f, delimiter=\",\"):\n",
|
|
" xline = []\n",
|
|
" for i in range(len(line)):\n",
|
|
" if i in numerical_cols:\n",
|
|
" xline.append(ast.literal_eval(line[i]))\n",
|
|
" else:\n",
|
|
" xline.append(line[i])\n",
|
|
"\n",
|
|
" X.append(xline[:-1])\n",
|
|
" y.append(xline[-1])\n",
|
|
" XX.append(xline[:])\n",
|
|
"\n",
|
|
"# Initializing a random forest.\n",
|
|
"randomForest = RandomForest(forest_size)\n",
|
|
"\n",
|
|
"# printing the name\n",
|
|
"print(\"__Name: \" + randomForest.user()+\"__\")\n",
|
|
"\n",
|
|
"# Creating the bootstrapping datasets\n",
|
|
"print(\"creating the bootstrap datasets\")\n",
|
|
"randomForest.bootstrapping(XX)\n",
|
|
"\n",
|
|
"# Building trees in the forest\n",
|
|
"print(\"fitting the forest\")\n",
|
|
"randomForest.fitting()\n",
|
|
"\n",
|
|
"# Calculating an unbiased error estimation of the random forest\n",
|
|
"# based on out-of-bag (OOB) error estimate.\n",
|
|
"y_predicted = randomForest.voting(X)\n",
|
|
"\n",
|
|
"# Comparing predicted and true labels\n",
|
|
"results = [prediction == truth for prediction, truth in zip(y_predicted, y)]\n",
|
|
"\n",
|
|
"# Accuracy\n",
|
|
"accuracy = float(results.count(True)) / float(len(results))\n",
|
|
"\n",
|
|
"print(\"accuracy: %.4f\" % accuracy)\n",
|
|
"print(\"OOB estimate: %.4f\" % (1 - accuracy))\n",
|
|
"\n",
|
|
"# end time\n",
|
|
"print(\"Execution time: \" + str(datetime.now() - start))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.7.9"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 4
|
|
}
|