{ "cells": [ { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "DB0vv4pBcWu9" }, "source": [ "# Q3 Using Scikit-Learn" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": {}, "colab_type": "code", "id": "GalZFbfhcWvA" }, "outputs": [], "source": [ "#export\n", "import numpy as np\n", "import pandas as pd\n", "import time\n", "import gc\n", "import random\n", "from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, train_test_split\n", "from sklearn.metrics import accuracy_score, classification_report\n", "from sklearn.svm import SVC\n", "from sklearn.linear_model import LinearRegression\n", "from sklearn.neural_network import MLPClassifier\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.preprocessing import StandardScaler, normalize\n", "from sklearn.decomposition import PCA\n", "from sklearn.impute import SimpleImputer" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2\n", "import tests as tests" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": {}, "colab_type": "code", "id": "7WLcC3HAlXUF", "outputId": "afd04c19-13dd-4044-a352-40ad8c28cec7" }, "outputs": [], "source": [ "#export\n", "class GaTech():\n", " # Change to your GA Tech Username\n", " def GTusername(self):\n", " gt_username = \"mpearl3\"\n", " return gt_username" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Your python version is 3.7.9\n", "✅ ALL GOOD\n" ] } ], "source": [ "%run helpers/verify_config.py # verify the environment setup" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "2Z1gV3UlcWvD" }, "source": [ "# Q3.1 Data Import and Cleansing Setup" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "colab": {}, "colab_type": "code", "id": "9VS44b2kcWvE" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "dataAllocation Function Executed\n", "trainSets Function Executed\n" ] } ], "source": [ "#export\n", "class Data():\n", " \n", " # points [1]\n", " def dataAllocation(self,path):\n", " # TODO: Separate out the x_data and y_data and return each\n", " # args: string path for .csv file\n", " # return: pandas dataframe, pandas series\n", " # -------------------------------\n", " # ADD CODE HERE\n", " data = pd.read_csv(path)\n", " y_data = data['y']\n", " x_data = data[data.columns[~data.columns.isin(['y'])]] \n", " # ------------------------------- \n", " return x_data,y_data\n", " \n", " # points [1]\n", " def trainSets(self,x_data,y_data):\n", " # TODO: Split 70% of the data into training and 30% into test sets. Call them x_train, x_test, y_train and y_test.\n", " # Use the train_test_split method in sklearn with the parameter 'shuffle' set to true and the 'random_state' set to 614.\n", " # args: pandas dataframe, pandas dataframe\n", " # return: pandas dataframe, pandas dataframe, pandas series, pandas series\n", " # -------------------------------\n", " # ADD CODE HERE\n", " x_train, x_test, y_train, y_test = train_test_split(x_data,y_data,train_size=0.7,shuffle=True,random_state=614)\n", " # -------------------------------\n", " return x_train, x_test, y_train, y_test\n", "\n", "##################################################\n", "##### Do not add anything below this line ########\n", "tests.dataTest(Data)\n", "##################################################" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "q09V5Ux5cWvI" }, "source": [ "# Q3.2 Linear Regression " ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "colab": {}, "colab_type": "code", "id": "tnHXBF1UcWvJ" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "linearClassifier Function Executed\n", "Linear Regression Train Accuracy: 0.7839851024208566\n", "Linear Regression Test Accuracy: 0.7316017316017316\n" ] } ], "source": [ "#export\n", "class LinearRegressionModel():\n", " \n", " # points [2]\n", " def linearClassifier(self,x_train, x_test, y_train):\n", " # TODO: Create a LinearRegression classifier and train it.\n", " # args: pandas dataframe, pandas dataframe, pandas series\n", " # return: numpy array, numpy array\n", " # -------------------------------\n", " # ADD CODE HERE\n", " lg = LinearRegression().fit(x_train,y_train)\n", " y_predict_train = lg.predict(np.array(x_train))\n", " y_predict_test = lg.predict(np.array(x_test)) \n", " # -------------------------------\n", " return y_predict_train, y_predict_test\n", "\n", " # points [1]\n", " def lgTrainAccuracy(self,y_train,y_predict_train):\n", " # TODO: Return accuracy (on the training set) using the accuracy_score method.\n", " # Note: Round the output values greater than or equal to 0.5 to 1 and those less than 0.5 to 0. You can use any method that satisfies the requriements.\n", " # args: pandas series, numpy array\n", " # return: float\n", " # -------------------------------\n", " # ADD CODE HERE \n", " y_predict_train = np.where(y_predict_train >= 0.5, 1, 0)\n", " train_accuracy = accuracy_score(y_train,y_predict_train)\n", " # ------------------------------- \n", " return train_accuracy\n", " \n", " # points [1]\n", " def lgTestAccuracy(self,y_test,y_predict_test):\n", " # TODO: Return accuracy (on the testing set) using the accuracy_score method.\n", " # Note: Round the output values greater than or equal to 0.5 to 1 and those less than 0.5 to 0. You can use any method that satisfies the requriements.\n", " # args: pandas series, numpy array\n", " # return: float\n", " # -------------------------------\n", " # ADD CODE HERE\n", " y_predict_test = np.where(y_predict_test >= 0.5, 1, 0) \n", " test_accuracy = accuracy_score(y_test,y_predict_test)\n", " # -------------------------------\n", " return test_accuracy\n", " \n", "##################################################\n", "##### Do not add anything below this line ########\n", "tests.linearTest(Data,LinearRegressionModel)\n", "##################################################" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "WbqnCyHAcWvP" }, "source": [ "# Q3.3 Random Forest Classifier" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "colab": {}, "colab_type": "code", "id": "dTtIFJW7cWvQ" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "randomForestClassifier Function Executed\n", "Random Forest Train Accuracy: 1.0\n", "Random Forest Test Accuracy: 0.7316017316017316\n", "Random Forest Feature Importance: [0.07481604 0.25521095 0.08551354 0.07373347 0.0754602 0.1630978\n", " 0.12729624 0.14487176]\n", "Random Forest Sorted Feature Importance: [3 0 4 2 6 7 5 1]\n", "HyperParameterTuning Function Executed\n", "Random Forest Best Parameters: {'max_depth': 8, 'n_estimators': 256}\n", "Random Forest Best Score: 0.7858255451713395\n" ] } ], "source": [ "#export\n", "class RFClassifier():\n", " \n", " # points [2]\n", " def randomForestClassifier(self,x_train,x_test, y_train):\n", " # TODO: Create a RandomForestClassifier and train it. Set Random state to 614.\n", " # args: pandas dataframe, pandas dataframe, pandas series\n", " # return: RandomForestClassifier object, numpy array, numpy array\n", " # -------------------------------\n", " # ADD CODE HERE\n", " rf_clf = RandomForestClassifier(random_state=614).fit(x_train,y_train)\n", " y_predict_train = rf_clf.predict(x_train)\n", " y_predict_test = rf_clf.predict(x_test) \n", " # -------------------------------\n", " return rf_clf,y_predict_train, y_predict_test\n", " \n", " # points [1]\n", " def rfTrainAccuracy(self,y_train,y_predict_train):\n", " # TODO: Return accuracy on the training set using the accuracy_score method.\n", " # args: pandas series, numpy array\n", " # return: float\n", " # -------------------------------\n", " # ADD CODE HERE\n", " train_accuracy = accuracy_score(y_train,y_predict_train)\n", " # -------------------------------\n", " return train_accuracy\n", " \n", " # points [1]\n", " def rfTestAccuracy(self,y_test,y_predict_test):\n", " # TODO: Return accuracy on the test set using the accuracy_score method.\n", " # args: pandas series, numpy array\n", " # return: float\n", " # -------------------------------\n", " # ADD CODE HERE\n", " test_accuracy = accuracy_score(y_test,y_predict_test)\n", " # -------------------------------\n", " return test_accuracy\n", " \n", "# Q3.3.1 Feature Importance\n", " \n", " # points [1]\n", " def rfFeatureImportance(self,rf_clf):\n", " # TODO: Determine the feature importance as evaluated by the Random Forest Classifier.\n", " # args: RandomForestClassifier object\n", " # return: float array\n", " # -------------------------------\n", " # ADD CODE HERE\n", " feature_importance = rf_clf.feature_importances_\n", " # -------------------------------\n", " return feature_importance\n", " \n", " # points [1]\n", " def sortedRFFeatureImportanceIndicies(self,rf_clf):\n", " # TODO: Sort them in the descending order and return the feature numbers[0 to ...].\n", " # Hint: There is a direct function available in sklearn to achieve this. Also checkout argsort() function in Python.\n", " # args: RandomForestClassifier object\n", " # return: int array\n", " # -------------------------------\n", " # ADD CODE HERE\n", " sorted_indices = rf_clf.feature_importances_.argsort()\n", " # -------------------------------\n", " return sorted_indices\n", " \n", "# Q3.3.2 Hyper-parameter Tuning\n", "\n", " # points [2]\n", " def hyperParameterTuning(self,rf_clf,x_train,y_train):\n", " # TODO: Tune the hyper-parameters 'n_estimators' and 'max_depth'.\n", " # args: RandomForestClassifier object, pandas dataframe, pandas series\n", " # return: GridSearchCV object\n", " # 'n_estimators': [4, 16, 256]\n", " # 'max_depth': [2, 8, 16]\n", " # -------------------------------\n", " # ADD CODE HERE\n", " param_grid = {\n", " 'n_estimators': [4, 16, 256],\n", " 'max_depth': [2, 8, 16]\n", " }\n", " grid_search = GridSearchCV(estimator = rf_clf, param_grid = param_grid, n_jobs = -1)\n", " gscv_rfc = grid_search.fit(x_train,y_train) \n", " # -------------------------------\n", " return gscv_rfc\n", " \n", " # points [1]\n", " def bestParams(self,gscv_rfc):\n", " # TODO: Get the best params, using .best_params_\n", " # args: GridSearchCV object\n", " # return: parameter dict\n", " # -------------------------------\n", " # ADD CODE HERE\n", " best_params = gscv_rfc.best_params_\n", " # -------------------------------\n", " return best_params\n", " \n", " # points [1]\n", " def bestScore(self,gscv_rfc):\n", " # TODO: Get the best score, using .best_score_.\n", " # args: GridSearchCV object\n", " # return: float\n", " # -------------------------------\n", " # ADD CODE HERE\n", " best_score = gscv_rfc.best_score_\n", " # -------------------------------\n", " return best_score\n", " \n", "##################################################\n", "##### Do not add anything below this line ########\n", "tests.RandomForestTest(Data,RFClassifier)\n", "##################################################" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "BNeOPWIpcWvg" }, "source": [ "# Q3.4 Support Vector Machine" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "colab": {}, "colab_type": "code", "id": "9msZXyImcWvh" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 0 1 2 3 4 5 6 \\\n", "0 -1.131241 1.759569 1.067761 0.372070 -0.717966 1.481755 0.645836 \n", "1 -0.832433 -3.731472 -0.064501 0.877898 -0.717966 -0.051000 -0.245593 \n", "2 0.063991 0.471168 0.758963 -1.335097 -0.717966 -0.152339 -0.656791 \n", "3 0.960414 -0.479794 1.170694 -0.196985 -0.717966 -1.229067 -0.707815 \n", "4 0.661606 -0.541147 0.244298 -0.196985 0.639076 -0.317015 0.753887 \n", ".. ... ... ... ... ... ... ... \n", "532 -0.533625 -0.694528 -3.564220 -1.335097 -0.717966 -1.292404 -1.088998 \n", "533 -0.234817 2.189036 0.038432 0.624984 -0.717966 0.316355 -0.689806 \n", "534 -1.131241 0.287111 -0.167434 1.194040 -0.717966 0.240350 -0.824871 \n", "535 1.259222 -0.663851 0.244298 1.194040 1.152316 0.886387 0.570799 \n", "536 -1.131241 -0.817232 0.810429 0.245613 -0.404803 0.633039 -0.671798 \n", "\n", " 7 \n", "0 -0.867083 \n", "1 -0.953726 \n", "2 -0.260577 \n", "3 1.299010 \n", "4 0.692504 \n", ".. ... \n", "532 -0.867083 \n", "533 -0.693795 \n", "534 -0.953726 \n", "535 0.865792 \n", "536 -0.780439 \n", "\n", "[537 rows x 8 columns]\n", "dataPreProcess Function Executed\n", "SVCClassifier Function Executed\n", "Support Vector Machine Trian Accuracy: 0.8324022346368715\n", "Support Vector Machine Test Accuracy: 0.7272727272727273\n", "0.7820526133610246\n", "Support Vector Machine Best Score: 0.7820526133610246\n", "SVCClassifierParam Function Executed\n", "Support Vector Machine Trian Accuracy: 0.7877094972067039\n", "Support Vector Machine Test Accuracy: 0.7575757575757576\n", "Support Vector Machine Rank Test Score: [4 6 2 5 1 3]\n", "Support Vector Machine Mean Test Score: [0.77826237 0.63501211 0.782018 0.76341295 0.78205261 0.78033922]\n" ] } ], "source": [ "#export\n", "class SupportVectorMachine():\n", " \n", "# Q3.4.1 Pre-process\n", "\n", " # points [1]\n", " def dataPreProcess(self,x_train,x_test):\n", " # TODO: Pre-process the data to standardize it, otherwise the grid search will take much longer.\n", " # args: pandas dataframe, pandas dataframe\n", " # return: pandas dataframe, pandas dataframe\n", " # -------------------------------\n", " # ADD CODE HERE\n", " sc = StandardScaler()\n", " scaled_x_train = pd.DataFrame(sc.fit_transform(x_train))\n", " print(scaled_x_train)\n", " scaled_x_test = sc.transform(x_test) \n", " # -------------------------------\n", " return scaled_x_train, scaled_x_test\n", " \n", "# Q3.4.2 Classification\n", "\n", " # points [1]\n", " def SVCClassifier(self,scaled_x_train,scaled_x_test, y_train):\n", " # TODO: Create a SVC classifier and train it. Set gamma = 'auto'\n", " # args: pandas dataframe, pandas dataframe, pandas series\n", " # return: numpy array, numpy array\n", " # -------------------------------\n", " # ADD CODE HERE\n", " svc = SVC(gamma='auto').fit(scaled_x_train,y_train)\n", " y_predict_train = svc.predict(scaled_x_train)\n", " y_predict_test = svc.predict(scaled_x_test) \n", " # -------------------------------\n", " return y_predict_train,y_predict_test\n", " \n", " # points [1]\n", " def SVCTrainAccuracy(self,y_train,y_predict_train):\n", " # TODO: Return accuracy on the training set using the accuracy_score method.\n", " # args: pandas series, numpy array\n", " # return: float \n", " # -------------------------------\n", " # ADD CODE HERE\n", " train_accuracy = accuracy_score(y_train,y_predict_train)\n", " # -------------------------------\n", " return train_accuracy\n", " \n", " # points [1]\n", " def SVCTestAccuracy(self,y_test,y_predict_test):\n", " # TODO: Return accuracy on the test set using the accuracy_score method.\n", " # args: pandas series, numpy array\n", " # return: float \n", " # -------------------------------\n", " # ADD CODE HERE\n", " test_accuracy = accuracy_score(y_test,y_predict_test)\n", " # -------------------------------\n", " return test_accuracy\n", " \n", "# Q3.4.3 Hyper-parameter Tuning\n", " \n", " # points [1]\n", " def SVMBestScore(self, scaled_x_train, y_train):\n", " # TODO: Tune the hyper-parameters 'C' and 'kernel' (use rbf and linear).\n", " # Note: Set n_jobs = -1 and return_train_score = True and gamma = 'auto'\n", " # args: pandas dataframe, pandas series\n", " # return: GridSearchCV object, float\n", " # -------------------------------\n", " svm_parameters = {'kernel':('linear', 'rbf'), 'C':[0.01, 0.1, 1.0]}\n", " # ADD CODE HERE\n", " svm_new = SVC(gamma='auto')\n", " svm_cv = GridSearchCV(estimator = svm_new, param_grid = svm_parameters, n_jobs = -1, return_train_score=True)\n", " svm_fit = svm_cv.fit(scaled_x_train, y_train)\n", " best_score = svm_fit.best_score_ \n", " print(best_score)\n", " # -------------------------------\n", " \n", " return svm_cv, best_score\n", " \n", " # points [1]\n", " def SVCClassifierParam(self,svm_cv,scaled_x_train,scaled_x_test,y_train):\n", " # TODO: Calculate the training and test set accuracy values after hyperparameter tuning and standardization. \n", " # args: GridSearchCV object, pandas dataframe, pandas dataframe, pandas series\n", " # return: numpy series, numpy series\n", " # -------------------------------\n", " # ADD CODE HERE\n", " y_predict_train = svm_cv.predict(scaled_x_train)\n", " y_predict_test = svm_cv.predict(scaled_x_test) \n", " # -------------------------------\n", " return y_predict_train,y_predict_test\n", "\n", " # points [1]\n", " def svcTrainAccuracy(self,y_train,y_predict_train):\n", " # TODO: Return accuracy (on the training set) using the accuracy_score method.\n", " # args: pandas series, numpy array\n", " # return: float\n", " # -------------------------------\n", " # ADD CODE HERE\n", " train_accuracy = accuracy_score(y_train,y_predict_train)\n", " # -------------------------------\n", " return train_accuracy\n", "\n", " # points [1]\n", " def svcTestAccuracy(self,y_test,y_predict_test):\n", " # TODO: Return accuracy (on the test set) using the accuracy_score method.\n", " # args: pandas series, numpy array\n", " # return: float\n", " # -------------------------------\n", " # ADD CODE HERE\n", " test_accuracy = accuracy_score(y_test,y_predict_test)\n", " # -------------------------------\n", " return test_accuracy\n", " \n", "# Q3.4.4 Cross Validation Results\n", "\n", " # points [1]\n", " def SVMRankTestScore(self,svm_cv):\n", " # TODO: Return the rank test score for all hyperparameter values that you obtained in Q3.4.3. The \n", " # GridSearchCV class holds a 'cv_results_' dictionary that should help you report these metrics easily.\n", " # args: GridSearchCV object \n", " # return: int array\n", " # -------------------------------\n", " # ADD CODE HERE\n", " rank_test_score = svm_cv.cv_results_['rank_test_score']\n", " # -------------------------------\n", " return rank_test_score\n", " \n", " # points [1]\n", " def SVMMeanTestScore(self,svm_cv):\n", " # TODO: Return mean test score for all of hyperparameter values that you obtained in Q3.4.3. The \n", " # GridSearchCV class holds a 'cv_results_' dictionary that should help you report these metrics easily.\n", " # args: GridSearchCV object\n", " # return: float array\n", " # -------------------------------\n", " # ADD CODE HERE\n", " mean_test_score = svm_cv.cv_results_['mean_test_score']\n", " # -------------------------------\n", " return mean_test_score\n", "\n", "##################################################\n", "##### Do not add anything below this line ########\n", "tests.SupportVectorMachineTest(Data,SupportVectorMachine)\n", "##################################################" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "c2qDYMjgcWv5" }, "source": [ "# Q3.5 PCA" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "colab": {}, "colab_type": "code", "id": "-C9BuGsqcWv5" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "pcaClassifier Function Executed\n", "PCA Explained Variance Ratio: [8.88546635e-01 6.15907837e-02 2.57901189e-02 1.30861374e-02\n", " 7.44093864e-03 3.02614919e-03 5.12444875e-04 6.79264301e-06]\n", "PCA Singular Values: [3212.6611207 845.82919167 547.33280231 389.87962763 293.9941346\n", " 187.48648707 77.15221185 8.88268374]\n" ] } ], "source": [ "#export\n", "class PCAClassifier():\n", " \n", " # points [2]\n", " def pcaClassifier(self,x_data):\n", " # TODO: Perform dimensionality reduction of the data using PCA.\n", " # Set parameters n_components to 8 and svd_solver to 'full'. Keep other parameters at their default value.\n", " # args: pandas dataframe\n", " # return: pca_object\n", " # -------------------------------\n", " # ADD CODE HERE\n", " pca = PCA(n_components=8,svd_solver='full')\n", " pca= pca.fit(x_data) \n", " # -------------------------------\n", " return pca\n", " \n", " # points [1]\n", " def pcaExplainedVarianceRatio(self, pca):\n", " # TODO: Return percentage of variance explained by each of the selected components\n", " # args: pca_object\n", " # return: float array\n", " # -------------------------------\n", " # ADD CODE HERE\n", " explained_variance_ratio = pca.explained_variance_ratio_\n", " # -------------------------------\n", " return explained_variance_ratio\n", " \n", " # points [1]\n", " def pcaSingularValues(self, pca):\n", " # TODO: Return the singular values corresponding to each of the selected components.\n", " # args: pca_object\n", " # return: float array\n", " # -------------------------------\n", " # ADD CODE HERE\n", " singular_values= pca.singular_values_\n", " # -------------------------------\n", " return singular_values\n", " \n", "##################################################\n", "##### Do not add anything below this line ########\n", "tests.PCATest(Data,PCAClassifier)\n", "##################################################" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Late Policy:\n", " \n", " \"I have read the late policy for CS6424.\"\n", " \n", "\n", "\n", "Honor Pledge:\n", " \n", " \"I have read the Collaboration and Academic Honesty policy for CS6424.\n", " I certify that I have or will use outside references only in accordance with\n", " this policy, that I have or will cite any such references via code comments,\n", " and that I have not or will not copy any portion of my submission from another\n", " past or current student.\"\n", "\n", " \n", "\n", "\n", "Converted Q3.ipynb to submission\\submission.py\n" ] } ], "source": [ " %run helpers/notebook2script submission" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "colab": { "name": "hw4q3.soln.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.9" } }, "nbformat": 4, "nbformat_minor": 1 }