Files
004_comission/tunmnlu/task_04/hw4-skeleton/Q3/Q3.ipynb
louiscklaw 0687d83118 update,
2025-01-31 22:22:19 +08:00

632 lines
21 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "DB0vv4pBcWu9"
},
"source": [
"# Q3 Using Scikit-Learn"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"# Imports\n",
"Do not modify"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "GalZFbfhcWvA"
},
"outputs": [],
"source": [
"#export\n",
"import pkg_resources\n",
"from pkg_resources import DistributionNotFound, VersionConflict\n",
"from platform import python_version\n",
"import numpy as np\n",
"import pandas as pd\n",
"import time\n",
"import gc\n",
"import random\n",
"from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, train_test_split\n",
"from sklearn.metrics import accuracy_score, classification_report\n",
"from sklearn.svm import SVC\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.neural_network import MLPClassifier\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.preprocessing import StandardScaler, normalize\n",
"from sklearn.decomposition import PCA\n",
"from sklearn.impute import SimpleImputer"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"import tests as tests"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"# Verify your Python version and setup"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"#function to check setup\n",
"def check_env_setup():\n",
" dependencies = open(\"requirements.txt\").readlines()\n",
" try:\n",
" pkg_resources.require(dependencies)\n",
" print(\"✅ ALL GOOD\")\n",
" except DistributionNotFound as e:\n",
" print(\"⚠️ Library is missing\")\n",
" print(e)\n",
" except VersionConflict as e:\n",
" print(\"⚠️ Library version conflict\")\n",
" print(e)\n",
" except Exception as e:\n",
" print(\"⚠️ Something went wrong\")\n",
" print(e)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"# verify the environment setup\n",
"check_env_setup()"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"# Add your Georgia Tech Username"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"#export\n",
"class GaTech():\n",
" # Change to your GA Tech Username\n",
" # NOT your 9-Digit GTId\n",
" def GTusername(self):\n",
" gt_username = \"gburdell3\"\n",
" return gt_username"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "2Z1gV3UlcWvD"
},
"source": [
"# Q3.1 Data Import\n",
"Now for the fun stuff. Lets import some data!"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "9VS44b2kcWvE"
},
"outputs": [],
"source": [
"#export\n",
"class Data():\n",
" \n",
" # points [1]\n",
" def dataAllocation(self,path):\n",
" # TODO: Separate out the x_data and y_data and return each\n",
" # args: string path for .csv file\n",
" # return: pandas dataframe, pandas series\n",
" # -------------------------------\n",
" # ADD CODE HERE\n",
" \n",
" # ------------------------------- \n",
" return x_data,y_data\n",
" \n",
" # points [1]\n",
" def trainSets(self,x_data,y_data):\n",
" # TODO: Split 70% of the data into training and 30% into test sets. Call them x_train, x_test, y_train and y_test.\n",
" # Use the train_test_split method in sklearn with the parameter 'shuffle' set to true and the 'random_state' set to 614.\n",
" # args: pandas dataframe, pandas dataframe\n",
" # return: pandas dataframe, pandas dataframe, pandas series, pandas series\n",
" # -------------------------------\n",
" # ADD CODE HERE\n",
" \n",
" # -------------------------------\n",
" return x_train, x_test, y_train, y_test\n",
"\n",
"##################################################\n",
"##### Do not add anything below this line ########\n",
"tests.dataTest(Data)\n",
"##################################################"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "q09V5Ux5cWvI"
},
"source": [
"# Q3.2 Linear Regression "
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "tnHXBF1UcWvJ"
},
"outputs": [],
"source": [
"#export\n",
"class LinearRegressionModel():\n",
" \n",
" # points [2]\n",
" def linearClassifier(self,x_train, x_test, y_train):\n",
" # TODO: Create a LinearRegression classifier and train it.\n",
" # args: pandas dataframe, pandas dataframe, pandas series\n",
" # return: numpy array, numpy array\n",
" # -------------------------------\n",
" # ADD CODE HERE\n",
" \n",
" # -------------------------------\n",
" return y_predict_train, y_predict_test\n",
"\n",
" # points [1]\n",
" def lgTrainAccuracy(self,y_train,y_predict_train):\n",
" # TODO: Return accuracy (on the training set) using the accuracy_score method.\n",
" # Note: Round the output values greater than or equal to 0.5 to 1 and those less than 0.5 to 0. You can use any method that satisfies the requriements.\n",
" # args: pandas series, numpy array\n",
" # return: float\n",
" # -------------------------------\n",
" # ADD CODE HERE \n",
" \n",
" # ------------------------------- \n",
" return train_accuracy\n",
" \n",
" # points [1]\n",
" def lgTestAccuracy(self,y_test,y_predict_test):\n",
" # TODO: Return accuracy (on the testing set) using the accuracy_score method.\n",
" # Note: Round the output values greater than or equal to 0.5 to 1 and those less than 0.5 to 0. You can use any method that satisfies the requriements.\n",
" # args: pandas series, numpy array\n",
" # return: float\n",
" # -------------------------------\n",
" # ADD CODE HERE\n",
" \n",
" # -------------------------------\n",
" return test_accuracy\n",
" \n",
"##################################################\n",
"##### Do not add anything below this line ########\n",
"tests.linearTest(Data,LinearRegressionModel)\n",
"##################################################"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "WbqnCyHAcWvP"
},
"source": [
"# Q3.3 Random Forest Classifier"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "dTtIFJW7cWvQ"
},
"outputs": [],
"source": [
"#export\n",
"class RFClassifier():\n",
" \n",
" # points [2]\n",
" def randomForestClassifier(self,x_train,x_test, y_train):\n",
" # TODO: Create a RandomForestClassifier and train it. Set Random state to 614.\n",
" # args: pandas dataframe, pandas dataframe, pandas series\n",
" # return: RandomForestClassifier object, numpy array, numpy array\n",
" # -------------------------------\n",
" # ADD CODE HERE\n",
" \n",
" # -------------------------------\n",
" return rf_clf,y_predict_train, y_predict_test\n",
" \n",
" # points [1]\n",
" def rfTrainAccuracy(self,y_train,y_predict_train):\n",
" # TODO: Return accuracy on the training set using the accuracy_score method.\n",
" # args: pandas series, numpy array\n",
" # return: float\n",
" # -------------------------------\n",
" # ADD CODE HERE\n",
" \n",
" # -------------------------------\n",
" return train_accuracy\n",
" \n",
" # points [1]\n",
" def rfTestAccuracy(self,y_test,y_predict_test):\n",
" # TODO: Return accuracy on the test set using the accuracy_score method.\n",
" # args: pandas series, numpy array\n",
" # return: float\n",
" # -------------------------------\n",
" # ADD CODE HERE\n",
" \n",
" # -------------------------------\n",
" return test_accuracy\n",
" \n",
"# Q3.3.1 Feature Importance\n",
" \n",
" # points [1]\n",
" def rfFeatureImportance(self,rf_clf):\n",
" # TODO: Determine the feature importance as evaluated by the Random Forest Classifier.\n",
" # args: RandomForestClassifier object\n",
" # return: float array\n",
" # -------------------------------\n",
" # ADD CODE HERE\n",
" \n",
" # -------------------------------\n",
" return feature_importance\n",
" \n",
" # points [1]\n",
" def sortedRFFeatureImportanceIndicies(self,rf_clf):\n",
" # TODO: Sort them in the descending order and return the feature numbers[0 to ...].\n",
" # Hint: There is a direct function available in sklearn to achieve this. Also checkout argsort() function in Python.\n",
" # args: RandomForestClassifier object\n",
" # return: int array\n",
" # -------------------------------\n",
" # ADD CODE HERE\n",
" \n",
" # -------------------------------\n",
" return sorted_indices\n",
" \n",
"# Q3.3.2 Hyper-parameter Tuning\n",
"\n",
" # points [2]\n",
" def hyperParameterTuning(self,rf_clf,x_train,y_train):\n",
" # TODO: Tune the hyper-parameters 'n_estimators' and 'max_depth'.\n",
" # Define param_grid for GridSearchCV as a dictionary\n",
" # args: RandomForestClassifier object, pandas dataframe, pandas series\n",
" # return: GridSearchCV object\n",
" # 'n_estimators': [4, 16, 256]\n",
" # 'max_depth': [2, 8, 16]\n",
" # -------------------------------\n",
" # ADD CODE HERE\n",
" \n",
" # -------------------------------\n",
" return gscv_rfc\n",
" \n",
" # points [1]\n",
" def bestParams(self,gscv_rfc):\n",
" # TODO: Get the best params, using .best_params_\n",
" # args: GridSearchCV object\n",
" # return: parameter dict\n",
" # -------------------------------\n",
" # ADD CODE HERE\n",
" \n",
" # -------------------------------\n",
" return best_params\n",
" \n",
" # points [1]\n",
" def bestScore(self,gscv_rfc):\n",
" # TODO: Get the best score, using .best_score_.\n",
" # args: GridSearchCV object\n",
" # return: float\n",
" # -------------------------------\n",
" # ADD CODE HERE\n",
" \n",
" # -------------------------------\n",
" return best_score\n",
" \n",
"##################################################\n",
"##### Do not add anything below this line ########\n",
"tests.RandomForestTest(Data,RFClassifier)\n",
"##################################################"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "BNeOPWIpcWvg"
},
"source": [
"# Q3.4 Support Vector Machine"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "9msZXyImcWvh"
},
"outputs": [],
"source": [
"#export\n",
"class SupportVectorMachine():\n",
" \n",
"# Q3.4.1 Pre-process\n",
"\n",
" # points [1]\n",
" def dataPreProcess(self,x_train,x_test):\n",
" # TODO: Pre-process the data to standardize it, otherwise the grid search will take much longer.\n",
" # args: pandas dataframe, pandas dataframe\n",
" # return: pandas dataframe, pandas dataframe\n",
" # -------------------------------\n",
" # ADD CODE HERE\n",
" \n",
" # -------------------------------\n",
" return scaled_x_train, scaled_x_test\n",
" \n",
"# Q3.4.2 Classification\n",
"\n",
" # points [1]\n",
" def SVCClassifier(self,scaled_x_train,scaled_x_test, y_train):\n",
" # TODO: Create a SVC classifier and train it. Set gamma = 'auto'\n",
" # args: pandas dataframe, pandas dataframe, pandas series\n",
" # return: numpy array, numpy array\n",
" # -------------------------------\n",
" # ADD CODE HERE\n",
" \n",
" # -------------------------------\n",
" return y_predict_train,y_predict_test\n",
" \n",
" # points [1]\n",
" def SVCTrainAccuracy(self,y_train,y_predict_train):\n",
" # TODO: Return accuracy on the training set using the accuracy_score method.\n",
" # args: pandas series, numpy array\n",
" # return: float \n",
" # -------------------------------\n",
" # ADD CODE HERE\n",
" \n",
" # -------------------------------\n",
" return train_accuracy\n",
" \n",
" # points [1]\n",
" def SVCTestAccuracy(self,y_test,y_predict_test):\n",
" # TODO: Return accuracy on the test set using the accuracy_score method.\n",
" # args: pandas series, numpy array\n",
" # return: float \n",
" # -------------------------------\n",
" # ADD CODE HERE\n",
" \n",
" # -------------------------------\n",
" return test_accuracy\n",
" \n",
"# Q3.4.3 Hyper-parameter Tuning\n",
" \n",
" # points [1]\n",
" def SVMBestScore(self, scaled_x_train, y_train):\n",
" # TODO: Tune the hyper-parameters 'C' and 'kernel' (use rbf and linear).\n",
" # Note: Set n_jobs = -1 and return_train_score = True and gamma = 'auto'\n",
" # args: pandas dataframe, pandas series\n",
" # return: GridSearchCV object, float\n",
" # -------------------------------\n",
" svm_parameters = {'kernel':('linear', 'rbf'), 'C':[0.01, 0.1, 1.0]}\n",
" # ADD CODE HERE\n",
" \n",
" # -------------------------------\n",
" \n",
" return svm_cv, best_score\n",
" \n",
" # points [1]\n",
" def SVCClassifierParam(self,svm_cv,scaled_x_train,scaled_x_test,y_train):\n",
" # TODO: Calculate the training and test set predicted values after hyperparameter tuning and standardization.\n",
" # args: GridSearchCV object, pandas dataframe, pandas dataframe, pandas series\n",
" # return: numpy series, numpy series\n",
" # -------------------------------\n",
" # ADD CODE HERE\n",
" \n",
" # -------------------------------\n",
" return y_predict_train,y_predict_test\n",
"\n",
" # points [1]\n",
" def svcTrainAccuracy(self,y_train,y_predict_train):\n",
" # TODO: Return accuracy (on the training set) using the accuracy_score method.\n",
" # args: pandas series, numpy array\n",
" # return: float\n",
" # -------------------------------\n",
" # ADD CODE HERE\n",
" \n",
" # -------------------------------\n",
" return train_accuracy\n",
"\n",
" # points [1]\n",
" def svcTestAccuracy(self,y_test,y_predict_test):\n",
" # TODO: Return accuracy (on the test set) using the accuracy_score method.\n",
" # args: pandas series, numpy array\n",
" # return: float\n",
" # -------------------------------\n",
" # ADD CODE HERE\n",
" \n",
" # -------------------------------\n",
" return test_accuracy\n",
" \n",
"# Q3.4.4 Cross Validation Results\n",
"\n",
" # points [1]\n",
" def SVMRankTestScore(self,svm_cv):\n",
" # TODO: Return the rank test score for all hyperparameter values that you obtained in Q3.4.3. The \n",
" # GridSearchCV class holds a 'cv_results_' dictionary that should help you report these metrics easily.\n",
" # args: GridSearchCV object \n",
" # return: int array\n",
" # -------------------------------\n",
" # ADD CODE HERE\n",
" \n",
" # -------------------------------\n",
" return rank_test_score\n",
" \n",
" # points [1]\n",
" def SVMMeanTestScore(self,svm_cv):\n",
" # TODO: Return mean test score for all of hyperparameter values that you obtained in Q3.4.3. The \n",
" # GridSearchCV class holds a 'cv_results_' dictionary that should help you report these metrics easily.\n",
" # args: GridSearchCV object\n",
" # return: float array\n",
" # -------------------------------\n",
" # ADD CODE HERE\n",
" \n",
" # -------------------------------\n",
" return mean_test_score\n",
"\n",
"##################################################\n",
"##### Do not add anything below this line ########\n",
"tests.SupportVectorMachineTest(Data,SupportVectorMachine)\n",
"##################################################"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "c2qDYMjgcWv5"
},
"source": [
"# Q3.5 PCA"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "-C9BuGsqcWv5",
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"#export\n",
"class PCAClassifier():\n",
" \n",
" # points [2]\n",
" def pcaClassifier(self,x_data):\n",
" # TODO: Perform dimensionality reduction of the data using PCA.\n",
" # Set parameters n_components to 8 and svd_solver to 'full'. Keep other parameters at their default value.\n",
" # args: pandas dataframe\n",
" # return: pca_object\n",
" # -------------------------------\n",
" # ADD CODE HERE\n",
" \n",
" # -------------------------------\n",
" return pca\n",
" \n",
" # points [1]\n",
" def pcaExplainedVarianceRatio(self, pca):\n",
" # TODO: Return percentage of variance explained by each of the selected components\n",
" # args: pca_object\n",
" # return: float array\n",
" # -------------------------------\n",
" # ADD CODE HERE\n",
" \n",
" # -------------------------------\n",
" return explained_variance_ratio\n",
" \n",
" # points [1]\n",
" def pcaSingularValues(self, pca):\n",
" # TODO: Return the singular values corresponding to each of the selected components.\n",
" # args: pca_object\n",
" # return: float array\n",
" # -------------------------------\n",
" # ADD CODE HERE\n",
" \n",
" # -------------------------------\n",
" return singular_values\n",
" \n",
"##################################################\n",
"##### Do not add anything below this line ########\n",
"tests.PCATest(Data,PCAClassifier)\n",
"##################################################"
]
}
],
"metadata": {
"colab": {
"name": "hw4q3.soln.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.18"
}
},
"nbformat": 4,
"nbformat_minor": 1
}