# Q3 Using Scikit-Learn

In [1]:
#export
import numpy as np
import pandas as pd
import time
import gc
import random
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

In [2]:
%load_ext autoreload
%autoreload 2
import tests as tests

In [3]:
#export
class GaTech():
    # Change to your GA Tech Username
    def GTusername(self):
        gt_username = "mpearl3"
        return gt_username

In [4]:
%run helpers/verify_config.py # verify the environment setup

Your python version is  3.7.9
âœ… ALL GOOD


# Q3.1 Data Import and Cleansing Setup

In [5]:
#export
class Data():
    
    # points [1]
    def dataAllocation(self,path):
        # TODO: Separate out the x_data and y_data and return each
        # args: string path for .csv file
        # return: pandas dataframe, pandas series
        # -------------------------------
        # ADD CODE HERE
        data = pd.read_csv(path)
        y_data = data['y']
        x_data = data[data.columns[~data.columns.isin(['y'])]] 
        # ------------------------------- 
        return x_data,y_data
    
    # points [1]
    def trainSets(self,x_data,y_data):
        # TODO: Split 70% of the data into training and 30% into test sets. Call them x_train, x_test, y_train and y_test.
        # Use the train_test_split method in sklearn with the parameter 'shuffle' set to true and the 'random_state' set to 614.
        # args: pandas dataframe, pandas dataframe
        # return: pandas dataframe, pandas dataframe, pandas series, pandas series
        # -------------------------------
        # ADD CODE HERE
        x_train, x_test, y_train, y_test = train_test_split(x_data,y_data,train_size=0.7,shuffle=True,random_state=614)
        # -------------------------------
        return x_train, x_test, y_train, y_test

##################################################
##### Do not add anything below this line ########
tests.dataTest(Data)
##################################################

dataAllocation Function Executed
trainSets Function Executed


# Q3.2 Linear Regression 

In [6]:
#export
class LinearRegressionModel():
    
    # points [2]
    def linearClassifier(self,x_train, x_test, y_train):
        # TODO: Create a LinearRegression classifier and train it.
        # args: pandas dataframe, pandas dataframe, pandas series
        # return: numpy array, numpy array
        # -------------------------------
        # ADD CODE HERE
        lg = LinearRegression().fit(x_train,y_train)
        y_predict_train = lg.predict(np.array(x_train))
        y_predict_test = lg.predict(np.array(x_test))     
        # -------------------------------
        return y_predict_train, y_predict_test

    # points [1]
    def lgTrainAccuracy(self,y_train,y_predict_train):
        # TODO: Return accuracy (on the training set) using the accuracy_score method.
        # Note: Round the output values greater than or equal to 0.5 to 1 and those less than 0.5 to 0. You can use any method that satisfies the requriements.
        # args: pandas series, numpy array
        # return: float
        # -------------------------------
        # ADD CODE HERE 
        y_predict_train = np.where(y_predict_train >= 0.5, 1, 0)
        train_accuracy = accuracy_score(y_train,y_predict_train)
        # -------------------------------   
        return train_accuracy
    
    # points [1]
    def lgTestAccuracy(self,y_test,y_predict_test):
        # TODO: Return accuracy (on the testing set) using the accuracy_score method.
        # Note: Round the output values greater than or equal to 0.5 to 1 and those less than 0.5 to 0. You can use any method that satisfies the requriements.
        # args: pandas series, numpy array
        # return: float
        # -------------------------------
        # ADD CODE HERE
        y_predict_test = np.where(y_predict_test >= 0.5, 1, 0)           
        test_accuracy = accuracy_score(y_test,y_predict_test)
        # -------------------------------
        return test_accuracy
    
##################################################
##### Do not add anything below this line ########
tests.linearTest(Data,LinearRegressionModel)
##################################################

linearClassifier Function Executed
Linear Regression Train Accuracy:  0.7839851024208566
Linear Regression Test Accuracy:  0.7316017316017316


# Q3.3 Random Forest Classifier

In [7]:
#export
class RFClassifier():
    
    # points [2]
    def randomForestClassifier(self,x_train,x_test, y_train):
        # TODO: Create a RandomForestClassifier and train it. Set Random state to 614.
        # args: pandas dataframe, pandas dataframe, pandas series
        # return: RandomForestClassifier object, numpy array, numpy array
        # -------------------------------
        # ADD CODE HERE
        rf_clf = RandomForestClassifier(random_state=614).fit(x_train,y_train)
        y_predict_train = rf_clf.predict(x_train)
        y_predict_test = rf_clf.predict(x_test)        
        # -------------------------------
        return rf_clf,y_predict_train, y_predict_test
    
    # points [1]
    def rfTrainAccuracy(self,y_train,y_predict_train):
        # TODO: Return accuracy on the training set using the accuracy_score method.
        # args: pandas series, numpy array
        # return: float
        # -------------------------------
        # ADD CODE HERE
        train_accuracy = accuracy_score(y_train,y_predict_train)
        # -------------------------------
        return train_accuracy
    
    # points [1]
    def rfTestAccuracy(self,y_test,y_predict_test):
        # TODO: Return accuracy on the test set using the accuracy_score method.
        # args: pandas series, numpy array
        # return: float
        # -------------------------------
        # ADD CODE HERE
        test_accuracy = accuracy_score(y_test,y_predict_test)
        # -------------------------------
        return test_accuracy
    
# Q3.3.1 Feature Importance
    
    # points [1]
    def rfFeatureImportance(self,rf_clf):
        # TODO: Determine the feature importance as evaluated by the Random Forest Classifier.
        # args: RandomForestClassifier object
        # return: float array
        # -------------------------------
        # ADD CODE HERE
        feature_importance = rf_clf.feature_importances_
        # -------------------------------
        return feature_importance
    
    # points [1]
    def sortedRFFeatureImportanceIndicies(self,rf_clf):
        # TODO: Sort them in the descending order and return the feature numbers[0 to ...].
        #       Hint: There is a direct function available in sklearn to achieve this. Also checkout argsort() function in Python.
        # args: RandomForestClassifier object
        # return: int array
        # -------------------------------
        # ADD CODE HERE
        sorted_indices = rf_clf.feature_importances_.argsort()
        # -------------------------------
        return sorted_indices
    
# Q3.3.2 Hyper-parameter Tuning

    # points [2]
    def hyperParameterTuning(self,rf_clf,x_train,y_train):
        # TODO: Tune the hyper-parameters 'n_estimators' and 'max_depth'.
        # args: RandomForestClassifier object, pandas dataframe, pandas series
        # return: GridSearchCV object
        # 'n_estimators': [4, 16, 256]
        # 'max_depth': [2, 8, 16]
        # -------------------------------
        # ADD CODE HERE
        param_grid = {
            'n_estimators': [4, 16, 256],
            'max_depth': [2, 8, 16]
        }
        grid_search = GridSearchCV(estimator = rf_clf, param_grid = param_grid, n_jobs = -1)
        gscv_rfc = grid_search.fit(x_train,y_train)        
        # -------------------------------
        return gscv_rfc
    
    # points [1]
    def bestParams(self,gscv_rfc):
        # TODO: Get the best params, using .best_params_
        # args:  GridSearchCV object
        # return: parameter dict
        # -------------------------------
        # ADD CODE HERE
        best_params = gscv_rfc.best_params_
        # -------------------------------
        return best_params
    
    # points [1]
    def bestScore(self,gscv_rfc):
        # TODO: Get the best score, using .best_score_.
        # args: GridSearchCV object
        # return: float
        # -------------------------------
        # ADD CODE HERE
        best_score = gscv_rfc.best_score_
        # -------------------------------
        return best_score
    
##################################################
##### Do not add anything below this line ########
tests.RandomForestTest(Data,RFClassifier)
##################################################

randomForestClassifier Function Executed
Random Forest Train Accuracy:  1.0
Random Forest Test Accuracy:  0.7316017316017316
Random Forest Feature Importance:  [0.07481604 0.25521095 0.08551354 0.07373347 0.0754602  0.1630978
 0.12729624 0.14487176]
Random Forest Sorted Feature Importance:  [3 0 4 2 6 7 5 1]
HyperParameterTuning Function Executed
Random Forest Best Parameters:  {'max_depth': 8, 'n_estimators': 256}
Random Forest Best Score:  0.7858255451713395


# Q3.4 Support Vector Machine

In [13]:
#export
class SupportVectorMachine():
    
# Q3.4.1 Pre-process

    # points [1]
    def dataPreProcess(self,x_train,x_test):
        # TODO: Pre-process the data to standardize it, otherwise the grid search will take much longer.
        # args: pandas dataframe, pandas dataframe
        # return: pandas dataframe, pandas dataframe
        # -------------------------------
        # ADD CODE HERE
        sc  = StandardScaler()
        scaled_x_train = pd.DataFrame(sc.fit_transform(x_train))
        print(scaled_x_train)
        scaled_x_test = sc.transform(x_test)        
        # -------------------------------
        return scaled_x_train, scaled_x_test
    
# Q3.4.2 Classification

    # points [1]
    def SVCClassifier(self,scaled_x_train,scaled_x_test, y_train):
        # TODO: Create a SVC classifier and train it. Set gamma = 'auto'
        # args: pandas dataframe, pandas dataframe, pandas series
        # return: numpy array, numpy array
        # -------------------------------
        # ADD CODE HERE
        svc = SVC(gamma='auto').fit(scaled_x_train,y_train)
        y_predict_train = svc.predict(scaled_x_train)
        y_predict_test = svc.predict(scaled_x_test)       
        # -------------------------------
        return y_predict_train,y_predict_test
    
    # points [1]
    def SVCTrainAccuracy(self,y_train,y_predict_train):
        # TODO: Return accuracy on the training set using the accuracy_score method.
        # args: pandas series, numpy array
        # return: float 
        # -------------------------------
        # ADD CODE HERE
        train_accuracy = accuracy_score(y_train,y_predict_train)
        # -------------------------------
        return train_accuracy
    
    # points [1]
    def SVCTestAccuracy(self,y_test,y_predict_test):
        # TODO: Return accuracy on the test set using the accuracy_score method.
        # args: pandas series, numpy array
        # return: float 
        # -------------------------------
        # ADD CODE HERE
        test_accuracy = accuracy_score(y_test,y_predict_test)
        # -------------------------------
        return test_accuracy
    
# Q3.4.3 Hyper-parameter Tuning
    
    # points [1]
    def SVMBestScore(self, scaled_x_train, y_train):
        # TODO: Tune the hyper-parameters 'C' and 'kernel' (use rbf and linear).
        # Note: Set n_jobs = -1 and return_train_score = True and gamma = 'auto'
        # args: pandas dataframe, pandas series
        # return: GridSearchCV object, float
        # -------------------------------
        svm_parameters = {'kernel':('linear', 'rbf'), 'C':[0.01, 0.1, 1.0]}
        # ADD CODE HERE
        svm_new = SVC(gamma='auto')
        svm_cv = GridSearchCV(estimator = svm_new, param_grid = svm_parameters, n_jobs = -1, return_train_score=True)
        svm_fit = svm_cv.fit(scaled_x_train, y_train)
        best_score = svm_fit.best_score_        
        print(best_score)
        # -------------------------------
        
        return svm_cv, best_score
    
    # points [1]
    def SVCClassifierParam(self,svm_cv,scaled_x_train,scaled_x_test,y_train):
        # TODO: Calculate the training and test set accuracy values after hyperparameter tuning and standardization. 
        # args: GridSearchCV object, pandas dataframe, pandas dataframe, pandas series
        # return: numpy series, numpy series
        # -------------------------------
        # ADD CODE HERE
        y_predict_train = svm_cv.predict(scaled_x_train)
        y_predict_test = svm_cv.predict(scaled_x_test)       
        # -------------------------------
        return y_predict_train,y_predict_test

    # points [1]
    def svcTrainAccuracy(self,y_train,y_predict_train):
        # TODO: Return accuracy (on the training set) using the accuracy_score method.
        # args: pandas series, numpy array
        # return: float
        # -------------------------------
        # ADD CODE HERE
        train_accuracy = accuracy_score(y_train,y_predict_train)
        # -------------------------------
        return train_accuracy

    # points [1]
    def svcTestAccuracy(self,y_test,y_predict_test):
        # TODO: Return accuracy (on the test set) using the accuracy_score method.
        # args: pandas series, numpy array
        # return: float
        # -------------------------------
        # ADD CODE HERE
        test_accuracy = accuracy_score(y_test,y_predict_test)
        # -------------------------------
        return test_accuracy
    
# Q3.4.4 Cross Validation Results

    # points [1]
    def SVMRankTestScore(self,svm_cv):
        # TODO: Return the rank test score for all hyperparameter values that you obtained in Q3.4.3. The 
        # GridSearchCV class holds a 'cv_results_' dictionary that should help you report these metrics easily.
        # args: GridSearchCV object 
        # return: int array
        # -------------------------------
        # ADD CODE HERE
        rank_test_score = svm_cv.cv_results_['rank_test_score']
        # -------------------------------
        return rank_test_score
    
    # points [1]
    def SVMMeanTestScore(self,svm_cv):
        # TODO: Return mean test score for all of hyperparameter values that you obtained in Q3.4.3. The 
        # GridSearchCV class holds a 'cv_results_' dictionary that should help you report these metrics easily.
        # args: GridSearchCV object
        # return: float array
        # -------------------------------
        # ADD CODE HERE
        mean_test_score = svm_cv.cv_results_['mean_test_score']
        # -------------------------------
        return mean_test_score

##################################################
##### Do not add anything below this line ########
tests.SupportVectorMachineTest(Data,SupportVectorMachine)
##################################################

            0         1         2         3         4         5         6  \
0   -1.131241  1.759569  1.067761  0.372070 -0.717966  1.481755  0.645836   
1   -0.832433 -3.731472 -0.064501  0.877898 -0.717966 -0.051000 -0.245593   
2    0.063991  0.471168  0.758963 -1.335097 -0.717966 -0.152339 -0.656791   
3    0.960414 -0.479794  1.170694 -0.196985 -0.717966 -1.229067 -0.707815   
4    0.661606 -0.541147  0.244298 -0.196985  0.639076 -0.317015  0.753887   
..        ...       ...       ...       ...       ...       ...       ...   
532 -0.533625 -0.694528 -3.564220 -1.335097 -0.717966 -1.292404 -1.088998   
533 -0.234817  2.189036  0.038432  0.624984 -0.717966  0.316355 -0.689806   
534 -1.131241  0.287111 -0.167434  1.194040 -0.717966  0.240350 -0.824871   
535  1.259222 -0.663851  0.244298  1.194040  1.152316  0.886387  0.570799   
536 -1.131241 -0.817232  0.810429  0.245613 -0.404803  0.633039 -0.671798   

            7  
0   -0.867083  
1   -0.953726  
2   -0.260577  
3    1.2990

# Q3.5 PCA

In [9]:
#export
class PCAClassifier():
    
    # points [2]
    def pcaClassifier(self,x_data):
        # TODO: Perform dimensionality reduction of the data using PCA.
        #       Set parameters n_components to 8 and svd_solver to 'full'. Keep other parameters at their default value.
        # args: pandas dataframe
        # return: pca_object
        # -------------------------------
        # ADD CODE HERE
        pca = PCA(n_components=8,svd_solver='full')
        pca= pca.fit(x_data)        
        # -------------------------------
        return pca
    
    # points [1]
    def pcaExplainedVarianceRatio(self, pca):
        # TODO: Return percentage of variance explained by each of the selected components
        # args: pca_object
        # return: float array
        # -------------------------------
        # ADD CODE HERE
        explained_variance_ratio = pca.explained_variance_ratio_
        # -------------------------------
        return explained_variance_ratio
    
    # points [1]
    def pcaSingularValues(self, pca):
        # TODO: Return the singular values corresponding to each of the selected components.
        # args: pca_object
        # return: float array
        # -------------------------------
        # ADD CODE HERE
        singular_values= pca.singular_values_
        # -------------------------------
        return singular_values
    
##################################################
##### Do not add anything below this line ########
tests.PCATest(Data,PCAClassifier)
##################################################

pcaClassifier Function Executed
PCA Explained Variance Ratio:  [8.88546635e-01 6.15907837e-02 2.57901189e-02 1.30861374e-02
 7.44093864e-03 3.02614919e-03 5.12444875e-04 6.79264301e-06]
PCA Singular Values:  [3212.6611207   845.82919167  547.33280231  389.87962763  293.9941346
  187.48648707   77.15221185    8.88268374]


In [10]:
 %run helpers/notebook2script submission

Late Policy:
    
      "I have read the late policy for CS6424."
    


Honor Pledge:
    
      "I have read the Collaboration and Academic Honesty policy for CS6424.
      I certify that I have or will use outside references only in accordance with
      this policy, that I have or will cite any such references via code comments,
      and that I have not or will not copy any portion of my submission from another
      past or current student."

    


Converted Q3.ipynb to submission\submission.py
