#################################################
### THIS FILE WAS AUTOGENERATED! DO NOT EDIT! ###
#################################################
# file to edit: Q3.ipynb

import numpy as np
import pandas as pd
import time
import gc
import random
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer


class GaTech():
    # Change to your GA Tech Username
    def GTusername(self):
        gt_username = "psrinivasan48"
        return gt_username


class Data():

    # points [1]
    def dataAllocation(self, path):
        # TODO: Separate out the x_data and y_data and return each.
        # args: string path for .csv file
        # return: pandas dataframe, pandas series
        # -------------------------------
        # ADD CODE HERE
        # -------------------------------
        df = pd.read_csv(path)
        # drop() returns a new frame, so no explicit copy is needed
        x_data = df.drop('y', axis=1)
        y_data = df['y']
        return x_data, y_data

    # points [1]
    def trainSets(self, x_data, y_data):
        # TODO: Split 70% of the data into training and 30% into test sets. Call them x_train, x_test, y_train and y_test.
        # Use the train_test_split method in sklearn with the parameter 'shuffle' set to True and the 'random_state' set to 614.
        # args: pandas dataframe, pandas series
        # return: pandas dataframe, pandas dataframe, pandas series, pandas series
        # -------------------------------
        # ADD CODE HERE
        # -------------------------------
        x_train, x_test, y_train, y_test = train_test_split(
            x_data, y_data, test_size=0.3, random_state=614, shuffle=True)
        return x_train, x_test, y_train, y_test

##################################################
##### Do not add anything below this line ########
##################################################


class LinearRegressionModel():

    # points [2]
    def linearClassifer(self, x_train, x_test, y_train):
        # TODO: Create a LinearRegression classifier and train it.
        # args: pandas dataframe, pandas dataframe, pandas series
        # return: numpy array, numpy array
        # -------------------------------
        # ADD CODE HERE
        lr = LinearRegression()
        lr.fit(x_train, y_train)
        y_predict_train = lr.predict(x_train)
        y_predict_test = lr.predict(x_test)
        # -------------------------------
        return y_predict_train, y_predict_test

    # points [1]
    def lgTrainAccuracy(self, y_train, y_predict_train):
        # TODO: Return accuracy (on the training set) using the accuracy_score method.
        # Note: Round the output values greater than or equal to 0.5 to 1 and those less than 0.5 to 0. You can use any method that satisfies the requirements.
        # args: pandas series, numpy array
        # return: float
        # -------------------------------
        # ADD CODE HERE
        # threshold the continuous regression outputs at 0.5
        y_predict_train = [1 if i >= 0.5 else 0 for i in y_predict_train]
        train_accuracy = accuracy_score(y_train, y_predict_train)
        # -------------------------------
        return train_accuracy

    # points [1]
    def lgTestAccuracy(self, y_test, y_predict_test):
        # TODO: Return accuracy (on the testing set) using the accuracy_score method.
        # Note: Round the output values greater than or equal to 0.5 to 1 and those less than 0.5 to 0. You can use any method that satisfies the requirements.
        # args: pandas series, numpy array
        # return: float
        # -------------------------------
        # ADD CODE HERE
        # threshold the continuous regression outputs at 0.5
        y_predict_test = [1 if i >= 0.5 else 0 for i in y_predict_test]
        test_accuracy = accuracy_score(y_test, y_predict_test)
        # -------------------------------
        return test_accuracy

##################################################
##### Do not add anything below this line ########
##################################################
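
# ------------------------------------------------------------------
# Illustrative usage sketch, kept outside the graded template above.
# It shows how Data and LinearRegressionModel fit together. To stay
# self-contained and runnable it builds a small synthetic frame: the
# _make_demo_data helper, its column names, and the threshold rule
# for 'y' are assumptions for this demo only; the real assignment
# loads its dataset from a CSV via Data.dataAllocation.
# ------------------------------------------------------------------
def _make_demo_data(n_rows=200, n_features=8, seed=614):
    # Synthetic stand-in for the assignment's dataset: numeric
    # features plus a binary label, matching what dataAllocation
    # would return from a CSV with a 'y' column.
    rng = np.random.RandomState(seed)
    features = rng.rand(n_rows, n_features)
    labels = (features.sum(axis=1) > n_features / 2.0).astype(int)
    df = pd.DataFrame(features, columns=["x%d" % i for i in range(n_features)])
    df["y"] = labels
    return df.drop("y", axis=1), df["y"]


def _demo_linear_regression():
    # Thresholded linear regression used as a crude classifier,
    # mirroring LinearRegressionModel above.
    x_data, y_data = _make_demo_data()
    x_train, x_test, y_train, y_test = Data().trainSets(x_data, y_data)
    model = LinearRegressionModel()
    pred_train, pred_test = model.linearClassifer(x_train, x_test, y_train)
    print("LR train accuracy:", model.lgTrainAccuracy(y_train, pred_train))
    print("LR test accuracy:", model.lgTestAccuracy(y_test, pred_test))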
class RFClassifier():

    # points [2]
    def randomForestClassifier(self, x_train, x_test, y_train):
        # TODO: Create a RandomForestClassifier and train it. Set random_state to 614.
        # args: pandas dataframe, pandas dataframe, pandas series
        # return: RandomForestClassifier object, numpy array, numpy array
        # -------------------------------
        # ADD CODE HERE
        rf_clf = RandomForestClassifier(random_state=614)
        rf_clf.fit(x_train, y_train)
        y_predict_train = rf_clf.predict(x_train)
        y_predict_test = rf_clf.predict(x_test)
        # -------------------------------
        return rf_clf, y_predict_train, y_predict_test

    # points [1]
    def rfTrainAccuracy(self, y_train, y_predict_train):
        # TODO: Return accuracy on the training set using the accuracy_score method.
        # args: pandas series, numpy array
        # return: float
        # -------------------------------
        # ADD CODE HERE
        train_accuracy = accuracy_score(y_train, y_predict_train)
        # -------------------------------
        return train_accuracy

    # points [1]
    def rfTestAccuracy(self, y_test, y_predict_test):
        # TODO: Return accuracy on the test set using the accuracy_score method.
        # args: pandas series, numpy array
        # return: float
        # -------------------------------
        # ADD CODE HERE
        test_accuracy = accuracy_score(y_test, y_predict_test)
        # -------------------------------
        return test_accuracy

    # Q3.3.1 Feature Importance
    # points [1]
    def rfFeatureImportance(self, rf_clf):
        # TODO: Determine the feature importance as evaluated by the Random Forest Classifier.
        # args: RandomForestClassifier object
        # return: float array
        # -------------------------------
        # ADD CODE HERE
        feature_importance = rf_clf.feature_importances_
        # -------------------------------
        return feature_importance

    # points [1]
    def sortedRFFeatureImportanceIndicies(self, rf_clf):
        # TODO: Sort them in ascending order and return the feature numbers [0 to ...].
        # Hint: Check out the argsort() function in NumPy.
        # args: RandomForestClassifier object
        # return: int array
        # -------------------------------
        # ADD CODE HERE
        # np.argsort() returns the feature indices in ascending order of
        # importance, which is what the TODO asks for; reversing with
        # [::-1] would give descending order instead.
        sorted_indices = np.argsort(rf_clf.feature_importances_)
        # -------------------------------
        return sorted_indices

    # Q3.3.2 Hyper-parameter Tuning
    # points [2]
    def hyperParameterTuning(self, rf_clf, x_train, y_train):
        # TODO: Tune the hyper-parameters 'n_estimators' and 'max_depth'.
        # args: RandomForestClassifier object, pandas dataframe, pandas series
        # return: GridSearchCV object, fitted GridSearchCV object
        # 'n_estimators': [4, 16, 256]
        # 'max_depth': [2, 8, 16]
        # -------------------------------
        # ADD CODE HERE
        params = {
            'n_estimators': [4, 16, 256],
            'max_depth': [2, 8, 16]
        }
        gscv_rfc = GridSearchCV(estimator=rf_clf, param_grid=params)
        # fit() returns the fitted estimator itself, so both return
        # values reference the same GridSearchCV object
        gscv_rfc_fit = gscv_rfc.fit(x_train, y_train)
        # -------------------------------
        return gscv_rfc, gscv_rfc_fit

    # points [1]
    def bestParams(self, gscv_rfc):
        # TODO: Get the best params, using .best_params_
        # args: GridSearchCV object
        # return: parameter dict
        # -------------------------------
        # ADD CODE HERE
        best_params = gscv_rfc.best_params_
        # -------------------------------
        return best_params

    # points [1]
    def bestScore(self, gscv_rfc):
        # TODO: Get the best score, using .best_score_.
        # args: GridSearchCV object
        # return: float
        # -------------------------------
        # ADD CODE HERE
        best_score = gscv_rfc.best_score_
        # -------------------------------
        return best_score

##################################################
##### Do not add anything below this line ########
##################################################
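
# ------------------------------------------------------------------
# Illustrative sketch of the RFClassifier workflow above, kept
# outside the graded class. It reuses the synthetic _make_demo_data
# helper defined earlier (a demo-only assumption); everything printed
# here is demo output, not assignment results.
# ------------------------------------------------------------------
def _demo_random_forest():
    x_data, y_data = _make_demo_data()
    x_train, x_test, y_train, y_test = Data().trainSets(x_data, y_data)
    rf = RFClassifier()
    rf_clf, pred_train, pred_test = rf.randomForestClassifier(x_train, x_test, y_train)
    print("RF train accuracy:", rf.rfTrainAccuracy(y_train, pred_train))
    print("RF test accuracy:", rf.rfTestAccuracy(y_test, pred_test))
    # feature indices, least to most important
    print("RF importance order:", rf.sortedRFFeatureImportanceIndicies(rf_clf))
    gscv_rfc, _ = rf.hyperParameterTuning(rf_clf, x_train, y_train)
    print("RF best params:", rf.bestParams(gscv_rfc))
    print("RF best CV score:", rf.bestScore(gscv_rfc))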
class SupportVectorMachine():

    # Q3.4.1 Pre-process
    # points [1]
    def dataPreProcess(self, x_train, x_test):
        # TODO: Pre-process the data to standardize it, otherwise the grid search will take much longer.
        # args: pandas dataframe, pandas dataframe
        # return: numpy array, numpy array
        # -------------------------------
        # ADD CODE HERE
        # Fit the scaler on the training split only, then apply the same
        # transform to the test split; fitting a separate scaler on
        # x_test would leak test-set statistics into the preprocessing.
        scaler = StandardScaler().fit(x_train)
        scaled_x_train = scaler.transform(x_train)
        scaled_x_test = scaler.transform(x_test)
        # -------------------------------
        return scaled_x_train, scaled_x_test

    # Q3.4.2 Classification
    # points [1]
    def SVCClassifer(self, scaled_x_train, scaled_x_test, y_train):
        # TODO: Create an SVC classifier and train it. Set gamma = 'auto'.
        # args: numpy array, numpy array, pandas series
        # return: numpy array, numpy array
        # -------------------------------
        # ADD CODE HERE
        svc = SVC(gamma='auto')
        svc.fit(scaled_x_train, y_train)
        y_predict_train = svc.predict(scaled_x_train)
        y_predict_test = svc.predict(scaled_x_test)
        # -------------------------------
        return y_predict_train, y_predict_test

    # points [1]
    def SVCTrainAccuracy(self, y_train, y_predict_train):
        # TODO: Return accuracy on the training set using the accuracy_score method.
        # args: pandas series, numpy array
        # return: float
        # -------------------------------
        # ADD CODE HERE
        train_accuracy = accuracy_score(y_train, y_predict_train)
        # -------------------------------
        return train_accuracy

    # points [1]
    def SVCTestAccuracy(self, y_test, y_predict_test):
        # TODO: Return accuracy on the test set using the accuracy_score method.
        # args: pandas series, numpy array
        # return: float
        # -------------------------------
        # ADD CODE HERE
        test_accuracy = accuracy_score(y_test, y_predict_test)
        # -------------------------------
        return test_accuracy

    # Q3.4.3 Hyper-parameter Tuning
    # points [1]
    def SVMBestScore(self, scaled_x_train, y_train):
        # TODO: Tune the hyper-parameters 'C' and 'kernel' (use rbf and linear).
        # Note: Set n_jobs = -1, return_train_score = True, and gamma = 'auto'.
        # args: numpy array, pandas series
        # return: GridSearchCV object, float
        # -------------------------------
        svm_parameters = {'kernel': ('linear', 'rbf'), 'C': [0.01, 0.1, 1.0]}
        # ADD CODE HERE
        svm_cv = GridSearchCV(estimator=SVC(gamma='auto'), param_grid=svm_parameters,
                              n_jobs=-1, return_train_score=True)
        svm_cv.fit(scaled_x_train, y_train)
        best_score = svm_cv.best_score_
        # -------------------------------
        return svm_cv, best_score

    # points [1]
    def SVCClassiferParam(self, svm_cv, scaled_x_train, scaled_x_test, y_train):
        # TODO: Calculate the training and test set accuracy values after hyperparameter tuning and standardization.
        # args: GridSearchCV object, numpy array, numpy array, pandas series
        # return: numpy array, numpy array
        # -------------------------------
        # ADD CODE HERE
        # Refit with the best kernel and C found by the grid search; keep
        # gamma='auto' so the refit matches the tuned configuration.
        best_param = svm_cv.best_params_
        svc = SVC(kernel=best_param['kernel'], C=best_param['C'], gamma='auto')
        svc.fit(scaled_x_train, y_train)
        y_predict_train = svc.predict(scaled_x_train)
        y_predict_test = svc.predict(scaled_x_test)
        # -------------------------------
        return y_predict_train, y_predict_test

    # points [1]
    def svcTrainAccuracy(self, y_train, y_predict_train):
        # TODO: Return accuracy (on the training set) using the accuracy_score method.
        # args: pandas series, numpy array
        # return: float
        # -------------------------------
        # ADD CODE HERE
        train_accuracy = accuracy_score(y_train, y_predict_train)
        # -------------------------------
        return train_accuracy

    # points [1]
    def svcTestAccuracy(self, y_test, y_predict_test):
        # TODO: Return accuracy (on the test set) using the accuracy_score method.
        # args: pandas series, numpy array
        # return: float
        # -------------------------------
        # ADD CODE HERE
        test_accuracy = accuracy_score(y_test, y_predict_test)
        # -------------------------------
        return test_accuracy

    # Q3.4.4 Cross Validation Results
    # points [1]
    def SVMRankTestScore(self, svm_cv):
        # TODO: Return the rank test score for all hyperparameter values that you obtained in Q3.4.3. The
        # GridSearchCV class holds a 'cv_results_' dictionary that should help you report these metrics easily.
        # args: GridSearchCV object
        # return: int array
        # -------------------------------
        # ADD CODE HERE
        rank_test_score = svm_cv.cv_results_['rank_test_score']
        # -------------------------------
        return rank_test_score

    # points [1]
    def SVMMeanTestScore(self, svm_cv):
        # TODO: Return the mean test score for all hyperparameter values that you obtained in Q3.4.3. The
        # GridSearchCV class holds a 'cv_results_' dictionary that should help you report these metrics easily.
        # args: GridSearchCV object
        # return: float array
        # -------------------------------
        # ADD CODE HERE
        mean_test_score = svm_cv.cv_results_['mean_test_score']
        # -------------------------------
        return mean_test_score

##################################################
##### Do not add anything below this line ########
##################################################
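
# ------------------------------------------------------------------
# Illustrative sketch of the SupportVectorMachine workflow above,
# kept outside the graded class. It reuses the synthetic
# _make_demo_data helper (a demo-only assumption); the scaler is fit
# on the training split only, as dataPreProcess does.
# ------------------------------------------------------------------
def _demo_svm():
    x_data, y_data = _make_demo_data()
    x_train, x_test, y_train, y_test = Data().trainSets(x_data, y_data)
    svm = SupportVectorMachine()
    scaled_x_train, scaled_x_test = svm.dataPreProcess(x_train, x_test)
    svm_cv, best_score = svm.SVMBestScore(scaled_x_train, y_train)
    print("SVM best CV score:", best_score)
    pred_train, pred_test = svm.SVCClassiferParam(svm_cv, scaled_x_train, scaled_x_test, y_train)
    print("SVM train accuracy:", svm.svcTrainAccuracy(y_train, pred_train))
    print("SVM test accuracy:", svm.svcTestAccuracy(y_test, pred_test))
    # one entry per (kernel, C) combination in the grid
    print("SVM rank_test_score:", svm.SVMRankTestScore(svm_cv))
    print("SVM mean_test_score:", svm.SVMMeanTestScore(svm_cv))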
class PCAClassifer():

    # points [2]
    def pcaClassifer(self, x_data):
        # TODO: Perform dimensionality reduction of the data using PCA.
        # Set parameters n_components to 8 and svd_solver to 'full'. Keep other parameters at their default value.
        # args: pandas dataframe
        # return: pca_object
        # -------------------------------
        # ADD CODE HERE
        pca = PCA(n_components=8, svd_solver='full')
        # only the fitted PCA object is returned, so fit() suffices here
        pca.fit(x_data)
        # -------------------------------
        return pca

    # points [1]
    def pcaExplainedVarianceRatio(self, pca):
        # TODO: Return the percentage of variance explained by each of the selected components.
        # args: pca_object
        # return: float array
        # -------------------------------
        # ADD CODE HERE
        explained_variance_ratio = pca.explained_variance_ratio_
        # -------------------------------
        return explained_variance_ratio

    # points [1]
    def pcaSingularValues(self, pca):
        # TODO: Return the singular values corresponding to each of the selected components.
        # args: pca_object
        # return: float array
        # -------------------------------
        # ADD CODE HERE
        singular_values = pca.singular_values_
        # -------------------------------
        return singular_values

##################################################
##### Do not add anything below this line ########
##################################################
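
# ------------------------------------------------------------------
# Illustrative sketch of the PCAClassifer workflow above, plus an
# entry point that runs all the demos; kept outside the graded
# classes. The __main__ guard keeps imports of this module
# side-effect free, so a grader importing it runs nothing extra.
# ------------------------------------------------------------------
def _demo_pca():
    # Eight-component PCA with the full SVD solver, mirroring
    # PCAClassifer above; the synthetic frame has eight features, so
    # n_components=8 is valid here.
    x_data, _ = _make_demo_data()
    pca_model = PCAClassifer()
    pca = pca_model.pcaClassifer(x_data)
    print("PCA explained variance ratio:", pca_model.pcaExplainedVarianceRatio(pca))
    print("PCA singular values:", pca_model.pcaSingularValues(pca))


if __name__ == "__main__":
    _demo_linear_regression()
    _demo_random_forest()
    _demo_svm()
    _demo_pca()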