# tunmnlu/task_2/others-answer/omsa-main/ISYE-6501-OAN/hw2/homework2-4.1.R

library(kknn)
library(caret)
library(tidyverse)

# Load the credit card data set. The file has no header row, so read.table()
# names the columns V1, V2, ... automatically.
credit_data <- read.table(
  "C:/Users/mjpearl/Downloads/data 2.2/credit_card_data.txt",
  header = FALSE,
  stringsAsFactors = FALSE
)

# Fix the RNG seed so the train/test split is reproducible from run to run;
# without it every execution draws a different hold-out set and the reported
# model and accuracy figures cannot be regenerated.
set.seed(42)

# Number of rows in the credit card data (654 in the original data file)
rows <- nrow(credit_data)

# Randomly pick one fifth of the row indexes, without replacement, as the
# hold-out (test) set. The result is named `test_idx` rather than `sample` so
# it does not mask base::sample(), and seq_len() stays correct for an empty
# table where 1:rows would yield c(1, 0).
test_idx <- sample(seq_len(rows), size = round(rows / 5), replace = FALSE)

# Indexes of all rows that were NOT drawn for the hold-out set. setdiff() is
# used instead of negative indexing because credit_data[-integer(0), ] would
# select zero rows if the hold-out draw ever came back empty.
train_idx <- setdiff(seq_len(rows), test_idx)

# Training set: the remaining four fifths of the rows, in original order
training <- credit_data[train_idx, ]
# Testing set: the one fifth of the rows that was excluded from training
testing <- credit_data[test_idx, ]

# Tune the k-nearest-neighbour model with train.kknn(), which evaluates the
# candidates by leave-one-out cross-validation on the training rows. Every k
# from 1 up to kmax is tried with each of the listed kernels, and the
# best-performing (k, kernel) pair is stored in the fitted object.
# V11 is the response; all other columns are predictors and are scaled.
kknn_fit <- train.kknn(
  formula = V11 ~ .,
  data = training,
  kmax = 100,
  kernel = c("optimal", "rectangular", "inv", "gaussian", "triangular"),
  scale = TRUE
)
# Show the tuning summary (best kernel, best k and the error measures)
kknn_fit

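#Example output recorded from an earlier run (its call refers to the data as d.train rather than training; the values depend on the train/test split):
#Call: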
#  train.kknn(formula = V11 ~ ., data = d.train, kmax = 100, kernel = c("optimal",     "rectangular", "inv", "gaussian", "triangular"), scale = TRUE)#
#
#Type of response variable: continuous
#minimal mean absolute error: 0.2006881
#Minimal mean squared error: 0.1062738
#Best kernel: inv
#Best k: 31

# Score the held-out rows with the tuned model
pred <- predict(kknn_fit, newdata = testing)
# The model is fitted as a regression, so predictions are continuous values;
# round them to the nearest integer to get a class label
# (assumes V11 is coded 0/1 -- TODO confirm against the data file).
pred_bin <- round(pred)
# Cross-tabulate the predicted labels (rows) against the actual V11 values
# (columns) and display the table
pred_accuracy <- table(pred_bin, testing$V11)
pred_accuracy

# Test-set accuracy: share of held-out rows whose predicted label equals V11
sum(pred_bin == testing$V11) / nrow(testing)

#[1] 0.8549618

#Holding out 1/5 of the rows for testing was the most effective split; hold-out fractions of 1/3, 1/4 and 1/2 were also tried.