194 lines
6.6 KiB
Plaintext
194 lines
6.6 KiB
Plaintext
---
|
|
title: "exam2"
|
|
author: "Mark Pearl"
|
|
date: "7/28/2021"
|
|
output: html_document
|
|
---
|
|
```{r setup}
|
|
library(glmnet)
|
|
library(fastDummies)
|
|
library(dplyr)
|
|
library(grpreg)
|
|
```
|
|
|
|
```{r read data and split into train/test}
|
|
students <- read.csv('./student.csv')
|
|
students$Parents_Education <- students$Medu + students$Fedu
|
|
students$Alcohol_Consumption <- students$Dalc + students$Walc
|
|
|
|
students_dummies <-data.frame(students)
|
|
|
|
dummy_vars_df <- as.data.frame(list(sapply(students_dummies, class)),col.names = 'type')
|
|
#Set the index to a new column called column_name
|
|
dummy_vars_df <- cbind(column_name = rownames(dummy_vars_df), dummy_vars_df)
|
|
rownames(dummy_vars_df) <- 1:nrow(dummy_vars_df)
|
|
dummy_vars_df <- dummy_vars_df %>% filter(!column_name %in% c("absences","G1","G2","G3","Parents_Education","Alcohol_Consumption"))
|
|
|
|
for (row in 1:nrow(dummy_vars_df)) {
|
|
#Iterate over all the rows and create dummy variables
|
|
students_dummies <- dummy_cols(students_dummies, select_columns = dummy_vars_df[row,'column_name'])
|
|
students_dummies = students_dummies[,!(names(students_dummies) %in% dummy_vars_df[row,'column_name'])]
|
|
}
|
|
|
|
drop <- c("Parents_Education","Alcohol_Consumption")
|
|
students_dummies = students_dummies[,!(names(students_dummies) %in% drop)]
|
|
|
|
#Split data into train and test
|
|
dt = sort(sample(nrow(students_dummies), nrow(students_dummies)*.8))
|
|
train<-students_dummies[dt,]
|
|
train_y <- train$G3
|
|
train_x <- train[ ,!(colnames(train) == "G3")]
|
|
test<-students_dummies[-dt,]
|
|
test_y <- test$G3
|
|
test_x <- test[ ,!(colnames(test) == "G3")]
|
|
```
|
|
|
|
```{r lasso regression}
|
|
mod <- cv.glmnet(x = as.matrix(train_x),y = train_y, family = "gaussian",alpha = 1, nfolds=10)
|
|
lambda_lasso = mod$lambda.min
|
|
lasso_final = glmnet(x = as.matrix(train_x),y = train_y, family = "gaussian",alpha = 1,lambda=lambda_lasso)
|
|
lasso_final$beta
|
|
```
|
|
```{r q2 lasso predict}
|
|
# Lasso prediction / mse results
|
|
y_lasso = predict(lasso_final, as.matrix(test_x), s = lambda_lasso)
|
|
mse_lasso = sum((test_y-y_lasso)^2)/nrow(test_x)
|
|
mse_lasso
|
|
```
|
|
```{r q2 group variables}
|
|
# Lasso prediction / mse results
|
|
train_x_group <- train_x[,-(grep(paste0("other","$"),colnames(train_x),perl= TRUE))]
|
|
test_x_group <- test_x[,-(grep(paste0("other","$"),colnames(test_x),perl= TRUE))]
|
|
|
|
#Create group vector for group lasso
|
|
group_vars_df <- as.data.frame(list(sapply(train_x_group, class)),col.names = 'type')
|
|
#Set the index to a new column called column_name
|
|
group_vars_df <- cbind(column_name = rownames(group_vars_df), group_vars_df)
|
|
rownames(group_vars_df) <- 1:nrow(group_vars_df)
|
|
group_vars_df
|
|
```
|
|
|
|
```{r create dataframe}
|
|
column_name_prefixes <- rep(NA, nrow(group_vars_df))
|
|
|
|
for (row in 1:nrow(group_vars_df)) {
|
|
#Iterate over all the rows and create dummy variables
|
|
column_name_prefixes[row]=strsplit(group_vars_df[row,'column_name'],"_")[[1]][1]
|
|
}
|
|
|
|
#Get the unique elements of the list
|
|
unique_prefixes <- unique(column_name_prefixes)
|
|
mapping_list <- seq(1:length(unique_prefixes))
|
|
group_factor <- rep(NA, length(column_name_prefixes))
|
|
|
|
#Create group_factor list
|
|
i<-1
|
|
for (elem in column_name_prefixes) {
|
|
group_factor[i]=which(unique_prefixes %in% elem)
|
|
i <- i+1
|
|
}
|
|
```
|
|
|
|
```{r group lasso regression}
|
|
glasso = cv.grpreg(as.matrix(train_x_group),as.vector(train_y),group_factor)
|
|
```
|
|
|
|
|
|
|
|
```{r store optimal lambda from cv group lasso}
|
|
glasso_lambda <- min(glasso$lambda)
|
|
which(glasso$fit$beta[,glasso$min]==0)
|
|
```
|
|
|
|
|
|
```{r prediction / mse results group lasso}
|
|
y_pred_glasso <- predict(glasso,as.matrix(test_x_group),lambda=glasso_lambda)
|
|
mse_glasso = sum((test_y-y_pred_glasso)^2)/length(y_pred_glasso)
|
|
mse_glasso
|
|
```
|
|
```{r summary glasso}
|
|
summary(glasso)
|
|
glasso$fit$beta
|
|
```
|
|
|
|
```{r glass grouping part d}
|
|
#Create single feature for parents education
|
|
Alcohol_Consumption <- students$Alcohol_Consumption
|
|
Parents_Education <- students$Parents_Education
|
|
|
|
students_dummies_final = cbind(students_dummies,Alcohol_Consumption,Parents_Education)
|
|
students_dummies_final <- dummy_cols(students_dummies_final, select_columns = "Alcohol_Consumption")
|
|
students_dummies_final <- dummy_cols(students_dummies_final, select_columns = "Parents_Education")
|
|
```
|
|
|
|
```{r drop columns}
|
|
####NEEDED TO RESTART R RESSION AND RUN EACH OF THESE COMMANDS IN THE CONSOLE FOR IT TO WORK
|
|
###I BELIEVE IT"S BECAUSE OF THE PACKAGE ORDER DEFINED AT THE BEGINNING
|
|
|
|
#Drop columns that end with _no, since yes_ factor already contains inherit grouping for the binary variable
|
|
library(dplyr)
|
|
students_dummies_final <- select(students_dummies_final, -contains("Medu"))
|
|
students_dummies_final <- select(students_dummies_final, -contains("Fedu"))
|
|
students_dummies_final <- select(students_dummies_final, -contains("Dalc"))
|
|
students_dummies_final <- select(students_dummies_final, -contains("Walc"))
|
|
#students_dummies_final <- select(students_dummies, -contains("_no"))
|
|
```
|
|
|
|
``` {r split data and run final glasso}
|
|
#Split data into train and test
|
|
dt = sort(sample(nrow(students_dummies_final), nrow(students_dummies_final)*.8))
|
|
train<-students_dummies_final[dt,]
|
|
train_y_final <- train$G3
|
|
train_x_final <- train[ ,!(colnames(train) == "G3")]
|
|
test<-students_dummies_final[-dt,]
|
|
test_y_final <- test$G3
|
|
test_x_final <- test[ ,!(colnames(test) == "G3")]
|
|
|
|
|
|
#Create group vector for group lasso
|
|
group_vars_df_final <- as.data.frame(list(sapply(train_x_final, class)),col.names = 'type')
|
|
#Set the index to a new column called column_name
|
|
group_vars_df_final <- cbind(column_name = rownames(group_vars_df_final), group_vars_df_final)
|
|
rownames(group_vars_df_final) <- 1:nrow(group_vars_df_final)
|
|
|
|
|
|
column_name_prefixes_final <- rep(NA, nrow(group_vars_df_final))
|
|
|
|
for (row in 1:nrow(group_vars_df_final)) {
|
|
#Iterate over all the rows and create dummy variables
|
|
column_name_prefixes_final[row]=strsplit(group_vars_df_final[row,'column_name'],"_")[[1]][1]
|
|
}
|
|
|
|
#Get the unique elements of the list
|
|
unique_prefixes <- unique(column_name_prefixes_final)
|
|
mapping_list <- seq(1:length(unique_prefixes))
|
|
group_factor_final <- rep(NA, length(column_name_prefixes_final))
|
|
|
|
#Create group_factor list
|
|
i<-1
|
|
for (elem in column_name_prefixes_final) {
|
|
group_factor_final[i]=which(unique_prefixes %in% elem)
|
|
i <- i+1
|
|
}
|
|
```
|
|
|
|
```{r group lasso regression final grouping}
|
|
glasso_final = cv.grpreg(as.matrix(train_x_final),as.vector(train_y_final),group_factor_final)
|
|
```
|
|
|
|
```{r store optimal lambda from cv group lasso}
|
|
glasso_lambda <- min(glasso_final$lambda)
|
|
which(glasso_final$fit$beta[,glasso_final$min]==0)
|
|
```
|
|
|
|
```{r prediction / mse results group lasso}
|
|
y_pred_glasso_final <- predict(glasso_final,as.matrix(test_x_final),lambda=glasso_lambda)
|
|
mse_glasso_final = sum((test_y_final-y_pred_glasso_final)^2)/length(y_pred_glasso_final)
|
|
mse_glasso_final
|
|
```
|
|
|
|
```{r summary glasso}
|
|
summary(glasso_final)
|
|
glasso$fit$beta
|
|
glasso_final
|
|
``` |