# -------------------- Code for Question 8.2 -----------------------------
# NOTE: this script deliberately does NOT call rm(list = ls()).
# That call deletes every object in the caller's workspace when the script is
# sourced, yet it does not produce a clean session: attached packages and
# changed options are left as they are.
# For a clean run, start a fresh R session before running this script.


# Setting the random number generator seed so that our results are reproducible
# (Your solution doesn't need this, but it's usually good practice to do)

set.seed(1)


# ---------------------------- Data manipulation -------------------------------------

# Load the crime data. header = TRUE: the first line of the file holds the
# column names. The path is relative to the current working directory.
dat <- read.table(
  file = "uscrime.txt",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Optional sanity check: show the first rows to confirm the data was read
# correctly

head(dat)

## M So   Ed  Po1  Po2    LF   M.F Pop   NW    U1  U2 Wealth Ineq     Prob    Time Crime
## 1 15.1  1  9.1  5.8  5.6 0.510  95.0  33 30.1 0.108 4.1   3940 26.1 0.084602 26.2011   791
## 2 14.3  0 11.3 10.3  9.5 0.583 101.2  13 10.2 0.096 3.6   5570 19.4 0.029599 25.2999  1635
## 3 14.2  1  8.9  4.5  4.4 0.533  96.9  18 21.9 0.094 3.3   3180 25.0 0.083401 24.3006   578
## 4 13.6  0 12.1 14.9 14.1 0.577  99.4 157  8.0 0.102 3.9   6730 16.7 0.015801 29.9012  1969
## 5 14.1  0 12.1 10.9 10.1 0.591  98.5  18  3.0 0.091 2.0   5780 17.4 0.041399 21.2998  1234
## 6 12.1  0 11.0 11.8 11.5 0.547  96.4  25  4.4 0.084 2.9   6890 12.6 0.034201 20.9995   682
# NOTE: ALL ROWS OF THIS FILE STARTING WITH "##" DENOTE R OUTPUT
#
# Crime is response, other variables are predictors
#

# *****************************
# Solution using lm()
# *****************************


# The regression model is fit on the entire dataset and then used for prediction.
# No validation set: we're not choosing between models.
# No test set: we're not bothering to estimate model quality.

model <- lm(formula = Crime ~ ., data = dat)

# Coefficient table and fit statistics of the full model

summary(model)

## Residuals:
##   Min      1Q  Median      3Q     Max 
## -395.74  -98.09   -6.69  112.99  512.67 
## 
## Coefficients:
##   Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -5.984e+03  1.628e+03  -3.675 0.000893 ***
## M            8.783e+01  4.171e+01   2.106 0.043443 *  
## So          -3.803e+00  1.488e+02  -0.026 0.979765    
## Ed           1.883e+02  6.209e+01   3.033 0.004861 ** 
## Po1          1.928e+02  1.061e+02   1.817 0.078892 .  
## Po2         -1.094e+02  1.175e+02  -0.931 0.358830    
## LF          -6.638e+02  1.470e+03  -0.452 0.654654    
## M.F          1.741e+01  2.035e+01   0.855 0.398995    
## Pop         -7.330e-01  1.290e+00  -0.568 0.573845    
## NW           4.204e+00  6.481e+00   0.649 0.521279    
## U1          -5.827e+03  4.210e+03  -1.384 0.176238    
## U2           1.678e+02  8.234e+01   2.038 0.050161 .  
## Wealth       9.617e-02  1.037e-01   0.928 0.360754    
## Ineq         7.067e+01  2.272e+01   3.111 0.003983 ** 
## Prob        -4.855e+03  2.272e+03  -2.137 0.040627 *  
## Time        -3.479e+00  7.165e+00  -0.486 0.630708    
## ---
##   Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 209.1 on 31 degrees of freedom
## Multiple R-squared:  0.8031,	Adjusted R-squared:  0.7078 
## F-statistic: 8.429 on 15 and 31 DF,  p-value: 3.539e-07


# Build the single test data point by hand, one predictor per line

test <- data.frame(
  M = 14.0,
  So = 0,
  Ed = 10.0,
  Po1 = 12.0,
  Po2 = 15.5,
  LF = 0.640,
  M.F = 94.0,
  Pop = 150,
  NW = 1.1,
  U1 = 0.120,
  U2 = 3.6,
  Wealth = 3200,
  Ineq = 20.1,
  Prob = 0.040,
  Time = 39.0
)


# Predicted crime rate for the test data point under the full model

pred_model <- predict(model, newdata = test)
pred_model

## 155.4349 

# This is unexpected!
# The estimate is less than half of the crime rate of the next-lowest city.
# None of the factor values of the test data point
# are outside the range of the other data points, so that's not
# the explanation.
#
# What might be going on?
#
# I specifically chose this data point as a demonstration.
# The full model we used above includes a lot of insignificant factors.
# You might wonder, "Why not just use the whole model, even if some
# factors are insignificant?"
# This is why!
#
# Let's go back and just use the significant factors to get an estimate.
# We'll try using all of the factors with p<=0.1.
# (In Module 11, we'll see better ways of going about this.)

model2 <- lm(
  formula = Crime ~ M + Ed + Po1 + U2 + Ineq + Prob,
  data = dat
)

# Coefficient table and fit statistics of the reduced model

summary(model2)

## Residuals:
##   Min      1Q  Median      3Q     Max 
## -470.68  -78.41  -19.68  133.12  556.23 
## 
## Coefficients:
##   Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -5040.50     899.84  -5.602 1.72e-06 ***
##   Ed            196.47      44.75   4.390 8.07e-05 ***
##   Po1           115.02      13.75   8.363 2.56e-10 ***
##   Ineq           67.65      13.94   4.855 1.88e-05 ***
##   M             105.02      33.30   3.154  0.00305 ** 
##   Prob        -3801.84    1528.10  -2.488  0.01711 *  
##   U2             89.37      40.91   2.185  0.03483 *  
##   ---
##   Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 200.7 on 40 degrees of freedom
## Multiple R-squared:  0.7659,	Adjusted R-squared:  0.7307 
## F-statistic: 21.81 on 6 and 40 DF,  p-value: 3.418e-11

# Predicted crime rate for the same test observation under the reduced model

pred_model2 <- predict(model2, newdata = test)
pred_model2

## 1304.245 

#This seems like a more reasonable prediction, now that the insignificant factors are gone.

# Oops, I forgot that we actually *do* want to know the model quality.
# We can't just use what's reported above, because that's on the training data.
#

# The DAAG package has cross-validation functions.
# Install it only when it is missing, so that re-running the script does not
# download the package again (or fail when there is no network connection).

if (!requireNamespace("DAAG", quietly = TRUE)) {
  install.packages("DAAG")
}
library(DAAG)

# do 5-fold cross-validation

c <- cv.lm(dat, model2, m = 5) # note that here, "m" is used for the number of folds, rather than the usual "k"
c

# The overall mean squared prediction error in cross-validation is shown as "ms".
# NOTE that there seems to be a typo in cv.lm -- 
# when it says "sum over all n folds", n is actually the number
# of data points in the last fold, not the number of folds.

# The R-squared values can be computed directly from sums of squares:
# R-squared = 1 - SSres/SStot
#
# SStot: total sum of squared differences between the response and its mean

SStot <- with(dat, sum((Crime - mean(Crime))^2))

# SSres: sum of squared residuals for model, model2, and cross-validation

SSres_model <- sum(residuals(model)^2)

SSres_model2 <- sum(residuals(model2)^2)

# mean squared error, times number of data points, gives sum of squared errors
SSres_c <- nrow(dat) * attr(c, "ms")

# R-squared for model, model2, and cross-validation

1 - SSres_model / SStot # initial model with insignificant factors

## 0.803

1 - SSres_model2 / SStot # model2 without insignificant factors

## 0.766

1 - SSres_c / SStot # cross-validated

## 0.638

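# So, this shows that including the insignificant factors overfits compared to removing them,
# and even the reduced model (model2) is probably overfitted: its cross-validated
# R-squared (0.638) is well below its R-squared on the training data (0.766).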
# That's not so surprising, since we started with just 47 data points and we have 15 factors to predict from.
# The ratio of data points to factors is about 3:1, 
# and it's usually good to have 10:1 or more.
#
# We'll see in Module 11 ways we can try to get around this problem.

# Repeat the 5-fold cross-validation for the first, 15-factor model

cfirst <- cv.lm(dat, form.lm = model, m = 5)

# mean squared error, times number of data points, gives sum of squared errors
SSres_cfirst <- nrow(dat) * attr(cfirst, "ms")

1 - SSres_cfirst / SStot # cross-validated

## 0.413

# That's a huge difference from the 0.803 reported by lm() on the training data, which demonstrates the need to validate!


# *****************************
# Solution using glm()
# *****************************

# The same analysis can be done with glm() in place of lm().
#
# glm() is a more-general function for regression.

g <- glm(formula = Crime ~ ., family = "gaussian", data = dat)
summary(g)

## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -5.98e+03   1.63e+03   -3.68  0.00089 ***
## M            8.78e+01   4.17e+01    2.11  0.04344 *  
## So          -3.80e+00   1.49e+02   -0.03  0.97977    
## Ed           1.88e+02   6.21e+01    3.03  0.00486 ** 
## Po1          1.93e+02   1.06e+02    1.82  0.07889 .  
## Po2         -1.09e+02   1.17e+02   -0.93  0.35883    
## LF          -6.64e+02   1.47e+03   -0.45  0.65465    
## M.F          1.74e+01   2.04e+01    0.86  0.39900    
## Pop         -7.33e-01   1.29e+00   -0.57  0.57385    
## NW           4.20e+00   6.48e+00    0.65  0.52128    
## U1          -5.83e+03   4.21e+03   -1.38  0.17624    
## U2           1.68e+02   8.23e+01    2.04  0.05016 .  
## Wealth       9.62e-02   1.04e-01    0.93  0.36075    
## Ineq         7.07e+01   2.27e+01    3.11  0.00398 ** 
## Prob        -4.86e+03   2.27e+03   -2.14  0.04063 *  
## Time        -3.48e+00   7.17e+00   -0.49  0.63071    
## ---
## Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

g2 <- glm(
  formula = Crime ~ M + Ed + Po1 + U2 + Ineq + Prob,
  family = "gaussian",
  data = dat
)
summary(g2)

## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -5040.5      899.8   -5.60  1.7e-06 ***
## M              105.0       33.3    3.15   0.0031 ** 
## Ed             196.5       44.8    4.39  8.1e-05 ***
## Po1            115.0       13.8    8.36  2.6e-10 ***
## U2              89.4       40.9    2.18   0.0348 *  
## Ineq            67.7       13.9    4.85  1.9e-05 ***
## Prob         -3801.8     1528.1   -2.49   0.0171 *  
## ---
## Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# cv.glm(), used for the cross-validation below, comes from the boot library

library(boot)

# note that here, K is the number of folds
cg <- cv.glm(data = dat, glmfit = g, K = 5)
cg2 <- cv.glm(data = dat, glmfit = g2, K = 5)

# the mean squared error is the first element of delta

1 - cg[["delta"]][1] * nrow(dat) / SStot

## 0.281 
# depending on random seed, this could be different; 
# a second time I got 0.427
# That's a low R-squared value when cross-validating the 15-factor model

1 - cg2[["delta"]][1] * nrow(dat) / SStot

## 0.671
# depending on random seed, this could be different; 
# a second time I got 0.673