285 lines
9.7 KiB
R
285 lines
9.7 KiB
R
# -------------------- Code for Question 8.2 -----------------------------
|
||
# Clear environment
|
||
|
||
rm(list = ls())
|
||
|
||
|
||
# Setting the random number generator seed so that our results are reproducible
|
||
# (Your solution doesn't need this, but it's usually good practice to do)
|
||
|
||
set.seed(1)
|
||
|
||
|
||
# ---------------------------- Data manipulation -------------------------------------
|
||
|
||
#First, Read in the data
|
||
#
|
||
dat <- read.table("uscrime.txt", stringsAsFactors = FALSE, header = TRUE)
|
||
|
||
#
|
||
# optional check to make sure the data is read correctly
|
||
#
|
||
|
||
head(dat)
|
||
|
||
## M So Ed Po1 Po2 LF M.F Pop NW U1 U2 Wealth Ineq Prob Time Crime
|
||
## 1 15.1 1 9.1 5.8 5.6 0.510 95.0 33 30.1 0.108 4.1 3940 26.1 0.084602 26.2011 791
|
||
## 2 14.3 0 11.3 10.3 9.5 0.583 101.2 13 10.2 0.096 3.6 5570 19.4 0.029599 25.2999 1635
|
||
## 3 14.2 1 8.9 4.5 4.4 0.533 96.9 18 21.9 0.094 3.3 3180 25.0 0.083401 24.3006 578
|
||
## 4 13.6 0 12.1 14.9 14.1 0.577 99.4 157 8.0 0.102 3.9 6730 16.7 0.015801 29.9012 1969
|
||
## 5 14.1 0 12.1 10.9 10.1 0.591 98.5 18 3.0 0.091 2.0 5780 17.4 0.041399 21.2998 1234
|
||
## 6 12.1 0 11.0 11.8 11.5 0.547 96.4 25 4.4 0.084 2.9 6890 12.6 0.034201 20.9995 682
|
||
# NOTE: ALL ROWS OF THIS FILE STARTING WITH "##" DENOTE R OUTPUT
|
||
#
|
||
# Crime is response, other variables are predictors
|
||
#
|
||
|
||
# *****************************
|
||
# Solution using lm()
|
||
# *****************************
|
||
|
||
|
||
# We use the entire dataset to build a regression model which is then used for prediction
|
||
# We're not choosing between models (so validation isn't needed)
|
||
# and we're not bothering to estimate model quality (so test data isn't needed)
|
||
|
||
model <- lm( Crime ~ ., data = dat)
|
||
|
||
#Summary of the model
|
||
|
||
summary(model)
|
||
|
||
## Residuals:
|
||
## Min 1Q Median 3Q Max
|
||
## -395.74 -98.09 -6.69 112.99 512.67
|
||
##
|
||
## Coefficients:
|
||
## Estimate Std. Error t value Pr(>|t|)
|
||
## (Intercept) -5.984e+03 1.628e+03 -3.675 0.000893 ***
|
||
## M 8.783e+01 4.171e+01 2.106 0.043443 *
|
||
## So -3.803e+00 1.488e+02 -0.026 0.979765
|
||
## Ed 1.883e+02 6.209e+01 3.033 0.004861 **
|
||
## Po1 1.928e+02 1.061e+02 1.817 0.078892 .
|
||
## Po2 -1.094e+02 1.175e+02 -0.931 0.358830
|
||
## LF -6.638e+02 1.470e+03 -0.452 0.654654
|
||
## M.F 1.741e+01 2.035e+01 0.855 0.398995
|
||
## Pop -7.330e-01 1.290e+00 -0.568 0.573845
|
||
## NW 4.204e+00 6.481e+00 0.649 0.521279
|
||
## U1 -5.827e+03 4.210e+03 -1.384 0.176238
|
||
## U2 1.678e+02 8.234e+01 2.038 0.050161 .
|
||
## Wealth 9.617e-02 1.037e-01 0.928 0.360754
|
||
## Ineq 7.067e+01 2.272e+01 3.111 0.003983 **
|
||
## Prob -4.855e+03 2.272e+03 -2.137 0.040627 *
|
||
## Time -3.479e+00 7.165e+00 -0.486 0.630708
|
||
## ---
|
||
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
|
||
##
|
||
## Residual standard error: 209.1 on 31 degrees of freedom
|
||
## Multiple R-squared: 0.8031, Adjusted R-squared: 0.7078
|
||
## F-statistic: 8.429 on 15 and 31 DF, p-value: 3.539e-07
|
||
|
||
|
||
#Create the test datapoint manually using dataframe
|
||
|
||
test <-data.frame(M = 14.0,So = 0, Ed = 10.0, Po1 = 12.0, Po2 = 15.5,LF = 0.640, M.F = 94.0, Pop = 150, NW = 1.1, U1 = 0.120, U2 = 3.6, Wealth = 3200, Ineq = 20.1, Prob = 0.040,Time = 39.0)
|
||
|
||
|
||
#Predict the crime rate for test data point
|
||
|
||
pred_model <- predict(model, test)
|
||
pred_model
|
||
|
||
## 155.4349
|
||
|
||
# This is unexpected!
|
||
# The estimate is less than half of the crime rate of the next-lowest city.
|
||
# None of the factor values of the test data point
|
||
# are outside the range of the other data points, so that's not
|
||
# the explanation.
|
||
#
|
||
# What might be going on?
|
||
#
|
||
# I specifically chose this data point as a demonstration.
|
||
# The full model we used above includes a lot of insignificant factors.
|
||
# You might wonder, "Why not just use the whole model, even if some
|
||
# factors are insignificant?"
|
||
# This is why!
|
||
#
|
||
# Let's go back and just use the singificant factors to get an estimate.
|
||
# We'll try using all of the factors with p<=0.1.
|
||
# (In Module 11, we'll see better ways of going about this.)
|
||
|
||
model2 <- lm( Crime ~ M + Ed + Po1 + U2 + Ineq + Prob, data = dat)
|
||
|
||
#Summary of the model
|
||
|
||
summary(model2)
|
||
|
||
## Residuals:
|
||
## Min 1Q Median 3Q Max
|
||
## -470.68 -78.41 -19.68 133.12 556.23
|
||
##
|
||
## Coefficients:
|
||
## Estimate Std. Error t value Pr(>|t|)
|
||
## (Intercept) -5040.50 899.84 -5.602 1.72e-06 ***
|
||
## Ed 196.47 44.75 4.390 8.07e-05 ***
|
||
## Po1 115.02 13.75 8.363 2.56e-10 ***
|
||
## Ineq 67.65 13.94 4.855 1.88e-05 ***
|
||
## M 105.02 33.30 3.154 0.00305 **
|
||
## Prob -3801.84 1528.10 -2.488 0.01711 *
|
||
## U2 89.37 40.91 2.185 0.03483 *
|
||
## ---
|
||
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
|
||
##
|
||
## Residual standard error: 200.7 on 40 degrees of freedom
|
||
## Multiple R-squared: 0.7659, Adjusted R-squared: 0.7307
|
||
## F-statistic: 21.81 on 6 and 40 DF, p-value: 3.418e-11
|
||
|
||
#Predict on our test observation
|
||
|
||
pred_model2 <- predict(model2, test)
|
||
pred_model2
|
||
|
||
## 1304.245
|
||
|
||
#This seems like a more reasonable prediction, now that the insignificant factors are gone.
|
||
|
||
# Oops, I forgot that we actually *do* want to know the model quality.
|
||
# We can't just use what's reported above, because that's on the training data.
|
||
#
|
||
|
||
# Install the DAAG package, which has cross-validation functions
|
||
|
||
install.packages("DAAG")
|
||
library(DAAG)
|
||
|
||
# do 5-fold cross-validation
|
||
|
||
c <- cv.lm(dat,model2,m=5) # note that here, "m" is used for the number of folds, rather than the usual "k"
|
||
c
|
||
|
||
# The overall mean squared prediction error in cross-validation is shown as "ms".
|
||
# NOTE that there seems to be a typo in cv.lm --
|
||
# when it says "sum over all n folds", n is actually the number
|
||
# of data points in the last fold, not the number of folds.
|
||
|
||
# We can calculate the R-squared values directly.
|
||
# R-squared = 1 - SSEresiduals/SSEtotal
|
||
#
|
||
# total sum of squared differences between data and its mean
|
||
|
||
SStot <- sum((dat$Crime - mean(dat$Crime))^2)
|
||
|
||
# for model, model2, and cross-validation, calculated SEres
|
||
|
||
SSres_model <- sum(model$residuals^2)
|
||
|
||
SSres_model2 <- sum(model2$residuals^2)
|
||
|
||
SSres_c <- attr(c,"ms")*nrow(dat) # mean squared error, times number of data points, gives sum of squared errors
|
||
|
||
# Calculate R-squareds for model, model2, cross-validation
|
||
|
||
1 - SSres_model/SStot # initial model with insignificant factors
|
||
|
||
## 0.803
|
||
|
||
1 - SSres_model2/SStot # model2 without insignificant factors
|
||
|
||
## 0.766
|
||
|
||
1 - SSres_c/SStot # cross-validated
|
||
|
||
## 0.638
|
||
|
||
# So, this shows that including the insignificant factors overfits compared to removing them,
|
||
# and even the fitted model is probably overfitted.
|
||
# That's not so surprising, since we started with just 47 data points and we have 15 factors to predict from.
|
||
# The ratio of data points to factors is about 3:1,
|
||
# and it's usually good to have 10:1 or more.
|
||
#
|
||
# We'll see in Module 11 ways we can try to get around this problem.
|
||
|
||
# We can also try cross-validation on the first, 15-factor model
|
||
|
||
cfirst <- cv.lm(dat,model,m=5)
|
||
|
||
SSres_cfirst <- attr(cfirst,"ms")*nrow(dat) # mean squared error, times number of data points, gives sum of squared errors
|
||
|
||
1 - SSres_cfirst/SStot # cross-validated
|
||
|
||
## 0.413
|
||
|
||
# That's a huge difference from the 0.803 reported by lm() on the training data, which demonstrates the need to validate!
|
||
|
||
|
||
# *****************************
|
||
# Solution using glm()
|
||
# *****************************
|
||
|
||
# We can do the same things using glm() instead of lm().
|
||
#
|
||
# glm() is a more-general function for regression.
|
||
|
||
g <- glm(Crime ~ . , data=dat, family="gaussian")
|
||
summary(g)
|
||
|
||
## Coefficients:
|
||
## Estimate Std. Error t value Pr(>|t|)
|
||
## (Intercept) -5.98e+03 1.63e+03 -3.68 0.00089 ***
|
||
## M 8.78e+01 4.17e+01 2.11 0.04344 *
|
||
## So -3.80e+00 1.49e+02 -0.03 0.97977
|
||
## Ed 1.88e+02 6.21e+01 3.03 0.00486 **
|
||
## Po1 1.93e+02 1.06e+02 1.82 0.07889 .
|
||
## Po2 -1.09e+02 1.17e+02 -0.93 0.35883
|
||
## LF -6.64e+02 1.47e+03 -0.45 0.65465
|
||
## M.F 1.74e+01 2.04e+01 0.86 0.39900
|
||
## Pop -7.33e-01 1.29e+00 -0.57 0.57385
|
||
## NW 4.20e+00 6.48e+00 0.65 0.52128
|
||
## U1 -5.83e+03 4.21e+03 -1.38 0.17624
|
||
## U2 1.68e+02 8.23e+01 2.04 0.05016 .
|
||
## Wealth 9.62e-02 1.04e-01 0.93 0.36075
|
||
## Ineq 7.07e+01 2.27e+01 3.11 0.00398 **
|
||
## Prob -4.86e+03 2.27e+03 -2.14 0.04063 *
|
||
## Time -3.48e+00 7.17e+00 -0.49 0.63071
|
||
## ---
|
||
## Signif. codes: 0 <20>***<2A> 0.001 <20>**<2A> 0.01 <20>*<2A> 0.05 <20>.<2E> 0.1 <20> <20> 1
|
||
|
||
g2 <- glm(Crime ~ M + Ed + Po1 + U2 + Ineq + Prob , data=dat, family="gaussian")
|
||
summary(g2)
|
||
|
||
## Coefficients:
|
||
## Estimate Std. Error t value Pr(>|t|)
|
||
## (Intercept) -5040.5 899.8 -5.60 1.7e-06 ***
|
||
## M 105.0 33.3 3.15 0.0031 **
|
||
## Ed 196.5 44.8 4.39 8.1e-05 ***
|
||
## Po1 115.0 13.8 8.36 2.6e-10 ***
|
||
## U2 89.4 40.9 2.18 0.0348 *
|
||
## Ineq 67.7 13.9 4.85 1.9e-05 ***
|
||
## Prob -3801.8 1528.1 -2.49 0.0171 *
|
||
## ---
|
||
## Signif. codes: 0 <20>***<2A> 0.001 <20>**<2A> 0.01 <20>*<2A> 0.05 <20>.<2E> 0.1 <20> <20> 1
|
||
|
||
# for cross-validation, we need the boot library
|
||
|
||
library(boot)
|
||
|
||
cg <- cv.glm(dat,g,K=5) # note that here, K is the number of folds
|
||
cg2 <- cv.glm(dat,g2,K=5)
|
||
|
||
# mean squared error is cg$delta[1]
|
||
|
||
1 - cg$delta[1]*nrow(dat)/SStot
|
||
|
||
## 0.281
|
||
# depending on random seed, this could be different;
|
||
# a second time I got 0.427
|
||
# That's a low R-squared value when cross-validating the 15-factor model
|
||
|
||
1 - cg2$delta[1]*nrow(dat)/SStot
|
||
|
||
## 0.671
|
||
# depending on random seed, this could be different;
|
||
# a second time I got 0.673
|
||
|