Files
tunmnlu/task_2/others-answer/omsa-main/ISYE-6501-OAN/hw5/solution/solution 8.2.R
louiscklaw 9035c1312b update,
2025-02-01 02:09:32 +08:00

285 lines
9.7 KiB
R
Raw Blame History

# -------------------- Code for Question 8.2 -----------------------------
# Clear environment
rm(list = ls())
# Setting the random number generator seed so that our results are reproducible
# (Your solution doesn't need this, but it's usually good practice to do)
set.seed(1)
# ---------------------------- Data manipulation -------------------------------------
#First, Read in the data
#
dat <- read.table("uscrime.txt", stringsAsFactors = FALSE, header = TRUE)
#
# optional check to make sure the data is read correctly
#
head(dat)
## M So Ed Po1 Po2 LF M.F Pop NW U1 U2 Wealth Ineq Prob Time Crime
## 1 15.1 1 9.1 5.8 5.6 0.510 95.0 33 30.1 0.108 4.1 3940 26.1 0.084602 26.2011 791
## 2 14.3 0 11.3 10.3 9.5 0.583 101.2 13 10.2 0.096 3.6 5570 19.4 0.029599 25.2999 1635
## 3 14.2 1 8.9 4.5 4.4 0.533 96.9 18 21.9 0.094 3.3 3180 25.0 0.083401 24.3006 578
## 4 13.6 0 12.1 14.9 14.1 0.577 99.4 157 8.0 0.102 3.9 6730 16.7 0.015801 29.9012 1969
## 5 14.1 0 12.1 10.9 10.1 0.591 98.5 18 3.0 0.091 2.0 5780 17.4 0.041399 21.2998 1234
## 6 12.1 0 11.0 11.8 11.5 0.547 96.4 25 4.4 0.084 2.9 6890 12.6 0.034201 20.9995 682
# NOTE: ALL ROWS OF THIS FILE STARTING WITH "##" DENOTE R OUTPUT
#
# Crime is response, other variables are predictors
#
# *****************************
# Solution using lm()
# *****************************
# We use the entire dataset to build a regression model which is then used for prediction
# We're not choosing between models (so validation isn't needed)
# and we're not bothering to estimate model quality (so test data isn't needed)
model <- lm( Crime ~ ., data = dat)
#Summary of the model
summary(model)
## Residuals:
## Min 1Q Median 3Q Max
## -395.74 -98.09 -6.69 112.99 512.67
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.984e+03 1.628e+03 -3.675 0.000893 ***
## M 8.783e+01 4.171e+01 2.106 0.043443 *
## So -3.803e+00 1.488e+02 -0.026 0.979765
## Ed 1.883e+02 6.209e+01 3.033 0.004861 **
## Po1 1.928e+02 1.061e+02 1.817 0.078892 .
## Po2 -1.094e+02 1.175e+02 -0.931 0.358830
## LF -6.638e+02 1.470e+03 -0.452 0.654654
## M.F 1.741e+01 2.035e+01 0.855 0.398995
## Pop -7.330e-01 1.290e+00 -0.568 0.573845
## NW 4.204e+00 6.481e+00 0.649 0.521279
## U1 -5.827e+03 4.210e+03 -1.384 0.176238
## U2 1.678e+02 8.234e+01 2.038 0.050161 .
## Wealth 9.617e-02 1.037e-01 0.928 0.360754
## Ineq 7.067e+01 2.272e+01 3.111 0.003983 **
## Prob -4.855e+03 2.272e+03 -2.137 0.040627 *
## Time -3.479e+00 7.165e+00 -0.486 0.630708
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 209.1 on 31 degrees of freedom
## Multiple R-squared: 0.8031, Adjusted R-squared: 0.7078
## F-statistic: 8.429 on 15 and 31 DF, p-value: 3.539e-07
#Create the test datapoint manually using dataframe
test <-data.frame(M = 14.0,So = 0, Ed = 10.0, Po1 = 12.0, Po2 = 15.5,LF = 0.640, M.F = 94.0, Pop = 150, NW = 1.1, U1 = 0.120, U2 = 3.6, Wealth = 3200, Ineq = 20.1, Prob = 0.040,Time = 39.0)
#Predict the crime rate for test data point
pred_model <- predict(model, test)
pred_model
## 155.4349
# This is unexpected!
# The estimate is less than half of the crime rate of the next-lowest city.
# None of the factor values of the test data point
# are outside the range of the other data points, so that's not
# the explanation.
#
# What might be going on?
#
# I specifically chose this data point as a demonstration.
# The full model we used above includes a lot of insignificant factors.
# You might wonder, "Why not just use the whole model, even if some
# factors are insignificant?"
# This is why!
#
# Let's go back and just use the singificant factors to get an estimate.
# We'll try using all of the factors with p<=0.1.
# (In Module 11, we'll see better ways of going about this.)
model2 <- lm( Crime ~ M + Ed + Po1 + U2 + Ineq + Prob, data = dat)
#Summary of the model
summary(model2)
## Residuals:
## Min 1Q Median 3Q Max
## -470.68 -78.41 -19.68 133.12 556.23
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5040.50 899.84 -5.602 1.72e-06 ***
## Ed 196.47 44.75 4.390 8.07e-05 ***
## Po1 115.02 13.75 8.363 2.56e-10 ***
## Ineq 67.65 13.94 4.855 1.88e-05 ***
## M 105.02 33.30 3.154 0.00305 **
## Prob -3801.84 1528.10 -2.488 0.01711 *
## U2 89.37 40.91 2.185 0.03483 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 200.7 on 40 degrees of freedom
## Multiple R-squared: 0.7659, Adjusted R-squared: 0.7307
## F-statistic: 21.81 on 6 and 40 DF, p-value: 3.418e-11
#Predict on our test observation
pred_model2 <- predict(model2, test)
pred_model2
## 1304.245
#This seems like a more reasonable prediction, now that the insignificant factors are gone.
# Oops, I forgot that we actually *do* want to know the model quality.
# We can't just use what's reported above, because that's on the training data.
#
# Install the DAAG package, which has cross-validation functions
install.packages("DAAG")
library(DAAG)
# do 5-fold cross-validation
c <- cv.lm(dat,model2,m=5) # note that here, "m" is used for the number of folds, rather than the usual "k"
c
# The overall mean squared prediction error in cross-validation is shown as "ms".
# NOTE that there seems to be a typo in cv.lm --
# when it says "sum over all n folds", n is actually the number
# of data points in the last fold, not the number of folds.
# We can calculate the R-squared values directly.
# R-squared = 1 - SSEresiduals/SSEtotal
#
# total sum of squared differences between data and its mean
SStot <- sum((dat$Crime - mean(dat$Crime))^2)
# for model, model2, and cross-validation, calculated SEres
SSres_model <- sum(model$residuals^2)
SSres_model2 <- sum(model2$residuals^2)
SSres_c <- attr(c,"ms")*nrow(dat) # mean squared error, times number of data points, gives sum of squared errors
# Calculate R-squareds for model, model2, cross-validation
1 - SSres_model/SStot # initial model with insignificant factors
## 0.803
1 - SSres_model2/SStot # model2 without insignificant factors
## 0.766
1 - SSres_c/SStot # cross-validated
## 0.638
# So, this shows that including the insignificant factors overfits compared to removing them,
# and even the fitted model is probably overfitted.
# That's not so surprising, since we started with just 47 data points and we have 15 factors to predict from.
# The ratio of data points to factors is about 3:1,
# and it's usually good to have 10:1 or more.
#
# We'll see in Module 11 ways we can try to get around this problem.
# We can also try cross-validation on the first, 15-factor model
cfirst <- cv.lm(dat,model,m=5)
SSres_cfirst <- attr(cfirst,"ms")*nrow(dat) # mean squared error, times number of data points, gives sum of squared errors
1 - SSres_cfirst/SStot # cross-validated
## 0.413
# That's a huge difference from the 0.803 reported by lm() on the training data, which demonstrates the need to validate!
# *****************************
# Solution using glm()
# *****************************
# We can do the same things using glm() instead of lm().
#
# glm() is a more-general function for regression.
g <- glm(Crime ~ . , data=dat, family="gaussian")
summary(g)
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.98e+03 1.63e+03 -3.68 0.00089 ***
## M 8.78e+01 4.17e+01 2.11 0.04344 *
## So -3.80e+00 1.49e+02 -0.03 0.97977
## Ed 1.88e+02 6.21e+01 3.03 0.00486 **
## Po1 1.93e+02 1.06e+02 1.82 0.07889 .
## Po2 -1.09e+02 1.17e+02 -0.93 0.35883
## LF -6.64e+02 1.47e+03 -0.45 0.65465
## M.F 1.74e+01 2.04e+01 0.86 0.39900
## Pop -7.33e-01 1.29e+00 -0.57 0.57385
## NW 4.20e+00 6.48e+00 0.65 0.52128
## U1 -5.83e+03 4.21e+03 -1.38 0.17624
## U2 1.68e+02 8.23e+01 2.04 0.05016 .
## Wealth 9.62e-02 1.04e-01 0.93 0.36075
## Ineq 7.07e+01 2.27e+01 3.11 0.00398 **
## Prob -4.86e+03 2.27e+03 -2.14 0.04063 *
## Time -3.48e+00 7.17e+00 -0.49 0.63071
## ---
## Signif. codes: 0 <20>***<2A> 0.001 <20>**<2A> 0.01 <20>*<2A> 0.05 <20>.<2E> 0.1 <20> <20> 1
g2 <- glm(Crime ~ M + Ed + Po1 + U2 + Ineq + Prob , data=dat, family="gaussian")
summary(g2)
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5040.5 899.8 -5.60 1.7e-06 ***
## M 105.0 33.3 3.15 0.0031 **
## Ed 196.5 44.8 4.39 8.1e-05 ***
## Po1 115.0 13.8 8.36 2.6e-10 ***
## U2 89.4 40.9 2.18 0.0348 *
## Ineq 67.7 13.9 4.85 1.9e-05 ***
## Prob -3801.8 1528.1 -2.49 0.0171 *
## ---
## Signif. codes: 0 <20>***<2A> 0.001 <20>**<2A> 0.01 <20>*<2A> 0.05 <20>.<2E> 0.1 <20> <20> 1
# for cross-validation, we need the boot library
library(boot)
cg <- cv.glm(dat,g,K=5) # note that here, K is the number of folds
cg2 <- cv.glm(dat,g2,K=5)
# mean squared error is cg$delta[1]
1 - cg$delta[1]*nrow(dat)/SStot
## 0.281
# depending on random seed, this could be different;
# a second time I got 0.427
# That's a low R-squared value when cross-validating the 15-factor model
1 - cg2$delta[1]*nrow(dat)/SStot
## 0.671
# depending on random seed, this could be different;
# a second time I got 0.673