# ------------------------ Code for Question 5.1 ------------------------------------- # Clear environment rm(list = ls()) # Installing and calling packages install.packages("outliers") library(outliers) # Reading the data data <- read.table("uscrime.txt", stringsAsFactors = FALSE, header = TRUE) # optional check to make sure the data is read correctly head(data) ## M So Ed Po1 Po2 LF M.F Pop NW U1 U2 Wealth Ineq Prob Time Crime ## 1 15.1 1 9.1 5.8 5.6 0.510 95.0 33 30.1 0.108 4.1 3940 26.1 0.084602 26.2011 791 ## 2 14.3 0 11.3 10.3 9.5 0.583 101.2 13 10.2 0.096 3.6 5570 19.4 0.029599 25.2999 1635 ## 3 14.2 1 8.9 4.5 4.4 0.533 96.9 18 21.9 0.094 3.3 3180 25.0 0.083401 24.3006 578 ## 4 13.6 0 12.1 14.9 14.1 0.577 99.4 157 8.0 0.102 3.9 6730 16.7 0.015801 29.9012 1969 ## 5 14.1 0 12.1 10.9 10.1 0.591 98.5 18 3.0 0.091 2.0 5780 17.4 0.041399 21.2998 1234 ## 6 12.1 0 11.0 11.8 11.5 0.547 96.4 25 4.4 0.084 2.9 6890 12.6 0.034201 20.9995 682 # NOTE: ALL ROWS OF THIS FILE STARTING WITH "##" DENOTE R OUTPUT # Crime is the variable of interest crime <- data[,"Crime"] # Run the Shapiro-Wilk test to test the normality of the crime data shapiro.test(crime) ## Shapiro-Wilk normality test ## ## data: crime ## W = 0.91273, p-value = 0.001882 # p-value rejects the null hypothesis that the data is normally distributed # But, normality tests are prone to missing the forest for the trees. # (I.e., they can give an answer that's probably wrong, because # they focus too much on a few data points. # That's especially true if there are outliers in the data, # which is exactly what we're looking for. # Look at the Q-Q plot of the crime data as another method to test the normality of the data qqnorm(crime) qqnorm(scale(crime)) # Q-Q plot suggests that the "middle" of the data is normally distributed, so we may assume # that the data is approximately normally distributed and run the Grubbs' test # Run the Grubbs' test for two outliers on opposite tails test <- grubbs.test(crime, type = 11) # Print results of grubbs test test ## Grubbs test for two opposite outliers ## ## data: crime ## G = 4.26880, U = 0.78103, p-value = 1 ## alternative hypothesis: 342 and 1993 are outliers # With p-value = 1, at least one of the extremes (highest or lowest) # is NOT an outlier. # # So, let's check each one individually. test <- grubbs.test(crime, type = 10) test ## Grubbs test for one outlier ## ## data: crime ## G = 2.8129, U = 0.8243, p-value = 0.07887 ## alternative hypothesis: highest value 1993 is an outlier # Depending on our threshold p-value, we might or might not # choose to call than an outlier. For example, some people use # p=0.05 as a threshold, some use p=0.10. # # Let's go ahead and declare this an outlier. # # Now, even though you didn't need to for the homework, # let's check the second-highest point to see if it's an outlier too. # Create a new data set without the largest value crime2 <- crime[-which.max(crime)] # Now test it test <- grubbs.test(crime2, type = 10) test ## Grubbs test for one outlier ## ## data: crime2 ## G = 3.0634, U = 0.7868, p-value = 0.02848 ## alternative hypothesis: highest value 1969 is an outlier # That's a low p-value, suggesting that the second-highest-crime # city in the original data set is also an outlier. # # So, let's remove it, and test the next one. crime3 <- crime2[-which.max(crime2)] test <- grubbs.test(crime3, type = 10) test ## Grubbs test for one outlier ## ## data: crime3 ## G = 2.5646, U = 0.8471, p-value = 0.1781 ## alternative hypothesis: highest value 1674 is an outlier # That's a high-enough p-value that it's not clear the point is an outlier. # So let's stop here, having removed the two highest points as outliers. # # But let's also check the lowest point. # grubbs.test picks the most-outlying point, and it always picked # the high ones. To get the low one, we'll use the opposite=TRUE parameter. test <- grubbs.test(crime3,type=10,opposite=TRUE) test ## Grubbs test for one outlier ## ## data: crime3 ## G = 1.6180, U = 0.9392, p-value = 1 ## alternative hypothesis: lowest value 342 is an outlier # The p-value rounds to 1, so the lowest-crime city does not # seem to be an outlier. # That's why our first test that checked both extremes returned # a p-value of 1. # # Note that the result would've been the same even if we hadn't # removed the two outliers yet. # # Finally, let's do one more thing that's not necessary: # Let's see what it looks like visually. # # Let's make a box-and-whisker plot ############# Outlier Visualization (Not Necessary) ############# # Installing and calling packages # Here's the same plotting package as in question 2. install.packages("ggplot2") library(ggplot2) # Define a function that finds 5%, 25%, 50%, 75%, and 95% quantiles of the data quant <- function(x) { r <- quantile(x, probs = c(0.05, 0.25, 0.5, 0.75, 0.95)) names(r) <- c("ymin", "lower", "middle", "upper", "ymax") r } # Create data frame with just the crime data df <- data.frame(x = rep(1, nrow(data)), y = crime) # Define a function that finds points below and aboce the 5% and 95% quantiles of the data outliers <- function(x) { subset(x, x < quantile(x, 0.05) | quantile(x, 0.95) < x) } # Create the box-and-whisker plot ggplot(df, aes(x, y)) + stat_summary(fun.data = quant, geom="boxplot") + stat_summary(fun.y = outliers, geom="point") # This visualization shows us that the two cities with the # highest amount of crime seem to be outliers. # The two cities with the lowest amount of crime are close enough # to not necessarily be considered outliers.