Files
tunmnlu/task_2/others-answer/omsa-main/ISYE-6501-OAN/hw3/solutions/solution 5.1.R
louiscklaw 9035c1312b update,
2025-02-01 02:09:32 +08:00

192 lines
5.6 KiB
R

# ------------------------ Code for Question 5.1 -------------------------------------
# Environment setup.
# NOTE: this script deliberately does not call rm(list = ls()). Wiping the
# caller's workspace is a destructive side effect, and the analysis in this
# section assigns each object before reading it, so it is not needed.
# Install the "outliers" package (used below for grubbs.test) only when it
# is missing, instead of re-downloading it on every run, then attach it.
if (!requireNamespace("outliers", quietly = TRUE)) {
  install.packages("outliers")
}
library(outliers)
# Load the US crime data set; the first line of the file holds the column names
data <- read.table(
  file = "uscrime.txt",
  header = TRUE,
  stringsAsFactors = FALSE
)
# Optional sanity check: show the first six rows to confirm the file was parsed
head(data, n = 6L)
## M So Ed Po1 Po2 LF M.F Pop NW U1 U2 Wealth Ineq Prob Time Crime
## 1 15.1 1 9.1 5.8 5.6 0.510 95.0 33 30.1 0.108 4.1 3940 26.1 0.084602 26.2011 791
## 2 14.3 0 11.3 10.3 9.5 0.583 101.2 13 10.2 0.096 3.6 5570 19.4 0.029599 25.2999 1635
## 3 14.2 1 8.9 4.5 4.4 0.533 96.9 18 21.9 0.094 3.3 3180 25.0 0.083401 24.3006 578
## 4 13.6 0 12.1 14.9 14.1 0.577 99.4 157 8.0 0.102 3.9 6730 16.7 0.015801 29.9012 1969
## 5 14.1 0 12.1 10.9 10.1 0.591 98.5 18 3.0 0.091 2.0 5780 17.4 0.041399 21.2998 1234
## 6 12.1 0 11.0 11.8 11.5 0.547 96.4 25 4.4 0.084 2.9 6890 12.6 0.034201 20.9995 682
# NOTE: ALL ROWS OF THIS FILE STARTING WITH "##" DENOTE R OUTPUT
# The Crime column is the variable examined for outliers
crime <- data[, "Crime"]
# Grubbs' test assumes normally distributed data, so check normality first.
# Run the Shapiro-Wilk test to test the normality of the crime data
shapiro.test(crime)
## Shapiro-Wilk normality test
##
## data: crime
## W = 0.91273, p-value = 0.001882
# The p-value rejects the null hypothesis that the data is normally distributed.
# But normality tests are prone to missing the forest for the trees
# (i.e., they can give an answer that's probably wrong, because
# they focus too much on a few data points).
# That's especially true if there are outliers in the data,
# which is exactly what we're looking for.
# Look at the Q-Q plot of the crime data as another method to test the normality of the data
qqnorm(crime)
# Same Q-Q plot, drawn for the centered and scaled crime values
qqnorm(scale(crime))
# Q-Q plot suggests that the "middle" of the data is normally distributed, so we may assume
# that the data is approximately normally distributed and run the Grubbs' test
# Run the Grubbs' test for two outliers on opposite tails
# (type = 11 tests the lowest and the highest value together)
test <- grubbs.test(crime, type = 11)
# Print results of grubbs test
test
## Grubbs test for two opposite outliers
##
## data: crime
## G = 4.26880, U = 0.78103, p-value = 1
## alternative hypothesis: 342 and 1993 are outliers
# With p-value = 1, at least one of the extremes (highest or lowest)
# is NOT an outlier.
#
# So, let's check each one individually.
# type = 10 tests a single outlier; here it reports on the highest value.
test <- grubbs.test(crime, type = 10)
test
## Grubbs test for one outlier
##
## data: crime
## G = 2.8129, U = 0.8243, p-value = 0.07887
## alternative hypothesis: highest value 1993 is an outlier
# Depending on our threshold p-value, we might or might not
# choose to call that an outlier. For example, some people use
# p=0.05 as a threshold, some use p=0.10.
#
# Let's go ahead and declare this an outlier.
#
# Now, even though you didn't need to for the homework,
# let's check the second-highest point to see if it's an outlier too.
# Create a new data set without the largest value
# (which.max gives the position of the maximum; the negative index drops it)
crime2 <- crime[-which.max(crime)]
# Now test it
test <- grubbs.test(crime2, type = 10)
test
## Grubbs test for one outlier
##
## data: crime2
## G = 3.0634, U = 0.7868, p-value = 0.02848
## alternative hypothesis: highest value 1969 is an outlier
# That's a low p-value, suggesting that the second-highest-crime
# city in the original data set is also an outlier.
#
# So, let's remove it, and test the next one.
# crime3 is the crime data with its two largest values removed.
crime3 <- crime2[-which.max(crime2)]
test <- grubbs.test(crime3, type = 10)
test
## Grubbs test for one outlier
##
## data: crime3
## G = 2.5646, U = 0.8471, p-value = 0.1781
## alternative hypothesis: highest value 1674 is an outlier
# That's a high-enough p-value that it's not clear the point is an outlier.
# So let's stop here, having removed the two highest points as outliers.
#
# But let's also check the lowest point.
# grubbs.test picks the most-outlying point, and it always picked
# the high ones. To get the low one, we'll use the opposite=TRUE parameter.
# The test is run on crime3, i.e. after the two highest values were removed.
test <- grubbs.test(crime3,type=10,opposite=TRUE)
test
## Grubbs test for one outlier
##
## data: crime3
## G = 1.6180, U = 0.9392, p-value = 1
## alternative hypothesis: lowest value 342 is an outlier
# The p-value rounds to 1, so the lowest-crime city does not
# seem to be an outlier.
# That's why our first test that checked both extremes returned
# a p-value of 1.
#
# Note that the result would've been the same even if we hadn't
# removed the two outliers yet.
#
# Finally, let's do one more thing that's not necessary:
# Let's see what it looks like visually.
#
# Let's make a box-and-whisker plot
############# Outlier Visualization (Not Necessary) #############
# Install ggplot2 (the same plotting package as in question 2) only when it
# is missing, then attach it. An unconditional install.packages() call would
# re-download the package on every run of the script.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)
# Summary statistics for a custom boxplot: returns the 5%, 25%, 50%, 75% and
# 95% quantiles of x as a numeric vector named ymin, lower, middle, upper
# and ymax (in that order).
quant <- function(x) {
  box_probs <- c(0.05, 0.25, 0.5, 0.75, 0.95)
  box_names <- c("ymin", "lower", "middle", "upper", "ymax")
  setNames(quantile(x, probs = box_probs), box_names)
}
# Plotting data frame: x is the constant 1 for every row of data, y is crime
df <- data.frame(
  x = rep_len(1, nrow(data)),
  y = crime
)
# Return the elements of x that lie strictly below its 5% quantile or
# strictly above its 95% quantile.
outliers <- function(x) {
  limits <- quantile(x, probs = c(0.05, 0.95))
  beyond_limits <- x < limits[[1]] | x > limits[[2]]
  x[which(beyond_limits)]
}
# Create the box-and-whisker plot.
# - The box and whiskers are drawn from the values returned by quant()
#   (ymin, lower, middle, upper, ymax), supplied through fun.data.
# - The individual points are the values returned by outliers(). It is passed
#   through `fun`, because the `fun.y` argument of stat_summary() has been
#   deprecated since ggplot2 3.3.0.
ggplot(df, aes(x, y)) +
  stat_summary(fun.data = quant, geom = "boxplot") +
  stat_summary(fun = outliers, geom = "point")
# This visualization shows us that the two cities with the
# highest amount of crime seem to be outliers.
# The two cities with the lowest amount of crime are close enough
# to not necessarily be considered outliers.