---
title: "exam1_Q1"
author: "Mark Pearl"
date: "6/28/2021"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

# Exam 1

## Q1 Functional Data Analysis

```{r lib-and-data-imports}
# Import the required libraries
library(splines)
library(tidyverse)
library(R.matlab)

# Read the NIR spectra and the SOM response from the .mat file
nir_mat <- readMat('./Question1-2.mat')
y_df <- data.frame(SOM = nir_mat$SOM)
nir_df <- data.frame(NIR = nir_mat$NIR)

# Train/test split: first 80 spectra for training, rows 81-108 held out
# (so the two sets do not overlap)
nir_train <- nir_df[1:80, ]
nir_test <- nir_df[81:108, ]
y_train <- y_df[1:80, ]
y_test <- y_df[81:108, ]

# Mean spectrum of the training curves
nir_mean_df <- data.frame(mean = colMeans(nir_train))
```
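
As a quick sanity check on the import, the dimensions can be printed; the values noted in the comment are inferred from the rest of the document (108 spectra measured at 1050 wavelengths, one SOM value per spectrum):

```{r data-dims}
# Expect 108 x 1050 for the spectra and 108 x 1 for the response (inferred)
dim(nir_df)
dim(y_df)
```
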

## 1a

Fit a smoothing spline to the mean NIR training curve and use the GCV criterion to select the smoothing parameter.

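For reference, the quantity minimised in the chunk below is the usual GCV score, where $\mathrm{df}$ is the effective degrees of freedom of the smoother at a given value of `spar`:

$$\mathrm{GCV} = \frac{\tfrac{1}{n}\sum_{i=1}^{n}\bigl(y_i - \hat y_i\bigr)^2}{\bigl(1 - \mathrm{df}/n\bigr)^2}$$
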
```{r 1a-smoothing-splines-gcv-mean}
# Smooth the mean training curve over a grid of smoothing parameters (spar)
# and pick the value that minimises the GCV criterion
y <- nir_mean_df$mean
x <- seq_along(y)
k <- 15
n <- length(y)

# Candidate smoothing parameters
allspar = seq(0, 1, length.out = 1000)
p = length(allspar)
RSS = rep(0, p)
df = rep(0, p)
for (i in 1:p)
{
  fit = smooth.spline(y, df = k + 1, spar = allspar[i])
  df[i] = fit$df
  RSS[i] = sum((fit$y - y)^2)
}

# GCV criterion
GCV = (RSS / n) / ((1 - df / n)^2)
par(mfrow = c(1, 2))
plot(allspar, GCV, type = "l", lwd = 3)
spar = allspar[which.min(GCV)]
points(spar, GCV[which.min(GCV)], col = "red", lwd = 5)

# Refit with the GCV-optimal spar and overlay the fit on the mean curve
S = smooth.spline(y, df = k + 1, spar = spar)
yhat = S$y

plot(y, type = "l", lwd = 4)
lines(x, yhat, col = "red", lwd = 1)
```

The GCV-optimal smoothing parameter is spar = 0.1171171.
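
Note that `spar` is `smooth.spline`'s rescaled smoothing parameter rather than the penalty $\lambda$ itself; if the raw penalty is of interest, it can be read off the fitted object (a quick check, assuming the chunk above has been run):

```{r spar-vs-lambda}
# Raw penalty lambda corresponding to the selected spar
S$lambda
```
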

## 1b

Smooth each training spectrum with the selected smoothing parameter and use the resulting spline coefficients as features in a ridge regression for SOM.

```{r 1b}
library(glmnet)

# Smooth each spectrum (row) with the GCV-selected spar; the knot set depends
# only on the number of wavelengths, so every coefficient vector has the same length
basis_train <- t(apply(as.matrix(nir_train), 1, function(curve) smooth.spline(curve, df = k + 1, spar = spar)$fit$coef))
basis_test <- t(apply(as.matrix(nir_test), 1, function(curve) smooth.spline(curve, df = k + 1, spar = spar)$fit$coef))

# Ridge regression (alpha = 0) of SOM on the coefficients, penalty chosen by CV,
# then predict the held-out spectra
ridge <- cv.glmnet(x = basis_train, y = y_train, alpha = 0)
pred <- predict(ridge, newx = basis_test, s = "lambda.min")
pred
```
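
To gauge how well the ridge fit generalises, the predictions can be compared to the held-out SOM values (a minimal sketch, assuming the chunk above has been run):

```{r 1b-test-rmse}
# Root-mean-squared error on the 28 held-out spectra
sqrt(mean((as.numeric(pred) - y_test)^2))
```
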

Finally, represent all 108 spectra in a B-spline basis and apply functional PCA.

```{r 2}
library(fda)

# Represent all 108 spectra with a 28-function B-spline basis on [0, 1]
splinebasis = create.bspline.basis(c(0,1), 28)
x = seq(0, 1, length = 1050)

# smooth.basis expects one column per curve, so transpose the NIR data
smooth = smooth.basis(x, t(nir_df), splinebasis)
plot(smooth)
```
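
As a check on the functional representation, the coefficient matrix of the smoothed curves should have one row per basis function and one column per curve (assuming the chunk above has been run):

```{r basis-coef-dims}
# Expect 28 basis coefficients x 108 curves
dim(smooth$fd$coefs)
```
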

```{r}
# Functional PCA on the smoothed curves
Xfun = smooth$fd
pca = pca.fd(Xfun, 10)

# Cumulative proportion of variance and the number of harmonics needed for 95%
var.pca = cumsum(pca$varprop)
nharm = sum(var.pca < 0.95) + 1
pc = pca.fd(Xfun, nharm)

# Scree plot of the proportion of variance per component
plot(pca$varprop, type = 'l')
```

Based on the results, 99% of the variation is explained by the first five functional principal components.
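
This can be checked directly from the cumulative proportions of variance (assuming the chunk above has been run):

```{r cumulative-varprop}
# Cumulative proportion of variance explained by the leading components
round(cumsum(pca$varprop), 4)
```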