---
title: "exam1_Q1"
author: "Mark Pearl"
date: "6/28/2021"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

# Exam 1

## Q1 Functional Data Analysis

```{r lib-and-data-imports}
# Import the required libraries
library(splines)
library(tidyverse)
library(R.matlab)

# Load the NIR spectra and the SOM response
nir_mat <- readMat('./Question1-2.mat')
y_df <- data.frame(SOM = nir_mat$SOM)
nir_df <- data.frame(NIR = nir_mat$NIR)

# Train/test split: first 80 samples for training, remaining 28 for testing
nir_train <- nir_df[1:80, ]
nir_test <- nir_df[81:108, ]
y_train <- y_df[1:80, ]
y_test <- y_df[81:108, ]

# Pointwise mean of the training spectra
nir_mean_df <- data.frame(mean = colMeans(nir_train))
```

## 1a

We smooth the mean NIR curve of the training set with a smoothing spline, selecting the smoothing parameter by generalized cross-validation (GCV).

```{r 1a-smoothing-splines-gcv}
# Smooth the training-set mean curve and select spar by GCV
y <- nir_mean_df$mean
n <- length(y)
x <- seq_len(n)

# Grid of candidate smoothing parameters (spar)
allspar <- seq(0, 1, length.out = 1000)
p <- length(allspar)
RSS <- rep(0, p)
df <- rep(0, p)
for (i in 1:p) {
  fit <- smooth.spline(x, y, spar = allspar[i])
  df[i] <- fit$df
  RSS[i] <- sum((fit$y - y)^2)
}

# GCV criterion
GCV <- (RSS / n) / ((1 - df / n)^2)
par(mfrow = c(1, 2))
plot(allspar, GCV, type = "l", lwd = 3)
spar <- allspar[which.min(GCV)]
points(spar, GCV[which.min(GCV)], col = "red", lwd = 5)

# Refit at the GCV-optimal spar and overlay the fit on the raw mean curve
S <- smooth.spline(x, y, spar = spar)
yhat <- S$y
plot(x, y, type = "l", lwd = 4)
lines(x, yhat, col = "red", lwd = 1)
```

Each candidate fit is scored with the GCV criterion

$$\mathrm{GCV} = \frac{\mathrm{RSS}/n}{\left(1 - \mathrm{df}/n\right)^{2}},$$

and the minimizer is kept. The optimal smoothing parameter (`spar`) is 0.1171171.

## 1b

Each training spectrum is represented by its smoothing-spline coefficients (reusing the `spar` selected in 1a), and SOM is regressed on these features with ridge regression.

```{r 1b}
library(glmnet)

# Smoothing-spline coefficients of one curve, fitted at the GCV-optimal spar
get_coefs <- function(curve) smooth.spline(as.numeric(curve), spar = spar)$fit$coef

# One row of coefficients per training spectrum
basis_matrix <- t(sapply(1:80, function(i) get_coefs(nir_train[i, ])))

# Ridge regression (alpha = 0) of SOM on the coefficients, lambda chosen by CV
ridge <- cv.glmnet(x = basis_matrix, y = y_train, alpha = 0)

# Apply the same representation to the test spectra before predicting
basis_test <- t(sapply(1:28, function(i) get_coefs(nir_test[i, ])))
predict(ridge, newx = basis_test, s = "lambda.min")
```

## 2

All 108 spectra are smoothed with a 28-function B-spline basis, and functional PCA is applied to the smoothed curves.

```{r 2}
library(fda)

# Project every spectrum onto a B-spline basis with 28 functions on [0, 1]
splinebasis <- create.bspline.basis(c(0, 1), nbasis = 28)
x <- seq(0, 1, length.out = 1050)
nir_smooth <- smooth.basis(x, t(as.matrix(nir_df)), splinebasis)
plot(nir_smooth$fd)
```

```{r fpca}
# Functional PCA on the smoothed spectra
Xfun <- nir_smooth$fd
pca <- pca.fd(Xfun, nharm = 10)

# Smallest number of harmonics explaining at least 95% of the variance
var.pca <- cumsum(pca$varprop)
nharm <- sum(var.pca < 0.95) + 1
pc <- pca.fd(Xfun, nharm)
plot(pca$varprop, type = "l")
```

Based on the results, we can see that 99% of the variation is explained by the first 5 principal components.
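As a quick illustration of how these components could be used downstream (this sketch is my addition, not part of the original solution; the chunk name and the `lm` fit are assumptions), SOM can be regressed on the retained PC scores:

```{r pc-scores-regression}
# Sketch: regress SOM on the functional PC scores retained above.
# Assumes `pc` and `y_df` from the chunks above are in scope.
scores_df <- data.frame(SOM = y_df$SOM, scores = pc$scores)
pcr_fit <- lm(SOM ~ ., data = scores_df)
summary(pcr_fit)
```

A fit here comparable to the raw-coefficient ridge model in 1b would suggest the low-dimensional functional representation retains most of the predictive signal.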