# EXAMPLES OF FUNCTIONAL DATA ANALYSIS MODULE

########################## SPLINES ##########################

### Curve fitting ###

# Question 3
# read_csv("1,2,3\n4,5,6", col_names = c("x", "y", "z"))
library(tidyverse)
coal_df <- read_csv('C:/Users/mjpearl/Desktop/omsa/ISYE-8803-OAN/hw1/P04.csv',
                    col_names = c("year", "energy"))

# Initialize variables used throughout the question
n = length(coal_df$year)
X = seq(1, length(coal_df$year))
Y = coal_df$energy

# Data generation (note: this overwrites the X and Y defined above)
X = seq(0, 1, 0.001)
Y_true = sin(2*pi*X^3)^3
Y = Y_true + 0.1*rnorm(length(X))

# Define knots and the truncated power basis for a cubic spline
k = seq(0, 1, length.out = 8)
k = k[2:7]                                 # six interior knots
h1 = rep(1, length(X))
h2 = X
h3 = X^2
h4 = X^3
h5 = (X - k[1])^3;  h5[h5 <= 0] = 0
h6 = (X - k[2])^3;  h6[h6 <= 0] = 0
h7 = (X - k[3])^3;  h7[h7 <= 0] = 0
h8 = (X - k[4])^3;  h8[h8 <= 0] = 0
h9 = (X - k[5])^3;  h9[h9 <= 0] = 0
h10 = (X - k[6])^3; h10[h10 <= 0] = 0
H = cbind(h1, h2, h3, h4, h5, h6, h7, h8, h9, h10)

# Least squares estimates
B = solve(t(H)%*%H)%*%t(H)%*%Y
plot(X, Y)
lines(X, H%*%B, col = "red", lwd = 3)
lines(X, Y_true, col = "blue", lwd = 3)

########################## B-SPLINES ##########################

### B-splines basis ###

# Required library
library(splines)

# Basis functions of degree 1 through 4 on a common knot sequence
x = seq(1, 100, length.out = 100)
par(mfrow = c(4, 1))
for (sd in 1:4) {
  knots = seq(1, 100, length.out = 10)
  B = bs(x, knots = knots, degree = sd, intercept = FALSE)
  matplot(x, B, type = "l")
}
dev.off()                                  # close the device to reset the 4x1 layout
image(c(1:12), x, t(B[, 1:11]))            # heat map of the first 11 degree-4 basis functions

### Curve fitting ###

# Generate data
n = 100
x = seq(0, 1, length.out = n)
y = 2.5*x - sin(10*x) - exp(-10*x)
sigma = 0.3
ynoise = y + rnorm(n)*sigma

# Generate B-spline basis
knots = seq(0, 1, length.out = 10)
B = bs(x, knots = knots, degree = 2, intercept = FALSE)[, 1:11]

# Least squares estimation with +/- 3 sigma bands
yhat = B%*%solve(t(B)%*%B)%*%t(B)%*%ynoise
sigma2 = (1/(n - 11))*t(ynoise - yhat)%*%(ynoise - yhat)
yn = yhat - 3*sqrt(diag(as.numeric(sigma2)*B%*%solve(t(B)%*%B)%*%t(B)))
yp = yhat + 3*sqrt(diag(as.numeric(sigma2)*B%*%solve(t(B)%*%B)%*%t(B)))
plot(x, ynoise, col = "red")
lines(x, yn, col = "blue")
lines(x, yp, col = "blue")
lines(x, yhat, col = "black")

### Fat content prediction ###

load('C:/Users/mjpearl/Desktop/omsa/ISYE-8803-OAN/hw1/meat.rdata')

# Train and test data sets
s = sample.int(214, 20)
train = meat[-s, ]
test = meat[s, ]

# Model using all 100 spectral channels as predictors
linear1 = lm(X101 ~ ., train)
pred1 = predict(linear1, test[, 1:100])
rmse1 = sqrt(sum((test[, 101] - pred1)^2)/20)

# B-splines
library(splines)
X = meat[, 1:100]
x = seq(0, 1, length.out = 100)
knots = seq(0, 1, length.out = 8)
B = bs(x, knots = knots, degree = 3)[, 1:10]
Bcoef = matrix(0, dim(X)[1], 10)
for (i in 1:dim(X)[1]) {
  Bcoef[i, ] = solve(t(B)%*%B)%*%t(B)%*%t(as.matrix(X[i, ]))
}
meat = cbind.data.frame(Bcoef, meat[, 101])
names(meat)[11] = "y"
train = meat[-s, ]
test = meat[s, ]

# Model using 10 B-spline coefficients as predictors
linear2 = lm(y ~ ., train)
pred2 = predict(linear2, test[, 1:10])
rmse2 = sqrt(sum((test[, 11] - pred2)^2)/20)

########################## Smoothing splines ##########################

### Over-fitting ###

# Generate data
n = 100
k = 40
x = seq(0, 1, length.out = n)
y = 2.5*x - sin(10*x) - exp(-10*x)
sigma = 0.3
ynoise = y + rnorm(n)*sigma

# Generate B-spline basis for natural cubic spline
# (this line was missing from the script; ns() with df = k + 1 is an assumed
# reconstruction consistent with the df = k + 1 used in smooth.spline below)
B = ns(x, df = k + 1)

# Least squares estimation (over-fits with this many basis functions)
yhat = B%*%solve(t(B)%*%B)%*%t(B)%*%ynoise
plot(x, ynoise, col = "red", lwd = 3)
lines(x, yhat, col = "black", lwd = 3)
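# A minimal sketch (not part of the original module) of the penalized least
# squares idea behind smoothing splines, reusing B and ynoise from the
# over-fitting demo above. A second-order difference penalty D'D stands in
# for the exact integrated-squared-second-derivative penalty (a P-spline
# style approximation), and lambda = 0.1 is an assumed value for illustration:
#   beta_pen = (B'B + lambda * D'D)^(-1) B'y
D = diff(diag(ncol(B)), differences = 2)   # second-difference penalty matrix
lambda = 0.1                               # assumed smoothing parameter
beta_pen = solve(t(B)%*%B + lambda*t(D)%*%D, t(B)%*%ynoise)
lines(x, B%*%beta_pen, col = "green", lwd = 2)   # visibly smoother fit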
### Smoothing penalty ###

# Search over the smoothing parameter
allspar = seq(0, 1, length.out = 1000)
p = length(allspar)
RSS = rep(0, p)
df = rep(0, p)
for (i in 1:p) {
  # spar takes precedence over df when both are supplied
  yhat = smooth.spline(ynoise, df = k + 1, spar = allspar[i])
  df[i] = yhat$df
  yhat = yhat$y
  RSS[i] = sum((yhat - ynoise)^2)
}

# GCV criterion
GCV = (RSS/n)/((1 - df/n)^2)
plot(allspar, GCV, type = "l", lwd = 3)
spar = allspar[which.min(GCV)]
points(spar, GCV[which.min(GCV)], col = "red", lwd = 5)
yhat = smooth.spline(ynoise, df = k + 1, spar = spar)
yhat = yhat$y
lines(x, yhat, col = "black", lwd = 3)

########################## Kernel smoothers ##########################

### RBF Kernel ###

# Data generation
x = c(0:100)
y = sin(x/10) + (x/50)^2 + 0.1*rnorm(length(x))
kerf = function(z) { exp(-z*z/2)/sqrt(2*pi) }   # Gaussian kernel

# Leave-one-out CV for the bandwidth h
h1 = seq(1, 4, 0.1)
er = rep(0, length(y))
mse = rep(0, length(h1))
for (j in 1:length(h1)) {
  h = h1[j]
  for (i in 1:length(y)) {
    X1 = x[-i]
    Y1 = y[-i]
    z = kerf((x[i] - X1)/h)
    yke = sum(z*Y1)/sum(z)                 # Nadaraya-Watson estimate at x[i]
    er[i] = y[i] - yke
  }
  mse[j] = sum(er^2)                       # sum of squared LOO errors
}
plot(h1, mse, type = "l")
h = h1[which.min(mse)]
points(h, mse[which.min(mse)], col = "red", lwd = 5)

### Slide 8 ###

# Interpolation for N values
N = 1000
xall = seq(min(x), max(x), length.out = N)
f = rep(0, N)
for (k in 1:N) {
  z = kerf((xall[k] - x)/h)
  f[k] = sum(z*y)/sum(z)
}
ytrue = sin(xall/10) + (xall/50)^2
plot(x, y, col = "black")
lines(xall, ytrue, col = "red")
lines(xall, f, col = "blue")

################### FUNCTIONAL PRINCIPAL COMPONENTS ###################

### Functional data classification ###

library(randomForest)

# Data generation
set.seed(123)
M = 100
n = 50
x = seq(0, 1, length = n)
E = as.matrix(dist(x, diag = TRUE, upper = TRUE))
Sigma = exp(-10*E^2)
eig = eigen(Sigma)
Sigma.sqrt = eig$vec%*%diag(sqrt(eig$val + 10^(-10)))%*%t(eig$vec)
S_noise = exp(-0.1*E^2)
eig_noise = eigen(S_noise)
S.sqrt_noise = eig_noise$vec%*%diag(sqrt(eig_noise$val + 10^(-10)))%*%t(eig_noise$vec)

# Class 1
mean1 = Sigma.sqrt%*%rnorm(n)
noise1 = S.sqrt_noise%*%rnorm(n)
signal1 = mean1 + noise1
var1 = var(signal1)
ds1 = sqrt(var1/100)
S.sqrt_err1 = diag(n)*drop(ds1)
x1 = matrix(0, M, n)
for (j in 1:M) {
  noise1 = S.sqrt_noise%*%rnorm(n)
  error1 = S.sqrt_err1%*%rnorm(n)
  x1[j, ] = mean1 + noise1 + error1
}
matplot(x, t(x1), type = "l", ylab = "y", col = "blue")

# Class 2
mean2 = Sigma.sqrt%*%rnorm(n)
noise2 = S.sqrt_noise%*%rnorm(n)
signal2 = mean2 + noise2
var2 = var(signal2)
ds2 = sqrt(var2/100)
S.sqrt_err2 = diag(n)*drop(ds2)
x2 = matrix(0, M, n)
for (j in 1:M) {
  noise2 = S.sqrt_noise%*%rnorm(n)
  error2 = S.sqrt_err2%*%rnorm(n)
  x2[j, ] = mean2 + noise2 + error2
}
matplot(x, t(x2), type = "l", ylab = "y", col = "red")

# Train and test data sets
X = rbind(x1, x2)
lab = as.factor(c(rep(0, M), rep(1, M)))
train = sample.int(2*M, floor(0.8*2*M))
labtrain = lab[train]
labtest = lab[-train]

# Option 1: B-splines
library(splines)
knots = seq(0, 1, length.out = 8)
B = bs(x, knots = knots, degree = 3)[, 1:10]
Bcoef = matrix(0, dim(X)[1], 10)
for (i in 1:dim(X)[1]) {
  Bcoef[i, ] = solve(t(B)%*%B)%*%t(B)%*%X[i, ]
}
fit = randomForest(labtrain ~ .,
                   data = cbind.data.frame(as.data.frame(Bcoef[train, ]), labtrain))
pred = predict(fit, Bcoef[-train, ])
table(labtest, pred)
Xtest = X[-train, ]
matplot(x, t(Xtest[pred == 0, ]), type = "l", col = "blue", ylab = "y", ylim = c(-4, 4),
        main = "Classification using B-spline coefficients")
X2 = Xtest[pred == 1, ]
for (i in 1:length(pred[pred == 1])) {
  lines(x, X2[i, ], col = "red")
}

# Option 2: Functional principal components
library(fda)
splinebasis = create.bspline.basis(c(0, 1), 10)
smooth = smooth.basis(x, t(X), splinebasis)
Xfun = smooth$fd
pca = pca.fd(Xfun, 10)
var.pca = cumsum(pca$varprop)
nharm = sum(var.pca < 0.95) + 1            # smallest number of harmonics reaching 95% variance
pc = pca.fd(Xfun, nharm)
# Scatter of the first two FPC scores by class
plot(pc$scores[lab == 0, 1:2], xlab = "FPC-score 1", ylab = "FPC-score 2",
     col = "blue", ylim = c(-1, 1))
points(pc$scores[lab == 1, 1:2], col = "red")
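# A minimal sketch (not part of the original module): reconstruct one curve
# from its FPC scores to see how much the nharm retained harmonics capture,
#   x_i(t) ~ mean(t) + sum_k score[i,k] * phi_k(t)
# using the pc object and the evaluation grid x defined above.
i = 1
mu = eval.fd(x, pc$meanfd)                 # mean function on the grid
Phi = eval.fd(x, pc$harmonics)             # n x nharm matrix of eigenfunctions
recon = mu + Phi%*%pc$scores[i, ]
plot(x, X[i, ], col = "grey", ylab = "y", main = "FPC reconstruction of curve 1")
lines(x, recon, lwd = 2)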
FPCcoef = pc$scores
fit = randomForest(labtrain ~ .,
                   data = cbind.data.frame(as.data.frame(FPCcoef[train, ]), labtrain))
pred = predict(fit, FPCcoef[-train, ])
table(labtest, pred)
Xtest = X[-train, ]
matplot(x, t(Xtest[pred == 0, ]), type = "l", col = "blue", ylab = "y", ylim = c(-4, 4),
        main = "Classification using FPCA scores")
X2 = Xtest[pred == 1, ]
for (i in 1:length(pred[pred == 1])) {
  lines(x, X2[i, ], col = "red")
}
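# A minimal sketch (hypothetical helper, not part of the original module):
# collapse a confusion table into a single test accuracy, handy for comparing
# the B-spline-coefficient and FPCA-score classifiers side by side.
accuracy = function(tab) sum(diag(tab))/sum(tab)
accuracy(table(labtest, pred))             # accuracy of the FPCA-score model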