# EXAMPLES OF FUNCTIONAL DATA ANALYSIS MODULE


########################## SPLINES ##########################

### Curve fitting ###

# Question 3

# read_csv("1,2,3\n4,5,6", col_names = c("x", "y", "z"))
library(tidyverse)
coal_df <- read_csv('C:/Users/mjpearl/Desktop/omsa/ISYE-8803-OAN/hw1/P04.csv', col_names = c("year","energy"))

# Initialize variables used throughout the question
n = length(coal_df$year)
X = seq(1, length(coal_df$year))
Y = coal_df$energy

# Data generation (note: this overwrites the X and Y defined above with simulated data)
X = seq(0, 1, 0.001)
Y_true = sin(2*pi*X^3)^3
Y = Y_true + 0.1*rnorm(length(X))

# Define interior knots and a truncated power basis for a cubic spline:
# a global cubic (h1-h4) plus one truncated cubic term per knot (h5-h10)
k = seq(0, 1, length.out = 8)
k = k[2:7]   # keep the 6 interior knots
h1 = rep(1, length(X))
h2 = X
h3 = X^2
h4 = X^3
h5 = (X - k[1])^3; h5[h5 <= 0] = 0
h6 = (X - k[2])^3; h6[h6 <= 0] = 0
h7 = (X - k[3])^3; h7[h7 <= 0] = 0
h8 = (X - k[4])^3; h8[h8 <= 0] = 0
h9 = (X - k[5])^3; h9[h9 <= 0] = 0
h10 = (X - k[6])^3; h10[h10 <= 0] = 0
H = cbind(h1, h2, h3, h4, h5, h6, h7, h8, h9, h10)

# Least squares estimates
B = solve(t(H)%*%H)%*%t(H)%*%Y
plot(X, Y)
lines(X, H%*%B, col = "red", lwd = 3)
lines(X, Y_true, col = "blue", lwd = 3)
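
# The ten hand-built columns above can be generated more compactly; a minimal
# equivalent sketch (not part of the original script), reusing X and k:
pos3 = function(u) pmax(u, 0)^3                                 # truncated cubic
H2 = cbind(1, X, X^2, X^3, sapply(k, function(kj) pos3(X - kj)))
all.equal(unname(H), unname(H2))                                # TRUE: same design matrix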

########################## B-SPLINES ##########################

### B-splines basis ###

# Required library
library(splines)

# Basis: plot B-spline bases of degree 1 through 4 on the same knot sequence
x = seq(1, 100, length.out = 100)
par(mfrow = c(4,1))
for(deg in 1:4)
{
  knots = seq(1, 100, length.out = 10)
  B = bs(x, knots = knots, degree = deg, intercept = FALSE)
  matplot(x, B, type = "l")
}
dev.off()
image(c(1:12), x, t(B[,1:11]))
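
# With intercept = FALSE, bs() returns length(knots) + degree columns, which
# is why the degree-4 basis above has 10 + 4 = 14 of them. A quick check
# (illustrative addition, not in the original script):
ncol(bs(x, knots = seq(1, 100, length.out = 10), degree = 3, intercept = FALSE))   # 13 = 10 + 3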

### Curve fitting ###

# Generate data:
n = 100
x = seq(0, 1, length.out = n)
y = 2.5*x - sin(10*x) - exp(-10*x)
sigma = 0.3
ynoise = y + rnorm(n)*sigma

# Generate B-spline basis:
knots = seq(0, 1, length.out = 10)
B = bs(x, knots = knots, degree = 2, intercept = FALSE)[,1:11]

# Least squares estimation with approximate +/- 3 standard-error bands
yhat = B%*%solve(t(B)%*%B)%*%t(B)%*%ynoise
sigma2 = (1/(n - 11))*t(ynoise - yhat)%*%(ynoise - yhat)
yn = yhat - 3*sqrt(diag(as.numeric(sigma2)*B%*%solve(t(B)%*%B)%*%t(B)))
yp = yhat + 3*sqrt(diag(as.numeric(sigma2)*B%*%solve(t(B)%*%B)%*%t(B)))
plot(x, ynoise, col = "red")
lines(x, yn, col = "blue")
lines(x, yp, col = "blue")
lines(x, yhat, col = "black")
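
# The same projection can be fit with lm(), which also reports standard
# errors directly; a minimal sketch (not part of the original script),
# reusing the basis B and response ynoise from above:
fit_lm = lm(ynoise ~ B - 1)                      # no intercept: project onto the 11 basis columns
all.equal(unname(fitted(fit_lm)), drop(yhat))    # TRUE: identical fitted values
summary(fit_lm)$sigma^2                          # matches sigma2 above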

### Fat content prediction ###

load('C:/Users/mjpearl/Desktop/omsa/ISYE-8803-OAN/hw1/meat.rdata')
# Train and test data sets
s = sample.int(214, 20)
train = meat[-s,]
test = meat[s,]
# Model using all 100 spectral channels as predictors
linear1 = lm(X101~., train)
pred1 = predict(linear1, test[,1:100])
mse1 = sum((test[,101] - pred1)^2)/20   # mean squared prediction error

# B-splines: represent each 100-point spectrum by 10 basis coefficients
library(splines)
X = meat[,1:100]
x = seq(0, 1, length.out = 100)
knots = seq(0, 1, length.out = 8)
B = bs(x, knots = knots, degree = 3)[,1:10]
Bcoef = matrix(0, dim(X)[1], 10)
for(i in 1:dim(X)[1])
{
  Bcoef[i,] = solve(t(B)%*%B)%*%t(B)%*%t(as.matrix(X[i,]))
}
meat = cbind.data.frame(Bcoef, meat[,101])
names(meat)[11] = "y"
train = meat[-s,]
test = meat[s,]
# Model using the 10 B-spline coefficients as predictors
linear2 = lm(y~., train)
pred2 = predict(linear2, test[,1:10])
mse2 = sum((test[,11] - pred2)^2)/20    # compare with mse1
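
# Sanity check (illustrative addition, not in the original script): overlay
# one spectrum with its 10-coefficient B-spline reconstruction to see how
# much detail the compressed representation keeps. The index i = 1 is an
# arbitrary choice.
i = 1
plot(x, unlist(X[i,]), type = "l", ylab = "spectrum")
lines(x, B%*%Bcoef[i,], col = "red", lwd = 2)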

########################## Smoothing splines ##########################

### Over-fitting ###

# Generate data:
n = 100
k = 40
x = seq(0, 1, length.out = n)
y = 2.5*x - sin(10*x) - exp(-10*x)
sigma = 0.3
ynoise = y + rnorm(n)*sigma
# Generate basis for a natural cubic spline with many degrees of freedom.
# (The basis-construction code was missing here; splines::ns with df = k is
# one plausible reconstruction that produces the intended over-fit.)
library(splines)
B = ns(x, df = k, intercept = TRUE)

# Least squares estimation: with k = 40 basis functions the fit chases the noise
yhat = B%*%solve(t(B)%*%B)%*%t(B)%*%ynoise
plot(x, ynoise, col = "red", lwd = 3)
lines(x, yhat, col = "black", lwd = 3)

### Smoothing penalty ###

# Grid search over the smoothing parameter spar; for each value record the
# residual sum of squares and the effective degrees of freedom
# (when spar is supplied, smooth.spline ignores the df argument)
allspar = seq(0, 1, length.out = 1000)
p = length(allspar)
RSS = rep(0, p)
df = rep(0, p)
for(i in 1:p)
{
  yhat = smooth.spline(ynoise, df = k + 1, spar = allspar[i])
  df[i] = yhat$df
  yhat = yhat$y
  RSS[i] = sum((yhat - ynoise)^2)
}
# GCV criterion
GCV = (RSS/n)/((1 - df/n)^2)
plot(allspar, GCV, type = "l", lwd = 3)
spar = allspar[which.min(GCV)]
points(spar, GCV[which.min(GCV)], col = "red", lwd = 5)
yhat = smooth.spline(ynoise, df = k + 1, spar = spar)
yhat = yhat$y
lines(x, yhat, col = "black", lwd = 3)
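
# smooth.spline can select the penalty itself: with cv = FALSE it minimizes
# the same GCV criterion internally. A minimal sketch (not part of the
# original script); the selected spar is on a comparable standardized scale:
fit_gcv = smooth.spline(x, ynoise, cv = FALSE)   # cv = FALSE -> generalized CV
fit_gcv$spar                                     # compare with the grid-search spar
plot(x, ynoise, col = "red")
lines(fit_gcv, col = "black", lwd = 3)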

########################## Kernel smoothers ##########################

### RBF kernel ###

# Data generation
x = c(0:100)
y = sin(x/10) + (x/50)^2 + 0.1*rnorm(length(x))
# Gaussian (RBF) kernel
kerf = function(z){ exp(-z*z/2)/sqrt(2*pi) }

# Leave-one-out CV over a grid of bandwidths h
h1 = seq(1, 4, 0.1)
er = rep(0, length(y))
mse = rep(0, length(h1))
for(j in 1:length(h1))
{
  h = h1[j]
  for(i in 1:length(y))
  {
    # Hold out the i-th point and predict it from the rest
    X1 = x[-i]
    Y1 = y[-i]
    z = kerf((x[i] - X1)/h)
    yke = sum(z*Y1)/sum(z)   # Nadaraya-Watson estimate at x[i]
    er[i] = y[i] - yke
  }
  mse[j] = sum(er^2)
}
plot(h1, mse, type = "l")
h = h1[which.min(mse)]
points(h, mse[which.min(mse)], col = "red", lwd = 5)

### Slide 8: fitted curve on a fine grid ###

# Evaluate the kernel smoother at N new points using the CV-chosen bandwidth
N = 1000
xall = seq(min(x), max(x), length.out = N)
f = rep(0, N)
for(k in 1:N)
{
  z = kerf((xall[k] - x)/h)
  f[k] = sum(z*y)/sum(z)
}
ytrue = sin(xall/10) + (xall/50)^2
plot(x, y, col = "black")
lines(xall, ytrue, col = "red")
lines(xall, f, col = "blue")
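
# stats::ksmooth implements the same Nadaraya-Watson estimator; a minimal
# sketch (not part of the original script). ksmooth scales its "normal"
# kernel so the quartiles sit at +/- 0.25*bandwidth, hence the conversion
# from the standard-deviation bandwidth h used above:
ks = ksmooth(x, y, kernel = "normal", bandwidth = 4*qnorm(0.75)*h, x.points = xall)
lines(ks, col = "green")   # should track the blue curve closely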

################### FUNCTIONAL PRINCIPAL COMPONENTS ###################

### Functional data classification ###

library(randomForest)

# Data generation: two classes of curves, each a smooth random mean curve
# plus smooth correlated noise plus iid measurement error
set.seed(123)
M = 100   # curves per class
n = 50    # points per curve
x = seq(0, 1, length = n)
E = as.matrix(dist(x, diag = T, upper = T))
Sigma = exp(-10*E^2)                  # covariance generating the mean curves
eig = eigen(Sigma)
Sigma.sqrt = eig$vec%*%diag(sqrt(eig$val + 10^(-10)))%*%t(eig$vec)
S_noise = exp(-0.1*E^2)               # covariance of the smooth noise
eig_noise = eigen(S_noise)
S.sqrt_noise = eig_noise$vec%*%diag(sqrt(eig_noise$val + 10^(-10)))%*%t(eig_noise$vec)

# Class 1
mean1 = Sigma.sqrt%*%rnorm(n)
noise1 = S.sqrt_noise%*%rnorm(n)
signal1 = mean1 + noise1
var1 = var(signal1)
ds1 = sqrt(var1/100)
S.sqrt_err1 = diag(n)*drop(ds1)
x1 = matrix(0, M, n)
for(j in 1:M)
{
  noise1 = S.sqrt_noise%*%rnorm(n)
  error1 = S.sqrt_err1%*%rnorm(n)
  x1[j,] = mean1 + noise1 + error1
}
matplot(x, t(x1), type = "l", ylab = "y", col = "blue")

# Class 2
mean2 = Sigma.sqrt%*%rnorm(n)
noise2 = S.sqrt_noise%*%rnorm(n)
signal2 = mean2 + noise2
var2 = var(signal2)
ds2 = sqrt(var2/100)
S.sqrt_err2 = diag(n)*drop(ds2)
x2 = matrix(0, M, n)
for(j in 1:M)
{
  noise2 = S.sqrt_noise%*%rnorm(n)
  error2 = S.sqrt_err2%*%rnorm(n)
  x2[j,] = mean2 + noise2 + error2
}
matplot(x, t(x2), type = "l", ylab = "y", col = "red")

# Train and test data sets
X = rbind(x1, x2)
lab = as.factor(c(rep(0,M), rep(1,M)))
train = sample.int(2*M, floor(0.8*2*M))
labtrain = lab[train]
labtest = lab[-train]

# Option 1: B-splines
library(splines)
knots = seq(0, 1, length.out = 8)
B = bs(x, knots = knots, degree = 3)[,1:10]
Bcoef = matrix(0, dim(X)[1], 10)
for(i in 1:dim(X)[1])
{
  Bcoef[i,] = solve(t(B)%*%B)%*%t(B)%*%X[i,]
}
fit = randomForest(labtrain ~ .,
                   data = cbind.data.frame(as.data.frame(Bcoef[train,]), labtrain))
pred = predict(fit, Bcoef[-train,])
table(labtest, pred)
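
# Overall test accuracy from the confusion table (illustrative addition,
# not in the original script):
mean(pred == labtest)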

Xtest = X[-train,]
matplot(x, t(Xtest[pred==0,]), type = "l", col = "blue", ylab = "y", ylim = c(-4,4),
        main = "Classification using B-spline coefficients")
X2 = Xtest[pred == 1,]
for(i in 1:length(pred[pred==1]))
{
  lines(x, X2[i,], col = "red")
}

# Option 2: Functional principal components
library(fda)
splinebasis = create.bspline.basis(c(0,1), 10)
smooth = smooth.basis(x, t(X), splinebasis)
Xfun = smooth$fd
# Keep enough components to explain 95% of the variance
pca = pca.fd(Xfun, 10)
var.pca = cumsum(pca$varprop)
nharm = sum(var.pca < 0.95) + 1
pc = pca.fd(Xfun, nharm)
plot(pc$scores[lab==0,], xlab = "FPC-score 1", ylab = "FPC-score 2", col = "blue", ylim = c(-1,1))
points(pc$scores[lab==1,], col = "red")
FPCcoef = pc$scores
fit = randomForest(labtrain ~ .,
                   data = cbind.data.frame(as.data.frame(FPCcoef[train,]), labtrain))
pred = predict(fit, FPCcoef[-train,])
table(labtest, pred)
Xtest = X[-train,]
matplot(x, t(Xtest[pred==0,]), type = "l", col = "blue", ylab = "y", ylim = c(-4,4),
        main = "Classification using FPCA scores")
X2 = Xtest[pred == 1,]
for(i in 1:length(pred[pred==1]))
{
  lines(x, X2[i,], col = "red")
}
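
# The retained harmonics can be inspected with the plot method for pca.fd
# objects, which draws the mean curve perturbed by each component; a minimal
# sketch (not part of the original script):
plot(pc)   # one panel per retained harmonic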