Add files via upload
lopesoll committed Mar 6, 2023
1 parent 7a7c542 commit 4747aeceb72cf03df70179fc4f03c0d0b062be7a
Showing 8 changed files with 7,453 additions and 0 deletions.
@@ -0,0 +1,66 @@
#time series plots
plot(Dataframe$time, Dataframe$x1, type = "n", xlim = c(30, 50), ylim = c(-5, 4), xlab="Time", ylab="EEG Signals")
lines(Dataframe$time, Dataframe$x1, col="blue", lwd=1)
lines(Dataframe$time, Dataframe$x2, col="red", lwd=1)

plot(Dataframe$time, Dataframe$x1, type = "n", xlim = c(50, 54), ylim = c(-5, 3.5), xlab="Time", ylab="EEG Signals")
lines(Dataframe$time, Dataframe$x1, col="blue", lwd=2)
lines(Dataframe$time, Dataframe$x2, col="red", lwd=2)
lines(Dataframe$time, Dataframe$y, col="purple", lwd=1)


plot(Dataframe$time, Dataframe$y, type = "l", xlim = c(30, 50), ylim = c(-50, 50), col="purple", xlab="Time", ylab="Audio input")

#distribution of the signals
hist(Dataframe$x1, col="blue", main = "Distribution of X1", xlab="", ylab="Frequency")
hist(Dataframe$x2, col="red", main = "Distribution of X2", xlab="", ylab="Frequency")
hist(Dataframe$y, col="purple", main = "Distribution of Y", xlab="", ylab="Frequency")


#variance
print(var(Dataframe$x1))
print(var(Dataframe$x2))
print(var(Dataframe$y))
#mean
print(mean(Dataframe$x1))
print(mean(Dataframe$x2))
print(mean(Dataframe$y))
#standard deviation
print(sd(Dataframe$x1))
print(sd(Dataframe$x2))
print(sd(Dataframe$y))
#correlation
print(cor(Dataframe$x1, Dataframe$y))
print(cor(Dataframe$x2, Dataframe$y))
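
#equivalent one-call alternative (an optional sketch, assuming the columns are named x1, x2 and y):
#sapply() computes the same variance/mean/sd summary for all three signals at once
print(sapply(Dataframe[, c("x1", "x2", "y")], function(v) c(var = var(v), mean = mean(v), sd = sd(v))))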

#scatter plot with linear model
plot(Dataframe$x1, Dataframe$y, xlab = "X1", ylab = "Y")#positive correlation
abline(lm(Dataframe$y ~ Dataframe$x1), col = "red")

plot(Dataframe$x2, Dataframe$y, xlab = "X2", ylab = "Y") #no correlation
abline(lm(Dataframe$y ~ Dataframe$x2), col = "blue")
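
#optional: summary(lm(...))$r.squared quantifies the strength of each linear fit,
#which the scatter plots above only show qualitatively (a sketch on the same data)
print(summary(lm(Dataframe$y ~ Dataframe$x1))$r.squared)
print(summary(lm(Dataframe$y ~ Dataframe$x2))$r.squared)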

#box plots
library(ggplot2)
#name the columns explicitly so the aes() mappings below can refer to x1, x2 and y
df = data.frame(x1 = Dataframe$x1, x2 = Dataframe$x2, y = Dataframe$y)

# Plot the box plot
ggplot(data = df, aes(x = "x1", y = x1)) +
geom_boxplot() +
xlab("EEG Signal") +
ylab("Value") +
ggtitle("Boxplot of EEG Signal x1")

ggplot(data = df, aes(x = "x2", y = x2)) +
geom_boxplot() +
xlab("EEG Signal") +
ylab("Value") +
ggtitle("Boxplot of EEG Signal x2")

ggplot(data = df, aes(x = "y", y = Dataframe.y)) +
geom_boxplot() +
xlab("EEG Signal") +
ylab("Value") +
ggtitle("Boxplot of Sound y")
@@ -0,0 +1,53 @@
set.seed(123)
#split the data: 70% for training and 30% for testing
data = sort(sample(nrow(Dataframe), nrow(Dataframe) * 0.7))
train_data = Dataframe[data, ]
test_data = Dataframe[-data, ]

#task 2.7.1
#create the theta bias (intercept) column: a 1 for every training observation
const = rep(1, times=length(train_data$x1))

#create the theta bias (intercept) column for the testing data
const_test = rep(1, times=length(test_data$x1))

#create the model
model3=cbind((train_data$x1^3), train_data$x2, train_data$x1, const)
model3_test_data=cbind((test_data$x1^3), test_data$x2, test_data$x1, const_test)

#estimate model parameters using least squares (training data)
thetahat=solve(t(model3) %*% model3) %*% t(model3) %*% train_data$y
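
#optional sanity check (a sketch, assuming the same training split): the
#closed-form estimate above should match R's built-in least-squares fit;
#note that lm() lists the intercept first, so the coefficient order differs
fit_check = lm(y ~ I(x1^3) + x2 + x1, data = train_data)
print(coef(fit_check)) # compare against thetahat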

#compute model prediction using test data
y_pred=model3_test_data %*% thetahat

#build the 95% confidence intervals for the predictions
residuals = test_data$y - y_pred
rss = sum(residuals^2)
n = length(test_data$y)
#calculate variance of the residuals
sigma_squared = rss/(n-1)
#compute the estimated covariance matrix of the model parameters
cov_thetaHat = sigma_squared * (solve(t(model3_test_data) %*% model3_test_data))

var_y_hat = matrix(0, n, 1)

#compute the variance of y_pred
for (i in 1:n) {
  X_i = matrix(model3_test_data[i, ], 1, 4) # X[i,] drops to a vector; convert it back to a 1x4 matrix
  var_y_hat[i, 1] = X_i %*% cov_thetaHat %*% t(X_i)
}
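
#equivalent vectorised form (optional): the loop computes the diagonal of
#X %*% cov_thetaHat %*% t(X); rowSums avoids forming the full n x n matrix
var_y_hat_vec = rowSums((model3_test_data %*% cov_thetaHat) * model3_test_data)
stopifnot(all.equal(as.numeric(var_y_hat), var_y_hat_vec))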

#gather the variables needed for the ordered plot
CI = 2 * sqrt(var_y_hat) # approximate 95% confidence interval half-width (z of 1.96, rounded to 2)
y = test_data$y
time = test_data$time

#create dataframe and order it to plot
data = data.frame(y, y_pred, CI, time)
data = data[order(data$time), ]

#plot prediction, testing data, and confidence intervals
plot(data$time, data$y_pred, type = "o", xlim = c(41, 47), ylim = c(-1, 1), col="steelblue1", xlab="Time", ylab="Y", lwd=2)
points(data$time, data$y, col="forestgreen", lwd=2)
segments(data$time, data$y_pred - data$CI, data$time, data$y_pred + data$CI, col="red1", lwd=3)
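
#optional: label the three layers of the plot above with a legend
legend("topright", legend = c("Prediction", "Testing data", "95% CI"),
       col = c("steelblue1", "forestgreen", "red1"),
       lty = c(1, NA, 1), pch = c(1, 1, NA), lwd = c(2, 2, 3))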
@@ -0,0 +1,63 @@
set.seed(123)

#response vector and the theta bias (intercept) column
y = Dataframe$y
constant = rep(1, times = length(y))

#split X into X1 and X2
X1=Dataframe[,1]
X2=Dataframe[,2]

#create the models
model1=cbind((X1^3), (X2^5), constant)
model2=cbind((X1^4), (X2^2), constant)
model3=cbind((X1^3), X2, X1, constant)
model4=cbind(X1, (X1^2),(X1^3), (X2^3), constant)
model5=cbind((X1^3), (X1^4), X2, constant)

for (i in 1:5) {
  #get the model for every index: (model + i)
  var_name = paste0("model", i)
  my_model = get(var_name)

  #estimate model parameters using least squares (task 2.1)
  thetahat = solve(t(my_model) %*% my_model) %*% t(my_model) %*% y
  #print(thetahat) #print thetahat to include in the report

  #compute the model's predictions
  y_pred = my_model %*% thetahat

  #compute the model residuals (errors) and the residual sum of squares (task 2.2)
  residuals = y - y_pred
  rss = sum(residuals^2)
  print(paste0("--------------------- Model ", i, " -----------------------"))
  print(paste0("RSS for model ", i, ": ", rss))

  #compute the log-likelihood function (task 2.3)
  n = length(y)
  sigma_squared = rss / (n - 1)
  loglikelihood = -(n/2) * log(2*pi) - (n/2) * log(sigma_squared) - (1/(2*sigma_squared)) * rss
  print(paste0("log likelihood for model ", i, ": ", loglikelihood))

  #define k, the number of estimated parameters in each model, for AIC and BIC:
  #k is 4 in models 1 and 2, 5 in models 3 and 5, and 6 in model 4
  if (i == 1 || i == 2) {
    k = 4
  } else if (i == 4) {
    k = 6
  } else {
    k = 5
  }
  #compute AIC (task 2.4)
  AIC = 2 * k - 2 * loglikelihood
  print(paste0("AIC for model ", i, ": ", AIC, " info: k is equal to: ", k))

  #compute BIC
  BIC = log(n) * k - 2 * loglikelihood
  print(paste0("BIC for model ", i, ": ", BIC))

  #check whether the residuals are close to normal/Gaussian (task 2.5)
  qqnorm(residuals, main = paste("Q-Q Plot for Model", i))
  qqline(residuals)
  hist(residuals, col="limegreen", main = paste("Distribution of the residuals for model", i), xlab="Model prediction errors", ylab="Frequency")
  print(shapiro.test(residuals))
}
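
#optional cross-check (a sketch for model 3 only): R's built-in AIC/BIC on an
#lm fit should land close to the hand-computed values above; small differences
#are expected because logLik.lm uses the ML variance estimate rss/n rather than
#the rss/(n-1) used here
fit3 = lm(y ~ I(X1^3) + X2 + X1)
print(AIC(fit3))
print(BIC(fit3))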
@@ -0,0 +1,58 @@
set.seed(123)
y = Dataframe$y
constant = rep(1, times = length(y))
n = length(y)
X1 = Dataframe$x1
X2 = Dataframe$x2

model_matrix = cbind((X1^3), X2, X1, constant)

# Use a Uniform distribution as prior for these two parameters, centred on their least-squares estimates with width +-1
prior1 = runif(10000, 4.181390 - 1, 4.181390 + 1) # with the width set to 1, accepted theta3 values spanned roughly +-0.57 around the estimate
prior2 = runif(10000, -6.651400 - 1, -6.651400 + 1)

# Create empty array to store parameters
accepted_params = matrix(nrow = 0, ncol = 3)

num_iter = 100000
tolerance = 0.018

for (i in 1:num_iter) {
  # Draw parameters from the priors
  theta3 = sample(prior1, 1)
  thetaBias = sample(prior2, 1)

  # Fix the other two parameters at their least-squares estimates
  thetahat = c(2.715713, -3.151350, theta3, thetaBias)
  theta = matrix(thetahat)

  # Compute simulated data using the model
  y_sim = model_matrix %*% theta

  # Compute the distance between simulated and observed data
  dist = (sqrt(sum((y_sim - y)^2))) / n

  #print(dist) # uncomment to inspect distances when tuning the tolerance
  # Accept or reject the parameter values based on the distance
  if (dist < tolerance) {
    accepted_params = rbind(accepted_params, c(theta3, thetaBias, dist))
  }
}
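
# Optional diagnostic: report the acceptance rate; a very low rate suggests
# widening the tolerance or priors, a very high one suggests tightening them
print(paste0("Accepted ", nrow(accepted_params), " of ", num_iter, " draws (",
             round(100 * nrow(accepted_params) / num_iter, 2), "%)"))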

# Plot joint posterior distribution
library(ggplot2)

# Extract accepted parameters
theta3_posterior = accepted_params[, 1]
thetaBias_posterior = accepted_params[, 2]

# Plot joint posterior distribution
#plot(accepted_params[,1], accepted_params[,2], type = "p",
# xlab = "Theta 3", ylab = "Theta Bias",col="blue", main = "Joint Posterior Distribution", lwd=1)

ggplot(data.frame(theta3_posterior, thetaBias_posterior), aes(x = theta3_posterior, y = thetaBias_posterior)) +
  geom_bin2d(bins = 15, colour = "green") +
  labs(x = expression(theta[3]), y = expression(theta[Bias]))

#plot the marginal distribution
hist(theta3_posterior, col="orangered1", main = "Posterior distribution for Theta 3", xlab="Value", ylab="Frequency")
hist(thetaBias_posterior, col="limegreen", main = "Posterior distribution for Theta Bias", xlab="Value", ylab="Frequency")
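
# Optional: summarise each marginal posterior with its median and a 95% credible interval
print(quantile(theta3_posterior, c(0.025, 0.5, 0.975)))
print(quantile(thetaBias_posterior, c(0.025, 0.5, 0.975)))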
