---
title: "Lab3CodePractice"
author: "Kenya Amano"
date: "10/23/2020"
output: pdf_document
editor_options:
  chunk_output_type: console
---

```{r setup, include=FALSE}
rm(list = ls())    # Clear memory
set.seed(123456)   # For reproducible random numbers
library(MASS)      # Load packages
library(simcf)
library(tidyverse) # This should be after MASS
```

$$
\begin{aligned}
\mathrm{P}\left(\mathbf{y} | \boldsymbol{\mu}, \boldsymbol{\sigma}^{2}\right)
  &= \prod_{i=1}^{n} f_{\mathcal{N}}\left(y_{i} | \mu_{i}, \sigma_{i}^{2}\right)
  & [\text{Joint probability}] \\
\mathrm{P}\left(\mathbf{y} | \boldsymbol{\mu}, \boldsymbol{\sigma}^{2}\right)
  &= \prod_{i=1}^{n}\left(2 \pi \sigma_{i}^{2}\right)^{-1 / 2}
     \exp \left[\frac{-\left(y_{i}-\mu_{i}\right)^{2}}{2 \sigma_{i}^{2}}\right]
  & [\text{Expressed in Normal distribution}] \\
\log \mathcal{L}\left(\boldsymbol{\beta}, \boldsymbol{\sigma}^{2} | \mathbf{y}\right)
  & \propto -\frac{1}{2} \sum_{i=1}^{n} \log \sigma_{i}^{2}
    -\frac{1}{2} \sum_{i=1}^{n}
     \frac{\left(y_{i}-\mathbf{x}_{i} \boldsymbol{\beta}\right)^{2}}{\sigma_{i}^{2}}
  & [\text{Converted to log likelihood; simplify}] \\
\log \mathcal{L}(\boldsymbol{\beta}, \boldsymbol{\gamma} | \mathbf{y})
  & \propto -\frac{1}{2} \sum_{i=1}^{n} \log \mathbf{z}_{i} \gamma
    -\frac{1}{2} \sum_{i=1}^{n}
     \frac{\left(y_{i}-\mathbf{x}_{i} \boldsymbol{\beta}\right)^{2}}{\exp \left(\mathbf{z}_{i} \gamma\right)}
  & [\text{Substitute in systematic components}]
\end{aligned}
$$

# 2.1 Generating heteroskedastic normal data

```{r}
n <- 1500                        # Generate 1500 observations
w0 <- rep(1, n)                  # Create the constant
w1 <- runif(n, min = 0, max = 1) # Create two covariates
w2 <- runif(n, min = 0, max = 1)
x <- cbind(w0, w1, w2)           # Create a matrix of the covariates
z <- x                           # i.e., same covariates affect mu and sigma
beta <- c(0, 5, 15)              # Set a parameter vector for the mean
gamma <- c(1, 0, 3)              # Set a parameter vector for the variance
```

```{r}
mu <- x %*% beta           # Create systematic component for the mean
sigma2 <- exp(z %*% gamma) # Create systematic component for the variance

y <- rnorm(n = n,          # Create the outcome variable
           mean = mu,      # Think about the stochastic component!
           sd = sqrt(sigma2))

data <- cbind(y, w1, w2) %>% as_tibble # Save the data to a data frame
```

# 2.2. Fitting a model using OLS - lm()

```{r}
ls.result <- lm(y ~ w1 + w2, # Fit a linear model
                data = data) # using simulated data
summary(ls.result)

ls.aic <- AIC(ls.result) # Calculate and print the AIC
                         # i.e. Akaike Information Criterion
                         # to assess goodness of fit; lower AIC is better
ls.aic
```

# 2.3 Fitting the model using ML - optim()

```{r}
# A likelihood function for ML heteroskedastic Normal
llk.hetnormlin <- function(param, y, x, z) {
  x <- as.matrix(x)     # x (some covariates) as a matrix
  z <- as.matrix(z)     # z (some covariates) as a matrix
  os <- rep(1, nrow(x)) # Set the intercept as 1 (constant)
  x <- cbind(os, x)     # Add intercept to covariates x
  z <- cbind(os, z)     # Add intercept to covariates z

  b <- param[1:ncol(x)]                         # Parameters for x
  g <- param[(ncol(x) + 1):(ncol(x) + ncol(z))] # Parameters for z

  xb <- x %*% b      # Systematic components for mean
  s2 <- exp(z %*% g) # Systematic components for variance

  sum(0.5 * (log(s2) + (y - xb)^2 / s2)) # Likelihood we want to maximize
                                         # optim is a minimizer by default
                                         # To maximize lnL is to minimize -lnL
                                         # so the +/- signs are reversed
}

# Create input matrices
xcovariates <- cbind(w1, w2)
zcovariates <- cbind(w1, w2)

# Initial guesses of beta0, beta1, ..., gamma0, gamma1, ...
# We need one entry per parameter, in order!
# Note: also include beta and gamma estimates for constants
# (betas first, then gammas; one starting value per parameter)
stval <- rep(0, ncol(xcovariates) + ncol(zcovariates) + 2)

# Run ML, and get the output we need
hetnorm.result <- optim(stval,           # Initial guesses
                        llk.hetnormlin,  # Likelihood function
                        method = "BFGS", # Gradient method
                        hessian = TRUE,  # Return Hessian matrix
                        y = y,           # Outcome variable
                        x = xcovariates, # Covariates x (w/o constant)
                        z = zcovariates  # Covariates z (w/o constant)
                        )

pe <- hetnorm.result$par            # Point estimates
vc <- solve(hetnorm.result$hessian) # Var-cov matrix (for computing s.e.)
se <- sqrt(diag(vc))                # To compute standard errors (s.e.)
                                    # take the diagonal of the Hessian;
                                    # then take square root

mle.result <- cbind(pe[1:3], se[1:3]) # Report ML results (mean equation only)
colnames(mle.result) <- c("Estimate", "Std.Error")
rownames(mle.result) <- c("(Intercept)", "w1", "w2")
round(mle.result, 3)
round(coef(summary(ls.result))[, c(1, 2)], 3) # Compare with lm() results

ll <- -hetnorm.result$value # Report likelihood at maximum
                            # No need to have negative sign
                            # if optim is set as maximizer
ll
```

## AIC

AIC is 2 * number of parameters - 2 * ll (i.e. likelihood at maximum)

```{r}
hetnorm.aic <- 2 * length(stval) - 2 * ll
# Compare AIC from ML and from lm(); lower is better
print(hetnorm.aic)
print(ls.aic)
```

# 2.4 Calculate quantities of interest - S1

## Scenario 1: Vary covariate 1; hold covariate 2 constant

```{r}
# Set up
w1range <- seq(from = 0, to = 1, by = 0.05) # Set up hypothetical values for w1
w2mean <- mean(w2)                          # Set up mean for w2
xhypo <- crossing(w1 = w1range,             # Create a new dataset
                  w2 = w2mean)              # using crossing()
head(xhypo) # Inspect

# Calculate predicted values
simPI.w1 <- predict(ls.result,               # A model object
                    newdata = xhypo,         # New dataset
                    interval = "prediction", # What kind of intervals?
                    level = 0.95)            # What levels of confidence?
head(simPI.w1) # Inspect

simPI.w1 <- simPI.w1 %>%
  as_tibble() %>%         # Coerce it into a tibble
  bind_cols(w1 = w1range) # Combine hypo w1 with predicted y
head(simPI.w1) # Inspect
```

# 2.4 Calculate quantities of interest - S1 Plot

```{r, out.width = '70%', fig.align = "center"}
# ggplot2
theme_set(theme_classic())
ggplot(simPI.w1, aes(x = w1, y = fit, ymax = upr, ymin = lwr)) +
  geom_line() +
  geom_ribbon(alpha = 0.1) +
  labs(y = "Predicted Y", x = "Covariate 1")
```

# 2.4 Calculate quantities of interest - S1

Calculate confidence intervals using predict()

```{r}
simCI.w1 <- predict(ls.result,
                    newdata = xhypo,
                    interval = "confidence",
                    level = 0.95)
simCI.w1 <- simCI.w1 %>%
  as_tibble() %>%
  bind_cols(w1 = w1range)
head(simCI.w1)

# Plot confidence intervals
ggplot(simCI.w1, aes(x = w1, y = fit, ymax = upr, ymin = lwr)) +
  geom_line() +
  geom_ribbon(alpha = 0.1) +
  labs(y = "Predicted Y", x = "Covariate 1")
```

# 2.4 Calculate quantities of interest - S1 Plot

How to plot two plots side by side? In ggplot2, we need to combine two
datasets but also create a new variable to identify whether the data are from
prediction or confidence intervals

```{r}
simALL.w1 <- bind_rows(
  simPI.w1 %>% mutate(type = "PI"),
  simCI.w1 %>% mutate(type = "CI")
)
head(simALL.w1)

# Plot confidence intervals and predictive intervals side by side
ggplot(simALL.w1, aes(x = w1, y = fit, ymax = upr, ymin = lwr)) +
  geom_line() +
  geom_ribbon(alpha = 0.1) +
  labs(y = "Predicted Y", x = "Covariate 1") +
  facet_grid(~ type)
```

# 2.4 Calculate quantities of interest - S2

Scenario 2: Vary covariate 2; hold covariate 1 constant

```{r}
w2range <- seq(from = 0, to = 1, by = 0.05) # Set up hypothetical values for w2
w1mean <- mean(w1)                          # Set up mean for w1
xhypo <- crossing(w1 = w1mean,              # Create a new dataset
                  w2 = w2range)             # crossing() is from tidyr package
head(xhypo) # Inspect

# Calculate predicted values
simPI.w2 <- predict(ls.result,               # A model object
                    newdata = xhypo,         # New dataset
                    interval = "prediction", # What kind of intervals?
level = 0.95) # What levels of confidence? simPI.w2 <- simPI.w2 %>% as_tibble() %>% bind_cols(w2 = w2range) # Plot the predictive intervals for hypothetical w2 ggplot(simPI.w2, aes(x = w2, y = fit, ymax = upr, ymin = lwr)) + geom_line() + geom_ribbon(alpha = 0.1) + labs(y = "Predicted Y", x = "Covariate 2")  # 2.5 Simulating QoI using simcf {r} # Draw parameters from the model predictive distribution sims <- 10000 simparam <- mvrnorm(n = sims, mu = pe, # M.N.D. with population mean = pe (from ML); Sigma = vc) # population var-covar matrix = vc (from ML) head(simparam) # Inspect  # 2.5 Simulating QoI using simcf :S1 {R} # Separate into the simulated betas and simulated gammas simbetas <- simparam[ , 1:(ncol(xcovariates) + 1)] simgammas <- simparam[ , (ncol(simbetas) + 1):ncol(simparam)] # Specify model formulas varmodel <- (y ~ w1 + w2)  # 2.5 Simulating QoI using simcf :S1 {r} # Create hypothetical scenarios for covariate 1 w1range <- seq(from = 0, to = 1, by = 0.2) # Use cfMake to create a baseline dataset xhypo <- cfMake(varmodel, data, nscen = length(w1range)) xhypo # Use cfChange and loop function to loop through each hypothetical x values for (i in 1:length(w1range)) { xhypo <- cfChange(xscen = xhypo, covname = "w1", x = w1range[i], scen = i) } xhypo # Repeat the same procedures for z zhypo <- cfMake(varmodel, data, nscen = length(w1range)) for (i in 1:length(w1range)) { zhypo <- cfChange(zhypo, "w1", x = w1range[i], scen = i) } zhypo  # 2.5 Simulating QoI using simcf :S1 {r} # Simulate predictive intervals for heteroskedastic linear models # hetnormsimpv() is from simcf package simRES.w1 <- hetnormsimpv(xhypo, simbetas, zhypo, simgammas, ci = 0.95, constant = 1, varconstant = 1) simRES.w1  # 2.5 Simulating QoI using simcf :S1 {r, fig.asp = 0.618, out.width = '60%', fig.align = "center"} simRES.w1 <- simRES.w1 %>% bind_rows() %>% # Collaspe the list into d.f. bind_cols(w1 = w1range) # Combine with hypo. 
w1 values simRES.w1 # Inspect  # 2.5 Simulating QoI using simcf :S1 {r, fig.asp = 0.618, out.width = '80%', fig.align = "center"} # ggplot2 ggplot(simRES.w1, aes(x = w1, y = pe, ymax = upper, ymin = lower)) + geom_line() + geom_ribbon(alpha = 0.1) + labs(y = "Predicted Y", x = "Covariate 1")  # 2.5 Simulating QoI using simcf :S2 Scenario 2: Vary covariate 2; hold covariate 1 constant 1. Create a data frame with a set of hypothetical scenarios for covariate 2 while keeping covariate 1 at its mean 2. Simulate the predicted values using MASS and simcf 3. Plot the results # 2.5 Simulating QoI using simcf :S2 {r} # Create hypothetical scenarios for w2 w2range <- xhypo <- for (i in ) { xhypo <- } zhypo <- for (i in ) { zhypo <- }  # 2.5 Simulating QoI using simcf :S2 {r} # Simulate the predicted Y's and PI's simRES.w2 <- hetnormsimpv(xhypo, simbetas, zhypo, simgammas, ci = 0.95, constant = 1, varconstant = 1) simRES.w2 <- simRES.w2 %>% bind_rows() %>% bind_cols(w2 = w2range)  # 2.5 Simulating QoI using simcf :S2 {r, fig.asp = 0.618, out.width = '80%', fig.align = "center"} # Plot the predictive intervals for hypothetical w2 ggplot(simRES.w2, aes(x = w2, y = pe, ymax = upper, ymin = lower)) + geom_line() + geom_ribbon(alpha = 0.1) + labs(y = "Predicted Y", x = "Covariate 2")  Can we compare them with the prediction intervals generated by predict()? # 2.5 Simulating QoI using simcf :S2 {r, fig.asp = 0.618, out.width = '65%', fig.align = "center"} # Combine two dataframes simPI.w2 <- simPI.w2 %>% rename(pe = fit, lower = lwr, upper = upr) simALL.w2 <- bind_rows( simRES.w2 %>% mutate(method = "sim"), simPI.w2 %>% mutate(method = "lm") ) # Plot the predictive intervals for hypothetical w2 ggplot(simALL.w2, aes(x = w2, y = pe, ymax = upper, ymin = lower)) + geom_line() + geom_ribbon(alpha = 0.1) + labs(y = "Predicted Y", x = "Covariate 2") + facet_grid(~ method)