--- title: "Lab3CodePractice" author: "Kenya Amano" date: "10/23/2020" output: pdf_document editor_options: chunk_output_type: console --- ```{r setup, include=FALSE} rm(list=ls()) # Clear memory set.seed(123456) # For reproducible random numbers library(MASS) # Load packages library(simcf) library(tidyverse) ``` # 2.1 Generating heteroskedastic normal data ```{r } n <- 1500 # Generate 1500 observations w0 <- rep(1, n) # Create the constant w1 <- runif(n, min = 0, max = 1) # Create two covariates w2 <- runif(n, min = 0, max = 1) x <- cbind(w0, w1, w2) # Create a matrix of the covariates z <- x # i.e., same covariates affect mu and sigma beta <- c(0, 5, 15) # Set a parameter vector for the mean gamma <- c(1, 0, 3) # Set a parameter vector for the variance ``` Let's assign mu and sigma2 ```{r} mu <- # Create systemtic component for the mean sigma2 <- # Create systematic component for the variance y <- rnorm(n = , # Create the outcome variable mean = , # Think about the stochastic component! sd = ) data <- cbind(y, w1, w2) %>% as_tibble # Save the data to a data frame ``` ## Plot ```{r} par(mfrow = c(1, 3)) #Plot the data plot(y = y, x = w1) plot(y = y, x = w2) plot(y = w1, x = w2) ``` # 2.2. Fitting a model using OLS - `lm()` ```{r} ls.result <- lm(y ~ w1 + w2, # Fit a linear model data = data) # using simulated data summary(ls.result) ls.aic <- AIC(ls.result) # Calculate and print the AIC # i.e. Akaike Information Criterion # to assess goodness of fit; lower AIC is better ls.aic ``` # 2.3 Fitting the model using ML - `optim()` Think back: $$ \begin{aligned} \log \mathcal{L}(\boldsymbol{\beta}, \boldsymbol{\gamma} | \mathbf{y}) &\propto-\frac{1}{2} \sum_{i=1}^{n} \log \mathbf{z}_{i} \gamma-\frac{1}{2} \sum_{i=1}^{n} \frac{\left(y_{i}-\mathbf{x}_{i} \boldsymbol{\beta}\right)^{2}}{\exp \left(\mathbf{z}_{i} \gamma\right)} \end{aligned} $$ ```{r} # A likelihood function for ML heteroskedastic Normal llk.hetnormlin <- function(param, y, x, z) { x <- as.matrix(x) # x (some covariates) as a matrix z <- as.matrix(z) # z (some covariates) as a matrix os <- rep(1, nrow(x)) # Set the intercept as 1 (constant) x <- cbind(os, x) # Add intercept to covariates x z <- cbind(os, z) # Add intercept to covariates z b <- param[1 : ncol(x)] # Parameters for x g <- param[(ncol(x) + 1) : (ncol(x) + ncol(z))] # Parameters for z xb <- x %*% b # Systematic components for mean s2 <- exp(z %*% g) # Systematic components for variance # Likelihood we want to maximize WRITE CODE # optim is a minimizer by default # To maximize lnL is to minimize -lnL # so the +/- signs are reversed } # Create input matrices xcovariates <- cbind(w1, w2) zcovariates <- cbind(w1, w2) # Initial guesses of beta0, beta1, ..., gamma0, gamma1, ... # We need one entry per parameter, in order! # Note: also include beta and gamma estiamtes for constants stval <- c(0, 0, 0, 0, 0, 0) # Run ML, and get the output we need hetnorm.result <- optim(stval, # Initial guesses llk.hetnormlin, # Likelihood function method = "BFGS", # Gradient method hessian = TRUE, # Return Hessian matrix y = y, # Outcome variable x = xcovariates, # Covariates x (w/o constant) z = zcovariates # Covariates z (w/o constant) ) pe <- hetnorm.result$par # Point estimates vc <- solve(hetnorm.result$hessian) # Var-cov matrix (for computing s.e.) se <- sqrt(diag(vc)) # To compute standard errors (s.e.) # take the diagonal of the Hessian; # then take square root mle.result <- cbind(pe[1:3], se[1:3]) # Report ML results colnames(mle.result) <- c("Estimate", "Std.Error") rownames(mle.result) <- c("(Intercept)", "w1", "w2") round(mle.result, 3) round(coef(summary(ls.result))[, c(1, 2)], 3) # Compare with lm() results ll <- -hetnorm.result$value # Report likelihood at maximum # No need to have negative sign # if optim is set as maximizer ll ``` ## AIC AIC is 2 * number of parameters - 2 * ll (i.e. likelihood at maximum) ```{r} hetnorm.aic <- 2 * length(stval) - 2 * ll # Compare AIC from ML and from lm(); lower is better print(hetnorm.aic) print(ls.aic) ``` # 2.4 Calculate quantities of interest - S1 ## Scenario 1: Vary covariate 1; hold covariate 2 constant ```{r} # Set up w1range <- seq(from = 0, to = 1, by = 0.05) # Set up hypothetical values for w1 w2mean <- mean(w2) # Set up mean for w2 xhypo <- crossing(w1 = w1range, w2 = w2mean) # Create a new dataset using crossing() head(xhypo) # Inspect # Calculate predicted values simPI.w1 <- predict(ls.result, # A model object newdata = xhypo, # New dataset interval = "prediction", # What kind of intervals? level = 0.95) # What levels of confidence? head(simPI.w1) # Inspect simPI.w1 <- simPI.w1 %>% as_tibble() %>% # Coerce it into a tibble bind_cols(w1 = w1range) # Combine hypo w1 with predicted y head(simPI.w1) # Inspect ``` # 2.4 Calculate quantities of interest - S1 Plot ```{r, out.width = '70%', fig.align = "center"} # ggplot2 theme_set(theme_classic()) ggplot(simPI.w1, aes(x = w1, y = fit, ymax = upr, ymin = lwr)) + geom_line() + geom_ribbon(alpha = 0.1) + labs(y = "Predicted Y", x = "Covariate 1") ``` # 2.4 Calculate quantities of interest - S1 Calculate confidence intervals using predict() ```{r} simCI.w1 <- predict(ls.result, newdata = xhypo, interval = "confidence", level = 0.95) simCI.w1 <- simCI.w1 %>% as_tibble() %>% bind_cols(w1 = w1range) head(simCI.w1) # Plot confidence intervals ggplot(simCI.w1, aes(x = w1, y = fit, ymax = upr, ymin = lwr)) + geom_line() + geom_ribbon(alpha = 0.1) + labs(y = "Predicted Y", x = "Covariate 1") ``` # 2.4 Calculate quantities of interest - S1 Plot How to plot two plots side by side? In ggplot2, we need to combine two datasets but also create a new variable to identify whether the data are from prediction or confidence intervals ```{r} simALL.w1 <- bind_rows( simPI.w1 %>% mutate(type = "PI"), simCI.w1 %>% mutate(type = "CI") ) head(simALL.w1) # Plot confidence intervals and predictive intervals side by side ggplot(simALL.w1, aes(x = w1, y = fit, ymax = upr, ymin = lwr)) + geom_line() + geom_ribbon(alpha = 0.1) + labs(y = "Predicted Y", x = "Covariate 1") + facet_grid(~ type) ``` # 2.4 Calculate quantities of interest - S2 Scenario 2: Vary covariate 2; hold covariate 1 constant ```{r} w2range <- seq(from = 0, to = 1, by = 0.05) # Set up hypothetical values for w1 w1mean <- mean(w1) # Set up mean for w2 xhypo <- crossing(w1 = w1mean, # Create a new dataset w2 = w2range) # crossing() is from tidyr package head(xhypo) # Inspect # Calculate predicted values simPI.w2 <- predict(ls.result, # A model object newdata = xhypo, # New dataset interval = "prediction", # What kind of intervals? level = 0.95) # What levels of confidence? simPI.w2 <- simPI.w2 %>% as_tibble() %>% bind_cols(w2 = w2range) # Plot the predictive intervals for hypothetical w2 ggplot(simPI.w2, aes(x = w2, y = fit, ymax = upr, ymin = lwr)) + geom_line() + geom_ribbon(alpha = 0.1) + labs(y = "Predicted Y", x = "Covariate 2") ```