---
title: 'CSSS/POLS 510 Maximum Likelihood Estimation: Lab 5'
editor_options:
  chunk_output_type: console
date: '2020-11-6'
output:
  beamer_presentation:
    colortheme: seagull
    fonttheme: structurebold
    keep_tex: yes
    theme: Szeged
  slidy_presentation: default
fontsize: 12pt
subtitle: Nested Logit model GOF
author: Kenya Amano
---

# Agenda

1. Homework: Recap HW2
2. Logit model: Goodness of Fit
3. Homework: HW3

# 1. Homework: Recap HW2

\tiny
\begin{aligned}
f_\text{Poisson, t}(y \mid \lambda, t) &= \frac{\exp(-\lambda_i t_i)(\lambda_i t_i)^{y_i}}{y_i!} &\\
\text{Pr}(y \mid \lambda, t) &= \prod_{i = 1}^{n} \frac{\exp(-\lambda_i t_i)(\lambda_i t_i)^{y_i}}{y_i!} &\scriptsize[\text{Joint probability}]\\
\log \mathcal{L}(\lambda \mid y) &= \log \prod_{i = 1}^{n} k(y_i) \frac{\exp(-\lambda_i t_i)(\lambda_i t_i)^{y_i}}{y_i!} &\scriptsize[\text{Converted to log likelihood}]\\
&\propto \sum_{i = 1}^{n} \Big( \log\big(\exp(-\lambda_i t_i)\big) + y_i \log(\lambda_i t_i) - \log(y_i!) \Big) &\scriptsize[\text{Simplify: proportional to}]\\
&\propto \sum_{i = 1}^{n} \Big( y_i \log(\lambda_i t_i) - \lambda_i t_i - \log(y_i!) \Big) &\scriptsize[\text{Simplify}]
\end{aligned}

# 1. Homework: Recap HW2

\tiny
\begin{aligned}
\log \mathcal{L}(\lambda \mid y) &\propto \sum_{i = 1}^{n} \Big( y_i \log(\lambda_i t_i) - \lambda_i t_i - \log(y_i!) \Big) &\\
&\propto \sum_{i = 1}^{n} \Big( y_i \big( \log \lambda_i + \log t_i \big) - \lambda_i t_i - \log(y_i!) \Big) &\scriptsize[\text{Simplify}]\\
&\propto \sum_{i = 1}^{n} \Big( y_i \log \lambda_i - \lambda_i t_i \Big) &\scriptsize[\text{Reduce to ``sufficient statistics''}]\\
\log \mathcal{L}(\beta \mid y) &\propto \sum_{i = 1}^{n} \Big( y_i (x_i \beta) - \exp(x_i \beta) t_i \Big) &\scriptsize[\text{Substitute in systematic components}]
\end{aligned}

# 1. Homework: Recap HW2

Bad assignment

\tiny
```{r, eval=F}
xb <- exp(x %*% b)
lambda <- xb
```

# 1.
Homework: Recap HW2 Good assignment \tiny {r, eval=F} xb <- x %*% b lambda <- exp(xb)  # 1. Homework: Recap HW2 Problem2: Which one do you have more confidence? $\newline$ What does it mean if you have a smaller SE? \tiny {r, echo=F, message=F, warning=F} library(tidyverse) pe1 <- c(-0.046, 0.979, 2.096) se1 <- c(0.010, 0.029, 0.030) pe2 <- c(0.088, 0.857, 2.019) se2 <- c(0.055, 0.095, 0.027) sample <- tibble( beta = c("beta0", "beta1", "beta2"), pe1 = pe1, se1 = se1, pe2 = pe2, se2 = se2 ) sample %>% pander::pander()  # 3. Homework: Recap HW2 How about this? \tiny {r, echo=F, fig.asp = 0.618, out.width = '90%', fig.align = "center"} dta <- tibble(pe = c(pe1,pe2), se = c(se1,se2), trueBeta = rep(c(0,1,2),2), beta = rep(c("beta0", "beta1", "beta2"),2), type = c(rep("pe1", 3), rep("pe2",3))) %>% mutate(lower = pe -1.96*se, upper = pe +1.96*se, Conf95 = case_when(lower<=trueBeta & upper >= trueBeta ~ 1, TRUE ~0) %>% factor()) dta %>% ggplot(aes(x = beta, y = pe, ymin = lower, ymax = upper, shape = Conf95, fill = Conf95))+ geom_point(size = 2.5, show.legend = F)+ geom_linerange()+ geom_hline(yintercept = c(0, 1, 2), linetype = "dotted")+ labs(y = "Point estimates", x = "")+ scale_shape_manual(values = c(1,16))+ scale_fill_manual(values = c("white","black"))+ facet_grid(~type)+ theme_bw()  # 3. Homework: Recap HW2 How can I calculate CIs? \tiny {r, eval=F} dta <- tibble(pe = c(pe1,pe2), se = c(se1,se2), trueBeta = rep(c(0,1,2),2), beta = rep(c("beta0", "beta1", "beta2"),2), type = c(rep("pe1", 3), rep("pe2",3))) %>% mutate(lower = pe -1.96*se, upper = pe +1.96*se, Conf95 = case_when(lower<=trueBeta & upper >= trueBeta ~ 1, TRUE ~0) %>% factor())  # 3. 
Homework: Recap HW2

Tidy code from [\underline{David Robinson}](http://varianceexplained.org/r/birthday-problem/)

\scriptsize
```{r, eval=F}
summarized <- crossing(people = seq(2, 50, 2),
                       trial = 1:10000) %>%
  mutate(birthday = map(people, ~ sample(365, ., replace = TRUE)),
         multiple = map_lgl(birthday, ~ any(duplicated(.)))) %>% # You could use "anyDuplicated()"
  group_by(people) %>%
  summarize(chance = mean(multiple))

# Visualizing the probability
ggplot(summarized, aes(people, chance)) +
  geom_line() +
  scale_y_continuous(labels = scales::percent_format()) +
  labs(y = "Probability two have the same birthday")
```

# 2. Logit model

+ Before going over goodness of fit... Let's recap the process

# 2.0 Logit model: Overview

\small
1. Obtain data
2. Think model (distribution/covariates)
3. Fit model
4. Obtain ML estimates of it
5. Interpret those estimates (Estimate QOI)
6. Test goodness of fit
7. Present your results to a broad audience

# 2.0 Logit model: Overview

\small
1. Obtain data
2. Think model (distribution/covariates)
3. Fit model
4. Obtain ML estimates of it
5. Interpret those estimates ($\textcolor{red}{\text{Estimate QOI}}$)
6. $\textcolor{red}{\text{Test goodness of fit}}$
7. Present your results to a broad audience

# 2.1 Simulating QoI

In order to use simcf to generate quantities of interest, the following steps are needed:

1. Estimate: MLE $\hat{\beta}$ and its variance $\hat{V}(\hat{\beta})$
2. Simulate estimation uncertainty from a multivariate normal distribution:\
Draw $\tilde{\beta} \sim MVN \big[\hat{\beta}, \hat{V}(\hat{\beta})\big]$
3. Create hypothetical scenarios of your substantive interest:\
Choose values of X: $X_c$

# 2.1 Simulating QoI

4. Calculate expected values:\
$\tilde{\mu_c} = g(X_c, \tilde{\beta})$
5. Simulate fundamental uncertainty:\
$\tilde{y_c} \sim f(\tilde{\mu_c}, \tilde{\alpha})$\
or
5.
Compute EVs, First Differences or Relative Risks\ EV: $\mathbb{E}(y|X_{c1})$\ FD: $\mathbb{E}(y|X_{c2}) - \mathbb{E}(y|X_{c1})$\ RR: $\frac{\mathbb{E}(y|X_{c2})}{\mathbb{E}(y|X_{c1})}$ # 2.1 Simulating QoI In order to use simcf to generate quantities of interest, the following steps are needed: 1. Estimate: MLE $\hat{\beta}$ and its variance $\hat{V}(\hat{\beta})$\ $\textcolor{red}{\rightarrow \texttt{optim(), glm()}}$ 2. Simulate estimation uncertainty from a multivariate normal distribution:\ Draw $\tilde{\beta} \sim MVN \big[\hat{\beta}, \hat{V}(\hat{\beta})\big]$\ $\textcolor{red}{\rightarrow \texttt{MASS::mvrnorm()}}$ 3. Create hypothetical scenarios of your substantive interest:\ Choose valuese of X: $X_c$ $\textcolor{red}{\rightarrow \texttt{simcf::cfmake(), cfchange()} \dots}$ # 2.1 Simulating QoI 4. Calculate expected values:\ $\tilde{\mu_c} = g(X_c, \tilde{\beta})$ \ 5. Simulate fundamental uncertainty:\ $\tilde{y_c} \sim f(\tilde{\mu_c}, \tilde{\alpha})$\ $\textcolor{red}{\rightarrow \texttt{simcf::hetnormsimpv()} \dots}$\ or 5. Compute EVs, First Differences or Relative Risks\ EV: $\mathbb{E}(y|X_{c1})$\ $\textcolor{red}{\rightarrow \texttt{simcf::logitsimev()} \dots}$\ FD: $\mathbb{E}(y|X_{c2}) - \mathbb{E}(y|X_{c1})$\ $\textcolor{red}{\rightarrow \texttt{simcf::logitsimfd()} \dots}$\ RR: $\frac{\mathbb{E}(y|X_{c2})}{\mathbb{E}(y|X_{c1})}$\ $\textcolor{red}{\rightarrow \texttt{simcf::logitsimrr()} \dots}$\ # 2.2 GOF: Overview \scriptsize 1. Likelihood Ratio Test $\newline$ 2. Akaike Information Criterion $\newline$ 3. Bayesian Information Criterion $\newline$ 4. Percent Correctly Predicted $\newline$ 5. Separation Plots $\newline$ 6. Actual vs. Predicted Plots $\newline$ 7. Error vs. Predicted Plots $\newline$ 8. ROC Plots $\newline$ 9. Residual vs Leverage Plots $\newline$ 10. Cross-validation # 2.2 GOF: Overview \scriptsize 1. $\textcolor{red}{\text{Likelihood Ratio Test}}$ $\newline$ 2. 
$\textcolor{red}{\text{Akaike Information Criterion}}$ $\newline$
3. $\textcolor{red}{\text{Bayesian Information Criterion}}$ $\newline$
4. Percent Correctly Predicted $\newline$
5. Separation Plots $\newline$
6. $\textcolor{red}{\text{Actual vs. Predicted Plots}}$ $\newline$
7. Error vs. Predicted Plots $\newline$
8. $\textcolor{red}{\text{ROC Plots}}$ $\newline$
9. Residual vs Leverage Plots $\newline$
10. $\textcolor{red}{\text{Cross-validation}}$

# 2.1. Likelihood Ratio Test

Given two nested models:
\begin{aligned}
\mathcal{M}_{1} &: \text{logit}^{-1}(\beta_{0} + \beta_{1}x_{1} + \dots +\beta_{i}x_{i})\\
\mathcal{M}_{2} &: \text{logit}^{-1}(\beta_{0} + \beta_{1}x_{1} + \dots +\beta_{i}x_{i}+\beta_{i+1}x_{i+1}+ \dots +\beta_{i+j}x_{i+j})
\end{aligned}

The likelihood ratio statistic tests the null hypothesis that the additional parameters are equal to zero:
$$H_{0}: \beta_{i+1}= \dots =\beta_{i+j}=0$$

# 2.1. Likelihood Ratio Test

$\newline$
The likelihood ratio statistic is the difference in the deviances of $\mathcal{M}_{1}$ and $\mathcal{M}_{2}$, where the deviance is -2 multiplied by the log likelihood of the model at its maximum:
$$D(\mathcal{M})= -2 \log \mathcal{L}(\mathcal{M})$$

# 2.1. Likelihood Ratio Test

Then, we draw LR:
\small
$$\text{LR} = -2\log \frac{\mathcal{L}(\mathcal{M}_{1})}{\mathcal{L}(\mathcal{M}_{2})}$$

The likelihood ratio statistic follows a Chi-squared distribution with $|\mathcal{M}_{2}|$ - $|\mathcal{M}_{1}|$ degrees of freedom, where $|\mathcal{M1}|$ is the number of parameters in model $\mathcal{M1}$.

You can evaluate it like: "Since the p-value is smaller than $\textcolor{red}{0.05}$, we reject the null hypothesis, and we favor the more complex model $\mathcal{M}_{2}$ over $\mathcal{M}_{1}$."
```{r include=FALSE}
## Estimation by ML using optim() or by glm() on reduced dataset:
## Model 1: Age, Age^2, HS, College
## Model 2: Age, Age^2, HS, College, Married

# Clear memory (slide-deck convention; avoid in reusable scripts)
rm(list=ls())

# Load libraries
library(simcf)
library(MASS)
library(nlme)
library(boot)            # For cv.glm()
library(separationplot)  # For separation plot
library(pscl)            # Alternative PCP code
library(verification)    # For ROC area
library(tile)            # For some graphics; used by plot.binPredict()
library(RColorBrewer)    # For nice colors
source("binaryGOF.R")    # Percent correctly predicted and concordance indexes
source("binPredict.R")   # Code for making predicted vs actual plots

# Get nice colors
col <- brewer.pal(5, "Set1")
blue <- col[2]
orange <- col[5]

# Note: the variable marriedo is current marrieds,
# the variable married is ever-marrieds
fulldata <- read.csv("https://faculty.washington.edu/cadolph/mle/nes00a.csv",
                     header = TRUE)

m2 <- vote00 ~ age + I(age^2) + hsdeg + coldeg + marriedo

# Keep only cases observed for all models
data <- extractdata(m2, fulldata, na.rm = TRUE)
attach(data)  # later chunks reference the columns (vote00, age, ...) directly
```

# 2.1. Likelihood Ratio Test

\scriptsize
```{r}
# Models in R formula format
m1 <- vote00 ~ age + I(age^2) + hsdeg + coldeg
m2 <- vote00 ~ age + I(age^2) + hsdeg + coldeg + marriedo

# Construct variables and model objects
y <- vote00
x1 <- cbind(age, age^2, hsdeg, coldeg)
x2 <- cbind(age, age^2, hsdeg, coldeg, marriedo)
```

# 2.1. Likelihood Ratio Test

\scriptsize
```{r}
# Likelihood function for logit:
# returns the NEGATIVE log likelihood, since optim() is a minimizer
llk.logit <- function(param, y, x) {
  os <- rep(1, length(x[, 1]))     # intercept column
  x <- cbind(os, x)
  b <- param[1:ncol(x)]
  xb <- x %*% b
  sum(y*log(1 + exp(-xb)) + (1 - y)*log(1 + exp(xb)))
  # optim is a minimizer, so min -ln L(param|y)
}

# Fit logit model using optim
ls.result <- lm(y ~ x1)            # use ls estimates as starting values
stval <- ls.result$coefficients    # initial guesses
logit.m1 <- optim(stval, llk.logit, method="BFGS", hessian=TRUE,
                  y=y, x=x1)       # call minimizer procedure
pe.m1 <- logit.m1$par              # point estimates
vc.m1 <- solve(logit.m1$hessian)   # var-cov matrix
se.m1 <- sqrt(diag(vc.m1))         # standard errors
ll.m1 <- -logit.m1$value           # likelihood at maximum
```

# 2.1. Likelihood Ratio Test

\scriptsize
```{r}
# Alternative estimation technique: GLM
glm.m1 <- glm(m1, data=data, family="binomial")

# Fit logit model with added covariate: married
ls.result <- lm(y ~ x2)            # use ls estimates as starting values
stval <- ls.result$coefficients    # initial guesses
logit.m2 <- optim(stval, llk.logit, method="BFGS", hessian=TRUE,
                  y=y, x=x2)       # call minimizer procedure
pe.m2 <- logit.m2$par              # point estimates
vc.m2 <- solve(logit.m2$hessian)   # var-cov matrix
se.m2 <- sqrt(diag(vc.m2))         # standard errors
ll.m2 <- -logit.m2$value           # likelihood at maximum

# GLM estimation of model with married
glm.m2 <- glm(m2, data=data, family="binomial")
```

# 2.1. Likelihood Ratio Test

\scriptsize
```{r}
## Goodness of fit of model 1 and model 2

# Check number of parameters in each model
k.m1 <- length(pe.m1)
k.m2 <- length(pe.m2)
k.m1
k.m2

# Likelihood ratio (LR) test:
# LR = 2*(logL2 - logL1) ~ Chi-squared with (k.m2 - k.m1) df
lr.test <- 2*(ll.m2 - ll.m1)
lr.test.p <- pchisq(lr.test, df=(k.m2 - k.m1), lower.tail=FALSE)
lr.test.p
```

# 2.2 Akaike Information Criterion

For non-nested models, we cannot use likelihood ratio tests. Instead we turn to several information theoretic measures to assess model fit, which can be thought of as penalized LR tests.
The Akaike Information Criterion (AIC) is defined as follows:
$$AIC(\mathcal{M}) = D(\mathcal{M})+2\times|\mathcal{M}|$$

Or -2 times the log likelihood of the model at its maximum plus 2 times the number of parameters. A model with a smaller AIC is preferred. Why?

# 2.2 Akaike Information Criterion

*Smaller is better:* This is because the first part, $D(\mathcal{M})$, will be lower as the likelihood increases, but it also always decreases as more variables are added to the model.

The second part, $2\times|\mathcal{M}|$, always increases as more variables are added to the model, and thus penalizes the first part. Put together, the two parts create a balance between fit and complexity.

# 2.2 Akaike Information Criterion

```{r}
# Akaike Information Criterion (AIC)
aic.m1 <- 2*k.m1 - 2*ll.m1
aic.m2 <- 2*k.m2 - 2*ll.m2
aic.test <- aic.m2 - aic.m1
aic.test
```

# 2.3 Bayesian Information Criterion

The Bayesian Information Criterion (BIC) is similar to the AIC but penalizes the deviance in a different way:
$$BIC(\mathcal{M}) = D(\mathcal{M})+ \log(n) \times|\mathcal{M}|$$

Or -2 times the log likelihood of the model at its maximum plus $\log(n)$ times the number of parameters. In this case, the penalty considers the number of observations and the number of variables.

# 2.3 Bayesian Information Criterion

The BIC is based on a Bayesian comparison of models proposed by Raftery (1996).
Recall Bayes theorem:
\begin{equation*}
\begin{split}
P(\boldsymbol{\theta}|\boldsymbol{y}) &= \frac{P(\boldsymbol{y}|\boldsymbol{\theta})P(\boldsymbol{\theta})}{P(\boldsymbol{y})}\\
\Bigg[\frac{\text{P}(\mathcal{M}_{1}|\text{Observed Data})}{\text{P}(\mathcal{M}_{2}|\text{Observed Data})} \Bigg] &\approx \Bigg[\frac{\text{P}(\text{Observed Data}|\mathcal{M}_{1})}{\text{P}(\text{Observed Data}|\mathcal{M}_{2})} \times \frac{\text{P}(\mathcal{M}_{1})}{\text{P}(\mathcal{M}_{2})}\Bigg]
\end{split}
\end{equation*}

# 2.3 Bayesian Information Criterion

If we assume that $\frac{\text{P}(\mathcal{M}_{1})}{\text{P}(\mathcal{M}_{2})}=1$, Raftery shows that

$$2 \log \Bigg[\frac{\text{P}(\text{Observed Data}|\mathcal{M}_{1})}{\text{P}(\text{Observed Data}|\mathcal{M}_{2})} \Bigg] \approx \text{BIC}_{\mathcal{M}_2} - \text{BIC}_{\mathcal{M}_1}$$

A model with a smaller BIC is again preferred.

# 2.3 Bayesian Information Criterion

Raftery's suggested guidelines for the strength of evidence favoring $\text{BIC}_{\mathcal{M}_1}$ over $\text{BIC}_{\mathcal{M}_2}$ are as follows:

\begin{table}
\begin{tabular}{l | r }
Absolute Difference & Evidence \\ \hline
0-2 & Weak \\
2-6 & Positive \\
6-10 & Strong \\
>10 & Very Strong
\end{tabular}
\end{table}

# 2.3 Bayesian Information Criterion

```{r}
# Bayesian Information Criterion (BIC)
bic.m1 <- log(nrow(x1))*k.m1 - 2*ll.m1
bic.m2 <- log(nrow(x2))*k.m2 - 2*ll.m2
bic.test <- bic.m2 - bic.m1
bic.test
```

# 2.4 Percent Correctly Predicted

We can also observe the percentage of observations that our model correctly predicts, such that
$$\hat{y}\geq0.5 \quad \text{and} \quad y=1$$
\centering
or
$$\hat{y}<0.5 \quad \text{and} \quad y=0$$
\flushleft

In other words, we check to see how often the predicted probability from our model is greater than or equal to 0.5 when the observed value is 1 and how often it is less than 0.5 when the observed value is 0.
# 2.4 Percent Correctly Predicted

Recall we want to report $\text{PCP} - \overline{y}$

\scriptsize
```{r}
# Percent correctly predicted (using glm result and my source code)
pcp.glm
```

# 2.4 Percent Correctly Predicted

\scriptsize
```{r}
pcp.glm(glm.m1, vote00, type="null")    # mean
pcp.glm(glm.m1, vote00, type="model")
pcp.glm(glm.m2, vote00, type="model")
pcp.glm(glm.m1, vote00, type="improve") # model - null
pcp.glm(glm.m2, vote00, type="improve")
```

# 2.4 Percent Correctly Predicted

Another way to compute PCP is with the pscl package and pre with the DAMisc package

\tiny
```{r, eval=F }
library(pscl)
hitmiss(glm.m1)
hitmiss(glm.m1, k=.3) #change the threshold

library(DAMisc)
pre(glm.m1)
```

# 2.5 Separation Plots

We can also visualize the PCP using separation plots.

1. Sort the observations by predicted probability of the outcome (smallest to largest)
2. Plot 1s as red lines and 0s as tan lines
3. Trace a horizontal black line as the predicted probability for each case

Red lines to the left and tan lines to the right of where the horizontal line passes the half way point to the top are mispredictions. A model that accurately predicts will be tan on the left and red on the right.

# 2.5 Separation Plots

\scriptsize
```{r}
# Separation plots
separationplot(pred=glm.m1$fitted.values, actual=glm.m1$y)
```

\centering
\includegraphics{Sep1.pdf}

# 2.5 Separation Plots

\scriptsize
```{r}
#on Screen
separationplot(pred=glm.m2$fitted.values, actual=glm.m2$y)
#PDF
#separationplot(pred=glm.m2$fitted.values, actual=glm.m2$y, file="sepplot_m1.pdf")
```

\centering
\includegraphics{Sep2.pdf}

# 2.6 Actual vs Predicted Plots

Alternatively, we can plot the average of the observed values against the predicted probabilities of the model for small intervals or bins. $\newline$

For each bin or interval, the average of the predicted probabilities should match the average of the observed values (1s and 0s) if our model is performing well.
$\newline$ \textbf{That is, for each bin we compare the avg predicted probability and avg empirical frequency.} # 2.6 Actual vs Predicted Plots These will be plot along the $45^{\circ}$ line. When the average of the observed values is above (below) the $45^{\circ}$ line, the predicted probabilities over (under) predict. # 2.6 Actual vs Predicted Plots 1. Sort the observations by predicted probability of the outcome (smallest to largest) $\newline$ 2. Bin the sorted data into ranges of predicted probabilities (spaced equally with a new bin every $\frac{1}{\# \text{of bin}}$ or with an equal number of observations with a new bin after every $\frac{\# \text{of obs}}{\# \text{of bin}}$) $\newline$ 3. Compute the average predicted probability within each bin $\newline$ # 2.6 Actual vs Predicted Plots 4. Compute the average empirical frequency within each bin $\newline$ 5. Plot the average empirical frequency against the average predicted probability for each bin # 2.6 Actual vs Predicted Plots \tiny {r} # binPredict for Actual vs Predicted plots, Error vs Predicted plots, and ROC plots # From binPredict.R source code # We use a helper function binPredict() to compute bins and ROC curves for us. 
# The we can plot one or more models using the plot function # Other options for binPredict(): # bins = scalar, number of bins (default is 20) # quantiles = logical, force bins to same # of observations (default is FALSE) # sims = scalar, if sim=0 use point estimates to compute predictions; # if sims>0 use (this many) simulations from predictive distribution # to compute predictions (accounts for model uncertainty) # default is 100 simulations; note: ROC curves always use point estimates only binnedM1 <- binPredict(glm.m1, col=blue, label="M1: Age, Edu", quantiles=TRUE) binnedM2 <- binPredict(glm.m2, col=orange, label="M2: Age, Edu, Married", quantiles=TRUE)  # 2.6 Actual vs Predicted Plots To make bins of equal probability width instead of equal # obs: \tiny {r, eval=F} binnedM1b <- binPredict(glm.m1, col=blue, label="M1: Age, Edu", quantiles=FALSE) binnedM2b <- binPredict(glm.m2, col=orange, label="M2: Age, Edu, Married", quantiles=FALSE)  # 2.6 Actual vs Predicted Plots \tiny {r} ## Some options for plot.binPredict (more in source code) ## together = logical, plot models overlapping on same plot (default is TRUE) ## display = character, avp: plot predicted actual vs predicted probs ## evr: plot actual/predicted vs predicted probs ## roc: plot receiver operator characteristic curves ## default is c("avp", "evp", "roc") for all three ## thresholds = numeric, show these thresholds on ROC plot (default is NULL) ## hide = logical, do not show number of observations in each bin (default is TRUE) ## ignore = scalar, do not show bins with fewer observations than this (default = 5) ## totalarea = scalar, total area of all circles for a model relative to plot (default=0.1) ## cex = scalar, size of numeric labels ## showbins = logical, show bin boundaries ## file = character, save result to a pdf with this file name  # 2.6 Actual vs Predicted Plots \tiny {r} # Show actual vs predicted of M1 on screen plot(binnedM1, display="avp", hide=TRUE, labx=0.35)  # 2.6 Actual vs 
Predicted Plots \tiny {r} # Show actual vs predicted of M1 and M2 to file plot(binnedM1, binnedM2, display="avp", hide=TRUE, labx=0.35)  # 2.7 Error vs Predicted Plots The EVP plot is similar to the AVP plot but uses the ratio of the actual to the predicted probabilities on the y-axis. 1. Sort the observations by predicted probability of the outcome (smallest to largest) $\newline$ 2. Bin the sorted data into ranges of predicted probabilities (spaced equally with a new bin every 1/nbin or with an equal number of observations with a new bin after every obs/nbin observations) $\newline$ 3. Compute the average predicted probability within each bin $\newline$ # 2.7 Error vs Predicted Plots 4. Compute the average empirical frequency within each bin $\newline$ 5. Compute the ratio of the average empirical frequency to the average predicted probability for each bin $\newline$ 6. Plot the ratio of the average empirical frequency to the average predicted probability against the average predicted probability for each bin # 2.7 Error vs Predicted Plots \tiny {r} # Send error vs predicted of M1 and M2 to file plot(binnedM1, binnedM2, display="evp", hide=TRUE, labx=0.35)  # 2.8 ROC Plots ROC plots allow us to observe the trade off between sensitivity (true positive rate) and specificity (true negative rate). 1. Sort observations from highest predicted probability to lowest 2. Start a line from the origin and - move up $\frac{1}{\text{total positives}}$ for each positive case - move right $\frac{1}{\text{total negatives}}$ for each negative case # 2.8 ROC Plots Successful predictions of $y=1$ will move the curve up, unsuccessful predictions of $y=1$ will move the curve right. $\newline$ A model that predicts accurately will have a large area under the curve. 
# 2.8 ROC Plots \tiny {r} # Send ROC plots for M1 and M2 to file plot(binnedM1, binnedM2, display="roc", thresholds=c(0.9, 0.8, 0.7, 0.6), labx=0.35) # Also see ROCR package for ROC curves and many other prediction metrics # and the verification package for a rudimentary roc plot function roc.plot()  # 2.8 ROC Plots \tiny {r} # Concordance Indexes / AUC (using glm result and my source code) concord.glm(glm.m1, vote00, type="null") concord.glm(glm.m1, vote00, type="model") concord.glm(glm.m2, vote00, type="model") concord.glm(glm.m1, vote00, type="improve") #(model-null)/(1-null) concord.glm(glm.m2, vote00, type="improve")  # 2.9 Residual vs Leverage Plots Another familiar technique is to observe any outliers within the data set. $\newline$ This is done by plotting each observation's leverage against its discrepancy, or the studentized residuals against the standardized hat-values. $\newline$ Recall that studentized residuals greater than 2 and less than -2 are considered large, while standardized hat-values of greater than 2 or 3 are considered large. # 2.9 Residual vs Leverage Plots \tiny {r} ### Residuals using glm version hatscore.m1 <- hatvalues(glm.m1)/mean(hatvalues(glm.m1)) rstu.m1 <- rstudent(glm.m1) hatscore.m2 <- hatvalues(glm.m2)/mean(hatvalues(glm.m2)) rstu.m2 <- rstudent(glm.m2)  # 9. Residual vs Leverage Plots {r, echo=F, fig.asp = 0.618, out.width = '60%', fig.align = "center"} tibble( x = hatscore.m1, y = rstu.m1 ) %>% ggplot(aes(x=x, y=y))+ geom_point(alpha=.3, size =3, color=blue)+ scale_x_continuous(breaks = c(0:10))+ scale_y_continuous(breaks = c(-3:3))+ geom_vline(xintercept = c(3), linetype="dashed")+ geom_hline(yintercept = c(-2,2), linetype="dashed")+ labs(x="Standardized hat-values", y="Studentized residuals")+ theme_bw()  # 9. 
Residual vs Leverage Plots

```{r, echo=F, fig.asp = 0.618, out.width = '60%', fig.align = "center"}
# Model 2: studentized residuals vs standardized hat-values
tibble(
  x = hatscore.m2,
  y = rstu.m2
) %>%
  ggplot(aes(x=x, y=y))+
  geom_point(alpha=.3, size =3, color=orange)+
  scale_x_continuous(breaks = c(0:10))+
  scale_y_continuous(breaks = c(-3:3))+
  geom_vline(xintercept = c(3), linetype="dashed")+
  geom_hline(yintercept = c(-2,2), linetype="dashed")+
  labs(x="Standardized hat-values", y="Studentized residuals")+
  theme_bw()
```

# 2.10 Cross-validation

For cross-validation, we select an error metric: in this case, we use the percent correctly predicted. $\newline$

In leave-one-out cross-validation, we remove each observation then find the rate at which the model correctly predicts the left out observation. $\newline$

We can then compare the PCP across models.

# 2.10 Cross-validation

\tiny
```{r, cache=TRUE}
### Cross-validation (takes a few minutes to run)

## A percent-correctly-predicted-style cost function
## r is actual y, pi is expected y
## Rate of inaccuracy: mean(vote00!=round(yp))
costpcp <- function(r, pi=0) mean(r!=round(pi))

## an alternative cost function for binary data
## cost <- function(r, pi=0) mean(abs(r-pi)>0.5)

cv.m1 <- cv.glm(data, glm.m1, costpcp)
cvPCP.m1 <- 1 - cv.m1$delta[2]

cv.m2 <- cv.glm(data, glm.m2, costpcp)
cvPCP.m2 <- 1 - cv.m2$delta[2]

cvPCP.m1
cvPCP.m2
```

# 2.10 Cross-validation

\tiny
```{r, cache=TRUE}
#### More cross-validation

## A simple leave-one-out cross-validation function for logit glm;
## returns predicted probabilities for each held-out observation
loocv <- function(obj) {
  data <- obj$data
  m <- dim(data)[1]          # number of observations
  form <- formula(obj)
  fam <- obj$family$family
  loo <- rep(NA, m)          # preallocate predicted probabilities
  for (i in seq_len(m)) {
    # Refit on all rows except i, then predict the held-out row
    i.glm <- glm(form, data = data[-i, ], family = fam)
    loo[i] <- predict(i.glm, newdata = data[i, ], type = "response")
  }
  loo
}

# LOOCV for models 1 and 2
predCVm1 <- loocv(glm.m1)
predCVm2 <- loocv(glm.m2)
```

# 2.10 Cross-validation

\tiny
```{r, cache=TRUE}
# Make cross-validated AVP and ROC plots; note use of newpred input in binPredict
binnedM1cv <- binPredict(glm.m1, newpred=predCVm1, col=blue, label="M1: LOO-CV",
                         quantiles=TRUE)
plot(binnedM1cv, display=c("avp","roc"), hide=TRUE,
     thresholds=c(0.9, 0.8, 0.7, 0.6), labx=0.25)
```

# 2.10 Cross-validation

\tiny
```{r, cache=TRUE}
binnedM2cv <- binPredict(glm.m2, newpred=predCVm2, col=orange, label="M2: LOO-CV",
                         quantiles=TRUE)
plot(binnedM2cv, display=c("avp","roc"), hide=TRUE,
     thresholds=c(0.9, 0.8, 0.7, 0.6), labx=0.25)
```

# 2.10 Cross-validation

\tiny
```{r}
plot(binnedM1cv, binnedM2cv, display=c("avp","roc"), hide=TRUE,
     thresholds=c(0.9, 0.8, 0.7, 0.6), labx=0.25)
```

# 3. Homework: Question HW3

+ Due on Nov 12
+ One common problem when knitting: the math mode environment doesn't like white space or empty lines
+ Try \begin{aligned} instead of \begin{split}
+ R Markdown guide is [\underline{here}](https://bookdown.org/yihui/rmarkdown)
+ Email subject: **MLE510HW3**
+ File name: **MLE510HW3KenyaAmano**