# Omissions by Calvin Garner, November 30, 2018 #Aaron Erlich and Sijeong Lim, earlier versions rm(list=ls()) # Load libraries library(MASS) library(simcf) library(pscl) library(car) # for influence statistics and outlier detection library(stargazer) library(texreg) library(RColorBrewer) library(tidyverse) library(ggstance) source("http://faculty.washington.edu/cadolph/mle/avp.R") source("http://staff.washington.edu/kpleung/vis/theme/theme_cavis.R") #ggplot theme # Get 3 colors brewer <- brewer.pal(5,"Set1") red <- brewer[1] blue <- brewer[2] green <- brewer[3] purple <- brewer[4] orange <- brewer[5] data<-read.csv("http://staff.washington.edu/kamano/MLE/LabMaterials/Lab08/fish.csv", header=TRUE) summary(data) data %>% ggplot(aes(x = count))+ geom_histogram(fill = red, color="black")+ labs(x = "Number of fish", y = NULL)+ theme_bw() data %>% ggplot(aes(x = count, y=persons))+ geom_point(alpha=0.1, color=red)+ scale_x_continuous(trans='log10', breaks = c(0, 1, 2,3,4,5, 10, 20, 40, 100), sec.axis = dup_axis(name = NULL)) + # dup_axis() creates symmetric axes scale_y_continuous(breaks = seq(from = 1, to = 12, by = 1))+ labs(x = "Number of fish", y = "Number of people") + theme_bw() hist(data$count) # obs: 250 fishing group, #counts: number of fish caught, #livebait: 1 if used #persons: number of people in a fishing group # Process data for model y <- data$count x <- cbind(livebait=data$livebait,persons=data$persons) model <- count ~ livebait + persons modeldata <- as.data.frame(cbind(y,x)) names(modeldata) <- c("count","livebait","persons") # Fit Poisson model using optim llk.poisson <- function(param,y,x) { os <- rep(1,length(x[,1])) x <- cbind(os,x) b <- param[ 1 : ncol(x) ] xb <- x%*%b -sum( xb*y - exp(xb) ) # log-likelihood for poisson # optim is a minimizer, so min -ln L(param|y) } ls.result <- lm(y~x) # use ls estimates as starting values stval <- ls.result$coefficients # initial guesses pois.result <- optim(stval,llk.poisson,method="BFGS",hessian=T,y=y,x=x) # call minimizer procedure pe.pois <- pois.result$par # point estimates irr.pois<- exp(pe.pois) # incident rate ratio vc.pois <- solve(pois.result$hessian) # var-cov matrix se.pois <- sqrt(diag(vc.pois)) # standard errors ll.pois<- -pois.result$value #how do we interpret the incidence rates? cbind(pe.pois, se.pois, irr.pois) glm.pois <- glm(count~livebait+persons, modeldata, family="poisson") # Set up counterfactuals person.seq <- seq(1,12,1) xhyp <- cfMake(model,modeldata,nscen = length(person.seq)) for (i in 1:length(person.seq)){ xhyp <- cfChange(xscen = xhyp, covname = "persons", x=person.seq[i], scen=i)} # Simulate results sims <- 10000 simbetas.pois <- mvrnorm(sims,pe.pois,vc.pois) # draw parameters yhyp.pois <- loglinsimev(xhyp, simbetas.pois) # log of the expected count is being modeled in Poisson, so use loglinsimev # Plot results yhyp.tidy <- yhyp.pois %>% bind_rows() %>% mutate(people = 1:n(), model = "Poisson") ggplot(yhyp.tidy) + aes(y = people, x = pe, xmax = upper, xmin = lower) + geom_pointrangeh(size = 1, color = green) + # geom_vline(xintercept = 0, size=1.2)+ scale_x_continuous(trans='log10', breaks = c(0, 1, 2,3,4,5, 10, 20, 40), sec.axis = dup_axis(name = NULL)) + # dup_axis() creates symmetric axes scale_y_continuous(breaks = seq(from = 1, to = 12, by = 1))+ labs(x = "Number of fish", y = "Number of people") + theme_cavis_vgrid + # theme from Brian theme(axis.text.x = element_text(color = "black", size = 12), axis.ticks.x = element_blank()) ggplot(yhyp.tidy) + aes(y = ,pe, x = people, ymax = upper, ymin = lower, color = model, fill = model) + # For point estimates geom_line(show.legend = FALSE) + # For confidence intervals geom_ribbon(alpha = 0.2, linetype = 0, show.legend = FALSE) + scale_y_continuous( trans='log10', breaks = c(0, 1, 3, 5, 10, 20, 40, 100), sec.axis = dup_axis(name = NULL)) + # dup_axis() creates symmetric axes scale_x_continuous(breaks = seq(from = 1, to = 12, by = 1))+ scale_color_manual(values = c(green)) + scale_fill_manual(values = c(green)) + labs(y = "Number of fish", x = "Number of people") + theme_cavis_hgrid ############################################### # Re-analyze using negative binomial regression nb.result <- glm.nb(formula = count~livebait+persons, data = modeldata) stargazer(glm.pois, nb.result, type="text", omit="all") pe.nb <- nb.result$coefficients vc.nb <- vcov(nb.result) se.nb <- sqrt(diag(vc.nb)) # note that se values are larger now ll.nb <- as.numeric(logLik(nb.result)) theta.nb <- nb.result$theta setheta.nb <- nb.result$SE.theta print(theta.nb) summary(nb.result) # Simulate results (we re-use xhyp from above) simbetas.nb <- mvrnorm(sims, pe.nb, vc.nb) # draw parameters yhyp.nb <- loglinsimev(xhyp, simbetas.nb) # Plot results yhyp.tidy <- yhyp.tidy %>% bind_rows( yhyp.nb %>% bind_rows() %>% mutate(people = 1:n(), model = "Negative Binomial") ) ggplot(yhyp.tidy, aes(y = factor(people), x = pe, xmax = upper, xmin = lower, color = model)) + geom_pointrangeh(size = 1, position = position_dodge2v(height = 1, reverse = T)) + #Plot lines parallelly ) + scale_x_continuous(trans='log10', breaks = c(0, 1, 2, 3, 4, 5, 10, 20, 40), sec.axis = dup_axis(name = NULL)) + # dup_axis() creates symmetric axes # scale_y_discrete(limits=rev)+ scale_color_manual(values = c(orange, green))+ # For line color labs(x = "Number of fish", y = "Number of people") + theme_cavis_vgrid + # theme from Brian theme(axis.text.x = element_text(color = "black", size = 12), axis.ticks.x = element_blank(), legend.position = c(0.8, 0.2)) ggplot(yhyp.tidy) + aes(y = ,pe, x = people, ymax = upper, ymin = lower, color = model, fill = model) + # For point estimates geom_line(show.legend = FALSE) + # For confidence intervals geom_ribbon(alpha = 0.2, linetype = 0, show.legend = FALSE) + scale_y_continuous( trans='log10', breaks = c(0, 1, 3, 5, 10, 20, 40, 100), sec.axis = dup_axis(name = NULL)) + # dup_axis() creates symmetric axes scale_x_continuous(breaks = seq(from = 1, to = 12, by = 1))+ scale_color_manual(values = c(orange, green)) + scale_fill_manual(values = c(orange, green)) + labs(y = "Number of fish", x = "Number of people") + theme_cavis_hgrid ## what is theta? is it alpha or phi? # Type glm.nb and see how loglik is defined: "th" for theta # make a hypothetical data with known phi and run glm.nb to figure this out # use rnbinom ########################### # Re-analyze using Zero-Inflated Negative Binomial # Estimate ZINB using zeroinfl in the pscl library zinbmodel <- count ~ livebait + persons | livebait + persons # the first part is the count model, and the second part is the zero-inflation model. # The two components can have different regressors. zinb.result <- zeroinfl(zinbmodel, modeldata, dist="negbin") # to do ZIP, leave out dist argument summary(zinb.result) stargazer(zinb.result, nb.result, zero.component=F, star.cutoffs=0, notes.append=FALSE, type="text", notes="") # Extract results, taking care to keep track of both equations pe.zinb.count <- zinb.result$coefficients$count pe.zinb.zeros <- zinb.result$coefficients$zero pe.zinb <- c(pe.zinb.count,pe.zinb.zeros) vc.zinb <- vcov(zinb.result) se.zinb <- sqrt(diag(vc.zinb)) se.zinb.count <- se.zinb[1:length(pe.zinb.count)] se.zinb.zeros <- se.zinb[ (length(pe.zinb.count)+1) : length(se.zinb) ] ll.zinb <- as.numeric(logLik(zinb.result)) theta.zinb <- zinb.result$theta setheta.zinb <- zinb.result$SE.theta # Simulate results (we re-use xhyp from above) # E(count | not a structural zero) ,so use the count model coefficients simparam.zinb <- mvrnorm(sims,pe.zinb,vc.zinb) # draw parameters simbetas.zinb <- simparam.zinb[,1:length(pe.zinb.count)] #simgammas <- simparam[,(length(pe.zinb.count)+1) : length(se.zinb) ] yhyp.zinb <- loglinsimev(xhyp, simbetas.zinb) yhyp.tidy <- yhyp.tidy %>% bind_rows( yhyp.zinb %>% bind_rows() %>% mutate(people = 1:n(), model = "Zero inflated") ) ggplot(yhyp.tidy) + aes(y = factor(people), x = pe, xmax = upper, xmin = lower, color = model) + geom_pointrangeh(size = 1, position = position_dodge2v(height = 0.8, reverse = T)) + #Plot lines parallelly ) + scale_x_continuous(trans='log10', breaks = c(0, 1, 2, 3, 4, 5, 10, 20, 40, 100, 400), sec.axis = dup_axis(name = NULL)) + # dup_axis() creates symmetric axes # scale_y_discrete(limits=rev)+ scale_color_manual(values = c(orange, green, blue))+ # For line color labs(x = "Number of fish", y = "Number of people") + theme_cavis_vgrid + # theme from Brian theme(axis.text.x = element_text(color = "black", size = 12), axis.ticks.x = element_blank(), legend.position = c(0.8, 0.2)) ggplot(yhyp.tidy) + aes(y = pe, x = people, ymax = upper, ymin = lower, color = model, fill = model) + # For point estimates geom_line(show.legend = FALSE) + # For confidence intervals geom_ribbon(alpha = 0.2, linetype = 0, show.legend = FALSE) + scale_y_continuous( trans='log10', breaks = c(0, 1, 3, 5, 10, 20, 40, 100, 400), sec.axis = dup_axis(name = NULL)) + # dup_axis() creates symmetric axes scale_x_continuous(breaks = seq(from = 1, to = 12, by = 1))+ scale_color_manual(values = c(green, orange, blue)) + scale_fill_manual(values = c(green, orange, blue)) + labs(y = "Number of fish", x = "Number of people") + theme_cavis_hgrid # Goodness of fit test # ZINB vs NB: Vuong #A large, positive test statistic provides evidence of the superiority of m1 over m2 #Null hypothesis is that the models are indistinguishible print(vuong(m1=glm.pois, m2=nb.result)) print(vuong(m1=glm.pois, m2=zinb.result)) print(vuong(m1=zinb.result, m2=nb.result)) # Outliers? par(mfrow=c(1,2)) influencePlot(glm.pois, id.method="identify", xlab = "Hat-Values:Poisson") influencePlot(nb.result, id.method="identify:NB") data[c(89,104, 202),] #check outliers