---
title: "Lab01: Practice Code - Key"
author: "Kenya Amano"
date: "10/2/2020"
output:
  pdf_document: default
  html_document:
    df_print: paged
editor_options:
  chunk_output_type: console
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

# Prerequisite

```{r, message=FALSE}
# Start from a fresh R session (Session > Restart R) instead of calling
# rm(list = ls()): that call deletes the user's objects but does not reset
# loaded packages or options, so it gives a false sense of a clean slate.
library(tidyverse) # Load package
```

# Working directory

Check if your working directory is correct (where you have saved `Lab01Data.csv` and `Lab01Survey.csv`)

```{r}
# Current working directory; the data files are expected to live here
basedir <- getwd()

# Path to an optional sub-folder for raw data (replace the placeholder name)
rowdata.folder <- file.path(basedir, "Specify if you create a folder")
```

# Vector Practice

1. `Vector1` : The numbers one through five and then the number six five times
2. `Vector2` : 10 randomly drawn numbers from a normal distribution with a mean of 10 and a s.d. of 1
3. `Vector3` : Results of 10 single binomial trials with a probability of 0.4
4. `Vector4` : Sample 100 observations from a 5-trial binomial distribution with a probability of success of 0.4
5. `Vector5` : The numbers one through three and the word apple

```{r}
# 1: three equivalent ways to build the same vector
Vector1 <- c(1, 2, 3, 4, 5, 6, 6, 6, 6, 6)
Vector1 <- c(1:5, 6, 6, 6, 6, 6)
Vector1 <- c(seq(from = 1, to = 5, by = 1), rep(6, 5))

# 2
Vector2 <- rnorm(n = 10, mean = 10, sd = 1)

# 3
Vector3 <- rbinom(n = 10, size = 1, prob = 0.4)

# 4
Vector4 <- rbinom(n = 100, size = 5, prob = 0.4)

# Open the help page only in an interactive session; it is not useful output
# when the document is knitted.
if (interactive()) help("rbinom")

# The size is the total number of trials, of which size*prob are expected to
# be successes.

# This gives the results of 10 runs of 1 coin flips each with 0.4 winning
# probability, returning the number of successes in each run
stem(rbinom(n = 10, size = 1, prob = 0.4))

# This gives the results of 100 runs of 5 coin flips each with 0.4 winning
# probability, returning the number of successes in each run
stem(rbinom(n = 100, size = 5, prob = 0.4))

# 5: mixing numbers and a string in c() coerces everything to character
Vector5 <- c(1:3, "apple")
```

6. What type of data is Vector2?
7. Round Vector2 to two decimal places
8. What happened in Vector5?

```{r}
# 6: Vector2 is numeric (double), not character
is.character(Vector2)
mode(Vector2)
class(Vector2)

# 7
round(Vector2, 2)

# 8: the numbers were coerced to character strings
class(Vector5)
Vector5
```

# Matrices Practice

1. Matrix1: Create 5 by 5 matrix containing all NAs
2. Assign Matrix1 the row names (a,b,c,d,e) and the column names (1,2,3,4,5)
3. Replace the NAs in the first column of Matrix1 with "Inf"

```{r}
# 1
Matrix1 <- matrix(data = NA, nrow = 5, ncol = 5)

# 2
rownames(Matrix1) <- c("a", "b", "c", "d", "e")
colnames(Matrix1) <- c(1, 2, 3, 4, 5)

# 3
Matrix1[, 1] <- Inf

# Uncomment to see the whole matrix coerced to character:
# Matrix1[1, 3] <- "apple"
```

# List Practice

1. Create a list that contains Vector1, Vector2, Vector3, and Matrix1
2. Name each list component as Vector1, Vector2, Vector3, and Matrix1 respectively
3. Locate Vector2 from the list

```{r}
# 1: the list must be called List1, because steps 2 and 3 refer to List1
List1 <- list(Vector1, Vector2, Vector3, Matrix1)

# 2
names(List1) <- c("Vector1", "Vector2", "Vector3", "Matrix1")

# 3: by position ...
List1[[2]]
# ... or by name
List1$Vector2
```

# Data Frames Practice 1

## 1. Load Lab01Data.csv in R

```{r}
# Load data
# Local alternative:
# Dta <- read.csv("Lab01Data.csv", header = TRUE, stringsAsFactors = FALSE)
DataURL <- "http://staff.washington.edu/kamano/MLE/LabMaterials/Lab01/Lab01Data.csv"
Dta <- read.csv(DataURL)
```

## 2. What is the data structure? What does that tell us about type?

```{r}
# Check structure
dim(Dta)
class(Dta)
is.data.frame(Dta)
is.matrix(Dta)

# Alternatively
str(Dta)
summary(Dta)
```

## 3. Check the names and summary statistics of the data. Fix any names that are less than good.

```{r}
# Check and fix names
names(Dta)
names(Dta)[3] <- "GdpPerCap" # base R: rename the third column by position
names(Dta) # Check again

# Tidy way: rename by name (new = old)
Dta <- Dta %>%
  rename(Country = country, Polity2 = polity2)

# Summary Statistics
summary(Dta)
```

## 4. Remove observations with missing values

```{r}
# Remove NAs
DataClean <- na.omit(Dta) # listwise deletion!!

# Tidy way (printed only, not stored)
Dta %>%
  na.omit()

# Compare the number of rows before and after
dim(Dta)
dim(DataClean)
```

## 5. Calculate the average GDP per capita for Brazil for the observed period. Repeat the calculation for all countries.

```{r}
# Base R
mean(DataClean[DataClean$Country == "Brazil", "GdpPerCap"])

# Tidy way
DataClean %>%
  filter(Country == "Brazil") %>%
  summarize(mean(GdpPerCap))

# Average GdpPerCap for all countries
DataClean %>%
  group_by(Country) %>%
  summarize(mean(GdpPerCap))

# Several statistics at once; naming the list gives readable column names
# (GdpPerCap_mean, GdpPerCap_median)
DataClean %>%
  group_by(Country) %>%
  summarize(across(GdpPerCap, list(mean = mean, median = median)))
```

## 6. Plot GDP per capita (on the x-axis) and Polity2 (on the y-axis)

```{r}
# Base Graphics
# Reset the graphics device only when running interactively with a device
# open: a bare dev.off() errors on the null device and, while knitting, may
# close the device knitr uses to record plots.
if (!isTRUE(getOption("knitr.in.progress")) && dev.cur() > 1L) {
  dev.off()
}

plot(x = DataClean$GdpPerCap, y = DataClean$Polity2)

# Try logging GDP
plot(
  x = log(DataClean$GdpPerCap),
  y = DataClean$Polity2,
  xlab = "Logged GDP per capita",
  ylab = "Polity2"
)

# ggplot2
ggplot(DataClean, aes(y = Polity2, x = log(GdpPerCap))) +
  geom_point() +
  labs(x = "Logged GDP per capita", y = "Polity2") +
  theme_classic()
```

## 7. Create a new variable called "democracy". Assign 0 to countries with a negative or zero Polity2 score, and assign 1 to countries with a positive score.

```{r, results='hide'}
# Create a variable called "democracy"
DataClean$democracy <- NA
head(DataClean)

# You can subset data based on a logical statement
DataClean$Polity2 <= 0
DataClean[DataClean$Polity2 <= 0, ] # ",__" specify all columns

# Take advantage of this: Assign values to "democracy" based on Polity2 values
DataClean$democracy[DataClean$Polity2 <= 0] <- 0

# Do the same for positive Polity2 score
DataClean$democracy[DataClean$Polity2 > 0] <- 1

# Tidy way (printed only; DataClean already holds the base R result)
DataClean %>%
  mutate(democracy = case_when(
    Polity2 <= 0 ~ 0,
    TRUE ~ 1
  ))

# "Polity2 > 0 ~ 1" also works
DataClean %>%
  mutate(democracy = case_when(
    Polity2 <= 0 ~ 0,
    Polity2 > 0 ~ 1
  ))
```

# Data Frames Practice 2

## 1. Read in the data `Lab01Survey.csv`

```{r}
# Load data. Nothing below depends on objects created earlier, so there is
# no need to wipe the workspace with rm(list = ls()); restart R if a clean
# session is wanted.
# Local alternative:
# SurveyData <- read.csv(file = "Lab01Survey.csv")
DataURL2 <- "http://staff.washington.edu/kamano/MLE/LabMaterials/Lab01/Lab01Survey.csv"
SurveyData <- read.csv(DataURL2)
```

## 2. Inspect the data. What format are they in?
What values do the data take, and how do those values correspond with the survey?

```{r}
str(SurveyData)
```

## 3. Generate some summary statistics.

```{r}
summary(SurveyData)

mean(SurveyData$R)
mean(SurveyData$latex)

median(SurveyData$R)
median(SurveyData$latex)

sd(SurveyData$R)
sd(SurveyData$latex)

# Tidy way: a named list of functions replaces the deprecated funs();
# columns come out as <variable>_<statistic>, e.g. R_mean
SurveyData %>%
  summarize(across(
    everything(),
    list(mean = mean, median = median, sd = sd, min = min, max = max)
  ))
# %>% pivot_longer(everything(), names_to = "stat")
```

## 4. How are these two variables related to each other (assuming equal intervals b/w categories)?

```{r}
cor1 <- cor(SurveyData$R, SurveyData$latex)
```

The correlation b/w R knowledge and LaTeX knowledge is `r cor1`, or more nicely, `r round(cor1, 2)`.

## 5. Are there any problems with the way the data are coded? (Think about lecture yesterday.)

## 6. Recode the data

- 0: What's that?
- 1: I've heard of it
- 2: I can use it or apply it
- 3: I understand it well

```{r}
# Each code is matched explicitly. A catch-all `TRUE ~ ...` branch would
# silently label missing or unexpected codes as "I understand it well";
# with explicit matches they become NA instead.
SurveyData %>%
  mutate(
    # Recode R into categories
    RCat = case_when(
      R == 0 ~ "What's that?",
      R == 1 ~ "I've heard of it",
      R == 2 ~ "I can use it or apply it",
      R == 3 ~ "I understand it well"
    ),
    # Recode latex into categories
    latexCat = case_when(
      latex == 0 ~ "What's that?",
      latex == 1 ~ "I've heard of it",
      latex == 2 ~ "I can use it or apply it",
      latex == 3 ~ "I understand it well"
    )
  )

# We're repeating ourselves... Must be a faster way:
# apply one recoding function to both columns (overwrites R and latex)
SurveyDataRecode <- SurveyData %>%
  mutate(across(
    c(R, latex),
    function(x) {
      case_when(
        x == 0 ~ "What's that?",
        x == 1 ~ "I've heard of it",
        x == 2 ~ "I can use it or apply it",
        x == 3 ~ "I understand it well"
      )
    }
  ))
```

## 7. Why is this coding method better?

## 8. Generate some plots of the data: bar charts are good here, scatterplots even better.

```{r, echo=FALSE}
# Bar charts (categories are still character strings here)
ggplot(SurveyDataRecode, aes(x = R)) +
  geom_bar() +
  labs(x = "R knowledge")

SurveyDataRecode %>%
  ggplot(aes(x = latex)) +
  geom_bar() +
  labs(x = "LaTeX knowledge")

# Scatter plot
SurveyDataRecode %>%
  ggplot(aes(x = R, y = latex)) +
  geom_jitter(alpha = .7, height = .2, width = .2) +
  labs(x = "R knowledge", y = "LaTeX knowledge") +
  theme_classic()

##### Something is wrong? #####

# Convert two variables into factors so the categories have an explicit order
KnowledgeLevels <- c(
  "What's that?",
  "I've heard of it",
  "I can use it or apply it",
  "I understand it well"
)

SurveyDataPlot <- SurveyDataRecode %>%
  mutate(
    R = factor(R, levels = KnowledgeLevels),
    latex = factor(latex, levels = KnowledgeLevels)
  )

# Redo the scatter plot; fix the limits on BOTH axes so every knowledge
# level is shown, in order, even if nobody chose it
SurveyDataPlot %>%
  ggplot(aes(x = R, y = latex)) +
  geom_jitter(alpha = .7, height = .2, width = .2) +
  labs(x = "R knowledge", y = "LaTeX knowledge") +
  scale_x_discrete(limits = KnowledgeLevels) +
  scale_y_discrete(limits = KnowledgeLevels) +
  theme_classic()
```

# LaTeX in R Markdown

$$
1 + 1 = 2
$$

$$
11 \times 11 = 121
$$

$$
E = mc^2
$$

I think it's Einstein who proposed $E = mc^2$.

$$
x = \frac{-b \pm \sqrt{b^2 - 4ac}}{2a}
$$

$$
\begin{split}
X & = (x+a)(x-b) \\
  & = x(x-b) + a(x-b) \\
  & = x^2 + x(a-b) - ab
\end{split}
$$

# Install guide for Chris's packages and TinyTeX

```{r, eval=FALSE}
# One-time setup: run these lines manually. eval=FALSE keeps the packages
# from being re-installed every time the document is knitted.

# Install tile
install.packages(
  "https://faculty.washington.edu/cadolph/software/tile_0.4.15.tar.gz",
  repos = NULL,
  type = "source"
)

# Install simcf
install.packages(
  "https://faculty.washington.edu/cadolph/software/simcf_0.2.18.tar.gz",
  repos = NULL,
  type = "source"
)
```

# Install tinytex

## Caution!!!! It takes sooooooooooooo long!! ##

```{r, eval=FALSE}
# One-time setup: run manually, not on every knit
install.packages("tinytex")
tinytex::install_tinytex()
```