---
title: "Lab01: Practice Code - Key"
author: "Kenya Amano"
date: "10/2/2020"
output:
  pdf_document: default
  html_document:
    df_print: paged
editor_options:
  chunk_output_type: console
---
```{r setup, include=FALSE}
# Set the default chunk option echo = TRUE for every chunk that follows
knitr::opts_chunk$set(echo = TRUE)
```
# Prerequisite
```{r, message=FALSE}
# Remove every object listed by ls() from the current environment.
# Note: this does not detach packages or reset options.
rm(list = ls()) # Clear memory
# Attach the tidyverse; the chunks below use %>%, dplyr verbs, and ggplot()
library(tidyverse) # Load package
```
# Working directory
Check if your working directory is correct (where you have saved `Lab01Data.csv` and `Lab01Survey.csv`)
```{r}
# Remember the current working directory, then build the path to an optional
# data sub-folder underneath it (replace the placeholder with the folder name).
basedir <- getwd()
rowdata.folder <- file.path(basedir, "Specify if you create a folder")
```
# Vector Practice
1. `Vector1` : The numbers one through five and then the number six five times
2. `Vector2` : 10 randomly drawn numbers from a normal distribution with a mean of 10 and a s.d. of 1
3. `Vector3` : Results of 10 single binomial trials with a probability of 0.4
4. `Vector4` : Sample 100 observations from a 5-trial binomial distribution with a probability of success of 0.4
5. `Vector5` : The numbers one through three and the word apple
```{r}
# 1. Three equivalent ways to build: one through five, then six repeated 5 times
Vector1 <- c(1, 2, 3, 4, 5, 6, 6, 6, 6, 6)
Vector1 <- c(1:5, 6, 6, 6, 6, 6)
Vector1 <- c(seq(from = 1, to = 5, by = 1), rep(6, 5))

# 2. Ten draws from a normal distribution with mean 10 and s.d. 1
Vector2 <- rnorm(n = 10, mean = 10, sd = 1)

# 3. Ten binomial draws of a single trial each (size = 1), success prob. 0.4
Vector3 <- rbinom(n = 10, size = 1, prob = 0.4)

# 4. One hundred binomial draws of five trials each (size = 5), success prob. 0.4
Vector4 <- rbinom(n = 100, size = 5, prob = 0.4)

# Open the help page only in an interactive session, so that knitting the
# document does not try to launch the help viewer.
if (interactive()) {
  help("rbinom")
}
# The size is the total number of trials, of which size*prob are expected to be successes.
stem(rbinom(n = 10, size = 1, prob = 0.4))
# This gives the results of 10 runs of 1 coin flip each with 0.4 winning probability, returning the number of successes in each run
stem(rbinom(n = 100, size = 5, prob = 0.4))
# This gives the results of 100 runs of 5 coin flips each with 0.4 winning probability, returning the number of successes in each run

# 5. Mixing numbers with a string in c() produces a single-type vector
Vector5 <- c(1:3, "apple")
```
6. What type of data is Vector2?
7. Round Vector2 to two decimal places
8. What happened in Vector5?
```{r}
# 6. Inspect the type of Vector2 in three ways:
# a yes/no test for character, its storage mode, and its class
is.character(Vector2)
mode(Vector2)
class(Vector2)
# 7. Round each element to two decimal places (result is printed, not stored)
round(Vector2, 2)
# 8. Check the class of Vector5, then print it to see how its elements are stored
class(Vector5)
Vector5
```
# Matrices Practice
1. Matrix1: Create 5 by 5 matrix containing all NAs
2. Assign Matrix1 the row names (a,b,c,d,e) and the column names (1,2,3,4,5)
3. Replace the NAs in the first column of Matrix1 with "Inf"
```{r}
# 1. A 5 x 5 matrix filled with NA
Matrix1 <- matrix(NA, nrow = 5, ncol = 5)
# 2. Row names a-e and column names 1-5, set in one step through dimnames
dimnames(Matrix1) <- list(letters[1:5], as.character(1:5))
# 3. Overwrite the first column (named "1") with Inf
Matrix1[, "1"] <- Inf
#Matrix1[1,3]<-"apple"
```
# List Practice
1. Create a list that contains Vector1, Vector2, Vector3, and Matrix1
2. Name each list component as Vector1, Vector2, Vector3, and Matrix1 respectively
3. Locate Vector2 from the list
```{r}
# 1. Build the list; at this point its components have no names
List1NoName <- list(Vector1, Vector2, Vector3, Matrix1)

# 2. Copy the unnamed list into List1 and name each component.
#    (List1 must exist before names(List1) <- ... can be used.)
List1 <- List1NoName
names(List1) <- c("Vector1", "Vector2", "Vector3", "Matrix1")

# 3. Extract Vector2 by position ...
List1[[2]]
# ... or by name
List1$Vector2
```
# Data Frames Practice 1
## 1. Load `Lab01Data.csv` in R
```{r}
# Load data
# Alternative kept for reference: read a local copy from the working directory
#Dta <- read.csv("Lab01Data.csv", header = TRUE, stringsAsFactors = FALSE)
# Read the CSV directly from the URL below into the data frame Dta
DataURL <- "http://staff.washington.edu/kamano/MLE/LabMaterials/Lab01/Lab01Data.csv"
Dta <- read.csv(DataURL)
```
## 2. What is the data structure? What does that tell us about type?
```{r}
# Check structure: dimensions (rows, columns), class, and two yes/no type tests
dim(Dta)
class(Dta)
is.data.frame(Dta)
is.matrix(Dta)
# Alternatively: str() lists each column with its type,
# and summary() gives per-column summary statistics
str(Dta)
summary(Dta)
```
## 3. Check the names and summary statistics of the data. Fix any names that are less than good.
```{r}
# Check and fix names
names(Dta)
# Rename the third column by position
names(Dta)[3] <- "GdpPerCap"
names(Dta) # Check again
# Tidy way: rename(new_name = old_name); the result is assigned back to Dta
Dta <-
Dta %>%
rename(Country = country,
Polity2 = polity2)
# Summary Statistics
summary(Dta)
```
## 4. Remove observations with missing values
```{r}
# Remove NAs: na.omit() drops every row that has a missing value in any column
DataClean <- na.omit(Dta) # listwise deletion!!
# Same operation written with the pipe; the result is only printed, not stored
Dta %>%
na.omit()
# Compare dimensions before and after to see how many rows were dropped
dim(Dta)
dim(DataClean)
```
## 5. Calculate the average GDP per capita for Brazil for the observed period. Repeat the calculation for all countries.
```{r}
# Base R: select the GdpPerCap values of the Brazil rows, then average them
mean(DataClean[DataClean$Country == "Brazil", "GdpPerCap"])

# Tidy way: keep the Brazil rows and summarise into a named column
DataClean %>%
  filter(Country == "Brazil") %>%
  summarize(MeanGdpPerCap = mean(GdpPerCap))

# Average GdpPerCap for all countries: one row per Country
DataClean %>%
  group_by(Country) %>%
  summarize(MeanGdpPerCap = mean(GdpPerCap))

# Several statistics at once. across() with a *named* list of functions
# produces self-describing columns (GdpPerCap_mean, GdpPerCap_median).
DataClean %>%
  group_by(Country) %>%
  summarize(across(GdpPerCap, list(mean = mean, median = median)))
```
## 6. Plot GDP per capita (on the x-axis) and Polity2 (on the y-axis)
```{r}
# Base Graphics
# Reset the graphics device only when working interactively and a device is
# actually open. An unconditional dev.off() raises an error when no device is
# open, and while knitting it would close the device that records the plots.
if (interactive() && dev.cur() > 1L) {
  dev.off()
}
plot(
  x = DataClean$GdpPerCap,
  y = DataClean$Polity2
)

# Try logging GDP
plot(
  x = log(DataClean$GdpPerCap),
  y = DataClean$Polity2,
  xlab = "Logged GDP per capita",
  ylab = "Polity2"
)

# ggplot2: same scatter plot of Polity2 against logged GDP per capita
ggplot(DataClean, aes(y = Polity2, x = log(GdpPerCap))) +
  geom_point() +
  labs(x = "Logged GDP per capita", y = "Polity2") +
  theme_classic()
```
## 7. Create a new variable called "democracy". Assign 0 to countries with negative value or zero polity2 score, and assign 1 to countries with positive score.
```{r, results='hide'}
# Create a variable called "democracy", initialised to NA for every row
DataClean$democracy <- NA
head(DataClean)
# You can subset data based on a logical statement
# The comparison returns one TRUE/FALSE per row ...
DataClean$Polity2 <= 0
# ... which can be used as a row index
DataClean[DataClean$Polity2 <= 0, ] # leaving the column index empty after the comma keeps all columns
# Take advantage of this: Assign values to "democracy" based on Polity2 values
DataClean$democracy[DataClean$Polity2 <= 0] <- 0
# Do the same for positive Polity2 score
DataClean$democracy[DataClean$Polity2 > 0] <- 1
# Tidy way (results are only printed here, not assigned back to DataClean)
# TRUE acts as the catch-all branch for every row not matched above it
DataClean %>%
mutate(democracy = case_when(Polity2 <= 0 ~ 0,
TRUE ~ 1)) # "Polity2 > 0 ~ 1" also works
# Same recoding with both conditions spelled out explicitly
DataClean %>%
mutate(democracy = case_when(Polity2 <= 0 ~ 0,
Polity2 > 0 ~ 1))
```
# Data Frames Practice 2
## 1. Read in the data `Lab01Survey.csv`
```{r}
# Clear and load data
# Removes every object created so far (Dta, DataClean, the vectors, ...)
rm(list = ls())
# Alternative kept for reference: read a local copy from the working directory
#SurveyData <- read.csv(file = "Lab01Survey.csv")
# Read the survey CSV directly from the URL below
DataURL2 <- "http://staff.washington.edu/kamano/MLE/LabMaterials/Lab01/Lab01Survey.csv"
SurveyData <- read.csv(DataURL2)
```
## 2. Inspect the data. What format are they in? What values do the data take, and how do those values correspond with the survey?
```{r}
# Show the structure of the survey data: each column with its type and first values
str(SurveyData)
```
## 3. Generate some summary statistics.
```{r}
# Per-column summary, then mean / median / s.d. of each variable in base R
summary(SurveyData)
mean(SurveyData$R)
mean(SurveyData$latex)
median(SurveyData$R)
median(SurveyData$latex)
sd(SurveyData$R)
sd(SurveyData$latex)

# Tidy way: apply every statistic to every column. The named list replaces the
# deprecated funs() and yields the same column names (R_mean, latex_sd, ...).
SurveyData %>%
  summarize(across(
    everything(),
    list(mean = mean, median = median, sd = sd, min = min, max = max)
  ))
# %>% pivot_longer(everything(), names_to = "stat")
```
## 4. How are these two variables related to each other (assuming equal intervals b/w categories)?
```{r}
# Correlation between the two survey variables (cor() defaults to Pearson);
# the value is reported inline in the sentence that follows this chunk
cor1 <- cor(SurveyData$R, SurveyData$latex)
```
The correlation b/w R knowledge and LaTeX knowledge is `r cor1`, or more nicely, `r round(cor1, 2)`.
## 5. Are there any problems with the way the data are coded? (Think about lecture yesterday.)
## 6. Recode the data
0:What's that?
1:I've heard of it
2:I can use it or apply it
3:I understand it well
```{r}
# First attempt: recode each variable separately into new label columns.
# Every code is matched explicitly, so a missing or unexpected value stays NA
# instead of being silently labelled "I understand it well".
SurveyData %>%
  mutate(
    # Recode R into categories
    RCat = case_when(
      R == 0 ~ "What's that?",
      R == 1 ~ "I've heard of it",
      R == 2 ~ "I can use it or apply it",
      R == 3 ~ "I understand it well"
    ),
    # Recode latex into categories
    latexCat = case_when(
      latex == 0 ~ "What's that?",
      latex == 1 ~ "I've heard of it",
      latex == 2 ~ "I can use it or apply it",
      latex == 3 ~ "I understand it well"
    )
  )

# We're repeating ourselves... Must be a faster way:
# apply one recoding function to both columns, replacing them in place
SurveyDataRecode <-
  SurveyData %>%
  mutate(across(
    c(R, latex),
    function(x) {
      case_when(
        x == 0 ~ "What's that?",
        x == 1 ~ "I've heard of it",
        x == 2 ~ "I can use it or apply it",
        x == 3 ~ "I understand it well"
      )
    }
  ))
```
## 7. Why is this coding method better?
## 8. Generate some plots of the data: bar charts are good here, scatterplots even better.
```{r, echo= FALSE}
# Bar charts: number of respondents in each knowledge category
ggplot(SurveyDataRecode, aes(x = R)) +
  geom_bar() +
  labs(x = "R knowledge")

SurveyDataRecode %>%
  ggplot(aes(x = latex)) +
  geom_bar() +
  labs(x = "LaTeX knowledge")

# Scatter plot; points are jittered so identical answers do not overlap
SurveyDataRecode %>%
  ggplot(aes(x = R, y = latex)) +
  geom_jitter(alpha = .7, height = .2, width = .2) +
  labs(x = "R knowledge", y = "LaTeX knowledge") +
  theme_classic()

##### Something is wrong? #####
# The recoded columns are character, so the axes are ordered alphabetically.
# Convert two variables into factors whose levels follow the survey order.
KnowledgeLevels <- c(
  "What's that?",
  "I've heard of it",
  "I can use it or apply it",
  "I understand it well"
)

SurveyDataPlot <-
  SurveyDataRecode %>%
  mutate(
    R = factor(R, levels = KnowledgeLevels),
    latex = factor(latex, levels = KnowledgeLevels)
  )

# Redo the scatter plot. Limits are fixed on both axes so that each shows all
# four categories in the same order, even if a category has no responses.
SurveyDataPlot %>%
  ggplot(aes(x = R, y = latex)) +
  geom_jitter(alpha = .7, height = .2, width = .2) +
  labs(x = "R knowledge", y = "LaTeX knowledge") +
  scale_x_discrete(limits = KnowledgeLevels) +
  scale_y_discrete(limits = KnowledgeLevels) +
  theme_classic()
```
# LaTeX in R Markdown
$$
1 + 1 = 2
$$
$$
11 \times 11 = 121
$$
$$
E = mc^2
$$
I think it's Einstein who proposed $E = mc^2$.
$$
x = \frac{-b \pm \sqrt{b^2 - 4ac}}{2a}
$$
$$
\begin{split}
X & = (x+a)(x-b) \\
& = x(x-b) + a(x-b) \\
& = x^2 + x(a-b) - ab
\end{split}
$$
# Install guide for Chris's packages and TinyTeX
```{r}
# Install Chris's packages from their source tarballs, but only when they are
# not installed yet, so re-knitting does not download and rebuild them again.

# Install tile
if (!requireNamespace("tile", quietly = TRUE)) {
  install.packages(
    "https://faculty.washington.edu/cadolph/software/tile_0.4.15.tar.gz",
    repos = NULL, type = "source"
  )
}

# Install simcf
if (!requireNamespace("simcf", quietly = TRUE)) {
  install.packages(
    "https://faculty.washington.edu/cadolph/software/simcf_0.2.18.tar.gz",
    repos = NULL, type = "source"
  )
}
```
# Install tinytex
## Caution!!!! It takes sooooooooooooo long!! ##
```{r}
# Install the tinytex R package only if it is missing
if (!requireNamespace("tinytex", quietly = TRUE)) {
  install.packages("tinytex")
}

# Download the TinyTeX LaTeX distribution (the slow step) only when the
# LaTeX distribution currently in use is not already TinyTeX
if (!tinytex::is_tinytex()) {
  tinytex::install_tinytex()
}
```