1 Statistic Cheat Sheet

Statistic Cheat Sheet by Jennifer Morgen

2 Data Preparation

Data File

Apply the recodes we did in lab 7 to the dataset using the syntax below.

#remember to set your working director
eb_pums <- readRDS("eb_pums_2000.rds")
library(dplyr)

eb_pums <- eb_pums %>% select(year, metaread, puma, 
                        ownershp, builtyr, hhincome, 
                        perwt, age, sex, race, hispan, 
                        tranwork, carpool, trantime)



#Note: People of any race can also be Hispanic (Hispanic is not a race category), 
#so when you recode Hispanics as ???4???, they will no longer be coded in the ???White,??? ???Black,??? or ???Asian??? categories. 
#Rename a variable: metaread to detailed.meta.area
names(eb_pums)[2] <- 'detailed.meta.area'

eb_pums <- eb_pums %>% mutate(
  race=as.character(race), 
  hispan=as.character(hispan),
  racehisp=case_when(
                     race=="Chinese" ~ "Asian", 
                     race=="Japanese" ~ "Asian", 
                     race=="Other Asian or Pacific" ~ "Asian",
                     race=="White" ~ "White", 
                     race=="Black" ~ "Black", 
                     race=="American Indian or Alaskan" ~ "American Indian or Alaskan", 
                     hispan != "Not Hispanic" ~ "Hispanic",
                     TRUE~"Other"))

table(eb_pums$racehisp)
## 
## American Indian or Alaskan                      Asian 
##                        771                      20391 
##                      Black                   Hispanic 
##                      13978                      12163 
##                      Other                      White 
##                       5032                      65109
# recode modes (https://usa.ipums.org/usa-action/variables/TRANWORK#codes_section)
eb_pums <- eb_pums %>%
  mutate(
    tranwork = as.integer(tranwork),
    mode = case_when(
      tranwork %in% c(10, 11, 12, 13, 14, 15, 20) ~ "driving",
      tranwork %in% c(30, 31, 32, 33, 34, 35, 36) ~ "transit",
      tranwork %in% c(40, 50) ~ "bike/walk",
      tranwork %in% c(60, 70) ~ "other",
      TRUE ~ as.character(NA)
    )
  )

table(eb_pums$mode)
## 
## bike/walk   driving     other   transit 
##      1993     44456      2647      5512
# recode modes (https://usa.ipums.org/usa-action/variables/TRANWORK#codes_section)
eb_pums <- eb_pums %>%
  mutate(
    builtyr2 = case_when(
      as.integer(builtyr) %in% c(2, 3) ~ "0-10",
      as.integer(builtyr) == 4 ~ "11-20",
      as.integer(builtyr) == 5 ~ "21-30",
      as.integer(builtyr) == 6 ~ "31-40",
      as.integer(builtyr) == 7 ~ "41-50",
      as.integer(builtyr) == 8 ~ "51-60",
      as.integer(builtyr) == 9 ~ "61+"
    )
  )

table(eb_pums$builtyr2)
## 
##  0-10 11-20 21-30 31-40 41-50 51-60   61+ 
## 11972 16540 20612 19808 18865 10439 15355
## Recode income and tenure

eb_pums <- eb_pums %>%
  mutate(
    increc = ifelse(eb_pums$hhincome <=0 | eb_pums$hhincome >= 999999, NA, hhincome),
    tenure = recode(ownershp, `1`="Owned", `2`="Rented", .default=as.character(NA))
  )


table(eb_pums$tenure)
## 
##  Owned Rented 
##  73871  41432
# Save your recoded data frame 
saveRDS(eb_pums, file="eb_pums_2000_recoded.rds")
# keep 2 decimal places in output
options(digits=2)

#Cross-tabulate means of travel to work by race/ethnicity
xtabs(~mode+racehisp, data=eb_pums, subset=!is.na(mode), drop.unused.levels=T)
##            racehisp
## mode        American Indian or Alaskan Asian Black Hispanic Other White
##   bike/walk                         17   333   142      212    91  1198
##   driving                          279  7686  4227     3812  1593 26859
##   other                             16   341   198      145   100  1847
##   transit                           33  1019   935      468   222  2835
#assign cross-tab to a variable
xtabs.TR <- xtabs(~mode+racehisp, data=eb_pums, subset=!is.na(mode), drop.unused.levels=T)

xtabs.TR
##            racehisp
## mode        American Indian or Alaskan Asian Black Hispanic Other White
##   bike/walk                         17   333   142      212    91  1198
##   driving                          279  7686  4227     3812  1593 26859
##   other                             16   341   198      145   100  1847
##   transit                           33  1019   935      468   222  2835
#Get column %
prop.table(xtabs(~mode+racehisp,data=eb_pums,subset=!is.na(mode), drop.unused.levels=T), 2)
##            racehisp
## mode        American Indian or Alaskan Asian Black Hispanic Other White
##   bike/walk                      0.049 0.036 0.026    0.046 0.045 0.037
##   driving                        0.809 0.819 0.768    0.822 0.794 0.820
##   other                          0.046 0.036 0.036    0.031 0.050 0.056
##   transit                        0.096 0.109 0.170    0.101 0.111 0.087
#OR 
prop.table(xtabs.TR, 2)
##            racehisp
## mode        American Indian or Alaskan Asian Black Hispanic Other White
##   bike/walk                      0.049 0.036 0.026    0.046 0.045 0.037
##   driving                        0.809 0.819 0.768    0.822 0.794 0.820
##   other                          0.046 0.036 0.036    0.031 0.050 0.056
##   transit                        0.096 0.109 0.170    0.101 0.111 0.087
#results of chi-square test of independence
summary(xtabs.TR)
## Call: xtabs(formula = ~mode + racehisp, data = eb_pums, subset = !is.na(mode), 
##     drop.unused.levels = T)
## Number of cases in table: 54608 
## Number of factors: 2 
## Test for independence of all factors:
##  Chisq = 503, df = 15, p-value = 2e-97
#alternatively

3 Boxplot by group

We will create a boxplot of trantime by racehisp using the ggplot2 package.

require(ggplot2)

ggplot(eb_pums, aes(x = mode, y = trantime)) + geom_boxplot()

4 ANOVA

Now we will conduct an ANOVA test using the PUMS dataset in order to determine whether commuting travel time (trantime) varies significantly by race/Hispanic origin- i.e. we will test the likielihood that the sample data come from a population in which personal income does not vary between racial/ethnic categories.

4.1 Conduct ANOVA test

Recall that for ANOVA to work, we need the groups to be of equal size and equal variance. We can check the group size with a bar chart and the variances by group.

#for our frequency bar chart
ggplot(eb_pums, aes(x = racehisp)) + geom_bar()

#for our variance by group
sdbygroup <- eb_pums %>% group_by(racehisp) %>%
  summarize(std_dev = sd(trantime))

sdbygroup
## # A tibble: 6 x 2
##   racehisp                   std_dev
##   <chr>                        <dbl>
## 1 American Indian or Alaskan    21.9
## 2 Asian                         21.4
## 3 Black                         21.4
## 4 Hispanic                      20.5
## 5 Other                         21.0
## 6 White                         21.7

Since non-Hispanic Whites outnumber many of the other racial/ethnic groups we will limit the test to only three groups: Black non-Hispanic, Asian non-Hispanic, and Hispanic. Let’s create a subset for the three groups.

# dplyr approach
eb_pums.sub <- eb_pums %>% 
  filter(racehisp %in% c("Hispanic", "Black", "Asian"))

#check
table(eb_pums.sub$racehisp)
## 
##    Asian    Black Hispanic 
##    20391    13978    12163
# base r approach
#eb_pums.sub <-eb_pums[eb_pums$racehisp %in% c("Hispanic", "Black non-Hisp", "Asian non-Hisp"),]

# Recreate the bar chart
ggplot(eb_pums.sub, aes(racehisp)) + geom_bar()

Now that we have similar sample sizes we have met the assumptions for conducting an ANOVA.

What are the other critical assumptions of ANOVA again?

As with all things R there are many ways of conducting an ANOVA. We will use the aov function as one approach. The formula for ANOVA follows the general form of:

aov(dependent variable ~ grouping variable, data = df)

The dependent variable is on the left side of ~ while the independent variable (the variable we are looking to test whether or not there is a difference between its groups) is on the other side.

mod1 <- aov(trantime ~ racehisp, data = eb_pums.sub)

summary(mod1)
##                Df   Sum Sq Mean Sq F value Pr(>F)    
## racehisp        2    62878   31439      70 <2e-16 ***
## Residuals   46529 20903233     449                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Examine the outputs in the ANOVA table, and determine whether to reject the null hypothesis. Zero in on the F-statistics, and its probability value (and significance codes); this indicates the probability that the null hypothesis can be rejected.

4.2 Pairwise comparison of means

For the pairwise comparison of means we will use the pairwise.t.test() function. Are the differences from other groups significant? What is the adjustment using the Bonferroni correction for multiple comparisons. The general form of the pairwise test function is this:

pairwise.t.test(dependent var, independent var, p.adj = c(“none”, “bonferroni”, “holm”))

pairwise.t.test(eb_pums.sub$trantime , eb_pums.sub$racehisp, p.adj = "bonferroni")
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  eb_pums.sub$trantime and eb_pums.sub$racehisp 
## 
##          Asian  Black
## Black    2e-14  -    
## Hispanic <2e-16 0.002
## 
## P value adjustment method: bonferroni

Quick question: Conduct an ANOVA test in order to determine whether personal income (increc) varies significantly by race/Hispanic origin.

4.3 Test of Equal Variances

We can use hypothesis testing to test whether groups have equal variance. One such test is Levene’s Equality of Variance Test. Levene’s test estimates a hypothesis test on whether the variances of your groups are equal. Thus the the null hypothesis is that the variances are in fact equal. What does this mean for our alternative hypothesis?

#install car package if you don't have it
# install.packages("car")
library(car)
#running Leven's Equality of Variance
leveneTest(trantime ~ racehisp, data = eb_pums.sub,
  center = "mean")
## Levene's Test for Homogeneity of Variance (center = "mean")
##          Df F value Pr(>F)    
## group     2    58.8 <2e-16 ***
##       46529                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1