This tutorial requires the following R packages; install and load them if you have not yet:
if (!require("pacman")) {install.packages("pacman"); library(pacman)}
## Loading required package: pacman
pacman::p_load("moments", "tidyverse", "knitr", "plotly", "scales")
Question: What is the type for each of the variables in the TOD dataset?
californiatod <- read_csv("californiatod.csv")
## Parsed with column specification:
## cols(
## name = col_character(),
## region = col_character(),
## transit = col_double(),
## density = col_double(),
## houseval = col_double(),
## railtype = col_character()
## )
kable(californiatod)
name | region | transit | density | houseval | railtype |
---|---|---|---|---|---|
Archstone Barrington Hills Apartments | Bay Area | 0.4000 | 3.36 | 192593.26 | Heavy or commuter rail |
Archstone Mission Va | SD | 0.1077 | 2.18 | 203054.65 | Light rail |
Atherton Place | Bay Area | 0.5000 | 5.21 | 217671.52 | Heavy or commuter rail |
Bellamar Apartments | LA | 0.3178 | 10.05 | 117255.94 | Light rail |
Coggins Square | Bay Area | 0.5278 | 4.68 | 270554.72 | Heavy or commuter rail |
Del Norte Place | Bay Area | 0.6429 | 4.91 | 229335.59 | Heavy or commuter rail |
Iron Horse Lofts | Bay Area | 0.6061 | 4.68 | 270207.40 | Heavy or commuter rail |
Mercado Apartments | SD | 0.0000 | 3.21 | 121785.96 | Light rail |
Mission Wells | Bay Area | 0.1475 | 3.96 | 323496.98 | Heavy or commuter rail |
Northpark Apartments | Bay Area | 0.1738 | 3.27 | 667661.04 | Heavy or commuter rail |
Ohlone-Chynoweith | Bay Area | 0.0000 | 4.00 | 357694.13 | Light rail |
Pacific Court | LA | 0.0000 | 10.33 | 118401.77 | Light rail |
Palo Alto Condos | Bay Area | 0.0667 | 4.78 | 779791.98 | Heavy or commuter rail |
Park Regency | Bay Area | 0.3178 | 4.71 | 271728.59 | Heavy or commuter rail |
Poinsettia Station | SD | 0.1538 | 1.72 | 273004.39 | Heavy or commuter rail |
Studio Village | LA | 0.1875 | 8.70 | 246330.40 | Heavy or commuter rail |
The Crossings | Bay Area | 0.1316 | 6.49 | 558643.30 | Heavy or commuter rail |
Union Square Condos | SD | 0.1892 | 3.87 | 192739.76 | Light rail |
Verandas Apartments | Bay Area | 0.4912 | 3.15 | 327738.97 | Heavy or commuter rail |
Villages of LaMesa | SD | 0.1111 | 3.16 | 207159.11 | Light rail |
Wayside Plaza | Bay Area | 0.4737 | 4.67 | 270087.94 | Heavy or commuter rail |
Western Carlton | LA | 0.2100 | 14.85 | 371469.76 | Heavy or commuter rail |
Wilcox Apartments | LA | 0.3333 | 14.76 | 288948.71 | Heavy or commuter rail |
Wilshire Promenade | LA | 0.1053 | 3.84 | 216780.07 | Heavy or commuter rail |
Windsor Ridge | Sacramento | 0.2222 | 2.36 | 152840.33 | Light rail |
Woodlake Close | Sacramento | 0.1579 | 2.46 | 89045.74 | Light rail |
Descriptive Stats for a single variable
Common quantitative descriptive measures include mean, standard deviation (variance), median, IQR Skewness and Kurtosis are measures comparing the distribution of the variable of interest to a normal distribution.
mean(californiatod$density)
## [1] 5.36
sd(californiatod$density)
## [1] 3.531352
#range
max(californiatod$density) - min(californiatod$density)
## [1] 13.13
# robust measures
median(californiatod$density)
## [1] 4.335
IQR(californiatod$density)
## [1] 1.91
# Or use package like skimr
library(skimr)
##
## Attaching package: 'skimr'
## The following object is masked from 'package:knitr':
##
## kable
skim(californiatod) %>% print()
## Skim summary statistics
## n obs: 26
## n variables: 6
##
## Variable type: character
## variable missing complete n min max empty n_unique
## name 0 26 26 12 37 0 26
## railtype 0 26 26 10 22 0 2
## region 0 26 26 2 10 0 4
##
## Variable type: numeric
## variable missing complete n mean sd p0 p25
## density 0 26 26 5.36 3.53 1.72 3.23
## houseval 0 26 26 282154.69 163092.19 89045.74 2e+05
## transit 0 26 26 0.25 0.19 0 0.12
## p50 p75 p100 hist
## 4.34 5.13 14.85 ▇▇▂▁▁▂▁▂
## 258209.17 314859.91 779791.98 ▅▇▇▂▁▁▁▁
## 0.19 0.38 0.64 ▅▇▆▂▂▁▃▂
# skewness & kurtosis (from the moments package)
skewness(californiatod$density)
## [1] 1.626587
kurtosis(californiatod$density)
## [1] 4.732171
Histogram is a graphical representation of the distribution of Numeric variables.
require(ggplot2)
qplot(density, data=californiatod, geom="histogram")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# The default bin width is set to range/30 in qplot and it doesn't look good. Let's set it to 2
# You can experiment it with different values
qplot(density, data=californiatod, geom="histogram", binwidth=2)
# show percentage
hist.percent <- ggplot(californiatod, aes(x=density)) +
geom_histogram(aes(y=(..count..)/sum(..count..)), binwidth = 2) +
scale_y_continuous(labels=percent) + labs(y="")
hist.percent
# with normal curve superimposed
hist.percent + stat_function(fun=dnorm, args=list(mean=mean(californiatod$density), sd=sd(californiatod$density)))
qplot('all', y=density, data=californiatod, geom="boxplot")
table(californiatod$region) # counts
##
## Bay Area LA Sacramento SD
## 13 6 2 5
options(digits=2) # only show 2 decimal places
prop.table(table(californiatod$region)) #share
##
## Bay Area LA Sacramento SD
## 0.500 0.231 0.077 0.192
qplot(region, data=californiatod, geom="bar") # counts
# in percentage scale
ggplot(californiatod, aes(x=region)) +
geom_bar(aes(y=(..count..)/sum(..count..))) +
scale_y_continuous(labels=percent) + labs(y="")
Quantitative measures for the relationship between a pair of variables will be covered later. We will focus on visualization here.
qplot(x=density, y=houseval, data=californiatod, geom="point")
qplot(x=transit, y=houseval, data=californiatod, geom="point")
qplot(density, houseval, data=californiatod, geom="point") + geom_smooth(method=lm, se=FALSE)
qplot(transit, houseval, data=californiatod, geom="point") + geom_smooth(method=lm, se=FALSE)
plot.int <- ggplot(californiatod, aes(transit, houseval, text=paste("name:", name))) + geom_point()
ggplotly(plot.int)
qplot(railtype, houseval, data=californiatod, geom="boxplot")
qplot(region, houseval, data=californiatod, geom="boxplot")
# Bar plot, but with less information (only mean)
ggplot(californiatod, aes(railtype, houseval)) + geom_bar(stat = "identity")
ggplot(californiatod, aes(region, houseval)) + geom_bar(stat="identity")
# We can try scatter plot even though not ideal
qplot(railtype, houseval, data=californiatod, geom="point")
qplot(region, houseval, data=californiatod, geom="point")
qplot(transit, railtype, data=californiatod, geom="point")
qplot(density, railtype, data=californiatod, geom="point")
qplot(region, fill=railtype, data=californiatod, geom="bar")
# show percentage
ggplot(californiatod, aes(x=region, fill=railtype)) +
geom_bar(aes(y=(..count..)/sum(..count..))) +
scale_y_continuous(labels=percent) + labs(y="")
# compare with scatter plots, not ideal for working with categorical variables, even when the data points are jittered
qplot(region, railtype, data=californiatod, geom="point")
qplot(region, railtype, data=californiatod, geom="point") + geom_jitter()
table(californiatod$railtype, californiatod$region) # counts
##
## Bay Area LA Sacramento SD
## Heavy or commuter rail 12 4 0 1
## Light rail 1 2 2 4
prop.table(table(californiatod$railtype, californiatod$region), 2) * 100 #percentage
##
## Bay Area LA Sacramento SD
## Heavy or commuter rail 92.3 66.7 0.0 20.0
## Light rail 7.7 33.3 100.0 80.0
Now we will do something a bit more complicated. We will create a new two-way crosstab like you did above but now let’s include the mean of a continuous third variable. This sounds complex, but we shall approach it in a straightforward way using the dplyr package.
#the dplyr way
#we will first calculate our cross tabs and then the mean of transit usage
californiatod %>%
group_by(region, railtype) %>%
summarise(n = n(),
AvgTransit = mean(transit))
## # A tibble: 7 x 4
## # Groups: region [?]
## region railtype n AvgTransit
## <chr> <chr> <int> <dbl>
## 1 Bay Area Heavy or commuter rail 12 0.373
## 2 Bay Area Light rail 1 0.
## 3 LA Heavy or commuter rail 4 0.209
## 4 LA Light rail 2 0.159
## 5 Sacramento Light rail 2 0.190
## 6 SD Heavy or commuter rail 1 0.154
## 7 SD Light rail 4 0.102
The dplyr package is designed to perform table operations on dataframes for multiple kinds of data manipulation. The preceding syntax told R to take our californiatod dataframe, group by the variables for region and railtype, and the calculate the mean transit usage from there. This is similar to the kind of pivot table operations once can do in excel.