if(!file.exists("births.csv")){
download.file("https://raw.githubusercontent.com/michaelweylandt/STA9750/main/births.csv",
destfile="births.csv")
}library(readr)
library(dplyr)
<- read_csv("births.csv")
births glimpse(births)
STA 9750 Week 5 In-Class Activity: Let us JOIN
Our Tables Together
Welcome!
Review Practice
The file “births.csv” in the course repository contains daily US birth counts for 20 years from 1969 to 1988, obtained from the US Social Security Administration. Download this file and read it into R
and answer the following review questions with your group.
The following code may be useful:
The columns here are:
id
: The day in the entire time series (going up to \(\approx 365 * 20\) plus a few for leap day)day_of_year
: the day in the year (1 to 365/366)day_of_week
: the day of the week, coded as an integerday
,month
,year
: the parts of the date as we normally think of thembirths
: the number of births that day.
- How many children were born on January 1st, 1984?
|> filter(day==1, month==1, year==1984) births
- How many total children were born in 1984?
|> filter(year==1984) |> summarize(sum(births)) births
- How many children were born each year? (Print a 20 row table)
|> group_by(year) |> summarize(n_births = sum(births)) births
- How many more children were born each year than the preceeding? (The
lag
function will be useful here!)
|>
births group_by(year) |>
summarize(n_births = sum(births)) |>
mutate(increase_births = n_births - lag(n_births))
- On average, in what month are the most children born?
|>
births group_by(month) |>
summarize(avg_births = mean(births)) |>
slice_max(avg_births)
After completing these, work with your group to formulate and answer three more advanced questions with your group.
Multi Table Operations
This week, we are going to dive into the most useful “multi-table” dplyr
operations, the join
family. We will focus on the “big three” joins:
inner_join
full_join
left_join
These inspired by the SQL
joins, INNER JOIN
, FULL [OUTER] JOIN
, and LEFT JOIN
.1 We will apply them to the various tables in the nycflights13
data set. Recall the structure of each of these tables:
library(nycflights13)
glimpse(flights)
Rows: 336,776
Columns: 19
$ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
$ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
$ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
$ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
$ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
$ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
$ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
$ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
$ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
$ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
$ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
$ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
$ air_time <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
$ distance <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
$ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
$ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
$ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…
glimpse(airlines)
Rows: 16
Columns: 2
$ carrier <chr> "9E", "AA", "AS", "B6", "DL", "EV", "F9", "FL", "HA", "MQ", "O…
$ name <chr> "Endeavor Air Inc.", "American Airlines Inc.", "Alaska Airline…
glimpse(airports)
Rows: 1,458
Columns: 8
$ faa <chr> "04G", "06A", "06C", "06N", "09J", "0A9", "0G6", "0G7", "0P2", "…
$ name <chr> "Lansdowne Airport", "Moton Field Municipal Airport", "Schaumbur…
$ lat <dbl> 41.13047, 32.46057, 41.98934, 41.43191, 31.07447, 36.37122, 41.4…
$ lon <dbl> -80.61958, -85.68003, -88.10124, -74.39156, -81.42778, -82.17342…
$ alt <dbl> 1044, 264, 801, 523, 11, 1593, 730, 492, 1000, 108, 409, 875, 10…
$ tz <dbl> -5, -6, -6, -5, -5, -5, -5, -5, -5, -8, -5, -6, -5, -5, -5, -5, …
$ dst <chr> "A", "A", "A", "A", "A", "A", "A", "A", "U", "A", "A", "U", "A",…
$ tzone <chr> "America/New_York", "America/Chicago", "America/Chicago", "Ameri…
glimpse(planes)
Rows: 3,322
Columns: 9
$ tailnum <chr> "N10156", "N102UW", "N103US", "N104UW", "N10575", "N105UW…
$ year <int> 2004, 1998, 1999, 1999, 2002, 1999, 1999, 1999, 1999, 199…
$ type <chr> "Fixed wing multi engine", "Fixed wing multi engine", "Fi…
$ manufacturer <chr> "EMBRAER", "AIRBUS INDUSTRIE", "AIRBUS INDUSTRIE", "AIRBU…
$ model <chr> "EMB-145XR", "A320-214", "A320-214", "A320-214", "EMB-145…
$ engines <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
$ seats <int> 55, 182, 182, 182, 55, 182, 182, 182, 182, 182, 55, 55, 5…
$ speed <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ engine <chr> "Turbo-fan", "Turbo-fan", "Turbo-fan", "Turbo-fan", "Turb…
glimpse(weather)
Rows: 26,115
Columns: 15
$ origin <chr> "EWR", "EWR", "EWR", "EWR", "EWR", "EWR", "EWR", "EWR", "EW…
$ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013,…
$ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ hour <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, …
$ temp <dbl> 39.02, 39.02, 39.02, 39.92, 39.02, 37.94, 39.02, 39.92, 39.…
$ dewp <dbl> 26.06, 26.96, 28.04, 28.04, 28.04, 28.04, 28.04, 28.04, 28.…
$ humid <dbl> 59.37, 61.63, 64.43, 62.21, 64.43, 67.21, 64.43, 62.21, 62.…
$ wind_dir <dbl> 270, 250, 240, 250, 260, 240, 240, 250, 260, 260, 260, 330,…
$ wind_speed <dbl> 10.35702, 8.05546, 11.50780, 12.65858, 12.65858, 11.50780, …
$ wind_gust <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 20.…
$ precip <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ pressure <dbl> 1012.0, 1012.3, 1012.5, 1012.2, 1011.9, 1012.4, 1012.2, 101…
$ visib <dbl> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,…
$ time_hour <dttm> 2013-01-01 01:00:00, 2013-01-01 02:00:00, 2013-01-01 03:00…
From here, we can see that there are many relationships between these tables. For example, the origin
and dest
columns of the flights
table, representing the origin and destination airport respectively, both correspond to the FAA identifiers used by the faa
column of the airports
table. These “commonalities” form the basis of join specifications.
Join Specifications
dplyr
specifies joins using the join_by
function. The output of the join_by
function, also known as a “join specification” is a series of logical tests applied to pairs of rows. The results of these logical tests are used to identify “matches” between rows. Joins differ primarily on how they use the outputs of these logical tests to construct their output.
The simplest and most useful logical test to use in a join is an equality test. In dplyr
, these are simply written as
join_by(left_name == right_name)
This type of test checks whether the value in the left_name
column of the first (left) argument matches the value in the right_name
column of the second (right) argument.
For example, if I wanted to join the origin
column of flights
table to the faa
column of the airports
table, I might use something like the following:
inner_join(flights, airports, join_by(origin == faa))
# A tibble: 336,776 × 26
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2013 1 1 517 515 2 830 819
2 2013 1 1 533 529 4 850 830
3 2013 1 1 542 540 2 923 850
4 2013 1 1 544 545 -1 1004 1022
5 2013 1 1 554 600 -6 812 837
6 2013 1 1 554 558 -4 740 728
7 2013 1 1 555 600 -5 913 854
8 2013 1 1 557 600 -3 709 723
9 2013 1 1 557 600 -3 838 846
10 2013 1 1 558 600 -2 753 745
# ℹ 336,766 more rows
# ℹ 18 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
# tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
# hour <dbl>, minute <dbl>, time_hour <dttm>, name <chr>, lat <dbl>,
# lon <dbl>, alt <dbl>, tz <dbl>, dst <chr>, tzone <chr>
Here origin
is taken to be a column from the first (left) table and faa
is taken to be a column from the second (right) table. As with other dplyr
functions, there is a bit of programming magic used to allow column names to be used as variables and interpreted correctly.
For the airport identifiers, we only need to match on the single unique ID. (We can assume the FAA assigns unique IDs to each airport.) In other circumstances, we need to combine several logical tests to get a true match.
For example, suppose we want to align our flights with the weather at their origin airport at scheduled take off time. Here, we’d need to combine the flights
and weather
table on many columns:
origin
toorigin
year
toyear
month
tomonth
day
today
hour
tohour
In this case, we’d pass 5 equality conditions to join_by
:
inner_join(flights,
weather, join_by(origin == origin,
== year,
year == month,
month == day,
day == hour)) hour
# A tibble: 335,220 × 29
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2013 1 1 517 515 2 830 819
2 2013 1 1 533 529 4 850 830
3 2013 1 1 542 540 2 923 850
4 2013 1 1 544 545 -1 1004 1022
5 2013 1 1 554 600 -6 812 837
6 2013 1 1 554 558 -4 740 728
7 2013 1 1 555 600 -5 913 854
8 2013 1 1 557 600 -3 709 723
9 2013 1 1 557 600 -3 838 846
10 2013 1 1 558 600 -2 753 745
# ℹ 335,210 more rows
# ℹ 21 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
# tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
# hour <dbl>, minute <dbl>, time_hour.x <dttm>, temp <dbl>, dewp <dbl>,
# humid <dbl>, wind_dir <dbl>, wind_speed <dbl>, wind_gust <dbl>,
# precip <dbl>, pressure <dbl>, visib <dbl>, time_hour.y <dttm>
Here we look only at those rows which match on all 5 tests. In this way, join_by
behaves like filter
: it “passes” the intersection of positive results.
Note that it is relatively common for matched columns to have the same name in both tables: to support this case, dplyr
reads a single column name as “self-equality”. So the above code can be more concisely written as:
inner_join(flights,
weather, join_by(origin,
year,
month,
day, hour))
# A tibble: 335,220 × 29
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2013 1 1 517 515 2 830 819
2 2013 1 1 533 529 4 850 830
3 2013 1 1 542 540 2 923 850
4 2013 1 1 544 545 -1 1004 1022
5 2013 1 1 554 600 -6 812 837
6 2013 1 1 554 558 -4 740 728
7 2013 1 1 555 600 -5 913 854
8 2013 1 1 557 600 -3 709 723
9 2013 1 1 557 600 -3 838 846
10 2013 1 1 558 600 -2 753 745
# ℹ 335,210 more rows
# ℹ 21 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
# tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
# hour <dbl>, minute <dbl>, time_hour.x <dttm>, temp <dbl>, dewp <dbl>,
# humid <dbl>, wind_dir <dbl>, wind_speed <dbl>, wind_gust <dbl>,
# precip <dbl>, pressure <dbl>, visib <dbl>, time_hour.y <dttm>
I recommend against using this short-cut. It takes hardly more time to write your intent explicitly and it’s far more robust. Measure twice, cut once.
Unfortunately, it is not easy to perform an “OR” in join_by
. We may cover this below, time allowing.
We now turn to specific joins. All of these joins use the join_by
operator but they construct results differently based on its output.
Inner Joins
The most common and most important join in data analysis is the inner_join
. The inner join returns matches between two tables. Conceptually, the inner join constructs all possible pairs of rows between the two tables (so {r eval=FALSE} NROW(x) * NROW(y)
total rows) and then filters down to those which pass the join_by
test. In practice, more efficient algorithms are used to prevent wasteful computation.
Inner joins are used when seeking matches between two tables. They are particularly useful when both tables are “comprehensive” and we are sure that there are matches. For instance, we can use an inner_join
to combine most of the tables in nycflights13
because they come from a comprehensive government data source. (E.g., No flights going to secret “unauthorized” airports.)
Let’s start by asking what the average arrival delay of flights going to west coast airports is. We do not have enough information to answer this using the flights
table alone. To identify west coast airports, let’s filter airports
on tzone
:
<- airports |> filter(tzone == "America/Los_Angeles") west_coast_airports
We can now join this to the original flights table to find only those flights with destination matches in west_coast_airports
:
inner_join(flights, west_coast_airports, join_by(dest == faa))
# A tibble: 46,324 × 26
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2013 1 1 558 600 -2 924 917
2 2013 1 1 558 600 -2 923 937
3 2013 1 1 559 600 -1 854 902
4 2013 1 1 611 600 11 945 931
5 2013 1 1 628 630 -2 1016 947
6 2013 1 1 646 645 1 1023 1030
7 2013 1 1 651 655 -4 936 942
8 2013 1 1 655 700 -5 1037 1045
9 2013 1 1 658 700 -2 1027 1025
10 2013 1 1 702 700 2 1058 1014
# ℹ 46,314 more rows
# ℹ 18 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
# tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
# hour <dbl>, minute <dbl>, time_hour <dttm>, name <chr>, lat <dbl>,
# lon <dbl>, alt <dbl>, tz <dbl>, dst <chr>, tzone <chr>
Here, we have only a subset of our original flights
table. From this, we can compute our relevant summary statistic:
inner_join(flights, west_coast_airports, join_by(dest == faa)) |>
summarize(mean(arr_delay, na.rm=TRUE))
# A tibble: 1 × 1
`mean(arr_delay, na.rm = TRUE)`
<dbl>
1 1.28
Is this any better than the following alternative approach:
inner_join(flights, airports, join_by(dest == faa)) |>
filter(tzone == "America/Los_Angeles") |>
drop_na() |>
summarize(mean(arr_delay, na.rm=TRUE))
# A tibble: 1 × 1
`mean(arr_delay, na.rm = TRUE)`
<dbl>
1 1.28
Formally, these are basically equivalent. (filter
and inner_join
commute). As usual, it’s a matter of communicating intent. Here the single line filter(tzone == "America/Los_Angeles")
is simple enough it probably doesn’t need a separate variable. But if, instead of a one line operation, we performed a very complex set of filtering options, we may benefit from giving it a separate name as opposed to trying to shoe-horn the complex filtering into a pipeline.
Performance-wise, it is a bit better to perform filter
before inner_join
(Why? Think about the size of the result of each step.) but the difference is rarely material. Clarity of intent, not optimizing performance, should dictate the order in which you perform steps.
Both approaches are also equivalent to:
inner_join(flights,
|> filter(tzone == "America/Los_Angeles"),
airports join_by(dest == faa)) |>
drop_na() |>
summarize(mean(arr_delay, na.rm=TRUE))
# A tibble: 1 × 1
`mean(arr_delay, na.rm = TRUE)`
<dbl>
1 1.28
But I find this sort of “filter
inside join
argument” to be terribly difficult to read: it mixes standard (inside-out) and piped (left to right) evaluation orders in a confusing manner. Avoid this!
Work with your group to answer the following questions using inner_join
.
- What is the name of the airline with the longest average departure delay?
- What is the name of the origin airport with the longest average departure delay?
- What is the name of the destination airport with the longest average departure delay?
- Are average delays longer for East-coast destinations or West-coast destinations?
- Which plane (
tailnum
) flew the most times leaving NYC? Who manufactured it? - Which manufacturer has the most planes flying out of NYC airports?
- Which manufacturer has the longest average flight?
- What model of plane has the smallest average delay leaving NYC?
Left Join
Left joins are useful when you don’t want to dropped unmatched columns in one table. For instance, suppose we misplace some rows from our airlines
table:
<- airlines |>
airlines_major filter(carrier %in% c("AA", "DL", "UA", "WN", "B6", "AS"))
If we inner join on airlines_major
, we loose many of the rows in flights
.
NROW(flights)
[1] 336776
inner_join(flights,
airlines_major, join_by(carrier == carrier)) |>
NROW()
[1] 207128
Sometimes this is what we want, but not always. If we instead use a left join, we keep all of the rows in flights
:
NROW(flights)
[1] 336776
left_join(flights,
airlines_major, join_by(carrier == carrier)) |>
NROW()
[1] 336776
Rows lacking a pair in airlines_major
fill the missing columns with NA
. This fits our mental model of missing values in R
: in theory, these flights should have some carrier name, but given the data at hand, we don’t know what it is.
NROW(flights)
[1] 336776
left_join(flights,
airlines_major, join_by(carrier == carrier)) |>
filter(carrier %in% c("MQ", "OO", "VX")) |>
glimpse() # Look at 'name' column
Rows: 31,591
Columns: 20
$ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
$ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ dep_time <int> 600, 602, 608, 624, 656, 658, 729, 749, 800, 805, 811, …
$ sched_dep_time <int> 600, 605, 600, 630, 705, 700, 730, 710, 810, 815, 630, …
$ dep_delay <dbl> 0, -3, 8, -6, -9, -2, -1, 39, -10, -10, 101, -4, -5, -8…
$ arr_time <int> 837, 821, 807, 840, 1007, 1027, 1049, 939, 949, 1006, 1…
$ sched_arr_time <int> 825, 805, 735, 830, 940, 1025, 1115, 850, 955, 1010, 83…
$ arr_delay <dbl> 12, 16, 32, 10, 27, 2, -26, 49, -6, -4, 137, -13, -13, …
$ carrier <chr> "MQ", "MQ", "MQ", "MQ", "MQ", "VX", "VX", "MQ", "MQ", "…
$ flight <int> 4650, 4401, 3768, 4599, 4534, 399, 11, 3737, 4406, 4490…
$ tailnum <chr> "N542MQ", "N730MQ", "N9EAMQ", "N518MQ", "N722MQ", "N627…
$ origin <chr> "LGA", "LGA", "EWR", "LGA", "LGA", "JFK", "JFK", "EWR",…
$ dest <chr> "ATL", "DTW", "ORD", "MSP", "XNA", "LAX", "SFO", "ORD",…
$ air_time <dbl> 134, 105, 139, 166, 233, 361, 356, 148, 80, 101, 118, 5…
$ distance <dbl> 762, 502, 719, 1020, 1147, 2475, 2586, 719, 427, 479, 5…
$ hour <dbl> 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 6, 8, 8, 8, 8, 18, 9, 9, …
$ minute <dbl> 0, 5, 0, 30, 5, 0, 30, 10, 10, 15, 30, 25, 35, 40, 50, …
$ time_hour <dttm> 2013-01-01 06:00:00, 2013-01-01 06:00:00, 2013-01-01 0…
$ name <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
left_join
s are useful if we want to join two tables, but want to avoid dropping any rows from a ‘gold standard’ table.
Outer Join
Advanced Join Specifications
Multi-Column Operations
Analytical Functions
Cumulative Operators
*_rank
Other dplyr
functionality
Advanced Joins
cross_join
semi_join
anti_join
nest_join
bind_rows
and bind_columns
Footnotes
Note that some
SQL
engines useLEFT OUTER JOIN
thanLEFT JOIN
. BecauseOUTER
is a bit ambiguous,dplyr
emphasizesfull_
vsleft_
in its function naming. Also note the convention ofdplyr
names - lower case, underscore separated - and that it differs fromSQL
syntax.↩︎