# Install the necessary R packages
if(!require("tidyverse")) install.packages("tidyverse")
library(lubridate) # added for the my() function
if(!require("DT")) install.packages("DT")
library(DT)
Mini-Project #01
Author: Thanh Dao
Updated: October 3rd, 2024
Introduction
There is a great variety of transportation systems and modes in America, spanning across land and water. Americans utilize all of them on a daily basis, commuting to wherever they need to go. In this report, the fiscal characteristics of major US public transit systems will be analyzed.
Obtaining Data
This report runs on R; thus, the necessary packages must be installed prior to any data collection or analysis.
Now that the necessary packages have been installed, data collection from the National Transit Database can proceed.
# Download the "Fare Revenue" Data
if(!file.exists("2022_fare_revenue.xlsx")){
download.file("http://www.transit.dot.gov/sites/fta.dot.gov/files/2024-04/2022%20Fare%20Revenue.xlsx",
destfile="2022_fare_revenue.xlsx",
quiet=FALSE,
method="wget")
}
# Select the columns within the scope of research for this report
<- readxl::read_xlsx("2022_fare_revenue.xlsx") |>
FARES select(-`State/Parent NTD ID`,
-`Reporter Type`,
-`Reporting Module`,
-`TOS`,
-`Passenger Paid Fares`,
-`Organization Paid Fares`) |>
filter(`Expense Type` == "Funds Earned During Period") |>
select(-`Expense Type`)
# Download the "Expenses" Data
if(!file.exists("2022_expenses.csv")){
download.file("https://data.transportation.gov/api/views/dkxx-zjd6/rows.csv?date=20231102&accessType=DOWNLOAD&bom=true&format=true",
destfile="2022_expenses.csv",
quiet=FALSE,
method="wget")
}
# Select the columns within the scope of research for this report
<- readr::read_csv("2022_expenses.csv") |>
EXPENSES select(`NTD ID`,
`Agency`,
`Total`,
`Mode`) |>
mutate(`NTD ID` = as.integer(`NTD ID`)) |>
rename(Expenses = Total) |>
group_by(`NTD ID`, `Mode`) |>
summarize(Expenses = sum(Expenses)) |>
ungroup()
# Combine the selected "Fare Revenue" and "Expenses" Data
<- inner_join(FARES, EXPENSES, join_by(`NTD ID`, `Mode`))
FINANCIALS
# Download the "Monthly Transit Numbers" Data
if(!file.exists("ridership.xlsx")){
download.file("https://www.transit.dot.gov/sites/fta.dot.gov/files/2024-09/July%202024%20Complete%20Monthly%20Ridership%20%28with%20adjustments%20and%20estimates%29_240903.xlsx",
destfile="ridership.xlsx",
quiet=FALSE,
method="wget")
}
# Select the columns within the scope of research for this report
<- readxl::read_xlsx("ridership.xlsx", sheet="UPT") |>
TRIPS filter(`Mode/Type of Service Status` == "Active") |>
select(-`Legacy NTD ID`,
-`Reporter Type`,
-`Mode/Type of Service Status`,
-`UACE CD`,
-`TOS`) |>
pivot_longer(-c(`NTD ID`:`3 Mode`),
names_to="month",
values_to="UPT") |>
drop_na() |>
mutate(month=my(month))
<- readxl::read_xlsx("ridership.xlsx", sheet="VRM") |>
MILES filter(`Mode/Type of Service Status` == "Active") |>
select(-`Legacy NTD ID`,
-`Reporter Type`,
-`Mode/Type of Service Status`,
-`UACE CD`,
-`TOS`) |>
pivot_longer(-c(`NTD ID`:`3 Mode`),
names_to="month",
values_to="VRM") |>
drop_na() |>
group_by(`NTD ID`, `Agency`, `UZA Name`,
`Mode`, `3 Mode`, month) |>
summarize(VRM = sum(VRM)) |>
ungroup() |>
mutate(month=my(month))
# Combine the selected "Monthly Transit Numbers" Data
<- inner_join(TRIPS, MILES) |>
USAGE mutate(`NTD ID` = as.integer(`NTD ID`))
Initial Data Table
Using the data just obtained from the National Transit Database, the following table can be created:
Unfortunately, this data still some flaws that need to be fixed.
Cleaning the Data
To the average person, “UZA Name” doesn’t provide proper context or meaning; thus, it needs to be changed to something people will understand, like “Metro Area.” The same can be said about “UPT” and “VRM”; these also need to be changed, as follows:
# Task 1: Create Syntactic Names
<- rename(USAGE, "metro_area" = "UZA Name",
USAGE "unlinked_passenger_trips" = "UPT",
"vehicle_revenue_miles" = "VRM",
"NTD_ID" = "NTD ID")
<- rename(FINANCIALS, "NTD_ID" = "NTD ID") FINANCIALS
Now that some column headers have been renamed, focus can be made on the “Mode” column. All of these modes of transport are acronyms that would need to be looked up to be understood. For user ease, they’ll be changed as follows:
# Task 2: Recoding the Mode Column
<- USAGE |> # Interpret the Mode column in "USAGE"
USAGE mutate(Mode = case_when(
== "AR" ~ "Alaska Railroad",
Mode == "CB" ~ "Commuter Bus",
Mode == "CC" ~ "Cable Car",
Mode == "CR" ~ "Commuter Rail",
Mode == "DR" ~ "Demand Response",
Mode == "FB" ~ "Ferryboat",
Mode == "HR" ~ "Heavy Rail",
Mode == "IP" ~ "Inclined Plane",
Mode == "LR" ~ "Light Rail",
Mode == "MB" ~ "Bus",
Mode == "MG" ~ "Monorail and Automated Guideway",
Mode == "PB" ~ "Publico",
Mode == "RB" ~ "Bus Rapid Transit",
Mode == "SR" ~ "Streetcar Rail",
Mode == "TB" ~ "Trolleybus",
Mode == "TR" ~ "Aerial Tramways",
Mode == "VP" ~ "Vanpool",
Mode == "YR" ~ "Hybrid Rail",
Mode TRUE ~ "Unknown"))
<- FINANCIALS |> # Interpret the Mode column in "Financials"
FINANCIALS mutate(Mode = case_when(
== "AR" ~ "Alaska Railroad",
Mode == "CB" ~ "Commuter Bus",
Mode == "CC" ~ "Cable Car",
Mode == "CR" ~ "Commuter Rail",
Mode == "DR" ~ "Demand Response",
Mode == "FB" ~ "Ferryboat",
Mode == "HR" ~ "Heavy Rail",
Mode == "IP" ~ "Inclined Plane",
Mode == "LR" ~ "Light Rail",
Mode == "MB" ~ "Bus",
Mode == "MG" ~ "Monorail and Automated Guideway",
Mode == "PB" ~ "Publico",
Mode == "RB" ~ "Bus Rapid Transit",
Mode == "SR" ~ "Streetcar Rail",
Mode == "TB" ~ "Trolleybus",
Mode == "TR" ~ "Aerial Tramways",
Mode == "VP" ~ "Vanpool",
Mode == "YR" ~ "Hybrid Rail",
Mode TRUE ~ "Unknown"))
Unnecessary columns will be removed as well.
<- USAGE |> # Remove "3 Mode"
USAGE select(-c("3 Mode"))
Now that the data is cleaned up, a new table can be established, as follows:
Main Analysis
Using this table, the following analyses can be done.
The transit agency with the most total VRM in this data set can be determined with the following code:
|>
USAGE group_by(Agency) |>
summarize(total_vrm = sum(vehicle_revenue_miles, na.rm = T)) |>
arrange(desc(total_vrm)) |>
slice(1)
# A tibble: 1 × 2
Agency total_vrm
<chr> <dbl>
1 MTA New York City Transit 10832855350
Based on this, it can be determined that the MTA New York City Transit had the most total VRM, of approximately 10,800,000,000 miles.
Comparatively, a similar code can be used to determine the transit mode with the most total VRM:
|>
USAGE group_by(Mode) |>
summarize(total_vrm = sum(vehicle_revenue_miles, na.rm = T)) |>
arrange(desc(total_vrm)) |>
slice(1)
# A tibble: 1 × 2
Mode total_vrm
<chr> <dbl>
1 Bus 49444494088
It can be determined that buses had the most total VRM, with 49,444,494,088 miles.
Now, a closer look will be had on the MTA, specifically the number of trips taken on the NYC Subway in May 2024:
$month <- as.character(USAGE$month)
USAGE|>
USAGE filter(USAGE$Agency == "MTA New York City Transit",
$Mode == "Heavy Rail",
USAGE$month == "2024-05-01") USAGE
# A tibble: 1 × 7
NTD_ID Agency metro_area Mode month unlinked_passenger_t…¹
<int> <chr> <chr> <chr> <chr> <dbl>
1 20008 MTA New York City Transit New York-… Heav… 2024… 180458819
# ℹ abbreviated name: ¹unlinked_passenger_trips
# ℹ 1 more variable: vehicle_revenue_miles <dbl>
Here, in May 2024, 180,458,819 trips were taken.
While this number is indeed high, it would be interesting to analyze the impact the COVID-19 pandemic had on NYC Subway ridership. To do so, the fall of NYC subway ridership between April 2019 and April 2020 needs to be determined:
|>
USAGE filter(USAGE$Agency == "MTA New York City Transit", #2019
$Mode == "Heavy Rail",
USAGE$month == "2019-04-01") USAGE
# A tibble: 1 × 7
NTD_ID Agency metro_area Mode month unlinked_passenger_t…¹
<int> <chr> <chr> <chr> <chr> <dbl>
1 20008 MTA New York City Transit New York-… Heav… 2019… 232223929
# ℹ abbreviated name: ¹unlinked_passenger_trips
# ℹ 1 more variable: vehicle_revenue_miles <dbl>
|>
USAGE filter(USAGE$Agency == "MTA New York City Transit", #2020
$Mode == "Heavy Rail",
USAGE$month == "2020-04-01") USAGE
# A tibble: 1 × 7
NTD_ID Agency metro_area Mode month unlinked_passenger_t…¹
<int> <chr> <chr> <chr> <chr> <dbl>
1 20008 MTA New York City Transit New York-… Heav… 2020… 20254269
# ℹ abbreviated name: ¹unlinked_passenger_trips
# ℹ 1 more variable: vehicle_revenue_miles <dbl>
Based on this information, ridership fell from 232,223,929 in April 2019 to 20,254,269 in April 2020. This points to a difference of 211,969,660 rides.
Additional Analysis
Previously, buses were determined to be the transit mode with the most total VRM. It would be interesting to determine the opposite:
|>
USAGE group_by(Mode) |> # Look through the lens of transportation mode
summarize(total_vrm = sum(vehicle_revenue_miles, na.rm = T)) |> # Obtain the total VRM per mode
arrange(total_vrm) |> # Ascending order
slice(1) # Obtain the top data point
# A tibble: 1 × 2
Mode total_vrm
<chr> <dbl>
1 Aerial Tramways 292860
In contrast to buses, aerial tramways were the transit mode with the least total VRM, with 292,850 miles.
This is a difference of 49,444,201,238 miles. While this might feel drastic, it makes sense. Aerial tramways, vehicles suspended from a system of cables that are propelled through a suspension system, are not commonly used, which attributes to their low mileage. On the other hand, buses are used extensively, with there being numerous routes that run daily, attributing to their high mileage.
Following this contrast, the agency with the least total VRM was Barnegat Bay Decoy & Baymen’s Museum, with 2,312 miles.
|>
USAGE group_by(Agency) |> # Look through the lens of transportation agency
summarize(total_vrm = sum(vehicle_revenue_miles, na.rm = T)) |> # Obtain the total VRM per agency
arrange(total_vrm) |> # Ascending order
slice(1) # Obtain the top data point
# A tibble: 1 × 2
Agency total_vrm
<chr> <dbl>
1 Barnegat Bay Decoy & Baymen's Museum 2312
Compared to the MTA, there is a difference of 10,799,997,688 miles. This large difference can be attributed to the vast difference in sizes between the two. The MTA is the largest transportation system in North America. On the other hand, Barnegat is only a 40 acre cultural center meant to preserve the maritime history of the Jersey Shore. The VRM accumulated correlate to the size and population attributed to each agency.
Because the MTA in NYC has been established as having the largest VRM, examining which location has the second highest average vehicle revenue miles, after the New York / New Jersey area, would be interesting.
|>
USAGE group_by(metro_area) |> # Look through the lens of location
summarize(average_vrm = mean(vehicle_revenue_miles, na.rm = T)) |> # Obtain the average VRM per location
arrange(desc(average_vrm)) |> # Descending order
slice(2) # Obtain the second data point
# A tibble: 1 × 2
metro_area average_vrm
<chr> <dbl>
1 Denver--Aurora, CO 1565688.
This turns out to be Denver, Colorado, with 1,565,688 average VRM. This is a 244,360 mile difference to NYC, with 1,810,058 miles.
Farebox Recovery Analysis
To fully analyze the farebox recovery data, a new table must be created from the previous table, narrowing down the data to specifically 2022. First, the obtained data needs to be edited to create the summary table.
## Task 5: Table Summarization
<- USAGE |>
USAGE_2022_ANNUAL mutate(year = year(month)) |>
filter(year == 2022) |>
group_by(NTD_ID,
Agency,
metro_area,
Mode,
unlinked_passenger_trips, |>
vehicle_revenue_miles) summarize(
total_upt = sum(unlinked_passenger_trips, na.rm = T),
total_vrm = sum(vehicle_revenue_miles, na.rm = T),
.groups = "keep",
|>
) ungroup()
<- left_join(USAGE_2022_ANNUAL,
USAGE_AND_FINANCIALS
FINANCIALS, join_by(NTD_ID, Mode),
relationship = "many-to-many") |>
drop_na()
Second, the table can be created.
When analyzing the farebox recovery, it can be determined that the transit system with the most UPT in 2022 was the MTA New York CIty Transit, specifically the heavy rail, with 1,793,073,801 trips.
<- USAGE_AND_FINANCIALS |> # Create a new variable
mostUPT2022 group_by(Agency, Mode) |> # Look through the lens of Agency and Mode
filter(total_upt > 400000) |> # Major Transit systems
summarize(total_upt2022 = sum(total_upt)) |> # Obtain the total UPT per what was grouped
arrange(desc(total_upt2022)) # Descending order
head(mostUPT2022, n=1) # Get only the wanted data
# A tibble: 1 × 3
# Groups: Agency [1]
Agency Mode total_upt2022
<chr> <chr> <dbl>
1 MTA New York City Transit Heavy Rail 1793073801
However, if examining the transit system with the highest farebox recovery, it would be the Anaheim Transportation Network, specifically the bus, with 0.865.
<- USAGE_AND_FINANCIALS |> # Create a new variable
highestfarebox group_by(Agency, Mode) |> # Look through the lens of Agency and Mode
filter(total_upt > 400000) |> # Major Transit systems
summarize(highestfarebox = sum(`Total Fares`) / sum (Expenses)) |> # Obtain the ratio of total fares to expenses
arrange(desc(highestfarebox)) # Descending order
head(highestfarebox, n=1) # Get only the wanted data
# A tibble: 1 × 3
# Groups: Agency [1]
Agency Mode highestfarebox
<chr> <chr> <dbl>
1 Anaheim Transportation Network Bus 0.865
Moving on, when examining the transit system with the lowest expenses per UPT, it would be the University of Georgia bus system, with $14.90 per trip.
<- USAGE_AND_FINANCIALS |> # Create a new variable
lowestexpenses group_by(Agency, Mode) |> # Look through the lens of Agency and Mode
filter(total_upt > 400000) |> # Major Transit systems
summarize(lowestexpenses = sum(Expenses) / sum(total_upt)) |> # Obtain the ratio of expenses to UPT
arrange(desc(lowestexpenses)) # Descending order
tail(lowestexpenses, n=1) # Get only the wanted data
# A tibble: 1 × 3
# Groups: Agency [1]
Agency Mode lowestexpenses
<chr> <chr> <dbl>
1 University of Georgia Bus 14.9
Comparatively, the transit system with lowest expenses per VRM is the Interurban Transit Partnership bus system, with $84.10 per mile.
<- USAGE_AND_FINANCIALS |> # Create a new variable
lowestexpensesvrm group_by(Agency, Mode) |> # Look through the lens of Agency and Mode
filter(total_upt > 400000) |> # Major Transit systems
summarize(lowestexpensesvrm = sum(Expenses) / sum(total_vrm)) |> # Obtain the ratio of expenses to VRM
arrange(desc(lowestexpensesvrm)) # Descending order
tail(lowestexpensesvrm, n=1) # Get only the wanted data
# A tibble: 1 × 3
# Groups: Agency [1]
Agency Mode lowestexpensesvrm
<chr> <chr> <dbl>
1 Interurban Transit Partnership Bus 83.1
Meanwhile, the transit system with the highest total fares per UPT is the Metro-North Commuter Railroad Company, with the bus, at $98.70 per trip.
<- USAGE_AND_FINANCIALS |> # Create a new variable
highestfares group_by(Agency, Mode) |> # Look through the lens of Agency and Mode
filter(total_upt > 400000) |> # Major Transit systems
summarize(highestfares = sum(`Total Fares`) / sum(total_upt)) |> # Obtain the ratio of total fares to UPT
arrange(desc(highestfares)) # Descending order
head(highestfares, n=1) # Get only the wanted data
# A tibble: 1 × 3
# Groups: Agency [1]
Agency Mode highestfares
<chr> <chr> <dbl>
1 Metro-North Commuter Railroad Company, dba: MTA Metro-Nort… Comm… 98.7
Comparatively, the transit system with highest total fares per VRM is the Washington State Ferries with the ferryboat, at $937 per mile.
<- USAGE_AND_FINANCIALS |> # Create a new variable
highestfaresvrm group_by(Agency, Mode) |> # Look through the lens of Agency and Mode
filter(total_upt > 400000) |> # Major Transit systems
summarize(highestfaresvrm = sum(`Total Fares`) / sum(total_vrm)) |> # obtain the ratio of total fares to VRM
arrange(desc(highestfaresvrm)) # Descending order
head(highestfaresvrm, n=1) # Get only the wanted data
# A tibble: 1 × 3
# Groups: Agency [1]
Agency Mode highestfaresvrm
<chr> <chr> <dbl>
1 Washington State Ferries Ferryboat 937.
Conclusions
Overall, the most efficient transit system in the country is the MTA New York City Transit. It is the largest North American transit system, and has proven that it is as efficient as its size. It has the most total and average vehicle revenue miles. Additionally, in 2022, it was the transit system with the most unlinked passenger trips.