Code
library(tidyverse)
library(scales)
library(leaflet)
library(leaflet.minicharts)
library(janitor)
library(plotly)
library(gganimate)
library(dygraphs)
options(scipen = 999)
Stephen J Parton
20-10-2022
This analysis summarises Australian domestic flight volumes and on-time performance (OTP) issues by airline over time.
It has been prepared mainly to get more used to Quarto, and comprises:
Initial data load and preprocessing - not shown
Domestic Flight Analysis - This document
Global Flight Analysis - Next document
Forecasting - to come
Data is sourced from https://data.gov.au/ site, specific datasets used being:
Top routes
Industry Totals
On-Time-Performance - Domestic
(need to add notes/refs)
top_routes_prep_df <- read_rds("./artifacts/top_routes_prep_df.rds")
ind_totals_prep_df <- read_rds("./artifacts/ind_totals_prep_df.rds")
dom_cargo_prep_df <- read_rds("./artifacts/dom_cargo_prep_df.rds")
otp_prep_df <- read_rds("./artifacts/otp_prep_df.rds")
latest_date <- max(top_routes_prep_df$date)
otp_prep_df <- otp_prep_df %>%
mutate(across(airline,str_replace,'QantasLink','Qantas')) %>%
mutate(across(airline,str_replace,"Virgin Australia - Atr/F100 Operations","Virgin Australia")) %>%
mutate(across(airline,str_replace,"Virgin Australia Regional Airlines","Virgin Australia"))
Total monthly passenger numbers :
g <- ind_totals_prep_df %>%
#filter(year>2010) %>%
ggplot(aes(x=date,y=passenger_trips))+
geom_line()+
scale_y_continuous(labels=scales::comma)+
scale_x_date(date_breaks = "2 year",date_labels = "%y")+
labs(title = "Australian Domestic Flight History", x="Year", y = "Passenger Numbers (monthly)")+
theme_bw()
ggplotly(g)
Key “points of interest”:
1987 pilot strike
2000 Olympic Games
COVID!!!!
Seasonality and trend both also clearly show, at least until covid.
We can break this down by top 10 routes (only tracked 2-way):
## * Top routes ----
top_routes <- top_routes_prep_df %>%
group_by(route,date=max(date)) %>%
summarise(passenger_trips = sum(passenger_trips)) %>%
ungroup() %>%
slice_max(passenger_trips, n=10) %>%
select(route) %>%
pull() %>%
as.character()
g1 <- top_routes_prep_df %>%
filter(route %in% top_routes) %>%
mutate(route = factor(route,levels=top_routes)) %>%
ggplot(aes(date,passenger_trips,colour =route))+
geom_line()+
scale_y_continuous(labels=scales::comma)+
scale_x_date(date_breaks = "2 year",date_labels = "%y")+
scale_colour_discrete(name ="Route - 2way")+
labs(title = "Australian Domestic Flight History - Top10 (2-way) Routes", x="by Month", y = "Passenger Numbers (monthly)")+
#theme_bw() +
theme(legend.position="bottom")
ggplotly(g1)
Following map shows all routes in 2019 (precovid), thickness of line representiing pax volumes for the year (in this case with a moving monthly timeline to show impact of covid - but does not really work that well). As width of line signifies volumes of passenger trips, Sydney-Melbourne route clearly has thickest line!
## * Routes Mapped - Leaflet ----
top_routes_short <- top_routes_prep_df %>%
filter(year>2019)
# group_by(year,city1,city2,city1_lng,city1_lat,city2_lng,city2_lat) %>%
# summarise(passenger_trips = sum(passenger_trips))
leaflet() %>%
addProviderTiles(providers$OpenTopoMap) %>%
addTiles() %>%
#addProviderTiles(providers$Esri.WorldStreetMap) %>%
addFlows(
top_routes_short$city1_lng,
top_routes_short$city1_lat,
top_routes_short$city2_lng,
top_routes_short$city2_lat,
flow = top_routes_short$passenger_trips,
time = top_routes_short$date,
dir = 0,
minThickness = .1,
maxThickness = 5,
popupOptions = list(closeOnClick = FALSE, autoClose = FALSE)
)
Performance Metric: OTP_issues_pct = (delayed arrivals + cancelled flights)/ Total Sectors Scheduled.
As this metric is based on arrival delays and canellations as a percentage of scheduled services, the higher the number, then the worse the performance!
otp_issues_all <- otp_prep_df %>%
filter(airline == "All Airlines") %>%
group_by(date) %>%
summarise(sectors_scheduled = sum(sectors_scheduled),
arrivals_delayed = sum(arrivals_delayed),
cancellations = sum(cancellations),
otp_issues_num = sum(otp_issues_num)
) %>%
mutate(otp_issues_pct = (arrivals_delayed+cancellations)/sectors_scheduled)
g_opt <- otp_issues_all %>%
ggplot(aes(date,otp_issues_pct))+
geom_line()+
geom_smooth(method="loess")+
scale_y_continuous(labels=scales::percent)+
theme_bw()
ggplotly(g_opt)
While the ‘loess’ smoother indicates a continual worsening of performance, most recent reporting perhaps indicates the airlines are starting to address OTP issues.
This graph just focuses on the main 3 domestic carriers.
otp_issues_airline <- otp_prep_df %>%
filter(airline %in% c("Jetstar","Qantas","Virgin Australia"),
year > 2019
) %>%
mutate(airline = str_to_title(airline)) %>%
group_by(date,airline) %>%
summarise(sectors_scheduled = sum(sectors_scheduled),
arrivals_delayed = sum(arrivals_delayed),
cancellations = sum(cancellations),
otp_issues_num = sum(otp_issues_num)
) %>%
mutate(arrivals_delayed_pct = arrivals_delayed/sectors_scheduled,
cacellations_pct = cancellations/sectors_scheduled,
otp_issues_total_pct = (arrivals_delayed+cancellations)/sectors_scheduled ) %>%
select(date,airline,ends_with("pct")) %>%
pivot_longer(cols = ends_with("pct"), names_to = "otp_metric",values_to = "pct_issues")
g_otp_issues_airline <- otp_issues_airline %>%
ggplot(aes(date,pct_issues,colour = airline))+
geom_line()+
#geom_smooth(method="loess")+
scale_x_date(date_breaks = "3 month",date_labels = "%m/%y")+
scale_y_continuous(labels=scales::percent)+
xlab("Month")+
ylab("Pct of Monthly Scheduled Services") +
theme_bw()+
theme(legend.position = "bottom")+
facet_wrap(~otp_metric,ncol=1)
ggplotly(g_otp_issues_airline)
Note:
cancellations in initial covid period
Upswing in OTP issues (mainly non-cancellations) in more recent days
Jetstar worst performer, although all 3 airlines guilty of worsening performance.
Signs of improvement in most recent reports.
To highlight the y-o-y changes:
year_select <- 2016
otp_issues_airline2 <- otp_prep_df %>%
filter(airline != "All Airlines",
year > year_select
) %>%
mutate(airline = str_to_title(airline)) %>%
group_by(year,airline) %>%
summarise(sectors_scheduled = sum(sectors_scheduled),
arrivals_delayed = sum(arrivals_delayed),
cancellations = sum(cancellations),
otp_issues_num = sum(otp_issues_num)
) %>%
mutate(otp_issues_pct = (arrivals_delayed+cancellations)/sectors_scheduled ) %>%
mutate(airline = fct_reorder(airline,otp_issues_pct))
g_otp_issues_airline_2 <- otp_issues_airline2 %>%
filter(airline %in%c("Jetstar","Virgin Australia", "Qantas")) %>%
ggplot(aes(year,otp_issues_pct,fill = year))+
geom_col()+
geom_text(aes(label = percent(otp_issues_pct,accuracy = .1)),
hjust = 1,
colour = "white")+
#coord_flip()+
scale_y_continuous(labels=scales::percent)+
#scale_x_discrete(breaks = 0)+
ylab("OTP Issues/Scheduled Services")+
xlab("")+
labs(title="On-Time Performance Issues by Year",
subtitle = "as pct of Scheduled Services")+
theme_bw()+
coord_flip()+
facet_wrap(vars(airline),dir = "v")
g_otp_issues_airline_2
---
title: "Australian Domestic Flights"
author: "Stephen J Parton"
date: "2022-10-20"
categories: [code, analytics, flights, maps]
website:
sidebar:
style: "docked"
search: true
format:
html:
theme: litera
toc: true
toc-title: Contents
number-sections: true
number-depth: 3
code-fold: true
code-summary: "Code"
code-tools: true
execute:
echo: true
warning: false
error: false
freeze: true
cache: true
---
![](thumbnail.jpg){width="215"}
## Introduction
This analysis summarises Australian domestic flight volumes and on-time performance (OTP) issues by airline over time.
It has been prepared mainly to get more used to Quarto, and comprises:
- Initial data load and preprocessing - not shown
- Domestic Flight Analysis - This document
- Global Flight Analysis - Next document
- Equivalent (draft) shiny app (on [shinyapps here](https://sjpanalytics.shinyapps.io/shiny_aust_flights/))
- Forecasting - to come
```{r, load_packages}
library(tidyverse)
library(scales)
library(leaflet)
library(leaflet.minicharts)
library(janitor)
library(plotly)
library(gganimate)
library(dygraphs)
options(scipen = 999)
```
## Data Sources
Data is sourced from https://data.gov.au/ site, specific datasets used being:
- Top routes
- Industry Totals
- On-Time-Performance - Domestic
(need to add notes/refs)
```{r, data_load}
top_routes_prep_df <- read_rds("./artifacts/top_routes_prep_df.rds")
ind_totals_prep_df <- read_rds("./artifacts/ind_totals_prep_df.rds")
dom_cargo_prep_df <- read_rds("./artifacts/dom_cargo_prep_df.rds")
otp_prep_df <- read_rds("./artifacts/otp_prep_df.rds")
latest_date <- max(top_routes_prep_df$date)
otp_prep_df <- otp_prep_df %>%
mutate(across(airline,str_replace,'QantasLink','Qantas')) %>%
mutate(across(airline,str_replace,"Virgin Australia - Atr/F100 Operations","Virgin Australia")) %>%
mutate(across(airline,str_replace,"Virgin Australia Regional Airlines","Virgin Australia"))
```
## Exploratory Data Analysis
### All Major Routes - Total Monthly Pax
Total monthly passenger numbers :
```{r, total_pax}
g <- ind_totals_prep_df %>%
#filter(year>2010) %>%
ggplot(aes(x=date,y=passenger_trips))+
geom_line()+
scale_y_continuous(labels=scales::comma)+
scale_x_date(date_breaks = "2 year",date_labels = "%y")+
labs(title = "Australian Domestic Flight History", x="Year", y = "Passenger Numbers (monthly)")+
theme_bw()
ggplotly(g)
```
Key "points of interest":
- 1987 pilot strike
- 2000 Olympic Games
- COVID!!!!
Seasonality and trend both also clearly show, at least until covid.
We can break this down by top 10 routes (only tracked 2-way):
### Top 10 Routes - Monthly Pax by O/D City Pairs
```{r,top_routes}
## * Top routes ----
top_routes <- top_routes_prep_df %>%
group_by(route,date=max(date)) %>%
summarise(passenger_trips = sum(passenger_trips)) %>%
ungroup() %>%
slice_max(passenger_trips, n=10) %>%
select(route) %>%
pull() %>%
as.character()
g1 <- top_routes_prep_df %>%
filter(route %in% top_routes) %>%
mutate(route = factor(route,levels=top_routes)) %>%
ggplot(aes(date,passenger_trips,colour =route))+
geom_line()+
scale_y_continuous(labels=scales::comma)+
scale_x_date(date_breaks = "2 year",date_labels = "%y")+
scale_colour_discrete(name ="Route - 2way")+
labs(title = "Australian Domestic Flight History - Top10 (2-way) Routes", x="by Month", y = "Passenger Numbers (monthly)")+
#theme_bw() +
theme(legend.position="bottom")
ggplotly(g1)
```
### All Routes - Total Monthly Pax - Mapped
Following map shows all routes in 2019 (precovid), thickness of line representiing pax volumes for the year (in this case with a moving monthly timeline to show impact of covid - but does not really work that well). As width of line signifies volumes of passenger trips, Sydney-Melbourne route clearly has thickest line!
```{r, leaflet_mapped}
## * Routes Mapped - Leaflet ----
top_routes_short <- top_routes_prep_df %>%
filter(year>2019)
# group_by(year,city1,city2,city1_lng,city1_lat,city2_lng,city2_lat) %>%
# summarise(passenger_trips = sum(passenger_trips))
leaflet() %>%
addProviderTiles(providers$OpenTopoMap) %>%
addTiles() %>%
#addProviderTiles(providers$Esri.WorldStreetMap) %>%
addFlows(
top_routes_short$city1_lng,
top_routes_short$city1_lat,
top_routes_short$city2_lng,
top_routes_short$city2_lat,
flow = top_routes_short$passenger_trips,
time = top_routes_short$date,
dir = 0,
minThickness = .1,
maxThickness = 5,
popupOptions = list(closeOnClick = FALSE, autoClose = FALSE)
)
```
### On Time Performance (OTP) - All Domestic Routes
Performance Metric: OTP_issues_pct = (delayed arrivals + cancelled flights)/ Total Sectors Scheduled.
As this metric is based on arrival delays and canellations as a percentage of scheduled services, the higher the number, then the worse the performance!
```{r, otp_total}
#| warnining: false
#| message: false
otp_issues_all <- otp_prep_df %>%
filter(airline == "All Airlines") %>%
group_by(date) %>%
summarise(sectors_scheduled = sum(sectors_scheduled),
arrivals_delayed = sum(arrivals_delayed),
cancellations = sum(cancellations),
otp_issues_num = sum(otp_issues_num)
) %>%
mutate(otp_issues_pct = (arrivals_delayed+cancellations)/sectors_scheduled)
g_opt <- otp_issues_all %>%
ggplot(aes(date,otp_issues_pct))+
geom_line()+
geom_smooth(method="loess")+
scale_y_continuous(labels=scales::percent)+
theme_bw()
ggplotly(g_opt)
```
While the 'loess' smoother indicates a continual worsening of performance, most recent reporting perhaps indicates the airlines are starting to address OTP issues.
### OTP - By Airline over Time
This graph just focuses on the main 3 domestic carriers.
```{r, otp_airline}
otp_issues_airline <- otp_prep_df %>%
filter(airline %in% c("Jetstar","Qantas","Virgin Australia"),
year > 2019
) %>%
mutate(airline = str_to_title(airline)) %>%
group_by(date,airline) %>%
summarise(sectors_scheduled = sum(sectors_scheduled),
arrivals_delayed = sum(arrivals_delayed),
cancellations = sum(cancellations),
otp_issues_num = sum(otp_issues_num)
) %>%
mutate(arrivals_delayed_pct = arrivals_delayed/sectors_scheduled,
cacellations_pct = cancellations/sectors_scheduled,
otp_issues_total_pct = (arrivals_delayed+cancellations)/sectors_scheduled ) %>%
select(date,airline,ends_with("pct")) %>%
pivot_longer(cols = ends_with("pct"), names_to = "otp_metric",values_to = "pct_issues")
g_otp_issues_airline <- otp_issues_airline %>%
ggplot(aes(date,pct_issues,colour = airline))+
geom_line()+
#geom_smooth(method="loess")+
scale_x_date(date_breaks = "3 month",date_labels = "%m/%y")+
scale_y_continuous(labels=scales::percent)+
xlab("Month")+
ylab("Pct of Monthly Scheduled Services") +
theme_bw()+
theme(legend.position = "bottom")+
facet_wrap(~otp_metric,ncol=1)
ggplotly(g_otp_issues_airline)
```
Note:
- cancellations in initial covid period
- Upswing in OTP issues (mainly non-cancellations) in more recent days
- Jetstar worst performer, although all 3 airlines guilty of worsening performance.
- Signs of improvement in most recent reports.
To highlight the y-o-y changes:
```{r, fig.height= 8}
year_select <- 2016
otp_issues_airline2 <- otp_prep_df %>%
filter(airline != "All Airlines",
year > year_select
) %>%
mutate(airline = str_to_title(airline)) %>%
group_by(year,airline) %>%
summarise(sectors_scheduled = sum(sectors_scheduled),
arrivals_delayed = sum(arrivals_delayed),
cancellations = sum(cancellations),
otp_issues_num = sum(otp_issues_num)
) %>%
mutate(otp_issues_pct = (arrivals_delayed+cancellations)/sectors_scheduled ) %>%
mutate(airline = fct_reorder(airline,otp_issues_pct))
g_otp_issues_airline_2 <- otp_issues_airline2 %>%
filter(airline %in%c("Jetstar","Virgin Australia", "Qantas")) %>%
ggplot(aes(year,otp_issues_pct,fill = year))+
geom_col()+
geom_text(aes(label = percent(otp_issues_pct,accuracy = .1)),
hjust = 1,
colour = "white")+
#coord_flip()+
scale_y_continuous(labels=scales::percent)+
#scale_x_discrete(breaks = 0)+
ylab("OTP Issues/Scheduled Services")+
xlab("")+
labs(title="On-Time Performance Issues by Year",
subtitle = "as pct of Scheduled Services")+
theme_bw()+
coord_flip()+
facet_wrap(vars(airline),dir = "v")
g_otp_issues_airline_2
#ggplotly(g_otp_issues_airline_2)
```