Notes to Myself - Australian Domestic Flights

1 Introduction

This analysis summarises Australian domestic flight volumes and on-time performance (OTP) issues by airline over time.

It has been prepared mainly to get more used to Quarto, and comprises:

Initial data load and preprocessing - not shown
Domestic Flight Analysis - This document
Global Flight Analysis - Next document
Forecasting - to come

Code

library(tidyverse)


library(scales)

library(leaflet)
library(leaflet.minicharts)


library(janitor)
library(plotly)
library(gganimate)

library(dygraphs)


options(scipen = 999)

2 Data Sources

Data is sourced from https://data.gov.au/ site, specific datasets used being:

Top routes
Industry Totals
On-Time-Performance - Domestic

(need to add notes/refs)

Code

top_routes_prep_df <- read_rds("./artifacts/top_routes_prep_df.rds")
ind_totals_prep_df <- read_rds("./artifacts/ind_totals_prep_df.rds")
dom_cargo_prep_df  <- read_rds("./artifacts/dom_cargo_prep_df.rds")
otp_prep_df        <- read_rds("./artifacts/otp_prep_df.rds")

latest_date        <- max(top_routes_prep_df$date)


otp_prep_df <- otp_prep_df %>% 
  mutate(across(airline,str_replace,'QantasLink','Qantas')) %>% 
  mutate(across(airline,str_replace,"Virgin Australia - Atr/F100 Operations","Virgin Australia")) %>% 
  mutate(across(airline,str_replace,"Virgin Australia Regional Airlines","Virgin Australia"))

3 Exploratory Data Analysis

3.1 All Major Routes - Total Monthly Pax

Total monthly passenger numbers :

Code

g <- ind_totals_prep_df %>% 
  #filter(year>2010) %>% 
  ggplot(aes(x=date,y=passenger_trips))+
  geom_line()+
  scale_y_continuous(labels=scales::comma)+
  scale_x_date(date_breaks = "2 year",date_labels = "%y")+
  labs(title = "Australian Domestic Flight History", x="Year", y = "Passenger Numbers (monthly)")+
  theme_bw()
ggplotly(g)

Key “points of interest”:

1987 pilot strike
2000 Olympic Games
COVID!!!!

Seasonality and trend both also clearly show, at least until covid.

We can break this down by top 10 routes (only tracked 2-way):

3.2 Top 10 Routes - Monthly Pax by O/D City Pairs

Code

## * Top routes ----
top_routes <- top_routes_prep_df %>% 
  group_by(route,date=max(date)) %>% 
  summarise(passenger_trips = sum(passenger_trips)) %>% 
  ungroup() %>%
  slice_max(passenger_trips, n=10) %>% 
  select(route) %>% 
  pull() %>% 
  as.character()
  

g1 <- top_routes_prep_df %>% 
  filter(route %in% top_routes) %>%
  mutate(route = factor(route,levels=top_routes)) %>% 
  ggplot(aes(date,passenger_trips,colour =route))+
  geom_line()+
  scale_y_continuous(labels=scales::comma)+
  scale_x_date(date_breaks = "2 year",date_labels = "%y")+
  scale_colour_discrete(name  ="Route - 2way")+
  labs(title = "Australian Domestic Flight History - Top10 (2-way) Routes", x="by Month", y = "Passenger Numbers (monthly)")+
  #theme_bw() +
  theme(legend.position="bottom")
ggplotly(g1)

3.3 All Routes - Total Monthly Pax - Mapped

Following map shows all routes in 2019 (precovid), thickness of line representiing pax volumes for the year (in this case with a moving monthly timeline to show impact of covid - but does not really work that well). As width of line signifies volumes of passenger trips, Sydney-Melbourne route clearly has thickest line!

Code

## * Routes Mapped - Leaflet ----

top_routes_short <- top_routes_prep_df %>%
  filter(year>2019) 
  # group_by(year,city1,city2,city1_lng,city1_lat,city2_lng,city2_lat) %>% 
  # summarise(passenger_trips = sum(passenger_trips))

  leaflet() %>% 
    addProviderTiles(providers$OpenTopoMap) %>% 
    addTiles() %>%
    #addProviderTiles(providers$Esri.WorldStreetMap) %>% 
  addFlows(
    top_routes_short$city1_lng, 
    top_routes_short$city1_lat, 
    top_routes_short$city2_lng, 
    top_routes_short$city2_lat,
    flow = top_routes_short$passenger_trips,
    time = top_routes_short$date,
    dir = 0,
    minThickness = .1,
    maxThickness = 5,
    popupOptions = list(closeOnClick = FALSE, autoClose = FALSE)
  )

3.4 On Time Performance (OTP) - All Domestic Routes

Performance Metric: OTP_issues_pct = (delayed arrivals + cancelled flights)/ Total Sectors Scheduled.

As this metric is based on arrival delays and canellations as a percentage of scheduled services, the higher the number, then the worse the performance!

Code

otp_issues_all <- otp_prep_df %>% 
  filter(airline == "All Airlines") %>% 
  group_by(date) %>% 
  summarise(sectors_scheduled = sum(sectors_scheduled),
            arrivals_delayed = sum(arrivals_delayed),
            cancellations = sum(cancellations),
            otp_issues_num = sum(otp_issues_num)
            ) %>% 
  mutate(otp_issues_pct = (arrivals_delayed+cancellations)/sectors_scheduled)

g_opt <- otp_issues_all %>% 
  ggplot(aes(date,otp_issues_pct))+
  geom_line()+
  geom_smooth(method="loess")+
  scale_y_continuous(labels=scales::percent)+
  theme_bw()

ggplotly(g_opt)

While the ‘loess’ smoother indicates a continual worsening of performance, most recent reporting perhaps indicates the airlines are starting to address OTP issues.

3.5 OTP - By Airline over Time

This graph just focuses on the main 3 domestic carriers.

Code

otp_issues_airline <- otp_prep_df %>% 
  filter(airline %in% c("Jetstar","Qantas","Virgin Australia"),
         year > 2019
         ) %>%
  
  mutate(airline = str_to_title(airline)) %>% 
  group_by(date,airline) %>% 
  summarise(sectors_scheduled = sum(sectors_scheduled),
            arrivals_delayed  = sum(arrivals_delayed),
            cancellations     = sum(cancellations),
            otp_issues_num    = sum(otp_issues_num)
            ) %>% 
  mutate(arrivals_delayed_pct = arrivals_delayed/sectors_scheduled,
         cacellations_pct     = cancellations/sectors_scheduled,
         otp_issues_total_pct = (arrivals_delayed+cancellations)/sectors_scheduled ) %>% 
  select(date,airline,ends_with("pct")) %>% 
  pivot_longer(cols = ends_with("pct"), names_to = "otp_metric",values_to = "pct_issues")

g_otp_issues_airline <- otp_issues_airline %>% 
  ggplot(aes(date,pct_issues,colour = airline))+
  geom_line()+
  #geom_smooth(method="loess")+
  scale_x_date(date_breaks = "3 month",date_labels = "%m/%y")+
  scale_y_continuous(labels=scales::percent)+
  xlab("Month")+
  ylab("Pct of Monthly Scheduled Services") +
  theme_bw()+
  theme(legend.position =  "bottom")+
  facet_wrap(~otp_metric,ncol=1)


ggplotly(g_otp_issues_airline)

Note:

cancellations in initial covid period
Upswing in OTP issues (mainly non-cancellations) in more recent days
Jetstar worst performer, although all 3 airlines guilty of worsening performance.
Signs of improvement in most recent reports.

To highlight the y-o-y changes:

Code

year_select <- 2016

otp_issues_airline2 <- otp_prep_df %>% 
  filter(airline != "All Airlines",
         year > year_select
         ) %>%
  mutate(airline = str_to_title(airline)) %>% 
  group_by(year,airline) %>% 
  summarise(sectors_scheduled = sum(sectors_scheduled),
            arrivals_delayed = sum(arrivals_delayed),
            cancellations = sum(cancellations),
            otp_issues_num = sum(otp_issues_num)
            ) %>% 
  mutate(otp_issues_pct = (arrivals_delayed+cancellations)/sectors_scheduled ) %>% 
  mutate(airline = fct_reorder(airline,otp_issues_pct))


g_otp_issues_airline_2 <- otp_issues_airline2 %>% 
  filter(airline %in%c("Jetstar","Virgin Australia", "Qantas")) %>% 
  ggplot(aes(year,otp_issues_pct,fill = year))+
  geom_col()+
  geom_text(aes(label = percent(otp_issues_pct,accuracy = .1)),
            hjust = 1,
            colour = "white")+
  #coord_flip()+
  scale_y_continuous(labels=scales::percent)+
  #scale_x_discrete(breaks = 0)+
  ylab("OTP Issues/Scheduled Services")+
  xlab("")+
  labs(title="On-Time Performance Issues by Year",
       subtitle = "as pct of Scheduled Services")+
  theme_bw()+
  
  coord_flip()+
  facet_wrap(vars(airline),dir = "v")

g_otp_issues_airline_2

Code

#ggplotly(g_otp_issues_airline_2)

--- title: "Australian Domestic Flights" author: "Stephen J Parton" date: "2022-10-20" categories: [code, analytics, flights, maps] website: sidebar: style: "docked" search: true format: html: theme: litera toc: true toc-title: Contents number-sections: true number-depth: 3 code-fold: true code-summary: "Code" code-tools: true execute: echo: true warning: false error: false freeze: true cache: true --- ![](thumbnail.jpg){width="215"} ## Introduction This analysis summarises Australian domestic flight volumes and on-time performance (OTP) issues by airline over time. It has been prepared mainly to get more used to Quarto, and comprises: - Initial data load and preprocessing - not shown - Domestic Flight Analysis - This document - Global Flight Analysis - Next document - Equivalent (draft) shiny app (on [shinyapps here](https://sjpanalytics.shinyapps.io/shiny_aust_flights/)) - Forecasting - to come ```{r, load_packages} library(tidyverse) library(scales) library(leaflet) library(leaflet.minicharts) library(janitor) library(plotly) library(gganimate) library(dygraphs) options(scipen = 999) ``` ## Data Sources Data is sourced from https://data.gov.au/ site, specific datasets used being: - Top routes - Industry Totals - On-Time-Performance - Domestic (need to add notes/refs) ```{r, data_load} top_routes_prep_df <- read_rds("./artifacts/top_routes_prep_df.rds") ind_totals_prep_df <- read_rds("./artifacts/ind_totals_prep_df.rds") dom_cargo_prep_df <- read_rds("./artifacts/dom_cargo_prep_df.rds") otp_prep_df <- read_rds("./artifacts/otp_prep_df.rds") latest_date <- max(top_routes_prep_df$date) otp_prep_df <- otp_prep_df %>% mutate(across(airline,str_replace,'QantasLink','Qantas')) %>% mutate(across(airline,str_replace,"Virgin Australia - Atr/F100 Operations","Virgin Australia")) %>% mutate(across(airline,str_replace,"Virgin Australia Regional Airlines","Virgin Australia")) ``` ## Exploratory Data Analysis ### All Major Routes - Total Monthly Pax Total monthly passenger numbers : ```{r, total_pax} g <- ind_totals_prep_df %>% #filter(year>2010) %>% ggplot(aes(x=date,y=passenger_trips))+ geom_line()+ scale_y_continuous(labels=scales::comma)+ scale_x_date(date_breaks = "2 year",date_labels = "%y")+ labs(title = "Australian Domestic Flight History", x="Year", y = "Passenger Numbers (monthly)")+ theme_bw() ggplotly(g) ``` Key "points of interest": - 1987 pilot strike - 2000 Olympic Games - COVID!!!! Seasonality and trend both also clearly show, at least until covid. We can break this down by top 10 routes (only tracked 2-way): ### Top 10 Routes - Monthly Pax by O/D City Pairs ```{r,top_routes} ## * Top routes ---- top_routes <- top_routes_prep_df %>% group_by(route,date=max(date)) %>% summarise(passenger_trips = sum(passenger_trips)) %>% ungroup() %>% slice_max(passenger_trips, n=10) %>% select(route) %>% pull() %>% as.character() g1 <- top_routes_prep_df %>% filter(route %in% top_routes) %>% mutate(route = factor(route,levels=top_routes)) %>% ggplot(aes(date,passenger_trips,colour =route))+ geom_line()+ scale_y_continuous(labels=scales::comma)+ scale_x_date(date_breaks = "2 year",date_labels = "%y")+ scale_colour_discrete(name ="Route - 2way")+ labs(title = "Australian Domestic Flight History - Top10 (2-way) Routes", x="by Month", y = "Passenger Numbers (monthly)")+ #theme_bw() + theme(legend.position="bottom") ggplotly(g1) ``` ### All Routes - Total Monthly Pax - Mapped Following map shows all routes in 2019 (precovid), thickness of line representiing pax volumes for the year (in this case with a moving monthly timeline to show impact of covid - but does not really work that well). As width of line signifies volumes of passenger trips, Sydney-Melbourne route clearly has thickest line! ```{r, leaflet_mapped} ## * Routes Mapped - Leaflet ---- top_routes_short <- top_routes_prep_df %>% filter(year>2019) # group_by(year,city1,city2,city1_lng,city1_lat,city2_lng,city2_lat) %>% # summarise(passenger_trips = sum(passenger_trips)) leaflet() %>% addProviderTiles(providers$OpenTopoMap) %>% addTiles() %>% #addProviderTiles(providers$Esri.WorldStreetMap) %>% addFlows( top_routes_short$city1_lng, top_routes_short$city1_lat, top_routes_short$city2_lng, top_routes_short$city2_lat, flow = top_routes_short$passenger_trips, time = top_routes_short$date, dir = 0, minThickness = .1, maxThickness = 5, popupOptions = list(closeOnClick = FALSE, autoClose = FALSE) ) ``` ### On Time Performance (OTP) - All Domestic Routes Performance Metric: OTP_issues_pct = (delayed arrivals + cancelled flights)/ Total Sectors Scheduled. As this metric is based on arrival delays and canellations as a percentage of scheduled services, the higher the number, then the worse the performance! ```{r, otp_total} #| warnining: false #| message: false otp_issues_all <- otp_prep_df %>% filter(airline == "All Airlines") %>% group_by(date) %>% summarise(sectors_scheduled = sum(sectors_scheduled), arrivals_delayed = sum(arrivals_delayed), cancellations = sum(cancellations), otp_issues_num = sum(otp_issues_num) ) %>% mutate(otp_issues_pct = (arrivals_delayed+cancellations)/sectors_scheduled) g_opt <- otp_issues_all %>% ggplot(aes(date,otp_issues_pct))+ geom_line()+ geom_smooth(method="loess")+ scale_y_continuous(labels=scales::percent)+ theme_bw() ggplotly(g_opt) ``` While the 'loess' smoother indicates a continual worsening of performance, most recent reporting perhaps indicates the airlines are starting to address OTP issues. ### OTP - By Airline over Time This graph just focuses on the main 3 domestic carriers. ```{r, otp_airline} otp_issues_airline <- otp_prep_df %>% filter(airline %in% c("Jetstar","Qantas","Virgin Australia"), year > 2019 ) %>% mutate(airline = str_to_title(airline)) %>% group_by(date,airline) %>% summarise(sectors_scheduled = sum(sectors_scheduled), arrivals_delayed = sum(arrivals_delayed), cancellations = sum(cancellations), otp_issues_num = sum(otp_issues_num) ) %>% mutate(arrivals_delayed_pct = arrivals_delayed/sectors_scheduled, cacellations_pct = cancellations/sectors_scheduled, otp_issues_total_pct = (arrivals_delayed+cancellations)/sectors_scheduled ) %>% select(date,airline,ends_with("pct")) %>% pivot_longer(cols = ends_with("pct"), names_to = "otp_metric",values_to = "pct_issues") g_otp_issues_airline <- otp_issues_airline %>% ggplot(aes(date,pct_issues,colour = airline))+ geom_line()+ #geom_smooth(method="loess")+ scale_x_date(date_breaks = "3 month",date_labels = "%m/%y")+ scale_y_continuous(labels=scales::percent)+ xlab("Month")+ ylab("Pct of Monthly Scheduled Services") + theme_bw()+ theme(legend.position = "bottom")+ facet_wrap(~otp_metric,ncol=1) ggplotly(g_otp_issues_airline) ``` Note: - cancellations in initial covid period - Upswing in OTP issues (mainly non-cancellations) in more recent days - Jetstar worst performer, although all 3 airlines guilty of worsening performance. - Signs of improvement in most recent reports. To highlight the y-o-y changes: ```{r, fig.height= 8} year_select <- 2016 otp_issues_airline2 <- otp_prep_df %>% filter(airline != "All Airlines", year > year_select ) %>% mutate(airline = str_to_title(airline)) %>% group_by(year,airline) %>% summarise(sectors_scheduled = sum(sectors_scheduled), arrivals_delayed = sum(arrivals_delayed), cancellations = sum(cancellations), otp_issues_num = sum(otp_issues_num) ) %>% mutate(otp_issues_pct = (arrivals_delayed+cancellations)/sectors_scheduled ) %>% mutate(airline = fct_reorder(airline,otp_issues_pct)) g_otp_issues_airline_2 <- otp_issues_airline2 %>% filter(airline %in%c("Jetstar","Virgin Australia", "Qantas")) %>% ggplot(aes(year,otp_issues_pct,fill = year))+ geom_col()+ geom_text(aes(label = percent(otp_issues_pct,accuracy = .1)), hjust = 1, colour = "white")+ #coord_flip()+ scale_y_continuous(labels=scales::percent)+ #scale_x_discrete(breaks = 0)+ ylab("OTP Issues/Scheduled Services")+ xlab("")+ labs(title="On-Time Performance Issues by Year", subtitle = "as pct of Scheduled Services")+ theme_bw()+ coord_flip()+ facet_wrap(vars(airline),dir = "v") g_otp_issues_airline_2 #ggplotly(g_otp_issues_airline_2) ```