---
title: "Titanic from Kaggle"
author: "Stephen Parton"
date: "2022-09-01"
categories: [code, analysis, titanic]
website:
  sidebar:
    style: "docked"
    search: true
    contents:
      - section: "Data Exploration"
      - index.qmd
format:
  html:
    theme: litera
    toc: true
    toc-title: Contents
    number-sections: true
    number-depth: 3
    code-fold: true
    code-summary: "Code"
    code-tools: true
execute:
  echo: true
  warning: false
  error: false
  freeze: true
  cache: true
---
![](thumbnail.jpg){width="215"}
## Summary
This is just a first test with code in a blog using the new Quarto framework! Guess what I am using... yep, Titanic, Kaggle version.
It is not very well structured, as it is pretty much in the order I did it, following all the instructions, books and blogs from the expert tidymodels and Quarto teams at RStudio/Posit. All errors belong to me!
```{r, packages}
#| context: setup
#| include: false
library(tidyverse)
library(janitor)
library(skimr)
library(DataExplorer)
library(tidymodels)
library(vip)
library(ggforce)
tidymodels_prefer()
```
## Final Kaggle Scores
```{r, kaggle}
kaggle <- tibble(
  Model = c("Logistic Regression",
            "Regularised Logistic Regression",
            "Random Forest-final",
            "Random Forest-initial",
            "XG Boost",
            "Neural Net",
            "Ensemble"),
  Score = c(.76555, .77033, .77751, .78229, .77272, .76794, .77751)
)
kaggle %>% knitr::kable()
```
When all were submitted, this gave me a ranking of 1,872 out of roughly 13,000 teams, so no grandmaster!
Given how similar the scores above are, most of the value seems to come from the feature engineering and selection process (as the experts all seem to say) rather than the choice of model.
## Review Data
### Load Some Kaggle Data
Not the...? Yes, the Titanic again....
```{r, data}
#| warning: false
#| echo: true
#| message: false
train <- read_csv("data_raw/train.csv", show_col_types = FALSE) %>%
  clean_names() %>%
  mutate(train_test = "train")
test <- read_csv("data_raw/test.csv", show_col_types = FALSE) %>%
  clean_names() %>%
  mutate(train_test = "test")
all <- train %>% bind_rows(test)
```
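As a quick shape check (my addition), Kaggle's Titanic split is 891 training rows and 418 test rows, so the combined table should reflect that:

```{r, row_check}
# row counts by source - Kaggle's split is 891 train / 418 test
all %>% count(train_test)
```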
### Some Initial EDA
A quick look.
```{r, skim}
train %>% skim()
```
### Some Initial Wrangling
```{r, wrangle_1}
all_proc <- all %>%
  mutate(title = str_extract(name, "(\\w)([a-z]+)(\\.)")) %>%
  mutate(pax_type = case_when(
    title %in% c("Miss.", "Ms.", "Mlle.")       ~ "F_unmarried",
    title %in% c("Mme.", "Mrs.")                ~ "F_married",
    title %in% c("Countess.", "Lady.", "Dona.") ~ "F_titled",
    title %in% c("Capt.", "Col.", "Major.")     ~ "Military",
    title %in% c("Dr.", "Rev.")                 ~ "M_Professional",
    title %in% c("Don.", "Jonkheer.", "Sir.")   ~ "M_titled",
    TRUE ~ title
  ),
  surname = str_extract(name, "(\\w+)(\\,)"),
  survival = ifelse(survived == 0, "No", "Yes"),
  ticket_preface = str_extract(ticket, "([:graph:]+)(\\s)"),
  ticket_preface = ifelse(is.na(ticket_preface), "none", ticket_preface),
  cabin_preface = ifelse(is.na(cabin), "nk", substr(cabin, 1, 1)),
  embarked = ifelse(is.na(embarked), "S", embarked)
  ) %>%
  group_by(pax_type, pclass) %>%
  mutate(age = ifelse(is.na(age), median(age, na.rm = TRUE), age)) %>%
  ungroup() %>%
  add_count(ticket, name = "ticket_group") %>%
  mutate(ticket_group = case_when(
    ticket_group == 1 ~ "single",
    ticket_group == 2 ~ "couple",
    TRUE ~ "group"
  ),
  family_group = as.numeric(sib_sp) + as.numeric(parch) + 1
  ) %>%
  mutate(family_group = factor(
    case_when(
      family_group < 2 ~ "single",
      family_group < 3 ~ "couple",
      TRUE ~ "family"
    ),
    ordered = TRUE
  )) %>%
  mutate(age_group = factor(case_when(
    age < 13 ~ "child",
    age < 20 ~ "teen",
    age < 30 ~ "20s",
    age < 40 ~ "30s",
    age < 50 ~ "40s",
    age < 60 ~ "50s",
    TRUE ~ "60+"
  ),
  ordered = TRUE
  )) %>%
  mutate(across(where(is.character), as_factor)) %>%
  mutate(pclass = factor(pclass, levels = c("1", "2", "3")),
         survived = factor(survived)
  ) %>%
  select(-c(title, ticket_preface))
# all_proc %>% glimpse()
```
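As a quick check on the engineered `pax_type` feature (my addition, not part of the original flow), survival rates by passenger type on the training rows should show clear separation if the feature is worth keeping:

```{r, pax_type_check}
# survival rate by engineered passenger type, training rows only
all_proc %>%
  filter(train_test == "train") %>%
  group_by(pax_type) %>%
  summarise(n = n(), survival_rate = mean(survived == "1")) %>%
  arrange(desc(survival_rate))
```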
### A Bit More EDA
```{r, EDA_1}
all_proc %>%
  select(-c(name, ticket, cabin, surname, train_test)) %>%
  DataExplorer::plot_bar()
```
```{r, data_explorer1}
all_proc %>% DataExplorer::plot_histogram(ggtheme = theme_light())
```
### Eyeballing Survival Graphs on Training Data
```{r, eye_ball_survival, fig.height=15}
#| warning: false
no_f <- all_proc %>%
  filter(train_test == "train") %>%
  select(passenger_id, pclass, sex, embarked, pax_type, ticket_group,
         family_group, age_group, cabin_preface, survival) %>%
  droplevels() %>%
  mutate(across(where(is.factor), ~ factor(.x, ordered = FALSE))) %>%
  pivot_longer(cols = c(pclass:cabin_preface))
g_l <- no_f %>%
  split(.$name) %>%
  map(~ ggplot(., aes(y = value, fill = survival)) +
        geom_bar() +
        ggtitle(unique(.$name)) +
        theme_bw() +
        labs(x = NULL, y = NULL) +
        scale_fill_viridis_d(option = "cividis"))
library(patchwork)
wrap_plots(g_l, ncol = 2)
```
### Split Data Back to Train/Test
```{r, split}
set.seed(1234) # so the split is reproducible
train_proc_adj_tbl <- all_proc %>%
  filter(train_test == "train") %>%
  select(-c(survival))
train_split <- initial_split(train_proc_adj_tbl, strata = survived)
train_train <- training(train_split)
train_test <- testing(train_split)
```
## Recipe-Base
```{r, recipe_base}
recipe_base <-
  recipe(survived ~ ., data = train_train) %>%
  update_role(passenger_id, name, surname, ticket, cabin, new_role = "ID") %>%
  step_impute_knn(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%
  # after step_dummy() there are no nominal predictors left, so this step
  # selects nothing (left in from earlier experiments)
  step_factor2string(all_nominal_predictors()) %>%
  step_zv(all_predictors()) %>%
  # no variables are selected here either, so PCA is effectively a no-op
  step_pca()
recipe_base
```
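As a sanity check (my addition), `prep()` and `bake()` show what the recipe actually produces on the training data, which also confirms that the two no-op steps flagged above change nothing:

```{r, recipe_base_check}
# train the recipe and return the processed training set
recipe_base %>%
  prep() %>%
  bake(new_data = NULL) %>%
  glimpse()
```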
### Save Files
```{r, save_rds}
write_rds(all_proc, "artifacts/all_proc.rds")
write_rds(train_split, "artifacts/train_split.rds")
write_rds(recipe_base, "artifacts/recipe_base.rds")
#
# all_proc <- read_rds("artifacts/all_proc.rds")
# train_split <- read_rds("artifacts/train_split.rds")
# recipe_base <- read_rds("artifacts/recipe_base.rds")
```
## Models
### Logistic Regression
#### LR Model Spec
```{r, LR_model}
lr_spec <-
  logistic_reg() %>%
  set_engine("glm")
lr_spec
```
#### LR Workflow
```{r, LR_wflow}
lr_wflow <-
  workflow() %>%
  add_model(lr_spec) %>%
  add_recipe(recipe_base)
lr_wflow
```
#### LR Fit Model
```{r, LR_fit}
lr_fit <-
  lr_wflow %>%
  last_fit(train_split)
# lr_fit
lr_final_metrics <- lr_fit %>% collect_metrics()
lr_final_metrics
# show_notes(.Last.tune.result)
```
#### LR Predict
```{r, LR_pred}
lr_test_predictions <- lr_fit %>%
  collect_predictions() %>%
  rename(survived_pred = survived) %>%
  bind_cols(train_test)
lr_test_predictions
```
#### LR Performance on Test Split (within train)
##### ROC Curve
```{r, LR_auc}
lr_test_predictions %>%
  roc_curve(truth = survived, .pred_1, event_level = "second") %>%
  autoplot()
```
##### Confusion Matrix
```{r, LR_confuse}
lr_test_predictions %>%
  conf_mat(survived, .pred_class) %>%
  autoplot(type = "heatmap")
```
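The same confusion matrix also yields scalar summaries via yardstick's `summary()` method (my addition), which is handy for comparing sensitivity and specificity across models:

```{r, LR_confuse_stats}
# accuracy, kappa, sensitivity, specificity etc. from the confusion matrix
lr_test_predictions %>%
  conf_mat(survived, .pred_class) %>%
  summary()
```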
#### LR Resampling
```{r, LR_resample}
#| message: false
#| warning: false
folds <- vfold_cv(train_train, strata = survived, v = 5)
# folds
control <- control_resamples(save_pred = TRUE, save_workflow = TRUE)
cores <- parallel::detectCores()
cl <- parallel::makePSOCKcluster(cores - 1)
doParallel::registerDoParallel(cl) # register the cluster so tune can use it
set.seed(1234)
lr_fit_cv <-
  lr_wflow %>%
  fit_resamples(folds, control = control)
# show_best(lr_fit_cv, metric = "accuracy")
# lr_fit_cv
lr_metrics_resample <- collect_metrics(lr_fit_cv)
lr_metrics_resample
parallel::stopCluster(cl)
```
The following still needs fixing: predictions collected from resamples come back indexed by fold and `.row`, so they can't simply be column-bound to `testing(train_split)`.
```{r, LR_preds}
#lr_param <- extract_parameter_set_dials(lr_spec)
lr_resample_test_predictions <- collect_predictions(lr_fit_cv) %>%
  rename(survived_pred = survived)
# bind_cols(testing(train_split)) # rows don't line up - see the sketch below
lr_resample_test_predictions
```
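A sketch of one possible fix (my addition, untested against the original intent): `collect_predictions()` returns a `.row` index into the data frame given to `vfold_cv()` (`train_train` here), so the resampled predictions can be matched back to passengers by row number rather than column-bound to the separate test split:

```{r, LR_resample_join}
# join resampled predictions back to train_train via the .row index
lr_resample_test_predictions %>%
  left_join(
    train_train %>% mutate(.row = row_number()),
    by = ".row"
  ) %>%
  glimpse()
```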
```{r, LR_fit2}
cl <- parallel::makePSOCKcluster(cores - 1)
doParallel::registerDoParallel(cl)
set.seed(1234)
lm_fit <- lr_wflow %>% fit(data = train_proc_adj_tbl)
extract_recipe(lm_fit, estimated = TRUE)
parallel::stopCluster(cl)
```
## Regularised Logistic Regression - GLMNET
### RLR Model Spec
```{r, rlr_model}
rlr_model <-
  logistic_reg(penalty = tune(), mixture = tune()) %>%
  set_engine("glmnet")
rlr_model
```
### RLR Parameter Tuning
```{r, rlr_tuning}
rlr_param <- extract_parameter_set_dials(rlr_model)
rlr_grid <- grid_latin_hypercube(
  penalty(),
  mixture(),
  size = 30
)
head(rlr_grid) %>% knitr::kable(digits = 3)
```
### RLR Workflow
```{r, rlr_wflow}
rlr_wflow <-
  workflow() %>%
  add_model(rlr_model) %>%
  add_recipe(recipe_base)
rlr_wflow
```
### RLR Hyper-parameter Tuning
```{r, rlr_cvs}
# rlr_folds <- vfold_cv(training(train_split), strata = survived, v = 10, repeats = 5)
# rlr_folds %>% tidy()
cl <- parallel::makePSOCKcluster(cores - 1)
doParallel::registerDoParallel(cl)
set.seed(234)
rlr_tuning_result <- tune_grid(
  rlr_wflow,
  resamples = folds,
  grid = rlr_grid,
  control = control_grid(save_pred = TRUE, save_workflow = TRUE)
)
rlr_tuning_metrics <- collect_metrics(rlr_tuning_result)
head(rlr_tuning_metrics) %>% knitr::kable(digits = 3)
parallel::stopCluster(cl)
```
Review the hyper-parameter tuning results and select the best.
```{r, rlr_tune}
rlr_tuning_result %>%
  collect_metrics() %>%
  filter(.metric == "accuracy") %>%
  select(mean, penalty, mixture) %>%
  pivot_longer(penalty:mixture,
               values_to = "value",
               names_to = "parameter") %>%
  ggplot(aes(value, mean, color = parameter)) +
  geom_point(alpha = 0.8, show.legend = FALSE) +
  facet_wrap(~parameter, scales = "free_x") +
  labs(x = NULL, y = "Accuracy")
show_best(rlr_tuning_result, metric = "accuracy")
best_rlr_auc <- select_best(rlr_tuning_result, metric = "accuracy")
best_rlr_auc
```
### RLR Predict
```{r, rlr_predict1}
rlr_final_wflow <- finalize_workflow(
  rlr_wflow,
  best_rlr_auc
)
rlr_final_wflow
rlr_final_wflow %>%
  last_fit(train_split) %>%
  extract_fit_parsnip() %>%
  vip(geom = "col")
```
```{r, rlr_predict2}
rlr_final_fit <- rlr_final_wflow %>%
  last_fit(train_split)
rlr_final_metrics <- collect_metrics(rlr_final_fit)
rlr_final_metrics %>% knitr::kable()
rlr_test_predictions <- rlr_final_fit %>% collect_predictions()
rlr_test_predictions_all <- rlr_test_predictions %>%
  bind_cols(train_test %>% select(-survived))
glimpse(rlr_test_predictions_all)
```
### RLR Confusion Matrix
```{r, rlr_confusion_matrix}
rlr_test_predictions %>%
  conf_mat(survived, .pred_class) %>%
  autoplot(type = "heatmap")
```
## Random Forest
### RF Model Spec - Ranger
```{r, rf_model}
rf_model <-
  rand_forest(
    trees = 1000,
    mtry = tune(),
    min_n = tune()
  ) %>%
  set_engine("ranger", importance = "permutation") %>%
  set_mode("classification")
```
### RF Workflow
```{r, rf_wflow}
rf_wflow <-
  workflow() %>%
  add_model(rf_model) %>%
  add_recipe(recipe_base)
```
### RF Tuning - Initial
```{r, rf_tuning}
cl <- parallel::makePSOCKcluster(cores - 1)
doParallel::registerDoParallel(cl)
set.seed(1234)
rf_tuning_result <- tune_grid(
  rf_wflow,
  resamples = folds,
  grid = 20
)
parallel::stopCluster(cl)
rf_tuning_result
rf_tuning_result %>%
  collect_metrics() %>%
  filter(.metric == "accuracy") %>%
  select(mean, min_n, mtry) %>%
  pivot_longer(min_n:mtry) %>%
  ggplot(aes(value, mean, color = name)) +
  geom_point(show.legend = FALSE) +
  facet_wrap(~name, scales = "free_x") +
  labs(x = NULL, y = "Accuracy")
```
Bit hard to make much of it, but the better results look to sit roughly in mtry 5-40 and min_n 5-30, so try a regular grid over those ranges.
```{r}
rf_grid <- grid_regular(
  mtry(range = c(5, 40)),
  min_n(range = c(5, 30)),
  levels = 5
)
rf_grid
```
### RF Graph Results
```{r}
#| warning: false
#| echo: true
#| message: false
cl <- parallel::makePSOCKcluster(cores - 1)
doParallel::registerDoParallel(cl)
set.seed(1234)
rf_grid_tune <- tune_grid(
  rf_wflow,
  resamples = folds,
  grid = rf_grid
)
rf_grid_tune
parallel::stopCluster(cl)
rf_grid_tune %>%
  collect_metrics() %>%
  filter(.metric == "accuracy") %>%
  mutate(min_n = factor(min_n)) %>%
  ggplot(aes(mtry, mean, color = min_n)) +
  geom_line(alpha = 0.5, linewidth = 1.5) +
  geom_point() +
  labs(y = "Accuracy")
```
Well that's interesting; let's see what tune thinks is best.
```{r}
rf_best_params <- select_best(rf_grid_tune, metric = "accuracy")
rf_best_params %>% knitr::kable()
```
### RF Final Model
```{r}
rf_final_model <- finalize_model(
  rf_model,
  rf_best_params
)
rf_final_model
```
### RF Final Workflow
```{r}
rf_final_wflow <- finalize_workflow(
  rf_wflow,
  rf_best_params
)
rf_final_wflow
```
### RF Parameter Importance
```{r, rf_vip}
rf_final_wflow %>%
  fit(data = train_proc_adj_tbl) %>%
  extract_fit_parsnip() %>%
  vip(geom = "point")
```
### RF Final Fit
```{r, rf_fit}
rf_final_fit <-
  rf_final_wflow %>%
  last_fit(train_split)
rf_final_metrics <- collect_metrics(rf_final_fit)
rf_final_metrics
```
### RF Predict
```{r, rf_predict}
rf_test_predictions <-
  collect_predictions(rf_final_fit)
head(rf_test_predictions)
```
### RF Performance on Test Set
```{r, rf_perf}
# rf_test_predictions %>%
#   roc_auc(truth = survived, .pred_1, event_level = "second")
rf_metrics_accuracy <- rf_test_predictions %>%
  metrics(truth = survived, estimate = .pred_class) %>%
  filter(.metric == "accuracy")
rf_metrics_accuracy
rf_test_predictions %>%
  roc_curve(truth = survived, .pred_1, event_level = "second") %>%
  autoplot()
```
### RF Confusion Matrix
```{r, rf_confusion_matrix}
rf_test_predictions %>%
  conf_mat(survived, .pred_class) %>%
  autoplot(type = "heatmap")
```
## XG Boost - Usemodel
### XGB - Usemodel Library Specs
The usemodels package prints template recipe, model and workflow code for a given model formula and data set; the chunks below adapt that scaffolding.
```{r}
library(usemodels)
use_xgboost(survived ~ .,
            data = train_train,
            verbose = TRUE
)
```
### XGB - Parameters
This grid is used for both versions of XG Boost.
```{r, xgb_grid}
#| warning: false
#| echo: true
#| message: false
xgb_grid <- grid_latin_hypercube(
  tree_depth(),
  min_n(),
  trees(),
  loss_reduction(),
  sample_size = sample_prop(),
  finalize(mtry(), train_train),
  learn_rate(),
  size = 30
)
head(xgb_grid)
```
```{r, usemodel_scripts}
xgboost_usemodel_recipe <-
  recipe(formula = survived ~ ., data = train_train) %>%
  step_novel(all_nominal_predictors()) %>%
  ## This model requires the predictors to be numeric. The most common
  ## method to convert qualitative predictors to numeric is to create
  ## binary indicator variables (aka dummy variables) from these
  ## predictors. However, for this model, binary indicator variables can be
  ## made for each of the levels of the factors (known as 'one-hot
  ## encoding').
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
  step_zv(all_predictors())
xgboost_usemodel_model <-
  boost_tree(trees = tune(), mtry = tune(), min_n = tune(),
             tree_depth = tune(), learn_rate = tune(),
             loss_reduction = tune(), sample_size = tune()) %>%
  set_mode("classification") %>%
  set_engine("xgboost")
xgboost_usemodel_wflow <-
  workflow() %>%
  add_recipe(xgboost_usemodel_recipe) %>%
  add_model(xgboost_usemodel_model)
cl <- parallel::makePSOCKcluster(cores - 1)
doParallel::registerDoParallel(cl)
set.seed(1234)
xgboost_usemodel_tune <-
  tune_grid(xgboost_usemodel_wflow, resamples = folds, grid = xgb_grid)
parallel::stopCluster(cl)
```
### XGB - Usemodel Best Parameter Settings
```{r, xgb_usemodel_para_sel}
xgb_tuning_metrics_usemodel <- collect_metrics(xgboost_usemodel_tune)
xgb_tuning_metrics_usemodel
xgboost_usemodel_tune %>%
  collect_metrics() %>%
  filter(.metric == "accuracy") %>%
  select(mean, mtry:sample_size) %>%
  pivot_longer(mtry:sample_size,
               values_to = "value",
               names_to = "parameter") %>%
  ggplot(aes(value, mean, color = parameter)) +
  geom_point(alpha = 0.8, show.legend = FALSE) +
  facet_wrap(~parameter, scales = "free_x") +
  labs(x = NULL, y = "Accuracy")
```
Now select the best from the results above.
```{r, xgb_usemodel_select_paras}
show_best(xgboost_usemodel_tune, metric = "accuracy")
xgb_usemodel_best_params <- select_best(xgboost_usemodel_tune, metric = "accuracy")
xgb_usemodel_best_params
xgb_usemodel_final_wflow <- finalize_workflow(
  xgboost_usemodel_wflow,
  xgb_usemodel_best_params
)
xgb_usemodel_final_wflow
```
### XGB - Usemodel Parameter Ranking - VIP
```{r, xgb_usemodel_vip}
xgb_usemodel_final_wflow %>%
  fit(data = train_train) %>%
  extract_fit_parsnip() %>%
  vip(geom = "point")
```
### XGB - Usemodel Performance
#### XGB - Usemodel Accuracy Measured on Test Set
```{r, xgb_usemodel_final_metrics}
cl <- parallel::makePSOCKcluster(cores - 1)
doParallel::registerDoParallel(cl)
set.seed(1234)
xgb_usemodel_final_res <- last_fit(xgb_usemodel_final_wflow, train_split)
xgb_usemodel_final_res
xgb_usemodel_final_metrics <- collect_metrics(xgb_usemodel_final_res)
xgb_usemodel_final_metrics
parallel::stopCluster(cl)
```
#### XGB - Usemodel AUC on Test Set (within train)
```{r, xgb_usemodel_auc}
xgb_usemodel_final_res %>%
  collect_predictions() %>%
  roc_curve(truth = survived, .pred_1, event_level = "second") %>%
  ggplot(aes(x = 1 - specificity, y = sensitivity)) +
  geom_line(linewidth = 1.5, color = "midnightblue") +
  geom_abline(
    lty = 2, alpha = 0.5,
    color = "gray50",
    linewidth = 1.2
  )
```
```{r}
xgb_usemodel_test_predictions <- collect_predictions(xgb_usemodel_final_res)
head(xgb_usemodel_test_predictions)
```
### XGB - Usemodel Confusion Matrix
```{r}
xgb_usemodel_test_predictions %>%
  conf_mat(survived, .pred_class) %>%
  autoplot(type = "heatmap")
```
## XG Boost - Base Recipe
### XGB Model Spec
```{r, xgb_model}
xgb_model <-
  boost_tree(
    trees = tune(),
    tree_depth = tune(),
    min_n = tune(),
    loss_reduction = tune(),
    sample_size = tune(),
    mtry = tune(),
    learn_rate = tune()
  ) %>%
  set_engine("xgboost") %>%
  set_mode("classification")
xgb_model
```
### XGB Workflow
```{r, xgb_wflow}
xgb_wflow <-
  workflow() %>%
  add_model(xgb_model) %>%
  add_recipe(recipe_base)
```
### XGB Hyper-Parameter Tuning
```{r}
# xgb_folds <- vfold_cv(training(train_split), strata = survived)
# xgb_folds
cl <- parallel::makePSOCKcluster(cores - 1)
doParallel::registerDoParallel(cl)
set.seed(1234)
xgb_tuning_result <- tune_grid(
  xgb_wflow,
  resamples = folds,
  grid = xgb_grid,
  control = control_grid(save_pred = TRUE, save_workflow = TRUE)
)
xgb_tuning_result
parallel::stopCluster(cl)
```
```{r}
xgb_tuning_metrics <- collect_metrics(xgb_tuning_result)
xgb_tuning_metrics
xgb_tuning_result %>%
  collect_metrics() %>%
  filter(.metric == "accuracy") %>%
  select(mean, mtry:sample_size) %>%
  pivot_longer(mtry:sample_size,
               values_to = "value",
               names_to = "parameter") %>%
  ggplot(aes(value, mean, color = parameter)) +
  geom_point(alpha = 0.8, show.legend = FALSE) +
  facet_wrap(~parameter, scales = "free_x") +
  labs(x = NULL, y = "Accuracy")
```
#### XGB Best Parameters then Finalise Workflow
```{r}
show_best(xgb_tuning_result, metric = "accuracy")
xgb_best_params <- select_best(xgb_tuning_result, metric = "accuracy")
xgb_best_params
xgb_final_wflow <- finalize_workflow(
  xgb_wflow,
  xgb_best_params
)
xgb_final_wflow
```
```{r}
xgb_final_wflow %>%
  fit(data = train_train) %>%
  extract_fit_parsnip() %>%
  vip(geom = "point")
```
### XGB Performance on Training Test Set
#### XGB Accuracy Measured on Test Set
```{r}
xgb_final_res <- last_fit(xgb_final_wflow, train_split)
xgb_final_res
xgb_final_metrics <- collect_metrics(xgb_final_res)
xgb_final_metrics
```
#### XGB AUC on Test Set (within train)
```{r}
xgb_final_res %>%
  collect_predictions() %>%
  roc_curve(truth = survived, .pred_1, event_level = "second") %>%
  ggplot(aes(x = 1 - specificity, y = sensitivity)) +
  geom_line(linewidth = 1.5, color = "midnightblue") +
  geom_abline(
    lty = 2, alpha = 0.5,
    color = "gray50",
    linewidth = 1.2
  )
```
```{r}
xgb_test_predictions <- collect_predictions(xgb_final_res)
head(xgb_test_predictions)
```
### XGB Confusion Matrix
```{r}
xgb_test_predictions %>%
  conf_mat(survived, .pred_class) %>%
  autoplot(type = "heatmap")
```
## Neural Net
### NN Model
```{r, nn_model}
nnet_model <-
  mlp(hidden_units = tune(), penalty = tune(), epochs = tune()) %>%
  set_engine("nnet", MaxNWts = 2600) %>%
  set_mode("classification")
nnet_model %>% translate()
```
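`MaxNWts = 2600` raises nnet's default cap on the number of weights. A rough arithmetic sketch (my addition): a single-hidden-layer net has `(p + 1) * H + (H + 1) * K` weights for `p` predictors, `H` hidden units and `K` output units (1 for binary classification in nnet), and `hidden_units()` tunes over 1-10 by default:

```{r, nn_weights_check}
# largest predictor count that still fits under MaxNWts = 2600
# with the biggest tuned network (H = 10 hidden units, K = 1 output)
H <- 10
K <- 1
max_p <- (2600 - (H + 1) * K) / H - 1
max_p
```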
### NN Workflow
```{r, nn_wflow}
nnet_wflow <- workflow() %>%
  add_model(nnet_model) %>%
  add_recipe(recipe_base)
```
### NN Parameters
```{r, nn_params}
nnet_grid <- grid_latin_hypercube(
  hidden_units(),
  penalty(),
  epochs()
)
head(nnet_grid)
```
### NN Hyper-Parameter Tuning
```{r, nn_tuning}
# nnet_folds <- vfold_cv(train_train, strata = survived)
# nnet_folds
cl <- parallel::makePSOCKcluster(cores - 1)
doParallel::registerDoParallel(cl)
set.seed(1234)
nnet_tuning_result <- tune_grid(
  nnet_wflow,
  resamples = folds,
  grid = nnet_grid,
  control = control_grid(save_pred = TRUE, save_workflow = TRUE)
)
nnet_tuning_result
parallel::stopCluster(cl)
```
### NN Best Parameters and Finalise Workflow
```{r, nn_best_params}
show_best(nnet_tuning_result, metric = "accuracy")
nn_best_params <- select_best(nnet_tuning_result, metric = "accuracy")
nn_best_params
nnet_final_wflow <- finalize_workflow(
  nnet_wflow,
  nn_best_params
)
nnet_final_wflow
```
```{r, nn_final_train}
nnet_final_wflow %>%
  fit(data = train_train) %>%
  extract_fit_parsnip() %>%
  vip(geom = "point")
```
### NN Accuracy - Train/Test Set
```{r}
nnet_tuning_metrics <- collect_metrics(nnet_tuning_result)
nnet_tuning_metrics
nnet_final_res <- last_fit(nnet_final_wflow, train_split)
nnet_final_res
nnet_final_metrics <- collect_metrics(nnet_final_res)
nnet_final_metrics
```
### NN AUC
```{r}
nnet_final_res %>%
  collect_predictions() %>%
  roc_curve(truth = survived, .pred_1, event_level = "second") %>%
  ggplot(aes(x = 1 - specificity, y = sensitivity)) +
  geom_line(linewidth = 1.5, color = "midnightblue") +
  geom_abline(
    lty = 2, alpha = 0.5,
    color = "gray50",
    linewidth = 1.2
  )
```
### NN Predictions on Train/Test Set
```{r}
nnet_test_predictions <- nnet_final_res %>%
  collect_predictions()
head(nnet_test_predictions)
```
### NN Confusion Matrix
```{r, NN_confusion_matrix}
nnet_test_predictions %>%
  conf_mat(survived, .pred_class) %>%
  autoplot(type = "heatmap")
```
## Stack Models
### Stack Recipe
```{r, stack_recipe}
# identical in content to recipe_base, repeated so the stacking section is self-contained
recipe_stack <-
  recipe(survived ~ ., data = train_train) %>%
  update_role(passenger_id, name, surname, ticket, cabin, new_role = "ID") %>%
  step_impute_knn(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_factor2string(all_nominal_predictors()) %>%
  step_zv(all_predictors()) %>%
  step_pca()
recipe_stack
recipe_stack_trained <- prep(recipe_stack)
recipe_stack_trained
```
### Stack Controls
```{r, stack_controls}
library(stacks)
stack_ctrl <- control_resamples(save_pred = TRUE, save_workflow = TRUE)
# stack_folds <- vfold_cv(training(train_split), v = 10, strata = "survived")
# candidates must be tune/resample results saved with save_pred = TRUE and
# save_workflow = TRUE; bare workflows like lr_wflow can't be added directly
model_stack <-
  stacks() %>%
  add_candidates(nnet_tuning_result) %>%
  add_candidates(rlr_tuning_result) %>%
  add_candidates(xgb_tuning_result)
```
### Stack Blend
```{r}
cl <- parallel::makePSOCKcluster(cores - 1)
doParallel::registerDoParallel(cl)
set.seed(1234)
ensemble <- blend_predictions(model_stack, penalty = 10^seq(-2, -0.5, length.out = 20))
autoplot(ensemble)
parallel::stopCluster(cl)
```
```{r,ensemble_table}
ensemble
```
### Stack Weights
```{r}
autoplot(ensemble, "weights") +
  geom_text(aes(x = weight + 0.01, label = model), hjust = 0) +
  theme(legend.position = "none")
```
### Fit Member Models
```{r, fit_ensemble}
ensemble <- fit_members(ensemble)
collect_parameters(ensemble, "xgb_tuning_result")
```
### Stack Predict
```{r}
# ensemble_metrics <- metric_set(roc_auc, accuracy)
ensemble_test_predictions <-
  predict(ensemble, train_test) %>%
  bind_cols(train_test)
glimpse(ensemble_test_predictions)
```
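For comparison with the single models (my addition), the ensemble's hard-class accuracy on the same held-out split:

```{r, ensemble_accuracy}
# class-prediction accuracy of the stacked ensemble on the held-out split
ensemble_test_predictions %>%
  accuracy(truth = survived, estimate = .pred_class)
```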
## Join Model Prediction Data
```{r, all_predictions}
all_predictions <-
  lr_test_predictions %>% mutate(model = "LR") %>%
  bind_rows(nnet_test_predictions %>% mutate(model = "NNet")) %>%
  bind_rows(rlr_test_predictions %>% mutate(model = "Reg_LR")) %>%
  bind_rows(rf_test_predictions %>% mutate(model = "RF")) %>%
  bind_rows(xgb_test_predictions %>% mutate(model = "xgb")) %>%
  bind_rows(xgb_usemodel_test_predictions %>% mutate(model = "xgb_usemodel")) %>%
  bind_rows(ensemble_test_predictions %>% mutate(model = "ensemble"))
all_predictions %>% head() %>% knitr::kable()
```
## All Metrics
Ordered by descending accuracy.
```{r, all_metrics}
all_metrics <-
  lr_final_metrics %>% mutate(model = "LR") %>%
  bind_rows(nnet_final_metrics %>% mutate(model = "NNet")) %>%
  bind_rows(rlr_final_metrics %>% mutate(model = "Reg_LR")) %>%
  bind_rows(rf_final_metrics %>% mutate(model = "RF")) %>%
  bind_rows(xgb_final_metrics %>% mutate(model = "xgb")) %>%
  bind_rows(xgb_usemodel_final_metrics %>% mutate(model = "xgb-usemodel"))
all_metrics_table <- all_metrics %>%
  pivot_wider(names_from = .metric, values_from = .estimate) %>%
  arrange(desc(accuracy))
write_rds(all_metrics, "artifacts/all_metrics.rds")
all_metrics_table %>% knitr::kable(digits = 3)
```
and a graph:
```{r, graph_all_metrics}
all_metrics %>%
  filter(.metric == "accuracy") %>%
  select(model, accuracy = .estimate) %>%
  ggplot(aes(model, accuracy)) +
  geom_col()
```
## Final Submission
```{r, predict_test}
test_proc <- all_proc %>%
  filter(train_test == "test")
# LR ----
final_test_pred_LR <-
  lr_wflow %>%
  fit(train_proc_adj_tbl) %>%
  predict(new_data = test_proc) %>%
  bind_cols(test_proc)
submission_LR <- final_test_pred_LR %>%
  select(PassengerID = passenger_id, Survived = .pred_class)
write_csv(submission_LR, "titanic_submission_LR.csv")
# RLR ----
final_test_pred_RLR <-
  rlr_final_wflow %>%
  fit(train_proc_adj_tbl) %>%
  predict(new_data = test_proc) %>%
  bind_cols(test_proc)
submission_RLR <- final_test_pred_RLR %>%
  select(PassengerID = passenger_id, Survived = .pred_class)
write_csv(submission_RLR, "titanic_submission_RLR.csv")
# RF ----
final_test_pred_RF <-
  rf_final_wflow %>%
  fit(train_proc_adj_tbl) %>%
  predict(new_data = test_proc) %>%
  bind_cols(test_proc)
submission_RF <- final_test_pred_RF %>%
  select(PassengerID = passenger_id, Survived = .pred_class)
write_csv(submission_RF, "titanic_submission_RF.csv")
# NN ----
final_test_pred_NN <-
  nnet_final_wflow %>%
  fit(train_proc_adj_tbl) %>%
  predict(new_data = test_proc) %>%
  bind_cols(test_proc)
submission_NN <- final_test_pred_NN %>%
  select(PassengerID = passenger_id, Survived = .pred_class)
write_csv(submission_NN, "titanic_submission_NN.csv")
# XGB ----
final_test_pred_xgb <-
  xgb_final_wflow %>%
  fit(train_proc_adj_tbl) %>%
  predict(new_data = test_proc) %>%
  bind_cols(test_proc)
submission_xgb <- final_test_pred_xgb %>%
  select(PassengerID = passenger_id, Survived = .pred_class)
write_csv(submission_xgb, "titanic_submission_xgb.csv")
# ensemble ----
final_test_pred_ens <-
  ensemble %>% # already fitted via fit_members(), so no fit() here
  predict(new_data = test_proc) %>%
  bind_cols(test_proc)
submission_ens <- final_test_pred_ens %>%
  select(PassengerID = passenger_id, Survived = .pred_class)
write_csv(submission_ens, "titanic_submission_ens.csv")
```
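The workflow-based submission blocks above are near-identical, so a small helper could replace them; a sketch (my addition, not evaluated here):

```{r, submission_helper}
#| eval: false
# fit a finalised workflow on all training data, predict the Kaggle test
# set and write the submission file
make_submission <- function(wflow, file) {
  wflow %>%
    fit(train_proc_adj_tbl) %>%
    predict(new_data = test_proc) %>%
    bind_cols(test_proc) %>%
    select(PassengerID = passenger_id, Survived = .pred_class) %>%
    write_csv(file)
}
# e.g. make_submission(lr_wflow, "titanic_submission_LR.csv")
```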