---  
title:  "Titanic from Kaggle"  
author:  "Stephen Parton"  
date:  "2022-09-01"  
categories:  [code, analysis, titanic]  
website:  
  sidebar:  
    style: "docked"  
    search: true  
    contents:  
      - section: "Data Exploration"  
      - index.qmd  
format:    
  html:   
    theme: litera  
    toc: true  
    toc-title: Contents  
    number-sections: true  
    number-depth: 3  
    code-fold: true  
    code-summary: "Code"  
    code-tools: true  
execute:    
  echo: true  
  warning: false  
  error: false  
  freeze: true  
  cache: true  
---  
 
 {width="215"} 
 
## Summary  
 
This is just a first test with code in a blog using the new Quarto framework! Guess what I am using... yep, Titanic, Kaggle version.

It is not very well structured, as it is pretty much in the order I did it, following the instructions, books and blogs from the expert tidymodels and Quarto teams at RStudio/Posit. All errors belong to me!
 
```{r, packages}  
#| context: setup  
#| include: false  
 
 
library(tidyverse)
library(janitor)
library(skimr)
library(DataExplorer)

library(tidymodels)
library(vip)
library(ggforce)
tidymodels_prefer()
 
```  
 
## Final Kaggle Scores  
 
```{r, kaggle}  
 
kaggle <- tibble(
  Model = c("Logistic Regression",
            "Regularised Logistic Regression",
            "Random Forest-final",
            "Random Forest-initial",
            "XG Boost",
            "Neural Net",
            "Ensemble"),
  Score = c(.76555, .77033, .77751, .78229, .77272, .76794, .77751)
)

kaggle %>% knitr::kable()
```  
 
When all submitted, these gave me a ranking of 1,872 out of 13,000 or so teams, so no grandmaster!

Given how similar the scores above are, it seems the value mainly comes from the feature engineering and selection process (as the experts all seem to say).
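
To put a rough number on that similarity, the spread of the scores above can be computed directly (a quick sketch using the `kaggle` tibble defined earlier):

```{r, score_spread}
# Difference between the best and worst leaderboard scores above
kaggle %>%
  summarise(spread = max(Score) - min(Score))
```

That is barely 1.7 percentage points across all seven submissions.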
 
## Review Data  
 
### Load Some Kaggle Data  
 
 Not the...? Yes, the Titanic again.... 
 
```{r, data}  
#| warning: false  
#| echo: true  
#| message: false  
 
train <- read_csv("data_raw/train.csv", show_col_types = FALSE) %>%
  clean_names() %>%
  mutate(train_test = "train")
test <- read_csv("data_raw/test.csv", show_col_types = FALSE) %>%
  clean_names() %>%
  mutate(train_test = "test")
all <- train %>% bind_rows(test)
 
# colnames(data)  
# cwd()  
 
```  
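
As a quick sanity check, the combined tibble should contain 891 training rows and 418 test rows, matching the Kaggle files:

```{r, row_check}
# Row counts by source; Kaggle's train.csv has 891 rows and test.csv 418
all %>% count(train_test)
```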
 
### Some Initial EDA  
 
 A quick look. 
 
```{r, skim}  
train %>% skim()
```  
 
### Some Initial Wrangling  
 
```{r, wrangle_1}  
all_proc <- all %>%
  mutate(title = str_extract(name, "(\\w)([a-z]+)(\\.)")) %>%
  mutate(pax_type = case_when(
    title %in% c("Miss.", "Ms.", "Mlle.")       ~ "F_unmarried",
    title %in% c("Mme.", "Mrs.")                ~ "F_married",
    title %in% c("Countess.", "Lady.", "Dona.") ~ "F_titled",
    title %in% c("Capt.", "Col.", "Major.")     ~ "Military",
    title %in% c("Dr.", "Rev.")                 ~ "M_Professional",
    title %in% c("Don.", "Jonkheer.", "Sir.")   ~ "M_titled",
    TRUE ~ title
  ),
  surname        = str_extract(name, "(\\w+)(,)"),
  survival       = ifelse(survived == 0, "No", "Yes"),
  ticket_preface = str_extract(ticket, "([:graph:]+)(\\s)"),
  ticket_preface = ifelse(is.na(ticket_preface), "none", ticket_preface),
  cabin_preface  = ifelse(is.na(cabin), "nk", substr(cabin, 1, 1)),
  embarked       = ifelse(is.na(embarked), "S", embarked)
  ) %>%
  # impute missing ages with the median age of each title/class group
  group_by(pax_type, pclass) %>%
  mutate(age = ifelse(is.na(age), median(age, na.rm = TRUE), age)) %>%
  ungroup() %>%
  add_count(ticket, name = "ticket_group") %>%
  mutate(ticket_group = case_when(
    ticket_group == 1 ~ "single",
    ticket_group == 2 ~ "couple",
    TRUE              ~ "group"
  ),
  family_group = as.numeric(sib_sp) + as.numeric(parch) + 1
  ) %>%
  mutate(family_group = factor(
    case_when(
      family_group < 2 ~ "single",
      family_group < 3 ~ "couple",
      TRUE             ~ "family"
    ),
    ordered = TRUE)
  ) %>%
  mutate(age_group = factor(case_when(
    age < 13 ~ "child",
    age < 20 ~ "teen",
    age < 30 ~ "20s",
    age < 40 ~ "30s",
    age < 50 ~ "40s",
    age < 60 ~ "50s",
    TRUE     ~ "60+"
  ),
  ordered = TRUE)
  ) %>%
  mutate(across(where(is.character), as_factor)) %>%
  mutate(pclass   = factor(pclass, levels = c("1", "2", "3")),
         survived = factor(survived)
         ) %>%
  select(-c(title, ticket_preface))

# all_proc %>% glimpse()
 
```  
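
One way to confirm the title extraction and grouping behaved as intended is to tabulate the derived `pax_type` (a quick check, not part of the original flow):

```{r, pax_type_check}
# Frequency of each derived passenger type; the rare titles should have
# been folded into the grouped categories defined above
all_proc %>% count(pax_type, sort = TRUE)
```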
 
### A Bit More EDA  
 
```{r, EDA_1}  
all_proc %>%
  select(-c(name, ticket, cabin, surname, train_test)) %>%
  DataExplorer::plot_bar()
```  
 
```{r, data_explorer1}  
all_proc %>% DataExplorer::plot_histogram(ggtheme = theme_light())
```  
 
### Eyeballing Survival Graphs on Training Data  
 
```{r, eye_ball_survival, fig.height=15}  
#| warning: false  
 
no_f <- all_proc %>%
  filter(train_test == "train") %>%
  select(passenger_id, pclass, sex, embarked, pax_type, ticket_group,
         family_group, age_group, cabin_preface, survival) %>%
  droplevels() %>%
  mutate(across(where(is.factor), ~ factor(.x, ordered = FALSE))) %>%
  pivot_longer(cols = c(pclass:cabin_preface))

g_l <- no_f %>%
  split(.$name) %>%
  map(~ ggplot(., aes(y = value, fill = survival)) +
        geom_bar() +
        ggtitle(.$name) +
        theme_bw() +
        labs(x = NULL, y = NULL) +
        scale_fill_viridis_d(option = "cividis")
  )

library(patchwork)
wrap_plots(g_l, ncol = 2)
 
```  
 
### Split Data Back to Train/Test  
 
```{r, split}  
 
train_proc_adj_tbl <- all_proc %>%
  filter(train_test == "train") %>%
  select(-c(survival))

train_split <- initial_split(train_proc_adj_tbl, strata = survived)

train_train <- training(train_split)
train_test  <- testing(train_split)
 
```  
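
Since the split is stratified on `survived`, the class balance should be nearly identical in both pieces. A quick sketch to verify:

```{r, split_check}
# Survival proportions in each piece; stratified sampling should keep both
# close to the overall rate (roughly 38% survived in the training data)
bind_rows(
  train_train %>% mutate(piece = "train"),
  train_test  %>% mutate(piece = "test")
) %>%
  count(piece, survived) %>%
  group_by(piece) %>%
  mutate(prop = n / sum(n)) %>%
  ungroup()
```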
 
## Recipe-Base  
 
```{r, recipe_base}  
recipe_base <-
  recipe(survived ~ ., data = train_train) %>%
  update_role(passenger_id, name, surname, ticket, cabin, new_role = "ID") %>%
  step_impute_knn(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%
  # note: after step_dummy() no nominal predictors remain, so the next step
  # selects nothing, and step_pca() names no variables, so both are
  # effectively no-ops as written
  step_factor2string(all_nominal_predictors()) %>%
  step_zv(all_predictors()) %>%
  step_pca()
recipe_base
```  
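
To see what the recipe actually produces before it goes into a workflow, it can be prepped and baked on the training data (a sketch; `bake(new_data = NULL)` returns the processed training set):

```{r, recipe_preview}
# Estimate the recipe on train_train and preview the processed columns
recipe_base %>%
  prep() %>%
  bake(new_data = NULL) %>%
  glimpse()
```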
 
### Save Files  
 
```{r, save_rds}  
 
write_rds(all_proc, "artifacts/all_proc.rds")
write_rds(train_split, "artifacts/train_split.rds")
write_rds(recipe_base, "artifacts/recipe_base.rds")

# all_proc <- read_rds("artifacts/all_proc.rds")
# train_split <- read_rds("artifacts/train_split.rds")
# recipe_base <- read_rds("artifacts/recipe_base.rds")
 
```  
 
## Models  
 
### Logistic Regression  
 
#### LR Model Spec  
 
```{r, LR_model}  
lr_spec <-
  logistic_reg() %>%
  set_engine("glm")

lr_spec
 
 
```  
 
#### LR Workflow  
 
```{r, LR_wflow}  
lr_wflow <-
  workflow() %>%
  add_model(lr_spec) %>%
  add_recipe(recipe_base)

lr_wflow
 
```  
 
#### LR Fit Model  
 
```{r, LR_fit}  
 
lr_fit <-
  lr_wflow %>%
  last_fit(train_split)

# lr_fit

lr_final_metrics <- lr_fit %>% collect_metrics()
lr_final_metrics
# show_notes(.Last.tune.result)
```  
 
#### LR Predict  
 
```{r, LR_pred}  
 
 
lr_test_predictions <- lr_fit %>%
  collect_predictions() %>%
  rename(survived_pred = survived) %>%
  bind_cols(train_test)
lr_test_predictions
```  
 
#### LR Performance on validation set  
 
##### AUC Curve  
 
```{r, LR_auc}  
 
 
 
lr_test_predictions %>%
  roc_curve(truth = survived, .pred_1, event_level = "second") %>%
  autoplot()
 
```  
 
##### Confusion Matrix  
 
```{r, LR_confuse}  
 
lr_test_predictions %>%
  conf_mat(survived, .pred_class) %>%
  autoplot(type = "heatmap")
 
```  
 
#### LR Resampling  
 
```{r, LR_resample}  
#| message: false  
#| warning: false  
 
folds <- vfold_cv(train_train, strata = survived, v = 5)
# folds

control <- control_resamples(save_pred = TRUE, save_workflow = TRUE)

cores <- parallel::detectCores()
cl <- parallel::makePSOCKcluster(cores - 1)
# register the cluster, otherwise tune runs sequentially
doParallel::registerDoParallel(cl)

set.seed(1234)
lr_fit_cv <-
  lr_wflow %>%
  fit_resamples(folds, control = control)

# show_best(lr_fit_cv, metric = "accuracy")

# lr_fit_cv
lr_metrics_resample <- collect_metrics(lr_fit_cv)
lr_metrics_resample

parallel::stopCluster(cl)
 
```  
 
The following still needs to be fixed!
 
```{r, LR_preds}  
# lr_param <- extract_parameter_set_dials(lr_spec)

lr_resample_test_predictions <- collect_predictions(lr_fit_cv) %>%
  rename(survived_pred = survived)
# bind_cols(testing(train_split))
lr_resample_test_predictions
 
```  
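
The commented-out `bind_cols()` fails because these predictions come from the assessment folds of `train_train`, not from `testing(train_split)`. A possible fix (a sketch, assuming plain v-fold CV so each row is predicted exactly once) is to join back via the `.row` index that `collect_predictions()` carries:

```{r, lr_preds_fix}
# .row indexes into the data passed to vfold_cv(), i.e. train_train
collect_predictions(lr_fit_cv) %>%
  rename(survived_pred = survived) %>%
  left_join(train_train %>% mutate(.row = row_number()), by = ".row") %>%
  head()
```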
 
```{r, LR_fit2}  
cl <- parallel::makePSOCKcluster(cores - 1)

set.seed(1234)
lm_fit <- lr_wflow %>% fit(data = train_proc_adj_tbl)
extract_recipe(lm_fit, estimated = TRUE)

parallel::stopCluster(cl)
```  
 
## Regularised Logistic Regression - GLMNET  
 
### RLR Model Spec  
 
```{r, rlr_model}  
 
rlr_model <-
  logistic_reg(penalty = tune(), mixture = tune()) %>%
  set_engine("glmnet")
rlr_model
 
```  
 
### RLR Parameter Tuning  
 
```{r, rlr_tuning}  
 
 
rlr_param <- extract_parameter_set_dials(rlr_model)

rlr_grid <- grid_latin_hypercube(
  penalty(),
  mixture(),
  size = 30
)
head(rlr_grid) %>% knitr::kable(digits = 3)
 
```  
 
### RLR Workflow  
 
```{r, rlr_wflow}  
 
rlr_wflow <-
  workflow() %>%
  add_model(rlr_model) %>%
  add_recipe(recipe_base)
rlr_wflow
 
 
```  
 
### RLR Hyper-parameter Tuning  
 
```{r, rlr_cvs}  
 
# rlr_folds <- vfold_cv(training(train_split), strata = survived, v = 10, repeats = 5)
# rlr_folds %>% tidy()

cl <- parallel::makePSOCKcluster(cores - 1)
doParallel::registerDoParallel(cl)

set.seed(234)
rlr_tuning_result <- tune_grid(
  rlr_wflow,
  resamples = folds,
  grid      = rlr_grid,
  control   = control_grid(save_pred = TRUE, save_workflow = TRUE)
)

rlr_tuning_metrics <- collect_metrics(rlr_tuning_result)
head(rlr_tuning_metrics) %>% knitr::kable(digits = 3)

parallel::stopCluster(cl)
```  
 
Review the hyper-parameter tuning results and select the best combination.
 
```{r, rlr_tune}  
 
rlr_tuning_result %>%
  collect_metrics() %>%
  filter(.metric == "accuracy") %>%
  select(mean, penalty, mixture) %>%
  pivot_longer(penalty:mixture,
               values_to = "value",
               names_to = "parameter"
  ) %>%
  ggplot(aes(value, mean, color = parameter)) +
  geom_point(alpha = 0.8, show.legend = FALSE) +
  facet_wrap(~parameter, scales = "free_x") +
  labs(x = NULL, y = "Accuracy")

show_best(rlr_tuning_result, metric = "accuracy")

best_rlr_auc <- select_best(rlr_tuning_result, metric = "accuracy")
best_rlr_auc
 
```  
 
### RLR Predict  
 
```{r, rlr_predict1}  
 
rlr_final_wflow <- finalize_workflow(
  rlr_wflow,
  best_rlr_auc
)

rlr_final_wflow

rlr_final_wflow %>%
  last_fit(train_split) %>%
  extract_fit_parsnip() %>%
  vip(geom = "col")
 
```  
 
```{r, rlr_predict2}  
 
rlr_final_fit <- rlr_final_wflow %>%
  last_fit(train_split)

rlr_final_metrics <- collect_metrics(rlr_final_fit)
rlr_final_metrics %>% knitr::kable()

rlr_test_predictions <- rlr_final_fit %>% collect_predictions()
rlr_test_predictions_all <- rlr_test_predictions %>%
  bind_cols(train_test %>% select(-survived))

glimpse(rlr_test_predictions_all)
 
# rlr_pred <- predict(rlr_final_fit,train_2 )%>%   
#   bind_cols(predict(rlr_final_fit, train_2,type="prob")) %>%   
#   bind_cols(train_2 %>% select(survived))  
#   
# rlr_pred %>%   
#   roc_auc(truth = survived, .pred_1, event_level = "second")  
#   
# rlr_pred %>%   
#   roc_curve(truth = survived, .pred_1,event_level="second") %>%   
#   autoplot()  
#   
#   
# rlr_metrics <- rlr_pred %>%   
# metrics(truth = survived, estimate = .pred_class) %>%   
#   filter(.metric == "accuracy")  
# rlr_metrics  
# survive_rlr_pred <-   
#   augment(survive_lr_fit, train_2)  
# survive_rlr_pred  
 
```  
 
### RLR Confusion Matrix  
 
```{r, rlr_confusion_matrix}  
 
rlr_test_predictions %>%
  conf_mat(survived, .pred_class) %>%
  autoplot(type = "heatmap")
 
 
 
```  
 
## Random Forest  
 
### RF Model Spec - Ranger  
 
```{r, rf_model}  
 
rf_model <-
  rand_forest(
    trees = 1000,
    mtry  = tune(),
    min_n = tune()
  ) %>%
  set_engine("ranger", importance = "permutation") %>%
  set_mode("classification")
 
```  
 
### RF Workflow  
 
```{r, rf_wflow}  
 
rf_wflow <-
  workflow() %>%
  add_model(rf_model) %>%
  add_recipe(recipe_base)
 
```  
 
### RF Tuning - Initial  
 
```{r, rf_tuning}  
 
cl <- parallel::makePSOCKcluster(cores - 1)
doParallel::registerDoParallel(cl)

set.seed(1234)
rf_tuning_result <- tune_grid(
  rf_wflow,
  resamples = folds,
  grid = 20
)
parallel::stopCluster(cl)

rf_tuning_result

rf_tuning_result %>%
  collect_metrics() %>%
  filter(.metric == "accuracy") %>%
  select(mean, min_n, mtry) %>%
  pivot_longer(min_n:mtry) %>%
  ggplot(aes(value, mean, color = name)) +
  geom_point(show.legend = FALSE) +
  facet_wrap(~name, scales = "free_x") +
  labs(x = NULL, y = "Accuracy")
 
```  
 
A bit hard to make much of it, but say mtry between 5 and 40 and min_n between 5 and 30?
 
```{r}  
rf_grid <- grid_regular(
  mtry(range = c(5, 40)),
  min_n(range = c(5, 30)),
  levels = 5
)

rf_grid
```  
 
### RF Graph Results  
 
```{r}  
#| warning: false  
#| echo: true  
#| message: false  
cl <- parallel::makePSOCKcluster(cores - 1)
doParallel::registerDoParallel(cl)

set.seed(1234)
rf_grid_tune <- tune_grid(
  rf_wflow,
  resamples = folds,
  grid = rf_grid
)
rf_grid_tune

parallel::stopCluster(cl)

rf_grid_tune %>%
  collect_metrics() %>%
  filter(.metric == "accuracy") %>%
  mutate(min_n = factor(min_n)) %>%
  ggplot(aes(mtry, mean, color = min_n)) +
  geom_line(alpha = 0.5, linewidth = 1.5) +
  geom_point() +
  labs(y = "Accuracy")
 
```  
 
Well, that's interesting. Let's see what tune thinks is best.
 
```{r}  
rf_best_params <- select_best(rf_grid_tune, metric = "accuracy")
rf_best_params %>% knitr::kable()
 
```  
 
### RF Final Model  
 
```{r}  
rf_final_model <- finalize_model(
  rf_model,
  rf_best_params
)
rf_final_model
 
```  
 
### RF Final Workflow  
 
```{r}  
rf_final_wflow <- finalize_workflow(
  rf_wflow,
  rf_best_params
)

rf_final_wflow
```  
 
### RF Parameter Importance  
 
```{r, rf_vip}  
 
rf_final_wflow %>%
  fit(data = train_proc_adj_tbl) %>%
  extract_fit_parsnip() %>%
  vip(geom = "point")
 
```  
 
### RF Final Fit  
 
```{r, rf_fit}  
 
rf_final_fit <-
  rf_final_wflow %>%
  last_fit(train_split)

rf_final_metrics <- collect_metrics(rf_final_fit)
rf_final_metrics
```  
 
### RF Predict  
 
```{r, rf_predict}  
 
# rf_final_fit <- rf_wflow %>% fit(train_test)
# class(rf_final_fit)

rf_test_predictions <-
  collect_predictions(rf_final_fit)
# fit(rf_final_wflow, train_train) %>%
# predict(rf_final_wflow, new_data = train_test) %>%
# bind_cols(predict(rf_final_wflow, train_test, type = "prob")) %>%
# bind_cols(train_test %>% select(survived))

head(rf_test_predictions)
   
   
```  
 
### RF Performance on Test Set  
 
```{r, rf_perf}  
 
# rf_test_predictions %>%
#   roc_auc(truth = survived, .pred_1, event_level = "second")

rf_metrics_accuracy <- rf_test_predictions %>%
  metrics(truth = survived, estimate = .pred_class) %>%
  filter(.metric == "accuracy")
rf_metrics_accuracy

rf_test_predictions %>%
  roc_curve(truth = survived, .pred_1, event_level = "second") %>%
  autoplot()
```  
 
### RF Confusion Matrix  
 
```{r, rf_confusion_matrix}  
 
rf_test_predictions %>%
  conf_mat(survived, .pred_class) %>%
  autoplot(type = "heatmap")
 
```  
 
## XG Boost - Usemodel  
 
### XGB - Usemodel Library specs  
 
```{r}  
library(usemodels)

use_xgboost(survived ~ .,
            data = train_train,
            verbose = TRUE
)
 
 
```  
 
### XGB - Parameters  
 
 This grid is used for both versions of XG Boost. 
 
```{r, xgb_grid}  
#| warning: false  
#| echo: true  
#| message: false  
 
 
xgb_grid <- grid_latin_hypercube(
  tree_depth(),
  min_n(),
  trees(),
  loss_reduction(),
  sample_size = sample_prop(),
  finalize(mtry(), train_train),
  learn_rate(),
  size = 30
)

head(xgb_grid)
 
 
 
```  
 
```{r, usemodel_scripts}  
 
xgboost_usemodel_recipe <-
  recipe(formula = survived ~ ., data = train_train) %>%
  step_novel(all_nominal_predictors()) %>%
  ## This model requires the predictors to be numeric. The most common
  ## method to convert qualitative predictors to numeric is to create
  ## binary indicator variables (aka dummy variables) from these
  ## predictors. However, for this model, binary indicator variables can be
  ## made for each of the levels of the factors (known as 'one-hot
  ## encoding').
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
  step_zv(all_predictors())

xgboost_usemodel_model <-
  boost_tree(trees = tune(), mtry = tune(), min_n = tune(), tree_depth = tune(),
             learn_rate = tune(), loss_reduction = tune(), sample_size = tune()) %>%
  set_mode("classification") %>%
  set_engine("xgboost")

xgboost_usemodel_wflow <-
  workflow() %>%
  add_recipe(xgboost_usemodel_recipe) %>%
  add_model(xgboost_usemodel_model)

cl <- parallel::makePSOCKcluster(cores - 1)
doParallel::registerDoParallel(cl)

set.seed(1234)
xgboost_usemodel_tune <-
  tune_grid(xgboost_usemodel_wflow, resamples = folds, grid = xgb_grid)

parallel::stopCluster(cl)
 
 
```  
 
### XGB - Usemodel Best Parameter Settings  
 
```{r, xgb_usemodel_para_sel}  
 
xgb_tuning_metrics_usemodel <- collect_metrics(xgboost_usemodel_tune)
xgb_tuning_metrics_usemodel

xgboost_usemodel_tune %>%
  collect_metrics() %>%
  filter(.metric == "accuracy") %>%
  select(mean, mtry:sample_size) %>%
  pivot_longer(mtry:sample_size,
               values_to = "value",
               names_to = "parameter"
  ) %>%
  ggplot(aes(value, mean, color = parameter)) +
  geom_point(alpha = 0.8, show.legend = FALSE) +
  facet_wrap(~parameter, scales = "free_x") +
  labs(x = NULL, y = "Accuracy")
 
```  
 
Now select the best parameters from the above.
 
```{r, xgb_usemodel_select_paras}  
 
show_best(xgboost_usemodel_tune, metric = "accuracy")

xgb_usemodel_best_params <- select_best(xgboost_usemodel_tune, metric = "accuracy")
xgb_usemodel_best_params

xgb_usemodel_final_wflow <- finalize_workflow(
  xgboost_usemodel_wflow,
  xgb_usemodel_best_params
)

xgb_usemodel_final_wflow
 
 
```  
 
### XGB - Usemodel Parameter Ranking - VIP  
 
```{r, xgb_usemodel_vip}  
 
xgb_usemodel_final_wflow %>%
  fit(data = train_train) %>%
  extract_fit_parsnip() %>%
  vip(geom = "point")
```  
 
### XGB - Usemodel Performance  
 
#### XGB - Usemodel Accuracy Measured on Test Set  
 
```{r, xgb_usemodel_final_metrics}  
 
cl <- parallel::makePSOCKcluster(cores - 1)
doParallel::registerDoParallel(cl)

set.seed(1234)
xgb_usemodel_final_res <- last_fit(xgb_usemodel_final_wflow, train_split)
xgb_usemodel_final_res
xgb_usemodel_final_metrics <- collect_metrics(xgb_usemodel_final_res)
xgb_usemodel_final_metrics

parallel::stopCluster(cl)
 
```  
 
#### XGB - Usemodel AUC on Test Set (within train)  
 
```{r, xgb_usemodel_auc}  
 
xgb_usemodel_final_res %>%
  collect_predictions() %>%
  roc_curve(truth = survived, .pred_1, event_level = "second") %>%
  ggplot(aes(x = 1 - specificity, y = sensitivity)) +
  geom_line(linewidth = 1.5, color = "midnightblue") +
  geom_abline(
    lty = 2, alpha = 0.5,
    color = "gray50",
    linewidth = 1.2
  )
 
```  
 
```{r, xgb_usemodel_preds}

xgb_usemodel_test_predictions <- collect_predictions(xgb_usemodel_final_res)
head(xgb_usemodel_test_predictions)
 
```  
 
### XGB - Usemodel Confusion Matrix  
 
```{r}  
xgb_usemodel_test_predictions %>%
  conf_mat(survived, .pred_class) %>%
  autoplot(type = "heatmap")
 
 
```  
 
## XG Boost - Base Recipe  
 
### XGB Model Spec  
 
```{r, xgb_model}  
 
xgb_model <-
  boost_tree(
    trees = tune(),
    tree_depth = tune(),
    min_n = tune(),
    loss_reduction = tune(),
    sample_size = tune(),
    mtry = tune(),
    learn_rate = tune()) %>%
  set_engine("xgboost") %>%
  set_mode("classification")

xgb_model
 
```  
 
### XGB Workflow  
 
```{r, xgb_wflow}  
 
xgb_wflow <-
  workflow() %>%
  add_model(xgb_model) %>%
  add_recipe(recipe_base)
 
 
 
```  
 
### XGB Hyper-Parameter Tuning  
 
```{r}  
# xgb_folds <- vfold_cv(training(train_split), strata = survived)
# xgb_folds

set.seed(1234)
cl <- parallel::makePSOCKcluster(cores - 1)
doParallel::registerDoParallel(cl)

xgb_tuning_result <- tune_grid(
  xgb_wflow,
  resamples = folds,
  grid      = xgb_grid,
  control   = control_grid(save_pred = TRUE, save_workflow = TRUE)
)
xgb_tuning_result

parallel::stopCluster(cl)
```  
 
```{r}  
xgb_tuning_metrics <- collect_metrics(xgb_tuning_result)
xgb_tuning_metrics

xgb_tuning_result %>%
  collect_metrics() %>%
  filter(.metric == "accuracy") %>%
  select(mean, mtry:sample_size) %>%
  pivot_longer(mtry:sample_size,
               values_to = "value",
               names_to = "parameter"
  ) %>%
  ggplot(aes(value, mean, color = parameter)) +
  geom_point(alpha = 0.8, show.legend = FALSE) +
  facet_wrap(~parameter, scales = "free_x") +
  labs(x = NULL, y = "Accuracy")
```  
 
#### XGB Best Parameters then Finalise Workflow  
 
```{r}  
show_best(xgb_tuning_result, metric = "accuracy")

xgb_best_params <- select_best(xgb_tuning_result, metric = "accuracy")
xgb_best_params

xgb_final_wflow <- finalize_workflow(
  xgb_wflow,
  xgb_best_params
)

xgb_final_wflow
 
```  
 
```{r}  
xgb_final_wflow %>%
  fit(data = train_train) %>%
  extract_fit_parsnip() %>%
  vip(geom = "point")
```  
 
### XGB Performance on Training Test Set  
 
#### XGB Accuracy Measured on Test Set  
 
```{r}  
xgb_final_res <- last_fit(xgb_final_wflow, train_split)
xgb_final_res
xgb_final_metrics <- collect_metrics(xgb_final_res)
xgb_final_metrics
 
```  
 
#### XGB AUC on Test Set (within train)  
 
```{r}  
xgb_final_res %>%
  collect_predictions() %>%
  roc_curve(truth = survived, .pred_1, event_level = "second") %>%
  ggplot(aes(x = 1 - specificity, y = sensitivity)) +
  geom_line(linewidth = 1.5, color = "midnightblue") +
  geom_abline(
    lty = 2, alpha = 0.5,
    color = "gray50",
    linewidth = 1.2
  )
 
```  
 
```{r}  
xgb_test_predictions <- collect_predictions(xgb_final_res)
head(xgb_test_predictions)
 
```  
 
### XGB Confusion Matrix  
 
```{r}  
xgb_test_predictions %>%
  conf_mat(survived, .pred_class) %>%
  autoplot(type = "heatmap")
 
 
```  
 
## Neural Net  
 
### NN Model  
 
```{r, nn_model}  
 
nnet_model <-
  mlp(hidden_units = tune(), penalty = tune(), epochs = tune()) %>%
  set_engine("nnet", MaxNWts = 2600) %>%
  set_mode("classification")

nnet_model %>% translate()
 
```  
 
### NN Workflow  
 
```{r, nn_wflow}  
 
nnet_wflow <- workflow() %>%
  add_model(nnet_model) %>%
  add_recipe(recipe_base)
    
 
```  
 
### NN Parameters  
 
```{r, nn_params}  
 
nnet_grid <- grid_latin_hypercube(
  hidden_units(),
  penalty(),
  epochs()
)

head(nnet_grid)
 
```  
 
### NN Hyper-Parameter Tuning  
 
```{r, nn_tuning}  
 
# nnet_folds <- vfold_cv(train_train, strata = survived)
# nnet_folds

cl <- parallel::makePSOCKcluster(cores - 1)
doParallel::registerDoParallel(cl)

set.seed(1234)
nnet_tuning_result <- tune_grid(
  nnet_wflow,
  resamples = folds,
  grid      = nnet_grid,
  control   = control_grid(save_pred = TRUE, save_workflow = TRUE)
)
nnet_tuning_result

parallel::stopCluster(cl)
 
```  
 
### NN Best Parameters and Finalise Workflow  
 
```{r, nn_best_params}  
 
show_best(nnet_tuning_result, metric = "accuracy")

nn_best_params <- select_best(nnet_tuning_result, metric = "accuracy")
nn_best_params

nnet_final_wflow <- finalize_workflow(
  nnet_wflow,
  nn_best_params
)

nnet_final_wflow
 
```  
 
```{r, nn_final_train}  
 
nnet_final_wflow %>%
  fit(data = train_train) %>%
  extract_fit_parsnip() %>%
  vip(geom = "point")
 
```  
 
### NN Accuracy - Train/Test Set  
 
```{r}  
nnet_tuning_metrics <- collect_metrics(nnet_tuning_result)
nnet_tuning_metrics

nnet_final_res <- last_fit(nnet_final_wflow, train_split)
nnet_final_res
nnet_final_metrics <- collect_metrics(nnet_final_res)
nnet_final_metrics
 
 
```  
 
### NN AUC  
 
```{r}  
nnet_final_res %>%
  collect_predictions() %>%
  roc_curve(truth = survived, .pred_1, event_level = "second") %>%
  ggplot(aes(x = 1 - specificity, y = sensitivity)) +
  geom_line(linewidth = 1.5, color = "midnightblue") +
  geom_abline(
    lty = 2, alpha = 0.5,
    color = "gray50",
    linewidth = 1.2
  )
 
```  
 
### NN Predictions on Train/Test Set  
 
```{r}  
nnet_test_predictions <- nnet_final_res %>%
  collect_predictions()
head(nnet_test_predictions)
```  
 
### NN Confusion Matrix  
 
```{r, NN_confusion_matrix}  
 
nnet_test_predictions %>%
  conf_mat(survived, .pred_class) %>%
  autoplot(type = "heatmap")
 
```  
 
## Stack Models  
 
### Stack Recipe  
 
```{r, stack_recipe}  
 
recipe_stack <-
  recipe(survived ~ ., data = train_train) %>%
  update_role(passenger_id, name, surname, ticket, cabin, new_role = "ID") %>%
  step_impute_knn(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_factor2string(all_nominal_predictors()) %>%
  step_zv(all_predictors()) %>%
  step_pca()
recipe_stack

recipe_stack_trained <- prep(recipe_stack)
recipe_stack_trained
 
 
 
```  
 
### Stack Controls  
 
```{r, stack_controls}  
 
 
stack_ctrl <- control_resamples(save_pred = TRUE, save_workflow = TRUE)
# stack_folds <- vfold_cv(training(train_split), v = 10, strata = "survived")

library(stacks)

model_stack <-
  stacks() %>%
  # add_candidates(lr_wflow) %>%
  # add_candidates(rf_wflow) %>%
  add_candidates(nnet_tuning_result) %>%
  add_candidates(rlr_tuning_result) %>%
  add_candidates(xgb_tuning_result)
 
```  
 
### Stack Blend  
 
```{r}  
cl <- parallel::makePSOCKcluster(cores - 1)
doParallel::registerDoParallel(cl)

set.seed(1234)
ensemble <- blend_predictions(model_stack, penalty = 10^seq(-2, -0.5, length = 20))
autoplot(ensemble)

parallel::stopCluster(cl)
```  
 
```{r,ensemble_table}  
 
 ensemble  
 
```  
 
### Stack Weights  
 
```{r}  
autoplot(ensemble, "weights") +
  geom_text(aes(x = weight + 0.01, label = model), hjust = 0) +
  theme(legend.position = "none")
 
 
```  
 
### Fit Member Models  
 
```{r, fit_ensemble}  
 
ensemble <- fit_members(ensemble)
collect_parameters(ensemble, "xgb_tuning_result")
 
```  
 
### Stack Predict  
 
```{r}  
# ensemble_metrics <- metric_set(roc_auc, accuracy)

ensemble_test_predictions <-
  predict(ensemble, train_test) %>%
  bind_cols(train_test)

# ensemble_test_predictions <- ensemble_test_predictions %>%
#   mutate(.pred_class = as.numeric(.pred_class)) %>%
#   mutate(survived = as.numeric(survived))
#
# ensemble_test_predictions <- ensemble_test_predictions %>%
#   mutate(roc = roc_auc(truth = survived, estimate = .pred_class))

glimpse(ensemble_test_predictions)
 
```  
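
The commented-out metric code above could be replaced with class-probability predictions from the stack (a sketch; `stacks` supports `type = "prob"` for classification):

```{r, ensemble_auc}
# Probability predictions from the fitted ensemble on the held-out split,
# scored the same way as the individual models
predict(ensemble, train_test, type = "prob") %>%
  bind_cols(train_test %>% select(survived)) %>%
  roc_auc(truth = survived, .pred_1, event_level = "second")
```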
 
## Join Model Prediction Data  
 
```{r, all_predictions}  
 
all_predictions <-
  lr_test_predictions %>% mutate(model = "LR") %>%
  bind_rows(nnet_test_predictions %>% mutate(model = "NNet")) %>%
  bind_rows(rlr_test_predictions %>% mutate(model = "Reg_LR")) %>%
  bind_rows(rf_test_predictions %>% mutate(model = "RF")) %>%
  bind_rows(xgb_test_predictions %>% mutate(model = "xgb")) %>%
  bind_rows(xgb_usemodel_test_predictions %>% mutate(model = "xgb_usemodel")) %>%
  bind_rows(ensemble_test_predictions %>% mutate(model = "ensemble"))

all_predictions %>% head() %>% knitr::kable()
 
 
 
```  
 
## All Metrics  
 
Ordered by descending accuracy.
 
```{r, all_metrics}  
 
all_metrics <-
  lr_final_metrics %>% mutate(model = "LR") %>%
  bind_rows(nnet_final_metrics %>% mutate(model = "NNet")) %>%
  bind_rows(rlr_final_metrics %>% mutate(model = "Reg_LR")) %>%
  bind_rows(rf_final_metrics %>% mutate(model = "RF")) %>%
  bind_rows(xgb_final_metrics %>% mutate(model = "xgb")) %>%
  bind_rows(xgb_usemodel_final_metrics %>% mutate(model = "xgb-usemodel"))

all_metrics_table <- all_metrics %>%
  pivot_wider(names_from = .metric, values_from = .estimate) %>%
  arrange(desc(accuracy))

write_rds(all_metrics, "artifacts/all_metrics.rds")

all_metrics_table %>% knitr::kable(digits = 3)
 
 
 
```  
 
 and a graph: 
 
```{r, graph_all_metrics}  
 
all_metrics %>%
  filter(.metric == "accuracy") %>%
  select(model, accuracy = .estimate) %>%
  ggplot(aes(model, accuracy)) +
  geom_col()
 
```  
 
## Final Submission  
 
```{r, predict_test}  
 
# all_predictions %>%
#   distinct(model)

test_proc <- all_proc %>%
  filter(train_test == "test")

# LR ----
final_test_pred_LR <-
  lr_wflow %>%
  fit(train_proc_adj_tbl) %>%
  predict(new_data = test_proc) %>%
  bind_cols(test_proc)

submission_LR <- final_test_pred_LR %>%
  select(PassengerId = passenger_id, Survived = .pred_class)

write_csv(submission_LR, "titanic_submission_LR.csv")

# RLR ----
final_test_pred_RLR <-
  rlr_final_wflow %>%
  fit(train_proc_adj_tbl) %>%
  predict(new_data = test_proc) %>%
  bind_cols(test_proc)

submission_RLR <- final_test_pred_RLR %>%
  select(PassengerId = passenger_id, Survived = .pred_class)

write_csv(submission_RLR, "titanic_submission_RLR.csv")

# RF ----
final_test_pred_RF <-
  rf_final_wflow %>%
  fit(train_proc_adj_tbl) %>%
  predict(new_data = test_proc) %>%
  bind_cols(test_proc)

submission_RF <- final_test_pred_RF %>%
  select(PassengerId = passenger_id, Survived = .pred_class)

write_csv(submission_RF, "titanic_submission_RF.csv")

# NN ----
final_test_pred_NN <-
  nnet_final_wflow %>%
  fit(train_proc_adj_tbl) %>%
  predict(new_data = test_proc) %>%
  bind_cols(test_proc)

submission_NN <- final_test_pred_NN %>%
  select(PassengerId = passenger_id, Survived = .pred_class)

write_csv(submission_NN, "titanic_submission_NN.csv")

# XGB -----
final_test_pred_xgb <-
  xgb_final_wflow %>%
  fit(train_proc_adj_tbl) %>%
  predict(new_data = test_proc) %>%
  bind_cols(test_proc)

submission_xgb <- final_test_pred_xgb %>%
  select(PassengerId = passenger_id, Survived = .pred_class)

write_csv(submission_xgb, "titanic_submission_xgb.csv")

# ensemble -----
final_test_pred_ens <-
  ensemble %>%
  # the fitted ensemble is used directly; no refit on train_proc_adj_tbl
  predict(new_data = test_proc) %>%
  bind_cols(test_proc)

submission_ens <- final_test_pred_ens %>%
  select(PassengerId = passenger_id, Survived = .pred_class)

write_csv(submission_ens, "titanic_submission_ens.csv")
 
 
 
```
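
The six submission blocks above repeat the same fit/predict/write pattern, so they could be collapsed into a small helper (a sketch; `make_submission()` is a hypothetical function, not part of the original workflow):

```{r, submission_helper}
#| eval: false

# Hypothetical helper: fit a finalised workflow on the full processed
# training data, predict on the Kaggle test set, and write the submission
make_submission <- function(wflow, file) {
  wflow %>%
    fit(train_proc_adj_tbl) %>%
    predict(new_data = test_proc) %>%
    bind_cols(test_proc) %>%
    select(PassengerId = passenger_id, Survived = .pred_class) %>%
    write_csv(file)
}

make_submission(lr_wflow, "titanic_submission_LR.csv")
```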