Library Includes

library(tidyverse)      # General utility packages
library(party)          # Random Forest conditional inference tree utilization
library(missForest)     # Imputation (for both factor and numeric)
library(cowplot)        # Plotting ggplot's side by side
library(corrplot)       # Visual correlation plotting
library(fastDummies)    # Used for dummy coding
library(caret)          # Used for k-fold cross validation
library(ggthemes)       # Provides extra themes for styling ggplots.

Data Loading

# Read the labelled training set and the unlabelled test set (both as tibbles).
train <- read_csv("data/train.csv")
test <- read_csv("data/test.csv")

# The test set has no outcome column; add it as NA so that both sets share the
# same columns and can be stacked into one frame for feature engineering.
test[["Survived"]] <- NA
combi <- rbind(train, test)

Utility Functions

# Writes a submission CSV pairing each test passenger with a prediction.
#
# Args:
#   submissionName: Path of the CSV file to write (defaults to "submit.csv").
#   prediction:     One predicted Survived value per row of the global `test`
#                   data frame, in the same row order.
#
# Reads `test` from the enclosing environment (the function takes no data
# argument). Stops with an error when the number of predictions does not match
# the number of test rows, because data.frame() would otherwise silently
# recycle a shorter vector whose length happens to divide the row count.
createSubmission <- function(submissionName = "submit.csv", prediction) {
  if (NROW(prediction) != nrow(test)) {
    stop(
      "Expected ", nrow(test), " predictions but received ", NROW(prediction), ".",
      call. = FALSE
    )
  }
  submit <- data.frame(PassengerId = test$PassengerId, Survived = prediction)
  write_csv(submit, submissionName)
}

Feature Engineering (Categorical / Factors)

# Passenger names look like "Surname, Title. Given names", so splitting each
# name on commas and periods puts the surname in piece 1 and the title in
# piece 2. strsplit() is already vectorised, so the names are split once and
# both features are read from the same pieces. vapply() (rather than sapply())
# always yields a plain character vector and does not attach the full Name
# strings as element names.
nameParts <- strsplit(combi$Name, split = "[,.]")

# Creating a new Title feature; sub() drops the space left over after the comma.
combi$Title <- vapply(nameParts, function(parts) parts[2], character(1))
combi$Title <- sub(' ', '', combi$Title)

# Combining some rare Title values to simplify our factor counts.
combi$Title[combi$Title %in% c("Mme", "Mlle")] <- "Mlle"
combi$Title[combi$Title %in% c("Capt", "Don", "Major", "Sir")] <- "Sir"
combi$Title[combi$Title %in% c("Dona", "Lady", "the Countess", "Jonkheer")] <- "Lady"
combi$Title <- factor(combi$Title)

# Creating a new Surname feature from the text before the first comma.
combi$Surname <- vapply(nameParts, function(parts) parts[1], character(1))

# Number of relatives travelling with each passenger: siblings/spouses (SibSp)
# plus parents/children (Parch).
relativesOnBoard <- combi$SibSp + combi$Parch

# FamilySize counts those relatives plus the passenger themselves.
combi$FamilySize <- relativesOnBoard + 1

# FamilyOnBoard is a simple 1/0 flag: 1 when the passenger has any relative on board.
combi$FamilyOnBoard <- ifelse(relativesOnBoard > 0, 1, 0)

# Creating a new CabinLetter feature from the first character of Cabin.
# substr() is vectorised and returns NA for NA input, so no per-element
# sapply()/ifelse() wrapper is needed (and the Cabin values are no longer
# carried along as element names).
combi$CabinLetter <- factor(substr(combi$Cabin, 1, 1))

# Keeping track of whether an observation had a Cabin noted at all
combi$HasCabin <- ifelse(is.na(combi$Cabin), 0, 1)

# Creating socioeconomic feature: passenger class followed by the cabin letter,
# with "X" standing in when no cabin was recorded (e.g. "1C", "3X").
socioLetter <- as.character(combi$CabinLetter)
socioLetter[is.na(socioLetter)] <- "X"
combi$SocioEconomic <- paste0(combi$Pclass, socioLetter)

Feature Engineering (Continuous)

Normalization (Standardization)

Binning (Discretization)

# Binning Age into 3 labelled bins (so that future imputation picks a bin label
# rather than a number): under 18 is "Child", 18 up to (not including) 50 is
# "Adult", and 50 or over is "Elder". Missing ages stay NA.
# case_when() evaluates the conditions over the whole column at once and takes
# the first match per row; this replaces the element-by-element sapply() and
# always yields a character vector, even when every age is missing.
combi$Age <- case_when(
  combi$Age < 18 ~ "Child",
  combi$Age < 50 ~ "Adult",
  !is.na(combi$Age) ~ "Elder"
)

# Bar chart of how many passengers fall into each age bin.
ggplot(combi) +
  aes(x = Age) +
  geom_bar(width = 0.2,
           stat = "count") +
  theme_clean(base_size = 10) +
  labs(x = "Age", y = "Frequency")

Data Imputation & Cleanup

# missForest can only consume and predict factor and numeric columns, so every
# text feature that will be handed to the imputation is converted to a factor.
factorFeatures <- c("Age", "Sex", "Embarked", "SocioEconomic")
combi[factorFeatures] <- lapply(combi[factorFeatures], as.factor)
glimpse(combi, width = 105)
Rows: 1,309
Columns: 19
$ PassengerId   <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 2…
$ Survived      <dbl> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0,…
$ Pclass        <dbl> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3, 2, 3, 3, 2, 2, 3, 1, 3, 3, 3,…
$ Name          <chr> "Braund, Mr. Owen Harris", "Cumings, Mrs. John Bradley (Florence Briggs Thayer)"…
$ Sex           <fct> male, female, female, female, male, male, male, male, female, female, female, fe…
$ Age           <fct> Adult, Adult, Adult, Adult, Adult, NA, Elder, Child, Adult, Child, Child, Elder,…
$ SibSp         <dbl> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4, 0, 1, 0, 0, 0, 0, 0, 3, 1, 0,…
$ Parch         <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 5, 0,…
$ Ticket        <chr> "A/5 21171", "PC 17599", "STON/O2. 3101282", "113803", "373450", "330877", "1746…
$ Fare          <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 8.4583, 51.8625, 21.0750, 11.1333, 30.…
$ Cabin         <chr> NA, "C85", NA, "C123", NA, NA, "E46", NA, NA, NA, "G6", "C103", NA, NA, NA, NA, …
$ Embarked      <fct> S, C, S, S, S, Q, S, S, S, C, S, S, S, S, S, S, Q, S, S, C, S, S, Q, S, S, S, C,…
$ Title         <fct> Mr, Mrs, Miss, Mrs, Mr, Mr, Mr, Master, Mrs, Mrs, Miss, Miss, Mr, Mr, Miss, Mrs,…
$ Surname       <chr> "Braund", "Cumings", "Heikkinen", "Futrelle", "Allen", "Moran", "McCarthy", "Pal…
$ FamilySize    <dbl> 2, 2, 1, 2, 1, 1, 1, 5, 3, 2, 3, 1, 1, 7, 1, 1, 6, 1, 2, 1, 1, 1, 1, 1, 5, 7, 1,…
$ FamilyOnBoard <dbl> 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,…
$ CabinLetter   <fct> NA, C, NA, C, NA, NA, E, NA, NA, NA, G, C, NA, NA, NA, NA, NA, NA, NA, NA, NA, D…
$ HasCabin      <dbl> 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,…
$ SocioEconomic <fct> 3X, 1C, 3X, 1C, 3X, 3X, 1E, 3X, 3X, 2X, 3G, 1C, 3X, 3X, 3X, 2X, 3X, 2X, 3X, 3X, …
# Counting, per column, the values that are either NA or an empty string.
vapply(combi, function(column) sum(is.na(column) | column == ""), integer(1)) %>%
  as.data.frame()
                 .
PassengerId      0
Survived       418
Pclass           0
Name             0
Sex              0
Age            263
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin         1014
Embarked         2
Title            0
Surname          0
FamilySize       0
FamilyOnBoard    0
CabinLetter   1014
HasCabin         0
SocioEconomic    0
# Imputation Step: drop the outcome plus the features that are irrelevant or of
# an unsupported data type, and cast to a plain data frame as missForest expects.
# The seed is fixed before the call so the run can be repeated.
set.seed(420)
imputationInput <- combi %>%
  select(-c("Survived", "Name", "Ticket", "Surname", "Cabin", "CabinLetter")) %>%
  as.data.frame()
combi.imp <- missForest(imputationInput)
  missForest iteration 1 in progress...done!
  missForest iteration 2 in progress...done!
  missForest iteration 3 in progress...done!
  missForest iteration 4 in progress...done!
# Out-of-bag imputation error reported by missForest: NRMSE covers the numeric
# features (~4.72%) and PFC covers the factor features (~9.82%).
combi.imp[["OOBerror"]]
     NRMSE        PFC 
0.04723969 0.09826029 
# Merging imputed features back into "combi": these are the imputed columns
# that contained missing values.
for (imputedFeature in c("Age", "Fare", "Embarked")) {
  combi[[imputedFeature]] <- combi.imp$ximp[[imputedFeature]]
}
# Re-counting, per column, the values that are either NA or an empty string to
# confirm what the imputation filled in.
vapply(combi, function(column) sum(is.na(column) | column == ""), integer(1)) %>%
  as.data.frame()
                 .
PassengerId      0
Survived       418
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin         1014
Embarked         0
Title            0
Surname          0
FamilySize       0
FamilyOnBoard    0
CabinLetter   1014
HasCabin         0
SocioEconomic    0
# Bar chart of the Age bins after imputation.
ggplot(combi, aes(x = Age)) +
  geom_bar(stat = "count", width = 0.2) +
  labs(x = "Age", y = "Frequency") +
  theme_clean(base_size = 10)

Dimensionality Reduction

Feature Selection

Feature Extraction

Dummy Coding

# Every feature chosen for dummy coding must be a character or factor column.
# The source columns are dropped once their dummy columns exist, to conserve
# space and to keep the later correlation step fast.
dummySourceFeatures <- c("Age", "Sex", "Embarked", "Title", "SocioEconomic")
combi <- dummy_cols(
  combi,
  select_columns = dummySourceFeatures,
  remove_selected_columns = TRUE
)
glimpse(combi, width = 105)
Rows: 1,309
Columns: 48
$ PassengerId      <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22…
$ Survived         <dbl> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1,…
$ Pclass           <dbl> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3, 2, 3, 3, 2, 2, 3, 1, 3, 3,…
$ Name             <chr> "Braund, Mr. Owen Harris", "Cumings, Mrs. John Bradley (Florence Briggs Thaye…
$ SibSp            <dbl> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4, 0, 1, 0, 0, 0, 0, 0, 3, 1,…
$ Parch            <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 5,…
$ Ticket           <chr> "A/5 21171", "PC 17599", "STON/O2. 3101282", "113803", "373450", "330877", "1…
$ Fare             <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 8.4583, 51.8625, 21.0750, 11.1333, …
$ Cabin            <chr> NA, "C85", NA, "C123", NA, NA, "E46", NA, NA, NA, "G6", "C103", NA, NA, NA, N…
$ Surname          <chr> "Braund", "Cumings", "Heikkinen", "Futrelle", "Allen", "Moran", "McCarthy", "…
$ FamilySize       <dbl> 2, 2, 1, 2, 1, 1, 1, 5, 3, 2, 3, 1, 1, 7, 1, 1, 6, 1, 2, 1, 1, 1, 1, 1, 5, 7,…
$ FamilyOnBoard    <dbl> 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1,…
$ CabinLetter      <fct> NA, C, NA, C, NA, NA, E, NA, NA, NA, G, C, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ HasCabin         <dbl> 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,…
$ Age_Adult        <int> 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1,…
$ Age_Child        <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0,…
$ Age_Elder        <int> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ Sex_female       <int> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,…
$ Sex_male         <int> 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0,…
$ Embarked_C       <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,…
$ Embarked_Q       <int> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,…
$ Embarked_S       <int> 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1,…
$ Title_Col        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ Title_Dr         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ Title_Lady       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ Title_Master     <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ Title_Miss       <int> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,…
$ Title_Mlle       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ Title_Mr         <int> 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,…
$ Title_Mrs        <int> 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,…
$ Title_Ms         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ Title_Rev        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ Title_Sir        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ SocioEconomic_1A <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,…
$ SocioEconomic_1B <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ SocioEconomic_1C <int> 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ SocioEconomic_1D <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ SocioEconomic_1E <int> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ SocioEconomic_1T <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ SocioEconomic_1X <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ SocioEconomic_2D <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,…
$ SocioEconomic_2E <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ SocioEconomic_2F <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ SocioEconomic_2X <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,…
$ SocioEconomic_3E <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ SocioEconomic_3F <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ SocioEconomic_3G <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ SocioEconomic_3X <int> 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,…

Data Splitting & Sampling

# Split the combined frame back into its two original pieces. `train` still
# holds the rows that were stacked first when `combi` was built, so its row
# count marks the boundary; this avoids hard-coding the 891/1309 row indices.
trainRowCount <- nrow(train)
isTrainRow <- seq_len(nrow(combi)) <= trainRowCount
train <- combi[isTrainRow, ]
test <- combi[!isTrainRow, ]

Feature Correlation Analysis

# Correlating the dependent Survived feature with the independent input
# features, after dropping the identifier and text columns.
correlationFeatures <- train %>%
  select(-c(PassengerId, Name, Ticket, Cabin, Surname, CabinLetter))
trainCor <- cor(correlationFeatures)
corrplot(trainCor, type = "upper")

# Examining frequency distribution for all dummy coding values to get a sense of which features
# have the widest usage.
# Column totals of every numeric feature; the dummy-coded ones are then picked
# out by the "_" that separates feature name and level in their column names.
numericTotals <- colSums(train[vapply(train, is.numeric, logical(1))], na.rm = TRUE)
dummyTotals <- numericTotals[grepl("_", names(numericTotals), fixed = TRUE)]

# One row per dummy column, with explicitly named columns. The earlier version
# transposed twice and relied on the value column being called "." as a side
# effect of piping a matrix into data.frame().
trainDummies <- data.frame(
  DummyCode = names(dummyTotals),
  Frequency = unname(dummyTotals)
)

# Plotting the dummy-coded features from most to least used.
ggplot(trainDummies) +
  aes(x = reorder(DummyCode, Frequency), weight = Frequency) +
  geom_bar() +
  coord_flip() +
  theme_clean(base_size = 10) +
  labs(x = "Dummy Code", y = "Frequency")

Model Creation & Tuning

# Repeated k-fold cross validation: the training data is split into 5 folds and
# the whole procedure is repeated 3 times.
train_control <- trainControl(method = "repeatedcv", number = 5, repeats = 3)

# Predictors used by the survival model, kept in one vector so the formula is
# assembled from it rather than typed out term by term.
survivalPredictors <- c(
  "FamilyOnBoard",
  "Title_Miss",
  "Sex_female",
  "Title_Mrs",
  "Fare",
  "HasCabin",
  "Embarked_C",
  "Title_Mr",
  "Sex_male",
  "Age_Child",
  "Pclass",
  "Embarked_S",
  "Age_Adult",
  "Title_Master",
  "SocioEconomic_3X"
)
survivalFormula <- reformulate(survivalPredictors, response = "as.factor(Survived)")

# Conditional inference random forest (1000 trees) tuned through caret.
survivalModel <- train(
  survivalFormula,
  data = train,
  method = "cforest",
  trControl = train_control,
  controls = party::cforest_unbiased(ntree = 1000)
)

Model Scoring & Prediction

# Showing the cross-validated scores of the model (each resample holds out part
# of the train set for validation).
print(survivalModel)
Conditional Inference Random Forest 

891 samples
 15 predictor
  2 classes: '0', '1' 

No pre-processing
Resampling: Cross-Validated (5 fold, repeated 3 times) 
Summary of sample sizes: 713, 713, 713, 713, 712, 713, ... 
Resampling results across tuning parameters:

  mtry  Accuracy   Kappa    
   2    0.8125820  0.5919011
   8    0.8200433  0.6046258
  15    0.8207945  0.6070504

Accuracy was used to select the optimal model using the largest value.
The final value used for the model was mtry = 15.
# Predicting the class of every test passenger with the trained model.
predictions <- predict(survivalModel, newdata = test, type = "raw", OOB = TRUE)

# Writing the predictions out as the submission file.
createSubmission(submissionName = "submit.csv", prediction = predictions)
