25 Ensembles (Titanic Dataset)

```python
# Data manipulation imports
import pandas as pd
import numpy as np

# Model setup imports
from sklearn.model_selection import train_test_split

# Models
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# NEW IMPORT - VotingClassifier
from sklearn.ensemble import VotingClassifier

# Scoring metrics
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, f1_score, \
    precision_score, recall_score, roc_auc_score
```
sklearn allows you to easily create ensembles of multiple models.
We will just be looking at ‘voting’ classifiers today. These allow you to combine ‘conceptually different machine learning classifiers’, so are very flexible.
You may also want to look into stacking models if this is an area you are interested in exploring further.
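If you want a flavour of what stacking looks like, here is a minimal sketch using sklearn’s `StackingClassifier`. This is an illustrative aside rather than part of this chapter; the model settings and the `X_train`/`y_train` names are assumptions based on the setup we do below.

```python
# Illustrative stacking sketch only - this chapter sticks to voting classifiers.
# A StackingClassifier trains a 'final estimator' on the out-of-fold
# predictions of the base models, rather than simply voting.
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

stacking_classifier = StackingClassifier(
    estimators=[
        ('dt', DecisionTreeClassifier(max_depth=6, random_state=42)),
        ('knn', KNeighborsClassifier(n_neighbors=7))
    ],
    final_estimator=LogisticRegression(),
    cv=5  # folds used to generate the out-of-fold predictions
)

# Once X_train and y_train exist (see below), it behaves like any other model:
# stacking_classifier.fit(X_train, y_train)
# stacking_classifier.predict(X_validate)
```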
The main new import is `VotingClassifier` from the `sklearn.ensemble` module.
Let’s import our processed Titanic dataset and split it into training, validation and testing datasets.
```python
try:
    data = pd.read_csv("data/processed_data.csv")
except FileNotFoundError:
    # Download processed data
    address = 'https://raw.githubusercontent.com/MichaelAllen1966/' + \
        '1804_python_healthcare/master/titanic/data/processed_data.csv'
    data = pd.read_csv(address)

    # Create a data subfolder if one does not already exist
    import os
    data_directory = './data/'
    if not os.path.exists(data_directory):
        os.makedirs(data_directory)

    # Save data
    data.to_csv(data_directory + 'processed_data.csv', index=False)
```
```python
data = data.astype(float)

# Drop PassengerId (axis=1 indicates we are removing a column rather than a row)
# We drop passenger ID as it is not original data
data.drop('PassengerId', inplace=True, axis=1)

X = data.drop('Survived', axis=1)  # X = all 'data' except the 'Survived' column
y = data['Survived']               # y = 'Survived' column from 'data'

feature_names = X.columns.tolist()
```
```python
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
X_train, X_validate, y_train, y_validate = train_test_split(
    X_train_val, y_train_val, test_size=0.2, random_state=42)

print(f"Training Dataset Samples: {len(X_train)}")
print(f"Validation Dataset Samples: {len(X_validate)}")
print(f"Testing Dataset Samples: {len(X_test)}")
```

```
Training Dataset Samples: 569
Validation Dataset Samples: 143
Testing Dataset Samples: 179
```
Let’s first set up a function that lets us quickly pull back results for different machine learning models. It takes a model as input and outputs a table of metrics, allowing us to quickly compare and rank models.
```python
def fit_train(name="XGBoost",
              X_train=X_train, X_validate=X_validate,
              y_train=y_train, y_validate=y_validate,
              model=XGBClassifier(random_state=42)
              ):
    """Fit the given model and return a one-row DataFrame of metrics."""
    model.fit(X_train, y_train)

    y_pred_train = model.predict(X_train)
    y_pred_val = model.predict(X_validate)

    tn, fp, fn, tp = confusion_matrix(y_validate, y_pred_val, labels=[0, 1]).ravel()

    return pd.DataFrame({
        'Accuracy (training)': np.mean(y_pred_train == y_train),
        'Accuracy (validation)': np.mean(y_pred_val == y_validate),
        'Precision (validation)': precision_score(y_validate, y_pred_val, average='macro'),
        'Recall (validation)': recall_score(y_validate, y_pred_val, average='macro'),
        "AUC": roc_auc_score(y_validate, y_pred_val),
        "f1": f1_score(y_validate, y_pred_val, average='macro'),
        "FP": fp,
        "FN": fn
    }, index=[name]).round(3)
```
Let’s first just train an XGBoost model to get an idea of the performance that can be achieved on this dataset.
```python
clf1 = XGBClassifier(random_state=42)
results_df = fit_train(model=clf1)
results_df
```
|  | Accuracy (training) | Accuracy (validation) | Precision (validation) | Recall (validation) | AUC | f1 | FP | FN |
|---|---|---|---|---|---|---|---|---|
| XGBoost | 0.979 | 0.797 | 0.788 | 0.786 | 0.786 | 0.787 | 14 | 15 |
Let’s also train a decision tree for comparison.
```python
clf2 = DecisionTreeClassifier(max_depth=6, random_state=42)
results_df = pd.concat([results_df, fit_train(model=clf2, name="Decision Tree")])
```
25.1 Creating an Ensemble: Using VotingClassifier
First, let’s try creating an ensemble of these two models.
We pass in a list containing a tuple per model; each tuple needs a name for the model and the model object itself.
Here, we’ve chosen ‘hard’ voting, which means it just looks at the prediction from each model and uses the majority vote.
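To make the mechanics concrete, here is a tiny illustration, using made-up predictions, of the majority vote that hard voting performs. (With an even number of models, sklearn breaks ties in favour of the class that comes first in ascending order.)

```python
# Made-up 0/1 predictions from three models for four passengers -
# purely to illustrate what 'hard' voting does under the hood
predictions = np.array([
    [1, 0, 1, 0],  # model A
    [1, 1, 0, 0],  # model B
    [0, 1, 1, 0],  # model C
])

# Majority vote per passenger: for 0/1 labels, class 1 wins
# when more than half of the models predict 1
majority_vote = (predictions.mean(axis=0) > 0.5).astype(int)
print(majority_vote)  # [1 1 1 0]
```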
```python
voting_classifier_1 = VotingClassifier(
    estimators=[('xgb', clf1), ('dt', clf2)],
    voting='hard')
```
We then just use our `fit_train` function on this, appending the results to the end of our existing `results_df`.

The output of `VotingClassifier` is a model object, just as if we’d created a `RandomForestClassifier()` or `XGBClassifier()`. This means we can use all of the normal methods like `.fit()` and `.predict()`.
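For example, we could fit and predict with the ensemble directly (a quick sketch of the interface; below we use `fit_train` instead):

```python
# Fit and predict with the ensemble directly, exactly as we
# would with any single sklearn-style model
voting_classifier_1.fit(X_train, y_train)
y_pred_val = voting_classifier_1.predict(X_validate)
print(f"Validation accuracy: {np.mean(y_pred_val == y_validate):.3f}")
```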
```python
results_df = pd.concat(
    [results_df,
     fit_train(model=voting_classifier_1, name="DT, XGB: hard")]
)
results_df
```
|  | Accuracy (training) | Accuracy (validation) | Precision (validation) | Recall (validation) | AUC | f1 | FP | FN |
|---|---|---|---|---|---|---|---|---|
| XGBoost | 0.979 | 0.797 | 0.788 | 0.786 | 0.786 | 0.787 | 14 | 15 |
| Decision Tree | 0.886 | 0.818 | 0.814 | 0.800 | 0.800 | 0.805 | 10 | 16 |
| DT, XGB: hard | 0.898 | 0.832 | 0.836 | 0.808 | 0.808 | 0.817 | 7 | 17 |
25.1.1 Working with more classifiers
Now let’s try this with some additional models.
We’re going to use some additional models that require the data to be standardised, so let’s do that first.
25.1.2 Standardisation
```python
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

# Fit the scaler on the training set only, then apply the same
# scaling to the training, validation and test sets
X_train_standardised = sc.fit_transform(X_train)
X_validate_standardised = sc.transform(X_validate)
X_test_standardised = sc.transform(X_test)
```
25.1.3 Creating Additional Models
```python
clf3 = KNeighborsClassifier(n_neighbors=7)

clf4 = SVC(kernel='rbf', probability=True)

clf5 = LogisticRegression()
```
25.1.4 A more complex voting classifier
```python
voting_classifier_2 = VotingClassifier(
    estimators=[
        ('XGBoost', clf1),
        ('Decision Tree', clf2),
        ('K-Nearest Neighbours', clf3),
        ('SVC', clf4),
        ('Logistic Regression', clf5)
    ],
    voting='hard')
```
Let’s now just append our results and view our updated table.
```python
results_df = pd.concat([
    results_df,
    fit_train(
        X_train=X_train_standardised,
        X_validate=X_validate_standardised,
        model=voting_classifier_2,
        name="DT, XGBoost, KNN, LogReg + SVC: hard")
])
results_df
```
|  | Accuracy (training) | Accuracy (validation) | Precision (validation) | Recall (validation) | AUC | f1 | FP | FN |
|---|---|---|---|---|---|---|---|---|
| XGBoost | 0.979 | 0.797 | 0.788 | 0.786 | 0.786 | 0.787 | 14 | 15 |
| Decision Tree | 0.886 | 0.818 | 0.814 | 0.800 | 0.800 | 0.805 | 10 | 16 |
| DT, XGB: hard | 0.898 | 0.832 | 0.836 | 0.808 | 0.808 | 0.817 | 7 | 17 |
| DT, XGBoost, KNN, LogReg + SVC: hard | 0.880 | 0.797 | 0.788 | 0.782 | 0.782 | 0.785 | 13 | 16 |
25.2 Hard and Soft Voting
We previously used the ‘hard’ voting parameter, which looks at the predicted class from each classifier and takes the majority vote.

‘Soft’ voting instead looks at the predicted probabilities from each classifier, averages them, and predicts the class with the highest average probability.

This does mean that each model that is passed in must have a `.predict_proba()` method, which most do.
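As a quick illustration with made-up probabilities, this is the averaging that soft voting performs for two models:

```python
# Made-up probabilities of class 1 from two models for three
# passengers - purely to illustrate what 'soft' voting does
proba_model_a = np.array([0.9, 0.4, 0.2])
proba_model_b = np.array([0.3, 0.8, 0.1])

# Average the probabilities, then predict the class with the
# highest average probability (class 1 where the average > 0.5)
average_proba = (proba_model_a + proba_model_b) / 2
soft_vote = (average_proba > 0.5).astype(int)
print(average_proba)  # [0.6  0.6  0.15]
print(soft_vote)      # [1 1 0]
```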
```python
voting_classifier_1 = VotingClassifier(
    estimators=[('xgb', clf1), ('dt', clf2)],
    voting='soft')
```
```python
results_df = pd.concat(
    [results_df,
     fit_train(model=voting_classifier_1, name="DT, XGB: soft")]
)
results_df
```
|  | Accuracy (training) | Accuracy (validation) | Precision (validation) | Recall (validation) | AUC | f1 | FP | FN |
|---|---|---|---|---|---|---|---|---|
| XGBoost | 0.979 | 0.797 | 0.788 | 0.786 | 0.786 | 0.787 | 14 | 15 |
| Decision Tree | 0.886 | 0.818 | 0.814 | 0.800 | 0.800 | 0.805 | 10 | 16 |
| DT, XGB: hard | 0.898 | 0.832 | 0.836 | 0.808 | 0.808 | 0.817 | 7 | 17 |
| DT, XGBoost, KNN, LogReg + SVC: hard | 0.880 | 0.797 | 0.788 | 0.782 | 0.782 | 0.785 | 13 | 16 |
| DT, XGB: soft | 0.951 | 0.804 | 0.795 | 0.795 | 0.795 | 0.795 | 14 | 14 |
25.2.1 Weighting classifiers
Whether working with hard or soft voting, we can also weight the predictions of different models.
Here, we give the prediction of the decision tree twice the weight of the XGBoost model.
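With soft voting, the weights turn the average into a weighted average, i.e. `(1 * p_xgb + 2 * p_dt) / 3` here; with hard voting, they instead multiply each model’s vote count. A quick illustration with made-up probabilities:

```python
# Made-up class-1 probabilities for two passengers
p_xgb = np.array([0.9, 0.4])  # XGBoost, weight 1
p_dt = np.array([0.3, 0.8])   # decision tree, weight 2

# Weighted average: (1 * p_xgb + 2 * p_dt) / (1 + 2)
weighted_proba = np.average([p_xgb, p_dt], axis=0, weights=[1, 2])
print(weighted_proba)  # [0.5 0.66666667]
```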
```python
voting_classifier_1 = VotingClassifier(
    estimators=[('xgb', clf1), ('dt', clf2)],
    voting='soft',
    weights=[1, 2])
```
```python
results_df = pd.concat(
    [results_df,
     fit_train(model=voting_classifier_1, name="DT, XGB: soft, 2:1")]
)
results_df
```
|  | Accuracy (training) | Accuracy (validation) | Precision (validation) | Recall (validation) | AUC | f1 | FP | FN |
|---|---|---|---|---|---|---|---|---|
| XGBoost | 0.979 | 0.797 | 0.788 | 0.786 | 0.786 | 0.787 | 14 | 15 |
| Decision Tree | 0.886 | 0.818 | 0.814 | 0.800 | 0.800 | 0.805 | 10 | 16 |
| DT, XGB: hard | 0.898 | 0.832 | 0.836 | 0.808 | 0.808 | 0.817 | 7 | 17 |
| DT, XGBoost, KNN, LogReg + SVC: hard | 0.880 | 0.797 | 0.788 | 0.782 | 0.782 | 0.785 | 13 | 16 |
| DT, XGB: soft | 0.951 | 0.804 | 0.795 | 0.795 | 0.795 | 0.795 | 14 | 14 |
| DT, XGB: soft, 2:1 | 0.924 | 0.818 | 0.810 | 0.806 | 0.806 | 0.808 | 12 | 14 |
We can also apply each of these to more complex ensembles. Here, let’s try soft voting with our 5-model ensemble.
```python
voting_classifier_2 = VotingClassifier(
    estimators=[
        ('XGBoost', clf1),
        ('Decision Tree', clf2),
        ('K-Nearest Neighbours', clf3),
        ('SVC', clf4),
        ('Logistic Regression', clf5)
    ],
    voting='soft')
```
```python
results_df = pd.concat([
    results_df,
    fit_train(
        X_train=X_train_standardised,
        X_validate=X_validate_standardised,
        model=voting_classifier_2,
        name="DT, XGBoost, KNN, LogReg + SVC: soft")
])
results_df
```
|  | Accuracy (training) | Accuracy (validation) | Precision (validation) | Recall (validation) | AUC | f1 | FP | FN |
|---|---|---|---|---|---|---|---|---|
| XGBoost | 0.979 | 0.797 | 0.788 | 0.786 | 0.786 | 0.787 | 14 | 15 |
| Decision Tree | 0.886 | 0.818 | 0.814 | 0.800 | 0.800 | 0.805 | 10 | 16 |
| DT, XGB: hard | 0.898 | 0.832 | 0.836 | 0.808 | 0.808 | 0.817 | 7 | 17 |
| DT, XGBoost, KNN, LogReg + SVC: hard | 0.880 | 0.797 | 0.788 | 0.782 | 0.782 | 0.785 | 13 | 16 |
| DT, XGB: soft | 0.951 | 0.804 | 0.795 | 0.795 | 0.795 | 0.795 | 14 | 14 |
| DT, XGB: soft, 2:1 | 0.924 | 0.818 | 0.810 | 0.806 | 0.806 | 0.808 | 12 | 14 |
| DT, XGBoost, KNN, LogReg + SVC: soft | 0.916 | 0.811 | 0.803 | 0.797 | 0.797 | 0.800 | 12 | 15 |
Now let’s try weighting this and see the impact.
```python
voting_classifier_2 = VotingClassifier(
    estimators=[
        ('XGBoost', clf1),
        ('Decision Tree', clf2),
        ('K-Nearest Neighbours', clf3),
        ('SVC', clf4),
        ('Logistic Regression', clf5)
    ],
    voting='soft',
    weights=[2, 2, 1, 1, 2])
```
```python
results_df = pd.concat([
    results_df,
    fit_train(
        X_train=X_train_standardised,
        X_validate=X_validate_standardised,
        model=voting_classifier_2,
        name="DT, XGBoost, KNN, LogReg + SVC: soft, 2:2:1:1:2")
])
results_df
```
|  | Accuracy (training) | Accuracy (validation) | Precision (validation) | Recall (validation) | AUC | f1 | FP | FN |
|---|---|---|---|---|---|---|---|---|
| XGBoost | 0.979 | 0.797 | 0.788 | 0.786 | 0.786 | 0.787 | 14 | 15 |
| Decision Tree | 0.886 | 0.818 | 0.814 | 0.800 | 0.800 | 0.805 | 10 | 16 |
| DT, XGB: hard | 0.898 | 0.832 | 0.836 | 0.808 | 0.808 | 0.817 | 7 | 17 |
| DT, XGBoost, KNN, LogReg + SVC: hard | 0.880 | 0.797 | 0.788 | 0.782 | 0.782 | 0.785 | 13 | 16 |
| DT, XGB: soft | 0.951 | 0.804 | 0.795 | 0.795 | 0.795 | 0.795 | 14 | 14 |
| DT, XGB: soft, 2:1 | 0.924 | 0.818 | 0.810 | 0.806 | 0.806 | 0.808 | 12 | 14 |
| DT, XGBoost, KNN, LogReg + SVC: soft | 0.916 | 0.811 | 0.803 | 0.797 | 0.797 | 0.800 | 12 | 15 |
| DT, XGBoost, KNN, LogReg + SVC: soft, 2:2:1:1:2 | 0.923 | 0.818 | 0.812 | 0.803 | 0.803 | 0.807 | 11 | 15 |