32  Checking Model Calibration (Titanic Dataset)

Model calibration plots are an important way to assess a model when the predicted probabilities matter, not just the predicted class: a well-calibrated model that predicts a probability of 0.8 should be correct roughly 80% of the time.

import pandas as pd
import numpy as np
from time import time
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, f1_score, precision_score, \
                            recall_score
from sklearn.calibration import calibration_curve, CalibrationDisplay
from sklearn.preprocessing import StandardScaler

%matplotlib inline

Let’s first import our processed data.

try:
    data = pd.read_csv("data/processed_data.csv")

except FileNotFoundError:
    # Download processed data:
    address = 'https://raw.githubusercontent.com/MichaelAllen1966/' + \
                '1804_python_healthcare/master/titanic/data/processed_data.csv'

    data = pd.read_csv(address)

    # Create a data subfolder if one does not already exist
    import os
    data_directory = './data/'
    if not os.path.exists(data_directory):
        os.makedirs(data_directory)

    # Save data
    data.to_csv(data_directory + 'processed_data.csv', index=False)

data = data.astype(float)

# Drop PassengerId (axis=1 indicates we are removing a column rather than a row)
# We drop PassengerId because it is an arbitrary identifier rather than predictive data

data.drop('PassengerId', inplace=True, axis=1)

X = data.drop('Survived',axis=1) # X = all 'data' except the 'survived' column
y = data['Survived'] # y = 'survived' column from 'data'

feature_names = X.columns.tolist()

X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_validate, y_train, y_validate = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

print(f"Training Dataset Samples: {len(X_train)}")
print(f"Validation Dataset Samples: {len(X_validate)}")
print(f"Testing Dataset Samples: {len(X_test)}")
Training Dataset Samples: 569
Validation Dataset Samples: 143
Testing Dataset Samples: 179

Let’s fit an initial model.

model = XGBClassifier(random_state=42)
model.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
y_pred_val = model.predict(X_validate)

The first major step is predicting the probability of survival for each passenger in the validation set.

y_calibrate_probabilities = model.predict_proba(X_validate)[:,1]
y_calibrate_probabilities
array([2.05822676e-01, 3.52452919e-02, 9.60064888e-01, 9.88666415e-01,
       2.01759231e-03, 3.81041132e-02, 9.79259014e-01, 9.98046637e-01,
       2.01433767e-02, 3.72755504e-03, 9.89065170e-01, 1.33913994e-01,
       3.56350005e-01, 9.77980614e-01, 9.99136150e-01, 3.38162435e-03,
       1.04845390e-02, 3.55994864e-03, 3.69379260e-02, 1.35302618e-01,
       9.92524981e-01, 8.23074400e-01, 7.01691031e-01, 9.22003865e-01,
       9.63228106e-01, 3.20929438e-02, 1.01891626e-03, 9.91041601e-01,
       5.07083893e-01, 1.61430892e-03, 9.04381633e-01, 9.59128499e-01,
       2.63705608e-02, 1.39667233e-03, 7.53330241e-04, 1.13243781e-01,
       6.23617694e-02, 5.43069188e-03, 1.52443694e-02, 9.99328375e-01,
       9.59980607e-01, 5.96146807e-02, 9.97057915e-01, 2.48304289e-02,
       9.97931361e-01, 9.61281121e-01, 5.95116755e-03, 5.50567806e-02,
       5.32550991e-01, 7.36888684e-03, 6.07086957e-01, 2.72897142e-03,
       2.00232957e-03, 2.91303862e-02, 8.32613051e-01, 9.96867716e-01,
       6.10939553e-03, 4.82243709e-02, 1.09095879e-01, 1.85978502e-01,
       9.91041601e-01, 1.86239276e-03, 8.54294449e-02, 9.82371092e-01,
       4.84345341e-03, 3.17367172e-04, 4.94391285e-02, 4.86289591e-01,
       5.34910895e-02, 3.31237465e-02, 5.33424504e-02, 9.98463035e-01,
       1.36515990e-01, 1.19548671e-01, 1.14806831e-01, 8.01631901e-03,
       9.96852696e-01, 3.16940248e-01, 9.94843483e-01, 6.41588688e-01,
       9.88738298e-01, 6.91143870e-01, 5.78789674e-02, 9.99705255e-01,
       7.05600670e-03, 1.09724440e-02, 6.32528588e-03, 5.96146807e-02,
       9.68342274e-02, 6.60832405e-01, 4.04346781e-03, 2.64491490e-03,
       9.84592974e-01, 1.63405478e-01, 1.10971414e-01, 9.82463479e-01,
       3.71817723e-02, 9.72673178e-01, 1.98875666e-01, 1.27914354e-01,
       9.75768328e-01, 9.79870483e-02, 9.60265517e-01, 9.41147208e-01,
       4.29043584e-02, 5.43069188e-03, 9.44249749e-01, 1.75828557e-03,
       6.48384243e-02, 9.45494026e-02, 4.38896380e-03, 2.50388514e-02,
       4.16158959e-02, 1.18054882e-01, 4.66120988e-01, 2.81780422e-01,
       9.67573524e-01, 1.01316497e-02, 1.48710921e-01, 6.32528588e-03,
       6.32528588e-03, 8.05518329e-01, 9.99740303e-01, 1.72039345e-01,
       9.97362077e-01, 7.45192885e-01, 6.37774915e-02, 1.25395339e-02,
       9.96176243e-01, 9.98697460e-01, 9.20668066e-01, 8.95885170e-01,
       7.31337190e-01, 6.85901716e-02, 2.14473590e-01, 9.96718943e-01,
       1.04652986e-01, 9.91419435e-01, 9.98377800e-01, 5.43069188e-03,
       7.94559438e-03, 1.61430892e-03, 1.46020606e-01], dtype=float32)

32.1 Manual Calculation - equal-width bins

Now we can create a reliability plot, binning cases by their predicted probability of survival.

# Bin data with numpy digitize (this will assign a bin to each case)
step = 0.10
bins = np.arange(step, 1+step, step)
digitized = np.digitize(y_calibrate_probabilities, bins)

# Put data in DataFrame
reliability = pd.DataFrame()
reliability['bin'] = digitized
reliability['probability'] = y_calibrate_probabilities
reliability['observed'] = y_validate.values

# Summarise data by bin in new dataframe
reliability_summary = pd.DataFrame()

# Add bins to summary
reliability_summary['bin'] = bins

# Calculate mean of predicted probability of survival for each bin
reliability_summary['confidence'] = \
    reliability.groupby('bin').mean()['probability']

# Calculate the proportion of passengers who survive in each bin
reliability_summary['fraction_positive'] = \
    reliability.groupby('bin').mean()['observed']

reliability_summary
bin confidence fraction_positive
0 0.1 0.026164 0.171875
1 0.2 0.137591 0.117647
2 0.3 0.234026 0.333333
3 0.4 0.336645 0.000000
4 0.5 0.476205 0.500000
5 0.6 0.519817 1.000000
6 0.7 0.650163 0.000000
7 0.8 0.726074 0.000000
8 0.9 0.839273 0.250000
9 1.0 0.978974 0.904762

Let’s now view this as a plot.

plt.plot(reliability_summary['confidence'],
         reliability_summary['fraction_positive'],
         linestyle='-',
         marker='o',
         label='model')

plt.plot([0,1],[0,1],
         linestyle='--',
         label='theoretical')

plt.xlabel('Model probability')
plt.ylabel('Fraction positive')

plt.title('Reliability plot')

plt.grid()
plt.legend()
plt.show()
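
The reliability table can also be summarised as a single number. One optional summary is a bin-weighted calibration error (often called the expected calibration error, ECE): the gap between mean predicted probability and observed fraction positive in each bin, weighted by the number of cases in that bin. A minimal sketch using the reliability DataFrame built above (this metric is an addition, not part of the table itself):

# Bin-weighted calibration error (ECE) - a sketch using the bins created above
bin_counts = reliability.groupby('bin')['observed'].count()
bin_gap = (reliability.groupby('bin')['probability'].mean()
           - reliability.groupby('bin')['observed'].mean()).abs()
ece = (bin_counts * bin_gap).sum() / bin_counts.sum()
print(f"Bin-weighted calibration error: {ece:.3f}")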

32.2 The sklearn reliability plot

CalibrationDisplay.from_estimator(model, X_test, y_test)
plt.show()
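
sklearn also exposes the underlying numbers through the calibration_curve function we imported earlier. If you want the values rather than a figure, a minimal sketch using the validation-set probabilities calculated above:

# Fraction of positives and mean predicted probability per bin
# (equal-width bins by default; strategy='quantile' gives equal-count bins)
prob_true, prob_pred = calibration_curve(y_validate, y_calibrate_probabilities, n_bins=10)
print(prob_true)
print(prob_pred)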

Let’s compare this for a series of classifiers.

model_dt = DecisionTreeClassifier(max_depth=6)
model_rf = RandomForestClassifier(random_state=42, max_depth=6)
model_xgb = XGBClassifier(random_state=42)
model_lr = LogisticRegression()

fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(15, 7))

#######################
# Decision Tree       #
#######################

CalibrationDisplay.from_estimator(model_dt.fit(X_train, y_train), X_validate, y_validate, ax=ax1)
ax1 = ax1.set_title("Decision Tree")

#######################
# Random Forest       #
#######################

CalibrationDisplay.from_estimator(model_rf.fit(X_train, y_train), X_validate, y_validate, ax=ax2)
ax2 = ax2.set_title("Random Forest")

#######################
# XGBoost             #
#######################

CalibrationDisplay.from_estimator(model_xgb.fit(X_train, y_train), X_validate, y_validate, ax=ax3)
ax3 = ax3.set_title("XGBoost")

#######################
# Logistic Regression #
#######################

# Initialise a new scaling object for normalising input data
sc = StandardScaler()

# Fit the scaler on the training set, then apply the same scaling to the
# validation and test sets (fit on training data only, to avoid data leakage)
train_std = sc.fit_transform(X_train)
val_std = sc.transform(X_validate)
test_std = sc.transform(X_test)

CalibrationDisplay.from_estimator(model_lr.fit(train_std, y_train), val_std, y_validate, ax=ax4)
ax4.set_title("Logistic Regression")

Let’s repeat this, plotting the curves overlaid on a single axis.

fig, ax = plt.subplots(figsize=(12,7))
CalibrationDisplay.from_estimator(model_dt.fit(X_train, y_train), X_validate, y_validate, ax=ax)
CalibrationDisplay.from_estimator(model_rf.fit(X_train, y_train), X_validate, y_validate, ax=ax)
CalibrationDisplay.from_estimator(model_xgb.fit(X_train, y_train), X_validate, y_validate, ax=ax)
CalibrationDisplay.from_estimator(model_lr.fit(train_std, y_train), val_std, y_validate, ax=ax)
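
By default each curve in the legend is labelled with the estimator's class name. Passing the name parameter to CalibrationDisplay.from_estimator makes the overlay easier to read; a minimal sketch of the same plot with explicit labels (the models were already fitted above):

fig, ax = plt.subplots(figsize=(12, 7))
CalibrationDisplay.from_estimator(model_dt, X_validate, y_validate, ax=ax, name="Decision Tree")
CalibrationDisplay.from_estimator(model_rf, X_validate, y_validate, ax=ax, name="Random Forest")
CalibrationDisplay.from_estimator(model_xgb, X_validate, y_validate, ax=ax, name="XGBoost")
CalibrationDisplay.from_estimator(model_lr, val_std, y_validate, ax=ax, name="Logistic Regression")
plt.show()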

32.2.1 Histograms

fig, (ax1, ax2, ax3, ax4, ax5, ax6) = plt.subplots(6, 1, figsize=(7, 15), sharey=True, sharex=True)

#######################
# Decision Tree       #
#######################

ax1.hist(model_dt.predict_proba(X_validate)[:,1], bins=np.arange(0,1.01,0.1))
ax1 = ax1.set_title("Decision Tree")

#######################
# Random Forest       #
#######################

ax2.hist(model_rf.predict_proba(X_validate)[:,1], bins=np.arange(0,1.01,0.1))
ax2 = ax2.set_title("Random Forest")

#######################
# XGBoost             #
#######################

ax3.hist(model_xgb.predict_proba(X_validate)[:,1], bins=np.arange(0,1.01,0.1))
ax3 = ax3.set_title("XGBoost")

#######################
# Logistic Regression #
#######################

ax4.hist(model_lr.predict_proba(val_std)[:,1], bins=np.arange(0,1.01,0.1))
ax4 = ax4.set_title("Logistic Regression")

#######################
# KNN                 #
#######################

ax5.hist(KNeighborsClassifier().fit(train_std, y_train).predict_proba(val_std)[:,1], bins=np.arange(0,1.01,0.1))
ax5 = ax5.set_title("K Nearest Neighbours")

#######################
# Naive Bayes         #
#######################

ax6.hist(GaussianNB().fit(X_train, y_train).predict_proba(X_validate)[:,1], bins=np.arange(0,1.01,0.1))
ax6 = ax6.set_title("Gaussian Naive Bayes")

33 Calibrating Classifiers

Let’s remind ourselves of the calibration of our XGBoost model.


# Split the original training set again, holding back part of it as a calibration set
X_train, X_calibrate, y_train, y_calibrate = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

model_xgb = XGBClassifier()
model_xgb.fit(X_train, y_train)

CalibrationDisplay.from_estimator(model_xgb, X_validate, y_validate)

We can use CalibratedClassifierCV with cv="prefit" to fit a calibrator on top of the already-fitted model, using the held-back calibration set, and then plot the output.

from sklearn.calibration import CalibratedClassifierCV

model_xgb_calibrated = CalibratedClassifierCV(model_xgb, cv="prefit")
model_xgb_calibrated.fit(X_calibrate, y_calibrate)
CalibratedClassifierCV(cv='prefit',
                       estimator=XGBClassifier(base_score=None, booster=None,
                                               callbacks=None,
                                               colsample_bylevel=None,
                                               colsample_bynode=None,
                                               colsample_bytree=None,
                                               device=None,
                                               early_stopping_rounds=None,
                                               enable_categorical=False,
                                               eval_metric=None,
                                               feature_types=None, gamma=None,
                                               grow_policy=None,
                                               importance_type=None,
                                               interaction_constraints=None,
                                               learning_rate=None, max_bin=None,
                                               max_cat_threshold=None,
                                               max_cat_to_onehot=None,
                                               max_delta_step=None,
                                               max_depth=None, max_leaves=None,
                                               min_child_weight=None,
                                               missing=nan,
                                               monotone_constraints=None,
                                               multi_strategy=None,
                                               n_estimators=None, n_jobs=None,
                                               num_parallel_tree=None,
                                               random_state=None, ...))
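
It is worth plotting the calibrated model’s reliability curve next to the original to check that it has moved towards the diagonal. By default CalibratedClassifierCV uses Platt scaling (method='sigmoid'); method='isotonic' is the non-parametric alternative. A minimal sketch comparing the two models on the validation set:

fig, ax = plt.subplots(figsize=(8, 6))
CalibrationDisplay.from_estimator(model_xgb, X_validate, y_validate, ax=ax, name="XGBoost")
CalibrationDisplay.from_estimator(model_xgb_calibrated, X_validate, y_validate, ax=ax, name="XGBoost - Calibrated")
plt.show()
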
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 7), sharey=True, sharex=True)

#######################
# XGBoost             #
#######################

ax1.hist(model_xgb.predict_proba(X_validate)[:,1], bins=np.arange(0,1.01,0.1))
ax1 = ax1.set_title("XGBoost")

########################
# XGBoost - Calibrated #
########################

ax2.hist(model_xgb_calibrated.predict_proba(X_validate)[:,1], bins=np.arange(0,1.01,0.1))
ax2 = ax2.set_title("XGBoost - Calibrated")

34 Compare performance

Let’s compare the performance of our calibrated and uncalibrated models. Does calibration make any difference to the standard classification metrics?

def pred_assess(name, model=XGBClassifier(random_state=42),
                X_train=X_train, X_validate=X_validate, y_train=y_train, y_validate=y_validate,
                show_confusion_matrix=False):
    # Note: the model passed in is expected to be an already-fitted classifier

    y_pred_train = model.predict(X_train)
    y_pred_val = model.predict(X_validate)

    if show_confusion_matrix:
        confusion_matrix_titanic = ConfusionMatrixDisplay(
            confusion_matrix=confusion_matrix(
                y_true=y_validate,
                y_pred=y_pred_val,
                normalize='true'
                ),
            display_labels=["Died", "Survived"]
            )

        confusion_matrix_titanic.plot()

    return pd.DataFrame({
        'Accuracy (training)': np.mean(y_pred_train == y_train).round(4),
        'Accuracy (validation)': np.mean(y_pred_val == y_validate).round(4),
        'Precision (validation)': precision_score(y_validate, y_pred_val, average='macro').round(4),
        'Recall (validation)': recall_score(y_validate, y_pred_val, average='macro').round(4),
        'features': ", ".join(X_train.columns.tolist())
        }, index=[name])

pd.concat(
    [pred_assess(model=model_xgb, name="XGBoost"),
     pred_assess(model=model_xgb_calibrated, name="XGBoost - Calibrated")]
    )
Accuracy (training) Accuracy (validation) Precision (validation) Recall (validation) features
XGBoost 0.989 0.8322 0.8239 0.8239 Pclass, Age, SibSp, Parch, Fare, AgeImputed, E...
XGBoost - Calibrated 0.989 0.8322 0.8269 0.8175 Pclass, Age, SibSp, Parch, Fare, AgeImputed, E...
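
Accuracy, precision and recall are all calculated from the thresholded class labels, so they tell us little about the quality of the probabilities themselves. A proper scoring rule such as the Brier score (the mean squared error of the predicted probabilities) does reflect probability quality; a minimal sketch, as an extra check on the two models:

from sklearn.metrics import brier_score_loss

# Lower is better; a perfectly calibrated, perfectly sharp model scores 0
brier_uncalibrated = brier_score_loss(y_validate, model_xgb.predict_proba(X_validate)[:, 1])
brier_calibrated = brier_score_loss(y_validate, model_xgb_calibrated.predict_proba(X_validate)[:, 1])
print(f"Brier score (uncalibrated): {brier_uncalibrated:.4f}")
print(f"Brier score (calibrated):   {brier_calibrated:.4f}")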

Let’s just look at the confusion matrices.

pred_assess(model=model_xgb, name="XGBoost", show_confusion_matrix=True)
Accuracy (training) Accuracy (validation) Precision (validation) Recall (validation) features
XGBoost 0.989 0.8322 0.8239 0.8239 Pclass, Age, SibSp, Parch, Fare, AgeImputed, E...

pred_assess(model=model_xgb_calibrated, name="XGBoost - calibrated", show_confusion_matrix=True)
Accuracy (training) Accuracy (validation) Precision (validation) Recall (validation) features
XGBoost - calibrated 0.989 0.8322 0.8269 0.8175 Pclass, Age, SibSp, Parch, Fare, AgeImputed, E...