import pandas as pd
import numpy as np
from time import time
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, f1_score, precision_score, \
    recall_score
from sklearn.calibration import calibration_curve, CalibrationDisplay
from sklearn.preprocessing import StandardScaler
%matplotlib inline
32 Checking Model Calibration (Titanic Dataset)
Model calibration plots are an important way to assess a model, particularly when the predicted probabilities themselves (rather than just the predicted class) matter for how the model will be used.
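A model is well calibrated if, among all cases given a predicted probability of around 0.8, roughly 80% turn out to be positive. The purely illustrative sketch below (y_true_toy and y_prob_toy are made-up numbers, not Titanic data) shows how sklearn's calibration_curve summarises this, binning predictions and comparing the mean predicted probability with the observed fraction of positives in each bin.
# Toy illustration only - numpy and calibration_curve are imported in the cell above
y_true_toy = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
y_prob_toy = np.array([0.10, 0.20, 0.15, 0.30, 0.40, 0.70, 0.80, 0.85, 0.90, 0.95])

prob_true, prob_pred = calibration_curve(y_true_toy, y_prob_toy, n_bins=2)
print(prob_pred)   # mean predicted probability per bin: [0.23 0.84]
print(prob_true)   # observed fraction positive per bin: [0. 1.]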
Let’s first import our processed data.
try:
    data = pd.read_csv("data/processed_data.csv")

except FileNotFoundError:
    # Download processed data:
    address = 'https://raw.githubusercontent.com/MichaelAllen1966/' + \
        '1804_python_healthcare/master/titanic/data/processed_data.csv'
    data = pd.read_csv(address)

    # Create a data subfolder if one does not already exist
    import os
    data_directory = './data/'
    if not os.path.exists(data_directory):
        os.makedirs(data_directory)

    # Save data
    data.to_csv(data_directory + 'processed_data.csv', index=False)
data = data.astype(float)

# Drop PassengerId (axis=1 indicates we are removing a column rather than a row)
# We drop passenger ID as it is not original data
data.drop('PassengerId', inplace=True, axis=1)
X = data.drop('Survived', axis=1)  # X = all 'data' except the 'Survived' column
y = data['Survived']               # y = the 'Survived' column from 'data'

feature_names = X.columns.tolist()

X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_validate, y_train, y_validate = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)
print(f"Training Dataset Samples: {len(X_train)}")
print(f"Validation Dataset Samples: {len(X_validate)}")
print(f"Testing Dataset Samples: {len(X_test)}")
Training Dataset Samples: 569
Validation Dataset Samples: 143
Testing Dataset Samples: 179
Let’s fit an initial model.
model = XGBClassifier(random_state=42)
model.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
y_pred_val = model.predict(X_validate)
The first step is to predict the probability of survival for each passenger in the validation set.
y_calibrate_probabilities = model.predict_proba(X_validate)[:, 1]
y_calibrate_probabilities
array([2.05822676e-01, 3.52452919e-02, 9.60064888e-01, 9.88666415e-01,
2.01759231e-03, 3.81041132e-02, 9.79259014e-01, 9.98046637e-01,
2.01433767e-02, 3.72755504e-03, 9.89065170e-01, 1.33913994e-01,
3.56350005e-01, 9.77980614e-01, 9.99136150e-01, 3.38162435e-03,
1.04845390e-02, 3.55994864e-03, 3.69379260e-02, 1.35302618e-01,
9.92524981e-01, 8.23074400e-01, 7.01691031e-01, 9.22003865e-01,
9.63228106e-01, 3.20929438e-02, 1.01891626e-03, 9.91041601e-01,
5.07083893e-01, 1.61430892e-03, 9.04381633e-01, 9.59128499e-01,
2.63705608e-02, 1.39667233e-03, 7.53330241e-04, 1.13243781e-01,
6.23617694e-02, 5.43069188e-03, 1.52443694e-02, 9.99328375e-01,
9.59980607e-01, 5.96146807e-02, 9.97057915e-01, 2.48304289e-02,
9.97931361e-01, 9.61281121e-01, 5.95116755e-03, 5.50567806e-02,
5.32550991e-01, 7.36888684e-03, 6.07086957e-01, 2.72897142e-03,
2.00232957e-03, 2.91303862e-02, 8.32613051e-01, 9.96867716e-01,
6.10939553e-03, 4.82243709e-02, 1.09095879e-01, 1.85978502e-01,
9.91041601e-01, 1.86239276e-03, 8.54294449e-02, 9.82371092e-01,
4.84345341e-03, 3.17367172e-04, 4.94391285e-02, 4.86289591e-01,
5.34910895e-02, 3.31237465e-02, 5.33424504e-02, 9.98463035e-01,
1.36515990e-01, 1.19548671e-01, 1.14806831e-01, 8.01631901e-03,
9.96852696e-01, 3.16940248e-01, 9.94843483e-01, 6.41588688e-01,
9.88738298e-01, 6.91143870e-01, 5.78789674e-02, 9.99705255e-01,
7.05600670e-03, 1.09724440e-02, 6.32528588e-03, 5.96146807e-02,
9.68342274e-02, 6.60832405e-01, 4.04346781e-03, 2.64491490e-03,
9.84592974e-01, 1.63405478e-01, 1.10971414e-01, 9.82463479e-01,
3.71817723e-02, 9.72673178e-01, 1.98875666e-01, 1.27914354e-01,
9.75768328e-01, 9.79870483e-02, 9.60265517e-01, 9.41147208e-01,
4.29043584e-02, 5.43069188e-03, 9.44249749e-01, 1.75828557e-03,
6.48384243e-02, 9.45494026e-02, 4.38896380e-03, 2.50388514e-02,
4.16158959e-02, 1.18054882e-01, 4.66120988e-01, 2.81780422e-01,
9.67573524e-01, 1.01316497e-02, 1.48710921e-01, 6.32528588e-03,
6.32528588e-03, 8.05518329e-01, 9.99740303e-01, 1.72039345e-01,
9.97362077e-01, 7.45192885e-01, 6.37774915e-02, 1.25395339e-02,
9.96176243e-01, 9.98697460e-01, 9.20668066e-01, 8.95885170e-01,
7.31337190e-01, 6.85901716e-02, 2.14473590e-01, 9.96718943e-01,
1.04652986e-01, 9.91419435e-01, 9.98377800e-01, 5.43069188e-03,
7.94559438e-03, 1.61430892e-03, 1.46020606e-01], dtype=float32)
32.1 Manual Calculation - equal-width bins
Now we can create a reliability plot, binning cases by their predicted probability of survival.
# Bin data with numpy digitize (this will assign a bin to each case)
step = 0.10
bins = np.arange(step, 1 + step, step)
digitized = np.digitize(y_calibrate_probabilities, bins)

# Put data in DataFrame
reliability = pd.DataFrame()
reliability['bin'] = digitized
reliability['probability'] = y_calibrate_probabilities
reliability['observed'] = y_validate.values

# Summarise data by bin in new dataframe
reliability_summary = pd.DataFrame()

# Add bins to summary
reliability_summary['bin'] = bins

# Calculate mean of predicted probability of survival for each bin
reliability_summary['confidence'] = \
    reliability.groupby('bin').mean()['probability']

# Calculate the proportion of passengers who survive in each bin
reliability_summary['fraction_positive'] = \
    reliability.groupby('bin').mean()['observed']

reliability_summary
| | bin | confidence | fraction_positive |
|---|---|---|---|
| 0 | 0.1 | 0.026164 | 0.171875 |
| 1 | 0.2 | 0.137591 | 0.117647 |
| 2 | 0.3 | 0.234026 | 0.333333 |
| 3 | 0.4 | 0.336645 | 0.000000 |
| 4 | 0.5 | 0.476205 | 0.500000 |
| 5 | 0.6 | 0.519817 | 1.000000 |
| 6 | 0.7 | 0.650163 | 0.000000 |
| 7 | 0.8 | 0.726074 | 0.000000 |
| 8 | 0.9 | 0.839273 | 0.250000 |
| 9 | 1.0 | 0.978974 | 0.904762 |
Let’s now observe this on a plot.
plt.plot(reliability_summary['confidence'],
         reliability_summary['fraction_positive'],
         linestyle='-',
         marker='o',
         label='model')

plt.plot([0, 1], [0, 1],
         linestyle='--',
         label='theoretical')

plt.xlabel('Model probability')
plt.ylabel('Fraction positive')
plt.title('Reliability plot')
plt.grid()
plt.legend()
plt.show()
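As a cross-check on the manual binning above, sklearn's calibration_curve (imported at the top of this notebook) computes similar per-bin summaries. A minimal sketch, reusing y_validate and y_calibrate_probabilities from above; note that calibration_curve drops any empty bins, so the arrays may be shorter than the summary table.
# Cross-check the manual equal-width bins with sklearn's calibration_curve
prob_true, prob_pred = calibration_curve(y_validate, y_calibrate_probabilities,
                                         n_bins=10, strategy='uniform')
print(prob_pred)   # should correspond to the 'confidence' column above
print(prob_true)   # should correspond to the 'fraction_positive' column above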
32.2 The sklearn reliability plot
CalibrationDisplay.from_estimator(model, X_test, y_test)
plt.show()
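CalibrationDisplay can also be drawn straight from predicted probabilities using its from_predictions method. A minimal sketch, reusing the validation-set probabilities computed earlier (rather than the test set used in the cell above):
# Reliability plot built directly from the validation-set probabilities computed earlier
CalibrationDisplay.from_predictions(y_validate, y_calibrate_probabilities)
plt.show()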
Let’s compare this for a series of classifiers.
model_dt = DecisionTreeClassifier(max_depth=6)
model_rf = RandomForestClassifier(random_state=42, max_depth=6)
model_xgb = XGBClassifier(random_state=42)
model_lr = LogisticRegression()
fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(15, 7))

#######################
# Decision Tree       #
#######################
CalibrationDisplay.from_estimator(model_dt.fit(X_train, y_train), X_validate, y_validate, ax=ax1)
ax1.set_title("Decision Tree")

#######################
# Random Forest       #
#######################
CalibrationDisplay.from_estimator(model_rf.fit(X_train, y_train), X_validate, y_validate, ax=ax2)
ax2.set_title("Random Forest")

#######################
# XGBoost             #
#######################
CalibrationDisplay.from_estimator(model_xgb.fit(X_train, y_train), X_validate, y_validate, ax=ax3)
ax3.set_title("XGBoost")

#######################
# Logistic Regression #
#######################
# Initialise a new scaling object for normalising input data
sc = StandardScaler()

# Fit the scaler on the training set only, then apply it to the validation and test sets
# (fitting only on training data avoids leaking information from the other sets)
train_std = sc.fit_transform(X_train)
val_std = sc.transform(X_validate)
test_std = sc.transform(X_test)

CalibrationDisplay.from_estimator(model_lr.fit(train_std, y_train), val_std, y_validate, ax=ax4)
ax4.set_title("Logistic Regression")
Let’s repeat this, overlaying the curves on a single plot.
fig, ax = plt.subplots(figsize=(12, 7))
CalibrationDisplay.from_estimator(model_dt.fit(X_train, y_train), X_validate, y_validate, ax=ax)
CalibrationDisplay.from_estimator(model_rf.fit(X_train, y_train), X_validate, y_validate, ax=ax)
CalibrationDisplay.from_estimator(model_xgb.fit(X_train, y_train), X_validate, y_validate, ax=ax)
CalibrationDisplay.from_estimator(model_lr.fit(train_std, y_train), val_std, y_validate, ax=ax)
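By default the legend labels each overlaid curve with the estimator's class name; the name argument of from_estimator can be used to give clearer labels. A minimal sketch, reusing the models already fitted in the cell above:
# Overlaid reliability curves with explicit legend labels (models fitted above)
fig, ax = plt.subplots(figsize=(12, 7))
CalibrationDisplay.from_estimator(model_dt, X_validate, y_validate, name="Decision Tree", ax=ax)
CalibrationDisplay.from_estimator(model_rf, X_validate, y_validate, name="Random Forest", ax=ax)
CalibrationDisplay.from_estimator(model_xgb, X_validate, y_validate, name="XGBoost", ax=ax)
CalibrationDisplay.from_estimator(model_lr, val_std, y_validate, name="Logistic Regression", ax=ax)
plt.show()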
32.2.1 Histograms
The histograms below show how each model’s predicted probabilities are distributed across the validation set, which helps explain the shape of its calibration curve.
fig, (ax1, ax2, ax3, ax4, ax5, ax6) = plt.subplots(6, 1, figsize=(7, 15), sharey=True, sharex=True)

#######################
# Decision Tree       #
#######################
ax1.hist(model_dt.predict_proba(X_validate)[:, 1], bins=np.arange(0, 1.01, 0.1))
ax1.set_title("Decision Tree")

#######################
# Random Forest       #
#######################
ax2.hist(model_rf.predict_proba(X_validate)[:, 1], bins=np.arange(0, 1.01, 0.1))
ax2.set_title("Random Forest")

#######################
# XGBoost             #
#######################
ax3.hist(model_xgb.predict_proba(X_validate)[:, 1], bins=np.arange(0, 1.01, 0.1))
ax3.set_title("XGBoost")

#######################
# Logistic Regression #
#######################
ax4.hist(model_lr.predict_proba(val_std)[:, 1], bins=np.arange(0, 1.01, 0.1))
ax4.set_title("Logistic Regression")

#######################
# KNN                 #
#######################
ax5.hist(KNeighborsClassifier().fit(train_std, y_train).predict_proba(val_std)[:, 1],
         bins=np.arange(0, 1.01, 0.1))
ax5.set_title("K Nearest Neighbours")

#######################
# Naive Bayes         #
#######################
ax6.hist(GaussianNB().fit(X_train, y_train).predict_proba(X_validate)[:, 1],
         bins=np.arange(0, 1.01, 0.1))
ax6.set_title("Gaussian Naive Bayes")
33 Calibrating Classifiers
Let’s remind ourselves of the calibration of our XGBoost model. First we split off a calibration set from the training data, refit the model on the reduced training set, and plot its reliability curve on the validation set.
X_train, X_calibrate, y_train, y_calibrate = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

model_xgb = XGBClassifier()
model_xgb.fit(X_train, y_train)

CalibrationDisplay.from_estimator(model_xgb, X_validate, y_validate)
We can use CalibratedClassifierCV with cv="prefit" to recalibrate the already-fitted model; the calibrator is fitted on the held-out calibration set rather than on the data used to train the model.
from sklearn.calibration import CalibratedClassifierCV

model_xgb_calibrated = CalibratedClassifierCV(model_xgb, cv="prefit")
model_xgb_calibrated.fit(X_calibrate, y_calibrate)
CalibratedClassifierCV(cv='prefit', estimator=XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, device=None, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, feature_types=None, gamma=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=None, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, multi_strategy=None, n_estimators=None, n_jobs=None, num_parallel_tree=None, random_state=None, ...))
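Before comparing histograms, the recalibrated model's reliability curve can be drawn in the same way as before. CalibratedClassifierCV defaults to Platt scaling (method='sigmoid'); isotonic regression is an alternative worth trying when enough calibration data is available. A minimal sketch; model_xgb_isotonic is introduced here purely for illustration:
# Reliability plot for the recalibrated (sigmoid) model on the validation set
CalibrationDisplay.from_estimator(model_xgb_calibrated, X_validate, y_validate)
plt.show()

# Alternative: isotonic regression instead of the default sigmoid (Platt) scaling
model_xgb_isotonic = CalibratedClassifierCV(model_xgb, cv="prefit", method="isotonic")
model_xgb_isotonic.fit(X_calibrate, y_calibrate)
CalibrationDisplay.from_estimator(model_xgb_isotonic, X_validate, y_validate)
plt.show()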
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 7), sharey=True, sharex=True)

#######################
# XGBoost             #
#######################
ax1.hist(model_xgb.predict_proba(X_validate)[:, 1], bins=np.arange(0, 1.01, 0.1))
ax1.set_title("XGBoost")

########################
# XGBoost - Calibrated #
########################
ax2.hist(model_xgb_calibrated.predict_proba(X_validate)[:, 1], bins=np.arange(0, 1.01, 0.1))
ax2.set_title("XGBoost - Calibrated")
34 Comparing Performance
Let’s compare the performance of our calibrated and uncalibrated models. Does recalibration make any difference?
def pred_assess(name, model=XGBClassifier(random_state=42),
                X_train=X_train, X_validate=X_validate, y_train=y_train, y_validate=y_validate,
                show_confusion_matrix=False):

    y_pred_train = model.predict(X_train)
    y_pred_val = model.predict(X_validate)

    if show_confusion_matrix:
        confusion_matrix_titanic = ConfusionMatrixDisplay(
            confusion_matrix=confusion_matrix(
                y_true=y_validate,
                y_pred=y_pred_val,
                normalize='true'
            ),
            display_labels=["Died", "Survived"]
        )
        confusion_matrix_titanic.plot()

    return pd.DataFrame({
        'Accuracy (training)': np.mean(y_pred_train == y_train).round(4),
        'Accuracy (validation)': np.mean(y_pred_val == y_validate).round(4),
        'Precision (validation)': precision_score(y_validate, y_pred_val, average='macro').round(4),
        'Recall (validation)': recall_score(y_validate, y_pred_val, average='macro').round(4),
        'features': ", ".join(X_train.columns.tolist())
    }, index=[name])

pd.concat(
    [pred_assess(model=model_xgb, name="XGBoost"),
     pred_assess(model=model_xgb_calibrated, name="XGBoost - Calibrated")]
)
| | Accuracy (training) | Accuracy (validation) | Precision (validation) | Recall (validation) | features |
|---|---|---|---|---|---|
| XGBoost | 0.989 | 0.8322 | 0.8239 | 0.8239 | Pclass, Age, SibSp, Parch, Fare, AgeImputed, E... |
| XGBoost - Calibrated | 0.989 | 0.8322 | 0.8269 | 0.8175 | Pclass, Age, SibSp, Parch, Fare, AgeImputed, E... |
Let’s just look at the confusion matrices.
pred_assess(model=model_xgb, name="XGBoost", show_confusion_matrix=True)
| | Accuracy (training) | Accuracy (validation) | Precision (validation) | Recall (validation) | features |
|---|---|---|---|---|---|
| XGBoost | 0.989 | 0.8322 | 0.8239 | 0.8239 | Pclass, Age, SibSp, Parch, Fare, AgeImputed, E... |
pred_assess(model=model_xgb_calibrated, name="XGBoost - calibrated", show_confusion_matrix=True)
| | Accuracy (training) | Accuracy (validation) | Precision (validation) | Recall (validation) | features |
|---|---|---|---|---|---|
| XGBoost - calibrated | 0.989 | 0.8322 | 0.8269 | 0.8175 | Pclass, Age, SibSp, Parch, Fare, AgeImputed, E... |
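Accuracy, precision and recall are computed from the predicted class, so they say little about the quality of the probabilities themselves, which is what calibration changes. A proper scoring rule such as the Brier score compares the probabilities directly; a minimal sketch (brier_score_loss is not among the imports at the top of this notebook):
# Compare probability quality directly with the Brier score (lower is better)
from sklearn.metrics import brier_score_loss

for name, m in [("XGBoost", model_xgb), ("XGBoost - calibrated", model_xgb_calibrated)]:
    probs = m.predict_proba(X_validate)[:, 1]
    print(name, round(brier_score_loss(y_validate, probs), 4))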