4  Decision Trees for Classification (Titanic Dataset)

import numpy as np
import pandas as pd
# Import machine learning methods
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px
from sklearn.metrics import auc, roc_curve, RocCurveDisplay, f1_score, precision_score, \
                            recall_score, confusion_matrix, ConfusionMatrixDisplay, \
                            classification_report, precision_recall_fscore_support
np.random.seed(42)
download_required = True

if download_required:

    # Download processed data:
    address = 'https://raw.githubusercontent.com/MichaelAllen1966/' + \
                '1804_python_healthcare/master/titanic/data/processed_data.csv'

    data = pd.read_csv(address)

    # Create a data subfolder if one does not already exist
    import os
    data_directory = '../datasets/'
    if not os.path.exists(data_directory):
        os.makedirs(data_directory)

    # Save data
    data.to_csv(data_directory + 'processed_titanic_data.csv', index=False)
data = pd.read_csv('../datasets/processed_titanic_data.csv')
# Make all data 'float' type
data = data.astype(float)
data.head(10)
PassengerId Survived Pclass Age SibSp Parch Fare AgeImputed EmbarkedImputed CabinLetterImputed ... Embarked_missing CabinLetter_A CabinLetter_B CabinLetter_C CabinLetter_D CabinLetter_E CabinLetter_F CabinLetter_G CabinLetter_T CabinLetter_missing
0 1.0 0.0 3.0 22.0 1.0 0.0 7.2500 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
1 2.0 1.0 1.0 38.0 1.0 0.0 71.2833 0.0 0.0 0.0 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0
2 3.0 1.0 3.0 26.0 0.0 0.0 7.9250 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
3 4.0 1.0 1.0 35.0 1.0 0.0 53.1000 0.0 0.0 0.0 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0
4 5.0 0.0 3.0 35.0 0.0 0.0 8.0500 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
5 6.0 0.0 3.0 28.0 0.0 0.0 8.4583 1.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
6 7.0 0.0 1.0 54.0 0.0 0.0 51.8625 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
7 8.0 0.0 3.0 2.0 3.0 1.0 21.0750 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
8 9.0 1.0 3.0 27.0 0.0 2.0 11.1333 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
9 10.0 1.0 2.0 14.0 1.0 0.0 30.0708 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0

10 rows × 26 columns

data.describe()
PassengerId Survived Pclass Age SibSp Parch Fare AgeImputed EmbarkedImputed CabinLetterImputed ... Embarked_missing CabinLetter_A CabinLetter_B CabinLetter_C CabinLetter_D CabinLetter_E CabinLetter_F CabinLetter_G CabinLetter_T CabinLetter_missing
count 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000 ... 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.361582 0.523008 0.381594 32.204208 0.198653 0.002245 0.771044 ... 0.002245 0.016835 0.052750 0.066218 0.037037 0.035915 0.014590 0.004489 0.001122 0.771044
std 257.353842 0.486592 0.836071 13.019697 1.102743 0.806057 49.693429 0.399210 0.047351 0.420397 ... 0.047351 0.128725 0.223659 0.248802 0.188959 0.186182 0.119973 0.066890 0.033501 0.420397
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 22.000000 0.000000 0.000000 7.910400 0.000000 0.000000 1.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200 0.000000 0.000000 1.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000
75% 668.500000 1.000000 3.000000 35.000000 1.000000 0.000000 31.000000 0.000000 0.000000 1.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200 1.000000 1.000000 1.000000 ... 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000

8 rows × 26 columns

# Drop PassengerId (axis=1 indicates we are removing a column rather than a row)
# We drop PassengerId as it is not original data
# inplace=True means change the dataframe itself - don't create a copy with this column dropped

data.drop('PassengerId', inplace=True, axis=1)

4.1 Divide into X (features) and y (labels)

X = data.drop('Survived',axis=1) # X = all 'data' except the 'survived' column
y = data['Survived'] # y = 'survived' column from 'data'

4.2 Divide into training and test sets

# Hold back 25% of the data for testing. Passing random_state here would
# make the split reproducible independently of the numpy seed set above.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

4.3 Fit decision tree model

model = DecisionTreeClassifier() # Create a Decision Tree Model
model = model.fit(X_train,y_train) # Fit the model using our training data

4.4 Predict values

# Predict training and test set labels
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

4.5 Calculate accuracy

# The shorthand below says to check each predicted y value against the actual
# y value in the training data.  This gives a list of True and False values
# for each prediction, where True indicates the predicted value matches the
# actual value.  Then we take the mean of these Boolean values, which gives
# us a proportion (where if all values were True, the proportion would be 1.0)
# If you want to see why that works, just uncomment the following line of code
# to see what y_pred_train == y_train is doing.
# print (y_pred_train == y_train)
accuracy_train = np.mean(y_pred_train == y_train)
accuracy_test = np.mean(y_pred_test == y_test)

print (f'Accuracy of predicting training data = {accuracy_train:.3f}')
print (f'Accuracy of predicting test data = {accuracy_test:.3f}')
Accuracy of predicting training data = 0.984
Accuracy of predicting test data = 0.767
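The Boolean-mean trick is easiest to see on a tiny made-up example (the values below are purely illustrative):

demo_pred = np.array([0., 1., 1., 0.])
demo_true = np.array([0., 1., 0., 0.])
print (demo_pred == demo_true)            # [ True  True False  True]
print (np.mean(demo_pred == demo_true))   # 0.75 - the same computation sklearn's accuracy_score performs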
# Show first ten predicted classes
classes = model.predict(X_test)
classes[0:10]
array([0., 1., 0., 1., 1., 1., 1., 0., 0., 1.])
# Show first ten predicted probabilities
probabilities = model.predict_proba(X_test)
probabilities[0:10]
array([[1.        , 0.        ],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.16666667, 0.83333333],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.        , 1.        ]])
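The predicted class is the one with the highest predicted probability; for a binary problem that amounts to checking whether the probability of class 1 is at least 0.5 (ignoring exact ties). A quick check against the predictions above:

# Column 1 holds the probability of class 1 ('Survived')
print ((probabilities[:10, 1] >= 0.5).astype(float))  # matches classes[0:10]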

4.6 Plot tree

tree_plot = plot_tree(model)

tree_plot = plot_tree(
    model,
    feature_names=data.drop('Survived',axis=1).columns.tolist(),
    class_names=['Died', 'Survived'],
    filled=True
    )

4.7 Following Nodes

The scikit-learn documentation walks through the low-level structure of a fitted tree and how to trace a sample's path through it:

https://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html#sphx-glr-auto-examples-tree-plot-unveil-tree-structure-py
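As a minimal sketch of the same idea (attribute names follow the scikit-learn docs), the fitted model exposes parallel arrays describing each node:

tree = model.tree_
feature_names = X.columns.tolist()

print (f'Number of nodes: {tree.node_count}')

# For each internal node: the feature and threshold used to split it;
# a children_left value of -1 marks a leaf
for node_id in range(min(5, tree.node_count)):
    if tree.children_left[node_id] == -1:
        print (f'Node {node_id}: leaf')
    else:
        print (f'Node {node_id}: split on {feature_names[tree.feature[node_id]]}'
               f' <= {tree.threshold[node_id]:.2f}')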

5 Tweaking DT Parameters

def train_and_run_dt(model):
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    accuracy_train = np.mean(y_pred_train == y_train)
    accuracy_test = np.mean(y_pred_test == y_test)

    print (f'Accuracy of predicting training data = {accuracy_train:.3f}')
    print (f'Accuracy of predicting test data = {accuracy_test:.3f}')

    # Note: with average='micro', precision and recall both collapse to
    # overall accuracy for a single-label problem, which is why they can
    # match the accuracy figure above
    print (f"Precision on test data = {precision_score(y_test, y_pred_test, average='micro'):.3f}")
    print (f"Recall on test data = {recall_score(y_test, y_pred_test, average='micro'):.3f}")
    # Specificity is the recall of the negative class (0 = died);
    # precision_score with average='micro' ignores pos_label, so
    # recall_score with pos_label=0 is used instead
    print (f"Specificity on test data = {recall_score(y_test, y_pred_test, pos_label=0):.3f}")
recall_score(y_test, y_pred_test, average='micro')
0.7668161434977578
precision_score(y_test, y_pred_test, pos_label=0)
0.8106060606060606
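Specificity (the proportion of true negatives that are correctly identified) is the recall of the negative class. A quick sketch cross-checking that against the confusion matrix:

tn, fp, fn, tp = confusion_matrix(y_test, y_pred_test).ravel()
print (recall_score(y_test, y_pred_test, pos_label=0))  # specificity
print (tn / (tn + fp))                                  # same value, from the confusion matrix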
train_and_run_dt(model = DecisionTreeClassifier())
Accuracy of predicting training data = 0.984
Accuracy of predicting test data = 0.758
Precision on test data = 0.758
Recall on test data = 0.758
Specificity on test data = 0.758

5.0.1 Min Samples Leaf

min_samples_leaf sets the minimum number of training samples that must fall in each leaf; raising it stops the tree from memorising individual passengers.

train_and_run_dt(model = DecisionTreeClassifier(min_samples_leaf=5))
Accuracy of predicting training data = 0.883
Accuracy of predicting test data = 0.830
Precision on test data = 0.830
Recall on test data = 0.830
Specificity on test data = 0.830
accuracy_results = []

for i in range(1, 15, 1):
    model = DecisionTreeClassifier(min_samples_leaf=i, random_state=42)
    model.fit(X_train,y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    accuracy_train = np.mean(y_pred_train == y_train)
    accuracy_test = np.mean(y_pred_test == y_test)
    accuracy_results.append({'accuracy_train': accuracy_train, 'accuracy_test': accuracy_test, 'min_samples_leaf': i})

px.line(pd.DataFrame(accuracy_results).melt(id_vars='min_samples_leaf'),
        x='min_samples_leaf', y='value', color='variable')
[Plotly line chart: training and test accuracy against min_samples_leaf]

5.0.2 Min Samples Split

min_samples_split sets the minimum number of samples a node must contain before it is allowed to be split further.

train_and_run_dt(model = DecisionTreeClassifier(min_samples_split=5))
Accuracy of predicting training data = 0.945
Accuracy of predicting test data = 0.767
Precision on test data = 0.767
Recall on test data = 0.767
Specificity on test data = 0.767
accuracy_results = []

for i in range(2, 15, 1):
    model = DecisionTreeClassifier(min_samples_split=i)
    model.fit(X_train,y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    accuracy_train = np.mean(y_pred_train == y_train)
    accuracy_test = np.mean(y_pred_test == y_test)
    accuracy_results.append({'accuracy_train': accuracy_train, 'accuracy_test': accuracy_test, 'min_samples_split': i})

px.line(pd.DataFrame(accuracy_results).melt(id_vars='min_samples_split'),
        x='min_samples_split', y='value', color='variable')
[Plotly line chart: training and test accuracy against min_samples_split]
5.0.3 Max Depth

max_depth caps the number of levels of splits the tree may make. First, re-run an unconstrained tree as a baseline:

train_and_run_dt(model = DecisionTreeClassifier())
Accuracy of predicting training data = 0.984
Accuracy of predicting test data = 0.744
Precision on test data = 0.744
Recall on test data = 0.744
Specificity on test data = 0.744
train_and_run_dt(model = DecisionTreeClassifier(max_depth=5))
Accuracy of predicting training data = 0.861
Accuracy of predicting test data = 0.807
Precision on test data = 0.807
Recall on test data = 0.807
Specificity on test data = 0.807
train_and_run_dt(model = DecisionTreeClassifier(max_depth=3))
Accuracy of predicting training data = 0.832
Accuracy of predicting test data = 0.803
Precision on test data = 0.803
Recall on test data = 0.803
Specificity on test data = 0.803
train_and_run_dt(model = DecisionTreeClassifier(max_depth=7))
Accuracy of predicting training data = 0.891
Accuracy of predicting test data = 0.789
Precision on test data = 0.789
Recall on test data = 0.789
Specificity on test data = 0.789
accuracy_results = []

for i in range(1, 15, 1):
    model = DecisionTreeClassifier(max_depth=i)
    model.fit(X_train,y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    accuracy_train = np.mean(y_pred_train == y_train)
    accuracy_test = np.mean(y_pred_test == y_test)
    accuracy_results.append({'accuracy_train': accuracy_train, 'accuracy_test': accuracy_test, 'max_depth': i})

px.line(pd.DataFrame(accuracy_results).melt(id_vars='max_depth'),
        x='max_depth', y='value', color='variable')
[Plotly line chart: training and test accuracy against max_depth]
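So far each parameter has been varied on its own and scored against the test set. As a sketch of a more rigorous alternative (an addition here, not part of the session above), the three parameters can be searched jointly with cross-validation on the training set, so the test set plays no part in tuning:

from sklearn.model_selection import GridSearchCV

# Candidate values below are illustrative
param_grid = {
    'min_samples_leaf': [1, 3, 5, 10],
    'min_samples_split': [2, 5, 10],
    'max_depth': [3, 5, 7, None],
}

search = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=5)
search.fit(X_train, y_train)

print (search.best_params_)
print (f'Best cross-validated accuracy: {search.best_score_:.3f}')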
fig, ax = plt.subplots(figsize=(10,8))

model = DecisionTreeClassifier()
model = model.fit(X_train, y_train)
tree_plot = plot_tree(model,
    feature_names=data.drop('Survived',axis=1).columns.tolist(),
    class_names=['Died', 'Survived'],
    filled=True,
    ax=ax
    )

fig, ax = plt.subplots(figsize=(18,12))

model = DecisionTreeClassifier(max_depth=3)
model = model.fit(X_train, y_train)
tree_plot = plot_tree(model,
    feature_names=data.drop('Survived',axis=1).columns.tolist(),
    class_names=['Died', 'Survived'],
    filled=True,
    ax=ax,
    fontsize=11
    )
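For deeper trees the plot quickly becomes unreadable. As an optional extra (not shown in the session), sklearn's export_text prints the same rules as plain text:

from sklearn.tree import export_text

print (export_text(
    model,
    feature_names=data.drop('Survived', axis=1).columns.tolist()
    ))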

6 Compare with our log reg

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
def standardise_data(X_train, X_test):

    # Initialise a new scaling object for normalising input data
    sc = StandardScaler()

    # Fit the scaler on the training set only, then apply that same
    # transformation to the test set (calling fit_transform on the test
    # set as well would leak information from it)
    train_std = sc.fit_transform(X_train)
    test_std = sc.transform(X_test)

    return train_std, test_std
X_train_standardised, X_test_standardised = standardise_data(X_train, X_test)
model = LogisticRegression()
model.fit(X_train_standardised,y_train)
LogisticRegression()
# Predict training and test set labels
y_pred_train = model.predict(X_train_standardised)
y_pred_test = model.predict(X_test_standardised)
# As before, accuracy is the mean of the Boolean comparison between
# predicted and actual values
accuracy_train = np.mean(y_pred_train == y_train)
accuracy_test = np.mean(y_pred_test == y_test)

print (f'Accuracy of predicting training data = {accuracy_train}')
print (f'Accuracy of predicting test data = {accuracy_test}')
Accuracy of predicting training data = 0.8083832335329342
Accuracy of predicting test data = 0.8116591928251121

Best train accuracy seen in DT: 0.823; train accuracy seen in LR: 0.805

Best test accuracy seen in DT: 0.820; test accuracy seen in LR: 0.798

6.1 Post pruning

Cost-complexity pruning grows the full tree first, then repeatedly removes the branches that contribute least; the effective alpha controls how aggressively branches are pruned away. See:

https://ranvir.xyz/blog/practical-approach-to-tree-pruning-using-sklearn/

https://scikit-learn.org/stable/auto_examples/tree/plot_cost_complexity_pruning.html

import matplotlib.pyplot as plt
model = DecisionTreeClassifier()
path = model.cost_complexity_pruning_path(X_train, y_train)

ccp_alphas, impurities = path.ccp_alphas, path.impurities

plt.figure(figsize=(10, 6))
plt.plot(ccp_alphas, impurities)
plt.xlabel("effective alpha")
plt.ylabel("total impurity of leaves")
[Figure: total impurity of leaves against effective alpha]

models = []

for ccp_alpha in ccp_alphas:
    model = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    model.fit(X_train, y_train)
    models.append(model)
tree_depths = [model.tree_.max_depth for model in models]
plt.figure(figsize=(10,  6))
plt.plot(ccp_alphas[:-1], tree_depths[:-1])
plt.xlabel("effective alpha")
plt.ylabel("total depth")
[Figure: tree depth against effective alpha]

from sklearn.metrics import accuracy_score

acc_scores = [accuracy_score(y_test, model.predict(X_test)) for model in models]

tree_depths = [model.tree_.max_depth for model in models]
plt.figure(figsize=(10,  6))
plt.grid()
plt.plot(ccp_alphas[:-1], acc_scores[:-1])
plt.xlabel("effective alpha")
plt.ylabel("Accuracy scores")
[Figure: test-set accuracy against effective alpha]
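The peak of this curve can be read off programmatically; a small sketch added here for convenience (note that choosing alpha on the test set is itself a mild form of overfitting, so cross-validation would be more rigorous):

best_index = int(np.argmax(acc_scores[:-1]))
print (f'ccp_alpha with highest test accuracy: {ccp_alphas[best_index]:.4f} '
       f'(accuracy {acc_scores[best_index]:.3f})')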

Fitting a final model with an alpha chosen from the accuracy plot above:

train_and_run_dt(DecisionTreeClassifier(random_state=0, ccp_alpha=0.0045))
Accuracy of predicting training data = 0.832
Accuracy of predicting test data = 0.803
Precision on test data = 0.803
Recall on test data = 0.803
Specificity on test data = 0.803

6.2 Exploring metrics in our LR and DT

6.2.1 Decision Tree

np.random.seed(42)

decision_tree_model = DecisionTreeClassifier(max_depth=6)
decision_tree_model = decision_tree_model.fit(X_train, y_train)

y_pred_train_dt = decision_tree_model.predict(X_train)
y_pred_test_dt = decision_tree_model.predict(X_test)

# Note: don't call this variable roc_curve - that would shadow the
# sklearn.metrics.roc_curve function imported at the top
roc_curve_dt = RocCurveDisplay.from_estimator(
    decision_tree_model, X_test, y_test
)

fig = roc_curve_dt.figure_
ax = roc_curve_dt.ax_

# Add a dashed diagonal showing the performance of a no-skill classifier
ax.plot([0, 1], [0, 1], color='darkblue', linestyle='--')

confusion_matrix_dt = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(
        y_true=y_test,
        y_pred=y_pred_test_dt
        ),
        display_labels=["Died", "Survived"]
)

confusion_matrix_dt.plot()

plt.show()

confusion_matrix_dt_normalised = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(
        y_true=y_test,
        y_pred=y_pred_test_dt,
        normalize='true'
        ),
        display_labels=["Died", "Survived"]
)

confusion_matrix_dt_normalised.plot()

plt.show()

pd.DataFrame(precision_recall_fscore_support(
        y_true=y_test,
        y_pred=y_pred_test_dt,
        average="binary"
        ))
0
0 0.769231
1 0.561798
2 0.649351
3 NaN
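The four unlabelled rows above are precision, recall, f-score and support (support is None when average="binary", hence the NaN). A small presentation tweak, added here, that labels them:

metrics = precision_recall_fscore_support(
        y_true=y_test,
        y_pred=y_pred_test_dt,
        average="binary"
        )
print (pd.Series(metrics, index=['precision', 'recall', 'f-score', 'support']))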

6.2.2 Logistic Regression

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

def standardise_data(X_train, X_test):

    # Initialise a new scaling object for normalising input data
    sc = StandardScaler()

    # Fit the scaler on the training set only, then apply that same
    # transformation to the test set (calling fit_transform on the test
    # set as well would leak information from it)
    train_std = sc.fit_transform(X_train)
    test_std = sc.transform(X_test)

    return train_std, test_std

X_train_standardised, X_test_standardised = standardise_data(X_train, X_test)

logistic_regression_model = LogisticRegression()
logistic_regression_model = logistic_regression_model.fit(X_train_standardised,y_train)

y_pred_train_lr = logistic_regression_model.predict(X_train_standardised)
y_pred_test_lr = logistic_regression_model.predict(X_test_standardised)

accuracy_train = np.mean(y_pred_train_lr == y_train)
accuracy_test = np.mean(y_pred_test_lr == y_test)

print (f'Accuracy of predicting training data = {accuracy_train}')
print (f'Accuracy of predicting test data = {accuracy_test}')
Accuracy of predicting training data = 0.8083832335329342
Accuracy of predicting test data = 0.8116591928251121
roc_curve_lr = RocCurveDisplay.from_estimator(
    logistic_regression_model, X_test_standardised, y_test
)

fig = roc_curve_lr.figure_
ax = roc_curve_lr.ax_

ax.plot([0, 1], [0, 1], color='darkblue', linestyle='--')

confusion_matrix_lr = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(
        y_true=y_test,
        y_pred=y_pred_test_lr,
        ),
        display_labels=["Died", "Survived"]
)

confusion_matrix_lr.plot()

plt.show()

confusion_matrix_lr_normalised = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(
        y_true=y_test,
        y_pred=y_pred_test_lr,
        normalize='true',
        ),
        display_labels=["Died", "Survived"]
)

confusion_matrix_lr_normalised.plot()

plt.show()

pd.DataFrame(classification_report(
        y_true=y_test,
        y_pred=y_pred_test_lr,
        target_names=["Died", "Survived"],
        output_dict=True
))
Died Survived accuracy macro avg weighted avg
precision 0.823944 0.790123 0.811659 0.807034 0.810446
recall 0.873134 0.719101 0.811659 0.796118 0.811659
f1-score 0.847826 0.752941 0.811659 0.800384 0.809957
support 134.000000 89.000000 0.811659 223.000000 223.000000
precision, recall, fbeta, support = precision_recall_fscore_support(
        y_true=y_test,
        y_pred=y_pred_test_lr,
        average="binary"
        )
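The unpacked values can then be reported directly (a line added for completeness; support is None for a binary average, so it is not printed):

print (f'Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {fbeta:.3f}')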

6.3 Compare confusion matrices

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
confusion_matrix_dt.plot(ax=ax1)
ax1.title.set_text('Decision Tree')

confusion_matrix_lr.plot(ax=ax2)
ax2.title.set_text('Logistic Regression')

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
confusion_matrix_dt_normalised.plot(ax=ax1)
ax1.title.set_text('Decision Tree - Normalised')

confusion_matrix_lr_normalised.plot(ax=ax2)
ax2.title.set_text('Logistic Regression - Normalised')

7 Compare ROC Curves

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
roc_curve_dt.plot(ax=ax1)
ax1.title.set_text('Decision Tree')
ax1.plot([0, 1], [0, 1], color='darkblue', linestyle='--')

roc_curve_lr.plot(ax=ax2)
ax2.title.set_text('Logistic Regression')
ax2.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
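The two curves can also be summarised as a single number, the area under the curve (AUC). A sketch using the roc_curve and auc functions already imported at the top (this comparison is an addition to the session):

# Probability of the positive class ('Survived') for each model
y_prob_dt = decision_tree_model.predict_proba(X_test)[:, 1]
y_prob_lr = logistic_regression_model.predict_proba(X_test_standardised)[:, 1]

fpr_dt, tpr_dt, _ = roc_curve(y_test, y_prob_dt)
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_prob_lr)

print (f'AUC - decision tree: {auc(fpr_dt, tpr_dt):.3f}')
print (f'AUC - logistic regression: {auc(fpr_lr, tpr_lr):.3f}')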