8 Boosted Trees for Classification: XGBoost, CatBoost, AdaBoost, LightGBM, HistogramGradientBoosting (Titanic Dataset)

import numpy as np
import pandas as pd

# Import machine learning methods
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree, DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.metrics import auc, roc_curve, RocCurveDisplay, f1_score, precision_score, \
    recall_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
np.random.seed(42)

download_required = False

if download_required:
    # Download processed data:
    address = 'https://raw.githubusercontent.com/MichaelAllen1966/' + \
              '1804_python_healthcare/master/titanic/data/processed_data.csv'
    data = pd.read_csv(address)

    # Create a data subfolder if one does not already exist
    import os
    data_directory = '../datasets/'
    if not os.path.exists(data_directory):
        os.makedirs(data_directory)

    # Save data
    data.to_csv(data_directory + 'processed_titanic_data.csv', index=False)

data = pd.read_csv('../datasets/processed_titanic_data.csv')
# Make all data 'float' type
data = data.astype(float)

data.head(10)
  | PassengerId | Survived | Pclass | Age | SibSp | Parch | Fare | AgeImputed | EmbarkedImputed | CabinLetterImputed | ... | Embarked_missing | CabinLetter_A | CabinLetter_B | CabinLetter_C | CabinLetter_D | CabinLetter_E | CabinLetter_F | CabinLetter_G | CabinLetter_T | CabinLetter_missing
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 1.0 | 0.0 | 3.0 | 22.0 | 1.0 | 0.0 | 7.2500 | 0.0 | 0.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
1 | 2.0 | 1.0 | 1.0 | 38.0 | 1.0 | 0.0 | 71.2833 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 3.0 | 1.0 | 3.0 | 26.0 | 0.0 | 0.0 | 7.9250 | 0.0 | 0.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
3 | 4.0 | 1.0 | 1.0 | 35.0 | 1.0 | 0.0 | 53.1000 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | 5.0 | 0.0 | 3.0 | 35.0 | 0.0 | 0.0 | 8.0500 | 0.0 | 0.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
5 | 6.0 | 0.0 | 3.0 | 28.0 | 0.0 | 0.0 | 8.4583 | 1.0 | 0.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
6 | 7.0 | 0.0 | 1.0 | 54.0 | 0.0 | 0.0 | 51.8625 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
7 | 8.0 | 0.0 | 3.0 | 2.0 | 3.0 | 1.0 | 21.0750 | 0.0 | 0.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
8 | 9.0 | 1.0 | 3.0 | 27.0 | 0.0 | 2.0 | 11.1333 | 0.0 | 0.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
9 | 10.0 | 1.0 | 2.0 | 14.0 | 1.0 | 0.0 | 30.0708 | 0.0 | 0.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
10 rows × 26 columns
data.describe()
  | PassengerId | Survived | Pclass | Age | SibSp | Parch | Fare | AgeImputed | EmbarkedImputed | CabinLetterImputed | ... | Embarked_missing | CabinLetter_A | CabinLetter_B | CabinLetter_C | CabinLetter_D | CabinLetter_E | CabinLetter_F | CabinLetter_G | CabinLetter_T | CabinLetter_missing
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
count | 891.000000 | 891.000000 | 891.000000 | 891.000000 | 891.000000 | 891.000000 | 891.000000 | 891.000000 | 891.000000 | 891.000000 | ... | 891.000000 | 891.000000 | 891.000000 | 891.000000 | 891.000000 | 891.000000 | 891.000000 | 891.000000 | 891.000000 | 891.000000 |
mean | 446.000000 | 0.383838 | 2.308642 | 29.361582 | 0.523008 | 0.381594 | 32.204208 | 0.198653 | 0.002245 | 0.771044 | ... | 0.002245 | 0.016835 | 0.052750 | 0.066218 | 0.037037 | 0.035915 | 0.014590 | 0.004489 | 0.001122 | 0.771044 |
std | 257.353842 | 0.486592 | 0.836071 | 13.019697 | 1.102743 | 0.806057 | 49.693429 | 0.399210 | 0.047351 | 0.420397 | ... | 0.047351 | 0.128725 | 0.223659 | 0.248802 | 0.188959 | 0.186182 | 0.119973 | 0.066890 | 0.033501 | 0.420397 |
min | 1.000000 | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 223.500000 | 0.000000 | 2.000000 | 22.000000 | 0.000000 | 0.000000 | 7.910400 | 0.000000 | 0.000000 | 1.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
50% | 446.000000 | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 | 0.000000 | 0.000000 | 1.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
75% | 668.500000 | 1.000000 | 3.000000 | 35.000000 | 1.000000 | 0.000000 | 31.000000 | 0.000000 | 0.000000 | 1.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
max | 891.000000 | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 | 1.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
8 rows × 26 columns
# Drop PassengerId (axis=1 indicates we are removing a column rather than a row)
# We drop passenger ID as it is not original data
# inplace=True means change the dataframe itself - don't create a copy with this column dropped
data.drop('PassengerId', inplace=True, axis=1)
8.1 Divide into X (features) and y (labels)
X = data.drop('Survived', axis=1)  # X = all 'data' except the 'Survived' column
y = data['Survived']  # y = 'Survived' column from 'data'
8.2 Divide into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
8.3 Fit XGBoost model
model = XGBClassifier(random_state=42)
model = model.fit(X_train, y_train)
8.4 Predict values
# Predict training and test set labels
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)
8.5 Calculate accuracy
# The shorthand below says to check each predicted y value against the actual
# y value in the training data. This gives a list of True and False values
# for each prediction, where True indicates the predicted value matches the
# actual value. Then we take the mean of these Boolean values, which gives
# us a proportion (where if all values were True, the proportion would be 1.0)
# If you want to see why that works, just uncomment the following line of code
# to see what y_pred_train == y_train is doing.
# print (y_pred_train == y_train)
accuracy_train = np.mean(y_pred_train == y_train)
accuracy_test = np.mean(y_pred_test == y_test)

print(f'Accuracy of predicting training data = {accuracy_train}')
print(f'Accuracy of predicting test data = {accuracy_test}')
Accuracy of predicting training data = 0.9730538922155688
Accuracy of predicting test data = 0.8071748878923767
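To see why averaging the Boolean comparison gives a proportion, here is a minimal sketch with made-up toy arrays (illustrative values, not from the dataset):

# Toy example: when averaged, True counts as 1 and False as 0
toy_actual = np.array([1, 0, 1, 1])
toy_predicted = np.array([1, 0, 0, 1])
matches = toy_predicted == toy_actual  # array([ True,  True, False,  True])
print(np.mean(matches))  # 3 of the 4 predictions match, so this prints 0.75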
# Show first ten predicted classes
classes = model.predict(X_test)
classes[0:10]
array([0, 0, 0, 1, 1, 1, 1, 0, 0, 1])
# Show first ten predicted probabilities
probabilities = model.predict_proba(X_test)
probabilities[0:10]
array([[0.86352104, 0.13647896],
[0.7883349 , 0.2116651 ],
[0.5199659 , 0.48003414],
[0.00145131, 0.9985487 ],
[0.07196659, 0.9280334 ],
[0.01444411, 0.9855559 ],
[0.22169638, 0.7783036 ],
[0.99416023, 0.00583977],
[0.67432725, 0.32567278],
[0.00534749, 0.9946525 ]], dtype=float32)
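Each row holds the probability of class 0 (died) in the first column and of class 1 (survived) in the second, with column order given by model.classes_. As a quick sanity check (a sketch, not part of the original notebook), thresholding the second column at 0.5 reproduces the classes returned by predict:

# Sketch: recover the predicted classes by thresholding the class-1 probability
manual_classes = (probabilities[:, 1] > 0.5).astype(int)
print(np.array_equal(manual_classes, classes))  # expect True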
8.6 Calculate other metrics
f1_score_xg = f1_score(y_test, y_pred_test, average='macro')
precision_score_xg = precision_score(y_test, y_pred_test, average='macro')
recall_score_xg = recall_score(y_test, y_pred_test, average='macro')

print(f'Accuracy of predicting test data = {accuracy_test}')
print(f'f1 score = {f1_score_xg}')
print(f'precision score = {precision_score_xg}')
print(f'recall score = {recall_score_xg}')
Accuracy of predicting test data = 0.8071748878923767
f1 score = 0.7978070637849236
precision score = 0.7961596511822908
recall score = 0.7961596511822908
print(classification_report(y_test, y_pred_test))
precision recall f1-score support
0.0 0.83 0.85 0.84 134
1.0 0.77 0.74 0.75 89
accuracy 0.81 223
macro avg 0.80 0.80 0.80 223
weighted avg 0.81 0.81 0.81 223
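The macro-averaged figures are just the unweighted means of the two per-class rows above; they can also be recovered by hand from the confusion matrix (a sketch for illustration):

# Sketch: per-class recall = diagonal counts / row totals (rows are true labels)
cm = confusion_matrix(y_test, y_pred_test)
recall_per_class = cm.diagonal() / cm.sum(axis=1)
print(recall_per_class, recall_per_class.mean())  # the mean matches the macro recall above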
9 Comparing Performance
def train_and_run(model):
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    accuracy_train = np.mean(y_pred_train == y_train)
    accuracy_test = np.mean(y_pred_test == y_test)

    print(f'Accuracy of predicting training data = {accuracy_train:.3f}')
    print(f'Accuracy of predicting test data = {accuracy_test:.3f}')
np.random.seed(42)

train_and_run(model=LogisticRegression())
Accuracy of predicting training data = 0.804
Accuracy of predicting test data = 0.807
c:\HSMA\_HSMA 6\Sammi's Sessions\h6_4e_boosted_trees\.venv\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
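The warning is benign here, but it can be resolved by following its own advice: allow more iterations, or standardise the inputs first (as done with standardise_data later in this notebook). A minimal sketch with an illustrative max_iter value:

# Sketch: give the lbfgs solver more iterations so it can converge
train_and_run(model=LogisticRegression(max_iter=1000))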
np.random.seed(42)

train_and_run(model=DecisionTreeClassifier(random_state=42))
Accuracy of predicting training data = 0.984
Accuracy of predicting test data = 0.758
np.random.seed(42)

train_and_run(model=RandomForestClassifier())
Accuracy of predicting training data = 0.984
Accuracy of predicting test data = 0.794
np.random.seed(42)

train_and_run(model=XGBClassifier())
Accuracy of predicting training data = 0.973
Accuracy of predicting test data = 0.807
9.0.0.1 Random Forest
np.random.seed(42)

random_forest_model = RandomForestClassifier()
random_forest_model = random_forest_model.fit(X_train, y_train)

y_pred_train_rf = random_forest_model.predict(X_train)
y_pred_test_rf = random_forest_model.predict(X_test)

roc_curve_rf = RocCurveDisplay.from_estimator(
    random_forest_model, X_test, y_test
)

confusion_matrix_rf = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(
        y_true=y_test,
        y_pred=y_pred_test_rf
    ),
    display_labels=["Died", "Survived"]
)

confusion_matrix_rf_normalised = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(
        y_true=y_test,
        y_pred=y_pred_test_rf,
        normalize='true'
    ),
    display_labels=["Died", "Survived"]
)
9.0.0.2 Decision Tree
np.random.seed(42)

decision_tree_model = DecisionTreeClassifier(max_depth=6)
decision_tree_model = decision_tree_model.fit(X_train, y_train)

y_pred_train_dt = decision_tree_model.predict(X_train)
y_pred_test_dt = decision_tree_model.predict(X_test)

roc_curve_dt = RocCurveDisplay.from_estimator(
    decision_tree_model, X_test, y_test
)

confusion_matrix_dt = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(
        y_true=y_test,
        y_pred=y_pred_test_dt
    ),
    display_labels=["Died", "Survived"]
)

confusion_matrix_dt_normalised = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(
        y_true=y_test,
        y_pred=y_pred_test_dt,
        normalize='true'
    ),
    display_labels=["Died", "Survived"]
)
9.0.0.3 Logistic Regression
np.random.seed(42)

def standardise_data(X_train, X_test):
    # Initialise a new scaling object for normalising input data
    sc = StandardScaler()

    # Fit the scaler on the training set only, then apply it to both sets
    # (fitting on the test set as well would leak test information)
    train_std = sc.fit_transform(X_train)
    test_std = sc.transform(X_test)

    return train_std, test_std

X_train_standardised, X_test_standardised = standardise_data(X_train, X_test)

logistic_regression_model = LogisticRegression()
logistic_regression_model.fit(X_train_standardised, y_train)

y_pred_train_lr = logistic_regression_model.predict(X_train_standardised)
y_pred_test_lr = logistic_regression_model.predict(X_test_standardised)

roc_curve_lr = RocCurveDisplay.from_estimator(
    logistic_regression_model, X_test_standardised, y_test
)

confusion_matrix_lr = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(
        y_true=y_test,
        y_pred=y_pred_test_lr
    ),
    display_labels=["Died", "Survived"]
)

confusion_matrix_lr_normalised = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(
        y_true=y_test,
        y_pred=y_pred_test_lr,
        normalize='true'
    ),
    display_labels=["Died", "Survived"]
)
9.0.0.4 XGBoost

np.random.seed(42)

xgboost_model = XGBClassifier()
xgboost_model = xgboost_model.fit(X_train, y_train)

y_pred_train_xg = xgboost_model.predict(X_train)
y_pred_test_xg = xgboost_model.predict(X_test)

roc_curve_xg = RocCurveDisplay.from_estimator(
    xgboost_model, X_test, y_test
)

confusion_matrix_xg = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(
        y_true=y_test,
        y_pred=y_pred_test_xg
    ),
    display_labels=["Died", "Survived"]
)

confusion_matrix_xg_normalised = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(
        y_true=y_test,
        y_pred=y_pred_test_xg,
        normalize='true'
    ),
    display_labels=["Died", "Survived"]
)
fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(14, 5))

confusion_matrix_lr.plot(ax=ax1)
ax1.title.set_text('Logistic Regression')

confusion_matrix_dt.plot(ax=ax2)
ax2.title.set_text('Decision Tree')

confusion_matrix_rf.plot(ax=ax3)
ax3.title.set_text('Random Forest')

confusion_matrix_xg.plot(ax=ax4)
ax4.title.set_text('XGBoost')
fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(14, 5))

confusion_matrix_rf_normalised.plot(ax=ax1)
ax1.title.set_text('Random Forest - Normalised')

confusion_matrix_dt_normalised.plot(ax=ax2)
ax2.title.set_text('Decision Tree - Normalised')

confusion_matrix_lr_normalised.plot(ax=ax3)
ax3.title.set_text('Logistic Regression - Normalised')

confusion_matrix_xg_normalised.plot(ax=ax4)
ax4.title.set_text('XGBoost - Normalised')
fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(14, 5))

roc_curve_rf.plot(ax=ax1)
ax1.title.set_text('Random Forest')
ax1.plot([0, 1], [0, 1], color='darkblue', linestyle='--')

roc_curve_dt.plot(ax=ax2)
ax2.title.set_text('Decision Tree')
ax2.plot([0, 1], [0, 1], color='darkblue', linestyle='--')

roc_curve_lr.plot(ax=ax3)
ax3.title.set_text('Logistic Regression')
ax3.plot([0, 1], [0, 1], color='darkblue', linestyle='--')

roc_curve_xg.plot(ax=ax4)
ax4.title.set_text('XGBoost')
ax4.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
10 XGBoost - Parallel Computation
from datetime import datetime
10.0.1 Time a standard xgboost training
np.random.seed(42)
start_at = datetime.now()
train_and_run(model=XGBClassifier())
print("Duration =", (datetime.now() - start_at))
Accuracy of predicting training data = 0.973
Accuracy of predicting test data = 0.807
Duration = 0:00:00.089040
10.0.2 Time a parallel xgboost training
np.random.seed(42)
start_at = datetime.now()
train_and_run(model=XGBClassifier(nthread=-1))
print("Duration =", (datetime.now() - start_at))
Accuracy of predicting training data = 0.973
Accuracy of predicting test data = 0.807
Duration = 0:00:00.083743
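On a dataset this small the difference is within run-to-run noise, so timings like this should be repeated before drawing conclusions. Note also that nthread is the older spelling; recent versions of the xgboost sklearn wrapper call the same setting n_jobs, so the parallel run could equally be written as below (a sketch, assuming a recent xgboost):

# Sketch: n_jobs=-1 asks XGBoost to use all available CPU cores
train_and_run(model=XGBClassifier(n_jobs=-1))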
10.1 Hyperparameters
10.1.1 n_estimators (number of boosted trees)
accuracy_results = []

for i in range(10, 500, 10):
    model = XGBClassifier(n_estimators=i, random_state=42, nthread=-1)
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    accuracy_train = np.mean(y_pred_train == y_train)
    accuracy_test = np.mean(y_pred_test == y_test)
    accuracy_results.append({'accuracy_train': accuracy_train, 'accuracy_test': accuracy_test, 'n_estimators': i})

px.line(pd.DataFrame(accuracy_results).melt(id_vars='n_estimators'),
        x='n_estimators', y='value', color='variable')
"n_estimators").sort_values(by=["accuracy_test"], ascending=False) pd.DataFrame(accuracy_results).set_index(
n_estimators | accuracy_train | accuracy_test
---|---|---
10 | 0.901198 | 0.847534 |
20 | 0.931138 | 0.834081 |
30 | 0.941617 | 0.829596 |
40 | 0.950599 | 0.820628 |
50 | 0.956587 | 0.820628 |
70 | 0.965569 | 0.820628 |
60 | 0.965569 | 0.816143 |
80 | 0.965569 | 0.811659 |
130 | 0.976048 | 0.811659 |
230 | 0.980539 | 0.811659 |
220 | 0.980539 | 0.807175 |
280 | 0.980539 | 0.807175 |
270 | 0.980539 | 0.807175 |
260 | 0.980539 | 0.807175 |
240 | 0.980539 | 0.807175 |
250 | 0.980539 | 0.807175 |
90 | 0.970060 | 0.807175 |
100 | 0.973054 | 0.807175 |
360 | 0.980539 | 0.802691 |
350 | 0.980539 | 0.802691 |
340 | 0.980539 | 0.802691 |
320 | 0.980539 | 0.802691 |
310 | 0.980539 | 0.802691 |
300 | 0.980539 | 0.802691 |
290 | 0.980539 | 0.802691 |
390 | 0.980539 | 0.802691 |
110 | 0.974551 | 0.802691 |
120 | 0.974551 | 0.802691 |
210 | 0.979042 | 0.802691 |
200 | 0.979042 | 0.802691 |
190 | 0.979042 | 0.802691 |
400 | 0.980539 | 0.798206 |
450 | 0.980539 | 0.798206 |
470 | 0.980539 | 0.798206 |
480 | 0.980539 | 0.798206 |
440 | 0.980539 | 0.798206 |
430 | 0.980539 | 0.798206 |
420 | 0.980539 | 0.798206 |
170 | 0.979042 | 0.798206 |
330 | 0.980539 | 0.798206 |
380 | 0.980539 | 0.798206 |
180 | 0.979042 | 0.798206 |
140 | 0.977545 | 0.798206 |
150 | 0.979042 | 0.798206 |
160 | 0.979042 | 0.798206 |
490 | 0.980539 | 0.798206 |
410 | 0.980539 | 0.793722 |
370 | 0.980539 | 0.793722 |
460 | 0.980539 | 0.793722 |
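Test accuracy clearly peaks at a small number of trees and then decays as the model overfits. Rather than scanning n_estimators by hand, a common alternative is early stopping, where XGBoost watches a validation set and stops adding trees once the score stops improving. A minimal sketch (the keyword's location varies between xgboost versions; in recent ones early_stopping_rounds is a constructor argument, and in practice you would hold out a separate validation set rather than reuse the test set):

# Sketch: let XGBoost pick the number of trees via early stopping
model = XGBClassifier(n_estimators=500, early_stopping_rounds=10, random_state=42)
model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
print(model.best_iteration)  # boosting rounds actually kept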
10.1.2 n_estimators (number of boosted trees) - with max depth of 5
accuracy_results = []

for i in range(10, 200, 10):
    model = XGBClassifier(n_estimators=i, random_state=42, max_depth=5, nthread=-1)
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    accuracy_train = np.mean(y_pred_train == y_train)
    accuracy_test = np.mean(y_pred_test == y_test)
    accuracy_results.append({'accuracy_train': accuracy_train, 'accuracy_test': accuracy_test, 'n_estimators': i})

px.line(pd.DataFrame(accuracy_results).melt(id_vars='n_estimators'),
        x='n_estimators', y='value', color='variable')
"n_estimators").sort_values(by=["accuracy_test"], ascending=False) pd.DataFrame(accuracy_results).set_index(
n_estimators | accuracy_train | accuracy_test
---|---|---
30 | 0.926647 | 0.843049 |
20 | 0.913174 | 0.838565 |
40 | 0.937126 | 0.829596 |
50 | 0.941617 | 0.825112 |
10 | 0.899701 | 0.820628 |
80 | 0.962575 | 0.820628 |
100 | 0.964072 | 0.820628 |
60 | 0.952096 | 0.820628 |
70 | 0.956587 | 0.816143 |
90 | 0.964072 | 0.816143 |
110 | 0.968563 | 0.811659 |
120 | 0.970060 | 0.811659 |
130 | 0.973054 | 0.807175 |
140 | 0.971557 | 0.802691 |
190 | 0.977545 | 0.802691 |
150 | 0.974551 | 0.798206 |
160 | 0.976048 | 0.798206 |
170 | 0.977545 | 0.798206 |
180 | 0.977545 | 0.798206 |
np.random.seed(42)

best_n_estimators = pd.DataFrame(accuracy_results).sort_values(by=["accuracy_test"], ascending=False).head(1)['n_estimators'].values[0]

model = RandomForestClassifier(n_estimators=best_n_estimators, random_state=42, max_depth=8)
model.fit(X_train, y_train)
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Use a name that doesn't shadow the roc_curve function imported from sklearn.metrics
roc_curve_display = RocCurveDisplay.from_estimator(
    model, X_test, y_test
)

fig = roc_curve_display.figure_
ax = roc_curve_display.ax_

ax.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(
        y_true=y_test,
        y_pred=y_pred_test
    ),
    display_labels=["Died", "Survived"]
).plot()
ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(
        y_true=y_test,
        y_pred=y_pred_test,
        normalize='true'
    ),
    display_labels=["Died", "Survived"]
).plot()
10.2 Learning Rate (ETA)
accuracy_results = []

for i in np.arange(0.005, 0.2, 0.01):
    model = XGBClassifier(learning_rate=i, random_state=42, nthread=-1)
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    accuracy_train = np.mean(y_pred_train == y_train)
    accuracy_test = np.mean(y_pred_test == y_test)
    accuracy_results.append({'accuracy_train': accuracy_train, 'accuracy_test': accuracy_test, 'learning_rate': i})

px.line(pd.DataFrame(accuracy_results).melt(id_vars='learning_rate'),
        x='learning_rate', y='value', color='variable')
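Learning rate and tree count trade off against each other: a lower learning rate shrinks each tree's contribution, so more boosting rounds are needed to reach the same fit. A sketch of scanning the two together (the pairings are illustrative, not tuned values):

# Sketch: lower learning rates generally need more boosting rounds to compensate
for lr, n_trees in [(0.3, 100), (0.1, 300), (0.05, 600)]:
    model = XGBClassifier(learning_rate=lr, n_estimators=n_trees, random_state=42, nthread=-1)
    model.fit(X_train, y_train)
    print(lr, n_trees, np.mean(model.predict(X_test) == y_test))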
10.3 Min child weight
accuracy_results = []

for i in range(2, 15):
    model = XGBClassifier(min_child_weight=i, random_state=42, nthread=-1)
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    accuracy_train = np.mean(y_pred_train == y_train)
    accuracy_test = np.mean(y_pred_test == y_test)
    accuracy_results.append({'accuracy_train': accuracy_train, 'accuracy_test': accuracy_test, 'min_child_weight': i})

px.line(pd.DataFrame(accuracy_results).melt(id_vars='min_child_weight'),
        x='min_child_weight', y='value', color='variable')
10.4 Subsample
https://blog.cambridgespark.com/hyperparameter-tuning-in-xgboost-4ff9100a3b2f
accuracy_results = []

for i in np.arange(0.05, 1, 0.05):
    model = XGBClassifier(subsample=i, random_state=42, nthread=-1)
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    accuracy_train = np.mean(y_pred_train == y_train)
    accuracy_test = np.mean(y_pred_test == y_test)
    accuracy_results.append({'accuracy_train': accuracy_train, 'accuracy_test': accuracy_test, 'subsample': i})

px.line(pd.DataFrame(accuracy_results).melt(id_vars='subsample'),
        x='subsample', y='value', color='variable')
10.5 Colsample
accuracy_results = []

for i in np.arange(0.05, 1, 0.05):
    model = XGBClassifier(colsample_bytree=i, random_state=42, nthread=-1)
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    accuracy_train = np.mean(y_pred_train == y_train)
    accuracy_test = np.mean(y_pred_test == y_test)
    accuracy_results.append({'accuracy_train': accuracy_train, 'accuracy_test': accuracy_test, 'colsample_bytree': i})

px.line(pd.DataFrame(accuracy_results).melt(id_vars='colsample_bytree'),
        x='colsample_bytree', y='value', color='variable')
10.6 Num boost round
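num_boost_round is the native (non-sklearn) XGBoost API's name for the number of boosting rounds, playing the same role as n_estimators above. A minimal sketch of the native API, assuming the same train/test split as the rest of this notebook:

import xgboost as xgb

# Sketch: the native API takes DMatrix objects and the tree count as num_boost_round
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test)
booster = xgb.train({'objective': 'binary:logistic'}, dtrain, num_boost_round=100)
test_probabilities = booster.predict(dtest)  # probabilities of class 1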
11 AdaBoost
model = AdaBoostClassifier()
model.fit(X_train, y_train)
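As with the earlier models, the train_and_run helper from section 9 can be reused to get comparable train/test accuracies (a quick sketch; this output is not shown in the original):

np.random.seed(42)
train_and_run(model=AdaBoostClassifier())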
12 CatBoost
from catboost import CatBoostClassifier

model = CatBoostClassifier()
model.fit(X_train, y_train)
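By default CatBoost prints one log line per boosting iteration, which floods the notebook output. The verbose argument controls this (a sketch):

# Sketch: verbose=0 suppresses CatBoost's per-iteration training log
model = CatBoostClassifier(verbose=0)
model.fit(X_train, y_train)
print(np.mean(model.predict(X_test) == y_test))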
13 LightGBM
from lightgbm import LGBMClassifier

model = LGBMClassifier()
model.fit(X_train, y_train)
[LightGBM] [Info] Number of positive: 253, number of negative: 415
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000513 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 247
[LightGBM] [Info] Number of data points in the train set: 668, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.378743 -> initscore=-0.494889
[LightGBM] [Info] Start training from score -0.494889
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf (message repeated for subsequent iterations)
LGBMClassifier()
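The 'no further splits with positive gain' warnings simply mean individual trees ran out of useful splits on this small dataset. LightGBM's output can be quietened with the verbose parameter (a sketch; verbose is passed through to the underlying LightGBM params):

# Sketch: verbose=-1 suppresses LightGBM's informational and warning messages
model = LGBMClassifier(verbose=-1)
model.fit(X_train, y_train)
print(np.mean(model.predict(X_test) == y_test))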
14 Histogram based boosting classifiers
from sklearn.ensemble import HistGradientBoostingClassifier

model = HistGradientBoostingClassifier()
model.fit(X_train, y_train)
HistGradientBoostingClassifier()
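HistGradientBoostingClassifier is scikit-learn's LightGBM-inspired, histogram-based booster, so it needs no extra packages. It can be scored against the earlier models with the same helper (a sketch):

# Sketch: reuse the section 9 helper to score the histogram-based booster
np.random.seed(42)
train_and_run(model=HistGradientBoostingClassifier(random_state=42))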