import pandas as pd
import numpy as np
from time import time
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, f1_score, precision_score, \
recall_score
# Imports for hyperparameter optimisation
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score
import optuna
from optuna.visualization import plot_optimization_history, \
plot_param_importances, plot_rank, plot_slice
# Import scipy random integer function and give it an alias so we don't overwrite
# other functions
from scipy.stats import randint as sp_randint
28 Hyperparameter Optimisation (Titanic Dataset)
Hyperparameter optimisation with grid search or frameworks like Optuna can be more efficient than manually trying out combinations of hyperparameters.
First, let’s import the processed Titanic dataset and split it into training and testing datasets.
We will be using cross-validated models, so rather than separating out a validation dataset, validation will be handled automatically from the training dataset we pass in.
try:
    data = pd.read_csv("data/processed_data.csv")
except FileNotFoundError:
    # Download processed data:
    address = 'https://raw.githubusercontent.com/MichaelAllen1966/' + \
        '1804_python_healthcare/master/titanic/data/processed_data.csv'
    data = pd.read_csv(address)

    # Create a data subfolder if one does not already exist
    import os
    data_directory = './data/'
    if not os.path.exists(data_directory):
        os.makedirs(data_directory)

    # Save data
    data.to_csv(data_directory + 'processed_data.csv', index=False)
data = data.astype(float)

# Drop PassengerId (axis=1 indicates we are removing a column rather than a row)
# We drop passenger ID as it is not original data
data.drop('PassengerId', inplace=True, axis=1)

X = data.drop('Survived', axis=1) # X = all 'data' except the 'survived' column
y = data['Survived'] # y = 'survived' column from 'data'

feature_names = X.columns.tolist()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training Dataset Samples: {len(X_train)}")
print(f"Testing Dataset Samples: {len(X_test)}")
Training Dataset Samples: 712
Testing Dataset Samples: 179
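As a point of comparison for the tuned models later, we could first check the cross-validated accuracy of a decision tree left at its default settings. This is a minimal sketch added for illustration - the baseline_model and baseline_accuracy names are just examples, not part of the original workflow.

# Baseline: mean 5-fold cross-validated accuracy of an untuned decision tree
baseline_model = DecisionTreeClassifier(random_state=42)
baseline_accuracy = cross_val_score(
    baseline_model, X_train, y_train, scoring='accuracy', cv=5).mean()
print(f"Baseline cross-validated accuracy: {baseline_accuracy:.3f}")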
29 Exhaustive
Let’s begin by doing an exhaustive search - evaluating every possible combination of parameters - using GridSearchCV.
We create a dictionary containing the parameter names on the left - these must match how they are referred to in the model we are working with, which is a decision tree in this case - and the possible values on the right.
We can pass in a list of possible options, or generate a range object and pass that in.
Remember that the ‘up to’ parameter of a range isn’t inclusive. The third parameter is the step/gap between numbers.
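For example, range(1, 16, 1) covers the whole numbers 1 to 15 - the stop value of 16 is never tried:

# The stop value of a range is excluded, so this covers 1 to 15 inclusive
print(list(range(1, 16, 1)))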
Verbose controls the level of output - higher numbers will give more detail in the cell output.
params = [{
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': range(1, 16, 1),
    'min_samples_split': range(2, 16, 1),
    'min_samples_leaf': range(1, 16, 1)
}]

gridsearch_dt = GridSearchCV(DecisionTreeClassifier(random_state=42),
                             param_grid=params,
                             scoring='accuracy',
                             cv=5,
                             verbose=2)
We then fit our grid search object and return the best parameters.
We can also calculate the score.
start_time = time()

gridsearch_dt.fit(X_train, y_train)
end_time = time()

print(f"Time taken: {(end_time - start_time):.3f}s")
print(f"Best parameters{gridsearch_dt.best_params_}")
print(f"Training Set Score: {gridsearch_dt.score(X_train, y_train)}")
print(f"Test Set Score: {gridsearch_dt.score(X_test, y_test)}")
Time taken: 515.106s
Best parameters{'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 9}
Training Set Score: 0.8904494382022472
Test Set Score: 0.7988826815642458
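Because GridSearchCV refits a model on the full training set using the best parameters it found, the tuned model is available as best_estimator_. As a rough sketch using the metric functions imported at the top of this notebook, we could also report precision, recall and F1 on the test set (best_dt and y_pred are just illustrative names):

# The refitted decision tree with the best parameter combination
best_dt = gridsearch_dt.best_estimator_

# Additional test set metrics beyond plain accuracy
y_pred = best_dt.predict(X_test)
print(f"Precision: {precision_score(y_test, y_pred):.3f}")
print(f"Recall: {recall_score(y_test, y_pred):.3f}")
print(f"F1 Score: {f1_score(y_test, y_pred):.3f}")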
30 Randomised
Randomised search will instead try only as many parameter combinations as we specify with the ‘n_iter’ parameter.
Remember that each combination is still cross-validated - so with 5-fold cross-validation and 500 iterations, there will actually be 5 * 500 = 2500 model fits!
Note that we have to specify the parameter grid slightly differently when using randomised search. Scipy’s randint function, which we’ve imported as sp_randint, provides the parameter values in the format RandomizedSearchCV requires.
params_dist = [{
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': sp_randint(1, 16),
    'min_samples_split': sp_randint(2, 16),
    'min_samples_leaf': sp_randint(1, 16)
}]
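As a quick illustration (not part of the search itself), we can draw a few samples from one of these distributions to see the kind of values it produces - like range, the upper bound is excluded:

# sp_randint(1, 16) samples integers uniformly from 1 to 15 inclusive
print(sp_randint(1, 16).rvs(size=10, random_state=42))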
start_time = time()

rgs_dt = RandomizedSearchCV(
    DecisionTreeClassifier(random_state=42),
    n_iter=500,
    param_distributions=params_dist,
    scoring='accuracy',
    cv=5)
rgs_dt.fit(X_train, y_train)
print(f"Best parameters{rgs_dt.best_params_}")
print(f"Training Set Score: {rgs_dt.score(X_train, y_train)}")
print(f"Test Set Score: {rgs_dt.score(X_test, y_test)}")
print(f"Time taken: {(time() - start_time):.3f}s")
Best parameters{'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 3, 'min_samples_split': 7}
Training Set Score: 0.8974719101123596
Test Set Score: 0.8268156424581006
Time taken: 17.146s
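If we want to dig into the individual parameter combinations that were sampled, the cv_results_ attribute can be loaded into a pandas dataframe - a minimal sketch (results_df and columns_to_show are just illustrative names):

# Inspect the sampled parameter combinations, best-ranked first
results_df = pd.DataFrame(rgs_dt.cv_results_)
columns_to_show = ['params', 'mean_test_score', 'std_test_score']
print(results_df.sort_values('rank_test_score')[columns_to_show].head())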
31 Optuna
The Optuna framework searches more intelligently for good combinations of parameters, using the results of earlier trials to decide which combinations to try next.
We need to set this up slightly differently - we define an objective function that gets passed a trial parameter.
Within this, we set up our parameter values; the first argument of each suggest_ call is the actual name of the parameter in the relevant model.
Then we set up the model, and ensure that the return value from the objective function is a numeric score - here we’ve chosen average accuracy over 3 cross-validated folds.
def objective(trial):
    # Set Optuna trial parameters and ranges
    rf_criterion = trial.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss'])
    rf_max_depth = trial.suggest_int('max_depth', 1, 32, log=True)
    rf_min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 32, log=True)
    rf_min_samples_split = trial.suggest_int('min_samples_split', 2, 32, log=True)

    # Set up model
    model = DecisionTreeClassifier(
        criterion=rf_criterion,
        max_depth=rf_max_depth,
        min_samples_leaf=rf_min_samples_leaf,
        min_samples_split=rf_min_samples_split,
        random_state=42
    )

    # Assess accuracy with sklearn.model_selection.cross_val_score
    accuracy = cross_val_score(
        model, X_train, y_train, n_jobs=-1, cv=3).mean()

    return accuracy
We then want to set up a study using the optuna.create_study() function.
start_time = time()

# Set up Optuna study - we need to specify that we wish to maximise the objective
study = optuna.create_study(direction='maximize')

# Run optimisation
study.optimize(objective, n_trials=1000)

# Get best model run
trial = study.best_trial

print(f'Accuracy: {trial.value:0.3f}')
print(f'Best hyperparameters: {trial.params}')
print(f"Time taken: {(time() - start_time):.3f}s")
Finally, we can explore the plots that Optuna provides.
The optimization history shows how the best value - in this case, accuracy - increased as additional ‘trials’ (parameter combinations) were tried. Each dot is the result from a single trial.
plot_optimization_history(study)
The slice plot shows the scores achieved across the range of values tried for each hyperparameter, and where most of the search time was spent.
plot_slice(study)
Next, we can get some sense of how important the different hyperparameters were for the final output.
plot_param_importances(study)
We can also look at different importances - such as how much impact different parameters had on the model fit time.
optuna.visualization.plot_param_importances(
    study, target=lambda t: t.duration.total_seconds(), target_name="duration"
)
The rank plot is another option available but isn’t the easiest to interpret!
plot_rank(study)
ExperimentalWarning: plot_rank is experimental (supported from v3.2.0). The interface can change in the future.