import pandas as pd
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, f1_score, precision_score, \
    recall_score
import numpy as np
from time import time
# New import
from sklearn.feature_selection import SequentialFeatureSelector, SelectFromModel
27 Feature Selection Methods for Machine Learning (Titanic Dataset)
Feature selection - the act of reducing the number of features used for prediction in your final model - can have a positive impact on your model’s performance.
There are several options available to us from within sklearn.
We’re going to work with the processed Titanic dataset - let’s load it in.
try:
    data = pd.read_csv("data/processed_data.csv")
except FileNotFoundError:
    # Download processed data:
    address = 'https://raw.githubusercontent.com/MichaelAllen1966/' + \
              '1804_python_healthcare/master/titanic/data/processed_data.csv'
    data = pd.read_csv(address)

    # Create a data subfolder if one does not already exist
    import os
    data_directory = './data/'
    if not os.path.exists(data_directory):
        os.makedirs(data_directory)

    # Save data
    data.to_csv(data_directory + 'processed_data.csv', index=False)

data = data.astype(float)
# Drop PassengerId (axis=1 indicates we are removing a column rather than a row)
# We drop passenger ID as it is not original data
data.drop('PassengerId', inplace=True, axis=1)

X = data.drop('Survived', axis=1)  # X = all 'data' except the 'Survived' column
y = data['Survived']               # y = 'Survived' column from 'data'
Let’s create a list containing our feature names, as well as split our data into training, test and validation sets.
feature_names = X.columns.tolist()

X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_validate, y_train, y_validate = train_test_split(
    X_train_val, y_train_val, test_size=0.2, random_state=42
)
print(f"Training Dataset Samples: {len(X_train)}")
print(f"Validation Dataset Samples: {len(X_validate)}")
print(f"Testing Dataset Samples: {len(X_test)}")
Training Dataset Samples: 569
Validation Dataset Samples: 143
Testing Dataset Samples: 179
Let’s fit an initial model to see its performance.
model = XGBClassifier(random_state=42)
model.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
y_pred_val = model.predict(X_validate)
We can also create a confusion matrix.
confusion_matrix_titanic = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(
        y_true=y_validate,
        y_pred=y_pred_val
    ),
    display_labels=["Died", "Survived"]
)
confusion_matrix_titanic.plot()
27.1 Useful Function: Model evaluation
Let’s create a function that can take a model as an input, fit it, and return a single-row dataframe of performance metrics.
We’ve also added a parameter for storing the runtime of each feature selection method; some of these can be quite long, as feature selection can take a while!
def fit_train(X_train, X_validate, y_train, y_validate,
              name,
              feature_selection_runtime="N/A",
              model=XGBClassifier(random_state=42),
              show_confusion_matrix=False
              ):

    model.fit(X_train, y_train)

    y_pred_train = model.predict(X_train)
    y_pred_val = model.predict(X_validate)

    if show_confusion_matrix:
        confusion_matrix_titanic = ConfusionMatrixDisplay(
            confusion_matrix=confusion_matrix(
                y_true=y_validate,
                y_pred=y_pred_val
            ),
            display_labels=["Died", "Survived"]
        )
        confusion_matrix_titanic.plot()

    return pd.DataFrame({
        'Accuracy (training)': np.mean(y_pred_train == y_train).round(4),
        'Accuracy (validation)': np.mean(y_pred_val == y_validate).round(4),
        'Precision (validation)': precision_score(y_validate, y_pred_val, average='macro').round(4),
        'Recall (validation)': recall_score(y_validate, y_pred_val, average='macro').round(4),
        'features': ", ".join(X_train.columns.tolist()),
        'feature_selection_runtime': feature_selection_runtime
    }, index=[name])
Let’s first use this to create a dataframe of results containing just the results from running an XGBoost model with all available features.
experiment_results_df = fit_train(X_train=X_train,
                                  X_validate=X_validate,
                                  y_train=y_train,
                                  y_validate=y_validate,
                                  name=f"XGBoost - all features ({len(X.columns)})")
experiment_results_df
| | Accuracy (training) | Accuracy (validation) | Precision (validation) | Recall (validation) | features | feature_selection_runtime |
|---|---|---|---|---|---|---|
| XGBoost - all features (24) | 0.9789 | 0.7972 | 0.7875 | 0.7856 | Pclass, Age, SibSp, Parch, Fare, AgeImputed, E... | N/A |
27.2 Forward Feature Selection
Forward feature selection starts by finding which single feature produces the best model.
It then iteratively goes through and adds additional features, in each case keeping the feature that adds the most predictive power.
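To make the mechanics concrete, here is a simplified sketch of how one round of forward selection could work, using cross_val_score to score each candidate feature. This is just an illustration with made-up variable names, not sklearn's actual implementation.

from sklearn.model_selection import cross_val_score

# Simplified illustration of forward selection (assumes X_train, y_train and a
# classifier `model` already exist; cross_val_score clones the model each time).
selected, remaining = [], list(X_train.columns)

for _ in range(3):  # stop once 3 features have been chosen
    scores = {}
    for candidate in remaining:
        # Score the already-chosen features plus one candidate feature
        cols = selected + [candidate]
        scores[candidate] = cross_val_score(model, X_train[cols], y_train, cv=5).mean()
    # Keep whichever candidate gave the best cross-validated score
    best = max(scores, key=scores.get)
    selected.append(best)
    remaining.remove(best)

print(selected)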
We specify the model first, then record the time feature selection began.
We then create a SequentialFeatureSelector, passing in the model, how many features we want to end up with, and whether to do ‘forward’ or ‘backward’ selection.
We then call fit on this and, once it has completed, record the duration.
model = XGBClassifier(random_state=42)

start_time = time()

sfs_forward = SequentialFeatureSelector(
    model, n_features_to_select=3, direction="forward"
)
sfs_forward.fit(X_train, y_train)

duration = time() - start_time
sfs_forward
SequentialFeatureSelector(estimator=XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, device=None, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, feature_types=None, gamma=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=None, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, multi_strategy=None, n_estimators=None, n_jobs=None, num_parallel_tree=None, random_state=42, ...), n_features_to_select=3)
Let’s explore what the output of this is.
sfs_forward.get_support()
array([False, False, False, False, True, False, False, False, True,
False, True, False, False, False, False, False, False, False,
False, False, False, False, False, False])
It looks like it’s a mask - a list of True and False values that can be applied to an array so that we only keep the values that line up with a True in the mask.
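For example, applying a small boolean mask to a NumPy array keeps only the elements in the positions where the mask is True (the values here are just made up for illustration):

example_values = np.array(["a", "b", "c", "d"])
example_mask = np.array([True, False, False, True])
example_values[example_mask]  # array(['a', 'd'], dtype='<U1')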
Let’s use the mask to get the actual feature names out instead and print these.
feature_names_selected_ff = np.array(feature_names)[sfs_forward.get_support()]
print(
"Features selected by forward sequential selection: "
f"{feature_names_selected_ff}"
)
Features selected by forward sequential selection: ['Fare' 'CabinNumber' 'male']
Let’s now assess the performance of a model trained using just these features, and also pass in the duration of the feature selection step (calculated two cells previously).
We use the feature names we have obtained to filter the training and validation datasets down to just those columns.
experiment_results_df = pd.concat([experiment_results_df,
                                   fit_train(X_train=X_train[feature_names_selected_ff],
                                             X_validate=X_validate[feature_names_selected_ff],
                                             y_train=y_train,
                                             y_validate=y_validate,
                                             feature_selection_runtime=f"{duration:.3f}s",
                                             name="Forward Feature Selection - 3")]
                                  )
experiment_results_df
| | Accuracy (training) | Accuracy (validation) | Precision (validation) | Recall (validation) | features | feature_selection_runtime |
|---|---|---|---|---|---|---|
| XGBoost - all features (24) | 0.9789 | 0.7972 | 0.7875 | 0.7856 | Pclass, Age, SibSp, Parch, Fare, AgeImputed, E... | N/A |
| Forward Feature Selection - 3 | 0.9244 | 0.8112 | 0.8052 | 0.7939 | Fare, CabinNumber, male | 203.124s |
27.2.0.1 Repeat with 5 features
Let’s do it again, seeing how well 5 features perform and how long the selection takes.
model = XGBClassifier(random_state=42)

start_time = time()

sfs_forward_5 = SequentialFeatureSelector(
    model, n_features_to_select=5, direction="forward"
)
sfs_forward_5.fit(X_train, y_train)

duration = time() - start_time

feature_names_selected_ff_5 = np.array(feature_names)[sfs_forward_5.get_support()]
print(
"Features selected by forward sequential selection: "
f"{feature_names_selected_ff}"
)
Features selected by forward sequential selection: ['Fare' 'CabinNumber' 'male']
experiment_results_df = pd.concat([experiment_results_df,
                                   fit_train(X_train=X_train[feature_names_selected_ff_5],
                                             X_validate=X_validate[feature_names_selected_ff_5],
                                             y_train=y_train,
                                             y_validate=y_validate,
                                             feature_selection_runtime=f"{duration:.3f}s",
                                             name="Forward Feature Selection - 5")]
                                  )
experiment_results_df
| | Accuracy (training) | Accuracy (validation) | Precision (validation) | Recall (validation) | features | feature_selection_runtime |
|---|---|---|---|---|---|---|
| XGBoost - all features (24) | 0.9789 | 0.7972 | 0.7875 | 0.7856 | Pclass, Age, SibSp, Parch, Fare, AgeImputed, E... | N/A |
| Forward Feature Selection - 3 | 0.9244 | 0.8112 | 0.8052 | 0.7939 | Fare, CabinNumber, male | 203.124s |
| Forward Feature Selection - 5 | 0.9262 | 0.8042 | 0.7967 | 0.7882 | Fare, CabinLetterImputed, CabinNumber, male, C... | 106.305s |
27.3 Backward Feature Selection
The code is almost identical when we want to perform backward selection - we just use direction="backward" instead.
model = XGBClassifier(random_state=42)

start_time = time()

sfs_backward_3 = SequentialFeatureSelector(
    model, n_features_to_select=3, direction="backward"
)
sfs_backward_3.fit(X_train, y_train)

duration = time() - start_time

feature_names_selected_bf_3 = np.array(feature_names)[sfs_backward_3.get_support()]
print(
"Features selected by backward sequential selection: "
f"{feature_names_selected_bf_3}"
)
Features selected by backward sequential selection: ['Fare' 'CabinNumber' 'male']
experiment_results_df = pd.concat([experiment_results_df,
                                   fit_train(X_train=X_train[feature_names_selected_bf_3],
                                             X_validate=X_validate[feature_names_selected_bf_3],
                                             y_train=y_train,
                                             y_validate=y_validate,
                                             feature_selection_runtime=f"{duration:.3f}s",
                                             name="Backward Feature Selection - 3")]
                                  )
experiment_results_df
| | Accuracy (training) | Accuracy (validation) | Precision (validation) | Recall (validation) | features | feature_selection_runtime |
|---|---|---|---|---|---|---|
| XGBoost - all features (24) | 0.9789 | 0.7972 | 0.7875 | 0.7856 | Pclass, Age, SibSp, Parch, Fare, AgeImputed, E... | N/A |
| Forward Feature Selection - 3 | 0.9244 | 0.8112 | 0.8052 | 0.7939 | Fare, CabinNumber, male | 203.124s |
| Forward Feature Selection - 5 | 0.9262 | 0.8042 | 0.7967 | 0.7882 | Fare, CabinLetterImputed, CabinNumber, male, C... | 106.305s |
| Backward Feature Selection - 3 | 0.9244 | 0.8112 | 0.8052 | 0.7939 | Fare, CabinNumber, male | 268.981s |
Let’s repeat with 5 and assess the performance.
model = XGBClassifier(random_state=42)

start_time = time()

sfs_backward_5 = SequentialFeatureSelector(
    model, n_features_to_select=5, direction="backward"
)
sfs_backward_5.fit(X_train, y_train)

duration = time() - start_time

feature_names_selected_bf_5 = np.array(feature_names)[sfs_backward_5.get_support()]
print(
"Features selected by backward sequential selection: "
f"{feature_names_selected_bf_5}"
)
Features selected by backward sequential selection: ['Pclass' 'Fare' 'CabinNumber' 'male' 'CabinLetter_E']
experiment_results_df = pd.concat([experiment_results_df,
                                   fit_train(X_train=X_train[feature_names_selected_bf_5],
                                             X_validate=X_validate[feature_names_selected_bf_5],
                                             y_train=y_train,
                                             y_validate=y_validate,
                                             feature_selection_runtime=f"{duration:.3f}s",
                                             name="Backward Feature Selection - 5")]
                                  )
experiment_results_df
| | Accuracy (training) | Accuracy (validation) | Precision (validation) | Recall (validation) | features | feature_selection_runtime |
|---|---|---|---|---|---|---|
| XGBoost - all features (24) | 0.9789 | 0.7972 | 0.7875 | 0.7856 | Pclass, Age, SibSp, Parch, Fare, AgeImputed, E... | N/A |
| Forward Feature Selection - 3 | 0.9244 | 0.8112 | 0.8052 | 0.7939 | Fare, CabinNumber, male | 203.124s |
| Forward Feature Selection - 5 | 0.9262 | 0.8042 | 0.7967 | 0.7882 | Fare, CabinLetterImputed, CabinNumber, male, C... | 106.305s |
| Backward Feature Selection - 3 | 0.9244 | 0.8112 | 0.8052 | 0.7939 | Fare, CabinNumber, male | 268.981s |
| Backward Feature Selection - 5 | 0.9262 | 0.8042 | 0.7945 | 0.7945 | Pclass, Fare, CabinNumber, male, CabinLetter_E | 179.093s |
27.3.1 The ‘auto’ parameter
We can also let SequentialFeatureSelector decide how many features to use by setting n_features_to_select="auto". When the tol parameter is left unset, this defaults to selecting half of the available features.
model = XGBClassifier(random_state=42)

start_time = time()

sfs_backward_auto = SequentialFeatureSelector(
    model,
    n_features_to_select="auto",
    direction="backward"
)
sfs_backward_auto.fit(X_train, y_train)

duration = time() - start_time

feature_names_selected_bf_auto = np.array(feature_names)[sfs_backward_auto.get_support()]
print(
"Features selected by backward sequential selection: "
f"{feature_names_selected_bf_auto}"
)
Features selected by backward sequential selection: ['Pclass' 'SibSp' 'Parch' 'Fare' 'CabinNumber' 'male' 'Embarked_C'
'Embarked_Q' 'Embarked_S' 'CabinLetter_D' 'CabinLetter_E'
'CabinLetter_missing']
experiment_results_df = pd.concat([experiment_results_df,
                                   fit_train(X_train=X_train[feature_names_selected_bf_auto],
                                             X_validate=X_validate[feature_names_selected_bf_auto],
                                             y_train=y_train,
                                             y_validate=y_validate,
                                             feature_selection_runtime=f"{duration:.3f}s",
                                             name="Backward Feature Selection - auto")]
                                  )
experiment_results_df
| | Accuracy (training) | Accuracy (validation) | Precision (validation) | Recall (validation) | features | feature_selection_runtime |
|---|---|---|---|---|---|---|
| XGBoost - all features (24) | 0.9789 | 0.7972 | 0.7875 | 0.7856 | Pclass, Age, SibSp, Parch, Fare, AgeImputed, E... | N/A |
| Forward Feature Selection - 3 | 0.9244 | 0.8112 | 0.8052 | 0.7939 | Fare, CabinNumber, male | 203.124s |
| Forward Feature Selection - 5 | 0.9262 | 0.8042 | 0.7967 | 0.7882 | Fare, CabinLetterImputed, CabinNumber, male, C... | 106.305s |
| Backward Feature Selection - 3 | 0.9244 | 0.8112 | 0.8052 | 0.7939 | Fare, CabinNumber, male | 268.981s |
| Backward Feature Selection - 5 | 0.9262 | 0.8042 | 0.7945 | 0.7945 | Pclass, Fare, CabinNumber, male, CabinLetter_E | 179.093s |
| Backward Feature Selection - auto | 0.9385 | 0.8042 | 0.7942 | 0.7977 | Pclass, SibSp, Parch, Fare, CabinNumber, male,... | 74.881s |
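Because we left the tol parameter unset, ‘auto’ simply kept half of the 24 features. If we would rather let performance decide, we can pass tol as well - selection then stops once adding or removing another feature changes the cross-validated score by less than tol. Below is a rough sketch; the -0.01 is just an example value, and for backward selection a small negative tol allows removals that cost a little accuracy.

sfs_backward_tol = SequentialFeatureSelector(
    XGBClassifier(random_state=42),
    n_features_to_select="auto",
    tol=-0.01,  # allow each removal to cost up to 0.01 of cross-validated score
    direction="backward"
)
sfs_backward_tol.fit(X_train, y_train)
print(np.array(feature_names)[sfs_backward_tol.get_support()])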
27.4 Using Feature Importance for Feature Selection
We could also use some of the feature importance metrics we’ve explored in previous sessions to make a judgment of which features to keep in our model.
The SelectFromModel class works with any model that has a feature_importances_ or coef_ attribute after fitting.
For example:
For logistic regression, this will be the model coefficients.
For decision trees, this will be the mean decrease in impurity.
We can specify the threshold we are going to use for keeping features in the model.
start_time = time()

model = XGBClassifier(random_state=42)

selector = SelectFromModel(
    estimator=model,
    threshold=0.03
)
selector.fit(X_train, y_train)

duration = time() - start_time
selector.estimator_.feature_importances_
array([0.15078323, 0.03309093, 0.04509973, 0.03712323, 0.03330522,
0.01645992, 0. , 0.02637349, 0.0527233 , 0. ,
0.38024062, 0.03597496, 0.02359743, 0.03333741, 0. ,
0. , 0. , 0.02592567, 0.04060741, 0.06535744,
0. , 0. , 0. , 0. ], dtype=float32)
selector.threshold_
0.03
Similarly to before, we can use ‘get_support’.
selector.get_support()
array([ True, True, True, True, True, False, False, False, True,
False, True, True, False, True, False, False, False, False,
True, True, False, False, False, False])
feature_names_selected_fi_03 = np.array(feature_names)[selector.get_support()]

feature_names_selected_fi_03
array(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'CabinNumber', 'male',
'Embarked_C', 'Embarked_S', 'CabinLetter_D', 'CabinLetter_E'],
dtype='<U19')
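As an alternative to indexing the dataframes by these column names, we could also call the selector’s transform method, which returns just the selected columns (as a NumPy array, so the column names are dropped):

# Filter the training data via the selector itself; this returns a NumPy array
X_train_selected = selector.transform(X_train)
X_train_selected.shape  # (number of rows, number of selected features)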
experiment_results_df = pd.concat([experiment_results_df,
                                   fit_train(X_train=X_train[feature_names_selected_fi_03],
                                             X_validate=X_validate[feature_names_selected_fi_03],
                                             y_train=y_train,
                                             y_validate=y_validate,
                                             feature_selection_runtime=f"{duration:.3f}s",
                                             name="Feature Importance Selection - Threshold 0.03")]
                                  )
experiment_results_df
| | Accuracy (training) | Accuracy (validation) | Precision (validation) | Recall (validation) | features | feature_selection_runtime |
|---|---|---|---|---|---|---|
| XGBoost - all features (24) | 0.9789 | 0.7972 | 0.7875 | 0.7856 | Pclass, Age, SibSp, Parch, Fare, AgeImputed, E... | N/A |
| Forward Feature Selection - 3 | 0.9244 | 0.8112 | 0.8052 | 0.7939 | Fare, CabinNumber, male | 203.124s |
| Forward Feature Selection - 5 | 0.9262 | 0.8042 | 0.7967 | 0.7882 | Fare, CabinLetterImputed, CabinNumber, male, C... | 106.305s |
| Backward Feature Selection - 3 | 0.9244 | 0.8112 | 0.8052 | 0.7939 | Fare, CabinNumber, male | 268.981s |
| Backward Feature Selection - 5 | 0.9262 | 0.8042 | 0.7945 | 0.7945 | Pclass, Fare, CabinNumber, male, CabinLetter_E | 179.093s |
| Backward Feature Selection - auto | 0.9385 | 0.8042 | 0.7942 | 0.7977 | Pclass, SibSp, Parch, Fare, CabinNumber, male,... | 74.881s |
| Feature Importance Selection - Threshold 0.03 | 0.9807 | 0.7972 | 0.7875 | 0.7856 | Pclass, Age, SibSp, Parch, Fare, CabinNumber, ... | 0.193s |
Let’s try this with a different model - this time a random forest.
experiment_results_df = pd.concat([experiment_results_df,
                                   fit_train(X_train=X_train,
                                             X_validate=X_validate,
                                             y_train=y_train,
                                             y_validate=y_validate,
                                             model=RandomForestClassifier(random_state=42, max_depth=6),
                                             name=f"Random Forest - all features ({len(X_train.columns)})")]
                                  )
experiment_results_df
| | Accuracy (training) | Accuracy (validation) | Precision (validation) | Recall (validation) | features | feature_selection_runtime |
|---|---|---|---|---|---|---|
| XGBoost - all features (24) | 0.9789 | 0.7972 | 0.7875 | 0.7856 | Pclass, Age, SibSp, Parch, Fare, AgeImputed, E... | N/A |
| Forward Feature Selection - 3 | 0.9244 | 0.8112 | 0.8052 | 0.7939 | Fare, CabinNumber, male | 203.124s |
| Forward Feature Selection - 5 | 0.9262 | 0.8042 | 0.7967 | 0.7882 | Fare, CabinLetterImputed, CabinNumber, male, C... | 106.305s |
| Backward Feature Selection - 3 | 0.9244 | 0.8112 | 0.8052 | 0.7939 | Fare, CabinNumber, male | 268.981s |
| Backward Feature Selection - 5 | 0.9262 | 0.8042 | 0.7945 | 0.7945 | Pclass, Fare, CabinNumber, male, CabinLetter_E | 179.093s |
| Backward Feature Selection - auto | 0.9385 | 0.8042 | 0.7942 | 0.7977 | Pclass, SibSp, Parch, Fare, CabinNumber, male,... | 74.881s |
| Feature Importance Selection - Threshold 0.03 | 0.9807 | 0.7972 | 0.7875 | 0.7856 | Pclass, Age, SibSp, Parch, Fare, CabinNumber, ... | 0.193s |
| Random Forest - all features (24) | 0.8822 | 0.7972 | 0.7870 | 0.7888 | Pclass, Age, SibSp, Parch, Fare, AgeImputed, E... | N/A |
# threshold="mean" keeps only the features whose importance is above the mean importance
selector = SelectFromModel(
    estimator=RandomForestClassifier(random_state=42, max_depth=6),
    threshold="mean"
).fit(X_train, y_train)

selector.estimator_.feature_importances_
array([6.66890927e-02, 1.03772923e-01, 4.78576994e-02, 3.96696333e-02,
1.29373457e-01, 1.25008170e-02, 1.55485193e-04, 3.27266742e-02,
8.05041446e-02, 4.58115404e-02, 3.09684642e-01, 1.31952009e-02,
7.36228867e-03, 2.11471973e-02, 9.83537258e-06, 2.28944280e-03,
1.19636370e-02, 1.08809761e-02, 5.33431862e-03, 1.15495213e-02,
2.40979415e-03, 8.87648714e-04, 0.00000000e+00, 4.42240310e-02])
feature_names_selected_fi_rf_mean = np.array(feature_names)[selector.get_support()]

feature_names_selected_fi_rf_mean
experiment_results_df = pd.concat([experiment_results_df,
                                   fit_train(X_train=X_train[feature_names_selected_fi_rf_mean],
                                             X_validate=X_validate[feature_names_selected_fi_rf_mean],
                                             y_train=y_train,
                                             y_validate=y_validate,
                                             model=RandomForestClassifier(random_state=42, max_depth=6),
                                             name=f"Random Forest - Mean Feature Importance Threshold ({len(feature_names_selected_fi_rf_mean)})")]
                                  )
experiment_results_df
| | Accuracy (training) | Accuracy (validation) | Precision (validation) | Recall (validation) | features | feature_selection_runtime |
|---|---|---|---|---|---|---|
| XGBoost - all features (24) | 0.9789 | 0.7972 | 0.7875 | 0.7856 | Pclass, Age, SibSp, Parch, Fare, AgeImputed, E... | N/A |
| Forward Feature Selection - 3 | 0.9244 | 0.8112 | 0.8052 | 0.7939 | Fare, CabinNumber, male | 203.124s |
| Forward Feature Selection - 5 | 0.9262 | 0.8042 | 0.7967 | 0.7882 | Fare, CabinLetterImputed, CabinNumber, male, C... | 106.305s |
| Backward Feature Selection - 3 | 0.9244 | 0.8112 | 0.8052 | 0.7939 | Fare, CabinNumber, male | 268.981s |
| Backward Feature Selection - 5 | 0.9262 | 0.8042 | 0.7945 | 0.7945 | Pclass, Fare, CabinNumber, male, CabinLetter_E | 179.093s |
| Backward Feature Selection - auto | 0.9385 | 0.8042 | 0.7942 | 0.7977 | Pclass, SibSp, Parch, Fare, CabinNumber, male,... | 74.881s |
| Feature Importance Selection - Threshold 0.03 | 0.9807 | 0.7972 | 0.7875 | 0.7856 | Pclass, Age, SibSp, Parch, Fare, CabinNumber, ... | 0.193s |
| Random Forest - all features (24) | 0.8822 | 0.7972 | 0.7870 | 0.7888 | Pclass, Age, SibSp, Parch, Fare, AgeImputed, E... | N/A |
| Random Forest - Mean Feature Importance Threshold (8) | 0.8946 | 0.8322 | 0.8239 | 0.8239 | Pclass, Age, SibSp, Fare, CabinNumber, CabinNu... | N/A |