from sklearn.pipeline import Pipeline # For setting up pipeline
from sklearn.preprocessing import Normalizer, StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import VotingClassifier
import pandas as pd
33 Pipelines (Titanic Dataset)
To make our sklearn code neater, and potentially easier to move into production, we can use sklearn's built-in pipelines functionality.
First, we will still import our data and set up the train-test split manually.
try:
    data = pd.read_csv("data/processed_data.csv")
except FileNotFoundError:
    # Download processed data:
    address = 'https://raw.githubusercontent.com/MichaelAllen1966/' + \
        '1804_python_healthcare/master/titanic/data/processed_data.csv'
    data = pd.read_csv(address)

    # Create a data subfolder if one does not already exist
    import os
    data_directory = './data/'
    if not os.path.exists(data_directory):
        os.makedirs(data_directory)

    # Save data
    data.to_csv(data_directory + 'processed_data.csv', index=False)
data = data.astype(float)
# Drop PassengerId (axis=1 indicates we are removing a column rather than a row)
# We drop passenger ID as it is not original data
data.drop('PassengerId', inplace=True, axis=1)
X = data.drop('Survived',axis=1) # X = all 'data' except the 'survived' column
y = data['Survived'] # y = 'survived' column from 'data'
feature_names = X.columns.tolist()
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_validate, y_train, y_validate = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)
print(f"Training Dataset Samples: {len(X_train)}")
print(f"Validation Dataset Samples: {len(X_validate)}")
print(f"Testing Dataset Samples: {len(X_test)}")
Training Dataset Samples: 569
Validation Dataset Samples: 143
Testing Dataset Samples: 179
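As a quick, purely illustrative sanity check, the three splits together should account for every row of the original dataset (569 + 143 + 179 = 891):

# The train, validation and test splits together cover the whole dataset
print(len(X_train) + len(X_validate) + len(X_test) == len(X))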
data.head()
 | Survived | Pclass | Age | SibSp | Parch | Fare | AgeImputed | EmbarkedImputed | CabinLetterImputed | CabinNumber | ... | Embarked_missing | CabinLetter_A | CabinLetter_B | CabinLetter_C | CabinLetter_D | CabinLetter_E | CabinLetter_F | CabinLetter_G | CabinLetter_T | CabinLetter_missing
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 0.0 | 3.0 | 22.0 | 1.0 | 0.0 | 7.2500 | 0.0 | 0.0 | 1.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
1 | 1.0 | 1.0 | 38.0 | 1.0 | 0.0 | 71.2833 | 0.0 | 0.0 | 0.0 | 85.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 1.0 | 3.0 | 26.0 | 0.0 | 0.0 | 7.9250 | 0.0 | 0.0 | 1.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
3 | 1.0 | 1.0 | 35.0 | 1.0 | 0.0 | 53.1000 | 0.0 | 0.0 | 0.0 | 123.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | 0.0 | 3.0 | 35.0 | 0.0 | 0.0 | 8.0500 | 0.0 | 0.0 | 1.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
5 rows × 25 columns
Now let’s build a simple pipeline!
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', KNeighborsClassifier())
])
pipe.fit(X_train, y_train)

print(f'Training set score: {pipe.score(X_train,y_train):.3f}')
print(f'Test set score: {pipe.score(X_test,y_test):.3f}')
Training set score: 0.858
Test set score: 0.754
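For context, the pipeline above is doing the same work as scaling and fitting by hand; a minimal sketch of the manual equivalent (the intermediate variable names here are just for illustration):

# Roughly what the pipeline does internally
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train) # scaler is fitted on training data only
X_test_scaled = scaler.transform(X_test) # training means/stds are reused on the test data

knn = KNeighborsClassifier()
knn.fit(X_train_scaled, y_train)
print(f'Test set score: {knn.score(X_test_scaled, y_test):.3f}')

The advantage of the pipeline is that this fit/transform discipline is handled for us, which avoids accidentally leaking test data into the scaler.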
33.1 Increasing the complexity of our pipe
Let’s add in another step!
knn_classifier = KNeighborsClassifier()

sfs = SequentialFeatureSelector(knn_classifier,
                                n_features_to_select='auto',
                                tol=0.01,
                                n_jobs=-1)

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('feature_selection', sfs),
    ('classifier', knn_classifier)
])
pipe
Pipeline(steps=[('scaler', StandardScaler()),
                ('feature_selection',
                 SequentialFeatureSelector(estimator=KNeighborsClassifier(),
                                           n_jobs=-1, tol=0.01)),
                ('classifier', KNeighborsClassifier())])
pipe.fit(X_train, y_train)

print(f'Training set score: {pipe.score(X_train,y_train):.3f}')
print(f'Test set score: {pipe.score(X_test,y_test):.3f}')
Training set score: 0.787
Test set score: 0.788
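If you want to see which columns the feature selection step kept, the fitted selector can be pulled back out of the pipeline; a small sketch, assuming the 'feature_selection' step name we used above:

# Boolean mask of the columns the fitted selector kept (illustrative)
mask = pipe.named_steps['feature_selection'].get_support()
selected_features = [name for name, keep in zip(feature_names, mask) if keep]
print(selected_features)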
33.2 Ensembles with pipelines
Ensembles can easily be added into the process as well; once set up, an ensemble is effectively just another classifier, like a single model.
voting_classifier_1 = VotingClassifier(
    estimators=[('knn', KNeighborsClassifier()),
                ('logreg', LogisticRegression())],
    voting='soft')

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', voting_classifier_1)
])
pipe.fit(X_train, y_train)

print(f'Training set score: {pipe.score(X_train,y_train):.3f}')
print(f'Test set score: {pipe.score(X_test,y_test):.3f}')
Training set score: 0.851
Test set score: 0.810
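Because we chose voting='soft', the ensemble averages the class probabilities of the KNN and logistic regression models rather than taking a simple majority vote; a quick illustrative look at those averaged probabilities for a few validation rows:

# Averaged class probabilities from the soft-voting ensemble (illustrative)
print(pipe.predict_proba(X_validate[:5]))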
33.3 Grid search with pipelines
To use grid search with our pipeline, we just need to prefix each parameter with the step name we defined, followed by a double underscore.
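If you're ever unsure what a parameter is called, a pipeline can list all of its parameter names via get_params(); a quick sketch, to be run once the pipeline below has been built:

# Print every parameter name the pipeline exposes -- this includes the
# step objects themselves as well as tunable entries such as
# 'classifier__n_neighbors' and 'feature_selection__n_features_to_select'
for name in pipe.get_params().keys():
    print(name)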
knn_classifier = KNeighborsClassifier()

sfs = SequentialFeatureSelector(knn_classifier,
                                direction="backward",
                                n_jobs=-1)

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('feature_selection', sfs),
    ('classifier', knn_classifier)
])

parameter_grid = {
    "feature_selection__n_features_to_select": [18, 20],
    "classifier__n_neighbors": [i for i in range(1, 10, 2)],
    "classifier__metric": ["minkowski", "manhattan", "euclidean"]
}
grid_search = GridSearchCV(
    estimator=pipe, # notice that we're passing our pipeline in here
    param_grid=parameter_grid,
    n_jobs=1, # if n_jobs is not 1, you won't get the progress report during the process
    verbose=2 # this controls the level of detail being output
)

grid_search.fit(X_train, y_train)

print("Best parameters combination found:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameter_grid.keys()):
    print(f"{param_name}: {best_parameters[param_name]}")
Best parameters combination found:
classifier__metric: euclidean
classifier__n_neighbors: 3
feature_selection__n_features_to_select: 20
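If you want more than just the winning combination, every combination's cross-validation results are stored in cv_results_; a quick illustrative view as a dataframe:

# Per-combination cross-validation results, best first (illustrative)
results = pd.DataFrame(grid_search.cv_results_)
print(results[['params', 'mean_test_score']]
      .sort_values('mean_test_score', ascending=False)
      .head())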
test_accuracy = grid_search.score(X_test, y_test)

print(
    "Accuracy of the best parameters using the inner CV of "
    f"the grid search: {grid_search.best_score_:.3f}"
)
print(f"Accuracy on test set: {test_accuracy:.3f}")
Accuracy of the best parameters using the inner CV of the grid search: 0.805
Accuracy on test set: 0.771
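We imported RandomizedSearchCV at the top but haven't used it here; as a hedged sketch, it can be swapped in for GridSearchCV almost directly, sampling a fixed number of parameter combinations (the n_iter value below is an illustrative choice) instead of trying them all:

# Illustrative alternative: randomly sample 5 of the combinations in the grid
random_search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=parameter_grid, # note the different argument name
    n_iter=5, # number of parameter combinations to sample
    random_state=42,
    n_jobs=1,
    verbose=2
)

random_search.fit(X_train, y_train)
print(f"Best CV score: {random_search.best_score_:.3f}")

This is most useful when the grid is too large to search exhaustively.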