33  Pipelines (Titanic Dataset)

To make our scikit-learn code neater and easier to move into production, we can use the library's built-in pipeline functionality.

from sklearn.pipeline import Pipeline # For setting up pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import VotingClassifier
import pandas as pd

First, we will still import our data and set up the train-test split manually.

try:
    data = pd.read_csv("data/processed_data.csv")

except FileNotFoundError:
    # Download processed data:
    address = 'https://raw.githubusercontent.com/MichaelAllen1966/' + \
                '1804_python_healthcare/master/titanic/data/processed_data.csv'

    data = pd.read_csv(address)

    # Create a data subfolder if one does not already exist
    import os
    data_directory = './data/'
    if not os.path.exists(data_directory):
        os.makedirs(data_directory)

    # Save data
    data.to_csv(data_directory + 'processed_data.csv', index=False)

data = data.astype(float)

# Drop PassengerId (axis=1 indicates we are removing a column rather than a row)
# We drop PassengerId as it is not original data

data.drop('PassengerId', inplace=True, axis=1)

X = data.drop('Survived', axis=1) # X = all 'data' except the 'Survived' column
y = data['Survived'] # y = the 'Survived' column from 'data'

feature_names = X.columns.tolist()

X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_validate, y_train, y_validate = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

print(f"Training Dataset Samples: {len(X_train)}")
print(f"Validation Dataset Samples: {len(X_validate)}")
print(f"Testing Dataset Samples: {len(X_test)}")
Training Dataset Samples: 569
Validation Dataset Samples: 143
Testing Dataset Samples: 179
data.head()
Survived Pclass Age SibSp Parch Fare AgeImputed EmbarkedImputed CabinLetterImputed CabinNumber ... Embarked_missing CabinLetter_A CabinLetter_B CabinLetter_C CabinLetter_D CabinLetter_E CabinLetter_F CabinLetter_G CabinLetter_T CabinLetter_missing
0 0.0 3.0 22.0 1.0 0.0 7.2500 0.0 0.0 1.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
1 1.0 1.0 38.0 1.0 0.0 71.2833 0.0 0.0 0.0 85.0 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0
2 1.0 3.0 26.0 0.0 0.0 7.9250 0.0 0.0 1.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
3 1.0 1.0 35.0 1.0 0.0 53.1000 0.0 0.0 0.0 123.0 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 3.0 35.0 0.0 0.0 8.0500 0.0 0.0 1.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0

5 rows × 25 columns

Now let’s build a simple pipeline!

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', KNeighborsClassifier())
])
pipe.fit(X_train, y_train)
print(f'Training set score: {pipe.score(X_train,y_train):.3f}')
print(f'Test set score: {pipe.score(X_test,y_test):.3f}')
Training set score: 0.858
Test set score: 0.754
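
Under the hood, pipe.fit calls fit_transform on each intermediate step and then fit on the final estimator, while pipe.score transforms the data with the already-fitted scaler before handing it to the classifier. As a rough sketch, the manual equivalent of the pipeline above looks like this:

# Roughly equivalent manual version of the pipeline above
scaler = StandardScaler()
knn = KNeighborsClassifier()

# Fit: scale the training data, then fit the classifier on the scaled data
X_train_scaled = scaler.fit_transform(X_train)
knn.fit(X_train_scaled, y_train)

# Score: apply the *already fitted* scaler to the test data, then score
X_test_scaled = scaler.transform(X_test)
print(f'Test set score: {knn.score(X_test_scaled, y_test):.3f}')

The fitted steps also remain accessible on the pipeline itself, e.g. pipe.named_steps['scaler'].mean_ gives the per-feature means learned by the scaler.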

33.1 Increasing the complexity of our pipeline

Let’s add in another step!

knn_classifier = KNeighborsClassifier()
sfs = SequentialFeatureSelector(knn_classifier,
                                n_features_to_select='auto',  # keep adding features...
                                tol=0.01,  # ...until the score improvement falls below this tolerance
                                n_jobs=-1)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('feature_selection', sfs),
    ('classifier', knn_classifier)
])
pipe
Pipeline(steps=[('scaler', StandardScaler()),
                ('feature_selection',
                 SequentialFeatureSelector(estimator=KNeighborsClassifier(),
                                           n_jobs=-1, tol=0.01)),
                ('classifier', KNeighborsClassifier())])
pipe.fit(X_train, y_train)
print(f'Training set score: {pipe.score(X_train,y_train):.3f}')
print(f'Test set score: {pipe.score(X_test,y_test):.3f}')
Training set score: 0.787
Test set score: 0.788
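
It can be useful to check which features the selector actually kept. SequentialFeatureSelector exposes a get_support() method after fitting; a quick sketch using the feature_names list we built earlier:

# Map the selector's boolean mask back onto the original column names
selected_mask = pipe.named_steps['feature_selection'].get_support()
selected_features = [name for name, keep in zip(feature_names, selected_mask) if keep]
print(f"{len(selected_features)} features selected:")
print(selected_features)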

33.2 Ensembles with pipelines

Ensembles can easily be added into the process as well; once set up, an ensemble is effectively just another classifier, like a single model.

voting_classifier_1 = VotingClassifier(
    estimators=[('knn', KNeighborsClassifier()),
                ('logreg', LogisticRegression())],
    voting='soft')  # 'soft' averages the members' predicted probabilities

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', voting_classifier_1)
])

pipe.fit(X_train, y_train)
print(f'Training set score: {pipe.score(X_train,y_train):.3f}')
print(f'Test set score: {pipe.score(X_test,y_test):.3f}')
Training set score: 0.851
Test set score: 0.810
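
After fitting, the individual models inside the ensemble are available through the VotingClassifier's named_estimators_ attribute. Note that within the pipeline they were trained on scaled data, so we scale the test set before scoring each member on its own (a sketch):

# Score each fitted member of the ensemble separately
X_test_scaled = pipe.named_steps['scaler'].transform(X_test)
voting_clf = pipe.named_steps['classifier']
for name, estimator in voting_clf.named_estimators_.items():
    print(f'{name} test set score: {estimator.score(X_test_scaled, y_test):.3f}')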

33.3 Grid search with pipelines

To use grid search with our pipeline, we just prefix each parameter name with the step name we defined, followed by a double underscore.

knn_classifier = KNeighborsClassifier()
sfs = SequentialFeatureSelector(knn_classifier,
                                direction="backward",
                                n_jobs=-1)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('feature_selection', sfs),
    ('classifier', knn_classifier)
])

parameter_grid = {
    "feature_selection__n_features_to_select": [18, 20],
    "classifier__n_neighbors": list(range(1, 10, 2)),  # 1, 3, 5, 7, 9
    "classifier__metric": ["minkowski", "manhattan", "euclidean"]
}
grid_search = GridSearchCV(
    estimator=pipe, # notice that we're passing our pipeline in here
    param_grid=parameter_grid,
    n_jobs=1, # if n_jobs is not 1, you won't see the progress report during the search
    verbose=2, # this controls the level of detail being output
)

grid_search.fit(X_train, y_train)
print("Best parameter combination found:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameter_grid.keys()):
    print(f"{param_name}: {best_parameters[param_name]}")
Best parameter combination found:
classifier__metric: euclidean
classifier__n_neighbors: 3
feature_selection__n_features_to_select: 20
test_accuracy = grid_search.score(X_test, y_test)
print(
    "Accuracy of the best parameters using the inner CV of "
    f"the grid search: {grid_search.best_score_:.3f}"
)
print(f"Accuracy on test set: {test_accuracy:.3f}")
Accuracy of the best parameters using the inner CV of the grid search: 0.805
Accuracy on test set: 0.771
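
We imported RandomizedSearchCV at the top but have not used it yet. It works with pipelines in exactly the same way, except that it samples a fixed number of parameter combinations rather than trying them all, which is useful when the grid gets large. A sketch of swapping it in for the grid search above (n_iter and random_state here are illustrative choices):

# Randomised search: sample 10 parameter combinations instead of the full grid
random_search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=parameter_grid, # lists of values are sampled uniformly
    n_iter=10, # number of parameter combinations to try
    random_state=42,
    n_jobs=1,
    verbose=2,
)
random_search.fit(X_train, y_train)
print(f"Best CV accuracy: {random_search.best_score_:.3f}")
print(f"Accuracy on test set: {random_search.score(X_test, y_test):.3f}")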