Scikit-Learn Interview Challenges
10 practical Scikit-Learn challenges that test your ability to build production-quality ML pipelines, write custom transformers, and use the sklearn API correctly — the skills that distinguish senior ML engineers in interviews.
Challenge 1: Build a Complete Pipeline
# Challenge 1: build a complete preprocessing + model pipeline and
# evaluate it with cross_val_score.
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Sample data with mixed types and missing values
X = pd.DataFrame({
    'age': [25, 30, np.nan, 45, 50, 35, np.nan, 28],
    'salary': [50000, np.nan, 70000, 80000, 90000, 60000, 75000, np.nan],
    'department': ['Eng', 'Sales', 'Eng', None, 'Sales', 'Eng', 'PM', 'Sales'],
    'education': ['BS', 'MS', 'PhD', 'BS', None, 'MS', 'BS', 'MS']
})
y = np.array([0, 1, 0, 1, 1, 0, 1, 0])

numeric_features = ['age', 'salary']
categorical_features = ['department', 'education']

# Numeric pipeline: impute with median, then scale
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical pipeline: impute with most frequent, then one-hot encode.
# handle_unknown='ignore' keeps predict from failing on unseen categories.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Route each column group through its own sub-pipeline
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# Full pipeline: preprocessing + model
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Cross-validate (the pipeline refits preprocessing inside each fold,
# preventing data leakage)
scores = cross_val_score(pipeline, X, y, cv=3, scoring='accuracy')
print(f"CV Accuracy: {scores.mean():.3f} +/- {scores.std():.3f}")

# Fit and predict
pipeline.fit(X, y)
print(f"Feature names: {pipeline[:-1].get_feature_names_out()}")
Interviewer focus: Using ColumnTransformer for mixed types, handle_unknown='ignore' for unseen categories at test time, and most critically — putting everything in a Pipeline so that cross_val_score applies preprocessing within each fold (preventing data leakage).
Challenge 2: Custom Transformer
# Challenge 2: write a custom transformer that follows the sklearn API,
# including get_feature_names_out.
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from itertools import combinations
class InteractionFeatures(BaseEstimator, TransformerMixin):
    """Append pairwise interaction (product) features to the input.

    Parameters
    ----------
    interaction_only : bool, default=True
        If True, only cross-terms ``x_i * x_j`` (i < j) are produced.
        If False, squared terms ``x_i * x_i`` are appended as well.
    """

    def __init__(self, interaction_only=True):
        self.interaction_only = interaction_only

    def fit(self, X, y=None):
        """Record the input width and the index pairs to multiply."""
        n_features = X.shape[1]
        self.n_features_in_ = n_features
        # Cross-terms are always produced; squares only when requested
        # (the original duplicated the combinations() call in both branches).
        self.pairs_ = list(combinations(range(n_features), 2))
        if not self.interaction_only:
            self.pairs_ += [(i, i) for i in range(n_features)]
        return self

    def transform(self, X):
        """Return X with one extra column per recorded index pair."""
        X = np.asarray(X)
        if not self.pairs_:
            # Fewer than two columns with interaction_only=True yields no
            # pairs; np.column_stack([]) would raise, so pass X through.
            return X
        interactions = np.column_stack([
            X[:, i] * X[:, j] for i, j in self.pairs_
        ])
        return np.hstack([X, interactions])

    def get_feature_names_out(self, input_features=None):
        """Names of the passthrough columns followed by the product terms."""
        if input_features is None:
            input_features = [f"x{i}" for i in range(self.n_features_in_)]
        interaction_names = [
            f"{input_features[i]}*{input_features[j]}"
            for i, j in self.pairs_
        ]
        return list(input_features) + interaction_names
# Smoke test: 3 input columns -> 3 pairwise cross-terms appended.
data = np.arange(1, 10).reshape(3, 3)
steps = [
    ('interactions', InteractionFeatures(interaction_only=True)),
    ('scaler', StandardScaler()),
]
demo_pipe = Pipeline(steps)
expanded = demo_pipe.fit_transform(data)
print(f"Input shape: {data.shape}")  # (3, 3)
print(f"Output shape: {expanded.shape}")  # (3, 6)
print(f"Features: {demo_pipe[0].get_feature_names_out()}")
# ['x0', 'x1', 'x2', 'x0*x1', 'x0*x2', 'x1*x2']
Interviewer focus: Inheriting from BaseEstimator and TransformerMixin, implementing fit/transform, setting n_features_in_ in fit, and implementing get_feature_names_out. These are the sklearn API contracts that production code must follow.
Challenge 3: Proper Cross-Validation with Data Leakage Prevention
# Challenge 3: demonstrate why preprocessing must live inside the CV loop.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

X, y = make_classification(n_samples=200, n_features=50,
                           n_informative=5, random_state=42)

# WRONG: scaling the full matrix first lets every held-out fold's
# statistics leak into training.
X_scaled_leaked = StandardScaler().fit_transform(X)
scores_leaked = cross_val_score(
    LogisticRegression(max_iter=1000), X_scaled_leaked, y, cv=5
)
print(f"WRONG (leaked): {scores_leaked.mean():.4f} +/- {scores_leaked.std():.4f}")

# CORRECT: a Pipeline refits the scaler on each training fold only.
leak_free = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(max_iter=1000))
])
scores_correct = cross_val_score(leak_free, X, y, cv=5)
print(f"CORRECT (pipeline): {scores_correct.mean():.4f} +/- {scores_correct.std():.4f}")

# The leaked estimate is optimistically biased: the scaler saw the
# test folds' statistics during training.
Challenge 4: GridSearchCV with Pipeline
# Challenge 4: hyperparameter search over a whole pipeline at once.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

X, y = make_classification(n_samples=300, n_features=20,
                           n_informative=10, random_state=42)

# Named steps let the grid address hyperparameters as step__param.
search_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('model', LogisticRegression(max_iter=1000))
])

# The double-underscore convention reaches into each named step.
param_grid = {
    'pca__n_components': [5, 10, 15, None],  # None keeps all components
    'model__C': [0.01, 0.1, 1, 10],          # inverse regularization strength
    'model__penalty': ['l1', 'l2'],
    'model__solver': ['saga'],               # the solver that supports l1
}

search = GridSearchCV(
    search_pipe, param_grid,
    cv=5,
    scoring='f1',
    n_jobs=-1,                # parallelize across all CPU cores
    verbose=0,
    return_train_score=True,  # needed for the overfitting check below
)
search.fit(X, y)

print(f"Best score: {search.best_score_:.4f}")
print(f"Best params: {search.best_params_}")

# A wide train/test gap at the winning candidate signals overfitting.
winner = search.best_index_
train_score = search.cv_results_['mean_train_score'][winner]
test_score = search.cv_results_['mean_test_score'][winner]
print(f"Train: {train_score:.4f}, Test: {test_score:.4f}, "
      f"Gap: {train_score - test_score:.4f}")
Interviewer focus: The double-underscore naming convention (step__param) for pipeline parameters, using return_train_score=True to diagnose overfitting, and knowing that PCA components is a tunable hyperparameter too.
Challenge 5: Feature Selection Inside Pipeline
# Challenge 5: feature selection belongs inside the CV pipeline.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import (
    VarianceThreshold, SelectKBest, mutual_info_classif, RFECV
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

X, y = make_classification(n_samples=300, n_features=30,
                           n_informative=8, n_redundant=5,
                           random_state=42)

def _selection_pipeline(selector=None):
    """Build scaler -> (optional selector) -> logistic regression."""
    steps = [('scaler', StandardScaler())]
    if selector is not None:
        steps.append(('selector', selector))
    steps.append(('model', LogisticRegression(max_iter=1000)))
    return Pipeline(steps)

methods = {
    # Baseline: no selection step at all.
    'No selection': _selection_pipeline(),
    # Method 1: drop near-constant features.
    'Variance': _selection_pipeline(VarianceThreshold(threshold=0.5)),
    # Method 2: filter method — keep the top-10 by mutual information.
    'Mutual Info': _selection_pipeline(
        SelectKBest(mutual_info_classif, k=10)
    ),
    # Method 3: wrapper method — recursive elimination, count chosen by CV.
    'RFECV': _selection_pipeline(RFECV(
        estimator=LogisticRegression(max_iter=1000),
        step=1, cv=3, scoring='f1', min_features_to_select=5
    )),
}

# Compare all four strategies under the same 5-fold CV.
for name, candidate in methods.items():
    scores = cross_val_score(candidate, X, y, cv=5, scoring='f1')
    print(f"{name:15s}: F1 = {scores.mean():.4f} +/- {scores.std():.4f}")
Interviewer focus: Feature selection MUST be inside the pipeline to avoid data leakage. Knowing when to use each method: variance threshold for initial cleanup, mutual information for filter-based selection, and RFECV for wrapper-based selection. Mention that RFECV is slower but more accurate.
Challenge 6: Custom Scorer Function
# Challenge 6: turn a custom metric function into a GridSearchCV scorer.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.metrics import make_scorer, precision_score, recall_score, fbeta_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

X, y = make_classification(n_samples=500, n_features=10,
                           weights=[0.9, 0.1],  # 90/10 imbalance
                           random_state=42)

def weighted_recall_precision(y_true, y_pred, recall_weight=2.0):
    """Custom metric: weighted combination of recall and precision.

    Higher recall_weight prioritizes catching all positives
    (important for fraud detection, medical diagnosis).
    """
    r = recall_score(y_true, y_pred)
    p = precision_score(y_true, y_pred, zero_division=0)
    if p + r == 0:
        return 0.0
    # Same algebra as F-beta with beta = recall_weight.
    b2 = recall_weight ** 2
    return (1 + b2) * p * r / (b2 * p + r)

# Wrap as a scorer; extra kwargs are forwarded to the metric.
custom_scorer = make_scorer(weighted_recall_precision, recall_weight=2.0)
f2_scorer = make_scorer(fbeta_score, beta=2)  # equivalent F2 formulation

search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid={
        'n_estimators': [50, 100],
        'class_weight': [None, 'balanced', {0: 1, 1: 5}, {0: 1, 1: 10}],
        'max_depth': [5, 10, None],
    },
    # Multi-metric evaluation: every candidate is scored on all four.
    scoring={
        'custom': custom_scorer,
        'f2': f2_scorer,
        'recall': 'recall',
        'precision': 'precision',
    },
    refit='custom',  # this metric picks which model is refit as best
    cv=5,
)
search.fit(X, y)
print(f"Best params: {search.best_params_}")
print(f"Best custom score: {search.best_score_:.4f}")
Interviewer focus: Using make_scorer to wrap custom functions, multi-metric evaluation with scoring dict, and using refit to specify which metric selects the best model. Understanding F-beta as generalized precision-recall trade-off.
Challenge 7: Stratified K-Fold for Imbalanced Data
# Challenge 7: stratification keeps class ratios stable across folds.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import (
    KFold, StratifiedKFold, StratifiedGroupKFold, cross_val_score
)
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=300, n_features=10,
                           weights=[0.9, 0.1], random_state=42)
print(f"Class distribution: {np.bincount(y)} (0s vs 1s)")

def _report_folds(header, split_iter):
    """Print the size and positive-class share of each test fold."""
    print(header)
    for fold_no, (_, test_idx) in enumerate(split_iter, start=1):
        print(f" Fold {fold_no}: {len(test_idx)} samples, "
              f"{y[test_idx].mean():.1%} positive")

# Plain KFold: the minority-class share drifts from fold to fold.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
_report_folds("\nRegular KFold fold compositions:", kf.split(X))

# StratifiedKFold: every fold mirrors the overall class ratio.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
_report_folds("\nStratified KFold fold compositions:", skf.split(X, y))

# Compare the resulting score estimates.
model = LogisticRegression(max_iter=1000)
scores_kf = cross_val_score(model, X, y, cv=kf, scoring='f1')
scores_skf = cross_val_score(model, X, y, cv=skf, scoring='f1')
print(f"\nRegular KFold F1: {scores_kf.mean():.4f} +/- {scores_kf.std():.4f}")
print(f"Stratified KFold F1: {scores_skf.mean():.4f} +/- {scores_skf.std():.4f}")
print("(Stratified has lower variance - more reliable estimate)")
Interviewer focus: Always use StratifiedKFold for classification. The key insight is that regular KFold can create folds with zero minority class samples, making F1 undefined. Mention StratifiedGroupKFold when data has groups (e.g., multiple samples per patient).
Challenge 8: Custom Transformer with fit/transform State
# Challenge 8: a stateful transformer — bounds are learned in fit (on
# training data) and applied during transform (to avoid data leakage).
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
class OutlierClipper(BaseEstimator, TransformerMixin):
    """Clip outliers to percentile bounds learned from training data.

    Parameters
    ----------
    lower_percentile : float, default=1.0
        Percentile used for the per-feature lower clip bound.
    upper_percentile : float, default=99.0
        Percentile used for the per-feature upper clip bound.
    """

    def __init__(self, lower_percentile=1.0, upper_percentile=99.0):
        self.lower_percentile = lower_percentile
        self.upper_percentile = upper_percentile

    def fit(self, X, y=None):
        """Learn per-feature clip bounds from the training data only.

        Bounds are stored with the trailing-underscore convention so that
        transform applies the *training* statistics to any later data.
        """
        X = np.asarray(X)
        self.lower_bounds_ = np.percentile(X, self.lower_percentile, axis=0)
        self.upper_bounds_ = np.percentile(X, self.upper_percentile, axis=0)
        self.n_features_in_ = X.shape[1]
        return self

    def transform(self, X):
        """Clip X to the bounds learned in fit; returns a new array."""
        # np.clip already allocates a fresh output array, so the explicit
        # .copy() the original made here was redundant.
        return np.clip(np.asarray(X, dtype=float),
                       self.lower_bounds_, self.upper_bounds_)

    def get_feature_names_out(self, input_features=None):
        """Feature names pass through unchanged (one output per input)."""
        if input_features is None:
            return [f"x{i}" for i in range(self.n_features_in_)]
        return list(input_features)
# Demo: two gross outliers hurt a plain linear fit; clipping fixes it.
# Note: the RNG call order below matters for reproducibility.
np.random.seed(42)
X = np.random.randn(200, 3)
X[0, 0] = 100   # inject an extreme outlier
X[1, 1] = -50   # inject an extreme outlier
y = X[:, 0] * 2 + X[:, 1] - X[:, 2] + np.random.randn(200) * 0.5

clip_pipe = Pipeline([
    ('clipper', OutlierClipper(lower_percentile=1, upper_percentile=99)),
    ('model', LinearRegression())
])
scores = cross_val_score(clip_pipe, X, y, cv=5, scoring='r2')
print(f"With clipping: R2 = {scores.mean():.4f}")

scores_no_clip = cross_val_score(LinearRegression(), X, y, cv=5, scoring='r2')
print(f"Without clipping: R2 = {scores_no_clip.mean():.4f}")
Interviewer focus: The critical point is that percentile bounds are computed in fit using training data and stored as attributes (with trailing underscore convention). The same bounds are applied in transform, preventing data leakage from test data.
Challenge 9: Multi-Output Classification
# Challenge 9: multi-label classification — wrap a single-output model in
# MultiOutputClassifier and evaluate properly.
import numpy as np
from sklearn.datasets import make_multilabel_classification
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, f1_score

# Generate multi-label data (3 possible labels per sample)
X, y = make_multilabel_classification(
    n_samples=300, n_features=15, n_classes=3,
    n_labels=2, random_state=42
)
print(f"X shape: {X.shape}, y shape: {y.shape}")
print(f"Sample labels: {y[:5]}")

# Pipeline with multi-output wrapper: one GradientBoostingClassifier
# is fitted per label column.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', MultiOutputClassifier(
        GradientBoostingClassifier(n_estimators=50, random_state=42),
        n_jobs=-1
    ))
])

# Custom scorer for multi-label targets
def multilabel_f1(y_true, y_pred):
    """Compute macro F1 across all labels."""
    return f1_score(y_true, y_pred, average='macro')

scorer = make_scorer(multilabel_f1)
scores = cross_val_score(pipe, X, y, cv=5, scoring=scorer)
print(f"Multi-label Macro F1: {scores.mean():.4f} +/- {scores.std():.4f}")

# Fit on everything and inspect per-label (training) performance
pipe.fit(X, y)
y_pred = pipe.predict(X)
for i in range(y.shape[1]):
    f1 = f1_score(y[:, i], y_pred[:, i])
    print(f" Label {i}: F1 = {f1:.4f}")
Interviewer focus: Knowing the difference between multi-class (one label per sample) and multi-label (multiple labels per sample). Using MultiOutputClassifier as a wrapper, and understanding that multi-label metrics need special handling (average='macro' vs 'micro' vs 'samples').
Challenge 10: Model Persistence and Versioning
# Challenge 10: persist a trained pipeline together with the metadata
# needed to reproduce and audit it.
import numpy as np
import json
import pickle
import hashlib
from datetime import datetime

import sklearn
from sklearn.datasets import make_classification
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

X, y = make_classification(n_samples=200, n_features=5, random_state=42)
feature_names = ['age', 'income', 'score', 'tenure', 'activity']

# Build and train pipeline
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(max_iter=1000))
])
scores = cross_val_score(pipe, X, y, cv=5, scoring='accuracy')
pipe.fit(X, y)
y_pred_original = pipe.predict(X)

# Save model with metadata. Recording the *actual* sklearn version
# (not a hard-coded string, as the original did) is what makes
# load-time compatibility checks possible.
model_artifact = {
    'pipeline': pipe,
    'metadata': {
        'created_at': datetime.now().isoformat(),
        'sklearn_version': sklearn.__version__,
        'feature_names': feature_names,
        'cv_accuracy_mean': float(scores.mean()),
        'cv_accuracy_std': float(scores.std()),
        'training_samples': len(X),
        # md5 is used as a content fingerprint for change detection,
        # not as a security measure.
        'model_hash': hashlib.md5(pickle.dumps(pipe)).hexdigest()
    }
}

# Save (in interview, explain joblib vs pickle)
# joblib is preferred for sklearn: handles large numpy arrays better
# import joblib; joblib.dump(model_artifact, 'model.joblib')

# For demo: serialize and deserialize in memory
serialized = pickle.dumps(model_artifact)
loaded_artifact = pickle.loads(serialized)

# Verify the round-tripped pipeline predicts identically
loaded_pipe = loaded_artifact['pipeline']
y_pred_loaded = loaded_pipe.predict(X)
assert np.array_equal(y_pred_original, y_pred_loaded)

print("Metadata:")
for k, v in loaded_artifact['metadata'].items():
    print(f" {k}: {v}")
print("\nPredictions match: True")
Interviewer focus: Knowing that joblib is preferred over pickle for sklearn models (better numpy array handling), including metadata for reproducibility, and understanding version compatibility risks. In production, mention MLflow or similar model registries.
Lilly Tech Systems