Intermediate
Data Processing Challenges
Real-world ML interviews often start with messy data. Master feature engineering, missing value imputation, categorical encoding, and normalization — the skills that separate data scientists from algorithm memorizers.
Interview Question #1: “Given a dataset with missing values in both numeric and categorical columns, write functions to handle them. Explain the tradeoffs of each approach.”
Handling Missing Values
import numpy as np
import pandas as pd
def handle_missing_values(df, strategy='auto'):
    """
    Handle missing values in a DataFrame.

    Parameters
    ----------
    df : pd.DataFrame
        Input data; never modified in place (a copy is returned).
    strategy : str
        - 'auto': mean for numeric, mode for categorical
        - 'median': median for numeric, mode for categorical
        - 'drop': drop rows with any missing value
        - 'indicator': add binary columns indicating missingness,
          then impute (mean for numeric, mode for categorical)

    Returns
    -------
    pd.DataFrame
        DataFrame with missing values handled.

    Raises
    ------
    ValueError
        If ``strategy`` is not one of the supported options.
        (Previously an unknown strategy silently imputed categorical
        columns while leaving numeric NaNs untouched.)
    """
    valid_strategies = {'auto', 'median', 'drop', 'indicator'}
    if strategy not in valid_strategies:
        raise ValueError(
            f"Unknown strategy {strategy!r}; expected one of {sorted(valid_strategies)}"
        )

    df_clean = df.copy()
    if strategy == 'drop':
        return df_clean.dropna()

    for col in df_clean.columns:
        if df_clean[col].isnull().sum() == 0:
            continue
        if strategy == 'indicator':
            # Record missingness before imputing — it can itself be predictive.
            df_clean[f'{col}_missing'] = df_clean[col].isnull().astype(int)
        if pd.api.types.is_numeric_dtype(df_clean[col]):
            # Numeric: mean for 'auto'/'indicator', median for 'median'.
            # is_numeric_dtype covers all numeric dtypes, unlike the old
            # hard-coded ['float64', 'int64', ...] whitelist.
            if strategy == 'median':
                fill_value = df_clean[col].median()
            else:
                fill_value = df_clean[col].mean()
            df_clean[col] = df_clean[col].fillna(fill_value)
        else:
            # Categorical: fill with the most frequent value, if any exists.
            mode_val = df_clean[col].mode()
            if len(mode_val) > 0:
                # .iloc for positional access (mode() returns a 0..n indexed Series)
                df_clean[col] = df_clean[col].fillna(mode_val.iloc[0])
    return df_clean
# ---- Test ----
# Demo: a small frame with NaNs in both numeric and categorical columns.
raw_columns = {
    'age': [25, 30, np.nan, 45, 50, np.nan, 35],
    'salary': [50000, 60000, 70000, np.nan, 90000, 80000, np.nan],
    'city': ['NYC', 'LA', np.nan, 'NYC', 'SF', 'LA', 'NYC'],
    'experience': [2, 5, 8, 12, np.nan, 7, 3],
}
df = pd.DataFrame(raw_columns)

print("Original:")
print(df)
print(f"\nMissing values:\n{df.isnull().sum()}\n")

# Mean (numeric) / mode (categorical) imputation.
df_auto = handle_missing_values(df, strategy='auto')
print("After auto imputation:")
print(df_auto)

# Imputation plus binary missingness-indicator columns.
df_indicator = handle_missing_values(df, strategy='indicator')
print("\nWith missing indicators:")
print(df_indicator.columns.tolist())
What interviewers look for:
- Knowing that mean imputation biases variance downward
- Mentioning that you should never compute fill values from the test set (data leakage)
- Understanding that missing indicator features can be informative (the missingness itself may be predictive)
- Knowing when to drop vs impute (drop when <5% missing; impute when more)
Interview Question #2: “Implement one-hot encoding, label encoding, and target encoding from scratch. When would you use each?”
Encoding Categorical Variables
def one_hot_encode(series):
    """
    One-hot encode a categorical series from scratch.
    Returns a DataFrame with binary columns.

    Columns are emitted in sorted category order for a deterministic
    layout; rows whose value is NaN get 0 in every indicator column.
    """
    indicator_columns = {
        f'{series.name}_{category}': (series == category).astype(int)
        for category in sorted(series.dropna().unique())
    }
    return pd.DataFrame(indicator_columns)
def label_encode(series):
    """
    Label encode: map categories to integers.
    Returns encoded series and the mapping dictionary.

    Integer codes follow the sorted order of the observed categories;
    NaN values remain NaN in the encoded output.
    """
    ordered_categories = sorted(series.dropna().unique())
    mapping = {}
    for code, category in enumerate(ordered_categories):
        mapping[category] = code
    return series.map(mapping), mapping
def target_encode(series, target, smoothing=10):
    """
    Target encoding: replace category with mean of target variable.
    Uses smoothing to prevent overfitting on rare categories.
    smoothing: higher = more regularization toward global mean
    """
    overall_mean = target.mean()
    frame = pd.DataFrame({'category': series, 'target': target})
    per_category = frame.groupby('category')['target'].agg(['mean', 'count'])
    counts = per_category['count']
    means = per_category['mean']
    # Shrink each category mean toward the global mean; the pull is
    # strongest for rare categories (weight = count / (count + smoothing)).
    shrunk = (counts * means + smoothing * overall_mean) / (counts + smoothing)
    return series.map(shrunk)
# ---- Test ----
# Demo frame: two low-cardinality categoricals plus a numeric target.
data = pd.DataFrame({
    'color': ['red', 'blue', 'green', 'red', 'blue', 'green', 'red', 'blue'],
    'size': ['S', 'M', 'L', 'S', 'M', 'L', 'M', 'S'],
    'price': [10, 20, 30, 12, 22, 28, 15, 18],
})

# One-hot encoding
print("One-hot encoding:")
print(one_hot_encode(data['color']))

# Label encoding
encoded, mapping = label_encode(data['color'])
print(f"\nLabel encoding: {mapping}")
print(encoded.values)

# Target encoding: color -> smoothed mean price
print(f"\nTarget encoding (color -> mean price):")
print(target_encode(data['color'], data['price']).values)
| Method | When to Use | Pros | Cons |
|---|---|---|---|
| One-Hot | Low cardinality (<20 categories), no ordinal relationship | No assumed ordering, works with all models | High dimensionality for many categories |
| Label | Ordinal features (S/M/L), tree-based models | Single column, memory efficient | Implies false ordering for non-ordinal data |
| Target | High cardinality, supervised learning | Single column, captures target relationship | Risk of data leakage, needs smoothing |
Data leakage trap: Target encoding must use only training data statistics. If you compute the target mean using the full dataset (including test), you leak test labels into your features. Always fit the encoding on train and transform both train and test.
Interview Question #3: “Implement StandardScaler and MinMaxScaler from scratch. Explain when normalization vs standardization matters and common mistakes.”
Feature Scaling
class StandardScaler:
    """
    Standardize features by removing the mean and scaling to unit variance.

        z = (x - mean) / std

    Fit on training data only, then transform both train and test —
    fitting on the full dataset leaks test statistics into training.
    """

    def __init__(self):
        # Per-feature statistics learned by fit().
        self.mean_ = None
        self.std_ = None

    def fit(self, X):
        """Compute per-feature mean and std from training data.

        Accepts any array-like (nested lists or ndarray). Constant
        features (std == 0) get std 1.0 so transform() performs a
        mean-shift instead of dividing by zero.
        """
        X = np.asarray(X, dtype=float)
        self.mean_ = np.mean(X, axis=0)
        std = np.std(X, axis=0)
        # np.where (vs boolean-index assignment) also handles the 0-d std
        # produced by 1-D input, where `std[std == 0] = 1.0` would raise.
        self.std_ = np.where(std == 0, 1.0, std)
        return self

    def transform(self, X):
        """Scale features using the stored mean and std."""
        return (np.asarray(X, dtype=float) - self.mean_) / self.std_

    def fit_transform(self, X):
        """Fit and transform in one step."""
        return self.fit(X).transform(X)

    def inverse_transform(self, X_scaled):
        """Reverse the scaling."""
        return np.asarray(X_scaled, dtype=float) * self.std_ + self.mean_
class MinMaxScaler:
    """
    Scale features to a given range, [0, 1] by default.

        x_scaled = (x - min) / (max - min)

    Fit on training data only, then transform both train and test.
    """

    def __init__(self, feature_range=(0, 1)):
        # Target (lo, hi) output range.
        self.feature_range = feature_range
        # Per-feature statistics learned by fit().
        self.min_ = None
        self.max_ = None
        self.range_ = None

    def fit(self, X):
        """Compute per-feature min and max from training data.

        Accepts any array-like. Constant features get range 1.0 so
        transform() maps them to the low end instead of dividing by zero.
        """
        X = np.asarray(X, dtype=float)
        self.min_ = np.min(X, axis=0)
        self.max_ = np.max(X, axis=0)
        span = self.max_ - self.min_
        # np.where also handles the 0-d span produced by 1-D input,
        # where `span[span == 0] = 1.0` would raise.
        self.range_ = np.where(span == 0, 1.0, span)
        return self

    def transform(self, X):
        """Scale features into feature_range using stored min/range."""
        lo, hi = self.feature_range
        X_scaled = (np.asarray(X, dtype=float) - self.min_) / self.range_
        return X_scaled * (hi - lo) + lo

    def fit_transform(self, X):
        """Fit and transform in one step."""
        return self.fit(X).transform(X)

    def inverse_transform(self, X_scaled):
        """Reverse the scaling (added for parity with StandardScaler)."""
        lo, hi = self.feature_range
        X_unit = (np.asarray(X_scaled, dtype=float) - lo) / (hi - lo)
        return X_unit * self.range_ + self.min_
# ---- Test ----
# Three features on wildly different scales: 1-5, 100-500, 0.1-0.9.
X = np.array(
    [
        [1, 200, 0.5],
        [2, 400, 0.3],
        [3, 100, 0.8],
        [4, 300, 0.1],
        [5, 500, 0.9],
    ],
    dtype=float,
)
print("Original:\n", X)

# Standardization: zero mean, unit variance per column.
scaler = StandardScaler()
X_standard = scaler.fit_transform(X)
print("\nStandardized (mean=0, std=1):\n", X_standard)
print("Mean:", np.mean(X_standard, axis=0))  # Should be ~0
print("Std: ", np.std(X_standard, axis=0))  # Should be ~1

# Min-max normalization: each column squeezed into [0, 1].
minmax = MinMaxScaler()
X_minmax = minmax.fit_transform(X)
print("\nMin-Max scaled [0,1]:\n", X_minmax)
print("Min:", np.min(X_minmax, axis=0))  # Should be 0
print("Max:", np.max(X_minmax, axis=0))  # Should be 1
Critical interview point: Always fit on training data only, then transform both train and test. The most common data leakage mistake is calling fit_transform on the entire dataset before splitting. This leaks test statistics into training.
Interview Question #4: “Given this e-commerce dataset, create useful features for predicting customer churn. Show your feature engineering process.”
Feature Engineering Challenge
import pandas as pd
import numpy as np
# Simulated e-commerce data
np.random.seed(42)
n = 100
# NOTE: the order of the random draws below matters — it must match the
# seeded sequence so the generated data stays reproducible.
_columns = {}
_columns['customer_id'] = np.random.choice(range(20), n)
_columns['order_date'] = pd.date_range('2025-01-01', periods=n, freq='D')
_columns['order_amount'] = np.random.exponential(50, n).round(2)
_columns['product_category'] = np.random.choice(
    ['electronics', 'clothing', 'food', 'books'], n
)
_columns['returned'] = np.random.choice([0, 1], n, p=[0.85, 0.15])
orders = pd.DataFrame(_columns)
def engineer_customer_features(orders_df):
    """
    Create customer-level features from order data.

    Builds RFM-style features (Recency, Frequency, Monetary) plus ratio
    and time-gap features — the kind of aggregation interviewers expect.

    Parameters
    ----------
    orders_df : pd.DataFrame
        Order-level data with columns 'customer_id', 'order_date'
        (datetime), 'order_amount', 'product_category', 'returned' (0/1).

    Returns
    -------
    pd.DataFrame
        One row per customer_id. Customers with a single order get
        spend_std 0.0 (sample std is undefined for n=1) and
        avg_days_between_orders 0.
    """
    features = orders_df.groupby('customer_id').agg(
        # Recency: days since last order (via last_order_date below)
        last_order_date=('order_date', 'max'),
        # Frequency: total number of orders
        total_orders=('order_amount', 'count'),
        # Monetary: total and average spend
        total_spend=('order_amount', 'sum'),
        avg_order_value=('order_amount', 'mean'),
        max_order_value=('order_amount', 'max'),
        min_order_value=('order_amount', 'min'),
        # Return-rate numerator (dropped after the ratio is computed)
        return_count=('returned', 'sum'),
        # Category diversity
        unique_categories=('product_category', 'nunique'),
    ).reset_index()

    # Recency is measured against the newest order in the dataset.
    reference_date = orders_df['order_date'].max()
    features['days_since_last_order'] = (
        reference_date - features['last_order_date']
    ).dt.days

    features['return_rate'] = (
        features['return_count'] / features['total_orders']
    ).round(4)

    # Align by customer_id via .map instead of positional .values — the
    # old positional assignment silently mis-assigned values if the two
    # groupby results ever came back in different orders. Single-order
    # customers have undefined sample std; fill with 0.
    spend_std = orders_df.groupby('customer_id')['order_amount'].std()
    features['spend_std'] = features['customer_id'].map(spend_std).fillna(0.0)

    def _avg_days_between_orders(dates):
        """Mean gap in days between consecutive orders; 0 if < 2 orders."""
        dates = dates.sort_values()
        if len(dates) < 2:
            return 0
        return dates.diff().dropna().dt.days.mean()

    order_gaps = (
        orders_df.groupby('customer_id')['order_date']
        .apply(_avg_days_between_orders)
    )
    features['avg_days_between_orders'] = features['customer_id'].map(order_gaps)

    # Drop intermediate columns
    return features.drop(columns=['last_order_date', 'return_count'])
# ---- Demo: build and inspect the customer-level feature table ----
customer_features = engineer_customer_features(orders)
print("Engineered features:")
print(customer_features.head())
print(f"\nFeature columns: {customer_features.columns.tolist()}")
print(f"Shape: {customer_features.shape}")
What interviewers look for:
- RFM features (Recency, Frequency, Monetary) — the gold standard for customer analytics
- Using `groupby` and `agg` efficiently rather than looping
- Creating ratio features (return rate, not just return count)
- Thinking about time-based features (days between orders, days since last order)
- Handling edge cases (customers with only 1 order)
Interview Question #5: “Write a function to detect and handle outliers using both IQR and z-score methods. When would you use each?”
Outlier Detection
def detect_outliers_iqr(data, factor=1.5):
    """
    Detect outliers using the Interquartile Range (IQR) method.
    Robust to skewed distributions.

    Values outside [Q1 - factor*IQR, Q3 + factor*IQR] are flagged.
    Returns a dict with the boolean mask, the (lower, upper) bounds,
    the outlier count, and the outlier values themselves.
    """
    q1, q3 = np.percentile(data, [25, 75])
    spread = q3 - q1
    lower_bound = q1 - factor * spread
    upper_bound = q3 + factor * spread
    is_outlier = (data < lower_bound) | (data > upper_bound)
    return {
        'mask': is_outlier,
        'bounds': (lower_bound, upper_bound),
        'n_outliers': is_outlier.sum(),
        'outlier_values': data[is_outlier],
    }
def detect_outliers_zscore(data, threshold=3.0):
    """
    Detect outliers using the z-score method.
    Assumes an approximately normal distribution.

    Values with |z| > threshold are flagged.

    Parameters
    ----------
    data : np.ndarray
    threshold : float
        Flag values whose absolute z-score exceeds this.

    Returns
    -------
    dict with 'mask', 'z_scores', 'n_outliers', 'outlier_values'.

    Notes
    -----
    A constant input (std == 0) previously produced a divide-by-zero
    warning and NaN z-scores; it now yields z-scores of 0 everywhere,
    i.e. no outliers.
    """
    mean = np.mean(data)
    std = np.std(data)
    if std == 0:
        # All values identical: nothing can be an outlier.
        z_scores = np.zeros_like(np.asarray(data, dtype=float))
    else:
        z_scores = np.abs((data - mean) / std)
    outlier_mask = z_scores > threshold
    return {
        'mask': outlier_mask,
        'z_scores': z_scores,
        'n_outliers': outlier_mask.sum(),
        'outlier_values': data[outlier_mask],
    }
def handle_outliers(data, method='clip', factor=1.5):
    """
    Handle IQR-detected outliers by clipping or removing.

    Clipping is usually preferred as it preserves sample size.

    Parameters
    ----------
    data : np.ndarray
    method : str
        'clip' caps values at the IQR bounds; 'remove' drops them.
    factor : float
        IQR multiplier passed through to detect_outliers_iqr.

    Returns
    -------
    np.ndarray
        Clipped array (same length) or filtered array (outliers removed).

    Raises
    ------
    ValueError
        For an unrecognized method. (Previously an unknown method
        silently returned the data unchanged — a typo like 'cilp'
        would look like a successful no-op.)
    """
    result = detect_outliers_iqr(data, factor)
    lower, upper = result['bounds']
    if method == 'clip':
        return np.clip(data, lower, upper)
    if method == 'remove':
        return data[~result['mask']]
    raise ValueError(f"Unknown method {method!r}; expected 'clip' or 'remove'")
# ---- Test ----
data = np.array([10, 12, 14, 15, 16, 18, 20, 100, 200])  # 100, 200 are outliers

# IQR-based detection (robust to skew).
iqr_report = detect_outliers_iqr(data)
print(f"IQR method: {iqr_report['n_outliers']} outliers found")
print(f"Bounds: {iqr_report['bounds']}")
print(f"Outlier values: {iqr_report['outlier_values']}")

# Z-score-based detection (assumes roughly normal data).
zscore_report = detect_outliers_zscore(data)
print(f"\nZ-score method: {zscore_report['n_outliers']} outliers found")
print(f"Outlier values: {zscore_report['outlier_values']}")

# Clip rather than remove, preserving sample size.
clipped_values = handle_outliers(data, method='clip')
print(f"\nAfter clipping: {clipped_values}")
Lilly Tech Systems