Intermediate

Data Processing Challenges

Real-world ML interviews often start with messy data. Master feature engineering, missing value imputation, categorical encoding, and normalization — the skills that separate data scientists from algorithm memorizers.

📝
Interview Question #1: “Given a dataset with missing values in both numeric and categorical columns, write functions to handle them. Explain the tradeoffs of each approach.”

Handling Missing Values

import numpy as np
import pandas as pd

def handle_missing_values(df, strategy='auto'):
    """
    Handle missing values in a DataFrame.

    Strategies:
    - 'auto': mean for numeric, mode for categorical
    - 'median': median for numeric, mode for categorical
    - 'drop': drop rows with any missing value
    - 'indicator': add binary columns indicating missingness
      (then imputes like 'auto')

    Parameters:
        df: input DataFrame (never modified in place — a copy is returned)
        strategy: one of 'auto', 'median', 'drop', 'indicator'

    Returns:
        New DataFrame with missing values handled.
    """
    df_clean = df.copy()

    if strategy == 'drop':
        return df_clean.dropna()

    # Iterating the Index captured here is safe: columns added inside the
    # loop (the *_missing indicators) are not visited.
    for col in df_clean.columns:
        # Nothing to impute in this column.
        if df_clean[col].isnull().sum() == 0:
            continue

        if strategy == 'indicator':
            # Record missingness BEFORE imputation erases the information.
            df_clean[f'{col}_missing'] = df_clean[col].isnull().astype(int)

        if pd.api.types.is_numeric_dtype(df_clean[col]):
            # Covers every numeric dtype (int8..64, uint*, float16..64),
            # not just the four hard-coded dtype strings.
            if strategy in ('auto', 'indicator'):
                df_clean[col] = df_clean[col].fillna(df_clean[col].mean())
            elif strategy == 'median':
                df_clean[col] = df_clean[col].fillna(df_clean[col].median())
        else:
            # Categorical/object: fill with the most frequent value.
            mode_val = df_clean[col].mode()
            if len(mode_val) > 0:
                # .iloc[0] is explicitly positional; mode_val[0] relied on
                # label-based lookup that happens to work on a RangeIndex.
                df_clean[col] = df_clean[col].fillna(mode_val.iloc[0])

    return df_clean


# ---- Test ----
df = pd.DataFrame({
    'age': [25, 30, np.nan, 45, 50, np.nan, 35],
    'salary': [50000, 60000, 70000, np.nan, 90000, 80000, np.nan],
    'city': ['NYC', 'LA', np.nan, 'NYC', 'SF', 'LA', 'NYC'],
    'experience': [2, 5, 8, 12, np.nan, 7, 3],
})

print("Original:")
print(df)
missing_per_column = df.isnull().sum()
print(f"\nMissing values:\n{missing_per_column}\n")

# Numeric columns get the mean, categorical columns get the mode.
df_auto = handle_missing_values(df, strategy='auto')
print("After auto imputation:")
print(df_auto)

# Same imputation, plus binary *_missing flag columns.
df_indicator = handle_missing_values(df, strategy='indicator')
print("\nWith missing indicators:")
print(df_indicator.columns.tolist())

What interviewers look for:

  • Knowing that mean imputation biases variance downward
  • Mentioning that you should never compute fill values from the test set (data leakage)
  • Understanding that missing indicator features can be informative (the missingness itself may be predictive)
  • Knowing when to drop vs impute (drop when <5% missing; impute when more)

📝
Interview Question #2: “Implement one-hot encoding, label encoding, and target encoding from scratch. When would you use each?”

Encoding Categorical Variables

def one_hot_encode(series):
    """
    One-hot encode a categorical series from scratch.

    Produces one binary 0/1 column per distinct non-null category,
    named '<series name>_<category>', in sorted category order.
    """
    binary_columns = {
        f'{series.name}_{category}': (series == category).astype(int)
        for category in sorted(series.dropna().unique())
    }
    return pd.DataFrame(binary_columns)


def label_encode(series):
    """
    Label encode: assign each category an integer code in sorted order.

    Returns:
        (encoded series, {category: code} mapping dictionary)
    """
    mapping = {}
    for code, category in enumerate(sorted(series.dropna().unique())):
        mapping[category] = code
    return series.map(mapping), mapping


def target_encode(series, target, smoothing=10):
    """
    Target encoding: replace each category with a smoothed mean of the
    target variable for that category.

    smoothing: higher = more regularization toward the global mean
    (rare categories get pulled harder toward it).
    """
    global_mean = target.mean()
    frame = pd.DataFrame({'category': series, 'target': target})
    grouped = frame.groupby('category')['target']
    counts = grouped.count()
    means = grouped.mean()

    # Weighted blend of the per-category mean and the global mean,
    # with weight count / (count + smoothing) on the category mean.
    smoothed = (counts * means + smoothing * global_mean) / (counts + smoothing)

    return series.map(smoothed)


# ---- Test ----
data = pd.DataFrame({
    'color': ['red', 'blue', 'green', 'red', 'blue', 'green', 'red', 'blue'],
    'size': ['S', 'M', 'L', 'S', 'M', 'L', 'M', 'S'],
    'price': [10, 20, 30, 12, 22, 28, 15, 18],
})

# One-hot encoding
print("One-hot encoding:")
one_hot = one_hot_encode(data['color'])
print(one_hot)

# Label encoding
encoded, mapping = label_encode(data['color'])
print(f"\nLabel encoding: {mapping}")
print(encoded.values)

# Target encoding
target_enc = target_encode(data['color'], data['price'])
print("\nTarget encoding (color -> mean price):")
print(target_enc.values)
| Method  | When to Use                                                | Pros                                        | Cons                                       |
|---------|------------------------------------------------------------|---------------------------------------------|--------------------------------------------|
| One-Hot | Low cardinality (<20 categories), no ordinal relationship  | No assumed ordering, works with all models  | High dimensionality for many categories    |
| Label   | Ordinal features (S/M/L), tree-based models                | Single column, memory efficient             | Implies false ordering for non-ordinal data |
| Target  | High cardinality, supervised learning                      | Single column, captures target relationship | Risk of data leakage, needs smoothing      |
Data leakage trap: Target encoding must use only training data statistics. If you compute the target mean using the full dataset (including test), you leak test labels into your features. Always fit the encoding on train and transform both train and test.

📝
Interview Question #3: “Implement StandardScaler and MinMaxScaler from scratch. Explain when normalization vs standardization matters and common mistakes.”

Feature Scaling

class StandardScaler:
    """
    Standardize features by removing the mean and scaling to unit variance.

        z = (x - mean) / std

    Statistics are computed once in fit() and reused in transform(), so
    you can fit on the training split and apply the identical scaling to
    the test split (avoiding data leakage).
    """

    def __init__(self):
        # Per-feature statistics, populated by fit().
        self.mean_ = None
        self.std_ = None

    def fit(self, X):
        """Compute per-column mean and std from training data.

        Also works for 1-D input: np.std then returns a scalar, which the
        original boolean-mask assignment (std_[std_ == 0] = 1.0) crashed
        on; np.where handles scalar and array alike.
        """
        self.mean_ = np.mean(X, axis=0)
        std = np.std(X, axis=0)
        # Constant features have std 0; substitute 1 so transform() maps
        # them to 0 instead of dividing by zero.
        self.std_ = np.where(std == 0, 1.0, std)
        return self

    def transform(self, X):
        """Scale features using the stored mean and std."""
        return (X - self.mean_) / self.std_

    def fit_transform(self, X):
        """Fit and transform in one step."""
        return self.fit(X).transform(X)

    def inverse_transform(self, X_scaled):
        """Reverse the scaling back to the original units."""
        return X_scaled * self.std_ + self.mean_


class MinMaxScaler:
    """
    Scale features to a given range, [0, 1] by default.

        x_scaled = (x - min) / (max - min), then mapped onto feature_range

    Like StandardScaler, statistics come from fit() only: fit on the
    training split, transform both train and test.
    """

    def __init__(self, feature_range=(0, 1)):
        self.feature_range = feature_range
        # Per-feature statistics, populated by fit().
        self.min_ = None
        self.max_ = None
        self.range_ = None  # was only created in fit(); declared here too

    def fit(self, X):
        """Compute per-column min, max, and span from training data."""
        self.min_ = np.min(X, axis=0)
        self.max_ = np.max(X, axis=0)
        # Constant features span 0; substitute 1 so they map to the low end
        # of feature_range instead of dividing by zero. np.where also
        # handles the scalar produced by 1-D input, which the original
        # mask assignment (range_[range_ == 0] = 1.0) crashed on.
        span = self.max_ - self.min_
        self.range_ = np.where(span == 0, 1.0, span)
        return self

    def transform(self, X):
        """Scale features into feature_range using the stored min/span."""
        lo, hi = self.feature_range
        X_scaled = (X - self.min_) / self.range_
        return X_scaled * (hi - lo) + lo

    def fit_transform(self, X):
        """Fit and transform in one step."""
        return self.fit(X).transform(X)

    def inverse_transform(self, X_scaled):
        """Reverse the scaling (added for parity with StandardScaler)."""
        lo, hi = self.feature_range
        return (X_scaled - lo) / (hi - lo) * self.range_ + self.min_


# ---- Test ----
X = np.array([
    [1, 200, 0.5],
    [2, 400, 0.3],
    [3, 100, 0.8],
    [4, 300, 0.1],
    [5, 500, 0.9],
], dtype=float)

print("Original:\n", X)

# Standardization: center each column, divide by its std.
scaler = StandardScaler()
X_standard = scaler.fit_transform(X)
print("\nStandardized (mean=0, std=1):\n", X_standard)
print("Mean:", np.mean(X_standard, axis=0))  # Should be ~0
print("Std: ", np.std(X_standard, axis=0))   # Should be ~1

# Min-max scaling: squash each column into [0, 1].
minmax = MinMaxScaler()
X_minmax = minmax.fit_transform(X)
print("\nMin-Max scaled [0,1]:\n", X_minmax)
print("Min:", np.min(X_minmax, axis=0))  # Should be 0
print("Max:", np.max(X_minmax, axis=0))  # Should be 1
💡
Critical interview point: Always fit on training data only, then transform both train and test. The most common data leakage mistake is calling fit_transform on the entire dataset before splitting. This leaks test statistics into training.

📝
Interview Question #4: “Given this e-commerce dataset, create useful features for predicting customer churn. Show your feature engineering process.”

Feature Engineering Challenge

import pandas as pd
import numpy as np

# Simulated e-commerce data
np.random.seed(42)
n = 100
# Draw each column separately; the np.random call order matches the
# original dict-literal order, so the seeded values are identical.
customer_ids = np.random.choice(range(20), n)
order_dates = pd.date_range('2025-01-01', periods=n, freq='D')
order_amounts = np.random.exponential(50, n).round(2)
categories = np.random.choice(
    ['electronics', 'clothing', 'food', 'books'], n
)
was_returned = np.random.choice([0, 1], n, p=[0.85, 0.15])
orders = pd.DataFrame({
    'customer_id': customer_ids,
    'order_date': order_dates,
    'order_amount': order_amounts,
    'product_category': categories,
    'returned': was_returned,
})

def engineer_customer_features(orders_df):
    """
    Create customer-level features from order data.

    Builds RFM-style features (Recency, Frequency, Monetary) plus return
    rate, category diversity, spend variability, and order cadence.

    Parameters:
        orders_df: DataFrame with columns customer_id, order_date
            (datetime), order_amount, product_category, returned (0/1).

    Returns:
        DataFrame with one row per customer_id and the engineered
        feature columns.
    """
    features = orders_df.groupby('customer_id').agg(
        # Recency: date of most recent order (converted to days below)
        last_order_date=('order_date', 'max'),

        # Frequency: total number of orders
        total_orders=('order_amount', 'count'),

        # Monetary: total and average spend
        total_spend=('order_amount', 'sum'),
        avg_order_value=('order_amount', 'mean'),
        max_order_value=('order_amount', 'max'),
        min_order_value=('order_amount', 'min'),

        # Return rate numerator
        return_count=('returned', 'sum'),

        # Category diversity
        unique_categories=('product_category', 'nunique'),
    ).reset_index()

    # Recency in days, measured against the newest order in the dataset.
    reference_date = orders_df['order_date'].max()
    features['days_since_last_order'] = (
        reference_date - features['last_order_date']
    ).dt.days

    features['return_rate'] = (
        features['return_count'] / features['total_orders']
    ).round(4)

    # Spend variability. Align by customer_id with map() instead of the
    # original positional `.values` assignment (fragile if row order ever
    # differs), and fill the NaN std that single-order customers produce
    # with 0 — the "customers with only 1 order" edge case.
    spend_std = orders_df.groupby('customer_id')['order_amount'].std()
    features['spend_std'] = features['customer_id'].map(spend_std).fillna(0.0)

    # Order cadence: average days between consecutive orders.
    def _avg_days_between_orders(dates):
        # A single order has no gap; report 0 rather than NaN.
        if len(dates) < 2:
            return 0
        gaps = dates.sort_values().diff().dropna().dt.days
        return gaps.mean()

    cadence = orders_df.groupby('customer_id')['order_date'] \
        .apply(_avg_days_between_orders)
    features['avg_days_between_orders'] = features['customer_id'].map(cadence)

    # Drop intermediates kept only to derive the features above.
    return features.drop(columns=['last_order_date', 'return_count'])


# Build one feature row per customer and inspect the result.
customer_features = engineer_customer_features(orders)
print("Engineered features:")
print(customer_features.head())
feature_names = customer_features.columns.tolist()
print(f"\nFeature columns: {feature_names}")
print(f"Shape: {customer_features.shape}")

What interviewers look for:

  • RFM features (Recency, Frequency, Monetary) — the gold standard for customer analytics
  • Using groupby and agg efficiently rather than looping
  • Creating ratio features (return rate, not just return count)
  • Thinking about time-based features (days between orders, days since last order)
  • Handling edge cases (customers with only 1 order)

📝
Interview Question #5: “Write a function to detect and handle outliers using both IQR and z-score methods. When would you use each?”

Outlier Detection

def detect_outliers_iqr(data, factor=1.5):
    """
    Detect outliers with the Interquartile Range (IQR) rule.

    A point is flagged when it falls outside
    [Q1 - factor*IQR, Q3 + factor*IQR]. Robust to skewed distributions
    because quartiles ignore extreme values.
    """
    q1, q3 = np.percentile(data, [25, 75])
    spread = q3 - q1
    lower_bound = q1 - factor * spread
    upper_bound = q3 + factor * spread

    is_outlier = (data < lower_bound) | (data > upper_bound)

    return {
        'mask': is_outlier,
        'bounds': (lower_bound, upper_bound),
        'n_outliers': is_outlier.sum(),
        'outlier_values': data[is_outlier],
    }


def detect_outliers_zscore(data, threshold=3.0):
    """
    Detect outliers using the z-score method.

    Assumes an approximately normal distribution; points more than
    `threshold` standard deviations from the mean are flagged.

    Fix: constant data has std 0 and the original divided by it,
    producing NaN/inf z-scores. Here std 0 yields all-zero z-scores
    (no outliers), which is the sensible answer for constant data.
    """
    mean = np.mean(data)
    std = np.std(data)
    if std == 0:
        z_scores = np.zeros_like(data, dtype=float)
    else:
        z_scores = np.abs((data - mean) / std)

    outlier_mask = z_scores > threshold

    return {
        'mask': outlier_mask,
        'z_scores': z_scores,
        'n_outliers': outlier_mask.sum(),
        'outlier_values': data[outlier_mask]
    }


def handle_outliers(data, method='clip', factor=1.5):
    """
    Handle outliers found by the IQR rule, by clipping or removing.

    Clipping is usually preferred because it preserves sample size.
    Any other method returns the data unchanged.
    """
    detection = detect_outliers_iqr(data, factor)
    lower_bound, upper_bound = detection['bounds']

    if method == 'remove':
        return data[~detection['mask']]
    if method == 'clip':
        return np.clip(data, lower_bound, upper_bound)
    return data


# ---- Test ----
data = np.array([10, 12, 14, 15, 16, 18, 20, 100, 200])  # 100, 200 are outliers

# IQR flags both extreme points; quartiles ignore them when setting bounds.
iqr_result = detect_outliers_iqr(data)
print(f"IQR method: {iqr_result['n_outliers']} outliers found")
print(f"Bounds: {iqr_result['bounds']}")
print(f"Outlier values: {iqr_result['outlier_values']}")

# Z-score's mean and std are inflated by the outliers themselves,
# so it can miss them at the default threshold.
z_result = detect_outliers_zscore(data)
print(f"\nZ-score method: {z_result['n_outliers']} outliers found")
print(f"Outlier values: {z_result['outlier_values']}")

clipped = handle_outliers(data, method='clip')
print(f"\nAfter clipping: {clipped}")