Intermediate

Statistical Operations

Six coding challenges covering statistical computations essential for feature engineering and data preprocessing in ML: summary statistics along axes, percentile computation, correlation analysis, and multiple normalization techniques.

Challenge 1: Summary Statistics Along Axes

Problem: Given a feature matrix X of shape (N, D), compute the mean, standard deviation, variance, min, max, and median for each feature (column). Also compute these statistics for each sample (row). Handle the ddof parameter correctly for sample vs. population statistics.

import numpy as np

# ---- CHALLENGE ----
def feature_statistics(X):
    """Compute summary statistics per feature (column). Return dict.

    X: array of shape (N, D). Reduce along axis=0 so each statistic
    has shape (D,) — one value per feature.
    """
    pass

def sample_statistics(X):
    """Compute summary statistics per sample (row). Return dict.

    X: array of shape (N, D). Reduce along axis=1 so each statistic
    has shape (N,) — one value per sample.
    """
    pass

# ---- SOLUTION ----
def feature_statistics(X):
    """Per-feature (column) statistics. axis=0 reduces across samples.

    Args:
        X: array of shape (N, D) — N samples, D features.

    Returns:
        dict mapping statistic name -> array of shape (D,). std/var use
        ddof=1 (Bessel's correction), i.e. *sample* statistics.
    """
    return {
        "mean": X.mean(axis=0),           # (D,)
        "std": X.std(axis=0, ddof=1),     # (D,) sample std
        "var": X.var(axis=0, ddof=1),     # (D,) sample variance
        "min": X.min(axis=0),             # (D,)
        "max": X.max(axis=0),             # (D,)
        "median": np.median(X, axis=0),   # (D,)
        # Peak-to-peak (max - min). Use the module-level np.ptp: the
        # ndarray.ptp METHOD was removed in NumPy 2.0.
        "range": np.ptp(X, axis=0),       # (D,)
    }

def sample_statistics(X):
    """Per-sample (row) statistics; every reduction collapses axis=1 (features)."""
    stats = {}
    stats["mean"] = X.mean(axis=1)                # (N,)
    stats["std"] = X.std(axis=1, ddof=1)          # (N,) sample std per row
    stats["l2_norm"] = np.linalg.norm(X, axis=1)  # (N,) Euclidean length of each row
    stats["l1_norm"] = np.abs(X).sum(axis=1)      # (N,) sum of |values| per row
    return stats

# ---- VERIFICATION ----
# 4 samples x 3 features; each column is a scaled copy of [1, 2, 3, 4].
X = np.array([[1.0, 10.0, 100.0],
              [2.0, 20.0, 200.0],
              [3.0, 30.0, 300.0],
              [4.0, 40.0, 400.0]])

feat_stats = feature_statistics(X)
# Column-wise stats reduce over the N=4 samples, leaving one value per feature.
assert feat_stats["mean"].shape == (3,), "Should be one value per feature"
assert np.allclose(feat_stats["mean"], [2.5, 25.0, 250.0])
# Each column is evenly spaced, so its median equals its mean.
assert np.allclose(feat_stats["median"], [2.5, 25.0, 250.0])

samp_stats = sample_statistics(X)
# Row-wise stats reduce over the D=3 features, leaving one value per sample.
assert samp_stats["mean"].shape == (4,), "Should be one value per sample"
print("Challenge 1 passed!")

Challenge 2: Percentile Computation

Problem: Implement a function that computes arbitrary percentiles for each feature and uses them for robust scaling (scaling based on IQR instead of standard deviation). This is preferred over standard scaling when data has outliers.

import numpy as np

# ---- CHALLENGE ----
def robust_scale(X):
    """Scale features using median and IQR (interquartile range).
    x_scaled = (x - median) / IQR where IQR = Q3 - Q1.

    X: array of shape (N, D). Return (X_scaled, median, iqr).
    """
    pass

def percentile_bins(X, n_bins=10):
    """Create equal-frequency bins for each feature using percentiles.

    X: array of shape (N, D). Return an integer array of the same shape
    with bin indices in [0, n_bins - 1].
    """
    pass

# ---- SOLUTION ----
def robust_scale(X):
    """Robust scaling: subtract the per-feature median, divide by the IQR.

    Returns (X_scaled, median, iqr), each statistic of shape (D,).
    """
    # One percentile call yields Q1, the median, and Q3 in a single pass.
    q1, med, q3 = np.percentile(X, [25, 50, 75], axis=0)   # each (D,)
    spread = q3 - q1                                        # (D,) interquartile range
    # A constant feature has IQR == 0; divide by 1 there so it just gets centered.
    spread = np.where(spread == 0, 1.0, spread)
    # Broadcasting: (N, D) - (D,) / (D,) -> (N, D)
    return (X - med) / spread, med, spread

def percentile_bins(X, n_bins=10):
    """Map each value to its equal-frequency bin index (0 .. n_bins-1)."""
    qs = np.linspace(0, 100, n_bins + 1)       # [0, 10, 20, ..., 100]
    edges = np.percentile(X, qs, axis=0)       # (n_bins+1, D) per-feature edges
    inner = edges[1:-1]                        # drop outer edges -> n_bins-1 cut points
    # np.digitize handles one feature at a time, so bin column by column.
    columns = [np.digitize(col, cuts) for col, cuts in zip(X.T, inner.T)]
    return np.stack(columns, axis=1)

# ---- VERIFICATION ----
np.random.seed(42)
# Data with outliers
X = np.random.randn(1000, 3)
X[0, 0] = 100    # extreme outlier
X[1, 1] = -200   # extreme outlier

X_scaled, median, iqr = robust_scale(X)
# Median of scaled data should be ~0 (outliers don't affect median)
assert np.all(np.abs(np.median(X_scaled, axis=0)) < 0.01), "Median should be ~0"

binned = percentile_bins(X, n_bins=10)
# Each bin should have roughly equal count
for d in range(3):
    counts = np.bincount(binned[:, d], minlength=10)
    assert np.all(counts >= 80), f"Bins should be roughly equal, got {counts}"
print("Challenge 2 passed!")

Challenge 3: Correlation Matrix

Problem: Compute the Pearson correlation matrix for a feature matrix X of shape (N, D) from scratch (without using np.corrcoef). The correlation between features i and j is: corr(i,j) = cov(i,j) / (std_i * std_j).

import numpy as np

# ---- CHALLENGE ----
def correlation_matrix(X):
    """Compute Pearson correlation matrix from scratch. Return (D, D) matrix.

    X: array of shape (N, D). Do not use np.corrcoef.
    """
    pass

def find_correlated_features(X, threshold=0.8):
    """Find pairs of features with |correlation| > threshold.

    Return a list of (i, j, correlation) tuples with i < j.
    """
    pass

# ---- SOLUTION ----
def correlation_matrix(X):
    """Pearson correlation matrix built from the sample covariance.

    corr(i, j) = cov(i, j) / (std_i * std_j), computed without np.corrcoef.
    """
    n_samples = X.shape[0]
    centered = X - X.mean(axis=0)                 # (N, D), zero-mean columns

    # Sample covariance: (D, N) @ (N, D) scaled by 1/(N-1).
    cov = centered.T @ centered / (n_samples - 1)

    # Per-feature sample standard deviations.
    sd = X.std(axis=0, ddof=1)                    # (D,)

    # Dividing by the outer product of stds turns covariances into correlations.
    corr = cov / (sd[:, None] * sd[None, :])

    # Floating-point noise can push values slightly past +/-1; clamp for safety.
    return np.clip(corr, -1.0, 1.0)

def find_correlated_features(X, threshold=0.8):
    """Return (i, j, corr) for every feature pair with |corr| above threshold."""
    corr = correlation_matrix(X)
    n_features = corr.shape[0]
    # Upper triangle only: each unordered pair once, diagonal excluded.
    return [
        (i, j, corr[i, j])
        for i in range(n_features)
        for j in range(i + 1, n_features)
        if abs(corr[i, j]) > threshold
    ]

# ---- VERIFICATION ----
np.random.seed(42)
# Create correlated features by mixing columns with small noise
X = np.random.randn(500, 4)
X[:, 1] = X[:, 0] * 2 + np.random.randn(500) * 0.1  # highly correlated with col 0
X[:, 3] = -X[:, 2] + np.random.randn(500) * 0.1      # negatively correlated with col 2

corr = correlation_matrix(X)
# Compare with NumPy's built-in (corrcoef expects variables in rows, hence X.T)
expected = np.corrcoef(X.T)
assert np.allclose(corr, expected, atol=1e-10), "Should match np.corrcoef"
assert corr[0, 1] > 0.95, f"Features 0,1 should be highly correlated: {corr[0, 1]:.4f}"
assert corr[2, 3] < -0.95, f"Features 2,3 should be negatively correlated: {corr[2, 3]:.4f}"

# Both constructed pairs exceed the 0.8 threshold
pairs = find_correlated_features(X, threshold=0.8)
assert len(pairs) >= 2, f"Should find at least 2 correlated pairs, got {len(pairs)}"
print("Challenge 3 passed!")

Challenge 4: Feature Normalization

Problem: Implement three common normalization techniques used in ML: (a) Min-Max scaling to [0, 1], (b) L2 normalization (unit vector per sample), (c) Max-Abs scaling (scale by max absolute value). Each must handle edge cases like constant features and zero vectors.

import numpy as np

# ---- CHALLENGE ----
def minmax_scale(X):
    """Scale each feature to [0, 1] range.

    Handle constant features (max == min) without dividing by zero.
    """
    pass

def l2_normalize(X):
    """Normalize each row (sample) to unit L2 norm.

    Handle all-zero rows without dividing by zero.
    """
    pass

def maxabs_scale(X):
    """Scale each feature by its maximum absolute value to [-1, 1].

    Handle all-zero features without dividing by zero.
    """
    pass

# ---- SOLUTION ----
def minmax_scale(X):
    """Map each feature linearly onto [0, 1]."""
    lo = X.min(axis=0)                        # (D,)
    span = X.max(axis=0) - lo                 # (D,)
    # A constant feature has zero span; divide by 1 so it maps to all zeros.
    safe_span = np.where(span == 0, 1.0, span)
    return (X - lo) / safe_span

def l2_normalize(X):
    """Rescale every row to unit Euclidean length."""
    lengths = np.linalg.norm(X, axis=1, keepdims=True)  # (N, 1)
    # An all-zero row has norm 0; divide by 1 to leave it as zeros.
    return X / np.where(lengths == 0, 1.0, lengths)

def maxabs_scale(X):
    """Divide each feature by its largest absolute value (result in [-1, 1])."""
    peak = np.abs(X).max(axis=0)                 # (D,)
    # Guard all-zero features against division by zero.
    return X / np.where(peak == 0, 1.0, peak)

# ---- VERIFICATION ----
# Mixed-scale features, including a negative-to-positive column
X = np.array([[1.0, -10.0, 100.0],
              [5.0,   0.0, 200.0],
              [3.0,  10.0, 300.0]])

# Min-Max: all values should be in [0, 1]
X_mm = minmax_scale(X)
assert np.all(X_mm >= 0) and np.all(X_mm <= 1), "Min-max should be in [0,1]"
assert np.allclose(X_mm.min(axis=0), 0), "Min should be 0"
assert np.allclose(X_mm.max(axis=0), 1), "Max should be 1"

# L2: all row norms should be 1
X_l2 = l2_normalize(X)
norms = np.linalg.norm(X_l2, axis=1)
assert np.allclose(norms, 1.0), f"All norms should be 1, got {norms}"

# MaxAbs: all values should be in [-1, 1] (tolerance for float rounding)
X_ma = maxabs_scale(X)
assert np.all(np.abs(X_ma) <= 1.0 + 1e-10), "MaxAbs should be in [-1, 1]"
print("Challenge 4 passed!")

Challenge 5: Z-Score Standardization

Problem: Implement z-score standardization with a train/test split pattern: compute mean and std on the training set, then apply the same transformation to the test set. Also implement the inverse transform to recover original values from standardized data.

import numpy as np

# ---- CHALLENGE ----
class StandardScaler:
    """Z-score standardization with fit/transform/inverse_transform.

    Scikit-learn-style pattern: learn statistics on the training split
    only, then re-use them on any other data.
    """

    def fit(self, X_train):
        """Compute mean and std from training data."""
        pass

    def transform(self, X):
        """Apply standardization using stored mean and std."""
        pass

    def fit_transform(self, X_train):
        """Fit and transform in one step."""
        pass

    def inverse_transform(self, X_scaled):
        """Recover original values from standardized data."""
        pass

# ---- SOLUTION ----
class StandardScaler:
    """Z-score standardization with the fit/transform/inverse_transform pattern.

    Statistics are learned from the training data only (`fit`) and re-used
    on any later data (`transform`) — this prevents train/test leakage.
    """

    def __init__(self):
        self.mean_ = None   # (D,) per-feature mean, set by fit()
        self.std_ = None    # (D,) per-feature std (zeros replaced by 1), set by fit()

    def fit(self, X_train):
        """Learn per-feature mean and std from the training data.

        Uses ddof=0 (population std) so fit_transform yields exactly unit
        variance on the training set. Returns self for chaining.
        """
        self.mean_ = X_train.mean(axis=0)
        self.std_ = X_train.std(axis=0, ddof=0)  # population std for consistency
        # Constant features have std 0; divide by 1 so they map to 0, not nan.
        self.std_ = np.where(self.std_ == 0, 1.0, self.std_)
        return self

    def _check_fitted(self):
        # Fail loudly with a clear message instead of a cryptic TypeError
        # from arithmetic on None when used before fit().
        if self.mean_ is None or self.std_ is None:
            raise RuntimeError("StandardScaler is not fitted; call fit() first")

    def transform(self, X):
        """Standardize X using the stored training statistics."""
        self._check_fitted()
        return (X - self.mean_) / self.std_

    def fit_transform(self, X_train):
        """Fit on X_train, then return the standardized X_train."""
        self.fit(X_train)
        return self.transform(X_train)

    def inverse_transform(self, X_scaled):
        """Undo transform: recover original units from standardized data."""
        self._check_fitted()
        return X_scaled * self.std_ + self.mean_

# ---- VERIFICATION ----
np.random.seed(42)
X_train = np.random.randn(800, 5) * np.array([1, 10, 100, 1000, 10000])
X_test = np.random.randn(200, 5) * np.array([1, 10, 100, 1000, 10000])

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Training data should have mean ~0, std ~1
assert np.allclose(X_train_scaled.mean(axis=0), 0, atol=1e-10), "Train mean should be 0"
assert np.allclose(X_train_scaled.std(axis=0), 1, atol=1e-10), "Train std should be 1"

# Transform test data using train statistics
X_test_scaled = scaler.transform(X_test)
assert X_test_scaled.shape == X_test.shape, "Shape should be preserved"

# Inverse transform should recover original
X_recovered = scaler.inverse_transform(X_train_scaled)
assert np.allclose(X_recovered, X_train, atol=1e-10), "Inverse should recover original"
print("Challenge 5 passed!")

Challenge 6: Covariance Matrix

Problem: Compute the covariance matrix from scratch using vectorized operations. Then use it to generate samples from a multivariate Gaussian distribution. Also implement the Mahalanobis distance, which uses the inverse covariance to compute distances that account for feature correlations.

import numpy as np

# ---- CHALLENGE ----
def covariance_matrix(X):
    """Compute sample covariance matrix from scratch. Return (D, D).

    X: array of shape (N, D). Use vectorized operations, not np.cov.
    """
    pass

def mahalanobis_distance(x, mean, cov_inv):
    """Compute Mahalanobis distance of point x from distribution.

    x: single point of shape (D,) or batch of shape (N, D).
    cov_inv: precomputed inverse covariance, shape (D, D).
    """
    pass

def sample_multivariate_gaussian(mean, cov, n_samples, seed=42):
    """Generate samples from multivariate Gaussian using Cholesky decomposition.

    Return an array of shape (n_samples, D). cov must be positive definite.
    """
    pass

# ---- SOLUTION ----
def covariance_matrix(X):
    """Sample covariance matrix: C = Xc^T Xc / (N - 1), shape (D, D)."""
    n_samples = len(X)
    deviations = X - X.mean(axis=0)   # center every feature at zero
    return deviations.T @ deviations / (n_samples - 1)

def mahalanobis_distance(x, mean, cov_inv):
    """Distance that accounts for feature correlations: sqrt(d^T S^-1 d)."""
    diff = x - mean
    if diff.ndim > 1:
        # Batch of points, shape (N, D): one quadratic form per row,
        # via (N, D) @ (D, D) -> (N, D), element-wise product, row sum.
        return np.sqrt(((diff @ cov_inv) * diff).sum(axis=1))
    # Single point, shape (D,): plain quadratic form -> scalar.
    return np.sqrt(diff @ cov_inv @ diff)

def sample_multivariate_gaussian(mean, cov, n_samples, seed=42):
    """Draw n_samples from N(mean, cov) via the Cholesky factor of cov."""
    rng = np.random.default_rng(seed)
    # Lower-triangular factor: cov = chol @ chol.T
    chol = np.linalg.cholesky(cov)
    # Start from iid standard normals, one row per sample.
    white = rng.standard_normal((n_samples, len(mean)))
    # Coloring transform, vectorized over rows: z -> mean + chol @ z.
    return white @ chol.T + mean   # (N, D)

# ---- VERIFICATION ----
np.random.seed(42)
X = np.random.randn(1000, 3)
X[:, 1] = X[:, 0] * 2 + np.random.randn(1000) * 0.1

cov = covariance_matrix(X)
expected = np.cov(X.T)
assert np.allclose(cov, expected, atol=1e-10), "Should match np.cov"
assert cov.shape == (3, 3), f"Expected (3, 3), got {cov.shape}"

# Mahalanobis distance
cov_inv = np.linalg.inv(cov)
mean = X.mean(axis=0)
# Distance of mean from itself should be 0
d = mahalanobis_distance(mean, mean, cov_inv)
assert abs(d) < 1e-10, f"Distance of mean from itself should be 0, got {d}"

# Batch Mahalanobis
distances = mahalanobis_distance(X[:10], mean, cov_inv)
assert distances.shape == (10,), f"Expected (10,), got {distances.shape}"

# Sampling
target_mean = np.array([1.0, 2.0, 3.0])
target_cov = np.array([[1.0, 0.5, 0.0],
                        [0.5, 1.0, 0.3],
                        [0.0, 0.3, 1.0]])
samples = sample_multivariate_gaussian(target_mean, target_cov, 10000)
assert np.allclose(samples.mean(axis=0), target_mean, atol=0.1), "Sample mean should match"
assert np.allclose(covariance_matrix(samples), target_cov, atol=0.1), "Sample cov should match"
print("Challenge 6 passed!")

Key Takeaways

💡
  • axis=0 computes per-feature statistics (across samples), axis=1 per-sample (across features)
  • Use ddof=1 for sample statistics (Bessel's correction), ddof=0 for population statistics
  • Robust scaling (median/IQR) is preferred over z-score when data has outliers
  • Always fit statistics on training data only, then apply to test data — this prevents data leakage
  • Correlation = normalized covariance. Build correlation from covariance by dividing by outer product of stds
  • Mahalanobis distance accounts for feature correlations — a standard distance metric for multivariate anomaly detection