Why ML Monitoring is Different (Beginner level)
Traditional software monitoring tracks whether your code is running. ML monitoring must also track whether your model is correct. A model can return 200 OK on every request while silently producing garbage predictions. This lesson explains why ML systems need a fundamentally different monitoring approach and introduces the 4 pillars of ML observability.
Traditional Monitoring vs ML Monitoring
| Dimension | Traditional Software | ML Systems |
|---|---|---|
| Failure Mode | Crashes, errors, timeouts (loud) | Silent degradation (model returns wrong answers with 200 OK) |
| Correctness | Unit tests catch bugs before deploy | Correctness depends on data distribution at inference time |
| Root Cause | Code change or infra failure | Data shift, feature pipeline bug, concept drift, upstream change |
| Validation | CI/CD tests | Ground truth may arrive hours/days/weeks later |
| Rollback | Revert the commit | Revert model + retrain on correct data + validate |
The 4 Pillars of ML Observability
Every production ML system needs monitoring across four dimensions. Missing any one pillar creates blind spots that will eventually cause production incidents.
# The 4 Pillars of ML Monitoring - Reference Architecture
from dataclasses import dataclass, field
from typing import Dict, List, Optional
from enum import Enum
import time
class MonitoringPillar(Enum):
    """The four monitoring dimensions ("pillars") every production ML system needs."""
    DATA = "data"  # Pillar 1: input/feature quality and drift
    MODEL = "model"  # Pillar 2: prediction quality and drift
    INFRASTRUCTURE = "infrastructure"  # Pillar 3: latency, throughput, resources, errors
    BUSINESS = "business"  # Pillar 4: downstream business impact and cost
@dataclass
class MLMonitoringConfig:
    """Production monitoring configuration for an ML system.

    Every deployed model should have one of these. Each pillar attribute
    maps a check name to its parameter dict; the defaults below are
    reference starting points and can be overridden per model.
    """
    model_name: str  # e.g. "fraud-detector"
    model_version: str  # e.g. "v2.3.1"
    # Pillar 1: Data Monitoring
    # default_factory gives each instance its own dict — a shared mutable
    # default would leak edits across config instances.
    data_checks: Dict[str, dict] = field(default_factory=lambda: {
        "feature_drift": {
            "method": "psi",  # Population Stability Index
            "threshold": 0.2,  # PSI > 0.2 = significant drift
            "check_interval_minutes": 60,
            "features_to_monitor": "all",
            "reference_dataset": "training_data_v1"
        },
        "schema_validation": {
            "check_nulls": True,
            "check_types": True,
            "check_ranges": True,
            "alert_on_new_categories": True
        },
        "data_quality": {
            "max_null_rate": 0.05,  # Alert if >5% nulls
            "max_duplicate_rate": 0.01,
            "check_cardinality": True
        }
    })
    # Pillar 2: Model Performance
    model_checks: Dict[str, dict] = field(default_factory=lambda: {
        "prediction_drift": {
            "method": "ks_test",  # Kolmogorov-Smirnov
            "threshold": 0.05,  # p-value threshold
            "window_size": 1000
        },
        "accuracy_tracking": {
            "primary_metric": "auc_roc",
            "secondary_metrics": ["precision", "recall", "f1"],
            "degradation_threshold": 0.05,  # 5% drop triggers alert
            "ground_truth_delay_hours": 24
        },
        "prediction_distribution": {
            "monitor_class_balance": True,
            "expected_positive_rate": 0.15,
            "tolerance": 0.05
        }
    })
    # Pillar 3: Infrastructure
    infra_checks: Dict[str, dict] = field(default_factory=lambda: {
        "latency": {
            "p50_ms": 100,
            "p95_ms": 500,
            "p99_ms": 1000,
            "timeout_ms": 5000
        },
        "throughput": {
            "min_qps": 10,
            "max_qps": 10000,
            "alert_on_sudden_drop": True
        },
        "resources": {
            "max_gpu_memory_pct": 90,
            "max_cpu_pct": 80,
            "max_disk_pct": 85
        },
        "errors": {
            "max_error_rate": 0.01,  # 1%
            "track_error_types": True
        }
    })
    # Pillar 4: Business Metrics
    business_checks: Dict[str, dict] = field(default_factory=lambda: {
        "conversion": {
            "metric": "click_through_rate",
            "baseline": 0.12,
            "min_acceptable": 0.08,
            "tracking_window_hours": 24
        },
        "cost": {
            "max_daily_inference_cost": 500.00,
            "cost_per_prediction_alert": 0.10,
            "track_by_model_version": True
        },
        "user_feedback": {
            "track_thumbs_up_down": True,
            "min_satisfaction_rate": 0.80,
            "sample_rate": 0.1  # presumably fraction of traffic sampled for feedback — confirm
        }
    })
# Usage: create config for a deployed model
config = MLMonitoringConfig(
    model_name="fraud-detector",
    model_version="v2.3.1",
)
print(f"Monitoring {config.model_name} across 4 pillars:")
print("\n".join(f" - {pillar.value}" for pillar in MonitoringPillar))
Silent Failures: Why ML Systems Break Differently
The most dangerous ML failures are the ones nobody notices. Here are real patterns from production incidents.
# Silent failure detector - catches issues traditional monitoring misses
import numpy as np
from datetime import datetime, timedelta
class SilentFailureDetector:
    """Detects common ML silent failures that return 200 OK
    but produce incorrect predictions.

    Every check_* method returns a dict whose "status" key is one of
    "ok", "ALERT", or "no_data" (nothing to evaluate), so results can be
    routed to alerting uniformly.
    """

    def __init__(self, model_name: str):
        self.model_name = model_name
        self.prediction_buffer = []  # reserved for windowed prediction checks
        self.feature_stats = {}  # reserved for rolling per-feature statistics

    def check_prediction_collapse(self, predictions: list,
                                  threshold: float = 0.95) -> dict:
        """Detect when model always predicts the same class.

        Real incident: fraud model started predicting 'not fraud'
        for 99.8% of transactions after a feature pipeline change.

        Args:
            predictions: class labels from a recent prediction window.
            threshold: fraction of the window a single class must exceed
                to trigger an alert.
        """
        predictions = np.array(predictions)
        if len(predictions) == 0:
            return {"status": "no_data"}
        # For classification: check if one class dominates
        unique, counts = np.unique(predictions, return_counts=True)
        # Cast to a plain float so the returned ratio is JSON-serializable
        # (np.float64 is not accepted by json.dumps without a default=).
        max_ratio = float(counts.max() / counts.sum())
        if max_ratio > threshold:
            dominant_class = unique[counts.argmax()]
            return {
                "status": "ALERT",
                "issue": "prediction_collapse",
                "detail": f"Class '{dominant_class}' is {max_ratio:.1%} of predictions",
                "severity": "critical",
                "action": "Check feature pipeline and model inputs immediately"
            }
        return {"status": "ok", "max_class_ratio": max_ratio}

    def check_feature_staleness(self, feature_timestamps: dict,
                                max_age_minutes: int = 30) -> dict:
        """Detect stale features being fed to the model.

        Real incident: real-time feature store cache expired,
        model received 3-day-old features for 6 hours.

        Args:
            feature_timestamps: feature name -> naive datetime of the last
                update; must be UTC-naive to match utcnow() below.
            max_age_minutes: alert when a feature is older than this.
        """
        stale_features = {}
        # NOTE(review): datetime.utcnow() is deprecated in Python 3.12+;
        # moving to datetime.now(timezone.utc) would require callers to pass
        # timezone-aware timestamps, so it is deliberately kept as-is.
        now = datetime.utcnow()
        for feature_name, last_updated in feature_timestamps.items():
            age = (now - last_updated).total_seconds() / 60
            if age > max_age_minutes:
                stale_features[feature_name] = {
                    "age_minutes": round(age, 1),
                    "last_updated": last_updated.isoformat()
                }
        if stale_features:
            return {
                "status": "ALERT",
                "issue": "stale_features",
                "stale_count": len(stale_features),
                "features": stale_features,
                "severity": "high",
                "action": "Check feature pipeline and data source freshness"
            }
        return {"status": "ok"}

    def check_null_spike(self, feature_null_rates: dict,
                         baseline_null_rates: dict,
                         spike_factor: float = 3.0) -> dict:
        """Detect sudden increase in null/missing values.

        Real incident: upstream team changed API field name,
        feature extractor returned nulls, model used default values.

        Args:
            feature_null_rates: feature name -> current null rate (0..1).
            baseline_null_rates: feature name -> expected null rate; a
                feature missing here falls back to a 1% baseline.
            spike_factor: alert when current rate exceeds baseline by this
                multiple.
        """
        spikes = {}
        for feature, current_rate in feature_null_rates.items():
            # Unknown features get a small nonzero baseline so brand-new
            # features are still checked.
            baseline = baseline_null_rates.get(feature, 0.01)
            # NOTE(review): a feature with an explicit baseline of 0 is never
            # flagged here, no matter how high its current null rate climbs.
            if baseline > 0 and current_rate / baseline > spike_factor:
                spikes[feature] = {
                    "current_null_rate": current_rate,
                    "baseline_null_rate": baseline,
                    "increase_factor": round(current_rate / baseline, 1)
                }
        if spikes:
            return {
                "status": "ALERT",
                "issue": "null_spike",
                "affected_features": len(spikes),
                "details": spikes,
                "severity": "high",
                "action": "Check upstream data sources and feature extraction"
            }
        return {"status": "ok"}

    def check_score_distribution_shift(self, current_scores: list,
                                       baseline_mean: float,
                                       baseline_std: float,
                                       z_threshold: float = 3.0) -> dict:
        """Detect when prediction score distribution shifts significantly.

        Real incident: model score mean shifted from 0.45 to 0.82 after
        a training data leak went unnoticed for 2 weeks.

        Args:
            current_scores: raw model scores from a recent window.
            baseline_mean: score mean captured at deployment time.
            baseline_std: score standard deviation captured at deployment.
            z_threshold: alert when the current mean deviates from the
                baseline mean by more than this many baseline std-devs.
        """
        # Fix: an empty window previously produced a NaN mean (and a NaN
        # z-score that silently compared as "ok") instead of a clear signal.
        if len(current_scores) == 0:
            return {"status": "no_data"}
        current_mean = float(np.mean(current_scores))
        if baseline_std <= 0:
            # Fix: guard the division — with a degenerate (zero/negative)
            # baseline std, any deviation from the mean is an infinite shift.
            z_score = 0.0 if current_mean == baseline_mean else float("inf")
        else:
            z_score = abs(current_mean - baseline_mean) / baseline_std
        if z_score > z_threshold:
            return {
                "status": "ALERT",
                "issue": "score_distribution_shift",
                "current_mean": round(current_mean, 4),
                "baseline_mean": round(baseline_mean, 4),
                "z_score": round(z_score, 2),
                "severity": "critical",
                "action": "Investigate data pipeline and model inputs"
            }
        return {"status": "ok", "z_score": round(z_score, 2)}
Real Production Incidents
Incident: Zillow's iBuying Algorithm (2021)
Zillow's home price prediction model systematically overvalued properties. The company bought thousands of homes at inflated prices before discovering the model was consistently wrong. Result: $569M write-down, 2,000 layoffs, and exit from the iBuying business. Root cause: insufficient monitoring of prediction accuracy against actual sale prices (ground truth delay).
Incident: Amazon's Recruiting Tool (2018)
Amazon's ML-powered resume screener learned to penalize resumes containing the word "women's" (e.g., "women's chess club"). The model was trained on historical hiring data that reflected existing biases. The issue went undetected because the team monitored accuracy on historical data but not fairness metrics on live predictions.
Incident: Healthcare Risk Scoring (2019)
A major healthcare algorithm used healthcare spending as a proxy for health needs, systematically assigning lower risk scores to Black patients. The algorithm affected 200 million patients annually. Root cause: the business metric (cost prediction) was monitored, but the fairness metric (equal treatment across demographics) was not.
Building Your Monitoring Strategy
# Step-by-step monitoring setup for a new model deployment
from dataclasses import dataclass
from typing import List
@dataclass
class MonitoringPlan:
    """Create this before deploying any model to production.

    Captures what to monitor, how ground truth is obtained, and where
    alerts go; `generate_checklist` renders it as a pre-deployment
    checklist string.
    """
    model_name: str
    model_type: str  # classification, regression, ranking, generative
    team: str
    oncall_rotation: str
    # What to monitor (prioritized). Fix: annotated as Optional — the
    # fields were declared List[str] but default to None (filled in
    # __post_init__ so each instance gets its own list).
    critical_metrics: Optional[List[str]] = None
    warning_metrics: Optional[List[str]] = None
    # How to validate
    ground_truth_source: str = ""
    ground_truth_delay: str = ""
    proxy_metrics: Optional[List[str]] = None
    # When to alert
    alert_channels: Optional[List[str]] = None
    runbook_url: str = ""

    def __post_init__(self):
        # Fill list defaults here rather than as field defaults: a shared
        # mutable default list would be mutated across instances, and this
        # also lets callers pass None explicitly to mean "use defaults".
        if self.critical_metrics is None:
            self.critical_metrics = self._default_critical()
        if self.warning_metrics is None:
            self.warning_metrics = self._default_warnings()
        if self.proxy_metrics is None:
            self.proxy_metrics = []
        if self.alert_channels is None:
            self.alert_channels = ["slack", "pagerduty"]

    def _default_critical(self) -> List[str]:
        """Every model needs these monitored (page-worthy alerts)."""
        return [
            "error_rate",
            "p99_latency",
            "prediction_volume",
            "feature_null_rate",
            "prediction_distribution"
        ]

    def _default_warnings(self) -> List[str]:
        """Important but not wake-up-at-3am level."""
        return [
            "feature_drift_psi",
            "prediction_drift",
            "gpu_memory_usage",
            "inference_cost_daily"
        ]

    def generate_checklist(self) -> str:
        """Generate pre-deployment monitoring checklist.

        Returns:
            A newline-joined checklist listing critical and warning
            metrics plus ground-truth, runbook, and rollout items.
        """
        checks = [
            f"Model: {self.model_name} ({self.model_type})",
            f"Team: {self.team} | On-call: {self.oncall_rotation}",
            "",
            "PRE-DEPLOYMENT MONITORING CHECKLIST:",
            "[ ] Baseline metrics recorded from validation set",
            "[ ] Feature drift reference dataset stored",
            "[ ] Prediction distribution baseline captured",
            "[ ] Alerting rules configured:",
        ]
        for metric in self.critical_metrics:
            checks.append(f" [CRITICAL] {metric}")
        for metric in self.warning_metrics:
            checks.append(f" [WARNING] {metric}")
        checks.extend([
            f"[ ] Ground truth pipeline: {self.ground_truth_source}",
            f" (delay: {self.ground_truth_delay})",
            f"[ ] Runbook created: {self.runbook_url}",
            f"[ ] Alert channels: {', '.join(self.alert_channels)}",
            "[ ] Dashboard created with all 4 pillars",
            "[ ] Shadow mode testing complete",
            "[ ] Rollback procedure documented and tested"
        ])
        return "\n".join(checks)
# Example: monitoring plan for a fraud detection model
fraud_plan_args = dict(
    model_name="fraud-detector-v2",
    model_type="classification",
    team="ml-platform",
    oncall_rotation="ml-oncall-primary",
    ground_truth_source="chargeback_reports",
    ground_truth_delay="30-90 days",
    proxy_metrics=["manual_review_rate", "block_rate"],
    runbook_url="https://wiki.internal/runbooks/fraud-model",
)
plan = MonitoringPlan(**fraud_plan_args)
print(plan.generate_checklist())
Key Takeaway: The fundamental difference between traditional monitoring and ML monitoring is that ML systems can fail silently. A model can return predictions with low latency and zero errors while being completely wrong. Your monitoring must cover all 4 pillars: data quality, model performance, infrastructure health, and business impact.
Lilly Tech Systems