Intermediate

Step 1: Data Exploration & Feature Engineering

Perform thorough exploratory analysis on the credit card fraud dataset, understand the extreme class imbalance, create powerful engineered features, and apply SMOTE to balance the training data.

Load and Inspect the Data

Start by loading the dataset and understanding its structure, distributions, and potential issues:

# src/features.py
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')


def load_data(path: str = "data/creditcard.csv") -> pd.DataFrame:
    """Load the fraud dataset and print a quick structural summary.

    Reports shape, class balance, imbalance ratio, missing values,
    and Amount statistics so data issues are visible before modeling.
    """
    df = pd.read_csv(path)

    is_fraud = df['Class'] == 1
    n_fraud = is_fraud.sum()
    n_legit = (~is_fraud).sum()
    total = len(df)

    print(f"Dataset shape: {df.shape}")
    print("\nClass distribution:")
    print(f"  Legitimate: {n_legit:,} ({n_legit / total * 100:.3f}%)")
    print(f"  Fraudulent: {n_fraud:,} ({n_fraud / total * 100:.3f}%)")
    print(f"  Imbalance ratio: 1:{int(n_legit / n_fraud)}")
    print(f"\nMissing values: {df.isnull().sum().sum()}")
    print("\nAmount statistics:")
    print(df['Amount'].describe())

    return df


# Run EDA (expects data/creditcard.csv to exist at the default path)
df = load_data()
> 💡 **Key finding:** The imbalance ratio is approximately 1:578 — for every fraudulent transaction there are roughly 578 legitimate ones. A naive model that always predicts "legitimate" would achieve 99.83% accuracy but catch zero fraud. This is why accuracy is a misleading metric for fraud detection.

Exploratory Data Analysis

Let us examine the distribution differences between fraudulent and legitimate transactions across key features:

import matplotlib.pyplot as plt
import seaborn as sns


def plot_class_distribution(df: pd.DataFrame):
    """Visualize the extreme class imbalance.

    Produces a three-panel figure (class counts, amount histograms,
    time histograms) and saves it to notebooks/eda_distributions.png.
    """
    fig, (ax_counts, ax_amount, ax_time) = plt.subplots(1, 3, figsize=(18, 5))

    # Panel 1: raw class counts as a bar chart
    df['Class'].value_counts().plot(kind='bar', ax=ax_counts,
                                    color=['#2ecc71', '#e74c3c'])
    ax_counts.set_title('Class Distribution')
    ax_counts.set_xticklabels(['Legitimate', 'Fraud'], rotation=0)
    ax_counts.set_ylabel('Count')

    # Shared per-class styling: (class value, colour, legend label)
    class_styles = [(0, '#2ecc71', 'Legit'), (1, '#e74c3c', 'Fraud')]

    # Panel 2: transaction amount distribution per class
    for cls, colour, name in class_styles:
        ax_amount.hist(df.loc[df['Class'] == cls, 'Amount'], bins=50,
                       alpha=0.7, color=colour, label=name, density=True)
    ax_amount.set_title('Transaction Amount Distribution')
    ax_amount.set_xlabel('Amount')
    ax_amount.legend()
    ax_amount.set_xlim(0, 500)  # Focus on common range

    # Panel 3: transaction time distribution per class
    for cls, colour, name in class_styles:
        ax_time.hist(df.loc[df['Class'] == cls, 'Time'], bins=48,
                     alpha=0.7, color=colour, label=name, density=True)
    ax_time.set_title('Transaction Time Distribution')
    ax_time.set_xlabel('Time (seconds)')
    ax_time.legend()

    plt.tight_layout()
    plt.savefig('notebooks/eda_distributions.png', dpi=150)
    plt.show()


def analyze_fraud_patterns(df: pd.DataFrame):
    """Deep dive into fraud vs legitimate transaction differences.

    Prints amount summary statistics per class, then ranks the 28 PCA
    components by absolute mean difference between classes.

    Returns:
        List of the 10 most discriminative PCA feature names.
    """
    fraud = df[df['Class'] == 1]
    legit = df[df['Class'] == 0]

    print("=== Amount Analysis ===")
    for name, subset in (("Fraud", fraud), ("Legit", legit)):
        amounts = subset['Amount']
        print(f"{name}  - Mean: ${amounts.mean():.2f}, "
              f"Median: ${amounts.median():.2f}, "
              f"Max: ${amounts.max():.2f}")

    # Rank PCA features by how far apart the class means sit
    print("\n=== Most Discriminative Features (by mean difference) ===")
    pca_cols = [f'V{i}' for i in range(1, 29)]
    diffs = {col: abs(fraud[col].mean() - legit[col].mean())
             for col in pca_cols}

    top_features = sorted(diffs.items(), key=lambda kv: kv[1], reverse=True)[:10]
    for feat, gap in top_features:
        print(f"  {feat}: mean diff = {gap:.4f}")

    return [name for name, _ in top_features]


def plot_correlation_matrix(df: pd.DataFrame):
    """Plot correlation between top features and fraud label.

    Renders a heatmap for a hand-picked subset of discriminative columns
    and saves it to notebooks/eda_correlation.png.
    """
    # Hand-picked columns that separate the classes, plus the label itself
    feature_subset = ['V1', 'V2', 'V3', 'V4', 'V7', 'V10',
                      'V11', 'V12', 'V14', 'V17', 'Amount', 'Class']
    corr_matrix = df[feature_subset].corr()

    plt.figure(figsize=(12, 10))
    sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='RdBu_r',
                center=0, square=True, linewidths=0.5)
    plt.title('Feature Correlation Matrix')
    plt.tight_layout()
    plt.savefig('notebooks/eda_correlation.png', dpi=150)
    plt.show()


# Run the full EDA suite on the loaded frame
plot_class_distribution(df)
top_features = analyze_fraud_patterns(df)
plot_correlation_matrix(df)

Feature Engineering

The raw PCA features are useful, but we can create additional features that capture transaction patterns. These engineered features are critical for real-time fraud detection because they encode behavioral signals:

def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    """Create engineered features for fraud detection.

    Features capture:
    - Amount patterns (log transform, robust scaling, percentile rank, flags)
    - Time-based patterns (hour of day, cyclical encoding, night flag)
    - Interaction features between top PCA components
    - Anomaly-style z-scores and outlier flags
    (Velocity features are added separately by ``add_velocity_features``.)

    NOTE(review): the scaler, quantile, and rank statistics below are fit
    on the FULL dataset before the train/test split, which leaks test-set
    statistics into the features. Acceptable for a tutorial; in production,
    fit these on the training split only.

    Returns:
        A copy of ``df`` with the new feature columns appended.
    """
    df = df.copy()
    # BUG FIX: the summary print used a hardcoded "31" (the raw dataset's
    # column count), which silently breaks if the input schema changes.
    # Capture the actual input width instead.
    n_original = df.shape[1]

    # --- Amount Features ---
    # Log transform (reduces skewness, important for tree models)
    df['Amount_log'] = np.log1p(df['Amount'])

    # Robust scaling (handles outliers better than StandardScaler)
    scaler = RobustScaler()
    df['Amount_scaled'] = scaler.fit_transform(df[['Amount']])

    # Amount percentile rank
    df['Amount_percentile'] = df['Amount'].rank(pct=True)

    # Is the amount unusually high? (above 95th percentile)
    amount_95 = df['Amount'].quantile(0.95)
    df['Amount_is_high'] = (df['Amount'] > amount_95).astype(int)

    # Is the amount a round number? (fraud often uses round amounts)
    df['Amount_is_round'] = (df['Amount'] % 1 == 0).astype(int)

    # --- Time Features ---
    # Convert seconds to hours (approximate hour of day)
    df['Hour'] = (df['Time'] / 3600) % 24

    # Cyclical encoding of hour (so 23:00 is close to 00:00)
    df['Hour_sin'] = np.sin(2 * np.pi * df['Hour'] / 24)
    df['Hour_cos'] = np.cos(2 * np.pi * df['Hour'] / 24)

    # Is it nighttime? (fraud peaks during off-hours)
    df['Is_night'] = ((df['Hour'] >= 22) | (df['Hour'] <= 5)).astype(int)

    # --- PCA Interaction Features ---
    # V1-V2 interaction (top discriminative features); epsilon avoids
    # division by zero in the ratio
    df['V1_V2_product'] = df['V1'] * df['V2']
    df['V1_V2_ratio'] = df['V1'] / (df['V2'] + 1e-8)

    # V3-V4 interaction
    df['V3_V4_product'] = df['V3'] * df['V4']

    # Magnitude of top PCA components
    df['PCA_magnitude'] = np.sqrt(
        df['V1']**2 + df['V2']**2 + df['V3']**2 + df['V4']**2
    )

    # --- Anomaly Score Features ---
    # Distance from mean for each PCA component, plus a |z| > 3 flag
    for col in ['V1', 'V3', 'V10', 'V12', 'V14']:
        mean_val = df[col].mean()
        std_val = df[col].std()
        df[f'{col}_zscore'] = (df[col] - mean_val) / std_val
        df[f'{col}_is_outlier'] = (abs(df[f'{col}_zscore']) > 3).astype(int)

    # Count of outlier features per transaction
    outlier_cols = [c for c in df.columns if c.endswith('_is_outlier')]
    df['Outlier_count'] = df[outlier_cols].sum(axis=1)

    print(f"Engineered features: {len(df.columns) - n_original} new columns")
    print(f"Total features: {len(df.columns)}")

    return df


# Apply feature engineering to the EDA frame and report the final width
df_engineered = engineer_features(df)
print(f"\nFinal dataset shape: {df_engineered.shape}")
Real-time feature parity: Every feature you engineer here must be computable at inference time. If you use features like "average amount over the last 24 hours," you need a real-time feature store to serve those values during prediction. We will address this when building the FastAPI endpoint in Step 4.

Simulated Velocity Features

In production fraud systems, velocity features are among the most powerful signals. They measure how fast a card is being used. Since our dataset does not have card IDs, we simulate this concept using time-windowed aggregations:

def add_velocity_features(df: pd.DataFrame, window_seconds: int = 3600) -> pd.DataFrame:
    """Simulate velocity-style features using time windows.

    In production, these would be computed from a feature store
    keyed by card_id with sliding window aggregations.

    Args:
        df: Transaction frame with at least 'Time' and 'Amount' columns.
        window_seconds: Nominal window size. NOTE(review): currently
            UNUSED — the rolling windows below are row-based (last 100
            transactions), not time-based. Kept for interface stability;
            wire it up once a card_id/time index is available.

    Returns:
        A copy of ``df`` sorted by 'Time' with 7 new velocity columns.
    """
    df = df.sort_values('Time').reset_index(drop=True)
    n_before = df.shape[1]

    # Rolling statistics over the last 100 transactions
    # (index-based rolling since we do not have card_id)
    df['Amount_rolling_mean'] = (
        df['Amount'].rolling(window=100, min_periods=1).mean()
    )
    df['Amount_rolling_std'] = (
        df['Amount'].rolling(window=100, min_periods=1).std().fillna(0)
    )

    # Deviation from rolling average (epsilon avoids divide-by-zero)
    df['Amount_deviation'] = (
        (df['Amount'] - df['Amount_rolling_mean']) /
        (df['Amount_rolling_std'] + 1e-8)
    )

    # Transaction frequency (count in recent window)
    df['Tx_count_window'] = (
        df['Time'].rolling(window=100, min_periods=1).count()
    )

    # Time since last transaction (first row has no predecessor -> 0)
    df['Time_since_last'] = df['Time'].diff().fillna(0)
    df['Time_since_last_log'] = np.log1p(df['Time_since_last'])

    # Is this a rapid succession of transactions? (< 60s gap)
    df['Is_rapid'] = (df['Time_since_last'] < 60).astype(int)

    # BUG FIX: the original printed a hardcoded "6", but 7 columns are
    # added above. Count the new columns instead of hardcoding.
    print(f"Added {df.shape[1] - n_before} velocity features")
    return df


df_final = add_velocity_features(df_engineered)

Handle Class Imbalance with SMOTE

With a 578:1 imbalance ratio, the model will be biased toward predicting everything as legitimate. SMOTE (Synthetic Minority Over-sampling Technique) creates synthetic fraud examples by interpolating between existing fraud samples in feature space:

def prepare_training_data(
    df: pd.DataFrame,
    test_size: float = 0.2,
    apply_smote: bool = True,
    random_state: int = 42
) -> tuple:
    """Split data and optionally apply SMOTE to training set only.

    IMPORTANT: SMOTE is applied ONLY to the training set.
    The test set must reflect real-world class distribution.

    Args:
        df: Frame containing feature columns plus 'Class' and 'Time'.
        test_size: Fraction of rows held out for the test set.
        apply_smote: Whether to oversample fraud in the training split.
        random_state: Seed for both the split and SMOTE.

    Returns:
        (X_train, X_test, y_train, y_test, feature_cols)
    """
    # Feature columns: everything except the target and raw Time.
    # BUG FIX: the original comment claimed raw Amount was excluded,
    # contradicting the code (and the feature summary) — Amount IS kept.
    exclude_cols = ['Class', 'Time']
    feature_cols = [c for c in df.columns if c not in exclude_cols]

    X = df[feature_cols].values
    y = df['Class'].values

    # Stratified split preserves class ratio in both sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        stratify=y,
        random_state=random_state
    )

    print(f"Before SMOTE:")
    print(f"  Train: {X_train.shape[0]:,} samples "
          f"(fraud: {y_train.sum():,}, legit: {(y_train == 0).sum():,})")
    print(f"  Test:  {X_test.shape[0]:,} samples "
          f"(fraud: {y_test.sum():,}, legit: {(y_test == 0).sum():,})")

    if apply_smote:
        # ROBUSTNESS FIX: SMOTE needs at least k_neighbors + 1 minority
        # samples; cap k_neighbors so small datasets do not crash.
        # (Unchanged behavior for the real dataset, where fraud count >> 6.)
        n_minority = int(y_train.sum())
        smote = SMOTE(
            sampling_strategy=0.5,  # Fraud = 50% of legitimate count
            k_neighbors=min(5, max(1, n_minority - 1)),
            random_state=random_state
        )
        # NOTE(review): SMOTE interpolates ALL features, including the
        # engineered binary flags (Is_night, *_is_outlier, ...), so
        # synthetic rows can carry fractional flag values. Consider
        # SMOTE-NC if exact binary semantics matter downstream.
        X_train, y_train = smote.fit_resample(X_train, y_train)

        print(f"\nAfter SMOTE:")
        print(f"  Train: {X_train.shape[0]:,} samples "
              f"(fraud: {y_train.sum():,}, legit: {(y_train == 0).sum():,})")
        print(f"  Ratio: 1:{int((y_train == 0).sum() / y_train.sum())}")

    return X_train, X_test, y_train, y_test, feature_cols


X_train, X_test, y_train, y_test, feature_cols = prepare_training_data(df_final)
> 💡 **Why `sampling_strategy=0.5`?** Setting SMOTE to generate fraud samples equal to 50% of the legitimate count (rather than 100%) avoids over-correcting. A 2:1 legitimate-to-fraud ratio gives the model enough fraud examples to learn patterns while still reflecting that fraud is inherently rare. Full 1:1 balancing can lead to higher false positive rates.

Save Preprocessed Data

import joblib

# Save the preprocessed data and feature pipeline
# NOTE(review): this pickles the entire engineered DataFrame alongside the
# train/test splits, so the artifact roughly doubles the dataset on disk —
# confirm that is intended before shipping.
preprocessing_artifacts = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
    'feature_cols': feature_cols,
    'df_engineered': df_final
}

joblib.dump(preprocessing_artifacts, 'models/preprocessing_artifacts.pkl')
print("Preprocessing artifacts saved to models/preprocessing_artifacts.pkl")

Feature Summary

Here is the complete set of features we have created for the model:

| Category     | Features                                                                              | Count |
|--------------|---------------------------------------------------------------------------------------|-------|
| Original PCA | V1 through V28                                                                        | 28    |
| Amount       | Amount, Amount_log, Amount_scaled, Amount_percentile, Amount_is_high, Amount_is_round | 6     |
| Time         | Hour, Hour_sin, Hour_cos, Is_night                                                    | 4     |
| Interactions | V1_V2_product, V1_V2_ratio, V3_V4_product, PCA_magnitude                              | 4     |
| Anomaly      | Z-scores and outlier flags for V1, V3, V10, V12, V14 + Outlier_count                  | 11    |
| Velocity     | Rolling mean/std, deviation, tx_count, time_since_last (+ log), is_rapid              | 7     |

What Is Next

With our features engineered and training data balanced, we are ready to train our fraud detection models. In the next lesson, we will train XGBoost and LightGBM classifiers, tune decision thresholds for optimal fraud recall, and implement stratified cross-validation to ensure robust performance estimates.