Step 1: Data Exploration & Feature Engineering
Perform thorough exploratory analysis on the credit card fraud dataset, understand the extreme class imbalance, create powerful engineered features, and apply SMOTE to balance the training data.
Load and Inspect the Data
Start by loading the dataset and understanding its structure, distributions, and potential issues:
# src/features.py
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
def load_data(path: str = "data/creditcard.csv") -> pd.DataFrame:
    """Load the fraud dataset and print a quick structural summary.

    Args:
        path: Location of the credit-card CSV. Must contain a binary
            'Class' column (1 = fraud, 0 = legitimate) and an 'Amount'
            column.

    Returns:
        The raw DataFrame, unmodified.
    """
    df = pd.read_csv(path)
    n_fraud = int((df['Class'] == 1).sum())
    n_legit = int((df['Class'] == 0).sum())
    print(f"Dataset shape: {df.shape}")
    print("\nClass distribution:")
    print(f" Legitimate: {n_legit:,} ({(df['Class'] == 0).mean()*100:.3f}%)")
    print(f" Fraudulent: {n_fraud:,} ({(df['Class'] == 1).mean()*100:.3f}%)")
    # Guard against a file with zero fraud rows (the original divided by
    # the fraud count unconditionally and would raise ZeroDivisionError).
    if n_fraud:
        print(f" Imbalance ratio: 1:{n_legit // n_fraud}")
    else:
        print(" Imbalance ratio: n/a (no fraud rows found)")
    print(f"\nMissing values: {df.isnull().sum().sum()}")
    print("\nAmount statistics:")
    print(df['Amount'].describe())
    return df
# Run EDA: load the CSV and print the summary stats defined above.
# NOTE(review): this executes at import time — the file behaves as a script.
df = load_data()
Exploratory Data Analysis
Let us examine the distribution differences between fraudulent and legitimate transactions across key features:
import matplotlib.pyplot as plt
import seaborn as sns
def plot_class_distribution(df: pd.DataFrame):
    """Render three side-by-side views of the class imbalance.

    Panels: raw class counts, density histograms of Amount per class
    (x-axis clipped to [0, 500]), and density histograms of Time per
    class. Saves the figure to notebooks/eda_distributions.png and
    displays it.
    """
    palette = {0: '#2ecc71', 1: '#e74c3c'}
    fig, (ax_counts, ax_amount, ax_time) = plt.subplots(1, 3, figsize=(18, 5))

    # Panel 1: bar chart of raw class counts.
    df['Class'].value_counts().plot(kind='bar', ax=ax_counts, color=['#2ecc71', '#e74c3c'])
    ax_counts.set_title('Class Distribution')
    ax_counts.set_xticklabels(['Legitimate', 'Fraud'], rotation=0)
    ax_counts.set_ylabel('Count')

    # Panel 2: normalized Amount histograms, one per class.
    for cls in (0, 1):
        amounts = df.loc[df['Class'] == cls, 'Amount']
        ax_amount.hist(amounts, bins=50, alpha=0.7, color=palette[cls],
                       label='Fraud' if cls else 'Legit', density=True)
    ax_amount.set_title('Transaction Amount Distribution')
    ax_amount.set_xlabel('Amount')
    ax_amount.legend()
    ax_amount.set_xlim(0, 500)  # Focus on common range

    # Panel 3: normalized Time histograms, one per class.
    for cls in (0, 1):
        times = df.loc[df['Class'] == cls, 'Time']
        ax_time.hist(times, bins=48, alpha=0.7, color=palette[cls],
                     label='Fraud' if cls else 'Legit', density=True)
    ax_time.set_title('Transaction Time Distribution')
    ax_time.set_xlabel('Time (seconds)')
    ax_time.legend()

    plt.tight_layout()
    plt.savefig('notebooks/eda_distributions.png', dpi=150)
    plt.show()
def analyze_fraud_patterns(df: pd.DataFrame):
    """Compare fraud vs. legitimate transactions and rank PCA features.

    Prints Amount summary statistics for each class, then ranks V1..V28
    by the absolute difference in class means and prints the top 10.

    Returns:
        The names of the 10 PCA columns with the largest mean gap,
        most discriminative first.
    """
    fraud = df[df['Class'] == 1]
    legit = df[df['Class'] == 0]

    print("=== Amount Analysis ===")
    print(f"Fraud - Mean: ${fraud['Amount'].mean():.2f}, "
        f"Median: ${fraud['Amount'].median():.2f}, "
        f"Max: ${fraud['Amount'].max():.2f}")
    print(f"Legit - Mean: ${legit['Amount'].mean():.2f}, "
        f"Median: ${legit['Amount'].median():.2f}, "
        f"Max: ${legit['Amount'].max():.2f}")

    # Rank the PCA components by the absolute gap between class means.
    print("\n=== Most Discriminative Features (by mean difference) ===")
    gaps = {
        col: abs(fraud[col].mean() - legit[col].mean())
        for col in (f'V{i}' for i in range(1, 29))
    }
    top_features = sorted(gaps.items(), key=lambda item: item[1], reverse=True)[:10]
    for feat, gap in top_features:
        print(f" {feat}: mean diff = {gap:.4f}")

    return [name for name, _ in top_features]
def plot_correlation_matrix(df: pd.DataFrame):
    """Heatmap of pairwise correlations for a hand-picked feature subset.

    Uses discriminative PCA components plus Amount and the Class label.
    Saves the figure to notebooks/eda_correlation.png and displays it.
    """
    # Hand-picked columns: discriminative V-components + Amount + label.
    selected = ['V1', 'V2', 'V3', 'V4', 'V7', 'V10',
        'V11', 'V12', 'V14', 'V17', 'Amount', 'Class']
    corr_matrix = df[selected].corr()

    plt.figure(figsize=(12, 10))
    sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='RdBu_r',
        center=0, square=True, linewidths=0.5)
    plt.title('Feature Correlation Matrix')
    plt.tight_layout()
    plt.savefig('notebooks/eda_correlation.png', dpi=150)
    plt.show()
# Generate and save the EDA artifacts defined above (writes into notebooks/).
plot_class_distribution(df)
top_features = analyze_fraud_patterns(df)
plot_correlation_matrix(df)
Feature Engineering
The raw PCA features are useful, but we can create additional features that capture transaction patterns. These engineered features are critical for real-time fraud detection because they encode behavioral signals:
def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    """Create engineered features for fraud detection.

    Features capture:
    - Amount patterns (log transform, robust scaling, percentile rank)
    - Time-based patterns (hour of day, cyclical encoding, night flag)
    - Interaction features between top PCA components
    - Per-component z-scores and outlier flags

    Args:
        df: Frame with 'Time', 'Amount' and V1..V28 columns.

    Returns:
        A copy of ``df`` with the engineered columns appended.

    NOTE(review): percentile, quantile, z-score and RobustScaler statistics
    are fit on the *entire* frame, i.e. before any train/test split —
    confirm this leakage is acceptable for the lesson's scope.
    """
    df = df.copy()
    # Remember the input width so the report below is correct for any
    # input, instead of the previous hard-coded assumption of 31 columns.
    n_original = df.shape[1]

    # --- Amount Features ---
    # Log transform (reduces skewness, important for tree models)
    df['Amount_log'] = np.log1p(df['Amount'])
    # Robust scaling (median/IQR based; handles outliers better than StandardScaler)
    scaler = RobustScaler()
    df['Amount_scaled'] = scaler.fit_transform(df[['Amount']])
    # Amount percentile rank over the whole frame
    df['Amount_percentile'] = df['Amount'].rank(pct=True)
    # Is the amount unusually high? (above 95th percentile)
    amount_95 = df['Amount'].quantile(0.95)
    df['Amount_is_high'] = (df['Amount'] > amount_95).astype(int)
    # Is the amount a round number? (fraud often uses round amounts)
    df['Amount_is_round'] = (df['Amount'] % 1 == 0).astype(int)

    # --- Time Features ---
    # 'Time' is in seconds; fold into an approximate hour of day
    df['Hour'] = (df['Time'] / 3600) % 24
    # Cyclical encoding of hour (so 23:00 is close to 00:00)
    df['Hour_sin'] = np.sin(2 * np.pi * df['Hour'] / 24)
    df['Hour_cos'] = np.cos(2 * np.pi * df['Hour'] / 24)
    # Is it nighttime? (fraud peaks during off-hours)
    df['Is_night'] = ((df['Hour'] >= 22) | (df['Hour'] <= 5)).astype(int)

    # --- PCA Interaction Features ---
    # V1-V2 interaction (top discriminative features)
    df['V1_V2_product'] = df['V1'] * df['V2']
    df['V1_V2_ratio'] = df['V1'] / (df['V2'] + 1e-8)  # small shift reduces /0 risk
    # V3-V4 interaction
    df['V3_V4_product'] = df['V3'] * df['V4']
    # Euclidean magnitude of the top four PCA components
    df['PCA_magnitude'] = np.sqrt(
        df['V1']**2 + df['V2']**2 + df['V3']**2 + df['V4']**2
    )

    # --- Anomaly Score Features ---
    # Distance from the global mean for selected PCA components
    for col in ['V1', 'V3', 'V10', 'V12', 'V14']:
        mean_val = df[col].mean()
        std_val = df[col].std()
        df[f'{col}_zscore'] = (df[col] - mean_val) / std_val
        df[f'{col}_is_outlier'] = (abs(df[f'{col}_zscore']) > 3).astype(int)
    # Count of outlier features per transaction
    outlier_cols = [c for c in df.columns if c.endswith('_is_outlier')]
    df['Outlier_count'] = df[outlier_cols].sum(axis=1)

    print(f"Engineered features: {len(df.columns) - n_original} new columns")
    print(f"Total features: {len(df.columns)}")
    return df
# Build the engineered feature set on the full dataframe.
# NOTE(review): the scaler/percentile statistics inside are computed on all
# rows, i.e. before the train/test split — confirm the leakage is acceptable.
df_engineered = engineer_features(df)
print(f"\nFinal dataset shape: {df_engineered.shape}")
Simulated Velocity Features
In production fraud systems, velocity features are among the most powerful signals. They measure how fast a card is being used. Since our dataset does not have card IDs, we simulate this concept using time-windowed aggregations:
def add_velocity_features(df: pd.DataFrame, window_seconds: int = 3600) -> pd.DataFrame:
    """Simulate velocity-style features using row-based rolling windows.

    In production these would be computed from a feature store keyed by
    card_id with true time-based sliding-window aggregations.

    Args:
        df: Frame with 'Time' and 'Amount' columns.
        window_seconds: Kept for interface compatibility; currently UNUSED —
            the rolling statistics below use a fixed 100-row window, not a
            time window. TODO(review): wire this up or remove it.

    Returns:
        The frame sorted by 'Time' with 7 velocity columns appended.
    """
    df = df.sort_values('Time').reset_index(drop=True)
    n_before = df.shape[1]

    # Rolling statistics over the last 100 rows (no card_id available).
    df['Amount_rolling_mean'] = (
        df['Amount'].rolling(window=100, min_periods=1).mean()
    )
    # std of a single-row window is NaN; treat it as zero dispersion.
    df['Amount_rolling_std'] = (
        df['Amount'].rolling(window=100, min_periods=1).std().fillna(0)
    )
    # Distance from the recent average, in rolling-std units (epsilon avoids /0).
    df['Amount_deviation'] = (
        (df['Amount'] - df['Amount_rolling_mean']) /
        (df['Amount_rolling_std'] + 1e-8)
    )
    # NOTE(review): with a row-based window this is just min(row_index + 1,
    # 100), not a per-card frequency — verify it actually carries signal.
    df['Tx_count_window'] = (
        df['Time'].rolling(window=100, min_periods=1).count()
    )
    # Gap to the previous transaction; first row has no predecessor -> 0.
    df['Time_since_last'] = df['Time'].diff().fillna(0)
    df['Time_since_last_log'] = np.log1p(df['Time_since_last'])
    # Rapid succession: under a minute since the previous transaction.
    df['Is_rapid'] = (df['Time_since_last'] < 60).astype(int)

    # The original print hard-coded "6", but 7 columns are created
    # (Time_since_last_log was uncounted) — report the measured count.
    print(f"Added {df.shape[1] - n_before} velocity features")
    return df
df_final = add_velocity_features(df_engineered)
Handle Class Imbalance with SMOTE
With an imbalance ratio of roughly 578:1, the model will be biased toward predicting everything as legitimate. SMOTE (Synthetic Minority Over-sampling Technique) creates synthetic fraud examples by interpolating between existing fraud samples in feature space:
def prepare_training_data(
    df: pd.DataFrame,
    test_size: float = 0.2,
    apply_smote: bool = True,
    random_state: int = 42
) -> tuple:
    """Split the data and (optionally) oversample fraud in the training set.

    SMOTE is applied ONLY to the training split; the test split keeps the
    real-world class distribution so evaluation stays honest.

    Returns:
        (X_train, X_test, y_train, y_test, feature_cols)
    """
    # Every column except the label and raw Time is a model input.
    # (Raw Amount stays in, alongside the engineered Amount_* columns.)
    dropped = {'Class', 'Time'}
    feature_cols = [col for col in df.columns if col not in dropped]

    X = df[feature_cols].values
    y = df['Class'].values

    # Stratify so both splits preserve the original fraud ratio.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        stratify=y,
        random_state=random_state
    )

    print(f"Before SMOTE:")
    print(f" Train: {X_train.shape[0]:,} samples "
        f"(fraud: {y_train.sum():,}, legit: {(y_train == 0).sum():,})")
    print(f" Test: {X_test.shape[0]:,} samples "
        f"(fraud: {y_test.sum():,}, legit: {(y_test == 0).sum():,})")

    if apply_smote:
        sampler = SMOTE(
            sampling_strategy=0.5,  # Fraud = 50% of legitimate count
            k_neighbors=5,
            random_state=random_state
        )
        X_train, y_train = sampler.fit_resample(X_train, y_train)
        print(f"\nAfter SMOTE:")
        print(f" Train: {X_train.shape[0]:,} samples "
            f"(fraud: {y_train.sum():,}, legit: {(y_train == 0).sum():,})")
        print(f" Ratio: 1:{int((y_train == 0).sum() / y_train.sum())}")

    return X_train, X_test, y_train, y_test, feature_cols
X_train, X_test, y_train, y_test, feature_cols = prepare_training_data(df_final)
Save Preprocessed Data
import joblib

# Persist everything downstream training steps need in one bundle.
# NOTE(review): joblib uses pickle under the hood — only load this file
# from trusted sources.
# NOTE(review): the 'df_engineered' key actually holds df_final (which
# also includes the velocity columns) — rename the key if that confuses.
preprocessing_artifacts = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
    'feature_cols': feature_cols,
    'df_engineered': df_final
}
joblib.dump(preprocessing_artifacts, 'models/preprocessing_artifacts.pkl')
print("Preprocessing artifacts saved to models/preprocessing_artifacts.pkl")
Feature Summary
Here is the complete set of features we have created for the model:
| Category | Features | Count |
|---|---|---|
| Original PCA | V1 through V28 | 28 |
| Amount | Amount, Amount_log, Amount_scaled, Amount_percentile, Amount_is_high, Amount_is_round | 6 |
| Time | Hour, Hour_sin, Hour_cos, Is_night | 4 |
| Interactions | V1_V2_product, V1_V2_ratio, V3_V4_product, PCA_magnitude | 4 |
| Anomaly | Z-scores and outlier flags for V1, V3, V10, V12, V14 + Outlier_count | 11 |
| Velocity | Rolling mean/std, deviation, tx_count, time_since_last (+ log), is_rapid | 7 |
What Is Next
With our features engineered and training data balanced, we are ready to train our fraud detection models. In the next lesson, we will train XGBoost and LightGBM classifiers, tune decision thresholds for optimal fraud recall, and implement stratified cross-validation to ensure robust performance estimates.
Lilly Tech Systems