Intermediate

Data Cleaning

Learn to handle missing data, remove duplicates, detect outliers, scale features, encode categoricals, and build reproducible cleaning pipelines.

Handling Missing Data

Python
import pandas as pd
import numpy as np

# Detect missing values
df.isnull().sum()              # Count nulls per column
df.isnull().mean() * 100       # Percentage missing

# Drop missing values (each call returns a new DataFrame — assign the result to keep it)
df.dropna()                     # Drop rows with any NaN
df.dropna(subset=["age"])       # Drop where age is NaN
df.dropna(thresh=3)             # Keep rows with >= 3 non-null

# Fill missing values
df["age"].fillna(df["age"].mean())     # Fill with mean
df["age"].fillna(df["age"].median())   # Fill with median
df["city"].fillna("Unknown")            # Fill with constant
df.ffill()                              # Forward fill (fillna(method="ffill") was removed in pandas 2.x)
df.bfill()                              # Backward fill (fillna(method="bfill") was removed in pandas 2.x)

Duplicate Removal

Python
# How many rows are exact copies of an earlier row?
df.duplicated().sum()
# Drop rows that duplicate an earlier row in every column (first occurrence kept)
df.drop_duplicates()
# De-duplicate on a single column only, e.g. one row per email address
df.drop_duplicates(subset=["email"])
# Keep the last occurrence of each duplicate group instead of the first
df.drop_duplicates(keep="last")

Data Type Conversion

Python
# Convert columns to appropriate dtypes after loading raw data.
df["price"] = df["price"].astype(float)  # numeric strings -> float64
df["date"] = pd.to_datetime(df["date"])  # parse strings -> datetime64
df["category"] = df["category"].astype("category")  # Memory efficient for low-cardinality text columns

String Cleaning

Python
df["name"] = df["name"].str.strip()          # Remove surrounding whitespace
df["name"] = df["name"].str.lower()          # Normalize case
df["phone"] = df["phone"].str.replace(r"\D", "", regex=True)  # Remove non-digits
# Validate emails into a SEPARATE boolean column — assigning the mask back onto
# df["email"] (as the original did) destroys the addresses themselves.
# na=False counts missing emails as invalid instead of propagating NaN.
df["email_valid"] = df["email"].str.contains("@", na=False)

Outlier Detection

Python
# IQR Method
# IQR Method: keep rows inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] (inclusive bounds,
# matching >= / <= comparisons)
q_low, q_high = df["salary"].quantile([0.25, 0.75])
spread = q_high - q_low
in_range = df["salary"].between(q_low - 1.5 * spread, q_high + 1.5 * spread)
df_clean = df[in_range]

# Z-Score Method: keep rows within 3 standard deviations of the mean
from scipy import stats
z = np.abs(stats.zscore(df["salary"]))
df_clean = df[z < 3]

Feature Scaling

Python
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# StandardScaler: z-score standardization, each column to mean=0, std=1
scaler = StandardScaler()
# fit_transform returns a NumPy array; pandas assigns it positionally to the new columns
df[["age_scaled", "salary_scaled"]] = scaler.fit_transform(df[["age", "salary"]])

# MinMaxScaler: rescales each column linearly into the range [0, 1]
scaler = MinMaxScaler()
df[["age_norm", "salary_norm"]] = scaler.fit_transform(df[["age", "salary"]])

Encoding Categorical Variables

Python
# One-Hot Encoding
df_encoded = pd.get_dummies(df, columns=["department"], drop_first=True)

# Label Encoding (ordinal)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df["department_encoded"] = le.fit_transform(df["department"])

# Ordinal Encoding (with order)
size_map = {"S": 1, "M": 2, "L": 3, "XL": 4}
df["size_num"] = df["size"].map(size_map)

Cleaning Pipeline Example

Python
def clean_data(df):
    """Clean a raw DataFrame: dedupe, normalize, impute, clip, filter.

    Steps, in order:
      1. Drop exact duplicate rows.
      2. Strip and title-case names.
      3. Impute missing ages with the (post-dedupe) column median.
      4. Clip salaries into the [20000, 200000] range.
      5. Drop rows missing an email, then reset the index.
    """
    deduped = df.drop_duplicates()
    transformed = deduped.assign(
        name=lambda d: d["name"].str.strip().str.title(),
        age=lambda d: d["age"].fillna(d["age"].median()),
        salary=lambda d: d["salary"].clip(lower=20000, upper=200000),
    )
    return transformed.dropna(subset=["email"]).reset_index(drop=True)

# Run the full pipeline on the raw DataFrame in a single call.
df_clean = clean_data(df_raw)
Data cleaning often takes 60-80% of a data scientist's time. Building reusable cleaning functions and pipelines saves enormous effort across projects.