Intermediate
Data Cleaning
Learn to handle missing data, remove duplicates, detect outliers, scale features, encode categoricals, and build reproducible cleaning pipelines.
Handling Missing Data
Python
import pandas as pd
import numpy as np

# --- Detect missing values ---
df.isnull().sum()             # Count nulls per column
df.isnull().mean() * 100      # Percentage missing per column

# --- Drop missing values ---
df.dropna()                   # Drop rows with any NaN
df.dropna(subset=["age"])     # Drop rows where age is NaN
df.dropna(thresh=3)           # Keep rows with >= 3 non-null values

# --- Fill missing values ---
df["age"].fillna(df["age"].mean())    # Fill with mean
df["age"].fillna(df["age"].median())  # Fill with median
df["city"].fillna("Unknown")          # Fill with constant
# NOTE: fillna(method="ffill"/"bfill") is deprecated (pandas 2.1+) and
# removed in pandas 3.x — use the dedicated methods instead:
df.ffill()                    # Forward fill
df.bfill()                    # Backward fill
Duplicate Removal
Python
# Inspect, then remove, duplicate rows.
df.duplicated().sum()                  # How many exact duplicates exist
df.drop_duplicates()                   # Drop fully identical rows
df.drop_duplicates(subset=["email"])   # Duplicates judged by email only
df.drop_duplicates(keep="last")        # Retain the final occurrence instead of the first
Data Type Conversion
Python
# Coerce columns to the dtypes they should have had on load.
df["price"] = df["price"].astype(float)             # Numeric text -> float
df["date"] = pd.to_datetime(df["date"])             # Strings -> datetime64
df["category"] = df["category"].astype("category")  # Compact dtype for low-cardinality columns
String Cleaning
Python
# Normalise and validate free-text columns.
df["name"] = df["name"].str.strip()   # Remove surrounding whitespace
df["name"] = df["name"].str.lower()   # Lowercase for consistent matching
df["phone"] = df["phone"].str.replace(r"\D", "", regex=True)  # Keep digits only

# BUG FIX: the original assigned str.contains() back onto df["email"],
# replacing every address with True/False and destroying the data.
# Build a boolean mask instead (na=False treats missing emails as
# invalid rather than propagating NaN) and use it to filter.
valid_email = df["email"].str.contains("@", na=False)
df = df[valid_email]  # Keep only rows with a plausible email address
Outlier Detection
Python
# --- IQR method: discard points beyond 1.5 * IQR from the quartiles ---
q1 = df["salary"].quantile(0.25)
q3 = df["salary"].quantile(0.75)
iqr = q3 - q1
low_cut = q1 - 1.5 * iqr
high_cut = q3 + 1.5 * iqr
within_fences = (df["salary"] >= low_cut) & (df["salary"] <= high_cut)
df_clean = df[within_fences]

# --- Z-score method: keep points within 3 standard deviations ---
from scipy import stats
z_scores = np.abs(stats.zscore(df["salary"]))
df_clean = df[z_scores < 3]
Feature Scaling
Python
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Standardise: centre each feature at 0 with unit variance.
std_scaler = StandardScaler()
df[["age_scaled", "salary_scaled"]] = std_scaler.fit_transform(df[["age", "salary"]])

# Normalise: squeeze each feature into the [0, 1] interval.
minmax_scaler = MinMaxScaler()
df[["age_norm", "salary_norm"]] = minmax_scaler.fit_transform(df[["age", "salary"]])
Encoding Categorical Variables
Python
# One-hot encoding: one indicator column per category
# (first level dropped to avoid redundancy).
df_encoded = pd.get_dummies(df, columns=["department"], drop_first=True)

# Label encoding: arbitrary integer codes per category.
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df["department_encoded"] = encoder.fit_transform(df["department"])

# Ordinal encoding: an explicit, meaningful order via a mapping.
size_map = {"S": 1, "M": 2, "L": 3, "XL": 4}
df["size_num"] = df["size"].map(size_map)
Cleaning Pipeline Example
Python
def clean_data(df, *, salary_min=20000, salary_max=200000):
    """Run the complete cleaning pipeline on a raw dataframe.

    Steps, in order:
      1. Drop exact duplicate rows.
      2. Tidy names (strip whitespace, Title Case).
      3. Impute missing ages with the column median.
      4. Clip salaries into [salary_min, salary_max] to tame outliers.
      5. Drop rows with a missing email, then reset the index.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data with at least ``name``, ``age``, ``salary`` and
        ``email`` columns.
    salary_min, salary_max : int, keyword-only
        Winsorisation bounds for ``salary``. Defaults preserve the
        original hard-coded behaviour (20,000 / 200,000).

    Returns
    -------
    pandas.DataFrame
        A new, cleaned dataframe; the input is not modified.
    """
    return (
        df
        .drop_duplicates()
        .assign(
            name=lambda x: x["name"].str.strip().str.title(),
            age=lambda x: x["age"].fillna(x["age"].median()),
            salary=lambda x: x["salary"].clip(lower=salary_min, upper=salary_max),
        )
        .dropna(subset=["email"])
        .reset_index(drop=True)
    )

df_clean = clean_data(df_raw)
Data cleaning often takes 60-80% of a data scientist's time. Building reusable cleaning functions and pipelines saves enormous effort across projects.
Lilly Tech Systems