Pandas Interview Challenges
10 real Pandas coding challenges from ML interviews. Each tests a critical data manipulation skill that interviewers expect you to handle fluently under time pressure.
Challenge 1: GroupBy with Multiple Aggregations
import pandas as pd
import numpy as np

# Setup data. Seed the RNG so the output is reproducible run-to-run,
# consistent with the other challenges in this file (which all seed 42).
np.random.seed(42)
df = pd.DataFrame({
    'date': pd.date_range('2024-01-01', periods=12, freq='D').tolist() * 2,
    'region': ['East']*12 + ['West']*12,
    'product': ['A','B','C','A','B','C','A','B','C','A','B','C'] * 2,
    'revenue': np.random.randint(100, 1000, 24),
    'quantity': np.random.randint(1, 50, 24)
})

# Solution: named aggregation (the modern pandas idiom).
# best_day cross-references another column: x.idxmax() is the row LABEL of
# the group's max revenue, and groupby preserves the original index, so
# df.loc[label, 'date'] pulls the matching date.
result = df.groupby('region').agg(
    total_revenue=('revenue', 'sum'),
    avg_quantity=('quantity', 'mean'),
    unique_products=('product', 'nunique'),
    best_day=('revenue', lambda x: df.loc[x.idxmax(), 'date'])
).reset_index()
print(result)
Interviewer focus: Using named aggregation syntax (.agg(name=('col', func))) is the modern Pandas idiom. Using lambda with idxmax to cross-reference another column shows advanced groupby understanding.
Challenge 2: Complex Merge with Indicator
import pandas as pd
import numpy as np

users = pd.DataFrame({
    'user_id': range(1, 8),
    'name': ['Alice','Bob','Carol','Dave','Eve','Frank','Grace'],
    'signup_date': pd.to_datetime(['2024-01-01','2024-01-05','2024-01-10',
                                   '2024-01-15','2024-01-20','2024-01-25',
                                   '2024-01-30'])
})
orders = pd.DataFrame({
    'order_id': range(101, 109),
    'user_id': [1, 1, 2, 3, 3, 3, 5, 5],
    'amount': [50, 75, 200, 30, 45, 60, 150, 80],
    'order_date': pd.to_datetime(['2024-01-02','2024-01-15','2024-01-06',
                                  '2024-01-11','2024-01-20','2024-01-25',
                                  '2024-01-21','2024-02-01'])
})

# (a) Users who never ordered: left merge with indicator=True, then keep
# the ids whose rows came only from the left side of the join.
merged = users.merge(orders, on='user_id', how='left', indicator=True)
left_only_ids = merged.loc[merged['_merge'] == 'left_only', 'user_id']
never_ordered = users[users['user_id'].isin(left_only_ids)]
print("Never ordered:", never_ordered['name'].tolist())
# ['Dave', 'Frank', 'Grace']

# (b) Users who ordered within 7 days of signup.
# .dt.days converts the timedelta into plain integers for the comparison.
merged_dates = users.merge(orders, on='user_id', how='inner')
merged_dates['days_since_signup'] = (
    merged_dates['order_date'].sub(merged_dates['signup_date']).dt.days
)
early_buyers = merged_dates.query('days_since_signup <= 7')['name'].unique()
print("Early buyers:", early_buyers.tolist())

# (c) Total spend per user, keeping non-buyers as zero: the left join puts
# NaN amounts on users with no orders, and fillna(0) zeroes them out.
spend = (
    users.merge(orders, on='user_id', how='left')
         .groupby(['user_id', 'name'])['amount']
         .sum()
         .fillna(0)
         .reset_index()
         .rename(columns={'amount': 'total_spend'})
)
print(spend)
Interviewer focus: Using indicator=True in merge to identify unmatched rows, .dt.days for datetime arithmetic, and fillna(0) after left join to handle non-buyers. These are the exact patterns used in real data science work.
Challenge 3: Pivot Table and Unpivot
import pandas as pd
import numpy as np

df = pd.DataFrame({
    'student': ['Alice','Alice','Alice','Bob','Bob','Carol','Carol','Carol'],
    'subject': ['Math','English','Science','Math','Science','Math','English','Science'],
    'score': [90, 85, 92, 78, 88, 95, None, 91]
})

# Long -> wide. pivot_table aggregates any duplicate (student, subject)
# pairs with aggfunc; a plain pivot would raise on duplicates.
wide = (
    df.pivot_table(index='student', columns='subject',
                   values='score', aggfunc='mean')
      .reset_index()
      .rename_axis(columns=None)   # drop the leftover 'subject' axis label
)
print("Wide format:")
print(wide)

# Wide -> long again (melt/unpivot), discarding cells that were never scored
long = (
    wide.melt(id_vars='student',
              value_vars=['English', 'Math', 'Science'],
              var_name='subject', value_name='score')
        .dropna()
)
print("\nLong format:")
print(long.sort_values(['student', 'subject']).reset_index(drop=True))
Interviewer focus: Knowing the difference between pivot (fails on duplicates) and pivot_table (handles duplicates with aggfunc). Using melt for the reverse operation. Handling NaN values throughout the pipeline.
Challenge 4: Window Functions — Rolling and Expanding
import pandas as pd
import numpy as np

# Synthetic daily price series: a random walk around 100.
np.random.seed(42)
idx = pd.date_range('2024-01-01', periods=90, freq='D', name='date')
walk = 100 + np.cumsum(np.random.randn(90) * 2)
df = pd.DataFrame({'price': walk}, index=idx)

price = df['price']
# (a) 7-day rolling average; min_periods=1 keeps the warm-up rows non-NaN
df['rolling_7d_avg'] = price.rolling(7, min_periods=1).mean()
# (b) 30-day rolling standard deviation (volatility)
df['rolling_30d_std'] = price.rolling(30, min_periods=1).std()
# (c) Running maximum via an expanding (cumulative) window
df['cumulative_max'] = price.expanding().max()
# (d) Percentage change vs. the value 7 rows earlier
df['pct_change_7d'] = price.pct_change(periods=7) * 100

print(df.head(10))
print("...")
print(df.tail(5))
Interviewer focus: Using min_periods to handle the start of the series, expanding() for cumulative operations, and pct_change(periods=n) for lookback comparisons. These are daily operations in quantitative finance and time series ML.
Challenge 5: Time Series Resampling
import pandas as pd
import numpy as np

# Generate hourly sensor data with ~10% of readings randomly dropped.
# NOTE: the comprehension consumes the RNG in the same order as the
# original nested loops (the `if` draw happens before the value draw).
np.random.seed(42)
timestamps = pd.date_range('2024-01-01', periods=72, freq='h')
sensor_data = [
    {'timestamp': ts, 'sensor_id': sid,
     'value': np.random.randn() * 10 + 50}
    for sid in ['S1', 'S2']
    for ts in timestamps
    if np.random.random() > 0.1  # 10% missing
]
df = pd.DataFrame(sensor_data)

# Forward-fill missing hours WITHIN each sensor (never across sensors).
df = df.set_index('timestamp')
filled = (
    df.groupby('sensor_id')
      .apply(lambda g: g.resample('h').ffill())  # rebuild hourly grid, fill gaps
      .drop(columns='sensor_id')                 # key now lives in the index
      .reset_index()
)

# Resample to daily, with multiple named aggregations per sensor.
daily = (
    filled.set_index('timestamp')
          .groupby('sensor_id')
          .resample('D')
          .agg(
              mean_value=('value', 'mean'),
              max_value=('value', 'max'),
              reading_count=('value', 'count')
          )
          .reset_index()
)
print(daily)
Interviewer focus: Combining groupby with resample is a common ML data pipeline pattern. Forward-filling within groups (not across sensors) shows you understand data integrity. The multi-level groupby + resample syntax trips up many candidates.
Challenge 6: Feature Engineering from Text Columns
import pandas as pd
from collections import Counter

df = pd.DataFrame({
    'id': range(1, 6),
    'description': [
        'Machine learning model for predicting customer churn',
        'Deep neural network with attention mechanism',
        'Simple linear regression baseline model',
        'Gradient boosted trees for fraud detection system',
        'Recurrent neural network for time series forecasting'
    ]
})

# Word count (vectorized .str accessor)
df['word_count'] = df['description'].str.split().str.len()

# Character count (excluding spaces)
df['char_count'] = df['description'].str.replace(' ', '', regex=False).str.len()

# Average word length. max(..., 1) guards the division against empty or
# whitespace-only descriptions (the original raised ZeroDivisionError there).
df['avg_word_len'] = df['description'].apply(
    lambda x: sum(len(w) for w in x.split()) / max(len(x.split()), 1)
)

# Keyword flags. regex=False because these are literal substrings, not
# patterns; na=False keeps the .astype(int) safe if a description is NaN.
keywords = ['neural', 'model', 'learning']
for kw in keywords:
    df[f'has_{kw}'] = df['description'].str.contains(
        kw, case=False, regex=False, na=False
    ).astype(int)

# Most common word per row (ties resolve to the first-seen word, since
# Counter preserves insertion order)
df['top_word'] = df['description'].apply(
    lambda x: Counter(x.lower().split()).most_common(1)[0][0]
)
print(df[['id', 'word_count', 'avg_word_len', 'has_neural', 'has_model', 'top_word']])
Interviewer focus: Using .str accessor for vectorized string operations instead of row-by-row apply where possible. Knowing when apply is unavoidable (like Counter). This is a common feature engineering pattern for NLP pipelines.
Challenge 7: Handling Missing Data Strategically
import pandas as pd
import numpy as np

# Create messy data
df = pd.DataFrame({
    'age': [25, np.nan, 30, np.nan, 45, 50, np.nan, 35],
    'salary': [50000, 60000, np.nan, 70000, np.nan, 80000, 65000, np.nan],
    'department': ['Eng', None, 'Eng', 'Sales', None, 'Eng', 'Sales', 'Eng'],
    'rating': [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 4.0],
    'name': ['Alice', 'Bob', 'Carol', 'Dave', 'Eve', 'Frank', None, 'Helen']
})

# Capture the missingness mask BEFORE any imputation, so later steps can
# refer to the original pattern. (The original re-typed the entire raw
# DataFrame at step (d) to recover this -- an error-prone copy-paste.)
missing_mask = df.isna()

# (a) Missingness report
missing_report = pd.DataFrame({
    'missing_count': missing_mask.sum(),
    'missing_pct': (missing_mask.sum() / len(df) * 100).round(1),
    'dtype': df.dtypes
})
print("Missing Report:")
print(missing_report)

# (b) Fill numeric columns with median (robust to outliers)
numeric_cols = df.select_dtypes(include='number').columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

# (c) Fill categorical columns with mode
categorical_cols = df.select_dtypes(include='object').columns
for col in categorical_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# (d) Flag rows that were >50% missing before imputation
df['high_missing_flag'] = (
    missing_mask.sum(axis=1) / missing_mask.shape[1] > 0.5
).astype(int)

# (e) Drop columns with >80% missing (rating had 87.5% missing)
cols_to_drop = missing_report[missing_report['missing_pct'] > 80].index.tolist()
df = df.drop(columns=cols_to_drop)
print(f"\nDropped columns: {cols_to_drop}")
print("\nCleaned DataFrame:")
print(df)
Interviewer focus: Using select_dtypes to separate numeric vs categorical columns, median for numeric (robust to outliers) vs mode for categorical, and creating a missingness report. This is the standard ML data cleaning pipeline.
Challenge 8: GroupBy Transform vs Apply
Compute each employee's salary (a) as a percentage of the department average using transform, (b) rank within department, (c) z-score within department. Explain when to use transform vs apply.
import pandas as pd
import numpy as np
df = pd.DataFrame({
    'name': ['Alice','Bob','Carol','Dave','Eve','Frank','Grace','Helen'],
    'dept': ['Eng','Eng','Eng','Sales','Sales','Sales','PM','PM'],
    'salary': [120, 130, 110, 80, 90, 85, 100, 95]
})

# One groupby object reused by all three parts.
by_dept_salary = df.groupby('dept')['salary']

# (a) Salary as a percentage of the department average.
# transform('mean') broadcasts each group's mean back to every member row,
# so the division lines up with the original index.
dept_mean = by_dept_salary.transform('mean')
df['pct_of_dept_avg'] = (df['salary'] / dept_mean * 100).round(1)

# (b) Rank within department (1 = highest paid)
df['dept_rank'] = by_dept_salary.rank(ascending=False).astype(int)

# (c) Z-score within department
def _zscore(s):
    """Standardize a Series against its own mean and (sample) std."""
    return (s - s.mean()) / s.std()

df['dept_zscore'] = by_dept_salary.transform(_zscore).round(3)
print(df)

# WHEN TO USE WHAT:
# transform: returns same shape as input, good for adding columns
# apply: can return any shape, good for custom aggregations
# agg: returns reduced shape, good for summary statistics
Interviewer focus: The distinction between transform (same shape, broadcasts back) and apply (flexible shape) is a classic interview question. Using transform for within-group normalization is the idiomatic approach.
Challenge 9: Multi-Index and Cross-Tabulation
import pandas as pd
import numpy as np

np.random.seed(42)
df = pd.DataFrame({
    'respondent_id': range(200),
    'age_group': np.random.choice(['18-25', '26-35', '36-50', '50+'], 200),
    'gender': np.random.choice(['M', 'F', 'Other'], 200, p=[0.45, 0.45, 0.1]),
    'question': 'Q1',
    'answer': np.random.choice(['Strongly Agree', 'Agree', 'Neutral',
                                'Disagree', 'Strongly Disagree'], 200)
})

# Cross-tabulation with counts (margins adds row/column totals)
ct = pd.crosstab(
    index=[df['age_group'], df['gender']],
    columns=df['answer'],
    margins=True,
    margins_name='Total'
)
print("Counts:")
print(ct)

# Normalized (percentages per row). Scale to percent FIRST, then round:
# the original rounded the 0-1 proportions to 3 decimals and multiplied
# by 100 afterwards, which reintroduces float noise (0.083 * 100 ->
# 8.299999...) and defeats the rounding.
ct_pct = (pd.crosstab(
    index=[df['age_group'], df['gender']],
    columns=df['answer'],
    normalize='index'  # Normalize across each row
) * 100).round(1)
print("\nPercentages (per row):")
print(ct_pct)

# Flatten for ML features. reset_index() already turns the row MultiIndex
# into ordinary columns; the tuple check additionally covers the case
# where the COLUMN axis is a MultiIndex.
ct_flat = ct_pct.reset_index()
ct_flat.columns = [f"{c[0]}_{c[1]}" if isinstance(c, tuple) else c
                   for c in ct_flat.columns]
print("\nFlattened:")
print(ct_flat.head())
Interviewer focus: pd.crosstab is the right tool for contingency tables (not pivot_table). Understanding normalize='index' vs 'columns' vs 'all'. Flattening MultiIndex for downstream ML use.
Challenge 10: Efficient String Matching and Categorization
import pandas as pd
import numpy as np
import re

df = pd.DataFrame({
    'product_name': [
        'iPhone 15 Pro Max', 'Samsung Galaxy S24', 'MacBook Pro 16"',
        'Dell XPS 15 laptop', 'iPad Air', 'Galaxy Tab S9',
        'AirPods Pro', 'Sony WH-1000XM5 headphones', 'iphone 14',
        'MACBOOK air m3', 'Pixel 8 phone', 'Lenovo ThinkPad x1'
    ]
})

def categorize_product(name: str) -> str:
    """Categorize a product by name using regex pattern matching.

    Rules are checked in order; the first matching pattern wins, so more
    specific categories must come before generic ones.
    """
    name_lower = name.lower()
    # Define category rules (order matters - first match wins)
    rules = [
        (['iphone', 'galaxy s', 'pixel.*phone'], 'Smartphone'),
        (['macbook', 'thinkpad', 'xps.*laptop', 'laptop'], 'Laptop'),
        (['ipad', 'galaxy tab', 'tab s'], 'Tablet'),
        (['airpods', 'headphone', 'wh-1000', 'earbuds'], 'Audio'),
    ]
    for patterns, category in rules:
        # re.search replaces the original's one-element pd.Series round-trip,
        # which allocated a whole Series per pattern per row just to run a
        # regex test (str.contains is re.search under the hood).
        if any(re.search(p, name_lower) for p in patterns):
            return category
    return 'Other'

# Vectorized approach using np.select for performance
conditions = [
    df['product_name'].str.contains('iphone|galaxy s|pixel.*phone', case=False, regex=True),
    df['product_name'].str.contains('macbook|thinkpad|xps.*laptop|laptop', case=False, regex=True),
    df['product_name'].str.contains('ipad|galaxy tab|tab s', case=False, regex=True),
    df['product_name'].str.contains('airpods|headphone|wh-1000|earbuds', case=False, regex=True),
]
choices = ['Smartphone', 'Laptop', 'Tablet', 'Audio']
df['category'] = np.select(conditions, choices, default='Other')
print(df[['product_name', 'category']])
Interviewer focus: Using np.select with vectorized string matching is orders of magnitude faster than apply for large DataFrames. Showing both the readable (apply) and performant (np.select) approaches demonstrates production awareness.
Lilly Tech Systems