Intermediate

Clustering

Group similar data points with K-Means, DBSCAN, and hierarchical clustering. Reduce dimensionality with PCA, t-SNE, and UMAP, and detect anomalies with Isolation Forest.

K-Means Clustering

Python
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Fit K-Means with a fixed seed so the centroids are reproducible
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
labels = kmeans.fit_predict(X)
centers = kmeans.cluster_centers_

# Elbow Method - fit one model per candidate k and record its inertia
# (within-cluster sum of squares); look for the "elbow" in the curve
K_range = range(2, 11)
inertias = [
    KMeans(n_clusters=k, random_state=42, n_init=10).fit(X).inertia_
    for k in K_range
]

# Silhouette Score (higher = better defined clusters)
score = silhouette_score(X, labels)
print(f"Silhouette Score: {score:.4f}")

DBSCAN

Python
from sklearn.cluster import DBSCAN

# Density-based: finds clusters of any shape, handles noise
model = DBSCAN(eps=0.5, min_samples=5)
labels = model.fit_predict(X)

# DBSCAN marks noise points with the label -1; exclude that label
# when counting clusters
unique_labels = set(labels)
n_clusters = len(unique_labels) - (1 if -1 in unique_labels else 0)
n_noise = sum(1 for lbl in labels if lbl == -1)
print(f"Clusters: {n_clusters}, Noise points: {n_noise}")

Hierarchical Clustering

Python
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering
import matplotlib.pyplot as plt  # needed for the dendrogram plot below

# Dendrogram: visualize the full merge hierarchy. Ward linkage merges
# the pair of clusters that minimizes the within-cluster variance
# (the same criterion AgglomerativeClustering uses by default).
linked = linkage(X, method="ward")
dendrogram(linked)
plt.title("Dendrogram")
plt.show()

# Agglomerative Clustering: cut the hierarchy at a fixed 3 clusters
agg = AgglomerativeClustering(n_clusters=3)
labels = agg.fit_predict(X)

Dimensionality Reduction

Python
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap  # third-party: pip install umap-learn

# PCA - Principal Component Analysis (linear; components ordered by
# the amount of variance they explain)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
print(f"Explained variance: {pca.explained_variance_ratio_}")

# t-SNE - for visualization only, not for feature engineering;
# note: perplexity must be less than the number of samples
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
X_tsne = tsne.fit_transform(X)

# UMAP - typically faster than t-SNE and better at preserving global
# structure; note: setting random_state disables UMAP's parallelism
reducer = umap.UMAP(n_components=2, random_state=42)
X_umap = reducer.fit_transform(X)

Anomaly Detection

Python
from sklearn.ensemble import IsolationForest

# Isolation Forest: isolates outliers with random splits;
# contamination is the expected fraction of anomalies in the data
detector = IsolationForest(contamination=0.05, random_state=42)
anomaly_labels = detector.fit_predict(X)

# fit_predict returns -1 for anomalies and 1 for normal points
is_anomaly = anomaly_labels == -1
anomalies = X[is_anomaly]

Cluster Visualization

Python
import matplotlib.pyplot as plt  # required; not imported in this snippet

# Color each PCA-projected point by its cluster assignment
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=labels, cmap="viridis", alpha=0.6)
plt.xlabel("Component 1")
plt.ylabel("Component 2")
plt.title("Clusters visualized with PCA")
plt.colorbar(label="Cluster")
plt.show()
Algorithm choice: Use K-Means for well-separated spherical clusters. Use DBSCAN for irregular shapes and noisy data. Use hierarchical when you want to explore different numbers of clusters via a dendrogram.