Intermediate
Clustering
Group similar data points with K-Means, DBSCAN, and hierarchical clustering. Reduce dimensionality with PCA and t-SNE, and detect anomalies.
K-Means Clustering
Python
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Fit K-Means with a fixed seed and several centroid initializations
# (n_init=10) so the result is reproducible and less init-sensitive.
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
labels = kmeans.fit_predict(X)
centers = kmeans.cluster_centers_

# Elbow Method: record inertia for each candidate k and look for the
# "bend" where adding clusters stops paying off.
inertias = []
K_range = range(2, 11)
for k in K_range:
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    km.fit(X)
    inertias.append(km.inertia_)

# Silhouette Score: higher means tighter, better-separated clusters.
score = silhouette_score(X, labels)
print(f"Silhouette Score: {score:.4f}")
DBSCAN
Python
from sklearn.cluster import DBSCAN

# Density-based clustering: discovers arbitrarily shaped clusters and
# marks sparse points as noise instead of forcing them into a cluster.
dbscan = DBSCAN(eps=0.5, min_samples=5)
labels = dbscan.fit_predict(X)

# DBSCAN labels noise as -1, so subtract it from the cluster count.
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
n_noise = list(labels).count(-1)
print(f"Clusters: {n_clusters}, Noise points: {n_noise}")
Hierarchical Clustering
Python
import matplotlib.pyplot as plt  # fix: snippet used plt without importing it

from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering

# Dendrogram: Ward linkage merges, at each step, the pair of clusters
# whose union minimizes the increase in within-cluster variance.
linked = linkage(X, method="ward")
dendrogram(linked)
plt.title("Dendrogram")
plt.show()

# Agglomerative clustering: cut the hierarchy at a fixed k=3.
agg = AgglomerativeClustering(n_clusters=3)
labels = agg.fit_predict(X)
Dimensionality Reduction
Python
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap  # third-party: pip install umap-learn

# PCA: linear projection onto the directions of maximum variance.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
print(f"Explained variance: {pca.explained_variance_ratio_}")

# t-SNE: non-linear embedding meant for visualization only — do not
# feed its output into downstream models as engineered features.
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
X_tsne = tsne.fit_transform(X)

# UMAP: fast non-linear reduction, usable for more than visualization.
reducer = umap.UMAP(n_components=2, random_state=42)
X_umap = reducer.fit_transform(X)
Anomaly Detection
Python
from sklearn.ensemble import IsolationForest

# Isolation Forest: contamination=0.05 tells the model to expect
# roughly 5% of the data to be outliers.
iso_forest = IsolationForest(contamination=0.05, random_state=42)
anomaly_labels = iso_forest.fit_predict(X)  # -1 = anomaly, 1 = normal

# Keep only the rows flagged as anomalous.
anomalies = X[anomaly_labels == -1]
Cluster Visualization
Python
import matplotlib.pyplot as plt  # fix: snippet used plt without importing it

# Scatter the 2-D PCA projection, coloring each point by its cluster
# label (uses X_pca and labels from the earlier snippets).
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=labels, cmap="viridis", alpha=0.6)
plt.xlabel("Component 1")
plt.ylabel("Component 2")
plt.title("Clusters visualized with PCA")
plt.colorbar(label="Cluster")
plt.show()
Algorithm choice: Use K-Means for well-separated spherical clusters. Use DBSCAN for irregular shapes and noisy data. Use hierarchical when you want to explore different numbers of clusters via a dendrogram.
Lilly Tech Systems