Intermediate
Classification
Predict categorical outcomes using logistic regression, SVM, KNN, random forests, XGBoost, and proper evaluation metrics.
Logistic Regression
R
# Logistic regression on mtcars: model the transmission type (am) from
# weight (wt) and horsepower (hp), first with base R and then with tidymodels.
library(tidymodels)

# Prepare binary classification data: recode the 0/1 `am` column as a
# two-level factor (0 -> "auto", 1 -> "manual").
data <- mutate(mtcars, am = factor(am, labels = c("auto", "manual")))

# Base R: binomial GLM
glm_model <- glm(am ~ wt + hp, data = data, family = "binomial")
summary(glm_model)

# With tidymodels: the same formula through a parsnip spec using the "glm"
# engine; tidy() returns the coefficient table as a tibble.
log_spec <- set_engine(logistic_reg(), "glm")
log_fit <- fit(log_spec, am ~ wt + hp, data = data)
tidy(log_fit)
Support Vector Machine (SVM)
R
# Radial-basis-function SVM classifier, fitted through the kernlab engine.
svm_spec <- svm_rbf(mode = "classification", cost = 1, rbf_sigma = 0.1) |>
  set_engine("kernlab")

# `am ~ .` uses every other column of `data` as a predictor.
svm_fit <- fit(svm_spec, am ~ ., data = data)
K-Nearest Neighbors
R
# K-nearest-neighbours classifier (k = 5) using the kknn engine.
knn_spec <- nearest_neighbor(mode = "classification", neighbors = 5) |>
  set_engine("kknn")

# KNN requires scaling - use a recipe that normalizes every numeric predictor.
rec <- step_normalize(
  recipe(am ~ ., data = data),
  all_numeric_predictors()
)

# Bundle preprocessing and model so both are estimated together by fit().
wf <- workflow() |>
  add_model(knn_spec) |>
  add_recipe(rec)
knn_fit <- fit(wf, data = data)
Random Forest, XGBoost & Decision Tree Classification
R
# Tree-based classification specs. These only declare the models; nothing is
# fitted until a spec is passed to fit().

# Random Forest: 500 trees, ranger engine
rf_spec <- rand_forest(mode = "classification", trees = 500) |>
  set_engine("ranger")

# XGBoost: 300 boosting rounds, trees at most 5 levels deep
xgb_spec <- boost_tree(
  mode = "classification",
  trees = 300,
  tree_depth = 5
) |>
  set_engine("xgboost")

# Decision Tree: single rpart tree with default hyperparameters
tree_spec <- decision_tree(mode = "classification") |>
  set_engine("rpart")
Classification Metrics
R
# Evaluate a random forest classifier on held-out data.
#
# Relies on `data` (mtcars with `am` recoded as an "auto"/"manual" factor)
# and `rf_spec` (the ranger random forest spec) defined in earlier sections.

# Hold out a test set, stratified on the outcome so both classes appear in
# each part; the seed makes the split reproducible.
set.seed(123)
data_split <- initial_split(data, prop = 0.75, strata = am)
train <- training(data_split)
test <- testing(data_split)

# Fit on the training rows only, so the metrics below are out-of-sample.
rf_fit <- rf_spec |>
  fit(am ~ ., data = train)

# Get predictions: hard class labels (.pred_class) and per-class
# probabilities (.pred_auto, .pred_manual), bound onto the test rows.
preds <- rf_fit |>
  predict(test)
probs <- rf_fit |>
  predict(test, type = "prob")
results <- test |>
  bind_cols(preds) |>
  bind_cols(probs)

# Accuracy
accuracy(results, truth = am, estimate = .pred_class)

# Sensitivity (recall) and Specificity
sensitivity(results, truth = am, estimate = .pred_class)
specificity(results, truth = am, estimate = .pred_class)

# ROC-AUC: yardstick treats the first factor level ("auto") as the event by
# default, so the matching probability column is .pred_auto.
roc_auc(results, truth = am, .pred_auto)

# Confusion matrix
conf_mat(results, truth = am, estimate = .pred_class)

# Multiple metrics at once
class_metrics <- metric_set(accuracy, sensitivity, specificity)
class_metrics(results, truth = am, estimate = .pred_class)
Multi-class Classification
R
# Multi-class example: iris has 3 classes (Species), and the same
# spec / fit / predict / metric calls apply unchanged.
rf_spec <- rand_forest(mode = "classification", trees = 500) |>
  set_engine("ranger")
rf_fit <- fit(rf_spec, Species ~ ., data = iris)

# NOTE: predictions are made on the same rows the model was fitted to, so the
# metrics below are training-set figures rather than held-out estimates.
preds <- predict(rf_fit, iris)
results <- bind_cols(iris, preds)

accuracy(results, truth = Species, estimate = .pred_class)
conf_mat(results, truth = Species, estimate = .pred_class)