Intermediate

Classification

Predict categorical outcomes using logistic regression, SVM, KNN, random forests, and XGBoost, and assess the models with proper evaluation metrics.

Logistic Regression

R
library(tidymodels)

# Turn `am` into a two-level factor (auto / manual) to use as a binary outcome
data <- mutate(mtcars, am = factor(am, labels = c("auto", "manual")))

# Base R: logistic regression through glm()
glm_model <- glm(am ~ wt + hp, data = data, family = "binomial")
summary(glm_model)

# tidymodels: the same model expressed as a parsnip specification
log_spec <- logistic_reg(engine = "glm")

log_fit <- fit(log_spec, am ~ wt + hp, data = data)
tidy(log_fit)

Support Vector Machine (SVM)

R
# RBF-kernel SVM on the kernlab engine with a fixed cost and kernel width
svm_spec <- svm_rbf(
  mode = "classification",
  engine = "kernlab",
  cost = 1,
  rbf_sigma = 0.1
)

# Every column of `data` other than `am` is used as a predictor
svm_fit <- fit(svm_spec, am ~ ., data = data)

K-Nearest Neighbors

R
# 5-nearest-neighbour classifier on the kknn engine
knn_spec <- nearest_neighbor(
  mode = "classification",
  engine = "kknn",
  neighbors = 5
)

# KNN requires scaling, so normalize the numeric predictors in a recipe
rec <- recipe(am ~ ., data = data) |>
  step_normalize(all_numeric_predictors())

# Bundle the preprocessing and the model, then fit both together
wf <- workflow(preprocessor = rec, spec = knn_spec)
knn_fit <- fit(wf, data = data)

Random Forest, XGBoost & Decision Tree Classification

R
# Random forest: 500 trees on the ranger engine
rf_spec <- rand_forest(mode = "classification", engine = "ranger", trees = 500)

# Gradient-boosted trees: 300 trees of depth 5 on the xgboost engine
xgb_spec <- boost_tree(
  mode = "classification",
  engine = "xgboost",
  trees = 300,
  tree_depth = 5
)

# Single decision tree on the rpart engine, default hyperparameters
tree_spec <- decision_tree(mode = "classification", engine = "rpart")

Classification Metrics

R
# Hold out a stratified test set so the metrics describe unseen rows.
# The seed makes the random split (and the forest) reproducible.
set.seed(123)
data_split <- initial_split(data, prop = 0.75, strata = am)
train <- training(data_split)
test <- testing(data_split)

# Fit the `rf_spec` random forest on the training rows only
rf_fit <- rf_spec |> fit(am ~ ., data = train)

# Get predictions: hard class labels and per-class probabilities
preds <- rf_fit |> predict(test)
probs <- rf_fit |> predict(test, type = "prob")
results <- test |> bind_cols(preds) |> bind_cols(probs)

# Accuracy
accuracy(results, truth = am, estimate = .pred_class)

# Sensitivity (recall) and Specificity
sensitivity(results, truth = am, estimate = .pred_class)
specificity(results, truth = am, estimate = .pred_class)

# ROC-AUC: takes the probability column of the event class; yardstick
# treats the first factor level ("auto") as the event by default
roc_auc(results, truth = am, .pred_auto)

# Confusion matrix
conf_mat(results, truth = am, estimate = .pred_class)

# Multiple metrics at once
class_metrics <- metric_set(accuracy, sensitivity, specificity)
class_metrics(results, truth = am, estimate = .pred_class)

Multi-class Classification

R
# iris has 3 classes; the model specification is the same as for two classes
rf_spec <- rand_forest(trees = 500) |>
  set_engine("ranger") |>
  set_mode("classification")

# Hold out a stratified test set: scoring the rows the model was trained on
# would overstate its accuracy
set.seed(123)
iris_split <- initial_split(iris, prop = 0.75, strata = Species)
iris_train <- training(iris_split)
iris_test <- testing(iris_split)

rf_fit <- rf_spec |> fit(Species ~ ., data = iris_train)
preds <- rf_fit |> predict(iris_test)

# Compare held-out truth with the predicted class
results <- iris_test |> bind_cols(preds)
accuracy(results, truth = Species, estimate = .pred_class)
conf_mat(results, truth = Species, estimate = .pred_class)