Intermediate

Regression

Predict continuous outcomes using linear regression, regularized models, decision trees, random forests, and gradient boosting.

Linear Regression

R
# Base R: ordinary least squares via lm(), inspected with summary()
ols_fit <- lm(mpg ~ wt + hp + cyl, data = mtcars)
summary(ols_fit)

# Tidymodels (parsnip): the same model, declared as a spec and fit
library(tidymodels)
lm_fit <- linear_reg() |>
  set_engine("lm") |>
  fit(mpg ~ wt + hp + cyl, data = mtcars)
tidy(lm_fit)    # Coefficient estimates as a tidy tibble
glance(lm_fit)  # One-row tibble of model-level statistics

Ridge and Lasso Regression

R
# In glmnet specs, `penalty` is the amount of regularization (lambda) and
# `mixture` selects the penalty type: 0 = pure L2, 1 = pure L1, between = blend.

# Ridge regression (L2 penalty: mixture = 0)
ridge_spec <- linear_reg(penalty = 0.1, mixture = 0) |>
  set_engine("glmnet")

# Lasso regression (L1 penalty: mixture = 1)
lasso_spec <- linear_reg(penalty = 0.1, mixture = 1) |>
  set_engine("glmnet")

# Elastic net (mixture between 0 and 1 blends L1 and L2)
enet_spec <- linear_reg(penalty = 0.1, mixture = 0.5) |>
  set_engine("glmnet")

# Preprocessing recipe: glmnet penalizes all coefficients equally,
# so predictors should be normalized to a common scale first.
rec <- recipe(mpg ~ ., data = mtcars) |>
  step_normalize(all_numeric_predictors())

# Bundle recipe + model in a workflow and fit.
# Result named lasso_fit — not `fit` — so it doesn't mask the fit() generic.
wf <- workflow() |> add_recipe(rec) |> add_model(lasso_spec)
lasso_fit <- wf |> fit(data = mtcars)

Decision Tree Regression

R
# Regression tree: limit depth to 5 levels and require at least
# 10 observations in a node before attempting a split.
tree_spec <- decision_tree(tree_depth = 5, min_n = 10) |>
  set_engine("rpart") |>
  set_mode("regression")

tree_fit <- tree_spec |> fit(mpg ~ ., data = mtcars)

# Visualize the tree. Use extract_fit_engine() (the supported tidymodels
# accessor) to get the underlying rpart object instead of reaching into
# the tree_fit$fit internals.
library(rpart.plot)
rpart.plot(extract_fit_engine(tree_fit))

Random Forest Regression

R
# Random forest: 500 trees, 3 candidate predictors per split, and
# no further splitting of nodes with fewer than 5 observations.
# Impurity-based importance is recorded so it can be plotted below.
rf_spec <- rand_forest(trees = 500, mtry = 3, min_n = 5) |>
  set_mode("regression") |>
  set_engine("ranger", importance = "impurity")

rf_fit <- fit(rf_spec, mpg ~ ., data = mtcars)

# Plot variable importance scores
library(vip)
vip(rf_fit)

XGBoost Regression

R
# Gradient boosting with xgboost: 500 boosting rounds of depth-6 trees,
# a small learning rate (0.01), and a minimum loss reduction of 0.1
# required before a node is split.
xgb_spec <- boost_tree(trees = 500, tree_depth = 6,
                       learn_rate = 0.01, loss_reduction = 0.1) |>
  set_mode("regression") |>
  set_engine("xgboost")

xgb_fit <- fit(xgb_spec, mpg ~ ., data = mtcars)

Regression Metrics

| Metric    | Function | Interpretation                                           |
|-----------|----------|----------------------------------------------------------|
| RMSE      | `rmse()` | Root Mean Squared Error (lower is better)                |
| MAE       | `mae()`  | Mean Absolute Error (lower is better)                    |
| R-squared | `rsq()`  | Proportion of variance explained (closer to 1 is better) |
R
# Evaluate predictions on a held-out set.
# NOTE(review): `test` is assumed to be a data frame with an `mpg` column,
# created earlier (e.g. via initial_split()/testing()) — confirm upstream.
predictions <- rf_fit |> predict(test)
results <- test |> bind_cols(predictions)

# Compute RMSE, MAE, and R-squared in one call with a yardstick metric set
# instead of three separate calls; returns a single tidy tibble of metrics.
reg_metrics <- metric_set(rmse, mae, rsq)
reg_metrics(results, truth = mpg, estimate = .pred)