# Packages
library(tidyverse)
library(dplyr)
library(stringr)
library(ggplot2)
library(patchwork)Multiple Regression
Chapter 7 Companion Exercise
This exercise is for you to work through while watching the lecture videos and reading the chapter on multiple regression.
Setup
The Data
We’ll work with data on 19 students (Taken directly from figure 7.2 in the book):
wtdat <- tribble(
~Name, ~Sex, ~Age, ~Height, ~Weight,
"Alfred", "M", 14, 69.0, 112.5,
"Alice", "F", 13, 56.5, 84.0,
"Barbara", "F", 13, 65.3, 98.0,
"Camella", "F", 14, 62.8, 102.5,
"Henry", "M", 14, 63.5, 102.5,
"Jamal", "M", 12, 57.3, 83.0,
"Jane", "F", 12, 59.8, 84.5,
"Janet", "F", 15, 62.5, 112.5,
"Jayden", "M", 13, 62.5, 84.0,
"John", "M", 12, 59.0, 99.5,
"Joyce", "F", 11, 51.3, 50.5,
"Judy", "F", 14, 64.3, 90.0,
"Louise", "F", 12, 56.3, 77.0,
"Mary", "F", 15, 66.5, 112.0,
"Philip", "M", 16, 72.0, 150.0,
"Robert", "M", 12, 64.8, 128.0,
"Sequan", "M", 15, 67.0, 133.0,
"Thomas", "M", 11, 57.5, 85.0,
"William", "M", 15, 66.5, 112.0
)Redundancy Between Predictors
Let’s specify four models predicting weight: mean only, height only, age only, and height + age:
m_mean <- lm(Weight ~ 1, data = wtdat)
m_height <- lm(Weight ~ Height, data = wtdat)
m_age <- lm(Weight ~ Age, data = wtdat)
m_heightage <- lm(Weight ~ Height + Age, data = wtdat)
SSE_mean <- sum(residuals(m_mean)^2)
SSE_h <- sum(residuals(m_height)^2)
SSE_a <- sum(residuals(m_age)^2)
SSE_ha <- sum(residuals(m_heightage)^2)
PRE_h <- 1 - SSE_h / SSE_mean
PRE_a <- 1 - SSE_a / SSE_mean
PRE_ha <- 1 - SSE_ha / SSE_mean
pre_tbl <- tibble(
ModelPredictors = factor(c("Height", "Age", "Height + Age"),
levels = c("Height", "Age", "Height + Age")),
PRE = c(PRE_h, PRE_a, PRE_ha)
)
# This will generate a table with the PRE for each model relative to a
# simple, mean-only Model C
print(pre_tbl)We can also ask for a summary of any of the specified models above, but I want to particularly call attention to the summary for the multiple regression model—the one in which we predict weight using BOTH height and age—so that you can see that it reproduces the estimates from the textbook. Please confirm for yourself.
summary(m_heightage)OPTIONAL Visualizing the Models
First some prep:
library(scatterplot3d)
wtdat <- wtdat |>
mutate(
yhat_age = predict(m_age, newdata = wtdat),
yhat_height = predict(m_height, newdata = wtdat),
yhat_ha = predict(m_heightage, newdata = wtdat)
)
ybar <- mean(wtdat$Weight)
mean_label <- "Mean-only model"Weight vs. Age
Now we visualize the model for weight predicted by age:
p_age <- ggplot(wtdat, aes(x = Age, y = Weight)) +
geom_point(size = 2) +
# Residuals to the regression line (red dotted) — from observed y to fitted yhat_age
geom_segment(aes(xend = Age, yend = yhat_age),
linetype = "dotted", color = "red") +
# Mean-only horizontal line for context
geom_hline(yintercept = ybar) +
# Simple regression fit (Weight ~ Age)
geom_smooth(method = "lm", se = FALSE) +
labs(
title = "Weight vs Age",
subtitle = paste("Dotted red: residuals to regression fit | Horizontal:", mean_label),
x = "Age", y = "Weight"
) +
theme_minimal(base_size = 12)
ggsave("V1_weight_vs_age_resid_to_reg.png", p_age, width = 8, height = 5, dpi = 300)
p_ageWeight vs. Height
And now weight by height:
p_height <- ggplot(wtdat, aes(x = Height, y = Weight)) +
geom_point(size = 2) +
# Residuals to the regression line (red dotted) — from observed y to fitted yhat_height
geom_segment(aes(xend = Height, yend = yhat_height),
linetype = "dotted", color = "red") +
# Mean-only horizontal line for context
geom_hline(yintercept = ybar) +
# Simple regression fit (Weight ~ Height)
geom_smooth(method = "lm", se = FALSE) +
labs(
title = "Weight vs Height",
subtitle = paste("Dotted red: residuals to regression fit | Horizontal:", mean_label),
x = "Height", y = "Weight"
) +
theme_minimal(base_size = 12)
ggsave("V1_weight_vs_height_resid_to_reg.png", p_height, width = 8, height = 5, dpi = 300)
p_heightMultiple Regression: The 3D Plane
Now we look at the multiple regression, predicting weight by height & age… but remember, we’re now moving from the land of simple regression to the land of multiple regression. Which means we no longer visualize as lines with slopes but as degrees of slopes for a plane in a multi-dimensional space… it’s a little intense but not as “welcome to the multiverse” as it might sound… look:
# Static 3D scatter
m_age_height <- lm(Weight ~ Age + Height, data = wtdat)
# First we make a 3D scatter plot (our data):
library(scatterplot3d) # Note you will likely need to install this package
s3d <- scatterplot3d(
x = wtdat$Age, y = wtdat$Height, z = wtdat$Weight,
pch = 16, color = "black",
xlab = "Age", ylab = "Height", zlab = "Weight",
main = "Multiple Regression: Weight ~ Age + Height"
)
# Then we draw the plane that corresponds to our multiple regression model's predictions:
s3d$plane3d(m_age_height, draw_polygon = TRUE,
polygon_args = list(border = NA, col = rgb(0.6, 0.6, 0.6, 0.25)))
# ...and we can once again visualize the residuals to this plane
wtdat$yhat_ah <- predict(m_age_height, newdata = wtdat)
p_fit <- s3d$xyz.convert(wtdat$Age, wtdat$Height, wtdat$yhat_ah)
p_obs <- s3d$xyz.convert(wtdat$Age, wtdat$Height, wtdat$Weight)
segments(x0 = p_fit$x, y0 = p_fit$y, x1 = p_obs$x, y1 = p_obs$y,
lty = "dotted", col = "red")Interactive 3D Version
Or we can also visualize as an interactive 3D version of the same:
library(plotly) # Once again, you will likely need to install
# Coefficients for the plane z = b0 + b1*Age + b2*Height
co <- coef(m_heightage)
# Grid for the plane
xg <- seq(min(wtdat$Age), max(wtdat$Age), length.out = 30)
yg <- seq(min(wtdat$Height), max(wtdat$Height), length.out = 30)
zg <- outer(xg, yg, function(a, h) co[1] + co["Age"]*a + co["Height"]*h)
# Fitted values for each observed point (to draw residuals)
wtdat$yhat_ha <- predict(m_heightage, newdata = wtdat)
# Build residual segments: (x, y, z_fit) -> (x, y, z_obs), with NA breaks
segs_x <- as.vector(rbind(wtdat$Age, wtdat$Age, rep(NA, nrow(wtdat))))
segs_y <- as.vector(rbind(wtdat$Height, wtdat$Height, rep(NA, nrow(wtdat))))
segs_z <- as.vector(rbind(wtdat$yhat_ha, wtdat$Weight, rep(NA, nrow(wtdat))))
plot_ly() %>%
# Points
add_trace(
type = "scatter3d", mode = "markers",
x = wtdat$Age, y = wtdat$Height, z = wtdat$Weight,
marker = list(size = 4),
name = "Data points"
) %>%
# Fitted plane (no 'mode' on surfaces)
add_surface(
x = xg, y = yg, z = zg,
showscale = FALSE, opacity = 0.5,
name = "Fitted plane"
) %>%
layout(
title = "Multiple Regression: Weight ~ Age + Height",
scene = list(
xaxis = list(title = "Age"),
yaxis = list(title = "Height"),
zaxis = list(title = "Weight")
),
legend = list(orientation = "h")
)Height vs. Age
One more visualization because I show it in the lecture video—examining the relationship between our two predictors, age and height:
m_height_on_age <- lm(Height ~ Age, data = wtdat)
wtdat <- wtdat |>
mutate(
ybar_height = mean(Height),
yhat_height_on_age = as.numeric(predict(m_height_on_age, newdata = wtdat))
)
# Plot: Height vs Age with residuals to regression line
p_height_by_age <- ggplot(wtdat, aes(x = Age, y = Height)) +
geom_point(size = 2) +
geom_segment(aes(xend = Age, yend = yhat_height_on_age),
linetype = "dotted", color = "red") +
# Mean-only horizontal line for Height
geom_hline(aes(yintercept = ybar_height)) +
# Regression line (Height ~ Age)
geom_smooth(method = "lm", se = FALSE) +
labs(
title = "Height vs Age",
subtitle = paste("Dotted red: residuals to regression fit | Horizontal:", mean_label),
x = "Age", y = "Height"
) +
theme_minimal(base_size = 12)
p_height_by_ageBlock Tests (Nested Models)
A little more to play around with regarding different types of model comparisons:
Example A: Does Adding {Age, Sex} to Height Help?
m_small_A <- lm(Weight ~ Height, data = wtdat)
m_big_A <- lm(Weight ~ Height + Age + Sex, data = wtdat)
anova_A <- anova(m_small_A, m_big_A) # partial/block F test
R2_smallA <- summary(m_small_A)$r.squared
R2_bigA <- summary(m_big_A)$r.squared
dR2_A <- R2_bigA - R2_smallA
cat("\n--- Block Test A: Add {Age, Sex} to Height ---\n")
print(anova_A)
cat("ΔR^2 (big - small):", round(dR2_A, 4), "\n")
cat("Interpretation: If the F-test is significant, {Age, Sex} add\n",
"incremental predictive information beyond Height.\n")Example B: Does Adding {Sex} to {Height + Age} Help?
m_small_B <- lm(Weight ~ Height + Age, data = wtdat)
m_big_B <- lm(Weight ~ Height + Age + Sex, data = wtdat)
anova_B <- anova(m_small_B, m_big_B)
R2_smallB <- summary(m_small_B)$r.squared
R2_bigB <- summary(m_big_B)$r.squared
dR2_B <- R2_bigB - R2_smallBManual Partial-F for Example A
SSE_small <- sum(residuals(m_small_A)^2)
SSE_big <- sum(residuals(m_big_A)^2)
df_small <- df.residual(m_small_A)
df_big <- df.residual(m_big_A)
num_df <- df_small - df_big
den_df <- df_big
F_manual <- ((SSE_small - SSE_big) / num_df) / (SSE_big / den_df)
cat("\nManual partial-F (should match anova):", round(F_manual, 4),
" with df =", num_df, "and", den_df, "\n")
anova_A
F_manualSingle-Predictor (1 df) Tests
m_full <- lm(Weight ~ Height + Age + Sex, data = wtdat)
summ_full <- summary(m_full)
print(coef(summ_full)) # compact view
cat("\nHere's how to interpret:\n",
"Height: expected change in Weight (units) per 1-inch increase, holding Age & Sex constant.\n",
"Age : expected change in Weight per 1-year increase, holding Height & Sex constant.\n",
"Sex : expected mean difference (M vs F) in Weight holding Height & Age constant.\n", sep = "")Confidence Intervals and Standardized Coefficients
I didn’t mention this in lecture but it’s in the reading. Interpretation of CIs is consistent with previous discussion around CIs—we’re just interpreting with “all else constant.”
95% Confidence Intervals
cat("\n--- 95% Confidence Intervals (unstandardized β) ---\n")
print(confint(m_full))Standardized Coefficients
Approach: standardize all variables, then fit the same model.
wtdat_std <- wtdat |>
mutate(
Weight_z = as.numeric(scale(Weight)),
Height_z = as.numeric(scale(Height)),
Age_z = as.numeric(scale(Age))
)
m_full_std <- lm(Weight_z ~ Height_z + Age_z, data = wtdat_std)
cat("\n--- Standardized coefficients (betas) ---\n")
print(coef(summary(m_full_std)))
# Compare to unstandardized:
m_unstandardized_full <- lm(Weight ~ Height + Age, data = wtdat)
cat("\n--- Unstandardized coefficients ---\n")
print(coef(summary(m_unstandardized_full)))
cat("\nNote: Standardized betas aid COMPARISON across predictors;\n",
"they do not replace theory and do not change significance.\n", sep = "")