This tutorial demonstrates g-computation (also known as the g-formula) for estimating causal effects with time-varying confounders. We replicate the analysis from Naimi, Cole, and Kennedy (2017).
From our knowledge of the data-generating process, we know the true average causal effect to be 50.
library(dplyr)
library(knitr)
library(kableExtra)
The empirical setting is to treat HIV with a therapy regimen (A) in two time periods (t = 0, t = 1). We measure the time-varying confounder, HIV viral load (Z), at times t = 0 and t = 1. Our outcome is the CD4 count (cells/mm³) observed at t = 2.
Objective: Estimate the causal effect of always being treated vs. never being treated on HIV outcomes.
Variables:
Causal structure: Z0 → A0 → Z1 → A1 → Y
Target estimand: \(E[Y^{1,1}] - E[Y^{0,0}]\) (always treated vs. never treated)
# Aggregated data from Table 1 of Naimi et al. (2017)
<- data.frame(
data a0 = c(0, 0, 0, 0, 1, 1, 1, 1), # Treatment at time 0
z1 = c(0, 0, 1, 1, 0, 0, 1, 1), # Confounder at time 1
a1 = c(0, 1, 0, 1, 0, 1, 0, 1), # Treatment at time 1
y = c(87.288, 112.107, 119.654, 144.842,
105.282, 130.184, 137.720, 162.832), # Mean CD4 count
n = c(209271, 93779, 60657, 136293,
134781, 60789, 93903, 210527) # Number of subjects
)
kable(data,
caption = "Aggregated Study Data",
col.names = c("A0", "Z1", "A1", "Y (mean)", "N (count)")) %>%
kable_styling(bootstrap_options = c("striped", "hover"))
A0 | Z1 | A1 | Y (mean) | N (count) |
---|---|---|---|---|
0 | 0 | 0 | 87.288 | 209271 |
0 | 0 | 1 | 112.107 | 93779 |
0 | 1 | 0 | 119.654 | 60657 |
0 | 1 | 1 | 144.842 | 136293 |
1 | 0 | 0 | 105.282 | 134781 |
1 | 0 | 1 | 130.184 | 60789 |
1 | 1 | 0 | 137.720 | 93903 |
1 | 1 | 1 | 162.832 | 210527 |
cat("Total sample size:", sum(data$n), "subjects\n")
## Total sample size: 1e+06 subjects
cat("True causal effect (from data generation process): 50\n")
## True causal effect (from data generation process): 50
The nonparametric approach directly estimates the g-formula using empirical probabilities:
\[E[Y^{a_0, a_1}] = \sum_{z_1} E[Y | A_1 = a_1, Z_1 = z_1, A_0 = a_0] \cdot P(Z_1 = z_1 | A_0 = a_0)\]
# Calculate empirical probabilities P(Z1 | A0)
# Calculate empirical probabilities P(Z1 | A0)
<- data %>%
z1_probs group_by(a0, z1) %>%
summarise(count = sum(n), .groups = 'drop') %>%
group_by(a0) %>%
mutate(
total = sum(count),
prob = count / total
%>%
) select(a0, z1, prob) %>%
arrange(desc(a0), z1) # Order by A0 descending (1 first), then Z1 ascending
kable(z1_probs,
caption = "Empirical Probabilities P(Z1 | A0)",
col.names = c("A0", "Z1", "P(Z1 | A0)"),
digits = 4) %>%
kable_styling(bootstrap_options = c("striped", "hover"))
A0 | Z1 | P(Z1 | A0) |
---|---|---|
1 | 0 | 0.3911 |
1 | 1 | 0.6089 |
0 | 0 | 0.6061 |
0 | 1 | 0.3939 |
# Function to calculate potential outcome under intervention (a0, a1)
<- function(a0_val, a1_val) {
calculate_potential_outcome
# Get relevant probabilities P(Z1 | A0 = a0_val)
<- z1_probs %>% filter(a0 == a0_val)
probs
# Get outcome values E[Y | A0 = a0_val, Z1, A1 = a1_val]
<- data %>%
outcomes filter(a0 == a0_val, a1 == a1_val) %>%
select(z1, y)
# Combine and calculate weighted expectation
<- probs %>%
result left_join(outcomes, by = "z1") %>%
summarise(
potential_outcome = sum(prob * y),
.groups = 'drop'
%>%
) pull(potential_outcome)
return(result)
}
# Function to create detailed breakdown table with E[Y|Z1,A] and P(Z1)
<- function(a0_val, a1_val, intervention_name) {
create_detailed_table
# Get relevant probabilities P(Z1 | A0 = a0_val)
<- z1_probs %>% filter(a0 == a0_val)
probs
# Get outcome values E[Y | A0 = a0_val, Z1, A1 = a1_val]
<- data %>%
outcomes filter(a0 == a0_val, a1 == a1_val) %>%
group_by(z1) %>%
summarise(
expected_y = mean(y, na.rm = TRUE),
.groups = 'drop'
)
# Combine probabilities and outcomes
<- probs %>%
detailed_table left_join(outcomes, by = "z1") %>%
mutate(
intervention = intervention_name,
weighted_outcome = prob * expected_y
%>%
) select(
Intervention = intervention,
Z1 = z1,
`P(Z1)` = prob,
`E[Y|Z1,A]` = expected_y,
`P(Z1) * E[Y|Z1,A]` = weighted_outcome
)
return(detailed_table)
}
# Calculate potential outcomes
<- calculate_potential_outcome(a0_val = 1, a1_val = 1)
always_treated <- calculate_potential_outcome(a0_val = 0, a1_val = 0)
never_treated
# Create detailed breakdown tables
<- create_detailed_table(a0_val = 1, a1_val = 1, "Always treated (1,1)")
always_treated_table <- create_detailed_table(a0_val = 0, a1_val = 0, "Never treated (0,0)")
never_treated_table
# Combine detailed tables
<- rbind(always_treated_table, never_treated_table)
detailed_breakdown
# Display detailed breakdown table
kable(detailed_breakdown,
caption = "Detailed Breakdown: E[Y|Z1,A] and P(Z1) by Intervention",
digits = 4) %>%
kable_styling(bootstrap_options = c("striped", "hover")) %>%
pack_rows("Always treated (1,1)", 1, nrow(always_treated_table)) %>%
pack_rows("Never treated (0,0)", nrow(always_treated_table) + 1, nrow(detailed_breakdown))
a0 | Intervention | Z1 | P(Z1) | E[Y|Z1,A] | P(Z1) * E[Y|Z1,A] |
---|---|---|---|---|---|
Always treated (1,1) | |||||
1 | Always treated (1,1) | 0 | 0.3911 | 130.184 | 50.9202 |
1 | Always treated (1,1) | 1 | 0.6089 | 162.832 | 99.1419 |
Never treated (0,0) | |||||
0 | Never treated (0,0) | 0 | 0.6061 | 87.288 | 52.9053 |
0 | Never treated (0,0) | 1 | 0.3939 | 119.654 | 47.1317 |
# Summary table with totals
<- detailed_breakdown %>%
summary_table group_by(Intervention) %>%
summarise(
`Total E[Y(a, a)]` = sum(`P(Z1) * E[Y|Z1,A]`, na.rm = TRUE),
.groups = 'drop'
)
kable(summary_table,
caption = "Summary: Potential Outcomes by Intervention",
digits = 4) %>%
kable_styling(bootstrap_options = c("striped", "hover"))
Intervention | Total E[Y(a, a)] |
---|---|
Always treated (1,1) | 150.0621 |
Never treated (0,0) | 100.0370 |
# Calculate and display ATE
<- always_treated - never_treated
nonparam_ate cat("Nonparametric ATE:", round(nonparam_ate, 4), "\n")
## Nonparametric ATE: 50.0251
The parametric approach uses regression models to estimate each component of the g-formula.
# Fit models for each component (using sample weights!)
<- glm(z1 ~ a0,
z1_model family = binomial(),
data = data,
weights = n)
<- lm(y ~ a0 + z1 + a1 + a0*a1 + z1*a1,
y_model data = data,
weights = n)
# Display model fit
cat("Model summaries:\n\n")
## Model summaries:
cat("Z1 Model (P(Z1 | A0)):\n")
## Z1 Model (P(Z1 | A0)):
print(summary(z1_model)$coefficients)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.4309479 0.002894342 -148.8932 0
## a0 0.8734707 0.004095766 213.2619 0
cat("\nY Model (E[Y | A0, Z1, A1]):\n")
##
## Y Model (E[Y | A0, Z1, A1]):
print(summary(y_model)$coefficients)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.727925e+01 0.02611352 3.342301e+03 8.951769e-08
## a0 1.801633e+01 0.03697234 4.872921e+02 4.211322e-06
## z1 3.240493e+01 0.03983439 8.134912e+02 1.511100e-06
## a1 2.485141e+01 0.04407823 5.638024e+02 3.145894e-06
## a0:a1 4.970271e-04 0.05220134 9.521347e-03 9.932675e-01
## z1:a1 2.901257e-01 0.05628599 5.154492e+00 3.563828e-02
cat("\nY Model R²:", round(summary(y_model)$r.squared, 4), "\n")
##
## Y Model R²: 1
# Simulation-based g-computation
set.seed(123)
<- 100000
n_sim
# Function to simulate potential outcomes under intervention
<- function(a0_intervention, a1_intervention, n_sim) {
simulate_intervention
# Generate Z1 under the intervention A0 = a0_intervention
<- rbinom(n = n_sim,
z1_sim size = 1,
prob = predict(z1_model,
newdata = data.frame(a0 = a0_intervention),
type = "response"))
# Predict Y under the full intervention (A0, A1) and simulated Z1
<- predict(y_model,
y_sim newdata = data.frame(
a0 = a0_intervention,
z1 = z1_sim,
a1 = a1_intervention
))
return(mean(y_sim))
}
# Simulate potential outcomes
<- simulate_intervention(a0_intervention = 1,
always_treated_param a1_intervention = 1,
n_sim = n_sim)
<- simulate_intervention(a0_intervention = 0,
never_treated_param a1_intervention = 0,
n_sim = n_sim)
<- always_treated_param - never_treated_param
param_ate
# Results
<- data.frame(
param_results Intervention = c("Always treated (1,1)", "Never treated (0,0)"),
Potential_Outcome = c(always_treated_param, never_treated_param)
)
kable(param_results,
caption = "Parametric G-Computation Results",
digits = 4) %>%
kable_styling(bootstrap_options = c("striped", "hover"))
Intervention | Potential_Outcome |
---|---|
Always treated (1,1) | 150.0866 |
Never treated (0,0) | 100.0555 |
cat("Parametric ATE:", round(param_ate, 4), "\n")
## Parametric ATE: 50.031
# Explore other intervention scenarios
<- data.frame(
other_interventions Scenario = c("Treated at time 0 only", "Treated at time 1 only", "Natural course"),
A0 = c(1, 0, "Natural"),
A1 = c(0, 1, "Natural"),
Potential_Outcome = c(
simulate_intervention(1, 0, n_sim),
simulate_intervention(0, 1, n_sim),
mean(predict(y_model, newdata = data.frame(
a0 = sample(c(0,1), n_sim, replace = TRUE),
z1 = rbinom(n_sim, 1, 0.5), # Natural Z1 distribution
a1 = rbinom(n_sim, 1, 0.5) # Natural A1 distribution
)))
)
)
kable(other_interventions,
caption = "Additional Intervention Scenarios",
digits = 4) %>%
kable_styling(bootstrap_options = c("striped", "hover"))
Scenario | A0 | A1 | Potential_Outcome |
---|---|---|---|
Treated at time 0 only | 1 | 0 | 125.0354 |
Treated at time 1 only | 0 | 1 | 125.0488 |
Natural course | Natural | Natural | 124.8908 |
# Compare all methods
<- data.frame(
comparison Method = c("Nonparametric G-computation",
"Parametric G-computation",
"True Effect"),
ATE = c(round(nonparam_ate, 4),
round(param_ate, 4),
50.0),
Difference_from_Truth = c(round(nonparam_ate - 50, 4),
round(param_ate - 50, 4),
0)
)
kable(comparison,
caption = "Method Comparison",
digits = 4) %>%
kable_styling(bootstrap_options = c("striped", "hover"))
Method | ATE | Difference_from_Truth |
---|---|---|
Nonparametric G-computation | 50.0251 | 0.0251 |
Parametric G-computation | 50.0310 | 0.0310 |
True Effect | 50.0000 | 0.0000 |
# Model validation: Check if parametric models recover empirical probabilities
cat("\nModel Validation:\n")
##
## Model Validation:
cat("Empirical P(Z1=1|A0=0):", round(z1_probs$prob[z1_probs$a0==0 & z1_probs$z1==1], 4), "\n")
## Empirical P(Z1=1|A0=0): 0.3939
cat("Model P(Z1=1|A0=0): ", round(predict(z1_model, data.frame(a0=0), type="response"), 4), "\n")
## Model P(Z1=1|A0=0): 0.3939
cat("Empirical P(Z1=1|A0=1):", round(z1_probs$prob[z1_probs$a0==1 & z1_probs$z1==1], 4), "\n")
## Empirical P(Z1=1|A0=1): 0.6089
cat("Model P(Z1=1|A0=1): ", round(predict(z1_model, data.frame(a0=1), type="response"), 4), "\n")
## Model P(Z1=1|A0=1): 0.6089
Naimi, A. I., Cole, S. R., & Kennedy, E. H. (2017). An introduction to g methods. International Journal of Epidemiology, 46(2), 756-762.