3 Chapter 13: Standardization and the Parametric G-Formula

This is the code for Chapter 13

library(tidyverse)
library(haven)
library(broom)
library(boot)

Instead of re-cleaning the NHEFS data we used in Chapter 12, we will load it with the cidata package, which contains all the data we need for this chapter.

# contains `nhefs`, `nhefs_codebook`, and `greek_data`, as well
library(cidata)

nhefs_complete
## # A tibble: 1,566 x 78
##       id numerator_sex numerator  seqn  qsmk death yrdth modth dadth   sbp
##    <int>         <dbl>     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1     1         0.711     0.743   233     0     0    NA    NA    NA   175
##  2     2         0.711     0.743   235     0     0    NA    NA    NA   123
##  3     3         0.772     0.743   244     0     0    NA    NA    NA   115
##  4     4         0.711     0.743   245     0     1    85     2    14   148
##  5     5         0.711     0.743   252     0     0    NA    NA    NA   118
##  6     6         0.772     0.743   257     0     0    NA    NA    NA   141
##  7     7         0.772     0.743   262     0     0    NA    NA    NA   132
##  8     8         0.772     0.743   266     0     0    NA    NA    NA   100
##  9     9         0.711     0.743   419     0     1    84    10    13   163
## 10    10         0.711     0.743   420     0     1    86    10    17   184
## # … with 1,556 more rows, and 68 more variables: dbp <dbl>, sex <fct>,
## #   age <dbl>, race <fct>, income <dbl>, marital <dbl>, school <dbl>,
## #   education <fct>, ht <dbl>, wt71 <dbl>, wt82 <dbl>, wt82_71 <dbl>,
## #   birthplace <dbl>, smokeintensity <dbl>, smkintensity82_71 <dbl>,
## #   smokeyrs <dbl>, asthma <dbl>, bronch <dbl>, tb <dbl>, hf <dbl>,
## #   hbp <dbl>, pepticulcer <dbl>, colitis <dbl>, hepatitis <dbl>,
## #   chroniccough <dbl>, hayfever <dbl>, diabetes <dbl>, polio <dbl>,
## #   tumor <dbl>, nervousbreak <dbl>, alcoholpy <dbl>, alcoholfreq <dbl>,
## #   alcoholtype <dbl>, alcoholhowmuch <dbl>, pica <dbl>, headache <dbl>,
## #   otherpain <dbl>, weakheart <dbl>, allergies <dbl>, nerves <dbl>,
## #   lackpep <dbl>, hbpmed <dbl>, boweltrouble <dbl>, wtloss <dbl>,
## #   infection <dbl>, active <fct>, exercise <fct>, birthcontrol <dbl>,
## #   pregnancies <dbl>, cholesterol <dbl>, hightax82 <dbl>, price71 <dbl>,
## #   price82 <dbl>, tax71 <dbl>, tax82 <dbl>, price71_82 <dbl>,
## #   tax71_82 <dbl>, censored <dbl>, older <dbl>, .fitted <dbl>,
## #   .resid <dbl>, .std.resid <dbl>, .hat <dbl>, .sigma <dbl>,
## #   .cooksd <dbl>, wts <dbl>, swts <dbl>, swts_sex <dbl>

3.1 Program 13.1

The parametric G-Formula uses parametric models, like linear regression, to predict mean risk under different counterfactual outcomes. To do so, we need to fit a model with the observed data. Like in Chapter 12, we’ll fit a linear regression model with wt82_71 as the outcome and qmk as the exposure, but we’ll also control for the covariates directly instead of via weights from a propensity score model. In this model, we’ll also include an interaction term between smoking cessation and smoking intensity, I(qsmk * smokeintensity).

standardized_model <- lm(
  wt82_71 ~ qsmk + I(qsmk * smokeintensity) + smokeintensity + 
    I(smokeintensity^2) + sex + race + age + I(age^2) + education + smokeyrs + 
    I(smokeyrs^2) + exercise + active + wt71 + I(wt71^2), 
  data = nhefs_complete
)

A model can be used to predict any combination of covariates. For example, to compute the predicted effects of having the covariates set to the same values as the participant with seqn == 24770, we can pass that row to augment().

# fit using the values of the covariates that this participant has
standardized_model %>% 
  augment(newdata = filter(nhefs_complete, seqn == 24770)) %>% 
  select(.fitted) %>% 
  knitr::kable(digits = 2)
.fitted
0.34

To do so on all the data, we just need to use augment() directly.

# predict on all combonations of covariates present in the data
standardized_model %>% 
  augment() %>% 
  summarise_each(funs(mean, min, max), .fitted) %>% 
  knitr::kable(digits = 2)
mean min max
2.64 -10.88 9.88

3.2 Program 13.2

The data from Table 2.2 is available as greek_data in the cidata package.

knitr::kable(greek_data)
name l a y
Rheia 0 0 0
Kronos 0 0 1
Demeter 0 0 0
Hades 0 0 0
Hestia 0 1 0
Poseidon 0 1 0
Hera 0 1 0
Zeus 0 1 1
Artemis 1 0 1
Apollo 1 0 1
Leto 1 0 0
Ares 1 1 1
Athena 1 1 1
Hephaestus 1 1 1
Aphrodite 1 1 1
Cyclope 1 1 1
Persephone 1 1 1
Hermes 1 1 0
Hebe 1 1 0
Dionysus 1 1 0

The first step in the parametric G-Formula is to fit a model using the observed data. We’ll use linear regression with an interaction term between a and l.

model_greeks <- lm(y ~ a * l, data = greek_data)

Instead of predicting on the observed data, we’ll clone our data twice. In the first clone, we’ll set everyone to be untreated (a = 0), and in the second, we’ll set everyone to be treated (a = 1).

#  set all participants to have a = 0
untreated_data <- greek_data %>% 
  mutate(a = 0)

#  set all participants to have a = 1
treated_data <- greek_data %>% 
  mutate(a = 1)

Then, we’ll use these data sets to get predicted values of y for each participant under both counterfactual exposures.

#  predict under the data where everyone is untreated
predicted_untreated <- model_greeks %>% 
  augment(newdata = untreated_data) %>%
  select(untreated = .fitted)

#  predict under the data where everyone is treated
predicted_treated <- model_greeks %>% 
  augment(newdata = treated_data) %>%
  select(treated = .fitted)

The average treatment effect is simply the difference in the mean of the predicted values. Here, there is no difference between the two groups.

#  join the two sets of predictions together
bind_cols(predicted_untreated, predicted_treated) %>% 
  summarise(
    mean_treated = mean(treated),
    mean_untreated = mean(untreated),
    difference = mean_treated - mean_untreated
  )
## # A tibble: 1 x 3
##   mean_treated mean_untreated difference
##          <dbl>          <dbl>      <dbl>
## 1        0.500            0.5  -1.67e-16

3.3 Program 13.3

We’ll follow the same premise to use the parametric G-formula for the NHEFS data. First, we’ll clone nhefs_complete twice: one for where everyone quit smoking and one where everyone kept smoking. Then, we’ll predict the change in weight for both groups using the model we fit with the observed data in Program 13.1, standardized_model.

kept_smoking <- nhefs_complete %>% 
  mutate(qsmk = 0)

quit_smoking <- nhefs_complete %>% 
  mutate(qsmk = 1)

predicted_kept_smoking <- standardized_model %>% 
  augment(newdata = kept_smoking) %>%
  select(kept_smoking = .fitted)

predicted_quit_smoking <- standardized_model %>% 
  augment(newdata = quit_smoking) %>%
  select(quit_smoking = .fitted)

Again, average treatment effect is the difference in the mean of the predicted values.

bind_cols(predicted_kept_smoking, predicted_quit_smoking) %>% 
  summarise(
    mean_quit_smoking = mean(quit_smoking),
    mean_kept_smoking = mean(kept_smoking),
    difference = mean_quit_smoking - mean_kept_smoking
  )
## # A tibble: 1 x 3
##   mean_quit_smoking mean_kept_smoking difference
##               <dbl>             <dbl>      <dbl>
## 1              5.27              1.76       3.52

3.4 Program 13.4

To get valid confidence intervals for the estimated average treatment effect, we need to use the bootstrap. Like in Chapter 12, we’ll write a function and pass it to the boot() function. We need to refit standardized_model using the bootstrapped data for each replication.

fit_gformula <- function(data, indices) {
  #  resample data set
  .df <- data[indices, ]
  
  #  fit the standardized regression model using the resampled observed data
  standardized_model <- lm(
    wt82_71 ~ qsmk + I(qsmk * smokeintensity) + smokeintensity + 
      I(smokeintensity^2) + sex + race + age + I(age^2) + education + smokeyrs + 
      I(smokeyrs^2) + exercise + active + wt71 + I(wt71^2), 
    data = .df
  )
  
  #  clone the data and set each to given exposure level
  kept_smoking <- nhefs_complete %>% 
    mutate(qsmk = 0)
  
  quit_smoking <- nhefs_complete %>% 
    mutate(qsmk = 1)
  
  #  predict on the cloned data
  predicted_kept_smoking <- standardized_model %>% 
    augment(newdata = kept_smoking) %>%
    select(kept_smoking = .fitted)
  
  predicted_quit_smoking <- standardized_model %>% 
    augment(newdata = quit_smoking) %>%
    select(quit_smoking = .fitted)
  
  #  summarize the mean difference and pull from data frame
  bind_cols(predicted_kept_smoking, predicted_quit_smoking) %>% 
    summarise(
      mean_quit_smoking = mean(quit_smoking),
      mean_kept_smoking = mean(kept_smoking),
      difference = mean_quit_smoking - mean_kept_smoking
    ) %>% 
    pull(difference)
}

As with Chapter 12, we’ll use 2000 replications and bias-corrected confidence intervals, then pass the results to tidy().

bootstrapped_gformula <- boot(nhefs_complete, fit_gformula, R = 2000)

bootstrapped_gformula %>% 
  tidy(conf.int = TRUE, conf.meth = "bca")
## # A tibble: 1 x 5
##   statistic      bias std.error conf.low conf.high
##       <dbl>     <dbl>     <dbl>    <dbl>     <dbl>
## 1      3.52 -0.000109     0.490     2.61      4.49