The causalPAF package contains a suite of functions for causal analysis calculations of population attributable fractions (PAF) given a causal diagram which apply both:
Results are presentable in both table and plot format.
You can install the released version of causalPAF from CRAN with:
install.packages("causalPAF")
And the development version from GitHub with:
# install.packages("devtools")
::install_github("MauriceOConnell/causalPAF") devtools
You start with causalPAF by supplying data and a causal directed acyclic graph (DAG). In the R code below, the DAG is defined in the variable ‘in_out’. You can then calculate causal population attributable fractions (PAFs) using functions such as causalPAFplot() for pathway specific PAFs (PS-PAF) with bootstrapped confidence intervals; or the pointEstimate() function if only point estimates (with no confidence intervals) of the PS-PAF are required. Sequential PAFs are calculated using the sequential_PAF() function.
The ‘strokedata’ data within the causalPAF package is a fictional case control dataset #containing key causal and modifiable risk factors for stroke. The data is restricted to #ischemic stroke patients and their matched controls according to age, gender and region.
library(causalPAF)
head(strokedata)
#> regionnn7 case esex eage htnadmbp nevfcur global_stress2 whrs2tert phys
#> 1 1 0 1 61 1 1 1 3 1
#> 2 1 0 1 60 1 1 2 2 1
#> 3 1 0 2 81 1 1 1 2 2
#> 4 1 0 2 75 1 1 1 3 2
#> 5 1 0 1 60 1 2 1 2 1
#> 6 1 0 1 73 1 1 1 1 1
#> alcohfreqwk dmhba1c2 cardiacrfcat ahei3tert apob_apoatert subeduc moteduc
#> 1 2 1 1 2 3 3 2
#> 2 1 2 1 3 2 3 3
#> 3 2 1 2 1 3 5 3
#> 4 1 1 1 3 1 3 3
#> 5 1 1 1 3 2 5 2
#> 6 2 1 1 3 1 5 5
#> fatduc subhtn whr apob_apoa weights
#> 1 2 1 0.8806173 0.6603774 0.9965
#> 2 4 2 0.9051095 0.7500000 0.9965
#> 3 5 2 1.0734177 0.8540146 0.9965
#> 4 4 2 1.0635593 1.3809524 0.9965
#> 5 3 1 0.8173077 0.7132353 0.9965
#> 6 5 1 0.8232044 0.8161765 0.9965
Example R code, calculating pathway specific PAFs (PS-PAFs) using causalPAFplot() and pointEstimate() and ‘strokedata’ are shown below.
library(causalPAF)
<- strokedata
stroke_reduced
# The data should contain a column of weights for case control matching.
# strokedata$weights
# Weigths are not needed for cohort/cross sectional designs.
# Next, define the causal structure or directed acyclic graph (DAG) of the causal Bayesian
# network defined by the data. We list the parents of each exposure or risk factor or outcome
# in a vector as follows:
# Note it is important that the order at which the variables are defined is such that all
# parents of that variable are defined before it.
<- c("subeduc","moteduc","fatduc")
in_phys <- c("subeduc","moteduc","fatduc")
in_ahei <- c("subeduc","moteduc","fatduc")
in_nevfcur <- c("subeduc","moteduc","fatduc")
in_alcohfreqwk <- c("subeduc","moteduc","fatduc")
in_global_stress2 <- c("subeduc","moteduc","fatduc","phys","ahei3tert","nevfcur","alcohfreqwk",
in_subhtn "global_stress2")
<- c("subeduc","moteduc","fatduc","phys","ahei3tert","nevfcur","alcohfreqwk",
in_apob_apoa "global_stress2")
<- c("subeduc","moteduc","fatduc","phys","ahei3tert","nevfcur","alcohfreqwk",
in_whr "global_stress2")
# Note splines can be fitted within the causal structure as shown below especially if splines
# are to be used in the fitted models.
# It is important that splines of parent variables are "typed" or "spelt" consistently
# (including spaces) throughout as causalPAF can fit models automatically provided variables are
# spelt consistently. Also if a parent variable is a spline it should be defined in spline
# format in all occurences of the parent variable.
<- c("subeduc","moteduc","fatduc","phys","ahei3tert","nevfcur","alcohfreqwk",
in_cardiacrfcat "global_stress2",
"ns(apob_apoa,knots=quantile(apob_apoa,c(.25,.5,.75)),Boundary.knots=quantile(apob_apoa,c(.001,.95)))",
"ns(whr,df=5)","subhtn")
<- c("subeduc","moteduc","fatduc","phys","ahei3tert","nevfcur","alcohfreqwk",
in_dmhba1c2 "global_stress2",
"ns(apob_apoa,knots=quantile(apob_apoa,c(.25,.5,.75)),Boundary.knots=quantile(apob_apoa,c(.001,.95)))",
"ns(whr,df=5)","subhtn")
<- c("subeduc","moteduc","fatduc","phys","ahei3tert","nevfcur","alcohfreqwk",
in_case "global_stress2",
"ns(apob_apoa,knots=quantile(apob_apoa,c(.25,.5,.75)),Boundary.knots=quantile(apob_apoa,c(.001,.95)))",
"ns(whr,df=5)","subhtn","cardiacrfcat","dmhba1c2")
# Then we define a two dimensional list consisting of
# 1. inlist i.e. a list of the parents of each variable of interest corresponding to its column
# name in the data. Splines should be included here if they are to be modelled as splines.
# 2. outlist i.e. a list of each variable of interest corresponding to its column name in the
# data. Splines should not be input here, only the column names of the variables of interest in
# the data.
# Again the order is such that each variable is defined after all its parents.
<- list(inlist=list(in_phys,in_ahei,in_nevfcur,in_alcohfreqwk,in_global_stress2,
in_out
in_subhtn,in_apob_apoa,in_whr,in_cardiacrfcat,in_dmhba1c2,in_case),outlist=c("phys","ahei3tert","nevfcur","alcohfreqwk","global_stress2","subhtn",
"apob_apoa","whr","cardiacrfcat","dmhba1c2","case"))
# If splines are to be used for variables listed in in_out$outlist, then the splines should be
# defined in the same order as variables appear in in_out$outlist as follows. It is necessary to
# list variables in in_out$outlist without splines if no spline is to be applied.
# It is important that Splines_outlist is defined in the following format
# list(c("splinename1","splinename2","splinename3")) for the package to be applied correctly.
# And Splines_outlist should not be an empty list(). If there are no splines it should be
# defined the same as in_out[[2]] and in the same order as variables defined in_out[[2]].
= list( c("phys","ahei3tert","nevfcur","alcohfreqwk","global_stress2","subhtn",
Splines_outlist "ns(apob_apoa,knots=quantile(apob_apoa,c(.25,.5,.75)),Boundary.knots=quantile(apob_apoa,c(.001,.95)))",
"ns(whr,df=5)","cardiacrfcat","dmhba1c2","case") )
# To fit these models to case control data, one needs to perform weighted maximum-likelihood
# estimation to imitate estimation using a random sample from the population. We chose weights
# of 0.0035 (for each case) and 0.9965 (for each control), reflective of a yearly incidence of
# first ischemic stroke of 0.35%, or 3.5 strokes per 1,000 individuals. These weights were
# chosen according to average incidences across country, age, group and gender within
# INTERSTROKE according to the global burden of disease.
<- rep(1,nrow(stroke_reduced))
w $case==0] <- 0.9965
w[stroke_reduced$case==1] <- 0.0035
w[stroke_reduced
# It is important to assign stroke_reduced$weights to the updated weights defined in w.
# Otherwise if stroke_reduced$weights <- w is not set, the alternative weights supplied in the
# fictional data will be used. In this case, we want to use weigths as defined in w.
$weights <- w
stroke_reduced
#The checkMarkovDAG() function in the causalPAF package should be used before running
# causalPAFplot() to ensure:
#1. The causal Markov condition holds for the causal structure defined in the variable in_out.
#2. The variables in in_out are listed in the order so that no variable is defined before a
# parent or direct cause. Note: if this order does not hold, checkMarkovDAG() will automatically
# reorder the variables in, in_out, provided it is a Markov DAG.
#The causal analysis requires that the causal structure is a Markov DAG. The Causal Markov (CM)
# condition states that, conditional on the set of all its direct causes, a node is independent
# of all variables which are not direct causes or direct effects of that node. In the event that
# the structure of a Bayesian network accurately depicts causality, the two conditions are
# equivalent. However, a network may accurately embody the Markov condition without depicting
# causality, in which case it should not be assumed to embody the causal Markov condition.
# in_out is as defined above and input into this code.
if(checkMarkovDAG(in_out)$IsMarkovDAG & !checkMarkovDAG(in_out)$Reordered){
print("Your in_out DAG is a Markov DAG.")
else if( checkMarkovDAG(in_out)$IsMarkovDAG & checkMarkovDAG(in_out)$Reordered ) {
}
<- checkMarkovDAG(in_out)[[2]]
in_out
print("Your in_out DAG is a Markov DAG.The checkMarkovDAG function has reordered your
in_out list so that all parent variables come before descendants.")
else{ print("Your ``in_out'' list is not a Bayesian Markov DAG so the methods in the
} causalPAF package cannot be applied for non-Markov DAGs.")}
# The pointEstimate() function evaluates Point Estimates for Total PAF, Direct PAF, Indirect PAF
# and Path Specific PAF for a user inputted number of integral simulations. There is no bootstap
# applied in this fucntion.
# Since bootstraps are not applied, the pointEstimate() function will run quicker than the
# alternative causalPAFplot() function which calculates bootstrap estimates which can take
# longer to run.
pointEstimate(dataframe = stroke_reduced,
exposure="phys",
mediator=c("subhtn","apob_apoa","whr"),
response="case",
response_model_mediators = list(),
response_model_exposure = list(),
in_outArg = in_out,
Splines_outlist = Splines_outlist,
splinesDefinedIn_in_outDAG = TRUE,
model_listArg = list(),
weights = w,
NumSimulation = 3,
addCustom = TRUE,
custom = "regionnn7*ns(eage,df=5)+esex*ns(eage,df=5)")
# The causalPAFplot() function will perform Pathway-Specific Population Attributable Fraction
# (PS-PAF) calculations and output results based on an exposure, mediators and response input
# by the user according to the columns names of these variables defined in the dataframe.
# Setting model_listArg, response_model_mediators and response_model_exposure by default to an
# empty list will instruct the causalPAF package to fit these models automatically based on the
# causal DAG supplied in the in _outArg. Alternatively the user can supply their custom fitted,
# model_listpop, response_model_mediators and response_model_exposure which should be consistent
# with the causal structure.
# Note we fit a custom interaction for the outcome (or case or response) regression
# ( custom = "regionnn7*ns(eage,df=5)+esex*ns(eage,df=5)") ). Care should be taken that the
# customised regression should not contain variables that might affect the causal interpretation
# of the regression e.g. in this case we have used baseline confounders (i.e. regionn, eage and
# esex) with interactions and splines. In general, using baseline confounders in custom should
# not affect any causal interpretations whereas using variables far ``downstream'' might block
# causal pathways. The user is required to apply discretion in using ``addCustom'' and
# ``Custom'' in ensuring a causal interpretation remains. If no customisation is required the
# user can input addCustom = FALSE and custom = "" which is the default setting.
# Finally we call the causalPAFplot function for the pathway-specific PAF calculations as
# follows:
causalPAFplot(dataframe = stroke_reduced,
exposure="phys",
mediator=c("subhtn","apob_apoa","whr"),
response="case",
response_model_mediators = list(),
response_model_exposure = list(),
in_outArg = in_out,
Splines_outlist = Splines_outlist,
splinesDefinedIn_in_outDAG = TRUE,
model_listArg = list(),
weights = w,
NumBootstrap = 2,
NumSimulation = 2,
plot = "bar",
fill= "skyblue",
colour="orange",
addCustom = TRUE,
custom = "regionnn7*ns(eage,df=5)+esex*ns(eage,df=5)")
Sequential PAFs can be calculated using the sequential_PAF() function. Example R code, using ‘strokedata’ is shown below.
<- strokedata
stroke_reduced
<- c("subeduc","moteduc","fatduc")
in_phys <- c("subeduc","moteduc","fatduc")
in_ahei <- c("subeduc","moteduc","fatduc")
in_nevfcur <- c("subeduc","moteduc","fatduc")
in_alcohfreqwk <- c("subeduc","moteduc","fatduc")
in_global_stress2 <- c("subeduc","moteduc","fatduc","phys","ahei3tert","nevfcur","alcohfreqwk",
in_htnadmbp "global_stress2")
<- c("subeduc","moteduc","fatduc","phys","ahei3tert","nevfcur","alcohfreqwk",
in_apob_apoatert "global_stress2")
<- c("subeduc","moteduc","fatduc","phys","ahei3tert","nevfcur","alcohfreqwk",
in_whrs2tert "global_stress2")
<- c("subeduc","moteduc","fatduc","phys","ahei3tert","nevfcur","alcohfreqwk",
in_cardiacrfcat "global_stress2", "apob_apoatert","whrs2tert","htnadmbp")
<- c("subeduc","moteduc","fatduc","phys","ahei3tert","nevfcur","alcohfreqwk",
in_dmhba1c2 "global_stress2", "apob_apoatert","whrs2tert","htnadmbp")
<- c("subeduc","moteduc","fatduc","phys","ahei3tert","nevfcur","alcohfreqwk",
in_case "global_stress2", "apob_apoatert","whrs2tert","htnadmbp","cardiacrfcat","dmhba1c2")
<- list(inlist=list(in_phys,in_ahei,in_nevfcur,in_alcohfreqwk,in_global_stress2,
in_out
in_htnadmbp, in_apob_apoatert,in_whrs2tert,in_cardiacrfcat,
in_dmhba1c2,in_case),outlist=c("phys","ahei3tert","nevfcur","alcohfreqwk","global_stress2",
"htnadmbp","apob_apoatert", "whrs2tert","cardiacrfcat",
"dmhba1c2","case"))
if(checkMarkovDAG(in_out)$IsMarkovDAG & !checkMarkovDAG(in_out)$Reordered){
print("Your in_out DAG is a Markov DAG.")
else if( checkMarkovDAG(in_out)$IsMarkovDAG & checkMarkovDAG(in_out)$Reordered ) {
}
<- checkMarkovDAG(in_out)[[2]]
in_out
print("Your in_out DAG is a Markov DAG.The checkMarkovDAG function has reordered your
in_out list so that all parent variables come before descendants.")
else{ print("Your ``in_out'' list is not a Bayesian Markov DAG so the methods in the
} causalPAF package cannot be applied for non-Markov DAGs.")}
<- rep(1,nrow(stroke_reduced))
w $case==0] <- 0.9965
w[stroke_reduced$case==1] <- 0.0035
w[stroke_reduced
$weights <- w
stroke_reduced
<- sequential_PAF( dataframe = stroke_reduced,
sequentialPAF model_list_var = list(),
weights = w,
in_outDAG = in_out,
response = "case",
NumOrderRiskFactors = 3,
addCustom = TRUE,
custom = "regionnn7*ns(eage,df=5)+esex*ns(eage,df=5)" )
$SAF_summary
sequentialPAF