rquery::extend_se()
and rquery::extend()
each automatically partition a sequence of assignments so that no
statement is using any value created in the same partition element or
group. This is to eliminate potentially dangerous ambiguity in
statements.
For such a partition: the evaluation result does not depend on the
order of execution of the statements in each group (as they are all
independent of each other’s left-hand-sides). A no-dependency small
number of groups partition is very helpful when executing
expressions on SQL
based data interfaces (such as
Apache Spark
).
The method used to partition expressions is to scan the remaining expressions in order taking any that: have all their values available from earlier groups, do not use a value formed in the current group, and do not overwrite a value formed in the current group.
This partitioning method ensures safe and correct results, and can lead to far fewer groups (and much more efficient queries) than the straightforward method of breaking up the sequence of expressions at each new-value use.
Here is a non-trivial database based example (demonstrating methods
that might be used in big data work such as with Spark
). We
are going to assign pairs of items to complimentary treatment (“T”) and
control (“C”) groups based on already populated pseudo-random numbers
(pre-populating the pseudo-random numbers is to avoid known
issues in using rand()
in RSQlite
, and
would not be necessary with other databases).
run_vignette <- requireNamespace("DBI", quietly = TRUE) && requireNamespace("RSQLite", quietly = TRUE)
Loading required package: wrapr
# example data
set.seed(3463)
d <- data.frame(id = seq_len(4))
for(group in c('a', 'b', 'c', 'd', 'e')) {
d[[paste0("rand_", group)]] <- runif(nrow(d))
}
my_db <- DBI::dbConnect(RSQLite::SQLite(),
":memory:")
d1 <- rq_copy_to(my_db, "example_table", d)
d1 %.>%
to_sql(., my_db) %.>%
DBI::dbGetQuery(my_db, .) %.>%
knitr::kable(.)
id | rand_a | rand_b | rand_c | rand_d | rand_e |
---|---|---|---|---|---|
1 | 0.8438177 | 0.9459773 | 0.2941489 | 0.1054046 | 0.3038159 |
2 | 0.9045364 | 0.4839231 | 0.4654982 | 0.6617276 | 0.9056346 |
3 | 0.5496617 | 0.6112306 | 0.6989960 | 0.6536909 | 0.1683751 |
4 | 0.6545816 | 0.6593733 | 0.9678277 | 0.8316179 | 0.0597492 |
# design the experiment
plan <- d1 %.>%
extend(.,
choice_a = rand_a>=0.5,
a_1 = ifelse(choice_a,
'T',
'C'),
a_2 = ifelse(choice_a,
'C',
'T'),
choice_b = rand_b>=0.5,
b_1 = ifelse(choice_b,
'T',
'C'),
b_2 = ifelse(choice_b,
'C',
'T'),
choice_c = rand_c>=0.5,
c_1 = ifelse(choice_c,
'T',
'C'),
c_2 = ifelse(choice_c,
'C',
'T'),
choice_d = rand_d>=0.5,
d_1 = ifelse(choice_d,
'T',
'C'),
d_2 = ifelse(choice_d,
'C',
'T'),
choice_e = rand_e>=0.5,
e_1 = ifelse(choice_e,
'T',
'C'),
e_2 = ifelse(choice_e,
'C',
'T')
) %.>%
select_columns(.,
qc(id,
a_1, a_2, b_1, b_2,
c_1, c_2, d_1, d_2,
e_1, e_2))
cat(format(plan))
mk_td("example_table", c(
"id",
"rand_a",
"rand_b",
"rand_c",
"rand_d",
"rand_e")) %.>%
extend(.,
choice_a := rand_a >= 0.5,
choice_b := rand_b >= 0.5,
choice_c := rand_c >= 0.5,
choice_d := rand_d >= 0.5,
choice_e := rand_e >= 0.5) %.>%
extend(.,
a_1 := ifelse(choice_a, "T", "C"),
a_2 := ifelse(choice_a, "C", "T"),
b_1 := ifelse(choice_b, "T", "C"),
b_2 := ifelse(choice_b, "C", "T"),
c_1 := ifelse(choice_c, "T", "C"),
c_2 := ifelse(choice_c, "C", "T"),
d_1 := ifelse(choice_d, "T", "C"),
d_2 := ifelse(choice_d, "C", "T"),
e_1 := ifelse(choice_e, "T", "C"),
e_2 := ifelse(choice_e, "C", "T")) %.>%
select_columns(.,
c('id', 'a_1', 'a_2', 'b_1', 'b_2', 'c_1', 'c_2', 'd_1', 'd_2', 'e_1', 'e_2'))
Notice rquery::extend_se()
split the work into 3
unambiguous groups. The statements inside each group can now be executed
in any order (or even in parallel) with no ambiguity of meaning or risk
of error. The goal was: split into a small number of groups such that
the observable execution semantics are very close to executing the
original statements in order in completely separate groups (which is
likely what a user intends).
SELECT
`id`,
`a_1`,
`a_2`,
`b_1`,
`b_2`,
`c_1`,
`c_2`,
`d_1`,
`d_2`,
`e_1`,
`e_2`
FROM (
SELECT
`id`,
( CASE WHEN ( `choice_a` ) THEN ( 'T' ) WHEN NOT ( `choice_a` ) THEN ( 'C' ) ELSE NULL END ) AS `a_1`,
( CASE WHEN ( `choice_a` ) THEN ( 'C' ) WHEN NOT ( `choice_a` ) THEN ( 'T' ) ELSE NULL END ) AS `a_2`,
( CASE WHEN ( `choice_b` ) THEN ( 'T' ) WHEN NOT ( `choice_b` ) THEN ( 'C' ) ELSE NULL END ) AS `b_1`,
( CASE WHEN ( `choice_b` ) THEN ( 'C' ) WHEN NOT ( `choice_b` ) THEN ( 'T' ) ELSE NULL END ) AS `b_2`,
( CASE WHEN ( `choice_c` ) THEN ( 'T' ) WHEN NOT ( `choice_c` ) THEN ( 'C' ) ELSE NULL END ) AS `c_1`,
( CASE WHEN ( `choice_c` ) THEN ( 'C' ) WHEN NOT ( `choice_c` ) THEN ( 'T' ) ELSE NULL END ) AS `c_2`,
( CASE WHEN ( `choice_d` ) THEN ( 'T' ) WHEN NOT ( `choice_d` ) THEN ( 'C' ) ELSE NULL END ) AS `d_1`,
( CASE WHEN ( `choice_d` ) THEN ( 'C' ) WHEN NOT ( `choice_d` ) THEN ( 'T' ) ELSE NULL END ) AS `d_2`,
( CASE WHEN ( `choice_e` ) THEN ( 'T' ) WHEN NOT ( `choice_e` ) THEN ( 'C' ) ELSE NULL END ) AS `e_1`,
( CASE WHEN ( `choice_e` ) THEN ( 'C' ) WHEN NOT ( `choice_e` ) THEN ( 'T' ) ELSE NULL END ) AS `e_2`
FROM (
SELECT
`id`,
`rand_a` >= 0.5 AS `choice_a`,
`rand_b` >= 0.5 AS `choice_b`,
`rand_c` >= 0.5 AS `choice_c`,
`rand_d` >= 0.5 AS `choice_d`,
`rand_e` >= 0.5 AS `choice_e`
FROM (
SELECT
`id`,
`rand_a`,
`rand_b`,
`rand_c`,
`rand_d`,
`rand_e`
FROM
`example_table`
) tsql_24400885758306713700_0000000000
) tsql_24400885758306713700_0000000001
) tsql_24400885758306713700_0000000002
id | a_1 | a_2 | b_1 | b_2 | c_1 | c_2 | d_1 | d_2 | e_1 | e_2 |
---|---|---|---|---|---|---|---|---|---|---|
1 | T | C | T | C | C | T | C | T | C | T |
2 | T | C | C | T | C | T | T | C | T | C |
3 | T | C | T | C | T | C | T | C | C | T |
4 | T | C | T | C | T | C | T | C | C | T |
A straightforward method (with no statement re-ordering) of splitting into non-dependent groups would have to split the mutate at each first use of a new value: yielding more mutate stages. For why a low number of execution stages is important please see here.
Note: re-using variable variable names does limit the planner’s ability to efficiently partition the the statement. The planner still emits safe and correct code, but unless it were to be allowed to introduce new variable names it must break sequences in more places. We show this effect below:
plan2 <- d1 %.>%
extend(.,
choice = rand_a>=0.5,
a_1 = ifelse(choice,
'T',
'C'),
a_2 = ifelse(choice,
'C',
'T'),
choice = rand_b>=0.5,
b_1 = ifelse(choice,
'T',
'C'),
b_2 = ifelse(choice,
'C',
'T'),
choice = rand_c>=0.5,
c_1 = ifelse(choice,
'T',
'C'),
c_2 = ifelse(choice,
'C',
'T'),
choice = rand_d>=0.5,
d_1 = ifelse(choice,
'T',
'C'),
d_2 = ifelse(choice,
'C',
'T'),
choice = rand_e>=0.5,
e_1 = ifelse(choice,
'T',
'C'),
e_2 = ifelse(choice,
'C',
'T')
) %.>%
select_columns(.,
qc(id,
a_1, a_2, b_1, b_2,
c_1, c_2, d_1, d_2,
e_1, e_2))
cat(format(plan2))
mk_td("example_table", c(
"id",
"rand_a",
"rand_b",
"rand_c",
"rand_d",
"rand_e")) %.>%
extend(.,
choice := rand_a >= 0.5) %.>%
extend(.,
a_1 := ifelse(choice, "T", "C"),
a_2 := ifelse(choice, "C", "T")) %.>%
extend(.,
choice := rand_b >= 0.5) %.>%
extend(.,
b_1 := ifelse(choice, "T", "C"),
b_2 := ifelse(choice, "C", "T")) %.>%
extend(.,
choice := rand_c >= 0.5) %.>%
extend(.,
c_1 := ifelse(choice, "T", "C"),
c_2 := ifelse(choice, "C", "T")) %.>%
extend(.,
choice := rand_d >= 0.5) %.>%
extend(.,
d_1 := ifelse(choice, "T", "C"),
d_2 := ifelse(choice, "C", "T")) %.>%
extend(.,
choice := rand_e >= 0.5) %.>%
extend(.,
e_1 := ifelse(choice, "T", "C"),
e_2 := ifelse(choice, "C", "T")) %.>%
select_columns(.,
c('id', 'a_1', 'a_2', 'b_1', 'b_2', 'c_1', 'c_2', 'd_1', 'd_2', 'e_1', 'e_2'))
id | a_1 | a_2 | b_1 | b_2 | c_1 | c_2 | d_1 | d_2 | e_1 | e_2 |
---|---|---|---|---|---|---|---|---|---|---|
1 | T | C | T | C | C | T | C | T | C | T |
2 | T | C | C | T | C | T | T | C | T | C |
3 | T | C | T | C | T | C | T | C | C | T |
4 | T | C | T | C | T | C | T | C | C | T |
Notice the returned tables are identical (as they should be, which
is not always the case for database backed dplyr
).