rquery Many Columns

John Mount, Win-Vector LLC

2023-08-19

A common data manipulation need is: applying the same operation to a number of columns.

In the rquery R package we strongly recommend using value-oriented (or standard evaluation) for this task.

Here is what this looks like.

For our data lets build a simple data set similar to mtcars:

library(wrapr)
library(rquery)
have_rqdatatable <- requireNamespace('rqdatatable', quietly = TRUE)

example_data <- wrapr::build_frame(
   "mpg"  , "cyl", "disp",  "car"               |
     21   , 6    , 160   ,  "Mazda RX4"         |
     21   , 6    , 160   ,  "Mazda RX4 Wag"     |
     22.8 , 4    , 108   ,  "Datsun 710"        |
     21.4 , 6    , 258   ,  "Hornet 4 Drive"    |
     18.7 , 8    , 360   ,  "Hornet Sportabout" |
     18.1 , 6    , 225   ,  "Valiant"           )

knitr::kable(example_data)

mpg	cyl	disp	car
21.0	6	160	Mazda RX4
21.0	6	160	Mazda RX4 Wag
22.8	4	108	Datsun 710
21.4	6	258	Hornet 4 Drive
18.7	8	360	Hornet Sportabout
18.1	6	225	Valiant

Now suppose for a number of columns we wish to perform a calculation, such centering it with respect to the grand average.

This is easily handled by first specifying the set of variables we wish to work with.

vars <- setdiff(colnames(example_data), 'car')

print(vars)

## [1] "mpg"  "cyl"  "disp"

Now we build up what we want as a name-vector of strings using the := named map builder.

expressions <- vars := paste0(vars, ' - mean(', vars, ')')

print(expressions)

##                 mpg                 cyl                disp 
##   "mpg - mean(mpg)"   "cyl - mean(cyl)" "disp - mean(disp)"

The idea is: the := operator fits into R idiom by looking very much like a vectorized version of “names get assigned expressions”.

These expressions can then be used in an rquery pipeline using the _se-variant of extend(): extend_se().

ops <- local_td(example_data) %.>%
  extend_se(., expressions)

cat(format(ops))

## mk_td("example_data", c(
##   "mpg",
##   "cyl",
##   "disp",
##   "car")) %.>%
##  extend(.,
##   mpg := mpg - mean(mpg),
##   cyl := cyl - mean(cyl),
##   disp := disp - mean(disp))

And this operator pipeline is ready to use (assuming we have rqdatatable available):

if(have_rqdatatable) {
  example_data %.>%
    ops %.>%
    knitr::kable(.)
}

mpg	cyl	disp	car
0.5	0	-51.83333	Mazda RX4
0.5	0	-51.83333	Mazda RX4 Wag
2.3	-2	-103.83333	Datsun 710
0.9	0	46.16667	Hornet 4 Drive
-1.8	2	148.16667	Hornet Sportabout
-2.4	0	13.16667	Valiant

The expression construction can also be done inside the extend_se() operator.

ops <- local_td(example_data) %.>%
  extend_se(., vars := paste0(vars, ' - mean(', vars, ')'))

cat(format(ops))

## mk_td("example_data", c(
##   "mpg",
##   "cyl",
##   "disp",
##   "car")) %.>%
##  extend(.,
##   mpg := mpg - mean(mpg),
##   cyl := cyl - mean(cyl),
##   disp := disp - mean(disp))

Note: the above is only a notional example to demonstrate the operations; for supervised machine learning we would probably use base::scale(), which saves the learned centering for later re-use on new data.

The point is: we use standard R tools to build up the lists of names and operations. We are not restricted to any single argument manipulation grammar.

For example we could build all interaction terms as follows.

combos <- t(combn(vars, 2))
interactions <- 
  paste0(combos[, 1], '_', combos[, 2]) := 
  paste0(combos[, 1], ' * ', combos[, 2])

print(interactions)

##      mpg_cyl     mpg_disp     cyl_disp 
##  "mpg * cyl" "mpg * disp" "cyl * disp"

ops <- local_td(example_data) %.>%
  extend_se(., interactions)

cat(format(ops))

## mk_td("example_data", c(
##   "mpg",
##   "cyl",
##   "disp",
##   "car")) %.>%
##  extend(.,
##   mpg_cyl := mpg * cyl,
##   mpg_disp := mpg * disp,
##   cyl_disp := cyl * disp)

It is a critical advantage to work with sets of variables as standard values. In this case what we can do is limited only by the power of R itself.

Note: we also supply an alias for := as %:=% for those that don’t want to confuse this assignment with how the symbol is used in data.table. Take care that := has the precedence-level of an assignment and %:=% has the precedence-level of a user defined operator.

As, as always, our queries can be used on data.

if(have_rqdatatable) {
  example_data %.>%
    ops %.>%
    knitr::kable(.)
}

mpg	cyl	disp	car	mpg_cyl	mpg_disp	cyl_disp
21.0	6	160	Mazda RX4	126.0	3360.0	960
21.0	6	160	Mazda RX4 Wag	126.0	3360.0	960
22.8	4	108	Datsun 710	91.2	2462.4	432
21.4	6	258	Hornet 4 Drive	128.4	5521.2	1548
18.7	8	360	Hornet Sportabout	149.6	6732.0	2880
18.1	6	225	Valiant	108.6	4072.5	1350

Or even in a database.

have_db <- requireNamespace("DBI", quietly = TRUE) && 
  requireNamespace("RSQLite", quietly = TRUE)

if(have_db) {
  raw_connection <- DBI::dbConnect(RSQLite::SQLite(), 
                                   ":memory:")
  RSQLite::initExtension(raw_connection)
  db <- rquery_db_info(
    connection = raw_connection,
    is_dbi = TRUE,
    connection_options = rq_connection_tests(raw_connection))
  
  rq_copy_to(db, 'example_data',
             example_data,
             temporary = TRUE, 
             overwrite = TRUE)
  
  sql <- to_sql(ops, db)
  
  cat(format(sql))
}

## SELECT
##  `mpg`,
##  `cyl`,
##  `disp`,
##  `car`,
##  `mpg` * `cyl`  AS `mpg_cyl`,
##  `mpg` * `disp`  AS `mpg_disp`,
##  `cyl` * `disp`  AS `cyl_disp`
## FROM (
##  SELECT
##   `mpg`,
##   `cyl`,
##   `disp`,
##   `car`
##  FROM
##   `example_data`
##  ) tsql_73730651641561315786_0000000000

if(have_db) {
  res_table <- materialize(db, ops)
  DBI::dbReadTable(raw_connection, res_table$table_name) %.>%
    knitr::kable(.)
}

mpg	cyl	disp	car	mpg_cyl	mpg_disp	cyl_disp
21.0	6	160	Mazda RX4	126.0	3360.0	960
21.0	6	160	Mazda RX4 Wag	126.0	3360.0	960
22.8	4	108	Datsun 710	91.2	2462.4	432
21.4	6	258	Hornet 4 Drive	128.4	5521.2	1548
18.7	8	360	Hornet Sportabout	149.6	6732.0	2880
18.1	6	225	Valiant	108.6	4072.5	1350

if(have_db) {
  DBI::dbDisconnect(raw_connection)
}