assertable Template

File Check and Import

First, use check_files to make sure the files exist. We can use the system.file command to locate them within the assertable package. Then, run import_files to bring them in. We’ll call the combined data object master_data.

# Build the three example file names and resolve them to paths inside the
# assertable package's extdata directory.
filenames <- system.file(
  "extdata",
  paste0("file_", 1:3, ".csv"),
  package = "assertable"
)

filenames

## [1] "C:/Users/grantng/AppData/Local/Temp/RtmpGeH3Km/Rinst6ea45a575202/assertable/extdata/file_1.csv"
## [2] "C:/Users/grantng/AppData/Local/Temp/RtmpGeH3Km/Rinst6ea45a575202/assertable/extdata/file_2.csv"
## [3] "C:/Users/grantng/AppData/Local/Temp/RtmpGeH3Km/Rinst6ea45a575202/assertable/extdata/file_3.csv"

check_files(filenames)

## [1] "All results are present"

master_data <- import_files(filenames,FUN=fread)

## [1] "All results are present"

head(master_data)

##    Plant   Type  Treatment conc uptake id_var
## 1:   Qn1 Quebec nonchilled   95   16.0      1
## 2:   Qn1 Quebec nonchilled  175   30.4      1
## 3:   Qn1 Quebec nonchilled  250   34.8      1
## 4:   Qn1 Quebec nonchilled  350   37.2      1
## 5:   Qn1 Quebec nonchilled  500   35.3      1
## 6:   Qn1 Quebec nonchilled  675   39.2      1

Checking Dimensions

This dataset should have 84 * 3 rows, and six columns: Plant, Type, Treatment, conc, uptake, and id_var.

assert_nrows(master_data,(84*3))

## [1] "All rows present"

assert_colnames(master_data,c("plant","type","treatment","conc","uptake","id_var"))

## Error in assert_colnames(master_data, c("plant", "type", "treatment", : These columns exist in colnames but not in your dataframe: plant type treatment and these exist in your dataframe but not in colnames: Plant Type Treatment

Oops, forgot to capitalize the column names. Trying again.

assert_nrows(master_data,(84*3))

## [1] "All rows present"

assert_colnames(master_data,c("Plant","Type","Treatment","conc","uptake","id_var"))

## [1] "All column names present"

Checking IDs

We believe the dataset should be unique by Plant, conc, and id_var (where id_var just represents the replication number of the dataset). Let’s check this.

# Distinct values observed in each identifying column; concs and id_vars are
# reused by later checks.
plants <- unique(master_data[["Plant"]])
concs <- unique(master_data[["conc"]])
id_vars <- unique(master_data[["id_var"]])

# Named list: one element per id column, holding the values expected for it.
id_list <- list(Plant = plants, conc = concs, id_var = id_vars)
assert_ids(master_data, id_list)

## [1] "Data is identified by id_vars: Plant conc id_var"

Now, let’s make sure that there are only two values in Type: Quebec and Mississippi. Let’s also make sure that uptake and conc are more than 0 and less than 1500.

assert_values(master_data, colnames = "Type", test="in", test_val = c("Quebec","Mississippi"))

## [1] "Variable Type passed in test"

assert_values(master_data, colnames = c("uptake","conc"), test="gt", test_val = 0)

## [1] "Variable uptake passed gt test"
## [1] "Variable conc passed gt test"

assert_values(master_data, colnames = c("uptake","conc"), test="lt", test_val = 1500)

## [1] "Variable uptake passed lt test"
## [1] "Variable conc passed lt test"

Finally, let’s assert that every value of conc must be more than 6 times the corresponding value of uptake.

assert_values(master_data, colnames = "conc", test="gt", test_val = master_data$uptake * 6)

##     Plant   Type  Treatment conc uptake id_var
##  1:   Qn1 Quebec nonchilled   95   16.0      1
##  2:   Qn1 Quebec nonchilled  175   30.4      1
##  3:   Qn3 Quebec nonchilled   95   16.2      1
##  4:   Qn3 Quebec nonchilled  175   32.4      1
##  5:   Qn1 Quebec nonchilled   95   16.0      2
##  6:   Qn1 Quebec nonchilled  175   30.4      2
##  7:   Qn3 Quebec nonchilled   95   16.2      2
##  8:   Qn3 Quebec nonchilled  175   32.4      2
##  9:   Qn1 Quebec nonchilled   95   16.0      3
## 10:   Qn1 Quebec nonchilled  175   30.4      3
## 11:   Qn3 Quebec nonchilled   95   16.2      3
## 12:   Qn3 Quebec nonchilled  175   32.4      3

## Error in assert_values(master_data, colnames = "conc", test = "gt", test_val = master_data$uptake * : 12 Rows for variable conc not more than the test value(s) in the dataset above

Bummer. Let’s finally do some subsetting of our data.

# Keep the Quebec rows for plants Qn2 and Qn3 whose uptake exceeds 20.
# Every column is referenced through master_data$ so the filter does not
# depend on data.table resolving a bare column name inside `[`.
new_data <- master_data[
  master_data$Type == "Quebec" &
    master_data$Plant %in% c("Qn2", "Qn3") &
    master_data$uptake > 20,
]

Now, let’s see if Plant, together with our values of concs, can uniquely identify our observations.

assert_ids(new_data, list(Plant=c("Qn2","Qn3"), conc=concs))

##    Plant conc
## 1:   Qn2   95
## 2:   Qn3   95

## Error in assert_ids(new_data, list(Plant = c("Qn2", "Qn3"), conc = concs)): The above combinations of id variables do not exist in your dataset

Rough, the combinations with a conc of 95 do not exist in our subset. Let’s take 95 out of our conc levels and try it again.

new_concs <- c(175,250,350,500,675,1000)
assert_ids(new_data, list(Plant=c("Qn2","Qn3"),conc=new_concs))

##     Plant conc n_duplicates
##  1:   Qn2  175            3
##  2:   Qn2  250            3
##  3:   Qn2  350            3
##  4:   Qn2  500            3
##  5:   Qn2  675            3
##  6:   Qn2 1000            3
##  7:   Qn3  175            3
##  8:   Qn3  250            3
##  9:   Qn3  350            3
## 10:   Qn3  500            3
## 11:   Qn3  675            3
## 12:   Qn3 1000            3

## Error in assert_ids(new_data, list(Plant = c("Qn2", "Qn3"), conc = new_concs)): These combinations of id variables have n_duplicates duplicate observations per combination (36 total duplicates)

Let’s first get the actual rows and look at them.

# Re-run the same id check, but capture the result instead of stopping.
# With warn_only = TRUE the failure is reported as a warning, and with
# ids_only = FALSE the returned table carries the full data rows (not just
# the id columns) for the duplicated combinations.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned.
new_concs <- c(175, 250, 350, 500, 675, 1000)
vetting_data <- assert_ids(
  new_data,
  list(Plant = c("Qn2", "Qn3"), conc = new_concs),
  ids_only = FALSE,
  warn_only = TRUE
)

## Warning in assert_ids(new_data, list(Plant = c("Qn2", "Qn3"), conc
## = new_concs), : These rows of data are all of the observations with
## duplicated id_vars, and have n_duplicates duplicate observations per
## combination of id_varnames (36 total duplicates)

print(head(vetting_data))

##    Plant conc   Type  Treatment uptake id_var n_duplicates duplicate_id
## 1:   Qn2  175 Quebec nonchilled   27.3      1            3            1
## 2:   Qn2  175 Quebec nonchilled   27.3      2            3            2
## 3:   Qn2  175 Quebec nonchilled   27.3      3            3            3
## 4:   Qn2  250 Quebec nonchilled   37.1      1            3            1
## 5:   Qn2  250 Quebec nonchilled   37.1      2            3            2
## 6:   Qn2  250 Quebec nonchilled   37.1      3            3            3

Hmm, we forgot to include the values of id_var in the id_vars argument. Let’s try it one last time with the id_vars vector included.

new_concs <- c(175,250,350,500,675,1000)
assert_ids(new_data, list(Plant=c("Qn2","Qn3"), conc=new_concs, id_var=id_vars))

## [1] "Data is identified by id_vars: Plant conc id_var"

Awesome! Now you’re a data wizard!

assertable Template

Grant Nguyen

2021-01-26

Data

File Check and Import

Checking Dimensions

Checking IDs