The goal of is this vignette is to illustrate the cases functionality by means of a real-world example focused on biomarker assessment and prediction model evaluation.
Load the package:
## load packages:
library(readr)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(splitstackshape)
library(glmnet)
#> Loading required package: Matrix
#> Loaded glmnet 4.1-4
library(cases)
We utilize the breast cancer wisconsin (diagnostic) data set.
?data_wdbc<- data_wdbc data
dim(data)
#> [1] 569 31
table(data$diagnosis)
#>
#> 0 1
#> 357 212
## Missing values?
colSums(is.na(data)) # -> no missing values
#> diagnosis radius_mean texture_mean
#> 0 0 0
#> perimeter_mean area_mean smoothness_mean
#> 0 0 0
#> compactness_mean concavity_mean concave_points_mean
#> 0 0 0
#> symmetry_mean fractal_dimension_mean radius_sd
#> 0 0 0
#> texture_sd perimeter_sd area_sd
#> 0 0 0
#> smoothness_sd compactness_sd concavity_sd
#> 0 0 0
#> concave_points_sd symmetry_sd fractal_dimension_sd
#> 0 0 0
#> radius_peak texture_peak perimeter_peak
#> 0 0 0
#> area_peak smoothness_peak compactness_peak
#> 0 0 0
#> concavity_peak concave_points_peak symmetry_peak
#> 0 0 0
#> fractal_dimension_peak
#> 0
summary(data)
#> diagnosis radius_mean texture_mean perimeter_mean area_mean
#> 0:357 Min. : 6.981 Min. : 9.71 Min. : 43.79 Min. : 143.5
#> 1:212 1st Qu.:11.700 1st Qu.:16.17 1st Qu.: 75.17 1st Qu.: 420.3
#> Median :13.370 Median :18.84 Median : 86.24 Median : 551.1
#> Mean :14.127 Mean :19.29 Mean : 91.97 Mean : 654.9
#> 3rd Qu.:15.780 3rd Qu.:21.80 3rd Qu.:104.10 3rd Qu.: 782.7
#> Max. :28.110 Max. :39.28 Max. :188.50 Max. :2501.0
#> smoothness_mean compactness_mean concavity_mean concave_points_mean
#> Min. :0.05263 Min. :0.01938 Min. :0.00000 Min. :0.00000
#> 1st Qu.:0.08637 1st Qu.:0.06492 1st Qu.:0.02956 1st Qu.:0.02031
#> Median :0.09587 Median :0.09263 Median :0.06154 Median :0.03350
#> Mean :0.09636 Mean :0.10434 Mean :0.08880 Mean :0.04892
#> 3rd Qu.:0.10530 3rd Qu.:0.13040 3rd Qu.:0.13070 3rd Qu.:0.07400
#> Max. :0.16340 Max. :0.34540 Max. :0.42680 Max. :0.20120
#> symmetry_mean fractal_dimension_mean radius_sd texture_sd
#> Min. :0.1060 Min. :0.04996 Min. :0.1115 Min. :0.3602
#> 1st Qu.:0.1619 1st Qu.:0.05770 1st Qu.:0.2324 1st Qu.:0.8339
#> Median :0.1792 Median :0.06154 Median :0.3242 Median :1.1080
#> Mean :0.1812 Mean :0.06280 Mean :0.4052 Mean :1.2169
#> 3rd Qu.:0.1957 3rd Qu.:0.06612 3rd Qu.:0.4789 3rd Qu.:1.4740
#> Max. :0.3040 Max. :0.09744 Max. :2.8730 Max. :4.8850
#> perimeter_sd area_sd smoothness_sd compactness_sd
#> Min. : 0.757 Min. : 6.802 Min. :0.001713 Min. :0.002252
#> 1st Qu.: 1.606 1st Qu.: 17.850 1st Qu.:0.005169 1st Qu.:0.013080
#> Median : 2.287 Median : 24.530 Median :0.006380 Median :0.020450
#> Mean : 2.866 Mean : 40.337 Mean :0.007041 Mean :0.025478
#> 3rd Qu.: 3.357 3rd Qu.: 45.190 3rd Qu.:0.008146 3rd Qu.:0.032450
#> Max. :21.980 Max. :542.200 Max. :0.031130 Max. :0.135400
#> concavity_sd concave_points_sd symmetry_sd fractal_dimension_sd
#> Min. :0.00000 Min. :0.000000 Min. :0.007882 Min. :0.0008948
#> 1st Qu.:0.01509 1st Qu.:0.007638 1st Qu.:0.015160 1st Qu.:0.0022480
#> Median :0.02589 Median :0.010930 Median :0.018730 Median :0.0031870
#> Mean :0.03189 Mean :0.011796 Mean :0.020542 Mean :0.0037949
#> 3rd Qu.:0.04205 3rd Qu.:0.014710 3rd Qu.:0.023480 3rd Qu.:0.0045580
#> Max. :0.39600 Max. :0.052790 Max. :0.078950 Max. :0.0298400
#> radius_peak texture_peak perimeter_peak area_peak
#> Min. : 7.93 Min. :12.02 Min. : 50.41 Min. : 185.2
#> 1st Qu.:13.01 1st Qu.:21.08 1st Qu.: 84.11 1st Qu.: 515.3
#> Median :14.97 Median :25.41 Median : 97.66 Median : 686.5
#> Mean :16.27 Mean :25.68 Mean :107.26 Mean : 880.6
#> 3rd Qu.:18.79 3rd Qu.:29.72 3rd Qu.:125.40 3rd Qu.:1084.0
#> Max. :36.04 Max. :49.54 Max. :251.20 Max. :4254.0
#> smoothness_peak compactness_peak concavity_peak concave_points_peak
#> Min. :0.07117 Min. :0.02729 Min. :0.0000 Min. :0.00000
#> 1st Qu.:0.11660 1st Qu.:0.14720 1st Qu.:0.1145 1st Qu.:0.06493
#> Median :0.13130 Median :0.21190 Median :0.2267 Median :0.09993
#> Mean :0.13237 Mean :0.25427 Mean :0.2722 Mean :0.11461
#> 3rd Qu.:0.14600 3rd Qu.:0.33910 3rd Qu.:0.3829 3rd Qu.:0.16140
#> Max. :0.22260 Max. :1.05800 Max. :1.2520 Max. :0.29100
#> symmetry_peak fractal_dimension_peak
#> Min. :0.1565 Min. :0.05504
#> 1st Qu.:0.2504 1st Qu.:0.07146
#> Median :0.2822 Median :0.08004
#> Mean :0.2901 Mean :0.08395
#> 3rd Qu.:0.3179 3rd Qu.:0.09208
#> Max. :0.6638 Max. :0.20750
## define minimal acceptable criteria for specificity, sensitivity:
<- 0.7
sp0 <- 0.9
se0 <- c(sp0, se0) benchmark
<- seq(0,1, 0.1)
pr
quantile(data$area_peak, pr) # 500, 600, 700, 800, 900 ---> area
#> 0% 10% 20% 30% 40% 50% 60% 70% 80% 90%
#> 185.20 384.72 475.98 544.14 599.70 686.50 781.18 926.96 1269.00 1673.00
#> 100%
#> 4254.00
quantile(data$compactness_peak, pr) # 0.10, 0.15, 0.20, 0.25, 0.30 ---> compactness (perimeter^2 / area - 1.0)
#> 0% 10% 20% 30% 40% 50% 60% 70%
#> 0.027290 0.093676 0.125660 0.161400 0.184620 0.211900 0.251400 0.303960
#> 80% 90% 100%
#> 0.367060 0.447840 1.058000
quantile(data$concavity_peak, pr) # 0.10, 0.15, 0.20, 0.25, 0.30 ---> concavity (severity of concave portions of the contour)
#> 0% 10% 20% 30% 40% 50% 60% 70%
#> 0.000000 0.045652 0.091974 0.136880 0.177180 0.226700 0.286600 0.349920
#> 80% 90% 100%
#> 0.419540 0.571320 1.252000
<- c(500, 600, 700, 800, 900,
cc 0.10, 0.15, 0.20, 0.25, 0.30,
0.10, 0.15, 0.20, 0.25, 0.30)
<- data %>%
comp_bm ::select(area_peak, compactness_peak, concavity_peak) %>%
dplyr::categorize(cc, rep(1:3, each=5)) %>%
cases::compare(labels = as.numeric(as.character(data$diagnosis)))
cases
<- cases::evaluate(comp_bm, benchmark = benchmark,
results_bm alternative = "greater", alpha = 0.025,
adj="boot", regu=1)
#> Drawing 2000 'pairs' bootstrap samples...
results_bm#> [cases] evaluation results:
#> $specificity
#> parameter hypothesis estimate lower upper pval pval_all reject
#> 1 area_peak_500 <= 0.7 0.3645 0.2911 Inf 1.000 1.0000 FALSE
#> 2 area_peak_600 <= 0.7 0.6243 0.5505 Inf 1.000 1.0000 FALSE
#> 3 area_peak_700 <= 0.7 0.8059 0.7456 Inf 0.000 0.0010 TRUE
#> 4 area_peak_800 <= 0.7 0.9064 0.8620 Inf 0.000 1.0000 TRUE
#> 5 area_peak_900 <= 0.7 0.9763 0.9530 Inf 0.000 1.0000 TRUE
#> 6 compactness_peak_0.1 <= 0.7 0.1913 0.1314 Inf 1.000 1.0000 FALSE
#> 7 compactness_peak_0.15 <= 0.7 0.3980 0.3234 Inf 1.000 1.0000 FALSE
#> 8 compactness_peak_0.2 <= 0.7 0.6466 0.5738 Inf 1.000 1.0000 FALSE
#> 9 compactness_peak_0.25 <= 0.7 0.7975 0.7362 Inf 0.001 1.0000 TRUE
#> 10 compactness_peak_0.3 <= 0.7 0.8925 0.8452 Inf 0.000 1.0000 TRUE
#> 11 concavity_peak_0.1 <= 0.7 0.3422 0.2698 Inf 1.000 1.0000 FALSE
#> 12 concavity_peak_0.15 <= 0.7 0.5321 0.4560 Inf 1.000 1.0000 FALSE
#> 13 concavity_peak_0.2 <= 0.7 0.7221 0.6538 Inf 0.798 0.7980 FALSE
#> 14 concavity_peak_0.25 <= 0.7 0.8059 0.7456 Inf 0.000 0.9855 TRUE
#> 15 concavity_peak_0.3 <= 0.7 0.8673 0.8156 Inf 0.000 1.0000 TRUE
#> reject_all
#> 1 FALSE
#> 2 FALSE
#> 3 TRUE
#> 4 FALSE
#> 5 FALSE
#> 6 FALSE
#> 7 FALSE
#> 8 FALSE
#> 9 FALSE
#> 10 FALSE
#> 11 FALSE
#> 12 FALSE
#> 13 FALSE
#> 14 FALSE
#> 15 FALSE
#>
#> $sensitivity
#> parameter hypothesis estimate lower upper pval pval_all
#> 1 area_peak_500 <= 0.9 0.9977 0.9881 Inf 0.0000 1.0000
#> 2 area_peak_600 <= 0.9 0.9742 0.9429 Inf 0.0000 1.0000
#> 3 area_peak_700 <= 0.9 0.9601 0.9214 Inf 0.0010 0.0010
#> 4 area_peak_800 <= 0.9 0.8850 0.8220 Inf 1.0000 1.0000
#> 5 area_peak_900 <= 0.9 0.8052 0.7269 Inf 1.0000 1.0000
#> 6 compactness_peak_0.1 <= 0.9 0.9836 0.9585 Inf 0.0000 1.0000
#> 7 compactness_peak_0.15 <= 0.9 0.9648 0.9284 Inf 0.0000 1.0000
#> 8 compactness_peak_0.2 <= 0.9 0.8756 0.8104 Inf 1.0000 1.0000
#> 9 compactness_peak_0.25 <= 0.9 0.7441 0.6580 Inf 1.0000 1.0000
#> 10 compactness_peak_0.3 <= 0.9 0.6362 0.5411 Inf 1.0000 1.0000
#> 11 concavity_peak_0.1 <= 0.9 0.9883 0.9670 Inf 0.0000 1.0000
#> 12 concavity_peak_0.15 <= 0.9 0.9789 0.9505 Inf 0.0000 1.0000
#> 13 concavity_peak_0.2 <= 0.9 0.9695 0.9355 Inf 0.0000 0.7980
#> 14 concavity_peak_0.25 <= 0.9 0.9038 0.8455 Inf 0.9855 0.9855
#> 15 concavity_peak_0.3 <= 0.9 0.8052 0.7269 Inf 1.0000 1.0000
#> reject reject_all
#> 1 TRUE FALSE
#> 2 TRUE FALSE
#> 3 TRUE TRUE
#> 4 FALSE FALSE
#> 5 FALSE FALSE
#> 6 TRUE FALSE
#> 7 TRUE FALSE
#> 8 FALSE FALSE
#> 9 FALSE FALSE
#> 10 FALSE FALSE
#> 11 TRUE FALSE
#> 12 TRUE FALSE
#> 13 TRUE FALSE
#> 14 FALSE FALSE
#> 15 FALSE FALSE
visualize(results_bm)
## data splitting:
set.seed(1337)
<- stratified(data, c('diagnosis'), 1/3, bothSets = TRUE)
split <- split[[1]] %>% as.data.frame()
val <- split[[2]] %>% as.data.frame()
trn dim(trn)
#> [1] 379 31
dim(val)
#> [1] 190 31
table(val$diagnosis)
#>
#> 0 1
#> 119 71
## train example model
<- glmnet(x=trn[,-1], y=trn[,1], family="binomial", alpha=0.25)
mod str(mod, 0)
#> List of 13
#> - attr(*, "class")= chr [1:2] "lognet" "glmnet"
set.seed(1337)
## define hyperparameter values for L1/L2 penalty mixing parameter (alpha):
<- c(0, 0.25, 0.5, 0.75, 1)
aa
## train models and create predictions:
<- sapply(aa, function(alpha){
pred_pm <- cv.glmnet(x = as.matrix(trn[,-1]), y=trn[,1],
mod_pm family = "binomial",
type.measure = "class",
alpha = alpha)
message(paste0("cv.glmnet (alpha = ", alpha, "):"))
print(mod_pm)
message("+++++")
predict(mod_pm, as.matrix(val[,-1]), type="response")
})#> cv.glmnet (alpha = 0):
#>
#> Call: cv.glmnet(x = as.matrix(trn[, -1]), y = trn[, 1], type.measure = "class", family = "binomial", alpha = alpha)
#>
#> Measure: Misclassification Error
#>
#> Lambda Index Measure SE Nonzero
#> min 0.03847 100 0.01847 0.006853 30
#> 1se 0.04634 98 0.02375 0.008289 30
#> +++++
#> cv.glmnet (alpha = 0.25):
#>
#> Call: cv.glmnet(x = as.matrix(trn[, -1]), y = trn[, 1], type.measure = "class", family = "binomial", alpha = alpha)
#>
#> Measure: Misclassification Error
#>
#> Lambda Index Measure SE Nonzero
#> min 0.004383 64 0.01055 0.004299 27
#> 1se 0.012195 53 0.01319 0.005888 23
#> +++++
#> cv.glmnet (alpha = 0.5):
#>
#> Call: cv.glmnet(x = as.matrix(trn[, -1]), y = trn[, 1], type.measure = "class", family = "binomial", alpha = alpha)
#>
#> Measure: Misclassification Error
#>
#> Lambda Index Measure SE Nonzero
#> min 0.004612 56 0.01055 0.008061 22
#> 1se 0.012834 45 0.01847 0.008869 19
#> +++++
#> cv.glmnet (alpha = 0.75):
#>
#> Call: cv.glmnet(x = as.matrix(trn[, -1]), y = trn[, 1], type.measure = "class", family = "binomial", alpha = alpha)
#>
#> Measure: Misclassification Error
#>
#> Lambda Index Measure SE Nonzero
#> min 0.002802 57 0.007916 0.004023 19
#> 1se 0.009391 44 0.010554 0.005824 18
#> +++++
#> cv.glmnet (alpha = 1):
#>
#> Call: cv.glmnet(x = as.matrix(trn[, -1]), y = trn[, 1], type.measure = "class", family = "binomial", alpha = alpha)
#>
#> Measure: Misclassification Error
#>
#> Lambda Index Measure SE Nonzero
#> min 0.003672 51 0.01319 0.005888 13
#> 1se 0.005328 47 0.01847 0.008822 11
#> +++++
colnames(pred_pm) <- paste0("en", aa*100)
head(pred_pm)
#> en0 en25 en50 en75 en100
#> [1,] 0.9747700 0.9961507 0.9938921 0.9964515 0.9992168
#> [2,] 0.9996722 0.9999897 0.9999674 0.9999877 0.9999995
#> [3,] 0.9142164 0.9763833 0.9698399 0.9816259 0.9949924
#> [4,] 0.9955105 0.9995248 0.9991571 0.9995259 0.9999234
#> [5,] 0.9559590 0.9885991 0.9807955 0.9856935 0.9949629
#> [6,] 0.7573470 0.7981572 0.7785671 0.7485468 0.6770751
## define cutpoints (probability scale):
<- rep(seq(0.1, 0.5, 0.1), 5)
cc <- rep(1:5, each=5)
mm
## create predictions:
<- pred_pm %>%
comp_pm ::categorize(cc, mm) %>%
cases::compare(labels = as.numeric(as.character(val$diagnosis)))
casesstr(comp_pm, 1)
#> List of 2
#> $ specificity:'data.frame': 119 obs. of 25 variables:
#> $ sensitivity:'data.frame': 71 obs. of 25 variables:
## conduct statistical analysis:
set.seed(1337)
<- cases::evaluate(comp_pm, benchmark = benchmark,
results_pm alternative = "greater", alpha = 0.025,
adj="boot", regu=1)
#> Drawing 2000 'pairs' bootstrap samples...
str(results_pm, 1)
#> List of 2
#> $ specificity:'data.frame': 25 obs. of 9 variables:
#> $ sensitivity:'data.frame': 25 obs. of 9 variables:
#> - attr(*, "n")= Named int [1:2] 119 71
#> ..- attr(*, "names")= chr [1:2] "specificity" "sensitivity"
#> - attr(*, "m")= int 25
#> - attr(*, "alpha")= num 0.025
#> - attr(*, "alpha_adj")= logi NA
#> - attr(*, "cv")= num [1:2] -3.65 Inf
#> - attr(*, "class")= chr [1:2] "list" "cases_results"
#> - attr(*, "contrast")= chr [1:2] "raw" NA
#> - attr(*, "benchmark")= num [1:2] 0.7 0.9
#> - attr(*, "alternative")= chr "greater"
#> - attr(*, "analysis")= chr "co-primary"
#> - attr(*, "transformation")= chr "none"
#> - attr(*, "regu")= num [1:3] 1 0.5 0.25
#> - attr(*, "pars")= list()
%>% lapply(filter, reject_all)
results_pm #> $specificity
#> parameter hypothesis estimate lower upper pval pval_all reject
#> 1 en25_0.1 <= 0.7 0.8541667 0.7371061 Inf 0.0095 0.0095 TRUE
#> 2 en50_0.1 <= 0.7 0.8541667 0.7371061 Inf 0.0095 0.0095 TRUE
#> 3 en75_0.1 <= 0.7 0.8708333 0.7595952 Inf 0.0045 0.0045 TRUE
#> 4 en100_0.1 <= 0.7 0.8875000 0.7826976 Inf 0.0040 0.0095 TRUE
#> reject_all
#> 1 TRUE
#> 2 TRUE
#> 3 TRUE
#> 4 TRUE
#>
#> $sensitivity
#> parameter hypothesis estimate lower upper pval pval_all reject
#> 1 en25_0.1 <= 0.9 0.9791667 0.9181779 Inf 0.0095 0.0095 TRUE
#> 2 en50_0.1 <= 0.9 0.9930556 0.9575948 Inf 0.0000 0.0095 TRUE
#> 3 en75_0.1 <= 0.9 0.9930556 0.9575948 Inf 0.0000 0.0045 TRUE
#> 4 en100_0.1 <= 0.9 0.9791667 0.9181779 Inf 0.0095 0.0095 TRUE
#> reject_all
#> 1 TRUE
#> 2 TRUE
#> 3 TRUE
#> 4 TRUE
visualize(results_pm)