Consider the neuroblastoma data. There are 3418 labeled examples. If we consider subsets, how long does it take to compute the AUM and its directional derivatives?
data(neuroblastomaProcessed, package="penaltyLearning")
library(data.table)
nb.err <- data.table(neuroblastomaProcessed$errors)
nb.err[, example := paste0(profile.id, ".", chromosome)]
nb.X <- neuroblastomaProcessed$feature.mat
(N.pred.vec <- as.integer(10^seq(1, log10(nrow(nb.X)), by=0.5)))
#> [1] 10 31 100 316 1000 3162
if(requireNamespace("atime")){
aum.pL.list <- atime::atime(
N=N.pred.vec,
setup={
N.pred.names <- rownames(nb.X)[1:N]
N.diffs.dt <- aum::aum_diffs_penalty(nb.err, N.pred.names)
pred.dt <- data.table(example=N.pred.names, pred.log.lambda=0)
},
penaltyLearning={
roc.list <- penaltyLearning::ROChange(nb.err, pred.dt, "example")
},
aum={
aum.list <- aum::aum(N.diffs.dt, pred.dt$pred.log.lambda)
})
plot(aum.pL.list)
}
#> Le chargement a nécessité le package : atime
#> Warning in atime::atime(N = N.pred.vec, setup = {: please increase max N or
#> seconds.limit, because only one N was evaluated for expr.name: penaltyLearning
#> Le chargement a nécessité le package : directlabels
From the plot above we can see that both packages have similar asymptotic time complexity. However aum is faster by orders of magnitude.
In this section we show a base R implementation of aum.
diffs.df <- data.frame(
example=c(0,1,1,2,3),
pred=c(0,0,1,0,0),
fp_diff=c(1,1,1,0,0),
fn_diff=c(0,0,0,-1,-1))
pred.log.lambda <- c(0,1,-1,0)
microbenchmark::microbenchmark("C++"={
aum::aum(diffs.df, pred.log.lambda)
}, R={
thresh.vec <- with(diffs.df, pred-pred.log.lambda[example+1])
s.vec <- order(thresh.vec)
sort.diffs <- data.frame(diffs.df, thresh.vec)[s.vec,]
for(fp.or.fn in c("fp","fn")){
ord.fun <- if(fp.or.fn=="fp")identity else rev
fwd.or.rev <- sort.diffs[ord.fun(1:nrow(sort.diffs)),]
fp.or.fn.diff <- fwd.or.rev[[paste0(fp.or.fn,"_diff")]]
last.in.run <- c(diff(fwd.or.rev$thresh.vec) != 0, TRUE)
after.or.before <-
ifelse(fp.or.fn=="fp",1,-1)*cumsum(fp.or.fn.diff)[last.in.run]
distribute <- function(values)with(fwd.or.rev, structure(
values,
names=thresh.vec[last.in.run]
)[paste(thresh.vec)])
out.df <- data.frame(
before=distribute(c(0, after.or.before[-length(after.or.before)])),
after=distribute(after.or.before))
sort.diffs[
paste0(fp.or.fn,"_",ord.fun(c("before","after")))
] <- as.list(out.df[ord.fun(1:nrow(out.df)),])
}
AUM.vec <- with(sort.diffs, diff(thresh.vec)*pmin(fp_before,fn_before)[-1])
list(
aum=sum(AUM.vec),
deriv_mat=sapply(c("after","before"),function(b.or.a){
s <- if(b.or.a=="before")1 else -1
f <- function(p.or.n,suffix=b.or.a){
sort.diffs[[paste0("f",p.or.n,"_",suffix)]]
}
fp <- f("p")
fn <- f("n")
aggregate(
s*(pmin(fp+s*f("p","diff"),fn+s*f("n","diff"))-pmin(fp, fn)),
list(sort.diffs$example),
sum)$x
}))
}, times=10)
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> C++ 706.12 738.72 776.132 763.14 795.40 878.40 10
#> R 50971.56 52175.48 56842.228 53770.06 55076.96 78110.76 10
It is clear that the C++ implementation is several orders of magnitude faster.
library(data.table)
max.N <- 1e6
(N.pred.vec <- as.integer(10^seq(1, log10(max.N), by=0.5)))
#> [1] 10 31 100 316 1000 3162 10000 31622 100000
#> [10] 316227 1000000
max.y.vec <- rep(c(0,1), l=max.N)
max.diffs.dt <- aum::aum_diffs_binary(max.y.vec)
set.seed(1)
max.pred.vec <- rnorm(max.N)
if(requireNamespace("atime")){
aum.sort.list <- atime::atime(
N=N.pred.vec,
setup={
N.diffs.dt <- max.diffs.dt[1:N]
N.pred.vec <- max.pred.vec[1:N]
},
dt_sort={
N.diffs.dt[order(N.pred.vec)]
},
R_sort_radix={
sort(N.pred.vec, method="radix")
},
R_sort_quick={
sort(N.pred.vec, method="quick")
},
aum_sort={
aum.list <- aum:::aum_sort_interface(N.diffs.dt, N.pred.vec)
})
plot(aum.sort.list)
}
#> Warning in ggplot2::scale_y_log10("median line, min/max band"): log-10 transformation introduced infinite values.
#> log-10 transformation introduced infinite values.
#> log-10 transformation introduced infinite values.