COAP: simulation

This vignette introduces the usage of COAP for the analysis of high-dimensional count data with additional high-dimensional covariates, by comparison with other methods.

Compare with other methods

We compare COAP with various prominent methods in the literature. They are (1) High-dimensional LFM (Bai and Ng 2002) implemented in the R package GFM; (2) PoissonPCA (Kenney et al. 2021) implemented in the R package PoissonPCA; (3) Zero-inflated Poisson factor model (ZIPFA, Xu et al. 2021) implemented in the R package ZIPFA; (4) Generalized factor model (Liu et al. 2023) implemented in the R package GFM; (5) PLNPCA (Chiquet et al. 2018) implemented in the R package PLNmodels; (6) Generalized Linear Latent Variable Models (GLLVM, Hui et al. 2017) implemented in the R package gllvm. (7) Poisson regression model for each \(x_{ij}, (j = 1,··· ,p)\), implemented in stats R package; (8) Multi-response reduced-rank Poisson regression model (MMMR, Luo et al. 2018) implemented in rrpack R package.

(1). First, we implemented the linear factor model (LFM) and record the metrics that measure the estimation accuracy and computational cost.

metricList$LFM <- list()
tic <- proc.time()
fit_lfm <- Factorm(X_count, q=q)
toc <- proc.time()
time_lfm <- toc[3] - tic[3]

hbb1 <- colMeans(X_count)
metricList$LFM$Tr_H <- measurefun(fit_lfm$hH, H0)
metricList$LFM$Tr_B <- measurefun(fit_lfm$hB, B0)
metricList$LFM$err_bb1 <- norm_vec(hbb1- bbeta0[,1])
metricList$LFM$err_bb <- NA
metricList$LFM$Time <- time_lfm

(2). Then, we implemented PoissonPCA and recorded the metrics.

metricList$PoissonPCA <- list()
library(PoissonPCA)
tic <- proc.time()
fit_poispca <- Poisson_Corrected_PCA(X_count, k= hq) 
toc <- proc.time()
time_ppca <- toc[3] - tic[3]

hbb1 <- colMeans(X_count)
metricList$PoissonPCA$Tr_H <- measurefun(fit_poispca$scores, H0)
metricList$PoissonPCA$Tr_B <- measurefun(fit_poispca$loadings, B0)
metricList$PoissonPCA$err_bb1 <- norm_vec(log(1+fit_poispca$center)- bbeta0[,1])
metricList$PoissonPCA$err_bb <- NA
metricList$PoissonPCA$Time <- time_ppca

Thirdly, we implemented the zero-inflated Poisson factor model:

## ZIPFA runs very slowly, so we do not run it here.
library(ZIPFA)
metricList$ZIPFA <- list()
system.time(
  tic <- proc.time()
  fit_zipfa <- ZIPFA(X_count, k=hq, display = FALSE)
  toc <- proc.time()
  time_zipfa <- toc[3] - tic[3]
)


  
idx_max_like <- which.max(fit_zipfa$Likelihood) 
hbb1 <- colMeans(X_count)
metricList$ZIPFA$Tr_H <- measurefun(fit_zipfa$Ufit[[idx_max_like]], H0)
metricList$ZIPFA$Tr_B <- measurefun(fit_zipfa$Vfit[[idx_max_like]], B0)
metricList$PoissonPCA$Time <- time_zipfa

Fourthly, we also applied the generalized factor model to estimate the loading matrix and factor matrix.

metricList$GFM <- list()
tic <- proc.time()
fit_gfm <- gfm(list(X_count),  type='poisson', q= q, verbose = F)
toc <- proc.time()
time_gfm <- toc[3] - tic[3]
metricList$GFM$Tr_H <- measurefun(fit_gfm$hH, H0)
metricList$GFM$Tr_B <- measurefun(fit_gfm$hB, B0)
metricList$GFM$err_bb1 <- norm_vec(fit_gfm$hmu- bbeta0[,1])
metricList$GFM$err_bb <- NA
metricList$GFM$Time <- time_gfm

Fifthly, we implemented PLNPCA:

PLNPCA_run <- function(X_count, covariates, q,  Offset=rep(1, nrow(X_count))){
  require(PLNmodels)
 
  if(!is.character(Offset)){
    dat_plnpca <- prepare_data(X_count, covariates)
    dat_plnpca$Offset <- Offset
  }else{
    dat_plnpca <- prepare_data(X_count, covariates, offset = Offset)
  }
  
  d <- ncol(covariates)
  #  offset(log(Offset))+
  formu <- paste0("Abundance ~ 1 + offset(log(Offset))+",paste(paste0("V",1:d), collapse = '+'))
  
  
  myPCA <- PLNPCA(as.formula(formu), data = dat_plnpca, ranks = q)
  
  myPCA1 <- getBestModel(myPCA)
  myPCA1$scores
  
  res_plnpca <- list(PCs= myPCA1$scores, bbeta= myPCA1$model_par$B, 
                     loadings=myPCA1$model_par$C)
  
  return(res_plnpca)
}



  tic <- proc.time()
  fit_plnpca <- PLNPCA_run(X_count,  covariates = Z[,-1], q= q)
  toc <- proc.time()
  time_plnpca <- toc[3] - tic[3]
message(time_plnpca, " seconds")

metricList$PLNPCA$Tr_H <- measurefun(fit_plnpca$PCs, H0)
metricList$PLNPCA$Tr_B <- measurefun(fit_plnpca$loadings, B0)
metricList$PLNPCA$err_bb1 <- norm_vec(fit_plnpca$bbeta[,1]- bbeta0[,1])
metricList$PLNPCA$err_bb <- norm_vec(as.vector(fit_plnpca$bbeta) - as.vector(bbeta0)) 
metricList$PLNPCA$Time <- time_plnpca

Sixthly, we implement the generalized linear latent variable models (GLLVM, Hui et al. 2017).

## GLLVM runs very slowly, so we do not run it here.

library(gllvm)
colnames(Z) <- c(paste0("V",1: ncol(Z)))
tic <- proc.time()
fit <- gllvm(y=X_count, X=Z, family=poisson(), num.lv= q, control = list(trace=T))
toc <- proc.time()
time_gllvm <- toc[3] - tic[3]

metricList$GLLVM <- list()
metricList$GLLVM$Tr_H <- measurefun(fit$lvs, H0)
metricList$GLLVM$Tr_B <- measurefun(fit$params$theta, B0)
metricList$GLLVM$err_bb1 <- norm_vec(fit$params$beta0- bbeta0[,1])
metricList$GLLVM$err_bb <- norm_vec(as.vector(cbind(fit$params$beta0,fit$params$Xcoef)) - as.vector(bbeta0)) 
metricList$GLLVM$Time <- time_gllvm
}

Seventhly, Poisson regression model for each variable was implemented.

PoisReg <- function(X_count, covariates){
     library(stats)
     hbbeta <- apply(X_count, 2, function(x){
       glm1 <- glm(x~covariates+0, family = "poisson")
       coef(glm1)
     } )
     return(t(hbbeta))
}
tic <- proc.time()
hbbeta_poisreg <- PoisReg(X_count, Z)
toc <- proc.time()
time_poisreg <- toc[3] - tic[3]
metricList$GLM <- list()
metricList$GLM$Tr_H <- NA
metricList$GLM$Tr_B <- NA
metricList$GLM$err_bb1 <- norm_vec(hbbeta_poisreg[,1]- bbeta0[,1])
metricList$GLM$err_bb <- norm_vec(as.vector(hbbeta_poisreg) - as.vector(bbeta0)) 
metricList$GLM$Time <- time_poisreg

Eightly, we implemented the first version of multi-response reduced-rank Poisson regression model (MMMR, Luo et al. 2018) implemented in rrpack R package (MRRR-Z), that did not consider the latent factor structure but only the covariates.

mrrr_run <- function(Y, X, rank0, q=NULL, family=list(poisson()), familygroup=rep(1,ncol(Y))){
  
  
  require(rrpack)
  
  n <- nrow(Y); p <- ncol(Y)
  
  if(!is.null(q)){
    rank0 <- rank0+q
    X <- cbind(X, diag(n))
  }
  
  svdX0d1 <- svd(X)$d[1]
  init1 = list(kappaC0 = svdX0d1 * 5) ## this setting follows the example that authors provided.
  
  fit.mrrr <- mrrr(Y=Y, X=X[,-1], family = family, familygroup = familygroup,
                   penstr = list(penaltySVD = "rankCon", lambdaSVD = 0.1),
                   init = init1, maxrank = rank0)
  hbbeta_mrrr <-t(fit.mrrr$coef[1:ncol(Z), ])
  if(!is.null(q)){
    Theta_hb <- (fit.mrrr$coef[(ncol(Z)+1): (nrow(Z)+ncol(Z)), ])
    svdTheta <- svd(Theta_hb, nu=q, nv=q)
    return(list(hbbeta=hbbeta_mrrr, factor=svdTheta$u, loading=svdTheta$v))
  }else{
    return(list(hbbeta=hbbeta_mrrr))
  }
  
  
}
tic <- proc.time()

res_mrrrz <- mrrr_run(X_count, Z, rank0)
toc <- proc.time()
time_mrrrz <- toc[3] - tic[3]

metricList$MRRR_Z <- list()
metricList$MRRR_Z$Tr_H <- NA
metricList$MRRR_Z$Tr_B <-NA
metricList$MRRR_Z$err_bb1 <- norm_vec(res_mrrrz$hbbeta[,1]- bbeta0[,1])
metricList$MRRR_Z$err_bb <- norm_vec(as.vector(res_mrrrz$hbbeta) - as.vector(bbeta0)) 
metricList$MRRR_Z$Time <- time_mrrrz

Lastly, we implemented the second version of MRRR (MRRR-F) that considered both covariates and the latent factor structure.

tic <- proc.time()
res_mrrrf <- mrrr_run(X_count, Z, rank0, q=q)
toc <- proc.time()
time_mrrrf <- toc[3] - tic[3]
metricList$MRRR_F <- list()
metricList$MRRR_F$Tr_H <- measurefun(res_mrrrf$factor, H0)
metricList$MRRR_F$Tr_B <- measurefun(res_mrrrf$loading, B0)
metricList$MRRR_F$err_bb1 <- norm_vec(res_mrrrf$hbbeta[,1]- bbeta0[,1])
metricList$MRRR_F$err_bb <- norm_vec(as.vector(res_mrrrf$hbbeta) - as.vector(bbeta0)) 
metricList$MRRR_F$Time <- time_mrrrf

COAP: simulation

Wei Liu

2025-03-27

Generate the simulated data

Compare with other methods

Visualize the comparison of performance

Select the parameters