Merge pull request #564 from belleau/main

Update to parse the plink output for LD blocks.
KrasnitzLab · Oct 16, 2024 · 7bceec5 · 7bceec5
2 parents 6bbb29a + 6c88638
commit 7bceec5
Show file tree

Hide file tree

Showing 10 changed files with 496 additions and 18 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -9,7 +9,7 @@ Description: This package implements specialized algorithms that enable
     following publication: Belleau, P et al. Genetic Ancestry Inference from 
     Cancer-Derived Molecular Data across Genomic and Transcriptomic 
     Platforms. Cancer Res 1 January 2023; 83 (1): 49–58.
-Version: 1.3.1
+Version: 1.3.2
 Authors@R: c(person("Pascal", "Belleau", email="[email protected]",
     role=c("cre", "aut"), comment = c(ORCID = "0000-0002-0802-1071")),
     person("Astrid", "Deschênes", email="[email protected]",

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,6 +1,7 @@
 # Generated by roxygen2: do not edit by hand
 
 export(add1KG2SampleGDS)
+export(addBlockFromDetFile)
 export(addGeneBlockGDSRefAnnot)
 export(addGeneBlockRefAnnot)
 export(addRef2GDS1KG)

diff --git a/R/gdsWrapper_internal.R b/R/gdsWrapper_internal.R
@@ -1335,3 +1335,107 @@ appendGDSSampleOnly <- function(gds, listSamples) {
 
     return(0L)
 }
+
+#' @title Append information associated to ld blocks, as indexes, into the
+#' Population Reference SNV Annotation GDS file
+#'
+#' @description The function appends the information about the ld blocks into
+#' the Population Reference SNV Annotation GDS file. The information is
+#' extracted from the parameter listBlock.
+#'
+#' @param gds an object of class \link[gdsfmt]{gds.class}
+#' (GDS file), an opened Reference Annotation GDS file.
+#'
+#' @param listBlock a \code{array} of integer
+#' representing the linkage disequilibrium block for
+#' each SNV in the in the same order than the variant
+#' in Population reference dataset.
+#'
+#' @param blockName a \code{character} string representing the id of the block.
+#' The blockName should not exist in \'gdsRefAnnotFile\'.
+#'
+#' @param blockDesc a \code{character} string representing the description of
+#' the block.
+#'
+#' @return The integer \code{0L} when successful.
+#'
+#' @examples
+#'
+#' ## Required library for GDS
+#' library(gdsfmt)
+#' ## Path to the demo pedigree file is located in this package
+#' dataDir <- system.file("extdata", package="RAIDS")
+#'
+#  ## Temporary file
+#' fileAnnotGDS <- file.path(tempdir(), "ex1_good_small_1KG_Ann_GDS.gds")
+#'
+#'
+#' file.copy(file.path(dataDir, "tests",
+#'     "ex1_NoBlockGene.1KG_Annot_GDS.gds"), fileAnnotGDS)
+#'
+#'
+#' fileReferenceGDS  <- file.path(dataDir, "tests",
+#'     "ex1_good_small_1KG.gds")
+#'  \donttest{
+#'     gdsRef <- openfn.gds(fileReferenceGDS)
+#'     listBlock <- read.gdsn(index.gdsn(gdsRef, "snp.position"))
+#'     listBlock <- rep(-1, length(listBlock))
+#'     closefn.gds(gdsRef)
+#'     gdsAnnot1KG <- openfn.gds(fileAnnotGDS, readonly=FALSE)
+#'     ## Append information associated to blocks
+#'     RAIDS:::addGDS1KGLDBlock(gds=gdsAnnot1KG,
+#'         listBlock=listBlock,
+#'         blockName="blockEmpty",
+#'         blockDesc="Example")
+#'.    closefn.gds(gdsAnnot1KG)
+#'
+#'     gdsAnnot1KG <- openfn.gds(fileAnnotGDS)
+#'     print(gdsAnnot1KG)
+#'
+#'     closefn.gds(gdsAnnot1KG)
+#' }
+#'
+#' ## Remove temporary file
+#' unlink(fileAnnotGDS, force=TRUE)
+#'
+#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
+#' @importFrom gdsfmt add.gdsn index.gdsn ls.gdsn compression.gdsn
+#' @importFrom gdsfmt append.gdsn sync.gds
+#' @encoding UTF-8
+#' @keywords internal
+addGDS1KGLDBlock <- function(gds, listBlock, blockName, blockDesc) {
+
+    blockAnnot <- data.frame(block.id=blockName,
+                             block.desc=blockDesc,
+                             stringsAsFactors=FALSE)
+
+    if(! ("block.annot" %in% ls.gdsn(gds))) {
+        varBlockAnnot <- add.gdsn(gds, "block.annot", blockAnnot)
+    }else {
+        curAnnot <- index.gdsn(gds, "block.annot/block.id")
+        append.gdsn(curAnnot,blockAnnot$block.id)
+        curAnnot <- index.gdsn(gds, "block.annot/block.desc")
+        append.gdsn(curAnnot, blockAnnot$block.desc)
+    }
+
+    varBlock <- NULL
+    if(! ("block" %in% ls.gdsn(gds))){
+        varBlock <- add.gdsn(gds, "block",
+                             valdim=c(length(listBlock), 1),
+                             listBlock, storage="int32",
+                             compress = "LZ4_RA")
+        readmode.gdsn(varBlock)
+
+    }else {
+        if(is.null(varBlock)) {
+            varBlock <- index.gdsn(gds, "block")
+            varBlock <- compression.gdsn(varBlock, "")
+        }
+        append.gdsn(varBlock, listBlock)
+        varBlock <- compression.gdsn(varBlock, "LZ4_RA")
+    }
+
+    sync.gds(gds)
+
+    return(0L)
+}
diff --git a/R/process1KG.R b/R/process1KG.R
@@ -1020,6 +1020,183 @@ getRefSuperPop <- function(fileReferenceGDS) {
     return(df)
 }
 
+#' @title Append information associated to ld blocks, as indexes, into the
+#' Population Reference SNV Annotation GDS file
+#'
+#' @description The function appends the information about the ld blocks into
+#' the Population Reference SNV Annotation GDS file. The information is
+#' extracted from the Population Reference GDS file and files \'.det\'.
+#'
+#' @param fileReferenceGDS a \code{character} string representing the file
+#' name of the Reference GDS file. The file must exist.
+#'
+#' @param gdsRefAnnotFile a \code{character} string representing the
+#' file name corresponding the Reference SNV
+#' Annotation GDS file. The function will
+#' open it in write mode and close it after. The file must exist.
+#'
+#' @param pathBlock a \code{character} string representing the directory
+#' where all the output file det from the plink block command are located.
+#' The directory must not include other file with the extension \'.det\'.
+#' The name of the \'.det\' must include the super-population between \'.\'
+#' and the chromosome in the form \'chrNumber.\' \( \'chr1.\'\).
+#'
+#' @param superPop a \code{character} string representing the super population.
+#'
+#' @param blockName a \code{character} string representing the id of the block.
+#' The blockName should not exist in \'gdsRefAnnotFile\'.
+#' Default: \code{"ldBlock"}.
+#'
+#' @param blockDesc a \code{character} string representing the description of
+#' the block.
+#' Default: \code{"Not Define"}
+#'
+#' @param verbose a \code{logical} indicating if message information should be
+#' printed. Default: \code{FALSE}.
+#'
+#' @return \code{OL} when the function is successful.
+#'
+#' @details
+#'
+#' More information about GDS file format can be found at the Bioconductor
+#' gdsfmt website:
+#' https://bioconductor.org/packages/gdsfmt/
+#'
+#' @examples
+#'
+#' ## Path to the demo pedigree file is located in this package
+#' dataDir <- system.file("extdata", package="RAIDS")
+#'
+#  ## Temporary file
+#' fileAnnotGDS <- file.path(tempdir(), "ex1_good_small_1KG_Ann_GDS.gds")
+#'
+#' ## Demo of of output file det from the plink block
+#' ## command for chromosome 1
+#' fileLdBlock <- file.path(dirname(fileAnnotGDS), "block.sp.EUR.Ex.chr1.blocks.det")
+#'
+#'
+#' file.copy(file.path(dataDir, "tests",
+#'     "ex1_NoBlockGene.1KG_Annot_GDS.gds"), fileAnnotGDS)
+#' file.copy(file.path(dataDir, "block.sp.EUR.Ex.chr1.blocks.det"),
+#'     fileLdBlock)
+#'
+#'
+#'
+#' ## GDS Reference file
+#' fileReferenceGDS  <- file.path(dataDir, "tests",
+#'     "ex1_good_small_1KG.gds")
+#'
+#'  \donttest{
+#'
+#'
+#'     ## Append information associated to blocks
+#'     addBlockFromDetFile(fileReferenceGDS=fileReferenceGDS,
+#'         gdsRefAnnotFile=fileAnnotGDS,
+#'         pathBlock=dirname(fileAnnotGDS),
+#'         superPop="EUR")
+#'
+#'     gdsAnnot1KG <- openfn.gds(fileAnnotGDS)
+#'     print(gdsAnnot1KG)
+#'
+#'     closefn.gds(gdsAnnot1KG)
+#' }
+#'
+#' ## Remove temporary file
+#' unlink(fileAnnotGDS, force=TRUE)
+#' unlink(fileLdBlock, force=TRUE)
+#'
+#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
+#'
+#' @importFrom gdsfmt openfn.gds closefn.gds read.gdsn index.gdsn ls.gdsn
+#' @importFrom SNPRelate snpgdsOpen
+#' @encoding UTF-8
+#' @export
+addBlockFromDetFile <- function(fileReferenceGDS, gdsRefAnnotFile, pathBlock,
+                                superPop, blockName="ldBlock",
+                                blockDesc="Not Define", verbose=FALSE) {
+    if (!(is.character(fileReferenceGDS) && (file.exists(fileReferenceGDS)))) {
+        stop("The \'fileReferenceGDS\' must be a character string ",
+             "representing the Reference GDS file. The file must exist.")
+    }
+    if(!(is.character(blockName))){
+        stop("The \'blockName\' must be a character string ",
+             "representing the name of the block.")
+    }
+
+    if(blockName == "ldBlock"){
+        blockName <- paste0(blockName, ".", superPop)
+    }
+
+    gdsRefAnnot <- openfn.gds(gdsRefAnnotFile)
+
+    if(("block.annot" %in% ls.gdsn(gdsRefAnnot))) {
+        listAnno <- read.gdsn(index.gdsn(gdsRefAnnot, "block.annot"))
+        if(length(which(gdsRefAnnot$block.id  == blockName)) > 0){
+            stop("The \'blockName\' already exist in \'gdsRefAnnotFile\'.")
+        }
+    }
+    closefn.gds(gdsRefAnnot)
+
+    gdsReference <- snpgdsOpen(filename=fileReferenceGDS)
+
+
+
+    ## The verbose must be a logical
+    validateLogical(verbose, "verbose")
+
+    ## Extract the SNP chromosomes and positions
+    snpChromosome <- read.gdsn(index.gdsn(gdsReference, "snp.chromosome"))
+    #snpPosition <- read.gdsn(index.gdsn(gdsReference, "snp.position"))
+    closefn.gds(gdsReference)
+
+    listFileBlock <- dir(pathBlock, ".det")
+    listFileBlock <- listFileBlock[grep(paste0("\\.", superPop, "\\."), listFileBlock)]
+
+    listChr <- unique(snpChromosome)
+
+    #listChr <- listChr[order(listChr)]
+    #listChr <- seq_len(22)
+    listBlock <- list()
+
+    for(chr in seq_len(length(listChr))) {
+        if(verbose) { message("chr", listChr[chr], " ",Sys.time()) }
+        listChrCur <- listFileBlock[grep(paste0("chr",listChr[chr],"\\."), listFileBlock)]
+        if(length(listChrCur) == 1){
+            tmp <- processBlockChr(fileReferenceGDS, file.path(pathBlock, listChrCur))
+            listBlock[[chr]] <- tmp$block.snp
+            if(chr > 1) {
+                vMax <- max(listBlock[[chr-1]])
+                vMin <- min(listBlock[[chr-1]])
+                listBlock[[chr]][listBlock[[chr]] > 0] <-
+                    listBlock[[chr]][listBlock[[chr]] > 0] + vMax
+                if(vMin < 0) {
+                    listBlock[[chr]][listBlock[[chr]] < 0] <-
+                        listBlock[[chr]][listBlock[[chr]] < 0] + vMin
+                }
+            }
+        }else{
+
+            listBlock[[chr]] <- rep(-1, length(which(snpChromosome == listChr[chr])))
+            vMin <- 0
+            if(chr > 1){
+                vMin <- min(listBlock[[chr-1]])
+            }
+            if(vMin < 0){
+                listBlock[[chr]] <- listBlock[[chr]] + vMin
+            }
+        }
+
+    }
+    listBlock <- do.call(c, listBlock)
+
+    gdsRefAnnot <- openfn.gds(gdsRefAnnotFile, readonly = FALSE)
+
+    ## Save the information into the GDS file
+    addGDS1KGLDBlock(gdsRefAnnot, listBlock, blockName, blockDesc)
+    closefn.gds(gdsRefAnnot)
+    ## Success
+    return(0L)
+}
 
 #' @title Append information associated to blocks, as indexes, into the
 #' Population Reference SNV Annotation GDS file

diff --git a/R/processStudy.R b/R/processStudy.R
@@ -2351,7 +2351,9 @@ inferAncestry <- function(profileFile, pathProfileGDS,
 
     genoSource <- arg_match(genoSource)
 
-
+    if(genoSource == "bam"){
+        stop("The bam is not release yet look to get a \'Devel\' version or contact us")
+    }
     profileName <- gsub("\\.gz$", "", profileBaseName, ignore.case = TRUE)
     for(extCur in c( "\\.vcf$", "\\.txt$", "\\.bam", "\\.tsv", "\\.csv")){
         profileName <- gsub(extCur, "", profileName, ignore.case = TRUE)
@@ -2776,6 +2778,9 @@ inferAncestryGeneAware <- function(profileFile, pathProfileGDS,
 
     genoSource <- arg_match(genoSource)
 
+    if(genoSource == "bam"){
+        stop("The bam is not release yet look to get a \'Devel\' version or contact us")
+    }
 
     profileName <- gsub("\\.gz$", "", profileBaseName, ignore.case = TRUE)
     for(extCur in c( "\\.vcf$", "\\.txt$", "\\.bam", "\\.tsv", "\\.csv")){

diff --git a/R/tools_internal.R b/R/tools_internal.R
@@ -458,7 +458,7 @@ readSNVVCF <- function(fileName,
 #' \item{\code{chr}}{ a \code{integer} representing a the chromosome from
 #' fileBlock.
 #' }
-#' \item{\code{block.snp}}{ the a \code{array} of integer
+#' \item{\code{block.snp}}{ a \code{array} of integer
 #' representing the linkage disequilibrium block for
 #' each SNV in the in the same order than the variant
 #' in Population reference dataset.
@@ -506,43 +506,46 @@ processBlockChr <- function(fileReferenceGDS, fileBlock) {
     listChr <- as.integer(gsub("chr", "", listChr))
     listSNVChr <- read.gdsn(index.gdsn(gdsReference, "snp.chromosome"))
     listSNVChr <- which(listSNVChr == listChr)
-    snp.keep <- read.gdsn(index.gdsn(gdsReference, "snp.position"))[listSNVChr]
+    snpKeep <- read.gdsn(index.gdsn(gdsReference, "snp.position"))[listSNVChr]
     closefn.gds(gdsReference)
-    z <- cbind(c(blockChr$BP1, snp.keep, blockChr$BP2+1),
+    z <- cbind(c(blockChr$BP1, snpKeep, blockChr$BP2+1),
                c(seq_len(nrow(blockChr)),
-                 rep(0, length(snp.keep)), -1*seq_len(nrow(blockChr))))
+                 rep(0, length(snpKeep)), -1*seq_len(nrow(blockChr))))
 
     z <- z[order(z[,1]),]
-    block.snp <- cumsum(z[,2])[z[,2] == 0]
+    blockSnp <- cumsum(z[,2])[z[,2] == 0]
 
     curStart <- 0
     activeBlock <- 0
     blockState <- 0
-    block.inter <- rep(0, length(which(block.snp == 0)))
+    blockInter <- rep(0, length(which(blockSnp == 0)))
     k <- 1
-    for(i in seq_len(length(block.snp))){
-        if(block.snp[i] == 0){
+    for(i in seq_len(length(blockSnp))){
+        if(blockSnp[i] == 0){
             if(activeBlock == 1){
-                if(snp.keep[i] - curStart >= 10000) {
+                if(snpKeep[i] - curStart >= 10000) {
                     blockState <- blockState - 1
 
-                    curStart <- snp.keep[i]
+                    curStart <- snpKeep[i]
                 }
             } else{
                 blockState <- blockState - 1
-                curStart <- snp.keep[i]
-                curStart <- snp.keep[i]
+                curStart <- snpKeep[i]
                 activeBlock <- 1
             }
-            block.inter[k] <- blockState
+            if(blockState == 0){
+                blockState <- -1
+            }
+            blockInter[k] <- blockState
             k <- k + 1
         }else{
             activeBlock <- 0
         }
     }
-    block.snp[block.snp == 0] <- block.inter
+
+    blockSnp[blockSnp == 0] <- blockInter
     res <- list(chr=listChr,
-                block.snp=block.snp)
+                block.snp=blockSnp)
     return(res)
 }