Skip to content

Commit

Permalink
Merge pull request #564 from belleau/main
Browse files Browse the repository at this point in the history
Update to parse the plink output for LD blocks.
  • Loading branch information
adeschen authored Oct 16, 2024
2 parents 6bbb29a + 6c88638 commit 7bceec5
Show file tree
Hide file tree
Showing 10 changed files with 496 additions and 18 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ Description: This package implements specialized algorithms that enable
following publication: Belleau, P et al. Genetic Ancestry Inference from
Cancer-Derived Molecular Data across Genomic and Transcriptomic
Platforms. Cancer Res 1 January 2023; 83 (1): 49–58.
Version: 1.3.1
Version: 1.3.2
Authors@R: c(person("Pascal", "Belleau", email="[email protected]",
role=c("cre", "aut"), comment = c(ORCID = "0000-0002-0802-1071")),
person("Astrid", "Deschênes", email="[email protected]",
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Generated by roxygen2: do not edit by hand

export(add1KG2SampleGDS)
export(addBlockFromDetFile)
export(addGeneBlockGDSRefAnnot)
export(addGeneBlockRefAnnot)
export(addRef2GDS1KG)
Expand Down
104 changes: 104 additions & 0 deletions R/gdsWrapper_internal.R
Original file line number Diff line number Diff line change
Expand Up @@ -1335,3 +1335,107 @@ appendGDSSampleOnly <- function(gds, listSamples) {

return(0L)
}

#' @title Append information associated to ld blocks, as indexes, into the
#' Population Reference SNV Annotation GDS file
#'
#' @description The function appends the information about the ld blocks into
#' the Population Reference SNV Annotation GDS file. The information is
#' extracted from the parameter listBlock.
#'
#' @param gds an object of class \link[gdsfmt]{gds.class}
#' (GDS file), an opened Reference Annotation GDS file.
#'
#' @param listBlock a \code{array} of integer
#' representing the linkage disequilibrium block for
#' each SNV in the in the same order than the variant
#' in Population reference dataset.
#'
#' @param blockName a \code{character} string representing the id of the block.
#' The blockName should not exist in \'gdsRefAnnotFile\'.
#'
#' @param blockDesc a \code{character} string representing the description of
#' the block.
#'
#' @return The integer \code{0L} when successful.
#'
#' @examples
#'
#' ## Required library for GDS
#' library(gdsfmt)
#' ## Path to the demo pedigree file is located in this package
#' dataDir <- system.file("extdata", package="RAIDS")
#'
# ## Temporary file
#' fileAnnotGDS <- file.path(tempdir(), "ex1_good_small_1KG_Ann_GDS.gds")
#'
#'
#' file.copy(file.path(dataDir, "tests",
#' "ex1_NoBlockGene.1KG_Annot_GDS.gds"), fileAnnotGDS)
#'
#'
#' fileReferenceGDS <- file.path(dataDir, "tests",
#' "ex1_good_small_1KG.gds")
#' \donttest{
#' gdsRef <- openfn.gds(fileReferenceGDS)
#' listBlock <- read.gdsn(index.gdsn(gdsRef, "snp.position"))
#' listBlock <- rep(-1, length(listBlock))
#' closefn.gds(gdsRef)
#' gdsAnnot1KG <- openfn.gds(fileAnnotGDS, readonly=FALSE)
#' ## Append information associated to blocks
#' RAIDS:::addGDS1KGLDBlock(gds=gdsAnnot1KG,
#' listBlock=listBlock,
#' blockName="blockEmpty",
#' blockDesc="Example")
#'. closefn.gds(gdsAnnot1KG)
#'
#' gdsAnnot1KG <- openfn.gds(fileAnnotGDS)
#' print(gdsAnnot1KG)
#'
#' closefn.gds(gdsAnnot1KG)
#' }
#'
#' ## Remove temporary file
#' unlink(fileAnnotGDS, force=TRUE)
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom gdsfmt add.gdsn index.gdsn ls.gdsn compression.gdsn
#' @importFrom gdsfmt append.gdsn sync.gds
#' @encoding UTF-8
#' @keywords internal
addGDS1KGLDBlock <- function(gds, listBlock, blockName, blockDesc) {

blockAnnot <- data.frame(block.id=blockName,
block.desc=blockDesc,
stringsAsFactors=FALSE)

if(! ("block.annot" %in% ls.gdsn(gds))) {
varBlockAnnot <- add.gdsn(gds, "block.annot", blockAnnot)
}else {
curAnnot <- index.gdsn(gds, "block.annot/block.id")
append.gdsn(curAnnot,blockAnnot$block.id)
curAnnot <- index.gdsn(gds, "block.annot/block.desc")
append.gdsn(curAnnot, blockAnnot$block.desc)
}

varBlock <- NULL
if(! ("block" %in% ls.gdsn(gds))){
varBlock <- add.gdsn(gds, "block",
valdim=c(length(listBlock), 1),
listBlock, storage="int32",
compress = "LZ4_RA")
readmode.gdsn(varBlock)

}else {
if(is.null(varBlock)) {
varBlock <- index.gdsn(gds, "block")
varBlock <- compression.gdsn(varBlock, "")
}
append.gdsn(varBlock, listBlock)
varBlock <- compression.gdsn(varBlock, "LZ4_RA")
}

sync.gds(gds)

return(0L)
}
177 changes: 177 additions & 0 deletions R/process1KG.R
Original file line number Diff line number Diff line change
Expand Up @@ -1020,6 +1020,183 @@ getRefSuperPop <- function(fileReferenceGDS) {
return(df)
}

#' @title Append information associated to ld blocks, as indexes, into the
#' Population Reference SNV Annotation GDS file
#'
#' @description The function appends the information about the ld blocks into
#' the Population Reference SNV Annotation GDS file. The information is
#' extracted from the Population Reference GDS file and files \'.det\'.
#'
#' @param fileReferenceGDS a \code{character} string representing the file
#' name of the Reference GDS file. The file must exist.
#'
#' @param gdsRefAnnotFile a \code{character} string representing the
#' file name corresponding the Reference SNV
#' Annotation GDS file. The function will
#' open it in write mode and close it after. The file must exist.
#'
#' @param pathBlock a \code{character} string representing the directory
#' where all the output file det from the plink block command are located.
#' The directory must not include other file with the extension \'.det\'.
#' The name of the \'.det\' must include the super-population between \'.\'
#' and the chromosome in the form \'chrNumber.\' \( \'chr1.\'\).
#'
#' @param superPop a \code{character} string representing the super population.
#'
#' @param blockName a \code{character} string representing the id of the block.
#' The blockName should not exist in \'gdsRefAnnotFile\'.
#' Default: \code{"ldBlock"}.
#'
#' @param blockDesc a \code{character} string representing the description of
#' the block.
#' Default: \code{"Not Define"}
#'
#' @param verbose a \code{logical} indicating if message information should be
#' printed. Default: \code{FALSE}.
#'
#' @return \code{OL} when the function is successful.
#'
#' @details
#'
#' More information about GDS file format can be found at the Bioconductor
#' gdsfmt website:
#' https://bioconductor.org/packages/gdsfmt/
#'
#' @examples
#'
#' ## Path to the demo pedigree file is located in this package
#' dataDir <- system.file("extdata", package="RAIDS")
#'
# ## Temporary file
#' fileAnnotGDS <- file.path(tempdir(), "ex1_good_small_1KG_Ann_GDS.gds")
#'
#' ## Demo of of output file det from the plink block
#' ## command for chromosome 1
#' fileLdBlock <- file.path(dirname(fileAnnotGDS), "block.sp.EUR.Ex.chr1.blocks.det")
#'
#'
#' file.copy(file.path(dataDir, "tests",
#' "ex1_NoBlockGene.1KG_Annot_GDS.gds"), fileAnnotGDS)
#' file.copy(file.path(dataDir, "block.sp.EUR.Ex.chr1.blocks.det"),
#' fileLdBlock)
#'
#'
#'
#' ## GDS Reference file
#' fileReferenceGDS <- file.path(dataDir, "tests",
#' "ex1_good_small_1KG.gds")
#'
#' \donttest{
#'
#'
#' ## Append information associated to blocks
#' addBlockFromDetFile(fileReferenceGDS=fileReferenceGDS,
#' gdsRefAnnotFile=fileAnnotGDS,
#' pathBlock=dirname(fileAnnotGDS),
#' superPop="EUR")
#'
#' gdsAnnot1KG <- openfn.gds(fileAnnotGDS)
#' print(gdsAnnot1KG)
#'
#' closefn.gds(gdsAnnot1KG)
#' }
#'
#' ## Remove temporary file
#' unlink(fileAnnotGDS, force=TRUE)
#' unlink(fileLdBlock, force=TRUE)
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#'
#' @importFrom gdsfmt openfn.gds closefn.gds read.gdsn index.gdsn ls.gdsn
#' @importFrom SNPRelate snpgdsOpen
#' @encoding UTF-8
#' @export
addBlockFromDetFile <- function(fileReferenceGDS, gdsRefAnnotFile, pathBlock,
superPop, blockName="ldBlock",
blockDesc="Not Define", verbose=FALSE) {
if (!(is.character(fileReferenceGDS) && (file.exists(fileReferenceGDS)))) {
stop("The \'fileReferenceGDS\' must be a character string ",
"representing the Reference GDS file. The file must exist.")
}
if(!(is.character(blockName))){
stop("The \'blockName\' must be a character string ",
"representing the name of the block.")
}

if(blockName == "ldBlock"){
blockName <- paste0(blockName, ".", superPop)
}

gdsRefAnnot <- openfn.gds(gdsRefAnnotFile)

if(("block.annot" %in% ls.gdsn(gdsRefAnnot))) {
listAnno <- read.gdsn(index.gdsn(gdsRefAnnot, "block.annot"))
if(length(which(gdsRefAnnot$block.id == blockName)) > 0){
stop("The \'blockName\' already exist in \'gdsRefAnnotFile\'.")
}
}
closefn.gds(gdsRefAnnot)

gdsReference <- snpgdsOpen(filename=fileReferenceGDS)



## The verbose must be a logical
validateLogical(verbose, "verbose")

## Extract the SNP chromosomes and positions
snpChromosome <- read.gdsn(index.gdsn(gdsReference, "snp.chromosome"))
#snpPosition <- read.gdsn(index.gdsn(gdsReference, "snp.position"))
closefn.gds(gdsReference)

listFileBlock <- dir(pathBlock, ".det")
listFileBlock <- listFileBlock[grep(paste0("\\.", superPop, "\\."), listFileBlock)]

listChr <- unique(snpChromosome)

#listChr <- listChr[order(listChr)]
#listChr <- seq_len(22)
listBlock <- list()

for(chr in seq_len(length(listChr))) {
if(verbose) { message("chr", listChr[chr], " ",Sys.time()) }
listChrCur <- listFileBlock[grep(paste0("chr",listChr[chr],"\\."), listFileBlock)]
if(length(listChrCur) == 1){
tmp <- processBlockChr(fileReferenceGDS, file.path(pathBlock, listChrCur))
listBlock[[chr]] <- tmp$block.snp
if(chr > 1) {
vMax <- max(listBlock[[chr-1]])
vMin <- min(listBlock[[chr-1]])
listBlock[[chr]][listBlock[[chr]] > 0] <-
listBlock[[chr]][listBlock[[chr]] > 0] + vMax
if(vMin < 0) {
listBlock[[chr]][listBlock[[chr]] < 0] <-
listBlock[[chr]][listBlock[[chr]] < 0] + vMin
}
}
}else{

listBlock[[chr]] <- rep(-1, length(which(snpChromosome == listChr[chr])))
vMin <- 0
if(chr > 1){
vMin <- min(listBlock[[chr-1]])
}
if(vMin < 0){
listBlock[[chr]] <- listBlock[[chr]] + vMin
}
}

}
listBlock <- do.call(c, listBlock)

gdsRefAnnot <- openfn.gds(gdsRefAnnotFile, readonly = FALSE)

## Save the information into the GDS file
addGDS1KGLDBlock(gdsRefAnnot, listBlock, blockName, blockDesc)
closefn.gds(gdsRefAnnot)
## Success
return(0L)
}

#' @title Append information associated to blocks, as indexes, into the
#' Population Reference SNV Annotation GDS file
Expand Down
7 changes: 6 additions & 1 deletion R/processStudy.R
Original file line number Diff line number Diff line change
Expand Up @@ -2351,7 +2351,9 @@ inferAncestry <- function(profileFile, pathProfileGDS,

genoSource <- arg_match(genoSource)


if(genoSource == "bam"){
stop("The bam is not release yet look to get a \'Devel\' version or contact us")
}
profileName <- gsub("\\.gz$", "", profileBaseName, ignore.case = TRUE)
for(extCur in c( "\\.vcf$", "\\.txt$", "\\.bam", "\\.tsv", "\\.csv")){
profileName <- gsub(extCur, "", profileName, ignore.case = TRUE)
Expand Down Expand Up @@ -2776,6 +2778,9 @@ inferAncestryGeneAware <- function(profileFile, pathProfileGDS,

genoSource <- arg_match(genoSource)

if(genoSource == "bam"){
stop("The bam is not release yet look to get a \'Devel\' version or contact us")
}

profileName <- gsub("\\.gz$", "", profileBaseName, ignore.case = TRUE)
for(extCur in c( "\\.vcf$", "\\.txt$", "\\.bam", "\\.tsv", "\\.csv")){
Expand Down
33 changes: 18 additions & 15 deletions R/tools_internal.R
Original file line number Diff line number Diff line change
Expand Up @@ -458,7 +458,7 @@ readSNVVCF <- function(fileName,
#' \item{\code{chr}}{ a \code{integer} representing a the chromosome from
#' fileBlock.
#' }
#' \item{\code{block.snp}}{ the a \code{array} of integer
#' \item{\code{block.snp}}{ a \code{array} of integer
#' representing the linkage disequilibrium block for
#' each SNV in the in the same order than the variant
#' in Population reference dataset.
Expand Down Expand Up @@ -506,43 +506,46 @@ processBlockChr <- function(fileReferenceGDS, fileBlock) {
listChr <- as.integer(gsub("chr", "", listChr))
listSNVChr <- read.gdsn(index.gdsn(gdsReference, "snp.chromosome"))
listSNVChr <- which(listSNVChr == listChr)
snp.keep <- read.gdsn(index.gdsn(gdsReference, "snp.position"))[listSNVChr]
snpKeep <- read.gdsn(index.gdsn(gdsReference, "snp.position"))[listSNVChr]
closefn.gds(gdsReference)
z <- cbind(c(blockChr$BP1, snp.keep, blockChr$BP2+1),
z <- cbind(c(blockChr$BP1, snpKeep, blockChr$BP2+1),
c(seq_len(nrow(blockChr)),
rep(0, length(snp.keep)), -1*seq_len(nrow(blockChr))))
rep(0, length(snpKeep)), -1*seq_len(nrow(blockChr))))

z <- z[order(z[,1]),]
block.snp <- cumsum(z[,2])[z[,2] == 0]
blockSnp <- cumsum(z[,2])[z[,2] == 0]

curStart <- 0
activeBlock <- 0
blockState <- 0
block.inter <- rep(0, length(which(block.snp == 0)))
blockInter <- rep(0, length(which(blockSnp == 0)))
k <- 1
for(i in seq_len(length(block.snp))){
if(block.snp[i] == 0){
for(i in seq_len(length(blockSnp))){
if(blockSnp[i] == 0){
if(activeBlock == 1){
if(snp.keep[i] - curStart >= 10000) {
if(snpKeep[i] - curStart >= 10000) {
blockState <- blockState - 1

curStart <- snp.keep[i]
curStart <- snpKeep[i]
}
} else{
blockState <- blockState - 1
curStart <- snp.keep[i]
curStart <- snp.keep[i]
curStart <- snpKeep[i]
activeBlock <- 1
}
block.inter[k] <- blockState
if(blockState == 0){
blockState <- -1
}
blockInter[k] <- blockState
k <- k + 1
}else{
activeBlock <- 0
}
}
block.snp[block.snp == 0] <- block.inter

blockSnp[blockSnp == 0] <- blockInter
res <- list(chr=listChr,
block.snp=block.snp)
block.snp=blockSnp)
return(res)
}

Loading

0 comments on commit 7bceec5

Please sign in to comment.