Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update to parse the plink output for LD blocks. #564

Merged
merged 3 commits into from
Oct 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ Description: This package implements specialized algorithms that enable
following publication: Belleau, P et al. Genetic Ancestry Inference from
Cancer-Derived Molecular Data across Genomic and Transcriptomic
Platforms. Cancer Res 1 January 2023; 83 (1): 49–58.
Version: 1.3.1
Version: 1.3.2
Authors@R: c(person("Pascal", "Belleau", email="[email protected]",
role=c("cre", "aut"), comment = c(ORCID = "0000-0002-0802-1071")),
person("Astrid", "Deschênes", email="[email protected]",
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Generated by roxygen2: do not edit by hand

export(add1KG2SampleGDS)
export(addBlockFromDetFile)
export(addGeneBlockGDSRefAnnot)
export(addGeneBlockRefAnnot)
export(addRef2GDS1KG)
Expand Down
104 changes: 104 additions & 0 deletions R/gdsWrapper_internal.R
Original file line number Diff line number Diff line change
Expand Up @@ -1335,3 +1335,107 @@ appendGDSSampleOnly <- function(gds, listSamples) {

return(0L)
}

#' @title Append information associated to ld blocks, as indexes, into the
#' Population Reference SNV Annotation GDS file
#'
#' @description The function appends the information about the ld blocks into
#' the Population Reference SNV Annotation GDS file. The information is
#' extracted from the parameter listBlock.
#'
#' @param gds an object of class \link[gdsfmt]{gds.class}
#' (GDS file), an opened Reference Annotation GDS file.
#'
#' @param listBlock a \code{array} of integer
#' representing the linkage disequilibrium block for
#' each SNV in the in the same order than the variant
#' in Population reference dataset.
#'
#' @param blockName a \code{character} string representing the id of the block.
#' The blockName should not exist in \'gdsRefAnnotFile\'.
#'
#' @param blockDesc a \code{character} string representing the description of
#' the block.
#'
#' @return The integer \code{0L} when successful.
#'
#' @examples
#'
#' ## Required library for GDS
#' library(gdsfmt)
#' ## Path to the demo pedigree file is located in this package
#' dataDir <- system.file("extdata", package="RAIDS")
#'
# ## Temporary file
#' fileAnnotGDS <- file.path(tempdir(), "ex1_good_small_1KG_Ann_GDS.gds")
#'
#'
#' file.copy(file.path(dataDir, "tests",
#' "ex1_NoBlockGene.1KG_Annot_GDS.gds"), fileAnnotGDS)
#'
#'
#' fileReferenceGDS <- file.path(dataDir, "tests",
#' "ex1_good_small_1KG.gds")
#' \donttest{
#' gdsRef <- openfn.gds(fileReferenceGDS)
#' listBlock <- read.gdsn(index.gdsn(gdsRef, "snp.position"))
#' listBlock <- rep(-1, length(listBlock))
#' closefn.gds(gdsRef)
#' gdsAnnot1KG <- openfn.gds(fileAnnotGDS, readonly=FALSE)
#' ## Append information associated to blocks
#' RAIDS:::addGDS1KGLDBlock(gds=gdsAnnot1KG,
#' listBlock=listBlock,
#' blockName="blockEmpty",
#' blockDesc="Example")
#'. closefn.gds(gdsAnnot1KG)
#'
#' gdsAnnot1KG <- openfn.gds(fileAnnotGDS)
#' print(gdsAnnot1KG)
#'
#' closefn.gds(gdsAnnot1KG)
#' }
#'
#' ## Remove temporary file
#' unlink(fileAnnotGDS, force=TRUE)
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom gdsfmt add.gdsn index.gdsn ls.gdsn compression.gdsn
#' @importFrom gdsfmt append.gdsn sync.gds
#' @encoding UTF-8
#' @keywords internal
addGDS1KGLDBlock <- function(gds, listBlock, blockName, blockDesc) {

blockAnnot <- data.frame(block.id=blockName,
block.desc=blockDesc,
stringsAsFactors=FALSE)

if(! ("block.annot" %in% ls.gdsn(gds))) {
varBlockAnnot <- add.gdsn(gds, "block.annot", blockAnnot)
}else {
curAnnot <- index.gdsn(gds, "block.annot/block.id")
append.gdsn(curAnnot,blockAnnot$block.id)
curAnnot <- index.gdsn(gds, "block.annot/block.desc")
append.gdsn(curAnnot, blockAnnot$block.desc)
}

varBlock <- NULL
if(! ("block" %in% ls.gdsn(gds))){
varBlock <- add.gdsn(gds, "block",
valdim=c(length(listBlock), 1),
listBlock, storage="int32",
compress = "LZ4_RA")
readmode.gdsn(varBlock)

}else {
if(is.null(varBlock)) {
varBlock <- index.gdsn(gds, "block")
varBlock <- compression.gdsn(varBlock, "")
}
append.gdsn(varBlock, listBlock)
varBlock <- compression.gdsn(varBlock, "LZ4_RA")
}

sync.gds(gds)

return(0L)
}
177 changes: 177 additions & 0 deletions R/process1KG.R
Original file line number Diff line number Diff line change
Expand Up @@ -1020,6 +1020,183 @@ getRefSuperPop <- function(fileReferenceGDS) {
return(df)
}

#' @title Append information associated to ld blocks, as indexes, into the
#' Population Reference SNV Annotation GDS file
#'
#' @description The function appends the information about the ld blocks into
#' the Population Reference SNV Annotation GDS file. The information is
#' extracted from the Population Reference GDS file and files \'.det\'.
#'
#' @param fileReferenceGDS a \code{character} string representing the file
#' name of the Reference GDS file. The file must exist.
#'
#' @param gdsRefAnnotFile a \code{character} string representing the
#' file name corresponding the Reference SNV
#' Annotation GDS file. The function will
#' open it in write mode and close it after. The file must exist.
#'
#' @param pathBlock a \code{character} string representing the directory
#' where all the output file det from the plink block command are located.
#' The directory must not include other file with the extension \'.det\'.
#' The name of the \'.det\' must include the super-population between \'.\'
#' and the chromosome in the form \'chrNumber.\' \( \'chr1.\'\).
#'
#' @param superPop a \code{character} string representing the super population.
#'
#' @param blockName a \code{character} string representing the id of the block.
#' The blockName should not exist in \'gdsRefAnnotFile\'.
#' Default: \code{"ldBlock"}.
#'
#' @param blockDesc a \code{character} string representing the description of
#' the block.
#' Default: \code{"Not Define"}
#'
#' @param verbose a \code{logical} indicating if message information should be
#' printed. Default: \code{FALSE}.
#'
#' @return \code{OL} when the function is successful.
#'
#' @details
#'
#' More information about GDS file format can be found at the Bioconductor
#' gdsfmt website:
#' https://bioconductor.org/packages/gdsfmt/
#'
#' @examples
#'
#' ## Path to the demo pedigree file is located in this package
#' dataDir <- system.file("extdata", package="RAIDS")
#'
# ## Temporary file
#' fileAnnotGDS <- file.path(tempdir(), "ex1_good_small_1KG_Ann_GDS.gds")
#'
#' ## Demo of of output file det from the plink block
#' ## command for chromosome 1
#' fileLdBlock <- file.path(dirname(fileAnnotGDS), "block.sp.EUR.Ex.chr1.blocks.det")
#'
#'
#' file.copy(file.path(dataDir, "tests",
#' "ex1_NoBlockGene.1KG_Annot_GDS.gds"), fileAnnotGDS)
#' file.copy(file.path(dataDir, "block.sp.EUR.Ex.chr1.blocks.det"),
#' fileLdBlock)
#'
#'
#'
#' ## GDS Reference file
#' fileReferenceGDS <- file.path(dataDir, "tests",
#' "ex1_good_small_1KG.gds")
#'
#' \donttest{
#'
#'
#' ## Append information associated to blocks
#' addBlockFromDetFile(fileReferenceGDS=fileReferenceGDS,
#' gdsRefAnnotFile=fileAnnotGDS,
#' pathBlock=dirname(fileAnnotGDS),
#' superPop="EUR")
#'
#' gdsAnnot1KG <- openfn.gds(fileAnnotGDS)
#' print(gdsAnnot1KG)
#'
#' closefn.gds(gdsAnnot1KG)
#' }
#'
#' ## Remove temporary file
#' unlink(fileAnnotGDS, force=TRUE)
#' unlink(fileLdBlock, force=TRUE)
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#'
#' @importFrom gdsfmt openfn.gds closefn.gds read.gdsn index.gdsn ls.gdsn
#' @importFrom SNPRelate snpgdsOpen
#' @encoding UTF-8
#' @export
addBlockFromDetFile <- function(fileReferenceGDS, gdsRefAnnotFile, pathBlock,
superPop, blockName="ldBlock",
blockDesc="Not Define", verbose=FALSE) {
if (!(is.character(fileReferenceGDS) && (file.exists(fileReferenceGDS)))) {
stop("The \'fileReferenceGDS\' must be a character string ",
"representing the Reference GDS file. The file must exist.")
}
if(!(is.character(blockName))){
stop("The \'blockName\' must be a character string ",
"representing the name of the block.")
}

if(blockName == "ldBlock"){
blockName <- paste0(blockName, ".", superPop)
}

gdsRefAnnot <- openfn.gds(gdsRefAnnotFile)

if(("block.annot" %in% ls.gdsn(gdsRefAnnot))) {
listAnno <- read.gdsn(index.gdsn(gdsRefAnnot, "block.annot"))
if(length(which(gdsRefAnnot$block.id == blockName)) > 0){
stop("The \'blockName\' already exist in \'gdsRefAnnotFile\'.")
}
}
closefn.gds(gdsRefAnnot)

gdsReference <- snpgdsOpen(filename=fileReferenceGDS)



## The verbose must be a logical
validateLogical(verbose, "verbose")

## Extract the SNP chromosomes and positions
snpChromosome <- read.gdsn(index.gdsn(gdsReference, "snp.chromosome"))
#snpPosition <- read.gdsn(index.gdsn(gdsReference, "snp.position"))
closefn.gds(gdsReference)

listFileBlock <- dir(pathBlock, ".det")
listFileBlock <- listFileBlock[grep(paste0("\\.", superPop, "\\."), listFileBlock)]

listChr <- unique(snpChromosome)

#listChr <- listChr[order(listChr)]
#listChr <- seq_len(22)
listBlock <- list()

for(chr in seq_len(length(listChr))) {
if(verbose) { message("chr", listChr[chr], " ",Sys.time()) }
listChrCur <- listFileBlock[grep(paste0("chr",listChr[chr],"\\."), listFileBlock)]
if(length(listChrCur) == 1){
tmp <- processBlockChr(fileReferenceGDS, file.path(pathBlock, listChrCur))
listBlock[[chr]] <- tmp$block.snp
if(chr > 1) {
vMax <- max(listBlock[[chr-1]])
vMin <- min(listBlock[[chr-1]])
listBlock[[chr]][listBlock[[chr]] > 0] <-
listBlock[[chr]][listBlock[[chr]] > 0] + vMax
if(vMin < 0) {
listBlock[[chr]][listBlock[[chr]] < 0] <-
listBlock[[chr]][listBlock[[chr]] < 0] + vMin
}
}
}else{

listBlock[[chr]] <- rep(-1, length(which(snpChromosome == listChr[chr])))
vMin <- 0
if(chr > 1){
vMin <- min(listBlock[[chr-1]])
}
if(vMin < 0){
listBlock[[chr]] <- listBlock[[chr]] + vMin
}
}

}
listBlock <- do.call(c, listBlock)

gdsRefAnnot <- openfn.gds(gdsRefAnnotFile, readonly = FALSE)

## Save the information into the GDS file
addGDS1KGLDBlock(gdsRefAnnot, listBlock, blockName, blockDesc)
closefn.gds(gdsRefAnnot)
## Success
return(0L)
}

#' @title Append information associated to blocks, as indexes, into the
#' Population Reference SNV Annotation GDS file
Expand Down
7 changes: 6 additions & 1 deletion R/processStudy.R
Original file line number Diff line number Diff line change
Expand Up @@ -2351,7 +2351,9 @@ inferAncestry <- function(profileFile, pathProfileGDS,

genoSource <- arg_match(genoSource)


if(genoSource == "bam"){
stop("The bam is not release yet look to get a \'Devel\' version or contact us")
}
profileName <- gsub("\\.gz$", "", profileBaseName, ignore.case = TRUE)
for(extCur in c( "\\.vcf$", "\\.txt$", "\\.bam", "\\.tsv", "\\.csv")){
profileName <- gsub(extCur, "", profileName, ignore.case = TRUE)
Expand Down Expand Up @@ -2776,6 +2778,9 @@ inferAncestryGeneAware <- function(profileFile, pathProfileGDS,

genoSource <- arg_match(genoSource)

if(genoSource == "bam"){
stop("The bam is not release yet look to get a \'Devel\' version or contact us")
}

profileName <- gsub("\\.gz$", "", profileBaseName, ignore.case = TRUE)
for(extCur in c( "\\.vcf$", "\\.txt$", "\\.bam", "\\.tsv", "\\.csv")){
Expand Down
33 changes: 18 additions & 15 deletions R/tools_internal.R
Original file line number Diff line number Diff line change
Expand Up @@ -458,7 +458,7 @@ readSNVVCF <- function(fileName,
#' \item{\code{chr}}{ a \code{integer} representing a the chromosome from
#' fileBlock.
#' }
#' \item{\code{block.snp}}{ the a \code{array} of integer
#' \item{\code{block.snp}}{ a \code{array} of integer
#' representing the linkage disequilibrium block for
#' each SNV in the in the same order than the variant
#' in Population reference dataset.
Expand Down Expand Up @@ -506,43 +506,46 @@ processBlockChr <- function(fileReferenceGDS, fileBlock) {
listChr <- as.integer(gsub("chr", "", listChr))
listSNVChr <- read.gdsn(index.gdsn(gdsReference, "snp.chromosome"))
listSNVChr <- which(listSNVChr == listChr)
snp.keep <- read.gdsn(index.gdsn(gdsReference, "snp.position"))[listSNVChr]
snpKeep <- read.gdsn(index.gdsn(gdsReference, "snp.position"))[listSNVChr]
closefn.gds(gdsReference)
z <- cbind(c(blockChr$BP1, snp.keep, blockChr$BP2+1),
z <- cbind(c(blockChr$BP1, snpKeep, blockChr$BP2+1),
c(seq_len(nrow(blockChr)),
rep(0, length(snp.keep)), -1*seq_len(nrow(blockChr))))
rep(0, length(snpKeep)), -1*seq_len(nrow(blockChr))))

z <- z[order(z[,1]),]
block.snp <- cumsum(z[,2])[z[,2] == 0]
blockSnp <- cumsum(z[,2])[z[,2] == 0]

curStart <- 0
activeBlock <- 0
blockState <- 0
block.inter <- rep(0, length(which(block.snp == 0)))
blockInter <- rep(0, length(which(blockSnp == 0)))
k <- 1
for(i in seq_len(length(block.snp))){
if(block.snp[i] == 0){
for(i in seq_len(length(blockSnp))){
if(blockSnp[i] == 0){
if(activeBlock == 1){
if(snp.keep[i] - curStart >= 10000) {
if(snpKeep[i] - curStart >= 10000) {
blockState <- blockState - 1

curStart <- snp.keep[i]
curStart <- snpKeep[i]
}
} else{
blockState <- blockState - 1
curStart <- snp.keep[i]
curStart <- snp.keep[i]
curStart <- snpKeep[i]
activeBlock <- 1
}
block.inter[k] <- blockState
if(blockState == 0){
blockState <- -1
}
blockInter[k] <- blockState
k <- k + 1
}else{
activeBlock <- 0
}
}
block.snp[block.snp == 0] <- block.inter

blockSnp[blockSnp == 0] <- blockInter
res <- list(chr=listChr,
block.snp=block.snp)
block.snp=blockSnp)
return(res)
}

Loading
Loading