fix errors in pubchem queries and modify sleep
Jermiah committed Dec 14, 2023
1 parent f06e4c5 commit f2877fe
Showing 7 changed files with 326 additions and 307 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -1,7 +1,7 @@
Package: AnnotationGx
Title: AnnotationGx: A package for building, updating and querying an
annotation database for pharmaco-genomic data.
Version: 0.0.5.9001
Version: 0.0.5.9002
Authors@R: c(
person("Christopher", "Eeles", role = c("aut"),
email = "[email protected]"),
67 changes: 45 additions & 22 deletions R/getChEMBL.R
@@ -131,13 +131,27 @@ getChemblAllMechanisms <- function(url="https://www.ebi.ac.uk",
#'| search | Special type of filter allowing a full text search based on elastic search queries |
#'| only | Select specific properties from the original endpoint and return only the desired properties on each record |
#'
#' @param resource `character(1)` Resource to query
#' @param field `character(1)` Field to query
#' @param filter_type `character(1)` Filter type
#' @param value `character(1)` Value to query
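#' @param format `character(1)` Output format; one of "json", "xml" or "yaml". Defaults to "json".
#' @return `character(1)` The constructed ChEMBL API query URL.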
#'
#'
#' @md
#' @export
constructChemblQuery <- function(resource, field, filter_type, value){
constructChemblQuery <- function(resource, field, filter_type, value, format = "json"){

# possible formats for now are XML, JSON and YAML
checkmate::assert_character(format)
checkmate::assert_character(resource)
checkmate::assert_character(field)
checkmate::assert_character(filter_type)
checkmate::assert_character(value)
checkmate::assert_subset(format, c("json", "xml", "yaml"))

url <- "https://www.ebi.ac.uk/chembl/api/data/"

final <- paste0(url, resource, ".json", "?", field, "__", filter_type, "=", value)
final <- paste0(url, resource, "?", "format=", format, "&", field, "__", filter_type, "=", value)

return (final)
}
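
Below is a minimal usage sketch (not part of the commit) of the updated signature; the ChEMBL ID is borrowed from the commented example in getChemblMechanism further down, and the printed URL is what the paste0() call above produces:

# sketch only: query URL built by the new format-aware signature
constructChemblQuery(resource = "mechanism", field = "molecule_chembl_id", filter_type = "in", value = "CHEMBL1413")
#> "https://www.ebi.ac.uk/chembl/api/data/mechanism?format=json&molecule_chembl_id__in=CHEMBL1413"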
@@ -251,31 +265,40 @@ getChemblMechanism <- function(
filter_type = "in",
returnURL = FALSE){


# constructChemblQuery(resource = "mechanism", field = "molecule_chembl_id", filter_type = "in", value = "CHEMBL1413")
urls <- constructChemblQuery(resource = resources, field = field, filter_type = filter_type, value = chembl.ID)
urls <- URLencode(urls)
# urls <- constructChemblQuery(resource = resources, field = field, filter_type = filter_type, value = chembl.ID)
# urls <- URLencode(urls)

if(returnURL) return(urls)

# [1] "action_type" "binding_site_comment"
# [3] "direct_interaction" "disease_efficacy"
# [5] "max_phase" "mec_id"
# [7] "mechanism_comment" "mechanism_of_action"
# [9] "mechanism_refs" "molecular_mechanism"
# [11] "molecule_chembl_id" "parent_molecule_chembl_id"
# [13] "record_id" "selectivity_comment"
# [15] "site_id" "target_chembl_id"
# [17] "variant_sequence"

responses <- data.table::rbindlist(lapply(urls, function(url){
# if(returnURL) return(urls)

cols <- c("molecule_chembl_id", "action_type",
"mechanism_of_action", "molecular_mechanism",
"mechanism_comment", "parent_molecule_chembl_id", "target_chembl_id")

responses <- lapply(chembl.ID, function(ID){
url <- constructChemblQuery(resource = resources, field = field, filter_type = filter_type, value = ID)
response <- httr::GET(url)
response <- AnnotationGx::parseJSON(response)
response <- parseJSON(response)
mechanisms <- data.table::as.data.table(response$mechanisms)
result <- mechanisms[,.(molecule_chembl_id, action_type, mechanism_of_action, molecular_mechanism, mechanism_comment, parent_molecule_chembl_id, target_chembl_id)]
}))

# if any of the expected cols are missing from the response, return a data.table with those cols set to NA and the query ID filled in
if(!all(cols %in% names(mechanisms))){
mechanisms <- data.table::data.table(molecule_chembl_id = ID)
mechanisms[, (setdiff(cols, "molecule_chembl_id")) := NA]
return(mechanisms)
}
return(mechanisms[,..cols])
})

return(responses)
# replace the _ in the column names with .
responses <- lapply(responses, function(x){
names(x) <- gsub("_", ".", names(x))
return(x)
})

return(data.table::rbindlist(responses, fill = TRUE))

}
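
A hedged usage sketch of the reworked function, assuming the default resource/field arguments (not all visible in this hunk); CHEMBL1413 comes from the commented example above, and CHEMBL25 is just an illustrative second ID:

# sketch only: one request per ChEMBL ID, results row-bound with fill = TRUE
mech <- getChemblMechanism(chembl.ID = c("CHEMBL1413", "CHEMBL25"))
mech$mechanism.of.action # underscores in the selected cols become "." in the result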


201 changes: 93 additions & 108 deletions R/getPubChem-helpers.R
@@ -8,52 +8,37 @@
if (throttleMessage == TRUE){
message(message)
}
# Request Count status: Green (5%), Request Time status: Green (0%), Service status: Green (30%)


matches <- regmatches(message, gregexpr("\\((.*?)%\\)", message)) # Extracts text within parentheses
percentages <- gsub("\\(|%|\\)", "", unlist(matches[1:3]))
# print(percentages)
percentage <- max(as.numeric(percentages))
if(as.integer(percentage) > 15 && as.integer(percentage) < 30){
Sys.sleep(15)
}else if (as.integer(percentage) > 30 && as.integer(percentage) < 50){
Sys.sleep(20)
}else if (as.integer(percentage) > 50 && as.integer(percentage) < 75) {
print(paste0("Throttling at ", percentage, "%. Sleeping for 30 seconds."))
Sys.sleep(30)
}else if (as.integer(percentage) > 75) {
print(paste0("Throttling at ", percentage, "%. Sleeping for 60 seconds."))
Sys.sleep(60)
}else{
Sys.sleep(max(as.numeric(percentages)))
}
percentages <- as.numeric(gsub("\\(|%|\\)", "", unlist(matches[1:3])))
percentage <- max(percentages[1:3])
# percentage <- ifelse(percentages[3] > 75, percentages[3], max(percentages[1:2]))
sleep_time <- ifelse(percentage > 50, 30, ifelse(percentage > 30, 20, 15))

return(as.integer(percentage) > 15)
if (percentage > 15) {
# message(paste0("Sleeping for ", sleep_time, " seconds"))
Sys.sleep(sleep_time)
} else {
Sys.sleep(percentage)
}

return(percentage > 15)
}
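
For reference, a standalone sketch (not part of the commit) tracing the new sleep logic on the example header from the comment above:

hdr <- "Request Count status: Green (5%), Request Time status: Green (0%), Service status: Green (30%)"
matches <- regmatches(hdr, gregexpr("\\((.*?)%\\)", hdr))
percentages <- as.numeric(gsub("\\(|%|\\)", "", unlist(matches)))
max(percentages) # 30
ifelse(max(percentages) > 50, 30, ifelse(max(percentages) > 30, 20, 15)) # 15, since 30 is not strictly greater than 30
# with a maximum of 30% (> 15), the function sleeps for sleep_time (15 s) and returns TRUE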

## TODO:: Retrieve PubChem server status to dynamically set query spacing
##>based on server load
## TODO:: Make the query aware of server load status in the response header
#' @importFrom crayon strip_style
.queryPubChemSleep <- function(x, ..., query_only=FALSE) {
proxy <- list(...)$proxy
t1 <- Sys.time()
queryRes <- tryCatch({
queryRequestPubChem(x, ..., query_only=query_only)
},
error=function(e) {
cat('\r')
print(e)
list(Error=list(
Code='.queryPubChemSleep.ERROR',
Message='See Details for error msg',
Details=paste0(strip_style(e), collapse=' ')
))
})
t2 <- Sys.time()
queryTime <- t2 - t1
if (!isTRUE(proxy) && queryTime < 0.31) Sys.sleep(0.31 - queryTime)
return(queryRes)
#' Simple check of the PubChem throttling status via a default request
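#' @param sleep `logical(1)` If `TRUE`, also sleep according to the reported throttling level.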
#' @return Messages the `x-throttling-control` header; called for its side effect.
#' @export
getPubChemStatus <- function(sleep = FALSE){
response <- httr::GET("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/Aspirin/cids/JSON")
msg <- httr::headers(response)$`x-throttling-control`
matches <- regmatches(msg, gregexpr("\\((.*?)%\\)", msg)) # Extracts text within parentheses
if(sleep) .checkThrottlingStatus(response)
message(msg)
}
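
A quick usage sketch of the new helper; the exact header text depends on PubChem's load at call time:

getPubChemStatus() # messages e.g. "Request Count status: Green (0%), Request Time status: Green (0%), Service status: Green (20%)"
getPubChemStatus(sleep = TRUE) # additionally calls .checkThrottlingStatus() to sleep when throttling is elevated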


# -----------------------------
# getPubChemAnnotations Helpers

@@ -96,75 +81,75 @@
return(DT)
}

#' @importFrom data.table data.table as.data.table merge.data.table last rbindlist
.parseNSCannotations <- function(DT) {
DT[, NSC := unlist(lapply(Data, `[[`, i=4))]
annotationDT <- DT[,
.(CID=unlist(LinkedRecords.CID), SID=unlist(LinkedRecords.SID)),
by=.(SourceName, SourceID, NSC)]
return(annotationDT)
}
# #' @importFrom data.table data.table as.data.table merge.data.table last rbindlist
# .parseNSCannotations <- function(DT) {
# DT[, NSC := unlist(lapply(Data, `[[`, i=4))]
# annotationDT <- DT[,
# .(CID=unlist(LinkedRecords.CID), SID=unlist(LinkedRecords.SID)),
# by=.(SourceName, SourceID, NSC)]
# return(annotationDT)
# }

#' @importFrom data.table data.table as.data.table merge.data.table last rbindlist
.parseCTDannotations <- function(DT) {
annotationDT <- DT[, .(CID=unlist(LinkedRecords)),
by=.(SourceName, SourceID, URL)]
annotationDT[, CTD := gsub('::.*$', '', SourceID)]
return(annotationDT)
}
# #' @importFrom data.table data.table as.data.table merge.data.table last rbindlist
# .parseCTDannotations <- function(DT) {
# annotationDT <- DT[, .(CID=unlist(LinkedRecords)),
# by=.(SourceName, SourceID, URL)]
# annotationDT[, CTD := gsub('::.*$', '', SourceID)]
# return(annotationDT)
# }

#' @importFrom data.table data.table as.data.table merge.data.table last rbindlist setnames
.parseCASannotations <- function(list) {
# Make sure CIDs all go in the same column
CAS_list <- lapply(list, setnames, old='LinkedRecords.CID', new='LinkedRecords',
skip_absent=TRUE)
DT <- rbindlist(CAS_list, fill=TRUE, use.names=TRUE)
DT[, CAS := lapply(Data, function(x) unlist(x[[2]]))]
CAS_DT <- DT[, .(CAS=unlist(CAS)), by=.(SourceName, SourceID, Name)]
ID_DT <- DT[,
.(CID=unlist(lapply(LinkedRecords, function(x) if(is.null(x)) NA_integer_ else x))),
by=.(SourceName, SourceID, Name, URL)]
annotationDT <- merge.data.table(
CAS_DT, ID_DT,
by=c('SourceName', 'SourceID', 'Name'),
all.x=TRUE,
allow.cartesian=TRUE)
annotationDT <- unique(annotationDT)
return(annotationDT)
}
# #' @importFrom data.table data.table as.data.table merge.data.table last rbindlist setnames
# .parseCASannotations <- function(list) {
# # Make sure CIDs all go in the same column
# CAS_list <- lapply(list, setnames, old='LinkedRecords.CID', new='LinkedRecords',
# skip_absent=TRUE)
# DT <- rbindlist(CAS_list, fill=TRUE, use.names=TRUE)
# DT[, CAS := lapply(Data, function(x) unlist(x[[2]]))]
# CAS_DT <- DT[, .(CAS=unlist(CAS)), by=.(SourceName, SourceID, Name)]
# ID_DT <- DT[,
# .(CID=unlist(lapply(LinkedRecords, function(x) if(is.null(x)) NA_integer_ else x))),
# by=.(SourceName, SourceID, Name, URL)]
# annotationDT <- merge.data.table(
# CAS_DT, ID_DT,
# by=c('SourceName', 'SourceID', 'Name'),
# all.x=TRUE,
# allow.cartesian=TRUE)
# annotationDT <- unique(annotationDT)
# return(annotationDT)
# }

#' @importFrom data.table data.table as.data.table merge.data.table last rbindlist
.parseSynonymsAndIdentifiers <- function(DT) {
dataList <- lapply(DT$Data, as.data.table)
names(dataList) <- DT$SourceID
dataDT <- rbindlist(dataList, fill=TRUE, use.names=TRUE,
idcol='SourceID')
DT[, Data := NULL]
dataDT[,
Synonyms := paste0(unlist(Value.StringWithMarkup[[1]]), collapse='|'),
by=SourceID]
dataDT[, Synonyms := paste(Synonyms, '|', Name), by=SourceID]
dataDT[, Value.StringWithMarkup := NULL]
annotationDT <- merge.data.table(dataDT, DT, by='SourceID', allow.cartesian=TRUE)
setnames(annotationDT,
old=c('TOCHeading.type', 'TOCHeading..TOCHeading', 'LinkedRecords'),
new=c('Type', 'Heading', 'ID')
)
annotationDT <- unique(annotationDT)
return(annotationDT)
}
# #' @importFrom data.table data.table as.data.table merge.data.table last rbindlist
# .parseSynonymsAndIdentifiers <- function(DT) {
# dataList <- lapply(DT$Data, as.data.table)
# names(dataList) <- DT$SourceID
# dataDT <- rbindlist(dataList, fill=TRUE, use.names=TRUE,
# idcol='SourceID')
# DT[, Data := NULL]
# dataDT[,
# Synonyms := paste0(unlist(Value.StringWithMarkup[[1]]), collapse='|'),
# by=SourceID]
# dataDT[, Synonyms := paste(Synonyms, '|', Name), by=SourceID]
# dataDT[, Value.StringWithMarkup := NULL]
# annotationDT <- merge.data.table(dataDT, DT, by='SourceID', allow.cartesian=TRUE)
# setnames(annotationDT,
# old=c('TOCHeading.type', 'TOCHeading..TOCHeading', 'LinkedRecords'),
# new=c('Type', 'Heading', 'ID')
# )
# annotationDT <- unique(annotationDT)
# return(annotationDT)
# }

#' @importFrom data.table data.table as.data.table merge.data.table last rbindlist
.parseNamesAndSynonyms <- function(DT) {
DT[, Synonyms := lapply(Data, function(x) x[2, ]$Value[[1]][[1]])]
# Remove the weird annotation from the end of the synonym
DT[, Synonyms := lapply(Synonyms, FUN=gsub, pattern=' - .*$', replacement='')]
DT[, Synonyms := unlist(lapply(Synonyms, FUN=paste0, collapse='|'))]
# fix NULL list items
DT[, CID := lapply(LinkedRecords.CID, function(x) if(is.null(x)) NA_integer_ else x)]
DT[, SID := lapply(LinkedRecords.SID, function(x) if(is.null(x)) NA_integer_ else x)]
annotationDT <- DT[, .(CID=unlist(CID), SID=unlist(SID)),
by=.(SourceName, SourceID, Name, URL, Synonyms)]
annotationDT <- unique(annotationDT)
return(annotationDT)
}
# #' @importFrom data.table data.table as.data.table merge.data.table last rbindlist
# .parseNamesAndSynonyms <- function(DT) {
# DT[, Synonyms := lapply(Data, function(x) x[2, ]$Value[[1]][[1]])]
# # Remove the weird annotation from the end of the synonym
# DT[, Synonyms := lapply(Synonyms, FUN=gsub, pattern=' - .*$', replacement='')]
# DT[, Synonyms := unlist(lapply(Synonyms, FUN=paste0, collapse='|'))]
# # fix NULL list items
# DT[, CID := lapply(LinkedRecords.CID, function(x) if(is.null(x)) NA_integer_ else x)]
# DT[, SID := lapply(LinkedRecords.SID, function(x) if(is.null(x)) NA_integer_ else x)]
# annotationDT <- DT[, .(CID=unlist(CID), SID=unlist(SID)),
# by=.(SourceName, SourceID, Name, URL, Synonyms)]
# annotationDT <- unique(annotationDT)
# return(annotationDT)
# }
