Skip to content

Commit

Permalink
Merge branch 'main' of github.com:mseri/doi2bib into main
Browse files Browse the repository at this point in the history
  • Loading branch information
mseri committed Apr 2, 2021
2 parents 13f90a6 + 2dadff2 commit 31191ed
Show file tree
Hide file tree
Showing 7 changed files with 254 additions and 185 deletions.
7 changes: 4 additions & 3 deletions .github/workflows/workflow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,8 @@ jobs:
- name: Install dependencies
run: |
opam depext tls
opam install tls
opam depext tls decompress
opam install tls decompress
opam install astring cmdliner cohttp-lwt-unix ezxmlm lwt_ppx
- name: Build project
Expand Down Expand Up @@ -176,7 +176,8 @@ jobs:
"@opam/cohttp-lwt": "2.5.5",
"@opam/cohttp-lwt-unix": "2.5.5",
"@opam/conduit-lwt": "2.3.0",
"@opam/conduit": "2.3.0"
"@opam/conduit": "2.3.0",
"@opam/decompress": "1.3.0"
}
}' > esy.json
Expand Down
184 changes: 3 additions & 181 deletions bin/doi2bib.ml
Original file line number Diff line number Diff line change
@@ -1,182 +1,4 @@
type id =
| DOI of string
| ArXiv of string
| PubMed of string

exception Parse_error of string
exception Entry_not_found
exception PubMed_DOI_not_found
exception Bad_gateway

let string_of_id = function
| DOI s -> "DOI ID '" ^ s ^ "'"
| ArXiv s -> "arXiv ID '" ^ s ^ "'"
| PubMed s -> "PubMed ID '" ^ s ^ "'"


let parse_id id =
let open Astring in
let is_prefix affix s = String.is_prefix ~affix (String.Ascii.lowercase s) in
let sub start s = String.sub ~start s |> String.Sub.to_string |> String.trim in
let contains c s = String.exists (fun c' -> c' = c) s in
match id with
| doi when is_prefix "doi:" doi -> DOI (sub 4 doi)
| arxiv when is_prefix "arxiv:" arxiv -> ArXiv (sub 6 arxiv)
| pubmed when is_prefix "pmc" pubmed -> PubMed pubmed
| doi when contains '/' doi -> DOI (String.trim doi)
| arxiv when contains '.' arxiv -> ArXiv (String.trim arxiv)
| _ -> raise (Parse_error id)


let parse_atom id atom =
let bibentry () =
let open Ezxmlm in
let entry = atom |> member "feed" |> member "entry" in
let title = entry |> member "title" |> to_string in
let authors =
entry
|> members "author"
|> List.map (fun n -> member "name" n |> to_string)
|> String.concat " and "
in
let year =
try entry |> member "updated" |> to_string |> fun s -> String.sub s 0 4 with
| Tag_not_found _ ->
entry |> member "published" |> to_string |> fun s -> String.sub s 0 4
in
let cat =
entry |> member_with_attr "primary_category" |> fun (a, _) -> get_attr "term" a
in
let bibid =
let open Astring in
(match String.cuts ~empty:false ~sep:" " authors with
| _ :: s :: _ -> s
| s :: _ -> s
| [] -> "")
^ year
^ (String.cut ~sep:" " title |> Option.map fst |> Option.value ~default:"")
in
Printf.sprintf
{|@misc{%s,
title={%s},
author={%s},
year={%s},
eprint={%s},
archivePrefix={arXiv},
primaryClass={%s}
}|}
bibid
title
authors
year
id
cat
in
try bibentry () with
| Ezxmlm.Tag_not_found t ->
raise
@@ Failure ("Unexpected error parsing arXiv's metadata, tag '" ^ t ^ "' not present.")


let rec get ?headers ?fallback uri =
let open Lwt.Syntax in
let* resp, body = Cohttp_lwt_unix.Client.get ?headers uri in
let status = Cohttp_lwt.Response.status resp in
if status <> `OK then Lwt.ignore_result (Cohttp_lwt.Body.drain_body body);
match status with
| `OK ->
let* body = Cohttp_lwt.Body.to_string body in
Lwt.return body
| `Found ->
let uri' = Cohttp_lwt.(resp |> Response.headers |> Cohttp.Header.get_location) in
(match uri', fallback with
| Some uri, _ -> get ?headers ?fallback uri
| None, Some uri -> get ?headers uri
| None, None ->
Lwt.fail_with ("Malformed redirection trying to access '" ^ Uri.to_string uri ^ "'."))
| d when (d = `Not_found || d = `Gateway_timeout) && Option.is_some fallback ->
(match fallback with
| Some uri -> get ?headers uri
| None -> assert false)
| `Bad_request | `Not_found -> Lwt.fail Entry_not_found
| `Bad_gateway -> Lwt.fail Bad_gateway
| _ ->
Lwt.fail_with
("Response error: '"
^ Cohttp.Code.string_of_status status
^ "' trying to access '"
^ Uri.to_string uri
^ "'.")


let bib_of_doi doi =
let uri = "https://doi.org/" ^ String.trim doi |> Uri.of_string in
let headers =
Cohttp.Header.of_list [ "Accept", "application/x-bibtex"; "charset", "utf-8" ]
in
let fallback =
Uri.of_string
("https://citation.crosscite.org/format?doi=" ^ doi ^ "&style=bibtex&lang=en-US")
in
get ~headers ~fallback uri


let bib_of_arxiv arxiv =
let uri =
"https://export.arxiv.org/api/query?id_list=" ^ String.trim arxiv |> Uri.of_string
in
let open Lwt.Syntax in
let* body = get uri in
let _, atom_blob = Ezxmlm.from_string body in
try
let doi =
Ezxmlm.(atom_blob |> member "feed" |> member "entry" |> member "doi" |> to_string)
in
bib_of_doi doi
with
| Ezxmlm.Tag_not_found _ -> parse_atom arxiv atom_blob |> Lwt.return


let bib_of_pubmed pubmed =
let pubmed = String.trim pubmed in
let uri =
"https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?ids=" ^ pubmed |> Uri.of_string
in
let open Lwt.Syntax in
let* body = get uri in
let _, xml_blob = Ezxmlm.from_string body in
try
let doi = ref "" in
let _ =
Ezxmlm.filter_map
~tag:"record"
~f:(fun attrs node ->
doi := Ezxmlm.get_attr "doi" attrs;
node)
xml_blob
in
bib_of_doi !doi
with
| Not_found ->
let exn =
match
Ezxmlm.(
member "pmcids" xml_blob
|> member_with_attr "record"
|> fun (a, _) -> mem_attr "status" "error" a)
with
| true -> Entry_not_found
| false -> PubMed_DOI_not_found
| exception Ezxmlm.(Tag_not_found _) -> Entry_not_found
in
Lwt.fail exn


let get_bib_entry = function
| DOI doi -> bib_of_doi doi
| ArXiv arxiv -> bib_of_arxiv arxiv
| PubMed pubmed -> bib_of_pubmed pubmed

open Lib

let err s = `Error (false, s)

Expand All @@ -188,14 +10,14 @@ let doi2bib id =
| bibtex -> `Ok (Printf.printf "%s" bibtex)
| exception PubMed_DOI_not_found ->
err @@ Printf.sprintf "Error: unable to find a DOI entry for %s.\n" id
| exception Entry_not_found ->
| exception Http.Entry_not_found ->
err
@@ Printf.sprintf
"Error: unable to find any bibtex entry for %s.\n\
Check the ID before trying again.\n"
id
| exception Failure s -> err @@ Printf.sprintf "Unexpected error. %s\n" s
| exception Bad_gateway ->
| exception Http.Bad_gateway ->
err
@@ Printf.sprintf
"Remote server error: wait some time and try again.\n\
Expand Down
2 changes: 1 addition & 1 deletion bin/dune
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
(executable
(name doi2bib)
(public_name doi2bib)
(libraries astring cmdliner cohttp-lwt-unix ezxmlm)
(libraries astring cmdliner cohttp-lwt-unix decompress.gz ezxmlm)
(preprocess future_syntax))
61 changes: 61 additions & 0 deletions bin/ezgz.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
(* Mostly from deflate.gz documentation :) *)

let uncompress_string str =
let i = De.bigstring_create De.io_buffer_size in
let o = De.bigstring_create De.io_buffer_size in
let r = Buffer.create 0x1000 in
let p = ref 0 in
let refill buf =
let len = min (String.length str - !p) De.io_buffer_size in
Bigstringaf.blit_from_string str ~src_off:!p buf ~dst_off:0 ~len;
p := !p + len;
len
in
let flush buf len =
let str = Bigstringaf.substring buf ~off:0 ~len in
Buffer.add_string r str
in
Gz.Higher.uncompress ~refill ~flush i o
|> Result.map (fun _metadata -> Buffer.contents r)


let time () = Int32.of_float (Unix.gettimeofday ())

let compress_string ?(level = 4) str =
let i = De.bigstring_create De.io_buffer_size in
let o = De.bigstring_create De.io_buffer_size in
let w = De.Lz77.make_window ~bits:15 in
let q = De.Queue.create 0x1000 in
let r = Buffer.create 0x1000 in
let p = ref 0 in
let cfg = Gz.Higher.configuration Gz.Unix time in
let refill buf =
let len = min (String.length str - !p) De.io_buffer_size in
Bigstringaf.blit_from_string str ~src_off:!p buf ~dst_off:0 ~len;
p := !p + len;
len
in
let flush buf len =
let str = Bigstringaf.substring buf ~off:0 ~len in
Buffer.add_string r str
in
Gz.Higher.compress ~level ~w ~q ~refill ~flush () cfg i o;
Buffer.contents r


exception GzipError of string

let extract is_gzipped body =
if is_gzipped
then (
match uncompress_string body with
| Ok content -> content
| Error (`Msg error) -> raise (GzipError error))
else body


let gzip_h =
let gzip_h = Cohttp.Header.of_list [ "accept-encoding", "gzip" ] in
function
| None -> Some gzip_h
| Some h -> Some (Cohttp.Header.add_unless_exists h "accept-encoding" "gzip")
39 changes: 39 additions & 0 deletions bin/http.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
exception Entry_not_found
exception Bad_gateway

let rec get ?headers ?fallback uri =
let headers = Ezgz.gzip_h headers in
let open Lwt.Syntax in
let* resp, body = Cohttp_lwt_unix.Client.get ?headers uri in
let status = Cohttp_lwt.Response.status resp in
if status <> `OK then Lwt.ignore_result (Cohttp_lwt.Body.drain_body body);
match status with
| `OK ->
let* body = Cohttp_lwt.Body.to_string body in
let is_gzipped : bool =
Cohttp_lwt.Response.headers resp
|> fun resp -> Cohttp.Header.get resp "content-encoding" = Some "gzip"
in
let open Ezgz in
(try Lwt.return @@ extract is_gzipped body with
| GzipError error -> Lwt.fail @@ Failure error)
| `Found ->
let uri' = Cohttp_lwt.(resp |> Response.headers |> Cohttp.Header.get_location) in
(match uri', fallback with
| Some uri, _ -> get ?headers ?fallback uri
| None, Some uri -> get ?headers uri
| None, None ->
Lwt.fail_with ("Malformed redirection trying to access '" ^ Uri.to_string uri ^ "'."))
| d when (d = `Not_found || d = `Gateway_timeout) && Option.is_some fallback ->
(match fallback with
| Some uri -> get ?headers uri
| None -> assert false)
| `Bad_request | `Not_found -> Lwt.fail Entry_not_found
| `Bad_gateway -> Lwt.fail Bad_gateway
| _ ->
Lwt.fail_with
("Response error: '"
^ Cohttp.Code.string_of_status status
^ "' trying to access '"
^ Uri.to_string uri
^ "'.")
Loading

0 comments on commit 31191ed

Please sign in to comment.