Skip to content

Commit

Permalink
cuz -> clz, doi -> crossref + ocamlformat
Browse files Browse the repository at this point in the history
Signed-off-by: Marcello Seri <[email protected]>
  • Loading branch information
mseri committed Dec 17, 2021
1 parent 8fa80f8 commit 38eea26
Show file tree
Hide file tree
Showing 28 changed files with 173 additions and 503 deletions.
3 changes: 0 additions & 3 deletions .ocamlformat
Original file line number Diff line number Diff line change
@@ -1,3 +0,0 @@
profile=janestreet
wrap-comments=false
let-binding-spacing=sparse
9 changes: 9 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
# 0.5.2 (2021-12-17)

- Move from cuz to the published clz
- Move from dx.doi.org to crossref rest api service,
the latter gives better and more consistent results and
does not seem to require a fallback service any longer
- Update arxiv generated bibtex accordingly
- Update ocamlformat

# 0.5.1 (2021-07-01)

- Fix for transitive dependency in cuz
Expand Down
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@ Examples of use (the bibtex entry is printed on standard output):
Each release comes with attached binaries for windows, mac and linux.
If you want to build the package yourself, the most immediate way is by running

$ opam pin add doi2bib https://github.com/mseri/doi2bib.git
$ opam install doi2bib

To run the tests, clone this repository and from of the root of the project run
Expand Down
67 changes: 36 additions & 31 deletions bin/doi2bib.ml
Original file line number Diff line number Diff line change
Expand Up @@ -5,51 +5,56 @@ let err s = `Error (false, s)
let doi2bib id =
match id with
| None -> `Help (`Pager, None)
| Some id ->
(match Lwt_main.run (Http.get_bib_entry @@ Parser.parse_id id) with
| bibtex -> `Ok (Printf.printf "%s" bibtex)
| exception Http.PubMed_DOI_not_found ->
err @@ Printf.sprintf "Error: unable to find a DOI entry for %s.\n" id
| exception Http.Entry_not_found ->
err
@@ Printf.sprintf
"Error: unable to find any bibtex entry for %s.\n\
Check the ID before trying again.\n"
id
| exception Failure s -> err @@ Printf.sprintf "Unexpected error. %s\n" s
| exception Http.Bad_gateway ->
err
@@ Printf.sprintf
"Remote server error: wait some time and try again.\n\
This error tends to happen when the remote servers are busy."
| exception Parser.Parse_error id ->
err
@@ Printf.sprintf
"Error: unable to parse ID: '%s'.\n\
You can force me to consider it by prepending 'doi:', 'arxiv:' or 'PMC' as \
appropriate."
id)

| Some id -> (
match Lwt_main.run (Http.get_bib_entry @@ Parser.parse_id id) with
| bibtex -> `Ok (Printf.printf "%s" bibtex)
| exception Http.PubMed_DOI_not_found ->
err @@ Printf.sprintf "Error: unable to find a DOI entry for %s.\n" id
| exception Http.Entry_not_found ->
err
@@ Printf.sprintf
"Error: unable to find any bibtex entry for %s.\n\
Check the ID before trying again.\n"
id
| exception Failure s -> err @@ Printf.sprintf "Unexpected error. %s\n" s
| exception Http.Bad_gateway ->
err
@@ Printf.sprintf
"Remote server error: wait some time and try again.\n\
This error tends to happen when the remote servers are busy."
| exception Parser.Parse_error id ->
err
@@ Printf.sprintf
"Error: unable to parse ID: '%s'.\n\
You can force me to consider it by prepending 'doi:', 'arxiv:' \
or 'PMC' as appropriate."
id)

let () =
let open Cmdliner in
let id =
let doc =
"A DOI, an arXiv ID or a PubMed ID. The tool tries to automatically infer what \
kind of ID you are using. You can force the cli to lookup a DOI by using the form \
'doi:ID' or an arXiv ID by using the form 'arXiv:ID'.\n\
"A DOI, an arXiv ID or a PubMed ID. The tool tries to automatically \
infer what kind of ID you are using. You can force the cli to lookup a \
DOI by using the form 'doi:ID' or an arXiv ID by using the form \
'arXiv:ID'.\n\
PubMed IDs always start with 'PMC'."
in
Arg.(value & pos 0 (some string) None & info ~docv:"ID" ~doc [])
in
let doi2bib_t = Term.(ret (const doi2bib $ id)) in
let info =
let doc =
"A little CLI tool to get the bibtex entry for a given DOI, arXiv or PubMed ID."
"A little CLI tool to get the bibtex entry for a given DOI, arXiv or \
PubMed ID."
in
let man =
[ `S Manpage.s_bugs; `P "Report bugs to https://github.com/mseri/doi2bib/issues" ]
[
`S Manpage.s_bugs;
`P "Report bugs to https://github.com/mseri/doi2bib/issues";
]
in
Term.info "doi2bib" ~version:"%%VERSION%%" ~doc ~exits:Term.default_exits ~man
Term.info "doi2bib" ~version:"%%VERSION%%" ~doc ~exits:Term.default_exits
~man
in
Term.exit @@ Term.eval (doi2bib_t, info)
5 changes: 1 addition & 4 deletions doi2bib.opam
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,14 @@ depends: [
"astring" {>= "0.8.0"}
"cohttp-lwt-unix" {>= "2.5.0"}
"cmdliner" {>= "1.0.0"}
"decompress" {>= "1.4.0"}
"clz" {>= "0.1.0"}
"ezxmlm" {>= "1.1.0"}
"lwt" {>= "5.3.0"}
"bigstringaf" {>= "0.2.0"}
"tls" {>= "0.12.0"}
"re" {>= "1.0.0"}
"odoc" {with-doc}
]
conflicts: [
"result" {< "1.5"} # uses Result.map but result can be pulled via lwt and takes over the Result module
]
build: [
["dune" "subst"] {dev}
[
Expand Down
1 change: 0 additions & 1 deletion dune

This file was deleted.

4 changes: 2 additions & 2 deletions dune-project
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@
(>= 2.5.0))
(cmdliner
(>= 1.0.0))
(decompress
(>= 1.4.0))
(clz
(>= 0.1.0))
(ezxmlm
(>= 1.1.0))
(lwt
Expand Down
2 changes: 1 addition & 1 deletion lib/dune
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
(library
(name doi2bib)
(libraries astring cohttp-lwt-unix cuz.cohttp ezxmlm lwt re unix)
(libraries astring cohttp-lwt-unix clz.cohttp ezxmlm lwt re unix)
(preprocess future_syntax))
97 changes: 43 additions & 54 deletions lib/http.ml
Original file line number Diff line number Diff line change
Expand Up @@ -5,61 +5,52 @@ exception Bad_gateway
exception PubMed_DOI_not_found

let rec get ?proxy ?headers ?fallback uri =
let headers = Cuz_cohttp.accept_gzde headers in
let headers = Clz_cohttp.update_header headers in
let uri = Option.value ~default:"" proxy ^ uri |> Uri.of_string in
let open Lwt.Syntax in
let* resp, body = Cohttp_lwt_unix.Client.get ~headers uri in
let status = Cohttp_lwt.Response.status resp in
let* () = if status <> `OK then Cohttp_lwt.Body.drain_body body else Lwt.return_unit in
let* () =
if status <> `OK then Cohttp_lwt.Body.drain_body body else Lwt.return_unit
in
match status with
| `OK -> Cuz_cohttp.decompress (resp, body)
| `Found ->
let uri' = Cohttp_lwt.(resp |> Response.headers |> Cohttp.Header.get_location) in
(match uri', fallback with
| Some uri, _ -> get ?proxy ~headers ?fallback (Uri.to_string uri)
| None, Some uri -> get ?proxy ~headers uri
| None, None ->
Lwt.fail_with ("Malformed redirection trying to access '" ^ Uri.to_string uri ^ "'."))
| d when (d = `Not_found || d = `Gateway_timeout) && Option.is_some fallback ->
(match fallback with
| Some uri -> get ?proxy ~headers uri
| None -> assert false)
| `OK -> Clz_cohttp.decompress (resp, body)
| `Found -> (
let uri' =
Cohttp_lwt.(resp |> Response.headers |> Cohttp.Header.get_location)
in
match (uri', fallback) with
| Some uri, _ -> get ?proxy ~headers ?fallback (Uri.to_string uri)
| None, Some uri -> get ?proxy ~headers uri
| None, None ->
Lwt.fail_with
("Malformed redirection trying to access '" ^ Uri.to_string uri
^ "'."))
| d when (d = `Not_found || d = `Gateway_timeout) && Option.is_some fallback
-> (
match fallback with
| Some uri -> get ?proxy ~headers uri
| None -> assert false)
| `Bad_request | `Not_found -> Lwt.fail Entry_not_found
| `Bad_gateway -> Lwt.fail Bad_gateway
| _ ->
Lwt.fail_with
("Response error: '"
^ Cohttp.Code.string_of_status status
^ "' trying to access '"
^ Uri.to_string uri
^ "'.")


let cleanup =
let re_whsp = Re.(compile @@ seq [ bol; rep1 space ]) in
let re_endbr = Re.(compile @@ str "},") in
let re_title = Re.(compile @@ str ", title=") in
let re_last = Re.(compile @@ seq [ char '}'; rep space; char '}' ]) in
fun body ->
let body = Re.replace_string ~by:"" re_whsp body in
let body = Re.replace_string ~by:"},\n " re_endbr body in
let body = Re.replace_string ~by:",\n title=" re_title body in
Re.replace_string ~by:"}\n}" re_last body

Lwt.fail_with
("Response error: '"
^ Cohttp.Code.string_of_status status
^ "' trying to access '" ^ Uri.to_string uri ^ "'.")

let bib_of_doi ?proxy doi =
let uri = "https://dx.doi.org/" ^ String.trim doi in
let uri =
"https://api.crossref.org/works/" ^ String.trim doi
^ "/transform/application/x-bibtex"
in
let headers =
Cohttp.Header.of_list
[ "Accept", "text/bibliography; style=bibtex"; "charset", "utf-8" ]
in
let fallback =
"https://citation.crosscite.org/format?doi=" ^ doi ^ "&style=bibtex&lang=en-US"
[ ("Accept", "text/bibliography; style=bibtex"); ("charset", "utf-8") ]
in
let open Lwt.Syntax in
let* body = get ?proxy ~headers ~fallback uri in
Lwt.return (cleanup body)

let* body = get ?proxy ~headers uri in
Lwt.return body

let bib_of_arxiv ?proxy arxiv =
let uri = "https://export.arxiv.org/api/query?id_list=" ^ String.trim arxiv in
Expand All @@ -68,49 +59,47 @@ let bib_of_arxiv ?proxy arxiv =
let _, atom_blob = Ezxmlm.from_string body in
try
let doi =
Ezxmlm.(atom_blob |> member "feed" |> member "entry" |> member "doi" |> to_string)
Ezxmlm.(
atom_blob |> member "feed" |> member "entry" |> member "doi"
|> to_string)
in
bib_of_doi ?proxy doi
with
| Ezxmlm.Tag_not_found _ ->
with Ezxmlm.Tag_not_found _ ->
Lwt.catch
(fun () -> get ("https://arxiv.org/bibtex/" ^ String.trim arxiv))
(fun _e -> parse_atom arxiv atom_blob |> Lwt.return)


let bib_of_pubmed ?proxy pubmed =
let pubmed = String.trim pubmed in
let uri = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?ids=" ^ pubmed in
let uri =
"https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?ids=" ^ pubmed
in
let open Lwt.Syntax in
let* body = get ?proxy uri in
let _, xml_blob = Ezxmlm.from_string body in
try
let doi = ref "" in
let _ =
Ezxmlm.filter_map
~tag:"record"
Ezxmlm.filter_map ~tag:"record"
~f:(fun attrs node ->
doi := Ezxmlm.get_attr "doi" attrs;
node)
xml_blob
in
bib_of_doi ?proxy !doi
with
| Not_found ->
with Not_found ->
let exn =
match
Ezxmlm.(
member "pmcids" xml_blob
|> member_with_attr "record"
|> fun (a, _) -> mem_attr "status" "error" a)
member "pmcids" xml_blob |> member_with_attr "record" |> fun (a, _) ->
mem_attr "status" "error" a)
with
| true -> Entry_not_found
| false -> PubMed_DOI_not_found
| exception Ezxmlm.(Tag_not_found _) -> Entry_not_found
in
Lwt.fail exn


let get_bib_entry ?proxy = function
| DOI doi -> bib_of_doi ?proxy doi
| ArXiv arxiv -> bib_of_arxiv ?proxy arxiv
Expand Down
48 changes: 21 additions & 27 deletions lib/parser.ml
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
type id =
| DOI of string
| ArXiv of string
| PubMed of string
type id = DOI of string | ArXiv of string | PubMed of string

exception Parse_error of string

Expand All @@ -10,11 +7,12 @@ let string_of_id = function
| ArXiv s -> "arXiv ID '" ^ s ^ "'"
| PubMed s -> "PubMed ID '" ^ s ^ "'"


let parse_id id =
let open Astring in
let is_prefix affix s = String.is_prefix ~affix (String.Ascii.lowercase s) in
let sub start s = String.sub ~start s |> String.Sub.to_string |> String.trim in
let sub start s =
String.sub ~start s |> String.Sub.to_string |> String.trim
in
let contains c s = String.exists (fun c' -> c' = c) s in
match id with
| doi when is_prefix "doi:" doi -> DOI (sub 4 doi)
Expand All @@ -24,25 +22,24 @@ let parse_id id =
| arxiv when contains '.' arxiv -> ArXiv (String.trim arxiv)
| _ -> raise (Parse_error id)


let parse_atom id atom =
let bibentry () =
let open Ezxmlm in
let entry = atom |> member "feed" |> member "entry" in
let title = entry |> member "title" |> to_string in
let authors =
entry
|> members "author"
entry |> members "author"
|> List.map (fun n -> member "name" n |> to_string)
|> String.concat " and "
in
let year =
try entry |> member "updated" |> to_string |> fun s -> String.sub s 0 4 with
| Tag_not_found _ ->
try entry |> member "updated" |> to_string |> fun s -> String.sub s 0 4
with Tag_not_found _ ->
entry |> member "published" |> to_string |> fun s -> String.sub s 0 4
in
let cat =
entry |> member_with_attr "primary_category" |> fun (a, _) -> get_attr "term" a
entry |> member_with_attr "primary_category" |> fun (a, _) ->
get_attr "term" a
in
let bibid =
let open Astring in
Expand All @@ -55,21 +52,18 @@ let parse_atom id atom =
in
Printf.sprintf
{|@misc{%s,
title={%s},
author={%s},
year={%s},
eprint={%s},
archivePrefix={arXiv},
primaryClass={%s}
title={%s},
author={%s},
year={%s},
eprint={%s},
archivePrefix={arXiv},
primaryClass={%s}
}|}
bibid
title
authors
year
id
cat
bibid title authors year id cat
in
try bibentry () with
| Ezxmlm.Tag_not_found t ->
try bibentry ()
with Ezxmlm.Tag_not_found t ->
raise
@@ Failure ("Unexpected error parsing arXiv's metadata, tag '" ^ t ^ "' not present.")
@@ Failure
("Unexpected error parsing arXiv's metadata, tag '" ^ t
^ "' not present.")
Loading

0 comments on commit 38eea26

Please sign in to comment.