Allow changing the hash algorithm during canonicalization
marcelotto committed Feb 28, 2024
1 parent d013992 commit 7b60f0e
Showing 5 changed files with 72 additions and 28 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -25,6 +25,9 @@ Elixir versions < 1.12 are no longer supported
- The Turtle encoder now sorts the prefixes (based on `RDF.PrefixMap.to_sorted_list/1`),
which has become necessary, since OTP 26 maps are now unordered even in smaller cases
(previously only larger maps were unordered).
- The hash algorithm to be used for RDF canonicalization can now be configured, either
  with the `:hash_algorithm` keyword option or via the `:canon_hash_algorithm` application
  runtime configuration.
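
A minimal sketch of the application-level default described in this entry; the `:sha384` value and the `config/config.exs` location are illustrative, not part of the commit:

# config/config.exs (illustrative): sets the default hash algorithm used by
# RDF canonicalization whenever no :hash_algorithm option is passed
import Config

config :rdf, canon_hash_algorithm: :sha384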

### Fixed

47 changes: 33 additions & 14 deletions lib/rdf/canonicalization/canonicalization.ex
@@ -10,11 +10,25 @@ defmodule RDF.Canonicalization do

import RDF.Sigils

@hash_algorithm_config_doc """
- `:hash_algorithm`: Allows setting the hash algorithm to be used. Any of the `:crypto.hash_algorithm()`
values is allowed. Defaults to the runtime-configured `:canon_hash_algorithm` of the `:rdf`
application, or `:sha256` if not configured otherwise.
config :rdf,
canon_hash_algorithm: :sha512
"""

@doc """
Canonicalizes the blank nodes of a graph or dataset according to the RDF Dataset Canonicalization spec.
This function always returns a `RDF.Dataset`. If you want to canonicalize a `RDF.Graph` and
get a `RDF.Graph` back, use `RDF.Graph.canonicalize/1`.
get a `RDF.Graph` back, use `RDF.Graph.canonicalize/2`.
## Options
#{@hash_algorithm_config_doc}
## Example
@@ -23,14 +37,18 @@ defmodule RDF.Canonicalization do
RDF.Dataset.new([{~B<c14n0>, EX.p(), ~B<c14n1>}, {~B<c14n1>, EX.p(), ~B<c14n0>}])
"""
@spec canonicalize(RDF.Graph.t() | RDF.Dataset.t()) :: RDF.Dataset.t()
def canonicalize(input) do
urdna2015(input)
@spec canonicalize(RDF.Graph.t() | RDF.Dataset.t(), keyword) :: RDF.Dataset.t()
def canonicalize(input, opts \\ []) do
urdna2015(input, opts)
end

@doc """
Checks whether two graphs or datasets are equal, regardless of the concrete names of the blank nodes they contain.
## Options
#{@hash_algorithm_config_doc}
## Examples
iex> RDF.Graph.new([{~B<foo>, EX.p(), ~B<bar>}, {~B<bar>, EX.p(), 42}])
@@ -43,14 +61,15 @@ defmodule RDF.Canonicalization do
...> RDF.Graph.new([{~B<b1>, EX.p(), ~B<b2>}, {~B<b3>, EX.p(), 42}]))
false
"""
@spec isomorphic?(RDF.Graph.t() | RDF.Dataset.t(), RDF.Graph.t() | RDF.Dataset.t()) :: boolean
def isomorphic?(a, b) do
a |> canonicalize() |> Dataset.equal?(canonicalize(b))
@spec isomorphic?(RDF.Graph.t() | RDF.Dataset.t(), RDF.Graph.t() | RDF.Dataset.t(), keyword) ::
boolean
def isomorphic?(a, b, opts \\ []) do
a |> canonicalize(opts) |> Dataset.equal?(canonicalize(b, opts))
end

defp urdna2015(input) do
defp urdna2015(input, opts) do
input
|> State.new()
|> State.new(opts)
|> create_canonical_identifiers_for_single_node_hashes()
|> create_canonical_identifiers_for_multiple_node_hashes()
|> apply_canonicalization(input)
@@ -155,7 +174,7 @@
# TODO: "Sort nquads in Unicode code point order"
|> Enum.sort()
|> Enum.join()
|> hash()
|> hash(state)

# |> IO.inspect(label: "1deg: node: #{inspect(ref_bnode_id)}, hash_first_degree_quads")
end
@@ -178,7 +197,7 @@
hash_first_degree_quads(state, related)
end

hash(input)
hash(input, state)
# |> IO.inspect(label: "hrel: input: #{inspect(input)}, hash_related_bnode")
end

@@ -310,7 +329,7 @@

# IO.puts("ndeg: datatohash: #{data_to_hash}, hash: #{hash(data_to_hash)}")

{hash(data_to_hash), issuer}
{hash(data_to_hash, state), issuer}
end

# 4.8.2.3.1) Group adjacent bnodes by hash
@@ -336,7 +355,7 @@
end)
end

defp hash(data) do
:crypto.hash(:sha256, data) |> Base.encode16(case: :lower)
defp hash(data, state) do
:crypto.hash(state.hash_algorithm, data) |> Base.encode16(case: :lower)
end
end
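
A usage sketch of the per-call option added above; the graph contents and the `http://example.com/p` predicate are made up for illustration:

import RDF.Sigils

graph = RDF.Graph.new([{~B<a>, RDF.iri("http://example.com/p"), ~B<b>}])

# Defaults to the :canon_hash_algorithm application setting, or :sha256 if unset
RDF.Canonicalization.canonicalize(graph)

# Overrides the hash algorithm for this call only
RDF.Canonicalization.canonicalize(graph, hash_algorithm: :sha384)

# isomorphic?/3 accepts the same option
RDF.Canonicalization.isomorphic?(graph, graph, hash_algorithm: :sha384)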
16 changes: 13 additions & 3 deletions lib/rdf/canonicalization/state.ex
@@ -10,10 +10,20 @@ defmodule RDF.Canonicalization.State do

defstruct bnode_to_quads: nil,
hash_to_bnodes: %{},
canonical_issuer: IdentifierIssuer.canonical()
canonical_issuer: IdentifierIssuer.canonical(),
hash_algorithm: nil

def new(input) do
%__MODULE__{bnode_to_quads: bnode_to_quads(input)}
def new(input, opts) do
hash_algorithm = Keyword.get_lazy(opts, :hash_algorithm, &default_hash_algorithm/0)

%__MODULE__{
bnode_to_quads: bnode_to_quads(input),
hash_algorithm: hash_algorithm
}
end

def default_hash_algorithm do
Application.get_env(:rdf, :canon_hash_algorithm, :sha256)
end

def issue_canonical_identifier(state, identifier) do
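The resolution order implemented by `State.new/2` (explicit keyword option first, then the `:rdf` application environment, then `:sha256`) can be exercised with plain `Keyword.get_lazy/3`; a sketch, not taken from the commit:

default = fn -> Application.get_env(:rdf, :canon_hash_algorithm, :sha256) end

# An explicit option wins over the application environment
Keyword.get_lazy([hash_algorithm: :sha512], :hash_algorithm, default)
#=> :sha512

# Without the option, the application environment (or :sha256) is used
Keyword.get_lazy([], :hash_algorithm, default)
#=> :sha256, unless :canon_hash_algorithm is configured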
12 changes: 6 additions & 6 deletions lib/rdf/model/graph.ex
@@ -1328,9 +1328,9 @@
...> RDF.Graph.new([{~B<b1>, EX.p(), ~B<b2>}, {~B<b3>, EX.p(), 42}]))
false
"""
@spec isomorphic?(RDF.Graph.t(), RDF.Graph.t()) :: boolean
def isomorphic?(%__MODULE__{} = graph1, %__MODULE__{} = graph2) do
graph1 |> canonicalize() |> equal?(canonicalize(graph2))
@spec isomorphic?(RDF.Graph.t(), RDF.Graph.t(), keyword) :: boolean
def isomorphic?(%__MODULE__{} = graph1, %__MODULE__{} = graph2, opts \\ []) do
graph1 |> canonicalize(opts) |> equal?(canonicalize(graph2, opts))
end

@doc """
@@ -1343,10 +1343,10 @@
RDF.Graph.new([{~B<c14n0>, EX.p(), ~B<c14n1>}, {~B<c14n1>, EX.p(), ~B<c14n0>}])
"""
@spec canonicalize(RDF.Graph.t()) :: RDF.Graph.t()
def canonicalize(%__MODULE__{} = graph) do
@spec canonicalize(RDF.Graph.t(), keyword) :: RDF.Graph.t()
def canonicalize(%__MODULE__{} = graph, opts \\ []) do
graph
|> RDF.Canonicalization.canonicalize()
|> RDF.Canonicalization.canonicalize(opts)
|> Dataset.default_graph()
end

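A small sketch contrasting the graph-returning wrapper with the module above; the graph contents are illustrative:

import RDF.Sigils

graph = RDF.Graph.new([{~B<a>, RDF.iri("http://example.com/p"), ~B<b>}])

# RDF.Graph.canonicalize/2 threads the option through and returns an RDF.Graph
%RDF.Graph{} = RDF.Graph.canonicalize(graph, hash_algorithm: :sha384)

# RDF.Canonicalization.canonicalize/2 returns an RDF.Dataset instead
%RDF.Dataset{} = RDF.Canonicalization.canonicalize(graph, hash_algorithm: :sha384)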
22 changes: 17 additions & 5 deletions test/acceptance/canonicalization_w3c_test.exs
@@ -18,18 +18,14 @@ defmodule RDF.Canonicalization.W3C.Test do

TestSuite.test_cases(@manifest, RDFC.RDFC10EvalTest)
|> Enum.each(fn test_case ->
if RDFC.hashAlgorithm() in RDF.Description.predicates(test_case) do
@tag skip: "missing ability to change hash algorithm"
end

@tag test_case: test_case
test TestSuite.test_title(test_case), %{test_case: test_case} do
file_url = to_string(TestSuite.test_input_file(test_case))
input = test_case_file(test_case, &TestSuite.test_input_file/1)
result = test_case_file(test_case, &TestSuite.test_output_file/1)

assert NQuads.read_file!(input, base: file_url)
|> Canonicalization.canonicalize() ==
|> Canonicalization.canonicalize(hash_algorithm_opts(test_case)) ==
NQuads.read_file!(result)
end
end)
@@ -43,4 +39,20 @@
|> String.trim_leading(@base)
)
end

defp hash_algorithm_opts(test_case) do
case RDFC.hashAlgorithm(test_case) do
nil ->
[]

[hash_algorithm] ->
[
hash_algorithm:
hash_algorithm
|> to_string()
|> String.downcase()
|> String.to_atom()
]
end
end
end
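
A sketch of the name conversion performed by `hash_algorithm_opts/1` for a test case whose manifest specifies a hash algorithm; the "SHA384" input is an assumed example value:

# "SHA384" from the test manifest becomes the :crypto-compatible atom :sha384
"SHA384"
|> String.downcase()
|> String.to_atom()
#=> :sha384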
