Allow changing the hash algorithm during canonicalization
marcelotto committed Feb 28, 2024
1 parent d013992 commit 7b60f0e
Showing 5 changed files with 72 additions and 28 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -25,6 +25,9 @@ Elixir versions < 1.12 are no longer supported
- The Turtle encoder now sorts the prefixes (based on `RDF.PrefixMap.to_sorted_list/1`),
which has become necessary, since OTP 26 maps are now unordered even in smaller cases
(previously only larger maps were unordered).
- The hash algorithm to be used for RDF canonicalization can now be configured, either
  with the `:hash_algorithm` keyword option or via the `:canon_hash_algorithm` application
  runtime configuration.
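
A minimal sketch of the application-level default described in this entry; the `:sha384` value and the `config/config.exs` location are illustrative, not part of the commit:

# config/config.exs (illustrative): sets the default hash algorithm used by
# RDF canonicalization whenever no :hash_algorithm option is passed
import Config

config :rdf, canon_hash_algorithm: :sha384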

### Fixed

47 changes: 33 additions & 14 deletions lib/rdf/canonicalization/canonicalization.ex
@@ -10,11 +10,25 @@ defmodule RDF.Canonicalization do

import RDF.Sigils

@hash_algorithm_config_doc """
- `:hash_algorithm`: Allows setting the hash algorithm to be used. Any of the `:crypto.hash_algorithm()`
values is allowed. Defaults to the runtime-configured `:canon_hash_algorithm` of the `:rdf`
application, or `:sha256` if not configured otherwise.
config :rdf,
canon_hash_algorithm: :sha512
"""

@doc """
Canonicalizes the blank nodes of a graph or dataset according to the RDF Dataset Canonicalization spec.
This function always returns a `RDF.Dataset`. If you want to canonicalize a `RDF.Graph` and
get a `RDF.Graph` back, use `RDF.Graph.canonicalize/1`.
get a `RDF.Graph` back, use `RDF.Graph.canonicalize/2`.
## Options
#{@hash_algorithm_config_doc}
## Example
@@ -23,14 +37,18 @@ defmodule RDF.Canonicalization do
RDF.Dataset.new([{~B<c14n0>, EX.p(), ~B<c14n1>}, {~B<c14n1>, EX.p(), ~B<c14n0>}])
"""
@spec canonicalize(RDF.Graph.t() | RDF.Dataset.t()) :: RDF.Dataset.t()
def canonicalize(input) do
urdna2015(input)
@spec canonicalize(RDF.Graph.t() | RDF.Dataset.t(), keyword) :: RDF.Dataset.t()
def canonicalize(input, opts \\ []) do
urdna2015(input, opts)
end

@doc """
Checks whether two graphs or datasets are equal, regardless of the concrete names of the blank nodes they contain.
## Options
#{@hash_algorithm_config_doc}
## Examples
iex> RDF.Graph.new([{~B<foo>, EX.p(), ~B<bar>}, {~B<bar>, EX.p(), 42}])
@@ -43,14 +61,15 @@ defmodule RDF.Canonicalization do
...> RDF.Graph.new([{~B<b1>, EX.p(), ~B<b2>}, {~B<b3>, EX.p(), 42}]))
false
"""
@spec isomorphic?(RDF.Graph.t() | RDF.Dataset.t(), RDF.Graph.t() | RDF.Dataset.t()) :: boolean
def isomorphic?(a, b) do
a |> canonicalize() |> Dataset.equal?(canonicalize(b))
@spec isomorphic?(RDF.Graph.t() | RDF.Dataset.t(), RDF.Graph.t() | RDF.Dataset.t(), keyword) ::
boolean
def isomorphic?(a, b, opts \\ []) do
a |> canonicalize(opts) |> Dataset.equal?(canonicalize(b, opts))
end

defp urdna2015(input) do
defp urdna2015(input, opts) do
input
|> State.new()
|> State.new(opts)
|> create_canonical_identifiers_for_single_node_hashes()
|> create_canonical_identifiers_for_multiple_node_hashes()
|> apply_canonicalization(input)
@@ -155,7 +174,7 @@
# TODO: "Sort nquads in Unicode code point order"
|> Enum.sort()
|> Enum.join()
|> hash()
|> hash(state)

# |> IO.inspect(label: "1deg: node: #{inspect(ref_bnode_id)}, hash_first_degree_quads")
end
@@ -178,7 +197,7 @@
hash_first_degree_quads(state, related)
end

hash(input)
hash(input, state)
# |> IO.inspect(label: "hrel: input: #{inspect(input)}, hash_related_bnode")
end

@@ -310,7 +329,7 @@

# IO.puts("ndeg: datatohash: #{data_to_hash}, hash: #{hash(data_to_hash)}")

{hash(data_to_hash), issuer}
{hash(data_to_hash, state), issuer}
end

# 4.8.2.3.1) Group adjacent bnodes by hash
@@ -336,7 +355,7 @@
end)
end

defp hash(data) do
:crypto.hash(:sha256, data) |> Base.encode16(case: :lower)
defp hash(data, state) do
:crypto.hash(state.hash_algorithm, data) |> Base.encode16(case: :lower)
end
end
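
A usage sketch of the per-call option added above; the graph contents and the `http://example.com/p` predicate are made up for illustration:

import RDF.Sigils

graph = RDF.Graph.new([{~B<a>, RDF.iri("http://example.com/p"), ~B<b>}])

# Defaults to the :canon_hash_algorithm application setting, or :sha256 if unset
RDF.Canonicalization.canonicalize(graph)

# Overrides the hash algorithm for this call only
RDF.Canonicalization.canonicalize(graph, hash_algorithm: :sha384)

# isomorphic?/3 accepts the same option
RDF.Canonicalization.isomorphic?(graph, graph, hash_algorithm: :sha384)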
16 changes: 13 additions & 3 deletions lib/rdf/canonicalization/state.ex
@@ -10,10 +10,20 @@ defmodule RDF.Canonicalization.State do

defstruct bnode_to_quads: nil,
hash_to_bnodes: %{},
canonical_issuer: IdentifierIssuer.canonical()
canonical_issuer: IdentifierIssuer.canonical(),
hash_algorithm: nil

def new(input) do
%__MODULE__{bnode_to_quads: bnode_to_quads(input)}
def new(input, opts) do
hash_algorithm = Keyword.get_lazy(opts, :hash_algorithm, &default_hash_algorithm/0)

%__MODULE__{
bnode_to_quads: bnode_to_quads(input),
hash_algorithm: hash_algorithm
}
end

def default_hash_algorithm do
Application.get_env(:rdf, :canon_hash_algorithm, :sha256)
end

def issue_canonical_identifier(state, identifier) do
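The resolution order implemented by `State.new/2` (explicit keyword option first, then the `:rdf` application environment, then `:sha256`) can be exercised with plain `Keyword.get_lazy/3`; a sketch, not taken from the commit:

default = fn -> Application.get_env(:rdf, :canon_hash_algorithm, :sha256) end

# An explicit option wins over the application environment
Keyword.get_lazy([hash_algorithm: :sha512], :hash_algorithm, default)
#=> :sha512

# Without the option, the application environment (or :sha256) is used
Keyword.get_lazy([], :hash_algorithm, default)
#=> :sha256, unless :canon_hash_algorithm is configured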
12 changes: 6 additions & 6 deletions lib/rdf/model/graph.ex
@@ -1328,9 +1328,9 @@
...> RDF.Graph.new([{~B<b1>, EX.p(), ~B<b2>}, {~B<b3>, EX.p(), 42}]))
false
"""
@spec isomorphic?(RDF.Graph.t(), RDF.Graph.t()) :: boolean
def isomorphic?(%__MODULE__{} = graph1, %__MODULE__{} = graph2) do
graph1 |> canonicalize() |> equal?(canonicalize(graph2))
@spec isomorphic?(RDF.Graph.t(), RDF.Graph.t(), keyword) :: boolean
def isomorphic?(%__MODULE__{} = graph1, %__MODULE__{} = graph2, opts \\ []) do
graph1 |> canonicalize(opts) |> equal?(canonicalize(graph2, opts))
end

@doc """
@@ -1343,10 +1343,10 @@
RDF.Graph.new([{~B<c14n0>, EX.p(), ~B<c14n1>}, {~B<c14n1>, EX.p(), ~B<c14n0>}])
"""
@spec canonicalize(RDF.Graph.t()) :: RDF.Graph.t()
def canonicalize(%__MODULE__{} = graph) do
@spec canonicalize(RDF.Graph.t(), keyword) :: RDF.Graph.t()
def canonicalize(%__MODULE__{} = graph, opts \\ []) do
graph
|> RDF.Canonicalization.canonicalize()
|> RDF.Canonicalization.canonicalize(opts)
|> Dataset.default_graph()
end

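A small sketch contrasting the graph-returning wrapper with the module above; the graph contents are illustrative:

import RDF.Sigils

graph = RDF.Graph.new([{~B<a>, RDF.iri("http://example.com/p"), ~B<b>}])

# RDF.Graph.canonicalize/2 threads the option through and returns an RDF.Graph
%RDF.Graph{} = RDF.Graph.canonicalize(graph, hash_algorithm: :sha384)

# RDF.Canonicalization.canonicalize/2 returns an RDF.Dataset instead
%RDF.Dataset{} = RDF.Canonicalization.canonicalize(graph, hash_algorithm: :sha384)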
22 changes: 17 additions & 5 deletions test/acceptance/canonicalization_w3c_test.exs
@@ -18,18 +18,14 @@ defmodule RDF.Canonicalization.W3C.Test do

TestSuite.test_cases(@manifest, RDFC.RDFC10EvalTest)
|> Enum.each(fn test_case ->
if RDFC.hashAlgorithm() in RDF.Description.predicates(test_case) do
@tag skip: "missing ability to change hash algorithm"
end

@tag test_case: test_case
test TestSuite.test_title(test_case), %{test_case: test_case} do
file_url = to_string(TestSuite.test_input_file(test_case))
input = test_case_file(test_case, &TestSuite.test_input_file/1)
result = test_case_file(test_case, &TestSuite.test_output_file/1)

assert NQuads.read_file!(input, base: file_url)
|> Canonicalization.canonicalize() ==
|> Canonicalization.canonicalize(hash_algorithm_opts(test_case)) ==
NQuads.read_file!(result)
end
end)
@@ -43,4 +39,20 @@
|> String.trim_leading(@base)
)
end

defp hash_algorithm_opts(test_case) do
case RDFC.hashAlgorithm(test_case) do
nil ->
[]

[hash_algorithm] ->
[
hash_algorithm:
hash_algorithm
|> to_string()
|> String.downcase()
|> String.to_atom()
]
end
end
end
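
A sketch of the name conversion performed by `hash_algorithm_opts/1` for a test case whose manifest specifies a hash algorithm; the "SHA384" input is an assumed example value:

# "SHA384" from the test manifest becomes the :crypto-compatible atom :sha384
"SHA384"
|> String.downcase()
|> String.to_atom()
#=> :sha384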
