diff --git a/apps/transport/lib/jobs/new_datagouv_datasets_job.ex b/apps/transport/lib/jobs/new_datagouv_datasets_job.ex index 80782e52dd..879cab2b64 100644 --- a/apps/transport/lib/jobs/new_datagouv_datasets_job.ex +++ b/apps/transport/lib/jobs/new_datagouv_datasets_job.ex @@ -35,9 +35,9 @@ defmodule Transport.Jobs.NewDatagouvDatasetsJob do "trottinette", "vls", "scooter", + "scooters", "libre-service", - "libre service", - "scooter" + "libre service" ]), formats: MapSet.new(["gbfs"]) }, @@ -51,7 +51,7 @@ defmodule Transport.Jobs.NewDatagouvDatasetsJob do "etalab/schema-comptage-mobilites-measure", "etalab/schema-comptage-mobilites-site" ], - tags: MapSet.new(["cyclable", "parking", "stationnement", "vélo"]), + tags: MapSet.new(["cyclable", "cyclables", "parking", "parkings", "stationnement", "vélo", "vélos"]), formats: MapSet.new([]) }, %{ @@ -221,7 +221,17 @@ defmodule Transport.Jobs.NewDatagouvDatasetsJob do defp string_matches?(str, %{formats: formats, tags: tags} = _rule) when is_binary(str) do searches = MapSet.union(formats, tags) |> MapSet.to_list() |> Enum.map(&normalize/1) - str |> normalize() |> String.contains?(searches) + {words_with_spaces, words_without_spaces} = Enum.split_with(searches, &String.contains?(&1, " ")) + + match_without_spaces = + not (str + |> normalize() + |> String.split(~r/\s+/) + |> MapSet.new() + |> MapSet.disjoint?(MapSet.new(words_without_spaces))) + + match_with_spaces = str |> normalize() |> String.contains?(words_with_spaces) + match_without_spaces || match_with_spaces end defp tags_is_relevant?(%{"tags" => tags} = _dataset, rule) do @@ -257,8 +267,26 @@ defmodule Transport.Jobs.NewDatagouvDatasetsJob do "velo" iex> normalize("Châteauroux") "chateauroux" + iex> normalize("J'adore manger") + "j'adore manger" """ def normalize(value) do - value |> String.normalize(:nfd) |> String.replace(~r/[^A-z]/u, "") |> String.downcase() + value + |> String.downcase() + |> String.graphemes() + |> Enum.map_join("", &normalize_grapheme/1) + end + + defp normalize_grapheme(grapheme) do + case String.normalize(grapheme, :nfd) do + <> when is_binary(rest) -> + case String.valid?(<>) do + true -> <> + false -> "" + end + + _ -> + grapheme + end end end diff --git a/apps/transport/test/transport/jobs/new_datagouv_datasets_job_test.exs b/apps/transport/test/transport/jobs/new_datagouv_datasets_job_test.exs index e8e75f37d4..d1fba619ef 100644 --- a/apps/transport/test/transport/jobs/new_datagouv_datasets_job_test.exs +++ b/apps/transport/test/transport/jobs/new_datagouv_datasets_job_test.exs @@ -60,6 +60,8 @@ defmodule Transport.Test.Transport.Jobs.NewDatagouvDatasetsJobTest do }) assert :no_match == relevant_fn.(%{base | "title" => "Résultat des élections"}) + # does not match on the word `velo` in the middle of the tag + assert :no_match == relevant_fn.(%{base | "tags" => ["developpement-du-territoire"]}) assert %{category: "IRVE"} = relevant_fn.(%{