diff --git a/CHANGELOG.md b/CHANGELOG.md index 00bb32b5c9a1..9f22b99ca7c6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ All notable changes to this project will be documented in this file. ### Changed - Details modal search inputs are now case-insensitive. +- Improved report performance in cases where a site has a lot of unique pathnames. ### Fixed diff --git a/lib/plausible/stats/imported/imported.ex b/lib/plausible/stats/imported/imported.ex index 3e9e6d9fc557..b05e1d2112e0 100644 --- a/lib/plausible/stats/imported/imported.ex +++ b/lib/plausible/stats/imported/imported.ex @@ -222,10 +222,13 @@ defmodule Plausible.Stats.Imported do def merge_imported(q, _, %Query{include_imported: false}, _), do: q def merge_imported(q, site, %Query{dimensions: []} = query, metrics) do + q = paginate_optimization(q, query) + imported_q = site |> Imported.Base.query_imported(query) |> select_imported_metrics(metrics) + |> paginate_optimization(query) from( s in subquery(q), @@ -275,12 +278,15 @@ defmodule Plausible.Stats.Imported do def merge_imported(q, site, query, metrics) do if schema_supports_query?(query) do + q = paginate_optimization(q, query) + imported_q = site |> Imported.Base.query_imported(query) |> where([i], i.visitors > 0) |> group_imported_by(query) |> select_imported_metrics(metrics) + |> paginate_optimization(query) from(s in subquery(q), full_join: i in subquery(imported_q), @@ -299,4 +305,48 @@ defmodule Plausible.Stats.Imported do |> Imported.Base.query_imported(query) |> select_merge([i], %{total_visitors: fragment("sum(?)", i.visitors)}) end + + defp naive_dimension_join(q1, q2, metrics) do + from(a in subquery(q1), + full_join: b in subquery(q2), + on: a.dim0 == b.dim0, + select: %{} + ) + |> select_merge_as([a, b], %{ + dim0: fragment("if(? != 0, ?, ?)", a.dim0, a.dim0, b.dim0) + }) + |> select_joined_metrics(metrics) + end + + # Optimization for cases when grouping by a very high cardinality column. 
+ # + # Instead of joining all rows from main and imported tables, we limit the number of rows + # in both tables to LIMIT N * 100. + # + # This speeds up cases where a site has millions of unique pathnames, reducing the time spent + # JOINing tables by an order of magnitude. + # + # Note that this optimization is lossy as the true top N values can arise from outside the + # top N * 100 items of either subquery. In practice though, this will give plausible results. + # + # We only apply this optimization in cases where we can deterministically ORDER BY. This covers + # opening Plausible dashboard but not more complicated use-cases. + defp paginate_optimization(q, query) do + if is_map(query.pagination) and can_order_by?(query) do + n = (query.pagination.limit + query.pagination.offset) * 100 + + q + |> QueryBuilder.build_order_by(query) + |> limit(^n) + else + q + end + end + + defp can_order_by?(query) do + Enum.all?(query.order_by, fn + {metric, _direction} when is_atom(metric) -> metric in query.metrics + _ -> true + end) + end end diff --git a/lib/plausible/stats/imported/sql/expression.ex b/lib/plausible/stats/imported/sql/expression.ex index a22549b694a4..057bc22d695a 100644 --- a/lib/plausible/stats/imported/sql/expression.ex +++ b/lib/plausible/stats/imported/sql/expression.ex @@ -15,200 +15,108 @@ defmodule Plausible.Stats.Imported.SQL.Expression do @not_set "(not set)" @none "(none)" - def select_imported_metrics(q, []), do: q + def select_imported_metrics( + %Ecto.Query{from: %Ecto.Query.FromExpr{source: {table, _}}} = q, + metrics + ) do + select_clause = + metrics + |> Enum.map(&select_metric(&1, table)) + |> Enum.reduce(%{}, &Map.merge/2) - def select_imported_metrics(q, [:visitors | rest]) do q - |> select_merge([i], %{visitors: sum(i.visitors)}) - |> select_imported_metrics(rest) + |> select_merge(^select_clause) + |> filter_pageviews(metrics, table) end - def select_imported_metrics( - %Ecto.Query{from: %Ecto.Query.FromExpr{source: 
{"imported_custom_events", _}}} = q, - [:events | rest] - ) do - q - |> select_merge([i], %{events: sum(i.events)}) - |> select_imported_metrics(rest) + defp filter_pageviews(q, metrics, table) do + should_filter = :pageviews in metrics or :views_per_visit in metrics + + case {should_filter, table} do + {_, "imported_custom_events"} -> q + {true, _} -> q |> where([i], i.pageviews > 0) + {false, _} -> q + end end - def select_imported_metrics(q, [:events | rest]) do - q - |> select_merge([i], %{events: sum(i.pageviews)}) - |> select_imported_metrics(rest) + defp select_metric(:visitors, _table) do + wrap_alias([i], %{visitors: sum(i.visitors)}) end - def select_imported_metrics( - %Ecto.Query{from: %Ecto.Query.FromExpr{source: {"imported_exit_pages", _}}} = q, - [:visits | rest] - ) do - q - |> select_merge([i], %{visits: sum(i.exits)}) - |> select_imported_metrics(rest) + defp select_metric(:events, "imported_custom_events") do + wrap_alias([i], %{events: sum(i.events)}) end - def select_imported_metrics( - %Ecto.Query{from: %Ecto.Query.FromExpr{source: {"imported_entry_pages", _}}} = q, - [:visits | rest] - ) do - q - |> select_merge([i], %{visits: sum(i.entrances)}) - |> select_imported_metrics(rest) + defp select_metric(:events, _table) do + wrap_alias([i], %{events: sum(i.pageviews)}) end - def select_imported_metrics(q, [:visits | rest]) do - q - |> select_merge([i], %{visits: sum(i.visits)}) - |> select_imported_metrics(rest) + defp select_metric(:visits, "imported_exit_pages") do + wrap_alias([i], %{visits: sum(i.exits)}) end - def select_imported_metrics( - %Ecto.Query{from: %Ecto.Query.FromExpr{source: {"imported_custom_events", _}}} = q, - [:pageviews | rest] - ) do - q - |> select_merge([i], %{pageviews: 0}) - |> select_imported_metrics(rest) + defp select_metric(:visits, "imported_entry_pages") do + wrap_alias([i], %{visits: sum(i.entrances)}) end - def select_imported_metrics(q, [:pageviews | rest]) do - q - |> where([i], i.pageviews > 0) - |> 
select_merge([i], %{pageviews: sum(i.pageviews)}) - |> select_imported_metrics(rest) + defp select_metric(:visits, _table) do + wrap_alias([i], %{visits: sum(i.visits)}) end - def select_imported_metrics( - %Ecto.Query{from: %Ecto.Query.FromExpr{source: {"imported_pages", _}}} = q, - [:bounce_rate | rest] - ) do - q - |> select_merge([i], %{ - bounces: 0, - __internal_visits: 0 - }) - |> select_imported_metrics(rest) + defp select_metric(:pageviews, "imported_custom_events") do + wrap_alias([i], %{pageviews: 0}) end - def select_imported_metrics( - %Ecto.Query{from: %Ecto.Query.FromExpr{source: {"imported_entry_pages", _}}} = q, - [:bounce_rate | rest] - ) do - q - |> select_merge([i], %{ - bounces: sum(i.bounces), - __internal_visits: sum(i.entrances) - }) - |> select_imported_metrics(rest) + defp select_metric(:pageviews, _table) do + wrap_alias([i], %{pageviews: sum(i.pageviews)}) end - def select_imported_metrics( - %Ecto.Query{from: %Ecto.Query.FromExpr{source: {"imported_exit_pages", _}}} = q, - [:bounce_rate | rest] - ) do - q - |> select_merge([i], %{ - bounces: sum(i.bounces), - __internal_visits: sum(i.exits) - }) - |> select_imported_metrics(rest) + defp select_metric(:bounce_rate, "imported_pages") do + wrap_alias([i], %{bounces: 0, __internal_visits: 0}) end - def select_imported_metrics(q, [:bounce_rate | rest]) do - q - |> select_merge([i], %{ - bounces: sum(i.bounces), - __internal_visits: sum(i.visits) - }) - |> select_imported_metrics(rest) + defp select_metric(:bounce_rate, "imported_exit_pages") do + wrap_alias([i], %{bounces: sum(i.bounces), __internal_visits: sum(i.exits)}) end - def select_imported_metrics( - %Ecto.Query{from: %Ecto.Query.FromExpr{source: {"imported_pages", _}}} = q, - [:visit_duration | rest] - ) do - q - |> select_merge([i], %{ - visit_duration: 0, - __internal_visits: 0 - }) - |> select_imported_metrics(rest) + defp select_metric(:bounce_rate, "imported_entry_pages") do + wrap_alias([i], %{bounces: sum(i.bounces), 
__internal_visits: sum(i.entrances)}) end - def select_imported_metrics( - %Ecto.Query{from: %Ecto.Query.FromExpr{source: {"imported_entry_pages", _}}} = q, - [:visit_duration | rest] - ) do - q - |> select_merge([i], %{ - visit_duration: sum(i.visit_duration), - __internal_visits: sum(i.entrances) - }) - |> select_imported_metrics(rest) + defp select_metric(:bounce_rate, _table) do + wrap_alias([i], %{bounces: sum(i.bounces), __internal_visits: sum(i.visits)}) end - def select_imported_metrics( - %Ecto.Query{from: %Ecto.Query.FromExpr{source: {"imported_exit_pages", _}}} = q, - [:visit_duration | rest] - ) do - q - |> select_merge([i], %{ - visit_duration: sum(i.visit_duration), - __internal_visits: sum(i.exits) - }) - |> select_imported_metrics(rest) + defp select_metric(:visit_duration, "imported_pages") do + wrap_alias([i], %{visit_duration: 0}) end - def select_imported_metrics(q, [:visit_duration | rest]) do - q - |> select_merge([i], %{ - visit_duration: sum(i.visit_duration), - __internal_visits: sum(i.visits) - }) - |> select_imported_metrics(rest) + defp select_metric(:visit_duration, "imported_exit_pages") do + wrap_alias([i], %{visit_duration: sum(i.visit_duration), __internal_visits: sum(i.exits)}) end - def select_imported_metrics( - %Ecto.Query{from: %Ecto.Query.FromExpr{source: {"imported_entry_pages", _}}} = q, - [:views_per_visit | rest] - ) do - q - |> where([i], i.pageviews > 0) - |> select_merge([i], %{ - pageviews: sum(i.pageviews), - __internal_visits: sum(i.entrances) - }) - |> select_imported_metrics(rest) + defp select_metric(:visit_duration, "imported_entry_pages") do + wrap_alias([i], %{visit_duration: sum(i.visit_duration), __internal_visits: sum(i.entrances)}) end - def select_imported_metrics( - %Ecto.Query{from: %Ecto.Query.FromExpr{source: {"imported_exit_pages", _}}} = q, - [:views_per_visit | rest] - ) do - q - |> where([i], i.pageviews > 0) - |> select_merge([i], %{ - pageviews: sum(i.pageviews), - __internal_visits: sum(i.exits) 
- }) - |> select_imported_metrics(rest) + defp select_metric(:visit_duration, _table) do + wrap_alias([i], %{visit_duration: sum(i.visit_duration), __internal_visits: sum(i.visits)}) end - def select_imported_metrics(q, [:views_per_visit | rest]) do - q - |> where([i], i.pageviews > 0) - |> select_merge([i], %{ - pageviews: sum(i.pageviews), - __internal_visits: sum(i.visits) - }) - |> select_imported_metrics(rest) + defp select_metric(:views_per_visit, "imported_exit_pages") do + wrap_alias([i], %{pageviews: sum(i.pageviews), __internal_visits: sum(i.exits)}) end - def select_imported_metrics(q, [_ | rest]) do - q - |> select_imported_metrics(rest) + defp select_metric(:views_per_visit, "imported_entry_pages") do + wrap_alias([i], %{pageviews: sum(i.pageviews), __internal_visits: sum(i.entrances)}) + end + + defp select_metric(:views_per_visit, _table) do + wrap_alias([i], %{pageviews: sum(i.pageviews), __internal_visits: sum(i.visits)}) end + defp select_metric(_metric, _table), do: %{} + def group_imported_by(q, query) do Enum.reduce(query.dimensions, q, fn dimension, q -> q @@ -456,17 +364,5 @@ defmodule Plausible.Stats.Imported.SQL.Expression do |> select_joined_metrics(rest) end - def naive_dimension_join(q1, q2, metrics) do - from(a in subquery(q1), - full_join: b in subquery(q2), - on: a.dim0 == b.dim0, - select: %{} - ) - |> select_merge_as([a, b], %{ - dim0: fragment("if(? 
!= 0, ?, ?)", a.dim0, a.dim0, b.dim0) - }) - |> select_joined_metrics(metrics) - end - defp dim(dimension), do: Plausible.Stats.Filters.without_prefix(dimension) end diff --git a/lib/plausible/stats/query_optimizer.ex b/lib/plausible/stats/query_optimizer.ex index 827dfebb169c..280d045e385c 100644 --- a/lib/plausible/stats/query_optimizer.ex +++ b/lib/plausible/stats/query_optimizer.ex @@ -37,8 +37,7 @@ defmodule Plausible.Stats.QueryOptimizer do { Query.set(query, metrics: event_metrics, - include_imported: query.include_imported, - pagination: nil + include_imported: query.include_imported ), split_sessions_query(query, sessions_metrics) } @@ -171,8 +170,7 @@ defmodule Plausible.Stats.QueryOptimizer do filters: filters, metrics: session_metrics, dimensions: dimensions, - include_imported: query.include_imported, - pagination: nil + include_imported: query.include_imported ) end diff --git a/lib/plausible/stats/sql/query_builder.ex b/lib/plausible/stats/sql/query_builder.ex index 5aedeb5f7c59..5337268e55bd 100644 --- a/lib/plausible/stats/sql/query_builder.ex +++ b/lib/plausible/stats/sql/query_builder.ex @@ -27,6 +27,10 @@ defmodule Plausible.Stats.SQL.QueryBuilder do |> select_total_rows(query.include.total_rows) end + def build_order_by(q, query) do + Enum.reduce(query.order_by || [], q, &build_order_by(&2, query, &1)) + end + defp build_events_query(_site, %Query{metrics: []}), do: nil defp build_events_query(site, events_query) do @@ -153,11 +157,7 @@ defmodule Plausible.Stats.SQL.QueryBuilder do |> group_by([], selected_as(^key)) end - defp build_order_by(q, query) do - Enum.reduce(query.order_by || [], q, &build_order_by(&2, query, &1)) - end - - def build_order_by(q, query, {metric_or_dimension, order_direction}) do + defp build_order_by(q, query, {metric_or_dimension, order_direction}) do order_by( q, [t], diff --git a/lib/plausible/stats/sql/special_metrics.ex b/lib/plausible/stats/sql/special_metrics.ex index e6b3aeba9065..80fcd0150a32 100644 --- 
a/lib/plausible/stats/sql/special_metrics.ex +++ b/lib/plausible/stats/sql/special_metrics.ex @@ -26,7 +26,8 @@ defmodule Plausible.Stats.SQL.SpecialMetrics do |> remove_filters_ignored_in_totals_query() |> Query.set( dimensions: [], - include_imported: query.include_imported + include_imported: query.include_imported, + pagination: nil ) q @@ -57,7 +58,8 @@ defmodule Plausible.Stats.SQL.SpecialMetrics do |> Query.set( dimensions: [], include_imported: query.include_imported, - preloaded_goals: [] + preloaded_goals: [], + pagination: nil ) q @@ -100,7 +102,8 @@ defmodule Plausible.Stats.SQL.SpecialMetrics do metrics: [:visitors], order_by: [], include_imported: query.include_imported, - preloaded_goals: [] + preloaded_goals: [], + pagination: nil ) from(e in subquery(q), diff --git a/test/plausible_web/controllers/api/external_stats_controller/query_imported_test.exs b/test/plausible_web/controllers/api/external_stats_controller/query_imported_test.exs index cd887fb7418a..f9cac1f88704 100644 --- a/test/plausible_web/controllers/api/external_stats_controller/query_imported_test.exs +++ b/test/plausible_web/controllers/api/external_stats_controller/query_imported_test.exs @@ -1223,5 +1223,68 @@ defmodule PlausibleWeb.Api.ExternalStatsController.QueryImportedTest do %{"dimensions" => ["Kärdla", "Estonia"], "metrics" => [2]} ] end + + test "page breakdown with paginate_optimization (ideal case)", %{ + conn: conn, + site: site + } do + site_import = insert(:site_import, site: site) + + populate_stats( + site, + site_import.id, + [ + build(:pageview, pathname: "/99", timestamp: ~N[2021-01-01 00:00:00]) + ] ++ + Enum.map(1..100, fn i -> + build(:imported_pages, page: "/#{i}", pageviews: 1, visitors: 1, date: ~D[2021-01-01]) + end) + ) + + conn = + post(conn, "/api/v2/query", %{ + "site_id" => site.domain, + "metrics" => ["pageviews"], + "date_range" => "all", + "dimensions" => ["event:page"], + "include" => %{"imports" => true}, + "pagination" => %{"limit" => 1} + }) + + 
assert json_response(conn, 200)["results"] == [%{"dimensions" => ["/99"], "metrics" => [2]}] + end + + test "page breakdown with paginate_optimization (lossy case)", %{ + conn: conn, + site: site + } do + site_import = insert(:site_import, site: site) + + populate_stats( + site, + site_import.id, + [ + build(:pageview, pathname: "/99", timestamp: ~N[2021-01-01 00:00:00]) + ] ++ + Enum.map(1..200, fn i -> + build(:imported_pages, page: "/#{i}", pageviews: 1, visitors: 1, date: ~D[2021-01-01]) + end) + ) + + conn = + post(conn, "/api/v2/query", %{ + "site_id" => site.domain, + "metrics" => ["pageviews"], + "date_range" => "all", + "dimensions" => ["event:page"], + "include" => %{"imports" => true}, + "pagination" => %{"limit" => 1} + }) + + [%{"dimensions" => ["/99"], "metrics" => [pageviews]}] = json_response(conn, 200)["results"] + + # This is non-deterministic since /99 might not be in the top N items of imported pages subquery. + assert pageviews in 1..2 + end end end