Skip to content

Commit

Permalink
Improve report performance with high-cardinality import joins (#4848)
Browse files Browse the repository at this point in the history
* Improve report performance in cases where a site has a lot of unique pathnames

Ref: https://3.basecamp.com/5308029/buckets/39750953/card_tables/cards/8052057081

JOINs in ClickHouse are slow. In one degenerate case, a user had
over 20 million unique paths in an import, which resulted in extremely slow
JOINs. This change introduces a somewhat hacky workaround: it limits the
number of rows from each side that are fed into the JOIN.

Query timing without this change:
```
9 rows in set. Elapsed: 11.383 sec. Processed 49.16 million rows, 5.75 GB (4.32 million rows/s., 505.29 MB/s.)
Peak memory usage: 14.75 GiB.
```

After:
```
9 rows in set. Elapsed: 0.572 sec. Processed 49.18 million rows, 5.75 GB (86.03 million rows/s., 10.06 GB/s.)
Peak memory usage: 9.01 GiB.
```

* Splitting should no longer remove pagination. Handle special cases in special_metrics.ex

* select_merge_as in imports

This sets up selected_as aliases which will be used in a subsequent commit

* Add explicit ORDER BY to import

* Rewrite comment

* quoting

* merge conflict

* Split test

* Merge conflict fail fix
  • Loading branch information
macobo authored Dec 5, 2024
1 parent 3afec60 commit bec14ee
Show file tree
Hide file tree
Showing 7 changed files with 187 additions and 176 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ All notable changes to this project will be documented in this file.
### Changed

- Details modal search inputs are now case-insensitive.
- Improved report performance in cases where a site has a lot of unique pathnames

### Fixed

Expand Down
50 changes: 50 additions & 0 deletions lib/plausible/stats/imported/imported.ex
Original file line number Diff line number Diff line change
Expand Up @@ -222,10 +222,13 @@ defmodule Plausible.Stats.Imported do
# Imported data is excluded from this query (`include_imported: false`):
# return the given query unchanged.
def merge_imported(q, _, %Query{include_imported: false}, _), do: q

def merge_imported(q, site, %Query{dimensions: []} = query, metrics) do
q = paginate_optimization(q, query)

imported_q =
site
|> Imported.Base.query_imported(query)
|> select_imported_metrics(metrics)
|> paginate_optimization(query)

from(
s in subquery(q),
Expand Down Expand Up @@ -275,12 +278,15 @@ defmodule Plausible.Stats.Imported do

def merge_imported(q, site, query, metrics) do
if schema_supports_query?(query) do
q = paginate_optimization(q, query)

imported_q =
site
|> Imported.Base.query_imported(query)
|> where([i], i.visitors > 0)
|> group_imported_by(query)
|> select_imported_metrics(metrics)
|> paginate_optimization(query)

from(s in subquery(q),
full_join: i in subquery(imported_q),
Expand All @@ -299,4 +305,48 @@ defmodule Plausible.Stats.Imported do
|> Imported.Base.query_imported(query)
|> select_merge([i], %{total_visitors: fragment("sum(?)", i.visitors)})
end

# Combines two single-dimension subqueries into one query.
#
# `q1` and `q2` are FULL JOINed on their `dim0` column. The resulting `dim0` is taken
# from `q1` unless that value equals 0, in which case the value from `q2` is used.
# The requested `metrics` are then added via `select_joined_metrics/2`.
defp naive_dimension_join(q1, q2, metrics) do
from(a in subquery(q1),
full_join: b in subquery(q2),
on: a.dim0 == b.dim0,
select: %{}
)
# NOTE(review): presumably `a.dim0` is 0 for rows that only exist in `q2` — confirm
# against the join/null settings used for these queries.
|> select_merge_as([a, b], %{
dim0: fragment("if(? != 0, ?, ?)", a.dim0, a.dim0, b.dim0)
})
|> select_joined_metrics(metrics)
end

# Performance optimization for grouping by a very high cardinality column.
#
# Rather than JOINing every row of the main and imported subqueries, each subquery is
# ordered and capped at `(limit + offset) * 100` rows before the JOIN takes place.
#
# This helps sites with millions of unique pathnames, where the time spent JOINing
# drops by roughly an order of magnitude.
#
# The cap makes the result lossy: a value that belongs in the true top N may fall outside
# the capped rows of either subquery. In practice the results are still plausible.
#
# The cap is only applied when the query is paginated and `can_order_by?/1` confirms the
# subquery can be ordered deterministically. That covers opening the Plausible dashboard
# but not more complicated use-cases; in every other case `q` is returned unchanged.
defp paginate_optimization(q, query) do
  if not is_map(query.pagination) or not can_order_by?(query) do
    q
  else
    row_cap = 100 * (query.pagination.limit + query.pagination.offset)
    ordered = QueryBuilder.build_order_by(q, query)

    limit(ordered, ^row_cap)
  end
end

# Returns true when every `order_by` entry keyed by an atom refers to a metric listed in
# `query.metrics`. Entries with any other shape are always accepted.
defp can_order_by?(query) do
  not Enum.any?(query.order_by, fn
    {key, _dir} when is_atom(key) -> key not in query.metrics
    _other -> false
  end)
end
end
224 changes: 60 additions & 164 deletions lib/plausible/stats/imported/sql/expression.ex
Original file line number Diff line number Diff line change
Expand Up @@ -15,200 +15,108 @@ defmodule Plausible.Stats.Imported.SQL.Expression do
@not_set "(not set)"
@none "(none)"

def select_imported_metrics(q, []), do: q
def select_imported_metrics(
%Ecto.Query{from: %Ecto.Query.FromExpr{source: {table, _}}} = q,
metrics
) do
select_clause =
metrics
|> Enum.map(&select_metric(&1, table))
|> Enum.reduce(%{}, &Map.merge/2)

def select_imported_metrics(q, [:visitors | rest]) do
q
|> select_merge([i], %{visitors: sum(i.visitors)})
|> select_imported_metrics(rest)
|> select_merge(q, ^select_clause)
|> filter_pageviews(metrics, table)
end

def select_imported_metrics(
%Ecto.Query{from: %Ecto.Query.FromExpr{source: {"imported_custom_events", _}}} = q,
[:events | rest]
) do
q
|> select_merge([i], %{events: sum(i.events)})
|> select_imported_metrics(rest)
# Restricts the imported query to rows with `pageviews > 0` when `:pageviews` or
# `:views_per_visit` is among `metrics`. Queries against "imported_custom_events" are
# never filtered; every other combination returns `q` unchanged.
defp filter_pageviews(q, metrics, table) do
  wants_pageviews? = Enum.any?([:pageviews, :views_per_visit], &(&1 in metrics))

  if wants_pageviews? and table != "imported_custom_events" do
    where(q, [i], i.pageviews > 0)
  else
    q
  end
end

def select_imported_metrics(q, [:events | rest]) do
q
|> select_merge([i], %{events: sum(i.pageviews)})
|> select_imported_metrics(rest)
# :visitors — sum of the `visitors` column, for every imported table.
defp select_metric(:visitors, _table) do
wrap_alias([i], %{visitors: sum(i.visitors)})
end

def select_imported_metrics(
%Ecto.Query{from: %Ecto.Query.FromExpr{source: {"imported_exit_pages", _}}} = q,
[:visits | rest]
) do
q
|> select_merge([i], %{visits: sum(i.exits)})
|> select_imported_metrics(rest)
# :events on "imported_custom_events" — sum of the `events` column.
defp select_metric(:events, "imported_custom_events") do
wrap_alias([i], %{events: sum(i.events)})
end

def select_imported_metrics(
%Ecto.Query{from: %Ecto.Query.FromExpr{source: {"imported_entry_pages", _}}} = q,
[:visits | rest]
) do
q
|> select_merge([i], %{visits: sum(i.entrances)})
|> select_imported_metrics(rest)
# :events on any other imported table — the sum of `pageviews` is reported as events.
defp select_metric(:events, _table) do
wrap_alias([i], %{events: sum(i.pageviews)})
end

def select_imported_metrics(q, [:visits | rest]) do
q
|> select_merge([i], %{visits: sum(i.visits)})
|> select_imported_metrics(rest)
# :visits on "imported_exit_pages" — the sum of `exits` is reported as visits.
defp select_metric(:visits, "imported_exit_pages") do
wrap_alias([i], %{visits: sum(i.exits)})
end

def select_imported_metrics(
%Ecto.Query{from: %Ecto.Query.FromExpr{source: {"imported_custom_events", _}}} = q,
[:pageviews | rest]
) do
q
|> select_merge([i], %{pageviews: 0})
|> select_imported_metrics(rest)
# :visits on "imported_entry_pages" — the sum of `entrances` is reported as visits.
defp select_metric(:visits, "imported_entry_pages") do
wrap_alias([i], %{visits: sum(i.entrances)})
end

def select_imported_metrics(q, [:pageviews | rest]) do
q
|> where([i], i.pageviews > 0)
|> select_merge([i], %{pageviews: sum(i.pageviews)})
|> select_imported_metrics(rest)
# :visits on any other imported table — sum of the `visits` column.
defp select_metric(:visits, _table) do
wrap_alias([i], %{visits: sum(i.visits)})
end

def select_imported_metrics(
%Ecto.Query{from: %Ecto.Query.FromExpr{source: {"imported_pages", _}}} = q,
[:bounce_rate | rest]
) do
q
|> select_merge([i], %{
bounces: 0,
__internal_visits: 0
})
|> select_imported_metrics(rest)
# :pageviews on "imported_custom_events" — always the constant 0.
defp select_metric(:pageviews, "imported_custom_events") do
wrap_alias([i], %{pageviews: 0})
end

def select_imported_metrics(
%Ecto.Query{from: %Ecto.Query.FromExpr{source: {"imported_entry_pages", _}}} = q,
[:bounce_rate | rest]
) do
q
|> select_merge([i], %{
bounces: sum(i.bounces),
__internal_visits: sum(i.entrances)
})
|> select_imported_metrics(rest)
# :pageviews on any other imported table — sum of the `pageviews` column.
defp select_metric(:pageviews, _table) do
wrap_alias([i], %{pageviews: sum(i.pageviews)})
end

def select_imported_metrics(
%Ecto.Query{from: %Ecto.Query.FromExpr{source: {"imported_exit_pages", _}}} = q,
[:bounce_rate | rest]
) do
q
|> select_merge([i], %{
bounces: sum(i.bounces),
__internal_visits: sum(i.exits)
})
|> select_imported_metrics(rest)
# :bounce_rate on "imported_pages" — both helper columns (`bounces` and
# `__internal_visits`) are the constant 0.
defp select_metric(:bounce_rate, "imported_pages") do
wrap_alias([i], %{bounces: 0, __internal_visits: 0})
end

def select_imported_metrics(q, [:bounce_rate | rest]) do
q
|> select_merge([i], %{
bounces: sum(i.bounces),
__internal_visits: sum(i.visits)
})
|> select_imported_metrics(rest)
# :bounce_rate on "imported_exit_pages" — selects summed `bounces`, with the sum of
# `exits` as `__internal_visits`.
defp select_metric(:bounce_rate, "imported_exit_pages") do
wrap_alias([i], %{bounces: sum(i.bounces), __internal_visits: sum(i.exits)})
end

def select_imported_metrics(
%Ecto.Query{from: %Ecto.Query.FromExpr{source: {"imported_pages", _}}} = q,
[:visit_duration | rest]
) do
q
|> select_merge([i], %{
visit_duration: 0,
__internal_visits: 0
})
|> select_imported_metrics(rest)
# :bounce_rate on "imported_entry_pages" — selects summed `bounces`, with the sum of
# `entrances` as `__internal_visits`.
defp select_metric(:bounce_rate, "imported_entry_pages") do
wrap_alias([i], %{bounces: sum(i.bounces), __internal_visits: sum(i.entrances)})
end

def select_imported_metrics(
%Ecto.Query{from: %Ecto.Query.FromExpr{source: {"imported_entry_pages", _}}} = q,
[:visit_duration | rest]
) do
q
|> select_merge([i], %{
visit_duration: sum(i.visit_duration),
__internal_visits: sum(i.entrances)
})
|> select_imported_metrics(rest)
# :bounce_rate on any other imported table — selects summed `bounces`, with the sum of
# `visits` as `__internal_visits`.
defp select_metric(:bounce_rate, _table) do
wrap_alias([i], %{bounces: sum(i.bounces), __internal_visits: sum(i.visits)})
end

def select_imported_metrics(
%Ecto.Query{from: %Ecto.Query.FromExpr{source: {"imported_exit_pages", _}}} = q,
[:visit_duration | rest]
) do
q
|> select_merge([i], %{
visit_duration: sum(i.visit_duration),
__internal_visits: sum(i.exits)
})
|> select_imported_metrics(rest)
# :visit_duration on "imported_pages" — always the constant 0.
# NOTE(review): unlike the :bounce_rate clause for "imported_pages", this clause does not
# select `__internal_visits` — confirm that nothing downstream expects that column when
# :visit_duration is the only metric requested.
defp select_metric(:visit_duration, "imported_pages") do
wrap_alias([i], %{visit_duration: 0})
end

def select_imported_metrics(q, [:visit_duration | rest]) do
q
|> select_merge([i], %{
visit_duration: sum(i.visit_duration),
__internal_visits: sum(i.visits)
})
|> select_imported_metrics(rest)
# :visit_duration on "imported_exit_pages" — selects summed `visit_duration`, with the
# sum of `exits` as `__internal_visits`.
defp select_metric(:visit_duration, "imported_exit_pages") do
wrap_alias([i], %{visit_duration: sum(i.visit_duration), __internal_visits: sum(i.exits)})
end

def select_imported_metrics(
%Ecto.Query{from: %Ecto.Query.FromExpr{source: {"imported_entry_pages", _}}} = q,
[:views_per_visit | rest]
) do
q
|> where([i], i.pageviews > 0)
|> select_merge([i], %{
pageviews: sum(i.pageviews),
__internal_visits: sum(i.entrances)
})
|> select_imported_metrics(rest)
# :visit_duration on "imported_entry_pages" — selects summed `visit_duration`, with the
# sum of `entrances` as `__internal_visits`.
defp select_metric(:visit_duration, "imported_entry_pages") do
wrap_alias([i], %{visit_duration: sum(i.visit_duration), __internal_visits: sum(i.entrances)})
end

def select_imported_metrics(
%Ecto.Query{from: %Ecto.Query.FromExpr{source: {"imported_exit_pages", _}}} = q,
[:views_per_visit | rest]
) do
q
|> where([i], i.pageviews > 0)
|> select_merge([i], %{
pageviews: sum(i.pageviews),
__internal_visits: sum(i.exits)
})
|> select_imported_metrics(rest)
# :visit_duration on any other imported table — selects summed `visit_duration`, with
# the sum of `visits` as `__internal_visits`.
defp select_metric(:visit_duration, _table) do
wrap_alias([i], %{visit_duration: sum(i.visit_duration), __internal_visits: sum(i.visits)})
end

def select_imported_metrics(q, [:views_per_visit | rest]) do
q
|> where([i], i.pageviews > 0)
|> select_merge([i], %{
pageviews: sum(i.pageviews),
__internal_visits: sum(i.visits)
})
|> select_imported_metrics(rest)
# :views_per_visit on "imported_exit_pages" — selects summed `pageviews`, with the sum
# of `exits` as `__internal_visits`.
defp select_metric(:views_per_visit, "imported_exit_pages") do
wrap_alias([i], %{pageviews: sum(i.pageviews), __internal_visits: sum(i.exits)})
end

def select_imported_metrics(q, [_ | rest]) do
q
|> select_imported_metrics(rest)
# :views_per_visit on "imported_entry_pages" — selects summed `pageviews`, with the sum
# of `entrances` as `__internal_visits`.
defp select_metric(:views_per_visit, "imported_entry_pages") do
wrap_alias([i], %{pageviews: sum(i.pageviews), __internal_visits: sum(i.entrances)})
end

# :views_per_visit on any other imported table — selects summed `pageviews`, with the
# sum of `visits` as `__internal_visits`.
defp select_metric(:views_per_visit, _table) do
wrap_alias([i], %{pageviews: sum(i.pageviews), __internal_visits: sum(i.visits)})
end

# Catch-all: any metric not matched by an earlier clause contributes no select fields.
defp select_metric(_metric, _table), do: %{}

def group_imported_by(q, query) do
Enum.reduce(query.dimensions, q, fn dimension, q ->
q
Expand Down Expand Up @@ -456,17 +364,5 @@ defmodule Plausible.Stats.Imported.SQL.Expression do
|> select_joined_metrics(rest)
end

# Combines two single-dimension subqueries into one query.
#
# `q1` and `q2` are FULL JOINed on their `dim0` column. The resulting `dim0` is taken
# from `q1` unless that value equals 0, in which case the value from `q2` is used.
# The requested `metrics` are then added via `select_joined_metrics/2`.
def naive_dimension_join(q1, q2, metrics) do
from(a in subquery(q1),
full_join: b in subquery(q2),
on: a.dim0 == b.dim0,
select: %{}
)
|> select_merge_as([a, b], %{
dim0: fragment("if(? != 0, ?, ?)", a.dim0, a.dim0, b.dim0)
})
|> select_joined_metrics(metrics)
end

# Shorthand that delegates to `Plausible.Stats.Filters.without_prefix/1` for `dimension`.
defp dim(dimension), do: Plausible.Stats.Filters.without_prefix(dimension)
end
6 changes: 2 additions & 4 deletions lib/plausible/stats/query_optimizer.ex
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,7 @@ defmodule Plausible.Stats.QueryOptimizer do
{
Query.set(query,
metrics: event_metrics,
include_imported: query.include_imported,
pagination: nil
include_imported: query.include_imported
),
split_sessions_query(query, sessions_metrics)
}
Expand Down Expand Up @@ -171,8 +170,7 @@ defmodule Plausible.Stats.QueryOptimizer do
filters: filters,
metrics: session_metrics,
dimensions: dimensions,
include_imported: query.include_imported,
pagination: nil
include_imported: query.include_imported
)
end

Expand Down
Loading

0 comments on commit bec14ee

Please sign in to comment.