diff --git a/.github/workflows/linux-32bit-build-and-test.yaml b/.github/workflows/linux-32bit-build-and-test.yaml index 51fcc458cf5..0cec2577a08 100644 --- a/.github/workflows/linux-32bit-build-and-test.yaml +++ b/.github/workflows/linux-32bit-build-and-test.yaml @@ -48,7 +48,13 @@ jobs: DEBIAN_FRONTEND: noninteractive # vectorized_aggregation has different output on i386 because int8 is by # reference and currently it cannot be used for vectorized hash grouping. - IGNORES: "append-* transparent_decompression-* transparent_decompress_chunk-* pg_dump telemetry bgw_db_scheduler* hypercore_vacuum vectorized_aggregation" + # vector_agg_text and vector_agg_groupagg use the UMASH hashing library + # that we can't compile on i386. + IGNORES: >- + append-* transparent_decompression-* + transparent_decompress_chunk-* pg_dump telemetry bgw_db_scheduler* + hypercore_vacuum vectorized_aggregation vector_agg_text + vector_agg_groupagg SKIPS: chunk_adaptive histogram_test-* EXTENSIONS: "postgres_fdw test_decoding pageinspect pgstattuple" strategy: diff --git a/.github/workflows/windows-build-and-test.yaml b/.github/workflows/windows-build-and-test.yaml index fe5b40ea05b..fbab17457c1 100644 --- a/.github/workflows/windows-build-and-test.yaml +++ b/.github/workflows/windows-build-and-test.yaml @@ -59,7 +59,7 @@ jobs: build_type: ${{ fromJson(needs.config.outputs.build_type) }} ignores: ["chunk_adaptive metadata telemetry"] tsl_ignores: ["compression_algos"] - tsl_skips: ["bgw_db_scheduler bgw_db_scheduler_fixed"] + tsl_skips: ["vector_agg_text vector_agg_groupagg bgw_db_scheduler bgw_db_scheduler_fixed"] pg_config: ["-cfsync=off -cstatement_timeout=60s"] include: - pg: 14 diff --git a/.unreleased/vectorized-text-grouping b/.unreleased/vectorized-text-grouping new file mode 100644 index 00000000000..5dfbe17b015 --- /dev/null +++ b/.unreleased/vectorized-text-grouping @@ -0,0 +1 @@ +Implements: #7586 Vectorized aggregation with grouping by a single text column. 
diff --git a/tsl/src/nodes/vector_agg/grouping_policy.h b/tsl/src/nodes/vector_agg/grouping_policy.h index 9c7a7a30095..9154c8dd500 100644 --- a/tsl/src/nodes/vector_agg/grouping_policy.h +++ b/tsl/src/nodes/vector_agg/grouping_policy.h @@ -65,7 +65,8 @@ typedef enum VAGT_Batch, VAGT_HashSingleFixed2, VAGT_HashSingleFixed4, - VAGT_HashSingleFixed8 + VAGT_HashSingleFixed8, + VAGT_HashSingleText } VectorAggGroupingType; extern GroupingPolicy *create_grouping_policy_batch(int num_agg_defs, VectorAggDef *agg_defs, diff --git a/tsl/src/nodes/vector_agg/grouping_policy_hash.c b/tsl/src/nodes/vector_agg/grouping_policy_hash.c index 3170f48bb05..92ecfce11bc 100644 --- a/tsl/src/nodes/vector_agg/grouping_policy_hash.c +++ b/tsl/src/nodes/vector_agg/grouping_policy_hash.c @@ -36,6 +36,9 @@ extern HashingStrategy single_fixed_2_strategy; extern HashingStrategy single_fixed_4_strategy; extern HashingStrategy single_fixed_8_strategy; +#ifdef TS_USE_UMASH +extern HashingStrategy single_text_strategy; +#endif static const GroupingPolicy grouping_policy_hash_functions; @@ -70,6 +73,11 @@ create_grouping_policy_hash(int num_agg_defs, VectorAggDef *agg_defs, int num_gr switch (grouping_type) { +#ifdef TS_USE_UMASH + case VAGT_HashSingleText: + policy->hashing = single_text_strategy; + break; +#endif case VAGT_HashSingleFixed8: policy->hashing = single_fixed_8_strategy; break; @@ -84,6 +92,8 @@ create_grouping_policy_hash(int num_agg_defs, VectorAggDef *agg_defs, int num_gr break; } + policy->hashing.key_body_mctx = policy->agg_extra_mctx; + policy->hashing.init(&policy->hashing, policy); return &policy->funcs; diff --git a/tsl/src/nodes/vector_agg/hashing/CMakeLists.txt b/tsl/src/nodes/vector_agg/hashing/CMakeLists.txt index c6ff65f65ca..401e5f22025 100644 --- a/tsl/src/nodes/vector_agg/hashing/CMakeLists.txt +++ b/tsl/src/nodes/vector_agg/hashing/CMakeLists.txt @@ -3,4 +3,9 @@ set(SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/hash_strategy_single_fixed_4.c ${CMAKE_CURRENT_SOURCE_DIR}/hash_strategy_single_fixed_8.c ${CMAKE_CURRENT_SOURCE_DIR}/hash_strategy_common.c) + +if(USE_UMASH) + list(APPEND SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/hash_strategy_single_text.c) +endif() + target_sources(${TSL_LIBRARY_NAME} PRIVATE ${SOURCES}) diff --git a/tsl/src/nodes/vector_agg/hashing/hash_strategy_single_text.c b/tsl/src/nodes/vector_agg/hashing/hash_strategy_single_text.c new file mode 100644 index 00000000000..5b54970b595 --- /dev/null +++ b/tsl/src/nodes/vector_agg/hashing/hash_strategy_single_text.c @@ -0,0 +1,135 @@ +/* + * This file and its contents are licensed under the Timescale License. + * Please see the included NOTICE for copyright information and + * LICENSE-TIMESCALE for a copy of the license. + */ + +/* + * Implementation of column hashing for a single text column. 
+ */
+
+#include <postgres.h>
+
+#include <common/hashfn.h>
+
+#include "compression/arrow_c_data_interface.h"
+#include "nodes/decompress_chunk/compressed_batch.h"
+#include "nodes/vector_agg/exec.h"
+#include "nodes/vector_agg/grouping_policy_hash.h"
+#include "template_helper.h"
+
+#include "batch_hashing_params.h"
+
+#include "umash_fingerprint_key.h"
+
+#define EXPLAIN_NAME "single text"
+#define KEY_VARIANT single_text
+#define OUTPUT_KEY_TYPE BytesView
+
+static void
+single_text_key_hashing_init(HashingStrategy *hashing)
+{
+	hashing->umash_params = umash_key_hashing_init();
+}
+
+typedef struct BytesView
+{
+	const uint8 *data;
+	uint32 len;
+} BytesView;
+
+static BytesView
+get_bytes_view(CompressedColumnValues *column_values, int arrow_row)
+{
+	const uint32 start = ((uint32 *) column_values->buffers[1])[arrow_row];
+	const int32 value_bytes = ((uint32 *) column_values->buffers[1])[arrow_row + 1] - start;
+	Assert(value_bytes >= 0);
+
+	return (BytesView){ .len = value_bytes, .data = &((uint8 *) column_values->buffers[2])[start] };
+}
+
+static pg_attribute_always_inline void
+single_text_key_hashing_get_key(BatchHashingParams params, int row, void *restrict output_key_ptr,
+								void *restrict hash_table_key_ptr, bool *restrict valid)
+{
+	Assert(params.policy->num_grouping_columns == 1);
+
+	BytesView *restrict output_key = (BytesView *) output_key_ptr;
+	HASH_TABLE_KEY_TYPE *restrict hash_table_key = (HASH_TABLE_KEY_TYPE *) hash_table_key_ptr;
+
+	if (unlikely(params.single_grouping_column.decompression_type == DT_Scalar))
+	{
+		*valid = !*params.single_grouping_column.output_isnull;
+		if (*valid)
+		{
+			output_key->len = VARSIZE_ANY_EXHDR(*params.single_grouping_column.output_value);
+			output_key->data =
+				(const uint8 *) VARDATA_ANY(*params.single_grouping_column.output_value);
+		}
+		else
+		{
+			output_key->len = 0;
+			output_key->data = NULL;
+		}
+	}
+	else if (params.single_grouping_column.decompression_type == DT_ArrowText)
+	{
+		*output_key = get_bytes_view(&params.single_grouping_column, row);
+		*valid = arrow_row_is_valid(params.single_grouping_column.buffers[0], row);
+	}
+	else if (params.single_grouping_column.decompression_type == DT_ArrowTextDict)
+	{
+		const int16 index = ((int16 *) params.single_grouping_column.buffers[3])[row];
+		*output_key = get_bytes_view(&params.single_grouping_column, index);
+		*valid = arrow_row_is_valid(params.single_grouping_column.buffers[0], row);
+	}
+	else
+	{
+		pg_unreachable();
+	}
+
+	DEBUG_PRINT("%p consider key row %d key index %d is %d bytes: ",
+				params.policy,
+				row,
+				params.policy->last_used_key_index + 1,
+				output_key->len);
+	for (size_t i = 0; i < output_key->len; i++)
+	{
+		DEBUG_PRINT("%.2x.", output_key->data[i]);
+	}
+	DEBUG_PRINT("\n");
+
+	const struct umash_fp fp = umash_fprint(params.policy->hashing.umash_params,
+											/* seed = */ ~0ULL,
+											output_key->data,
+											output_key->len);
+	*hash_table_key = umash_fingerprint_get_key(fp);
+}
+
+static pg_attribute_always_inline void
+single_text_key_hashing_store_new(GroupingPolicyHash *restrict policy, uint32 new_key_index,
+								  BytesView output_key)
+{
+	const int total_bytes = output_key.len + VARHDRSZ;
+	text *restrict stored = (text *) MemoryContextAlloc(policy->hashing.key_body_mctx, total_bytes);
+	SET_VARSIZE(stored, total_bytes);
+	memcpy(VARDATA(stored), output_key.data, output_key.len);
+	policy->hashing.output_keys[new_key_index] = PointerGetDatum(stored);
+}
+
+/*
+ * We use the standard single-key key output functions.
+ */ +static void +single_text_emit_key(GroupingPolicyHash *policy, uint32 current_key, + TupleTableSlot *aggregated_slot) +{ + return hash_strategy_output_key_single_emit(policy, current_key, aggregated_slot); +} + +static void +single_text_key_hashing_prepare_for_batch(GroupingPolicyHash *policy, TupleTableSlot *vector_slot) +{ +} + +#include "hash_strategy_impl.c" diff --git a/tsl/src/nodes/vector_agg/hashing/hashing_strategy.h b/tsl/src/nodes/vector_agg/hashing/hashing_strategy.h index b900f002bdd..bb1cfcb61a5 100644 --- a/tsl/src/nodes/vector_agg/hashing/hashing_strategy.h +++ b/tsl/src/nodes/vector_agg/hashing/hashing_strategy.h @@ -42,10 +42,12 @@ typedef struct HashingStrategy * This is stored separately from hash table keys, because they might not * have the full column values, and also storing them contiguously here * leads to better memory access patterns when emitting the results. - * The details of the key storage are managed by the hashing strategy. + * The details of the key storage are managed by the hashing strategy. The + * by-reference keys can use a separate memory context for dense storage. */ Datum *restrict output_keys; uint64 num_allocated_output_keys; + MemoryContext key_body_mctx; /* * In single-column grouping, we store the null key outside of the hash @@ -54,6 +56,13 @@ typedef struct HashingStrategy * to reduce the hash table size. */ uint32 null_key_index; + +#ifdef TS_USE_UMASH + /* + * UMASH fingerprinting parameters. + */ + struct umash_params *umash_params; +#endif } HashingStrategy; void hash_strategy_output_key_alloc(GroupingPolicyHash *policy, uint16 nrows); diff --git a/tsl/src/nodes/vector_agg/hashing/umash_fingerprint_key.h b/tsl/src/nodes/vector_agg/hashing/umash_fingerprint_key.h new file mode 100644 index 00000000000..ed6a9b8ce03 --- /dev/null +++ b/tsl/src/nodes/vector_agg/hashing/umash_fingerprint_key.h @@ -0,0 +1,45 @@ +/* + * This file and its contents are licensed under the Timescale License. + * Please see the included NOTICE for copyright information and + * LICENSE-TIMESCALE for a copy of the license. + */ +#pragma once + +/* + * Helpers to use the umash fingerprint as a hash table key in our hashing + * strategies for vectorized grouping. + */ + +#include "import/umash.h" + +/* + * The struct is packed so that the hash table entry fits into 16 + * bytes with the uint32 key index that goes before. + */ +struct umash_fingerprint_key +{ + uint32 hash; + uint64 rest; +} pg_attribute_packed(); + +#define HASH_TABLE_KEY_TYPE struct umash_fingerprint_key +#define KEY_HASH(X) (X.hash) +#define KEY_EQUAL(a, b) (a.hash == b.hash && a.rest == b.rest) + +static inline struct umash_fingerprint_key +umash_fingerprint_get_key(struct umash_fp fp) +{ + const struct umash_fingerprint_key key = { + .hash = fp.hash[0] & (~(uint32) 0), + .rest = fp.hash[1], + }; + return key; +} + +static inline struct umash_params * +umash_key_hashing_init() +{ + struct umash_params *params = palloc0(sizeof(struct umash_params)); + umash_params_derive(params, 0xabcdef1234567890ull, NULL); + return params; +} diff --git a/tsl/src/nodes/vector_agg/plan.c b/tsl/src/nodes/vector_agg/plan.c index e705d23f37a..bb26d319016 100644 --- a/tsl/src/nodes/vector_agg/plan.c +++ b/tsl/src/nodes/vector_agg/plan.c @@ -505,6 +505,8 @@ get_vectorized_grouping_type(Agg *agg, CustomScan *custom, List *resolved_target /* * We support hashed vectorized grouping by one fixed-size by-value * compressed column. 
+ * We can use our hash table for GroupAggregate as well, because it preserves + * the input order of the keys. */ if (num_grouping_columns == 1) { @@ -526,6 +528,15 @@ get_vectorized_grouping_type(Agg *agg, CustomScan *custom, List *resolved_target break; } } +#ifdef TS_USE_UMASH + else + { + Ensure(single_grouping_var->vartype == TEXTOID, + "invalid vector type %d for grouping", + single_grouping_var->vartype); + return VAGT_HashSingleText; + } +#endif } return VAGT_Invalid; diff --git a/tsl/test/expected/vector_agg_groupagg.out b/tsl/test/expected/vector_agg_groupagg.out new file mode 100644 index 00000000000..93784426438 --- /dev/null +++ b/tsl/test/expected/vector_agg_groupagg.out @@ -0,0 +1,79 @@ +-- This file and its contents are licensed under the Timescale License. +-- Please see the included NOTICE for copyright information and +-- LICENSE-TIMESCALE for a copy of the license. +-- Check that the vectorized aggregation works properly in the GroupAggregate +-- mode. +create table groupagg(t int, s text, value int); +select create_hypertable('groupagg', 't', chunk_time_interval => 10000); +NOTICE: adding not-null constraint to column "t" + create_hypertable +----------------------- + (1,public,groupagg,t) +(1 row) + +insert into groupagg +select + xfast * 100 + xslow, + case when xfast = 13 then null else xfast end, + xfast * 7 + xslow * 3 +from generate_series(10, 99) xfast, + generate_series(1, 10000) xslow +; +alter table groupagg set (timescaledb.compress, timescaledb.compress_segmentby = '', + timescaledb.compress_orderby = 's'); +select count(compress_chunk(x)) from show_chunks('groupagg') x; + count +------- + 2 +(1 row) + +set enable_hashagg to off; +set timescaledb.debug_require_vector_agg to 'allow'; +select s, sum(value) from groupagg group by s order by s limit 10; + s | sum +----+----------- + 10 | 150715000 + 11 | 150785000 + 12 | 150855000 + 14 | 150995000 + 15 | 151065000 + 16 | 151135000 + 17 | 151205000 + 18 | 151275000 + 19 | 151345000 + 20 | 151415000 +(10 rows) + +reset timescaledb.debug_require_vector_agg; +select count(decompress_chunk(x)) from show_chunks('groupagg') x; + count +------- + 2 +(1 row) + +alter table groupagg set (timescaledb.compress, timescaledb.compress_segmentby = '', + timescaledb.compress_orderby = 's nulls first'); +select count(compress_chunk(x)) from show_chunks('groupagg') x; + count +------- + 2 +(1 row) + +set timescaledb.debug_require_vector_agg to 'require'; +select s , sum(value) from groupagg group by s order by s nulls first limit 10; + s | sum +----+----------- + | 150925000 + 10 | 150715000 + 11 | 150785000 + 12 | 150855000 + 14 | 150995000 + 15 | 151065000 + 16 | 151135000 + 17 | 151205000 + 18 | 151275000 + 19 | 151345000 +(10 rows) + +reset enable_hashagg; +reset timescaledb.debug_require_vector_agg; diff --git a/tsl/test/expected/vector_agg_text.out b/tsl/test/expected/vector_agg_text.out new file mode 100644 index 00000000000..3e38713f3fe --- /dev/null +++ b/tsl/test/expected/vector_agg_text.out @@ -0,0 +1,283 @@ +-- This file and its contents are licensed under the Timescale License. +-- Please see the included NOTICE for copyright information and +-- LICENSE-TIMESCALE for a copy of the license. 
+\c :TEST_DBNAME :ROLE_SUPERUSER +-- helper function: float -> pseudorandom float [-0.5..0.5] +CREATE OR REPLACE FUNCTION mix(x anyelement) RETURNS float8 AS $$ + SELECT hashfloat8(x::float8) / pow(2, 32) +$$ LANGUAGE SQL; +\set CHUNKS 2::int +\set CHUNK_ROWS 100000::int +\set GROUPING_CARDINALITY 10::int +create table agggroup(t int, s int, + cint2 int2, cint4 int4, cint8 int8); +select create_hypertable('agggroup', 's', chunk_time_interval => :GROUPING_CARDINALITY / :CHUNKS); +NOTICE: adding not-null constraint to column "s" + create_hypertable +----------------------- + (1,public,agggroup,t) +(1 row) + +create view source as +select s * 10000 + t as t, + s, + case when t % 1051 = 0 then null + else (mix(s + t * 1019) * 32767)::int2 end as cint2, + (mix(s + t * 1021) * 32767)::int4 as cint4, + (mix(s + t * 1031) * 32767)::int8 as cint8 +from + generate_series(1::int, :CHUNK_ROWS * :CHUNKS / :GROUPING_CARDINALITY) t, + generate_series(0::int, :GROUPING_CARDINALITY - 1::int) s(s) +; +insert into agggroup select * from source where s = 1; +alter table agggroup set (timescaledb.compress, timescaledb.compress_orderby = 't', + timescaledb.compress_segmentby = 's'); +select count(compress_chunk(x)) from show_chunks('agggroup') x; + count +------- + 1 +(1 row) + +alter table agggroup add column ss int default 11; +alter table agggroup add column x text default '11'; +insert into agggroup +select *, ss::text as x from ( + select *, + case + -- null in entire batch + when s = 2 then null + -- null for some rows + when s = 3 and t % 1051 = 0 then null + -- for some rows same as default + when s = 4 and t % 1057 = 0 then 11 + -- not null for entire batch + else s + end as ss + from source where s != 1 +) t +; +select count(compress_chunk(x)) from show_chunks('agggroup') x; + count +------- + 2 +(1 row) + +vacuum freeze analyze agggroup; +-- Long strings +create table long(t int, a text, b text, c text, d text); +select create_hypertable('long', 't'); +NOTICE: adding not-null constraint to column "t" + create_hypertable +------------------- + (3,public,long,t) +(1 row) + +insert into long select n, x, x, x, x from ( + select n, repeat('1', 100 * 4 + n) x + from generate_series(1, 4) n) t +; +insert into long values (-1, 'a', 'b', 'c', 'd'); +alter table long set (timescaledb.compress); +WARNING: there was some uncertainty picking the default segment by for the hypertable: You do not have any indexes on columns that can be used for segment_by and thus we are not using segment_by for compression. Please make sure you are not missing any indexes +NOTICE: default segment by for hypertable "long" is set to "" +NOTICE: default order by for hypertable "long" is set to "t DESC" +select count(compress_chunk(x)) from show_chunks('long') x; + count +------- + 2 +(1 row) + +set timescaledb.debug_require_vector_agg = 'require'; +---- Uncomment to generate reference. Note that there are minor discrepancies +---- on float4 due to different numeric stability in our and PG implementations. 
+--set timescaledb.enable_chunkwise_aggregation to off; set timescaledb.enable_vectorized_aggregation to off; set timescaledb.debug_require_vector_agg = 'forbid'; +select + format('%sselect %s%s(%s) from agggroup%s%s%s;', + explain, + grouping || ', ', + function, variable, + ' where ' || condition, + ' group by ' || grouping, + format(' order by %s(%s), ', function, variable) || grouping || ' limit 10', + function, variable) +from + unnest(array[ + 'explain (costs off) ', + null]) explain, + unnest(array[ + 'cint2', + '*']) variable, + unnest(array[ + 'min', + 'count']) function, + unnest(array[ + null, + 'cint2 > 0', + 'cint2 is null', + 'cint2 is null and x is null']) with ordinality as condition(condition, n), + unnest(array['x']) with ordinality as grouping(grouping, n) +where + true + and (explain is null /* or condition is null and grouping = 's' */) + and (variable != '*' or function = 'count') +order by explain, condition.n, variable, function, grouping.n +\gexec +select x, count(*) from agggroup group by x order by count(*), x limit 10; + x | count +----+------- + | 19 + 3 | 19981 + 4 | 19981 + 0 | 20000 + 5 | 20000 + 6 | 20000 + 7 | 20000 + 8 | 20000 + 9 | 20000 + 11 | 40019 +(10 rows) + +select x, count(cint2) from agggroup group by x order by count(cint2), x limit 10; + x | count +----+------- + | 19 + 3 | 19962 + 4 | 19962 + 0 | 19981 + 5 | 19981 + 6 | 19981 + 7 | 19981 + 8 | 19981 + 9 | 19981 + 11 | 39981 +(10 rows) + +select x, min(cint2) from agggroup group by x order by min(cint2), x limit 10; + x | min +----+-------- + 0 | -16383 + 4 | -16383 + 5 | -16383 + 6 | -16383 + 11 | -16382 + 7 | -16382 + 8 | -16382 + 3 | -16381 + 9 | -16375 + | -16295 +(10 rows) + +select x, count(*) from agggroup where cint2 > 0 group by x order by count(*), x limit 10; + x | count +----+------- + | 9 + 3 | 9884 + 6 | 9890 + 4 | 9897 + 8 | 9898 + 7 | 9973 + 0 | 10012 + 9 | 10018 + 5 | 10110 + 11 | 19973 +(10 rows) + +select x, count(cint2) from agggroup where cint2 > 0 group by x order by count(cint2), x limit 10; + x | count +----+------- + | 9 + 3 | 9884 + 6 | 9890 + 4 | 9897 + 8 | 9898 + 7 | 9973 + 0 | 10012 + 9 | 10018 + 5 | 10110 + 11 | 19973 +(10 rows) + +select x, min(cint2) from agggroup where cint2 > 0 group by x order by min(cint2), x limit 10; + x | min +----+------ + 11 | 1 + 3 | 1 + 5 | 1 + 7 | 1 + 8 | 1 + 9 | 2 + 6 | 3 + 0 | 4 + 4 | 4 + | 4895 +(10 rows) + +select x, count(*) from agggroup where cint2 is null group by x order by count(*), x limit 10; + x | count +----+------- + 0 | 19 + 3 | 19 + 4 | 19 + 5 | 19 + 6 | 19 + 7 | 19 + 8 | 19 + 9 | 19 + 11 | 38 +(9 rows) + +select x, count(cint2) from agggroup where cint2 is null group by x order by count(cint2), x limit 10; + x | count +----+------- + 0 | 0 + 11 | 0 + 3 | 0 + 4 | 0 + 5 | 0 + 6 | 0 + 7 | 0 + 8 | 0 + 9 | 0 +(9 rows) + +select x, min(cint2) from agggroup where cint2 is null group by x order by min(cint2), x limit 10; + x | min +----+----- + 0 | + 11 | + 3 | + 4 | + 5 | + 6 | + 7 | + 8 | + 9 | +(9 rows) + +select x, count(*) from agggroup where cint2 is null and x is null group by x order by count(*), x limit 10; + x | count +---+------- +(0 rows) + +select x, count(cint2) from agggroup where cint2 is null and x is null group by x order by count(cint2), x limit 10; + x | count +---+------- +(0 rows) + +select x, min(cint2) from agggroup where cint2 is null and x is null group by x order by min(cint2), x limit 10; + x | min +---+----- +(0 rows) + +-- Test grouping by long strings +select count(*) from long group by a order 
by 1 limit 10; + count +------- + 1 + 1 + 1 + 1 + 1 +(5 rows) + +reset timescaledb.debug_require_vector_agg; diff --git a/tsl/test/sql/CMakeLists.txt b/tsl/test/sql/CMakeLists.txt index 243aab81a00..a12491cb756 100644 --- a/tsl/test/sql/CMakeLists.txt +++ b/tsl/test/sql/CMakeLists.txt @@ -45,6 +45,7 @@ set(TEST_FILES skip_scan.sql transparent_decompression_join_index.sql vector_agg_functions.sql + vector_agg_groupagg.sql vector_agg_param.sql vectorized_aggregation.sql) @@ -119,6 +120,7 @@ if(CMAKE_BUILD_TYPE MATCHES Debug) feature_flags.sql vector_agg_default.sql vector_agg_filter.sql + vector_agg_text.sql vector_agg_memory.sql vector_agg_segmentby.sql) diff --git a/tsl/test/sql/vector_agg_groupagg.sql b/tsl/test/sql/vector_agg_groupagg.sql new file mode 100644 index 00000000000..d27a6b90095 --- /dev/null +++ b/tsl/test/sql/vector_agg_groupagg.sql @@ -0,0 +1,40 @@ +-- This file and its contents are licensed under the Timescale License. +-- Please see the included NOTICE for copyright information and +-- LICENSE-TIMESCALE for a copy of the license. + +-- Check that the vectorized aggregation works properly in the GroupAggregate +-- mode. + +create table groupagg(t int, s text, value int); +select create_hypertable('groupagg', 't', chunk_time_interval => 10000); + +insert into groupagg +select + xfast * 100 + xslow, + case when xfast = 13 then null else xfast end, + xfast * 7 + xslow * 3 +from generate_series(10, 99) xfast, + generate_series(1, 10000) xslow +; + +alter table groupagg set (timescaledb.compress, timescaledb.compress_segmentby = '', + timescaledb.compress_orderby = 's'); +select count(compress_chunk(x)) from show_chunks('groupagg') x; + +set enable_hashagg to off; +set timescaledb.debug_require_vector_agg to 'allow'; +select s, sum(value) from groupagg group by s order by s limit 10; + + +reset timescaledb.debug_require_vector_agg; +select count(decompress_chunk(x)) from show_chunks('groupagg') x; +alter table groupagg set (timescaledb.compress, timescaledb.compress_segmentby = '', + timescaledb.compress_orderby = 's nulls first'); +select count(compress_chunk(x)) from show_chunks('groupagg') x; + +set timescaledb.debug_require_vector_agg to 'require'; +select s , sum(value) from groupagg group by s order by s nulls first limit 10; + + +reset enable_hashagg; +reset timescaledb.debug_require_vector_agg; diff --git a/tsl/test/sql/vector_agg_text.sql b/tsl/test/sql/vector_agg_text.sql new file mode 100644 index 00000000000..51542d6306a --- /dev/null +++ b/tsl/test/sql/vector_agg_text.sql @@ -0,0 +1,113 @@ +-- This file and its contents are licensed under the Timescale License. +-- Please see the included NOTICE for copyright information and +-- LICENSE-TIMESCALE for a copy of the license. 
+ +\c :TEST_DBNAME :ROLE_SUPERUSER +-- helper function: float -> pseudorandom float [-0.5..0.5] +CREATE OR REPLACE FUNCTION mix(x anyelement) RETURNS float8 AS $$ + SELECT hashfloat8(x::float8) / pow(2, 32) +$$ LANGUAGE SQL; + +\set CHUNKS 2::int +\set CHUNK_ROWS 100000::int +\set GROUPING_CARDINALITY 10::int + +create table agggroup(t int, s int, + cint2 int2, cint4 int4, cint8 int8); +select create_hypertable('agggroup', 's', chunk_time_interval => :GROUPING_CARDINALITY / :CHUNKS); + +create view source as +select s * 10000 + t as t, + s, + case when t % 1051 = 0 then null + else (mix(s + t * 1019) * 32767)::int2 end as cint2, + (mix(s + t * 1021) * 32767)::int4 as cint4, + (mix(s + t * 1031) * 32767)::int8 as cint8 +from + generate_series(1::int, :CHUNK_ROWS * :CHUNKS / :GROUPING_CARDINALITY) t, + generate_series(0::int, :GROUPING_CARDINALITY - 1::int) s(s) +; + +insert into agggroup select * from source where s = 1; + +alter table agggroup set (timescaledb.compress, timescaledb.compress_orderby = 't', + timescaledb.compress_segmentby = 's'); + +select count(compress_chunk(x)) from show_chunks('agggroup') x; + +alter table agggroup add column ss int default 11; +alter table agggroup add column x text default '11'; + +insert into agggroup +select *, ss::text as x from ( + select *, + case + -- null in entire batch + when s = 2 then null + -- null for some rows + when s = 3 and t % 1051 = 0 then null + -- for some rows same as default + when s = 4 and t % 1057 = 0 then 11 + -- not null for entire batch + else s + end as ss + from source where s != 1 +) t +; + +select count(compress_chunk(x)) from show_chunks('agggroup') x; +vacuum freeze analyze agggroup; + +-- Long strings +create table long(t int, a text, b text, c text, d text); +select create_hypertable('long', 't'); +insert into long select n, x, x, x, x from ( + select n, repeat('1', 100 * 4 + n) x + from generate_series(1, 4) n) t +; +insert into long values (-1, 'a', 'b', 'c', 'd'); +alter table long set (timescaledb.compress); +select count(compress_chunk(x)) from show_chunks('long') x; + + +set timescaledb.debug_require_vector_agg = 'require'; +---- Uncomment to generate reference. Note that there are minor discrepancies +---- on float4 due to different numeric stability in our and PG implementations. +--set timescaledb.enable_chunkwise_aggregation to off; set timescaledb.enable_vectorized_aggregation to off; set timescaledb.debug_require_vector_agg = 'forbid'; + +select + format('%sselect %s%s(%s) from agggroup%s%s%s;', + explain, + grouping || ', ', + function, variable, + ' where ' || condition, + ' group by ' || grouping, + format(' order by %s(%s), ', function, variable) || grouping || ' limit 10', + function, variable) +from + unnest(array[ + 'explain (costs off) ', + null]) explain, + unnest(array[ + 'cint2', + '*']) variable, + unnest(array[ + 'min', + 'count']) function, + unnest(array[ + null, + 'cint2 > 0', + 'cint2 is null', + 'cint2 is null and x is null']) with ordinality as condition(condition, n), + unnest(array['x']) with ordinality as grouping(grouping, n) +where + true + and (explain is null /* or condition is null and grouping = 's' */) + and (variable != '*' or function = 'count') +order by explain, condition.n, variable, function, grouping.n +\gexec + +-- Test grouping by long strings +select count(*) from long group by a order by 1 limit 10; + +reset timescaledb.debug_require_vector_agg;
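Reviewer note on umash_fingerprint_key.h above: the 128-bit UMASH fingerprint is packed into a 12-byte struct (a uint32 `hash` plus a uint64 `rest`) so that, together with the uint32 key index stored before it, a hash table entry fits in exactly 16 bytes. `KEY_HASH` uses the 32-bit half for bucket placement and `KEY_EQUAL` compares both halves, so key equality relies on full 128-bit fingerprint collisions being negligibly rare rather than on comparing the stored text. Below is a minimal standalone sketch of that packing; `fingerprint`, `make_key`, and `keys_equal` are hypothetical illustration names, not the actual UMASH or TimescaleDB API.

```c
#include <stdint.h>
#include <stdio.h>

/* Stand-in for struct umash_fp: two 64-bit fingerprint halves. */
struct fingerprint
{
	uint64_t hash[2];
};

/* Mirrors struct umash_fingerprint_key: packed to 12 bytes so that a
 * preceding uint32 key index still yields a 16-byte hash table entry. */
struct __attribute__((packed)) key
{
	uint32_t hash; /* bucket placement, like KEY_HASH */
	uint64_t rest; /* collision disambiguation, like KEY_EQUAL */
};

static struct key
make_key(struct fingerprint fp)
{
	/* Keep the low 32 bits of the first half as the bucket hash and the
	 * whole second half for equality checks, same as
	 * umash_fingerprint_get_key() in the patch. */
	return (struct key){ .hash = (uint32_t) fp.hash[0], .rest = fp.hash[1] };
}

static int
keys_equal(struct key a, struct key b)
{
	return a.hash == b.hash && a.rest == b.rest;
}

int
main(void)
{
	struct fingerprint fp = { { 0x1122334455667788ULL, 0x99aabbccddeeff00ULL } };
	struct key k = make_key(fp);
	printf("key size = %zu bytes (+4 for the key index)\n", sizeof(k));
	printf("equal to itself: %d\n", keys_equal(k, k));
	return 0;
}
```

Note that the full text is still materialized by `single_text_key_hashing_store_new()` into `output_keys`, but only for emitting results, never for equality checks.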
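Similarly, for `get_bytes_view()` in hash_strategy_single_text.c above: an Arrow-style text column keeps n + 1 uint32 offsets in `buffers[1]` and the string bodies back to back in `buffers[2]`, so value i spans [offsets[i], offsets[i + 1]); dictionary-encoded batches (`DT_ArrowTextDict`) additionally store one int16 index per row in `buffers[3]` that points into those distinct-value arrays. A small self-contained sketch under those assumptions (the names here are illustrative, not the TimescaleDB API):

```c
#include <stdint.h>
#include <stdio.h>

/* Value i of an Arrow string column spans [offsets[i], offsets[i + 1])
 * in the body buffer; this is the computation get_bytes_view() does. */
static void
print_value(const uint32_t *offsets, const uint8_t *body, int i)
{
	const uint32_t start = offsets[i];
	const uint32_t len = offsets[i + 1] - start;
	printf("%.*s\n", (int) len, (const char *) &body[start]);
}

int
main(void)
{
	/* Three values, "apple", "pear", "plum", packed back to back. */
	const uint8_t body[] = "applepearplum";
	const uint32_t offsets[] = { 0, 5, 9, 13 };

	print_value(offsets, body, 1); /* prints "pear" */

	/* Dictionary encoding (DT_ArrowTextDict): each row stores an int16
	 * index into the distinct-value arrays instead of the bytes. */
	const int16_t dict_indexes[] = { 2, 2, 0 };
	print_value(offsets, body, dict_indexes[0]); /* prints "plum" */
	return 0;
}
```

The offsets alone cannot represent NULL, which is why the patch always consults the separate validity bitmap in `buffers[0]` via `arrow_row_is_valid()` when setting `*valid`.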