Skip to content

Commit

Permalink
Merge pull request duckdb#59 from motherduckdb/pb/generate-small-data
Browse files Browse the repository at this point in the history
add generated_data to repo to facilitate CI
  • Loading branch information
samansmink authored Jul 1, 2024
2 parents cdbada1 + 08b9811 commit bc35b1a
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 41 deletions.
6 changes: 3 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@ include extension-ci-tools/makefiles/duckdb_extension.Makefile

# Custom makefile targets
data: data_clean
python3 scripts/test_data_generator/generate_iceberg.py 0.01 data/iceberg/generated_spec1_0_01 1
python3 scripts/test_data_generator/generate_iceberg.py 0.01 data/iceberg/generated_spec2_0_01 2
python3 scripts/test_data_generator/generate_iceberg.py 0.001 data/iceberg/generated_spec1_0_001 1
python3 scripts/test_data_generator/generate_iceberg.py 0.001 data/iceberg/generated_spec2_0_001 2

data_large: data data_clean
python3 scripts/test_data_generator/generate_iceberg.py 1 data/iceberg/generated_spec2_1 2

data_clean:
rm -rf data/iceberg/generated_*
rm -rf data/iceberg/generated_*
10 changes: 2 additions & 8 deletions test/sql/iceberg_scan.test
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,10 @@ SELECT * FROM ICEBERG_SCAN('data/iceberg/lineitem_iceberg');
----
Catalog Error

require iceberg

# parquet is required for scanning this parquet-backed iceberg table
statement error
SELECT count(*) FROM ICEBERG_SCAN('data/iceberg/lineitem_iceberg', ALLOW_MOVED_PATHS=TRUE);
----
Catalog Error

require parquet

require iceberg

### Scanning latest snapshot
query I
SELECT count(*) FROM ICEBERG_SCAN('data/iceberg/lineitem_iceberg', ALLOW_MOVED_PATHS=TRUE);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,110 +1,108 @@
# name: test/sql/iceberg_scan_generated_data_0_01.test_slow
# description: test iceberg extension with the sf0.01 generated test set
# name: test/sql/iceberg_scan_generated_data_0_001.test_slow
# description: test iceberg extension with the sf0.001 generated test set
# group: [iceberg]

require parquet

require iceberg

require-env DUCKDB_ICEBERG_HAVE_TEST_DATA

### Iceberg spec v1

# Check count matches
query I
SELECT count(*) FROM ICEBERG_SCAN('data/iceberg/generated_spec1_0_01/pyspark_iceberg_table');
SELECT count(*) FROM ICEBERG_SCAN('data/iceberg/generated_spec1_0_001/pyspark_iceberg_table');
----
<FILE>:data/iceberg/generated_spec1_0_01/expected_results/last/count.csv
<FILE>:data/iceberg/generated_spec1_0_001/expected_results/last/count.csv

# Check schema is identical, sorting by uuid to guarantee unique order
query I nosort q1-schema
DESCRIBE SELECT * FROM ICEBERG_SCAN('data/iceberg/generated_spec1_0_01/pyspark_iceberg_table') ORDER BY uuid;
DESCRIBE SELECT * FROM ICEBERG_SCAN('data/iceberg/generated_spec1_0_001/pyspark_iceberg_table') ORDER BY uuid;
----

query I nosort q1-schema
DESCRIBE SELECT * FROM ICEBERG_SCAN('data/iceberg/generated_spec1_0_01/pyspark_iceberg_table/metadata/v9.metadata.json') ORDER BY uuid;
DESCRIBE SELECT * FROM ICEBERG_SCAN('data/iceberg/generated_spec1_0_001/pyspark_iceberg_table/metadata/v9.metadata.json') ORDER BY uuid;
----

query I nosort q1-schema
DESCRIBE SELECT * FROM PARQUET_SCAN('data/iceberg/generated_spec1_0_01/expected_results/q08/data/*.parquet') ORDER BY uuid;
DESCRIBE SELECT * FROM PARQUET_SCAN('data/iceberg/generated_spec1_0_001/expected_results/last/data/*.parquet') ORDER BY uuid;
----

# Check data is identical, sorting by uuid to guarantee unique order
query I nosort q1-data
SELECT * FROM ICEBERG_SCAN('data/iceberg/generated_spec1_0_01/pyspark_iceberg_table') ORDER BY uuid;
SELECT * FROM ICEBERG_SCAN('data/iceberg/generated_spec1_0_001/pyspark_iceberg_table') ORDER BY uuid;
----

query I nosort q1-data
SELECT * FROM ICEBERG_SCAN('data/iceberg/generated_spec1_0_01/pyspark_iceberg_table/metadata/v9.metadata.json') ORDER BY uuid;
SELECT * FROM ICEBERG_SCAN('data/iceberg/generated_spec1_0_001/pyspark_iceberg_table/metadata/v9.metadata.json') ORDER BY uuid;
----

query I nosort q1-data
SELECT * FROM PARQUET_SCAN('data/iceberg/generated_spec1_0_01/expected_results/q08/data/*.parquet') ORDER BY uuid;
SELECT * FROM PARQUET_SCAN('data/iceberg/generated_spec1_0_001/expected_results/last/data/*.parquet') ORDER BY uuid;
----

# Confirm the type matches that of the iceberg schema
query IIIIII
DESCRIBE SELECT schema_evol_added_col_1 FROM ICEBERG_SCAN('data/iceberg/generated_spec1_0_01/pyspark_iceberg_table') ORDER BY uuid;
DESCRIBE SELECT schema_evol_added_col_1 FROM ICEBERG_SCAN('data/iceberg/generated_spec1_0_001/pyspark_iceberg_table') ORDER BY uuid;
----
schema_evol_added_col_1 BIGINT YES NULL NULL NULL

### Iceberg spec v2

# Check count matches
query I
SELECT count(*) FROM ICEBERG_SCAN('data/iceberg/generated_spec2_0_01/pyspark_iceberg_table');
SELECT count(*) FROM ICEBERG_SCAN('data/iceberg/generated_spec2_0_001/pyspark_iceberg_table');
----
<FILE>:data/iceberg/generated_spec2_0_01/expected_results/last/count.csv
<FILE>:data/iceberg/generated_spec2_0_001/expected_results/last/count.csv

# We should also be able to scan the metadata file directly
query I
SELECT count(*) FROM ICEBERG_SCAN('data/iceberg/generated_spec2_0_01/pyspark_iceberg_table/metadata/v9.metadata.json');
SELECT count(*) FROM ICEBERG_SCAN('data/iceberg/generated_spec2_0_001/pyspark_iceberg_table/metadata/v9.metadata.json');
----
<FILE>:data/iceberg/generated_spec2_0_01/expected_results/last/count.csv
<FILE>:data/iceberg/generated_spec2_0_001/expected_results/last/count.csv

# Check schema is identical, sorting by uuid to guarantee unique order
query I nosort q2-schema
DESCRIBE SELECT * FROM ICEBERG_SCAN('data/iceberg/generated_spec2_0_01/pyspark_iceberg_table') ORDER BY uuid;
DESCRIBE SELECT * FROM ICEBERG_SCAN('data/iceberg/generated_spec2_0_001/pyspark_iceberg_table') ORDER BY uuid;
----

query I nosort q2-schema
DESCRIBE SELECT * FROM ICEBERG_SCAN('data/iceberg/generated_spec2_0_01/pyspark_iceberg_table/metadata/v9.metadata.json') ORDER BY uuid;
DESCRIBE SELECT * FROM ICEBERG_SCAN('data/iceberg/generated_spec2_0_001/pyspark_iceberg_table/metadata/v9.metadata.json') ORDER BY uuid;
----

query I nosort q2-schema
DESCRIBE SELECT * FROM PARQUET_SCAN('data/iceberg/generated_spec2_0_01/expected_results/q08/data/*.parquet') ORDER BY uuid;
DESCRIBE SELECT * FROM PARQUET_SCAN('data/iceberg/generated_spec2_0_001/expected_results/last/data/*.parquet') ORDER BY uuid;
----

# Check data is identical, sorting by uuid to guarantee unique order
query I nosort q2-data
SELECT * FROM ICEBERG_SCAN('data/iceberg/generated_spec2_0_01/pyspark_iceberg_table') ORDER BY uuid;
SELECT * FROM ICEBERG_SCAN('data/iceberg/generated_spec2_0_001/pyspark_iceberg_table') ORDER BY uuid;
----

# Scanning the metadata file directly should return the same data
query I nosort q2-data
SELECT * FROM ICEBERG_SCAN('data/iceberg/generated_spec2_0_01/pyspark_iceberg_table/metadata/v9.metadata.json') ORDER BY uuid;
SELECT * FROM ICEBERG_SCAN('data/iceberg/generated_spec2_0_001/pyspark_iceberg_table/metadata/v9.metadata.json') ORDER BY uuid;
----

query I nosort q2-data
SELECT * FROM PARQUET_SCAN('data/iceberg/generated_spec2_0_01/expected_results/q08/data/*.parquet') ORDER BY uuid;
SELECT * FROM PARQUET_SCAN('data/iceberg/generated_spec2_0_001/expected_results/last/data/*.parquet') ORDER BY uuid;
----

### Test schema evolution

# Latest metadata version has correct type
query IIIIII
DESCRIBE SELECT schema_evol_added_col_1 FROM ICEBERG_SCAN('data/iceberg/generated_spec2_0_01/pyspark_iceberg_table/metadata/v9.metadata.json') ORDER BY uuid;
DESCRIBE SELECT schema_evol_added_col_1 FROM ICEBERG_SCAN('data/iceberg/generated_spec2_0_001/pyspark_iceberg_table/metadata/v9.metadata.json') ORDER BY uuid;
----
schema_evol_added_col_1 BIGINT YES NULL NULL NULL

# One before has the old type
query IIIIII
DESCRIBE SELECT schema_evol_added_col_1 FROM ICEBERG_SCAN('data/iceberg/generated_spec2_0_01/pyspark_iceberg_table/metadata/v8.metadata.json') ORDER BY uuid;
DESCRIBE SELECT schema_evol_added_col_1 FROM ICEBERG_SCAN('data/iceberg/generated_spec2_0_001/pyspark_iceberg_table/metadata/v8.metadata.json') ORDER BY uuid;
----
schema_evol_added_col_1 INTEGER YES NULL NULL NULL

# Even older metadata version: the column did not exist yet, so binding it fails
statement error
DESCRIBE SELECT schema_evol_added_col_1 FROM ICEBERG_SCAN('data/iceberg/generated_spec2_0_01/pyspark_iceberg_table/metadata/v6.metadata.json') ORDER BY uuid;
DESCRIBE SELECT schema_evol_added_col_1 FROM ICEBERG_SCAN('data/iceberg/generated_spec2_0_001/pyspark_iceberg_table/metadata/v6.metadata.json') ORDER BY uuid;
----
Binder Error
Binder Error
6 changes: 3 additions & 3 deletions test/sql/iceberg_scan_generated_data_1.test_slow
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ SELECT COUNT(*) FROM ICEBERG_SCAN('data/iceberg/generated_1/pyspark_iceberg_tabl
----

query I nosort q1
SELECT COUNT(*) FROM PARQUET_SCAN('data/iceberg/generated_1/expected_results/q08/data/*.parquet');
SELECT COUNT(*) FROM PARQUET_SCAN('data/iceberg/generated_1/expected_results/last/data/*.parquet');
----

query I nosort q2
Expand All @@ -37,5 +37,5 @@ SELECT * FROM ICEBERG_SCAN('data/iceberg/generated_1/pyspark_iceberg_table') WHE
----

query I nosort q3
SELECT * FROM PARQUET_SCAN('data/iceberg/generated_1/expected_results/q08/data/*.parquet') WHERE uuid NOT NULL ORDER BY uuid;
----
SELECT * FROM PARQUET_SCAN('data/iceberg/generated_1/expected_results/last/data/*.parquet') WHERE uuid NOT NULL ORDER BY uuid;
----

0 comments on commit bc35b1a

Please sign in to comment.