diff --git a/chispa/bcolors.py b/chispa/bcolors.py index 1912ff0..67996a5 100644 --- a/chispa/bcolors.py +++ b/chispa/bcolors.py @@ -42,6 +42,14 @@ def blue(s: str) -> str: return bcolors.LightBlue + str(s) + bcolors.LightRed +def line_blue(s: str) -> str: + return bcolors.LightBlue + s + bcolors.NC + + +def line_red(s: str) -> str: + return bcolors.LightRed + s + bcolors.NC + + def underline_text(input_text: str) -> str: """ Takes an input string and returns a white, underlined string (based on PrettyTable formatting) diff --git a/chispa/schema_comparer.py b/chispa/schema_comparer.py index 5671cd5..725fd70 100644 --- a/chispa/schema_comparer.py +++ b/chispa/schema_comparer.py @@ -6,6 +6,7 @@ from prettytable import PrettyTable from pyspark.sql.types import StructField, StructType +from chispa.bcolors import bcolors, line_blue, line_red from chispa.formatting import blue @@ -15,6 +16,108 @@ class SchemasNotEqualError(Exception): pass +def print_schema_diff( + s1: StructType, s2: StructType, ignore_nullable: bool, ignore_metadata: bool, output_format: str = "table" +) -> None: + valid_output_formats = ["table", "tree"] + if output_format == "table": + schema_diff_table: PrettyTable = create_schema_comparison_table(s1, s2, ignore_nullable, ignore_metadata) + print(schema_diff_table) + elif output_format == "tree": + schema_diff_tree: str = create_schema_comparison_tree(s1, s2, ignore_nullable, ignore_metadata) + print(schema_diff_tree) + else: + raise ValueError(f"output_format must be one of {valid_output_formats}") + + +def create_schema_comparison_tree(s1: StructType, s2: StructType, ignore_nullable: bool, ignore_metadata: bool) -> str: + def parse_schema_as_tree(s: StructType, indent: int) -> tuple[list[str], list[StructField]]: + tree_lines = [] + fields = [] + + for struct_field in s: + nullable = "(nullable = true)" if struct_field.nullable else "(nullable = false)" + struct_field_type = struct_field.dataType.typeName() + + struct_prefix = f"{indent * ' '}|{'-' * 2}" + struct_as_string = f"{struct_field.name}: {struct_field_type} {nullable}" + + tree_lines += [f"{struct_prefix} {struct_as_string}"] + + if not struct_field_type == "struct": + fields += [struct_field] + continue + + tree_line_nested, fields_nested = parse_schema_as_tree(struct_field.dataType, indent + 4) # type: ignore[arg-type] + + fields += [struct_field] + tree_lines += tree_line_nested + fields += fields_nested + + return tree_lines, fields + + tree_space = 6 + s1_tree, s1_fields = parse_schema_as_tree(s1, 0) + s2_tree, s2_fields = parse_schema_as_tree(s2, 0) + + widest_line = max(len(line) for line in s1_tree) + longest_tree = max(len(s1_tree), len(s2_tree)) + schema_gap = widest_line + tree_space + + tree = "\nschema1".ljust(schema_gap) + "schema2\n" + for i in range(longest_tree): + line1 = line2 = "" + s1_field = s2_field = None + + if i < len(s1_tree): + line1 = s1_tree[i] + s1_field = s1_fields[i] + if i < len(s2_tree): + line2 = s2_tree[i] + s2_field = s2_fields[i] + + tree_line = line1.ljust(schema_gap) + line2 + + if are_structfields_equal(s1_field, s2_field, ignore_nullable, ignore_metadata): + tree += line_blue(tree_line) + "\n" + else: + tree += line_red(tree_line) + "\n" + + tree += bcolors.NC + return tree + + +def create_schema_comparison_table( + s1: StructType, s2: StructType, ignore_nullable: bool, ignore_metadata: bool +) -> PrettyTable: + t = PrettyTable(["schema1", "schema2"]) + zipped = list(zip_longest(s1, s2)) + for sf1, sf2 in zipped: + if are_structfields_equal(sf1, sf2, ignore_nullable, ignore_metadata): + t.add_row([blue(str(sf1)), blue(str(sf2))]) + else: + t.add_row([sf1, sf2]) + return t + + +def check_if_schemas_are_wide(s1: StructType, s2: StructType) -> bool: + contains_nested_structs = any(sf.dataType.typeName() == "struct" for sf in s1) or any( + sf.dataType.typeName() == "struct" for sf in s2 + ) + contains_many_columns = len(s1) > 10 or len(s2) > 10 + return contains_nested_structs or contains_many_columns + + +def handle_schemas_not_equal(s1: StructType, s2: StructType, ignore_nullable: bool, ignore_metadata: bool) -> None: + schemas_are_wide = check_if_schemas_are_wide(s1, s2) + if schemas_are_wide: + error_message = create_schema_comparison_tree(s1, s2, ignore_nullable, ignore_metadata) + else: + t = create_schema_comparison_table(s1, s2, ignore_nullable, ignore_metadata) + error_message = "\n" + t.get_string() + raise SchemasNotEqualError(error_message) + + def assert_schema_equality( s1: StructType, s2: StructType, ignore_nullable: bool = False, ignore_metadata: bool = False ) -> None: @@ -37,14 +140,7 @@ def inner(s1: StructType, s2: StructType, ignore_nullable: bool, ignore_metadata return True if not inner(s1, s2, ignore_nullable, ignore_metadata): - t = PrettyTable(["schema1", "schema2"]) - zipped = list(zip_longest(s1, s2)) - for sf1, sf2 in zipped: - if are_structfields_equal(sf1, sf2, True): - t.add_row([blue(str(sf1)), blue(str(sf2))]) - else: - t.add_row([sf1, sf2]) - raise SchemasNotEqualError("\n" + t.get_string()) + handle_schemas_not_equal(s1, s2, ignore_nullable, ignore_metadata) # deprecate this @@ -52,27 +148,13 @@ def inner(s1: StructType, s2: StructType, ignore_nullable: bool, ignore_metadata # I think schema equality operations are really fast to begin with def assert_basic_schema_equality(s1: StructType, s2: StructType) -> None: if s1 != s2: - t = PrettyTable(["schema1", "schema2"]) - zipped = list(zip_longest(s1, s2)) - for sf1, sf2 in zipped: - if sf1 == sf2: - t.add_row([blue(str(sf1)), blue(str(sf2))]) - else: - t.add_row([sf1, sf2]) - raise SchemasNotEqualError("\n" + t.get_string()) + handle_schemas_not_equal(s1, s2, ignore_nullable=False, ignore_metadata=False) # deprecate this. ignore_nullable should be a flag. def assert_schema_equality_ignore_nullable(s1: StructType, s2: StructType) -> None: if not are_schemas_equal_ignore_nullable(s1, s2): - t = PrettyTable(["schema1", "schema2"]) - zipped = list(zip_longest(s1, s2)) - for sf1, sf2 in zipped: - if are_structfields_equal(sf1, sf2, True): - t.add_row([blue(str(sf1)), blue(str(sf2))]) - else: - t.add_row([sf1, sf2]) - raise SchemasNotEqualError("\n" + t.get_string()) + handle_schemas_not_equal(s1, s2, ignore_nullable=True, ignore_metadata=False) # deprecate this. ignore_nullable should be a flag. diff --git a/poetry.lock b/poetry.lock index e26e9b0..7d95f06 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. [[package]] name = "astunparse" diff --git a/tests/data/tree_string/it_prints_correctly_for_wide_schemas.txt b/tests/data/tree_string/it_prints_correctly_for_wide_schemas.txt new file mode 100644 index 0000000..6034fea --- /dev/null +++ b/tests/data/tree_string/it_prints_correctly_for_wide_schemas.txt @@ -0,0 +1 @@ +'\nschema1 schema2\n\x1b[34m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[34m|-- age: integer (nullable = true) |-- age: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_number: integer (nullable = true) |-- fav_number: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_numbers: array (nullable = true) |-- fav_numbers: array (nullable = true)\x1b[0m\n\x1b[31m|-- fav_colors: struct (nullable = true) |-- fav_colors: struct (nullable = true)\x1b[0m\n\x1b[31m |-- red: integer (nullable = true) |-- orange: integer (nullable = true)\x1b[0m\n\x1b[34m |-- green: integer (nullable = true) |-- green: integer (nullable = true)\x1b[0m\n\x1b[31m |-- blue: integer (nullable = true) |-- yellow: integer (nullable = true)\x1b[0m\n\x1b[0m' diff --git a/tests/data/tree_string/it_prints_correctly_for_wide_schemas_different_lengths.txt b/tests/data/tree_string/it_prints_correctly_for_wide_schemas_different_lengths.txt new file mode 100644 index 0000000..be22fb0 --- /dev/null +++ b/tests/data/tree_string/it_prints_correctly_for_wide_schemas_different_lengths.txt @@ -0,0 +1 @@ +'\nschema1 schema2\n\x1b[34m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[34m|-- age: integer (nullable = true) |-- age: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_number: integer (nullable = true) |-- fav_number: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_numbers: array (nullable = true) |-- fav_numbers: array (nullable = true)\x1b[0m\n\x1b[31m|-- fav_colors: struct (nullable = true) |-- fav_colors: struct (nullable = true)\x1b[0m\n\x1b[31m |-- red: integer (nullable = true) |-- orange: integer (nullable = true)\x1b[0m\n\x1b[34m |-- green: integer (nullable = true) |-- green: integer (nullable = true)\x1b[0m\n\x1b[31m |-- blue: integer (nullable = true) |-- yellow: integer (nullable = true)\x1b[0m\n\x1b[31m |-- purple: integer (nullable = true)\x1b[0m\n\x1b[31m |-- phone_number: string (nullable = true)\x1b[0m\n\x1b[0m' diff --git a/tests/data/tree_string/it_prints_correctly_for_wide_schemas_ignore_metadata.txt b/tests/data/tree_string/it_prints_correctly_for_wide_schemas_ignore_metadata.txt new file mode 100644 index 0000000..6034fea --- /dev/null +++ b/tests/data/tree_string/it_prints_correctly_for_wide_schemas_ignore_metadata.txt @@ -0,0 +1 @@ +'\nschema1 schema2\n\x1b[34m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[34m|-- age: integer (nullable = true) |-- age: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_number: integer (nullable = true) |-- fav_number: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_numbers: array (nullable = true) |-- fav_numbers: array (nullable = true)\x1b[0m\n\x1b[31m|-- fav_colors: struct (nullable = true) |-- fav_colors: struct (nullable = true)\x1b[0m\n\x1b[31m |-- red: integer (nullable = true) |-- orange: integer (nullable = true)\x1b[0m\n\x1b[34m |-- green: integer (nullable = true) |-- green: integer (nullable = true)\x1b[0m\n\x1b[31m |-- blue: integer (nullable = true) |-- yellow: integer (nullable = true)\x1b[0m\n\x1b[0m' diff --git a/tests/data/tree_string/it_prints_correctly_for_wide_schemas_ignore_nullable.txt b/tests/data/tree_string/it_prints_correctly_for_wide_schemas_ignore_nullable.txt new file mode 100644 index 0000000..1fd9390 --- /dev/null +++ b/tests/data/tree_string/it_prints_correctly_for_wide_schemas_ignore_nullable.txt @@ -0,0 +1 @@ +'\nschema1 schema2\n\x1b[34m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[34m|-- age: integer (nullable = true) |-- age: integer (nullable = false)\x1b[0m\n\x1b[34m|-- fav_number: integer (nullable = true) |-- fav_number: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_numbers: array (nullable = true) |-- fav_numbers: array (nullable = false)\x1b[0m\n\x1b[31m|-- fav_colors: struct (nullable = true) |-- fav_colors: struct (nullable = true)\x1b[0m\n\x1b[31m |-- red: integer (nullable = true) |-- orange: integer (nullable = true)\x1b[0m\n\x1b[34m |-- green: integer (nullable = true) |-- green: integer (nullable = false)\x1b[0m\n\x1b[31m |-- blue: integer (nullable = true) |-- yellow: integer (nullable = true)\x1b[0m\n\x1b[0m' diff --git a/tests/data/tree_string/it_prints_correctly_for_wide_schemas_multiple_nested_structs.txt b/tests/data/tree_string/it_prints_correctly_for_wide_schemas_multiple_nested_structs.txt new file mode 100644 index 0000000..9ca3165 --- /dev/null +++ b/tests/data/tree_string/it_prints_correctly_for_wide_schemas_multiple_nested_structs.txt @@ -0,0 +1 @@ +'\nschema1 schema2\n\x1b[34m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[31m|-- fav_genres: struct (nullable = true) |-- fav_genres: struct (nullable = true)\x1b[0m\n\x1b[31m |-- rock: struct (nullable = true) |-- rock: struct (nullable = true)\x1b[0m\n\x1b[34m |-- metal: integer (nullable = true) |-- metal: integer (nullable = true)\x1b[0m\n\x1b[31m |-- punk: integer (nullable = true) |-- classic: integer (nullable = true)\x1b[0m\n\x1b[34m |-- electronic: struct (nullable = true) |-- electronic: struct (nullable = true)\x1b[0m\n\x1b[34m |-- house: integer (nullable = true) |-- house: integer (nullable = true)\x1b[0m\n\x1b[34m |-- dubstep: integer (nullable = true) |-- dubstep: integer (nullable = true)\x1b[0m\n\x1b[31m |-- pop: struct (nullable = true)\x1b[0m\n\x1b[31m |-- pop: integer (nullable = true)\x1b[0m\n\x1b[0m' diff --git a/tests/data/tree_string/it_prints_correctly_for_wide_schemas_with_metadata.txt b/tests/data/tree_string/it_prints_correctly_for_wide_schemas_with_metadata.txt new file mode 100644 index 0000000..79214f3 --- /dev/null +++ b/tests/data/tree_string/it_prints_correctly_for_wide_schemas_with_metadata.txt @@ -0,0 +1 @@ +'\nschema1 schema2\n\x1b[31m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[34m|-- age: integer (nullable = true) |-- age: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_number: integer (nullable = true) |-- fav_number: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_numbers: array (nullable = true) |-- fav_numbers: array (nullable = true)\x1b[0m\n\x1b[31m|-- fav_colors: struct (nullable = true) |-- fav_colors: struct (nullable = true)\x1b[0m\n\x1b[31m |-- red: integer (nullable = true) |-- orange: integer (nullable = true)\x1b[0m\n\x1b[34m |-- green: integer (nullable = true) |-- green: integer (nullable = true)\x1b[0m\n\x1b[31m |-- blue: integer (nullable = true) |-- yellow: integer (nullable = true)\x1b[0m\n\x1b[0m' diff --git a/tests/test_schema_comparer.py b/tests/test_schema_comparer.py index eee7d9b..40a5916 100644 --- a/tests/test_schema_comparer.py +++ b/tests/test_schema_comparer.py @@ -9,6 +9,7 @@ are_structfields_equal, assert_schema_equality, assert_schema_equality_ignore_nullable, + create_schema_comparison_tree, ) @@ -50,6 +51,260 @@ def it_throws_when_schema_lengths_differ(): assert_schema_equality(s1, s2) +def describe_tree_string(): + def it_prints_correctly_for_wide_schemas(): + with open("tests/data/tree_string/it_prints_correctly_for_wide_schemas.txt") as f: + expected = f.read() + + s1 = StructType([ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + StructField("fav_number", IntegerType(), True), + StructField("fav_numbers", ArrayType(IntegerType(), True), True), + StructField( + "fav_colors", + StructType([ + StructField("red", IntegerType(), True), + StructField("green", IntegerType(), True), + StructField("blue", IntegerType(), True), + ]), + ), + ]) + + s2 = StructType([ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + StructField("fav_number", IntegerType(), True), + StructField("fav_numbers", ArrayType(IntegerType(), True), True), + StructField( + "fav_colors", + StructType([ + StructField("orange", IntegerType(), True), + StructField("green", IntegerType(), True), + StructField("yellow", IntegerType(), True), + ]), + ), + ]) + + result = create_schema_comparison_tree(s1, s2, ignore_nullable=False, ignore_metadata=False) + + assert repr(result) + "\n" == expected + + def it_prints_correctly_for_wide_schemas_multiple_nested_structs(): + with open("tests/data/tree_string/it_prints_correctly_for_wide_schemas_multiple_nested_structs.txt") as f: + expected = f.read() + + s1 = StructType([ + StructField("name", StringType(), True), + StructField( + "fav_genres", + StructType([ + StructField( + "rock", + StructType([ + StructField("metal", IntegerType(), True), + StructField("punk", IntegerType(), True), + ]), + True, + ), + StructField( + "electronic", + StructType([ + StructField("house", IntegerType(), True), + StructField("dubstep", IntegerType(), True), + ]), + True, + ), + ]), + ), + ]) + + s2 = StructType([ + StructField("name", StringType(), True), + StructField( + "fav_genres", + StructType([ + StructField( + "rock", + StructType([ + StructField("metal", IntegerType(), True), + StructField("classic", IntegerType(), True), + ]), + True, + ), + StructField( + "electronic", + StructType([ + StructField("house", IntegerType(), True), + StructField("dubstep", IntegerType(), True), + ]), + True, + ), + StructField( + "pop", + StructType([ + StructField("pop", IntegerType(), True), + ]), + True, + ), + ]), + ), + ]) + + result = create_schema_comparison_tree(s1, s2, ignore_nullable=False, ignore_metadata=False) + assert repr(result) + "\n" == expected + + def it_prints_correctly_for_wide_schemas_ignore_nullable(): + with open("tests/data/tree_string/it_prints_correctly_for_wide_schemas_ignore_nullable.txt") as f: + expected = f.read() + + s1 = StructType([ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + StructField("fav_number", IntegerType(), True), + StructField("fav_numbers", ArrayType(IntegerType(), True), True), + StructField( + "fav_colors", + StructType([ + StructField("red", IntegerType(), True), + StructField("green", IntegerType(), True), + StructField("blue", IntegerType(), True), + ]), + ), + ]) + + s2 = StructType([ + StructField("name", StringType(), True), + StructField("age", IntegerType(), False), + StructField("fav_number", IntegerType(), True), + StructField("fav_numbers", ArrayType(IntegerType(), True), False), + StructField( + "fav_colors", + StructType([ + StructField("orange", IntegerType(), True), + StructField("green", IntegerType(), False), + StructField("yellow", IntegerType(), True), + ]), + ), + ]) + + result = create_schema_comparison_tree(s1, s2, ignore_nullable=True, ignore_metadata=False) + + assert repr(result) + "\n" == expected + + def it_prints_correctly_for_wide_schemas_different_lengths(): + with open("tests/data/tree_string/it_prints_correctly_for_wide_schemas_different_lengths.txt") as f: + expected = f.read() + + s1 = StructType([ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + StructField("fav_number", IntegerType(), True), + StructField("fav_numbers", ArrayType(IntegerType(), True), True), + StructField( + "fav_colors", + StructType([ + StructField("red", IntegerType(), True), + StructField("green", IntegerType(), True), + StructField("blue", IntegerType(), True), + ]), + ), + ]) + + s2 = StructType([ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + StructField("fav_number", IntegerType(), True), + StructField("fav_numbers", ArrayType(IntegerType(), True), True), + StructField( + "fav_colors", + StructType([ + StructField("orange", IntegerType(), True), + StructField("green", IntegerType(), True), + StructField("yellow", IntegerType(), True), + StructField("purple", IntegerType(), True), + ]), + ), + StructField("phone_number", StringType(), True), + ]) + + result = create_schema_comparison_tree(s1, s2, ignore_nullable=False, ignore_metadata=False) + assert repr(result) + "\n" == expected + + def it_prints_correctly_for_wide_schemas_ignore_metadata(): + with open("tests/data/tree_string/it_prints_correctly_for_wide_schemas_ignore_metadata.txt") as f: + expected = f.read() + + s1 = StructType([ + StructField("name", StringType(), True, {"foo": "bar"}), + StructField("age", IntegerType(), True), + StructField("fav_number", IntegerType(), True), + StructField("fav_numbers", ArrayType(IntegerType(), True), True), + StructField( + "fav_colors", + StructType([ + StructField("red", IntegerType(), True), + StructField("green", IntegerType(), True), + StructField("blue", IntegerType(), True), + ]), + ), + ]) + + s2 = StructType([ + StructField("name", StringType(), True, {"foo": "baz"}), + StructField("age", IntegerType(), True), + StructField("fav_number", IntegerType(), True), + StructField("fav_numbers", ArrayType(IntegerType(), True), True), + StructField( + "fav_colors", + StructType([ + StructField("orange", IntegerType(), True), + StructField("green", IntegerType(), True), + StructField("yellow", IntegerType(), True), + ]), + ), + ]) + result = create_schema_comparison_tree(s1, s2, ignore_nullable=False, ignore_metadata=True) + assert repr(result) + "\n" == expected + + def it_prints_correctly_for_wide_schemas_with_metadata(): + with open("tests/data/tree_string/it_prints_correctly_for_wide_schemas_with_metadata.txt") as f: + expected = f.read() + + s1 = StructType([ + StructField("name", StringType(), True, {"foo": "bar"}), + StructField("age", IntegerType(), True), + StructField("fav_number", IntegerType(), True), + StructField("fav_numbers", ArrayType(IntegerType(), True), True), + StructField( + "fav_colors", + StructType([ + StructField("red", IntegerType(), True), + StructField("green", IntegerType(), True), + StructField("blue", IntegerType(), True), + ]), + ), + ]) + + s2 = StructType([ + StructField("name", StringType(), True, {"foo": "baz"}), + StructField("age", IntegerType(), True), + StructField("fav_number", IntegerType(), True), + StructField("fav_numbers", ArrayType(IntegerType(), True), True), + StructField( + "fav_colors", + StructType([ + StructField("orange", IntegerType(), True), + StructField("green", IntegerType(), True), + StructField("yellow", IntegerType(), True), + ]), + ), + ]) + + result = create_schema_comparison_tree(s1, s2, ignore_nullable=False, ignore_metadata=False) + assert repr(result) + "\n" == expected + + def describe_assert_schema_equality_ignore_nullable(): def it_has_good_error_messages_for_different_sized_schemas(): s1 = StructType([