Skip to content

Commit

Permalink
Add tree string (#95)
Browse files Browse the repository at this point in the history
* prelim tree string code - move table creation to separate function

* add prelim test for tree string

* add prelim tree string code

* add side by side schema tree string

* improve tree appearance

* simplify formatting

* add helper functions + autoformat

* handle ignore nullable in tree string

* add print output tests

* handle metadata, use existing `are_structfields_equal` comparison check

* update tests

* add metadata tests

* simplify logic, remove horizontal character param

* improve variable names

* add `print_schema_diff` as wrapper to compare two schemas without error

* add missing return type hints

* add six package

* add missing `create_schema_comparison_tree` function

* fix unit test failures in `test_schema_comparer` and `test_dataframe_comparer`

* replace six with itertools

* remove double import

* formatting fixes

* handle mypy issues

* update tests to include newline added by pre-commit format
  • Loading branch information
jeffbrennan authored Oct 12, 2024
1 parent 25791ff commit 50c2411
Show file tree
Hide file tree
Showing 10 changed files with 376 additions and 25 deletions.
8 changes: 8 additions & 0 deletions chispa/bcolors.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,14 @@ def blue(s: str) -> str:
return bcolors.LightBlue + str(s) + bcolors.LightRed


def line_blue(s: str) -> str:
    """Return ``s`` prefixed with ``bcolors.LightBlue`` and suffixed with ``bcolors.NC``."""
    segments = (bcolors.LightBlue, s, bcolors.NC)
    return "".join(segments)


def line_red(s: str) -> str:
    """Return ``s`` prefixed with ``bcolors.LightRed`` and suffixed with ``bcolors.NC``."""
    segments = (bcolors.LightRed, s, bcolors.NC)
    return "".join(segments)


def underline_text(input_text: str) -> str:
"""
Takes an input string and returns a white, underlined string (based on PrettyTable formatting)
Expand Down
130 changes: 106 additions & 24 deletions chispa/schema_comparer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from prettytable import PrettyTable
from pyspark.sql.types import StructField, StructType

from chispa.bcolors import bcolors, line_blue, line_red
from chispa.formatting import blue


Expand All @@ -15,6 +16,108 @@ class SchemasNotEqualError(Exception):
pass


def print_schema_diff(
    s1: StructType, s2: StructType, ignore_nullable: bool, ignore_metadata: bool, output_format: str = "table"
) -> None:
    """
    Print a side-by-side comparison of two schemas without raising on differences.

    :param s1: first schema, shown in the ``schema1`` column
    :param s2: second schema, shown in the ``schema2`` column
    :param ignore_nullable: forwarded to the comparison builder
    :param ignore_metadata: forwarded to the comparison builder
    :param output_format: ``"table"`` (default) or ``"tree"``
    :raises ValueError: if ``output_format`` is neither ``"table"`` nor ``"tree"``
    """
    valid_output_formats = ["table", "tree"]

    # Reject unknown formats up front so the rendering code below only sees valid values.
    if output_format not in valid_output_formats:
        raise ValueError(f"output_format must be one of {valid_output_formats}")

    if output_format == "tree":
        print(create_schema_comparison_tree(s1, s2, ignore_nullable, ignore_metadata))
        return

    print(create_schema_comparison_table(s1, s2, ignore_nullable, ignore_metadata))


def create_schema_comparison_tree(s1: StructType, s2: StructType, ignore_nullable: bool, ignore_metadata: bool) -> str:
    """
    Render two schemas as side-by-side, colour-coded trees.

    Each schema is flattened depth-first into one text line per field (nested struct
    fields are indented by four extra spaces per level). Lines are then paired by
    position: a pair is wrapped with ``line_blue`` when ``are_structfields_equal``
    reports the two fields as equal, and with ``line_red`` otherwise. When one schema
    has fewer lines, the missing side is an empty string compared as ``None``.

    :param s1: schema rendered in the left (``schema1``) column
    :param s2: schema rendered in the right (``schema2``) column
    :param ignore_nullable: forwarded to ``are_structfields_equal``
    :param ignore_metadata: forwarded to ``are_structfields_equal``
    :return: the full tree, starting with a header line and ending with ``bcolors.NC``
    """

    def parse_schema_as_tree(s: StructType, indent: int) -> tuple[list[str], list[StructField]]:
        # Returns two parallel lists: the rendered line for every field (depth-first)
        # and the StructField that produced that line.
        tree_lines: list[str] = []
        fields: list[StructField] = []

        for struct_field in s:
            nullable = "(nullable = true)" if struct_field.nullable else "(nullable = false)"
            struct_field_type = struct_field.dataType.typeName()

            struct_prefix = f"{indent * ' '}|{'-' * 2}"
            struct_as_string = f"{struct_field.name}: {struct_field_type} {nullable}"

            tree_lines.append(f"{struct_prefix} {struct_as_string}")
            fields.append(struct_field)

            # Only struct fields are expanded; their children follow the parent line.
            if struct_field_type == "struct":
                tree_line_nested, fields_nested = parse_schema_as_tree(struct_field.dataType, indent + 4)  # type: ignore[arg-type]
                tree_lines.extend(tree_line_nested)
                fields.extend(fields_nested)

        return tree_lines, fields

    tree_space = 6
    s1_tree, s1_fields = parse_schema_as_tree(s1, 0)
    s2_tree, s2_fields = parse_schema_as_tree(s2, 0)

    # default=0 keeps an empty s1 from raising ValueError, so an empty schema can
    # still be diffed against a non-empty one.
    widest_line = max((len(line) for line in s1_tree), default=0)
    longest_tree = max(len(s1_tree), len(s2_tree))
    schema_gap = widest_line + tree_space

    # The leading "\n" is part of the string being padded, so it counts towards schema_gap.
    output_parts = ["\nschema1".ljust(schema_gap) + "schema2\n"]
    for i in range(longest_tree):
        line1 = line2 = ""
        s1_field = s2_field = None

        if i < len(s1_tree):
            line1 = s1_tree[i]
            s1_field = s1_fields[i]
        if i < len(s2_tree):
            line2 = s2_tree[i]
            s2_field = s2_fields[i]

        tree_line = line1.ljust(schema_gap) + line2

        if are_structfields_equal(s1_field, s2_field, ignore_nullable, ignore_metadata):
            output_parts.append(line_blue(tree_line) + "\n")
        else:
            output_parts.append(line_red(tree_line) + "\n")

    output_parts.append(bcolors.NC)
    return "".join(output_parts)


def create_schema_comparison_table(
    s1: StructType, s2: StructType, ignore_nullable: bool, ignore_metadata: bool
) -> PrettyTable:
    """
    Build a two-column ``PrettyTable`` pairing the fields of ``s1`` and ``s2`` by position.

    Pairs that ``are_structfields_equal`` reports as equal are added as strings wrapped
    with ``blue``; all other pairs are added as the raw field objects. When the schemas
    differ in length, ``zip_longest`` fills the shorter side with ``None``.

    :param s1: schema shown in the ``schema1`` column
    :param s2: schema shown in the ``schema2`` column
    :param ignore_nullable: forwarded to ``are_structfields_equal``
    :param ignore_metadata: forwarded to ``are_structfields_equal``
    :return: the populated table
    """
    table = PrettyTable(["schema1", "schema2"])
    for left, right in zip_longest(s1, s2):
        fields_match = are_structfields_equal(left, right, ignore_nullable, ignore_metadata)
        row = [blue(str(left)), blue(str(right))] if fields_match else [left, right]
        table.add_row(row)
    return table


def check_if_schemas_are_wide(s1: StructType, s2: StructType) -> bool:
    """
    Report whether either schema is "wide".

    A pair of schemas counts as wide when any top-level field in either one has the
    type name ``"struct"``, or when either schema has more than 10 top-level fields.

    :param s1: first schema
    :param s2: second schema
    :return: ``True`` if either condition holds, otherwise ``False``
    """
    has_struct_field = any(field.dataType.typeName() == "struct" for schema in (s1, s2) for field in schema)
    largest_column_count = max(len(s1), len(s2))
    exceeds_column_limit = largest_column_count > 10
    return has_struct_field or exceeds_column_limit


def handle_schemas_not_equal(s1: StructType, s2: StructType, ignore_nullable: bool, ignore_metadata: bool) -> None:
    """
    Raise ``SchemasNotEqualError`` with a rendered comparison of the two schemas.

    Schemas that ``check_if_schemas_are_wide`` flags as wide are rendered with the tree
    layout; all others use the table layout prefixed with a newline.

    :param s1: first schema
    :param s2: second schema
    :param ignore_nullable: forwarded to the comparison builder
    :param ignore_metadata: forwarded to the comparison builder
    :raises SchemasNotEqualError: always
    """
    if check_if_schemas_are_wide(s1, s2):
        raise SchemasNotEqualError(create_schema_comparison_tree(s1, s2, ignore_nullable, ignore_metadata))

    comparison_table = create_schema_comparison_table(s1, s2, ignore_nullable, ignore_metadata)
    raise SchemasNotEqualError("\n" + comparison_table.get_string())


def assert_schema_equality(
s1: StructType, s2: StructType, ignore_nullable: bool = False, ignore_metadata: bool = False
) -> None:
Expand All @@ -37,42 +140,21 @@ def inner(s1: StructType, s2: StructType, ignore_nullable: bool, ignore_metadata
return True

if not inner(s1, s2, ignore_nullable, ignore_metadata):
t = PrettyTable(["schema1", "schema2"])
zipped = list(zip_longest(s1, s2))
for sf1, sf2 in zipped:
if are_structfields_equal(sf1, sf2, True):
t.add_row([blue(str(sf1)), blue(str(sf2))])
else:
t.add_row([sf1, sf2])
raise SchemasNotEqualError("\n" + t.get_string())
handle_schemas_not_equal(s1, s2, ignore_nullable, ignore_metadata)


# deprecate this
# perhaps it is a little faster, but do we really need this?
# I think schema equality operations are really fast to begin with
def assert_basic_schema_equality(s1: StructType, s2: StructType) -> None:
if s1 != s2:
t = PrettyTable(["schema1", "schema2"])
zipped = list(zip_longest(s1, s2))
for sf1, sf2 in zipped:
if sf1 == sf2:
t.add_row([blue(str(sf1)), blue(str(sf2))])
else:
t.add_row([sf1, sf2])
raise SchemasNotEqualError("\n" + t.get_string())
handle_schemas_not_equal(s1, s2, ignore_nullable=False, ignore_metadata=False)


# deprecate this. ignore_nullable should be a flag.
def assert_schema_equality_ignore_nullable(s1: StructType, s2: StructType) -> None:
if not are_schemas_equal_ignore_nullable(s1, s2):
t = PrettyTable(["schema1", "schema2"])
zipped = list(zip_longest(s1, s2))
for sf1, sf2 in zipped:
if are_structfields_equal(sf1, sf2, True):
t.add_row([blue(str(sf1)), blue(str(sf2))])
else:
t.add_row([sf1, sf2])
raise SchemasNotEqualError("\n" + t.get_string())
handle_schemas_not_equal(s1, s2, ignore_nullable=True, ignore_metadata=False)


# deprecate this. ignore_nullable should be a flag.
Expand Down
2 changes: 1 addition & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
'\nschema1 schema2\n\x1b[34m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[34m|-- age: integer (nullable = true) |-- age: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_number: integer (nullable = true) |-- fav_number: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_numbers: array (nullable = true) |-- fav_numbers: array (nullable = true)\x1b[0m\n\x1b[31m|-- fav_colors: struct (nullable = true) |-- fav_colors: struct (nullable = true)\x1b[0m\n\x1b[31m |-- red: integer (nullable = true) |-- orange: integer (nullable = true)\x1b[0m\n\x1b[34m |-- green: integer (nullable = true) |-- green: integer (nullable = true)\x1b[0m\n\x1b[31m |-- blue: integer (nullable = true) |-- yellow: integer (nullable = true)\x1b[0m\n\x1b[0m'
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
'\nschema1 schema2\n\x1b[34m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[34m|-- age: integer (nullable = true) |-- age: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_number: integer (nullable = true) |-- fav_number: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_numbers: array (nullable = true) |-- fav_numbers: array (nullable = true)\x1b[0m\n\x1b[31m|-- fav_colors: struct (nullable = true) |-- fav_colors: struct (nullable = true)\x1b[0m\n\x1b[31m |-- red: integer (nullable = true) |-- orange: integer (nullable = true)\x1b[0m\n\x1b[34m |-- green: integer (nullable = true) |-- green: integer (nullable = true)\x1b[0m\n\x1b[31m |-- blue: integer (nullable = true) |-- yellow: integer (nullable = true)\x1b[0m\n\x1b[31m |-- purple: integer (nullable = true)\x1b[0m\n\x1b[31m |-- phone_number: string (nullable = true)\x1b[0m\n\x1b[0m'
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
'\nschema1 schema2\n\x1b[34m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[34m|-- age: integer (nullable = true) |-- age: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_number: integer (nullable = true) |-- fav_number: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_numbers: array (nullable = true) |-- fav_numbers: array (nullable = true)\x1b[0m\n\x1b[31m|-- fav_colors: struct (nullable = true) |-- fav_colors: struct (nullable = true)\x1b[0m\n\x1b[31m |-- red: integer (nullable = true) |-- orange: integer (nullable = true)\x1b[0m\n\x1b[34m |-- green: integer (nullable = true) |-- green: integer (nullable = true)\x1b[0m\n\x1b[31m |-- blue: integer (nullable = true) |-- yellow: integer (nullable = true)\x1b[0m\n\x1b[0m'
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
'\nschema1 schema2\n\x1b[34m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[34m|-- age: integer (nullable = true) |-- age: integer (nullable = false)\x1b[0m\n\x1b[34m|-- fav_number: integer (nullable = true) |-- fav_number: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_numbers: array (nullable = true) |-- fav_numbers: array (nullable = false)\x1b[0m\n\x1b[31m|-- fav_colors: struct (nullable = true) |-- fav_colors: struct (nullable = true)\x1b[0m\n\x1b[31m |-- red: integer (nullable = true) |-- orange: integer (nullable = true)\x1b[0m\n\x1b[34m |-- green: integer (nullable = true) |-- green: integer (nullable = false)\x1b[0m\n\x1b[31m |-- blue: integer (nullable = true) |-- yellow: integer (nullable = true)\x1b[0m\n\x1b[0m'
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
'\nschema1 schema2\n\x1b[34m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[31m|-- fav_genres: struct (nullable = true) |-- fav_genres: struct (nullable = true)\x1b[0m\n\x1b[31m |-- rock: struct (nullable = true) |-- rock: struct (nullable = true)\x1b[0m\n\x1b[34m |-- metal: integer (nullable = true) |-- metal: integer (nullable = true)\x1b[0m\n\x1b[31m |-- punk: integer (nullable = true) |-- classic: integer (nullable = true)\x1b[0m\n\x1b[34m |-- electronic: struct (nullable = true) |-- electronic: struct (nullable = true)\x1b[0m\n\x1b[34m |-- house: integer (nullable = true) |-- house: integer (nullable = true)\x1b[0m\n\x1b[34m |-- dubstep: integer (nullable = true) |-- dubstep: integer (nullable = true)\x1b[0m\n\x1b[31m |-- pop: struct (nullable = true)\x1b[0m\n\x1b[31m |-- pop: integer (nullable = true)\x1b[0m\n\x1b[0m'
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
'\nschema1 schema2\n\x1b[31m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[34m|-- age: integer (nullable = true) |-- age: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_number: integer (nullable = true) |-- fav_number: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_numbers: array (nullable = true) |-- fav_numbers: array (nullable = true)\x1b[0m\n\x1b[31m|-- fav_colors: struct (nullable = true) |-- fav_colors: struct (nullable = true)\x1b[0m\n\x1b[31m |-- red: integer (nullable = true) |-- orange: integer (nullable = true)\x1b[0m\n\x1b[34m |-- green: integer (nullable = true) |-- green: integer (nullable = true)\x1b[0m\n\x1b[31m |-- blue: integer (nullable = true) |-- yellow: integer (nullable = true)\x1b[0m\n\x1b[0m'
Loading

0 comments on commit 50c2411

Please sign in to comment.