From 5db7e0a4f7e3f8e268e51f39a4401540193445fc Mon Sep 17 00:00:00 2001 From: MrPowers Date: Thu, 21 Mar 2019 23:22:13 +0000 Subject: [PATCH] Add a better schema mismatch error message --- README.md | 8 ++--- .../spark/fast/tests/DatasetComparer.scala | 23 ++++++++++++-- .../spark/fast/tests/ExamplesTest.scala | 31 +++++++++++++++++++ 3 files changed, 56 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 0b8b198..fcee332 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,10 @@ # spark-fast-tests -A fast, test framework independent Apache Spark testing helper library with beautifully formatted error messages! +A fast Apache Spark testing helper library with beautifully formatted error messages! Works with scalatest and uTest. [![Codacy Badge](https://api.codacy.com/project/badge/Grade/ab42211c18984740bee7f87c631a8f42)](https://www.codacy.com/app/MrPowers/spark-fast-tests?utm_source=github.com&utm_medium=referral&utm_content=MrPowers/spark-fast-tests&utm_campaign=Badge_Grade) -For example, the `assertSmallDatasetEquality` method can be used to compare two Datasets (or two DataFrames). +The `assertSmallDatasetEquality` method can be used to compare two Datasets (or two DataFrames). ```scala val sourceDF = Seq( @@ -47,8 +47,8 @@ The `DatasetComparer` has `assertSmallDatasetEquality` and `assertLargeDatasetEq If you only need to compare DataFrames, you can use `DataFrameComparer` with the associated `assertSmallDataFrameEquality` and `assertLargeDataFrameEquality` methods. Under the hood, `DataFrameComparer` uses the `assertSmallDatasetEquality` and `assertLargeDatasetEquality`. -*Note : comparing Datasets can be tricky since some column names might be given by spark when applying transformations. -Would this happen, use the `ignoreColumnNames` boolean to skip name verification.* +*Note: comparing Datasets can be tricky since some column names might be given by Spark when applying transformations. 
+Use the `ignoreColumnNames` boolean to skip name verification.* ## Setup diff --git a/src/main/scala/com/github/mrpowers/spark/fast/tests/DatasetComparer.scala b/src/main/scala/com/github/mrpowers/spark/fast/tests/DatasetComparer.scala index d4c3ab0..bad89ed 100644 --- a/src/main/scala/com/github/mrpowers/spark/fast/tests/DatasetComparer.scala +++ b/src/main/scala/com/github/mrpowers/spark/fast/tests/DatasetComparer.scala @@ -16,6 +16,7 @@ object DatasetComparerLike { def naiveEquality[T](o1: T, o2: T): Boolean = { o1.equals(o2) } + } trait DatasetComparer { @@ -29,6 +30,24 @@ ${expectedDS.schema} """ } + private def betterSchemaMismatchMessage[T](actualDS: Dataset[T], expectedDS: Dataset[T]): String = { + "\n" + actualDS.schema + .zipAll( + expectedDS.schema, + "", + "" + ) + .map { + case (sf1, sf2) => + if (sf1.equals(sf2)) { + ufansi.Color.Blue(s"$sf1 | $sf2") + } else { + ufansi.Color.Red(s"$sf1 | $sf2") + } + } + .mkString("\n") + } + private def countMismatchMessage(actualCount: Long, expectedCount: Long): String = { s""" Actual DataFrame Row Count: '${actualCount}' @@ -80,7 +99,7 @@ ${DataFramePrettyPrint.showString( ignoreColumnNames )) { throw DatasetSchemaMismatch( - schemaMismatchMessage( + betterSchemaMismatchMessage( actualDS, expectedDS ) @@ -123,7 +142,7 @@ ${DataFramePrettyPrint.showString( def assertLargeDatasetEquality[T: ClassTag](actualDS: Dataset[T], expectedDS: Dataset[T], equals: (T, T) => Boolean = naiveEquality _): Unit = { if (!actualDS.schema.equals(expectedDS.schema)) { throw DatasetSchemaMismatch( - schemaMismatchMessage( + betterSchemaMismatchMessage( actualDS, expectedDS ) diff --git a/src/test/scala/com/github/mrpowers/spark/fast/tests/ExamplesTest.scala b/src/test/scala/com/github/mrpowers/spark/fast/tests/ExamplesTest.scala index 50b88f4..4bbd647 100644 --- a/src/test/scala/com/github/mrpowers/spark/fast/tests/ExamplesTest.scala +++ b/src/test/scala/com/github/mrpowers/spark/fast/tests/ExamplesTest.scala @@ -37,6 +37,37 @@ // // 
} // +// "error when schemas don't match" - { +// +// val sourceDF = spark.createDF( +// List( +// (1, "a"), +// (5, "b") +// ), +// List( +// ("number", IntegerType, true), +// ("letter", StringType, true) +// ) +// ) +// +// val expectedDF = spark.createDF( +// List( +// (1, "a"), +// (5, "b") +// ), +// List( +// ("num", IntegerType, true), +// ("letter", StringType, true) +// ) +// ) +// +// assertSmallDatasetEquality( +// sourceDF, +// expectedDF +// ) +// +// } +// // } // // }