From 0b6da30aff2416e5b4fa9c2170285afb59031b53 Mon Sep 17 00:00:00 2001 From: Gokul-Radhakrishnan <38651133+Gokul-Radhakrishnan@users.noreply.github.com> Date: Tue, 22 Oct 2024 07:57:46 +0530 Subject: [PATCH] Implement Cryptographic hash functions (#788) * Implement Cryptographic hash functions Signed-off-by: Gokul R * update documentation Signed-off-by: Gokul R * added integration tests and updated readme file Signed-off-by: Gokul R * format the code Signed-off-by: Gokul R * fix integration tests Signed-off-by: Gokul R --------- Signed-off-by: Gokul R Signed-off-by: Gokul-Radhakrishnan <38651133+Gokul-Radhakrishnan@users.noreply.github.com> --- docs/ppl-lang/PPL-Example-Commands.md | 4 + docs/ppl-lang/README.md | 2 + docs/ppl-lang/functions/ppl-cryptographic.md | 77 +++++++++++++++++++ .../FlintSparkPPLBuiltinFunctionITSuite.scala | 36 +++++++++ .../src/main/antlr4/OpenSearchPPLLexer.g4 | 5 ++ .../src/main/antlr4/OpenSearchPPLParser.g4 | 8 ++ .../function/BuiltinFunctionName.java | 5 ++ .../ppl/utils/BuiltinFunctionTranslator.java | 29 ++----- ...ographicFunctionsTranslatorTestSuite.scala | 69 +++++++++++++++++ 9 files changed, 211 insertions(+), 24 deletions(-) create mode 100644 docs/ppl-lang/functions/ppl-cryptographic.md create mode 100644 ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanCryptographicFunctionsTranslatorTestSuite.scala diff --git a/docs/ppl-lang/PPL-Example-Commands.md b/docs/ppl-lang/PPL-Example-Commands.md index 2bd56df89..d161613a6 100644 --- a/docs/ppl-lang/PPL-Example-Commands.md +++ b/docs/ppl-lang/PPL-Example-Commands.md @@ -97,6 +97,10 @@ Assumptions: `a`, `b`, `c` are existing fields in `table` - `source = table | eval f = case(a = 0, 'zero', a = 1, 'one', a = 2, 'two', a = 3, 'three', a = 4, 'four', a = 5, 'five', a = 6, 'six', a = 7, 'se7en', a = 8, 'eight', a = 9, 'nine')` - `source = table | eval f = case(a = 0, 'zero', a = 1, 'one' else 'unknown')` - `source = table | eval f = case(a = 0, 'zero', a 
= 1, 'one' else concat(a, ' is an incorrect binary digit'))` +- `source = table | eval digest = md5(fieldName) | fields digest` +- `source = table | eval digest = sha1(fieldName) | fields digest` +- `source = table | eval digest = sha2(fieldName,256) | fields digest` +- `source = table | eval digest = sha2(fieldName,512) | fields digest` #### Fillnull Assumptions: `a`, `b`, `c`, `d`, `e` are existing fields in `table` diff --git a/docs/ppl-lang/README.md b/docs/ppl-lang/README.md index 9054a79f5..fd7c36605 100644 --- a/docs/ppl-lang/README.md +++ b/docs/ppl-lang/README.md @@ -81,6 +81,8 @@ For additional examples see the next [documentation](PPL-Example-Commands.md). - [`Type Conversion Functions`](functions/ppl-conversion.md) + - [`Cryptographic Functions`](functions/ppl-cryptographic.md) + --- ### PPL On Spark diff --git a/docs/ppl-lang/functions/ppl-cryptographic.md b/docs/ppl-lang/functions/ppl-cryptographic.md new file mode 100644 index 000000000..ecabc624c --- /dev/null +++ b/docs/ppl-lang/functions/ppl-cryptographic.md @@ -0,0 +1,77 @@ +## PPL Cryptographic Functions + +### `MD5` + +**Description** + +Calculates the MD5 digest and returns the value as a 32 character hex string. 
+ +Usage: `md5('hello')` + +**Argument type:** +- STRING +- Return type: **STRING** + +Example: + + os> source=people | eval `MD5('hello')` = MD5('hello') | fields `MD5('hello')` + fetched rows / total rows = 1/1 + +----------------------------------+ + | MD5('hello') | + |----------------------------------| + | 5d41402abc4b2a76b9719d911017c592 | + +----------------------------------+ + +### `SHA1` + +**Description** + +Returns the hex string result of SHA-1. + +Usage: `sha1('hello')` + +**Argument type:** +- STRING +- Return type: **STRING** + +Example: + + os> source=people | eval `SHA1('hello')` = SHA1('hello') | fields `SHA1('hello')` + fetched rows / total rows = 1/1 + +------------------------------------------+ + | SHA1('hello') | + |------------------------------------------| + | aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d | + +------------------------------------------+ + +### `SHA2` + +**Description** + +Returns the hex string result of the SHA-2 family of hash functions (SHA-224, SHA-256, SHA-384, and SHA-512). 
The numBits argument indicates the desired bit length of the result and must be 224, 256, 384, or 512. + +Usage: `sha2('hello',256)` + +Usage: `sha2('hello',512)` + +**Argument type:** +- STRING, INTEGER +- Return type: **STRING** + +Example: + + os> source=people | eval `SHA2('hello',256)` = SHA2('hello',256) | fields `SHA2('hello',256)` + fetched rows / total rows = 1/1 + +------------------------------------------------------------------+ + | SHA2('hello',256) | + |------------------------------------------------------------------| + | 2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824 | + +------------------------------------------------------------------+ + + os> source=people | eval `SHA2('hello',512)` = SHA2('hello',512) | fields `SHA2('hello',512)` + fetched rows / total rows = 1/1 + +----------------------------------------------------------------------------------------------------------------------------------+ + | SHA2('hello',512) | + |----------------------------------------------------------------------------------------------------------------------------------| + | 9b71d224bd62f3785d96d46ad3ea3d73319bfbc2890caadae2dff72519673ca72323c3d99ba5c11d7c7acc6e14b8c5da0c4663475c2e5c3adef46f73bcdec043 | + +----------------------------------------------------------------------------------------------------------------------------------+ \ No newline at end of file diff --git a/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLBuiltinFunctionITSuite.scala b/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLBuiltinFunctionITSuite.scala index 67e799c00..763c2411b 100644 --- a/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLBuiltinFunctionITSuite.scala +++ b/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLBuiltinFunctionITSuite.scala @@ -785,6 +785,42 @@ class FlintSparkPPLBuiltinFunctionITSuite 
assert(results.sameElements(expectedResults)) } + test("test cryptographic hash functions - md5") { + val frame = sql(s""" + | source = $testTable | eval a = md5('Spark') = '8cde774d6f7333752ed72cacddb05126' | fields age, a + | """.stripMargin) + + val results: Array[Row] = frame.collect() + val expectedResults: Array[Row] = + Array(Row(70, true), Row(30, true), Row(25, true), Row(20, true)) + implicit val rowOrdering: Ordering[Row] = Ordering.by[Row, Integer](_.getAs[Integer](0)) + assert(results.sorted.sameElements(expectedResults.sorted)) + } + + test("test cryptographic hash functions - sha1") { + val frame = sql(s""" + | source = $testTable | eval a = sha1('Spark') = '85f5955f4b27a9a4c2aab6ffe5d7189fc298b92c' | fields age, a + | """.stripMargin) + + val results: Array[Row] = frame.collect() + val expectedResults: Array[Row] = + Array(Row(70, true), Row(30, true), Row(25, true), Row(20, true)) + implicit val rowOrdering: Ordering[Row] = Ordering.by[Row, Integer](_.getAs[Integer](0)) + assert(results.sorted.sameElements(expectedResults.sorted)) + } + + test("test cryptographic hash functions - sha2") { + val frame = sql(s""" + | source = $testTable | eval a = sha2('Spark',256) = '529bc3b07127ecb7e53a4dcf1991d9152c24537d919178022b2c42657f79a26b' | fields age, a + | """.stripMargin) + + val results: Array[Row] = frame.collect() + val expectedResults: Array[Row] = + Array(Row(70, true), Row(30, true), Row(25, true), Row(20, true)) + implicit val rowOrdering: Ordering[Row] = Ordering.by[Row, Integer](_.getAs[Integer](0)) + assert(results.sorted.sameElements(expectedResults.sorted)) + } + // Todo // +---------------------------------------+ // | Below tests are not supported (cast) | diff --git a/ppl-spark-integration/src/main/antlr4/OpenSearchPPLLexer.g4 b/ppl-spark-integration/src/main/antlr4/OpenSearchPPLLexer.g4 index f62553d4c..4494ee72b 100644 --- a/ppl-spark-integration/src/main/antlr4/OpenSearchPPLLexer.g4 +++ 
b/ppl-spark-integration/src/main/antlr4/OpenSearchPPLLexer.g4 @@ -280,6 +280,11 @@ RADIANS: 'RADIANS'; SIN: 'SIN'; TAN: 'TAN'; +// CRYPTOGRAPHIC FUNCTIONS +MD5: 'MD5'; +SHA1: 'SHA1'; +SHA2: 'SHA2'; + // DATE AND TIME FUNCTIONS ADDDATE: 'ADDDATE'; ADDTIME: 'ADDTIME'; diff --git a/ppl-spark-integration/src/main/antlr4/OpenSearchPPLParser.g4 b/ppl-spark-integration/src/main/antlr4/OpenSearchPPLParser.g4 index e0672690d..064688983 100644 --- a/ppl-spark-integration/src/main/antlr4/OpenSearchPPLParser.g4 +++ b/ppl-spark-integration/src/main/antlr4/OpenSearchPPLParser.g4 @@ -508,6 +508,7 @@ evalFunctionName | systemFunctionName | positionFunctionName | coalesceFunctionName + | cryptographicFunctionName ; functionArgs @@ -623,6 +624,12 @@ trigonometricFunctionName | TAN ; +cryptographicFunctionName + : MD5 + | SHA1 + | SHA2 + ; + dateTimeFunctionName : ADDDATE | ADDTIME @@ -954,6 +961,7 @@ keywordsCanBeId | textFunctionName | mathematicalFunctionName | positionFunctionName + | cryptographicFunctionName // commands | SEARCH | DESCRIBE diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java index 5d92f4b58..1b41a3df8 100644 --- a/ppl-spark-integration/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java +++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java @@ -52,6 +52,11 @@ public enum BuiltinFunctionName { SIN(FunctionName.of("sin")), TAN(FunctionName.of("tan")), + /** Cryptographic Functions. */ + MD5(FunctionName.of("md5")), + SHA1(FunctionName.of("sha1")), + SHA2(FunctionName.of("sha2")), + /** Date and Time Functions. 
*/ ADDDATE(FunctionName.of("adddate")), // ADDTIME(FunctionName.of("addtime")), diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/BuiltinFunctionTranslator.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/BuiltinFunctionTranslator.java index 7c5b0fad1..485ccb522 100644 --- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/BuiltinFunctionTranslator.java +++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/BuiltinFunctionTranslator.java @@ -13,30 +13,7 @@ import java.util.List; import java.util.Map; -import static org.opensearch.sql.expression.function.BuiltinFunctionName.ADD; -import static org.opensearch.sql.expression.function.BuiltinFunctionName.ADDDATE; -import static org.opensearch.sql.expression.function.BuiltinFunctionName.DATEDIFF; -import static org.opensearch.sql.expression.function.BuiltinFunctionName.DAY_OF_MONTH; -import static org.opensearch.sql.expression.function.BuiltinFunctionName.COALESCE; -import static org.opensearch.sql.expression.function.BuiltinFunctionName.SUBTRACT; -import static org.opensearch.sql.expression.function.BuiltinFunctionName.MULTIPLY; -import static org.opensearch.sql.expression.function.BuiltinFunctionName.DIVIDE; -import static org.opensearch.sql.expression.function.BuiltinFunctionName.MODULUS; -import static org.opensearch.sql.expression.function.BuiltinFunctionName.DAY_OF_WEEK; -import static org.opensearch.sql.expression.function.BuiltinFunctionName.DAY_OF_YEAR; -import static org.opensearch.sql.expression.function.BuiltinFunctionName.HOUR_OF_DAY; -import static org.opensearch.sql.expression.function.BuiltinFunctionName.IS_NOT_NULL; -import static org.opensearch.sql.expression.function.BuiltinFunctionName.IS_NULL; -import static org.opensearch.sql.expression.function.BuiltinFunctionName.LENGTH; -import static org.opensearch.sql.expression.function.BuiltinFunctionName.LOCALTIME; -import static 
org.opensearch.sql.expression.function.BuiltinFunctionName.MINUTE_OF_HOUR; -import static org.opensearch.sql.expression.function.BuiltinFunctionName.MONTH_OF_YEAR; -import static org.opensearch.sql.expression.function.BuiltinFunctionName.SECOND_OF_MINUTE; -import static org.opensearch.sql.expression.function.BuiltinFunctionName.SUBDATE; -import static org.opensearch.sql.expression.function.BuiltinFunctionName.SYSDATE; -import static org.opensearch.sql.expression.function.BuiltinFunctionName.TRIM; -import static org.opensearch.sql.expression.function.BuiltinFunctionName.WEEK; -import static org.opensearch.sql.expression.function.BuiltinFunctionName.WEEK_OF_YEAR; +import static org.opensearch.sql.expression.function.BuiltinFunctionName.*; import static org.opensearch.sql.ppl.utils.DataTypeTransformer.seq; import static scala.Option.empty; @@ -68,6 +45,10 @@ public interface BuiltinFunctionTranslator { .put(DATEDIFF, "datediff") .put(LOCALTIME, "localtimestamp") .put(SYSDATE, "now") + // Cryptographic functions + .put(MD5, "md5") + .put(SHA1, "sha1") + .put(SHA2, "sha2") // condition functions .put(IS_NULL, "isnull") .put(IS_NOT_NULL, "isnotnull") diff --git a/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanCryptographicFunctionsTranslatorTestSuite.scala b/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanCryptographicFunctionsTranslatorTestSuite.scala new file mode 100644 index 000000000..a3f163de9 --- /dev/null +++ b/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanCryptographicFunctionsTranslatorTestSuite.scala @@ -0,0 +1,69 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.flint.spark.ppl + +import org.opensearch.flint.spark.ppl.PlaneUtils.plan +import org.opensearch.sql.ppl.{CatalystPlanContext, CatalystQueryPlanVisitor} +import org.opensearch.sql.ppl.utils.DataTypeTransformer.seq +import 
org.scalatest.matchers.should.Matchers + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedFunction, UnresolvedRelation, UnresolvedStar} +import org.apache.spark.sql.catalyst.expressions.{Alias, EqualTo, GreaterThan, GreaterThanOrEqual, LessThan, LessThanOrEqual, Literal, Not} +import org.apache.spark.sql.catalyst.plans.PlanTest +import org.apache.spark.sql.catalyst.plans.logical.{Filter, Project} + +class PPLLogicalPlanCryptographicFunctionsTranslatorTestSuite + extends SparkFunSuite + with PlanTest + with LogicalPlanTestUtils + with Matchers { + + private val planTransformer = new CatalystQueryPlanVisitor() + private val pplParser = new PPLSyntaxParser() + + test("test md5") { + val context = new CatalystPlanContext + val logPlan = planTransformer.visit(plan(pplParser, "source=t a = md5(b)"), context) + + val table = UnresolvedRelation(Seq("t")) + val filterExpr = EqualTo( + UnresolvedAttribute("a"), + UnresolvedFunction("md5", seq(UnresolvedAttribute("b")), isDistinct = false)) + val filterPlan = Filter(filterExpr, table) + val projectList = Seq(UnresolvedStar(None)) + val expectedPlan = Project(projectList, filterPlan) + comparePlans(expectedPlan, logPlan, false) + } + + test("test sha1") { + val context = new CatalystPlanContext + val logPlan = planTransformer.visit(plan(pplParser, "source=t a = sha1(b)"), context) + + val table = UnresolvedRelation(Seq("t")) + val filterExpr = EqualTo( + UnresolvedAttribute("a"), + UnresolvedFunction("sha1", seq(UnresolvedAttribute("b")), isDistinct = false)) + val filterPlan = Filter(filterExpr, table) + val projectList = Seq(UnresolvedStar(None)) + val expectedPlan = Project(projectList, filterPlan) + comparePlans(expectedPlan, logPlan, false) + } + + test("test sha2") { + val context = new CatalystPlanContext + val logPlan = planTransformer.visit(plan(pplParser, "source=t a = sha2(b,256)"), context) + + val table = UnresolvedRelation(Seq("t")) + val 
filterExpr = EqualTo( + UnresolvedAttribute("a"), + UnresolvedFunction("sha2", seq(UnresolvedAttribute("b"), Literal(256)), isDistinct = false)) + val filterPlan = Filter(filterExpr, table) + val projectList = Seq(UnresolvedStar(None)) + val expectedPlan = Project(projectList, filterPlan) + comparePlans(expectedPlan, logPlan, false) + } +}