From 79ac58242f37ada213878f2e35d97e66a4cc263b Mon Sep 17 00:00:00 2001 From: nicobao Date: Fri, 18 Oct 2024 11:19:59 +0800 Subject: [PATCH 1/2] [INLONG-11369][Sort] KV split has error when there is a escape char without before & and = in text --- .../inlong/sort/formats/util/StringUtils.java | 99 ++++++++++--------- .../sort/formats/common/StringUtilsTest.java | 21 +++- 2 files changed, 71 insertions(+), 49 deletions(-) diff --git a/inlong-sort/sort-formats/format-common/src/main/java/org/apache/inlong/sort/formats/util/StringUtils.java b/inlong-sort/sort-formats/format-common/src/main/java/org/apache/inlong/sort/formats/util/StringUtils.java index 3ea6678ca10..c0a706d3860 100644 --- a/inlong-sort/sort-formats/format-common/src/main/java/org/apache/inlong/sort/formats/util/StringUtils.java +++ b/inlong-sort/sort-formats/format-common/src/main/java/org/apache/inlong/sort/formats/util/StringUtils.java @@ -100,15 +100,25 @@ public static List> splitKv( */ int kvState = STATE_KEY; - char lastCh = 0; + char nextCh = 0; for (int i = 0; i < text.length(); ++i) { char ch = text.charAt(i); + if ((i + 1) < text.length()) { + nextCh = text.charAt(i + 1); + } else { + nextCh = 0; + } if (ch == kvDelimiter) { switch (state) { + // match previous kv delimiter first when there are more than one kvDelimiter case STATE_KEY: - key = stringBuilder.toString(); - stringBuilder.setLength(0); - state = STATE_VALUE; + if (i == 0) { + stringBuilder.append(ch); + } else { + key = stringBuilder.toString(); + stringBuilder.setLength(0); + state = STATE_VALUE; + } break; case STATE_VALUE: stringBuilder.append(ch); @@ -124,24 +134,19 @@ public static List> splitKv( } else if (ch == entryDelimiter) { switch (state) { case STATE_KEY: - key = lastKey; - if (lastValue == null) { - value = ch + stringBuilder.toString(); - } else { - value = lastValue + ch + stringBuilder.toString(); - } - fields.put(key, value); - lastKey = key; - lastValue = value; - stringBuilder.setLength(0); + stringBuilder.append(ch); break; case STATE_VALUE: - value = stringBuilder.toString(); - fields.put(key, value); - lastKey = key; - lastValue = value; - stringBuilder.setLength(0); - state = STATE_KEY; + if (nextCh == entryDelimiter) { + stringBuilder.append(ch); + } else { + value = stringBuilder.toString(); + fields.put(key, value); + lastKey = key; + lastValue = value; + stringBuilder.setLength(0); + state = STATE_KEY; + } break; case STATE_ESCAPING: stringBuilder.append(ch); @@ -154,12 +159,6 @@ public static List> splitKv( } else if (escapeChar != null && ch == escapeChar) { switch (state) { case STATE_KEY: - if (lastCh != 0) { - stringBuilder.append(lastCh); - } - kvState = state; - state = STATE_ESCAPING; - break; case STATE_VALUE: kvState = state; state = STATE_ESCAPING; @@ -175,12 +174,6 @@ public static List> splitKv( } else if (quoteChar != null && ch == quoteChar) { switch (state) { case STATE_KEY: - if (lastCh != 0) { - stringBuilder.append(lastCh); - } - kvState = state; - state = STATE_QUOTING; - break; case STATE_VALUE: kvState = state; state = STATE_QUOTING; @@ -196,20 +189,26 @@ public static List> splitKv( } else if (lineDelimiter != null && ch == lineDelimiter) { switch (state) { case STATE_KEY: + String remainingKey = stringBuilder.toString(); key = lastKey; - stringBuilder.append(lastValue).append(lastCh); + stringBuilder.setLength(0); + stringBuilder.append(lastValue).append(entryDelimiter).append(remainingKey); value = stringBuilder.toString(); fields.put(key, value); + Map copyFields = new HashMap<>(); + copyFields.putAll(fields); + lines.add(copyFields); + stringBuilder.setLength(0); + fields.clear(); lastKey = null; lastValue = null; - stringBuilder.setLength(0); break; case STATE_VALUE: lastKey = null; lastValue = null; value = stringBuilder.toString(); fields.put(key, value); - Map copyFields = new HashMap<>(); + copyFields = new HashMap<>(); copyFields.putAll(fields); lines.add(copyFields); stringBuilder.setLength(0); @@ -226,14 +225,22 @@ public static List> splitKv( } } else { stringBuilder.append(ch); + switch (state) { + case STATE_ESCAPING: + state = kvState; + } } - lastCh = ch; } switch (state) { case STATE_KEY: if (lastKey != null && lastValue != null && text != null) { - fields.put(lastKey, lastValue + lastCh); + String remainingKey = stringBuilder.toString(); + key = lastKey; + stringBuilder.setLength(0); + stringBuilder.append(lastValue).append(entryDelimiter).append(remainingKey); + value = stringBuilder.toString(); + fields.put(key, value); } lines.add(fields); return lines; @@ -244,14 +251,18 @@ public static List> splitKv( return lines; case STATE_ESCAPING: case STATE_QUOTING: - value = stringBuilder.toString(); - String oldValue = fields.get(key); - if (value != null && !"".equals(value) - && oldValue != null && !"".equals(oldValue)) { - fields.put(key, oldValue + value); - } else if (value != null && !"".equals(value)) { - fields.put(key, value); + switch (kvState) { + case STATE_VALUE: + value = stringBuilder.toString(); + fields.put(key, value); + case STATE_KEY: + if (lastKey != null) { + value = stringBuilder.toString(); + String oldValue = fields.get(key); + fields.put(key, oldValue + entryDelimiter + value); + } } + lines.add(fields); return lines; default: diff --git a/inlong-sort/sort-formats/format-common/src/test/java/org/apache/inlong/sort/formats/common/StringUtilsTest.java b/inlong-sort/sort-formats/format-common/src/test/java/org/apache/inlong/sort/formats/common/StringUtilsTest.java index fc64811a971..b9c88ed788f 100644 --- a/inlong-sort/sort-formats/format-common/src/test/java/org/apache/inlong/sort/formats/common/StringUtilsTest.java +++ b/inlong-sort/sort-formats/format-common/src/test/java/org/apache/inlong/sort/formats/common/StringUtilsTest.java @@ -19,11 +19,13 @@ import org.apache.inlong.sort.formats.util.StringUtils; +import org.junit.Assert; import org.junit.Test; import java.util.List; import java.util.Map; +import static org.apache.inlong.sort.formats.util.StringUtils.splitKv; import static org.junit.Assert.assertEquals; public class StringUtilsTest { @@ -55,17 +57,17 @@ public void testSplitKvString() { '=', '\\', '\'', '\n'); assertEquals("=", map4.get(0).get("name")); assertEquals("20&&", map4.get(0).get("age")); - assertEquals("=", map4.get(0).get("name1")); - assertEquals("20&&", map4.get(0).get("age1")); + assertEquals("=", map4.get(1).get("name1")); + assertEquals("20&&", map4.get(1).get("age1")); String kvString5 = "name==&age=20&&\nname1==&age1=20&&&value=aaa&dddd&"; List> map5 = StringUtils.splitKv(kvString5, '&', '=', '\\', '\'', '\n'); assertEquals("=", map5.get(0).get("name")); assertEquals("20&&", map5.get(0).get("age")); - assertEquals("=", map5.get(0).get("name1")); - assertEquals("20&&", map5.get(0).get("age1")); - assertEquals("aaa&dddd&", map5.get(0).get("value")); + assertEquals("=", map5.get(1).get("name1")); + assertEquals("20&&", map5.get(1).get("age1")); + assertEquals("aaa&dddd&", map5.get(1).get("value")); String kvString6 = "name==&age=20&&\\"; List> map6 = StringUtils.splitKv(kvString6, '&', @@ -153,4 +155,13 @@ public void testSplitCsvStringWithMaxFields() { assertEquals("home", csv1Array4[2][1]); assertEquals("home", csv1Array4[2][2]); } + + @Test + public void testKvScapeCharSplit() { + String text = "k1=v1&\nk\\2=v2\\&&k3=v3"; + Map kvMap = splitKv(text, '&', '=', '\\', null); + Assert.assertTrue(kvMap != null && kvMap.size() == 3); + Assert.assertTrue(kvMap.get("k3") != null); + Assert.assertTrue(kvMap.get("\nk2") != null); + } } From 8f979c9c3eba85c4d24966b4b05772aabfced930 Mon Sep 17 00:00:00 2001 From: nicobao Date: Fri, 18 Oct 2024 11:57:31 +0800 Subject: [PATCH 2/2] handle code test --- .../apache/inlong/sort/formats/util/StringUtils.java | 11 ++++------- .../apache/inlong/sort/formats/kv/KvUtilsTest.java | 2 +- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/inlong-sort/sort-formats/format-common/src/main/java/org/apache/inlong/sort/formats/util/StringUtils.java b/inlong-sort/sort-formats/format-common/src/main/java/org/apache/inlong/sort/formats/util/StringUtils.java index c0a706d3860..000d7a7175b 100644 --- a/inlong-sort/sort-formats/format-common/src/main/java/org/apache/inlong/sort/formats/util/StringUtils.java +++ b/inlong-sort/sort-formats/format-common/src/main/java/org/apache/inlong/sort/formats/util/StringUtils.java @@ -112,13 +112,9 @@ public static List> splitKv( switch (state) { // match previous kv delimiter first when there are more than one kvDelimiter case STATE_KEY: - if (i == 0) { - stringBuilder.append(ch); - } else { - key = stringBuilder.toString(); - stringBuilder.setLength(0); - state = STATE_VALUE; - } + key = stringBuilder.toString(); + stringBuilder.setLength(0); + state = STATE_VALUE; break; case STATE_VALUE: stringBuilder.append(ch); @@ -255,6 +251,7 @@ public static List> splitKv( case STATE_VALUE: value = stringBuilder.toString(); fields.put(key, value); + break; case STATE_KEY: if (lastKey != null) { value = stringBuilder.toString(); diff --git a/inlong-sort/sort-formats/format-row/format-kv/src/test/java/org/apache/inlong/sort/formats/kv/KvUtilsTest.java b/inlong-sort/sort-formats/format-row/format-kv/src/test/java/org/apache/inlong/sort/formats/kv/KvUtilsTest.java index 37bbe758aa3..953d607f9ca 100644 --- a/inlong-sort/sort-formats/format-row/format-kv/src/test/java/org/apache/inlong/sort/formats/kv/KvUtilsTest.java +++ b/inlong-sort/sort-formats/format-row/format-kv/src/test/java/org/apache/inlong/sort/formats/kv/KvUtilsTest.java @@ -210,7 +210,7 @@ public void testSplitDanglingKey1() { public void testSplitDanglingKey2() { Map kvMap = splitKv("f1&f2=3", '&', '=', null, null); - Assert.assertEquals("3", kvMap.get("f2")); + Assert.assertEquals("3", kvMap.get("f1&f2")); } @Test