Skip to content

Commit

Permalink
[INLONG-11172][SDK] Transform REGEXP_MATCHES() function supports more…
Browse files Browse the repository at this point in the history
… flags (#11174)
  • Loading branch information
emptyOVO authored Sep 24, 2024
1 parent fca812a commit b68bba2
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 18 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@
* 3) flags: one or more characters that control the behavior of a function,
* 'g' flag can be used when we want to match all the substrings that occur,
* 'i' flag to ignore case for matching,
* 'm' flag allows regular expressions to match across multiple lines
* 'x' flag to extend syntax (ignoring whitespace and comments in regular expressions)
* 'm' and 'n' flags allow regular expressions to match across multiple lines,
* 's' flag enables dot-all mode, so that '.' also matches line terminators
*/
@TransformFunction(names = {"regexp_matches"})
public class RegexpMatchesFunction implements ValueParser {
Expand Down Expand Up @@ -78,30 +79,37 @@ public Object parse(SourceData sourceData, int rowIndex, Context context) {

private String regexpMatches(String input, String regex, String flags) {
int flag = 0;
if (flags.contains("i")) {
flag |= Pattern.CASE_INSENSITIVE;
}
if (flags.contains("m")) {
flag |= Pattern.MULTILINE;
}
if (flags.contains("g")) {
flag |= Pattern.DOTALL;
if (flags != null) {
if (flags.contains("i")) {
flag |= Pattern.CASE_INSENSITIVE;
}
if (flags.contains("m") || flags.contains("n")) {
flag |= Pattern.MULTILINE;
}
if (flags.contains("s")) {
flag |= Pattern.DOTALL;
}
if (flags.contains("x")) {
flag |= Pattern.COMMENTS;
}
}

Pattern pattern = Pattern.compile(regex, flag);
Matcher matcher = pattern.matcher(input);

boolean isGlobalMatch = flags != null && flags.contains("g");
List<String[]> matches = new ArrayList<>();

while (matcher.find()) {
if (matcher.groupCount() == 0) {
matches.add(new String[]{matcher.group(0)});
} else {
String[] matchGroups = new String[matcher.groupCount()];
for (int i = 1; i <= matcher.groupCount(); i++) {
matchGroups[i - 1] = matcher.group(i) != null ? matcher.group(i) : "";
}
matches.add(matchGroups);
int groupCount = matcher.groupCount();
String[] matchGroups = new String[groupCount > 0 ? groupCount : 1];

for (int i = 0; i <= groupCount; i++) {
matchGroups[i == 0 ? 0 : i - 1] = matcher.group(i) != null ? matcher.group(i) : "";
}
matches.add(matchGroups);

if (!isGlobalMatch) {
break;
}
}
return listToString(matches);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ public void testRegexpMatchesFunction() throws Exception {
TransformProcessor<String, String> processor1 = TransformProcessor
.create(config1, SourceDecoderFactory.createCsvDecoder(csvSource),
SinkEncoderFactory.createKvEncoder(kvSink));

// case1: regexp_matches("The quick brown fox", "quick")
List<String> output1 = processor1.transform("The quick brown fox|quick|5|2|1|3", new HashMap<>());
Assert.assertEquals(1, output1.size());
Expand All @@ -46,11 +47,13 @@ public void testRegexpMatchesFunction() throws Exception {
TransformProcessor<String, String> processor2 = TransformProcessor
.create(config2, SourceDecoderFactory.createCsvDecoder(csvSource),
SinkEncoderFactory.createKvEncoder(kvSink));

// case2: regexp_matches("User: Alice, ID: 12345", "User: (\\w+), ID: (\\d+)")
List<String> output2 =
processor2.transform("User: Alice, ID: 12345|User: (\\\\w+), ID: (\\\\d+)|5|2|1|3", new HashMap<>());
Assert.assertEquals(1, output2.size());
Assert.assertEquals(output2.get(0), "result=[{\"Alice\",\"12345\"}]");

// case3: regexp_matches("User: Alice, ID: 12345User: Alice, ID: 12345;
// User: Bob, ID: 67890", "User: (\\w+), ID: (\\d+)")
List<String> output3 =
Expand All @@ -62,10 +65,12 @@ public void testRegexpMatchesFunction() throws Exception {
TransformProcessor<String, String> processor3 = TransformProcessor
.create(config3, SourceDecoderFactory.createCsvDecoder(csvSource),
SinkEncoderFactory.createKvEncoder(kvSink));

// case4: regexp_matches("foo 123 bar 456", "\\d+", "g")
List<String> output4 = processor3.transform("foo 123 bar 456|\\\\d+|g|2|1|3", new HashMap<>());
Assert.assertEquals(1, output4.size());
Assert.assertEquals(output4.get(0), "result=[{\"123\"},{\"456\"}]");

// case5: regexp_matches("User: Alice, ID: 12345User: Alice, ID: 12345;
// User: Bob, ID: 67890", "User: (\\w+),ID: (\\d+)", "g")
List<String> output5 = processor3.transform(
Expand All @@ -77,6 +82,7 @@ public void testRegexpMatchesFunction() throws Exception {
TransformProcessor<String, String> processor4 = TransformProcessor
.create(config4, SourceDecoderFactory.createCsvDecoder(csvSource),
SinkEncoderFactory.createKvEncoder(kvSink));

// case6: regexp_matches("Hello! hello World", "hello", "ig")
List<String> output6 = processor4.transform("Hello! hello World|hello|ig|2|1|3", new HashMap<>());
Assert.assertEquals(1, output6.size());
Expand All @@ -86,9 +92,16 @@ public void testRegexpMatchesFunction() throws Exception {
TransformProcessor<String, String> processor5 = TransformProcessor
.create(config5, SourceDecoderFactory.createCsvDecoder(csvSource),
SinkEncoderFactory.createKvEncoder(kvSink));

// case7: regexp_matches("First line\nSecond line", "^Second", "m")
List<String> output7 = processor5.transform("First line\\\nSecond line|^Second|m|2|1|3", new HashMap<>());
Assert.assertEquals(1, output7.size());
Assert.assertEquals(output7.get(0), "result=[{\"Second\"}]");

// without 'g' flag
// case8: regexp_matches("Hello! hello World", "hello", "i")
List<String> output8 = processor5.transform("Hello! hello World|hello|i|2|1|3", new HashMap<>());
Assert.assertEquals(1, output8.size());
Assert.assertEquals(output8.get(0), "result=[{\"Hello\"}]");
}
}

0 comments on commit b68bba2

Please sign in to comment.