diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/RegexRewritePattern.java b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/RegexRewritePattern.java new file mode 100644 index 0000000000..e106aed388 --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/RegexRewritePattern.java @@ -0,0 +1,51 @@ +package datawave.query.jexl.visitors; + +import org.apache.commons.lang3.builder.EqualsBuilder; +import org.apache.commons.lang3.builder.HashCodeBuilder; + +/** + * There may exist certain field-pattern combinations that you always want to rewrite + */ +public class RegexRewritePattern { + private String field; + private String literal; + + public RegexRewritePattern(String field, String literal) { + this.field = field; + this.literal = literal; + } + + public boolean matches(String field, String literal) { + return this.field.equals(field) && this.literal.equals(literal); + } + + public String getField() { + return field; + } + + public void setField(String field) { + this.field = field; + } + + public String getLiteral() { + return literal; + } + + public void setLiteral(String literal) { + this.literal = literal; + } + + @Override + public boolean equals(Object o) { + if (o instanceof RegexRewritePattern) { + RegexRewritePattern other = (RegexRewritePattern) o; + return new EqualsBuilder().append(field, other.field).append(literal, other.literal).isEquals(); + } + return false; + } + + @Override + public int hashCode() { + return new HashCodeBuilder().append(field).append(literal).hashCode(); + } +} diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/RewriteRegexVisitor.java b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/RewriteRegexVisitor.java new file mode 100644 index 0000000000..2a1195e71b --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/RewriteRegexVisitor.java @@ -0,0 +1,225 @@ +package 
datawave.query.jexl.visitors; + +import java.util.Collections; +import java.util.LinkedList; +import java.util.List; +import java.util.Set; + +import org.apache.commons.jexl3.parser.ASTAndNode; +import org.apache.commons.jexl3.parser.ASTERNode; +import org.apache.commons.jexl3.parser.JexlNode; +import org.apache.commons.jexl3.parser.JexlNodes; + +import datawave.query.Constants; +import datawave.query.jexl.JexlASTHelper; +import datawave.query.jexl.NodeTypeCount; +import datawave.query.jexl.nodes.QueryPropertyMarker; +import datawave.query.jexl.nodes.QueryPropertyMarker.MarkerType; +import datawave.query.jexl.visitors.pushdown.AnchorDetectionVisitor; + +/** + * Rewrites regex terms provided an anchor exists. Regex terms are wrapped in EvalOnly marker + *

+ * An anchor is an executable term or subtree. + *

+ * This visitor supports several configuration options + *

+ * IncludeFields + *

+ * Limit rewrite operations to the specified fields + *

+ *

+ * ExcludeFields + *

+ * Rewrite operations will not be applied to the specified fields. This option overrides any 'include fields' but can be superseded by + * {@link RegexRewritePattern} + *

+ *

+ * RegexRewritePattern + *

+ * In very specific cases one may want to always attempt a regex rewrite, regardless of any previously specified include or exclude fields + *

+ */ +public class RewriteRegexVisitor extends ShortCircuitBaseVisitor { + + private final Set indexedFields; + private final Set indexOnlyFields; + + private final Set includeFields; + private final Set excludeFields; + + private final Set patterns; + + private final AnchorDetectionVisitor anchorDetectionVisitor; + + /** + * Constructor with minimal args; include fields, exclude fields and patterns default to empty sets + * + * @param indexedFields + * the set of indexed fields + * @param indexOnlyFields + * the set of index only fields + */ + public RewriteRegexVisitor(Set indexedFields, Set indexOnlyFields) { + this(indexedFields, indexOnlyFields, Collections.emptySet(), Collections.emptySet(), Collections.emptySet()); + } + + /** + * Constructor with all args, including the include fields, exclude fields and patterns that govern which regex terms may be rewritten + * + * @param indexedFields + * the set of indexed fields + * @param indexOnlyFields + * the set of index only fields + */ + public RewriteRegexVisitor(Set indexedFields, Set indexOnlyFields, Set includeFields, Set excludeFields, + Set patterns) { + this.indexedFields = indexedFields; + this.indexOnlyFields = indexOnlyFields; + this.includeFields = includeFields; + this.excludeFields = excludeFields; + this.patterns = patterns; + + this.anchorDetectionVisitor = new AnchorDetectionVisitor(indexedFields, indexOnlyFields); + } + + /** + * Static entry point that applies no include fields, exclude fields or patterns + * + * @param node + * the query or subtree + * @param indexedFields + * the set of indexed fields + * @param indexOnlyFields + * the set of index only fields + * @return the modified tree + */ + public static JexlNode rewrite(JexlNode node, Set indexedFields, Set indexOnlyFields) { + return rewrite(node, indexedFields, indexOnlyFields, Collections.emptySet(), Collections.emptySet(), Collections.emptySet()); + } + + public static JexlNode rewrite(JexlNode node, Set indexedFields, Set indexOnlyFields, Set includeFields, Set excludeFields, + Set patterns) { + RewriteRegexVisitor visitor = new RewriteRegexVisitor(indexedFields, indexOnlyFields, includeFields, excludeFields, patterns); + node.jjtAccept(visitor, null); +
return node; + } + + // union is not overridden here + + @Override + public Object visit(ASTAndNode node, Object data) { + + if (data instanceof Boolean) { + return data; // short circuit repeated post-traversals + } + + if (QueryPropertyMarker.findInstance(node).isAnyType()) { + return data; // do not descend into markers + } + + // enforce a post-order traversal for maximum rewrite + node.childrenAccept(this, data); + + List anchorCandidates = new LinkedList<>(); + List anchorNonCandidates = new LinkedList<>(); + List otherCandidates = new LinkedList<>(); + + for (int i = 0; i < node.jjtGetNumChildren(); i++) { + JexlNode child = node.jjtGetChild(i); + + // this seems expensive, a visitor that returned raw counts, depth, and complexity would be nice to have + NodeTypeCount counts = NodeTypeCountVisitor.countNodes(child, ASTERNode.class); + + if (anchorDetectionVisitor.isAnchor(child)) { + if (counts.getTotal(ASTERNode.class) > 0) { + anchorCandidates.add(child); + } else { + anchorNonCandidates.add(child); + } + } else if (counts.getTotal(ASTERNode.class) > 0) { + otherCandidates.add(child); + } + } + + if (!anchorCandidates.isEmpty() || !anchorNonCandidates.isEmpty()) { + + if (!anchorNonCandidates.isEmpty()) { + // an anchor without any regex nodes exists, so all anchor candidates can be rewritten + for (JexlNode candidate : anchorCandidates) { + candidate.jjtAccept(this, true); + } + } else { + // rewrite all anchor candidates except the last one, to preserve executability + for (int i = 0; i < anchorCandidates.size() - 1; i++) { + anchorCandidates.get(i).jjtAccept(this, true); + } + } + + // if any anchor exists, rewrite other candidates + for (JexlNode otherCandidate : otherCandidates) { + otherCandidate.jjtAccept(this, true); + } + } + + return data; + } + + @Override + public Object visit(ASTERNode node, Object data) { + String field = JexlASTHelper.getIdentifier(node); + + if (isLegalRewrite(field, data)) { + + // once legality of rewrite is established make sure it's not filtered + String literal = (String)
JexlASTHelper.getLiteralValue(node); + + if (isNodeRewritableFromRules(field, literal)) { + JexlNode marker = QueryPropertyMarker.create(node, MarkerType.EVALUATION_ONLY); + JexlNodes.replaceChild(node.jjtGetParent(), node, marker); + } + } + + return data; + } + + private boolean isLegalRewrite(String field, Object data) { + // never rewrite ANY_FIELD or index-only fields + if (field.equals(Constants.ANY_FIELD) || indexOnlyFields.contains(field)) { + return false; + } + + // 1. anchor exists elsewhere + // 2. field is not indexed + return data instanceof Boolean || !indexedFields.contains(field); + } + + /** + * Determine if the node can be rewritten given any configured rules (include fields, exclude fields, patterns) + * + * @param field + * the field + * @param literal + * the literal + * @return true if the node can be rewritten + */ + private boolean isNodeRewritableFromRules(String field, String literal) { + // check patterns first because they supersede include/exclude rules + for (RegexRewritePattern pattern : patterns) { + if (pattern.matches(field, literal)) { + return true; + } + } + + // exclude fields beat include fields + if (!excludeFields.isEmpty() && excludeFields.contains(field)) { + return false; + } + + if (!includeFields.isEmpty()) { + return includeFields.contains(field); + } + + return true; + } +} diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/pushdown/AnchorDetectionVisitor.java b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/pushdown/AnchorDetectionVisitor.java new file mode 100644 index 0000000000..4740b29894 --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/pushdown/AnchorDetectionVisitor.java @@ -0,0 +1,208 @@ +package datawave.query.jexl.visitors.pushdown; + +import java.util.Set; + +import org.apache.commons.jexl3.parser.ASTAndNode; +import org.apache.commons.jexl3.parser.ASTAssignment; +import org.apache.commons.jexl3.parser.ASTEQNode; +import 
org.apache.commons.jexl3.parser.ASTERNode; +import org.apache.commons.jexl3.parser.ASTFunctionNode; +import org.apache.commons.jexl3.parser.ASTGENode; +import org.apache.commons.jexl3.parser.ASTGTNode; +import org.apache.commons.jexl3.parser.ASTJexlScript; +import org.apache.commons.jexl3.parser.ASTLENode; +import org.apache.commons.jexl3.parser.ASTLTNode; +import org.apache.commons.jexl3.parser.ASTNENode; +import org.apache.commons.jexl3.parser.ASTNRNode; +import org.apache.commons.jexl3.parser.ASTNotNode; +import org.apache.commons.jexl3.parser.ASTOrNode; +import org.apache.commons.jexl3.parser.ASTReference; +import org.apache.commons.jexl3.parser.ASTReferenceExpression; +import org.apache.commons.jexl3.parser.JexlNode; + +import datawave.query.jexl.JexlASTHelper; +import datawave.query.jexl.nodes.QueryPropertyMarker; +import datawave.query.jexl.visitors.ShortCircuitBaseVisitor; + +/** + * Determines if a subtree is an anchor for a given query + *

+ * An anchor is defined as an executable leaf or subtree. + */ +public class AnchorDetectionVisitor extends ShortCircuitBaseVisitor { + + private final Set indexedFields; + private final Set indexOnlyFields; + + /** + * Constructor that takes the field sets used to decide if a leaf is an anchor + * + * @param indexedFields + * the set of indexed query fields + * @param indexOnlyFields + * the set of index only query fields + */ + public AnchorDetectionVisitor(Set indexedFields, Set indexOnlyFields) { + this.indexedFields = indexedFields; + this.indexOnlyFields = indexOnlyFields; + } + + public boolean isAnchor(JexlNode node) { + return (boolean) node.jjtAccept(this, null); + } + + // pass through nodes, followed by assignment and negation nodes which are never anchors + + @Override + public Object visit(ASTJexlScript node, Object data) { + return node.jjtGetChild(0).jjtAccept(this, data); + } + + @Override + public Object visit(ASTReference node, Object data) { + return node.jjtGetChild(0).jjtAccept(this, data); + } + + @Override + public Object visit(ASTReferenceExpression node, Object data) { + return node.jjtGetChild(0).jjtAccept(this, data); + } + + @Override + public Object visit(ASTAssignment node, Object data) { + return false; + } + + @Override + public Object visit(ASTNotNode node, Object data) { + return false; + } + + // junction nodes + + /** + * An OrNode is considered an anchor if and only if all children are anchor nodes + * + * @param node + * a JexlNode + * @param data + * an Object + * @return True if this node is an anchor + */ + @Override + public Object visit(ASTOrNode node, Object data) { + for (int i = 0; i < node.jjtGetNumChildren(); i++) { + boolean childIsAnchor = (boolean) node.jjtGetChild(i).jjtAccept(this, data); + if (!childIsAnchor) { + return false; + } + } + return true; + } + + /** + * An AndNode is considered an anchor if at least one child node is an anchor. A query property marker is not descended into, it is resolved by its marker type instead + * + * @param node + * a JexlNode + * @param data + * an Object + * @return True if this node is an anchor + */ + @Override + public Object visit(ASTAndNode node, Object data) { +
QueryPropertyMarker.Instance instance = QueryPropertyMarker.findInstance(node); + if (instance.isAnyType()) { + return visitMarker(instance); + } + + for (int i = 0; i < node.jjtGetNumChildren(); i++) { + boolean isChildAnchor = (boolean) node.jjtGetChild(i).jjtAccept(this, data); + if (isChildAnchor) { + return true; + } + } + return false; + } + + // leaf nodes + + @Override + public Object visit(ASTEQNode node, Object data) { + return visitLeaf(node); + } + + @Override + public Object visit(ASTNENode node, Object data) { + return visitLeaf(node); + } + + @Override + public Object visit(ASTLTNode node, Object data) { + return visitLeaf(node); + } + + @Override + public Object visit(ASTGTNode node, Object data) { + return visitLeaf(node); + } + + @Override + public Object visit(ASTLENode node, Object data) { + return visitLeaf(node); + } + + @Override + public Object visit(ASTGENode node, Object data) { + return visitLeaf(node); + } + + @Override + public Object visit(ASTERNode node, Object data) { + return visitLeaf(node); + } + + @Override + public Object visit(ASTNRNode node, Object data) { + return visitLeaf(node); + } + + @Override + public Object visit(ASTFunctionNode node, Object data) { + return false; + } + + private boolean visitLeaf(JexlNode node) { + String field = JexlASTHelper.getIdentifier(node, true); + if (indexedFields.contains(field) || indexOnlyFields.contains(field)) { + if (node instanceof ASTEQNode || node instanceof ASTNENode) { + Object value = JexlASTHelper.getLiteralValue(node); + return value != null; + } + return true; + } + return false; + } + + private Object visitMarker(QueryPropertyMarker.Instance instance) { + + if (instance == null || instance.getType() == null) { + return false; + } + + // might need to handle double markers, such as delayed bounded ranges + + switch (instance.getType()) { + case BOUNDED_RANGE: + case EXCEEDED_OR: + case EXCEEDED_TERM: + case EXCEEDED_VALUE: + return true; + case DELAYED: + case EVALUATION_ONLY: 
+ default: + return false; + } + } + +} diff --git a/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java b/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java index 8fbebd477e..c3684a2095 100644 --- a/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java +++ b/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java @@ -147,6 +147,7 @@ import datawave.query.jexl.visitors.RegexIndexExpansionVisitor; import datawave.query.jexl.visitors.RewriteNegationsVisitor; import datawave.query.jexl.visitors.RewriteNullFunctionsVisitor; +import datawave.query.jexl.visitors.RewriteRegexVisitor; import datawave.query.jexl.visitors.SetMembershipVisitor; import datawave.query.jexl.visitors.SortedUIDsRequiredVisitor; import datawave.query.jexl.visitors.TermCountingVisitor; @@ -352,6 +353,11 @@ public class DefaultQueryPlanner extends QueryPlanner implements Cloneable { */ protected boolean showReducedQueryPrune = true; + /** + * Controls optimistic rewriting of regex terms as filter functions, preserving overall query executability + */ + private RegexRewriteOptions regexRewriteOptions; + // handles boilerplate operations that surround a visitor's execution (e.g., timers, logging, validating) private TimedVisitorManager visitorManager = new TimedVisitorManager(); @@ -387,6 +393,7 @@ protected DefaultQueryPlanner(DefaultQueryPlanner other) { rangeStreamClass = other.rangeStreamClass; setSourceLimit(other.sourceLimit); setPushdownThreshold(other.getPushdownThreshold()); + setRegexRewriteOptions(other.getRegexRewriteOptions()); setVisitorManager(other.getVisitorManager()); setTransformRules(other.getTransformRules() == null ? 
null : new ArrayList<>(other.transformRules)); } @@ -903,6 +910,16 @@ protected ASTJexlScript updateQueryTree(ScannerFactory scannerFactory, MetadataH config.setQueryTree(timedEnforceUniqueDisjunctionsWithinExpressions(timers, config.getQueryTree())); } + // rewrite regex nodes, optimistically + if (regexRewriteOptions != null && regexRewriteOptions.isPreExpansionEnabled()) { + // @formatter:off + RewriteRegexVisitor.rewrite(config.getQueryTree(), getIndexedFields(), indexOnlyFields, + regexRewriteOptions.getPreExpansionIncludeFields(), + regexRewriteOptions.getPreExpansionExcludeFields(), + regexRewriteOptions.getPreExpansionPatterns()); + // @formatter:on + } + if (disableBoundedLookup) { // protection mechanism. If we disable bounded ranges and have a // LT,GT or ER node, we should expand it @@ -1057,6 +1074,16 @@ protected ASTJexlScript processTree(final ASTJexlScript originalQueryTree, Shard config.setQueryTree(timedPushFunctions(timers, config.getQueryTree(), config, metadataHelper)); } + // rewrite regex nodes, optimistically + if (regexRewriteOptions != null && regexRewriteOptions.isPostExpansionEnabled()) { + // @formatter:off + RewriteRegexVisitor.rewrite(config.getQueryTree(), indexedFields, indexOnlyFields, + regexRewriteOptions.getPostExpansionIncludeFields(), + regexRewriteOptions.getPostExpansionExcludeFields(), + regexRewriteOptions.getPostExpansionPatterns()); + // @formatter:on + } + if (executableExpansion) { config.setQueryTree(timedExecutableExpansion(timers, config.getQueryTree(), config, metadataHelper)); } @@ -3278,6 +3305,14 @@ public void finalize() { } } + public RegexRewriteOptions getRegexRewriteOptions() { + return regexRewriteOptions; + } + + public void setRegexRewriteOptions(RegexRewriteOptions regexRewriteOptions) { + this.regexRewriteOptions = regexRewriteOptions; + } + protected CompositeMetadata getCompositeMetadata() { if (compositeMetadata == null && compositeMetadataCallable != null) { TraceStopwatch stopwatch = 
stageStopWatch.newStartedStopwatch(compositeMetadataCallable.stageName()); diff --git a/warehouse/query-core/src/main/java/datawave/query/planner/RegexRewriteOptions.java b/warehouse/query-core/src/main/java/datawave/query/planner/RegexRewriteOptions.java new file mode 100644 index 0000000000..966a957ec8 --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/planner/RegexRewriteOptions.java @@ -0,0 +1,87 @@ +package datawave.query.planner; + +import java.util.Collections; +import java.util.Set; + +import datawave.query.jexl.visitors.RegexRewritePattern; +import datawave.query.jexl.visitors.RewriteRegexVisitor; + +/** + * Provides fine-grain control over how the {@link RewriteRegexVisitor} operates pre and post index expansion + */ +public class RegexRewriteOptions { + + private boolean preExpansionEnabled = false; + private Set preExpansionIncludeFields = Collections.emptySet(); + private Set preExpansionExcludeFields = Collections.emptySet(); + private Set preExpansionPatterns = Collections.emptySet(); + + private boolean postExpansionEnabled = false; + private Set postExpansionIncludeFields = Collections.emptySet(); + private Set postExpansionExcludeFields = Collections.emptySet(); + private Set postExpansionPatterns = Collections.emptySet(); + + public boolean isPreExpansionEnabled() { + return preExpansionEnabled; + } + + public void setPreExpansionEnabled(boolean preExpansionEnabled) { + this.preExpansionEnabled = preExpansionEnabled; + } + + public Set getPreExpansionIncludeFields() { + return preExpansionIncludeFields; + } + + public void setPreExpansionIncludeFields(Set preExpansionIncludeFields) { + this.preExpansionIncludeFields = preExpansionIncludeFields; + } + + public Set getPreExpansionExcludeFields() { + return preExpansionExcludeFields; + } + + public void setPreExpansionExcludeFields(Set preExpansionExcludeFields) { + this.preExpansionExcludeFields = preExpansionExcludeFields; + } + + public Set getPreExpansionPatterns() { + return 
preExpansionPatterns; + } + + public void setPreExpansionPatterns(Set preExpansionPatterns) { + this.preExpansionPatterns = preExpansionPatterns; + } + + public boolean isPostExpansionEnabled() { + return postExpansionEnabled; + } + + public void setPostExpansionEnabled(boolean postExpansionEnabled) { + this.postExpansionEnabled = postExpansionEnabled; + } + + public Set getPostExpansionIncludeFields() { + return postExpansionIncludeFields; + } + + public void setPostExpansionIncludeFields(Set postExpansionIncludeFields) { + this.postExpansionIncludeFields = postExpansionIncludeFields; + } + + public Set getPostExpansionExcludeFields() { + return postExpansionExcludeFields; + } + + public void setPostExpansionExcludeFields(Set postExpansionExcludeFields) { + this.postExpansionExcludeFields = postExpansionExcludeFields; + } + + public Set getPostExpansionPatterns() { + return postExpansionPatterns; + } + + public void setPostExpansionPatterns(Set postExpansionPatterns) { + this.postExpansionPatterns = postExpansionPatterns; + } +} diff --git a/warehouse/query-core/src/test/java/datawave/query/ShapesTest.java b/warehouse/query-core/src/test/java/datawave/query/ShapesTest.java index a44295201e..6aa887d348 100644 --- a/warehouse/query-core/src/test/java/datawave/query/ShapesTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/ShapesTest.java @@ -908,11 +908,34 @@ public void testSortQueryPreIndexWithFieldCounts() throws Exception { } } + @Test + public void testRewriteRegexFromIncludes() throws Exception { + withQuery("ONLY_HEX == 'hexa' && TYPE =~ 'reg.*'"); + withExpected(Sets.newHashSet(ShapesIngest.hexagonUid)); + planAndExecuteQuery(); + assertPlannedQuery("ONLY_HEX == 'hexa' && ((_Eval_ = true) && (TYPE =~ 'reg.*'))"); + } + + @Test + public void testDoNotRewriteRegexWithExcludedField() throws Exception { + withQuery("ONLY_HEX == 'hexa' && SHAPE =~ 'hex.*'"); + withExpected(Sets.newHashSet(ShapesIngest.hexagonUid)); + planAndExecuteQuery(); + 
assertPlannedQuery("ONLY_HEX == 'hexa' && ((_Delayed_ = true) && (SHAPE =~ 'hex.*'))"); + } + + @Test + public void testRewriteRegexWithExcludedFieldBecauseOfPatternMatch() throws Exception { + withQuery("ONLY_HEX == 'hexa' && SHAPE =~ 'hexag.*'"); + withExpected(Sets.newHashSet(ShapesIngest.hexagonUid)); + planAndExecuteQuery(); + assertPlannedQuery("ONLY_HEX == 'hexa' && ((_Eval_ = true) && (SHAPE =~ 'hexag.*'))"); + } + private void disableAllSortOptions() { logic.setSortQueryPreIndexWithImpliedCounts(false); logic.setSortQueryPreIndexWithFieldCounts(false); logic.setSortQueryPostIndexWithFieldCounts(false); logic.setSortQueryPostIndexWithTermCounts(false); } - } diff --git a/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/RewriteRegexVisitorTest.java b/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/RewriteRegexVisitorTest.java new file mode 100644 index 0000000000..e77c8d6fc5 --- /dev/null +++ b/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/RewriteRegexVisitorTest.java @@ -0,0 +1,442 @@ +package datawave.query.jexl.visitors; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.fail; + +import java.util.HashSet; +import java.util.Set; + +import org.apache.commons.jexl3.parser.ASTJexlScript; +import org.apache.commons.jexl3.parser.ParseException; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import datawave.query.jexl.JexlASTHelper; + +public class RewriteRegexVisitorTest { + + private final Set indexedFields = Set.of("F", "F2", "IO", "IO2"); + private final Set indexOnlyFields = Set.of("IO", "IO2"); + + private final Set includeFields = new HashSet<>(); + private final Set excludeFields = new HashSet<>(); + + private final Set patterns = new HashSet<>(); + + @BeforeEach + public void beforeEach() { + includeFields.clear(); + excludeFields.clear(); + patterns.clear(); + } + + // A and regex + @Test + public void 
testSingleTermAndRegex() { + // term and indexed regex, rewritten only when the term is indexed or index only + test("F == 'a' && F =~ 'ba.*'", "F == 'a' && ((_Eval_ = true) && (F =~ 'ba.*'))"); + test("IO == 'a' && F =~ 'ba.*'", "IO == 'a' && ((_Eval_ = true) && (F =~ 'ba.*'))"); + test("NA == 'a' && F =~ 'ba.*'"); + + // term and index only regex is never rewritten + test("F == 'a' && IO =~ 'ba.*'"); + test("IO == 'a' && IO =~ 'ba.*'"); + test("NA == 'a' && IO =~ 'ba.*'"); + + // term and non-indexed regex is always rewritten + test("F == 'a' && NA =~ 'ba.*'", "F == 'a' && ((_Eval_ = true) && (NA =~ 'ba.*'))"); + test("IO == 'a' && NA =~ 'ba.*'", "IO == 'a' && ((_Eval_ = true) && (NA =~ 'ba.*'))"); + test("NA == 'a' && NA =~ 'ba.*'", "NA == 'a' && ((_Eval_ = true) && (NA =~ 'ba.*'))"); + } + + // A or regex + @Test + public void testSingleTermOrRegex() { + // term or indexed regex is never rewritten + test("F == 'a' || F =~ 'ba.*'"); + test("IO == 'a' || F =~ 'ba.*'"); + test("NA == 'a' || F =~ 'ba.*'"); + + // term or index only regex is never rewritten + test("F == 'a' || IO =~ 'ba.*'"); + test("IO == 'a' || IO =~ 'ba.*'"); + test("NA == 'a' || IO =~ 'ba.*'"); + + // top level union with non-indexed regex is a full table scan, but the non-indexed regex is still rewritten + test("F == 'a' || NA =~ 'ba.*'", "F == 'a' || ((_Eval_ = true) && (NA =~ 'ba.*'))"); + test("IO == 'a' || NA =~ 'ba.*'", "IO == 'a' || ((_Eval_ = true) && (NA =~ 'ba.*'))"); + test("NA == 'a' || NA =~ 'ba.*'", "NA == 'a' || ((_Eval_ = true) && (NA =~ 'ba.*'))"); + } + + // (A and B) or regex + @Test + public void testNestedIntersectionOrRegex() { + // all combinations of nested intersection and indexed regex + test("(F == 'a' && F == 'b') || F =~ 'ba.*'"); + test("(F == 'a' && IO == 'b') || F =~ 'ba.*'"); + test("(F == 'a' && NA == 'b') || F =~ 'ba.*'"); + test("(IO == 'a' && IO == 'b') || F =~ 'ba.*'"); + test("(IO == 'a' && NA == 'b') || F =~ 'ba.*'"); + test("(NA == 'a' && NA == 'b') || F =~ 'ba.*'"); + + // all combinations of nested intersection and index only regex +
test("(F == 'a' && F == 'b') || IO =~ 'ba.*'"); + test("(F == 'a' && IO == 'b') || IO =~ 'ba.*'"); + test("(F == 'a' && NA == 'b') || IO =~ 'ba.*'"); + test("(IO == 'a' && IO == 'b') || IO =~ 'ba.*'"); + test("(IO == 'a' && NA == 'b') || IO =~ 'ba.*'"); + test("(NA == 'a' && NA == 'b') || IO =~ 'ba.*'"); + + // the input queries are non-executable, non-indexed field still gets rewritten + // all combinations of nested intersection and non-indexed regex + test("(F == 'a' && F == 'b') || NA =~ 'ba.*'", "(F == 'a' && F == 'b') || ((_Eval_ = true) && (NA =~ 'ba.*'))"); + test("(F == 'a' && IO == 'b') || NA =~ 'ba.*'", "(F == 'a' && IO == 'b') || ((_Eval_ = true) && (NA =~ 'ba.*'))"); + test("(F == 'a' && NA == 'b') || NA =~ 'ba.*'", "(F == 'a' && NA == 'b') || ((_Eval_ = true) && (NA =~ 'ba.*'))"); + test("(IO == 'a' && IO == 'b') || NA =~ 'ba.*'", "(IO == 'a' && IO == 'b') || ((_Eval_ = true) && (NA =~ 'ba.*'))"); + test("(IO == 'a' && NA == 'b') || NA =~ 'ba.*'", "(IO == 'a' && NA == 'b') || ((_Eval_ = true) && (NA =~ 'ba.*'))"); + test("(NA == 'a' && NA == 'b') || NA =~ 'ba.*'", "(NA == 'a' && NA == 'b') || ((_Eval_ = true) && (NA =~ 'ba.*'))"); + } + + // (A or B) and regex + @Test + public void testNestedUnionAndRegex() { + // all combinations of nested union and indexed regex + test("(F == 'a' || F == 'b') && F =~ 'ba.*'", "(F == 'a' || F == 'b') && ((_Eval_ = true) && (F =~ 'ba.*'))"); + test("(F == 'a' || IO == 'b') && F =~ 'ba.*'", "(F == 'a' || IO == 'b') && ((_Eval_ = true) && (F =~ 'ba.*'))"); + test("(F == 'a' || NA == 'b') && F =~ 'ba.*'"); + test("(IO == 'a' || IO == 'b') && F =~ 'ba.*'", "(IO == 'a' || IO == 'b') && ((_Eval_ = true) && (F =~ 'ba.*'))"); + test("(IO == 'a' || NA == 'b') && F =~ 'ba.*'"); + test("(NA == 'a' || NA == 'b') && F =~ 'ba.*'"); + + // all combinations of nested union and index only regex + test("(F == 'a' || F == 'b') && IO =~ 'ba.*'"); + test("(F == 'a' || IO == 'b') && IO =~ 'ba.*'"); + test("(F == 'a' || NA ==
'b') && IO =~ 'ba.*'"); + test("(IO == 'a' || IO == 'b') && IO =~ 'ba.*'"); + test("(IO == 'a' || NA == 'b') && IO =~ 'ba.*'"); + test("(NA == 'a' || NA == 'b') && IO =~ 'ba.*'"); + + // all combinations of nested union and non-indexed regex + test("(F == 'a' || F == 'b') && NA =~ 'ba.*'", "(F == 'a' || F == 'b') && ((_Eval_ = true) && (NA =~ 'ba.*'))"); + test("(F == 'a' || IO == 'b') && NA =~ 'ba.*'", "(F == 'a' || IO == 'b') && ((_Eval_ = true) && (NA =~ 'ba.*'))"); + test("(F == 'a' || NA == 'b') && NA =~ 'ba.*'", "(F == 'a' || NA == 'b') && ((_Eval_ = true) && (NA =~ 'ba.*'))"); + test("(IO == 'a' || IO == 'b') && NA =~ 'ba.*'", "(IO == 'a' || IO == 'b') && ((_Eval_ = true) && (NA =~ 'ba.*'))"); + test("(IO == 'a' || NA == 'b') && NA =~ 'ba.*'", "(IO == 'a' || NA == 'b') && ((_Eval_ = true) && (NA =~ 'ba.*'))"); + test("(NA == 'a' || NA == 'b') && NA =~ 'ba.*'", "(NA == 'a' || NA == 'b') && ((_Eval_ = true) && (NA =~ 'ba.*'))"); + } + + // A and (B or regex) + @Test + public void testIntersectionWithNestedUnionWithSingleRegex() { + // top level indexed term, variable indexed state for nested term, indexed regex + test("F == 'a' && (F == 'b' || F =~ 'ba.*')", "F == 'a' && (F == 'b' || ((_Eval_ = true) && (F =~ 'ba.*')))"); + test("F == 'a' && (IO == 'b' || F =~ 'ba.*')", "F == 'a' && (IO == 'b' || ((_Eval_ = true) && (F =~ 'ba.*')))"); + test("F == 'a' && (NA == 'b' || F =~ 'ba.*')", "F == 'a' && (NA == 'b' || ((_Eval_ = true) && (F =~ 'ba.*')))"); + + // top level indexed term, variable indexed state for nested term, index only regex + test("F == 'a' && (F == 'b' || IO =~ 'ba.*')"); + test("F == 'a' && (IO == 'b' || IO =~ 'ba.*')"); + test("F == 'a' && (NA == 'b' || IO =~ 'ba.*')"); + + // top level indexed term, variable indexed state for nested term, non-indexed regex + test("F == 'a' && (F == 'b' || NA =~ 'ba.*')", "F == 'a' && (F == 'b' || ((_Eval_ = true) && (NA =~ 'ba.*')))"); + test("F == 'a' && (IO == 'b' || NA =~ 'ba.*')", "F == 'a' && (IO ==
'b' || ((_Eval_ = true) && (NA =~ 'ba.*')))"); + test("F == 'a' && (NA == 'b' || NA =~ 'ba.*')", "F == 'a' && (NA == 'b' || ((_Eval_ = true) && (NA =~ 'ba.*')))"); + + // top level index only term, variable indexed state for nested term, indexed regex + test("IO == 'a' && (F == 'b' || F =~ 'ba.*')", "IO == 'a' && (F == 'b' || ((_Eval_ = true) && (F =~ 'ba.*')))"); + test("IO == 'a' && (IO == 'b' || F =~ 'ba.*')", "IO == 'a' && (IO == 'b' || ((_Eval_ = true) && (F =~ 'ba.*')))"); + test("IO == 'a' && (NA == 'b' || F =~ 'ba.*')", "IO == 'a' && (NA == 'b' || ((_Eval_ = true) && (F =~ 'ba.*')))"); + + // top level index only term, variable indexed state for nested term, index only regex + test("IO == 'a' && (F == 'b' || IO =~ 'ba.*')"); + test("IO == 'a' && (IO == 'b' || IO =~ 'ba.*')"); + test("IO == 'a' && (NA == 'b' || IO =~ 'ba.*')"); + + // top level index only term, variable indexed state for nested term, non-indexed regex + test("IO == 'a' && (F == 'b' || NA =~ 'ba.*')", "IO == 'a' && (F == 'b' || ((_Eval_ = true) && (NA =~ 'ba.*')))"); + test("IO == 'a' && (IO == 'b' || NA =~ 'ba.*')", "IO == 'a' && (IO == 'b' || ((_Eval_ = true) && (NA =~ 'ba.*')))"); + test("IO == 'a' && (NA == 'b' || NA =~ 'ba.*')", "IO == 'a' && (NA == 'b' || ((_Eval_ = true) && (NA =~ 'ba.*')))"); + + // top level non-indexed term, variable indexed state for nested term, indexed regex + test("NA == 'a' && (F == 'b' || F =~ 'ba.*')"); + test("NA == 'a' && (IO == 'b' || F =~ 'ba.*')"); + test("NA == 'a' && (NA == 'b' || F =~ 'ba.*')"); + + // top level non-indexed term, variable indexed state for nested term, index only regex + test("NA == 'a' && (F == 'b' || IO =~ 'ba.*')"); + test("NA == 'a' && (IO == 'b' || IO =~ 'ba.*')"); + test("NA == 'a' && (NA == 'b' || IO =~ 'ba.*')"); + + // top level non-indexed term, variable indexed state for nested term, non-indexed regex + test("NA == 'a' && (F == 'b' || NA =~ 'ba.*')", "NA == 'a' && (F == 'b' || ((_Eval_ = true) && (NA =~ 'ba.*')))"); + 
test("NA == 'a' && (IO == 'b' || NA =~ 'ba.*')", "NA == 'a' && (IO == 'b' || ((_Eval_ = true) && (NA =~ 'ba.*')))"); + test("NA == 'a' && (NA == 'b' || NA =~ 'ba.*')", "NA == 'a' && (NA == 'b' || ((_Eval_ = true) && (NA =~ 'ba.*')))"); + } + + // A or (B and regex) -- NOTE(review): every query in this test uses '==' against 'ab.*' where the section comments describe a regex, so no '=~' term is present; confirm whether '=~' was intended + @Test + public void testUnionWithNestedIntersectionWithSingleRegex() { + // top level indexed, variable index state of nested term, indexed regex + test("F == 'a' || (F == 'b' && F == 'ab.*')"); + test("F == 'a' || (IO == 'b' && F == 'ab.*')"); + test("F == 'a' || (NA == 'b' && F == 'ab.*')"); + + // top level indexed, variable index state of nested term, index only regex + test("F == 'a' || (F == 'b' && IO == 'ab.*')"); + test("F == 'a' || (IO == 'b' && IO == 'ab.*')"); + test("F == 'a' || (NA == 'b' && IO == 'ab.*')"); + + // top level indexed, variable index state of nested term, non-indexed regex + test("F == 'a' || (F == 'b' && NA == 'ab.*')"); + test("F == 'a' || (IO == 'b' && NA == 'ab.*')"); + test("F == 'a' || (NA == 'b' && NA == 'ab.*')"); + + // top level index only, variable index state of nested term, indexed regex + test("IO == 'a' || (F == 'b' && F == 'ab.*')"); + test("IO == 'a' || (IO == 'b' && F == 'ab.*')"); + test("IO == 'a' || (NA == 'b' && F == 'ab.*')"); + + // top level index only, variable index state of nested term, index only regex + test("IO == 'a' || (F == 'b' && IO == 'ab.*')"); + test("IO == 'a' || (IO == 'b' && IO == 'ab.*')"); + test("IO == 'a' || (NA == 'b' && IO == 'ab.*')"); + + // top level index only, variable index state of nested term, non-indexed regex + test("IO == 'a' || (F == 'b' && NA == 'ab.*')"); + test("IO == 'a' || (IO == 'b' && NA == 'ab.*')"); + test("IO == 'a' || (NA == 'b' && NA == 'ab.*')"); + + // top level non-indexed, variable index state of nested term, indexed regex + test("NA == 'a' || (F == 'b' && F == 'ab.*')"); + test("NA == 'a' || (IO == 'b' && F == 'ab.*')"); + test("NA == 'a' || (NA == 'b' && F == 'ab.*')"); + + // top level non-indexed,
variable index state of nested term, index only regex + test("NA == 'a' || (F == 'b' && IO == 'ab.*')"); + test("NA == 'a' || (IO == 'b' && IO == 'ab.*')"); + test("NA == 'a' || (NA == 'b' && IO == 'ab.*')"); + + // top level non-indexed, variable index state of nested term, non-indexed regex + test("NA == 'a' || (F == 'b' && NA == 'ab.*')"); + test("NA == 'a' || (IO == 'b' && NA == 'ab.*')"); + test("NA == 'a' || (NA == 'b' && NA == 'ab.*')"); + } + + // A and (regex or regex) + @Test + public void testIntersectionWithNestedUnionOfRegexes() { + // indexed term and union of regexes with all possible index states + test("F == 'a' && (F =~ 'ab.*' || F =~ 'ac.*')", "F == 'a' && (((_Eval_ = true) && (F =~ 'ab.*')) || ((_Eval_ = true) && (F =~ 'ac.*')))"); + test("F == 'a' && (F =~ 'ab.*' || IO =~ 'ac.*')", "F == 'a' && (((_Eval_ = true) && (F =~ 'ab.*')) || IO =~ 'ac.*')"); + test("F == 'a' && (F =~ 'ab.*' || NA =~ 'ac.*')", "F == 'a' && (((_Eval_ = true) && (F =~ 'ab.*')) || ((_Eval_ = true) && (NA =~ 'ac.*')))"); + test("F == 'a' && (IO =~ 'ab.*' || IO =~ 'ac.*')"); + test("F == 'a' && (IO =~ 'ab.*' || NA =~ 'ac.*')", "F == 'a' && (IO =~ 'ab.*' || ((_Eval_ = true) && (NA =~ 'ac.*')))"); + test("F == 'a' && (NA =~ 'ab.*' || NA =~ 'ac.*')", "F == 'a' && (((_Eval_ = true) && (NA =~ 'ab.*')) || ((_Eval_ = true) && (NA =~ 'ac.*')))"); + + // index only term and union of regexes with all possible index states + test("IO == 'a' && (F =~ 'ab.*' || F =~ 'ac.*')", "IO == 'a' && (((_Eval_ = true) && (F =~ 'ab.*')) || ((_Eval_ = true) && (F =~ 'ac.*')))"); + test("IO == 'a' && (F =~ 'ab.*' || IO =~ 'ac.*')", "IO == 'a' && (((_Eval_ = true) && (F =~ 'ab.*')) || IO =~ 'ac.*')"); + test("IO == 'a' && (F =~ 'ab.*' || NA =~ 'ac.*')", "IO == 'a' && (((_Eval_ = true) && (F =~ 'ab.*')) || ((_Eval_ = true) && (NA =~ 'ac.*')))"); + test("IO == 'a' && (IO =~ 'ab.*' || IO =~ 'ac.*')"); + test("IO == 'a' && (IO =~ 'ab.*' || NA =~ 'ac.*')", "IO == 'a' && (IO =~ 'ab.*' || ((_Eval_ = true) && 
(NA =~ 'ac.*')))"); + test("IO == 'a' && (NA =~ 'ab.*' || NA =~ 'ac.*')", "IO == 'a' && (((_Eval_ = true) && (NA =~ 'ab.*')) || ((_Eval_ = true) && (NA =~ 'ac.*')))"); + + // non-indexed term and union of regexes with all possible index states + test("NA == 'a' && (F =~ 'ab.*' || F =~ 'ac.*')"); + test("NA == 'a' && (F =~ 'ab.*' || IO =~ 'ac.*')"); + test("NA == 'a' && (F =~ 'ab.*' || NA =~ 'ac.*')", "NA == 'a' && (F =~ 'ab.*' || ((_Eval_ = true) && (NA =~ 'ac.*')))"); + test("NA == 'a' && (IO =~ 'ab.*' || IO =~ 'ac.*')"); + test("NA == 'a' && (IO =~ 'ab.*' || NA =~ 'ac.*')", "NA == 'a' && (IO =~ 'ab.*' || ((_Eval_ = true) && (NA =~ 'ac.*')))"); + test("NA == 'a' && (NA =~ 'ab.*' || NA =~ 'ac.*')", "NA == 'a' && (((_Eval_ = true) && (NA =~ 'ab.*')) || ((_Eval_ = true) && (NA =~ 'ac.*')))"); + } + + // A or (regex and regex) + @Test + public void testUnionWithNestedIntersectionOfRegexes() { + // indexed term or intersection of regexes with all possible index states + test("F == 'a' || (F =~ 'ab.*' && F =~ 'ac.*')", "F == 'a' || (((_Eval_ = true) && (F =~ 'ab.*')) && F =~ 'ac.*')"); + test("F == 'a' || (F =~ 'ab.*' && IO =~ 'ac.*')", "F == 'a' || (((_Eval_ = true) && (F =~ 'ab.*')) && IO =~ 'ac.*')"); + test("F == 'a' || (F =~ 'ab.*' && NA =~ 'ac.*')", "F == 'a' || (F =~ 'ab.*' && ((_Eval_ = true) && (NA =~ 'ac.*')))"); + test("F == 'a' || (IO =~ 'ab.*' && IO =~ 'ac.*')"); + test("F == 'a' || (IO =~ 'ab.*' && NA =~ 'ac.*')", "F == 'a' || (IO =~ 'ab.*' && ((_Eval_ = true) && (NA =~ 'ac.*')))"); + test("F == 'a' || (NA =~ 'ab.*' && NA =~ 'ac.*')", "F == 'a' || (((_Eval_ = true) && (NA =~ 'ab.*')) && ((_Eval_ = true) && (NA =~ 'ac.*')))"); + + // index only term or intersection of regexes with all possible index states + test("IO == 'a' || (F =~ 'ab.*' && F =~ 'ac.*')", "IO == 'a' || (((_Eval_ = true) && (F =~ 'ab.*')) && F =~ 'ac.*')"); + test("IO == 'a' || (F =~ 'ab.*' && IO =~ 'ac.*')", "IO == 'a' || (((_Eval_ = true) && (F =~ 'ab.*')) && IO =~ 'ac.*')"); + test("IO
== 'a' || (F =~ 'ab.*' && NA =~ 'ac.*')", "IO == 'a' || (F =~ 'ab.*' && ((_Eval_ = true) && (NA =~ 'ac.*')))"); + test("IO == 'a' || (IO =~ 'ab.*' && IO =~ 'ac.*')"); + test("IO == 'a' || (IO =~ 'ab.*' && NA =~ 'ac.*')", "IO == 'a' || (IO =~ 'ab.*' && ((_Eval_ = true) && (NA =~ 'ac.*')))"); + test("IO == 'a' || (NA =~ 'ab.*' && NA =~ 'ac.*')", "IO == 'a' || (((_Eval_ = true) && (NA =~ 'ab.*')) && ((_Eval_ = true) && (NA =~ 'ac.*')))"); + + // non-indexed term or intersection of regexes with all possible index states + test("NA == 'a' || (F =~ 'ab.*' && F =~ 'ac.*')", "NA == 'a' || (((_Eval_ = true) && (F =~ 'ab.*')) && F =~ 'ac.*')"); + test("NA == 'a' || (F =~ 'ab.*' && IO =~ 'ac.*')", "NA == 'a' || (((_Eval_ = true) && (F =~ 'ab.*')) && IO =~ 'ac.*')"); + test("NA == 'a' || (F =~ 'ab.*' && NA =~ 'ac.*')", "NA == 'a' || (F =~ 'ab.*' && ((_Eval_ = true) && (NA =~ 'ac.*')))"); + test("NA == 'a' || (IO =~ 'ab.*' && IO =~ 'ac.*')"); + test("NA == 'a' || (IO =~ 'ab.*' && NA =~ 'ac.*')", "NA == 'a' || (IO =~ 'ab.*' && ((_Eval_ = true) && (NA =~ 'ac.*')))"); + test("NA == 'a' || (NA =~ 'ab.*' && NA =~ 'ac.*')", "NA == 'a' || (((_Eval_ = true) && (NA =~ 'ab.*')) && ((_Eval_ = true) && (NA =~ 'ac.*')))"); + } + + // (A or regex) and (B or regex) + @Test + public void testNestedUnionsWithDistributedRegexes() { + String query = "(F == 'a' || F =~ 'ab.*') && (F == 'b' || F =~ 'ac.*')"; + String expected = "(F == 'a' || ((_Eval_ = true) && (F =~ 'ab.*'))) && (F == 'b' || F =~ 'ac.*')"; + test(query, expected); + + query = "(F == 'a' || NA =~ 'ab.*') && (F == 'b' || F =~ 'ac.*')"; + expected = "(F == 'a' || ((_Eval_ = true) && (NA =~ 'ab.*'))) && (F == 'b' || F =~ 'ac.*')"; + test(query, expected); + } + + // (A and regex) or (B and regex) + @Test + public void testNestedIntersectionsWithDistributedRegexes() { + String query = "(F == 'a' && F =~ 'ab.*') || (F == 'b' && F =~ 'ac.*')"; + String expected = "(F == 'a' && ((_Eval_ = true) && (F =~ 'ab.*'))) || (F == 'b' && ((_Eval_ =
true) && (F =~ 'ac.*')))"; + test(query, expected); + } + + // (A or B) and (regex or regex) + @Test + public void testPartialAnchorAndNestedUnionRegex() { + String query = "(F == 'a' || F == 'b') && (F =~ 'ab.*' || F =~ 'ac.*')"; + String expected = "(F == 'a' || F == 'b') && (((_Eval_ = true) && (F =~ 'ab.*')) || ((_Eval_ = true) && (F =~ 'ac.*')))"; + test(query, expected); + } + + // A and (B or (C and regex)) + @Test + public void testLeftAnchorAndDeeplyNestedRegex() { + String query = "F == 'a' && (F == 'b' || (F == 'c' && F =~ 'ab.*'))"; + String expected = "F == 'a' && (F == 'b' || (F == 'c' && ((_Eval_ = true) && (F =~ 'ab.*'))))"; + test(query, expected); + } + + // ((regex and C) or B) and A + @Test + public void testRightAnchorAndDeeplyNestedRegex() { + String query = "((F =~ 'ab.*' && F == 'c') || F == 'b') && F == 'a'"; + String expected = "((((_Eval_ = true) && (F =~ 'ab.*')) && F == 'c') || F == 'b') && F == 'a'"; + test(query, expected); + } + + @Test + public void testUnionOfTwoLegalRewrites() { + String query = "(F == 'a' && F =~ 'ab.*') || (F == 'b' && F =~ 'ac.*')"; + String expected = "(F == 'a' && ((_Eval_ = true) && (F =~ 'ab.*'))) || (F == 'b' && ((_Eval_ = true) && (F =~ 'ac.*')))"; + test(query, expected); + } + + // (NA and regex) or (NA and regex) + @Test + public void testUnionOfTwoIllegalRewrites() { + String query = "(NA == 'a' && F =~ 'ab.*') || (NA == 'b' && F =~ 'ac.*')"; + test(query); + } + + @Test + public void testIncludeFieldsPreventNoRewrites() { + withIncludeFields(Set.of("F", "F2")); + test("IO == 'a' && F =~ 'ab.*' && F2 =~ 'ac.*'", "IO == 'a' && ((_Eval_ = true) && (F =~ 'ab.*')) && ((_Eval_ = true) && (F2 =~ 'ac.*'))"); + } + + @Test + public void testIncludeFieldsPreventSomeLegalRewrites() { + withIncludeFields(Set.of("F2")); + test("IO == 'a' && F =~ 'ab.*' && F2 =~ 'ac.*'", "IO == 'a' && F =~ 'ab.*' && ((_Eval_ = true) && (F2 =~ 'ac.*'))"); + } + + @Test + public void testExcludeFieldsPreventAllLegalRewrites() { +
withExcludeFields(Set.of("F", "F2")); + test("IO == 'a' && F =~ 'ab.*' && F2 =~ 'ac.*'"); + } + + @Test + public void testExcludeFieldsPreventSomeLegalRewrites() { + withExcludeFields(Set.of("F2")); + test("IO == 'a' && F =~ 'ab.*' && F2 =~ 'ac.*'", "IO == 'a' && ((_Eval_ = true) && (F =~ 'ab.*')) && F2 =~ 'ac.*'"); + } + + @Test + public void testFullyInclusiveIncludeAndExcludeFields() { + withIncludeFields(Set.of("F")); + withExcludeFields(Set.of("F")); + // exclude fields beats include fields + test("IO == 'a' && F =~ 'ab.*'"); + } + + @Test + public void testPatternBeatsExcludeFields() { + withPattern("F", "zz.*"); + withExcludeFields(Set.of("F")); + // pattern beats exclude fields + test("IO == 'a' && F =~ 'zz.*'", "IO == 'a' && ((_Eval_ = true) && (F =~ 'zz.*'))"); + } + + @Test + public void testPatternBeatsIncludeFields() { + withPattern("F", "zz.*"); + withIncludeFields(Set.of("F2")); + // pattern beats include fields + test("IO == 'a' && F =~ 'zz.*'", "IO == 'a' && ((_Eval_ = true) && (F =~ 'zz.*'))"); + } + + @Test + public void testPatternBeatsIncludeAndExcludeFields() { + withPattern("F", "zz.*"); + withIncludeFields(Set.of("F2")); + withExcludeFields(Set.of("F")); + // pattern beats both include and exclude fields + test("IO == 'a' && F =~ 'zz.*'", "IO == 'a' && ((_Eval_ = true) && (F =~ 'zz.*'))"); + } + + /** + * Assert that the provided query does not change + * + * @param query + * the query + */ + private void test(String query) { + test(query, query); + } + + /** + * Assert that the provided query matches the expected query after the {@link RewriteRegexVisitor} is applied + * + * @param query + * the query + * @param expected + * the expected result + */ + private void test(String query, String expected) { + ASTJexlScript script = parse(query); + RewriteRegexVisitor.rewrite(script, indexedFields, indexOnlyFields, includeFields, excludeFields, patterns); + String result = JexlStringBuildingVisitor.buildQuery(script); + assertEquals(expected, result); + } + +
private ASTJexlScript parse(String query) { + try { + return JexlASTHelper.parseAndFlattenJexlQuery(query); + } catch (ParseException e) { + fail("Failed to parse query: " + query, e); + throw new RuntimeException(e); + } + } + + private void withIncludeFields(Set includeFields) { + this.includeFields.addAll(includeFields); + } + + private void withExcludeFields(Set excludeFields) { + this.excludeFields.addAll(excludeFields); + } + + private void withPattern(String field, String literal) { + patterns.add(new RegexRewritePattern(field, literal)); + } +} diff --git a/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/pushdown/AnchorDetectionVisitorTest.java b/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/pushdown/AnchorDetectionVisitorTest.java new file mode 100644 index 0000000000..a9be296a03 --- /dev/null +++ b/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/pushdown/AnchorDetectionVisitorTest.java @@ -0,0 +1,274 @@ +package datawave.query.jexl.visitors.pushdown; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.fail; + +import java.util.Collections; +import java.util.Set; + +import org.apache.commons.jexl3.parser.ASTJexlScript; +import org.apache.commons.jexl3.parser.JexlNode; +import org.junit.jupiter.api.Test; + +import datawave.query.jexl.JexlASTHelper; + +class AnchorDetectionVisitorTest { + + private final Set indexOnlyFields = Collections.singleton("IO"); + private final Set indexedFields = Collections.singleton("F"); + private AnchorDetectionVisitor visitor; + + @Test + void testIndexedLeaves() { + // @formatter:off + String[] queries = new String[]{ + "F == '1'", + "F != '1'", + "F < '2'", + "F > '2'", + "F <= '2'", + "F >= '2'", + "F =~ 'ba.*'", + "F !~ 'ba.*'", + }; + // @formatter:on + + test(queries, true); + } + + @Test + void testIndexOnlyLeaves() { + // @formatter:off + String[] queries = new String[]{ + "IO == '1'", + "IO != '1'", + "IO < 
'2'", + "IO > '2'", + "IO <= '2'", + "IO >= '2'", + "IO =~ 'ba.*'", + "IO !~ 'ba.*'", + }; + // @formatter:on + + test(queries, true); + } + + @Test + void testNonIndexedLeaves() { + // @formatter:off + String[] queries = new String[]{ + "FIELD == '1'", + "FIELD != '1'", + "FIELD < '2'", + "FIELD > '2'", + "FIELD <= '2'", + "FIELD >= '2'", + "FIELD =~ 'ba.*'", + "FIELD !~ 'ba.*'", + }; + // @formatter:on + + test(queries, false); + } + + @Test + void testNullLiterals() { + test("F == null", false); + test("F != null", false); + test("IO == null", false); + test("IO != null", false); + test("FIELD == null", false); + test("FIELD != null", false); + } + + @Test + void testFilterFunctions() { + // @formatter:off + String[] queries = new String[]{ + // index only include/exclude are rewritten to regex nodes + "filter:include(F, 'ba.*')", + "filter:exclude(F, 'ba.*')", + "filter:include(FIELD, 'ba.*')", + "filter:exclude(FIELD, 'ba.*')", + // isNull functions should be rewritten to 'F == null' + "filter:isNull(F)", + "filter:isNull(F)", + "filter:isNull(FIELD)", + "filter:isNull(FIELD)", + // isNotNull functions should be rewritten to !(F == null) + "filter:isNotNull(F)", + "filter:isNotNull(F)", + "filter:isNotNull(FIELD)", + "filter:isNotNull(FIELD)", + "filter:compare(F,'==','any',F)", + "filter:compare(IO,'==','any',IO)", + "filter:compare(FIELD,'==','any',FIELD)", + }; + // @formatter:on + + test(queries, false); + } + + @Test + void testMarkers() { + // @formatter:off + String[] anchorMarkers = new String[] { + "((_Bounded_ = true) && (F > '2' && F < '5'))", + "((_List_ = true) && ((id = 'id') && (field = 'F') && (params = '{\"ranges\":[[\"[r1\",\"r2]\"],[\"[r3\",\"f4]\"]]}')))", + "((_Value_ = true) && (F =~ 'ba.*'))", + "((_Term_ = true) && (_ANYFIELD_ =~ 'ba.*'))" + }; + // @formatter:on + + test(anchorMarkers, true); + + // @formatter:off + String[] nonAnchorMarkers = new String[]{ + "((_Delayed_ = true) && (F == '1'))", + "((_Eval_ = true) && (F == '1'))", + 
"((_Hole_ = true) && (F == '1'))", + "((_Drop_ = true) && (F == '1'))", + "((_Lenient_ = true) && (F == '1'))", + "((_Strict_ = true) && (F == '1'))" + }; + // @formatter:on + + test(nonAnchorMarkers, false); + } + + @Test + void testUnions() { + // @formatter:off + String[] anchorUnions = new String[] { + "F == '1' || F == '2'", + "F == '1' || IO == '1'", + "IO == '1' || IO == '2'"}; + // @formatter:on + + test(anchorUnions, true); + + // @formatter:off + String[] nonAnchorUnions = new String[] { + "FIELD == '1' || F == '2'", + "F == '1' || IO == '1' || FIELD == '3'", + "FIELD == '1' || FIELD == '2'"}; + // @formatter:onn + + test(nonAnchorUnions, false); + } + + @Test + void testIntersections() { + // @formatter:off + String[] anchorIntersections = new String[] { + "F == '1' && F == '2'", + "F == '1' && IO == '1'", + "IO == '1' && IO == '2'", + "F == '1' && IO == null", + "IO == '1' && IO == null", + // intersection needs just one anchor to be executable + "X == '1' && F == '2'", "X == '1' && IO == '2'" + }; + // @formatter:on + + test(anchorIntersections, true); + + // @formatter:off + String[] nonAnchorQueries = new String[] { + "X == '1' && Y == '2' && Z == '3'", + "F == null && IO == null", + }; + // @formatter:on + + test(nonAnchorQueries, false); + } + + @Test + void testNestedUnions() { + // @formatter:off + String[] anchorNestedUnions = new String[]{ + "(F == '1' || F == '2') && (F == '3' || F == '4')", + "(F == '1' || F == '2') && (IO == '3' || IO == '4')", + "(IO == '1' || IO == '2') && (F == '3' || F == '4')", + "(F == '1' || IO == '2') && (F == '3' || IO == '4')", + "(IO == '1' || F == '2') && (IO == '3' || F == '4')", + }; + // @formatter:on + + test(anchorNestedUnions, true); + } + + @Test + void testNestedIntersections() { + // @formatter:off + String[] anchorNestedIntersections = new String[]{ + "(F == '1' && F == '2') || (F == '3' && F == '4')", + "(F == '1' && F == '2') || (IO == '3' && IO == '4')", + "(IO == '1' && IO == '2') || (F == '3' && F 
== '4')", + "(F == '1' && IO == '2') || (F == '3' && IO == '4')", + "(IO == '1' && F == '2') || (IO == '3' && F == '4')", + }; + // @formatter:on + + test(anchorNestedIntersections, true); + } + + @Test + void testFullContentPhraseFunction() { + String query = "content:phrase(F, termOffsetMap, 'foo', 'bar') && F == 'foo' && F == 'bar'"; + test(query, true); + } + + @Test + void testArithmeticAndSizeMethods() { + // @formatter:off + String[] queries = new String[]{ + // filter + "filter:getMinTime(F) == 1892160000000", + "filter:getMinTime(F) != 1892160000000", + "filter:getMinTime(F) > 1892160000000", + "filter:getMinTime(F) < 1892160000000", + "filter:getMinTime(F) >= 1892160000000", + "filter:getMinTime(F) <= 1892160000000", + // method + "F.size() == 1", + "F.size() != 1", + "F.size() > 1", + "F.size() < 1", + "F.size() >= 1", + "F.size() <= 1", + }; + // @formatter:on + + test(queries, false); + } + + private void test(String[] queries, boolean expected) { + for (String query : queries) { + test(query, expected); + } + } + + private void test(String query, boolean expected) { + JexlNode node = parseQuery(query); + assertEquals(expected, getVisitor().isAnchor(node)); + } + + private JexlNode parseQuery(String query) { + try { + ASTJexlScript script = JexlASTHelper.parseAndFlattenJexlQuery(query); + return script.jjtGetChild(0); + } catch (Exception e) { + fail("Could not parse query: " + query); + throw new IllegalStateException(e); + } + } + + private AnchorDetectionVisitor getVisitor() { + if (visitor == null) { + visitor = new AnchorDetectionVisitor(indexedFields, indexOnlyFields); + } + return visitor; + } +} diff --git a/warehouse/query-core/src/test/resources/datawave/query/QueryLogicFactory.xml b/warehouse/query-core/src/test/resources/datawave/query/QueryLogicFactory.xml index 7c043448b9..4c4e5f4fd3 100644 --- a/warehouse/query-core/src/test/resources/datawave/query/QueryLogicFactory.xml +++ 
b/warehouse/query-core/src/test/resources/datawave/query/QueryLogicFactory.xml @@ -368,10 +368,38 @@ + + + + + + + + + + + + + + + TYPE + + + + SHAPE + + + + + + + + +