rename "sequence" scope to "span of calls" scope

mandiant · Jan 17, 2025 · 06472c1 · 06472c1
1 parent 261b384
commit 06472c1
Show file tree

Hide file tree

Showing 13 changed files with 131 additions and 131 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,11 +4,11 @@
 
 ### New Features
 
-- add sequence scope to match features against a across a sliding window of API calls within a thread @williballenthin #2532
+- add span-of-calls scope to match features across a sliding window of API calls within a thread @williballenthin #2532
 
 ### Breaking Changes
 
-- add sequence scope to rule format
+- add span-of-calls scope to rule format
 - capabilities functions return dataclasses instead of tuples
 
 ### New Rules (0)

diff --git a/capa/capabilities/dynamic.py b/capa/capabilities/dynamic.py
@@ -32,11 +32,11 @@
 logger = logging.getLogger(__name__)
 
 
-# The number of calls that make up a sequence.
+# The number of calls that make up a span of calls.
 #
 # The larger this is, the more calls are grouped together to match rule logic.
 # This means a longer chain can be recognized; however, its a bit more expensive.
-SEQUENCE_SIZE = 20
+SPAN_SIZE = 20
 
 
 @dataclass
@@ -45,8 +45,8 @@ class CallCapabilities:
     matches: MatchResults
 
 
-# The number of calls that make up a sequence.
-SEQUENCE_SIZE = 5
+# The number of calls that make up a span.
+SPAN_SIZE = 5
 
 
 def find_call_capabilities(
@@ -78,44 +78,44 @@ def find_call_capabilities(
 class ThreadCapabilities:
     features: FeatureSet
     thread_matches: MatchResults
-    sequence_matches: MatchResults
+    span_matches: MatchResults
     call_matches: MatchResults
 
 
-class SequenceMatcher:
+class SpanOfCallsMatcher:
     def __init__(self, ruleset: RuleSet):
         super().__init__()
         self.ruleset = ruleset
 
-        # matches found at the sequence scope.
+        # matches found at the span scope.
         self.matches: MatchResults = collections.defaultdict(list)
 
-        # We matches sequences as the sliding window of calls with size SEQUENCE_SIZE.
+        # We match spans using a sliding window of calls with size SPAN_SIZE.
         #
-        # For each call, we consider the window of SEQUENCE_SIZE calls leading up to it,
+        # For each call, we consider the window of SPAN_SIZE calls leading up to it,
         #  merging all their features and doing a match.
         #
         # We track these features in two data structures:
         #   1. a deque of those features found in the prior calls.
-        #      We'll append to it, and as it grows larger than SEQUENCE_SIZE, the oldest items are removed.
-        #   2. a live set of features seen in the sequence.
+        #      We'll append to it, and as it grows larger than SPAN_SIZE, the oldest items are removed.
+        #   2. a live set of features seen in the span.
         #      As we pop from the deque, we remove features from the current set,
         #      and as we push to the deque, we insert features to the current set.
-        # With this approach, our algorithm performance is independent of SEQUENCE_SIZE.
-        # The naive algorithm, of merging all the trailing feature sets at each call, is dependent upon SEQUENCE_SIZE
-        # (that is, runtime gets slower the larger SEQUENCE_SIZE is).
-        self.current_feature_sets: collections.deque[FeatureSet] = collections.deque(maxlen=SEQUENCE_SIZE)
+        # With this approach, our algorithm performance is independent of SPAN_SIZE.
+        # The naive algorithm, of merging all the trailing feature sets at each call, is dependent upon SPAN_SIZE
+        # (that is, runtime gets slower the larger SPAN_SIZE is).
+        self.current_feature_sets: collections.deque[FeatureSet] = collections.deque(maxlen=SPAN_SIZE)
         self.current_features: FeatureSet = collections.defaultdict(set)
 
-        # the names of rules matched at the last sequence,
+        # the names of rules matched at the last span,
         # so that we can deduplicate long strings of the same matches.
-        self.last_sequence_matches: set[str] = set()
+        self.last_span_matches: set[str] = set()
 
     def next(self, ch: CallHandle, call_features: FeatureSet):
         # As we add items to the end of the deque, overflow and drop the oldest items (at the left end).
         # While we could rely on `deque.append` with `maxlen` set (which we provide above),
         # we want to use the dropped item first, to remove the old features, so we manually pop it here.
-        if len(self.current_feature_sets) == SEQUENCE_SIZE:
+        if len(self.current_feature_sets) == SPAN_SIZE:
             overflowing_feature_set = self.current_feature_sets.popleft()
 
             for feature, vas in overflowing_feature_set.items():
@@ -135,20 +135,20 @@ def next(self, ch: CallHandle, call_features: FeatureSet):
         for feature, vas in call_features.items():
             self.current_features[feature] |= vas
 
-        _, matches = self.ruleset.match(Scope.SEQUENCE, self.current_features, ch.address)
+        _, matches = self.ruleset.match(Scope.SPAN_OF_CALLS, self.current_features, ch.address)
 
-        newly_encountered_rules = set(matches.keys()) - self.last_sequence_matches
+        newly_encountered_rules = set(matches.keys()) - self.last_span_matches
 
-        # don't emit match results for rules seen during the immediately preceeding sequence.
+        # don't emit match results for rules seen during the immediately preceding span.
         #
-        # This means that we won't emit duplicate matches when there are multiple sequences
+        # This means that we won't emit duplicate matches when there are multiple spans
         #  that overlap a single matching event.
         # It also handles the case of a tight loop containing matched logic;
         #  only the first match will be recorded.
         #
         # In theory, this means the result document doesn't have *every* possible match location,
         # but in practice, humans will only be interested in the first handful anyways.
-        suppressed_rules = set(self.last_sequence_matches)
+        suppressed_rules = set(self.last_span_matches)
 
         # however, if a newly encountered rule depends on a suppressed rule,
         # don't suppress that rule match, or we won't be able to reconstruct the vverbose output.
@@ -161,15 +161,15 @@ def next(self, ch: CallHandle, call_features: FeatureSet):
                 continue
             self.matches[rule_name].extend(res)
 
-        self.last_sequence_matches = set(matches.keys())
+        self.last_span_matches = set(matches.keys())
 
 
 def find_thread_capabilities(
     ruleset: RuleSet, extractor: DynamicFeatureExtractor, ph: ProcessHandle, th: ThreadHandle
 ) -> ThreadCapabilities:
     """
     find matches for the given rules within the given thread,
-    which includes matches for all the sequences and calls within it.
+    which includes matches for all the spans and calls within it.
     """
     # all features found within this thread,
     # includes features found within calls.
@@ -179,7 +179,7 @@ def find_thread_capabilities(
     # might be found at different calls, that's ok.
     call_matches: MatchResults = collections.defaultdict(list)
 
-    sequence_matcher = SequenceMatcher(ruleset)
+    span_matcher = SpanOfCallsMatcher(ruleset)
 
     call_count = 0
     for call_count, ch in enumerate(extractor.get_calls(ph, th)):  # noqa: B007
@@ -190,7 +190,7 @@ def find_thread_capabilities(
         for rule_name, res in call_capabilities.matches.items():
             call_matches[rule_name].extend(res)
 
-        sequence_matcher.next(ch, call_capabilities.features)
+        span_matcher.next(ch, call_capabilities.features)
 
     for feature, va in itertools.chain(extractor.extract_thread_features(ph, th), extractor.extract_global_features()):
         features[feature].add(va)
@@ -209,16 +209,16 @@ def find_thread_capabilities(
         th.address.tid,
         call_count,
         len(features),
-        len(matches) + len(sequence_matcher.matches) + len(call_matches),
+        len(matches) + len(span_matcher.matches) + len(call_matches),
     )
-    return ThreadCapabilities(features, matches, sequence_matcher.matches, call_matches)
+    return ThreadCapabilities(features, matches, span_matcher.matches, call_matches)
 
 
 @dataclass
 class ProcessCapabilities:
     process_matches: MatchResults
     thread_matches: MatchResults
-    sequence_matches: MatchResults
+    span_matches: MatchResults
     call_matches: MatchResults
     feature_count: int
 
@@ -237,9 +237,9 @@ def find_process_capabilities(
     # might be found at different threads, that's ok.
     thread_matches: MatchResults = collections.defaultdict(list)
 
-    # matches found at the sequence scope.
-    # might be found at different sequences, that's ok.
-    sequence_matches: MatchResults = collections.defaultdict(list)
+    # matches found at the span-of-calls scope.
+    # might be found at different spans, that's ok.
+    span_matches: MatchResults = collections.defaultdict(list)
 
     # matches found at the call scope.
     # might be found at different calls, that's ok.
@@ -253,8 +253,8 @@ def find_process_capabilities(
         for rule_name, res in thread_capabilities.thread_matches.items():
             thread_matches[rule_name].extend(res)
 
-        for rule_name, res in thread_capabilities.sequence_matches.items():
-            sequence_matches[rule_name].extend(res)
+        for rule_name, res in thread_capabilities.span_matches.items():
+            span_matches[rule_name].extend(res)
 
         for rule_name, res in thread_capabilities.call_matches.items():
             call_matches[rule_name].extend(res)
@@ -270,15 +270,15 @@ def find_process_capabilities(
         len(process_features),
         len(process_matches),
     )
-    return ProcessCapabilities(process_matches, thread_matches, sequence_matches, call_matches, len(process_features))
+    return ProcessCapabilities(process_matches, thread_matches, span_matches, call_matches, len(process_features))
 
 
 def find_dynamic_capabilities(
     ruleset: RuleSet, extractor: DynamicFeatureExtractor, disable_progress: bool = False
 ) -> Capabilities:
     all_process_matches: MatchResults = collections.defaultdict(list)
     all_thread_matches: MatchResults = collections.defaultdict(list)
-    all_sequence_matches: MatchResults = collections.defaultdict(list)
+    all_span_matches: MatchResults = collections.defaultdict(list)
     all_call_matches: MatchResults = collections.defaultdict(list)
 
     feature_counts = rdoc.DynamicFeatureCounts(file=0, processes=())
@@ -303,8 +303,8 @@ def find_dynamic_capabilities(
                 all_process_matches[rule_name].extend(res)
             for rule_name, res in process_capabilities.thread_matches.items():
                 all_thread_matches[rule_name].extend(res)
-            for rule_name, res in process_capabilities.sequence_matches.items():
-                all_sequence_matches[rule_name].extend(res)
+            for rule_name, res in process_capabilities.span_matches.items():
+                all_span_matches[rule_name].extend(res)
             for rule_name, res in process_capabilities.call_matches.items():
                 all_call_matches[rule_name].extend(res)
 
@@ -314,7 +314,7 @@ def find_dynamic_capabilities(
     # mapping from feature (matched rule) to set of addresses at which it matched.
     process_and_lower_features: FeatureSet = collections.defaultdict(set)
     for rule_name, results in itertools.chain(
-        all_process_matches.items(), all_thread_matches.items(), all_sequence_matches.items(), all_call_matches.items()
+        all_process_matches.items(), all_thread_matches.items(), all_span_matches.items(), all_call_matches.items()
     ):
         locations = {p[0] for p in results}
         rule = ruleset[rule_name]
@@ -329,7 +329,7 @@ def find_dynamic_capabilities(
             # so there won't be any overlap among these following MatchResults,
             # and we can merge the dictionaries naively.
             all_call_matches.items(),
-            all_sequence_matches.items(),
+            all_span_matches.items(),
             all_thread_matches.items(),
             all_process_matches.items(),
             all_file_capabilities.matches.items(),

diff --git a/capa/render/proto/__init__.py b/capa/render/proto/__init__.py
@@ -163,8 +163,8 @@ def scope_to_pb2(scope: capa.rules.Scope) -> capa_pb2.Scope.ValueType:
         return capa_pb2.Scope.SCOPE_PROCESS
     elif scope == capa.rules.Scope.THREAD:
         return capa_pb2.Scope.SCOPE_THREAD
-    elif scope == capa.rules.Scope.SEQUENCE:
-        return capa_pb2.Scope.SCOPE_SEQUENCE
+    elif scope == capa.rules.Scope.SPAN_OF_CALLS:
+        return capa_pb2.Scope.SCOPE_SPAN_OF_CALLS
     elif scope == capa.rules.Scope.CALL:
         return capa_pb2.Scope.SCOPE_CALL
     else:
@@ -657,8 +657,8 @@ def scope_from_pb2(scope: capa_pb2.Scope.ValueType) -> capa.rules.Scope:
         return capa.rules.Scope.PROCESS
     elif scope == capa_pb2.Scope.SCOPE_THREAD:
         return capa.rules.Scope.THREAD
-    elif scope == capa_pb2.Scope.SCOPE_SEQUENCE:
-        return capa.rules.Scope.SEQUENCE
+    elif scope == capa_pb2.Scope.SCOPE_SPAN_OF_CALLS:
+        return capa.rules.Scope.SPAN_OF_CALLS
     elif scope == capa_pb2.Scope.SCOPE_CALL:
         return capa.rules.Scope.CALL
     else:

diff --git a/capa/render/proto/capa.proto b/capa/render/proto/capa.proto
@@ -378,7 +378,7 @@ enum Scope {
   SCOPE_PROCESS = 5;
   SCOPE_THREAD = 6;
   SCOPE_CALL = 7;
-  SCOPE_SEQUENCE = 8;
+  SCOPE_SPAN_OF_CALLS = 8;
 }
 
 message Scopes {

diff --git a/capa/render/proto/capa_pb2.py b/capa/render/proto/capa_pb2.py
diff --git a/capa/render/proto/capa_pb2.pyi b/capa/render/proto/capa_pb2.pyi
@@ -94,7 +94,7 @@ class _ScopeEnumTypeWrapper(google.protobuf.internal.enum_type_wrapper._EnumType
     SCOPE_PROCESS: _Scope.ValueType  # 5
     SCOPE_THREAD: _Scope.ValueType  # 6
     SCOPE_CALL: _Scope.ValueType  # 7
-    SCOPE_SEQUENCE: _Scope.ValueType  # 8
+    SCOPE_SPAN_OF_CALLS: _Scope.ValueType  # 8
 
 class Scope(_Scope, metaclass=_ScopeEnumTypeWrapper): ...
 
@@ -106,7 +106,7 @@ SCOPE_INSTRUCTION: Scope.ValueType  # 4
 SCOPE_PROCESS: Scope.ValueType  # 5
 SCOPE_THREAD: Scope.ValueType  # 6
 SCOPE_CALL: Scope.ValueType  # 7
-SCOPE_SEQUENCE: Scope.ValueType  # 8
+SCOPE_SPAN_OF_CALLS: Scope.ValueType  # 8
 global___Scope = Scope
 
 @typing.final

diff --git a/capa/render/result_document.py b/capa/render/result_document.py
@@ -399,12 +399,12 @@ def from_capa(
                         if location in rule_matches:
                             # exact match, such as matching a call-scoped rule.
                             children.append(Match.from_capa(rules, capabilities, rule_matches[location]))
-                        # we'd like to assert the scope of the current rule is "sequence"
+                        # we'd like to assert the scope of the current rule is span-of-calls
                         # but we don't have that data here.
                         else:
-                            # Sequence scopes can match each other, but they don't strictly contain each other,
+                            # Span-of-calls scopes can match each other, but they don't strictly contain each other,
                             #  like the way a function contains a basic block.
-                            # So when we have a match within a sequence for another sequence, we need to look
+                            # So when we have a match within a span for another span, we need to look
                             #  for all the places it might be found.
                             #
                             # Despite the edge cases (like API hammering), this turns out to be pretty easy:

diff --git a/capa/render/verbose.py b/capa/render/verbose.py
@@ -126,7 +126,7 @@ def render_thread(layout: rd.DynamicLayout, addr: frz.Address) -> str:
     return f"{name}{{pid:{thread.process.pid},tid:{thread.tid}}}"
 
 
-def render_sequence(layout: rd.DynamicLayout, addrs: list[frz.Address]) -> str:
+def render_span_of_calls(layout: rd.DynamicLayout, addrs: list[frz.Address]) -> str:
     calls: list[capa.features.address.DynamicCallAddress] = [addr.to_capa() for addr in addrs]  # type: ignore
     for call in calls:
         assert isinstance(call, capa.features.address.DynamicCallAddress)
@@ -328,7 +328,7 @@ def render_rules(console: Console, doc: rd.ResultDocument):
                     lines = [render_process(doc.meta.analysis.layout, loc) for loc in locations]
                 elif rule.meta.scopes.dynamic == capa.rules.Scope.THREAD:
                     lines = [render_thread(doc.meta.analysis.layout, loc) for loc in locations]
-                elif rule.meta.scopes.dynamic in (capa.rules.Scope.CALL, capa.rules.Scope.SEQUENCE):
+                elif rule.meta.scopes.dynamic in (capa.rules.Scope.CALL, capa.rules.Scope.SPAN_OF_CALLS):
                     # because we're only in verbose mode, we won't show the full call details (name, args, retval)
                     # we'll only show the details of the thread in which the calls are found.
                     # so select the thread locations and render those.

diff --git a/capa/render/vverbose.py b/capa/render/vverbose.py
@@ -231,7 +231,7 @@ def render_feature(
             # of the output, so don't re-render it again for each feature.
             pass
         elif isinstance(layout, rd.DynamicLayout) and isinstance(feature, frzf.MatchFeature):
-            # don't render copies of the sequence address for submatches
+            # don't render copies of the span of calls address for submatches
             pass
         else:
             render_locations(console, layout, match.locations, indent)
@@ -312,13 +312,13 @@ def render_match(
         render_match(console, layout, rule, child, indent=indent + 1, mode=child_mode)
 
 
-def collect_sequence_locations(
+def collect_span_of_calls_locations(
     match: rd.Match,
     mode=MODE_SUCCESS,
 ):
     """
-    Find all the (call, sequence) locations used in a given sequence match, recursively.
-    Useful to collect the events used to match a sequence scoped rule.
+    Find all the call locations used in a given span-of-calls match, recursively.
+    Useful to collect the events used to match a span-of-calls scoped rule.
     """
     if isinstance(match.node, rd.StatementNode):
         if (
@@ -327,7 +327,7 @@ def collect_sequence_locations(
         ):
             child_mode = MODE_FAILURE if mode == MODE_SUCCESS else MODE_SUCCESS
             for child in match.children:
-                yield from collect_sequence_locations(child, child_mode)
+                yield from collect_span_of_calls_locations(child, child_mode)
         elif isinstance(match.node.statement, rd.RangeStatement):
             for location in match.locations:
                 if location.type not in (frz.AddressType.CALL,):
@@ -337,7 +337,7 @@ def collect_sequence_locations(
                 yield location
         else:
             for child in match.children:
-                yield from collect_sequence_locations(child, mode)
+                yield from collect_span_of_calls_locations(child, mode)
     elif isinstance(match.node, rd.FeatureNode):
         for location in match.locations:
             if location.type not in (frz.AddressType.CALL,):
@@ -488,9 +488,9 @@ def render_rules(console: Console, doc: rd.ResultDocument):
                         console.write(v.render_process(doc.meta.analysis.layout, location))
                     elif rule.meta.scopes.dynamic == capa.rules.Scope.THREAD:
                         console.write(v.render_thread(doc.meta.analysis.layout, location))
-                    elif rule.meta.scopes.dynamic == capa.rules.Scope.SEQUENCE:
-                        calls = sorted(set(collect_sequence_locations(match)))
-                        console.write(hanging_indent(v.render_sequence(doc.meta.analysis.layout, calls), indent=1))
+                    elif rule.meta.scopes.dynamic == capa.rules.Scope.SPAN_OF_CALLS:
+                        calls = sorted(set(collect_span_of_calls_locations(match)))
+                        console.write(hanging_indent(v.render_span_of_calls(doc.meta.analysis.layout, calls), indent=1))
                     elif rule.meta.scopes.dynamic == capa.rules.Scope.CALL:
                         console.write(hanging_indent(v.render_call(doc.meta.analysis.layout, location), indent=1))
                     else: