Skip to content

Commit

Permalink
rename "sequence" scope to "span of calls" scope
Browse files Browse the repository at this point in the history
  • Loading branch information
williballenthin committed Jan 17, 2025
1 parent 261b384 commit 06472c1
Show file tree
Hide file tree
Showing 13 changed files with 131 additions and 131 deletions.
4 changes: 2 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@

### New Features

- add sequence scope to match features against a across a sliding window of API calls within a thread @williballenthin #2532
- add span-of-calls scope to match features across a sliding window of API calls within a thread @williballenthin #2532

### Breaking Changes

- add sequence scope to rule format
- add span-of-calls scope to rule format
- capabilities functions return dataclasses instead of tuples

### New Rules (0)
Expand Down
82 changes: 41 additions & 41 deletions capa/capabilities/dynamic.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,11 @@
logger = logging.getLogger(__name__)


# The number of calls that make up a sequence.
# The number of calls that make up a span of calls.
#
# The larger this is, the more calls are grouped together to match rule logic.
# This means a longer chain can be recognized; however, it's a bit more expensive.
SEQUENCE_SIZE = 20
SPAN_SIZE = 20


@dataclass
Expand All @@ -45,8 +45,8 @@ class CallCapabilities:
matches: MatchResults


# The number of calls that make up a sequence.
SEQUENCE_SIZE = 5
# The number of calls that make up a span.
SPAN_SIZE = 5


def find_call_capabilities(
Expand Down Expand Up @@ -78,44 +78,44 @@ def find_call_capabilities(
class ThreadCapabilities:
features: FeatureSet
thread_matches: MatchResults
sequence_matches: MatchResults
span_matches: MatchResults
call_matches: MatchResults


class SequenceMatcher:
class SpanOfCallsMatcher:
def __init__(self, ruleset: RuleSet):
super().__init__()
self.ruleset = ruleset

# matches found at the sequence scope.
# matches found at the span scope.
self.matches: MatchResults = collections.defaultdict(list)

# We matches sequences as the sliding window of calls with size SEQUENCE_SIZE.
# We match spans using a sliding window of calls with size SPAN_SIZE.
#
# For each call, we consider the window of SEQUENCE_SIZE calls leading up to it,
# For each call, we consider the window of SPAN_SIZE calls leading up to it,
# merging all their features and doing a match.
#
# We track these features in two data structures:
# 1. a deque of those features found in the prior calls.
# We'll append to it, and as it grows larger than SEQUENCE_SIZE, the oldest items are removed.
# 2. a live set of features seen in the sequence.
# We'll append to it, and as it grows larger than SPAN_SIZE, the oldest items are removed.
# 2. a live set of features seen in the span.
# As we pop from the deque, we remove features from the current set,
# and as we push to the deque, we insert features to the current set.
# With this approach, our algorithm performance is independent of SEQUENCE_SIZE.
# The naive algorithm, of merging all the trailing feature sets at each call, is dependent upon SEQUENCE_SIZE
# (that is, runtime gets slower the larger SEQUENCE_SIZE is).
self.current_feature_sets: collections.deque[FeatureSet] = collections.deque(maxlen=SEQUENCE_SIZE)
# With this approach, our algorithm performance is independent of SPAN_SIZE.
# The naive algorithm, of merging all the trailing feature sets at each call, is dependent upon SPAN_SIZE
# (that is, runtime gets slower the larger SPAN_SIZE is).
self.current_feature_sets: collections.deque[FeatureSet] = collections.deque(maxlen=SPAN_SIZE)
self.current_features: FeatureSet = collections.defaultdict(set)

# the names of rules matched at the last sequence,
# the names of rules matched at the last span,
# so that we can deduplicate long strings of the same matches.
self.last_sequence_matches: set[str] = set()
self.last_span_matches: set[str] = set()

def next(self, ch: CallHandle, call_features: FeatureSet):
# As we add items to the end of the deque, overflow and drop the oldest items (at the left end).
# While we could rely on `deque.append` with `maxlen` set (which we provide above),
# we want to use the dropped item first, to remove the old features, so we manually pop it here.
if len(self.current_feature_sets) == SEQUENCE_SIZE:
if len(self.current_feature_sets) == SPAN_SIZE:
overflowing_feature_set = self.current_feature_sets.popleft()

for feature, vas in overflowing_feature_set.items():
Expand All @@ -135,20 +135,20 @@ def next(self, ch: CallHandle, call_features: FeatureSet):
for feature, vas in call_features.items():
self.current_features[feature] |= vas

_, matches = self.ruleset.match(Scope.SEQUENCE, self.current_features, ch.address)
_, matches = self.ruleset.match(Scope.SPAN_OF_CALLS, self.current_features, ch.address)

newly_encountered_rules = set(matches.keys()) - self.last_sequence_matches
newly_encountered_rules = set(matches.keys()) - self.last_span_matches

# don't emit match results for rules seen during the immediately preceeding sequence.
# don't emit match results for rules seen during the immediately preceding spans.
#
# This means that we won't emit duplicate matches when there are multiple sequences
# This means that we won't emit duplicate matches when there are multiple spans
# that overlap a single matching event.
# It also handles the case of a tight loop containing matched logic;
# only the first match will be recorded.
#
# In theory, this means the result document doesn't have *every* possible match location,
# but in practice, humans will only be interested in the first handful anyways.
suppressed_rules = set(self.last_sequence_matches)
suppressed_rules = set(self.last_span_matches)

# however, if a newly encountered rule depends on a suppressed rule,
# don't suppress that rule match, or we won't be able to reconstruct the vverbose output.
Expand All @@ -161,15 +161,15 @@ def next(self, ch: CallHandle, call_features: FeatureSet):
continue
self.matches[rule_name].extend(res)

self.last_sequence_matches = set(matches.keys())
self.last_span_matches = set(matches.keys())


def find_thread_capabilities(
ruleset: RuleSet, extractor: DynamicFeatureExtractor, ph: ProcessHandle, th: ThreadHandle
) -> ThreadCapabilities:
"""
find matches for the given rules within the given thread,
which includes matches for all the sequences and calls within it.
which includes matches for all the spans and calls within it.
"""
# all features found within this thread,
# includes features found within calls.
Expand All @@ -179,7 +179,7 @@ def find_thread_capabilities(
# might be found at different calls, that's ok.
call_matches: MatchResults = collections.defaultdict(list)

sequence_matcher = SequenceMatcher(ruleset)
span_matcher = SpanOfCallsMatcher(ruleset)

call_count = 0
for call_count, ch in enumerate(extractor.get_calls(ph, th)): # noqa: B007
Expand All @@ -190,7 +190,7 @@ def find_thread_capabilities(
for rule_name, res in call_capabilities.matches.items():
call_matches[rule_name].extend(res)

sequence_matcher.next(ch, call_capabilities.features)
span_matcher.next(ch, call_capabilities.features)

for feature, va in itertools.chain(extractor.extract_thread_features(ph, th), extractor.extract_global_features()):
features[feature].add(va)
Expand All @@ -209,16 +209,16 @@ def find_thread_capabilities(
th.address.tid,
call_count,
len(features),
len(matches) + len(sequence_matcher.matches) + len(call_matches),
len(matches) + len(span_matcher.matches) + len(call_matches),
)
return ThreadCapabilities(features, matches, sequence_matcher.matches, call_matches)
return ThreadCapabilities(features, matches, span_matcher.matches, call_matches)


@dataclass
class ProcessCapabilities:
process_matches: MatchResults
thread_matches: MatchResults
sequence_matches: MatchResults
span_matches: MatchResults
call_matches: MatchResults
feature_count: int

Expand All @@ -237,9 +237,9 @@ def find_process_capabilities(
# might be found at different threads, that's ok.
thread_matches: MatchResults = collections.defaultdict(list)

# matches found at the sequence scope.
# might be found at different sequences, that's ok.
sequence_matches: MatchResults = collections.defaultdict(list)
# matches found at the span-of-calls scope.
# might be found at different spans, that's ok.
span_matches: MatchResults = collections.defaultdict(list)

# matches found at the call scope.
# might be found at different calls, that's ok.
Expand All @@ -253,8 +253,8 @@ def find_process_capabilities(
for rule_name, res in thread_capabilities.thread_matches.items():
thread_matches[rule_name].extend(res)

for rule_name, res in thread_capabilities.sequence_matches.items():
sequence_matches[rule_name].extend(res)
for rule_name, res in thread_capabilities.span_matches.items():
span_matches[rule_name].extend(res)

for rule_name, res in thread_capabilities.call_matches.items():
call_matches[rule_name].extend(res)
Expand All @@ -270,15 +270,15 @@ def find_process_capabilities(
len(process_features),
len(process_matches),
)
return ProcessCapabilities(process_matches, thread_matches, sequence_matches, call_matches, len(process_features))
return ProcessCapabilities(process_matches, thread_matches, span_matches, call_matches, len(process_features))


def find_dynamic_capabilities(
ruleset: RuleSet, extractor: DynamicFeatureExtractor, disable_progress: bool = False
) -> Capabilities:
all_process_matches: MatchResults = collections.defaultdict(list)
all_thread_matches: MatchResults = collections.defaultdict(list)
all_sequence_matches: MatchResults = collections.defaultdict(list)
all_span_matches: MatchResults = collections.defaultdict(list)
all_call_matches: MatchResults = collections.defaultdict(list)

feature_counts = rdoc.DynamicFeatureCounts(file=0, processes=())
Expand All @@ -303,8 +303,8 @@ def find_dynamic_capabilities(
all_process_matches[rule_name].extend(res)
for rule_name, res in process_capabilities.thread_matches.items():
all_thread_matches[rule_name].extend(res)
for rule_name, res in process_capabilities.sequence_matches.items():
all_sequence_matches[rule_name].extend(res)
for rule_name, res in process_capabilities.span_matches.items():
all_span_matches[rule_name].extend(res)
for rule_name, res in process_capabilities.call_matches.items():
all_call_matches[rule_name].extend(res)

Expand All @@ -314,7 +314,7 @@ def find_dynamic_capabilities(
# mapping from feature (matched rule) to set of addresses at which it matched.
process_and_lower_features: FeatureSet = collections.defaultdict(set)
for rule_name, results in itertools.chain(
all_process_matches.items(), all_thread_matches.items(), all_sequence_matches.items(), all_call_matches.items()
all_process_matches.items(), all_thread_matches.items(), all_span_matches.items(), all_call_matches.items()
):
locations = {p[0] for p in results}
rule = ruleset[rule_name]
Expand All @@ -329,7 +329,7 @@ def find_dynamic_capabilities(
# so there won't be any overlap among these following MatchResults,
# and we can merge the dictionaries naively.
all_call_matches.items(),
all_sequence_matches.items(),
all_span_matches.items(),
all_thread_matches.items(),
all_process_matches.items(),
all_file_capabilities.matches.items(),
Expand Down
8 changes: 4 additions & 4 deletions capa/render/proto/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,8 +163,8 @@ def scope_to_pb2(scope: capa.rules.Scope) -> capa_pb2.Scope.ValueType:
return capa_pb2.Scope.SCOPE_PROCESS
elif scope == capa.rules.Scope.THREAD:
return capa_pb2.Scope.SCOPE_THREAD
elif scope == capa.rules.Scope.SEQUENCE:
return capa_pb2.Scope.SCOPE_SEQUENCE
elif scope == capa.rules.Scope.SPAN_OF_CALLS:
return capa_pb2.Scope.SCOPE_SPAN_OF_CALLS
elif scope == capa.rules.Scope.CALL:
return capa_pb2.Scope.SCOPE_CALL
else:
Expand Down Expand Up @@ -657,8 +657,8 @@ def scope_from_pb2(scope: capa_pb2.Scope.ValueType) -> capa.rules.Scope:
return capa.rules.Scope.PROCESS
elif scope == capa_pb2.Scope.SCOPE_THREAD:
return capa.rules.Scope.THREAD
elif scope == capa_pb2.Scope.SCOPE_SEQUENCE:
return capa.rules.Scope.SEQUENCE
elif scope == capa_pb2.Scope.SCOPE_SPAN_OF_CALLS:
return capa.rules.Scope.SPAN_OF_CALLS
elif scope == capa_pb2.Scope.SCOPE_CALL:
return capa.rules.Scope.CALL
else:
Expand Down
2 changes: 1 addition & 1 deletion capa/render/proto/capa.proto
Original file line number Diff line number Diff line change
Expand Up @@ -378,7 +378,7 @@ enum Scope {
SCOPE_PROCESS = 5;
SCOPE_THREAD = 6;
SCOPE_CALL = 7;
SCOPE_SEQUENCE = 8;
SCOPE_SPAN_OF_CALLS = 8;
}

message Scopes {
Expand Down
4 changes: 2 additions & 2 deletions capa/render/proto/capa_pb2.py

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions capa/render/proto/capa_pb2.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ class _ScopeEnumTypeWrapper(google.protobuf.internal.enum_type_wrapper._EnumType
SCOPE_PROCESS: _Scope.ValueType # 5
SCOPE_THREAD: _Scope.ValueType # 6
SCOPE_CALL: _Scope.ValueType # 7
SCOPE_SEQUENCE: _Scope.ValueType # 8
SCOPE_SPAN_OF_CALLS: _Scope.ValueType # 8

class Scope(_Scope, metaclass=_ScopeEnumTypeWrapper): ...

Expand All @@ -106,7 +106,7 @@ SCOPE_INSTRUCTION: Scope.ValueType # 4
SCOPE_PROCESS: Scope.ValueType # 5
SCOPE_THREAD: Scope.ValueType # 6
SCOPE_CALL: Scope.ValueType # 7
SCOPE_SEQUENCE: Scope.ValueType # 8
SCOPE_SPAN_OF_CALLS: Scope.ValueType # 8
global___Scope = Scope

@typing.final
Expand Down
6 changes: 3 additions & 3 deletions capa/render/result_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -399,12 +399,12 @@ def from_capa(
if location in rule_matches:
# exact match, such as matching a call-scoped rule.
children.append(Match.from_capa(rules, capabilities, rule_matches[location]))
# we'd like to assert the scope of the current rule is "sequence"
# we'd like to assert the scope of the current rule is span-of-calls
# but we don't have that data here.
else:
# Sequence scopes can match each other, but they don't strictly contain each other,
# Span-of-calls scopes can match each other, but they don't strictly contain each other,
# like the way a function contains a basic block.
# So when we have a match within a sequence for another sequence, we need to look
# So when we have a match within a span for another span, we need to look
# for all the places it might be found.
#
# Despite the edge cases (like API hammering), this turns out to be pretty easy:
Expand Down
4 changes: 2 additions & 2 deletions capa/render/verbose.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def render_thread(layout: rd.DynamicLayout, addr: frz.Address) -> str:
return f"{name}{{pid:{thread.process.pid},tid:{thread.tid}}}"


def render_sequence(layout: rd.DynamicLayout, addrs: list[frz.Address]) -> str:
def render_span_of_calls(layout: rd.DynamicLayout, addrs: list[frz.Address]) -> str:
calls: list[capa.features.address.DynamicCallAddress] = [addr.to_capa() for addr in addrs] # type: ignore
for call in calls:
assert isinstance(call, capa.features.address.DynamicCallAddress)
Expand Down Expand Up @@ -328,7 +328,7 @@ def render_rules(console: Console, doc: rd.ResultDocument):
lines = [render_process(doc.meta.analysis.layout, loc) for loc in locations]
elif rule.meta.scopes.dynamic == capa.rules.Scope.THREAD:
lines = [render_thread(doc.meta.analysis.layout, loc) for loc in locations]
elif rule.meta.scopes.dynamic in (capa.rules.Scope.CALL, capa.rules.Scope.SEQUENCE):
elif rule.meta.scopes.dynamic in (capa.rules.Scope.CALL, capa.rules.Scope.SPAN_OF_CALLS):
# because we're only in verbose mode, we won't show the full call details (name, args, retval)
# we'll only show the details of the thread in which the calls are found.
# so select the thread locations and render those.
Expand Down
18 changes: 9 additions & 9 deletions capa/render/vverbose.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ def render_feature(
# of the output, so don't re-render it again for each feature.
pass
elif isinstance(layout, rd.DynamicLayout) and isinstance(feature, frzf.MatchFeature):
# don't render copies of the sequence address for submatches
# don't render copies of the span of calls address for submatches
pass
else:
render_locations(console, layout, match.locations, indent)
Expand Down Expand Up @@ -312,13 +312,13 @@ def render_match(
render_match(console, layout, rule, child, indent=indent + 1, mode=child_mode)


def collect_sequence_locations(
def collect_span_of_calls_locations(
match: rd.Match,
mode=MODE_SUCCESS,
):
"""
Find all the (call, sequence) locations used in a given sequence match, recursively.
Useful to collect the events used to match a sequence scoped rule.
Find all the call locations used in a given span-of-calls match, recursively.
Useful to collect the events used to match a span-of-calls scoped rule.
"""
if isinstance(match.node, rd.StatementNode):
if (
Expand All @@ -327,7 +327,7 @@ def collect_sequence_locations(
):
child_mode = MODE_FAILURE if mode == MODE_SUCCESS else MODE_SUCCESS
for child in match.children:
yield from collect_sequence_locations(child, child_mode)
yield from collect_span_of_calls_locations(child, child_mode)
elif isinstance(match.node.statement, rd.RangeStatement):
for location in match.locations:
if location.type not in (frz.AddressType.CALL,):
Expand All @@ -337,7 +337,7 @@ def collect_sequence_locations(
yield location
else:
for child in match.children:
yield from collect_sequence_locations(child, mode)
yield from collect_span_of_calls_locations(child, mode)
elif isinstance(match.node, rd.FeatureNode):
for location in match.locations:
if location.type not in (frz.AddressType.CALL,):
Expand Down Expand Up @@ -488,9 +488,9 @@ def render_rules(console: Console, doc: rd.ResultDocument):
console.write(v.render_process(doc.meta.analysis.layout, location))
elif rule.meta.scopes.dynamic == capa.rules.Scope.THREAD:
console.write(v.render_thread(doc.meta.analysis.layout, location))
elif rule.meta.scopes.dynamic == capa.rules.Scope.SEQUENCE:
calls = sorted(set(collect_sequence_locations(match)))
console.write(hanging_indent(v.render_sequence(doc.meta.analysis.layout, calls), indent=1))
elif rule.meta.scopes.dynamic == capa.rules.Scope.SPAN_OF_CALLS:
calls = sorted(set(collect_span_of_calls_locations(match)))
console.write(hanging_indent(v.render_span_of_calls(doc.meta.analysis.layout, calls), indent=1))
elif rule.meta.scopes.dynamic == capa.rules.Scope.CALL:
console.write(hanging_indent(v.render_call(doc.meta.analysis.layout, location), indent=1))
else:
Expand Down
Loading

0 comments on commit 06472c1

Please sign in to comment.