Skip to content

Commit

Permalink
sequence: don't use sequence addresses
Browse files Browse the repository at this point in the history
sequence: remove sequence address
  • Loading branch information
williballenthin committed Jan 16, 2025
1 parent f12d50c commit f54cc7e
Show file tree
Hide file tree
Showing 10 changed files with 199 additions and 320 deletions.
13 changes: 2 additions & 11 deletions capa/capabilities/dynamic.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
import capa.render.result_document as rdoc
from capa.rules import Scope, RuleSet
from capa.engine import FeatureSet, MatchResults
from capa.features.address import DynamicCallAddress, DynamicSequenceAddress, _NoAddress
from capa.features.address import _NoAddress
from capa.capabilities.common import Capabilities, find_file_capabilities
from capa.features.extractors.base_extractor import CallHandle, ThreadHandle, ProcessHandle, DynamicFeatureExtractor

Expand Down Expand Up @@ -98,7 +98,6 @@ def __init__(self, ruleset: RuleSet):
# With this approach, our algorithm performance is independent of SEQUENCE_SIZE.
# The naive algorithm, of merging all the trailing feature sets at each call, is dependent upon SEQUENCE_SIZE
# (that is, runtime gets slower the larger SEQUENCE_SIZE is).
self.current_call_addresses: collections.deque[DynamicCallAddress] = collections.deque(maxlen=SEQUENCE_SIZE)
self.current_feature_sets: collections.deque[FeatureSet] = collections.deque(maxlen=SEQUENCE_SIZE)
self.current_features: FeatureSet = collections.defaultdict(set)

Expand All @@ -107,12 +106,6 @@ def __init__(self, ruleset: RuleSet):
self.last_sequence_matches: set[str] = set()

def next(self, ch: CallHandle, call_features: FeatureSet):
self.current_call_addresses.append(ch.address)
# TODO: it would be nice to create this only when needed, since it generates garbage.
sequence_address = DynamicSequenceAddress(
ch.address.thread, id=ch.address.id, calls=tuple(address.id for address in self.current_call_addresses)
)

# As we add items to the end of the deque, overflow and drop the oldest items (at the left end).
# While we could rely on `deque.append` with `maxlen` set (which we provide above),
# we want to use the dropped item first, to remove the old features, so we manually pop it here.
Expand Down Expand Up @@ -143,7 +136,7 @@ def next(self, ch: CallHandle, call_features: FeatureSet):
# don't update in place!
self.current_features[feature] |= vas

_, matches = self.ruleset.match(Scope.SEQUENCE, self.current_features, sequence_address)
_, matches = self.ruleset.match(Scope.SEQUENCE, self.current_features, ch.address)

newly_encountered_rules = set(matches.keys()) - self.last_sequence_matches

Expand All @@ -164,9 +157,7 @@ def next(self, ch: CallHandle, call_features: FeatureSet):
for new_rule in newly_encountered_rules:
suppressed_rules -= set(self.ruleset.rules[new_rule].get_dependencies(self.ruleset.rules_by_namespace))

# TODO: if smatches: create the sequence location
for rule_name, res in matches.items():
# TODO: maybe just garbage collect here better.
if rule_name in suppressed_rules:
continue
self.matches[rule_name].extend(res)
Expand Down
27 changes: 0 additions & 27 deletions capa/features/address.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,33 +121,6 @@ def __lt__(self, other):
return (self.thread, self.id) < (other.thread, other.id)


class DynamicSequenceAddress(Address):
    """Address of a sequence of calls in a dynamic execution trace.

    A sequence is identified by its thread together with the ID of the call
    that identifies the sequence. The IDs of the calls contained within the
    sequence are carried along in ``calls`` but take no part in equality or
    hashing.
    """

    def __init__(self, thread: ThreadAddress, id: int, calls: tuple[int, ...]):
        assert id >= 0
        self.thread = thread
        # ID of the call that identifies this sequence
        self.id = id
        # IDs of the calls contained within this sequence.
        # not required for identity, because the id + SEQUENCE_SIZE will dictate this.
        self.calls = calls

    def __repr__(self):
        return f"{self.thread}, sequence(id: {self.id})"

    def __hash__(self):
        # calls not required for identity, because the id + SEQUENCE_SIZE will be sufficient.
        # the "sequence" tag is part of the hashed tuple alongside thread and id.
        return hash(("sequence", self.thread, self.id))

    def __eq__(self, other):
        # only another sequence address with the same (thread, id) is equal; `calls` is ignored.
        return isinstance(other, DynamicSequenceAddress) and (self.thread, self.id) == (other.thread, other.id)

    def __lt__(self, other):
        # order by (thread, id).
        # accept other sequence addresses in addition to call addresses:
        # the original assertion only admitted DynamicCallAddress, so comparing (or sorting)
        # two sequence addresses failed even though both sides expose `thread` and `id`.
        assert isinstance(other, (DynamicCallAddress, DynamicSequenceAddress))
        return (self.thread, self.id) < (other.thread, other.id)


class RelativeVirtualAddress(int, Address):
"""a memory address relative to a base address"""

Expand Down
17 changes: 0 additions & 17 deletions capa/features/freeze/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,12 +102,6 @@ def from_capa(cls, a: capa.features.address.Address) -> "Address":
elif isinstance(a, capa.features.address.DynamicCallAddress):
return cls(type=AddressType.CALL, value=(a.thread.process.ppid, a.thread.process.pid, a.thread.tid, a.id))

elif isinstance(a, capa.features.address.DynamicSequenceAddress):
return cls(
type=AddressType.SEQUENCE,
value=(a.thread.process.ppid, a.thread.process.pid, a.thread.tid, a.id, a.calls),
)

elif a == capa.features.address.NO_ADDRESS or isinstance(a, capa.features.address._NoAddress):
return cls(type=AddressType.NO_ADDRESS, value=None)

Expand Down Expand Up @@ -171,17 +165,6 @@ def to_capa(self) -> capa.features.address.Address:
id=id_,
)

elif self.type is AddressType.SEQUENCE:
assert isinstance(self.value, tuple)
ppid, pid, tid, id_, calls = self.value
return capa.features.address.DynamicSequenceAddress(
thread=capa.features.address.ThreadAddress(
process=capa.features.address.ProcessAddress(ppid=ppid, pid=pid), tid=tid
),
id=id_,
calls=calls,
)

elif self.type is AddressType.NO_ADDRESS:
return capa.features.address.NO_ADDRESS

Expand Down
27 changes: 0 additions & 27 deletions capa/render/proto/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,25 +142,6 @@ def addr_to_pb2(addr: frz.Address) -> capa_pb2.Address:
),
)

elif addr.type is AddressType.SEQUENCE:
assert isinstance(addr.value, tuple)
ppid, pid, tid, id_, calls = addr.value
assert isinstance(ppid, int)
assert isinstance(pid, int)
assert isinstance(tid, int)
assert isinstance(id_, int)
assert isinstance(calls, tuple)
return capa_pb2.Address(
type=capa_pb2.AddressType.ADDRESSTYPE_SEQUENCE,
ppid_pid_tid_id_calls=capa_pb2.Ppid_Pid_Tid_Id_Calls(
ppid=int_to_pb2(ppid),
pid=int_to_pb2(pid),
tid=int_to_pb2(tid),
id=int_to_pb2(id_),
calls=tuple(int_to_pb2(i) for i in calls),
),
)

elif addr.type is AddressType.NO_ADDRESS:
# value == None, so only set type
return capa_pb2.Address(type=capa_pb2.AddressType.ADDRESSTYPE_NO_ADDRESS)
Expand Down Expand Up @@ -656,14 +637,6 @@ def addr_from_pb2(addr: capa_pb2.Address) -> frz.Address:
id_ = int_from_pb2(addr.ppid_pid_tid_id.id)
return frz.Address(type=frz.AddressType.CALL, value=(ppid, pid, tid, id_))

elif addr.type == capa_pb2.AddressType.ADDRESSTYPE_SEQUENCE:
ppid = int_from_pb2(addr.ppid_pid_tid_id_calls.ppid)
pid = int_from_pb2(addr.ppid_pid_tid_id_calls.pid)
tid = int_from_pb2(addr.ppid_pid_tid_id_calls.tid)
id_ = int_from_pb2(addr.ppid_pid_tid_id_calls.id)
calls = tuple(int_from_pb2(i) for i in addr.ppid_pid_tid_id_calls.calls)
return frz.Address(type=frz.AddressType.SEQUENCE, value=(ppid, pid, tid, id_, calls))

elif addr.type == capa_pb2.AddressType.ADDRESSTYPE_NO_ADDRESS:
return frz.Address(type=frz.AddressType.NO_ADDRESS, value=None)

Expand Down
10 changes: 0 additions & 10 deletions capa/render/proto/capa.proto
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ message Address {
Ppid_Pid ppid_pid = 4;
Ppid_Pid_Tid ppid_pid_tid = 5;
Ppid_Pid_Tid_Id ppid_pid_tid_id = 6;
Ppid_Pid_Tid_Id_Calls ppid_pid_tid_id_calls = 7;
};
}

Expand All @@ -45,7 +44,6 @@ enum AddressType {
ADDRESSTYPE_PROCESS = 7;
ADDRESSTYPE_THREAD = 8;
ADDRESSTYPE_CALL = 9;
ADDRESSTYPE_SEQUENCE = 10;
}

message Analysis {
Expand Down Expand Up @@ -489,14 +487,6 @@ message Ppid_Pid_Tid_Id {
Integer id = 4;
}

// Payload for a sequence address (ADDRESSTYPE_SEQUENCE): the ppid/pid/tid/id
// components, plus the IDs of the calls that make up the sequence.
message Ppid_Pid_Tid_Id_Calls {
Integer ppid = 1;
Integer pid = 2;
Integer tid = 3;
Integer id = 4;  // ID of the call that identifies the sequence
repeated Integer calls = 5;  // IDs of the calls contained within the sequence
}

message Integer { oneof value { uint64 u = 1; sint64 i = 2; } } // unsigned or signed int

message Number { oneof value { uint64 u = 1; sint64 i = 2; double f = 3; } }
278 changes: 138 additions & 140 deletions capa/render/proto/capa_pb2.py

Large diffs are not rendered by default.

58 changes: 16 additions & 42 deletions capa/render/proto/capa_pb2.pyi
Original file line number Diff line number Diff line change
@@ -1,6 +1,19 @@
"""
@generated by mypy-protobuf. Do not edit manually!
isort:skip_file
Copyright 2023 Google LLC
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import builtins
Expand Down Expand Up @@ -35,7 +48,6 @@ class _AddressTypeEnumTypeWrapper(google.protobuf.internal.enum_type_wrapper._En
ADDRESSTYPE_PROCESS: _AddressType.ValueType # 7
ADDRESSTYPE_THREAD: _AddressType.ValueType # 8
ADDRESSTYPE_CALL: _AddressType.ValueType # 9
ADDRESSTYPE_SEQUENCE: _AddressType.ValueType # 10

class AddressType(_AddressType, metaclass=_AddressTypeEnumTypeWrapper): ...

Expand All @@ -49,7 +61,6 @@ ADDRESSTYPE_NO_ADDRESS: AddressType.ValueType # 6
ADDRESSTYPE_PROCESS: AddressType.ValueType # 7
ADDRESSTYPE_THREAD: AddressType.ValueType # 8
ADDRESSTYPE_CALL: AddressType.ValueType # 9
ADDRESSTYPE_SEQUENCE: AddressType.ValueType # 10
global___AddressType = AddressType

class _Flavor:
Expand Down Expand Up @@ -131,7 +142,6 @@ class Address(google.protobuf.message.Message):
PPID_PID_FIELD_NUMBER: builtins.int
PPID_PID_TID_FIELD_NUMBER: builtins.int
PPID_PID_TID_ID_FIELD_NUMBER: builtins.int
PPID_PID_TID_ID_CALLS_FIELD_NUMBER: builtins.int
type: global___AddressType.ValueType
@property
def v(self) -> global___Integer: ...
Expand All @@ -143,8 +153,6 @@ class Address(google.protobuf.message.Message):
def ppid_pid_tid(self) -> global___Ppid_Pid_Tid: ...
@property
def ppid_pid_tid_id(self) -> global___Ppid_Pid_Tid_Id: ...
@property
def ppid_pid_tid_id_calls(self) -> global___Ppid_Pid_Tid_Id_Calls: ...
def __init__(
self,
*,
Expand All @@ -154,11 +162,10 @@ class Address(google.protobuf.message.Message):
ppid_pid: global___Ppid_Pid | None = ...,
ppid_pid_tid: global___Ppid_Pid_Tid | None = ...,
ppid_pid_tid_id: global___Ppid_Pid_Tid_Id | None = ...,
ppid_pid_tid_id_calls: global___Ppid_Pid_Tid_Id_Calls | None = ...,
) -> None: ...
def HasField(self, field_name: typing.Literal["ppid_pid", b"ppid_pid", "ppid_pid_tid", b"ppid_pid_tid", "ppid_pid_tid_id", b"ppid_pid_tid_id", "ppid_pid_tid_id_calls", b"ppid_pid_tid_id_calls", "token_offset", b"token_offset", "v", b"v", "value", b"value"]) -> builtins.bool: ...
def ClearField(self, field_name: typing.Literal["ppid_pid", b"ppid_pid", "ppid_pid_tid", b"ppid_pid_tid", "ppid_pid_tid_id", b"ppid_pid_tid_id", "ppid_pid_tid_id_calls", b"ppid_pid_tid_id_calls", "token_offset", b"token_offset", "type", b"type", "v", b"v", "value", b"value"]) -> None: ...
def WhichOneof(self, oneof_group: typing.Literal["value", b"value"]) -> typing.Literal["v", "token_offset", "ppid_pid", "ppid_pid_tid", "ppid_pid_tid_id", "ppid_pid_tid_id_calls"] | None: ...
def HasField(self, field_name: typing.Literal["ppid_pid", b"ppid_pid", "ppid_pid_tid", b"ppid_pid_tid", "ppid_pid_tid_id", b"ppid_pid_tid_id", "token_offset", b"token_offset", "v", b"v", "value", b"value"]) -> builtins.bool: ...
def ClearField(self, field_name: typing.Literal["ppid_pid", b"ppid_pid", "ppid_pid_tid", b"ppid_pid_tid", "ppid_pid_tid_id", b"ppid_pid_tid_id", "token_offset", b"token_offset", "type", b"type", "v", b"v", "value", b"value"]) -> None: ...
def WhichOneof(self, oneof_group: typing.Literal["value", b"value"]) -> typing.Literal["v", "token_offset", "ppid_pid", "ppid_pid_tid", "ppid_pid_tid_id"] | None: ...

global___Address = Address

Expand Down Expand Up @@ -1823,39 +1830,6 @@ class Ppid_Pid_Tid_Id(google.protobuf.message.Message):

global___Ppid_Pid_Tid_Id = Ppid_Pid_Tid_Id

@typing.final
class Ppid_Pid_Tid_Id_Calls(google.protobuf.message.Message):
DESCRIPTOR: google.protobuf.descriptor.Descriptor

PPID_FIELD_NUMBER: builtins.int
PID_FIELD_NUMBER: builtins.int
TID_FIELD_NUMBER: builtins.int
ID_FIELD_NUMBER: builtins.int
CALLS_FIELD_NUMBER: builtins.int
@property
def ppid(self) -> global___Integer: ...
@property
def pid(self) -> global___Integer: ...
@property
def tid(self) -> global___Integer: ...
@property
def id(self) -> global___Integer: ...
@property
def calls(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___Integer]: ...
def __init__(
self,
*,
ppid: global___Integer | None = ...,
pid: global___Integer | None = ...,
tid: global___Integer | None = ...,
id: global___Integer | None = ...,
calls: collections.abc.Iterable[global___Integer] | None = ...,
) -> None: ...
def HasField(self, field_name: typing.Literal["id", b"id", "pid", b"pid", "ppid", b"ppid", "tid", b"tid"]) -> builtins.bool: ...
def ClearField(self, field_name: typing.Literal["calls", b"calls", "id", b"id", "pid", b"pid", "ppid", b"ppid", "tid", b"tid"]) -> None: ...

global___Ppid_Pid_Tid_Id_Calls = Ppid_Pid_Tid_Id_Calls

@typing.final
class Integer(google.protobuf.message.Message):
DESCRIPTOR: google.protobuf.descriptor.Descriptor
Expand Down
78 changes: 39 additions & 39 deletions capa/render/result_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
from capa.rules import RuleSet
from capa.engine import MatchResults
from capa.helpers import assert_never, load_json_from_path
from capa.features.address import DynamicCallAddress, DynamicSequenceAddress
from capa.features.address import DynamicCallAddress

if TYPE_CHECKING:
from capa.capabilities.common import Capabilities
Expand Down Expand Up @@ -394,40 +394,29 @@ def from_capa(

for location in result.locations:

# TODO: assert source and destination rules are sequence scoped
if isinstance(location, capa.features.address.DynamicSequenceAddress):
# sequence scopes can match each other, but they don't strictly contain each other,
# like the way a function contains a basic block.
# so when we have a match within a sequence for another sequence, we need to look
# for all the places it might be found.

# where the wanted rule *could* be found.
# call ids within the current thread.
sequence_locations = set(location.calls)

# where the wanted rule *has* been found.
# call ids within the current thread.
match_locations = {
addr.id
for addr in rule_matches.keys()
if isinstance(addr, DynamicSequenceAddress) and addr.thread == location.thread
}

if not (sequence_locations & match_locations):
breakpoint()

new_children = []
for call_id in location.calls:
if match := rule_matches.get(DynamicSequenceAddress(location.thread, call_id, calls=())):
new_children.append(Match.from_capa(rules, capabilities, match))
elif match := rule_matches.get(DynamicCallAddress(location.thread, call_id)):
new_children.append(Match.from_capa(rules, capabilities, match))
else:
# there was no match at this call in the given sequence
pass

assert new_children, "failed to find locations for sequence matching sequence"
children.extend(new_children)
# keep this in sync with the copy below
if isinstance(location, DynamicCallAddress):
if location in rule_matches:
# exact match, such as matching a call-scoped rule.
children.append(Match.from_capa(rules, capabilities, rule_matches[location]))
# we'd like to assert the scope of the current rule is "sequence"
# but we don't have that data here.
else:
# Sequence scopes can match each other, but they don't strictly contain each other,
# like the way a function contains a basic block.
# So when we have a match within a sequence for another sequence, we need to look
# for all the places it might be found.
#
# Despite the edge cases (like API hammering), this turns out to be pretty easy:
# collect the most recent match (with the given name) prior to the wanted location.
matches_in_thread = sorted([
(a.id, m) for a, m in rule_matches.items()
if isinstance(a, DynamicCallAddress)
and a.thread == location.thread
and a.id <= location.id
])
_, most_recent_match = matches_in_thread[-1]
children.append(Match.from_capa(rules, capabilities, most_recent_match))

else:
children.append(Match.from_capa(rules, capabilities, rule_matches[location]))
Expand Down Expand Up @@ -462,9 +451,6 @@ def from_capa(
# in the meantime, the above might be sufficient.
rule_matches = dict(capabilities[rule.name])
for location in result.locations:

# TODO: update here too

# doc[locations] contains all matches for the given namespace.
# for example, the feature might be `match: anti-analysis/packer`
# which matches against "generic unpacker" and "UPX".
Expand All @@ -474,7 +460,21 @@ def from_capa(
# this is a subset of doc[locations].
#
# so, grab only the locations for current rule.
if location in rule_matches:

# keep this in sync with the block above.
if isinstance(location, DynamicCallAddress):
if location in rule_matches:
children.append(Match.from_capa(rules, capabilities, rule_matches[location]))
else:
matches_in_thread = sorted([
(a.id, m) for a, m in rule_matches.items()
if isinstance(a, DynamicCallAddress)
and a.thread == location.thread
and a.id <= location.id
])
_, most_recent_match = matches_in_thread[-1]
children.append(Match.from_capa(rules, capabilities, most_recent_match))
else:
children.append(Match.from_capa(rules, capabilities, rule_matches[location]))

return cls(
Expand Down
Loading

0 comments on commit f54cc7e

Please sign in to comment.