From 1389e0fb7ed0fd851e22df25260c43781f61d946 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Thu, 5 Dec 2024 14:28:32 +0000 Subject: [PATCH 1/4] binexport2: prune operands more precisely --- .../features/extractors/binexport2/helpers.py | 61 ++++++------------- 1 file changed, 20 insertions(+), 41 deletions(-) diff --git a/capa/features/extractors/binexport2/helpers.py b/capa/features/extractors/binexport2/helpers.py index 29c40e81d..47bee6d06 100644 --- a/capa/features/extractors/binexport2/helpers.py +++ b/capa/features/extractors/binexport2/helpers.py @@ -52,37 +52,26 @@ def is_vertex_type(vertex: BinExport2.CallGraph.Vertex, type_: BinExport2.CallGr # internal to `build_expression_tree` # this is unstable: it is subject to change, so don't rely on it! -def _prune_expression_tree_empty_shifts( - be2: BinExport2, - operand: BinExport2.Operand, +def _prune_expression_tree_references_to_tree_index( expression_tree: list[list[int]], tree_index: int, ): - expression_index = operand.expression_index[tree_index] - expression = be2.expression[expression_index] - children_tree_indexes: list[int] = expression_tree[tree_index] - - if expression.type == BinExport2.Expression.OPERATOR: - if len(children_tree_indexes) == 0 and expression.symbol in ("lsl", "lsr"): - # Ghidra may emit superfluous lsl nodes with no children. - # https://github.com/mandiant/capa/pull/2340/files#r1750003919 - # Which is maybe: https://github.com/NationalSecurityAgency/ghidra/issues/6821#issuecomment-2295394697 - # - # Which seems to be as if the shift wasn't there (shift of #0) - # so we want to remove references to this node from any parent nodes. - for tree_node in expression_tree: - if tree_index in tree_node: - tree_node.remove(tree_index) + # `i` is the index of the tree node that we'll search for `tree_index` + # if we remove `tree_index` from it, and it is now empty, + # then we'll need to prune references to `i`. + for i, tree_node in enumerate(expression_tree): + if tree_index in tree_node: + tree_node.remove(tree_index) - return - - for child_tree_index in children_tree_indexes: - _prune_expression_tree_empty_shifts(be2, operand, expression_tree, child_tree_index) + if len(tree_node) == 0: + # if the parent node is now empty, + # remove references to that parent node. + _prune_expression_tree_references_to_tree_index(expression_tree, i) # internal to `build_expression_tree` # this is unstable: it is subject to change, so don't rely on it! -def _prune_expression_tree_empty_commas( +def _prune_expression_tree_empty_shifts( be2: BinExport2, operand: BinExport2.Operand, expression_tree: list[list[int]], @@ -93,27 +82,19 @@ def _prune_expression_tree_empty_commas( children_tree_indexes: list[int] = expression_tree[tree_index] if expression.type == BinExport2.Expression.OPERATOR: - if len(children_tree_indexes) == 1 and expression.symbol == ",": - # Due to the above pruning of empty LSL or LSR expressions, - # the parents might need to be fixed up. - # - # Specifically, if the pruned node was part of a comma list with two children, - # now there's only a single child, which renders as an extra comma, - # so we replace references to the comma node with the immediate child. + if len(children_tree_indexes) == 0 and expression.symbol in ("lsl", "lsr"): + # Ghidra may emit superfluous lsl nodes with no children. + # https://github.com/mandiant/capa/pull/2340/files#r1750003919 + # Which is maybe: https://github.com/NationalSecurityAgency/ghidra/issues/6821#issuecomment-2295394697 # - # A more correct way of doing this might be to walk up the parents and do fixups, - # but I'm not quite sure how to do this yet. Just do two passes right now. - child = children_tree_indexes[0] - - for tree_node in expression_tree: - tree_node.index - if tree_index in tree_node: - tree_node[tree_node.index(tree_index)] = child + # Which seems to be as if the shift wasn't there (shift of #0) + # so we want to remove references to this node from any parent nodes. + _prune_expression_tree_references_to_tree_index(expression_tree, tree_index) return for child_tree_index in children_tree_indexes: - _prune_expression_tree_empty_commas(be2, operand, expression_tree, child_tree_index) + _prune_expression_tree_empty_shifts(be2, operand, expression_tree, child_tree_index) # internal to `build_expression_tree` @@ -124,7 +105,6 @@ def _prune_expression_tree( expression_tree: list[list[int]], ): _prune_expression_tree_empty_shifts(be2, operand, expression_tree, 0) - _prune_expression_tree_empty_commas(be2, operand, expression_tree, 0) # this is unstable: it is subject to change, so don't rely on it! @@ -173,7 +153,6 @@ def _build_expression_tree( tree.append(children) _prune_expression_tree(be2, operand, tree) - _prune_expression_tree(be2, operand, tree) return tree From 1104188103327351f399e6ed9038b600e9fd1005 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Thu, 5 Dec 2024 14:28:59 +0000 Subject: [PATCH 2/4] inspect-binexport2: better render ARM lsl/lsr and pruned expressions --- scripts/inspect-binexport2.py | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/scripts/inspect-binexport2.py b/scripts/inspect-binexport2.py index 2b59e35e9..9e205b0d2 100644 --- a/scripts/inspect-binexport2.py +++ b/scripts/inspect-binexport2.py @@ -128,6 +128,11 @@ def _render_expression_tree( if expression.symbol != "!": o.write(expression.symbol) + if expression.symbol in ("lsl", "lsr"): + # like: lsl 16 + # not like: lsl16 + o.write(" ") + child_index = children_tree_indexes[0] _render_expression_tree(be2, operand, expression_tree, child_index, o) @@ -141,7 +146,13 @@ def _render_expression_tree( child_a = children_tree_indexes[0] child_b = children_tree_indexes[1] _render_expression_tree(be2, operand, expression_tree, child_a, o) + o.write(expression.symbol) + if expression.symbol == ",": + # like: 10, 20 + # not like 10,20 + o.write(" ") + _render_expression_tree(be2, operand, expression_tree, child_b, o) return @@ -152,11 +163,19 @@ def _render_expression_tree( child_c = children_tree_indexes[2] _render_expression_tree(be2, operand, expression_tree, child_a, o) o.write(expression.symbol) + if expression.symbol == ",": + o.write(" ") _render_expression_tree(be2, operand, expression_tree, child_b, o) o.write(expression.symbol) + if expression.symbol == ",": + o.write(" ") _render_expression_tree(be2, operand, expression_tree, child_c, o) return + elif len(children_tree_indexes) == 0: + # like when all subtrees have been pruned: don't render anything + return + else: raise NotImplementedError(len(children_tree_indexes)) @@ -362,10 +381,17 @@ def main(argv=None): operands = [] for operand_index in instruction.operand_index: operand = be2.operand[operand_index] - # Ghidra bug where empty operands (no expressions) may - # exist so we skip those for now (see https://github.com/NationalSecurityAgency/ghidra/issues/6817) - if len(operand.expression_index) > 0: - operands.append(render_operand(be2, operand, index=operand_index)) + if not operand.expression_index: + # Ghidra bug where empty operands (no expressions) may + # exist so we skip those for now (see https://github.com/NationalSecurityAgency/ghidra/issues/6817) + continue + + op = render_operand(be2, operand, index=operand_index) + if not op: + # operand has been pruned away, so don't show it + continue + + operands.append(op) call_targets = "" if instruction.call_target: From 4e981e3e196353a508bab44575b19079f87cca8f Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Thu, 5 Dec 2024 14:29:44 +0000 Subject: [PATCH 3/4] changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1703ae491..5e11d0b2c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -46,6 +46,7 @@ - binja: fix unit test failure by fixing up the analysis for file al-khaser_x64.exe_ #2507 @xusheng6 - binja: move the stack string detection to function level #2516 @xusheng6 - BinExport2: fix handling of incorrect thunk functions #2524 @williballenthin +- BinExport2: more precise pruning of expressions @williballenthin ### capa Explorer Web From 3add05637e677cb682496e85856189c22e78df19 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Thu, 5 Dec 2024 15:56:28 +0000 Subject: [PATCH 4/4] binexport2: better pruning of comma expressions with a single child --- .../features/extractors/binexport2/helpers.py | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/capa/features/extractors/binexport2/helpers.py b/capa/features/extractors/binexport2/helpers.py index 47bee6d06..f23c95cbd 100644 --- a/capa/features/extractors/binexport2/helpers.py +++ b/capa/features/extractors/binexport2/helpers.py @@ -97,6 +97,41 @@ def _prune_expression_tree_empty_shifts( _prune_expression_tree_empty_shifts(be2, operand, expression_tree, child_tree_index) +# internal to `build_expression_tree` +# this is unstable: it is subject to change, so don't rely on it! +def _fixup_expression_tree_references_to_tree_index( + expression_tree: list[list[int]], + existing_index: int, + new_index: int, +): + for tree_node in expression_tree: + for i, index in enumerate(tree_node): + if index == existing_index: + tree_node[i] = new_index + + +# internal to `build_expression_tree` +# this is unstable: it is subject to change, so don't rely on it! +def _fixup_expression_tree_lonely_commas( + be2: BinExport2, + operand: BinExport2.Operand, + expression_tree: list[list[int]], + tree_index: int, +): + expression_index = operand.expression_index[tree_index] + expression = be2.expression[expression_index] + children_tree_indexes: list[int] = expression_tree[tree_index] + + if expression.type == BinExport2.Expression.OPERATOR: + if len(children_tree_indexes) == 1 and expression.symbol == ",": + existing_index = tree_index + new_index = children_tree_indexes[0] + _fixup_expression_tree_references_to_tree_index(expression_tree, existing_index, new_index) + + for child_tree_index in children_tree_indexes: + _fixup_expression_tree_lonely_commas(be2, operand, expression_tree, child_tree_index) + + # internal to `build_expression_tree` # this is unstable: it is subject to change, so don't rely on it! def _prune_expression_tree( @@ -105,6 +140,7 @@ def _prune_expression_tree( expression_tree: list[list[int]], ): _prune_expression_tree_empty_shifts(be2, operand, expression_tree, 0) + _fixup_expression_tree_lonely_commas(be2, operand, expression_tree, 0) # this is unstable: it is subject to change, so don't rely on it!