about summary refs log tree commit diff stats
path: root/miasm2/analysis/data_flow.py
diff options
context:
space:
mode:
Diffstat (limited to 'miasm2/analysis/data_flow.py')
-rw-r--r--miasm2/analysis/data_flow.py364
1 files changed, 263 insertions, 101 deletions
diff --git a/miasm2/analysis/data_flow.py b/miasm2/analysis/data_flow.py
index 4bf64e25..2201a088 100644
--- a/miasm2/analysis/data_flow.py
+++ b/miasm2/analysis/data_flow.py
@@ -11,6 +11,7 @@ from miasm2.expression.expression_helper import possible_values
 from miasm2.analysis.ssa import get_phi_sources_parent_block, \
     irblock_has_phi
 
+
 class ReachingDefinitions(dict):
     """
     Computes for each assignblock the set of reaching definitions.
@@ -291,6 +292,7 @@ def _test_merge_next_block(ircfg, loc_key):
         return None
     if son not in ircfg.blocks:
         return None
+
     return son
 
 
@@ -328,14 +330,16 @@ def _do_merge_blocks(ircfg, loc_key, son_loc_key):
     ircfg.blocks[loc_key] = new_block
 
 
-def _test_jmp_only(ircfg, loc_key):
+def _test_jmp_only(ircfg, loc_key, heads):
     """
     If irblock at @loc_key sets only IRDst to an ExprLoc, return the
     corresponding loc_key target.
+    Avoid creating predecssors for heads LocKeys
     None in other cases.
 
     @ircfg: IRCFG instance
     @loc_key: LocKey instance of the candidate irblock
+    @heads: LocKey heads of the graph
 
     """
 
@@ -347,11 +351,20 @@ def _test_jmp_only(ircfg, loc_key):
     items = dict(irblock.assignblks[0]).items()
     if len(items) != 1:
         return None
+    if len(ircfg.successors(loc_key)) != 1:
+        return None
+    # Don't create predecessors on heads
     dst, src = items[0]
     assert dst.is_id("IRDst")
     if not src.is_loc():
         return None
-    return src.loc_key
+    dst = src.loc_key
+    if loc_key in heads:
+        predecessors = set(ircfg.predecessors(dst))
+        predecessors.difference_update(set([loc_key]))
+        if predecessors:
+            return None
+    return dst
 
 
 def _relink_block_node(ircfg, loc_key, son_loc_key, replace_dct):
@@ -396,7 +409,6 @@ def _remove_to_son(ircfg, loc_key, son_loc_key):
 
     # Unlink block destinations
     ircfg.del_edge(loc_key, son_loc_key)
-    del ircfg.blocks[loc_key]
 
     replace_dct = {
         ExprLoc(loc_key, ircfg.IRDst.size):ExprLoc(son_loc_key, ircfg.IRDst.size)
@@ -404,6 +416,9 @@ def _remove_to_son(ircfg, loc_key, son_loc_key):
 
     _relink_block_node(ircfg, loc_key, son_loc_key, replace_dct)
 
+    ircfg.del_node(loc_key)
+    del ircfg.blocks[loc_key]
+
     return True
 
 
@@ -433,7 +448,6 @@ def _remove_to_parent(ircfg, loc_key, son_loc_key):
 
     ircfg.blocks[son_loc_key] = new_irblock
 
-    del ircfg.blocks[son_loc_key]
     ircfg.add_irblock(new_irblock)
 
     replace_dct = {
@@ -442,10 +456,14 @@ def _remove_to_parent(ircfg, loc_key, son_loc_key):
 
     _relink_block_node(ircfg, son_loc_key, loc_key, replace_dct)
 
+
+    ircfg.del_node(son_loc_key)
+    del ircfg.blocks[son_loc_key]
+
     return True
 
 
-def merge_blocks(ircfg, loc_key_entries):
+def merge_blocks(ircfg, heads):
     """
     This function modifies @ircfg to apply the following transformations:
     - group an irblock with its son if the irblock has one and only one son and
@@ -457,10 +475,12 @@ def merge_blocks(ircfg, loc_key_entries):
       IRDst with a given label, this irblock is dropped and its son becomes the
       head. References are fixed
 
+    This function avoid creating predecessors on heads
+
     Return True if at least an irblock has been modified
 
     @ircfg: IRCFG instance
-    @loc_key_entries: loc_key to keep
+    @heads: loc_key to keep
     """
 
     modified = False
@@ -470,15 +490,15 @@ def merge_blocks(ircfg, loc_key_entries):
 
         # Test merge block
         son = _test_merge_next_block(ircfg, loc_key)
-        if son is not None and son not in loc_key_entries:
+        if son is not None and son not in heads:
             _do_merge_blocks(ircfg, loc_key, son)
             todo.add(loc_key)
             modified = True
             continue
 
         # Test jmp only block
-        son = _test_jmp_only(ircfg, loc_key)
-        if son is not None and loc_key not in loc_key_entries:
+        son = _test_jmp_only(ircfg, loc_key, heads)
+        if son is not None and loc_key not in heads:
             ret = _remove_to_son(ircfg, loc_key, son)
             modified |= ret
             if ret:
@@ -487,7 +507,7 @@ def merge_blocks(ircfg, loc_key_entries):
 
         # Test head jmp only block
         if (son is not None and
-            son not in loc_key_entries and
+            son not in heads and
             son in ircfg.blocks):
             # jmp only test done previously
             ret = _remove_to_parent(ircfg, loc_key, son)
@@ -510,17 +530,19 @@ def remove_empty_assignblks(ircfg):
     modified = False
     for loc_key, block in ircfg.blocks.iteritems():
         irs = []
+        block_modified = False
         for assignblk in block:
             if len(assignblk):
                 irs.append(assignblk)
             else:
-                modified = True
-        ircfg.blocks[loc_key] = IRBlock(loc_key, irs)
-
+                block_modified = True
+        if block_modified:
+            new_irblock = IRBlock(loc_key, irs)
+            ircfg.blocks[loc_key] = new_irblock
+            modified = True
     return modified
 
 
-
 class SSADefUse(DiGraph):
     """
     Generate DefUse information from SSA transformation
@@ -528,33 +550,28 @@ class SSADefUse(DiGraph):
     """
 
     def add_var_def(self, node, src):
-        lbl, index, dst = node
-        index2dst = self._links.setdefault(lbl, {})
-        dst2src = index2dst.setdefault(index, {})
-        dst2src[dst] = src
+        index2dst = self._links.setdefault(node.label, {})
+        dst2src = index2dst.setdefault(node.index, {})
+        dst2src[node.var] = src
 
     def add_def_node(self, def_nodes, node, src):
-        lbl, index, dst = node
-        if dst.is_id():
-            def_nodes[dst] = node
+        if node.var.is_id():
+            def_nodes[node.var] = node
 
     def add_use_node(self, use_nodes, node, src):
-        lbl, index, dst = node
         sources = set()
-        if dst.is_mem():
-            sources.update(dst.ptr.get_r(mem_read=True))
+        if node.var.is_mem():
+            sources.update(node.var.ptr.get_r(mem_read=True))
         sources.update(src.get_r(mem_read=True))
         for source in sources:
             if not source.is_mem():
                 use_nodes.setdefault(source, set()).add(node)
 
     def get_node_target(self, node):
-        lbl, index, reg = node
-        return self._links[lbl][index][reg]
+        return self._links[node.label][node.index][node.var]
 
     def set_node_target(self, node, src):
-        lbl, index, reg = node
-        self._links[lbl][index][reg] = src
+        self._links[node.label][node.index][node.var] = src
 
     @classmethod
     def from_ssa(cls, ssa):
@@ -575,7 +592,7 @@ class SSADefUse(DiGraph):
                 continue
             for index, assignblk in enumerate(block):
                 for dst, src in assignblk.iteritems():
-                    node = lbl, index, dst
+                    node = AssignblkNode(lbl, index, dst)
                     graph.add_var_def(node, src)
                     graph.add_def_node(def_nodes, node, src)
                     graph.add_use_node(use_nodes, node, src)
@@ -622,73 +639,58 @@ def expr_has_mem(expr):
     return expr_test_visit(expr, expr_has_mem_test)
 
 
-def expr_has_call_test(expr, result):
-    if result:
-        # Don't analyse if we already found a candidate
-        return False
-    if expr.is_op() and expr.op.startswith("call"):
-        result.add(expr)
-        return False
-    return True
-
-
-def expr_has_call(expr):
+class PropagateThroughExprId(object):
     """
-    Return True if expr contains at least one "call" operator
-    @expr: Expr instance
+    Propagate expressions though ExprId
     """
-    return expr_test_visit(expr, expr_has_call_test)
-
-
-class PropagateExpr(object):
-
-    def assignblk_is_propagation_barrier(self, assignblk):
-        for dst, src in assignblk.iteritems():
-            if expr_has_call(src):
-                return True
-            if dst.is_mem():
-                return True
-        return False
 
     def has_propagation_barrier(self, assignblks):
+        """
+        Return True if propagation cannot cross the @assignblks
+        @assignblks: list of AssignBlock to check
+        """
         for assignblk in assignblks:
             for dst, src in assignblk.iteritems():
-                if expr_has_call(src):
+                if src.is_function_call():
                     return True
                 if dst.is_mem():
                     return True
         return False
 
-    def is_mem_written(self, ssa, node, successor):
-        loc_a, index_a, reg_a = node
-        loc_b, index_b, reg_b = successor
-        block_b = ssa.graph.blocks[loc_b]
+    def is_mem_written(self, ssa, node_a, node_b):
+        """
+        Return True if memory is written at least once between @node_a and
+        @node_b
 
-        nodes_to_do = self.compute_reachable_nodes_from_a_to_b(ssa.graph, loc_a, loc_b)
+        @node: AssignblkNode representing the start position
+        @successor: AssignblkNode representing the end position
+        """
 
+        block_b = ssa.graph.blocks[node_b.label]
+        nodes_to_do = self.compute_reachable_nodes_from_a_to_b(ssa.graph, node_a.label, node_b.label)
 
-        if loc_a == loc_b:
+        if node_a.label == node_b.label:
             # src is dst
-            assert nodes_to_do == set([loc_a])
-            if self.has_propagation_barrier(block_b.assignblks[index_a:index_b]):
+            assert nodes_to_do == set([node_a.label])
+            if self.has_propagation_barrier(block_b.assignblks[node_a.index:node_b.index]):
                 return True
         else:
-            # Check everyone but loc_a and loc_b
-            for loc in nodes_to_do - set([loc_a, loc_b]):
+            # Check everyone but node_a.label and node_b.label
+            for loc in nodes_to_do - set([node_a.label, node_b.label]):
                 block = ssa.graph.blocks[loc]
                 if self.has_propagation_barrier(block.assignblks):
                     return True
-            # Check loc_a partially
-            block_a = ssa.graph.blocks[loc_a]
-            if self.has_propagation_barrier(block_a.assignblks[index_a:]):
+            # Check node_a.label partially
+            block_a = ssa.graph.blocks[node_a.label]
+            if self.has_propagation_barrier(block_a.assignblks[node_a.index:]):
                 return True
-            if nodes_to_do.intersection(ssa.graph.successors(loc_b)):
-                # There is a path from loc_b to loc_b => Check loc_b fully
+            if nodes_to_do.intersection(ssa.graph.successors(node_b.label)):
+                # There is a path from node_b.label to node_b.label => Check node_b.label fully
                 if self.has_propagation_barrier(block_b.assignblks):
                     return True
             else:
-                # Check loc_b partially
-                if self.has_propagation_barrier(block_b.assignblks[:index_b]):
+                # Check node_b.label partially
+                if self.has_propagation_barrier(block_b.assignblks[:node_b.index]):
                     return True
         return False
 
@@ -699,45 +701,104 @@ class PropagateExpr(object):
 
     def propagation_allowed(self, ssa, to_replace, node_a, node_b):
         """
-        Return True if we can replace @node source into @node_b
+        Return True if we can replace @node_a source present in @to_replace into
+        @node_b
+
+        @node_a: AssignblkNode position
+        @node_b: AssignblkNode position
         """
-        loc_a, index_a, reg_a = node_a
-        if not expr_has_mem(to_replace[reg_a]):
+        if not expr_has_mem(to_replace[node_a.var]):
             return True
         if self.is_mem_written(ssa, node_a, node_b):
             return False
         return True
 
-    def propagate(self, ssa, head):
+
+    def get_var_definitions(self, ssa):
+        """
+        Return a dictionary linking variable to its assignment location
+        @ssa: SSADiGraph instance
+        """
+        ircfg = ssa.graph
+        def_dct = {}
+        for node in ircfg.nodes():
+            for index, assignblk in enumerate(ircfg.blocks[node]):
+                for dst, src in assignblk.iteritems():
+                    if not dst.is_id():
+                        continue
+                    if dst in ssa.immutable_ids:
+                        continue
+                    assert dst not in def_dct
+                    def_dct[dst] = node, index
+        return def_dct
+
+    def phi_has_identical_sources(self, ssa, def_dct, var):
+        """
+        If phi operation has identical source values, return it; else None
+        @ssa: SSADiGraph instance
+        @def_dct: dictionary linking variable to its assignment location
+        @var: Phi destination variable
+        """
+        loc_key, index = def_dct[var]
+        sources = ssa.graph.blocks[loc_key][index][var]
+        assert sources.is_op('Phi')
+        sources_values = set()
+        for src in sources.args:
+            assert src in def_dct
+            loc_key, index = def_dct[src]
+            value = ssa.graph.blocks[loc_key][index][src]
+            sources_values.add(value)
+        if len(sources_values) != 1:
+            return None
+        return list(sources_values)[0]
+
+    def get_candidates(self, ssa, head, max_expr_depth):
+        def_dct = self.get_var_definitions(ssa)
         defuse = SSADefUse.from_ssa(ssa)
         to_replace = {}
         node_to_reg = {}
         for node in defuse.nodes():
-            lbl, index, reg = node
+            if node.var in ssa.immutable_ids:
+                continue
             src = defuse.get_node_target(node)
-            if expr_has_call(src):
+            if max_expr_depth is not None and len(str(src)) > max_expr_depth:
                 continue
-            if src.is_op('Phi'):
+            if src.is_function_call():
+                continue
+            if node.var.is_mem():
                 continue
-            if reg.is_mem():
+            if src.is_op('Phi'):
+                ret = self.phi_has_identical_sources(ssa, def_dct, node.var)
+                if ret:
+                    to_replace[node.var] = ret
+                    node_to_reg[node] = node.var
                 continue
-            to_replace[reg] = src
-            node_to_reg[node] = reg
+            to_replace[node.var] = src
+            node_to_reg[node] = node.var
+        return node_to_reg, to_replace, defuse
 
+    def propagate(self, ssa, head, max_expr_depth=None):
+        """
+        Do expression propagation
+        @ssa: SSADiGraph instance
+        @head: the head location of the graph
+        @max_expr_depth: the maximum allowed depth of an expression
+        """
+        node_to_reg, to_replace, defuse = self.get_candidates(ssa, head, max_expr_depth)
         modified = False
         for node, reg in node_to_reg.iteritems():
             for successor in defuse.successors(node):
                 if not self.propagation_allowed(ssa, to_replace, node, successor):
                     continue
 
-                loc_a, index_a, reg_a = node
-                loc_b, index_b, reg_b = successor
-                block = ssa.graph.blocks[loc_b]
+                node_a = node
+                node_b = successor
+                block = ssa.graph.blocks[node_b.label]
 
-                replace = {reg_a: to_replace[reg_a]}
+                replace = {node_a.var: to_replace[node_a.var]}
                 # Replace
                 assignblks = list(block)
-                assignblk = block[index_b]
+                assignblk = block[node_b.index]
                 out = {}
                 for dst, src in assignblk.iteritems():
                     if src.is_op('Phi'):
@@ -745,8 +806,7 @@ class PropagateExpr(object):
                         continue
 
                     if src.is_mem():
-                        ptr = src.ptr
-                        ptr = ptr.replace_expr(replace)
+                        ptr = src.ptr.replace_expr(replace)
                         new_src = ExprMem(ptr, src.size)
                     else:
                         new_src = src.replace_expr(replace)
@@ -754,8 +814,7 @@ class PropagateExpr(object):
                     if dst.is_id():
                         new_dst = dst
                     elif dst.is_mem():
-                        ptr = dst.ptr
-                        ptr = ptr.replace_expr(replace)
+                        ptr = dst.ptr.replace_expr(replace)
                         new_dst = ExprMem(ptr, dst.size)
                     else:
                         new_dst = dst.replace_expr(replace)
@@ -765,9 +824,107 @@ class PropagateExpr(object):
                         modified = True
                     out[new_dst] = new_src
                 out = AssignBlock(out, assignblk.instr)
-                assignblks[index_b] = out
+                assignblks[node_b.index] = out
                 new_block = IRBlock(block.loc_key, assignblks)
                 ssa.graph.blocks[block.loc_key] = new_block
+
+        return modified
+
+
+
+class PropagateExprIntThroughExprId(PropagateThroughExprId):
+    """
+    Propagate ExprInt though ExprId: classic constant propagation
+    This is a sub family of PropagateThroughExprId.
+    It reduces leaves in expressions of a program.
+    """
+
+    def get_candidates(self, ssa, head, max_expr_depth):
+        defuse = SSADefUse.from_ssa(ssa)
+
+        to_replace = {}
+        node_to_reg = {}
+        for node in defuse.nodes():
+            src = defuse.get_node_target(node)
+            if not src.is_int():
+                continue
+            if src.is_function_call():
+                continue
+            if node.var.is_mem():
+                continue
+            to_replace[node.var] = src
+            node_to_reg[node] = node.var
+        return node_to_reg, to_replace, defuse
+
+    def propagation_allowed(self, ssa, to_replace, node_a, node_b):
+        """
+        Propagating ExprInt is always ok
+        """
+        return True
+
+
+class PropagateThroughExprMem(object):
+    """
+    Propagate through ExprMem in very simple cases:
+    - if no memory write between source and target
+    - if source does not contain any memory reference
+    """
+
+    def propagate(self, ssa, head, max_expr_depth=None):
+        ircfg = ssa.graph
+        todo = set()
+        modified = False
+        for block in ircfg.blocks.itervalues():
+            for i, assignblk in enumerate(block):
+                for dst, src in assignblk.iteritems():
+                    if not dst.is_mem():
+                        continue
+                    if expr_has_mem(src):
+                        continue
+                    todo.add((block.loc_key, i + 1, dst, src))
+                    ptr = dst.ptr
+                    for size in xrange(8, dst.size, 8):
+                        todo.add((block.loc_key, i + 1, ExprMem(ptr, size), src[:size]))
+
+        while todo:
+            loc_key, index, mem_dst, mem_src = todo.pop()
+            block = ircfg.blocks[loc_key]
+            assignblks = list(block)
+            block_modified = False
+            for i in xrange(index, len(block)):
+                assignblk = block[i]
+                write_mem = False
+                assignblk_modified = False
+                out = dict(assignblk)
+                out_new = {}
+                for dst, src in out.iteritems():
+                    if dst.is_mem():
+                        write_mem = True
+                        ptr = dst.ptr.replace_expr({mem_dst:mem_src})
+                        dst = ExprMem(ptr, dst.size)
+                    src = src.replace_expr({mem_dst:mem_src})
+                    out_new[dst] = src
+                if out != out_new:
+                    assignblk_modified = True
+
+                if assignblk_modified:
+                    assignblks[i] = AssignBlock(out_new, assignblk.instr)
+                    block_modified = True
+                if write_mem:
+                    break
+            else:
+                # If no memory written, we may propagate to sons
+                # if son has only parent
+                for successor in ircfg.successors(loc_key):
+                    predecessors = ircfg.predecessors(successor)
+                    if len(predecessors) != 1:
+                        continue
+                    todo.add((successor, 0, mem_dst, mem_src))
+
+            if block_modified:
+                modified = True
+                new_block = IRBlock(block.loc_key, assignblks)
+                ircfg.blocks[block.loc_key] = new_block
         return modified
 
 
@@ -832,15 +989,15 @@ def check_expr_below_stack(ir_arch_a, expr):
     return True
 
 
-def retrieve_stack_accesses(ir_arch_a, ssa):
+def retrieve_stack_accesses(ir_arch_a, ircfg):
     """
     Walk the ssa graph and find stack based variables.
     Return a dictionary linking stack base address to its size/name
     @ir_arch_a: ira instance
-    @ssa: SSADiGraph instance
+    @ircfg: IRCFG instance
     """
     stack_vars = set()
-    for block in ssa.graph.blocks.itervalues():
+    for block in ircfg.blocks.itervalues():
         for assignblk in block:
             for dst, src in assignblk.iteritems():
                 stack_vars.update(get_stack_accesses(ir_arch_a, dst))
@@ -906,18 +1063,23 @@ def replace_mem_stack_vars(expr, base_to_info):
     return expr.visit(lambda expr:fix_stack_vars(expr, base_to_info))
 
 
-def replace_stack_vars(ir_arch_a, ssa):
+def replace_stack_vars(ir_arch_a, ircfg):
     """
     Try to replace stack based memory accesses by variables.
+
+    Hypothesis: the input ircfg must have all it's accesses to stack explicitly
+    done through the stack register, ie every aliases on those variables is
+    resolved.
+
     WARNING: may fail
 
     @ir_arch_a: ira instance
-    @ssa: SSADiGraph instance
+    @ircfg: IRCFG instance
     """
 
-    base_to_info = retrieve_stack_accesses(ir_arch_a, ssa)
+    base_to_info = retrieve_stack_accesses(ir_arch_a, ircfg)
     modified = False
-    for block in ssa.graph.blocks.itervalues():
+    for block in ircfg.blocks.itervalues():
         assignblks = []
         for assignblk in block:
             out = {}
@@ -932,7 +1094,7 @@ def replace_stack_vars(ir_arch_a, ssa):
             out = AssignBlock(out, assignblk.instr)
             assignblks.append(out)
         new_block = IRBlock(block.loc_key, assignblks)
-        ssa.graph.blocks[block.loc_key] = new_block
+        ircfg.blocks[block.loc_key] = new_block
     return modified
 
 
@@ -975,7 +1137,7 @@ def load_from_int(ir_arch, bs, is_addr_ro_variable):
     """
 
     modified = False
-    for label, block in ir_arch.blocks.iteritems():
+    for block in ir_arch.blocks.itervalues():
         assignblks = list()
         for assignblk in block:
             out = {}