2 files changed, 259 insertions, 245 deletions
diff --git a/miasm2/analysis/data_flow.py b/miasm2/analysis/data_flow.py
new file mode 100644
index 00000000..b9764daa
--- /dev/null
+++ b/miasm2/analysis/data_flow.py
@@ -0,0 +1,252 @@
+"""Data flow analysis based on miasm intermediate representation"""
+
+from collections import namedtuple
+from miasm2.core.graph import DiGraph
+
+class ReachingDefinitions(dict):
+    """
+    Computes for each instruction the set of reaching definitions.
+    Example:
+    IR block:
+    lbl0:
+       0 A = 1
+         B = 3
+       1 B = 2
+       2 A = A + B + 4
+
+    Reach definition of lbl0:
+    (lbl0, 0) => {}
+    (lbl0, 1) => {A: {(lbl0, 0)}, B: {(lbl0, 0)}}
+    (lbl0, 2) => {A: {(lbl0, 0)}, B: {(lbl0, 1)}}
+    (lbl0, 3) => {A: {(lbl0, 2)}, B: {(lbl0, 1)}}
+
+    Source set 'REACHES' in: Kennedy, K. (1979).
+    A survey of data flow analysis techniques.
+    IBM Thomas J. Watson Research Division,  Algorithm MK
+
+    This class is usable as a dictionnary whose struture is
+    { (block, instr_index): { lvalue: set((block, instr_index)) } }
+    """
+
+    ir_a = None
+
+    def __init__(self, ir_a):
+        super(ReachingDefinitions, self).__init__()
+        self.ir_a = ir_a
+        self.compute()
+
+    def get_definitions(self, block_lbl, instruction):
+        """Returns the dict { lvalue: set((def_block_lbl, def_instr_index)) }
+        associated with self.ir_a.@block.irs[@instruction]
+        or {} if it is not yet computed
+        """
+        return self.get((block_lbl, instruction), {})
+
+    def compute(self):
+        """This is the main fixpoint"""
+        modified = True
+        while modified:
+            modified = False
+            for block in self.ir_a.blocks.itervalues():
+                modified |= self.process_block(block)
+
+    def process_block(self, block):
+        """
+        Fetch reach definitions from predecessors and propagate it to
+        the instruction in block @block.
+        """
+        predecessor_state = {}
+        for pred_lbl in self.ir_a.graph.predecessors(block.label):
+            pred = self.ir_a.blocks[pred_lbl]
+            for lval, definitions in self.get_definitions(pred_lbl, len(pred.irs)).iteritems():
+                predecessor_state.setdefault(lval, set()).update(definitions)
+
+        modified = self.get((block.label, 0)) != predecessor_state
+        if not modified:
+            return False
+        self[(block.label, 0)] = predecessor_state
+
+        for instr_index in xrange(len(block.irs)):
+            modified |= self.process_instruction(block, instr_index)
+        return modified
+
+    def process_instruction(self, block, instr_index):
+        """
+        Updates the reach definitions with values defined at
+        instruction @instr_index in block @block.
+        NB: the effect of instruction @instr_index in stored at index
+        (@block, @instr_index + 1).
+        """
+
+        instr = block.irs[instr_index]
+        defs = self.get_definitions(block.label, instr_index).copy()
+        for lval in instr:
+            defs.update({lval: set([(block.label, instr_index)])})
+
+        modified = self.get((block.label, instr_index + 1)) != defs
+        if modified:
+            self[(block.label, instr_index + 1)] = defs
+
+        return modified
+
+ATTR_DEP = {"color" : "black",
+            "_type" : "data"}
+
+InstrNode = namedtuple('InstructionNode', ['label', 'index', 'var'])
+
+class DiGraphDefUse(DiGraph):
+    """Representation of a Use-Definition graph as defined by
+    Kennedy, K. (1979). A survey of data flow analysis techniques.
+    IBM Thomas J. Watson Research Division.
+    Example:
+    IR block:
+    lbl0:
+       0 A = 1
+         B = 3
+       1 B = 2
+       2 A = A + B + 4
+
+    Def use analysis:
+    (lbl0, 0, A) => {(lbl0, 2, A)}
+    (lbl0, 0, B) => {}
+    (lbl0, 1, B) => {(lbl0, 2, A)}
+    (lbl0, 2, A) => {}
+
+    """
+
+
+    def __init__(self, reaching_defs,
+                 deref_mem=False, *args, **kwargs):
+        """Instanciate a DiGraphIR
+        @blocks: IR blocks
+        """
+        self._edge_attr = {}
+
+        # For dot display
+        self._filter_node = None
+        self._dot_offset = None
+        self._blocks = reaching_defs.ir_a.blocks
+
+        super(DiGraphDefUse, self).__init__(*args, **kwargs)
+        self._compute_def_use(reaching_defs,
+                              deref_mem=deref_mem)
+
+    def edge_attr(self, src, dst):
+        """
+        Return a dictionary of attributes for the edge between @src and @dst
+        @src: the source node of the edge
+        @dst: the destination node of the edge
+        """
+        return self._edge_attr[(src, dst)]
+
+    def _compute_def_use(self, reaching_defs,
+                         deref_mem=False):
+        for block in self._blocks.itervalues():
+            self._compute_def_use_block(block,
+                                        reaching_defs,
+                                        deref_mem=deref_mem)
+
+    def _compute_def_use_block(self, block, reaching_defs, deref_mem=False):
+        for ind, instr in enumerate(block.irs):
+            instruction_reaching_defs = reaching_defs.get_definitions(block.label, ind)
+            for lval, expr in instr.iteritems():
+                self.add_node(InstrNode(block.label, ind, lval))
+
+                read_vars = expr.get_r(mem_read=deref_mem)
+                if deref_mem and lval.is_mem():
+                    read_vars.update(lval.arg.get_r(mem_read=deref_mem))
+                for read_var in read_vars:
+                    for reach in instruction_reaching_defs.get(read_var, set()):
+                        self.add_data_edge(InstrNode(reach[0], reach[1], read_var),
+                                           InstrNode(block.label, ind, lval))
+
+    def del_edge(self, src, dst):
+        super(DiGraphDefUse, self).del_edge(src, dst)
+        del self._edge_attr[(src, dst)]
+
+    def add_uniq_labeled_edge(self, src, dst, edge_label):
+        """Adds the edge (@src, @dst) with label @edge_label.
+        if edge (@src, @dst) already exists, the previous label is overriden
+        """
+        self.add_uniq_edge(src, dst)
+        self._edge_attr[(src, dst)] = edge_label
+
+    def add_data_edge(self, src, dst):
+        """Adds an edge representing a data dependencie
+        and sets the label accordingly"""
+        self.add_uniq_labeled_edge(src, dst, ATTR_DEP)
+
+    def node2lines(self, node):
+        lbl, ind, reg = node
+        yield self.DotCellDescription(text="%s (%s)" % (lbl, ind),
+                                      attr={'align': 'center',
+                                            'colspan': 2,
+                                            'bgcolor': 'grey'})
+        src = self._blocks[lbl].irs[ind][reg]
+        line = "%s = %s" % (reg, src)
+        yield self.DotCellDescription(text=line, attr={})
+        yield self.DotCellDescription(text="", attr={})
+
+
+def dead_simp_useful_instrs(defuse, reaching_defs):
+    """Mark useful statements using previous reach analysis and defuse
+
+    Source : Kennedy, K. (1979). A survey of data flow analysis techniques.
+    IBM Thomas J. Watson Research Division,  Algorithm MK
+
+    Return a set of triplets (block, instruction number, instruction) of
+    useful instructions
+    PRE: compute_reach(self)
+
+    """
+    ir_a = reaching_defs.ir_a
+    useful = set()
+
+    for block_lbl, block in ir_a.blocks.iteritems():
+        successors = ir_a.graph.successors(block_lbl)
+        for successor in successors:
+            if successor not in ir_a.blocks:
+                keep_all_definitions = True
+                break
+        else:
+            keep_all_definitions = False
+
+        # Block has a nonexistant successor or is a leaf
+        if keep_all_definitions or (len(successors) == 0):
+            valid_definitions = reaching_defs.get_definitions(block_lbl,
+                                                              len(block.irs))
+            for lval, definitions in valid_definitions.iteritems():
+                if (lval in ir_a.get_out_regs(block)
+                    or keep_all_definitions):
+                    for definition in definitions:
+                        useful.add(InstrNode(definition[0], definition[1], lval))
+
+        # Force keeping of specific cases
+        for instr_index, instr in enumerate(block.irs):
+            for lval, rval in instr.iteritems():
+                if (lval.is_mem()
+                    or ir_a.IRDst == lval
+                    or rval.is_function_call()):
+                    useful.add(InstrNode(block_lbl, instr_index, lval))
+
+    # Useful nodes dependencies
+    for node in useful:
+        for parent in defuse.reachable_parents(node):
+            yield parent
+
+def dead_simp(ir_a):
+    """
+    This function is used to analyse relation of a * complete function *
+    This means the blocks under study represent a solid full function graph.
+
+    Source : Kennedy, K. (1979). A survey of data flow analysis techniques.
+    IBM Thomas J. Watson Research Division, page 43
+    """
+    reaching_defs = ReachingDefinitions(ir_a)
+    defuse = DiGraphDefUse(reaching_defs, deref_mem=True)
+    useful = set(dead_simp_useful_instrs(defuse, reaching_defs))
+    for block in ir_a.blocks.itervalues():
+        for idx, assignblk in enumerate(block.irs):
+            for lval in assignblk.keys():
+                if InstrNode(block.label, idx, lval) not in useful:
+                    del assignblk[lval]
diff --git a/miasm2/ir/analysis.py b/miasm2/ir/analysis.py
index 4f7b321f..5c5193c9 100644
--- a/miasm2/ir/analysis.py
+++ b/miasm2/ir/analysis.py
@@ -6,6 +6,8 @@ from miasm2.ir.symbexec import SymbolicExecutionEngine
 from miasm2.ir.ir import IntermediateRepresentation, AssignBlock
 from miasm2.expression.expression \
     import ExprAff, ExprCond, ExprId, ExprInt, ExprMem, ExprOp
+from miasm2.analysis.data_flow import dead_simp as new_dead_simp_imp
+import warnings
 
 log = logging.getLogger("analysis")
 console_handler = logging.StreamHandler()
@@ -27,10 +29,6 @@ class ira(IntermediateRepresentation):
 
     """
 
-    def ira_regs_ids(self):
-        """Returns ids of all registers used in the IR"""
-        return self.arch.regs.all_regs_ids + [self.IRDst]
-
     def call_effects(self, ad, instr):
         """Default modelisation of a function call to @ad. This may be used to:
 
@@ -58,247 +56,6 @@ class ira(IntermediateRepresentation):
             irb_cur.lines.append(instr)
         return None
 
-    def remove_dead_instr(self, irb, useful):
-        """Remove dead affectations using previous reaches analysis
-        @irb: irbloc instance
-        @useful: useful statements from previous reach analysis
-        Return True iff the block state has changed
-        PRE: compute_reach(self)
-        """
-        modified = False
-        for idx, assignblk in enumerate(irb.irs):
-            for dst in assignblk.keys():
-                if (isinstance(dst, ExprId) and
-                        (irb.label, idx, dst) not in useful):
-                    del assignblk[dst]
-                    modified = True
-        return modified
-
-    def init_useful_instr(self):
-        """Computes a set of triples (block, instruction number, instruction)
-        containing initially useful instructions :
-          - Instructions affecting final value of return registers
-          - Instructions affecting IRDst register
-          - Instructions writing in memory
-          - Function call instructions
-        Return set of intial useful instructions
-        """
-
-        useful = set()
-
-        for node in self.graph.nodes():
-            if node not in self.blocks:
-                continue
-
-            block = self.blocks[node]
-            successors = self.graph.successors(node)
-            has_son = bool(successors)
-            for p_son in successors:
-                if p_son not in self.blocks:
-                    # Leaf has lost its son: don't remove anything
-                    # reaching this block
-                    for r in self.ira_regs_ids():
-                        useful.update(block.irs[-1]._cur_reach[r].union(
-                            block.irs[-1].defout[r]))
-
-            # Function call, memory write or IRDst affectation
-            for idx, assignblk in enumerate(block.irs):
-                for dst, src in assignblk.iteritems():
-                    if src.is_function_call():
-                        # /!\ never remove ir calls
-                        useful.add((block.label, idx, dst))
-                    if isinstance(dst, ExprMem):
-                        useful.add((block.label, idx, dst))
-                    useful.update(block.irs[idx].defout[self.IRDst])
-
-            # Affecting return registers
-            if not has_son:
-                for r in self.get_out_regs(block):
-                    useful.update(block.irs[-1].defout[r]
-                                  if block.irs[-1].defout[r] else
-                                  block.irs[-1]._cur_reach[r])
-
-        return useful
-
-    def _mark_useful_code(self):
-        """Mark useful statements using previous reach analysis
-
-        Source : Kennedy, K. (1979). A survey of data flow analysis techniques.
-        IBM Thomas J. Watson Research Division,  Algorithm MK
-
-        Return a set of triplets (block, instruction number, instruction) of
-        useful instructions
-        PRE: compute_reach(self)
-
-        """
-
-        useful = self.init_useful_instr()
-        worklist = useful.copy()
-        while worklist:
-            elem = worklist.pop()
-            useful.add(elem)
-            irb_label, irs_ind, dst = elem
-
-            assignblk = self.blocks[irb_label].irs[irs_ind]
-            ins = assignblk.dst2ExprAff(dst)
-
-            # Handle dependencies of used variables in ins
-            for reg in ins.get_r(True).intersection(self.ira_regs_ids()):
-                worklist.update(
-                    assignblk._cur_reach[reg].difference(useful).difference(
-                        assignblk._cur_kill[reg]
-                        if not assignblk.defout[reg] else
-                        set()))
-                for _, _, defout_dst in assignblk.defout[reg]:
-                    # Loop case (dst in defout of current irb)
-                    if defout_dst == dst:
-                        worklist.update(
-                            assignblk._cur_reach[reg].difference(useful))
-        return useful
-
-    def remove_dead_code(self):
-        """Remove dead instructions in each block of the graph using the reach
-        analysis .
-        Returns True if a block has been modified
-        PRE : compute_reach(self)
-        """
-        useful = self._mark_useful_code()
-        modified = False
-        for irblock in self.blocks.values():
-            modified |= self.remove_dead_instr(irblock, useful)
-            # Remove useless structures
-            for assignblk in irblock.irs:
-                del assignblk._cur_kill
-                del assignblk._prev_kill
-                del assignblk._cur_reach
-                del assignblk._prev_reach
-        return modified
-
-    def set_dead_regs(self, b):
-        pass
-
-    def add_unused_regs(self):
-        pass
-
-    @staticmethod
-    def print_set(v_set):
-        """Print each triplet contained in a set
-        @v_set: set containing triplets elements
-        """
-        for p in v_set:
-            print '    (%s, %s, %s)' % p
-
-    def dump_bloc_state(self, irb):
-        print '*' * 80
-        for irs in irb.irs:
-            for assignblk in irs:
-                print 5 * "-"
-                print 'instr', assignblk
-                print 5 * "-"
-                for v in self.ira_regs_ids():
-                    if assignblk._cur_reach[v]:
-                        print 'REACH[%d][%s]' % (k, v)
-                        self.print_set(assignblk._cur_reach[v])
-                    if assignblk._cur_kill[v]:
-                        print 'KILL[%d][%s]' % (k, v)
-                        self.print_set(assignblk._cur_kill[v])
-                    if assignblk.defout[v]:
-                        print 'DEFOUT[%d][%s]' % (k, v)
-                        self.print_set(assignblk.defout[v])
-
-    def compute_reach_block(self, irb):
-        """Variable influence computation for a single block
-        @irb: irbloc instance
-        PRE: init_reach()
-        """
-
-        reach_block = {key: value.copy()
-                       for key, value in irb.irs[0]._cur_reach.iteritems()}
-
-        # Compute reach from predecessors
-        for n_pred in self.graph.predecessors(irb.label):
-            p_block = self.blocks[n_pred]
-
-            # Handle each register definition
-            for c_reg in self.ira_regs_ids():
-                # REACH(n) = U[p in pred] DEFOUT(p) U REACH(p)\KILL(p)
-                pred_through = p_block.irs[-1].defout[c_reg].union(
-                    p_block.irs[-1]._cur_reach[c_reg].difference(
-                        p_block.irs[-1]._cur_kill[c_reg]))
-                reach_block[c_reg].update(pred_through)
-
-        # If a predecessor has changed
-        if reach_block != irb.irs[0]._cur_reach:
-            irb.irs[0]._cur_reach = reach_block
-            for c_reg in self.ira_regs_ids():
-                if irb.irs[0].defout[c_reg]:
-                    # KILL(n) = DEFOUT(n) ? REACH(n)\DEFOUT(n) : EMPTY
-                    irb.irs[0]._cur_kill[c_reg].update(
-                        reach_block[c_reg].difference(irb.irs[0].defout[c_reg]))
-
-        # Compute reach and kill for block's instructions
-        for i in xrange(1, len(irb.irs)):
-            for c_reg in self.ira_regs_ids():
-                # REACH(n) = U[p in pred] DEFOUT(p) U REACH(p)\KILL(p)
-                pred_through = irb.irs[i - 1].defout[c_reg].union(
-                    irb.irs[i - 1]._cur_reach[c_reg].difference(
-                        irb.irs[i - 1]._cur_kill[c_reg]))
-                irb.irs[i]._cur_reach[c_reg].update(pred_through)
-                if irb.irs[i].defout[c_reg]:
-                    # KILL(n) = DEFOUT(n) ? REACH(n)\DEFOUT(n) : EMPTY
-                    irb.irs[i]._cur_kill[c_reg].update(
-                        irb.irs[i]._cur_reach[c_reg].difference(
-                            irb.irs[i].defout[c_reg]))
-
-    def _test_kill_reach_fix(self):
-        """Return True iff a fixed point has been reached during reach
-        analysis"""
-
-        fixed = True
-        for node in self.graph.nodes():
-            if node in self.blocks:
-                irb = self.blocks[node]
-                for assignblk in irb.irs:
-                    if (assignblk._cur_reach != assignblk._prev_reach or
-                            assignblk._cur_kill != assignblk._prev_kill):
-                        fixed = False
-                        # This is not a deepcopy, but cur_reach is assigned to a
-                        # new dictionnary on change in `compute_reach_block`
-                        assignblk._prev_reach = assignblk._cur_reach.copy()
-                        assignblk._prev_kill = assignblk._cur_kill.copy()
-        return fixed
-
-    def compute_reach(self):
-        """
-        Compute reach, defout and kill sets until a fixed point is reached.
-
-        Source : Kennedy, K. (1979). A survey of data flow analysis techniques.
-        IBM Thomas J. Watson Research Division, page 43
-        """
-        fixed_point = False
-        log.debug('iteration...')
-        while not fixed_point:
-            for node in self.graph.nodes():
-                if node in self.blocks:
-                    self.compute_reach_block(self.blocks[node])
-            fixed_point = self._test_kill_reach_fix()
-
-    def dead_simp(self):
-        """
-        This function is used to analyse relation of a * complete function *
-        This means the blocks under study represent a solid full function graph.
-
-        Source : Kennedy, K. (1979). A survey of data flow analysis techniques.
-        IBM Thomas J. Watson Research Division, page 43
-        """
-        # Update r/w variables for all irblocks
-        self.get_rw(self.ira_regs_ids())
-        # Liveness step
-        self.compute_reach()
-        self.remove_dead_code()
-        # Simplify expressions
-        self.simplify_blocs()
-
     def gen_equations(self):
         for irb in self.blocks.values():
             symbols_init = dict(self.arch.regs.all_regs_ids_init)
@@ -335,3 +92,8 @@ class ira(IntermediateRepresentation):
     def sizeof_pointer(self):
         "Return the size of a void* in bits"
         raise NotImplementedError("Abstract method")
+
+    def dead_simp(self):
+        """Deprecated: See miasm2.analysis.data_flow.dead_simp()"""
+        warnings.warn('DEPRECATION WARNING: Please use miasm2.analysis.data_flow.dead_simp(ira) instead of ira.dead_simp()')
+        new_dead_simp_imp(self)