diff options
Diffstat (limited to 'miasm2')
| -rw-r--r-- | miasm2/analysis/data_flow.py | 252 | ||||
| -rw-r--r-- | miasm2/ir/analysis.py | 252 |
2 files changed, 259 insertions, 245 deletions
diff --git a/miasm2/analysis/data_flow.py b/miasm2/analysis/data_flow.py new file mode 100644 index 00000000..b9764daa --- /dev/null +++ b/miasm2/analysis/data_flow.py @@ -0,0 +1,252 @@ +"""Data flow analysis based on miasm intermediate representation""" + +from collections import namedtuple +from miasm2.core.graph import DiGraph + +class ReachingDefinitions(dict): + """ + Computes for each instruction the set of reaching definitions. + Example: + IR block: + lbl0: + 0 A = 1 + B = 3 + 1 B = 2 + 2 A = A + B + 4 + + Reach definition of lbl0: + (lbl0, 0) => {} + (lbl0, 1) => {A: {(lbl0, 0)}, B: {(lbl0, 0)}} + (lbl0, 2) => {A: {(lbl0, 0)}, B: {(lbl0, 1)}} + (lbl0, 3) => {A: {(lbl0, 2)}, B: {(lbl0, 1)}} + + Source set 'REACHES' in: Kennedy, K. (1979). + A survey of data flow analysis techniques. + IBM Thomas J. Watson Research Division, Algorithm MK + + This class is usable as a dictionnary whose struture is + { (block, instr_index): { lvalue: set((block, instr_index)) } } + """ + + ir_a = None + + def __init__(self, ir_a): + super(ReachingDefinitions, self).__init__() + self.ir_a = ir_a + self.compute() + + def get_definitions(self, block_lbl, instruction): + """Returns the dict { lvalue: set((def_block_lbl, def_instr_index)) } + associated with self.ir_a.@block.irs[@instruction] + or {} if it is not yet computed + """ + return self.get((block_lbl, instruction), {}) + + def compute(self): + """This is the main fixpoint""" + modified = True + while modified: + modified = False + for block in self.ir_a.blocks.itervalues(): + modified |= self.process_block(block) + + def process_block(self, block): + """ + Fetch reach definitions from predecessors and propagate it to + the instruction in block @block. + """ + predecessor_state = {} + for pred_lbl in self.ir_a.graph.predecessors(block.label): + pred = self.ir_a.blocks[pred_lbl] + for lval, definitions in self.get_definitions(pred_lbl, len(pred.irs)).iteritems(): + predecessor_state.setdefault(lval, set()).update(definitions) + + modified = self.get((block.label, 0)) != predecessor_state + if not modified: + return False + self[(block.label, 0)] = predecessor_state + + for instr_index in xrange(len(block.irs)): + modified |= self.process_instruction(block, instr_index) + return modified + + def process_instruction(self, block, instr_index): + """ + Updates the reach definitions with values defined at + instruction @instr_index in block @block. + NB: the effect of instruction @instr_index in stored at index + (@block, @instr_index + 1). + """ + + instr = block.irs[instr_index] + defs = self.get_definitions(block.label, instr_index).copy() + for lval in instr: + defs.update({lval: set([(block.label, instr_index)])}) + + modified = self.get((block.label, instr_index + 1)) != defs + if modified: + self[(block.label, instr_index + 1)] = defs + + return modified + +ATTR_DEP = {"color" : "black", + "_type" : "data"} + +InstrNode = namedtuple('InstructionNode', ['label', 'index', 'var']) + +class DiGraphDefUse(DiGraph): + """Representation of a Use-Definition graph as defined by + Kennedy, K. (1979). A survey of data flow analysis techniques. + IBM Thomas J. Watson Research Division. + Example: + IR block: + lbl0: + 0 A = 1 + B = 3 + 1 B = 2 + 2 A = A + B + 4 + + Def use analysis: + (lbl0, 0, A) => {(lbl0, 2, A)} + (lbl0, 0, B) => {} + (lbl0, 1, B) => {(lbl0, 2, A)} + (lbl0, 2, A) => {} + + """ + + + def __init__(self, reaching_defs, + deref_mem=False, *args, **kwargs): + """Instanciate a DiGraphIR + @blocks: IR blocks + """ + self._edge_attr = {} + + # For dot display + self._filter_node = None + self._dot_offset = None + self._blocks = reaching_defs.ir_a.blocks + + super(DiGraphDefUse, self).__init__(*args, **kwargs) + self._compute_def_use(reaching_defs, + deref_mem=deref_mem) + + def edge_attr(self, src, dst): + """ + Return a dictionary of attributes for the edge between @src and @dst + @src: the source node of the edge + @dst: the destination node of the edge + """ + return self._edge_attr[(src, dst)] + + def _compute_def_use(self, reaching_defs, + deref_mem=False): + for block in self._blocks.itervalues(): + self._compute_def_use_block(block, + reaching_defs, + deref_mem=deref_mem) + + def _compute_def_use_block(self, block, reaching_defs, deref_mem=False): + for ind, instr in enumerate(block.irs): + instruction_reaching_defs = reaching_defs.get_definitions(block.label, ind) + for lval, expr in instr.iteritems(): + self.add_node(InstrNode(block.label, ind, lval)) + + read_vars = expr.get_r(mem_read=deref_mem) + if deref_mem and lval.is_mem(): + read_vars.update(lval.arg.get_r(mem_read=deref_mem)) + for read_var in read_vars: + for reach in instruction_reaching_defs.get(read_var, set()): + self.add_data_edge(InstrNode(reach[0], reach[1], read_var), + InstrNode(block.label, ind, lval)) + + def del_edge(self, src, dst): + super(DiGraphDefUse, self).del_edge(src, dst) + del self._edge_attr[(src, dst)] + + def add_uniq_labeled_edge(self, src, dst, edge_label): + """Adds the edge (@src, @dst) with label @edge_label. + if edge (@src, @dst) already exists, the previous label is overriden + """ + self.add_uniq_edge(src, dst) + self._edge_attr[(src, dst)] = edge_label + + def add_data_edge(self, src, dst): + """Adds an edge representing a data dependencie + and sets the label accordingly""" + self.add_uniq_labeled_edge(src, dst, ATTR_DEP) + + def node2lines(self, node): + lbl, ind, reg = node + yield self.DotCellDescription(text="%s (%s)" % (lbl, ind), + attr={'align': 'center', + 'colspan': 2, + 'bgcolor': 'grey'}) + src = self._blocks[lbl].irs[ind][reg] + line = "%s = %s" % (reg, src) + yield self.DotCellDescription(text=line, attr={}) + yield self.DotCellDescription(text="", attr={}) + + +def dead_simp_useful_instrs(defuse, reaching_defs): + """Mark useful statements using previous reach analysis and defuse + + Source : Kennedy, K. (1979). A survey of data flow analysis techniques. + IBM Thomas J. Watson Research Division, Algorithm MK + + Return a set of triplets (block, instruction number, instruction) of + useful instructions + PRE: compute_reach(self) + + """ + ir_a = reaching_defs.ir_a + useful = set() + + for block_lbl, block in ir_a.blocks.iteritems(): + successors = ir_a.graph.successors(block_lbl) + for successor in successors: + if successor not in ir_a.blocks: + keep_all_definitions = True + break + else: + keep_all_definitions = False + + # Block has a nonexistant successor or is a leaf + if keep_all_definitions or (len(successors) == 0): + valid_definitions = reaching_defs.get_definitions(block_lbl, + len(block.irs)) + for lval, definitions in valid_definitions.iteritems(): + if (lval in ir_a.get_out_regs(block) + or keep_all_definitions): + for definition in definitions: + useful.add(InstrNode(definition[0], definition[1], lval)) + + # Force keeping of specific cases + for instr_index, instr in enumerate(block.irs): + for lval, rval in instr.iteritems(): + if (lval.is_mem() + or ir_a.IRDst == lval + or rval.is_function_call()): + useful.add(InstrNode(block_lbl, instr_index, lval)) + + # Useful nodes dependencies + for node in useful: + for parent in defuse.reachable_parents(node): + yield parent + +def dead_simp(ir_a): + """ + This function is used to analyse relation of a * complete function * + This means the blocks under study represent a solid full function graph. + + Source : Kennedy, K. (1979). A survey of data flow analysis techniques. + IBM Thomas J. Watson Research Division, page 43 + """ + reaching_defs = ReachingDefinitions(ir_a) + defuse = DiGraphDefUse(reaching_defs, deref_mem=True) + useful = set(dead_simp_useful_instrs(defuse, reaching_defs)) + for block in ir_a.blocks.itervalues(): + for idx, assignblk in enumerate(block.irs): + for lval in assignblk.keys(): + if InstrNode(block.label, idx, lval) not in useful: + del assignblk[lval] diff --git a/miasm2/ir/analysis.py b/miasm2/ir/analysis.py index 4f7b321f..5c5193c9 100644 --- a/miasm2/ir/analysis.py +++ b/miasm2/ir/analysis.py @@ -6,6 +6,8 @@ from miasm2.ir.symbexec import SymbolicExecutionEngine from miasm2.ir.ir import IntermediateRepresentation, AssignBlock from miasm2.expression.expression \ import ExprAff, ExprCond, ExprId, ExprInt, ExprMem, ExprOp +from miasm2.analysis.data_flow import dead_simp as new_dead_simp_imp +import warnings log = logging.getLogger("analysis") console_handler = logging.StreamHandler() @@ -27,10 +29,6 @@ class ira(IntermediateRepresentation): """ - def ira_regs_ids(self): - """Returns ids of all registers used in the IR""" - return self.arch.regs.all_regs_ids + [self.IRDst] - def call_effects(self, ad, instr): """Default modelisation of a function call to @ad. This may be used to: @@ -58,247 +56,6 @@ class ira(IntermediateRepresentation): irb_cur.lines.append(instr) return None - def remove_dead_instr(self, irb, useful): - """Remove dead affectations using previous reaches analysis - @irb: irbloc instance - @useful: useful statements from previous reach analysis - Return True iff the block state has changed - PRE: compute_reach(self) - """ - modified = False - for idx, assignblk in enumerate(irb.irs): - for dst in assignblk.keys(): - if (isinstance(dst, ExprId) and - (irb.label, idx, dst) not in useful): - del assignblk[dst] - modified = True - return modified - - def init_useful_instr(self): - """Computes a set of triples (block, instruction number, instruction) - containing initially useful instructions : - - Instructions affecting final value of return registers - - Instructions affecting IRDst register - - Instructions writing in memory - - Function call instructions - Return set of intial useful instructions - """ - - useful = set() - - for node in self.graph.nodes(): - if node not in self.blocks: - continue - - block = self.blocks[node] - successors = self.graph.successors(node) - has_son = bool(successors) - for p_son in successors: - if p_son not in self.blocks: - # Leaf has lost its son: don't remove anything - # reaching this block - for r in self.ira_regs_ids(): - useful.update(block.irs[-1]._cur_reach[r].union( - block.irs[-1].defout[r])) - - # Function call, memory write or IRDst affectation - for idx, assignblk in enumerate(block.irs): - for dst, src in assignblk.iteritems(): - if src.is_function_call(): - # /!\ never remove ir calls - useful.add((block.label, idx, dst)) - if isinstance(dst, ExprMem): - useful.add((block.label, idx, dst)) - useful.update(block.irs[idx].defout[self.IRDst]) - - # Affecting return registers - if not has_son: - for r in self.get_out_regs(block): - useful.update(block.irs[-1].defout[r] - if block.irs[-1].defout[r] else - block.irs[-1]._cur_reach[r]) - - return useful - - def _mark_useful_code(self): - """Mark useful statements using previous reach analysis - - Source : Kennedy, K. (1979). A survey of data flow analysis techniques. - IBM Thomas J. Watson Research Division, Algorithm MK - - Return a set of triplets (block, instruction number, instruction) of - useful instructions - PRE: compute_reach(self) - - """ - - useful = self.init_useful_instr() - worklist = useful.copy() - while worklist: - elem = worklist.pop() - useful.add(elem) - irb_label, irs_ind, dst = elem - - assignblk = self.blocks[irb_label].irs[irs_ind] - ins = assignblk.dst2ExprAff(dst) - - # Handle dependencies of used variables in ins - for reg in ins.get_r(True).intersection(self.ira_regs_ids()): - worklist.update( - assignblk._cur_reach[reg].difference(useful).difference( - assignblk._cur_kill[reg] - if not assignblk.defout[reg] else - set())) - for _, _, defout_dst in assignblk.defout[reg]: - # Loop case (dst in defout of current irb) - if defout_dst == dst: - worklist.update( - assignblk._cur_reach[reg].difference(useful)) - return useful - - def remove_dead_code(self): - """Remove dead instructions in each block of the graph using the reach - analysis . - Returns True if a block has been modified - PRE : compute_reach(self) - """ - useful = self._mark_useful_code() - modified = False - for irblock in self.blocks.values(): - modified |= self.remove_dead_instr(irblock, useful) - # Remove useless structures - for assignblk in irblock.irs: - del assignblk._cur_kill - del assignblk._prev_kill - del assignblk._cur_reach - del assignblk._prev_reach - return modified - - def set_dead_regs(self, b): - pass - - def add_unused_regs(self): - pass - - @staticmethod - def print_set(v_set): - """Print each triplet contained in a set - @v_set: set containing triplets elements - """ - for p in v_set: - print ' (%s, %s, %s)' % p - - def dump_bloc_state(self, irb): - print '*' * 80 - for irs in irb.irs: - for assignblk in irs: - print 5 * "-" - print 'instr', assignblk - print 5 * "-" - for v in self.ira_regs_ids(): - if assignblk._cur_reach[v]: - print 'REACH[%d][%s]' % (k, v) - self.print_set(assignblk._cur_reach[v]) - if assignblk._cur_kill[v]: - print 'KILL[%d][%s]' % (k, v) - self.print_set(assignblk._cur_kill[v]) - if assignblk.defout[v]: - print 'DEFOUT[%d][%s]' % (k, v) - self.print_set(assignblk.defout[v]) - - def compute_reach_block(self, irb): - """Variable influence computation for a single block - @irb: irbloc instance - PRE: init_reach() - """ - - reach_block = {key: value.copy() - for key, value in irb.irs[0]._cur_reach.iteritems()} - - # Compute reach from predecessors - for n_pred in self.graph.predecessors(irb.label): - p_block = self.blocks[n_pred] - - # Handle each register definition - for c_reg in self.ira_regs_ids(): - # REACH(n) = U[p in pred] DEFOUT(p) U REACH(p)\KILL(p) - pred_through = p_block.irs[-1].defout[c_reg].union( - p_block.irs[-1]._cur_reach[c_reg].difference( - p_block.irs[-1]._cur_kill[c_reg])) - reach_block[c_reg].update(pred_through) - - # If a predecessor has changed - if reach_block != irb.irs[0]._cur_reach: - irb.irs[0]._cur_reach = reach_block - for c_reg in self.ira_regs_ids(): - if irb.irs[0].defout[c_reg]: - # KILL(n) = DEFOUT(n) ? REACH(n)\DEFOUT(n) : EMPTY - irb.irs[0]._cur_kill[c_reg].update( - reach_block[c_reg].difference(irb.irs[0].defout[c_reg])) - - # Compute reach and kill for block's instructions - for i in xrange(1, len(irb.irs)): - for c_reg in self.ira_regs_ids(): - # REACH(n) = U[p in pred] DEFOUT(p) U REACH(p)\KILL(p) - pred_through = irb.irs[i - 1].defout[c_reg].union( - irb.irs[i - 1]._cur_reach[c_reg].difference( - irb.irs[i - 1]._cur_kill[c_reg])) - irb.irs[i]._cur_reach[c_reg].update(pred_through) - if irb.irs[i].defout[c_reg]: - # KILL(n) = DEFOUT(n) ? REACH(n)\DEFOUT(n) : EMPTY - irb.irs[i]._cur_kill[c_reg].update( - irb.irs[i]._cur_reach[c_reg].difference( - irb.irs[i].defout[c_reg])) - - def _test_kill_reach_fix(self): - """Return True iff a fixed point has been reached during reach - analysis""" - - fixed = True - for node in self.graph.nodes(): - if node in self.blocks: - irb = self.blocks[node] - for assignblk in irb.irs: - if (assignblk._cur_reach != assignblk._prev_reach or - assignblk._cur_kill != assignblk._prev_kill): - fixed = False - # This is not a deepcopy, but cur_reach is assigned to a - # new dictionnary on change in `compute_reach_block` - assignblk._prev_reach = assignblk._cur_reach.copy() - assignblk._prev_kill = assignblk._cur_kill.copy() - return fixed - - def compute_reach(self): - """ - Compute reach, defout and kill sets until a fixed point is reached. - - Source : Kennedy, K. (1979). A survey of data flow analysis techniques. - IBM Thomas J. Watson Research Division, page 43 - """ - fixed_point = False - log.debug('iteration...') - while not fixed_point: - for node in self.graph.nodes(): - if node in self.blocks: - self.compute_reach_block(self.blocks[node]) - fixed_point = self._test_kill_reach_fix() - - def dead_simp(self): - """ - This function is used to analyse relation of a * complete function * - This means the blocks under study represent a solid full function graph. - - Source : Kennedy, K. (1979). A survey of data flow analysis techniques. - IBM Thomas J. Watson Research Division, page 43 - """ - # Update r/w variables for all irblocks - self.get_rw(self.ira_regs_ids()) - # Liveness step - self.compute_reach() - self.remove_dead_code() - # Simplify expressions - self.simplify_blocs() - def gen_equations(self): for irb in self.blocks.values(): symbols_init = dict(self.arch.regs.all_regs_ids_init) @@ -335,3 +92,8 @@ class ira(IntermediateRepresentation): def sizeof_pointer(self): "Return the size of a void* in bits" raise NotImplementedError("Abstract method") + + def dead_simp(self): + """Deprecated: See miasm2.analysis.data_flow.dead_simp()""" + warnings.warn('DEPRECATION WARNING: Please use miasm2.analysis.data_flow.dead_simp(ira) instead of ira.dead_simp()') + new_dead_simp_imp(self) |