diff options
Diffstat (limited to 'miasm2/analysis/data_analysis.py')
| -rw-r--r-- | miasm2/analysis/data_analysis.py | 326 |
1 files changed, 326 insertions, 0 deletions
diff --git a/miasm2/analysis/data_analysis.py b/miasm2/analysis/data_analysis.py new file mode 100644 index 00000000..cb953399 --- /dev/null +++ b/miasm2/analysis/data_analysis.py @@ -0,0 +1,326 @@ +from miasm2.expression.expression import * +from miasm2.ir.symbexec import symbexec + + +def get_node_name(label, i, n): + # n_name = "%s_%d_%s"%(label.name, i, n) + n_name = (label, i, n) + return n_name + + +def intra_bloc_flow_raw(my_ir, flow_graph, irb): + """ + Create data flow for an irbloc using raw IR expressions + """ + in_nodes = {} + out_nodes = {} + current_nodes = {} + for i, exprs in enumerate(irb.irs): + list_rw = get_list_rw(exprs) + current_nodes.update(out_nodes) + + # gen mem arg to mem node links + all_mems = set() + for nodes_r, nodes_w in list_rw: + for n in nodes_r.union(nodes_w): + all_mems.update(get_expr_mem(n)) + if not all_mems: + continue + + # print [str(x) for x in all_mems] + for n in all_mems: + node_n_w = get_node_name(irb.label, i, n) + if not n in nodes_r: + continue + o_r = n.arg.get_r(mem_read=False, cst_read=True) + for n_r in o_r: + if n_r in current_nodes: + node_n_r = current_nodes[n_r] + else: + node_n_r = get_node_name(irb.label, i, n_r) + current_nodes[n_r] = node_n_r + in_nodes[n_r] = node_n_r + flow_graph.add_uniq_edge(node_n_r, node_n_w) + + # gen data flow links + for nodes_r, nodes_w in list_rw: + for n_r in nodes_r: + if n_r in current_nodes: + node_n_r = current_nodes[n_r] + else: + node_n_r = get_node_name(irb.label, i, n_r) + current_nodes[n_r] = node_n_r + in_nodes[n_r] = node_n_r + + flow_graph.add_node(node_n_r) + for n_w in nodes_w: + node_n_w = get_node_name(irb.label, i + 1, n_w) + out_nodes[n_w] = node_n_w + # current_nodes[n_w] = node_n_w + + flow_graph.add_node(node_n_w) + flow_graph.add_uniq_edge(node_n_r, node_n_w) + irb.in_nodes = in_nodes + irb.out_nodes = out_nodes + + +def intra_bloc_flow_symbexec(my_ir, flow_graph, irb): + """ + Create data flow for an irbloc using symbolic execution + """ + in_nodes = {} + out_nodes = {} + current_nodes = {} + + symbols_init = {} + for r in my_ir.arch.regs.all_regs_ids: + # symbols_init[r] = my_ir.arch.regs.all_regs_ids_init[i] + x = ExprId(r.name, r.size) + x.is_term = True + symbols_init[r] = x + + sb = symbexec(my_ir.arch, dict(symbols_init)) + sb.emulbloc(irb) + # print "*"*40 + # print irb + # print sb.dump_id() + # print sb.dump_mem() + + for n_w in sb.symbols: + # print n_w + v = sb.symbols[n_w] + if n_w in symbols_init and symbols_init[n_w] == v: + continue + read_values = v.get_r(cst_read=True) + # print n_w, v, [str(x) for x in read_values] + node_n_w = get_node_name(irb.label, len(irb.lines), n_w) + + for n_r in read_values: + if n_r in current_nodes: + node_n_r = current_nodes[n_r] + else: + node_n_r = get_node_name(irb.label, 0, n_r) + current_nodes[n_r] = node_n_r + in_nodes[n_r] = node_n_r + + out_nodes[n_w] = node_n_w + flow_graph.add_uniq_edge(node_n_r, node_n_w) + + irb.in_nodes = in_nodes + irb.out_nodes = out_nodes + + +def inter_bloc_flow_link(my_ir, flow_graph, todo, link_exec_to_data): + lbl, current_nodes, exec_nodes = todo + # print 'TODO' + # print lbl + # print [(str(x[0]), str(x[1])) for x in current_nodes] + current_nodes = dict(current_nodes) + + # link current nodes to bloc in_nodes + if not lbl in my_ir.blocs: + print "cannot find bloc!!", lbl + return set() + irb = my_ir.blocs[lbl] + # pp(('IN', lbl, [(str(x[0]), str(x[1])) for x in current_nodes.items()])) + to_del = set() + for n_r, node_n_r in irb.in_nodes.items(): + if not n_r in current_nodes: + continue + # print 'add link', current_nodes[n_r], node_n_r + flow_graph.add_uniq_edge(current_nodes[n_r], node_n_r) + to_del.add(n_r) + + # if link exec to data, all nodes depends on exec nodes + if link_exec_to_data: + for n_x_r in exec_nodes: + for n_r, node_n_r in irb.in_nodes.items(): + if not n_x_r in current_nodes: + continue + if isinstance(n_r, ExprInt): + continue + flow_graph.add_uniq_edge(current_nodes[n_x_r], node_n_r) + + # update current nodes using bloc out_nodes + for n_w, node_n_w in irb.out_nodes.items(): + current_nodes[n_w] = node_n_w + + # get nodes involved in exec flow + x_nodes = tuple(sorted(list(irb.dst.get_r()))) + + todo = set() + for lbl_dst in my_ir.g.successors(irb.label): + todo.add((lbl_dst, tuple(current_nodes.items()), x_nodes)) + + # pp(('OUT', lbl, [(str(x[0]), str(x[1])) for x in current_nodes.items()])) + + return todo + + +def create_implicit_flow(my_ir, flow_graph): + + # first fix IN/OUT + # If a son read a node which in not in OUT, add it + todo = set(my_ir.blocs.keys()) + while todo: + lbl = todo.pop() + irb = my_ir.blocs[lbl] + for lbl_son in my_ir.g.successors(irb.label): + if not lbl_son in my_ir.blocs: + print "cannot find bloc!!", lbl + continue + irb_son = my_ir.blocs[lbl_son] + for n_r in irb_son.in_nodes: + if n_r in irb.out_nodes: + continue + if not isinstance(n_r, ExprId): + continue + + # print "###", n_r + # print "###", irb + # print "###", 'OUT', [str(x) for x in irb.out_nodes] + # print "###", irb_son + # print "###", 'IN', [str(x) for x in irb_son.in_nodes] + + node_n_w = irb.label, len(irb.lines), n_r + irb.out_nodes[n_r] = node_n_w + if not n_r in irb.in_nodes: + irb.in_nodes[n_r] = irb.label, 0, n_r + node_n_r = irb.in_nodes[n_r] + # print "###", node_n_r + for lbl_p in my_ir.g.predecessors(irb.label): + todo.add(lbl_p) + + flow_graph.add_uniq_edge(node_n_r, node_n_w) + + +def inter_bloc_flow(my_ir, flow_graph, irb_0, link_exec_to_data=True): + + todo = set() + done = set() + todo.add((irb_0, (), ())) + + while todo: + state = todo.pop() + if state in done: + continue + done.add(state) + out = inter_bloc_flow_link(my_ir, flow_graph, state, link_exec_to_data) + todo.update(out) + + +class symb_exec_func: + + """ + This algorithm will do symbolic execution on a function, trying to propagate + states between basic blocs in order to extract inter-blocs dataflow. The + algorithm tries to merge states from blocs with multiple parents. + + There is no real magic here, loops and complex merging will certainly fail. + """ + + def __init__(self, my_ir): + self.todo = set() + self.stateby_ad = {} + self.cpt = {} + self.states_var_done = set() + self.states_done = set() + self.total_done = 0 + self.my_ir = my_ir + + def add_state(self, parent, ad, state): + variables = dict(state.symbols.items()) + + # get bloc dead, and remove from state + b = self.my_ir.get_bloc(ad) + if b is None: + raise ValueError("unknown bloc! %s" % ad) + """ + dead = b.dead[0] + for d in dead: + if d in variables: + del(variables[d]) + """ + variables = variables.items() + + s = parent, ad, tuple(sorted(variables)) + """ + state_var = s[1] + if s in self.states_var_done: + print 'skip state' + return + if not ad in self.stateby_ad: + self.stateby_ad[ad] = set() + self.stateby_ad[ad].add(state_var) + + """ + self.todo.add(s) + + """ + if not ad in self.cpt: + self.cpt[ad] = 0 + """ + """ + def get_next_min(self): + state_by_ad = {} + for state in self.todo: + ad = state[1] + if not ad in state_by_ad: + state_by_ad[ad] = [] + state_by_ad[ad].append(state) + print "XX", [len(x) for x in state_by_ad.values()] + state_by_ad = state_by_ad.items() + state_by_ad.sort(key=lambda x:len(x[1])) + state_by_ad.reverse() + return state_by_ad.pop()[1][0] + """ + + def get_next_state(self): + state = self.todo.pop() + return state + + def do_step(self): + if len(self.todo) == 0: + return None + if self.total_done > 600: + print "symbexec watchdog!" + return None + self.total_done += 1 + print 'CPT', self.total_done + while self.todo: + # if self.total_done>20: + # self.get_next_min() + # state = self.todo.pop() + state = self.get_next_state() + parent, ad, s = state + self.states_done.add(state) + self.states_var_done.add(state) + # if s in self.states_var_done: + # print "state done" + # continue + + sb = symbexec(self.my_ir.arch, dict(s)) + """ + if (not is_dispatcher(ad)) and len(self.stateby_ad[ad]) > 10: + print "DROP", ad + continue + + if (not is_dispatcher(ad)) and len(self.stateby_ad[ad]) > 5: + print ad + big_keys = diff_states(*self.stateby_ad[ad]) + print big_keys + print "MERGE", ad + + if not big_keys: + return parent, sb + #assert(len(big_keys) == 1) + s_out = [] + for k, v in s: + if k not in big_keys : + s_out.append((k, v)) + sb = symbexec(mn, dict(s_out)) + return parent, ad, sb + #diff_states(*self.stateby_ad[ad]) + """ + return parent, ad, sb + return None |