"""Dynamic symbolic execution module. Offers a way to have a symbolic execution along a concrete one. Basically, this is done through DSEEngine class, with scheme: dse = DSEEngine(Machine("x86_32")) dse.attach(jitter) The DSE state can be updated through: - .update_state_from_concrete: update the values from the CPU, so the symbolic execution will be completely concrete from this point (until changes) - .update_state: inject information, for instance RAX = symbolic_RAX - .symbolize_memory: symbolize (using .memory_to_expr) memory areas (ie, reading from an address in one of these areas yield a symbol) The DSE run can be instrumented through: - .add_handler: register an handler, modifying the state instead of the current execution. Can be used for stubbing external API - .add_lib_handler: register handlers for libraries - .add_instrumentation: register an handler, modifying the state but continuing the current execution. Can be used for logging facilities On branch, if the decision is symbolic, one can also collect "path constraints" and inverse them to produce new inputs potentially reaching new paths. Basically, this is done through DSEPathConstraint. In order to produce a new solution, one can extend this class, and override 'handle_solution' to produce a solution which fit its needs. It could avoid computing new solution by overriding 'produce_solution'. If one is only interested in constraints associated to its path, the option "produce_solution" should be set to False, to speed up emulation. The constraints are accumulated in the .z3_cur z3.Solver object. Here are a few remainings TODO: - handle endianness in check_state / atomic read: currently, but this is also true for others Miasm2 symbolic engines, the endianness is not take in account, and assumed to be Little Endian - too many memory dependencies in constraint tracking: in order to let z3 find new solution, it does need information on memory values (for instance, a lookup in a table with a symbolic index). The estimated possible involved memory location could be too large to pass to the solver (threshold named MAX_MEMORY_INJECT). One possible solution, not yet implemented, is to call the solver for reducing the possible values thanks to its accumulated constraints. 
""" from builtins import range from collections import namedtuple import warnings try: import z3 except: z3 = None from future.utils import viewitems from miasm.core.utils import encode_hex, force_bytes from miasm.expression.expression import ExprMem, ExprInt, ExprCompose, \ ExprAssign, ExprId, ExprLoc, LocKey, canonize_to_exprloc from miasm.core.bin_stream import bin_stream_vm from miasm.jitter.emulatedsymbexec import EmulatedSymbExec from miasm.expression.expression_helper import possible_values from miasm.ir.translators import Translator from miasm.analysis.expression_range import expr_range from miasm.analysis.modularintervals import ModularIntervals DriftInfo = namedtuple("DriftInfo", ["symbol", "computed", "expected"]) class DriftException(Exception): """Raised when the emulation drift from the reference engine""" def __init__(self, info): super(DriftException, self).__init__() self.info = info def __str__(self): if len(self.info) == 1: return "Drift of %s: %s instead of %s" % ( self.info[0].symbol, self.info[0].computed, self.info[0].expected, ) else: return "Drift of:\n\t" + "\n\t".join("%s: %s instead of %s" % ( dinfo.symbol, dinfo.computed, dinfo.expected) for dinfo in self.info) class ESETrackModif(EmulatedSymbExec): """Extension of EmulatedSymbExec to be used by DSE engines Add the tracking of modified expressions, and the ability to symbolize memory areas """ def __init__(self, *args, **kwargs): super(ESETrackModif, self).__init__(*args, **kwargs) self.modified_expr = set() # Expr modified since the last reset self.dse_memory_range = [] # List/Intervals of memory addresses to # symbolize self.dse_memory_to_expr = None # function(addr) -> Expr used to # symbolize def mem_read(self, expr_mem): if not expr_mem.ptr.is_int(): return super(ESETrackModif, self).mem_read(expr_mem) dst_addr = int(expr_mem.ptr) # Split access in atomic accesses out = [] for addr in range(dst_addr, dst_addr + expr_mem.size // 8): if addr in self.dse_memory_range: # Symbolize memory access out.append(self.dse_memory_to_expr(addr)) continue atomic_access = ExprMem(ExprInt(addr, expr_mem.ptr.size), 8) if atomic_access in self.symbols: out.append( super(EmulatedSymbExec, self).mem_read(atomic_access)) else: # Get concrete value atomic_access = ExprMem(ExprInt(addr, expr_mem.ptr.size), 8) out.append(super(ESETrackModif, self).mem_read(atomic_access)) if len(out) == 1: # Trivial case (optimization) return out[0] # Simplify for constant merging (ex: {ExprInt(1, 8), ExprInt(2, 8)}) return self.expr_simp(ExprCompose(*out)) def mem_write(self, expr, data): # Call Symbolic mem_write (avoid side effects on vm) return super(EmulatedSymbExec, self).mem_write(expr, data) def reset_modified(self): """Reset modified expression tracker""" self.modified_expr.clear() def apply_change(self, dst, src): super(ESETrackModif, self).apply_change(dst, src) self.modified_expr.add(dst) class ESENoVMSideEffects(EmulatedSymbExec): """ Do EmulatedSymbExec without modifying memory """ def mem_write(self, expr, data): return super(EmulatedSymbExec, self).mem_write(expr, data) class DSEEngine(object): """Dynamic Symbolic Execution Engine This class aims to be overridden for each specific purpose """ SYMB_ENGINE = ESETrackModif def __init__(self, machine, loc_db): self.machine = machine self.loc_db = loc_db self.handler = {} # addr -> callback(DSEEngine instance) self.instrumentation = {} # addr -> callback(DSEEngine instance) self.addr_to_cacheblocks = {} # addr -> {label -> IRBlock} self.lifter = self.machine.lifter(loc_db=self.loc_db) # 
        self.ircfg = self.lifter.new_ircfg()  # corresponding IRCFG

        # Defined after attachment
        self.jitter = None         # Jitload (concrete execution)
        self.symb = None           # SymbolicExecutionEngine
        self.symb_concrete = None  # Concrete SymbExec for path disambiguation
        self.mdis = None           # DisasmEngine

    def prepare(self):
        """Prepare the environment for attachment with a jitter"""
        # Disassembler
        self.mdis = self.machine.dis_engine(bin_stream_vm(self.jitter.vm),
                                            lines_wd=1,
                                            loc_db=self.loc_db)

        # Symbexec engine
        ## Prepare symbexec engines
        self.symb = self.SYMB_ENGINE(self.jitter.cpu, self.jitter.vm,
                                     self.lifter, {})
        self.symb.enable_emulated_simplifications()
        self.symb_concrete = ESENoVMSideEffects(
            self.jitter.cpu, self.jitter.vm,
            self.lifter, {}
        )

        ## Update registers' values
        self.symb.symbols[self.lifter.IRDst] = ExprInt(
            getattr(self.jitter.cpu, self.lifter.pc.name),
            self.lifter.IRDst.size
        )

        # Activate the callback on each instruction
        self.jitter.jit.set_options(max_exec_per_call=1, jit_maxline=1)
        self.jitter.exec_cb = self.callback

        # Clean the jit cache to avoid already-jitted multi-line basic blocks
        self.jitter.jit.clear_jitted_blocks()

    def attach(self, emulator):
        """Attach the DSE to @emulator
        @emulator: jitload (or API equivalent) instance

        To attach *DURING A BREAKPOINT*, one may consider using the following
        snippet:

        def breakpoint(self, jitter):
            ...
            dse.attach(jitter)
            dse.update...
            ...
            # An additional call to the exec callback is necessary, as
            # breakpoints are honored AFTER the exec callback
            jitter.exec_cb(jitter)

            return True

        Without it, one may encounter a DriftException error due to a
        "desynchronization" between the jitter and dse states. Indeed, on a
        'handle' call, the jitter must be one instruction AFTER the dse.
        """
        self.jitter = emulator
        self.prepare()

    def handle(self, cur_addr):
        r"""Handle the destination
        @cur_addr: Expr of the next address in the concrete execution

        [!] cur_addr may be a loc_key

        In this method, self.symb is in the "just before branching" state
        """
        pass

    def add_handler(self, addr, callback):
        """Add a @callback for address @addr before any state update.
        The state IS NOT updated after returning from the callback
        @addr: int
        @callback: func(dse instance)"""
        self.handler[addr] = callback

    def add_lib_handler(self, libimp, namespace):
        """Add a search for handlers based on a @libimp libimp instance

        Known functions will be looked up as {name}_symb or {name}_{ord}_symb
        in the @namespace
        """
        namespace = dict(
            (force_bytes(name), func)
            for name, func in viewitems(namespace)
        )

        # a lambda cannot contain statements
        def default_func(dse):
            fname = libimp.fad2cname[dse.jitter.pc]
            if isinstance(fname, tuple):
                fname = b"%s_%d_symb" % (force_bytes(fname[0]), fname[1])
            else:
                fname = b"%s_symb" % force_bytes(fname)
            raise RuntimeError("Symbolic stub '%s' not found" % fname)

        for addr, fname in viewitems(libimp.fad2cname):
            if isinstance(fname, tuple):
                fname = b"%s_%d_symb" % (force_bytes(fname[0]), fname[1])
            else:
                fname = b"%s_symb" % force_bytes(fname)
            func = namespace.get(fname, None)
            if func is not None:
                self.add_handler(addr, func)
            else:
                self.add_handler(addr, default_func)

    def add_instrumentation(self, addr, callback):
        """Add a @callback for address @addr before any state update.
        The state IS updated after returning from the callback
        @addr: int
        @callback: func(dse instance)"""
        self.instrumentation[addr] = callback

    def _check_state(self):
        """Check the current state against the concrete one"""
        errors = []  # List of DriftInfo

        for symbol in self.symb.modified_expr:
            # Do not consider the PC
            if symbol in [self.lifter.pc, self.lifter.IRDst]:
                continue

            # Consider only concrete values
            symb_value = self.eval_expr(symbol)
            if not symb_value.is_int():
                continue
            symb_value = int(symb_value)

            # Check computed values against real ones
            if symbol.is_id():
                if hasattr(self.jitter.cpu, symbol.name):
                    value = getattr(self.jitter.cpu, symbol.name)
                    if value != symb_value:
                        errors.append(DriftInfo(symbol, symb_value, value))
            elif symbol.is_mem() and symbol.ptr.is_int():
                value_chr = self.jitter.vm.get_mem(
                    int(symbol.ptr),
                    symbol.size // 8
                )
                exp_value = int(encode_hex(value_chr[::-1]), 16)
                if exp_value != symb_value:
                    errors.append(DriftInfo(symbol, symb_value, exp_value))

        # Check for drift, and act accordingly
        if errors:
            raise DriftException(errors)

    def callback(self, _):
        """Called before each instruction"""
        # Assert synchronization with the concrete execution
        self._check_state()

        # Call the callbacks associated with the current address
        cur_addr = self.jitter.pc
        if isinstance(cur_addr, LocKey):
            lbl = self.lifter.loc_db.loc_key_to_label(cur_addr)
            cur_addr = lbl.offset

        if cur_addr in self.handler:
            self.handler[cur_addr](self)
            return True

        if cur_addr in self.instrumentation:
            self.instrumentation[cur_addr](self)

        # Handle the current address
        self.handle(ExprInt(cur_addr, self.lifter.IRDst.size))

        # Avoid memory issues in ExpressionSimplifier
        if len(self.symb.expr_simp.cache) > 100000:
            self.symb.expr_simp.cache.clear()

        # Get IR blocks
        if cur_addr in self.addr_to_cacheblocks:
            self.ircfg.blocks.clear()
            self.ircfg.blocks.update(self.addr_to_cacheblocks[cur_addr])
        else:
            ## Reset cache structures
            self.ircfg.blocks.clear()

            ## Update the current state
            asm_block = self.mdis.dis_block(cur_addr)
            self.lifter.add_asmblock_to_ircfg(asm_block, self.ircfg)
            self.addr_to_cacheblocks[cur_addr] = dict(self.ircfg.blocks)

        # Emulate the current instruction
        self.symb.reset_modified()

        # Is the symbolic execution going (potentially) to jump on a lbl_gen?
        if len(self.ircfg.blocks) == 1:
            self.symb.run_at(self.ircfg, cur_addr)
        else:
            # The emulation could get stuck in generated IR blocks, but the
            # concrete execution callback is not precise enough to obtain the
            # full IR block path
            # -> use a fully concrete execution to get the path back

            # Update the concrete execution
            self._update_state_from_concrete_symb(
                self.symb_concrete, cpu=True, mem=True
            )
            while True:
                next_addr_concrete = self.symb_concrete.run_block_at(
                    self.ircfg, cur_addr
                )
                self.symb.run_block_at(self.ircfg, cur_addr)

                if not (isinstance(next_addr_concrete, ExprLoc) and
                        self.lifter.loc_db.get_location_offset(
                            next_addr_concrete.loc_key
                        ) is None):
                    # Not a lbl_gen, exit
                    break

                # Call handle with the lbl_gen state
                self.handle(next_addr_concrete)
                cur_addr = next_addr_concrete

        # At this stage, the symbolic engine is one instruction ahead of the
        # concrete engine
        return True

    def _get_gpregs(self):
        """Return a dict of regs: value from the jitter

        This version uses the regs associated with the attrib
        (!= cpu.get_gpreg())
        """
        out = {}
        regs = self.lifter.arch.regs.attrib_to_regs[self.lifter.attrib]
        for reg in regs:
            if hasattr(self.jitter.cpu, reg.name):
                out[reg.name] = getattr(self.jitter.cpu, reg.name)
        return out

    def take_snapshot(self):
        """Return a snapshot of the current state (including the jitter state)"""
        snapshot = {
            "mem": self.jitter.vm.get_all_memory(),
            "regs": self._get_gpregs(),
            "symb": self.symb.symbols.copy(),
        }
        return snapshot

    def restore_snapshot(self, snapshot, memory=True):
        """Restore a @snapshot taken with .take_snapshot
        @snapshot: .take_snapshot output
        @memory: (optional) if set, also restore the memory
        """
        # Restore memory
        if memory:
            self.jitter.vm.reset_memory_page_pool()
            self.jitter.vm.reset_code_bloc_pool()
            for addr, metadata in viewitems(snapshot["mem"]):
                self.jitter.vm.add_memory_page(
                    addr,
                    metadata["access"],
                    metadata["data"]
                )

        # Restore registers
        self.jitter.pc = snapshot["regs"][self.lifter.pc.name]
        for reg, value in viewitems(snapshot["regs"]):
            setattr(self.jitter.cpu, reg, value)

        # Reset internal elements
        self.jitter.vm.set_exception(0)
        self.jitter.cpu.set_exception(0)
        self.jitter.bs._atomic_mode = False

        # Reset the symbolic execution
        for key, _ in list(viewitems(self.symb.symbols)):
            del self.symb.symbols[key]
        for expr, value in viewitems(snapshot["symb"]):
            self.symb.symbols[expr] = value

    def update_state(self, assignblk):
        """From this point, assume @assignblk in the symbolic execution
        @assignblk: AssignBlock/{dst -> src}
        """
        for dst, src in viewitems(assignblk):
            self.symb.apply_change(dst, src)

    def _update_state_from_concrete_symb(self, symbexec, cpu=True, mem=False):
        if mem:
            # Values will be retrieved from the concrete execution if they are
            # not present
            symbexec.symbols.symbols_mem.base_to_memarray.clear()
        if cpu:
            regs = self.lifter.arch.regs.attrib_to_regs[self.lifter.attrib]
            for reg in regs:
                if hasattr(self.jitter.cpu, reg.name):
                    value = ExprInt(getattr(self.jitter.cpu, reg.name),
                                    size=reg.size)
                    symbexec.symbols[reg] = value

    def update_state_from_concrete(self, cpu=True, mem=False):
        r"""Update the symbolic state with concrete values from the concrete
        engine

        @cpu: (optional) if set, update the registers' values
        @mem: (optional) if set, update the memory values

        [!] All current symbolic states will be lost.
        This function is usually called when the states are no longer
        synchronized (at the beginning, when returning from an unstubbed
        syscall, ...)
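
        For instance (illustrative only, assuming a 'dse' already attached to
        a jitter), right after attaching in a breakpoint callback or after
        returning from an unstubbed API:

            dse.update_state_from_concrete()          # registers only
            dse.update_state_from_concrete(mem=True)  # registers and memory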
""" self._update_state_from_concrete_symb(self.symb, cpu, mem) def eval_expr(self, expr): """Return the evaluation of @expr: @expr: Expr instance""" return self.symb.eval_expr(expr) @staticmethod def memory_to_expr(addr): """Translate an address to its corresponding symbolic ID (8bits) @addr: int""" return ExprId("MEM_0x%x" % int(addr), 8) def symbolize_memory(self, memory_range): """Register a range of memory addresses to symbolize @memory_range: object with support of __in__ operation (intervals, list, ...) """ self.symb.dse_memory_range = memory_range self.symb.dse_memory_to_expr = self.memory_to_expr class DSEPathConstraint(DSEEngine): """Dynamic Symbolic Execution Engine keeping the path constraint Possible new "solutions" are produced along the path, by inversing concrete path constraint. Thus, a "solution" is a potential initial context leading to a new path. In order to produce a new solution, one can extend this class, and override 'handle_solution' to produce a solution which fit its needs. It could avoid computing new solution by overriding 'produce_solution'. If one is only interested in constraints associated to its path, the option "produce_solution" should be set to False, to speed up emulation. The constraints are accumulated in the .z3_cur z3.Solver object. """ # Maximum memory size to inject in constraints solving MAX_MEMORY_INJECT = 0x10000 # Produce solution strategies PRODUCE_NO_SOLUTION = 0 PRODUCE_SOLUTION_CODE_COV = 1 PRODUCE_SOLUTION_BRANCH_COV = 2 PRODUCE_SOLUTION_PATH_COV = 3 def __init__(self, machine, loc_db, produce_solution=PRODUCE_SOLUTION_CODE_COV, known_solutions=None, **kwargs): """Init a DSEPathConstraint @machine: Machine of the targeted architecture instance @produce_solution: (optional) if set, new solutions will be computed""" super(DSEPathConstraint, self).__init__(machine, loc_db, **kwargs) # Dependency check assert z3 is not None # Init PathConstraint specifics structures self.cur_solver = z3.Solver() self.new_solutions = {} # solution identifier -> solution's model self._known_solutions = set() # set of solution identifiers self.z3_trans = Translator.to_language("z3") self._produce_solution_strategy = produce_solution self._previous_addr = None self._history = None if produce_solution == self.PRODUCE_SOLUTION_PATH_COV: self._history = [] # List of addresses in the current path @property def ir_arch(self): warnings.warn('DEPRECATION WARNING: use ".lifter" instead of ".ir_arch"') return self.lifter def take_snapshot(self, *args, **kwargs): snap = super(DSEPathConstraint, self).take_snapshot(*args, **kwargs) snap["new_solutions"] = { dst: src.copy for dst, src in viewitems(self.new_solutions) } snap["cur_constraints"] = self.cur_solver.assertions() if self._produce_solution_strategy == self.PRODUCE_SOLUTION_PATH_COV: snap["_history"] = list(self._history) elif self._produce_solution_strategy == self.PRODUCE_SOLUTION_BRANCH_COV: snap["_previous_addr"] = self._previous_addr return snap def restore_snapshot(self, snapshot, keep_known_solutions=True, **kwargs): """Restore a DSEPathConstraint snapshot @keep_known_solutions: if set, do not forget solutions already found. 
        -> They will not appear in 'new_solutions'
        """
        super(DSEPathConstraint, self).restore_snapshot(snapshot, **kwargs)
        self.new_solutions.clear()
        self.new_solutions.update(snapshot["new_solutions"])
        self.cur_solver = z3.Solver()
        self.cur_solver.add(snapshot["cur_constraints"])
        if not keep_known_solutions:
            self._known_solutions.clear()
        if self._produce_solution_strategy == self.PRODUCE_SOLUTION_PATH_COV:
            self._history = list(snapshot["_history"])
        elif self._produce_solution_strategy == self.PRODUCE_SOLUTION_BRANCH_COV:
            self._previous_addr = snapshot["_previous_addr"]

    def _key_for_solution_strategy(self, destination):
        """Return the identifier associated with the current solution strategy"""
        if self._produce_solution_strategy == self.PRODUCE_NO_SOLUTION:
            # Never produce a solution
            return None
        elif self._produce_solution_strategy == self.PRODUCE_SOLUTION_CODE_COV:
            # Decision based on code coverage
            # -> produce a solution if the destination has never been seen
            key = destination
        elif self._produce_solution_strategy == self.PRODUCE_SOLUTION_BRANCH_COV:
            # Decision based on branch coverage
            # -> produce a solution if the current branch has never been taken
            key = (self._previous_addr, destination)
        elif self._produce_solution_strategy == self.PRODUCE_SOLUTION_PATH_COV:
            # Decision based on path coverage
            # -> produce a solution if the current path has never been taken
            key = tuple(self._history + [destination])
        else:
            raise ValueError("Unknown produce-solution strategy")
        return key

    def produce_solution(self, destination):
        """Called to determine whether a solution for @destination should be
        tested for satisfiability and computed
        @destination: Expr instance of the target destination
        """
        key = self._key_for_solution_strategy(destination)
        if key is None:
            return False
        return key not in self._known_solutions

    def handle_solution(self, model, destination):
        """Called when a new solution for destination @destination is found
        @model: z3 model instance
        @destination: Expr instance for an addr which is not on the DSE path
        """
        key = self._key_for_solution_strategy(destination)
        assert key is not None
        self.new_solutions[key] = model
        self._known_solutions.add(key)

    def handle_correct_destination(self, destination, path_constraints):
        """[DEV] Called by handle() to update internal structures given the
        correct destination (the concrete execution one).
""" # Update structure used by produce_solution() if self._produce_solution_strategy == self.PRODUCE_SOLUTION_PATH_COV: self._history.append(destination) elif self._produce_solution_strategy == self.PRODUCE_SOLUTION_BRANCH_COV: self._previous_addr = destination # Update current solver for cons in path_constraints: self.cur_solver.add(self.z3_trans.from_expr(cons)) def handle(self, cur_addr): cur_addr = canonize_to_exprloc(self.lifter.loc_db, cur_addr) symb_pc = self.eval_expr(self.lifter.IRDst) possibilities = possible_values(symb_pc) cur_path_constraint = set() # path_constraint for the concrete path if len(possibilities) == 1: dst = next(iter(possibilities)).value dst = canonize_to_exprloc(self.lifter.loc_db, dst) assert dst == cur_addr else: for possibility in possibilities: target_addr = canonize_to_exprloc(self.lifter.loc_db, possibility.value) path_constraint = set() # Set of ExprAssign for the possible path # Get constraint associated to the possible path memory_to_add = ModularIntervals(symb_pc.size) for cons in possibility.constraints: eaff = cons.to_constraint() # eaff.get_r(mem_read=True) is not enough # ExprAssign consider a Memory access in dst as a write mem = eaff.dst.get_r(mem_read=True) mem.update(eaff.src.get_r(mem_read=True)) for expr in mem: if expr.is_mem(): addr_range = expr_range(expr.ptr) # At upper bounds, add the size of the memory access # if addr (- [a, b], then @size[addr] reachables # values are in @8[a, b + size[ for start, stop in addr_range: stop += expr.size // 8 - 1 full_range = ModularIntervals( symb_pc.size, [(start, stop)] ) memory_to_add.update(full_range) path_constraint.add(eaff) if memory_to_add.length > self.MAX_MEMORY_INJECT: # TODO re-croncretize the constraint or z3-try raise RuntimeError("Not implemented: too long memory area") # Inject memory for start, stop in memory_to_add: for address in range(start, stop + 1): expr_mem = ExprMem(ExprInt(address, self.lifter.pc.size), 8) value = self.eval_expr(expr_mem) if not value.is_int(): raise TypeError("Rely on a symbolic memory case, " \ "address 0x%x" % address) path_constraint.add(ExprAssign(expr_mem, value)) if target_addr == cur_addr: # Add path constraint cur_path_constraint = path_constraint elif self.produce_solution(target_addr): # Looking for a new solution self.cur_solver.push() for cons in path_constraint: trans = self.z3_trans.from_expr(cons) trans = z3.simplify(trans) self.cur_solver.add(trans) result = self.cur_solver.check() if result == z3.sat: model = self.cur_solver.model() self.handle_solution(model, target_addr) self.cur_solver.pop() self.handle_correct_destination(cur_addr, cur_path_constraint)