diff options
| -rw-r--r-- | example/samples/dse_crackme.c | 104 | ||||
| -rw-r--r-- | example/symbol_exec/dse_crackme.py | 310 | ||||
| -rw-r--r-- | miasm2/analysis/dse.py | 545 | ||||
| -rw-r--r-- | miasm2/arch/aarch64/regs.py | 5 | ||||
| -rw-r--r-- | miasm2/arch/arm/regs.py | 5 | ||||
| -rw-r--r-- | miasm2/arch/mips32/regs.py | 5 | ||||
| -rw-r--r-- | miasm2/arch/msp430/regs.py | 5 | ||||
| -rw-r--r-- | miasm2/arch/x86/regs.py | 6 | ||||
| -rw-r--r-- | test/analysis/dse.py | 94 | ||||
| -rwxr-xr-x | test/test_all.py | 17 | ||||
| -rw-r--r-- | test/utils/test.py | 9 | ||||
| -rw-r--r-- | test/utils/testset.py | 3 |
12 files changed, 1105 insertions, 3 deletions
diff --git a/example/samples/dse_crackme.c b/example/samples/dse_crackme.c new file mode 100644 index 00000000..5fc0faaf --- /dev/null +++ b/example/samples/dse_crackme.c @@ -0,0 +1,104 @@ +#include <stdio.h> +#include <stdint.h> + +/* + * This example is a tiny crackme, with a few specificities: + * - the flag must be read from a file ('test.txt') + * - a few steps are not naively bruteforcable (using a fuzzer for instance): + * comparison with a 32bit value, then CRC16 + * - a DSE engine has to support the search into a fixed memory table + * (crc16_tab) based on variables (file bytes) +*/ + + +static const uint16_t crc16_tab[] = { + 0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50a5, 0x60c6, 0x70e7, + 0x8108, 0x9129, 0xa14a, 0xb16b, 0xc18c, 0xd1ad, 0xe1ce, 0xf1ef, + 0x1231, 0x0210, 0x3273, 0x2252, 0x52b5, 0x4294, 0x72f7, 0x62d6, + 0x9339, 0x8318, 0xb37b, 0xa35a, 0xd3bd, 0xc39c, 0xf3ff, 0xe3de, + 0x2462, 0x3443, 0x0420, 0x1401, 0x64e6, 0x74c7, 0x44a4, 0x5485, + 0xa56a, 0xb54b, 0x8528, 0x9509, 0xe5ee, 0xf5cf, 0xc5ac, 0xd58d, + 0x3653, 0x2672, 0x1611, 0x0630, 0x76d7, 0x66f6, 0x5695, 0x46b4, + 0xb75b, 0xa77a, 0x9719, 0x8738, 0xf7df, 0xe7fe, 0xd79d, 0xc7bc, + 0x48c4, 0x58e5, 0x6886, 0x78a7, 0x0840, 0x1861, 0x2802, 0x3823, + 0xc9cc, 0xd9ed, 0xe98e, 0xf9af, 0x8948, 0x9969, 0xa90a, 0xb92b, + 0x5af5, 0x4ad4, 0x7ab7, 0x6a96, 0x1a71, 0x0a50, 0x3a33, 0x2a12, + 0xdbfd, 0xcbdc, 0xfbbf, 0xeb9e, 0x9b79, 0x8b58, 0xbb3b, 0xab1a, + 0x6ca6, 0x7c87, 0x4ce4, 0x5cc5, 0x2c22, 0x3c03, 0x0c60, 0x1c41, + 0xedae, 0xfd8f, 0xcdec, 0xddcd, 0xad2a, 0xbd0b, 0x8d68, 0x9d49, + 0x7e97, 0x6eb6, 0x5ed5, 0x4ef4, 0x3e13, 0x2e32, 0x1e51, 0x0e70, + 0xff9f, 0xefbe, 0xdfdd, 0xcffc, 0xbf1b, 0xaf3a, 0x9f59, 0x8f78, + 0x9188, 0x81a9, 0xb1ca, 0xa1eb, 0xd10c, 0xc12d, 0xf14e, 0xe16f, + 0x1080, 0x00a1, 0x30c2, 0x20e3, 0x5004, 0x4025, 0x7046, 0x6067, + 0x83b9, 0x9398, 0xa3fb, 0xb3da, 0xc33d, 0xd31c, 0xe37f, 0xf35e, + 0x02b1, 0x1290, 0x22f3, 0x32d2, 0x4235, 0x5214, 0x6277, 0x7256, + 0xb5ea, 0xa5cb, 0x95a8, 0x8589, 0xf56e, 0xe54f, 0xd52c, 0xc50d, + 0x34e2, 0x24c3, 0x14a0, 0x0481, 0x7466, 0x6447, 0x5424, 0x4405, + 0xa7db, 0xb7fa, 0x8799, 0x97b8, 0xe75f, 0xf77e, 0xc71d, 0xd73c, + 0x26d3, 0x36f2, 0x0691, 0x16b0, 0x6657, 0x7676, 0x4615, 0x5634, + 0xd94c, 0xc96d, 0xf90e, 0xe92f, 0x99c8, 0x89e9, 0xb98a, 0xa9ab, + 0x5844, 0x4865, 0x7806, 0x6827, 0x18c0, 0x08e1, 0x3882, 0x28a3, + 0xcb7d, 0xdb5c, 0xeb3f, 0xfb1e, 0x8bf9, 0x9bd8, 0xabbb, 0xbb9a, + 0x4a75, 0x5a54, 0x6a37, 0x7a16, 0x0af1, 0x1ad0, 0x2ab3, 0x3a92, + 0xfd2e, 0xed0f, 0xdd6c, 0xcd4d, 0xbdaa, 0xad8b, 0x9de8, 0x8dc9, + 0x7c26, 0x6c07, 0x5c64, 0x4c45, 0x3ca2, 0x2c83, 0x1ce0, 0x0cc1, + 0xef1f, 0xff3e, 0xcf5d, 0xdf7c, 0xaf9b, 0xbfba, 0x8fd9, 0x9ff8, + 0x6e17, 0x7e36, 0x4e55, 0x5e74, 0x2e93, 0x3eb2, 0x0ed1, 0x1ef0, +}; + +uint16_t crc16(uint16_t seed, unsigned char *buf, size_t len) +{ + size_t i; + uint16_t tmp; + + tmp = seed ^ 0xFFFFFFFF; + for (i = 0; i < len; i++) { + tmp = crc16_tab[((tmp>>8) ^ buf[i]) & 0xFF] ^ (tmp << 8); + } + return tmp; +} + +uint16_t test() { + FILE *file; + unsigned char buf[0x100] = {0}; + size_t read; + uint32_t temp; + + file = fopen("test.txt", "r"); + if (file) { + read = fread(buf, sizeof(char), 0x100, file); + fclose(file); + if (read < 14) { + return 0xFFFF; + } + if (buf[0] != 'M') { + return 0xFFFF; + } + if (!((unsigned char) (2 * buf[7]) == 14) || (buf[7] == 7)) { + return 0xFFFF; + } + temp = buf[1]; + temp <<= 8; + temp |= buf[2]; + temp <<= 8; + temp |= buf[3]; + temp <<= 8; + temp |= buf[4]; + if (temp != 0x2140244d) { // "!@$M" + return 0xFFFF; + } + return crc16(0, buf, read); + } else { + return 0xFFFF; + } +} + +int main(int argc, char** argv) { + uint16_t result = test(); + if (result == 0x1337) { + printf("OK\n"); + } else { + printf("BAD\n"); + } + return 0; +} diff --git a/example/symbol_exec/dse_crackme.py b/example/symbol_exec/dse_crackme.py new file mode 100644 index 00000000..e08536f9 --- /dev/null +++ b/example/symbol_exec/dse_crackme.py @@ -0,0 +1,310 @@ +"""Example of DynamicSymbolicExecution engine use + +This example should run on the compiled ELF x86 64bits version of +"dse_crackme.c" + +""" + +#### This part is only related to the run of the sample, without DSE #### +import os +import subprocess +from collections import namedtuple +from pdb import pm + +from miasm2.jitter.csts import PAGE_READ, PAGE_WRITE +from miasm2.analysis.sandbox import Sandbox_Linux_x86_64 +from miasm2.expression.expression import * + +# File "management" +my_FILE_ptr = 0x11223344 +FInfo = namedtuple("FInfo", ["path", "fdesc"]) +FILE_to_info = {} +def xxx_fopen(jitter): + ''' + #include <stdio.h> + + FILE *fopen(const char *path, const char *mode); + ''' + global my_FILE_ptr + ret_addr, args = jitter.func_args_systemv(['path', 'mode']) + fname = jitter.get_str_ansi(args.path) + FILE_to_info[my_FILE_ptr] = FInfo(fname, open(fname)) + my_FILE_ptr += 1 + return jitter.func_ret_stdcall(ret_addr, my_FILE_ptr - 1) + +def xxx_fread(jitter): + ''' + #include <stdio.h> + + size_t fread(void *ptr, size_t size, size_t nmemb, FILE *stream); + ''' + ret_addr, args = jitter.func_args_systemv(['ptr', 'size', 'nmemb', 'stream']) + info = FILE_to_info[args.stream] + data = info.fdesc.read(args.size * args.nmemb) + jitter.vm.set_mem(args.ptr, data) + return jitter.func_ret_stdcall(ret_addr, len(data)) + +def xxx_fclose(jitter): + ''' + #include <stdio.h> + + int fclose(FILE *stream); + ''' + ret_addr, args = jitter.func_args_systemv(['stream']) + del FILE_to_info[args.stream] + return jitter.func_ret_stdcall(ret_addr, 0) + +# Create sandbox +parser = Sandbox_Linux_x86_64.parser(description="ELF sandboxer") +parser.add_argument("filename", help="ELF Filename") +options = parser.parse_args() +options.mimic_env = True +sb = Sandbox_Linux_x86_64(options.filename, options, globals()) + +# Init segment +sb.jitter.ir_arch.do_stk_segm = True +sb.jitter.ir_arch.do_ds_segm = True +sb.jitter.ir_arch.do_str_segm = True +sb.jitter.ir_arch.do_all_segm = True +FS_0_ADDR = 0x7ff70000 +sb.jitter.cpu.FS = 0x4 +sb.jitter.cpu.set_segm_base(sb.jitter.cpu.FS, FS_0_ADDR) +sb.jitter.vm.add_memory_page( + FS_0_ADDR + 0x28, PAGE_READ, "\x42\x42\x42\x42\x42\x42\x42\x42", + "Stack canary FS[0x28]") + +# Prepare the execution +sb.jitter.init_run(sb.entry_point) + + +#### This part handle the DSE #### +from miasm2.analysis.dse import DSEPathConstraint +from miasm2.analysis.machine import Machine + + +# File "management" + +class SymbolicFile(object): + """Symbolic file with read operation, returning symbolic bytes""" + + def __init__(self, fname): + self.fname = fname + self.position = 0 + self.max_size = os.stat(fname).st_size + self.gen_bytes = {} + self.state = "OPEN" + + def read(self, length): + assert self.state == "OPEN" + out = [] + for i in xrange(self.position, min(self.position + length, + self.max_size)): + if i not in self.gen_bytes: + ret = ExprId("SF_%08x_%d" % (id(self), i), 8) + self.gen_bytes[i] = ret + out.append(self.gen_bytes[i]) + self.position += 1 + + return out + + def close(self): + self.state = "CLOSE" + + +FILE_to_info_symb = {} +FILE_stream = ExprId("FILE_0", 64) +FILE_size = ExprId("FILE_0_size", 64) + +def xxx_fopen_symb(dse): + regs = dse.ir_arch.arch.regs + fname_addr = dse.eval_expr(regs.RDI) + mode = dse.eval_expr(regs.RSI) + assert fname_addr.is_int() + assert mode.is_int() + fname = dse.jitter.get_str_ansi(int(fname_addr)) + ret_addr = ExprInt(dse.jitter.get_stack_arg(0), regs.RIP.size) + + assert len(FILE_to_info_symb) == 0 + ret_value = FILE_stream + FILE_to_info_symb[ret_value] = SymbolicFile(fname) + + dse.update_state({ + regs.RSP: dse.eval_expr(regs.RSP + ExprInt(8, regs.RSP.size)), + dse.ir_arch.IRDst: ret_addr, + regs.RIP: ret_addr, + regs.RAX: ret_value, + }) + +def xxx_fread_symb(dse): + regs = dse.ir_arch.arch.regs + ptr = dse.eval_expr(regs.RDI) + size = dse.eval_expr(regs.RSI) + nmemb = dse.eval_expr(regs.RDX) + stream = dse.eval_expr(regs.RCX) + + assert size.is_int() + assert nmemb.is_int() + + # Fill the buffer with symbolic bytes + update = {} + sf = FILE_to_info_symb[stream] + data = sf.read(int(size) * int(nmemb)) + for i, content in enumerate(data): + addr = dse.symb.expr_simp(ptr + ExprInt(i, ptr.size)) + update[ExprMem(addr, 8)] = content + + ret_addr = ExprInt(dse.jitter.get_stack_arg(0), regs.RIP.size) + ret_value = FILE_size + + update.update({ + regs.RSP: dse.symb.eval_expr(regs.RSP + ExprInt(8, regs.RSP.size)), + dse.ir_arch.IRDst: ret_addr, + regs.RIP: ret_addr, + regs.RAX: ret_value, + }) + dse.update_state(update) + +def xxx_fclose_symb(dse): + regs = dse.ir_arch.arch.regs + stream = dse.eval_expr(regs.RDI) + FILE_to_info_symb[stream].close() + + ret_addr = ExprInt(dse.jitter.get_stack_arg(0), regs.RIP.size) + dse.update_state({ + regs.RSP: dse.symb.eval_expr(regs.RSP + ExprInt(8, regs.RSP.size)), + dse.ir_arch.IRDst: ret_addr, + regs.RIP: ret_addr, + regs.RAX: ExprInt(0, regs.RAX.size), + }) + +# Symbolic naive version of _libc_start_main + +def xxx___libc_start_main_symb(dse): + # ['RDI', 'RSI', 'RDX', 'RCX', 'R8', 'R9'] + # main, argc, argv, ... + regs = dse.ir_arch.arch.regs + top_stack = dse.eval_expr(regs.RSP) + main_addr = dse.eval_expr(regs.RDI) + argc = dse.eval_expr(regs.RSI) + argv = dse.eval_expr(regs.RDX) + hlt_addr = ExprInt(0x1337beef, 64) + + dse.update_state({ + ExprMem(top_stack, 64): hlt_addr, + regs.RDI: argc, + regs.RSI: argv, + dse.ir_arch.IRDst: main_addr, + dse.ir_arch.pc: main_addr, + }) + +# Stop the execution on puts and get back the corresponding string +class FinnishOn(Exception): + + def __init__(self, string): + self.string = string + super(FinnishOn, self).__init__() + +def xxx_puts_symb(dse): + string = dse.jitter.get_str_ansi(dse.jitter.cpu.RDI) + raise FinnishOn(string) + + +done = set([]) # Set of jump address already handled +todo = set([""]) # Set of file content to test + +class DSEGenFile(DSEPathConstraint): + """DSE with a specific solution creation: + The solution is the content of the FILE to be read + + The politics of exploration is the branch coverage: create a solution only + if the target address has never been seen + """ + + def handle_solution(self, model, destination): + global todo, done + assert destination.is_int() + + if destination in done: + # Skip this path, already treated + return + + finfo = FILE_to_info_symb[FILE_stream] + + # Build corresponding file + out = "" + fsize = max(model.eval(self.z3_trans.from_expr(FILE_size)).as_long(), + len(finfo.gen_bytes)) + for index in xrange(fsize): + try: + byteid = finfo.gen_bytes[index] + out += chr(model.eval(self.z3_trans.from_expr(byteid)).as_long()) + except (KeyError, AttributeError) as _: + # Default value if there is no constraint on current byte + out += "\x00" + + todo.add(out) + done.add(destination) + +# Instanciate the DSE engine +machine = Machine("x86_64") +dse = DSEGenFile(machine) + +# Attach to the jitter +dse.attach(sb.jitter) + +# Update the jitter state: df is read, but never set +# Approachs: specific or generic +# - Specific: +# df_value = ExprInt(sb.jitter.cpu.df, dse.ir_arch.arch.regs.df.size) +# dse.update_state({ +# dse.ir_arch.arch.regs.df: df_value +# }) +# - Generic: +dse.update_state_from_concrete() + +# Add constraint on file size, we don't want to generate too big FILE +z3_file_size = dse.z3_trans.from_expr(FILE_size) +dse.cur_solver.add(0 < z3_file_size) +dse.cur_solver.add(z3_file_size < 0x10) + +# Register symbolic stubs for extern functions (xxx_puts_symb, ...) +dse.add_lib_handler(sb.libs, globals()) + +# Automatic exploration of solution + +## Save the current clean state, before any computation of the FILE content +snapshot = dse.take_snapshot() +found = False + +while todo: + # Prepare a solution to try, based on the clean state + file_content = todo.pop() + print "CUR: %r" % file_content + open("test.txt", "w").write(file_content) + dse.restore_snapshot(snapshot) + FILE_to_info.clear() + FILE_to_info_symb.clear() + + # Play the current file + try: + sb.run() + except FinnishOn as finnish_info: + print finnish_info.string + if finnish_info.string == "OK": + # Stop if the expected result is found + found = True + break + +# Assert that the result has been found +assert found == True +print "FOUND !" + +# Replay for real +print "Trying to launch the binary without Miasm" +crackme = subprocess.Popen([options.filename], stdout=subprocess.PIPE, + stderr=subprocess.PIPE) +stdout, stderr = crackme.communicate() +assert not stderr +stdout = stdout.strip() +print stdout +assert stdout == "OK" diff --git a/miasm2/analysis/dse.py b/miasm2/analysis/dse.py new file mode 100644 index 00000000..290015ea --- /dev/null +++ b/miasm2/analysis/dse.py @@ -0,0 +1,545 @@ +"""Dynamic symbolic execution module. + +Offers a way to have a symbolic execution along a concrete one. +Basically, this is done through DSEEngine class, with scheme: + +dse = DSEEngine(Machine("x86_32")) +dse.attach(jitter) + +The DSE state can be updated through: + + - .update_state_from_concrete: update the values from the CPU, so the symbolic + execution will be completely concrete from this point (until changes) + - .update_state: inject information, for instance RAX = symbolic_RAX + - .symbolize_memory: symbolize (using .memory_to_expr) memory areas (ie, + reading from an address in one of these areas yield a symbol) + +The DSE run can be instrumented through: + - .add_handler: register an handler, modifying the state instead of the current + execution. Can be used for stubbing external API + - .add_lib_handler: register handlers for libraries + - .add_instrumentation: register an handler, modifying the state but continuing + the current execution. Can be used for logging facilities + + +On branch, if the decision is symbolic, one can also collect "path constraints" +and inverse them to produce new inputs potentially reaching new paths. + +Basically, this is done through DSEPathConstraint. In order to produce a new +solution, one can extend this class, and override 'handle_solution' to +produce a solution which fit its needs. + +If one is only interested in constraints associated to its path, the option +"produce_solution" should be set to False, to speed up emulation. +The constraints are accumulated in the .z3_cur z3.Solver object. + +Here are a few remainings TODO: + - handle endianess in check_state / atomic read: currently, but this is also + true for others Miasm2 symbolic engines, the endianness is not take in + account, and assumed to be Little Endian + + - too many memory dependencies in constraint tracking: in order to let z3 find + new solution, it does need information on memory values (for instance, a + lookup in a table with a symbolic index). The estimated possible involved + memory location could be too large to pass to the solver (threshold named + MAX_MEMORY_INJECT). One possible solution, not yet implemented, is to call + the solver for reducing the possible values thanks to its accumulated + constraints. +""" + +from collections import namedtuple + +import z3 + +from miasm2.expression.expression import ExprMem, ExprInt, ExprCompose, \ + ExprAff, ExprId +from miasm2.core.bin_stream import bin_stream_vm +from miasm2.core.asmblock import expr_is_label +from miasm2.jitter.emulatedsymbexec import EmulatedSymbExec +from miasm2.expression.expression_helper import possible_values +from miasm2.ir.translators import Translator +from miasm2.analysis.expression_range import expr_range +from miasm2.analysis.modularintervals import ModularIntervals + + +DriftInfo = namedtuple("DriftInfo", ["symbol", "computed", "expected"]) + +class DriftException(Exception): + """Raised when the emulation drift from the reference engine""" + + def __init__(self, info): + super(Exception, self).__init__() + self.info = info + + def __str__(self): + if len(self.info) == 1: + return "Drift of %s: %s instead of %s" % ( + self.info[0].symbol, + self.info[0].computed, + self.info[0].expected, + ) + else: + return "Drift of:\n\t" + "\n\t".join("%s: %s instead of %s" % ( + dinfo.symbol, + dinfo.computed, + dinfo.expected) + for dinfo in self.info) + + +class ESETrackModif(EmulatedSymbExec): + """Extension of EmulatedSymbExec to be used by DSE engines + + Add the tracking of modified expressions, and the ability to symbolize + memory areas + """ + + def __init__(self, *args, **kwargs): + super(ESETrackModif, self).__init__(*args, **kwargs) + self.modified_expr = set() # Expr modified since the last reset + self.dse_memory_range = [] # List/Intervals of memory addresses to + # symbolize + self.dse_memory_to_expr = None # function(addr) -> Expr used to + # symbolize + + def _func_read(self, expr_mem): + assert expr_mem.arg.is_int() + dst_addr = int(expr_mem.arg) + + if not self.dse_memory_range: + # Trivial case (optimization) + return super(ESETrackModif, self)._func_read(expr_mem) + + # Split access in atomic accesses + out = [] + for addr in xrange(dst_addr, dst_addr + (expr_mem.size / 8)): + if addr in self.dse_memory_range: + # Symbolize memory access + out.append(self.dse_memory_to_expr(addr)) + else: + # Get concrete value + atomic_access = ExprMem(ExprInt(addr, expr_mem.arg.size), 8) + out.append(super(ESETrackModif, self)._func_read(atomic_access)) + + if len(out) == 1: + # Trivial case (optimization) + return out[0] + + # Simplify for constant merging (ex: {ExprInt(1, 8), ExprInt(2, 8)}) + return self.expr_simp(ExprCompose(*out)) + + def reset_modified(self): + """Reset modified expression tracker""" + self.modified_expr.clear() + + def apply_change(self, dst, src): + super(ESETrackModif, self).apply_change(dst, src) + self.modified_expr.add(dst) + + +class DSEEngine(object): + """Dynamic Symbolic Execution Engine + + This class aims to be overrided for each specific purpose + """ + SYMB_ENGINE = ESETrackModif + + def __init__(self, machine): + self.machine = machine + self.handler = {} # addr -> callback(DSEEngine instance) + self.instrumentation = {} # addr -> callback(DSEEngine instance) + self.addr_to_cacheblocks = {} # addr -> {label -> IRBlock} + self.ir_arch = self.machine.ir() # corresponding IR + + # Defined after attachment + self.jitter = None # Jitload (concrete execution) + self.symb = None # SymbolicExecutionEngine + self.symb_concrete = None # Concrete SymbExec for path desambiguisation + self.mdis = None # DisasmEngine + + def prepare(self): + """Prepare the environment for attachment with a jitter""" + # Disassembler + self.mdis = self.machine.dis_engine(bin_stream_vm(self.jitter.vm), + lines_wd=1) + + # Symbexec engine + ## Prepare symbexec engines + self.symb = self.SYMB_ENGINE(self.jitter.cpu, self.jitter.vm, + self.ir_arch, {}) + self.symb.enable_emulated_simplifications() + self.symb_concrete = EmulatedSymbExec(self.jitter.cpu, self.jitter.vm, + self.ir_arch, {}) + + ## Update registers value + self.symb.symbols[self.ir_arch.IRDst] = ExprInt(getattr(self.jitter.cpu, + self.ir_arch.pc.name), + self.ir_arch.IRDst.size) + + # Avoid memory write + self.symb.func_write = None + + # Activate callback on each instr + self.jitter.jit.set_options(max_exec_per_call=1, jit_maxline=1) + self.jitter.exec_cb = self.callback + + def attach(self, emulator): + """Attach the DSE to @emulator + @emulator: jitload (or API equivalent) instance""" + self.jitter = emulator + self.prepare() + + def handle(self, cur_addr): + """Handle destination + @cur_addr: Expr of the next address in concrete execution + /!\ cur_addr may be a lbl_gen + + In this method, self.symb is in the "just before branching" state + """ + pass + + def add_handler(self, addr, callback): + """Add a @callback for address @addr before any state update. + The state IS NOT updated after returning from the callback + @addr: int + @callback: func(dse instance)""" + self.handler[addr] = callback + + def add_lib_handler(self, libimp, namespace): + """Add search for handler based on a @libimp libimp instance + + Known functions will be looked by {name}_symb in the @namespace + """ + + # lambda cannot contain statement + def default_func(dse): + fname = "%s_symb" % libimp.fad2cname[dse.jitter.pc] + raise RuntimeError("Symbolic stub '%s' not found" % fname) + + for addr, fname in libimp.fad2cname.iteritems(): + fname = "%s_symb" % fname + func = namespace.get(fname, None) + if func is not None: + self.add_handler(addr, func) + else: + self.add_handler(addr, default_func) + + def add_instrumentation(self, addr, callback): + """Add a @callback for address @addr before any state update. + The state IS updated after returning from the callback + @addr: int + @callback: func(dse instance)""" + self.instrumentation[addr] = callback + + def _check_state(self): + """Check the current state against the concrete one""" + errors = [] # List of DriftInfo + + for symbol in self.symb.modified_expr: + # Do not consider PC + if symbol in [self.ir_arch.pc, self.ir_arch.IRDst]: + continue + + # Consider only concrete values + symb_value = self.eval_expr(symbol) + if not symb_value.is_int(): + continue + symb_value = int(symb_value) + + # Check computed values against real ones + if symbol.is_id(): + if hasattr(self.jitter.cpu, symbol.name): + value = getattr(self.jitter.cpu, symbol.name) + if value != symb_value: + errors.append(DriftInfo(symbol, symb_value, value)) + elif symbol.is_mem() and symbol.arg.is_int(): + value_chr = self.jitter.vm.get_mem(int(symbol.arg), + symbol.size / 8) + exp_value = int(value_chr[::-1].encode("hex"), 16) + if exp_value != symb_value: + errors.append(DriftInfo(symbol, symb_value, exp_value)) + + # Check for drift, and act accordingly + if errors: + raise DriftException(errors) + + def callback(self, _): + """Called before each instruction""" + # Assert synchronization with concrete execution + self._check_state() + + # Call callbacks associated to the current address + cur_addr = self.jitter.pc + + if cur_addr in self.handler: + self.handler[cur_addr](self) + return True + + if cur_addr in self.instrumentation: + self.instrumentation[cur_addr](self) + + # Handle current address + self.handle(ExprInt(cur_addr, self.ir_arch.IRDst.size)) + + # Avoid memory issue in ExpressionSimplifier + if len(self.symb.expr_simp.simplified_exprs) > 100000: + self.symb.expr_simp.simplified_exprs.clear() + + # Get IR blocks + if cur_addr in self.addr_to_cacheblocks: + self.ir_arch.blocks.clear() + self.ir_arch.blocks.update(self.addr_to_cacheblocks[cur_addr]) + else: + + ## Reset cache structures + self.mdis.job_done.clear() + self.ir_arch.blocks.clear()# = {} + + ## Update current state + asm_block = self.mdis.dis_bloc(cur_addr) + self.ir_arch.add_bloc(asm_block) + self.addr_to_cacheblocks[cur_addr] = dict(self.ir_arch.blocks) + + # Emulate the current instruction + self.symb.reset_modified() + + # Is the symbolic execution going (potentially) to jump on a lbl_gen? + if len(self.ir_arch.blocks) == 1: + next_addr = self.symb.emul_ir_blocks(cur_addr) + else: + # Emulation could stuck in generated IR blocks + # But concrete execution callback is not enough precise to obtain + # the full IR blocks path + # -> Use a fully concrete execution to get back path + + # Update the concrete execution + self._update_state_from_concrete_symb(self.symb_concrete) + while True: + next_addr_concrete = self.symb_concrete.emul_ir_block(cur_addr) + self.symb.emul_ir_block(cur_addr) + + if not(expr_is_label(next_addr_concrete) and + next_addr_concrete.name.offset is None): + # Not a lbl_gen, exit + break + + # Call handle with lbl_gen state + self.handle(next_addr_concrete) + cur_addr = next_addr_concrete + + # At this stage, symbolic engine is one instruction after the concrete + # engine + + return True + + def take_snapshot(self): + """Return a snapshot of the current state (including jitter state)""" + snapshot = { + "mem": self.jitter.vm.get_all_memory(), + "regs": self.jitter.cpu.get_gpreg(), + "symb": self.symb.symbols.copy() + } + return snapshot + + def restore_snapshot(self, snapshot, memory=True): + """Restore a @snapshot taken with .take_snapshot + @snapshot: .take_snapshot output + @memory: (optional) if set, also restore the memory + """ + # Restore memory + if memory: + self.jitter.vm.reset_memory_page_pool() + self.jitter.vm.reset_code_bloc_pool() + for addr, metadata in snapshot["mem"].iteritems(): + self.jitter.vm.add_memory_page(addr, + metadata["access"], + metadata["data"]) + + # Restore registers + self.jitter.pc = snapshot["regs"][self.ir_arch.pc.name] + self.jitter.cpu.set_gpreg(snapshot["regs"]) + + # Reset intern elements + self.jitter.vm.set_exception(0) + self.jitter.cpu.set_exception(0) + self.jitter.bs._atomic_mode = False + + # Reset symb exec + for key, _ in self.symb.symbols.items(): + del self.symb.symbols[key] + for expr, value in snapshot["symb"].items(): + self.symb.symbols[expr] = value + + def update_state(self, assignblk): + """From this point, assume @assignblk in the symbolic execution + @assignblk: AssignBlock/{dst -> src} + """ + for dst, src in assignblk.iteritems(): + self.symb.apply_change(dst, src) + + def _update_state_from_concrete_symb(self, symbexec, cpu=True, mem=False): + if mem: + # Values will be retrieved from the concrete execution if they are + # not present + for symbol in symbexec.symbols.symbols_mem.copy(): + del symbexec.symbols[symbol] + if cpu: + regs = self.ir_arch.arch.regs.attrib_to_regs[self.ir_arch.attrib] + for reg in regs: + if hasattr(self.jitter.cpu, reg.name): + value = ExprInt(getattr(self.jitter.cpu, reg.name), + size=reg.size) + symbexec.symbols[reg] = value + + def update_state_from_concrete(self, cpu=True, mem=False): + """Update the symbolic state with concrete values from the concrete + engine + + @cpu: (optional) if set, update registers' value + @mem: (optional) if set, update memory value + + /!\ all current states will be loss. + This function is usually called when states are no more synchronized + (at the beginning, returning from an unstubbed syscall, ...) + """ + self._update_state_from_concrete_symb(self.symb, cpu, mem) + + def eval_expr(self, expr): + """Return the evaluation of @expr: + @expr: Expr instance""" + return self.symb.eval_expr(expr) + + @staticmethod + def memory_to_expr(addr): + """Translate an address to its corresponding symbolic ID (8bits) + @addr: int""" + return ExprId("MEM_0x%x" % int(addr), 8) + + def symbolize_memory(self, memory_range): + """Register a range of memory addresses to symbolize + @memory_range: object with support of __in__ operation (intervals, list, + ...) + """ + self.symb.dse_memory_range = memory_range + self.symb.dse_memory_to_expr = self.memory_to_expr + + +class DSEPathConstraint(DSEEngine): + """Dynamic Symbolic Execution Engine keeping the path constraint + + Possible new "solutions" are produced along the path, by inversing concrete + path constraint. Thus, a "solution" is a potential initial context leading + to a new path. + + In order to produce a new solution, one can extend this class, and override + 'handle_solution' to produce a solution which fit its needs. + + If one is only interested in constraints associated to its path, the option + "produce_solution" should be set to False, to speed up emulation. + The constraints are accumulated in the .z3_cur z3.Solver object. + + """ + + # Maximum memory size to inject in constraints solving + MAX_MEMORY_INJECT = 0x10000 + + def __init__(self, machine, produce_solution=True, **kwargs): + """Init a DSEPathConstraint + @machine: Machine of the targeted architecture instance + @produce_solution: (optional) if set, new solutions will be computed""" + super(DSEPathConstraint, self).__init__(machine, **kwargs) + self.path_constraint = set() + self.produce_solution = produce_solution + self.cur_solver = z3.Solver() + self.new_solutions = {} # destination -> set of model + self.z3_trans = Translator.to_language("z3") + + def take_snapshot(self, *args, **kwargs): + snap = super(DSEPathConstraint, self).take_snapshot(*args, **kwargs) + snap["new_solutions"] = {dst: src.copy + for dst, src in self.new_solutions.iteritems()} + snap["cur_constraints"] = self.cur_solver.assertions() + return snap + + def restore_snapshot(self, snapshot, *args, **kwargs): + super(DSEPathConstraint, self).restore_snapshot(snapshot, *args, + **kwargs) + self.new_solutions.clear() + self.new_solutions.update(snapshot["new_solutions"]) + self.cur_solver = z3.Solver() + self.cur_solver.add(snapshot["cur_constraints"]) + + def handle_solution(self, model, destination): + """Called when a new solution for destination @destination is founded + @model: z3 model instance + @destination: Expr instance for an addr which is not on the DSE path + """ + self.new_solutions.setdefault(destination, set()).add(model) + + def handle(self, cur_addr): + symb_pc = self.eval_expr(self.ir_arch.IRDst) + possibilities = possible_values(symb_pc) + if len(possibilities) == 1: + assert next(iter(possibilities)).value == cur_addr + else: + cur_path_constraint = set() # path_constraint for the concrete path + for possibility in possibilities: + path_constraint = set() # Set of ExprAff for the possible path + + # Get constraint associated to the possible path + memory_to_add = ModularIntervals(symb_pc.size) + for cons in possibility.constraints: + eaff = cons.to_constraint() + # eaff.get_r(mem_read=True) is not enough + # ExprAff consider a Memory access in dst as a write + mem = eaff.dst.get_r(mem_read=True) + mem.update(eaff.src.get_r(mem_read=True)) + for expr in mem: + if expr.is_mem(): + addr_range = expr_range(expr.arg) + # At upper bounds, add the size of the memory access + # if addr (- [a, b], then @size[addr] reachables + # values are in @8[a, b + size[ + for start, stop in addr_range: + stop += (expr.size / 8) - 1 + full_range = ModularIntervals(symb_pc.size, + [(start, stop)]) + memory_to_add.update(full_range) + path_constraint.add(eaff) + + if memory_to_add.length > self.MAX_MEMORY_INJECT: + # TODO re-croncretize the constraint or z3-try + raise RuntimeError("Not implemented: too long memory area") + + # Inject memory + for start, stop in memory_to_add: + for address in xrange(start, stop + 1): + expr_mem = ExprMem(ExprInt(address, + self.ir_arch.pc.size), + 8) + value = self.eval_expr(expr_mem) + if not value.is_int(): + raise TypeError("Rely on a symbolic memory case, " \ + "address 0x%x" % address) + path_constraint.add(ExprAff(expr_mem, value)) + + if possibility.value == cur_addr: + # Add path constraint + cur_path_constraint = path_constraint + + elif self.produce_solution: + # Looking for a new solution + self.cur_solver.push() + for cons in path_constraint: + trans = self.z3_trans.from_expr(cons) + trans = z3.simplify(trans) + self.cur_solver.add(trans) + + result = self.cur_solver.check() + if result == z3.sat: + model = self.cur_solver.model() + self.handle_solution(model, possibility.value) + self.cur_solver.pop() + + # Update current solver + for cons in cur_path_constraint: + self.cur_solver.add(self.z3_trans.from_expr(cons)) diff --git a/miasm2/arch/aarch64/regs.py b/miasm2/arch/aarch64/regs.py index 01ae4252..4589c17a 100644 --- a/miasm2/arch/aarch64/regs.py +++ b/miasm2/arch/aarch64/regs.py @@ -97,6 +97,11 @@ all_regs_ids = [ all_regs_ids_no_alias = all_regs_ids +attrib_to_regs = { + 'l': all_regs_ids_no_alias, + 'b': all_regs_ids_no_alias, +} + all_regs_ids_byname = dict([(x.name, x) for x in all_regs_ids]) all_regs_ids_init = (simd08_init + diff --git a/miasm2/arch/arm/regs.py b/miasm2/arch/arm/regs.py index 69488cb5..400c6080 100644 --- a/miasm2/arch/arm/regs.py +++ b/miasm2/arch/arm/regs.py @@ -71,6 +71,11 @@ all_regs_ids = [ all_regs_ids_no_alias = all_regs_ids +attrib_to_regs = { + 'l': all_regs_ids_no_alias, + 'b': all_regs_ids_no_alias, +} + all_regs_ids_byname = dict([(x.name, x) for x in all_regs_ids]) all_regs_ids_init = [R0_init, R1_init, R2_init, R3_init, diff --git a/miasm2/arch/mips32/regs.py b/miasm2/arch/mips32/regs.py index 974d3a2b..fbd55a46 100644 --- a/miasm2/arch/mips32/regs.py +++ b/miasm2/arch/mips32/regs.py @@ -62,6 +62,11 @@ all_regs_ids_init = [PC_init, PC_FETCH_init, R_LO_init, R_HI_init] + \ gpregs_init + regs_flt_init + regs_fcc_init + regs_cpr0_init all_regs_ids_no_alias = all_regs_ids[:] +attrib_to_regs = { + 'l': all_regs_ids_no_alias, + 'b': all_regs_ids_no_alias, +} + regs_init = {} for i, r in enumerate(all_regs_ids): regs_init[r] = all_regs_ids_init[i] diff --git a/miasm2/arch/msp430/regs.py b/miasm2/arch/msp430/regs.py index 1e35029f..11385779 100644 --- a/miasm2/arch/msp430/regs.py +++ b/miasm2/arch/msp430/regs.py @@ -92,6 +92,11 @@ all_regs_ids = [ all_regs_ids_no_alias = all_regs_ids +attrib_to_regs = { + 'l': all_regs_ids_no_alias, + 'b': all_regs_ids_no_alias, +} + all_regs_ids_byname = dict([(x.name, x) for x in all_regs_ids]) all_regs_ids_init = [PC_init, SP_init, SR_init, R3_init, diff --git a/miasm2/arch/x86/regs.py b/miasm2/arch/x86/regs.py index 34585dae..7354457f 100644 --- a/miasm2/arch/x86/regs.py +++ b/miasm2/arch/x86/regs.py @@ -424,6 +424,12 @@ all_regs_ids_no_alias = [ exception_flags, interrupt_num, ] + fltregs32_expr +attrib_to_regs = { + 16: regs16_expr + all_regs_ids_no_alias[all_regs_ids_no_alias.index(zf):], + 32: regs32_expr + all_regs_ids_no_alias[all_regs_ids_no_alias.index(zf):], + 64: all_regs_ids_no_alias, +} + all_regs_ids_byname = dict([(x.name, x) for x in all_regs_ids]) all_regs_ids_init = [ExprId("%s_init" % x.name, x.size) for x in all_regs_ids] diff --git a/test/analysis/dse.py b/test/analysis/dse.py new file mode 100644 index 00000000..59fc2b59 --- /dev/null +++ b/test/analysis/dse.py @@ -0,0 +1,94 @@ +import sys +import os +from pdb import pm + +from miasm2.core.cpu import ParseAst +from miasm2.arch.x86.arch import mn_x86, base_expr, variable +from miasm2.core import parse_asm +from miasm2.expression.expression import * +from miasm2.core import asmblock +from elfesteem.strpatchwork import StrPatchwork +from miasm2.analysis.machine import Machine +from miasm2.jitter.csts import * +from miasm2.analysis.dse import DSEEngine + +reg_and_id = dict(mn_x86.regs.all_regs_ids_byname) + +class DSE_test(object): + """Inspired from TEST/ARCH/X86 + + Test the symbolic execution correctly follow generated labels + """ + TXT = ''' + main: + SHL EDX, CL + RET + ''' + + arch_name = "x86_32" + arch_attrib = 32 + ret_addr = 0x1337beef + + run_addr = 0x0 + + def __init__(self, jitter_engine): + self.machine = Machine(self.arch_name) + self.myjit = self.machine.jitter(jitter_engine) + self.myjit.init_stack() + + self.myjit.jit.log_regs = False + self.myjit.jit.log_mn = False + + def init_machine(self): + self.myjit.vm.add_memory_page(self.run_addr, PAGE_READ | PAGE_WRITE, self.assembly) + self.myjit.push_uint32_t(self.ret_addr) + self.myjit.add_breakpoint(self.ret_addr, lambda x:False) + + def prepare(self): + self.myjit.cpu.ECX = 4 + self.myjit.cpu.EDX = 5 + + self.dse = DSEEngine(self.machine) + self.dse.attach(self.myjit) + + def __call__(self): + self.asm() + self.init_machine() + self.prepare() + self.run() + self.check() + + def run(self): + + self.myjit.init_run(self.run_addr) + self.myjit.continue_run() + + assert(self.myjit.pc == self.ret_addr) + + def asm(self): + blocks, symbol_pool = parse_asm.parse_txt(mn_x86, self.arch_attrib, self.TXT, + symbol_pool=self.myjit.ir_arch.symbol_pool) + + + # fix shellcode addr + symbol_pool.set_offset(symbol_pool.getby_name("main"), 0x0) + s = StrPatchwork() + patches = asmblock.asm_resolve_final(mn_x86, blocks, symbol_pool) + for offset, raw in patches.items(): + s[offset] = raw + + s = str(s) + self.assembly = s + + def check(self): + regs = self.dse.ir_arch.arch.regs + value = self.dse.eval_expr(regs.EDX) + # The expected value should contains '<<', showing it has been in the + # corresponding generated label + expected = ExprOp('<<', regs.EDX, + ExprCompose(regs.ECX[0:8], + ExprInt(0x0, 24)) & ExprInt(0x1F, 32)) + assert value == expected + +if __name__ == "__main__": + [test(*sys.argv[1:])() for test in [DSE_test]] diff --git a/test/test_all.py b/test/test_all.py index 94779898..d2c3e5e2 100755 --- a/test/test_all.py +++ b/test/test_all.py @@ -278,6 +278,12 @@ testset += RegressionTest(["depgraph.py"], base_dir="analysis", (14, 1), (15, 1)) ]) testset += RegressionTest(["modularintervals.py"], base_dir="analysis") +for jitter in ArchUnitTest.jitter_engines: + if jitter in blacklist.get(script, []): + continue + tags = [TAGS[jitter]] if jitter in TAGS else [] + testset += RegressionTest(["dse.py", jitter], base_dir="analysis", tags=tags) + testset += RegressionTest(["range.py"], base_dir="analysis", tags=[TAGS["z3"]]) @@ -586,6 +592,17 @@ for options, nb_sol, tag in [([], 4, []), depends=[test_x86_32_if_reg], tags=tag) +dse_crackme_out = Example.get_sample("dse_crackme.c")[:-2] +dse_crackme = ExampleSymbolExec([Example.get_sample("dse_crackme.c"), + "-o", dse_crackme_out], + products=[dse_crackme_out], + executable="cc") +testset += dse_crackme +testset += ExampleSymbolExec(["dse_crackme.py", dse_crackme_out], + depends=[dse_crackme], + products=["test.txt"], + tags=[TAGS["z3"]]) + ## Jitter class ExampleJitter(Example): """Jitter examples specificities: diff --git a/test/utils/test.py b/test/utils/test.py index e4f8888c..9e569a4b 100644 --- a/test/utils/test.py +++ b/test/utils/test.py @@ -2,22 +2,25 @@ class Test(object): "Stand for a test to run" def __init__(self, command_line, base_dir="", depends=None, - products=None, tags=None): + products=None, tags=None, executable=None): """Create a Test instance. @command_line: list of string standing for arguments to launch @base_dir: base directory for launch @depends: list of Test instance indicating dependencies @products: elements produced to remove after tests @tags: list of str indicating current test categories + @executable: if set, use this binary instead of Python """ self.command_line = command_line self.base_dir = base_dir self.depends = depends if depends else [] self.products = products if products else [] self.tags = tags if tags else [] + self.executable = executable def __repr__(self): displayed = ["command_line", "base_dir", "depends", "products", "tags"] + displayed.append("python" if not self.executable else self.executable) return "<Test " + \ " ".join("%s=%s" % (n, getattr(self,n)) for n in displayed ) + ">" @@ -29,4 +32,6 @@ class Test(object): self.base_dir == test.base_dir, self.depends == test.depends, self.products == test.products, - self.tags == test.tags]) + self.tags == test.tags, + self.executable == test.executable, + ]) diff --git a/test/utils/testset.py b/test/utils/testset.py index e1df01a4..29a4e5d0 100644 --- a/test/utils/testset.py +++ b/test/utils/testset.py @@ -153,7 +153,8 @@ class TestSet(object): os.chdir(test.base_dir) # Launch test - testpy = subprocess.Popen(([sys.executable] + + executable = test.executable if test.executable else sys.executable + testpy = subprocess.Popen(([executable] + init_args + test.command_line), stdout=subprocess.PIPE, stderr=subprocess.PIPE) |