diff options
| author | Theofilos Augoustis <theofilos.augoustis@gmail.com> | 2024-01-22 20:02:03 +0100 |
|---|---|---|
| committer | Theofilos Augoustis <theofilos.augoustis@gmail.com> | 2024-01-22 20:02:03 +0100 |
| commit | c8bb7528b8dc053b45d8c652663479f327273ef8 (patch) | |
| tree | 06a1488549e2206784edd34e227a463ebf30b165 | |
| parent | 605e12fc5cf0fb64e45f68378390a09aa28df2f9 (diff) | |
| download | focaccia-c8bb7528b8dc053b45d8c652663479f327273ef8.tar.gz focaccia-c8bb7528b8dc053b45d8c652663479f327273ef8.zip | |
Use symbolic execution to speed up QEMU testing
We don't need QEMU's log anymore, so we connect to a GDB server instance that the user has to start with `$ qemu -g <port> ...`. Co-authored-by: Theofilos Augoustis <theofilos.augoustis@gmail.com> Co-authored-by: Nicola Crivellin <nicola.crivellin98@gmail.com>
Diffstat (limited to '')
| -rw-r--r-- | focaccia/arch/x86.py | 75 | ||||
| -rw-r--r-- | focaccia/lldb_target.py | 33 | ||||
| -rw-r--r-- | focaccia/symbolic.py | 12 | ||||
| -rw-r--r-- | tools/qemu_tool.py | 233 | ||||
| -rw-r--r-- | tools/verify_qemu.py (renamed from tools/invoke_qemu_tool.py) | 47 |
5 files changed, 231 insertions, 169 deletions
diff --git a/focaccia/arch/x86.py b/focaccia/arch/x86.py index 95e1a82..ebe74dd 100644 --- a/focaccia/arch/x86.py +++ b/focaccia/arch/x86.py @@ -61,30 +61,65 @@ def decompose_rflags(rflags: int) -> dict[str, int]: """ return { # FLAGS - 'CF': rflags & 0x0001, - # 0x0002 reserved - 'PF': rflags & 0x0004, - # 0x0008 reserved - 'AF': rflags & 0x0010, - # 0x0020 reserved - 'ZF': rflags & 0x0040, - 'SF': rflags & 0x0080, - 'TF': rflags & 0x0100, - 'IF': rflags & 0x0200, - 'DF': rflags & 0x0400, - 'OF': rflags & 0x0800, - 'IOPL': rflags & 0x3000, - 'NT': rflags & 0x4000, + 'CF': (rflags & 0x0001) != 0, + # 0x0002 reserved + 'PF': (rflags & 0x0004) != 0, + # 0x0008 reserved + 'AF': (rflags & 0x0010) != 0, + # 0x0020 reserved + 'ZF': (rflags & 0x0040) != 0, + 'SF': (rflags & 0x0080) != 0, + 'TF': (rflags & 0x0100) != 0, + 'IF': (rflags & 0x0200) != 0, + 'DF': (rflags & 0x0400) != 0, + 'OF': (rflags & 0x0800) != 0, + 'IOPL': (rflags & 0x3000) != 0, + 'NT': (rflags & 0x4000) != 0, # EFLAGS - 'RF': rflags & 0x00010000, - 'VM': rflags & 0x00020000, - 'AC': rflags & 0x00040000, - 'VIF': rflags & 0x00080000, - 'VIP': rflags & 0x00100000, - 'ID': rflags & 0x00200000, + 'RF': (rflags & 0x00010000) != 0, + 'VM': (rflags & 0x00020000) != 0, + 'AC': (rflags & 0x00040000) != 0, + 'VIF': (rflags & 0x00080000) != 0, + 'VIP': (rflags & 0x00100000) != 0, + 'ID': (rflags & 0x00200000) != 0, } +def compose_rflags(rflags: dict[str, int]) -> int: + """Compose separate flags into RFLAGS register's value. + + Uses flag name abbreviation conventions from + `https://en.wikipedia.org/wiki/FLAGS_register`. + + :param rflags: A dictionary mapping Miasm's flag names to their alues. + :return: The RFLAGS register value. + """ + return ( + # FLAGS + (0x0001 if rflags['CF'] else 0) | + # 0x0002 reserved + (0x0004 if rflags['PF'] else 0) | + # 0x0008 reserved + (0x0010 if rflags['AF'] else 0) | + # 0x0020 reserved + (0x0040 if rflags['ZF'] else 0) | + (0x0080 if rflags['SF'] else 0) | + (0x0100 if rflags['TF'] else 0) | + (0x0200 if rflags['IF'] else 0) | + (0x0400 if rflags['DF'] else 0) | + (0x0800 if rflags['OF'] else 0) | + (0x3000 if rflags['IOPL'] else 0) | + (0x4000 if rflags['NT'] else 0) | + + # EFLAGS + (0x00010000 if rflags['RF'] else 0) | + (0x00020000 if rflags['VM'] else 0) | + (0x00040000 if rflags['AC'] else 0) | + (0x00080000 if rflags['VIF'] else 0) | + (0x00100000 if rflags['VIP'] else 0) | + (0x00200000 if rflags['ID'] else 0) + ) + class ArchX86(Arch): def __init__(self): super().__init__(archname, regnames) diff --git a/focaccia/lldb_target.py b/focaccia/lldb_target.py index 444ab36..05ab66d 100644 --- a/focaccia/lldb_target.py +++ b/focaccia/lldb_target.py @@ -26,6 +26,9 @@ class ConcreteRegisterError(Exception): class ConcreteMemoryError(Exception): pass +class ConcreteSectionError(Exception): + pass + class LLDBConcreteTarget: def __init__(self, executable: str, argv: list[str] = []): """Construct an LLDB concrete target. Stop at entry. @@ -74,6 +77,14 @@ class LLDBConcreteTarget: """Step forward by a single instruction.""" thread: lldb.SBThread = self.process.GetThreadAtIndex(0) thread.StepInstruction(False) + + def run_until(self, address: int) -> None: + """Continue execution until the address is arrived, ignores other breakpoints""" + bp = self.target.BreakpointCreateByAddress(address) + while self.read_register("pc") != address: + self.target.run() + self.target.BreakpointDelete(bp.GetID()) + def record_snapshot(self) -> ProgramState: """Record the concrete target's state in a ProgramState object.""" @@ -208,3 +219,25 @@ class LLDBConcreteTarget: command = f'breakpoint delete {addr}' result = lldb.SBCommandReturnObject() self.interpreter.HandleCommand(command, result) + + def get_basic_block(self, addr: int) -> [lldb.SBInstruction]: + """Returns a basic block pointed by addr + a code section is considered a basic block only if + the last instruction is a brach, e.g. JUMP, CALL, RET + """ + block = [] + while not self.target.ReadInstructions(lldb.SBAddress(addr, self.target), 1)[0].is_branch: + block.append(self.target.ReadInstructions(lldb.SBAddress(addr, self.target), 1)[0]) + addr += self.target.ReadInstructions(lldb.SBAddress(addr, self.target), 1)[0].size + block.append(self.target.ReadInstructions(lldb.SBAddress(addr, self.target), 1)[0]) + + return block + + def get_symbol(self, addr: int) -> lldb.SBSymbol: + """Returns the symbol that belongs to the addr""" + for s in self.target.module.symbols: + if (s.GetType() == lldb.eSymbolTypeCode + and s.GetStartAddress().GetLoadAddress(self.target) <= addr + and addr < s.GetEndAddress().GetLoadAddress(self.target)): + return s + raise ConcreteSectionError(f'Error getting the symbol to which address {hex(addr)} belongs to') diff --git a/focaccia/symbolic.py b/focaccia/symbolic.py index 7ab0d84..50108d4 100644 --- a/focaccia/symbolic.py +++ b/focaccia/symbolic.py @@ -195,8 +195,10 @@ class SymbolicTransform: def resolve_register(self, regname: str) -> int | None: accessed_regs.add(regname) return None - def resolve_memory(self, addr: int, size: int): assert(False) - def resolve_location(self, _): assert(False) + def resolve_memory(self, addr: int, size: int): + pass + def resolve_location(self, _): + assert(False) state = ConcreteStateWrapper() for expr in self.changed_regs.values(): @@ -421,7 +423,11 @@ def _run_block(pc: int, conc_state: MiasmConcreteState, ctx: DisassemblyContext) # Execute each instruction in the current basic block and record the # resulting change in program state. for assignblk in irblock: - modified = engine.eval_assignblk(assignblk) + # A clean engine for the single-instruction diff, otherwise + # it concatenates the current instruction to the previous ones in + # the block. + _engine = SymbolicExecutionEngine(ctx.lifter) + modified = _engine.eval_assignblk(assignblk) symb_trace.append((assignblk.instr.offset, modified)) # Run a single instruction diff --git a/tools/qemu_tool.py b/tools/qemu_tool.py index c7730fd..d670692 100644 --- a/tools/qemu_tool.py +++ b/tools/qemu_tool.py @@ -3,134 +3,123 @@ gdb -n --batch -x qemu_tool.py """ -import argparse -import re -import shlex -import subprocess -from typing import TextIO +import gdb +import platform import focaccia.parser as parser -from focaccia.arch import x86 -from focaccia.lldb_target import MemoryMap +from focaccia.arch import supported_architectures, Arch +from focaccia.compare import compare_symbolic, ErrorTypes from focaccia.snapshot import ProgramState - -def parse_memory_maps(stream: TextIO) -> tuple[list[MemoryMap], str]: - """ - :return: Returns the list of parsed memory mappings as well as the first - line in the stream that does not belong to the memory mapping - information, i.e. the line that terminates the block of mapping - information. - The line is returned for the technical reason that the parser - needs to read a line from the stream in order to determine that - this line does no longer belong to the mapping information; but it - might still contain other important information. - """ - mappings = [] - while True: - line = stream.readline() - split = line.split(' ') - if len(split) != 3 or not re.match('^[0-9a-f]+-[0-9a-f]+$', split[0]): - return mappings, line - - addr_range, size, perms = split - start, end = addr_range.split('-') - start, end = int(start, 16), int(end, 16) - mappings.append(MemoryMap(start, end, '[unnamed]', perms)) - -def copy_memory(proc, state: ProgramState, maps: list[MemoryMap]): - """Copy memory from a GDB process to a ProgramState object. - - Problem: Reading large mappings via GDB takes way too long (~500ms for ~8MB). - """ - for mapping in maps: - # Only copy read- and writeable memory from the process. This is a - # heuristic to try to copy only heap and stack. - if 'rw' not in mapping.perms: - continue - - map_size = mapping.end_address - mapping.start_address - mem = proc.read_memory(mapping.start_address, map_size) - assert(mem.contiguous) - assert(mem.nbytes == len(mem.tobytes())) - assert(mem.nbytes == map_size) - state.write_memory(mapping.start_address, mem.tobytes()) - -def run_gdb(qemu_log: TextIO, qemu_port: int) -> list[ProgramState]: - import gdb - - gdb.execute('set pagination 0') - gdb.execute('set sysroot') - gdb.execute(f'target remote localhost:{qemu_port}') - process = gdb.selected_inferior() - - arch = x86.ArchX86() - mappings: list[MemoryMap] = [] - states: list[ProgramState] = [] - - while process.is_valid() and len(process.threads()) > 0: - for line in qemu_log: - if re.match('^start +end +size +prot$', line): - mappings, line = parse_memory_maps(qemu_log) - - if line.startswith('Trace'): - states.append(ProgramState(arch)) - copy_memory(process, states[-1], mappings) - continue - - if states: - parser._parse_qemu_line(line, states[-1]) - - gdb.execute('si', to_string=True) +from focaccia.symbolic import SymbolicTransform, eval_symbol +from focaccia.utils import print_result + +from verify_qemu import make_argparser + +class GDBProgramState: + def __init__(self, process: gdb.Inferior, frame: gdb.Frame): + self._proc = process + self._frame = frame + + def read_register(self, regname: str) -> int | None: + try: + return int(self._frame.read_register(regname.lower())) + except ValueError as err: + from focaccia.arch import x86 + rflags = int(self._frame.read_register('eflags')) + rflags = x86.decompose_rflags(rflags) + if regname in rflags: + return rflags[regname] + + print(f'{regname}: {err}') + return None + + def read_memory(self, addr: int, size: int) -> bytes | None: + try: + return self._proc.read_memory(addr, size).tobytes() + except gdb.MemoryError as err: + print(f'@{size}[{hex(addr)}]: {err}') + return None + +class GDBServerStateIterator: + def __init__(self, address: str, port: int): + gdb.execute('set pagination 0') + gdb.execute('set sysroot') + gdb.execute(f'target remote {address}:{port}') + self._process = gdb.selected_inferior() + self._first_next = True + + def __iter__(self): + return self + + def __next__(self): + # The first call to __next__ should yield the first program state, + # i.e. before stepping the first time + if self._first_next: + self._first_next = False + return GDBProgramState(self._process, gdb.selected_frame()) + + # Step + pc = gdb.selected_frame().read_register('pc') + new_pc = pc + while pc == new_pc: + gdb.execute('si', to_string=True) + if not self._process.is_valid() or len(self._process.threads()) == 0: + raise StopIteration + new_pc = gdb.selected_frame().read_register('pc') + + return GDBProgramState(self._process, gdb.selected_frame()) + +def collect_conc_trace(arch: Arch, \ + gdb: GDBServerStateIterator, \ + strace: list[SymbolicTransform]) \ + -> list[ProgramState]: + states = [] + for qemu, transform in zip(gdb, strace): + qemu_pc = qemu.read_register('pc') + assert(qemu_pc is not None) + + if qemu_pc != transform.addr: + print(f'Fatal error: QEMU\'s program counter' + f' ({hex(qemu_pc)}) does not match the' + f' expected program counter in the symbolic trace' + f' ({hex(transform.addr)}).') + print(f'Processing only partial trace up to this instruction.') + return states + + state = ProgramState(arch) + state.set_register('PC', transform.addr) + + accessed_regs = transform.get_used_registers() + accessed_mems = transform.get_used_memory_addresses() + for regname in accessed_regs: + regval = qemu.read_register(regname) + if regval is not None: + state.set_register(regname, regval) + for mem in accessed_mems: + assert(mem.size % 8 == 0) + addr = eval_symbol(mem.ptr, qemu) + mem = qemu.read_memory(addr, int(mem.size / 8)) + if mem is not None: + state.write_memory(addr, mem) + states.append(state) return states -def make_argparser(): - prog = argparse.ArgumentParser() - prog.add_argument('binary', - type=str, - help='The binary to run and record.') - prog.add_argument('--binary-args', - type=str, - help='A string of arguments to be passed to the binary.') - prog.add_argument('--output', '-o', help='Name of output file.') - prog.add_argument('--gdbserver-port', type=int, default=12421) - prog.add_argument('--qemu', type=str, default='qemu-x86_64', - help='QEMU binary to invoke. [Default: qemu-x86_64') - prog.add_argument('--qemu-log', type=str, default='qemu.log') - prog.add_argument('--qemu-extra-args', type=str, default='', - help='Arguments passed to QEMU in addition to the' - ' default ones required by this script.') - return prog - -if __name__ == "__main__": +def main(): args = make_argparser().parse_args() - binary = args.binary - binary_args = shlex.split(args.binary_args) if args.binary_args else '' - - qemu_bin = args.qemu gdbserver_port = args.gdbserver_port - qemu_log_name = args.qemu_log - qemu_args = [ - qemu_bin, - '--trace', 'target_mmap*', - '--trace', 'memory_notdirty_*', - # We write QEMU's output to a log file, then read it from that file. - # This is preferred over reading from the process's stdout pipe because - # we require a non-blocking solution that returns when all available - # lines have been read. - '-D', qemu_log_name, - '-d', 'cpu,fpu,exec,unimp,page,strace', - '-g', str(gdbserver_port), - *shlex.split(args.qemu_extra_args), - binary, - *binary_args, - ] - - qemu = subprocess.Popen(qemu_args) - - with open(qemu_log_name, 'r') as qemu_log: - snapshots = run_gdb(qemu_log, gdbserver_port) - - with open(args.output, 'w') as file: - parser.serialize_snapshots(snapshots, file) + with open(args.symb_trace, 'r') as strace: + symb_transforms = parser.parse_transformations(strace) + + arch = supported_architectures[platform.machine()] + conc_states = collect_conc_trace( + arch, + GDBServerStateIterator('localhost', gdbserver_port), + symb_transforms) + + res = compare_symbolic(conc_states, symb_transforms) + print_result(res, ErrorTypes.POSSIBLE) + +if __name__ == "__main__": + main() diff --git a/tools/invoke_qemu_tool.py b/tools/verify_qemu.py index 152c208..98437cb 100644 --- a/tools/invoke_qemu_tool.py +++ b/tools/verify_qemu.py @@ -1,39 +1,32 @@ """ -This mechanism exists to retrieve per-instruction program snapshots from QEMU, -specifically including memory dumps. This is surprisingly nontrivial (we don't -have a log option like `-d memory`), and the mechanism we have implemented to -achieve this is accordingly complicated. - -In short: We use QEMU's feature to interact with the emulation via a GDB server -interface together with parsing QEMU's logs to record register and memory state -at single-instruction intervals. - -We need QEMU's log in addition to the GDB server because QEMU's GDB server does -not support querying memory mapping information. We need this information to -know from where we need to read memory, so we parse memory mappings from the -log (option `-d page`). +Spawn GDB, connect to QEMU's GDB server, and read test states from that. We need two scripts (this one and the primary `qemu_tool.py`) because we can't pass arguments to scripts executed via `gdb -x <script>`. -This script (`invoke_qemu_tool.py`) is the one the user interfaces with. It +This script (`verify_qemu.py`) is the one the user interfaces with. It eventually calls `execv` to spawn a GDB process that calls the main -`qemu_tool.py` script; `python invoke_qemu_tool.py` essentially behaves as if +`qemu_tool.py` script; `python verify_qemu.py` essentially behaves as if something like `gdb --batch -x qemu_tool.py` were executed instead. Before it starts GDB, though, it parses command line arguments and applies some weird but necessary logic to pass them to `qemu_tool.py`. - -The main script `qemu_tool.py`, which runs inside of GDB, finally forks a QEMU -instance that provides a GDB server and writes its logs to a file. It then -connects GDB to that server and incrementally reads the QEMU logs while -stepping through the program. Doing that, it generates program snapshots at -each instruction. """ +import argparse import os +import subprocess import sys -from qemu_tool import make_argparser +def make_argparser(): + """This is also used by the GDB-invoked script to parse its args.""" + prog = argparse.ArgumentParser() + prog.add_argument('--symb-trace', + required=True, + help='A symbolic transformation trace to be used for' \ + ' verification.') + prog.add_argument('--output', '-o', help='Name of output file.') + prog.add_argument('gdbserver_port', type=int) + return prog def quoted(s: str) -> str: return f'"{s}"' @@ -67,12 +60,18 @@ if __name__ == "__main__": # mechanism to pass arguments to a script that it executes, so we # overwrite `sys.argv` manually before invoking the script. argv_str = f'[{", ".join(quoted(a) for a in argv)}]' + path_str = f'[{", ".join(quoted(s) for s in sys.path)}]' - os.execv(args.gdb, [ + gdb_cmd = [ args.gdb, '-nx', # Don't parse any .gdbinits '--batch-silent' if args.quiet else '--batch', '-ex', f'py import sys', '-ex', f'py sys.argv = {argv_str}', + '-ex', f'py sys.path = {path_str}', '-x', qemu_tool_path - ]) + ] + proc = subprocess.Popen(gdb_cmd) + + ret = proc.wait() + exit(ret) |