Use symbolic execution to speed up QEMU testing

We don't need QEMU's log anymore, so we connect to a GDB server instance that the user has to start with `$ qemu -g <port> ...`. Co-authored-by: Theofilos Augoustis <theofilos.augoustis@gmail.com> Co-authored-by: Nicola Crivellin <nicola.crivellin98@gmail.com>
author: Theofilos Augoustis <theofilos.augoustis@gmail.com> 2024-01-22 20:02:03 +0100
committer: Theofilos Augoustis <theofilos.augoustis@gmail.com> 2024-01-22 20:02:03 +0100
commit: c8bb7528b8dc053b45d8c652663479f327273ef8 (patch)
tree: 06a1488549e2206784edd34e227a463ebf30b165
parent: 605e12fc5cf0fb64e45f68378390a09aa28df2f9 (diff)
download: focaccia-c8bb7528b8dc053b45d8c652663479f327273ef8.tar.gz
focaccia-c8bb7528b8dc053b45d8c652663479f327273ef8.zip
5 files changed, 231 insertions, 169 deletions
diff --git a/focaccia/arch/x86.py b/focaccia/arch/x86.py
index 95e1a82..ebe74dd 100644
--- a/focaccia/arch/x86.py
+++ b/focaccia/arch/x86.py
@@ -61,30 +61,65 @@ def decompose_rflags(rflags: int) -> dict[str, int]:
     """
     return {
         # FLAGS
-        'CF':     rflags & 0x0001,
-                         # 0x0002   reserved
-        'PF':     rflags & 0x0004,
-                         # 0x0008   reserved
-        'AF':     rflags & 0x0010,
-                         # 0x0020   reserved
-        'ZF':     rflags & 0x0040,
-        'SF':     rflags & 0x0080,
-        'TF':     rflags & 0x0100,
-        'IF':     rflags & 0x0200,
-        'DF':     rflags & 0x0400,
-        'OF':     rflags & 0x0800,
-        'IOPL':   rflags & 0x3000,
-        'NT':     rflags & 0x4000,
+        'CF':     (rflags & 0x0001) != 0,
+                          # 0x0002   reserved
+        'PF':     (rflags & 0x0004) != 0,
+                          # 0x0008   reserved
+        'AF':     (rflags & 0x0010) != 0,
+                          # 0x0020   reserved
+        'ZF':     (rflags & 0x0040) != 0,
+        'SF':     (rflags & 0x0080) != 0,
+        'TF':     (rflags & 0x0100) != 0,
+        'IF':     (rflags & 0x0200) != 0,
+        'DF':     (rflags & 0x0400) != 0,
+        'OF':     (rflags & 0x0800) != 0,
+        'IOPL':   (rflags & 0x3000) != 0,
+        'NT':     (rflags & 0x4000) != 0,
 
         # EFLAGS
-        'RF':     rflags & 0x00010000,
-        'VM':     rflags & 0x00020000,
-        'AC':     rflags & 0x00040000,
-        'VIF':    rflags & 0x00080000,
-        'VIP':    rflags & 0x00100000,
-        'ID':     rflags & 0x00200000,
+        'RF':     (rflags & 0x00010000) != 0,
+        'VM':     (rflags & 0x00020000) != 0,
+        'AC':     (rflags & 0x00040000) != 0,
+        'VIF':    (rflags & 0x00080000) != 0,
+        'VIP':    (rflags & 0x00100000) != 0,
+        'ID':     (rflags & 0x00200000) != 0,
     }
 
+def compose_rflags(rflags: dict[str, int]) -> int:
+    """Compose separate flags into RFLAGS register's value.
+
+    Uses flag name abbreviation conventions from
+    `https://en.wikipedia.org/wiki/FLAGS_register`.
+
+    :param rflags: A dictionary mapping Miasm's flag names to their alues.
+    :return: The RFLAGS register value.
+    """
+    return (
+        # FLAGS
+        (0x0001 if rflags['CF']   else 0) |
+                        # 0x0002   reserved
+        (0x0004 if rflags['PF']   else 0) |
+                        # 0x0008   reserved
+        (0x0010 if rflags['AF']   else 0) |
+                        # 0x0020   reserved
+        (0x0040 if rflags['ZF']   else 0) |
+        (0x0080 if rflags['SF']   else 0) |
+        (0x0100 if rflags['TF']   else 0) |
+        (0x0200 if rflags['IF']   else 0) |
+        (0x0400 if rflags['DF']   else 0) |
+        (0x0800 if rflags['OF']   else 0) |
+        (0x3000 if rflags['IOPL'] else 0) |
+        (0x4000 if rflags['NT']   else 0) |
+
+        # EFLAGS
+        (0x00010000 if rflags['RF']  else 0) |
+        (0x00020000 if rflags['VM']  else 0) |
+        (0x00040000 if rflags['AC']  else 0) |
+        (0x00080000 if rflags['VIF'] else 0) |
+        (0x00100000 if rflags['VIP'] else 0) |
+        (0x00200000 if rflags['ID']  else 0)
+    )
+
 class ArchX86(Arch):
     def __init__(self):
         super().__init__(archname, regnames)
diff --git a/focaccia/lldb_target.py b/focaccia/lldb_target.py
index 444ab36..05ab66d 100644
--- a/focaccia/lldb_target.py
+++ b/focaccia/lldb_target.py
@@ -26,6 +26,9 @@ class ConcreteRegisterError(Exception):
 class ConcreteMemoryError(Exception):
     pass
 
+class ConcreteSectionError(Exception):
+    pass
+
 class LLDBConcreteTarget:
     def __init__(self, executable: str, argv: list[str] = []):
         """Construct an LLDB concrete target. Stop at entry.
@@ -74,6 +77,14 @@ class LLDBConcreteTarget:
         """Step forward by a single instruction."""
         thread: lldb.SBThread = self.process.GetThreadAtIndex(0)
         thread.StepInstruction(False)
+    
+    def run_until(self, address: int) -> None:
+        """Continue execution until the address is arrived, ignores other breakpoints"""
+        bp = self.target.BreakpointCreateByAddress(address)
+        while self.read_register("pc") != address:
+            self.target.run()
+        self.target.BreakpointDelete(bp.GetID())
+
 
     def record_snapshot(self) -> ProgramState:
         """Record the concrete target's state in a ProgramState object."""
@@ -208,3 +219,25 @@ class LLDBConcreteTarget:
         command = f'breakpoint delete {addr}'
         result = lldb.SBCommandReturnObject()
         self.interpreter.HandleCommand(command, result)
+    
+    def get_basic_block(self, addr: int) -> [lldb.SBInstruction]:
+        """Returns a basic block pointed by addr
+        a code section is considered a basic block only if
+        the last instruction is a brach, e.g. JUMP, CALL, RET
+        """
+        block = []
+        while not self.target.ReadInstructions(lldb.SBAddress(addr, self.target), 1)[0].is_branch:
+            block.append(self.target.ReadInstructions(lldb.SBAddress(addr, self.target), 1)[0])
+            addr += self.target.ReadInstructions(lldb.SBAddress(addr, self.target), 1)[0].size
+        block.append(self.target.ReadInstructions(lldb.SBAddress(addr, self.target), 1)[0])
+
+        return block
+    
+    def get_symbol(self, addr: int) -> lldb.SBSymbol:
+        """Returns the symbol that belongs to the addr"""
+        for s in self.target.module.symbols:
+            if (s.GetType() == lldb.eSymbolTypeCode 
+            and s.GetStartAddress().GetLoadAddress(self.target) <= addr 
+            and addr < s.GetEndAddress().GetLoadAddress(self.target)):
+                return s
+        raise ConcreteSectionError(f'Error getting the symbol to which address {hex(addr)} belongs to')
diff --git a/focaccia/symbolic.py b/focaccia/symbolic.py
index 7ab0d84..50108d4 100644
--- a/focaccia/symbolic.py
+++ b/focaccia/symbolic.py
@@ -195,8 +195,10 @@ class SymbolicTransform:
             def resolve_register(self, regname: str) -> int | None:
                 accessed_regs.add(regname)
                 return None
-            def resolve_memory(self, addr: int, size: int): assert(False)
-            def resolve_location(self, _): assert(False)
+            def resolve_memory(self, addr: int, size: int):
+                pass
+            def resolve_location(self, _):
+                assert(False)
 
         state = ConcreteStateWrapper()
         for expr in self.changed_regs.values():
@@ -421,7 +423,11 @@ def _run_block(pc: int, conc_state: MiasmConcreteState, ctx: DisassemblyContext)
         # Execute each instruction in the current basic block and record the
         # resulting change in program state.
         for assignblk in irblock:
-            modified = engine.eval_assignblk(assignblk)
+            # A clean engine for the single-instruction diff, otherwise
+            # it concatenates the current instruction to the previous ones in
+            # the block.
+            _engine = SymbolicExecutionEngine(ctx.lifter)
+            modified = _engine.eval_assignblk(assignblk)
             symb_trace.append((assignblk.instr.offset, modified))
 
             # Run a single instruction
diff --git a/tools/qemu_tool.py b/tools/qemu_tool.py
index c7730fd..d670692 100644
--- a/tools/qemu_tool.py
+++ b/tools/qemu_tool.py
@@ -3,134 +3,123 @@
     gdb -n --batch -x qemu_tool.py
 """
 
-import argparse
-import re
-import shlex
-import subprocess
-from typing import TextIO
+import gdb
+import platform
 
 import focaccia.parser as parser
-from focaccia.arch import x86
-from focaccia.lldb_target import MemoryMap
+from focaccia.arch import supported_architectures, Arch
+from focaccia.compare import compare_symbolic, ErrorTypes
 from focaccia.snapshot import ProgramState
-
-def parse_memory_maps(stream: TextIO) -> tuple[list[MemoryMap], str]:
-    """
-    :return: Returns the list of parsed memory mappings as well as the first
-             line in the stream that does not belong to the memory mapping
-             information, i.e. the line that terminates the block of mapping
-             information.
-             The line is returned for the technical reason that the parser
-             needs to read a line from the stream in order to determine that
-             this line does no longer belong to the mapping information; but it
-             might still contain other important information.
-    """
-    mappings = []
-    while True:
-        line = stream.readline()
-        split = line.split(' ')
-        if len(split) != 3 or not re.match('^[0-9a-f]+-[0-9a-f]+$', split[0]):
-            return mappings, line
-
-        addr_range, size, perms = split
-        start, end = addr_range.split('-')
-        start, end = int(start, 16), int(end, 16)
-        mappings.append(MemoryMap(start, end, '[unnamed]', perms))
-
-def copy_memory(proc, state: ProgramState, maps: list[MemoryMap]):
-    """Copy memory from a GDB process to a ProgramState object.
-
-    Problem: Reading large mappings via GDB takes way too long (~500ms for ~8MB).
-    """
-    for mapping in maps:
-        # Only copy read- and writeable memory from the process. This is a
-        # heuristic to try to copy only heap and stack.
-        if 'rw' not in mapping.perms:
-            continue
-
-        map_size = mapping.end_address - mapping.start_address
-        mem = proc.read_memory(mapping.start_address, map_size)
-        assert(mem.contiguous)
-        assert(mem.nbytes == len(mem.tobytes()))
-        assert(mem.nbytes == map_size)
-        state.write_memory(mapping.start_address, mem.tobytes())
-
-def run_gdb(qemu_log: TextIO, qemu_port: int) -> list[ProgramState]:
-    import gdb
-
-    gdb.execute('set pagination 0')
-    gdb.execute('set sysroot')
-    gdb.execute(f'target remote localhost:{qemu_port}')
-    process = gdb.selected_inferior()
-
-    arch = x86.ArchX86()
-    mappings: list[MemoryMap] = []
-    states: list[ProgramState] = []
-
-    while process.is_valid() and len(process.threads()) > 0:
-        for line in qemu_log:
-            if re.match('^start +end +size +prot$', line):
-                mappings, line = parse_memory_maps(qemu_log)
-
-            if line.startswith('Trace'):
-                states.append(ProgramState(arch))
-                copy_memory(process, states[-1], mappings)
-                continue
-
-            if states:
-                parser._parse_qemu_line(line, states[-1])
-
-        gdb.execute('si', to_string=True)
+from focaccia.symbolic import SymbolicTransform, eval_symbol
+from focaccia.utils import print_result
+
+from verify_qemu import make_argparser
+
+class GDBProgramState:
+    def __init__(self, process: gdb.Inferior, frame: gdb.Frame):
+        self._proc = process
+        self._frame = frame
+
+    def read_register(self, regname: str) -> int | None:
+        try:
+            return int(self._frame.read_register(regname.lower()))
+        except ValueError as err:
+            from focaccia.arch import x86
+            rflags = int(self._frame.read_register('eflags'))
+            rflags = x86.decompose_rflags(rflags)
+            if regname in rflags:
+                return rflags[regname]
+
+            print(f'{regname}: {err}')
+            return None
+
+    def read_memory(self, addr: int, size: int) -> bytes | None:
+        try:
+            return self._proc.read_memory(addr, size).tobytes()
+        except gdb.MemoryError as err:
+            print(f'@{size}[{hex(addr)}]: {err}')
+            return None
+
+class GDBServerStateIterator:
+    def __init__(self, address: str, port: int):
+        gdb.execute('set pagination 0')
+        gdb.execute('set sysroot')
+        gdb.execute(f'target remote {address}:{port}')
+        self._process = gdb.selected_inferior()
+        self._first_next = True
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        # The first call to __next__ should yield the first program state,
+        # i.e. before stepping the first time
+        if self._first_next:
+            self._first_next = False
+            return GDBProgramState(self._process, gdb.selected_frame())
+
+        # Step
+        pc = gdb.selected_frame().read_register('pc')
+        new_pc = pc
+        while pc == new_pc:
+            gdb.execute('si', to_string=True)
+            if not self._process.is_valid() or len(self._process.threads()) == 0:
+                raise StopIteration
+            new_pc = gdb.selected_frame().read_register('pc')
+
+        return GDBProgramState(self._process, gdb.selected_frame())
+
+def collect_conc_trace(arch: Arch, \
+                       gdb: GDBServerStateIterator, \
+                       strace: list[SymbolicTransform]) \
+        -> list[ProgramState]:
+    states = []
+    for qemu, transform in zip(gdb, strace):
+        qemu_pc = qemu.read_register('pc')
+        assert(qemu_pc is not None)
+
+        if qemu_pc != transform.addr:
+            print(f'Fatal error: QEMU\'s program counter'
+                  f' ({hex(qemu_pc)}) does not match the'
+                  f' expected program counter in the symbolic trace'
+                  f' ({hex(transform.addr)}).')
+            print(f'Processing only partial trace up to this instruction.')
+            return states
+
+        state = ProgramState(arch)
+        state.set_register('PC', transform.addr)
+
+        accessed_regs = transform.get_used_registers()
+        accessed_mems = transform.get_used_memory_addresses()
+        for regname in accessed_regs:
+            regval = qemu.read_register(regname)
+            if regval is not None:
+                state.set_register(regname, regval)
+        for mem in accessed_mems:
+            assert(mem.size % 8 == 0)
+            addr = eval_symbol(mem.ptr, qemu)
+            mem = qemu.read_memory(addr, int(mem.size / 8))
+            if mem is not None:
+                state.write_memory(addr, mem)
+        states.append(state)
 
     return states
 
-def make_argparser():
-    prog = argparse.ArgumentParser()
-    prog.add_argument('binary',
-                      type=str,
-                      help='The binary to run and record.')
-    prog.add_argument('--binary-args',
-                      type=str,
-                      help='A string of arguments to be passed to the binary.')
-    prog.add_argument('--output', '-o', help='Name of output file.')
-    prog.add_argument('--gdbserver-port',  type=int, default=12421)
-    prog.add_argument('--qemu',            type=str, default='qemu-x86_64',
-                      help='QEMU binary to invoke. [Default: qemu-x86_64')
-    prog.add_argument('--qemu-log',        type=str, default='qemu.log')
-    prog.add_argument('--qemu-extra-args', type=str, default='',
-                      help='Arguments passed to QEMU in addition to the'
-                           ' default ones required by this script.')
-    return prog
-
-if __name__ == "__main__":
+def main():
     args = make_argparser().parse_args()
 
-    binary = args.binary
-    binary_args = shlex.split(args.binary_args) if args.binary_args else ''
-
-    qemu_bin = args.qemu
     gdbserver_port = args.gdbserver_port
-    qemu_log_name = args.qemu_log
-    qemu_args = [
-        qemu_bin,
-        '--trace', 'target_mmap*',
-        '--trace', 'memory_notdirty_*',
-        # We write QEMU's output to a log file, then read it from that file.
-        # This is preferred over reading from the process's stdout pipe because
-        # we require a non-blocking solution that returns when all available
-        # lines have been read.
-        '-D', qemu_log_name,
-        '-d', 'cpu,fpu,exec,unimp,page,strace',
-        '-g', str(gdbserver_port),
-        *shlex.split(args.qemu_extra_args),
-        binary,
-        *binary_args,
-    ]
-
-    qemu = subprocess.Popen(qemu_args)
-
-    with open(qemu_log_name, 'r') as qemu_log:
-        snapshots = run_gdb(qemu_log, gdbserver_port)
-
-    with open(args.output, 'w') as file:
-        parser.serialize_snapshots(snapshots, file)
+    with open(args.symb_trace, 'r') as strace:
+        symb_transforms = parser.parse_transformations(strace)
+
+    arch = supported_architectures[platform.machine()]
+    conc_states = collect_conc_trace(
+        arch,
+        GDBServerStateIterator('localhost', gdbserver_port),
+        symb_transforms)
+
+    res = compare_symbolic(conc_states, symb_transforms)
+    print_result(res, ErrorTypes.POSSIBLE)
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/invoke_qemu_tool.py b/tools/verify_qemu.py
index 152c208..98437cb 100644
--- a/tools/invoke_qemu_tool.py
+++ b/tools/verify_qemu.py
@@ -1,39 +1,32 @@
 """
-This mechanism exists to retrieve per-instruction program snapshots from QEMU,
-specifically including memory dumps. This is surprisingly nontrivial (we don't
-have a log option like `-d memory`), and the mechanism we have implemented to
-achieve this is accordingly complicated.
-
-In short: We use QEMU's feature to interact with the emulation via a GDB server
-interface together with parsing QEMU's logs to record register and memory state
-at single-instruction intervals.
-
-We need QEMU's log in addition to the GDB server because QEMU's GDB server does
-not support querying memory mapping information. We need this information to
-know from where we need to read memory, so we parse memory mappings from the
-log (option `-d page`).
+Spawn GDB, connect to QEMU's GDB server, and read test states from that.
 
 We need two scripts (this one and the primary `qemu_tool.py`) because we can't
 pass arguments to scripts executed via `gdb -x <script>`.
 
-This script (`invoke_qemu_tool.py`) is the one the user interfaces with. It
+This script (`verify_qemu.py`) is the one the user interfaces with. It
 eventually calls `execv` to spawn a GDB process that calls the main
-`qemu_tool.py` script; `python invoke_qemu_tool.py` essentially behaves as if
+`qemu_tool.py` script; `python verify_qemu.py` essentially behaves as if
 something like `gdb --batch -x qemu_tool.py` were executed instead. Before it
 starts GDB, though, it parses command line arguments and applies some weird but
 necessary logic to pass them to `qemu_tool.py`.
-
-The main script `qemu_tool.py`, which runs inside of GDB, finally forks a QEMU
-instance that provides a GDB server and writes its logs to a file. It then
-connects GDB to that server and incrementally reads the QEMU logs while
-stepping through the program. Doing that, it generates program snapshots at
-each instruction.
 """
 
+import argparse
 import os
+import subprocess
 import sys
 
-from qemu_tool import make_argparser
+def make_argparser():
+    """This is also used by the GDB-invoked script to parse its args."""
+    prog = argparse.ArgumentParser()
+    prog.add_argument('--symb-trace',
+                      required=True,
+                      help='A symbolic transformation trace to be used for' \
+                           ' verification.')
+    prog.add_argument('--output', '-o', help='Name of output file.')
+    prog.add_argument('gdbserver_port', type=int)
+    return prog
 
 def quoted(s: str) -> str:
     return f'"{s}"'
@@ -67,12 +60,18 @@ if __name__ == "__main__":
     # mechanism to pass arguments to a script that it executes, so we
     # overwrite `sys.argv` manually before invoking the script.
     argv_str = f'[{", ".join(quoted(a) for a in argv)}]'
+    path_str = f'[{", ".join(quoted(s) for s in sys.path)}]'
 
-    os.execv(args.gdb, [
+    gdb_cmd = [
         args.gdb,
         '-nx',  # Don't parse any .gdbinits
         '--batch-silent' if args.quiet else '--batch',
         '-ex', f'py import sys',
         '-ex', f'py sys.argv = {argv_str}',
+        '-ex', f'py sys.path = {path_str}',
         '-x', qemu_tool_path
-    ])
+    ]
+    proc = subprocess.Popen(gdb_cmd)
+
+    ret = proc.wait()
+    exit(ret)
author	Theofilos Augoustis <theofilos.augoustis@gmail.com>	2024-01-22 20:02:03 +0100
committer	Theofilos Augoustis <theofilos.augoustis@gmail.com>	2024-01-22 20:02:03 +0100
commit	c8bb7528b8dc053b45d8c652663479f327273ef8 (patch)
tree	06a1488549e2206784edd34e227a463ebf30b165
parent	605e12fc5cf0fb64e45f68378390a09aa28df2f9 (diff)
download	focaccia-c8bb7528b8dc053b45d8c652663479f327273ef8.tar.gz focaccia-c8bb7528b8dc053b45d8c652663479f327273ef8.zip