3 files changed, 261 insertions, 0 deletions
diff --git a/tools/convert.py b/tools/convert.py
new file mode 100644
index 0000000..27a8a4a
--- /dev/null
+++ b/tools/convert.py
@@ -0,0 +1,47 @@
+import argparse
+import sys
+
+import focaccia.parser as parser
+from focaccia.arch import supported_architectures
+
+# Maps every value accepted by the `--type` option to the parser function
+# that reads a log of that type. Each function is called with the opened
+# input file and the selected architecture.
+convert_funcs = {
+    'qemu':     parser.parse_qemu,
+    'arancini': parser.parse_arancini,
+}
+
+def main():
+    """Convert a log written by another tool to focaccia's log format.
+
+    Parses the command line, reads the input log with the parser selected
+    via `--type`, and writes the resulting snapshots either to the file
+    given with `--output` or, if that option is absent, to stdout.
+
+    Exits the process with status 1 if the input log cannot be parsed.
+    """
+    prog = argparse.ArgumentParser()
+    prog.description = 'Convert other programs\' logs to focaccia\'s log format.'
+    prog.add_argument('file', help='The log to convert.')
+    prog.add_argument('--type',
+                      required=True,
+                      choices=convert_funcs.keys(),
+                      help='The log type of `file`')
+    prog.add_argument('--output', '-o',
+                      help='Output file (default is stdout)')
+    prog.add_argument('--arch',
+                      default='x86_64',
+                      choices=supported_architectures.keys(),
+                      help='Processor architecture of input log'
+                           ' (default is x86_64)')
+    args = prog.parse_args()
+
+    # Parse the input log with the parser that matches its type
+    arch = supported_architectures[args.arch]
+    parse_log = convert_funcs[args.type]
+    with open(args.file, 'r') as in_file:
+        try:
+            snapshots = parse_log(in_file, arch)
+        except parser.ParseError as err:
+            print(f'Parse error: {err}. Exiting.', file=sys.stderr)
+            # Use `sys.exit`; the bare `exit` helper is injected by the
+            # `site` module and is not guaranteed to exist.
+            sys.exit(1)
+
+    # Write log in focaccia's format
+    if args.output:
+        with open(args.output, 'w') as out_file:
+            parser.serialize_snapshots(snapshots, out_file)
+    else:
+        parser.serialize_snapshots(snapshots, sys.stdout)
+
+# Run the converter only when this file is executed as a script, not when it
+# is imported as a module.
+if __name__ == '__main__':
+    main()
diff --git a/tools/invoke_qemu_tool.py b/tools/invoke_qemu_tool.py
new file mode 100644
index 0000000..152c208
--- /dev/null
+++ b/tools/invoke_qemu_tool.py
@@ -0,0 +1,78 @@
+"""
+This mechanism exists to retrieve per-instruction program snapshots from QEMU,
+specifically including memory dumps. This is surprisingly nontrivial (we don't
+have a log option like `-d memory`), and the mechanism we have implemented to
+achieve this is accordingly complicated.
+
+In short: We use QEMU's feature to interact with the emulation via a GDB server
+interface together with parsing QEMU's logs to record register and memory state
+at single-instruction intervals.
+
+We need QEMU's log in addition to the GDB server because QEMU's GDB server does
+not support querying memory mapping information. We need this information to
+know from where we need to read memory, so we parse memory mappings from the
+log (option `-d page`).
+
+We need two scripts (this one and the primary `qemu_tool.py`) because we can't
+pass arguments to scripts executed via `gdb -x <script>`.
+
+This script (`invoke_qemu_tool.py`) is the one the user interfaces with. It
+eventually calls `execv` to spawn a GDB process that calls the main
+`qemu_tool.py` script; `python invoke_qemu_tool.py` essentially behaves as if
+something like `gdb --batch -x qemu_tool.py` were executed instead. Before it
+starts GDB, though, it parses command line arguments and applies some weird but
+necessary logic to pass them to `qemu_tool.py`.
+
+The main script `qemu_tool.py`, which runs inside of GDB, finally forks a QEMU
+instance that provides a GDB server and writes its logs to a file. It then
+connects GDB to that server and incrementally reads the QEMU logs while
+stepping through the program. Doing that, it generates program snapshots at
+each instruction.
+"""
+
+import os
+import sys
+
+from qemu_tool import make_argparser
+
+def quoted(s: str) -> str:
+    """Return `s` as a double-quoted Python string literal.
+
+    The result is spliced into Python source code that GDB executes, so
+    characters that would terminate or corrupt the literal (double quotes,
+    backslashes, newlines and other control characters) are escaped.
+    Strings without such characters are simply wrapped in double quotes.
+
+    :param s: The string to quote.
+    :return:  Source text of a string literal that evaluates to `s`.
+    """
+    # Imported locally so that this helper does not depend on the module's
+    # import block.
+    import json
+
+    # JSON string escapes (`\"`, `\\`, `\n`, `\uXXXX`, ...) have the same
+    # meaning in Python string literals. `ensure_ascii=False` keeps
+    # non-ASCII characters verbatim instead of emitting surrogate-pair
+    # escapes, which Python would not recombine.
+    return json.dumps(s, ensure_ascii=False)
+
+def try_remove(l: list, v):
+    """Remove the first occurrence of `v` from `l`, if there is one.
+
+    Unlike `list.remove`, a missing value is not an error; the list is then
+    left untouched.
+
+    :param l: The list to modify in place.
+    :param v: The value to remove.
+    """
+    if v in l:
+        l.remove(v)
+
+if __name__ == "__main__":
+    # Parse (and thereby validate) the full command line: the arguments of
+    # `qemu_tool.py` plus the options that only this wrapper understands.
+    prog = make_argparser()
+    prog.add_argument('--gdb', default='/bin/gdb',
+                      help='GDB binary to invoke')
+    prog.add_argument('--quiet', '-q', action='store_true',
+                      help='Suppress all output')
+    args = prog.parse_args()
+
+    tool_dir = os.path.dirname(os.path.realpath(__file__))
+    qemu_tool_path = os.path.join(tool_dir, 'qemu_tool.py')
+
+    # Build the argument vector for the qemu tool by dropping the
+    # wrapper-only options. They are matched by position rather than by
+    # value, so an unrelated argument that happens to equal the GDB path
+    # (e.g. when the recorded binary is GDB itself) is left alone, and both
+    # `--gdb PATH` and `--gdb=PATH` are handled. `sys.argv` is not modified.
+    argv = []
+    skip_value = False
+    for arg in sys.argv:
+        if skip_value:
+            # This is the PATH operand of a preceding `--gdb`
+            skip_value = False
+        elif arg == '--gdb':
+            skip_value = True
+        elif arg.startswith('--gdb=') or arg in ('--quiet', '-q'):
+            pass
+        else:
+            argv.append(arg)
+
+    # Assemble the argv array passed to the qemu tool. GDB does not have a
+    # mechanism to pass arguments to a script that it executes, so we
+    # overwrite `sys.argv` manually before invoking the script.
+    argv_str = f'[{", ".join(quoted(a) for a in argv)}]'
+
+    # Replace this process with GDB; this call does not return on success.
+    os.execv(args.gdb, [
+        args.gdb,
+        '-nx',  # Don't parse any .gdbinits
+        '--batch-silent' if args.quiet else '--batch',
+        '-ex', 'py import sys',
+        '-ex', f'py sys.argv = {argv_str}',
+        '-x', qemu_tool_path
+    ])
diff --git a/tools/qemu_tool.py b/tools/qemu_tool.py
new file mode 100644
index 0000000..d5f78af
--- /dev/null
+++ b/tools/qemu_tool.py
@@ -0,0 +1,136 @@
+"""Invocable like this:
+
+    gdb -n --batch -x qemu_tool.py
+"""
+
+import argparse
+import re
+import shlex
+import subprocess
+from typing import TextIO
+
+import parser
+from arch import x86
+from lldb_target import MemoryMap
+from snapshot import ProgramState
+
+def parse_memory_maps(stream: TextIO) -> tuple[list[MemoryMap], str]:
+    """Parse a block of memory mapping lines from a stream.
+
+    Consumes consecutive lines of the form `<start>-<end> <size> <perms>`,
+    where both addresses are lowercase hexadecimal numbers, and stops at the
+    first line that does not have this form (this includes the empty string
+    that `readline` returns at the end of the stream).
+
+    :param stream: Text stream positioned at the first mapping line.
+    :return: Returns the list of parsed memory mappings as well as the first
+             line in the stream that does not belong to the memory mapping
+             information, i.e. the line that terminates the block of mapping
+             information.
+             The line is returned for the technical reason that the parser
+             needs to read a line from the stream in order to determine that
+             this line no longer belongs to the mapping information; but it
+             might still contain other important information.
+    """
+    mappings = []
+    while True:
+        line = stream.readline()
+        # Split on arbitrary whitespace: this tolerates column-aligned output
+        # and keeps the trailing newline out of the permission string.
+        fields = line.split()
+        if len(fields) != 3 or not re.match(r'^[0-9a-f]+-[0-9a-f]+$', fields[0]):
+            return mappings, line
+
+        # The size column is redundant with the address range and is ignored.
+        addr_range, _size, perms = fields
+        start, end = (int(addr, 16) for addr in addr_range.split('-'))
+        mappings.append(MemoryMap(start, end, '[unnamed]', perms))
+
+def copy_memory(proc, state: ProgramState, maps: list[MemoryMap]):
+    """Copy memory from a GDB process to a ProgramState object.
+
+    Problem: Reading large mappings via GDB takes way too long (~500ms for ~8MB).
+
+    :param proc:  Object whose `read_memory(address, length)` method returns
+                  a memoryview-like buffer; the caller in this file passes
+                  the inferior selected in GDB.
+    :param state: The program state that receives the memory contents via
+                  `write_memory(address, data)`.
+    :param maps:  The memory mappings to consider for copying.
+    """
+    for mapping in maps:
+        # Only copy read- and writeable memory from the process. This is a
+        # heuristic to try to copy only heap and stack.
+        if 'rw' not in mapping.perms:
+            continue
+
+        map_size = mapping.end_address - mapping.start_address
+        mem = proc.read_memory(mapping.start_address, map_size)
+        assert mem.contiguous
+        # Materialize the buffer once; every `tobytes()` call copies the
+        # whole mapping, which is expensive for large regions.
+        data = mem.tobytes()
+        # Sanity checks: we must have received exactly the requested range.
+        assert mem.nbytes == len(data)
+        assert mem.nbytes == map_size
+        state.write_memory(mapping.start_address, data)
+
+def run_gdb(qemu_log: TextIO, qemu_port: int) -> list[ProgramState]:
+    """Single-step a program behind a GDB server and record its states.
+
+    Connects GDB to `localhost:<qemu_port>` and then alternates between
+    consuming all lines currently readable from `qemu_log` and executing one
+    `si` command, for as long as the inferior is valid and has threads.
+
+    :param qemu_log:  Open text stream of the QEMU log.
+    :param qemu_port: Port on localhost to which GDB connects.
+    :return: One program state per log line that starts with 'Trace', in the
+             order in which these lines were read.
+    """
+    # NOTE(review): presumably imported here rather than at module level
+    # because the `gdb` module only exists inside GDB's Python interpreter,
+    # while `invoke_qemu_tool.py` imports this file from a regular
+    # interpreter — confirm.
+    import gdb
+
+    gdb.execute('set pagination 0')
+    # NOTE(review): presumably clears the sysroot so that GDB does not try
+    # to load files through the remote connection — confirm.
+    gdb.execute('set sysroot')
+    gdb.execute(f'target remote localhost:{qemu_port}')
+    process = gdb.selected_inferior()
+
+    # The architecture is fixed to x86 here.
+    arch = x86.ArchX86()
+    # Most recently parsed memory mappings; replaced whenever the log
+    # contains a new mapping table.
+    mappings: list[MemoryMap] = []
+    states: list[ProgramState] = []
+
+    while process.is_valid() and len(process.threads()) > 0:
+        # Consume every log line that is readable at this point.
+        for line in qemu_log:
+            # Header of a mapping table: parse the table. The parser also
+            # hands back the line that terminated the table, which is then
+            # processed below like any other line.
+            if re.match('^start +end +size +prot$', line):
+                mappings, line = parse_memory_maps(qemu_log)
+
+            # A 'Trace' line opens a new snapshot; its memory is copied
+            # from the process right away.
+            if line.startswith('Trace'):
+                states.append(ProgramState(arch))
+                copy_memory(process, states[-1], mappings)
+                continue
+
+            # Every other line is handed to the QEMU line parser together
+            # with the most recent snapshot. Lines read before the first
+            # 'Trace' line are dropped.
+            if states:
+                parser._parse_qemu_line(line, states[-1])
+
+        # Execute GDB's `si` command; its output is captured in a string
+        # (and discarded) instead of being printed.
+        gdb.execute('si', to_string=True)
+
+    return states
+
+def make_argparser():
+    """Create the command line parser of the QEMU tool.
+
+    The parser is built by a function so that other scripts can reuse it
+    and extend it with options of their own.
+
+    :return: An `argparse.ArgumentParser` that accepts the binary to record
+             and the options that configure QEMU and the output.
+    """
+    prog = argparse.ArgumentParser()
+    prog.add_argument('binary',
+                      type=str,
+                      help='The binary to run and record.')
+    prog.add_argument('--binary-args',
+                      type=str,
+                      help='A string of arguments to be passed to the binary.')
+    prog.add_argument('--output', '-o', help='Name of output file.')
+    prog.add_argument('--gdbserver-port',  type=int, default=12421)
+    prog.add_argument('--qemu',            type=str, default='qemu-x86_64',
+                      help='QEMU binary to invoke. [Default: qemu-x86_64]')
+    prog.add_argument('--qemu-log',        type=str, default='qemu.log')
+    prog.add_argument('--qemu-extra-args', type=str, default='',
+                      help='Arguments passed to QEMU in addition to the'
+                           ' default ones required by this script.')
+    return prog
+
+if __name__ == "__main__":
+    # Only this script entry point needs these modules, so they are
+    # imported locally.
+    import os
+    import time
+
+    argparser = make_argparser()
+    args = argparser.parse_args()
+    # Fail before the (slow) recording starts rather than when the result
+    # is about to be written.
+    if args.output is None:
+        argparser.error('the following arguments are required: --output/-o')
+
+    binary = args.binary
+    binary_args = shlex.split(args.binary_args) if args.binary_args else []
+
+    qemu_bin = args.qemu
+    gdbserver_port = args.gdbserver_port
+    qemu_log_name = args.qemu_log
+    qemu_args = [
+        qemu_bin,
+        '--trace', 'target_mmap*',
+        '--trace', 'memory_notdirty_*',
+        # We write QEMU's output to a log file, then read it from that file.
+        # This is preferred over reading from the process's stdout pipe because
+        # we require a non-blocking solution that returns when all available
+        # lines have been read.
+        '-D', qemu_log_name,
+        '-d', 'cpu,fpu,exec,unimp,page,strace',
+        '-g', str(gdbserver_port),
+        *shlex.split(args.qemu_extra_args),
+        binary,
+        *binary_args,
+    ]
+
+    # Remove a log left over from a previous run so that stale contents are
+    # never mistaken for output of the QEMU instance started below.
+    try:
+        os.remove(qemu_log_name)
+    except FileNotFoundError:
+        pass
+
+    qemu = subprocess.Popen(qemu_args)
+    try:
+        # QEMU creates its log file asynchronously. Wait for it to appear
+        # instead of racing against QEMU's startup.
+        deadline = time.monotonic() + 10.0
+        while not os.path.exists(qemu_log_name):
+            if qemu.poll() is not None:
+                raise RuntimeError(
+                    f'QEMU exited prematurely with code {qemu.returncode}.')
+            if time.monotonic() > deadline:
+                raise TimeoutError(
+                    f'QEMU did not create its log file {qemu_log_name}.')
+            time.sleep(0.01)
+
+        with open(qemu_log_name, 'r') as qemu_log:
+            snapshots = run_gdb(qemu_log, gdbserver_port)
+    finally:
+        # Never leave the QEMU child behind, whether recording succeeded
+        # or failed.
+        if qemu.poll() is None:
+            qemu.terminate()
+        qemu.wait()
+
+    with open(args.output, 'w') as file:
+        parser.serialize_snapshots(snapshots, file)