Refactor project structure

Read concrete state on demand during concolic exec During concolic tracing, don't record full program snapshots at each basic block, but instead read concrete values directly from the concrete target when they are needed.
author: Theofilos Augoustis <theofilos.augoustis@gmail.com> 2023-12-31 18:29:31 +0100
committer: Theofilos Augoustis <theofilos.augoustis@gmail.com> 2023-12-31 18:29:31 +0100
commit: eae0b3b08bd078ad2f621ce2ef201e656da3f16a (patch)
tree: eb93252f39543c46146297264ff548d9925178e0
parent: d26ae0a7d583da5034cd6271f953b6253119ceae (diff)
download: focaccia-eae0b3b08bd078ad2f621ce2ef201e656da3f16a.tar.gz
focaccia-eae0b3b08bd078ad2f621ce2ef201e656da3f16a.zip
17 files changed, 102 insertions, 121 deletions
diff --git a/README.md b/README.md
index 4200b8c..82918b0 100644
--- a/README.md
+++ b/README.md
@@ -38,33 +38,31 @@ The `tools/` directory contains additional utility scripts to work with focaccia
 
 The following files belong to a rough framework for the snapshot comparison engine:
 
- - `main.py`: Entry point to the tool. Handling of command line arguments, pre-processing of input logs, etc.
+ - `focaccia/snapshot.py`: Structures used to work with snapshots. The `ProgramState` class is our primary
+representation of program snapshots.
 
- - `snapshot.py`: Structures used to work with snapshots. The `ProgramState` class is our primary representation of
-program snapshots.
+ - `focaccia/compare.py`: The central algorithms that work on snapshots.
 
- - `compare.py`: The central algorithms that work on snapshots.
-
- - `parser.py`: Utilities for parsing logs from Arancini and QEMU, as well as serializing/deserializing to/from our own
-log format.
-
- - `arch/`: Abstractions over different processor architectures. Will be used to integrate support for more
+ - `focaccia/arch/`: Abstractions over different processor architectures. Will be used to integrate support for more
 architectures later. Currently, we only have X86.
 
-## Concolic execution
+### Concolic execution
 
 The following files belong to a prototype of a data-dependency generator based on symbolic
 execution:
 
- - `symbolic.py`: Algorithms and data structures to compute and manipulate symbolic program transformations. This
-handles the symbolic part of "concolic" execution.
+ - `focaccia/symbolic.py`: Algorithms and data structures to compute and manipulate symbolic program transformations.
+This handles the symbolic part of "concolic" execution.
 
- - `lldb_target.py`: Tools for executing a program concretely and tracking its execution using
+ - `focaccia/lldb_target.py`: Tools for executing a program concretely and tracking its execution using
 [LLDB](https://lldb.llvm.org/). This handles the concrete part of "concolic" execution.
 
- - `miasm_util.py`: Tools to evaluate Miasm's symbolic expressions based on a concrete state. Ties the symbolic and
-concrete parts together into "concolic" execution.
+ - `focaccia/miasm_util.py`: Tools to evaluate Miasm's symbolic expressions based on a concrete state. Ties the symbolic
+and concrete parts together into "concolic" execution.
+
+### Helpers
 
-## Helpers
+ - `focaccia/parser.py`: Utilities for parsing logs from Arancini and QEMU, as well as serializing/deserializing to/from
+our own log format.
 
  - `miasm_test.py`: A test script that traces a program concolically.
diff --git a/main.py b/focaccia.py
index 3167bbd..e140337 100755
--- a/main.py
+++ b/focaccia.py
@@ -4,14 +4,14 @@ import argparse
 import platform
 from typing import Iterable
 
-from arch import x86
-from compare import compare_simple, compare_symbolic, \
-                    ErrorSeverity, ErrorTypes
-from lldb_target import LLDBConcreteTarget
-from parser import parse_arancini
-from snapshot import ProgramState
-from symbolic import SymbolicTransform, collect_symbolic_trace
-from utils import check_version, print_separator
+from focaccia.arch import x86
+from focaccia.compare import compare_simple, compare_symbolic, \
+                             ErrorSeverity, ErrorTypes
+from focaccia.lldb_target import LLDBConcreteTarget
+from focaccia.parser import parse_arancini
+from focaccia.snapshot import ProgramState
+from focaccia.symbolic import SymbolicTransform, collect_symbolic_trace
+from focaccia.utils import print_separator
 
 def run_native_execution(oracle_program: str, breakpoints: Iterable[int]):
     """Gather snapshots from a native execution via an external debugger.
@@ -175,7 +175,7 @@ def main():
         assert(program is not None)
 
         print(f'Tracing {program} symbolically with arguments {prog_args}...')
-        transforms = collect_symbolic_trace(program, [program, *prog_args])
+        transforms = collect_symbolic_trace(program, prog_args)
         txl, transforms = match_traces(txl, transforms)
         result = compare_symbolic(txl, transforms)
     else:
@@ -183,6 +183,5 @@ def main():
 
     print_result(result, verbosity[args.error_level])
 
-if __name__ == "__main__":
-    check_version('3.7')
+if __name__ == '__main__':
     main()
diff --git a/focaccia/__init__.py b/focaccia/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/focaccia/__init__.py
diff --git a/arch/__init__.py b/focaccia/arch/__init__.py
index 2926d20..2926d20 100644
--- a/arch/__init__.py
+++ b/focaccia/arch/__init__.py
diff --git a/arch/arch.py b/focaccia/arch/arch.py
index f2be5cb..f2be5cb 100644
--- a/arch/arch.py
+++ b/focaccia/arch/arch.py
diff --git a/arch/x86.py b/focaccia/arch/x86.py
index 95e1a82..95e1a82 100644
--- a/arch/x86.py
+++ b/focaccia/arch/x86.py
diff --git a/compare.py b/focaccia/compare.py
index dfa5dbd..066127d 100644
--- a/compare.py
+++ b/focaccia/compare.py
@@ -1,8 +1,8 @@
 from functools import total_ordering
 from typing import Self
 
-from snapshot import ProgramState, MemoryAccessError
-from symbolic import SymbolicTransform
+from .snapshot import ProgramState, MemoryAccessError
+from .symbolic import SymbolicTransform
 
 @total_ordering
 class ErrorSeverity:
diff --git a/lldb_target.py b/focaccia/lldb_target.py
index b96f66b..146891f 100644
--- a/lldb_target.py
+++ b/focaccia/lldb_target.py
@@ -1,7 +1,7 @@
 import lldb
 
-from arch import supported_architectures, x86
-from snapshot import ProgramState
+from .arch import supported_architectures, x86
+from .snapshot import ProgramState
 
 class MemoryMap:
     """Description of a range of mapped memory.
@@ -30,9 +30,6 @@ class LLDBConcreteTarget:
     def __init__(self, executable: str, argv: list[str] = []):
         """Construct an LLDB concrete target. Stop at entry.
 
-        :param argv: The full argv array, including the executable's path as
-                     the first argument (as is convention).
-
         :raises RuntimeError: If the process is unable to launch.
         """
         self.debugger = lldb.SBDebugger.Create()
@@ -46,8 +43,10 @@ class LLDBConcreteTarget:
         self.error = lldb.SBError()
         self.listener = self.debugger.GetListener()
         self.process = self.target.Launch(self.listener,
-                                          argv, None, None,
-                                          None, None, None, 0,
+                                          argv, None,        # argv, envp
+                                          None, None, None,  # stdin, stdout, stderr
+                                          None,              # working directory
+                                          0,
                                           True, self.error)
         if not self.process.IsValid():
             raise RuntimeError(f'[In LLDBConcreteTarget.__init__]: Failed to'
diff --git a/miasm_util.py b/focaccia/miasm_util.py
index 0d3ab3d..d9a9936 100644
--- a/miasm_util.py
+++ b/focaccia/miasm_util.py
@@ -6,7 +6,7 @@ from miasm.expression.expression import Expr, ExprOp, ExprId, ExprLoc, \
                                         ExprSlice, ExprCond
 from miasm.expression.simplifications import expr_simp_explicit
 
-from snapshot import ProgramState
+from .snapshot import ProgramState
 
 def simp_segm(expr_simp, expr: ExprOp):
     """Simplify a segmentation expression to an addition of the segment
diff --git a/parser.py b/focaccia/parser.py
index 391d58a..8680eed 100644
--- a/parser.py
+++ b/focaccia/parser.py
@@ -5,8 +5,8 @@ import json
 import re
 from typing import TextIO
 
-from arch import supported_architectures, Arch
-from snapshot import ProgramState
+from .arch import supported_architectures, Arch
+from .snapshot import ProgramState
 
 class ParseError(Exception):
     """A parse error."""
@@ -139,18 +139,3 @@ def parse_arancini(stream: TextIO, arch: Arch) -> list[ProgramState]:
                 states[-1].set(regname, int(value, 16))
 
     return states
-
-if __name__ == "__main__":
-    from arch import x86
-    with open('qemu.log', 'r') as file:
-        states = parse_qemu(file, x86.ArchX86())
-        print(f'Parsed {len(states)} states from QEMU log.')
-    with open('dump.qemu', 'w') as file:
-        serialize_snapshots(states, file)
-
-    with open('emulator-log.txt', 'r') as file:
-        states = parse_arancini(file, x86.ArchX86())
-        print(f'Parsed {len(states)} states from Arancini log.')
-    with open('dump.arancini', 'w') as file:
-        serialize_snapshots(states, file)
-    exit(0)
diff --git a/snapshot.py b/focaccia/snapshot.py
index 9c9e4b3..0f10dda 100644
--- a/snapshot.py
+++ b/focaccia/snapshot.py
@@ -1,4 +1,4 @@
-from arch.arch import Arch
+from .arch.arch import Arch
 
 class MemoryAccessError(Exception):
     def __init__(self, addr: int, size: int, msg: str):
@@ -38,8 +38,8 @@ class SparseMemory:
             page_addr, off = self._to_page_addr_and_offset(addr)
             if page_addr not in self._pages:
                 raise MemoryAccessError(addr, size,
-                                        f'Address {addr} is not contained in'
-                                        f' the sparse memory.')
+                                        f'Address {hex(addr)} is not contained'
+                                        f' in the sparse memory.')
             data = self._pages[page_addr]
             assert(len(data) == self.page_size)
             read_size = min(size, self.page_size - off)
diff --git a/symbolic.py b/focaccia/symbolic.py
index 6e70bc9..e132ebd 100644
--- a/symbolic.py
+++ b/focaccia/symbolic.py
@@ -11,10 +11,12 @@ from miasm.ir.symbexec import SymbolicExecutionEngine
 from miasm.ir.ir import IRBlock
 from miasm.expression.expression import Expr, ExprId, ExprMem, ExprInt
 
-from lldb_target import LLDBConcreteTarget
-from miasm_util import MiasmConcreteState, eval_expr
-from snapshot import ProgramState
-from arch import Arch, supported_architectures
+from .arch import Arch, supported_architectures
+from .lldb_target import LLDBConcreteTarget, \
+                         ConcreteRegisterError, \
+                         ConcreteMemoryError
+from .miasm_util import MiasmConcreteState, eval_expr
+from .snapshot import ProgramState
 
 class SymbolicTransform:
     def __init__(self, from_addr: int, to_addr: int):
@@ -75,7 +77,7 @@ class MiasmSymbolicTransform(SymbolicTransform):
         class MiasmSymbolicState(MiasmConcreteState):
             """Drop-in replacement for MiasmConcreteState in eval_expr that
             returns the current transform's symbolic equations instead of
-            symbolic values. Calling eval_expr with this effectively nests the
+            concrete values. Calling eval_expr with this effectively nests the
             transformation into the concatenated transformation.
 
             We inherit from `MiasmSymbolicTransform` only for the purpose of
@@ -275,14 +277,48 @@ def _run_block(pc: int, conc_state: MiasmConcreteState, ctx: DisassemblyContext)
             # instructions are translated to multiple IR instructions.
             pass
 
+class LLDBConcreteState:
+    """A back-end replacement for the `ProgramState` object from which
+    `MiasmConcreteState` reads its values. This reads values directly from an
+    LLDB target instead. This saves us the trouble of recording a full program
+    state, and allows us instead to read values from LLDB on demand.
+    """
+    def __init__(self, target: LLDBConcreteTarget, arch: Arch):
+        self._target = target
+        self._arch = arch
+
+    def read(self, reg: str) -> int | None:
+        from focaccia.arch import x86
+
+        regname = self._arch.to_regname(reg)
+        if regname is None:
+            return None
+
+        try:
+            return self._target.read_register(regname)
+        except ConcreteRegisterError:
+            # Special case for X86
+            if self._arch.archname == x86.archname:
+                rflags = x86.decompose_rflags(self._target.read_register('rflags'))
+                if regname in rflags:
+                    return rflags[regname]
+            return None
+
+    def read_memory(self, addr: int, size: int):
+        try:
+            return self._target.read_memory(addr, size)
+        except ConcreteMemoryError:
+            return None
+
 def collect_symbolic_trace(binary: str,
-                           argv: list[str],
+                           args: list[str],
                            start_addr: int | None = None
                            ) -> list[SymbolicTransform]:
     """Execute a program and compute state transformations between executed
     instructions.
 
     :param binary: The binary to trace.
+    :param args:   Arguments to the program.
     """
     ctx = DisassemblyContext(binary)
 
@@ -298,16 +334,16 @@ def collect_symbolic_trace(binary: str,
     else:
         pc = start_addr
 
-    target = LLDBConcreteTarget(binary, argv)
+    target = LLDBConcreteTarget(binary, args)
     if target.read_register('pc') != pc:
         target.set_breakpoint(pc)
         target.run()
         target.remove_breakpoint(pc)
+    conc_state = LLDBConcreteState(target, arch)
 
     symb_trace = [] # The resulting list of symbolic transforms per instruction
 
     # Run until no more states can be reached
-    initial_state = target.record_snapshot()
     while pc is not None:
         assert(target.read_register('pc') == pc)
 
@@ -317,9 +353,8 @@ def collect_symbolic_trace(binary: str,
         try:
             pc, strace = _run_block(
                 pc,
-                MiasmConcreteState(initial_state, ctx.loc_db),
-                ctx
-            )
+                MiasmConcreteState(conc_state, ctx.loc_db),
+                ctx)
         except DisassemblyError as err:
             # This happens if we encounter an instruction that is not
             # implemented by Miasm. Try to skip that instruction and continue
@@ -339,7 +374,6 @@ def collect_symbolic_trace(binary: str,
 
             symb_trace.append((err.faulty_pc, {}))  # Generate empty transform
             pc = target.read_register('pc')
-            initial_state = target.record_snapshot()
             continue
 
         if pc is None:
@@ -358,17 +392,14 @@ def collect_symbolic_trace(binary: str,
         symb_trace.extend(strace)
 
         # Use this for extensive trace debugging
-        if [a for a, _ in strace] != ctrace:
-            print(f'[WARNING] Symbolic trace and concrete trace are not equal!'
-                  f'\n    symbolic: {[hex(a) for a, _ in strace]}'
-                  f'\n    concrete: {[hex(a) for a in ctrace]}')
+        #if [a for a, _ in strace] != ctrace:
+        #    print(f'[WARNING] Symbolic trace and concrete trace are not equal!'
+        #          f'\n    symbolic: {[hex(a) for a, _ in strace]}'
+        #          f'\n    concrete: {[hex(a) for a in ctrace]}')
 
         if target.is_exited():
             break
 
-        # Query the new reference state for symbolic execution
-        initial_state = target.record_snapshot()
-
     res = []
     for (start, diff), (end, _) in zip(symb_trace[:-1], symb_trace[1:]):
         res.append(MiasmSymbolicTransform(diff, arch, start, end))
diff --git a/focaccia/utils.py b/focaccia/utils.py
new file mode 100644
index 0000000..0c6f292
--- /dev/null
+++ b/focaccia/utils.py
@@ -0,0 +1,7 @@
+import sys
+import shutil
+
+def print_separator(separator: str = '-', stream=sys.stdout, count: int = 80):
+    maxtermsize = count
+    termsize = shutil.get_terminal_size((80, 20)).columns
+    print(separator * min(termsize, maxtermsize), file=stream)
diff --git a/miasm_test.py b/miasm_test.py
index 97c23de..8d5bd9a 100644
--- a/miasm_test.py
+++ b/miasm_test.py
@@ -1,6 +1,6 @@
 import argparse
 
-from symbolic import collect_symbolic_trace
+from focaccia.symbolic import collect_symbolic_trace
 
 def main():
     program = argparse.ArgumentParser()
@@ -21,7 +21,7 @@ def main():
             print(f'Start address must be a hexadecimal number. Exiting.')
             exit(1)
 
-    strace = collect_symbolic_trace(binary, [binary, *argv], pc)
+    strace = collect_symbolic_trace(binary, argv, pc)
 
     print(f'--- {len(strace)} instructions traced.')
     print(f'--- No new PC found. Exiting.')
diff --git a/test/test_sparse_memory.py b/test/test_sparse_memory.py
index 87b4456..4fd9cba 100644
--- a/test/test_sparse_memory.py
+++ b/test/test_sparse_memory.py
@@ -1,6 +1,6 @@
 import unittest
 
-from snapshot import SparseMemory, MemoryAccessError
+from focaccia.snapshot import SparseMemory, MemoryAccessError
 
 class TestSparseMemory(unittest.TestCase):
     def test_oob_read(self):
diff --git a/tools/qemu_tool.py b/tools/qemu_tool.py
index d5f78af..c7730fd 100644
--- a/tools/qemu_tool.py
+++ b/tools/qemu_tool.py
@@ -9,10 +9,10 @@ import shlex
 import subprocess
 from typing import TextIO
 
-import parser
-from arch import x86
-from lldb_target import MemoryMap
-from snapshot import ProgramState
+import focaccia.parser as parser
+from focaccia.arch import x86
+from focaccia.lldb_target import MemoryMap
+from focaccia.snapshot import ProgramState
 
 def parse_memory_maps(stream: TextIO) -> tuple[list[MemoryMap], str]:
     """
diff --git a/utils.py b/utils.py
deleted file mode 100644
index f2c2256..0000000
--- a/utils.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import sys
-import shutil
-
-def print_separator(separator: str = '-', stream=sys.stdout, count: int = 80):
-    maxtermsize = count
-    termsize = shutil.get_terminal_size((80, 20)).columns
-    print(separator * min(termsize, maxtermsize), file=stream)
-
-def check_version(version: str):
-    # Script depends on ordered dicts in default dict()
-    split = version.split('.')
-    major = int(split[0])
-    minor = int(split[1])
-    if sys.version_info.major < major and sys.version_info.minor < minor:
-        raise EnvironmentError("Expected at least Python 3.7")
-
-def to_str(expr):
-    """Convert a claripy expression to a nice string representation.
-
-    Actually, the resulting representation is not very nice at all. It mostly
-    serves debugging purposes.
-    """
-    import claripy
-
-    if not issubclass(type(expr), claripy.ast.Base):
-        return f'{type(expr)}[{str(expr)}]'
-
-    assert(expr.depth > 0)
-    if expr.depth == 1:
-        if expr.symbolic:
-            name = expr._encoded_name.decode()
-            return f'symbol[{name}]'
-        else:
-            assert(expr.concrete)
-            return f'value{expr.length}[{hex(expr.v)}]'
-
-    args = [to_str(child) for child in expr.args]
-    return f'expr[{str(expr.op)}({", ".join(args)})]'
author	Theofilos Augoustis <theofilos.augoustis@gmail.com>	2023-12-31 18:29:31 +0100
committer	Theofilos Augoustis <theofilos.augoustis@gmail.com>	2023-12-31 18:29:31 +0100
commit	eae0b3b08bd078ad2f621ce2ef201e656da3f16a (patch)
tree	eb93252f39543c46146297264ff548d9925178e0
parent	d26ae0a7d583da5034cd6271f953b6253119ceae (diff)
download	focaccia-eae0b3b08bd078ad2f621ce2ef201e656da3f16a.tar.gz focaccia-eae0b3b08bd078ad2f621ce2ef201e656da3f16a.zip