summary refs log tree commit diff stats
path: root/hw/i386/kvm
diff options
context:
space:
mode:
Diffstat (limited to 'hw/i386/kvm')
-rw-r--r--hw/i386/kvm/meson.build1
-rw-r--r--hw/i386/kvm/trace-events15
-rw-r--r--hw/i386/kvm/xen_evtchn.c16
-rw-r--r--hw/i386/kvm/xen_gnttab.c325
-rw-r--r--hw/i386/kvm/xen_gnttab.h1
-rw-r--r--hw/i386/kvm/xen_xenstore.c1252
-rw-r--r--hw/i386/kvm/xenstore_impl.c1927
-rw-r--r--hw/i386/kvm/xenstore_impl.h63
8 files changed, 3583 insertions, 17 deletions
diff --git a/hw/i386/kvm/meson.build b/hw/i386/kvm/meson.build
index 82dd6ae7c6..6621ba5cd7 100644
--- a/hw/i386/kvm/meson.build
+++ b/hw/i386/kvm/meson.build
@@ -9,6 +9,7 @@ i386_kvm_ss.add(when: 'CONFIG_XEN_EMU', if_true: files(
   'xen_evtchn.c',
   'xen_gnttab.c',
   'xen_xenstore.c',
+  'xenstore_impl.c',
   ))
 
 i386_ss.add_all(when: 'CONFIG_KVM', if_true: i386_kvm_ss)
diff --git a/hw/i386/kvm/trace-events b/hw/i386/kvm/trace-events
index b83c3eb965..e4c82de6f3 100644
--- a/hw/i386/kvm/trace-events
+++ b/hw/i386/kvm/trace-events
@@ -3,3 +3,18 @@ kvm_xen_unmap_pirq(int pirq, int gsi) "pirq %d gsi %d"
 kvm_xen_get_free_pirq(int pirq, int type) "pirq %d type %d"
 kvm_xen_bind_pirq(int pirq, int port) "pirq %d port %d"
 kvm_xen_unmask_pirq(int pirq, char *dev, int vector) "pirq %d dev %s vector %d"
+xenstore_error(unsigned int id, unsigned int tx_id, const char *err) "req %u tx %u err %s"
+xenstore_read(unsigned int tx_id, const char *path) "tx %u path %s"
+xenstore_write(unsigned int tx_id, const char *path) "tx %u path %s"
+xenstore_mkdir(unsigned int tx_id, const char *path) "tx %u path %s"
+xenstore_directory(unsigned int tx_id, const char *path) "tx %u path %s"
+xenstore_directory_part(unsigned int tx_id, const char *path, unsigned int offset) "tx %u path %s offset %u"
+xenstore_transaction_start(unsigned int new_tx) "new_tx %u"
+xenstore_transaction_end(unsigned int tx_id, bool commit) "tx %u commit %d"
+xenstore_rm(unsigned int tx_id, const char *path) "tx %u path %s"
+xenstore_get_perms(unsigned int tx_id, const char *path) "tx %u path %s"
+xenstore_set_perms(unsigned int tx_id, const char *path) "tx %u path %s"
+xenstore_watch(const char *path, const char *token) "path %s token %s"
+xenstore_unwatch(const char *path, const char *token) "path %s token %s"
+xenstore_reset_watches(void) ""
+xenstore_watch_event(const char *path, const char *token) "path %s token %s"
diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index 886fbf6b3b..3048329474 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -15,6 +15,7 @@
 #include "qemu/lockable.h"
 #include "qemu/main-loop.h"
 #include "qemu/log.h"
+#include "qemu/error-report.h"
 #include "monitor/monitor.h"
 #include "monitor/hmp.h"
 #include "qapi/error.h"
@@ -34,6 +35,7 @@
 #include "hw/pci/msi.h"
 #include "hw/pci/msix.h"
 #include "hw/irq.h"
+#include "hw/xen/xen_backend_ops.h"
 
 #include "xen_evtchn.h"
 #include "xen_overlay.h"
@@ -278,6 +280,17 @@ static const TypeInfo xen_evtchn_info = {
     .class_init    = xen_evtchn_class_init,
 };
 
+static struct evtchn_backend_ops emu_evtchn_backend_ops = {
+    .open = xen_be_evtchn_open,
+    .bind_interdomain = xen_be_evtchn_bind_interdomain,
+    .unbind = xen_be_evtchn_unbind,
+    .close = xen_be_evtchn_close,
+    .get_fd = xen_be_evtchn_fd,
+    .notify = xen_be_evtchn_notify,
+    .unmask = xen_be_evtchn_unmask,
+    .pending = xen_be_evtchn_pending,
+};
+
 static void gsi_assert_bh(void *opaque)
 {
     struct vcpu_info *vi = kvm_xen_get_vcpu_info_hva(0);
@@ -318,6 +331,9 @@ void xen_evtchn_create(void)
     s->nr_pirq_inuse_words = DIV_ROUND_UP(s->nr_pirqs, 64);
     s->pirq_inuse_bitmap = g_new0(uint64_t, s->nr_pirq_inuse_words);
     s->pirq = g_new0(struct pirq_info, s->nr_pirqs);
+
+    /* Set event channel functions for backend drivers to use */
+    xen_evtchn_ops = &emu_evtchn_backend_ops;
 }
 
 void xen_evtchn_connect_gsis(qemu_irq *system_gsis)
diff --git a/hw/i386/kvm/xen_gnttab.c b/hw/i386/kvm/xen_gnttab.c
index 1e691ded32..21c30e3659 100644
--- a/hw/i386/kvm/xen_gnttab.c
+++ b/hw/i386/kvm/xen_gnttab.c
@@ -22,6 +22,7 @@
 
 #include "hw/sysbus.h"
 #include "hw/xen/xen.h"
+#include "hw/xen/xen_backend_ops.h"
 #include "xen_overlay.h"
 #include "xen_gnttab.h"
 
@@ -34,11 +35,10 @@
 #define TYPE_XEN_GNTTAB "xen-gnttab"
 OBJECT_DECLARE_SIMPLE_TYPE(XenGnttabState, XEN_GNTTAB)
 
-#define XEN_PAGE_SHIFT 12
-#define XEN_PAGE_SIZE (1ULL << XEN_PAGE_SHIFT)
-
 #define ENTRIES_PER_FRAME_V1 (XEN_PAGE_SIZE / sizeof(grant_entry_v1_t))
 
+static struct gnttab_backend_ops emu_gnttab_backend_ops;
+
 struct XenGnttabState {
     /*< private >*/
     SysBusDevice busdev;
@@ -57,6 +57,8 @@ struct XenGnttabState {
     MemoryRegion gnt_frames;
     MemoryRegion *gnt_aliases;
     uint64_t *gnt_frame_gpas;
+
+    uint8_t *map_track;
 };
 
 struct XenGnttabState *xen_gnttab_singleton;
@@ -70,13 +72,11 @@ static void xen_gnttab_realize(DeviceState *dev, Error **errp)
         error_setg(errp, "Xen grant table support is for Xen emulation");
         return;
     }
-    s->nr_frames = 0;
     s->max_frames = kvm_xen_get_gnttab_max_frames();
     memory_region_init_ram(&s->gnt_frames, OBJECT(dev), "xen:grant_table",
                            XEN_PAGE_SIZE * s->max_frames, &error_abort);
     memory_region_set_enabled(&s->gnt_frames, true);
     s->entries.v1 = memory_region_get_ram_ptr(&s->gnt_frames);
-    memset(s->entries.v1, 0, XEN_PAGE_SIZE * s->max_frames);
 
     /* Create individual page-sizes aliases for overlays */
     s->gnt_aliases = (void *)g_new0(MemoryRegion, s->max_frames);
@@ -88,9 +88,18 @@ static void xen_gnttab_realize(DeviceState *dev, Error **errp)
         s->gnt_frame_gpas[i] = INVALID_GPA;
     }
 
+    s->nr_frames = 0;
+    memset(s->entries.v1, 0, XEN_PAGE_SIZE * s->max_frames);
+    s->entries.v1[GNTTAB_RESERVED_XENSTORE].flags = GTF_permit_access;
+    s->entries.v1[GNTTAB_RESERVED_XENSTORE].frame = XEN_SPECIAL_PFN(XENSTORE);
+
     qemu_mutex_init(&s->gnt_lock);
 
     xen_gnttab_singleton = s;
+
+    s->map_track = g_new0(uint8_t, s->max_frames * ENTRIES_PER_FRAME_V1);
+
+    xen_gnttab_ops = &emu_gnttab_backend_ops;
 }
 
 static int xen_gnttab_post_load(void *opaque, int version_id)
@@ -230,3 +239,309 @@ int xen_gnttab_query_size_op(struct gnttab_query_size *size)
     size->max_nr_frames = s->max_frames;
     return 0;
 }
+
+/* Track per-open refs, to allow close() to clean up. */
+struct active_ref {
+    MemoryRegionSection mrs;
+    void *virtaddr;
+    uint32_t refcnt;
+    int prot;
+};
+
+static void gnt_unref(XenGnttabState *s, grant_ref_t ref,
+                      MemoryRegionSection *mrs, int prot)
+{
+    if (mrs && mrs->mr) {
+        if (prot & PROT_WRITE) {
+            memory_region_set_dirty(mrs->mr, mrs->offset_within_region,
+                                    XEN_PAGE_SIZE);
+        }
+        memory_region_unref(mrs->mr);
+        mrs->mr = NULL;
+    }
+    assert(s->map_track[ref] != 0);
+
+    if (--s->map_track[ref] == 0) {
+        grant_entry_v1_t *gnt_p = &s->entries.v1[ref];
+        qatomic_and(&gnt_p->flags, (uint16_t)~(GTF_reading | GTF_writing));
+    }
+}
+
+static uint64_t gnt_ref(XenGnttabState *s, grant_ref_t ref, int prot)
+{
+    uint16_t mask = GTF_type_mask | GTF_sub_page;
+    grant_entry_v1_t gnt, *gnt_p;
+    int retries = 0;
+
+    if (ref >= s->max_frames * ENTRIES_PER_FRAME_V1 ||
+        s->map_track[ref] == UINT8_MAX) {
+        return INVALID_GPA;
+    }
+
+    if (prot & PROT_WRITE) {
+        mask |= GTF_readonly;
+    }
+
+    gnt_p = &s->entries.v1[ref];
+
+    /*
+     * The guest can legitimately be changing the GTF_readonly flag. Allow
+     * that, but don't let a malicious guest cause a livelock.
+     */
+    for (retries = 0; retries < 5; retries++) {
+        uint16_t new_flags;
+
+        /* Read the entry before an atomic operation on its flags */
+        gnt = *(volatile grant_entry_v1_t *)gnt_p;
+
+        if ((gnt.flags & mask) != GTF_permit_access ||
+            gnt.domid != DOMID_QEMU) {
+            return INVALID_GPA;
+        }
+
+        new_flags = gnt.flags | GTF_reading;
+        if (prot & PROT_WRITE) {
+            new_flags |= GTF_writing;
+        }
+
+        if (qatomic_cmpxchg(&gnt_p->flags, gnt.flags, new_flags) == gnt.flags) {
+            return (uint64_t)gnt.frame << XEN_PAGE_SHIFT;
+        }
+    }
+
+    return INVALID_GPA;
+}
+
+struct xengntdev_handle {
+    GHashTable *active_maps;
+};
+
+static int xen_be_gnttab_set_max_grants(struct xengntdev_handle *xgt,
+                                        uint32_t nr_grants)
+{
+    return 0;
+}
+
+static void *xen_be_gnttab_map_refs(struct xengntdev_handle *xgt,
+                                    uint32_t count, uint32_t domid,
+                                    uint32_t *refs, int prot)
+{
+    XenGnttabState *s = xen_gnttab_singleton;
+    struct active_ref *act;
+
+    if (!s) {
+        errno = ENOTSUP;
+        return NULL;
+    }
+
+    if (domid != xen_domid) {
+        errno = EINVAL;
+        return NULL;
+    }
+
+    if (!count || count > 4096) {
+        errno = EINVAL;
+        return NULL;
+    }
+
+    /*
+     * Making a contiguous mapping from potentially discontiguous grant
+     * references would be... distinctly non-trivial. We don't support it.
+     * Even changing the API to return an array of pointers, one per page,
+     * wouldn't be simple to use in PV backends because some structures
+     * actually cross page boundaries (e.g. 32-bit blkif_response ring
+     * entries are 12 bytes).
+     */
+    if (count != 1) {
+        errno = EINVAL;
+        return NULL;
+    }
+
+    QEMU_LOCK_GUARD(&s->gnt_lock);
+
+    act = g_hash_table_lookup(xgt->active_maps, GINT_TO_POINTER(refs[0]));
+    if (act) {
+        if ((prot & PROT_WRITE) && !(act->prot & PROT_WRITE)) {
+            if (gnt_ref(s, refs[0], prot) == INVALID_GPA) {
+                return NULL;
+            }
+            act->prot |= PROT_WRITE;
+        }
+        act->refcnt++;
+    } else {
+        uint64_t gpa = gnt_ref(s, refs[0], prot);
+        if (gpa == INVALID_GPA) {
+            errno = EINVAL;
+            return NULL;
+        }
+
+        act = g_new0(struct active_ref, 1);
+        act->prot = prot;
+        act->refcnt = 1;
+        act->mrs = memory_region_find(get_system_memory(), gpa, XEN_PAGE_SIZE);
+
+        if (act->mrs.mr &&
+            !int128_lt(act->mrs.size, int128_make64(XEN_PAGE_SIZE)) &&
+            memory_region_get_ram_addr(act->mrs.mr) != RAM_ADDR_INVALID) {
+            act->virtaddr = qemu_map_ram_ptr(act->mrs.mr->ram_block,
+                                             act->mrs.offset_within_region);
+        }
+        if (!act->virtaddr) {
+            gnt_unref(s, refs[0], &act->mrs, 0);
+            g_free(act);
+            errno = EINVAL;
+            return NULL;
+        }
+
+        s->map_track[refs[0]]++;
+        g_hash_table_insert(xgt->active_maps, GINT_TO_POINTER(refs[0]), act);
+    }
+
+    return act->virtaddr;
+}
+
+static gboolean do_unmap(gpointer key, gpointer value, gpointer user_data)
+{
+    XenGnttabState *s = user_data;
+    grant_ref_t gref = GPOINTER_TO_INT(key);
+    struct active_ref *act = value;
+
+    gnt_unref(s, gref, &act->mrs, act->prot);
+    g_free(act);
+    return true;
+}
+
+static int xen_be_gnttab_unmap(struct xengntdev_handle *xgt,
+                               void *start_address, uint32_t *refs,
+                               uint32_t count)
+{
+    XenGnttabState *s = xen_gnttab_singleton;
+    struct active_ref *act;
+
+    if (!s) {
+        return -ENOTSUP;
+    }
+
+    if (count != 1) {
+        return -EINVAL;
+    }
+
+    QEMU_LOCK_GUARD(&s->gnt_lock);
+
+    act = g_hash_table_lookup(xgt->active_maps, GINT_TO_POINTER(refs[0]));
+    if (!act) {
+        return -ENOENT;
+    }
+
+    if (act->virtaddr != start_address) {
+        return -EINVAL;
+    }
+
+    if (!--act->refcnt) {
+        do_unmap(GINT_TO_POINTER(refs[0]), act, s);
+        g_hash_table_remove(xgt->active_maps, GINT_TO_POINTER(refs[0]));
+    }
+
+    return 0;
+}
+
+/*
+ * This looks a bit like the one for true Xen in xen-operations.c but
+ * in emulation we don't support multi-page mappings. And under Xen we
+ * *want* the multi-page mappings so we have fewer bounces through the
+ * kernel and the hypervisor. So the code paths end up being similar,
+ * but different.
+ */
+static int xen_be_gnttab_copy(struct xengntdev_handle *xgt, bool to_domain,
+                              uint32_t domid, XenGrantCopySegment *segs,
+                              uint32_t nr_segs, Error **errp)
+{
+    int prot = to_domain ? PROT_WRITE : PROT_READ;
+    unsigned int i;
+
+    for (i = 0; i < nr_segs; i++) {
+        XenGrantCopySegment *seg = &segs[i];
+        void *page;
+        uint32_t ref = to_domain ? seg->dest.foreign.ref :
+            seg->source.foreign.ref;
+
+        page = xen_be_gnttab_map_refs(xgt, 1, domid, &ref, prot);
+        if (!page) {
+            if (errp) {
+                error_setg_errno(errp, errno,
+                                 "xen_be_gnttab_map_refs failed");
+            }
+            return -errno;
+        }
+
+        if (to_domain) {
+            memcpy(page + seg->dest.foreign.offset, seg->source.virt,
+                   seg->len);
+        } else {
+            memcpy(seg->dest.virt, page + seg->source.foreign.offset,
+                   seg->len);
+        }
+
+        if (xen_be_gnttab_unmap(xgt, page, &ref, 1)) {
+            if (errp) {
+                error_setg_errno(errp, errno, "xen_be_gnttab_unmap failed");
+            }
+            return -errno;
+        }
+    }
+
+    return 0;
+}
+
+static struct xengntdev_handle *xen_be_gnttab_open(void)
+{
+    struct xengntdev_handle *xgt = g_new0(struct xengntdev_handle, 1);
+
+    xgt->active_maps = g_hash_table_new(g_direct_hash, g_direct_equal);
+    return xgt;
+}
+
+static int xen_be_gnttab_close(struct xengntdev_handle *xgt)
+{
+    XenGnttabState *s = xen_gnttab_singleton;
+
+    if (!s) {
+        return -ENOTSUP;
+    }
+
+    g_hash_table_foreach_remove(xgt->active_maps, do_unmap, s);
+    g_hash_table_destroy(xgt->active_maps);
+    g_free(xgt);
+    return 0;
+}
+
+static struct gnttab_backend_ops emu_gnttab_backend_ops = {
+    .open = xen_be_gnttab_open,
+    .close = xen_be_gnttab_close,
+    .grant_copy = xen_be_gnttab_copy,
+    .set_max_grants = xen_be_gnttab_set_max_grants,
+    .map_refs = xen_be_gnttab_map_refs,
+    .unmap = xen_be_gnttab_unmap,
+};
+
+int xen_gnttab_reset(void)
+{
+    XenGnttabState *s = xen_gnttab_singleton;
+
+    if (!s) {
+        return -ENOTSUP;
+    }
+
+    QEMU_LOCK_GUARD(&s->gnt_lock);
+
+    s->nr_frames = 0;
+
+    memset(s->entries.v1, 0, XEN_PAGE_SIZE * s->max_frames);
+
+    s->entries.v1[GNTTAB_RESERVED_XENSTORE].flags = GTF_permit_access;
+    s->entries.v1[GNTTAB_RESERVED_XENSTORE].frame = XEN_SPECIAL_PFN(XENSTORE);
+
+    memset(s->map_track, 0, s->max_frames * ENTRIES_PER_FRAME_V1);
+
+    return 0;
+}
diff --git a/hw/i386/kvm/xen_gnttab.h b/hw/i386/kvm/xen_gnttab.h
index 3bdbe96191..ee215239b0 100644
--- a/hw/i386/kvm/xen_gnttab.h
+++ b/hw/i386/kvm/xen_gnttab.h
@@ -13,6 +13,7 @@
 #define QEMU_XEN_GNTTAB_H
 
 void xen_gnttab_create(void);
+int xen_gnttab_reset(void);
 int xen_gnttab_map_page(uint64_t idx, uint64_t gfn);
 
 struct gnttab_set_version;
diff --git a/hw/i386/kvm/xen_xenstore.c b/hw/i386/kvm/xen_xenstore.c
index 14193ef3f9..900679af8a 100644
--- a/hw/i386/kvm/xen_xenstore.c
+++ b/hw/i386/kvm/xen_xenstore.c
@@ -15,12 +15,14 @@
 #include "qemu/module.h"
 #include "qemu/main-loop.h"
 #include "qemu/cutils.h"
+#include "qemu/error-report.h"
 #include "qapi/error.h"
 #include "qom/object.h"
 #include "migration/vmstate.h"
 
 #include "hw/sysbus.h"
 #include "hw/xen/xen.h"
+#include "hw/xen/xen_backend_ops.h"
 #include "xen_overlay.h"
 #include "xen_evtchn.h"
 #include "xen_xenstore.h"
@@ -28,15 +30,17 @@
 #include "sysemu/kvm.h"
 #include "sysemu/kvm_xen.h"
 
+#include "trace.h"
+
+#include "xenstore_impl.h"
+
 #include "hw/xen/interface/io/xs_wire.h"
 #include "hw/xen/interface/event_channel.h"
+#include "hw/xen/interface/grant_table.h"
 
 #define TYPE_XEN_XENSTORE "xen-xenstore"
 OBJECT_DECLARE_SIMPLE_TYPE(XenXenstoreState, XEN_XENSTORE)
 
-#define XEN_PAGE_SHIFT 12
-#define XEN_PAGE_SIZE (1ULL << XEN_PAGE_SHIFT)
-
 #define ENTRIES_PER_FRAME_V1 (XEN_PAGE_SIZE / sizeof(grant_entry_v1_t))
 #define ENTRIES_PER_FRAME_V2 (XEN_PAGE_SIZE / sizeof(grant_entry_v2_t))
 
@@ -47,6 +51,9 @@ struct XenXenstoreState {
     SysBusDevice busdev;
     /*< public >*/
 
+    XenstoreImplState *impl;
+    GList *watch_events; /* for the guest */
+
     MemoryRegion xenstore_page;
     struct xenstore_domain_interface *xs;
     uint8_t req_data[XENSTORE_HEADER_SIZE + XENSTORE_PAYLOAD_MAX];
@@ -59,15 +66,54 @@ struct XenXenstoreState {
     evtchn_port_t guest_port;
     evtchn_port_t be_port;
     struct xenevtchn_handle *eh;
+
+    uint8_t *impl_state;
+    uint32_t impl_state_size;
+
+    struct xengntdev_handle *gt;
+    void *granted_xs;
 };
 
 struct XenXenstoreState *xen_xenstore_singleton;
 
 static void xen_xenstore_event(void *opaque);
+static void fire_watch_cb(void *opaque, const char *path, const char *token);
+
+static struct xenstore_backend_ops emu_xenstore_backend_ops;
+
+static void G_GNUC_PRINTF (4, 5) relpath_printf(XenXenstoreState *s,
+                                                GList *perms,
+                                                const char *relpath,
+                                                const char *fmt, ...)
+{
+    gchar *abspath;
+    gchar *value;
+    va_list args;
+    GByteArray *data;
+    int err;
+
+    abspath = g_strdup_printf("/local/domain/%u/%s", xen_domid, relpath);
+    va_start(args, fmt);
+    value = g_strdup_vprintf(fmt, args);
+    va_end(args);
+
+    data = g_byte_array_new_take((void *)value, strlen(value));
+
+    err = xs_impl_write(s->impl, DOMID_QEMU, XBT_NULL, abspath, data);
+    assert(!err);
+
+    g_byte_array_unref(data);
+
+    err = xs_impl_set_perms(s->impl, DOMID_QEMU, XBT_NULL, abspath, perms);
+    assert(!err);
+
+    g_free(abspath);
+}
 
 static void xen_xenstore_realize(DeviceState *dev, Error **errp)
 {
     XenXenstoreState *s = XEN_XENSTORE(dev);
+    GList *perms;
 
     if (xen_mode != XEN_EMULATE) {
         error_setg(errp, "Xen xenstore support is for Xen emulation");
@@ -89,6 +135,50 @@ static void xen_xenstore_realize(DeviceState *dev, Error **errp)
     }
     aio_set_fd_handler(qemu_get_aio_context(), xen_be_evtchn_fd(s->eh), true,
                        xen_xenstore_event, NULL, NULL, NULL, s);
+
+    s->impl = xs_impl_create(xen_domid);
+
+    /* Populate the default nodes */
+
+    /* Nodes owned by 'dom0' but readable by the guest */
+    perms = g_list_append(NULL, xs_perm_as_string(XS_PERM_NONE, DOMID_QEMU));
+    perms = g_list_append(perms, xs_perm_as_string(XS_PERM_READ, xen_domid));
+
+    relpath_printf(s, perms, "", "%s", "");
+
+    relpath_printf(s, perms, "domid", "%u", xen_domid);
+
+    relpath_printf(s, perms, "control/platform-feature-xs_reset_watches", "%u", 1);
+    relpath_printf(s, perms, "control/platform-feature-multiprocessor-suspend", "%u", 1);
+
+    relpath_printf(s, perms, "platform/acpi", "%u", 1);
+    relpath_printf(s, perms, "platform/acpi_s3", "%u", 1);
+    relpath_printf(s, perms, "platform/acpi_s4", "%u", 1);
+    relpath_printf(s, perms, "platform/acpi_laptop_slate", "%u", 0);
+
+    g_list_free_full(perms, g_free);
+
+    /* Nodes owned by the guest */
+    perms = g_list_append(NULL, xs_perm_as_string(XS_PERM_NONE, xen_domid));
+
+    relpath_printf(s, perms, "attr", "%s", "");
+
+    relpath_printf(s, perms, "control/shutdown", "%s", "");
+    relpath_printf(s, perms, "control/feature-poweroff", "%u", 1);
+    relpath_printf(s, perms, "control/feature-reboot", "%u", 1);
+    relpath_printf(s, perms, "control/feature-suspend", "%u", 1);
+    relpath_printf(s, perms, "control/feature-s3", "%u", 1);
+    relpath_printf(s, perms, "control/feature-s4", "%u", 1);
+
+    relpath_printf(s, perms, "data", "%s", "");
+    relpath_printf(s, perms, "device", "%s", "");
+    relpath_printf(s, perms, "drivers", "%s", "");
+    relpath_printf(s, perms, "error", "%s", "");
+    relpath_printf(s, perms, "feature", "%s", "");
+
+    g_list_free_full(perms, g_free);
+
+    xen_xenstore_ops = &emu_xenstore_backend_ops;
 }
 
 static bool xen_xenstore_is_needed(void *opaque)
@@ -99,16 +189,26 @@ static bool xen_xenstore_is_needed(void *opaque)
 static int xen_xenstore_pre_save(void *opaque)
 {
     XenXenstoreState *s = opaque;
+    GByteArray *save;
 
     if (s->eh) {
         s->guest_port = xen_be_evtchn_get_guest_port(s->eh);
     }
+
+    g_free(s->impl_state);
+    save = xs_impl_serialize(s->impl);
+    s->impl_state = save->data;
+    s->impl_state_size = save->len;
+    g_byte_array_free(save, false);
+
     return 0;
 }
 
 static int xen_xenstore_post_load(void *opaque, int ver)
 {
     XenXenstoreState *s = opaque;
+    GByteArray *save;
+    int ret;
 
     /*
      * As qemu/dom0, rebind to the guest's port. The Windows drivers may
@@ -125,11 +225,18 @@ static int xen_xenstore_post_load(void *opaque, int ver)
         }
         s->be_port = be_port;
     }
-    return 0;
+
+    save = g_byte_array_new_take(s->impl_state, s->impl_state_size);
+    s->impl_state = NULL;
+    s->impl_state_size = 0;
+
+    ret = xs_impl_deserialize(s->impl, save, xen_domid, fire_watch_cb, s);
+    return ret;
 }
 
 static const VMStateDescription xen_xenstore_vmstate = {
     .name = "xen_xenstore",
+    .unmigratable = 1, /* The PV back ends don't migrate yet */
     .version_id = 1,
     .minimum_version_id = 1,
     .needed = xen_xenstore_is_needed,
@@ -145,6 +252,10 @@ static const VMStateDescription xen_xenstore_vmstate = {
         VMSTATE_BOOL(rsp_pending, XenXenstoreState),
         VMSTATE_UINT32(guest_port, XenXenstoreState),
         VMSTATE_BOOL(fatal_error, XenXenstoreState),
+        VMSTATE_UINT32(impl_state_size, XenXenstoreState),
+        VMSTATE_VARRAY_UINT32_ALLOC(impl_state, XenXenstoreState,
+                                    impl_state_size, 0,
+                                    vmstate_info_uint8, uint8_t),
         VMSTATE_END_OF_LIST()
     }
 };
@@ -213,20 +324,761 @@ static void reset_rsp(XenXenstoreState *s)
     s->rsp_offset = 0;
 }
 
+static void xs_error(XenXenstoreState *s, unsigned int id,
+                     xs_transaction_t tx_id, int errnum)
+{
+    struct xsd_sockmsg *rsp = (struct xsd_sockmsg *)s->rsp_data;
+    const char *errstr = NULL;
+
+    for (unsigned int i = 0; i < ARRAY_SIZE(xsd_errors); i++) {
+        struct xsd_errors *xsd_error = &xsd_errors[i];
+
+        if (xsd_error->errnum == errnum) {
+            errstr = xsd_error->errstring;
+            break;
+        }
+    }
+    assert(errstr);
+
+    trace_xenstore_error(id, tx_id, errstr);
+
+    rsp->type = XS_ERROR;
+    rsp->req_id = id;
+    rsp->tx_id = tx_id;
+    rsp->len = (uint32_t)strlen(errstr) + 1;
+
+    memcpy(&rsp[1], errstr, rsp->len);
+}
+
+static void xs_ok(XenXenstoreState *s, unsigned int type, unsigned int req_id,
+                  xs_transaction_t tx_id)
+{
+    struct xsd_sockmsg *rsp = (struct xsd_sockmsg *)s->rsp_data;
+    const char *okstr = "OK";
+
+    rsp->type = type;
+    rsp->req_id = req_id;
+    rsp->tx_id = tx_id;
+    rsp->len = (uint32_t)strlen(okstr) + 1;
+
+    memcpy(&rsp[1], okstr, rsp->len);
+}
+
+/*
+ * The correct request and response formats are documented in xen.git:
+ * docs/misc/xenstore.txt. A summary is given below for convenience.
+ * The '|' symbol represents a NUL character.
+ *
+ * ---------- Database read, write and permissions operations ----------
+ *
+ * READ                    <path>|                 <value|>
+ * WRITE                   <path>|<value|>
+ *         Store and read the octet string <value> at <path>.
+ *         WRITE creates any missing parent paths, with empty values.
+ *
+ * MKDIR                   <path>|
+ *         Ensures that the <path> exists, by necessary by creating
+ *         it and any missing parents with empty values.  If <path>
+ *         or any parent already exists, its value is left unchanged.
+ *
+ * RM                      <path>|
+ *         Ensures that the <path> does not exist, by deleting
+ *         it and all of its children.  It is not an error if <path> does
+ *         not exist, but it _is_ an error if <path>'s immediate parent
+ *         does not exist either.
+ *
+ * DIRECTORY               <path>|                 <child-leaf-name>|*
+ *         Gives a list of the immediate children of <path>, as only the
+ *         leafnames.  The resulting children are each named
+ *         <path>/<child-leaf-name>.
+ *
+ * DIRECTORY_PART          <path>|<offset>         <gencnt>|<child-leaf-name>|*
+ *         Same as DIRECTORY, but to be used for children lists longer than
+ *         XENSTORE_PAYLOAD_MAX. Input are <path> and the byte offset into
+ *         the list of children to return. Return values are the generation
+ *         count <gencnt> of the node (to be used to ensure the node hasn't
+ *         changed between two reads: <gencnt> being the same for multiple
+ *         reads guarantees the node hasn't changed) and the list of children
+ *         starting at the specified <offset> of the complete list.
+ *
+ * GET_PERMS               <path>|                 <perm-as-string>|+
+ * SET_PERMS               <path>|<perm-as-string>|+?
+ *         <perm-as-string> is one of the following
+ *                 w<domid>        write only
+ *                 r<domid>        read only
+ *                 b<domid>        both read and write
+ *                 n<domid>        no access
+ *         See https://wiki.xen.org/wiki/XenBus section
+ *         `Permissions' for details of the permissions system.
+ *         It is possible to set permissions for the special watch paths
+ *         "@introduceDomain" and "@releaseDomain" to enable receiving those
+ *         watches in unprivileged domains.
+ *
+ * ---------- Watches ----------
+ *
+ * WATCH                   <wpath>|<token>|?
+ *         Adds a watch.
+ *
+ *         When a <path> is modified (including path creation, removal,
+ *         contents change or permissions change) this generates an event
+ *         on the changed <path>.  Changes made in transactions cause an
+ *         event only if and when committed.  Each occurring event is
+ *         matched against all the watches currently set up, and each
+ *         matching watch results in a WATCH_EVENT message (see below).
+ *
+ *         The event's path matches the watch's <wpath> if it is an child
+ *         of <wpath>.
+ *
+ *         <wpath> can be a <path> to watch or @<wspecial>.  In the
+ *         latter case <wspecial> may have any syntax but it matches
+ *         (according to the rules above) only the following special
+ *         events which are invented by xenstored:
+ *             @introduceDomain    occurs on INTRODUCE
+ *             @releaseDomain      occurs on any domain crash or
+ *                                 shutdown, and also on RELEASE
+ *                                 and domain destruction
+ *         <wspecial> events are sent to privileged callers or explicitly
+ *         via SET_PERMS enabled domains only.
+ *
+ *         When a watch is first set up it is triggered once straight
+ *         away, with <path> equal to <wpath>.  Watches may be triggered
+ *         spuriously.  The tx_id in a WATCH request is ignored.
+ *
+ *         Watches are supposed to be restricted by the permissions
+ *         system but in practice the implementation is imperfect.
+ *         Applications should not rely on being sent a notification for
+ *         paths that they cannot read; however, an application may rely
+ *         on being sent a watch when a path which it _is_ able to read
+ *         is deleted even if that leaves only a nonexistent unreadable
+ *         parent.  A notification may omitted if a node's permissions
+ *         are changed so as to make it unreadable, in which case future
+ *         notifications may be suppressed (and if the node is later made
+ *         readable, some notifications may have been lost).
+ *
+ * WATCH_EVENT                                     <epath>|<token>|
+ *         Unsolicited `reply' generated for matching modification events
+ *         as described above.  req_id and tx_id are both 0.
+ *
+ *         <epath> is the event's path, ie the actual path that was
+ *         modified; however if the event was the recursive removal of an
+ *         parent of <wpath>, <epath> is just
+ *         <wpath> (rather than the actual path which was removed).  So
+ *         <epath> is a child of <wpath>, regardless.
+ *
+ *         Iff <wpath> for the watch was specified as a relative pathname,
+ *         the <epath> path will also be relative (with the same base,
+ *         obviously).
+ *
+ * UNWATCH                 <wpath>|<token>|?
+ *
+ * RESET_WATCHES           |
+ *         Reset all watches and transactions of the caller.
+ *
+ * ---------- Transactions ----------
+ *
+ * TRANSACTION_START       |                       <transid>|
+ *         <transid> is an opaque uint32_t allocated by xenstored
+ *         represented as unsigned decimal.  After this, transaction may
+ *         be referenced by using <transid> (as 32-bit binary) in the
+ *         tx_id request header field.  When transaction is started whole
+ *         db is copied; reads and writes happen on the copy.
+ *         It is not legal to send non-0 tx_id in TRANSACTION_START.
+ *
+ * TRANSACTION_END         T|
+ * TRANSACTION_END         F|
+ *         tx_id must refer to existing transaction.  After this
+ *         request the tx_id is no longer valid and may be reused by
+ *         xenstore.  If F, the transaction is discarded.  If T,
+ *         it is committed: if there were any other intervening writes
+ *         then our END gets get EAGAIN.
+ *
+ *         The plan is that in the future only intervening `conflicting'
+ *         writes cause EAGAIN, meaning only writes or other commits
+ *         which changed paths which were read or written in the
+ *         transaction at hand.
+ *
+ */
+
+static void xs_read(XenXenstoreState *s, unsigned int req_id,
+                    xs_transaction_t tx_id, uint8_t *req_data, unsigned int len)
+{
+    const char *path = (const char *)req_data;
+    struct xsd_sockmsg *rsp = (struct xsd_sockmsg *)s->rsp_data;
+    uint8_t *rsp_data = (uint8_t *)&rsp[1];
+    g_autoptr(GByteArray) data = g_byte_array_new();
+    int err;
+
+    if (len == 0 || req_data[len - 1] != '\0') {
+        xs_error(s, req_id, tx_id, EINVAL);
+        return;
+    }
+
+    trace_xenstore_read(tx_id, path);
+    err = xs_impl_read(s->impl, xen_domid, tx_id, path, data);
+    if (err) {
+        xs_error(s, req_id, tx_id, err);
+        return;
+    }
+
+    rsp->type = XS_READ;
+    rsp->req_id = req_id;
+    rsp->tx_id = tx_id;
+    rsp->len = 0;
+
+    len = data->len;
+    if (len > XENSTORE_PAYLOAD_MAX) {
+        xs_error(s, req_id, tx_id, E2BIG);
+        return;
+    }
+
+    memcpy(&rsp_data[rsp->len], data->data, len);
+    rsp->len += len;
+}
+
+static void xs_write(XenXenstoreState *s, unsigned int req_id,
+                     xs_transaction_t tx_id, uint8_t *req_data,
+                     unsigned int len)
+{
+    g_autoptr(GByteArray) data = g_byte_array_new();
+    const char *path;
+    int err;
+
+    if (len == 0) {
+        xs_error(s, req_id, tx_id, EINVAL);
+        return;
+    }
+
+    path = (const char *)req_data;
+
+    while (len--) {
+        if (*req_data++ == '\0') {
+            break;
+        }
+        if (len == 0) {
+            xs_error(s, req_id, tx_id, EINVAL);
+            return;
+        }
+    }
+
+    g_byte_array_append(data, req_data, len);
+
+    trace_xenstore_write(tx_id, path);
+    err = xs_impl_write(s->impl, xen_domid, tx_id, path, data);
+    if (err) {
+        xs_error(s, req_id, tx_id, err);
+        return;
+    }
+
+    xs_ok(s, XS_WRITE, req_id, tx_id);
+}
+
+static void xs_mkdir(XenXenstoreState *s, unsigned int req_id,
+                     xs_transaction_t tx_id, uint8_t *req_data,
+                     unsigned int len)
+{
+    g_autoptr(GByteArray) data = g_byte_array_new();
+    const char *path;
+    int err;
+
+    if (len == 0 || req_data[len - 1] != '\0') {
+        xs_error(s, req_id, tx_id, EINVAL);
+        return;
+    }
+
+    path = (const char *)req_data;
+
+    trace_xenstore_mkdir(tx_id, path);
+    err = xs_impl_read(s->impl, xen_domid, tx_id, path, data);
+    if (err == ENOENT) {
+        err = xs_impl_write(s->impl, xen_domid, tx_id, path, data);
+    }
+
+    if (!err) {
+        xs_error(s, req_id, tx_id, err);
+        return;
+    }
+
+    xs_ok(s, XS_MKDIR, req_id, tx_id);
+}
+
+static void xs_append_strings(XenXenstoreState *s, struct xsd_sockmsg *rsp,
+                              GList *strings, unsigned int start, bool truncate)
+{
+    uint8_t *rsp_data = (uint8_t *)&rsp[1];
+    GList *l;
+
+    for (l = strings; l; l = l->next) {
+        size_t len = strlen(l->data) + 1; /* Including the NUL termination */
+        char *str = l->data;
+
+        if (rsp->len + len > XENSTORE_PAYLOAD_MAX) {
+            if (truncate) {
+                len = XENSTORE_PAYLOAD_MAX - rsp->len;
+                if (!len) {
+                    return;
+                }
+            } else {
+                xs_error(s, rsp->req_id, rsp->tx_id, E2BIG);
+                return;
+            }
+        }
+
+        if (start) {
+            if (start >= len) {
+                start -= len;
+                continue;
+            }
+
+            str += start;
+            len -= start;
+            start = 0;
+        }
+
+        memcpy(&rsp_data[rsp->len], str, len);
+        rsp->len += len;
+    }
+    /* XS_DIRECTORY_PART wants an extra NUL to indicate the end */
+    if (truncate && rsp->len < XENSTORE_PAYLOAD_MAX) {
+        rsp_data[rsp->len++] = '\0';
+    }
+}
+
+static void xs_directory(XenXenstoreState *s, unsigned int req_id,
+                         xs_transaction_t tx_id, uint8_t *req_data,
+                         unsigned int len)
+{
+    struct xsd_sockmsg *rsp = (struct xsd_sockmsg *)s->rsp_data;
+    GList *items = NULL;
+    const char *path;
+    int err;
+
+    if (len == 0 || req_data[len - 1] != '\0') {
+        xs_error(s, req_id, tx_id, EINVAL);
+        return;
+    }
+
+    path = (const char *)req_data;
+
+    trace_xenstore_directory(tx_id, path);
+    err = xs_impl_directory(s->impl, xen_domid, tx_id, path, NULL, &items);
+    if (err != 0) {
+        xs_error(s, req_id, tx_id, err);
+        return;
+    }
+
+    rsp->type = XS_DIRECTORY;
+    rsp->req_id = req_id;
+    rsp->tx_id = tx_id;
+    rsp->len = 0;
+
+    xs_append_strings(s, rsp, items, 0, false);
+
+    g_list_free_full(items, g_free);
+}
+
+static void xs_directory_part(XenXenstoreState *s, unsigned int req_id,
+                              xs_transaction_t tx_id, uint8_t *req_data,
+                              unsigned int len)
+{
+    const char *offset_str, *path = (const char *)req_data;
+    struct xsd_sockmsg *rsp = (struct xsd_sockmsg *)s->rsp_data;
+    char *rsp_data = (char *)&rsp[1];
+    uint64_t gencnt = 0;
+    unsigned int offset;
+    GList *items = NULL;
+    int err;
+
+    if (len == 0) {
+        xs_error(s, req_id, tx_id, EINVAL);
+        return;
+    }
+
+    while (len--) {
+        if (*req_data++ == '\0') {
+            break;
+        }
+        if (len == 0) {
+            xs_error(s, req_id, tx_id, EINVAL);
+            return;
+        }
+    }
+
+    offset_str = (const char *)req_data;
+    while (len--) {
+        if (*req_data++ == '\0') {
+            break;
+        }
+        if (len == 0) {
+            xs_error(s, req_id, tx_id, EINVAL);
+            return;
+        }
+    }
+
+    if (len) {
+        xs_error(s, req_id, tx_id, EINVAL);
+        return;
+    }
+
+    if (qemu_strtoui(offset_str, NULL, 10, &offset) < 0) {
+        xs_error(s, req_id, tx_id, EINVAL);
+        return;
+    }
+
+    trace_xenstore_directory_part(tx_id, path, offset);
+    err = xs_impl_directory(s->impl, xen_domid, tx_id, path, &gencnt, &items);
+    if (err != 0) {
+        xs_error(s, req_id, tx_id, err);
+        return;
+    }
+
+    rsp->type = XS_DIRECTORY_PART;
+    rsp->req_id = req_id;
+    rsp->tx_id = tx_id;
+    rsp->len = snprintf(rsp_data, XENSTORE_PAYLOAD_MAX, "%" PRIu64, gencnt) + 1;
+
+    xs_append_strings(s, rsp, items, offset, true);
+
+    g_list_free_full(items, g_free);
+}
+
+static void xs_transaction_start(XenXenstoreState *s, unsigned int req_id,
+                                 xs_transaction_t tx_id, uint8_t *req_data,
+                                 unsigned int len)
+{
+    struct xsd_sockmsg *rsp = (struct xsd_sockmsg *)s->rsp_data;
+    char *rsp_data = (char *)&rsp[1];
+    int err;
+
+    if (len != 1 || req_data[0] != '\0') {
+        xs_error(s, req_id, tx_id, EINVAL);
+        return;
+    }
+
+    rsp->type = XS_TRANSACTION_START;
+    rsp->req_id = req_id;
+    rsp->tx_id = tx_id;
+    rsp->len = 0;
+
+    err = xs_impl_transaction_start(s->impl, xen_domid, &tx_id);
+    if (err) {
+        xs_error(s, req_id, tx_id, err);
+        return;
+    }
+
+    trace_xenstore_transaction_start(tx_id);
+
+    rsp->len = snprintf(rsp_data, XENSTORE_PAYLOAD_MAX, "%u", tx_id);
+    assert(rsp->len < XENSTORE_PAYLOAD_MAX);
+    rsp->len++;
+}
+
+static void xs_transaction_end(XenXenstoreState *s, unsigned int req_id,
+                               xs_transaction_t tx_id, uint8_t *req_data,
+                               unsigned int len)
+{
+    bool commit;
+    int err;
+
+    if (len != 2 || req_data[1] != '\0') {
+        xs_error(s, req_id, tx_id, EINVAL);
+        return;
+    }
+
+    switch (req_data[0]) {
+    case 'T':
+        commit = true;
+        break;
+    case 'F':
+        commit = false;
+        break;
+    default:
+        xs_error(s, req_id, tx_id, EINVAL);
+        return;
+    }
+
+    trace_xenstore_transaction_end(tx_id, commit);
+    err = xs_impl_transaction_end(s->impl, xen_domid, tx_id, commit);
+    if (err) {
+        xs_error(s, req_id, tx_id, err);
+        return;
+    }
+
+    xs_ok(s, XS_TRANSACTION_END, req_id, tx_id);
+}
+
+static void xs_rm(XenXenstoreState *s, unsigned int req_id,
+                  xs_transaction_t tx_id, uint8_t *req_data, unsigned int len)
+{
+    const char *path = (const char *)req_data;
+    int err;
+
+    if (len == 0 || req_data[len - 1] != '\0') {
+        xs_error(s, req_id, tx_id, EINVAL);
+        return;
+    }
+
+    trace_xenstore_rm(tx_id, path);
+    err = xs_impl_rm(s->impl, xen_domid, tx_id, path);
+    if (err) {
+        xs_error(s, req_id, tx_id, err);
+        return;
+    }
+
+    xs_ok(s, XS_RM, req_id, tx_id);
+}
+
+static void xs_get_perms(XenXenstoreState *s, unsigned int req_id,
+                         xs_transaction_t tx_id, uint8_t *req_data,
+                         unsigned int len)
+{
+    const char *path = (const char *)req_data;
+    struct xsd_sockmsg *rsp = (struct xsd_sockmsg *)s->rsp_data;
+    GList *perms = NULL;
+    int err;
+
+    if (len == 0 || req_data[len - 1] != '\0') {
+        xs_error(s, req_id, tx_id, EINVAL);
+        return;
+    }
+
+    trace_xenstore_get_perms(tx_id, path);
+    err = xs_impl_get_perms(s->impl, xen_domid, tx_id, path, &perms);
+    if (err) {
+        xs_error(s, req_id, tx_id, err);
+        return;
+    }
+
+    rsp->type = XS_GET_PERMS;
+    rsp->req_id = req_id;
+    rsp->tx_id = tx_id;
+    rsp->len = 0;
+
+    xs_append_strings(s, rsp, perms, 0, false);
+
+    g_list_free_full(perms, g_free);
+}
+
+static void xs_set_perms(XenXenstoreState *s, unsigned int req_id,
+                         xs_transaction_t tx_id, uint8_t *req_data,
+                         unsigned int len)
+{
+    const char *path = (const char *)req_data;
+    uint8_t *perm;
+    GList *perms = NULL;
+    int err;
+
+    if (len == 0) {
+        xs_error(s, req_id, tx_id, EINVAL);
+        return;
+    }
+
+    while (len--) {
+        if (*req_data++ == '\0') {
+            break;
+        }
+        if (len == 0) {
+            xs_error(s, req_id, tx_id, EINVAL);
+            return;
+        }
+    }
+
+    perm = req_data;
+    while (len--) {
+        if (*req_data++ == '\0') {
+            perms = g_list_append(perms, perm);
+            perm = req_data;
+        }
+    }
+
+    /*
+     * Note that there may be trailing garbage at the end of the buffer.
+     * This is explicitly permitted by the '?' at the end of the definition:
+     *
+     *    SET_PERMS         <path>|<perm-as-string>|+?
+     */
+
+    trace_xenstore_set_perms(tx_id, path);
+    err = xs_impl_set_perms(s->impl, xen_domid, tx_id, path, perms);
+    g_list_free(perms);
+    if (err) {
+        xs_error(s, req_id, tx_id, err);
+        return;
+    }
+
+    xs_ok(s, XS_SET_PERMS, req_id, tx_id);
+}
+
+static void xs_watch(XenXenstoreState *s, unsigned int req_id,
+                     xs_transaction_t tx_id, uint8_t *req_data,
+                     unsigned int len)
+{
+    const char *token, *path = (const char *)req_data;
+    int err;
+
+    if (len == 0) {
+        xs_error(s, req_id, tx_id, EINVAL);
+        return;
+    }
+
+    while (len--) {
+        if (*req_data++ == '\0') {
+            break;
+        }
+        if (len == 0) {
+            xs_error(s, req_id, tx_id, EINVAL);
+            return;
+        }
+    }
+
+    token = (const char *)req_data;
+    while (len--) {
+        if (*req_data++ == '\0') {
+            break;
+        }
+        if (len == 0) {
+            xs_error(s, req_id, tx_id, EINVAL);
+            return;
+        }
+    }
+
+    /*
+     * Note that there may be trailing garbage at the end of the buffer.
+     * This is explicitly permitted by the '?' at the end of the definition:
+     *
+     *    WATCH             <wpath>|<token>|?
+     */
+
+    trace_xenstore_watch(path, token);
+    err = xs_impl_watch(s->impl, xen_domid, path, token, fire_watch_cb, s);
+    if (err) {
+        xs_error(s, req_id, tx_id, err);
+        return;
+    }
+
+    xs_ok(s, XS_WATCH, req_id, tx_id);
+}
+
+static void xs_unwatch(XenXenstoreState *s, unsigned int req_id,
+                       xs_transaction_t tx_id, uint8_t *req_data,
+                       unsigned int len)
+{
+    const char *token, *path = (const char *)req_data;
+    int err;
+
+    if (len == 0) {
+        xs_error(s, req_id, tx_id, EINVAL);
+        return;
+    }
+
+    while (len--) {
+        if (*req_data++ == '\0') {
+            break;
+        }
+        if (len == 0) {
+            xs_error(s, req_id, tx_id, EINVAL);
+            return;
+        }
+    }
+
+    token = (const char *)req_data;
+    while (len--) {
+        if (*req_data++ == '\0') {
+            break;
+        }
+        if (len == 0) {
+            xs_error(s, req_id, tx_id, EINVAL);
+            return;
+        }
+    }
+
+    trace_xenstore_unwatch(path, token);
+    err = xs_impl_unwatch(s->impl, xen_domid, path, token, fire_watch_cb, s);
+    if (err) {
+        xs_error(s, req_id, tx_id, err);
+        return;
+    }
+
+    xs_ok(s, XS_UNWATCH, req_id, tx_id);
+}
+
+static void xs_reset_watches(XenXenstoreState *s, unsigned int req_id,
+                             xs_transaction_t tx_id, uint8_t *req_data,
+                             unsigned int len)
+{
+    if (len == 0 || req_data[len - 1] != '\0') {
+        xs_error(s, req_id, tx_id, EINVAL);
+        return;
+    }
+
+    trace_xenstore_reset_watches();
+    xs_impl_reset_watches(s->impl, xen_domid);
+
+    xs_ok(s, XS_RESET_WATCHES, req_id, tx_id);
+}
+
+static void xs_priv(XenXenstoreState *s, unsigned int req_id,
+                    xs_transaction_t tx_id, uint8_t *data,
+                    unsigned int len)
+{
+    xs_error(s, req_id, tx_id, EACCES);
+}
+
+static void xs_unimpl(XenXenstoreState *s, unsigned int req_id,
+                      xs_transaction_t tx_id, uint8_t *data,
+                      unsigned int len)
+{
+    xs_error(s, req_id, tx_id, ENOSYS);
+}
+
+typedef void (*xs_impl)(XenXenstoreState *s, unsigned int req_id,
+                        xs_transaction_t tx_id, uint8_t *data,
+                        unsigned int len);
+
+struct xsd_req {
+    const char *name;
+    xs_impl fn;
+};
+#define XSD_REQ(_type, _fn)                           \
+    [_type] = { .name = #_type, .fn = _fn }
+
+struct xsd_req xsd_reqs[] = {
+    XSD_REQ(XS_READ, xs_read),
+    XSD_REQ(XS_WRITE, xs_write),
+    XSD_REQ(XS_MKDIR, xs_mkdir),
+    XSD_REQ(XS_DIRECTORY, xs_directory),
+    XSD_REQ(XS_DIRECTORY_PART, xs_directory_part),
+    XSD_REQ(XS_TRANSACTION_START, xs_transaction_start),
+    XSD_REQ(XS_TRANSACTION_END, xs_transaction_end),
+    XSD_REQ(XS_RM, xs_rm),
+    XSD_REQ(XS_GET_PERMS, xs_get_perms),
+    XSD_REQ(XS_SET_PERMS, xs_set_perms),
+    XSD_REQ(XS_WATCH, xs_watch),
+    XSD_REQ(XS_UNWATCH, xs_unwatch),
+    XSD_REQ(XS_CONTROL, xs_priv),
+    XSD_REQ(XS_INTRODUCE, xs_priv),
+    XSD_REQ(XS_RELEASE, xs_priv),
+    XSD_REQ(XS_IS_DOMAIN_INTRODUCED, xs_priv),
+    XSD_REQ(XS_RESUME, xs_priv),
+    XSD_REQ(XS_SET_TARGET, xs_priv),
+    XSD_REQ(XS_RESET_WATCHES, xs_reset_watches),
+};
+
 static void process_req(XenXenstoreState *s)
 {
     struct xsd_sockmsg *req = (struct xsd_sockmsg *)s->req_data;
-    struct xsd_sockmsg *rsp = (struct xsd_sockmsg *)s->rsp_data;
-    const char enosys[] = "ENOSYS";
+    xs_impl handler = NULL;
 
     assert(req_pending(s));
     assert(!s->rsp_pending);
 
-    rsp->type = XS_ERROR;
-    rsp->req_id = req->req_id;
-    rsp->tx_id = req->tx_id;
-    rsp->len = sizeof(enosys);
-    memcpy((void *)&rsp[1], enosys, sizeof(enosys));
+    if (req->type < ARRAY_SIZE(xsd_reqs)) {
+        handler = xsd_reqs[req->type].fn;
+    }
+    if (!handler) {
+        handler = &xs_unimpl;
+    }
+
+    handler(s, req->req_id, req->tx_id, (uint8_t *)&req[1], req->len);
 
     s->rsp_pending = true;
     reset_req(s);
@@ -415,6 +1267,113 @@ static unsigned int put_rsp(XenXenstoreState *s)
     return copylen;
 }
 
+static void deliver_watch(XenXenstoreState *s, const char *path,
+                          const char *token)
+{
+    struct xsd_sockmsg *rsp = (struct xsd_sockmsg *)s->rsp_data;
+    uint8_t *rsp_data = (uint8_t *)&rsp[1];
+    unsigned int len;
+
+    assert(!s->rsp_pending);
+
+    trace_xenstore_watch_event(path, token);
+
+    rsp->type = XS_WATCH_EVENT;
+    rsp->req_id = 0;
+    rsp->tx_id = 0;
+    rsp->len = 0;
+
+    len = strlen(path);
+
+    /* XENSTORE_ABS/REL_PATH_MAX should ensure there can be no overflow */
+    assert(rsp->len + len < XENSTORE_PAYLOAD_MAX);
+
+    memcpy(&rsp_data[rsp->len], path, len);
+    rsp->len += len;
+    rsp_data[rsp->len] = '\0';
+    rsp->len++;
+
+    len = strlen(token);
+    /*
+     * It is possible for the guest to have chosen a token that will
+     * not fit (along with the patch) into a watch event. We have no
+     * choice but to drop the event if this is the case.
+     */
+    if (rsp->len + len >= XENSTORE_PAYLOAD_MAX) {
+        return;
+    }
+
+    memcpy(&rsp_data[rsp->len], token, len);
+    rsp->len += len;
+    rsp_data[rsp->len] = '\0';
+    rsp->len++;
+
+    s->rsp_pending = true;
+}
+
+struct watch_event {
+    char *path;
+    char *token;
+};
+
+static void free_watch_event(struct watch_event *ev)
+{
+    if (ev) {
+        g_free(ev->path);
+        g_free(ev->token);
+        g_free(ev);
+    }
+}
+
+static void queue_watch(XenXenstoreState *s, const char *path,
+                        const char *token)
+{
+    struct watch_event *ev = g_new0(struct watch_event, 1);
+
+    ev->path = g_strdup(path);
+    ev->token = g_strdup(token);
+
+    s->watch_events = g_list_append(s->watch_events, ev);
+}
+
+static void fire_watch_cb(void *opaque, const char *path, const char *token)
+{
+    XenXenstoreState *s = opaque;
+
+    assert(qemu_mutex_iothread_locked());
+
+    /*
+     * If there's a response pending, we obviously can't scribble over
+     * it. But if there's a request pending, it has dibs on the buffer
+     * too.
+     *
+     * In the common case of a watch firing due to backend activity
+     * when the ring was otherwise idle, we should be able to copy the
+     * strings directly into the rsp_data and thence the actual ring,
+     * without needing to perform any allocations and queue them.
+     */
+    if (s->rsp_pending || req_pending(s)) {
+        queue_watch(s, path, token);
+    } else {
+        deliver_watch(s, path, token);
+        /*
+         * If the message was queued because there was already ring activity,
+         * no need to wake the guest. But if not, we need to send the evtchn.
+         */
+        xen_be_evtchn_notify(s->eh, s->be_port);
+    }
+}
+
+static void process_watch_events(XenXenstoreState *s)
+{
+    struct watch_event *ev = s->watch_events->data;
+
+    deliver_watch(s, ev->path, ev->token);
+
+    s->watch_events = g_list_remove(s->watch_events, ev);
+    free_watch_event(ev);
+}
+
 static void xen_xenstore_event(void *opaque)
 {
     XenXenstoreState *s = opaque;
@@ -433,6 +1392,10 @@ static void xen_xenstore_event(void *opaque)
         copied_to = copied_from = 0;
         processed = false;
 
+        if (!s->rsp_pending && s->watch_events) {
+            process_watch_events(s);
+        }
+
         if (s->rsp_pending) {
             copied_to = put_rsp(s);
         }
@@ -441,7 +1404,7 @@ static void xen_xenstore_event(void *opaque)
             copied_from = get_req(s);
         }
 
-        if (req_pending(s) && !s->rsp_pending) {
+        if (req_pending(s) && !s->rsp_pending && !s->watch_events) {
             process_req(s);
             processed = true;
         }
@@ -496,5 +1459,270 @@ int xen_xenstore_reset(void)
     }
     s->be_port = err;
 
+    /*
+     * We don't actually access the guest's page through the grant, because
+     * this isn't real Xen, and we can just use the page we gave it in the
+     * first place. Map the grant anyway, mostly for cosmetic purposes so
+     * it *looks* like it's in use in the guest-visible grant table.
+     */
+    s->gt = qemu_xen_gnttab_open();
+    uint32_t xs_gntref = GNTTAB_RESERVED_XENSTORE;
+    s->granted_xs = qemu_xen_gnttab_map_refs(s->gt, 1, xen_domid, &xs_gntref,
+                                             PROT_READ | PROT_WRITE);
+
     return 0;
 }
+
+struct qemu_xs_handle {
+    XenstoreImplState *impl;
+    GList *watches;
+    QEMUBH *watch_bh;
+};
+
+struct qemu_xs_watch {
+    struct qemu_xs_handle *h;
+    char *path;
+    xs_watch_fn fn;
+    void *opaque;
+    GList *events;
+};
+
+static char *xs_be_get_domain_path(struct qemu_xs_handle *h, unsigned int domid)
+{
+    return g_strdup_printf("/local/domain/%u", domid);
+}
+
+static char **xs_be_directory(struct qemu_xs_handle *h, xs_transaction_t t,
+                              const char *path, unsigned int *num)
+{
+    GList *items = NULL, *l;
+    unsigned int i = 0;
+    char **items_ret;
+    int err;
+
+    err = xs_impl_directory(h->impl, DOMID_QEMU, t, path, NULL, &items);
+    if (err) {
+        errno = err;
+        return NULL;
+    }
+
+    items_ret = g_new0(char *, g_list_length(items) + 1);
+    *num = 0;
+    for (l = items; l; l = l->next) {
+        items_ret[i++] = l->data;
+        (*num)++;
+    }
+    g_list_free(items);
+    return items_ret;
+}
+
+static void *xs_be_read(struct qemu_xs_handle *h, xs_transaction_t t,
+                        const char *path, unsigned int *len)
+{
+    GByteArray *data = g_byte_array_new();
+    bool free_segment = false;
+    int err;
+
+    err = xs_impl_read(h->impl, DOMID_QEMU, t, path, data);
+    if (err) {
+        free_segment = true;
+        errno = err;
+    } else {
+        if (len) {
+            *len = data->len;
+        }
+        /* The xen-bus-helper code expects to get NUL terminated string! */
+        g_byte_array_append(data, (void *)"", 1);
+    }
+
+    return g_byte_array_free(data, free_segment);
+}
+
+static bool xs_be_write(struct qemu_xs_handle *h, xs_transaction_t t,
+                        const char *path, const void *data, unsigned int len)
+{
+    GByteArray *gdata = g_byte_array_new();
+    int err;
+
+    g_byte_array_append(gdata, data, len);
+    err = xs_impl_write(h->impl, DOMID_QEMU, t, path, gdata);
+    g_byte_array_unref(gdata);
+    if (err) {
+        errno = err;
+        return false;
+    }
+    return true;
+}
+
+static bool xs_be_create(struct qemu_xs_handle *h, xs_transaction_t t,
+                         unsigned int owner, unsigned int domid,
+                         unsigned int perms, const char *path)
+{
+    g_autoptr(GByteArray) data = g_byte_array_new();
+    GList *perms_list = NULL;
+    int err;
+
+    /* mkdir does this */
+    err = xs_impl_read(h->impl, DOMID_QEMU, t, path, data);
+    if (err == ENOENT) {
+        err = xs_impl_write(h->impl, DOMID_QEMU, t, path, data);
+    }
+    if (err) {
+        errno = err;
+        return false;
+    }
+
+    perms_list = g_list_append(perms_list,
+                               xs_perm_as_string(XS_PERM_NONE, owner));
+    perms_list = g_list_append(perms_list,
+                               xs_perm_as_string(perms, domid));
+
+    err = xs_impl_set_perms(h->impl, DOMID_QEMU, t, path, perms_list);
+    g_list_free_full(perms_list, g_free);
+    if (err) {
+        errno = err;
+        return false;
+    }
+    return true;
+}
+
+static bool xs_be_destroy(struct qemu_xs_handle *h, xs_transaction_t t,
+                          const char *path)
+{
+    int err = xs_impl_rm(h->impl, DOMID_QEMU, t, path);
+    if (err) {
+        errno = err;
+        return false;
+    }
+    return true;
+}
+
+static void be_watch_bh(void *_h)
+{
+    struct qemu_xs_handle *h = _h;
+    GList *l;
+
+    for (l = h->watches; l; l = l->next) {
+        struct qemu_xs_watch *w = l->data;
+
+        while (w->events) {
+            struct watch_event *ev = w->events->data;
+
+            w->fn(w->opaque, ev->path);
+
+            w->events = g_list_remove(w->events, ev);
+            free_watch_event(ev);
+        }
+    }
+}
+
+static void xs_be_watch_cb(void *opaque, const char *path, const char *token)
+{
+    struct watch_event *ev = g_new0(struct watch_event, 1);
+    struct qemu_xs_watch *w = opaque;
+
+    /* We don't care about the token */
+    ev->path = g_strdup(path);
+    w->events = g_list_append(w->events, ev);
+
+    qemu_bh_schedule(w->h->watch_bh);
+}
+
+static struct qemu_xs_watch *xs_be_watch(struct qemu_xs_handle *h,
+                                         const char *path, xs_watch_fn fn,
+                                         void *opaque)
+{
+    struct qemu_xs_watch *w = g_new0(struct qemu_xs_watch, 1);
+    int err;
+
+    w->h = h;
+    w->fn = fn;
+    w->opaque = opaque;
+
+    err = xs_impl_watch(h->impl, DOMID_QEMU, path, NULL, xs_be_watch_cb, w);
+    if (err) {
+        errno = err;
+        g_free(w);
+        return NULL;
+    }
+
+    w->path = g_strdup(path);
+    h->watches = g_list_append(h->watches, w);
+    return w;
+}
+
+static void xs_be_unwatch(struct qemu_xs_handle *h, struct qemu_xs_watch *w)
+{
+    xs_impl_unwatch(h->impl, DOMID_QEMU, w->path, NULL, xs_be_watch_cb, w);
+
+    h->watches = g_list_remove(h->watches, w);
+    g_list_free_full(w->events, (GDestroyNotify)free_watch_event);
+    g_free(w->path);
+    g_free(w);
+}
+
+static xs_transaction_t xs_be_transaction_start(struct qemu_xs_handle *h)
+{
+    unsigned int new_tx = XBT_NULL;
+    int err = xs_impl_transaction_start(h->impl, DOMID_QEMU, &new_tx);
+    if (err) {
+        errno = err;
+        return XBT_NULL;
+    }
+    return new_tx;
+}
+
+static bool xs_be_transaction_end(struct qemu_xs_handle *h, xs_transaction_t t,
+                                  bool abort)
+{
+    int err = xs_impl_transaction_end(h->impl, DOMID_QEMU, t, !abort);
+    if (err) {
+        errno = err;
+        return false;
+    }
+    return true;
+}
+
+static struct qemu_xs_handle *xs_be_open(void)
+{
+    XenXenstoreState *s = xen_xenstore_singleton;
+    struct qemu_xs_handle *h;
+
+    if (!s && !s->impl) {
+        errno = -ENOSYS;
+        return NULL;
+    }
+
+    h = g_new0(struct qemu_xs_handle, 1);
+    h->impl = s->impl;
+
+    h->watch_bh = aio_bh_new(qemu_get_aio_context(), be_watch_bh, h);
+
+    return h;
+}
+
+static void xs_be_close(struct qemu_xs_handle *h)
+{
+    while (h->watches) {
+        struct qemu_xs_watch *w = h->watches->data;
+        xs_be_unwatch(h, w);
+    }
+
+    qemu_bh_delete(h->watch_bh);
+    g_free(h);
+}
+
+static struct xenstore_backend_ops emu_xenstore_backend_ops = {
+    .open = xs_be_open,
+    .close = xs_be_close,
+    .get_domain_path = xs_be_get_domain_path,
+    .directory = xs_be_directory,
+    .read = xs_be_read,
+    .write = xs_be_write,
+    .create = xs_be_create,
+    .destroy = xs_be_destroy,
+    .watch = xs_be_watch,
+    .unwatch = xs_be_unwatch,
+    .transaction_start = xs_be_transaction_start,
+    .transaction_end = xs_be_transaction_end,
+};
diff --git a/hw/i386/kvm/xenstore_impl.c b/hw/i386/kvm/xenstore_impl.c
new file mode 100644
index 0000000000..305fe75519
--- /dev/null
+++ b/hw/i386/kvm/xenstore_impl.c
@@ -0,0 +1,1927 @@
+/*
+ * QEMU Xen emulation: The actual implementation of XenStore
+ *
+ * Copyright © 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Authors: David Woodhouse <dwmw2@infradead.org>, Paul Durrant <paul@xen.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qom/object.h"
+
+#include "hw/xen/xen.h"
+
+#include "xen_xenstore.h"
+#include "xenstore_impl.h"
+
+#include "hw/xen/interface/io/xs_wire.h"
+
+#define XS_MAX_WATCHES          128
+#define XS_MAX_DOMAIN_NODES     1000
+#define XS_MAX_NODE_SIZE        2048
+#define XS_MAX_TRANSACTIONS     10
+#define XS_MAX_PERMS_PER_NODE   5
+
+#define XS_VALID_CHARS "abcdefghijklmnopqrstuvwxyz" \
+                       "ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
+                       "0123456789-/_"
+
+typedef struct XsNode {
+    uint32_t ref;
+    GByteArray *content;
+    GList *perms;
+    GHashTable *children;
+    uint64_t gencnt;
+    bool deleted_in_tx;
+    bool modified_in_tx;
+    unsigned int serialized_tx;
+#ifdef XS_NODE_UNIT_TEST
+    gchar *name; /* debug only */
+#endif
+} XsNode;
+
+typedef struct XsWatch {
+    struct XsWatch *next;
+    xs_impl_watch_fn *cb;
+    void *cb_opaque;
+    char *token;
+    unsigned int dom_id;
+    int rel_prefix;
+} XsWatch;
+
+typedef struct XsTransaction {
+    XsNode *root;
+    unsigned int nr_nodes;
+    unsigned int base_tx;
+    unsigned int tx_id;
+    unsigned int dom_id;
+} XsTransaction;
+
+struct XenstoreImplState {
+    XsNode *root;
+    unsigned int nr_nodes;
+    GHashTable *watches;
+    unsigned int nr_domu_watches;
+    GHashTable *transactions;
+    unsigned int nr_domu_transactions;
+    unsigned int root_tx;
+    unsigned int last_tx;
+    bool serialized;
+};
+
+
+static void nobble_tx(gpointer key, gpointer value, gpointer user_data)
+{
+    unsigned int *new_tx_id = user_data;
+    XsTransaction *tx = value;
+
+    if (tx->base_tx == *new_tx_id) {
+        /* Transactions based on XBT_NULL will always fail */
+        tx->base_tx = XBT_NULL;
+    }
+}
+
+static inline unsigned int next_tx(struct XenstoreImplState *s)
+{
+    unsigned int tx_id;
+
+    /* Find the next TX id which isn't either XBT_NULL or in use. */
+    do {
+        tx_id = ++s->last_tx;
+    } while (tx_id == XBT_NULL || tx_id == s->root_tx ||
+             g_hash_table_lookup(s->transactions, GINT_TO_POINTER(tx_id)));
+
+    /*
+     * It is vanishingly unlikely, but ensure that no outstanding transaction
+     * is based on the (previous incarnation of the) newly-allocated TX id.
+     */
+    g_hash_table_foreach(s->transactions, nobble_tx, &tx_id);
+
+    return tx_id;
+}
+
+static inline XsNode *xs_node_new(void)
+{
+    XsNode *n = g_new0(XsNode, 1);
+    n->ref = 1;
+
+#ifdef XS_NODE_UNIT_TEST
+    nr_xs_nodes++;
+    xs_node_list = g_list_prepend(xs_node_list, n);
+#endif
+    return n;
+}
+
+static inline XsNode *xs_node_ref(XsNode *n)
+{
+    /* With just 10 transactions, it can never get anywhere near this. */
+    g_assert(n->ref < INT_MAX);
+
+    g_assert(n->ref);
+    n->ref++;
+    return n;
+}
+
+static inline void xs_node_unref(XsNode *n)
+{
+    if (!n) {
+        return;
+    }
+    g_assert(n->ref);
+    if (--n->ref) {
+        return;
+    }
+
+    if (n->content) {
+        g_byte_array_unref(n->content);
+    }
+    if (n->perms) {
+        g_list_free_full(n->perms, g_free);
+    }
+    if (n->children) {
+        g_hash_table_unref(n->children);
+    }
+#ifdef XS_NODE_UNIT_TEST
+    g_free(n->name);
+    nr_xs_nodes--;
+    xs_node_list = g_list_remove(xs_node_list, n);
+#endif
+    g_free(n);
+}
+
+char *xs_perm_as_string(unsigned int perm, unsigned int domid)
+{
+    char letter;
+
+    switch (perm) {
+    case XS_PERM_READ | XS_PERM_WRITE:
+        letter = 'b';
+        break;
+    case XS_PERM_READ:
+        letter = 'r';
+        break;
+    case XS_PERM_WRITE:
+        letter = 'w';
+        break;
+    case XS_PERM_NONE:
+    default:
+        letter = 'n';
+        break;
+    }
+
+    return g_strdup_printf("%c%u", letter, domid);
+}
+
+static gpointer do_perm_copy(gconstpointer src, gpointer user_data)
+{
+    return g_strdup(src);
+}
+
+static XsNode *xs_node_create(const char *name, GList *perms)
+{
+    XsNode *n = xs_node_new();
+
+#ifdef XS_NODE_UNIT_TEST
+    if (name) {
+        n->name = g_strdup(name);
+    }
+#endif
+
+    n->perms = g_list_copy_deep(perms, do_perm_copy, NULL);
+
+    return n;
+}
+
+/* For copying from one hash table to another using g_hash_table_foreach() */
+static void do_child_insert(gpointer key, gpointer value, gpointer user_data)
+{
+    g_hash_table_insert(user_data, g_strdup(key), xs_node_ref(value));
+}
+
+static XsNode *xs_node_copy(XsNode *old)
+{
+    XsNode *n = xs_node_new();
+
+    n->gencnt = old->gencnt;
+
+#ifdef XS_NODE_UNIT_TEST
+    if (n->name) {
+        n->name = g_strdup(old->name);
+    }
+#endif
+
+    assert(old);
+    if (old->children) {
+        n->children = g_hash_table_new_full(g_str_hash, g_str_equal, g_free,
+                                            (GDestroyNotify)xs_node_unref);
+        g_hash_table_foreach(old->children, do_child_insert, n->children);
+    }
+    if (old->perms) {
+        n->perms = g_list_copy_deep(old->perms, do_perm_copy, NULL);
+    }
+    if (old->content) {
+        n->content = g_byte_array_ref(old->content);
+    }
+    return n;
+}
+
+/* Returns true if it made a change to the hash table */
+static bool xs_node_add_child(XsNode *n, const char *path_elem, XsNode *child)
+{
+    assert(!strchr(path_elem, '/'));
+
+    if (!child) {
+        assert(n->children);
+        return g_hash_table_remove(n->children, path_elem);
+    }
+
+#ifdef XS_NODE_UNIT_TEST
+    g_free(child->name);
+    child->name = g_strdup(path_elem);
+#endif
+    if (!n->children) {
+        n->children = g_hash_table_new_full(g_str_hash, g_str_equal, g_free,
+                                            (GDestroyNotify)xs_node_unref);
+    }
+
+    /*
+     * The documentation for g_hash_table_insert() says that it "returns a
+     * boolean value to indicate whether the newly added value was already
+     * in the hash table or not."
+     *
+     * It could perhaps be clearer that returning TRUE means it wasn't,
+     */
+    return g_hash_table_insert(n->children, g_strdup(path_elem), child);
+}
+
+struct walk_op {
+    struct XenstoreImplState *s;
+    char path[XENSTORE_ABS_PATH_MAX + 2]; /* Two NUL terminators */
+    int (*op_fn)(XsNode **n, struct walk_op *op);
+    void *op_opaque;
+    void *op_opaque2;
+
+    GList *watches;
+    unsigned int dom_id;
+    unsigned int tx_id;
+
+    /* The number of nodes which will exist in the tree if this op succeeds. */
+    unsigned int new_nr_nodes;
+
+    /*
+     * This is maintained on the way *down* the walk to indicate
+     * whether nodes can be modified in place or whether COW is
+     * required. It starts off being true, as we're always going to
+     * replace the root node. If we walk into a shared subtree it
+     * becomes false. If we start *creating* new nodes for a write,
+     * it becomes true again.
+     *
+     * Do not use it on the way back up.
+     */
+    bool inplace;
+    bool mutating;
+    bool create_dirs;
+    bool in_transaction;
+
+    /* Tracking during recursion so we know which is first. */
+    bool deleted_in_tx;
+};
+
+static void fire_watches(struct walk_op *op, bool parents)
+{
+    GList *l = NULL;
+    XsWatch *w;
+
+    if (!op->mutating || op->in_transaction) {
+        return;
+    }
+
+    if (parents) {
+        l = op->watches;
+    }
+
+    w = g_hash_table_lookup(op->s->watches, op->path);
+    while (w || l) {
+        if (!w) {
+            /* Fire the parent nodes from 'op' if asked to */
+            w = l->data;
+            l = l->next;
+            continue;
+        }
+
+        assert(strlen(op->path) > w->rel_prefix);
+        w->cb(w->cb_opaque, op->path + w->rel_prefix, w->token);
+
+        w = w->next;
+    }
+}
+
+static int xs_node_add_content(XsNode **n, struct walk_op *op)
+{
+    GByteArray *data = op->op_opaque;
+
+    if (op->dom_id) {
+        /*
+         * The real XenStored includes permissions and names of child nodes
+         * in the calculated datasize but life's too short. For a single
+         * tenant internal XenStore, we don't have to be quite as pedantic.
+         */
+        if (data->len > XS_MAX_NODE_SIZE) {
+            return E2BIG;
+        }
+    }
+    /* We *are* the node to be written. Either this or a copy. */
+    if (!op->inplace) {
+        XsNode *old = *n;
+        *n = xs_node_copy(old);
+        xs_node_unref(old);
+    }
+
+    if ((*n)->content) {
+        g_byte_array_unref((*n)->content);
+    }
+    (*n)->content = g_byte_array_ref(data);
+    if (op->tx_id != XBT_NULL) {
+        (*n)->modified_in_tx = true;
+    }
+    return 0;
+}
+
+static int xs_node_get_content(XsNode **n, struct walk_op *op)
+{
+    GByteArray *data = op->op_opaque;
+    GByteArray *node_data;
+
+    assert(op->inplace);
+    assert(*n);
+
+    node_data = (*n)->content;
+    if (node_data) {
+        g_byte_array_append(data, node_data->data, node_data->len);
+    }
+
+    return 0;
+}
+
+static int node_rm_recurse(gpointer key, gpointer value, gpointer user_data)
+{
+    struct walk_op *op = user_data;
+    int path_len = strlen(op->path);
+    int key_len = strlen(key);
+    XsNode *n = value;
+    bool this_inplace = op->inplace;
+
+    if (n->ref != 1) {
+        op->inplace = 0;
+    }
+
+    assert(key_len + path_len + 2 <= sizeof(op->path));
+    op->path[path_len] = '/';
+    memcpy(op->path + path_len + 1, key, key_len + 1);
+
+    if (n->children) {
+        g_hash_table_foreach_remove(n->children, node_rm_recurse, op);
+    }
+    op->new_nr_nodes--;
+
+    /*
+     * Fire watches on *this* node but not the parents because they are
+     * going to be deleted too, so the watch will fire for them anyway.
+     */
+    fire_watches(op, false);
+    op->path[path_len] = '\0';
+
+    /*
+     * Actually deleting the child here is just an optimisation; if we
+     * don't then the final unref on the topmost victim will just have
+     * to cascade down again repeating all the g_hash_table_foreach()
+     * calls.
+     */
+    return this_inplace;
+}
+
+static XsNode *xs_node_copy_deleted(XsNode *old, struct walk_op *op);
+static void copy_deleted_recurse(gpointer key, gpointer value,
+                                 gpointer user_data)
+{
+    struct walk_op *op = user_data;
+    GHashTable *siblings = op->op_opaque2;
+    XsNode *n = xs_node_copy_deleted(value, op);
+
+    /*
+     * Reinsert the deleted_in_tx copy of the node into the parent's
+     * 'children' hash table. Having stashed it from op->op_opaque2
+     * before the recursive call to xs_node_copy_deleted() scribbled
+     * over it.
+     */
+    g_hash_table_insert(siblings, g_strdup(key), n);
+}
+
+static XsNode *xs_node_copy_deleted(XsNode *old, struct walk_op *op)
+{
+    XsNode *n = xs_node_new();
+
+    n->gencnt = old->gencnt;
+
+#ifdef XS_NODE_UNIT_TEST
+    if (old->name) {
+        n->name = g_strdup(old->name);
+    }
+#endif
+
+    if (old->children) {
+        n->children = g_hash_table_new_full(g_str_hash, g_str_equal, g_free,
+                                            (GDestroyNotify)xs_node_unref);
+        op->op_opaque2 = n->children;
+        g_hash_table_foreach(old->children, copy_deleted_recurse, op);
+    }
+    if (old->perms) {
+        n->perms = g_list_copy_deep(old->perms, do_perm_copy, NULL);
+    }
+    n->deleted_in_tx = true;
+    /* If it gets resurrected we only fire a watch if it lost its content */
+    if (old->content) {
+        n->modified_in_tx = true;
+    }
+    op->new_nr_nodes--;
+    return n;
+}
+
+static int xs_node_rm(XsNode **n, struct walk_op *op)
+{
+    bool this_inplace = op->inplace;
+
+    if (op->tx_id != XBT_NULL) {
+        /* It's not trivial to do inplace handling for this one */
+        XsNode *old = *n;
+        *n = xs_node_copy_deleted(old, op);
+        xs_node_unref(old);
+        return 0;
+    }
+
+    /* Fire watches for, and count, nodes in the subtree which get deleted */
+    if ((*n)->children) {
+        g_hash_table_foreach_remove((*n)->children, node_rm_recurse, op);
+    }
+    op->new_nr_nodes--;
+
+    if (this_inplace) {
+        xs_node_unref(*n);
+    }
+    *n = NULL;
+    return 0;
+}
+
+static int xs_node_get_perms(XsNode **n, struct walk_op *op)
+{
+    GList **perms = op->op_opaque;
+
+    assert(op->inplace);
+    assert(*n);
+
+    *perms = g_list_copy_deep((*n)->perms, do_perm_copy, NULL);
+    return 0;
+}
+
+static void parse_perm(const char *perm, char *letter, unsigned int *dom_id)
+{
+    unsigned int n = sscanf(perm, "%c%u", letter, dom_id);
+
+    assert(n == 2);
+}
+
+static bool can_access(unsigned int dom_id, GList *perms, const char *letters)
+{
+    unsigned int i, n;
+    char perm_letter;
+    unsigned int perm_dom_id;
+    bool access;
+
+    if (dom_id == 0) {
+        return true;
+    }
+
+    n = g_list_length(perms);
+    assert(n >= 1);
+
+    /*
+     * The dom_id of the first perm is the owner, and the owner always has
+     * read-write access.
+     */
+    parse_perm(g_list_nth_data(perms, 0), &perm_letter, &perm_dom_id);
+    if (dom_id == perm_dom_id) {
+        return true;
+    }
+
+    /*
+     * The letter of the first perm specified the default access for all other
+     * domains.
+     */
+    access = !!strchr(letters, perm_letter);
+    for (i = 1; i < n; i++) {
+        parse_perm(g_list_nth_data(perms, i), &perm_letter, &perm_dom_id);
+        if (dom_id != perm_dom_id) {
+            continue;
+        }
+        access = !!strchr(letters, perm_letter);
+    }
+
+    return access;
+}
+
+static int xs_node_set_perms(XsNode **n, struct walk_op *op)
+{
+    GList *perms = op->op_opaque;
+
+    if (op->dom_id) {
+        unsigned int perm_dom_id;
+        char perm_letter;
+
+        /* A guest may not change permissions on nodes it does not own */
+        if (!can_access(op->dom_id, (*n)->perms, "")) {
+            return EPERM;
+        }
+
+        /* A guest may not change the owner of a node it owns. */
+        parse_perm(perms->data, &perm_letter, &perm_dom_id);
+        if (perm_dom_id != op->dom_id) {
+            return EPERM;
+        }
+
+        if (g_list_length(perms) > XS_MAX_PERMS_PER_NODE) {
+            return ENOSPC;
+        }
+    }
+
+    /* We *are* the node to be written. Either this or a copy. */
+    if (!op->inplace) {
+        XsNode *old = *n;
+        *n = xs_node_copy(old);
+        xs_node_unref(old);
+    }
+
+    if ((*n)->perms) {
+        g_list_free_full((*n)->perms, g_free);
+    }
+    (*n)->perms = g_list_copy_deep(perms, do_perm_copy, NULL);
+    if (op->tx_id != XBT_NULL) {
+        (*n)->modified_in_tx = true;
+    }
+    return 0;
+}
+
+/*
+ * Passed a full reference in *n which it may free if it needs to COW.
+ *
+ * When changing the tree, the op->inplace flag indicates whether this
+ * node may be modified in place (i.e. it and all its parents had a
+ * refcount of one). If walking down the tree we find a node whose
+ * refcount is higher, we must clear op->inplace and COW from there
+ * down. Unless we are creating new nodes as scaffolding for a write
+ * (which works like 'mkdir -p' does). In which case those newly
+ * created nodes can (and must) be modified in place again.
+ */
+static int xs_node_walk(XsNode **n, struct walk_op *op)
+{
+    char *child_name = NULL;
+    size_t namelen;
+    XsNode *old = *n, *child = NULL;
+    bool stole_child = false;
+    bool this_inplace;
+    XsWatch *watch;
+    int err;
+
+    namelen = strlen(op->path);
+    watch = g_hash_table_lookup(op->s->watches, op->path);
+
+    /* Is there a child, or do we hit the double-NUL termination? */
+    if (op->path[namelen + 1]) {
+        char *slash;
+        child_name = op->path + namelen + 1;
+        slash = strchr(child_name, '/');
+        if (slash) {
+            *slash = '\0';
+        }
+        op->path[namelen] = '/';
+    }
+
+    /* If we walk into a subtree which is shared, we must COW */
+    if (op->mutating && old->ref != 1) {
+        op->inplace = false;
+    }
+
+    if (!child_name) {
+        const char *letters = op->mutating ? "wb" : "rb";
+
+        if (!can_access(op->dom_id, old->perms, letters)) {
+            err = EACCES;
+            goto out;
+        }
+
+        /* This is the actual node on which the operation shall be performed */
+        err = op->op_fn(n, op);
+        if (!err) {
+            fire_watches(op, true);
+        }
+        goto out;
+    }
+
+    /* op->inplace will be further modified during the recursion */
+    this_inplace = op->inplace;
+
+    if (old && old->children) {
+        child = g_hash_table_lookup(old->children, child_name);
+        /* This is a *weak* reference to 'child', owned by the hash table */
+    }
+
+    if (child) {
+        if (child->deleted_in_tx) {
+            assert(child->ref == 1);
+            /* Cannot actually set child->deleted_in_tx = false until later */
+        }
+        xs_node_ref(child);
+        /*
+         * Now we own it too. But if we can modify inplace, that's going to
+         * foil the check and force it to COW. We want to be the *only* owner
+         * so that it can be modified in place, so remove it from the hash
+         * table in that case. We'll add it (or its replacement) back later.
+         */
+        if (op->mutating && this_inplace) {
+            g_hash_table_remove(old->children, child_name);
+            stole_child = true;
+        }
+    } else if (op->create_dirs) {
+        assert(op->mutating);
+
+        if (!can_access(op->dom_id, old->perms, "wb")) {
+            err = EACCES;
+            goto out;
+        }
+
+        if (op->dom_id && op->new_nr_nodes >= XS_MAX_DOMAIN_NODES) {
+            err = ENOSPC;
+            goto out;
+        }
+
+        child = xs_node_create(child_name, old->perms);
+        op->new_nr_nodes++;
+
+        /*
+         * If we're creating a new child, we can clearly modify it (and its
+         * children) in place from here on down.
+         */
+        op->inplace = true;
+    } else {
+        err = ENOENT;
+        goto out;
+    }
+
+    /*
+     * If there's a watch on this node, add it to the list to be fired
+     * (with the correct full pathname for the modified node) at the end.
+     */
+    if (watch) {
+        op->watches = g_list_append(op->watches, watch);
+    }
+
+    /*
+     * Except for the temporary child-stealing as noted, our node has not
+     * changed yet. We don't yet know the overall operation will complete.
+     */
+    err = xs_node_walk(&child, op);
+
+    if (watch) {
+        op->watches = g_list_remove(op->watches, watch);
+    }
+
+    if (err || !op->mutating) {
+        if (stole_child) {
+            /* Put it back as it was. */
+            g_hash_table_replace(old->children, g_strdup(child_name), child);
+        } else {
+            xs_node_unref(child);
+        }
+        goto out;
+    }
+
+    /*
+     * Now we know the operation has completed successfully and we're on
+     * the way back up. Make the change, substituting 'child' in the
+     * node at our level.
+     */
+    if (!this_inplace) {
+        *n = xs_node_copy(old);
+        xs_node_unref(old);
+    }
+
+    /*
+     * If we resurrected a deleted_in_tx node, we can mark it as no longer
+     * deleted now that we know the overall operation has succeeded.
+     */
+    if (op->create_dirs && child && child->deleted_in_tx) {
+        op->new_nr_nodes++;
+        child->deleted_in_tx = false;
+    }
+
+    /*
+     * The child may be NULL here, for a remove operation. Either way,
+     * xs_node_add_child() will do the right thing and return a value
+     * indicating whether it changed the parent's hash table or not.
+     *
+     * We bump the parent gencnt if it adds a child that we *didn't*
+     * steal from it in the first place, or if child==NULL and was
+     * thus removed (whether we stole it earlier and didn't put it
+     * back, or xs_node_add_child() actually removed it now).
+     */
+    if ((xs_node_add_child(*n, child_name, child) && !stole_child) || !child) {
+        (*n)->gencnt++;
+    }
+
+ out:
+    op->path[namelen] = '\0';
+    if (!namelen) {
+        assert(!op->watches);
+        /*
+         * On completing the recursion back up the path walk and reaching the
+         * top, assign the new node count if the operation was successful. If
+         * the main tree was changed, bump its tx ID so that outstanding
+         * transactions correctly fail. But don't bump it every time; only
+         * if it makes a difference.
+         */
+        if (!err && op->mutating) {
+            if (!op->in_transaction) {
+                if (op->s->root_tx != op->s->last_tx) {
+                    op->s->root_tx = next_tx(op->s);
+                }
+                op->s->nr_nodes = op->new_nr_nodes;
+            } else {
+                XsTransaction *tx = g_hash_table_lookup(op->s->transactions,
+                                                        GINT_TO_POINTER(op->tx_id));
+                assert(tx);
+                tx->nr_nodes = op->new_nr_nodes;
+            }
+        }
+    }
+    return err;
+}
+
+static void append_directory_item(gpointer key, gpointer value,
+                                  gpointer user_data)
+{
+    GList **items = user_data;
+
+    *items = g_list_insert_sorted(*items, g_strdup(key), (GCompareFunc)strcmp);
+}
+
+/* Populates items with char * names which caller must free. */
+static int xs_node_directory(XsNode **n, struct walk_op *op)
+{
+    GList **items = op->op_opaque;
+
+    assert(op->inplace);
+    assert(*n);
+
+    if ((*n)->children) {
+        g_hash_table_foreach((*n)->children, append_directory_item, items);
+    }
+
+    if (op->op_opaque2) {
+        *(uint64_t *)op->op_opaque2 = (*n)->gencnt;
+    }
+
+    return 0;
+}
+
+static int validate_path(char *outpath, const char *userpath,
+                         unsigned int dom_id)
+{
+    size_t i, pathlen = strlen(userpath);
+
+    if (!pathlen || userpath[pathlen] == '/' || strstr(userpath, "//")) {
+        return EINVAL;
+    }
+    for (i = 0; i < pathlen; i++) {
+        if (!strchr(XS_VALID_CHARS, userpath[i])) {
+            return EINVAL;
+        }
+    }
+    if (userpath[0] == '/') {
+        if (pathlen > XENSTORE_ABS_PATH_MAX) {
+            return E2BIG;
+        }
+        memcpy(outpath, userpath, pathlen + 1);
+    } else {
+        if (pathlen > XENSTORE_REL_PATH_MAX) {
+            return E2BIG;
+        }
+        snprintf(outpath, XENSTORE_ABS_PATH_MAX, "/local/domain/%u/%s", dom_id,
+                 userpath);
+    }
+    return 0;
+}
+
+
+static int init_walk_op(XenstoreImplState *s, struct walk_op *op,
+                        xs_transaction_t tx_id, unsigned int dom_id,
+                        const char *path, XsNode ***rootp)
+{
+    int ret = validate_path(op->path, path, dom_id);
+    if (ret) {
+        return ret;
+    }
+
+    /*
+     * We use *two* NUL terminators at the end of the path, as during the walk
+     * we will temporarily turn each '/' into a NUL to allow us to use that
+     * path element for the lookup.
+     */
+    op->path[strlen(op->path) + 1] = '\0';
+    op->watches = NULL;
+    op->path[0] = '\0';
+    op->inplace = true;
+    op->mutating = false;
+    op->create_dirs = false;
+    op->in_transaction = false;
+    op->dom_id = dom_id;
+    op->tx_id = tx_id;
+    op->s = s;
+
+    if (tx_id == XBT_NULL) {
+        *rootp = &s->root;
+        op->new_nr_nodes = s->nr_nodes;
+    } else {
+        XsTransaction *tx = g_hash_table_lookup(s->transactions,
+                                                GINT_TO_POINTER(tx_id));
+        if (!tx) {
+            return ENOENT;
+        }
+        *rootp = &tx->root;
+        op->new_nr_nodes = tx->nr_nodes;
+        op->in_transaction = true;
+    }
+
+    return 0;
+}
+
+int xs_impl_read(XenstoreImplState *s, unsigned int dom_id,
+                 xs_transaction_t tx_id, const char *path, GByteArray *data)
+{
+    /*
+     * The data GByteArray shall exist, and will be freed by caller.
+     * Just g_byte_array_append() to it.
+     */
+    struct walk_op op;
+    XsNode **n;
+    int ret;
+
+    ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
+    if (ret) {
+        return ret;
+    }
+    op.op_fn = xs_node_get_content;
+    op.op_opaque = data;
+    return xs_node_walk(n, &op);
+}
+
+int xs_impl_write(XenstoreImplState *s, unsigned int dom_id,
+                  xs_transaction_t tx_id, const char *path, GByteArray *data)
+{
+    /*
+     * The data GByteArray shall exist, will be freed by caller. You are
+     * free to use g_byte_array_steal() and keep the data. Or just ref it.
+     */
+    struct walk_op op;
+    XsNode **n;
+    int ret;
+
+    ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
+    if (ret) {
+        return ret;
+    }
+    op.op_fn = xs_node_add_content;
+    op.op_opaque = data;
+    op.mutating = true;
+    op.create_dirs = true;
+    return xs_node_walk(n, &op);
+}
+
+int xs_impl_directory(XenstoreImplState *s, unsigned int dom_id,
+                      xs_transaction_t tx_id, const char *path,
+                      uint64_t *gencnt, GList **items)
+{
+    /*
+     * The items are (char *) to be freed by caller. Although it's consumed
+     * immediately so if you want to change it to (const char *) and keep
+     * them, go ahead and change the caller.
+     */
+    struct walk_op op;
+    XsNode **n;
+    int ret;
+
+    ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
+    if (ret) {
+        return ret;
+    }
+    op.op_fn = xs_node_directory;
+    op.op_opaque = items;
+    op.op_opaque2 = gencnt;
+    return xs_node_walk(n, &op);
+}
+
+int xs_impl_transaction_start(XenstoreImplState *s, unsigned int dom_id,
+                              xs_transaction_t *tx_id)
+{
+    XsTransaction *tx;
+
+    if (*tx_id != XBT_NULL) {
+        return EINVAL;
+    }
+
+    if (dom_id && s->nr_domu_transactions >= XS_MAX_TRANSACTIONS) {
+        return ENOSPC;
+    }
+
+    tx = g_new0(XsTransaction, 1);
+
+    tx->nr_nodes = s->nr_nodes;
+    tx->tx_id = next_tx(s);
+    tx->base_tx = s->root_tx;
+    tx->root = xs_node_ref(s->root);
+    tx->dom_id = dom_id;
+
+    g_hash_table_insert(s->transactions, GINT_TO_POINTER(tx->tx_id), tx);
+    if (dom_id) {
+        s->nr_domu_transactions++;
+    }
+    *tx_id = tx->tx_id;
+    return 0;
+}
+
+static gboolean tx_commit_walk(gpointer key, gpointer value,
+                               gpointer user_data)
+{
+    struct walk_op *op = user_data;
+    int path_len = strlen(op->path);
+    int key_len = strlen(key);
+    bool fire_parents = true;
+    XsWatch *watch;
+    XsNode *n = value;
+
+    if (n->ref != 1) {
+        return false;
+    }
+
+    if (n->deleted_in_tx) {
+        /*
+         * We fire watches on our parents if we are the *first* node
+         * to be deleted (the topmost one). This matches the behaviour
+         * when deleting in the live tree.
+         */
+        fire_parents = !op->deleted_in_tx;
+
+        /* Only used on the way down so no need to clear it later */
+        op->deleted_in_tx = true;
+    }
+
+    assert(key_len + path_len + 2 <= sizeof(op->path));
+    op->path[path_len] = '/';
+    memcpy(op->path + path_len + 1, key, key_len + 1);
+
+    watch = g_hash_table_lookup(op->s->watches, op->path);
+    if (watch) {
+        op->watches = g_list_append(op->watches, watch);
+    }
+
+    if (n->children) {
+        g_hash_table_foreach_remove(n->children, tx_commit_walk, op);
+    }
+
+    if (watch) {
+        op->watches = g_list_remove(op->watches, watch);
+    }
+
+    /*
+     * Don't fire watches if this node was only copied because a
+     * descendent was changed. The modified_in_tx flag indicates the
+     * ones which were really changed.
+     */
+    if (n->modified_in_tx || n->deleted_in_tx) {
+        fire_watches(op, fire_parents);
+        n->modified_in_tx = false;
+    }
+    op->path[path_len] = '\0';
+
+    /* Deleted nodes really do get expunged when we commit */
+    return n->deleted_in_tx;
+}
+
+static int transaction_commit(XenstoreImplState *s, XsTransaction *tx)
+{
+    struct walk_op op;
+    XsNode **n;
+
+    if (s->root_tx != tx->base_tx) {
+        return EAGAIN;
+    }
+    xs_node_unref(s->root);
+    s->root = tx->root;
+    tx->root = NULL;
+    s->root_tx = tx->tx_id;
+    s->nr_nodes = tx->nr_nodes;
+
+    init_walk_op(s, &op, XBT_NULL, tx->dom_id, "/", &n);
+    op.deleted_in_tx = false;
+    op.mutating = true;
+
+    /*
+     * Walk the new root and fire watches on any node which has a
+     * refcount of one (which is therefore unique to this transaction).
+     */
+    if (s->root->children) {
+        g_hash_table_foreach_remove(s->root->children, tx_commit_walk, &op);
+    }
+
+    return 0;
+}
+
+int xs_impl_transaction_end(XenstoreImplState *s, unsigned int dom_id,
+                            xs_transaction_t tx_id, bool commit)
+{
+    int ret = 0;
+    XsTransaction *tx = g_hash_table_lookup(s->transactions,
+                                            GINT_TO_POINTER(tx_id));
+
+    if (!tx || tx->dom_id != dom_id) {
+        return ENOENT;
+    }
+
+    if (commit) {
+        ret = transaction_commit(s, tx);
+    }
+
+    g_hash_table_remove(s->transactions, GINT_TO_POINTER(tx_id));
+    if (dom_id) {
+        assert(s->nr_domu_transactions);
+        s->nr_domu_transactions--;
+    }
+    return ret;
+}
+
+int xs_impl_rm(XenstoreImplState *s, unsigned int dom_id,
+               xs_transaction_t tx_id, const char *path)
+{
+    struct walk_op op;
+    XsNode **n;
+    int ret;
+
+    ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
+    if (ret) {
+        return ret;
+    }
+    op.op_fn = xs_node_rm;
+    op.mutating = true;
+    return xs_node_walk(n, &op);
+}
+
+int xs_impl_get_perms(XenstoreImplState *s, unsigned int dom_id,
+                      xs_transaction_t tx_id, const char *path, GList **perms)
+{
+    struct walk_op op;
+    XsNode **n;
+    int ret;
+
+    ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
+    if (ret) {
+        return ret;
+    }
+    op.op_fn = xs_node_get_perms;
+    op.op_opaque = perms;
+    return xs_node_walk(n, &op);
+}
+
+static void is_valid_perm(gpointer data, gpointer user_data)
+{
+    char *perm = data;
+    bool *valid = user_data;
+    char letter;
+    unsigned int dom_id;
+
+    if (!*valid) {
+        return;
+    }
+
+    if (sscanf(perm, "%c%u", &letter, &dom_id) != 2) {
+        *valid = false;
+        return;
+    }
+
+    switch (letter) {
+    case 'n':
+    case 'r':
+    case 'w':
+    case 'b':
+        break;
+
+    default:
+        *valid = false;
+        break;
+    }
+}
+
+int xs_impl_set_perms(XenstoreImplState *s, unsigned int dom_id,
+                      xs_transaction_t tx_id, const char *path, GList *perms)
+{
+    struct walk_op op;
+    XsNode **n;
+    bool valid = true;
+    int ret;
+
+    if (!g_list_length(perms)) {
+        return EINVAL;
+    }
+
+    g_list_foreach(perms, is_valid_perm, &valid);
+    if (!valid) {
+        return EINVAL;
+    }
+
+    ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
+    if (ret) {
+        return ret;
+    }
+    op.op_fn = xs_node_set_perms;
+    op.op_opaque = perms;
+    op.mutating = true;
+    return xs_node_walk(n, &op);
+}
+
+static int do_xs_impl_watch(XenstoreImplState *s, unsigned int dom_id,
+                            const char *path, const char *token,
+                            xs_impl_watch_fn fn, void *opaque)
+
+{
+    char abspath[XENSTORE_ABS_PATH_MAX + 1];
+    XsWatch *w, *l;
+    int ret;
+
+    ret = validate_path(abspath, path, dom_id);
+    if (ret) {
+        return ret;
+    }
+
+    /* Check for duplicates */
+    l = w = g_hash_table_lookup(s->watches, abspath);
+    while (w) {
+        if (!g_strcmp0(token, w->token) &&  opaque == w->cb_opaque &&
+            fn == w->cb && dom_id == w->dom_id) {
+            return EEXIST;
+        }
+        w = w->next;
+    }
+
+    if (dom_id && s->nr_domu_watches >= XS_MAX_WATCHES) {
+        return E2BIG;
+    }
+
+    w = g_new0(XsWatch, 1);
+    w->token = g_strdup(token);
+    w->cb = fn;
+    w->cb_opaque = opaque;
+    w->dom_id = dom_id;
+    w->rel_prefix = strlen(abspath) - strlen(path);
+
+    /* l was looked up above when checking for duplicates */
+    if (l) {
+        w->next = l->next;
+        l->next = w;
+    } else {
+        g_hash_table_insert(s->watches, g_strdup(abspath), w);
+    }
+    if (dom_id) {
+        s->nr_domu_watches++;
+    }
+
+    return 0;
+}
+
+int xs_impl_watch(XenstoreImplState *s, unsigned int dom_id, const char *path,
+                  const char *token, xs_impl_watch_fn fn, void *opaque)
+{
+    int ret = do_xs_impl_watch(s, dom_id, path, token, fn, opaque);
+
+    if (!ret) {
+        /* A new watch should fire immediately */
+        fn(opaque, path, token);
+    }
+
+    return ret;
+}
+
+static XsWatch *free_watch(XenstoreImplState *s, XsWatch *w)
+{
+    XsWatch *next = w->next;
+
+    if (w->dom_id) {
+        assert(s->nr_domu_watches);
+        s->nr_domu_watches--;
+    }
+
+    g_free(w->token);
+    g_free(w);
+
+    return next;
+}
+
+int xs_impl_unwatch(XenstoreImplState *s, unsigned int dom_id,
+                    const char *path, const char *token,
+                    xs_impl_watch_fn fn, void *opaque)
+{
+    char abspath[XENSTORE_ABS_PATH_MAX + 1];
+    XsWatch *w, **l;
+    int ret;
+
+    ret = validate_path(abspath, path, dom_id);
+    if (ret) {
+        return ret;
+    }
+
+    w = g_hash_table_lookup(s->watches, abspath);
+    if (!w) {
+        return ENOENT;
+    }
+
+    /*
+     * The hash table contains the first element of a list of
+     * watches. Removing the first element in the list is a
+     * special case because we have to update the hash table to
+     * point to the next (or remove it if there's nothing left).
+     */
+    if (!g_strcmp0(token, w->token) && fn == w->cb && opaque == w->cb_opaque &&
+        dom_id == w->dom_id) {
+        if (w->next) {
+            /* Insert the previous 'next' into the hash table */
+            g_hash_table_insert(s->watches, g_strdup(abspath), w->next);
+        } else {
+            /* Nothing left; remove from hash table */
+            g_hash_table_remove(s->watches, abspath);
+        }
+        free_watch(s, w);
+        return 0;
+    }
+
+    /*
+     * We're all done messing with the hash table because the element
+     * it points to has survived the cull. Now it's just a simple
+     * linked list removal operation.
+     */
+    for (l = &w->next; *l; l = &w->next) {
+        w = *l;
+
+        if (!g_strcmp0(token, w->token) && fn == w->cb &&
+            opaque != w->cb_opaque && dom_id == w->dom_id) {
+            *l = free_watch(s, w);
+            return 0;
+        }
+    }
+
+    return ENOENT;
+}
+
+int xs_impl_reset_watches(XenstoreImplState *s, unsigned int dom_id)
+{
+    char **watch_paths;
+    guint nr_watch_paths;
+    guint i;
+
+    watch_paths = (char **)g_hash_table_get_keys_as_array(s->watches,
+                                                          &nr_watch_paths);
+
+    for (i = 0; i < nr_watch_paths; i++) {
+        XsWatch *w1 = g_hash_table_lookup(s->watches, watch_paths[i]);
+        XsWatch *w2, *w, **l;
+
+        /*
+         * w1 is the original list. The hash table has this pointer.
+         * w2 is the head of our newly-filtered list.
+         * w and l are temporary for processing. w is somewhat redundant
+         * with *l but makes my eyes bleed less.
+         */
+
+        w = w2 = w1;
+        l = &w;
+        while (w) {
+            if (w->dom_id == dom_id) {
+                /* If we're freeing the head of the list, bump w2 */
+                if (w2 == w) {
+                    w2 = w->next;
+                }
+                *l = free_watch(s, w);
+            } else {
+                l = &w->next;
+            }
+            w = *l;
+        }
+        /*
+         * If the head of the list survived the cull, we don't need to
+         * touch the hash table and we're done with this path. Else...
+         */
+        if (w1 != w2) {
+            g_hash_table_steal(s->watches, watch_paths[i]);
+
+            /*
+             * It was already freed. (Don't worry, this whole thing is
+             * single-threaded and nobody saw it in the meantime). And
+             * having *stolen* it, we now own the watch_paths[i] string
+             * so if we don't give it back to the hash table, we need
+             * to free it.
+             */
+            if (w2) {
+                g_hash_table_insert(s->watches, watch_paths[i], w2);
+            } else {
+                g_free(watch_paths[i]);
+            }
+        }
+    }
+    g_free(watch_paths);
+    return 0;
+}
+
+static void xs_tx_free(void *_tx)
+{
+    XsTransaction *tx = _tx;
+    if (tx->root) {
+        xs_node_unref(tx->root);
+    }
+    g_free(tx);
+}
+
+XenstoreImplState *xs_impl_create(unsigned int dom_id)
+{
+    XenstoreImplState *s = g_new0(XenstoreImplState, 1);
+    GList *perms;
+
+    s->watches = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, NULL);
+    s->transactions = g_hash_table_new_full(g_direct_hash, g_direct_equal,
+                                            NULL, xs_tx_free);
+
+    perms = g_list_append(NULL, xs_perm_as_string(XS_PERM_NONE, 0));
+    s->root = xs_node_create("/", perms);
+    g_list_free_full(perms, g_free);
+    s->nr_nodes = 1;
+
+    s->root_tx = s->last_tx = 1;
+    return s;
+}
+
+
+static void clear_serialized_tx(gpointer key, gpointer value, gpointer opaque)
+{
+    XsNode *n = value;
+
+    n->serialized_tx = XBT_NULL;
+    if (n->children) {
+        g_hash_table_foreach(n->children, clear_serialized_tx, NULL);
+    }
+}
+
+static void clear_tx_serialized_tx(gpointer key, gpointer value,
+                                   gpointer opaque)
+{
+    XsTransaction *t = value;
+
+    clear_serialized_tx(NULL, t->root, NULL);
+}
+
+static void write_be32(GByteArray *save, uint32_t val)
+{
+    uint32_t be = htonl(val);
+    g_byte_array_append(save, (void *)&be, sizeof(be));
+}
+
+
+struct save_state {
+    GByteArray *bytes;
+    unsigned int tx_id;
+};
+
+#define MODIFIED_IN_TX  (1U << 0)
+#define DELETED_IN_TX   (1U << 1)
+#define NODE_REF        (1U << 2)
+
+static void save_node(gpointer key, gpointer value, gpointer opaque)
+{
+    struct save_state *ss = opaque;
+    XsNode *n = value;
+    char *name = key;
+    uint8_t flag = 0;
+
+    /* Child nodes (i.e. anything but the root) have a name */
+    if (name) {
+        g_byte_array_append(ss->bytes, key, strlen(key) + 1);
+    }
+
+    /*
+     * If we already wrote this node, refer to the previous copy.
+     * There's no rename/move in XenStore, so all we need to find
+     * it is the tx_id of the transation in which it exists. Which
+     * may be the root tx.
+     */
+    if (n->serialized_tx != XBT_NULL) {
+        flag = NODE_REF;
+        g_byte_array_append(ss->bytes, &flag, 1);
+        write_be32(ss->bytes, n->serialized_tx);
+    } else {
+        GList *l;
+        n->serialized_tx = ss->tx_id;
+
+        if (n->modified_in_tx) {
+            flag |= MODIFIED_IN_TX;
+        }
+        if (n->deleted_in_tx) {
+            flag |= DELETED_IN_TX;
+        }
+        g_byte_array_append(ss->bytes, &flag, 1);
+
+        if (n->content) {
+            write_be32(ss->bytes, n->content->len);
+            g_byte_array_append(ss->bytes, n->content->data, n->content->len);
+        } else {
+            write_be32(ss->bytes, 0);
+        }
+
+        for (l = n->perms; l; l = l->next) {
+            g_byte_array_append(ss->bytes, l->data, strlen(l->data) + 1);
+        }
+        /* NUL termination after perms */
+        g_byte_array_append(ss->bytes, (void *)"", 1);
+
+        if (n->children) {
+            g_hash_table_foreach(n->children, save_node, ss);
+        }
+        /* NUL termination after children (child name is NUL) */
+        g_byte_array_append(ss->bytes, (void *)"", 1);
+    }
+}
+
+static void save_tree(struct save_state *ss, uint32_t tx_id, XsNode *root)
+{
+    write_be32(ss->bytes, tx_id);
+    ss->tx_id = tx_id;
+    save_node(NULL, root, ss);
+}
+
+static void save_tx(gpointer key, gpointer value, gpointer opaque)
+{
+    uint32_t tx_id = GPOINTER_TO_INT(key);
+    struct save_state *ss = opaque;
+    XsTransaction *n = value;
+
+    write_be32(ss->bytes, n->base_tx);
+    write_be32(ss->bytes, n->dom_id);
+
+    save_tree(ss, tx_id, n->root);
+}
+
+static void save_watch(gpointer key, gpointer value, gpointer opaque)
+{
+    struct save_state *ss = opaque;
+    XsWatch *w = value;
+
+    /* We only save the *guest* watches. */
+    if (w->dom_id) {
+        gpointer relpath = key + w->rel_prefix;
+        g_byte_array_append(ss->bytes, relpath, strlen(relpath) + 1);
+        g_byte_array_append(ss->bytes, (void *)w->token, strlen(w->token) + 1);
+    }
+}
+
+GByteArray *xs_impl_serialize(XenstoreImplState *s)
+{
+    struct save_state ss;
+
+    ss.bytes = g_byte_array_new();
+
+    /*
+     * node = flags [ real_node / node_ref ]
+     *   flags = uint8_t (MODIFIED_IN_TX | DELETED_IN_TX | NODE_REF)
+     *   node_ref = tx_id (in which the original version of this node exists)
+     *   real_node = content perms child* NUL
+     *     content = len data
+     *       len = uint32_t
+     *       data = uint8_t{len}
+     *     perms = perm* NUL
+     *       perm = asciiz
+     *   child = name node
+     *     name = asciiz
+     *
+     * tree = tx_id node
+     *   tx_id = uint32_t
+     *
+     * transaction = base_tx_id dom_id tree
+     *   base_tx_id = uint32_t
+     *   dom_id = uint32_t
+     *
+     * tx_list = tree transaction* XBT_NULL
+     *
+     * watch = path token
+     *   path = asciiz
+     *   token = asciiz
+     *
+     * watch_list = watch* NUL
+     *
+     * xs_serialize_stream = last_tx tx_list watch_list
+     *   last_tx = uint32_t
+     */
+
+    /* Clear serialized_tx in every node. */
+    if (s->serialized) {
+        clear_serialized_tx(NULL, s->root, NULL);
+        g_hash_table_foreach(s->transactions, clear_tx_serialized_tx, NULL);
+    }
+
+    s->serialized = true;
+
+    write_be32(ss.bytes, s->last_tx);
+    save_tree(&ss, s->root_tx, s->root);
+    g_hash_table_foreach(s->transactions, save_tx, &ss);
+
+    write_be32(ss.bytes, XBT_NULL);
+
+    g_hash_table_foreach(s->watches, save_watch, &ss);
+    g_byte_array_append(ss.bytes, (void *)"", 1);
+
+    return ss.bytes;
+}
+
+struct unsave_state {
+    char path[XENSTORE_ABS_PATH_MAX + 1];
+    XenstoreImplState *s;
+    GByteArray *bytes;
+    uint8_t *d;
+    size_t l;
+    bool root_walk;
+};
+
+static int consume_be32(struct unsave_state *us, unsigned int *val)
+{
+    uint32_t d;
+
+    if (us->l < sizeof(d)) {
+        return -EINVAL;
+    }
+    memcpy(&d, us->d, sizeof(d));
+    *val = ntohl(d);
+    us->d += sizeof(d);
+    us->l -= sizeof(d);
+    return 0;
+}
+
+static int consume_string(struct unsave_state *us, char **str, size_t *len)
+{
+    size_t l;
+
+    if (!us->l) {
+        return -EINVAL;
+    }
+
+    l = strnlen((void *)us->d, us->l);
+    if (l == us->l) {
+        return -EINVAL;
+    }
+
+    if (str) {
+        *str = (void *)us->d;
+    }
+    if (len) {
+        *len = l;
+    }
+
+    us->d += l + 1;
+    us->l -= l + 1;
+    return 0;
+}
+
+static XsNode *lookup_node(XsNode *n, char *path)
+{
+    char *slash = strchr(path, '/');
+    XsNode *child;
+
+    if (path[0] == '\0') {
+        return n;
+    }
+
+    if (slash) {
+        *slash = '\0';
+    }
+
+    if (!n->children) {
+        return NULL;
+    }
+    child = g_hash_table_lookup(n->children, path);
+    if (!slash) {
+        return child;
+    }
+
+    *slash = '/';
+    if (!child) {
+        return NULL;
+    }
+    return lookup_node(child, slash + 1);
+}
+
+static XsNode *lookup_tx_node(struct unsave_state *us, unsigned int tx_id)
+{
+    XsTransaction *t;
+    if (tx_id == us->s->root_tx) {
+        return lookup_node(us->s->root, us->path + 1);
+    }
+
+    t = g_hash_table_lookup(us->s->transactions, GINT_TO_POINTER(tx_id));
+    if (!t) {
+        return NULL;
+    }
+    g_assert(t->root);
+    return lookup_node(t->root, us->path + 1);
+}
+
+static void count_child_nodes(gpointer key, gpointer value, gpointer user_data)
+{
+    unsigned int *nr_nodes = user_data;
+    XsNode *n = value;
+
+    (*nr_nodes)++;
+
+    if (n->children) {
+        g_hash_table_foreach(n->children, count_child_nodes, nr_nodes);
+    }
+}
+
+static int consume_node(struct unsave_state *us, XsNode **nodep,
+                        unsigned int *nr_nodes)
+{
+    XsNode *n = NULL;
+    uint8_t flags;
+    int ret;
+
+    if (us->l < 1) {
+        return -EINVAL;
+    }
+    flags = us->d[0];
+    us->d++;
+    us->l--;
+
+    if (flags == NODE_REF) {
+        unsigned int tx;
+
+        ret = consume_be32(us, &tx);
+        if (ret) {
+            return ret;
+        }
+
+        n = lookup_tx_node(us, tx);
+        if (!n) {
+            return -EINVAL;
+        }
+        n->ref++;
+        if (n->children) {
+            g_hash_table_foreach(n->children, count_child_nodes, nr_nodes);
+        }
+    } else {
+        uint32_t datalen;
+
+        if (flags & ~(DELETED_IN_TX | MODIFIED_IN_TX)) {
+            return -EINVAL;
+        }
+        n = xs_node_new();
+
+        if (flags & DELETED_IN_TX) {
+            n->deleted_in_tx = true;
+        }
+        if (flags & MODIFIED_IN_TX) {
+            n->modified_in_tx = true;
+        }
+        ret = consume_be32(us, &datalen);
+        if (ret) {
+            xs_node_unref(n);
+            return -EINVAL;
+        }
+        if (datalen) {
+            if (datalen > us->l) {
+                xs_node_unref(n);
+                return -EINVAL;
+            }
+
+            GByteArray *node_data = g_byte_array_new();
+            g_byte_array_append(node_data, us->d, datalen);
+            us->d += datalen;
+            us->l -= datalen;
+            n->content = node_data;
+
+            if (us->root_walk) {
+                n->modified_in_tx = true;
+            }
+        }
+        while (1) {
+            char *perm = NULL;
+            size_t permlen = 0;
+
+            ret = consume_string(us, &perm, &permlen);
+            if (ret) {
+                xs_node_unref(n);
+                return ret;
+            }
+
+            if (!permlen) {
+                break;
+            }
+
+            n->perms = g_list_append(n->perms, g_strdup(perm));
+        }
+
+        /* Now children */
+        while (1) {
+            size_t childlen;
+            char *childname;
+            char *pathend;
+            XsNode *child = NULL;
+
+            ret = consume_string(us, &childname, &childlen);
+            if (ret) {
+                xs_node_unref(n);
+                return ret;
+            }
+
+            if (!childlen) {
+                break;
+            }
+
+            pathend = us->path + strlen(us->path);
+            strncat(us->path, "/", sizeof(us->path) - 1);
+            strncat(us->path, childname, sizeof(us->path) - 1);
+
+            ret = consume_node(us, &child, nr_nodes);
+            *pathend = '\0';
+            if (ret) {
+                xs_node_unref(n);
+                return ret;
+            }
+            g_assert(child);
+            xs_node_add_child(n, childname, child);
+        }
+
+        /*
+         * If the node has no data and no children we still want to fire
+         * a watch on it.
+         */
+        if (us->root_walk && !n->children) {
+            n->modified_in_tx = true;
+        }
+    }
+
+    if (!n->deleted_in_tx) {
+        (*nr_nodes)++;
+    }
+
+    *nodep = n;
+    return 0;
+}
+
+static int consume_tree(struct unsave_state *us, XsTransaction *t)
+{
+    int ret;
+
+    ret = consume_be32(us, &t->tx_id);
+    if (ret) {
+        return ret;
+    }
+
+    if (t->tx_id > us->s->last_tx) {
+        return -EINVAL;
+    }
+
+    us->path[0] = '\0';
+
+    return consume_node(us, &t->root, &t->nr_nodes);
+}
+
+int xs_impl_deserialize(XenstoreImplState *s, GByteArray *bytes,
+                        unsigned int dom_id, xs_impl_watch_fn watch_fn,
+                        void *watch_opaque)
+{
+    struct unsave_state us;
+    XsTransaction base_t = { 0 };
+    int ret;
+
+    us.s = s;
+    us.bytes = bytes;
+    us.d = bytes->data;
+    us.l = bytes->len;
+
+    xs_impl_reset_watches(s, dom_id);
+    g_hash_table_remove_all(s->transactions);
+
+    xs_node_unref(s->root);
+    s->root = NULL;
+    s->root_tx = s->last_tx = XBT_NULL;
+
+    ret = consume_be32(&us, &s->last_tx);
+    if (ret) {
+        return ret;
+    }
+
+    /*
+     * Consume the base tree into a transaction so that watches can be
+     * fired as we commit it. By setting us.root_walk we cause the nodes
+     * to be marked as 'modified_in_tx' as they are created, so that the
+     * watches are triggered on them.
+     */
+    base_t.dom_id = dom_id;
+    base_t.base_tx = XBT_NULL;
+    us.root_walk = true;
+    ret = consume_tree(&us, &base_t);
+    if (ret) {
+        return ret;
+    }
+    us.root_walk = false;
+
+    /*
+     * Commit the transaction now while the refcount on all nodes is 1.
+     * Note that we haven't yet reinstated the *guest* watches but that's
+     * OK because we don't want the guest to see any changes. Even any
+     * backend nodes which get recreated should be *precisely* as they
+     * were before the migration. Back ends may have been instantiated
+     * already, and will see the frontend magically blink into existence
+     * now (well, from the aio_bh which fires the watches). It's their
+     * responsibility to rebuild everything precisely as it was before.
+     */
+    ret = transaction_commit(s, &base_t);
+    if (ret) {
+        return ret;
+    }
+
+    while (1) {
+        unsigned int base_tx;
+        XsTransaction *t;
+
+        ret = consume_be32(&us, &base_tx);
+        if (ret) {
+            return ret;
+        }
+        if (base_tx == XBT_NULL) {
+            break;
+        }
+
+        t = g_new0(XsTransaction, 1);
+        t->base_tx = base_tx;
+
+        ret = consume_be32(&us, &t->dom_id);
+        if (!ret) {
+            ret = consume_tree(&us, t);
+        }
+        if (ret) {
+            g_free(t);
+            return ret;
+        }
+        g_assert(t->root);
+        if (t->dom_id) {
+            s->nr_domu_transactions++;
+        }
+        g_hash_table_insert(s->transactions, GINT_TO_POINTER(t->tx_id), t);
+    }
+
+    while (1) {
+        char *path, *token;
+        size_t pathlen, toklen;
+
+        ret = consume_string(&us, &path, &pathlen);
+        if (ret) {
+            return ret;
+        }
+        if (!pathlen) {
+            break;
+        }
+
+        ret = consume_string(&us, &token, &toklen);
+        if (ret) {
+            return ret;
+        }
+
+        if (!watch_fn) {
+            continue;
+        }
+
+        ret = do_xs_impl_watch(s, dom_id, path, token, watch_fn, watch_opaque);
+        if (ret) {
+            return ret;
+        }
+    }
+
+    if (us.l) {
+        return -EINVAL;
+    }
+
+    return 0;
+}
diff --git a/hw/i386/kvm/xenstore_impl.h b/hw/i386/kvm/xenstore_impl.h
new file mode 100644
index 0000000000..0df2a91aae
--- /dev/null
+++ b/hw/i386/kvm/xenstore_impl.h
@@ -0,0 +1,63 @@
+/*
+ * QEMU Xen emulation: The actual implementation of XenStore
+ *
+ * Copyright © 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Authors: David Woodhouse <dwmw2@infradead.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_XENSTORE_IMPL_H
+#define QEMU_XENSTORE_IMPL_H
+
+#include "hw/xen/xen_backend_ops.h"
+
+typedef struct XenstoreImplState XenstoreImplState;
+
+XenstoreImplState *xs_impl_create(unsigned int dom_id);
+
+char *xs_perm_as_string(unsigned int perm, unsigned int domid);
+
+/*
+ * These functions return *positive* error numbers. This is a little
+ * unconventional but it helps to keep us honest because there is
+ * also a very limited set of error numbers that they are permitted
+ * to return (those in xsd_errors).
+ */
+
+int xs_impl_read(XenstoreImplState *s, unsigned int dom_id,
+                 xs_transaction_t tx_id, const char *path, GByteArray *data);
+int xs_impl_write(XenstoreImplState *s, unsigned int dom_id,
+                  xs_transaction_t tx_id, const char *path, GByteArray *data);
+int xs_impl_directory(XenstoreImplState *s, unsigned int dom_id,
+                      xs_transaction_t tx_id, const char *path,
+                      uint64_t *gencnt, GList **items);
+int xs_impl_transaction_start(XenstoreImplState *s, unsigned int dom_id,
+                              xs_transaction_t *tx_id);
+int xs_impl_transaction_end(XenstoreImplState *s, unsigned int dom_id,
+                            xs_transaction_t tx_id, bool commit);
+int xs_impl_rm(XenstoreImplState *s, unsigned int dom_id,
+               xs_transaction_t tx_id, const char *path);
+int xs_impl_get_perms(XenstoreImplState *s, unsigned int dom_id,
+                      xs_transaction_t tx_id, const char *path, GList **perms);
+int xs_impl_set_perms(XenstoreImplState *s, unsigned int dom_id,
+                      xs_transaction_t tx_id, const char *path, GList *perms);
+
+/* This differs from xs_watch_fn because it has the token */
+typedef void(xs_impl_watch_fn)(void *opaque, const char *path,
+                               const char *token);
+int xs_impl_watch(XenstoreImplState *s, unsigned int dom_id, const char *path,
+                  const char *token, xs_impl_watch_fn fn, void *opaque);
+int xs_impl_unwatch(XenstoreImplState *s, unsigned int dom_id,
+                    const char *path, const char *token, xs_impl_watch_fn fn,
+                    void *opaque);
+int xs_impl_reset_watches(XenstoreImplState *s, unsigned int dom_id);
+
+GByteArray *xs_impl_serialize(XenstoreImplState *s);
+int xs_impl_deserialize(XenstoreImplState *s, GByteArray *bytes,
+                        unsigned int dom_id, xs_impl_watch_fn watch_fn,
+                        void *watch_opaque);
+
+#endif /* QEMU_XENSTORE_IMPL_H */