Diffstat (limited to 'hw/i386/kvm')
-rw-r--r--  hw/i386/kvm/meson.build     |    1
-rw-r--r--  hw/i386/kvm/trace-events    |   15
-rw-r--r--  hw/i386/kvm/xen_evtchn.c    |   16
-rw-r--r--  hw/i386/kvm/xen_gnttab.c    |  325
-rw-r--r--  hw/i386/kvm/xen_gnttab.h    |    1
-rw-r--r--  hw/i386/kvm/xen_xenstore.c  | 1252
-rw-r--r--  hw/i386/kvm/xenstore_impl.c | 1927
-rw-r--r--  hw/i386/kvm/xenstore_impl.h |   63
8 files changed, 3583 insertions(+), 17 deletions(-)
diff --git a/hw/i386/kvm/meson.build b/hw/i386/kvm/meson.build index 82dd6ae7c6..6621ba5cd7 100644 --- a/hw/i386/kvm/meson.build +++ b/hw/i386/kvm/meson.build @@ -9,6 +9,7 @@ i386_kvm_ss.add(when: 'CONFIG_XEN_EMU', if_true: files( 'xen_evtchn.c', 'xen_gnttab.c', 'xen_xenstore.c', + 'xenstore_impl.c', )) i386_ss.add_all(when: 'CONFIG_KVM', if_true: i386_kvm_ss) diff --git a/hw/i386/kvm/trace-events b/hw/i386/kvm/trace-events index b83c3eb965..e4c82de6f3 100644 --- a/hw/i386/kvm/trace-events +++ b/hw/i386/kvm/trace-events @@ -3,3 +3,18 @@ kvm_xen_unmap_pirq(int pirq, int gsi) "pirq %d gsi %d" kvm_xen_get_free_pirq(int pirq, int type) "pirq %d type %d" kvm_xen_bind_pirq(int pirq, int port) "pirq %d port %d" kvm_xen_unmask_pirq(int pirq, char *dev, int vector) "pirq %d dev %s vector %d" +xenstore_error(unsigned int id, unsigned int tx_id, const char *err) "req %u tx %u err %s" +xenstore_read(unsigned int tx_id, const char *path) "tx %u path %s" +xenstore_write(unsigned int tx_id, const char *path) "tx %u path %s" +xenstore_mkdir(unsigned int tx_id, const char *path) "tx %u path %s" +xenstore_directory(unsigned int tx_id, const char *path) "tx %u path %s" +xenstore_directory_part(unsigned int tx_id, const char *path, unsigned int offset) "tx %u path %s offset %u" +xenstore_transaction_start(unsigned int new_tx) "new_tx %u" +xenstore_transaction_end(unsigned int tx_id, bool commit) "tx %u commit %d" +xenstore_rm(unsigned int tx_id, const char *path) "tx %u path %s" +xenstore_get_perms(unsigned int tx_id, const char *path) "tx %u path %s" +xenstore_set_perms(unsigned int tx_id, const char *path) "tx %u path %s" +xenstore_watch(const char *path, const char *token) "path %s token %s" +xenstore_unwatch(const char *path, const char *token) "path %s token %s" +xenstore_reset_watches(void) "" +xenstore_watch_event(const char *path, const char *token) "path %s token %s" diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c index 886fbf6b3b..3048329474 100644 --- a/hw/i386/kvm/xen_evtchn.c +++ b/hw/i386/kvm/xen_evtchn.c @@ -15,6 +15,7 @@ #include "qemu/lockable.h" #include "qemu/main-loop.h" #include "qemu/log.h" +#include "qemu/error-report.h" #include "monitor/monitor.h" #include "monitor/hmp.h" #include "qapi/error.h" @@ -34,6 +35,7 @@ #include "hw/pci/msi.h" #include "hw/pci/msix.h" #include "hw/irq.h" +#include "hw/xen/xen_backend_ops.h" #include "xen_evtchn.h" #include "xen_overlay.h" @@ -278,6 +280,17 @@ static const TypeInfo xen_evtchn_info = { .class_init = xen_evtchn_class_init, }; +static struct evtchn_backend_ops emu_evtchn_backend_ops = { + .open = xen_be_evtchn_open, + .bind_interdomain = xen_be_evtchn_bind_interdomain, + .unbind = xen_be_evtchn_unbind, + .close = xen_be_evtchn_close, + .get_fd = xen_be_evtchn_fd, + .notify = xen_be_evtchn_notify, + .unmask = xen_be_evtchn_unmask, + .pending = xen_be_evtchn_pending, +}; + static void gsi_assert_bh(void *opaque) { struct vcpu_info *vi = kvm_xen_get_vcpu_info_hva(0); @@ -318,6 +331,9 @@ void xen_evtchn_create(void) s->nr_pirq_inuse_words = DIV_ROUND_UP(s->nr_pirqs, 64); s->pirq_inuse_bitmap = g_new0(uint64_t, s->nr_pirq_inuse_words); s->pirq = g_new0(struct pirq_info, s->nr_pirqs); + + /* Set event channel functions for backend drivers to use */ + xen_evtchn_ops = &emu_evtchn_backend_ops; } void xen_evtchn_connect_gsis(qemu_irq *system_gsis) diff --git a/hw/i386/kvm/xen_gnttab.c b/hw/i386/kvm/xen_gnttab.c index 1e691ded32..21c30e3659 100644 --- a/hw/i386/kvm/xen_gnttab.c +++ b/hw/i386/kvm/xen_gnttab.c @@ -22,6 +22,7 @@ #include 
"hw/sysbus.h" #include "hw/xen/xen.h" +#include "hw/xen/xen_backend_ops.h" #include "xen_overlay.h" #include "xen_gnttab.h" @@ -34,11 +35,10 @@ #define TYPE_XEN_GNTTAB "xen-gnttab" OBJECT_DECLARE_SIMPLE_TYPE(XenGnttabState, XEN_GNTTAB) -#define XEN_PAGE_SHIFT 12 -#define XEN_PAGE_SIZE (1ULL << XEN_PAGE_SHIFT) - #define ENTRIES_PER_FRAME_V1 (XEN_PAGE_SIZE / sizeof(grant_entry_v1_t)) +static struct gnttab_backend_ops emu_gnttab_backend_ops; + struct XenGnttabState { /*< private >*/ SysBusDevice busdev; @@ -57,6 +57,8 @@ struct XenGnttabState { MemoryRegion gnt_frames; MemoryRegion *gnt_aliases; uint64_t *gnt_frame_gpas; + + uint8_t *map_track; }; struct XenGnttabState *xen_gnttab_singleton; @@ -70,13 +72,11 @@ static void xen_gnttab_realize(DeviceState *dev, Error **errp) error_setg(errp, "Xen grant table support is for Xen emulation"); return; } - s->nr_frames = 0; s->max_frames = kvm_xen_get_gnttab_max_frames(); memory_region_init_ram(&s->gnt_frames, OBJECT(dev), "xen:grant_table", XEN_PAGE_SIZE * s->max_frames, &error_abort); memory_region_set_enabled(&s->gnt_frames, true); s->entries.v1 = memory_region_get_ram_ptr(&s->gnt_frames); - memset(s->entries.v1, 0, XEN_PAGE_SIZE * s->max_frames); /* Create individual page-sizes aliases for overlays */ s->gnt_aliases = (void *)g_new0(MemoryRegion, s->max_frames); @@ -88,9 +88,18 @@ static void xen_gnttab_realize(DeviceState *dev, Error **errp) s->gnt_frame_gpas[i] = INVALID_GPA; } + s->nr_frames = 0; + memset(s->entries.v1, 0, XEN_PAGE_SIZE * s->max_frames); + s->entries.v1[GNTTAB_RESERVED_XENSTORE].flags = GTF_permit_access; + s->entries.v1[GNTTAB_RESERVED_XENSTORE].frame = XEN_SPECIAL_PFN(XENSTORE); + qemu_mutex_init(&s->gnt_lock); xen_gnttab_singleton = s; + + s->map_track = g_new0(uint8_t, s->max_frames * ENTRIES_PER_FRAME_V1); + + xen_gnttab_ops = &emu_gnttab_backend_ops; } static int xen_gnttab_post_load(void *opaque, int version_id) @@ -230,3 +239,309 @@ int xen_gnttab_query_size_op(struct gnttab_query_size *size) size->max_nr_frames = s->max_frames; return 0; } + +/* Track per-open refs, to allow close() to clean up. */ +struct active_ref { + MemoryRegionSection mrs; + void *virtaddr; + uint32_t refcnt; + int prot; +}; + +static void gnt_unref(XenGnttabState *s, grant_ref_t ref, + MemoryRegionSection *mrs, int prot) +{ + if (mrs && mrs->mr) { + if (prot & PROT_WRITE) { + memory_region_set_dirty(mrs->mr, mrs->offset_within_region, + XEN_PAGE_SIZE); + } + memory_region_unref(mrs->mr); + mrs->mr = NULL; + } + assert(s->map_track[ref] != 0); + + if (--s->map_track[ref] == 0) { + grant_entry_v1_t *gnt_p = &s->entries.v1[ref]; + qatomic_and(&gnt_p->flags, (uint16_t)~(GTF_reading | GTF_writing)); + } +} + +static uint64_t gnt_ref(XenGnttabState *s, grant_ref_t ref, int prot) +{ + uint16_t mask = GTF_type_mask | GTF_sub_page; + grant_entry_v1_t gnt, *gnt_p; + int retries = 0; + + if (ref >= s->max_frames * ENTRIES_PER_FRAME_V1 || + s->map_track[ref] == UINT8_MAX) { + return INVALID_GPA; + } + + if (prot & PROT_WRITE) { + mask |= GTF_readonly; + } + + gnt_p = &s->entries.v1[ref]; + + /* + * The guest can legitimately be changing the GTF_readonly flag. Allow + * that, but don't let a malicious guest cause a livelock. 
+ */ + for (retries = 0; retries < 5; retries++) { + uint16_t new_flags; + + /* Read the entry before an atomic operation on its flags */ + gnt = *(volatile grant_entry_v1_t *)gnt_p; + + if ((gnt.flags & mask) != GTF_permit_access || + gnt.domid != DOMID_QEMU) { + return INVALID_GPA; + } + + new_flags = gnt.flags | GTF_reading; + if (prot & PROT_WRITE) { + new_flags |= GTF_writing; + } + + if (qatomic_cmpxchg(&gnt_p->flags, gnt.flags, new_flags) == gnt.flags) { + return (uint64_t)gnt.frame << XEN_PAGE_SHIFT; + } + } + + return INVALID_GPA; +} + +struct xengntdev_handle { + GHashTable *active_maps; +}; + +static int xen_be_gnttab_set_max_grants(struct xengntdev_handle *xgt, + uint32_t nr_grants) +{ + return 0; +} + +static void *xen_be_gnttab_map_refs(struct xengntdev_handle *xgt, + uint32_t count, uint32_t domid, + uint32_t *refs, int prot) +{ + XenGnttabState *s = xen_gnttab_singleton; + struct active_ref *act; + + if (!s) { + errno = ENOTSUP; + return NULL; + } + + if (domid != xen_domid) { + errno = EINVAL; + return NULL; + } + + if (!count || count > 4096) { + errno = EINVAL; + return NULL; + } + + /* + * Making a contiguous mapping from potentially discontiguous grant + * references would be... distinctly non-trivial. We don't support it. + * Even changing the API to return an array of pointers, one per page, + * wouldn't be simple to use in PV backends because some structures + * actually cross page boundaries (e.g. 32-bit blkif_response ring + * entries are 12 bytes). + */ + if (count != 1) { + errno = EINVAL; + return NULL; + } + + QEMU_LOCK_GUARD(&s->gnt_lock); + + act = g_hash_table_lookup(xgt->active_maps, GINT_TO_POINTER(refs[0])); + if (act) { + if ((prot & PROT_WRITE) && !(act->prot & PROT_WRITE)) { + if (gnt_ref(s, refs[0], prot) == INVALID_GPA) { + return NULL; + } + act->prot |= PROT_WRITE; + } + act->refcnt++; + } else { + uint64_t gpa = gnt_ref(s, refs[0], prot); + if (gpa == INVALID_GPA) { + errno = EINVAL; + return NULL; + } + + act = g_new0(struct active_ref, 1); + act->prot = prot; + act->refcnt = 1; + act->mrs = memory_region_find(get_system_memory(), gpa, XEN_PAGE_SIZE); + + if (act->mrs.mr && + !int128_lt(act->mrs.size, int128_make64(XEN_PAGE_SIZE)) && + memory_region_get_ram_addr(act->mrs.mr) != RAM_ADDR_INVALID) { + act->virtaddr = qemu_map_ram_ptr(act->mrs.mr->ram_block, + act->mrs.offset_within_region); + } + if (!act->virtaddr) { + gnt_unref(s, refs[0], &act->mrs, 0); + g_free(act); + errno = EINVAL; + return NULL; + } + + s->map_track[refs[0]]++; + g_hash_table_insert(xgt->active_maps, GINT_TO_POINTER(refs[0]), act); + } + + return act->virtaddr; +} + +static gboolean do_unmap(gpointer key, gpointer value, gpointer user_data) +{ + XenGnttabState *s = user_data; + grant_ref_t gref = GPOINTER_TO_INT(key); + struct active_ref *act = value; + + gnt_unref(s, gref, &act->mrs, act->prot); + g_free(act); + return true; +} + +static int xen_be_gnttab_unmap(struct xengntdev_handle *xgt, + void *start_address, uint32_t *refs, + uint32_t count) +{ + XenGnttabState *s = xen_gnttab_singleton; + struct active_ref *act; + + if (!s) { + return -ENOTSUP; + } + + if (count != 1) { + return -EINVAL; + } + + QEMU_LOCK_GUARD(&s->gnt_lock); + + act = g_hash_table_lookup(xgt->active_maps, GINT_TO_POINTER(refs[0])); + if (!act) { + return -ENOENT; + } + + if (act->virtaddr != start_address) { + return -EINVAL; + } + + if (!--act->refcnt) { + do_unmap(GINT_TO_POINTER(refs[0]), act, s); + g_hash_table_remove(xgt->active_maps, GINT_TO_POINTER(refs[0])); + } + + return 0; +} + +/* + * 
This looks a bit like the one for true Xen in xen-operations.c but + * in emulation we don't support multi-page mappings. And under Xen we + * *want* the multi-page mappings so we have fewer bounces through the + * kernel and the hypervisor. So the code paths end up being similar, + * but different. + */ +static int xen_be_gnttab_copy(struct xengntdev_handle *xgt, bool to_domain, + uint32_t domid, XenGrantCopySegment *segs, + uint32_t nr_segs, Error **errp) +{ + int prot = to_domain ? PROT_WRITE : PROT_READ; + unsigned int i; + + for (i = 0; i < nr_segs; i++) { + XenGrantCopySegment *seg = &segs[i]; + void *page; + uint32_t ref = to_domain ? seg->dest.foreign.ref : + seg->source.foreign.ref; + + page = xen_be_gnttab_map_refs(xgt, 1, domid, &ref, prot); + if (!page) { + if (errp) { + error_setg_errno(errp, errno, + "xen_be_gnttab_map_refs failed"); + } + return -errno; + } + + if (to_domain) { + memcpy(page + seg->dest.foreign.offset, seg->source.virt, + seg->len); + } else { + memcpy(seg->dest.virt, page + seg->source.foreign.offset, + seg->len); + } + + if (xen_be_gnttab_unmap(xgt, page, &ref, 1)) { + if (errp) { + error_setg_errno(errp, errno, "xen_be_gnttab_unmap failed"); + } + return -errno; + } + } + + return 0; +} + +static struct xengntdev_handle *xen_be_gnttab_open(void) +{ + struct xengntdev_handle *xgt = g_new0(struct xengntdev_handle, 1); + + xgt->active_maps = g_hash_table_new(g_direct_hash, g_direct_equal); + return xgt; +} + +static int xen_be_gnttab_close(struct xengntdev_handle *xgt) +{ + XenGnttabState *s = xen_gnttab_singleton; + + if (!s) { + return -ENOTSUP; + } + + g_hash_table_foreach_remove(xgt->active_maps, do_unmap, s); + g_hash_table_destroy(xgt->active_maps); + g_free(xgt); + return 0; +} + +static struct gnttab_backend_ops emu_gnttab_backend_ops = { + .open = xen_be_gnttab_open, + .close = xen_be_gnttab_close, + .grant_copy = xen_be_gnttab_copy, + .set_max_grants = xen_be_gnttab_set_max_grants, + .map_refs = xen_be_gnttab_map_refs, + .unmap = xen_be_gnttab_unmap, +}; + +int xen_gnttab_reset(void) +{ + XenGnttabState *s = xen_gnttab_singleton; + + if (!s) { + return -ENOTSUP; + } + + QEMU_LOCK_GUARD(&s->gnt_lock); + + s->nr_frames = 0; + + memset(s->entries.v1, 0, XEN_PAGE_SIZE * s->max_frames); + + s->entries.v1[GNTTAB_RESERVED_XENSTORE].flags = GTF_permit_access; + s->entries.v1[GNTTAB_RESERVED_XENSTORE].frame = XEN_SPECIAL_PFN(XENSTORE); + + memset(s->map_track, 0, s->max_frames * ENTRIES_PER_FRAME_V1); + + return 0; +} diff --git a/hw/i386/kvm/xen_gnttab.h b/hw/i386/kvm/xen_gnttab.h index 3bdbe96191..ee215239b0 100644 --- a/hw/i386/kvm/xen_gnttab.h +++ b/hw/i386/kvm/xen_gnttab.h @@ -13,6 +13,7 @@ #define QEMU_XEN_GNTTAB_H void xen_gnttab_create(void); +int xen_gnttab_reset(void); int xen_gnttab_map_page(uint64_t idx, uint64_t gfn); struct gnttab_set_version; diff --git a/hw/i386/kvm/xen_xenstore.c b/hw/i386/kvm/xen_xenstore.c index 14193ef3f9..900679af8a 100644 --- a/hw/i386/kvm/xen_xenstore.c +++ b/hw/i386/kvm/xen_xenstore.c @@ -15,12 +15,14 @@ #include "qemu/module.h" #include "qemu/main-loop.h" #include "qemu/cutils.h" +#include "qemu/error-report.h" #include "qapi/error.h" #include "qom/object.h" #include "migration/vmstate.h" #include "hw/sysbus.h" #include "hw/xen/xen.h" +#include "hw/xen/xen_backend_ops.h" #include "xen_overlay.h" #include "xen_evtchn.h" #include "xen_xenstore.h" @@ -28,15 +30,17 @@ #include "sysemu/kvm.h" #include "sysemu/kvm_xen.h" +#include "trace.h" + +#include "xenstore_impl.h" + #include "hw/xen/interface/io/xs_wire.h" #include 
"hw/xen/interface/event_channel.h" +#include "hw/xen/interface/grant_table.h" #define TYPE_XEN_XENSTORE "xen-xenstore" OBJECT_DECLARE_SIMPLE_TYPE(XenXenstoreState, XEN_XENSTORE) -#define XEN_PAGE_SHIFT 12 -#define XEN_PAGE_SIZE (1ULL << XEN_PAGE_SHIFT) - #define ENTRIES_PER_FRAME_V1 (XEN_PAGE_SIZE / sizeof(grant_entry_v1_t)) #define ENTRIES_PER_FRAME_V2 (XEN_PAGE_SIZE / sizeof(grant_entry_v2_t)) @@ -47,6 +51,9 @@ struct XenXenstoreState { SysBusDevice busdev; /*< public >*/ + XenstoreImplState *impl; + GList *watch_events; /* for the guest */ + MemoryRegion xenstore_page; struct xenstore_domain_interface *xs; uint8_t req_data[XENSTORE_HEADER_SIZE + XENSTORE_PAYLOAD_MAX]; @@ -59,15 +66,54 @@ struct XenXenstoreState { evtchn_port_t guest_port; evtchn_port_t be_port; struct xenevtchn_handle *eh; + + uint8_t *impl_state; + uint32_t impl_state_size; + + struct xengntdev_handle *gt; + void *granted_xs; }; struct XenXenstoreState *xen_xenstore_singleton; static void xen_xenstore_event(void *opaque); +static void fire_watch_cb(void *opaque, const char *path, const char *token); + +static struct xenstore_backend_ops emu_xenstore_backend_ops; + +static void G_GNUC_PRINTF (4, 5) relpath_printf(XenXenstoreState *s, + GList *perms, + const char *relpath, + const char *fmt, ...) +{ + gchar *abspath; + gchar *value; + va_list args; + GByteArray *data; + int err; + + abspath = g_strdup_printf("/local/domain/%u/%s", xen_domid, relpath); + va_start(args, fmt); + value = g_strdup_vprintf(fmt, args); + va_end(args); + + data = g_byte_array_new_take((void *)value, strlen(value)); + + err = xs_impl_write(s->impl, DOMID_QEMU, XBT_NULL, abspath, data); + assert(!err); + + g_byte_array_unref(data); + + err = xs_impl_set_perms(s->impl, DOMID_QEMU, XBT_NULL, abspath, perms); + assert(!err); + + g_free(abspath); +} static void xen_xenstore_realize(DeviceState *dev, Error **errp) { XenXenstoreState *s = XEN_XENSTORE(dev); + GList *perms; if (xen_mode != XEN_EMULATE) { error_setg(errp, "Xen xenstore support is for Xen emulation"); @@ -89,6 +135,50 @@ static void xen_xenstore_realize(DeviceState *dev, Error **errp) } aio_set_fd_handler(qemu_get_aio_context(), xen_be_evtchn_fd(s->eh), true, xen_xenstore_event, NULL, NULL, NULL, s); + + s->impl = xs_impl_create(xen_domid); + + /* Populate the default nodes */ + + /* Nodes owned by 'dom0' but readable by the guest */ + perms = g_list_append(NULL, xs_perm_as_string(XS_PERM_NONE, DOMID_QEMU)); + perms = g_list_append(perms, xs_perm_as_string(XS_PERM_READ, xen_domid)); + + relpath_printf(s, perms, "", "%s", ""); + + relpath_printf(s, perms, "domid", "%u", xen_domid); + + relpath_printf(s, perms, "control/platform-feature-xs_reset_watches", "%u", 1); + relpath_printf(s, perms, "control/platform-feature-multiprocessor-suspend", "%u", 1); + + relpath_printf(s, perms, "platform/acpi", "%u", 1); + relpath_printf(s, perms, "platform/acpi_s3", "%u", 1); + relpath_printf(s, perms, "platform/acpi_s4", "%u", 1); + relpath_printf(s, perms, "platform/acpi_laptop_slate", "%u", 0); + + g_list_free_full(perms, g_free); + + /* Nodes owned by the guest */ + perms = g_list_append(NULL, xs_perm_as_string(XS_PERM_NONE, xen_domid)); + + relpath_printf(s, perms, "attr", "%s", ""); + + relpath_printf(s, perms, "control/shutdown", "%s", ""); + relpath_printf(s, perms, "control/feature-poweroff", "%u", 1); + relpath_printf(s, perms, "control/feature-reboot", "%u", 1); + relpath_printf(s, perms, "control/feature-suspend", "%u", 1); + relpath_printf(s, perms, "control/feature-s3", "%u", 1); + 
relpath_printf(s, perms, "control/feature-s4", "%u", 1); + + relpath_printf(s, perms, "data", "%s", ""); + relpath_printf(s, perms, "device", "%s", ""); + relpath_printf(s, perms, "drivers", "%s", ""); + relpath_printf(s, perms, "error", "%s", ""); + relpath_printf(s, perms, "feature", "%s", ""); + + g_list_free_full(perms, g_free); + + xen_xenstore_ops = &emu_xenstore_backend_ops; } static bool xen_xenstore_is_needed(void *opaque) @@ -99,16 +189,26 @@ static bool xen_xenstore_is_needed(void *opaque) static int xen_xenstore_pre_save(void *opaque) { XenXenstoreState *s = opaque; + GByteArray *save; if (s->eh) { s->guest_port = xen_be_evtchn_get_guest_port(s->eh); } + + g_free(s->impl_state); + save = xs_impl_serialize(s->impl); + s->impl_state = save->data; + s->impl_state_size = save->len; + g_byte_array_free(save, false); + return 0; } static int xen_xenstore_post_load(void *opaque, int ver) { XenXenstoreState *s = opaque; + GByteArray *save; + int ret; /* * As qemu/dom0, rebind to the guest's port. The Windows drivers may @@ -125,11 +225,18 @@ static int xen_xenstore_post_load(void *opaque, int ver) } s->be_port = be_port; } - return 0; + + save = g_byte_array_new_take(s->impl_state, s->impl_state_size); + s->impl_state = NULL; + s->impl_state_size = 0; + + ret = xs_impl_deserialize(s->impl, save, xen_domid, fire_watch_cb, s); + return ret; } static const VMStateDescription xen_xenstore_vmstate = { .name = "xen_xenstore", + .unmigratable = 1, /* The PV back ends don't migrate yet */ .version_id = 1, .minimum_version_id = 1, .needed = xen_xenstore_is_needed, @@ -145,6 +252,10 @@ static const VMStateDescription xen_xenstore_vmstate = { VMSTATE_BOOL(rsp_pending, XenXenstoreState), VMSTATE_UINT32(guest_port, XenXenstoreState), VMSTATE_BOOL(fatal_error, XenXenstoreState), + VMSTATE_UINT32(impl_state_size, XenXenstoreState), + VMSTATE_VARRAY_UINT32_ALLOC(impl_state, XenXenstoreState, + impl_state_size, 0, + vmstate_info_uint8, uint8_t), VMSTATE_END_OF_LIST() } }; @@ -213,20 +324,761 @@ static void reset_rsp(XenXenstoreState *s) s->rsp_offset = 0; } +static void xs_error(XenXenstoreState *s, unsigned int id, + xs_transaction_t tx_id, int errnum) +{ + struct xsd_sockmsg *rsp = (struct xsd_sockmsg *)s->rsp_data; + const char *errstr = NULL; + + for (unsigned int i = 0; i < ARRAY_SIZE(xsd_errors); i++) { + struct xsd_errors *xsd_error = &xsd_errors[i]; + + if (xsd_error->errnum == errnum) { + errstr = xsd_error->errstring; + break; + } + } + assert(errstr); + + trace_xenstore_error(id, tx_id, errstr); + + rsp->type = XS_ERROR; + rsp->req_id = id; + rsp->tx_id = tx_id; + rsp->len = (uint32_t)strlen(errstr) + 1; + + memcpy(&rsp[1], errstr, rsp->len); +} + +static void xs_ok(XenXenstoreState *s, unsigned int type, unsigned int req_id, + xs_transaction_t tx_id) +{ + struct xsd_sockmsg *rsp = (struct xsd_sockmsg *)s->rsp_data; + const char *okstr = "OK"; + + rsp->type = type; + rsp->req_id = req_id; + rsp->tx_id = tx_id; + rsp->len = (uint32_t)strlen(okstr) + 1; + + memcpy(&rsp[1], okstr, rsp->len); +} + +/* + * The correct request and response formats are documented in xen.git: + * docs/misc/xenstore.txt. A summary is given below for convenience. + * The '|' symbol represents a NUL character. + * + * ---------- Database read, write and permissions operations ---------- + * + * READ <path>| <value|> + * WRITE <path>|<value|> + * Store and read the octet string <value> at <path>. + * WRITE creates any missing parent paths, with empty values. 
+ * + * MKDIR <path>| + * Ensures that the <path> exists, if necessary by creating + * it and any missing parents with empty values. If <path> + * or any parent already exists, its value is left unchanged. + * + * RM <path>| + * Ensures that the <path> does not exist, by deleting + * it and all of its children. It is not an error if <path> does + * not exist, but it _is_ an error if <path>'s immediate parent + * does not exist either. + * + * DIRECTORY <path>| <child-leaf-name>|* + * Gives a list of the immediate children of <path>, as only the + * leafnames. The resulting children are each named + * <path>/<child-leaf-name>. + * + * DIRECTORY_PART <path>|<offset> <gencnt>|<child-leaf-name>|* + * Same as DIRECTORY, but to be used for children lists longer than + * XENSTORE_PAYLOAD_MAX. Inputs are <path> and the byte offset into + * the list of children to return. Return values are the generation + * count <gencnt> of the node (to be used to ensure the node hasn't + * changed between two reads: <gencnt> being the same for multiple + * reads guarantees the node hasn't changed) and the list of children + * starting at the specified <offset> of the complete list. + * + * GET_PERMS <path>| <perm-as-string>|+ + * SET_PERMS <path>|<perm-as-string>|+? + * <perm-as-string> is one of the following + * w<domid> write only + * r<domid> read only + * b<domid> both read and write + * n<domid> no access + * See https://wiki.xen.org/wiki/XenBus section + * `Permissions' for details of the permissions system. + * It is possible to set permissions for the special watch paths + * "@introduceDomain" and "@releaseDomain" to enable receiving those + * watches in unprivileged domains. + * + * ---------- Watches ---------- + * + * WATCH <wpath>|<token>|? + * Adds a watch. + * + * When a <path> is modified (including path creation, removal, + * contents change or permissions change) this generates an event + * on the changed <path>. Changes made in transactions cause an + * event only if and when committed. Each occurring event is + * matched against all the watches currently set up, and each + * matching watch results in a WATCH_EVENT message (see below). + * + * The event's path matches the watch's <wpath> if it is a child + * of <wpath>. + * + * <wpath> can be a <path> to watch or @<wspecial>. In the + * latter case <wspecial> may have any syntax but it matches + * (according to the rules above) only the following special + * events which are invented by xenstored: + * @introduceDomain occurs on INTRODUCE + * @releaseDomain occurs on any domain crash or + * shutdown, and also on RELEASE + * and domain destruction + * <wspecial> events are sent to privileged callers or explicitly + * via SET_PERMS enabled domains only. + * + * When a watch is first set up it is triggered once straight + * away, with <path> equal to <wpath>. Watches may be triggered + * spuriously. The tx_id in a WATCH request is ignored. + * + * Watches are supposed to be restricted by the permissions + * system but in practice the implementation is imperfect. + * Applications should not rely on being sent a notification for + * paths that they cannot read; however, an application may rely + * on being sent a watch when a path which it _is_ able to read + * is deleted even if that leaves only a nonexistent unreadable + * parent.
A notification may be omitted if a node's permissions + * are changed so as to make it unreadable, in which case future + * notifications may be suppressed (and if the node is later made + * readable, some notifications may have been lost). + * + * WATCH_EVENT <epath>|<token>| + * Unsolicited `reply' generated for matching modification events + * as described above. req_id and tx_id are both 0. + * + * <epath> is the event's path, i.e. the actual path that was + * modified; however if the event was the recursive removal of a + * parent of <wpath>, <epath> is just + * <wpath> (rather than the actual path which was removed). So + * <epath> is a child of <wpath>, regardless. + * + * Iff <wpath> for the watch was specified as a relative pathname, + * the <epath> path will also be relative (with the same base, + * obviously). + * + * UNWATCH <wpath>|<token>|? + * + * RESET_WATCHES | + * Reset all watches and transactions of the caller. + * + * ---------- Transactions ---------- + * + * TRANSACTION_START | <transid>| + * <transid> is an opaque uint32_t allocated by xenstored + * represented as unsigned decimal. After this, the transaction may + * be referenced by using <transid> (as 32-bit binary) in the + * tx_id request header field. When a transaction is started the whole + * db is copied; reads and writes happen on the copy. + * It is not legal to send non-0 tx_id in TRANSACTION_START. + * + * TRANSACTION_END T| + * TRANSACTION_END F| + * tx_id must refer to an existing transaction. After this + * request the tx_id is no longer valid and may be reused by + * xenstore. If F, the transaction is discarded. If T, + * it is committed: if there were any other intervening writes + * then our END gets EAGAIN. + * + * The plan is that in the future only intervening `conflicting' + * writes cause EAGAIN, meaning only writes or other commits + * which changed paths which were read or written in the + * transaction at hand. + * + */
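[Editor's note] The framing summarized above (a fixed struct xsd_sockmsg header followed by a NUL-terminated payload) is exactly what the request handlers below parse. As an illustration only, a client-side helper to frame an XS_READ request might look like the following sketch; the function name frame_xs_read and the caller-supplied buffer are invented for the example, while the xsd_sockmsg layout, XS_READ and XENSTORE_PAYLOAD_MAX come from xs_wire.h:

/* Illustrative sketch: frame an XS_READ request as described above. */
static size_t frame_xs_read(uint8_t *buf, size_t buf_len,
                            unsigned int req_id, const char *path)
{
    struct xsd_sockmsg hdr = {
        .type = XS_READ,
        .req_id = req_id,
        .tx_id = 0,              /* XBT_NULL: read outside any transaction */
        .len = strlen(path) + 1, /* payload is the path, NUL included */
    };

    if (hdr.len > XENSTORE_PAYLOAD_MAX || buf_len < sizeof(hdr) + hdr.len) {
        return 0; /* request does not fit */
    }
    memcpy(buf, &hdr, sizeof(hdr));
    memcpy(buf + sizeof(hdr), path, hdr.len);
    return sizeof(hdr) + hdr.len;
}

The reply reuses the same header: type XS_READ with the value as payload on success, or type XS_ERROR with the errno name (e.g. "EINVAL") as a NUL-terminated string, as implemented by xs_error() below.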
+ +static void xs_read(XenXenstoreState *s, unsigned int req_id, + xs_transaction_t tx_id, uint8_t *req_data, unsigned int len) +{ + const char *path = (const char *)req_data; + struct xsd_sockmsg *rsp = (struct xsd_sockmsg *)s->rsp_data; + uint8_t *rsp_data = (uint8_t *)&rsp[1]; + g_autoptr(GByteArray) data = g_byte_array_new(); + int err; + + if (len == 0 || req_data[len - 1] != '\0') { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + + trace_xenstore_read(tx_id, path); + err = xs_impl_read(s->impl, xen_domid, tx_id, path, data); + if (err) { + xs_error(s, req_id, tx_id, err); + return; + } + + rsp->type = XS_READ; + rsp->req_id = req_id; + rsp->tx_id = tx_id; + rsp->len = 0; + + len = data->len; + if (len > XENSTORE_PAYLOAD_MAX) { + xs_error(s, req_id, tx_id, E2BIG); + return; + } + + memcpy(&rsp_data[rsp->len], data->data, len); + rsp->len += len; +} + +static void xs_write(XenXenstoreState *s, unsigned int req_id, + xs_transaction_t tx_id, uint8_t *req_data, + unsigned int len) +{ + g_autoptr(GByteArray) data = g_byte_array_new(); + const char *path; + int err; + + if (len == 0) { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + + path = (const char *)req_data; + + while (len--) { + if (*req_data++ == '\0') { + break; + } + if (len == 0) { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + } + + g_byte_array_append(data, req_data, len); + + trace_xenstore_write(tx_id, path); + err = xs_impl_write(s->impl, xen_domid, tx_id, path, data); + if (err) { + xs_error(s, req_id, tx_id, err); + return; + } + + xs_ok(s, XS_WRITE, req_id, tx_id); +} + +static void xs_mkdir(XenXenstoreState *s, unsigned int req_id, + xs_transaction_t tx_id, uint8_t *req_data, + unsigned int len) +{ + g_autoptr(GByteArray) data = g_byte_array_new(); + const char *path; + int err; + + if (len == 0 || req_data[len - 1] != '\0') { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + + path = (const char *)req_data; + + trace_xenstore_mkdir(tx_id, path); + err = xs_impl_read(s->impl, xen_domid, tx_id, path, data); + if (err == ENOENT) { + err = xs_impl_write(s->impl, xen_domid, tx_id, path, data); + } + + if (err) { + xs_error(s, req_id, tx_id, err); + return; + } + + xs_ok(s, XS_MKDIR, req_id, tx_id); +} + +static void xs_append_strings(XenXenstoreState *s, struct xsd_sockmsg *rsp, + GList *strings, unsigned int start, bool truncate) +{ + uint8_t *rsp_data = (uint8_t *)&rsp[1]; + GList *l; + + for (l = strings; l; l = l->next) { + size_t len = strlen(l->data) + 1; /* Including the NUL termination */ + char *str = l->data; + + if (rsp->len + len > XENSTORE_PAYLOAD_MAX) { + if (truncate) { + len = XENSTORE_PAYLOAD_MAX - rsp->len; + if (!len) { + return; + } + } else { + xs_error(s, rsp->req_id, rsp->tx_id, E2BIG); + return; + } + } + + if (start) { + if (start >= len) { + start -= len; + continue; + } + + str += start; + len -= start; + start = 0; + } + + memcpy(&rsp_data[rsp->len], str, len); + rsp->len += len; + } + /* XS_DIRECTORY_PART wants an extra NUL to indicate the end */ + if (truncate && rsp->len < XENSTORE_PAYLOAD_MAX) { + rsp_data[rsp->len++] = '\0'; + } +} + +static void xs_directory(XenXenstoreState *s, unsigned int req_id, + xs_transaction_t tx_id, uint8_t *req_data, + unsigned int len) +{ + struct xsd_sockmsg *rsp = (struct xsd_sockmsg *)s->rsp_data; + GList *items = NULL; + const char *path; + int err; + + if (len == 0 || req_data[len - 1] != '\0') { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + + path = (const char *)req_data; + + trace_xenstore_directory(tx_id, path); +
err = xs_impl_directory(s->impl, xen_domid, tx_id, path, NULL, &items); + if (err != 0) { + xs_error(s, req_id, tx_id, err); + return; + } + + rsp->type = XS_DIRECTORY; + rsp->req_id = req_id; + rsp->tx_id = tx_id; + rsp->len = 0; + + xs_append_strings(s, rsp, items, 0, false); + + g_list_free_full(items, g_free); +} + +static void xs_directory_part(XenXenstoreState *s, unsigned int req_id, + xs_transaction_t tx_id, uint8_t *req_data, + unsigned int len) +{ + const char *offset_str, *path = (const char *)req_data; + struct xsd_sockmsg *rsp = (struct xsd_sockmsg *)s->rsp_data; + char *rsp_data = (char *)&rsp[1]; + uint64_t gencnt = 0; + unsigned int offset; + GList *items = NULL; + int err; + + if (len == 0) { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + + while (len--) { + if (*req_data++ == '\0') { + break; + } + if (len == 0) { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + } + + offset_str = (const char *)req_data; + while (len--) { + if (*req_data++ == '\0') { + break; + } + if (len == 0) { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + } + + if (len) { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + + if (qemu_strtoui(offset_str, NULL, 10, &offset) < 0) { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + + trace_xenstore_directory_part(tx_id, path, offset); + err = xs_impl_directory(s->impl, xen_domid, tx_id, path, &gencnt, &items); + if (err != 0) { + xs_error(s, req_id, tx_id, err); + return; + } + + rsp->type = XS_DIRECTORY_PART; + rsp->req_id = req_id; + rsp->tx_id = tx_id; + rsp->len = snprintf(rsp_data, XENSTORE_PAYLOAD_MAX, "%" PRIu64, gencnt) + 1; + + xs_append_strings(s, rsp, items, offset, true); + + g_list_free_full(items, g_free); +} + +static void xs_transaction_start(XenXenstoreState *s, unsigned int req_id, + xs_transaction_t tx_id, uint8_t *req_data, + unsigned int len) +{ + struct xsd_sockmsg *rsp = (struct xsd_sockmsg *)s->rsp_data; + char *rsp_data = (char *)&rsp[1]; + int err; + + if (len != 1 || req_data[0] != '\0') { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + + rsp->type = XS_TRANSACTION_START; + rsp->req_id = req_id; + rsp->tx_id = tx_id; + rsp->len = 0; + + err = xs_impl_transaction_start(s->impl, xen_domid, &tx_id); + if (err) { + xs_error(s, req_id, tx_id, err); + return; + } + + trace_xenstore_transaction_start(tx_id); + + rsp->len = snprintf(rsp_data, XENSTORE_PAYLOAD_MAX, "%u", tx_id); + assert(rsp->len < XENSTORE_PAYLOAD_MAX); + rsp->len++; +} + +static void xs_transaction_end(XenXenstoreState *s, unsigned int req_id, + xs_transaction_t tx_id, uint8_t *req_data, + unsigned int len) +{ + bool commit; + int err; + + if (len != 2 || req_data[1] != '\0') { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + + switch (req_data[0]) { + case 'T': + commit = true; + break; + case 'F': + commit = false; + break; + default: + xs_error(s, req_id, tx_id, EINVAL); + return; + } + + trace_xenstore_transaction_end(tx_id, commit); + err = xs_impl_transaction_end(s->impl, xen_domid, tx_id, commit); + if (err) { + xs_error(s, req_id, tx_id, err); + return; + } + + xs_ok(s, XS_TRANSACTION_END, req_id, tx_id); +} + +static void xs_rm(XenXenstoreState *s, unsigned int req_id, + xs_transaction_t tx_id, uint8_t *req_data, unsigned int len) +{ + const char *path = (const char *)req_data; + int err; + + if (len == 0 || req_data[len - 1] != '\0') { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + + trace_xenstore_rm(tx_id, path); + err = xs_impl_rm(s->impl, xen_domid, tx_id, path); + if (err) { + xs_error(s, req_id, tx_id, err); + 
return; + } + + xs_ok(s, XS_RM, req_id, tx_id); +} + +static void xs_get_perms(XenXenstoreState *s, unsigned int req_id, + xs_transaction_t tx_id, uint8_t *req_data, + unsigned int len) +{ + const char *path = (const char *)req_data; + struct xsd_sockmsg *rsp = (struct xsd_sockmsg *)s->rsp_data; + GList *perms = NULL; + int err; + + if (len == 0 || req_data[len - 1] != '\0') { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + + trace_xenstore_get_perms(tx_id, path); + err = xs_impl_get_perms(s->impl, xen_domid, tx_id, path, &perms); + if (err) { + xs_error(s, req_id, tx_id, err); + return; + } + + rsp->type = XS_GET_PERMS; + rsp->req_id = req_id; + rsp->tx_id = tx_id; + rsp->len = 0; + + xs_append_strings(s, rsp, perms, 0, false); + + g_list_free_full(perms, g_free); +} + +static void xs_set_perms(XenXenstoreState *s, unsigned int req_id, + xs_transaction_t tx_id, uint8_t *req_data, + unsigned int len) +{ + const char *path = (const char *)req_data; + uint8_t *perm; + GList *perms = NULL; + int err; + + if (len == 0) { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + + while (len--) { + if (*req_data++ == '\0') { + break; + } + if (len == 0) { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + } + + perm = req_data; + while (len--) { + if (*req_data++ == '\0') { + perms = g_list_append(perms, perm); + perm = req_data; + } + } + + /* + * Note that there may be trailing garbage at the end of the buffer. + * This is explicitly permitted by the '?' at the end of the definition: + * + * SET_PERMS <path>|<perm-as-string>|+? + */ + + trace_xenstore_set_perms(tx_id, path); + err = xs_impl_set_perms(s->impl, xen_domid, tx_id, path, perms); + g_list_free(perms); + if (err) { + xs_error(s, req_id, tx_id, err); + return; + } + + xs_ok(s, XS_SET_PERMS, req_id, tx_id); +} + +static void xs_watch(XenXenstoreState *s, unsigned int req_id, + xs_transaction_t tx_id, uint8_t *req_data, + unsigned int len) +{ + const char *token, *path = (const char *)req_data; + int err; + + if (len == 0) { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + + while (len--) { + if (*req_data++ == '\0') { + break; + } + if (len == 0) { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + } + + token = (const char *)req_data; + while (len--) { + if (*req_data++ == '\0') { + break; + } + if (len == 0) { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + } + + /* + * Note that there may be trailing garbage at the end of the buffer. + * This is explicitly permitted by the '?' at the end of the definition: + * + * WATCH <wpath>|<token>|? 
+ */ + + trace_xenstore_watch(path, token); + err = xs_impl_watch(s->impl, xen_domid, path, token, fire_watch_cb, s); + if (err) { + xs_error(s, req_id, tx_id, err); + return; + } + + xs_ok(s, XS_WATCH, req_id, tx_id); +} + +static void xs_unwatch(XenXenstoreState *s, unsigned int req_id, + xs_transaction_t tx_id, uint8_t *req_data, + unsigned int len) +{ + const char *token, *path = (const char *)req_data; + int err; + + if (len == 0) { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + + while (len--) { + if (*req_data++ == '\0') { + break; + } + if (len == 0) { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + } + + token = (const char *)req_data; + while (len--) { + if (*req_data++ == '\0') { + break; + } + if (len == 0) { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + } + + trace_xenstore_unwatch(path, token); + err = xs_impl_unwatch(s->impl, xen_domid, path, token, fire_watch_cb, s); + if (err) { + xs_error(s, req_id, tx_id, err); + return; + } + + xs_ok(s, XS_UNWATCH, req_id, tx_id); +} + +static void xs_reset_watches(XenXenstoreState *s, unsigned int req_id, + xs_transaction_t tx_id, uint8_t *req_data, + unsigned int len) +{ + if (len == 0 || req_data[len - 1] != '\0') { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + + trace_xenstore_reset_watches(); + xs_impl_reset_watches(s->impl, xen_domid); + + xs_ok(s, XS_RESET_WATCHES, req_id, tx_id); +} + +static void xs_priv(XenXenstoreState *s, unsigned int req_id, + xs_transaction_t tx_id, uint8_t *data, + unsigned int len) +{ + xs_error(s, req_id, tx_id, EACCES); +} + +static void xs_unimpl(XenXenstoreState *s, unsigned int req_id, + xs_transaction_t tx_id, uint8_t *data, + unsigned int len) +{ + xs_error(s, req_id, tx_id, ENOSYS); +} + +typedef void (*xs_impl)(XenXenstoreState *s, unsigned int req_id, + xs_transaction_t tx_id, uint8_t *data, + unsigned int len); + +struct xsd_req { + const char *name; + xs_impl fn; +}; +#define XSD_REQ(_type, _fn) \ + [_type] = { .name = #_type, .fn = _fn } + +struct xsd_req xsd_reqs[] = { + XSD_REQ(XS_READ, xs_read), + XSD_REQ(XS_WRITE, xs_write), + XSD_REQ(XS_MKDIR, xs_mkdir), + XSD_REQ(XS_DIRECTORY, xs_directory), + XSD_REQ(XS_DIRECTORY_PART, xs_directory_part), + XSD_REQ(XS_TRANSACTION_START, xs_transaction_start), + XSD_REQ(XS_TRANSACTION_END, xs_transaction_end), + XSD_REQ(XS_RM, xs_rm), + XSD_REQ(XS_GET_PERMS, xs_get_perms), + XSD_REQ(XS_SET_PERMS, xs_set_perms), + XSD_REQ(XS_WATCH, xs_watch), + XSD_REQ(XS_UNWATCH, xs_unwatch), + XSD_REQ(XS_CONTROL, xs_priv), + XSD_REQ(XS_INTRODUCE, xs_priv), + XSD_REQ(XS_RELEASE, xs_priv), + XSD_REQ(XS_IS_DOMAIN_INTRODUCED, xs_priv), + XSD_REQ(XS_RESUME, xs_priv), + XSD_REQ(XS_SET_TARGET, xs_priv), + XSD_REQ(XS_RESET_WATCHES, xs_reset_watches), +}; + static void process_req(XenXenstoreState *s) { struct xsd_sockmsg *req = (struct xsd_sockmsg *)s->req_data; - struct xsd_sockmsg *rsp = (struct xsd_sockmsg *)s->rsp_data; - const char enosys[] = "ENOSYS"; + xs_impl handler = NULL; assert(req_pending(s)); assert(!s->rsp_pending); - rsp->type = XS_ERROR; - rsp->req_id = req->req_id; - rsp->tx_id = req->tx_id; - rsp->len = sizeof(enosys); - memcpy((void *)&rsp[1], enosys, sizeof(enosys)); + if (req->type < ARRAY_SIZE(xsd_reqs)) { + handler = xsd_reqs[req->type].fn; + } + if (!handler) { + handler = &xs_unimpl; + } + + handler(s, req->req_id, req->tx_id, (uint8_t *)&req[1], req->len); s->rsp_pending = true; reset_req(s); @@ -415,6 +1267,113 @@ static unsigned int put_rsp(XenXenstoreState *s) return copylen; } +static void 
deliver_watch(XenXenstoreState *s, const char *path, + const char *token) +{ + struct xsd_sockmsg *rsp = (struct xsd_sockmsg *)s->rsp_data; + uint8_t *rsp_data = (uint8_t *)&rsp[1]; + unsigned int len; + + assert(!s->rsp_pending); + + trace_xenstore_watch_event(path, token); + + rsp->type = XS_WATCH_EVENT; + rsp->req_id = 0; + rsp->tx_id = 0; + rsp->len = 0; + + len = strlen(path); + + /* XENSTORE_ABS/REL_PATH_MAX should ensure there can be no overflow */ + assert(rsp->len + len < XENSTORE_PAYLOAD_MAX); + + memcpy(&rsp_data[rsp->len], path, len); + rsp->len += len; + rsp_data[rsp->len] = '\0'; + rsp->len++; + + len = strlen(token); + /* + * It is possible for the guest to have chosen a token that will + * not fit (along with the path) into a watch event. We have no + * choice but to drop the event if this is the case. + */ + if (rsp->len + len >= XENSTORE_PAYLOAD_MAX) { + return; + } + + memcpy(&rsp_data[rsp->len], token, len); + rsp->len += len; + rsp_data[rsp->len] = '\0'; + rsp->len++; + + s->rsp_pending = true; +} + +struct watch_event { + char *path; + char *token; +}; + +static void free_watch_event(struct watch_event *ev) +{ + if (ev) { + g_free(ev->path); + g_free(ev->token); + g_free(ev); + } +} + +static void queue_watch(XenXenstoreState *s, const char *path, + const char *token) +{ + struct watch_event *ev = g_new0(struct watch_event, 1); + + ev->path = g_strdup(path); + ev->token = g_strdup(token); + + s->watch_events = g_list_append(s->watch_events, ev); +} + +static void fire_watch_cb(void *opaque, const char *path, const char *token) +{ + XenXenstoreState *s = opaque; + + assert(qemu_mutex_iothread_locked()); + + /* + * If there's a response pending, we obviously can't scribble over + * it. But if there's a request pending, it has dibs on the buffer + * too. + * + * In the common case of a watch firing due to backend activity + * when the ring was otherwise idle, we should be able to copy the + * strings directly into the rsp_data and thence the actual ring, + * without needing to perform any allocations and queue them. + */ + if (s->rsp_pending || req_pending(s)) { + queue_watch(s, path, token); + } else { + deliver_watch(s, path, token); + /* + * If the message was queued because there was already ring activity, + * no need to wake the guest. But if not, we need to send the evtchn. + */ + xen_be_evtchn_notify(s->eh, s->be_port); + } +} + +static void process_watch_events(XenXenstoreState *s) +{ + struct watch_event *ev = s->watch_events->data; + + deliver_watch(s, ev->path, ev->token); + + s->watch_events = g_list_remove(s->watch_events, ev); + free_watch_event(ev); +} + static void xen_xenstore_event(void *opaque) { XenXenstoreState *s = opaque; @@ -433,6 +1392,10 @@ static void xen_xenstore_event(void *opaque) copied_to = copied_from = 0; processed = false; + if (!s->rsp_pending && s->watch_events) { + process_watch_events(s); + } + if (s->rsp_pending) { copied_to = put_rsp(s); } @@ -441,7 +1404,7 @@ static void xen_xenstore_event(void *opaque) copied_from = get_req(s); } - if (req_pending(s) && !s->rsp_pending) { + if (req_pending(s) && !s->rsp_pending && !s->watch_events) { process_req(s); processed = true; } @@ -496,5 +1459,270 @@ int xen_xenstore_reset(void) } s->be_port = err; + /* + * We don't actually access the guest's page through the grant, because + * this isn't real Xen, and we can just use the page we gave it in the + * first place. Map the grant anyway, mostly for cosmetic purposes so + * it *looks* like it's in use in the guest-visible grant table.
+ */ + s->gt = qemu_xen_gnttab_open(); + uint32_t xs_gntref = GNTTAB_RESERVED_XENSTORE; + s->granted_xs = qemu_xen_gnttab_map_refs(s->gt, 1, xen_domid, &xs_gntref, + PROT_READ | PROT_WRITE); + return 0; } + +struct qemu_xs_handle { + XenstoreImplState *impl; + GList *watches; + QEMUBH *watch_bh; +}; + +struct qemu_xs_watch { + struct qemu_xs_handle *h; + char *path; + xs_watch_fn fn; + void *opaque; + GList *events; +}; + +static char *xs_be_get_domain_path(struct qemu_xs_handle *h, unsigned int domid) +{ + return g_strdup_printf("/local/domain/%u", domid); +} + +static char **xs_be_directory(struct qemu_xs_handle *h, xs_transaction_t t, + const char *path, unsigned int *num) +{ + GList *items = NULL, *l; + unsigned int i = 0; + char **items_ret; + int err; + + err = xs_impl_directory(h->impl, DOMID_QEMU, t, path, NULL, &items); + if (err) { + errno = err; + return NULL; + } + + items_ret = g_new0(char *, g_list_length(items) + 1); + *num = 0; + for (l = items; l; l = l->next) { + items_ret[i++] = l->data; + (*num)++; + } + g_list_free(items); + return items_ret; +} + +static void *xs_be_read(struct qemu_xs_handle *h, xs_transaction_t t, + const char *path, unsigned int *len) +{ + GByteArray *data = g_byte_array_new(); + bool free_segment = false; + int err; + + err = xs_impl_read(h->impl, DOMID_QEMU, t, path, data); + if (err) { + free_segment = true; + errno = err; + } else { + if (len) { + *len = data->len; + } + /* The xen-bus-helper code expects to get NUL terminated string! */ + g_byte_array_append(data, (void *)"", 1); + } + + return g_byte_array_free(data, free_segment); +} + +static bool xs_be_write(struct qemu_xs_handle *h, xs_transaction_t t, + const char *path, const void *data, unsigned int len) +{ + GByteArray *gdata = g_byte_array_new(); + int err; + + g_byte_array_append(gdata, data, len); + err = xs_impl_write(h->impl, DOMID_QEMU, t, path, gdata); + g_byte_array_unref(gdata); + if (err) { + errno = err; + return false; + } + return true; +} + +static bool xs_be_create(struct qemu_xs_handle *h, xs_transaction_t t, + unsigned int owner, unsigned int domid, + unsigned int perms, const char *path) +{ + g_autoptr(GByteArray) data = g_byte_array_new(); + GList *perms_list = NULL; + int err; + + /* mkdir does this */ + err = xs_impl_read(h->impl, DOMID_QEMU, t, path, data); + if (err == ENOENT) { + err = xs_impl_write(h->impl, DOMID_QEMU, t, path, data); + } + if (err) { + errno = err; + return false; + } + + perms_list = g_list_append(perms_list, + xs_perm_as_string(XS_PERM_NONE, owner)); + perms_list = g_list_append(perms_list, + xs_perm_as_string(perms, domid)); + + err = xs_impl_set_perms(h->impl, DOMID_QEMU, t, path, perms_list); + g_list_free_full(perms_list, g_free); + if (err) { + errno = err; + return false; + } + return true; +} + +static bool xs_be_destroy(struct qemu_xs_handle *h, xs_transaction_t t, + const char *path) +{ + int err = xs_impl_rm(h->impl, DOMID_QEMU, t, path); + if (err) { + errno = err; + return false; + } + return true; +} + +static void be_watch_bh(void *_h) +{ + struct qemu_xs_handle *h = _h; + GList *l; + + for (l = h->watches; l; l = l->next) { + struct qemu_xs_watch *w = l->data; + + while (w->events) { + struct watch_event *ev = w->events->data; + + w->fn(w->opaque, ev->path); + + w->events = g_list_remove(w->events, ev); + free_watch_event(ev); + } + } +} + +static void xs_be_watch_cb(void *opaque, const char *path, const char *token) +{ + struct watch_event *ev = g_new0(struct watch_event, 1); + struct qemu_xs_watch *w = opaque; + + /* We 
don't care about the token */ + ev->path = g_strdup(path); + w->events = g_list_append(w->events, ev); + + qemu_bh_schedule(w->h->watch_bh); +} + +static struct qemu_xs_watch *xs_be_watch(struct qemu_xs_handle *h, + const char *path, xs_watch_fn fn, + void *opaque) +{ + struct qemu_xs_watch *w = g_new0(struct qemu_xs_watch, 1); + int err; + + w->h = h; + w->fn = fn; + w->opaque = opaque; + + err = xs_impl_watch(h->impl, DOMID_QEMU, path, NULL, xs_be_watch_cb, w); + if (err) { + errno = err; + g_free(w); + return NULL; + } + + w->path = g_strdup(path); + h->watches = g_list_append(h->watches, w); + return w; +} + +static void xs_be_unwatch(struct qemu_xs_handle *h, struct qemu_xs_watch *w) +{ + xs_impl_unwatch(h->impl, DOMID_QEMU, w->path, NULL, xs_be_watch_cb, w); + + h->watches = g_list_remove(h->watches, w); + g_list_free_full(w->events, (GDestroyNotify)free_watch_event); + g_free(w->path); + g_free(w); +} + +static xs_transaction_t xs_be_transaction_start(struct qemu_xs_handle *h) +{ + unsigned int new_tx = XBT_NULL; + int err = xs_impl_transaction_start(h->impl, DOMID_QEMU, &new_tx); + if (err) { + errno = err; + return XBT_NULL; + } + return new_tx; +} + +static bool xs_be_transaction_end(struct qemu_xs_handle *h, xs_transaction_t t, + bool abort) +{ + int err = xs_impl_transaction_end(h->impl, DOMID_QEMU, t, !abort); + if (err) { + errno = err; + return false; + } + return true; +} + +static struct qemu_xs_handle *xs_be_open(void) +{ + XenXenstoreState *s = xen_xenstore_singleton; + struct qemu_xs_handle *h; + + if (!s || !s->impl) { + errno = ENOSYS; + return NULL; + } + + h = g_new0(struct qemu_xs_handle, 1); + h->impl = s->impl; + + h->watch_bh = aio_bh_new(qemu_get_aio_context(), be_watch_bh, h); + + return h; +} + +static void xs_be_close(struct qemu_xs_handle *h) +{ + while (h->watches) { + struct qemu_xs_watch *w = h->watches->data; + xs_be_unwatch(h, w); + } + + qemu_bh_delete(h->watch_bh); + g_free(h); +} + +static struct xenstore_backend_ops emu_xenstore_backend_ops = { + .open = xs_be_open, + .close = xs_be_close, + .get_domain_path = xs_be_get_domain_path, + .directory = xs_be_directory, + .read = xs_be_read, + .write = xs_be_write, + .create = xs_be_create, + .destroy = xs_be_destroy, + .watch = xs_be_watch, + .unwatch = xs_be_unwatch, + .transaction_start = xs_be_transaction_start, + .transaction_end = xs_be_transaction_end, +}; diff --git a/hw/i386/kvm/xenstore_impl.c b/hw/i386/kvm/xenstore_impl.c new file mode 100644 index 0000000000..305fe75519 --- /dev/null +++ b/hw/i386/kvm/xenstore_impl.c @@ -0,0 +1,1927 @@ +/* + * QEMU Xen emulation: The actual implementation of XenStore + * + * Copyright © 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Authors: David Woodhouse <dwmw2@infradead.org>, Paul Durrant <paul@xen.org> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory.
+ */ + +#include "qemu/osdep.h" +#include "qom/object.h" + +#include "hw/xen/xen.h" + +#include "xen_xenstore.h" +#include "xenstore_impl.h" + +#include "hw/xen/interface/io/xs_wire.h" + +#define XS_MAX_WATCHES 128 +#define XS_MAX_DOMAIN_NODES 1000 +#define XS_MAX_NODE_SIZE 2048 +#define XS_MAX_TRANSACTIONS 10 +#define XS_MAX_PERMS_PER_NODE 5 + +#define XS_VALID_CHARS "abcdefghijklmnopqrstuvwxyz" \ + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" \ + "0123456789-/_" + +typedef struct XsNode { + uint32_t ref; + GByteArray *content; + GList *perms; + GHashTable *children; + uint64_t gencnt; + bool deleted_in_tx; + bool modified_in_tx; + unsigned int serialized_tx; +#ifdef XS_NODE_UNIT_TEST + gchar *name; /* debug only */ +#endif +} XsNode; + +typedef struct XsWatch { + struct XsWatch *next; + xs_impl_watch_fn *cb; + void *cb_opaque; + char *token; + unsigned int dom_id; + int rel_prefix; +} XsWatch; + +typedef struct XsTransaction { + XsNode *root; + unsigned int nr_nodes; + unsigned int base_tx; + unsigned int tx_id; + unsigned int dom_id; +} XsTransaction; + +struct XenstoreImplState { + XsNode *root; + unsigned int nr_nodes; + GHashTable *watches; + unsigned int nr_domu_watches; + GHashTable *transactions; + unsigned int nr_domu_transactions; + unsigned int root_tx; + unsigned int last_tx; + bool serialized; +}; + + +static void nobble_tx(gpointer key, gpointer value, gpointer user_data) +{ + unsigned int *new_tx_id = user_data; + XsTransaction *tx = value; + + if (tx->base_tx == *new_tx_id) { + /* Transactions based on XBT_NULL will always fail */ + tx->base_tx = XBT_NULL; + } +} + +static inline unsigned int next_tx(struct XenstoreImplState *s) +{ + unsigned int tx_id; + + /* Find the next TX id which isn't either XBT_NULL or in use. */ + do { + tx_id = ++s->last_tx; + } while (tx_id == XBT_NULL || tx_id == s->root_tx || + g_hash_table_lookup(s->transactions, GINT_TO_POINTER(tx_id))); + + /* + * It is vanishingly unlikely, but ensure that no outstanding transaction + * is based on the (previous incarnation of the) newly-allocated TX id. + */ + g_hash_table_foreach(s->transactions, nobble_tx, &tx_id); + + return tx_id; +} + +static inline XsNode *xs_node_new(void) +{ + XsNode *n = g_new0(XsNode, 1); + n->ref = 1; + +#ifdef XS_NODE_UNIT_TEST + nr_xs_nodes++; + xs_node_list = g_list_prepend(xs_node_list, n); +#endif + return n; +} + +static inline XsNode *xs_node_ref(XsNode *n) +{ + /* With just 10 transactions, it can never get anywhere near this. 
*/ + g_assert(n->ref < INT_MAX); + + g_assert(n->ref); + n->ref++; + return n; +} + +static inline void xs_node_unref(XsNode *n) +{ + if (!n) { + return; + } + g_assert(n->ref); + if (--n->ref) { + return; + } + + if (n->content) { + g_byte_array_unref(n->content); + } + if (n->perms) { + g_list_free_full(n->perms, g_free); + } + if (n->children) { + g_hash_table_unref(n->children); + } +#ifdef XS_NODE_UNIT_TEST + g_free(n->name); + nr_xs_nodes--; + xs_node_list = g_list_remove(xs_node_list, n); +#endif + g_free(n); +} + +char *xs_perm_as_string(unsigned int perm, unsigned int domid) +{ + char letter; + + switch (perm) { + case XS_PERM_READ | XS_PERM_WRITE: + letter = 'b'; + break; + case XS_PERM_READ: + letter = 'r'; + break; + case XS_PERM_WRITE: + letter = 'w'; + break; + case XS_PERM_NONE: + default: + letter = 'n'; + break; + } + + return g_strdup_printf("%c%u", letter, domid); +} + +static gpointer do_perm_copy(gconstpointer src, gpointer user_data) +{ + return g_strdup(src); +} + +static XsNode *xs_node_create(const char *name, GList *perms) +{ + XsNode *n = xs_node_new(); + +#ifdef XS_NODE_UNIT_TEST + if (name) { + n->name = g_strdup(name); + } +#endif + + n->perms = g_list_copy_deep(perms, do_perm_copy, NULL); + + return n; +} + +/* For copying from one hash table to another using g_hash_table_foreach() */ +static void do_child_insert(gpointer key, gpointer value, gpointer user_data) +{ + g_hash_table_insert(user_data, g_strdup(key), xs_node_ref(value)); +} + +static XsNode *xs_node_copy(XsNode *old) +{ + XsNode *n = xs_node_new(); + + assert(old); + n->gencnt = old->gencnt; + +#ifdef XS_NODE_UNIT_TEST + if (old->name) { + n->name = g_strdup(old->name); + } +#endif + + if (old->children) { + n->children = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, + (GDestroyNotify)xs_node_unref); + g_hash_table_foreach(old->children, do_child_insert, n->children); + } + if (old->perms) { + n->perms = g_list_copy_deep(old->perms, do_perm_copy, NULL); + } + if (old->content) { + n->content = g_byte_array_ref(old->content); + } + return n; +} + +/* Returns true if it made a change to the hash table */ +static bool xs_node_add_child(XsNode *n, const char *path_elem, XsNode *child) +{ + assert(!strchr(path_elem, '/')); + + if (!child) { + assert(n->children); + return g_hash_table_remove(n->children, path_elem); + } + +#ifdef XS_NODE_UNIT_TEST + g_free(child->name); + child->name = g_strdup(path_elem); +#endif + if (!n->children) { + n->children = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, + (GDestroyNotify)xs_node_unref); + } + + /* + * The documentation for g_hash_table_insert() says that it "returns a + * boolean value to indicate whether the newly added value was already + * in the hash table or not." + * + * It could perhaps be clearer that returning TRUE means it wasn't. + */ + return g_hash_table_insert(n->children, g_strdup(path_elem), child); +} + +struct walk_op { + struct XenstoreImplState *s; + char path[XENSTORE_ABS_PATH_MAX + 2]; /* Two NUL terminators */ + int (*op_fn)(XsNode **n, struct walk_op *op); + void *op_opaque; + void *op_opaque2; + + GList *watches; + unsigned int dom_id; + unsigned int tx_id; + + /* The number of nodes which will exist in the tree if this op succeeds. */ + unsigned int new_nr_nodes; + + /* + * This is maintained on the way *down* the walk to indicate + * whether nodes can be modified in place or whether COW is + * required. It starts off being true, as we're always going to + * replace the root node.
If we walk into a shared subtree it + * becomes false. If we start *creating* new nodes for a write, + * it becomes true again. + * + * Do not use it on the way back up. + */ + bool inplace; + bool mutating; + bool create_dirs; + bool in_transaction; + + /* Tracking during recursion so we know which is first. */ + bool deleted_in_tx; +}; + +static void fire_watches(struct walk_op *op, bool parents) +{ + GList *l = NULL; + XsWatch *w; + + if (!op->mutating || op->in_transaction) { + return; + } + + if (parents) { + l = op->watches; + } + + w = g_hash_table_lookup(op->s->watches, op->path); + while (w || l) { + if (!w) { + /* Fire the parent nodes from 'op' if asked to */ + w = l->data; + l = l->next; + continue; + } + + assert(strlen(op->path) > w->rel_prefix); + w->cb(w->cb_opaque, op->path + w->rel_prefix, w->token); + + w = w->next; + } +} + +static int xs_node_add_content(XsNode **n, struct walk_op *op) +{ + GByteArray *data = op->op_opaque; + + if (op->dom_id) { + /* + * The real XenStored includes permissions and names of child nodes + * in the calculated datasize but life's too short. For a single + * tenant internal XenStore, we don't have to be quite as pedantic. + */ + if (data->len > XS_MAX_NODE_SIZE) { + return E2BIG; + } + } + /* We *are* the node to be written. Either this or a copy. */ + if (!op->inplace) { + XsNode *old = *n; + *n = xs_node_copy(old); + xs_node_unref(old); + } + + if ((*n)->content) { + g_byte_array_unref((*n)->content); + } + (*n)->content = g_byte_array_ref(data); + if (op->tx_id != XBT_NULL) { + (*n)->modified_in_tx = true; + } + return 0; +} + +static int xs_node_get_content(XsNode **n, struct walk_op *op) +{ + GByteArray *data = op->op_opaque; + GByteArray *node_data; + + assert(op->inplace); + assert(*n); + + node_data = (*n)->content; + if (node_data) { + g_byte_array_append(data, node_data->data, node_data->len); + } + + return 0; +} + +static int node_rm_recurse(gpointer key, gpointer value, gpointer user_data) +{ + struct walk_op *op = user_data; + int path_len = strlen(op->path); + int key_len = strlen(key); + XsNode *n = value; + bool this_inplace = op->inplace; + + if (n->ref != 1) { + op->inplace = 0; + } + + assert(key_len + path_len + 2 <= sizeof(op->path)); + op->path[path_len] = '/'; + memcpy(op->path + path_len + 1, key, key_len + 1); + + if (n->children) { + g_hash_table_foreach_remove(n->children, node_rm_recurse, op); + } + op->new_nr_nodes--; + + /* + * Fire watches on *this* node but not the parents because they are + * going to be deleted too, so the watch will fire for them anyway. + */ + fire_watches(op, false); + op->path[path_len] = '\0'; + + /* + * Actually deleting the child here is just an optimisation; if we + * don't then the final unref on the topmost victim will just have + * to cascade down again repeating all the g_hash_table_foreach() + * calls. + */ + return this_inplace; +} + +static XsNode *xs_node_copy_deleted(XsNode *old, struct walk_op *op); +static void copy_deleted_recurse(gpointer key, gpointer value, + gpointer user_data) +{ + struct walk_op *op = user_data; + GHashTable *siblings = op->op_opaque2; + XsNode *n = xs_node_copy_deleted(value, op); + + /* + * Reinsert the deleted_in_tx copy of the node into the parent's + * 'children' hash table. Having stashed it from op->op_opaque2 + * before the recursive call to xs_node_copy_deleted() scribbled + * over it. 
+ */
+ g_hash_table_insert(siblings, g_strdup(key), n);
+}
+
+static XsNode *xs_node_copy_deleted(XsNode *old, struct walk_op *op)
+{
+ XsNode *n = xs_node_new();
+
+ n->gencnt = old->gencnt;
+
+#ifdef XS_NODE_UNIT_TEST
+ if (old->name) {
+ n->name = g_strdup(old->name);
+ }
+#endif
+
+ if (old->children) {
+ n->children = g_hash_table_new_full(g_str_hash, g_str_equal, g_free,
+ (GDestroyNotify)xs_node_unref);
+ op->op_opaque2 = n->children;
+ g_hash_table_foreach(old->children, copy_deleted_recurse, op);
+ }
+ if (old->perms) {
+ n->perms = g_list_copy_deep(old->perms, do_perm_copy, NULL);
+ }
+ n->deleted_in_tx = true;
+ /* If it gets resurrected we only fire a watch if it lost its content */
+ if (old->content) {
+ n->modified_in_tx = true;
+ }
+ op->new_nr_nodes--;
+ return n;
+}
+
+static int xs_node_rm(XsNode **n, struct walk_op *op)
+{
+ bool this_inplace = op->inplace;
+
+ if (op->tx_id != XBT_NULL) {
+ /* It's not trivial to do inplace handling for this one */
+ XsNode *old = *n;
+ *n = xs_node_copy_deleted(old, op);
+ xs_node_unref(old);
+ return 0;
+ }
+
+ /* Fire watches for, and count, nodes in the subtree which get deleted */
+ if ((*n)->children) {
+ g_hash_table_foreach_remove((*n)->children, node_rm_recurse, op);
+ }
+ op->new_nr_nodes--;
+
+ if (this_inplace) {
+ xs_node_unref(*n);
+ }
+ *n = NULL;
+ return 0;
+}
+
+static int xs_node_get_perms(XsNode **n, struct walk_op *op)
+{
+ GList **perms = op->op_opaque;
+
+ assert(op->inplace);
+ assert(*n);
+
+ *perms = g_list_copy_deep((*n)->perms, do_perm_copy, NULL);
+ return 0;
+}
+
+static void parse_perm(const char *perm, char *letter, unsigned int *dom_id)
+{
+ unsigned int n = sscanf(perm, "%c%u", letter, dom_id);
+
+ assert(n == 2);
+}
+
+static bool can_access(unsigned int dom_id, GList *perms, const char *letters)
+{
+ unsigned int i, n;
+ char perm_letter;
+ unsigned int perm_dom_id;
+ bool access;
+
+ if (dom_id == 0) {
+ return true;
+ }
+
+ n = g_list_length(perms);
+ assert(n >= 1);
+
+ /*
+ * The dom_id of the first perm is the owner, and the owner always has
+ * read-write access.
+ */
+ parse_perm(g_list_nth_data(perms, 0), &perm_letter, &perm_dom_id);
+ if (dom_id == perm_dom_id) {
+ return true;
+ }
+
+ /*
+ * The letter of the first perm specifies the default access for all other
+ * domains.
+ */
+ access = !!strchr(letters, perm_letter);
+ for (i = 1; i < n; i++) {
+ parse_perm(g_list_nth_data(perms, i), &perm_letter, &perm_dom_id);
+ if (dom_id != perm_dom_id) {
+ continue;
+ }
+ access = !!strchr(letters, perm_letter);
+ }
+
+ return access;
+}
+
+static int xs_node_set_perms(XsNode **n, struct walk_op *op)
+{
+ GList *perms = op->op_opaque;
+
+ if (op->dom_id) {
+ unsigned int perm_dom_id;
+ char perm_letter;
+
+ /* A guest may not change permissions on nodes it does not own */
+ if (!can_access(op->dom_id, (*n)->perms, "")) {
+ return EPERM;
+ }
+
+ /* A guest may not change the owner of a node it owns. */
+ parse_perm(perms->data, &perm_letter, &perm_dom_id);
+ if (perm_dom_id != op->dom_id) {
+ return EPERM;
+ }
+
+ if (g_list_length(perms) > XS_MAX_PERMS_PER_NODE) {
+ return ENOSPC;
+ }
+ }
+
+ /* We *are* the node to be written. Either this or a copy.
*/ + if (!op->inplace) { + XsNode *old = *n; + *n = xs_node_copy(old); + xs_node_unref(old); + } + + if ((*n)->perms) { + g_list_free_full((*n)->perms, g_free); + } + (*n)->perms = g_list_copy_deep(perms, do_perm_copy, NULL); + if (op->tx_id != XBT_NULL) { + (*n)->modified_in_tx = true; + } + return 0; +} + +/* + * Passed a full reference in *n which it may free if it needs to COW. + * + * When changing the tree, the op->inplace flag indicates whether this + * node may be modified in place (i.e. it and all its parents had a + * refcount of one). If walking down the tree we find a node whose + * refcount is higher, we must clear op->inplace and COW from there + * down. Unless we are creating new nodes as scaffolding for a write + * (which works like 'mkdir -p' does). In which case those newly + * created nodes can (and must) be modified in place again. + */ +static int xs_node_walk(XsNode **n, struct walk_op *op) +{ + char *child_name = NULL; + size_t namelen; + XsNode *old = *n, *child = NULL; + bool stole_child = false; + bool this_inplace; + XsWatch *watch; + int err; + + namelen = strlen(op->path); + watch = g_hash_table_lookup(op->s->watches, op->path); + + /* Is there a child, or do we hit the double-NUL termination? */ + if (op->path[namelen + 1]) { + char *slash; + child_name = op->path + namelen + 1; + slash = strchr(child_name, '/'); + if (slash) { + *slash = '\0'; + } + op->path[namelen] = '/'; + } + + /* If we walk into a subtree which is shared, we must COW */ + if (op->mutating && old->ref != 1) { + op->inplace = false; + } + + if (!child_name) { + const char *letters = op->mutating ? "wb" : "rb"; + + if (!can_access(op->dom_id, old->perms, letters)) { + err = EACCES; + goto out; + } + + /* This is the actual node on which the operation shall be performed */ + err = op->op_fn(n, op); + if (!err) { + fire_watches(op, true); + } + goto out; + } + + /* op->inplace will be further modified during the recursion */ + this_inplace = op->inplace; + + if (old && old->children) { + child = g_hash_table_lookup(old->children, child_name); + /* This is a *weak* reference to 'child', owned by the hash table */ + } + + if (child) { + if (child->deleted_in_tx) { + assert(child->ref == 1); + /* Cannot actually set child->deleted_in_tx = false until later */ + } + xs_node_ref(child); + /* + * Now we own it too. But if we can modify inplace, that's going to + * foil the check and force it to COW. We want to be the *only* owner + * so that it can be modified in place, so remove it from the hash + * table in that case. We'll add it (or its replacement) back later. + */ + if (op->mutating && this_inplace) { + g_hash_table_remove(old->children, child_name); + stole_child = true; + } + } else if (op->create_dirs) { + assert(op->mutating); + + if (!can_access(op->dom_id, old->perms, "wb")) { + err = EACCES; + goto out; + } + + if (op->dom_id && op->new_nr_nodes >= XS_MAX_DOMAIN_NODES) { + err = ENOSPC; + goto out; + } + + child = xs_node_create(child_name, old->perms); + op->new_nr_nodes++; + + /* + * If we're creating a new child, we can clearly modify it (and its + * children) in place from here on down. + */ + op->inplace = true; + } else { + err = ENOENT; + goto out; + } + + /* + * If there's a watch on this node, add it to the list to be fired + * (with the correct full pathname for the modified node) at the end. + */ + if (watch) { + op->watches = g_list_append(op->watches, watch); + } + + /* + * Except for the temporary child-stealing as noted, our node has not + * changed yet. 
We don't yet know the overall operation will complete. + */ + err = xs_node_walk(&child, op); + + if (watch) { + op->watches = g_list_remove(op->watches, watch); + } + + if (err || !op->mutating) { + if (stole_child) { + /* Put it back as it was. */ + g_hash_table_replace(old->children, g_strdup(child_name), child); + } else { + xs_node_unref(child); + } + goto out; + } + + /* + * Now we know the operation has completed successfully and we're on + * the way back up. Make the change, substituting 'child' in the + * node at our level. + */ + if (!this_inplace) { + *n = xs_node_copy(old); + xs_node_unref(old); + } + + /* + * If we resurrected a deleted_in_tx node, we can mark it as no longer + * deleted now that we know the overall operation has succeeded. + */ + if (op->create_dirs && child && child->deleted_in_tx) { + op->new_nr_nodes++; + child->deleted_in_tx = false; + } + + /* + * The child may be NULL here, for a remove operation. Either way, + * xs_node_add_child() will do the right thing and return a value + * indicating whether it changed the parent's hash table or not. + * + * We bump the parent gencnt if it adds a child that we *didn't* + * steal from it in the first place, or if child==NULL and was + * thus removed (whether we stole it earlier and didn't put it + * back, or xs_node_add_child() actually removed it now). + */ + if ((xs_node_add_child(*n, child_name, child) && !stole_child) || !child) { + (*n)->gencnt++; + } + + out: + op->path[namelen] = '\0'; + if (!namelen) { + assert(!op->watches); + /* + * On completing the recursion back up the path walk and reaching the + * top, assign the new node count if the operation was successful. If + * the main tree was changed, bump its tx ID so that outstanding + * transactions correctly fail. But don't bump it every time; only + * if it makes a difference. + */ + if (!err && op->mutating) { + if (!op->in_transaction) { + if (op->s->root_tx != op->s->last_tx) { + op->s->root_tx = next_tx(op->s); + } + op->s->nr_nodes = op->new_nr_nodes; + } else { + XsTransaction *tx = g_hash_table_lookup(op->s->transactions, + GINT_TO_POINTER(op->tx_id)); + assert(tx); + tx->nr_nodes = op->new_nr_nodes; + } + } + } + return err; +} + +static void append_directory_item(gpointer key, gpointer value, + gpointer user_data) +{ + GList **items = user_data; + + *items = g_list_insert_sorted(*items, g_strdup(key), (GCompareFunc)strcmp); +} + +/* Populates items with char * names which caller must free. 
*/
+static int xs_node_directory(XsNode **n, struct walk_op *op)
+{
+ GList **items = op->op_opaque;
+
+ assert(op->inplace);
+ assert(*n);
+
+ if ((*n)->children) {
+ g_hash_table_foreach((*n)->children, append_directory_item, items);
+ }
+
+ if (op->op_opaque2) {
+ *(uint64_t *)op->op_opaque2 = (*n)->gencnt;
+ }
+
+ return 0;
+}
+
+static int validate_path(char *outpath, const char *userpath,
+ unsigned int dom_id)
+{
+ size_t i, pathlen = strlen(userpath);
+
+ if (!pathlen || userpath[pathlen - 1] == '/' || strstr(userpath, "//")) {
+ return EINVAL;
+ }
+ for (i = 0; i < pathlen; i++) {
+ if (!strchr(XS_VALID_CHARS, userpath[i])) {
+ return EINVAL;
+ }
+ }
+ if (userpath[0] == '/') {
+ if (pathlen > XENSTORE_ABS_PATH_MAX) {
+ return E2BIG;
+ }
+ memcpy(outpath, userpath, pathlen + 1);
+ } else {
+ if (pathlen > XENSTORE_REL_PATH_MAX) {
+ return E2BIG;
+ }
+ snprintf(outpath, XENSTORE_ABS_PATH_MAX, "/local/domain/%u/%s", dom_id,
+ userpath);
+ }
+ return 0;
+}
+
+
+static int init_walk_op(XenstoreImplState *s, struct walk_op *op,
+ xs_transaction_t tx_id, unsigned int dom_id,
+ const char *path, XsNode ***rootp)
+{
+ int ret = validate_path(op->path, path, dom_id);
+ if (ret) {
+ return ret;
+ }
+
+ /*
+ * We use *two* NUL terminators at the end of the path, as during the walk
+ * we will temporarily turn each '/' into a NUL to allow us to use that
+ * path element for the lookup.
+ */
+ op->path[strlen(op->path) + 1] = '\0';
+ op->watches = NULL;
+ op->path[0] = '\0';
+ op->inplace = true;
+ op->mutating = false;
+ op->create_dirs = false;
+ op->in_transaction = false;
+ op->dom_id = dom_id;
+ op->tx_id = tx_id;
+ op->s = s;
+
+ if (tx_id == XBT_NULL) {
+ *rootp = &s->root;
+ op->new_nr_nodes = s->nr_nodes;
+ } else {
+ XsTransaction *tx = g_hash_table_lookup(s->transactions,
+ GINT_TO_POINTER(tx_id));
+ if (!tx) {
+ return ENOENT;
+ }
+ *rootp = &tx->root;
+ op->new_nr_nodes = tx->nr_nodes;
+ op->in_transaction = true;
+ }
+
+ return 0;
+}
+
+int xs_impl_read(XenstoreImplState *s, unsigned int dom_id,
+ xs_transaction_t tx_id, const char *path, GByteArray *data)
+{
+ /*
+ * The data GByteArray shall exist, and will be freed by caller.
+ * Just g_byte_array_append() to it.
+ */
+ struct walk_op op;
+ XsNode **n;
+ int ret;
+
+ ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
+ if (ret) {
+ return ret;
+ }
+ op.op_fn = xs_node_get_content;
+ op.op_opaque = data;
+ return xs_node_walk(n, &op);
+}
+
+int xs_impl_write(XenstoreImplState *s, unsigned int dom_id,
+ xs_transaction_t tx_id, const char *path, GByteArray *data)
+{
+ /*
+ * The data GByteArray shall exist, and will be freed by caller. You are
+ * free to use g_byte_array_steal() and keep the data. Or just ref it.
+ */
+ struct walk_op op;
+ XsNode **n;
+ int ret;
+
+ ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
+ if (ret) {
+ return ret;
+ }
+ op.op_fn = xs_node_add_content;
+ op.op_opaque = data;
+ op.mutating = true;
+ op.create_dirs = true;
+ return xs_node_walk(n, &op);
+}
+
+int xs_impl_directory(XenstoreImplState *s, unsigned int dom_id,
+ xs_transaction_t tx_id, const char *path,
+ uint64_t *gencnt, GList **items)
+{
+ /*
+ * The items are (char *) to be freed by the caller. They are consumed
+ * immediately, though, so if you want to change it to (const char *)
+ * and keep them, go ahead and change the caller.
+ */
+ struct walk_op op;
+ XsNode **n;
+ int ret;
+
+ ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
+ if (ret) {
+ return ret;
+ }
+ op.op_fn = xs_node_directory;
+ op.op_opaque = items;
+ op.op_opaque2 = gencnt;
+ return xs_node_walk(n, &op);
+}
+
+int xs_impl_transaction_start(XenstoreImplState *s, unsigned int dom_id,
+ xs_transaction_t *tx_id)
+{
+ XsTransaction *tx;
+
+ if (*tx_id != XBT_NULL) {
+ return EINVAL;
+ }
+
+ if (dom_id && s->nr_domu_transactions >= XS_MAX_TRANSACTIONS) {
+ return ENOSPC;
+ }
+
+ tx = g_new0(XsTransaction, 1);
+
+ tx->nr_nodes = s->nr_nodes;
+ tx->tx_id = next_tx(s);
+ tx->base_tx = s->root_tx;
+ tx->root = xs_node_ref(s->root);
+ tx->dom_id = dom_id;
+
+ g_hash_table_insert(s->transactions, GINT_TO_POINTER(tx->tx_id), tx);
+ if (dom_id) {
+ s->nr_domu_transactions++;
+ }
+ *tx_id = tx->tx_id;
+ return 0;
+}
+
+static gboolean tx_commit_walk(gpointer key, gpointer value,
+ gpointer user_data)
+{
+ struct walk_op *op = user_data;
+ int path_len = strlen(op->path);
+ int key_len = strlen(key);
+ bool fire_parents = true;
+ XsWatch *watch;
+ XsNode *n = value;
+
+ if (n->ref != 1) {
+ return false;
+ }
+
+ if (n->deleted_in_tx) {
+ /*
+ * We fire watches on our parents if we are the *first* node
+ * to be deleted (the topmost one). This matches the behaviour
+ * when deleting in the live tree.
+ */
+ fire_parents = !op->deleted_in_tx;
+
+ /* Only used on the way down so no need to clear it later */
+ op->deleted_in_tx = true;
+ }
+
+ assert(key_len + path_len + 2 <= sizeof(op->path));
+ op->path[path_len] = '/';
+ memcpy(op->path + path_len + 1, key, key_len + 1);
+
+ watch = g_hash_table_lookup(op->s->watches, op->path);
+ if (watch) {
+ op->watches = g_list_append(op->watches, watch);
+ }
+
+ if (n->children) {
+ g_hash_table_foreach_remove(n->children, tx_commit_walk, op);
+ }
+
+ if (watch) {
+ op->watches = g_list_remove(op->watches, watch);
+ }
+
+ /*
+ * Don't fire watches if this node was only copied because a
+ * descendant was changed. The modified_in_tx flag indicates the
+ * ones which were really changed.
+ */
+ if (n->modified_in_tx || n->deleted_in_tx) {
+ fire_watches(op, fire_parents);
+ n->modified_in_tx = false;
+ }
+ op->path[path_len] = '\0';
+
+ /* Deleted nodes really do get expunged when we commit */
+ return n->deleted_in_tx;
+}
+
+static int transaction_commit(XenstoreImplState *s, XsTransaction *tx)
+{
+ struct walk_op op;
+ XsNode **n;
+
+ if (s->root_tx != tx->base_tx) {
+ return EAGAIN;
+ }
+ xs_node_unref(s->root);
+ s->root = tx->root;
+ tx->root = NULL;
+ s->root_tx = tx->tx_id;
+ s->nr_nodes = tx->nr_nodes;
+
+ init_walk_op(s, &op, XBT_NULL, tx->dom_id, "/", &n);
+ op.deleted_in_tx = false;
+ op.mutating = true;
+
+ /*
+ * Walk the new root and fire watches on any node which has a
+ * refcount of one (which is therefore unique to this transaction).
+ */ + if (s->root->children) { + g_hash_table_foreach_remove(s->root->children, tx_commit_walk, &op); + } + + return 0; +} + +int xs_impl_transaction_end(XenstoreImplState *s, unsigned int dom_id, + xs_transaction_t tx_id, bool commit) +{ + int ret = 0; + XsTransaction *tx = g_hash_table_lookup(s->transactions, + GINT_TO_POINTER(tx_id)); + + if (!tx || tx->dom_id != dom_id) { + return ENOENT; + } + + if (commit) { + ret = transaction_commit(s, tx); + } + + g_hash_table_remove(s->transactions, GINT_TO_POINTER(tx_id)); + if (dom_id) { + assert(s->nr_domu_transactions); + s->nr_domu_transactions--; + } + return ret; +} + +int xs_impl_rm(XenstoreImplState *s, unsigned int dom_id, + xs_transaction_t tx_id, const char *path) +{ + struct walk_op op; + XsNode **n; + int ret; + + ret = init_walk_op(s, &op, tx_id, dom_id, path, &n); + if (ret) { + return ret; + } + op.op_fn = xs_node_rm; + op.mutating = true; + return xs_node_walk(n, &op); +} + +int xs_impl_get_perms(XenstoreImplState *s, unsigned int dom_id, + xs_transaction_t tx_id, const char *path, GList **perms) +{ + struct walk_op op; + XsNode **n; + int ret; + + ret = init_walk_op(s, &op, tx_id, dom_id, path, &n); + if (ret) { + return ret; + } + op.op_fn = xs_node_get_perms; + op.op_opaque = perms; + return xs_node_walk(n, &op); +} + +static void is_valid_perm(gpointer data, gpointer user_data) +{ + char *perm = data; + bool *valid = user_data; + char letter; + unsigned int dom_id; + + if (!*valid) { + return; + } + + if (sscanf(perm, "%c%u", &letter, &dom_id) != 2) { + *valid = false; + return; + } + + switch (letter) { + case 'n': + case 'r': + case 'w': + case 'b': + break; + + default: + *valid = false; + break; + } +} + +int xs_impl_set_perms(XenstoreImplState *s, unsigned int dom_id, + xs_transaction_t tx_id, const char *path, GList *perms) +{ + struct walk_op op; + XsNode **n; + bool valid = true; + int ret; + + if (!g_list_length(perms)) { + return EINVAL; + } + + g_list_foreach(perms, is_valid_perm, &valid); + if (!valid) { + return EINVAL; + } + + ret = init_walk_op(s, &op, tx_id, dom_id, path, &n); + if (ret) { + return ret; + } + op.op_fn = xs_node_set_perms; + op.op_opaque = perms; + op.mutating = true; + return xs_node_walk(n, &op); +} + +static int do_xs_impl_watch(XenstoreImplState *s, unsigned int dom_id, + const char *path, const char *token, + xs_impl_watch_fn fn, void *opaque) + +{ + char abspath[XENSTORE_ABS_PATH_MAX + 1]; + XsWatch *w, *l; + int ret; + + ret = validate_path(abspath, path, dom_id); + if (ret) { + return ret; + } + + /* Check for duplicates */ + l = w = g_hash_table_lookup(s->watches, abspath); + while (w) { + if (!g_strcmp0(token, w->token) && opaque == w->cb_opaque && + fn == w->cb && dom_id == w->dom_id) { + return EEXIST; + } + w = w->next; + } + + if (dom_id && s->nr_domu_watches >= XS_MAX_WATCHES) { + return E2BIG; + } + + w = g_new0(XsWatch, 1); + w->token = g_strdup(token); + w->cb = fn; + w->cb_opaque = opaque; + w->dom_id = dom_id; + w->rel_prefix = strlen(abspath) - strlen(path); + + /* l was looked up above when checking for duplicates */ + if (l) { + w->next = l->next; + l->next = w; + } else { + g_hash_table_insert(s->watches, g_strdup(abspath), w); + } + if (dom_id) { + s->nr_domu_watches++; + } + + return 0; +} + +int xs_impl_watch(XenstoreImplState *s, unsigned int dom_id, const char *path, + const char *token, xs_impl_watch_fn fn, void *opaque) +{ + int ret = do_xs_impl_watch(s, dom_id, path, token, fn, opaque); + + if (!ret) { + /* A new watch should fire immediately */ + fn(opaque, 
path, token);
+ }
+
+ return ret;
+}
+
+static XsWatch *free_watch(XenstoreImplState *s, XsWatch *w)
+{
+ XsWatch *next = w->next;
+
+ if (w->dom_id) {
+ assert(s->nr_domu_watches);
+ s->nr_domu_watches--;
+ }
+
+ g_free(w->token);
+ g_free(w);
+
+ return next;
+}
+
+int xs_impl_unwatch(XenstoreImplState *s, unsigned int dom_id,
+ const char *path, const char *token,
+ xs_impl_watch_fn fn, void *opaque)
+{
+ char abspath[XENSTORE_ABS_PATH_MAX + 1];
+ XsWatch *w, **l;
+ int ret;
+
+ ret = validate_path(abspath, path, dom_id);
+ if (ret) {
+ return ret;
+ }
+
+ w = g_hash_table_lookup(s->watches, abspath);
+ if (!w) {
+ return ENOENT;
+ }
+
+ /*
+ * The hash table contains the first element of a list of
+ * watches. Removing the first element in the list is a
+ * special case because we have to update the hash table to
+ * point to the next (or remove it if there's nothing left).
+ */
+ if (!g_strcmp0(token, w->token) && fn == w->cb && opaque == w->cb_opaque &&
+ dom_id == w->dom_id) {
+ if (w->next) {
+ /* Insert the previous 'next' into the hash table */
+ g_hash_table_insert(s->watches, g_strdup(abspath), w->next);
+ } else {
+ /* Nothing left; remove from hash table */
+ g_hash_table_remove(s->watches, abspath);
+ }
+ free_watch(s, w);
+ return 0;
+ }
+
+ /*
+ * We're all done messing with the hash table because the element
+ * it points to has survived the cull. Now it's just a simple
+ * linked list removal operation.
+ */
+ for (l = &w->next; *l; l = &w->next) {
+ w = *l;
+
+ if (!g_strcmp0(token, w->token) && fn == w->cb &&
+ opaque == w->cb_opaque && dom_id == w->dom_id) {
+ *l = free_watch(s, w);
+ return 0;
+ }
+ }
+
+ return ENOENT;
+}
+
+int xs_impl_reset_watches(XenstoreImplState *s, unsigned int dom_id)
+{
+ char **watch_paths;
+ guint nr_watch_paths;
+ guint i;
+
+ watch_paths = (char **)g_hash_table_get_keys_as_array(s->watches,
+ &nr_watch_paths);
+
+ for (i = 0; i < nr_watch_paths; i++) {
+ XsWatch *w1 = g_hash_table_lookup(s->watches, watch_paths[i]);
+ XsWatch *w2, *w, **l;
+
+ /*
+ * w1 is the original list. The hash table has this pointer.
+ * w2 is the head of our newly-filtered list.
+ * w and l are temporary for processing. w is somewhat redundant
+ * with *l but makes my eyes bleed less.
+ */
+
+ w = w2 = w1;
+ l = &w;
+ while (w) {
+ if (w->dom_id == dom_id) {
+ /* If we're freeing the head of the list, bump w2 */
+ if (w2 == w) {
+ w2 = w->next;
+ }
+ *l = free_watch(s, w);
+ } else {
+ l = &w->next;
+ }
+ w = *l;
+ }
+ /*
+ * If the head of the list survived the cull, we don't need to
+ * touch the hash table and we're done with this path. Else...
+ */
+ if (w1 != w2) {
+ g_hash_table_steal(s->watches, watch_paths[i]);
+
+ /*
+ * It was already freed. (Don't worry, this whole thing is
+ * single-threaded and nobody saw it in the meantime). And
+ * having *stolen* it, we now own the watch_paths[i] string,
+ * so if we don't give it back to the hash table, we need
+ * to free it.
+ */
+ if (w2) {
+ g_hash_table_insert(s->watches, watch_paths[i], w2);
+ } else {
+ g_free(watch_paths[i]);
+ }
+ }
+ }
+ g_free(watch_paths);
+ return 0;
+}
+
+static void xs_tx_free(void *_tx)
+{
+ XsTransaction *tx = _tx;
+ if (tx->root) {
+ xs_node_unref(tx->root);
+ }
+ g_free(tx);
+}
+
+XenstoreImplState *xs_impl_create(unsigned int dom_id)
+{
+ XenstoreImplState *s = g_new0(XenstoreImplState, 1);
+ GList *perms;
+
+ s->watches = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, NULL);
+ s->transactions = g_hash_table_new_full(g_direct_hash, g_direct_equal,
+ NULL, xs_tx_free);
+
+ perms = g_list_append(NULL, xs_perm_as_string(XS_PERM_NONE, 0));
+ s->root = xs_node_create("/", perms);
+ g_list_free_full(perms, g_free);
+ s->nr_nodes = 1;
+
+ s->root_tx = s->last_tx = 1;
+ return s;
+}
+
+
+static void clear_serialized_tx(gpointer key, gpointer value, gpointer opaque)
+{
+ XsNode *n = value;
+
+ n->serialized_tx = XBT_NULL;
+ if (n->children) {
+ g_hash_table_foreach(n->children, clear_serialized_tx, NULL);
+ }
+}
+
+static void clear_tx_serialized_tx(gpointer key, gpointer value,
+ gpointer opaque)
+{
+ XsTransaction *t = value;
+
+ clear_serialized_tx(NULL, t->root, NULL);
+}
+
+static void write_be32(GByteArray *save, uint32_t val)
+{
+ uint32_t be = htonl(val);
+ g_byte_array_append(save, (void *)&be, sizeof(be));
+}
+
+
+struct save_state {
+ GByteArray *bytes;
+ unsigned int tx_id;
+};
+
+#define MODIFIED_IN_TX (1U << 0)
+#define DELETED_IN_TX (1U << 1)
+#define NODE_REF (1U << 2)
+
+static void save_node(gpointer key, gpointer value, gpointer opaque)
+{
+ struct save_state *ss = opaque;
+ XsNode *n = value;
+ char *name = key;
+ uint8_t flag = 0;
+
+ /* Child nodes (i.e. anything but the root) have a name */
+ if (name) {
+ g_byte_array_append(ss->bytes, key, strlen(key) + 1);
+ }
+
+ /*
+ * If we already wrote this node, refer to the previous copy.
+ * There's no rename/move in XenStore, so all we need to find
+ * it is the tx_id of the transaction in which it exists, which
+ * may be the root tx.
+ */ + if (n->serialized_tx != XBT_NULL) { + flag = NODE_REF; + g_byte_array_append(ss->bytes, &flag, 1); + write_be32(ss->bytes, n->serialized_tx); + } else { + GList *l; + n->serialized_tx = ss->tx_id; + + if (n->modified_in_tx) { + flag |= MODIFIED_IN_TX; + } + if (n->deleted_in_tx) { + flag |= DELETED_IN_TX; + } + g_byte_array_append(ss->bytes, &flag, 1); + + if (n->content) { + write_be32(ss->bytes, n->content->len); + g_byte_array_append(ss->bytes, n->content->data, n->content->len); + } else { + write_be32(ss->bytes, 0); + } + + for (l = n->perms; l; l = l->next) { + g_byte_array_append(ss->bytes, l->data, strlen(l->data) + 1); + } + /* NUL termination after perms */ + g_byte_array_append(ss->bytes, (void *)"", 1); + + if (n->children) { + g_hash_table_foreach(n->children, save_node, ss); + } + /* NUL termination after children (child name is NUL) */ + g_byte_array_append(ss->bytes, (void *)"", 1); + } +} + +static void save_tree(struct save_state *ss, uint32_t tx_id, XsNode *root) +{ + write_be32(ss->bytes, tx_id); + ss->tx_id = tx_id; + save_node(NULL, root, ss); +} + +static void save_tx(gpointer key, gpointer value, gpointer opaque) +{ + uint32_t tx_id = GPOINTER_TO_INT(key); + struct save_state *ss = opaque; + XsTransaction *n = value; + + write_be32(ss->bytes, n->base_tx); + write_be32(ss->bytes, n->dom_id); + + save_tree(ss, tx_id, n->root); +} + +static void save_watch(gpointer key, gpointer value, gpointer opaque) +{ + struct save_state *ss = opaque; + XsWatch *w = value; + + /* We only save the *guest* watches. */ + if (w->dom_id) { + gpointer relpath = key + w->rel_prefix; + g_byte_array_append(ss->bytes, relpath, strlen(relpath) + 1); + g_byte_array_append(ss->bytes, (void *)w->token, strlen(w->token) + 1); + } +} + +GByteArray *xs_impl_serialize(XenstoreImplState *s) +{ + struct save_state ss; + + ss.bytes = g_byte_array_new(); + + /* + * node = flags [ real_node / node_ref ] + * flags = uint8_t (MODIFIED_IN_TX | DELETED_IN_TX | NODE_REF) + * node_ref = tx_id (in which the original version of this node exists) + * real_node = content perms child* NUL + * content = len data + * len = uint32_t + * data = uint8_t{len} + * perms = perm* NUL + * perm = asciiz + * child = name node + * name = asciiz + * + * tree = tx_id node + * tx_id = uint32_t + * + * transaction = base_tx_id dom_id tree + * base_tx_id = uint32_t + * dom_id = uint32_t + * + * tx_list = tree transaction* XBT_NULL + * + * watch = path token + * path = asciiz + * token = asciiz + * + * watch_list = watch* NUL + * + * xs_serialize_stream = last_tx tx_list watch_list + * last_tx = uint32_t + */ + + /* Clear serialized_tx in every node. 
*/ + if (s->serialized) { + clear_serialized_tx(NULL, s->root, NULL); + g_hash_table_foreach(s->transactions, clear_tx_serialized_tx, NULL); + } + + s->serialized = true; + + write_be32(ss.bytes, s->last_tx); + save_tree(&ss, s->root_tx, s->root); + g_hash_table_foreach(s->transactions, save_tx, &ss); + + write_be32(ss.bytes, XBT_NULL); + + g_hash_table_foreach(s->watches, save_watch, &ss); + g_byte_array_append(ss.bytes, (void *)"", 1); + + return ss.bytes; +} + +struct unsave_state { + char path[XENSTORE_ABS_PATH_MAX + 1]; + XenstoreImplState *s; + GByteArray *bytes; + uint8_t *d; + size_t l; + bool root_walk; +}; + +static int consume_be32(struct unsave_state *us, unsigned int *val) +{ + uint32_t d; + + if (us->l < sizeof(d)) { + return -EINVAL; + } + memcpy(&d, us->d, sizeof(d)); + *val = ntohl(d); + us->d += sizeof(d); + us->l -= sizeof(d); + return 0; +} + +static int consume_string(struct unsave_state *us, char **str, size_t *len) +{ + size_t l; + + if (!us->l) { + return -EINVAL; + } + + l = strnlen((void *)us->d, us->l); + if (l == us->l) { + return -EINVAL; + } + + if (str) { + *str = (void *)us->d; + } + if (len) { + *len = l; + } + + us->d += l + 1; + us->l -= l + 1; + return 0; +} + +static XsNode *lookup_node(XsNode *n, char *path) +{ + char *slash = strchr(path, '/'); + XsNode *child; + + if (path[0] == '\0') { + return n; + } + + if (slash) { + *slash = '\0'; + } + + if (!n->children) { + return NULL; + } + child = g_hash_table_lookup(n->children, path); + if (!slash) { + return child; + } + + *slash = '/'; + if (!child) { + return NULL; + } + return lookup_node(child, slash + 1); +} + +static XsNode *lookup_tx_node(struct unsave_state *us, unsigned int tx_id) +{ + XsTransaction *t; + if (tx_id == us->s->root_tx) { + return lookup_node(us->s->root, us->path + 1); + } + + t = g_hash_table_lookup(us->s->transactions, GINT_TO_POINTER(tx_id)); + if (!t) { + return NULL; + } + g_assert(t->root); + return lookup_node(t->root, us->path + 1); +} + +static void count_child_nodes(gpointer key, gpointer value, gpointer user_data) +{ + unsigned int *nr_nodes = user_data; + XsNode *n = value; + + (*nr_nodes)++; + + if (n->children) { + g_hash_table_foreach(n->children, count_child_nodes, nr_nodes); + } +} + +static int consume_node(struct unsave_state *us, XsNode **nodep, + unsigned int *nr_nodes) +{ + XsNode *n = NULL; + uint8_t flags; + int ret; + + if (us->l < 1) { + return -EINVAL; + } + flags = us->d[0]; + us->d++; + us->l--; + + if (flags == NODE_REF) { + unsigned int tx; + + ret = consume_be32(us, &tx); + if (ret) { + return ret; + } + + n = lookup_tx_node(us, tx); + if (!n) { + return -EINVAL; + } + n->ref++; + if (n->children) { + g_hash_table_foreach(n->children, count_child_nodes, nr_nodes); + } + } else { + uint32_t datalen; + + if (flags & ~(DELETED_IN_TX | MODIFIED_IN_TX)) { + return -EINVAL; + } + n = xs_node_new(); + + if (flags & DELETED_IN_TX) { + n->deleted_in_tx = true; + } + if (flags & MODIFIED_IN_TX) { + n->modified_in_tx = true; + } + ret = consume_be32(us, &datalen); + if (ret) { + xs_node_unref(n); + return -EINVAL; + } + if (datalen) { + if (datalen > us->l) { + xs_node_unref(n); + return -EINVAL; + } + + GByteArray *node_data = g_byte_array_new(); + g_byte_array_append(node_data, us->d, datalen); + us->d += datalen; + us->l -= datalen; + n->content = node_data; + + if (us->root_walk) { + n->modified_in_tx = true; + } + } + while (1) { + char *perm = NULL; + size_t permlen = 0; + + ret = consume_string(us, &perm, &permlen); + if (ret) { + xs_node_unref(n); + 
return ret; + } + + if (!permlen) { + break; + } + + n->perms = g_list_append(n->perms, g_strdup(perm)); + } + + /* Now children */ + while (1) { + size_t childlen; + char *childname; + char *pathend; + XsNode *child = NULL; + + ret = consume_string(us, &childname, &childlen); + if (ret) { + xs_node_unref(n); + return ret; + } + + if (!childlen) { + break; + } + + pathend = us->path + strlen(us->path); + strncat(us->path, "/", sizeof(us->path) - 1); + strncat(us->path, childname, sizeof(us->path) - 1); + + ret = consume_node(us, &child, nr_nodes); + *pathend = '\0'; + if (ret) { + xs_node_unref(n); + return ret; + } + g_assert(child); + xs_node_add_child(n, childname, child); + } + + /* + * If the node has no data and no children we still want to fire + * a watch on it. + */ + if (us->root_walk && !n->children) { + n->modified_in_tx = true; + } + } + + if (!n->deleted_in_tx) { + (*nr_nodes)++; + } + + *nodep = n; + return 0; +} + +static int consume_tree(struct unsave_state *us, XsTransaction *t) +{ + int ret; + + ret = consume_be32(us, &t->tx_id); + if (ret) { + return ret; + } + + if (t->tx_id > us->s->last_tx) { + return -EINVAL; + } + + us->path[0] = '\0'; + + return consume_node(us, &t->root, &t->nr_nodes); +} + +int xs_impl_deserialize(XenstoreImplState *s, GByteArray *bytes, + unsigned int dom_id, xs_impl_watch_fn watch_fn, + void *watch_opaque) +{ + struct unsave_state us; + XsTransaction base_t = { 0 }; + int ret; + + us.s = s; + us.bytes = bytes; + us.d = bytes->data; + us.l = bytes->len; + + xs_impl_reset_watches(s, dom_id); + g_hash_table_remove_all(s->transactions); + + xs_node_unref(s->root); + s->root = NULL; + s->root_tx = s->last_tx = XBT_NULL; + + ret = consume_be32(&us, &s->last_tx); + if (ret) { + return ret; + } + + /* + * Consume the base tree into a transaction so that watches can be + * fired as we commit it. By setting us.root_walk we cause the nodes + * to be marked as 'modified_in_tx' as they are created, so that the + * watches are triggered on them. + */ + base_t.dom_id = dom_id; + base_t.base_tx = XBT_NULL; + us.root_walk = true; + ret = consume_tree(&us, &base_t); + if (ret) { + return ret; + } + us.root_walk = false; + + /* + * Commit the transaction now while the refcount on all nodes is 1. + * Note that we haven't yet reinstated the *guest* watches but that's + * OK because we don't want the guest to see any changes. Even any + * backend nodes which get recreated should be *precisely* as they + * were before the migration. Back ends may have been instantiated + * already, and will see the frontend magically blink into existence + * now (well, from the aio_bh which fires the watches). It's their + * responsibility to rebuild everything precisely as it was before. 
+ */ + ret = transaction_commit(s, &base_t); + if (ret) { + return ret; + } + + while (1) { + unsigned int base_tx; + XsTransaction *t; + + ret = consume_be32(&us, &base_tx); + if (ret) { + return ret; + } + if (base_tx == XBT_NULL) { + break; + } + + t = g_new0(XsTransaction, 1); + t->base_tx = base_tx; + + ret = consume_be32(&us, &t->dom_id); + if (!ret) { + ret = consume_tree(&us, t); + } + if (ret) { + g_free(t); + return ret; + } + g_assert(t->root); + if (t->dom_id) { + s->nr_domu_transactions++; + } + g_hash_table_insert(s->transactions, GINT_TO_POINTER(t->tx_id), t); + } + + while (1) { + char *path, *token; + size_t pathlen, toklen; + + ret = consume_string(&us, &path, &pathlen); + if (ret) { + return ret; + } + if (!pathlen) { + break; + } + + ret = consume_string(&us, &token, &toklen); + if (ret) { + return ret; + } + + if (!watch_fn) { + continue; + } + + ret = do_xs_impl_watch(s, dom_id, path, token, watch_fn, watch_opaque); + if (ret) { + return ret; + } + } + + if (us.l) { + return -EINVAL; + } + + return 0; +} diff --git a/hw/i386/kvm/xenstore_impl.h b/hw/i386/kvm/xenstore_impl.h new file mode 100644 index 0000000000..0df2a91aae --- /dev/null +++ b/hw/i386/kvm/xenstore_impl.h @@ -0,0 +1,63 @@ +/* + * QEMU Xen emulation: The actual implementation of XenStore + * + * Copyright © 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Authors: David Woodhouse <dwmw2@infradead.org> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#ifndef QEMU_XENSTORE_IMPL_H +#define QEMU_XENSTORE_IMPL_H + +#include "hw/xen/xen_backend_ops.h" + +typedef struct XenstoreImplState XenstoreImplState; + +XenstoreImplState *xs_impl_create(unsigned int dom_id); + +char *xs_perm_as_string(unsigned int perm, unsigned int domid); + +/* + * These functions return *positive* error numbers. This is a little + * unconventional but it helps to keep us honest because there is + * also a very limited set of error numbers that they are permitted + * to return (those in xsd_errors). 
+ */ + +int xs_impl_read(XenstoreImplState *s, unsigned int dom_id, + xs_transaction_t tx_id, const char *path, GByteArray *data); +int xs_impl_write(XenstoreImplState *s, unsigned int dom_id, + xs_transaction_t tx_id, const char *path, GByteArray *data); +int xs_impl_directory(XenstoreImplState *s, unsigned int dom_id, + xs_transaction_t tx_id, const char *path, + uint64_t *gencnt, GList **items); +int xs_impl_transaction_start(XenstoreImplState *s, unsigned int dom_id, + xs_transaction_t *tx_id); +int xs_impl_transaction_end(XenstoreImplState *s, unsigned int dom_id, + xs_transaction_t tx_id, bool commit); +int xs_impl_rm(XenstoreImplState *s, unsigned int dom_id, + xs_transaction_t tx_id, const char *path); +int xs_impl_get_perms(XenstoreImplState *s, unsigned int dom_id, + xs_transaction_t tx_id, const char *path, GList **perms); +int xs_impl_set_perms(XenstoreImplState *s, unsigned int dom_id, + xs_transaction_t tx_id, const char *path, GList *perms); + +/* This differs from xs_watch_fn because it has the token */ +typedef void(xs_impl_watch_fn)(void *opaque, const char *path, + const char *token); +int xs_impl_watch(XenstoreImplState *s, unsigned int dom_id, const char *path, + const char *token, xs_impl_watch_fn fn, void *opaque); +int xs_impl_unwatch(XenstoreImplState *s, unsigned int dom_id, + const char *path, const char *token, xs_impl_watch_fn fn, + void *opaque); +int xs_impl_reset_watches(XenstoreImplState *s, unsigned int dom_id); + +GByteArray *xs_impl_serialize(XenstoreImplState *s); +int xs_impl_deserialize(XenstoreImplState *s, GByteArray *bytes, + unsigned int dom_id, xs_impl_watch_fn watch_fn, + void *watch_opaque); + +#endif /* QEMU_XENSTORE_IMPL_H */ |