Diffstat (limited to 'hw')
-rw-r--r--  hw/9pfs/meson.build | 2
-rw-r--r--  hw/9pfs/xen-9p-backend.c | 32
-rw-r--r--  hw/acpi/acpi-pci-hotplug-stub.c | 9
-rw-r--r--  hw/acpi/ich9.c | 21
-rw-r--r--  hw/acpi/pci-bridge.c | 14
-rw-r--r--  hw/acpi/pcihp.c | 112
-rw-r--r--  hw/acpi/piix4.c | 33
-rw-r--r--  hw/arm/allwinner-h3.c | 29
-rw-r--r--  hw/arm/aspeed.c | 9
-rw-r--r--  hw/arm/aspeed_eeprom.c | 10
-rw-r--r--  hw/arm/boot.c | 6
-rw-r--r--  hw/audio/trace-events | 6
-rw-r--r--  hw/audio/via-ac97.c | 455
-rw-r--r--  hw/block/block.c | 3
-rw-r--r--  hw/block/dataplane/meson.build | 2
-rw-r--r--  hw/block/dataplane/xen-block.c | 12
-rw-r--r--  hw/block/m25p80.c | 4
-rw-r--r--  hw/block/meson.build | 2
-rw-r--r--  hw/block/xen-block.c | 12
-rw-r--r--  hw/char/meson.build | 2
-rw-r--r--  hw/char/xen_console.c | 57
-rw-r--r--  hw/core/loader.c | 91
-rw-r--r--  hw/core/machine.c | 1
-rw-r--r--  hw/cxl/cxl-component-utils.c | 20
-rw-r--r--  hw/cxl/cxl-host.c | 31
-rw-r--r--  hw/display/meson.build | 2
-rw-r--r--  hw/display/sm501.c | 18
-rw-r--r--  hw/display/xenfb.c | 32
-rw-r--r--  hw/i2c/allwinner-i2c.c | 26
-rw-r--r--  hw/i386/acpi-build.c | 179
-rw-r--r--  hw/i386/kvm/meson.build | 1
-rw-r--r--  hw/i386/kvm/trace-events | 15
-rw-r--r--  hw/i386/kvm/xen_evtchn.c | 15
-rw-r--r--  hw/i386/kvm/xen_gnttab.c | 325
-rw-r--r--  hw/i386/kvm/xen_gnttab.h | 1
-rw-r--r--  hw/i386/kvm/xen_xenstore.c | 1251
-rw-r--r--  hw/i386/kvm/xenstore_impl.c | 1927
-rw-r--r--  hw/i386/kvm/xenstore_impl.h | 63
-rw-r--r--  hw/i386/pc.c | 7
-rw-r--r--  hw/i386/pc_piix.c | 5
-rw-r--r--  hw/i386/xen/xen-hvm.c | 38
-rw-r--r--  hw/i386/xen/xen-mapcache.c | 2
-rw-r--r--  hw/i386/xen/xen_platform.c | 7
-rw-r--r--  hw/intc/i8259.c | 10
-rw-r--r--  hw/intc/i8259_common.c | 24
-rw-r--r--  hw/intc/mips_gic.c | 4
-rw-r--r--  hw/intc/riscv_aclint.c | 16
-rw-r--r--  hw/intc/riscv_aplic.c | 4
-rw-r--r--  hw/intc/riscv_imsic.c | 6
-rw-r--r--  hw/isa/i82378.c | 10
-rw-r--r--  hw/isa/lpc_ich9.c | 1
-rw-r--r--  hw/isa/trace-events | 1
-rw-r--r--  hw/isa/vt82c686.c | 54
-rw-r--r--  hw/mem/cxl_type3.c | 294
-rw-r--r--  hw/mem/cxl_type3_stubs.c | 17
-rw-r--r--  hw/mem/meson.build | 2
-rw-r--r--  hw/mips/boston.c | 2
-rw-r--r--  hw/mips/cps.c | 35
-rw-r--r--  hw/mips/malta.c | 2
-rw-r--r--  hw/misc/edu.c | 5
-rw-r--r--  hw/misc/mips_cmgcr.c | 2
-rw-r--r--  hw/misc/mips_itu.c | 30
-rw-r--r--  hw/net/Kconfig | 5
-rw-r--r--  hw/net/e1000.c | 259
-rw-r--r--  hw/net/e1000_common.h | 102
-rw-r--r--  hw/net/e1000_regs.h | 958
-rw-r--r--  hw/net/e1000e.c | 102
-rw-r--r--  hw/net/e1000e_core.c | 719
-rw-r--r--  hw/net/e1000e_core.h | 70
-rw-r--r--  hw/net/e1000x_common.c | 38
-rw-r--r--  hw/net/e1000x_common.h | 133
-rw-r--r--  hw/net/e1000x_regs.h | 967
-rw-r--r--  hw/net/fsl_etsec/etsec.c | 11
-rw-r--r--  hw/net/fsl_etsec/etsec.h | 17
-rw-r--r--  hw/net/fsl_etsec/miim.c | 5
-rw-r--r--  hw/net/igb.c | 623
-rw-r--r--  hw/net/igb_common.h | 146
-rw-r--r--  hw/net/igb_core.c | 4077
-rw-r--r--  hw/net/igb_core.h | 146
-rw-r--r--  hw/net/igb_regs.h | 648
-rw-r--r--  hw/net/igbvf.c | 327
-rw-r--r--  hw/net/meson.build | 2
-rw-r--r--  hw/net/net_rx_pkt.c | 102
-rw-r--r--  hw/net/net_rx_pkt.h | 31
-rw-r--r--  hw/net/net_tx_pkt.c | 332
-rw-r--r--  hw/net/net_tx_pkt.h | 27
-rw-r--r--  hw/net/trace-events | 50
-rw-r--r--  hw/net/virtio-net.c | 85
-rw-r--r--  hw/net/vmxnet3.c | 58
-rw-r--r--  hw/net/xen_nic.c | 25
-rw-r--r--  hw/nvme/ctrl.c | 802
-rw-r--r--  hw/nvme/ns.c | 147
-rw-r--r--  hw/nvme/nvme.h | 92
-rw-r--r--  hw/nvme/subsys.c | 94
-rw-r--r--  hw/nvme/trace-events | 1
-rw-r--r--  hw/pci-bridge/cxl_root_port.c | 64
-rw-r--r--  hw/pci-bridge/pci_expander_bridge.c | 44
-rw-r--r--  hw/pci-host/mv64361.c | 4
-rw-r--r--  hw/pci/pci-internal.h | 1
-rw-r--r--  hw/pci/pci.c | 57
-rw-r--r--  hw/pci/pcie_aer.c | 14
-rw-r--r--  hw/pci/pcie_port.c | 46
-rw-r--r--  hw/pci/pcie_sriov.c | 5
-rw-r--r--  hw/ppc/pegasos2.c | 26
-rw-r--r--  hw/ppc/spapr_hcall.c | 1
-rw-r--r--  hw/riscv/Kconfig | 1
-rw-r--r--  hw/riscv/meson.build | 1
-rw-r--r--  hw/riscv/virt-acpi-build.c | 416
-rw-r--r--  hw/riscv/virt.c | 70
-rw-r--r--  hw/usb/hcd-ohci.c | 23
-rw-r--r--  hw/usb/meson.build | 2
-rw-r--r--  hw/usb/vt82c686-uhci-pci.c | 12
-rw-r--r--  hw/usb/xen-usb.c | 29
-rw-r--r--  hw/vfio/common.c | 699
-rw-r--r--  hw/vfio/migration.c | 28
-rw-r--r--  hw/vfio/pci.c | 5
-rw-r--r--  hw/vfio/trace-events | 7
-rw-r--r--  hw/virtio/trace-events | 1
-rw-r--r--  hw/virtio/vhost-shadow-virtqueue.c | 8
-rw-r--r--  hw/virtio/vhost-user.c | 4
-rw-r--r--  hw/virtio/vhost-vdpa.c | 126
-rw-r--r--  hw/virtio/vhost.c | 3
-rw-r--r--  hw/virtio/virtio-crypto.c | 48
-rw-r--r--  hw/virtio/virtio.c | 11
-rw-r--r--  hw/xen/meson.build | 6
-rw-r--r--  hw/xen/trace-events | 2
-rw-r--r--  hw/xen/xen-bus-helper.c | 62
-rw-r--r--  hw/xen/xen-bus.c | 411
-rw-r--r--  hw/xen/xen-legacy-backend.c | 254
-rw-r--r--  hw/xen/xen-operations.c | 478
-rw-r--r--  hw/xen/xen_devconfig.c | 4
-rw-r--r--  hw/xen/xen_pt.c | 66
-rw-r--r--  hw/xen/xen_pt.h | 22
-rw-r--r--  hw/xen/xen_pt_config_init.c | 4
-rw-r--r--  hw/xen/xen_pt_graphics.c | 1
-rw-r--r--  hw/xen/xen_pt_msi.c | 4
-rw-r--r--  hw/xen/xen_pt_stub.c | 4
-rw-r--r--  hw/xen/xen_pvdev.c | 63
138 files changed, 16613 insertions(+), 3133 deletions(-)
diff --git a/hw/9pfs/meson.build b/hw/9pfs/meson.build
index 12443b6ad5..fd37b7a02d 100644
--- a/hw/9pfs/meson.build
+++ b/hw/9pfs/meson.build
@@ -15,7 +15,7 @@ fs_ss.add(files(
 ))
 fs_ss.add(when: 'CONFIG_LINUX', if_true: files('9p-util-linux.c'))
 fs_ss.add(when: 'CONFIG_DARWIN', if_true: files('9p-util-darwin.c'))
-fs_ss.add(when: 'CONFIG_XEN', if_true: files('xen-9p-backend.c'))
+fs_ss.add(when: 'CONFIG_XEN_BUS', if_true: files('xen-9p-backend.c'))
 softmmu_ss.add_all(when: 'CONFIG_FSDEV_9P', if_true: fs_ss)
 
 specific_ss.add(when: 'CONFIG_VIRTIO_9P', if_true: files('virtio-9p-device.c'))
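Note: this hunk is the first of several (the dataplane, block, char, display and usb meson.build hunks below follow the same pattern) that move Xen PV backends from CONFIG_XEN to CONFIG_XEN_BUS. Judging by the new hw/i386/kvm/xen_* files in the diffstat and the qemu_xen_* wrappers introduced in hw/xen/xen-operations.c, the intent appears to be that these backends can also be built for Xen guests emulated on KVM, not only for binaries linked against the real Xen libraries.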
diff --git a/hw/9pfs/xen-9p-backend.c b/hw/9pfs/xen-9p-backend.c
index 65c4979c3c..74f3a05f88 100644
--- a/hw/9pfs/xen-9p-backend.c
+++ b/hw/9pfs/xen-9p-backend.c
@@ -22,6 +22,7 @@
 #include "qemu/config-file.h"
 #include "qemu/main-loop.h"
 #include "qemu/option.h"
+#include "qemu/iov.h"
 #include "fsdev/qemu-fsdev.h"
 
 #define VERSIONS "1"
@@ -241,7 +242,7 @@ static void xen_9pfs_push_and_notify(V9fsPDU *pdu)
     xen_wmb();
 
     ring->inprogress = false;
-    xenevtchn_notify(ring->evtchndev, ring->local_port);
+    qemu_xen_evtchn_notify(ring->evtchndev, ring->local_port);
 
     qemu_bh_schedule(ring->bh);
 }
@@ -324,8 +325,8 @@ static void xen_9pfs_evtchn_event(void *opaque)
     Xen9pfsRing *ring = opaque;
     evtchn_port_t port;
 
-    port = xenevtchn_pending(ring->evtchndev);
-    xenevtchn_unmask(ring->evtchndev, port);
+    port = qemu_xen_evtchn_pending(ring->evtchndev);
+    qemu_xen_evtchn_unmask(ring->evtchndev, port);
 
     qemu_bh_schedule(ring->bh);
 }
@@ -337,10 +338,10 @@ static void xen_9pfs_disconnect(struct XenLegacyDevice *xendev)
 
     for (i = 0; i < xen_9pdev->num_rings; i++) {
         if (xen_9pdev->rings[i].evtchndev != NULL) {
-            qemu_set_fd_handler(xenevtchn_fd(xen_9pdev->rings[i].evtchndev),
-                    NULL, NULL, NULL);
-            xenevtchn_unbind(xen_9pdev->rings[i].evtchndev,
-                             xen_9pdev->rings[i].local_port);
+            qemu_set_fd_handler(qemu_xen_evtchn_fd(xen_9pdev->rings[i].evtchndev),
+                                NULL, NULL, NULL);
+            qemu_xen_evtchn_unbind(xen_9pdev->rings[i].evtchndev,
+                                   xen_9pdev->rings[i].local_port);
             xen_9pdev->rings[i].evtchndev = NULL;
         }
     }
@@ -359,12 +360,13 @@ static int xen_9pfs_free(struct XenLegacyDevice *xendev)
         if (xen_9pdev->rings[i].data != NULL) {
             xen_be_unmap_grant_refs(&xen_9pdev->xendev,
                                     xen_9pdev->rings[i].data,
+                                    xen_9pdev->rings[i].intf->ref,
                                     (1 << xen_9pdev->rings[i].ring_order));
         }
         if (xen_9pdev->rings[i].intf != NULL) {
-            xen_be_unmap_grant_refs(&xen_9pdev->xendev,
-                                    xen_9pdev->rings[i].intf,
-                                    1);
+            xen_be_unmap_grant_ref(&xen_9pdev->xendev,
+                                   xen_9pdev->rings[i].intf,
+                                   xen_9pdev->rings[i].ref);
         }
         if (xen_9pdev->rings[i].bh != NULL) {
             qemu_bh_delete(xen_9pdev->rings[i].bh);
@@ -447,12 +449,12 @@ static int xen_9pfs_connect(struct XenLegacyDevice *xendev)
         xen_9pdev->rings[i].inprogress = false;
 
 
-        xen_9pdev->rings[i].evtchndev = xenevtchn_open(NULL, 0);
+        xen_9pdev->rings[i].evtchndev = qemu_xen_evtchn_open();
         if (xen_9pdev->rings[i].evtchndev == NULL) {
             goto out;
         }
-        qemu_set_cloexec(xenevtchn_fd(xen_9pdev->rings[i].evtchndev));
-        xen_9pdev->rings[i].local_port = xenevtchn_bind_interdomain
+        qemu_set_cloexec(qemu_xen_evtchn_fd(xen_9pdev->rings[i].evtchndev));
+        xen_9pdev->rings[i].local_port = qemu_xen_evtchn_bind_interdomain
                                             (xen_9pdev->rings[i].evtchndev,
                                              xendev->dom,
                                              xen_9pdev->rings[i].evtchn);
@@ -463,8 +465,8 @@ static int xen_9pfs_connect(struct XenLegacyDevice *xendev)
             goto out;
         }
         xen_pv_printf(xendev, 2, "bind evtchn port %d\n", xendev->local_port);
-        qemu_set_fd_handler(xenevtchn_fd(xen_9pdev->rings[i].evtchndev),
-                xen_9pfs_evtchn_event, NULL, &xen_9pdev->rings[i]);
+        qemu_set_fd_handler(qemu_xen_evtchn_fd(xen_9pdev->rings[i].evtchndev),
+                            xen_9pfs_evtchn_event, NULL, &xen_9pdev->rings[i]);
     }
 
     xen_9pdev->security_model = xenstore_read_be_str(xendev, "security_model");
diff --git a/hw/acpi/acpi-pci-hotplug-stub.c b/hw/acpi/acpi-pci-hotplug-stub.c
index a43f6dafc9..dcee3ad7a1 100644
--- a/hw/acpi/acpi-pci-hotplug-stub.c
+++ b/hw/acpi/acpi-pci-hotplug-stub.c
@@ -5,8 +5,7 @@
 const VMStateDescription vmstate_acpi_pcihp_pci_status;
 
 void acpi_pcihp_init(Object *owner, AcpiPciHpState *s, PCIBus *root_bus,
-                     MemoryRegion *address_space_io, bool bridges_enabled,
-                     uint16_t io_base)
+                     MemoryRegion *address_space_io, uint16_t io_base)
 {
     return;
 }
@@ -36,8 +35,12 @@ void acpi_pcihp_device_unplug_request_cb(HotplugHandler *hotplug_dev,
     return;
 }
 
-void acpi_pcihp_reset(AcpiPciHpState *s, bool acpihp_root_off)
+void acpi_pcihp_reset(AcpiPciHpState *s)
 {
     return;
 }
 
+bool acpi_pcihp_is_hotpluggbale_bus(AcpiPciHpState *s, BusState *bus)
+{
+    return true;
+}
diff --git a/hw/acpi/ich9.c b/hw/acpi/ich9.c
index d23bfcaa6b..25e2c7243e 100644
--- a/hw/acpi/ich9.c
+++ b/hw/acpi/ich9.c
@@ -218,7 +218,7 @@ static bool vmstate_test_use_pcihp(void *opaque)
 {
     ICH9LPCPMRegs *s = opaque;
 
-    return s->use_acpi_hotplug_bridge;
+    return s->acpi_pci_hotplug.use_acpi_hotplug_bridge;
 }
 
 static const VMStateDescription vmstate_pcihp_state = {
@@ -277,8 +277,8 @@ static void pm_reset(void *opaque)
     }
     pm->smi_en_wmask = ~0;
 
-    if (pm->use_acpi_hotplug_bridge) {
-        acpi_pcihp_reset(&pm->acpi_pci_hotplug, true);
+    if (pm->acpi_pci_hotplug.use_acpi_hotplug_bridge) {
+        acpi_pcihp_reset(&pm->acpi_pci_hotplug);
     }
 
     acpi_update_sci(&pm->acpi_regs, pm->irq);
@@ -316,12 +316,11 @@ void ich9_pm_init(PCIDevice *lpc_pci, ICH9LPCPMRegs *pm, qemu_irq sci_irq)
         acpi_pm_tco_init(&pm->tco_regs, &pm->io);
     }
 
-    if (pm->use_acpi_hotplug_bridge) {
+    if (pm->acpi_pci_hotplug.use_acpi_hotplug_bridge) {
         acpi_pcihp_init(OBJECT(lpc_pci),
                         &pm->acpi_pci_hotplug,
                         pci_get_bus(lpc_pci),
                         pci_address_space_io(lpc_pci),
-                        true,
                         ACPI_PCIHP_ADDR_ICH9);
 
         qbus_set_hotplug_handler(BUS(pci_get_bus(lpc_pci)),
@@ -403,14 +402,14 @@ static bool ich9_pm_get_acpi_pci_hotplug(Object *obj, Error **errp)
 {
     ICH9LPCState *s = ICH9_LPC_DEVICE(obj);
 
-    return s->pm.use_acpi_hotplug_bridge;
+    return s->pm.acpi_pci_hotplug.use_acpi_hotplug_bridge;
 }
 
 static void ich9_pm_set_acpi_pci_hotplug(Object *obj, bool value, Error **errp)
 {
     ICH9LPCState *s = ICH9_LPC_DEVICE(obj);
 
-    s->pm.use_acpi_hotplug_bridge = value;
+    s->pm.acpi_pci_hotplug.use_acpi_hotplug_bridge = value;
 }
 
 static bool ich9_pm_get_keep_pci_slot_hpc(Object *obj, Error **errp)
@@ -435,7 +434,7 @@ void ich9_pm_add_properties(Object *obj, ICH9LPCPMRegs *pm)
     pm->disable_s3 = 0;
     pm->disable_s4 = 0;
     pm->s4_val = 2;
-    pm->use_acpi_hotplug_bridge = true;
+    pm->acpi_pci_hotplug.use_acpi_hotplug_bridge = true;
     pm->keep_pci_slot_hpc = true;
     pm->enable_tco = true;
 
@@ -579,6 +578,12 @@ void ich9_pm_device_unplug_cb(HotplugHandler *hotplug_dev, DeviceState *dev,
     }
 }
 
+bool ich9_pm_is_hotpluggable_bus(HotplugHandler *hotplug_dev, BusState *bus)
+{
+    ICH9LPCState *lpc = ICH9_LPC_DEVICE(hotplug_dev);
+    return acpi_pcihp_is_hotpluggbale_bus(&lpc->pm.acpi_pci_hotplug, bus);
+}
+
 void ich9_pm_ospm_status(AcpiDeviceIf *adev, ACPIOSTInfoList ***list)
 {
     ICH9LPCState *s = ICH9_LPC_DEVICE(adev);
diff --git a/hw/acpi/pci-bridge.c b/hw/acpi/pci-bridge.c
index 5f3ee5157f..7baa7034a1 100644
--- a/hw/acpi/pci-bridge.c
+++ b/hw/acpi/pci-bridge.c
@@ -21,7 +21,17 @@ void build_pci_bridge_aml(AcpiDevAmlIf *adev, Aml *scope)
 {
     PCIBridge *br = PCI_BRIDGE(adev);
 
-    if (object_property_find(OBJECT(&br->sec_bus), ACPI_PCIHP_PROP_BSEL)) {
-        build_append_pci_bus_devices(scope, pci_bridge_get_sec_bus(br));
+    if (!DEVICE(br)->hotplugged) {
+        PCIBus *sec_bus = pci_bridge_get_sec_bus(br);
+
+        build_append_pci_bus_devices(scope, sec_bus);
+
+        /*
+         * generate hotplug slots descriptors if
+         * bridge has ACPI PCI hotplug attached,
+         */
+        if (object_property_find(OBJECT(sec_bus), ACPI_PCIHP_PROP_BSEL)) {
+            build_append_pcihp_slots(scope, sec_bus);
+        }
     }
 }
diff --git a/hw/acpi/pcihp.c b/hw/acpi/pcihp.c
index 5dc7377411..dcfb779a7a 100644
--- a/hw/acpi/pcihp.c
+++ b/hw/acpi/pcihp.c
@@ -54,21 +54,6 @@ typedef struct AcpiPciHpFind {
     PCIBus *bus;
 } AcpiPciHpFind;
 
-static gint g_cmp_uint32(gconstpointer a, gconstpointer b, gpointer user_data)
-{
-    return a - b;
-}
-
-static GSequence *pci_acpi_index_list(void)
-{
-    static GSequence *used_acpi_index_list;
-
-    if (!used_acpi_index_list) {
-        used_acpi_index_list = g_sequence_new(NULL);
-    }
-    return used_acpi_index_list;
-}
-
 static int acpi_pcihp_get_bsel(PCIBus *bus)
 {
     Error *local_err = NULL;
@@ -136,20 +121,6 @@ static void acpi_set_pci_info(bool has_bridge_hotplug)
     }
 }
 
-static void acpi_pcihp_disable_root_bus(void)
-{
-    Object *host = acpi_get_i386_pci_host();
-    PCIBus *bus;
-
-    bus = PCI_HOST_BRIDGE(host)->bus;
-    if (bus && qbus_is_hotpluggable(BUS(bus))) {
-        /* setting the hotplug handler to NULL makes the bus non-hotpluggable */
-        qbus_set_hotplug_handler(BUS(bus), NULL);
-    }
-
-    return;
-}
-
 static void acpi_pcihp_test_hotplug_bus(PCIBus *bus, void *opaque)
 {
     AcpiPciHpFind *find = opaque;
@@ -291,17 +262,12 @@ static void acpi_pcihp_update(AcpiPciHpState *s)
     }
 }
 
-void acpi_pcihp_reset(AcpiPciHpState *s, bool acpihp_root_off)
+void acpi_pcihp_reset(AcpiPciHpState *s)
 {
-    if (acpihp_root_off) {
-        acpi_pcihp_disable_root_bus();
-    }
-    acpi_set_pci_info(!s->legacy_piix);
+    acpi_set_pci_info(s->use_acpi_hotplug_bridge);
     acpi_pcihp_update(s);
 }
 
-#define ONBOARD_INDEX_MAX (16 * 1024 - 1)
-
 void acpi_pcihp_device_pre_plug_cb(HotplugHandler *hotplug_dev,
                                    DeviceState *dev, Error **errp)
 {
@@ -314,34 +280,6 @@ void acpi_pcihp_device_pre_plug_cb(HotplugHandler *hotplug_dev,
                    ACPI_PCIHP_PROP_BSEL "' set");
         return;
     }
-
-    /*
-     * capped by systemd (see: udev-builtin-net_id.c)
-     * as it's the only known user honor it to avoid users
-     * misconfigure QEMU and then wonder why acpi-index doesn't work
-     */
-    if (pdev->acpi_index > ONBOARD_INDEX_MAX) {
-        error_setg(errp, "acpi-index should be less or equal to %u",
-                   ONBOARD_INDEX_MAX);
-        return;
-    }
-
-    /*
-     * make sure that acpi-index is unique across all present PCI devices
-     */
-    if (pdev->acpi_index) {
-        GSequence *used_indexes = pci_acpi_index_list();
-
-        if (g_sequence_lookup(used_indexes, GINT_TO_POINTER(pdev->acpi_index),
-                              g_cmp_uint32, NULL)) {
-            error_setg(errp, "a PCI device with acpi-index = %" PRIu32
-                       " already exist", pdev->acpi_index);
-            return;
-        }
-        g_sequence_insert_sorted(used_indexes,
-                                 GINT_TO_POINTER(pdev->acpi_index),
-                                 g_cmp_uint32, NULL);
-    }
 }
 
 void acpi_pcihp_device_plug_cb(HotplugHandler *hotplug_dev, AcpiPciHpState *s,
@@ -361,17 +299,10 @@ void acpi_pcihp_device_plug_cb(HotplugHandler *hotplug_dev, AcpiPciHpState *s,
          * Overwrite the default hotplug handler with the ACPI PCI one
          * for cold plugged bridges only.
          */
-        if (!s->legacy_piix &&
+        if (s->use_acpi_hotplug_bridge &&
             object_dynamic_cast(OBJECT(dev), TYPE_PCI_BRIDGE)) {
             PCIBus *sec = pci_bridge_get_sec_bus(PCI_BRIDGE(pdev));
 
-            /* Remove all hot-plug handlers if hot-plug is disabled on slot */
-            if (object_dynamic_cast(OBJECT(dev), TYPE_PCIE_SLOT) &&
-                !PCIE_SLOT(pdev)->hotplug) {
-                qbus_set_hotplug_handler(BUS(sec), NULL);
-                return;
-            }
-
             qbus_set_hotplug_handler(BUS(sec), OBJECT(hotplug_dev));
             /* We don't have to overwrite any other hotplug handler yet */
             assert(QLIST_EMPTY(&sec->child));
@@ -401,17 +332,6 @@ void acpi_pcihp_device_unplug_cb(HotplugHandler *hotplug_dev, AcpiPciHpState *s,
     trace_acpi_pci_unplug(PCI_SLOT(pdev->devfn),
                           acpi_pcihp_get_bsel(pci_get_bus(pdev)));
 
-    /*
-     * clean up acpi-index so it could reused by another device
-     */
-    if (pdev->acpi_index) {
-        GSequence *used_indexes = pci_acpi_index_list();
-
-        g_sequence_remove(g_sequence_lookup(used_indexes,
-                          GINT_TO_POINTER(pdev->acpi_index),
-                          g_cmp_uint32, NULL));
-    }
-
     qdev_unrealize(dev);
 }
 
@@ -441,6 +361,24 @@ void acpi_pcihp_device_unplug_request_cb(HotplugHandler *hotplug_dev,
     acpi_send_event(DEVICE(hotplug_dev), ACPI_PCI_HOTPLUG_STATUS);
 }
 
+bool acpi_pcihp_is_hotpluggbale_bus(AcpiPciHpState *s, BusState *bus)
+{
+    Object *o = OBJECT(bus->parent);
+
+    if (s->use_acpi_hotplug_bridge &&
+        object_dynamic_cast(o, TYPE_PCI_BRIDGE)) {
+        if (object_dynamic_cast(o, TYPE_PCIE_SLOT) && !PCIE_SLOT(o)->hotplug) {
+            return false;
+        }
+        return true;
+    }
+
+    if (s->use_acpi_root_pci_hotplug) {
+        return true;
+    }
+    return false;
+}
+
 static uint64_t pci_read(void *opaque, hwaddr addr, unsigned int size)
 {
     AcpiPciHpState *s = opaque;
@@ -454,7 +392,7 @@ static uint64_t pci_read(void *opaque, hwaddr addr, unsigned int size)
     switch (addr) {
     case PCI_UP_BASE:
         val = s->acpi_pcihp_pci_status[bsel].up;
-        if (!s->legacy_piix) {
+        if (s->use_acpi_hotplug_bridge) {
             s->acpi_pcihp_pci_status[bsel].up = 0;
         }
         trace_acpi_pci_up_read(val);
@@ -529,7 +467,8 @@ static void pci_write(void *opaque, hwaddr addr, uint64_t data,
         trace_acpi_pci_ej_write(addr, data);
         break;
     case PCI_SEL_BASE:
-        s->hotplug_select = s->legacy_piix ? ACPI_PCIHP_BSEL_DEFAULT : data;
+        s->hotplug_select = s->use_acpi_hotplug_bridge ? data :
+            ACPI_PCIHP_BSEL_DEFAULT;
         trace_acpi_pci_sel_write(addr, data);
     default:
         break;
@@ -547,14 +486,13 @@ static const MemoryRegionOps acpi_pcihp_io_ops = {
 };
 
 void acpi_pcihp_init(Object *owner, AcpiPciHpState *s, PCIBus *root_bus,
-                     MemoryRegion *address_space_io, bool bridges_enabled,
+                     MemoryRegion *address_space_io,
                      uint16_t io_base)
 {
     s->io_len = ACPI_PCIHP_SIZE;
     s->io_base = io_base;
 
     s->root = root_bus;
-    s->legacy_piix = !bridges_enabled;
 
     memory_region_init_io(&s->io, owner, &acpi_pcihp_io_ops, s,
                           "acpi-pci-hotplug", s->io_len);
diff --git a/hw/acpi/piix4.c b/hw/acpi/piix4.c
index eac2125abd..63d2113b86 100644
--- a/hw/acpi/piix4.c
+++ b/hw/acpi/piix4.c
@@ -170,14 +170,14 @@ static const VMStateDescription vmstate_pci_status = {
 static bool vmstate_test_use_acpi_hotplug_bridge(void *opaque, int version_id)
 {
     PIIX4PMState *s = opaque;
-    return s->use_acpi_hotplug_bridge;
+    return s->acpi_pci_hotplug.use_acpi_hotplug_bridge;
 }
 
 static bool vmstate_test_no_use_acpi_hotplug_bridge(void *opaque,
                                                     int version_id)
 {
     PIIX4PMState *s = opaque;
-    return !s->use_acpi_hotplug_bridge;
+    return !s->acpi_pci_hotplug.use_acpi_hotplug_bridge;
 }
 
 static bool vmstate_test_use_memhp(void *opaque)
@@ -234,7 +234,8 @@ static bool piix4_vmstate_need_smbus(void *opaque, int version_id)
 static bool vmstate_test_migrate_acpi_index(void *opaque, int version_id)
 {
     PIIX4PMState *s = PIIX4_PM(opaque);
-    return s->use_acpi_hotplug_bridge && !s->not_migrate_acpi_index;
+    return s->acpi_pci_hotplug.use_acpi_hotplug_bridge &&
+           !s->not_migrate_acpi_index;
 }
 
 /* qemu-kvm 1.2 uses version 3 but advertised as 2
@@ -303,8 +304,9 @@ static void piix4_pm_reset(DeviceState *dev)
     acpi_update_sci(&s->ar, s->irq);
 
     pm_io_space_update(s);
-    if (s->use_acpi_hotplug_bridge || s->use_acpi_root_pci_hotplug) {
-        acpi_pcihp_reset(&s->acpi_pci_hotplug, !s->use_acpi_root_pci_hotplug);
+    if (s->acpi_pci_hotplug.use_acpi_hotplug_bridge ||
+        s->acpi_pci_hotplug.use_acpi_root_pci_hotplug) {
+        acpi_pcihp_reset(&s->acpi_pci_hotplug);
     }
 }
 
@@ -402,6 +404,13 @@ static void piix4_device_unplug_cb(HotplugHandler *hotplug_dev,
     }
 }
 
+static bool piix4_is_hotpluggable_bus(HotplugHandler *hotplug_dev,
+                                      BusState *bus)
+{
+    PIIX4PMState *s = PIIX4_PM(hotplug_dev);
+    return acpi_pcihp_is_hotpluggbale_bus(&s->acpi_pci_hotplug, bus);
+}
+
 static void piix4_pm_machine_ready(Notifier *n, void *opaque)
 {
     PIIX4PMState *s = container_of(n, PIIX4PMState, machine_ready);
@@ -487,12 +496,11 @@ static void piix4_pm_realize(PCIDevice *dev, Error **errp)
     qemu_add_machine_init_done_notifier(&s->machine_ready);
 
     if (xen_enabled()) {
-        s->use_acpi_hotplug_bridge = false;
+        s->acpi_pci_hotplug.use_acpi_hotplug_bridge = false;
     }
 
     piix4_acpi_system_hot_add_init(pci_address_space_io(dev),
                                    pci_get_bus(dev), s);
-    qbus_set_hotplug_handler(BUS(pci_get_bus(dev)), OBJECT(s));
 
     piix4_pm_add_properties(s);
 }
@@ -561,9 +569,11 @@ static void piix4_acpi_system_hot_add_init(MemoryRegion *parent,
                           "acpi-gpe0", GPE_LEN);
     memory_region_add_subregion(parent, GPE_BASE, &s->io_gpe);
 
-    if (s->use_acpi_hotplug_bridge || s->use_acpi_root_pci_hotplug) {
+    if (s->acpi_pci_hotplug.use_acpi_hotplug_bridge ||
+        s->acpi_pci_hotplug.use_acpi_root_pci_hotplug) {
         acpi_pcihp_init(OBJECT(s), &s->acpi_pci_hotplug, bus, parent,
-                        s->use_acpi_hotplug_bridge, ACPI_PCIHP_ADDR_PIIX4);
+                        ACPI_PCIHP_ADDR_PIIX4);
+        qbus_set_hotplug_handler(BUS(pci_get_bus(PCI_DEVICE(s))), OBJECT(s));
     }
 
     s->cpu_hotplug_legacy = true;
@@ -602,9 +612,9 @@ static Property piix4_pm_properties[] = {
     DEFINE_PROP_UINT8(ACPI_PM_PROP_S4_DISABLED, PIIX4PMState, disable_s4, 0),
     DEFINE_PROP_UINT8(ACPI_PM_PROP_S4_VAL, PIIX4PMState, s4_val, 2),
     DEFINE_PROP_BOOL(ACPI_PM_PROP_ACPI_PCIHP_BRIDGE, PIIX4PMState,
-                     use_acpi_hotplug_bridge, true),
+                     acpi_pci_hotplug.use_acpi_hotplug_bridge, true),
     DEFINE_PROP_BOOL(ACPI_PM_PROP_ACPI_PCI_ROOTHP, PIIX4PMState,
-                     use_acpi_root_pci_hotplug, true),
+                     acpi_pci_hotplug.use_acpi_root_pci_hotplug, true),
     DEFINE_PROP_BOOL("memory-hotplug-support", PIIX4PMState,
                      acpi_memory_hotplug.is_enabled, true),
     DEFINE_PROP_BOOL("smm-compat", PIIX4PMState, smm_compat, false),
@@ -641,6 +651,7 @@ static void piix4_pm_class_init(ObjectClass *klass, void *data)
     hc->plug = piix4_device_plug_cb;
     hc->unplug_request = piix4_device_unplug_request_cb;
     hc->unplug = piix4_device_unplug_cb;
+    hc->is_hotpluggable_bus = piix4_is_hotpluggable_bus;
     adevc->ospm_status = piix4_ospm_status;
     adevc->send_event = piix4_send_gpe;
     adevc->madt_cpu = pc_madt_cpu_entry;
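Note: taken together, the pcihp.c and piix4.c hunks above (with the matching ich9.c change) replace the old approach of making a bus non-hotpluggable by clearing its hotplug handler with a query callback: the handler stays installed and acpi_pcihp_is_hotpluggbale_bus() answers per bus. A minimal sketch of how generic code might consult the new HotplugHandlerClass hook — the helper name here is hypothetical; only the is_hotpluggable_bus field comes from this diff:

    /* Sketch only: bus_allows_hotplug() is a hypothetical helper,
     * not part of the patch. */
    static bool bus_allows_hotplug(HotplugHandler *hd, BusState *bus)
    {
        HotplugHandlerClass *hdc = HOTPLUG_HANDLER_GET_CLASS(hd);

        /* Handlers that do not implement the hook keep the old behaviour. */
        return !hdc->is_hotpluggable_bus || hdc->is_hotpluggable_bus(hd, bus);
    }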
diff --git a/hw/arm/allwinner-h3.c b/hw/arm/allwinner-h3.c
index bfce3c8d92..69d0ad6f50 100644
--- a/hw/arm/allwinner-h3.c
+++ b/hw/arm/allwinner-h3.c
@@ -54,6 +54,8 @@ const hwaddr allwinner_h3_memmap[] = {
     [AW_H3_DEV_UART2]      = 0x01c28800,
     [AW_H3_DEV_UART3]      = 0x01c28c00,
     [AW_H3_DEV_TWI0]       = 0x01c2ac00,
+    [AW_H3_DEV_TWI1]       = 0x01c2b000,
+    [AW_H3_DEV_TWI2]       = 0x01c2b400,
     [AW_H3_DEV_EMAC]       = 0x01c30000,
     [AW_H3_DEV_DRAMCOM]    = 0x01c62000,
     [AW_H3_DEV_DRAMCTL]    = 0x01c63000,
@@ -64,6 +66,7 @@ const hwaddr allwinner_h3_memmap[] = {
     [AW_H3_DEV_GIC_VCPU]   = 0x01c86000,
     [AW_H3_DEV_RTC]        = 0x01f00000,
     [AW_H3_DEV_CPUCFG]     = 0x01f01c00,
+    [AW_H3_DEV_R_TWI]      = 0x01f02400,
     [AW_H3_DEV_SDRAM]      = 0x40000000
 };
 
@@ -107,8 +110,6 @@ struct AwH3Unimplemented {
     { "uart1",     0x01c28400, 1 * KiB },
     { "uart2",     0x01c28800, 1 * KiB },
     { "uart3",     0x01c28c00, 1 * KiB },
-    { "twi1",      0x01c2b000, 1 * KiB },
-    { "twi2",      0x01c2b400, 1 * KiB },
     { "scr",       0x01c2c400, 1 * KiB },
     { "gpu",       0x01c40000, 64 * KiB },
     { "hstmr",     0x01c60000, 4 * KiB },
@@ -123,7 +124,6 @@ struct AwH3Unimplemented {
     { "r_prcm",    0x01f01400, 1 * KiB },
     { "r_twd",     0x01f01800, 1 * KiB },
     { "r_cir-rx",  0x01f02000, 1 * KiB },
-    { "r_twi",     0x01f02400, 1 * KiB },
     { "r_uart",    0x01f02800, 1 * KiB },
     { "r_pio",     0x01f02c00, 1 * KiB },
     { "r_pwm",     0x01f03800, 1 * KiB },
@@ -151,8 +151,11 @@ enum {
     AW_H3_GIC_SPI_UART2     =  2,
     AW_H3_GIC_SPI_UART3     =  3,
     AW_H3_GIC_SPI_TWI0      =  6,
+    AW_H3_GIC_SPI_TWI1      =  7,
+    AW_H3_GIC_SPI_TWI2      =  8,
     AW_H3_GIC_SPI_TIMER0    = 18,
     AW_H3_GIC_SPI_TIMER1    = 19,
+    AW_H3_GIC_SPI_R_TWI     = 44,
     AW_H3_GIC_SPI_MMC0      = 60,
     AW_H3_GIC_SPI_EHCI0     = 72,
     AW_H3_GIC_SPI_OHCI0     = 73,
@@ -227,7 +230,10 @@ static void allwinner_h3_init(Object *obj)
 
     object_initialize_child(obj, "rtc", &s->rtc, TYPE_AW_RTC_SUN6I);
 
-    object_initialize_child(obj, "twi0", &s->i2c0, TYPE_AW_I2C);
+    object_initialize_child(obj, "twi0",  &s->i2c0,  TYPE_AW_I2C_SUN6I);
+    object_initialize_child(obj, "twi1",  &s->i2c1,  TYPE_AW_I2C_SUN6I);
+    object_initialize_child(obj, "twi2",  &s->i2c2,  TYPE_AW_I2C_SUN6I);
+    object_initialize_child(obj, "r_twi", &s->r_twi, TYPE_AW_I2C_SUN6I);
 }
 
 static void allwinner_h3_realize(DeviceState *dev, Error **errp)
@@ -432,6 +438,21 @@ static void allwinner_h3_realize(DeviceState *dev, Error **errp)
     sysbus_connect_irq(SYS_BUS_DEVICE(&s->i2c0), 0,
                        qdev_get_gpio_in(DEVICE(&s->gic), AW_H3_GIC_SPI_TWI0));
 
+    sysbus_realize(SYS_BUS_DEVICE(&s->i2c1), &error_fatal);
+    sysbus_mmio_map(SYS_BUS_DEVICE(&s->i2c1), 0, s->memmap[AW_H3_DEV_TWI1]);
+    sysbus_connect_irq(SYS_BUS_DEVICE(&s->i2c1), 0,
+                       qdev_get_gpio_in(DEVICE(&s->gic), AW_H3_GIC_SPI_TWI1));
+
+    sysbus_realize(SYS_BUS_DEVICE(&s->i2c2), &error_fatal);
+    sysbus_mmio_map(SYS_BUS_DEVICE(&s->i2c2), 0, s->memmap[AW_H3_DEV_TWI2]);
+    sysbus_connect_irq(SYS_BUS_DEVICE(&s->i2c2), 0,
+                       qdev_get_gpio_in(DEVICE(&s->gic), AW_H3_GIC_SPI_TWI2));
+
+    sysbus_realize(SYS_BUS_DEVICE(&s->r_twi), &error_fatal);
+    sysbus_mmio_map(SYS_BUS_DEVICE(&s->r_twi), 0, s->memmap[AW_H3_DEV_R_TWI]);
+    sysbus_connect_irq(SYS_BUS_DEVICE(&s->r_twi), 0,
+                       qdev_get_gpio_in(DEVICE(&s->gic), AW_H3_GIC_SPI_R_TWI));
+
     /* Unimplemented devices */
     for (i = 0; i < ARRAY_SIZE(unimplemented); i++) {
         create_unimplemented_device(unimplemented[i].device_name,
diff --git a/hw/arm/aspeed.c b/hw/arm/aspeed.c
index 86601cb1a5..c1f2b9cfca 100644
--- a/hw/arm/aspeed.c
+++ b/hw/arm/aspeed.c
@@ -524,6 +524,11 @@ static void yosemitev2_bmc_i2c_init(AspeedMachineState *bmc)
     at24c_eeprom_init(aspeed_i2c_get_bus(&soc->i2c, 4), 0x51, 128 * KiB);
     at24c_eeprom_init_rom(aspeed_i2c_get_bus(&soc->i2c, 8), 0x51, 128 * KiB,
                           yosemitev2_bmc_fruid, yosemitev2_bmc_fruid_len);
+    /* TMP421 */
+    i2c_slave_create_simple(aspeed_i2c_get_bus(&soc->i2c, 11), "tmp421", 0x1f);
+    i2c_slave_create_simple(aspeed_i2c_get_bus(&soc->i2c, 9), "tmp421", 0x4e);
+    i2c_slave_create_simple(aspeed_i2c_get_bus(&soc->i2c, 9), "tmp421", 0x4f);
+
 }
 
 static void romulus_bmc_i2c_init(AspeedMachineState *bmc)
@@ -542,6 +547,10 @@ static void tiogapass_bmc_i2c_init(AspeedMachineState *bmc)
     at24c_eeprom_init(aspeed_i2c_get_bus(&soc->i2c, 4), 0x54, 128 * KiB);
     at24c_eeprom_init_rom(aspeed_i2c_get_bus(&soc->i2c, 6), 0x54, 128 * KiB,
                           tiogapass_bmc_fruid, tiogapass_bmc_fruid_len);
+    /* TMP421 */
+    i2c_slave_create_simple(aspeed_i2c_get_bus(&soc->i2c, 8), "tmp421", 0x1f);
+    i2c_slave_create_simple(aspeed_i2c_get_bus(&soc->i2c, 6), "tmp421", 0x4f);
+    i2c_slave_create_simple(aspeed_i2c_get_bus(&soc->i2c, 6), "tmp421", 0x4e);
 }
 
 static void create_pca9552(AspeedSoCState *soc, int bus_id, int addr)
diff --git a/hw/arm/aspeed_eeprom.c b/hw/arm/aspeed_eeprom.c
index 2fb2d5dbb7..dc33a88a54 100644
--- a/hw/arm/aspeed_eeprom.c
+++ b/hw/arm/aspeed_eeprom.c
@@ -101,17 +101,17 @@ const uint8_t fby35_bmc_fruid[] = {
 /* Yosemite V2 BMC FRU */
 const uint8_t yosemitev2_bmc_fruid[] = {
     0x01, 0x00, 0x00, 0x01, 0x0d, 0x00, 0x00, 0xf1, 0x01, 0x0c, 0x00, 0x36,
-    0xe6, 0xd0, 0xc6, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0xd2, 0x42, 0x4d,
-    0x43, 0x20, 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x20, 0x4d, 0x6f,
-    0x64, 0x75, 0x6c, 0x65, 0xcd, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58,
+    0xe6, 0xd0, 0xc6, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0xd2, 0x42, 0x61,
+    0x73, 0x65, 0x62, 0x6f, 0x61, 0x72, 0x64, 0x20, 0x4d, 0x50, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0xcd, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58,
     0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0xce, 0x58, 0x58, 0x58, 0x58, 0x58,
     0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0xc3, 0x31, 0x2e,
     0x30, 0xc9, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0xd2,
     0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58,
     0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0xc1, 0x39, 0x01, 0x0c, 0x00, 0xc6,
     0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0xd2, 0x59, 0x6f, 0x73, 0x65, 0x6d,
-    0x69, 0x74, 0x65, 0x20, 0x56, 0x32, 0x2e, 0x30, 0x20, 0x45, 0x56, 0x54,
-    0x32, 0xce, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58,
+    0x69, 0x74, 0x65, 0x20, 0x56, 0x32, 0x20, 0x4d, 0x50, 0x00, 0x00, 0x00,
+    0x00, 0xce, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58,
     0x58, 0x58, 0x58, 0x58, 0xc4, 0x45, 0x56, 0x54, 0x32, 0xcd, 0x58, 0x58,
     0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0xc7,
     0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0xc3, 0x31, 0x2e, 0x30, 0xc9,
diff --git a/hw/arm/boot.c b/hw/arm/boot.c
index 1e021c4a34..50e5141116 100644
--- a/hw/arm/boot.c
+++ b/hw/arm/boot.c
@@ -926,6 +926,12 @@ static uint64_t load_aarch64_image(const char *filename, hwaddr mem_base,
             return -1;
         }
         size = len;
+
+        /* Unpack the image if it is a EFI zboot image */
+        if (unpack_efi_zboot_image(&buffer, &size) < 0) {
+            g_free(buffer);
+            return -1;
+        }
     }
 
     /* check the arm64 magic header value -- very old kernels may not have it */
diff --git a/hw/audio/trace-events b/hw/audio/trace-events
index e0e71cd9b1..4dec48a4fd 100644
--- a/hw/audio/trace-events
+++ b/hw/audio/trace-events
@@ -11,3 +11,9 @@ hda_audio_running(const char *stream, int nr, bool running) "st %s, nr %d, run %
 hda_audio_format(const char *stream, int chan, const char *fmt, int freq) "st %s, %d x %s @ %d Hz"
 hda_audio_adjust(const char *stream, int pos) "st %s, pos %d"
 hda_audio_overrun(const char *stream) "st %s"
+
+#via-ac97.c
+via_ac97_codec_write(uint8_t addr, uint16_t val) "0x%x <- 0x%x"
+via_ac97_sgd_fetch(uint32_t curr, uint32_t addr, char stop, char eol, char flag, uint32_t len) "curr=0x%x addr=0x%x %c%c%c len=%d"
+via_ac97_sgd_read(uint64_t addr, unsigned size, uint64_t val) "0x%"PRIx64" %d -> 0x%"PRIx64
+via_ac97_sgd_write(uint64_t addr, unsigned size, uint64_t val) "0x%"PRIx64" %d <- 0x%"PRIx64
diff --git a/hw/audio/via-ac97.c b/hw/audio/via-ac97.c
index d1a856f63d..676254b7a4 100644
--- a/hw/audio/via-ac97.c
+++ b/hw/audio/via-ac97.c
@@ -1,39 +1,482 @@
 /*
  * VIA south bridges sound support
  *
+ * Copyright (c) 2022-2023 BALATON Zoltan
+ *
  * This work is licensed under the GNU GPL license version 2 or later.
  */
 
 /*
- * TODO: This is entirely boiler plate just registering empty PCI devices
- * with the right ID guests expect, functionality should be added here.
+ * TODO: This is only a basic implementation of one audio playback channel
+ *       more functionality should be added here.
  */
 
 #include "qemu/osdep.h"
+#include "qemu/log.h"
 #include "hw/isa/vt82c686.h"
-#include "hw/pci/pci_device.h"
+#include "ac97.h"
+#include "trace.h"
+
+#define CLEN_IS_EOL(x)  ((x)->clen & BIT(31))
+#define CLEN_IS_FLAG(x) ((x)->clen & BIT(30))
+#define CLEN_IS_STOP(x) ((x)->clen & BIT(29))
+#define CLEN_LEN(x)     ((x)->clen & 0xffffff)
+
+#define STAT_ACTIVE BIT(7)
+#define STAT_PAUSED BIT(6)
+#define STAT_TRIG   BIT(3)
+#define STAT_STOP   BIT(2)
+#define STAT_EOL    BIT(1)
+#define STAT_FLAG   BIT(0)
+
+#define CNTL_START  BIT(7)
+#define CNTL_TERM   BIT(6)
+#define CNTL_PAUSE  BIT(3)
+
+static void open_voice_out(ViaAC97State *s);
+
+static uint16_t codec_rates[] = { 8000, 11025, 16000, 22050, 32000, 44100,
+                                  48000 };
+
+#define CODEC_REG(s, o)  ((s)->codec_regs[(o) / 2])
+#define CODEC_VOL(vol, mask)  ((255 * ((vol) & mask)) / mask)
+
+static void codec_volume_set_out(ViaAC97State *s)
+{
+    int lvol, rvol, mute;
+
+    lvol = 255 - CODEC_VOL(CODEC_REG(s, AC97_Master_Volume_Mute) >> 8, 0x1f);
+    lvol *= 255 - CODEC_VOL(CODEC_REG(s, AC97_PCM_Out_Volume_Mute) >> 8, 0x1f);
+    lvol /= 255;
+    rvol = 255 - CODEC_VOL(CODEC_REG(s, AC97_Master_Volume_Mute), 0x1f);
+    rvol *= 255 - CODEC_VOL(CODEC_REG(s, AC97_PCM_Out_Volume_Mute), 0x1f);
+    rvol /= 255;
+    mute = CODEC_REG(s, AC97_Master_Volume_Mute) >> MUTE_SHIFT;
+    mute |= CODEC_REG(s, AC97_PCM_Out_Volume_Mute) >> MUTE_SHIFT;
+    AUD_set_volume_out(s->vo, mute, lvol, rvol);
+}
+
+static void codec_reset(ViaAC97State *s)
+{
+    memset(s->codec_regs, 0, sizeof(s->codec_regs));
+    CODEC_REG(s, AC97_Reset) = 0x6a90;
+    CODEC_REG(s, AC97_Master_Volume_Mute) = 0x8000;
+    CODEC_REG(s, AC97_Headphone_Volume_Mute) = 0x8000;
+    CODEC_REG(s, AC97_Master_Volume_Mono_Mute) = 0x8000;
+    CODEC_REG(s, AC97_Phone_Volume_Mute) = 0x8008;
+    CODEC_REG(s, AC97_Mic_Volume_Mute) = 0x8008;
+    CODEC_REG(s, AC97_Line_In_Volume_Mute) = 0x8808;
+    CODEC_REG(s, AC97_CD_Volume_Mute) = 0x8808;
+    CODEC_REG(s, AC97_Video_Volume_Mute) = 0x8808;
+    CODEC_REG(s, AC97_Aux_Volume_Mute) = 0x8808;
+    CODEC_REG(s, AC97_PCM_Out_Volume_Mute) = 0x8808;
+    CODEC_REG(s, AC97_Record_Gain_Mute) = 0x8000;
+    CODEC_REG(s, AC97_Powerdown_Ctrl_Stat) = 0x000f;
+    CODEC_REG(s, AC97_Extended_Audio_ID) = 0x0a05;
+    CODEC_REG(s, AC97_Extended_Audio_Ctrl_Stat) = 0x0400;
+    CODEC_REG(s, AC97_PCM_Front_DAC_Rate) = 48000;
+    CODEC_REG(s, AC97_PCM_LR_ADC_Rate) = 48000;
+    /* Sigmatel 9766 (STAC9766) */
+    CODEC_REG(s, AC97_Vendor_ID1) = 0x8384;
+    CODEC_REG(s, AC97_Vendor_ID2) = 0x7666;
+}
+
+static uint16_t codec_read(ViaAC97State *s, uint8_t addr)
+{
+    return CODEC_REG(s, addr);
+}
+
+static void codec_write(ViaAC97State *s, uint8_t addr, uint16_t val)
+{
+    trace_via_ac97_codec_write(addr, val);
+    switch (addr) {
+    case AC97_Reset:
+        codec_reset(s);
+        return;
+    case AC97_Master_Volume_Mute:
+    case AC97_PCM_Out_Volume_Mute:
+        if (addr == AC97_Master_Volume_Mute) {
+            if (val & BIT(13)) {
+                val |= 0x1f00;
+            }
+            if (val & BIT(5)) {
+                val |= 0x1f;
+            }
+        }
+        CODEC_REG(s, addr) = val & 0x9f1f;
+        codec_volume_set_out(s);
+        return;
+    case AC97_Extended_Audio_Ctrl_Stat:
+        CODEC_REG(s, addr) &= ~EACS_VRA;
+        CODEC_REG(s, addr) |= val & EACS_VRA;
+        if (!(val & EACS_VRA)) {
+            CODEC_REG(s, AC97_PCM_Front_DAC_Rate) = 48000;
+            CODEC_REG(s, AC97_PCM_LR_ADC_Rate) = 48000;
+            open_voice_out(s);
+        }
+        return;
+    case AC97_PCM_Front_DAC_Rate:
+    case AC97_PCM_LR_ADC_Rate:
+        if (CODEC_REG(s, AC97_Extended_Audio_Ctrl_Stat) & EACS_VRA) {
+            int i;
+            uint16_t rate = val;
+
+            for (i = 0; i < ARRAY_SIZE(codec_rates) - 1; i++) {
+                if (rate < codec_rates[i] +
+                    (codec_rates[i + 1] - codec_rates[i]) / 2) {
+                    rate = codec_rates[i];
+                    break;
+                }
+            }
+            if (rate > 48000) {
+                rate = 48000;
+            }
+            CODEC_REG(s, addr) = rate;
+            open_voice_out(s);
+        }
+        return;
+    case AC97_Powerdown_Ctrl_Stat:
+        CODEC_REG(s, addr) = (val & 0xff00) | (CODEC_REG(s, addr) & 0xff);
+        return;
+    case AC97_Extended_Audio_ID:
+    case AC97_Vendor_ID1:
+    case AC97_Vendor_ID2:
+        /* Read only registers */
+        return;
+    default:
+        qemu_log_mask(LOG_UNIMP,
+                      "via-ac97: Unimplemented codec register 0x%x\n", addr);
+        CODEC_REG(s, addr) = val;
+    }
+}
+
+static void fetch_sgd(ViaAC97SGDChannel *c, PCIDevice *d)
+{
+    uint32_t b[2];
+
+    if (c->curr < c->base) {
+        c->curr = c->base;
+    }
+    if (unlikely(pci_dma_read(d, c->curr, b, sizeof(b)) != MEMTX_OK)) {
+        qemu_log_mask(LOG_GUEST_ERROR,
+                      "via-ac97: DMA error reading SGD table\n");
+        return;
+    }
+    c->addr = le32_to_cpu(b[0]);
+    c->clen = le32_to_cpu(b[1]);
+    trace_via_ac97_sgd_fetch(c->curr, c->addr, CLEN_IS_STOP(c) ? 'S' : '-',
+                             CLEN_IS_EOL(c) ? 'E' : '-',
+                             CLEN_IS_FLAG(c) ? 'F' : '-', CLEN_LEN(c));
+}
+
+static void out_cb(void *opaque, int avail)
+{
+    ViaAC97State *s = opaque;
+    ViaAC97SGDChannel *c = &s->aur;
+    int temp, to_copy, copied;
+    bool stop = false;
+    uint8_t tmpbuf[4096];
+
+    if (c->stat & STAT_PAUSED) {
+        return;
+    }
+    c->stat |= STAT_ACTIVE;
+    while (avail && !stop) {
+        if (!c->clen) {
+            fetch_sgd(c, &s->dev);
+        }
+        temp = MIN(CLEN_LEN(c), avail);
+        while (temp) {
+            to_copy = MIN(temp, sizeof(tmpbuf));
+            pci_dma_read(&s->dev, c->addr, tmpbuf, to_copy);
+            copied = AUD_write(s->vo, tmpbuf, to_copy);
+            if (!copied) {
+                stop = true;
+                break;
+            }
+            temp -= copied;
+            avail -= copied;
+            c->addr += copied;
+            c->clen -= copied;
+        }
+        if (CLEN_LEN(c) == 0) {
+            c->curr += 8;
+            if (CLEN_IS_EOL(c)) {
+                c->stat |= STAT_EOL;
+                if (c->type & CNTL_START) {
+                    c->curr = c->base;
+                    c->stat |= STAT_PAUSED;
+                } else {
+                    c->stat &= ~STAT_ACTIVE;
+                    AUD_set_active_out(s->vo, 0);
+                }
+                if (c->type & STAT_EOL) {
+                    pci_set_irq(&s->dev, 1);
+                }
+            }
+            if (CLEN_IS_FLAG(c)) {
+                c->stat |= STAT_FLAG;
+                c->stat |= STAT_PAUSED;
+                if (c->type & STAT_FLAG) {
+                    pci_set_irq(&s->dev, 1);
+                }
+            }
+            if (CLEN_IS_STOP(c)) {
+                c->stat |= STAT_STOP;
+                c->stat |= STAT_PAUSED;
+            }
+            c->clen = 0;
+            stop = true;
+        }
+    }
+}
+
+static void open_voice_out(ViaAC97State *s)
+{
+    struct audsettings as = {
+        .freq = CODEC_REG(s, AC97_PCM_Front_DAC_Rate),
+        .nchannels = s->aur.type & BIT(4) ? 2 : 1,
+        .fmt = s->aur.type & BIT(5) ? AUDIO_FORMAT_S16 : AUDIO_FORMAT_S8,
+        .endianness = 0,
+    };
+    s->vo = AUD_open_out(&s->card, s->vo, "via-ac97.out", s, out_cb, &as);
+}
+
+static uint64_t sgd_read(void *opaque, hwaddr addr, unsigned size)
+{
+    ViaAC97State *s = opaque;
+    uint64_t val = 0;
+
+    switch (addr) {
+    case 0:
+        val = s->aur.stat;
+        if (s->aur.type & CNTL_START) {
+            val |= STAT_TRIG;
+        }
+        break;
+    case 1:
+        val = s->aur.stat & STAT_PAUSED ? BIT(3) : 0;
+        break;
+    case 2:
+        val = s->aur.type;
+        break;
+    case 4:
+        val = s->aur.curr;
+        break;
+    case 0xc:
+        val = CLEN_LEN(&s->aur);
+        break;
+    case 0x10:
+        /* silence unimplemented log message that happens at every IRQ */
+        break;
+    case 0x80:
+        val = s->ac97_cmd;
+        break;
+    case 0x84:
+        val = s->aur.stat & STAT_FLAG;
+        if (s->aur.stat & STAT_EOL) {
+            val |= BIT(4);
+        }
+        if (s->aur.stat & STAT_STOP) {
+            val |= BIT(8);
+        }
+        if (s->aur.stat & STAT_ACTIVE) {
+            val |= BIT(12);
+        }
+        break;
+    default:
+        qemu_log_mask(LOG_UNIMP, "via-ac97: Unimplemented register read 0x%"
+                      HWADDR_PRIx"\n", addr);
+    }
+    trace_via_ac97_sgd_read(addr, size, val);
+    return val;
+}
+
+static void sgd_write(void *opaque, hwaddr addr, uint64_t val, unsigned size)
+{
+    ViaAC97State *s = opaque;
+
+    trace_via_ac97_sgd_write(addr, size, val);
+    switch (addr) {
+    case 0:
+        if (val & STAT_STOP) {
+            s->aur.stat &= ~STAT_PAUSED;
+        }
+        if (val & STAT_EOL) {
+            s->aur.stat &= ~(STAT_EOL | STAT_PAUSED);
+            if (s->aur.type & STAT_EOL) {
+                pci_set_irq(&s->dev, 0);
+            }
+        }
+        if (val & STAT_FLAG) {
+            s->aur.stat &= ~(STAT_FLAG | STAT_PAUSED);
+            if (s->aur.type & STAT_FLAG) {
+                pci_set_irq(&s->dev, 0);
+            }
+        }
+        break;
+    case 1:
+        if (val & CNTL_START) {
+            AUD_set_active_out(s->vo, 1);
+            s->aur.stat = STAT_ACTIVE;
+        }
+        if (val & CNTL_TERM) {
+            AUD_set_active_out(s->vo, 0);
+            s->aur.stat &= ~(STAT_ACTIVE | STAT_PAUSED);
+            s->aur.clen = 0;
+        }
+        if (val & CNTL_PAUSE) {
+            AUD_set_active_out(s->vo, 0);
+            s->aur.stat &= ~STAT_ACTIVE;
+            s->aur.stat |= STAT_PAUSED;
+        } else if (!(val & CNTL_PAUSE) && (s->aur.stat & STAT_PAUSED)) {
+            AUD_set_active_out(s->vo, 1);
+            s->aur.stat |= STAT_ACTIVE;
+            s->aur.stat &= ~STAT_PAUSED;
+        }
+        break;
+    case 2:
+    {
+        uint32_t oldval = s->aur.type;
+        s->aur.type = val;
+        if ((oldval & 0x30) != (val & 0x30)) {
+            open_voice_out(s);
+        }
+        break;
+    }
+    case 4:
+        s->aur.base = val & ~1ULL;
+        s->aur.curr = s->aur.base;
+        break;
+    case 0x80:
+        if (val >> 30) {
+            /* we only have primary codec */
+            break;
+        }
+        if (val & BIT(23)) { /* read reg */
+            s->ac97_cmd = val & 0xc0ff0000ULL;
+            s->ac97_cmd |= codec_read(s, (val >> 16) & 0x7f);
+            s->ac97_cmd |= BIT(25); /* data valid */
+        } else {
+            s->ac97_cmd = val & 0xc0ffffffULL;
+            codec_write(s, (val >> 16) & 0x7f, val);
+        }
+        break;
+    case 0xc:
+    case 0x84:
+        /* Read only */
+        break;
+    default:
+        qemu_log_mask(LOG_UNIMP, "via-ac97: Unimplemented register write 0x%"
+                      HWADDR_PRIx"\n", addr);
+    }
+}
+
+static const MemoryRegionOps sgd_ops = {
+    .read = sgd_read,
+    .write = sgd_write,
+    .endianness = DEVICE_LITTLE_ENDIAN,
+};
+
+static uint64_t fm_read(void *opaque, hwaddr addr, unsigned size)
+{
+    qemu_log_mask(LOG_UNIMP, "%s: 0x%"HWADDR_PRIx" %d\n", __func__, addr, size);
+    return 0;
+}
+
+static void fm_write(void *opaque, hwaddr addr, uint64_t val, unsigned size)
+{
+    qemu_log_mask(LOG_UNIMP, "%s: 0x%"HWADDR_PRIx" %d <= 0x%"PRIX64"\n",
+                  __func__, addr, size, val);
+}
+
+static const MemoryRegionOps fm_ops = {
+    .read = fm_read,
+    .write = fm_write,
+    .endianness = DEVICE_LITTLE_ENDIAN,
+};
+
+static uint64_t midi_read(void *opaque, hwaddr addr, unsigned size)
+{
+    qemu_log_mask(LOG_UNIMP, "%s: 0x%"HWADDR_PRIx" %d\n", __func__, addr, size);
+    return 0;
+}
+
+static void midi_write(void *opaque, hwaddr addr, uint64_t val, unsigned size)
+{
+    qemu_log_mask(LOG_UNIMP, "%s: 0x%"HWADDR_PRIx" %d <= 0x%"PRIX64"\n",
+                  __func__, addr, size, val);
+}
+
+static const MemoryRegionOps midi_ops = {
+    .read = midi_read,
+    .write = midi_write,
+    .endianness = DEVICE_LITTLE_ENDIAN,
+};
+
+static void via_ac97_reset(DeviceState *dev)
+{
+    ViaAC97State *s = VIA_AC97(dev);
+
+    codec_reset(s);
+}
 
 static void via_ac97_realize(PCIDevice *pci_dev, Error **errp)
 {
-    pci_set_word(pci_dev->config + PCI_COMMAND,
-                 PCI_COMMAND_INVALIDATE | PCI_COMMAND_PARITY);
+    ViaAC97State *s = VIA_AC97(pci_dev);
+    Object *o = OBJECT(s);
+
+    /*
+     * Command register Bus Master bit is documented to be fixed at 0 but it's
+     * needed for PCI DMA to work in QEMU. The pegasos2 firmware writes 0 here
+     * and the AmigaOS driver writes 1 only enabling IO bit which works on
+     * real hardware. So set it here and fix it to 1 to allow DMA.
+     */
+    pci_set_word(pci_dev->config + PCI_COMMAND, PCI_COMMAND_MASTER);
+    pci_set_word(pci_dev->wmask + PCI_COMMAND, PCI_COMMAND_IO);
     pci_set_word(pci_dev->config + PCI_STATUS,
                  PCI_STATUS_CAP_LIST | PCI_STATUS_DEVSEL_MEDIUM);
     pci_set_long(pci_dev->config + PCI_INTERRUPT_PIN, 0x03);
+    pci_set_byte(pci_dev->config + 0x40, 1); /* codec ready */
+
+    memory_region_init_io(&s->sgd, o, &sgd_ops, s, "via-ac97.sgd", 256);
+    pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_IO, &s->sgd);
+    memory_region_init_io(&s->fm, o, &fm_ops, s, "via-ac97.fm", 4);
+    pci_register_bar(pci_dev, 1, PCI_BASE_ADDRESS_SPACE_IO, &s->fm);
+    memory_region_init_io(&s->midi, o, &midi_ops, s, "via-ac97.midi", 4);
+    pci_register_bar(pci_dev, 2, PCI_BASE_ADDRESS_SPACE_IO, &s->midi);
+
+    AUD_register_card ("via-ac97", &s->card);
 }
 
+static void via_ac97_exit(PCIDevice *dev)
+{
+    ViaAC97State *s = VIA_AC97(dev);
+
+    AUD_close_out(&s->card, s->vo);
+    AUD_remove_card(&s->card);
+}
+
+static Property via_ac97_properties[] = {
+    DEFINE_AUDIO_PROPERTIES(ViaAC97State, card),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
 static void via_ac97_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
     PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
 
     k->realize = via_ac97_realize;
+    k->exit = via_ac97_exit;
     k->vendor_id = PCI_VENDOR_ID_VIA;
     k->device_id = PCI_DEVICE_ID_VIA_AC97;
     k->revision = 0x50;
     k->class_id = PCI_CLASS_MULTIMEDIA_AUDIO;
+    device_class_set_props(dc, via_ac97_properties);
     set_bit(DEVICE_CATEGORY_SOUND, dc->categories);
     dc->desc = "VIA AC97";
+    dc->reset = via_ac97_reset;
     /* Reason: Part of a south bridge chip */
     dc->user_creatable = false;
 }
@@ -41,7 +484,7 @@ static void via_ac97_class_init(ObjectClass *klass, void *data)
 static const TypeInfo via_ac97_info = {
     .name          = TYPE_VIA_AC97,
     .parent        = TYPE_PCI_DEVICE,
-    .instance_size = sizeof(PCIDevice),
+    .instance_size = sizeof(ViaAC97State),
     .class_init    = via_ac97_class_init,
     .interfaces = (InterfaceInfo[]) {
         { INTERFACE_CONVENTIONAL_PCI_DEVICE },
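Note: as fetch_sgd() and the CLEN_* macros above imply, each SGD table entry is a pair of little-endian 32-bit words — a buffer address, then a control/length word carrying EOL, FLAG and STOP in bits 31-29 and a 24-bit byte count. A hypothetical guest-side view, for illustration only:

    /* Hypothetical layout of one SGD table entry, derived from fetch_sgd()
     * and the CLEN_* macros; not part of the patch itself. */
    struct via_sgd_entry {
        uint32_t addr;  /* guest-physical address of the sample buffer */
        uint32_t clen;  /* bit 31: EOL, bit 30: FLAG, bit 29: STOP,
                           bits 0-23: buffer length in bytes */
    };

Also worth noting: with VRA enabled, codec_write() snaps a requested sample rate to the nearest entry of codec_rates[], so writing e.g. 40000 to AC97_PCM_Front_DAC_Rate yields 44100 (40000 is above the 32000/44100 midpoint of 38050 but below the 44100/48000 midpoint of 46050).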
diff --git a/hw/block/block.c b/hw/block/block.c
index af0710e477..9f52ee6e72 100644
--- a/hw/block/block.c
+++ b/hw/block/block.c
@@ -39,8 +39,7 @@ static int blk_pread_nonzeroes(BlockBackend *blk, hwaddr size, void *buf)
             return ret;
         }
         if (!(ret & BDRV_BLOCK_ZERO)) {
-            ret = bdrv_pread(bs->file, offset, bytes,
-                             (uint8_t *) buf + offset, 0);
+            ret = blk_pread(blk, offset, bytes, (uint8_t *) buf + offset, 0);
             if (ret < 0) {
                 return ret;
             }
diff --git a/hw/block/dataplane/meson.build b/hw/block/dataplane/meson.build
index 12c6a264f1..78d7ac1a11 100644
--- a/hw/block/dataplane/meson.build
+++ b/hw/block/dataplane/meson.build
@@ -1,2 +1,2 @@
 specific_ss.add(when: 'CONFIG_VIRTIO_BLK', if_true: files('virtio-blk.c'))
-specific_ss.add(when: 'CONFIG_XEN', if_true: files('xen-block.c'))
+specific_ss.add(when: 'CONFIG_XEN_BUS', if_true: files('xen-block.c'))
diff --git a/hw/block/dataplane/xen-block.c b/hw/block/dataplane/xen-block.c
index 2785b9e849..734da42ea7 100644
--- a/hw/block/dataplane/xen-block.c
+++ b/hw/block/dataplane/xen-block.c
@@ -23,8 +23,9 @@
 #include "qemu/main-loop.h"
 #include "qemu/memalign.h"
 #include "qapi/error.h"
-#include "hw/xen/xen_common.h"
+#include "hw/xen/xen.h"
 #include "hw/block/xen_blkif.h"
+#include "hw/xen/interface/io/ring.h"
 #include "sysemu/block-backend.h"
 #include "sysemu/iothread.h"
 #include "xen-block.h"
@@ -101,9 +102,9 @@ static XenBlockRequest *xen_block_start_request(XenBlockDataPlane *dataplane)
          * re-use requests, allocate the memory once here. It will be freed
          * xen_block_dataplane_destroy() when the request list is freed.
          */
-        request->buf = qemu_memalign(XC_PAGE_SIZE,
+        request->buf = qemu_memalign(XEN_PAGE_SIZE,
                                      BLKIF_MAX_SEGMENTS_PER_REQUEST *
-                                     XC_PAGE_SIZE);
+                                     XEN_PAGE_SIZE);
         dataplane->requests_total++;
         qemu_iovec_init(&request->v, 1);
     } else {
@@ -185,7 +186,7 @@ static int xen_block_parse_request(XenBlockRequest *request)
             goto err;
         }
         if (request->req.seg[i].last_sect * dataplane->sector_size >=
-            XC_PAGE_SIZE) {
+            XEN_PAGE_SIZE) {
             error_report("error: page crossing");
             goto err;
         }
@@ -705,6 +706,7 @@ void xen_block_dataplane_stop(XenBlockDataPlane *dataplane)
         Error *local_err = NULL;
 
         xen_device_unmap_grant_refs(xendev, dataplane->sring,
+                                    dataplane->ring_ref,
                                     dataplane->nr_ring_ref, &local_err);
         dataplane->sring = NULL;
 
@@ -739,7 +741,7 @@ void xen_block_dataplane_start(XenBlockDataPlane *dataplane,
 
     dataplane->protocol = protocol;
 
-    ring_size = XC_PAGE_SIZE * dataplane->nr_ring_ref;
+    ring_size = XEN_PAGE_SIZE * dataplane->nr_ring_ref;
     switch (dataplane->protocol) {
     case BLKIF_PROTOCOL_NATIVE:
     {
diff --git a/hw/block/m25p80.c b/hw/block/m25p80.c
index 802d2eb021..dc5ffbc4ff 100644
--- a/hw/block/m25p80.c
+++ b/hw/block/m25p80.c
@@ -24,6 +24,7 @@
 #include "qemu/osdep.h"
 #include "qemu/units.h"
 #include "sysemu/block-backend.h"
+#include "hw/block/block.h"
 #include "hw/qdev-properties.h"
 #include "hw/qdev-properties-system.h"
 #include "hw/ssi/ssi.h"
@@ -1615,8 +1616,7 @@ static void m25p80_realize(SSIPeripheral *ss, Error **errp)
         trace_m25p80_binding(s);
         s->storage = blk_blockalign(s->blk, s->size);
 
-        if (blk_pread(s->blk, 0, s->size, s->storage, 0) < 0) {
-            error_setg(errp, "failed to read the initial flash content");
+        if (!blk_check_size_and_read_all(s->blk, s->storage, s->size, errp)) {
             return;
         }
     } else {
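Note: unlike the raw blk_pread() it replaces, blk_check_size_and_read_all() (declared in hw/block/block.h, now included above) should also verify that the backing image is exactly s->size bytes before reading, so a mis-sized flash image is reported as an error rather than being silently short-read.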
diff --git a/hw/block/meson.build b/hw/block/meson.build
index b434d5654c..cc2a75cc50 100644
--- a/hw/block/meson.build
+++ b/hw/block/meson.build
@@ -14,7 +14,7 @@ softmmu_ss.add(when: 'CONFIG_PFLASH_CFI02', if_true: files('pflash_cfi02.c'))
 softmmu_ss.add(when: 'CONFIG_SSI_M25P80', if_true: files('m25p80.c'))
 softmmu_ss.add(when: 'CONFIG_SSI_M25P80', if_true: files('m25p80_sfdp.c'))
 softmmu_ss.add(when: 'CONFIG_SWIM', if_true: files('swim.c'))
-softmmu_ss.add(when: 'CONFIG_XEN', if_true: files('xen-block.c'))
+softmmu_ss.add(when: 'CONFIG_XEN_BUS', if_true: files('xen-block.c'))
 softmmu_ss.add(when: 'CONFIG_TC58128', if_true: files('tc58128.c'))
 
 specific_ss.add(when: 'CONFIG_VIRTIO_BLK', if_true: files('virtio-blk.c', 'virtio-blk-common.c'))
diff --git a/hw/block/xen-block.c b/hw/block/xen-block.c
index 345b284d70..f5a744589d 100644
--- a/hw/block/xen-block.c
+++ b/hw/block/xen-block.c
@@ -19,7 +19,6 @@
 #include "qapi/qmp/qdict.h"
 #include "qapi/qmp/qstring.h"
 #include "qom/object_interfaces.h"
-#include "hw/xen/xen_common.h"
 #include "hw/block/xen_blkif.h"
 #include "hw/qdev-properties.h"
 #include "hw/xen/xen-block.h"
@@ -84,7 +83,8 @@ static void xen_block_connect(XenDevice *xendev, Error **errp)
             g_free(ring_ref);
             return;
         }
-    } else if (order <= blockdev->props.max_ring_page_order) {
+    } else if (qemu_xen_gnttab_can_map_multi() &&
+               order <= blockdev->props.max_ring_page_order) {
         unsigned int i;
 
         nr_ring_ref = 1 << order;
@@ -256,8 +256,12 @@ static void xen_block_realize(XenDevice *xendev, Error **errp)
     }
 
     xen_device_backend_printf(xendev, "feature-flush-cache", "%u", 1);
-    xen_device_backend_printf(xendev, "max-ring-page-order", "%u",
-                              blockdev->props.max_ring_page_order);
+
+    if (qemu_xen_gnttab_can_map_multi()) {
+        xen_device_backend_printf(xendev, "max-ring-page-order", "%u",
+                                  blockdev->props.max_ring_page_order);
+    }
+
     xen_device_backend_printf(xendev, "info", "%u", blockdev->info);
 
     xen_device_frontend_printf(xendev, "virtual-device", "%lu",
diff --git a/hw/char/meson.build b/hw/char/meson.build
index 7b594f51b8..e02c60dd54 100644
--- a/hw/char/meson.build
+++ b/hw/char/meson.build
@@ -18,7 +18,7 @@ softmmu_ss.add(when: 'CONFIG_SERIAL_PCI', if_true: files('serial-pci.c'))
 softmmu_ss.add(when: 'CONFIG_SERIAL_PCI_MULTI', if_true: files('serial-pci-multi.c'))
 softmmu_ss.add(when: 'CONFIG_SHAKTI_UART', if_true: files('shakti_uart.c'))
 softmmu_ss.add(when: 'CONFIG_VIRTIO_SERIAL', if_true: files('virtio-console.c'))
-softmmu_ss.add(when: 'CONFIG_XEN', if_true: files('xen_console.c'))
+softmmu_ss.add(when: 'CONFIG_XEN_BUS', if_true: files('xen_console.c'))
 softmmu_ss.add(when: 'CONFIG_XILINX', if_true: files('xilinx_uartlite.c'))
 
 softmmu_ss.add(when: 'CONFIG_AVR_USART', if_true: files('avr_usart.c'))
diff --git a/hw/char/xen_console.c b/hw/char/xen_console.c
index 63153dfde4..c7a19c0e7c 100644
--- a/hw/char/xen_console.c
+++ b/hw/char/xen_console.c
@@ -173,6 +173,48 @@ static void xencons_send(struct XenConsole *con)
 
 /* -------------------------------------------------------------------- */
 
+static int store_con_info(struct XenConsole *con)
+{
+    Chardev *cs = qemu_chr_fe_get_driver(&con->chr);
+    char *pts = NULL;
+    char *dom_path;
+    GString *path;
+    int ret = -1;
+
+    /* Only continue if we're talking to a pty. */
+    if (!CHARDEV_IS_PTY(cs)) {
+        return 0;
+    }
+    pts = cs->filename + 4;
+
+    dom_path = qemu_xen_xs_get_domain_path(xenstore, xen_domid);
+    if (!dom_path) {
+        return 0;
+    }
+
+    path = g_string_new(dom_path);
+    free(dom_path);
+
+    if (con->xendev.dev) {
+        g_string_append_printf(path, "/device/console/%d", con->xendev.dev);
+    } else {
+        g_string_append(path, "/console");
+    }
+    g_string_append(path, "/tty");
+
+    if (xenstore_write_str(con->console, path->str, pts)) {
+        fprintf(stderr, "xenstore_write_str for '%s' fail", path->str);
+        goto out;
+    }
+    ret = 0;
+
+out:
+    g_string_free(path, true);
+
+    return ret;
+}
+
 static int con_init(struct XenLegacyDevice *xendev)
 {
     struct XenConsole *con = container_of(xendev, struct XenConsole, xendev);
@@ -181,7 +223,7 @@ static int con_init(struct XenLegacyDevice *xendev)
     const char *output;
 
     /* setup */
-    dom = xs_get_domain_path(xenstore, con->xendev.dom);
+    dom = qemu_xen_xs_get_domain_path(xenstore, con->xendev.dom);
     if (!xendev->dev) {
         snprintf(con->console, sizeof(con->console), "%s/console", dom);
     } else {
@@ -215,8 +257,7 @@ static int con_init(struct XenLegacyDevice *xendev)
                          &error_abort);
     }
 
-    xenstore_store_pv_console_info(con->xendev.dev,
-                                   qemu_chr_fe_get_driver(&con->chr));
+    store_con_info(con);
 
 out:
     g_free(type);
@@ -237,9 +278,9 @@ static int con_initialise(struct XenLegacyDevice *xendev)
 
     if (!xendev->dev) {
         xen_pfn_t mfn = con->ring_ref;
-        con->sring = xenforeignmemory_map(xen_fmem, con->xendev.dom,
-                                          PROT_READ | PROT_WRITE,
-                                          1, &mfn, NULL);
+        con->sring = qemu_xen_foreignmem_map(con->xendev.dom, NULL,
+                                             PROT_READ | PROT_WRITE,
+                                             1, &mfn, NULL);
     } else {
         con->sring = xen_be_map_grant_ref(xendev, con->ring_ref,
                                           PROT_READ | PROT_WRITE);
@@ -269,9 +310,9 @@ static void con_disconnect(struct XenLegacyDevice *xendev)
 
     if (con->sring) {
         if (!xendev->dev) {
-            xenforeignmemory_unmap(xen_fmem, con->sring, 1);
+            qemu_xen_foreignmem_unmap(con->sring, 1);
         } else {
-            xen_be_unmap_grant_ref(xendev, con->sring);
+            xen_be_unmap_grant_ref(xendev, con->sring, con->ring_ref);
         }
         con->sring = NULL;
     }
diff --git a/hw/core/loader.c b/hw/core/loader.c
index 173f8f67f6..cd53235fed 100644
--- a/hw/core/loader.c
+++ b/hw/core/loader.c
@@ -857,6 +857,97 @@ ssize_t load_image_gzipped(const char *filename, hwaddr addr, uint64_t max_sz)
     return bytes;
 }
 
+/* The PE/COFF MS-DOS stub magic number */
+#define EFI_PE_MSDOS_MAGIC        "MZ"
+
+/*
+ * The Linux header magic number for an EFI PE/COFF
+ * image targeting an unspecified architecture.
+ */
+#define EFI_PE_LINUX_MAGIC        "\xcd\x23\x82\x81"
+
+/*
+ * Bootable Linux kernel images may be packaged as EFI zboot images, which are
+ * self-decompressing executables when loaded via EFI. The compressed payload
+ * can also be extracted from the image and decompressed by a non-EFI loader.
+ *
+ * The de facto specification for this format is at the following URL:
+ *
+ * https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/firmware/efi/libstub/zboot-header.S
+ *
+ * This definition is based on Linux upstream commit 29636a5ce87beba.
+ */
+struct linux_efi_zboot_header {
+    uint8_t     msdos_magic[2];         /* PE/COFF 'MZ' magic number */
+    uint8_t     reserved0[2];
+    uint8_t     zimg[4];                /* "zimg" for Linux EFI zboot images */
+    uint32_t    payload_offset;         /* LE offset to compressed payload */
+    uint32_t    payload_size;           /* LE size of the compressed payload */
+    uint8_t     reserved1[8];
+    char        compression_type[32];   /* Compression type, NUL terminated */
+    uint8_t     linux_magic[4];         /* Linux header magic */
+    uint32_t    pe_header_offset;       /* LE offset to the PE header */
+};
+
+/*
+ * Check whether *buffer points to a Linux EFI zboot image in memory.
+ *
+ * If it does, attempt to decompress it to a new buffer, and free the old one.
+ * If any of this fails, return an error to the caller.
+ *
+ * If the image is not a Linux EFI zboot image, do nothing and return success.
+ */
+ssize_t unpack_efi_zboot_image(uint8_t **buffer, int *size)
+{
+    const struct linux_efi_zboot_header *header;
+    uint8_t *data = NULL;
+    int ploff, plsize;
+    ssize_t bytes;
+
+    /* ignore if this is too small to be an EFI zboot image */
+    if (*size < sizeof(*header)) {
+        return 0;
+    }
+
+    header = (struct linux_efi_zboot_header *)*buffer;
+
+    /* ignore if this is not a Linux EFI zboot image */
+    if (memcmp(&header->msdos_magic, EFI_PE_MSDOS_MAGIC, 2) != 0 ||
+        memcmp(&header->zimg, "zimg", 4) != 0 ||
+        memcmp(&header->linux_magic, EFI_PE_LINUX_MAGIC, 4) != 0) {
+        return 0;
+    }
+
+    if (strcmp(header->compression_type, "gzip") != 0) {
+        fprintf(stderr,
+                "unable to handle EFI zboot image with \"%.*s\" compression\n",
+                (int)sizeof(header->compression_type) - 1,
+                header->compression_type);
+        return -1;
+    }
+
+    ploff = ldl_le_p(&header->payload_offset);
+    plsize = ldl_le_p(&header->payload_size);
+
+    if (ploff < 0 || plsize < 0 || ploff + plsize > *size) {
+        fprintf(stderr, "unable to handle corrupt EFI zboot image\n");
+        return -1;
+    }
+
+    data = g_malloc(LOAD_IMAGE_MAX_GUNZIP_BYTES);
+    bytes = gunzip(data, LOAD_IMAGE_MAX_GUNZIP_BYTES, *buffer + ploff, plsize);
+    if (bytes < 0) {
+        fprintf(stderr, "failed to decompress EFI zboot image\n");
+        g_free(data);
+        return -1;
+    }
+
+    g_free(*buffer);
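+    /* Shrink the scratch buffer down to the actual decompressed size. */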
+    *buffer = g_realloc(data, bytes);
+    *size = bytes;
+    return bytes;
+}
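+
+/*
+ * Illustrative call pattern (not part of this change): a loader that has
+ * just read a kernel image into a heap buffer can pass it through
+ * unconditionally, since non-zboot images are returned unmodified:
+ *
+ *     if (unpack_efi_zboot_image(&buffer, &size) < 0) {
+ *         ... handle a corrupt or unsupported zboot image ...
+ *     }
+ */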
+
 /*
  * Functions for reboot-persistent memory regions.
  *  - used for vga bios and option roms.
diff --git a/hw/core/machine.c b/hw/core/machine.c
index 1cf6822e06..45e3d24fdc 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -40,6 +40,7 @@
 #include "hw/virtio/virtio-pci.h"
 
 GlobalProperty hw_compat_7_2[] = {
+    { "e1000e", "migrate-timadj", "off" },
     { "virtio-mem", "x-early-migration", "false" },
 };
 const size_t hw_compat_7_2_len = G_N_ELEMENTS(hw_compat_7_2);
diff --git a/hw/cxl/cxl-component-utils.c b/hw/cxl/cxl-component-utils.c
index 3edd303a33..b665d4f565 100644
--- a/hw/cxl/cxl-component-utils.c
+++ b/hw/cxl/cxl-component-utils.c
@@ -141,17 +141,19 @@ static void ras_init_common(uint32_t *reg_state, uint32_t *write_msk)
      * Error status is RW1C but given bits are not yet set, it can
      * be handled as RO.
      */
-    reg_state[R_CXL_RAS_UNC_ERR_STATUS] = 0;
+    stl_le_p(reg_state + R_CXL_RAS_UNC_ERR_STATUS, 0);
+    stl_le_p(write_msk + R_CXL_RAS_UNC_ERR_STATUS, 0x1cfff);
     /* Bits 12-13 and 17-31 reserved in CXL 2.0 */
-    reg_state[R_CXL_RAS_UNC_ERR_MASK] = 0x1cfff;
-    write_msk[R_CXL_RAS_UNC_ERR_MASK] = 0x1cfff;
-    reg_state[R_CXL_RAS_UNC_ERR_SEVERITY] = 0x1cfff;
-    write_msk[R_CXL_RAS_UNC_ERR_SEVERITY] = 0x1cfff;
-    reg_state[R_CXL_RAS_COR_ERR_STATUS] = 0;
-    reg_state[R_CXL_RAS_COR_ERR_MASK] = 0x7f;
-    write_msk[R_CXL_RAS_COR_ERR_MASK] = 0x7f;
+    stl_le_p(reg_state + R_CXL_RAS_UNC_ERR_MASK, 0x1cfff);
+    stl_le_p(write_msk + R_CXL_RAS_UNC_ERR_MASK, 0x1cfff);
+    stl_le_p(reg_state + R_CXL_RAS_UNC_ERR_SEVERITY, 0x1cfff);
+    stl_le_p(write_msk + R_CXL_RAS_UNC_ERR_SEVERITY, 0x1cfff);
+    stl_le_p(reg_state + R_CXL_RAS_COR_ERR_STATUS, 0);
+    stl_le_p(write_msk + R_CXL_RAS_COR_ERR_STATUS, 0x7f);
+    stl_le_p(reg_state + R_CXL_RAS_COR_ERR_MASK, 0x7f);
+    stl_le_p(write_msk + R_CXL_RAS_COR_ERR_MASK, 0x7f);
     /* CXL switches and devices must set */
-    reg_state[R_CXL_RAS_ERR_CAP_CTRL] = 0x00;
+    stl_le_p(reg_state + R_CXL_RAS_ERR_CAP_CTRL, 0x200);
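+    /*
+     * Using stl_le_p() keeps the cache_mem register file little-endian
+     * regardless of host byte order.
+     */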
 }
 
 static void hdm_init_common(uint32_t *reg_state, uint32_t *write_msk,
diff --git a/hw/cxl/cxl-host.c b/hw/cxl/cxl-host.c
index 3c1ec8732a..6e923ceeaf 100644
--- a/hw/cxl/cxl-host.c
+++ b/hw/cxl/cxl-host.c
@@ -146,21 +146,28 @@ static PCIDevice *cxl_cfmws_find_device(CXLFixedWindow *fw, hwaddr addr)
         return NULL;
     }
 
-    hb_cstate = cxl_get_hb_cstate(hb);
-    if (!hb_cstate) {
-        return NULL;
-    }
+    if (cxl_get_hb_passthrough(hb)) {
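+        /* Passthrough host bridge: no HDM decoder to consult; use its single root port. */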
+        rp = pcie_find_port_first(hb->bus);
+        if (!rp) {
+            return NULL;
+        }
+    } else {
+        hb_cstate = cxl_get_hb_cstate(hb);
+        if (!hb_cstate) {
+            return NULL;
+        }
 
-    cache_mem = hb_cstate->crb.cache_mem_registers;
+        cache_mem = hb_cstate->crb.cache_mem_registers;
 
-    target_found = cxl_hdm_find_target(cache_mem, addr, &target);
-    if (!target_found) {
-        return NULL;
-    }
+        target_found = cxl_hdm_find_target(cache_mem, addr, &target);
+        if (!target_found) {
+            return NULL;
+        }
 
-    rp = pcie_find_port_by_pn(hb->bus, target);
-    if (!rp) {
-        return NULL;
+        rp = pcie_find_port_by_pn(hb->bus, target);
+        if (!rp) {
+            return NULL;
+        }
     }
 
     d = pci_bridge_get_sec_bus(PCI_BRIDGE(rp))->devices[0];
diff --git a/hw/display/meson.build b/hw/display/meson.build
index f470179122..4191694380 100644
--- a/hw/display/meson.build
+++ b/hw/display/meson.build
@@ -14,7 +14,7 @@ softmmu_ss.add(when: 'CONFIG_PL110', if_true: files('pl110.c'))
 softmmu_ss.add(when: 'CONFIG_SII9022', if_true: files('sii9022.c'))
 softmmu_ss.add(when: 'CONFIG_SSD0303', if_true: files('ssd0303.c'))
 softmmu_ss.add(when: 'CONFIG_SSD0323', if_true: files('ssd0323.c'))
-softmmu_ss.add(when: 'CONFIG_XEN', if_true: files('xenfb.c'))
+softmmu_ss.add(when: 'CONFIG_XEN_BUS', if_true: files('xenfb.c'))
 
 softmmu_ss.add(when: 'CONFIG_VGA_PCI', if_true: files('vga-pci.c'))
 softmmu_ss.add(when: 'CONFIG_VGA_ISA', if_true: files('vga-isa.c'))
diff --git a/hw/display/sm501.c b/hw/display/sm501.c
index 17835159fc..dbabbc4339 100644
--- a/hw/display/sm501.c
+++ b/hw/display/sm501.c
@@ -465,6 +465,7 @@ typedef struct SM501State {
     uint32_t last_width;
     uint32_t last_height;
     bool do_full_update; /* perform a full update next time */
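+    /* Which 2D ops may use pixman: BIT(0) fill, BIT(1) blit,
+     * BIT(2) overlap blit (see the "x-pixman" property). */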
+    uint8_t use_pixman;
     I2CBus *i2c_bus;
 
     /* mmio registers */
@@ -827,7 +828,7 @@ static void sm501_2d_operation(SM501State *s)
                 de = db + (width + (height - 1) * dst_pitch) * bypp;
                 overlap = (db < se && sb < de);
             }
-            if (overlap) {
+            if (overlap && (s->use_pixman & BIT(2))) {
                 /* pixman can't do reverse blit: copy via temporary */
                 int tmp_stride = DIV_ROUND_UP(width * bypp, sizeof(uint32_t));
                 uint32_t *tmp = tmp_buf;
@@ -852,13 +853,15 @@ static void sm501_2d_operation(SM501State *s)
                 if (tmp != tmp_buf) {
                     g_free(tmp);
                 }
-            } else {
+            } else if (!overlap && (s->use_pixman & BIT(1))) {
                 fallback = !pixman_blt((uint32_t *)&s->local_mem[src_base],
                                        (uint32_t *)&s->local_mem[dst_base],
                                        src_pitch * bypp / sizeof(uint32_t),
                                        dst_pitch * bypp / sizeof(uint32_t),
                                        8 * bypp, 8 * bypp, src_x, src_y,
                                        dst_x, dst_y, width, height);
+            } else {
+                fallback = true;
             }
             if (fallback) {
                 uint8_t *sp = s->local_mem + src_base;
@@ -891,7 +894,7 @@ static void sm501_2d_operation(SM501State *s)
             color = cpu_to_le16(color);
         }
 
-        if ((width == 1 && height == 1) ||
+        if (!(s->use_pixman & BIT(0)) || (width == 1 && height == 1) ||
             !pixman_fill((uint32_t *)&s->local_mem[dst_base],
                          dst_pitch * bypp / sizeof(uint32_t), 8 * bypp,
                          dst_x, dst_y, width, height, color)) {
@@ -2035,6 +2038,7 @@ static void sm501_realize_sysbus(DeviceState *dev, Error **errp)
 
 static Property sm501_sysbus_properties[] = {
     DEFINE_PROP_UINT32("vram-size", SM501SysBusState, vram_size, 0),
+    DEFINE_PROP_UINT8("x-pixman", SM501SysBusState, state.use_pixman, 7),
     DEFINE_PROP_END_OF_LIST(),
 };
 
@@ -2122,6 +2126,7 @@ static void sm501_realize_pci(PCIDevice *dev, Error **errp)
 
 static Property sm501_pci_properties[] = {
     DEFINE_PROP_UINT32("vram-size", SM501PCIState, vram_size, 64 * MiB),
+    DEFINE_PROP_UINT8("x-pixman", SM501PCIState, state.use_pixman, 7),
     DEFINE_PROP_END_OF_LIST(),
 };
 
@@ -2162,11 +2167,18 @@ static void sm501_pci_class_init(ObjectClass *klass, void *data)
     dc->vmsd = &vmstate_sm501_pci;
 }
 
+static void sm501_pci_init(Object *o)
+{
+    object_property_set_description(o, "x-pixman", "Use pixman for: "
+                                    "1: fill, 2: blit, 4: overlap blit");
+}
+
 static const TypeInfo sm501_pci_info = {
     .name          = TYPE_PCI_SM501,
     .parent        = TYPE_PCI_DEVICE,
     .instance_size = sizeof(SM501PCIState),
     .class_init    = sm501_pci_class_init,
+    .instance_init = sm501_pci_init,
     .interfaces = (InterfaceInfo[]) {
         { INTERFACE_CONVENTIONAL_PCI_DEVICE },
         { },
diff --git a/hw/display/xenfb.c b/hw/display/xenfb.c
index 260eb38a76..0074a9b6f8 100644
--- a/hw/display/xenfb.c
+++ b/hw/display/xenfb.c
@@ -98,8 +98,9 @@ static int common_bind(struct common *c)
     if (xenstore_read_fe_int(&c->xendev, "event-channel", &c->xendev.remote_port) == -1)
         return -1;
 
-    c->page = xenforeignmemory_map(xen_fmem, c->xendev.dom,
-                                   PROT_READ | PROT_WRITE, 1, &mfn, NULL);
+    c->page = qemu_xen_foreignmem_map(c->xendev.dom, NULL,
+                                      PROT_READ | PROT_WRITE, 1, &mfn,
+                                      NULL);
     if (c->page == NULL)
         return -1;
 
@@ -115,7 +116,7 @@ static void common_unbind(struct common *c)
 {
     xen_pv_unbind_evtchn(&c->xendev);
     if (c->page) {
-        xenforeignmemory_unmap(xen_fmem, c->page, 1);
+        qemu_xen_foreignmem_unmap(c->page, 1);
         c->page = NULL;
     }
 }
@@ -488,27 +489,28 @@ static int xenfb_map_fb(struct XenFB *xenfb)
     }
 
     if (xenfb->pixels) {
-        munmap(xenfb->pixels, xenfb->fbpages * XC_PAGE_SIZE);
+        munmap(xenfb->pixels, xenfb->fbpages * XEN_PAGE_SIZE);
         xenfb->pixels = NULL;
     }
 
-    xenfb->fbpages = DIV_ROUND_UP(xenfb->fb_len, XC_PAGE_SIZE);
+    xenfb->fbpages = DIV_ROUND_UP(xenfb->fb_len, XEN_PAGE_SIZE);
     n_fbdirs = xenfb->fbpages * mode / 8;
-    n_fbdirs = DIV_ROUND_UP(n_fbdirs, XC_PAGE_SIZE);
+    n_fbdirs = DIV_ROUND_UP(n_fbdirs, XEN_PAGE_SIZE);
 
     pgmfns = g_new0(xen_pfn_t, n_fbdirs);
     fbmfns = g_new0(xen_pfn_t, xenfb->fbpages);
 
     xenfb_copy_mfns(mode, n_fbdirs, pgmfns, pd);
-    map = xenforeignmemory_map(xen_fmem, xenfb->c.xendev.dom,
-                               PROT_READ, n_fbdirs, pgmfns, NULL);
+    map = qemu_xen_foreignmem_map(xenfb->c.xendev.dom, NULL, PROT_READ,
+                                  n_fbdirs, pgmfns, NULL);
     if (map == NULL)
         goto out;
     xenfb_copy_mfns(mode, xenfb->fbpages, fbmfns, map);
-    xenforeignmemory_unmap(xen_fmem, map, n_fbdirs);
+    qemu_xen_foreignmem_unmap(map, n_fbdirs);
 
-    xenfb->pixels = xenforeignmemory_map(xen_fmem, xenfb->c.xendev.dom,
-            PROT_READ, xenfb->fbpages, fbmfns, NULL);
+    xenfb->pixels = qemu_xen_foreignmem_map(xenfb->c.xendev.dom, NULL,
+                                            PROT_READ, xenfb->fbpages,
+                                            fbmfns, NULL);
     if (xenfb->pixels == NULL)
         goto out;
 
@@ -526,8 +528,8 @@ static int xenfb_configure_fb(struct XenFB *xenfb, size_t fb_len_lim,
 {
     size_t mfn_sz = sizeof_field(struct xenfb_page, pd[0]);
     size_t pd_len = sizeof_field(struct xenfb_page, pd) / mfn_sz;
-    size_t fb_pages = pd_len * XC_PAGE_SIZE / mfn_sz;
-    size_t fb_len_max = fb_pages * XC_PAGE_SIZE;
+    size_t fb_pages = pd_len * XEN_PAGE_SIZE / mfn_sz;
+    size_t fb_len_max = fb_pages * XEN_PAGE_SIZE;
     int max_width, max_height;
 
     if (fb_len_lim > fb_len_max) {
@@ -927,8 +929,8 @@ static void fb_disconnect(struct XenLegacyDevice *xendev)
      *   Replacing the framebuffer with anonymous shared memory
      *   instead.  This releases the guest pages and keeps qemu happy.
      */
-    xenforeignmemory_unmap(xen_fmem, fb->pixels, fb->fbpages);
-    fb->pixels = mmap(fb->pixels, fb->fbpages * XC_PAGE_SIZE,
+    qemu_xen_foreignmem_unmap(fb->pixels, fb->fbpages);
+    fb->pixels = mmap(fb->pixels, fb->fbpages * XEN_PAGE_SIZE,
                       PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON,
                       -1, 0);
     if (fb->pixels == MAP_FAILED) {
diff --git a/hw/i2c/allwinner-i2c.c b/hw/i2c/allwinner-i2c.c
index a435965836..f24c3ac6f0 100644
--- a/hw/i2c/allwinner-i2c.c
+++ b/hw/i2c/allwinner-i2c.c
@@ -357,10 +357,16 @@ static void allwinner_i2c_write(void *opaque, hwaddr offset,
                 s->stat = STAT_FROM_STA(STAT_IDLE);
                 s->cntr &= ~TWI_CNTR_M_STP;
             }
-            if ((s->cntr & TWI_CNTR_INT_FLAG) == 0) {
-                /* Interrupt flag cleared */
+
+            if (!s->irq_clear_inverted && !(s->cntr & TWI_CNTR_INT_FLAG)) {
+                /* Write 0 to clear this flag */
+                qemu_irq_lower(s->irq);
+            } else if (s->irq_clear_inverted && (s->cntr & TWI_CNTR_INT_FLAG)) {
+                /* Write 1 to clear this flag */
+                s->cntr &= ~TWI_CNTR_INT_FLAG;
                 qemu_irq_lower(s->irq);
             }
+
             if ((s->cntr & TWI_CNTR_A_ACK) == 0) {
                 if (STAT_TO_STA(s->stat) == STAT_M_DATA_RX_ACK) {
                     s->stat = STAT_FROM_STA(STAT_M_DATA_RX_NACK);
@@ -451,9 +457,25 @@ static const TypeInfo allwinner_i2c_type_info = {
     .class_init = allwinner_i2c_class_init,
 };
 
+static void allwinner_i2c_sun6i_init(Object *obj)
+{
+    AWI2CState *s = AW_I2C(obj);
+
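+    /* On the sun6i variant, TWI_CNTR_INT_FLAG is cleared by writing 1 (W1C), not 0. */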
+    s->irq_clear_inverted = true;
+}
+
+static const TypeInfo allwinner_i2c_sun6i_type_info = {
+    .name = TYPE_AW_I2C_SUN6I,
+    .parent = TYPE_SYS_BUS_DEVICE,
+    .instance_size = sizeof(AWI2CState),
+    .instance_init = allwinner_i2c_sun6i_init,
+    .class_init = allwinner_i2c_class_init,
+};
+
 static void allwinner_i2c_register_types(void)
 {
     type_register_static(&allwinner_i2c_type_info);
+    type_register_static(&allwinner_i2c_sun6i_type_info);
 }
 
 type_init(allwinner_i2c_register_types)
diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index b19fb4259e..ec857a117e 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -373,6 +373,104 @@ Aml *aml_pci_device_dsm(void)
     return method;
 }
 
+static void build_append_pci_dsm_func0_common(Aml *ctx, Aml *retvar)
+{
+    Aml *UUID, *ifctx1;
+    uint8_t byte_list[1] = { 0 }; /* nothing supported yet */
+
+    aml_append(ctx, aml_store(aml_buffer(1, byte_list), retvar));
+    /*
+     * PCI Firmware Specification 3.1
+     * 4.6.  _DSM Definitions for PCI
+     */
+    UUID = aml_touuid("E5C937D0-3553-4D7A-9117-EA4D19C3434D");
+    ifctx1 = aml_if(aml_lnot(aml_equal(aml_arg(0), UUID)));
+    {
+        /* call is for unsupported UUID, bail out */
+        aml_append(ifctx1, aml_return(retvar));
+    }
+    aml_append(ctx, ifctx1);
+
+    ifctx1 = aml_if(aml_lless(aml_arg(1), aml_int(2)));
+    {
+        /* call is for unsupported REV, bail out */
+        aml_append(ifctx1, aml_return(retvar));
+    }
+    aml_append(ctx, ifctx1);
+}
+
+static Aml *aml_pci_edsm(void)
+{
+    Aml *method, *ifctx;
+    Aml *zero = aml_int(0);
+    Aml *func = aml_arg(2);
+    Aml *ret = aml_local(0);
+    Aml *aidx = aml_local(1);
+    Aml *params = aml_arg(4);
+
+    method = aml_method("EDSM", 5, AML_SERIALIZED);
+
+    /* get supported functions */
+    ifctx = aml_if(aml_equal(func, zero));
+    {
+        /* 1: have supported functions */
+        /* 7: support for function 7 */
+        const uint8_t caps = 1 | BIT(7);
+        build_append_pci_dsm_func0_common(ifctx, ret);
+        aml_append(ifctx, aml_store(aml_int(caps), aml_index(ret, zero)));
+        aml_append(ifctx, aml_return(ret));
+    }
+    aml_append(method, ifctx);
+
+    /* handle specific functions requests */
+    /*
+     * PCI Firmware Specification 3.1
+     * 4.6.7. _DSM for Naming a PCI or PCI Express Device Under
+     *        Operating Systems
+     */
+    ifctx = aml_if(aml_equal(func, aml_int(7)));
+    {
+       Aml *pkg = aml_package(2);
+       aml_append(pkg, zero);
+       /* optional; if not implemented, should return a null (empty) string */
+       aml_append(pkg, aml_string("%s", ""));
+       aml_append(ifctx, aml_store(pkg, ret));
+
+       /*
+        * IASL is fine with initializing a Package from computational data,
+        * however it makes the guest unhappy (it fails to process such AML).
+        * So use a runtime assignment to set acpi-index after the
+        * initializer, to keep OSPM happy.
+        */
+       aml_append(ifctx,
+           aml_store(aml_derefof(aml_index(params, aml_int(0))), aidx));
+       aml_append(ifctx, aml_store(aidx, aml_index(ret, zero)));
+       aml_append(ifctx, aml_return(ret));
+    }
+    aml_append(method, ifctx);
+
+    return method;
+}
+
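+/*
+ * Build a per-device _DSM for a cold-plugged endpoint: it wraps the device's
+ * static acpi-index in a Package and forwards all arguments to the shared
+ * EDSM method above.
+ */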
+static Aml *aml_pci_static_endpoint_dsm(PCIDevice *pdev)
+{
+    Aml *method;
+
+    g_assert(pdev->acpi_index != 0);
+    method = aml_method("_DSM", 4, AML_SERIALIZED);
+    {
+        Aml *params = aml_local(0);
+        Aml *pkg = aml_package(1);
+        aml_append(pkg, aml_int(pdev->acpi_index));
+        aml_append(method, aml_store(pkg, params));
+        aml_append(method,
+            aml_return(aml_call5("EDSM", aml_arg(0), aml_arg(1),
+                                 aml_arg(2), aml_arg(3), params))
+        );
+    }
+    return method;
+}
+
 static void build_append_pcihp_notify_entry(Aml *method, int slot)
 {
     Aml *if_ctx;
@@ -396,12 +494,6 @@ static bool is_devfn_ignored_generic(const int devfn, const PCIBus *bus)
             if (DEVICE(pdev)->hotplugged) {
                 return true;
             }
-        } else if (!get_dev_aml_func(DEVICE(pdev))) {
-            /*
-             * Ignore all other devices on !0 functions unless they
-             * have AML description (i.e have get_dev_aml_func() != 0)
-             */
-            return true;
         }
     }
     return false;
@@ -428,12 +520,14 @@ static bool is_devfn_ignored_hotplug(const int devfn, const PCIBus *bus)
     return false;
 }
 
-static void build_append_pcihp_slots(Aml *parent_scope, PCIBus *bus,
-                                     QObject *bsel)
+void build_append_pcihp_slots(Aml *parent_scope, PCIBus *bus)
 {
     int devfn;
     Aml *dev, *notify_method = NULL, *method;
+    QObject *bsel = object_property_get_qobject(OBJECT(bus),
+                        ACPI_PCIHP_PROP_BSEL, NULL);
     uint64_t bsel_val = qnum_get_uint(qobject_to(QNum, bsel));
+    qobject_unref(bsel);
 
     aml_append(parent_scope, aml_name_decl("BSEL", aml_int(bsel_val)));
     notify_method = aml_method("DVNT", 2, AML_NOTSERIALIZED);
@@ -478,12 +572,9 @@ static void build_append_pcihp_slots(Aml *parent_scope, PCIBus *bus,
 
 void build_append_pci_bus_devices(Aml *parent_scope, PCIBus *bus)
 {
-    QObject *bsel;
     int devfn;
     Aml *dev;
 
-    bsel = object_property_get_qobject(OBJECT(bus), ACPI_PCIHP_PROP_BSEL, NULL);
-
     for (devfn = 0; devfn < ARRAY_SIZE(bus->devices); devfn++) {
         /* ACPI spec: 1.0b: Table 6-2 _ADR Object Bus Types, PCI type */
         int adr = PCI_SLOT(devfn) << 16 | PCI_FUNC(devfn);
@@ -498,16 +589,16 @@ void build_append_pci_bus_devices(Aml *parent_scope, PCIBus *bus)
         aml_append(dev, aml_name_decl("_ADR", aml_int(adr)));
 
         call_dev_aml_func(DEVICE(bus->devices[devfn]), dev);
+        /* add _DSM if device has acpi-index set */
+        if (pdev->acpi_index &&
+            !object_property_get_bool(OBJECT(pdev), "hotpluggable",
+                                      &error_abort)) {
+            aml_append(dev, aml_pci_static_endpoint_dsm(pdev));
+        }
 
         /* device descriptor has been composed, add it into parent context */
         aml_append(parent_scope, dev);
     }
-
-    if (bsel) {
-        build_append_pcihp_slots(parent_scope, bus, bsel);
-    }
-
-    qobject_unref(bsel);
 }
 
 static bool build_append_notfication_callback(Aml *parent_scope,
@@ -517,16 +608,24 @@ static bool build_append_notfication_callback(Aml *parent_scope,
     PCIBus *sec;
     QObject *bsel;
     int nr_notifiers = 0;
+    GQueue *pcnt_bus_list = g_queue_new();
 
     QLIST_FOREACH(sec, &bus->child, sibling) {
         Aml *br_scope = aml_scope("S%.02X", sec->parent_dev->devfn);
-        if (pci_bus_is_root(sec) ||
-            !object_property_find(OBJECT(sec), ACPI_PCIHP_PROP_BSEL)) {
+        if (pci_bus_is_root(sec)) {
             continue;
         }
         nr_notifiers = nr_notifiers +
                        build_append_notfication_callback(br_scope, sec);
-        aml_append(parent_scope, br_scope);
+        /*
+         * Add the new child scope to the parent, and keep track of the
+         * buses that have a PCNT; the bus list is used later to call the
+         * children's PCNT methods from this level's PCNT.
+         */
+        if (nr_notifiers) {
+            g_queue_push_tail(pcnt_bus_list, sec);
+            aml_append(parent_scope, br_scope);
+        }
     }
 
     /*
@@ -550,30 +649,25 @@ static bool build_append_notfication_callback(Aml *parent_scope,
     }
 
     /* Notify about child bus events in any case */
-    QLIST_FOREACH(sec, &bus->child, sibling) {
-        if (pci_bus_is_root(sec) ||
-            !object_property_find(OBJECT(sec), ACPI_PCIHP_PROP_BSEL)) {
-            continue;
-        }
-
+    while ((sec = g_queue_pop_head(pcnt_bus_list))) {
         aml_append(method, aml_name("^S%.02X.PCNT", sec->parent_dev->devfn));
     }
 
     aml_append(parent_scope, method);
     qobject_unref(bsel);
+    g_queue_free(pcnt_bus_list);
     return !!nr_notifiers;
 }
 
 static Aml *aml_pci_pdsm(void)
 {
-    Aml *method, *UUID, *ifctx, *ifctx1;
+    Aml *method, *ifctx, *ifctx1;
     Aml *ret = aml_local(0);
     Aml *caps = aml_local(1);
     Aml *acpi_index = aml_local(2);
     Aml *zero = aml_int(0);
     Aml *one = aml_int(1);
     Aml *func = aml_arg(2);
-    Aml *rev = aml_arg(1);
     Aml *params = aml_arg(4);
     Aml *bnum = aml_derefof(aml_index(params, aml_int(0)));
     Aml *sunum = aml_derefof(aml_index(params, aml_int(1)));
@@ -583,29 +677,9 @@ static Aml *aml_pci_pdsm(void)
     /* get supported functions */
     ifctx = aml_if(aml_equal(func, zero));
     {
-        uint8_t byte_list[1] = { 0 }; /* nothing supported yet */
-        aml_append(ifctx, aml_store(aml_buffer(1, byte_list), ret));
-        aml_append(ifctx, aml_store(zero, caps));
-
-       /*
-        * PCI Firmware Specification 3.1
-        * 4.6.  _DSM Definitions for PCI
-        */
-        UUID = aml_touuid("E5C937D0-3553-4D7A-9117-EA4D19C3434D");
-        ifctx1 = aml_if(aml_lnot(aml_equal(aml_arg(0), UUID)));
-        {
-            /* call is for unsupported UUID, bail out */
-            aml_append(ifctx1, aml_return(ret));
-        }
-        aml_append(ifctx, ifctx1);
-
-        ifctx1 = aml_if(aml_lless(rev, aml_int(2)));
-        {
-            /* call is for unsupported REV, bail out */
-            aml_append(ifctx1, aml_return(ret));
-        }
-        aml_append(ifctx, ifctx1);
+        build_append_pci_dsm_func0_common(ifctx, ret);
 
+        aml_append(ifctx, aml_store(zero, caps));
         aml_append(ifctx,
             aml_store(aml_call2("AIDX", bnum, sunum), acpi_index));
         /*
@@ -1388,6 +1462,7 @@ build_dsdt(GArray *table_data, BIOSLinker *linker,
         aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0A03")));
         aml_append(dev, aml_name_decl("_ADR", aml_int(0)));
         aml_append(dev, aml_name_decl("_UID", aml_int(pcmc->pci_root_uid)));
+        aml_append(dev, aml_pci_edsm());
         aml_append(sb_scope, dev);
         aml_append(dsdt, sb_scope);
 
@@ -1403,6 +1478,7 @@ build_dsdt(GArray *table_data, BIOSLinker *linker,
         aml_append(dev, aml_name_decl("_ADR", aml_int(0)));
         aml_append(dev, aml_name_decl("_UID", aml_int(pcmc->pci_root_uid)));
         aml_append(dev, build_q35_osc_method(!pm->pcihp_bridge_en));
+        aml_append(dev, aml_pci_edsm());
         aml_append(sb_scope, dev);
         if (mcfg_valid) {
             aml_append(sb_scope, build_q35_dram_controller(&mcfg));
@@ -1710,6 +1786,9 @@ build_dsdt(GArray *table_data, BIOSLinker *linker,
             Aml *scope = aml_scope("PCI0");
             /* Scan all PCI buses. Generate tables to support hotplug. */
             build_append_pci_bus_devices(scope, bus);
+            if (object_property_find(OBJECT(bus), ACPI_PCIHP_PROP_BSEL)) {
+                build_append_pcihp_slots(scope, bus);
+            }
             aml_append(sb_scope, scope);
         }
     }
diff --git a/hw/i386/kvm/meson.build b/hw/i386/kvm/meson.build
index 82dd6ae7c6..6621ba5cd7 100644
--- a/hw/i386/kvm/meson.build
+++ b/hw/i386/kvm/meson.build
@@ -9,6 +9,7 @@ i386_kvm_ss.add(when: 'CONFIG_XEN_EMU', if_true: files(
   'xen_evtchn.c',
   'xen_gnttab.c',
   'xen_xenstore.c',
+  'xenstore_impl.c',
   ))
 
 i386_ss.add_all(when: 'CONFIG_KVM', if_true: i386_kvm_ss)
diff --git a/hw/i386/kvm/trace-events b/hw/i386/kvm/trace-events
index b83c3eb965..e4c82de6f3 100644
--- a/hw/i386/kvm/trace-events
+++ b/hw/i386/kvm/trace-events
@@ -3,3 +3,18 @@ kvm_xen_unmap_pirq(int pirq, int gsi) "pirq %d gsi %d"
 kvm_xen_get_free_pirq(int pirq, int type) "pirq %d type %d"
 kvm_xen_bind_pirq(int pirq, int port) "pirq %d port %d"
 kvm_xen_unmask_pirq(int pirq, char *dev, int vector) "pirq %d dev %s vector %d"
+xenstore_error(unsigned int id, unsigned int tx_id, const char *err) "req %u tx %u err %s"
+xenstore_read(unsigned int tx_id, const char *path) "tx %u path %s"
+xenstore_write(unsigned int tx_id, const char *path) "tx %u path %s"
+xenstore_mkdir(unsigned int tx_id, const char *path) "tx %u path %s"
+xenstore_directory(unsigned int tx_id, const char *path) "tx %u path %s"
+xenstore_directory_part(unsigned int tx_id, const char *path, unsigned int offset) "tx %u path %s offset %u"
+xenstore_transaction_start(unsigned int new_tx) "new_tx %u"
+xenstore_transaction_end(unsigned int tx_id, bool commit) "tx %u commit %d"
+xenstore_rm(unsigned int tx_id, const char *path) "tx %u path %s"
+xenstore_get_perms(unsigned int tx_id, const char *path) "tx %u path %s"
+xenstore_set_perms(unsigned int tx_id, const char *path) "tx %u path %s"
+xenstore_watch(const char *path, const char *token) "path %s token %s"
+xenstore_unwatch(const char *path, const char *token) "path %s token %s"
+xenstore_reset_watches(void) ""
+xenstore_watch_event(const char *path, const char *token) "path %s token %s"
diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index 886fbf6b3b..98a7b85047 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -34,6 +34,7 @@
 #include "hw/pci/msi.h"
 #include "hw/pci/msix.h"
 #include "hw/irq.h"
+#include "hw/xen/xen_backend_ops.h"
 
 #include "xen_evtchn.h"
 #include "xen_overlay.h"
@@ -278,6 +279,17 @@ static const TypeInfo xen_evtchn_info = {
     .class_init    = xen_evtchn_class_init,
 };
 
+static struct evtchn_backend_ops emu_evtchn_backend_ops = {
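+    /*
+     * Handed to PV backend drivers via xen_evtchn_ops (set in
+     * xen_evtchn_create) so they use the emulated event channels.
+     */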
+    .open = xen_be_evtchn_open,
+    .bind_interdomain = xen_be_evtchn_bind_interdomain,
+    .unbind = xen_be_evtchn_unbind,
+    .close = xen_be_evtchn_close,
+    .get_fd = xen_be_evtchn_fd,
+    .notify = xen_be_evtchn_notify,
+    .unmask = xen_be_evtchn_unmask,
+    .pending = xen_be_evtchn_pending,
+};
+
 static void gsi_assert_bh(void *opaque)
 {
     struct vcpu_info *vi = kvm_xen_get_vcpu_info_hva(0);
@@ -318,6 +330,9 @@ void xen_evtchn_create(void)
     s->nr_pirq_inuse_words = DIV_ROUND_UP(s->nr_pirqs, 64);
     s->pirq_inuse_bitmap = g_new0(uint64_t, s->nr_pirq_inuse_words);
     s->pirq = g_new0(struct pirq_info, s->nr_pirqs);
+
+    /* Set event channel functions for backend drivers to use */
+    xen_evtchn_ops = &emu_evtchn_backend_ops;
 }
 
 void xen_evtchn_connect_gsis(qemu_irq *system_gsis)
diff --git a/hw/i386/kvm/xen_gnttab.c b/hw/i386/kvm/xen_gnttab.c
index 1e691ded32..21c30e3659 100644
--- a/hw/i386/kvm/xen_gnttab.c
+++ b/hw/i386/kvm/xen_gnttab.c
@@ -22,6 +22,7 @@
 
 #include "hw/sysbus.h"
 #include "hw/xen/xen.h"
+#include "hw/xen/xen_backend_ops.h"
 #include "xen_overlay.h"
 #include "xen_gnttab.h"
 
@@ -34,11 +35,10 @@
 #define TYPE_XEN_GNTTAB "xen-gnttab"
 OBJECT_DECLARE_SIMPLE_TYPE(XenGnttabState, XEN_GNTTAB)
 
-#define XEN_PAGE_SHIFT 12
-#define XEN_PAGE_SIZE (1ULL << XEN_PAGE_SHIFT)
-
 #define ENTRIES_PER_FRAME_V1 (XEN_PAGE_SIZE / sizeof(grant_entry_v1_t))
 
+static struct gnttab_backend_ops emu_gnttab_backend_ops;
+
 struct XenGnttabState {
     /*< private >*/
     SysBusDevice busdev;
@@ -57,6 +57,8 @@ struct XenGnttabState {
     MemoryRegion gnt_frames;
     MemoryRegion *gnt_aliases;
     uint64_t *gnt_frame_gpas;
+
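+    /*
+     * Per-ref count of active local mappings; GTF_reading/GTF_writing
+     * are cleared when the count drops to zero (see gnt_unref()).
+     */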
+    uint8_t *map_track;
 };
 
 struct XenGnttabState *xen_gnttab_singleton;
@@ -70,13 +72,11 @@ static void xen_gnttab_realize(DeviceState *dev, Error **errp)
         error_setg(errp, "Xen grant table support is for Xen emulation");
         return;
     }
-    s->nr_frames = 0;
     s->max_frames = kvm_xen_get_gnttab_max_frames();
     memory_region_init_ram(&s->gnt_frames, OBJECT(dev), "xen:grant_table",
                            XEN_PAGE_SIZE * s->max_frames, &error_abort);
     memory_region_set_enabled(&s->gnt_frames, true);
     s->entries.v1 = memory_region_get_ram_ptr(&s->gnt_frames);
-    memset(s->entries.v1, 0, XEN_PAGE_SIZE * s->max_frames);
 
     /* Create individual page-sizes aliases for overlays */
     s->gnt_aliases = (void *)g_new0(MemoryRegion, s->max_frames);
@@ -88,9 +88,18 @@ static void xen_gnttab_realize(DeviceState *dev, Error **errp)
         s->gnt_frame_gpas[i] = INVALID_GPA;
     }
 
+    s->nr_frames = 0;
+    memset(s->entries.v1, 0, XEN_PAGE_SIZE * s->max_frames);
+    s->entries.v1[GNTTAB_RESERVED_XENSTORE].flags = GTF_permit_access;
+    s->entries.v1[GNTTAB_RESERVED_XENSTORE].frame = XEN_SPECIAL_PFN(XENSTORE);
+
     qemu_mutex_init(&s->gnt_lock);
 
     xen_gnttab_singleton = s;
+
+    s->map_track = g_new0(uint8_t, s->max_frames * ENTRIES_PER_FRAME_V1);
+
+    xen_gnttab_ops = &emu_gnttab_backend_ops;
 }
 
 static int xen_gnttab_post_load(void *opaque, int version_id)
@@ -230,3 +239,309 @@ int xen_gnttab_query_size_op(struct gnttab_query_size *size)
     size->max_nr_frames = s->max_frames;
     return 0;
 }
+
+/* Track per-open refs, to allow close() to clean up. */
+struct active_ref {
+    MemoryRegionSection mrs;
+    void *virtaddr;
+    uint32_t refcnt;
+    int prot;
+};
+
+static void gnt_unref(XenGnttabState *s, grant_ref_t ref,
+                      MemoryRegionSection *mrs, int prot)
+{
+    if (mrs && mrs->mr) {
+        if (prot & PROT_WRITE) {
+            memory_region_set_dirty(mrs->mr, mrs->offset_within_region,
+                                    XEN_PAGE_SIZE);
+        }
+        memory_region_unref(mrs->mr);
+        mrs->mr = NULL;
+    }
+    assert(s->map_track[ref] != 0);
+
+    if (--s->map_track[ref] == 0) {
+        grant_entry_v1_t *gnt_p = &s->entries.v1[ref];
+        qatomic_and(&gnt_p->flags, (uint16_t)~(GTF_reading | GTF_writing));
+    }
+}
+
+static uint64_t gnt_ref(XenGnttabState *s, grant_ref_t ref, int prot)
+{
+    uint16_t mask = GTF_type_mask | GTF_sub_page;
+    grant_entry_v1_t gnt, *gnt_p;
+    int retries = 0;
+
+    if (ref >= s->max_frames * ENTRIES_PER_FRAME_V1 ||
+        s->map_track[ref] == UINT8_MAX) {
+        return INVALID_GPA;
+    }
+
+    if (prot & PROT_WRITE) {
+        mask |= GTF_readonly;
+    }
+
+    gnt_p = &s->entries.v1[ref];
+
+    /*
+     * The guest can legitimately be changing the GTF_readonly flag. Allow
+     * that, but don't let a malicious guest cause a livelock.
+     */
+    for (retries = 0; retries < 5; retries++) {
+        uint16_t new_flags;
+
+        /* Read the entry before an atomic operation on its flags */
+        gnt = *(volatile grant_entry_v1_t *)gnt_p;
+
+        if ((gnt.flags & mask) != GTF_permit_access ||
+            gnt.domid != DOMID_QEMU) {
+            return INVALID_GPA;
+        }
+
+        new_flags = gnt.flags | GTF_reading;
+        if (prot & PROT_WRITE) {
+            new_flags |= GTF_writing;
+        }
+
+        if (qatomic_cmpxchg(&gnt_p->flags, gnt.flags, new_flags) == gnt.flags) {
+            return (uint64_t)gnt.frame << XEN_PAGE_SHIFT;
+        }
+    }
+
+    return INVALID_GPA;
+}
+
+struct xengntdev_handle {
+    GHashTable *active_maps;
+};
+
+static int xen_be_gnttab_set_max_grants(struct xengntdev_handle *xgt,
+                                        uint32_t nr_grants)
+{
+    return 0;
+}
+
+static void *xen_be_gnttab_map_refs(struct xengntdev_handle *xgt,
+                                    uint32_t count, uint32_t domid,
+                                    uint32_t *refs, int prot)
+{
+    XenGnttabState *s = xen_gnttab_singleton;
+    struct active_ref *act;
+
+    if (!s) {
+        errno = ENOTSUP;
+        return NULL;
+    }
+
+    if (domid != xen_domid) {
+        errno = EINVAL;
+        return NULL;
+    }
+
+    if (!count || count > 4096) {
+        errno = EINVAL;
+        return NULL;
+    }
+
+    /*
+     * Making a contiguous mapping from potentially discontiguous grant
+     * references would be... distinctly non-trivial. We don't support it.
+     * Even changing the API to return an array of pointers, one per page,
+     * wouldn't be simple to use in PV backends because some structures
+     * actually cross page boundaries (e.g. 32-bit blkif_response ring
+     * entries are 12 bytes).
+     */
+    if (count != 1) {
+        errno = EINVAL;
+        return NULL;
+    }
+
+    QEMU_LOCK_GUARD(&s->gnt_lock);
+
+    act = g_hash_table_lookup(xgt->active_maps, GINT_TO_POINTER(refs[0]));
+    if (act) {
+        if ((prot & PROT_WRITE) && !(act->prot & PROT_WRITE)) {
+            if (gnt_ref(s, refs[0], prot) == INVALID_GPA) {
+                return NULL;
+            }
+            act->prot |= PROT_WRITE;
+        }
+        act->refcnt++;
+    } else {
+        uint64_t gpa = gnt_ref(s, refs[0], prot);
+        if (gpa == INVALID_GPA) {
+            errno = EINVAL;
+            return NULL;
+        }
+
+        act = g_new0(struct active_ref, 1);
+        act->prot = prot;
+        act->refcnt = 1;
+        act->mrs = memory_region_find(get_system_memory(), gpa, XEN_PAGE_SIZE);
+
+        if (act->mrs.mr &&
+            !int128_lt(act->mrs.size, int128_make64(XEN_PAGE_SIZE)) &&
+            memory_region_get_ram_addr(act->mrs.mr) != RAM_ADDR_INVALID) {
+            act->virtaddr = qemu_map_ram_ptr(act->mrs.mr->ram_block,
+                                             act->mrs.offset_within_region);
+        }
+        if (!act->virtaddr) {
+            gnt_unref(s, refs[0], &act->mrs, 0);
+            g_free(act);
+            errno = EINVAL;
+            return NULL;
+        }
+
+        s->map_track[refs[0]]++;
+        g_hash_table_insert(xgt->active_maps, GINT_TO_POINTER(refs[0]), act);
+    }
+
+    return act->virtaddr;
+}
+
+static gboolean do_unmap(gpointer key, gpointer value, gpointer user_data)
+{
+    XenGnttabState *s = user_data;
+    grant_ref_t gref = GPOINTER_TO_INT(key);
+    struct active_ref *act = value;
+
+    gnt_unref(s, gref, &act->mrs, act->prot);
+    g_free(act);
+    return true;
+}
+
+static int xen_be_gnttab_unmap(struct xengntdev_handle *xgt,
+                               void *start_address, uint32_t *refs,
+                               uint32_t count)
+{
+    XenGnttabState *s = xen_gnttab_singleton;
+    struct active_ref *act;
+
+    if (!s) {
+        return -ENOTSUP;
+    }
+
+    if (count != 1) {
+        return -EINVAL;
+    }
+
+    QEMU_LOCK_GUARD(&s->gnt_lock);
+
+    act = g_hash_table_lookup(xgt->active_maps, GINT_TO_POINTER(refs[0]));
+    if (!act) {
+        return -ENOENT;
+    }
+
+    if (act->virtaddr != start_address) {
+        return -EINVAL;
+    }
+
+    if (!--act->refcnt) {
+        do_unmap(GINT_TO_POINTER(refs[0]), act, s);
+        g_hash_table_remove(xgt->active_maps, GINT_TO_POINTER(refs[0]));
+    }
+
+    return 0;
+}
+
+/*
+ * This looks a bit like the one for true Xen in xen-operations.c but
+ * in emulation we don't support multi-page mappings. And under Xen we
+ * *want* the multi-page mappings so we have fewer bounces through the
+ * kernel and the hypervisor. So the code paths end up being similar,
+ * but different.
+ */
+static int xen_be_gnttab_copy(struct xengntdev_handle *xgt, bool to_domain,
+                              uint32_t domid, XenGrantCopySegment *segs,
+                              uint32_t nr_segs, Error **errp)
+{
+    int prot = to_domain ? PROT_WRITE : PROT_READ;
+    unsigned int i;
+
+    for (i = 0; i < nr_segs; i++) {
+        XenGrantCopySegment *seg = &segs[i];
+        void *page;
+        uint32_t ref = to_domain ? seg->dest.foreign.ref :
+            seg->source.foreign.ref;
+
+        page = xen_be_gnttab_map_refs(xgt, 1, domid, &ref, prot);
+        if (!page) {
+            if (errp) {
+                error_setg_errno(errp, errno,
+                                 "xen_be_gnttab_map_refs failed");
+            }
+            return -errno;
+        }
+
+        if (to_domain) {
+            memcpy(page + seg->dest.foreign.offset, seg->source.virt,
+                   seg->len);
+        } else {
+            memcpy(seg->dest.virt, page + seg->source.foreign.offset,
+                   seg->len);
+        }
+
+        if (xen_be_gnttab_unmap(xgt, page, &ref, 1)) {
+            if (errp) {
+                error_setg_errno(errp, errno, "xen_be_gnttab_unmap failed");
+            }
+            return -errno;
+        }
+    }
+
+    return 0;
+}
+
+static struct xengntdev_handle *xen_be_gnttab_open(void)
+{
+    struct xengntdev_handle *xgt = g_new0(struct xengntdev_handle, 1);
+
+    xgt->active_maps = g_hash_table_new(g_direct_hash, g_direct_equal);
+    return xgt;
+}
+
+static int xen_be_gnttab_close(struct xengntdev_handle *xgt)
+{
+    XenGnttabState *s = xen_gnttab_singleton;
+
+    if (!s) {
+        return -ENOTSUP;
+    }
+
+    g_hash_table_foreach_remove(xgt->active_maps, do_unmap, s);
+    g_hash_table_destroy(xgt->active_maps);
+    g_free(xgt);
+    return 0;
+}
+
+static struct gnttab_backend_ops emu_gnttab_backend_ops = {
+    .open = xen_be_gnttab_open,
+    .close = xen_be_gnttab_close,
+    .grant_copy = xen_be_gnttab_copy,
+    .set_max_grants = xen_be_gnttab_set_max_grants,
+    .map_refs = xen_be_gnttab_map_refs,
+    .unmap = xen_be_gnttab_unmap,
+};
+
+int xen_gnttab_reset(void)
+{
+    XenGnttabState *s = xen_gnttab_singleton;
+
+    if (!s) {
+        return -ENOTSUP;
+    }
+
+    QEMU_LOCK_GUARD(&s->gnt_lock);
+
+    s->nr_frames = 0;
+
+    memset(s->entries.v1, 0, XEN_PAGE_SIZE * s->max_frames);
+
+    s->entries.v1[GNTTAB_RESERVED_XENSTORE].flags = GTF_permit_access;
+    s->entries.v1[GNTTAB_RESERVED_XENSTORE].frame = XEN_SPECIAL_PFN(XENSTORE);
+
+    memset(s->map_track, 0, s->max_frames * ENTRIES_PER_FRAME_V1);
+
+    return 0;
+}
diff --git a/hw/i386/kvm/xen_gnttab.h b/hw/i386/kvm/xen_gnttab.h
index 3bdbe96191..ee215239b0 100644
--- a/hw/i386/kvm/xen_gnttab.h
+++ b/hw/i386/kvm/xen_gnttab.h
@@ -13,6 +13,7 @@
 #define QEMU_XEN_GNTTAB_H
 
 void xen_gnttab_create(void);
+int xen_gnttab_reset(void);
 int xen_gnttab_map_page(uint64_t idx, uint64_t gfn);
 
 struct gnttab_set_version;
diff --git a/hw/i386/kvm/xen_xenstore.c b/hw/i386/kvm/xen_xenstore.c
index 14193ef3f9..2cadafd56a 100644
--- a/hw/i386/kvm/xen_xenstore.c
+++ b/hw/i386/kvm/xen_xenstore.c
@@ -21,6 +21,7 @@
 
 #include "hw/sysbus.h"
 #include "hw/xen/xen.h"
+#include "hw/xen/xen_backend_ops.h"
 #include "xen_overlay.h"
 #include "xen_evtchn.h"
 #include "xen_xenstore.h"
@@ -28,15 +29,17 @@
 #include "sysemu/kvm.h"
 #include "sysemu/kvm_xen.h"
 
+#include "trace.h"
+
+#include "xenstore_impl.h"
+
 #include "hw/xen/interface/io/xs_wire.h"
 #include "hw/xen/interface/event_channel.h"
+#include "hw/xen/interface/grant_table.h"
 
 #define TYPE_XEN_XENSTORE "xen-xenstore"
 OBJECT_DECLARE_SIMPLE_TYPE(XenXenstoreState, XEN_XENSTORE)
 
-#define XEN_PAGE_SHIFT 12
-#define XEN_PAGE_SIZE (1ULL << XEN_PAGE_SHIFT)
-
 #define ENTRIES_PER_FRAME_V1 (XEN_PAGE_SIZE / sizeof(grant_entry_v1_t))
 #define ENTRIES_PER_FRAME_V2 (XEN_PAGE_SIZE / sizeof(grant_entry_v2_t))
 
@@ -47,6 +50,9 @@ struct XenXenstoreState {
     SysBusDevice busdev;
     /*< public >*/
 
+    XenstoreImplState *impl;
+    GList *watch_events; /* for the guest */
+
     MemoryRegion xenstore_page;
     struct xenstore_domain_interface *xs;
     uint8_t req_data[XENSTORE_HEADER_SIZE + XENSTORE_PAYLOAD_MAX];
@@ -59,15 +65,54 @@ struct XenXenstoreState {
     evtchn_port_t guest_port;
     evtchn_port_t be_port;
     struct xenevtchn_handle *eh;
+
+    uint8_t *impl_state;
+    uint32_t impl_state_size;
+
+    struct xengntdev_handle *gt;
+    void *granted_xs;
 };
 
 struct XenXenstoreState *xen_xenstore_singleton;
 
 static void xen_xenstore_event(void *opaque);
+static void fire_watch_cb(void *opaque, const char *path, const char *token);
+
+static struct xenstore_backend_ops emu_xenstore_backend_ops;
+
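+/*
+ * Write a printf-formatted value to /local/domain/<domid>/<relpath> in the
+ * emulated xenstore and apply the given permissions list to the node. Used
+ * below to populate the default nodes at realize time.
+ */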
+static void G_GNUC_PRINTF (4, 5) relpath_printf(XenXenstoreState *s,
+                                                GList *perms,
+                                                const char *relpath,
+                                                const char *fmt, ...)
+{
+    gchar *abspath;
+    gchar *value;
+    va_list args;
+    GByteArray *data;
+    int err;
+
+    abspath = g_strdup_printf("/local/domain/%u/%s", xen_domid, relpath);
+    va_start(args, fmt);
+    value = g_strdup_vprintf(fmt, args);
+    va_end(args);
+
+    data = g_byte_array_new_take((void *)value, strlen(value));
+
+    err = xs_impl_write(s->impl, DOMID_QEMU, XBT_NULL, abspath, data);
+    assert(!err);
+
+    g_byte_array_unref(data);
+
+    err = xs_impl_set_perms(s->impl, DOMID_QEMU, XBT_NULL, abspath, perms);
+    assert(!err);
+
+    g_free(abspath);
+}
 
 static void xen_xenstore_realize(DeviceState *dev, Error **errp)
 {
     XenXenstoreState *s = XEN_XENSTORE(dev);
+    GList *perms;
 
     if (xen_mode != XEN_EMULATE) {
         error_setg(errp, "Xen xenstore support is for Xen emulation");
@@ -89,6 +134,50 @@ static void xen_xenstore_realize(DeviceState *dev, Error **errp)
     }
     aio_set_fd_handler(qemu_get_aio_context(), xen_be_evtchn_fd(s->eh), true,
                        xen_xenstore_event, NULL, NULL, NULL, s);
+
+    s->impl = xs_impl_create(xen_domid);
+
+    /* Populate the default nodes */
+
+    /* Nodes owned by 'dom0' but readable by the guest */
+    perms = g_list_append(NULL, xs_perm_as_string(XS_PERM_NONE, DOMID_QEMU));
+    perms = g_list_append(perms, xs_perm_as_string(XS_PERM_READ, xen_domid));
+
+    relpath_printf(s, perms, "", "%s", "");
+
+    relpath_printf(s, perms, "domid", "%u", xen_domid);
+
+    relpath_printf(s, perms, "control/platform-feature-xs_reset_watches", "%u", 1);
+    relpath_printf(s, perms, "control/platform-feature-multiprocessor-suspend", "%u", 1);
+
+    relpath_printf(s, perms, "platform/acpi", "%u", 1);
+    relpath_printf(s, perms, "platform/acpi_s3", "%u", 1);
+    relpath_printf(s, perms, "platform/acpi_s4", "%u", 1);
+    relpath_printf(s, perms, "platform/acpi_laptop_slate", "%u", 0);
+
+    g_list_free_full(perms, g_free);
+
+    /* Nodes owned by the guest */
+    perms = g_list_append(NULL, xs_perm_as_string(XS_PERM_NONE, xen_domid));
+
+    relpath_printf(s, perms, "attr", "%s", "");
+
+    relpath_printf(s, perms, "control/shutdown", "%s", "");
+    relpath_printf(s, perms, "control/feature-poweroff", "%u", 1);
+    relpath_printf(s, perms, "control/feature-reboot", "%u", 1);
+    relpath_printf(s, perms, "control/feature-suspend", "%u", 1);
+    relpath_printf(s, perms, "control/feature-s3", "%u", 1);
+    relpath_printf(s, perms, "control/feature-s4", "%u", 1);
+
+    relpath_printf(s, perms, "data", "%s", "");
+    relpath_printf(s, perms, "device", "%s", "");
+    relpath_printf(s, perms, "drivers", "%s", "");
+    relpath_printf(s, perms, "error", "%s", "");
+    relpath_printf(s, perms, "feature", "%s", "");
+
+    g_list_free_full(perms, g_free);
+
+    xen_xenstore_ops = &emu_xenstore_backend_ops;
 }
 
 static bool xen_xenstore_is_needed(void *opaque)
@@ -99,16 +188,26 @@ static bool xen_xenstore_is_needed(void *opaque)
 static int xen_xenstore_pre_save(void *opaque)
 {
     XenXenstoreState *s = opaque;
+    GByteArray *save;
 
     if (s->eh) {
         s->guest_port = xen_be_evtchn_get_guest_port(s->eh);
     }
+
+    g_free(s->impl_state);
+    save = xs_impl_serialize(s->impl);
+    s->impl_state = save->data;
+    s->impl_state_size = save->len;
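+    /* Free only the GByteArray wrapper; the buffer is now owned by s->impl_state. */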
+    g_byte_array_free(save, false);
+
     return 0;
 }
 
 static int xen_xenstore_post_load(void *opaque, int ver)
 {
     XenXenstoreState *s = opaque;
+    GByteArray *save;
+    int ret;
 
     /*
      * As qemu/dom0, rebind to the guest's port. The Windows drivers may
@@ -125,11 +224,18 @@ static int xen_xenstore_post_load(void *opaque, int ver)
         }
         s->be_port = be_port;
     }
-    return 0;
+
+    save = g_byte_array_new_take(s->impl_state, s->impl_state_size);
+    s->impl_state = NULL;
+    s->impl_state_size = 0;
+
+    ret = xs_impl_deserialize(s->impl, save, xen_domid, fire_watch_cb, s);
+    return ret;
 }
 
 static const VMStateDescription xen_xenstore_vmstate = {
     .name = "xen_xenstore",
+    .unmigratable = 1, /* The PV back ends don't migrate yet */
     .version_id = 1,
     .minimum_version_id = 1,
     .needed = xen_xenstore_is_needed,
@@ -145,6 +251,10 @@ static const VMStateDescription xen_xenstore_vmstate = {
         VMSTATE_BOOL(rsp_pending, XenXenstoreState),
         VMSTATE_UINT32(guest_port, XenXenstoreState),
         VMSTATE_BOOL(fatal_error, XenXenstoreState),
+        VMSTATE_UINT32(impl_state_size, XenXenstoreState),
+        VMSTATE_VARRAY_UINT32_ALLOC(impl_state, XenXenstoreState,
+                                    impl_state_size, 0,
+                                    vmstate_info_uint8, uint8_t),
         VMSTATE_END_OF_LIST()
     }
 };
@@ -213,20 +323,761 @@ static void reset_rsp(XenXenstoreState *s)
     s->rsp_offset = 0;
 }
 
+static void xs_error(XenXenstoreState *s, unsigned int id,
+                     xs_transaction_t tx_id, int errnum)
+{
+    struct xsd_sockmsg *rsp = (struct xsd_sockmsg *)s->rsp_data;
+    const char *errstr = NULL;
+
+    for (unsigned int i = 0; i < ARRAY_SIZE(xsd_errors); i++) {
+        struct xsd_errors *xsd_error = &xsd_errors[i];
+
+        if (xsd_error->errnum == errnum) {
+            errstr = xsd_error->errstring;
+            break;
+        }
+    }
+    assert(errstr);
+
+    trace_xenstore_error(id, tx_id, errstr);
+
+    rsp->type = XS_ERROR;
+    rsp->req_id = id;
+    rsp->tx_id = tx_id;
+    rsp->len = (uint32_t)strlen(errstr) + 1;
+
+    memcpy(&rsp[1], errstr, rsp->len);
+}
+
+static void xs_ok(XenXenstoreState *s, unsigned int type, unsigned int req_id,
+                  xs_transaction_t tx_id)
+{
+    struct xsd_sockmsg *rsp = (struct xsd_sockmsg *)s->rsp_data;
+    const char *okstr = "OK";
+
+    rsp->type = type;
+    rsp->req_id = req_id;
+    rsp->tx_id = tx_id;
+    rsp->len = (uint32_t)strlen(okstr) + 1;
+
+    memcpy(&rsp[1], okstr, rsp->len);
+}
+
+/*
+ * The correct request and response formats are documented in xen.git:
+ * docs/misc/xenstore.txt. A summary is given below for convenience.
+ * The '|' symbol represents a NUL character.
+ *
+ * ---------- Database read, write and permissions operations ----------
+ *
+ * READ                    <path>|                 <value|>
+ * WRITE                   <path>|<value|>
+ *         Store and read the octet string <value> at <path>.
+ *         WRITE creates any missing parent paths, with empty values.
+ *
+ * MKDIR                   <path>|
+ *         Ensures that the <path> exists, if necessary by creating
+ *         it and any missing parents with empty values.  If <path>
+ *         or any parent already exists, its value is left unchanged.
+ *
+ * RM                      <path>|
+ *         Ensures that the <path> does not exist, by deleting
+ *         it and all of its children.  It is not an error if <path> does
+ *         not exist, but it _is_ an error if <path>'s immediate parent
+ *         does not exist either.
+ *
+ * DIRECTORY               <path>|                 <child-leaf-name>|*
+ *         Gives a list of the immediate children of <path>, as only the
+ *         leafnames.  The resulting children are each named
+ *         <path>/<child-leaf-name>.
+ *
+ * DIRECTORY_PART          <path>|<offset>         <gencnt>|<child-leaf-name>|*
+ *         Same as DIRECTORY, but to be used for children lists longer than
+ *         XENSTORE_PAYLOAD_MAX. Inputs are <path> and the byte offset into
+ *         the list of children to return. Return values are the generation
+ *         count <gencnt> of the node (to be used to ensure the node hasn't
+ *         changed between two reads: <gencnt> being the same for multiple
+ *         reads guarantees the node hasn't changed) and the list of children
+ *         starting at the specified <offset> of the complete list.
+ *
+ * GET_PERMS               <path>|                 <perm-as-string>|+
+ * SET_PERMS               <path>|<perm-as-string>|+?
+ *         <perm-as-string> is one of the following
+ *                 w<domid>        write only
+ *                 r<domid>        read only
+ *                 b<domid>        both read and write
+ *                 n<domid>        no access
+ *         See https://wiki.xen.org/wiki/XenBus section
+ *         `Permissions' for details of the permissions system.
+ *         It is possible to set permissions for the special watch paths
+ *         "@introduceDomain" and "@releaseDomain" to enable receiving those
+ *         watches in unprivileged domains.
+ *
+ * ---------- Watches ----------
+ *
+ * WATCH                   <wpath>|<token>|?
+ *         Adds a watch.
+ *
+ *         When a <path> is modified (including path creation, removal,
+ *         contents change or permissions change) this generates an event
+ *         on the changed <path>.  Changes made in transactions cause an
+ *         event only if and when committed.  Each occurring event is
+ *         matched against all the watches currently set up, and each
+ *         matching watch results in a WATCH_EVENT message (see below).
+ *
+ *         The event's path matches the watch's <wpath> if it is a child
+ *         of <wpath>.
+ *
+ *         <wpath> can be a <path> to watch or @<wspecial>.  In the
+ *         latter case <wspecial> may have any syntax but it matches
+ *         (according to the rules above) only the following special
+ *         events which are invented by xenstored:
+ *             @introduceDomain    occurs on INTRODUCE
+ *             @releaseDomain      occurs on any domain crash or
+ *                                 shutdown, and also on RELEASE
+ *                                 and domain destruction
+ *         <wspecial> events are sent only to privileged callers, or to
+ *         domains explicitly enabled via SET_PERMS.
+ *
+ *         When a watch is first set up it is triggered once straight
+ *         away, with <path> equal to <wpath>.  Watches may be triggered
+ *         spuriously.  The tx_id in a WATCH request is ignored.
+ *
+ *         Watches are supposed to be restricted by the permissions
+ *         system but in practice the implementation is imperfect.
+ *         Applications should not rely on being sent a notification for
+ *         paths that they cannot read; however, an application may rely
+ *         on being sent a watch when a path which it _is_ able to read
+ *         is deleted even if that leaves only a nonexistent unreadable
+ *         parent.  A notification may be omitted if a node's permissions
+ *         are changed so as to make it unreadable, in which case future
+ *         notifications may be suppressed (and if the node is later made
+ *         readable, some notifications may have been lost).
+ *
+ * WATCH_EVENT                                     <epath>|<token>|
+ *         Unsolicited `reply' generated for matching modification events
+ *         as described above.  req_id and tx_id are both 0.
+ *
+ *         <epath> is the event's path, ie the actual path that was
+ *         modified; however if the event was the recursive removal of a
+ *         parent of <wpath>, <epath> is just <wpath> (rather than the
+ *         actual path which was removed).  So <epath> is a child of
+ *         <wpath>, regardless.
+ *
+ *         Iff <wpath> for the watch was specified as a relative pathname,
+ *         the <epath> path will also be relative (with the same base,
+ *         obviously).
+ *
+ * UNWATCH                 <wpath>|<token>|?
+ *
+ * RESET_WATCHES           |
+ *         Reset all watches and transactions of the caller.
+ *
+ * ---------- Transactions ----------
+ *
+ * TRANSACTION_START       |                       <transid>|
+ *         <transid> is an opaque uint32_t allocated by xenstored
+ *         represented as unsigned decimal.  After this, the transaction
+ *         may be referenced by using <transid> (as 32-bit binary) in the
+ *         tx_id request header field.  When a transaction is started the
+ *         whole db is copied; reads and writes happen on the copy.
+ *         It is not legal to send non-0 tx_id in TRANSACTION_START.
+ *
+ * TRANSACTION_END         T|
+ * TRANSACTION_END         F|
+ *         tx_id must refer to an existing transaction.  After this
+ *         request the tx_id is no longer valid and may be reused by
+ *         xenstore.  If F, the transaction is discarded.  If T,
+ *         it is committed: if there were any other intervening writes
+ *         then our END gets EAGAIN.
+ *
+ *         The plan is that in the future only intervening `conflicting'
+ *         writes cause EAGAIN, meaning only writes or other commits
+ *         which changed paths which were read or written in the
+ *         transaction at hand.
+ *
+ */
+
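+/*
+ * Illustrative example (not part of the protocol description above): a
+ * hypothetical client reading "/local/domain/1/name" would place a
+ * struct xsd_sockmsg header on the ring, followed by the NUL-terminated
+ * path as payload:
+ *
+ *     struct xsd_sockmsg msg = {
+ *         .type = XS_READ,
+ *         .req_id = 42,     (arbitrary; echoed back in the reply)
+ *         .tx_id = 0,       (XBT_NULL: not in a transaction)
+ *         .len = sizeof("/local/domain/1/name"),     (21 bytes)
+ *     };
+ *
+ * The reply carries the same req_id and the node's contents as payload.
+ */
+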
+static void xs_read(XenXenstoreState *s, unsigned int req_id,
+                    xs_transaction_t tx_id, uint8_t *req_data, unsigned int len)
+{
+    const char *path = (const char *)req_data;
+    struct xsd_sockmsg *rsp = (struct xsd_sockmsg *)s->rsp_data;
+    uint8_t *rsp_data = (uint8_t *)&rsp[1];
+    g_autoptr(GByteArray) data = g_byte_array_new();
+    int err;
+
+    if (len == 0 || req_data[len - 1] != '\0') {
+        xs_error(s, req_id, tx_id, EINVAL);
+        return;
+    }
+
+    trace_xenstore_read(tx_id, path);
+    err = xs_impl_read(s->impl, xen_domid, tx_id, path, data);
+    if (err) {
+        xs_error(s, req_id, tx_id, err);
+        return;
+    }
+
+    rsp->type = XS_READ;
+    rsp->req_id = req_id;
+    rsp->tx_id = tx_id;
+    rsp->len = 0;
+
+    len = data->len;
+    if (len > XENSTORE_PAYLOAD_MAX) {
+        xs_error(s, req_id, tx_id, E2BIG);
+        return;
+    }
+
+    memcpy(&rsp_data[rsp->len], data->data, len);
+    rsp->len += len;
+}
+
+static void xs_write(XenXenstoreState *s, unsigned int req_id,
+                     xs_transaction_t tx_id, uint8_t *req_data,
+                     unsigned int len)
+{
+    g_autoptr(GByteArray) data = g_byte_array_new();
+    const char *path;
+    int err;
+
+    if (len == 0) {
+        xs_error(s, req_id, tx_id, EINVAL);
+        return;
+    }
+
+    path = (const char *)req_data;
+
+    while (len--) {
+        if (*req_data++ == '\0') {
+            break;
+        }
+        if (len == 0) {
+            xs_error(s, req_id, tx_id, EINVAL);
+            return;
+        }
+    }
+
+    g_byte_array_append(data, req_data, len);
+
+    trace_xenstore_write(tx_id, path);
+    err = xs_impl_write(s->impl, xen_domid, tx_id, path, data);
+    if (err) {
+        xs_error(s, req_id, tx_id, err);
+        return;
+    }
+
+    xs_ok(s, XS_WRITE, req_id, tx_id);
+}
+
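+/*
+ * For reference (illustrative): an XS_WRITE payload of "foo\0bar"
+ * (len == 7) splits as path "foo" and data "bar". The scan loop above
+ * leaves req_data just past the NUL and len == 3; note the data itself
+ * need not be NUL-terminated.
+ */
+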
+static void xs_mkdir(XenXenstoreState *s, unsigned int req_id,
+                     xs_transaction_t tx_id, uint8_t *req_data,
+                     unsigned int len)
+{
+    g_autoptr(GByteArray) data = g_byte_array_new();
+    const char *path;
+    int err;
+
+    if (len == 0 || req_data[len - 1] != '\0') {
+        xs_error(s, req_id, tx_id, EINVAL);
+        return;
+    }
+
+    path = (const char *)req_data;
+
+    trace_xenstore_mkdir(tx_id, path);
+    err = xs_impl_read(s->impl, xen_domid, tx_id, path, data);
+    if (err == ENOENT) {
+        err = xs_impl_write(s->impl, xen_domid, tx_id, path, data);
+    }
+
+    if (err) {
+        xs_error(s, req_id, tx_id, err);
+        return;
+    }
+
+    xs_ok(s, XS_MKDIR, req_id, tx_id);
+}
+
+static void xs_append_strings(XenXenstoreState *s, struct xsd_sockmsg *rsp,
+                              GList *strings, unsigned int start, bool truncate)
+{
+    uint8_t *rsp_data = (uint8_t *)&rsp[1];
+    GList *l;
+
+    for (l = strings; l; l = l->next) {
+        size_t len = strlen(l->data) + 1; /* Including the NUL termination */
+        char *str = l->data;
+
+        if (rsp->len + len > XENSTORE_PAYLOAD_MAX) {
+            if (truncate) {
+                len = XENSTORE_PAYLOAD_MAX - rsp->len;
+                if (!len) {
+                    return;
+                }
+            } else {
+                xs_error(s, rsp->req_id, rsp->tx_id, E2BIG);
+                return;
+            }
+        }
+
+        if (start) {
+            if (start >= len) {
+                start -= len;
+                continue;
+            }
+
+            str += start;
+            len -= start;
+            start = 0;
+        }
+
+        memcpy(&rsp_data[rsp->len], str, len);
+        rsp->len += len;
+    }
+    /* XS_DIRECTORY_PART wants an extra NUL to indicate the end */
+    if (truncate && rsp->len < XENSTORE_PAYLOAD_MAX) {
+        rsp_data[rsp->len++] = '\0';
+    }
+}
+
+static void xs_directory(XenXenstoreState *s, unsigned int req_id,
+                         xs_transaction_t tx_id, uint8_t *req_data,
+                         unsigned int len)
+{
+    struct xsd_sockmsg *rsp = (struct xsd_sockmsg *)s->rsp_data;
+    GList *items = NULL;
+    const char *path;
+    int err;
+
+    if (len == 0 || req_data[len - 1] != '\0') {
+        xs_error(s, req_id, tx_id, EINVAL);
+        return;
+    }
+
+    path = (const char *)req_data;
+
+    trace_xenstore_directory(tx_id, path);
+    err = xs_impl_directory(s->impl, xen_domid, tx_id, path, NULL, &items);
+    if (err != 0) {
+        xs_error(s, req_id, tx_id, err);
+        return;
+    }
+
+    rsp->type = XS_DIRECTORY;
+    rsp->req_id = req_id;
+    rsp->tx_id = tx_id;
+    rsp->len = 0;
+
+    xs_append_strings(s, rsp, items, 0, false);
+
+    g_list_free_full(items, g_free);
+}
+
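+/*
+ * Example (illustrative): listing a node whose children are "backend"
+ * and "device" produces the payload "backend\0device\0" (15 bytes).
+ */
+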
+static void xs_directory_part(XenXenstoreState *s, unsigned int req_id,
+                              xs_transaction_t tx_id, uint8_t *req_data,
+                              unsigned int len)
+{
+    const char *offset_str, *path = (const char *)req_data;
+    struct xsd_sockmsg *rsp = (struct xsd_sockmsg *)s->rsp_data;
+    char *rsp_data = (char *)&rsp[1];
+    uint64_t gencnt = 0;
+    unsigned int offset;
+    GList *items = NULL;
+    int err;
+
+    if (len == 0) {
+        xs_error(s, req_id, tx_id, EINVAL);
+        return;
+    }
+
+    while (len--) {
+        if (*req_data++ == '\0') {
+            break;
+        }
+        if (len == 0) {
+            xs_error(s, req_id, tx_id, EINVAL);
+            return;
+        }
+    }
+
+    offset_str = (const char *)req_data;
+    while (len--) {
+        if (*req_data++ == '\0') {
+            break;
+        }
+        if (len == 0) {
+            xs_error(s, req_id, tx_id, EINVAL);
+            return;
+        }
+    }
+
+    if (len) {
+        xs_error(s, req_id, tx_id, EINVAL);
+        return;
+    }
+
+    if (qemu_strtoui(offset_str, NULL, 10, &offset) < 0) {
+        xs_error(s, req_id, tx_id, EINVAL);
+        return;
+    }
+
+    trace_xenstore_directory_part(tx_id, path, offset);
+    err = xs_impl_directory(s->impl, xen_domid, tx_id, path, &gencnt, &items);
+    if (err != 0) {
+        xs_error(s, req_id, tx_id, err);
+        return;
+    }
+
+    rsp->type = XS_DIRECTORY_PART;
+    rsp->req_id = req_id;
+    rsp->tx_id = tx_id;
+    rsp->len = snprintf(rsp_data, XENSTORE_PAYLOAD_MAX, "%" PRIu64, gencnt) + 1;
+
+    xs_append_strings(s, rsp, items, offset, true);
+
+    g_list_free_full(items, g_free);
+}
+
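+/*
+ * Illustrative DIRECTORY_PART reply for the node above, with offset 0
+ * and gencnt 3: "3\0backend\0device\0\0" -- the generation count as a
+ * decimal string, the children from the requested byte offset, and the
+ * extra NUL appended by xs_append_strings() in truncate mode.
+ */
+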
+static void xs_transaction_start(XenXenstoreState *s, unsigned int req_id,
+                                 xs_transaction_t tx_id, uint8_t *req_data,
+                                 unsigned int len)
+{
+    struct xsd_sockmsg *rsp = (struct xsd_sockmsg *)s->rsp_data;
+    char *rsp_data = (char *)&rsp[1];
+    int err;
+
+    if (len != 1 || req_data[0] != '\0') {
+        xs_error(s, req_id, tx_id, EINVAL);
+        return;
+    }
+
+    rsp->type = XS_TRANSACTION_START;
+    rsp->req_id = req_id;
+    rsp->tx_id = tx_id;
+    rsp->len = 0;
+
+    err = xs_impl_transaction_start(s->impl, xen_domid, &tx_id);
+    if (err) {
+        xs_error(s, req_id, tx_id, err);
+        return;
+    }
+
+    trace_xenstore_transaction_start(tx_id);
+
+    rsp->len = snprintf(rsp_data, XENSTORE_PAYLOAD_MAX, "%u", tx_id);
+    assert(rsp->len < XENSTORE_PAYLOAD_MAX);
+    rsp->len++;
+}
+
+static void xs_transaction_end(XenXenstoreState *s, unsigned int req_id,
+                               xs_transaction_t tx_id, uint8_t *req_data,
+                               unsigned int len)
+{
+    bool commit;
+    int err;
+
+    if (len != 2 || req_data[1] != '\0') {
+        xs_error(s, req_id, tx_id, EINVAL);
+        return;
+    }
+
+    switch (req_data[0]) {
+    case 'T':
+        commit = true;
+        break;
+    case 'F':
+        commit = false;
+        break;
+    default:
+        xs_error(s, req_id, tx_id, EINVAL);
+        return;
+    }
+
+    trace_xenstore_transaction_end(tx_id, commit);
+    err = xs_impl_transaction_end(s->impl, xen_domid, tx_id, commit);
+    if (err) {
+        xs_error(s, req_id, tx_id, err);
+        return;
+    }
+
+    xs_ok(s, XS_TRANSACTION_END, req_id, tx_id);
+}
+
+static void xs_rm(XenXenstoreState *s, unsigned int req_id,
+                  xs_transaction_t tx_id, uint8_t *req_data, unsigned int len)
+{
+    const char *path = (const char *)req_data;
+    int err;
+
+    if (len == 0 || req_data[len - 1] != '\0') {
+        xs_error(s, req_id, tx_id, EINVAL);
+        return;
+    }
+
+    trace_xenstore_rm(tx_id, path);
+    err = xs_impl_rm(s->impl, xen_domid, tx_id, path);
+    if (err) {
+        xs_error(s, req_id, tx_id, err);
+        return;
+    }
+
+    xs_ok(s, XS_RM, req_id, tx_id);
+}
+
+static void xs_get_perms(XenXenstoreState *s, unsigned int req_id,
+                         xs_transaction_t tx_id, uint8_t *req_data,
+                         unsigned int len)
+{
+    const char *path = (const char *)req_data;
+    struct xsd_sockmsg *rsp = (struct xsd_sockmsg *)s->rsp_data;
+    GList *perms = NULL;
+    int err;
+
+    if (len == 0 || req_data[len - 1] != '\0') {
+        xs_error(s, req_id, tx_id, EINVAL);
+        return;
+    }
+
+    trace_xenstore_get_perms(tx_id, path);
+    err = xs_impl_get_perms(s->impl, xen_domid, tx_id, path, &perms);
+    if (err) {
+        xs_error(s, req_id, tx_id, err);
+        return;
+    }
+
+    rsp->type = XS_GET_PERMS;
+    rsp->req_id = req_id;
+    rsp->tx_id = tx_id;
+    rsp->len = 0;
+
+    xs_append_strings(s, rsp, perms, 0, false);
+
+    g_list_free_full(perms, g_free);
+}
+
+static void xs_set_perms(XenXenstoreState *s, unsigned int req_id,
+                         xs_transaction_t tx_id, uint8_t *req_data,
+                         unsigned int len)
+{
+    const char *path = (const char *)req_data;
+    uint8_t *perm;
+    GList *perms = NULL;
+    int err;
+
+    if (len == 0) {
+        xs_error(s, req_id, tx_id, EINVAL);
+        return;
+    }
+
+    while (len--) {
+        if (*req_data++ == '\0') {
+            break;
+        }
+        if (len == 0) {
+            xs_error(s, req_id, tx_id, EINVAL);
+            return;
+        }
+    }
+
+    perm = req_data;
+    while (len--) {
+        if (*req_data++ == '\0') {
+            perms = g_list_append(perms, perm);
+            perm = req_data;
+        }
+    }
+
+    /*
+     * Note that there may be trailing garbage at the end of the buffer.
+     * This is explicitly permitted by the '?' at the end of the definition:
+     *
+     *    SET_PERMS         <path>|<perm-as-string>|+?
+     */
+
+    trace_xenstore_set_perms(tx_id, path);
+    err = xs_impl_set_perms(s->impl, xen_domid, tx_id, path, perms);
+    g_list_free(perms);
+    if (err) {
+        xs_error(s, req_id, tx_id, err);
+        return;
+    }
+
+    xs_ok(s, XS_SET_PERMS, req_id, tx_id);
+}
+
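+/*
+ * Example (illustrative): a SET_PERMS payload of "/foo\0n0\0r1\0" makes
+ * dom0 the owner of /foo (the owner always has full access), denies all
+ * other domains access by default ('n' in the first perm), and grants
+ * domain 1 read-only access.
+ */
+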
+static void xs_watch(XenXenstoreState *s, unsigned int req_id,
+                     xs_transaction_t tx_id, uint8_t *req_data,
+                     unsigned int len)
+{
+    const char *token, *path = (const char *)req_data;
+    int err;
+
+    if (len == 0) {
+        xs_error(s, req_id, tx_id, EINVAL);
+        return;
+    }
+
+    while (len--) {
+        if (*req_data++ == '\0') {
+            break;
+        }
+        if (len == 0) {
+            xs_error(s, req_id, tx_id, EINVAL);
+            return;
+        }
+    }
+
+    token = (const char *)req_data;
+    while (len--) {
+        if (*req_data++ == '\0') {
+            break;
+        }
+        if (len == 0) {
+            xs_error(s, req_id, tx_id, EINVAL);
+            return;
+        }
+    }
+
+    /*
+     * Note that there may be trailing garbage at the end of the buffer.
+     * This is explicitly permitted by the '?' at the end of the definition:
+     *
+     *    WATCH             <wpath>|<token>|?
+     */
+
+    trace_xenstore_watch(path, token);
+    err = xs_impl_watch(s->impl, xen_domid, path, token, fire_watch_cb, s);
+    if (err) {
+        xs_error(s, req_id, tx_id, err);
+        return;
+    }
+
+    xs_ok(s, XS_WATCH, req_id, tx_id);
+}
+
+static void xs_unwatch(XenXenstoreState *s, unsigned int req_id,
+                       xs_transaction_t tx_id, uint8_t *req_data,
+                       unsigned int len)
+{
+    const char *token, *path = (const char *)req_data;
+    int err;
+
+    if (len == 0) {
+        xs_error(s, req_id, tx_id, EINVAL);
+        return;
+    }
+
+    while (len--) {
+        if (*req_data++ == '\0') {
+            break;
+        }
+        if (len == 0) {
+            xs_error(s, req_id, tx_id, EINVAL);
+            return;
+        }
+    }
+
+    token = (const char *)req_data;
+    while (len--) {
+        if (*req_data++ == '\0') {
+            break;
+        }
+        if (len == 0) {
+            xs_error(s, req_id, tx_id, EINVAL);
+            return;
+        }
+    }
+
+    trace_xenstore_unwatch(path, token);
+    err = xs_impl_unwatch(s->impl, xen_domid, path, token, fire_watch_cb, s);
+    if (err) {
+        xs_error(s, req_id, tx_id, err);
+        return;
+    }
+
+    xs_ok(s, XS_UNWATCH, req_id, tx_id);
+}
+
+static void xs_reset_watches(XenXenstoreState *s, unsigned int req_id,
+                             xs_transaction_t tx_id, uint8_t *req_data,
+                             unsigned int len)
+{
+    if (len == 0 || req_data[len - 1] != '\0') {
+        xs_error(s, req_id, tx_id, EINVAL);
+        return;
+    }
+
+    trace_xenstore_reset_watches();
+    xs_impl_reset_watches(s->impl, xen_domid);
+
+    xs_ok(s, XS_RESET_WATCHES, req_id, tx_id);
+}
+
+static void xs_priv(XenXenstoreState *s, unsigned int req_id,
+                    xs_transaction_t tx_id, uint8_t *data,
+                    unsigned int len)
+{
+    xs_error(s, req_id, tx_id, EACCES);
+}
+
+static void xs_unimpl(XenXenstoreState *s, unsigned int req_id,
+                      xs_transaction_t tx_id, uint8_t *data,
+                      unsigned int len)
+{
+    xs_error(s, req_id, tx_id, ENOSYS);
+}
+
+typedef void (*xs_impl)(XenXenstoreState *s, unsigned int req_id,
+                        xs_transaction_t tx_id, uint8_t *data,
+                        unsigned int len);
+
+struct xsd_req {
+    const char *name;
+    xs_impl fn;
+};
+#define XSD_REQ(_type, _fn)                           \
+    [_type] = { .name = #_type, .fn = _fn }
+
+struct xsd_req xsd_reqs[] = {
+    XSD_REQ(XS_READ, xs_read),
+    XSD_REQ(XS_WRITE, xs_write),
+    XSD_REQ(XS_MKDIR, xs_mkdir),
+    XSD_REQ(XS_DIRECTORY, xs_directory),
+    XSD_REQ(XS_DIRECTORY_PART, xs_directory_part),
+    XSD_REQ(XS_TRANSACTION_START, xs_transaction_start),
+    XSD_REQ(XS_TRANSACTION_END, xs_transaction_end),
+    XSD_REQ(XS_RM, xs_rm),
+    XSD_REQ(XS_GET_PERMS, xs_get_perms),
+    XSD_REQ(XS_SET_PERMS, xs_set_perms),
+    XSD_REQ(XS_WATCH, xs_watch),
+    XSD_REQ(XS_UNWATCH, xs_unwatch),
+    XSD_REQ(XS_CONTROL, xs_priv),
+    XSD_REQ(XS_INTRODUCE, xs_priv),
+    XSD_REQ(XS_RELEASE, xs_priv),
+    XSD_REQ(XS_IS_DOMAIN_INTRODUCED, xs_priv),
+    XSD_REQ(XS_RESUME, xs_priv),
+    XSD_REQ(XS_SET_TARGET, xs_priv),
+    XSD_REQ(XS_RESET_WATCHES, xs_reset_watches),
+};
+
 static void process_req(XenXenstoreState *s)
 {
     struct xsd_sockmsg *req = (struct xsd_sockmsg *)s->req_data;
-    struct xsd_sockmsg *rsp = (struct xsd_sockmsg *)s->rsp_data;
-    const char enosys[] = "ENOSYS";
+    xs_impl handler = NULL;
 
     assert(req_pending(s));
     assert(!s->rsp_pending);
 
-    rsp->type = XS_ERROR;
-    rsp->req_id = req->req_id;
-    rsp->tx_id = req->tx_id;
-    rsp->len = sizeof(enosys);
-    memcpy((void *)&rsp[1], enosys, sizeof(enosys));
+    if (req->type < ARRAY_SIZE(xsd_reqs)) {
+        handler = xsd_reqs[req->type].fn;
+    }
+    if (!handler) {
+        handler = &xs_unimpl;
+    }
+
+    handler(s, req->req_id, req->tx_id, (uint8_t *)&req[1], req->len);
 
     s->rsp_pending = true;
     reset_req(s);
@@ -415,6 +1266,113 @@ static unsigned int put_rsp(XenXenstoreState *s)
     return copylen;
 }
 
+static void deliver_watch(XenXenstoreState *s, const char *path,
+                          const char *token)
+{
+    struct xsd_sockmsg *rsp = (struct xsd_sockmsg *)s->rsp_data;
+    uint8_t *rsp_data = (uint8_t *)&rsp[1];
+    unsigned int len;
+
+    assert(!s->rsp_pending);
+
+    trace_xenstore_watch_event(path, token);
+
+    rsp->type = XS_WATCH_EVENT;
+    rsp->req_id = 0;
+    rsp->tx_id = 0;
+    rsp->len = 0;
+
+    len = strlen(path);
+
+    /* XENSTORE_ABS/REL_PATH_MAX should ensure there can be no overflow */
+    assert(rsp->len + len < XENSTORE_PAYLOAD_MAX);
+
+    memcpy(&rsp_data[rsp->len], path, len);
+    rsp->len += len;
+    rsp_data[rsp->len] = '\0';
+    rsp->len++;
+
+    len = strlen(token);
+    /*
+     * It is possible for the guest to have chosen a token that will
+     * not fit (along with the path) into a watch event. We have no
+     * choice but to drop the event if this is the case.
+     */
+    if (rsp->len + len >= XENSTORE_PAYLOAD_MAX) {
+        return;
+    }
+
+    memcpy(&rsp_data[rsp->len], token, len);
+    rsp->len += len;
+    rsp_data[rsp->len] = '\0';
+    rsp->len++;
+
+    s->rsp_pending = true;
+}
+
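+/*
+ * The resulting event payload is simply "<epath>\0<token>\0", e.g.
+ * "/local/domain/1/device\0mytoken\0" for a hypothetical token
+ * "mytoken", with req_id and tx_id both 0 as specified above.
+ */
+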
+struct watch_event {
+    char *path;
+    char *token;
+};
+
+static void free_watch_event(struct watch_event *ev)
+{
+    if (ev) {
+        g_free(ev->path);
+        g_free(ev->token);
+        g_free(ev);
+    }
+}
+
+static void queue_watch(XenXenstoreState *s, const char *path,
+                        const char *token)
+{
+    struct watch_event *ev = g_new0(struct watch_event, 1);
+
+    ev->path = g_strdup(path);
+    ev->token = g_strdup(token);
+
+    s->watch_events = g_list_append(s->watch_events, ev);
+}
+
+static void fire_watch_cb(void *opaque, const char *path, const char *token)
+{
+    XenXenstoreState *s = opaque;
+
+    assert(qemu_mutex_iothread_locked());
+
+    /*
+     * If there's a response pending, we obviously can't scribble over
+     * it. But if there's a request pending, it has dibs on the buffer
+     * too.
+     *
+     * In the common case of a watch firing due to backend activity
+     * when the ring was otherwise idle, we should be able to copy the
+     * strings directly into the rsp_data and thence the actual ring,
+     * without needing to perform any allocations and queue them.
+     */
+    if (s->rsp_pending || req_pending(s)) {
+        queue_watch(s, path, token);
+    } else {
+        deliver_watch(s, path, token);
+        /*
+         * If the message was queued because there was already ring activity,
+         * no need to wake the guest. But if not, we need to send the evtchn.
+         */
+        xen_be_evtchn_notify(s->eh, s->be_port);
+    }
+}
+
+static void process_watch_events(XenXenstoreState *s)
+{
+    struct watch_event *ev = s->watch_events->data;
+
+    deliver_watch(s, ev->path, ev->token);
+
+    s->watch_events = g_list_remove(s->watch_events, ev);
+    free_watch_event(ev);
+}
+
 static void xen_xenstore_event(void *opaque)
 {
     XenXenstoreState *s = opaque;
@@ -433,6 +1391,10 @@ static void xen_xenstore_event(void *opaque)
         copied_to = copied_from = 0;
         processed = false;
 
+        if (!s->rsp_pending && s->watch_events) {
+            process_watch_events(s);
+        }
+
         if (s->rsp_pending) {
             copied_to = put_rsp(s);
         }
@@ -441,7 +1403,7 @@ static void xen_xenstore_event(void *opaque)
             copied_from = get_req(s);
         }
 
-        if (req_pending(s) && !s->rsp_pending) {
+        if (req_pending(s) && !s->rsp_pending && !s->watch_events) {
             process_req(s);
             processed = true;
         }
@@ -496,5 +1458,270 @@ int xen_xenstore_reset(void)
     }
     s->be_port = err;
 
+    /*
+     * We don't actually access the guest's page through the grant, because
+     * this isn't real Xen, and we can just use the page we gave it in the
+     * first place. Map the grant anyway, mostly for cosmetic purposes so
+     * it *looks* like it's in use in the guest-visible grant table.
+     */
+    s->gt = qemu_xen_gnttab_open();
+    uint32_t xs_gntref = GNTTAB_RESERVED_XENSTORE;
+    s->granted_xs = qemu_xen_gnttab_map_refs(s->gt, 1, xen_domid, &xs_gntref,
+                                             PROT_READ | PROT_WRITE);
+
     return 0;
 }
+
+struct qemu_xs_handle {
+    XenstoreImplState *impl;
+    GList *watches;
+    QEMUBH *watch_bh;
+};
+
+struct qemu_xs_watch {
+    struct qemu_xs_handle *h;
+    char *path;
+    xs_watch_fn fn;
+    void *opaque;
+    GList *events;
+};
+
+static char *xs_be_get_domain_path(struct qemu_xs_handle *h, unsigned int domid)
+{
+    return g_strdup_printf("/local/domain/%u", domid);
+}
+
+static char **xs_be_directory(struct qemu_xs_handle *h, xs_transaction_t t,
+                              const char *path, unsigned int *num)
+{
+    GList *items = NULL, *l;
+    unsigned int i = 0;
+    char **items_ret;
+    int err;
+
+    err = xs_impl_directory(h->impl, DOMID_QEMU, t, path, NULL, &items);
+    if (err) {
+        errno = err;
+        return NULL;
+    }
+
+    items_ret = g_new0(char *, g_list_length(items) + 1);
+    *num = 0;
+    for (l = items; l; l = l->next) {
+        items_ret[i++] = l->data;
+        (*num)++;
+    }
+    g_list_free(items);
+    return items_ret;
+}
+
+static void *xs_be_read(struct qemu_xs_handle *h, xs_transaction_t t,
+                        const char *path, unsigned int *len)
+{
+    GByteArray *data = g_byte_array_new();
+    bool free_segment = false;
+    int err;
+
+    err = xs_impl_read(h->impl, DOMID_QEMU, t, path, data);
+    if (err) {
+        free_segment = true;
+        errno = err;
+    } else {
+        if (len) {
+            *len = data->len;
+        }
+        /* The xen-bus-helper code expects to get a NUL-terminated string! */
+        g_byte_array_append(data, (void *)"", 1);
+    }
+
+    return g_byte_array_free(data, free_segment);
+}
+
+static bool xs_be_write(struct qemu_xs_handle *h, xs_transaction_t t,
+                        const char *path, const void *data, unsigned int len)
+{
+    GByteArray *gdata = g_byte_array_new();
+    int err;
+
+    g_byte_array_append(gdata, data, len);
+    err = xs_impl_write(h->impl, DOMID_QEMU, t, path, gdata);
+    g_byte_array_unref(gdata);
+    if (err) {
+        errno = err;
+        return false;
+    }
+    return true;
+}
+
+static bool xs_be_create(struct qemu_xs_handle *h, xs_transaction_t t,
+                         unsigned int owner, unsigned int domid,
+                         unsigned int perms, const char *path)
+{
+    g_autoptr(GByteArray) data = g_byte_array_new();
+    GList *perms_list = NULL;
+    int err;
+
+    /* Create the node only if it doesn't already exist, as xs_mkdir() does */
+    err = xs_impl_read(h->impl, DOMID_QEMU, t, path, data);
+    if (err == ENOENT) {
+        err = xs_impl_write(h->impl, DOMID_QEMU, t, path, data);
+    }
+    if (err) {
+        errno = err;
+        return false;
+    }
+
+    perms_list = g_list_append(perms_list,
+                               xs_perm_as_string(XS_PERM_NONE, owner));
+    perms_list = g_list_append(perms_list,
+                               xs_perm_as_string(perms, domid));
+
+    err = xs_impl_set_perms(h->impl, DOMID_QEMU, t, path, perms_list);
+    g_list_free_full(perms_list, g_free);
+    if (err) {
+        errno = err;
+        return false;
+    }
+    return true;
+}
+
+static bool xs_be_destroy(struct qemu_xs_handle *h, xs_transaction_t t,
+                          const char *path)
+{
+    int err = xs_impl_rm(h->impl, DOMID_QEMU, t, path);
+    if (err) {
+        errno = err;
+        return false;
+    }
+    return true;
+}
+
+static void be_watch_bh(void *_h)
+{
+    struct qemu_xs_handle *h = _h;
+    GList *l;
+
+    for (l = h->watches; l; l = l->next) {
+        struct qemu_xs_watch *w = l->data;
+
+        while (w->events) {
+            struct watch_event *ev = w->events->data;
+
+            w->fn(w->opaque, ev->path);
+
+            w->events = g_list_remove(w->events, ev);
+            free_watch_event(ev);
+        }
+    }
+}
+
+static void xs_be_watch_cb(void *opaque, const char *path, const char *token)
+{
+    struct watch_event *ev = g_new0(struct watch_event, 1);
+    struct qemu_xs_watch *w = opaque;
+
+    /* We don't care about the token */
+    ev->path = g_strdup(path);
+    w->events = g_list_append(w->events, ev);
+
+    qemu_bh_schedule(w->h->watch_bh);
+}
+
+static struct qemu_xs_watch *xs_be_watch(struct qemu_xs_handle *h,
+                                         const char *path, xs_watch_fn fn,
+                                         void *opaque)
+{
+    struct qemu_xs_watch *w = g_new0(struct qemu_xs_watch, 1);
+    int err;
+
+    w->h = h;
+    w->fn = fn;
+    w->opaque = opaque;
+
+    err = xs_impl_watch(h->impl, DOMID_QEMU, path, NULL, xs_be_watch_cb, w);
+    if (err) {
+        errno = err;
+        g_free(w);
+        return NULL;
+    }
+
+    w->path = g_strdup(path);
+    h->watches = g_list_append(h->watches, w);
+    return w;
+}
+
+static void xs_be_unwatch(struct qemu_xs_handle *h, struct qemu_xs_watch *w)
+{
+    xs_impl_unwatch(h->impl, DOMID_QEMU, w->path, NULL, xs_be_watch_cb, w);
+
+    h->watches = g_list_remove(h->watches, w);
+    g_list_free_full(w->events, (GDestroyNotify)free_watch_event);
+    g_free(w->path);
+    g_free(w);
+}
+
+static xs_transaction_t xs_be_transaction_start(struct qemu_xs_handle *h)
+{
+    unsigned int new_tx = XBT_NULL;
+    int err = xs_impl_transaction_start(h->impl, DOMID_QEMU, &new_tx);
+    if (err) {
+        errno = err;
+        return XBT_NULL;
+    }
+    return new_tx;
+}
+
+static bool xs_be_transaction_end(struct qemu_xs_handle *h, xs_transaction_t t,
+                                  bool abort)
+{
+    int err = xs_impl_transaction_end(h->impl, DOMID_QEMU, t, !abort);
+    if (err) {
+        errno = err;
+        return false;
+    }
+    return true;
+}
+
+static struct qemu_xs_handle *xs_be_open(void)
+{
+    XenXenstoreState *s = xen_xenstore_singleton;
+    struct qemu_xs_handle *h;
+
+    if (!s || !s->impl) {
+        errno = ENOSYS;
+        return NULL;
+    }
+
+    h = g_new0(struct qemu_xs_handle, 1);
+    h->impl = s->impl;
+
+    h->watch_bh = aio_bh_new(qemu_get_aio_context(), be_watch_bh, h);
+
+    return h;
+}
+
+static void xs_be_close(struct qemu_xs_handle *h)
+{
+    while (h->watches) {
+        struct qemu_xs_watch *w = h->watches->data;
+        xs_be_unwatch(h, w);
+    }
+
+    qemu_bh_delete(h->watch_bh);
+    g_free(h);
+}
+
+static struct xenstore_backend_ops emu_xenstore_backend_ops = {
+    .open = xs_be_open,
+    .close = xs_be_close,
+    .get_domain_path = xs_be_get_domain_path,
+    .directory = xs_be_directory,
+    .read = xs_be_read,
+    .write = xs_be_write,
+    .create = xs_be_create,
+    .destroy = xs_be_destroy,
+    .watch = xs_be_watch,
+    .unwatch = xs_be_unwatch,
+    .transaction_start = xs_be_transaction_start,
+    .transaction_end = xs_be_transaction_end,
+};
diff --git a/hw/i386/kvm/xenstore_impl.c b/hw/i386/kvm/xenstore_impl.c
new file mode 100644
index 0000000000..305fe75519
--- /dev/null
+++ b/hw/i386/kvm/xenstore_impl.c
@@ -0,0 +1,1927 @@
+/*
+ * QEMU Xen emulation: The actual implementation of XenStore
+ *
+ * Copyright © 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Authors: David Woodhouse <dwmw2@infradead.org>, Paul Durrant <paul@xen.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qom/object.h"
+
+#include "hw/xen/xen.h"
+
+#include "xen_xenstore.h"
+#include "xenstore_impl.h"
+
+#include "hw/xen/interface/io/xs_wire.h"
+
+#define XS_MAX_WATCHES          128
+#define XS_MAX_DOMAIN_NODES     1000
+#define XS_MAX_NODE_SIZE        2048
+#define XS_MAX_TRANSACTIONS     10
+#define XS_MAX_PERMS_PER_NODE   5
+
+#define XS_VALID_CHARS "abcdefghijklmnopqrstuvwxyz" \
+                       "ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
+                       "0123456789-/_"
+
+typedef struct XsNode {
+    uint32_t ref;
+    GByteArray *content;
+    GList *perms;
+    GHashTable *children;
+    uint64_t gencnt;
+    bool deleted_in_tx;
+    bool modified_in_tx;
+    unsigned int serialized_tx;
+#ifdef XS_NODE_UNIT_TEST
+    gchar *name; /* debug only */
+#endif
+} XsNode;
+
+typedef struct XsWatch {
+    struct XsWatch *next;
+    xs_impl_watch_fn *cb;
+    void *cb_opaque;
+    char *token;
+    unsigned int dom_id;
+    int rel_prefix;
+} XsWatch;
+
+typedef struct XsTransaction {
+    XsNode *root;
+    unsigned int nr_nodes;
+    unsigned int base_tx;
+    unsigned int tx_id;
+    unsigned int dom_id;
+} XsTransaction;
+
+struct XenstoreImplState {
+    XsNode *root;
+    unsigned int nr_nodes;
+    GHashTable *watches;
+    unsigned int nr_domu_watches;
+    GHashTable *transactions;
+    unsigned int nr_domu_transactions;
+    unsigned int root_tx;
+    unsigned int last_tx;
+    bool serialized;
+};
+
+
+static void nobble_tx(gpointer key, gpointer value, gpointer user_data)
+{
+    unsigned int *new_tx_id = user_data;
+    XsTransaction *tx = value;
+
+    if (tx->base_tx == *new_tx_id) {
+        /* Transactions based on XBT_NULL will always fail */
+        tx->base_tx = XBT_NULL;
+    }
+}
+
+static inline unsigned int next_tx(struct XenstoreImplState *s)
+{
+    unsigned int tx_id;
+
+    /* Find the next TX id which isn't either XBT_NULL or in use. */
+    do {
+        tx_id = ++s->last_tx;
+    } while (tx_id == XBT_NULL || tx_id == s->root_tx ||
+             g_hash_table_lookup(s->transactions, GINT_TO_POINTER(tx_id)));
+
+    /*
+     * It is vanishingly unlikely, but ensure that no outstanding transaction
+     * is based on the (previous incarnation of the) newly-allocated TX id.
+     */
+    g_hash_table_foreach(s->transactions, nobble_tx, &tx_id);
+
+    return tx_id;
+}
+
+static inline XsNode *xs_node_new(void)
+{
+    XsNode *n = g_new0(XsNode, 1);
+    n->ref = 1;
+
+#ifdef XS_NODE_UNIT_TEST
+    nr_xs_nodes++;
+    xs_node_list = g_list_prepend(xs_node_list, n);
+#endif
+    return n;
+}
+
+static inline XsNode *xs_node_ref(XsNode *n)
+{
+    /* With just 10 transactions, it can never get anywhere near this. */
+    g_assert(n->ref < INT_MAX);
+
+    g_assert(n->ref);
+    n->ref++;
+    return n;
+}
+
+static inline void xs_node_unref(XsNode *n)
+{
+    if (!n) {
+        return;
+    }
+    g_assert(n->ref);
+    if (--n->ref) {
+        return;
+    }
+
+    if (n->content) {
+        g_byte_array_unref(n->content);
+    }
+    if (n->perms) {
+        g_list_free_full(n->perms, g_free);
+    }
+    if (n->children) {
+        g_hash_table_unref(n->children);
+    }
+#ifdef XS_NODE_UNIT_TEST
+    g_free(n->name);
+    nr_xs_nodes--;
+    xs_node_list = g_list_remove(xs_node_list, n);
+#endif
+    g_free(n);
+}
+
+char *xs_perm_as_string(unsigned int perm, unsigned int domid)
+{
+    char letter;
+
+    switch (perm) {
+    case XS_PERM_READ | XS_PERM_WRITE:
+        letter = 'b';
+        break;
+    case XS_PERM_READ:
+        letter = 'r';
+        break;
+    case XS_PERM_WRITE:
+        letter = 'w';
+        break;
+    case XS_PERM_NONE:
+    default:
+        letter = 'n';
+        break;
+    }
+
+    return g_strdup_printf("%c%u", letter, domid);
+}
+
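+/*
+ * e.g. xs_perm_as_string(XS_PERM_READ, 1) returns "r1": read-only
+ * access for domain 1, in the wire format described in xen_xenstore.c.
+ * The caller frees the returned string.
+ */
+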
+static gpointer do_perm_copy(gconstpointer src, gpointer user_data)
+{
+    return g_strdup(src);
+}
+
+static XsNode *xs_node_create(const char *name, GList *perms)
+{
+    XsNode *n = xs_node_new();
+
+#ifdef XS_NODE_UNIT_TEST
+    if (name) {
+        n->name = g_strdup(name);
+    }
+#endif
+
+    n->perms = g_list_copy_deep(perms, do_perm_copy, NULL);
+
+    return n;
+}
+
+/* For copying from one hash table to another using g_hash_table_foreach() */
+static void do_child_insert(gpointer key, gpointer value, gpointer user_data)
+{
+    g_hash_table_insert(user_data, g_strdup(key), xs_node_ref(value));
+}
+
+static XsNode *xs_node_copy(XsNode *old)
+{
+    XsNode *n = xs_node_new();
+
+    assert(old);
+    n->gencnt = old->gencnt;
+
+#ifdef XS_NODE_UNIT_TEST
+    if (old->name) {
+        n->name = g_strdup(old->name);
+    }
+#endif
+
+    if (old->children) {
+        n->children = g_hash_table_new_full(g_str_hash, g_str_equal, g_free,
+                                            (GDestroyNotify)xs_node_unref);
+        g_hash_table_foreach(old->children, do_child_insert, n->children);
+    }
+    if (old->perms) {
+        n->perms = g_list_copy_deep(old->perms, do_perm_copy, NULL);
+    }
+    if (old->content) {
+        n->content = g_byte_array_ref(old->content);
+    }
+    return n;
+}
+
+/* Returns true if it made a change to the hash table */
+static bool xs_node_add_child(XsNode *n, const char *path_elem, XsNode *child)
+{
+    assert(!strchr(path_elem, '/'));
+
+    if (!child) {
+        assert(n->children);
+        return g_hash_table_remove(n->children, path_elem);
+    }
+
+#ifdef XS_NODE_UNIT_TEST
+    g_free(child->name);
+    child->name = g_strdup(path_elem);
+#endif
+    if (!n->children) {
+        n->children = g_hash_table_new_full(g_str_hash, g_str_equal, g_free,
+                                            (GDestroyNotify)xs_node_unref);
+    }
+
+    /*
+     * The documentation for g_hash_table_insert() says that it "returns a
+     * boolean value to indicate whether the newly added value was already
+     * in the hash table or not."
+     *
+     * It could perhaps be clearer that returning TRUE means it wasn't
+     * already there, i.e. that a new entry was inserted.
+     */
+    return g_hash_table_insert(n->children, g_strdup(path_elem), child);
+}
+
+struct walk_op {
+    struct XenstoreImplState *s;
+    char path[XENSTORE_ABS_PATH_MAX + 2]; /* Two NUL terminators */
+    int (*op_fn)(XsNode **n, struct walk_op *op);
+    void *op_opaque;
+    void *op_opaque2;
+
+    GList *watches;
+    unsigned int dom_id;
+    unsigned int tx_id;
+
+    /* The number of nodes which will exist in the tree if this op succeeds. */
+    unsigned int new_nr_nodes;
+
+    /*
+     * This is maintained on the way *down* the walk to indicate
+     * whether nodes can be modified in place or whether COW is
+     * required. It starts off being true, as we're always going to
+     * replace the root node. If we walk into a shared subtree it
+     * becomes false. If we start *creating* new nodes for a write,
+     * it becomes true again.
+     *
+     * Do not use it on the way back up.
+     */
+    bool inplace;
+    bool mutating;
+    bool create_dirs;
+    bool in_transaction;
+
+    /* Tracking during recursion so we know which is first. */
+    bool deleted_in_tx;
+};
+
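+/*
+ * Worked example (illustrative): with one transaction outstanding, the
+ * live root has ref == 2, so a write to "/a/b" clears op->inplace at
+ * the root and "/", "a" and "b" are all copied on the way back up. If
+ * "b" does not yet exist and create_dirs is set, the freshly created
+ * "b" may be modified in place (inplace flips back to true) even though
+ * its parents had to be copied.
+ */
+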
+static void fire_watches(struct walk_op *op, bool parents)
+{
+    GList *l = NULL;
+    XsWatch *w;
+
+    if (!op->mutating || op->in_transaction) {
+        return;
+    }
+
+    if (parents) {
+        l = op->watches;
+    }
+
+    w = g_hash_table_lookup(op->s->watches, op->path);
+    while (w || l) {
+        if (!w) {
+            /* Fire the parent nodes from 'op' if asked to */
+            w = l->data;
+            l = l->next;
+            continue;
+        }
+
+        assert(strlen(op->path) > w->rel_prefix);
+        w->cb(w->cb_opaque, op->path + w->rel_prefix, w->token);
+
+        w = w->next;
+    }
+}
+
+static int xs_node_add_content(XsNode **n, struct walk_op *op)
+{
+    GByteArray *data = op->op_opaque;
+
+    if (op->dom_id) {
+        /*
+         * The real XenStored includes permissions and names of child nodes
+         * in the calculated datasize but life's too short. For a single
+         * tenant internal XenStore, we don't have to be quite as pedantic.
+         */
+        if (data->len > XS_MAX_NODE_SIZE) {
+            return E2BIG;
+        }
+    }
+    /* We *are* the node to be written. Either this or a copy. */
+    if (!op->inplace) {
+        XsNode *old = *n;
+        *n = xs_node_copy(old);
+        xs_node_unref(old);
+    }
+
+    if ((*n)->content) {
+        g_byte_array_unref((*n)->content);
+    }
+    (*n)->content = g_byte_array_ref(data);
+    if (op->tx_id != XBT_NULL) {
+        (*n)->modified_in_tx = true;
+    }
+    return 0;
+}
+
+static int xs_node_get_content(XsNode **n, struct walk_op *op)
+{
+    GByteArray *data = op->op_opaque;
+    GByteArray *node_data;
+
+    assert(op->inplace);
+    assert(*n);
+
+    node_data = (*n)->content;
+    if (node_data) {
+        g_byte_array_append(data, node_data->data, node_data->len);
+    }
+
+    return 0;
+}
+
+static int node_rm_recurse(gpointer key, gpointer value, gpointer user_data)
+{
+    struct walk_op *op = user_data;
+    int path_len = strlen(op->path);
+    int key_len = strlen(key);
+    XsNode *n = value;
+    bool this_inplace = op->inplace;
+
+    if (n->ref != 1) {
+        op->inplace = false;
+    }
+
+    assert(key_len + path_len + 2 <= sizeof(op->path));
+    op->path[path_len] = '/';
+    memcpy(op->path + path_len + 1, key, key_len + 1);
+
+    if (n->children) {
+        g_hash_table_foreach_remove(n->children, node_rm_recurse, op);
+    }
+    op->new_nr_nodes--;
+
+    /*
+     * Fire watches on *this* node but not the parents because they are
+     * going to be deleted too, so the watch will fire for them anyway.
+     */
+    fire_watches(op, false);
+    op->path[path_len] = '\0';
+
+    /*
+     * Actually deleting the child here is just an optimisation; if we
+     * don't then the final unref on the topmost victim will just have
+     * to cascade down again repeating all the g_hash_table_foreach()
+     * calls.
+     */
+    return this_inplace;
+}
+
+static XsNode *xs_node_copy_deleted(XsNode *old, struct walk_op *op);
+static void copy_deleted_recurse(gpointer key, gpointer value,
+                                 gpointer user_data)
+{
+    struct walk_op *op = user_data;
+    GHashTable *siblings = op->op_opaque2;
+    XsNode *n = xs_node_copy_deleted(value, op);
+
+    /*
+     * Reinsert the deleted_in_tx copy of the node into the parent's
+     * 'children' hash table. The 'siblings' pointer was taken from
+     * op->op_opaque2 on entry, before the recursive call to
+     * xs_node_copy_deleted() scribbled over it.
+     */
+    g_hash_table_insert(siblings, g_strdup(key), n);
+}
+
+static XsNode *xs_node_copy_deleted(XsNode *old, struct walk_op *op)
+{
+    XsNode *n = xs_node_new();
+
+    n->gencnt = old->gencnt;
+
+#ifdef XS_NODE_UNIT_TEST
+    if (old->name) {
+        n->name = g_strdup(old->name);
+    }
+#endif
+
+    if (old->children) {
+        n->children = g_hash_table_new_full(g_str_hash, g_str_equal, g_free,
+                                            (GDestroyNotify)xs_node_unref);
+        op->op_opaque2 = n->children;
+        g_hash_table_foreach(old->children, copy_deleted_recurse, op);
+    }
+    if (old->perms) {
+        n->perms = g_list_copy_deep(old->perms, do_perm_copy, NULL);
+    }
+    n->deleted_in_tx = true;
+    /* If it gets resurrected we only fire a watch if it lost its content */
+    if (old->content) {
+        n->modified_in_tx = true;
+    }
+    op->new_nr_nodes--;
+    return n;
+}
+
+static int xs_node_rm(XsNode **n, struct walk_op *op)
+{
+    bool this_inplace = op->inplace;
+
+    if (op->tx_id != XBT_NULL) {
+        /* It's not trivial to do inplace handling for this one */
+        XsNode *old = *n;
+        *n = xs_node_copy_deleted(old, op);
+        xs_node_unref(old);
+        return 0;
+    }
+
+    /* Fire watches for, and count, nodes in the subtree which get deleted */
+    if ((*n)->children) {
+        g_hash_table_foreach_remove((*n)->children, node_rm_recurse, op);
+    }
+    op->new_nr_nodes--;
+
+    if (this_inplace) {
+        xs_node_unref(*n);
+    }
+    *n = NULL;
+    return 0;
+}
+
+static int xs_node_get_perms(XsNode **n, struct walk_op *op)
+{
+    GList **perms = op->op_opaque;
+
+    assert(op->inplace);
+    assert(*n);
+
+    *perms = g_list_copy_deep((*n)->perms, do_perm_copy, NULL);
+    return 0;
+}
+
+static void parse_perm(const char *perm, char *letter, unsigned int *dom_id)
+{
+    unsigned int n = sscanf(perm, "%c%u", letter, dom_id);
+
+    assert(n == 2);
+}
+
+static bool can_access(unsigned int dom_id, GList *perms, const char *letters)
+{
+    unsigned int i, n;
+    char perm_letter;
+    unsigned int perm_dom_id;
+    bool access;
+
+    if (dom_id == 0) {
+        return true;
+    }
+
+    n = g_list_length(perms);
+    assert(n >= 1);
+
+    /*
+     * The dom_id of the first perm is the owner, and the owner always has
+     * read-write access.
+     */
+    parse_perm(g_list_nth_data(perms, 0), &perm_letter, &perm_dom_id);
+    if (dom_id == perm_dom_id) {
+        return true;
+    }
+
+    /*
+     * The letter of the first perm specifies the default access for all
+     * other domains.
+     */
+    access = !!strchr(letters, perm_letter);
+    for (i = 1; i < n; i++) {
+        parse_perm(g_list_nth_data(perms, i), &perm_letter, &perm_dom_id);
+        if (dom_id != perm_dom_id) {
+            continue;
+        }
+        access = !!strchr(letters, perm_letter);
+    }
+
+    return access;
+}
+
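+/*
+ * Example (illustrative): with perms {"n0", "r1"}, dom0 always passes
+ * (it is privileged, and also the owner here); domain 1 passes for the
+ * read letters "rb" but not the write letters "wb"; any other domain
+ * fails both, because the first perm's letter 'n' is the default.
+ */
+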
+static int xs_node_set_perms(XsNode **n, struct walk_op *op)
+{
+    GList *perms = op->op_opaque;
+
+    if (op->dom_id) {
+        unsigned int perm_dom_id;
+        char perm_letter;
+
+        /* A guest may not change permissions on nodes it does not own */
+        if (!can_access(op->dom_id, (*n)->perms, "")) {
+            return EPERM;
+        }
+
+        /* A guest may not change the owner of a node it owns. */
+        parse_perm(perms->data, &perm_letter, &perm_dom_id);
+        if (perm_dom_id != op->dom_id) {
+            return EPERM;
+        }
+
+        if (g_list_length(perms) > XS_MAX_PERMS_PER_NODE) {
+            return ENOSPC;
+        }
+    }
+
+    /* We *are* the node to be written. Either this or a copy. */
+    if (!op->inplace) {
+        XsNode *old = *n;
+        *n = xs_node_copy(old);
+        xs_node_unref(old);
+    }
+
+    if ((*n)->perms) {
+        g_list_free_full((*n)->perms, g_free);
+    }
+    (*n)->perms = g_list_copy_deep(perms, do_perm_copy, NULL);
+    if (op->tx_id != XBT_NULL) {
+        (*n)->modified_in_tx = true;
+    }
+    return 0;
+}
+
+/*
+ * Passed a full reference in *n which it may free if it needs to COW.
+ *
+ * When changing the tree, the op->inplace flag indicates whether this
+ * node may be modified in place (i.e. it and all its parents had a
+ * refcount of one). If walking down the tree we find a node whose
+ * refcount is higher, we must clear op->inplace and COW from there
+ * down. Unless we are creating new nodes as scaffolding for a write
+ * (which works like 'mkdir -p' does). In which case those newly
+ * created nodes can (and must) be modified in place again.
+ */
+static int xs_node_walk(XsNode **n, struct walk_op *op)
+{
+    char *child_name = NULL;
+    size_t namelen;
+    XsNode *old = *n, *child = NULL;
+    bool stole_child = false;
+    bool this_inplace;
+    XsWatch *watch;
+    int err;
+
+    namelen = strlen(op->path);
+    watch = g_hash_table_lookup(op->s->watches, op->path);
+
+    /* Is there a child, or do we hit the double-NUL termination? */
+    if (op->path[namelen + 1]) {
+        char *slash;
+        child_name = op->path + namelen + 1;
+        slash = strchr(child_name, '/');
+        if (slash) {
+            *slash = '\0';
+        }
+        op->path[namelen] = '/';
+    }
+
+    /* If we walk into a subtree which is shared, we must COW */
+    if (op->mutating && old->ref != 1) {
+        op->inplace = false;
+    }
+
+    if (!child_name) {
+        const char *letters = op->mutating ? "wb" : "rb";
+
+        if (!can_access(op->dom_id, old->perms, letters)) {
+            err = EACCES;
+            goto out;
+        }
+
+        /* This is the actual node on which the operation shall be performed */
+        err = op->op_fn(n, op);
+        if (!err) {
+            fire_watches(op, true);
+        }
+        goto out;
+    }
+
+    /* op->inplace will be further modified during the recursion */
+    this_inplace = op->inplace;
+
+    if (old && old->children) {
+        child = g_hash_table_lookup(old->children, child_name);
+        /* This is a *weak* reference to 'child', owned by the hash table */
+    }
+
+    if (child) {
+        if (child->deleted_in_tx) {
+            assert(child->ref == 1);
+            /* Cannot actually set child->deleted_in_tx = false until later */
+        }
+        xs_node_ref(child);
+        /*
+         * Now we own it too. But if we can modify inplace, that's going to
+         * foil the check and force it to COW. We want to be the *only* owner
+         * so that it can be modified in place, so remove it from the hash
+         * table in that case. We'll add it (or its replacement) back later.
+         */
+        if (op->mutating && this_inplace) {
+            g_hash_table_remove(old->children, child_name);
+            stole_child = true;
+        }
+    } else if (op->create_dirs) {
+        assert(op->mutating);
+
+        if (!can_access(op->dom_id, old->perms, "wb")) {
+            err = EACCES;
+            goto out;
+        }
+
+        if (op->dom_id && op->new_nr_nodes >= XS_MAX_DOMAIN_NODES) {
+            err = ENOSPC;
+            goto out;
+        }
+
+        child = xs_node_create(child_name, old->perms);
+        op->new_nr_nodes++;
+
+        /*
+         * If we're creating a new child, we can clearly modify it (and its
+         * children) in place from here on down.
+         */
+        op->inplace = true;
+    } else {
+        err = ENOENT;
+        goto out;
+    }
+
+    /*
+     * If there's a watch on this node, add it to the list to be fired
+     * (with the correct full pathname for the modified node) at the end.
+     */
+    if (watch) {
+        op->watches = g_list_append(op->watches, watch);
+    }
+
+    /*
+     * Except for the temporary child-stealing as noted, our node has not
+     * changed yet. We don't yet know the overall operation will complete.
+     */
+    err = xs_node_walk(&child, op);
+
+    if (watch) {
+        op->watches = g_list_remove(op->watches, watch);
+    }
+
+    if (err || !op->mutating) {
+        if (stole_child) {
+            /* Put it back as it was. */
+            g_hash_table_replace(old->children, g_strdup(child_name), child);
+        } else {
+            xs_node_unref(child);
+        }
+        goto out;
+    }
+
+    /*
+     * Now we know the operation has completed successfully and we're on
+     * the way back up. Make the change, substituting 'child' in the
+     * node at our level.
+     */
+    if (!this_inplace) {
+        *n = xs_node_copy(old);
+        xs_node_unref(old);
+    }
+
+    /*
+     * If we resurrected a deleted_in_tx node, we can mark it as no longer
+     * deleted now that we know the overall operation has succeeded.
+     */
+    if (op->create_dirs && child && child->deleted_in_tx) {
+        op->new_nr_nodes++;
+        child->deleted_in_tx = false;
+    }
+
+    /*
+     * The child may be NULL here, for a remove operation. Either way,
+     * xs_node_add_child() will do the right thing and return a value
+     * indicating whether it changed the parent's hash table or not.
+     *
+     * We bump the parent gencnt if it adds a child that we *didn't*
+     * steal from it in the first place, or if child==NULL and was
+     * thus removed (whether we stole it earlier and didn't put it
+     * back, or xs_node_add_child() actually removed it now).
+     */
+    if ((xs_node_add_child(*n, child_name, child) && !stole_child) || !child) {
+        (*n)->gencnt++;
+    }
+
+ out:
+    op->path[namelen] = '\0';
+    if (!namelen) {
+        assert(!op->watches);
+        /*
+         * On completing the recursion back up the path walk and reaching the
+         * top, assign the new node count if the operation was successful. If
+         * the main tree was changed, bump its tx ID so that outstanding
+         * transactions correctly fail. But don't bump it every time; only
+         * if it makes a difference.
+         */
+        if (!err && op->mutating) {
+            if (!op->in_transaction) {
+                if (op->s->root_tx != op->s->last_tx) {
+                    op->s->root_tx = next_tx(op->s);
+                }
+                op->s->nr_nodes = op->new_nr_nodes;
+            } else {
+                XsTransaction *tx = g_hash_table_lookup(op->s->transactions,
+                                                        GINT_TO_POINTER(op->tx_id));
+                assert(tx);
+                tx->nr_nodes = op->new_nr_nodes;
+            }
+        }
+    }
+    return err;
+}
+
+static void append_directory_item(gpointer key, gpointer value,
+                                  gpointer user_data)
+{
+    GList **items = user_data;
+
+    *items = g_list_insert_sorted(*items, g_strdup(key), (GCompareFunc)strcmp);
+}
+
+/* Populates items with char * names which caller must free. */
+static int xs_node_directory(XsNode **n, struct walk_op *op)
+{
+    GList **items = op->op_opaque;
+
+    assert(op->inplace);
+    assert(*n);
+
+    if ((*n)->children) {
+        g_hash_table_foreach((*n)->children, append_directory_item, items);
+    }
+
+    if (op->op_opaque2) {
+        *(uint64_t *)op->op_opaque2 = (*n)->gencnt;
+    }
+
+    return 0;
+}
+
+static int validate_path(char *outpath, const char *userpath,
+                         unsigned int dom_id)
+{
+    size_t i, pathlen = strlen(userpath);
+
+    if (!pathlen || (pathlen > 1 && userpath[pathlen - 1] == '/') ||
+        strstr(userpath, "//")) {
+        return EINVAL;
+    }
+    for (i = 0; i < pathlen; i++) {
+        if (!strchr(XS_VALID_CHARS, userpath[i])) {
+            return EINVAL;
+        }
+    }
+    if (userpath[0] == '/') {
+        if (pathlen > XENSTORE_ABS_PATH_MAX) {
+            return E2BIG;
+        }
+        memcpy(outpath, userpath, pathlen + 1);
+    } else {
+        if (pathlen > XENSTORE_REL_PATH_MAX) {
+            return E2BIG;
+        }
+        snprintf(outpath, XENSTORE_ABS_PATH_MAX, "/local/domain/%u/%s", dom_id,
+                 userpath);
+    }
+    return 0;
+}
+
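+/*
+ * e.g. for dom_id 1 the relative path "data/x" expands to
+ * "/local/domain/1/data/x", while "", "data//x" and "data/x/" are all
+ * rejected with EINVAL.
+ */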
+
+static int init_walk_op(XenstoreImplState *s, struct walk_op *op,
+                        xs_transaction_t tx_id, unsigned int dom_id,
+                        const char *path, XsNode ***rootp)
+{
+    int ret = validate_path(op->path, path, dom_id);
+    if (ret) {
+        return ret;
+    }
+
+    /*
+     * We use *two* NUL terminators at the end of the path, as during the walk
+     * we will temporarily turn each '/' into a NUL to allow us to use that
+     * path element for the lookup.
+     */
+    op->path[strlen(op->path) + 1] = '\0';
+    op->watches = NULL;
+    op->path[0] = '\0'; /* Turn the leading '/' into the root node's NUL */
+    op->inplace = true;
+    op->mutating = false;
+    op->create_dirs = false;
+    op->in_transaction = false;
+    op->dom_id = dom_id;
+    op->tx_id = tx_id;
+    op->s = s;
+
+    if (tx_id == XBT_NULL) {
+        *rootp = &s->root;
+        op->new_nr_nodes = s->nr_nodes;
+    } else {
+        XsTransaction *tx = g_hash_table_lookup(s->transactions,
+                                                GINT_TO_POINTER(tx_id));
+        if (!tx) {
+            return ENOENT;
+        }
+        *rootp = &tx->root;
+        op->new_nr_nodes = tx->nr_nodes;
+        op->in_transaction = true;
+    }
+
+    return 0;
+}
+
+int xs_impl_read(XenstoreImplState *s, unsigned int dom_id,
+                 xs_transaction_t tx_id, const char *path, GByteArray *data)
+{
+    /*
+     * The data GByteArray shall exist, and will be freed by caller.
+     * Just g_byte_array_append() to it.
+     */
+    struct walk_op op;
+    XsNode **n;
+    int ret;
+
+    ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
+    if (ret) {
+        return ret;
+    }
+    op.op_fn = xs_node_get_content;
+    op.op_opaque = data;
+    return xs_node_walk(n, &op);
+}
+
+int xs_impl_write(XenstoreImplState *s, unsigned int dom_id,
+                  xs_transaction_t tx_id, const char *path, GByteArray *data)
+{
+    /*
+     * The data GByteArray shall exist, will be freed by caller. You are
+     * free to use g_byte_array_steal() and keep the data. Or just ref it.
+     */
+    struct walk_op op;
+    XsNode **n;
+    int ret;
+
+    ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
+    if (ret) {
+        return ret;
+    }
+    op.op_fn = xs_node_add_content;
+    op.op_opaque = data;
+    op.mutating = true;
+    op.create_dirs = true;
+    return xs_node_walk(n, &op);
+}
+
+int xs_impl_directory(XenstoreImplState *s, unsigned int dom_id,
+                      xs_transaction_t tx_id, const char *path,
+                      uint64_t *gencnt, GList **items)
+{
+    /*
+     * The items are (char *) to be freed by caller. Although it's consumed
+     * immediately so if you want to change it to (const char *) and keep
+     * them, go ahead and change the caller.
+     */
+    struct walk_op op;
+    XsNode **n;
+    int ret;
+
+    ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
+    if (ret) {
+        return ret;
+    }
+    op.op_fn = xs_node_directory;
+    op.op_opaque = items;
+    op.op_opaque2 = gencnt;
+    return xs_node_walk(n, &op);
+}
+
+int xs_impl_transaction_start(XenstoreImplState *s, unsigned int dom_id,
+                              xs_transaction_t *tx_id)
+{
+    XsTransaction *tx;
+
+    if (*tx_id != XBT_NULL) {
+        return EINVAL;
+    }
+
+    if (dom_id && s->nr_domu_transactions >= XS_MAX_TRANSACTIONS) {
+        return ENOSPC;
+    }
+
+    tx = g_new0(XsTransaction, 1);
+
+    tx->nr_nodes = s->nr_nodes;
+    tx->tx_id = next_tx(s);
+    tx->base_tx = s->root_tx;
+    tx->root = xs_node_ref(s->root);
+    tx->dom_id = dom_id;
+
+    g_hash_table_insert(s->transactions, GINT_TO_POINTER(tx->tx_id), tx);
+    if (dom_id) {
+        s->nr_domu_transactions++;
+    }
+    *tx_id = tx->tx_id;
+    return 0;
+}
+
+static gboolean tx_commit_walk(gpointer key, gpointer value,
+                               gpointer user_data)
+{
+    struct walk_op *op = user_data;
+    int path_len = strlen(op->path);
+    int key_len = strlen(key);
+    bool fire_parents = true;
+    XsWatch *watch;
+    XsNode *n = value;
+
+    if (n->ref != 1) {
+        return false;
+    }
+
+    if (n->deleted_in_tx) {
+        /*
+         * We fire watches on our parents if we are the *first* node
+         * to be deleted (the topmost one). This matches the behaviour
+         * when deleting in the live tree.
+         */
+        fire_parents = !op->deleted_in_tx;
+
+        /* Only used on the way down so no need to clear it later */
+        op->deleted_in_tx = true;
+    }
+
+    assert(key_len + path_len + 2 <= sizeof(op->path));
+    op->path[path_len] = '/';
+    memcpy(op->path + path_len + 1, key, key_len + 1);
+
+    watch = g_hash_table_lookup(op->s->watches, op->path);
+    if (watch) {
+        op->watches = g_list_append(op->watches, watch);
+    }
+
+    if (n->children) {
+        g_hash_table_foreach_remove(n->children, tx_commit_walk, op);
+    }
+
+    if (watch) {
+        op->watches = g_list_remove(op->watches, watch);
+    }
+
+    /*
+     * Don't fire watches if this node was only copied because a
+     * descendant was changed. The modified_in_tx flag indicates the
+     * ones which were really changed.
+     */
+    if (n->modified_in_tx || n->deleted_in_tx) {
+        fire_watches(op, fire_parents);
+        n->modified_in_tx = false;
+    }
+    op->path[path_len] = '\0';
+
+    /* Deleted nodes really do get expunged when we commit */
+    return n->deleted_in_tx;
+}
+
+static int transaction_commit(XenstoreImplState *s, XsTransaction *tx)
+{
+    struct walk_op op;
+    XsNode **n;
+
+    if (s->root_tx != tx->base_tx) {
+        return EAGAIN;
+    }
+    xs_node_unref(s->root);
+    s->root = tx->root;
+    tx->root = NULL;
+    s->root_tx = tx->tx_id;
+    s->nr_nodes = tx->nr_nodes;
+
+    init_walk_op(s, &op, XBT_NULL, tx->dom_id, "/", &n);
+    op.deleted_in_tx = false;
+    op.mutating = true;
+
+    /*
+     * Walk the new root and fire watches on any node which has a
+     * refcount of one (which is therefore unique to this transaction).
+     */
+    if (s->root->children) {
+        g_hash_table_foreach_remove(s->root->children, tx_commit_walk, &op);
+    }
+
+    return 0;
+}
+
+int xs_impl_transaction_end(XenstoreImplState *s, unsigned int dom_id,
+                            xs_transaction_t tx_id, bool commit)
+{
+    int ret = 0;
+    XsTransaction *tx = g_hash_table_lookup(s->transactions,
+                                            GINT_TO_POINTER(tx_id));
+
+    if (!tx || tx->dom_id != dom_id) {
+        return ENOENT;
+    }
+
+    if (commit) {
+        ret = transaction_commit(s, tx);
+    }
+
+    g_hash_table_remove(s->transactions, GINT_TO_POINTER(tx_id));
+    if (dom_id) {
+        assert(s->nr_domu_transactions);
+        s->nr_domu_transactions--;
+    }
+    return ret;
+}
+
+int xs_impl_rm(XenstoreImplState *s, unsigned int dom_id,
+               xs_transaction_t tx_id, const char *path)
+{
+    struct walk_op op;
+    XsNode **n;
+    int ret;
+
+    ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
+    if (ret) {
+        return ret;
+    }
+    op.op_fn = xs_node_rm;
+    op.mutating = true;
+    return xs_node_walk(n, &op);
+}
+
+int xs_impl_get_perms(XenstoreImplState *s, unsigned int dom_id,
+                      xs_transaction_t tx_id, const char *path, GList **perms)
+{
+    struct walk_op op;
+    XsNode **n;
+    int ret;
+
+    ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
+    if (ret) {
+        return ret;
+    }
+    op.op_fn = xs_node_get_perms;
+    op.op_opaque = perms;
+    return xs_node_walk(n, &op);
+}
+
+static void is_valid_perm(gpointer data, gpointer user_data)
+{
+    char *perm = data;
+    bool *valid = user_data;
+    char letter;
+    unsigned int dom_id;
+
+    if (!*valid) {
+        return;
+    }
+
+    if (sscanf(perm, "%c%u", &letter, &dom_id) != 2) {
+        *valid = false;
+        return;
+    }
+
+    switch (letter) {
+    case 'n':
+    case 'r':
+    case 'w':
+    case 'b':
+        break;
+
+    default:
+        *valid = false;
+        break;
+    }
+}
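+
+/*
+ * For reference, each permission string is a single letter followed by a
+ * decimal domid - e.g. "n0", "r1", "w2" or "b3" - where the letter means
+ * none/read/write/both and the number is the domain it applies to.
+ */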
+
+int xs_impl_set_perms(XenstoreImplState *s, unsigned int dom_id,
+                      xs_transaction_t tx_id, const char *path, GList *perms)
+{
+    struct walk_op op;
+    XsNode **n;
+    bool valid = true;
+    int ret;
+
+    if (!g_list_length(perms)) {
+        return EINVAL;
+    }
+
+    g_list_foreach(perms, is_valid_perm, &valid);
+    if (!valid) {
+        return EINVAL;
+    }
+
+    ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
+    if (ret) {
+        return ret;
+    }
+    op.op_fn = xs_node_set_perms;
+    op.op_opaque = perms;
+    op.mutating = true;
+    return xs_node_walk(n, &op);
+}
+
+static int do_xs_impl_watch(XenstoreImplState *s, unsigned int dom_id,
+                            const char *path, const char *token,
+                            xs_impl_watch_fn fn, void *opaque)
+{
+    char abspath[XENSTORE_ABS_PATH_MAX + 1];
+    XsWatch *w, *l;
+    int ret;
+
+    ret = validate_path(abspath, path, dom_id);
+    if (ret) {
+        return ret;
+    }
+
+    /* Check for duplicates */
+    l = w = g_hash_table_lookup(s->watches, abspath);
+    while (w) {
+        if (!g_strcmp0(token, w->token) && opaque == w->cb_opaque &&
+            fn == w->cb && dom_id == w->dom_id) {
+            return EEXIST;
+        }
+        w = w->next;
+    }
+
+    if (dom_id && s->nr_domu_watches >= XS_MAX_WATCHES) {
+        return E2BIG;
+    }
+
+    w = g_new0(XsWatch, 1);
+    w->token = g_strdup(token);
+    w->cb = fn;
+    w->cb_opaque = opaque;
+    w->dom_id = dom_id;
+    w->rel_prefix = strlen(abspath) - strlen(path);
+
+    /* l was looked up above when checking for duplicates */
+    if (l) {
+        w->next = l->next;
+        l->next = w;
+    } else {
+        g_hash_table_insert(s->watches, g_strdup(abspath), w);
+    }
+    if (dom_id) {
+        s->nr_domu_watches++;
+    }
+
+    return 0;
+}
+
+int xs_impl_watch(XenstoreImplState *s, unsigned int dom_id, const char *path,
+                  const char *token, xs_impl_watch_fn fn, void *opaque)
+{
+    int ret = do_xs_impl_watch(s, dom_id, path, token, fn, opaque);
+
+    if (!ret) {
+        /* A new watch should fire immediately */
+        fn(opaque, path, token);
+    }
+
+    return ret;
+}
+
+static XsWatch *free_watch(XenstoreImplState *s, XsWatch *w)
+{
+    XsWatch *next = w->next;
+
+    if (w->dom_id) {
+        assert(s->nr_domu_watches);
+        s->nr_domu_watches--;
+    }
+
+    g_free(w->token);
+    g_free(w);
+
+    return next;
+}
+
+int xs_impl_unwatch(XenstoreImplState *s, unsigned int dom_id,
+                    const char *path, const char *token,
+                    xs_impl_watch_fn fn, void *opaque)
+{
+    char abspath[XENSTORE_ABS_PATH_MAX + 1];
+    XsWatch *w, **l;
+    int ret;
+
+    ret = validate_path(abspath, path, dom_id);
+    if (ret) {
+        return ret;
+    }
+
+    w = g_hash_table_lookup(s->watches, abspath);
+    if (!w) {
+        return ENOENT;
+    }
+
+    /*
+     * The hash table contains the first element of a list of
+     * watches. Removing the first element in the list is a
+     * special case because we have to update the hash table to
+     * point to the next (or remove it if there's nothing left).
+     */
+    if (!g_strcmp0(token, w->token) && fn == w->cb && opaque == w->cb_opaque &&
+        dom_id == w->dom_id) {
+        if (w->next) {
+            /* Insert the previous 'next' into the hash table */
+            g_hash_table_insert(s->watches, g_strdup(abspath), w->next);
+        } else {
+            /* Nothing left; remove from hash table */
+            g_hash_table_remove(s->watches, abspath);
+        }
+        free_watch(s, w);
+        return 0;
+    }
+
+    /*
+     * We're all done messing with the hash table because the element
+     * it points to has survived the cull. Now it's just a simple
+     * linked list removal operation.
+     */
+    for (l = &w->next; *l; l = &w->next) {
+        w = *l;
+
+        if (!g_strcmp0(token, w->token) && fn == w->cb &&
+            opaque == w->cb_opaque && dom_id == w->dom_id) {
+            *l = free_watch(s, w);
+            return 0;
+        }
+    }
+
+    return ENOENT;
+}
+
+int xs_impl_reset_watches(XenstoreImplState *s, unsigned int dom_id)
+{
+    char **watch_paths;
+    guint nr_watch_paths;
+    guint i;
+
+    watch_paths = (char **)g_hash_table_get_keys_as_array(s->watches,
+                                                          &nr_watch_paths);
+
+    for (i = 0; i < nr_watch_paths; i++) {
+        XsWatch *w1 = g_hash_table_lookup(s->watches, watch_paths[i]);
+        XsWatch *w2, *w, **l;
+
+        /*
+         * w1 is the original list. The hash table has this pointer.
+         * w2 is the head of our newly-filtered list.
+         * w and l are temporary for processing. w is somewhat redundant
+         * with *l but makes my eyes bleed less.
+         */
+
+        w = w2 = w1;
+        l = &w;
+        while (w) {
+            if (w->dom_id == dom_id) {
+                /* If we're freeing the head of the list, bump w2 */
+                if (w2 == w) {
+                    w2 = w->next;
+                }
+                *l = free_watch(s, w);
+            } else {
+                l = &w->next;
+            }
+            w = *l;
+        }
+        /*
+         * If the head of the list survived the cull, we don't need to
+         * touch the hash table and we're done with this path. Else...
+         */
+        if (w1 != w2) {
+            g_hash_table_steal(s->watches, watch_paths[i]);
+
+            /*
+             * It was already freed. (Don't worry, this whole thing is
+             * single-threaded and nobody saw it in the meantime). And
+             * having *stolen* it, we now own the watch_paths[i] string
+             * so if we don't give it back to the hash table, we need
+             * to free it.
+             */
+            if (w2) {
+                g_hash_table_insert(s->watches, watch_paths[i], w2);
+            } else {
+                g_free(watch_paths[i]);
+            }
+        }
+    }
+    g_free(watch_paths);
+    return 0;
+}
+
+static void xs_tx_free(void *_tx)
+{
+    XsTransaction *tx = _tx;
+    if (tx->root) {
+        xs_node_unref(tx->root);
+    }
+    g_free(tx);
+}
+
+XenstoreImplState *xs_impl_create(unsigned int dom_id)
+{
+    XenstoreImplState *s = g_new0(XenstoreImplState, 1);
+    GList *perms;
+
+    s->watches = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, NULL);
+    s->transactions = g_hash_table_new_full(g_direct_hash, g_direct_equal,
+                                            NULL, xs_tx_free);
+
+    perms = g_list_append(NULL, xs_perm_as_string(XS_PERM_NONE, 0));
+    s->root = xs_node_create("/", perms);
+    g_list_free_full(perms, g_free);
+    s->nr_nodes = 1;
+
+    s->root_tx = s->last_tx = 1;
+    return s;
+}
+
+static void clear_serialized_tx(gpointer key, gpointer value, gpointer opaque)
+{
+    XsNode *n = value;
+
+    n->serialized_tx = XBT_NULL;
+    if (n->children) {
+        g_hash_table_foreach(n->children, clear_serialized_tx, NULL);
+    }
+}
+
+static void clear_tx_serialized_tx(gpointer key, gpointer value,
+                                   gpointer opaque)
+{
+    XsTransaction *t = value;
+
+    clear_serialized_tx(NULL, t->root, NULL);
+}
+
+static void write_be32(GByteArray *save, uint32_t val)
+{
+    uint32_t be = htonl(val);
+    g_byte_array_append(save, (void *)&be, sizeof(be));
+}
+
+struct save_state {
+    GByteArray *bytes;
+    unsigned int tx_id;
+};
+
+#define MODIFIED_IN_TX  (1U << 0)
+#define DELETED_IN_TX   (1U << 1)
+#define NODE_REF        (1U << 2)
+
+static void save_node(gpointer key, gpointer value, gpointer opaque)
+{
+    struct save_state *ss = opaque;
+    XsNode *n = value;
+    char *name = key;
+    uint8_t flag = 0;
+
+    /* Child nodes (i.e. anything but the root) have a name */
+    if (name) {
+        g_byte_array_append(ss->bytes, key, strlen(key) + 1);
+    }
+
+    /*
+     * If we already wrote this node, refer to the previous copy.
+     * There's no rename/move in XenStore, so all we need to find
+     * it is the tx_id of the transaction in which it exists, which
+     * may be the root tx.
+     */
+    if (n->serialized_tx != XBT_NULL) {
+        flag = NODE_REF;
+        g_byte_array_append(ss->bytes, &flag, 1);
+        write_be32(ss->bytes, n->serialized_tx);
+    } else {
+        GList *l;
+        n->serialized_tx = ss->tx_id;
+
+        if (n->modified_in_tx) {
+            flag |= MODIFIED_IN_TX;
+        }
+        if (n->deleted_in_tx) {
+            flag |= DELETED_IN_TX;
+        }
+        g_byte_array_append(ss->bytes, &flag, 1);
+
+        if (n->content) {
+            write_be32(ss->bytes, n->content->len);
+            g_byte_array_append(ss->bytes, n->content->data, n->content->len);
+        } else {
+            write_be32(ss->bytes, 0);
+        }
+
+        for (l = n->perms; l; l = l->next) {
+            g_byte_array_append(ss->bytes, l->data, strlen(l->data) + 1);
+        }
+        /* NUL termination after perms */
+        g_byte_array_append(ss->bytes, (void *)"", 1);
+
+        if (n->children) {
+            g_hash_table_foreach(n->children, save_node, ss);
+        }
+        /* NUL termination after children (child name is NUL) */
+        g_byte_array_append(ss->bytes, (void *)"", 1);
+    }
+}
+
+static void save_tree(struct save_state *ss, uint32_t tx_id, XsNode *root)
+{
+    write_be32(ss->bytes, tx_id);
+    ss->tx_id = tx_id;
+    save_node(NULL, root, ss);
+}
+
+static void save_tx(gpointer key, gpointer value, gpointer opaque)
+{
+    uint32_t tx_id = GPOINTER_TO_INT(key);
+    struct save_state *ss = opaque;
+    XsTransaction *n = value;
+
+    write_be32(ss->bytes, n->base_tx);
+    write_be32(ss->bytes, n->dom_id);
+
+    save_tree(ss, tx_id, n->root);
+}
+
+static void save_watch(gpointer key, gpointer value, gpointer opaque)
+{
+    struct save_state *ss = opaque;
+    XsWatch *w = value;
+
+    /* We only save the *guest* watches. */
+    if (w->dom_id) {
+        gpointer relpath = key + w->rel_prefix;
+        g_byte_array_append(ss->bytes, relpath, strlen(relpath) + 1);
+        g_byte_array_append(ss->bytes, (void *)w->token, strlen(w->token) + 1);
+    }
+}
+
+GByteArray *xs_impl_serialize(XenstoreImplState *s)
+{
+    struct save_state ss;
+
+    ss.bytes = g_byte_array_new();
+
+    /*
+     * node = flags [ real_node / node_ref ]
+     *   flags = uint8_t (MODIFIED_IN_TX | DELETED_IN_TX | NODE_REF)
+     *   node_ref = tx_id (in which the original version of this node exists)
+     *   real_node = content perms child* NUL
+     *     content = len data
+     *       len = uint32_t
+     *       data = uint8_t{len}
+     *     perms = perm* NUL
+     *       perm = asciiz
+     *   child = name node
+     *     name = asciiz
+     *
+     * tree = tx_id node
+     *   tx_id = uint32_t
+     *
+     * transaction = base_tx_id dom_id tree
+     *   base_tx_id = uint32_t
+     *   dom_id = uint32_t
+     *
+     * tx_list = tree transaction* XBT_NULL
+     *
+     * watch = path token
+     *   path = asciiz
+     *   token = asciiz
+     *
+     * watch_list = watch* NUL
+     *
+     * xs_serialize_stream = last_tx tx_list watch_list
+     *   last_tx = uint32_t
+     */
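+
+    /*
+     * Worked example (illustrative only, not from the source): a store
+     * whose root has the single perm "n0", no content, and one child
+     * "foo" containing "bar", with last_tx == root_tx == 1, no open
+     * transactions and no guest watches, serializes as:
+     *
+     *   00 00 00 01              last_tx
+     *   00 00 00 01              tx_id of the committed root tree
+     *   00                       root flags
+     *   00 00 00 00              root content length (no content)
+     *   6e 30 00 00              perm "n0" plus perms terminator
+     *   66 6f 6f 00              child name "foo"
+     *   00 00 00 00 03           child flags, content length 3
+     *   62 61 72                 "bar"
+     *   6e 30 00 00              child perms
+     *   00 00                    end of child's and root's children
+     *   00 00 00 00              XBT_NULL terminating the tx_list
+     *   00                       end of (empty) watch_list
+     */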
+
+    /* Clear serialized_tx in every node. */
+    if (s->serialized) {
+        clear_serialized_tx(NULL, s->root, NULL);
+        g_hash_table_foreach(s->transactions, clear_tx_serialized_tx, NULL);
+    }
+
+    s->serialized = true;
+
+    write_be32(ss.bytes, s->last_tx);
+    save_tree(&ss, s->root_tx, s->root);
+    g_hash_table_foreach(s->transactions, save_tx, &ss);
+
+    write_be32(ss.bytes, XBT_NULL);
+
+    g_hash_table_foreach(s->watches, save_watch, &ss);
+    g_byte_array_append(ss.bytes, (void *)"", 1);
+
+    return ss.bytes;
+}
+
+struct unsave_state {
+    char path[XENSTORE_ABS_PATH_MAX + 1];
+    XenstoreImplState *s;
+    GByteArray *bytes;
+    uint8_t *d;
+    size_t l;
+    bool root_walk;
+};
+
+static int consume_be32(struct unsave_state *us, unsigned int *val)
+{
+    uint32_t d;
+
+    if (us->l < sizeof(d)) {
+        return -EINVAL;
+    }
+    memcpy(&d, us->d, sizeof(d));
+    *val = ntohl(d);
+    us->d += sizeof(d);
+    us->l -= sizeof(d);
+    return 0;
+}
+
+static int consume_string(struct unsave_state *us, char **str, size_t *len)
+{
+    size_t l;
+
+    if (!us->l) {
+        return -EINVAL;
+    }
+
+    l = strnlen((void *)us->d, us->l);
+    if (l == us->l) {
+        return -EINVAL;
+    }
+
+    if (str) {
+        *str = (void *)us->d;
+    }
+    if (len) {
+        *len = l;
+    }
+
+    us->d += l + 1;
+    us->l -= l + 1;
+    return 0;
+}
+
+static XsNode *lookup_node(XsNode *n, char *path)
+{
+    char *slash = strchr(path, '/');
+    XsNode *child;
+
+    if (path[0] == '\0') {
+        return n;
+    }
+
+    if (slash) {
+        *slash = '\0';
+    }
+
+    if (!n->children) {
+        return NULL;
+    }
+    child = g_hash_table_lookup(n->children, path);
+    if (!slash) {
+        return child;
+    }
+
+    *slash = '/';
+    if (!child) {
+        return NULL;
+    }
+    return lookup_node(child, slash + 1);
+}
+
+static XsNode *lookup_tx_node(struct unsave_state *us, unsigned int tx_id)
+{
+    XsTransaction *t;
+    if (tx_id == us->s->root_tx) {
+        return lookup_node(us->s->root, us->path + 1);
+    }
+
+    t = g_hash_table_lookup(us->s->transactions, GINT_TO_POINTER(tx_id));
+    if (!t) {
+        return NULL;
+    }
+    g_assert(t->root);
+    return lookup_node(t->root, us->path + 1);
+}
+
+static void count_child_nodes(gpointer key, gpointer value, gpointer user_data)
+{
+    unsigned int *nr_nodes = user_data;
+    XsNode *n = value;
+
+    (*nr_nodes)++;
+
+    if (n->children) {
+        g_hash_table_foreach(n->children, count_child_nodes, nr_nodes);
+    }
+}
+
+static int consume_node(struct unsave_state *us, XsNode **nodep,
+                        unsigned int *nr_nodes)
+{
+    XsNode *n = NULL;
+    uint8_t flags;
+    int ret;
+
+    if (us->l < 1) {
+        return -EINVAL;
+    }
+    flags = us->d[0];
+    us->d++;
+    us->l--;
+
+    if (flags == NODE_REF) {
+        unsigned int tx;
+
+        ret = consume_be32(us, &tx);
+        if (ret) {
+            return ret;
+        }
+
+        n = lookup_tx_node(us, tx);
+        if (!n) {
+            return -EINVAL;
+        }
+        n->ref++;
+        if (n->children) {
+            g_hash_table_foreach(n->children, count_child_nodes, nr_nodes);
+        }
+    } else {
+        uint32_t datalen;
+
+        if (flags & ~(DELETED_IN_TX | MODIFIED_IN_TX)) {
+            return -EINVAL;
+        }
+        n = xs_node_new();
+
+        if (flags & DELETED_IN_TX) {
+            n->deleted_in_tx = true;
+        }
+        if (flags & MODIFIED_IN_TX) {
+            n->modified_in_tx = true;
+        }
+        ret = consume_be32(us, &datalen);
+        if (ret) {
+            xs_node_unref(n);
+            return -EINVAL;
+        }
+        if (datalen) {
+            if (datalen > us->l) {
+                xs_node_unref(n);
+                return -EINVAL;
+            }
+
+            GByteArray *node_data = g_byte_array_new();
+            g_byte_array_append(node_data, us->d, datalen);
+            us->d += datalen;
+            us->l -= datalen;
+            n->content = node_data;
+
+            if (us->root_walk) {
+                n->modified_in_tx = true;
+            }
+        }
+        while (1) {
+            char *perm = NULL;
+            size_t permlen = 0;
+
+            ret = consume_string(us, &perm, &permlen);
+            if (ret) {
+                xs_node_unref(n);
+                return ret;
+            }
+
+            if (!permlen) {
+                break;
+            }
+
+            n->perms = g_list_append(n->perms, g_strdup(perm));
+        }
+
+        /* Now children */
+        while (1) {
+            size_t childlen;
+            char *childname;
+            char *pathend;
+            XsNode *child = NULL;
+
+            ret = consume_string(us, &childname, &childlen);
+            if (ret) {
+                xs_node_unref(n);
+                return ret;
+            }
+
+            if (!childlen) {
+                break;
+            }
+
+            pathend = us->path + strlen(us->path);
+            strncat(us->path, "/", sizeof(us->path) - 1);
+            strncat(us->path, childname, sizeof(us->path) - 1);
+
+            ret = consume_node(us, &child, nr_nodes);
+            *pathend = '\0';
+            if (ret) {
+                xs_node_unref(n);
+                return ret;
+            }
+            g_assert(child);
+            xs_node_add_child(n, childname, child);
+        }
+
+        /*
+         * If the node has no data and no children we still want to fire
+         * a watch on it.
+         */
+        if (us->root_walk && !n->children) {
+            n->modified_in_tx = true;
+        }
+    }
+
+    if (!n->deleted_in_tx) {
+        (*nr_nodes)++;
+    }
+
+    *nodep = n;
+    return 0;
+}
+
+static int consume_tree(struct unsave_state *us, XsTransaction *t)
+{
+    int ret;
+
+    ret = consume_be32(us, &t->tx_id);
+    if (ret) {
+        return ret;
+    }
+
+    if (t->tx_id > us->s->last_tx) {
+        return -EINVAL;
+    }
+
+    us->path[0] = '\0';
+
+    return consume_node(us, &t->root, &t->nr_nodes);
+}
+
+int xs_impl_deserialize(XenstoreImplState *s, GByteArray *bytes,
+                        unsigned int dom_id, xs_impl_watch_fn watch_fn,
+                        void *watch_opaque)
+{
+    struct unsave_state us;
+    XsTransaction base_t = { 0 };
+    int ret;
+
+    us.s = s;
+    us.bytes = bytes;
+    us.d = bytes->data;
+    us.l = bytes->len;
+
+    xs_impl_reset_watches(s, dom_id);
+    g_hash_table_remove_all(s->transactions);
+
+    xs_node_unref(s->root);
+    s->root = NULL;
+    s->root_tx = s->last_tx = XBT_NULL;
+
+    ret = consume_be32(&us, &s->last_tx);
+    if (ret) {
+        return ret;
+    }
+
+    /*
+     * Consume the base tree into a transaction so that watches can be
+     * fired as we commit it. By setting us.root_walk we cause the nodes
+     * to be marked as 'modified_in_tx' as they are created, so that the
+     * watches are triggered on them.
+     */
+    base_t.dom_id = dom_id;
+    base_t.base_tx = XBT_NULL;
+    us.root_walk = true;
+    ret = consume_tree(&us, &base_t);
+    if (ret) {
+        return ret;
+    }
+    us.root_walk = false;
+
+    /*
+     * Commit the transaction now while the refcount on all nodes is 1.
+     * Note that we haven't yet reinstated the *guest* watches but that's
+     * OK because we don't want the guest to see any changes. Even any
+     * backend nodes which get recreated should be *precisely* as they
+     * were before the migration. Backends may have been instantiated
+     * already, and will see the frontend magically blink into existence
+     * now (well, from the aio_bh which fires the watches). It's their
+     * responsibility to rebuild everything precisely as it was before.
+     */
+    ret = transaction_commit(s, &base_t);
+    if (ret) {
+        return ret;
+    }
+
+    while (1) {
+        unsigned int base_tx;
+        XsTransaction *t;
+
+        ret = consume_be32(&us, &base_tx);
+        if (ret) {
+            return ret;
+        }
+        if (base_tx == XBT_NULL) {
+            break;
+        }
+
+        t = g_new0(XsTransaction, 1);
+        t->base_tx = base_tx;
+
+        ret = consume_be32(&us, &t->dom_id);
+        if (!ret) {
+            ret = consume_tree(&us, t);
+        }
+        if (ret) {
+            g_free(t);
+            return ret;
+        }
+        g_assert(t->root);
+        if (t->dom_id) {
+            s->nr_domu_transactions++;
+        }
+        g_hash_table_insert(s->transactions, GINT_TO_POINTER(t->tx_id), t);
+    }
+
+    while (1) {
+        char *path, *token;
+        size_t pathlen, toklen;
+
+        ret = consume_string(&us, &path, &pathlen);
+        if (ret) {
+            return ret;
+        }
+        if (!pathlen) {
+            break;
+        }
+
+        ret = consume_string(&us, &token, &toklen);
+        if (ret) {
+            return ret;
+        }
+
+        if (!watch_fn) {
+            continue;
+        }
+
+        ret = do_xs_impl_watch(s, dom_id, path, token, watch_fn, watch_opaque);
+        if (ret) {
+            return ret;
+        }
+    }
+
+    if (us.l) {
+        return -EINVAL;
+    }
+
+    return 0;
+}
diff --git a/hw/i386/kvm/xenstore_impl.h b/hw/i386/kvm/xenstore_impl.h
new file mode 100644
index 0000000000..0df2a91aae
--- /dev/null
+++ b/hw/i386/kvm/xenstore_impl.h
@@ -0,0 +1,63 @@
+/*
+ * QEMU Xen emulation: The actual implementation of XenStore
+ *
+ * Copyright © 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Authors: David Woodhouse <dwmw2@infradead.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_XENSTORE_IMPL_H
+#define QEMU_XENSTORE_IMPL_H
+
+#include "hw/xen/xen_backend_ops.h"
+
+typedef struct XenstoreImplState XenstoreImplState;
+
+XenstoreImplState *xs_impl_create(unsigned int dom_id);
+
+char *xs_perm_as_string(unsigned int perm, unsigned int domid);
+
+/*
+ * These functions return *positive* error numbers. This is a little
+ * unconventional but it helps to keep us honest because there is
+ * also a very limited set of error numbers that they are permitted
+ * to return (those in xsd_errors).
+ */
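+
+/*
+ * Illustrative (hypothetical caller): a failed lookup reports ENOENT as
+ * a positive value, e.g.
+ *
+ *     GByteArray *data = g_byte_array_new();
+ *     int err = xs_impl_read(s, dom_id, XBT_NULL, "/no/such/node", data);
+ *     -- here err == ENOENT, not -ENOENT --
+ */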
+
+int xs_impl_read(XenstoreImplState *s, unsigned int dom_id,
+                 xs_transaction_t tx_id, const char *path, GByteArray *data);
+int xs_impl_write(XenstoreImplState *s, unsigned int dom_id,
+                  xs_transaction_t tx_id, const char *path, GByteArray *data);
+int xs_impl_directory(XenstoreImplState *s, unsigned int dom_id,
+                      xs_transaction_t tx_id, const char *path,
+                      uint64_t *gencnt, GList **items);
+int xs_impl_transaction_start(XenstoreImplState *s, unsigned int dom_id,
+                              xs_transaction_t *tx_id);
+int xs_impl_transaction_end(XenstoreImplState *s, unsigned int dom_id,
+                            xs_transaction_t tx_id, bool commit);
+int xs_impl_rm(XenstoreImplState *s, unsigned int dom_id,
+               xs_transaction_t tx_id, const char *path);
+int xs_impl_get_perms(XenstoreImplState *s, unsigned int dom_id,
+                      xs_transaction_t tx_id, const char *path, GList **perms);
+int xs_impl_set_perms(XenstoreImplState *s, unsigned int dom_id,
+                      xs_transaction_t tx_id, const char *path, GList *perms);
+
+/* This differs from xs_watch_fn because it has the token */
+typedef void(xs_impl_watch_fn)(void *opaque, const char *path,
+                               const char *token);
+int xs_impl_watch(XenstoreImplState *s, unsigned int dom_id, const char *path,
+                  const char *token, xs_impl_watch_fn fn, void *opaque);
+int xs_impl_unwatch(XenstoreImplState *s, unsigned int dom_id,
+                    const char *path, const char *token, xs_impl_watch_fn fn,
+                    void *opaque);
+int xs_impl_reset_watches(XenstoreImplState *s, unsigned int dom_id);
+
+GByteArray *xs_impl_serialize(XenstoreImplState *s);
+int xs_impl_deserialize(XenstoreImplState *s, GByteArray *bytes,
+                        unsigned int dom_id, xs_impl_watch_fn watch_fn,
+                        void *watch_opaque);
+
+#endif /* QEMU_XENSTORE_IMPL_H */
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 7bebea57e3..1489abf010 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -102,6 +102,11 @@
 #include "trace.h"
 #include CONFIG_DEVICES
 
+#ifdef CONFIG_XEN_EMU
+#include "hw/xen/xen-legacy-backend.h"
+#include "hw/xen/xen-bus.h"
+#endif
+
 /*
  * Helper for setting model-id for CPU models that changed model-id
  * depending on QEMU versions up to QEMU 2.4.
@@ -1318,6 +1323,8 @@ void pc_basic_device_init(struct PCMachineState *pcms,
         if (pcms->bus) {
             pci_create_simple(pcms->bus, -1, "xen-platform");
         }
+        xen_bus_init();
+        xen_be_init();
     }
 #endif
 
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index 2f16011bab..30eedd62a3 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -47,8 +47,6 @@
 #include "hw/kvm/clock.h"
 #include "hw/sysbus.h"
 #include "hw/i2c/smbus_eeprom.h"
-#include "hw/xen/xen-x86.h"
-#include "hw/xen/xen.h"
 #include "exec/memory.h"
 #include "hw/acpi/acpi.h"
 #include "hw/acpi/piix4.h"
@@ -60,6 +58,8 @@
 #include <xen/hvm/hvm_info_table.h>
 #include "hw/xen/xen_pt.h"
 #endif
+#include "hw/xen/xen-x86.h"
+#include "hw/xen/xen.h"
 #include "migration/global_state.h"
 #include "migration/misc.h"
 #include "sysemu/numa.h"
@@ -422,6 +422,7 @@ static void pc_xen_hvm_init(MachineState *machine)
     }
 
     pc_xen_hvm_init_pci(machine);
+    xen_igd_reserve_slot(pcms->bus);
     pci_create_simple(pcms->bus, -1, "xen-platform");
 }
 #endif
diff --git a/hw/i386/xen/xen-hvm.c b/hw/i386/xen/xen-hvm.c
index e5a1dd19f4..56641a550e 100644
--- a/hw/i386/xen/xen-hvm.c
+++ b/hw/i386/xen/xen-hvm.c
@@ -18,7 +18,7 @@
 #include "hw/irq.h"
 #include "hw/hw.h"
 #include "hw/i386/apic-msidef.h"
-#include "hw/xen/xen_common.h"
+#include "hw/xen/xen_native.h"
 #include "hw/xen/xen-legacy-backend.h"
 #include "hw/xen/xen-bus.h"
 #include "hw/xen/xen-x86.h"
@@ -52,10 +52,11 @@ static bool xen_in_migration;
 
 /* Compatibility with older version */
 
-/* This allows QEMU to build on a system that has Xen 4.5 or earlier
- * installed.  This here (not in hw/xen/xen_common.h) because xen/hvm/ioreq.h
- * needs to be included before this block and hw/xen/xen_common.h needs to
- * be included before xen/hvm/ioreq.h
+/*
+ * This allows QEMU to build on a system that has Xen 4.5 or earlier installed.
+ * This is here (not in hw/xen/xen_native.h) because xen/hvm/ioreq.h needs to
+ * be included before this block and hw/xen/xen_native.h needs to be included
+ * before xen/hvm/ioreq.h
  */
 #ifndef IOREQ_TYPE_VMWARE_PORT
 #define IOREQ_TYPE_VMWARE_PORT  3
@@ -761,7 +762,7 @@ static ioreq_t *cpu_get_ioreq(XenIOState *state)
     int i;
     evtchn_port_t port;
 
-    port = xenevtchn_pending(state->xce_handle);
+    port = qemu_xen_evtchn_pending(state->xce_handle);
     if (port == state->bufioreq_local_port) {
         timer_mod(state->buffered_io_timer,
                 BUFFER_IO_MAX_DELAY + qemu_clock_get_ms(QEMU_CLOCK_REALTIME));
@@ -780,7 +781,7 @@ static ioreq_t *cpu_get_ioreq(XenIOState *state)
         }
 
         /* unmask the wanted port again */
-        xenevtchn_unmask(state->xce_handle, port);
+        qemu_xen_evtchn_unmask(state->xce_handle, port);
 
         /* get the io packet from shared memory */
         state->send_vcpu = i;
@@ -1147,7 +1148,7 @@ static void handle_buffered_io(void *opaque)
                 BUFFER_IO_MAX_DELAY + qemu_clock_get_ms(QEMU_CLOCK_REALTIME));
     } else {
         timer_del(state->buffered_io_timer);
-        xenevtchn_unmask(state->xce_handle, state->bufioreq_local_port);
+        qemu_xen_evtchn_unmask(state->xce_handle, state->bufioreq_local_port);
     }
 }
 
@@ -1196,8 +1197,8 @@ static void cpu_handle_ioreq(void *opaque)
         }
 
         req->state = STATE_IORESP_READY;
-        xenevtchn_notify(state->xce_handle,
-                         state->ioreq_local_port[state->send_vcpu]);
+        qemu_xen_evtchn_notify(state->xce_handle,
+                               state->ioreq_local_port[state->send_vcpu]);
     }
 }
 
@@ -1206,7 +1207,7 @@ static void xen_main_loop_prepare(XenIOState *state)
     int evtchn_fd = -1;
 
     if (state->xce_handle != NULL) {
-        evtchn_fd = xenevtchn_fd(state->xce_handle);
+        evtchn_fd = qemu_xen_evtchn_fd(state->xce_handle);
     }
 
     state->buffered_io_timer = timer_new_ms(QEMU_CLOCK_REALTIME, handle_buffered_io,
@@ -1249,7 +1250,7 @@ static void xen_exit_notifier(Notifier *n, void *data)
         xenforeignmemory_unmap_resource(xen_fmem, state->fres);
     }
 
-    xenevtchn_close(state->xce_handle);
+    qemu_xen_evtchn_close(state->xce_handle);
     xs_daemon_close(state->xenstore);
 }
 
@@ -1397,9 +1398,11 @@ void xen_hvm_init_pc(PCMachineState *pcms, MemoryRegion **ram_memory)
     xen_pfn_t ioreq_pfn;
     XenIOState *state;
 
+    setup_xen_backend_ops();
+
     state = g_new0(XenIOState, 1);
 
-    state->xce_handle = xenevtchn_open(NULL, 0);
+    state->xce_handle = qemu_xen_evtchn_open();
     if (state->xce_handle == NULL) {
         perror("xen: event channel open");
         goto err;
@@ -1463,8 +1466,9 @@ void xen_hvm_init_pc(PCMachineState *pcms, MemoryRegion **ram_memory)
 
     /* FIXME: how about if we overflow the page here? */
     for (i = 0; i < max_cpus; i++) {
-        rc = xenevtchn_bind_interdomain(state->xce_handle, xen_domid,
-                                        xen_vcpu_eport(state->shared_page, i));
+        rc = qemu_xen_evtchn_bind_interdomain(state->xce_handle, xen_domid,
+                                              xen_vcpu_eport(state->shared_page,
+                                                             i));
         if (rc == -1) {
             error_report("shared evtchn %d bind error %d", i, errno);
             goto err;
@@ -1472,8 +1476,8 @@ void xen_hvm_init_pc(PCMachineState *pcms, MemoryRegion **ram_memory)
         state->ioreq_local_port[i] = rc;
     }
 
-    rc = xenevtchn_bind_interdomain(state->xce_handle, xen_domid,
-                                    state->bufioreq_remote_port);
+    rc = qemu_xen_evtchn_bind_interdomain(state->xce_handle, xen_domid,
+                                          state->bufioreq_remote_port);
     if (rc == -1) {
         error_report("buffered evtchn bind error %d", errno);
         goto err;
diff --git a/hw/i386/xen/xen-mapcache.c b/hw/i386/xen/xen-mapcache.c
index 1d0879d234..f7d974677d 100644
--- a/hw/i386/xen/xen-mapcache.c
+++ b/hw/i386/xen/xen-mapcache.c
@@ -14,7 +14,7 @@
 
 #include <sys/resource.h>
 
-#include "hw/xen/xen-legacy-backend.h"
+#include "hw/xen/xen_native.h"
 #include "qemu/bitmap.h"
 
 #include "sysemu/runstate.h"
diff --git a/hw/i386/xen/xen_platform.c b/hw/i386/xen/xen_platform.c
index 539f7da374..57f1d742c1 100644
--- a/hw/i386/xen/xen_platform.c
+++ b/hw/i386/xen/xen_platform.c
@@ -28,7 +28,6 @@
 #include "hw/ide/pci.h"
 #include "hw/pci/pci.h"
 #include "migration/vmstate.h"
-#include "hw/xen/xen.h"
 #include "net/net.h"
 #include "trace.h"
 #include "sysemu/xen.h"
@@ -38,10 +37,12 @@
 #include "qom/object.h"
 
 #ifdef CONFIG_XEN
-#include "hw/xen/xen_common.h"
-#include "hw/xen/xen-legacy-backend.h"
+#include "hw/xen/xen_native.h"
 #endif
 
+/* The rule is that xen_native.h must come first */
+#include "hw/xen/xen.h"
+
 //#define DEBUG_PLATFORM
 
 #ifdef DEBUG_PLATFORM
diff --git a/hw/intc/i8259.c b/hw/intc/i8259.c
index 17910f3bcb..bbae2d87f4 100644
--- a/hw/intc/i8259.c
+++ b/hw/intc/i8259.c
@@ -133,7 +133,7 @@ static void pic_set_irq(void *opaque, int irq, int level)
     }
 #endif
 
-    if (s->elcr & mask) {
+    if (s->ltim || (s->elcr & mask)) {
         /* level triggered */
         if (level) {
             s->irr |= mask;
@@ -167,7 +167,7 @@ static void pic_intack(PICCommonState *s, int irq)
         s->isr |= (1 << irq);
     }
     /* We don't clear a level sensitive interrupt here */
-    if (!(s->elcr & (1 << irq))) {
+    if (!s->ltim && !(s->elcr & (1 << irq))) {
         s->irr &= ~(1 << irq);
     }
     pic_update_irq(s);
@@ -224,6 +224,7 @@ static void pic_reset(DeviceState *dev)
     PICCommonState *s = PIC_COMMON(dev);
 
     s->elcr = 0;
+    s->ltim = 0;
     pic_init_reset(s);
 }
 
@@ -243,10 +244,7 @@ static void pic_ioport_write(void *opaque, hwaddr addr64,
             s->init_state = 1;
             s->init4 = val & 1;
             s->single_mode = val & 2;
-            if (val & 0x08) {
-                qemu_log_mask(LOG_UNIMP,
-                              "i8259: level sensitive irq not supported\n");
-            }
+            s->ltim = val & 8;
         } else if (val & 0x08) {
             if (val & 0x04) {
                 s->poll = 1;
diff --git a/hw/intc/i8259_common.c b/hw/intc/i8259_common.c
index af2e4a2241..c931dc2d07 100644
--- a/hw/intc/i8259_common.c
+++ b/hw/intc/i8259_common.c
@@ -51,7 +51,7 @@ void pic_reset_common(PICCommonState *s)
     s->special_fully_nested_mode = 0;
     s->init4 = 0;
     s->single_mode = 0;
-    /* Note: ELCR is not reset */
+    /* Note: ELCR and LTIM are not reset */
 }
 
 static int pic_dispatch_pre_save(void *opaque)
@@ -144,6 +144,24 @@ static void pic_print_info(InterruptStatsProvider *obj, Monitor *mon)
                    s->special_fully_nested_mode);
 }
 
+static bool ltim_state_needed(void *opaque)
+{
+    PICCommonState *s = PIC_COMMON(opaque);
+
+    return !!s->ltim;
+}
+
+static const VMStateDescription vmstate_pic_ltim = {
+    .name = "i8259/ltim",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = ltim_state_needed,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT8(ltim, PICCommonState),
+        VMSTATE_END_OF_LIST()
+    }
+};
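+
+/*
+ * Because this is a subsection with a .needed callback, "i8259/ltim" is
+ * only included in the migration stream when ltim is actually set, so
+ * guests which never enable level-triggered mode remain migratable to
+ * older QEMU versions without this field.
+ */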
+
 static const VMStateDescription vmstate_pic_common = {
     .name = "i8259",
     .version_id = 1,
@@ -168,6 +186,10 @@ static const VMStateDescription vmstate_pic_common = {
         VMSTATE_UINT8(single_mode, PICCommonState),
         VMSTATE_UINT8(elcr, PICCommonState),
         VMSTATE_END_OF_LIST()
+    },
+    .subsections = (const VMStateDescription*[]) {
+        &vmstate_pic_ltim,
+        NULL
     }
 };
 
diff --git a/hw/intc/mips_gic.c b/hw/intc/mips_gic.c
index bda4549925..4bdc3b1bd1 100644
--- a/hw/intc/mips_gic.c
+++ b/hw/intc/mips_gic.c
@@ -439,8 +439,8 @@ static void mips_gic_realize(DeviceState *dev, Error **errp)
 }
 
 static Property mips_gic_properties[] = {
-    DEFINE_PROP_INT32("num-vp", MIPSGICState, num_vps, 1),
-    DEFINE_PROP_INT32("num-irq", MIPSGICState, num_irq, 256),
+    DEFINE_PROP_UINT32("num-vp", MIPSGICState, num_vps, 1),
+    DEFINE_PROP_UINT32("num-irq", MIPSGICState, num_irq, 256),
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/intc/riscv_aclint.c b/hw/intc/riscv_aclint.c
index eee04643cb..b466a6abaf 100644
--- a/hw/intc/riscv_aclint.c
+++ b/hw/intc/riscv_aclint.c
@@ -130,7 +130,7 @@ static uint64_t riscv_aclint_mtimer_read(void *opaque, hwaddr addr,
         addr < (mtimer->timecmp_base + (mtimer->num_harts << 3))) {
         size_t hartid = mtimer->hartid_base +
                         ((addr - mtimer->timecmp_base) >> 3);
-        CPUState *cpu = qemu_get_cpu(hartid);
+        CPUState *cpu = cpu_by_arch_id(hartid);
         CPURISCVState *env = cpu ? cpu->env_ptr : NULL;
         if (!env) {
             qemu_log_mask(LOG_GUEST_ERROR,
@@ -173,7 +173,7 @@ static void riscv_aclint_mtimer_write(void *opaque, hwaddr addr,
         addr < (mtimer->timecmp_base + (mtimer->num_harts << 3))) {
         size_t hartid = mtimer->hartid_base +
                         ((addr - mtimer->timecmp_base) >> 3);
-        CPUState *cpu = qemu_get_cpu(hartid);
+        CPUState *cpu = cpu_by_arch_id(hartid);
         CPURISCVState *env = cpu ? cpu->env_ptr : NULL;
         if (!env) {
             qemu_log_mask(LOG_GUEST_ERROR,
@@ -231,7 +231,7 @@ static void riscv_aclint_mtimer_write(void *opaque, hwaddr addr,
 
         /* Check if timer interrupt is triggered for each hart. */
         for (i = 0; i < mtimer->num_harts; i++) {
-            CPUState *cpu = qemu_get_cpu(mtimer->hartid_base + i);
+            CPUState *cpu = cpu_by_arch_id(mtimer->hartid_base + i);
             CPURISCVState *env = cpu ? cpu->env_ptr : NULL;
             if (!env) {
                 continue;
@@ -292,7 +292,7 @@ static void riscv_aclint_mtimer_realize(DeviceState *dev, Error **errp)
     s->timecmp = g_new0(uint64_t, s->num_harts);
     /* Claim timer interrupt bits */
     for (i = 0; i < s->num_harts; i++) {
-        RISCVCPU *cpu = RISCV_CPU(qemu_get_cpu(s->hartid_base + i));
+        RISCVCPU *cpu = RISCV_CPU(cpu_by_arch_id(s->hartid_base + i));
         if (riscv_cpu_claim_interrupts(cpu, MIP_MTIP) < 0) {
             error_report("MTIP already claimed");
             exit(1);
@@ -372,7 +372,7 @@ DeviceState *riscv_aclint_mtimer_create(hwaddr addr, hwaddr size,
     sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, addr);
 
     for (i = 0; i < num_harts; i++) {
-        CPUState *cpu = qemu_get_cpu(hartid_base + i);
+        CPUState *cpu = cpu_by_arch_id(hartid_base + i);
         RISCVCPU *rvcpu = RISCV_CPU(cpu);
         CPURISCVState *env = cpu ? cpu->env_ptr : NULL;
         riscv_aclint_mtimer_callback *cb =
@@ -407,7 +407,7 @@ static uint64_t riscv_aclint_swi_read(void *opaque, hwaddr addr,
 
     if (addr < (swi->num_harts << 2)) {
         size_t hartid = swi->hartid_base + (addr >> 2);
-        CPUState *cpu = qemu_get_cpu(hartid);
+        CPUState *cpu = cpu_by_arch_id(hartid);
         CPURISCVState *env = cpu ? cpu->env_ptr : NULL;
         if (!env) {
             qemu_log_mask(LOG_GUEST_ERROR,
@@ -430,7 +430,7 @@ static void riscv_aclint_swi_write(void *opaque, hwaddr addr, uint64_t value,
 
     if (addr < (swi->num_harts << 2)) {
         size_t hartid = swi->hartid_base + (addr >> 2);
-        CPUState *cpu = qemu_get_cpu(hartid);
+        CPUState *cpu = cpu_by_arch_id(hartid);
         CPURISCVState *env = cpu ? cpu->env_ptr : NULL;
         if (!env) {
             qemu_log_mask(LOG_GUEST_ERROR,
@@ -545,7 +545,7 @@ DeviceState *riscv_aclint_swi_create(hwaddr addr, uint32_t hartid_base,
     sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, addr);
 
     for (i = 0; i < num_harts; i++) {
-        CPUState *cpu = qemu_get_cpu(hartid_base + i);
+        CPUState *cpu = cpu_by_arch_id(hartid_base + i);
         RISCVCPU *rvcpu = RISCV_CPU(cpu);
 
         qdev_connect_gpio_out(dev, i,
diff --git a/hw/intc/riscv_aplic.c b/hw/intc/riscv_aplic.c
index cfd007e629..cd7efc4ad4 100644
--- a/hw/intc/riscv_aplic.c
+++ b/hw/intc/riscv_aplic.c
@@ -833,7 +833,7 @@ static void riscv_aplic_realize(DeviceState *dev, Error **errp)
 
         /* Claim the CPU interrupt to be triggered by this APLIC */
         for (i = 0; i < aplic->num_harts; i++) {
-            RISCVCPU *cpu = RISCV_CPU(qemu_get_cpu(aplic->hartid_base + i));
+            RISCVCPU *cpu = RISCV_CPU(cpu_by_arch_id(aplic->hartid_base + i));
             if (riscv_cpu_claim_interrupts(cpu,
                 (aplic->mmode) ? MIP_MEIP : MIP_SEIP) < 0) {
                 error_report("%s already claimed",
@@ -966,7 +966,7 @@ DeviceState *riscv_aplic_create(hwaddr addr, hwaddr size,
 
     if (!msimode) {
         for (i = 0; i < num_harts; i++) {
-            CPUState *cpu = qemu_get_cpu(hartid_base + i);
+            CPUState *cpu = cpu_by_arch_id(hartid_base + i);
 
             qdev_connect_gpio_out_named(dev, NULL, i,
                                         qdev_get_gpio_in(DEVICE(cpu),
diff --git a/hw/intc/riscv_imsic.c b/hw/intc/riscv_imsic.c
index 4d4d5b50ca..fea3385b51 100644
--- a/hw/intc/riscv_imsic.c
+++ b/hw/intc/riscv_imsic.c
@@ -316,8 +316,8 @@ static const MemoryRegionOps riscv_imsic_ops = {
 static void riscv_imsic_realize(DeviceState *dev, Error **errp)
 {
     RISCVIMSICState *imsic = RISCV_IMSIC(dev);
-    RISCVCPU *rcpu = RISCV_CPU(qemu_get_cpu(imsic->hartid));
-    CPUState *cpu = qemu_get_cpu(imsic->hartid);
+    RISCVCPU *rcpu = RISCV_CPU(cpu_by_arch_id(imsic->hartid));
+    CPUState *cpu = cpu_by_arch_id(imsic->hartid);
     CPURISCVState *env = cpu ? cpu->env_ptr : NULL;
 
     imsic->num_eistate = imsic->num_pages * imsic->num_irqs;
@@ -413,7 +413,7 @@ DeviceState *riscv_imsic_create(hwaddr addr, uint32_t hartid, bool mmode,
                                 uint32_t num_pages, uint32_t num_ids)
 {
     DeviceState *dev = qdev_new(TYPE_RISCV_IMSIC);
-    CPUState *cpu = qemu_get_cpu(hartid);
+    CPUState *cpu = cpu_by_arch_id(hartid);
     uint32_t i;
 
     assert(!(addr & (IMSIC_MMIO_PAGE_SZ - 1)));
diff --git a/hw/isa/i82378.c b/hw/isa/i82378.c
index 233059c6dc..5432ab5065 100644
--- a/hw/isa/i82378.c
+++ b/hw/isa/i82378.c
@@ -47,6 +47,12 @@ static const VMStateDescription vmstate_i82378 = {
     },
 };
 
+static void i82378_request_out0_irq(void *opaque, int irq, int level)
+{
+    I82378State *s = opaque;
+    qemu_set_irq(s->cpu_intr, level);
+}
+
 static void i82378_request_pic_irq(void *opaque, int irq, int level)
 {
     DeviceState *dev = opaque;
@@ -88,7 +94,9 @@ static void i82378_realize(PCIDevice *pci, Error **errp)
      */
 
     /* 2 82C59 (irq) */
-    s->isa_irqs_in = i8259_init(isabus, s->cpu_intr);
+    s->isa_irqs_in = i8259_init(isabus,
+                                qemu_allocate_irq(i82378_request_out0_irq,
+                                                  s, 0));
     isa_bus_register_input_irqs(isabus, s->isa_irqs_in);
 
     /* 1 82C54 (pit) */
diff --git a/hw/isa/lpc_ich9.c b/hw/isa/lpc_ich9.c
index d8303d0322..9714b0001e 100644
--- a/hw/isa/lpc_ich9.c
+++ b/hw/isa/lpc_ich9.c
@@ -865,6 +865,7 @@ static void ich9_lpc_class_init(ObjectClass *klass, void *data)
     hc->plug = ich9_pm_device_plug_cb;
     hc->unplug_request = ich9_pm_device_unplug_request_cb;
     hc->unplug = ich9_pm_device_unplug_cb;
+    hc->is_hotpluggable_bus = ich9_pm_is_hotpluggable_bus;
     adevc->ospm_status = ich9_pm_ospm_status;
     adevc->send_event = ich9_send_gpe;
     adevc->madt_cpu = pc_madt_cpu_entry;
diff --git a/hw/isa/trace-events b/hw/isa/trace-events
index c4567a9b47..1816e8307a 100644
--- a/hw/isa/trace-events
+++ b/hw/isa/trace-events
@@ -16,6 +16,7 @@ apm_io_write(uint8_t addr, uint8_t val) "write addr=0x%x val=0x%02x"
 
 # vt82c686.c
 via_isa_write(uint32_t addr, uint32_t val, int len) "addr 0x%x val 0x%x len 0x%x"
+via_pm_read(uint32_t addr, uint32_t val, int len) "addr 0x%x val 0x%x len 0x%x"
 via_pm_write(uint32_t addr, uint32_t val, int len) "addr 0x%x val 0x%x len 0x%x"
 via_pm_io_read(uint32_t addr, uint32_t val, int len) "addr 0x%x val 0x%x len 0x%x"
 via_pm_io_write(uint32_t addr, uint32_t val, int len) "addr 0x%x val 0x%x len 0x%x"
diff --git a/hw/isa/vt82c686.c b/hw/isa/vt82c686.c
index f4c40965cd..ca89119ce0 100644
--- a/hw/isa/vt82c686.c
+++ b/hw/isa/vt82c686.c
@@ -554,7 +554,7 @@ struct ViaISAState {
     PCIIDEState ide;
     UHCIState uhci[2];
     ViaPMState pm;
-    PCIDevice ac97;
+    ViaAC97State ac97;
     PCIDevice mc97;
 };
 
@@ -598,15 +598,63 @@ void via_isa_set_irq(PCIDevice *d, int n, int level)
     qemu_set_irq(s->isa_irqs_in[n], level);
 }
 
+static void via_isa_request_i8259_irq(void *opaque, int irq, int level)
+{
+    ViaISAState *s = opaque;
+    qemu_set_irq(s->cpu_intr, level);
+}
+
+static int via_isa_get_pci_irq(const ViaISAState *s, int irq_num)
+{
+    switch (irq_num) {
+    case 0:
+        return s->dev.config[0x55] >> 4;
+    case 1:
+        return s->dev.config[0x56] & 0xf;
+    case 2:
+        return s->dev.config[0x56] >> 4;
+    case 3:
+        return s->dev.config[0x57] >> 4;
+    }
+    return 0;
+}
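+
+/*
+ * Per the decoding above: config byte 0x55 bits 7:4 route PIRQ0, 0x56
+ * bits 3:0 route PIRQ1, 0x56 bits 7:4 route PIRQ2 and 0x57 bits 7:4
+ * route PIRQ3. A nibble of 0 leaves the pin unrouted.
+ */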
+
+static void via_isa_set_pci_irq(void *opaque, int irq_num, int level)
+{
+    ViaISAState *s = opaque;
+    PCIBus *bus = pci_get_bus(&s->dev);
+    int i, pic_level, pic_irq = via_isa_get_pci_irq(s, irq_num);
+
+    /* IRQ 0: disabled, IRQ 2,8,13: reserved */
+    if (!pic_irq) {
+        return;
+    }
+    if (unlikely(pic_irq == 2 || pic_irq == 8 || pic_irq == 13)) {
+        qemu_log_mask(LOG_GUEST_ERROR, "Invalid ISA IRQ routing");
+    }
+
+    /* The pic level is the logical OR of all the PCI irqs mapped to it. */
+    pic_level = 0;
+    for (i = 0; i < PCI_NUM_PINS; i++) {
+        if (pic_irq == via_isa_get_pci_irq(s, i)) {
+            pic_level |= pci_bus_get_irq_level(bus, i);
+        }
+    }
+    /* Now we change the pic irq level according to the via irq mappings. */
+    qemu_set_irq(s->isa_irqs_in[pic_irq], pic_level);
+}
+
 static void via_isa_realize(PCIDevice *d, Error **errp)
 {
     ViaISAState *s = VIA_ISA(d);
     DeviceState *dev = DEVICE(d);
     PCIBus *pci_bus = pci_get_bus(d);
+    qemu_irq *isa_irq;
     ISABus *isa_bus;
     int i;
 
     qdev_init_gpio_out(dev, &s->cpu_intr, 1);
+    isa_irq = qemu_allocate_irqs(via_isa_request_i8259_irq, s, 1);
     isa_bus = isa_bus_new(dev, pci_address_space(d), pci_address_space_io(d),
                           errp);
 
@@ -614,11 +662,13 @@ static void via_isa_realize(PCIDevice *d, Error **errp)
         return;
     }
 
-    s->isa_irqs_in = i8259_init(isa_bus, s->cpu_intr);
+    s->isa_irqs_in = i8259_init(isa_bus, *isa_irq);
     isa_bus_register_input_irqs(isa_bus, s->isa_irqs_in);
     i8254_pit_init(isa_bus, 0x40, 0, NULL);
     i8257_dma_init(isa_bus, 0);
 
+    qdev_init_gpio_in_named(dev, via_isa_set_pci_irq, "pirq", PCI_NUM_PINS);
+
     /* RTC */
     qdev_prop_set_int32(DEVICE(&s->rtc), "base_year", 2000);
     if (!qdev_realize(DEVICE(&s->rtc), BUS(isa_bus), errp)) {
diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c
index 217a5e639b..abe60b362c 100644
--- a/hw/mem/cxl_type3.c
+++ b/hw/mem/cxl_type3.c
@@ -1,6 +1,7 @@
 #include "qemu/osdep.h"
 #include "qemu/units.h"
 #include "qemu/error-report.h"
+#include "qapi/qapi-commands-cxl.h"
 #include "hw/mem/memory-device.h"
 #include "hw/mem/pc-dimm.h"
 #include "hw/pci/pci.h"
@@ -250,6 +251,7 @@ static void ct3d_config_write(PCIDevice *pci_dev, uint32_t addr, uint32_t val,
 
     pcie_doe_write_config(&ct3d->doe_cdat, addr, val, size);
     pci_default_write_config(pci_dev, addr, val, size);
+    pcie_aer_write_config(pci_dev, addr, val, size);
 }
 
 /*
@@ -322,6 +324,66 @@ static void hdm_decoder_commit(CXLType3Dev *ct3d, int which)
     ARRAY_FIELD_DP32(cache_mem, CXL_HDM_DECODER0_CTRL, COMMITTED, 1);
 }
 
+static int ct3d_qmp_uncor_err_to_cxl(CxlUncorErrorType qmp_err)
+{
+    switch (qmp_err) {
+    case CXL_UNCOR_ERROR_TYPE_CACHE_DATA_PARITY:
+        return CXL_RAS_UNC_ERR_CACHE_DATA_PARITY;
+    case CXL_UNCOR_ERROR_TYPE_CACHE_ADDRESS_PARITY:
+        return CXL_RAS_UNC_ERR_CACHE_ADDRESS_PARITY;
+    case CXL_UNCOR_ERROR_TYPE_CACHE_BE_PARITY:
+        return CXL_RAS_UNC_ERR_CACHE_BE_PARITY;
+    case CXL_UNCOR_ERROR_TYPE_CACHE_DATA_ECC:
+        return CXL_RAS_UNC_ERR_CACHE_DATA_ECC;
+    case CXL_UNCOR_ERROR_TYPE_MEM_DATA_PARITY:
+        return CXL_RAS_UNC_ERR_MEM_DATA_PARITY;
+    case CXL_UNCOR_ERROR_TYPE_MEM_ADDRESS_PARITY:
+        return CXL_RAS_UNC_ERR_MEM_ADDRESS_PARITY;
+    case CXL_UNCOR_ERROR_TYPE_MEM_BE_PARITY:
+        return CXL_RAS_UNC_ERR_MEM_BE_PARITY;
+    case CXL_UNCOR_ERROR_TYPE_MEM_DATA_ECC:
+        return CXL_RAS_UNC_ERR_MEM_DATA_ECC;
+    case CXL_UNCOR_ERROR_TYPE_REINIT_THRESHOLD:
+        return CXL_RAS_UNC_ERR_REINIT_THRESHOLD;
+    case CXL_UNCOR_ERROR_TYPE_RSVD_ENCODING:
+        return CXL_RAS_UNC_ERR_RSVD_ENCODING;
+    case CXL_UNCOR_ERROR_TYPE_POISON_RECEIVED:
+        return CXL_RAS_UNC_ERR_POISON_RECEIVED;
+    case CXL_UNCOR_ERROR_TYPE_RECEIVER_OVERFLOW:
+        return CXL_RAS_UNC_ERR_RECEIVER_OVERFLOW;
+    case CXL_UNCOR_ERROR_TYPE_INTERNAL:
+        return CXL_RAS_UNC_ERR_INTERNAL;
+    case CXL_UNCOR_ERROR_TYPE_CXL_IDE_TX:
+        return CXL_RAS_UNC_ERR_CXL_IDE_TX;
+    case CXL_UNCOR_ERROR_TYPE_CXL_IDE_RX:
+        return CXL_RAS_UNC_ERR_CXL_IDE_RX;
+    default:
+        return -EINVAL;
+    }
+}
+
+static int ct3d_qmp_cor_err_to_cxl(CxlCorErrorType qmp_err)
+{
+    switch (qmp_err) {
+    case CXL_COR_ERROR_TYPE_CACHE_DATA_ECC:
+        return CXL_RAS_COR_ERR_CACHE_DATA_ECC;
+    case CXL_COR_ERROR_TYPE_MEM_DATA_ECC:
+        return CXL_RAS_COR_ERR_MEM_DATA_ECC;
+    case CXL_COR_ERROR_TYPE_CRC_THRESHOLD:
+        return CXL_RAS_COR_ERR_CRC_THRESHOLD;
+    case CXL_COR_ERROR_TYPE_RETRY_THRESHOLD:
+        return CXL_RAS_COR_ERR_RETRY_THRESHOLD;
+    case CXL_COR_ERROR_TYPE_CACHE_POISON_RECEIVED:
+        return CXL_RAS_COR_ERR_CACHE_POISON_RECEIVED;
+    case CXL_COR_ERROR_TYPE_MEM_POISON_RECEIVED:
+        return CXL_RAS_COR_ERR_MEM_POISON_RECEIVED;
+    case CXL_COR_ERROR_TYPE_PHYSICAL:
+        return CXL_RAS_COR_ERR_PHYSICAL;
+    default:
+        return -EINVAL;
+    }
+}
+
 static void ct3d_reg_write(void *opaque, hwaddr offset, uint64_t value,
                            unsigned size)
 {
@@ -340,6 +402,83 @@ static void ct3d_reg_write(void *opaque, hwaddr offset, uint64_t value,
         should_commit = FIELD_EX32(value, CXL_HDM_DECODER0_CTRL, COMMIT);
         which_hdm = 0;
         break;
+    case A_CXL_RAS_UNC_ERR_STATUS:
+    {
+        uint32_t capctrl = ldl_le_p(cache_mem + R_CXL_RAS_ERR_CAP_CTRL);
+        uint32_t fe = FIELD_EX32(capctrl, CXL_RAS_ERR_CAP_CTRL, FIRST_ERROR_POINTER);
+        CXLError *cxl_err;
+        uint32_t unc_err;
+
+        /*
+         * If a single bit is written which corresponds to the first error
+         * pointer being cleared, update the status and header log.
+         */
+        if (!QTAILQ_EMPTY(&ct3d->error_list)) {
+            if ((1 << fe) ^ value) {
+                CXLError *cxl_next;
+                /*
+                 * Software is using the wrong flow for multiple header
+                 * recording. Following the behavior in PCIe r6.0 and
+                 * assuming multiple header support, it is an implementation
+                 * defined choice to clear all matching records if more than
+                 * one bit is set - which corresponds most closely to the
+                 * behavior of hardware not capable of multiple header
+                 * recording.
+                 */
+                QTAILQ_FOREACH_SAFE(cxl_err, &ct3d->error_list, node, cxl_next) {
+                    if ((1 << cxl_err->type) & value) {
+                        QTAILQ_REMOVE(&ct3d->error_list, cxl_err, node);
+                        g_free(cxl_err);
+                    }
+                }
+            } else {
+                /* Done with previous FE, so drop from list */
+                cxl_err = QTAILQ_FIRST(&ct3d->error_list);
+                QTAILQ_REMOVE(&ct3d->error_list, cxl_err, node);
+                g_free(cxl_err);
+            }
+
+            /*
+             * If there is another FE, then put that in place and update
+             * the header log
+             */
+            if (!QTAILQ_EMPTY(&ct3d->error_list)) {
+                uint32_t *header_log = &cache_mem[R_CXL_RAS_ERR_HEADER0];
+                int i;
+
+                cxl_err = QTAILQ_FIRST(&ct3d->error_list);
+                for (i = 0; i < CXL_RAS_ERR_HEADER_NUM; i++) {
+                    stl_le_p(header_log + i, cxl_err->header[i]);
+                }
+                capctrl = FIELD_DP32(capctrl, CXL_RAS_ERR_CAP_CTRL,
+                                     FIRST_ERROR_POINTER, cxl_err->type);
+            } else {
+                /*
+                 * If no more errors, then follow the recommendation of
+                 * PCI spec r6.0 6.2.4.2 to set the first error pointer to
+                 * a status bit that will never be used.
+                 */
+                capctrl = FIELD_DP32(capctrl, CXL_RAS_ERR_CAP_CTRL,
+                                     FIRST_ERROR_POINTER,
+                                     CXL_RAS_UNC_ERR_CXL_UNUSED);
+            }
+            stl_le_p((uint8_t *)cache_mem + A_CXL_RAS_ERR_CAP_CTRL, capctrl);
+        }
+        unc_err = 0;
+        QTAILQ_FOREACH(cxl_err, &ct3d->error_list, node) {
+            unc_err |= 1 << cxl_err->type;
+        }
+        stl_le_p((uint8_t *)cache_mem + offset, unc_err);
+
+        return;
+    }
+    case A_CXL_RAS_COR_ERR_STATUS:
+    {
+        uint32_t rw1c = value;
+        uint32_t temp = ldl_le_p((uint8_t *)cache_mem + offset);
+        temp &= ~rw1c;
+        stl_le_p((uint8_t *)cache_mem + offset, temp);
+        return;
+    }
     default:
         break;
     }
@@ -403,6 +542,8 @@ static void ct3_realize(PCIDevice *pci_dev, Error **errp)
     unsigned short msix_num = 1;
     int i, rc;
 
+    QTAILQ_INIT(&ct3d->error_list);
+
     if (!cxl_setup_memory(ct3d, errp)) {
         return;
     }
@@ -452,8 +593,19 @@ static void ct3_realize(PCIDevice *pci_dev, Error **errp)
     cxl_cstate->cdat.free_cdat_table = ct3_free_cdat_table;
     cxl_cstate->cdat.private = ct3d;
     cxl_doe_cdat_init(cxl_cstate, errp);
+
+    pcie_cap_deverr_init(pci_dev);
+    /* Leave a bit of room for expansion */
+    rc = pcie_aer_init(pci_dev, PCI_ERR_VER, 0x200, PCI_ERR_SIZEOF, NULL);
+    if (rc) {
+        goto err_release_cdat;
+    }
+
     return;
 
+err_release_cdat:
+    cxl_doe_cdat_release(cxl_cstate);
+    g_free(regs->special_ops);
 err_address_space_free:
     address_space_destroy(&ct3d->hostmem_as);
     return;
@@ -465,6 +617,7 @@ static void ct3_exit(PCIDevice *pci_dev)
     CXLComponentState *cxl_cstate = &ct3d->cxl_cstate;
     ComponentRegisters *regs = &cxl_cstate->crb;
 
+    pcie_aer_exit(pci_dev);
     cxl_doe_cdat_release(cxl_cstate);
     g_free(regs->special_ops);
     address_space_destroy(&ct3d->hostmem_as);
@@ -618,6 +771,147 @@ static void set_lsa(CXLType3Dev *ct3d, const void *buf, uint64_t size,
      */
 }
 
+/* For uncorrectable errors include support for multiple header recording */
+void qmp_cxl_inject_uncorrectable_errors(const char *path,
+                                         CXLUncorErrorRecordList *errors,
+                                         Error **errp)
+{
+    Object *obj = object_resolve_path(path, NULL);
+    static PCIEAERErr err = {};
+    CXLType3Dev *ct3d;
+    CXLError *cxl_err;
+    uint32_t *reg_state;
+    uint32_t unc_err;
+    bool first;
+
+    if (!obj) {
+        error_setg(errp, "Unable to resolve path");
+        return;
+    }
+
+    if (!object_dynamic_cast(obj, TYPE_CXL_TYPE3)) {
+        error_setg(errp, "Path does not point to a CXL type 3 device");
+        return;
+    }
+
+    err.status = PCI_ERR_UNC_INTN;
+    err.source_id = pci_requester_id(PCI_DEVICE(obj));
+    err.flags = 0;
+
+    ct3d = CXL_TYPE3(obj);
+
+    first = QTAILQ_EMPTY(&ct3d->error_list);
+    reg_state = ct3d->cxl_cstate.crb.cache_mem_registers;
+    while (errors) {
+        uint32List *header = errors->value->header;
+        uint8_t header_count = 0;
+        int cxl_err_code;
+
+        cxl_err_code = ct3d_qmp_uncor_err_to_cxl(errors->value->type);
+        if (cxl_err_code < 0) {
+            error_setg(errp, "Unknown error code");
+            return;
+        }
+
+        /* If the error is masked, nothing to do here */
+        if (!((1 << cxl_err_code) &
+              ~ldl_le_p(reg_state + R_CXL_RAS_UNC_ERR_MASK))) {
+            errors = errors->next;
+            continue;
+        }
+
+        /* g_malloc0() aborts on allocation failure, so no NULL check */
+        cxl_err = g_malloc0(sizeof(*cxl_err));
+
+        cxl_err->type = cxl_err_code;
+        while (header && header_count < 32) {
+            cxl_err->header[header_count++] = header->value;
+            header = header->next;
+        }
+        /* Anything left over did not fit in the 32 DWORD header log */
+        if (header) {
+            error_setg(errp, "Header must be 32 DWORDs or fewer");
+            g_free(cxl_err);
+            return;
+        }
+        QTAILQ_INSERT_TAIL(&ct3d->error_list, cxl_err, node);
+
+        errors = errors->next;
+    }
+
+    if (first && !QTAILQ_EMPTY(&ct3d->error_list)) {
+        uint32_t *cache_mem = ct3d->cxl_cstate.crb.cache_mem_registers;
+        uint32_t capctrl = ldl_le_p(cache_mem + R_CXL_RAS_ERR_CAP_CTRL);
+        uint32_t *header_log = &cache_mem[R_CXL_RAS_ERR_HEADER0];
+        int i;
+
+        cxl_err = QTAILQ_FIRST(&ct3d->error_list);
+        for (i = 0; i < CXL_RAS_ERR_HEADER_NUM; i++) {
+            stl_le_p(header_log + i, cxl_err->header[i]);
+        }
+
+        capctrl = FIELD_DP32(capctrl, CXL_RAS_ERR_CAP_CTRL,
+                             FIRST_ERROR_POINTER, cxl_err->type);
+        stl_le_p(cache_mem + R_CXL_RAS_ERR_CAP_CTRL, capctrl);
+    }
+
+    unc_err = 0;
+    QTAILQ_FOREACH(cxl_err, &ct3d->error_list, node) {
+        unc_err |= (1 << cxl_err->type);
+    }
+    if (!unc_err) {
+        return;
+    }
+
+    stl_le_p(reg_state + R_CXL_RAS_UNC_ERR_STATUS, unc_err);
+    pcie_aer_inject_error(PCI_DEVICE(obj), &err);
+}
+
+void qmp_cxl_inject_correctable_error(const char *path, CxlCorErrorType type,
+                                      Error **errp)
+{
+    static PCIEAERErr err = {};
+    Object *obj = object_resolve_path(path, NULL);
+    CXLType3Dev *ct3d;
+    uint32_t *reg_state;
+    uint32_t cor_err;
+    int cxl_err_type;
+
+    if (!obj) {
+        error_setg(errp, "Unable to resolve path");
+        return;
+    }
+    if (!object_dynamic_cast(obj, TYPE_CXL_TYPE3)) {
+        error_setg(errp, "Path does not point to a CXL type 3 device");
+        return;
+    }
+
+    err.status = PCI_ERR_COR_INTERNAL;
+    err.source_id = pci_requester_id(PCI_DEVICE(obj));
+    err.flags = PCIE_AER_ERR_IS_CORRECTABLE;
+
+    ct3d = CXL_TYPE3(obj);
+    reg_state = ct3d->cxl_cstate.crb.cache_mem_registers;
+    cor_err = ldl_le_p(reg_state + R_CXL_RAS_COR_ERR_STATUS);
+
+    cxl_err_type = ct3d_qmp_cor_err_to_cxl(type);
+    if (cxl_err_type < 0) {
+        error_setg(errp, "Invalid COR error");
+        return;
+    }
+    /* If the error is masked, nothing to do here */
+    if (!((1 << cxl_err_type) & ~ldl_le_p(reg_state + R_CXL_RAS_COR_ERR_MASK))) {
+        return;
+    }
+
+    cor_err |= (1 << cxl_err_type);
+    stl_le_p(reg_state + R_CXL_RAS_COR_ERR_STATUS, cor_err);
+
+    pcie_aer_inject_error(PCI_DEVICE(obj), &err);
+}
+
 static void ct3_class_init(ObjectClass *oc, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(oc);
diff --git a/hw/mem/cxl_type3_stubs.c b/hw/mem/cxl_type3_stubs.c
new file mode 100644
index 0000000000..d574c58f9a
--- /dev/null
+++ b/hw/mem/cxl_type3_stubs.c
@@ -0,0 +1,17 @@
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qapi/qapi-commands-cxl.h"
+
+void qmp_cxl_inject_uncorrectable_errors(const char *path,
+                                         CXLUncorErrorRecordList *errors,
+                                         Error **errp)
+{
+    error_setg(errp, "CXL Type 3 support is not compiled in");
+}
+
+void qmp_cxl_inject_correctable_error(const char *path, CxlCorErrorType type,
+                                      Error **errp)
+{
+    error_setg(errp, "CXL Type 3 support is not compiled in");
+}
diff --git a/hw/mem/meson.build b/hw/mem/meson.build
index 609b2b36fc..56c2618b84 100644
--- a/hw/mem/meson.build
+++ b/hw/mem/meson.build
@@ -4,6 +4,8 @@ mem_ss.add(when: 'CONFIG_DIMM', if_true: files('pc-dimm.c'))
 mem_ss.add(when: 'CONFIG_NPCM7XX', if_true: files('npcm7xx_mc.c'))
 mem_ss.add(when: 'CONFIG_NVDIMM', if_true: files('nvdimm.c'))
 mem_ss.add(when: 'CONFIG_CXL_MEM_DEVICE', if_true: files('cxl_type3.c'))
+softmmu_ss.add(when: 'CONFIG_CXL_MEM_DEVICE', if_false: files('cxl_type3_stubs.c'))
+softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('cxl_type3_stubs.c'))
 
 softmmu_ss.add_all(when: 'CONFIG_MEM_DEVICE', if_true: mem_ss)
 
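The stub file above keeps the QAPI error-injection commands linkable in builds without CONFIG_CXL_MEM_DEVICE. For the real path, a minimal sketch of building the argument list and calling the handler directly from C, e.g. from test code; the device id, the generated enum constant name and the QAPI_LIST_PREPEND usage follow the usual QAPI conventions but are assumptions here, not part of this patch:

    #include "qemu/osdep.h"
    #include "qapi/error.h"
    #include "qapi/util.h"
    #include "qapi/qapi-commands-cxl.h"

    static void inject_cache_data_parity(void)
    {
        Error *local_err = NULL;
        uint32List *hdr = NULL;
        CXLUncorErrorRecordList *recs = NULL;
        CXLUncorErrorRecord *rec = g_new0(CXLUncorErrorRecord, 1);

        QAPI_LIST_PREPEND(hdr, 0xdeadbeef);     /* one header log DWORD */
        rec->type = CXL_UNCOR_ERROR_TYPE_CACHE_DATA_PARITY; /* assumed name */
        rec->header = hdr;
        QAPI_LIST_PREPEND(recs, rec);

        /* "/machine/peripheral/cxl-mem0" is a hypothetical device id */
        qmp_cxl_inject_uncorrectable_errors("/machine/peripheral/cxl-mem0",
                                            recs, &local_err);
        qapi_free_CXLUncorErrorRecordList(recs);    /* frees rec and hdr too */
    }
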
diff --git a/hw/mips/boston.c b/hw/mips/boston.c
index a9d87f3437..21ad844519 100644
--- a/hw/mips/boston.c
+++ b/hw/mips/boston.c
@@ -702,7 +702,7 @@ static void boston_mach_init(MachineState *machine)
     object_initialize_child(OBJECT(machine), "cps", &s->cps, TYPE_MIPS_CPS);
     object_property_set_str(OBJECT(&s->cps), "cpu-type", machine->cpu_type,
                             &error_fatal);
-    object_property_set_int(OBJECT(&s->cps), "num-vp", machine->smp.cpus,
+    object_property_set_uint(OBJECT(&s->cps), "num-vp", machine->smp.cpus,
                             &error_fatal);
     qdev_connect_clock_in(DEVICE(&s->cps), "clk-in",
                           qdev_get_clock_out(dev, "cpu-refclk"));
diff --git a/hw/mips/cps.c b/hw/mips/cps.c
index 2b436700ce..2b5269ebf1 100644
--- a/hw/mips/cps.c
+++ b/hw/mips/cps.c
@@ -66,20 +66,17 @@ static bool cpu_mips_itu_supported(CPUMIPSState *env)
 static void mips_cps_realize(DeviceState *dev, Error **errp)
 {
     MIPSCPSState *s = MIPS_CPS(dev);
-    CPUMIPSState *env;
-    MIPSCPU *cpu;
-    int i;
     target_ulong gcr_base;
     bool itu_present = false;
-    bool saar_present = false;
 
     if (!clock_get(s->clock)) {
         error_setg(errp, "CPS input clock is not connected to an output clock");
         return;
     }
 
-    for (i = 0; i < s->num_vp; i++) {
-        cpu = MIPS_CPU(object_new(s->cpu_type));
+    for (int i = 0; i < s->num_vp; i++) {
+        MIPSCPU *cpu = MIPS_CPU(object_new(s->cpu_type));
+        CPUMIPSState *env = &cpu->env;
 
         /* All VPs are halted on reset. Leave powering up to CPC. */
         if (!object_property_set_bool(OBJECT(cpu), "start-powered-off", true,
@@ -97,7 +94,6 @@ static void mips_cps_realize(DeviceState *dev, Error **errp)
         cpu_mips_irq_init_cpu(cpu);
         cpu_mips_clock_init(cpu);
 
-        env = &cpu->env;
         if (cpu_mips_itu_supported(env)) {
             itu_present = true;
             /* Attach ITC Tag to the VP */
@@ -107,22 +103,15 @@ static void mips_cps_realize(DeviceState *dev, Error **errp)
         qemu_register_reset(main_cpu_reset, cpu);
     }
 
-    cpu = MIPS_CPU(first_cpu);
-    env = &cpu->env;
-    saar_present = (bool)env->saarp;
-
     /* Inter-Thread Communication Unit */
     if (itu_present) {
         object_initialize_child(OBJECT(dev), "itu", &s->itu, TYPE_MIPS_ITU);
-        object_property_set_int(OBJECT(&s->itu), "num-fifo", 16,
+        object_property_set_link(OBJECT(&s->itu), "cpu[0]",
+                                 OBJECT(first_cpu), &error_abort);
+        object_property_set_uint(OBJECT(&s->itu), "num-fifo", 16,
                                 &error_abort);
-        object_property_set_int(OBJECT(&s->itu), "num-semaphores", 16,
+        object_property_set_uint(OBJECT(&s->itu), "num-semaphores", 16,
                                 &error_abort);
-        object_property_set_bool(OBJECT(&s->itu), "saar-present", saar_present,
-                                 &error_abort);
-        if (saar_present) {
-            s->itu.saar = &env->CP0_SAAR;
-        }
         if (!sysbus_realize(SYS_BUS_DEVICE(&s->itu), errp)) {
             return;
         }
@@ -133,7 +122,7 @@ static void mips_cps_realize(DeviceState *dev, Error **errp)
 
     /* Cluster Power Controller */
     object_initialize_child(OBJECT(dev), "cpc", &s->cpc, TYPE_MIPS_CPC);
-    object_property_set_int(OBJECT(&s->cpc), "num-vp", s->num_vp,
+    object_property_set_uint(OBJECT(&s->cpc), "num-vp", s->num_vp,
                             &error_abort);
     object_property_set_int(OBJECT(&s->cpc), "vp-start-running", 1,
                             &error_abort);
@@ -146,9 +135,9 @@ static void mips_cps_realize(DeviceState *dev, Error **errp)
 
     /* Global Interrupt Controller */
     object_initialize_child(OBJECT(dev), "gic", &s->gic, TYPE_MIPS_GIC);
-    object_property_set_int(OBJECT(&s->gic), "num-vp", s->num_vp,
+    object_property_set_uint(OBJECT(&s->gic), "num-vp", s->num_vp,
                             &error_abort);
-    object_property_set_int(OBJECT(&s->gic), "num-irq", 128,
+    object_property_set_uint(OBJECT(&s->gic), "num-irq", 128,
                             &error_abort);
     if (!sysbus_realize(SYS_BUS_DEVICE(&s->gic), errp)) {
         return;
@@ -158,10 +147,10 @@ static void mips_cps_realize(DeviceState *dev, Error **errp)
                             sysbus_mmio_get_region(SYS_BUS_DEVICE(&s->gic), 0));
 
     /* Global Configuration Registers */
-    gcr_base = env->CP0_CMGCRBase << 4;
+    gcr_base = MIPS_CPU(first_cpu)->env.CP0_CMGCRBase << 4;
 
     object_initialize_child(OBJECT(dev), "gcr", &s->gcr, TYPE_MIPS_GCR);
-    object_property_set_int(OBJECT(&s->gcr), "num-vp", s->num_vp,
+    object_property_set_uint(OBJECT(&s->gcr), "num-vp", s->num_vp,
                             &error_abort);
     object_property_set_int(OBJECT(&s->gcr), "gcr-rev", 0x800,
                             &error_abort);
diff --git a/hw/mips/malta.c b/hw/mips/malta.c
index ec172b111a..af9021316d 100644
--- a/hw/mips/malta.c
+++ b/hw/mips/malta.c
@@ -1066,7 +1066,7 @@ static void create_cps(MachineState *ms, MaltaState *s,
     object_initialize_child(OBJECT(s), "cps", &s->cps, TYPE_MIPS_CPS);
     object_property_set_str(OBJECT(&s->cps), "cpu-type", ms->cpu_type,
                             &error_fatal);
-    object_property_set_int(OBJECT(&s->cps), "num-vp", ms->smp.cpus,
+    object_property_set_uint(OBJECT(&s->cps), "num-vp", ms->smp.cpus,
                             &error_fatal);
     qdev_connect_clock_in(DEVICE(&s->cps), "clk-in", s->cpuclk);
     sysbus_realize(SYS_BUS_DEVICE(&s->cps), &error_fatal);
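The setter changes in boston.c, malta.c and cps.c above pair with the property re-declarations in mips_cmgcr.c and mips_itu.c below: "num-vp", "num-fifo" and "num-semaphores" are now unsigned properties, so callers use the accessor matching the declared type. A minimal sketch of the pairing, assuming a device with one such property:

    /* declaration side: an unsigned 32-bit qdev property */
    DEFINE_PROP_UINT32("num-vp", MIPSGCRState, num_vps, 1),

    /* caller side: use the matching unsigned setter */
    object_property_set_uint(OBJECT(dev), "num-vp", smp_cpus, &error_fatal);
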
diff --git a/hw/misc/edu.c b/hw/misc/edu.c
index e935c418d4..a1f8bc77e7 100644
--- a/hw/misc/edu.c
+++ b/hw/misc/edu.c
@@ -267,6 +267,8 @@ static void edu_mmio_write(void *opaque, hwaddr addr, uint64_t val,
     case 0x20:
         if (val & EDU_STATUS_IRQFACT) {
             qatomic_or(&edu->status, EDU_STATUS_IRQFACT);
+            /* Order check of the COMPUTING flag after setting IRQFACT.  */
+            smp_mb__after_rmw();
         } else {
             qatomic_and(&edu->status, ~EDU_STATUS_IRQFACT);
         }
@@ -349,6 +351,9 @@ static void *edu_fact_thread(void *opaque)
         qemu_mutex_unlock(&edu->thr_mutex);
         qatomic_and(&edu->status, ~EDU_STATUS_COMPUTING);
 
+        /* Clear COMPUTING flag before checking IRQFACT.  */
+        smp_mb__after_rmw();
+
         if (qatomic_read(&edu->status) & EDU_STATUS_IRQFACT) {
             qemu_mutex_lock_iothread();
             edu_raise_irq(edu, FACT_IRQ);
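The two smp_mb__after_rmw() calls added above pair with each other: the MMIO side sets IRQFACT and then checks COMPUTING, while the worker clears COMPUTING and then checks IRQFACT. A sketch of the store-buffering hazard the barriers close (flag handling abbreviated):

    /*
     *   vCPU (edu_mmio_write)            factorial thread
     *   ---------------------            ----------------
     *   status |= IRQFACT;               status &= ~COMPUTING;
     *   smp_mb__after_rmw();             smp_mb__after_rmw();
     *   if (status & COMPUTING)          if (status & IRQFACT)
     *       ... wait ...                     raise FACT_IRQ;
     *
     * Each barrier orders the atomic update before the following load, so
     * at least one side is guaranteed to observe the other's flag and the
     * interrupt cannot be lost.
     */
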
diff --git a/hw/misc/mips_cmgcr.c b/hw/misc/mips_cmgcr.c
index 3c8b37f700..66eb11662c 100644
--- a/hw/misc/mips_cmgcr.c
+++ b/hw/misc/mips_cmgcr.c
@@ -212,7 +212,7 @@ static const VMStateDescription vmstate_mips_gcr = {
 };
 
 static Property mips_gcr_properties[] = {
-    DEFINE_PROP_INT32("num-vp", MIPSGCRState, num_vps, 1),
+    DEFINE_PROP_UINT32("num-vp", MIPSGCRState, num_vps, 1),
     DEFINE_PROP_INT32("gcr-rev", MIPSGCRState, gcr_rev, 0x800),
     DEFINE_PROP_UINT64("gcr-base", MIPSGCRState, gcr_base, GCR_BASE_ADDR),
     DEFINE_PROP_LINK("gic", MIPSGCRState, gic_mr, TYPE_MEMORY_REGION,
diff --git a/hw/misc/mips_itu.c b/hw/misc/mips_itu.c
index badef5c214..0eda302db4 100644
--- a/hw/misc/mips_itu.c
+++ b/hw/misc/mips_itu.c
@@ -93,10 +93,10 @@ void itc_reconfigure(MIPSITUState *tag)
     uint64_t size = (1 * KiB) + (am[1] & ITC_AM1_ADDR_MASK_MASK);
     bool is_enabled = (am[0] & ITC_AM0_EN_MASK) != 0;
 
-    if (tag->saar_present) {
-        address = ((*(uint64_t *) tag->saar) & 0xFFFFFFFFE000ULL) << 4;
-        size = 1ULL << ((*(uint64_t *) tag->saar >> 1) & 0x1f);
-        is_enabled = *(uint64_t *) tag->saar & 1;
+    if (tag->saar) {
+        address = (tag->saar[0] & 0xFFFFFFFFE000ULL) << 4;
+        size = 1ULL << ((tag->saar[0] >> 1) & 0x1f);
+        is_enabled = tag->saar[0] & 1;
     }
 
     memory_region_transaction_begin();
@@ -157,7 +157,7 @@ static inline ITCView get_itc_view(hwaddr addr)
 static inline int get_cell_stride_shift(const MIPSITUState *s)
 {
     /* Minimum interval (for EntryGain = 0) is 128 B */
-    if (s->saar_present) {
+    if (s->saar) {
         return 7 + ((s->icr0 >> ITC_ICR0_BLK_GRAIN) &
                     ITC_ICR0_BLK_GRAIN_MASK);
     } else {
@@ -515,6 +515,7 @@ static void mips_itu_init(Object *obj)
 static void mips_itu_realize(DeviceState *dev, Error **errp)
 {
     MIPSITUState *s = MIPS_ITU(dev);
+    CPUMIPSState *env;
 
     if (s->num_fifo > ITC_FIFO_NUM_MAX) {
         error_setg(errp, "Exceed maximum number of FIFO cells: %d",
@@ -526,6 +527,15 @@ static void mips_itu_realize(DeviceState *dev, Error **errp)
                    s->num_semaphores);
         return;
     }
+    if (!s->cpu0) {
+        error_setg(errp, "Missing 'cpu[0]' property");
+        return;
+    }
+
+    env = &s->cpu0->env;
+    if (env->saarp) {
+        s->saar = env->CP0_SAAR;
+    }
 
     s->cell = g_new(ITCStorageCell, get_num_cells(s));
 }
@@ -534,8 +544,8 @@ static void mips_itu_reset(DeviceState *dev)
 {
     MIPSITUState *s = MIPS_ITU(dev);
 
-    if (s->saar_present) {
-        *(uint64_t *) s->saar = 0x11 << 1;
+    if (s->saar) {
+        s->saar[0] = 0x11 << 1;
         s->icr0 = get_num_cells(s) << ITC_ICR0_CELL_NUM;
     } else {
         s->ITCAddressMap[0] = 0;
@@ -549,11 +559,11 @@ static void mips_itu_reset(DeviceState *dev)
 }
 
 static Property mips_itu_properties[] = {
-    DEFINE_PROP_INT32("num-fifo", MIPSITUState, num_fifo,
+    DEFINE_PROP_UINT32("num-fifo", MIPSITUState, num_fifo,
                       ITC_FIFO_NUM_MAX),
-    DEFINE_PROP_INT32("num-semaphores", MIPSITUState, num_semaphores,
+    DEFINE_PROP_UINT32("num-semaphores", MIPSITUState, num_semaphores,
                       ITC_SEMAPH_NUM_MAX),
-    DEFINE_PROP_BOOL("saar-present", MIPSITUState, saar_present, false),
+    DEFINE_PROP_LINK("cpu[0]", MIPSITUState, cpu0, TYPE_MIPS_CPU, MIPSCPU *),
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/net/Kconfig b/hw/net/Kconfig
index 1cc1c5775e..18c7851efe 100644
--- a/hw/net/Kconfig
+++ b/hw/net/Kconfig
@@ -44,6 +44,11 @@ config E1000E_PCI_EXPRESS
     default y if PCI_DEVICES
     depends on PCI_EXPRESS && MSI_NONBROKEN
 
+config IGB_PCI_EXPRESS
+    bool
+    default y if PCI_DEVICES
+    depends on PCI_EXPRESS && MSI_NONBROKEN
+
 config RTL8139_PCI
     bool
     default y if PCI_DEVICES
diff --git a/hw/net/e1000.c b/hw/net/e1000.c
index 7efb8a4c52..23d660619f 100644
--- a/hw/net/e1000.c
+++ b/hw/net/e1000.c
@@ -26,6 +26,7 @@
 
 
 #include "qemu/osdep.h"
+#include "hw/net/mii.h"
 #include "hw/pci/pci_device.h"
 #include "hw/qdev-properties.h"
 #include "migration/vmstate.h"
@@ -38,12 +39,11 @@
 #include "qemu/module.h"
 #include "qemu/range.h"
 
+#include "e1000_common.h"
 #include "e1000x_common.h"
 #include "trace.h"
 #include "qom/object.h"
 
-static const uint8_t bcast[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
-
 /* #define E1000_DEBUG */
 
 #ifdef E1000_DEBUG
@@ -66,9 +66,8 @@ static int debugflags = DBGBIT(TXERR) | DBGBIT(GENERAL);
 
 #define IOPORT_SIZE       0x40
 #define PNPMMIO_SIZE      0x20000
-#define MIN_BUF_SIZE      60 /* Min. octets in an ethernet frame sans FCS */
 
-#define MAXIMUM_ETHERNET_HDR_LEN (14+4)
+#define MAXIMUM_ETHERNET_HDR_LEN (ETH_HLEN + 4)
 
 /*
  * HW models:
@@ -181,67 +180,73 @@ e1000_autoneg_done(E1000State *s)
 static bool
 have_autoneg(E1000State *s)
 {
-    return chkflag(AUTONEG) && (s->phy_reg[PHY_CTRL] & MII_CR_AUTO_NEG_EN);
+    return chkflag(AUTONEG) && (s->phy_reg[MII_BMCR] & MII_BMCR_AUTOEN);
 }
 
 static void
 set_phy_ctrl(E1000State *s, int index, uint16_t val)
 {
-    /* bits 0-5 reserved; MII_CR_[RESTART_AUTO_NEG,RESET] are self clearing */
-    s->phy_reg[PHY_CTRL] = val & ~(0x3f |
-                                   MII_CR_RESET |
-                                   MII_CR_RESTART_AUTO_NEG);
+    /* bits 0-5 reserved; MII_BMCR_[ANRESTART,RESET] are self clearing */
+    s->phy_reg[MII_BMCR] = val & ~(0x3f |
+                                   MII_BMCR_RESET |
+                                   MII_BMCR_ANRESTART);
 
     /*
      * QEMU 1.3 does not support link auto-negotiation emulation, so if we
      * migrate during auto negotiation, after migration the link will be
      * down.
      */
-    if (have_autoneg(s) && (val & MII_CR_RESTART_AUTO_NEG)) {
+    if (have_autoneg(s) && (val & MII_BMCR_ANRESTART)) {
         e1000x_restart_autoneg(s->mac_reg, s->phy_reg, s->autoneg_timer);
     }
 }
 
 static void (*phyreg_writeops[])(E1000State *, int, uint16_t) = {
-    [PHY_CTRL] = set_phy_ctrl,
+    [MII_BMCR] = set_phy_ctrl,
 };
 
 enum { NPHYWRITEOPS = ARRAY_SIZE(phyreg_writeops) };
 
 enum { PHY_R = 1, PHY_W = 2, PHY_RW = PHY_R | PHY_W };
 static const char phy_regcap[0x20] = {
-    [PHY_STATUS]      = PHY_R,     [M88E1000_EXT_PHY_SPEC_CTRL] = PHY_RW,
-    [PHY_ID1]         = PHY_R,     [M88E1000_PHY_SPEC_CTRL]     = PHY_RW,
-    [PHY_CTRL]        = PHY_RW,    [PHY_1000T_CTRL]             = PHY_RW,
-    [PHY_LP_ABILITY]  = PHY_R,     [PHY_1000T_STATUS]           = PHY_R,
-    [PHY_AUTONEG_ADV] = PHY_RW,    [M88E1000_RX_ERR_CNTR]       = PHY_R,
-    [PHY_ID2]         = PHY_R,     [M88E1000_PHY_SPEC_STATUS]   = PHY_R,
-    [PHY_AUTONEG_EXP] = PHY_R,
+    [MII_BMSR]   = PHY_R,     [M88E1000_EXT_PHY_SPEC_CTRL] = PHY_RW,
+    [MII_PHYID1] = PHY_R,     [M88E1000_PHY_SPEC_CTRL]     = PHY_RW,
+    [MII_BMCR]   = PHY_RW,    [MII_CTRL1000]               = PHY_RW,
+    [MII_ANLPAR] = PHY_R,     [MII_STAT1000]               = PHY_R,
+    [MII_ANAR]   = PHY_RW,    [M88E1000_RX_ERR_CNTR]       = PHY_R,
+    [MII_PHYID2] = PHY_R,     [M88E1000_PHY_SPEC_STATUS]   = PHY_R,
+    [MII_ANER]   = PHY_R,
 };
 
-/* PHY_ID2 documented in 8254x_GBe_SDM.pdf, pp. 250 */
+/* MII_PHYID2 documented in 8254x_GBe_SDM.pdf, pp. 250 */
 static const uint16_t phy_reg_init[] = {
-    [PHY_CTRL]   = MII_CR_SPEED_SELECT_MSB |
-                   MII_CR_FULL_DUPLEX |
-                   MII_CR_AUTO_NEG_EN,
-
-    [PHY_STATUS] = MII_SR_EXTENDED_CAPS |
-                   MII_SR_LINK_STATUS |   /* link initially up */
-                   MII_SR_AUTONEG_CAPS |
-                   /* MII_SR_AUTONEG_COMPLETE: initially NOT completed */
-                   MII_SR_PREAMBLE_SUPPRESS |
-                   MII_SR_EXTENDED_STATUS |
-                   MII_SR_10T_HD_CAPS |
-                   MII_SR_10T_FD_CAPS |
-                   MII_SR_100X_HD_CAPS |
-                   MII_SR_100X_FD_CAPS,
-
-    [PHY_ID1] = 0x141,
-    /* [PHY_ID2] configured per DevId, from e1000_reset() */
-    [PHY_AUTONEG_ADV] = 0xde1,
-    [PHY_LP_ABILITY] = 0x1e0,
-    [PHY_1000T_CTRL] = 0x0e00,
-    [PHY_1000T_STATUS] = 0x3c00,
+    [MII_BMCR] = MII_BMCR_SPEED1000 |
+                 MII_BMCR_FD |
+                 MII_BMCR_AUTOEN,
+
+    [MII_BMSR] = MII_BMSR_EXTCAP |
+                 MII_BMSR_LINK_ST |   /* link initially up */
+                 MII_BMSR_AUTONEG |
+                 /* MII_BMSR_AN_COMP: initially NOT completed */
+                 MII_BMSR_MFPS |
+                 MII_BMSR_EXTSTAT |
+                 MII_BMSR_10T_HD |
+                 MII_BMSR_10T_FD |
+                 MII_BMSR_100TX_HD |
+                 MII_BMSR_100TX_FD,
+
+    [MII_PHYID1] = 0x141,
+    /* [MII_PHYID2] configured per DevId, from e1000_reset() */
+    [MII_ANAR] = MII_ANAR_CSMACD | MII_ANAR_10 |
+                 MII_ANAR_10FD | MII_ANAR_TX |
+                 MII_ANAR_TXFD | MII_ANAR_PAUSE |
+                 MII_ANAR_PAUSE_ASYM,
+    [MII_ANLPAR] = MII_ANLPAR_10 | MII_ANLPAR_10FD |
+                   MII_ANLPAR_TX | MII_ANLPAR_TXFD,
+    [MII_CTRL1000] = MII_CTRL1000_FULL | MII_CTRL1000_PORT |
+                     MII_CTRL1000_MASTER,
+    [MII_STAT1000] = MII_STAT1000_HALF | MII_STAT1000_FULL |
+                     MII_STAT1000_ROK | MII_STAT1000_LOK,
     [M88E1000_PHY_SPEC_CTRL] = 0x360,
     [M88E1000_PHY_SPEC_STATUS] = 0xac00,
     [M88E1000_EXT_PHY_SPEC_CTRL] = 0x0d60,
@@ -373,9 +378,9 @@ static bool e1000_vet_init_need(void *opaque)
     return chkflag(VET);
 }
 
-static void e1000_reset(void *opaque)
+static void e1000_reset_hold(Object *obj)
 {
-    E1000State *d = opaque;
+    E1000State *d = E1000(obj);
     E1000BaseClass *edc = E1000_GET_CLASS(d);
     uint8_t *macaddr = d->conf.macaddr.a;
 
@@ -386,10 +391,10 @@ static void e1000_reset(void *opaque)
     d->mit_irq_level = 0;
     d->mit_ide = 0;
     memset(d->phy_reg, 0, sizeof d->phy_reg);
-    memmove(d->phy_reg, phy_reg_init, sizeof phy_reg_init);
-    d->phy_reg[PHY_ID2] = edc->phy_id2;
+    memcpy(d->phy_reg, phy_reg_init, sizeof phy_reg_init);
+    d->phy_reg[MII_PHYID2] = edc->phy_id2;
     memset(d->mac_reg, 0, sizeof d->mac_reg);
-    memmove(d->mac_reg, mac_reg_init, sizeof mac_reg_init);
+    memcpy(d->mac_reg, mac_reg_init, sizeof mac_reg_init);
     d->rxbuf_min_shift = 1;
     memset(&d->tx, 0, sizeof d->tx);
 
@@ -547,9 +552,9 @@ putsum(uint8_t *data, uint32_t n, uint32_t sloc, uint32_t css, uint32_t cse)
 static inline void
 inc_tx_bcast_or_mcast_count(E1000State *s, const unsigned char *arr)
 {
-    if (!memcmp(arr, bcast, sizeof bcast)) {
+    if (is_broadcast_ether_addr(arr)) {
         e1000x_inc_reg_if_not_full(s->mac_reg, BPTC);
-    } else if (arr[0] & 1) {
+    } else if (is_multicast_ether_addr(arr)) {
         e1000x_inc_reg_if_not_full(s->mac_reg, MPTC);
     }
 }
@@ -561,13 +566,13 @@ e1000_send_packet(E1000State *s, const uint8_t *buf, int size)
                                     PTC1023, PTC1522 };
 
     NetClientState *nc = qemu_get_queue(s->nic);
-    if (s->phy_reg[PHY_CTRL] & MII_CR_LOOPBACK) {
+    if (s->phy_reg[MII_BMCR] & MII_BMCR_LOOPBACK) {
         qemu_receive_packet(nc, buf, size);
     } else {
         qemu_send_packet(nc, buf, size);
     }
     inc_tx_bcast_or_mcast_count(s, buf);
-    e1000x_increase_size_stats(s->mac_reg, PTCregs, size);
+    e1000x_increase_size_stats(s->mac_reg, PTCregs, size + 4);
 }
 
 static void
@@ -631,7 +636,7 @@ xmit_seg(E1000State *s)
     }
 
     e1000x_inc_reg_if_not_full(s->mac_reg, TPT);
-    e1000x_grow_8reg_if_not_full(s->mac_reg, TOTL, s->tx.size);
+    e1000x_grow_8reg_if_not_full(s->mac_reg, TOTL, s->tx.size + 4);
     s->mac_reg[GPTC] = s->mac_reg[TPT];
     s->mac_reg[GOTCL] = s->mac_reg[TOTL];
     s->mac_reg[GOTCH] = s->mac_reg[TOTH];
@@ -803,15 +808,18 @@ static int
 receive_filter(E1000State *s, const uint8_t *buf, int size)
 {
     uint32_t rctl = s->mac_reg[RCTL];
-    int isbcast = !memcmp(buf, bcast, sizeof bcast), ismcast = (buf[0] & 1);
+    int isbcast = is_broadcast_ether_addr(buf);
+    int ismcast = is_multicast_ether_addr(buf);
 
     if (e1000x_is_vlan_packet(buf, le16_to_cpu(s->mac_reg[VET])) &&
         e1000x_vlan_rx_filter_enabled(s->mac_reg)) {
-        uint16_t vid = lduw_be_p(buf + 14);
-        uint32_t vfta = ldl_le_p((uint32_t*)(s->mac_reg + VFTA) +
-                                 ((vid >> 5) & 0x7f));
-        if ((vfta & (1 << (vid & 0x1f))) == 0)
+        uint16_t vid = lduw_be_p(&PKT_GET_VLAN_HDR(buf)->h_tci);
+        uint32_t vfta =
+            ldl_le_p((uint32_t *)(s->mac_reg + VFTA) +
+                     ((vid >> E1000_VFTA_ENTRY_SHIFT) & E1000_VFTA_ENTRY_MASK));
+        if ((vfta & (1 << (vid & E1000_VFTA_ENTRY_BIT_SHIFT_MASK))) == 0) {
             return 0;
+        }
     }
 
     if (!isbcast && !ismcast && (rctl & E1000_RCTL_UPE)) { /* promiscuous ucast */
@@ -841,7 +849,7 @@ e1000_set_link_status(NetClientState *nc)
         e1000x_update_regs_on_link_down(s->mac_reg, s->phy_reg);
     } else {
         if (have_autoneg(s) &&
-            !(s->phy_reg[PHY_STATUS] & MII_SR_AUTONEG_COMPLETE)) {
+            !(s->phy_reg[MII_BMSR] & MII_BMSR_AN_COMP)) {
             e1000x_restart_autoneg(s->mac_reg, s->phy_reg, s->autoneg_timer);
         } else {
             e1000_link_up(s);
@@ -907,7 +915,7 @@ e1000_receive_iov(NetClientState *nc, const struct iovec *iov, int iovcnt)
     uint32_t rdh_start;
     uint16_t vlan_special = 0;
     uint8_t vlan_status = 0;
-    uint8_t min_buf[MIN_BUF_SIZE];
+    uint8_t min_buf[ETH_ZLEN];
     struct iovec min_iov;
     uint8_t *filter_buf = iov->iov_base;
     size_t size = iov_size(iov, iovcnt);
@@ -1061,30 +1069,6 @@ mac_readreg(E1000State *s, int index)
 }
 
 static uint32_t
-mac_low4_read(E1000State *s, int index)
-{
-    return s->mac_reg[index] & 0xf;
-}
-
-static uint32_t
-mac_low11_read(E1000State *s, int index)
-{
-    return s->mac_reg[index] & 0x7ff;
-}
-
-static uint32_t
-mac_low13_read(E1000State *s, int index)
-{
-    return s->mac_reg[index] & 0x1fff;
-}
-
-static uint32_t
-mac_low16_read(E1000State *s, int index)
-{
-    return s->mac_reg[index] & 0xffff;
-}
-
-static uint32_t
 mac_icr_read(E1000State *s, int index)
 {
     uint32_t ret = s->mac_reg[ICR];
@@ -1136,11 +1120,17 @@ set_rdt(E1000State *s, int index, uint32_t val)
     }
 }
 
-static void
-set_16bit(E1000State *s, int index, uint32_t val)
-{
-    s->mac_reg[index] = val & 0xffff;
-}
+#define LOW_BITS_SET_FUNC(num)                             \
+    static void                                            \
+    set_##num##bit(E1000State *s, int index, uint32_t val) \
+    {                                                      \
+        s->mac_reg[index] = val & (BIT(num) - 1);          \
+    }
+
+LOW_BITS_SET_FUNC(4)
+LOW_BITS_SET_FUNC(11)
+LOW_BITS_SET_FUNC(13)
+LOW_BITS_SET_FUNC(16)
 
 static void
 set_dlen(E1000State *s, int index, uint32_t val)
@@ -1194,7 +1184,9 @@ static const readops macreg_readops[] = {
     getreg(XONRXC),   getreg(XONTXC),   getreg(XOFFRXC),  getreg(XOFFTXC),
     getreg(RFC),      getreg(RJC),      getreg(RNBC),     getreg(TSCTFC),
     getreg(MGTPRC),   getreg(MGTPDC),   getreg(MGTPTC),   getreg(GORCL),
-    getreg(GOTCL),
+    getreg(GOTCL),    getreg(RDFH),     getreg(RDFT),     getreg(RDFHS),
+    getreg(RDFTS),    getreg(RDFPC),    getreg(TDFH),     getreg(TDFT),
+    getreg(TDFHS),    getreg(TDFTS),    getreg(TDFPC),    getreg(AIT),
 
     [TOTH]    = mac_read_clr8,      [TORH]    = mac_read_clr8,
     [GOTCH]   = mac_read_clr8,      [GORCH]   = mac_read_clr8,
@@ -1212,24 +1204,17 @@ static const readops macreg_readops[] = {
     [MPTC]    = mac_read_clr4,
     [ICR]     = mac_icr_read,       [EECD]    = get_eecd,
     [EERD]    = flash_eerd_read,
-    [RDFH]    = mac_low13_read,     [RDFT]    = mac_low13_read,
-    [RDFHS]   = mac_low13_read,     [RDFTS]   = mac_low13_read,
-    [RDFPC]   = mac_low13_read,
-    [TDFH]    = mac_low11_read,     [TDFT]    = mac_low11_read,
-    [TDFHS]   = mac_low13_read,     [TDFTS]   = mac_low13_read,
-    [TDFPC]   = mac_low13_read,
-    [AIT]     = mac_low16_read,
-
-    [CRCERRS ... MPC]   = &mac_readreg,
-    [IP6AT ... IP6AT+3] = &mac_readreg,    [IP4AT ... IP4AT+6] = &mac_readreg,
-    [FFLT ... FFLT+6]   = &mac_low11_read,
-    [RA ... RA+31]      = &mac_readreg,
-    [WUPM ... WUPM+31]  = &mac_readreg,
-    [MTA ... MTA+127]   = &mac_readreg,
-    [VFTA ... VFTA+127] = &mac_readreg,
-    [FFMT ... FFMT+254] = &mac_low4_read,
-    [FFVT ... FFVT+254] = &mac_readreg,
-    [PBM ... PBM+16383] = &mac_readreg,
+
+    [CRCERRS ... MPC]     = &mac_readreg,
+    [IP6AT ... IP6AT + 3] = &mac_readreg,    [IP4AT ... IP4AT + 6] = &mac_readreg,
+    [FFLT ... FFLT + 6]   = &mac_readreg,
+    [RA ... RA + 31]      = &mac_readreg,
+    [WUPM ... WUPM + 31]  = &mac_readreg,
+    [MTA ... MTA + E1000_MC_TBL_SIZE - 1]   = &mac_readreg,
+    [VFTA ... VFTA + E1000_VLAN_FILTER_TBL_SIZE - 1] = &mac_readreg,
+    [FFMT ... FFMT + 254] = &mac_readreg,
+    [FFVT ... FFVT + 254] = &mac_readreg,
+    [PBM ... PBM + 16383] = &mac_readreg,
 };
 enum { NREADOPS = ARRAY_SIZE(macreg_readops) };
 
@@ -1239,27 +1224,28 @@ static const writeops macreg_writeops[] = {
     putreg(PBA),      putreg(EERD),     putreg(SWSM),     putreg(WUFC),
     putreg(TDBAL),    putreg(TDBAH),    putreg(TXDCTL),   putreg(RDBAH),
     putreg(RDBAL),    putreg(LEDCTL),   putreg(VET),      putreg(FCRUC),
-    putreg(TDFH),     putreg(TDFT),     putreg(TDFHS),    putreg(TDFTS),
-    putreg(TDFPC),    putreg(RDFH),     putreg(RDFT),     putreg(RDFHS),
-    putreg(RDFTS),    putreg(RDFPC),    putreg(IPAV),     putreg(WUC),
-    putreg(WUS),      putreg(AIT),
-
-    [TDLEN]  = set_dlen,   [RDLEN]  = set_dlen,       [TCTL] = set_tctl,
-    [TDT]    = set_tctl,   [MDIC]   = set_mdic,       [ICS]  = set_ics,
-    [TDH]    = set_16bit,  [RDH]    = set_16bit,      [RDT]  = set_rdt,
-    [IMC]    = set_imc,    [IMS]    = set_ims,        [ICR]  = set_icr,
-    [EECD]   = set_eecd,   [RCTL]   = set_rx_control, [CTRL] = set_ctrl,
-    [RDTR]   = set_16bit,  [RADV]   = set_16bit,      [TADV] = set_16bit,
-    [ITR]    = set_16bit,
-
-    [IP6AT ... IP6AT+3] = &mac_writereg, [IP4AT ... IP4AT+6] = &mac_writereg,
-    [FFLT ... FFLT+6]   = &mac_writereg,
-    [RA ... RA+31]      = &mac_writereg,
-    [WUPM ... WUPM+31]  = &mac_writereg,
-    [MTA ... MTA+127]   = &mac_writereg,
-    [VFTA ... VFTA+127] = &mac_writereg,
-    [FFMT ... FFMT+254] = &mac_writereg, [FFVT ... FFVT+254] = &mac_writereg,
-    [PBM ... PBM+16383] = &mac_writereg,
+    putreg(IPAV),     putreg(WUC),
+    putreg(WUS),
+
+    [TDLEN]  = set_dlen,   [RDLEN]  = set_dlen,       [TCTL]  = set_tctl,
+    [TDT]    = set_tctl,   [MDIC]   = set_mdic,       [ICS]   = set_ics,
+    [TDH]    = set_16bit,  [RDH]    = set_16bit,      [RDT]   = set_rdt,
+    [IMC]    = set_imc,    [IMS]    = set_ims,        [ICR]   = set_icr,
+    [EECD]   = set_eecd,   [RCTL]   = set_rx_control, [CTRL]  = set_ctrl,
+    [RDTR]   = set_16bit,  [RADV]   = set_16bit,      [TADV]  = set_16bit,
+    [ITR]    = set_16bit,  [TDFH]   = set_11bit,      [TDFT]  = set_11bit,
+    [TDFHS]  = set_13bit,  [TDFTS]  = set_13bit,      [TDFPC] = set_13bit,
+    [RDFH]   = set_13bit,  [RDFT]   = set_13bit,      [RDFHS] = set_13bit,
+    [RDFTS]  = set_13bit,  [RDFPC]  = set_13bit,      [AIT]   = set_16bit,
+
+    [IP6AT ... IP6AT + 3] = &mac_writereg, [IP4AT ... IP4AT + 6] = &mac_writereg,
+    [FFLT ... FFLT + 6]   = &set_11bit,
+    [RA ... RA + 31]      = &mac_writereg,
+    [WUPM ... WUPM + 31]  = &mac_writereg,
+    [MTA ... MTA + E1000_MC_TBL_SIZE - 1] = &mac_writereg,
+    [VFTA ... VFTA + E1000_VLAN_FILTER_TBL_SIZE - 1] = &mac_writereg,
+    [FFMT ... FFMT + 254] = &set_4bit,     [FFVT ... FFVT + 254] = &mac_writereg,
+    [PBM ... PBM + 16383] = &mac_writereg,
 };
 
 enum { NWRITEOPS = ARRAY_SIZE(macreg_writeops) };
@@ -1415,10 +1401,10 @@ static int e1000_pre_save(void *opaque)
     /*
      * If link is down and auto-negotiation is supported and ongoing,
      * complete auto-negotiation immediately. This allows us to look
-     * at MII_SR_AUTONEG_COMPLETE to infer link status on load.
+     * at MII_BMSR_AN_COMP to infer link status on load.
      */
     if (nc->link_down && have_autoneg(s)) {
-        s->phy_reg[PHY_STATUS] |= MII_SR_AUTONEG_COMPLETE;
+        s->phy_reg[MII_BMSR] |= MII_BMSR_AN_COMP;
     }
 
     /* Decide which set of props to migrate in the main structure */
@@ -1457,8 +1443,7 @@ static int e1000_post_load(void *opaque, int version_id)
      * Alternatively, restart link negotiation if it was in progress. */
     nc->link_down = (s->mac_reg[STATUS] & E1000_STATUS_LU) == 0;
 
-    if (have_autoneg(s) &&
-        !(s->phy_reg[PHY_STATUS] & MII_SR_AUTONEG_COMPLETE)) {
+    if (have_autoneg(s) && !(s->phy_reg[MII_BMSR] & MII_BMSR_AN_COMP)) {
         nc->link_down = false;
         timer_mod(s->autoneg_timer,
                   qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + 500);
@@ -1624,8 +1609,9 @@ static const VMStateDescription vmstate_e1000 = {
         VMSTATE_UINT32(mac_reg[WUFC], E1000State),
         VMSTATE_UINT32(mac_reg[VET], E1000State),
         VMSTATE_UINT32_SUB_ARRAY(mac_reg, E1000State, RA, 32),
-        VMSTATE_UINT32_SUB_ARRAY(mac_reg, E1000State, MTA, 128),
-        VMSTATE_UINT32_SUB_ARRAY(mac_reg, E1000State, VFTA, 128),
+        VMSTATE_UINT32_SUB_ARRAY(mac_reg, E1000State, MTA, E1000_MC_TBL_SIZE),
+        VMSTATE_UINT32_SUB_ARRAY(mac_reg, E1000State, VFTA,
+                                 E1000_VLAN_FILTER_TBL_SIZE),
         VMSTATE_END_OF_LIST()
     },
     .subsections = (const VMStateDescription*[]) {
@@ -1746,12 +1732,6 @@ static void pci_e1000_realize(PCIDevice *pci_dev, Error **errp)
                                         e1000_flush_queue_timer, d);
 }
 
-static void qdev_e1000_reset(DeviceState *dev)
-{
-    E1000State *d = E1000(dev);
-    e1000_reset(d);
-}
-
 static Property e1000_properties[] = {
     DEFINE_NIC_PROPERTIES(E1000State, conf),
     DEFINE_PROP_BIT("autonegotiation", E1000State,
@@ -1777,6 +1757,7 @@ typedef struct E1000Info {
 static void e1000_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
+    ResettableClass *rc = RESETTABLE_CLASS(klass);
     PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
     E1000BaseClass *e = E1000_CLASS(klass);
     const E1000Info *info = data;
@@ -1789,9 +1770,9 @@ static void e1000_class_init(ObjectClass *klass, void *data)
     k->revision = info->revision;
     e->phy_id2 = info->phy_id2;
     k->class_id = PCI_CLASS_NETWORK_ETHERNET;
+    rc->phases.hold = e1000_reset_hold;
     set_bit(DEVICE_CATEGORY_NETWORK, dc->categories);
     dc->desc = "Intel Gigabit Ethernet";
-    dc->reset = qdev_e1000_reset;
     dc->vmsd = &vmstate_e1000;
     device_class_set_props(dc, e1000_properties);
 }
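The mac_low{4,11,13,16}_read() helpers removed above become redundant: masking now happens once, at write time, through setters generated by LOW_BITS_SET_FUNC(), so reads go through plain mac_readreg(). For reference, the expansion of one instance:

    /* LOW_BITS_SET_FUNC(11) expands to: */
    static void
    set_11bit(E1000State *s, int index, uint32_t val)
    {
        s->mac_reg[index] = val & (BIT(11) - 1);    /* keep the low 11 bits */
    }
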
diff --git a/hw/net/e1000_common.h b/hw/net/e1000_common.h
new file mode 100644
index 0000000000..48feda7404
--- /dev/null
+++ b/hw/net/e1000_common.h
@@ -0,0 +1,102 @@
+/*
+ * QEMU e1000(e) emulation - shared definitions
+ *
+ * Copyright (c) 2008 Qumranet
+ *
+ * Based on work done by:
+ * Nir Peleg, Tutis Systems Ltd. for Qumranet Inc.
+ * Copyright (c) 2007 Dan Aloni
+ * Copyright (c) 2004 Antony T Curtis
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef HW_NET_E1000_COMMON_H
+#define HW_NET_E1000_COMMON_H
+
+#include "e1000_regs.h"
+
+#define defreg(x)   x = (E1000_##x >> 2)
+enum {
+    defreg(CTRL),    defreg(EECD),    defreg(EERD),    defreg(GPRC),
+    defreg(GPTC),    defreg(ICR),     defreg(ICS),     defreg(IMC),
+    defreg(IMS),     defreg(LEDCTL),  defreg(MANC),    defreg(MDIC),
+    defreg(MPC),     defreg(PBA),     defreg(RCTL),    defreg(RDBAH0),
+    defreg(RDBAL0),  defreg(RDH0),    defreg(RDLEN0),  defreg(RDT0),
+    defreg(STATUS),  defreg(SWSM),    defreg(TCTL),    defreg(TDBAH),
+    defreg(TDBAL),   defreg(TDH),     defreg(TDLEN),   defreg(TDT),
+    defreg(TDLEN1),  defreg(TDBAL1),  defreg(TDBAH1),  defreg(TDH1),
+    defreg(TDT1),    defreg(TORH),    defreg(TORL),    defreg(TOTH),
+    defreg(TOTL),    defreg(TPR),     defreg(TPT),     defreg(TXDCTL),
+    defreg(WUFC),    defreg(RA),      defreg(MTA),     defreg(CRCERRS),
+    defreg(VFTA),    defreg(VET),     defreg(RDTR),    defreg(RADV),
+    defreg(TADV),    defreg(ITR),     defreg(SCC),     defreg(ECOL),
+    defreg(MCC),     defreg(LATECOL), defreg(COLC),    defreg(DC),
+    defreg(TNCRS),   defreg(SEQEC),   defreg(CEXTERR), defreg(RLEC),
+    defreg(XONRXC),  defreg(XONTXC),  defreg(XOFFRXC), defreg(XOFFTXC),
+    defreg(FCRUC),   defreg(AIT),     defreg(TDFH),    defreg(TDFT),
+    defreg(TDFHS),   defreg(TDFTS),   defreg(TDFPC),   defreg(WUC),
+    defreg(WUS),     defreg(POEMB),   defreg(PBS),     defreg(RDFH),
+    defreg(RDFT),    defreg(RDFHS),   defreg(RDFTS),   defreg(RDFPC),
+    defreg(PBM),     defreg(IPAV),    defreg(IP4AT),   defreg(IP6AT),
+    defreg(WUPM),    defreg(FFLT),    defreg(FFMT),    defreg(FFVT),
+    defreg(TARC0),   defreg(TARC1),   defreg(IAM),     defreg(EXTCNF_CTRL),
+    defreg(GCR),     defreg(TIMINCA), defreg(EIAC),    defreg(CTRL_EXT),
+    defreg(IVAR),    defreg(MFUTP01), defreg(MFUTP23), defreg(MANC2H),
+    defreg(MFVAL),   defreg(MDEF),    defreg(FACTPS),  defreg(FTFT),
+    defreg(RUC),     defreg(ROC),     defreg(RFC),     defreg(RJC),
+    defreg(PRC64),   defreg(PRC127),  defreg(PRC255),  defreg(PRC511),
+    defreg(PRC1023), defreg(PRC1522), defreg(PTC64),   defreg(PTC127),
+    defreg(PTC255),  defreg(PTC511),  defreg(PTC1023), defreg(PTC1522),
+    defreg(GORCL),   defreg(GORCH),   defreg(GOTCL),   defreg(GOTCH),
+    defreg(RNBC),    defreg(BPRC),    defreg(MPRC),    defreg(RFCTL),
+    defreg(PSRCTL),  defreg(MPTC),    defreg(BPTC),    defreg(TSCTFC),
+    defreg(IAC),     defreg(MGTPRC),  defreg(MGTPDC),  defreg(MGTPTC),
+    defreg(TSCTC),   defreg(RXCSUM),  defreg(FUNCTAG), defreg(GSCL_1),
+    defreg(GSCL_2),  defreg(GSCL_3),  defreg(GSCL_4),  defreg(GSCN_0),
+    defreg(GSCN_1),  defreg(GSCN_2),  defreg(GSCN_3),  defreg(GCR2),
+    defreg(RAID),    defreg(RSRPD),   defreg(TIDV),    defreg(EITR),
+    defreg(MRQC),    defreg(RETA),    defreg(RSSRK),   defreg(RDBAH1),
+    defreg(RDBAL1),  defreg(RDLEN1),  defreg(RDH1),    defreg(RDT1),
+    defreg(PBACLR),  defreg(FCAL),    defreg(FCAH),    defreg(FCT),
+    defreg(FCRTH),   defreg(FCRTL),   defreg(FCTTV),   defreg(FCRTV),
+    defreg(FLA),     defreg(EEWR),    defreg(FLOP),    defreg(FLOL),
+    defreg(FLSWCTL), defreg(FLSWCNT), defreg(RXDCTL),  defreg(RXDCTL1),
+    defreg(MAVTV0),  defreg(MAVTV1),  defreg(MAVTV2),  defreg(MAVTV3),
+    defreg(TXSTMPL), defreg(TXSTMPH), defreg(SYSTIML), defreg(SYSTIMH),
+    defreg(RXCFGL),  defreg(RXUDP),   defreg(TIMADJL), defreg(TIMADJH),
+    defreg(RXSTMPH), defreg(RXSTMPL), defreg(RXSATRL), defreg(RXSATRH),
+    defreg(FLASHT),  defreg(TIPG),    defreg(RDH),     defreg(RDT),
+    defreg(RDLEN),   defreg(RDBAH),   defreg(RDBAL),
+    defreg(TXDCTL1),
+    defreg(FLSWDATA),
+    defreg(CTRL_DUP),
+    defreg(EXTCNF_SIZE),
+    defreg(EEMNGCTL),
+    defreg(EEMNGDATA),
+    defreg(FLMNGCTL),
+    defreg(FLMNGDATA),
+    defreg(FLMNGCNT),
+    defreg(TSYNCRXCTL),
+    defreg(TSYNCTXCTL),
+
+    /* Aliases */
+    defreg(RDH0_A),  defreg(RDT0_A),  defreg(RDTR_A),  defreg(RDFH_A),
+    defreg(RDFT_A),  defreg(TDH_A),   defreg(TDT_A),   defreg(TIDV_A),
+    defreg(TDFH_A),  defreg(TDFT_A),  defreg(RA_A),    defreg(RDBAL0_A),
+    defreg(TDBAL_A), defreg(TDLEN_A), defreg(VFTA_A),  defreg(RDLEN0_A),
+    defreg(FCRTL_A), defreg(FCRTH_A)
+};
+
+#endif
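The defreg() trick above maps a register's byte offset within the BAR to an index into the device's array of 32-bit registers, so the same symbol serves both MMIO decoding and mac_reg[] accesses. A minimal illustration with two offsets defined in e1000x_regs.h:

    #define E1000_CTRL 0x00000      /* Device Control, byte offset */
    #define E1000_RCTL 0x00100      /* RX Control, byte offset     */
    #define defreg(x)  x = (E1000_##x >> 2)

    enum { defreg(CTRL), defreg(RCTL) };    /* CTRL == 0x00, RCTL == 0x40 */

    /* s->mac_reg[RCTL] thus backs the MMIO dword at guest offset 0x100 */
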
diff --git a/hw/net/e1000_regs.h b/hw/net/e1000_regs.h
index 59e050742b..8a4ce82034 100644
--- a/hw/net/e1000_regs.h
+++ b/hw/net/e1000_regs.h
@@ -32,157 +32,35 @@
 #ifndef HW_E1000_REGS_H
 #define HW_E1000_REGS_H
 
-/* PCI Device IDs */
-#define E1000_DEV_ID_82542               0x1000
-#define E1000_DEV_ID_82543GC_FIBER       0x1001
-#define E1000_DEV_ID_82543GC_COPPER      0x1004
-#define E1000_DEV_ID_82544EI_COPPER      0x1008
-#define E1000_DEV_ID_82544EI_FIBER       0x1009
-#define E1000_DEV_ID_82544GC_COPPER      0x100C
-#define E1000_DEV_ID_82544GC_LOM         0x100D
-#define E1000_DEV_ID_82540EM             0x100E
-#define E1000_DEV_ID_82540EM_LOM         0x1015
-#define E1000_DEV_ID_82540EP_LOM         0x1016
-#define E1000_DEV_ID_82540EP             0x1017
-#define E1000_DEV_ID_82540EP_LP          0x101E
-#define E1000_DEV_ID_82545EM_COPPER      0x100F
-#define E1000_DEV_ID_82545EM_FIBER       0x1011
-#define E1000_DEV_ID_82545GM_COPPER      0x1026
-#define E1000_DEV_ID_82545GM_FIBER       0x1027
-#define E1000_DEV_ID_82545GM_SERDES      0x1028
-#define E1000_DEV_ID_82546EB_COPPER      0x1010
-#define E1000_DEV_ID_82546EB_FIBER       0x1012
-#define E1000_DEV_ID_82546EB_QUAD_COPPER 0x101D
-#define E1000_DEV_ID_82541EI             0x1013
-#define E1000_DEV_ID_82541EI_MOBILE      0x1018
-#define E1000_DEV_ID_82541ER_LOM         0x1014
-#define E1000_DEV_ID_82541ER             0x1078
-#define E1000_DEV_ID_82547GI             0x1075
-#define E1000_DEV_ID_82541GI             0x1076
-#define E1000_DEV_ID_82541GI_MOBILE      0x1077
-#define E1000_DEV_ID_82541GI_LF          0x107C
-#define E1000_DEV_ID_82546GB_COPPER      0x1079
-#define E1000_DEV_ID_82546GB_FIBER       0x107A
-#define E1000_DEV_ID_82546GB_SERDES      0x107B
-#define E1000_DEV_ID_82546GB_PCIE        0x108A
-#define E1000_DEV_ID_82546GB_QUAD_COPPER 0x1099
-#define E1000_DEV_ID_82547EI             0x1019
-#define E1000_DEV_ID_82547EI_MOBILE      0x101A
-#define E1000_DEV_ID_82571EB_COPPER      0x105E
-#define E1000_DEV_ID_82571EB_FIBER       0x105F
-#define E1000_DEV_ID_82571EB_SERDES      0x1060
-#define E1000_DEV_ID_82571EB_QUAD_COPPER 0x10A4
-#define E1000_DEV_ID_82571PT_QUAD_COPPER 0x10D5
-#define E1000_DEV_ID_82571EB_QUAD_FIBER  0x10A5
-#define E1000_DEV_ID_82571EB_QUAD_COPPER_LOWPROFILE  0x10BC
-#define E1000_DEV_ID_82571EB_SERDES_DUAL 0x10D9
-#define E1000_DEV_ID_82571EB_SERDES_QUAD 0x10DA
-#define E1000_DEV_ID_82572EI_COPPER      0x107D
-#define E1000_DEV_ID_82572EI_FIBER       0x107E
-#define E1000_DEV_ID_82572EI_SERDES      0x107F
-#define E1000_DEV_ID_82572EI             0x10B9
-#define E1000_DEV_ID_82573E              0x108B
-#define E1000_DEV_ID_82573E_IAMT         0x108C
-#define E1000_DEV_ID_82573L              0x109A
-#define E1000_DEV_ID_82574L              0x10D3
-#define E1000_DEV_ID_82546GB_QUAD_COPPER_KSP3 0x10B5
-#define E1000_DEV_ID_80003ES2LAN_COPPER_DPT     0x1096
-#define E1000_DEV_ID_80003ES2LAN_SERDES_DPT     0x1098
-#define E1000_DEV_ID_80003ES2LAN_COPPER_SPT     0x10BA
-#define E1000_DEV_ID_80003ES2LAN_SERDES_SPT     0x10BB
+#include "e1000x_regs.h"
 
-#define E1000_DEV_ID_ICH8_IGP_M_AMT      0x1049
-#define E1000_DEV_ID_ICH8_IGP_AMT        0x104A
-#define E1000_DEV_ID_ICH8_IGP_C          0x104B
-#define E1000_DEV_ID_ICH8_IFE            0x104C
-#define E1000_DEV_ID_ICH8_IFE_GT         0x10C4
-#define E1000_DEV_ID_ICH8_IFE_G          0x10C5
-#define E1000_DEV_ID_ICH8_IGP_M          0x104D
-
-/* Device Specific Register Defaults */
-#define E1000_PHY_ID2_82541x 0x380
-#define E1000_PHY_ID2_82544x 0xC30
-#define E1000_PHY_ID2_8254xx_DEFAULT 0xC20 /* 82540x, 82545x, and 82546x */
-#define E1000_PHY_ID2_82573x 0xCC0
-#define E1000_PHY_ID2_82574x 0xCB1
-
-/* Register Set. (82543, 82544)
- *
- * Registers are defined to be 32 bits and  should be accessed as 32 bit values.
- * These registers are physically located on the NIC, but are mapped into the
- * host memory address space.
- *
- * RW - register is both readable and writable
- * RO - register is read only
- * WO - register is write only
- * R/clr - register is read only and is cleared when read
- * A - register array
- */
-#define E1000_CTRL     0x00000  /* Device Control - RW */
-#define E1000_CTRL_DUP 0x00004  /* Device Control Duplicate (Shadow) - RW */
-#define E1000_STATUS   0x00008  /* Device Status - RO */
-#define E1000_EECD     0x00010  /* EEPROM/Flash Control - RW */
-#define E1000_EERD     0x00014  /* EEPROM Read - RW */
-#define E1000_CTRL_EXT 0x00018  /* Extended Device Control - RW */
-#define E1000_FLA      0x0001C  /* Flash Access - RW */
-#define E1000_MDIC     0x00020  /* MDI Control - RW */
-#define E1000_SCTL     0x00024  /* SerDes Control - RW */
-#define E1000_FEXTNVM  0x00028  /* Future Extended NVM register */
-#define E1000_FCAL     0x00028  /* Flow Control Address Low - RW */
-#define E1000_FCAH     0x0002C  /* Flow Control Address High -RW */
-#define E1000_FCT      0x00030  /* Flow Control Type - RW */
-#define E1000_VET      0x00038  /* VLAN Ether Type - RW */
-#define E1000_ICR      0x000C0  /* Interrupt Cause Read - R/clr */
 #define E1000_ITR      0x000C4  /* Interrupt Throttling Rate - RW */
-#define E1000_ICS      0x000C8  /* Interrupt Cause Set - WO */
-#define E1000_IMS      0x000D0  /* Interrupt Mask Set - RW */
 #define E1000_EIAC     0x000DC  /* Ext. Interrupt Auto Clear - RW */
-#define E1000_IMC      0x000D8  /* Interrupt Mask Clear - WO */
-#define E1000_IAM      0x000E0  /* Interrupt Acknowledge Auto Mask */
 #define E1000_IVAR     0x000E4  /* Interrupt Vector Allocation Register - RW */
 #define E1000_EITR     0x000E8  /* Extended Interrupt Throttling Rate - RW */
-#define E1000_RCTL     0x00100  /* RX Control - RW */
-#define E1000_RDTR1    0x02820  /* RX Delay Timer (1) - RW */
 #define E1000_RDBAL1   0x02900  /* RX Descriptor Base Address Low (1) - RW */
 #define E1000_RDBAH1   0x02904  /* RX Descriptor Base Address High (1) - RW */
 #define E1000_RDLEN1   0x02908  /* RX Descriptor Length (1) - RW */
 #define E1000_RDH1     0x02910  /* RX Descriptor Head (1) - RW */
 #define E1000_RDT1     0x02918  /* RX Descriptor Tail (1) - RW */
-#define E1000_FCTTV    0x00170  /* Flow Control Transmit Timer Value - RW */
 #define E1000_FCRTV    0x05F40  /* Flow Control Refresh Timer Value - RW */
 #define E1000_TXCW     0x00178  /* TX Configuration Word - RW */
 #define E1000_RXCW     0x00180  /* RX Configuration Word - RO */
-#define E1000_TCTL     0x00400  /* TX Control - RW */
-#define E1000_TCTL_EXT 0x00404  /* Extended TX Control - RW */
-#define E1000_TIPG     0x00410  /* TX Inter-packet gap -RW */
 #define E1000_TBT      0x00448  /* TX Burst Timer - RW */
 #define E1000_AIT      0x00458  /* Adaptive Interframe Spacing Throttle - RW */
-#define E1000_LEDCTL   0x00E00  /* LED Control - RW */
 #define E1000_EXTCNF_CTRL  0x00F00  /* Extended Configuration Control */
 #define E1000_EXTCNF_SIZE  0x00F08  /* Extended Configuration Size */
 #define E1000_PHY_CTRL     0x00F10  /* PHY Control Register in CSR */
-#define FEXTNVM_SW_CONFIG  0x0001
 #define E1000_PBA      0x01000  /* Packet Buffer Allocation - RW */
 #define E1000_PBM      0x10000  /* Packet Buffer Memory - RW */
 #define E1000_PBS      0x01008  /* Packet Buffer Size - RW */
-#define E1000_EEMNGCTL 0x01010  /* MNG EEprom Control */
-#define E1000_EEMNGDATA    0x01014 /* MNG EEPROM Read/Write data */
-#define E1000_FLMNGCTL     0x01018 /* MNG Flash Control */
-#define E1000_FLMNGDATA    0x0101C /* MNG FLASH Read data */
-#define E1000_FLMNGCNT     0x01020 /* MNG FLASH Read Counter */
-#define E1000_FLASH_UPDATES 1000
-#define E1000_EEARBC   0x01024  /* EEPROM Auto Read Bus Control */
 #define E1000_FLASHT   0x01028  /* FLASH Timer Register */
 #define E1000_EEWR     0x0102C  /* EEPROM Write Register - RW */
 #define E1000_FLSWCTL  0x01030  /* FLASH control register */
 #define E1000_FLSWDATA 0x01034  /* FLASH data register */
 #define E1000_FLSWCNT  0x01038  /* FLASH Access Counter */
-#define E1000_FLOP     0x0103C  /* FLASH Opcode Register */
 #define E1000_FLOL     0x01050  /* FEEP Auto Load */
 #define E1000_ERT      0x02008  /* Early Rx Threshold - RW */
-#define E1000_FCRTL    0x02160  /* Flow Control Receive Threshold Low - RW */
-#define E1000_FCRTL_A  0x00168  /* Alias to FCRTL */
-#define E1000_FCRTH    0x02168  /* Flow Control Receive Threshold High - RW */
 #define E1000_FCRTH_A  0x00160  /* Alias to FCRTH */
 #define E1000_PSRCTL   0x02170  /* Packet Split Receive Control - RW */
 #define E1000_RDBAL    0x02800  /* RX Descriptor Base Address Low - RW */
@@ -208,23 +86,7 @@
 #define E1000_RADV     0x0282C  /* RX Interrupt Absolute Delay Timer - RW */
 #define E1000_RSRPD    0x02C00  /* RX Small Packet Detect - RW */
 #define E1000_RAID     0x02C08  /* Receive Ack Interrupt Delay - RW */
-#define E1000_TXDMAC   0x03000  /* TX DMA Control - RW */
-#define E1000_KABGTXD  0x03004  /* AFE Band Gap Transmit Ref Data */
 #define E1000_POEMB    0x00F10  /* PHY OEM Bits Register - RW */
-#define E1000_RDFH     0x02410  /* Receive Data FIFO Head Register - RW */
-#define E1000_RDFH_A   0x08000  /* Alias to RDFH */
-#define E1000_RDFT     0x02418  /* Receive Data FIFO Tail Register - RW */
-#define E1000_RDFT_A   0x08008  /* Alias to RDFT */
-#define E1000_RDFHS    0x02420  /* Receive Data FIFO Head Saved Register - RW */
-#define E1000_RDFTS    0x02428  /* Receive Data FIFO Tail Saved Register - RW */
-#define E1000_RDFPC    0x02430  /* Receive Data FIFO Packet Count - RW */
-#define E1000_TDFH     0x03410  /* TX Data FIFO Head - RW */
-#define E1000_TDFH_A   0x08010  /* Alias to TDFH */
-#define E1000_TDFT     0x03418  /* TX Data FIFO Tail - RW */
-#define E1000_TDFT_A   0x08018  /* Alias to TDFT */
-#define E1000_TDFHS    0x03420  /* TX Data FIFO Head Saved - RW */
-#define E1000_TDFTS    0x03428  /* TX Data FIFO Tail Saved - RW */
-#define E1000_TDFPC    0x03430  /* TX Data FIFO Packet Count - RW */
 #define E1000_TDBAL    0x03800  /* TX Descriptor Base Address Low - RW */
 #define E1000_TDBAL_A  0x00420  /* Alias to TDBAL */
 #define E1000_TDBAH    0x03804  /* TX Descriptor Base Address High - RW */
@@ -248,174 +110,40 @@
 #define E1000_TDT1     0x03918  /* TX Desc Tail (1) - RW */
 #define E1000_TXDCTL1  0x03928  /* TX Descriptor Control (1) - RW */
 #define E1000_TARC1    0x03940  /* TX Arbitration Count (1) */
-#define E1000_CRCERRS  0x04000  /* CRC Error Count - R/clr */
-#define E1000_ALGNERRC 0x04004  /* Alignment Error Count - R/clr */
-#define E1000_SYMERRS  0x04008  /* Symbol Error Count - R/clr */
-#define E1000_RXERRC   0x0400C  /* Receive Error Count - R/clr */
-#define E1000_MPC      0x04010  /* Missed Packet Count - R/clr */
-#define E1000_SCC      0x04014  /* Single Collision Count - R/clr */
-#define E1000_ECOL     0x04018  /* Excessive Collision Count - R/clr */
-#define E1000_MCC      0x0401C  /* Multiple Collision Count - R/clr */
-#define E1000_LATECOL  0x04020  /* Late Collision Count - R/clr */
-#define E1000_COLC     0x04028  /* Collision Count - R/clr */
-#define E1000_DC       0x04030  /* Defer Count - R/clr */
-#define E1000_TNCRS    0x04034  /* TX-No CRS - R/clr */
 #define E1000_SEQEC    0x04038  /* Sequence Error Count - R/clr */
 #define E1000_CEXTERR  0x0403C  /* Carrier Extension Error Count - R/clr */
-#define E1000_RLEC     0x04040  /* Receive Length Error Count - R/clr */
-#define E1000_XONRXC   0x04048  /* XON RX Count - R/clr */
-#define E1000_XONTXC   0x0404C  /* XON TX Count - R/clr */
-#define E1000_XOFFRXC  0x04050  /* XOFF RX Count - R/clr */
-#define E1000_XOFFTXC  0x04054  /* XOFF TX Count - R/clr */
-#define E1000_FCRUC    0x04058  /* Flow Control RX Unsupported Count- R/clr */
-#define E1000_PRC64    0x0405C  /* Packets RX (64 bytes) - R/clr */
-#define E1000_PRC127   0x04060  /* Packets RX (65-127 bytes) - R/clr */
-#define E1000_PRC255   0x04064  /* Packets RX (128-255 bytes) - R/clr */
-#define E1000_PRC511   0x04068  /* Packets RX (255-511 bytes) - R/clr */
-#define E1000_PRC1023  0x0406C  /* Packets RX (512-1023 bytes) - R/clr */
-#define E1000_PRC1522  0x04070  /* Packets RX (1024-1522 bytes) - R/clr */
-#define E1000_GPRC     0x04074  /* Good Packets RX Count - R/clr */
-#define E1000_BPRC     0x04078  /* Broadcast Packets RX Count - R/clr */
-#define E1000_MPRC     0x0407C  /* Multicast Packets RX Count - R/clr */
-#define E1000_GPTC     0x04080  /* Good Packets TX Count - R/clr */
-#define E1000_GORCL    0x04088  /* Good Octets RX Count Low - R/clr */
-#define E1000_GORCH    0x0408C  /* Good Octets RX Count High - R/clr */
-#define E1000_GOTCL    0x04090  /* Good Octets TX Count Low - R/clr */
-#define E1000_GOTCH    0x04094  /* Good Octets TX Count High - R/clr */
-#define E1000_RNBC     0x040A0  /* RX No Buffers Count - R/clr */
-#define E1000_RUC      0x040A4  /* RX Undersize Count - R/clr */
-#define E1000_RFC      0x040A8  /* RX Fragment Count - R/clr */
-#define E1000_ROC      0x040AC  /* RX Oversize Count - R/clr */
-#define E1000_RJC      0x040B0  /* RX Jabber Count - R/clr */
-#define E1000_MGTPRC   0x040B4  /* Management Packets RX Count - R/clr */
-#define E1000_MGTPDC   0x040B8  /* Management Packets Dropped Count - R/clr */
-#define E1000_MGTPTC   0x040BC  /* Management Packets TX Count - R/clr */
-#define E1000_TORL     0x040C0  /* Total Octets RX Low - R/clr */
-#define E1000_TORH     0x040C4  /* Total Octets RX High - R/clr */
-#define E1000_TOTL     0x040C8  /* Total Octets TX Low - R/clr */
-#define E1000_TOTH     0x040CC  /* Total Octets TX High - R/clr */
-#define E1000_TPR      0x040D0  /* Total Packets RX - R/clr */
-#define E1000_TPT      0x040D4  /* Total Packets TX - R/clr */
-#define E1000_PTC64    0x040D8  /* Packets TX (64 bytes) - R/clr */
-#define E1000_PTC127   0x040DC  /* Packets TX (65-127 bytes) - R/clr */
-#define E1000_PTC255   0x040E0  /* Packets TX (128-255 bytes) - R/clr */
-#define E1000_PTC511   0x040E4  /* Packets TX (256-511 bytes) - R/clr */
-#define E1000_PTC1023  0x040E8  /* Packets TX (512-1023 bytes) - R/clr */
-#define E1000_PTC1522  0x040EC  /* Packets TX (1024-1522 Bytes) - R/clr */
-#define E1000_MPTC     0x040F0  /* Multicast Packets TX Count - R/clr */
-#define E1000_BPTC     0x040F4  /* Broadcast Packets TX Count - R/clr */
-#define E1000_TSCTC    0x040F8  /* TCP Segmentation Context TX - R/clr */
 #define E1000_TSCTFC   0x040FC  /* TCP Segmentation Context TX Fail - R/clr */
-#define E1000_IAC      0x04100  /* Interrupt Assertion Count */
-#define E1000_ICRXPTC  0x04104  /* Interrupt Cause Rx Packet Timer Expire Count */
 #define E1000_ICRXATC  0x04108  /* Interrupt Cause Rx Absolute Timer Expire Count */
 #define E1000_ICTXPTC  0x0410C  /* Interrupt Cause Tx Packet Timer Expire Count */
 #define E1000_ICTXATC  0x04110  /* Interrupt Cause Tx Absolute Timer Expire Count */
 #define E1000_ICTXQEC  0x04118  /* Interrupt Cause Tx Queue Empty Count */
 #define E1000_ICTXQMTC 0x0411C  /* Interrupt Cause Tx Queue Minimum Threshold Count */
-#define E1000_ICRXDMTC 0x04120  /* Interrupt Cause Rx Descriptor Minimum Threshold Count */
 #define E1000_ICRXOC   0x04124  /* Interrupt Cause Receiver Overrun Count */
-#define E1000_RXCSUM   0x05000  /* RX Checksum Control - RW */
-#define E1000_RFCTL    0x05008  /* Receive Filter Control*/
-#define E1000_MAVTV0   0x05010  /* Management VLAN TAG Value 0 */
-#define E1000_MAVTV1   0x05014  /* Management VLAN TAG Value 1 */
-#define E1000_MAVTV2   0x05018  /* Management VLAN TAG Value 2 */
-#define E1000_MAVTV3   0x0501c  /* Management VLAN TAG Value 3 */
-#define E1000_MTA      0x05200  /* Multicast Table Array - RW Array */
-#define E1000_RA       0x05400  /* Receive Address - RW Array */
-#define E1000_RA_A     0x00040  /* Alias to RA */
-#define E1000_VFTA     0x05600  /* VLAN Filter Table Array - RW Array */
-#define E1000_VFTA_A   0x00600  /* Alias to VFTA */
-#define E1000_WUC      0x05800  /* Wakeup Control - RW */
-#define E1000_WUFC     0x05808  /* Wakeup Filter Control - RW */
-#define E1000_WUS      0x05810  /* Wakeup Status - RO */
-#define E1000_MANC     0x05820  /* Management Control - RW */
-#define E1000_IPAV     0x05838  /* IP Address Valid - RW */
-#define E1000_IP4AT    0x05840  /* IPv4 Address Table - RW Array */
-#define E1000_IP6AT    0x05880  /* IPv6 Address Table - RW Array */
-#define E1000_WUPL     0x05900  /* Wakeup Packet Length - RW */
-#define E1000_WUPM     0x05A00  /* Wakeup Packet Memory - RO A */
 #define E1000_MFUTP01  0x05828  /* Management Flex UDP/TCP Ports 0/1 - RW */
 #define E1000_MFUTP23  0x05830  /* Management Flex UDP/TCP Ports 2/3 - RW */
-#define E1000_MFVAL    0x05824  /* Manageability Filters Valid - RW */
-#define E1000_MDEF     0x05890  /* Manageability Decision Filters - RW Array */
 #define E1000_FFLT     0x05F00  /* Flexible Filter Length Table - RW Array */
 #define E1000_HOST_IF  0x08800  /* Host Interface */
-#define E1000_FFMT     0x09000  /* Flexible Filter Mask Table - RW Array */
-#define E1000_FTFT     0x09400  /* Flexible TCO Filter Table - RW Array */
 #define E1000_FFVT     0x09800  /* Flexible Filter Value Table - RW Array */
 
 #define E1000_KUMCTRLSTA 0x00034 /* MAC-PHY interface - RW */
 #define E1000_MDPHYA     0x0003C /* PHY address - RW */
-#define E1000_MANC2H     0x05860 /* Management Control To Host - RW */
-#define E1000_SW_FW_SYNC 0x05B5C /* Software-Firmware Synchronization - RW */
 
-#define E1000_GCR       0x05B00 /* PCI-Ex Control */
-#define E1000_FUNCTAG   0x05B08 /* Function-Tag Register */
-#define E1000_GSCL_1    0x05B10 /* PCI-Ex Statistic Control #1 */
-#define E1000_GSCL_2    0x05B14 /* PCI-Ex Statistic Control #2 */
-#define E1000_GSCL_3    0x05B18 /* PCI-Ex Statistic Control #3 */
-#define E1000_GSCL_4    0x05B1C /* PCI-Ex Statistic Control #4 */
-#define E1000_GSCN_0    0x05B20 /* 3GIO Statistic Counter Register #0 */
-#define E1000_GSCN_1    0x05B24 /* 3GIO Statistic Counter Register #1 */
-#define E1000_GSCN_2    0x05B28 /* 3GIO Statistic Counter Register #2 */
-#define E1000_GSCN_3    0x05B2C /* 3GIO Statistic Counter Register #3 */
-#define E1000_FACTPS    0x05B30 /* Function Active and Power State to MNG */
-#define E1000_SWSM      0x05B50 /* SW Semaphore */
 #define E1000_GCR2      0x05B64 /* 3GIO Control Register 2 */
-#define E1000_FWSM      0x05B54 /* FW Semaphore */
-#define E1000_PBACLR    0x05B68 /* MSI-X PBA Clear */
 #define E1000_FFLT_DBG  0x05F04 /* Debug Register */
 #define E1000_HICR      0x08F00 /* Host Interface Control */
 
-#define E1000_TSYNCRXCTL 0x0B620 /* Rx Time Sync Control register - RW */
-#define E1000_TSYNCTXCTL 0x0B614 /* Tx Time Sync Control register - RW */
-#define E1000_TIMINCA    0x0B608 /* Increment attributes register - RW */
-#define E1000_RXSTMPL    0x0B624 /* Rx timestamp Low - RO */
-#define E1000_RXSTMPH    0x0B628 /* Rx timestamp High - RO */
-#define E1000_TXSTMPL    0x0B618 /* Tx timestamp value Low - RO */
-#define E1000_TXSTMPH    0x0B61C /* Tx timestamp value High - RO */
-#define E1000_SYSTIML    0x0B600 /* System time register Low - RO */
-#define E1000_SYSTIMH    0x0B604 /* System time register High - RO */
-#define E1000_TIMINCA    0x0B608 /* Increment attributes register - RW */
 #define E1000_RXMTRL     0x0B634 /* Time sync Rx EtherType and Msg Type - RW */
 #define E1000_RXUDP      0x0B638 /* Time Sync Rx UDP Port - RW */
-#define E1000_RXSATRL    0x0B62C /* Rx timestamp attribute low - RO */
-#define E1000_RXSATRH    0x0B630 /* Rx timestamp attribute high - RO */
-#define E1000_TIMADJL    0x0B60C /* Time Adjustment Offset register Low - RW */
-#define E1000_TIMADJH    0x0B610 /* Time Adjustment Offset register High - RW */
 #define E1000_RXCFGL     0x0B634 /* RX Ethertype and Message Type - RW */
 
-/* RSS registers */
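+/* MRQC[1:0] == 01b means multiple receive queues (RSS) are enabled */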
+#define E1000_MRQC_ENABLED(mrqc) (((mrqc) & (BIT(0) | BIT(1))) == BIT(0))
+
 #define E1000_CPUVEC    0x02C10 /* CPU Vector Register - RW */
-#define E1000_MRQC      0x05818 /* Multiple Receive Control - RW */
-#define E1000_RETA      0x05C00 /* Redirection Table - RW Array */
-#define E1000_RSSRK     0x05C80 /* RSS Random Key - RW Array */
 #define E1000_RSSIM     0x05864 /* RSS Interrupt Mask */
 #define E1000_RSSIR     0x05868 /* RSS Interrupt Request */
 
-#define E1000_MRQC_ENABLED(mrqc) (((mrqc) & (BIT(0) | BIT(1))) == BIT(0))
-
-#define E1000_RETA_IDX(hash)        ((hash) & (BIT(7) - 1))
-#define E1000_RETA_VAL(reta, hash)  (((uint8_t *)(reta))[E1000_RETA_IDX(hash)])
 #define E1000_RSS_QUEUE(reta, hash) ((E1000_RETA_VAL(reta, hash) & BIT(7)) >> 7)
 
-#define E1000_MRQC_EN_TCPIPV4(mrqc) ((mrqc) & BIT(16))
-#define E1000_MRQC_EN_IPV4(mrqc)    ((mrqc) & BIT(17))
-#define E1000_MRQC_EN_TCPIPV6(mrqc) ((mrqc) & BIT(18))
-#define E1000_MRQC_EN_IPV6EX(mrqc)  ((mrqc) & BIT(19))
-#define E1000_MRQC_EN_IPV6(mrqc)    ((mrqc) & BIT(20))
-
-#define E1000_MRQ_RSS_TYPE_NONE     (0)
-#define E1000_MRQ_RSS_TYPE_IPV4TCP  (1)
-#define E1000_MRQ_RSS_TYPE_IPV4     (2)
-#define E1000_MRQ_RSS_TYPE_IPV6TCP  (3)
-#define E1000_MRQ_RSS_TYPE_IPV6EX   (4)
-#define E1000_MRQ_RSS_TYPE_IPV6     (5)
-
-#define E1000_ICR_ASSERTED BIT(31)
-#define E1000_EIAC_MASK    0x01F00000
-
 /* [TR]DBAL and [TR]DLEN masks */
 #define E1000_XDBAL_MASK            (~(BIT(4) - 1))
 #define E1000_XDLEN_MASK            ((BIT(20) - 1) & (~(BIT(7) - 1)))
@@ -444,18 +172,8 @@
 
 #define E1000_IVAR_TX_INT_EVERY_WB  BIT(31)
 
-/* RFCTL register bits */
-#define E1000_RFCTL_ISCSI_DIS           0x00000001
-#define E1000_RFCTL_NFSW_DIS            0x00000040
-#define E1000_RFCTL_NFSR_DIS            0x00000080
-#define E1000_RFCTL_IPV6_DIS            0x00000400
-#define E1000_RFCTL_IPV6_XSUM_DIS       0x00000800
 #define E1000_RFCTL_ACK_DIS             0x00001000
 #define E1000_RFCTL_ACK_DATA_DIS        0x00002000
-#define E1000_RFCTL_IPFRSP_DIS          0x00004000
-#define E1000_RFCTL_EXTEN               0x00008000
-#define E1000_RFCTL_IPV6_EX_DIS         0x00010000
-#define E1000_RFCTL_NEW_IPV6_EXT_DIS    0x00020000
 
 /* PSRCTL parsing */
 #define E1000_PSRCTL_BSIZE0_MASK   0x0000007F
@@ -470,24 +188,7 @@
 
 #define E1000_PSRCTL_BUFFS_PER_DESC 4
 
-/* TARC* parsing */
-#define E1000_TARC_ENABLE BIT(10)
-
 /* PHY 1000 MII Register/Bit Definitions */
-/* PHY Registers defined by IEEE */
-#define PHY_CTRL         0x00 /* Control Register */
-#define PHY_STATUS       0x01 /* Status Regiser */
-#define PHY_ID1          0x02 /* Phy Id Reg (word 1) */
-#define PHY_ID2          0x03 /* Phy Id Reg (word 2) */
-#define PHY_AUTONEG_ADV  0x04 /* Autoneg Advertisement */
-#define PHY_LP_ABILITY   0x05 /* Link Partner Ability (Base Page) */
-#define PHY_AUTONEG_EXP  0x06 /* Autoneg Expansion Reg */
-#define PHY_NEXT_PAGE_TX 0x07 /* Next Page TX */
-#define PHY_LP_NEXT_PAGE 0x08 /* Link Partner Next Page */
-#define PHY_1000T_CTRL   0x09 /* 1000Base-T Control Reg */
-#define PHY_1000T_STATUS 0x0A /* 1000Base-T Status Reg */
-#define PHY_EXT_STATUS   0x0F /* Extended Status Reg */
-
 /* 82574-specific registers */
 #define PHY_COPPER_CTRL1      0x10 /* Copper Specific Control Register 1 */
 #define PHY_COPPER_STAT1      0x11 /* Copper Specific Status Register 1 */
@@ -539,287 +240,6 @@
 #define M88E1000_PHY_VCO_REG_BIT8  0x100 /* Bits 8 & 11 are adjusted for */
 #define M88E1000_PHY_VCO_REG_BIT11 0x800    /* improved BER performance */
 
-/* PHY Control Register */
-#define MII_CR_SPEED_SELECT_MSB 0x0040 /* bits 6,13: 10=1000, 01=100, 00=10 */
-#define MII_CR_COLL_TEST_ENABLE 0x0080 /* Collision test enable */
-#define MII_CR_FULL_DUPLEX      0x0100 /* FDX =1, half duplex =0 */
-#define MII_CR_RESTART_AUTO_NEG 0x0200 /* Restart auto negotiation */
-#define MII_CR_ISOLATE          0x0400 /* Isolate PHY from MII */
-#define MII_CR_POWER_DOWN       0x0800 /* Power down */
-#define MII_CR_AUTO_NEG_EN      0x1000 /* Auto Neg Enable */
-#define MII_CR_SPEED_SELECT_LSB 0x2000 /* bits 6,13: 10=1000, 01=100, 00=10 */
-#define MII_CR_LOOPBACK         0x4000 /* 0 = normal, 1 = loopback */
-#define MII_CR_RESET            0x8000 /* 0 = normal, 1 = PHY reset */
-
-/* PHY Status Register */
-#define MII_SR_EXTENDED_CAPS     0x0001    /* Extended register capabilities */
-#define MII_SR_JABBER_DETECT     0x0002    /* Jabber Detected */
-#define MII_SR_LINK_STATUS       0x0004    /* Link Status 1 = link */
-#define MII_SR_AUTONEG_CAPS      0x0008    /* Auto Neg Capable */
-#define MII_SR_REMOTE_FAULT      0x0010    /* Remote Fault Detect */
-#define MII_SR_AUTONEG_COMPLETE  0x0020    /* Auto Neg Complete */
-#define MII_SR_PREAMBLE_SUPPRESS 0x0040    /* Preamble may be suppressed */
-#define MII_SR_EXTENDED_STATUS   0x0100    /* Ext. status info in Reg 0x0F */
-#define MII_SR_100T2_HD_CAPS     0x0200    /* 100T2 Half Duplex Capable */
-#define MII_SR_100T2_FD_CAPS     0x0400    /* 100T2 Full Duplex Capable */
-#define MII_SR_10T_HD_CAPS       0x0800    /* 10T   Half Duplex Capable */
-#define MII_SR_10T_FD_CAPS       0x1000    /* 10T   Full Duplex Capable */
-#define MII_SR_100X_HD_CAPS      0x2000    /* 100X  Half Duplex Capable */
-#define MII_SR_100X_FD_CAPS      0x4000    /* 100X  Full Duplex Capable */
-#define MII_SR_100T4_CAPS        0x8000    /* 100T4 Capable */
-
-/* PHY Link Partner Ability Register */
-#define MII_LPAR_LPACK           0x4000 /* Acked by link partner */
-
-/* Interrupt Cause Read */
-#define E1000_ICR_TXDW          0x00000001 /* Transmit desc written back */
-#define E1000_ICR_TXQE          0x00000002 /* Transmit Queue empty */
-#define E1000_ICR_LSC           0x00000004 /* Link Status Change */
-#define E1000_ICR_RXSEQ         0x00000008 /* rx sequence error */
-#define E1000_ICR_RXDMT0        0x00000010 /* rx desc min. threshold (0) */
-#define E1000_ICR_RXO           0x00000040 /* rx overrun */
-#define E1000_ICR_RXT0          0x00000080 /* rx timer intr (ring 0) */
-#define E1000_ICR_MDAC          0x00000200 /* MDIO access complete */
-#define E1000_ICR_RXCFG         0x00000400 /* RX /c/ ordered set */
-#define E1000_ICR_GPI_EN0       0x00000800 /* GP Int 0 */
-#define E1000_ICR_GPI_EN1       0x00001000 /* GP Int 1 */
-#define E1000_ICR_GPI_EN2       0x00002000 /* GP Int 2 */
-#define E1000_ICR_GPI_EN3       0x00004000 /* GP Int 3 */
-#define E1000_ICR_TXD_LOW       0x00008000
-#define E1000_ICR_SRPD          0x00010000
-#define E1000_ICR_ACK           0x00020000 /* Receive Ack frame */
-#define E1000_ICR_MNG           0x00040000 /* Manageability event */
-#define E1000_ICR_DOCK          0x00080000 /* Dock/Undock */
-#define E1000_ICR_INT_ASSERTED  0x80000000 /* If this bit asserted, the driver should claim the interrupt */
-#define E1000_ICR_RXD_FIFO_PAR0 0x00100000 /* queue 0 Rx descriptor FIFO parity error */
-#define E1000_ICR_TXD_FIFO_PAR0 0x00200000 /* queue 0 Tx descriptor FIFO parity error */
-#define E1000_ICR_HOST_ARB_PAR  0x00400000 /* host arb read buffer parity error */
-#define E1000_ICR_PB_PAR        0x00800000 /* packet buffer parity error */
-#define E1000_ICR_RXD_FIFO_PAR1 0x01000000 /* queue 1 Rx descriptor FIFO parity error */
-#define E1000_ICR_TXD_FIFO_PAR1 0x02000000 /* queue 1 Tx descriptor FIFO parity error */
-#define E1000_ICR_ALL_PARITY    0x03F00000 /* all parity error bits */
-#define E1000_ICR_DSW           0x00000020 /* FW changed the status of DISSW bit in the FWSM */
-#define E1000_ICR_PHYINT        0x00001000 /* LAN connected device generates an interrupt */
-#define E1000_ICR_EPRST         0x00100000 /* ME handware reset occurs */
-#define E1000_ICR_RXQ0          0x00100000 /* Rx Queue 0 Interrupt */
-#define E1000_ICR_RXQ1          0x00200000 /* Rx Queue 1 Interrupt */
-#define E1000_ICR_TXQ0          0x00400000 /* Tx Queue 0 Interrupt */
-#define E1000_ICR_TXQ1          0x00800000 /* Tx Queue 1 Interrupt */
-#define E1000_ICR_OTHER         0x01000000 /* Other Interrupts */
-
-#define E1000_ICR_OTHER_CAUSES (E1000_ICR_LSC  | \
-                                E1000_ICR_RXO  | \
-                                E1000_ICR_MDAC | \
-                                E1000_ICR_SRPD | \
-                                E1000_ICR_ACK  | \
-                                E1000_ICR_MNG)
-
-/* Interrupt Cause Set */
-#define E1000_ICS_TXDW      E1000_ICR_TXDW      /* Transmit desc written back */
-#define E1000_ICS_TXQE      E1000_ICR_TXQE      /* Transmit Queue empty */
-#define E1000_ICS_LSC       E1000_ICR_LSC       /* Link Status Change */
-#define E1000_ICS_RXSEQ     E1000_ICR_RXSEQ     /* rx sequence error */
-#define E1000_ICS_RXDMT0    E1000_ICR_RXDMT0    /* rx desc min. threshold */
-#define E1000_ICS_RXO       E1000_ICR_RXO       /* rx overrun */
-#define E1000_ICS_RXT0      E1000_ICR_RXT0      /* rx timer intr */
-#define E1000_ICS_MDAC      E1000_ICR_MDAC      /* MDIO access complete */
-#define E1000_ICS_RXCFG     E1000_ICR_RXCFG     /* RX /c/ ordered set */
-#define E1000_ICS_GPI_EN0   E1000_ICR_GPI_EN0   /* GP Int 0 */
-#define E1000_ICS_GPI_EN1   E1000_ICR_GPI_EN1   /* GP Int 1 */
-#define E1000_ICS_GPI_EN2   E1000_ICR_GPI_EN2   /* GP Int 2 */
-#define E1000_ICS_GPI_EN3   E1000_ICR_GPI_EN3   /* GP Int 3 */
-#define E1000_ICS_TXD_LOW   E1000_ICR_TXD_LOW
-#define E1000_ICS_SRPD      E1000_ICR_SRPD
-#define E1000_ICS_ACK       E1000_ICR_ACK       /* Receive Ack frame */
-#define E1000_ICS_MNG       E1000_ICR_MNG       /* Manageability event */
-#define E1000_ICS_DOCK      E1000_ICR_DOCK      /* Dock/Undock */
-#define E1000_ICS_RXD_FIFO_PAR0 E1000_ICR_RXD_FIFO_PAR0 /* queue 0 Rx descriptor FIFO parity error */
-#define E1000_ICS_TXD_FIFO_PAR0 E1000_ICR_TXD_FIFO_PAR0 /* queue 0 Tx descriptor FIFO parity error */
-#define E1000_ICS_HOST_ARB_PAR  E1000_ICR_HOST_ARB_PAR  /* host arb read buffer parity error */
-#define E1000_ICS_PB_PAR        E1000_ICR_PB_PAR        /* packet buffer parity error */
-#define E1000_ICS_RXD_FIFO_PAR1 E1000_ICR_RXD_FIFO_PAR1 /* queue 1 Rx descriptor FIFO parity error */
-#define E1000_ICS_TXD_FIFO_PAR1 E1000_ICR_TXD_FIFO_PAR1 /* queue 1 Tx descriptor FIFO parity error */
-#define E1000_ICS_DSW       E1000_ICR_DSW
-#define E1000_ICS_PHYINT    E1000_ICR_PHYINT
-#define E1000_ICS_EPRST     E1000_ICR_EPRST
-
-/* Interrupt Mask Set */
-#define E1000_IMS_TXDW      E1000_ICR_TXDW      /* Transmit desc written back */
-#define E1000_IMS_TXQE      E1000_ICR_TXQE      /* Transmit Queue empty */
-#define E1000_IMS_LSC       E1000_ICR_LSC       /* Link Status Change */
-#define E1000_IMS_RXSEQ     E1000_ICR_RXSEQ     /* rx sequence error */
-#define E1000_IMS_RXDMT0    E1000_ICR_RXDMT0    /* rx desc min. threshold */
-#define E1000_IMS_RXO       E1000_ICR_RXO       /* rx overrun */
-#define E1000_IMS_RXT0      E1000_ICR_RXT0      /* rx timer intr */
-#define E1000_IMS_MDAC      E1000_ICR_MDAC      /* MDIO access complete */
-#define E1000_IMS_RXCFG     E1000_ICR_RXCFG     /* RX /c/ ordered set */
-#define E1000_IMS_GPI_EN0   E1000_ICR_GPI_EN0   /* GP Int 0 */
-#define E1000_IMS_GPI_EN1   E1000_ICR_GPI_EN1   /* GP Int 1 */
-#define E1000_IMS_GPI_EN2   E1000_ICR_GPI_EN2   /* GP Int 2 */
-#define E1000_IMS_GPI_EN3   E1000_ICR_GPI_EN3   /* GP Int 3 */
-#define E1000_IMS_TXD_LOW   E1000_ICR_TXD_LOW
-#define E1000_IMS_SRPD      E1000_ICR_SRPD
-#define E1000_IMS_ACK       E1000_ICR_ACK       /* Receive Ack frame */
-#define E1000_IMS_MNG       E1000_ICR_MNG       /* Manageability event */
-#define E1000_IMS_RXQ0      E1000_ICR_RXQ0
-#define E1000_IMS_RXQ1      E1000_ICR_RXQ1
-#define E1000_IMS_TXQ0      E1000_ICR_TXQ0
-#define E1000_IMS_TXQ1      E1000_ICR_TXQ1
-#define E1000_IMS_OTHER     E1000_ICR_OTHER
-#define E1000_IMS_DOCK      E1000_ICR_DOCK      /* Dock/Undock */
-#define E1000_IMS_RXD_FIFO_PAR0 E1000_ICR_RXD_FIFO_PAR0 /* queue 0 Rx descriptor FIFO parity error */
-#define E1000_IMS_TXD_FIFO_PAR0 E1000_ICR_TXD_FIFO_PAR0 /* queue 0 Tx descriptor FIFO parity error */
-#define E1000_IMS_HOST_ARB_PAR  E1000_ICR_HOST_ARB_PAR  /* host arb read buffer parity error */
-#define E1000_IMS_PB_PAR        E1000_ICR_PB_PAR        /* packet buffer parity error */
-#define E1000_IMS_RXD_FIFO_PAR1 E1000_ICR_RXD_FIFO_PAR1 /* queue 1 Rx descriptor FIFO parity error */
-#define E1000_IMS_TXD_FIFO_PAR1 E1000_ICR_TXD_FIFO_PAR1 /* queue 1 Tx descriptor FIFO parity error */
-#define E1000_IMS_DSW       E1000_ICR_DSW
-#define E1000_IMS_PHYINT    E1000_ICR_PHYINT
-#define E1000_IMS_EPRST     E1000_ICR_EPRST
-
-/* Interrupt Mask Clear */
-#define E1000_IMC_TXDW      E1000_ICR_TXDW      /* Transmit desc written back */
-#define E1000_IMC_TXQE      E1000_ICR_TXQE      /* Transmit Queue empty */
-#define E1000_IMC_LSC       E1000_ICR_LSC       /* Link Status Change */
-#define E1000_IMC_RXSEQ     E1000_ICR_RXSEQ     /* rx sequence error */
-#define E1000_IMC_RXDMT0    E1000_ICR_RXDMT0    /* rx desc min. threshold */
-#define E1000_IMC_RXO       E1000_ICR_RXO       /* rx overrun */
-#define E1000_IMC_RXT0      E1000_ICR_RXT0      /* rx timer intr */
-#define E1000_IMC_MDAC      E1000_ICR_MDAC      /* MDIO access complete */
-#define E1000_IMC_RXCFG     E1000_ICR_RXCFG     /* RX /c/ ordered set */
-#define E1000_IMC_GPI_EN0   E1000_ICR_GPI_EN0   /* GP Int 0 */
-#define E1000_IMC_GPI_EN1   E1000_ICR_GPI_EN1   /* GP Int 1 */
-#define E1000_IMC_GPI_EN2   E1000_ICR_GPI_EN2   /* GP Int 2 */
-#define E1000_IMC_GPI_EN3   E1000_ICR_GPI_EN3   /* GP Int 3 */
-#define E1000_IMC_TXD_LOW   E1000_ICR_TXD_LOW
-#define E1000_IMC_SRPD      E1000_ICR_SRPD
-#define E1000_IMC_ACK       E1000_ICR_ACK       /* Receive Ack frame */
-#define E1000_IMC_MNG       E1000_ICR_MNG       /* Manageability event */
-#define E1000_IMC_DOCK      E1000_ICR_DOCK      /* Dock/Undock */
-#define E1000_IMC_RXD_FIFO_PAR0 E1000_ICR_RXD_FIFO_PAR0 /* queue 0 Rx descriptor FIFO parity error */
-#define E1000_IMC_TXD_FIFO_PAR0 E1000_ICR_TXD_FIFO_PAR0 /* queue 0 Tx descriptor FIFO parity error */
-#define E1000_IMC_HOST_ARB_PAR  E1000_ICR_HOST_ARB_PAR  /* host arb read buffer parity error */
-#define E1000_IMC_PB_PAR        E1000_ICR_PB_PAR        /* packet buffer parity error */
-#define E1000_IMC_RXD_FIFO_PAR1 E1000_ICR_RXD_FIFO_PAR1 /* queue 1 Rx descriptor FIFO parity error */
-#define E1000_IMC_TXD_FIFO_PAR1 E1000_ICR_TXD_FIFO_PAR1 /* queue 1 Tx descriptor FIFO parity error */
-#define E1000_IMC_DSW       E1000_ICR_DSW
-#define E1000_IMC_PHYINT    E1000_ICR_PHYINT
-#define E1000_IMC_EPRST     E1000_ICR_EPRST
-
-/* Receive Control */
-#define E1000_RCTL_RST            0x00000001    /* Software reset */
-#define E1000_RCTL_EN             0x00000002    /* enable */
-#define E1000_RCTL_SBP            0x00000004    /* store bad packet */
-#define E1000_RCTL_UPE            0x00000008    /* unicast promiscuous enable */
-#define E1000_RCTL_MPE            0x00000010    /* multicast promiscuous enab */
-#define E1000_RCTL_LPE            0x00000020    /* long packet enable */
-#define E1000_RCTL_LBM_NO         0x00000000    /* no loopback mode */
-#define E1000_RCTL_LBM_MAC        0x00000040    /* MAC loopback mode */
-#define E1000_RCTL_LBM_SLP        0x00000080    /* serial link loopback mode */
-#define E1000_RCTL_LBM_TCVR       0x000000C0    /* tcvr loopback mode */
-#define E1000_RCTL_DTYP_MASK      0x00000C00    /* Descriptor type mask */
-#define E1000_RCTL_DTYP_PS        0x00000400    /* Packet Split descriptor */
-#define E1000_RCTL_RDMTS_HALF     0x00000000    /* rx desc min threshold size */
-#define E1000_RCTL_RDMTS_QUAT     0x00000100    /* rx desc min threshold size */
-#define E1000_RCTL_RDMTS_EIGTH    0x00000200    /* rx desc min threshold size */
-#define E1000_RCTL_MO_SHIFT       12            /* multicast offset shift */
-#define E1000_RCTL_MO_0           0x00000000    /* multicast offset 11:0 */
-#define E1000_RCTL_MO_1           0x00001000    /* multicast offset 12:1 */
-#define E1000_RCTL_MO_2           0x00002000    /* multicast offset 13:2 */
-#define E1000_RCTL_MO_3           0x00003000    /* multicast offset 15:4 */
-#define E1000_RCTL_MDR            0x00004000    /* multicast desc ring 0 */
-#define E1000_RCTL_BAM            0x00008000    /* broadcast enable */
-/* these buffer sizes are valid if E1000_RCTL_BSEX is 0 */
-#define E1000_RCTL_SZ_2048        0x00000000    /* rx buffer size 2048 */
-#define E1000_RCTL_SZ_1024        0x00010000    /* rx buffer size 1024 */
-#define E1000_RCTL_SZ_512         0x00020000    /* rx buffer size 512 */
-#define E1000_RCTL_SZ_256         0x00030000    /* rx buffer size 256 */
-/* these buffer sizes are valid if E1000_RCTL_BSEX is 1 */
-#define E1000_RCTL_SZ_16384       0x00010000    /* rx buffer size 16384 */
-#define E1000_RCTL_SZ_8192        0x00020000    /* rx buffer size 8192 */
-#define E1000_RCTL_SZ_4096        0x00030000    /* rx buffer size 4096 */
-#define E1000_RCTL_VFE            0x00040000    /* vlan filter enable */
-#define E1000_RCTL_CFIEN          0x00080000    /* canonical form enable */
-#define E1000_RCTL_CFI            0x00100000    /* canonical form indicator */
-#define E1000_RCTL_DPF            0x00400000    /* discard pause frames */
-#define E1000_RCTL_PMCF           0x00800000    /* pass MAC control frames */
-#define E1000_RCTL_BSEX           0x02000000    /* Buffer size extension */
-#define E1000_RCTL_SECRC          0x04000000    /* Strip Ethernet CRC */
-#define E1000_RCTL_FLXBUF_MASK    0x78000000    /* Flexible buffer size */
-#define E1000_RCTL_FLXBUF_SHIFT   27            /* Flexible buffer shift */
-
-
-#define E1000_EEPROM_SWDPIN0   0x0001   /* SWDPIN 0 EEPROM Value */
-#define E1000_EEPROM_LED_LOGIC 0x0020   /* Led Logic Word */
-#define E1000_EEPROM_RW_REG_DATA   16   /* Offset to data in EEPROM read/write registers */
-#define E1000_EEPROM_RW_REG_DONE   0x10 /* Offset to READ/WRITE done bit */
-#define E1000_EEPROM_RW_REG_START  1    /* First bit for telling part to start operation */
-#define E1000_EEPROM_RW_ADDR_SHIFT 8    /* Shift to the address bits */
-#define E1000_EEPROM_POLL_WRITE    1    /* Flag for polling for write complete */
-#define E1000_EEPROM_POLL_READ     0    /* Flag for polling for read complete */
-
-/* 82574 EERD/EEWR registers layout */
-#define E1000_EERW_START        BIT(0)
-#define E1000_EERW_DONE         BIT(1)
-#define E1000_EERW_ADDR_SHIFT   2
-#define E1000_EERW_ADDR_MASK    ((1L << 14) - 1)
-#define E1000_EERW_DATA_SHIFT   16
-#define E1000_EERW_DATA_MASK   ((1L << 16) - 1)
-
-/* Register Bit Masks */
-/* Device Control */
-#define E1000_CTRL_FD       0x00000001  /* Full duplex.0=half; 1=full */
-#define E1000_CTRL_BEM      0x00000002  /* Endian Mode.0=little,1=big */
-#define E1000_CTRL_PRIOR    0x00000004  /* Priority on PCI. 0=rx,1=fair */
-#define E1000_CTRL_GIO_MASTER_DISABLE 0x00000004 /*Blocks new Master requests */
-#define E1000_CTRL_LRST     0x00000008  /* Link reset. 0=normal,1=reset */
-#define E1000_CTRL_TME      0x00000010  /* Test mode. 0=normal,1=test */
-#define E1000_CTRL_SLE      0x00000020  /* Serial Link on 0=dis,1=en */
-#define E1000_CTRL_ASDE     0x00000020  /* Auto-speed detect enable */
-#define E1000_CTRL_SLU      0x00000040  /* Set link up (Force Link) */
-#define E1000_CTRL_ILOS     0x00000080  /* Invert Loss-Of Signal */
-#define E1000_CTRL_SPD_SEL  0x00000300  /* Speed Select Mask */
-#define E1000_CTRL_SPD_10   0x00000000  /* Force 10Mb */
-#define E1000_CTRL_SPD_100  0x00000100  /* Force 100Mb */
-#define E1000_CTRL_SPD_1000 0x00000200  /* Force 1Gb */
-#define E1000_CTRL_BEM32    0x00000400  /* Big Endian 32 mode */
-#define E1000_CTRL_FRCSPD   0x00000800  /* Force Speed */
-#define E1000_CTRL_FRCDPX   0x00001000  /* Force Duplex */
-#define E1000_CTRL_D_UD_EN  0x00002000  /* Dock/Undock enable */
-#define E1000_CTRL_D_UD_POLARITY 0x00004000 /* Defined polarity of Dock/Undock indication in SDP[0] */
-#define E1000_CTRL_FORCE_PHY_RESET 0x00008000 /* Reset both PHY ports, through PHYRST_N pin */
-#define E1000_CTRL_SPD_SHIFT 8          /* Speed Select Shift */
-
-#define E1000_CTRL_EXT_ASDCHK  0x00001000 /* auto speed detection check */
-#define E1000_CTRL_EXT_EE_RST  0x00002000 /* EEPROM reset */
-#define E1000_CTRL_EXT_LINK_EN 0x00010000 /* enable link status from external LINK_0 and LINK_1 pins */
-#define E1000_CTRL_EXT_DRV_LOAD 0x10000000 /* Driver loaded bit for FW */
-#define E1000_CTRL_EXT_EIAME   0x01000000
-#define E1000_CTRL_EXT_IAME    0x08000000 /* Int ACK Auto-mask */
-#define E1000_CTRL_EXT_PBA_CLR 0x80000000 /* PBA Clear */
-#define E1000_CTRL_EXT_INT_TIMERS_CLEAR_ENA 0x20000000
-#define E1000_CTRL_EXT_SPD_BYPS  0x00008000 /* Speed Select Bypass */
-
-#define E1000_CTRL_SWDPIN0  0x00040000  /* SWDPIN 0 value */
-#define E1000_CTRL_SWDPIN1  0x00080000  /* SWDPIN 1 value */
-#define E1000_CTRL_SWDPIN2  0x00100000  /* SWDPIN 2 value */
-#define E1000_CTRL_SWDPIN3  0x00200000  /* SWDPIN 3 value */
-#define E1000_CTRL_SWDPIO0  0x00400000  /* SWDPIN 0 Input or output */
-#define E1000_CTRL_SWDPIO1  0x00800000  /* SWDPIN 1 input or output */
-#define E1000_CTRL_SWDPIO2  0x01000000  /* SWDPIN 2 input or output */
-#define E1000_CTRL_SWDPIO3  0x02000000  /* SWDPIN 3 input or output */
-#define E1000_CTRL_ADVD3WUC 0x00100000  /* D3 WUC */
-#define E1000_CTRL_RST      0x04000000  /* Global reset */
-#define E1000_CTRL_RFCE     0x08000000  /* Receive Flow Control enable */
-#define E1000_CTRL_TFCE     0x10000000  /* Transmit flow control enable */
-#define E1000_CTRL_RTE      0x20000000  /* Routing tag enable */
-#define E1000_CTRL_VME      0x40000000  /* IEEE VLAN mode enable */
-#define E1000_CTRL_PHY_RST  0x80000000  /* PHY Reset */
-#define E1000_CTRL_SW2FW_INT 0x02000000  /* Initiate an interrupt to manageability engine */
-
-/* Device Status */
-#define E1000_STATUS_FD         0x00000001      /* Full duplex.0=half,1=full */
-#define E1000_STATUS_LU         0x00000002      /* Link up.0=no,1=link */
 #define E1000_STATUS_FUNC_MASK  0x0000000C      /* PCI Function Mask */
 #define E1000_STATUS_FUNC_SHIFT 2
 #define E1000_STATUS_FUNC_0     0x00000000      /* Function 0 */
@@ -827,9 +247,6 @@
 #define E1000_STATUS_TXOFF      0x00000010      /* transmission paused */
 #define E1000_STATUS_TBIMODE    0x00000020      /* TBI mode */
 #define E1000_STATUS_SPEED_MASK 0x000000C0
-#define E1000_STATUS_SPEED_10   0x00000000      /* Speed 10Mb/s */
-#define E1000_STATUS_SPEED_100  0x00000040      /* Speed 100Mb/s */
-#define E1000_STATUS_SPEED_1000 0x00000080      /* Speed 1000Mb/s */
 #define E1000_STATUS_LAN_INIT_DONE 0x00000200   /* Lan Init Completion
                                                    by EEPROM/Flash */
 #define E1000_STATUS_ASDV       0x00000300      /* Auto speed detect value */
@@ -837,9 +254,7 @@
 #define E1000_STATUS_ASDV_100   0x00000100      /* ASDV 100Mb */
 #define E1000_STATUS_ASDV_1000  0x00000200      /* ASDV 1Gb */
 #define E1000_STATUS_DOCK_CI    0x00000800      /* Change in Dock/Undock state. Clear on write '0'. */
-#define E1000_STATUS_GIO_MASTER_ENABLE 0x00080000 /* Status of Master requests. */
 #define E1000_STATUS_MTXCKOK    0x00000400      /* MTX clock running OK */
-#define E1000_STATUS_PHYRA      0x00000400      /* PHY Reset Asserted */
 #define E1000_STATUS_PCI66      0x00000800      /* In 66Mhz slot */
 #define E1000_STATUS_BUS64      0x00001000      /* In 64 bit slot */
 #define E1000_STATUS_PCIX_MODE  0x00002000      /* PCI-X mode */
@@ -857,111 +272,6 @@
 #define E1000_STATUS_SPEED_SHIFT  6
 #define E1000_STATUS_ASDV_SHIFT   8
 
-/* EEPROM/Flash Control */
-#define E1000_EECD_SK        0x00000001 /* EEPROM Clock */
-#define E1000_EECD_CS        0x00000002 /* EEPROM Chip Select */
-#define E1000_EECD_DI        0x00000004 /* EEPROM Data In */
-#define E1000_EECD_DO        0x00000008 /* EEPROM Data Out */
-#define E1000_EECD_FWE_MASK  0x00000030
-#define E1000_EECD_FWE_DIS   0x00000010 /* Disable FLASH writes */
-#define E1000_EECD_FWE_EN    0x00000020 /* Enable FLASH writes */
-#define E1000_EECD_FWE_SHIFT 4
-#define E1000_EECD_REQ       0x00000040 /* EEPROM Access Request */
-#define E1000_EECD_GNT       0x00000080 /* EEPROM Access Grant */
-#define E1000_EECD_PRES      0x00000100 /* EEPROM Present */
-#define E1000_EECD_SIZE      0x00000200 /* EEPROM Size (0=64 word 1=256 word) */
-#define E1000_EECD_ADDR_BITS 0x00000400 /* EEPROM Addressing bits based on type
-                                         * (0-small, 1-large) */
-#define E1000_EECD_TYPE      0x00002000 /* EEPROM Type (1-SPI, 0-Microwire) */
-#ifndef E1000_EEPROM_GRANT_ATTEMPTS
-#define E1000_EEPROM_GRANT_ATTEMPTS 1000 /* EEPROM # attempts to gain grant */
-#endif
-#define E1000_EECD_AUTO_RD          0x00000200  /* EEPROM Auto Read done */
-#define E1000_EECD_SIZE_EX_MASK     0x00007800  /* EEprom Size */
-#define E1000_EECD_SIZE_EX_SHIFT    11
-#define E1000_EECD_NVADDS    0x00018000 /* NVM Address Size */
-#define E1000_EECD_SELSHAD   0x00020000 /* Select Shadow RAM */
-#define E1000_EECD_INITSRAM  0x00040000 /* Initialize Shadow RAM */
-#define E1000_EECD_FLUPD     0x00080000 /* Update FLASH */
-#define E1000_EECD_AUPDEN    0x00100000 /* Enable Autonomous FLASH update */
-#define E1000_EECD_SHADV     0x00200000 /* Shadow RAM Data Valid */
-#define E1000_EECD_SEC1VAL   0x00400000 /* Sector One Valid */
-
-
-#define E1000_EECD_SECVAL_SHIFT      22
-#define E1000_STM_OPCODE     0xDB00
-#define E1000_HICR_FW_RESET  0xC0
-
-#define E1000_SHADOW_RAM_WORDS     2048
-#define E1000_ICH_NVM_SIG_WORD     0x13
-#define E1000_ICH_NVM_SIG_MASK     0xC0
-
-/* MDI Control */
-#define E1000_MDIC_DATA_MASK 0x0000FFFF
-#define E1000_MDIC_REG_MASK  0x001F0000
-#define E1000_MDIC_REG_SHIFT 16
-#define E1000_MDIC_PHY_MASK  0x03E00000
-#define E1000_MDIC_PHY_SHIFT 21
-#define E1000_MDIC_OP_WRITE  0x04000000
-#define E1000_MDIC_OP_READ   0x08000000
-#define E1000_MDIC_READY     0x10000000
-#define E1000_MDIC_INT_EN    0x20000000
-#define E1000_MDIC_ERROR     0x40000000
-
-/* Rx Interrupt Delay Timer */
-#define E1000_RDTR_FPD       BIT(31)
-
-/* Tx Interrupt Delay Timer */
-#define E1000_TIDV_FPD       BIT(31)
-
-/* Delay increments in nanoseconds for delayed interrupts registers */
-#define E1000_INTR_DELAY_NS_RES (1024)
-
-/* Delay increments in nanoseconds for interrupt throttling registers */
-#define E1000_INTR_THROTTLING_NS_RES (256)
-
-/* EEPROM Commands - Microwire */
-#define EEPROM_READ_OPCODE_MICROWIRE  0x6  /* EEPROM read opcode */
-#define EEPROM_WRITE_OPCODE_MICROWIRE 0x5  /* EEPROM write opcode */
-#define EEPROM_ERASE_OPCODE_MICROWIRE 0x7  /* EEPROM erase opcode */
-#define EEPROM_EWEN_OPCODE_MICROWIRE  0x13 /* EEPROM erase/write enable */
-#define EEPROM_EWDS_OPCODE_MICROWIRE  0x10 /* EEPROM erast/write disable */
-
-/* EEPROM Word Offsets */
-#define EEPROM_COMPAT                 0x0003
-#define EEPROM_ID_LED_SETTINGS        0x0004
-#define EEPROM_VERSION                0x0005
-#define EEPROM_SERDES_AMPLITUDE       0x0006 /* For SERDES output amplitude adjustment. */
-#define EEPROM_PHY_CLASS_WORD         0x0007
-#define EEPROM_INIT_CONTROL1_REG      0x000A
-#define EEPROM_INIT_CONTROL2_REG      0x000F
-#define EEPROM_SWDEF_PINS_CTRL_PORT_1 0x0010
-#define EEPROM_INIT_CONTROL3_PORT_B   0x0014
-#define EEPROM_INIT_3GIO_3            0x001A
-#define EEPROM_SWDEF_PINS_CTRL_PORT_0 0x0020
-#define EEPROM_INIT_CONTROL3_PORT_A   0x0024
-#define EEPROM_CFG                    0x0012
-#define EEPROM_FLASH_VERSION          0x0032
-#define EEPROM_CHECKSUM_REG           0x003F
-
-#define E1000_EEPROM_CFG_DONE         0x00040000   /* MNG config cycle done */
-#define E1000_EEPROM_CFG_DONE_PORT_1  0x00080000   /* ...for second port */
-
-/* PCI Express Control */
-/* 3GIO Control Register - GCR (0x05B00; RW) */
-#define E1000_L0S_ADJUST              (1 << 9)
-#define E1000_L1_ENTRY_LATENCY_MSB    (1 << 23)
-#define E1000_L1_ENTRY_LATENCY_LSB    (1 << 25 | 1 << 26)
-
-#define E1000_L0S_ADJUST              (1 << 9)
-#define E1000_L1_ENTRY_LATENCY_MSB    (1 << 23)
-#define E1000_L1_ENTRY_LATENCY_LSB    (1 << 25 | 1 << 26)
-
-#define E1000_GCR_RO_BITS             (1 << 23 | 1 << 25 | 1 << 26)
-
-/* MSI-X PBA Clear register */
-#define E1000_PBACLR_VALID_MASK       (BIT(5) - 1)
-
 /* Transmit Descriptor */
 struct e1000_tx_desc {
     uint64_t buffer_addr;       /* Address of the descriptor's data buffer */
@@ -983,269 +293,7 @@ struct e1000_tx_desc {
     } upper;
 };
 
-/* Transmit Descriptor bit definitions */
-#define E1000_TXD_DTYP_D     0x00100000 /* Data Descriptor */
-#define E1000_TXD_DTYP_C     0x00000000 /* Context Descriptor */
 #define E1000_TXD_POPTS_IXSM 0x01       /* Insert IP checksum */
 #define E1000_TXD_POPTS_TXSM 0x02       /* Insert TCP/UDP checksum */
-#define E1000_TXD_CMD_EOP    0x01000000 /* End of Packet */
-#define E1000_TXD_CMD_IFCS   0x02000000 /* Insert FCS (Ethernet CRC) */
-#define E1000_TXD_CMD_IC     0x04000000 /* Insert Checksum */
-#define E1000_TXD_CMD_RS     0x08000000 /* Report Status */
-#define E1000_TXD_CMD_RPS    0x10000000 /* Report Packet Sent */
-#define E1000_TXD_CMD_DEXT   0x20000000 /* Descriptor extension (0 = legacy) */
-#define E1000_TXD_CMD_VLE    0x40000000 /* Add VLAN tag */
-#define E1000_TXD_CMD_IDE    0x80000000 /* Enable Tidv register */
-#define E1000_TXD_STAT_DD    0x00000001 /* Descriptor Done */
-#define E1000_TXD_STAT_EC    0x00000002 /* Excess Collisions */
-#define E1000_TXD_STAT_LC    0x00000004 /* Late Collisions */
-#define E1000_TXD_STAT_TU    0x00000008 /* Transmit underrun */
-#define E1000_TXD_CMD_TCP    0x01000000 /* TCP packet */
-#define E1000_TXD_CMD_IP     0x02000000 /* IP packet */
-#define E1000_TXD_CMD_TSE    0x04000000 /* TCP Seg enable */
-#define E1000_TXD_CMD_SNAP   0x40000000 /* Update SNAP header */
-#define E1000_TXD_STAT_TC    0x00000004 /* Tx Underrun */
-#define E1000_TXD_EXTCMD_TSTAMP 0x00000010 /* IEEE1588 Timestamp packet */
-
-/* Transmit Control */
-#define E1000_TCTL_RST    0x00000001    /* software reset */
-#define E1000_TCTL_EN     0x00000002    /* enable tx */
-#define E1000_TCTL_BCE    0x00000004    /* busy check enable */
-#define E1000_TCTL_PSP    0x00000008    /* pad short packets */
-#define E1000_TCTL_CT     0x00000ff0    /* collision threshold */
-#define E1000_TCTL_COLD   0x003ff000    /* collision distance */
-#define E1000_TCTL_SWXOFF 0x00400000    /* SW Xoff transmission */
-#define E1000_TCTL_PBE    0x00800000    /* Packet Burst Enable */
-#define E1000_TCTL_RTLC   0x01000000    /* Re-transmit on late collision */
-#define E1000_TCTL_NRTU   0x02000000    /* No Re-transmit on underrun */
-#define E1000_TCTL_MULR   0x10000000    /* Multiple request support */
-
-/* Legacy Receive Descriptor */
-struct e1000_rx_desc {
-    uint64_t buffer_addr; /* Address of the descriptor's data buffer */
-    uint16_t length;     /* Length of data DMAed into data buffer */
-    uint16_t csum;       /* Packet checksum */
-    uint8_t status;      /* Descriptor status */
-    uint8_t errors;      /* Descriptor Errors */
-    uint16_t special;
-};
-
-/* Extended Receive Descriptor */
-union e1000_rx_desc_extended {
-    struct {
-        uint64_t buffer_addr;
-        uint64_t reserved;
-    } read;
-    struct {
-        struct {
-            uint32_t mrq;           /* Multiple Rx Queues */
-            union {
-                uint32_t rss;       /* RSS Hash */
-                struct {
-                    uint16_t ip_id; /* IP id */
-                    uint16_t csum;  /* Packet Checksum */
-                } csum_ip;
-            } hi_dword;
-        } lower;
-        struct {
-            uint32_t status_error;  /* ext status/error */
-            uint16_t length;
-            uint16_t vlan;          /* VLAN tag */
-        } upper;
-    } wb;                           /* writeback */
-};
-
-#define MAX_PS_BUFFERS 4
-
-/* Number of packet split data buffers (not including the header buffer) */
-#define PS_PAGE_BUFFERS    (MAX_PS_BUFFERS - 1)
-
-/* Receive Descriptor - Packet Split */
-union e1000_rx_desc_packet_split {
-    struct {
-        /* one buffer for protocol header(s), three data buffers */
-        uint64_t buffer_addr[MAX_PS_BUFFERS];
-    } read;
-    struct {
-        struct {
-            uint32_t mrq;          /* Multiple Rx Queues */
-            union {
-                uint32_t rss;          /* RSS Hash */
-                struct {
-                    uint16_t ip_id;    /* IP id */
-                    uint16_t csum;     /* Packet Checksum */
-                } csum_ip;
-            } hi_dword;
-        } lower;
-        struct {
-            uint32_t status_error;     /* ext status/error */
-            uint16_t length0;      /* length of buffer 0 */
-            uint16_t vlan;         /* VLAN tag */
-        } middle;
-        struct {
-            uint16_t header_status;
-            /* length of buffers 1-3 */
-            uint16_t length[PS_PAGE_BUFFERS];
-        } upper;
-        uint64_t reserved;
-    } wb; /* writeback */
-};
-
-/* Receive Checksum Control bits */
-#define E1000_RXCSUM_IPOFLD     0x100   /* IP Checksum Offload Enable */
-#define E1000_RXCSUM_TUOFLD     0x200   /* TCP/UDP Checksum Offload Enable */
-#define E1000_RXCSUM_PCSD       0x2000  /* Packet Checksum Disable */
-
-#define E1000_RING_DESC_LEN       (16)
-#define E1000_RING_DESC_LEN_SHIFT (4)
-
-#define E1000_MIN_RX_DESC_LEN   E1000_RING_DESC_LEN
-#define E1000_MAX_RX_DESC_LEN   (sizeof(union e1000_rx_desc_packet_split))
-
-/* Receive Descriptor bit definitions */
-#define E1000_RXD_STAT_DD       0x01    /* Descriptor Done */
-#define E1000_RXD_STAT_EOP      0x02    /* End of Packet */
-#define E1000_RXD_STAT_IXSM     0x04    /* Ignore checksum */
-#define E1000_RXD_STAT_VP       0x08    /* IEEE VLAN Packet */
-#define E1000_RXD_STAT_UDPCS    0x10    /* UDP xsum caculated */
-#define E1000_RXD_STAT_TCPCS    0x20    /* TCP xsum calculated */
-#define E1000_RXD_STAT_IPCS     0x40    /* IP xsum calculated */
-#define E1000_RXD_STAT_PIF      0x80    /* passed in-exact filter */
-#define E1000_RXD_STAT_IPIDV    0x200   /* IP identification valid */
-#define E1000_RXD_STAT_UDPV     0x400   /* Valid UDP checksum */
-#define E1000_RXD_STAT_ACK      0x8000  /* ACK Packet indication */
-#define E1000_RXD_ERR_CE        0x01    /* CRC Error */
-#define E1000_RXD_ERR_SE        0x02    /* Symbol Error */
-#define E1000_RXD_ERR_SEQ       0x04    /* Sequence Error */
-#define E1000_RXD_ERR_CXE       0x10    /* Carrier Extension Error */
-#define E1000_RXD_ERR_TCPE      0x20    /* TCP/UDP Checksum Error */
-#define E1000_RXD_ERR_IPE       0x40    /* IP Checksum Error */
-#define E1000_RXD_ERR_RXE       0x80    /* Rx Data Error */
-#define E1000_RXD_SPC_VLAN_MASK 0x0FFF  /* VLAN ID is in lower 12 bits */
-#define E1000_RXD_SPC_PRI_MASK  0xE000  /* Priority is in upper 3 bits */
-#define E1000_RXD_SPC_PRI_SHIFT 13
-#define E1000_RXD_SPC_CFI_MASK  0x1000  /* CFI is bit 12 */
-#define E1000_RXD_SPC_CFI_SHIFT 12
-
-/* RX packet types */
-#define E1000_RXD_PKT_MAC       (0)
-#define E1000_RXD_PKT_IP4       (1)
-#define E1000_RXD_PKT_IP4_XDP   (2)
-#define E1000_RXD_PKT_IP6       (5)
-#define E1000_RXD_PKT_IP6_XDP   (6)
-
-#define E1000_RXD_PKT_TYPE(t) ((t) << 16)
-
-#define E1000_RXDEXT_STATERR_CE    0x01000000
-#define E1000_RXDEXT_STATERR_SE    0x02000000
-#define E1000_RXDEXT_STATERR_SEQ   0x04000000
-#define E1000_RXDEXT_STATERR_CXE   0x10000000
-#define E1000_RXDEXT_STATERR_TCPE  0x20000000
-#define E1000_RXDEXT_STATERR_IPE   0x40000000
-#define E1000_RXDEXT_STATERR_RXE   0x80000000
-
-#define E1000_RXDPS_HDRSTAT_HDRSP        0x00008000
-#define E1000_RXDPS_HDRSTAT_HDRLEN_MASK  0x000003FF
-
-/* Receive Address */
-#define E1000_RAH_AV  0x80000000        /* Receive descriptor valid */
-
-/* Offload Context Descriptor */
-struct e1000_context_desc {
-    union {
-        uint32_t ip_config;
-        struct {
-            uint8_t ipcss;      /* IP checksum start */
-            uint8_t ipcso;      /* IP checksum offset */
-            uint16_t ipcse;     /* IP checksum end */
-        } ip_fields;
-    } lower_setup;
-    union {
-        uint32_t tcp_config;
-        struct {
-            uint8_t tucss;      /* TCP checksum start */
-            uint8_t tucso;      /* TCP checksum offset */
-            uint16_t tucse;     /* TCP checksum end */
-        } tcp_fields;
-    } upper_setup;
-    uint32_t cmd_and_length;    /* */
-    union {
-        uint32_t data;
-        struct {
-            uint8_t status;     /* Descriptor status */
-            uint8_t hdr_len;    /* Header length */
-            uint16_t mss;       /* Maximum segment size */
-        } fields;
-    } tcp_seg_setup;
-};
-
-/* Offload data descriptor */
-struct e1000_data_desc {
-    uint64_t buffer_addr;       /* Address of the descriptor's buffer address */
-    union {
-        uint32_t data;
-        struct {
-            uint16_t length;    /* Data buffer length */
-            uint8_t typ_len_ext;        /* */
-            uint8_t cmd;        /* */
-        } flags;
-    } lower;
-    union {
-        uint32_t data;
-        struct {
-            uint8_t status;     /* Descriptor status */
-            uint8_t popts;      /* Packet Options */
-            uint16_t special;   /* */
-        } fields;
-    } upper;
-};
-
-/* Management Control */
-#define E1000_MANC_SMBUS_EN      0x00000001 /* SMBus Enabled - RO */
-#define E1000_MANC_ASF_EN        0x00000002 /* ASF Enabled - RO */
-#define E1000_MANC_R_ON_FORCE    0x00000004 /* Reset on Force TCO - RO */
-#define E1000_MANC_RMCP_EN       0x00000100 /* Enable RCMP 026Fh Filtering */
-#define E1000_MANC_0298_EN       0x00000200 /* Enable RCMP 0298h Filtering */
-#define E1000_MANC_IPV4_EN       0x00000400 /* Enable IPv4 */
-#define E1000_MANC_IPV6_EN       0x00000800 /* Enable IPv6 */
-#define E1000_MANC_SNAP_EN       0x00001000 /* Accept LLC/SNAP */
-#define E1000_MANC_ARP_EN        0x00002000 /* Enable ARP Request Filtering */
-#define E1000_MANC_NEIGHBOR_EN   0x00004000 /* Enable Neighbor Discovery
-                                             * Filtering */
-#define E1000_MANC_ARP_RES_EN    0x00008000 /* Enable ARP response Filtering */
-#define E1000_MANC_DIS_IP_CHK_ARP  0x10000000 /* Disable IP address chacking */
-                                              /*for ARP packets - in 82574 */
-#define E1000_MANC_TCO_RESET     0x00010000 /* TCO Reset Occurred */
-#define E1000_MANC_RCV_TCO_EN    0x00020000 /* Receive TCO Packets Enabled */
-#define E1000_MANC_REPORT_STATUS 0x00040000 /* Status Reporting Enabled */
-#define E1000_MANC_RCV_ALL       0x00080000 /* Receive All Enabled */
-#define E1000_MANC_BLK_PHY_RST_ON_IDE   0x00040000 /* Block phy resets */
-#define E1000_MANC_EN_MAC_ADDR_FILTER   0x00100000 /* Enable MAC address
-                                                    * filtering */
-#define E1000_MANC_EN_MNG2HOST   0x00200000 /* Enable MNG packets to host
-                                             * memory */
-#define E1000_MANC_EN_IP_ADDR_FILTER    0x00400000 /* Enable IP address
-                                                    * filtering */
-#define E1000_MANC_EN_XSUM_FILTER   0x00800000 /* Enable checksum filtering */
-#define E1000_MANC_BR_EN         0x01000000 /* Enable broadcast filtering */
-#define E1000_MANC_SMB_REQ       0x01000000 /* SMBus Request */
-#define E1000_MANC_SMB_GNT       0x02000000 /* SMBus Grant */
-#define E1000_MANC_SMB_CLK_IN    0x04000000 /* SMBus Clock In */
-#define E1000_MANC_SMB_DATA_IN   0x08000000 /* SMBus Data In */
-#define E1000_MANC_SMB_DATA_OUT  0x10000000 /* SMBus Data Out */
-#define E1000_MANC_SMB_CLK_OUT   0x20000000 /* SMBus Clock Out */
-
-#define E1000_MANC_SMB_DATA_OUT_SHIFT  28 /* SMBus Data Out Shift */
-#define E1000_MANC_SMB_CLK_OUT_SHIFT   29 /* SMBus Clock Out Shift */
-
-/* FACTPS Control */
-#define E1000_FACTPS_LAN0_ON     0x00000004 /* Lan 0 enable */
-
-/* For checksumming, the sum of all words in the EEPROM should equal 0xBABA. */
-#define EEPROM_SUM 0xBABA
-
-/* I/O-Mapped Access to Internal Registers, Memories, and Flash */
-#define E1000_IOADDR 0x00
-#define E1000_IODATA 0x04
 
 #endif /* HW_E1000_REGS_H */
diff --git a/hw/net/e1000e.c b/hw/net/e1000e.c
index 7523e9f5d2..c3848797b8 100644
--- a/hw/net/e1000e.c
+++ b/hw/net/e1000e.c
@@ -1,37 +1,37 @@
 /*
-* QEMU INTEL 82574 GbE NIC emulation
-*
-* Software developer's manuals:
-* http://www.intel.com/content/dam/doc/datasheet/82574l-gbe-controller-datasheet.pdf
-*
-* Copyright (c) 2015 Ravello Systems LTD (http://ravellosystems.com)
-* Developed by Daynix Computing LTD (http://www.daynix.com)
-*
-* Authors:
-* Dmitry Fleytman <dmitry@daynix.com>
-* Leonid Bloch <leonid@daynix.com>
-* Yan Vugenfirer <yan@daynix.com>
-*
-* Based on work done by:
-* Nir Peleg, Tutis Systems Ltd. for Qumranet Inc.
-* Copyright (c) 2008 Qumranet
-* Based on work done by:
-* Copyright (c) 2007 Dan Aloni
-* Copyright (c) 2004 Antony T Curtis
-*
-* This library is free software; you can redistribute it and/or
-* modify it under the terms of the GNU Lesser General Public
-* License as published by the Free Software Foundation; either
-* version 2.1 of the License, or (at your option) any later version.
-*
-* This library is distributed in the hope that it will be useful,
-* but WITHOUT ANY WARRANTY; without even the implied warranty of
-* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-* Lesser General Public License for more details.
-*
-* You should have received a copy of the GNU Lesser General Public
-* License along with this library; if not, see <http://www.gnu.org/licenses/>.
-*/
+ * QEMU INTEL 82574 GbE NIC emulation
+ *
+ * Software developer's manuals:
+ * http://www.intel.com/content/dam/doc/datasheet/82574l-gbe-controller-datasheet.pdf
+ *
+ * Copyright (c) 2015 Ravello Systems LTD (http://ravellosystems.com)
+ * Developed by Daynix Computing LTD (http://www.daynix.com)
+ *
+ * Authors:
+ * Dmitry Fleytman <dmitry@daynix.com>
+ * Leonid Bloch <leonid@daynix.com>
+ * Yan Vugenfirer <yan@daynix.com>
+ *
+ * Based on work done by:
+ * Nir Peleg, Tutis Systems Ltd. for Qumranet Inc.
+ * Copyright (c) 2008 Qumranet
+ * Based on work done by:
+ * Copyright (c) 2007 Dan Aloni
+ * Copyright (c) 2004 Antony T Curtis
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
 
 #include "qemu/osdep.h"
 #include "qemu/units.h"
@@ -42,13 +42,13 @@
 #include "qemu/range.h"
 #include "sysemu/sysemu.h"
 #include "hw/hw.h"
+#include "hw/net/mii.h"
 #include "hw/pci/msi.h"
 #include "hw/pci/msix.h"
 #include "hw/qdev-properties.h"
 #include "migration/vmstate.h"
 
-#include "e1000_regs.h"
-
+#include "e1000_common.h"
 #include "e1000x_common.h"
 #include "e1000e_core.h"
 
@@ -81,6 +81,7 @@ struct E1000EState {
 
     E1000ECore core;
     bool init_vet;
+    bool timadj;
 };
 
 #define E1000E_MMIO_IDX     0
@@ -239,9 +240,9 @@ static NetClientInfo net_e1000e_info = {
 };
 
 /*
-* EEPROM (NVM) contents documented in Table 36, section 6.1
-* and generally 6.1.2 Software accessed words.
-*/
+ * EEPROM (NVM) contents are documented in Table 36, section 6.1,
+ * and generally in section 6.1.2, "Software accessed words".
+ */
 static const uint16_t e1000e_eeprom_template[64] = {
   /*        Address        |    Compat.    | ImVer |   Compat.     */
     0x0000, 0x0000, 0x0000, 0x0420, 0xf746, 0x2010, 0xffff, 0xffff,
@@ -512,11 +513,11 @@ static void e1000e_pci_uninit(PCIDevice *pci_dev)
     msi_uninit(pci_dev);
 }
 
-static void e1000e_qdev_reset(DeviceState *dev)
+static void e1000e_qdev_reset_hold(Object *obj)
 {
-    E1000EState *s = E1000E(dev);
+    E1000EState *s = E1000E(obj);
 
-    trace_e1000e_cb_qdev_reset();
+    trace_e1000e_cb_qdev_reset_hold();
 
     e1000e_core_reset(&s->core);
 
@@ -553,6 +554,12 @@ static int e1000e_post_load(void *opaque, int version_id)
     return e1000e_core_post_load(&s->core);
 }
 
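+/* vmstate test: TIMADJ state is only migrated when the "migrate-timadj" property is set */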
+static bool e1000e_migrate_timadj(void *opaque, int version_id)
+{
+    E1000EState *s = opaque;
+    return s->timadj;
+}
+
 static const VMStateDescription e1000e_vmstate_tx = {
     .name = "e1000e-tx",
     .version_id = 1,
@@ -630,12 +637,11 @@ static const VMStateDescription e1000e_vmstate = {
         VMSTATE_E1000E_INTR_DELAY_TIMER(core.tidv, E1000EState),
 
         VMSTATE_E1000E_INTR_DELAY_TIMER(core.itr, E1000EState),
-        VMSTATE_BOOL(core.itr_intr_pending, E1000EState),
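+        /* was itr_intr_pending; kept as padding for migration compatibility */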
+        VMSTATE_UNUSED(1),
 
         VMSTATE_E1000E_INTR_DELAY_TIMER_ARRAY(core.eitr, E1000EState,
                                               E1000E_MSIX_VEC_NUM),
-        VMSTATE_BOOL_ARRAY(core.eitr_intr_pending, E1000EState,
-                           E1000E_MSIX_VEC_NUM),
+        VMSTATE_UNUSED(E1000E_MSIX_VEC_NUM),
 
         VMSTATE_UINT32(core.itr_guest_value, E1000EState),
         VMSTATE_UINT32_ARRAY(core.eitr_guest_value, E1000EState,
@@ -645,6 +651,9 @@ static const VMStateDescription e1000e_vmstate = {
 
         VMSTATE_STRUCT_ARRAY(core.tx, E1000EState, E1000E_NUM_QUEUES, 0,
                              e1000e_vmstate_tx, struct e1000e_tx),
+
+        VMSTATE_INT64_TEST(core.timadj, E1000EState, e1000e_migrate_timadj),
+
         VMSTATE_END_OF_LIST()
     }
 };
@@ -663,12 +672,14 @@ static Property e1000e_properties[] = {
     DEFINE_PROP_SIGNED("subsys", E1000EState, subsys, 0,
                         e1000e_prop_subsys, uint16_t),
     DEFINE_PROP_BOOL("init-vet", E1000EState, init_vet, true),
+    DEFINE_PROP_BOOL("migrate-timadj", E1000EState, timadj, true),
     DEFINE_PROP_END_OF_LIST(),
 };
 
 static void e1000e_class_init(ObjectClass *class, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(class);
+    ResettableClass *rc = RESETTABLE_CLASS(class);
     PCIDeviceClass *c = PCI_DEVICE_CLASS(class);
 
     c->realize = e1000e_pci_realize;
@@ -679,8 +690,9 @@ static void e1000e_class_init(ObjectClass *class, void *data)
     c->romfile = "efi-e1000e.rom";
     c->class_id = PCI_CLASS_NETWORK_ETHERNET;
 
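+    /* Reset is handled via the Resettable "hold" phase instead of a legacy DeviceClass reset hook */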
+    rc->phases.hold = e1000e_qdev_reset_hold;
+
     dc->desc = "Intel 82574L GbE Controller";
-    dc->reset = e1000e_qdev_reset;
     dc->vmsd = &e1000e_vmstate;
 
     e1000e_prop_disable_vnet = qdev_prop_uint8;
diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c
index fc9cdb4528..4d9679ca0b 100644
--- a/hw/net/e1000e_core.c
+++ b/hw/net/e1000e_core.c
@@ -1,42 +1,43 @@
 /*
-* Core code for QEMU e1000e emulation
-*
-* Software developer's manuals:
-* http://www.intel.com/content/dam/doc/datasheet/82574l-gbe-controller-datasheet.pdf
-*
-* Copyright (c) 2015 Ravello Systems LTD (http://ravellosystems.com)
-* Developed by Daynix Computing LTD (http://www.daynix.com)
-*
-* Authors:
-* Dmitry Fleytman <dmitry@daynix.com>
-* Leonid Bloch <leonid@daynix.com>
-* Yan Vugenfirer <yan@daynix.com>
-*
-* Based on work done by:
-* Nir Peleg, Tutis Systems Ltd. for Qumranet Inc.
-* Copyright (c) 2008 Qumranet
-* Based on work done by:
-* Copyright (c) 2007 Dan Aloni
-* Copyright (c) 2004 Antony T Curtis
-*
-* This library is free software; you can redistribute it and/or
-* modify it under the terms of the GNU Lesser General Public
-* License as published by the Free Software Foundation; either
-* version 2.1 of the License, or (at your option) any later version.
-*
-* This library is distributed in the hope that it will be useful,
-* but WITHOUT ANY WARRANTY; without even the implied warranty of
-* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-* Lesser General Public License for more details.
-*
-* You should have received a copy of the GNU Lesser General Public
-* License along with this library; if not, see <http://www.gnu.org/licenses/>.
-*/
+ * Core code for QEMU e1000e emulation
+ *
+ * Software developer's manuals:
+ * http://www.intel.com/content/dam/doc/datasheet/82574l-gbe-controller-datasheet.pdf
+ *
+ * Copyright (c) 2015 Ravello Systems LTD (http://ravellosystems.com)
+ * Developed by Daynix Computing LTD (http://www.daynix.com)
+ *
+ * Authors:
+ * Dmitry Fleytman <dmitry@daynix.com>
+ * Leonid Bloch <leonid@daynix.com>
+ * Yan Vugenfirer <yan@daynix.com>
+ *
+ * Based on work done by:
+ * Nir Peleg, Tutis Systems Ltd. for Qumranet Inc.
+ * Copyright (c) 2008 Qumranet
+ * Based on work done by:
+ * Copyright (c) 2007 Dan Aloni
+ * Copyright (c) 2004 Antony T Curtis
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
 
 #include "qemu/osdep.h"
 #include "qemu/log.h"
 #include "net/net.h"
 #include "net/tap.h"
+#include "hw/net/mii.h"
 #include "hw/pci/msi.h"
 #include "hw/pci/msix.h"
 #include "sysemu/runstate.h"
@@ -44,18 +45,32 @@
 #include "net_tx_pkt.h"
 #include "net_rx_pkt.h"
 
+#include "e1000_common.h"
 #include "e1000x_common.h"
 #include "e1000e_core.h"
 
 #include "trace.h"
 
-#define E1000E_MIN_XITR     (500) /* No more then 7813 interrupts per
-                                     second according to spec 10.2.4.2 */
+/* No more than 7813 interrupts per second according to spec 10.2.4.2 */
+#define E1000E_MIN_XITR     (500)
+
 #define E1000E_MAX_TX_FRAGS (64)
 
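+/*
+ * The guest can program legacy, extended or packet-split Rx descriptors;
+ * all three layouts are accessed through this union.
+ */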
+union e1000_rx_desc_union {
+    struct e1000_rx_desc legacy;
+    union e1000_rx_desc_extended extended;
+    union e1000_rx_desc_packet_split packet_split;
+};
+
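+/*
+ * Declared ahead of its definition so the Tx loopback callback below
+ * can re-inject frames into the Rx path.
+ */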
+static ssize_t
+e1000e_receive_internal(E1000ECore *core, const struct iovec *iov, int iovcnt,
+                        bool has_vnet);
+
 static inline void
 e1000e_set_interrupt_cause(E1000ECore *core, uint32_t val);
 
+static void e1000e_reset(E1000ECore *core, bool sw);
+
 static inline void
 e1000e_process_ts_option(E1000ECore *core, struct e1000_tx_desc *dp)
 {
@@ -148,15 +163,8 @@ e1000e_intrmgr_on_throttling_timer(void *opaque)
 {
     E1000IntrDelayTimer *timer = opaque;
 
-    assert(!msix_enabled(timer->core->owner));
-
     timer->running = false;
 
-    if (!timer->core->itr_intr_pending) {
-        trace_e1000e_irq_throttling_no_pending_interrupts();
-        return;
-    }
-
     if (msi_enabled(timer->core->owner)) {
         trace_e1000e_irq_msi_notify_postponed();
         /* Clear msi_causes_pending to fire MSI eventually */
@@ -174,15 +182,8 @@ e1000e_intrmgr_on_msix_throttling_timer(void *opaque)
     E1000IntrDelayTimer *timer = opaque;
     int idx = timer - &timer->core->eitr[0];
 
-    assert(msix_enabled(timer->core->owner));
-
     timer->running = false;
 
-    if (!timer->core->eitr_intr_pending[idx]) {
-        trace_e1000e_irq_throttling_no_pending_vec(idx);
-        return;
-    }
-
     trace_e1000e_irq_msix_notify_postponed_vec(idx);
     msix_notify(timer->core->owner, idx);
 }
@@ -282,14 +283,18 @@ e1000e_intrmgr_delay_rx_causes(E1000ECore *core, uint32_t *causes)
     core->delayed_causes |= *causes & delayable_causes;
     *causes &= ~delayable_causes;
 
-    /* Check if delayed RX interrupts disabled by client
-       or if there are causes that cannot be delayed */
+    /*
+     * Check if delayed RX interrupts are disabled by the client,
+     * or if there are causes that cannot be delayed.
+     */
     if ((rdtr == 0) || (*causes != 0)) {
         return false;
     }
 
-    /* Check if delayed RX ACK interrupts disabled by client
-       and there is an ACK packet received */
+    /*
+     * Check if delayed RX ACK interrupts are disabled by the client
+     * and an ACK packet has been received.
+     */
     if ((raid == 0) && (core->delayed_causes & E1000_ICR_ACK)) {
         return false;
     }
@@ -493,27 +498,27 @@ typedef struct E1000E_RSSInfo_st {
 static uint32_t
 e1000e_rss_get_hash_type(E1000ECore *core, struct NetRxPkt *pkt)
 {
-    bool isip4, isip6, isudp, istcp;
+    bool hasip4, hasip6;
+    EthL4HdrProto l4hdr_proto;
 
     assert(e1000e_rss_enabled(core));
 
-    net_rx_pkt_get_protocols(pkt, &isip4, &isip6, &isudp, &istcp);
-
-    if (isip4) {
-        bool fragment = net_rx_pkt_get_ip4_info(pkt)->fragment;
+    net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto);
 
-        trace_e1000e_rx_rss_ip4(fragment, istcp, core->mac[MRQC],
+    if (hasip4) {
+        trace_e1000e_rx_rss_ip4(l4hdr_proto, core->mac[MRQC],
                                 E1000_MRQC_EN_TCPIPV4(core->mac[MRQC]),
                                 E1000_MRQC_EN_IPV4(core->mac[MRQC]));
 
-        if (!fragment && istcp && E1000_MRQC_EN_TCPIPV4(core->mac[MRQC])) {
+        if (l4hdr_proto == ETH_L4_HDR_PROTO_TCP &&
+            E1000_MRQC_EN_TCPIPV4(core->mac[MRQC])) {
             return E1000_MRQ_RSS_TYPE_IPV4TCP;
         }
 
         if (E1000_MRQC_EN_IPV4(core->mac[MRQC])) {
             return E1000_MRQ_RSS_TYPE_IPV4;
         }
-    } else if (isip6) {
+    } else if (hasip6) {
         eth_ip6_hdr_info *ip6info = net_rx_pkt_get_ip6_info(pkt);
 
         bool ex_dis = core->mac[RFCTL] & E1000_RFCTL_IPV6_EX_DIS;
@@ -527,7 +532,7 @@ e1000e_rss_get_hash_type(E1000ECore *core, struct NetRxPkt *pkt)
          * backends like these.
          */
         trace_e1000e_rx_rss_ip6_rfctl(core->mac[RFCTL]);
-        trace_e1000e_rx_rss_ip6(ex_dis, new_ex_dis, istcp,
+        trace_e1000e_rx_rss_ip6(ex_dis, new_ex_dis, l4hdr_proto,
                                 ip6info->has_ext_hdrs,
                                 ip6info->rss_ex_dst_valid,
                                 ip6info->rss_ex_src_valid,
@@ -540,7 +545,7 @@ e1000e_rss_get_hash_type(E1000ECore *core, struct NetRxPkt *pkt)
             (!new_ex_dis || !(ip6info->rss_ex_dst_valid ||
                               ip6info->rss_ex_src_valid))) {
 
-            if (istcp && !ip6info->fragment &&
+            if (l4hdr_proto == ETH_L4_HDR_PROTO_TCP &&
                 E1000_MRQC_EN_TCPIPV6(core->mac[MRQC])) {
                 return E1000_MRQ_RSS_TYPE_IPV6TCP;
             }
@@ -625,23 +630,39 @@ e1000e_rss_parse_packet(E1000ECore *core,
     info->queue = E1000_RSS_QUEUE(&core->mac[RETA], info->hash);
 }
 
-static void
+static bool
 e1000e_setup_tx_offloads(E1000ECore *core, struct e1000e_tx *tx)
 {
     if (tx->props.tse && tx->cptse) {
-        net_tx_pkt_build_vheader(tx->tx_pkt, true, true, tx->props.mss);
+        if (!net_tx_pkt_build_vheader(tx->tx_pkt, true, true, tx->props.mss)) {
+            return false;
+        }
+
         net_tx_pkt_update_ip_checksums(tx->tx_pkt);
         e1000x_inc_reg_if_not_full(core->mac, TSCTC);
-        return;
+        return true;
     }
 
     if (tx->sum_needed & E1000_TXD_POPTS_TXSM) {
-        net_tx_pkt_build_vheader(tx->tx_pkt, false, true, 0);
+        if (!net_tx_pkt_build_vheader(tx->tx_pkt, false, true, 0)) {
+            return false;
+        }
     }
 
     if (tx->sum_needed & E1000_TXD_POPTS_IXSM) {
         net_tx_pkt_update_ip_hdr_checksum(tx->tx_pkt);
     }
+
+    return true;
+}
+
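+/* Loopback path: feed the built frame (with virtio header) back into RX */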
+static void e1000e_tx_pkt_callback(void *core,
+                                   const struct iovec *iov,
+                                   int iovcnt,
+                                   const struct iovec *virt_iov,
+                                   int virt_iovcnt)
+{
+    e1000e_receive_internal(core, virt_iov, virt_iovcnt, true);
 }
 
 static bool
@@ -650,13 +671,16 @@ e1000e_tx_pkt_send(E1000ECore *core, struct e1000e_tx *tx, int queue_index)
     int target_queue = MIN(core->max_queue_num, queue_index);
     NetClientState *queue = qemu_get_subqueue(core->owner_nic, target_queue);
 
-    e1000e_setup_tx_offloads(core, tx);
+    if (!e1000e_setup_tx_offloads(core, tx)) {
+        return false;
+    }
 
     net_tx_pkt_dump(tx->tx_pkt);
 
-    if ((core->phy[0][PHY_CTRL] & MII_CR_LOOPBACK) ||
+    if ((core->phy[0][MII_BMCR] & MII_BMCR_LOOPBACK) ||
         ((core->mac[RCTL] & E1000_RCTL_LBM_MAC) == E1000_RCTL_LBM_MAC)) {
-        return net_tx_pkt_send_loopback(tx->tx_pkt, queue);
+        return net_tx_pkt_send_custom(tx->tx_pkt, false,
+                                      e1000e_tx_pkt_callback, core);
     } else {
         return net_tx_pkt_send(tx->tx_pkt, queue);
     }
@@ -668,7 +692,7 @@ e1000e_on_tx_done_update_stats(E1000ECore *core, struct NetTxPkt *tx_pkt)
     static const int PTCregs[6] = { PTC64, PTC127, PTC255, PTC511,
                                     PTC1023, PTC1522 };
 
-    size_t tot_len = net_tx_pkt_get_total_len(tx_pkt);
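+    /* Count the 4-byte FCS appended on the wire in the Tx statistics */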
+    size_t tot_len = net_tx_pkt_get_total_len(tx_pkt) + 4;
 
     e1000x_increase_size_stats(core->mac, PTCregs, tot_len);
     e1000x_inc_reg_if_not_full(core->mac, TPT);
@@ -1016,10 +1040,11 @@ e1000e_receive_filter(E1000ECore *core, const uint8_t *buf, int size)
 
     if (e1000x_is_vlan_packet(buf, core->mac[VET]) &&
         e1000x_vlan_rx_filter_enabled(core->mac)) {
-        uint16_t vid = lduw_be_p(buf + 14);
-        uint32_t vfta = ldl_le_p((uint32_t *)(core->mac + VFTA) +
-                                 ((vid >> 5) & 0x7f));
-        if ((vfta & (1 << (vid & 0x1f))) == 0) {
+        uint16_t vid = lduw_be_p(&PKT_GET_VLAN_HDR(buf)->h_tci);
+        uint32_t vfta =
+            ldl_le_p((uint32_t *)(core->mac + VFTA) +
+                     ((vid >> E1000_VFTA_ENTRY_SHIFT) & E1000_VFTA_ENTRY_MASK));
+        if ((vfta & (1 << (vid & E1000_VFTA_ENTRY_BIT_SHIFT_MASK))) == 0) {
             trace_e1000e_rx_flt_vlan_mismatch(vid);
             return false;
         } else {
@@ -1054,48 +1079,47 @@ e1000e_receive_filter(E1000ECore *core, const uint8_t *buf, int size)
 }
 
 static inline void
-e1000e_read_lgcy_rx_descr(E1000ECore *core, uint8_t *desc, hwaddr *buff_addr)
+e1000e_read_lgcy_rx_descr(E1000ECore *core, struct e1000_rx_desc *desc,
+                          hwaddr *buff_addr)
 {
-    struct e1000_rx_desc *d = (struct e1000_rx_desc *) desc;
-    *buff_addr = le64_to_cpu(d->buffer_addr);
+    *buff_addr = le64_to_cpu(desc->buffer_addr);
 }
 
 static inline void
-e1000e_read_ext_rx_descr(E1000ECore *core, uint8_t *desc, hwaddr *buff_addr)
+e1000e_read_ext_rx_descr(E1000ECore *core, union e1000_rx_desc_extended *desc,
+                         hwaddr *buff_addr)
 {
-    union e1000_rx_desc_extended *d = (union e1000_rx_desc_extended *) desc;
-    *buff_addr = le64_to_cpu(d->read.buffer_addr);
+    *buff_addr = le64_to_cpu(desc->read.buffer_addr);
 }
 
 static inline void
-e1000e_read_ps_rx_descr(E1000ECore *core, uint8_t *desc,
-                        hwaddr (*buff_addr)[MAX_PS_BUFFERS])
+e1000e_read_ps_rx_descr(E1000ECore *core,
+                        union e1000_rx_desc_packet_split *desc,
+                        hwaddr buff_addr[MAX_PS_BUFFERS])
 {
     int i;
-    union e1000_rx_desc_packet_split *d =
-        (union e1000_rx_desc_packet_split *) desc;
 
     for (i = 0; i < MAX_PS_BUFFERS; i++) {
-        (*buff_addr)[i] = le64_to_cpu(d->read.buffer_addr[i]);
+        buff_addr[i] = le64_to_cpu(desc->read.buffer_addr[i]);
     }
 
-    trace_e1000e_rx_desc_ps_read((*buff_addr)[0], (*buff_addr)[1],
-                                 (*buff_addr)[2], (*buff_addr)[3]);
+    trace_e1000e_rx_desc_ps_read(buff_addr[0], buff_addr[1],
+                                 buff_addr[2], buff_addr[3]);
 }
 
 static inline void
-e1000e_read_rx_descr(E1000ECore *core, uint8_t *desc,
-                     hwaddr (*buff_addr)[MAX_PS_BUFFERS])
+e1000e_read_rx_descr(E1000ECore *core, union e1000_rx_desc_union *desc,
+                     hwaddr buff_addr[MAX_PS_BUFFERS])
 {
     if (e1000e_rx_use_legacy_descriptor(core)) {
-        e1000e_read_lgcy_rx_descr(core, desc, &(*buff_addr)[0]);
-        (*buff_addr)[1] = (*buff_addr)[2] = (*buff_addr)[3] = 0;
+        e1000e_read_lgcy_rx_descr(core, &desc->legacy, &buff_addr[0]);
+        buff_addr[1] = buff_addr[2] = buff_addr[3] = 0;
     } else {
         if (core->mac[RCTL] & E1000_RCTL_DTYP_PS) {
-            e1000e_read_ps_rx_descr(core, desc, buff_addr);
+            e1000e_read_ps_rx_descr(core, &desc->packet_split, buff_addr);
         } else {
-            e1000e_read_ext_rx_descr(core, desc, &(*buff_addr)[0]);
-            (*buff_addr)[1] = (*buff_addr)[2] = (*buff_addr)[3] = 0;
+            e1000e_read_ext_rx_descr(core, &desc->extended, &buff_addr[0]);
+            buff_addr[1] = buff_addr[2] = buff_addr[3] = 0;
         }
     }
 }
@@ -1104,7 +1128,7 @@ static void
 e1000e_verify_csum_in_sw(E1000ECore *core,
                          struct NetRxPkt *pkt,
                          uint32_t *status_flags,
-                         bool istcp, bool isudp)
+                         EthL4HdrProto l4hdr_proto)
 {
     bool csum_valid;
     uint32_t csum_error;
@@ -1131,14 +1155,10 @@ e1000e_verify_csum_in_sw(E1000ECore *core,
     }
 
     csum_error = csum_valid ? 0 : E1000_RXDEXT_STATERR_TCPE;
+    *status_flags |= E1000_RXD_STAT_TCPCS | csum_error;
 
-    if (istcp) {
-        *status_flags |= E1000_RXD_STAT_TCPCS |
-                         csum_error;
-    } else if (isudp) {
-        *status_flags |= E1000_RXD_STAT_TCPCS |
-                         E1000_RXD_STAT_UDPCS |
-                         csum_error;
+    if (l4hdr_proto == ETH_L4_HDR_PROTO_UDP) {
+        *status_flags |= E1000_RXD_STAT_UDPCS;
     }
 }
 
@@ -1167,7 +1187,8 @@ e1000e_build_rx_metadata(E1000ECore *core,
                          uint16_t *vlan_tag)
 {
     struct virtio_net_hdr *vhdr;
-    bool isip4, isip6, istcp, isudp;
+    bool hasip4, hasip6;
+    EthL4HdrProto l4hdr_proto;
     uint32_t pkt_type;
 
     *status_flags = E1000_RXD_STAT_DD;
@@ -1179,8 +1200,8 @@ e1000e_build_rx_metadata(E1000ECore *core,
 
     *status_flags |= E1000_RXD_STAT_EOP;
 
-    net_rx_pkt_get_protocols(pkt, &isip4, &isip6, &isudp, &istcp);
-    trace_e1000e_rx_metadata_protocols(isip4, isip6, isudp, istcp);
+    net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto);
+    trace_e1000e_rx_metadata_protocols(hasip4, hasip6, l4hdr_proto);
 
     /* VLAN state */
     if (net_rx_pkt_is_vlan_stripped(pkt)) {
@@ -1196,24 +1217,25 @@ e1000e_build_rx_metadata(E1000ECore *core,
             *mrq = cpu_to_le32(rss_info->type | (rss_info->queue << 8));
             trace_e1000e_rx_metadata_rss(*rss, *mrq);
         }
-    } else if (isip4) {
+    } else if (hasip4) {
             *status_flags |= E1000_RXD_STAT_IPIDV;
             *ip_id = cpu_to_le16(net_rx_pkt_get_ip_id(pkt));
             trace_e1000e_rx_metadata_ip_id(*ip_id);
     }
 
-    if (istcp && e1000e_is_tcp_ack(core, pkt)) {
+    if (l4hdr_proto == ETH_L4_HDR_PROTO_TCP && e1000e_is_tcp_ack(core, pkt)) {
         *status_flags |= E1000_RXD_STAT_ACK;
         trace_e1000e_rx_metadata_ack();
     }
 
-    if (isip6 && (core->mac[RFCTL] & E1000_RFCTL_IPV6_DIS)) {
+    if (hasip6 && (core->mac[RFCTL] & E1000_RFCTL_IPV6_DIS)) {
         trace_e1000e_rx_metadata_ipv6_filtering_disabled();
         pkt_type = E1000_RXD_PKT_MAC;
-    } else if (istcp || isudp) {
-        pkt_type = isip4 ? E1000_RXD_PKT_IP4_XDP : E1000_RXD_PKT_IP6_XDP;
-    } else if (isip4 || isip6) {
-        pkt_type = isip4 ? E1000_RXD_PKT_IP4 : E1000_RXD_PKT_IP6;
+    } else if (l4hdr_proto == ETH_L4_HDR_PROTO_TCP ||
+               l4hdr_proto == ETH_L4_HDR_PROTO_UDP) {
+        pkt_type = hasip4 ? E1000_RXD_PKT_IP4_XDP : E1000_RXD_PKT_IP6_XDP;
+    } else if (hasip4 || hasip6) {
+        pkt_type = hasip4 ? E1000_RXD_PKT_IP4 : E1000_RXD_PKT_IP6;
     } else {
         pkt_type = E1000_RXD_PKT_MAC;
     }
@@ -1222,37 +1244,38 @@ e1000e_build_rx_metadata(E1000ECore *core,
     trace_e1000e_rx_metadata_pkt_type(pkt_type);
 
     /* RX CSO information */
-    if (isip6 && (core->mac[RFCTL] & E1000_RFCTL_IPV6_XSUM_DIS)) {
+    if (hasip6 && (core->mac[RFCTL] & E1000_RFCTL_IPV6_XSUM_DIS)) {
         trace_e1000e_rx_metadata_ipv6_sum_disabled();
         goto func_exit;
     }
 
-    if (!net_rx_pkt_has_virt_hdr(pkt)) {
-        trace_e1000e_rx_metadata_no_virthdr();
-        e1000e_verify_csum_in_sw(core, pkt, status_flags, istcp, isudp);
-        goto func_exit;
-    }
-
     vhdr = net_rx_pkt_get_vhdr(pkt);
 
     if (!(vhdr->flags & VIRTIO_NET_HDR_F_DATA_VALID) &&
         !(vhdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) {
         trace_e1000e_rx_metadata_virthdr_no_csum_info();
-        e1000e_verify_csum_in_sw(core, pkt, status_flags, istcp, isudp);
+        e1000e_verify_csum_in_sw(core, pkt, status_flags, l4hdr_proto);
         goto func_exit;
     }
 
     if (e1000e_rx_l3_cso_enabled(core)) {
-        *status_flags |= isip4 ? E1000_RXD_STAT_IPCS : 0;
+        *status_flags |= hasip4 ? E1000_RXD_STAT_IPCS : 0;
     } else {
         trace_e1000e_rx_metadata_l3_cso_disabled();
     }
 
     if (e1000e_rx_l4_cso_enabled(core)) {
-        if (istcp) {
+        switch (l4hdr_proto) {
+        case ETH_L4_HDR_PROTO_TCP:
             *status_flags |= E1000_RXD_STAT_TCPCS;
-        } else if (isudp) {
+            break;
+
+        case ETH_L4_HDR_PROTO_UDP:
             *status_flags |= E1000_RXD_STAT_TCPCS | E1000_RXD_STAT_UDPCS;
+            break;
+
+        default:
+            break;
         }
     } else {
         trace_e1000e_rx_metadata_l4_cso_disabled();
@@ -1265,7 +1288,7 @@ func_exit:
 }
 
 static inline void
-e1000e_write_lgcy_rx_descr(E1000ECore *core, uint8_t *desc,
+e1000e_write_lgcy_rx_descr(E1000ECore *core, struct e1000_rx_desc *desc,
                            struct NetRxPkt *pkt,
                            const E1000E_RSSInfo *rss_info,
                            uint16_t length)
@@ -1273,71 +1296,66 @@ e1000e_write_lgcy_rx_descr(E1000ECore *core, uint8_t *desc,
     uint32_t status_flags, rss, mrq;
     uint16_t ip_id;
 
-    struct e1000_rx_desc *d = (struct e1000_rx_desc *) desc;
-
     assert(!rss_info->enabled);
 
-    d->length = cpu_to_le16(length);
-    d->csum = 0;
+    desc->length = cpu_to_le16(length);
+    desc->csum = 0;
 
     e1000e_build_rx_metadata(core, pkt, pkt != NULL,
                              rss_info,
                              &rss, &mrq,
                              &status_flags, &ip_id,
-                             &d->special);
-    d->errors = (uint8_t) (le32_to_cpu(status_flags) >> 24);
-    d->status = (uint8_t) le32_to_cpu(status_flags);
+                             &desc->special);
+    desc->errors = (uint8_t) (le32_to_cpu(status_flags) >> 24);
+    desc->status = (uint8_t) le32_to_cpu(status_flags);
 }
 
 static inline void
-e1000e_write_ext_rx_descr(E1000ECore *core, uint8_t *desc,
+e1000e_write_ext_rx_descr(E1000ECore *core, union e1000_rx_desc_extended *desc,
                           struct NetRxPkt *pkt,
                           const E1000E_RSSInfo *rss_info,
                           uint16_t length)
 {
-    union e1000_rx_desc_extended *d = (union e1000_rx_desc_extended *) desc;
-
-    memset(&d->wb, 0, sizeof(d->wb));
+    memset(&desc->wb, 0, sizeof(desc->wb));
 
-    d->wb.upper.length = cpu_to_le16(length);
+    desc->wb.upper.length = cpu_to_le16(length);
 
     e1000e_build_rx_metadata(core, pkt, pkt != NULL,
                              rss_info,
-                             &d->wb.lower.hi_dword.rss,
-                             &d->wb.lower.mrq,
-                             &d->wb.upper.status_error,
-                             &d->wb.lower.hi_dword.csum_ip.ip_id,
-                             &d->wb.upper.vlan);
+                             &desc->wb.lower.hi_dword.rss,
+                             &desc->wb.lower.mrq,
+                             &desc->wb.upper.status_error,
+                             &desc->wb.lower.hi_dword.csum_ip.ip_id,
+                             &desc->wb.upper.vlan);
 }
 
 static inline void
-e1000e_write_ps_rx_descr(E1000ECore *core, uint8_t *desc,
+e1000e_write_ps_rx_descr(E1000ECore *core,
+                         union e1000_rx_desc_packet_split *desc,
                          struct NetRxPkt *pkt,
                          const E1000E_RSSInfo *rss_info,
                          size_t ps_hdr_len,
                          uint16_t(*written)[MAX_PS_BUFFERS])
 {
     int i;
-    union e1000_rx_desc_packet_split *d =
-        (union e1000_rx_desc_packet_split *) desc;
 
-    memset(&d->wb, 0, sizeof(d->wb));
+    memset(&desc->wb, 0, sizeof(desc->wb));
 
-    d->wb.middle.length0 = cpu_to_le16((*written)[0]);
+    desc->wb.middle.length0 = cpu_to_le16((*written)[0]);
 
     for (i = 0; i < PS_PAGE_BUFFERS; i++) {
-        d->wb.upper.length[i] = cpu_to_le16((*written)[i + 1]);
+        desc->wb.upper.length[i] = cpu_to_le16((*written)[i + 1]);
     }
 
     e1000e_build_rx_metadata(core, pkt, pkt != NULL,
                              rss_info,
-                             &d->wb.lower.hi_dword.rss,
-                             &d->wb.lower.mrq,
-                             &d->wb.middle.status_error,
-                             &d->wb.lower.hi_dword.csum_ip.ip_id,
-                             &d->wb.middle.vlan);
+                             &desc->wb.lower.hi_dword.rss,
+                             &desc->wb.lower.mrq,
+                             &desc->wb.middle.status_error,
+                             &desc->wb.lower.hi_dword.csum_ip.ip_id,
+                             &desc->wb.middle.vlan);
 
-    d->wb.upper.header_status =
+    desc->wb.upper.header_status =
         cpu_to_le16(ps_hdr_len | (ps_hdr_len ? E1000_RXDPS_HDRSTAT_HDRSP : 0));
 
     trace_e1000e_rx_desc_ps_write((*written)[0], (*written)[1],
@@ -1345,20 +1363,21 @@ e1000e_write_ps_rx_descr(E1000ECore *core, uint8_t *desc,
 }
 
 static inline void
-e1000e_write_rx_descr(E1000ECore *core, uint8_t *desc,
+e1000e_write_rx_descr(E1000ECore *core, union e1000_rx_desc_union *desc,
 struct NetRxPkt *pkt, const E1000E_RSSInfo *rss_info,
     size_t ps_hdr_len, uint16_t(*written)[MAX_PS_BUFFERS])
 {
     if (e1000e_rx_use_legacy_descriptor(core)) {
         assert(ps_hdr_len == 0);
-        e1000e_write_lgcy_rx_descr(core, desc, pkt, rss_info, (*written)[0]);
+        e1000e_write_lgcy_rx_descr(core, &desc->legacy, pkt, rss_info,
+                                   (*written)[0]);
     } else {
         if (core->mac[RCTL] & E1000_RCTL_DTYP_PS) {
-            e1000e_write_ps_rx_descr(core, desc, pkt, rss_info,
+            e1000e_write_ps_rx_descr(core, &desc->packet_split, pkt, rss_info,
                                       ps_hdr_len, written);
         } else {
             assert(ps_hdr_len == 0);
-            e1000e_write_ext_rx_descr(core, desc, pkt, rss_info,
+            e1000e_write_ext_rx_descr(core, &desc->extended, pkt, rss_info,
                                        (*written)[0]);
         }
     }
@@ -1366,12 +1385,12 @@ struct NetRxPkt *pkt, const E1000E_RSSInfo *rss_info,
 
 static inline void
 e1000e_pci_dma_write_rx_desc(E1000ECore *core, dma_addr_t addr,
-                             uint8_t *desc, dma_addr_t len)
+                             union e1000_rx_desc_union *desc, dma_addr_t len)
 {
     PCIDevice *dev = core->owner;
 
     if (e1000e_rx_use_legacy_descriptor(core)) {
-        struct e1000_rx_desc *d = (struct e1000_rx_desc *) desc;
+        struct e1000_rx_desc *d = &desc->legacy;
         size_t offset = offsetof(struct e1000_rx_desc, status);
         uint8_t status = d->status;
 
@@ -1384,8 +1403,7 @@ e1000e_pci_dma_write_rx_desc(E1000ECore *core, dma_addr_t addr,
         }
     } else {
         if (core->mac[RCTL] & E1000_RCTL_DTYP_PS) {
-            union e1000_rx_desc_packet_split *d =
-                (union e1000_rx_desc_packet_split *) desc;
+            union e1000_rx_desc_packet_split *d = &desc->packet_split;
             size_t offset = offsetof(union e1000_rx_desc_packet_split,
                 wb.middle.status_error);
             uint32_t status = d->wb.middle.status_error;
@@ -1398,8 +1416,7 @@ e1000e_pci_dma_write_rx_desc(E1000ECore *core, dma_addr_t addr,
                 pci_dma_write(dev, addr + offset, &status, sizeof(status));
             }
         } else {
-            union e1000_rx_desc_extended *d =
-                (union e1000_rx_desc_extended *) desc;
+            union e1000_rx_desc_extended *d = &desc->extended;
             size_t offset = offsetof(union e1000_rx_desc_extended,
                 wb.upper.status_error);
             uint32_t status = d->wb.upper.status_error;
@@ -1422,14 +1439,14 @@ typedef struct e1000e_ba_state_st {
 
 static inline void
 e1000e_write_hdr_to_rx_buffers(E1000ECore *core,
-                               hwaddr (*ba)[MAX_PS_BUFFERS],
+                               hwaddr ba[MAX_PS_BUFFERS],
                                e1000e_ba_state *bastate,
                                const char *data,
                                dma_addr_t data_len)
 {
     assert(data_len <= core->rxbuf_sizes[0] - bastate->written[0]);
 
-    pci_dma_write(core->owner, (*ba)[0] + bastate->written[0], data, data_len);
+    pci_dma_write(core->owner, ba[0] + bastate->written[0], data, data_len);
     bastate->written[0] += data_len;
 
     bastate->cur_idx = 1;
@@ -1437,7 +1454,7 @@ e1000e_write_hdr_to_rx_buffers(E1000ECore *core,
 
 static void
 e1000e_write_to_rx_buffers(E1000ECore *core,
-                           hwaddr (*ba)[MAX_PS_BUFFERS],
+                           hwaddr ba[MAX_PS_BUFFERS],
                            e1000e_ba_state *bastate,
                            const char *data,
                            dma_addr_t data_len)
@@ -1449,13 +1466,13 @@ e1000e_write_to_rx_buffers(E1000ECore *core,
         uint32_t bytes_to_write = MIN(data_len, cur_buf_bytes_left);
 
         trace_e1000e_rx_desc_buff_write(bastate->cur_idx,
-                                        (*ba)[bastate->cur_idx],
+                                        ba[bastate->cur_idx],
                                         bastate->written[bastate->cur_idx],
                                         data,
                                         bytes_to_write);
 
         pci_dma_write(core->owner,
-            (*ba)[bastate->cur_idx] + bastate->written[bastate->cur_idx],
+            ba[bastate->cur_idx] + bastate->written[bastate->cur_idx],
             data, bytes_to_write);
 
         bastate->written[bastate->cur_idx] += bytes_to_write;
@@ -1501,18 +1518,19 @@ e1000e_rx_descr_threshold_hit(E1000ECore *core, const E1000E_RingInfo *rxi)
 static bool
 e1000e_do_ps(E1000ECore *core, struct NetRxPkt *pkt, size_t *hdr_len)
 {
-    bool isip4, isip6, isudp, istcp;
+    bool hasip4, hasip6;
+    EthL4HdrProto l4hdr_proto;
     bool fragment;
 
     if (!e1000e_rx_use_ps_descriptor(core)) {
         return false;
     }
 
-    net_rx_pkt_get_protocols(pkt, &isip4, &isip6, &isudp, &istcp);
+    net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto);
 
-    if (isip4) {
+    if (hasip4) {
         fragment = net_rx_pkt_get_ip4_info(pkt)->fragment;
-    } else if (isip6) {
+    } else if (hasip6) {
         fragment = net_rx_pkt_get_ip6_info(pkt)->fragment;
     } else {
         return false;
@@ -1522,7 +1540,8 @@ e1000e_do_ps(E1000ECore *core, struct NetRxPkt *pkt, size_t *hdr_len)
         return false;
     }
 
-    if (!fragment && (isudp || istcp)) {
+    if (l4hdr_proto == ETH_L4_HDR_PROTO_TCP ||
+        l4hdr_proto == ETH_L4_HDR_PROTO_UDP) {
         *hdr_len = net_rx_pkt_get_l5_hdr_offset(pkt);
     } else {
         *hdr_len = net_rx_pkt_get_l4_hdr_offset(pkt);
@@ -1543,7 +1562,7 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt,
 {
     PCIDevice *d = core->owner;
     dma_addr_t base;
-    uint8_t desc[E1000_MAX_RX_DESC_LEN];
+    union e1000_rx_desc_union desc;
     size_t desc_size;
     size_t desc_offset = 0;
     size_t iov_ofs = 0;
@@ -1579,7 +1598,7 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt,
 
         trace_e1000e_rx_descr(rxi->idx, base, core->rx_desc_len);
 
-        e1000e_read_rx_descr(core, desc, &ba);
+        e1000e_read_rx_descr(core, &desc, ba);
 
         if (ba[0]) {
             if (desc_offset < size) {
@@ -1598,7 +1617,7 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt,
                             iov_copy = MIN(ps_hdr_len - ps_hdr_copied,
                                            iov->iov_len - iov_ofs);
 
-                            e1000e_write_hdr_to_rx_buffers(core, &ba, &bastate,
+                            e1000e_write_hdr_to_rx_buffers(core, ba, &bastate,
                                                       iov->iov_base, iov_copy);
 
                             copy_size -= iov_copy;
@@ -1615,7 +1634,7 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt,
                     } else {
                         /* Leave buffer 0 of each descriptor except first */
                         /* empty as per spec 7.1.5.1                      */
-                        e1000e_write_hdr_to_rx_buffers(core, &ba, &bastate,
+                        e1000e_write_hdr_to_rx_buffers(core, ba, &bastate,
                                                        NULL, 0);
                     }
                 }
@@ -1624,7 +1643,7 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt,
                 while (copy_size) {
                     iov_copy = MIN(copy_size, iov->iov_len - iov_ofs);
 
-                    e1000e_write_to_rx_buffers(core, &ba, &bastate,
+                    e1000e_write_to_rx_buffers(core, ba, &bastate,
                                             iov->iov_base + iov_ofs, iov_copy);
 
                     copy_size -= iov_copy;
@@ -1637,7 +1656,7 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt,
 
                 if (desc_offset + desc_size >= total_size) {
                     /* Simulate FCS checksum presence in the last descriptor */
-                    e1000e_write_to_rx_buffers(core, &ba, &bastate,
+                    e1000e_write_to_rx_buffers(core, ba, &bastate,
                           (const char *) &fcs_pad, e1000x_fcs_len(core->mac));
                 }
             }
@@ -1649,9 +1668,9 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt,
             is_last = true;
         }
 
-        e1000e_write_rx_descr(core, desc, is_last ? core->rx_pkt : NULL,
+        e1000e_write_rx_descr(core, &desc, is_last ? core->rx_pkt : NULL,
                            rss_info, do_ps ? ps_hdr_len : 0, &bastate.written);
-        e1000e_pci_dma_write_rx_desc(core, base, desc, core->rx_desc_len);
+        e1000e_pci_dma_write_rx_desc(core, base, &desc, core->rx_desc_len);
 
         e1000e_ring_advance(core, rxi,
                             core->rx_desc_len / E1000_MIN_RX_DESC_LEN);
@@ -1664,25 +1683,27 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt,
 static inline void
 e1000e_rx_fix_l4_csum(E1000ECore *core, struct NetRxPkt *pkt)
 {
-    if (net_rx_pkt_has_virt_hdr(pkt)) {
-        struct virtio_net_hdr *vhdr = net_rx_pkt_get_vhdr(pkt);
+    struct virtio_net_hdr *vhdr = net_rx_pkt_get_vhdr(pkt);
 
-        if (vhdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
-            net_rx_pkt_fix_l4_csum(pkt);
-        }
+    if (vhdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
+        net_rx_pkt_fix_l4_csum(pkt);
     }
 }
 
-/* Min. octets in an ethernet frame sans FCS */
-#define MIN_BUF_SIZE 60
-
 ssize_t
 e1000e_receive_iov(E1000ECore *core, const struct iovec *iov, int iovcnt)
 {
-    static const int maximum_ethernet_hdr_len = (14 + 4);
+    return e1000e_receive_internal(core, iov, iovcnt, core->has_vnet);
+}
+
+static ssize_t
+e1000e_receive_internal(E1000ECore *core, const struct iovec *iov, int iovcnt,
+                        bool has_vnet)
+{
+    static const int maximum_ethernet_hdr_len = (ETH_HLEN + 4);
 
     uint32_t n = 0;
-    uint8_t min_buf[MIN_BUF_SIZE];
+    uint8_t min_buf[ETH_ZLEN];
     struct iovec min_iov;
     uint8_t *filter_buf;
     size_t size, orig_size;
@@ -1700,9 +1721,11 @@ e1000e_receive_iov(E1000ECore *core, const struct iovec *iov, int iovcnt)
     }
 
     /* Pull virtio header in */
-    if (core->has_vnet) {
+    if (has_vnet) {
         net_rx_pkt_set_vhdr_iovec(core->rx_pkt, iov, iovcnt);
         iov_ofs = sizeof(struct virtio_net_hdr);
+    } else {
+        net_rx_pkt_unset_vhdr(core->rx_pkt);
     }
 
     filter_buf = iov->iov_base + iov_ofs;
@@ -1744,8 +1767,6 @@ e1000e_receive_iov(E1000ECore *core, const struct iovec *iov, int iovcnt)
     e1000e_rss_parse_packet(core, core->rx_pkt, &rss_info);
     e1000e_rx_ring_init(core, &rxr, rss_info.queue);
 
-    trace_e1000e_rx_rss_dispatched_to_queue(rxr.i->idx);
-
     total_size = net_rx_pkt_get_total_len(core->rx_pkt) +
         e1000x_fcs_len(core->mac);
 
@@ -1771,12 +1792,12 @@ e1000e_receive_iov(E1000ECore *core, const struct iovec *iov, int iovcnt)
         rdmts_hit = e1000e_rx_descr_threshold_hit(core, rxr.i);
         n |= e1000e_rx_wb_interrupt_cause(core, rxr.i->idx, rdmts_hit);
 
-        trace_e1000e_rx_written_to_guest(n);
+        trace_e1000e_rx_written_to_guest(rxr.i->idx);
     } else {
         n |= E1000_ICS_RXO;
         retval = 0;
 
-        trace_e1000e_rx_not_written_to_guest(n);
+        trace_e1000e_rx_not_written_to_guest(rxr.i->idx);
     }
 
     if (!e1000e_intrmgr_delay_rx_causes(core, &n)) {
@@ -1792,13 +1813,13 @@ e1000e_receive_iov(E1000ECore *core, const struct iovec *iov, int iovcnt)
 static inline bool
 e1000e_have_autoneg(E1000ECore *core)
 {
-    return core->phy[0][PHY_CTRL] & MII_CR_AUTO_NEG_EN;
+    return core->phy[0][MII_BMCR] & MII_BMCR_AUTOEN;
 }
 
 static void e1000e_update_flowctl_status(E1000ECore *core)
 {
     if (e1000e_have_autoneg(core) &&
-        core->phy[0][PHY_STATUS] & MII_SR_AUTONEG_COMPLETE) {
+        core->phy[0][MII_BMSR] & MII_BMSR_AN_COMP) {
         trace_e1000e_link_autoneg_flowctl(true);
         core->mac[CTRL] |= E1000_CTRL_TFCE | E1000_CTRL_RFCE;
     } else {
@@ -1816,12 +1837,12 @@ e1000e_link_down(E1000ECore *core)
 static inline void
 e1000e_set_phy_ctrl(E1000ECore *core, int index, uint16_t val)
 {
-    /* bits 0-5 reserved; MII_CR_[RESTART_AUTO_NEG,RESET] are self clearing */
-    core->phy[0][PHY_CTRL] = val & ~(0x3f |
-                                     MII_CR_RESET |
-                                     MII_CR_RESTART_AUTO_NEG);
+    /* bits 0-5 reserved; MII_BMCR_[ANRESTART,RESET] are self-clearing */
+    core->phy[0][MII_BMCR] = val & ~(0x3f |
+                                     MII_BMCR_RESET |
+                                     MII_BMCR_ANRESTART);
 
-    if ((val & MII_CR_RESTART_AUTO_NEG) &&
+    if ((val & MII_BMCR_ANRESTART) &&
         e1000e_have_autoneg(core)) {
         e1000x_restart_autoneg(core->mac, core->phy[0], core->autoneg_timer);
     }
@@ -1855,7 +1876,7 @@ e1000e_core_set_link_status(E1000ECore *core)
         e1000x_update_regs_on_link_down(core->mac, core->phy[0]);
     } else {
         if (e1000e_have_autoneg(core) &&
-            !(core->phy[0][PHY_STATUS] & MII_SR_AUTONEG_COMPLETE)) {
+            !(core->phy[0][MII_BMSR] & MII_BMSR_AN_COMP)) {
             e1000x_restart_autoneg(core->mac, core->phy[0],
                                    core->autoneg_timer);
         } else {
@@ -1888,7 +1909,7 @@ e1000e_set_ctrl(E1000ECore *core, int index, uint32_t val)
 
     if (val & E1000_CTRL_RST) {
         trace_e1000e_core_ctrl_sw_reset();
-        e1000x_reset_mac_addr(core->owner_nic, core->mac, core->permanent_mac);
+        e1000e_reset(core, true);
     }
 
     if (val & E1000_CTRL_PHY_RST) {
@@ -1997,7 +2018,7 @@ static
 void(*e1000e_phyreg_writeops[E1000E_PHY_PAGES][E1000E_PHY_PAGE_SIZE])
 (E1000ECore *, int, uint16_t) = {
     [0] = {
-        [PHY_CTRL]     = e1000e_set_phy_ctrl,
+        [MII_BMCR]     = e1000e_set_phy_ctrl,
         [PHY_PAGE]     = e1000e_set_phy_page,
         [PHY_OEM_BITS] = e1000e_set_phy_oem_bits
     }
@@ -2011,13 +2032,11 @@ e1000e_clear_ims_bits(E1000ECore *core, uint32_t bits)
 }
 
 static inline bool
-e1000e_postpone_interrupt(bool *interrupt_pending,
-                           E1000IntrDelayTimer *timer)
+e1000e_postpone_interrupt(E1000IntrDelayTimer *timer)
 {
     if (timer->running) {
         trace_e1000e_irq_postponed_by_xitr(timer->delay_reg << 2);
 
-        *interrupt_pending = true;
         return true;
     }
 
@@ -2031,14 +2050,13 @@ e1000e_postpone_interrupt(bool *interrupt_pending,
 static inline bool
 e1000e_itr_should_postpone(E1000ECore *core)
 {
-    return e1000e_postpone_interrupt(&core->itr_intr_pending, &core->itr);
+    return e1000e_postpone_interrupt(&core->itr);
 }
 
 static inline bool
 e1000e_eitr_should_postpone(E1000ECore *core, int idx)
 {
-    return e1000e_postpone_interrupt(&core->eitr_intr_pending[idx],
-                                     &core->eitr[idx]);
+    return e1000e_postpone_interrupt(&core->eitr[idx]);
 }
 
 static void
@@ -2269,19 +2287,19 @@ e1000e_get_reg_index_with_offset(const uint16_t *mac_reg_access, hwaddr addr)
 
 static const char e1000e_phy_regcap[E1000E_PHY_PAGES][0x20] = {
     [0] = {
-        [PHY_CTRL]          = PHY_ANYPAGE | PHY_RW,
-        [PHY_STATUS]        = PHY_ANYPAGE | PHY_R,
-        [PHY_ID1]           = PHY_ANYPAGE | PHY_R,
-        [PHY_ID2]           = PHY_ANYPAGE | PHY_R,
-        [PHY_AUTONEG_ADV]   = PHY_ANYPAGE | PHY_RW,
-        [PHY_LP_ABILITY]    = PHY_ANYPAGE | PHY_R,
-        [PHY_AUTONEG_EXP]   = PHY_ANYPAGE | PHY_R,
-        [PHY_NEXT_PAGE_TX]  = PHY_ANYPAGE | PHY_RW,
-        [PHY_LP_NEXT_PAGE]  = PHY_ANYPAGE | PHY_R,
-        [PHY_1000T_CTRL]    = PHY_ANYPAGE | PHY_RW,
-        [PHY_1000T_STATUS]  = PHY_ANYPAGE | PHY_R,
-        [PHY_EXT_STATUS]    = PHY_ANYPAGE | PHY_R,
-        [PHY_PAGE]          = PHY_ANYPAGE | PHY_RW,
+        [MII_BMCR]              = PHY_ANYPAGE | PHY_RW,
+        [MII_BMSR]              = PHY_ANYPAGE | PHY_R,
+        [MII_PHYID1]            = PHY_ANYPAGE | PHY_R,
+        [MII_PHYID2]            = PHY_ANYPAGE | PHY_R,
+        [MII_ANAR]              = PHY_ANYPAGE | PHY_RW,
+        [MII_ANLPAR]            = PHY_ANYPAGE | PHY_R,
+        [MII_ANER]              = PHY_ANYPAGE | PHY_R,
+        [MII_ANNP]              = PHY_ANYPAGE | PHY_RW,
+        [MII_ANLPRNP]           = PHY_ANYPAGE | PHY_R,
+        [MII_CTRL1000]          = PHY_ANYPAGE | PHY_RW,
+        [MII_STAT1000]          = PHY_ANYPAGE | PHY_R,
+        [MII_EXTSTAT]           = PHY_ANYPAGE | PHY_R,
+        [PHY_PAGE]              = PHY_ANYPAGE | PHY_RW,
 
         [PHY_COPPER_CTRL1]      = PHY_RW,
         [PHY_COPPER_STAT1]      = PHY_R,
@@ -2434,17 +2452,19 @@ e1000e_set_fcrtl(E1000ECore *core, int index, uint32_t val)
     core->mac[FCRTL] = val & 0x8000FFF8;
 }
 
-static inline void
-e1000e_set_16bit(E1000ECore *core, int index, uint32_t val)
-{
-    core->mac[index] = val & 0xffff;
-}
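+/* Define e1000e_set_<num>bit() setters that keep only the low num bits */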
+#define E1000E_LOW_BITS_SET_FUNC(num)                                \
+    static void                                                      \
+    e1000e_set_##num##bit(E1000ECore *core, int index, uint32_t val) \
+    {                                                                \
+        core->mac[index] = val & (BIT(num) - 1);                     \
+    }
 
-static void
-e1000e_set_12bit(E1000ECore *core, int index, uint32_t val)
-{
-    core->mac[index] = val & 0xfff;
-}
+E1000E_LOW_BITS_SET_FUNC(4)
+E1000E_LOW_BITS_SET_FUNC(6)
+E1000E_LOW_BITS_SET_FUNC(11)
+E1000E_LOW_BITS_SET_FUNC(12)
+E1000E_LOW_BITS_SET_FUNC(13)
+E1000E_LOW_BITS_SET_FUNC(16)
 
 static void
 e1000e_set_vet(E1000ECore *core, int index, uint32_t val)
@@ -2515,7 +2535,8 @@ e1000e_set_icr(E1000ECore *core, int index, uint32_t val)
     }
 
     icr = core->mac[ICR] & ~val;
-    /* Windows driver expects that the "receive overrun" bit and other
+    /*
+     * The Windows driver expects the "receive overrun" bit and other
      * ones to be cleared when the "Other" bit (#24) is cleared.
      */
     icr = (val & E1000_ICR_OTHER) ? (icr & ~E1000_ICR_OTHER_CAUSES) : icr;
@@ -2614,27 +2635,11 @@ e1000e_mac_ims_read(E1000ECore *core, int index)
     return core->mac[IMS];
 }
 
-#define E1000E_LOW_BITS_READ_FUNC(num)                      \
-    static uint32_t                                         \
-    e1000e_mac_low##num##_read(E1000ECore *core, int index) \
-    {                                                       \
-        return core->mac[index] & (BIT(num) - 1);           \
-    }                                                       \
-
-#define E1000E_LOW_BITS_READ(num)                           \
-    e1000e_mac_low##num##_read
-
-E1000E_LOW_BITS_READ_FUNC(4);
-E1000E_LOW_BITS_READ_FUNC(6);
-E1000E_LOW_BITS_READ_FUNC(11);
-E1000E_LOW_BITS_READ_FUNC(13);
-E1000E_LOW_BITS_READ_FUNC(16);
-
 static uint32_t
 e1000e_mac_swsm_read(E1000ECore *core, int index)
 {
     uint32_t val = core->mac[SWSM];
-    core->mac[SWSM] = val | 1;
+    core->mac[SWSM] = val | E1000_SWSM_SMBI;
     return val;
 }
 
@@ -2908,6 +2913,35 @@ e1000e_set_gcr(E1000ECore *core, int index, uint32_t val)
     core->mac[GCR] = (val & ~E1000_GCR_RO_BITS) | ro_bits;
 }
 
+static uint32_t e1000e_get_systiml(E1000ECore *core, int index)
+{
+    e1000x_timestamp(core->mac, core->timadj, SYSTIML, SYSTIMH);
+    return core->mac[SYSTIML];
+}
+
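+/* Reading RXSATRH/TXSTMPH clears the VALID flag, re-arming the latch */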
+static uint32_t e1000e_get_rxsatrh(E1000ECore *core, int index)
+{
+    core->mac[TSYNCRXCTL] &= ~E1000_TSYNCRXCTL_VALID;
+    return core->mac[RXSATRH];
+}
+
+static uint32_t e1000e_get_txstmph(E1000ECore *core, int index)
+{
+    core->mac[TSYNCTXCTL] &= ~E1000_TSYNCTXCTL_VALID;
+    return core->mac[TXSTMPH];
+}
+
+static void e1000e_set_timinca(E1000ECore *core, int index, uint32_t val)
+{
+    e1000x_set_timinca(core->mac, &core->timadj, val);
+}
+
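+/* A TIMADJH write commits the combined 64-bit TIMADJL/TIMADJH adjustment */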
+static void e1000e_set_timadjh(E1000ECore *core, int index, uint32_t val)
+{
+    core->mac[TIMADJH] = val;
+    core->timadj += core->mac[TIMADJL] | ((int64_t)core->mac[TIMADJH] << 32);
+}
+
 #define e1000e_getreg(x)    [x] = e1000e_mac_readreg
 typedef uint32_t (*readops)(E1000ECore *, int);
 static const readops e1000e_macreg_readops[] = {
@@ -2923,7 +2957,19 @@ static const readops e1000e_macreg_readops[] = {
     e1000e_getreg(LATECOL),
     e1000e_getreg(SEQEC),
     e1000e_getreg(XONTXC),
+    e1000e_getreg(AIT),
+    e1000e_getreg(TDFH),
+    e1000e_getreg(TDFT),
+    e1000e_getreg(TDFHS),
+    e1000e_getreg(TDFTS),
+    e1000e_getreg(TDFPC),
     e1000e_getreg(WUS),
+    e1000e_getreg(PBS),
+    e1000e_getreg(RDFH),
+    e1000e_getreg(RDFT),
+    e1000e_getreg(RDFHS),
+    e1000e_getreg(RDFTS),
+    e1000e_getreg(RDFPC),
     e1000e_getreg(GORCL),
     e1000e_getreg(MGTPRC),
     e1000e_getreg(EERD),
@@ -2951,7 +2997,6 @@ static const readops e1000e_macreg_readops[] = {
     e1000e_getreg(GSCL_2),
     e1000e_getreg(RDBAH1),
     e1000e_getreg(FLSWDATA),
-    e1000e_getreg(RXSATRH),
     e1000e_getreg(TIPG),
     e1000e_getreg(FLMNGCTL),
     e1000e_getreg(FLMNGCNT),
@@ -2992,7 +3037,6 @@ static const readops e1000e_macreg_readops[] = {
     e1000e_getreg(FLSWCTL),
     e1000e_getreg(RXDCTL1),
     e1000e_getreg(RXSATRL),
-    e1000e_getreg(SYSTIML),
     e1000e_getreg(RXUDP),
     e1000e_getreg(TORL),
     e1000e_getreg(TDLEN1),
@@ -3032,7 +3076,6 @@ static const readops e1000e_macreg_readops[] = {
     e1000e_getreg(FLOL),
     e1000e_getreg(RXDCTL),
     e1000e_getreg(RXSTMPL),
-    e1000e_getreg(TXSTMPH),
     e1000e_getreg(TIMADJH),
     e1000e_getreg(FCRTL),
     e1000e_getreg(TDBAH),
@@ -3059,16 +3102,9 @@ static const readops e1000e_macreg_readops[] = {
     [MPTC]    = e1000e_mac_read_clr4,
     [IAC]     = e1000e_mac_read_clr4,
     [ICR]     = e1000e_mac_icr_read,
-    [RDFH]    = E1000E_LOW_BITS_READ(13),
-    [RDFHS]   = E1000E_LOW_BITS_READ(13),
-    [RDFPC]   = E1000E_LOW_BITS_READ(13),
-    [TDFH]    = E1000E_LOW_BITS_READ(13),
-    [TDFHS]   = E1000E_LOW_BITS_READ(13),
     [STATUS]  = e1000e_get_status,
     [TARC0]   = e1000e_get_tarc,
-    [PBS]     = E1000E_LOW_BITS_READ(6),
     [ICS]     = e1000e_mac_ics_read,
-    [AIT]     = E1000E_LOW_BITS_READ(16),
     [TORH]    = e1000e_mac_read_clr8,
     [GORCH]   = e1000e_mac_read_clr8,
     [PRC127]  = e1000e_mac_read_clr4,
@@ -3084,27 +3120,25 @@ static const readops e1000e_macreg_readops[] = {
     [BPTC]    = e1000e_mac_read_clr4,
     [TSCTC]   = e1000e_mac_read_clr4,
     [ITR]     = e1000e_mac_itr_read,
-    [RDFT]    = E1000E_LOW_BITS_READ(13),
-    [RDFTS]   = E1000E_LOW_BITS_READ(13),
-    [TDFPC]   = E1000E_LOW_BITS_READ(13),
-    [TDFT]    = E1000E_LOW_BITS_READ(13),
-    [TDFTS]   = E1000E_LOW_BITS_READ(13),
     [CTRL]    = e1000e_get_ctrl,
     [TARC1]   = e1000e_get_tarc,
     [SWSM]    = e1000e_mac_swsm_read,
     [IMS]     = e1000e_mac_ims_read,
+    [SYSTIML] = e1000e_get_systiml,
+    [RXSATRH] = e1000e_get_rxsatrh,
+    [TXSTMPH] = e1000e_get_txstmph,
 
     [CRCERRS ... MPC]      = e1000e_mac_readreg,
     [IP6AT ... IP6AT + 3]  = e1000e_mac_readreg,
     [IP4AT ... IP4AT + 6]  = e1000e_mac_readreg,
     [RA ... RA + 31]       = e1000e_mac_readreg,
     [WUPM ... WUPM + 31]   = e1000e_mac_readreg,
-    [MTA ... MTA + 127]    = e1000e_mac_readreg,
-    [VFTA ... VFTA + 127]  = e1000e_mac_readreg,
-    [FFMT ... FFMT + 254]  = E1000E_LOW_BITS_READ(4),
+    [MTA ... MTA + E1000_MC_TBL_SIZE - 1] = e1000e_mac_readreg,
+    [VFTA ... VFTA + E1000_VLAN_FILTER_TBL_SIZE - 1]  = e1000e_mac_readreg,
+    [FFMT ... FFMT + 254]  = e1000e_mac_readreg,
     [FFVT ... FFVT + 254]  = e1000e_mac_readreg,
     [MDEF ... MDEF + 7]    = e1000e_mac_readreg,
-    [FFLT ... FFLT + 10]   = E1000E_LOW_BITS_READ(11),
+    [FFLT ... FFLT + 10]   = e1000e_mac_readreg,
     [FTFT ... FTFT + 254]  = e1000e_mac_readreg,
     [PBM ... PBM + 10239]  = e1000e_mac_readreg,
     [RETA ... RETA + 31]   = e1000e_mac_readreg,
@@ -3127,22 +3161,10 @@ static const writeops e1000e_macreg_writeops[] = {
     e1000e_putreg(LEDCTL),
     e1000e_putreg(FCAL),
     e1000e_putreg(FCRUC),
-    e1000e_putreg(AIT),
-    e1000e_putreg(TDFH),
-    e1000e_putreg(TDFT),
-    e1000e_putreg(TDFHS),
-    e1000e_putreg(TDFTS),
-    e1000e_putreg(TDFPC),
     e1000e_putreg(WUC),
     e1000e_putreg(WUS),
-    e1000e_putreg(RDFH),
-    e1000e_putreg(RDFT),
-    e1000e_putreg(RDFHS),
-    e1000e_putreg(RDFTS),
-    e1000e_putreg(RDFPC),
     e1000e_putreg(IPAV),
     e1000e_putreg(TDBAH1),
-    e1000e_putreg(TIMINCA),
     e1000e_putreg(IAM),
     e1000e_putreg(EIAC),
     e1000e_putreg(IVAR),
@@ -3150,7 +3172,6 @@ static const writeops e1000e_macreg_writeops[] = {
     e1000e_putreg(TARC1),
     e1000e_putreg(FLSWDATA),
     e1000e_putreg(POEMB),
-    e1000e_putreg(PBS),
     e1000e_putreg(MFUTP01),
     e1000e_putreg(MFUTP23),
     e1000e_putreg(MANC),
@@ -3186,7 +3207,6 @@ static const writeops e1000e_macreg_writeops[] = {
     e1000e_putreg(SYSTIML),
     e1000e_putreg(SYSTIMH),
     e1000e_putreg(TIMADJL),
-    e1000e_putreg(TIMADJH),
     e1000e_putreg(RXUDP),
     e1000e_putreg(RXCFGL),
     e1000e_putreg(TSYNCRXCTL),
@@ -3215,6 +3235,18 @@ static const writeops e1000e_macreg_writeops[] = {
     [TADV]     = e1000e_set_16bit,
     [ITR]      = e1000e_set_itr,
     [EERD]     = e1000e_set_eerd,
+    [AIT]      = e1000e_set_16bit,
+    [TDFH]     = e1000e_set_13bit,
+    [TDFT]     = e1000e_set_13bit,
+    [TDFHS]    = e1000e_set_13bit,
+    [TDFTS]    = e1000e_set_13bit,
+    [TDFPC]    = e1000e_set_13bit,
+    [RDFH]     = e1000e_set_13bit,
+    [RDFHS]    = e1000e_set_13bit,
+    [RDFT]     = e1000e_set_13bit,
+    [RDFTS]    = e1000e_set_13bit,
+    [RDFPC]    = e1000e_set_13bit,
+    [PBS]      = e1000e_set_6bit,
     [GCR]      = e1000e_set_gcr,
     [PSRCTL]   = e1000e_set_psrctl,
     [RXCSUM]   = e1000e_set_rxcsum,
@@ -3247,18 +3279,20 @@ static const writeops e1000e_macreg_writeops[] = {
     [CTRL_DUP] = e1000e_set_ctrl,
     [RFCTL]    = e1000e_set_rfctl,
     [RA + 1]   = e1000e_mac_setmacaddr,
+    [TIMINCA]  = e1000e_set_timinca,
+    [TIMADJH]  = e1000e_set_timadjh,
 
     [IP6AT ... IP6AT + 3]    = e1000e_mac_writereg,
     [IP4AT ... IP4AT + 6]    = e1000e_mac_writereg,
     [RA + 2 ... RA + 31]     = e1000e_mac_writereg,
     [WUPM ... WUPM + 31]     = e1000e_mac_writereg,
-    [MTA ... MTA + 127]      = e1000e_mac_writereg,
-    [VFTA ... VFTA + 127]    = e1000e_mac_writereg,
-    [FFMT ... FFMT + 254]    = e1000e_mac_writereg,
+    [MTA ... MTA + E1000_MC_TBL_SIZE - 1] = e1000e_mac_writereg,
+    [VFTA ... VFTA + E1000_VLAN_FILTER_TBL_SIZE - 1] = e1000e_mac_writereg,
+    [FFMT ... FFMT + 254]    = e1000e_set_4bit,
     [FFVT ... FFVT + 254]    = e1000e_mac_writereg,
     [PBM ... PBM + 10239]    = e1000e_mac_writereg,
     [MDEF ... MDEF + 7]      = e1000e_mac_writereg,
-    [FFLT ... FFLT + 10]     = e1000e_mac_writereg,
+    [FFLT ... FFLT + 10]     = e1000e_set_11bit,
     [FTFT ... FTFT + 254]    = e1000e_mac_writereg,
     [RETA ... RETA + 31]     = e1000e_mac_writereg,
     [RSSRK ... RSSRK + 31]   = e1000e_mac_writereg,
@@ -3269,10 +3303,12 @@ enum { E1000E_NWRITEOPS = ARRAY_SIZE(e1000e_macreg_writeops) };
 
 enum { MAC_ACCESS_PARTIAL = 1 };
 
-/* The array below combines alias offsets of the index values for the
+/*
+ * The array below combines alias offsets of the index values for the
  * MAC registers that have aliases, with the indication of not fully
  * implemented registers (lowest bit). This combination is possible
- * because all of the offsets are even. */
+ * because all of the offsets are even.
+ */
 static const uint16_t mac_reg_access[E1000E_MAC_SIZE] = {
     /* Alias index offsets */
     [FCRTL_A] = 0x07fe, [FCRTH_A] = 0x0802,
@@ -3281,7 +3317,7 @@ static const uint16_t mac_reg_access[E1000E_MAC_SIZE] = {
     [TDH_A]   = 0x0cf8, [TDT_A]   = 0x0cf8, [TIDV_A] = 0x0cf8,
     [TDFH_A]  = 0xed00, [TDFT_A]  = 0xed00,
     [RA_A ... RA_A + 31]      = 0x14f0,
-    [VFTA_A ... VFTA_A + 127] = 0x1400,
+    [VFTA_A ... VFTA_A + E1000_VLAN_FILTER_TBL_SIZE - 1] = 0x1400,
     [RDBAL0_A ... RDLEN0_A] = 0x09bc,
     [TDBAL_A ... TDLEN_A]   = 0x0cf8,
     /* Access options */
@@ -3347,7 +3383,7 @@ static void
 e1000e_autoneg_resume(E1000ECore *core)
 {
     if (e1000e_have_autoneg(core) &&
-        !(core->phy[0][PHY_STATUS] & MII_SR_AUTONEG_COMPLETE)) {
+        !(core->phy[0][MII_BMSR] & MII_BMSR_AN_COMP)) {
         qemu_get_queue(core->owner_nic)->link_down = false;
         timer_mod(core->autoneg_timer,
                   qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + 500);
@@ -3386,11 +3422,10 @@ e1000e_core_pci_realize(E1000ECore     *core,
         qemu_add_vm_change_state_handler(e1000e_vm_state_change, core);
 
     for (i = 0; i < E1000E_NUM_QUEUES; i++) {
-        net_tx_pkt_init(&core->tx[i].tx_pkt, core->owner,
-                        E1000E_MAX_TX_FRAGS, core->has_vnet);
+        net_tx_pkt_init(&core->tx[i].tx_pkt, core->owner, E1000E_MAX_TX_FRAGS);
     }
 
-    net_rx_pkt_init(&core->rx_pkt, core->has_vnet);
+    net_rx_pkt_init(&core->rx_pkt);
 
     e1000x_core_prepare_eeprom(core->eeprom,
                                eeprom_templ,
@@ -3422,29 +3457,36 @@ e1000e_core_pci_uninit(E1000ECore *core)
 static const uint16_t
 e1000e_phy_reg_init[E1000E_PHY_PAGES][E1000E_PHY_PAGE_SIZE] = {
     [0] = {
-        [PHY_CTRL] =   MII_CR_SPEED_SELECT_MSB  |
-                       MII_CR_FULL_DUPLEX       |
-                       MII_CR_AUTO_NEG_EN,
-
-        [PHY_STATUS] = MII_SR_EXTENDED_CAPS     |
-                       MII_SR_LINK_STATUS       |
-                       MII_SR_AUTONEG_CAPS      |
-                       MII_SR_PREAMBLE_SUPPRESS |
-                       MII_SR_EXTENDED_STATUS   |
-                       MII_SR_10T_HD_CAPS       |
-                       MII_SR_10T_FD_CAPS       |
-                       MII_SR_100X_HD_CAPS      |
-                       MII_SR_100X_FD_CAPS,
-
-        [PHY_ID1]               = 0x141,
-        [PHY_ID2]               = E1000_PHY_ID2_82574x,
-        [PHY_AUTONEG_ADV]       = 0xde1,
-        [PHY_LP_ABILITY]        = 0x7e0,
-        [PHY_AUTONEG_EXP]       = BIT(2),
-        [PHY_NEXT_PAGE_TX]      = BIT(0) | BIT(13),
-        [PHY_1000T_CTRL]        = BIT(8) | BIT(9) | BIT(10) | BIT(11),
-        [PHY_1000T_STATUS]      = 0x3c00,
-        [PHY_EXT_STATUS]        = BIT(12) | BIT(13),
+        [MII_BMCR] = MII_BMCR_SPEED1000 |
+                     MII_BMCR_FD        |
+                     MII_BMCR_AUTOEN,
+
+        [MII_BMSR] = MII_BMSR_EXTCAP    |
+                     MII_BMSR_LINK_ST   |
+                     MII_BMSR_AUTONEG   |
+                     MII_BMSR_MFPS      |
+                     MII_BMSR_EXTSTAT   |
+                     MII_BMSR_10T_HD    |
+                     MII_BMSR_10T_FD    |
+                     MII_BMSR_100TX_HD  |
+                     MII_BMSR_100TX_FD,
+
+        [MII_PHYID1]            = 0x141,
+        [MII_PHYID2]            = E1000_PHY_ID2_82574x,
+        [MII_ANAR]              = MII_ANAR_CSMACD | MII_ANAR_10 |
+                                  MII_ANAR_10FD | MII_ANAR_TX |
+                                  MII_ANAR_TXFD | MII_ANAR_PAUSE |
+                                  MII_ANAR_PAUSE_ASYM,
+        [MII_ANLPAR]            = MII_ANLPAR_10 | MII_ANLPAR_10FD |
+                                  MII_ANLPAR_TX | MII_ANLPAR_TXFD |
+                                  MII_ANLPAR_T4 | MII_ANLPAR_PAUSE,
+        [MII_ANER]              = MII_ANER_NP | MII_ANER_NWAY,
+        [MII_ANNP]              = 1 | MII_ANNP_MP,
+        [MII_CTRL1000]          = MII_CTRL1000_HALF | MII_CTRL1000_FULL |
+                                  MII_CTRL1000_PORT | MII_CTRL1000_MASTER,
+        [MII_STAT1000]          = MII_STAT1000_HALF | MII_STAT1000_FULL |
+                                  MII_STAT1000_ROK | MII_STAT1000_LOK,
+        [MII_EXTSTAT]           = MII_EXTSTAT_1000T_HD | MII_EXTSTAT_1000T_FD,
 
         [PHY_COPPER_CTRL1]      = BIT(5) | BIT(6) | BIT(8) | BIT(9) |
                                   BIT(12) | BIT(13),
@@ -3501,8 +3543,7 @@ static const uint32_t e1000e_mac_reg_init[] = {
     [EITR...EITR + E1000E_MSIX_VEC_NUM - 1] = E1000E_MIN_XITR,
 };
 
-void
-e1000e_core_reset(E1000ECore *core)
+static void e1000e_reset(E1000ECore *core, bool sw)
 {
     int i;
 
@@ -3511,9 +3552,16 @@ e1000e_core_reset(E1000ECore *core)
     e1000e_intrmgr_reset(core);
 
     memset(core->phy, 0, sizeof core->phy);
-    memmove(core->phy, e1000e_phy_reg_init, sizeof e1000e_phy_reg_init);
-    memset(core->mac, 0, sizeof core->mac);
-    memmove(core->mac, e1000e_mac_reg_init, sizeof e1000e_mac_reg_init);
+    memcpy(core->phy, e1000e_phy_reg_init, sizeof e1000e_phy_reg_init);
+
+    for (i = 0; i < E1000E_MAC_SIZE; i++) {
+        if (sw && (i == PBA || i == PBS || i == FLA)) {
+            continue;
+        }
+
+        core->mac[i] = i < ARRAY_SIZE(e1000e_mac_reg_init) ?
+                       e1000e_mac_reg_init[i] : 0;
+    }
 
     core->rxbuf_min_shift = 1 + E1000_RING_DESC_LEN_SHIFT;
 
@@ -3530,18 +3578,24 @@ e1000e_core_reset(E1000ECore *core)
     }
 }
 
+void
+e1000e_core_reset(E1000ECore *core)
+{
+    e1000e_reset(core, false);
+}
+
 void e1000e_core_pre_save(E1000ECore *core)
 {
     int i;
     NetClientState *nc = qemu_get_queue(core->owner_nic);
 
     /*
-    * If link is down and auto-negotiation is supported and ongoing,
-    * complete auto-negotiation immediately. This allows us to look
-    * at MII_SR_AUTONEG_COMPLETE to infer link status on load.
-    */
+     * If link is down and auto-negotiation is supported and ongoing,
+     * complete auto-negotiation immediately. This allows us to look
+     * at MII_BMSR_AN_COMP to infer link status on load.
+     */
     if (nc->link_down && e1000e_have_autoneg(core)) {
-        core->phy[0][PHY_STATUS] |= MII_SR_AUTONEG_COMPLETE;
+        core->phy[0][MII_BMSR] |= MII_BMSR_AN_COMP;
         e1000e_update_flowctl_status(core);
     }
 
@@ -3557,7 +3611,8 @@ e1000e_core_post_load(E1000ECore *core)
 {
     NetClientState *nc = qemu_get_queue(core->owner_nic);
 
-    /* nc.link_down can't be migrated, so infer link_down according
+    /*
+     * nc.link_down can't be migrated, so infer link_down according
      * to link status bit in core.mac[STATUS].
      */
     nc->link_down = (core->mac[STATUS] & E1000_STATUS_LU) == 0;
diff --git a/hw/net/e1000e_core.h b/hw/net/e1000e_core.h
index 4ddb4d2c39..213a70530d 100644
--- a/hw/net/e1000e_core.h
+++ b/hw/net/e1000e_core.h
@@ -1,37 +1,37 @@
 /*
-* Core code for QEMU e1000e emulation
-*
-* Software developer's manuals:
-* http://www.intel.com/content/dam/doc/datasheet/82574l-gbe-controller-datasheet.pdf
-*
-* Copyright (c) 2015 Ravello Systems LTD (http://ravellosystems.com)
-* Developed by Daynix Computing LTD (http://www.daynix.com)
-*
-* Authors:
-* Dmitry Fleytman <dmitry@daynix.com>
-* Leonid Bloch <leonid@daynix.com>
-* Yan Vugenfirer <yan@daynix.com>
-*
-* Based on work done by:
-* Nir Peleg, Tutis Systems Ltd. for Qumranet Inc.
-* Copyright (c) 2008 Qumranet
-* Based on work done by:
-* Copyright (c) 2007 Dan Aloni
-* Copyright (c) 2004 Antony T Curtis
-*
-* This library is free software; you can redistribute it and/or
-* modify it under the terms of the GNU Lesser General Public
-* License as published by the Free Software Foundation; either
-* version 2.1 of the License, or (at your option) any later version.
-*
-* This library is distributed in the hope that it will be useful,
-* but WITHOUT ANY WARRANTY; without even the implied warranty of
-* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-* Lesser General Public License for more details.
-*
-* You should have received a copy of the GNU Lesser General Public
-* License along with this library; if not, see <http://www.gnu.org/licenses/>.
-*/
+ * Core code for QEMU e1000e emulation
+ *
+ * Software developer's manuals:
+ * http://www.intel.com/content/dam/doc/datasheet/82574l-gbe-controller-datasheet.pdf
+ *
+ * Copyright (c) 2015 Ravello Systems LTD (http://ravellosystems.com)
+ * Developed by Daynix Computing LTD (http://www.daynix.com)
+ *
+ * Authors:
+ * Dmitry Fleytman <dmitry@daynix.com>
+ * Leonid Bloch <leonid@daynix.com>
+ * Yan Vugenfirer <yan@daynix.com>
+ *
+ * Based on work done by:
+ * Nir Peleg, Tutis Systems Ltd. for Qumranet Inc.
+ * Copyright (c) 2008 Qumranet
+ * Based on work done by:
+ * Copyright (c) 2007 Dan Aloni
+ * Copyright (c) 2004 Antony T Curtis
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
 
 #ifndef HW_NET_E1000E_CORE_H
 #define HW_NET_E1000E_CORE_H
@@ -95,10 +95,8 @@ struct E1000Core {
     E1000IntrDelayTimer tidv;
 
     E1000IntrDelayTimer itr;
-    bool itr_intr_pending;
 
     E1000IntrDelayTimer eitr[E1000E_MSIX_VEC_NUM];
-    bool eitr_intr_pending[E1000E_MSIX_VEC_NUM];
 
     VMChangeStateEntry *vmstate;
 
@@ -114,6 +112,8 @@ struct E1000Core {
     void (*owner_start_recv)(PCIDevice *d);
 
     uint32_t msi_causes_pending;
+
+    int64_t timadj;
 };
 
 void
diff --git a/hw/net/e1000x_common.c b/hw/net/e1000x_common.c
index 2f43e8cd13..b844af590a 100644
--- a/hw/net/e1000x_common.c
+++ b/hw/net/e1000x_common.c
@@ -24,9 +24,12 @@
 
 #include "qemu/osdep.h"
 #include "qemu/units.h"
+#include "hw/net/mii.h"
 #include "hw/pci/pci_device.h"
+#include "net/eth.h"
 #include "net/net.h"
 
+#include "e1000_common.h"
 #include "e1000x_common.h"
 
 #include "trace.h"
@@ -45,9 +48,9 @@ bool e1000x_rx_ready(PCIDevice *d, uint32_t *mac)
     return true;
 }
 
-bool e1000x_is_vlan_packet(const uint8_t *buf, uint16_t vet)
+bool e1000x_is_vlan_packet(const void *buf, uint16_t vet)
 {
-    uint16_t eth_proto = lduw_be_p(buf + 12);
+    uint16_t eth_proto = lduw_be_p(&PKT_GET_ETH_HDR(buf)->h_proto);
     bool res = (eth_proto == vet);
 
     trace_e1000x_vlan_is_vlan_pkt(res, eth_proto, vet);
@@ -66,7 +69,7 @@ bool e1000x_rx_group_filter(uint32_t *mac, const uint8_t *buf)
         }
         ra[0] = cpu_to_le32(rp[0]);
         ra[1] = cpu_to_le32(rp[1]);
-        if (!memcmp(buf, (uint8_t *)ra, 6)) {
+        if (!memcmp(buf, (uint8_t *)ra, ETH_ALEN)) {
             trace_e1000x_rx_flt_ucast_match((int)(rp - mac - RA) / 2,
                                             MAC_ARG(buf));
             return true;
@@ -152,8 +155,8 @@ void e1000x_reset_mac_addr(NICState *nic, uint32_t *mac_regs,
 void e1000x_update_regs_on_autoneg_done(uint32_t *mac, uint16_t *phy)
 {
     e1000x_update_regs_on_link_up(mac, phy);
-    phy[PHY_LP_ABILITY] |= MII_LPAR_LPACK;
-    phy[PHY_STATUS] |= MII_SR_AUTONEG_COMPLETE;
+    phy[MII_ANLPAR] |= MII_ANLPAR_ACK;
+    phy[MII_BMSR] |= MII_BMSR_AN_COMP;
     trace_e1000x_link_negotiation_done();
 }
 
@@ -265,3 +268,28 @@ e1000x_read_tx_ctx_descr(struct e1000_context_desc *d,
     props->tcp = (op & E1000_TXD_CMD_TCP) ? 1 : 0;
     props->tse = (op & E1000_TXD_CMD_TSE) ? 1 : 0;
 }
+
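+/* Timestamp = timadj + virtual-clock ns * incvalue / (incperiod * 16) */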
+void e1000x_timestamp(uint32_t *mac, int64_t timadj, size_t lo, size_t hi)
+{
+    int64_t ns = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
+    uint32_t timinca = mac[TIMINCA];
+    uint32_t incvalue = timinca & E1000_TIMINCA_INCVALUE_MASK;
+    uint32_t incperiod = MAX(timinca >> E1000_TIMINCA_INCPERIOD_SHIFT, 1);
+    int64_t timestamp = timadj + muldiv64(ns, incvalue, incperiod * 16);
+
+    mac[lo] = timestamp & 0xffffffff;
+    mac[hi] = timestamp >> 32;
+}
+
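+/*
+ * Latch a new TIMINCA value, folding the difference between the new and
+ * old scalings of the current virtual time into *timadj so that later
+ * e1000x_timestamp() reads account for the rate change.
+ */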
+void e1000x_set_timinca(uint32_t *mac, int64_t *timadj, uint32_t val)
+{
+    int64_t ns = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
+    uint32_t old_val = mac[TIMINCA];
+    uint32_t old_incvalue = old_val & E1000_TIMINCA_INCVALUE_MASK;
+    uint32_t old_incperiod = MAX(old_val >> E1000_TIMINCA_INCPERIOD_SHIFT, 1);
+    uint32_t incvalue = val & E1000_TIMINCA_INCVALUE_MASK;
+    uint32_t incperiod = MAX(val >> E1000_TIMINCA_INCPERIOD_SHIFT, 1);
+
+    mac[TIMINCA] = val;
+    *timadj += (muldiv64(ns, incvalue, incperiod) -
+                muldiv64(ns, old_incvalue, old_incperiod)) / 16;
+}
diff --git a/hw/net/e1000x_common.h b/hw/net/e1000x_common.h
index b7742775c4..911abd8a90 100644
--- a/hw/net/e1000x_common.h
+++ b/hw/net/e1000x_common.h
@@ -1,108 +1,34 @@
 /*
-* QEMU e1000(e) emulation - shared code
-*
-* Copyright (c) 2008 Qumranet
-*
-* Based on work done by:
-* Nir Peleg, Tutis Systems Ltd. for Qumranet Inc.
-* Copyright (c) 2007 Dan Aloni
-* Copyright (c) 2004 Antony T Curtis
-*
-* This library is free software; you can redistribute it and/or
-* modify it under the terms of the GNU Lesser General Public
-* License as published by the Free Software Foundation; either
-* version 2.1 of the License, or (at your option) any later version.
-*
-* This library is distributed in the hope that it will be useful,
-* but WITHOUT ANY WARRANTY; without even the implied warranty of
-* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-* Lesser General Public License for more details.
-*
-* You should have received a copy of the GNU Lesser General Public
-* License along with this library; if not, see <http://www.gnu.org/licenses/>.
-*/
+ * QEMU e1000(e) emulation - shared code
+ *
+ * Copyright (c) 2008 Qumranet
+ *
+ * Based on work done by:
+ * Nir Peleg, Tutis Systems Ltd. for Qumranet Inc.
+ * Copyright (c) 2007 Dan Aloni
+ * Copyright (c) 2004 Antony T Curtis
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
 
 #ifndef HW_NET_E1000X_COMMON_H
 #define HW_NET_E1000X_COMMON_H
 
-#include "e1000_regs.h"
-
-#define defreg(x)   x = (E1000_##x >> 2)
-enum {
-    defreg(CTRL),    defreg(EECD),    defreg(EERD),    defreg(GPRC),
-    defreg(GPTC),    defreg(ICR),     defreg(ICS),     defreg(IMC),
-    defreg(IMS),     defreg(LEDCTL),  defreg(MANC),    defreg(MDIC),
-    defreg(MPC),     defreg(PBA),     defreg(RCTL),    defreg(RDBAH0),
-    defreg(RDBAL0),  defreg(RDH0),    defreg(RDLEN0),  defreg(RDT0),
-    defreg(STATUS),  defreg(SWSM),    defreg(TCTL),    defreg(TDBAH),
-    defreg(TDBAL),   defreg(TDH),     defreg(TDLEN),   defreg(TDT),
-    defreg(TDLEN1),  defreg(TDBAL1),  defreg(TDBAH1),  defreg(TDH1),
-    defreg(TDT1),    defreg(TORH),    defreg(TORL),    defreg(TOTH),
-    defreg(TOTL),    defreg(TPR),     defreg(TPT),     defreg(TXDCTL),
-    defreg(WUFC),    defreg(RA),      defreg(MTA),     defreg(CRCERRS),
-    defreg(VFTA),    defreg(VET),     defreg(RDTR),    defreg(RADV),
-    defreg(TADV),    defreg(ITR),     defreg(SCC),     defreg(ECOL),
-    defreg(MCC),     defreg(LATECOL), defreg(COLC),    defreg(DC),
-    defreg(TNCRS),   defreg(SEQEC),   defreg(CEXTERR), defreg(RLEC),
-    defreg(XONRXC),  defreg(XONTXC),  defreg(XOFFRXC), defreg(XOFFTXC),
-    defreg(FCRUC),   defreg(AIT),     defreg(TDFH),    defreg(TDFT),
-    defreg(TDFHS),   defreg(TDFTS),   defreg(TDFPC),   defreg(WUC),
-    defreg(WUS),     defreg(POEMB),   defreg(PBS),     defreg(RDFH),
-    defreg(RDFT),    defreg(RDFHS),   defreg(RDFTS),   defreg(RDFPC),
-    defreg(PBM),     defreg(IPAV),    defreg(IP4AT),   defreg(IP6AT),
-    defreg(WUPM),    defreg(FFLT),    defreg(FFMT),    defreg(FFVT),
-    defreg(TARC0),   defreg(TARC1),   defreg(IAM),     defreg(EXTCNF_CTRL),
-    defreg(GCR),     defreg(TIMINCA), defreg(EIAC),    defreg(CTRL_EXT),
-    defreg(IVAR),    defreg(MFUTP01), defreg(MFUTP23), defreg(MANC2H),
-    defreg(MFVAL),   defreg(MDEF),    defreg(FACTPS),  defreg(FTFT),
-    defreg(RUC),     defreg(ROC),     defreg(RFC),     defreg(RJC),
-    defreg(PRC64),   defreg(PRC127),  defreg(PRC255),  defreg(PRC511),
-    defreg(PRC1023), defreg(PRC1522), defreg(PTC64),   defreg(PTC127),
-    defreg(PTC255),  defreg(PTC511),  defreg(PTC1023), defreg(PTC1522),
-    defreg(GORCL),   defreg(GORCH),   defreg(GOTCL),   defreg(GOTCH),
-    defreg(RNBC),    defreg(BPRC),    defreg(MPRC),    defreg(RFCTL),
-    defreg(PSRCTL),  defreg(MPTC),    defreg(BPTC),    defreg(TSCTFC),
-    defreg(IAC),     defreg(MGTPRC),  defreg(MGTPDC),  defreg(MGTPTC),
-    defreg(TSCTC),   defreg(RXCSUM),  defreg(FUNCTAG), defreg(GSCL_1),
-    defreg(GSCL_2),  defreg(GSCL_3),  defreg(GSCL_4),  defreg(GSCN_0),
-    defreg(GSCN_1),  defreg(GSCN_2),  defreg(GSCN_3),  defreg(GCR2),
-    defreg(RAID),    defreg(RSRPD),   defreg(TIDV),    defreg(EITR),
-    defreg(MRQC),    defreg(RETA),    defreg(RSSRK),   defreg(RDBAH1),
-    defreg(RDBAL1),  defreg(RDLEN1),  defreg(RDH1),    defreg(RDT1),
-    defreg(PBACLR),  defreg(FCAL),    defreg(FCAH),    defreg(FCT),
-    defreg(FCRTH),   defreg(FCRTL),   defreg(FCTTV),   defreg(FCRTV),
-    defreg(FLA),     defreg(EEWR),    defreg(FLOP),    defreg(FLOL),
-    defreg(FLSWCTL), defreg(FLSWCNT), defreg(RXDCTL),  defreg(RXDCTL1),
-    defreg(MAVTV0),  defreg(MAVTV1),  defreg(MAVTV2),  defreg(MAVTV3),
-    defreg(TXSTMPL), defreg(TXSTMPH), defreg(SYSTIML), defreg(SYSTIMH),
-    defreg(RXCFGL),  defreg(RXUDP),   defreg(TIMADJL), defreg(TIMADJH),
-    defreg(RXSTMPH), defreg(RXSTMPL), defreg(RXSATRL), defreg(RXSATRH),
-    defreg(FLASHT),  defreg(TIPG),    defreg(RDH),     defreg(RDT),
-    defreg(RDLEN),   defreg(RDBAH),   defreg(RDBAL),
-    defreg(TXDCTL1),
-    defreg(FLSWDATA),
-    defreg(CTRL_DUP),
-    defreg(EXTCNF_SIZE),
-    defreg(EEMNGCTL),
-    defreg(EEMNGDATA),
-    defreg(FLMNGCTL),
-    defreg(FLMNGDATA),
-    defreg(FLMNGCNT),
-    defreg(TSYNCRXCTL),
-    defreg(TSYNCTXCTL),
-
-    /* Aliases */
-    defreg(RDH0_A),  defreg(RDT0_A),  defreg(RDTR_A),  defreg(RDFH_A),
-    defreg(RDFT_A),  defreg(TDH_A),   defreg(TDT_A),   defreg(TIDV_A),
-    defreg(TDFH_A),  defreg(TDFT_A),  defreg(RA_A),    defreg(RDBAL0_A),
-    defreg(TDBAL_A), defreg(TDLEN_A), defreg(VFTA_A),  defreg(RDLEN0_A),
-    defreg(FCRTL_A), defreg(FCRTH_A)
-};
-
 static inline void
 e1000x_inc_reg_if_not_full(uint32_t *mac, int index)
 {
-    if (mac[index] != 0xffffffff) {
+    if (mac[index] != UINT32_MAX) {
         mac[index]++;
     }
 }
@@ -152,16 +78,16 @@ static inline void
 e1000x_update_regs_on_link_down(uint32_t *mac, uint16_t *phy)
 {
     mac[STATUS] &= ~E1000_STATUS_LU;
-    phy[PHY_STATUS] &= ~MII_SR_LINK_STATUS;
-    phy[PHY_STATUS] &= ~MII_SR_AUTONEG_COMPLETE;
-    phy[PHY_LP_ABILITY] &= ~MII_LPAR_LPACK;
+    phy[MII_BMSR] &= ~MII_BMSR_LINK_ST;
+    phy[MII_BMSR] &= ~MII_BMSR_AN_COMP;
+    phy[MII_ANLPAR] &= ~MII_ANLPAR_ACK;
 }
 
 static inline void
 e1000x_update_regs_on_link_up(uint32_t *mac, uint16_t *phy)
 {
     mac[STATUS] |= E1000_STATUS_LU;
-    phy[PHY_STATUS] |= MII_SR_LINK_STATUS;
+    phy[MII_BMSR] |= MII_BMSR_LINK_ST;
 }
 
 void e1000x_update_rx_total_stats(uint32_t *mac,
@@ -178,7 +104,7 @@ uint32_t e1000x_rxbufsize(uint32_t rctl);
 
 bool e1000x_rx_ready(PCIDevice *d, uint32_t *mac);
 
-bool e1000x_is_vlan_packet(const uint8_t *buf, uint16_t vet);
+bool e1000x_is_vlan_packet(const void *buf, uint16_t vet);
 
 bool e1000x_rx_group_filter(uint32_t *mac, const uint8_t *buf);
 
@@ -213,4 +139,7 @@ typedef struct e1000x_txd_props {
 void e1000x_read_tx_ctx_descr(struct e1000_context_desc *d,
                               e1000x_txd_props *props);
 
+void e1000x_timestamp(uint32_t *mac, int64_t timadj, size_t lo, size_t hi);
+void e1000x_set_timinca(uint32_t *mac, int64_t *timadj, uint32_t val);
+
 #endif
diff --git a/hw/net/e1000x_regs.h b/hw/net/e1000x_regs.h
new file mode 100644
index 0000000000..c0832fa23d
--- /dev/null
+++ b/hw/net/e1000x_regs.h
@@ -0,0 +1,967 @@
+/*******************************************************************************
+
+  Intel PRO/1000 Linux driver
+  Copyright(c) 1999 - 2006 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify it
+  under the terms and conditions of the GNU General Public License,
+  version 2, as published by the Free Software Foundation.
+
+  This program is distributed in the hope it will be useful, but WITHOUT
+  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+  more details.
+
+  You should have received a copy of the GNU General Public License along with
+  this program; if not, see <http://www.gnu.org/licenses/>.
+
+  The full GNU General Public License is included in this distribution in
+  the file called "COPYING".
+
+  Contact Information:
+  Linux NICS <linux.nics@intel.com>
+  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
+  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
+
+*******************************************************************************/
+
+/* e1000_hw.h
+ * Structures, enums, and macros for the MAC
+ */
+
+#ifndef HW_E1000X_REGS_H
+#define HW_E1000X_REGS_H
+
+/* PCI Device IDs */
+#define E1000_DEV_ID_82542               0x1000
+#define E1000_DEV_ID_82543GC_FIBER       0x1001
+#define E1000_DEV_ID_82543GC_COPPER      0x1004
+#define E1000_DEV_ID_82544EI_COPPER      0x1008
+#define E1000_DEV_ID_82544EI_FIBER       0x1009
+#define E1000_DEV_ID_82544GC_COPPER      0x100C
+#define E1000_DEV_ID_82544GC_LOM         0x100D
+#define E1000_DEV_ID_82540EM             0x100E
+#define E1000_DEV_ID_82540EM_LOM         0x1015
+#define E1000_DEV_ID_82540EP_LOM         0x1016
+#define E1000_DEV_ID_82540EP             0x1017
+#define E1000_DEV_ID_82540EP_LP          0x101E
+#define E1000_DEV_ID_82545EM_COPPER      0x100F
+#define E1000_DEV_ID_82545EM_FIBER       0x1011
+#define E1000_DEV_ID_82545GM_COPPER      0x1026
+#define E1000_DEV_ID_82545GM_FIBER       0x1027
+#define E1000_DEV_ID_82545GM_SERDES      0x1028
+#define E1000_DEV_ID_82546EB_COPPER      0x1010
+#define E1000_DEV_ID_82546EB_FIBER       0x1012
+#define E1000_DEV_ID_82546EB_QUAD_COPPER 0x101D
+#define E1000_DEV_ID_82541EI             0x1013
+#define E1000_DEV_ID_82541EI_MOBILE      0x1018
+#define E1000_DEV_ID_82541ER_LOM         0x1014
+#define E1000_DEV_ID_82541ER             0x1078
+#define E1000_DEV_ID_82547GI             0x1075
+#define E1000_DEV_ID_82541GI             0x1076
+#define E1000_DEV_ID_82541GI_MOBILE      0x1077
+#define E1000_DEV_ID_82541GI_LF          0x107C
+#define E1000_DEV_ID_82546GB_COPPER      0x1079
+#define E1000_DEV_ID_82546GB_FIBER       0x107A
+#define E1000_DEV_ID_82546GB_SERDES      0x107B
+#define E1000_DEV_ID_82546GB_PCIE        0x108A
+#define E1000_DEV_ID_82546GB_QUAD_COPPER 0x1099
+#define E1000_DEV_ID_82547EI             0x1019
+#define E1000_DEV_ID_82547EI_MOBILE      0x101A
+#define E1000_DEV_ID_82571EB_COPPER      0x105E
+#define E1000_DEV_ID_82571EB_FIBER       0x105F
+#define E1000_DEV_ID_82571EB_SERDES      0x1060
+#define E1000_DEV_ID_82571EB_QUAD_COPPER 0x10A4
+#define E1000_DEV_ID_82571PT_QUAD_COPPER 0x10D5
+#define E1000_DEV_ID_82571EB_QUAD_FIBER  0x10A5
+#define E1000_DEV_ID_82571EB_QUAD_COPPER_LOWPROFILE  0x10BC
+#define E1000_DEV_ID_82571EB_SERDES_DUAL 0x10D9
+#define E1000_DEV_ID_82571EB_SERDES_QUAD 0x10DA
+#define E1000_DEV_ID_82572EI_COPPER      0x107D
+#define E1000_DEV_ID_82572EI_FIBER       0x107E
+#define E1000_DEV_ID_82572EI_SERDES      0x107F
+#define E1000_DEV_ID_82572EI             0x10B9
+#define E1000_DEV_ID_82573E              0x108B
+#define E1000_DEV_ID_82573E_IAMT         0x108C
+#define E1000_DEV_ID_82573L              0x109A
+#define E1000_DEV_ID_82574L              0x10D3
+#define E1000_DEV_ID_82546GB_QUAD_COPPER_KSP3 0x10B5
+#define E1000_DEV_ID_80003ES2LAN_COPPER_DPT     0x1096
+#define E1000_DEV_ID_80003ES2LAN_SERDES_DPT     0x1098
+#define E1000_DEV_ID_80003ES2LAN_COPPER_SPT     0x10BA
+#define E1000_DEV_ID_80003ES2LAN_SERDES_SPT     0x10BB
+
+#define E1000_DEV_ID_ICH8_IGP_M_AMT      0x1049
+#define E1000_DEV_ID_ICH8_IGP_AMT        0x104A
+#define E1000_DEV_ID_ICH8_IGP_C          0x104B
+#define E1000_DEV_ID_ICH8_IFE            0x104C
+#define E1000_DEV_ID_ICH8_IFE_GT         0x10C4
+#define E1000_DEV_ID_ICH8_IFE_G          0x10C5
+#define E1000_DEV_ID_ICH8_IGP_M          0x104D
+
+/* Device Specific Register Defaults */
+#define E1000_PHY_ID2_82541x 0x380
+#define E1000_PHY_ID2_82544x 0xC30
+#define E1000_PHY_ID2_8254xx_DEFAULT 0xC20 /* 82540x, 82545x, and 82546x */
+#define E1000_PHY_ID2_82573x 0xCC0
+#define E1000_PHY_ID2_82574x 0xCB1
+
+/* Register Set. (82543, 82544)
+ *
+ * Registers are defined to be 32 bits and should be accessed as 32-bit values.
+ * These registers are physically located on the NIC, but are mapped into the
+ * host memory address space.
+ *
+ * RW - register is both readable and writable
+ * RO - register is read only
+ * WO - register is write only
+ * R/clr - register is read only and is cleared when read
+ * A - register array
+ */
+#define E1000_CTRL     0x00000  /* Device Control - RW */
+#define E1000_CTRL_DUP 0x00004  /* Device Control Duplicate (Shadow) - RW */
+#define E1000_STATUS   0x00008  /* Device Status - RO */
+#define E1000_EECD     0x00010  /* EEPROM/Flash Control - RW */
+#define E1000_EERD     0x00014  /* EEPROM Read - RW */
+#define E1000_CTRL_EXT 0x00018  /* Extended Device Control - RW */
+#define E1000_FLA      0x0001C  /* Flash Access - RW */
+#define E1000_MDIC     0x00020  /* MDI Control - RW */
+#define E1000_SCTL     0x00024  /* SerDes Control - RW */
+#define E1000_FCAL     0x00028  /* Flow Control Address Low - RW */
+#define E1000_FCAH     0x0002C  /* Flow Control Address High - RW */
+#define E1000_FCT      0x00030  /* Flow Control Type - RW */
+#define E1000_VET      0x00038  /* VLAN Ether Type - RW */
+#define E1000_ICR      0x000C0  /* Interrupt Cause Read - R/clr */
+#define E1000_ICS      0x000C8  /* Interrupt Cause Set - WO */
+#define E1000_IMS      0x000D0  /* Interrupt Mask Set - RW */
+#define E1000_IMC      0x000D8  /* Interrupt Mask Clear - WO */
+#define E1000_IAM      0x000E0  /* Interrupt Acknowledge Auto Mask */
+#define E1000_RCTL     0x00100  /* RX Control - RW */
+#define E1000_FCTTV    0x00170  /* Flow Control Transmit Timer Value - RW */
+#define E1000_TCTL     0x00400  /* TX Control - RW */
+#define E1000_TCTL_EXT 0x00404  /* Extended TX Control - RW */
+#define E1000_TIPG     0x00410  /* TX Inter-packet gap - RW */
+#define E1000_LEDCTL   0x00E00  /* LED Control - RW */
+#define E1000_EEMNGCTL 0x01010  /* MNG EEprom Control */
+#define E1000_EEMNGDATA    0x01014 /* MNG EEPROM Read/Write data */
+#define E1000_FLMNGCTL     0x01018 /* MNG Flash Control */
+#define E1000_FLMNGDATA    0x0101C /* MNG FLASH Read data */
+#define E1000_FLMNGCNT     0x01020 /* MNG FLASH Read Counter */
+#define E1000_EEARBC   0x01024  /* EEPROM Auto Read Bus Control */
+#define E1000_FLOP     0x0103C  /* FLASH Opcode Register */
+#define E1000_FCRTL    0x02160  /* Flow Control Receive Threshold Low - RW */
+#define E1000_FCRTL_A  0x00168  /* Alias to FCRTL */
+#define E1000_FCRTH    0x02168  /* Flow Control Receive Threshold High - RW */
+#define E1000_RDFH     0x02410  /* Receive Data FIFO Head Register - RW */
+#define E1000_RDFH_A   0x08000  /* Alias to RDFH */
+#define E1000_RDFT     0x02418  /* Receive Data FIFO Tail Register - RW */
+#define E1000_RDFT_A   0x08008  /* Alias to RDFT */
+#define E1000_RDFHS    0x02420  /* Receive Data FIFO Head Saved Register - RW */
+#define E1000_RDFTS    0x02428  /* Receive Data FIFO Tail Saved Register - RW */
+#define E1000_RDFPC    0x02430  /* Receive Data FIFO Packet Count - RW */
+#define E1000_TDFH     0x03410  /* TX Data FIFO Head - RW */
+#define E1000_TDFH_A   0x08010  /* Alias to TDFH */
+#define E1000_TDFT     0x03418  /* TX Data FIFO Tail - RW */
+#define E1000_TDFT_A   0x08018  /* Alias to TDFT */
+#define E1000_TDFHS    0x03420  /* TX Data FIFO Head Saved - RW */
+#define E1000_TDFTS    0x03428  /* TX Data FIFO Tail Saved - RW */
+#define E1000_TDFPC    0x03430  /* TX Data FIFO Packet Count - RW */
+#define E1000_CRCERRS  0x04000  /* CRC Error Count - R/clr */
+#define E1000_ALGNERRC 0x04004  /* Alignment Error Count - R/clr */
+#define E1000_SYMERRS  0x04008  /* Symbol Error Count - R/clr */
+#define E1000_RXERRC   0x0400C  /* Receive Error Count - R/clr */
+#define E1000_MPC      0x04010  /* Missed Packet Count - R/clr */
+#define E1000_SCC      0x04014  /* Single Collision Count - R/clr */
+#define E1000_ECOL     0x04018  /* Excessive Collision Count - R/clr */
+#define E1000_MCC      0x0401C  /* Multiple Collision Count - R/clr */
+#define E1000_LATECOL  0x04020  /* Late Collision Count - R/clr */
+#define E1000_COLC     0x04028  /* Collision Count - R/clr */
+#define E1000_DC       0x04030  /* Defer Count - R/clr */
+#define E1000_TNCRS    0x04034  /* TX-No CRS - R/clr */
+#define E1000_RLEC     0x04040  /* Receive Length Error Count - R/clr */
+#define E1000_XONRXC   0x04048  /* XON RX Count - R/clr */
+#define E1000_XONTXC   0x0404C  /* XON TX Count - R/clr */
+#define E1000_XOFFRXC  0x04050  /* XOFF RX Count - R/clr */
+#define E1000_XOFFTXC  0x04054  /* XOFF TX Count - R/clr */
+#define E1000_FCRUC    0x04058  /* Flow Control RX Unsupported Count- R/clr */
+#define E1000_PRC64    0x0405C  /* Packets RX (64 bytes) - R/clr */
+#define E1000_PRC127   0x04060  /* Packets RX (65-127 bytes) - R/clr */
+#define E1000_PRC255   0x04064  /* Packets RX (128-255 bytes) - R/clr */
+#define E1000_PRC511   0x04068  /* Packets RX (256-511 bytes) - R/clr */
+#define E1000_PRC1023  0x0406C  /* Packets RX (512-1023 bytes) - R/clr */
+#define E1000_PRC1522  0x04070  /* Packets RX (1024-1522 bytes) - R/clr */
+#define E1000_GPRC     0x04074  /* Good Packets RX Count - R/clr */
+#define E1000_BPRC     0x04078  /* Broadcast Packets RX Count - R/clr */
+#define E1000_MPRC     0x0407C  /* Multicast Packets RX Count - R/clr */
+#define E1000_GPTC     0x04080  /* Good Packets TX Count - R/clr */
+#define E1000_GORCL    0x04088  /* Good Octets RX Count Low - R/clr */
+#define E1000_GORCH    0x0408C  /* Good Octets RX Count High - R/clr */
+#define E1000_GOTCL    0x04090  /* Good Octets TX Count Low - R/clr */
+#define E1000_GOTCH    0x04094  /* Good Octets TX Count High - R/clr */
+#define E1000_RNBC     0x040A0  /* RX No Buffers Count - R/clr */
+#define E1000_RUC      0x040A4  /* RX Undersize Count - R/clr */
+#define E1000_RFC      0x040A8  /* RX Fragment Count - R/clr */
+#define E1000_ROC      0x040AC  /* RX Oversize Count - R/clr */
+#define E1000_RJC      0x040B0  /* RX Jabber Count - R/clr */
+#define E1000_MGTPRC   0x040B4  /* Management Packets RX Count - R/clr */
+#define E1000_MGTPDC   0x040B8  /* Management Packets Dropped Count - R/clr */
+#define E1000_MGTPTC   0x040BC  /* Management Packets TX Count - R/clr */
+#define E1000_TORL     0x040C0  /* Total Octets RX Low - R/clr */
+#define E1000_TORH     0x040C4  /* Total Octets RX High - R/clr */
+#define E1000_TOTL     0x040C8  /* Total Octets TX Low - R/clr */
+#define E1000_TOTH     0x040CC  /* Total Octets TX High - R/clr */
+#define E1000_TPR      0x040D0  /* Total Packets RX - R/clr */
+#define E1000_TPT      0x040D4  /* Total Packets TX - R/clr */
+#define E1000_PTC64    0x040D8  /* Packets TX (64 bytes) - R/clr */
+#define E1000_PTC127   0x040DC  /* Packets TX (65-127 bytes) - R/clr */
+#define E1000_PTC255   0x040E0  /* Packets TX (128-255 bytes) - R/clr */
+#define E1000_PTC511   0x040E4  /* Packets TX (256-511 bytes) - R/clr */
+#define E1000_PTC1023  0x040E8  /* Packets TX (512-1023 bytes) - R/clr */
+#define E1000_PTC1522  0x040EC  /* Packets TX (1024-1522 Bytes) - R/clr */
+#define E1000_MPTC     0x040F0  /* Multicast Packets TX Count - R/clr */
+#define E1000_BPTC     0x040F4  /* Broadcast Packets TX Count - R/clr */
+#define E1000_TSCTC    0x040F8  /* TCP Segmentation Context TX - R/clr */
+#define E1000_IAC      0x04100  /* Interrupt Assertion Count */
+#define E1000_ICRXPTC  0x04104  /* Interrupt Cause Rx Packet Timer Expire Count */
+#define E1000_ICRXDMTC 0x04120  /* Interrupt Cause Rx Descriptor Minimum Threshold Count */
+#define E1000_RXCSUM   0x05000  /* RX Checksum Control - RW */
+#define E1000_RFCTL    0x05008  /* Receive Filter Control*/
+#define E1000_MAVTV0   0x05010  /* Management VLAN TAG Value 0 */
+#define E1000_MAVTV1   0x05014  /* Management VLAN TAG Value 1 */
+#define E1000_MAVTV2   0x05018  /* Management VLAN TAG Value 2 */
+#define E1000_MAVTV3   0x0501c  /* Management VLAN TAG Value 3 */
+#define E1000_MTA      0x05200  /* Multicast Table Array - RW Array */
+#define E1000_RA       0x05400  /* Receive Address - RW Array */
+#define E1000_RA_A     0x00040  /* Alias to RA */
+#define E1000_VFTA     0x05600  /* VLAN Filter Table Array - RW Array */
+#define E1000_VFTA_A   0x00600  /* Alias to VFTA */
+#define E1000_WUC      0x05800  /* Wakeup Control - RW */
+#define E1000_WUFC     0x05808  /* Wakeup Filter Control - RW */
+#define E1000_WUS      0x05810  /* Wakeup Status - RO */
+#define E1000_MANC     0x05820  /* Management Control - RW */
+#define E1000_IPAV     0x05838  /* IP Address Valid - RW */
+#define E1000_IP4AT    0x05840  /* IPv4 Address Table - RW Array */
+#define E1000_IP6AT    0x05880  /* IPv6 Address Table - RW Array */
+#define E1000_WUPL     0x05900  /* Wakeup Packet Length - RW */
+#define E1000_WUPM     0x05A00  /* Wakeup Packet Memory - RO A */
+#define E1000_MFVAL    0x05824  /* Manageability Filters Valid - RW */
+#define E1000_MDEF     0x05890  /* Manageability Decision Filters - RW Array */
+#define E1000_FFMT     0x09000  /* Flexible Filter Mask Table - RW Array */
+#define E1000_FTFT     0x09400  /* Flexible TCO Filter Table - RW Array */
+
+#define E1000_MANC2H     0x05860 /* Management Control To Host - RW */
+#define E1000_SW_FW_SYNC 0x05B5C /* Software-Firmware Synchronization - RW */
+
+#define E1000_GCR       0x05B00 /* PCI-Ex Control */
+#define E1000_FUNCTAG   0x05B08 /* Function-Tag Register */
+#define E1000_GSCL_1    0x05B10 /* PCI-Ex Statistic Control #1 */
+#define E1000_GSCL_2    0x05B14 /* PCI-Ex Statistic Control #2 */
+#define E1000_GSCL_3    0x05B18 /* PCI-Ex Statistic Control #3 */
+#define E1000_GSCL_4    0x05B1C /* PCI-Ex Statistic Control #4 */
+#define E1000_GSCN_0    0x05B20 /* 3GIO Statistic Counter Register #0 */
+#define E1000_GSCN_1    0x05B24 /* 3GIO Statistic Counter Register #1 */
+#define E1000_GSCN_2    0x05B28 /* 3GIO Statistic Counter Register #2 */
+#define E1000_GSCN_3    0x05B2C /* 3GIO Statistic Counter Register #3 */
+#define E1000_FACTPS    0x05B30 /* Function Active and Power State to MNG */
+#define E1000_SWSM      0x05B50 /* SW Semaphore */
+#define E1000_FWSM      0x05B54 /* FW Semaphore */
+#define E1000_PBACLR    0x05B68 /* MSI-X PBA Clear */
+
+#define E1000_TSYNCRXCTL 0x0B620 /* Rx Time Sync Control register - RW */
+#define E1000_TSYNCTXCTL 0x0B614 /* Tx Time Sync Control register - RW */
+#define E1000_TIMINCA    0x0B608 /* Increment attributes register - RW */
+#define E1000_RXSTMPL    0x0B624 /* Rx timestamp Low - RO */
+#define E1000_RXSTMPH    0x0B628 /* Rx timestamp High - RO */
+#define E1000_TXSTMPL    0x0B618 /* Tx timestamp value Low - RO */
+#define E1000_TXSTMPH    0x0B61C /* Tx timestamp value High - RO */
+#define E1000_SYSTIML    0x0B600 /* System time register Low - RO */
+#define E1000_SYSTIMH    0x0B604 /* System time register High - RO */
+#define E1000_RXSATRL    0x0B62C /* Rx timestamp attribute low - RO */
+#define E1000_RXSATRH    0x0B630 /* Rx timestamp attribute high - RO */
+#define E1000_TIMADJL    0x0B60C /* Time Adjustment Offset register Low - RW */
+#define E1000_TIMADJH    0x0B610 /* Time Adjustment Offset register High - RW */
+
+/* RSS registers */
+#define E1000_MRQC      0x05818 /* Multiple Receive Control - RW */
+#define E1000_RETA      0x05C00 /* Redirection Table - RW Array */
+#define E1000_RSSRK     0x05C80 /* RSS Random Key - RW Array */
+
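+/*
+ * The redirection table is indexed by the low 7 bits of the RSS hash;
+ * each byte-sized entry selects the destination Rx queue for that hash.
+ */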
+#define E1000_RETA_IDX(hash)        ((hash) & (BIT(7) - 1))
+#define E1000_RETA_VAL(reta, hash)  (((uint8_t *)(reta))[E1000_RETA_IDX(hash)])
+
+#define E1000_MRQC_EN_TCPIPV4(mrqc) ((mrqc) & BIT(16))
+#define E1000_MRQC_EN_IPV4(mrqc)    ((mrqc) & BIT(17))
+#define E1000_MRQC_EN_TCPIPV6(mrqc) ((mrqc) & BIT(18))
+#define E1000_MRQC_EN_IPV6EX(mrqc)  ((mrqc) & BIT(19))
+#define E1000_MRQC_EN_IPV6(mrqc)    ((mrqc) & BIT(20))
+
+#define E1000_MRQ_RSS_TYPE_NONE     (0)
+#define E1000_MRQ_RSS_TYPE_IPV4TCP  (1)
+#define E1000_MRQ_RSS_TYPE_IPV4     (2)
+#define E1000_MRQ_RSS_TYPE_IPV6TCP  (3)
+#define E1000_MRQ_RSS_TYPE_IPV6EX   (4)
+#define E1000_MRQ_RSS_TYPE_IPV6     (5)
+
+#define E1000_ICR_ASSERTED BIT(31)
+#define E1000_EIAC_MASK    0x01F00000
+
+/* RFCTL register bits */
+#define E1000_RFCTL_ISCSI_DIS           0x00000001
+#define E1000_RFCTL_NFSW_DIS            0x00000040
+#define E1000_RFCTL_NFSR_DIS            0x00000080
+#define E1000_RFCTL_IPV6_DIS            0x00000400
+#define E1000_RFCTL_IPV6_XSUM_DIS       0x00000800
+#define E1000_RFCTL_IPFRSP_DIS          0x00004000
+#define E1000_RFCTL_EXTEN               0x00008000
+#define E1000_RFCTL_IPV6_EX_DIS         0x00010000
+#define E1000_RFCTL_NEW_IPV6_EXT_DIS    0x00020000
+
+/* TARC* parsing */
+#define E1000_TARC_ENABLE BIT(10)
+
+/* SW Semaphore Register */
+#define E1000_SWSM_SMBI         0x00000001 /* Driver Semaphore bit */
+#define E1000_SWSM_SWESMBI      0x00000002 /* FW Semaphore bit */
+#define E1000_SWSM_DRV_LOAD     0x00000008 /* Driver Loaded Bit */
+
+#define E1000_SWSM2_LOCK        0x00000002 /* Secondary driver semaphore bit */
+
+/* Interrupt Cause Read */
+#define E1000_ICR_TXDW          0x00000001 /* Transmit desc written back */
+#define E1000_ICR_TXQE          0x00000002 /* Transmit Queue empty */
+#define E1000_ICR_LSC           0x00000004 /* Link Status Change */
+#define E1000_ICR_RXSEQ         0x00000008 /* rx sequence error */
+#define E1000_ICR_RXDMT0        0x00000010 /* rx desc min. threshold (0) */
+#define E1000_ICR_RXO           0x00000040 /* rx overrun */
+#define E1000_ICR_RXT0          0x00000080 /* rx timer intr (ring 0) */
+#define E1000_ICR_MDAC          0x00000200 /* MDIO access complete */
+#define E1000_ICR_RXCFG         0x00000400 /* RX /c/ ordered set */
+#define E1000_ICR_GPI_EN0       0x00000800 /* GP Int 0 */
+#define E1000_ICR_GPI_EN1       0x00001000 /* GP Int 1 */
+#define E1000_ICR_GPI_EN2       0x00002000 /* GP Int 2 */
+#define E1000_ICR_GPI_EN3       0x00004000 /* GP Int 3 */
+#define E1000_ICR_TXD_LOW       0x00008000
+#define E1000_ICR_SRPD          0x00010000
+#define E1000_ICR_ACK           0x00020000 /* Receive Ack frame */
+#define E1000_ICR_MNG           0x00040000 /* Manageability event */
+#define E1000_ICR_DOCK          0x00080000 /* Dock/Undock */
+#define E1000_ICR_INT_ASSERTED  0x80000000 /* If this bit is asserted, the driver should claim the interrupt */
+#define E1000_ICR_RXD_FIFO_PAR0 0x00100000 /* queue 0 Rx descriptor FIFO parity error */
+#define E1000_ICR_TXD_FIFO_PAR0 0x00200000 /* queue 0 Tx descriptor FIFO parity error */
+#define E1000_ICR_HOST_ARB_PAR  0x00400000 /* host arb read buffer parity error */
+#define E1000_ICR_PB_PAR        0x00800000 /* packet buffer parity error */
+#define E1000_ICR_RXD_FIFO_PAR1 0x01000000 /* queue 1 Rx descriptor FIFO parity error */
+#define E1000_ICR_TXD_FIFO_PAR1 0x02000000 /* queue 1 Tx descriptor FIFO parity error */
+#define E1000_ICR_ALL_PARITY    0x03F00000 /* all parity error bits */
+#define E1000_ICR_DSW           0x00000020 /* FW changed the status of DISSW bit in the FWSM */
+#define E1000_ICR_PHYINT        0x00001000 /* LAN connected device generates an interrupt */
+#define E1000_ICR_EPRST         0x00100000 /* ME hardware reset occurs */
+#define E1000_ICR_RXQ0          0x00100000 /* Rx Queue 0 Interrupt */
+#define E1000_ICR_RXQ1          0x00200000 /* Rx Queue 1 Interrupt */
+#define E1000_ICR_TXQ0          0x00400000 /* Tx Queue 0 Interrupt */
+#define E1000_ICR_TXQ1          0x00800000 /* Tx Queue 1 Interrupt */
+#define E1000_ICR_OTHER         0x01000000 /* Other Interrupts */
+
+#define E1000_ICR_OTHER_CAUSES (E1000_ICR_LSC  | \
+                                E1000_ICR_RXO  | \
+                                E1000_ICR_MDAC | \
+                                E1000_ICR_SRPD | \
+                                E1000_ICR_ACK  | \
+                                E1000_ICR_MNG)
+
+/* Interrupt Cause Set */
+#define E1000_ICS_TXDW      E1000_ICR_TXDW      /* Transmit desc written back */
+#define E1000_ICS_TXQE      E1000_ICR_TXQE      /* Transmit Queue empty */
+#define E1000_ICS_LSC       E1000_ICR_LSC       /* Link Status Change */
+#define E1000_ICS_RXSEQ     E1000_ICR_RXSEQ     /* rx sequence error */
+#define E1000_ICS_RXDMT0    E1000_ICR_RXDMT0    /* rx desc min. threshold */
+#define E1000_ICS_RXO       E1000_ICR_RXO       /* rx overrun */
+#define E1000_ICS_RXT0      E1000_ICR_RXT0      /* rx timer intr */
+#define E1000_ICS_MDAC      E1000_ICR_MDAC      /* MDIO access complete */
+#define E1000_ICS_RXCFG     E1000_ICR_RXCFG     /* RX /c/ ordered set */
+#define E1000_ICS_GPI_EN0   E1000_ICR_GPI_EN0   /* GP Int 0 */
+#define E1000_ICS_GPI_EN1   E1000_ICR_GPI_EN1   /* GP Int 1 */
+#define E1000_ICS_GPI_EN2   E1000_ICR_GPI_EN2   /* GP Int 2 */
+#define E1000_ICS_GPI_EN3   E1000_ICR_GPI_EN3   /* GP Int 3 */
+#define E1000_ICS_TXD_LOW   E1000_ICR_TXD_LOW
+#define E1000_ICS_SRPD      E1000_ICR_SRPD
+#define E1000_ICS_ACK       E1000_ICR_ACK       /* Receive Ack frame */
+#define E1000_ICS_MNG       E1000_ICR_MNG       /* Manageability event */
+#define E1000_ICS_DOCK      E1000_ICR_DOCK      /* Dock/Undock */
+#define E1000_ICS_RXD_FIFO_PAR0 E1000_ICR_RXD_FIFO_PAR0 /* queue 0 Rx descriptor FIFO parity error */
+#define E1000_ICS_TXD_FIFO_PAR0 E1000_ICR_TXD_FIFO_PAR0 /* queue 0 Tx descriptor FIFO parity error */
+#define E1000_ICS_HOST_ARB_PAR  E1000_ICR_HOST_ARB_PAR  /* host arb read buffer parity error */
+#define E1000_ICS_PB_PAR        E1000_ICR_PB_PAR        /* packet buffer parity error */
+#define E1000_ICS_RXD_FIFO_PAR1 E1000_ICR_RXD_FIFO_PAR1 /* queue 1 Rx descriptor FIFO parity error */
+#define E1000_ICS_TXD_FIFO_PAR1 E1000_ICR_TXD_FIFO_PAR1 /* queue 1 Tx descriptor FIFO parity error */
+#define E1000_ICS_DSW       E1000_ICR_DSW
+#define E1000_ICS_PHYINT    E1000_ICR_PHYINT
+#define E1000_ICS_EPRST     E1000_ICR_EPRST
+
+/* Interrupt Mask Set */
+#define E1000_IMS_TXDW      E1000_ICR_TXDW      /* Transmit desc written back */
+#define E1000_IMS_TXQE      E1000_ICR_TXQE      /* Transmit Queue empty */
+#define E1000_IMS_LSC       E1000_ICR_LSC       /* Link Status Change */
+#define E1000_IMS_RXSEQ     E1000_ICR_RXSEQ     /* rx sequence error */
+#define E1000_IMS_RXDMT0    E1000_ICR_RXDMT0    /* rx desc min. threshold */
+#define E1000_IMS_RXO       E1000_ICR_RXO       /* rx overrun */
+#define E1000_IMS_RXT0      E1000_ICR_RXT0      /* rx timer intr */
+#define E1000_IMS_MDAC      E1000_ICR_MDAC      /* MDIO access complete */
+#define E1000_IMS_RXCFG     E1000_ICR_RXCFG     /* RX /c/ ordered set */
+#define E1000_IMS_GPI_EN0   E1000_ICR_GPI_EN0   /* GP Int 0 */
+#define E1000_IMS_GPI_EN1   E1000_ICR_GPI_EN1   /* GP Int 1 */
+#define E1000_IMS_GPI_EN2   E1000_ICR_GPI_EN2   /* GP Int 2 */
+#define E1000_IMS_GPI_EN3   E1000_ICR_GPI_EN3   /* GP Int 3 */
+#define E1000_IMS_TXD_LOW   E1000_ICR_TXD_LOW
+#define E1000_IMS_SRPD      E1000_ICR_SRPD
+#define E1000_IMS_ACK       E1000_ICR_ACK       /* Receive Ack frame */
+#define E1000_IMS_MNG       E1000_ICR_MNG       /* Manageability event */
+#define E1000_IMS_RXQ0      E1000_ICR_RXQ0
+#define E1000_IMS_RXQ1      E1000_ICR_RXQ1
+#define E1000_IMS_TXQ0      E1000_ICR_TXQ0
+#define E1000_IMS_TXQ1      E1000_ICR_TXQ1
+#define E1000_IMS_OTHER     E1000_ICR_OTHER
+#define E1000_IMS_DOCK      E1000_ICR_DOCK      /* Dock/Undock */
+#define E1000_IMS_RXD_FIFO_PAR0 E1000_ICR_RXD_FIFO_PAR0 /* queue 0 Rx descriptor FIFO parity error */
+#define E1000_IMS_TXD_FIFO_PAR0 E1000_ICR_TXD_FIFO_PAR0 /* queue 0 Tx descriptor FIFO parity error */
+#define E1000_IMS_HOST_ARB_PAR  E1000_ICR_HOST_ARB_PAR  /* host arb read buffer parity error */
+#define E1000_IMS_PB_PAR        E1000_ICR_PB_PAR        /* packet buffer parity error */
+#define E1000_IMS_RXD_FIFO_PAR1 E1000_ICR_RXD_FIFO_PAR1 /* queue 1 Rx descriptor FIFO parity error */
+#define E1000_IMS_TXD_FIFO_PAR1 E1000_ICR_TXD_FIFO_PAR1 /* queue 1 Tx descriptor FIFO parity error */
+#define E1000_IMS_DSW       E1000_ICR_DSW
+#define E1000_IMS_PHYINT    E1000_ICR_PHYINT
+#define E1000_IMS_EPRST     E1000_ICR_EPRST
+
+/* Interrupt Mask Clear */
+#define E1000_IMC_TXDW      E1000_ICR_TXDW      /* Transmit desc written back */
+#define E1000_IMC_TXQE      E1000_ICR_TXQE      /* Transmit Queue empty */
+#define E1000_IMC_LSC       E1000_ICR_LSC       /* Link Status Change */
+#define E1000_IMC_RXSEQ     E1000_ICR_RXSEQ     /* rx sequence error */
+#define E1000_IMC_RXDMT0    E1000_ICR_RXDMT0    /* rx desc min. threshold */
+#define E1000_IMC_RXO       E1000_ICR_RXO       /* rx overrun */
+#define E1000_IMC_RXT0      E1000_ICR_RXT0      /* rx timer intr */
+#define E1000_IMC_MDAC      E1000_ICR_MDAC      /* MDIO access complete */
+#define E1000_IMC_RXCFG     E1000_ICR_RXCFG     /* RX /c/ ordered set */
+#define E1000_IMC_GPI_EN0   E1000_ICR_GPI_EN0   /* GP Int 0 */
+#define E1000_IMC_GPI_EN1   E1000_ICR_GPI_EN1   /* GP Int 1 */
+#define E1000_IMC_GPI_EN2   E1000_ICR_GPI_EN2   /* GP Int 2 */
+#define E1000_IMC_GPI_EN3   E1000_ICR_GPI_EN3   /* GP Int 3 */
+#define E1000_IMC_TXD_LOW   E1000_ICR_TXD_LOW
+#define E1000_IMC_SRPD      E1000_ICR_SRPD
+#define E1000_IMC_ACK       E1000_ICR_ACK       /* Receive Ack frame */
+#define E1000_IMC_MNG       E1000_ICR_MNG       /* Manageability event */
+#define E1000_IMC_DOCK      E1000_ICR_DOCK      /* Dock/Undock */
+#define E1000_IMC_RXD_FIFO_PAR0 E1000_ICR_RXD_FIFO_PAR0 /* queue 0 Rx descriptor FIFO parity error */
+#define E1000_IMC_TXD_FIFO_PAR0 E1000_ICR_TXD_FIFO_PAR0 /* queue 0 Tx descriptor FIFO parity error */
+#define E1000_IMC_HOST_ARB_PAR  E1000_ICR_HOST_ARB_PAR  /* host arb read buffer parity error */
+#define E1000_IMC_PB_PAR        E1000_ICR_PB_PAR        /* packet buffer parity error */
+#define E1000_IMC_RXD_FIFO_PAR1 E1000_ICR_RXD_FIFO_PAR1 /* queue 1 Rx descriptor FIFO parity error */
+#define E1000_IMC_TXD_FIFO_PAR1 E1000_ICR_TXD_FIFO_PAR1 /* queue 1 Tx descriptor FIFO parity error */
+#define E1000_IMC_DSW       E1000_ICR_DSW
+#define E1000_IMC_PHYINT    E1000_ICR_PHYINT
+#define E1000_IMC_EPRST     E1000_ICR_EPRST
+
+/* Receive Control */
+#define E1000_RCTL_RST            0x00000001    /* Software reset */
+#define E1000_RCTL_EN             0x00000002    /* enable */
+#define E1000_RCTL_SBP            0x00000004    /* store bad packet */
+#define E1000_RCTL_UPE            0x00000008    /* unicast promiscuous enable */
+#define E1000_RCTL_MPE            0x00000010    /* multicast promiscuous enable */
+#define E1000_RCTL_LPE            0x00000020    /* long packet enable */
+#define E1000_RCTL_LBM_NO         0x00000000    /* no loopback mode */
+#define E1000_RCTL_LBM_MAC        0x00000040    /* MAC loopback mode */
+#define E1000_RCTL_LBM_SLP        0x00000080    /* serial link loopback mode */
+#define E1000_RCTL_LBM_TCVR       0x000000C0    /* tcvr loopback mode */
+#define E1000_RCTL_DTYP_MASK      0x00000C00    /* Descriptor type mask */
+#define E1000_RCTL_DTYP_PS        0x00000400    /* Packet Split descriptor */
+#define E1000_RCTL_RDMTS_HALF     0x00000000    /* rx desc min threshold size */
+#define E1000_RCTL_RDMTS_QUAT     0x00000100    /* rx desc min threshold size */
+#define E1000_RCTL_RDMTS_EIGTH    0x00000200    /* rx desc min threshold size */
+#define E1000_RCTL_MO_SHIFT       12            /* multicast offset shift */
+#define E1000_RCTL_MO_0           0x00000000    /* multicast offset 11:0 */
+#define E1000_RCTL_MO_1           0x00001000    /* multicast offset 12:1 */
+#define E1000_RCTL_MO_2           0x00002000    /* multicast offset 13:2 */
+#define E1000_RCTL_MO_3           0x00003000    /* multicast offset 15:4 */
+#define E1000_RCTL_MDR            0x00004000    /* multicast desc ring 0 */
+#define E1000_RCTL_BAM            0x00008000    /* broadcast enable */
+/* these buffer sizes are valid if E1000_RCTL_BSEX is 0 */
+#define E1000_RCTL_SZ_2048        0x00000000    /* rx buffer size 2048 */
+#define E1000_RCTL_SZ_1024        0x00010000    /* rx buffer size 1024 */
+#define E1000_RCTL_SZ_512         0x00020000    /* rx buffer size 512 */
+#define E1000_RCTL_SZ_256         0x00030000    /* rx buffer size 256 */
+/* these buffer sizes are valid if E1000_RCTL_BSEX is 1 */
+#define E1000_RCTL_SZ_16384       0x00010000    /* rx buffer size 16384 */
+#define E1000_RCTL_SZ_8192        0x00020000    /* rx buffer size 8192 */
+#define E1000_RCTL_SZ_4096        0x00030000    /* rx buffer size 4096 */
+#define E1000_RCTL_VFE            0x00040000    /* vlan filter enable */
+#define E1000_RCTL_CFIEN          0x00080000    /* canonical form enable */
+#define E1000_RCTL_CFI            0x00100000    /* canonical form indicator */
+#define E1000_RCTL_DPF            0x00400000    /* discard pause frames */
+#define E1000_RCTL_PMCF           0x00800000    /* pass MAC control frames */
+#define E1000_RCTL_BSEX           0x02000000    /* Buffer size extension */
+#define E1000_RCTL_SECRC          0x04000000    /* Strip Ethernet CRC */
+#define E1000_RCTL_FLXBUF_MASK    0x78000000    /* Flexible buffer size */
+#define E1000_RCTL_FLXBUF_SHIFT   27            /* Flexible buffer shift */
+
+
+#define E1000_EEPROM_SWDPIN0   0x0001   /* SWDPIN 0 EEPROM Value */
+#define E1000_EEPROM_LED_LOGIC 0x0020   /* Led Logic Word */
+#define E1000_EEPROM_RW_REG_DATA   16   /* Offset to data in EEPROM read/write registers */
+#define E1000_EEPROM_RW_REG_DONE   0x10 /* Offset to READ/WRITE done bit */
+#define E1000_EEPROM_RW_REG_START  1    /* First bit for telling part to start operation */
+#define E1000_EEPROM_RW_ADDR_SHIFT 8    /* Shift to the address bits */
+#define E1000_EEPROM_POLL_WRITE    1    /* Flag for polling for write complete */
+#define E1000_EEPROM_POLL_READ     0    /* Flag for polling for read complete */
+
+/* 82574 EERD/EEWR registers layout */
+#define E1000_EERW_START        BIT(0)
+#define E1000_EERW_DONE         BIT(1)
+#define E1000_EERW_ADDR_SHIFT   2
+#define E1000_EERW_ADDR_MASK    ((1L << 14) - 1)
+#define E1000_EERW_DATA_SHIFT   16
+#define E1000_EERW_DATA_MASK   ((1L << 16) - 1)
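+/*
+ * A transaction starts when the word address (at E1000_EERW_ADDR_SHIFT) is
+ * written together with E1000_EERW_START; the device then sets
+ * E1000_EERW_DONE and, for reads, returns the word at E1000_EERW_DATA_SHIFT.
+ */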
+
+/* Register Bit Masks */
+/* Device Control */
+#define E1000_CTRL_FD       0x00000001  /* Full duplex.0=half; 1=full */
+#define E1000_CTRL_BEM      0x00000002  /* Endian Mode.0=little,1=big */
+#define E1000_CTRL_PRIOR    0x00000004  /* Priority on PCI. 0=rx,1=fair */
+#define E1000_CTRL_GIO_MASTER_DISABLE 0x00000004 /* Blocks new Master requests */
+#define E1000_CTRL_LRST     0x00000008  /* Link reset. 0=normal,1=reset */
+#define E1000_CTRL_TME      0x00000010  /* Test mode. 0=normal,1=test */
+#define E1000_CTRL_SLE      0x00000020  /* Serial Link on 0=dis,1=en */
+#define E1000_CTRL_ASDE     0x00000020  /* Auto-speed detect enable */
+#define E1000_CTRL_SLU      0x00000040  /* Set link up (Force Link) */
+#define E1000_CTRL_ILOS     0x00000080  /* Invert Loss-Of Signal */
+#define E1000_CTRL_SPD_SEL  0x00000300  /* Speed Select Mask */
+#define E1000_CTRL_SPD_10   0x00000000  /* Force 10Mb */
+#define E1000_CTRL_SPD_100  0x00000100  /* Force 100Mb */
+#define E1000_CTRL_SPD_1000 0x00000200  /* Force 1Gb */
+#define E1000_CTRL_BEM32    0x00000400  /* Big Endian 32 mode */
+#define E1000_CTRL_FRCSPD   0x00000800  /* Force Speed */
+#define E1000_CTRL_FRCDPX   0x00001000  /* Force Duplex */
+#define E1000_CTRL_D_UD_EN  0x00002000  /* Dock/Undock enable */
+#define E1000_CTRL_D_UD_POLARITY 0x00004000 /* Defined polarity of Dock/Undock indication in SDP[0] */
+#define E1000_CTRL_FORCE_PHY_RESET 0x00008000 /* Reset both PHY ports, through PHYRST_N pin */
+#define E1000_CTRL_SPD_SHIFT 8          /* Speed Select Shift */
+
+#define E1000_CTRL_EXT_ASDCHK  0x00001000 /* auto speed detection check */
+#define E1000_CTRL_EXT_EE_RST  0x00002000 /* EEPROM reset */
+#define E1000_CTRL_EXT_LINK_EN 0x00010000 /* enable link status from external LINK_0 and LINK_1 pins */
+#define E1000_CTRL_EXT_DRV_LOAD 0x10000000 /* Driver loaded bit for FW */
+#define E1000_CTRL_EXT_EIAME   0x01000000
+#define E1000_CTRL_EXT_IAME    0x08000000 /* Int ACK Auto-mask */
+#define E1000_CTRL_EXT_PBA_CLR 0x80000000 /* PBA Clear */
+#define E1000_CTRL_EXT_INT_TIMERS_CLEAR_ENA 0x20000000
+#define E1000_CTRL_EXT_SPD_BYPS  0x00008000 /* Speed Select Bypass */
+
+#define E1000_CTRL_SWDPIN0  0x00040000  /* SWDPIN 0 value */
+#define E1000_CTRL_SWDPIN1  0x00080000  /* SWDPIN 1 value */
+#define E1000_CTRL_SWDPIN2  0x00100000  /* SWDPIN 2 value */
+#define E1000_CTRL_SWDPIN3  0x00200000  /* SWDPIN 3 value */
+#define E1000_CTRL_SWDPIO0  0x00400000  /* SWDPIN 0 Input or output */
+#define E1000_CTRL_SWDPIO1  0x00800000  /* SWDPIN 1 input or output */
+#define E1000_CTRL_SWDPIO2  0x01000000  /* SWDPIN 2 input or output */
+#define E1000_CTRL_SWDPIO3  0x02000000  /* SWDPIN 3 input or output */
+#define E1000_CTRL_ADVD3WUC 0x00100000  /* D3 WUC */
+#define E1000_CTRL_RST      0x04000000  /* Global reset */
+#define E1000_CTRL_RFCE     0x08000000  /* Receive Flow Control enable */
+#define E1000_CTRL_TFCE     0x10000000  /* Transmit flow control enable */
+#define E1000_CTRL_RTE      0x20000000  /* Routing tag enable */
+#define E1000_CTRL_VME      0x40000000  /* IEEE VLAN mode enable */
+#define E1000_CTRL_PHY_RST  0x80000000  /* PHY Reset */
+#define E1000_CTRL_SW2FW_INT 0x02000000  /* Initiate an interrupt to manageability engine */
+
+/* Device Status */
+#define E1000_STATUS_FD                 0x00000001 /* Full duplex.0=half,1=full */
+#define E1000_STATUS_LU                 0x00000002 /* Link up.0=no,1=link */
+#define E1000_STATUS_SPEED_10           0x00000000 /* Speed 10Mb/s */
+#define E1000_STATUS_SPEED_100          0x00000040 /* Speed 100Mb/s */
+#define E1000_STATUS_SPEED_1000         0x00000080 /* Speed 1000Mb/s */
+#define E1000_STATUS_PHYRA              0x00000400 /* PHY Reset Asserted */
+#define E1000_STATUS_GIO_MASTER_ENABLE  0x00080000
+
+/* EEPROM/Flash Control */
+#define E1000_EECD_SK        0x00000001 /* EEPROM Clock */
+#define E1000_EECD_CS        0x00000002 /* EEPROM Chip Select */
+#define E1000_EECD_DI        0x00000004 /* EEPROM Data In */
+#define E1000_EECD_DO        0x00000008 /* EEPROM Data Out */
+#define E1000_EECD_FWE_MASK  0x00000030
+#define E1000_EECD_FWE_DIS   0x00000010 /* Disable FLASH writes */
+#define E1000_EECD_FWE_EN    0x00000020 /* Enable FLASH writes */
+#define E1000_EECD_FWE_SHIFT 4
+#define E1000_EECD_REQ       0x00000040 /* EEPROM Access Request */
+#define E1000_EECD_GNT       0x00000080 /* EEPROM Access Grant */
+#define E1000_EECD_PRES      0x00000100 /* EEPROM Present */
+#define E1000_EECD_SIZE      0x00000200 /* EEPROM Size (0=64 word 1=256 word) */
+#define E1000_EECD_ADDR_BITS 0x00000400 /* EEPROM Addressing bits based on type
+                                         * (0-small, 1-large) */
+#define E1000_EECD_TYPE      0x00002000 /* EEPROM Type (1-SPI, 0-Microwire) */
+#ifndef E1000_EEPROM_GRANT_ATTEMPTS
+#define E1000_EEPROM_GRANT_ATTEMPTS 1000 /* EEPROM # attempts to gain grant */
+#endif
+#define E1000_EECD_AUTO_RD          0x00000200  /* EEPROM Auto Read done */
+#define E1000_EECD_SIZE_EX_MASK     0x00007800  /* EEprom Size */
+#define E1000_EECD_SIZE_EX_SHIFT    11
+#define E1000_EECD_NVADDS    0x00018000 /* NVM Address Size */
+#define E1000_EECD_SELSHAD   0x00020000 /* Select Shadow RAM */
+#define E1000_EECD_INITSRAM  0x00040000 /* Initialize Shadow RAM */
+#define E1000_EECD_FLUPD     0x00080000 /* Update FLASH */
+#define E1000_EECD_AUPDEN    0x00100000 /* Enable Autonomous FLASH update */
+#define E1000_EECD_SHADV     0x00200000 /* Shadow RAM Data Valid */
+#define E1000_EECD_SEC1VAL   0x00400000 /* Sector One Valid */
+
+
+#define E1000_EECD_SECVAL_SHIFT      22
+#define E1000_STM_OPCODE     0xDB00
+#define E1000_HICR_FW_RESET  0xC0
+
+#define E1000_SHADOW_RAM_WORDS     2048
+#define E1000_ICH_NVM_SIG_WORD     0x13
+#define E1000_ICH_NVM_SIG_MASK     0xC0
+
+/* MDI Control */
+#define E1000_MDIC_DATA_MASK 0x0000FFFF
+#define E1000_MDIC_REG_MASK  0x001F0000
+#define E1000_MDIC_REG_SHIFT 16
+#define E1000_MDIC_PHY_MASK  0x03E00000
+#define E1000_MDIC_PHY_SHIFT 21
+#define E1000_MDIC_OP_WRITE  0x04000000
+#define E1000_MDIC_OP_READ   0x08000000
+#define E1000_MDIC_READY     0x10000000
+#define E1000_MDIC_INT_EN    0x20000000
+#define E1000_MDIC_ERROR     0x40000000
+
+/* Rx Interrupt Delay Timer */
+#define E1000_RDTR_FPD       BIT(31)
+
+/* Tx Interrupt Delay Timer */
+#define E1000_TIDV_FPD       BIT(31)
+
+/* Delay increments in nanoseconds for delayed interrupts registers */
+#define E1000_INTR_DELAY_NS_RES (1024)
+
+/* Delay increments in nanoseconds for interrupt throttling registers */
+#define E1000_INTR_THROTTLING_NS_RES (256)
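+/* e.g. a TIDV value of 8 delays the interrupt by 8 * 1024 ns, while an
+ * ITR value of 8 spaces interrupts at least 8 * 256 ns apart. */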
+
+/* EEPROM Commands - Microwire */
+#define EEPROM_READ_OPCODE_MICROWIRE  0x6  /* EEPROM read opcode */
+#define EEPROM_WRITE_OPCODE_MICROWIRE 0x5  /* EEPROM write opcode */
+#define EEPROM_ERASE_OPCODE_MICROWIRE 0x7  /* EEPROM erase opcode */
+#define EEPROM_EWEN_OPCODE_MICROWIRE  0x13 /* EEPROM erase/write enable */
+#define EEPROM_EWDS_OPCODE_MICROWIRE  0x10 /* EEPROM erase/write disable */
+
+/* EEPROM Word Offsets */
+#define EEPROM_COMPAT                 0x0003
+#define EEPROM_ID_LED_SETTINGS        0x0004
+#define EEPROM_VERSION                0x0005
+#define EEPROM_SERDES_AMPLITUDE       0x0006 /* For SERDES output amplitude adjustment. */
+#define EEPROM_PHY_CLASS_WORD         0x0007
+#define EEPROM_INIT_CONTROL1_REG      0x000A
+#define EEPROM_INIT_CONTROL2_REG      0x000F
+#define EEPROM_SWDEF_PINS_CTRL_PORT_1 0x0010
+#define EEPROM_INIT_CONTROL3_PORT_B   0x0014
+#define EEPROM_INIT_3GIO_3            0x001A
+#define EEPROM_SWDEF_PINS_CTRL_PORT_0 0x0020
+#define EEPROM_INIT_CONTROL3_PORT_A   0x0024
+#define EEPROM_CFG                    0x0012
+#define EEPROM_FLASH_VERSION          0x0032
+#define EEPROM_CHECKSUM_REG           0x003F
+
+#define E1000_EEPROM_CFG_DONE         0x00040000   /* MNG config cycle done */
+#define E1000_EEPROM_CFG_DONE_PORT_1  0x00080000   /* ...for second port */
+
+/* HH Time Sync */
+#define E1000_TSYNCTXCTL_MAX_ALLOWED_DLY_MASK 0x0000F000 /* max delay */
+#define E1000_TSYNCTXCTL_SYNC_COMP            0x40000000 /* sync complete */
+#define E1000_TSYNCTXCTL_START_SYNC           0x80000000 /* initiate sync */
+
+#define E1000_TSYNCTXCTL_VALID                0x00000001 /* Tx timestamp valid */
+#define E1000_TSYNCTXCTL_ENABLED              0x00000010 /* enable Tx timestamping */
+
+#define E1000_TSYNCRXCTL_VALID                0x00000001 /* Rx timestamp valid */
+#define E1000_TSYNCRXCTL_TYPE_MASK            0x0000000E /* Rx type mask */
+#define E1000_TSYNCRXCTL_TYPE_L2_V2           0x00
+#define E1000_TSYNCRXCTL_TYPE_L4_V1           0x02
+#define E1000_TSYNCRXCTL_TYPE_L2_L4_V2        0x04
+#define E1000_TSYNCRXCTL_TYPE_ALL             0x08
+#define E1000_TSYNCRXCTL_TYPE_EVENT_V2        0x0A
+#define E1000_TSYNCRXCTL_ENABLED              0x00000010 /* enable Rx timestamping */
+#define E1000_TSYNCRXCTL_SYSCFI               0x00000020 /* Sys clock frequency */
+
+#define E1000_RXMTRL_PTP_V1_SYNC_MESSAGE      0x00000000
+#define E1000_RXMTRL_PTP_V1_DELAY_REQ_MESSAGE 0x00010000
+
+#define E1000_RXMTRL_PTP_V2_SYNC_MESSAGE      0x00000000
+#define E1000_RXMTRL_PTP_V2_DELAY_REQ_MESSAGE 0x01000000
+
+#define E1000_TIMINCA_INCPERIOD_SHIFT         24
+#define E1000_TIMINCA_INCVALUE_MASK           0x00FFFFFF
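+/* The top 8 bits of TIMINCA hold the increment period and the low 24 bits
+ * the value added to SYSTIM each period; the emulation scales the virtual
+ * clock by incvalue / (incperiod * 16) (see e1000x_timestamp()). */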
+
+/* PCI Express Control */
+/* 3GIO Control Register - GCR (0x05B00; RW) */
+#define E1000_L0S_ADJUST              (1 << 9)
+#define E1000_L1_ENTRY_LATENCY_MSB    (1 << 23)
+#define E1000_L1_ENTRY_LATENCY_LSB    (1 << 25 | 1 << 26)
+
+#define E1000_GCR_RO_BITS             (1 << 23 | 1 << 25 | 1 << 26)
+
+/* MSI-X PBA Clear register */
+#define E1000_PBACLR_VALID_MASK       (BIT(5) - 1)
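+/* Writing 1 clears the corresponding MSI-X pending bit; only the low five
+ * bits, one per vector, are implemented. */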
+
+/* Transmit Descriptor bit definitions */
+#define E1000_TXD_DTYP_D     0x00100000 /* Data Descriptor */
+#define E1000_TXD_DTYP_C     0x00000000 /* Context Descriptor */
+#define E1000_TXD_CMD_EOP    0x01000000 /* End of Packet */
+#define E1000_TXD_CMD_IFCS   0x02000000 /* Insert FCS (Ethernet CRC) */
+#define E1000_TXD_CMD_IC     0x04000000 /* Insert Checksum */
+#define E1000_TXD_CMD_RS     0x08000000 /* Report Status */
+#define E1000_TXD_CMD_RPS    0x10000000 /* Report Packet Sent */
+#define E1000_TXD_CMD_DEXT   0x20000000 /* Descriptor extension (0 = legacy) */
+#define E1000_TXD_CMD_VLE    0x40000000 /* Add VLAN tag */
+#define E1000_TXD_CMD_IDE    0x80000000 /* Enable Tidv register */
+#define E1000_TXD_STAT_DD    0x00000001 /* Descriptor Done */
+#define E1000_TXD_STAT_EC    0x00000002 /* Excess Collisions */
+#define E1000_TXD_STAT_LC    0x00000004 /* Late Collisions */
+#define E1000_TXD_STAT_TU    0x00000008 /* Transmit underrun */
+#define E1000_TXD_CMD_TCP    0x01000000 /* TCP packet */
+#define E1000_TXD_CMD_IP     0x02000000 /* IP packet */
+#define E1000_TXD_CMD_TSE    0x04000000 /* TCP Seg enable */
+#define E1000_TXD_CMD_SNAP   0x40000000 /* Update SNAP header */
+#define E1000_TXD_STAT_TC    0x00000004 /* Tx Underrun */
+#define E1000_TXD_EXTCMD_TSTAMP 0x00000010 /* IEEE1588 Timestamp packet */
+
+/* Transmit Control */
+#define E1000_TCTL_RST    0x00000001    /* software reset */
+#define E1000_TCTL_EN     0x00000002    /* enable tx */
+#define E1000_TCTL_BCE    0x00000004    /* busy check enable */
+#define E1000_TCTL_PSP    0x00000008    /* pad short packets */
+#define E1000_TCTL_CT     0x00000ff0    /* collision threshold */
+#define E1000_TCTL_COLD   0x003ff000    /* collision distance */
+#define E1000_TCTL_SWXOFF 0x00400000    /* SW Xoff transmission */
+#define E1000_TCTL_PBE    0x00800000    /* Packet Burst Enable */
+#define E1000_TCTL_RTLC   0x01000000    /* Re-transmit on late collision */
+#define E1000_TCTL_NRTU   0x02000000    /* No Re-transmit on underrun */
+#define E1000_TCTL_MULR   0x10000000    /* Multiple request support */
+
+/* Legacy Receive Descriptor */
+struct e1000_rx_desc {
+    uint64_t buffer_addr; /* Address of the descriptor's data buffer */
+    uint16_t length;     /* Length of data DMAed into data buffer */
+    uint16_t csum;       /* Packet checksum */
+    uint8_t status;      /* Descriptor status */
+    uint8_t errors;      /* Descriptor Errors */
+    uint16_t special;
+};
+
+/* Extended Receive Descriptor */
+union e1000_rx_desc_extended {
+    struct {
+        uint64_t buffer_addr;
+        uint64_t reserved;
+    } read;
+    struct {
+        struct {
+            uint32_t mrq;           /* Multiple Rx Queues */
+            union {
+                uint32_t rss;       /* RSS Hash */
+                struct {
+                    uint16_t ip_id; /* IP id */
+                    uint16_t csum;  /* Packet Checksum */
+                } csum_ip;
+            } hi_dword;
+        } lower;
+        struct {
+            uint32_t status_error;  /* ext status/error */
+            uint16_t length;
+            uint16_t vlan;          /* VLAN tag */
+        } upper;
+    } wb;                           /* writeback */
+};
+
+#define MAX_PS_BUFFERS 4
+
+/* Number of packet split data buffers (not including the header buffer) */
+#define PS_PAGE_BUFFERS    (MAX_PS_BUFFERS - 1)
+
+/* Receive Descriptor - Packet Split */
+union e1000_rx_desc_packet_split {
+    struct {
+        /* one buffer for protocol header(s), three data buffers */
+        uint64_t buffer_addr[MAX_PS_BUFFERS];
+    } read;
+    struct {
+        struct {
+            uint32_t mrq;          /* Multiple Rx Queues */
+            union {
+                uint32_t rss;          /* RSS Hash */
+                struct {
+                    uint16_t ip_id;    /* IP id */
+                    uint16_t csum;     /* Packet Checksum */
+                } csum_ip;
+            } hi_dword;
+        } lower;
+        struct {
+            uint32_t status_error;     /* ext status/error */
+            uint16_t length0;      /* length of buffer 0 */
+            uint16_t vlan;         /* VLAN tag */
+        } middle;
+        struct {
+            uint16_t header_status;
+            /* length of buffers 1-3 */
+            uint16_t length[PS_PAGE_BUFFERS];
+        } upper;
+        uint64_t reserved;
+    } wb; /* writeback */
+};
+
+/* Receive Checksum Control bits */
+#define E1000_RXCSUM_IPOFLD     0x100   /* IP Checksum Offload Enable */
+#define E1000_RXCSUM_TUOFLD     0x200   /* TCP/UDP Checksum Offload Enable */
+#define E1000_RXCSUM_PCSD       0x2000  /* Packet Checksum Disable */
+
+#define E1000_RING_DESC_LEN       (16)
+#define E1000_RING_DESC_LEN_SHIFT (4)
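+/* Every descriptor is 16 bytes, so (index << E1000_RING_DESC_LEN_SHIFT)
+ * converts a ring index into a byte offset. */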
+
+#define E1000_MIN_RX_DESC_LEN   E1000_RING_DESC_LEN
+
+/* Receive Descriptor bit definitions */
+#define E1000_RXD_STAT_DD       0x01    /* Descriptor Done */
+#define E1000_RXD_STAT_EOP      0x02    /* End of Packet */
+#define E1000_RXD_STAT_IXSM     0x04    /* Ignore checksum */
+#define E1000_RXD_STAT_VP       0x08    /* IEEE VLAN Packet */
+#define E1000_RXD_STAT_UDPCS    0x10    /* UDP xsum calculated */
+#define E1000_RXD_STAT_TCPCS    0x20    /* TCP xsum calculated */
+#define E1000_RXD_STAT_IPCS     0x40    /* IP xsum calculated */
+#define E1000_RXD_STAT_PIF      0x80    /* passed in-exact filter */
+#define E1000_RXD_STAT_IPIDV    0x200   /* IP identification valid */
+#define E1000_RXD_STAT_UDPV     0x400   /* Valid UDP checksum */
+#define E1000_RXD_STAT_ACK      0x8000  /* ACK Packet indication */
+#define E1000_RXD_ERR_CE        0x01    /* CRC Error */
+#define E1000_RXD_ERR_SE        0x02    /* Symbol Error */
+#define E1000_RXD_ERR_SEQ       0x04    /* Sequence Error */
+#define E1000_RXD_ERR_CXE       0x10    /* Carrier Extension Error */
+#define E1000_RXD_ERR_TCPE      0x20    /* TCP/UDP Checksum Error */
+#define E1000_RXD_ERR_IPE       0x40    /* IP Checksum Error */
+#define E1000_RXD_ERR_RXE       0x80    /* Rx Data Error */
+#define E1000_RXD_SPC_VLAN_MASK 0x0FFF  /* VLAN ID is in lower 12 bits */
+#define E1000_RXD_SPC_PRI_MASK  0xE000  /* Priority is in upper 3 bits */
+#define E1000_RXD_SPC_PRI_SHIFT 13
+#define E1000_RXD_SPC_CFI_MASK  0x1000  /* CFI is bit 12 */
+#define E1000_RXD_SPC_CFI_SHIFT 12
+
+/* RX packet types */
+#define E1000_RXD_PKT_MAC       (0)
+#define E1000_RXD_PKT_IP4       (1)
+#define E1000_RXD_PKT_IP4_XDP   (2)
+#define E1000_RXD_PKT_IP6       (5)
+#define E1000_RXD_PKT_IP6_XDP   (6)
+
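+/* Shifts an E1000_RXD_PKT_* code into its field in the extended Rx
+ * descriptor status (bit 16 upward). */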
+#define E1000_RXD_PKT_TYPE(t) ((t) << 16)
+
+#define E1000_RXDEXT_STATERR_CE    0x01000000
+#define E1000_RXDEXT_STATERR_SE    0x02000000
+#define E1000_RXDEXT_STATERR_SEQ   0x04000000
+#define E1000_RXDEXT_STATERR_CXE   0x10000000
+#define E1000_RXDEXT_STATERR_TCPE  0x20000000
+#define E1000_RXDEXT_STATERR_IPE   0x40000000
+#define E1000_RXDEXT_STATERR_RXE   0x80000000
+
+#define E1000_RXDPS_HDRSTAT_HDRSP        0x00008000
+#define E1000_RXDPS_HDRSTAT_HDRLEN_MASK  0x000003FF
+
+/* Receive Address */
+#define E1000_RAH_AV  0x80000000        /* Receive descriptor valid */
+
+/* Offload Context Descriptor */
+struct e1000_context_desc {
+    union {
+        uint32_t ip_config;
+        struct {
+            uint8_t ipcss;      /* IP checksum start */
+            uint8_t ipcso;      /* IP checksum offset */
+            uint16_t ipcse;     /* IP checksum end */
+        } ip_fields;
+    } lower_setup;
+    union {
+        uint32_t tcp_config;
+        struct {
+            uint8_t tucss;      /* TCP checksum start */
+            uint8_t tucso;      /* TCP checksum offset */
+            uint16_t tucse;     /* TCP checksum end */
+        } tcp_fields;
+    } upper_setup;
+    uint32_t cmd_and_length;    /* Command and payload length */
+    union {
+        uint32_t data;
+        struct {
+            uint8_t status;     /* Descriptor status */
+            uint8_t hdr_len;    /* Header length */
+            uint16_t mss;       /* Maximum segment size */
+        } fields;
+    } tcp_seg_setup;
+};
+
+/* Filters */
+#define E1000_NUM_UNICAST          16  /* Unicast filter entries */
+#define E1000_MC_TBL_SIZE          128 /* Multicast Filter Table (4096 bits) */
+#define E1000_VLAN_FILTER_TBL_SIZE 128 /* VLAN Filter Table (4096 bits) */
+
+/* Management Control */
+#define E1000_MANC_SMBUS_EN      0x00000001 /* SMBus Enabled - RO */
+#define E1000_MANC_ASF_EN        0x00000002 /* ASF Enabled - RO */
+#define E1000_MANC_R_ON_FORCE    0x00000004 /* Reset on Force TCO - RO */
+#define E1000_MANC_RMCP_EN       0x00000100 /* Enable RMCP 026Fh Filtering */
+#define E1000_MANC_0298_EN       0x00000200 /* Enable RMCP 0298h Filtering */
+#define E1000_MANC_IPV4_EN       0x00000400 /* Enable IPv4 */
+#define E1000_MANC_IPV6_EN       0x00000800 /* Enable IPv6 */
+#define E1000_MANC_SNAP_EN       0x00001000 /* Accept LLC/SNAP */
+#define E1000_MANC_ARP_EN        0x00002000 /* Enable ARP Request Filtering */
+#define E1000_MANC_NEIGHBOR_EN   0x00004000 /* Enable Neighbor Discovery
+                                             * Filtering */
+#define E1000_MANC_ARP_RES_EN    0x00008000 /* Enable ARP response Filtering */
+#define E1000_MANC_DIS_IP_CHK_ARP  0x10000000 /* Disable IP address checking */
+                                              /* for ARP packets - in 82574 */
+#define E1000_MANC_TCO_RESET     0x00010000 /* TCO Reset Occurred */
+#define E1000_MANC_RCV_TCO_EN    0x00020000 /* Receive TCO Packets Enabled */
+#define E1000_MANC_REPORT_STATUS 0x00040000 /* Status Reporting Enabled */
+#define E1000_MANC_RCV_ALL       0x00080000 /* Receive All Enabled */
+#define E1000_MANC_BLK_PHY_RST_ON_IDE   0x00040000 /* Block phy resets */
+#define E1000_MANC_EN_MAC_ADDR_FILTER   0x00100000 /* Enable MAC address
+                                                    * filtering */
+#define E1000_MANC_EN_MNG2HOST   0x00200000 /* Enable MNG packets to host
+                                             * memory */
+#define E1000_MANC_EN_IP_ADDR_FILTER    0x00400000 /* Enable IP address
+                                                    * filtering */
+#define E1000_MANC_EN_XSUM_FILTER   0x00800000 /* Enable checksum filtering */
+#define E1000_MANC_BR_EN         0x01000000 /* Enable broadcast filtering */
+#define E1000_MANC_SMB_REQ       0x01000000 /* SMBus Request */
+#define E1000_MANC_SMB_GNT       0x02000000 /* SMBus Grant */
+#define E1000_MANC_SMB_CLK_IN    0x04000000 /* SMBus Clock In */
+#define E1000_MANC_SMB_DATA_IN   0x08000000 /* SMBus Data In */
+#define E1000_MANC_SMB_DATA_OUT  0x10000000 /* SMBus Data Out */
+#define E1000_MANC_SMB_CLK_OUT   0x20000000 /* SMBus Clock Out */
+
+#define E1000_MANC_SMB_DATA_OUT_SHIFT  28 /* SMBus Data Out Shift */
+#define E1000_MANC_SMB_CLK_OUT_SHIFT   29 /* SMBus Clock Out Shift */
+
+/* FACTPS Control */
+#define E1000_FACTPS_LAN0_ON     0x00000004 /* Lan 0 enable */
+
+/* For checksumming, the sum of all words in the EEPROM should equal 0xBABA. */
+#define EEPROM_SUM 0xBABA
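+/*
+ * In other words, the checksum word is chosen so that the 16-bit sum of
+ * every EEPROM word equals 0xBABA (modulo 0x10000).
+ */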
+
+/* I/O-Mapped Access to Internal Registers, Memories, and Flash */
+#define E1000_IOADDR 0x00
+#define E1000_IODATA 0x04
+
+#define E1000_VFTA_ENTRY_SHIFT          5
+#define E1000_VFTA_ENTRY_MASK           0x7F
+#define E1000_VFTA_ENTRY_BIT_SHIFT_MASK 0x1F
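+/*
+ * Worked example: VLAN ID 0x123 selects VFTA entry (0x123 >> 5) & 0x7F = 9
+ * and bit 0x123 & 0x1F = 3 within that entry.
+ */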
+
+#endif /* HW_E1000_REGS_H */
diff --git a/hw/net/fsl_etsec/etsec.c b/hw/net/fsl_etsec/etsec.c
index c753bfb3a8..798ea33d08 100644
--- a/hw/net/fsl_etsec/etsec.c
+++ b/hw/net/fsl_etsec/etsec.c
@@ -29,6 +29,7 @@
 #include "qemu/osdep.h"
 #include "hw/sysbus.h"
 #include "hw/irq.h"
+#include "hw/net/mii.h"
 #include "hw/ptimer.h"
 #include "hw/qdev-properties.h"
 #include "etsec.h"
@@ -339,11 +340,11 @@ static void etsec_reset(DeviceState *d)
     etsec->rx_buffer_len = 0;
 
     etsec->phy_status =
-        MII_SR_EXTENDED_CAPS    | MII_SR_LINK_STATUS   | MII_SR_AUTONEG_CAPS  |
-        MII_SR_AUTONEG_COMPLETE | MII_SR_PREAMBLE_SUPPRESS |
-        MII_SR_EXTENDED_STATUS  | MII_SR_100T2_HD_CAPS | MII_SR_100T2_FD_CAPS |
-        MII_SR_10T_HD_CAPS      | MII_SR_10T_FD_CAPS   | MII_SR_100X_HD_CAPS  |
-        MII_SR_100X_FD_CAPS     | MII_SR_100T4_CAPS;
+        MII_BMSR_EXTCAP   | MII_BMSR_LINK_ST  | MII_BMSR_AUTONEG  |
+        MII_BMSR_AN_COMP  | MII_BMSR_MFPS     | MII_BMSR_EXTSTAT  |
+        MII_BMSR_100T2_HD | MII_BMSR_100T2_FD |
+        MII_BMSR_10T_HD   | MII_BMSR_10T_FD   |
+        MII_BMSR_100TX_HD | MII_BMSR_100TX_FD | MII_BMSR_100T4;
 
     etsec_update_irq(etsec);
 }
diff --git a/hw/net/fsl_etsec/etsec.h b/hw/net/fsl_etsec/etsec.h
index 3c625c955c..3860864a3f 100644
--- a/hw/net/fsl_etsec/etsec.h
+++ b/hw/net/fsl_etsec/etsec.h
@@ -76,23 +76,6 @@ typedef struct eTSEC_rxtx_bd {
 #define FCB_TX_CTU     (1 << 1)
 #define FCB_TX_NPH     (1 << 0)
 
-/* PHY Status Register */
-#define MII_SR_EXTENDED_CAPS     0x0001    /* Extended register capabilities */
-#define MII_SR_JABBER_DETECT     0x0002    /* Jabber Detected */
-#define MII_SR_LINK_STATUS       0x0004    /* Link Status 1 = link */
-#define MII_SR_AUTONEG_CAPS      0x0008    /* Auto Neg Capable */
-#define MII_SR_REMOTE_FAULT      0x0010    /* Remote Fault Detect */
-#define MII_SR_AUTONEG_COMPLETE  0x0020    /* Auto Neg Complete */
-#define MII_SR_PREAMBLE_SUPPRESS 0x0040    /* Preamble may be suppressed */
-#define MII_SR_EXTENDED_STATUS   0x0100    /* Ext. status info in Reg 0x0F */
-#define MII_SR_100T2_HD_CAPS     0x0200    /* 100T2 Half Duplex Capable */
-#define MII_SR_100T2_FD_CAPS     0x0400    /* 100T2 Full Duplex Capable */
-#define MII_SR_10T_HD_CAPS       0x0800    /* 10T   Half Duplex Capable */
-#define MII_SR_10T_FD_CAPS       0x1000    /* 10T   Full Duplex Capable */
-#define MII_SR_100X_HD_CAPS      0x2000    /* 100X  Half Duplex Capable */
-#define MII_SR_100X_FD_CAPS      0x4000    /* 100X  Full Duplex Capable */
-#define MII_SR_100T4_CAPS        0x8000    /* 100T4 Capable */
-
 /* eTSEC */
 
 /* Number of register in the device */
diff --git a/hw/net/fsl_etsec/miim.c b/hw/net/fsl_etsec/miim.c
index 6bba01c82a..b48d2cb57b 100644
--- a/hw/net/fsl_etsec/miim.c
+++ b/hw/net/fsl_etsec/miim.c
@@ -23,6 +23,7 @@
  */
 
 #include "qemu/osdep.h"
+#include "hw/net/mii.h"
 #include "etsec.h"
 #include "registers.h"
 
@@ -140,8 +141,8 @@ void etsec_miim_link_status(eTSEC *etsec, NetClientState *nc)
 {
     /* Set link status */
     if (nc->link_down) {
-        etsec->phy_status &= ~MII_SR_LINK_STATUS;
+        etsec->phy_status &= ~MII_BMSR_LINK_ST;
     } else {
-        etsec->phy_status |= MII_SR_LINK_STATUS;
+        etsec->phy_status |= MII_BMSR_LINK_ST;
     }
 }
diff --git a/hw/net/igb.c b/hw/net/igb.c
new file mode 100644
index 0000000000..c6d753df87
--- /dev/null
+++ b/hw/net/igb.c
@@ -0,0 +1,623 @@
+/*
+ * QEMU Intel 82576 SR/IOV Ethernet Controller Emulation
+ *
+ * Datasheet:
+ * https://www.intel.com/content/dam/www/public/us/en/documents/datasheets/82576eg-gbe-datasheet.pdf
+ *
+ * Copyright (c) 2020-2023 Red Hat, Inc.
+ * Copyright (c) 2015 Ravello Systems LTD (http://ravellosystems.com)
+ * Developed by Daynix Computing LTD (http://www.daynix.com)
+ *
+ * Authors:
+ * Akihiko Odaki <akihiko.odaki@daynix.com>
+ * Gal Hammmer <gal.hammer@sap.com>
+ * Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
+ * Dmitry Fleytman <dmitry@daynix.com>
+ * Leonid Bloch <leonid@daynix.com>
+ * Yan Vugenfirer <yan@daynix.com>
+ *
+ * Based on work done by:
+ * Nir Peleg, Tutis Systems Ltd. for Qumranet Inc.
+ * Copyright (c) 2008 Qumranet
+ * Based on work done by:
+ * Copyright (c) 2007 Dan Aloni
+ * Copyright (c) 2004 Antony T Curtis
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/units.h"
+#include "net/eth.h"
+#include "net/net.h"
+#include "net/tap.h"
+#include "qemu/module.h"
+#include "qemu/range.h"
+#include "sysemu/sysemu.h"
+#include "hw/hw.h"
+#include "hw/net/mii.h"
+#include "hw/pci/pci.h"
+#include "hw/pci/pcie.h"
+#include "hw/pci/pcie_sriov.h"
+#include "hw/pci/msi.h"
+#include "hw/pci/msix.h"
+#include "hw/qdev-properties.h"
+#include "migration/vmstate.h"
+
+#include "igb_common.h"
+#include "igb_core.h"
+
+#include "trace.h"
+#include "qapi/error.h"
+#include "qom/object.h"
+
+#define TYPE_IGB "igb"
+OBJECT_DECLARE_SIMPLE_TYPE(IGBState, IGB)
+
+struct IGBState {
+    PCIDevice parent_obj;
+    NICState *nic;
+    NICConf conf;
+
+    MemoryRegion mmio;
+    MemoryRegion flash;
+    MemoryRegion io;
+    MemoryRegion msix;
+
+    uint32_t ioaddr;
+
+    IGBCore core;
+};
+
+#define IGB_CAP_SRIOV_OFFSET    (0x160)
+#define IGB_VF_OFFSET           (0x80)
+#define IGB_VF_STRIDE           (2)
+
+#define E1000E_MMIO_IDX     0
+#define E1000E_FLASH_IDX    1
+#define E1000E_IO_IDX       2
+#define E1000E_MSIX_IDX     3
+
+#define E1000E_MMIO_SIZE    (128 * KiB)
+#define E1000E_FLASH_SIZE   (128 * KiB)
+#define E1000E_IO_SIZE      (32)
+#define E1000E_MSIX_SIZE    (16 * KiB)
+
+static void igb_write_config(PCIDevice *dev, uint32_t addr,
+    uint32_t val, int len)
+{
+    IGBState *s = IGB(dev);
+
+    trace_igb_write_config(addr, val, len);
+    pci_default_write_config(dev, addr, val, len);
+
+    if (range_covers_byte(addr, len, PCI_COMMAND) &&
+        (dev->config[PCI_COMMAND] & PCI_COMMAND_MASTER)) {
+        igb_start_recv(&s->core);
+    }
+}
+
+uint64_t
+igb_mmio_read(void *opaque, hwaddr addr, unsigned size)
+{
+    IGBState *s = opaque;
+    return igb_core_read(&s->core, addr, size);
+}
+
+void
+igb_mmio_write(void *opaque, hwaddr addr, uint64_t val, unsigned size)
+{
+    IGBState *s = opaque;
+    igb_core_write(&s->core, addr, val, size);
+}
+
+static bool
+igb_io_get_reg_index(IGBState *s, uint32_t *idx)
+{
+    if (s->ioaddr < 0x1FFFF) {
+        *idx = s->ioaddr;
+        return true;
+    }
+
+    if (s->ioaddr < 0x7FFFF) {
+        trace_e1000e_wrn_io_addr_undefined(s->ioaddr);
+        return false;
+    }
+
+    if (s->ioaddr < 0xFFFFF) {
+        trace_e1000e_wrn_io_addr_flash(s->ioaddr);
+        return false;
+    }
+
+    trace_e1000e_wrn_io_addr_unknown(s->ioaddr);
+    return false;
+}
+
+static uint64_t
+igb_io_read(void *opaque, hwaddr addr, unsigned size)
+{
+    IGBState *s = opaque;
+    uint32_t idx = 0;
+    uint64_t val;
+
+    switch (addr) {
+    case E1000_IOADDR:
+        trace_e1000e_io_read_addr(s->ioaddr);
+        return s->ioaddr;
+    case E1000_IODATA:
+        if (igb_io_get_reg_index(s, &idx)) {
+            val = igb_core_read(&s->core, idx, sizeof(val));
+            trace_e1000e_io_read_data(idx, val);
+            return val;
+        }
+        return 0;
+    default:
+        trace_e1000e_wrn_io_read_unknown(addr);
+        return 0;
+    }
+}
+
+static void
+igb_io_write(void *opaque, hwaddr addr, uint64_t val, unsigned size)
+{
+    IGBState *s = opaque;
+    uint32_t idx = 0;
+
+    switch (addr) {
+    case E1000_IOADDR:
+        trace_e1000e_io_write_addr(val);
+        s->ioaddr = (uint32_t) val;
+        return;
+    case E1000_IODATA:
+        if (igb_io_get_reg_index(s, &idx)) {
+            trace_e1000e_io_write_data(idx, val);
+            igb_core_write(&s->core, idx, val, sizeof(val));
+        }
+        return;
+    default:
+        trace_e1000e_wrn_io_write_unknown(addr);
+        return;
+    }
+}
+
+static const MemoryRegionOps mmio_ops = {
+    .read = igb_mmio_read,
+    .write = igb_mmio_write,
+    .endianness = DEVICE_LITTLE_ENDIAN,
+    .impl = {
+        .min_access_size = 4,
+        .max_access_size = 4,
+    },
+};
+
+static const MemoryRegionOps io_ops = {
+    .read = igb_io_read,
+    .write = igb_io_write,
+    .endianness = DEVICE_LITTLE_ENDIAN,
+    .impl = {
+        .min_access_size = 4,
+        .max_access_size = 4,
+    },
+};
+
+static bool
+igb_nc_can_receive(NetClientState *nc)
+{
+    IGBState *s = qemu_get_nic_opaque(nc);
+    return igb_can_receive(&s->core);
+}
+
+static ssize_t
+igb_nc_receive_iov(NetClientState *nc, const struct iovec *iov, int iovcnt)
+{
+    IGBState *s = qemu_get_nic_opaque(nc);
+    return igb_receive_iov(&s->core, iov, iovcnt);
+}
+
+static ssize_t
+igb_nc_receive(NetClientState *nc, const uint8_t *buf, size_t size)
+{
+    IGBState *s = qemu_get_nic_opaque(nc);
+    return igb_receive(&s->core, buf, size);
+}
+
+static void
+igb_set_link_status(NetClientState *nc)
+{
+    IGBState *s = qemu_get_nic_opaque(nc);
+    igb_core_set_link_status(&s->core);
+}
+
+static NetClientInfo net_igb_info = {
+    .type = NET_CLIENT_DRIVER_NIC,
+    .size = sizeof(NICState),
+    .can_receive = igb_nc_can_receive,
+    .receive = igb_nc_receive,
+    .receive_iov = igb_nc_receive_iov,
+    .link_status_changed = igb_set_link_status,
+};
+
+/*
+ * EEPROM (NVM) contents documented in section 6.1, table 6-1,
+ * and in section 6.10, "Software accessed words".
+ */
+static const uint16_t igb_eeprom_template[] = {
+  /*        Address        |Compat.|OEM sp.| ImRev |    OEM sp.    */
+    0x0000, 0x0000, 0x0000, 0x0d34, 0xffff, 0x2010, 0xffff, 0xffff,
+  /*      PBA      |ICtrl1 | SSID  | SVID  | DevID |-------|ICtrl2 */
+    0x1040, 0xffff, 0x002b, 0x0000, 0x8086, 0x10c9, 0x0000, 0x70c3,
+  /* SwPin0| DevID | EESZ  |-------|ICtrl3 |PCI-tc | MSIX  | APtr  */
+    0x0004, 0x10c9, 0x5c00, 0x0000, 0x2880, 0x0014, 0x4a40, 0x0060,
+  /* PCIe Init. Conf 1,2,3 |PCICtrl| LD1,3 |DDevID |DevRev | LD0,2 */
+    0x6cfb, 0xc7b0, 0x0abe, 0x0403, 0x0783, 0x10a6, 0x0001, 0x0602,
+  /* SwPin1| FunC  |LAN-PWR|ManHwC |ICtrl3 | IOVct |VDevID |-------*/
+    0x0004, 0x0020, 0x0000, 0x004a, 0x2080, 0x00f5, 0x10ca, 0x0000,
+  /*---------------| LD1,3 | LD0,2 | ROEnd | ROSta | Wdog  | VPD   */
+    0x0000, 0x0000, 0x4784, 0x4602, 0x0000, 0x0000, 0x1000, 0xffff,
+  /* PCSet0| Ccfg0 |PXEver |IBAcap |PCSet1 | Ccfg1 |iSCVer | ??    */
+    0x0100, 0x4000, 0x131f, 0x4013, 0x0100, 0x4000, 0xffff, 0xffff,
+  /* PCSet2| Ccfg2 |PCSet3 | Ccfg3 | ??    |AltMacP| ??    |CHKSUM */
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x00e0, 0xffff, 0x0000,
+  /* NC-SIC */
+    0x0003,
+};
+
+static void igb_core_realize(IGBState *s)
+{
+    s->core.owner = &s->parent_obj;
+    s->core.owner_nic = s->nic;
+}
+
+static void
+igb_init_msix(IGBState *s)
+{
+    int i, res;
+
+    res = msix_init(PCI_DEVICE(s), IGB_MSIX_VEC_NUM,
+                    &s->msix,
+                    E1000E_MSIX_IDX, 0,
+                    &s->msix,
+                    E1000E_MSIX_IDX, 0x2000,
+                    0x70, NULL);
+
+    if (res < 0) {
+        trace_e1000e_msix_init_fail(res);
+    } else {
+        for (i = 0; i < IGB_MSIX_VEC_NUM; i++) {
+            msix_vector_use(PCI_DEVICE(s), i);
+        }
+    }
+}
+
+static void
+igb_cleanup_msix(IGBState *s)
+{
+    msix_unuse_all_vectors(PCI_DEVICE(s));
+    msix_uninit(PCI_DEVICE(s), &s->msix, &s->msix);
+}
+
+static void
+igb_init_net_peer(IGBState *s, PCIDevice *pci_dev, uint8_t *macaddr)
+{
+    DeviceState *dev = DEVICE(pci_dev);
+    NetClientState *nc;
+    int i;
+
+    s->nic = qemu_new_nic(&net_igb_info, &s->conf,
+        object_get_typename(OBJECT(s)), dev->id, s);
+
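+    /* max_queue_num is the highest usable subqueue index, hence queues - 1 */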
+    s->core.max_queue_num = s->conf.peers.queues ? s->conf.peers.queues - 1 : 0;
+
+    trace_e1000e_mac_set_permanent(MAC_ARG(macaddr));
+    memcpy(s->core.permanent_mac, macaddr, sizeof(s->core.permanent_mac));
+
+    qemu_format_nic_info_str(qemu_get_queue(s->nic), macaddr);
+
+    /* Set up virtio headers */
+    for (i = 0; i < s->conf.peers.queues; i++) {
+        nc = qemu_get_subqueue(s->nic, i);
+        if (!nc->peer || !qemu_has_vnet_hdr(nc->peer)) {
+            trace_e1000e_cfg_support_virtio(false);
+            return;
+        }
+    }
+
+    trace_e1000e_cfg_support_virtio(true);
+    s->core.has_vnet = true;
+
+    for (i = 0; i < s->conf.peers.queues; i++) {
+        nc = qemu_get_subqueue(s->nic, i);
+        qemu_set_vnet_hdr_len(nc->peer, sizeof(struct virtio_net_hdr));
+        qemu_using_vnet_hdr(nc->peer, true);
+    }
+}
+
+static int
+igb_add_pm_capability(PCIDevice *pdev, uint8_t offset, uint16_t pmc)
+{
+    Error *local_err = NULL;
+    int ret = pci_add_capability(pdev, PCI_CAP_ID_PM, offset,
+                                 PCI_PM_SIZEOF, &local_err);
+
+    if (local_err) {
+        error_report_err(local_err);
+        return ret;
+    }
+
+    pci_set_word(pdev->config + offset + PCI_PM_PMC,
+                 PCI_PM_CAP_VER_1_1 |
+                 pmc);
+
+    pci_set_word(pdev->wmask + offset + PCI_PM_CTRL,
+                 PCI_PM_CTRL_STATE_MASK |
+                 PCI_PM_CTRL_PME_ENABLE |
+                 PCI_PM_CTRL_DATA_SEL_MASK);
+
+    pci_set_word(pdev->w1cmask + offset + PCI_PM_CTRL,
+                 PCI_PM_CTRL_PME_STATUS);
+
+    return ret;
+}
+
+static void igb_pci_realize(PCIDevice *pci_dev, Error **errp)
+{
+    IGBState *s = IGB(pci_dev);
+    uint8_t *macaddr;
+    int ret;
+
+    trace_e1000e_cb_pci_realize();
+
+    pci_dev->config_write = igb_write_config;
+
+    pci_dev->config[PCI_CACHE_LINE_SIZE] = 0x10;
+    pci_dev->config[PCI_INTERRUPT_PIN] = 1;
+
+    /* Define IO/MMIO regions */
+    memory_region_init_io(&s->mmio, OBJECT(s), &mmio_ops, s,
+                          "igb-mmio", E1000E_MMIO_SIZE);
+    pci_register_bar(pci_dev, E1000E_MMIO_IDX,
+                     PCI_BASE_ADDRESS_SPACE_MEMORY, &s->mmio);
+
+    /*
+     * We provide a dummy implementation for the flash BAR
+     * for drivers that may theoretically probe for its presence.
+     */
+    memory_region_init(&s->flash, OBJECT(s),
+                       "igb-flash", E1000E_FLASH_SIZE);
+    pci_register_bar(pci_dev, E1000E_FLASH_IDX,
+                     PCI_BASE_ADDRESS_SPACE_MEMORY, &s->flash);
+
+    memory_region_init_io(&s->io, OBJECT(s), &io_ops, s,
+                          "igb-io", E1000E_IO_SIZE);
+    pci_register_bar(pci_dev, E1000E_IO_IDX,
+                     PCI_BASE_ADDRESS_SPACE_IO, &s->io);
+
+    memory_region_init(&s->msix, OBJECT(s), "igb-msix",
+                       E1000E_MSIX_SIZE);
+    pci_register_bar(pci_dev, E1000E_MSIX_IDX,
+                     PCI_BASE_ADDRESS_MEM_TYPE_64, &s->msix);
+
+    /* Create networking backend */
+    qemu_macaddr_default_if_unset(&s->conf.macaddr);
+    macaddr = s->conf.macaddr.a;
+
+    /* Add PCI capabilities in reverse order */
+    assert(pcie_endpoint_cap_init(pci_dev, 0xa0) > 0);
+
+    igb_init_msix(s);
+
+    ret = msi_init(pci_dev, 0x50, 1, true, true, NULL);
+    if (ret) {
+        trace_e1000e_msi_init_fail(ret);
+    }
+
+    if (igb_add_pm_capability(pci_dev, 0x40, PCI_PM_CAP_DSI) < 0) {
+        hw_error("Failed to initialize PM capability");
+    }
+
+    /* PCIe extended capabilities (in order) */
+    if (pcie_aer_init(pci_dev, 1, 0x100, 0x40, errp) < 0) {
+        hw_error("Failed to initialize AER capability");
+    }
+
+    pcie_ari_init(pci_dev, 0x150, 1);
+
+    pcie_sriov_pf_init(pci_dev, IGB_CAP_SRIOV_OFFSET, "igbvf",
+        IGB_82576_VF_DEV_ID, IGB_MAX_VF_FUNCTIONS, IGB_MAX_VF_FUNCTIONS,
+        IGB_VF_OFFSET, IGB_VF_STRIDE);
+
+    pcie_sriov_pf_init_vf_bar(pci_dev, 0,
+        PCI_BASE_ADDRESS_MEM_TYPE_64 | PCI_BASE_ADDRESS_MEM_PREFETCH,
+        16 * KiB);
+    pcie_sriov_pf_init_vf_bar(pci_dev, 3,
+        PCI_BASE_ADDRESS_MEM_TYPE_64 | PCI_BASE_ADDRESS_MEM_PREFETCH,
+        16 * KiB);
+
+    igb_init_net_peer(s, pci_dev, macaddr);
+
+    /* Initialize core */
+    igb_core_realize(s);
+
+    igb_core_pci_realize(&s->core,
+                         igb_eeprom_template,
+                         sizeof(igb_eeprom_template),
+                         macaddr);
+}
+
+static void igb_pci_uninit(PCIDevice *pci_dev)
+{
+    IGBState *s = IGB(pci_dev);
+
+    trace_e1000e_cb_pci_uninit();
+
+    igb_core_pci_uninit(&s->core);
+
+    pcie_sriov_pf_exit(pci_dev);
+    pcie_cap_exit(pci_dev);
+
+    qemu_del_nic(s->nic);
+
+    igb_cleanup_msix(s);
+    msi_uninit(pci_dev);
+}
+
+static void igb_qdev_reset_hold(Object *obj)
+{
+    PCIDevice *d = PCI_DEVICE(obj);
+    IGBState *s = IGB(obj);
+
+    trace_e1000e_cb_qdev_reset_hold();
+
+    pcie_sriov_pf_disable_vfs(d);
+    igb_core_reset(&s->core);
+}
+
+static int igb_pre_save(void *opaque)
+{
+    IGBState *s = opaque;
+
+    trace_e1000e_cb_pre_save();
+
+    igb_core_pre_save(&s->core);
+
+    return 0;
+}
+
+static int igb_post_load(void *opaque, int version_id)
+{
+    IGBState *s = opaque;
+
+    trace_e1000e_cb_post_load();
+    return igb_core_post_load(&s->core);
+}
+
+static const VMStateDescription igb_vmstate_tx = {
+    .name = "igb-tx",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT16(vlan, struct igb_tx),
+        VMSTATE_UINT16(mss, struct igb_tx),
+        VMSTATE_BOOL(tse, struct igb_tx),
+        VMSTATE_BOOL(ixsm, struct igb_tx),
+        VMSTATE_BOOL(txsm, struct igb_tx),
+        VMSTATE_BOOL(first, struct igb_tx),
+        VMSTATE_BOOL(skip_cp, struct igb_tx),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+static const VMStateDescription igb_vmstate_intr_timer = {
+    .name = "igb-intr-timer",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .fields = (VMStateField[]) {
+        VMSTATE_TIMER_PTR(timer, IGBIntrDelayTimer),
+        VMSTATE_BOOL(running, IGBIntrDelayTimer),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+#define VMSTATE_IGB_INTR_DELAY_TIMER(_f, _s)                        \
+    VMSTATE_STRUCT(_f, _s, 0,                                       \
+                   igb_vmstate_intr_timer, IGBIntrDelayTimer)
+
+#define VMSTATE_IGB_INTR_DELAY_TIMER_ARRAY(_f, _s, _num)            \
+    VMSTATE_STRUCT_ARRAY(_f, _s, _num, 0,                           \
+                         igb_vmstate_intr_timer, IGBIntrDelayTimer)
+
+static const VMStateDescription igb_vmstate = {
+    .name = "igb",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .pre_save = igb_pre_save,
+    .post_load = igb_post_load,
+    .fields = (VMStateField[]) {
+        VMSTATE_PCI_DEVICE(parent_obj, IGBState),
+        VMSTATE_MSIX(parent_obj, IGBState),
+
+        VMSTATE_UINT32(ioaddr, IGBState),
+        VMSTATE_UINT8(core.rx_desc_len, IGBState),
+        VMSTATE_UINT16_ARRAY(core.eeprom, IGBState, IGB_EEPROM_SIZE),
+        VMSTATE_UINT16_ARRAY(core.phy, IGBState, MAX_PHY_REG_ADDRESS + 1),
+        VMSTATE_UINT32_ARRAY(core.mac, IGBState, E1000E_MAC_SIZE),
+        VMSTATE_UINT8_ARRAY(core.permanent_mac, IGBState, ETH_ALEN),
+
+        VMSTATE_IGB_INTR_DELAY_TIMER_ARRAY(core.eitr, IGBState,
+                                           IGB_INTR_NUM),
+
+        VMSTATE_UINT32_ARRAY(core.eitr_guest_value, IGBState, IGB_INTR_NUM),
+
+        VMSTATE_STRUCT_ARRAY(core.tx, IGBState, IGB_NUM_QUEUES, 0,
+                             igb_vmstate_tx, struct igb_tx),
+
+        VMSTATE_INT64(core.timadj, IGBState),
+
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+static Property igb_properties[] = {
+    DEFINE_NIC_PROPERTIES(IGBState, conf),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
+static void igb_class_init(ObjectClass *class, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(class);
+    ResettableClass *rc = RESETTABLE_CLASS(class);
+    PCIDeviceClass *c = PCI_DEVICE_CLASS(class);
+
+    c->realize = igb_pci_realize;
+    c->exit = igb_pci_uninit;
+    c->vendor_id = PCI_VENDOR_ID_INTEL;
+    c->device_id = E1000_DEV_ID_82576;
+    c->revision = 1;
+    c->class_id = PCI_CLASS_NETWORK_ETHERNET;
+
+    rc->phases.hold = igb_qdev_reset_hold;
+
+    dc->desc = "Intel 82576 Gigabit Ethernet Controller";
+    dc->vmsd = &igb_vmstate;
+
+    device_class_set_props(dc, igb_properties);
+    set_bit(DEVICE_CATEGORY_NETWORK, dc->categories);
+}
+
+static void igb_instance_init(Object *obj)
+{
+    IGBState *s = IGB(obj);
+    device_add_bootindex_property(obj, &s->conf.bootindex,
+                                  "bootindex", "/ethernet-phy@0",
+                                  DEVICE(obj));
+}
+
+static const TypeInfo igb_info = {
+    .name = TYPE_IGB,
+    .parent = TYPE_PCI_DEVICE,
+    .instance_size = sizeof(IGBState),
+    .class_init = igb_class_init,
+    .instance_init = igb_instance_init,
+    .interfaces = (InterfaceInfo[]) {
+        { INTERFACE_PCIE_DEVICE },
+        { }
+    },
+};
+
+static void igb_register_types(void)
+{
+    type_register_static(&igb_info);
+}
+
+type_init(igb_register_types)
diff --git a/hw/net/igb_common.h b/hw/net/igb_common.h
new file mode 100644
index 0000000000..69ac490f75
--- /dev/null
+++ b/hw/net/igb_common.h
@@ -0,0 +1,146 @@
+/*
+ * QEMU igb emulation - shared definitions
+ *
+ * Copyright (c) 2020-2023 Red Hat, Inc.
+ * Copyright (c) 2008 Qumranet
+ *
+ * Based on work done by:
+ * Nir Peleg, Tutis Systems Ltd. for Qumranet Inc.
+ * Copyright (c) 2007 Dan Aloni
+ * Copyright (c) 2004 Antony T Curtis
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef HW_NET_IGB_COMMON_H
+#define HW_NET_IGB_COMMON_H
+
+#include "igb_regs.h"
+
+#define defreg(x) x = (E1000_##x >> 2)
+#define defreg_indexed(x, i) x##i = (E1000_##x(i) >> 2)
+#define defreg_indexeda(x, i) x##i##_A = (E1000_##x##_A(i) >> 2)
+
+#define defregd(x) defreg_indexed(x, 0),  defreg_indexed(x, 1),  \
+                   defreg_indexed(x, 2),  defreg_indexed(x, 3),  \
+                   defreg_indexed(x, 4),  defreg_indexed(x, 5),  \
+                   defreg_indexed(x, 6),  defreg_indexed(x, 7),  \
+                   defreg_indexed(x, 8),  defreg_indexed(x, 9),  \
+                   defreg_indexed(x, 10), defreg_indexed(x, 11), \
+                   defreg_indexed(x, 12), defreg_indexed(x, 13), \
+                   defreg_indexed(x, 14), defreg_indexed(x, 15), \
+                   defreg_indexeda(x, 0), defreg_indexeda(x, 1), \
+                   defreg_indexeda(x, 2), defreg_indexeda(x, 3)
+
+#define defregv(x) defreg_indexed(x, 0), defreg_indexed(x, 1),   \
+                   defreg_indexed(x, 2), defreg_indexed(x, 3),   \
+                   defreg_indexed(x, 4), defreg_indexed(x, 5),   \
+                   defreg_indexed(x, 6), defreg_indexed(x, 7)
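+/*
+ * Illustrative expansion: defreg(CTRL) yields the enum entry
+ * CTRL = (E1000_CTRL >> 2), i.e. the register's index into the 32-bit
+ * mac[] array, and defreg_indexed(RDBAL, 0) yields
+ * RDBAL0 = (E1000_RDBAL(0) >> 2).
+ */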
+
+enum {
+    defreg(CTRL),    defreg(EECD),    defreg(EERD),    defreg(GPRC),
+    defreg(GPTC),    defreg(ICR),     defreg(ICS),     defreg(IMC),
+    defreg(IMS),     defreg(LEDCTL),  defreg(MANC),    defreg(MDIC),
+    defreg(MPC),     defreg(RCTL),
+    defreg(STATUS),  defreg(SWSM),    defreg(TCTL),
+    defreg(TORH),    defreg(TORL),    defreg(TOTH),
+    defreg(TOTL),    defreg(TPR),     defreg(TPT),
+    defreg(WUFC),    defreg(RA),      defreg(MTA),     defreg(CRCERRS),
+    defreg(VFTA),    defreg(VET),
+    defreg(SCC),     defreg(ECOL),
+    defreg(MCC),     defreg(LATECOL), defreg(COLC),    defreg(DC),
+    defreg(TNCRS),   defreg(RLEC),
+    defreg(XONRXC),  defreg(XONTXC),  defreg(XOFFRXC), defreg(XOFFTXC),
+    defreg(FCRUC),   defreg(TDFH),    defreg(TDFT),
+    defreg(TDFHS),   defreg(TDFTS),   defreg(TDFPC),   defreg(WUC),
+    defreg(WUS),     defreg(RDFH),
+    defreg(RDFT),    defreg(RDFHS),   defreg(RDFTS),   defreg(RDFPC),
+    defreg(IPAV),    defreg(IP4AT),   defreg(IP6AT),
+    defreg(WUPM),    defreg(FFMT),
+    defreg(IAM),
+    defreg(GCR),     defreg(TIMINCA), defreg(EIAC),    defreg(CTRL_EXT),
+    defreg(IVAR0),   defreg(MANC2H),
+    defreg(MFVAL),   defreg(MDEF),    defreg(FACTPS),  defreg(FTFT),
+    defreg(RUC),     defreg(ROC),     defreg(RFC),     defreg(RJC),
+    defreg(PRC64),   defreg(PRC127),  defreg(PRC255),  defreg(PRC511),
+    defreg(PRC1023), defreg(PRC1522), defreg(PTC64),   defreg(PTC127),
+    defreg(PTC255),  defreg(PTC511),  defreg(PTC1023), defreg(PTC1522),
+    defreg(GORCL),   defreg(GORCH),   defreg(GOTCL),   defreg(GOTCH),
+    defreg(RNBC),    defreg(BPRC),    defreg(MPRC),    defreg(RFCTL),
+    defreg(MPTC),    defreg(BPTC),
+    defreg(IAC),     defreg(MGTPRC),  defreg(MGTPDC),  defreg(MGTPTC),
+    defreg(TSCTC),   defreg(RXCSUM),  defreg(FUNCTAG), defreg(GSCL_1),
+    defreg(GSCL_2),  defreg(GSCL_3),  defreg(GSCL_4),  defreg(GSCN_0),
+    defreg(GSCN_1),  defreg(GSCN_2),  defreg(GSCN_3),
+    defreg_indexed(EITR, 0),
+    defreg(MRQC),    defreg(RETA),    defreg(RSSRK),
+    defreg(PBACLR),  defreg(FCAL),    defreg(FCAH),    defreg(FCT),
+    defreg(FCRTH),   defreg(FCRTL),   defreg(FCTTV),   defreg(FCRTV),
+    defreg(FLA),     defreg(FLOP),
+    defreg(MAVTV0),  defreg(MAVTV1),  defreg(MAVTV2),  defreg(MAVTV3),
+    defreg(TXSTMPL), defreg(TXSTMPH), defreg(SYSTIML), defreg(SYSTIMH),
+    defreg(TIMADJL), defreg(TIMADJH),
+    defreg(RXSTMPH), defreg(RXSTMPL), defreg(RXSATRL), defreg(RXSATRH),
+    defreg(TIPG),
+    defreg(CTRL_DUP),
+    defreg(EEMNGCTL),
+    defreg(EEMNGDATA),
+    defreg(FLMNGCTL),
+    defreg(FLMNGDATA),
+    defreg(FLMNGCNT),
+    defreg(TSYNCRXCTL),
+    defreg(TSYNCTXCTL),
+    defreg(RLPML),
+    defreg(UTA),
+
+    /* Aliases */
+    defreg(RDFH_A),      defreg(RDFT_A),     defreg(TDFH_A),     defreg(TDFT_A),
+    defreg(RA_A),        defreg(VFTA_A),     defreg(FCRTL_A),
+
+    /* Additional regs used by IGB */
+    defreg(FWSM),        defreg(SW_FW_SYNC),
+
+    defreg(EICS),        defreg(EIMS),        defreg(EIMC),       defreg(EIAM),
+    defreg(EICR),        defreg(IVAR_MISC),   defreg(GPIE),
+
+    defreg(RXPBS),      defregd(RDBAL),       defregd(RDBAH),     defregd(RDLEN),
+    defregd(SRRCTL),    defregd(RDH),         defregd(RDT),
+    defregd(RXDCTL),    defregd(RXCTL),       defregd(RQDPC),     defreg(RA2),
+
+    defreg(TXPBS),       defreg(TCTL_EXT),    defreg(DTXCTL),     defreg(HTCBDPC),
+    defregd(TDBAL),      defregd(TDBAH),      defregd(TDLEN),     defregd(TDH),
+    defregd(TDT),        defregd(TXDCTL),     defregd(TXCTL),
+    defregd(TDWBAL),     defregd(TDWBAH),
+
+    defreg(VT_CTL),
+
+    defregv(P2VMAILBOX), defregv(V2PMAILBOX), defreg(MBVFICR),    defreg(MBVFIMR),
+    defreg(VFLRE),       defreg(VFRE),        defreg(VFTE),       defreg(WVBR),
+    defreg(QDE),         defreg(DTXSWC),      defreg_indexed(VLVF, 0),
+    defregv(VMOLR),      defreg(RPLOLR),      defregv(VMBMEM),    defregv(VMVIR),
+
+    defregv(PVTCTRL),    defregv(PVTEICS),    defregv(PVTEIMS),   defregv(PVTEIMC),
+    defregv(PVTEIAC),    defregv(PVTEIAM),    defregv(PVTEICR),   defregv(PVFGPRC),
+    defregv(PVFGPTC),    defregv(PVFGORC),    defregv(PVFGOTC),   defregv(PVFMPRC),
+    defregv(PVFGPRLBC),  defregv(PVFGPTLBC),  defregv(PVFGORLBC), defregv(PVFGOTLBC),
+
+    defreg(MTA_A),
+
+    defreg(VTIVAR), defreg(VTIVAR_MISC),
+};
+
+uint64_t igb_mmio_read(void *opaque, hwaddr addr, unsigned size);
+void igb_mmio_write(void *opaque, hwaddr addr, uint64_t val, unsigned size);
+
+#endif
diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c
new file mode 100644
index 0000000000..a7c7bfdc75
--- /dev/null
+++ b/hw/net/igb_core.c
@@ -0,0 +1,4077 @@
+/*
+ * Core code for QEMU igb emulation
+ *
+ * Datasheet:
+ * https://www.intel.com/content/dam/www/public/us/en/documents/datasheets/82576eg-gbe-datasheet.pdf
+ *
+ * Copyright (c) 2020-2023 Red Hat, Inc.
+ * Copyright (c) 2015 Ravello Systems LTD (http://ravellosystems.com)
+ * Developed by Daynix Computing LTD (http://www.daynix.com)
+ *
+ * Authors:
+ * Akihiko Odaki <akihiko.odaki@daynix.com>
+ * Gal Hammer <gal.hammer@sap.com>
+ * Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
+ * Dmitry Fleytman <dmitry@daynix.com>
+ * Leonid Bloch <leonid@daynix.com>
+ * Yan Vugenfirer <yan@daynix.com>
+ *
+ * Based on work done by:
+ * Nir Peleg, Tutis Systems Ltd. for Qumranet Inc.
+ * Copyright (c) 2008 Qumranet
+ * Based on work done by:
+ * Copyright (c) 2007 Dan Aloni
+ * Copyright (c) 2004 Antony T Curtis
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/log.h"
+#include "net/net.h"
+#include "net/tap.h"
+#include "hw/net/mii.h"
+#include "hw/pci/msi.h"
+#include "hw/pci/msix.h"
+#include "sysemu/runstate.h"
+
+#include "net_tx_pkt.h"
+#include "net_rx_pkt.h"
+
+#include "igb_common.h"
+#include "e1000x_common.h"
+#include "igb_core.h"
+
+#include "trace.h"
+
+#define E1000E_MAX_TX_FRAGS (64)
+
+union e1000_rx_desc_union {
+    struct e1000_rx_desc legacy;
+    union e1000_adv_rx_desc adv;
+};
+
+typedef struct IGBTxPktVmdqCallbackContext {
+    IGBCore *core;
+    NetClientState *nc;
+} IGBTxPktVmdqCallbackContext;
+
+static ssize_t
+igb_receive_internal(IGBCore *core, const struct iovec *iov, int iovcnt,
+                     bool has_vnet, bool *external_tx);
+
+static inline void
+igb_set_interrupt_cause(IGBCore *core, uint32_t val);
+
+static void igb_update_interrupt_state(IGBCore *core);
+static void igb_reset(IGBCore *core, bool sw);
+
+static inline void
+igb_raise_legacy_irq(IGBCore *core)
+{
+    trace_e1000e_irq_legacy_notify(true);
+    e1000x_inc_reg_if_not_full(core->mac, IAC);
+    pci_set_irq(core->owner, 1);
+}
+
+static inline void
+igb_lower_legacy_irq(IGBCore *core)
+{
+    trace_e1000e_irq_legacy_notify(false);
+    pci_set_irq(core->owner, 0);
+}
+
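+/*
+ * Deliver an MSI-X notification for a flat vector number. Sketch of the
+ * mapping, assuming IGBVF_MSIX_VEC_NUM is 3: each group of three vectors
+ * starting at vector 1 belongs to one instantiated VF, counted from the
+ * last VF down (vectors 1..3 map to VF 7, 4..6 to VF 6, and so on), which
+ * is what the "8 - (vector + 2) / IGBVF_MSIX_VEC_NUM" arithmetic encodes;
+ * anything else is delivered to the PF.
+ */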
+static void igb_msix_notify(IGBCore *core, unsigned int vector)
+{
+    PCIDevice *dev = core->owner;
+    uint16_t vfn;
+
+    vfn = 8 - (vector + 2) / IGBVF_MSIX_VEC_NUM;
+    if (vfn < pcie_sriov_num_vfs(core->owner)) {
+        dev = pcie_sriov_get_vf_at_index(core->owner, vfn);
+        assert(dev);
+        vector = (vector + 2) % IGBVF_MSIX_VEC_NUM;
+    } else if (vector >= IGB_MSIX_VEC_NUM) {
+        qemu_log_mask(LOG_GUEST_ERROR,
+                      "igb: Tried to use a vector unavailable for the PF\n");
+        return;
+    }
+
+    msix_notify(dev, vector);
+}
+
+static inline void
+igb_intrmgr_rearm_timer(IGBIntrDelayTimer *timer)
+{
+    int64_t delay_ns = (int64_t) timer->core->mac[timer->delay_reg] *
+                                 timer->delay_resolution_ns;
+
+    trace_e1000e_irq_rearm_timer(timer->delay_reg << 2, delay_ns);
+
+    timer_mod(timer->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + delay_ns);
+
+    timer->running = true;
+}
+
+static void
+igb_intmgr_timer_resume(IGBIntrDelayTimer *timer)
+{
+    if (timer->running) {
+        igb_intrmgr_rearm_timer(timer);
+    }
+}
+
+static void
+igb_intmgr_timer_pause(IGBIntrDelayTimer *timer)
+{
+    if (timer->running) {
+        timer_del(timer->timer);
+    }
+}
+
+static void
+igb_intrmgr_on_msix_throttling_timer(void *opaque)
+{
+    IGBIntrDelayTimer *timer = opaque;
+    int idx = timer - &timer->core->eitr[0];
+
+    timer->running = false;
+
+    trace_e1000e_irq_msix_notify_postponed_vec(idx);
+    igb_msix_notify(timer->core, idx);
+}
+
+static void
+igb_intrmgr_initialize_all_timers(IGBCore *core, bool create)
+{
+    int i;
+
+    for (i = 0; i < IGB_INTR_NUM; i++) {
+        core->eitr[i].core = core;
+        core->eitr[i].delay_reg = EITR0 + i;
+        core->eitr[i].delay_resolution_ns = E1000_INTR_DELAY_NS_RES;
+    }
+
+    if (!create) {
+        return;
+    }
+
+    for (i = 0; i < IGB_INTR_NUM; i++) {
+        core->eitr[i].timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
+                                           igb_intrmgr_on_msix_throttling_timer,
+                                           &core->eitr[i]);
+    }
+}
+
+static void
+igb_intrmgr_resume(IGBCore *core)
+{
+    int i;
+
+    for (i = 0; i < IGB_INTR_NUM; i++) {
+        igb_intmgr_timer_resume(&core->eitr[i]);
+    }
+}
+
+static void
+igb_intrmgr_pause(IGBCore *core)
+{
+    int i;
+
+    for (i = 0; i < IGB_INTR_NUM; i++) {
+        igb_intmgr_timer_pause(&core->eitr[i]);
+    }
+}
+
+static void
+igb_intrmgr_reset(IGBCore *core)
+{
+    int i;
+
+    for (i = 0; i < IGB_INTR_NUM; i++) {
+        if (core->eitr[i].running) {
+            timer_del(core->eitr[i].timer);
+            igb_intrmgr_on_msix_throttling_timer(&core->eitr[i]);
+        }
+    }
+}
+
+static void
+igb_intrmgr_pci_unint(IGBCore *core)
+{
+    int i;
+
+    for (i = 0; i < IGB_INTR_NUM; i++) {
+        timer_free(core->eitr[i].timer);
+    }
+}
+
+static void
+igb_intrmgr_pci_realize(IGBCore *core)
+{
+    igb_intrmgr_initialize_all_timers(core, true);
+}
+
+static inline bool
+igb_rx_csum_enabled(IGBCore *core)
+{
+    return (core->mac[RXCSUM] & E1000_RXCSUM_PCSD) ? false : true;
+}
+
+static inline bool
+igb_rx_use_legacy_descriptor(IGBCore *core)
+{
+    /*
+     * TODO: If SRRCTL[n],DESCTYPE = 000b, the 82576 uses the legacy Rx
+     * descriptor.
+     */
+    return false;
+}
+
+static inline bool
+igb_rss_enabled(IGBCore *core)
+{
+    return (core->mac[MRQC] & 3) == E1000_MRQC_ENABLE_RSS_MQ &&
+           !igb_rx_csum_enabled(core) &&
+           !igb_rx_use_legacy_descriptor(core);
+}
+
+typedef struct E1000E_RSSInfo_st {
+    bool enabled;
+    uint32_t hash;
+    uint32_t queue;
+    uint32_t type;
+} E1000E_RSSInfo;
+
+static uint32_t
+igb_rss_get_hash_type(IGBCore *core, struct NetRxPkt *pkt)
+{
+    bool hasip4, hasip6;
+    EthL4HdrProto l4hdr_proto;
+
+    assert(igb_rss_enabled(core));
+
+    net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto);
+
+    if (hasip4) {
+        trace_e1000e_rx_rss_ip4(l4hdr_proto, core->mac[MRQC],
+                                E1000_MRQC_EN_TCPIPV4(core->mac[MRQC]),
+                                E1000_MRQC_EN_IPV4(core->mac[MRQC]));
+
+        if (l4hdr_proto == ETH_L4_HDR_PROTO_TCP &&
+            E1000_MRQC_EN_TCPIPV4(core->mac[MRQC])) {
+            return E1000_MRQ_RSS_TYPE_IPV4TCP;
+        }
+
+        if (E1000_MRQC_EN_IPV4(core->mac[MRQC])) {
+            return E1000_MRQ_RSS_TYPE_IPV4;
+        }
+    } else if (hasip6) {
+        eth_ip6_hdr_info *ip6info = net_rx_pkt_get_ip6_info(pkt);
+
+        bool ex_dis = core->mac[RFCTL] & E1000_RFCTL_IPV6_EX_DIS;
+        bool new_ex_dis = core->mac[RFCTL] & E1000_RFCTL_NEW_IPV6_EXT_DIS;
+
+        /*
+         * The following two traces must not be combined, because the
+         * resulting event would have 11 arguments in total and some
+         * trace backends (at least "ust") are limited to 10 arguments
+         * per event; events with more arguments fail to compile for
+         * such backends.
+         */
+        trace_e1000e_rx_rss_ip6_rfctl(core->mac[RFCTL]);
+        trace_e1000e_rx_rss_ip6(ex_dis, new_ex_dis, l4hdr_proto,
+                                ip6info->has_ext_hdrs,
+                                ip6info->rss_ex_dst_valid,
+                                ip6info->rss_ex_src_valid,
+                                core->mac[MRQC],
+                                E1000_MRQC_EN_TCPIPV6(core->mac[MRQC]),
+                                E1000_MRQC_EN_IPV6EX(core->mac[MRQC]),
+                                E1000_MRQC_EN_IPV6(core->mac[MRQC]));
+
+        if ((!ex_dis || !ip6info->has_ext_hdrs) &&
+            (!new_ex_dis || !(ip6info->rss_ex_dst_valid ||
+                              ip6info->rss_ex_src_valid))) {
+
+            if (l4hdr_proto == ETH_L4_HDR_PROTO_TCP &&
+                E1000_MRQC_EN_TCPIPV6(core->mac[MRQC])) {
+                return E1000_MRQ_RSS_TYPE_IPV6TCP;
+            }
+
+            if (E1000_MRQC_EN_IPV6EX(core->mac[MRQC])) {
+                return E1000_MRQ_RSS_TYPE_IPV6EX;
+            }
+
+        }
+
+        if (E1000_MRQC_EN_IPV6(core->mac[MRQC])) {
+            return E1000_MRQ_RSS_TYPE_IPV6;
+        }
+
+    }
+
+    return E1000_MRQ_RSS_TYPE_NONE;
+}
+
+static uint32_t
+igb_rss_calc_hash(IGBCore *core, struct NetRxPkt *pkt, E1000E_RSSInfo *info)
+{
+    NetRxPktRssType type;
+
+    assert(igb_rss_enabled(core));
+
+    switch (info->type) {
+    case E1000_MRQ_RSS_TYPE_IPV4:
+        type = NetPktRssIpV4;
+        break;
+    case E1000_MRQ_RSS_TYPE_IPV4TCP:
+        type = NetPktRssIpV4Tcp;
+        break;
+    case E1000_MRQ_RSS_TYPE_IPV6TCP:
+        type = NetPktRssIpV6TcpEx;
+        break;
+    case E1000_MRQ_RSS_TYPE_IPV6:
+        type = NetPktRssIpV6;
+        break;
+    case E1000_MRQ_RSS_TYPE_IPV6EX:
+        type = NetPktRssIpV6Ex;
+        break;
+    default:
+        assert(false);
+        return 0;
+    }
+
+    return net_rx_pkt_calc_rss_hash(pkt, type, (uint8_t *) &core->mac[RSSRK]);
+}
+
+static void
+igb_rss_parse_packet(IGBCore *core, struct NetRxPkt *pkt, bool tx,
+                     E1000E_RSSInfo *info)
+{
+    trace_e1000e_rx_rss_started();
+
+    if (tx || !igb_rss_enabled(core)) {
+        info->enabled = false;
+        info->hash = 0;
+        info->queue = 0;
+        info->type = 0;
+        trace_e1000e_rx_rss_disabled();
+        return;
+    }
+
+    info->enabled = true;
+
+    info->type = igb_rss_get_hash_type(core, pkt);
+
+    trace_e1000e_rx_rss_type(info->type);
+
+    if (info->type == E1000_MRQ_RSS_TYPE_NONE) {
+        info->hash = 0;
+        info->queue = 0;
+        return;
+    }
+
+    info->hash = igb_rss_calc_hash(core, pkt, info);
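+    /*
+     * Standard RSS: the hash indexes the redirection table (RETA) to pick
+     * the destination queue.
+     */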
+    info->queue = E1000_RSS_QUEUE(&core->mac[RETA], info->hash);
+}
+
+static bool
+igb_setup_tx_offloads(IGBCore *core, struct igb_tx *tx)
+{
+    if (tx->tse) {
+        if (!net_tx_pkt_build_vheader(tx->tx_pkt, true, true, tx->mss)) {
+            return false;
+        }
+
+        net_tx_pkt_update_ip_checksums(tx->tx_pkt);
+        e1000x_inc_reg_if_not_full(core->mac, TSCTC);
+        return true;
+    }
+
+    if (tx->txsm) {
+        if (!net_tx_pkt_build_vheader(tx->tx_pkt, false, true, 0)) {
+            return false;
+        }
+    }
+
+    if (tx->ixsm) {
+        net_tx_pkt_update_ip_hdr_checksum(tx->tx_pkt);
+    }
+
+    return true;
+}
+
+static void igb_tx_pkt_mac_callback(void *core,
+                                    const struct iovec *iov,
+                                    int iovcnt,
+                                    const struct iovec *virt_iov,
+                                    int virt_iovcnt)
+{
+    igb_receive_internal(core, virt_iov, virt_iovcnt, true, NULL);
+}
+
+static void igb_tx_pkt_vmdq_callback(void *opaque,
+                                     const struct iovec *iov,
+                                     int iovcnt,
+                                     const struct iovec *virt_iov,
+                                     int virt_iovcnt)
+{
+    IGBTxPktVmdqCallbackContext *context = opaque;
+    bool external_tx;
+
+    igb_receive_internal(context->core, virt_iov, virt_iovcnt, true,
+                         &external_tx);
+
+    if (external_tx) {
+        if (context->core->has_vnet) {
+            qemu_sendv_packet(context->nc, virt_iov, virt_iovcnt);
+        } else {
+            qemu_sendv_packet(context->nc, iov, iovcnt);
+        }
+    }
+}
+
+/* TX Packets Switching (7.10.3.6) */
+static bool igb_tx_pkt_switch(IGBCore *core, struct igb_tx *tx,
+                              NetClientState *nc)
+{
+    IGBTxPktVmdqCallbackContext context;
+
+    /* TX switching is only used to serve VM to VM traffic. */
+    if (!(core->mac[MRQC] & 1)) {
+        goto send_out;
+    }
+
+    /* TX switching requires the DTXSWC.Loopback_en bit to be set. */
+    if (!(core->mac[DTXSWC] & E1000_DTXSWC_VMDQ_LOOPBACK_EN)) {
+        goto send_out;
+    }
+
+    context.core = core;
+    context.nc = nc;
+
+    return net_tx_pkt_send_custom(tx->tx_pkt, false,
+                                  igb_tx_pkt_vmdq_callback, &context);
+
+send_out:
+    return net_tx_pkt_send(tx->tx_pkt, nc);
+}
+
+static bool
+igb_tx_pkt_send(IGBCore *core, struct igb_tx *tx, int queue_index)
+{
+    int target_queue = MIN(core->max_queue_num, queue_index);
+    NetClientState *queue = qemu_get_subqueue(core->owner_nic, target_queue);
+
+    if (!igb_setup_tx_offloads(core, tx)) {
+        return false;
+    }
+
+    net_tx_pkt_dump(tx->tx_pkt);
+
+    if ((core->phy[MII_BMCR] & MII_BMCR_LOOPBACK) ||
+        ((core->mac[RCTL] & E1000_RCTL_LBM_MAC) == E1000_RCTL_LBM_MAC)) {
+        return net_tx_pkt_send_custom(tx->tx_pkt, false,
+                                      igb_tx_pkt_mac_callback, core);
+    } else {
+        return igb_tx_pkt_switch(core, tx, queue);
+    }
+}
+
+static void
+igb_on_tx_done_update_stats(IGBCore *core, struct NetTxPkt *tx_pkt)
+{
+    static const int PTCregs[6] = { PTC64, PTC127, PTC255, PTC511,
+                                    PTC1023, PTC1522 };
+
+    size_t tot_len = net_tx_pkt_get_total_len(tx_pkt) + 4;
+
+    e1000x_increase_size_stats(core->mac, PTCregs, tot_len);
+    e1000x_inc_reg_if_not_full(core->mac, TPT);
+    e1000x_grow_8reg_if_not_full(core->mac, TOTL, tot_len);
+
+    switch (net_tx_pkt_get_packet_type(tx_pkt)) {
+    case ETH_PKT_BCAST:
+        e1000x_inc_reg_if_not_full(core->mac, BPTC);
+        break;
+    case ETH_PKT_MCAST:
+        e1000x_inc_reg_if_not_full(core->mac, MPTC);
+        break;
+    case ETH_PKT_UCAST:
+        break;
+    default:
+        g_assert_not_reached();
+    }
+
+    core->mac[GPTC] = core->mac[TPT];
+    core->mac[GOTCL] = core->mac[TOTL];
+    core->mac[GOTCH] = core->mac[TOTH];
+}
+
+static void
+igb_process_tx_desc(IGBCore *core,
+                    struct igb_tx *tx,
+                    union e1000_adv_tx_desc *tx_desc,
+                    int queue_index)
+{
+    struct e1000_adv_tx_context_desc *tx_ctx_desc;
+    uint32_t cmd_type_len;
+    uint32_t olinfo_status;
+    uint64_t buffer_addr;
+    uint16_t length;
+
+    cmd_type_len = le32_to_cpu(tx_desc->read.cmd_type_len);
+
+    if (cmd_type_len & E1000_ADVTXD_DCMD_DEXT) {
+        if ((cmd_type_len & E1000_ADVTXD_DTYP_DATA) ==
+            E1000_ADVTXD_DTYP_DATA) {
+            /* advanced transmit data descriptor */
+            if (tx->first) {
+                olinfo_status = le32_to_cpu(tx_desc->read.olinfo_status);
+
+                tx->tse = !!(cmd_type_len & E1000_ADVTXD_DCMD_TSE);
+                tx->ixsm = !!(olinfo_status & E1000_ADVTXD_POTS_IXSM);
+                tx->txsm = !!(olinfo_status & E1000_ADVTXD_POTS_TXSM);
+
+                tx->first = false;
+            }
+        } else if ((cmd_type_len & E1000_ADVTXD_DTYP_CTXT) ==
+                   E1000_ADVTXD_DTYP_CTXT) {
+            /* advanced transmit context descriptor */
+            tx_ctx_desc = (struct e1000_adv_tx_context_desc *)tx_desc;
+            tx->vlan = le32_to_cpu(tx_ctx_desc->vlan_macip_lens) >> 16;
+            tx->mss = le32_to_cpu(tx_ctx_desc->mss_l4len_idx) >> 16;
+            return;
+        } else {
+            /* unknown descriptor type */
+            return;
+        }
+    } else {
+        /* legacy descriptor */
+
+        /* TODO: Implement support for legacy descriptors (7.2.2.1). */
+    }
+
+    buffer_addr = le64_to_cpu(tx_desc->read.buffer_addr);
+    length = cmd_type_len & 0xFFFF;
+
+    if (!tx->skip_cp) {
+        if (!net_tx_pkt_add_raw_fragment(tx->tx_pkt, buffer_addr, length)) {
+            tx->skip_cp = true;
+        }
+    }
+
+    if (cmd_type_len & E1000_TXD_CMD_EOP) {
+        if (!tx->skip_cp && net_tx_pkt_parse(tx->tx_pkt)) {
+            if (cmd_type_len & E1000_TXD_CMD_VLE) {
+                net_tx_pkt_setup_vlan_header_ex(tx->tx_pkt, tx->vlan,
+                    core->mac[VET] & 0xffff);
+            }
+            if (igb_tx_pkt_send(core, tx, queue_index)) {
+                igb_on_tx_done_update_stats(core, tx->tx_pkt);
+            }
+        }
+
+        tx->first = true;
+        tx->skip_cp = false;
+        net_tx_pkt_reset(tx->tx_pkt);
+    }
+}
+
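+/*
+ * Look up the EICR bit allocated to a queue in the IVAR table: each 32-bit
+ * IVAR register packs four 8-bit entries, E1000_IVAR_VALID marks an entry
+ * as allocated, and the entry's low five bits select the EICR bit.
+ */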
+static uint32_t igb_tx_wb_eic(IGBCore *core, int queue_idx)
+{
+    uint32_t n, ent = 0;
+
+    n = igb_ivar_entry_tx(queue_idx);
+    ent = (core->mac[IVAR0 + n / 4] >> (8 * (n % 4))) & 0xff;
+
+    return (ent & E1000_IVAR_VALID) ? BIT(ent & 0x1f) : 0;
+}
+
+static uint32_t igb_rx_wb_eic(IGBCore *core, int queue_idx)
+{
+    uint32_t n, ent = 0;
+
+    n = igb_ivar_entry_rx(queue_idx);
+    ent = (core->mac[IVAR0 + n / 4] >> (8 * (n % 4))) & 0xff;
+
+    return (ent & E1000_IVAR_VALID) ? BIT(ent & 0x1f) : 0;
+}
+
+typedef struct E1000E_RingInfo_st {
+    int dbah;
+    int dbal;
+    int dlen;
+    int dh;
+    int dt;
+    int idx;
+} E1000E_RingInfo;
+
+static inline bool
+igb_ring_empty(IGBCore *core, const E1000E_RingInfo *r)
+{
+    return core->mac[r->dh] == core->mac[r->dt] ||
+                core->mac[r->dt] >= core->mac[r->dlen] / E1000_RING_DESC_LEN;
+}
+
+static inline uint64_t
+igb_ring_base(IGBCore *core, const E1000E_RingInfo *r)
+{
+    uint64_t bah = core->mac[r->dbah];
+    uint64_t bal = core->mac[r->dbal];
+
+    return (bah << 32) + bal;
+}
+
+static inline uint64_t
+igb_ring_head_descr(IGBCore *core, const E1000E_RingInfo *r)
+{
+    return igb_ring_base(core, r) + E1000_RING_DESC_LEN * core->mac[r->dh];
+}
+
+static inline void
+igb_ring_advance(IGBCore *core, const E1000E_RingInfo *r, uint32_t count)
+{
+    core->mac[r->dh] += count;
+
+    if (core->mac[r->dh] * E1000_RING_DESC_LEN >= core->mac[r->dlen]) {
+        core->mac[r->dh] = 0;
+    }
+}
+
+static inline uint32_t
+igb_ring_free_descr_num(IGBCore *core, const E1000E_RingInfo *r)
+{
+    trace_e1000e_ring_free_space(r->idx, core->mac[r->dlen],
+                                 core->mac[r->dh],  core->mac[r->dt]);
+
+    if (core->mac[r->dh] <= core->mac[r->dt]) {
+        return core->mac[r->dt] - core->mac[r->dh];
+    }
+
+    if (core->mac[r->dh] > core->mac[r->dt]) {
+        return core->mac[r->dlen] / E1000_RING_DESC_LEN +
+               core->mac[r->dt] - core->mac[r->dh];
+    }
+
+    g_assert_not_reached();
+    return 0;
+}
+
+static inline bool
+igb_ring_enabled(IGBCore *core, const E1000E_RingInfo *r)
+{
+    return core->mac[r->dlen] > 0;
+}
+
+typedef struct IGB_TxRing_st {
+    const E1000E_RingInfo *i;
+    struct igb_tx *tx;
+} IGB_TxRing;
+
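+/*
+ * Per-queue registers are spaced 0x40 bytes (16 dwords) apart, so a
+ * queue's number can be recovered from any of its registers' dword
+ * indices.
+ */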
+static inline int
+igb_mq_queue_idx(int base_reg_idx, int reg_idx)
+{
+    return (reg_idx - base_reg_idx) / 16;
+}
+
+static inline void
+igb_tx_ring_init(IGBCore *core, IGB_TxRing *txr, int idx)
+{
+    static const E1000E_RingInfo i[IGB_NUM_QUEUES] = {
+        { TDBAH0, TDBAL0, TDLEN0, TDH0, TDT0, 0 },
+        { TDBAH1, TDBAL1, TDLEN1, TDH1, TDT1, 1 },
+        { TDBAH2, TDBAL2, TDLEN2, TDH2, TDT2, 2 },
+        { TDBAH3, TDBAL3, TDLEN3, TDH3, TDT3, 3 },
+        { TDBAH4, TDBAL4, TDLEN4, TDH4, TDT4, 4 },
+        { TDBAH5, TDBAL5, TDLEN5, TDH5, TDT5, 5 },
+        { TDBAH6, TDBAL6, TDLEN6, TDH6, TDT6, 6 },
+        { TDBAH7, TDBAL7, TDLEN7, TDH7, TDT7, 7 },
+        { TDBAH8, TDBAL8, TDLEN8, TDH8, TDT8, 8 },
+        { TDBAH9, TDBAL9, TDLEN9, TDH9, TDT9, 9 },
+        { TDBAH10, TDBAL10, TDLEN10, TDH10, TDT10, 10 },
+        { TDBAH11, TDBAL11, TDLEN11, TDH11, TDT11, 11 },
+        { TDBAH12, TDBAL12, TDLEN12, TDH12, TDT12, 12 },
+        { TDBAH13, TDBAL13, TDLEN13, TDH13, TDT13, 13 },
+        { TDBAH14, TDBAL14, TDLEN14, TDH14, TDT14, 14 },
+        { TDBAH15, TDBAL15, TDLEN15, TDH15, TDT15, 15 }
+    };
+
+    assert(idx < ARRAY_SIZE(i));
+
+    txr->i     = &i[idx];
+    txr->tx    = &core->tx[idx];
+}
+
+typedef struct E1000E_RxRing_st {
+    const E1000E_RingInfo *i;
+} E1000E_RxRing;
+
+static inline void
+igb_rx_ring_init(IGBCore *core, E1000E_RxRing *rxr, int idx)
+{
+    static const E1000E_RingInfo i[IGB_NUM_QUEUES] = {
+        { RDBAH0, RDBAL0, RDLEN0, RDH0, RDT0, 0 },
+        { RDBAH1, RDBAL1, RDLEN1, RDH1, RDT1, 1 },
+        { RDBAH2, RDBAL2, RDLEN2, RDH2, RDT2, 2 },
+        { RDBAH3, RDBAL3, RDLEN3, RDH3, RDT3, 3 },
+        { RDBAH4, RDBAL4, RDLEN4, RDH4, RDT4, 4 },
+        { RDBAH5, RDBAL5, RDLEN5, RDH5, RDT5, 5 },
+        { RDBAH6, RDBAL6, RDLEN6, RDH6, RDT6, 6 },
+        { RDBAH7, RDBAL7, RDLEN7, RDH7, RDT7, 7 },
+        { RDBAH8, RDBAL8, RDLEN8, RDH8, RDT8, 8 },
+        { RDBAH9, RDBAL9, RDLEN9, RDH9, RDT9, 9 },
+        { RDBAH10, RDBAL10, RDLEN10, RDH10, RDT10, 10 },
+        { RDBAH11, RDBAL11, RDLEN11, RDH11, RDT11, 11 },
+        { RDBAH12, RDBAL12, RDLEN12, RDH12, RDT12, 12 },
+        { RDBAH13, RDBAL13, RDLEN13, RDH13, RDT13, 13 },
+        { RDBAH14, RDBAL14, RDLEN14, RDH14, RDT14, 14 },
+        { RDBAH15, RDBAL15, RDLEN15, RDH15, RDT15, 15 }
+    };
+
+    assert(idx < ARRAY_SIZE(i));
+
+    rxr->i      = &i[idx];
+}
+
+static uint32_t
+igb_txdesc_writeback(IGBCore *core, dma_addr_t base,
+                     union e1000_adv_tx_desc *tx_desc,
+                     const E1000E_RingInfo *txi)
+{
+    PCIDevice *d;
+    uint32_t cmd_type_len = le32_to_cpu(tx_desc->read.cmd_type_len);
+    uint64_t tdwba;
+
+    tdwba = core->mac[E1000_TDWBAL(txi->idx) >> 2];
+    tdwba |= (uint64_t)core->mac[E1000_TDWBAH(txi->idx) >> 2] << 32;
+
+    if (!(cmd_type_len & E1000_TXD_CMD_RS)) {
+        return 0;
+    }
+
+    d = pcie_sriov_get_vf_at_index(core->owner, txi->idx % 8);
+    if (!d) {
+        d = core->owner;
+    }
+
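+    /* Bit 0 of TDWBAL enables head write-back; the remaining TDWBAL/TDWBAH
+     * bits form the address the ring head is mirrored to. */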
+    if (tdwba & 1) {
+        uint32_t buffer = cpu_to_le32(core->mac[txi->dh]);
+        pci_dma_write(d, tdwba & ~3, &buffer, sizeof(buffer));
+    } else {
+        uint32_t status = le32_to_cpu(tx_desc->wb.status) | E1000_TXD_STAT_DD;
+
+        tx_desc->wb.status = cpu_to_le32(status);
+        pci_dma_write(d, base + offsetof(union e1000_adv_tx_desc, wb),
+            &tx_desc->wb, sizeof(tx_desc->wb));
+    }
+
+    return igb_tx_wb_eic(core, txi->idx);
+}
+
+static void
+igb_start_xmit(IGBCore *core, const IGB_TxRing *txr)
+{
+    PCIDevice *d;
+    dma_addr_t base;
+    union e1000_adv_tx_desc desc;
+    const E1000E_RingInfo *txi = txr->i;
+    uint32_t eic = 0;
+
+    /* TODO: check if the queue itself is enabled too. */
+    if (!(core->mac[TCTL] & E1000_TCTL_EN)) {
+        trace_e1000e_tx_disabled();
+        return;
+    }
+
+    d = pcie_sriov_get_vf_at_index(core->owner, txi->idx % 8);
+    if (!d) {
+        d = core->owner;
+    }
+
+    while (!igb_ring_empty(core, txi)) {
+        base = igb_ring_head_descr(core, txi);
+
+        pci_dma_read(d, base, &desc, sizeof(desc));
+
+        trace_e1000e_tx_descr((void *)(intptr_t)desc.read.buffer_addr,
+                              desc.read.cmd_type_len, desc.wb.status);
+
+        igb_process_tx_desc(core, txr->tx, &desc, txi->idx);
+        igb_ring_advance(core, txi, 1);
+        eic |= igb_txdesc_writeback(core, base, &desc, txi);
+    }
+
+    if (eic) {
+        core->mac[EICR] |= eic;
+        igb_set_interrupt_cause(core, E1000_ICR_TXDW);
+    }
+}
+
+static uint32_t
+igb_rxbufsize(IGBCore *core, const E1000E_RingInfo *r)
+{
+    uint32_t srrctl = core->mac[E1000_SRRCTL(r->idx) >> 2];
+    uint32_t bsizepkt = srrctl & E1000_SRRCTL_BSIZEPKT_MASK;
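+    /* BSIZEPKT is expressed in 1 KiB units (E1000_SRRCTL_BSIZEPKT_SHIFT is
+     * assumed to be 10); zero falls back to the legacy RCTL buffer sizes. */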
+    if (bsizepkt) {
+        return bsizepkt << E1000_SRRCTL_BSIZEPKT_SHIFT;
+    }
+
+    return e1000x_rxbufsize(core->mac[RCTL]);
+}
+
+static bool
+igb_has_rxbufs(IGBCore *core, const E1000E_RingInfo *r, size_t total_size)
+{
+    uint32_t bufs = igb_ring_free_descr_num(core, r);
+    uint32_t bufsize = igb_rxbufsize(core, r);
+
+    trace_e1000e_rx_has_buffers(r->idx, bufs, total_size, bufsize);
+
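+    /* Scale the free-descriptor count by the descriptor-size ratio, then
+     * check that the whole packet fits in the buffers that remain. */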
+    return total_size <= bufs / (core->rx_desc_len / E1000_MIN_RX_DESC_LEN) *
+                         bufsize;
+}
+
+void
+igb_start_recv(IGBCore *core)
+{
+    int i;
+
+    trace_e1000e_rx_start_recv();
+
+    for (i = 0; i <= core->max_queue_num; i++) {
+        qemu_flush_queued_packets(qemu_get_subqueue(core->owner_nic, i));
+    }
+}
+
+bool
+igb_can_receive(IGBCore *core)
+{
+    int i;
+
+    if (!e1000x_rx_ready(core->owner, core->mac)) {
+        return false;
+    }
+
+    for (i = 0; i < IGB_NUM_QUEUES; i++) {
+        E1000E_RxRing rxr;
+
+        igb_rx_ring_init(core, &rxr, i);
+        if (igb_ring_enabled(core, rxr.i) && igb_has_rxbufs(core, rxr.i, 1)) {
+            trace_e1000e_rx_can_recv();
+            return true;
+        }
+    }
+
+    trace_e1000e_rx_can_recv_rings_full();
+    return false;
+}
+
+ssize_t
+igb_receive(IGBCore *core, const uint8_t *buf, size_t size)
+{
+    const struct iovec iov = {
+        .iov_base = (uint8_t *)buf,
+        .iov_len = size
+    };
+
+    return igb_receive_iov(core, &iov, 1);
+}
+
+static inline bool
+igb_rx_l3_cso_enabled(IGBCore *core)
+{
+    return !!(core->mac[RXCSUM] & E1000_RXCSUM_IPOFLD);
+}
+
+static inline bool
+igb_rx_l4_cso_enabled(IGBCore *core)
+{
+    return !!(core->mac[RXCSUM] & E1000_RXCSUM_TUOFLD);
+}
+
+static uint16_t igb_receive_assign(IGBCore *core, const struct eth_header *ehdr,
+                                   E1000E_RSSInfo *rss_info, bool *external_tx)
+{
+    static const int ta_shift[] = { 4, 3, 2, 0 };
+    uint32_t f, ra[2], *macp, rctl = core->mac[RCTL];
+    uint16_t queues = 0;
+    uint16_t vid = lduw_be_p(&PKT_GET_VLAN_HDR(ehdr)->h_tci) & VLAN_VID_MASK;
+    bool accepted = false;
+    int i;
+
+    memset(rss_info, 0, sizeof(E1000E_RSSInfo));
+
+    if (external_tx) {
+        *external_tx = true;
+    }
+
+    if (e1000x_is_vlan_packet(ehdr, core->mac[VET] & 0xffff) &&
+        e1000x_vlan_rx_filter_enabled(core->mac)) {
+        uint32_t vfta =
+            ldl_le_p((uint32_t *)(core->mac + VFTA) +
+                     ((vid >> E1000_VFTA_ENTRY_SHIFT) & E1000_VFTA_ENTRY_MASK));
+        if ((vfta & (1 << (vid & E1000_VFTA_ENTRY_BIT_SHIFT_MASK))) == 0) {
+            trace_e1000e_rx_flt_vlan_mismatch(vid);
+            return queues;
+        } else {
+            trace_e1000e_rx_flt_vlan_match(vid);
+        }
+    }
+
+    if (core->mac[MRQC] & 1) {
+        if (is_broadcast_ether_addr(ehdr->h_dest)) {
+            for (i = 0; i < 8; i++) {
+                if (core->mac[VMOLR0 + i] & E1000_VMOLR_BAM) {
+                    queues |= BIT(i);
+                }
+            }
+        } else {
+            for (macp = core->mac + RA; macp < core->mac + RA + 32; macp += 2) {
+                if (!(macp[1] & E1000_RAH_AV)) {
+                    continue;
+                }
+                ra[0] = cpu_to_le32(macp[0]);
+                ra[1] = cpu_to_le32(macp[1]);
+                if (!memcmp(ehdr->h_dest, (uint8_t *)ra, ETH_ALEN)) {
+                    queues |= (macp[1] & E1000_RAH_POOL_MASK) / E1000_RAH_POOL_1;
+                }
+            }
+
+            for (macp = core->mac + RA2; macp < core->mac + RA2 + 16; macp += 2) {
+                if (!(macp[1] & E1000_RAH_AV)) {
+                    continue;
+                }
+                ra[0] = cpu_to_le32(macp[0]);
+                ra[1] = cpu_to_le32(macp[1]);
+                if (!memcmp(ehdr->h_dest, (uint8_t *)ra, ETH_ALEN)) {
+                    queues |= (macp[1] & E1000_RAH_POOL_MASK) / E1000_RAH_POOL_1;
+                }
+            }
+
+            if (!queues) {
+                macp = core->mac + (is_multicast_ether_addr(ehdr->h_dest) ? MTA : UTA);
+
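+                /*
+                 * Hash the destination address into the 4096-bit MTA/UTA,
+                 * using the bit offset selected by RCTL.MO.
+                 */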
+                f = ta_shift[(rctl >> E1000_RCTL_MO_SHIFT) & 3];
+                f = (((ehdr->h_dest[5] << 8) | ehdr->h_dest[4]) >> f) & 0xfff;
+                if (macp[f >> 5] & (1 << (f & 0x1f))) {
+                    for (i = 0; i < 8; i++) {
+                        if (core->mac[VMOLR0 + i] & E1000_VMOLR_ROMPE) {
+                            queues |= BIT(i);
+                        }
+                    }
+                }
+            } else if (is_unicast_ether_addr(ehdr->h_dest) && external_tx) {
+                *external_tx = false;
+            }
+        }
+
+        if (e1000x_vlan_rx_filter_enabled(core->mac)) {
+            uint16_t mask = 0;
+
+            if (e1000x_is_vlan_packet(ehdr, core->mac[VET] & 0xffff)) {
+                for (i = 0; i < E1000_VLVF_ARRAY_SIZE; i++) {
+                    if ((core->mac[VLVF0 + i] & E1000_VLVF_VLANID_MASK) == vid &&
+                        (core->mac[VLVF0 + i] & E1000_VLVF_VLANID_ENABLE)) {
+                        uint32_t poolsel = core->mac[VLVF0 + i] & E1000_VLVF_POOLSEL_MASK;
+                        mask |= poolsel >> E1000_VLVF_POOLSEL_SHIFT;
+                    }
+                }
+            } else {
+                for (i = 0; i < 8; i++) {
+                    if (core->mac[VMOLR0 + i] & E1000_VMOLR_AUPE) {
+                        mask |= BIT(i);
+                    }
+                }
+            }
+
+            queues &= mask;
+        }
+
+        if (is_unicast_ether_addr(ehdr->h_dest) && !queues && !external_tx &&
+            !(core->mac[VT_CTL] & E1000_VT_CTL_DISABLE_DEF_POOL)) {
+            uint32_t def_pl = core->mac[VT_CTL] & E1000_VT_CTL_DEFAULT_POOL_MASK;
+            queues = BIT(def_pl >> E1000_VT_CTL_DEFAULT_POOL_SHIFT);
+        }
+
+        igb_rss_parse_packet(core, core->rx_pkt, external_tx != NULL, rss_info);
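+        /* An odd RSS queue selects each pool's second ring (queues 8..15). */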
+        if (rss_info->queue & 1) {
+            queues <<= 8;
+        }
+    } else {
+        switch (net_rx_pkt_get_packet_type(core->rx_pkt)) {
+        case ETH_PKT_UCAST:
+            if (rctl & E1000_RCTL_UPE) {
+                accepted = true; /* promiscuous ucast */
+            }
+            break;
+
+        case ETH_PKT_BCAST:
+            if (rctl & E1000_RCTL_BAM) {
+                accepted = true; /* broadcast enabled */
+            }
+            break;
+
+        case ETH_PKT_MCAST:
+            if (rctl & E1000_RCTL_MPE) {
+                accepted = true; /* promiscuous mcast */
+            }
+            break;
+
+        default:
+            g_assert_not_reached();
+        }
+
+        if (!accepted) {
+            accepted = e1000x_rx_group_filter(core->mac, ehdr->h_dest);
+        }
+
+        if (!accepted) {
+            for (macp = core->mac + RA2; macp < core->mac + RA2 + 16; macp += 2) {
+                if (!(macp[1] & E1000_RAH_AV)) {
+                    continue;
+                }
+                ra[0] = cpu_to_le32(macp[0]);
+                ra[1] = cpu_to_le32(macp[1]);
+                if (!memcmp(ehdr->h_dest, (uint8_t *)ra, ETH_ALEN)) {
+                    trace_e1000x_rx_flt_ucast_match((int)(macp - core->mac - RA2) / 2,
+                                                    MAC_ARG(ehdr->h_dest));
+
+                    accepted = true;
+                    break;
+                }
+            }
+        }
+
+        if (accepted) {
+            igb_rss_parse_packet(core, core->rx_pkt, false, rss_info);
+            queues = BIT(rss_info->queue);
+        }
+    }
+
+    return queues;
+}
+
+static inline void
+igb_read_lgcy_rx_descr(IGBCore *core, struct e1000_rx_desc *desc,
+                       hwaddr *buff_addr)
+{
+    *buff_addr = le64_to_cpu(desc->buffer_addr);
+}
+
+static inline void
+igb_read_adv_rx_descr(IGBCore *core, union e1000_adv_rx_desc *desc,
+                      hwaddr *buff_addr)
+{
+    *buff_addr = le64_to_cpu(desc->read.pkt_addr);
+}
+
+static inline void
+igb_read_rx_descr(IGBCore *core, union e1000_rx_desc_union *desc,
+                  hwaddr *buff_addr)
+{
+    if (igb_rx_use_legacy_descriptor(core)) {
+        igb_read_lgcy_rx_descr(core, &desc->legacy, buff_addr);
+    } else {
+        igb_read_adv_rx_descr(core, &desc->adv, buff_addr);
+    }
+}
+
+static void
+igb_verify_csum_in_sw(IGBCore *core,
+                      struct NetRxPkt *pkt,
+                      uint32_t *status_flags,
+                      EthL4HdrProto l4hdr_proto)
+{
+    bool csum_valid;
+    uint32_t csum_error;
+
+    if (igb_rx_l3_cso_enabled(core)) {
+        if (!net_rx_pkt_validate_l3_csum(pkt, &csum_valid)) {
+            trace_e1000e_rx_metadata_l3_csum_validation_failed();
+        } else {
+            csum_error = csum_valid ? 0 : E1000_RXDEXT_STATERR_IPE;
+            *status_flags |= E1000_RXD_STAT_IPCS | csum_error;
+        }
+    } else {
+        trace_e1000e_rx_metadata_l3_cso_disabled();
+    }
+
+    if (!igb_rx_l4_cso_enabled(core)) {
+        trace_e1000e_rx_metadata_l4_cso_disabled();
+        return;
+    }
+
+    if (!net_rx_pkt_validate_l4_csum(pkt, &csum_valid)) {
+        trace_e1000e_rx_metadata_l4_csum_validation_failed();
+        return;
+    }
+
+    csum_error = csum_valid ? 0 : E1000_RXDEXT_STATERR_TCPE;
+    *status_flags |= E1000_RXD_STAT_TCPCS | csum_error;
+
+    if (l4hdr_proto == ETH_L4_HDR_PROTO_UDP) {
+        *status_flags |= E1000_RXD_STAT_UDPCS;
+    }
+}
+
+static void
+igb_build_rx_metadata(IGBCore *core,
+                      struct NetRxPkt *pkt,
+                      bool is_eop,
+                      const E1000E_RSSInfo *rss_info,
+                      uint16_t *pkt_info, uint16_t *hdr_info,
+                      uint32_t *rss,
+                      uint32_t *status_flags,
+                      uint16_t *ip_id,
+                      uint16_t *vlan_tag)
+{
+    struct virtio_net_hdr *vhdr;
+    bool hasip4, hasip6;
+    EthL4HdrProto l4hdr_proto;
+    uint32_t pkt_type;
+
+    *status_flags = E1000_RXD_STAT_DD;
+
+    /* No additional metadata is needed for non-EOP descriptors. */
+    /* TODO: EOP applies only to the status field, so don't skip the whole function. */
+    if (!is_eop) {
+        goto func_exit;
+    }
+
+    *status_flags |= E1000_RXD_STAT_EOP;
+
+    net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto);
+    trace_e1000e_rx_metadata_protocols(hasip4, hasip6, l4hdr_proto);
+
+    /* VLAN state */
+    if (net_rx_pkt_is_vlan_stripped(pkt)) {
+        *status_flags |= E1000_RXD_STAT_VP;
+        *vlan_tag = cpu_to_le16(net_rx_pkt_get_vlan_tag(pkt));
+        trace_e1000e_rx_metadata_vlan(*vlan_tag);
+    }
+
+    /* Packet parsing results */
+    if ((core->mac[RXCSUM] & E1000_RXCSUM_PCSD) != 0) {
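+        /* PCSD set: the RSS hash is reported in place of the IP ID field. */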
+        if (rss_info->enabled) {
+            *rss = cpu_to_le32(rss_info->hash);
+            trace_igb_rx_metadata_rss(*rss);
+        }
+    } else if (hasip4) {
+        *status_flags |= E1000_RXD_STAT_IPIDV;
+        *ip_id = cpu_to_le16(net_rx_pkt_get_ip_id(pkt));
+        trace_e1000e_rx_metadata_ip_id(*ip_id);
+    }
+
+    if (l4hdr_proto == ETH_L4_HDR_PROTO_TCP && net_rx_pkt_is_tcp_ack(pkt)) {
+        *status_flags |= E1000_RXD_STAT_ACK;
+        trace_e1000e_rx_metadata_ack();
+    }
+
+    if (hasip6 && (core->mac[RFCTL] & E1000_RFCTL_IPV6_DIS)) {
+        trace_e1000e_rx_metadata_ipv6_filtering_disabled();
+        pkt_type = E1000_RXD_PKT_MAC;
+    } else if (l4hdr_proto == ETH_L4_HDR_PROTO_TCP ||
+               l4hdr_proto == ETH_L4_HDR_PROTO_UDP) {
+        pkt_type = hasip4 ? E1000_RXD_PKT_IP4_XDP : E1000_RXD_PKT_IP6_XDP;
+    } else if (hasip4 || hasip6) {
+        pkt_type = hasip4 ? E1000_RXD_PKT_IP4 : E1000_RXD_PKT_IP6;
+    } else {
+        pkt_type = E1000_RXD_PKT_MAC;
+    }
+
+    trace_e1000e_rx_metadata_pkt_type(pkt_type);
+
+    if (pkt_info) {
+        if (rss_info->enabled) {
+            *pkt_info = rss_info->type;
+        }
+
+        *pkt_info |= (pkt_type << 4);
+    } else {
+        *status_flags |= E1000_RXD_PKT_TYPE(pkt_type);
+    }
+
+    if (hdr_info) {
+        *hdr_info = 0;
+    }
+
+    /* RX CSO information */
+    if (hasip6 && (core->mac[RFCTL] & E1000_RFCTL_IPV6_XSUM_DIS)) {
+        trace_e1000e_rx_metadata_ipv6_sum_disabled();
+        goto func_exit;
+    }
+
+    vhdr = net_rx_pkt_get_vhdr(pkt);
+
+    if (!(vhdr->flags & VIRTIO_NET_HDR_F_DATA_VALID) &&
+        !(vhdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) {
+        trace_e1000e_rx_metadata_virthdr_no_csum_info();
+        igb_verify_csum_in_sw(core, pkt, status_flags, l4hdr_proto);
+        goto func_exit;
+    }
+
+    if (igb_rx_l3_cso_enabled(core)) {
+        *status_flags |= hasip4 ? E1000_RXD_STAT_IPCS : 0;
+    } else {
+        trace_e1000e_rx_metadata_l3_cso_disabled();
+    }
+
+    if (igb_rx_l4_cso_enabled(core)) {
+        switch (l4hdr_proto) {
+        case ETH_L4_HDR_PROTO_TCP:
+            *status_flags |= E1000_RXD_STAT_TCPCS;
+            break;
+
+        case ETH_L4_HDR_PROTO_UDP:
+            *status_flags |= E1000_RXD_STAT_TCPCS | E1000_RXD_STAT_UDPCS;
+            break;
+
+        default:
+            goto func_exit;
+        }
+    } else {
+        trace_e1000e_rx_metadata_l4_cso_disabled();
+    }
+
+    trace_e1000e_rx_metadata_status_flags(*status_flags);
+
+func_exit:
+    *status_flags = cpu_to_le32(*status_flags);
+}
+
+static inline void
+igb_write_lgcy_rx_descr(IGBCore *core, struct e1000_rx_desc *desc,
+                        struct NetRxPkt *pkt,
+                        const E1000E_RSSInfo *rss_info,
+                        uint16_t length)
+{
+    uint32_t status_flags, rss;
+    uint16_t ip_id;
+
+    assert(!rss_info->enabled);
+    desc->length = cpu_to_le16(length);
+    desc->csum = 0;
+
+    igb_build_rx_metadata(core, pkt, pkt != NULL,
+                          rss_info,
+                          NULL, NULL, &rss,
+                          &status_flags, &ip_id,
+                          &desc->special);
+    desc->errors = (uint8_t) (le32_to_cpu(status_flags) >> 24);
+    desc->status = (uint8_t) le32_to_cpu(status_flags);
+}
+
+static inline void
+igb_write_adv_rx_descr(IGBCore *core, union e1000_adv_rx_desc *desc,
+                       struct NetRxPkt *pkt,
+                       const E1000E_RSSInfo *rss_info,
+                       uint16_t length)
+{
+    memset(&desc->wb, 0, sizeof(desc->wb));
+
+    desc->wb.upper.length = cpu_to_le16(length);
+
+    igb_build_rx_metadata(core, pkt, pkt != NULL,
+                          rss_info,
+                          &desc->wb.lower.lo_dword.pkt_info,
+                          &desc->wb.lower.lo_dword.hdr_info,
+                          &desc->wb.lower.hi_dword.rss,
+                          &desc->wb.upper.status_error,
+                          &desc->wb.lower.hi_dword.csum_ip.ip_id,
+                          &desc->wb.upper.vlan);
+}
+
+static inline void
+igb_write_rx_descr(IGBCore *core, union e1000_rx_desc_union *desc,
+                   struct NetRxPkt *pkt, const E1000E_RSSInfo *rss_info,
+                   uint16_t length)
+{
+    if (igb_rx_use_legacy_descriptor(core)) {
+        igb_write_lgcy_rx_descr(core, &desc->legacy, pkt, rss_info, length);
+    } else {
+        igb_write_adv_rx_descr(core, &desc->adv, pkt, rss_info, length);
+    }
+}
+
+static inline void
+igb_pci_dma_write_rx_desc(IGBCore *core, PCIDevice *dev, dma_addr_t addr,
+                          union e1000_rx_desc_union *desc, dma_addr_t len)
+{
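+    /*
+     * Write the descriptor with DD clear first, then store the status field
+     * separately so the guest never observes DD ahead of the rest.
+     */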
+    if (igb_rx_use_legacy_descriptor(core)) {
+        struct e1000_rx_desc *d = &desc->legacy;
+        size_t offset = offsetof(struct e1000_rx_desc, status);
+        uint8_t status = d->status;
+
+        d->status &= ~E1000_RXD_STAT_DD;
+        pci_dma_write(dev, addr, desc, len);
+
+        if (status & E1000_RXD_STAT_DD) {
+            d->status = status;
+            pci_dma_write(dev, addr + offset, &status, sizeof(status));
+        }
+    } else {
+        union e1000_adv_rx_desc *d = &desc->adv;
+        size_t offset =
+            offsetof(union e1000_adv_rx_desc, wb.upper.status_error);
+        uint32_t status = d->wb.upper.status_error;
+
+        d->wb.upper.status_error &= ~E1000_RXD_STAT_DD;
+        pci_dma_write(dev, addr, desc, len);
+
+        if (status & E1000_RXD_STAT_DD) {
+            d->wb.upper.status_error = status;
+            pci_dma_write(dev, addr + offset, &status, sizeof(status));
+        }
+    }
+}
+
+static void
+igb_write_to_rx_buffers(IGBCore *core,
+                        PCIDevice *d,
+                        hwaddr ba,
+                        uint16_t *written,
+                        const char *data,
+                        dma_addr_t data_len)
+{
+    trace_igb_rx_desc_buff_write(ba, *written, data, data_len);
+    pci_dma_write(d, ba + *written, data, data_len);
+    *written += data_len;
+}
+
+static void
+igb_update_rx_stats(IGBCore *core, size_t data_size, size_t data_fcs_size)
+{
+    e1000x_update_rx_total_stats(core->mac, data_size, data_fcs_size);
+
+    switch (net_rx_pkt_get_packet_type(core->rx_pkt)) {
+    case ETH_PKT_BCAST:
+        e1000x_inc_reg_if_not_full(core->mac, BPRC);
+        break;
+
+    case ETH_PKT_MCAST:
+        e1000x_inc_reg_if_not_full(core->mac, MPRC);
+        break;
+
+    default:
+        break;
+    }
+}
+
+static inline bool
+igb_rx_descr_threshold_hit(IGBCore *core, const E1000E_RingInfo *rxi)
+{
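+    /* SRRCTL bits 24:20 give the threshold, in units of 16 free descriptors. */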
+    return igb_ring_free_descr_num(core, rxi) ==
+           ((core->mac[E1000_SRRCTL(rxi->idx) >> 2] >> 20) & 31) * 16;
+}
+
+static void
+igb_write_packet_to_guest(IGBCore *core, struct NetRxPkt *pkt,
+                          const E1000E_RxRing *rxr,
+                          const E1000E_RSSInfo *rss_info)
+{
+    PCIDevice *d;
+    dma_addr_t base;
+    union e1000_rx_desc_union desc;
+    size_t desc_size;
+    size_t desc_offset = 0;
+    size_t iov_ofs = 0;
+
+    struct iovec *iov = net_rx_pkt_get_iovec(pkt);
+    size_t size = net_rx_pkt_get_total_len(pkt);
+    size_t total_size = size + e1000x_fcs_len(core->mac);
+    const E1000E_RingInfo *rxi = rxr->i;
+    size_t bufsize = igb_rxbufsize(core, rxi);
+
+    d = pcie_sriov_get_vf_at_index(core->owner, rxi->idx % 8);
+    if (!d) {
+        d = core->owner;
+    }
+
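+    /* Spread the packet, plus the simulated FCS, across as many descriptors as needed. */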
+    do {
+        hwaddr ba;
+        uint16_t written = 0;
+        bool is_last = false;
+
+        desc_size = total_size - desc_offset;
+
+        if (desc_size > bufsize) {
+            desc_size = bufsize;
+        }
+
+        if (igb_ring_empty(core, rxi)) {
+            return;
+        }
+
+        base = igb_ring_head_descr(core, rxi);
+
+        pci_dma_read(d, base, &desc, core->rx_desc_len);
+
+        trace_e1000e_rx_descr(rxi->idx, base, core->rx_desc_len);
+
+        igb_read_rx_descr(core, &desc, &ba);
+
+        if (ba) {
+            if (desc_offset < size) {
+                static const uint32_t fcs_pad;
+                size_t iov_copy;
+                size_t copy_size = size - desc_offset;
+                if (copy_size > bufsize) {
+                    copy_size = bufsize;
+                }
+
+                /* Copy packet payload */
+                while (copy_size) {
+                    iov_copy = MIN(copy_size, iov->iov_len - iov_ofs);
+
+                    igb_write_to_rx_buffers(core, d, ba, &written,
+                                            iov->iov_base + iov_ofs, iov_copy);
+
+                    copy_size -= iov_copy;
+                    iov_ofs += iov_copy;
+                    if (iov_ofs == iov->iov_len) {
+                        iov++;
+                        iov_ofs = 0;
+                    }
+                }
+
+                if (desc_offset + desc_size >= total_size) {
+                    /* Simulate FCS checksum presence in the last descriptor */
+                    igb_write_to_rx_buffers(core, d, ba, &written,
+                          (const char *) &fcs_pad, e1000x_fcs_len(core->mac));
+                }
+            }
+        } else { /* as per Intel docs, skip descriptors with a null buffer address */
+            trace_e1000e_rx_null_descriptor();
+        }
+        desc_offset += desc_size;
+        if (desc_offset >= total_size) {
+            is_last = true;
+        }
+
+        igb_write_rx_descr(core, &desc, is_last ? core->rx_pkt : NULL,
+                           rss_info, written);
+        igb_pci_dma_write_rx_desc(core, d, base, &desc, core->rx_desc_len);
+
+        igb_ring_advance(core, rxi, core->rx_desc_len / E1000_MIN_RX_DESC_LEN);
+
+    } while (desc_offset < total_size);
+
+    igb_update_rx_stats(core, size, total_size);
+}
+
+static inline void
+igb_rx_fix_l4_csum(IGBCore *core, struct NetRxPkt *pkt)
+{
+    struct virtio_net_hdr *vhdr = net_rx_pkt_get_vhdr(pkt);
+
+    if (vhdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
+        net_rx_pkt_fix_l4_csum(pkt);
+    }
+}
+
+ssize_t
+igb_receive_iov(IGBCore *core, const struct iovec *iov, int iovcnt)
+{
+    return igb_receive_internal(core, iov, iovcnt, core->has_vnet, NULL);
+}
+
+static ssize_t
+igb_receive_internal(IGBCore *core, const struct iovec *iov, int iovcnt,
+                     bool has_vnet, bool *external_tx)
+{
+    static const int maximum_ethernet_hdr_len = (ETH_HLEN + 4);
+
+    uint16_t queues = 0;
+    uint32_t n = 0;
+    uint8_t min_buf[ETH_ZLEN];
+    struct iovec min_iov;
+    struct eth_header *ehdr;
+    uint8_t *filter_buf;
+    size_t size, orig_size;
+    size_t iov_ofs = 0;
+    E1000E_RxRing rxr;
+    E1000E_RSSInfo rss_info;
+    size_t total_size;
+    int i;
+
+    trace_e1000e_rx_receive_iov(iovcnt);
+
+    if (external_tx) {
+        *external_tx = true;
+    }
+
+    if (!e1000x_hw_rx_enabled(core->mac)) {
+        return -1;
+    }
+
+    /* Pull virtio header in */
+    if (has_vnet) {
+        net_rx_pkt_set_vhdr_iovec(core->rx_pkt, iov, iovcnt);
+        iov_ofs = sizeof(struct virtio_net_hdr);
+    } else {
+        net_rx_pkt_unset_vhdr(core->rx_pkt);
+    }
+
+    filter_buf = iov->iov_base + iov_ofs;
+    orig_size = iov_size(iov, iovcnt);
+    size = orig_size - iov_ofs;
+
+    /* Pad to minimum Ethernet frame length */
+    if (size < sizeof(min_buf)) {
+        iov_to_buf(iov, iovcnt, iov_ofs, min_buf, size);
+        memset(&min_buf[size], 0, sizeof(min_buf) - size);
+        e1000x_inc_reg_if_not_full(core->mac, RUC);
+        min_iov.iov_base = filter_buf = min_buf;
+        min_iov.iov_len = size = sizeof(min_buf);
+        iovcnt = 1;
+        iov = &min_iov;
+        iov_ofs = 0;
+    } else if (iov->iov_len < maximum_ethernet_hdr_len) {
+        /* This is very unlikely, but may happen. */
+        iov_to_buf(iov, iovcnt, iov_ofs, min_buf, maximum_ethernet_hdr_len);
+        filter_buf = min_buf;
+    }
+
+    /* Discard oversized packets if !LPE and !SBP. */
+    if (e1000x_is_oversized(core->mac, size)) {
+        return orig_size;
+    }
+
+    ehdr = PKT_GET_ETH_HDR(filter_buf);
+    net_rx_pkt_set_packet_type(core->rx_pkt, get_eth_packet_type(ehdr));
+
+    net_rx_pkt_attach_iovec_ex(core->rx_pkt, iov, iovcnt, iov_ofs,
+                               e1000x_vlan_enabled(core->mac),
+                               core->mac[VET] & 0xffff);
+
+    queues = igb_receive_assign(core, ehdr, &rss_info, external_tx);
+    if (!queues) {
+        trace_e1000e_rx_flt_dropped();
+        return orig_size;
+    }
+
+    total_size = net_rx_pkt_get_total_len(core->rx_pkt) +
+        e1000x_fcs_len(core->mac);
+
+    for (i = 0; i < IGB_NUM_QUEUES; i++) {
+        if (!(queues & BIT(i))) {
+            continue;
+        }
+
+        igb_rx_ring_init(core, &rxr, i);
+
+        if (!igb_has_rxbufs(core, rxr.i, total_size)) {
+            n |= E1000_ICS_RXO;
+            trace_e1000e_rx_not_written_to_guest(rxr.i->idx);
+            continue;
+        }
+
+        n |= E1000_ICR_RXT0;
+
+        igb_rx_fix_l4_csum(core, core->rx_pkt);
+        igb_write_packet_to_guest(core, core->rx_pkt, &rxr, &rss_info);
+
+        /* Check if receive descriptor minimum threshold hit */
+        if (igb_rx_descr_threshold_hit(core, rxr.i)) {
+            n |= E1000_ICS_RXDMT0;
+        }
+
+        core->mac[EICR] |= igb_rx_wb_eic(core, rxr.i->idx);
+
+        trace_e1000e_rx_written_to_guest(rxr.i->idx);
+    }
+
+    trace_e1000e_rx_interrupt_set(n);
+    igb_set_interrupt_cause(core, n);
+
+    return orig_size;
+}
+
+static inline bool
+igb_have_autoneg(IGBCore *core)
+{
+    return core->phy[MII_BMCR] & MII_BMCR_AUTOEN;
+}
+
+static void igb_update_flowctl_status(IGBCore *core)
+{
+    if (igb_have_autoneg(core) && core->phy[MII_BMSR] & MII_BMSR_AN_COMP) {
+        trace_e1000e_link_autoneg_flowctl(true);
+        core->mac[CTRL] |= E1000_CTRL_TFCE | E1000_CTRL_RFCE;
+    } else {
+        trace_e1000e_link_autoneg_flowctl(false);
+    }
+}
+
+static inline void
+igb_link_down(IGBCore *core)
+{
+    e1000x_update_regs_on_link_down(core->mac, core->phy);
+    igb_update_flowctl_status(core);
+}
+
+static inline void
+igb_set_phy_ctrl(IGBCore *core, uint16_t val)
+{
+    /* bits 0-5 reserved; MII_BMCR_[ANRESTART,RESET] are self-clearing */
+    core->phy[MII_BMCR] = val & ~(0x3f | MII_BMCR_RESET | MII_BMCR_ANRESTART);
+
+    if ((val & MII_BMCR_ANRESTART) && igb_have_autoneg(core)) {
+        e1000x_restart_autoneg(core->mac, core->phy, core->autoneg_timer);
+    }
+}
+
+void igb_core_set_link_status(IGBCore *core)
+{
+    NetClientState *nc = qemu_get_queue(core->owner_nic);
+    uint32_t old_status = core->mac[STATUS];
+
+    trace_e1000e_link_status_changed(nc->link_down ? false : true);
+
+    if (nc->link_down) {
+        e1000x_update_regs_on_link_down(core->mac, core->phy);
+    } else {
+        if (igb_have_autoneg(core) &&
+            !(core->phy[MII_BMSR] & MII_BMSR_AN_COMP)) {
+            e1000x_restart_autoneg(core->mac, core->phy,
+                                   core->autoneg_timer);
+        } else {
+            e1000x_update_regs_on_link_up(core->mac, core->phy);
+            igb_start_recv(core);
+        }
+    }
+
+    if (core->mac[STATUS] != old_status) {
+        igb_set_interrupt_cause(core, E1000_ICR_LSC);
+    }
+}
+
+static void
+igb_set_ctrl(IGBCore *core, int index, uint32_t val)
+{
+    trace_e1000e_core_ctrl_write(index, val);
+
+    /* RST is self-clearing */
+    core->mac[CTRL] = val & ~E1000_CTRL_RST;
+    core->mac[CTRL_DUP] = core->mac[CTRL];
+
+    trace_e1000e_link_set_params(
+        !!(val & E1000_CTRL_ASDE),
+        (val & E1000_CTRL_SPD_SEL) >> E1000_CTRL_SPD_SHIFT,
+        !!(val & E1000_CTRL_FRCSPD),
+        !!(val & E1000_CTRL_FRCDPX),
+        !!(val & E1000_CTRL_RFCE),
+        !!(val & E1000_CTRL_TFCE));
+
+    if (val & E1000_CTRL_RST) {
+        trace_e1000e_core_ctrl_sw_reset();
+        igb_reset(core, true);
+    }
+
+    if (val & E1000_CTRL_PHY_RST) {
+        trace_e1000e_core_ctrl_phy_reset();
+        core->mac[STATUS] |= E1000_STATUS_PHYRA;
+    }
+}
+
+static void
+igb_set_rfctl(IGBCore *core, int index, uint32_t val)
+{
+    trace_e1000e_rx_set_rfctl(val);
+
+    if (!(val & E1000_RFCTL_ISCSI_DIS)) {
+        trace_e1000e_wrn_iscsi_filtering_not_supported();
+    }
+
+    if (!(val & E1000_RFCTL_NFSW_DIS)) {
+        trace_e1000e_wrn_nfsw_filtering_not_supported();
+    }
+
+    if (!(val & E1000_RFCTL_NFSR_DIS)) {
+        trace_e1000e_wrn_nfsr_filtering_not_supported();
+    }
+
+    core->mac[RFCTL] = val;
+}
+
+static void
+igb_calc_rxdesclen(IGBCore *core)
+{
+    if (igb_rx_use_legacy_descriptor(core)) {
+        core->rx_desc_len = sizeof(struct e1000_rx_desc);
+    } else {
+        core->rx_desc_len = sizeof(union e1000_adv_rx_desc);
+    }
+    trace_e1000e_rx_desc_len(core->rx_desc_len);
+}
+
+static void
+igb_set_rx_control(IGBCore *core, int index, uint32_t val)
+{
+    core->mac[RCTL] = val;
+    trace_e1000e_rx_set_rctl(core->mac[RCTL]);
+
+    if (val & E1000_RCTL_DTYP_MASK) {
+        qemu_log_mask(LOG_GUEST_ERROR,
+                      "igb: RCTL.DTYP must be zero for compatibility");
+    }
+
+    if (val & E1000_RCTL_EN) {
+        igb_calc_rxdesclen(core);
+        igb_start_recv(core);
+    }
+}
+
+static inline void
+igb_clear_ims_bits(IGBCore *core, uint32_t bits)
+{
+    trace_e1000e_irq_clear_ims(bits, core->mac[IMS], core->mac[IMS] & ~bits);
+    core->mac[IMS] &= ~bits;
+}
+
+static inline bool
+igb_postpone_interrupt(IGBIntrDelayTimer *timer)
+{
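+    /* Defer delivery while the moderation timer runs; otherwise arm it if an interval is programmed. */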
+    if (timer->running) {
+        trace_e1000e_irq_postponed_by_xitr(timer->delay_reg << 2);
+
+        return true;
+    }
+
+    if (timer->core->mac[timer->delay_reg] != 0) {
+        igb_intrmgr_rearm_timer(timer);
+    }
+
+    return false;
+}
+
+static inline bool
+igb_eitr_should_postpone(IGBCore *core, int idx)
+{
+    return igb_postpone_interrupt(&core->eitr[idx]);
+}
+
+static void igb_send_msix(IGBCore *core)
+{
+    uint32_t causes = core->mac[EICR] & core->mac[EIMS];
+    uint32_t effective_eiac;
+    int vector;
+
+    for (vector = 0; vector < IGB_INTR_NUM; ++vector) {
+        if ((causes & BIT(vector)) && !igb_eitr_should_postpone(core, vector)) {
+
+            trace_e1000e_irq_msix_notify_vec(vector);
+            igb_msix_notify(core, vector);
+
+            trace_e1000e_irq_icr_clear_eiac(core->mac[EICR], core->mac[EIAC]);
+            effective_eiac = core->mac[EIAC] & BIT(vector);
+            core->mac[EICR] &= ~effective_eiac;
+        }
+    }
+}
+
+static inline void
+igb_fix_icr_asserted(IGBCore *core)
+{
+    core->mac[ICR] &= ~E1000_ICR_ASSERTED;
+    if (core->mac[ICR]) {
+        core->mac[ICR] |= E1000_ICR_ASSERTED;
+    }
+
+    trace_e1000e_irq_fix_icr_asserted(core->mac[ICR]);
+}
+
+static void
+igb_update_interrupt_state(IGBCore *core)
+{
+    uint32_t icr;
+    uint32_t causes;
+    uint32_t int_alloc;
+
+    icr = core->mac[ICR] & core->mac[IMS];
+
+    if (msix_enabled(core->owner)) {
+        if (icr) {
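+            /*
+             * Route causes via IVAR_MISC: byte 0 maps DRSTA, byte 1 maps
+             * every other cause onto an EICR bit.
+             */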
+            causes = 0;
+            if (icr & E1000_ICR_DRSTA) {
+                int_alloc = core->mac[IVAR_MISC] & 0xff;
+                if (int_alloc & E1000_IVAR_VALID) {
+                    causes |= BIT(int_alloc & 0x1f);
+                }
+            }
+            /* Check whether causes other than DRSTA are pending. */
+            if (icr & ~E1000_ICR_DRSTA) {
+                int_alloc = (core->mac[IVAR_MISC] >> 8) & 0xff;
+                if (int_alloc & E1000_IVAR_VALID) {
+                    causes |= BIT(int_alloc & 0x1f);
+                }
+                trace_e1000e_irq_add_msi_other(core->mac[EICR]);
+            }
+            core->mac[EICR] |= causes;
+        }
+
+        if ((core->mac[EICR] & core->mac[EIMS])) {
+            igb_send_msix(core);
+        }
+    } else {
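+        /* Legacy/MSI mode: mirror pending causes into EICR as OTHER (and DRSTA). */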
+        igb_fix_icr_asserted(core);
+
+        if (icr) {
+            core->mac[EICR] |= (icr & E1000_ICR_DRSTA) | E1000_EICR_OTHER;
+        } else {
+            core->mac[EICR] &= ~E1000_EICR_OTHER;
+        }
+
+        trace_e1000e_irq_pending_interrupts(core->mac[ICR] & core->mac[IMS],
+                                            core->mac[ICR], core->mac[IMS]);
+
+        if (msi_enabled(core->owner)) {
+            if (icr) {
+                msi_notify(core->owner, 0);
+            }
+        } else {
+            if (icr) {
+                igb_raise_legacy_irq(core);
+            } else {
+                igb_lower_legacy_irq(core);
+            }
+        }
+    }
+}
+
+static void
+igb_set_interrupt_cause(IGBCore *core, uint32_t val)
+{
+    trace_e1000e_irq_set_cause_entry(val, core->mac[ICR]);
+
+    core->mac[ICR] |= val;
+
+    trace_e1000e_irq_set_cause_exit(val, core->mac[ICR]);
+
+    igb_update_interrupt_state(core);
+}
+
+static void igb_set_eics(IGBCore *core, int index, uint32_t val)
+{
+    bool msix = !!(core->mac[GPIE] & E1000_GPIE_MSIX_MODE);
+
+    trace_igb_irq_write_eics(val, msix);
+
+    core->mac[EICS] |=
+        val & (msix ? E1000_EICR_MSIX_MASK : E1000_EICR_LEGACY_MASK);
+
+    /*
+     * TODO: Move to igb_update_interrupt_state if EICS is modified in other
+     * places.
+     */
+    core->mac[EICR] = core->mac[EICS];
+
+    igb_update_interrupt_state(core);
+}
+
+static void igb_set_eims(IGBCore *core, int index, uint32_t val)
+{
+    bool msix = !!(core->mac[GPIE] & E1000_GPIE_MSIX_MODE);
+
+    trace_igb_irq_write_eims(val, msix);
+
+    core->mac[EIMS] |=
+        val & (msix ? E1000_EICR_MSIX_MASK : E1000_EICR_LEGACY_MASK);
+
+    igb_update_interrupt_state(core);
+}
+
+static void igb_vf_reset(IGBCore *core, uint16_t vfn)
+{
+    /* TODO: Reset the VF's queue-enable bits and interrupt registers. */
+
+    core->mac[V2PMAILBOX0 + vfn] &= ~E1000_V2PMAILBOX_RSTI;
+    core->mac[V2PMAILBOX0 + vfn] = E1000_V2PMAILBOX_RSTD;
+}
+
+static void mailbox_interrupt_to_vf(IGBCore *core, uint16_t vfn)
+{
+    uint32_t ent = core->mac[VTIVAR_MISC + vfn];
+
+    if ((ent & E1000_IVAR_VALID)) {
+        core->mac[EICR] |= (ent & 0x3) << (22 - vfn * IGBVF_MSIX_VEC_NUM);
+        igb_update_interrupt_state(core);
+    }
+}
+
+static void mailbox_interrupt_to_pf(IGBCore *core)
+{
+    igb_set_interrupt_cause(core, E1000_ICR_VMMB);
+}
+
+static void igb_set_pfmailbox(IGBCore *core, int index, uint32_t val)
+{
+    uint16_t vfn = index - P2VMAILBOX0;
+
+    trace_igb_set_pfmailbox(vfn, val);
+
+    if (val & E1000_P2VMAILBOX_STS) {
+        core->mac[V2PMAILBOX0 + vfn] |= E1000_V2PMAILBOX_PFSTS;
+        mailbox_interrupt_to_vf(core, vfn);
+    }
+
+    if (val & E1000_P2VMAILBOX_ACK) {
+        core->mac[V2PMAILBOX0 + vfn] |= E1000_V2PMAILBOX_PFACK;
+        mailbox_interrupt_to_vf(core, vfn);
+    }
+
+    /* Buffer Taken by PF (can be set only if the VFU is cleared). */
+    if (val & E1000_P2VMAILBOX_PFU) {
+        if (!(core->mac[index] & E1000_P2VMAILBOX_VFU)) {
+            core->mac[index] |= E1000_P2VMAILBOX_PFU;
+            core->mac[V2PMAILBOX0 + vfn] |= E1000_V2PMAILBOX_PFU;
+        }
+    } else {
+        core->mac[index] &= ~E1000_P2VMAILBOX_PFU;
+        core->mac[V2PMAILBOX0 + vfn] &= ~E1000_V2PMAILBOX_PFU;
+    }
+
+    if (val & E1000_P2VMAILBOX_RVFU) {
+        core->mac[V2PMAILBOX0 + vfn] &= ~E1000_V2PMAILBOX_VFU;
+        core->mac[MBVFICR] &= ~((E1000_MBVFICR_VFACK_VF1 << vfn) |
+                                (E1000_MBVFICR_VFREQ_VF1 << vfn));
+    }
+}
+
+static void igb_set_vfmailbox(IGBCore *core, int index, uint32_t val)
+{
+    uint16_t vfn = index - V2PMAILBOX0;
+
+    trace_igb_set_vfmailbox(vfn, val);
+
+    if (val & E1000_V2PMAILBOX_REQ) {
+        core->mac[MBVFICR] |= E1000_MBVFICR_VFREQ_VF1 << vfn;
+        mailbox_interrupt_to_pf(core);
+    }
+
+    if (val & E1000_V2PMAILBOX_ACK) {
+        core->mac[MBVFICR] |= E1000_MBVFICR_VFACK_VF1 << vfn;
+        mailbox_interrupt_to_pf(core);
+    }
+
+    /* Buffer Taken by VF (can be set only if the PFU is cleared). */
+    if (val & E1000_V2PMAILBOX_VFU) {
+        if (!(core->mac[index] & E1000_V2PMAILBOX_PFU)) {
+            core->mac[index] |= E1000_V2PMAILBOX_VFU;
+            core->mac[P2VMAILBOX0 + vfn] |= E1000_P2VMAILBOX_VFU;
+        }
+    } else {
+        core->mac[index] &= ~E1000_V2PMAILBOX_VFU;
+        core->mac[P2VMAILBOX0 + vfn] &= ~E1000_P2VMAILBOX_VFU;
+    }
+}
+
+static void igb_w1c(IGBCore *core, int index, uint32_t val)
+{
+    core->mac[index] &= ~val;
+}
+
+static void igb_set_eimc(IGBCore *core, int index, uint32_t val)
+{
+    bool msix = !!(core->mac[GPIE] & E1000_GPIE_MSIX_MODE);
+
+    /* Interrupts are disabled via a write to EIMC and reflected in EIMS. */
+    core->mac[EIMS] &=
+        ~(val & (msix ? E1000_EICR_MSIX_MASK : E1000_EICR_LEGACY_MASK));
+
+    trace_igb_irq_write_eimc(val, core->mac[EIMS], msix);
+    igb_update_interrupt_state(core);
+}
+
+static void igb_set_eiac(IGBCore *core, int index, uint32_t val)
+{
+    bool msix = !!(core->mac[GPIE] & E1000_GPIE_MSIX_MODE);
+
+    if (msix) {
+        trace_igb_irq_write_eiac(val);
+
+        /*
+         * TODO: When using IOV, the bits that correspond to MSI-X vectors
+         * that are assigned to a VF are read-only.
+         */
+        core->mac[EIAC] |= (val & E1000_EICR_MSIX_MASK);
+    }
+}
+
+static void igb_set_eiam(IGBCore *core, int index, uint32_t val)
+{
+    bool msix = !!(core->mac[GPIE] & E1000_GPIE_MSIX_MODE);
+
+    /*
+     * TODO: When using IOV, the bits that correspond to MSI-X vectors that
+     * are assigned to a VF are read-only.
+     */
+    core->mac[EIAM] |=
+        ~(val & (msix ? E1000_EICR_MSIX_MASK : E1000_EICR_LEGACY_MASK));
+
+    trace_igb_irq_write_eiam(val, msix);
+}
+
+static void igb_set_eicr(IGBCore *core, int index, uint32_t val)
+{
+    bool msix = !!(core->mac[GPIE] & E1000_GPIE_MSIX_MODE);
+
+    /*
+     * TODO: In IOV mode, only bit zero of this vector is available for the PF
+     * function.
+     */
+    core->mac[EICR] &=
+        ~(val & (msix ? E1000_EICR_MSIX_MASK : E1000_EICR_LEGACY_MASK));
+
+    trace_igb_irq_write_eicr(val, msix);
+    igb_update_interrupt_state(core);
+}
+
+static void igb_set_vtctrl(IGBCore *core, int index, uint32_t val)
+{
+    uint16_t vfn;
+
+    if (val & E1000_CTRL_RST) {
+        vfn = (index - PVTCTRL0) / 0x40;
+        igb_vf_reset(core, vfn);
+    }
+}
+
+static void igb_set_vteics(IGBCore *core, int index, uint32_t val)
+{
+    uint16_t vfn = (index - PVTEICS0) / 0x40;
+
+    core->mac[index] = val;
+    igb_set_eics(core, EICS, (val & 0x7) << (22 - vfn * IGBVF_MSIX_VEC_NUM));
+}
+
+static void igb_set_vteims(IGBCore *core, int index, uint32_t val)
+{
+    uint16_t vfn = (index - PVTEIMS0) / 0x40;
+
+    core->mac[index] = val;
+    igb_set_eims(core, EIMS, (val & 0x7) << (22 - vfn * IGBVF_MSIX_VEC_NUM));
+}
+
+static void igb_set_vteimc(IGBCore *core, int index, uint32_t val)
+{
+    uint16_t vfn = (index - PVTEIMC0) / 0x40;
+
+    core->mac[index] = val;
+    igb_set_eimc(core, EIMC, (val & 0x7) << (22 - vfn * IGBVF_MSIX_VEC_NUM));
+}
+
+static void igb_set_vteiac(IGBCore *core, int index, uint32_t val)
+{
+    uint16_t vfn = (index - PVTEIAC0) / 0x40;
+
+    core->mac[index] = val;
+    igb_set_eiac(core, EIAC, (val & 0x7) << (22 - vfn * IGBVF_MSIX_VEC_NUM));
+}
+
+static void igb_set_vteiam(IGBCore *core, int index, uint32_t val)
+{
+    uint16_t vfn = (index - PVTEIAM0) / 0x40;
+
+    core->mac[index] = val;
+    igb_set_eiam(core, EIAM, (val & 0x7) << (22 - vfn * IGBVF_MSIX_VEC_NUM));
+}
+
+static void igb_set_vteicr(IGBCore *core, int index, uint32_t val)
+{
+    uint16_t vfn = (index - PVTEICR0) / 0x40;
+
+    core->mac[index] = val;
+    igb_set_eicr(core, EICR, (val & 0x7) << (22 - vfn * IGBVF_MSIX_VEC_NUM));
+}
+
+static void igb_set_vtivar(IGBCore *core, int index, uint32_t val)
+{
+    uint16_t vfn = (index - VTIVAR);
+    uint16_t qn = vfn;
+    uint8_t ent;
+    int n;
+
+    core->mac[index] = val;
+
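+    /*
+     * Each VF's vectors sit at EICR bits (22 - vfn * IGBVF_MSIX_VEC_NUM)
+     * upward; the arithmetic below rebases the VF-relative vector there.
+     */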
+    /* Get assigned vector associated with queue Rx#0. */
+    if ((val & E1000_IVAR_VALID)) {
+        n = igb_ivar_entry_rx(qn);
+        ent = E1000_IVAR_VALID | (24 - vfn * IGBVF_MSIX_VEC_NUM - (2 - (val & 0x7)));
+        core->mac[IVAR0 + n / 4] |= ent << 8 * (n % 4);
+    }
+
+    /* Get assigned vector associated with queue Tx#0 */
+    ent = val >> 8;
+    if ((ent & E1000_IVAR_VALID)) {
+        n = igb_ivar_entry_tx(qn);
+        ent = E1000_IVAR_VALID | (24 - vfn * IGBVF_MSIX_VEC_NUM - (2 - (ent & 0x7)));
+        core->mac[IVAR0 + n / 4] |= ent << 8 * (n % 4);
+    }
+
+    /*
+     * Ignoring assigned vectors associated with queues Rx#1 and Tx#1 for now.
+     */
+}
+
+static inline void
+igb_autoneg_timer(void *opaque)
+{
+    IGBCore *core = opaque;
+    if (!qemu_get_queue(core->owner_nic)->link_down) {
+        e1000x_update_regs_on_autoneg_done(core->mac, core->phy);
+        igb_start_recv(core);
+
+        igb_update_flowctl_status(core);
+        /* signal link status change to the guest */
+        igb_set_interrupt_cause(core, E1000_ICR_LSC);
+    }
+}
+
+static inline uint16_t
+igb_get_reg_index_with_offset(const uint16_t *mac_reg_access, hwaddr addr)
+{
+    uint16_t index = (addr & 0x1ffff) >> 2;
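+    /* mac_reg_access[] keeps a flag in bit 0; the rest is an alias offset. */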
+    return index + (mac_reg_access[index] & 0xfffe);
+}
+
+static const char igb_phy_regcap[MAX_PHY_REG_ADDRESS + 1] = {
+    [MII_BMCR]                   = PHY_RW,
+    [MII_BMSR]                   = PHY_R,
+    [MII_PHYID1]                 = PHY_R,
+    [MII_PHYID2]                 = PHY_R,
+    [MII_ANAR]                   = PHY_RW,
+    [MII_ANLPAR]                 = PHY_R,
+    [MII_ANER]                   = PHY_R,
+    [MII_ANNP]                   = PHY_RW,
+    [MII_ANLPRNP]                = PHY_R,
+    [MII_CTRL1000]               = PHY_RW,
+    [MII_STAT1000]               = PHY_R,
+    [MII_EXTSTAT]                = PHY_R,
+
+    [IGP01E1000_PHY_PORT_CONFIG] = PHY_RW,
+    [IGP01E1000_PHY_PORT_STATUS] = PHY_R,
+    [IGP01E1000_PHY_PORT_CTRL]   = PHY_RW,
+    [IGP01E1000_PHY_LINK_HEALTH] = PHY_R,
+    [IGP02E1000_PHY_POWER_MGMT]  = PHY_RW,
+    [IGP01E1000_PHY_PAGE_SELECT] = PHY_W
+};
+
+static void
+igb_phy_reg_write(IGBCore *core, uint32_t addr, uint16_t data)
+{
+    assert(addr <= MAX_PHY_REG_ADDRESS);
+
+    if (addr == MII_BMCR) {
+        igb_set_phy_ctrl(core, data);
+    } else {
+        core->phy[addr] = data;
+    }
+}
+
+static void
+igb_set_mdic(IGBCore *core, int index, uint32_t val)
+{
+    uint32_t data = val & E1000_MDIC_DATA_MASK;
+    uint32_t addr = ((val & E1000_MDIC_REG_MASK) >> E1000_MDIC_REG_SHIFT);
+
+    if ((val & E1000_MDIC_PHY_MASK) >> E1000_MDIC_PHY_SHIFT != 1) { /* PHY # must be 1 */
+        val = core->mac[MDIC] | E1000_MDIC_ERROR;
+    } else if (val & E1000_MDIC_OP_READ) {
+        if (!(igb_phy_regcap[addr] & PHY_R)) {
+            trace_igb_core_mdic_read_unhandled(addr);
+            val |= E1000_MDIC_ERROR;
+        } else {
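+            /* Clear the data field and substitute the PHY register value. */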
+            val = (val ^ data) | core->phy[addr];
+            trace_igb_core_mdic_read(addr, val);
+        }
+    } else if (val & E1000_MDIC_OP_WRITE) {
+        if (!(igb_phy_regcap[addr] & PHY_W)) {
+            trace_igb_core_mdic_write_unhandled(addr);
+            val |= E1000_MDIC_ERROR;
+        } else {
+            trace_igb_core_mdic_write(addr, data);
+            igb_phy_reg_write(core, addr, data);
+        }
+    }
+    core->mac[MDIC] = val | E1000_MDIC_READY;
+
+    if (val & E1000_MDIC_INT_EN) {
+        igb_set_interrupt_cause(core, E1000_ICR_MDAC);
+    }
+}
+
+static void
+igb_set_rdt(IGBCore *core, int index, uint32_t val)
+{
+    core->mac[index] = val & 0xffff;
+    trace_e1000e_rx_set_rdt(igb_mq_queue_idx(RDT0, index), val);
+    igb_start_recv(core);
+}
+
+static void
+igb_set_status(IGBCore *core, int index, uint32_t val)
+{
+    if ((val & E1000_STATUS_PHYRA) == 0) {
+        core->mac[index] &= ~E1000_STATUS_PHYRA;
+    }
+}
+
+static void
+igb_set_ctrlext(IGBCore *core, int index, uint32_t val)
+{
+    trace_e1000e_link_set_ext_params(!!(val & E1000_CTRL_EXT_ASDCHK),
+                                     !!(val & E1000_CTRL_EXT_SPD_BYPS));
+
+    /* TODO: PFRSTD */
+
+    /* Zero self-clearing bits */
+    val &= ~(E1000_CTRL_EXT_ASDCHK | E1000_CTRL_EXT_EE_RST);
+    core->mac[CTRL_EXT] = val;
+}
+
+static void
+igb_set_pbaclr(IGBCore *core, int index, uint32_t val)
+{
+    int i;
+
+    core->mac[PBACLR] = val & E1000_PBACLR_VALID_MASK;
+
+    if (!msix_enabled(core->owner)) {
+        return;
+    }
+
+    for (i = 0; i < IGB_INTR_NUM; i++) {
+        if (core->mac[PBACLR] & BIT(i)) {
+            msix_clr_pending(core->owner, i);
+        }
+    }
+}
+
+static void
+igb_set_fcrth(IGBCore *core, int index, uint32_t val)
+{
+    core->mac[FCRTH] = val & 0xFFF8;
+}
+
+static void
+igb_set_fcrtl(IGBCore *core, int index, uint32_t val)
+{
+    core->mac[FCRTL] = val & 0x8000FFF8;
+}
+
+#define IGB_LOW_BITS_SET_FUNC(num)                             \
+    static void                                                \
+    igb_set_##num##bit(IGBCore *core, int index, uint32_t val) \
+    {                                                          \
+        core->mac[index] = val & (BIT(num) - 1);               \
+    }
+
+IGB_LOW_BITS_SET_FUNC(4)
+IGB_LOW_BITS_SET_FUNC(13)
+IGB_LOW_BITS_SET_FUNC(16)
+
+static void
+igb_set_dlen(IGBCore *core, int index, uint32_t val)
+{
+    core->mac[index] = val & 0xffff0;
+}
+
+static void
+igb_set_dbal(IGBCore *core, int index, uint32_t val)
+{
+    core->mac[index] = val & E1000_XDBAL_MASK;
+}
+
+static void
+igb_set_tdt(IGBCore *core, int index, uint32_t val)
+{
+    IGB_TxRing txr;
+    int qn = igb_mq_queue_idx(TDT0, index);
+
+    core->mac[index] = val & 0xffff;
+
+    igb_tx_ring_init(core, &txr, qn);
+    igb_start_xmit(core, &txr);
+}
+
+static void
+igb_set_ics(IGBCore *core, int index, uint32_t val)
+{
+    trace_e1000e_irq_write_ics(val);
+    igb_set_interrupt_cause(core, val);
+}
+
+static void
+igb_set_imc(IGBCore *core, int index, uint32_t val)
+{
+    trace_e1000e_irq_ims_clear_set_imc(val);
+    igb_clear_ims_bits(core, val);
+    igb_update_interrupt_state(core);
+}
+
+static void
+igb_set_ims(IGBCore *core, int index, uint32_t val)
+{
+    uint32_t valid_val = val & 0x77D4FBFD;
+
+    trace_e1000e_irq_set_ims(val, core->mac[IMS], core->mac[IMS] | valid_val);
+    core->mac[IMS] |= valid_val;
+    igb_update_interrupt_state(core);
+}
+
+static void igb_commit_icr(IGBCore *core)
+{
+    /*
+     * If GPIE.NSICR = 0, then the copy of IAM to IMS will occur only if at
+     * least one bit is set in the IMS and there is a true interrupt as
+     * reflected in ICR.INTA.
+     */
+    if ((core->mac[GPIE] & E1000_GPIE_NSICR) ||
+        (core->mac[IMS] && (core->mac[ICR] & E1000_ICR_INT_ASSERTED))) {
+        igb_set_ims(core, IMS, core->mac[IAM]);
+    } else {
+        igb_update_interrupt_state(core);
+    }
+}
+
+static void igb_set_icr(IGBCore *core, int index, uint32_t val)
+{
+    uint32_t icr = core->mac[ICR] & ~val;
+
+    trace_igb_irq_icr_write(val, core->mac[ICR], icr);
+    core->mac[ICR] = icr;
+    igb_commit_icr(core);
+}
+
+static uint32_t
+igb_mac_readreg(IGBCore *core, int index)
+{
+    return core->mac[index];
+}
+
+static uint32_t
+igb_mac_ics_read(IGBCore *core, int index)
+{
+    trace_e1000e_irq_read_ics(core->mac[ICS]);
+    return core->mac[ICS];
+}
+
+static uint32_t
+igb_mac_ims_read(IGBCore *core, int index)
+{
+    trace_e1000e_irq_read_ims(core->mac[IMS]);
+    return core->mac[IMS];
+}
+
+static uint32_t
+igb_mac_swsm_read(IGBCore *core, int index)
+{
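+    /* Reading SWSM takes the semaphore: SMBI reads back 0 once, then 1. */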
+    uint32_t val = core->mac[SWSM];
+    core->mac[SWSM] = val | E1000_SWSM_SMBI;
+    return val;
+}
+
+static uint32_t
+igb_mac_eitr_read(IGBCore *core, int index)
+{
+    return core->eitr_guest_value[index - EITR0];
+}
+
+static uint32_t igb_mac_vfmailbox_read(IGBCore *core, int index)
+{
+    uint32_t val = core->mac[index];
+
+    core->mac[index] &= ~(E1000_V2PMAILBOX_PFSTS | E1000_V2PMAILBOX_PFACK |
+                          E1000_V2PMAILBOX_RSTD);
+
+    return val;
+}
+
+static uint32_t
+igb_mac_icr_read(IGBCore *core, int index)
+{
+    uint32_t ret = core->mac[ICR];
+    trace_e1000e_irq_icr_read_entry(ret);
+
+    if (core->mac[GPIE] & E1000_GPIE_NSICR) {
+        trace_igb_irq_icr_clear_gpie_nsicr();
+        core->mac[ICR] = 0;
+    } else if (core->mac[IMS] == 0) {
+        trace_e1000e_irq_icr_clear_zero_ims();
+        core->mac[ICR] = 0;
+    } else if (!msix_enabled(core->owner)) {
+        trace_e1000e_irq_icr_clear_nonmsix_icr_read();
+        core->mac[ICR] = 0;
+    }
+
+    trace_e1000e_irq_icr_read_exit(core->mac[ICR]);
+    igb_commit_icr(core);
+    return ret;
+}
+
+static uint32_t
+igb_mac_read_clr4(IGBCore *core, int index)
+{
+    uint32_t ret = core->mac[index];
+
+    core->mac[index] = 0;
+    return ret;
+}
+
+static uint32_t
+igb_mac_read_clr8(IGBCore *core, int index)
+{
+    uint32_t ret = core->mac[index];
+
+    core->mac[index] = 0;
+    core->mac[index - 1] = 0;
+    return ret;
+}
+
+static uint32_t
+igb_get_ctrl(IGBCore *core, int index)
+{
+    uint32_t val = core->mac[CTRL];
+
+    trace_e1000e_link_read_params(
+        !!(val & E1000_CTRL_ASDE),
+        (val & E1000_CTRL_SPD_SEL) >> E1000_CTRL_SPD_SHIFT,
+        !!(val & E1000_CTRL_FRCSPD),
+        !!(val & E1000_CTRL_FRCDPX),
+        !!(val & E1000_CTRL_RFCE),
+        !!(val & E1000_CTRL_TFCE));
+
+    return val;
+}
+
+static uint32_t igb_get_status(IGBCore *core, int index)
+{
+    uint32_t res = core->mac[STATUS];
+    uint16_t num_vfs = pcie_sriov_num_vfs(core->owner);
+
+    if (core->mac[CTRL] & E1000_CTRL_FRCDPX) {
+        res |= (core->mac[CTRL] & E1000_CTRL_FD) ? E1000_STATUS_FD : 0;
+    } else {
+        res |= E1000_STATUS_FD;
+    }
+
+    if ((core->mac[CTRL] & E1000_CTRL_FRCSPD) ||
+        (core->mac[CTRL_EXT] & E1000_CTRL_EXT_SPD_BYPS)) {
+        switch (core->mac[CTRL] & E1000_CTRL_SPD_SEL) {
+        case E1000_CTRL_SPD_10:
+            res |= E1000_STATUS_SPEED_10;
+            break;
+        case E1000_CTRL_SPD_100:
+            res |= E1000_STATUS_SPEED_100;
+            break;
+        case E1000_CTRL_SPD_1000:
+        default:
+            res |= E1000_STATUS_SPEED_1000;
+            break;
+        }
+    } else {
+        res |= E1000_STATUS_SPEED_1000;
+    }
+
+    if (num_vfs) {
+        res |= num_vfs << E1000_STATUS_NUM_VFS_SHIFT;
+        res |= E1000_STATUS_IOV_MODE;
+    }
+
+    /*
+     * Windows driver 12.18.9.23 resets if E1000_STATUS_GIO_MASTER_ENABLE is
+     * left set after E1000_CTRL_LRST is set.
+     */
+    if (!(core->mac[CTRL] & E1000_CTRL_GIO_MASTER_DISABLE) &&
+        !(core->mac[CTRL] & E1000_CTRL_LRST)) {
+        res |= E1000_STATUS_GIO_MASTER_ENABLE;
+    }
+
+    return res;
+}
+
+static void
+igb_mac_writereg(IGBCore *core, int index, uint32_t val)
+{
+    core->mac[index] = val;
+}
+
+static void
+igb_mac_setmacaddr(IGBCore *core, int index, uint32_t val)
+{
+    uint32_t macaddr[2];
+
+    core->mac[index] = val;
+
+    macaddr[0] = cpu_to_le32(core->mac[RA]);
+    macaddr[1] = cpu_to_le32(core->mac[RA + 1]);
+    qemu_format_nic_info_str(qemu_get_queue(core->owner_nic),
+        (uint8_t *) macaddr);
+
+    trace_e1000e_mac_set_sw(MAC_ARG(macaddr));
+}
+
+static void
+igb_set_eecd(IGBCore *core, int index, uint32_t val)
+{
+    static const uint32_t ro_bits = E1000_EECD_PRES          |
+                                    E1000_EECD_AUTO_RD       |
+                                    E1000_EECD_SIZE_EX_MASK;
+
+    core->mac[EECD] = (core->mac[EECD] & ro_bits) | (val & ~ro_bits);
+}
+
+static void
+igb_set_eerd(IGBCore *core, int index, uint32_t val)
+{
+    uint32_t addr = (val >> E1000_EERW_ADDR_SHIFT) & E1000_EERW_ADDR_MASK;
+    uint32_t flags = 0;
+    uint32_t data = 0;
+
+    if ((addr < IGB_EEPROM_SIZE) && (val & E1000_EERW_START)) {
+        data = core->eeprom[addr];
+        flags = E1000_EERW_DONE;
+    }
+
+    core->mac[EERD] = flags                           |
+                      (addr << E1000_EERW_ADDR_SHIFT) |
+                      (data << E1000_EERW_DATA_SHIFT);
+}
+
+static void
+igb_set_eitr(IGBCore *core, int index, uint32_t val)
+{
+    uint32_t eitr_num = index - EITR0;
+
+    trace_igb_irq_eitr_set(eitr_num, val);
+
+    core->eitr_guest_value[eitr_num] = val & ~E1000_EITR_CNT_IGNR;
+    core->mac[index] = val & 0x7FFE;
+}
+
+static void
+igb_update_rx_offloads(IGBCore *core)
+{
+    int cso_state = igb_rx_l4_cso_enabled(core);
+
+    trace_e1000e_rx_set_cso(cso_state);
+
+    if (core->has_vnet) {
+        qemu_set_offload(qemu_get_queue(core->owner_nic)->peer,
+                         cso_state, 0, 0, 0, 0);
+    }
+}
+
+static void
+igb_set_rxcsum(IGBCore *core, int index, uint32_t val)
+{
+    core->mac[RXCSUM] = val;
+    igb_update_rx_offloads(core);
+}
+
+static void
+igb_set_gcr(IGBCore *core, int index, uint32_t val)
+{
+    uint32_t ro_bits = core->mac[GCR] & E1000_GCR_RO_BITS;
+    core->mac[GCR] = (val & ~E1000_GCR_RO_BITS) | ro_bits;
+}
+
+static uint32_t igb_get_systiml(IGBCore *core, int index)
+{
+    e1000x_timestamp(core->mac, core->timadj, SYSTIML, SYSTIMH);
+    return core->mac[SYSTIML];
+}
+
+static uint32_t igb_get_rxsatrh(IGBCore *core, int index)
+{
+    core->mac[TSYNCRXCTL] &= ~E1000_TSYNCRXCTL_VALID;
+    return core->mac[RXSATRH];
+}
+
+static uint32_t igb_get_txstmph(IGBCore *core, int index)
+{
+    core->mac[TSYNCTXCTL] &= ~E1000_TSYNCTXCTL_VALID;
+    return core->mac[TXSTMPH];
+}
+
+static void igb_set_timinca(IGBCore *core, int index, uint32_t val)
+{
+    e1000x_set_timinca(core->mac, &core->timadj, val);
+}
+
+static void igb_set_timadjh(IGBCore *core, int index, uint32_t val)
+{
+    core->mac[TIMADJH] = val;
+    core->timadj += core->mac[TIMADJL] | ((int64_t)core->mac[TIMADJH] << 32);
+}
+
+#define igb_getreg(x)    [x] = igb_mac_readreg
+typedef uint32_t (*readops)(IGBCore *, int);
+static const readops igb_macreg_readops[] = {
+    igb_getreg(WUFC),
+    igb_getreg(MANC),
+    igb_getreg(TOTL),
+    igb_getreg(RDT0),
+    igb_getreg(RDT1),
+    igb_getreg(RDT2),
+    igb_getreg(RDT3),
+    igb_getreg(RDT4),
+    igb_getreg(RDT5),
+    igb_getreg(RDT6),
+    igb_getreg(RDT7),
+    igb_getreg(RDT8),
+    igb_getreg(RDT9),
+    igb_getreg(RDT10),
+    igb_getreg(RDT11),
+    igb_getreg(RDT12),
+    igb_getreg(RDT13),
+    igb_getreg(RDT14),
+    igb_getreg(RDT15),
+    igb_getreg(RDBAH0),
+    igb_getreg(RDBAH1),
+    igb_getreg(RDBAH2),
+    igb_getreg(RDBAH3),
+    igb_getreg(RDBAH4),
+    igb_getreg(RDBAH5),
+    igb_getreg(RDBAH6),
+    igb_getreg(RDBAH7),
+    igb_getreg(RDBAH8),
+    igb_getreg(RDBAH9),
+    igb_getreg(RDBAH10),
+    igb_getreg(RDBAH11),
+    igb_getreg(RDBAH12),
+    igb_getreg(RDBAH13),
+    igb_getreg(RDBAH14),
+    igb_getreg(RDBAH15),
+    igb_getreg(TDBAL0),
+    igb_getreg(TDBAL1),
+    igb_getreg(TDBAL2),
+    igb_getreg(TDBAL3),
+    igb_getreg(TDBAL4),
+    igb_getreg(TDBAL5),
+    igb_getreg(TDBAL6),
+    igb_getreg(TDBAL7),
+    igb_getreg(TDBAL8),
+    igb_getreg(TDBAL9),
+    igb_getreg(TDBAL10),
+    igb_getreg(TDBAL11),
+    igb_getreg(TDBAL12),
+    igb_getreg(TDBAL13),
+    igb_getreg(TDBAL14),
+    igb_getreg(TDBAL15),
+    igb_getreg(RDLEN0),
+    igb_getreg(RDLEN1),
+    igb_getreg(RDLEN2),
+    igb_getreg(RDLEN3),
+    igb_getreg(RDLEN4),
+    igb_getreg(RDLEN5),
+    igb_getreg(RDLEN6),
+    igb_getreg(RDLEN7),
+    igb_getreg(RDLEN8),
+    igb_getreg(RDLEN9),
+    igb_getreg(RDLEN10),
+    igb_getreg(RDLEN11),
+    igb_getreg(RDLEN12),
+    igb_getreg(RDLEN13),
+    igb_getreg(RDLEN14),
+    igb_getreg(RDLEN15),
+    igb_getreg(SRRCTL0),
+    igb_getreg(SRRCTL1),
+    igb_getreg(SRRCTL2),
+    igb_getreg(SRRCTL3),
+    igb_getreg(SRRCTL4),
+    igb_getreg(SRRCTL5),
+    igb_getreg(SRRCTL6),
+    igb_getreg(SRRCTL7),
+    igb_getreg(SRRCTL8),
+    igb_getreg(SRRCTL9),
+    igb_getreg(SRRCTL10),
+    igb_getreg(SRRCTL11),
+    igb_getreg(SRRCTL12),
+    igb_getreg(SRRCTL13),
+    igb_getreg(SRRCTL14),
+    igb_getreg(SRRCTL15),
+    igb_getreg(LATECOL),
+    igb_getreg(XONTXC),
+    igb_getreg(TDFH),
+    igb_getreg(TDFT),
+    igb_getreg(TDFHS),
+    igb_getreg(TDFTS),
+    igb_getreg(TDFPC),
+    igb_getreg(WUS),
+    igb_getreg(RDFH),
+    igb_getreg(RDFT),
+    igb_getreg(RDFHS),
+    igb_getreg(RDFTS),
+    igb_getreg(RDFPC),
+    igb_getreg(GORCL),
+    igb_getreg(MGTPRC),
+    igb_getreg(EERD),
+    igb_getreg(EIAC),
+    igb_getreg(MANC2H),
+    igb_getreg(RXCSUM),
+    igb_getreg(GSCL_3),
+    igb_getreg(GSCN_2),
+    igb_getreg(FCAH),
+    igb_getreg(FCRTH),
+    igb_getreg(FLOP),
+    igb_getreg(RXSTMPH),
+    igb_getreg(TXSTMPL),
+    igb_getreg(TIMADJL),
+    igb_getreg(RDH0),
+    igb_getreg(RDH1),
+    igb_getreg(RDH2),
+    igb_getreg(RDH3),
+    igb_getreg(RDH4),
+    igb_getreg(RDH5),
+    igb_getreg(RDH6),
+    igb_getreg(RDH7),
+    igb_getreg(RDH8),
+    igb_getreg(RDH9),
+    igb_getreg(RDH10),
+    igb_getreg(RDH11),
+    igb_getreg(RDH12),
+    igb_getreg(RDH13),
+    igb_getreg(RDH14),
+    igb_getreg(RDH15),
+    igb_getreg(TDT0),
+    igb_getreg(TDT1),
+    igb_getreg(TDT2),
+    igb_getreg(TDT3),
+    igb_getreg(TDT4),
+    igb_getreg(TDT5),
+    igb_getreg(TDT6),
+    igb_getreg(TDT7),
+    igb_getreg(TDT8),
+    igb_getreg(TDT9),
+    igb_getreg(TDT10),
+    igb_getreg(TDT11),
+    igb_getreg(TDT12),
+    igb_getreg(TDT13),
+    igb_getreg(TDT14),
+    igb_getreg(TDT15),
+    igb_getreg(TNCRS),
+    igb_getreg(RJC),
+    igb_getreg(IAM),
+    igb_getreg(GSCL_2),
+    igb_getreg(TIPG),
+    igb_getreg(FLMNGCTL),
+    igb_getreg(FLMNGCNT),
+    igb_getreg(TSYNCTXCTL),
+    igb_getreg(EEMNGDATA),
+    igb_getreg(CTRL_EXT),
+    igb_getreg(SYSTIMH),
+    igb_getreg(EEMNGCTL),
+    igb_getreg(FLMNGDATA),
+    igb_getreg(TSYNCRXCTL),
+    igb_getreg(LEDCTL),
+    igb_getreg(TCTL),
+    igb_getreg(TCTL_EXT),
+    igb_getreg(DTXCTL),
+    igb_getreg(RXPBS),
+    igb_getreg(TDH0),
+    igb_getreg(TDH1),
+    igb_getreg(TDH2),
+    igb_getreg(TDH3),
+    igb_getreg(TDH4),
+    igb_getreg(TDH5),
+    igb_getreg(TDH6),
+    igb_getreg(TDH7),
+    igb_getreg(TDH8),
+    igb_getreg(TDH9),
+    igb_getreg(TDH10),
+    igb_getreg(TDH11),
+    igb_getreg(TDH12),
+    igb_getreg(TDH13),
+    igb_getreg(TDH14),
+    igb_getreg(TDH15),
+    igb_getreg(ECOL),
+    igb_getreg(DC),
+    igb_getreg(RLEC),
+    igb_getreg(XOFFTXC),
+    igb_getreg(RFC),
+    igb_getreg(RNBC),
+    igb_getreg(MGTPTC),
+    igb_getreg(TIMINCA),
+    igb_getreg(FACTPS),
+    igb_getreg(GSCL_1),
+    igb_getreg(GSCN_0),
+    igb_getreg(PBACLR),
+    igb_getreg(FCTTV),
+    igb_getreg(RXSATRL),
+    igb_getreg(TORL),
+    igb_getreg(TDLEN0),
+    igb_getreg(TDLEN1),
+    igb_getreg(TDLEN2),
+    igb_getreg(TDLEN3),
+    igb_getreg(TDLEN4),
+    igb_getreg(TDLEN5),
+    igb_getreg(TDLEN6),
+    igb_getreg(TDLEN7),
+    igb_getreg(TDLEN8),
+    igb_getreg(TDLEN9),
+    igb_getreg(TDLEN10),
+    igb_getreg(TDLEN11),
+    igb_getreg(TDLEN12),
+    igb_getreg(TDLEN13),
+    igb_getreg(TDLEN14),
+    igb_getreg(TDLEN15),
+    igb_getreg(MCC),
+    igb_getreg(WUC),
+    igb_getreg(EECD),
+    igb_getreg(FCRTV),
+    igb_getreg(TXDCTL0),
+    igb_getreg(TXDCTL1),
+    igb_getreg(TXDCTL2),
+    igb_getreg(TXDCTL3),
+    igb_getreg(TXDCTL4),
+    igb_getreg(TXDCTL5),
+    igb_getreg(TXDCTL6),
+    igb_getreg(TXDCTL7),
+    igb_getreg(TXDCTL8),
+    igb_getreg(TXDCTL9),
+    igb_getreg(TXDCTL10),
+    igb_getreg(TXDCTL11),
+    igb_getreg(TXDCTL12),
+    igb_getreg(TXDCTL13),
+    igb_getreg(TXDCTL14),
+    igb_getreg(TXDCTL15),
+    igb_getreg(TXCTL0),
+    igb_getreg(TXCTL1),
+    igb_getreg(TXCTL2),
+    igb_getreg(TXCTL3),
+    igb_getreg(TXCTL4),
+    igb_getreg(TXCTL5),
+    igb_getreg(TXCTL6),
+    igb_getreg(TXCTL7),
+    igb_getreg(TXCTL8),
+    igb_getreg(TXCTL9),
+    igb_getreg(TXCTL10),
+    igb_getreg(TXCTL11),
+    igb_getreg(TXCTL12),
+    igb_getreg(TXCTL13),
+    igb_getreg(TXCTL14),
+    igb_getreg(TXCTL15),
+    igb_getreg(TDWBAL0),
+    igb_getreg(TDWBAL1),
+    igb_getreg(TDWBAL2),
+    igb_getreg(TDWBAL3),
+    igb_getreg(TDWBAL4),
+    igb_getreg(TDWBAL5),
+    igb_getreg(TDWBAL6),
+    igb_getreg(TDWBAL7),
+    igb_getreg(TDWBAL8),
+    igb_getreg(TDWBAL9),
+    igb_getreg(TDWBAL10),
+    igb_getreg(TDWBAL11),
+    igb_getreg(TDWBAL12),
+    igb_getreg(TDWBAL13),
+    igb_getreg(TDWBAL14),
+    igb_getreg(TDWBAL15),
+    igb_getreg(TDWBAH0),
+    igb_getreg(TDWBAH1),
+    igb_getreg(TDWBAH2),
+    igb_getreg(TDWBAH3),
+    igb_getreg(TDWBAH4),
+    igb_getreg(TDWBAH5),
+    igb_getreg(TDWBAH6),
+    igb_getreg(TDWBAH7),
+    igb_getreg(TDWBAH8),
+    igb_getreg(TDWBAH9),
+    igb_getreg(TDWBAH10),
+    igb_getreg(TDWBAH11),
+    igb_getreg(TDWBAH12),
+    igb_getreg(TDWBAH13),
+    igb_getreg(TDWBAH14),
+    igb_getreg(TDWBAH15),
+    igb_getreg(PVTCTRL0),
+    igb_getreg(PVTCTRL1),
+    igb_getreg(PVTCTRL2),
+    igb_getreg(PVTCTRL3),
+    igb_getreg(PVTCTRL4),
+    igb_getreg(PVTCTRL5),
+    igb_getreg(PVTCTRL6),
+    igb_getreg(PVTCTRL7),
+    igb_getreg(PVTEIMS0),
+    igb_getreg(PVTEIMS1),
+    igb_getreg(PVTEIMS2),
+    igb_getreg(PVTEIMS3),
+    igb_getreg(PVTEIMS4),
+    igb_getreg(PVTEIMS5),
+    igb_getreg(PVTEIMS6),
+    igb_getreg(PVTEIMS7),
+    igb_getreg(PVTEIAC0),
+    igb_getreg(PVTEIAC1),
+    igb_getreg(PVTEIAC2),
+    igb_getreg(PVTEIAC3),
+    igb_getreg(PVTEIAC4),
+    igb_getreg(PVTEIAC5),
+    igb_getreg(PVTEIAC6),
+    igb_getreg(PVTEIAC7),
+    igb_getreg(PVTEIAM0),
+    igb_getreg(PVTEIAM1),
+    igb_getreg(PVTEIAM2),
+    igb_getreg(PVTEIAM3),
+    igb_getreg(PVTEIAM4),
+    igb_getreg(PVTEIAM5),
+    igb_getreg(PVTEIAM6),
+    igb_getreg(PVTEIAM7),
+    igb_getreg(PVFGPRC0),
+    igb_getreg(PVFGPRC1),
+    igb_getreg(PVFGPRC2),
+    igb_getreg(PVFGPRC3),
+    igb_getreg(PVFGPRC4),
+    igb_getreg(PVFGPRC5),
+    igb_getreg(PVFGPRC6),
+    igb_getreg(PVFGPRC7),
+    igb_getreg(PVFGPTC0),
+    igb_getreg(PVFGPTC1),
+    igb_getreg(PVFGPTC2),
+    igb_getreg(PVFGPTC3),
+    igb_getreg(PVFGPTC4),
+    igb_getreg(PVFGPTC5),
+    igb_getreg(PVFGPTC6),
+    igb_getreg(PVFGPTC7),
+    igb_getreg(PVFGORC0),
+    igb_getreg(PVFGORC1),
+    igb_getreg(PVFGORC2),
+    igb_getreg(PVFGORC3),
+    igb_getreg(PVFGORC4),
+    igb_getreg(PVFGORC5),
+    igb_getreg(PVFGORC6),
+    igb_getreg(PVFGORC7),
+    igb_getreg(PVFGOTC0),
+    igb_getreg(PVFGOTC1),
+    igb_getreg(PVFGOTC2),
+    igb_getreg(PVFGOTC3),
+    igb_getreg(PVFGOTC4),
+    igb_getreg(PVFGOTC5),
+    igb_getreg(PVFGOTC6),
+    igb_getreg(PVFGOTC7),
+    igb_getreg(PVFMPRC0),
+    igb_getreg(PVFMPRC1),
+    igb_getreg(PVFMPRC2),
+    igb_getreg(PVFMPRC3),
+    igb_getreg(PVFMPRC4),
+    igb_getreg(PVFMPRC5),
+    igb_getreg(PVFMPRC6),
+    igb_getreg(PVFMPRC7),
+    igb_getreg(PVFGPRLBC0),
+    igb_getreg(PVFGPRLBC1),
+    igb_getreg(PVFGPRLBC2),
+    igb_getreg(PVFGPRLBC3),
+    igb_getreg(PVFGPRLBC4),
+    igb_getreg(PVFGPRLBC5),
+    igb_getreg(PVFGPRLBC6),
+    igb_getreg(PVFGPRLBC7),
+    igb_getreg(PVFGPTLBC0),
+    igb_getreg(PVFGPTLBC1),
+    igb_getreg(PVFGPTLBC2),
+    igb_getreg(PVFGPTLBC3),
+    igb_getreg(PVFGPTLBC4),
+    igb_getreg(PVFGPTLBC5),
+    igb_getreg(PVFGPTLBC6),
+    igb_getreg(PVFGPTLBC7),
+    igb_getreg(PVFGORLBC0),
+    igb_getreg(PVFGORLBC1),
+    igb_getreg(PVFGORLBC2),
+    igb_getreg(PVFGORLBC3),
+    igb_getreg(PVFGORLBC4),
+    igb_getreg(PVFGORLBC5),
+    igb_getreg(PVFGORLBC6),
+    igb_getreg(PVFGORLBC7),
+    igb_getreg(PVFGOTLBC0),
+    igb_getreg(PVFGOTLBC1),
+    igb_getreg(PVFGOTLBC2),
+    igb_getreg(PVFGOTLBC3),
+    igb_getreg(PVFGOTLBC4),
+    igb_getreg(PVFGOTLBC5),
+    igb_getreg(PVFGOTLBC6),
+    igb_getreg(PVFGOTLBC7),
+    igb_getreg(RCTL),
+    igb_getreg(MDIC),
+    igb_getreg(FCRUC),
+    igb_getreg(VET),
+    igb_getreg(RDBAL0),
+    igb_getreg(RDBAL1),
+    igb_getreg(RDBAL2),
+    igb_getreg(RDBAL3),
+    igb_getreg(RDBAL4),
+    igb_getreg(RDBAL5),
+    igb_getreg(RDBAL6),
+    igb_getreg(RDBAL7),
+    igb_getreg(RDBAL8),
+    igb_getreg(RDBAL9),
+    igb_getreg(RDBAL10),
+    igb_getreg(RDBAL11),
+    igb_getreg(RDBAL12),
+    igb_getreg(RDBAL13),
+    igb_getreg(RDBAL14),
+    igb_getreg(RDBAL15),
+    igb_getreg(TDBAH0),
+    igb_getreg(TDBAH1),
+    igb_getreg(TDBAH2),
+    igb_getreg(TDBAH3),
+    igb_getreg(TDBAH4),
+    igb_getreg(TDBAH5),
+    igb_getreg(TDBAH6),
+    igb_getreg(TDBAH7),
+    igb_getreg(TDBAH8),
+    igb_getreg(TDBAH9),
+    igb_getreg(TDBAH10),
+    igb_getreg(TDBAH11),
+    igb_getreg(TDBAH12),
+    igb_getreg(TDBAH13),
+    igb_getreg(TDBAH14),
+    igb_getreg(TDBAH15),
+    igb_getreg(SCC),
+    igb_getreg(COLC),
+    igb_getreg(XOFFRXC),
+    igb_getreg(IPAV),
+    igb_getreg(GOTCL),
+    igb_getreg(MGTPDC),
+    igb_getreg(GCR),
+    igb_getreg(MFVAL),
+    igb_getreg(FUNCTAG),
+    igb_getreg(GSCL_4),
+    igb_getreg(GSCN_3),
+    igb_getreg(MRQC),
+    igb_getreg(FCT),
+    igb_getreg(FLA),
+    igb_getreg(RXDCTL0),
+    igb_getreg(RXDCTL1),
+    igb_getreg(RXDCTL2),
+    igb_getreg(RXDCTL3),
+    igb_getreg(RXDCTL4),
+    igb_getreg(RXDCTL5),
+    igb_getreg(RXDCTL6),
+    igb_getreg(RXDCTL7),
+    igb_getreg(RXDCTL8),
+    igb_getreg(RXDCTL9),
+    igb_getreg(RXDCTL10),
+    igb_getreg(RXDCTL11),
+    igb_getreg(RXDCTL12),
+    igb_getreg(RXDCTL13),
+    igb_getreg(RXDCTL14),
+    igb_getreg(RXDCTL15),
+    igb_getreg(RXSTMPL),
+    igb_getreg(TIMADJH),
+    igb_getreg(FCRTL),
+    igb_getreg(XONRXC),
+    igb_getreg(RFCTL),
+    igb_getreg(GSCN_1),
+    igb_getreg(FCAL),
+    igb_getreg(GPIE),
+    igb_getreg(TXPBS),
+    igb_getreg(RLPML),
+
+    [TOTH]    = igb_mac_read_clr8,
+    [GOTCH]   = igb_mac_read_clr8,
+    [PRC64]   = igb_mac_read_clr4,
+    [PRC255]  = igb_mac_read_clr4,
+    [PRC1023] = igb_mac_read_clr4,
+    [PTC64]   = igb_mac_read_clr4,
+    [PTC255]  = igb_mac_read_clr4,
+    [PTC1023] = igb_mac_read_clr4,
+    [GPRC]    = igb_mac_read_clr4,
+    [TPT]     = igb_mac_read_clr4,
+    [RUC]     = igb_mac_read_clr4,
+    [BPRC]    = igb_mac_read_clr4,
+    [MPTC]    = igb_mac_read_clr4,
+    [IAC]     = igb_mac_read_clr4,
+    [ICR]     = igb_mac_icr_read,
+    [STATUS]  = igb_get_status,
+    [ICS]     = igb_mac_ics_read,
+    /*
+     * 8.8.10: Reading the IMC register returns the value of the IMS register.
+     */
+    [IMC]     = igb_mac_ims_read,
+    [TORH]    = igb_mac_read_clr8,
+    [GORCH]   = igb_mac_read_clr8,
+    [PRC127]  = igb_mac_read_clr4,
+    [PRC511]  = igb_mac_read_clr4,
+    [PRC1522] = igb_mac_read_clr4,
+    [PTC127]  = igb_mac_read_clr4,
+    [PTC511]  = igb_mac_read_clr4,
+    [PTC1522] = igb_mac_read_clr4,
+    [GPTC]    = igb_mac_read_clr4,
+    [TPR]     = igb_mac_read_clr4,
+    [ROC]     = igb_mac_read_clr4,
+    [MPRC]    = igb_mac_read_clr4,
+    [BPTC]    = igb_mac_read_clr4,
+    [TSCTC]   = igb_mac_read_clr4,
+    [CTRL]    = igb_get_ctrl,
+    [SWSM]    = igb_mac_swsm_read,
+    [IMS]     = igb_mac_ims_read,
+    [SYSTIML] = igb_get_systiml,
+    [RXSATRH] = igb_get_rxsatrh,
+    [TXSTMPH] = igb_get_txstmph,
+
+    [CRCERRS ... MPC]      = igb_mac_readreg,
+    [IP6AT ... IP6AT + 3]  = igb_mac_readreg,
+    [IP4AT ... IP4AT + 6]  = igb_mac_readreg,
+    [RA ... RA + 31]       = igb_mac_readreg,
+    [RA2 ... RA2 + 31]     = igb_mac_readreg,
+    [WUPM ... WUPM + 31]   = igb_mac_readreg,
+    [MTA ... MTA + E1000_MC_TBL_SIZE - 1]    = igb_mac_readreg,
+    [VFTA ... VFTA + E1000_VLAN_FILTER_TBL_SIZE - 1]  = igb_mac_readreg,
+    [FFMT ... FFMT + 254]  = igb_mac_readreg,
+    [MDEF ... MDEF + 7]    = igb_mac_readreg,
+    [FTFT ... FTFT + 254]  = igb_mac_readreg,
+    [RETA ... RETA + 31]   = igb_mac_readreg,
+    [RSSRK ... RSSRK + 9]  = igb_mac_readreg,
+    [MAVTV0 ... MAVTV3]    = igb_mac_readreg,
+    [EITR0 ... EITR0 + IGB_INTR_NUM - 1] = igb_mac_eitr_read,
+    [PVTEICR0] = igb_mac_read_clr4,
+    [PVTEICR1] = igb_mac_read_clr4,
+    [PVTEICR2] = igb_mac_read_clr4,
+    [PVTEICR3] = igb_mac_read_clr4,
+    [PVTEICR4] = igb_mac_read_clr4,
+    [PVTEICR5] = igb_mac_read_clr4,
+    [PVTEICR6] = igb_mac_read_clr4,
+    [PVTEICR7] = igb_mac_read_clr4,
+
+    /* IGB specific: */
+    [FWSM]       = igb_mac_readreg,
+    [SW_FW_SYNC] = igb_mac_readreg,
+    [HTCBDPC]    = igb_mac_read_clr4,
+    [EICR]       = igb_mac_read_clr4,
+    [EIMS]       = igb_mac_readreg,
+    [EIAM]       = igb_mac_readreg,
+    [IVAR0 ... IVAR0 + 7] = igb_mac_readreg,
+    igb_getreg(IVAR_MISC),
+    igb_getreg(VT_CTL),
+    [P2VMAILBOX0 ... P2VMAILBOX7] = igb_mac_readreg,
+    [V2PMAILBOX0 ... V2PMAILBOX7] = igb_mac_vfmailbox_read,
+    igb_getreg(MBVFICR),
+    [VMBMEM0 ... VMBMEM0 + 127] = igb_mac_readreg,
+    igb_getreg(MBVFIMR),
+    igb_getreg(VFLRE),
+    igb_getreg(VFRE),
+    igb_getreg(VFTE),
+    igb_getreg(QDE),
+    igb_getreg(DTXSWC),
+    igb_getreg(RPLOLR),
+    [VLVF0 ... VLVF0 + E1000_VLVF_ARRAY_SIZE - 1] = igb_mac_readreg,
+    [VMVIR0 ... VMVIR7] = igb_mac_readreg,
+    [VMOLR0 ... VMOLR7] = igb_mac_readreg,
+    [WVBR] = igb_mac_read_clr4,
+    [RQDPC0] = igb_mac_read_clr4,
+    [RQDPC1] = igb_mac_read_clr4,
+    [RQDPC2] = igb_mac_read_clr4,
+    [RQDPC3] = igb_mac_read_clr4,
+    [RQDPC4] = igb_mac_read_clr4,
+    [RQDPC5] = igb_mac_read_clr4,
+    [RQDPC6] = igb_mac_read_clr4,
+    [RQDPC7] = igb_mac_read_clr4,
+    [RQDPC8] = igb_mac_read_clr4,
+    [RQDPC9] = igb_mac_read_clr4,
+    [RQDPC10] = igb_mac_read_clr4,
+    [RQDPC11] = igb_mac_read_clr4,
+    [RQDPC12] = igb_mac_read_clr4,
+    [RQDPC13] = igb_mac_read_clr4,
+    [RQDPC14] = igb_mac_read_clr4,
+    [RQDPC15] = igb_mac_read_clr4,
+    [VTIVAR ... VTIVAR + 7] = igb_mac_readreg,
+    [VTIVAR_MISC ... VTIVAR_MISC + 7] = igb_mac_readreg,
+};
+enum { IGB_NREADOPS = ARRAY_SIZE(igb_macreg_readops) };
+
+#define igb_putreg(x)    [x] = igb_mac_writereg
+typedef void (*writeops)(IGBCore *, int, uint32_t);
+static const writeops igb_macreg_writeops[] = {
+    igb_putreg(SWSM),
+    igb_putreg(WUFC),
+    igb_putreg(RDBAH0),
+    igb_putreg(RDBAH1),
+    igb_putreg(RDBAH2),
+    igb_putreg(RDBAH3),
+    igb_putreg(RDBAH4),
+    igb_putreg(RDBAH5),
+    igb_putreg(RDBAH6),
+    igb_putreg(RDBAH7),
+    igb_putreg(RDBAH8),
+    igb_putreg(RDBAH9),
+    igb_putreg(RDBAH10),
+    igb_putreg(RDBAH11),
+    igb_putreg(RDBAH12),
+    igb_putreg(RDBAH13),
+    igb_putreg(RDBAH14),
+    igb_putreg(RDBAH15),
+    igb_putreg(SRRCTL0),
+    igb_putreg(SRRCTL1),
+    igb_putreg(SRRCTL2),
+    igb_putreg(SRRCTL3),
+    igb_putreg(SRRCTL4),
+    igb_putreg(SRRCTL5),
+    igb_putreg(SRRCTL6),
+    igb_putreg(SRRCTL7),
+    igb_putreg(SRRCTL8),
+    igb_putreg(SRRCTL9),
+    igb_putreg(SRRCTL10),
+    igb_putreg(SRRCTL11),
+    igb_putreg(SRRCTL12),
+    igb_putreg(SRRCTL13),
+    igb_putreg(SRRCTL14),
+    igb_putreg(SRRCTL15),
+    igb_putreg(RXDCTL0),
+    igb_putreg(RXDCTL1),
+    igb_putreg(RXDCTL2),
+    igb_putreg(RXDCTL3),
+    igb_putreg(RXDCTL4),
+    igb_putreg(RXDCTL5),
+    igb_putreg(RXDCTL6),
+    igb_putreg(RXDCTL7),
+    igb_putreg(RXDCTL8),
+    igb_putreg(RXDCTL9),
+    igb_putreg(RXDCTL10),
+    igb_putreg(RXDCTL11),
+    igb_putreg(RXDCTL12),
+    igb_putreg(RXDCTL13),
+    igb_putreg(RXDCTL14),
+    igb_putreg(RXDCTL15),
+    igb_putreg(LEDCTL),
+    igb_putreg(TCTL),
+    igb_putreg(TCTL_EXT),
+    igb_putreg(DTXCTL),
+    igb_putreg(RXPBS),
+    igb_putreg(RQDPC0),
+    igb_putreg(FCAL),
+    igb_putreg(FCRUC),
+    igb_putreg(WUC),
+    igb_putreg(WUS),
+    igb_putreg(IPAV),
+    igb_putreg(TDBAH0),
+    igb_putreg(TDBAH1),
+    igb_putreg(TDBAH2),
+    igb_putreg(TDBAH3),
+    igb_putreg(TDBAH4),
+    igb_putreg(TDBAH5),
+    igb_putreg(TDBAH6),
+    igb_putreg(TDBAH7),
+    igb_putreg(TDBAH8),
+    igb_putreg(TDBAH9),
+    igb_putreg(TDBAH10),
+    igb_putreg(TDBAH11),
+    igb_putreg(TDBAH12),
+    igb_putreg(TDBAH13),
+    igb_putreg(TDBAH14),
+    igb_putreg(TDBAH15),
+    igb_putreg(IAM),
+    igb_putreg(MANC),
+    igb_putreg(MANC2H),
+    igb_putreg(MFVAL),
+    igb_putreg(FACTPS),
+    igb_putreg(FUNCTAG),
+    igb_putreg(GSCL_1),
+    igb_putreg(GSCL_2),
+    igb_putreg(GSCL_3),
+    igb_putreg(GSCL_4),
+    igb_putreg(GSCN_0),
+    igb_putreg(GSCN_1),
+    igb_putreg(GSCN_2),
+    igb_putreg(GSCN_3),
+    igb_putreg(MRQC),
+    igb_putreg(FLOP),
+    igb_putreg(FLA),
+    igb_putreg(TXDCTL0),
+    igb_putreg(TXDCTL1),
+    igb_putreg(TXDCTL2),
+    igb_putreg(TXDCTL3),
+    igb_putreg(TXDCTL4),
+    igb_putreg(TXDCTL5),
+    igb_putreg(TXDCTL6),
+    igb_putreg(TXDCTL7),
+    igb_putreg(TXDCTL8),
+    igb_putreg(TXDCTL9),
+    igb_putreg(TXDCTL10),
+    igb_putreg(TXDCTL11),
+    igb_putreg(TXDCTL12),
+    igb_putreg(TXDCTL13),
+    igb_putreg(TXDCTL14),
+    igb_putreg(TXDCTL15),
+    igb_putreg(TXCTL0),
+    igb_putreg(TXCTL1),
+    igb_putreg(TXCTL2),
+    igb_putreg(TXCTL3),
+    igb_putreg(TXCTL4),
+    igb_putreg(TXCTL5),
+    igb_putreg(TXCTL6),
+    igb_putreg(TXCTL7),
+    igb_putreg(TXCTL8),
+    igb_putreg(TXCTL9),
+    igb_putreg(TXCTL10),
+    igb_putreg(TXCTL11),
+    igb_putreg(TXCTL12),
+    igb_putreg(TXCTL13),
+    igb_putreg(TXCTL14),
+    igb_putreg(TXCTL15),
+    igb_putreg(TDWBAL0),
+    igb_putreg(TDWBAL1),
+    igb_putreg(TDWBAL2),
+    igb_putreg(TDWBAL3),
+    igb_putreg(TDWBAL4),
+    igb_putreg(TDWBAL5),
+    igb_putreg(TDWBAL6),
+    igb_putreg(TDWBAL7),
+    igb_putreg(TDWBAL8),
+    igb_putreg(TDWBAL9),
+    igb_putreg(TDWBAL10),
+    igb_putreg(TDWBAL11),
+    igb_putreg(TDWBAL12),
+    igb_putreg(TDWBAL13),
+    igb_putreg(TDWBAL14),
+    igb_putreg(TDWBAL15),
+    igb_putreg(TDWBAH0),
+    igb_putreg(TDWBAH1),
+    igb_putreg(TDWBAH2),
+    igb_putreg(TDWBAH3),
+    igb_putreg(TDWBAH4),
+    igb_putreg(TDWBAH5),
+    igb_putreg(TDWBAH6),
+    igb_putreg(TDWBAH7),
+    igb_putreg(TDWBAH8),
+    igb_putreg(TDWBAH9),
+    igb_putreg(TDWBAH10),
+    igb_putreg(TDWBAH11),
+    igb_putreg(TDWBAH12),
+    igb_putreg(TDWBAH13),
+    igb_putreg(TDWBAH14),
+    igb_putreg(TDWBAH15),
+    igb_putreg(TIPG),
+    igb_putreg(RXSTMPH),
+    igb_putreg(RXSTMPL),
+    igb_putreg(RXSATRL),
+    igb_putreg(RXSATRH),
+    igb_putreg(TXSTMPL),
+    igb_putreg(TXSTMPH),
+    igb_putreg(SYSTIML),
+    igb_putreg(SYSTIMH),
+    igb_putreg(TIMADJL),
+    igb_putreg(TSYNCRXCTL),
+    igb_putreg(TSYNCTXCTL),
+    igb_putreg(EEMNGCTL),
+    igb_putreg(GPIE),
+    igb_putreg(TXPBS),
+    igb_putreg(RLPML),
+    igb_putreg(VET),
+
+    [TDH0]     = igb_set_16bit,
+    [TDH1]     = igb_set_16bit,
+    [TDH2]     = igb_set_16bit,
+    [TDH3]     = igb_set_16bit,
+    [TDH4]     = igb_set_16bit,
+    [TDH5]     = igb_set_16bit,
+    [TDH6]     = igb_set_16bit,
+    [TDH7]     = igb_set_16bit,
+    [TDH8]     = igb_set_16bit,
+    [TDH9]     = igb_set_16bit,
+    [TDH10]    = igb_set_16bit,
+    [TDH11]    = igb_set_16bit,
+    [TDH12]    = igb_set_16bit,
+    [TDH13]    = igb_set_16bit,
+    [TDH14]    = igb_set_16bit,
+    [TDH15]    = igb_set_16bit,
+    [TDT0]     = igb_set_tdt,
+    [TDT1]     = igb_set_tdt,
+    [TDT2]     = igb_set_tdt,
+    [TDT3]     = igb_set_tdt,
+    [TDT4]     = igb_set_tdt,
+    [TDT5]     = igb_set_tdt,
+    [TDT6]     = igb_set_tdt,
+    [TDT7]     = igb_set_tdt,
+    [TDT8]     = igb_set_tdt,
+    [TDT9]     = igb_set_tdt,
+    [TDT10]    = igb_set_tdt,
+    [TDT11]    = igb_set_tdt,
+    [TDT12]    = igb_set_tdt,
+    [TDT13]    = igb_set_tdt,
+    [TDT14]    = igb_set_tdt,
+    [TDT15]    = igb_set_tdt,
+    [MDIC]     = igb_set_mdic,
+    [ICS]      = igb_set_ics,
+    [RDH0]     = igb_set_16bit,
+    [RDH1]     = igb_set_16bit,
+    [RDH2]     = igb_set_16bit,
+    [RDH3]     = igb_set_16bit,
+    [RDH4]     = igb_set_16bit,
+    [RDH5]     = igb_set_16bit,
+    [RDH6]     = igb_set_16bit,
+    [RDH7]     = igb_set_16bit,
+    [RDH8]     = igb_set_16bit,
+    [RDH9]     = igb_set_16bit,
+    [RDH10]    = igb_set_16bit,
+    [RDH11]    = igb_set_16bit,
+    [RDH12]    = igb_set_16bit,
+    [RDH13]    = igb_set_16bit,
+    [RDH14]    = igb_set_16bit,
+    [RDH15]    = igb_set_16bit,
+    [RDT0]     = igb_set_rdt,
+    [RDT1]     = igb_set_rdt,
+    [RDT2]     = igb_set_rdt,
+    [RDT3]     = igb_set_rdt,
+    [RDT4]     = igb_set_rdt,
+    [RDT5]     = igb_set_rdt,
+    [RDT6]     = igb_set_rdt,
+    [RDT7]     = igb_set_rdt,
+    [RDT8]     = igb_set_rdt,
+    [RDT9]     = igb_set_rdt,
+    [RDT10]    = igb_set_rdt,
+    [RDT11]    = igb_set_rdt,
+    [RDT12]    = igb_set_rdt,
+    [RDT13]    = igb_set_rdt,
+    [RDT14]    = igb_set_rdt,
+    [RDT15]    = igb_set_rdt,
+    [IMC]      = igb_set_imc,
+    [IMS]      = igb_set_ims,
+    [ICR]      = igb_set_icr,
+    [EECD]     = igb_set_eecd,
+    [RCTL]     = igb_set_rx_control,
+    [CTRL]     = igb_set_ctrl,
+    [EERD]     = igb_set_eerd,
+    [TDFH]     = igb_set_13bit,
+    [TDFT]     = igb_set_13bit,
+    [TDFHS]    = igb_set_13bit,
+    [TDFTS]    = igb_set_13bit,
+    [TDFPC]    = igb_set_13bit,
+    [RDFH]     = igb_set_13bit,
+    [RDFT]     = igb_set_13bit,
+    [RDFHS]    = igb_set_13bit,
+    [RDFTS]    = igb_set_13bit,
+    [RDFPC]    = igb_set_13bit,
+    [GCR]      = igb_set_gcr,
+    [RXCSUM]   = igb_set_rxcsum,
+    [TDLEN0]   = igb_set_dlen,
+    [TDLEN1]   = igb_set_dlen,
+    [TDLEN2]   = igb_set_dlen,
+    [TDLEN3]   = igb_set_dlen,
+    [TDLEN4]   = igb_set_dlen,
+    [TDLEN5]   = igb_set_dlen,
+    [TDLEN6]   = igb_set_dlen,
+    [TDLEN7]   = igb_set_dlen,
+    [TDLEN8]   = igb_set_dlen,
+    [TDLEN9]   = igb_set_dlen,
+    [TDLEN10]  = igb_set_dlen,
+    [TDLEN11]  = igb_set_dlen,
+    [TDLEN12]  = igb_set_dlen,
+    [TDLEN13]  = igb_set_dlen,
+    [TDLEN14]  = igb_set_dlen,
+    [TDLEN15]  = igb_set_dlen,
+    [RDLEN0]   = igb_set_dlen,
+    [RDLEN1]   = igb_set_dlen,
+    [RDLEN2]   = igb_set_dlen,
+    [RDLEN3]   = igb_set_dlen,
+    [RDLEN4]   = igb_set_dlen,
+    [RDLEN5]   = igb_set_dlen,
+    [RDLEN6]   = igb_set_dlen,
+    [RDLEN7]   = igb_set_dlen,
+    [RDLEN8]   = igb_set_dlen,
+    [RDLEN9]   = igb_set_dlen,
+    [RDLEN10]  = igb_set_dlen,
+    [RDLEN11]  = igb_set_dlen,
+    [RDLEN12]  = igb_set_dlen,
+    [RDLEN13]  = igb_set_dlen,
+    [RDLEN14]  = igb_set_dlen,
+    [RDLEN15]  = igb_set_dlen,
+    [TDBAL0]   = igb_set_dbal,
+    [TDBAL1]   = igb_set_dbal,
+    [TDBAL2]   = igb_set_dbal,
+    [TDBAL3]   = igb_set_dbal,
+    [TDBAL4]   = igb_set_dbal,
+    [TDBAL5]   = igb_set_dbal,
+    [TDBAL6]   = igb_set_dbal,
+    [TDBAL7]   = igb_set_dbal,
+    [TDBAL8]   = igb_set_dbal,
+    [TDBAL9]   = igb_set_dbal,
+    [TDBAL10]  = igb_set_dbal,
+    [TDBAL11]  = igb_set_dbal,
+    [TDBAL12]  = igb_set_dbal,
+    [TDBAL13]  = igb_set_dbal,
+    [TDBAL14]  = igb_set_dbal,
+    [TDBAL15]  = igb_set_dbal,
+    [RDBAL0]   = igb_set_dbal,
+    [RDBAL1]   = igb_set_dbal,
+    [RDBAL2]   = igb_set_dbal,
+    [RDBAL3]   = igb_set_dbal,
+    [RDBAL4]   = igb_set_dbal,
+    [RDBAL5]   = igb_set_dbal,
+    [RDBAL6]   = igb_set_dbal,
+    [RDBAL7]   = igb_set_dbal,
+    [RDBAL8]   = igb_set_dbal,
+    [RDBAL9]   = igb_set_dbal,
+    [RDBAL10]  = igb_set_dbal,
+    [RDBAL11]  = igb_set_dbal,
+    [RDBAL12]  = igb_set_dbal,
+    [RDBAL13]  = igb_set_dbal,
+    [RDBAL14]  = igb_set_dbal,
+    [RDBAL15]  = igb_set_dbal,
+    [STATUS]   = igb_set_status,
+    [PBACLR]   = igb_set_pbaclr,
+    [CTRL_EXT] = igb_set_ctrlext,
+    [FCAH]     = igb_set_16bit,
+    [FCT]      = igb_set_16bit,
+    [FCTTV]    = igb_set_16bit,
+    [FCRTV]    = igb_set_16bit,
+    [FCRTH]    = igb_set_fcrth,
+    [FCRTL]    = igb_set_fcrtl,
+    [CTRL_DUP] = igb_set_ctrl,
+    [RFCTL]    = igb_set_rfctl,
+    [TIMINCA]  = igb_set_timinca,
+    [TIMADJH]  = igb_set_timadjh,
+
+    [IP6AT ... IP6AT + 3]    = igb_mac_writereg,
+    [IP4AT ... IP4AT + 6]    = igb_mac_writereg,
+    [RA]                     = igb_mac_writereg,
+    [RA + 1]                 = igb_mac_setmacaddr,
+    [RA + 2 ... RA + 31]     = igb_mac_writereg,
+    [RA2 ... RA2 + 31]       = igb_mac_writereg,
+    [WUPM ... WUPM + 31]     = igb_mac_writereg,
+    [MTA ... MTA + E1000_MC_TBL_SIZE - 1] = igb_mac_writereg,
+    [VFTA ... VFTA + E1000_VLAN_FILTER_TBL_SIZE - 1] = igb_mac_writereg,
+    [FFMT ... FFMT + 254]    = igb_set_4bit,
+    [MDEF ... MDEF + 7]      = igb_mac_writereg,
+    [FTFT ... FTFT + 254]    = igb_mac_writereg,
+    [RETA ... RETA + 31]     = igb_mac_writereg,
+    [RSSRK ... RSSRK + 9]    = igb_mac_writereg,
+    [MAVTV0 ... MAVTV3]      = igb_mac_writereg,
+    [EITR0 ... EITR0 + IGB_INTR_NUM - 1] = igb_set_eitr,
+
+    /* IGB specific: */
+    [FWSM]     = igb_mac_writereg,
+    [SW_FW_SYNC] = igb_mac_writereg,
+    [EICR] = igb_set_eicr,
+    [EICS] = igb_set_eics,
+    [EIAC] = igb_set_eiac,
+    [EIAM] = igb_set_eiam,
+    [EIMC] = igb_set_eimc,
+    [EIMS] = igb_set_eims,
+    [IVAR0 ... IVAR0 + 7] = igb_mac_writereg,
+    igb_putreg(IVAR_MISC),
+    igb_putreg(VT_CTL),
+    [P2VMAILBOX0 ... P2VMAILBOX7] = igb_set_pfmailbox,
+    [V2PMAILBOX0 ... V2PMAILBOX7] = igb_set_vfmailbox,
+    [MBVFICR] = igb_w1c,
+    [VMBMEM0 ... VMBMEM0 + 127] = igb_mac_writereg,
+    igb_putreg(MBVFIMR),
+    [VFLRE] = igb_w1c,
+    igb_putreg(VFRE),
+    igb_putreg(VFTE),
+    igb_putreg(QDE),
+    igb_putreg(DTXSWC),
+    igb_putreg(RPLOLR),
+    [VLVF0 ... VLVF0 + E1000_VLVF_ARRAY_SIZE - 1] = igb_mac_writereg,
+    [VMVIR0 ... VMVIR7] = igb_mac_writereg,
+    [VMOLR0 ... VMOLR7] = igb_mac_writereg,
+    [UTA ... UTA + E1000_MC_TBL_SIZE - 1] = igb_mac_writereg,
+    [PVTCTRL0] = igb_set_vtctrl,
+    [PVTCTRL1] = igb_set_vtctrl,
+    [PVTCTRL2] = igb_set_vtctrl,
+    [PVTCTRL3] = igb_set_vtctrl,
+    [PVTCTRL4] = igb_set_vtctrl,
+    [PVTCTRL5] = igb_set_vtctrl,
+    [PVTCTRL6] = igb_set_vtctrl,
+    [PVTCTRL7] = igb_set_vtctrl,
+    [PVTEICS0] = igb_set_vteics,
+    [PVTEICS1] = igb_set_vteics,
+    [PVTEICS2] = igb_set_vteics,
+    [PVTEICS3] = igb_set_vteics,
+    [PVTEICS4] = igb_set_vteics,
+    [PVTEICS5] = igb_set_vteics,
+    [PVTEICS6] = igb_set_vteics,
+    [PVTEICS7] = igb_set_vteics,
+    [PVTEIMS0] = igb_set_vteims,
+    [PVTEIMS1] = igb_set_vteims,
+    [PVTEIMS2] = igb_set_vteims,
+    [PVTEIMS3] = igb_set_vteims,
+    [PVTEIMS4] = igb_set_vteims,
+    [PVTEIMS5] = igb_set_vteims,
+    [PVTEIMS6] = igb_set_vteims,
+    [PVTEIMS7] = igb_set_vteims,
+    [PVTEIMC0] = igb_set_vteimc,
+    [PVTEIMC1] = igb_set_vteimc,
+    [PVTEIMC2] = igb_set_vteimc,
+    [PVTEIMC3] = igb_set_vteimc,
+    [PVTEIMC4] = igb_set_vteimc,
+    [PVTEIMC5] = igb_set_vteimc,
+    [PVTEIMC6] = igb_set_vteimc,
+    [PVTEIMC7] = igb_set_vteimc,
+    [PVTEIAC0] = igb_set_vteiac,
+    [PVTEIAC1] = igb_set_vteiac,
+    [PVTEIAC2] = igb_set_vteiac,
+    [PVTEIAC3] = igb_set_vteiac,
+    [PVTEIAC4] = igb_set_vteiac,
+    [PVTEIAC5] = igb_set_vteiac,
+    [PVTEIAC6] = igb_set_vteiac,
+    [PVTEIAC7] = igb_set_vteiac,
+    [PVTEIAM0] = igb_set_vteiam,
+    [PVTEIAM1] = igb_set_vteiam,
+    [PVTEIAM2] = igb_set_vteiam,
+    [PVTEIAM3] = igb_set_vteiam,
+    [PVTEIAM4] = igb_set_vteiam,
+    [PVTEIAM5] = igb_set_vteiam,
+    [PVTEIAM6] = igb_set_vteiam,
+    [PVTEIAM7] = igb_set_vteiam,
+    [PVTEICR0] = igb_set_vteicr,
+    [PVTEICR1] = igb_set_vteicr,
+    [PVTEICR2] = igb_set_vteicr,
+    [PVTEICR3] = igb_set_vteicr,
+    [PVTEICR4] = igb_set_vteicr,
+    [PVTEICR5] = igb_set_vteicr,
+    [PVTEICR6] = igb_set_vteicr,
+    [PVTEICR7] = igb_set_vteicr,
+    [VTIVAR ... VTIVAR + 7] = igb_set_vtivar,
+    [VTIVAR_MISC ... VTIVAR_MISC + 7] = igb_mac_writereg
+};
+enum { IGB_NWRITEOPS = ARRAY_SIZE(igb_macreg_writeops) };
+
+enum { MAC_ACCESS_PARTIAL = 1 };
+
+/*
+ * The array below combines alias offsets of the index values for the
+ * MAC registers that have aliases, with the indication of not fully
+ * implemented registers (lowest bit). This combination is possible
+ * because all of the offsets are even.
+ */
+static const uint16_t mac_reg_access[E1000E_MAC_SIZE] = {
+    /* Alias index offsets */
+    [FCRTL_A] = 0x07fe,
+    [RDFH_A]  = 0xe904, [RDFT_A]  = 0xe904,
+    [TDFH_A]  = 0xed00, [TDFT_A]  = 0xed00,
+    [RA_A ... RA_A + 31]      = 0x14f0,
+    [VFTA_A ... VFTA_A + E1000_VLAN_FILTER_TBL_SIZE - 1] = 0x1400,
+
+    [RDBAL0_A] = 0x2600,
+    [RDBAH0_A] = 0x2600,
+    [RDLEN0_A] = 0x2600,
+    [SRRCTL0_A] = 0x2600,
+    [RDH0_A] = 0x2600,
+    [RDT0_A] = 0x2600,
+    [RXDCTL0_A] = 0x2600,
+    [RXCTL0_A] = 0x2600,
+    [RQDPC0_A] = 0x2600,
+    [RDBAL1_A] = 0x25D0,
+    [RDBAL2_A] = 0x25A0,
+    [RDBAL3_A] = 0x2570,
+    [RDBAH1_A] = 0x25D0,
+    [RDBAH2_A] = 0x25A0,
+    [RDBAH3_A] = 0x2570,
+    [RDLEN1_A] = 0x25D0,
+    [RDLEN2_A] = 0x25A0,
+    [RDLEN3_A] = 0x2570,
+    [SRRCTL1_A] = 0x25D0,
+    [SRRCTL2_A] = 0x25A0,
+    [SRRCTL3_A] = 0x2570,
+    [RDH1_A] = 0x25D0,
+    [RDH2_A] = 0x25A0,
+    [RDH3_A] = 0x2570,
+    [RDT1_A] = 0x25D0,
+    [RDT2_A] = 0x25A0,
+    [RDT3_A] = 0x2570,
+    [RXDCTL1_A] = 0x25D0,
+    [RXDCTL2_A] = 0x25A0,
+    [RXDCTL3_A] = 0x2570,
+    [RXCTL1_A] = 0x25D0,
+    [RXCTL2_A] = 0x25A0,
+    [RXCTL3_A] = 0x2570,
+    [RQDPC1_A] = 0x25D0,
+    [RQDPC2_A] = 0x25A0,
+    [RQDPC3_A] = 0x2570,
+    [TDBAL0_A] = 0x2A00,
+    [TDBAH0_A] = 0x2A00,
+    [TDLEN0_A] = 0x2A00,
+    [TDH0_A] = 0x2A00,
+    [TDT0_A] = 0x2A00,
+    [TXCTL0_A] = 0x2A00,
+    [TDWBAL0_A] = 0x2A00,
+    [TDWBAH0_A] = 0x2A00,
+    [TDBAL1_A] = 0x29D0,
+    [TDBAL2_A] = 0x29A0,
+    [TDBAL3_A] = 0x2970,
+    [TDBAH1_A] = 0x29D0,
+    [TDBAH2_A] = 0x29A0,
+    [TDBAH3_A] = 0x2970,
+    [TDLEN1_A] = 0x29D0,
+    [TDLEN2_A] = 0x29A0,
+    [TDLEN3_A] = 0x2970,
+    [TDH1_A] = 0x29D0,
+    [TDH2_A] = 0x29A0,
+    [TDH3_A] = 0x2970,
+    [TDT1_A] = 0x29D0,
+    [TDT2_A] = 0x29A0,
+    [TDT3_A] = 0x2970,
+    [TXDCTL0_A] = 0x2A00,
+    [TXDCTL1_A] = 0x29D0,
+    [TXDCTL2_A] = 0x29A0,
+    [TXDCTL3_A] = 0x2970,
+    [TXCTL1_A] = 0x29D0,
+    [TXCTL2_A] = 0x29A0,
+    [TXCTL3_A] = 0x2970,
+    [TDWBAL1_A] = 0x29D0,
+    [TDWBAL2_A] = 0x29A0,
+    [TDWBAL3_A] = 0x2970,
+    [TDWBAH1_A] = 0x29D0,
+    [TDWBAH2_A] = 0x29A0,
+    [TDWBAH3_A] = 0x2970,
+
+    /* Access options */
+    [RDFH]  = MAC_ACCESS_PARTIAL,    [RDFT]  = MAC_ACCESS_PARTIAL,
+    [RDFHS] = MAC_ACCESS_PARTIAL,    [RDFTS] = MAC_ACCESS_PARTIAL,
+    [RDFPC] = MAC_ACCESS_PARTIAL,
+    [TDFH]  = MAC_ACCESS_PARTIAL,    [TDFT]  = MAC_ACCESS_PARTIAL,
+    [TDFHS] = MAC_ACCESS_PARTIAL,    [TDFTS] = MAC_ACCESS_PARTIAL,
+    [TDFPC] = MAC_ACCESS_PARTIAL,    [EECD]  = MAC_ACCESS_PARTIAL,
+    [FLA]   = MAC_ACCESS_PARTIAL,
+    [FCAL]  = MAC_ACCESS_PARTIAL,    [FCAH]  = MAC_ACCESS_PARTIAL,
+    [FCT]   = MAC_ACCESS_PARTIAL,    [FCTTV] = MAC_ACCESS_PARTIAL,
+    [FCRTV] = MAC_ACCESS_PARTIAL,    [FCRTL] = MAC_ACCESS_PARTIAL,
+    [FCRTH] = MAC_ACCESS_PARTIAL,
+    [MAVTV0 ... MAVTV3] = MAC_ACCESS_PARTIAL
+};
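+
+/*
+ * Worked example of the encoding above (illustration only): RDBAL0 lives
+ * at MMIO offset 0x0C000, i.e. register index 0x0C000 >> 2 == 0x3000,
+ * while its legacy alias RDBAL0_A lives at 0x02800, i.e. index 0x0A00.
+ * The table therefore stores the even index delta 0x2600 at [RDBAL0_A];
+ * resolving an access is then assumed to be "alias index + (delta & ~1)",
+ * which leaves the low bit free to flag MAC_ACCESS_PARTIAL.
+ */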
+
+void
+igb_core_write(IGBCore *core, hwaddr addr, uint64_t val, unsigned size)
+{
+    uint16_t index = igb_get_reg_index_with_offset(mac_reg_access, addr);
+
+    if (index < IGB_NWRITEOPS && igb_macreg_writeops[index]) {
+        if (mac_reg_access[index] & MAC_ACCESS_PARTIAL) {
+            trace_e1000e_wrn_regs_write_trivial(index << 2);
+        }
+        trace_e1000e_core_write(index << 2, size, val);
+        igb_macreg_writeops[index](core, index, val);
+    } else if (index < IGB_NREADOPS && igb_macreg_readops[index]) {
+        trace_e1000e_wrn_regs_write_ro(index << 2, size, val);
+    } else {
+        trace_e1000e_wrn_regs_write_unknown(index << 2, size, val);
+    }
+}
+
+uint64_t
+igb_core_read(IGBCore *core, hwaddr addr, unsigned size)
+{
+    uint64_t val;
+    uint16_t index = igb_get_reg_index_with_offset(mac_reg_access, addr);
+
+    if (index < IGB_NREADOPS && igb_macreg_readops[index]) {
+        if (mac_reg_access[index] & MAC_ACCESS_PARTIAL) {
+            trace_e1000e_wrn_regs_read_trivial(index << 2);
+        }
+        val = igb_macreg_readops[index](core, index);
+        trace_e1000e_core_read(index << 2, size, val);
+        return val;
+    } else {
+        trace_e1000e_wrn_regs_read_unknown(index << 2, size);
+    }
+    return 0;
+}
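+
+/*
+ * Note on the dispatch tables used above: registers mapped to
+ * igb_mac_read_clr4/igb_mac_read_clr8 are read-to-clear statistics
+ * counters, so MMIO reads must go through a per-register handler with
+ * side effects rather than a plain core->mac[] lookup.
+ */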
+
+static inline void
+igb_autoneg_pause(IGBCore *core)
+{
+    timer_del(core->autoneg_timer);
+}
+
+static void
+igb_autoneg_resume(IGBCore *core)
+{
+    if (igb_have_autoneg(core) &&
+        !(core->phy[MII_BMSR] & MII_BMSR_AN_COMP)) {
+        qemu_get_queue(core->owner_nic)->link_down = false;
+        timer_mod(core->autoneg_timer,
+                  qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + 500);
+    }
+}
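+
+/*
+ * Pausing stops the virtual-clock timer while the VM is stopped; resuming
+ * re-arms it, so autonegotiation that was pending at stop time completes
+ * roughly 500 ms of guest time after the VM starts running again.
+ */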
+
+static void
+igb_vm_state_change(void *opaque, bool running, RunState state)
+{
+    IGBCore *core = opaque;
+
+    if (running) {
+        trace_e1000e_vm_state_running();
+        igb_intrmgr_resume(core);
+        igb_autoneg_resume(core);
+    } else {
+        trace_e1000e_vm_state_stopped();
+        igb_autoneg_pause(core);
+        igb_intrmgr_pause(core);
+    }
+}
+
+void
+igb_core_pci_realize(IGBCore        *core,
+                     const uint16_t *eeprom_templ,
+                     uint32_t        eeprom_size,
+                     const uint8_t  *macaddr)
+{
+    int i;
+
+    core->autoneg_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
+                                       igb_autoneg_timer, core);
+    igb_intrmgr_pci_realize(core);
+
+    core->vmstate = qemu_add_vm_change_state_handler(igb_vm_state_change, core);
+
+    for (i = 0; i < IGB_NUM_QUEUES; i++) {
+        net_tx_pkt_init(&core->tx[i].tx_pkt, core->owner, E1000E_MAX_TX_FRAGS);
+    }
+
+    net_rx_pkt_init(&core->rx_pkt);
+
+    e1000x_core_prepare_eeprom(core->eeprom,
+                               eeprom_templ,
+                               eeprom_size,
+                               PCI_DEVICE_GET_CLASS(core->owner)->device_id,
+                               macaddr);
+    igb_update_rx_offloads(core);
+}
+
+void
+igb_core_pci_uninit(IGBCore *core)
+{
+    int i;
+
+    timer_free(core->autoneg_timer);
+
+    igb_intrmgr_pci_unint(core);
+
+    qemu_del_vm_change_state_handler(core->vmstate);
+
+    for (i = 0; i < IGB_NUM_QUEUES; i++) {
+        net_tx_pkt_reset(core->tx[i].tx_pkt);
+        net_tx_pkt_uninit(core->tx[i].tx_pkt);
+    }
+
+    net_rx_pkt_uninit(core->rx_pkt);
+}
+
+static const uint16_t
+igb_phy_reg_init[] = {
+    [MII_BMCR] = MII_BMCR_SPEED1000 |
+                 MII_BMCR_FD        |
+                 MII_BMCR_AUTOEN,
+
+    [MII_BMSR] = MII_BMSR_EXTCAP    |
+                 MII_BMSR_LINK_ST   |
+                 MII_BMSR_AUTONEG   |
+                 MII_BMSR_MFPS      |
+                 MII_BMSR_EXTSTAT   |
+                 MII_BMSR_10T_HD    |
+                 MII_BMSR_10T_FD    |
+                 MII_BMSR_100TX_HD  |
+                 MII_BMSR_100TX_FD,
+
+    [MII_PHYID1]            = IGP03E1000_E_PHY_ID >> 16,
+    [MII_PHYID2]            = (IGP03E1000_E_PHY_ID & 0xfff0) | 1,
+    [MII_ANAR]              = MII_ANAR_CSMACD | MII_ANAR_10 |
+                              MII_ANAR_10FD | MII_ANAR_TX |
+                              MII_ANAR_TXFD | MII_ANAR_PAUSE |
+                              MII_ANAR_PAUSE_ASYM,
+    [MII_ANLPAR]            = MII_ANLPAR_10 | MII_ANLPAR_10FD |
+                              MII_ANLPAR_TX | MII_ANLPAR_TXFD |
+                              MII_ANLPAR_T4 | MII_ANLPAR_PAUSE,
+    [MII_ANER]              = MII_ANER_NP | MII_ANER_NWAY,
+    [MII_ANNP]              = 0x1 | MII_ANNP_MP,
+    [MII_CTRL1000]          = MII_CTRL1000_HALF | MII_CTRL1000_FULL |
+                              MII_CTRL1000_PORT | MII_CTRL1000_MASTER,
+    [MII_STAT1000]          = MII_STAT1000_HALF | MII_STAT1000_FULL |
+                              MII_STAT1000_ROK | MII_STAT1000_LOK,
+    [MII_EXTSTAT]           = MII_EXTSTAT_1000T_HD | MII_EXTSTAT_1000T_FD,
+
+    [IGP01E1000_PHY_PORT_CONFIG] = BIT(5) | BIT(8),
+    [IGP01E1000_PHY_PORT_STATUS] = IGP01E1000_PSSR_SPEED_1000MBPS,
+    [IGP02E1000_PHY_POWER_MGMT]  = BIT(0) | BIT(3) | IGP02E1000_PM_D3_LPLU |
+                                   IGP01E1000_PSCFR_SMART_SPEED
+};
+
+static const uint32_t igb_mac_reg_init[] = {
+    [LEDCTL]        = 2 | (3 << 8) | BIT(15) | (6 << 16) | (7 << 24),
+    [EEMNGCTL]      = BIT(31),
+    [RXDCTL0]       = E1000_RXDCTL_QUEUE_ENABLE | (1 << 16),
+    [RXDCTL1]       = 1 << 16,
+    [RXDCTL2]       = 1 << 16,
+    [RXDCTL3]       = 1 << 16,
+    [RXDCTL4]       = 1 << 16,
+    [RXDCTL5]       = 1 << 16,
+    [RXDCTL6]       = 1 << 16,
+    [RXDCTL7]       = 1 << 16,
+    [RXDCTL8]       = 1 << 16,
+    [RXDCTL9]       = 1 << 16,
+    [RXDCTL10]      = 1 << 16,
+    [RXDCTL11]      = 1 << 16,
+    [RXDCTL12]      = 1 << 16,
+    [RXDCTL13]      = 1 << 16,
+    [RXDCTL14]      = 1 << 16,
+    [RXDCTL15]      = 1 << 16,
+    [TIPG]          = 0x08 | (0x04 << 10) | (0x06 << 20),
+    [CTRL]          = E1000_CTRL_FD | E1000_CTRL_LRST | E1000_CTRL_SPD_1000 |
+                      E1000_CTRL_ADVD3WUC,
+    [STATUS]        = E1000_STATUS_PHYRA | BIT(31),
+    [EECD]          = E1000_EECD_FWE_DIS | E1000_EECD_PRES |
+                      (2 << E1000_EECD_SIZE_EX_SHIFT),
+    [GCR]           = E1000_L0S_ADJUST |
+                      E1000_GCR_CMPL_TMOUT_RESEND |
+                      E1000_GCR_CAP_VER2 |
+                      E1000_L1_ENTRY_LATENCY_MSB |
+                      E1000_L1_ENTRY_LATENCY_LSB,
+    [RXCSUM]        = E1000_RXCSUM_IPOFLD | E1000_RXCSUM_TUOFLD,
+    [TXPBS]         = 0x28,
+    [RXPBS]         = 0x40,
+    [TCTL]          = E1000_TCTL_PSP | (0xF << E1000_CT_SHIFT) |
+                      (0x40 << E1000_COLD_SHIFT) | (0x1 << 26) | (0xA << 28),
+    [TCTL_EXT]      = 0x40 | (0x42 << 10),
+    [DTXCTL]        = E1000_DTXCTL_8023LL | E1000_DTXCTL_SPOOF_INT,
+    [VET]           = ETH_P_VLAN | (ETH_P_VLAN << 16),
+
+    [V2PMAILBOX0 ... V2PMAILBOX0 + IGB_MAX_VF_FUNCTIONS - 1] = E1000_V2PMAILBOX_RSTI,
+    [MBVFIMR]       = 0xFF,
+    [VFRE]          = 0xFF,
+    [VFTE]          = 0xFF,
+    [VMOLR0 ... VMOLR0 + 7] = 0x2600 | E1000_VMOLR_STRCRC,
+    [RPLOLR]        = E1000_RPLOLR_STRCRC,
+    [RLPML]         = 0x2600,
+    [TXCTL0]       = E1000_DCA_TXCTRL_DATA_RRO_EN |
+                     E1000_DCA_TXCTRL_TX_WB_RO_EN |
+                     E1000_DCA_TXCTRL_DESC_RRO_EN,
+    [TXCTL1]       = E1000_DCA_TXCTRL_DATA_RRO_EN |
+                     E1000_DCA_TXCTRL_TX_WB_RO_EN |
+                     E1000_DCA_TXCTRL_DESC_RRO_EN,
+    [TXCTL2]       = E1000_DCA_TXCTRL_DATA_RRO_EN |
+                     E1000_DCA_TXCTRL_TX_WB_RO_EN |
+                     E1000_DCA_TXCTRL_DESC_RRO_EN,
+    [TXCTL3]       = E1000_DCA_TXCTRL_DATA_RRO_EN |
+                     E1000_DCA_TXCTRL_TX_WB_RO_EN |
+                     E1000_DCA_TXCTRL_DESC_RRO_EN,
+    [TXCTL4]       = E1000_DCA_TXCTRL_DATA_RRO_EN |
+                     E1000_DCA_TXCTRL_TX_WB_RO_EN |
+                     E1000_DCA_TXCTRL_DESC_RRO_EN,
+    [TXCTL5]       = E1000_DCA_TXCTRL_DATA_RRO_EN |
+                     E1000_DCA_TXCTRL_TX_WB_RO_EN |
+                     E1000_DCA_TXCTRL_DESC_RRO_EN,
+    [TXCTL6]       = E1000_DCA_TXCTRL_DATA_RRO_EN |
+                     E1000_DCA_TXCTRL_TX_WB_RO_EN |
+                     E1000_DCA_TXCTRL_DESC_RRO_EN,
+    [TXCTL7]       = E1000_DCA_TXCTRL_DATA_RRO_EN |
+                     E1000_DCA_TXCTRL_TX_WB_RO_EN |
+                     E1000_DCA_TXCTRL_DESC_RRO_EN,
+    [TXCTL8]       = E1000_DCA_TXCTRL_DATA_RRO_EN |
+                     E1000_DCA_TXCTRL_TX_WB_RO_EN |
+                     E1000_DCA_TXCTRL_DESC_RRO_EN,
+    [TXCTL9]       = E1000_DCA_TXCTRL_DATA_RRO_EN |
+                     E1000_DCA_TXCTRL_TX_WB_RO_EN |
+                     E1000_DCA_TXCTRL_DESC_RRO_EN,
+    [TXCTL10]      = E1000_DCA_TXCTRL_DATA_RRO_EN |
+                     E1000_DCA_TXCTRL_TX_WB_RO_EN |
+                     E1000_DCA_TXCTRL_DESC_RRO_EN,
+    [TXCTL11]      = E1000_DCA_TXCTRL_DATA_RRO_EN |
+                     E1000_DCA_TXCTRL_TX_WB_RO_EN |
+                     E1000_DCA_TXCTRL_DESC_RRO_EN,
+    [TXCTL12]      = E1000_DCA_TXCTRL_DATA_RRO_EN |
+                     E1000_DCA_TXCTRL_TX_WB_RO_EN |
+                     E1000_DCA_TXCTRL_DESC_RRO_EN,
+    [TXCTL13]      = E1000_DCA_TXCTRL_DATA_RRO_EN |
+                     E1000_DCA_TXCTRL_TX_WB_RO_EN |
+                     E1000_DCA_TXCTRL_DESC_RRO_EN,
+    [TXCTL14]      = E1000_DCA_TXCTRL_DATA_RRO_EN |
+                     E1000_DCA_TXCTRL_TX_WB_RO_EN |
+                     E1000_DCA_TXCTRL_DESC_RRO_EN,
+    [TXCTL15]      = E1000_DCA_TXCTRL_DATA_RRO_EN |
+                     E1000_DCA_TXCTRL_TX_WB_RO_EN |
+                     E1000_DCA_TXCTRL_DESC_RRO_EN,
+};
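+
+/*
+ * Example of one default above, for orientation: [VET] resets to
+ * ETH_P_VLAN in both halves of the register, i.e. 0x8100 for the inner
+ * VLAN ethertype and (assuming the upper half is the 82576's extended
+ * VLAN ethertype field) 0x8100 for the outer one as well.
+ */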
+
+static void igb_reset(IGBCore *core, bool sw)
+{
+    struct igb_tx *tx;
+    int i;
+
+    timer_del(core->autoneg_timer);
+
+    igb_intrmgr_reset(core);
+
+    memset(core->phy, 0, sizeof core->phy);
+    memcpy(core->phy, igb_phy_reg_init, sizeof igb_phy_reg_init);
+
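+    /*
+     * A software reset (sw == true) preserves the packet buffer sizing
+     * registers and the programmed EITR moderation values; everything
+     * else returns to its igb_mac_reg_init default, or to zero.
+     */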
+    for (i = 0; i < E1000E_MAC_SIZE; i++) {
+        if (sw &&
+            (i == RXPBS || i == TXPBS ||
+             (i >= EITR0 && i < EITR0 + IGB_INTR_NUM))) {
+            continue;
+        }
+
+        core->mac[i] = i < ARRAY_SIZE(igb_mac_reg_init) ?
+                       igb_mac_reg_init[i] : 0;
+    }
+
+    if (qemu_get_queue(core->owner_nic)->link_down) {
+        igb_link_down(core);
+    }
+
+    e1000x_reset_mac_addr(core->owner_nic, core->mac, core->permanent_mac);
+
+    for (i = 0; i < ARRAY_SIZE(core->tx); i++) {
+        tx = &core->tx[i];
+        net_tx_pkt_reset(tx->tx_pkt);
+        tx->vlan = 0;
+        tx->mss = 0;
+        tx->tse = false;
+        tx->ixsm = false;
+        tx->txsm = false;
+        tx->first = true;
+        tx->skip_cp = false;
+    }
+}
+
+void
+igb_core_reset(IGBCore *core)
+{
+    igb_reset(core, false);
+}
+
+void igb_core_pre_save(IGBCore *core)
+{
+    int i;
+    NetClientState *nc = qemu_get_queue(core->owner_nic);
+
+    /*
+     * If link is down and auto-negotiation is supported and ongoing,
+     * complete auto-negotiation immediately. This allows us to look
+     * at MII_BMSR_AN_COMP to infer link status on load.
+     */
+    if (nc->link_down && igb_have_autoneg(core)) {
+        core->phy[MII_BMSR] |= MII_BMSR_AN_COMP;
+        igb_update_flowctl_status(core);
+    }
+
+    for (i = 0; i < ARRAY_SIZE(core->tx); i++) {
+        if (net_tx_pkt_has_fragments(core->tx[i].tx_pkt)) {
+            core->tx[i].skip_cp = true;
+        }
+    }
+}
+
+int
+igb_core_post_load(IGBCore *core)
+{
+    NetClientState *nc = qemu_get_queue(core->owner_nic);
+
+    /*
+     * nc.link_down can't be migrated, so infer link_down according
+     * to link status bit in core.mac[STATUS].
+     */
+    nc->link_down = (core->mac[STATUS] & E1000_STATUS_LU) == 0;
+
+    return 0;
+}
diff --git a/hw/net/igb_core.h b/hw/net/igb_core.h
new file mode 100644
index 0000000000..814c1e264b
--- /dev/null
+++ b/hw/net/igb_core.h
@@ -0,0 +1,146 @@
+/*
+ * Core code for QEMU igb emulation
+ *
+ * Datasheet:
+ * https://www.intel.com/content/dam/www/public/us/en/documents/datasheets/82576eg-gbe-datasheet.pdf
+ *
+ * Copyright (c) 2020-2023 Red Hat, Inc.
+ * Copyright (c) 2015 Ravello Systems LTD (http://ravellosystems.com)
+ * Developed by Daynix Computing LTD (http://www.daynix.com)
+ *
+ * Authors:
+ * Akihiko Odaki <akihiko.odaki@daynix.com>
+ * Gal Hammer <gal.hammer@sap.com>
+ * Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
+ * Dmitry Fleytman <dmitry@daynix.com>
+ * Leonid Bloch <leonid@daynix.com>
+ * Yan Vugenfirer <yan@daynix.com>
+ *
+ * Based on work done by:
+ * Nir Peleg, Tutis Systems Ltd. for Qumranet Inc.
+ * Copyright (c) 2008 Qumranet
+ * Based on work done by:
+ * Copyright (c) 2007 Dan Aloni
+ * Copyright (c) 2004 Antony T Curtis
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef HW_NET_IGB_CORE_H
+#define HW_NET_IGB_CORE_H
+
+#define E1000E_MAC_SIZE         (0x8000)
+#define IGB_EEPROM_SIZE         (1024)
+
+#define IGB_INTR_NUM            (25)
+#define IGB_MSIX_VEC_NUM        (10)
+#define IGBVF_MSIX_VEC_NUM      (3)
+#define IGB_NUM_QUEUES          (16)
+
+typedef struct IGBCore IGBCore;
+
+enum { PHY_R = BIT(0),
+       PHY_W = BIT(1),
+       PHY_RW = PHY_R | PHY_W };
+
+typedef struct IGBIntrDelayTimer_st {
+    QEMUTimer *timer;
+    bool running;
+    uint32_t delay_reg;
+    uint32_t delay_resolution_ns;
+    IGBCore *core;
+} IGBIntrDelayTimer;
+
+struct IGBCore {
+    uint32_t mac[E1000E_MAC_SIZE];
+    uint16_t phy[MAX_PHY_REG_ADDRESS + 1];
+    uint16_t eeprom[IGB_EEPROM_SIZE];
+
+    uint8_t rx_desc_len;
+
+    QEMUTimer *autoneg_timer;
+
+    struct igb_tx {
+        uint16_t vlan;  /* VLAN Tag */
+        uint16_t mss;   /* Maximum Segment Size */
+        bool tse;       /* TCP/UDP Segmentation Enable */
+        bool ixsm;      /* Insert IP Checksum */
+        bool txsm;      /* Insert TCP/UDP Checksum */
+
+        bool first;
+        bool skip_cp;
+
+        struct NetTxPkt *tx_pkt;
+    } tx[IGB_NUM_QUEUES];
+
+    struct NetRxPkt *rx_pkt;
+
+    bool has_vnet;
+    int max_queue_num;
+
+    IGBIntrDelayTimer eitr[IGB_INTR_NUM];
+
+    VMChangeStateEntry *vmstate;
+
+    uint32_t eitr_guest_value[IGB_INTR_NUM];
+
+    uint8_t permanent_mac[ETH_ALEN];
+
+    NICState *owner_nic;
+    PCIDevice *owner;
+    void (*owner_start_recv)(PCIDevice *d);
+
+    int64_t timadj;
+};
+
+void
+igb_core_write(IGBCore *core, hwaddr addr, uint64_t val, unsigned size);
+
+uint64_t
+igb_core_read(IGBCore *core, hwaddr addr, unsigned size);
+
+void
+igb_core_pci_realize(IGBCore        *core,
+                     const uint16_t *eeprom_templ,
+                     uint32_t        eeprom_size,
+                     const uint8_t  *macaddr);
+
+void
+igb_core_reset(IGBCore *core);
+
+void
+igb_core_pre_save(IGBCore *core);
+
+int
+igb_core_post_load(IGBCore *core);
+
+void
+igb_core_set_link_status(IGBCore *core);
+
+void
+igb_core_pci_uninit(IGBCore *core);
+
+bool
+igb_can_receive(IGBCore *core);
+
+ssize_t
+igb_receive(IGBCore *core, const uint8_t *buf, size_t size);
+
+ssize_t
+igb_receive_iov(IGBCore *core, const struct iovec *iov, int iovcnt);
+
+void
+igb_start_recv(IGBCore *core);
+
+#endif
diff --git a/hw/net/igb_regs.h b/hw/net/igb_regs.h
new file mode 100644
index 0000000000..00934d4f20
--- /dev/null
+++ b/hw/net/igb_regs.h
@@ -0,0 +1,648 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This is copied + edited from kernel header files in
+ * drivers/net/ethernet/intel/igb
+ */
+
+#ifndef HW_IGB_REGS_H_
+#define HW_IGB_REGS_H_
+
+#include "e1000x_regs.h"
+
+/* from igb/e1000_hw.h */
+
+#define E1000_DEV_ID_82576                 0x10C9
+#define E1000_DEV_ID_82576_FIBER           0x10E6
+#define E1000_DEV_ID_82576_SERDES          0x10E7
+#define E1000_DEV_ID_82576_QUAD_COPPER             0x10E8
+#define E1000_DEV_ID_82576_QUAD_COPPER_ET2 0x1526
+#define E1000_DEV_ID_82576_NS                      0x150A
+#define E1000_DEV_ID_82576_NS_SERDES               0x1518
+#define E1000_DEV_ID_82576_SERDES_QUAD             0x150D
+
+/* Context Descriptor */
+struct e1000_adv_tx_context_desc {
+    uint32_t vlan_macip_lens;
+    uint32_t seqnum_seed;
+    uint32_t type_tucmd_mlhl;
+    uint32_t mss_l4len_idx;
+};
+
+/* Advanced Transmit Descriptor */
+union e1000_adv_tx_desc {
+    struct {
+        uint64_t buffer_addr;     /* Address of descriptor's data buffer */
+        uint32_t cmd_type_len;
+        uint32_t olinfo_status;
+    } read;
+    struct {
+        uint64_t rsvd;            /* Reserved */
+        uint32_t nxtseq_seed;
+        uint32_t status;
+    } wb;
+};
+
+#define E1000_ADVTXD_DTYP_CTXT  0x00200000 /* Advanced Context Descriptor */
+#define E1000_ADVTXD_DTYP_DATA  0x00300000 /* Advanced Data Descriptor */
+#define E1000_ADVTXD_DCMD_DEXT  0x20000000 /* Descriptor Extension (1=Adv) */
+#define E1000_ADVTXD_DCMD_TSE   0x80000000 /* TCP/UDP Segmentation Enable */
+
+#define E1000_ADVTXD_POTS_IXSM  0x00000100 /* Insert IP Checksum */
+#define E1000_ADVTXD_POTS_TXSM  0x00000200 /* Insert TCP/UDP Checksum */
+
+#define E1000_TXD_POPTS_IXSM 0x00000001 /* Insert IP checksum */
+#define E1000_TXD_POPTS_TXSM 0x00000002 /* Insert TCP/UDP checksum */
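+
+/*
+ * Sketch of how a guest driver is expected to use the layouts above
+ * (an assumption based on the 82576 advanced descriptor model): a TSO
+ * send first posts an e1000_adv_tx_context_desc with DTYP_CTXT, carrying
+ * the header lengths and MSS, then one or more data descriptors with
+ * DTYP_DATA | DCMD_DEXT and DCMD_TSE set in cmd_type_len, and the
+ * POTS_IXSM/POTS_TXSM bits in olinfo_status requesting checksum insertion.
+ */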
+
+/* Receive Descriptor - Advanced */
+union e1000_adv_rx_desc {
+    struct {
+        uint64_t pkt_addr;                /* Packet Buffer Address */
+        uint64_t hdr_addr;                /* Header Buffer Address */
+    } read;
+    struct {
+        struct {
+            struct {
+                uint16_t pkt_info;        /* RSS Type, Packet Type */
+                uint16_t hdr_info;        /* Split Head, Buffer Length */
+            } lo_dword;
+            union {
+                uint32_t rss;             /* RSS Hash */
+                struct {
+                        uint16_t ip_id;   /* IP Id */
+                        uint16_t csum;    /* Packet Checksum */
+                } csum_ip;
+            } hi_dword;
+        } lower;
+        struct {
+            uint32_t status_error;        /* Ext Status/Error */
+            uint16_t length;              /* Packet Length */
+            uint16_t vlan;                /* VLAN tag */
+        } upper;
+    } wb;  /* writeback */
+};
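+
+/*
+ * Usage note (assumption, mirroring the tx side): software posts the
+ * "read" format with buffer addresses in pkt_addr/hdr_addr, and on
+ * completion the device overwrites the same 16 bytes with the "wb"
+ * writeback format, reporting length, VLAN tag and status_error bits.
+ */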
+
+/* from igb/e1000_phy.h */
+
+/* IGP01E1000 Specific Registers */
+#define IGP01E1000_PHY_PORT_CONFIG        0x10 /* Port Config */
+#define IGP01E1000_PHY_PORT_STATUS        0x11 /* Status */
+#define IGP01E1000_PHY_PORT_CTRL          0x12 /* Control */
+#define IGP01E1000_PHY_LINK_HEALTH        0x13 /* PHY Link Health */
+#define IGP02E1000_PHY_POWER_MGMT         0x19 /* Power Management */
+#define IGP01E1000_PHY_PAGE_SELECT        0x1F /* Page Select */
+#define IGP01E1000_PHY_PCS_INIT_REG       0x00B4
+#define IGP01E1000_PHY_POLARITY_MASK      0x0078
+#define IGP01E1000_PSCR_AUTO_MDIX         0x1000
+#define IGP01E1000_PSCR_FORCE_MDI_MDIX    0x2000 /* 0=MDI, 1=MDIX */
+#define IGP01E1000_PSCFR_SMART_SPEED      0x0080
+
+/* Enable flexible speed on link-up */
+#define IGP02E1000_PM_D0_LPLU             0x0002 /* For D0a states */
+#define IGP02E1000_PM_D3_LPLU             0x0004 /* For all other states */
+#define IGP01E1000_PLHR_SS_DOWNGRADE      0x8000
+#define IGP01E1000_PSSR_POLARITY_REVERSED 0x0002
+#define IGP01E1000_PSSR_MDIX              0x0800
+#define IGP01E1000_PSSR_SPEED_MASK        0xC000
+#define IGP01E1000_PSSR_SPEED_1000MBPS    0xC000
+#define IGP02E1000_PHY_CHANNEL_NUM        4
+#define IGP02E1000_PHY_AGC_A              0x11B1
+#define IGP02E1000_PHY_AGC_B              0x12B1
+#define IGP02E1000_PHY_AGC_C              0x14B1
+#define IGP02E1000_PHY_AGC_D              0x18B1
+#define IGP02E1000_AGC_LENGTH_SHIFT       9   /* Coarse - 15:13, Fine - 12:9 */
+#define IGP02E1000_AGC_LENGTH_MASK        0x7F
+#define IGP02E1000_AGC_RANGE              15
+
+/* from igb/igb.h */
+
+#define E1000_PCS_CFG_IGN_SD     1
+
+/* Interrupt defines */
+#define IGB_START_ITR            648 /* ~6000 ints/sec */
+#define IGB_4K_ITR               980
+#define IGB_20K_ITR              196
+#define IGB_70K_ITR              56
+
+/* TX/RX descriptor defines */
+#define IGB_DEFAULT_TXD          256
+#define IGB_DEFAULT_TX_WORK      128
+#define IGB_MIN_TXD              80
+#define IGB_MAX_TXD              4096
+
+#define IGB_DEFAULT_RXD          256
+#define IGB_MIN_RXD              80
+#define IGB_MAX_RXD              4096
+
+#define IGB_DEFAULT_ITR           3 /* dynamic */
+#define IGB_MAX_ITR_USECS         10000
+#define IGB_MIN_ITR_USECS         10
+#define NON_Q_VECTORS             1
+#define MAX_Q_VECTORS             8
+#define MAX_MSIX_ENTRIES          10
+
+/* Transmit and receive queues */
+#define IGB_MAX_RX_QUEUES          8
+#define IGB_MAX_RX_QUEUES_82575    4
+#define IGB_MAX_RX_QUEUES_I211     2
+#define IGB_MAX_TX_QUEUES          8
+#define IGB_MAX_VF_MC_ENTRIES      30
+#define IGB_MAX_VF_FUNCTIONS       8
+#define IGB_MAX_VFTA_ENTRIES       128
+#define IGB_82576_VF_DEV_ID        0x10CA
+#define IGB_I350_VF_DEV_ID         0x1520
+
+/* from igb/e1000_82575.h */
+
+#define E1000_MRQC_ENABLE_RSS_MQ            0x00000002
+#define E1000_MRQC_ENABLE_VMDQ              0x00000003
+#define E1000_MRQC_RSS_FIELD_IPV4_UDP       0x00400000
+#define E1000_MRQC_ENABLE_VMDQ_RSS_MQ       0x00000005
+#define E1000_MRQC_RSS_FIELD_IPV6_UDP       0x00800000
+#define E1000_MRQC_RSS_FIELD_IPV6_UDP_EX    0x01000000
+
+/* Additional Receive Descriptor Control definitions */
+#define E1000_RXDCTL_QUEUE_ENABLE  0x02000000 /* Enable specific Rx Queue */
+
+/* Direct Cache Access (DCA) definitions */
+#define E1000_DCA_CTRL_DCA_MODE_DISABLE 0x01 /* DCA Disable */
+#define E1000_DCA_CTRL_DCA_MODE_CB2     0x02 /* DCA Mode CB2 */
+
+#define E1000_DCA_RXCTRL_CPUID_MASK 0x0000001F /* Rx CPUID Mask */
+#define E1000_DCA_RXCTRL_DESC_DCA_EN BIT(5) /* DCA Rx Desc enable */
+#define E1000_DCA_RXCTRL_HEAD_DCA_EN BIT(6) /* DCA Rx Desc header enable */
+#define E1000_DCA_RXCTRL_DATA_DCA_EN BIT(7) /* DCA Rx Desc payload enable */
+#define E1000_DCA_RXCTRL_DESC_RRO_EN BIT(9) /* DCA Rx rd Desc Relax Order */
+
+#define E1000_DCA_TXCTRL_CPUID_MASK 0x0000001F /* Tx CPUID Mask */
+#define E1000_DCA_TXCTRL_DESC_DCA_EN BIT(5) /* DCA Tx Desc enable */
+#define E1000_DCA_TXCTRL_DESC_RRO_EN BIT(9) /* Tx rd Desc Relax Order */
+#define E1000_DCA_TXCTRL_TX_WB_RO_EN BIT(11) /* Tx Desc writeback RO bit */
+#define E1000_DCA_TXCTRL_DATA_RRO_EN BIT(13) /* Tx rd data Relax Order */
+
+/* Additional DCA related definitions, note change in position of CPUID */
+#define E1000_DCA_TXCTRL_CPUID_MASK_82576 0xFF000000 /* Tx CPUID Mask */
+#define E1000_DCA_RXCTRL_CPUID_MASK_82576 0xFF000000 /* Rx CPUID Mask */
+#define E1000_DCA_TXCTRL_CPUID_SHIFT 24 /* Tx CPUID now in the last byte */
+#define E1000_DCA_RXCTRL_CPUID_SHIFT 24 /* Rx CPUID now in the last byte */
+
+#define E1000_DTXSWC_MAC_SPOOF_MASK   0x000000FF /* Per VF MAC spoof control */
+#define E1000_DTXSWC_VLAN_SPOOF_MASK  0x0000FF00 /* Per VF VLAN spoof control */
+#define E1000_DTXSWC_LLE_MASK         0x00FF0000 /* Per VF Local LB enables */
+#define E1000_DTXSWC_VLAN_SPOOF_SHIFT 8
+#define E1000_DTXSWC_VMDQ_LOOPBACK_EN BIT(31)  /* global VF LB enable */
+
+/* Easy defines for setting default pool, would normally be left a zero */
+#define E1000_VT_CTL_DEFAULT_POOL_SHIFT 7
+#define E1000_VT_CTL_DEFAULT_POOL_MASK  (0x7 << E1000_VT_CTL_DEFAULT_POOL_SHIFT)
+
+/* Other useful VMD_CTL register defines */
+#define E1000_VT_CTL_IGNORE_MAC         BIT(28)
+#define E1000_VT_CTL_DISABLE_DEF_POOL   BIT(29)
+#define E1000_VT_CTL_VM_REPL_EN         BIT(30)
+
+/* Per VM Offload register setup */
+#define E1000_VMOLR_RLPML_MASK 0x00003FFF /* Long Packet Maximum Length mask */
+#define E1000_VMOLR_LPE        0x00010000 /* Accept Long packet */
+#define E1000_VMOLR_RSSE       0x00020000 /* Enable RSS */
+#define E1000_VMOLR_AUPE       0x01000000 /* Accept untagged packets */
+#define E1000_VMOLR_ROMPE      0x02000000 /* Accept overflow multicast */
+#define E1000_VMOLR_ROPE       0x04000000 /* Accept overflow unicast */
+#define E1000_VMOLR_BAM        0x08000000 /* Accept Broadcast packets */
+#define E1000_VMOLR_MPME       0x10000000 /* Multicast promiscuous mode */
+#define E1000_VMOLR_STRVLAN    0x40000000 /* Vlan stripping enable */
+#define E1000_VMOLR_STRCRC     0x80000000 /* CRC stripping enable */
+
+#define E1000_DVMOLR_HIDEVLAN  0x20000000 /* Hide vlan enable */
+#define E1000_DVMOLR_STRVLAN   0x40000000 /* Vlan stripping enable */
+#define E1000_DVMOLR_STRCRC    0x80000000 /* CRC stripping enable */
+
+#define E1000_VLVF_ARRAY_SIZE     32
+#define E1000_VLVF_VLANID_MASK    0x00000FFF
+#define E1000_VLVF_POOLSEL_SHIFT  12
+#define E1000_VLVF_POOLSEL_MASK   (0xFF << E1000_VLVF_POOLSEL_SHIFT)
+#define E1000_VLVF_LVLAN          0x00100000
+#define E1000_VLVF_VLANID_ENABLE  0x80000000
+
+#define E1000_VMVIR_VLANA_DEFAULT      0x40000000 /* Always use default VLAN */
+#define E1000_VMVIR_VLANA_NEVER        0x80000000 /* Never insert VLAN tag */
+
+#define E1000_IOVCTL 0x05BBC
+#define E1000_IOVCTL_REUSE_VFQ 0x00000001
+
+#define E1000_RPLOLR_STRVLAN   0x40000000
+#define E1000_RPLOLR_STRCRC    0x80000000
+
+#define E1000_DTXCTL_8023LL     0x0004
+#define E1000_DTXCTL_VLAN_ADDED 0x0008
+#define E1000_DTXCTL_OOS_ENABLE 0x0010
+#define E1000_DTXCTL_MDP_EN     0x0020
+#define E1000_DTXCTL_SPOOF_INT  0x0040
+
+/* from igb/e1000_defines.h */
+
+#define E1000_IVAR_VALID     0x80
+#define E1000_GPIE_NSICR     0x00000001
+#define E1000_GPIE_MSIX_MODE 0x00000010
+#define E1000_GPIE_EIAME     0x40000000
+#define E1000_GPIE_PBA       0x80000000
+
+/* Transmit Control */
+#define E1000_TCTL_EN     0x00000002    /* enable tx */
+#define E1000_TCTL_PSP    0x00000008    /* pad short packets */
+#define E1000_TCTL_CT     0x00000ff0    /* collision threshold */
+#define E1000_TCTL_COLD   0x003ff000    /* collision distance */
+#define E1000_TCTL_RTLC   0x01000000    /* Re-transmit on late collision */
+
+/* Collision related configuration parameters */
+#define E1000_COLLISION_THRESHOLD       15
+#define E1000_CT_SHIFT                  4
+#define E1000_COLLISION_DISTANCE        63
+#define E1000_COLD_SHIFT                12
+
+#define E1000_RAH_POOL_MASK 0x03FC0000
+#define E1000_RAH_POOL_1 0x00040000
+
+#define E1000_ICR_VMMB         0x00000100 /* VM MB event */
+#define E1000_ICR_TS           0x00080000 /* Time Sync Interrupt */
+#define E1000_ICR_DRSTA        0x40000000 /* Device Reset Asserted */
+/* If this bit asserted, the driver should claim the interrupt */
+#define E1000_ICR_INT_ASSERTED 0x80000000
+/* LAN connected device generates an interrupt */
+#define E1000_ICR_DOUTSYNC     0x10000000 /* NIC DMA out of sync */
+
+/* Extended Interrupt Cause Read */
+#define E1000_EICR_RX_QUEUE0    0x00000001 /* Rx Queue 0 Interrupt */
+#define E1000_EICR_RX_QUEUE1    0x00000002 /* Rx Queue 1 Interrupt */
+#define E1000_EICR_RX_QUEUE2    0x00000004 /* Rx Queue 2 Interrupt */
+#define E1000_EICR_RX_QUEUE3    0x00000008 /* Rx Queue 3 Interrupt */
+#define E1000_EICR_TX_QUEUE0    0x00000100 /* Tx Queue 0 Interrupt */
+#define E1000_EICR_TX_QUEUE1    0x00000200 /* Tx Queue 1 Interrupt */
+#define E1000_EICR_TX_QUEUE2    0x00000400 /* Tx Queue 2 Interrupt */
+#define E1000_EICR_TX_QUEUE3    0x00000800 /* Tx Queue 3 Interrupt */
+#define E1000_EICR_OTHER        0x80000000 /* Interrupt Cause Active */
+
+/* Extended Interrupt Cause Set */
+/* E1000_EITR_CNT_IGNR is only for 82576 and newer */
+#define E1000_EITR_CNT_IGNR     0x80000000 /* Don't reset counters on write */
+
+/* PCI Express Control */
+#define E1000_GCR_CMPL_TMOUT_MASK       0x0000F000
+#define E1000_GCR_CMPL_TMOUT_10ms       0x00001000
+#define E1000_GCR_CMPL_TMOUT_RESEND     0x00010000
+#define E1000_GCR_CAP_VER2              0x00040000
+
+#define PHY_REVISION_MASK      0xFFFFFFF0
+#define MAX_PHY_REG_ADDRESS    0x1F  /* 5 bit address bus (0-0x1F) */
+#define MAX_PHY_MULTI_PAGE_REG 0xF
+
+#define IGP03E1000_E_PHY_ID 0x02A80390
+
+/* from igb/e1000_mbox.h */
+
+#define E1000_P2VMAILBOX_STS  0x00000001 /* Initiate message send to VF */
+#define E1000_P2VMAILBOX_ACK  0x00000002 /* Ack message recv'd from VF */
+#define E1000_P2VMAILBOX_VFU  0x00000004 /* VF owns the mailbox buffer */
+#define E1000_P2VMAILBOX_PFU  0x00000008 /* PF owns the mailbox buffer */
+#define E1000_P2VMAILBOX_RVFU 0x00000010 /* Reset VFU - used when VF stuck */
+
+#define E1000_MBVFICR_VFREQ_MASK 0x000000FF /* bits for VF messages */
+#define E1000_MBVFICR_VFREQ_VF1  0x00000001 /* bit for VF 1 message */
+#define E1000_MBVFICR_VFACK_MASK 0x00FF0000 /* bits for VF acks */
+#define E1000_MBVFICR_VFACK_VF1  0x00010000 /* bit for VF 1 ack */
+
+#define E1000_V2PMAILBOX_SIZE 16 /* 16 32-bit words - 64 bytes */
+
+/*
+ * If it's an E1000_VF_* msg then it originates in the VF and is sent to the
+ * PF.  The reverse is true if it is E1000_PF_*.
+ * Message ACKs are the value or'd with 0xF0000000
+ */
+/* Messages below or'd with this are the ACK */
+#define E1000_VT_MSGTYPE_ACK 0x80000000
+/* Messages below or'd with this are the NACK */
+#define E1000_VT_MSGTYPE_NACK 0x40000000
+/* Indicates that VF is still clear to send requests */
+#define E1000_VT_MSGTYPE_CTS 0x20000000
+#define E1000_VT_MSGINFO_SHIFT 16
+/* bits 23:16 are used for extra info for certain messages */
+#define E1000_VT_MSGINFO_MASK (0xFF << E1000_VT_MSGINFO_SHIFT)
+
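+/*
+ * Example (derived from the defines below): a PF acknowledging a VF's
+ * E1000_VF_SET_MAC_ADDR request replies with
+ * (E1000_VF_SET_MAC_ADDR | E1000_VT_MSGTYPE_ACK).
+ */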
+#define E1000_VF_RESET                 0x01 /* VF requests reset */
+#define E1000_VF_SET_MAC_ADDR          0x02 /* VF requests to set MAC addr */
+/* VF requests to clear all unicast MAC filters */
+#define E1000_VF_MAC_FILTER_CLR        (0x01 << E1000_VT_MSGINFO_SHIFT)
+/* VF requests to add unicast MAC filter */
+#define E1000_VF_MAC_FILTER_ADD        (0x02 << E1000_VT_MSGINFO_SHIFT)
+#define E1000_VF_SET_MULTICAST         0x03 /* VF requests to set MC addr */
+#define E1000_VF_SET_VLAN              0x04 /* VF requests to set VLAN */
+#define E1000_VF_SET_LPE               0x05 /* VF requests to set VMOLR.LPE */
+#define E1000_VF_SET_PROMISC           0x06 /* VF requests to clear VMOLR.ROPE/MPME */
+#define E1000_VF_SET_PROMISC_MULTICAST (0x02 << E1000_VT_MSGINFO_SHIFT)
+
+#define E1000_PF_CONTROL_MSG 0x0100 /* PF control message */
+
+/* from igb/e1000_regs.h */
+
+#define E1000_EICR      0x01580  /* Ext. Interrupt Cause Read - R/clr */
+#define E1000_EITR(_n)  (0x01680 + (0x4 * (_n)))
+#define E1000_EICS      0x01520  /* Ext. Interrupt Cause Set - WO */
+#define E1000_EIMS      0x01524  /* Ext. Interrupt Mask Set/Read - RW */
+#define E1000_EIMC      0x01528  /* Ext. Interrupt Mask Clear - WO */
+#define E1000_EIAC      0x0152C  /* Ext. Interrupt Auto Clear - RW */
+#define E1000_EIAM      0x01530  /* Ext. Interrupt Ack Auto Clear Mask - RW */
+#define E1000_GPIE      0x01514  /* General Purpose Interrupt Enable; RW */
+#define E1000_IVAR0     0x01700  /* Interrupt Vector Allocation Register - RW */
+#define E1000_IVAR_MISC 0x01740  /* Interrupt Vector Allocation Register (last) - RW */
+#define E1000_FRTIMER   0x01048  /* Free Running Timer - RW */
+#define E1000_FCRTV     0x02460  /* Flow Control Refresh Timer Value - RW */
+
+#define E1000_RQDPC(_n) (0x0C030 + ((_n) * 0x40))
+
+#define E1000_RXPBS 0x02404  /* Rx Packet Buffer Size - RW */
+#define E1000_TXPBS 0x03404  /* Tx Packet Buffer Size - RW */
+
+#define E1000_DTXCTL 0x03590  /* DMA TX Control - RW */
+
+#define E1000_HTCBDPC     0x04124  /* Host TX Circuit Breaker Dropped Count */
+#define E1000_RLPML       0x05004  /* RX Long Packet Max Length */
+#define E1000_RA2         0x054E0  /* 2nd half of Rx address array - RW Array */
+#define E1000_PSRTYPE(_i) (0x05480 + ((_i) * 4))
+#define E1000_VT_CTL   0x0581C  /* VMDq Control - RW */
+
+/* VT Registers */
+#define E1000_MBVFICR   0x00C80 /* Mailbox VF Cause - RWC */
+#define E1000_MBVFIMR   0x00C84 /* Mailbox VF int Mask - RW */
+#define E1000_VFLRE     0x00C88 /* VF Register Events - RWC */
+#define E1000_VFRE      0x00C8C /* VF Receive Enables */
+#define E1000_VFTE      0x00C90 /* VF Transmit Enables */
+#define E1000_QDE       0x02408 /* Queue Drop Enable - RW */
+#define E1000_DTXSWC    0x03500 /* DMA Tx Switch Control - RW */
+#define E1000_WVBR      0x03554 /* VM Wrong Behavior - RWS */
+#define E1000_RPLOLR    0x05AF0 /* Replication Offload - RW */
+#define E1000_UTA       0x0A000 /* Unicast Table Array - RW */
+#define E1000_IOVTCL    0x05BBC /* IOV Control Register */
+#define E1000_TXSWC     0x05ACC /* Tx Switch Control */
+#define E1000_LVMMC     0x03548 /* Last VM Misbehavior cause */
+/* These act per VF so an array-friendly macro is used */
+#define E1000_P2VMAILBOX(_n)   (0x00C00 + (4 * (_n)))
+#define E1000_VMBMEM(_n)       (0x00800 + (64 * (_n)))
+#define E1000_VMOLR(_n)        (0x05AD0 + (4 * (_n)))
+#define E1000_DVMOLR(_n)       (0x0C038 + (64 * (_n)))
+#define E1000_VLVF(_n)         (0x05D00 + (4 * (_n))) /* VLAN VM Filter */
+#define E1000_VMVIR(_n)        (0x03700 + (4 * (_n)))
+
+/* from igbvf/defines.h */
+
+/* SRRCTL bit definitions */
+#define E1000_SRRCTL_BSIZEPKT_SHIFT            10 /* Shift _right_ */
+#define E1000_SRRCTL_BSIZEHDRSIZE_MASK         0x00000F00
+#define E1000_SRRCTL_BSIZEHDRSIZE_SHIFT        2  /* Shift _left_ */
+#define E1000_SRRCTL_DESCTYPE_ADV_ONEBUF       0x02000000
+#define E1000_SRRCTL_DESCTYPE_HDR_SPLIT_ALWAYS 0x0A000000
+#define E1000_SRRCTL_DESCTYPE_MASK             0x0E000000
+#define E1000_SRRCTL_DROP_EN                   0x80000000
+
+#define E1000_SRRCTL_BSIZEPKT_MASK             0x0000007F
+#define E1000_SRRCTL_BSIZEHDR_MASK             0x00003F00
+
+/* from igbvf/mbox.h */
+
+#define E1000_V2PMAILBOX_REQ      0x00000001 /* Request for PF Ready bit */
+#define E1000_V2PMAILBOX_ACK      0x00000002 /* Ack PF message received */
+#define E1000_V2PMAILBOX_VFU      0x00000004 /* VF owns the mailbox buffer */
+#define E1000_V2PMAILBOX_PFU      0x00000008 /* PF owns the mailbox buffer */
+#define E1000_V2PMAILBOX_PFSTS    0x00000010 /* PF wrote a message in the MB */
+#define E1000_V2PMAILBOX_PFACK    0x00000020 /* PF ack the previous VF msg */
+#define E1000_V2PMAILBOX_RSTI     0x00000040 /* PF has reset indication */
+#define E1000_V2PMAILBOX_RSTD     0x00000080 /* PF has indicated reset done */
+#define E1000_V2PMAILBOX_R2C_BITS 0x000000B0 /* All read to clear bits */
+
+#define E1000_VFMAILBOX_SIZE      16 /* 16 32-bit words - 64 bytes */
+
+/*
+ * If it's an E1000_VF_* msg then it originates in the VF and is sent to the
+ * PF.  The reverse is true if it is E1000_PF_*.
+ * Message ACKs are the value or'd with 0xF0000000
+ */
+/* Messages below or'd with this are the ACK */
+#define E1000_VT_MSGTYPE_ACK      0x80000000
+/* Messages below or'd with this are the NACK */
+#define E1000_VT_MSGTYPE_NACK     0x40000000
+/* Indicates that VF is still clear to send requests */
+#define E1000_VT_MSGTYPE_CTS      0x20000000
+
+/* We have a total wait time of 1s for vf mailbox posted messages */
+#define E1000_VF_MBX_INIT_TIMEOUT 2000 /* retry count for mbx timeout */
+#define E1000_VF_MBX_INIT_DELAY   500  /* usec delay between retries */
+
+#define E1000_VT_MSGINFO_SHIFT    16
+/* bits 23:16 are used for extra info for certain messages */
+#define E1000_VT_MSGINFO_MASK     (0xFF << E1000_VT_MSGINFO_SHIFT)
+
+#define E1000_VF_RESET            0x01 /* VF requests reset */
+#define E1000_VF_SET_MAC_ADDR     0x02 /* VF requests PF to set MAC addr */
+/* VF requests PF to clear all unicast MAC filters */
+#define E1000_VF_MAC_FILTER_CLR   (0x01 << E1000_VT_MSGINFO_SHIFT)
+/* VF requests PF to add unicast MAC filter */
+#define E1000_VF_MAC_FILTER_ADD   (0x02 << E1000_VT_MSGINFO_SHIFT)
+#define E1000_VF_SET_MULTICAST    0x03 /* VF requests PF to set MC addr */
+#define E1000_VF_SET_VLAN         0x04 /* VF requests PF to set VLAN */
+#define E1000_VF_SET_LPE          0x05 /* VF requests PF to set VMOLR.LPE */
+
+#define E1000_PF_CONTROL_MSG      0x0100 /* PF control message */
+
+/* from igbvf/regs.h */
+
+/* Statistics registers */
+#define E1000_VFGPRC   0x00F10
+#define E1000_VFGORC   0x00F18
+#define E1000_VFMPRC   0x00F3C
+#define E1000_VFGPTC   0x00F14
+#define E1000_VFGOTC   0x00F34
+#define E1000_VFGOTLBC 0x00F50
+#define E1000_VFGPTLBC 0x00F44
+#define E1000_VFGORLBC 0x00F48
+#define E1000_VFGPRLBC 0x00F40
+
+/* These act per VF so an array-friendly macro is used */
+#define E1000_V2PMAILBOX(_n) (0x00C40 + (4 * (_n)))
+#define E1000_VMBMEM(_n)     (0x00800 + (64 * (_n)))
+
+/* from igbvf/vf.h */
+
+#define E1000_DEV_ID_82576_VF 0x10CA
+
+/* new */
+
+/* Receive Registers */
+
+/* RX Descriptor Base Low; RW */
+#define E1000_RDBAL(_n)    (0x0C000 + (0x40  * (_n)))
+#define E1000_RDBAL_A(_n)  (0x02800 + (0x100 * (_n)))
+
+/* RX Descriptor Base High; RW */
+#define E1000_RDBAH(_n)    (0x0C004 + (0x40  * (_n)))
+#define E1000_RDBAH_A(_n)  (0x02804 + (0x100 * (_n)))
+
+/* RX Descriptor Ring Length; RW */
+#define E1000_RDLEN(_n)    (0x0C008 + (0x40  * (_n)))
+#define E1000_RDLEN_A(_n)  (0x02808 + (0x100 * (_n)))
+
+/* Split and Replication Receive Control; RW */
+#define E1000_SRRCTL(_n)   (0x0C00C + (0x40  * (_n)))
+#define E1000_SRRCTL_A(_n) (0x0280C + (0x100 * (_n)))
+
+/* RX Descriptor Head; RW */
+#define E1000_RDH(_n)      (0x0C010 + (0x40  * (_n)))
+#define E1000_RDH_A(_n)    (0x02810 + (0x100 * (_n)))
+
+/* RX DCA Control; RW */
+#define E1000_RXCTL(_n)    (0x0C014 + (0x40  * (_n)))
+#define E1000_RXCTL_A(_n)  (0x02814 + (0x100 * (_n)))
+
+/* RX Descriptor Tail; RW */
+#define E1000_RDT(_n)      (0x0C018 + (0x40  * (_n)))
+#define E1000_RDT_A(_n)    (0x02818 + (0x100 * (_n)))
+
+/* RX Descriptor Control; RW */
+#define E1000_RXDCTL(_n)   (0x0C028 + (0x40  * (_n)))
+#define E1000_RXDCTL_A(_n) (0x02828 + (0x100 * (_n)))
+
+/* RX Queue Drop Packet Count; RC */
+#define E1000_RQDPC_A(_n)  (0x02830 + (0x100 * (_n)))
+
+/* Transmit Registers */
+
+/* TX Descriptor Base Low; RW */
+#define E1000_TDBAL(_n)    (0x0E000 + (0x40  * (_n)))
+#define E1000_TDBAL_A(_n)  (0x03800 + (0x100 * (_n)))
+
+/* TX Descriptor Base High; RW */
+#define E1000_TDBAH(_n)    (0x0E004 + (0x40  * (_n)))
+#define E1000_TDBAH_A(_n)  (0x03804 + (0x100 * (_n)))
+
+/* TX Descriptor Ring Length; RW */
+#define E1000_TDLEN(_n)    (0x0E008 + (0x40  * (_n)))
+#define E1000_TDLEN_A(_n)  (0x03808 + (0x100 * (_n)))
+
+/* TX Descriptor Head; RW */
+#define E1000_TDH(_n)      (0x0E010 + (0x40  * (_n)))
+#define E1000_TDH_A(_n)    (0x03810 + (0x100 * (_n)))
+
+/* TX DCA Control; RW */
+#define E1000_TXCTL(_n)    (0x0E014 + (0x40  * (_n)))
+#define E1000_TXCTL_A(_n)  (0x03814 + (0x100 * (_n)))
+
+/* TX Descriptor Tail; RW */
+#define E1000_TDT(_n)      (0x0E018 + (0x40  * (_n)))
+#define E1000_TDT_A(_n)    (0x03818 + (0x100 * (_n)))
+
+/* TX Descriptor Control; RW */
+#define E1000_TXDCTL(_n)   (0x0E028 + (0x40  * (_n)))
+#define E1000_TXDCTL_A(_n) (0x03828 + (0x100 * (_n)))
+
+/* TX Descriptor Completion Write-Back Address Low; RW */
+#define E1000_TDWBAL(_n)   (0x0E038 + (0x40  * (_n)))
+#define E1000_TDWBAL_A(_n) (0x03838 + (0x100 * (_n)))
+
+/* TX Descriptor Completion Write-Back Address High; RW */
+#define E1000_TDWBAH(_n)   (0x0E03C + (0x40  * (_n)))
+#define E1000_TDWBAH_A(_n) (0x0383C + (0x100 * (_n)))
+
+#define E1000_MTA_A        0x0200
+
+#define E1000_XDBAL_MASK (~(BIT(5) - 1)) /* TDBAL and RDBAL Registers Mask */
+
+#define E1000_ICR_MACSEC   0x00000020 /* MACSec */
+#define E1000_ICR_RX0      0x00000040 /* Receiver Overrun */
+#define E1000_ICR_GPI_SDP0 0x00000800 /* General Purpose, SDP0 pin */
+#define E1000_ICR_GPI_SDP1 0x00001000 /* General Purpose, SDP1 pin */
+#define E1000_ICR_GPI_SDP2 0x00002000 /* General Purpose, SDP2 pin */
+#define E1000_ICR_GPI_SDP3 0x00004000 /* General Purpose, SDP3 pin */
+#define E1000_ICR_PTRAP    0x00008000 /* Probe Trap */
+#define E1000_ICR_MNG      0x00040000 /* Management Event */
+#define E1000_ICR_OMED     0x00100000 /* Other Media Energy Detected */
+#define E1000_ICR_FER      0x00400000 /* Fatal Error */
+#define E1000_ICR_NFER     0x00800000 /* Non Fatal Error */
+#define E1000_ICR_CSRTO    0x01000000 /* CSR access Time Out Indication */
+#define E1000_ICR_SCE      0x02000000 /* Storm Control Event */
+#define E1000_ICR_SW_WD    0x04000000 /* Software Watchdog */
+
+/* Extended Interrupts */
+
+#define E1000_EICR_MSIX_MASK   0x01FFFFFF /* Bits used in MSI-X mode */
+#define E1000_EICR_LEGACY_MASK 0x4000FFFF /* Bits used in non MSI-X mode */
+
+/* Mirror VF Control (only RST bit); RW */
+#define E1000_PVTCTRL(_n) (0x10000 + (_n) * 0x100)
+
+/* Mirror Good Packets Received Count; RO */
+#define E1000_PVFGPRC(_n) (0x10010 + (_n) * 0x100)
+
+/* Mirror Good Packets Transmitted Count; RO */
+#define E1000_PVFGPTC(_n) (0x10014 + (_n) * 0x100)
+
+/* Mirror Good Octets Received Count; RO */
+#define E1000_PVFGORC(_n) (0x10018 + (_n) * 0x100)
+
+/* Mirror Extended Interrupt Cause Set; WO */
+#define E1000_PVTEICS(_n) (0x10020 + (_n) * 0x100)
+
+/* Mirror Extended Interrupt Mask Set/Read; RW */
+#define E1000_PVTEIMS(_n) (0x10024 + (_n) * 0x100)
+
+/* Mirror Extended Interrupt Mask Clear; WO */
+#define E1000_PVTEIMC(_n) (0x10028 + (_n) * 0x100)
+
+/* Mirror Extended Interrupt Auto Clear; RW */
+#define E1000_PVTEIAC(_n) (0x1002C + (_n) * 0x100)
+
+/* Mirror Extended Interrupt Auto Mask Enable; RW */
+#define E1000_PVTEIAM(_n) (0x10030 + (_n) * 0x100)
+
+/* Mirror Good Octets Transmitted Count; RO */
+#define E1000_PVFGOTC(_n) (0x10034 + (_n) * 0x100)
+
+/* Mirror Multicast Packets Received Count; RO */
+#define E1000_PVFMPRC(_n) (0x1003C + (_n) * 0x100)
+
+/* Mirror Good RX Packets loopback Count; RO */
+#define E1000_PVFGPRLBC(_n) (0x10040 + (_n) * 0x100)
+
+/* Mirror Good TX packets loopback Count; RO */
+#define E1000_PVFGPTLBC(_n) (0x10044 + (_n) * 0x100)
+
+/* Mirror Good RX Octets loopback Count; RO */
+#define E1000_PVFGORLBC(_n) (0x10048 + (_n) * 0x100)
+
+/* Mirror Good TX Octets loopback Count; RO */
+#define E1000_PVFGOTLBC(_n) (0x10050 + (_n) * 0x100)
+
+/* Mirror Extended Interrupt Cause Set; RC/W1C */
+#define E1000_PVTEICR(_n) (0x10080 + (_n) * 0x100)
+
+/*
+ * These are fake addresses that, according to the specification, the device
+ * does not use. They are used to distinguish between the PF and the VFs
+ * accessing their VTIVAR register (which is the same address, 0x1700).
+ */
+#define E1000_VTIVAR      0x11700
+#define E1000_VTIVAR_MISC 0x11720
+
+#define E1000_RSS_QUEUE(reta, hash) (E1000_RETA_VAL(reta, hash) & 0x0F)
+
+#define E1000_STATUS_IOV_MODE 0x00040000
+
+#define E1000_STATUS_NUM_VFS_SHIFT 14
+
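+/*
+ * Each 32-bit IVAR register packs four 8-bit vector entries. The helpers
+ * below return the byte index into the IVAR array: queues 0-7 use bytes 0
+ * (Rx) and 1 (Tx) of IVAR(i), queues 8-15 use bytes 2 and 3 of IVAR(i - 8).
+ * For example, igb_ivar_entry_rx(9) = (9 - 8) * 4 + 2 = 6, i.e. byte 2 of
+ * IVAR(1).
+ */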
+static inline uint8_t igb_ivar_entry_rx(uint8_t i)
+{
+    return i < 8 ? i * 4 : (i - 8) * 4 + 2;
+}
+
+static inline uint8_t igb_ivar_entry_tx(uint8_t i)
+{
+    return i < 8 ? i * 4 + 1 : (i - 8) * 4 + 3;
+}
+
+#endif
diff --git a/hw/net/igbvf.c b/hw/net/igbvf.c
new file mode 100644
index 0000000000..70beb7af50
--- /dev/null
+++ b/hw/net/igbvf.c
@@ -0,0 +1,327 @@
+/*
+ * QEMU Intel 82576 SR/IOV Ethernet Controller Emulation
+ *
+ * Datasheet:
+ * https://www.intel.com/content/dam/www/public/us/en/documents/datasheets/82576eg-gbe-datasheet.pdf
+ *
+ * Copyright (c) 2020-2023 Red Hat, Inc.
+ * Copyright (c) 2015 Ravello Systems LTD (http://ravellosystems.com)
+ * Developed by Daynix Computing LTD (http://www.daynix.com)
+ *
+ * Authors:
+ * Akihiko Odaki <akihiko.odaki@daynix.com>
+ * Gal Hammer <gal.hammer@sap.com>
+ * Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
+ * Dmitry Fleytman <dmitry@daynix.com>
+ * Leonid Bloch <leonid@daynix.com>
+ * Yan Vugenfirer <yan@daynix.com>
+ *
+ * Based on work done by:
+ * Nir Peleg, Tutis Systems Ltd. for Qumranet Inc.
+ * Copyright (c) 2008 Qumranet
+ * Based on work done by:
+ * Copyright (c) 2007 Dan Aloni
+ * Copyright (c) 2004 Antony T Curtis
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "hw/hw.h"
+#include "hw/net/mii.h"
+#include "hw/pci/pci_device.h"
+#include "hw/pci/pcie.h"
+#include "hw/pci/msix.h"
+#include "net/eth.h"
+#include "net/net.h"
+#include "igb_common.h"
+#include "igb_core.h"
+#include "trace.h"
+#include "qapi/error.h"
+
+#define TYPE_IGBVF "igbvf"
+OBJECT_DECLARE_SIMPLE_TYPE(IgbVfState, IGBVF)
+
+#define IGBVF_MMIO_BAR_IDX  (0)
+#define IGBVF_MSIX_BAR_IDX  (3)
+
+#define IGBVF_MMIO_SIZE     (16 * 1024)
+#define IGBVF_MSIX_SIZE     (16 * 1024)
+
+struct IgbVfState {
+    PCIDevice parent_obj;
+
+    MemoryRegion mmio;
+    MemoryRegion msix;
+};
+
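+/*
+ * Translate an offset into a VF's BAR0 to the equivalent register in the
+ * PF's address space: per-VF mirror registers, the VF's slice of the queue
+ * and mailbox registers, or HWADDR_MAX when the access has no PF
+ * counterpart and must be ignored.
+ */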
+static hwaddr vf_to_pf_addr(hwaddr addr, uint16_t vfn, bool write)
+{
+    switch (addr) {
+    case E1000_CTRL:
+    case E1000_CTRL_DUP:
+        return E1000_PVTCTRL(vfn);
+    case E1000_EICS:
+        return E1000_PVTEICS(vfn);
+    case E1000_EIMS:
+        return E1000_PVTEIMS(vfn);
+    case E1000_EIMC:
+        return E1000_PVTEIMC(vfn);
+    case E1000_EIAC:
+        return E1000_PVTEIAC(vfn);
+    case E1000_EIAM:
+        return E1000_PVTEIAM(vfn);
+    case E1000_EICR:
+        return E1000_PVTEICR(vfn);
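+    /*
+     * Each VF owns three EITRs carved from the tail of the PF array:
+     * VF 0 maps its EITR 0..2 onto PF EITR 22..24, VF 1 onto 19..21,
+     * and so on down to VF 7 onto 1..3.
+     */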
+    case E1000_EITR(0):
+    case E1000_EITR(1):
+    case E1000_EITR(2):
+        return E1000_EITR(22) + (addr - E1000_EITR(0)) - vfn * 0xC;
+    case E1000_IVAR0:
+        return E1000_VTIVAR + vfn * 4;
+    case E1000_IVAR_MISC:
+        return E1000_VTIVAR_MISC + vfn * 4;
+    case 0x0F04: /* PBACL */
+        return E1000_PBACLR;
+    case 0x0F0C: /* PSRTYPE */
+        return E1000_PSRTYPE(vfn);
+    case E1000_V2PMAILBOX(0):
+        return E1000_V2PMAILBOX(vfn);
+    case E1000_VMBMEM(0) ... E1000_VMBMEM(0) + 0x3F:
+        return addr + vfn * 0x40;
+    case E1000_RDBAL_A(0):
+        return E1000_RDBAL(vfn);
+    case E1000_RDBAL_A(1):
+        return E1000_RDBAL(vfn + IGB_MAX_VF_FUNCTIONS);
+    case E1000_RDBAH_A(0):
+        return E1000_RDBAH(vfn);
+    case E1000_RDBAH_A(1):
+        return E1000_RDBAH(vfn + IGB_MAX_VF_FUNCTIONS);
+    case E1000_RDLEN_A(0):
+        return E1000_RDLEN(vfn);
+    case E1000_RDLEN_A(1):
+        return E1000_RDLEN(vfn + IGB_MAX_VF_FUNCTIONS);
+    case E1000_SRRCTL_A(0):
+        return E1000_SRRCTL(vfn);
+    case E1000_SRRCTL_A(1):
+        return E1000_SRRCTL(vfn + IGB_MAX_VF_FUNCTIONS);
+    case E1000_RDH_A(0):
+        return E1000_RDH(vfn);
+    case E1000_RDH_A(1):
+        return E1000_RDH(vfn + IGB_MAX_VF_FUNCTIONS);
+    case E1000_RXCTL_A(0):
+        return E1000_RXCTL(vfn);
+    case E1000_RXCTL_A(1):
+        return E1000_RXCTL(vfn + IGB_MAX_VF_FUNCTIONS);
+    case E1000_RDT_A(0):
+        return E1000_RDT(vfn);
+    case E1000_RDT_A(1):
+        return E1000_RDT(vfn + IGB_MAX_VF_FUNCTIONS);
+    case E1000_RXDCTL_A(0):
+        return E1000_RXDCTL(vfn);
+    case E1000_RXDCTL_A(1):
+        return E1000_RXDCTL(vfn + IGB_MAX_VF_FUNCTIONS);
+    case E1000_RQDPC_A(0):
+        return E1000_RQDPC(vfn);
+    case E1000_RQDPC_A(1):
+        return E1000_RQDPC(vfn + IGB_MAX_VF_FUNCTIONS);
+    case E1000_TDBAL_A(0):
+        return E1000_TDBAL(vfn);
+    case E1000_TDBAL_A(1):
+        return E1000_TDBAL(vfn + IGB_MAX_VF_FUNCTIONS);
+    case E1000_TDBAH_A(0):
+        return E1000_TDBAH(vfn);
+    case E1000_TDBAH_A(1):
+        return E1000_TDBAH(vfn + IGB_MAX_VF_FUNCTIONS);
+    case E1000_TDLEN_A(0):
+        return E1000_TDLEN(vfn);
+    case E1000_TDLEN_A(1):
+        return E1000_TDLEN(vfn + IGB_MAX_VF_FUNCTIONS);
+    case E1000_TDH_A(0):
+        return E1000_TDH(vfn);
+    case E1000_TDH_A(1):
+        return E1000_TDH(vfn + IGB_MAX_VF_FUNCTIONS);
+    case E1000_TXCTL_A(0):
+        return E1000_TXCTL(vfn);
+    case E1000_TXCTL_A(1):
+        return E1000_TXCTL(vfn + IGB_MAX_VF_FUNCTIONS);
+    case E1000_TDT_A(0):
+        return E1000_TDT(vfn);
+    case E1000_TDT_A(1):
+        return E1000_TDT(vfn + IGB_MAX_VF_FUNCTIONS);
+    case E1000_TXDCTL_A(0):
+        return E1000_TXDCTL(vfn);
+    case E1000_TXDCTL_A(1):
+        return E1000_TXDCTL(vfn + IGB_MAX_VF_FUNCTIONS);
+    case E1000_TDWBAL_A(0):
+        return E1000_TDWBAL(vfn);
+    case E1000_TDWBAL_A(1):
+        return E1000_TDWBAL(vfn + IGB_MAX_VF_FUNCTIONS);
+    case E1000_TDWBAH_A(0):
+        return E1000_TDWBAH(vfn);
+    case E1000_TDWBAH_A(1):
+        return E1000_TDWBAH(vfn + IGB_MAX_VF_FUNCTIONS);
+    case E1000_VFGPRC:
+        return E1000_PVFGPRC(vfn);
+    case E1000_VFGPTC:
+        return E1000_PVFGPTC(vfn);
+    case E1000_VFGORC:
+        return E1000_PVFGORC(vfn);
+    case E1000_VFGOTC:
+        return E1000_PVFGOTC(vfn);
+    case E1000_VFMPRC:
+        return E1000_PVFMPRC(vfn);
+    case E1000_VFGPRLBC:
+        return E1000_PVFGPRLBC(vfn);
+    case E1000_VFGPTLBC:
+        return E1000_PVFGPTLBC(vfn);
+    case E1000_VFGORLBC:
+        return E1000_PVFGORLBC(vfn);
+    case E1000_VFGOTLBC:
+        return E1000_PVFGOTLBC(vfn);
+    case E1000_STATUS:
+    case E1000_FRTIMER:
+        if (write) {
+            return HWADDR_MAX;
+        }
+        /* fallthrough */
+    case 0x34E8: /* PBTWAC */
+    case 0x24E8: /* PBRWAC */
+        return addr;
+    }
+
+    trace_igbvf_wrn_io_addr_unknown(addr);
+
+    return HWADDR_MAX;
+}
+
+static void igbvf_write_config(PCIDevice *dev, uint32_t addr, uint32_t val,
+    int len)
+{
+    trace_igbvf_write_config(addr, val, len);
+    pci_default_write_config(dev, addr, val, len);
+}
+
+static uint64_t igbvf_mmio_read(void *opaque, hwaddr addr, unsigned size)
+{
+    PCIDevice *vf = PCI_DEVICE(opaque);
+    PCIDevice *pf = pcie_sriov_get_pf(vf);
+
+    addr = vf_to_pf_addr(addr, pcie_sriov_vf_number(vf), false);
+    return addr == HWADDR_MAX ? 0 : igb_mmio_read(pf, addr, size);
+}
+
+static void igbvf_mmio_write(void *opaque, hwaddr addr, uint64_t val,
+    unsigned size)
+{
+    PCIDevice *vf = PCI_DEVICE(opaque);
+    PCIDevice *pf = pcie_sriov_get_pf(vf);
+
+    addr = vf_to_pf_addr(addr, pcie_sriov_vf_number(vf), true);
+    if (addr != HWADDR_MAX) {
+        igb_mmio_write(pf, addr, val, size);
+    }
+}
+
+static const MemoryRegionOps mmio_ops = {
+    .read = igbvf_mmio_read,
+    .write = igbvf_mmio_write,
+    .endianness = DEVICE_LITTLE_ENDIAN,
+    .impl = {
+        .min_access_size = 4,
+        .max_access_size = 4,
+    },
+};
+
+static void igbvf_pci_realize(PCIDevice *dev, Error **errp)
+{
+    IgbVfState *s = IGBVF(dev);
+    int ret;
+    int i;
+
+    dev->config_write = igbvf_write_config;
+
+    memory_region_init_io(&s->mmio, OBJECT(dev), &mmio_ops, s, "igbvf-mmio",
+        IGBVF_MMIO_SIZE);
+    pcie_sriov_vf_register_bar(dev, IGBVF_MMIO_BAR_IDX, &s->mmio);
+
+    memory_region_init(&s->msix, OBJECT(dev), "igbvf-msix", IGBVF_MSIX_SIZE);
+    pcie_sriov_vf_register_bar(dev, IGBVF_MSIX_BAR_IDX, &s->msix);
+
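+    /* MSI-X table at offset 0, PBA at offset 0x2000 of the same BAR */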
+    ret = msix_init(dev, IGBVF_MSIX_VEC_NUM, &s->msix, IGBVF_MSIX_BAR_IDX, 0,
+        &s->msix, IGBVF_MSIX_BAR_IDX, 0x2000, 0x70, errp);
+    if (ret) {
+        return;
+    }
+
+    for (i = 0; i < IGBVF_MSIX_VEC_NUM; i++) {
+        msix_vector_use(dev, i);
+    }
+
+    if (pcie_endpoint_cap_init(dev, 0xa0) < 0) {
+        hw_error("Failed to initialize PCIe capability");
+    }
+
+    if (pcie_aer_init(dev, 1, 0x100, 0x40, errp) < 0) {
+        hw_error("Failed to initialize AER capability");
+    }
+
+    pcie_ari_init(dev, 0x150, 1);
+}
+
+static void igbvf_pci_uninit(PCIDevice *dev)
+{
+    IgbVfState *s = IGBVF(dev);
+
+    pcie_aer_exit(dev);
+    pcie_cap_exit(dev);
+    msix_unuse_all_vectors(dev);
+    msix_uninit(dev, &s->msix, &s->msix);
+}
+
+static void igbvf_class_init(ObjectClass *class, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(class);
+    PCIDeviceClass *c = PCI_DEVICE_CLASS(class);
+
+    c->realize = igbvf_pci_realize;
+    c->exit = igbvf_pci_uninit;
+    c->vendor_id = PCI_VENDOR_ID_INTEL;
+    c->device_id = E1000_DEV_ID_82576_VF;
+    c->revision = 1;
+    c->class_id = PCI_CLASS_NETWORK_ETHERNET;
+
+    dc->desc = "Intel 82576 Virtual Function";
+    dc->user_creatable = false;
+
+    set_bit(DEVICE_CATEGORY_NETWORK, dc->categories);
+}
+
+static const TypeInfo igbvf_info = {
+    .name = TYPE_IGBVF,
+    .parent = TYPE_PCI_DEVICE,
+    .instance_size = sizeof(IgbVfState),
+    .class_init = igbvf_class_init,
+    .interfaces = (InterfaceInfo[]) {
+        { INTERFACE_PCIE_DEVICE },
+        { }
+    },
+};
+
+static void igb_register_types(void)
+{
+    type_register_static(&igbvf_info);
+}
+
+type_init(igb_register_types)
diff --git a/hw/net/meson.build b/hw/net/meson.build
index 4285145715..e2be0654a1 100644
--- a/hw/net/meson.build
+++ b/hw/net/meson.build
@@ -10,6 +10,8 @@ softmmu_ss.add(when: 'CONFIG_PCNET_COMMON', if_true: files('pcnet.c'))
 softmmu_ss.add(when: 'CONFIG_E1000_PCI', if_true: files('e1000.c', 'e1000x_common.c'))
 softmmu_ss.add(when: 'CONFIG_E1000E_PCI_EXPRESS', if_true: files('net_tx_pkt.c', 'net_rx_pkt.c'))
 softmmu_ss.add(when: 'CONFIG_E1000E_PCI_EXPRESS', if_true: files('e1000e.c', 'e1000e_core.c', 'e1000x_common.c'))
+softmmu_ss.add(when: 'CONFIG_IGB_PCI_EXPRESS', if_true: files('net_tx_pkt.c', 'net_rx_pkt.c'))
+softmmu_ss.add(when: 'CONFIG_IGB_PCI_EXPRESS', if_true: files('igb.c', 'igbvf.c', 'igb_core.c'))
 softmmu_ss.add(when: 'CONFIG_RTL8139_PCI', if_true: files('rtl8139.c'))
 softmmu_ss.add(when: 'CONFIG_TULIP', if_true: files('tulip.c'))
 softmmu_ss.add(when: 'CONFIG_VMXNET3_PCI', if_true: files('net_tx_pkt.c', 'net_rx_pkt.c'))
diff --git a/hw/net/net_rx_pkt.c b/hw/net/net_rx_pkt.c
index 1e1c504e42..39cdea06de 100644
--- a/hw/net/net_rx_pkt.c
+++ b/hw/net/net_rx_pkt.c
@@ -30,14 +30,11 @@ struct NetRxPkt {
     uint32_t tot_len;
     uint16_t tci;
     size_t ehdr_buf_len;
-    bool has_virt_hdr;
     eth_pkt_types_e packet_type;
 
     /* Analysis results */
-    bool isip4;
-    bool isip6;
-    bool isudp;
-    bool istcp;
+    bool hasip4;
+    bool hasip6;
 
     size_t l3hdr_off;
     size_t l4hdr_off;
@@ -48,10 +45,9 @@ struct NetRxPkt {
     eth_l4_hdr_info  l4hdr_info;
 };
 
-void net_rx_pkt_init(struct NetRxPkt **pkt, bool has_virt_hdr)
+void net_rx_pkt_init(struct NetRxPkt **pkt)
 {
     struct NetRxPkt *p = g_malloc0(sizeof *p);
-    p->has_virt_hdr = has_virt_hdr;
     p->vec = NULL;
     p->vec_len_total = 0;
     *pkt = p;
@@ -107,12 +103,11 @@ net_rx_pkt_pull_data(struct NetRxPkt *pkt,
                                 iov, iovcnt, ploff, pkt->tot_len);
     }
 
-    eth_get_protocols(pkt->vec, pkt->vec_len, &pkt->isip4, &pkt->isip6,
-                      &pkt->isudp, &pkt->istcp,
+    eth_get_protocols(pkt->vec, pkt->vec_len, &pkt->hasip4, &pkt->hasip6,
                       &pkt->l3hdr_off, &pkt->l4hdr_off, &pkt->l5hdr_off,
                       &pkt->ip6hdr_info, &pkt->ip4hdr_info, &pkt->l4hdr_info);
 
-    trace_net_rx_pkt_parsed(pkt->isip4, pkt->isip6, pkt->isudp, pkt->istcp,
+    trace_net_rx_pkt_parsed(pkt->hasip4, pkt->hasip6, pkt->l4hdr_info.proto,
                             pkt->l3hdr_off, pkt->l4hdr_off, pkt->l5hdr_off);
 }
 
@@ -201,22 +196,20 @@ void net_rx_pkt_set_protocols(struct NetRxPkt *pkt, const void *data,
 
     assert(pkt);
 
-    eth_get_protocols(&iov, 1, &pkt->isip4, &pkt->isip6,
-                      &pkt->isudp, &pkt->istcp,
+    eth_get_protocols(&iov, 1, &pkt->hasip4, &pkt->hasip6,
                       &pkt->l3hdr_off, &pkt->l4hdr_off, &pkt->l5hdr_off,
                       &pkt->ip6hdr_info, &pkt->ip4hdr_info, &pkt->l4hdr_info);
 }
 
 void net_rx_pkt_get_protocols(struct NetRxPkt *pkt,
-                              bool *isip4, bool *isip6,
-                              bool *isudp, bool *istcp)
+                              bool *hasip4, bool *hasip6,
+                              EthL4HdrProto *l4hdr_proto)
 {
     assert(pkt);
 
-    *isip4 = pkt->isip4;
-    *isip6 = pkt->isip6;
-    *isudp = pkt->isudp;
-    *istcp = pkt->istcp;
+    *hasip4 = pkt->hasip4;
+    *hasip6 = pkt->hasip6;
+    *l4hdr_proto = pkt->l4hdr_info.proto;
 }
 
 size_t net_rx_pkt_get_l3_hdr_offset(struct NetRxPkt *pkt)
@@ -333,58 +326,58 @@ net_rx_pkt_calc_rss_hash(struct NetRxPkt *pkt,
 
     switch (type) {
     case NetPktRssIpV4:
-        assert(pkt->isip4);
+        assert(pkt->hasip4);
         trace_net_rx_pkt_rss_ip4();
         _net_rx_rss_prepare_ip4(&rss_input[0], pkt, &rss_length);
         break;
     case NetPktRssIpV4Tcp:
-        assert(pkt->isip4);
-        assert(pkt->istcp);
+        assert(pkt->hasip4);
+        assert(pkt->l4hdr_info.proto == ETH_L4_HDR_PROTO_TCP);
         trace_net_rx_pkt_rss_ip4_tcp();
         _net_rx_rss_prepare_ip4(&rss_input[0], pkt, &rss_length);
         _net_rx_rss_prepare_tcp(&rss_input[0], pkt, &rss_length);
         break;
     case NetPktRssIpV6Tcp:
-        assert(pkt->isip6);
-        assert(pkt->istcp);
+        assert(pkt->hasip6);
+        assert(pkt->l4hdr_info.proto == ETH_L4_HDR_PROTO_TCP);
         trace_net_rx_pkt_rss_ip6_tcp();
         _net_rx_rss_prepare_ip6(&rss_input[0], pkt, false, &rss_length);
         _net_rx_rss_prepare_tcp(&rss_input[0], pkt, &rss_length);
         break;
     case NetPktRssIpV6:
-        assert(pkt->isip6);
+        assert(pkt->hasip6);
         trace_net_rx_pkt_rss_ip6();
         _net_rx_rss_prepare_ip6(&rss_input[0], pkt, false, &rss_length);
         break;
     case NetPktRssIpV6Ex:
-        assert(pkt->isip6);
+        assert(pkt->hasip6);
         trace_net_rx_pkt_rss_ip6_ex();
         _net_rx_rss_prepare_ip6(&rss_input[0], pkt, true, &rss_length);
         break;
     case NetPktRssIpV6TcpEx:
-        assert(pkt->isip6);
-        assert(pkt->istcp);
+        assert(pkt->hasip6);
+        assert(pkt->l4hdr_info.proto == ETH_L4_HDR_PROTO_TCP);
         trace_net_rx_pkt_rss_ip6_ex_tcp();
         _net_rx_rss_prepare_ip6(&rss_input[0], pkt, true, &rss_length);
         _net_rx_rss_prepare_tcp(&rss_input[0], pkt, &rss_length);
         break;
     case NetPktRssIpV4Udp:
-        assert(pkt->isip4);
-        assert(pkt->isudp);
+        assert(pkt->hasip4);
+        assert(pkt->l4hdr_info.proto == ETH_L4_HDR_PROTO_UDP);
         trace_net_rx_pkt_rss_ip4_udp();
         _net_rx_rss_prepare_ip4(&rss_input[0], pkt, &rss_length);
         _net_rx_rss_prepare_udp(&rss_input[0], pkt, &rss_length);
         break;
     case NetPktRssIpV6Udp:
-        assert(pkt->isip6);
-        assert(pkt->isudp);
+        assert(pkt->hasip6);
+        assert(pkt->l4hdr_info.proto == ETH_L4_HDR_PROTO_UDP);
         trace_net_rx_pkt_rss_ip6_udp();
         _net_rx_rss_prepare_ip6(&rss_input[0], pkt, false, &rss_length);
         _net_rx_rss_prepare_udp(&rss_input[0], pkt, &rss_length);
         break;
     case NetPktRssIpV6UdpEx:
-        assert(pkt->isip6);
-        assert(pkt->isudp);
+        assert(pkt->hasip6);
+        assert(pkt->l4hdr_info.proto == ETH_L4_HDR_PROTO_UDP);
         trace_net_rx_pkt_rss_ip6_ex_udp();
         _net_rx_rss_prepare_ip6(&rss_input[0], pkt, true, &rss_length);
         _net_rx_rss_prepare_udp(&rss_input[0], pkt, &rss_length);
@@ -406,7 +399,7 @@ uint16_t net_rx_pkt_get_ip_id(struct NetRxPkt *pkt)
 {
     assert(pkt);
 
-    if (pkt->isip4) {
+    if (pkt->hasip4) {
         return be16_to_cpu(pkt->ip4hdr_info.ip4_hdr.ip_id);
     }
 
@@ -417,7 +410,7 @@ bool net_rx_pkt_is_tcp_ack(struct NetRxPkt *pkt)
 {
     assert(pkt);
 
-    if (pkt->istcp) {
+    if (pkt->l4hdr_info.proto == ETH_L4_HDR_PROTO_TCP) {
         return TCP_HEADER_FLAGS(&pkt->l4hdr_info.hdr.tcp) & TCP_FLAG_ACK;
     }
 
@@ -428,7 +421,7 @@ bool net_rx_pkt_has_tcp_data(struct NetRxPkt *pkt)
 {
     assert(pkt);
 
-    if (pkt->istcp) {
+    if (pkt->l4hdr_info.proto == ETH_L4_HDR_PROTO_TCP) {
         return pkt->l4hdr_info.has_tcp_data;
     }
 
@@ -465,18 +458,18 @@ void net_rx_pkt_set_vhdr_iovec(struct NetRxPkt *pkt,
     iov_to_buf(iov, iovcnt, 0, &pkt->virt_hdr, sizeof pkt->virt_hdr);
 }
 
-bool net_rx_pkt_is_vlan_stripped(struct NetRxPkt *pkt)
+void net_rx_pkt_unset_vhdr(struct NetRxPkt *pkt)
 {
     assert(pkt);
 
-    return pkt->ehdr_buf_len ? true : false;
+    memset(&pkt->virt_hdr, 0, sizeof(pkt->virt_hdr));
 }
 
-bool net_rx_pkt_has_virt_hdr(struct NetRxPkt *pkt)
+bool net_rx_pkt_is_vlan_stripped(struct NetRxPkt *pkt)
 {
     assert(pkt);
 
-    return pkt->has_virt_hdr;
+    return pkt->ehdr_buf_len ? true : false;
 }
 
 uint16_t net_rx_pkt_get_vlan_tag(struct NetRxPkt *pkt)
@@ -494,7 +487,7 @@ bool net_rx_pkt_validate_l3_csum(struct NetRxPkt *pkt, bool *csum_valid)
 
     trace_net_rx_pkt_l3_csum_validate_entry();
 
-    if (!pkt->isip4) {
+    if (!pkt->hasip4) {
         trace_net_rx_pkt_l3_csum_validate_not_ip4();
         return false;
     }
@@ -525,8 +518,8 @@ _net_rx_pkt_calc_l4_csum(struct NetRxPkt *pkt)
 
     trace_net_rx_pkt_l4_csum_calc_entry();
 
-    if (pkt->isip4) {
-        if (pkt->isudp) {
+    if (pkt->hasip4) {
+        if (pkt->l4hdr_info.proto == ETH_L4_HDR_PROTO_UDP) {
             csl = be16_to_cpu(pkt->l4hdr_info.hdr.udp.uh_ulen);
             trace_net_rx_pkt_l4_csum_calc_ip4_udp();
         } else {
@@ -539,7 +532,7 @@ _net_rx_pkt_calc_l4_csum(struct NetRxPkt *pkt)
                                             csl, &cso);
         trace_net_rx_pkt_l4_csum_calc_ph_csum(cntr, csl);
     } else {
-        if (pkt->isudp) {
+        if (pkt->l4hdr_info.proto == ETH_L4_HDR_PROTO_UDP) {
             csl = be16_to_cpu(pkt->l4hdr_info.hdr.udp.uh_ulen);
             trace_net_rx_pkt_l4_csum_calc_ip6_udp();
         } else {
@@ -573,17 +566,19 @@ bool net_rx_pkt_validate_l4_csum(struct NetRxPkt *pkt, bool *csum_valid)
 
     trace_net_rx_pkt_l4_csum_validate_entry();
 
-    if (!pkt->istcp && !pkt->isudp) {
+    if (pkt->l4hdr_info.proto != ETH_L4_HDR_PROTO_TCP &&
+        pkt->l4hdr_info.proto != ETH_L4_HDR_PROTO_UDP) {
         trace_net_rx_pkt_l4_csum_validate_not_xxp();
         return false;
     }
 
-    if (pkt->isudp && (pkt->l4hdr_info.hdr.udp.uh_sum == 0)) {
+    if (pkt->l4hdr_info.proto == ETH_L4_HDR_PROTO_UDP &&
+        pkt->l4hdr_info.hdr.udp.uh_sum == 0) {
         trace_net_rx_pkt_l4_csum_validate_udp_with_no_checksum();
         return false;
     }
 
-    if (pkt->isip4 && pkt->ip4hdr_info.fragment) {
+    if (pkt->hasip4 && pkt->ip4hdr_info.fragment) {
         trace_net_rx_pkt_l4_csum_validate_ip4_fragment();
         return false;
     }
@@ -604,22 +599,27 @@ bool net_rx_pkt_fix_l4_csum(struct NetRxPkt *pkt)
 
     trace_net_rx_pkt_l4_csum_fix_entry();
 
-    if (pkt->istcp) {
+    switch (pkt->l4hdr_info.proto) {
+    case ETH_L4_HDR_PROTO_TCP:
         l4_cso = offsetof(struct tcp_header, th_sum);
         trace_net_rx_pkt_l4_csum_fix_tcp(l4_cso);
-    } else if (pkt->isudp) {
+        break;
+
+    case ETH_L4_HDR_PROTO_UDP:
         if (pkt->l4hdr_info.hdr.udp.uh_sum == 0) {
             trace_net_rx_pkt_l4_csum_fix_udp_with_no_checksum();
             return false;
         }
         l4_cso = offsetof(struct udp_header, uh_sum);
         trace_net_rx_pkt_l4_csum_fix_udp(l4_cso);
-    } else {
+        break;
+
+    default:
         trace_net_rx_pkt_l4_csum_fix_not_xxp();
         return false;
     }
 
-    if (pkt->isip4 && pkt->ip4hdr_info.fragment) {
+    if (pkt->hasip4 && pkt->ip4hdr_info.fragment) {
             trace_net_rx_pkt_l4_csum_fix_ip4_fragment();
             return false;
     }
diff --git a/hw/net/net_rx_pkt.h b/hw/net/net_rx_pkt.h
index 048e3461f0..d00b484900 100644
--- a/hw/net/net_rx_pkt.h
+++ b/hw/net/net_rx_pkt.h
@@ -37,10 +37,9 @@ void net_rx_pkt_uninit(struct NetRxPkt *pkt);
  * Init function for rx packet functionality
  *
  * @pkt:            packet pointer
- * @has_virt_hdr:   device uses virtio header
  *
  */
-void net_rx_pkt_init(struct NetRxPkt **pkt, bool has_virt_hdr);
+void net_rx_pkt_init(struct NetRxPkt **pkt);
 
 /**
  * returns total length of data attached to rx context
@@ -67,15 +66,14 @@ void net_rx_pkt_set_protocols(struct NetRxPkt *pkt, const void *data,
  * fetches packet analysis results
  *
  * @pkt:            packet
- * @isip4:          whether the packet given is IPv4
- * @isip6:          whether the packet given is IPv6
- * @isudp:          whether the packet given is UDP
- * @istcp:          whether the packet given is TCP
+ * @hasip4:          whether the packet has an IPv4 header
+ * @hasip6:          whether the packet has an IPv6 header
+ * @l4hdr_proto:     protocol of L4 header
  *
  */
 void net_rx_pkt_get_protocols(struct NetRxPkt *pkt,
-                                 bool *isip4, bool *isip6,
-                                 bool *isudp, bool *istcp);
+                                 bool *hasip4, bool *hasip6,
+                                 EthL4HdrProto *l4hdr_proto);
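+
+/*
+ * Minimal usage sketch (variable names are illustrative only):
+ *
+ *     bool hasip4, hasip6;
+ *     EthL4HdrProto l4hdr_proto;
+ *
+ *     net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto);
+ *     if (hasip4 && l4hdr_proto == ETH_L4_HDR_PROTO_TCP) {
+ *         ... IPv4/TCP-specific handling ...
+ *     }
+ */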
 
 /**
 * fetches L3 header offset
@@ -215,15 +213,6 @@ uint16_t net_rx_pkt_get_vlan_tag(struct NetRxPkt *pkt);
 bool net_rx_pkt_is_vlan_stripped(struct NetRxPkt *pkt);
 
 /**
- * notifies caller if the packet has virtio header
- *
- * @pkt:            packet
- * @ret:            true if packet has virtio header, false otherwize
- *
- */
-bool net_rx_pkt_has_virt_hdr(struct NetRxPkt *pkt);
-
-/**
 * attach scatter-gather data to rx packet
 *
 * @pkt:            packet
@@ -323,6 +312,14 @@ void net_rx_pkt_set_vhdr_iovec(struct NetRxPkt *pkt,
     const struct iovec *iov, int iovcnt);
 
 /**
+ * unset vhdr data from packet context
+ *
+ * @pkt:            packet
+ *
+ */
+void net_rx_pkt_unset_vhdr(struct NetRxPkt *pkt);
+
+/**
  * save packet type in packet context
  *
  * @pkt:            packet
diff --git a/hw/net/net_tx_pkt.c b/hw/net/net_tx_pkt.c
index 2533ea2700..986a3adfe9 100644
--- a/hw/net/net_tx_pkt.c
+++ b/hw/net/net_tx_pkt.c
@@ -35,7 +35,6 @@ struct NetTxPkt {
     PCIDevice *pci_dev;
 
     struct virtio_net_hdr virt_hdr;
-    bool has_virt_hdr;
 
     struct iovec *raw;
     uint32_t raw_frags;
@@ -54,12 +53,10 @@ struct NetTxPkt {
     uint16_t hdr_len;
     eth_pkt_types_e packet_type;
     uint8_t l4proto;
-
-    bool is_loopback;
 };
 
 void net_tx_pkt_init(struct NetTxPkt **pkt, PCIDevice *pci_dev,
-    uint32_t max_frags, bool has_virt_hdr)
+    uint32_t max_frags)
 {
     struct NetTxPkt *p = g_malloc0(sizeof *p);
 
@@ -71,10 +68,8 @@ void net_tx_pkt_init(struct NetTxPkt **pkt, PCIDevice *pci_dev,
 
     p->max_payload_frags = max_frags;
     p->max_raw_frags = max_frags;
-    p->has_virt_hdr = has_virt_hdr;
     p->vec[NET_TX_PKT_VHDR_FRAG].iov_base = &p->virt_hdr;
-    p->vec[NET_TX_PKT_VHDR_FRAG].iov_len =
-        p->has_virt_hdr ? sizeof p->virt_hdr : 0;
+    p->vec[NET_TX_PKT_VHDR_FRAG].iov_len = sizeof p->virt_hdr;
     p->vec[NET_TX_PKT_L2HDR_FRAG].iov_base = &p->l2_hdr;
     p->vec[NET_TX_PKT_L3HDR_FRAG].iov_base = &p->l3_hdr;
 
@@ -304,10 +299,11 @@ func_exit:
     return rc;
 }
 
-void net_tx_pkt_build_vheader(struct NetTxPkt *pkt, bool tso_enable,
+bool net_tx_pkt_build_vheader(struct NetTxPkt *pkt, bool tso_enable,
     bool csum_enable, uint32_t gso_size)
 {
     struct tcp_hdr l4hdr;
+    size_t bytes_read;
     assert(pkt);
 
     /* csum has to be enabled if tso is. */
@@ -328,8 +324,13 @@ void net_tx_pkt_build_vheader(struct NetTxPkt *pkt, bool tso_enable,
 
     case VIRTIO_NET_HDR_GSO_TCPV4:
     case VIRTIO_NET_HDR_GSO_TCPV6:
-        iov_to_buf(&pkt->vec[NET_TX_PKT_PL_START_FRAG], pkt->payload_frags,
-                   0, &l4hdr, sizeof(l4hdr));
+        bytes_read = iov_to_buf(&pkt->vec[NET_TX_PKT_PL_START_FRAG],
+                                pkt->payload_frags, 0, &l4hdr, sizeof(l4hdr));
+        if (bytes_read < sizeof(l4hdr) ||
+            l4hdr.th_off * sizeof(uint32_t) < sizeof(l4hdr)) {
+            return false;
+        }
+
         pkt->virt_hdr.hdr_len = pkt->hdr_len + l4hdr.th_off * sizeof(uint32_t);
         pkt->virt_hdr.gso_size = gso_size;
         break;
@@ -341,11 +342,17 @@ void net_tx_pkt_build_vheader(struct NetTxPkt *pkt, bool tso_enable,
     if (csum_enable) {
         switch (pkt->l4proto) {
         case IP_PROTO_TCP:
+            if (pkt->payload_len < sizeof(struct tcp_hdr)) {
+                return false;
+            }
             pkt->virt_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
             pkt->virt_hdr.csum_start = pkt->hdr_len;
             pkt->virt_hdr.csum_offset = offsetof(struct tcp_hdr, th_sum);
             break;
         case IP_PROTO_UDP:
+            if (pkt->payload_len < sizeof(struct udp_hdr)) {
+                return false;
+            }
             pkt->virt_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
             pkt->virt_hdr.csum_start = pkt->hdr_len;
             pkt->virt_hdr.csum_offset = offsetof(struct udp_hdr, uh_sum);
@@ -354,6 +361,8 @@ void net_tx_pkt_build_vheader(struct NetTxPkt *pkt, bool tso_enable,
             break;
         }
     }
+
+    return true;
 }
 
 void net_tx_pkt_setup_vlan_header_ex(struct NetTxPkt *pkt,
@@ -464,15 +473,14 @@ void net_tx_pkt_reset(struct NetTxPkt *pkt)
     pkt->l4proto = 0;
 }
 
-static void net_tx_pkt_do_sw_csum(struct NetTxPkt *pkt)
+static void net_tx_pkt_do_sw_csum(struct NetTxPkt *pkt,
+                                  struct iovec *iov, uint32_t iov_len,
+                                  uint16_t csl)
 {
-    struct iovec *iov = &pkt->vec[NET_TX_PKT_L2HDR_FRAG];
     uint32_t csum_cntr;
     uint16_t csum = 0;
     uint32_t cso;
     /* num of iovec without vhdr */
-    uint32_t iov_len = pkt->payload_frags + NET_TX_PKT_PL_START_FRAG - 1;
-    uint16_t csl;
     size_t csum_offset = pkt->virt_hdr.csum_start + pkt->virt_hdr.csum_offset;
     uint16_t l3_proto = eth_get_l3_proto(iov, 1, iov->iov_len);
 
@@ -480,8 +488,6 @@ static void net_tx_pkt_do_sw_csum(struct NetTxPkt *pkt)
     iov_from_buf(iov, iov_len, csum_offset, &csum, sizeof csum);
 
     /* Calculate L4 TCP/UDP checksum */
-    csl = pkt->payload_len;
-
     csum_cntr = 0;
     cso = 0;
     /* add pseudo header to csum */
@@ -504,23 +510,16 @@ static void net_tx_pkt_do_sw_csum(struct NetTxPkt *pkt)
     iov_from_buf(iov, iov_len, csum_offset, &csum, sizeof csum);
 }
 
-enum {
-    NET_TX_PKT_FRAGMENT_L2_HDR_POS = 0,
-    NET_TX_PKT_FRAGMENT_L3_HDR_POS,
-    NET_TX_PKT_FRAGMENT_HEADER_NUM
-};
-
 #define NET_MAX_FRAG_SG_LIST (64)
 
 static size_t net_tx_pkt_fetch_fragment(struct NetTxPkt *pkt,
-    int *src_idx, size_t *src_offset, struct iovec *dst, int *dst_idx)
+    int *src_idx, size_t *src_offset, size_t src_len,
+    struct iovec *dst, int *dst_idx)
 {
     size_t fetched = 0;
     struct iovec *src = pkt->vec;
 
-    *dst_idx = NET_TX_PKT_FRAGMENT_HEADER_NUM;
-
-    while (fetched < IP_FRAG_ALIGN_SIZE(pkt->virt_hdr.gso_size)) {
+    while (fetched < src_len) {
 
         /* no more place in fragment iov */
         if (*dst_idx == NET_MAX_FRAG_SG_LIST) {
@@ -535,7 +534,7 @@ static size_t net_tx_pkt_fetch_fragment(struct NetTxPkt *pkt,
 
         dst[*dst_idx].iov_base = src[*src_idx].iov_base + *src_offset;
         dst[*dst_idx].iov_len = MIN(src[*src_idx].iov_len - *src_offset,
-            IP_FRAG_ALIGN_SIZE(pkt->virt_hdr.gso_size) - fetched);
+            src_len - fetched);
 
         *src_offset += dst[*dst_idx].iov_len;
         fetched += dst[*dst_idx].iov_len;
@@ -551,71 +550,250 @@ static size_t net_tx_pkt_fetch_fragment(struct NetTxPkt *pkt,
     return fetched;
 }
 
-static inline void net_tx_pkt_sendv(struct NetTxPkt *pkt,
-    NetClientState *nc, const struct iovec *iov, int iov_cnt)
+static void net_tx_pkt_sendv(
+    void *opaque, const struct iovec *iov, int iov_cnt,
+    const struct iovec *virt_iov, int virt_iov_cnt)
 {
-    if (pkt->is_loopback) {
-        qemu_receive_packet_iov(nc, iov, iov_cnt);
+    NetClientState *nc = opaque;
+
+    if (qemu_get_using_vnet_hdr(nc->peer)) {
+        qemu_sendv_packet(nc, virt_iov, virt_iov_cnt);
     } else {
         qemu_sendv_packet(nc, iov, iov_cnt);
     }
 }
 
+static bool net_tx_pkt_tcp_fragment_init(struct NetTxPkt *pkt,
+                                         struct iovec *fragment,
+                                         int *pl_idx,
+                                         size_t *l4hdr_len,
+                                         int *src_idx,
+                                         size_t *src_offset,
+                                         size_t *src_len)
+{
+    struct iovec *l4 = fragment + NET_TX_PKT_PL_START_FRAG;
+    size_t bytes_read = 0;
+    struct tcp_hdr *th;
+
+    if (!pkt->payload_frags) {
+        return false;
+    }
+
+    l4->iov_len = pkt->virt_hdr.hdr_len - pkt->hdr_len;
+    l4->iov_base = g_malloc(l4->iov_len);
+
+    *src_idx = NET_TX_PKT_PL_START_FRAG;
+    while (pkt->vec[*src_idx].iov_len < l4->iov_len - bytes_read) {
+        memcpy((char *)l4->iov_base + bytes_read, pkt->vec[*src_idx].iov_base,
+               pkt->vec[*src_idx].iov_len);
+
+        bytes_read += pkt->vec[*src_idx].iov_len;
+
+        (*src_idx)++;
+        if (*src_idx >= pkt->payload_frags + NET_TX_PKT_PL_START_FRAG) {
+            g_free(l4->iov_base);
+            return false;
+        }
+    }
+
+    *src_offset = l4->iov_len - bytes_read;
+    memcpy((char *)l4->iov_base + bytes_read, pkt->vec[*src_idx].iov_base,
+           *src_offset);
+
+    th = l4->iov_base;
+    th->th_flags &= ~(TH_FIN | TH_PUSH);
+
+    *pl_idx = NET_TX_PKT_PL_START_FRAG + 1;
+    *l4hdr_len = l4->iov_len;
+    *src_len = pkt->virt_hdr.gso_size;
+
+    return true;
+}
+
+static void net_tx_pkt_tcp_fragment_deinit(struct iovec *fragment)
+{
+    g_free(fragment[NET_TX_PKT_PL_START_FRAG].iov_base);
+}
+
+static void net_tx_pkt_tcp_fragment_fix(struct NetTxPkt *pkt,
+                                        struct iovec *fragment,
+                                        size_t fragment_len,
+                                        uint8_t gso_type)
+{
+    struct iovec *l3hdr = fragment + NET_TX_PKT_L3HDR_FRAG;
+    struct iovec *l4hdr = fragment + NET_TX_PKT_PL_START_FRAG;
+    struct ip_header *ip = l3hdr->iov_base;
+    struct ip6_header *ip6 = l3hdr->iov_base;
+    size_t len = l3hdr->iov_len + l4hdr->iov_len + fragment_len;
+
+    switch (gso_type) {
+    case VIRTIO_NET_HDR_GSO_TCPV4:
+        ip->ip_len = cpu_to_be16(len);
+        eth_fix_ip4_checksum(l3hdr->iov_base, l3hdr->iov_len);
+        break;
+
+    case VIRTIO_NET_HDR_GSO_TCPV6:
+        len -= sizeof(struct ip6_header);
+        ip6->ip6_ctlun.ip6_un1.ip6_un1_plen = cpu_to_be16(len);
+        break;
+    }
+}
+
+static void net_tx_pkt_tcp_fragment_advance(struct NetTxPkt *pkt,
+                                            struct iovec *fragment,
+                                            size_t fragment_len,
+                                            uint8_t gso_type)
+{
+    struct iovec *l3hdr = fragment + NET_TX_PKT_L3HDR_FRAG;
+    struct iovec *l4hdr = fragment + NET_TX_PKT_PL_START_FRAG;
+    struct ip_header *ip = l3hdr->iov_base;
+    struct tcp_hdr *th = l4hdr->iov_base;
+
+    if (gso_type == VIRTIO_NET_HDR_GSO_TCPV4) {
+        ip->ip_id = cpu_to_be16(be16_to_cpu(ip->ip_id) + 1);
+    }
+
+    th->th_seq = cpu_to_be32(be32_to_cpu(th->th_seq) + fragment_len);
+    th->th_flags &= ~TH_CWR;
+}
+
+static void net_tx_pkt_udp_fragment_init(struct NetTxPkt *pkt,
+                                         int *pl_idx,
+                                         size_t *l4hdr_len,
+                                         int *src_idx, size_t *src_offset,
+                                         size_t *src_len)
+{
+    *pl_idx = NET_TX_PKT_PL_START_FRAG;
+    *l4hdr_len = 0;
+    *src_idx = NET_TX_PKT_PL_START_FRAG;
+    *src_offset = 0;
+    *src_len = IP_FRAG_ALIGN_SIZE(pkt->virt_hdr.gso_size);
+}
+
+static void net_tx_pkt_udp_fragment_fix(struct NetTxPkt *pkt,
+                                        struct iovec *fragment,
+                                        size_t fragment_offset,
+                                        size_t fragment_len)
+{
+    bool more_frags = fragment_offset + fragment_len < pkt->payload_len;
+    uint16_t orig_flags;
+    struct iovec *l3hdr = fragment + NET_TX_PKT_L3HDR_FRAG;
+    struct ip_header *ip = l3hdr->iov_base;
+    uint16_t frag_off_units = fragment_offset / IP_FRAG_UNIT_SIZE;
+    uint16_t new_ip_off;
+
+    assert(fragment_offset % IP_FRAG_UNIT_SIZE == 0);
+    assert((frag_off_units & ~IP_OFFMASK) == 0);
+
+    orig_flags = be16_to_cpu(ip->ip_off) & ~(IP_OFFMASK | IP_MF);
+    new_ip_off = frag_off_units | orig_flags | (more_frags ? IP_MF : 0);
+    ip->ip_off = cpu_to_be16(new_ip_off);
+    ip->ip_len = cpu_to_be16(l3hdr->iov_len + fragment_len);
+
+    eth_fix_ip4_checksum(l3hdr->iov_base, l3hdr->iov_len);
+}
+
 static bool net_tx_pkt_do_sw_fragmentation(struct NetTxPkt *pkt,
-    NetClientState *nc)
+                                           NetTxPktCallback callback,
+                                           void *context)
 {
+    uint8_t gso_type = pkt->virt_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN;
+
     struct iovec fragment[NET_MAX_FRAG_SG_LIST];
-    size_t fragment_len = 0;
-    bool more_frags = false;
-
-    /* some pointers for shorter code */
-    void *l2_iov_base, *l3_iov_base;
-    size_t l2_iov_len, l3_iov_len;
-    int src_idx =  NET_TX_PKT_PL_START_FRAG, dst_idx;
-    size_t src_offset = 0;
-    size_t fragment_offset = 0;
+    size_t fragment_len;
+    size_t l4hdr_len;
+    size_t src_len;
 
-    l2_iov_base = pkt->vec[NET_TX_PKT_L2HDR_FRAG].iov_base;
-    l2_iov_len = pkt->vec[NET_TX_PKT_L2HDR_FRAG].iov_len;
-    l3_iov_base = pkt->vec[NET_TX_PKT_L3HDR_FRAG].iov_base;
-    l3_iov_len = pkt->vec[NET_TX_PKT_L3HDR_FRAG].iov_len;
+    int src_idx, dst_idx, pl_idx;
+    size_t src_offset;
+    size_t fragment_offset = 0;
+    struct virtio_net_hdr virt_hdr = {
+        .flags = pkt->virt_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM ?
+                 VIRTIO_NET_HDR_F_DATA_VALID : 0
+    };
 
     /* Copy headers */
-    fragment[NET_TX_PKT_FRAGMENT_L2_HDR_POS].iov_base = l2_iov_base;
-    fragment[NET_TX_PKT_FRAGMENT_L2_HDR_POS].iov_len = l2_iov_len;
-    fragment[NET_TX_PKT_FRAGMENT_L3_HDR_POS].iov_base = l3_iov_base;
-    fragment[NET_TX_PKT_FRAGMENT_L3_HDR_POS].iov_len = l3_iov_len;
+    fragment[NET_TX_PKT_VHDR_FRAG].iov_base = &virt_hdr;
+    fragment[NET_TX_PKT_VHDR_FRAG].iov_len = sizeof(virt_hdr);
+    fragment[NET_TX_PKT_L2HDR_FRAG] = pkt->vec[NET_TX_PKT_L2HDR_FRAG];
+    fragment[NET_TX_PKT_L3HDR_FRAG] = pkt->vec[NET_TX_PKT_L3HDR_FRAG];
+
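+    /*
+     * TCP segmentation replicates the L2/L3/TCP headers in front of each
+     * gso_size-sized chunk of payload; UDP falls back to classic IPv4
+     * fragmentation of the datagram, checksummed once up front.
+     */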
+    switch (gso_type) {
+    case VIRTIO_NET_HDR_GSO_TCPV4:
+    case VIRTIO_NET_HDR_GSO_TCPV6:
+        if (!net_tx_pkt_tcp_fragment_init(pkt, fragment, &pl_idx, &l4hdr_len,
+                                          &src_idx, &src_offset, &src_len)) {
+            return false;
+        }
+        break;
+
+    case VIRTIO_NET_HDR_GSO_UDP:
+        net_tx_pkt_do_sw_csum(pkt, &pkt->vec[NET_TX_PKT_L2HDR_FRAG],
+                              pkt->payload_frags + NET_TX_PKT_PL_START_FRAG - 1,
+                              pkt->payload_len);
+        net_tx_pkt_udp_fragment_init(pkt, &pl_idx, &l4hdr_len,
+                                     &src_idx, &src_offset, &src_len);
+        break;
 
+    default:
+        abort();
+    }
 
     /* Put as much data as possible and send */
-    do {
-        fragment_len = net_tx_pkt_fetch_fragment(pkt, &src_idx, &src_offset,
-            fragment, &dst_idx);
+    while (true) {
+        dst_idx = pl_idx;
+        fragment_len = net_tx_pkt_fetch_fragment(pkt,
+            &src_idx, &src_offset, src_len, fragment, &dst_idx);
+        if (!fragment_len) {
+            break;
+        }
 
-        more_frags = (fragment_offset + fragment_len < pkt->payload_len);
+        switch (gso_type) {
+        case VIRTIO_NET_HDR_GSO_TCPV4:
+        case VIRTIO_NET_HDR_GSO_TCPV6:
+            net_tx_pkt_tcp_fragment_fix(pkt, fragment, fragment_len, gso_type);
+            net_tx_pkt_do_sw_csum(pkt, fragment + NET_TX_PKT_L2HDR_FRAG,
+                                  dst_idx - NET_TX_PKT_L2HDR_FRAG,
+                                  l4hdr_len + fragment_len);
+            break;
 
-        eth_setup_ip4_fragmentation(l2_iov_base, l2_iov_len, l3_iov_base,
-            l3_iov_len, fragment_len, fragment_offset, more_frags);
+        case VIRTIO_NET_HDR_GSO_UDP:
+            net_tx_pkt_udp_fragment_fix(pkt, fragment, fragment_offset,
+                                        fragment_len);
+            break;
+        }
 
-        eth_fix_ip4_checksum(l3_iov_base, l3_iov_len);
+        callback(context,
+                 fragment + NET_TX_PKT_L2HDR_FRAG, dst_idx - NET_TX_PKT_L2HDR_FRAG,
+                 fragment + NET_TX_PKT_VHDR_FRAG, dst_idx - NET_TX_PKT_VHDR_FRAG);
 
-        net_tx_pkt_sendv(pkt, nc, fragment, dst_idx);
+        if (gso_type == VIRTIO_NET_HDR_GSO_TCPV4 ||
+            gso_type == VIRTIO_NET_HDR_GSO_TCPV6) {
+            net_tx_pkt_tcp_fragment_advance(pkt, fragment, fragment_len,
+                                            gso_type);
+        }
 
         fragment_offset += fragment_len;
+    }
 
-    } while (fragment_len && more_frags);
+    if (gso_type == VIRTIO_NET_HDR_GSO_TCPV4 ||
+        gso_type == VIRTIO_NET_HDR_GSO_TCPV6) {
+        net_tx_pkt_tcp_fragment_deinit(fragment);
+    }
 
     return true;
 }
 
 bool net_tx_pkt_send(struct NetTxPkt *pkt, NetClientState *nc)
 {
-    assert(pkt);
+    bool offload = qemu_get_using_vnet_hdr(nc->peer);
+    return net_tx_pkt_send_custom(pkt, offload, net_tx_pkt_sendv, nc);
+}
 
-    if (!pkt->has_virt_hdr &&
-        pkt->virt_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
-        net_tx_pkt_do_sw_csum(pkt);
-    }
+bool net_tx_pkt_send_custom(struct NetTxPkt *pkt, bool offload,
+                            NetTxPktCallback callback, void *context)
+{
+    assert(pkt);
 
     /*
      * Since underlying infrastructure does not support IP datagrams longer
@@ -629,26 +807,22 @@ bool net_tx_pkt_send(struct NetTxPkt *pkt, NetClientState *nc)
         }
     }
 
-    if (pkt->has_virt_hdr ||
-        pkt->virt_hdr.gso_type == VIRTIO_NET_HDR_GSO_NONE) {
+    if (offload || pkt->virt_hdr.gso_type == VIRTIO_NET_HDR_GSO_NONE) {
+        if (!offload && pkt->virt_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
+            net_tx_pkt_do_sw_csum(pkt, &pkt->vec[NET_TX_PKT_L2HDR_FRAG],
+                                  pkt->payload_frags + NET_TX_PKT_PL_START_FRAG - 1,
+                                  pkt->payload_len);
+        }
+
         net_tx_pkt_fix_ip6_payload_len(pkt);
-        net_tx_pkt_sendv(pkt, nc, pkt->vec,
-            pkt->payload_frags + NET_TX_PKT_PL_START_FRAG);
+        callback(context, pkt->vec + NET_TX_PKT_L2HDR_FRAG,
+                 pkt->payload_frags + NET_TX_PKT_PL_START_FRAG - NET_TX_PKT_L2HDR_FRAG,
+                 pkt->vec + NET_TX_PKT_VHDR_FRAG,
+                 pkt->payload_frags + NET_TX_PKT_PL_START_FRAG - NET_TX_PKT_VHDR_FRAG);
         return true;
     }
 
-    return net_tx_pkt_do_sw_fragmentation(pkt, nc);
-}
-
-bool net_tx_pkt_send_loopback(struct NetTxPkt *pkt, NetClientState *nc)
-{
-    bool res;
-
-    pkt->is_loopback = true;
-    res = net_tx_pkt_send(pkt, nc);
-    pkt->is_loopback = false;
-
-    return res;
+    return net_tx_pkt_do_sw_fragmentation(pkt, callback, context);
 }
 
 void net_tx_pkt_fix_ip6_payload_len(struct NetTxPkt *pkt)
diff --git a/hw/net/net_tx_pkt.h b/hw/net/net_tx_pkt.h
index 4ec8bbe9bd..f57b4e034b 100644
--- a/hw/net/net_tx_pkt.h
+++ b/hw/net/net_tx_pkt.h
@@ -26,16 +26,17 @@
 
 struct NetTxPkt;
 
+typedef void (*NetTxPktCallback)(void *, const struct iovec *, int,
+                                 const struct iovec *, int);
+
 /**
  * Init function for tx packet functionality
  *
  * @pkt:            packet pointer
  * @pci_dev:        PCI device processing this packet
  * @max_frags:      max tx ip fragments
- * @has_virt_hdr:   device uses virtio header.
  */
 void net_tx_pkt_init(struct NetTxPkt **pkt, PCIDevice *pci_dev,
-    uint32_t max_frags, bool has_virt_hdr);
+    uint32_t max_frags);
 
 /**
  * Clean all tx packet resources.
@@ -59,9 +60,10 @@ struct virtio_net_hdr *net_tx_pkt_get_vhdr(struct NetTxPkt *pkt);
  * @tso_enable:     TSO enabled
  * @csum_enable:    CSO enabled
  * @gso_size:       MSS size for TSO
+ * @ret:            operation result
  *
  */
-void net_tx_pkt_build_vheader(struct NetTxPkt *pkt, bool tso_enable,
+bool net_tx_pkt_build_vheader(struct NetTxPkt *pkt, bool tso_enable,
     bool csum_enable, uint32_t gso_size);
 
 /**
@@ -161,15 +163,16 @@ void net_tx_pkt_reset(struct NetTxPkt *pkt);
 bool net_tx_pkt_send(struct NetTxPkt *pkt, NetClientState *nc);
 
 /**
-* Redirect packet directly to receive path (emulate loopback phy).
-* Handles sw offloads if vhdr is not supported.
-*
-* @pkt:            packet
-* @nc:             NetClientState
-* @ret:            operation result
-*
-*/
-bool net_tx_pkt_send_loopback(struct NetTxPkt *pkt, NetClientState *nc);
+ * Send packet with a custom function.
+ *
+ * @pkt:            packet
+ * @offload:        whether the callback implements offloading
+ * @callback:       a function to be called back for each transformed packet
+ * @context:        a pointer to be passed to the callback
+ * @ret:            operation result
+ */
+bool net_tx_pkt_send_custom(struct NetTxPkt *pkt, bool offload,
+                            NetTxPktCallback callback, void *context);
 
 /**
  * parse raw packet data and analyze offload requirements.
diff --git a/hw/net/trace-events b/hw/net/trace-events
index 4c0ec3fda1..65753411fc 100644
--- a/hw/net/trace-events
+++ b/hw/net/trace-events
@@ -61,7 +61,7 @@ pcnet_ioport_read(void *opaque, uint64_t addr, unsigned size) "opaque=%p addr=0x
 pcnet_ioport_write(void *opaque, uint64_t addr, uint64_t data, unsigned size) "opaque=%p addr=0x%"PRIx64" data=0x%"PRIx64" size=%d"
 
 # net_rx_pkt.c
-net_rx_pkt_parsed(bool ip4, bool ip6, bool udp, bool tcp, size_t l3o, size_t l4o, size_t l5o) "RX packet parsed: ip4: %d, ip6: %d, udp: %d, tcp: %d, l3 offset: %zu, l4 offset: %zu, l5 offset: %zu"
+net_rx_pkt_parsed(bool ip4, bool ip6, int l4proto, size_t l3o, size_t l4o, size_t l5o) "RX packet parsed: ip4: %d, ip6: %d, l4 protocol: %d, l3 offset: %zu, l4 offset: %zu, l5 offset: %zu"
 net_rx_pkt_l4_csum_validate_entry(void) "Starting L4 checksum validation"
 net_rx_pkt_l4_csum_validate_not_xxp(void) "Not a TCP/UDP packet"
 net_rx_pkt_l4_csum_validate_udp_with_no_checksum(void) "UDP packet without checksum"
@@ -165,8 +165,8 @@ e1000e_rx_descr(int ridx, uint64_t base, uint8_t len) "Next RX descriptor: ring
 e1000e_rx_set_rctl(uint32_t rctl) "RCTL = 0x%x"
 e1000e_rx_receive_iov(int iovcnt) "Received vector of %d fragments"
 e1000e_rx_flt_dropped(void) "Received packet dropped by RX filter"
-e1000e_rx_written_to_guest(uint32_t causes) "Received packet written to guest (ICR causes %u)"
-e1000e_rx_not_written_to_guest(uint32_t causes) "Received packet NOT written to guest (ICR causes %u)"
+e1000e_rx_written_to_guest(int queue_idx) "Received packet written to guest (queue %d)"
+e1000e_rx_not_written_to_guest(int queue_idx) "Received packet NOT written to guest (queue %d)"
 e1000e_rx_interrupt_set(uint32_t causes) "Receive interrupt set (ICR causes %u)"
 e1000e_rx_interrupt_delayed(uint32_t causes) "Receive interrupt delayed (ICR causes %u)"
 e1000e_rx_set_cso(int cso_state) "RX CSO state set to %d"
@@ -177,18 +177,16 @@ e1000e_rx_start_recv(void)
 e1000e_rx_rss_started(void) "Starting RSS processing"
 e1000e_rx_rss_disabled(void) "RSS is disabled"
 e1000e_rx_rss_type(uint32_t type) "RSS type is %u"
-e1000e_rx_rss_ip4(bool isfragment, bool istcp, uint32_t mrqc, bool tcpipv4_enabled, bool ipv4_enabled) "RSS IPv4: fragment %d, tcp %d, mrqc 0x%X, tcpipv4 enabled %d, ipv4 enabled %d"
+e1000e_rx_rss_ip4(int l4hdr_proto, uint32_t mrqc, bool tcpipv4_enabled, bool ipv4_enabled) "RSS IPv4: L4 header protocol %d, mrqc 0x%X, tcpipv4 enabled %d, ipv4 enabled %d"
 e1000e_rx_rss_ip6_rfctl(uint32_t rfctl) "RSS IPv6: rfctl 0x%X"
-e1000e_rx_rss_ip6(bool ex_dis, bool new_ex_dis, bool istcp, bool has_ext_headers, bool ex_dst_valid, bool ex_src_valid, uint32_t mrqc, bool tcpipv6_enabled, bool ipv6ex_enabled, bool ipv6_enabled) "RSS IPv6: ex_dis: %d, new_ex_dis: %d, tcp %d, has_ext_headers %d, ex_dst_valid %d, ex_src_valid %d, mrqc 0x%X, tcpipv6 enabled %d, ipv6ex enabled %d, ipv6 enabled %d"
-e1000e_rx_rss_dispatched_to_queue(int queue_idx) "Packet being dispatched to queue %d"
+e1000e_rx_rss_ip6(bool ex_dis, bool new_ex_dis, int l4hdr_proto, bool has_ext_headers, bool ex_dst_valid, bool ex_src_valid, uint32_t mrqc, bool tcpipv6_enabled, bool ipv6ex_enabled, bool ipv6_enabled) "RSS IPv6: ex_dis: %d, new_ex_dis: %d, L4 header protocol %d, has_ext_headers %d, ex_dst_valid %d, ex_src_valid %d, mrqc 0x%X, tcpipv6 enabled %d, ipv6ex enabled %d, ipv6 enabled %d"
 
-e1000e_rx_metadata_protocols(bool isip4, bool isip6, bool isudp, bool istcp) "protocols: ip4: %d, ip6: %d, udp: %d, tcp: %d"
+e1000e_rx_metadata_protocols(bool hasip4, bool hasip6, int l4hdr_protocol) "protocols: ip4: %d, ip6: %d, l4hdr: %d"
 e1000e_rx_metadata_vlan(uint16_t vlan_tag) "VLAN tag is 0x%X"
 e1000e_rx_metadata_rss(uint32_t rss, uint32_t mrq) "RSS data: rss: 0x%X, mrq: 0x%X"
 e1000e_rx_metadata_ip_id(uint16_t ip_id) "the IPv4 ID is 0x%X"
 e1000e_rx_metadata_ack(void) "the packet is TCP ACK"
 e1000e_rx_metadata_pkt_type(uint32_t pkt_type) "the packet type is %u"
-e1000e_rx_metadata_no_virthdr(void) "the packet has no virt-header"
 e1000e_rx_metadata_virthdr_no_csum_info(void) "virt-header does not contain checksum info"
 e1000e_rx_metadata_l3_cso_disabled(void) "IP4 CSO is disabled"
 e1000e_rx_metadata_l4_cso_disabled(void) "TCP/UDP CSO is disabled"
@@ -201,10 +199,8 @@ e1000e_rx_metadata_ipv6_filtering_disabled(void) "IPv6 RX filtering disabled by
 e1000e_vlan_vet(uint16_t vet) "Setting VLAN ethernet type 0x%X"
 
 e1000e_irq_msi_notify(uint32_t cause) "MSI notify 0x%x"
-e1000e_irq_throttling_no_pending_interrupts(void) "No pending interrupts to notify"
 e1000e_irq_msi_notify_postponed(void) "Sending MSI postponed by ITR"
 e1000e_irq_legacy_notify_postponed(void) "Raising legacy IRQ postponed by ITR"
-e1000e_irq_throttling_no_pending_vec(int idx) "No pending interrupts for vector %d"
 e1000e_irq_msix_notify_postponed_vec(int idx) "Sending MSI-X postponed by EITR[%d]"
 e1000e_irq_legacy_notify(bool level) "IRQ line state: %d"
 e1000e_irq_msix_notify_vec(uint32_t vector) "MSI-X notify vector 0x%x"
@@ -253,7 +249,7 @@ e1000e_vm_state_stopped(void) "VM state is stopped"
 # e1000e.c
 e1000e_cb_pci_realize(void) "E1000E PCI realize entry"
 e1000e_cb_pci_uninit(void) "E1000E PCI unit entry"
-e1000e_cb_qdev_reset(void) "E1000E qdev reset entry"
+e1000e_cb_qdev_reset_hold(void) "E1000E qdev reset hold"
 e1000e_cb_pre_save(void) "E1000E pre save entry"
 e1000e_cb_post_load(void) "E1000E post load entry"
 
@@ -274,6 +270,38 @@ e1000e_msix_use_vector_fail(uint32_t vec, int32_t res) "Failed to use MSI-X vect
 e1000e_mac_set_permanent(uint8_t b0, uint8_t b1, uint8_t b2, uint8_t b3, uint8_t b4, uint8_t b5) "Set permanent MAC: %02x:%02x:%02x:%02x:%02x:%02x"
 e1000e_cfg_support_virtio(bool support) "Virtio header supported: %d"
 
+# igb.c
+igb_write_config(uint32_t address, uint32_t val, int len) "CONFIG write 0x%"PRIx32", value: 0x%"PRIx32", len: %"PRId32
+igbvf_write_config(uint32_t address, uint32_t val, int len) "CONFIG write 0x%"PRIx32", value: 0x%"PRIx32", len: %"PRId32
+
+# igb_core.c
+igb_core_mdic_read(uint32_t addr, uint32_t data) "MDIC READ: PHY[%u] = 0x%x"
+igb_core_mdic_read_unhandled(uint32_t addr) "MDIC READ: PHY[%u] UNHANDLED"
+igb_core_mdic_write(uint32_t addr, uint32_t data) "MDIC WRITE: PHY[%u] = 0x%x"
+igb_core_mdic_write_unhandled(uint32_t addr) "MDIC WRITE: PHY[%u] UNHANDLED"
+
+igb_rx_desc_buff_size(uint32_t b) "buffer size: %u"
+igb_rx_desc_buff_write(uint64_t addr, uint16_t offset, const void* source, uint32_t len) "addr: 0x%"PRIx64", offset: %u, from: %p, length: %u"
+
+igb_rx_metadata_rss(uint32_t rss) "RSS data: 0x%X"
+
+igb_irq_icr_clear_gpie_nsicr(void) "Clearing ICR on read due to GPIE.NSICR enabled"
+igb_irq_icr_write(uint32_t bits, uint32_t old_icr, uint32_t new_icr) "Clearing ICR bits 0x%x: 0x%x --> 0x%x"
+igb_irq_set_iam(uint32_t icr) "Update IAM: 0x%x"
+igb_irq_read_iam(uint32_t icr) "Current IAM: 0x%x"
+igb_irq_write_eics(uint32_t val, bool msix) "Update EICS: 0x%x MSI-X: %d"
+igb_irq_write_eims(uint32_t val, bool msix) "Update EIMS: 0x%x MSI-X: %d"
+igb_irq_write_eimc(uint32_t val, uint32_t eims, bool msix) "Update EIMC: 0x%x EIMS: 0x%x MSI-X: %d"
+igb_irq_write_eiac(uint32_t val) "Update EIAC: 0x%x"
+igb_irq_write_eiam(uint32_t val, bool msix) "Update EIAM: 0x%x MSI-X: %d"
+igb_irq_write_eicr(uint32_t val, bool msix) "Update EICR: 0x%x MSI-X: %d"
+igb_irq_eitr_set(uint32_t eitr_num, uint32_t val) "EITR[%u] = 0x%x"
+igb_set_pfmailbox(uint32_t vf_num, uint32_t val) "PFMailbox[%d]: 0x%x"
+igb_set_vfmailbox(uint32_t vf_num, uint32_t val) "VFMailbox[%d]: 0x%x"
+
+# igbvf.c
+igbvf_wrn_io_addr_unknown(uint64_t addr) "IO unknown register 0x%"PRIx64
+
 # spapr_llan.c
 spapr_vlan_get_rx_bd_from_pool_found(int pool, int32_t count, uint32_t rx_bufs) "pool=%d count=%"PRId32" rxbufs=%"PRIu32
 spapr_vlan_get_rx_bd_from_page(int buf_ptr, uint64_t bd) "use_buf_ptr=%d bd=0x%016"PRIx64
diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index 09d5c7a664..53e1c32643 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -1746,39 +1746,61 @@ static int receive_filter(VirtIONet *n, const uint8_t *buf, int size)
     return 0;
 }
 
-static uint8_t virtio_net_get_hash_type(bool isip4,
-                                        bool isip6,
-                                        bool isudp,
-                                        bool istcp,
+static uint8_t virtio_net_get_hash_type(bool hasip4,
+                                        bool hasip6,
+                                        EthL4HdrProto l4hdr_proto,
                                         uint32_t types)
 {
-    if (isip4) {
-        if (istcp && (types & VIRTIO_NET_RSS_HASH_TYPE_TCPv4)) {
-            return NetPktRssIpV4Tcp;
-        }
-        if (isudp && (types & VIRTIO_NET_RSS_HASH_TYPE_UDPv4)) {
-            return NetPktRssIpV4Udp;
+    if (hasip4) {
+        switch (l4hdr_proto) {
+        case ETH_L4_HDR_PROTO_TCP:
+            if (types & VIRTIO_NET_RSS_HASH_TYPE_TCPv4) {
+                return NetPktRssIpV4Tcp;
+            }
+            break;
+
+        case ETH_L4_HDR_PROTO_UDP:
+            if (types & VIRTIO_NET_RSS_HASH_TYPE_UDPv4) {
+                return NetPktRssIpV4Udp;
+            }
+            break;
+
+        default:
+            break;
         }
+
         if (types & VIRTIO_NET_RSS_HASH_TYPE_IPv4) {
             return NetPktRssIpV4;
         }
-    } else if (isip6) {
-        uint32_t mask = VIRTIO_NET_RSS_HASH_TYPE_TCP_EX |
-                        VIRTIO_NET_RSS_HASH_TYPE_TCPv6;
+    } else if (hasip6) {
+        switch (l4hdr_proto) {
+        case ETH_L4_HDR_PROTO_TCP:
+            if (types & VIRTIO_NET_RSS_HASH_TYPE_TCP_EX) {
+                return NetPktRssIpV6TcpEx;
+            }
+            if (types & VIRTIO_NET_RSS_HASH_TYPE_TCPv6) {
+                return NetPktRssIpV6Tcp;
+            }
+            break;
+
+        case ETH_L4_HDR_PROTO_UDP:
+            if (types & VIRTIO_NET_RSS_HASH_TYPE_UDP_EX) {
+                return NetPktRssIpV6UdpEx;
+            }
+            if (types & VIRTIO_NET_RSS_HASH_TYPE_UDPv6) {
+                return NetPktRssIpV6Udp;
+            }
+            break;
 
-        if (istcp && (types & mask)) {
-            return (types & VIRTIO_NET_RSS_HASH_TYPE_TCP_EX) ?
-                NetPktRssIpV6TcpEx : NetPktRssIpV6Tcp;
+        default:
+            break;
         }
-        mask = VIRTIO_NET_RSS_HASH_TYPE_UDP_EX | VIRTIO_NET_RSS_HASH_TYPE_UDPv6;
-        if (isudp && (types & mask)) {
-            return (types & VIRTIO_NET_RSS_HASH_TYPE_UDP_EX) ?
-                NetPktRssIpV6UdpEx : NetPktRssIpV6Udp;
+
+        if (types & VIRTIO_NET_RSS_HASH_TYPE_IP_EX) {
+            return NetPktRssIpV6Ex;
         }
-        mask = VIRTIO_NET_RSS_HASH_TYPE_IP_EX | VIRTIO_NET_RSS_HASH_TYPE_IPv6;
-        if (types & mask) {
-            return (types & VIRTIO_NET_RSS_HASH_TYPE_IP_EX) ?
-                NetPktRssIpV6Ex : NetPktRssIpV6;
+        if (types & VIRTIO_NET_RSS_HASH_TYPE_IPv6) {
+            return NetPktRssIpV6;
         }
     }
     return 0xff;
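
A worked example of the new switch, as a sketch: for a TCP-over-IPv4 packet with only the plain IPv4 hash enabled, the TCP case breaks out without a match and control falls through to the generic IPv4 type.

    uint8_t t = virtio_net_get_hash_type(true, false, ETH_L4_HDR_PROTO_TCP,
                                         VIRTIO_NET_RSS_HASH_TYPE_IPv4);
    /* t == NetPktRssIpV4 */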
@@ -1800,7 +1822,8 @@ static int virtio_net_process_rss(NetClientState *nc, const uint8_t *buf,
     struct NetRxPkt *pkt = n->rx_pkt;
     uint8_t net_hash_type;
     uint32_t hash;
-    bool isip4, isip6, isudp, istcp;
+    bool hasip4, hasip6;
+    EthL4HdrProto l4hdr_proto;
     static const uint8_t reports[NetPktRssIpV6UdpEx + 1] = {
         VIRTIO_NET_HASH_REPORT_IPv4,
         VIRTIO_NET_HASH_REPORT_TCPv4,
@@ -1815,14 +1838,8 @@ static int virtio_net_process_rss(NetClientState *nc, const uint8_t *buf,
 
     net_rx_pkt_set_protocols(pkt, buf + n->host_hdr_len,
                              size - n->host_hdr_len);
-    net_rx_pkt_get_protocols(pkt, &isip4, &isip6, &isudp, &istcp);
-    if (isip4 && (net_rx_pkt_get_ip4_info(pkt)->fragment)) {
-        istcp = isudp = false;
-    }
-    if (isip6 && (net_rx_pkt_get_ip6_info(pkt)->fragment)) {
-        istcp = isudp = false;
-    }
-    net_hash_type = virtio_net_get_hash_type(isip4, isip6, isudp, istcp,
+    net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto);
+    net_hash_type = virtio_net_get_hash_type(hasip4, hasip6, l4hdr_proto,
                                              n->rss_data.hash_types);
     if (net_hash_type > NetPktRssIpV6UdpEx) {
         if (n->rss_data.populate_hash) {
@@ -3718,7 +3735,7 @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp)
     QTAILQ_INIT(&n->rsc_chains);
     n->qdev = dev;
 
-    net_rx_pkt_init(&n->rx_pkt, false);
+    net_rx_pkt_init(&n->rx_pkt);
 
     if (virtio_has_feature(n->host_features, VIRTIO_NET_F_RSS)) {
         virtio_net_load_ebpf(n);
diff --git a/hw/net/vmxnet3.c b/hw/net/vmxnet3.c
index 56559cda24..1068b80868 100644
--- a/hw/net/vmxnet3.c
+++ b/hw/net/vmxnet3.c
@@ -440,19 +440,19 @@ vmxnet3_setup_tx_offloads(VMXNET3State *s)
 {
     switch (s->offload_mode) {
     case VMXNET3_OM_NONE:
-        net_tx_pkt_build_vheader(s->tx_pkt, false, false, 0);
-        break;
+        return net_tx_pkt_build_vheader(s->tx_pkt, false, false, 0);
 
     case VMXNET3_OM_CSUM:
-        net_tx_pkt_build_vheader(s->tx_pkt, false, true, 0);
         VMW_PKPRN("L4 CSO requested\n");
-        break;
+        return net_tx_pkt_build_vheader(s->tx_pkt, false, true, 0);
 
     case VMXNET3_OM_TSO:
-        net_tx_pkt_build_vheader(s->tx_pkt, true, true,
-            s->cso_or_gso_size);
-        net_tx_pkt_update_ip_checksums(s->tx_pkt);
         VMW_PKPRN("GSO offload requested.");
+        if (!net_tx_pkt_build_vheader(s->tx_pkt, true, true,
+            s->cso_or_gso_size)) {
+            return false;
+        }
+        net_tx_pkt_update_ip_checksums(s->tx_pkt);
         break;
 
     default:
@@ -847,21 +847,20 @@ static void vmxnet3_rx_need_csum_calculate(struct NetRxPkt *pkt,
                                            size_t pkt_len)
 {
     struct virtio_net_hdr *vhdr;
-    bool isip4, isip6, istcp, isudp;
+    bool hasip4, hasip6;
+    EthL4HdrProto l4hdr_proto;
     uint8_t *data;
     int len;
 
-    if (!net_rx_pkt_has_virt_hdr(pkt)) {
-        return;
-    }
-
     vhdr = net_rx_pkt_get_vhdr(pkt);
     if (!VMXNET_FLAG_IS_SET(vhdr->flags, VIRTIO_NET_HDR_F_NEEDS_CSUM)) {
         return;
     }
 
-    net_rx_pkt_get_protocols(pkt, &isip4, &isip6, &isudp, &istcp);
-    if (!(isip4 || isip6) || !(istcp || isudp)) {
+    net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto);
+    if (!(hasip4 || hasip6) ||
+        (l4hdr_proto != ETH_L4_HDR_PROTO_TCP &&
+         l4hdr_proto != ETH_L4_HDR_PROTO_UDP)) {
         return;
     }
 
@@ -889,7 +888,8 @@ static void vmxnet3_rx_update_descr(struct NetRxPkt *pkt,
     struct Vmxnet3_RxCompDesc *rxcd)
 {
     int csum_ok, is_gso;
-    bool isip4, isip6, istcp, isudp;
+    bool hasip4, hasip6;
+    EthL4HdrProto l4hdr_proto;
     struct virtio_net_hdr *vhdr;
     uint8_t offload_type;
 
@@ -898,10 +898,6 @@ static void vmxnet3_rx_update_descr(struct NetRxPkt *pkt,
         rxcd->tci = net_rx_pkt_get_vlan_tag(pkt);
     }
 
-    if (!net_rx_pkt_has_virt_hdr(pkt)) {
-        goto nocsum;
-    }
-
     vhdr = net_rx_pkt_get_vhdr(pkt);
     /*
      * Checksum is valid when lower level tell so or when lower level
@@ -919,16 +915,18 @@ static void vmxnet3_rx_update_descr(struct NetRxPkt *pkt,
         goto nocsum;
     }
 
-    net_rx_pkt_get_protocols(pkt, &isip4, &isip6, &isudp, &istcp);
-    if ((!istcp && !isudp) || (!isip4 && !isip6)) {
+    net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto);
+    if ((l4hdr_proto != ETH_L4_HDR_PROTO_TCP &&
+         l4hdr_proto != ETH_L4_HDR_PROTO_UDP) ||
+        (!hasip4 && !hasip6)) {
         goto nocsum;
     }
 
     rxcd->cnc = 0;
-    rxcd->v4 = isip4 ? 1 : 0;
-    rxcd->v6 = isip6 ? 1 : 0;
-    rxcd->tcp = istcp ? 1 : 0;
-    rxcd->udp = isudp ? 1 : 0;
+    rxcd->v4 = hasip4 ? 1 : 0;
+    rxcd->v6 = hasip6 ? 1 : 0;
+    rxcd->tcp = l4hdr_proto == ETH_L4_HDR_PROTO_TCP;
+    rxcd->udp = l4hdr_proto == ETH_L4_HDR_PROTO_UDP;
     rxcd->fcs = rxcd->tuc = rxcd->ipc = 1;
     return;
 
@@ -1521,9 +1519,8 @@ static void vmxnet3_activate_device(VMXNET3State *s)
 
     /* Preallocate TX packet wrapper */
     VMW_CFPRN("Max TX fragments is %u", s->max_tx_frags);
-    net_tx_pkt_init(&s->tx_pkt, PCI_DEVICE(s),
-                    s->max_tx_frags, s->peer_has_vhdr);
-    net_rx_pkt_init(&s->rx_pkt, s->peer_has_vhdr);
+    net_tx_pkt_init(&s->tx_pkt, PCI_DEVICE(s), s->max_tx_frags);
+    net_rx_pkt_init(&s->rx_pkt);
 
     /* Read rings memory locations for RX queues */
     for (i = 0; i < s->rxq_num; i++) {
@@ -2402,9 +2399,8 @@ static int vmxnet3_post_load(void *opaque, int version_id)
 {
     VMXNET3State *s = opaque;
 
-    net_tx_pkt_init(&s->tx_pkt, PCI_DEVICE(s),
-                    s->max_tx_frags, s->peer_has_vhdr);
-    net_rx_pkt_init(&s->rx_pkt, s->peer_has_vhdr);
+    net_tx_pkt_init(&s->tx_pkt, PCI_DEVICE(s), s->max_tx_frags);
+    net_rx_pkt_init(&s->rx_pkt);
 
     if (s->msix_used) {
         vmxnet3_use_msix_vectors(s, VMXNET3_MAX_INTRS);
diff --git a/hw/net/xen_nic.c b/hw/net/xen_nic.c
index 7d92c2d022..9bbf6599fc 100644
--- a/hw/net/xen_nic.c
+++ b/hw/net/xen_nic.c
@@ -145,7 +145,7 @@ static void net_tx_packets(struct XenNetDev *netdev)
                 continue;
             }
 
-            if ((txreq.offset + txreq.size) > XC_PAGE_SIZE) {
+            if ((txreq.offset + txreq.size) > XEN_PAGE_SIZE) {
                 xen_pv_printf(&netdev->xendev, 0, "error: page crossing\n");
                 net_tx_error(netdev, &txreq, rc);
                 continue;
@@ -171,7 +171,7 @@ static void net_tx_packets(struct XenNetDev *netdev)
             if (txreq.flags & NETTXF_csum_blank) {
                 /* have read-only mapping -> can't fill checksum in-place */
                 if (!tmpbuf) {
-                    tmpbuf = g_malloc(XC_PAGE_SIZE);
+                    tmpbuf = g_malloc(XEN_PAGE_SIZE);
                 }
                 memcpy(tmpbuf, page + txreq.offset, txreq.size);
                 net_checksum_calculate(tmpbuf, txreq.size, CSUM_ALL);
@@ -181,7 +181,7 @@ static void net_tx_packets(struct XenNetDev *netdev)
                 qemu_send_packet(qemu_get_queue(netdev->nic),
                                  page + txreq.offset, txreq.size);
             }
-            xen_be_unmap_grant_ref(&netdev->xendev, page);
+            xen_be_unmap_grant_ref(&netdev->xendev, page, txreq.gref);
             net_tx_response(netdev, &txreq, NETIF_RSP_OKAY);
         }
         if (!netdev->tx_work) {
@@ -243,9 +243,9 @@ static ssize_t net_rx_packet(NetClientState *nc, const uint8_t *buf, size_t size
     if (rc == rp || RING_REQUEST_CONS_OVERFLOW(&netdev->rx_ring, rc)) {
         return 0;
     }
-    if (size > XC_PAGE_SIZE - NET_IP_ALIGN) {
+    if (size > XEN_PAGE_SIZE - NET_IP_ALIGN) {
         xen_pv_printf(&netdev->xendev, 0, "packet too big (%lu > %ld)",
-                      (unsigned long)size, XC_PAGE_SIZE - NET_IP_ALIGN);
+                      (unsigned long)size, XEN_PAGE_SIZE - NET_IP_ALIGN);
         return -1;
     }
 
@@ -261,7 +261,7 @@ static ssize_t net_rx_packet(NetClientState *nc, const uint8_t *buf, size_t size
         return -1;
     }
     memcpy(page + NET_IP_ALIGN, buf, size);
-    xen_be_unmap_grant_ref(&netdev->xendev, page);
+    xen_be_unmap_grant_ref(&netdev->xendev, page, rxreq.gref);
     net_rx_response(netdev, &rxreq, NETIF_RSP_OKAY, NET_IP_ALIGN, size, 0);
 
     return size;
@@ -343,12 +343,13 @@ static int net_connect(struct XenLegacyDevice *xendev)
                                        netdev->rx_ring_ref,
                                        PROT_READ | PROT_WRITE);
     if (!netdev->rxs) {
-        xen_be_unmap_grant_ref(&netdev->xendev, netdev->txs);
+        xen_be_unmap_grant_ref(&netdev->xendev, netdev->txs,
+                               netdev->tx_ring_ref);
         netdev->txs = NULL;
         return -1;
     }
-    BACK_RING_INIT(&netdev->tx_ring, netdev->txs, XC_PAGE_SIZE);
-    BACK_RING_INIT(&netdev->rx_ring, netdev->rxs, XC_PAGE_SIZE);
+    BACK_RING_INIT(&netdev->tx_ring, netdev->txs, XEN_PAGE_SIZE);
+    BACK_RING_INIT(&netdev->rx_ring, netdev->rxs, XEN_PAGE_SIZE);
 
     xen_be_bind_evtchn(&netdev->xendev);
 
@@ -368,11 +369,13 @@ static void net_disconnect(struct XenLegacyDevice *xendev)
     xen_pv_unbind_evtchn(&netdev->xendev);
 
     if (netdev->txs) {
-        xen_be_unmap_grant_ref(&netdev->xendev, netdev->txs);
+        xen_be_unmap_grant_ref(&netdev->xendev, netdev->txs,
+                               netdev->tx_ring_ref);
         netdev->txs = NULL;
     }
     if (netdev->rxs) {
-        xen_be_unmap_grant_ref(&netdev->xendev, netdev->rxs);
+        xen_be_unmap_grant_ref(&netdev->xendev, netdev->rxs,
+                               netdev->rx_ring_ref);
         netdev->rxs = NULL;
     }
 }
diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index f25cc2c235..49c1210fce 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -238,6 +238,8 @@ static const bool nvme_feature_support[NVME_FID_MAX] = {
     [NVME_TIMESTAMP]                = true,
     [NVME_HOST_BEHAVIOR_SUPPORT]    = true,
     [NVME_COMMAND_SET_PROFILE]      = true,
+    [NVME_FDP_MODE]                 = true,
+    [NVME_FDP_EVENTS]               = true,
 };
 
 static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
@@ -249,6 +251,8 @@ static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
     [NVME_TIMESTAMP]                = NVME_FEAT_CAP_CHANGE,
     [NVME_HOST_BEHAVIOR_SUPPORT]    = NVME_FEAT_CAP_CHANGE,
     [NVME_COMMAND_SET_PROFILE]      = NVME_FEAT_CAP_CHANGE,
+    [NVME_FDP_MODE]                 = NVME_FEAT_CAP_CHANGE,
+    [NVME_FDP_EVENTS]               = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
 };
 
 static const uint32_t nvme_cse_acs[256] = {
@@ -266,6 +270,8 @@ static const uint32_t nvme_cse_acs[256] = {
     [NVME_ADM_CMD_VIRT_MNGMT]       = NVME_CMD_EFF_CSUPP,
     [NVME_ADM_CMD_DBBUF_CONFIG]     = NVME_CMD_EFF_CSUPP,
     [NVME_ADM_CMD_FORMAT_NVM]       = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
+    [NVME_ADM_CMD_DIRECTIVE_RECV]   = NVME_CMD_EFF_CSUPP,
+    [NVME_ADM_CMD_DIRECTIVE_SEND]   = NVME_CMD_EFF_CSUPP,
 };
 
 static const uint32_t nvme_cse_iocs_none[256];
@@ -279,6 +285,8 @@ static const uint32_t nvme_cse_iocs_nvm[256] = {
     [NVME_CMD_VERIFY]               = NVME_CMD_EFF_CSUPP,
     [NVME_CMD_COPY]                 = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
     [NVME_CMD_COMPARE]              = NVME_CMD_EFF_CSUPP,
+    [NVME_CMD_IO_MGMT_RECV]         = NVME_CMD_EFF_CSUPP,
+    [NVME_CMD_IO_MGMT_SEND]         = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
 };
 
 static const uint32_t nvme_cse_iocs_zoned[256] = {
@@ -297,12 +305,66 @@ static const uint32_t nvme_cse_iocs_zoned[256] = {
 
 static void nvme_process_sq(void *opaque);
 static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst);
+static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n);
 
 static uint16_t nvme_sqid(NvmeRequest *req)
 {
     return le16_to_cpu(req->sq->sqid);
 }
 
+static inline uint16_t nvme_make_pid(NvmeNamespace *ns, uint16_t rg,
+                                     uint16_t ph)
+{
+    uint16_t rgif = ns->endgrp->fdp.rgif;
+
+    if (!rgif) {
+        return ph;
+    }
+
+    return (rg << (16 - rgif)) | ph;
+}
+
+static inline bool nvme_ph_valid(NvmeNamespace *ns, uint16_t ph)
+{
+    return ph < ns->fdp.nphs;
+}
+
+static inline bool nvme_rg_valid(NvmeEnduranceGroup *endgrp, uint16_t rg)
+{
+    return rg < endgrp->fdp.nrg;
+}
+
+static inline uint16_t nvme_pid2ph(NvmeNamespace *ns, uint16_t pid)
+{
+    uint16_t rgif = ns->endgrp->fdp.rgif;
+
+    if (!rgif) {
+        return pid;
+    }
+
+    return pid & ((1 << (15 - rgif)) - 1);
+}
+
+static inline uint16_t nvme_pid2rg(NvmeNamespace *ns, uint16_t pid)
+{
+    uint16_t rgif = ns->endgrp->fdp.rgif;
+
+    if (!rgif) {
+        return 0;
+    }
+
+    return pid >> (16 - rgif);
+}
+
+static inline bool nvme_parse_pid(NvmeNamespace *ns, uint16_t pid,
+                                  uint16_t *ph, uint16_t *rg)
+{
+    *rg = nvme_pid2rg(ns, pid);
+    *ph = nvme_pid2ph(ns, pid);
+
+    return nvme_ph_valid(ns, *ph) && nvme_rg_valid(ns->endgrp, *rg);
+}
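
A quick worked example of the placement identifier layout, assuming an endurance group with rgif == 4 (illustrative values only):

    /* the top rgif bits carry the reclaim group, the rest the handle */
    uint16_t pid = nvme_make_pid(ns, 0x3, 0x005); /* (0x3 << 12) | 0x005 == 0x3005 */

    nvme_pid2rg(ns, pid); /* == 0x3, from pid >> (16 - rgif) */
    nvme_pid2ph(ns, pid); /* == 0x005, masked with (1 << (15 - rgif)) - 1 */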
+
 static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone,
                                    NvmeZoneState state)
 {
@@ -376,6 +438,69 @@ static uint16_t nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn)
     return nvme_zns_check_resources(ns, act, opn, 0);
 }
 
+static NvmeFdpEvent *nvme_fdp_alloc_event(NvmeCtrl *n, NvmeFdpEventBuffer *ebuf)
+{
+    NvmeFdpEvent *ret = NULL;
+    bool is_full = ebuf->next == ebuf->start && ebuf->nelems;
+
+    ret = &ebuf->events[ebuf->next++];
+    if (unlikely(ebuf->next == NVME_FDP_MAX_EVENTS)) {
+        ebuf->next = 0;
+    }
+    if (is_full) {
+        ebuf->start = ebuf->next;
+    } else {
+        ebuf->nelems++;
+    }
+
+    memset(ret, 0, sizeof(NvmeFdpEvent));
+    ret->timestamp = nvme_get_timestamp(n);
+
+    return ret;
+}
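
The event buffer behaves as a fixed-capacity ring that overwrites the oldest entry once full (start == next with nelems != 0). A sketch of draining it oldest-to-newest, matching the copy order used later by the log page handler:

    for (unsigned int i = 0; i < ebuf->nelems; i++) {
        NvmeFdpEvent *e = &ebuf->events[(ebuf->start + i) % NVME_FDP_MAX_EVENTS];
        /* ... consume e ... */
    }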
+
+static inline int log_event(NvmeRuHandle *ruh, uint8_t event_type)
+{
+    return (ruh->event_filter >> nvme_fdp_evf_shifts[event_type]) & 0x1;
+}
+
+static bool nvme_update_ruh(NvmeCtrl *n, NvmeNamespace *ns, uint16_t pid)
+{
+    NvmeEnduranceGroup *endgrp = ns->endgrp;
+    NvmeRuHandle *ruh;
+    NvmeReclaimUnit *ru;
+    NvmeFdpEvent *e = NULL;
+    uint16_t ph, rg, ruhid;
+
+    if (!nvme_parse_pid(ns, pid, &ph, &rg)) {
+        return false;
+    }
+
+    ruhid = ns->fdp.phs[ph];
+
+    ruh = &endgrp->fdp.ruhs[ruhid];
+    ru = &ruh->rus[rg];
+
+    if (ru->ruamw) {
+        if (log_event(ruh, FDP_EVT_RU_NOT_FULLY_WRITTEN)) {
+            e = nvme_fdp_alloc_event(n, &endgrp->fdp.host_events);
+            e->type = FDP_EVT_RU_NOT_FULLY_WRITTEN;
+            e->flags = FDPEF_PIV | FDPEF_NSIDV | FDPEF_LV;
+            e->pid = cpu_to_le16(pid);
+            e->nsid = cpu_to_le32(ns->params.nsid);
+            e->rgid = cpu_to_le16(rg);
+            e->ruhid = cpu_to_le16(ruhid);
+        }
+
+        /* log (eventual) GC overhead of prematurely swapping the RU */
+        nvme_fdp_stat_inc(&endgrp->fdp.mbmw, nvme_l2b(ns, ru->ruamw));
+    }
+
+    ru->ruamw = ruh->ruamw;
+
+    return true;
+}
+
 static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
 {
     hwaddr hi, lo;
@@ -3320,6 +3445,41 @@ invalid:
     return status | NVME_DNR;
 }
 
+static void nvme_do_write_fdp(NvmeCtrl *n, NvmeRequest *req, uint64_t slba,
+                              uint32_t nlb)
+{
+    NvmeNamespace *ns = req->ns;
+    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
+    uint64_t data_size = nvme_l2b(ns, nlb);
+    uint32_t dw12 = le32_to_cpu(req->cmd.cdw12);
+    uint8_t dtype = (dw12 >> 20) & 0xf;
+    uint16_t pid = le16_to_cpu(rw->dspec);
+    uint16_t ph, rg, ruhid;
+    NvmeReclaimUnit *ru;
+
+    if (dtype != NVME_DIRECTIVE_DATA_PLACEMENT ||
+        !nvme_parse_pid(ns, pid, &ph, &rg)) {
+        ph = 0;
+        rg = 0;
+    }
+
+    ruhid = ns->fdp.phs[ph];
+    ru = &ns->endgrp->fdp.ruhs[ruhid].rus[rg];
+
+    nvme_fdp_stat_inc(&ns->endgrp->fdp.hbmw, data_size);
+    nvme_fdp_stat_inc(&ns->endgrp->fdp.mbmw, data_size);
+
+    while (nlb) {
+        if (nlb < ru->ruamw) {
+            ru->ruamw -= nlb;
+            break;
+        }
+
+        nlb -= ru->ruamw;
+        nvme_update_ruh(n, ns, pid);
+    }
+}
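
A worked pass through the accounting loop, with illustrative numbers: if the handle's reclaim unit size gives ruamw == 8 and a write spans nlb == 20 blocks, the loop consumes 8 + 8 blocks across two reclaim unit swaps and leaves the final unit with 4 writable blocks remaining:

    /* nlb 20 >= ruamw 8 -> nlb = 12, RU swapped (ruamw reset to 8)
     * nlb 12 >= ruamw 8 -> nlb = 4,  RU swapped again
     * nlb 4  <  ruamw 8 -> ruamw = 4, loop exits */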
+
 static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
                               bool wrz)
 {
@@ -3429,6 +3589,8 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
         if (!(zone->d.za & NVME_ZA_ZRWA_VALID)) {
             zone->w_ptr += nlb;
         }
+    } else if (ns->endgrp && ns->endgrp->fdp.enabled) {
+        nvme_do_write_fdp(n, req, slba, nlb);
     }
 
     data_offset = nvme_l2b(ns, slba);
@@ -4086,6 +4248,126 @@ static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
     return status;
 }
 
+static uint16_t nvme_io_mgmt_recv_ruhs(NvmeCtrl *n, NvmeRequest *req,
+                                       size_t len)
+{
+    NvmeNamespace *ns = req->ns;
+    NvmeEnduranceGroup *endgrp;
+    NvmeRuhStatus *hdr;
+    NvmeRuhStatusDescr *ruhsd;
+    unsigned int nruhsd;
+    uint16_t rg, ph, *ruhid;
+    size_t trans_len;
+    g_autofree uint8_t *buf = NULL;
+
+    if (!n->subsys) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    if (ns->params.nsid == 0 || ns->params.nsid == 0xffffffff) {
+        return NVME_INVALID_NSID | NVME_DNR;
+    }
+
+    if (!n->subsys->endgrp.fdp.enabled) {
+        return NVME_FDP_DISABLED | NVME_DNR;
+    }
+
+    endgrp = ns->endgrp;
+
+    nruhsd = ns->fdp.nphs * endgrp->fdp.nrg;
+    trans_len = sizeof(NvmeRuhStatus) + nruhsd * sizeof(NvmeRuhStatusDescr);
+    buf = g_malloc(trans_len);
+
+    trans_len = MIN(trans_len, len);
+
+    hdr = (NvmeRuhStatus *)buf;
+    ruhsd = (NvmeRuhStatusDescr *)(buf + sizeof(NvmeRuhStatus));
+
+    hdr->nruhsd = cpu_to_le16(nruhsd);
+
+    ruhid = ns->fdp.phs;
+
+    for (ph = 0; ph < ns->fdp.nphs; ph++, ruhid++) {
+        NvmeRuHandle *ruh = &endgrp->fdp.ruhs[*ruhid];
+
+        for (rg = 0; rg < endgrp->fdp.nrg; rg++, ruhsd++) {
+            uint16_t pid = nvme_make_pid(ns, rg, ph);
+
+            ruhsd->pid = cpu_to_le16(pid);
+            ruhsd->ruhid = *ruhid;
+            ruhsd->earutr = 0;
+            ruhsd->ruamw = cpu_to_le64(ruh->rus[rg].ruamw);
+        }
+    }
+
+    return nvme_c2h(n, buf, trans_len, req);
+}
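
The returned buffer is a header followed by one descriptor per (placement handle, reclaim group) pair, placement-handle major. For instance, with nphs == 2 and nrg == 2 (illustrative):

    /* [hdr][ph0/rg0][ph0/rg1][ph1/rg0][ph1/rg1], nruhsd == 4 */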
+
+static uint16_t nvme_io_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
+{
+    NvmeCmd *cmd = &req->cmd;
+    uint32_t cdw10 = le32_to_cpu(cmd->cdw10);
+    uint32_t numd = le32_to_cpu(cmd->cdw11);
+    uint8_t mo = (cdw10 & 0xff);
+    size_t len = (numd + 1) << 2;
+
+    switch (mo) {
+    case NVME_IOMR_MO_NOP:
+        return 0;
+    case NVME_IOMR_MO_RUH_STATUS:
+        return nvme_io_mgmt_recv_ruhs(n, req, len);
+    default:
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+}
+
+static uint16_t nvme_io_mgmt_send_ruh_update(NvmeCtrl *n, NvmeRequest *req)
+{
+    NvmeCmd *cmd = &req->cmd;
+    NvmeNamespace *ns = req->ns;
+    uint32_t cdw10 = le32_to_cpu(cmd->cdw10);
+    uint16_t ret = NVME_SUCCESS;
+    uint32_t npid = (cdw10 >> 1) + 1;
+    unsigned int i = 0;
+    g_autofree uint16_t *pids = NULL;
+    uint32_t maxnpid = n->subsys->endgrp.fdp.nrg * n->subsys->endgrp.fdp.nruh;
+
+    if (unlikely(npid >= MIN(NVME_FDP_MAXPIDS, maxnpid))) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    pids = g_new(uint16_t, npid);
+
+    ret = nvme_h2c(n, pids, npid * sizeof(uint16_t), req);
+    if (ret) {
+        return ret;
+    }
+
+    for (; i < npid; i++) {
+        if (!nvme_update_ruh(n, ns, pids[i])) {
+            return NVME_INVALID_FIELD | NVME_DNR;
+        }
+    }
+
+    return ret;
+}
+
+static uint16_t nvme_io_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
+{
+    NvmeCmd *cmd = &req->cmd;
+    uint32_t cdw10 = le32_to_cpu(cmd->cdw10);
+    uint8_t mo = (cdw10 & 0xff);
+
+    switch (mo) {
+    case NVME_IOMS_MO_NOP:
+        return 0;
+    case NVME_IOMS_MO_RUH_UPDATE:
+        return nvme_io_mgmt_send_ruh_update(n, req);
+    default:
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+}
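
For both opcodes the management operation comes from the low byte of cdw10; on the receive side the buffer length is a 0's-based dword count in cdw11. A small decode sketch: a host wanting a 4096-byte RUH status buffer sets cdw11 so that (numd + 1) * 4 == 4096.

    uint32_t numd = 1023;             /* from cdw11 */
    size_t len = (numd + 1) << 2;     /* == 4096 bytes */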
+
 static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
 {
     NvmeNamespace *ns;
@@ -4162,6 +4444,10 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
         return nvme_zone_mgmt_send(n, req);
     case NVME_CMD_ZONE_MGMT_RECV:
         return nvme_zone_mgmt_recv(n, req);
+    case NVME_CMD_IO_MGMT_RECV:
+        return nvme_io_mgmt_recv(n, req);
+    case NVME_CMD_IO_MGMT_SEND:
+        return nvme_io_mgmt_send(n, req);
     default:
         assert(false);
     }
@@ -4386,8 +4672,8 @@ static void nvme_set_blk_stats(NvmeNamespace *ns, struct nvme_stats *stats)
 {
     BlockAcctStats *s = blk_get_stats(ns->blkconf.blk);
 
-    stats->units_read += s->nr_bytes[BLOCK_ACCT_READ] >> BDRV_SECTOR_BITS;
-    stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE] >> BDRV_SECTOR_BITS;
+    stats->units_read += s->nr_bytes[BLOCK_ACCT_READ];
+    stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE];
     stats->read_commands += s->nr_ops[BLOCK_ACCT_READ];
     stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE];
 }
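
With this change the accumulated statistics stay in bytes and each log page scales them at reporting time. A small arithmetic sketch for the SMART page, where one data unit is 1000 512-byte units:

    uint64_t bytes = 1 * GiB;
    uint64_t units = DIV_ROUND_UP(bytes >> BDRV_SECTOR_BITS, 1000);
    /* 1 GiB -> 2097152 sectors -> 2098 data units */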
@@ -4401,6 +4687,7 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
     uint32_t trans_len;
     NvmeNamespace *ns;
     time_t current_ms;
+    uint64_t u_read, u_written;
 
     if (off >= sizeof(smart)) {
         return NVME_INVALID_FIELD | NVME_DNR;
@@ -4427,10 +4714,11 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
     trans_len = MIN(sizeof(smart) - off, buf_len);
     smart.critical_warning = n->smart_critical_warning;
 
-    smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_read,
-                                                        1000));
-    smart.data_units_written[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_written,
-                                                           1000));
+    u_read = DIV_ROUND_UP(stats.units_read >> BDRV_SECTOR_BITS, 1000);
+    u_written = DIV_ROUND_UP(stats.units_written >> BDRV_SECTOR_BITS, 1000);
+
+    smart.data_units_read[0] = cpu_to_le64(u_read);
+    smart.data_units_written[0] = cpu_to_le64(u_written);
     smart.host_read_commands[0] = cpu_to_le64(stats.read_commands);
     smart.host_write_commands[0] = cpu_to_le64(stats.write_commands);
 
@@ -4452,6 +4740,48 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
     return nvme_c2h(n, (uint8_t *) &smart + off, trans_len, req);
 }
 
+static uint16_t nvme_endgrp_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
+                                 uint64_t off, NvmeRequest *req)
+{
+    uint32_t dw11 = le32_to_cpu(req->cmd.cdw11);
+    uint16_t endgrpid = (dw11 >> 16) & 0xffff;
+    struct nvme_stats stats = {};
+    NvmeEndGrpLog info = {};
+    int i;
+
+    if (!n->subsys || endgrpid != 0x1) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    if (off >= sizeof(info)) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
+        NvmeNamespace *ns = nvme_subsys_ns(n->subsys, i);
+        if (!ns) {
+            continue;
+        }
+
+        nvme_set_blk_stats(ns, &stats);
+    }
+
+    /* stats.units_* are in bytes; the log reports billions of 512B units */
+    info.data_units_read[0] =
+        cpu_to_le64(DIV_ROUND_UP(stats.units_read >> BDRV_SECTOR_BITS,
+                                 1000000000));
+    info.data_units_written[0] =
+        cpu_to_le64(DIV_ROUND_UP(stats.units_written >> BDRV_SECTOR_BITS,
+                                 1000000000));
+    info.media_units_written[0] =
+        cpu_to_le64(DIV_ROUND_UP(stats.units_written >> BDRV_SECTOR_BITS,
+                                 1000000000));
+
+    info.host_read_commands[0] = cpu_to_le64(stats.read_commands);
+    info.host_write_commands[0] = cpu_to_le64(stats.write_commands);
+
+    buf_len = MIN(sizeof(info) - off, buf_len);
+
+    return nvme_c2h(n, (uint8_t *)&info + off, buf_len, req);
+}
+
 static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off,
                                  NvmeRequest *req)
 {
@@ -4577,6 +4907,207 @@ static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len,
     return nvme_c2h(n, ((uint8_t *)&log) + off, trans_len, req);
 }
 
+static size_t sizeof_fdp_conf_descr(size_t nruh, size_t vss)
+{
+    size_t entry_siz = sizeof(NvmeFdpDescrHdr) + nruh * sizeof(NvmeRuhDescr)
+                       + vss;
+    return ROUND_UP(entry_siz, 8);
+}
+
+static uint16_t nvme_fdp_confs(NvmeCtrl *n, uint32_t endgrpid, uint32_t buf_len,
+                               uint64_t off, NvmeRequest *req)
+{
+    uint32_t log_size, trans_len;
+    g_autofree uint8_t *buf = NULL;
+    NvmeFdpDescrHdr *hdr;
+    NvmeRuhDescr *ruhd;
+    NvmeEnduranceGroup *endgrp;
+    NvmeFdpConfsHdr *log;
+    size_t nruh, fdp_descr_size;
+    int i;
+
+    if (endgrpid != 1 || !n->subsys) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    endgrp = &n->subsys->endgrp;
+
+    if (endgrp->fdp.enabled) {
+        nruh = endgrp->fdp.nruh;
+    } else {
+        nruh = 1;
+    }
+
+    fdp_descr_size = sizeof_fdp_conf_descr(nruh, FDPVSS);
+    log_size = sizeof(NvmeFdpConfsHdr) + fdp_descr_size;
+
+    if (off >= log_size) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    trans_len = MIN(log_size - off, buf_len);
+
+    buf = g_malloc0(log_size);
+    log = (NvmeFdpConfsHdr *)buf;
+    hdr = (NvmeFdpDescrHdr *)(log + 1);
+    ruhd = (NvmeRuhDescr *)(buf + sizeof(*log) + sizeof(*hdr));
+
+    log->num_confs = cpu_to_le16(0);
+    log->size = cpu_to_le32(log_size);
+
+    hdr->descr_size = cpu_to_le16(fdp_descr_size);
+    if (endgrp->fdp.enabled) {
+        hdr->fdpa = FIELD_DP8(hdr->fdpa, FDPA, VALID, 1);
+        hdr->fdpa = FIELD_DP8(hdr->fdpa, FDPA, RGIF, endgrp->fdp.rgif);
+        hdr->nrg = cpu_to_le16(endgrp->fdp.nrg);
+        hdr->nruh = cpu_to_le16(endgrp->fdp.nruh);
+        hdr->maxpids = cpu_to_le16(NVME_FDP_MAXPIDS - 1);
+        hdr->nnss = cpu_to_le32(NVME_MAX_NAMESPACES);
+        hdr->runs = cpu_to_le64(endgrp->fdp.runs);
+
+        for (i = 0; i < nruh; i++) {
+            ruhd->ruht = NVME_RUHT_INITIALLY_ISOLATED;
+            ruhd++;
+        }
+    } else {
+        /* 1 bit for RUH in PIF -> 2 RUHs max. */
+        hdr->nrg = cpu_to_le16(1);
+        hdr->nruh = cpu_to_le16(1);
+        hdr->maxpids = cpu_to_le16(NVME_FDP_MAXPIDS - 1);
+        hdr->nnss = cpu_to_le32(1);
+        hdr->runs = cpu_to_le64(96 * MiB);
+
+        ruhd->ruht = NVME_RUHT_INITIALLY_ISOLATED;
+    }
+
+    return nvme_c2h(n, (uint8_t *)buf + off, trans_len, req);
+}
+
+static uint16_t nvme_fdp_ruh_usage(NvmeCtrl *n, uint32_t endgrpid,
+                                   uint32_t dw10, uint32_t dw12,
+                                   uint32_t buf_len, uint64_t off,
+                                   NvmeRequest *req)
+{
+    NvmeRuHandle *ruh;
+    NvmeRuhuLog *hdr;
+    NvmeRuhuDescr *ruhud;
+    NvmeEnduranceGroup *endgrp;
+    g_autofree uint8_t *buf = NULL;
+    uint32_t log_size, trans_len;
+    uint16_t i;
+
+    if (endgrpid != 1 || !n->subsys) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    endgrp = &n->subsys->endgrp;
+
+    if (!endgrp->fdp.enabled) {
+        return NVME_FDP_DISABLED | NVME_DNR;
+    }
+
+    log_size = sizeof(NvmeRuhuLog) + endgrp->fdp.nruh * sizeof(NvmeRuhuDescr);
+
+    if (off >= log_size) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    trans_len = MIN(log_size - off, buf_len);
+
+    buf = g_malloc0(log_size);
+    hdr = (NvmeRuhuLog *)buf;
+    ruhud = (NvmeRuhuDescr *)(hdr + 1);
+
+    ruh = endgrp->fdp.ruhs;
+    hdr->nruh = cpu_to_le16(endgrp->fdp.nruh);
+
+    for (i = 0; i < endgrp->fdp.nruh; i++, ruhud++, ruh++) {
+        ruhud->ruha = ruh->ruha;
+    }
+
+    return nvme_c2h(n, (uint8_t *)buf + off, trans_len, req);
+}
+
+static uint16_t nvme_fdp_stats(NvmeCtrl *n, uint32_t endgrpid, uint32_t buf_len,
+                               uint64_t off, NvmeRequest *req)
+{
+    NvmeEnduranceGroup *endgrp;
+    NvmeFdpStatsLog log = {};
+    uint32_t trans_len;
+
+    if (off >= sizeof(NvmeFdpStatsLog)) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    if (endgrpid != 1 || !n->subsys) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    if (!n->subsys->endgrp.fdp.enabled) {
+        return NVME_FDP_DISABLED | NVME_DNR;
+    }
+
+    endgrp = &n->subsys->endgrp;
+
+    trans_len = MIN(sizeof(log) - off, buf_len);
+
+    /* spec value is 128 bit, we only use 64 bit */
+    log.hbmw[0] = cpu_to_le64(endgrp->fdp.hbmw);
+    log.mbmw[0] = cpu_to_le64(endgrp->fdp.mbmw);
+    log.mbe[0] = cpu_to_le64(endgrp->fdp.mbe);
+
+    return nvme_c2h(n, (uint8_t *)&log + off, trans_len, req);
+}
+
+static uint16_t nvme_fdp_events(NvmeCtrl *n, uint32_t endgrpid,
+                                uint32_t buf_len, uint64_t off,
+                                NvmeRequest *req)
+{
+    NvmeEnduranceGroup *endgrp;
+    NvmeCmd *cmd = &req->cmd;
+    bool host_events = (le32_to_cpu(cmd->cdw10) >> 8) & 0x1;
+    uint32_t log_size, trans_len;
+    NvmeFdpEventBuffer *ebuf;
+    g_autofree NvmeFdpEventsLog *elog = NULL;
+    NvmeFdpEvent *event;
+
+    if (endgrpid != 1 || !n->subsys) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    endgrp = &n->subsys->endgrp;
+
+    if (!endgrp->fdp.enabled) {
+        return NVME_FDP_DISABLED | NVME_DNR;
+    }
+
+    if (host_events) {
+        ebuf = &endgrp->fdp.host_events;
+    } else {
+        ebuf = &endgrp->fdp.ctrl_events;
+    }
+
+    log_size = sizeof(NvmeFdpEventsLog) + ebuf->nelems * sizeof(NvmeFdpEvent);
+    trans_len = MIN(log_size - off, buf_len);
+    elog = g_malloc0(log_size);
+    elog->num_events = cpu_to_le32(ebuf->nelems);
+    event = (NvmeFdpEvent *)(elog + 1);
+
+    if (ebuf->nelems && ebuf->start == ebuf->next) {
+        unsigned int nelems = (NVME_FDP_MAX_EVENTS - ebuf->start);
+        /* wrap around: copy [start, NVME_FDP_MAX_EVENTS) then [0, next) */
+        memcpy(event, &ebuf->events[ebuf->start],
+               sizeof(NvmeFdpEvent) * nelems);
+        memcpy(event + nelems, ebuf->events,
+               sizeof(NvmeFdpEvent) * ebuf->next);
+    } else if (ebuf->start < ebuf->next) {
+        memcpy(event, &ebuf->events[ebuf->start],
+               sizeof(NvmeFdpEvent) * (ebuf->next - ebuf->start));
+    }
+
+    return nvme_c2h(n, (uint8_t *)elog + off, trans_len, req);
+}
+
 static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
 {
     NvmeCmd *cmd = &req->cmd;
@@ -4589,13 +5120,14 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
     uint8_t  lsp = (dw10 >> 8) & 0xf;
     uint8_t  rae = (dw10 >> 15) & 0x1;
     uint8_t  csi = le32_to_cpu(cmd->cdw14) >> 24;
-    uint32_t numdl, numdu;
+    uint32_t numdl, numdu, lspi;
     uint64_t off, lpol, lpou;
     size_t   len;
     uint16_t status;
 
     numdl = (dw10 >> 16);
     numdu = (dw11 & 0xffff);
+    lspi = (dw11 >> 16);
     lpol = dw12;
     lpou = dw13;
 
@@ -4624,6 +5156,16 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
         return nvme_changed_nslist(n, rae, len, off, req);
     case NVME_LOG_CMD_EFFECTS:
         return nvme_cmd_effects(n, csi, len, off, req);
+    case NVME_LOG_ENDGRP:
+        return nvme_endgrp_info(n, rae, len, off, req);
+    case NVME_LOG_FDP_CONFS:
+        return nvme_fdp_confs(n, lspi, len, off, req);
+    case NVME_LOG_FDP_RUH_USAGE:
+        return nvme_fdp_ruh_usage(n, lspi, dw10, dw12, len, off, req);
+    case NVME_LOG_FDP_STATS:
+        return nvme_fdp_stats(n, lspi, len, off, req);
+    case NVME_LOG_FDP_EVENTS:
+        return nvme_fdp_events(n, lspi, len, off, req);
     default:
         trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
         return NVME_INVALID_FIELD | NVME_DNR;
@@ -5210,6 +5752,84 @@ static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
     return nvme_c2h(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
 }
 
+static uint16_t nvme_get_feature_fdp(NvmeCtrl *n, uint32_t endgrpid,
+                                uint32_t *result)
+{
+    *result = 0;
+
+    if (!n->subsys || !n->subsys->endgrp.fdp.enabled) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    *result = FIELD_DP16(0, FEAT_FDP, FDPE, 1);
+    *result = FIELD_DP16(*result, FEAT_FDP, CONF_NDX, 0);
+
+    return NVME_SUCCESS;
+}
+
+static uint16_t nvme_get_feature_fdp_events(NvmeCtrl *n, NvmeNamespace *ns,
+                                            NvmeRequest *req, uint32_t *result)
+{
+    NvmeCmd *cmd = &req->cmd;
+    uint32_t cdw11 = le32_to_cpu(cmd->cdw11);
+    uint16_t ph = cdw11 & 0xffff;
+    uint8_t noet = (cdw11 >> 16) & 0xff;
+    uint16_t ruhid, ret;
+    uint32_t nentries = 0;
+    uint8_t s_events_ndx = 0;
+    size_t s_events_siz = sizeof(NvmeFdpEventDescr) * noet;
+    g_autofree NvmeFdpEventDescr *s_events = g_malloc0(s_events_siz);
+    NvmeRuHandle *ruh;
+    NvmeFdpEventDescr *s_event;
+
+    if (!n->subsys || !n->subsys->endgrp.fdp.enabled) {
+        return NVME_FDP_DISABLED | NVME_DNR;
+    }
+
+    if (!nvme_ph_valid(ns, ph)) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    ruhid = ns->fdp.phs[ph];
+    ruh = &n->subsys->endgrp.fdp.ruhs[ruhid];
+
+    assert(ruh);
+
+    if (unlikely(noet == 0)) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    for (uint8_t event_type = 0; event_type < FDP_EVT_MAX; event_type++) {
+        uint8_t shift = nvme_fdp_evf_shifts[event_type];
+        if (!shift && event_type) {
+            /*
+             * Only the first entry (event_type == 0) has a shift value of
+             * 0; entries with an undefined shift are simply unpopulated.
+             */
+            continue;
+        }
+
+        nentries++;
+
+        s_event = &s_events[s_events_ndx];
+        s_event->evt = event_type;
+        s_event->evta = (ruh->event_filter >> shift) & 0x1;
+
+        /* break if all `noet` entries are filled */
+        if ((++s_events_ndx) == noet) {
+            break;
+        }
+    }
+
+    ret = nvme_c2h(n, s_events, s_events_siz, req);
+    if (ret) {
+        return ret;
+    }
+
+    *result = nentries;
+    return NVME_SUCCESS;
+}
+
 static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
 {
     NvmeCmd *cmd = &req->cmd;
@@ -5222,6 +5842,7 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
     uint16_t iv;
     NvmeNamespace *ns;
     int i;
+    uint16_t endgrpid = 0, ret = NVME_SUCCESS;
 
     static const uint32_t nvme_feature_default[NVME_FID_MAX] = {
         [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT,
@@ -5319,6 +5940,33 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
     case NVME_HOST_BEHAVIOR_SUPPORT:
         return nvme_c2h(n, (uint8_t *)&n->features.hbs,
                         sizeof(n->features.hbs), req);
+    case NVME_FDP_MODE:
+        endgrpid = dw11 & 0xff;
+
+        if (endgrpid != 0x1) {
+            return NVME_INVALID_FIELD | NVME_DNR;
+        }
+
+        ret = nvme_get_feature_fdp(n, endgrpid, &result);
+        if (ret) {
+            return ret;
+        }
+        goto out;
+    case NVME_FDP_EVENTS:
+        if (!nvme_nsid_valid(n, nsid)) {
+            return NVME_INVALID_NSID | NVME_DNR;
+        }
+
+        ns = nvme_ns(n, nsid);
+        if (unlikely(!ns)) {
+            return NVME_INVALID_FIELD | NVME_DNR;
+        }
+
+        ret = nvme_get_feature_fdp_events(n, ns, req, &result);
+        if (ret) {
+            return ret;
+        }
+        goto out;
     default:
         break;
     }
@@ -5352,6 +6000,20 @@ defaults:
             result |= NVME_INTVC_NOCOALESCING;
         }
         break;
+    case NVME_FDP_MODE:
+        endgrpid = dw11 & 0xff;
+
+        if (endgrpid != 0x1) {
+            return NVME_INVALID_FIELD | NVME_DNR;
+        }
+
+        ret = nvme_get_feature_fdp(n, endgrpid, &result);
+        if (ret) {
+            return ret;
+        }
+        goto out;
     default:
         result = nvme_feature_default[fid];
         break;
@@ -5359,7 +6021,7 @@ defaults:
 
 out:
     req->cqe.result = cpu_to_le32(result);
-    return NVME_SUCCESS;
+    return ret;
 }
 
 static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
@@ -5377,6 +6039,51 @@ static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
     return NVME_SUCCESS;
 }
 
+static uint16_t nvme_set_feature_fdp_events(NvmeCtrl *n, NvmeNamespace *ns,
+                                            NvmeRequest *req)
+{
+    NvmeCmd *cmd = &req->cmd;
+    uint32_t cdw11 = le32_to_cpu(cmd->cdw11);
+    uint16_t ph = cdw11 & 0xffff;
+    uint8_t noet = (cdw11 >> 16) & 0xff;
+    uint16_t ret, ruhid;
+    uint8_t enable = le32_to_cpu(cmd->cdw12) & 0x1;
+    uint8_t event_mask = 0;
+    unsigned int i;
+    g_autofree uint8_t *events = g_malloc0(noet);
+    NvmeRuHandle *ruh = NULL;
+
+    assert(ns);
+
+    if (!n->subsys || !n->subsys->endgrp.fdp.enabled) {
+        return NVME_FDP_DISABLED | NVME_DNR;
+    }
+
+    if (!nvme_ph_valid(ns, ph)) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    ruhid = ns->fdp.phs[ph];
+    ruh = &n->subsys->endgrp.fdp.ruhs[ruhid];
+
+    ret = nvme_h2c(n, events, noet, req);
+    if (ret) {
+        return ret;
+    }
+
+    for (i = 0; i < noet; i++) {
+        event_mask |= (1 << nvme_fdp_evf_shifts[events[i]]);
+    }
+
+    if (enable) {
+        ruh->event_filter |= event_mask;
+    } else {
+        ruh->event_filter = ruh->event_filter & ~event_mask;
+    }
+
+    return NVME_SUCCESS;
+}
+
 static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
 {
     NvmeNamespace *ns = NULL;
@@ -5536,6 +6243,11 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
             return NVME_CMD_SET_CMB_REJECTED | NVME_DNR;
         }
         break;
+    case NVME_FDP_MODE:
+        /* spec: abort with cmd seq err if one or more NSes are in the endgrp */
+        return NVME_CMD_SEQ_ERROR | NVME_DNR;
+    case NVME_FDP_EVENTS:
+        return nvme_set_feature_fdp_events(n, ns, req);
     default:
         return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
     }
@@ -6104,6 +6816,61 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
     return NVME_SUCCESS;
 }
 
+static uint16_t nvme_directive_send(NvmeCtrl *n, NvmeRequest *req)
+{
+    return NVME_INVALID_FIELD | NVME_DNR;
+}
+
+static uint16_t nvme_directive_receive(NvmeCtrl *n, NvmeRequest *req)
+{
+    NvmeNamespace *ns;
+    uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
+    uint32_t dw11 = le32_to_cpu(req->cmd.cdw11);
+    uint32_t nsid = le32_to_cpu(req->cmd.nsid);
+    uint8_t doper, dtype;
+    uint32_t numd, trans_len;
+    NvmeDirectiveIdentify id = {
+        .supported = 1 << NVME_DIRECTIVE_IDENTIFY,
+        .enabled = 1 << NVME_DIRECTIVE_IDENTIFY,
+    };
+
+    numd = dw10 + 1;
+    doper = dw11 & 0xff;
+    dtype = (dw11 >> 8) & 0xff;
+
+    trans_len = MIN(sizeof(NvmeDirectiveIdentify), numd << 2);
+
+    if (nsid == NVME_NSID_BROADCAST || dtype != NVME_DIRECTIVE_IDENTIFY ||
+        doper != NVME_DIRECTIVE_RETURN_PARAMS) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    ns = nvme_ns(n, nsid);
+    if (!ns) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    switch (dtype) {
+    case NVME_DIRECTIVE_IDENTIFY:
+        switch (doper) {
+        case NVME_DIRECTIVE_RETURN_PARAMS:
+            if (ns->endgrp->fdp.enabled) {
+                id.supported |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT;
+                id.enabled |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT;
+                id.persistent |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT;
+            }
+
+            return nvme_c2h(n, (uint8_t *)&id, trans_len, req);
+
+        default:
+            return NVME_INVALID_FIELD | NVME_DNR;
+        }
+
+    default:
+        return NVME_INVALID_FIELD;
+    }
+}
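
A decode sketch for the receive path, assuming the spec encodings for the Identify directive (dtype 0x0) and Return Parameters operation (doper 0x1): cdw10 == 7 requests (7 + 1) * 4 == 32 bytes, clamped to sizeof(NvmeDirectiveIdentify).

    uint32_t dw11 = 0x0001;
    uint8_t doper = dw11 & 0xff;        /* 0x01: return parameters */
    uint8_t dtype = (dw11 >> 8) & 0xff; /* 0x00: identify directive */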
+
 static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
 {
     trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode,
@@ -6152,6 +6919,10 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
         return nvme_dbbuf_config(n, req);
     case NVME_ADM_CMD_FORMAT_NVM:
         return nvme_format(n, req);
+    case NVME_ADM_CMD_DIRECTIVE_SEND:
+        return nvme_directive_send(n, req);
+    case NVME_ADM_CMD_DIRECTIVE_RECV:
+        return nvme_directive_receive(n, req);
     default:
         assert(false);
     }
@@ -7380,6 +8151,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
     uint8_t *pci_conf = pci_dev->config;
     uint64_t cap = ldq_le_p(&n->bar.cap);
     NvmeSecCtrlEntry *sctrl = nvme_sctrl(n);
+    uint32_t ctratt;
 
     id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID));
     id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID));
@@ -7390,7 +8162,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
     id->cntlid = cpu_to_le16(n->cntlid);
 
     id->oaes = cpu_to_le32(NVME_OAES_NS_ATTR);
-    id->ctratt |= cpu_to_le32(NVME_CTRATT_ELBAS);
+    ctratt = NVME_CTRATT_ELBAS;
 
     id->rab = 6;
 
@@ -7407,7 +8179,8 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
     id->mdts = n->params.mdts;
     id->ver = cpu_to_le32(NVME_SPEC_VER);
     id->oacs =
-        cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT | NVME_OACS_DBBUF);
+        cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT | NVME_OACS_DBBUF |
+                    NVME_OACS_DIRECTIVES);
     id->cntrltype = 0x1;
 
     /*
@@ -7457,8 +8230,17 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
 
     if (n->subsys) {
         id->cmic |= NVME_CMIC_MULTI_CTRL;
+        ctratt |= NVME_CTRATT_ENDGRPS;
+
+        id->endgidmax = cpu_to_le16(0x1);
+
+        if (n->subsys->endgrp.fdp.enabled) {
+            ctratt |= NVME_CTRATT_FDPS;
+        }
     }
 
+    id->ctratt = cpu_to_le32(ctratt);
+
     NVME_CAP_SET_MQES(cap, 0x7ff);
     NVME_CAP_SET_CQR(cap, 1);
     NVME_CAP_SET_TO(cap, 0xf);
diff --git a/hw/nvme/ns.c b/hw/nvme/ns.c
index 62a1f97be0..cfac960dcf 100644
--- a/hw/nvme/ns.c
+++ b/hw/nvme/ns.c
@@ -14,8 +14,10 @@
 
 #include "qemu/osdep.h"
 #include "qemu/units.h"
+#include "qemu/cutils.h"
 #include "qemu/error-report.h"
 #include "qapi/error.h"
+#include "qemu/bitops.h"
 #include "sysemu/sysemu.h"
 #include "sysemu/block-backend.h"
 
@@ -377,6 +379,130 @@ static void nvme_zoned_ns_shutdown(NvmeNamespace *ns)
     assert(ns->nr_open_zones == 0);
 }
 
+static NvmeRuHandle *nvme_find_ruh_by_attr(NvmeEnduranceGroup *endgrp,
+                                           uint8_t ruha, uint16_t *ruhid)
+{
+    for (uint16_t i = 0; i < endgrp->fdp.nruh; i++) {
+        NvmeRuHandle *ruh = &endgrp->fdp.ruhs[i];
+
+        if (ruh->ruha == ruha) {
+            *ruhid = i;
+            return ruh;
+        }
+    }
+
+    return NULL;
+}
+
+static bool nvme_ns_init_fdp(NvmeNamespace *ns, Error **errp)
+{
+    NvmeEnduranceGroup *endgrp = ns->endgrp;
+    NvmeRuHandle *ruh;
+    uint8_t lbafi = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
+    unsigned int *ruhid, *ruhids;
+    char *r, *p, *token;
+    uint16_t *ph;
+
+    if (!ns->params.fdp.ruhs) {
+        ns->fdp.nphs = 1;
+        ph = ns->fdp.phs = g_new(uint16_t, 1);
+
+        ruh = nvme_find_ruh_by_attr(endgrp, NVME_RUHA_CTRL, ph);
+        if (!ruh) {
+            ruh = nvme_find_ruh_by_attr(endgrp, NVME_RUHA_UNUSED, ph);
+            if (!ruh) {
+                error_setg(errp, "no unused reclaim unit handles left");
+                return false;
+            }
+
+            ruh->ruha = NVME_RUHA_CTRL;
+            ruh->lbafi = lbafi;
+            ruh->ruamw = endgrp->fdp.runs >> ns->lbaf.ds;
+
+            for (uint16_t rg = 0; rg < endgrp->fdp.nrg; rg++) {
+                ruh->rus[rg].ruamw = ruh->ruamw;
+            }
+        } else if (ruh->lbafi != lbafi) {
+            error_setg(errp, "lba format index of controller assigned "
+                       "reclaim unit handle does not match namespace lba "
+                       "format index");
+            return false;
+        }
+
+        return true;
+    }
+
+    ruhid = ruhids = g_new0(unsigned int, endgrp->fdp.nruh);
+    r = p = strdup(ns->params.fdp.ruhs);
+
+    /* parse the placement handle identifiers */
+    while ((token = qemu_strsep(&p, ";")) != NULL) {
+        ns->fdp.nphs += 1;
+        if (ns->fdp.nphs > NVME_FDP_MAXPIDS ||
+            ns->fdp.nphs == endgrp->fdp.nruh) {
+            error_setg(errp, "too many placement handles");
+            free(r);
+            return false;
+        }
+
+        if (qemu_strtoui(token, NULL, 0, ruhid++) < 0) {
+            error_setg(errp, "cannot parse reclaim unit handle identifier");
+            free(r);
+            return false;
+        }
+    }
+
+    free(r);
+
+    ph = ns->fdp.phs = g_new(uint16_t, ns->fdp.nphs);
+
+    ruhid = ruhids;
+
+    /* verify the identifiers */
+    for (unsigned int i = 0; i < ns->fdp.nphs; i++, ruhid++, ph++) {
+        if (*ruhid >= endgrp->fdp.nruh) {
+            error_setg(errp, "invalid reclaim unit handle identifier");
+            return false;
+        }
+
+        ruh = &endgrp->fdp.ruhs[*ruhid];
+
+        switch (ruh->ruha) {
+        case NVME_RUHA_UNUSED:
+            ruh->ruha = NVME_RUHA_HOST;
+            ruh->lbafi = lbafi;
+            ruh->ruamw = endgrp->fdp.runs >> ns->lbaf.ds;
+
+            for (uint16_t rg = 0; rg < endgrp->fdp.nrg; rg++) {
+                ruh->rus[rg].ruamw = ruh->ruamw;
+            }
+
+            break;
+
+        case NVME_RUHA_HOST:
+            if (ruh->lbafi != lbafi) {
+                error_setg(errp, "lba format index of host assigned"
+                           "reclaim unit handle does not match namespace "
+                           "lba format index");
+                return false;
+            }
+
+            break;
+
+        case NVME_RUHA_CTRL:
+            error_setg(errp, "reclaim unit handle is controller assigned");
+            return false;
+
+        default:
+            abort();
+        }
+
+        *ph = *ruhid;
+    }
+
+    return true;
+}
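A usage sketch may make the string format above concrete: fdp.ruhs is parsed as a semicolon-separated list of reclaim unit handle identifiers, each naming a handle of the endurance group. The invocation below is illustrative only (drive and device IDs are made up, and the controller is assumed to be linked to the subsystem via its subsys property); the fdp.* properties are the ones introduced by this patch:

    qemu-system-x86_64 ... \
        -device nvme-subsys,id=subsys0,fdp=on,fdp.nruh=8,fdp.nrg=1,fdp.runs=96M \
        -device nvme,serial=deadbeef,subsys=subsys0 \
        -device nvme-ns,drive=nvm0,fdp.ruhs="0;1;2"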
+
 static int nvme_ns_check_constraints(NvmeNamespace *ns, Error **errp)
 {
     unsigned int pi_size;
@@ -417,6 +543,11 @@ static int nvme_ns_check_constraints(NvmeNamespace *ns, Error **errp)
         return -1;
     }
 
+    if (ns->params.zoned && ns->endgrp && ns->endgrp->fdp.enabled) {
+        error_setg(errp, "cannot be a zoned- in an FDP configuration");
+        return -1;
+    }
+
     if (ns->params.zoned) {
         if (ns->params.max_active_zones) {
             if (ns->params.max_open_zones > ns->params.max_active_zones) {
@@ -502,6 +633,12 @@ int nvme_ns_setup(NvmeNamespace *ns, Error **errp)
         nvme_ns_init_zoned(ns);
     }
 
+    if (ns->endgrp && ns->endgrp->fdp.enabled) {
+        if (!nvme_ns_init_fdp(ns, errp)) {
+            return -1;
+        }
+    }
+
     return 0;
 }
 
@@ -525,6 +662,10 @@ void nvme_ns_cleanup(NvmeNamespace *ns)
         g_free(ns->zone_array);
         g_free(ns->zd_extensions);
     }
+
+    if (ns->endgrp && ns->endgrp->fdp.enabled) {
+        g_free(ns->fdp.phs);
+    }
 }
 
 static void nvme_ns_unrealize(DeviceState *dev)
@@ -561,6 +702,8 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp)
         if (!qdev_set_parent_bus(dev, &subsys->bus.parent_bus, errp)) {
             return;
         }
+        ns->subsys = subsys;
+        ns->endgrp = &subsys->endgrp;
     }
 
     if (nvme_ns_setup(ns, errp)) {
@@ -591,6 +734,8 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp)
     if (subsys) {
         subsys->namespaces[nsid] = ns;
 
+        ns->id_ns.endgid = cpu_to_le16(0x1);
+
         if (ns->params.detached) {
             return;
         }
@@ -606,6 +751,7 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp)
 
             return;
         }
+
     }
 
     nvme_attach_ns(n, ns);
@@ -644,6 +790,7 @@ static Property nvme_ns_props[] = {
     DEFINE_PROP_SIZE("zoned.zrwafg", NvmeNamespace, params.zrwafg, -1),
     DEFINE_PROP_BOOL("eui64-default", NvmeNamespace, params.eui64_default,
                      false),
+    DEFINE_PROP_STRING("fdp.ruhs", NvmeNamespace, params.fdp.ruhs),
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 16da27a69b..209e8f5b4c 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -27,6 +27,8 @@
 #define NVME_MAX_CONTROLLERS 256
 #define NVME_MAX_NAMESPACES  256
 #define NVME_EUI64_DEFAULT ((uint64_t)0x5254000000000000)
+#define NVME_FDP_MAX_EVENTS 63
+#define NVME_FDP_MAXPIDS 128
 
 QEMU_BUILD_BUG_ON(NVME_MAX_NAMESPACES > NVME_NSID_BROADCAST - 1);
 
@@ -45,17 +47,68 @@ typedef struct NvmeBus {
     OBJECT_CHECK(NvmeSubsystem, (obj), TYPE_NVME_SUBSYS)
 #define SUBSYS_SLOT_RSVD (void *)0xFFFF
 
+typedef struct NvmeReclaimUnit {
+    uint64_t ruamw;
+} NvmeReclaimUnit;
+
+typedef struct NvmeRuHandle {
+    uint8_t  ruht;
+    uint8_t  ruha;
+    uint64_t event_filter;
+    uint8_t  lbafi;
+    uint64_t ruamw;
+
+    /* reclaim units indexed by reclaim group */
+    NvmeReclaimUnit *rus;
+} NvmeRuHandle;
+
+typedef struct NvmeFdpEventBuffer {
+    NvmeFdpEvent     events[NVME_FDP_MAX_EVENTS];
+    unsigned int     nelems;
+    unsigned int     start;
+    unsigned int     next;
+} NvmeFdpEventBuffer;
+
+typedef struct NvmeEnduranceGroup {
+    uint8_t event_conf;
+
+    struct {
+        NvmeFdpEventBuffer host_events, ctrl_events;
+
+        uint16_t nruh;
+        uint16_t nrg;
+        uint8_t  rgif;
+        uint64_t runs;
+
+        uint64_t hbmw;
+        uint64_t mbmw;
+        uint64_t mbe;
+
+        bool enabled;
+
+        NvmeRuHandle *ruhs;
+    } fdp;
+} NvmeEnduranceGroup;
+
 typedef struct NvmeSubsystem {
     DeviceState parent_obj;
     NvmeBus     bus;
     uint8_t     subnqn[256];
     char        *serial;
 
-    NvmeCtrl      *ctrls[NVME_MAX_CONTROLLERS];
-    NvmeNamespace *namespaces[NVME_MAX_NAMESPACES + 1];
+    NvmeCtrl           *ctrls[NVME_MAX_CONTROLLERS];
+    NvmeNamespace      *namespaces[NVME_MAX_NAMESPACES + 1];
+    NvmeEnduranceGroup endgrp;
 
     struct {
         char *nqn;
+
+        struct {
+            bool     enabled;
+            uint64_t runs;
+            uint16_t nruh;
+            uint32_t nrg;
+        } fdp;
     } params;
 } NvmeSubsystem;
 
@@ -96,6 +149,21 @@ typedef struct NvmeZone {
     QTAILQ_ENTRY(NvmeZone) entry;
 } NvmeZone;
 
+#define FDP_EVT_MAX 0xff
+#define NVME_FDP_MAX_NS_RUHS 32u
+#define FDPVSS 0
+
+static const uint8_t nvme_fdp_evf_shifts[FDP_EVT_MAX] = {
+    /* Host events */
+    [FDP_EVT_RU_NOT_FULLY_WRITTEN]      = 0,
+    [FDP_EVT_RU_ATL_EXCEEDED]           = 1,
+    [FDP_EVT_CTRL_RESET_RUH]            = 2,
+    [FDP_EVT_INVALID_PID]               = 3,
+    /* CTRL events */
+    [FDP_EVT_MEDIA_REALLOC]             = 32,
+    [FDP_EVT_RUH_IMPLICIT_RU_CHANGE]    = 33,
+};
+
 typedef struct NvmeNamespaceParams {
     bool     detached;
     bool     shared;
@@ -125,6 +193,10 @@ typedef struct NvmeNamespaceParams {
     uint32_t numzrwa;
     uint64_t zrwas;
     uint64_t zrwafg;
+
+    struct {
+        char *ruhs;
+    } fdp;
 } NvmeNamespaceParams;
 
 typedef struct NvmeNamespace {
@@ -167,10 +239,18 @@ typedef struct NvmeNamespace {
     int32_t         nr_active_zones;
 
     NvmeNamespaceParams params;
+    NvmeSubsystem *subsys;
+    NvmeEnduranceGroup *endgrp;
 
     struct {
         uint32_t err_rec;
     } features;
+
+    struct {
+        uint16_t nphs;
+        /* reclaim unit handle identifiers indexed by placement handle */
+        uint16_t *phs;
+    } fdp;
 } NvmeNamespace;
 
 static inline uint32_t nvme_nsid(NvmeNamespace *ns)
@@ -274,6 +354,12 @@ static inline void nvme_aor_dec_active(NvmeNamespace *ns)
     assert(ns->nr_active_zones >= 0);
 }
 
+static inline void nvme_fdp_stat_inc(uint64_t *a, uint64_t b)
+{
+    uint64_t ret = *a + b;
+    *a = ret < *a ? UINT64_MAX : ret;
+}
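As a worked example of the saturation above: with *a == UINT64_MAX - 1 and b == 4, the unsigned addition wraps around to 2; since 2 < *a, the statistic is clamped to UINT64_MAX instead of wrapping.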
+
 void nvme_ns_init_format(NvmeNamespace *ns);
 int nvme_ns_setup(NvmeNamespace *ns, Error **errp);
 void nvme_ns_drain(NvmeNamespace *ns);
@@ -340,7 +426,9 @@ static inline const char *nvme_adm_opc_str(uint8_t opc)
     case NVME_ADM_CMD_GET_FEATURES:     return "NVME_ADM_CMD_GET_FEATURES";
     case NVME_ADM_CMD_ASYNC_EV_REQ:     return "NVME_ADM_CMD_ASYNC_EV_REQ";
     case NVME_ADM_CMD_NS_ATTACHMENT:    return "NVME_ADM_CMD_NS_ATTACHMENT";
+    case NVME_ADM_CMD_DIRECTIVE_SEND:   return "NVME_ADM_CMD_DIRECTIVE_SEND";
     case NVME_ADM_CMD_VIRT_MNGMT:       return "NVME_ADM_CMD_VIRT_MNGMT";
+    case NVME_ADM_CMD_DIRECTIVE_RECV:   return "NVME_ADM_CMD_DIRECTIVE_RECV";
     case NVME_ADM_CMD_DBBUF_CONFIG:     return "NVME_ADM_CMD_DBBUF_CONFIG";
     case NVME_ADM_CMD_FORMAT_NVM:       return "NVME_ADM_CMD_FORMAT_NVM";
     default:                            return "NVME_ADM_CMD_UNKNOWN";
diff --git a/hw/nvme/subsys.c b/hw/nvme/subsys.c
index 9d2643678b..24ddec860e 100644
--- a/hw/nvme/subsys.c
+++ b/hw/nvme/subsys.c
@@ -7,10 +7,13 @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/units.h"
 #include "qapi/error.h"
 
 #include "nvme.h"
 
+#define NVME_DEFAULT_RU_SIZE (96 * MiB)
+
 static int nvme_subsys_reserve_cntlids(NvmeCtrl *n, int start, int num)
 {
     NvmeSubsystem *subsys = n->subsys;
@@ -109,13 +112,95 @@ void nvme_subsys_unregister_ctrl(NvmeSubsystem *subsys, NvmeCtrl *n)
     n->cntlid = -1;
 }
 
-static void nvme_subsys_setup(NvmeSubsystem *subsys)
+static bool nvme_calc_rgif(uint16_t nruh, uint16_t nrg, uint8_t *rgif)
+{
+    uint16_t val;
+    unsigned int i;
+
+    if (unlikely(nrg == 1)) {
+        /* PIDRG_NORGI scenario: the entire PID is used for the PHID */
+        *rgif = 0;
+        return true;
+    }
+
+    val = nrg;
+    i = 0;
+    while (val) {
+        val >>= 1;
+        i++;
+    }
+    *rgif = i;
+
+    /* ensure remaining bits suffice to represent number of phids in a RG */
+    if (unlikely((UINT16_MAX >> i) < nruh)) {
+        *rgif = 0;
+        return false;
+    }
+
+    return true;
+}
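A worked example (values illustrative): with nrg = 4 the loop shifts 4 -> 2 -> 1 -> 0 and yields rgif = 3, leaving 16 - 3 = 13 PID bits for placement handles; since UINT16_MAX >> 3 = 8191, any nruh up to 8191 passes the check. With nrg = 1 no reclaim group bits are needed at all, which is the PIDRG_NORGI case handled first.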
+
+static bool nvme_subsys_setup_fdp(NvmeSubsystem *subsys, Error **errp)
+{
+    NvmeEnduranceGroup *endgrp = &subsys->endgrp;
+
+    if (!subsys->params.fdp.runs) {
+        error_setg(errp, "fdp.runs must be non-zero");
+        return false;
+    }
+
+    endgrp->fdp.runs = subsys->params.fdp.runs;
+
+    if (!subsys->params.fdp.nrg) {
+        error_setg(errp, "fdp.nrg must be non-zero");
+        return false;
+    }
+
+    endgrp->fdp.nrg = subsys->params.fdp.nrg;
+
+    if (!subsys->params.fdp.nruh) {
+        error_setg(errp, "fdp.nruh must be non-zero");
+        return false;
+    }
+
+    endgrp->fdp.nruh = subsys->params.fdp.nruh;
+
+    if (!nvme_calc_rgif(endgrp->fdp.nruh, endgrp->fdp.nrg, &endgrp->fdp.rgif)) {
+        error_setg(errp,
+                   "cannot derive a valid rgif (nruh %"PRIu16" nrg %"PRIu32")",
+                   endgrp->fdp.nruh, endgrp->fdp.nrg);
+        return false;
+    }
+
+    endgrp->fdp.ruhs = g_new(NvmeRuHandle, endgrp->fdp.nruh);
+
+    for (uint16_t ruhid = 0; ruhid < endgrp->fdp.nruh; ruhid++) {
+        endgrp->fdp.ruhs[ruhid] = (NvmeRuHandle) {
+            .ruht = NVME_RUHT_INITIALLY_ISOLATED,
+            .ruha = NVME_RUHA_UNUSED,
+        };
+
+        endgrp->fdp.ruhs[ruhid].rus = g_new(NvmeReclaimUnit, endgrp->fdp.nrg);
+    }
+
+    endgrp->fdp.enabled = true;
+
+    return true;
+}
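Note that, per the property defaults added below, enabling the subsystem-level fdp switch still requires setting fdp.nruh explicitly (it defaults to 0, which fails the check above), while fdp.runs defaults to NVME_DEFAULT_RU_SIZE (96 MiB) and fdp.nrg to 1.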
+
+static bool nvme_subsys_setup(NvmeSubsystem *subsys, Error **errp)
 {
     const char *nqn = subsys->params.nqn ?
         subsys->params.nqn : subsys->parent_obj.id;
 
     snprintf((char *)subsys->subnqn, sizeof(subsys->subnqn),
              "nqn.2019-08.org.qemu:%s", nqn);
+
+    if (subsys->params.fdp.enabled && !nvme_subsys_setup_fdp(subsys, errp)) {
+        return false;
+    }
+
+    return true;
 }
 
 static void nvme_subsys_realize(DeviceState *dev, Error **errp)
@@ -124,11 +209,16 @@ static void nvme_subsys_realize(DeviceState *dev, Error **errp)
 
     qbus_init(&subsys->bus, sizeof(NvmeBus), TYPE_NVME_BUS, dev, dev->id);
 
-    nvme_subsys_setup(subsys);
+    nvme_subsys_setup(subsys, errp);
 }
 
 static Property nvme_subsystem_props[] = {
     DEFINE_PROP_STRING("nqn", NvmeSubsystem, params.nqn),
+    DEFINE_PROP_BOOL("fdp", NvmeSubsystem, params.fdp.enabled, false),
+    DEFINE_PROP_SIZE("fdp.runs", NvmeSubsystem, params.fdp.runs,
+                     NVME_DEFAULT_RU_SIZE),
+    DEFINE_PROP_UINT32("fdp.nrg", NvmeSubsystem, params.fdp.nrg, 1),
+    DEFINE_PROP_UINT16("fdp.nruh", NvmeSubsystem, params.fdp.nruh, 0),
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/nvme/trace-events b/hw/nvme/trace-events
index b16f2260b4..7f7837e1a2 100644
--- a/hw/nvme/trace-events
+++ b/hw/nvme/trace-events
@@ -117,6 +117,7 @@ pci_nvme_clear_ns_reset(uint32_t state, uint64_t slba) "zone state=%"PRIu32", sl
 pci_nvme_zoned_zrwa_implicit_flush(uint64_t zslba, uint32_t nlb) "zslba 0x%"PRIx64" nlb %"PRIu32""
 pci_nvme_pci_reset(void) "PCI Function Level Reset"
 pci_nvme_virt_mngmt(uint16_t cid, uint16_t act, uint16_t cntlid, const char* rt, uint16_t nr) "cid %"PRIu16", act=0x%"PRIx16", ctrlid=%"PRIu16" %s nr=%"PRIu16""
+pci_nvme_fdp_ruh_change(uint16_t rgid, uint16_t ruhid) "change RU on RUH rgid=%"PRIu16", ruhid=%"PRIu16""
 
 # error conditions
 pci_nvme_err_mdts(size_t len) "len %zu"
diff --git a/hw/pci-bridge/cxl_root_port.c b/hw/pci-bridge/cxl_root_port.c
index 6664783974..7dfd20aa67 100644
--- a/hw/pci-bridge/cxl_root_port.c
+++ b/hw/pci-bridge/cxl_root_port.c
@@ -22,6 +22,7 @@
 #include "qemu/range.h"
 #include "hw/pci/pci_bridge.h"
 #include "hw/pci/pcie_port.h"
+#include "hw/pci/msi.h"
 #include "hw/qdev-properties.h"
 #include "hw/sysbus.h"
 #include "qapi/error.h"
@@ -29,6 +30,10 @@
 
 #define CXL_ROOT_PORT_DID 0x7075
 
+#define CXL_RP_MSI_OFFSET               0x60
+#define CXL_RP_MSI_SUPPORTED_FLAGS      PCI_MSI_FLAGS_MASKBIT
+#define CXL_RP_MSI_NR_VECTOR            2
+
 /* Copied from the gen root port which we derive */
 #define GEN_PCIE_ROOT_PORT_AER_OFFSET 0x100
 #define GEN_PCIE_ROOT_PORT_ACS_OFFSET \
@@ -47,6 +52,49 @@ typedef struct CXLRootPort {
 #define TYPE_CXL_ROOT_PORT "cxl-rp"
 DECLARE_INSTANCE_CHECKER(CXLRootPort, CXL_ROOT_PORT, TYPE_CXL_ROOT_PORT)
 
+/*
+ * If two MSI vectors are allocated, the Advanced Error Interrupt Message
+ * Number is 1; otherwise it is 0.
+ * See 17.12.5.10 RPERRSTS, bits 32:27, Advanced Error Interrupt Message Number.
+ */
+static uint8_t cxl_rp_aer_vector(const PCIDevice *d)
+{
+    switch (msi_nr_vectors_allocated(d)) {
+    case 1:
+        return 0;
+    case 2:
+        return 1;
+    case 4:
+    case 8:
+    case 16:
+    case 32:
+    default:
+        break;
+    }
+    abort();
+    return 0;
+}
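Since the root port requests CXL_RP_MSI_NR_VECTOR (2) vectors in cxl_rp_interrupts_init() below, the counts expected here are 1 or 2; any larger allocation would indicate a programming error, hence the abort().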
+
+static int cxl_rp_interrupts_init(PCIDevice *d, Error **errp)
+{
+    int rc;
+
+    rc = msi_init(d, CXL_RP_MSI_OFFSET, CXL_RP_MSI_NR_VECTOR,
+                  CXL_RP_MSI_SUPPORTED_FLAGS & PCI_MSI_FLAGS_64BIT,
+                  CXL_RP_MSI_SUPPORTED_FLAGS & PCI_MSI_FLAGS_MASKBIT,
+                  errp);
+    if (rc < 0) {
+        assert(rc == -ENOTSUP);
+    }
+
+    return rc;
+}
+
+static void cxl_rp_interrupts_uninit(PCIDevice *d)
+{
+    msi_uninit(d);
+}
+
 static void latch_registers(CXLRootPort *crp)
 {
     uint32_t *reg_state = crp->cxl_cstate.crb.cache_mem_registers;
@@ -183,16 +231,29 @@ static void cxl_rp_dvsec_write_config(PCIDevice *dev, uint32_t addr,
     }
 }
 
+static void cxl_rp_aer_vector_update(PCIDevice *d)
+{
+    PCIERootPortClass *rpc = PCIE_ROOT_PORT_GET_CLASS(d);
+
+    if (rpc->aer_vector) {
+        pcie_aer_root_set_vector(d, rpc->aer_vector(d));
+    }
+}
+
 static void cxl_rp_write_config(PCIDevice *d, uint32_t address, uint32_t val,
                                 int len)
 {
     uint16_t slt_ctl, slt_sta;
+    uint32_t root_cmd =
+        pci_get_long(d->config + d->exp.aer_cap + PCI_ERR_ROOT_COMMAND);
 
     pcie_cap_slot_get(d, &slt_ctl, &slt_sta);
     pci_bridge_write_config(d, address, val, len);
+    cxl_rp_aer_vector_update(d);
     pcie_cap_flr_write_config(d, address, val, len);
     pcie_cap_slot_write_config(d, slt_ctl, slt_sta, address, val, len);
     pcie_aer_write_config(d, address, val, len);
+    pcie_aer_root_write_config(d, address, val, len, root_cmd);
 
     cxl_rp_dvsec_write_config(d, address, val, len);
 }
@@ -217,6 +278,9 @@ static void cxl_root_port_class_init(ObjectClass *oc, void *data)
 
     rpc->aer_offset = GEN_PCIE_ROOT_PORT_AER_OFFSET;
     rpc->acs_offset = GEN_PCIE_ROOT_PORT_ACS_OFFSET;
+    rpc->aer_vector = cxl_rp_aer_vector;
+    rpc->interrupts_init = cxl_rp_interrupts_init;
+    rpc->interrupts_uninit = cxl_rp_interrupts_uninit;
 
     dc->hotpluggable = false;
 }
diff --git a/hw/pci-bridge/pci_expander_bridge.c b/hw/pci-bridge/pci_expander_bridge.c
index e752a21292..ead33f0c05 100644
--- a/hw/pci-bridge/pci_expander_bridge.c
+++ b/hw/pci-bridge/pci_expander_bridge.c
@@ -15,6 +15,7 @@
 #include "hw/pci/pci.h"
 #include "hw/pci/pci_bus.h"
 #include "hw/pci/pci_host.h"
+#include "hw/pci/pcie_port.h"
 #include "hw/qdev-properties.h"
 #include "hw/pci/pci_bridge.h"
 #include "hw/pci-bridge/pci_expander_bridge.h"
@@ -79,6 +80,13 @@ CXLComponentState *cxl_get_hb_cstate(PCIHostState *hb)
     return &host->cxl_cstate;
 }
 
+bool cxl_get_hb_passthrough(PCIHostState *hb)
+{
+    CXLHost *host = PXB_CXL_HOST(hb);
+
+    return host->passthrough;
+}
+
 static int pxb_bus_num(PCIBus *bus)
 {
     PXBDev *pxb = convert_to_pxb(bus->parent_dev);
@@ -289,15 +297,32 @@ static int pxb_map_irq_fn(PCIDevice *pci_dev, int pin)
     return pin - PCI_SLOT(pxb->devfn);
 }
 
-static void pxb_dev_reset(DeviceState *dev)
+static void pxb_cxl_dev_reset(DeviceState *dev)
 {
     CXLHost *cxl = PXB_CXL_DEV(dev)->cxl.cxl_host_bridge;
     CXLComponentState *cxl_cstate = &cxl->cxl_cstate;
+    PCIHostState *hb = PCI_HOST_BRIDGE(cxl);
     uint32_t *reg_state = cxl_cstate->crb.cache_mem_registers;
     uint32_t *write_msk = cxl_cstate->crb.cache_mem_regs_write_mask;
+    int dsp_count = 0;
 
     cxl_component_register_init_common(reg_state, write_msk, CXL2_ROOT_PORT);
-    ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY, TARGET_COUNT, 8);
+    /*
+     * The CXL specification allows for host bridges with no HDM decoders
+     * if they only have a single root port.
+     */
+    if (!PXB_DEV(dev)->hdm_for_passthrough) {
+        dsp_count = pcie_count_ds_ports(hb->bus);
+    }
+    /* The initial reset has 0 DSPs, so wait until the count is > 0 */
+    if (dsp_count == 1) {
+        cxl->passthrough = true;
+        /* Set Capability ID in header to NONE */
+        ARRAY_FIELD_DP32(reg_state, CXL_HDM_CAPABILITY_HEADER, ID, 0);
+    } else {
+        ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY, TARGET_COUNT,
+                         8);
+    }
 }
 
 static gint pxb_compare(gconstpointer a, gconstpointer b)
@@ -481,9 +506,18 @@ static void pxb_cxl_dev_realize(PCIDevice *dev, Error **errp)
     }
 
     pxb_dev_realize_common(dev, CXL, errp);
-    pxb_dev_reset(DEVICE(dev));
+    pxb_cxl_dev_reset(DEVICE(dev));
 }
 
+static Property pxb_cxl_dev_properties[] = {
+    /* Note: 0 is not a legal PXB bus number. */
+    DEFINE_PROP_UINT8("bus_nr", PXBDev, bus_nr, 0),
+    DEFINE_PROP_UINT16("numa_node", PXBDev, numa_node, NUMA_NODE_UNASSIGNED),
+    DEFINE_PROP_BOOL("bypass_iommu", PXBDev, bypass_iommu, false),
+    DEFINE_PROP_BOOL("hdm_for_passthrough", PXBDev, hdm_for_passthrough, false),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
 static void pxb_cxl_dev_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc   = DEVICE_CLASS(klass);
@@ -497,12 +531,12 @@ static void pxb_cxl_dev_class_init(ObjectClass *klass, void *data)
      */
 
     dc->desc = "CXL Host Bridge";
-    device_class_set_props(dc, pxb_dev_properties);
+    device_class_set_props(dc, pxb_cxl_dev_properties);
     set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories);
 
     /* Host bridges aren't hotpluggable. FIXME: spec reference */
     dc->hotpluggable = false;
-    dc->reset = pxb_dev_reset;
+    dc->reset = pxb_cxl_dev_reset;
 }
 
 static const TypeInfo pxb_cxl_dev_info = {
diff --git a/hw/pci-host/mv64361.c b/hw/pci-host/mv64361.c
index 298564f1f5..19e8031a3f 100644
--- a/hw/pci-host/mv64361.c
+++ b/hw/pci-host/mv64361.c
@@ -873,10 +873,6 @@ static void mv64361_realize(DeviceState *dev, Error **errp)
     }
     sysbus_init_irq(SYS_BUS_DEVICE(dev), &s->cpu_irq);
     qdev_init_gpio_in_named(dev, mv64361_gpp_irq, "gpp", 32);
-    /* FIXME: PCI IRQ connections may be board specific */
-    for (i = 0; i < PCI_NUM_PINS; i++) {
-        s->pci[1].irq[i] = qdev_get_gpio_in_named(dev, "gpp", 12 + i);
-    }
 }
 
 static void mv64361_reset(DeviceState *dev)
diff --git a/hw/pci/pci-internal.h b/hw/pci/pci-internal.h
index 2ea356bdf5..a7d6d8a732 100644
--- a/hw/pci/pci-internal.h
+++ b/hw/pci/pci-internal.h
@@ -20,6 +20,5 @@ void pcibus_dev_print(Monitor *mon, DeviceState *dev, int indent);
 
 int pcie_aer_parse_error_string(const char *error_name,
                                 uint32_t *status, bool *correctable);
-int pcie_aer_inject_error(PCIDevice *dev, const PCIEAERErr *err);
 
 #endif
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 034fe49e9a..def5000e7b 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -95,6 +95,21 @@ static const VMStateDescription vmstate_pcibus = {
     }
 };
 
+static gint g_cmp_uint32(gconstpointer a, gconstpointer b, gpointer user_data)
+{
+    return a - b;
+}
+
+static GSequence *pci_acpi_index_list(void)
+{
+    static GSequence *used_acpi_index_list;
+
+    if (!used_acpi_index_list) {
+        used_acpi_index_list = g_sequence_new(NULL);
+    }
+    return used_acpi_index_list;
+}
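The GSequence here acts as a lazily allocated, sorted set of in-use indexes: each acpi-index is stored directly in the pointer slot via GINT_TO_POINTER, which is why g_cmp_uint32() above can order entries by subtracting the raw pointer values.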
+
 static void pci_init_bus_master(PCIDevice *pci_dev)
 {
     AddressSpace *dma_as = pci_device_iommu_address_space(pci_dev);
@@ -1246,6 +1261,17 @@ static void pci_qdev_unrealize(DeviceState *dev)
     do_pci_unregister_device(pci_dev);
 
     pci_dev->msi_trigger = NULL;
+
+    /*
+     * clean up acpi-index so it can be reused by another device
+     */
+    if (pci_dev->acpi_index) {
+        GSequence *used_indexes = pci_acpi_index_list();
+
+        g_sequence_remove(g_sequence_lookup(used_indexes,
+                          GINT_TO_POINTER(pci_dev->acpi_index),
+                          g_cmp_uint32, NULL));
+    }
 }
 
 void pci_register_bar(PCIDevice *pci_dev, int region_num,
@@ -2005,6 +2031,8 @@ PCIDevice *pci_find_device(PCIBus *bus, int bus_num, uint8_t devfn)
     return bus->devices[devfn];
 }
 
+#define ONBOARD_INDEX_MAX (16 * 1024 - 1)
+
 static void pci_qdev_realize(DeviceState *qdev, Error **errp)
 {
     PCIDevice *pci_dev = (PCIDevice *)qdev;
@@ -2014,6 +2042,35 @@ static void pci_qdev_realize(DeviceState *qdev, Error **errp)
     bool is_default_rom;
     uint16_t class_id;
 
+    /*
+     * The value is capped by systemd (see: udev-builtin-net_id.c); as it is
+     * the only known user, honor its limit to keep users from misconfiguring
+     * QEMU and then wondering why acpi-index doesn't work
+     */
+    if (pci_dev->acpi_index > ONBOARD_INDEX_MAX) {
+        error_setg(errp, "acpi-index should be less or equal to %u",
+                   ONBOARD_INDEX_MAX);
+        return;
+    }
+
+    /*
+     * make sure that acpi-index is unique across all present PCI devices
+     */
+    if (pci_dev->acpi_index) {
+        GSequence *used_indexes = pci_acpi_index_list();
+
+        if (g_sequence_lookup(used_indexes,
+                              GINT_TO_POINTER(pci_dev->acpi_index),
+                              g_cmp_uint32, NULL)) {
+            error_setg(errp, "a PCI device with acpi-index = %" PRIu32
+                       " already exist", pci_dev->acpi_index);
+            return;
+        }
+        g_sequence_insert_sorted(used_indexes,
+                                 GINT_TO_POINTER(pci_dev->acpi_index),
+                                 g_cmp_uint32, NULL);
+    }
+
     if (pci_dev->romsize != -1 && !is_power_of_2(pci_dev->romsize)) {
         error_setg(errp, "ROM size %u is not a power of two", pci_dev->romsize);
         return;
diff --git a/hw/pci/pcie_aer.c b/hw/pci/pcie_aer.c
index 9a19be44ae..103667c368 100644
--- a/hw/pci/pcie_aer.c
+++ b/hw/pci/pcie_aer.c
@@ -112,6 +112,10 @@ int pcie_aer_init(PCIDevice *dev, uint8_t cap_ver, uint16_t offset,
 
     pci_set_long(dev->w1cmask + offset + PCI_ERR_UNCOR_STATUS,
                  PCI_ERR_UNC_SUPPORTED);
+    pci_set_long(dev->config + offset + PCI_ERR_UNCOR_MASK,
+                 PCI_ERR_UNC_MASK_DEFAULT);
+    pci_set_long(dev->wmask + offset + PCI_ERR_UNCOR_MASK,
+                 PCI_ERR_UNC_SUPPORTED);
 
     pci_set_long(dev->config + offset + PCI_ERR_UNCOR_SEVER,
                  PCI_ERR_UNC_SEVERITY_DEFAULT);
@@ -188,8 +192,16 @@ static void pcie_aer_update_uncor_status(PCIDevice *dev)
 static bool
 pcie_aer_msg_alldev(PCIDevice *dev, const PCIEAERMsg *msg)
 {
+    uint16_t devctl = pci_get_word(dev->config + dev->exp.exp_cap +
+                                   PCI_EXP_DEVCTL);
     if (!(pcie_aer_msg_is_uncor(msg) &&
-          (pci_get_word(dev->config + PCI_COMMAND) & PCI_COMMAND_SERR))) {
+          (pci_get_word(dev->config + PCI_COMMAND) & PCI_COMMAND_SERR)) &&
+        !((msg->severity == PCI_ERR_ROOT_CMD_NONFATAL_EN) &&
+          (devctl & PCI_EXP_DEVCTL_NFERE)) &&
+        !((msg->severity == PCI_ERR_ROOT_CMD_COR_EN) &&
+          (devctl & PCI_EXP_DEVCTL_CERE)) &&
+        !((msg->severity == PCI_ERR_ROOT_CMD_FATAL_EN) &&
+          (devctl & PCI_EXP_DEVCTL_FERE))) {
         return false;
     }
 
diff --git a/hw/pci/pcie_port.c b/hw/pci/pcie_port.c
index 65a397ad23..20ff2b39e8 100644
--- a/hw/pci/pcie_port.c
+++ b/hw/pci/pcie_port.c
@@ -161,6 +161,51 @@ PCIDevice *pcie_find_port_by_pn(PCIBus *bus, uint8_t pn)
     return NULL;
 }
 
+/* Find first port in devfn number order */
+PCIDevice *pcie_find_port_first(PCIBus *bus)
+{
+    int devfn;
+
+    for (devfn = 0; devfn < ARRAY_SIZE(bus->devices); devfn++) {
+        PCIDevice *d = bus->devices[devfn];
+
+        if (!d || !pci_is_express(d) || !d->exp.exp_cap) {
+            continue;
+        }
+
+        if (object_dynamic_cast(OBJECT(d), TYPE_PCIE_PORT)) {
+            return d;
+        }
+    }
+
+    return NULL;
+}
+
+int pcie_count_ds_ports(PCIBus *bus)
+{
+    int dsp_count = 0;
+    int devfn;
+
+    for (devfn = 0; devfn < ARRAY_SIZE(bus->devices); devfn++) {
+        PCIDevice *d = bus->devices[devfn];
+
+        if (!d || !pci_is_express(d) || !d->exp.exp_cap) {
+            continue;
+        }
+        if (object_dynamic_cast(OBJECT(d), TYPE_PCIE_PORT)) {
+            dsp_count++;
+        }
+    }
+    return dsp_count;
+}
+
+static bool pcie_slot_is_hotpluggable_bus(HotplugHandler *plug_handler,
+                                          BusState *bus)
+{
+    PCIESlot *s = PCIE_SLOT(bus->parent);
+    return s->hotplug;
+}
+
 static const TypeInfo pcie_port_type_info = {
     .name = TYPE_PCIE_PORT,
     .parent = TYPE_PCI_BRIDGE,
@@ -188,6 +233,7 @@ static void pcie_slot_class_init(ObjectClass *oc, void *data)
     hc->plug = pcie_cap_slot_plug_cb;
     hc->unplug = pcie_cap_slot_unplug_cb;
     hc->unplug_request = pcie_cap_slot_unplug_request_cb;
+    hc->is_hotpluggable_bus = pcie_slot_is_hotpluggbale_bus;
 }
 
 static const TypeInfo pcie_slot_type_info = {
diff --git a/hw/pci/pcie_sriov.c b/hw/pci/pcie_sriov.c
index f0bd72e069..aa5a757b11 100644
--- a/hw/pci/pcie_sriov.c
+++ b/hw/pci/pcie_sriov.c
@@ -300,3 +300,8 @@ PCIDevice *pcie_sriov_get_vf_at_index(PCIDevice *dev, int n)
     }
     return NULL;
 }
+
+uint16_t pcie_sriov_num_vfs(PCIDevice *dev)
+{
+    return dev->exp.sriov_pf.num_vfs;
+}
diff --git a/hw/ppc/pegasos2.c b/hw/ppc/pegasos2.c
index 7cc375df05..f1650be5ee 100644
--- a/hw/ppc/pegasos2.c
+++ b/hw/ppc/pegasos2.c
@@ -73,6 +73,8 @@ struct Pegasos2MachineState {
     MachineState parent_obj;
     PowerPCCPU *cpu;
     DeviceState *mv;
+    qemu_irq mv_pirq[PCI_NUM_PINS];
+    qemu_irq via_pirq[PCI_NUM_PINS];
     Vof *vof;
     void *fdt_blob;
     uint64_t kernel_addr;
@@ -95,6 +97,15 @@ static void pegasos2_cpu_reset(void *opaque)
     }
 }
 
+static void pegasos2_pci_irq(void *opaque, int n, int level)
+{
+    Pegasos2MachineState *pm = opaque;
+
+    /* PCI interrupt lines are connected to both MV64361 and VT8231 */
+    qemu_set_irq(pm->mv_pirq[n], level);
+    qemu_set_irq(pm->via_pirq[n], level);
+}
+
 static void pegasos2_init(MachineState *machine)
 {
     Pegasos2MachineState *pm = PEGASOS2_MACHINE(machine);
@@ -106,7 +117,7 @@ static void pegasos2_init(MachineState *machine)
     I2CBus *i2c_bus;
     const char *fwname = machine->firmware ?: PROM_FILENAME;
     char *filename;
-    int sz;
+    int i, sz;
     uint8_t *spd_data;
 
     /* init CPU */
@@ -156,11 +167,18 @@ static void pegasos2_init(MachineState *machine)
     /* Marvell Discovery II system controller */
     pm->mv = DEVICE(sysbus_create_simple(TYPE_MV64361, -1,
                           qdev_get_gpio_in(DEVICE(pm->cpu), PPC6xx_INPUT_INT)));
+    for (i = 0; i < PCI_NUM_PINS; i++) {
+        pm->mv_pirq[i] = qdev_get_gpio_in_named(pm->mv, "gpp", 12 + i);
+    }
     pci_bus = mv64361_get_pci_bus(pm->mv, 1);
+    pci_bus_irqs(pci_bus, pegasos2_pci_irq, pm, PCI_NUM_PINS);
 
     /* VIA VT8231 South Bridge (multifunction PCI device) */
     via = OBJECT(pci_create_simple_multifunction(pci_bus, PCI_DEVFN(12, 0),
                                                  true, TYPE_VT8231_ISA));
+    for (i = 0; i < PCI_NUM_PINS; i++) {
+        pm->via_pirq[i] = qdev_get_gpio_in_named(DEVICE(via), "pirq", i);
+    }
     object_property_add_alias(OBJECT(machine), "rtc-time",
                               object_resolve_path_component(via, "rtc"),
                               "date");
@@ -267,6 +285,12 @@ static void pegasos2_machine_reset(MachineState *machine, ShutdownCause reason)
                               PCI_INTERRUPT_LINE, 2, 0x9);
     pegasos2_pci_config_write(pm, 1, (PCI_DEVFN(12, 0) << 8) |
                               0x50, 1, 0x2);
+    pegasos2_pci_config_write(pm, 1, (PCI_DEVFN(12, 0) << 8) |
+                              0x55, 1, 0x90);
+    pegasos2_pci_config_write(pm, 1, (PCI_DEVFN(12, 0) << 8) |
+                              0x56, 1, 0x99);
+    pegasos2_pci_config_write(pm, 1, (PCI_DEVFN(12, 0) << 8) |
+                              0x57, 1, 0x90);
 
     pegasos2_pci_config_write(pm, 1, (PCI_DEVFN(12, 1) << 8) |
                               PCI_INTERRUPT_LINE, 2, 0x109);
diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
index 925ff523cc..ec4def62f8 100644
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -8,6 +8,7 @@
 #include "qemu/module.h"
 #include "qemu/error-report.h"
 #include "exec/exec-all.h"
+#include "exec/tb-flush.h"
 #include "helper_regs.h"
 #include "hw/ppc/ppc.h"
 #include "hw/ppc/spapr.h"
diff --git a/hw/riscv/Kconfig b/hw/riscv/Kconfig
index 4550b3b938..6528ebfa3a 100644
--- a/hw/riscv/Kconfig
+++ b/hw/riscv/Kconfig
@@ -44,6 +44,7 @@ config RISCV_VIRT
     select VIRTIO_MMIO
     select FW_CFG_DMA
     select PLATFORM_BUS
+    select ACPI
 
 config SHAKTI_C
     bool
diff --git a/hw/riscv/meson.build b/hw/riscv/meson.build
index ab6cae57ea..2f7ee81be3 100644
--- a/hw/riscv/meson.build
+++ b/hw/riscv/meson.build
@@ -9,5 +9,6 @@ riscv_ss.add(when: 'CONFIG_SIFIVE_E', if_true: files('sifive_e.c'))
 riscv_ss.add(when: 'CONFIG_SIFIVE_U', if_true: files('sifive_u.c'))
 riscv_ss.add(when: 'CONFIG_SPIKE', if_true: files('spike.c'))
 riscv_ss.add(when: 'CONFIG_MICROCHIP_PFSOC', if_true: files('microchip_pfsoc.c'))
+riscv_ss.add(when: 'CONFIG_ACPI', if_true: files('virt-acpi-build.c'))
 
 hw_arch += {'riscv': riscv_ss}
diff --git a/hw/riscv/virt-acpi-build.c b/hw/riscv/virt-acpi-build.c
new file mode 100644
index 0000000000..82da0a238c
--- /dev/null
+++ b/hw/riscv/virt-acpi-build.c
@@ -0,0 +1,416 @@
+/*
+ * Support for generating ACPI tables and passing them to Guests
+ *
+ * RISC-V virt ACPI generation
+ *
+ * Copyright (C) 2008-2010  Kevin O'Connor <kevin@koconnor.net>
+ * Copyright (C) 2006 Fabrice Bellard
+ * Copyright (C) 2013 Red Hat Inc
+ * Copyright (c) 2015 HUAWEI TECHNOLOGIES CO.,LTD.
+ * Copyright (C) 2021-2023 Ventana Micro Systems Inc
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "hw/acpi/acpi-defs.h"
+#include "hw/acpi/acpi.h"
+#include "hw/acpi/aml-build.h"
+#include "hw/acpi/utils.h"
+#include "qapi/error.h"
+#include "sysemu/reset.h"
+#include "migration/vmstate.h"
+#include "hw/riscv/virt.h"
+#include "hw/riscv/numa.h"
+#include "hw/intc/riscv_aclint.h"
+
+#define ACPI_BUILD_TABLE_SIZE             0x20000
+
+typedef struct AcpiBuildState {
+    /* Copy of table in RAM (for patching) */
+    MemoryRegion *table_mr;
+    MemoryRegion *rsdp_mr;
+    MemoryRegion *linker_mr;
+    /* Is table patched? */
+    bool patched;
+} AcpiBuildState;
+
+static void acpi_align_size(GArray *blob, unsigned align)
+{
+    /*
+     * Align size to multiple of given size. This reduces the chance
+     * we need to change size in the future (breaking cross version migration).
+     */
+    g_array_set_size(blob, ROUND_UP(acpi_data_len(blob), align));
+}
+
+static void riscv_acpi_madt_add_rintc(uint32_t uid,
+                                      const CPUArchIdList *arch_ids,
+                                      GArray *entry)
+{
+    uint64_t hart_id = arch_ids->cpus[uid].arch_id;
+
+    build_append_int_noprefix(entry, 0x18, 1);       /* Type     */
+    build_append_int_noprefix(entry, 20, 1);         /* Length   */
+    build_append_int_noprefix(entry, 1, 1);          /* Version  */
+    build_append_int_noprefix(entry, 0, 1);          /* Reserved */
+    build_append_int_noprefix(entry, 0x1, 4);        /* Flags    */
+    build_append_int_noprefix(entry, hart_id, 8);    /* Hart ID  */
+    build_append_int_noprefix(entry, uid, 4);        /* ACPI Processor UID */
+}
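As a consistency check, the declared structure length of 20 matches the fields emitted: 1 (type) + 1 (length) + 1 (version) + 1 (reserved) + 4 (flags) + 8 (hart ID) + 4 (ACPI processor UID) = 20 bytes.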
+
+static void acpi_dsdt_add_cpus(Aml *scope, RISCVVirtState *s)
+{
+    MachineClass *mc = MACHINE_GET_CLASS(s);
+    MachineState *ms = MACHINE(s);
+    const CPUArchIdList *arch_ids = mc->possible_cpu_arch_ids(ms);
+
+    for (int i = 0; i < arch_ids->len; i++) {
+            Aml *dev;
+            GArray *madt_buf = g_array_new(0, 1, 1);
+
+            dev = aml_device("C%.03X", i);
+            aml_append(dev, aml_name_decl("_HID", aml_string("ACPI0007")));
+            aml_append(dev, aml_name_decl("_UID",
+                       aml_int(arch_ids->cpus[i].arch_id)));
+
+            /* build _MAT object */
+            riscv_acpi_madt_add_rintc(i, arch_ids, madt_buf);
+            aml_append(dev, aml_name_decl("_MAT",
+                                          aml_buffer(madt_buf->len,
+                                          (uint8_t *)madt_buf->data)));
+            g_array_free(madt_buf, true);
+
+            aml_append(scope, dev);
+    }
+}
+
+static void acpi_dsdt_add_fw_cfg(Aml *scope, const MemMapEntry *fw_cfg_memmap)
+{
+    Aml *dev = aml_device("FWCF");
+    aml_append(dev, aml_name_decl("_HID", aml_string("QEMU0002")));
+
+    /* device present, functioning, decoding, not shown in UI */
+    aml_append(dev, aml_name_decl("_STA", aml_int(0xB)));
+    aml_append(dev, aml_name_decl("_CCA", aml_int(1)));
+
+    Aml *crs = aml_resource_template();
+    aml_append(crs, aml_memory32_fixed(fw_cfg_memmap->base,
+                                       fw_cfg_memmap->size, AML_READ_WRITE));
+    aml_append(dev, aml_name_decl("_CRS", crs));
+    aml_append(scope, dev);
+}
+
+/* RHCT Node[N] starts at offset 56 */
+#define RHCT_NODE_ARRAY_OFFSET 56
+
+/*
+ * ACPI spec, Revision 6.5+
+ * 5.2.36 RISC-V Hart Capabilities Table (RHCT)
+ * REF: https://github.com/riscv-non-isa/riscv-acpi/issues/16
+ *      https://drive.google.com/file/d/1nP3nFiH4jkPMp6COOxP6123DCZKR-tia/view
+ */
+static void build_rhct(GArray *table_data,
+                       BIOSLinker *linker,
+                       RISCVVirtState *s)
+{
+    MachineClass *mc = MACHINE_GET_CLASS(s);
+    MachineState *ms = MACHINE(s);
+    const CPUArchIdList *arch_ids = mc->possible_cpu_arch_ids(ms);
+    size_t len, aligned_len;
+    uint32_t isa_offset, num_rhct_nodes;
+    RISCVCPU *cpu;
+    char *isa;
+
+    AcpiTable table = { .sig = "RHCT", .rev = 1, .oem_id = s->oem_id,
+                        .oem_table_id = s->oem_table_id };
+
+    acpi_table_begin(&table, table_data);
+
+    build_append_int_noprefix(table_data, 0x0, 4);   /* Reserved */
+
+    /* Time Base Frequency */
+    build_append_int_noprefix(table_data,
+                              RISCV_ACLINT_DEFAULT_TIMEBASE_FREQ, 8);
+
+    /* ISA + N hart info */
+    num_rhct_nodes = 1 + ms->smp.cpus;
+
+    /* Number of RHCT nodes */
+    build_append_int_noprefix(table_data, num_rhct_nodes, 4);
+
+    /* Offset to the RHCT node array */
+    build_append_int_noprefix(table_data, RHCT_NODE_ARRAY_OFFSET, 4);
+
+    /* ISA String Node */
+    isa_offset = table_data->len - table.table_offset;
+    build_append_int_noprefix(table_data, 0, 2);   /* Type 0 */
+
+    cpu = &s->soc[0].harts[0];
+    isa = riscv_isa_string(cpu);
+    len = 8 + strlen(isa) + 1;
+    aligned_len = (len % 2) ? (len + 1) : len;
+
+    build_append_int_noprefix(table_data, aligned_len, 2);   /* Length */
+    build_append_int_noprefix(table_data, 0x1, 2);           /* Revision */
+
+    /* ISA string length including NUL */
+    build_append_int_noprefix(table_data, strlen(isa) + 1, 2);
+    g_array_append_vals(table_data, isa, strlen(isa) + 1);   /* ISA string */
+
+    if (aligned_len != len) {
+        build_append_int_noprefix(table_data, 0x0, 1);   /* Optional Padding */
+    }
+
+    /* Hart Info Node */
+    for (int i = 0; i < arch_ids->len; i++) {
+        build_append_int_noprefix(table_data, 0xFFFF, 2);  /* Type */
+        build_append_int_noprefix(table_data, 16, 2);      /* Length */
+        build_append_int_noprefix(table_data, 0x1, 2);     /* Revision */
+        build_append_int_noprefix(table_data, 1, 2);    /* Number of offsets */
+        build_append_int_noprefix(table_data, i, 4);    /* ACPI Processor UID */
+        build_append_int_noprefix(table_data, isa_offset, 4); /* Offsets[0] */
+    }
+
+    acpi_table_end(linker, &table);
+}
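The RHCT_NODE_ARRAY_OFFSET of 56 used above can be verified from the layout: a standard 36-byte ACPI table header plus 4 (reserved) + 8 (time base frequency) + 4 (node count) + 4 (node array offset) = 56 bytes.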
+
+/* FADT */
+static void build_fadt_rev6(GArray *table_data,
+                            BIOSLinker *linker,
+                            RISCVVirtState *s,
+                            unsigned dsdt_tbl_offset)
+{
+    AcpiFadtData fadt = {
+        .rev = 6,
+        .minor_ver = 5,
+        .flags = 1 << ACPI_FADT_F_HW_REDUCED_ACPI,
+        .xdsdt_tbl_offset = &dsdt_tbl_offset,
+    };
+
+    build_fadt(table_data, linker, &fadt, s->oem_id, s->oem_table_id);
+}
+
+/* DSDT */
+static void build_dsdt(GArray *table_data,
+                       BIOSLinker *linker,
+                       RISCVVirtState *s)
+{
+    Aml *scope, *dsdt;
+    const MemMapEntry *memmap = s->memmap;
+    AcpiTable table = { .sig = "DSDT", .rev = 2, .oem_id = s->oem_id,
+                        .oem_table_id = s->oem_table_id };
+
+
+    dsdt = init_aml_allocator();
+
+    /*
+     * When booting the VM with UEFI, UEFI takes ownership of the RTC hardware.
+     * While UEFI can use libfdt to disable the RTC device node in the DTB that
+     * it passes to the OS, it cannot modify AML. Therefore, we won't generate
+     * the RTC ACPI device at all when using UEFI.
+     */
+    scope = aml_scope("\\_SB");
+    acpi_dsdt_add_cpus(scope, s);
+
+    acpi_dsdt_add_fw_cfg(scope, &memmap[VIRT_FW_CFG]);
+
+    aml_append(dsdt, scope);
+
+    /* copy AML table into ACPI tables blob and patch header there */
+    g_array_append_vals(table_data, dsdt->buf->data, dsdt->buf->len);
+
+    acpi_table_end(linker, &table);
+    free_aml_allocator();
+}
+
+/*
+ * ACPI spec, Revision 6.5+
+ * 5.2.12 Multiple APIC Description Table (MADT)
+ * REF: https://github.com/riscv-non-isa/riscv-acpi/issues/15
+ *      https://drive.google.com/file/d/1R6k4MshhN3WTT-hwqAquu5nX6xSEqK2l/view
+ */
+static void build_madt(GArray *table_data,
+                       BIOSLinker *linker,
+                       RISCVVirtState *s)
+{
+    MachineClass *mc = MACHINE_GET_CLASS(s);
+    MachineState *ms = MACHINE(s);
+    const CPUArchIdList *arch_ids = mc->possible_cpu_arch_ids(ms);
+
+    AcpiTable table = { .sig = "APIC", .rev = 6, .oem_id = s->oem_id,
+                        .oem_table_id = s->oem_table_id };
+
+    acpi_table_begin(&table, table_data);
+    /* Local Interrupt Controller Address */
+    build_append_int_noprefix(table_data, 0, 4);
+    build_append_int_noprefix(table_data, 0, 4);   /* MADT Flags */
+
+    /* RISC-V Local INTC structures per HART */
+    for (int i = 0; i < arch_ids->len; i++) {
+        riscv_acpi_madt_add_rintc(i, arch_ids, table_data);
+    }
+
+    acpi_table_end(linker, &table);
+}
+
+static void virt_acpi_build(RISCVVirtState *s, AcpiBuildTables *tables)
+{
+    GArray *table_offsets;
+    unsigned dsdt, xsdt;
+    GArray *tables_blob = tables->table_data;
+
+    table_offsets = g_array_new(false, true,
+                                sizeof(uint32_t));
+
+    bios_linker_loader_alloc(tables->linker,
+                             ACPI_BUILD_TABLE_FILE, tables_blob,
+                             64, false);
+
+    /* DSDT is pointed to by FADT */
+    dsdt = tables_blob->len;
+    build_dsdt(tables_blob, tables->linker, s);
+
+    /* FADT and others pointed to by XSDT */
+    acpi_add_table(table_offsets, tables_blob);
+    build_fadt_rev6(tables_blob, tables->linker, s, dsdt);
+
+    acpi_add_table(table_offsets, tables_blob);
+    build_madt(tables_blob, tables->linker, s);
+
+    acpi_add_table(table_offsets, tables_blob);
+    build_rhct(tables_blob, tables->linker, s);
+
+    /* XSDT is pointed to by RSDP */
+    xsdt = tables_blob->len;
+    build_xsdt(tables_blob, tables->linker, table_offsets, s->oem_id,
+                s->oem_table_id);
+
+    /* RSDP is in FSEG memory, so allocate it separately */
+    {
+        AcpiRsdpData rsdp_data = {
+            .revision = 2,
+            .oem_id = s->oem_id,
+            .xsdt_tbl_offset = &xsdt,
+            .rsdt_tbl_offset = NULL,
+        };
+        build_rsdp(tables->rsdp, tables->linker, &rsdp_data);
+    }
+
+    /*
+     * The blob is aligned to ACPI_BUILD_TABLE_SIZE (128 KiB); warn once it
+     * grows past half of that so the alignment size can be increased before
+     * cross-version migration breaks.
+     */
+    if (tables_blob->len > ACPI_BUILD_TABLE_SIZE / 2) {
+        warn_report("ACPI table size %u exceeds %d bytes,"
+                    " migration may not work",
+                    tables_blob->len, ACPI_BUILD_TABLE_SIZE / 2);
+        error_printf("Try removing some objects.");
+    }
+
+    acpi_align_size(tables_blob, ACPI_BUILD_TABLE_SIZE);
+
+    /* Clean up memory that's no longer used */
+    g_array_free(table_offsets, true);
+}
+
+static void acpi_ram_update(MemoryRegion *mr, GArray *data)
+{
+    uint32_t size = acpi_data_len(data);
+
+    /*
+     * Make sure RAM size is correct - in case it got changed
+     * e.g. by migration
+     */
+    memory_region_ram_resize(mr, size, &error_abort);
+
+    memcpy(memory_region_get_ram_ptr(mr), data->data, size);
+    memory_region_set_dirty(mr, 0, size);
+}
+
+static void virt_acpi_build_update(void *build_opaque)
+{
+    AcpiBuildState *build_state = build_opaque;
+    AcpiBuildTables tables;
+
+    /* No state to update or already patched? Nothing to do. */
+    if (!build_state || build_state->patched) {
+        return;
+    }
+
+    build_state->patched = true;
+
+    acpi_build_tables_init(&tables);
+
+    virt_acpi_build(RISCV_VIRT_MACHINE(qdev_get_machine()), &tables);
+
+    acpi_ram_update(build_state->table_mr, tables.table_data);
+    acpi_ram_update(build_state->rsdp_mr, tables.rsdp);
+    acpi_ram_update(build_state->linker_mr, tables.linker->cmd_blob);
+
+    acpi_build_tables_cleanup(&tables, true);
+}
+
+static void virt_acpi_build_reset(void *build_opaque)
+{
+    AcpiBuildState *build_state = build_opaque;
+    build_state->patched = false;
+}
+
+static const VMStateDescription vmstate_virt_acpi_build = {
+    .name = "virt_acpi_build",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .fields = (VMStateField[]) {
+        VMSTATE_BOOL(patched, AcpiBuildState),
+        VMSTATE_END_OF_LIST()
+    },
+};
+
+void virt_acpi_setup(RISCVVirtState *s)
+{
+    AcpiBuildTables tables;
+    AcpiBuildState *build_state;
+
+    build_state = g_malloc0(sizeof *build_state);
+
+    acpi_build_tables_init(&tables);
+    virt_acpi_build(s, &tables);
+
+    /* Now expose it all to Guest */
+    build_state->table_mr = acpi_add_rom_blob(virt_acpi_build_update,
+                                              build_state, tables.table_data,
+                                              ACPI_BUILD_TABLE_FILE);
+    assert(build_state->table_mr != NULL);
+
+    build_state->linker_mr = acpi_add_rom_blob(virt_acpi_build_update,
+                                               build_state,
+                                               tables.linker->cmd_blob,
+                                               ACPI_BUILD_LOADER_FILE);
+
+    build_state->rsdp_mr = acpi_add_rom_blob(virt_acpi_build_update,
+                                             build_state, tables.rsdp,
+                                             ACPI_BUILD_RSDP_FILE);
+
+    qemu_register_reset(virt_acpi_build_reset, build_state);
+    virt_acpi_build_reset(build_state);
+    vmstate_register(NULL, 0, &vmstate_virt_acpi_build, build_state);
+
+    /*
+     * Clean up tables but don't free the memory: we track it
+     * in build_state.
+     */
+    acpi_build_tables_cleanup(&tables, false);
+}
diff --git a/hw/riscv/virt.c b/hw/riscv/virt.c
index 4f8191860b..4e3efbee16 100644
--- a/hw/riscv/virt.c
+++ b/hw/riscv/virt.c
@@ -49,6 +49,8 @@
 #include "hw/pci/pci.h"
 #include "hw/pci-host/gpex.h"
 #include "hw/display/ramfb.h"
+#include "hw/acpi/aml-build.h"
+#include "qapi/qapi-visit-common.h"
 
 /*
  * The virt machine physical address space used by some of the devices
@@ -228,8 +230,9 @@ static void create_fdt_socket_cpus(RISCVVirtState *s, int socket,
     int cpu;
     uint32_t cpu_phandle;
     MachineState *ms = MACHINE(s);
-    char *name, *cpu_name, *core_name, *intc_name;
+    char *name, *cpu_name, *core_name, *intc_name, *sv_name;
     bool is_32_bit = riscv_is_32bit(&s->soc[0]);
+    uint8_t satp_mode_max;
 
     for (cpu = s->soc[socket].num_harts - 1; cpu >= 0; cpu--) {
         RISCVCPU *cpu_ptr = &s->soc[socket].harts[cpu];
@@ -239,16 +242,29 @@ static void create_fdt_socket_cpus(RISCVVirtState *s, int socket,
         cpu_name = g_strdup_printf("/cpus/cpu@%d",
             s->soc[socket].hartid_base + cpu);
         qemu_fdt_add_subnode(ms->fdt, cpu_name);
-        if (cpu_ptr->cfg.mmu) {
-            qemu_fdt_setprop_string(ms->fdt, cpu_name, "mmu-type",
-                                    (is_32_bit) ? "riscv,sv32" : "riscv,sv48");
-        } else {
-            qemu_fdt_setprop_string(ms->fdt, cpu_name, "mmu-type",
-                                    "riscv,none");
-        }
+
+        satp_mode_max = satp_mode_max_from_map(
+            s->soc[socket].harts[cpu].cfg.satp_mode.map);
+        sv_name = g_strdup_printf("riscv,%s",
+                                  satp_mode_str(satp_mode_max, is_32_bit));
+        qemu_fdt_setprop_string(ms->fdt, cpu_name, "mmu-type", sv_name);
+        g_free(sv_name);
+
         name = riscv_isa_string(cpu_ptr);
         qemu_fdt_setprop_string(ms->fdt, cpu_name, "riscv,isa", name);
         g_free(name);
+
+        if (cpu_ptr->cfg.ext_icbom) {
+            qemu_fdt_setprop_cell(ms->fdt, cpu_name, "riscv,cbom-block-size",
+                                  cpu_ptr->cfg.cbom_blocksize);
+        }
+
+        if (cpu_ptr->cfg.ext_icboz) {
+            qemu_fdt_setprop_cell(ms->fdt, cpu_name, "riscv,cboz-block-size",
+                                  cpu_ptr->cfg.cboz_blocksize);
+        }
+
         qemu_fdt_setprop_string(ms->fdt, cpu_name, "compatible", "riscv");
         qemu_fdt_setprop_string(ms->fdt, cpu_name, "status", "okay");
         qemu_fdt_setprop_cell(ms->fdt, cpu_name, "reg",
@@ -1307,6 +1323,10 @@ static void virt_machine_done(Notifier *notifier, void *data)
     if (kvm_enabled()) {
         riscv_setup_direct_kernel(kernel_entry, fdt_load_addr);
     }
+
+    if (virt_is_acpi_enabled(s)) {
+        virt_acpi_setup(s);
+    }
 }
 
 static void virt_machine_init(MachineState *machine)
@@ -1442,6 +1462,8 @@ static void virt_machine_init(MachineState *machine)
             ROUND_UP(virt_high_pcie_memmap.base, virt_high_pcie_memmap.size);
     }
 
+    s->memmap = virt_memmap;
+
     /* register system main memory (actual RAM) */
     memory_region_add_subregion(system_memory, memmap[VIRT_DRAM].base,
         machine->ram);
@@ -1514,6 +1536,11 @@ static void virt_machine_init(MachineState *machine)
 
 static void virt_machine_instance_init(Object *obj)
 {
+    RISCVVirtState *s = RISCV_VIRT_MACHINE(obj);
+
+    s->oem_id = g_strndup(ACPI_BUILD_APPNAME6, 6);
+    s->oem_table_id = g_strndup(ACPI_BUILD_APPNAME8, 8);
+    s->acpi = ON_OFF_AUTO_AUTO;
 }
 
 static char *virt_get_aia_guests(Object *obj, Error **errp)
@@ -1588,6 +1615,28 @@ static void virt_set_aclint(Object *obj, bool value, Error **errp)
     s->have_aclint = value;
 }
 
+bool virt_is_acpi_enabled(RISCVVirtState *s)
+{
+    return s->acpi != ON_OFF_AUTO_OFF;
+}
+
+static void virt_get_acpi(Object *obj, Visitor *v, const char *name,
+                          void *opaque, Error **errp)
+{
+    RISCVVirtState *s = RISCV_VIRT_MACHINE(obj);
+    OnOffAuto acpi = s->acpi;
+
+    visit_type_OnOffAuto(v, name, &acpi, errp);
+}
+
+static void virt_set_acpi(Object *obj, Visitor *v, const char *name,
+                          void *opaque, Error **errp)
+{
+    RISCVVirtState *s = RISCV_VIRT_MACHINE(obj);
+
+    visit_type_OnOffAuto(v, name, &s->acpi, errp);
+}
+
 static HotplugHandler *virt_machine_get_hotplug_handler(MachineState *machine,
                                                         DeviceState *dev)
 {
@@ -1659,6 +1708,11 @@ static void virt_machine_class_init(ObjectClass *oc, void *data)
     sprintf(str, "Set number of guest MMIO pages for AIA IMSIC. Valid value "
                  "should be between 0 and %d.", VIRT_IRQCHIP_MAX_GUESTS);
     object_class_property_set_description(oc, "aia-guests", str);
+    object_class_property_add(oc, "acpi", "OnOffAuto",
+                              virt_get_acpi, virt_set_acpi,
+                              NULL, NULL);
+    object_class_property_set_description(oc, "acpi",
+                                          "Enable ACPI");
 }
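With this property registered, ACPI defaults to enabled (ON_OFF_AUTO_AUTO; see virt_is_acpi_enabled() above) and can be turned off per machine. A minimal sketch, with the remaining options elided:

    qemu-system-riscv64 -M virt,acpi=off ...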
 
 static const TypeInfo virt_machine_typeinfo = {
diff --git a/hw/usb/hcd-ohci.c b/hw/usb/hcd-ohci.c
index 6f8b543243..88d2b4b13c 100644
--- a/hw/usb/hcd-ohci.c
+++ b/hw/usb/hcd-ohci.c
@@ -1410,6 +1410,18 @@ static void ohci_set_hub_status(OHCIState *ohci, uint32_t val)
     }
 }
 
+/* This is the one state transition the controller can do by itself */
+static bool ohci_resume(OHCIState *s)
+{
+    if ((s->ctl & OHCI_CTL_HCFS) == OHCI_USB_SUSPEND) {
+        trace_usb_ohci_remote_wakeup(s->name);
+        s->ctl &= ~OHCI_CTL_HCFS;
+        s->ctl |= OHCI_USB_RESUME;
+        return true;
+    }
+    return false;
+}
+
 /*
  * Sets a flag in a port status reg but only set it if the port is connected.
  * If not set ConnectStatusChange flag. If flag is enabled return 1.
@@ -1426,7 +1438,10 @@ static int ohci_port_set_if_connected(OHCIState *ohci, int i, uint32_t val)
     if (!(ohci->rhport[i].ctrl & OHCI_PORT_CCS)) {
         ohci->rhport[i].ctrl |= OHCI_PORT_CSC;
         if (ohci->rhstatus & OHCI_RHS_DRWE) {
-            /* TODO: CSC is a wakeup event */
+            /* CSC is a wakeup event */
+            if (ohci_resume(ohci)) {
+                ohci_set_interrupt(ohci, OHCI_INTR_RD);
+            }
         }
         return 0;
     }
@@ -1828,11 +1843,7 @@ static void ohci_wakeup(USBPort *port1)
         intr = OHCI_INTR_RHSC;
     }
     /* Note that the controller can be suspended even if this port is not */
-    if ((s->ctl & OHCI_CTL_HCFS) == OHCI_USB_SUSPEND) {
-        trace_usb_ohci_remote_wakeup(s->name);
-        /* This is the one state transition the controller can do by itself */
-        s->ctl &= ~OHCI_CTL_HCFS;
-        s->ctl |= OHCI_USB_RESUME;
+    if (ohci_resume(s)) {
         /*
          * In suspend mode only ResumeDetected is possible, not RHSC:
          * see the OHCI spec 5.1.2.3.
diff --git a/hw/usb/meson.build b/hw/usb/meson.build
index bdf34cbd3e..599dc24f0d 100644
--- a/hw/usb/meson.build
+++ b/hw/usb/meson.build
@@ -84,6 +84,6 @@ if libusb.found()
   hw_usb_modules += {'host': usbhost_ss}
 endif
 
-softmmu_ss.add(when: ['CONFIG_USB', 'CONFIG_XEN', libusb], if_true: files('xen-usb.c'))
+softmmu_ss.add(when: ['CONFIG_USB', 'CONFIG_XEN_BUS', libusb], if_true: files('xen-usb.c'))
 
 modules += { 'hw-usb': hw_usb_modules }
diff --git a/hw/usb/vt82c686-uhci-pci.c b/hw/usb/vt82c686-uhci-pci.c
index 46a901f56f..b4884c9011 100644
--- a/hw/usb/vt82c686-uhci-pci.c
+++ b/hw/usb/vt82c686-uhci-pci.c
@@ -1,17 +1,7 @@
 #include "qemu/osdep.h"
-#include "hw/irq.h"
 #include "hw/isa/vt82c686.h"
 #include "hcd-uhci.h"
 
-static void uhci_isa_set_irq(void *opaque, int irq_num, int level)
-{
-    UHCIState *s = opaque;
-    uint8_t irq = pci_get_byte(s->dev.config + PCI_INTERRUPT_LINE);
-    if (irq > 0 && irq < 15) {
-        via_isa_set_irq(pci_get_function_0(&s->dev), irq, level);
-    }
-}
-
 static void usb_uhci_vt82c686b_realize(PCIDevice *dev, Error **errp)
 {
     UHCIState *s = UHCI(dev);
@@ -25,8 +15,6 @@ static void usb_uhci_vt82c686b_realize(PCIDevice *dev, Error **errp)
     pci_set_long(pci_conf + 0xc0, 0x00002000);
 
     usb_uhci_common_realize(dev, errp);
-    object_unref(s->irq);
-    s->irq = qemu_allocate_irq(uhci_isa_set_irq, s, 0);
 }
 
 static UHCIInfo uhci_info[] = {
diff --git a/hw/usb/xen-usb.c b/hw/usb/xen-usb.c
index 0f7369e7ed..66cb3f7c24 100644
--- a/hw/usb/xen-usb.c
+++ b/hw/usb/xen-usb.c
@@ -101,6 +101,8 @@ struct usbback_hotplug {
 struct usbback_info {
     struct XenLegacyDevice         xendev;  /* must be first */
     USBBus                   bus;
+    uint32_t                 urb_ring_ref;
+    uint32_t                 conn_ring_ref;
     void                     *urb_sring;
     void                     *conn_sring;
     struct usbif_urb_back_ring urb_ring;
@@ -159,7 +161,7 @@ static int usbback_gnttab_map(struct usbback_req *usbback_req)
 
     for (i = 0; i < nr_segs; i++) {
         if ((unsigned)usbback_req->req.seg[i].offset +
-            (unsigned)usbback_req->req.seg[i].length > XC_PAGE_SIZE) {
+            (unsigned)usbback_req->req.seg[i].length > XEN_PAGE_SIZE) {
             xen_pv_printf(xendev, 0, "segment crosses page boundary\n");
             return -EINVAL;
         }
@@ -183,7 +185,7 @@ static int usbback_gnttab_map(struct usbback_req *usbback_req)
 
         for (i = 0; i < usbback_req->nr_buffer_segs; i++) {
             seg = usbback_req->req.seg + i;
-            addr = usbback_req->buffer + i * XC_PAGE_SIZE + seg->offset;
+            addr = usbback_req->buffer + i * XEN_PAGE_SIZE + seg->offset;
             qemu_iovec_add(&usbback_req->packet.iov, addr, seg->length);
         }
     }
@@ -277,10 +279,11 @@ static int usbback_init_packet(struct usbback_req *usbback_req)
 static void usbback_do_response(struct usbback_req *usbback_req, int32_t status,
                                 int32_t actual_length, int32_t error_count)
 {
+    uint32_t ref[USBIF_MAX_SEGMENTS_PER_REQUEST];
     struct usbback_info *usbif;
     struct usbif_urb_response *res;
     struct XenLegacyDevice *xendev;
-    unsigned int notify;
+    unsigned int notify, i;
 
     usbif = usbback_req->usbif;
     xendev = &usbif->xendev;
@@ -293,13 +296,19 @@ static void usbback_do_response(struct usbback_req *usbback_req, int32_t status,
     }
 
     if (usbback_req->buffer) {
-        xen_be_unmap_grant_refs(xendev, usbback_req->buffer,
+        for (i = 0; i < usbback_req->nr_buffer_segs; i++) {
+            ref[i] = usbback_req->req.seg[i].gref;
+        }
+        xen_be_unmap_grant_refs(xendev, usbback_req->buffer, ref,
                                 usbback_req->nr_buffer_segs);
         usbback_req->buffer = NULL;
     }
 
     if (usbback_req->isoc_buffer) {
-        xen_be_unmap_grant_refs(xendev, usbback_req->isoc_buffer,
+        for (i = 0; i < usbback_req->nr_extra_segs; i++) {
+            ref[i] = usbback_req->req.seg[i + usbback_req->req.nr_buffer_segs].gref;
+        }
+        xen_be_unmap_grant_refs(xendev, usbback_req->isoc_buffer, ref,
                                 usbback_req->nr_extra_segs);
         usbback_req->isoc_buffer = NULL;
     }
@@ -832,11 +841,11 @@ static void usbback_disconnect(struct XenLegacyDevice *xendev)
     xen_pv_unbind_evtchn(xendev);
 
     if (usbif->urb_sring) {
-        xen_be_unmap_grant_ref(xendev, usbif->urb_sring);
+        xen_be_unmap_grant_ref(xendev, usbif->urb_sring, usbif->urb_ring_ref);
         usbif->urb_sring = NULL;
     }
     if (usbif->conn_sring) {
-        xen_be_unmap_grant_ref(xendev, usbif->conn_sring);
+        xen_be_unmap_grant_ref(xendev, usbif->conn_sring, usbif->conn_ring_ref);
         usbif->conn_sring = NULL;
     }
 
@@ -889,10 +898,12 @@ static int usbback_connect(struct XenLegacyDevice *xendev)
         return -1;
     }
 
+    usbif->urb_ring_ref = urb_ring_ref;
+    usbif->conn_ring_ref = conn_ring_ref;
     urb_sring = usbif->urb_sring;
     conn_sring = usbif->conn_sring;
-    BACK_RING_INIT(&usbif->urb_ring, urb_sring, XC_PAGE_SIZE);
-    BACK_RING_INIT(&usbif->conn_ring, conn_sring, XC_PAGE_SIZE);
+    BACK_RING_INIT(&usbif->urb_ring, urb_sring, XEN_PAGE_SIZE);
+    BACK_RING_INIT(&usbif->conn_ring, conn_sring, XEN_PAGE_SIZE);
 
     xen_be_bind_evtchn(xendev);
 
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index bab83c0e55..4d01ea3515 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -42,6 +42,7 @@
 #include "migration/migration.h"
 #include "migration/misc.h"
 #include "migration/blocker.h"
+#include "migration/qemu-file.h"
 #include "sysemu/tpm.h"
 
 VFIOGroupList vfio_group_list =
@@ -319,6 +320,28 @@ const MemoryRegionOps vfio_region_ops = {
  * Device state interfaces
  */
 
+typedef struct {
+    unsigned long *bitmap;
+    hwaddr size;
+    hwaddr pages;
+} VFIOBitmap;
+
+static int vfio_bitmap_alloc(VFIOBitmap *vbmap, hwaddr size)
+{
+    vbmap->pages = REAL_HOST_PAGE_ALIGN(size) / qemu_real_host_page_size();
+    vbmap->size = ROUND_UP(vbmap->pages, sizeof(__u64) * BITS_PER_BYTE) /
+                                         BITS_PER_BYTE;
+    vbmap->bitmap = g_try_malloc0(vbmap->size);
+    if (!vbmap->bitmap) {
+        return -ENOMEM;
+    }
+
+    return 0;
+}
+
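The arithmetic in vfio_bitmap_alloc() is easier to follow with concrete numbers: one bit per host page, with the bit count rounded up to whole 64-bit words. A minimal standalone sketch of the same sizing, assuming a 4 KiB host page size (the macros below stand in for QEMU's helpers):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SIZE     4096ULL   /* assumed host page size */
    #define BITS_PER_BYTE 8ULL

    int main(void)
    {
        uint64_t size  = 0x100000;                            /* 1 MiB range */
        uint64_t pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;  /* 256 pages */
        /* One dirty bit per page, rounded up to whole 64-bit words. */
        uint64_t bytes = (pages + 63) / 64 * 64 / BITS_PER_BYTE;   /* 32 */

        printf("pages=%" PRIu64 " bitmap_bytes=%" PRIu64 "\n", pages, bytes);
        return 0;
    }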
+static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
+                                 uint64_t size, ram_addr_t ram_addr);
+
 bool vfio_mig_active(void)
 {
     VFIOGroup *group;
@@ -339,6 +362,7 @@ bool vfio_mig_active(void)
 }
 
 static Error *multiple_devices_migration_blocker;
+static Error *giommu_migration_blocker;
 
 static unsigned int vfio_migratable_device_num(void)
 {
@@ -390,6 +414,64 @@ void vfio_unblock_multiple_devices_migration(void)
     multiple_devices_migration_blocker = NULL;
 }
 
+static bool vfio_viommu_preset(void)
+{
+    VFIOAddressSpace *space;
+
+    QLIST_FOREACH(space, &vfio_address_spaces, list) {
+        if (space->as != &address_space_memory) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+int vfio_block_giommu_migration(Error **errp)
+{
+    int ret;
+
+    if (giommu_migration_blocker ||
+        !vfio_viommu_preset()) {
+        return 0;
+    }
+
+    error_setg(&giommu_migration_blocker,
+               "Migration is currently not supported with vIOMMU enabled");
+    ret = migrate_add_blocker(giommu_migration_blocker, errp);
+    if (ret < 0) {
+        error_free(giommu_migration_blocker);
+        giommu_migration_blocker = NULL;
+    }
+
+    return ret;
+}
+
+void vfio_migration_finalize(void)
+{
+    if (!giommu_migration_blocker ||
+        vfio_viommu_preset()) {
+        return;
+    }
+
+    migrate_del_blocker(giommu_migration_blocker);
+    error_free(giommu_migration_blocker);
+    giommu_migration_blocker = NULL;
+}
+
+static void vfio_set_migration_error(int err)
+{
+    MigrationState *ms = migrate_get_current();
+
+    if (migration_is_setup_or_active(ms->state)) {
+        WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) {
+            if (ms->to_dst_file) {
+                qemu_file_set_error(ms->to_dst_file, err);
+            }
+        }
+    }
+}
+
 static bool vfio_devices_all_dirty_tracking(VFIOContainer *container)
 {
     VFIOGroup *group;
@@ -417,6 +499,22 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainer *container)
     return true;
 }
 
+static bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container)
+{
+    VFIOGroup *group;
+    VFIODevice *vbasedev;
+
+    QLIST_FOREACH(group, &container->group_list, container_next) {
+        QLIST_FOREACH(vbasedev, &group->device_list, next) {
+            if (!vbasedev->dirty_pages_supported) {
+                return false;
+            }
+        }
+    }
+
+    return true;
+}
+
 /*
  * Check if all VFIO devices are running and migration is active, which is
  * essentially equivalent to the migration being in pre-copy phase.
@@ -454,9 +552,14 @@ static int vfio_dma_unmap_bitmap(VFIOContainer *container,
 {
     struct vfio_iommu_type1_dma_unmap *unmap;
     struct vfio_bitmap *bitmap;
-    uint64_t pages = REAL_HOST_PAGE_ALIGN(size) / qemu_real_host_page_size();
+    VFIOBitmap vbmap;
     int ret;
 
+    ret = vfio_bitmap_alloc(&vbmap, size);
+    if (ret) {
+        return ret;
+    }
+
     unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap));
 
     unmap->argsz = sizeof(*unmap) + sizeof(*bitmap);
@@ -470,35 +573,28 @@ static int vfio_dma_unmap_bitmap(VFIOContainer *container,
      * qemu_real_host_page_size to mark those dirty. Hence set bitmap_pgsize
      * to qemu_real_host_page_size.
      */
-
     bitmap->pgsize = qemu_real_host_page_size();
-    bitmap->size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) /
-                   BITS_PER_BYTE;
+    bitmap->size = vbmap.size;
+    bitmap->data = (__u64 *)vbmap.bitmap;
 
-    if (bitmap->size > container->max_dirty_bitmap_size) {
-        error_report("UNMAP: Size of bitmap too big 0x%"PRIx64,
-                     (uint64_t)bitmap->size);
+    if (vbmap.size > container->max_dirty_bitmap_size) {
+        error_report("UNMAP: Size of bitmap too big 0x%"PRIx64, vbmap.size);
         ret = -E2BIG;
         goto unmap_exit;
     }
 
-    bitmap->data = g_try_malloc0(bitmap->size);
-    if (!bitmap->data) {
-        ret = -ENOMEM;
-        goto unmap_exit;
-    }
-
     ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap);
     if (!ret) {
-        cpu_physical_memory_set_dirty_lebitmap((unsigned long *)bitmap->data,
-                iotlb->translated_addr, pages);
+        cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap,
+                iotlb->translated_addr, vbmap.pages);
     } else {
         error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP : %m");
     }
 
-    g_free(bitmap->data);
 unmap_exit:
     g_free(unmap);
+    g_free(vbmap.bitmap);
+
     return ret;
 }
 
@@ -515,10 +611,16 @@ static int vfio_dma_unmap(VFIOContainer *container,
         .iova = iova,
         .size = size,
     };
+    bool need_dirty_sync = false;
+    int ret;
 
-    if (iotlb && container->dirty_pages_supported &&
-        vfio_devices_all_running_and_mig_active(container)) {
-        return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
+    if (iotlb && vfio_devices_all_running_and_mig_active(container)) {
+        if (!vfio_devices_all_device_dirty_tracking(container) &&
+            container->dirty_pages_supported) {
+            return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
+        }
+
+        need_dirty_sync = true;
     }
 
     while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
@@ -544,10 +646,12 @@ static int vfio_dma_unmap(VFIOContainer *container,
         return -errno;
     }
 
-    if (iotlb && vfio_devices_all_running_and_mig_active(container)) {
-        cpu_physical_memory_set_dirty_range(iotlb->translated_addr, size,
-                                            tcg_enabled() ? DIRTY_CLIENTS_ALL :
-                                            DIRTY_CLIENTS_NOCODE);
+    if (need_dirty_sync) {
+        ret = vfio_get_dirty_bitmap(container, iova, size,
+                                    iotlb->translated_addr);
+        if (ret) {
+            return ret;
+        }
     }
 
     return 0;
@@ -680,6 +784,7 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
     if (iotlb->target_as != &address_space_memory) {
         error_report("Wrong target AS \"%s\", only system memory is allowed",
                      iotlb->target_as->name ? iotlb->target_as->name : "none");
+        vfio_set_migration_error(-EINVAL);
         return;
     }
 
@@ -703,17 +808,18 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
                            read_only);
         if (ret) {
             error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
-                         "0x%"HWADDR_PRIx", %p) = %d (%m)",
+                         "0x%"HWADDR_PRIx", %p) = %d (%s)",
                          container, iova,
-                         iotlb->addr_mask + 1, vaddr, ret);
+                         iotlb->addr_mask + 1, vaddr, ret, strerror(-ret));
         }
     } else {
         ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1, iotlb);
         if (ret) {
             error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
-                         "0x%"HWADDR_PRIx") = %d (%m)",
+                         "0x%"HWADDR_PRIx") = %d (%s)",
                          container, iova,
-                         iotlb->addr_mask + 1, ret);
+                         iotlb->addr_mask + 1, ret, strerror(-ret));
+            vfio_set_migration_error(ret);
         }
     }
 out:
@@ -868,6 +974,22 @@ static void vfio_unregister_ram_discard_listener(VFIOContainer *container,
     g_free(vrdl);
 }
 
+static VFIOHostDMAWindow *vfio_find_hostwin(VFIOContainer *container,
+                                            hwaddr iova, hwaddr end)
+{
+    VFIOHostDMAWindow *hostwin;
+    bool hostwin_found = false;
+
+    QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
+        if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
+            hostwin_found = true;
+            break;
+        }
+    }
+
+    return hostwin_found ? hostwin : NULL;
+}
+
 static bool vfio_known_safe_misalignment(MemoryRegionSection *section)
 {
     MemoryRegion *mr = section->mr;
@@ -884,24 +1006,15 @@ static bool vfio_known_safe_misalignment(MemoryRegionSection *section)
     return true;
 }
 
-static void vfio_listener_region_add(MemoryListener *listener,
-                                     MemoryRegionSection *section)
+static bool vfio_listener_valid_section(MemoryRegionSection *section,
+                                        const char *name)
 {
-    VFIOContainer *container = container_of(listener, VFIOContainer, listener);
-    hwaddr iova, end;
-    Int128 llend, llsize;
-    void *vaddr;
-    int ret;
-    VFIOHostDMAWindow *hostwin;
-    bool hostwin_found;
-    Error *err = NULL;
-
     if (vfio_listener_skipped_section(section)) {
-        trace_vfio_listener_region_add_skip(
+        trace_vfio_listener_region_skip(name,
                 section->offset_within_address_space,
                 section->offset_within_address_space +
                 int128_get64(int128_sub(section->size, int128_one())));
-        return;
+        return false;
     }
 
     if (unlikely((section->offset_within_address_space &
@@ -916,15 +1029,53 @@ static void vfio_listener_region_add(MemoryListener *listener,
                          section->offset_within_region,
                          qemu_real_host_page_size());
         }
-        return;
+        return false;
     }
 
+    return true;
+}
+
+static bool vfio_get_section_iova_range(VFIOContainer *container,
+                                        MemoryRegionSection *section,
+                                        hwaddr *out_iova, hwaddr *out_end,
+                                        Int128 *out_llend)
+{
+    Int128 llend;
+    hwaddr iova;
+
     iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
     llend = int128_make64(section->offset_within_address_space);
     llend = int128_add(llend, section->size);
     llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask()));
 
     if (int128_ge(int128_make64(iova), llend)) {
+        return false;
+    }
+
+    *out_iova = iova;
+    *out_end = int128_get64(int128_sub(llend, int128_one()));
+    if (out_llend) {
+        *out_llend = llend;
+    }
+    return true;
+}
+
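A worked example of the clamping above, assuming 4 KiB host pages: a section at offset 0x1234 with size 0x3000 aligns up to iova = 0x2000 and down to llend = 0x4000, giving the inclusive end 0x3fff; a sub-page section collapses (iova >= llend) and is rejected. A standalone sketch of the same arithmetic (the real helper computes llend in Int128 so the sum cannot overflow at the top of the address space):

    #include <inttypes.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SIZE 4096ULL
    #define PAGE_MASK (~(PAGE_SIZE - 1))

    static bool section_iova_range(uint64_t start, uint64_t size,
                                   uint64_t *iova, uint64_t *end)
    {
        uint64_t first = (start + PAGE_SIZE - 1) & PAGE_MASK;  /* align up */
        uint64_t limit = (start + size) & PAGE_MASK;           /* align down */

        if (first >= limit) {
            return false;      /* nothing page-aligned left to map */
        }
        *iova = first;
        *end = limit - 1;      /* inclusive end, as in the helper above */
        return true;
    }

    int main(void)
    {
        uint64_t iova, end;

        if (section_iova_range(0x1234, 0x3000, &iova, &end)) {
            printf("iova=0x%" PRIx64 " end=0x%" PRIx64 "\n", iova, end);
        }
        return 0;
    }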
+static void vfio_listener_region_add(MemoryListener *listener,
+                                     MemoryRegionSection *section)
+{
+    VFIOContainer *container = container_of(listener, VFIOContainer, listener);
+    hwaddr iova, end;
+    Int128 llend, llsize;
+    void *vaddr;
+    int ret;
+    VFIOHostDMAWindow *hostwin;
+    Error *err = NULL;
+
+    if (!vfio_listener_valid_section(section, "region_add")) {
+        return;
+    }
+
+    if (!vfio_get_section_iova_range(container, section, &iova, &end, &llend)) {
         if (memory_region_is_ram_device(section->mr)) {
             trace_vfio_listener_region_add_no_dma_map(
                 memory_region_name(section->mr),
@@ -934,7 +1085,6 @@ static void vfio_listener_region_add(MemoryListener *listener,
         }
         return;
     }
-    end = int128_get64(int128_sub(llend, int128_one()));
 
     if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
         hwaddr pgsize = 0;
@@ -994,15 +1144,8 @@ static void vfio_listener_region_add(MemoryListener *listener,
 #endif
     }
 
-    hostwin_found = false;
-    QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
-        if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
-            hostwin_found = true;
-            break;
-        }
-    }
-
-    if (!hostwin_found) {
+    hostwin = vfio_find_hostwin(container, iova, end);
+    if (!hostwin) {
         error_setg(&err, "Container %p can't map guest IOVA region"
                    " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container, iova, end);
         goto fail;
@@ -1095,8 +1238,9 @@ static void vfio_listener_region_add(MemoryListener *listener,
                        vaddr, section->readonly);
     if (ret) {
         error_setg(&err, "vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
-                   "0x%"HWADDR_PRIx", %p) = %d (%m)",
-                   container, iova, int128_get64(llsize), vaddr, ret);
+                   "0x%"HWADDR_PRIx", %p) = %d (%s)",
+                   container, iova, int128_get64(llsize), vaddr, ret,
+                   strerror(-ret));
         if (memory_region_is_ram_device(section->mr)) {
             /* Allow unexpected mappings not to be fatal for RAM devices */
             error_report_err(err);
@@ -1140,26 +1284,7 @@ static void vfio_listener_region_del(MemoryListener *listener,
     int ret;
     bool try_unmap = true;
 
-    if (vfio_listener_skipped_section(section)) {
-        trace_vfio_listener_region_del_skip(
-                section->offset_within_address_space,
-                section->offset_within_address_space +
-                int128_get64(int128_sub(section->size, int128_one())));
-        return;
-    }
-
-    if (unlikely((section->offset_within_address_space &
-                  ~qemu_real_host_page_mask()) !=
-                 (section->offset_within_region & ~qemu_real_host_page_mask()))) {
-        if (!vfio_known_safe_misalignment(section)) {
-            error_report("%s received unaligned region %s iova=0x%"PRIx64
-                         " offset_within_region=0x%"PRIx64
-                         " qemu_real_host_page_size=0x%"PRIxPTR,
-                         __func__, memory_region_name(section->mr),
-                         section->offset_within_address_space,
-                         section->offset_within_region,
-                         qemu_real_host_page_size());
-        }
+    if (!vfio_listener_valid_section(section, "region_del")) {
         return;
     }
 
@@ -1186,15 +1311,9 @@ static void vfio_listener_region_del(MemoryListener *listener,
          */
     }
 
-    iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
-    llend = int128_make64(section->offset_within_address_space);
-    llend = int128_add(llend, section->size);
-    llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask()));
-
-    if (int128_ge(int128_make64(iova), llend)) {
+    if (!vfio_get_section_iova_range(container, section, &iova, &end, &llend)) {
         return;
     }
-    end = int128_get64(int128_sub(llend, int128_one()));
 
     llsize = int128_sub(llend, int128_make64(iova));
 
@@ -1203,15 +1322,9 @@ static void vfio_listener_region_del(MemoryListener *listener,
     if (memory_region_is_ram_device(section->mr)) {
         hwaddr pgmask;
         VFIOHostDMAWindow *hostwin;
-        bool hostwin_found = false;
 
-        QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
-            if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
-                hostwin_found = true;
-                break;
-            }
-        }
-        assert(hostwin_found); /* or region_add() would have failed */
+        hostwin = vfio_find_hostwin(container, iova, end);
+        assert(hostwin); /* or region_add() would have failed */
 
         pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
         try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask));
@@ -1228,16 +1341,18 @@ static void vfio_listener_region_del(MemoryListener *listener,
             ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL);
             if (ret) {
                 error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
-                             "0x%"HWADDR_PRIx") = %d (%m)",
-                             container, iova, int128_get64(llsize), ret);
+                             "0x%"HWADDR_PRIx") = %d (%s)",
+                             container, iova, int128_get64(llsize), ret,
+                             strerror(-ret));
             }
             iova += int128_get64(llsize);
         }
         ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL);
         if (ret) {
             error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
-                         "0x%"HWADDR_PRIx") = %d (%m)",
-                         container, iova, int128_get64(llsize), ret);
+                         "0x%"HWADDR_PRIx") = %d (%s)",
+                         container, iova, int128_get64(llsize), ret,
+                         strerror(-ret));
         }
     }
 
@@ -1256,7 +1371,7 @@ static void vfio_listener_region_del(MemoryListener *listener,
     }
 }
 
-static void vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
+static int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
 {
     int ret;
     struct vfio_iommu_type1_dirty_bitmap dirty = {
@@ -1264,7 +1379,7 @@ static void vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
     };
 
     if (!container->dirty_pages_supported) {
-        return;
+        return 0;
     }
 
     if (start) {
@@ -1275,40 +1390,327 @@ static void vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
 
     ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty);
     if (ret) {
+        ret = -errno;
         error_report("Failed to set dirty tracking flag 0x%x errno: %d",
                      dirty.flags, errno);
     }
+
+    return ret;
+}
+
+typedef struct VFIODirtyRanges {
+    hwaddr min32;
+    hwaddr max32;
+    hwaddr min64;
+    hwaddr max64;
+} VFIODirtyRanges;
+
+typedef struct VFIODirtyRangesListener {
+    VFIOContainer *container;
+    VFIODirtyRanges ranges;
+    MemoryListener listener;
+} VFIODirtyRangesListener;
+
+static void vfio_dirty_tracking_update(MemoryListener *listener,
+                                       MemoryRegionSection *section)
+{
+    VFIODirtyRangesListener *dirty = container_of(listener,
+                                                  VFIODirtyRangesListener,
+                                                  listener);
+    VFIODirtyRanges *range = &dirty->ranges;
+    hwaddr iova, end, *min, *max;
+
+    if (!vfio_listener_valid_section(section, "tracking_update") ||
+        !vfio_get_section_iova_range(dirty->container, section,
+                                     &iova, &end, NULL)) {
+        return;
+    }
+
+    /*
+     * The address space passed to the dirty tracker is reduced to two
+     * ranges: one covering 32-bit DMA addresses and another covering 64-bit
+     * DMA addresses. The underlying dirty-page reports then query a
+     * sub-interval of each of these ranges.
+     *
+     * The dual ranges deal with known cases of big holes in the address
+     * space, like the x86 AMD 1T hole. The alternative would be an IOVATree,
+     * but that has a much bigger runtime overhead and unnecessary complexity.
+     */
+    min = (end <= UINT32_MAX) ? &range->min32 : &range->min64;
+    max = (end <= UINT32_MAX) ? &range->max32 : &range->max64;
+
+    if (*min > iova) {
+        *min = iova;
+    }
+    if (*max < end) {
+        *max = end;
+    }
+
+    trace_vfio_device_dirty_tracking_update(iova, end, *min, *max);
+}
+
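To make the dual-range reduction concrete: every section whose end fits in 32 bits feeds the first min/max pair and everything else feeds the second, so guest RAM at [0x0, 0x80000000) plus [0x100000000, 0x200000000) reduces to 32:[0x0 - 0x7fffffff] and 64:[0x100000000 - 0x1ffffffff], and the hole in between is never tracked. A minimal standalone sketch of the accumulation:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef struct {
        uint64_t min32, max32, min64, max64;
    } DirtyRanges;

    static void update(DirtyRanges *r, uint64_t iova, uint64_t end)
    {
        uint64_t *min = end <= UINT32_MAX ? &r->min32 : &r->min64;
        uint64_t *max = end <= UINT32_MAX ? &r->max32 : &r->max64;

        if (*min > iova) {
            *min = iova;
        }
        if (*max < end) {
            *max = end;
        }
    }

    int main(void)
    {
        DirtyRanges r = { UINT32_MAX, 0, UINT64_MAX, 0 };

        update(&r, 0x0, 0x7fffffffULL);              /* low RAM */
        update(&r, 0x100000000ULL, 0x1ffffffffULL);  /* high RAM */

        printf("32:[0x%" PRIx64 " - 0x%" PRIx64 "] 64:[0x%" PRIx64
               " - 0x%" PRIx64 "]\n", r.min32, r.max32, r.min64, r.max64);
        return 0;
    }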
+static const MemoryListener vfio_dirty_tracking_listener = {
+    .name = "vfio-tracking",
+    .region_add = vfio_dirty_tracking_update,
+};
+
+static void vfio_dirty_tracking_init(VFIOContainer *container,
+                                     VFIODirtyRanges *ranges)
+{
+    VFIODirtyRangesListener dirty;
+
+    memset(&dirty, 0, sizeof(dirty));
+    dirty.ranges.min32 = UINT32_MAX;
+    dirty.ranges.min64 = UINT64_MAX;
+    dirty.listener = vfio_dirty_tracking_listener;
+    dirty.container = container;
+
+    memory_listener_register(&dirty.listener,
+                             container->space->as);
+
+    *ranges = dirty.ranges;
+
+    /*
+     * The memory listener is synchronous and only used to calculate the
+     * ranges for dirty tracking. Unregister it once we are done, as we are
+     * not interested in any follow-up updates.
+     */
+    memory_listener_unregister(&dirty.listener);
+}
+
+static void vfio_devices_dma_logging_stop(VFIOContainer *container)
+{
+    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature),
+                              sizeof(uint64_t))] = {};
+    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
+    VFIODevice *vbasedev;
+    VFIOGroup *group;
+
+    feature->argsz = sizeof(buf);
+    feature->flags = VFIO_DEVICE_FEATURE_SET |
+                     VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP;
+
+    QLIST_FOREACH(group, &container->group_list, container_next) {
+        QLIST_FOREACH(vbasedev, &group->device_list, next) {
+            if (!vbasedev->dirty_tracking) {
+                continue;
+            }
+
+            if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
+                warn_report("%s: Failed to stop DMA logging, err %d (%s)",
+                             vbasedev->name, -errno, strerror(errno));
+            }
+            vbasedev->dirty_tracking = false;
+        }
+    }
+}
+
+static struct vfio_device_feature *
+vfio_device_feature_dma_logging_start_create(VFIOContainer *container,
+                                             VFIODirtyRanges *tracking)
+{
+    struct vfio_device_feature *feature;
+    size_t feature_size;
+    struct vfio_device_feature_dma_logging_control *control;
+    struct vfio_device_feature_dma_logging_range *ranges;
+
+    feature_size = sizeof(struct vfio_device_feature) +
+                   sizeof(struct vfio_device_feature_dma_logging_control);
+    feature = g_try_malloc0(feature_size);
+    if (!feature) {
+        errno = ENOMEM;
+        return NULL;
+    }
+    feature->argsz = feature_size;
+    feature->flags = VFIO_DEVICE_FEATURE_SET |
+                     VFIO_DEVICE_FEATURE_DMA_LOGGING_START;
+
+    control = (struct vfio_device_feature_dma_logging_control *)feature->data;
+    control->page_size = qemu_real_host_page_size();
+
+    /*
+     * The DMA logging uAPI guarantees support for at least as many ranges
+     * as fit into a single host kernel base page.
+     */
+    control->num_ranges = !!tracking->max32 + !!tracking->max64;
+    ranges = g_try_new0(struct vfio_device_feature_dma_logging_range,
+                        control->num_ranges);
+    if (!ranges) {
+        g_free(feature);
+        errno = ENOMEM;
+
+        return NULL;
+    }
+
+    control->ranges = (__u64)(uintptr_t)ranges;
+    if (tracking->max32) {
+        ranges->iova = tracking->min32;
+        ranges->length = (tracking->max32 - tracking->min32) + 1;
+        ranges++;
+    }
+    if (tracking->max64) {
+        ranges->iova = tracking->min64;
+        ranges->length = (tracking->max64 - tracking->min64) + 1;
+    }
+
+    trace_vfio_device_dirty_tracking_start(control->num_ranges,
+                                           tracking->min32, tracking->max32,
+                                           tracking->min64, tracking->max64);
+
+    return feature;
+}
+
+static void vfio_device_feature_dma_logging_start_destroy(
+    struct vfio_device_feature *feature)
+{
+    struct vfio_device_feature_dma_logging_control *control =
+        (struct vfio_device_feature_dma_logging_control *)feature->data;
+    struct vfio_device_feature_dma_logging_range *ranges =
+        (struct vfio_device_feature_dma_logging_range *)(uintptr_t)control->ranges;
+
+    g_free(ranges);
+    g_free(feature);
+}
+
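The start payload built above is two allocations stitched together: the feature header plus logging-control struct, and a separate ranges array whose address travels in control->ranges as a plain 64-bit integer for the kernel to treat as a user pointer. A minimal standalone sketch of that wiring, with stand-in struct definitions (field names follow the hunk above, not the exact uAPI layout):

    #include <stdint.h>
    #include <stdlib.h>

    struct logging_range   { uint64_t iova, length; };
    struct logging_control { uint64_t page_size, num_ranges, ranges; };

    int main(void)
    {
        struct logging_control *control = calloc(1, sizeof(*control));
        struct logging_range *r;
        uint64_t min32 = 0, max32 = 0x7fffffff;
        uint64_t min64 = 0, max64 = 0;     /* no 64-bit range this time */

        control->page_size = 4096;
        control->num_ranges = !!max32 + !!max64;        /* 1 here */
        r = calloc(control->num_ranges, sizeof(*r));
        control->ranges = (uint64_t)(uintptr_t)r;       /* pointer as u64 */

        if (max32) {
            r->iova = min32;
            r->length = max32 - min32 + 1;
            r++;
        }
        if (max64) {
            r->iova = min64;
            r->length = max64 - min64 + 1;
        }

        free((void *)(uintptr_t)control->ranges);
        free(control);
        return 0;
    }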
+static int vfio_devices_dma_logging_start(VFIOContainer *container)
+{
+    struct vfio_device_feature *feature;
+    VFIODirtyRanges ranges;
+    VFIODevice *vbasedev;
+    VFIOGroup *group;
+    int ret = 0;
+
+    vfio_dirty_tracking_init(container, &ranges);
+    feature = vfio_device_feature_dma_logging_start_create(container,
+                                                           &ranges);
+    if (!feature) {
+        return -errno;
+    }
+
+    QLIST_FOREACH(group, &container->group_list, container_next) {
+        QLIST_FOREACH(vbasedev, &group->device_list, next) {
+            if (vbasedev->dirty_tracking) {
+                continue;
+            }
+
+            ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature);
+            if (ret) {
+                ret = -errno;
+                error_report("%s: Failed to start DMA logging, err %d (%s)",
+                             vbasedev->name, ret, strerror(errno));
+                goto out;
+            }
+            vbasedev->dirty_tracking = true;
+        }
+    }
+
+out:
+    if (ret) {
+        vfio_devices_dma_logging_stop(container);
+    }
+
+    vfio_device_feature_dma_logging_start_destroy(feature);
+
+    return ret;
 }
 
 static void vfio_listener_log_global_start(MemoryListener *listener)
 {
     VFIOContainer *container = container_of(listener, VFIOContainer, listener);
+    int ret;
+
+    if (vfio_devices_all_device_dirty_tracking(container)) {
+        ret = vfio_devices_dma_logging_start(container);
+    } else {
+        ret = vfio_set_dirty_page_tracking(container, true);
+    }
 
-    vfio_set_dirty_page_tracking(container, true);
+    if (ret) {
+        error_report("vfio: Could not start dirty page tracking, err: %d (%s)",
+                     ret, strerror(-ret));
+        vfio_set_migration_error(ret);
+    }
 }
 
 static void vfio_listener_log_global_stop(MemoryListener *listener)
 {
     VFIOContainer *container = container_of(listener, VFIOContainer, listener);
+    int ret = 0;
+
+    if (vfio_devices_all_device_dirty_tracking(container)) {
+        vfio_devices_dma_logging_stop(container);
+    } else {
+        ret = vfio_set_dirty_page_tracking(container, false);
+    }
 
-    vfio_set_dirty_page_tracking(container, false);
+    if (ret) {
+        error_report("vfio: Could not stop dirty page tracking, err: %d (%s)",
+                     ret, strerror(-ret));
+        vfio_set_migration_error(ret);
+    }
 }
 
-static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
-                                 uint64_t size, ram_addr_t ram_addr)
+static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova,
+                                          hwaddr size, void *bitmap)
 {
-    struct vfio_iommu_type1_dirty_bitmap *dbitmap;
-    struct vfio_iommu_type1_dirty_bitmap_get *range;
-    uint64_t pages;
+    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
+                        sizeof(struct vfio_device_feature_dma_logging_report),
+                        sizeof(__u64))] = {};
+    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
+    struct vfio_device_feature_dma_logging_report *report =
+        (struct vfio_device_feature_dma_logging_report *)feature->data;
+
+    report->iova = iova;
+    report->length = size;
+    report->page_size = qemu_real_host_page_size();
+    report->bitmap = (__u64)(uintptr_t)bitmap;
+
+    feature->argsz = sizeof(buf);
+    feature->flags = VFIO_DEVICE_FEATURE_GET |
+                     VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT;
+
+    if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
+        return -errno;
+    }
+
+    return 0;
+}
+
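The report path needs no heap allocation at all: the variable-sized feature struct is carved out of an on-stack uint64_t array, whose element type guarantees the flexible data[] payload starts 8-byte aligned. A standalone sketch of the pattern, with stand-in structs:

    #include <stdint.h>
    #include <stdio.h>

    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    struct feature {           /* stand-in for struct vfio_device_feature */
        uint32_t argsz;
        uint32_t flags;
        uint8_t data[];        /* payload follows the fixed header */
    };

    struct report {            /* stand-in for the dma_logging_report payload */
        uint64_t iova, length, page_size, bitmap;
    };

    int main(void)
    {
        /* Header + payload rounded up to whole uint64_t words; the array's
         * alignment makes data[] start 8-byte aligned. */
        uint64_t buf[DIV_ROUND_UP(sizeof(struct feature) +
                                  sizeof(struct report),
                                  sizeof(uint64_t))] = {};
        struct feature *f = (struct feature *)buf;
        struct report *rep = (struct report *)f->data;

        f->argsz = sizeof(buf);
        rep->iova = 0x100000;
        rep->length = 0x200000;
        rep->page_size = 4096;

        printf("argsz=%u (%zu words)\n", f->argsz,
               sizeof(buf) / sizeof(buf[0]));
        return 0;
    }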
+static int vfio_devices_query_dirty_bitmap(VFIOContainer *container,
+                                           VFIOBitmap *vbmap, hwaddr iova,
+                                           hwaddr size)
+{
+    VFIODevice *vbasedev;
+    VFIOGroup *group;
     int ret;
 
-    if (!container->dirty_pages_supported) {
-        cpu_physical_memory_set_dirty_range(ram_addr, size,
-                                            tcg_enabled() ? DIRTY_CLIENTS_ALL :
-                                            DIRTY_CLIENTS_NOCODE);
-        return 0;
+    QLIST_FOREACH(group, &container->group_list, container_next) {
+        QLIST_FOREACH(vbasedev, &group->device_list, next) {
+            ret = vfio_device_dma_logging_report(vbasedev, iova, size,
+                                                 vbmap->bitmap);
+            if (ret) {
+                error_report("%s: Failed to get DMA logging report, iova: "
+                             "0x%" HWADDR_PRIx ", size: 0x%" HWADDR_PRIx
+                             ", err: %d (%s)",
+                             vbasedev->name, iova, size, ret, strerror(-ret));
+
+                return ret;
+            }
+        }
     }
 
+    return 0;
+}
+
+static int vfio_query_dirty_bitmap(VFIOContainer *container, VFIOBitmap *vbmap,
+                                   hwaddr iova, hwaddr size)
+{
+    struct vfio_iommu_type1_dirty_bitmap *dbitmap;
+    struct vfio_iommu_type1_dirty_bitmap_get *range;
+    int ret;
+
     dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range));
 
     dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range);
@@ -1323,36 +1725,63 @@ static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
      * to qemu_real_host_page_size.
      */
     range->bitmap.pgsize = qemu_real_host_page_size();
-
-    pages = REAL_HOST_PAGE_ALIGN(range->size) / qemu_real_host_page_size();
-    range->bitmap.size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) /
-                                         BITS_PER_BYTE;
-    range->bitmap.data = g_try_malloc0(range->bitmap.size);
-    if (!range->bitmap.data) {
-        ret = -ENOMEM;
-        goto err_out;
-    }
+    range->bitmap.size = vbmap->size;
+    range->bitmap.data = (__u64 *)vbmap->bitmap;
 
     ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap);
     if (ret) {
+        ret = -errno;
         error_report("Failed to get dirty bitmap for iova: 0x%"PRIx64
                 " size: 0x%"PRIx64" err: %d", (uint64_t)range->iova,
                 (uint64_t)range->size, errno);
-        goto err_out;
     }
 
-    cpu_physical_memory_set_dirty_lebitmap((unsigned long *)range->bitmap.data,
-                                            ram_addr, pages);
-
-    trace_vfio_get_dirty_bitmap(container->fd, range->iova, range->size,
-                                range->bitmap.size, ram_addr);
-err_out:
-    g_free(range->bitmap.data);
     g_free(dbitmap);
 
     return ret;
 }
 
+static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
+                                 uint64_t size, ram_addr_t ram_addr)
+{
+    bool all_device_dirty_tracking =
+        vfio_devices_all_device_dirty_tracking(container);
+    VFIOBitmap vbmap;
+    int ret;
+
+    if (!container->dirty_pages_supported && !all_device_dirty_tracking) {
+        cpu_physical_memory_set_dirty_range(ram_addr, size,
+                                            tcg_enabled() ? DIRTY_CLIENTS_ALL :
+                                            DIRTY_CLIENTS_NOCODE);
+        return 0;
+    }
+
+    ret = vfio_bitmap_alloc(&vbmap, size);
+    if (ret) {
+        return ret;
+    }
+
+    if (all_device_dirty_tracking) {
+        ret = vfio_devices_query_dirty_bitmap(container, &vbmap, iova, size);
+    } else {
+        ret = vfio_query_dirty_bitmap(container, &vbmap, iova, size);
+    }
+
+    if (ret) {
+        goto out;
+    }
+
+    cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap, ram_addr,
+                                           vbmap.pages);
+
+    trace_vfio_get_dirty_bitmap(container->fd, iova, size, vbmap.size,
+                                ram_addr);
+out:
+    g_free(vbmap.bitmap);
+
+    return ret;
+}
+
 typedef struct {
     IOMMUNotifier n;
     VFIOGuestIOMMU *giommu;
@@ -1366,29 +1795,33 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
     VFIOContainer *container = giommu->container;
     hwaddr iova = iotlb->iova + giommu->iommu_offset;
     ram_addr_t translated_addr;
+    int ret = -EINVAL;
 
     trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask);
 
     if (iotlb->target_as != &address_space_memory) {
         error_report("Wrong target AS \"%s\", only system memory is allowed",
                      iotlb->target_as->name ? iotlb->target_as->name : "none");
-        return;
+        goto out;
     }
 
     rcu_read_lock();
     if (vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL)) {
-        int ret;
-
         ret = vfio_get_dirty_bitmap(container, iova, iotlb->addr_mask + 1,
                                     translated_addr);
         if (ret) {
             error_report("vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", "
-                         "0x%"HWADDR_PRIx") = %d (%m)",
-                         container, iova,
-                         iotlb->addr_mask + 1, ret);
+                         "0x%"HWADDR_PRIx") = %d (%s)",
+                         container, iova, iotlb->addr_mask + 1, ret,
+                         strerror(-ret));
         }
     }
     rcu_read_unlock();
+
+out:
+    if (ret) {
+        vfio_set_migration_error(ret);
+    }
 }
 
 static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section,
@@ -1481,13 +1914,19 @@ static void vfio_listener_log_sync(MemoryListener *listener,
         MemoryRegionSection *section)
 {
     VFIOContainer *container = container_of(listener, VFIOContainer, listener);
+    int ret;
 
     if (vfio_listener_skipped_section(section)) {
         return;
     }
 
     if (vfio_devices_all_dirty_tracking(container)) {
-        vfio_sync_dirty_bitmap(container, section);
+        ret = vfio_sync_dirty_bitmap(container, section);
+        if (ret) {
+            error_report("vfio: Failed to sync dirty bitmap, err: %d (%s)", ret,
+                         strerror(-ret));
+            vfio_set_migration_error(ret);
+        }
     }
 }
 
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index a2c3d9bade..1a1a8659c8 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -521,7 +521,7 @@ static void vfio_migration_state_notifier(Notifier *notifier, void *data)
     }
 }
 
-static void vfio_migration_exit(VFIODevice *vbasedev)
+static void vfio_migration_free(VFIODevice *vbasedev)
 {
     g_free(vbasedev->migration);
     vbasedev->migration = NULL;
@@ -555,6 +555,19 @@ static int vfio_migration_query_flags(VFIODevice *vbasedev, uint64_t *mig_flags)
     return 0;
 }
 
+static bool vfio_dma_logging_supported(VFIODevice *vbasedev)
+{
+    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature),
+                              sizeof(uint64_t))] = {};
+    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
+
+    feature->argsz = sizeof(buf);
+    feature->flags = VFIO_DEVICE_FEATURE_PROBE |
+                     VFIO_DEVICE_FEATURE_DMA_LOGGING_START;
+
+    return !ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature);
+}
+
 static int vfio_migration_init(VFIODevice *vbasedev)
 {
     int ret;
@@ -589,6 +602,8 @@ static int vfio_migration_init(VFIODevice *vbasedev)
     migration->device_state = VFIO_DEVICE_STATE_RUNNING;
     migration->data_fd = -1;
 
+    vbasedev->dirty_pages_supported = vfio_dma_logging_supported(vbasedev);
+
     oid = vmstate_if_get_id(VMSTATE_IF(DEVICE(obj)));
     if (oid) {
         path = g_strdup_printf("%s/vfio", oid);
@@ -616,7 +631,7 @@ int64_t vfio_mig_bytes_transferred(void)
     return bytes_transferred;
 }
 
-int vfio_migration_probe(VFIODevice *vbasedev, Error **errp)
+int vfio_migration_realize(VFIODevice *vbasedev, Error **errp)
 {
     int ret = -ENOTSUP;
 
@@ -634,6 +649,11 @@ int vfio_migration_probe(VFIODevice *vbasedev, Error **errp)
         return ret;
     }
 
+    ret = vfio_block_giommu_migration(errp);
+    if (ret) {
+        return ret;
+    }
+
     trace_vfio_migration_probe(vbasedev->name);
     return 0;
 
@@ -649,7 +669,7 @@ add_blocker:
     return ret;
 }
 
-void vfio_migration_finalize(VFIODevice *vbasedev)
+void vfio_migration_exit(VFIODevice *vbasedev)
 {
     if (vbasedev->migration) {
         VFIOMigration *migration = vbasedev->migration;
@@ -657,7 +677,7 @@ void vfio_migration_finalize(VFIODevice *vbasedev)
         remove_migration_state_change_notifier(&migration->migration_state);
         qemu_del_vm_change_state_handler(migration->vm_state);
         unregister_savevm(VMSTATE_IF(vbasedev->dev), "vfio", vbasedev);
-        vfio_migration_exit(vbasedev);
+        vfio_migration_free(vbasedev);
         vfio_unblock_multiple_devices_migration();
     }
 
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 939dcc3d4a..ec9a854361 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3145,7 +3145,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
     }
 
     if (!pdev->failover_pair_id) {
-        ret = vfio_migration_probe(vbasedev, errp);
+        ret = vfio_migration_realize(vbasedev, errp);
         if (ret) {
             error_report("%s: Migration disabled", vbasedev->name);
         }
@@ -3185,6 +3185,7 @@ static void vfio_instance_finalize(Object *obj)
      */
     vfio_put_device(vdev);
     vfio_put_group(group);
+    vfio_migration_finalize();
 }
 
 static void vfio_exitfn(PCIDevice *pdev)
@@ -3203,7 +3204,7 @@ static void vfio_exitfn(PCIDevice *pdev)
     }
     vfio_teardown_msi(vdev);
     vfio_bars_exit(vdev);
-    vfio_migration_finalize(&vdev->vbasedev);
+    vfio_migration_exit(&vdev->vbasedev);
 }
 
 static void vfio_pci_reset(DeviceState *dev)
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 669d9fe07c..646e42fd27 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -96,14 +96,15 @@ vfio_pci_igd_lpc_bridge_enabled(const char *name) "%s"
 vfio_region_write(const char *name, int index, uint64_t addr, uint64_t data, unsigned size) " (%s:region%d+0x%"PRIx64", 0x%"PRIx64 ", %d)"
 vfio_region_read(char *name, int index, uint64_t addr, unsigned size, uint64_t data) " (%s:region%d+0x%"PRIx64", %d) = 0x%"PRIx64
 vfio_iommu_map_notify(const char *op, uint64_t iova_start, uint64_t iova_end) "iommu %s @ 0x%"PRIx64" - 0x%"PRIx64
-vfio_listener_region_add_skip(uint64_t start, uint64_t end) "SKIPPING region_add 0x%"PRIx64" - 0x%"PRIx64
+vfio_listener_region_skip(const char *name, uint64_t start, uint64_t end) "SKIPPING %s 0x%"PRIx64" - 0x%"PRIx64
 vfio_spapr_group_attach(int groupfd, int tablefd) "Attached groupfd %d to liobn fd %d"
 vfio_listener_region_add_iommu(uint64_t start, uint64_t end) "region_add [iommu] 0x%"PRIx64" - 0x%"PRIx64
 vfio_listener_region_add_ram(uint64_t iova_start, uint64_t iova_end, void *vaddr) "region_add [ram] 0x%"PRIx64" - 0x%"PRIx64" [%p]"
 vfio_known_safe_misalignment(const char *name, uint64_t iova, uint64_t offset_within_region, uintptr_t page_size) "Region \"%s\" iova=0x%"PRIx64" offset_within_region=0x%"PRIx64" qemu_real_host_page_size=0x%"PRIxPTR
 vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova, uint64_t size, uint64_t page_size) "Region \"%s\" 0x%"PRIx64" size=0x%"PRIx64" is not aligned to 0x%"PRIx64" and cannot be mapped for DMA"
-vfio_listener_region_del_skip(uint64_t start, uint64_t end) "SKIPPING region_del 0x%"PRIx64" - 0x%"PRIx64
 vfio_listener_region_del(uint64_t start, uint64_t end) "region_del 0x%"PRIx64" - 0x%"PRIx64
+vfio_device_dirty_tracking_update(uint64_t start, uint64_t end, uint64_t min, uint64_t max) "section 0x%"PRIx64" - 0x%"PRIx64" -> update [0x%"PRIx64" - 0x%"PRIx64"]"
+vfio_device_dirty_tracking_start(int nr_ranges, uint64_t min32, uint64_t max32, uint64_t min64, uint64_t max64) "nr_ranges %d 32:[0x%"PRIx64" - 0x%"PRIx64"], 64:[0x%"PRIx64" - 0x%"PRIx64"]"
 vfio_disconnect_container(int fd) "close container->fd=%d"
 vfio_put_group(int fd) "close group->fd=%d"
 vfio_get_device(const char * name, unsigned int flags, unsigned int num_regions, unsigned int num_irqs) "Device %s flags: %u, regions: %u, irqs: %u"
@@ -117,7 +118,7 @@ vfio_region_mmaps_set_enabled(const char *name, bool enabled) "Region %s mmaps e
 vfio_region_unmap(const char *name, unsigned long offset, unsigned long end) "Region %s unmap [0x%lx - 0x%lx]"
 vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Device %s region %d: %d sparse mmap entries"
 vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]"
-vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%0x8"
+vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%08x"
 vfio_dma_unmap_overflow_workaround(void) ""
 vfio_get_dirty_bitmap(int fd, uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start) "container fd=%d, iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64
 vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64
diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
index a87c5f39a2..8f8d05cf9b 100644
--- a/hw/virtio/trace-events
+++ b/hw/virtio/trace-events
@@ -50,6 +50,7 @@ vhost_vdpa_set_vring_ready(void *dev) "dev: %p"
 vhost_vdpa_dump_config(void *dev, const char *line) "dev: %p %s"
 vhost_vdpa_set_config(void *dev, uint32_t offset, uint32_t size, uint32_t flags) "dev: %p offset: %"PRIu32" size: %"PRIu32" flags: 0x%"PRIx32
 vhost_vdpa_get_config(void *dev, void *config, uint32_t config_len) "dev: %p config: %p config_len: %"PRIu32
+vhost_vdpa_suspend(void *dev) "dev: %p"
 vhost_vdpa_dev_start(void *dev, bool started) "dev: %p started: %d"
 vhost_vdpa_set_log_base(void *dev, uint64_t base, unsigned long long size, int refcnt, int fd, void *log) "dev: %p base: 0x%"PRIx64" size: %llu refcnt: %d fd: %d log: %p"
 vhost_vdpa_set_vring_addr(void *dev, unsigned int index, unsigned int flags, uint64_t desc_user_addr, uint64_t used_user_addr, uint64_t avail_user_addr, uint64_t log_guest_addr) "dev: %p index: %u flags: 0x%x desc_user_addr: 0x%"PRIx64" used_user_addr: 0x%"PRIx64" avail_user_addr: 0x%"PRIx64" log_guest_addr: 0x%"PRIx64
diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
index 515ccf870d..8361e70d1b 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -694,13 +694,17 @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)
         g_autofree VirtQueueElement *elem = NULL;
         elem = g_steal_pointer(&svq->desc_state[i].elem);
         if (elem) {
-            virtqueue_detach_element(svq->vq, elem, 0);
+            /*
+             * TODO: This is ok for networking, but other kinds of devices
+             * might have problems with just unpopping these elements.
+             */
+            virtqueue_unpop(svq->vq, elem, 0);
         }
     }
 
     next_avail_elem = g_steal_pointer(&svq->next_guest_avail_elem);
     if (next_avail_elem) {
-        virtqueue_detach_element(svq->vq, next_avail_elem, 0);
+        virtqueue_unpop(svq->vq, next_avail_elem, 0);
     }
     svq->vq = NULL;
     g_free(svq->desc_next);
diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index 8968541514..e5285df4ba 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -2031,8 +2031,8 @@ static int vhost_user_backend_init(struct vhost_dev *dev, void *opaque,
         } else {
             if (virtio_has_feature(protocol_features,
                                    VHOST_USER_PROTOCOL_F_CONFIG)) {
-                warn_reportf_err(*errp, "vhost-user backend supports "
-                                 "VHOST_USER_PROTOCOL_F_CONFIG but QEMU does not.");
+                warn_report("vhost-user backend supports "
+                            "VHOST_USER_PROTOCOL_F_CONFIG but QEMU does not.");
                 protocol_features &= ~(1ULL << VHOST_USER_PROTOCOL_F_CONFIG);
             }
         }
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index df3a1e92ac..bc6bad23d5 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -431,16 +431,6 @@ static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp)
     trace_vhost_vdpa_init(dev, opaque);
     int ret;
 
-    /*
-     * Similar to VFIO, we end up pinning all guest memory and have to
-     * disable discarding of RAM.
-     */
-    ret = ram_block_discard_disable(true);
-    if (ret) {
-        error_report("Cannot set discarding of RAM broken");
-        return ret;
-    }
-
     v = opaque;
     v->dev = dev;
     dev->opaque =  opaque ;
@@ -448,10 +438,36 @@ static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp)
     v->msg_type = VHOST_IOTLB_MSG_V2;
     vhost_vdpa_init_svq(dev, v);
 
+    error_propagate(&dev->migration_blocker, v->migration_blocker);
     if (!vhost_vdpa_first_dev(dev)) {
         return 0;
     }
 
+    /*
+     * If dev->shadow_vqs_enabled is set at initialization, the device was
+     * started with x-svq=on, so don't block migration.
+     */
+    if (dev->migration_blocker == NULL && !v->shadow_vqs_enabled) {
+        /* We don't have dev->features yet */
+        uint64_t features;
+        ret = vhost_vdpa_get_dev_features(dev, &features);
+        if (unlikely(ret)) {
+            error_setg_errno(errp, -ret, "Could not get device features");
+            return ret;
+        }
+        vhost_svq_valid_features(features, &dev->migration_blocker);
+    }
+
+    /*
+     * Similar to VFIO, we end up pinning all guest memory and have to
+     * disable discarding of RAM.
+     */
+    ret = ram_block_discard_disable(true);
+    if (ret) {
+        error_report("Cannot disable discarding of RAM");
+        return ret;
+    }
+
     vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
                                VIRTIO_CONFIG_S_DRIVER);
 
@@ -577,12 +593,15 @@ static int vhost_vdpa_cleanup(struct vhost_dev *dev)
     assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
     v = dev->opaque;
     trace_vhost_vdpa_cleanup(dev, v);
+    if (vhost_vdpa_first_dev(dev)) {
+        ram_block_discard_disable(false);
+    }
+
     vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
     memory_listener_unregister(&v->listener);
     vhost_vdpa_svq_cleanup(dev);
 
     dev->opaque = NULL;
-    ram_block_discard_disable(false);
 
     return 0;
 }
@@ -659,7 +678,8 @@ static int vhost_vdpa_set_backend_cap(struct vhost_dev *dev)
     uint64_t features;
     uint64_t f = 0x1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2 |
         0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH |
-        0x1ULL << VHOST_BACKEND_F_IOTLB_ASID;
+        0x1ULL << VHOST_BACKEND_F_IOTLB_ASID |
+        0x1ULL << VHOST_BACKEND_F_SUSPEND;
     int r;
 
     if (vhost_vdpa_call(dev, VHOST_GET_BACKEND_FEATURES, &features)) {
@@ -691,11 +711,13 @@ static int vhost_vdpa_get_device_id(struct vhost_dev *dev,
 
 static int vhost_vdpa_reset_device(struct vhost_dev *dev)
 {
+    struct vhost_vdpa *v = dev->opaque;
     int ret;
     uint8_t status = 0;
 
     ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status);
     trace_vhost_vdpa_reset_device(dev, status);
+    v->suspended = false;
     return ret;
 }
 
@@ -1094,6 +1116,29 @@ static void vhost_vdpa_svqs_stop(struct vhost_dev *dev)
     }
 }
 
+static void vhost_vdpa_suspend(struct vhost_dev *dev)
+{
+    struct vhost_vdpa *v = dev->opaque;
+    int r;
+
+    if (!vhost_vdpa_first_dev(dev)) {
+        return;
+    }
+
+    if (dev->backend_cap & BIT_ULL(VHOST_BACKEND_F_SUSPEND)) {
+        trace_vhost_vdpa_suspend(dev);
+        r = ioctl(v->device_fd, VHOST_VDPA_SUSPEND);
+        if (unlikely(r)) {
+            error_report("Cannot suspend: %s(%d)", g_strerror(errno), errno);
+        } else {
+            v->suspended = true;
+            return;
+        }
+    }
+
+    vhost_vdpa_reset_device(dev);
+}
+
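The stop path prefers a reversible suspend over a destructive reset: when the backend advertised VHOST_BACKEND_F_SUSPEND, the SUSPEND ioctl freezes the rings so the used index later read by vhost_vdpa_get_vring_base() stays trustworthy; otherwise the device is reset and in-flight state is lost. A minimal standalone sketch of that fallback (the bit value and helpers are stand-ins, not the real uAPI):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define BIT_ULL(n) (1ULL << (n))
    #define F_SUSPEND  4       /* stand-in; the real value is in the uAPI */

    static bool try_suspend(void) { puts("SUSPEND ioctl"); return true; }
    static void reset_device(void) { puts("reset device"); }

    /* Suspend if the backend can, otherwise fall back to a full reset. */
    static bool stop_device(uint64_t backend_cap)
    {
        if ((backend_cap & BIT_ULL(F_SUSPEND)) && try_suspend()) {
            return true;       /* rings frozen; used idx stays valid */
        }
        reset_device();        /* in-flight descriptors are lost */
        return false;
    }

    int main(void)
    {
        stop_device(BIT_ULL(F_SUSPEND));   /* suspends */
        stop_device(0);                    /* resets */
        return 0;
    }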
 static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
 {
     struct vhost_vdpa *v = dev->opaque;
@@ -1108,6 +1153,7 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
         }
         vhost_vdpa_set_vring_ready(dev);
     } else {
+        vhost_vdpa_suspend(dev);
         vhost_vdpa_svqs_stop(dev);
         vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
     }
@@ -1119,14 +1165,23 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
     if (started) {
         memory_listener_register(&v->listener, &address_space_memory);
         return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
-    } else {
-        vhost_vdpa_reset_device(dev);
-        vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
-                                   VIRTIO_CONFIG_S_DRIVER);
-        memory_listener_unregister(&v->listener);
+    }
 
-        return 0;
+    return 0;
+}
+
+static void vhost_vdpa_reset_status(struct vhost_dev *dev)
+{
+    struct vhost_vdpa *v = dev->opaque;
+
+    if (dev->vq_index + dev->nvqs != dev->vq_index_end) {
+        return;
     }
+
+    vhost_vdpa_reset_device(dev);
+    vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
+                               VIRTIO_CONFIG_S_DRIVER);
+    memory_listener_unregister(&v->listener);
 }
 
 static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base,
@@ -1169,18 +1224,7 @@ static int vhost_vdpa_set_vring_base(struct vhost_dev *dev,
                                        struct vhost_vring_state *ring)
 {
     struct vhost_vdpa *v = dev->opaque;
-    VirtQueue *vq = virtio_get_queue(dev->vdev, ring->index);
 
-    /*
-     * vhost-vdpa devices does not support in-flight requests. Set all of them
-     * as available.
-     *
-     * TODO: This is ok for networking, but other kinds of devices might
-     * have problems with these retransmissions.
-     */
-    while (virtqueue_rewind(vq, 1)) {
-        continue;
-    }
     if (v->shadow_vqs_enabled) {
         /*
          * Device vring base was set at device start. SVQ base is handled by
@@ -1203,6 +1247,14 @@ static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
         return 0;
     }
 
+    if (!v->suspended) {
+        /*
+         * Cannot trust the value returned by the device; let vhost recover
+         * the used idx from the guest.
+         */
+        return -1;
+    }
+
     ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring);
     trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num);
     return ret;
@@ -1227,25 +1279,24 @@ static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
                                        struct vhost_vring_file *file)
 {
     struct vhost_vdpa *v = dev->opaque;
+    int vdpa_idx = file->index - dev->vq_index;
+    VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
 
+    /* Remember last call fd because we can switch to SVQ anytime. */
+    vhost_svq_set_svq_call_fd(svq, file->fd);
     if (v->shadow_vqs_enabled) {
-        int vdpa_idx = file->index - dev->vq_index;
-        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
-
-        vhost_svq_set_svq_call_fd(svq, file->fd);
         return 0;
-    } else {
-        return vhost_vdpa_set_vring_dev_call(dev, file);
     }
+
+    return vhost_vdpa_set_vring_dev_call(dev, file);
 }
 
 static int vhost_vdpa_get_features(struct vhost_dev *dev,
                                      uint64_t *features)
 {
-    struct vhost_vdpa *v = dev->opaque;
     int ret = vhost_vdpa_get_dev_features(dev, features);
 
-    if (ret == 0 && v->shadow_vqs_enabled) {
+    if (ret == 0) {
         /* Add SVQ logging capabilities */
         *features |= BIT_ULL(VHOST_F_LOG_ALL);
     }
@@ -1313,4 +1364,5 @@ const VhostOps vdpa_ops = {
         .vhost_vq_get_addr = vhost_vdpa_vq_get_addr,
         .vhost_force_iommu = vhost_vdpa_force_iommu,
         .vhost_set_config_call = vhost_vdpa_set_config_call,
+        .vhost_reset_status = vhost_vdpa_reset_status,
 };
diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index eb8c4c378c..a266396576 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -2049,6 +2049,9 @@ void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings)
                              hdev->vqs + i,
                              hdev->vq_index + i);
     }
+    if (hdev->vhost_ops->vhost_reset_status) {
+        hdev->vhost_ops->vhost_reset_status(hdev);
+    }
 
     if (vhost_dev_has_iommu(hdev)) {
         if (hdev->vhost_ops->vhost_set_iotlb_callback) {
diff --git a/hw/virtio/virtio-crypto.c b/hw/virtio/virtio-crypto.c
index 516425e26a..802e1b9659 100644
--- a/hw/virtio/virtio-crypto.c
+++ b/hw/virtio/virtio-crypto.c
@@ -462,7 +462,7 @@ static void virtio_crypto_init_request(VirtIOCrypto *vcrypto, VirtQueue *vq,
     req->in_iov = NULL;
     req->in_num = 0;
     req->in_len = 0;
-    req->flags = CRYPTODEV_BACKEND_ALG__MAX;
+    req->flags = QCRYPTODEV_BACKEND_ALG__MAX;
     memset(&req->op_info, 0x00, sizeof(req->op_info));
 }
 
@@ -472,7 +472,7 @@ static void virtio_crypto_free_request(VirtIOCryptoReq *req)
         return;
     }
 
-    if (req->flags == CRYPTODEV_BACKEND_ALG_SYM) {
+    if (req->flags == QCRYPTODEV_BACKEND_ALG_SYM) {
         size_t max_len;
         CryptoDevBackendSymOpInfo *op_info = req->op_info.u.sym_op_info;
 
@@ -485,7 +485,7 @@ static void virtio_crypto_free_request(VirtIOCryptoReq *req)
         /* Zeroize and free request data structure */
         memset(op_info, 0, sizeof(*op_info) + max_len);
         g_free(op_info);
-    } else if (req->flags == CRYPTODEV_BACKEND_ALG_ASYM) {
+    } else if (req->flags == QCRYPTODEV_BACKEND_ALG_ASYM) {
         CryptoDevBackendAsymOpInfo *op_info = req->op_info.u.asym_op_info;
         if (op_info) {
             g_free(op_info->src);
@@ -570,10 +570,10 @@ static void virtio_crypto_req_complete(void *opaque, int ret)
     VirtIODevice *vdev = VIRTIO_DEVICE(vcrypto);
     uint8_t status = -ret;
 
-    if (req->flags == CRYPTODEV_BACKEND_ALG_SYM) {
+    if (req->flags == QCRYPTODEV_BACKEND_ALG_SYM) {
         virtio_crypto_sym_input_data_helper(vdev, req, status,
                                             req->op_info.u.sym_op_info);
-    } else if (req->flags == CRYPTODEV_BACKEND_ALG_ASYM) {
+    } else if (req->flags == QCRYPTODEV_BACKEND_ALG_ASYM) {
         virtio_crypto_akcipher_input_data_helper(vdev, req, status,
                                              req->op_info.u.asym_op_info);
     }
@@ -871,11 +871,14 @@ virtio_crypto_handle_request(VirtIOCryptoReq *request)
     opcode = ldl_le_p(&req.header.opcode);
     op_info->session_id = ldq_le_p(&req.header.session_id);
     op_info->op_code = opcode;
+    op_info->queue_index = queue_index;
+    op_info->cb = virtio_crypto_req_complete;
+    op_info->opaque = request;
 
     switch (opcode) {
     case VIRTIO_CRYPTO_CIPHER_ENCRYPT:
     case VIRTIO_CRYPTO_CIPHER_DECRYPT:
-        op_info->algtype = request->flags = CRYPTODEV_BACKEND_ALG_SYM;
+        op_info->algtype = request->flags = QCRYPTODEV_BACKEND_ALG_SYM;
         ret = virtio_crypto_handle_sym_req(vcrypto,
                          &req.u.sym_req, op_info,
                          out_iov, out_num);
@@ -885,7 +888,7 @@ virtio_crypto_handle_request(VirtIOCryptoReq *request)
     case VIRTIO_CRYPTO_AKCIPHER_DECRYPT:
     case VIRTIO_CRYPTO_AKCIPHER_SIGN:
     case VIRTIO_CRYPTO_AKCIPHER_VERIFY:
-        op_info->algtype = request->flags = CRYPTODEV_BACKEND_ALG_ASYM;
+        op_info->algtype = request->flags = QCRYPTODEV_BACKEND_ALG_ASYM;
         ret = virtio_crypto_handle_asym_req(vcrypto,
                          &req.u.akcipher_req, op_info,
                          out_iov, out_num);
@@ -898,9 +901,7 @@ check_result:
             virtio_crypto_req_complete(request, -VIRTIO_CRYPTO_NOTSUPP);
         } else {
             ret = cryptodev_backend_crypto_operation(vcrypto->cryptodev,
-                                    request, queue_index,
-                                    virtio_crypto_req_complete,
-                                    request);
+                                    op_info);
             if (ret < 0) {
                 virtio_crypto_req_complete(request, ret);
             }
@@ -997,12 +998,35 @@ static void virtio_crypto_reset(VirtIODevice *vdev)
     }
 }
 
+static uint32_t virtio_crypto_init_services(uint32_t qservices)
+{
+    uint32_t vservices = 0;
+
+    if (qservices & (1 << QCRYPTODEV_BACKEND_SERVICE_CIPHER)) {
+        vservices |= (1 << VIRTIO_CRYPTO_SERVICE_CIPHER);
+    }
+    if (qservices & (1 << QCRYPTODEV_BACKEND_SERVICE_HASH)) {
+        vservices |= (1 << VIRTIO_CRYPTO_SERVICE_HASH);
+    }
+    if (qservices & (1 << QCRYPTODEV_BACKEND_SERVICE_MAC)) {
+        vservices |= (1 << VIRTIO_CRYPTO_SERVICE_MAC);
+    }
+    if (qservices & (1 << QCRYPTODEV_BACKEND_SERVICE_AEAD)) {
+        vservices |= (1 << VIRTIO_CRYPTO_SERVICE_AEAD);
+    }
+    if (qservices & (1 << QCRYPTODEV_BACKEND_SERVICE_AKCIPHER)) {
+        vservices |= (1 << VIRTIO_CRYPTO_SERVICE_AKCIPHER);
+    }
+
+    return vservices;
+}
+
 static void virtio_crypto_init_config(VirtIODevice *vdev)
 {
     VirtIOCrypto *vcrypto = VIRTIO_CRYPTO(vdev);
 
-    vcrypto->conf.crypto_services =
-                     vcrypto->conf.cryptodev->conf.crypto_services;
+    vcrypto->conf.crypto_services = virtio_crypto_init_services(
+                     vcrypto->conf.cryptodev->conf.crypto_services);
     vcrypto->conf.cipher_algo_l =
                      vcrypto->conf.cryptodev->conf.cipher_algo_l;
     vcrypto->conf.cipher_algo_h =
diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index f35178f5fc..98c4819fcc 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -1069,7 +1069,7 @@ static void virtqueue_split_get_avail_bytes(VirtQueue *vq,
                             VRingMemoryRegionCaches *caches)
 {
     VirtIODevice *vdev = vq->vdev;
-    unsigned int max, idx;
+    unsigned int idx;
     unsigned int total_bufs, in_total, out_total;
     MemoryRegionCache indirect_desc_cache = MEMORY_REGION_CACHE_INVALID;
     int64_t len = 0;
@@ -1078,13 +1078,12 @@ static void virtqueue_split_get_avail_bytes(VirtQueue *vq,
     idx = vq->last_avail_idx;
     total_bufs = in_total = out_total = 0;
 
-    max = vq->vring.num;
-
     while ((rc = virtqueue_num_heads(vq, idx)) > 0) {
         MemoryRegionCache *desc_cache = &caches->desc;
         unsigned int num_bufs;
         VRingDesc desc;
         unsigned int i;
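+        /* max is clobbered by indirect descriptor chains; reset per head */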
+        unsigned int max = vq->vring.num;
 
         num_bufs = total_bufs;
 
@@ -1206,7 +1205,7 @@ static void virtqueue_packed_get_avail_bytes(VirtQueue *vq,
                                              VRingMemoryRegionCaches *caches)
 {
     VirtIODevice *vdev = vq->vdev;
-    unsigned int max, idx;
+    unsigned int idx;
     unsigned int total_bufs, in_total, out_total;
     MemoryRegionCache *desc_cache;
     MemoryRegionCache indirect_desc_cache = MEMORY_REGION_CACHE_INVALID;
@@ -1218,14 +1217,14 @@ static void virtqueue_packed_get_avail_bytes(VirtQueue *vq,
     wrap_counter = vq->last_avail_wrap_counter;
     total_bufs = in_total = out_total = 0;
 
-    max = vq->vring.num;
-
     for (;;) {
         unsigned int num_bufs = total_bufs;
         unsigned int i = idx;
         int rc;
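+        /* max is clobbered by indirect descriptor chains; reset each time */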
+        unsigned int max = vq->vring.num;
 
         desc_cache = &caches->desc;
+
         vring_packed_desc_read(vdev, &desc, desc_cache, idx, true);
         if (!is_desc_avail(desc.flags, wrap_counter)) {
             break;
diff --git a/hw/xen/meson.build b/hw/xen/meson.build
index ae0ace3046..19c6aabc7c 100644
--- a/hw/xen/meson.build
+++ b/hw/xen/meson.build
@@ -1,4 +1,4 @@
-softmmu_ss.add(when: ['CONFIG_XEN', xen], if_true: files(
+softmmu_ss.add(when: ['CONFIG_XEN_BUS'], if_true: files(
   'xen-backend.c',
   'xen-bus-helper.c',
   'xen-bus.c',
@@ -7,6 +7,10 @@ softmmu_ss.add(when: ['CONFIG_XEN', xen], if_true: files(
   'xen_pvdev.c',
 ))
 
+softmmu_ss.add(when: ['CONFIG_XEN', xen], if_true: files(
+  'xen-operations.c',
+))
+
 xen_specific_ss = ss.source_set()
 if have_xen_pci_passthrough
   xen_specific_ss.add(files(
diff --git a/hw/xen/trace-events b/hw/xen/trace-events
index 3da3fd8348..55c9e1df68 100644
--- a/hw/xen/trace-events
+++ b/hw/xen/trace-events
@@ -1,6 +1,6 @@
 # See docs/devel/tracing.rst for syntax documentation.
 
-# ../../include/hw/xen/xen_common.h
+# ../../include/hw/xen/xen_native.h
 xen_default_ioreq_server(void) ""
 xen_ioreq_server_create(uint32_t id) "id: %u"
 xen_ioreq_server_destroy(uint32_t id) "id: %u"
diff --git a/hw/xen/xen-bus-helper.c b/hw/xen/xen-bus-helper.c
index 5a1e12b374..b2b2cc9c5d 100644
--- a/hw/xen/xen-bus-helper.c
+++ b/hw/xen/xen-bus-helper.c
@@ -10,6 +10,7 @@
 #include "hw/xen/xen-bus.h"
 #include "hw/xen/xen-bus-helper.h"
 #include "qapi/error.h"
+#include "trace.h"
 
 #include <glib/gprintf.h>
 
@@ -46,34 +47,28 @@ const char *xs_strstate(enum xenbus_state state)
     return "INVALID";
 }
 
-void xs_node_create(struct xs_handle *xsh, xs_transaction_t tid,
-                    const char *node, struct xs_permissions perms[],
-                    unsigned int nr_perms, Error **errp)
+void xs_node_create(struct qemu_xs_handle *h, xs_transaction_t tid,
+                    const char *node, unsigned int owner, unsigned int domid,
+                    unsigned int perms, Error **errp)
 {
     trace_xs_node_create(node);
 
-    if (!xs_write(xsh, tid, node, "", 0)) {
+    if (!qemu_xen_xs_create(h, tid, owner, domid, perms, node)) {
         error_setg_errno(errp, errno, "failed to create node '%s'", node);
-        return;
-    }
-
-    if (!xs_set_permissions(xsh, tid, node, perms, nr_perms)) {
-        error_setg_errno(errp, errno, "failed to set node '%s' permissions",
-                         node);
     }
 }
 
-void xs_node_destroy(struct xs_handle *xsh, xs_transaction_t tid,
+void xs_node_destroy(struct qemu_xs_handle *h, xs_transaction_t tid,
                      const char *node, Error **errp)
 {
     trace_xs_node_destroy(node);
 
-    if (!xs_rm(xsh, tid, node)) {
+    if (!qemu_xen_xs_destroy(h, tid, node)) {
         error_setg_errno(errp, errno, "failed to destroy node '%s'", node);
     }
 }
 
-void xs_node_vprintf(struct xs_handle *xsh, xs_transaction_t tid,
+void xs_node_vprintf(struct qemu_xs_handle *h, xs_transaction_t tid,
                      const char *node, const char *key, Error **errp,
                      const char *fmt, va_list ap)
 {
@@ -86,7 +81,7 @@ void xs_node_vprintf(struct xs_handle *xsh, xs_transaction_t tid,
 
     trace_xs_node_vprintf(path, value);
 
-    if (!xs_write(xsh, tid, path, value, len)) {
+    if (!qemu_xen_xs_write(h, tid, path, value, len)) {
         error_setg_errno(errp, errno, "failed to write '%s' to '%s'",
                          value, path);
     }
@@ -95,18 +90,18 @@ void xs_node_vprintf(struct xs_handle *xsh, xs_transaction_t tid,
     g_free(path);
 }
 
-void xs_node_printf(struct xs_handle *xsh,  xs_transaction_t tid,
+void xs_node_printf(struct qemu_xs_handle *h,  xs_transaction_t tid,
                     const char *node, const char *key, Error **errp,
                     const char *fmt, ...)
 {
     va_list ap;
 
     va_start(ap, fmt);
-    xs_node_vprintf(xsh, tid, node, key, errp, fmt, ap);
+    xs_node_vprintf(h, tid, node, key, errp, fmt, ap);
     va_end(ap);
 }
 
-int xs_node_vscanf(struct xs_handle *xsh,  xs_transaction_t tid,
+int xs_node_vscanf(struct qemu_xs_handle *h,  xs_transaction_t tid,
                    const char *node, const char *key, Error **errp,
                    const char *fmt, va_list ap)
 {
@@ -115,7 +110,7 @@ int xs_node_vscanf(struct xs_handle *xsh,  xs_transaction_t tid,
 
     path = (strlen(node) != 0) ? g_strdup_printf("%s/%s", node, key) :
         g_strdup(key);
-    value = xs_read(xsh, tid, path, NULL);
+    value = qemu_xen_xs_read(h, tid, path, NULL);
 
     trace_xs_node_vscanf(path, value);
 
@@ -133,7 +128,7 @@ int xs_node_vscanf(struct xs_handle *xsh,  xs_transaction_t tid,
     return rc;
 }
 
-int xs_node_scanf(struct xs_handle *xsh,  xs_transaction_t tid,
+int xs_node_scanf(struct qemu_xs_handle *h,  xs_transaction_t tid,
                   const char *node, const char *key, Error **errp,
                   const char *fmt, ...)
 {
@@ -141,42 +136,35 @@ int xs_node_scanf(struct xs_handle *xsh,  xs_transaction_t tid,
     int rc;
 
     va_start(ap, fmt);
-    rc = xs_node_vscanf(xsh, tid, node, key, errp, fmt, ap);
+    rc = xs_node_vscanf(h, tid, node, key, errp, fmt, ap);
     va_end(ap);
 
     return rc;
 }
 
-void xs_node_watch(struct xs_handle *xsh, const char *node, const char *key,
-                   char *token, Error **errp)
+struct qemu_xs_watch *xs_node_watch(struct qemu_xs_handle *h, const char *node,
+                                    const char *key, xs_watch_fn fn,
+                                    void *opaque, Error **errp)
 {
     char *path;
+    struct qemu_xs_watch *w;
 
     path = (strlen(node) != 0) ? g_strdup_printf("%s/%s", node, key) :
         g_strdup(key);
 
     trace_xs_node_watch(path);
 
-    if (!xs_watch(xsh, path, token)) {
+    w = qemu_xen_xs_watch(h, path, fn, opaque);
+    if (!w) {
         error_setg_errno(errp, errno, "failed to watch node '%s'", path);
     }
 
     g_free(path);
+
+    return w;
 }
 
-void xs_node_unwatch(struct xs_handle *xsh, const char *node,
-                     const char *key, const char *token, Error **errp)
+void xs_node_unwatch(struct qemu_xs_handle *h, struct qemu_xs_watch *w)
 {
-    char *path;
-
-    path = (strlen(node) != 0) ? g_strdup_printf("%s/%s", node, key) :
-        g_strdup(key);
-
-    trace_xs_node_unwatch(path);
-
-    if (!xs_unwatch(xsh, path, token)) {
-        error_setg_errno(errp, errno, "failed to unwatch node '%s'", path);
-    }
-
-    g_free(path);
+    qemu_xen_xs_unwatch(h, w);
 }
diff --git a/hw/xen/xen-bus.c b/hw/xen/xen-bus.c
index df3f6b9ae0..c59850b1de 100644
--- a/hw/xen/xen-bus.c
+++ b/hw/xen/xen-bus.c
@@ -62,7 +62,7 @@ static void xen_device_unplug(XenDevice *xendev, Error **errp)
 
     /* Mimic the way the Xen toolstack does an unplug */
 again:
-    tid = xs_transaction_start(xenbus->xsh);
+    tid = qemu_xen_xs_transaction_start(xenbus->xsh);
     if (tid == XBT_NULL) {
         error_setg_errno(errp, errno, "failed xs_transaction_start");
         return;
@@ -80,7 +80,7 @@ again:
         goto abort;
     }
 
-    if (!xs_transaction_end(xenbus->xsh, tid, false)) {
+    if (!qemu_xen_xs_transaction_end(xenbus->xsh, tid, false)) {
         if (errno == EAGAIN) {
             goto again;
         }
@@ -95,7 +95,7 @@ abort:
      * We only abort if there is already a failure so ignore any error
      * from ending the transaction.
      */
-    xs_transaction_end(xenbus->xsh, tid, true);
+    qemu_xen_xs_transaction_end(xenbus->xsh, tid, true);
 }
 
 static void xen_bus_print_dev(Monitor *mon, DeviceState *dev, int indent)
@@ -111,143 +111,6 @@ static char *xen_bus_get_dev_path(DeviceState *dev)
     return xen_device_get_backend_path(XEN_DEVICE(dev));
 }
 
-struct XenWatch {
-    char *node, *key;
-    char *token;
-    XenWatchHandler handler;
-    void *opaque;
-    Notifier notifier;
-};
-
-static void watch_notify(Notifier *n, void *data)
-{
-    XenWatch *watch = container_of(n, XenWatch, notifier);
-    const char *token = data;
-
-    if (!strcmp(watch->token, token)) {
-        watch->handler(watch->opaque);
-    }
-}
-
-static XenWatch *new_watch(const char *node, const char *key,
-                           XenWatchHandler handler, void *opaque)
-{
-    XenWatch *watch = g_new0(XenWatch, 1);
-    QemuUUID uuid;
-
-    qemu_uuid_generate(&uuid);
-
-    watch->token = qemu_uuid_unparse_strdup(&uuid);
-    watch->node = g_strdup(node);
-    watch->key = g_strdup(key);
-    watch->handler = handler;
-    watch->opaque = opaque;
-    watch->notifier.notify = watch_notify;
-
-    return watch;
-}
-
-static void free_watch(XenWatch *watch)
-{
-    g_free(watch->token);
-    g_free(watch->key);
-    g_free(watch->node);
-
-    g_free(watch);
-}
-
-struct XenWatchList {
-    struct xs_handle *xsh;
-    NotifierList notifiers;
-};
-
-static void watch_list_event(void *opaque)
-{
-    XenWatchList *watch_list = opaque;
-    char **v;
-    const char *token;
-
-    v = xs_check_watch(watch_list->xsh);
-    if (!v) {
-        return;
-    }
-
-    token = v[XS_WATCH_TOKEN];
-
-    notifier_list_notify(&watch_list->notifiers, (void *)token);
-
-    free(v);
-}
-
-static XenWatchList *watch_list_create(struct xs_handle *xsh)
-{
-    XenWatchList *watch_list = g_new0(XenWatchList, 1);
-
-    g_assert(xsh);
-
-    watch_list->xsh = xsh;
-    notifier_list_init(&watch_list->notifiers);
-    qemu_set_fd_handler(xs_fileno(watch_list->xsh), watch_list_event, NULL,
-                        watch_list);
-
-    return watch_list;
-}
-
-static void watch_list_destroy(XenWatchList *watch_list)
-{
-    g_assert(notifier_list_empty(&watch_list->notifiers));
-    qemu_set_fd_handler(xs_fileno(watch_list->xsh), NULL, NULL, NULL);
-    g_free(watch_list);
-}
-
-static XenWatch *watch_list_add(XenWatchList *watch_list, const char *node,
-                                const char *key, XenWatchHandler handler,
-                                void *opaque, Error **errp)
-{
-    ERRP_GUARD();
-    XenWatch *watch = new_watch(node, key, handler, opaque);
-
-    notifier_list_add(&watch_list->notifiers, &watch->notifier);
-
-    xs_node_watch(watch_list->xsh, node, key, watch->token, errp);
-    if (*errp) {
-        notifier_remove(&watch->notifier);
-        free_watch(watch);
-
-        return NULL;
-    }
-
-    return watch;
-}
-
-static void watch_list_remove(XenWatchList *watch_list, XenWatch *watch,
-                              Error **errp)
-{
-    xs_node_unwatch(watch_list->xsh, watch->node, watch->key, watch->token,
-                    errp);
-
-    notifier_remove(&watch->notifier);
-    free_watch(watch);
-}
-
-static XenWatch *xen_bus_add_watch(XenBus *xenbus, const char *node,
-                                   const char *key, XenWatchHandler handler,
-                                   Error **errp)
-{
-    trace_xen_bus_add_watch(node, key);
-
-    return watch_list_add(xenbus->watch_list, node, key, handler, xenbus,
-                          errp);
-}
-
-static void xen_bus_remove_watch(XenBus *xenbus, XenWatch *watch,
-                                 Error **errp)
-{
-    trace_xen_bus_remove_watch(watch->node, watch->key);
-
-    watch_list_remove(xenbus->watch_list, watch, errp);
-}
-
 static void xen_bus_backend_create(XenBus *xenbus, const char *type,
                                    const char *name, char *path,
                                    Error **errp)
@@ -261,15 +124,15 @@ static void xen_bus_backend_create(XenBus *xenbus, const char *type,
     trace_xen_bus_backend_create(type, path);
 
 again:
-    tid = xs_transaction_start(xenbus->xsh);
+    tid = qemu_xen_xs_transaction_start(xenbus->xsh);
     if (tid == XBT_NULL) {
         error_setg(errp, "failed xs_transaction_start");
         return;
     }
 
-    key = xs_directory(xenbus->xsh, tid, path, &n);
+    key = qemu_xen_xs_directory(xenbus->xsh, tid, path, &n);
     if (!key) {
-        if (!xs_transaction_end(xenbus->xsh, tid, true)) {
+        if (!qemu_xen_xs_transaction_end(xenbus->xsh, tid, true)) {
             error_setg_errno(errp, errno, "failed xs_transaction_end");
         }
         return;
@@ -300,7 +163,7 @@ again:
 
     free(key);
 
-    if (!xs_transaction_end(xenbus->xsh, tid, false)) {
+    if (!qemu_xen_xs_transaction_end(xenbus->xsh, tid, false)) {
         qobject_unref(opts);
 
         if (errno == EAGAIN) {
@@ -327,7 +190,7 @@ static void xen_bus_type_enumerate(XenBus *xenbus, const char *type)
 
     trace_xen_bus_type_enumerate(type);
 
-    backend = xs_directory(xenbus->xsh, XBT_NULL, domain_path, &n);
+    backend = qemu_xen_xs_directory(xenbus->xsh, XBT_NULL, domain_path, &n);
     if (!backend) {
         goto out;
     }
@@ -372,7 +235,7 @@ static void xen_bus_enumerate(XenBus *xenbus)
 
     trace_xen_bus_enumerate();
 
-    type = xs_directory(xenbus->xsh, XBT_NULL, "backend", &n);
+    type = qemu_xen_xs_directory(xenbus->xsh, XBT_NULL, "backend", &n);
     if (!type) {
         return;
     }
@@ -415,7 +278,7 @@ static void xen_bus_cleanup(XenBus *xenbus)
     }
 }
 
-static void xen_bus_backend_changed(void *opaque)
+static void xen_bus_backend_changed(void *opaque, const char *path)
 {
     XenBus *xenbus = opaque;
 
@@ -434,7 +297,7 @@ static void xen_bus_unrealize(BusState *bus)
 
         for (i = 0; i < xenbus->backend_types; i++) {
             if (xenbus->backend_watch[i]) {
-                xen_bus_remove_watch(xenbus, xenbus->backend_watch[i], NULL);
+                xs_node_unwatch(xenbus->xsh, xenbus->backend_watch[i]);
             }
         }
 
@@ -442,13 +305,8 @@ static void xen_bus_unrealize(BusState *bus)
         xenbus->backend_watch = NULL;
     }
 
-    if (xenbus->watch_list) {
-        watch_list_destroy(xenbus->watch_list);
-        xenbus->watch_list = NULL;
-    }
-
     if (xenbus->xsh) {
-        xs_close(xenbus->xsh);
+        qemu_xen_xs_close(xenbus->xsh);
     }
 }
 
@@ -463,7 +321,7 @@ static void xen_bus_realize(BusState *bus, Error **errp)
 
     trace_xen_bus_realize();
 
-    xenbus->xsh = xs_open(0);
+    xenbus->xsh = qemu_xen_xs_open();
     if (!xenbus->xsh) {
         error_setg_errno(errp, errno, "failed xs_open");
         goto fail;
@@ -476,19 +334,18 @@ static void xen_bus_realize(BusState *bus, Error **errp)
         xenbus->backend_id = 0; /* Assume lack of node means dom0 */
     }
 
-    xenbus->watch_list = watch_list_create(xenbus->xsh);
-
     module_call_init(MODULE_INIT_XEN_BACKEND);
 
     type = xen_backend_get_types(&xenbus->backend_types);
-    xenbus->backend_watch = g_new(XenWatch *, xenbus->backend_types);
+    xenbus->backend_watch = g_new(struct qemu_xs_watch *,
+                                  xenbus->backend_types);
 
     for (i = 0; i < xenbus->backend_types; i++) {
         char *node = g_strdup_printf("backend/%s", type[i]);
 
         xenbus->backend_watch[i] =
-            xen_bus_add_watch(xenbus, node, key, xen_bus_backend_changed,
-                              &local_err);
+            xs_node_watch(xenbus->xsh, node, key, xen_bus_backend_changed,
+                          xenbus, &local_err);
         if (local_err) {
             /* This need not be treated as a hard error so don't propagate */
             error_reportf_err(local_err,
@@ -631,7 +488,7 @@ static bool xen_device_frontend_is_active(XenDevice *xendev)
     }
 }
 
-static void xen_device_backend_changed(void *opaque)
+static void xen_device_backend_changed(void *opaque, const char *path)
 {
     XenDevice *xendev = opaque;
     const char *type = object_get_typename(OBJECT(xendev));
@@ -685,66 +542,35 @@ static void xen_device_backend_changed(void *opaque)
     }
 }
 
-static XenWatch *xen_device_add_watch(XenDevice *xendev, const char *node,
-                                      const char *key,
-                                      XenWatchHandler handler,
-                                      Error **errp)
-{
-    const char *type = object_get_typename(OBJECT(xendev));
-
-    trace_xen_device_add_watch(type, xendev->name, node, key);
-
-    return watch_list_add(xendev->watch_list, node, key, handler, xendev,
-                          errp);
-}
-
-static void xen_device_remove_watch(XenDevice *xendev, XenWatch *watch,
-                                    Error **errp)
-{
-    const char *type = object_get_typename(OBJECT(xendev));
-
-    trace_xen_device_remove_watch(type, xendev->name, watch->node,
-                                  watch->key);
-
-    watch_list_remove(xendev->watch_list, watch, errp);
-}
-
-
 static void xen_device_backend_create(XenDevice *xendev, Error **errp)
 {
     ERRP_GUARD();
     XenBus *xenbus = XEN_BUS(qdev_get_parent_bus(DEVICE(xendev)));
-    struct xs_permissions perms[2];
 
     xendev->backend_path = xen_device_get_backend_path(xendev);
 
-    perms[0].id = xenbus->backend_id;
-    perms[0].perms = XS_PERM_NONE;
-    perms[1].id = xendev->frontend_id;
-    perms[1].perms = XS_PERM_READ;
-
     g_assert(xenbus->xsh);
 
-    xs_node_create(xenbus->xsh, XBT_NULL, xendev->backend_path, perms,
-                   ARRAY_SIZE(perms), errp);
+    xs_node_create(xenbus->xsh, XBT_NULL, xendev->backend_path,
+                   xenbus->backend_id, xendev->frontend_id, XS_PERM_READ, errp);
     if (*errp) {
         error_prepend(errp, "failed to create backend: ");
         return;
     }
 
     xendev->backend_state_watch =
-        xen_device_add_watch(xendev, xendev->backend_path,
-                             "state", xen_device_backend_changed,
-                             errp);
+        xs_node_watch(xendev->xsh, xendev->backend_path,
+                      "state", xen_device_backend_changed, xendev,
+                      errp);
     if (*errp) {
         error_prepend(errp, "failed to watch backend state: ");
         return;
     }
 
     xendev->backend_online_watch =
-        xen_device_add_watch(xendev, xendev->backend_path,
-                             "online", xen_device_backend_changed,
-                             errp);
+        xs_node_watch(xendev->xsh, xendev->backend_path,
+                      "online", xen_device_backend_changed, xendev,
+                      errp);
     if (*errp) {
         error_prepend(errp, "failed to watch backend online: ");
         return;
@@ -757,12 +583,12 @@ static void xen_device_backend_destroy(XenDevice *xendev)
     Error *local_err = NULL;
 
     if (xendev->backend_online_watch) {
-        xen_device_remove_watch(xendev, xendev->backend_online_watch, NULL);
+        xs_node_unwatch(xendev->xsh, xendev->backend_online_watch);
         xendev->backend_online_watch = NULL;
     }
 
     if (xendev->backend_state_watch) {
-        xen_device_remove_watch(xendev, xendev->backend_state_watch, NULL);
+        xs_node_unwatch(xendev->xsh, xendev->backend_state_watch);
         xendev->backend_state_watch = NULL;
     }
 
@@ -837,7 +663,7 @@ static void xen_device_frontend_set_state(XenDevice *xendev,
     }
 }
 
-static void xen_device_frontend_changed(void *opaque)
+static void xen_device_frontend_changed(void *opaque, const char *path)
 {
     XenDevice *xendev = opaque;
     XenDeviceClass *xendev_class = XEN_DEVICE_GET_CLASS(xendev);
@@ -885,7 +711,6 @@ static void xen_device_frontend_create(XenDevice *xendev, Error **errp)
 {
     ERRP_GUARD();
     XenBus *xenbus = XEN_BUS(qdev_get_parent_bus(DEVICE(xendev)));
-    struct xs_permissions perms[2];
 
     xendev->frontend_path = xen_device_get_frontend_path(xendev);
 
@@ -894,15 +719,11 @@ static void xen_device_frontend_create(XenDevice *xendev, Error **errp)
      * toolstack.
      */
     if (!xen_device_frontend_exists(xendev)) {
-        perms[0].id = xendev->frontend_id;
-        perms[0].perms = XS_PERM_NONE;
-        perms[1].id = xenbus->backend_id;
-        perms[1].perms = XS_PERM_READ | XS_PERM_WRITE;
-
         g_assert(xenbus->xsh);
 
-        xs_node_create(xenbus->xsh, XBT_NULL, xendev->frontend_path, perms,
-                       ARRAY_SIZE(perms), errp);
+        xs_node_create(xenbus->xsh, XBT_NULL, xendev->frontend_path,
+                       xendev->frontend_id, xenbus->backend_id,
+                       XS_PERM_READ | XS_PERM_WRITE, errp);
         if (*errp) {
             error_prepend(errp, "failed to create frontend: ");
             return;
@@ -910,8 +731,8 @@ static void xen_device_frontend_create(XenDevice *xendev, Error **errp)
     }
 
     xendev->frontend_state_watch =
-        xen_device_add_watch(xendev, xendev->frontend_path, "state",
-                             xen_device_frontend_changed, errp);
+        xs_node_watch(xendev->xsh, xendev->frontend_path, "state",
+                      xen_device_frontend_changed, xendev, errp);
     if (*errp) {
         error_prepend(errp, "failed to watch frontend state: ");
     }
@@ -923,8 +744,7 @@ static void xen_device_frontend_destroy(XenDevice *xendev)
     Error *local_err = NULL;
 
     if (xendev->frontend_state_watch) {
-        xen_device_remove_watch(xendev, xendev->frontend_state_watch,
-                                NULL);
+        xs_node_unwatch(xendev->xsh, xendev->frontend_state_watch);
         xendev->frontend_state_watch = NULL;
     }
 
@@ -947,7 +767,7 @@ static void xen_device_frontend_destroy(XenDevice *xendev)
 void xen_device_set_max_grant_refs(XenDevice *xendev, unsigned int nr_refs,
                                    Error **errp)
 {
-    if (xengnttab_set_max_grants(xendev->xgth, nr_refs)) {
+    if (qemu_xen_gnttab_set_max_grants(xendev->xgth, nr_refs)) {
         error_setg_errno(errp, errno, "xengnttab_set_max_grants failed");
     }
 }
@@ -956,9 +776,8 @@ void *xen_device_map_grant_refs(XenDevice *xendev, uint32_t *refs,
                                 unsigned int nr_refs, int prot,
                                 Error **errp)
 {
-    void *map = xengnttab_map_domain_grant_refs(xendev->xgth, nr_refs,
-                                                xendev->frontend_id, refs,
-                                                prot);
+    void *map = qemu_xen_gnttab_map_refs(xendev->xgth, nr_refs,
+                                         xendev->frontend_id, refs, prot);
 
     if (!map) {
         error_setg_errno(errp, errno,
@@ -968,112 +787,20 @@ void *xen_device_map_grant_refs(XenDevice *xendev, uint32_t *refs,
     return map;
 }
 
-void xen_device_unmap_grant_refs(XenDevice *xendev, void *map,
+void xen_device_unmap_grant_refs(XenDevice *xendev, void *map, uint32_t *refs,
                                  unsigned int nr_refs, Error **errp)
 {
-    if (xengnttab_unmap(xendev->xgth, map, nr_refs)) {
+    if (qemu_xen_gnttab_unmap(xendev->xgth, map, refs, nr_refs)) {
         error_setg_errno(errp, errno, "xengnttab_unmap failed");
     }
 }
 
-static void compat_copy_grant_refs(XenDevice *xendev, bool to_domain,
-                                   XenDeviceGrantCopySegment segs[],
-                                   unsigned int nr_segs, Error **errp)
-{
-    uint32_t *refs = g_new(uint32_t, nr_segs);
-    int prot = to_domain ? PROT_WRITE : PROT_READ;
-    void *map;
-    unsigned int i;
-
-    for (i = 0; i < nr_segs; i++) {
-        XenDeviceGrantCopySegment *seg = &segs[i];
-
-        refs[i] = to_domain ? seg->dest.foreign.ref :
-            seg->source.foreign.ref;
-    }
-
-    map = xengnttab_map_domain_grant_refs(xendev->xgth, nr_segs,
-                                          xendev->frontend_id, refs,
-                                          prot);
-    if (!map) {
-        error_setg_errno(errp, errno,
-                         "xengnttab_map_domain_grant_refs failed");
-        goto done;
-    }
-
-    for (i = 0; i < nr_segs; i++) {
-        XenDeviceGrantCopySegment *seg = &segs[i];
-        void *page = map + (i * XC_PAGE_SIZE);
-
-        if (to_domain) {
-            memcpy(page + seg->dest.foreign.offset, seg->source.virt,
-                   seg->len);
-        } else {
-            memcpy(seg->dest.virt, page + seg->source.foreign.offset,
-                   seg->len);
-        }
-    }
-
-    if (xengnttab_unmap(xendev->xgth, map, nr_segs)) {
-        error_setg_errno(errp, errno, "xengnttab_unmap failed");
-    }
-
-done:
-    g_free(refs);
-}
-
 void xen_device_copy_grant_refs(XenDevice *xendev, bool to_domain,
                                 XenDeviceGrantCopySegment segs[],
                                 unsigned int nr_segs, Error **errp)
 {
-    xengnttab_grant_copy_segment_t *xengnttab_segs;
-    unsigned int i;
-
-    if (!xendev->feature_grant_copy) {
-        compat_copy_grant_refs(xendev, to_domain, segs, nr_segs, errp);
-        return;
-    }
-
-    xengnttab_segs = g_new0(xengnttab_grant_copy_segment_t, nr_segs);
-
-    for (i = 0; i < nr_segs; i++) {
-        XenDeviceGrantCopySegment *seg = &segs[i];
-        xengnttab_grant_copy_segment_t *xengnttab_seg = &xengnttab_segs[i];
-
-        if (to_domain) {
-            xengnttab_seg->flags = GNTCOPY_dest_gref;
-            xengnttab_seg->dest.foreign.domid = xendev->frontend_id;
-            xengnttab_seg->dest.foreign.ref = seg->dest.foreign.ref;
-            xengnttab_seg->dest.foreign.offset = seg->dest.foreign.offset;
-            xengnttab_seg->source.virt = seg->source.virt;
-        } else {
-            xengnttab_seg->flags = GNTCOPY_source_gref;
-            xengnttab_seg->source.foreign.domid = xendev->frontend_id;
-            xengnttab_seg->source.foreign.ref = seg->source.foreign.ref;
-            xengnttab_seg->source.foreign.offset =
-                seg->source.foreign.offset;
-            xengnttab_seg->dest.virt = seg->dest.virt;
-        }
-
-        xengnttab_seg->len = seg->len;
-    }
-
-    if (xengnttab_grant_copy(xendev->xgth, nr_segs, xengnttab_segs)) {
-        error_setg_errno(errp, errno, "xengnttab_grant_copy failed");
-        goto done;
-    }
-
-    for (i = 0; i < nr_segs; i++) {
-        xengnttab_grant_copy_segment_t *xengnttab_seg = &xengnttab_segs[i];
-
-        if (xengnttab_seg->status != GNTST_okay) {
-            error_setg(errp, "xengnttab_grant_copy seg[%u] failed", i);
-            break;
-        }
-    }
-
-done:
-    g_free(xengnttab_segs);
+    qemu_xen_gnttab_grant_copy(xendev->xgth, to_domain, xendev->frontend_id,
+                               (XenGrantCopySegment *)segs, nr_segs, errp);
 }
 
 struct XenEventChannel {
@@ -1095,12 +822,12 @@ static bool xen_device_poll(void *opaque)
 static void xen_device_event(void *opaque)
 {
     XenEventChannel *channel = opaque;
-    unsigned long port = xenevtchn_pending(channel->xeh);
+    unsigned long port = qemu_xen_evtchn_pending(channel->xeh);
 
     if (port == channel->local_port) {
         xen_device_poll(channel);
 
-        xenevtchn_unmask(channel->xeh, port);
+        qemu_xen_evtchn_unmask(channel->xeh, port);
     }
 }
 
@@ -1115,11 +842,11 @@ void xen_device_set_event_channel_context(XenDevice *xendev,
     }
 
     if (channel->ctx)
-        aio_set_fd_handler(channel->ctx, xenevtchn_fd(channel->xeh), true,
+        aio_set_fd_handler(channel->ctx, qemu_xen_evtchn_fd(channel->xeh), true,
                            NULL, NULL, NULL, NULL, NULL);
 
     channel->ctx = ctx;
-    aio_set_fd_handler(channel->ctx, xenevtchn_fd(channel->xeh), true,
+    aio_set_fd_handler(channel->ctx, qemu_xen_evtchn_fd(channel->xeh), true,
                        xen_device_event, NULL, xen_device_poll, NULL, channel);
 }
 
@@ -1131,13 +858,13 @@ XenEventChannel *xen_device_bind_event_channel(XenDevice *xendev,
     XenEventChannel *channel = g_new0(XenEventChannel, 1);
     xenevtchn_port_or_error_t local_port;
 
-    channel->xeh = xenevtchn_open(NULL, 0);
+    channel->xeh = qemu_xen_evtchn_open();
     if (!channel->xeh) {
         error_setg_errno(errp, errno, "failed xenevtchn_open");
         goto fail;
     }
 
-    local_port = xenevtchn_bind_interdomain(channel->xeh,
-                                            xendev->frontend_id,
-                                            port);
+    local_port = qemu_xen_evtchn_bind_interdomain(channel->xeh,
+                                                  xendev->frontend_id,
+                                                  port);
     if (local_port < 0) {
@@ -1160,7 +887,7 @@ XenEventChannel *xen_device_bind_event_channel(XenDevice *xendev,
 
 fail:
     if (channel->xeh) {
-        xenevtchn_close(channel->xeh);
+        qemu_xen_evtchn_close(channel->xeh);
     }
 
     g_free(channel);
@@ -1177,7 +904,7 @@ void xen_device_notify_event_channel(XenDevice *xendev,
         return;
     }
 
-    if (xenevtchn_notify(channel->xeh, channel->local_port) < 0) {
+    if (qemu_xen_evtchn_notify(channel->xeh, channel->local_port) < 0) {
         error_setg_errno(errp, errno, "xenevtchn_notify failed");
     }
 }
@@ -1193,14 +920,14 @@ void xen_device_unbind_event_channel(XenDevice *xendev,
 
     QLIST_REMOVE(channel, list);
 
-    aio_set_fd_handler(channel->ctx, xenevtchn_fd(channel->xeh), true,
+    aio_set_fd_handler(channel->ctx, qemu_xen_evtchn_fd(channel->xeh), true,
                        NULL, NULL, NULL, NULL, NULL);
 
-    if (xenevtchn_unbind(channel->xeh, channel->local_port) < 0) {
+    if (qemu_xen_evtchn_unbind(channel->xeh, channel->local_port) < 0) {
         error_setg_errno(errp, errno, "xenevtchn_unbind failed");
     }
 
-    xenevtchn_close(channel->xeh);
+    qemu_xen_evtchn_close(channel->xeh);
     g_free(channel);
 }
 
@@ -1235,17 +962,12 @@ static void xen_device_unrealize(DeviceState *dev)
     xen_device_backend_destroy(xendev);
 
     if (xendev->xgth) {
-        xengnttab_close(xendev->xgth);
+        qemu_xen_gnttab_close(xendev->xgth);
         xendev->xgth = NULL;
     }
 
-    if (xendev->watch_list) {
-        watch_list_destroy(xendev->watch_list);
-        xendev->watch_list = NULL;
-    }
-
     if (xendev->xsh) {
-        xs_close(xendev->xsh);
+        qemu_xen_xs_close(xendev->xsh);
         xendev->xsh = NULL;
     }
 
@@ -1290,23 +1012,18 @@ static void xen_device_realize(DeviceState *dev, Error **errp)
 
     trace_xen_device_realize(type, xendev->name);
 
-    xendev->xsh = xs_open(0);
+    xendev->xsh = qemu_xen_xs_open();
     if (!xendev->xsh) {
         error_setg_errno(errp, errno, "failed xs_open");
         goto unrealize;
     }
 
-    xendev->watch_list = watch_list_create(xendev->xsh);
-
-    xendev->xgth = xengnttab_open(NULL, 0);
+    xendev->xgth = qemu_xen_gnttab_open();
     if (!xendev->xgth) {
         error_setg_errno(errp, errno, "failed xengnttab_open");
         goto unrealize;
     }
 
-    xendev->feature_grant_copy =
-        (xengnttab_grant_copy(xendev->xgth, 0, NULL) == 0);
-
     xen_device_backend_create(xendev, errp);
     if (*errp) {
         goto unrealize;
@@ -1317,13 +1034,6 @@ static void xen_device_realize(DeviceState *dev, Error **errp)
         goto unrealize;
     }
 
-    if (xendev_class->realize) {
-        xendev_class->realize(xendev, errp);
-        if (*errp) {
-            goto unrealize;
-        }
-    }
-
     xen_device_backend_printf(xendev, "frontend", "%s",
                               xendev->frontend_path);
     xen_device_backend_printf(xendev, "frontend-id", "%u",
@@ -1342,6 +1052,13 @@ static void xen_device_realize(DeviceState *dev, Error **errp)
         xen_device_frontend_set_state(xendev, XenbusStateInitialising, true);
     }
 
+    if (xendev_class->realize) {
+        xendev_class->realize(xendev, errp);
+        if (*errp) {
+            goto unrealize;
+        }
+    }
+
     xendev->exit.notify = xen_device_exit;
     qemu_add_exit_notifier(&xendev->exit);
     return;
diff --git a/hw/xen/xen-legacy-backend.c b/hw/xen/xen-legacy-backend.c
index afba71f6eb..4ded3cec23 100644
--- a/hw/xen/xen-legacy-backend.c
+++ b/hw/xen/xen-legacy-backend.c
@@ -39,11 +39,10 @@ BusState *xen_sysbus;
 /* ------------------------------------------------------------- */
 
 /* public */
-struct xs_handle *xenstore;
+struct qemu_xs_handle *xenstore;
 const char *xen_protocol;
 
 /* private */
-static bool xen_feature_grant_copy;
 static int debug;
 
 int xenstore_write_be_str(struct XenLegacyDevice *xendev, const char *node,
@@ -113,7 +112,7 @@ void xen_be_set_max_grant_refs(struct XenLegacyDevice *xendev,
 {
     assert(xendev->ops->flags & DEVOPS_FLAG_NEED_GNTDEV);
 
-    if (xengnttab_set_max_grants(xendev->gnttabdev, nr_refs)) {
+    if (qemu_xen_gnttab_set_max_grants(xendev->gnttabdev, nr_refs)) {
         xen_pv_printf(xendev, 0, "xengnttab_set_max_grants failed: %s\n",
                       strerror(errno));
     }
@@ -126,8 +125,8 @@ void *xen_be_map_grant_refs(struct XenLegacyDevice *xendev, uint32_t *refs,
 
     assert(xendev->ops->flags & DEVOPS_FLAG_NEED_GNTDEV);
 
-    ptr = xengnttab_map_domain_grant_refs(xendev->gnttabdev, nr_refs,
-                                          xen_domid, refs, prot);
+    ptr = qemu_xen_gnttab_map_refs(xendev->gnttabdev, nr_refs, xen_domid, refs,
+                                   prot);
     if (!ptr) {
         xen_pv_printf(xendev, 0,
                       "xengnttab_map_domain_grant_refs failed: %s\n",
@@ -138,123 +137,31 @@ void *xen_be_map_grant_refs(struct XenLegacyDevice *xendev, uint32_t *refs,
 }
 
 void xen_be_unmap_grant_refs(struct XenLegacyDevice *xendev, void *ptr,
-                             unsigned int nr_refs)
+                             uint32_t *refs, unsigned int nr_refs)
 {
     assert(xendev->ops->flags & DEVOPS_FLAG_NEED_GNTDEV);
 
-    if (xengnttab_unmap(xendev->gnttabdev, ptr, nr_refs)) {
+    if (qemu_xen_gnttab_unmap(xendev->gnttabdev, ptr, refs, nr_refs)) {
         xen_pv_printf(xendev, 0, "xengnttab_unmap failed: %s\n",
                       strerror(errno));
     }
 }
 
-static int compat_copy_grant_refs(struct XenLegacyDevice *xendev,
-                                  bool to_domain,
-                                  XenGrantCopySegment segs[],
-                                  unsigned int nr_segs)
-{
-    uint32_t *refs = g_new(uint32_t, nr_segs);
-    int prot = to_domain ? PROT_WRITE : PROT_READ;
-    void *pages;
-    unsigned int i;
-
-    for (i = 0; i < nr_segs; i++) {
-        XenGrantCopySegment *seg = &segs[i];
-
-        refs[i] = to_domain ?
-            seg->dest.foreign.ref : seg->source.foreign.ref;
-    }
-
-    pages = xengnttab_map_domain_grant_refs(xendev->gnttabdev, nr_segs,
-                                            xen_domid, refs, prot);
-    if (!pages) {
-        xen_pv_printf(xendev, 0,
-                      "xengnttab_map_domain_grant_refs failed: %s\n",
-                      strerror(errno));
-        g_free(refs);
-        return -1;
-    }
-
-    for (i = 0; i < nr_segs; i++) {
-        XenGrantCopySegment *seg = &segs[i];
-        void *page = pages + (i * XC_PAGE_SIZE);
-
-        if (to_domain) {
-            memcpy(page + seg->dest.foreign.offset, seg->source.virt,
-                   seg->len);
-        } else {
-            memcpy(seg->dest.virt, page + seg->source.foreign.offset,
-                   seg->len);
-        }
-    }
-
-    if (xengnttab_unmap(xendev->gnttabdev, pages, nr_segs)) {
-        xen_pv_printf(xendev, 0, "xengnttab_unmap failed: %s\n",
-                      strerror(errno));
-    }
-
-    g_free(refs);
-    return 0;
-}
-
 int xen_be_copy_grant_refs(struct XenLegacyDevice *xendev,
                            bool to_domain,
                            XenGrantCopySegment segs[],
                            unsigned int nr_segs)
 {
-    xengnttab_grant_copy_segment_t *xengnttab_segs;
-    unsigned int i;
     int rc;
 
     assert(xendev->ops->flags & DEVOPS_FLAG_NEED_GNTDEV);
 
-    if (!xen_feature_grant_copy) {
-        return compat_copy_grant_refs(xendev, to_domain, segs, nr_segs);
-    }
-
-    xengnttab_segs = g_new0(xengnttab_grant_copy_segment_t, nr_segs);
-
-    for (i = 0; i < nr_segs; i++) {
-        XenGrantCopySegment *seg = &segs[i];
-        xengnttab_grant_copy_segment_t *xengnttab_seg = &xengnttab_segs[i];
-
-        if (to_domain) {
-            xengnttab_seg->flags = GNTCOPY_dest_gref;
-            xengnttab_seg->dest.foreign.domid = xen_domid;
-            xengnttab_seg->dest.foreign.ref = seg->dest.foreign.ref;
-            xengnttab_seg->dest.foreign.offset = seg->dest.foreign.offset;
-            xengnttab_seg->source.virt = seg->source.virt;
-        } else {
-            xengnttab_seg->flags = GNTCOPY_source_gref;
-            xengnttab_seg->source.foreign.domid = xen_domid;
-            xengnttab_seg->source.foreign.ref = seg->source.foreign.ref;
-            xengnttab_seg->source.foreign.offset =
-                seg->source.foreign.offset;
-            xengnttab_seg->dest.virt = seg->dest.virt;
-        }
-
-        xengnttab_seg->len = seg->len;
-    }
-
-    rc = xengnttab_grant_copy(xendev->gnttabdev, nr_segs, xengnttab_segs);
-
+    rc = qemu_xen_gnttab_grant_copy(xendev->gnttabdev, to_domain, xen_domid,
+                                    segs, nr_segs, NULL);
     if (rc) {
-        xen_pv_printf(xendev, 0, "xengnttab_copy failed: %s\n",
-                      strerror(errno));
-    }
-
-    for (i = 0; i < nr_segs; i++) {
-        xengnttab_grant_copy_segment_t *xengnttab_seg =
-            &xengnttab_segs[i];
-
-        if (xengnttab_seg->status != GNTST_okay) {
-            xen_pv_printf(xendev, 0, "segment[%u] status: %d\n", i,
-                          xengnttab_seg->status);
-            rc = -1;
-        }
+        xen_pv_printf(xendev, 0, "xengnttab_grant_copy failed: %s\n",
+                      strerror(-rc));
     }
-
-    g_free(xengnttab_segs);
     return rc;
 }
 
@@ -294,13 +201,13 @@ static struct XenLegacyDevice *xen_be_get_xendev(const char *type, int dom,
     xendev->debug      = debug;
     xendev->local_port = -1;
 
-    xendev->evtchndev = xenevtchn_open(NULL, 0);
+    xendev->evtchndev = qemu_xen_evtchn_open();
     if (xendev->evtchndev == NULL) {
         xen_pv_printf(NULL, 0, "can't open evtchn device\n");
         qdev_unplug(DEVICE(xendev), NULL);
         return NULL;
     }
-    qemu_set_cloexec(xenevtchn_fd(xendev->evtchndev));
+    qemu_set_cloexec(qemu_xen_evtchn_fd(xendev->evtchndev));
 
     xen_pv_insert_xendev(xendev);
 
@@ -367,6 +274,25 @@ static void xen_be_frontend_changed(struct XenLegacyDevice *xendev,
     }
 }
 
+static void xenstore_update_fe(void *opaque, const char *watch)
+{
+    struct XenLegacyDevice *xendev = opaque;
+    const char *node;
+    unsigned int len;
+
+    len = strlen(xendev->fe);
+    if (strncmp(xendev->fe, watch, len) != 0) {
+        return;
+    }
+    if (watch[len] != '/') {
+        return;
+    }
+    node = watch + len + 1;
+
+    xen_be_frontend_changed(xendev, node);
+    xen_be_check_state(xendev);
+}
+
 /* ------------------------------------------------------------- */
 /* Check for possible state transitions and perform them.        */
 
@@ -380,7 +306,6 @@ static void xen_be_frontend_changed(struct XenLegacyDevice *xendev,
  */
 static int xen_be_try_setup(struct XenLegacyDevice *xendev)
 {
-    char token[XEN_BUFSIZE];
     int be_state;
 
     if (xenstore_read_be_int(xendev, "state", &be_state) == -1) {
@@ -401,8 +326,9 @@ static int xen_be_try_setup(struct XenLegacyDevice *xendev)
     }
 
     /* setup frontend watch */
-    snprintf(token, sizeof(token), "fe:%p", xendev);
-    if (!xs_watch(xenstore, xendev->fe, token)) {
+    xendev->watch = qemu_xen_xs_watch(xenstore, xendev->fe, xenstore_update_fe,
+                                      xendev);
+    if (!xendev->watch) {
         xen_pv_printf(xendev, 0, "watching frontend path (%s) failed\n",
                       xendev->fe);
         return -1;
@@ -466,7 +392,7 @@ static int xen_be_try_initialise(struct XenLegacyDevice *xendev)
     }
 
     if (xendev->ops->flags & DEVOPS_FLAG_NEED_GNTDEV) {
-        xendev->gnttabdev = xengnttab_open(NULL, 0);
+        xendev->gnttabdev = qemu_xen_gnttab_open();
         if (xendev->gnttabdev == NULL) {
             xen_pv_printf(NULL, 0, "can't open gnttab device\n");
             return -1;
@@ -524,7 +450,7 @@ static void xen_be_disconnect(struct XenLegacyDevice *xendev,
         xendev->ops->disconnect(xendev);
     }
     if (xendev->gnttabdev) {
-        xengnttab_close(xendev->gnttabdev);
+        qemu_xen_gnttab_close(xendev->gnttabdev);
         xendev->gnttabdev = NULL;
     }
     if (xendev->be_state != state) {
@@ -591,46 +517,20 @@ void xen_be_check_state(struct XenLegacyDevice *xendev)
 
 /* ------------------------------------------------------------- */
 
-static int xenstore_scan(const char *type, int dom, struct XenDevOps *ops)
-{
-    struct XenLegacyDevice *xendev;
-    char path[XEN_BUFSIZE], token[XEN_BUFSIZE];
-    char **dev = NULL;
-    unsigned int cdev, j;
-
-    /* setup watch */
-    snprintf(token, sizeof(token), "be:%p:%d:%p", type, dom, ops);
-    snprintf(path, sizeof(path), "backend/%s/%d", type, dom);
-    if (!xs_watch(xenstore, path, token)) {
-        xen_pv_printf(NULL, 0, "xen be: watching backend path (%s) failed\n",
-                      path);
-        return -1;
-    }
-
-    /* look for backends */
-    dev = xs_directory(xenstore, 0, path, &cdev);
-    if (!dev) {
-        return 0;
-    }
-    for (j = 0; j < cdev; j++) {
-        xendev = xen_be_get_xendev(type, dom, atoi(dev[j]), ops);
-        if (xendev == NULL) {
-            continue;
-        }
-        xen_be_check_state(xendev);
-    }
-    free(dev);
-    return 0;
-}
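+/* State handed to the backend watch callback below */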
+struct xenstore_be {
+    const char *type;
+    int dom;
+    struct XenDevOps *ops;
+};
 
-void xenstore_update_be(char *watch, char *type, int dom,
-                        struct XenDevOps *ops)
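+/*
+ * Watch callback for backend/<type>/<dom>: extract the device number
+ * from the path and create, update or remove the matching device.
+ */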
+static void xenstore_update_be(void *opaque, const char *watch)
 {
+    struct xenstore_be *be = opaque;
     struct XenLegacyDevice *xendev;
     char path[XEN_BUFSIZE], *bepath;
     unsigned int len, dev;
 
-    len = snprintf(path, sizeof(path), "backend/%s/%d", type, dom);
+    len = snprintf(path, sizeof(path), "backend/%s/%d", be->type, be->dom);
     if (strncmp(path, watch, len) != 0) {
         return;
     }
@@ -644,9 +544,9 @@ void xenstore_update_be(char *watch, char *type, int dom,
         return;
     }
 
-    xendev = xen_be_get_xendev(type, dom, dev, ops);
+    xendev = xen_be_get_xendev(be->type, be->dom, dev, be->ops);
     if (xendev != NULL) {
-        bepath = xs_read(xenstore, 0, xendev->be, &len);
+        bepath = qemu_xen_xs_read(xenstore, 0, xendev->be, &len);
         if (bepath == NULL) {
             xen_pv_del_xendev(xendev);
         } else {
@@ -657,23 +557,41 @@ void xenstore_update_be(char *watch, char *type, int dom,
     }
 }
 
-void xenstore_update_fe(char *watch, struct XenLegacyDevice *xendev)
+static int xenstore_scan(const char *type, int dom, struct XenDevOps *ops)
 {
-    char *node;
-    unsigned int len;
+    struct XenLegacyDevice *xendev;
+    char path[XEN_BUFSIZE];
+    struct xenstore_be *be = g_new0(struct xenstore_be, 1);
+    char **dev = NULL;
+    unsigned int cdev, j;
 
-    len = strlen(xendev->fe);
-    if (strncmp(xendev->fe, watch, len) != 0) {
-        return;
-    }
-    if (watch[len] != '/') {
-        return;
+    /* setup watch */
+    be->type = type;
+    be->dom = dom;
+    be->ops = ops;
+    snprintf(path, sizeof(path), "backend/%s/%d", type, dom);
+    if (!qemu_xen_xs_watch(xenstore, path, xenstore_update_be, be)) {
+        xen_pv_printf(NULL, 0, "xen be: watching backend path (%s) failed\n",
+                      path);
+        return -1;
     }
-    node = watch + len + 1;
 
-    xen_be_frontend_changed(xendev, node);
-    xen_be_check_state(xendev);
+    /* look for backends */
+    dev = qemu_xen_xs_directory(xenstore, 0, path, &cdev);
+    if (!dev) {
+        return 0;
+    }
+    for (j = 0; j < cdev; j++) {
+        xendev = xen_be_get_xendev(type, dom, atoi(dev[j]), ops);
+        if (xendev == NULL) {
+            continue;
+        }
+        xen_be_check_state(xendev);
+    }
+    free(dev);
+    return 0;
 }
+
 /* -------------------------------------------------------------------- */
 
 static void xen_set_dynamic_sysbus(void)
@@ -687,29 +605,17 @@ static void xen_set_dynamic_sysbus(void)
 
 void xen_be_init(void)
 {
-    xengnttab_handle *gnttabdev;
-
-    xenstore = xs_daemon_open();
+    xenstore = qemu_xen_xs_open();
     if (!xenstore) {
         xen_pv_printf(NULL, 0, "can't connect to xenstored\n");
         exit(1);
     }
 
-    qemu_set_fd_handler(xs_fileno(xenstore), xenstore_update, NULL, NULL);
-
-    if (xen_xc == NULL || xen_fmem == NULL) {
+    if (xen_evtchn_ops == NULL || xen_gnttab_ops == NULL) {
         xen_pv_printf(NULL, 0, "Xen operations not set up\n");
         exit(1);
     }
 
-    gnttabdev = xengnttab_open(NULL, 0);
-    if (gnttabdev != NULL) {
-        if (xengnttab_grant_copy(gnttabdev, 0, NULL) == 0) {
-            xen_feature_grant_copy = true;
-        }
-        xengnttab_close(gnttabdev);
-    }
-
     xen_sysdev = qdev_new(TYPE_XENSYSDEV);
     sysbus_realize_and_unref(SYS_BUS_DEVICE(xen_sysdev), &error_fatal);
     xen_sysbus = qbus_new(TYPE_XENSYSBUS, xen_sysdev, "xen-sysbus");
@@ -751,14 +657,14 @@ int xen_be_bind_evtchn(struct XenLegacyDevice *xendev)
     if (xendev->local_port != -1) {
         return 0;
     }
-    xendev->local_port = xenevtchn_bind_interdomain
+    xendev->local_port = qemu_xen_evtchn_bind_interdomain
         (xendev->evtchndev, xendev->dom, xendev->remote_port);
     if (xendev->local_port == -1) {
         xen_pv_printf(xendev, 0, "xenevtchn_bind_interdomain failed\n");
         return -1;
     }
     xen_pv_printf(xendev, 2, "bind evtchn port %d\n", xendev->local_port);
-    qemu_set_fd_handler(xenevtchn_fd(xendev->evtchndev),
+    qemu_set_fd_handler(qemu_xen_evtchn_fd(xendev->evtchndev),
                         xen_pv_evtchn_event, NULL, xendev);
     return 0;
 }
diff --git a/hw/xen/xen-operations.c b/hw/xen/xen-operations.c
new file mode 100644
index 0000000000..4b78fbf4bd
--- /dev/null
+++ b/hw/xen/xen-operations.c
@@ -0,0 +1,478 @@
+/*
+ * QEMU Xen backend support: Operations for true Xen
+ *
+ * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Authors: David Woodhouse <dwmw2@infradead.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/uuid.h"
+#include "qapi/error.h"
+
+#include "hw/xen/xen_native.h"
+#include "hw/xen/xen_backend_ops.h"
+
+/*
+ * If we have new enough libxenctrl then we do not want/need these compat
+ * interfaces, despite what the user supplied cflags might say. They
+ * must be undefined before including xenctrl.h
+ */
+#undef XC_WANT_COMPAT_EVTCHN_API
+#undef XC_WANT_COMPAT_GNTTAB_API
+#undef XC_WANT_COMPAT_MAP_FOREIGN_API
+
+#include <xenctrl.h>
+
+/*
+ * We don't support Xen prior to 4.2.0.
+ */
+
+/* Xen 4.2 through 4.6 */
+#if CONFIG_XEN_CTRL_INTERFACE_VERSION < 40701
+
+typedef xc_evtchn xenevtchn_handle;
+typedef evtchn_port_or_error_t xenevtchn_port_or_error_t;
+
+#define xenevtchn_open(l, f) xc_evtchn_open(l, f)
+#define xenevtchn_close(h) xc_evtchn_close(h)
+#define xenevtchn_fd(h) xc_evtchn_fd(h)
+#define xenevtchn_pending(h) xc_evtchn_pending(h)
+#define xenevtchn_notify(h, p) xc_evtchn_notify(h, p)
+#define xenevtchn_bind_interdomain(h, d, p) xc_evtchn_bind_interdomain(h, d, p)
+#define xenevtchn_unmask(h, p) xc_evtchn_unmask(h, p)
+#define xenevtchn_unbind(h, p) xc_evtchn_unbind(h, p)
+
+typedef xc_gnttab xengnttab_handle;
+
+#define xengnttab_open(l, f) xc_gnttab_open(l, f)
+#define xengnttab_close(h) xc_gnttab_close(h)
+#define xengnttab_set_max_grants(h, n) xc_gnttab_set_max_grants(h, n)
+#define xengnttab_map_grant_ref(h, d, r, p) xc_gnttab_map_grant_ref(h, d, r, p)
+#define xengnttab_unmap(h, a, n) xc_gnttab_munmap(h, a, n)
+#define xengnttab_map_grant_refs(h, c, d, r, p) \
+    xc_gnttab_map_grant_refs(h, c, d, r, p)
+#define xengnttab_map_domain_grant_refs(h, c, d, r, p) \
+    xc_gnttab_map_domain_grant_refs(h, c, d, r, p)
+
+typedef xc_interface xenforeignmemory_handle;
+
+#else /* CONFIG_XEN_CTRL_INTERFACE_VERSION >= 40701 */
+
+#include <xenevtchn.h>
+#include <xengnttab.h>
+#include <xenforeignmemory.h>
+
+#endif
+
+/* Xen before 4.8 */
+
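+/*
+ * Without a native grant-copy operation, fall back to mapping the
+ * foreign grant references and copying each segment with memcpy().
+ */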
+static int libxengnttab_fallback_grant_copy(xengnttab_handle *xgt,
+                                            bool to_domain, uint32_t domid,
+                                            XenGrantCopySegment segs[],
+                                            unsigned int nr_segs, Error **errp)
+{
+    uint32_t *refs = g_new(uint32_t, nr_segs);
+    int prot = to_domain ? PROT_WRITE : PROT_READ;
+    void *map;
+    unsigned int i;
+    int rc = 0;
+
+    for (i = 0; i < nr_segs; i++) {
+        XenGrantCopySegment *seg = &segs[i];
+
+        refs[i] = to_domain ? seg->dest.foreign.ref :
+            seg->source.foreign.ref;
+    }
+    map = xengnttab_map_domain_grant_refs(xgt, nr_segs, domid, refs, prot);
+    if (!map) {
+        if (errp) {
+            error_setg_errno(errp, errno,
+                             "xengnttab_map_domain_grant_refs failed");
+        }
+        rc = -errno;
+        goto done;
+    }
+
+    for (i = 0; i < nr_segs; i++) {
+        XenGrantCopySegment *seg = &segs[i];
+        void *page = map + (i * XEN_PAGE_SIZE);
+
+        if (to_domain) {
+            memcpy(page + seg->dest.foreign.offset, seg->source.virt,
+                   seg->len);
+        } else {
+            memcpy(seg->dest.virt, page + seg->source.foreign.offset,
+                   seg->len);
+        }
+    }
+
+    if (xengnttab_unmap(xgt, map, nr_segs)) {
+        if (errp) {
+            error_setg_errno(errp, errno, "xengnttab_unmap failed");
+        }
+        rc = -errno;
+    }
+
+done:
+    g_free(refs);
+    return rc;
+}
+
+#if CONFIG_XEN_CTRL_INTERFACE_VERSION >= 40800
+
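+/*
+ * Xen 4.8 and later provide xengnttab_grant_copy(); translate the
+ * generic segments into the library's format and check each status.
+ */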
+static int libxengnttab_backend_grant_copy(xengnttab_handle *xgt,
+                                           bool to_domain, uint32_t domid,
+                                           XenGrantCopySegment *segs,
+                                           uint32_t nr_segs, Error **errp)
+{
+    xengnttab_grant_copy_segment_t *xengnttab_segs;
+    unsigned int i;
+    int rc;
+
+    xengnttab_segs = g_new0(xengnttab_grant_copy_segment_t, nr_segs);
+
+    for (i = 0; i < nr_segs; i++) {
+        XenGrantCopySegment *seg = &segs[i];
+        xengnttab_grant_copy_segment_t *xengnttab_seg = &xengnttab_segs[i];
+
+        if (to_domain) {
+            xengnttab_seg->flags = GNTCOPY_dest_gref;
+            xengnttab_seg->dest.foreign.domid = domid;
+            xengnttab_seg->dest.foreign.ref = seg->dest.foreign.ref;
+            xengnttab_seg->dest.foreign.offset = seg->dest.foreign.offset;
+            xengnttab_seg->source.virt = seg->source.virt;
+        } else {
+            xengnttab_seg->flags = GNTCOPY_source_gref;
+            xengnttab_seg->source.foreign.domid = domid;
+            xengnttab_seg->source.foreign.ref = seg->source.foreign.ref;
+            xengnttab_seg->source.foreign.offset =
+                seg->source.foreign.offset;
+            xengnttab_seg->dest.virt = seg->dest.virt;
+        }
+
+        xengnttab_seg->len = seg->len;
+    }
+
+    if (xengnttab_grant_copy(xgt, nr_segs, xengnttab_segs)) {
+        if (errp) {
+            error_setg_errno(errp, errno, "xengnttab_grant_copy failed");
+        }
+        rc = -errno;
+        goto done;
+    }
+
+    rc = 0;
+    for (i = 0; i < nr_segs; i++) {
+        xengnttab_grant_copy_segment_t *xengnttab_seg = &xengnttab_segs[i];
+
+        if (xengnttab_seg->status != GNTST_okay) {
+            if (errp) {
+                error_setg(errp, "xengnttab_grant_copy seg[%u] failed", i);
+            }
+            rc = -EIO;
+            break;
+        }
+    }
+
+done:
+    g_free(xengnttab_segs);
+    return rc;
+}
+#endif
+
+static xenevtchn_handle *libxenevtchn_backend_open(void)
+{
+    return xenevtchn_open(NULL, 0);
+}
+
+struct evtchn_backend_ops libxenevtchn_backend_ops = {
+    .open = libxenevtchn_backend_open,
+    .close = xenevtchn_close,
+    .bind_interdomain = xenevtchn_bind_interdomain,
+    .unbind = xenevtchn_unbind,
+    .get_fd = xenevtchn_fd,
+    .notify = xenevtchn_notify,
+    .unmask = xenevtchn_unmask,
+    .pending = xenevtchn_pending,
+};
+
+static xengnttab_handle *libxengnttab_backend_open(void)
+{
+    return xengnttab_open(NULL, 0);
+}
+
+static int libxengnttab_backend_unmap(xengnttab_handle *xgt,
+                                      void *start_address, uint32_t *refs,
+                                      uint32_t count)
+{
+    return xengnttab_unmap(xgt, start_address, count);
+}
+
+static struct gnttab_backend_ops libxengnttab_backend_ops = {
+    .features = XEN_GNTTAB_OP_FEATURE_MAP_MULTIPLE,
+    .open = libxengnttab_backend_open,
+    .close = xengnttab_close,
+#if CONFIG_XEN_CTRL_INTERFACE_VERSION < 40800
+    .grant_copy = libxengnttab_fallback_grant_copy,
+#else
+    .grant_copy = libxengnttab_backend_grant_copy,
+#endif
+    .set_max_grants = xengnttab_set_max_grants,
+    .map_refs = xengnttab_map_domain_grant_refs,
+    .unmap = libxengnttab_backend_unmap,
+};
+
+#if CONFIG_XEN_CTRL_INTERFACE_VERSION < 40701
+
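+/*
+ * Pre-4.7.1 Xen offers only the xc_map_foreign_*() calls, which use
+ * the global xen_xc handle instead of a xenforeignmemory handle.
+ */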
+static void *libxenforeignmem_backend_map(uint32_t dom, void *addr, int prot,
+                                          size_t pages, xen_pfn_t *pfns,
+                                          int *errs)
+{
+    if (errs) {
+        return xc_map_foreign_bulk(xen_xc, dom, prot, pfns, errs, pages);
+    } else {
+        return xc_map_foreign_pages(xen_xc, dom, prot, pfns, pages);
+    }
+}
+
+static int libxenforeignmem_backend_unmap(void *addr, size_t pages)
+{
+    return munmap(addr, pages * XC_PAGE_SIZE);
+}
+
+#else /* CONFIG_XEN_CTRL_INTERFACE_VERSION >= 40701 */
+
+static void *libxenforeignmem_backend_map(uint32_t dom, void *addr, int prot,
+                                          size_t pages, xen_pfn_t *pfns,
+                                          int *errs)
+{
+    return xenforeignmemory_map2(xen_fmem, dom, addr, prot, 0, pages, pfns,
+                                 errs);
+}
+
+static int libxenforeignmem_backend_unmap(void *addr, size_t pages)
+{
+    return xenforeignmemory_unmap(xen_fmem, addr, pages);
+}
+
+#endif
+
+struct foreignmem_backend_ops libxenforeignmem_backend_ops = {
+    .map = libxenforeignmem_backend_map,
+    .unmap = libxenforeignmem_backend_unmap,
+};
+
+struct qemu_xs_handle {
+    struct xs_handle *xsh;
+    NotifierList notifiers;
+};
+
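+/*
+ * Drain all pending watch events from the shared xs handle; each
+ * registered watch decides by token whether an event is for it.
+ */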
+static void watch_event(void *opaque)
+{
+    struct qemu_xs_handle *h = opaque;
+
+    for (;;) {
+        char **v = xs_check_watch(h->xsh);
+
+        if (!v) {
+            break;
+        }
+
+        notifier_list_notify(&h->notifiers, v);
+        free(v);
+    }
+}
+
+static struct qemu_xs_handle *libxenstore_open(void)
+{
+    struct xs_handle *xsh = xs_open(0);
+    struct qemu_xs_handle *h;
+
+    if (!xsh) {
+        return NULL;
+    }
+
+    h = g_new0(struct qemu_xs_handle, 1);
+    h->xsh = xsh;
+
+    notifier_list_init(&h->notifiers);
+    qemu_set_fd_handler(xs_fileno(h->xsh), watch_event, NULL, h);
+
+    return h;
+}
+
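+/* All watches must have been removed before the handle is closed. */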
+static void libxenstore_close(struct qemu_xs_handle *h)
+{
+    g_assert(notifier_list_empty(&h->notifiers));
+    qemu_set_fd_handler(xs_fileno(h->xsh), NULL, NULL, NULL);
+    xs_close(h->xsh);
+    g_free(h);
+}
+
+static char *libxenstore_get_domain_path(struct qemu_xs_handle *h,
+                                         unsigned int domid)
+{
+    return xs_get_domain_path(h->xsh, domid);
+}
+
+static char **libxenstore_directory(struct qemu_xs_handle *h,
+                                    xs_transaction_t t, const char *path,
+                                    unsigned int *num)
+{
+    return xs_directory(h->xsh, t, path, num);
+}
+
+static void *libxenstore_read(struct qemu_xs_handle *h, xs_transaction_t t,
+                              const char *path, unsigned int *len)
+{
+    return xs_read(h->xsh, t, path, len);
+}
+
+static bool libxenstore_write(struct qemu_xs_handle *h, xs_transaction_t t,
+                              const char *path, const void *data,
+                              unsigned int len)
+{
+    return xs_write(h->xsh, t, path, data, len);
+}
+
+static bool libxenstore_create(struct qemu_xs_handle *h, xs_transaction_t t,
+                               unsigned int owner, unsigned int domid,
+                               unsigned int perms, const char *path)
+{
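+    /*
+     * The first entry names the node's owner and sets the default for
+     * all other domains (no access); the second grants @perms to @domid.
+     */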
+    struct xs_permissions perms_list[] = {
+        {
+            .id    = owner,
+            .perms = XS_PERM_NONE,
+        },
+        {
+            .id    = domid,
+            .perms = perms,
+        },
+    };
+
+    if (!xs_mkdir(h->xsh, t, path)) {
+        return false;
+    }
+
+    return xs_set_permissions(h->xsh, t, path, perms_list,
+                              ARRAY_SIZE(perms_list));
+}
+
+static bool libxenstore_destroy(struct qemu_xs_handle *h, xs_transaction_t t,
+                                const char *path)
+{
+    return xs_rm(h->xsh, t, path);
+}
+
+struct qemu_xs_watch {
+    char *path;
+    char *token;
+    xs_watch_fn fn;
+    void *opaque;
+    Notifier notifier;
+};
+
+static void watch_notify(Notifier *n, void *data)
+{
+    struct qemu_xs_watch *w = container_of(n, struct qemu_xs_watch, notifier);
+    const char **v = data;
+
+    if (!strcmp(w->token, v[XS_WATCH_TOKEN])) {
+        w->fn(w->opaque, v[XS_WATCH_PATH]);
+    }
+}
+
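+/*
+ * Each watch uses a freshly generated UUID as its xenstore token, so
+ * watch_notify() can demultiplex events on the shared handle.
+ */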
+static struct qemu_xs_watch *new_watch(const char *path, xs_watch_fn fn,
+                                       void *opaque)
+{
+    struct qemu_xs_watch *w = g_new0(struct qemu_xs_watch, 1);
+    QemuUUID uuid;
+
+    qemu_uuid_generate(&uuid);
+
+    w->token = qemu_uuid_unparse_strdup(&uuid);
+    w->path = g_strdup(path);
+    w->fn = fn;
+    w->opaque = opaque;
+    w->notifier.notify = watch_notify;
+
+    return w;
+}
+
+static void free_watch(struct qemu_xs_watch *w)
+{
+    g_free(w->token);
+    g_free(w->path);
+
+    g_free(w);
+}
+
+static struct qemu_xs_watch *libxenstore_watch(struct qemu_xs_handle *h,
+                                               const char *path, xs_watch_fn fn,
+                                               void *opaque)
+{
+    struct qemu_xs_watch *w = new_watch(path, fn, opaque);
+
+    notifier_list_add(&h->notifiers, &w->notifier);
+
+    if (!xs_watch(h->xsh, path, w->token)) {
+        notifier_remove(&w->notifier);
+        free_watch(w);
+        return NULL;
+    }
+
+    return w;
+}
+
+static void libxenstore_unwatch(struct qemu_xs_handle *h,
+                                struct qemu_xs_watch *w)
+{
+    xs_unwatch(h->xsh, w->path, w->token);
+    notifier_remove(&w->notifier);
+    free_watch(w);
+}
+
+static xs_transaction_t libxenstore_transaction_start(struct qemu_xs_handle *h)
+{
+    return xs_transaction_start(h->xsh);
+}
+
+static bool libxenstore_transaction_end(struct qemu_xs_handle *h,
+                                        xs_transaction_t t, bool abort)
+{
+    return xs_transaction_end(h->xsh, t, abort);
+}
+
+struct xenstore_backend_ops libxenstore_backend_ops = {
+    .open = libxenstore_open,
+    .close = libxenstore_close,
+    .get_domain_path = libxenstore_get_domain_path,
+    .directory = libxenstore_directory,
+    .read = libxenstore_read,
+    .write = libxenstore_write,
+    .create = libxenstore_create,
+    .destroy = libxenstore_destroy,
+    .watch = libxenstore_watch,
+    .unwatch = libxenstore_unwatch,
+    .transaction_start = libxenstore_transaction_start,
+    .transaction_end = libxenstore_transaction_end,
+};
+
+void setup_xen_backend_ops(void)
+{
+#if CONFIG_XEN_CTRL_INTERFACE_VERSION >= 40800
+    xengnttab_handle *xgt = xengnttab_open(NULL, 0);
+
+    if (xgt) {
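+        /*
+         * A zero-length copy probes whether the grant-copy operation
+         * is really available at runtime; if it is, replace the
+         * map/memcpy/unmap fallback with it.
+         */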
+        if (xengnttab_grant_copy(xgt, 0, NULL) == 0) {
+            libxengnttab_backend_ops.grant_copy =
+                libxengnttab_backend_grant_copy;
+        }
+        xengnttab_close(xgt);
+    }
+#endif
+    xen_evtchn_ops = &libxenevtchn_backend_ops;
+    xen_gnttab_ops = &libxengnttab_backend_ops;
+    xen_foreignmem_ops = &libxenforeignmem_backend_ops;
+    xen_xenstore_ops = &libxenstore_backend_ops;
+}
diff --git a/hw/xen/xen_devconfig.c b/hw/xen/xen_devconfig.c
index 46ee4a7f02..9b7304e544 100644
--- a/hw/xen/xen_devconfig.c
+++ b/hw/xen/xen_devconfig.c
@@ -11,11 +11,11 @@ static int xen_config_dev_dirs(const char *ftype, const char *btype, int vdev,
 {
     char *dom;
 
-    dom = xs_get_domain_path(xenstore, xen_domid);
+    dom = qemu_xen_xs_get_domain_path(xenstore, xen_domid);
     snprintf(fe, len, "%s/device/%s/%d", dom, ftype, vdev);
     free(dom);
 
-    dom = xs_get_domain_path(xenstore, 0);
+    dom = qemu_xen_xs_get_domain_path(xenstore, 0);
     snprintf(be, len, "%s/backend/%s/%d/%d", dom, btype, xen_domid, vdev);
     free(dom);
 
diff --git a/hw/xen/xen_pt.c b/hw/xen/xen_pt.c
index 8db0532632..2d33d178ad 100644
--- a/hw/xen/xen_pt.c
+++ b/hw/xen/xen_pt.c
@@ -57,11 +57,12 @@
 #include <sys/ioctl.h>
 
 #include "hw/pci/pci.h"
+#include "hw/pci/pci_bus.h"
 #include "hw/qdev-properties.h"
 #include "hw/qdev-properties-system.h"
+#include "xen_pt.h"
 #include "hw/xen/xen.h"
 #include "hw/xen/xen-legacy-backend.h"
-#include "xen_pt.h"
 #include "qemu/range.h"
 
 static bool has_igd_gfx_passthru;
@@ -780,15 +781,6 @@ static void xen_pt_realize(PCIDevice *d, Error **errp)
                s->hostaddr.bus, s->hostaddr.slot, s->hostaddr.function,
                s->dev.devfn);
 
-    xen_host_pci_device_get(&s->real_device,
-                            s->hostaddr.domain, s->hostaddr.bus,
-                            s->hostaddr.slot, s->hostaddr.function,
-                            errp);
-    if (*errp) {
-        error_append_hint(errp, "Failed to \"open\" the real pci device");
-        return;
-    }
-
     s->is_virtfn = s->real_device.is_virtfn;
     if (s->is_virtfn) {
         XEN_PT_LOG(d, "%04x:%02x:%02x.%d is a SR-IOV Virtual Function\n",
@@ -803,8 +795,10 @@ static void xen_pt_realize(PCIDevice *d, Error **errp)
     s->io_listener = xen_pt_io_listener;
 
     /* Setup VGA bios for passthrough GFX */
-    if ((s->real_device.domain == 0) && (s->real_device.bus == 0) &&
-        (s->real_device.dev == 2) && (s->real_device.func == 0)) {
+    if ((s->real_device.domain == XEN_PCI_IGD_DOMAIN) &&
+        (s->real_device.bus == XEN_PCI_IGD_BUS) &&
+        (s->real_device.dev == XEN_PCI_IGD_DEV) &&
+        (s->real_device.func == XEN_PCI_IGD_FN)) {
         if (!is_igd_vga_passthrough(&s->real_device)) {
             error_setg(errp, "Need to enable igd-passthru if you're trying"
                     " to passthrough IGD GFX");
@@ -950,11 +944,58 @@ static void xen_pci_passthrough_instance_init(Object *obj)
     PCI_DEVICE(obj)->cap_present |= QEMU_PCI_CAP_EXPRESS;
 }
 
+void xen_igd_reserve_slot(PCIBus *pci_bus)
+{
+    if (!xen_igd_gfx_pt_enabled()) {
+        return;
+    }
+
+    XEN_PT_LOG(0, "Reserving PCI slot 2 for IGD\n");
+    pci_bus->slot_reserved_mask |= XEN_PCI_IGD_SLOT_MASK;
+}
+
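+/*
+ * Realize wrapper: if slot 2 is still reserved (by xen_igd_reserve_slot)
+ * and the device being realized is the Intel IGD, release the
+ * reservation so the IGD can take its required slot.
+ */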
+static void xen_igd_clear_slot(DeviceState *qdev, Error **errp)
+{
+    ERRP_GUARD();
+    PCIDevice *pci_dev = (PCIDevice *)qdev;
+    XenPCIPassthroughState *s = XEN_PT_DEVICE(pci_dev);
+    XenPTDeviceClass *xpdc = XEN_PT_DEVICE_GET_CLASS(s);
+    PCIBus *pci_bus = pci_get_bus(pci_dev);
+
+    xen_host_pci_device_get(&s->real_device,
+                            s->hostaddr.domain, s->hostaddr.bus,
+                            s->hostaddr.slot, s->hostaddr.function,
+                            errp);
+    if (*errp) {
+        error_append_hint(errp, "Failed to \"open\" the real pci device");
+        return;
+    }
+
+    if (!(pci_bus->slot_reserved_mask & XEN_PCI_IGD_SLOT_MASK)) {
+        xpdc->pci_qdev_realize(qdev, errp);
+        return;
+    }
+
+    if (is_igd_vga_passthrough(&s->real_device) &&
+        s->real_device.domain == XEN_PCI_IGD_DOMAIN &&
+        s->real_device.bus == XEN_PCI_IGD_BUS &&
+        s->real_device.dev == XEN_PCI_IGD_DEV &&
+        s->real_device.func == XEN_PCI_IGD_FN &&
+        s->real_device.vendor_id == PCI_VENDOR_ID_INTEL) {
+        pci_bus->slot_reserved_mask &= ~XEN_PCI_IGD_SLOT_MASK;
+        XEN_PT_LOG(pci_dev, "Intel IGD found, using slot 2\n");
+    }
+    xpdc->pci_qdev_realize(qdev, errp);
+}
+
 static void xen_pci_passthrough_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
     PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
 
+    XenPTDeviceClass *xpdc = XEN_PT_DEVICE_CLASS(klass);
+    xpdc->pci_qdev_realize = dc->realize;
+    dc->realize = xen_igd_clear_slot;
     k->realize = xen_pt_realize;
     k->exit = xen_pt_unregister_device;
     k->config_read = xen_pt_pci_read_config;
@@ -977,6 +1018,7 @@ static const TypeInfo xen_pci_passthrough_info = {
     .instance_size = sizeof(XenPCIPassthroughState),
     .instance_finalize = xen_pci_passthrough_finalize,
     .class_init = xen_pci_passthrough_class_init,
+    .class_size = sizeof(XenPTDeviceClass),
     .instance_init = xen_pci_passthrough_instance_init,
     .interfaces = (InterfaceInfo[]) {
         { INTERFACE_CONVENTIONAL_PCI_DEVICE },
diff --git a/hw/xen/xen_pt.h b/hw/xen/xen_pt.h
index cf10fc7bbf..b20744f7c7 100644
--- a/hw/xen/xen_pt.h
+++ b/hw/xen/xen_pt.h
@@ -1,7 +1,7 @@
 #ifndef XEN_PT_H
 #define XEN_PT_H
 
-#include "hw/xen/xen_common.h"
+#include "hw/xen/xen_native.h"
 #include "xen-host-pci-device.h"
 #include "qom/object.h"
 
@@ -40,7 +40,20 @@ typedef struct XenPTReg XenPTReg;
 #define TYPE_XEN_PT_DEVICE "xen-pci-passthrough"
 OBJECT_DECLARE_SIMPLE_TYPE(XenPCIPassthroughState, XEN_PT_DEVICE)
 
+#define XEN_PT_DEVICE_CLASS(klass) \
+    OBJECT_CLASS_CHECK(XenPTDeviceClass, klass, TYPE_XEN_PT_DEVICE)
+#define XEN_PT_DEVICE_GET_CLASS(obj) \
+    OBJECT_GET_CLASS(XenPTDeviceClass, obj, TYPE_XEN_PT_DEVICE)
+
+typedef void (*XenPTQdevRealize)(DeviceState *qdev, Error **errp);
+
+typedef struct XenPTDeviceClass {
+    PCIDeviceClass parent_class;
+    XenPTQdevRealize pci_qdev_realize;
+} XenPTDeviceClass;
+
 uint32_t igd_read_opregion(XenPCIPassthroughState *s);
+void xen_igd_reserve_slot(PCIBus *pci_bus);
 void igd_write_opregion(XenPCIPassthroughState *s, uint32_t val);
 void xen_igd_passthrough_isa_bridge_create(XenPCIPassthroughState *s,
                                            XenHostPCIDevice *dev);
@@ -75,6 +88,13 @@ typedef int (*xen_pt_conf_byte_read)
 
 #define XEN_PCI_INTEL_OPREGION 0xfc
 
+#define XEN_PCI_IGD_DOMAIN 0
+#define XEN_PCI_IGD_BUS 0
+#define XEN_PCI_IGD_DEV 2
+#define XEN_PCI_IGD_FN 0
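+/* Bit in PCIBus::slot_reserved_mask corresponding to the IGD slot. */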
+#define XEN_PCI_IGD_SLOT_MASK \
+    (1UL << PCI_SLOT(PCI_DEVFN(XEN_PCI_IGD_DEV, XEN_PCI_IGD_FN)))
+
 typedef enum {
     XEN_PT_GRP_TYPE_HARDWIRED = 0,  /* 0 Hardwired reg group */
     XEN_PT_GRP_TYPE_EMU,            /* emul reg group */
diff --git a/hw/xen/xen_pt_config_init.c b/hw/xen/xen_pt_config_init.c
index cde898b744..2b8680b112 100644
--- a/hw/xen/xen_pt_config_init.c
+++ b/hw/xen/xen_pt_config_init.c
@@ -15,8 +15,8 @@
 #include "qemu/osdep.h"
 #include "qapi/error.h"
 #include "qemu/timer.h"
-#include "hw/xen/xen-legacy-backend.h"
 #include "xen_pt.h"
+#include "hw/xen/xen-legacy-backend.h"
 
 #define XEN_PT_MERGE_VALUE(value, data, val_mask) \
     (((value) & (val_mask)) | ((data) & ~(val_mask)))
@@ -1924,7 +1924,7 @@ static void xen_pt_config_reg_init(XenPCIPassthroughState *s,
     if (reg->init) {
         uint32_t host_mask, size_mask;
         unsigned int offset;
-        uint32_t val;
+        uint32_t val = 0;
 
         /* initialize emulate register */
         rc = reg->init(s, reg_entry->reg,
diff --git a/hw/xen/xen_pt_graphics.c b/hw/xen/xen_pt_graphics.c
index f303f67c9c..0aed3bb6fd 100644
--- a/hw/xen/xen_pt_graphics.c
+++ b/hw/xen/xen_pt_graphics.c
@@ -5,7 +5,6 @@
 #include "qapi/error.h"
 #include "xen_pt.h"
 #include "xen-host-pci-device.h"
-#include "hw/xen/xen-legacy-backend.h"
 
 static unsigned long igd_guest_opregion;
 static unsigned long igd_host_opregion;
diff --git a/hw/xen/xen_pt_msi.c b/hw/xen/xen_pt_msi.c
index b71563f98a..09cca4eecb 100644
--- a/hw/xen/xen_pt_msi.c
+++ b/hw/xen/xen_pt_msi.c
@@ -11,9 +11,9 @@
 
 #include "qemu/osdep.h"
 
-#include "hw/xen/xen-legacy-backend.h"
-#include "xen_pt.h"
 #include "hw/i386/apic-msidef.h"
+#include "xen_pt.h"
+#include "hw/xen/xen-legacy-backend.h"
 
 
 #define XEN_PT_AUTO_ASSIGN -1
diff --git a/hw/xen/xen_pt_stub.c b/hw/xen/xen_pt_stub.c
index 2d8cac8d54..5c108446a8 100644
--- a/hw/xen/xen_pt_stub.c
+++ b/hw/xen/xen_pt_stub.c
@@ -20,3 +20,7 @@ void xen_igd_gfx_pt_set(bool value, Error **errp)
         error_setg(errp, "Xen PCI passthrough support not built in");
     }
 }
+
+void xen_igd_reserve_slot(PCIBus *pci_bus)
+{
+}
diff --git a/hw/xen/xen_pvdev.c b/hw/xen/xen_pvdev.c
index 1a5177b354..be1504b82c 100644
--- a/hw/xen/xen_pvdev.c
+++ b/hw/xen/xen_pvdev.c
@@ -54,31 +54,17 @@ void xen_config_cleanup(void)
     struct xs_dirs *d;
 
     QTAILQ_FOREACH(d, &xs_cleanup, list) {
-        xs_rm(xenstore, 0, d->xs_dir);
+        qemu_xen_xs_destroy(xenstore, 0, d->xs_dir);
     }
 }
 
 int xenstore_mkdir(char *path, int p)
 {
-    struct xs_permissions perms[2] = {
-        {
-            .id    = 0, /* set owner: dom0 */
-        }, {
-            .id    = xen_domid,
-            .perms = p,
-        }
-    };
-
-    if (!xs_mkdir(xenstore, 0, path)) {
+    if (!qemu_xen_xs_create(xenstore, 0, 0, xen_domid, p, path)) {
         xen_pv_printf(NULL, 0, "xs_mkdir %s: failed\n", path);
         return -1;
     }
     xenstore_cleanup_dir(g_strdup(path));
-
-    if (!xs_set_permissions(xenstore, 0, path, perms, 2)) {
-        xen_pv_printf(NULL, 0, "xs_set_permissions %s: failed\n", path);
-        return -1;
-    }
     return 0;
 }
 
@@ -87,7 +73,7 @@ int xenstore_write_str(const char *base, const char *node, const char *val)
     char abspath[XEN_BUFSIZE];
 
     snprintf(abspath, sizeof(abspath), "%s/%s", base, node);
-    if (!xs_write(xenstore, 0, abspath, val, strlen(val))) {
+    if (!qemu_xen_xs_write(xenstore, 0, abspath, val, strlen(val))) {
         return -1;
     }
     return 0;
@@ -100,7 +86,7 @@ char *xenstore_read_str(const char *base, const char *node)
     char *str, *ret = NULL;
 
     snprintf(abspath, sizeof(abspath), "%s/%s", base, node);
-    str = xs_read(xenstore, 0, abspath, &len);
+    str = qemu_xen_xs_read(xenstore, 0, abspath, &len);
     if (str != NULL) {
         /* move to qemu-allocated memory to make sure
          * callers can savely g_free() stuff. */
@@ -152,29 +138,6 @@ int xenstore_read_uint64(const char *base, const char *node, uint64_t *uval)
     return rc;
 }
 
-void xenstore_update(void *unused)
-{
-    char **vec = NULL;
-    intptr_t type, ops, ptr;
-    unsigned int dom, count;
-
-    vec = xs_read_watch(xenstore, &count);
-    if (vec == NULL) {
-        goto cleanup;
-    }
-
-    if (sscanf(vec[XS_WATCH_TOKEN], "be:%" PRIxPTR ":%d:%" PRIxPTR,
-               &type, &dom, &ops) == 3) {
-        xenstore_update_be(vec[XS_WATCH_PATH], (void *)type, dom, (void*)ops);
-    }
-    if (sscanf(vec[XS_WATCH_TOKEN], "fe:%" PRIxPTR, &ptr) == 1) {
-        xenstore_update_fe(vec[XS_WATCH_PATH], (void *)ptr);
-    }
-
-cleanup:
-    free(vec);
-}
-
 const char *xenbus_strstate(enum xenbus_state state)
 {
     static const char *const name[] = {
@@ -238,14 +201,14 @@ void xen_pv_evtchn_event(void *opaque)
     struct XenLegacyDevice *xendev = opaque;
     evtchn_port_t port;
 
-    port = xenevtchn_pending(xendev->evtchndev);
+    port = qemu_xen_evtchn_pending(xendev->evtchndev);
     if (port != xendev->local_port) {
         xen_pv_printf(xendev, 0,
                       "xenevtchn_pending returned %d (expected %d)\n",
                       port, xendev->local_port);
         return;
     }
-    xenevtchn_unmask(xendev->evtchndev, port);
+    qemu_xen_evtchn_unmask(xendev->evtchndev, port);
 
     if (xendev->ops->event) {
         xendev->ops->event(xendev);
@@ -257,15 +220,15 @@ void xen_pv_unbind_evtchn(struct XenLegacyDevice *xendev)
     if (xendev->local_port == -1) {
         return;
     }
-    qemu_set_fd_handler(xenevtchn_fd(xendev->evtchndev), NULL, NULL, NULL);
-    xenevtchn_unbind(xendev->evtchndev, xendev->local_port);
+    qemu_set_fd_handler(qemu_xen_evtchn_fd(xendev->evtchndev), NULL, NULL, NULL);
+    qemu_xen_evtchn_unbind(xendev->evtchndev, xendev->local_port);
     xen_pv_printf(xendev, 2, "unbind evtchn port %d\n", xendev->local_port);
     xendev->local_port = -1;
 }
 
 int xen_pv_send_notify(struct XenLegacyDevice *xendev)
 {
-    return xenevtchn_notify(xendev->evtchndev, xendev->local_port);
+    return qemu_xen_evtchn_notify(xendev->evtchndev, xendev->local_port);
 }
 
 /* ------------------------------------------------------------- */
@@ -299,17 +262,15 @@ void xen_pv_del_xendev(struct XenLegacyDevice *xendev)
     }
 
     if (xendev->fe) {
-        char token[XEN_BUFSIZE];
-        snprintf(token, sizeof(token), "fe:%p", xendev);
-        xs_unwatch(xenstore, xendev->fe, token);
+        qemu_xen_xs_unwatch(xenstore, xendev->watch);
         g_free(xendev->fe);
     }
 
     if (xendev->evtchndev != NULL) {
-        xenevtchn_close(xendev->evtchndev);
+        qemu_xen_evtchn_close(xendev->evtchndev);
     }
     if (xendev->gnttabdev != NULL) {
-        xengnttab_close(xendev->gnttabdev);
+        qemu_xen_gnttab_close(xendev->gnttabdev);
     }
 
     QTAILQ_REMOVE(&xendevs, xendev, next);