diff options
Diffstat (limited to 'hw')
138 files changed, 16613 insertions, 3133 deletions
diff --git a/hw/9pfs/meson.build b/hw/9pfs/meson.build index 12443b6ad5..fd37b7a02d 100644 --- a/hw/9pfs/meson.build +++ b/hw/9pfs/meson.build @@ -15,7 +15,7 @@ fs_ss.add(files( )) fs_ss.add(when: 'CONFIG_LINUX', if_true: files('9p-util-linux.c')) fs_ss.add(when: 'CONFIG_DARWIN', if_true: files('9p-util-darwin.c')) -fs_ss.add(when: 'CONFIG_XEN', if_true: files('xen-9p-backend.c')) +fs_ss.add(when: 'CONFIG_XEN_BUS', if_true: files('xen-9p-backend.c')) softmmu_ss.add_all(when: 'CONFIG_FSDEV_9P', if_true: fs_ss) specific_ss.add(when: 'CONFIG_VIRTIO_9P', if_true: files('virtio-9p-device.c')) diff --git a/hw/9pfs/xen-9p-backend.c b/hw/9pfs/xen-9p-backend.c index 65c4979c3c..74f3a05f88 100644 --- a/hw/9pfs/xen-9p-backend.c +++ b/hw/9pfs/xen-9p-backend.c @@ -22,6 +22,7 @@ #include "qemu/config-file.h" #include "qemu/main-loop.h" #include "qemu/option.h" +#include "qemu/iov.h" #include "fsdev/qemu-fsdev.h" #define VERSIONS "1" @@ -241,7 +242,7 @@ static void xen_9pfs_push_and_notify(V9fsPDU *pdu) xen_wmb(); ring->inprogress = false; - xenevtchn_notify(ring->evtchndev, ring->local_port); + qemu_xen_evtchn_notify(ring->evtchndev, ring->local_port); qemu_bh_schedule(ring->bh); } @@ -324,8 +325,8 @@ static void xen_9pfs_evtchn_event(void *opaque) Xen9pfsRing *ring = opaque; evtchn_port_t port; - port = xenevtchn_pending(ring->evtchndev); - xenevtchn_unmask(ring->evtchndev, port); + port = qemu_xen_evtchn_pending(ring->evtchndev); + qemu_xen_evtchn_unmask(ring->evtchndev, port); qemu_bh_schedule(ring->bh); } @@ -337,10 +338,10 @@ static void xen_9pfs_disconnect(struct XenLegacyDevice *xendev) for (i = 0; i < xen_9pdev->num_rings; i++) { if (xen_9pdev->rings[i].evtchndev != NULL) { - qemu_set_fd_handler(xenevtchn_fd(xen_9pdev->rings[i].evtchndev), - NULL, NULL, NULL); - xenevtchn_unbind(xen_9pdev->rings[i].evtchndev, - xen_9pdev->rings[i].local_port); + qemu_set_fd_handler(qemu_xen_evtchn_fd(xen_9pdev->rings[i].evtchndev), + NULL, NULL, NULL); + 
qemu_xen_evtchn_unbind(xen_9pdev->rings[i].evtchndev, + xen_9pdev->rings[i].local_port); xen_9pdev->rings[i].evtchndev = NULL; } } @@ -359,12 +360,13 @@ static int xen_9pfs_free(struct XenLegacyDevice *xendev) if (xen_9pdev->rings[i].data != NULL) { xen_be_unmap_grant_refs(&xen_9pdev->xendev, xen_9pdev->rings[i].data, + xen_9pdev->rings[i].intf->ref, (1 << xen_9pdev->rings[i].ring_order)); } if (xen_9pdev->rings[i].intf != NULL) { - xen_be_unmap_grant_refs(&xen_9pdev->xendev, - xen_9pdev->rings[i].intf, - 1); + xen_be_unmap_grant_ref(&xen_9pdev->xendev, + xen_9pdev->rings[i].intf, + xen_9pdev->rings[i].ref); } if (xen_9pdev->rings[i].bh != NULL) { qemu_bh_delete(xen_9pdev->rings[i].bh); @@ -447,12 +449,12 @@ static int xen_9pfs_connect(struct XenLegacyDevice *xendev) xen_9pdev->rings[i].inprogress = false; - xen_9pdev->rings[i].evtchndev = xenevtchn_open(NULL, 0); + xen_9pdev->rings[i].evtchndev = qemu_xen_evtchn_open(); if (xen_9pdev->rings[i].evtchndev == NULL) { goto out; } - qemu_set_cloexec(xenevtchn_fd(xen_9pdev->rings[i].evtchndev)); - xen_9pdev->rings[i].local_port = xenevtchn_bind_interdomain + qemu_set_cloexec(qemu_xen_evtchn_fd(xen_9pdev->rings[i].evtchndev)); + xen_9pdev->rings[i].local_port = qemu_xen_evtchn_bind_interdomain (xen_9pdev->rings[i].evtchndev, xendev->dom, xen_9pdev->rings[i].evtchn); @@ -463,8 +465,8 @@ static int xen_9pfs_connect(struct XenLegacyDevice *xendev) goto out; } xen_pv_printf(xendev, 2, "bind evtchn port %d\n", xendev->local_port); - qemu_set_fd_handler(xenevtchn_fd(xen_9pdev->rings[i].evtchndev), - xen_9pfs_evtchn_event, NULL, &xen_9pdev->rings[i]); + qemu_set_fd_handler(qemu_xen_evtchn_fd(xen_9pdev->rings[i].evtchndev), + xen_9pfs_evtchn_event, NULL, &xen_9pdev->rings[i]); } xen_9pdev->security_model = xenstore_read_be_str(xendev, "security_model"); diff --git a/hw/acpi/acpi-pci-hotplug-stub.c b/hw/acpi/acpi-pci-hotplug-stub.c index a43f6dafc9..dcee3ad7a1 100644 --- a/hw/acpi/acpi-pci-hotplug-stub.c +++ 
b/hw/acpi/acpi-pci-hotplug-stub.c @@ -5,8 +5,7 @@ const VMStateDescription vmstate_acpi_pcihp_pci_status; void acpi_pcihp_init(Object *owner, AcpiPciHpState *s, PCIBus *root_bus, - MemoryRegion *address_space_io, bool bridges_enabled, - uint16_t io_base) + MemoryRegion *address_space_io, uint16_t io_base) { return; } @@ -36,8 +35,12 @@ void acpi_pcihp_device_unplug_request_cb(HotplugHandler *hotplug_dev, return; } -void acpi_pcihp_reset(AcpiPciHpState *s, bool acpihp_root_off) +void acpi_pcihp_reset(AcpiPciHpState *s) { return; } +bool acpi_pcihp_is_hotpluggbale_bus(AcpiPciHpState *s, BusState *bus) +{ + return true; +} diff --git a/hw/acpi/ich9.c b/hw/acpi/ich9.c index d23bfcaa6b..25e2c7243e 100644 --- a/hw/acpi/ich9.c +++ b/hw/acpi/ich9.c @@ -218,7 +218,7 @@ static bool vmstate_test_use_pcihp(void *opaque) { ICH9LPCPMRegs *s = opaque; - return s->use_acpi_hotplug_bridge; + return s->acpi_pci_hotplug.use_acpi_hotplug_bridge; } static const VMStateDescription vmstate_pcihp_state = { @@ -277,8 +277,8 @@ static void pm_reset(void *opaque) } pm->smi_en_wmask = ~0; - if (pm->use_acpi_hotplug_bridge) { - acpi_pcihp_reset(&pm->acpi_pci_hotplug, true); + if (pm->acpi_pci_hotplug.use_acpi_hotplug_bridge) { + acpi_pcihp_reset(&pm->acpi_pci_hotplug); } acpi_update_sci(&pm->acpi_regs, pm->irq); @@ -316,12 +316,11 @@ void ich9_pm_init(PCIDevice *lpc_pci, ICH9LPCPMRegs *pm, qemu_irq sci_irq) acpi_pm_tco_init(&pm->tco_regs, &pm->io); } - if (pm->use_acpi_hotplug_bridge) { + if (pm->acpi_pci_hotplug.use_acpi_hotplug_bridge) { acpi_pcihp_init(OBJECT(lpc_pci), &pm->acpi_pci_hotplug, pci_get_bus(lpc_pci), pci_address_space_io(lpc_pci), - true, ACPI_PCIHP_ADDR_ICH9); qbus_set_hotplug_handler(BUS(pci_get_bus(lpc_pci)), @@ -403,14 +402,14 @@ static bool ich9_pm_get_acpi_pci_hotplug(Object *obj, Error **errp) { ICH9LPCState *s = ICH9_LPC_DEVICE(obj); - return s->pm.use_acpi_hotplug_bridge; + return s->pm.acpi_pci_hotplug.use_acpi_hotplug_bridge; } static void 
ich9_pm_set_acpi_pci_hotplug(Object *obj, bool value, Error **errp) { ICH9LPCState *s = ICH9_LPC_DEVICE(obj); - s->pm.use_acpi_hotplug_bridge = value; + s->pm.acpi_pci_hotplug.use_acpi_hotplug_bridge = value; } static bool ich9_pm_get_keep_pci_slot_hpc(Object *obj, Error **errp) @@ -435,7 +434,7 @@ void ich9_pm_add_properties(Object *obj, ICH9LPCPMRegs *pm) pm->disable_s3 = 0; pm->disable_s4 = 0; pm->s4_val = 2; - pm->use_acpi_hotplug_bridge = true; + pm->acpi_pci_hotplug.use_acpi_hotplug_bridge = true; pm->keep_pci_slot_hpc = true; pm->enable_tco = true; @@ -579,6 +578,12 @@ void ich9_pm_device_unplug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, } } +bool ich9_pm_is_hotpluggable_bus(HotplugHandler *hotplug_dev, BusState *bus) +{ + ICH9LPCState *lpc = ICH9_LPC_DEVICE(hotplug_dev); + return acpi_pcihp_is_hotpluggbale_bus(&lpc->pm.acpi_pci_hotplug, bus); +} + void ich9_pm_ospm_status(AcpiDeviceIf *adev, ACPIOSTInfoList ***list) { ICH9LPCState *s = ICH9_LPC_DEVICE(adev); diff --git a/hw/acpi/pci-bridge.c b/hw/acpi/pci-bridge.c index 5f3ee5157f..7baa7034a1 100644 --- a/hw/acpi/pci-bridge.c +++ b/hw/acpi/pci-bridge.c @@ -21,7 +21,17 @@ void build_pci_bridge_aml(AcpiDevAmlIf *adev, Aml *scope) { PCIBridge *br = PCI_BRIDGE(adev); - if (object_property_find(OBJECT(&br->sec_bus), ACPI_PCIHP_PROP_BSEL)) { - build_append_pci_bus_devices(scope, pci_bridge_get_sec_bus(br)); + if (!DEVICE(br)->hotplugged) { + PCIBus *sec_bus = pci_bridge_get_sec_bus(br); + + build_append_pci_bus_devices(scope, sec_bus); + + /* + * generate hotplug slots descriptors if + * bridge has ACPI PCI hotplug attached, + */ + if (object_property_find(OBJECT(sec_bus), ACPI_PCIHP_PROP_BSEL)) { + build_append_pcihp_slots(scope, sec_bus); + } } } diff --git a/hw/acpi/pcihp.c b/hw/acpi/pcihp.c index 5dc7377411..dcfb779a7a 100644 --- a/hw/acpi/pcihp.c +++ b/hw/acpi/pcihp.c @@ -54,21 +54,6 @@ typedef struct AcpiPciHpFind { PCIBus *bus; } AcpiPciHpFind; -static gint g_cmp_uint32(gconstpointer a, 
gconstpointer b, gpointer user_data) -{ - return a - b; -} - -static GSequence *pci_acpi_index_list(void) -{ - static GSequence *used_acpi_index_list; - - if (!used_acpi_index_list) { - used_acpi_index_list = g_sequence_new(NULL); - } - return used_acpi_index_list; -} - static int acpi_pcihp_get_bsel(PCIBus *bus) { Error *local_err = NULL; @@ -136,20 +121,6 @@ static void acpi_set_pci_info(bool has_bridge_hotplug) } } -static void acpi_pcihp_disable_root_bus(void) -{ - Object *host = acpi_get_i386_pci_host(); - PCIBus *bus; - - bus = PCI_HOST_BRIDGE(host)->bus; - if (bus && qbus_is_hotpluggable(BUS(bus))) { - /* setting the hotplug handler to NULL makes the bus non-hotpluggable */ - qbus_set_hotplug_handler(BUS(bus), NULL); - } - - return; -} - static void acpi_pcihp_test_hotplug_bus(PCIBus *bus, void *opaque) { AcpiPciHpFind *find = opaque; @@ -291,17 +262,12 @@ static void acpi_pcihp_update(AcpiPciHpState *s) } } -void acpi_pcihp_reset(AcpiPciHpState *s, bool acpihp_root_off) +void acpi_pcihp_reset(AcpiPciHpState *s) { - if (acpihp_root_off) { - acpi_pcihp_disable_root_bus(); - } - acpi_set_pci_info(!s->legacy_piix); + acpi_set_pci_info(s->use_acpi_hotplug_bridge); acpi_pcihp_update(s); } -#define ONBOARD_INDEX_MAX (16 * 1024 - 1) - void acpi_pcihp_device_pre_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { @@ -314,34 +280,6 @@ void acpi_pcihp_device_pre_plug_cb(HotplugHandler *hotplug_dev, ACPI_PCIHP_PROP_BSEL "' set"); return; } - - /* - * capped by systemd (see: udev-builtin-net_id.c) - * as it's the only known user honor it to avoid users - * misconfigure QEMU and then wonder why acpi-index doesn't work - */ - if (pdev->acpi_index > ONBOARD_INDEX_MAX) { - error_setg(errp, "acpi-index should be less or equal to %u", - ONBOARD_INDEX_MAX); - return; - } - - /* - * make sure that acpi-index is unique across all present PCI devices - */ - if (pdev->acpi_index) { - GSequence *used_indexes = pci_acpi_index_list(); - - if 
(g_sequence_lookup(used_indexes, GINT_TO_POINTER(pdev->acpi_index), - g_cmp_uint32, NULL)) { - error_setg(errp, "a PCI device with acpi-index = %" PRIu32 - " already exist", pdev->acpi_index); - return; - } - g_sequence_insert_sorted(used_indexes, - GINT_TO_POINTER(pdev->acpi_index), - g_cmp_uint32, NULL); - } } void acpi_pcihp_device_plug_cb(HotplugHandler *hotplug_dev, AcpiPciHpState *s, @@ -361,17 +299,10 @@ void acpi_pcihp_device_plug_cb(HotplugHandler *hotplug_dev, AcpiPciHpState *s, * Overwrite the default hotplug handler with the ACPI PCI one * for cold plugged bridges only. */ - if (!s->legacy_piix && + if (s->use_acpi_hotplug_bridge && object_dynamic_cast(OBJECT(dev), TYPE_PCI_BRIDGE)) { PCIBus *sec = pci_bridge_get_sec_bus(PCI_BRIDGE(pdev)); - /* Remove all hot-plug handlers if hot-plug is disabled on slot */ - if (object_dynamic_cast(OBJECT(dev), TYPE_PCIE_SLOT) && - !PCIE_SLOT(pdev)->hotplug) { - qbus_set_hotplug_handler(BUS(sec), NULL); - return; - } - qbus_set_hotplug_handler(BUS(sec), OBJECT(hotplug_dev)); /* We don't have to overwrite any other hotplug handler yet */ assert(QLIST_EMPTY(&sec->child)); @@ -401,17 +332,6 @@ void acpi_pcihp_device_unplug_cb(HotplugHandler *hotplug_dev, AcpiPciHpState *s, trace_acpi_pci_unplug(PCI_SLOT(pdev->devfn), acpi_pcihp_get_bsel(pci_get_bus(pdev))); - /* - * clean up acpi-index so it could reused by another device - */ - if (pdev->acpi_index) { - GSequence *used_indexes = pci_acpi_index_list(); - - g_sequence_remove(g_sequence_lookup(used_indexes, - GINT_TO_POINTER(pdev->acpi_index), - g_cmp_uint32, NULL)); - } - qdev_unrealize(dev); } @@ -441,6 +361,24 @@ void acpi_pcihp_device_unplug_request_cb(HotplugHandler *hotplug_dev, acpi_send_event(DEVICE(hotplug_dev), ACPI_PCI_HOTPLUG_STATUS); } +bool acpi_pcihp_is_hotpluggbale_bus(AcpiPciHpState *s, BusState *bus) +{ + Object *o = OBJECT(bus->parent); + + if (s->use_acpi_hotplug_bridge && + object_dynamic_cast(o, TYPE_PCI_BRIDGE)) { + if (object_dynamic_cast(o, 
TYPE_PCIE_SLOT) && !PCIE_SLOT(o)->hotplug) { + return false; + } + return true; + } + + if (s->use_acpi_root_pci_hotplug) { + return true; + } + return false; +} + static uint64_t pci_read(void *opaque, hwaddr addr, unsigned int size) { AcpiPciHpState *s = opaque; @@ -454,7 +392,7 @@ static uint64_t pci_read(void *opaque, hwaddr addr, unsigned int size) switch (addr) { case PCI_UP_BASE: val = s->acpi_pcihp_pci_status[bsel].up; - if (!s->legacy_piix) { + if (s->use_acpi_hotplug_bridge) { s->acpi_pcihp_pci_status[bsel].up = 0; } trace_acpi_pci_up_read(val); @@ -529,7 +467,8 @@ static void pci_write(void *opaque, hwaddr addr, uint64_t data, trace_acpi_pci_ej_write(addr, data); break; case PCI_SEL_BASE: - s->hotplug_select = s->legacy_piix ? ACPI_PCIHP_BSEL_DEFAULT : data; + s->hotplug_select = s->use_acpi_hotplug_bridge ? data : + ACPI_PCIHP_BSEL_DEFAULT; trace_acpi_pci_sel_write(addr, data); default: break; @@ -547,14 +486,13 @@ static const MemoryRegionOps acpi_pcihp_io_ops = { }; void acpi_pcihp_init(Object *owner, AcpiPciHpState *s, PCIBus *root_bus, - MemoryRegion *address_space_io, bool bridges_enabled, + MemoryRegion *address_space_io, uint16_t io_base) { s->io_len = ACPI_PCIHP_SIZE; s->io_base = io_base; s->root = root_bus; - s->legacy_piix = !bridges_enabled; memory_region_init_io(&s->io, owner, &acpi_pcihp_io_ops, s, "acpi-pci-hotplug", s->io_len); diff --git a/hw/acpi/piix4.c b/hw/acpi/piix4.c index eac2125abd..63d2113b86 100644 --- a/hw/acpi/piix4.c +++ b/hw/acpi/piix4.c @@ -170,14 +170,14 @@ static const VMStateDescription vmstate_pci_status = { static bool vmstate_test_use_acpi_hotplug_bridge(void *opaque, int version_id) { PIIX4PMState *s = opaque; - return s->use_acpi_hotplug_bridge; + return s->acpi_pci_hotplug.use_acpi_hotplug_bridge; } static bool vmstate_test_no_use_acpi_hotplug_bridge(void *opaque, int version_id) { PIIX4PMState *s = opaque; - return !s->use_acpi_hotplug_bridge; + return !s->acpi_pci_hotplug.use_acpi_hotplug_bridge; } static bool 
vmstate_test_use_memhp(void *opaque) @@ -234,7 +234,8 @@ static bool piix4_vmstate_need_smbus(void *opaque, int version_id) static bool vmstate_test_migrate_acpi_index(void *opaque, int version_id) { PIIX4PMState *s = PIIX4_PM(opaque); - return s->use_acpi_hotplug_bridge && !s->not_migrate_acpi_index; + return s->acpi_pci_hotplug.use_acpi_hotplug_bridge && + !s->not_migrate_acpi_index; } /* qemu-kvm 1.2 uses version 3 but advertised as 2 @@ -303,8 +304,9 @@ static void piix4_pm_reset(DeviceState *dev) acpi_update_sci(&s->ar, s->irq); pm_io_space_update(s); - if (s->use_acpi_hotplug_bridge || s->use_acpi_root_pci_hotplug) { - acpi_pcihp_reset(&s->acpi_pci_hotplug, !s->use_acpi_root_pci_hotplug); + if (s->acpi_pci_hotplug.use_acpi_hotplug_bridge || + s->acpi_pci_hotplug.use_acpi_root_pci_hotplug) { + acpi_pcihp_reset(&s->acpi_pci_hotplug); } } @@ -402,6 +404,13 @@ static void piix4_device_unplug_cb(HotplugHandler *hotplug_dev, } } +static bool piix4_is_hotpluggable_bus(HotplugHandler *hotplug_dev, + BusState *bus) +{ + PIIX4PMState *s = PIIX4_PM(hotplug_dev); + return acpi_pcihp_is_hotpluggbale_bus(&s->acpi_pci_hotplug, bus); +} + static void piix4_pm_machine_ready(Notifier *n, void *opaque) { PIIX4PMState *s = container_of(n, PIIX4PMState, machine_ready); @@ -487,12 +496,11 @@ static void piix4_pm_realize(PCIDevice *dev, Error **errp) qemu_add_machine_init_done_notifier(&s->machine_ready); if (xen_enabled()) { - s->use_acpi_hotplug_bridge = false; + s->acpi_pci_hotplug.use_acpi_hotplug_bridge = false; } piix4_acpi_system_hot_add_init(pci_address_space_io(dev), pci_get_bus(dev), s); - qbus_set_hotplug_handler(BUS(pci_get_bus(dev)), OBJECT(s)); piix4_pm_add_properties(s); } @@ -561,9 +569,11 @@ static void piix4_acpi_system_hot_add_init(MemoryRegion *parent, "acpi-gpe0", GPE_LEN); memory_region_add_subregion(parent, GPE_BASE, &s->io_gpe); - if (s->use_acpi_hotplug_bridge || s->use_acpi_root_pci_hotplug) { + if (s->acpi_pci_hotplug.use_acpi_hotplug_bridge || + 
s->acpi_pci_hotplug.use_acpi_root_pci_hotplug) { acpi_pcihp_init(OBJECT(s), &s->acpi_pci_hotplug, bus, parent, - s->use_acpi_hotplug_bridge, ACPI_PCIHP_ADDR_PIIX4); + ACPI_PCIHP_ADDR_PIIX4); + qbus_set_hotplug_handler(BUS(pci_get_bus(PCI_DEVICE(s))), OBJECT(s)); } s->cpu_hotplug_legacy = true; @@ -602,9 +612,9 @@ static Property piix4_pm_properties[] = { DEFINE_PROP_UINT8(ACPI_PM_PROP_S4_DISABLED, PIIX4PMState, disable_s4, 0), DEFINE_PROP_UINT8(ACPI_PM_PROP_S4_VAL, PIIX4PMState, s4_val, 2), DEFINE_PROP_BOOL(ACPI_PM_PROP_ACPI_PCIHP_BRIDGE, PIIX4PMState, - use_acpi_hotplug_bridge, true), + acpi_pci_hotplug.use_acpi_hotplug_bridge, true), DEFINE_PROP_BOOL(ACPI_PM_PROP_ACPI_PCI_ROOTHP, PIIX4PMState, - use_acpi_root_pci_hotplug, true), + acpi_pci_hotplug.use_acpi_root_pci_hotplug, true), DEFINE_PROP_BOOL("memory-hotplug-support", PIIX4PMState, acpi_memory_hotplug.is_enabled, true), DEFINE_PROP_BOOL("smm-compat", PIIX4PMState, smm_compat, false), @@ -641,6 +651,7 @@ static void piix4_pm_class_init(ObjectClass *klass, void *data) hc->plug = piix4_device_plug_cb; hc->unplug_request = piix4_device_unplug_request_cb; hc->unplug = piix4_device_unplug_cb; + hc->is_hotpluggable_bus = piix4_is_hotpluggable_bus; adevc->ospm_status = piix4_ospm_status; adevc->send_event = piix4_send_gpe; adevc->madt_cpu = pc_madt_cpu_entry; diff --git a/hw/arm/allwinner-h3.c b/hw/arm/allwinner-h3.c index bfce3c8d92..69d0ad6f50 100644 --- a/hw/arm/allwinner-h3.c +++ b/hw/arm/allwinner-h3.c @@ -54,6 +54,8 @@ const hwaddr allwinner_h3_memmap[] = { [AW_H3_DEV_UART2] = 0x01c28800, [AW_H3_DEV_UART3] = 0x01c28c00, [AW_H3_DEV_TWI0] = 0x01c2ac00, + [AW_H3_DEV_TWI1] = 0x01c2b000, + [AW_H3_DEV_TWI2] = 0x01c2b400, [AW_H3_DEV_EMAC] = 0x01c30000, [AW_H3_DEV_DRAMCOM] = 0x01c62000, [AW_H3_DEV_DRAMCTL] = 0x01c63000, @@ -64,6 +66,7 @@ const hwaddr allwinner_h3_memmap[] = { [AW_H3_DEV_GIC_VCPU] = 0x01c86000, [AW_H3_DEV_RTC] = 0x01f00000, [AW_H3_DEV_CPUCFG] = 0x01f01c00, + [AW_H3_DEV_R_TWI] = 0x01f02400, 
[AW_H3_DEV_SDRAM] = 0x40000000 }; @@ -107,8 +110,6 @@ struct AwH3Unimplemented { { "uart1", 0x01c28400, 1 * KiB }, { "uart2", 0x01c28800, 1 * KiB }, { "uart3", 0x01c28c00, 1 * KiB }, - { "twi1", 0x01c2b000, 1 * KiB }, - { "twi2", 0x01c2b400, 1 * KiB }, { "scr", 0x01c2c400, 1 * KiB }, { "gpu", 0x01c40000, 64 * KiB }, { "hstmr", 0x01c60000, 4 * KiB }, @@ -123,7 +124,6 @@ struct AwH3Unimplemented { { "r_prcm", 0x01f01400, 1 * KiB }, { "r_twd", 0x01f01800, 1 * KiB }, { "r_cir-rx", 0x01f02000, 1 * KiB }, - { "r_twi", 0x01f02400, 1 * KiB }, { "r_uart", 0x01f02800, 1 * KiB }, { "r_pio", 0x01f02c00, 1 * KiB }, { "r_pwm", 0x01f03800, 1 * KiB }, @@ -151,8 +151,11 @@ enum { AW_H3_GIC_SPI_UART2 = 2, AW_H3_GIC_SPI_UART3 = 3, AW_H3_GIC_SPI_TWI0 = 6, + AW_H3_GIC_SPI_TWI1 = 7, + AW_H3_GIC_SPI_TWI2 = 8, AW_H3_GIC_SPI_TIMER0 = 18, AW_H3_GIC_SPI_TIMER1 = 19, + AW_H3_GIC_SPI_R_TWI = 44, AW_H3_GIC_SPI_MMC0 = 60, AW_H3_GIC_SPI_EHCI0 = 72, AW_H3_GIC_SPI_OHCI0 = 73, @@ -227,7 +230,10 @@ static void allwinner_h3_init(Object *obj) object_initialize_child(obj, "rtc", &s->rtc, TYPE_AW_RTC_SUN6I); - object_initialize_child(obj, "twi0", &s->i2c0, TYPE_AW_I2C); + object_initialize_child(obj, "twi0", &s->i2c0, TYPE_AW_I2C_SUN6I); + object_initialize_child(obj, "twi1", &s->i2c1, TYPE_AW_I2C_SUN6I); + object_initialize_child(obj, "twi2", &s->i2c2, TYPE_AW_I2C_SUN6I); + object_initialize_child(obj, "r_twi", &s->r_twi, TYPE_AW_I2C_SUN6I); } static void allwinner_h3_realize(DeviceState *dev, Error **errp) @@ -432,6 +438,21 @@ static void allwinner_h3_realize(DeviceState *dev, Error **errp) sysbus_connect_irq(SYS_BUS_DEVICE(&s->i2c0), 0, qdev_get_gpio_in(DEVICE(&s->gic), AW_H3_GIC_SPI_TWI0)); + sysbus_realize(SYS_BUS_DEVICE(&s->i2c1), &error_fatal); + sysbus_mmio_map(SYS_BUS_DEVICE(&s->i2c1), 0, s->memmap[AW_H3_DEV_TWI1]); + sysbus_connect_irq(SYS_BUS_DEVICE(&s->i2c1), 0, + qdev_get_gpio_in(DEVICE(&s->gic), AW_H3_GIC_SPI_TWI1)); + + sysbus_realize(SYS_BUS_DEVICE(&s->i2c2), &error_fatal); + 
sysbus_mmio_map(SYS_BUS_DEVICE(&s->i2c2), 0, s->memmap[AW_H3_DEV_TWI2]); + sysbus_connect_irq(SYS_BUS_DEVICE(&s->i2c2), 0, + qdev_get_gpio_in(DEVICE(&s->gic), AW_H3_GIC_SPI_TWI2)); + + sysbus_realize(SYS_BUS_DEVICE(&s->r_twi), &error_fatal); + sysbus_mmio_map(SYS_BUS_DEVICE(&s->r_twi), 0, s->memmap[AW_H3_DEV_R_TWI]); + sysbus_connect_irq(SYS_BUS_DEVICE(&s->r_twi), 0, + qdev_get_gpio_in(DEVICE(&s->gic), AW_H3_GIC_SPI_R_TWI)); + /* Unimplemented devices */ for (i = 0; i < ARRAY_SIZE(unimplemented); i++) { create_unimplemented_device(unimplemented[i].device_name, diff --git a/hw/arm/aspeed.c b/hw/arm/aspeed.c index 86601cb1a5..c1f2b9cfca 100644 --- a/hw/arm/aspeed.c +++ b/hw/arm/aspeed.c @@ -524,6 +524,11 @@ static void yosemitev2_bmc_i2c_init(AspeedMachineState *bmc) at24c_eeprom_init(aspeed_i2c_get_bus(&soc->i2c, 4), 0x51, 128 * KiB); at24c_eeprom_init_rom(aspeed_i2c_get_bus(&soc->i2c, 8), 0x51, 128 * KiB, yosemitev2_bmc_fruid, yosemitev2_bmc_fruid_len); + /* TMP421 */ + i2c_slave_create_simple(aspeed_i2c_get_bus(&soc->i2c, 11), "tmp421", 0x1f); + i2c_slave_create_simple(aspeed_i2c_get_bus(&soc->i2c, 9), "tmp421", 0x4e); + i2c_slave_create_simple(aspeed_i2c_get_bus(&soc->i2c, 9), "tmp421", 0x4f); + } static void romulus_bmc_i2c_init(AspeedMachineState *bmc) @@ -542,6 +547,10 @@ static void tiogapass_bmc_i2c_init(AspeedMachineState *bmc) at24c_eeprom_init(aspeed_i2c_get_bus(&soc->i2c, 4), 0x54, 128 * KiB); at24c_eeprom_init_rom(aspeed_i2c_get_bus(&soc->i2c, 6), 0x54, 128 * KiB, tiogapass_bmc_fruid, tiogapass_bmc_fruid_len); + /* TMP421 */ + i2c_slave_create_simple(aspeed_i2c_get_bus(&soc->i2c, 8), "tmp421", 0x1f); + i2c_slave_create_simple(aspeed_i2c_get_bus(&soc->i2c, 6), "tmp421", 0x4f); + i2c_slave_create_simple(aspeed_i2c_get_bus(&soc->i2c, 6), "tmp421", 0x4e); } static void create_pca9552(AspeedSoCState *soc, int bus_id, int addr) diff --git a/hw/arm/aspeed_eeprom.c b/hw/arm/aspeed_eeprom.c index 2fb2d5dbb7..dc33a88a54 100644 --- a/hw/arm/aspeed_eeprom.c +++ 
b/hw/arm/aspeed_eeprom.c @@ -101,17 +101,17 @@ const uint8_t fby35_bmc_fruid[] = { /* Yosemite V2 BMC FRU */ const uint8_t yosemitev2_bmc_fruid[] = { 0x01, 0x00, 0x00, 0x01, 0x0d, 0x00, 0x00, 0xf1, 0x01, 0x0c, 0x00, 0x36, - 0xe6, 0xd0, 0xc6, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0xd2, 0x42, 0x4d, - 0x43, 0x20, 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x20, 0x4d, 0x6f, - 0x64, 0x75, 0x6c, 0x65, 0xcd, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, + 0xe6, 0xd0, 0xc6, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0xd2, 0x42, 0x61, + 0x73, 0x65, 0x62, 0x6f, 0x61, 0x72, 0x64, 0x20, 0x4d, 0x50, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xcd, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0xce, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0xc3, 0x31, 0x2e, 0x30, 0xc9, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0xd2, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0xc1, 0x39, 0x01, 0x0c, 0x00, 0xc6, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0xd2, 0x59, 0x6f, 0x73, 0x65, 0x6d, - 0x69, 0x74, 0x65, 0x20, 0x56, 0x32, 0x2e, 0x30, 0x20, 0x45, 0x56, 0x54, - 0x32, 0xce, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, + 0x69, 0x74, 0x65, 0x20, 0x56, 0x32, 0x20, 0x4d, 0x50, 0x00, 0x00, 0x00, + 0x00, 0xce, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0xc4, 0x45, 0x56, 0x54, 0x32, 0xcd, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0xc7, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0xc3, 0x31, 0x2e, 0x30, 0xc9, diff --git a/hw/arm/boot.c b/hw/arm/boot.c index 1e021c4a34..50e5141116 100644 --- a/hw/arm/boot.c +++ b/hw/arm/boot.c @@ -926,6 +926,12 @@ static uint64_t load_aarch64_image(const char *filename, hwaddr mem_base, return -1; } size = len; + + /* Unpack the image if it is a EFI zboot image */ + if (unpack_efi_zboot_image(&buffer, &size) < 0) { + g_free(buffer); + return -1; + } } /* 
check the arm64 magic header value -- very old kernels may not have it */ diff --git a/hw/audio/trace-events b/hw/audio/trace-events index e0e71cd9b1..4dec48a4fd 100644 --- a/hw/audio/trace-events +++ b/hw/audio/trace-events @@ -11,3 +11,9 @@ hda_audio_running(const char *stream, int nr, bool running) "st %s, nr %d, run % hda_audio_format(const char *stream, int chan, const char *fmt, int freq) "st %s, %d x %s @ %d Hz" hda_audio_adjust(const char *stream, int pos) "st %s, pos %d" hda_audio_overrun(const char *stream) "st %s" + +#via-ac97.c +via_ac97_codec_write(uint8_t addr, uint16_t val) "0x%x <- 0x%x" +via_ac97_sgd_fetch(uint32_t curr, uint32_t addr, char stop, char eol, char flag, uint32_t len) "curr=0x%x addr=0x%x %c%c%c len=%d" +via_ac97_sgd_read(uint64_t addr, unsigned size, uint64_t val) "0x%"PRIx64" %d -> 0x%"PRIx64 +via_ac97_sgd_write(uint64_t addr, unsigned size, uint64_t val) "0x%"PRIx64" %d <- 0x%"PRIx64 diff --git a/hw/audio/via-ac97.c b/hw/audio/via-ac97.c index d1a856f63d..676254b7a4 100644 --- a/hw/audio/via-ac97.c +++ b/hw/audio/via-ac97.c @@ -1,39 +1,482 @@ /* * VIA south bridges sound support * + * Copyright (c) 2022-2023 BALATON Zoltan + * * This work is licensed under the GNU GPL license version 2 or later. */ /* - * TODO: This is entirely boiler plate just registering empty PCI devices - * with the right ID guests expect, functionality should be added here. + * TODO: This is only a basic implementation of one audio playback channel + * more functionality should be added here. 
*/ #include "qemu/osdep.h" +#include "qemu/log.h" #include "hw/isa/vt82c686.h" -#include "hw/pci/pci_device.h" +#include "ac97.h" +#include "trace.h" + +#define CLEN_IS_EOL(x) ((x)->clen & BIT(31)) +#define CLEN_IS_FLAG(x) ((x)->clen & BIT(30)) +#define CLEN_IS_STOP(x) ((x)->clen & BIT(29)) +#define CLEN_LEN(x) ((x)->clen & 0xffffff) + +#define STAT_ACTIVE BIT(7) +#define STAT_PAUSED BIT(6) +#define STAT_TRIG BIT(3) +#define STAT_STOP BIT(2) +#define STAT_EOL BIT(1) +#define STAT_FLAG BIT(0) + +#define CNTL_START BIT(7) +#define CNTL_TERM BIT(6) +#define CNTL_PAUSE BIT(3) + +static void open_voice_out(ViaAC97State *s); + +static uint16_t codec_rates[] = { 8000, 11025, 16000, 22050, 32000, 44100, + 48000 }; + +#define CODEC_REG(s, o) ((s)->codec_regs[(o) / 2]) +#define CODEC_VOL(vol, mask) ((255 * ((vol) & mask)) / mask) + +static void codec_volume_set_out(ViaAC97State *s) +{ + int lvol, rvol, mute; + + lvol = 255 - CODEC_VOL(CODEC_REG(s, AC97_Master_Volume_Mute) >> 8, 0x1f); + lvol *= 255 - CODEC_VOL(CODEC_REG(s, AC97_PCM_Out_Volume_Mute) >> 8, 0x1f); + lvol /= 255; + rvol = 255 - CODEC_VOL(CODEC_REG(s, AC97_Master_Volume_Mute), 0x1f); + rvol *= 255 - CODEC_VOL(CODEC_REG(s, AC97_PCM_Out_Volume_Mute), 0x1f); + rvol /= 255; + mute = CODEC_REG(s, AC97_Master_Volume_Mute) >> MUTE_SHIFT; + mute |= CODEC_REG(s, AC97_PCM_Out_Volume_Mute) >> MUTE_SHIFT; + AUD_set_volume_out(s->vo, mute, lvol, rvol); +} + +static void codec_reset(ViaAC97State *s) +{ + memset(s->codec_regs, 0, sizeof(s->codec_regs)); + CODEC_REG(s, AC97_Reset) = 0x6a90; + CODEC_REG(s, AC97_Master_Volume_Mute) = 0x8000; + CODEC_REG(s, AC97_Headphone_Volume_Mute) = 0x8000; + CODEC_REG(s, AC97_Master_Volume_Mono_Mute) = 0x8000; + CODEC_REG(s, AC97_Phone_Volume_Mute) = 0x8008; + CODEC_REG(s, AC97_Mic_Volume_Mute) = 0x8008; + CODEC_REG(s, AC97_Line_In_Volume_Mute) = 0x8808; + CODEC_REG(s, AC97_CD_Volume_Mute) = 0x8808; + CODEC_REG(s, AC97_Video_Volume_Mute) = 0x8808; + CODEC_REG(s, AC97_Aux_Volume_Mute) = 0x8808; 
+ CODEC_REG(s, AC97_PCM_Out_Volume_Mute) = 0x8808; + CODEC_REG(s, AC97_Record_Gain_Mute) = 0x8000; + CODEC_REG(s, AC97_Powerdown_Ctrl_Stat) = 0x000f; + CODEC_REG(s, AC97_Extended_Audio_ID) = 0x0a05; + CODEC_REG(s, AC97_Extended_Audio_Ctrl_Stat) = 0x0400; + CODEC_REG(s, AC97_PCM_Front_DAC_Rate) = 48000; + CODEC_REG(s, AC97_PCM_LR_ADC_Rate) = 48000; + /* Sigmatel 9766 (STAC9766) */ + CODEC_REG(s, AC97_Vendor_ID1) = 0x8384; + CODEC_REG(s, AC97_Vendor_ID2) = 0x7666; +} + +static uint16_t codec_read(ViaAC97State *s, uint8_t addr) +{ + return CODEC_REG(s, addr); +} + +static void codec_write(ViaAC97State *s, uint8_t addr, uint16_t val) +{ + trace_via_ac97_codec_write(addr, val); + switch (addr) { + case AC97_Reset: + codec_reset(s); + return; + case AC97_Master_Volume_Mute: + case AC97_PCM_Out_Volume_Mute: + if (addr == AC97_Master_Volume_Mute) { + if (val & BIT(13)) { + val |= 0x1f00; + } + if (val & BIT(5)) { + val |= 0x1f; + } + } + CODEC_REG(s, addr) = val & 0x9f1f; + codec_volume_set_out(s); + return; + case AC97_Extended_Audio_Ctrl_Stat: + CODEC_REG(s, addr) &= ~EACS_VRA; + CODEC_REG(s, addr) |= val & EACS_VRA; + if (!(val & EACS_VRA)) { + CODEC_REG(s, AC97_PCM_Front_DAC_Rate) = 48000; + CODEC_REG(s, AC97_PCM_LR_ADC_Rate) = 48000; + open_voice_out(s); + } + return; + case AC97_PCM_Front_DAC_Rate: + case AC97_PCM_LR_ADC_Rate: + if (CODEC_REG(s, AC97_Extended_Audio_Ctrl_Stat) & EACS_VRA) { + int i; + uint16_t rate = val; + + for (i = 0; i < ARRAY_SIZE(codec_rates) - 1; i++) { + if (rate < codec_rates[i] + + (codec_rates[i + 1] - codec_rates[i]) / 2) { + rate = codec_rates[i]; + break; + } + } + if (rate > 48000) { + rate = 48000; + } + CODEC_REG(s, addr) = rate; + open_voice_out(s); + } + return; + case AC97_Powerdown_Ctrl_Stat: + CODEC_REG(s, addr) = (val & 0xff00) | (CODEC_REG(s, addr) & 0xff); + return; + case AC97_Extended_Audio_ID: + case AC97_Vendor_ID1: + case AC97_Vendor_ID2: + /* Read only registers */ + return; + default: + qemu_log_mask(LOG_UNIMP, + 
"via-ac97: Unimplemented codec register 0x%x\n", addr); + CODEC_REG(s, addr) = val; + } +} + +static void fetch_sgd(ViaAC97SGDChannel *c, PCIDevice *d) +{ + uint32_t b[2]; + + if (c->curr < c->base) { + c->curr = c->base; + } + if (unlikely(pci_dma_read(d, c->curr, b, sizeof(b)) != MEMTX_OK)) { + qemu_log_mask(LOG_GUEST_ERROR, + "via-ac97: DMA error reading SGD table\n"); + return; + } + c->addr = le32_to_cpu(b[0]); + c->clen = le32_to_cpu(b[1]); + trace_via_ac97_sgd_fetch(c->curr, c->addr, CLEN_IS_STOP(c) ? 'S' : '-', + CLEN_IS_EOL(c) ? 'E' : '-', + CLEN_IS_FLAG(c) ? 'F' : '-', CLEN_LEN(c)); +} + +static void out_cb(void *opaque, int avail) +{ + ViaAC97State *s = opaque; + ViaAC97SGDChannel *c = &s->aur; + int temp, to_copy, copied; + bool stop = false; + uint8_t tmpbuf[4096]; + + if (c->stat & STAT_PAUSED) { + return; + } + c->stat |= STAT_ACTIVE; + while (avail && !stop) { + if (!c->clen) { + fetch_sgd(c, &s->dev); + } + temp = MIN(CLEN_LEN(c), avail); + while (temp) { + to_copy = MIN(temp, sizeof(tmpbuf)); + pci_dma_read(&s->dev, c->addr, tmpbuf, to_copy); + copied = AUD_write(s->vo, tmpbuf, to_copy); + if (!copied) { + stop = true; + break; + } + temp -= copied; + avail -= copied; + c->addr += copied; + c->clen -= copied; + } + if (CLEN_LEN(c) == 0) { + c->curr += 8; + if (CLEN_IS_EOL(c)) { + c->stat |= STAT_EOL; + if (c->type & CNTL_START) { + c->curr = c->base; + c->stat |= STAT_PAUSED; + } else { + c->stat &= ~STAT_ACTIVE; + AUD_set_active_out(s->vo, 0); + } + if (c->type & STAT_EOL) { + pci_set_irq(&s->dev, 1); + } + } + if (CLEN_IS_FLAG(c)) { + c->stat |= STAT_FLAG; + c->stat |= STAT_PAUSED; + if (c->type & STAT_FLAG) { + pci_set_irq(&s->dev, 1); + } + } + if (CLEN_IS_STOP(c)) { + c->stat |= STAT_STOP; + c->stat |= STAT_PAUSED; + } + c->clen = 0; + stop = true; + } + } +} + +static void open_voice_out(ViaAC97State *s) +{ + struct audsettings as = { + .freq = CODEC_REG(s, AC97_PCM_Front_DAC_Rate), + .nchannels = s->aur.type & BIT(4) ? 
2 : 1, + .fmt = s->aur.type & BIT(5) ? AUDIO_FORMAT_S16 : AUDIO_FORMAT_S8, + .endianness = 0, + }; + s->vo = AUD_open_out(&s->card, s->vo, "via-ac97.out", s, out_cb, &as); +} + +static uint64_t sgd_read(void *opaque, hwaddr addr, unsigned size) +{ + ViaAC97State *s = opaque; + uint64_t val = 0; + + switch (addr) { + case 0: + val = s->aur.stat; + if (s->aur.type & CNTL_START) { + val |= STAT_TRIG; + } + break; + case 1: + val = s->aur.stat & STAT_PAUSED ? BIT(3) : 0; + break; + case 2: + val = s->aur.type; + break; + case 4: + val = s->aur.curr; + break; + case 0xc: + val = CLEN_LEN(&s->aur); + break; + case 0x10: + /* silence unimplemented log message that happens at every IRQ */ + break; + case 0x80: + val = s->ac97_cmd; + break; + case 0x84: + val = s->aur.stat & STAT_FLAG; + if (s->aur.stat & STAT_EOL) { + val |= BIT(4); + } + if (s->aur.stat & STAT_STOP) { + val |= BIT(8); + } + if (s->aur.stat & STAT_ACTIVE) { + val |= BIT(12); + } + break; + default: + qemu_log_mask(LOG_UNIMP, "via-ac97: Unimplemented register read 0x%" + HWADDR_PRIx"\n", addr); + } + trace_via_ac97_sgd_read(addr, size, val); + return val; +} + +static void sgd_write(void *opaque, hwaddr addr, uint64_t val, unsigned size) +{ + ViaAC97State *s = opaque; + + trace_via_ac97_sgd_write(addr, size, val); + switch (addr) { + case 0: + if (val & STAT_STOP) { + s->aur.stat &= ~STAT_PAUSED; + } + if (val & STAT_EOL) { + s->aur.stat &= ~(STAT_EOL | STAT_PAUSED); + if (s->aur.type & STAT_EOL) { + pci_set_irq(&s->dev, 0); + } + } + if (val & STAT_FLAG) { + s->aur.stat &= ~(STAT_FLAG | STAT_PAUSED); + if (s->aur.type & STAT_FLAG) { + pci_set_irq(&s->dev, 0); + } + } + break; + case 1: + if (val & CNTL_START) { + AUD_set_active_out(s->vo, 1); + s->aur.stat = STAT_ACTIVE; + } + if (val & CNTL_TERM) { + AUD_set_active_out(s->vo, 0); + s->aur.stat &= ~(STAT_ACTIVE | STAT_PAUSED); + s->aur.clen = 0; + } + if (val & CNTL_PAUSE) { + AUD_set_active_out(s->vo, 0); + s->aur.stat &= ~STAT_ACTIVE; + s->aur.stat |= 
STAT_PAUSED; + } else if (!(val & CNTL_PAUSE) && (s->aur.stat & STAT_PAUSED)) { + AUD_set_active_out(s->vo, 1); + s->aur.stat |= STAT_ACTIVE; + s->aur.stat &= ~STAT_PAUSED; + } + break; + case 2: + { + uint32_t oldval = s->aur.type; + s->aur.type = val; + if ((oldval & 0x30) != (val & 0x30)) { + open_voice_out(s); + } + break; + } + case 4: + s->aur.base = val & ~1ULL; + s->aur.curr = s->aur.base; + break; + case 0x80: + if (val >> 30) { + /* we only have primary codec */ + break; + } + if (val & BIT(23)) { /* read reg */ + s->ac97_cmd = val & 0xc0ff0000ULL; + s->ac97_cmd |= codec_read(s, (val >> 16) & 0x7f); + s->ac97_cmd |= BIT(25); /* data valid */ + } else { + s->ac97_cmd = val & 0xc0ffffffULL; + codec_write(s, (val >> 16) & 0x7f, val); + } + break; + case 0xc: + case 0x84: + /* Read only */ + break; + default: + qemu_log_mask(LOG_UNIMP, "via-ac97: Unimplemented register write 0x%" + HWADDR_PRIx"\n", addr); + } +} + +static const MemoryRegionOps sgd_ops = { + .read = sgd_read, + .write = sgd_write, + .endianness = DEVICE_LITTLE_ENDIAN, +}; + +static uint64_t fm_read(void *opaque, hwaddr addr, unsigned size) +{ + qemu_log_mask(LOG_UNIMP, "%s: 0x%"HWADDR_PRIx" %d\n", __func__, addr, size); + return 0; +} + +static void fm_write(void *opaque, hwaddr addr, uint64_t val, unsigned size) +{ + qemu_log_mask(LOG_UNIMP, "%s: 0x%"HWADDR_PRIx" %d <= 0x%"PRIX64"\n", + __func__, addr, size, val); +} + +static const MemoryRegionOps fm_ops = { + .read = fm_read, + .write = fm_write, + .endianness = DEVICE_LITTLE_ENDIAN, +}; + +static uint64_t midi_read(void *opaque, hwaddr addr, unsigned size) +{ + qemu_log_mask(LOG_UNIMP, "%s: 0x%"HWADDR_PRIx" %d\n", __func__, addr, size); + return 0; +} + +static void midi_write(void *opaque, hwaddr addr, uint64_t val, unsigned size) +{ + qemu_log_mask(LOG_UNIMP, "%s: 0x%"HWADDR_PRIx" %d <= 0x%"PRIX64"\n", + __func__, addr, size, val); +} + +static const MemoryRegionOps midi_ops = { + .read = midi_read, + .write = midi_write, + .endianness = 
DEVICE_LITTLE_ENDIAN, +}; + +static void via_ac97_reset(DeviceState *dev) +{ + ViaAC97State *s = VIA_AC97(dev); + + codec_reset(s); +} static void via_ac97_realize(PCIDevice *pci_dev, Error **errp) { - pci_set_word(pci_dev->config + PCI_COMMAND, - PCI_COMMAND_INVALIDATE | PCI_COMMAND_PARITY); + ViaAC97State *s = VIA_AC97(pci_dev); + Object *o = OBJECT(s); + + /* + * Command register Bus Master bit is documented to be fixed at 0 but it's + * needed for PCI DMA to work in QEMU. The pegasos2 firmware writes 0 here + * and the AmigaOS driver writes 1 only enabling IO bit which works on + * real hardware. So set it here and fix it to 1 to allow DMA. + */ + pci_set_word(pci_dev->config + PCI_COMMAND, PCI_COMMAND_MASTER); + pci_set_word(pci_dev->wmask + PCI_COMMAND, PCI_COMMAND_IO); pci_set_word(pci_dev->config + PCI_STATUS, PCI_STATUS_CAP_LIST | PCI_STATUS_DEVSEL_MEDIUM); pci_set_long(pci_dev->config + PCI_INTERRUPT_PIN, 0x03); + pci_set_byte(pci_dev->config + 0x40, 1); /* codec ready */ + + memory_region_init_io(&s->sgd, o, &sgd_ops, s, "via-ac97.sgd", 256); + pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_IO, &s->sgd); + memory_region_init_io(&s->fm, o, &fm_ops, s, "via-ac97.fm", 4); + pci_register_bar(pci_dev, 1, PCI_BASE_ADDRESS_SPACE_IO, &s->fm); + memory_region_init_io(&s->midi, o, &midi_ops, s, "via-ac97.midi", 4); + pci_register_bar(pci_dev, 2, PCI_BASE_ADDRESS_SPACE_IO, &s->midi); + + AUD_register_card ("via-ac97", &s->card); } +static void via_ac97_exit(PCIDevice *dev) +{ + ViaAC97State *s = VIA_AC97(dev); + + AUD_close_out(&s->card, s->vo); + AUD_remove_card(&s->card); +} + +static Property via_ac97_properties[] = { + DEFINE_AUDIO_PROPERTIES(ViaAC97State, card), + DEFINE_PROP_END_OF_LIST(), +}; + static void via_ac97_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); PCIDeviceClass *k = PCI_DEVICE_CLASS(klass); k->realize = via_ac97_realize; + k->exit = via_ac97_exit; k->vendor_id = PCI_VENDOR_ID_VIA; k->device_id = 
PCI_DEVICE_ID_VIA_AC97; k->revision = 0x50; k->class_id = PCI_CLASS_MULTIMEDIA_AUDIO; + device_class_set_props(dc, via_ac97_properties); set_bit(DEVICE_CATEGORY_SOUND, dc->categories); dc->desc = "VIA AC97"; + dc->reset = via_ac97_reset; /* Reason: Part of a south bridge chip */ dc->user_creatable = false; } @@ -41,7 +484,7 @@ static void via_ac97_class_init(ObjectClass *klass, void *data) static const TypeInfo via_ac97_info = { .name = TYPE_VIA_AC97, .parent = TYPE_PCI_DEVICE, - .instance_size = sizeof(PCIDevice), + .instance_size = sizeof(ViaAC97State), .class_init = via_ac97_class_init, .interfaces = (InterfaceInfo[]) { { INTERFACE_CONVENTIONAL_PCI_DEVICE }, diff --git a/hw/block/block.c b/hw/block/block.c index af0710e477..9f52ee6e72 100644 --- a/hw/block/block.c +++ b/hw/block/block.c @@ -39,8 +39,7 @@ static int blk_pread_nonzeroes(BlockBackend *blk, hwaddr size, void *buf) return ret; } if (!(ret & BDRV_BLOCK_ZERO)) { - ret = bdrv_pread(bs->file, offset, bytes, - (uint8_t *) buf + offset, 0); + ret = blk_pread(blk, offset, bytes, (uint8_t *) buf + offset, 0); if (ret < 0) { return ret; } diff --git a/hw/block/dataplane/meson.build b/hw/block/dataplane/meson.build index 12c6a264f1..78d7ac1a11 100644 --- a/hw/block/dataplane/meson.build +++ b/hw/block/dataplane/meson.build @@ -1,2 +1,2 @@ specific_ss.add(when: 'CONFIG_VIRTIO_BLK', if_true: files('virtio-blk.c')) -specific_ss.add(when: 'CONFIG_XEN', if_true: files('xen-block.c')) +specific_ss.add(when: 'CONFIG_XEN_BUS', if_true: files('xen-block.c')) diff --git a/hw/block/dataplane/xen-block.c b/hw/block/dataplane/xen-block.c index 2785b9e849..734da42ea7 100644 --- a/hw/block/dataplane/xen-block.c +++ b/hw/block/dataplane/xen-block.c @@ -23,8 +23,9 @@ #include "qemu/main-loop.h" #include "qemu/memalign.h" #include "qapi/error.h" -#include "hw/xen/xen_common.h" +#include "hw/xen/xen.h" #include "hw/block/xen_blkif.h" +#include "hw/xen/interface/io/ring.h" #include "sysemu/block-backend.h" #include 
"sysemu/iothread.h" #include "xen-block.h" @@ -101,9 +102,9 @@ static XenBlockRequest *xen_block_start_request(XenBlockDataPlane *dataplane) * re-use requests, allocate the memory once here. It will be freed * xen_block_dataplane_destroy() when the request list is freed. */ - request->buf = qemu_memalign(XC_PAGE_SIZE, + request->buf = qemu_memalign(XEN_PAGE_SIZE, BLKIF_MAX_SEGMENTS_PER_REQUEST * - XC_PAGE_SIZE); + XEN_PAGE_SIZE); dataplane->requests_total++; qemu_iovec_init(&request->v, 1); } else { @@ -185,7 +186,7 @@ static int xen_block_parse_request(XenBlockRequest *request) goto err; } if (request->req.seg[i].last_sect * dataplane->sector_size >= - XC_PAGE_SIZE) { + XEN_PAGE_SIZE) { error_report("error: page crossing"); goto err; } @@ -705,6 +706,7 @@ void xen_block_dataplane_stop(XenBlockDataPlane *dataplane) Error *local_err = NULL; xen_device_unmap_grant_refs(xendev, dataplane->sring, + dataplane->ring_ref, dataplane->nr_ring_ref, &local_err); dataplane->sring = NULL; @@ -739,7 +741,7 @@ void xen_block_dataplane_start(XenBlockDataPlane *dataplane, dataplane->protocol = protocol; - ring_size = XC_PAGE_SIZE * dataplane->nr_ring_ref; + ring_size = XEN_PAGE_SIZE * dataplane->nr_ring_ref; switch (dataplane->protocol) { case BLKIF_PROTOCOL_NATIVE: { diff --git a/hw/block/m25p80.c b/hw/block/m25p80.c index 802d2eb021..dc5ffbc4ff 100644 --- a/hw/block/m25p80.c +++ b/hw/block/m25p80.c @@ -24,6 +24,7 @@ #include "qemu/osdep.h" #include "qemu/units.h" #include "sysemu/block-backend.h" +#include "hw/block/block.h" #include "hw/qdev-properties.h" #include "hw/qdev-properties-system.h" #include "hw/ssi/ssi.h" @@ -1615,8 +1616,7 @@ static void m25p80_realize(SSIPeripheral *ss, Error **errp) trace_m25p80_binding(s); s->storage = blk_blockalign(s->blk, s->size); - if (blk_pread(s->blk, 0, s->size, s->storage, 0) < 0) { - error_setg(errp, "failed to read the initial flash content"); + if (!blk_check_size_and_read_all(s->blk, s->storage, s->size, errp)) { return; } } else { 
diff --git a/hw/block/meson.build b/hw/block/meson.build index b434d5654c..cc2a75cc50 100644 --- a/hw/block/meson.build +++ b/hw/block/meson.build @@ -14,7 +14,7 @@ softmmu_ss.add(when: 'CONFIG_PFLASH_CFI02', if_true: files('pflash_cfi02.c')) softmmu_ss.add(when: 'CONFIG_SSI_M25P80', if_true: files('m25p80.c')) softmmu_ss.add(when: 'CONFIG_SSI_M25P80', if_true: files('m25p80_sfdp.c')) softmmu_ss.add(when: 'CONFIG_SWIM', if_true: files('swim.c')) -softmmu_ss.add(when: 'CONFIG_XEN', if_true: files('xen-block.c')) +softmmu_ss.add(when: 'CONFIG_XEN_BUS', if_true: files('xen-block.c')) softmmu_ss.add(when: 'CONFIG_TC58128', if_true: files('tc58128.c')) specific_ss.add(when: 'CONFIG_VIRTIO_BLK', if_true: files('virtio-blk.c', 'virtio-blk-common.c')) diff --git a/hw/block/xen-block.c b/hw/block/xen-block.c index 345b284d70..f5a744589d 100644 --- a/hw/block/xen-block.c +++ b/hw/block/xen-block.c @@ -19,7 +19,6 @@ #include "qapi/qmp/qdict.h" #include "qapi/qmp/qstring.h" #include "qom/object_interfaces.h" -#include "hw/xen/xen_common.h" #include "hw/block/xen_blkif.h" #include "hw/qdev-properties.h" #include "hw/xen/xen-block.h" @@ -84,7 +83,8 @@ static void xen_block_connect(XenDevice *xendev, Error **errp) g_free(ring_ref); return; } - } else if (order <= blockdev->props.max_ring_page_order) { + } else if (qemu_xen_gnttab_can_map_multi() && + order <= blockdev->props.max_ring_page_order) { unsigned int i; nr_ring_ref = 1 << order; @@ -256,8 +256,12 @@ static void xen_block_realize(XenDevice *xendev, Error **errp) } xen_device_backend_printf(xendev, "feature-flush-cache", "%u", 1); - xen_device_backend_printf(xendev, "max-ring-page-order", "%u", - blockdev->props.max_ring_page_order); + + if (qemu_xen_gnttab_can_map_multi()) { + xen_device_backend_printf(xendev, "max-ring-page-order", "%u", + blockdev->props.max_ring_page_order); + } + xen_device_backend_printf(xendev, "info", "%u", blockdev->info); xen_device_frontend_printf(xendev, "virtual-device", "%lu", diff --git 
a/hw/char/meson.build b/hw/char/meson.build index 7b594f51b8..e02c60dd54 100644 --- a/hw/char/meson.build +++ b/hw/char/meson.build @@ -18,7 +18,7 @@ softmmu_ss.add(when: 'CONFIG_SERIAL_PCI', if_true: files('serial-pci.c')) softmmu_ss.add(when: 'CONFIG_SERIAL_PCI_MULTI', if_true: files('serial-pci-multi.c')) softmmu_ss.add(when: 'CONFIG_SHAKTI_UART', if_true: files('shakti_uart.c')) softmmu_ss.add(when: 'CONFIG_VIRTIO_SERIAL', if_true: files('virtio-console.c')) -softmmu_ss.add(when: 'CONFIG_XEN', if_true: files('xen_console.c')) +softmmu_ss.add(when: 'CONFIG_XEN_BUS', if_true: files('xen_console.c')) softmmu_ss.add(when: 'CONFIG_XILINX', if_true: files('xilinx_uartlite.c')) softmmu_ss.add(when: 'CONFIG_AVR_USART', if_true: files('avr_usart.c')) diff --git a/hw/char/xen_console.c b/hw/char/xen_console.c index 63153dfde4..c7a19c0e7c 100644 --- a/hw/char/xen_console.c +++ b/hw/char/xen_console.c @@ -173,6 +173,48 @@ static void xencons_send(struct XenConsole *con) /* -------------------------------------------------------------------- */ +static int store_con_info(struct XenConsole *con) +{ + Chardev *cs = qemu_chr_fe_get_driver(&con->chr); + char *pts = NULL; + char *dom_path; + GString *path; + int ret = -1; + + /* Only continue if we're talking to a pty. 
*/ + if (!CHARDEV_IS_PTY(cs)) { + return 0; + } + pts = cs->filename + 4; + + dom_path = qemu_xen_xs_get_domain_path(xenstore, xen_domid); + if (!dom_path) { + return 0; + } + + path = g_string_new(dom_path); + free(dom_path); + + if (con->xendev.dev) { + g_string_append_printf(path, "/device/console/%d", con->xendev.dev); + } else { + g_string_append(path, "/console"); + } + g_string_append(path, "/tty"); + + if (xenstore_write_str(con->console, path->str, pts)) { + fprintf(stderr, "xenstore_write_str for '%s' fail", path->str); + goto out; + } + ret = 0; + +out: + g_string_free(path, true); + free(path); + + return ret; +} + static int con_init(struct XenLegacyDevice *xendev) { struct XenConsole *con = container_of(xendev, struct XenConsole, xendev); @@ -181,7 +223,7 @@ static int con_init(struct XenLegacyDevice *xendev) const char *output; /* setup */ - dom = xs_get_domain_path(xenstore, con->xendev.dom); + dom = qemu_xen_xs_get_domain_path(xenstore, con->xendev.dom); if (!xendev->dev) { snprintf(con->console, sizeof(con->console), "%s/console", dom); } else { @@ -215,8 +257,7 @@ static int con_init(struct XenLegacyDevice *xendev) &error_abort); } - xenstore_store_pv_console_info(con->xendev.dev, - qemu_chr_fe_get_driver(&con->chr)); + store_con_info(con); out: g_free(type); @@ -237,9 +278,9 @@ static int con_initialise(struct XenLegacyDevice *xendev) if (!xendev->dev) { xen_pfn_t mfn = con->ring_ref; - con->sring = xenforeignmemory_map(xen_fmem, con->xendev.dom, - PROT_READ | PROT_WRITE, - 1, &mfn, NULL); + con->sring = qemu_xen_foreignmem_map(con->xendev.dom, NULL, + PROT_READ | PROT_WRITE, + 1, &mfn, NULL); } else { con->sring = xen_be_map_grant_ref(xendev, con->ring_ref, PROT_READ | PROT_WRITE); @@ -269,9 +310,9 @@ static void con_disconnect(struct XenLegacyDevice *xendev) if (con->sring) { if (!xendev->dev) { - xenforeignmemory_unmap(xen_fmem, con->sring, 1); + qemu_xen_foreignmem_unmap(con->sring, 1); } else { - xen_be_unmap_grant_ref(xendev, con->sring); 
+ xen_be_unmap_grant_ref(xendev, con->sring, con->ring_ref); } con->sring = NULL; } diff --git a/hw/core/loader.c b/hw/core/loader.c index 173f8f67f6..cd53235fed 100644 --- a/hw/core/loader.c +++ b/hw/core/loader.c @@ -857,6 +857,97 @@ ssize_t load_image_gzipped(const char *filename, hwaddr addr, uint64_t max_sz) return bytes; } +/* The PE/COFF MS-DOS stub magic number */ +#define EFI_PE_MSDOS_MAGIC "MZ" + +/* + * The Linux header magic number for a EFI PE/COFF + * image targetting an unspecified architecture. + */ +#define EFI_PE_LINUX_MAGIC "\xcd\x23\x82\x81" + +/* + * Bootable Linux kernel images may be packaged as EFI zboot images, which are + * self-decompressing executables when loaded via EFI. The compressed payload + * can also be extracted from the image and decompressed by a non-EFI loader. + * + * The de facto specification for this format is at the following URL: + * + * https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/firmware/efi/libstub/zboot-header.S + * + * This definition is based on Linux upstream commit 29636a5ce87beba. + */ +struct linux_efi_zboot_header { + uint8_t msdos_magic[2]; /* PE/COFF 'MZ' magic number */ + uint8_t reserved0[2]; + uint8_t zimg[4]; /* "zimg" for Linux EFI zboot images */ + uint32_t payload_offset; /* LE offset to compressed payload */ + uint32_t payload_size; /* LE size of the compressed payload */ + uint8_t reserved1[8]; + char compression_type[32]; /* Compression type, NUL terminated */ + uint8_t linux_magic[4]; /* Linux header magic */ + uint32_t pe_header_offset; /* LE offset to the PE header */ +}; + +/* + * Check whether *buffer points to a Linux EFI zboot image in memory. + * + * If it does, attempt to decompress it to a new buffer, and free the old one. + * If any of this fails, return an error to the caller. + * + * If the image is not a Linux EFI zboot image, do nothing and return success. 
+ */ +ssize_t unpack_efi_zboot_image(uint8_t **buffer, int *size) +{ + const struct linux_efi_zboot_header *header; + uint8_t *data = NULL; + int ploff, plsize; + ssize_t bytes; + + /* ignore if this is too small to be a EFI zboot image */ + if (*size < sizeof(*header)) { + return 0; + } + + header = (struct linux_efi_zboot_header *)*buffer; + + /* ignore if this is not a Linux EFI zboot image */ + if (memcmp(&header->msdos_magic, EFI_PE_MSDOS_MAGIC, 2) != 0 || + memcmp(&header->zimg, "zimg", 4) != 0 || + memcmp(&header->linux_magic, EFI_PE_LINUX_MAGIC, 4) != 0) { + return 0; + } + + if (strcmp(header->compression_type, "gzip") != 0) { + fprintf(stderr, + "unable to handle EFI zboot image with \"%.*s\" compression\n", + (int)sizeof(header->compression_type) - 1, + header->compression_type); + return -1; + } + + ploff = ldl_le_p(&header->payload_offset); + plsize = ldl_le_p(&header->payload_size); + + if (ploff < 0 || plsize < 0 || ploff + plsize > *size) { + fprintf(stderr, "unable to handle corrupt EFI zboot image\n"); + return -1; + } + + data = g_malloc(LOAD_IMAGE_MAX_GUNZIP_BYTES); + bytes = gunzip(data, LOAD_IMAGE_MAX_GUNZIP_BYTES, *buffer + ploff, plsize); + if (bytes < 0) { + fprintf(stderr, "failed to decompress EFI zboot image\n"); + g_free(data); + return -1; + } + + g_free(*buffer); + *buffer = g_realloc(data, bytes); + *size = bytes; + return bytes; +} + /* * Functions for reboot-persistent memory regions. * - used for vga bios and option roms. 
diff --git a/hw/core/machine.c b/hw/core/machine.c index 1cf6822e06..45e3d24fdc 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -40,6 +40,7 @@ #include "hw/virtio/virtio-pci.h" GlobalProperty hw_compat_7_2[] = { + { "e1000e", "migrate-timadj", "off" }, { "virtio-mem", "x-early-migration", "false" }, }; const size_t hw_compat_7_2_len = G_N_ELEMENTS(hw_compat_7_2); diff --git a/hw/cxl/cxl-component-utils.c b/hw/cxl/cxl-component-utils.c index 3edd303a33..b665d4f565 100644 --- a/hw/cxl/cxl-component-utils.c +++ b/hw/cxl/cxl-component-utils.c @@ -141,17 +141,19 @@ static void ras_init_common(uint32_t *reg_state, uint32_t *write_msk) * Error status is RW1C but given bits are not yet set, it can * be handled as RO. */ - reg_state[R_CXL_RAS_UNC_ERR_STATUS] = 0; + stl_le_p(reg_state + R_CXL_RAS_UNC_ERR_STATUS, 0); + stl_le_p(write_msk + R_CXL_RAS_UNC_ERR_STATUS, 0x1cfff); /* Bits 12-13 and 17-31 reserved in CXL 2.0 */ - reg_state[R_CXL_RAS_UNC_ERR_MASK] = 0x1cfff; - write_msk[R_CXL_RAS_UNC_ERR_MASK] = 0x1cfff; - reg_state[R_CXL_RAS_UNC_ERR_SEVERITY] = 0x1cfff; - write_msk[R_CXL_RAS_UNC_ERR_SEVERITY] = 0x1cfff; - reg_state[R_CXL_RAS_COR_ERR_STATUS] = 0; - reg_state[R_CXL_RAS_COR_ERR_MASK] = 0x7f; - write_msk[R_CXL_RAS_COR_ERR_MASK] = 0x7f; + stl_le_p(reg_state + R_CXL_RAS_UNC_ERR_MASK, 0x1cfff); + stl_le_p(write_msk + R_CXL_RAS_UNC_ERR_MASK, 0x1cfff); + stl_le_p(reg_state + R_CXL_RAS_UNC_ERR_SEVERITY, 0x1cfff); + stl_le_p(write_msk + R_CXL_RAS_UNC_ERR_SEVERITY, 0x1cfff); + stl_le_p(reg_state + R_CXL_RAS_COR_ERR_STATUS, 0); + stl_le_p(write_msk + R_CXL_RAS_COR_ERR_STATUS, 0x7f); + stl_le_p(reg_state + R_CXL_RAS_COR_ERR_MASK, 0x7f); + stl_le_p(write_msk + R_CXL_RAS_COR_ERR_MASK, 0x7f); /* CXL switches and devices must set */ - reg_state[R_CXL_RAS_ERR_CAP_CTRL] = 0x00; + stl_le_p(reg_state + R_CXL_RAS_ERR_CAP_CTRL, 0x200); } static void hdm_init_common(uint32_t *reg_state, uint32_t *write_msk, diff --git a/hw/cxl/cxl-host.c b/hw/cxl/cxl-host.c index 
3c1ec8732a..6e923ceeaf 100644 --- a/hw/cxl/cxl-host.c +++ b/hw/cxl/cxl-host.c @@ -146,21 +146,28 @@ static PCIDevice *cxl_cfmws_find_device(CXLFixedWindow *fw, hwaddr addr) return NULL; } - hb_cstate = cxl_get_hb_cstate(hb); - if (!hb_cstate) { - return NULL; - } + if (cxl_get_hb_passthrough(hb)) { + rp = pcie_find_port_first(hb->bus); + if (!rp) { + return NULL; + } + } else { + hb_cstate = cxl_get_hb_cstate(hb); + if (!hb_cstate) { + return NULL; + } - cache_mem = hb_cstate->crb.cache_mem_registers; + cache_mem = hb_cstate->crb.cache_mem_registers; - target_found = cxl_hdm_find_target(cache_mem, addr, &target); - if (!target_found) { - return NULL; - } + target_found = cxl_hdm_find_target(cache_mem, addr, &target); + if (!target_found) { + return NULL; + } - rp = pcie_find_port_by_pn(hb->bus, target); - if (!rp) { - return NULL; + rp = pcie_find_port_by_pn(hb->bus, target); + if (!rp) { + return NULL; + } } d = pci_bridge_get_sec_bus(PCI_BRIDGE(rp))->devices[0]; diff --git a/hw/display/meson.build b/hw/display/meson.build index f470179122..4191694380 100644 --- a/hw/display/meson.build +++ b/hw/display/meson.build @@ -14,7 +14,7 @@ softmmu_ss.add(when: 'CONFIG_PL110', if_true: files('pl110.c')) softmmu_ss.add(when: 'CONFIG_SII9022', if_true: files('sii9022.c')) softmmu_ss.add(when: 'CONFIG_SSD0303', if_true: files('ssd0303.c')) softmmu_ss.add(when: 'CONFIG_SSD0323', if_true: files('ssd0323.c')) -softmmu_ss.add(when: 'CONFIG_XEN', if_true: files('xenfb.c')) +softmmu_ss.add(when: 'CONFIG_XEN_BUS', if_true: files('xenfb.c')) softmmu_ss.add(when: 'CONFIG_VGA_PCI', if_true: files('vga-pci.c')) softmmu_ss.add(when: 'CONFIG_VGA_ISA', if_true: files('vga-isa.c')) diff --git a/hw/display/sm501.c b/hw/display/sm501.c index 17835159fc..dbabbc4339 100644 --- a/hw/display/sm501.c +++ b/hw/display/sm501.c @@ -465,6 +465,7 @@ typedef struct SM501State { uint32_t last_width; uint32_t last_height; bool do_full_update; /* perform a full update next time */ + uint8_t use_pixman; 
I2CBus *i2c_bus; /* mmio registers */ @@ -827,7 +828,7 @@ static void sm501_2d_operation(SM501State *s) de = db + (width + (height - 1) * dst_pitch) * bypp; overlap = (db < se && sb < de); } - if (overlap) { + if (overlap && (s->use_pixman & BIT(2))) { /* pixman can't do reverse blit: copy via temporary */ int tmp_stride = DIV_ROUND_UP(width * bypp, sizeof(uint32_t)); uint32_t *tmp = tmp_buf; @@ -852,13 +853,15 @@ static void sm501_2d_operation(SM501State *s) if (tmp != tmp_buf) { g_free(tmp); } - } else { + } else if (!overlap && (s->use_pixman & BIT(1))) { fallback = !pixman_blt((uint32_t *)&s->local_mem[src_base], (uint32_t *)&s->local_mem[dst_base], src_pitch * bypp / sizeof(uint32_t), dst_pitch * bypp / sizeof(uint32_t), 8 * bypp, 8 * bypp, src_x, src_y, dst_x, dst_y, width, height); + } else { + fallback = true; } if (fallback) { uint8_t *sp = s->local_mem + src_base; @@ -891,7 +894,7 @@ static void sm501_2d_operation(SM501State *s) color = cpu_to_le16(color); } - if ((width == 1 && height == 1) || + if (!(s->use_pixman & BIT(0)) || (width == 1 && height == 1) || !pixman_fill((uint32_t *)&s->local_mem[dst_base], dst_pitch * bypp / sizeof(uint32_t), 8 * bypp, dst_x, dst_y, width, height, color)) { @@ -2035,6 +2038,7 @@ static void sm501_realize_sysbus(DeviceState *dev, Error **errp) static Property sm501_sysbus_properties[] = { DEFINE_PROP_UINT32("vram-size", SM501SysBusState, vram_size, 0), + DEFINE_PROP_UINT8("x-pixman", SM501SysBusState, state.use_pixman, 7), DEFINE_PROP_END_OF_LIST(), }; @@ -2122,6 +2126,7 @@ static void sm501_realize_pci(PCIDevice *dev, Error **errp) static Property sm501_pci_properties[] = { DEFINE_PROP_UINT32("vram-size", SM501PCIState, vram_size, 64 * MiB), + DEFINE_PROP_UINT8("x-pixman", SM501PCIState, state.use_pixman, 7), DEFINE_PROP_END_OF_LIST(), }; @@ -2162,11 +2167,18 @@ static void sm501_pci_class_init(ObjectClass *klass, void *data) dc->vmsd = &vmstate_sm501_pci; } +static void sm501_pci_init(Object *o) +{ + 
object_property_set_description(o, "x-pixman", "Use pixman for: " + "1: fill, 2: blit, 4: overlap blit"); +} + static const TypeInfo sm501_pci_info = { .name = TYPE_PCI_SM501, .parent = TYPE_PCI_DEVICE, .instance_size = sizeof(SM501PCIState), .class_init = sm501_pci_class_init, + .instance_init = sm501_pci_init, .interfaces = (InterfaceInfo[]) { { INTERFACE_CONVENTIONAL_PCI_DEVICE }, { }, diff --git a/hw/display/xenfb.c b/hw/display/xenfb.c index 260eb38a76..0074a9b6f8 100644 --- a/hw/display/xenfb.c +++ b/hw/display/xenfb.c @@ -98,8 +98,9 @@ static int common_bind(struct common *c) if (xenstore_read_fe_int(&c->xendev, "event-channel", &c->xendev.remote_port) == -1) return -1; - c->page = xenforeignmemory_map(xen_fmem, c->xendev.dom, - PROT_READ | PROT_WRITE, 1, &mfn, NULL); + c->page = qemu_xen_foreignmem_map(c->xendev.dom, NULL, + PROT_READ | PROT_WRITE, 1, &mfn, + NULL); if (c->page == NULL) return -1; @@ -115,7 +116,7 @@ static void common_unbind(struct common *c) { xen_pv_unbind_evtchn(&c->xendev); if (c->page) { - xenforeignmemory_unmap(xen_fmem, c->page, 1); + qemu_xen_foreignmem_unmap(c->page, 1); c->page = NULL; } } @@ -488,27 +489,28 @@ static int xenfb_map_fb(struct XenFB *xenfb) } if (xenfb->pixels) { - munmap(xenfb->pixels, xenfb->fbpages * XC_PAGE_SIZE); + munmap(xenfb->pixels, xenfb->fbpages * XEN_PAGE_SIZE); xenfb->pixels = NULL; } - xenfb->fbpages = DIV_ROUND_UP(xenfb->fb_len, XC_PAGE_SIZE); + xenfb->fbpages = DIV_ROUND_UP(xenfb->fb_len, XEN_PAGE_SIZE); n_fbdirs = xenfb->fbpages * mode / 8; - n_fbdirs = DIV_ROUND_UP(n_fbdirs, XC_PAGE_SIZE); + n_fbdirs = DIV_ROUND_UP(n_fbdirs, XEN_PAGE_SIZE); pgmfns = g_new0(xen_pfn_t, n_fbdirs); fbmfns = g_new0(xen_pfn_t, xenfb->fbpages); xenfb_copy_mfns(mode, n_fbdirs, pgmfns, pd); - map = xenforeignmemory_map(xen_fmem, xenfb->c.xendev.dom, - PROT_READ, n_fbdirs, pgmfns, NULL); + map = qemu_xen_foreignmem_map(xenfb->c.xendev.dom, NULL, PROT_READ, + n_fbdirs, pgmfns, NULL); if (map == NULL) goto out; 
xenfb_copy_mfns(mode, xenfb->fbpages, fbmfns, map); - xenforeignmemory_unmap(xen_fmem, map, n_fbdirs); + qemu_xen_foreignmem_unmap(map, n_fbdirs); - xenfb->pixels = xenforeignmemory_map(xen_fmem, xenfb->c.xendev.dom, - PROT_READ, xenfb->fbpages, fbmfns, NULL); + xenfb->pixels = qemu_xen_foreignmem_map(xenfb->c.xendev.dom, NULL, + PROT_READ, xenfb->fbpages, + fbmfns, NULL); if (xenfb->pixels == NULL) goto out; @@ -526,8 +528,8 @@ static int xenfb_configure_fb(struct XenFB *xenfb, size_t fb_len_lim, { size_t mfn_sz = sizeof_field(struct xenfb_page, pd[0]); size_t pd_len = sizeof_field(struct xenfb_page, pd) / mfn_sz; - size_t fb_pages = pd_len * XC_PAGE_SIZE / mfn_sz; - size_t fb_len_max = fb_pages * XC_PAGE_SIZE; + size_t fb_pages = pd_len * XEN_PAGE_SIZE / mfn_sz; + size_t fb_len_max = fb_pages * XEN_PAGE_SIZE; int max_width, max_height; if (fb_len_lim > fb_len_max) { @@ -927,8 +929,8 @@ static void fb_disconnect(struct XenLegacyDevice *xendev) * Replacing the framebuffer with anonymous shared memory * instead. This releases the guest pages and keeps qemu happy. 
*/ - xenforeignmemory_unmap(xen_fmem, fb->pixels, fb->fbpages); - fb->pixels = mmap(fb->pixels, fb->fbpages * XC_PAGE_SIZE, + qemu_xen_foreignmem_unmap(fb->pixels, fb->fbpages); + fb->pixels = mmap(fb->pixels, fb->fbpages * XEN_PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0); if (fb->pixels == MAP_FAILED) { diff --git a/hw/i2c/allwinner-i2c.c b/hw/i2c/allwinner-i2c.c index a435965836..f24c3ac6f0 100644 --- a/hw/i2c/allwinner-i2c.c +++ b/hw/i2c/allwinner-i2c.c @@ -357,10 +357,16 @@ static void allwinner_i2c_write(void *opaque, hwaddr offset, s->stat = STAT_FROM_STA(STAT_IDLE); s->cntr &= ~TWI_CNTR_M_STP; } - if ((s->cntr & TWI_CNTR_INT_FLAG) == 0) { - /* Interrupt flag cleared */ + + if (!s->irq_clear_inverted && !(s->cntr & TWI_CNTR_INT_FLAG)) { + /* Write 0 to clear this flag */ + qemu_irq_lower(s->irq); + } else if (s->irq_clear_inverted && (s->cntr & TWI_CNTR_INT_FLAG)) { + /* Write 1 to clear this flag */ + s->cntr &= ~TWI_CNTR_INT_FLAG; qemu_irq_lower(s->irq); } + if ((s->cntr & TWI_CNTR_A_ACK) == 0) { if (STAT_TO_STA(s->stat) == STAT_M_DATA_RX_ACK) { s->stat = STAT_FROM_STA(STAT_M_DATA_RX_NACK); @@ -451,9 +457,25 @@ static const TypeInfo allwinner_i2c_type_info = { .class_init = allwinner_i2c_class_init, }; +static void allwinner_i2c_sun6i_init(Object *obj) +{ + AWI2CState *s = AW_I2C(obj); + + s->irq_clear_inverted = true; +} + +static const TypeInfo allwinner_i2c_sun6i_type_info = { + .name = TYPE_AW_I2C_SUN6I, + .parent = TYPE_SYS_BUS_DEVICE, + .instance_size = sizeof(AWI2CState), + .instance_init = allwinner_i2c_sun6i_init, + .class_init = allwinner_i2c_class_init, +}; + static void allwinner_i2c_register_types(void) { type_register_static(&allwinner_i2c_type_info); + type_register_static(&allwinner_i2c_sun6i_type_info); } type_init(allwinner_i2c_register_types) diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index b19fb4259e..ec857a117e 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -373,6 +373,104 @@ Aml 
*aml_pci_device_dsm(void) return method; } +static void build_append_pci_dsm_func0_common(Aml *ctx, Aml *retvar) +{ + Aml *UUID, *ifctx1; + uint8_t byte_list[1] = { 0 }; /* nothing supported yet */ + + aml_append(ctx, aml_store(aml_buffer(1, byte_list), retvar)); + /* + * PCI Firmware Specification 3.1 + * 4.6. _DSM Definitions for PCI + */ + UUID = aml_touuid("E5C937D0-3553-4D7A-9117-EA4D19C3434D"); + ifctx1 = aml_if(aml_lnot(aml_equal(aml_arg(0), UUID))); + { + /* call is for unsupported UUID, bail out */ + aml_append(ifctx1, aml_return(retvar)); + } + aml_append(ctx, ifctx1); + + ifctx1 = aml_if(aml_lless(aml_arg(1), aml_int(2))); + { + /* call is for unsupported REV, bail out */ + aml_append(ifctx1, aml_return(retvar)); + } + aml_append(ctx, ifctx1); +} + +static Aml *aml_pci_edsm(void) +{ + Aml *method, *ifctx; + Aml *zero = aml_int(0); + Aml *func = aml_arg(2); + Aml *ret = aml_local(0); + Aml *aidx = aml_local(1); + Aml *params = aml_arg(4); + + method = aml_method("EDSM", 5, AML_SERIALIZED); + + /* get supported functions */ + ifctx = aml_if(aml_equal(func, zero)); + { + /* 1: have supported functions */ + /* 7: support for function 7 */ + const uint8_t caps = 1 | BIT(7); + build_append_pci_dsm_func0_common(ifctx, ret); + aml_append(ifctx, aml_store(aml_int(caps), aml_index(ret, zero))); + aml_append(ifctx, aml_return(ret)); + } + aml_append(method, ifctx); + + /* handle specific functions requests */ + /* + * PCI Firmware Specification 3.1 + * 4.6.7. _DSM for Naming a PCI or PCI Express Device Under + * Operating Systems + */ + ifctx = aml_if(aml_equal(func, aml_int(7))); + { + Aml *pkg = aml_package(2); + aml_append(pkg, zero); + /* optional, if not impl. should return null string */ + aml_append(pkg, aml_string("%s", "")); + aml_append(ifctx, aml_store(pkg, ret)); + + /* + * IASL is fine when initializing Package with computational data, + * however it makes guest unhappy /it fails to process such AML/. 
+ * So use runtime assignment to set acpi-index after initializer + * to make OSPM happy. + */ + aml_append(ifctx, + aml_store(aml_derefof(aml_index(params, aml_int(0))), aidx)); + aml_append(ifctx, aml_store(aidx, aml_index(ret, zero))); + aml_append(ifctx, aml_return(ret)); + } + aml_append(method, ifctx); + + return method; +} + +static Aml *aml_pci_static_endpoint_dsm(PCIDevice *pdev) +{ + Aml *method; + + g_assert(pdev->acpi_index != 0); + method = aml_method("_DSM", 4, AML_SERIALIZED); + { + Aml *params = aml_local(0); + Aml *pkg = aml_package(1); + aml_append(pkg, aml_int(pdev->acpi_index)); + aml_append(method, aml_store(pkg, params)); + aml_append(method, + aml_return(aml_call5("EDSM", aml_arg(0), aml_arg(1), + aml_arg(2), aml_arg(3), params)) + ); + } + return method; +} + static void build_append_pcihp_notify_entry(Aml *method, int slot) { Aml *if_ctx; @@ -396,12 +494,6 @@ static bool is_devfn_ignored_generic(const int devfn, const PCIBus *bus) if (DEVICE(pdev)->hotplugged) { return true; } - } else if (!get_dev_aml_func(DEVICE(pdev))) { - /* - * Ignore all other devices on !0 functions unless they - * have AML description (i.e have get_dev_aml_func() != 0) - */ - return true; } } return false; @@ -428,12 +520,14 @@ static bool is_devfn_ignored_hotplug(const int devfn, const PCIBus *bus) return false; } -static void build_append_pcihp_slots(Aml *parent_scope, PCIBus *bus, - QObject *bsel) +void build_append_pcihp_slots(Aml *parent_scope, PCIBus *bus) { int devfn; Aml *dev, *notify_method = NULL, *method; + QObject *bsel = object_property_get_qobject(OBJECT(bus), + ACPI_PCIHP_PROP_BSEL, NULL); uint64_t bsel_val = qnum_get_uint(qobject_to(QNum, bsel)); + qobject_unref(bsel); aml_append(parent_scope, aml_name_decl("BSEL", aml_int(bsel_val))); notify_method = aml_method("DVNT", 2, AML_NOTSERIALIZED); @@ -478,12 +572,9 @@ static void build_append_pcihp_slots(Aml *parent_scope, PCIBus *bus, void build_append_pci_bus_devices(Aml *parent_scope, PCIBus *bus) { - 
QObject *bsel; int devfn; Aml *dev; - bsel = object_property_get_qobject(OBJECT(bus), ACPI_PCIHP_PROP_BSEL, NULL); - for (devfn = 0; devfn < ARRAY_SIZE(bus->devices); devfn++) { /* ACPI spec: 1.0b: Table 6-2 _ADR Object Bus Types, PCI type */ int adr = PCI_SLOT(devfn) << 16 | PCI_FUNC(devfn); @@ -498,16 +589,16 @@ void build_append_pci_bus_devices(Aml *parent_scope, PCIBus *bus) aml_append(dev, aml_name_decl("_ADR", aml_int(adr))); call_dev_aml_func(DEVICE(bus->devices[devfn]), dev); + /* add _DSM if device has acpi-index set */ + if (pdev->acpi_index && + !object_property_get_bool(OBJECT(pdev), "hotpluggable", + &error_abort)) { + aml_append(dev, aml_pci_static_endpoint_dsm(pdev)); + } /* device descriptor has been composed, add it into parent context */ aml_append(parent_scope, dev); } - - if (bsel) { - build_append_pcihp_slots(parent_scope, bus, bsel); - } - - qobject_unref(bsel); } static bool build_append_notfication_callback(Aml *parent_scope, @@ -517,16 +608,24 @@ static bool build_append_notfication_callback(Aml *parent_scope, PCIBus *sec; QObject *bsel; int nr_notifiers = 0; + GQueue *pcnt_bus_list = g_queue_new(); QLIST_FOREACH(sec, &bus->child, sibling) { Aml *br_scope = aml_scope("S%.02X", sec->parent_dev->devfn); - if (pci_bus_is_root(sec) || - !object_property_find(OBJECT(sec), ACPI_PCIHP_PROP_BSEL)) { + if (pci_bus_is_root(sec)) { continue; } nr_notifiers = nr_notifiers + build_append_notfication_callback(br_scope, sec); - aml_append(parent_scope, br_scope); + /* + * add new child scope to parent + * and keep track of bus that have PCNT, + * bus list is used later to call children PCNTs from this level PCNT + */ + if (nr_notifiers) { + g_queue_push_tail(pcnt_bus_list, sec); + aml_append(parent_scope, br_scope); + } } /* @@ -550,30 +649,25 @@ static bool build_append_notfication_callback(Aml *parent_scope, } /* Notify about child bus events in any case */ - QLIST_FOREACH(sec, &bus->child, sibling) { - if (pci_bus_is_root(sec) || - 
!object_property_find(OBJECT(sec), ACPI_PCIHP_PROP_BSEL)) { - continue; - } - + while ((sec = g_queue_pop_head(pcnt_bus_list))) { aml_append(method, aml_name("^S%.02X.PCNT", sec->parent_dev->devfn)); } aml_append(parent_scope, method); qobject_unref(bsel); + g_queue_free(pcnt_bus_list); return !!nr_notifiers; } static Aml *aml_pci_pdsm(void) { - Aml *method, *UUID, *ifctx, *ifctx1; + Aml *method, *ifctx, *ifctx1; Aml *ret = aml_local(0); Aml *caps = aml_local(1); Aml *acpi_index = aml_local(2); Aml *zero = aml_int(0); Aml *one = aml_int(1); Aml *func = aml_arg(2); - Aml *rev = aml_arg(1); Aml *params = aml_arg(4); Aml *bnum = aml_derefof(aml_index(params, aml_int(0))); Aml *sunum = aml_derefof(aml_index(params, aml_int(1))); @@ -583,29 +677,9 @@ static Aml *aml_pci_pdsm(void) /* get supported functions */ ifctx = aml_if(aml_equal(func, zero)); { - uint8_t byte_list[1] = { 0 }; /* nothing supported yet */ - aml_append(ifctx, aml_store(aml_buffer(1, byte_list), ret)); - aml_append(ifctx, aml_store(zero, caps)); - - /* - * PCI Firmware Specification 3.1 - * 4.6. 
_DSM Definitions for PCI - */ - UUID = aml_touuid("E5C937D0-3553-4D7A-9117-EA4D19C3434D"); - ifctx1 = aml_if(aml_lnot(aml_equal(aml_arg(0), UUID))); - { - /* call is for unsupported UUID, bail out */ - aml_append(ifctx1, aml_return(ret)); - } - aml_append(ifctx, ifctx1); - - ifctx1 = aml_if(aml_lless(rev, aml_int(2))); - { - /* call is for unsupported REV, bail out */ - aml_append(ifctx1, aml_return(ret)); - } - aml_append(ifctx, ifctx1); + build_append_pci_dsm_func0_common(ifctx, ret); + aml_append(ifctx, aml_store(zero, caps)); aml_append(ifctx, aml_store(aml_call2("AIDX", bnum, sunum), acpi_index)); /* @@ -1388,6 +1462,7 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0A03"))); aml_append(dev, aml_name_decl("_ADR", aml_int(0))); aml_append(dev, aml_name_decl("_UID", aml_int(pcmc->pci_root_uid))); + aml_append(dev, aml_pci_edsm()); aml_append(sb_scope, dev); aml_append(dsdt, sb_scope); @@ -1403,6 +1478,7 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, aml_append(dev, aml_name_decl("_ADR", aml_int(0))); aml_append(dev, aml_name_decl("_UID", aml_int(pcmc->pci_root_uid))); aml_append(dev, build_q35_osc_method(!pm->pcihp_bridge_en)); + aml_append(dev, aml_pci_edsm()); aml_append(sb_scope, dev); if (mcfg_valid) { aml_append(sb_scope, build_q35_dram_controller(&mcfg)); @@ -1710,6 +1786,9 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, Aml *scope = aml_scope("PCI0"); /* Scan all PCI buses. Generate tables to support hotplug. 
*/ build_append_pci_bus_devices(scope, bus); + if (object_property_find(OBJECT(bus), ACPI_PCIHP_PROP_BSEL)) { + build_append_pcihp_slots(scope, bus); + } aml_append(sb_scope, scope); } } diff --git a/hw/i386/kvm/meson.build b/hw/i386/kvm/meson.build index 82dd6ae7c6..6621ba5cd7 100644 --- a/hw/i386/kvm/meson.build +++ b/hw/i386/kvm/meson.build @@ -9,6 +9,7 @@ i386_kvm_ss.add(when: 'CONFIG_XEN_EMU', if_true: files( 'xen_evtchn.c', 'xen_gnttab.c', 'xen_xenstore.c', + 'xenstore_impl.c', )) i386_ss.add_all(when: 'CONFIG_KVM', if_true: i386_kvm_ss) diff --git a/hw/i386/kvm/trace-events b/hw/i386/kvm/trace-events index b83c3eb965..e4c82de6f3 100644 --- a/hw/i386/kvm/trace-events +++ b/hw/i386/kvm/trace-events @@ -3,3 +3,18 @@ kvm_xen_unmap_pirq(int pirq, int gsi) "pirq %d gsi %d" kvm_xen_get_free_pirq(int pirq, int type) "pirq %d type %d" kvm_xen_bind_pirq(int pirq, int port) "pirq %d port %d" kvm_xen_unmask_pirq(int pirq, char *dev, int vector) "pirq %d dev %s vector %d" +xenstore_error(unsigned int id, unsigned int tx_id, const char *err) "req %u tx %u err %s" +xenstore_read(unsigned int tx_id, const char *path) "tx %u path %s" +xenstore_write(unsigned int tx_id, const char *path) "tx %u path %s" +xenstore_mkdir(unsigned int tx_id, const char *path) "tx %u path %s" +xenstore_directory(unsigned int tx_id, const char *path) "tx %u path %s" +xenstore_directory_part(unsigned int tx_id, const char *path, unsigned int offset) "tx %u path %s offset %u" +xenstore_transaction_start(unsigned int new_tx) "new_tx %u" +xenstore_transaction_end(unsigned int tx_id, bool commit) "tx %u commit %d" +xenstore_rm(unsigned int tx_id, const char *path) "tx %u path %s" +xenstore_get_perms(unsigned int tx_id, const char *path) "tx %u path %s" +xenstore_set_perms(unsigned int tx_id, const char *path) "tx %u path %s" +xenstore_watch(const char *path, const char *token) "path %s token %s" +xenstore_unwatch(const char *path, const char *token) "path %s token %s" +xenstore_reset_watches(void) "" 
+xenstore_watch_event(const char *path, const char *token) "path %s token %s" diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c index 886fbf6b3b..98a7b85047 100644 --- a/hw/i386/kvm/xen_evtchn.c +++ b/hw/i386/kvm/xen_evtchn.c @@ -34,6 +34,7 @@ #include "hw/pci/msi.h" #include "hw/pci/msix.h" #include "hw/irq.h" +#include "hw/xen/xen_backend_ops.h" #include "xen_evtchn.h" #include "xen_overlay.h" @@ -278,6 +279,17 @@ static const TypeInfo xen_evtchn_info = { .class_init = xen_evtchn_class_init, }; +static struct evtchn_backend_ops emu_evtchn_backend_ops = { + .open = xen_be_evtchn_open, + .bind_interdomain = xen_be_evtchn_bind_interdomain, + .unbind = xen_be_evtchn_unbind, + .close = xen_be_evtchn_close, + .get_fd = xen_be_evtchn_fd, + .notify = xen_be_evtchn_notify, + .unmask = xen_be_evtchn_unmask, + .pending = xen_be_evtchn_pending, +}; + static void gsi_assert_bh(void *opaque) { struct vcpu_info *vi = kvm_xen_get_vcpu_info_hva(0); @@ -318,6 +330,9 @@ void xen_evtchn_create(void) s->nr_pirq_inuse_words = DIV_ROUND_UP(s->nr_pirqs, 64); s->pirq_inuse_bitmap = g_new0(uint64_t, s->nr_pirq_inuse_words); s->pirq = g_new0(struct pirq_info, s->nr_pirqs); + + /* Set event channel functions for backend drivers to use */ + xen_evtchn_ops = &emu_evtchn_backend_ops; } void xen_evtchn_connect_gsis(qemu_irq *system_gsis) diff --git a/hw/i386/kvm/xen_gnttab.c b/hw/i386/kvm/xen_gnttab.c index 1e691ded32..21c30e3659 100644 --- a/hw/i386/kvm/xen_gnttab.c +++ b/hw/i386/kvm/xen_gnttab.c @@ -22,6 +22,7 @@ #include "hw/sysbus.h" #include "hw/xen/xen.h" +#include "hw/xen/xen_backend_ops.h" #include "xen_overlay.h" #include "xen_gnttab.h" @@ -34,11 +35,10 @@ #define TYPE_XEN_GNTTAB "xen-gnttab" OBJECT_DECLARE_SIMPLE_TYPE(XenGnttabState, XEN_GNTTAB) -#define XEN_PAGE_SHIFT 12 -#define XEN_PAGE_SIZE (1ULL << XEN_PAGE_SHIFT) - #define ENTRIES_PER_FRAME_V1 (XEN_PAGE_SIZE / sizeof(grant_entry_v1_t)) +static struct gnttab_backend_ops emu_gnttab_backend_ops; + struct 
XenGnttabState { /*< private >*/ SysBusDevice busdev; @@ -57,6 +57,8 @@ struct XenGnttabState { MemoryRegion gnt_frames; MemoryRegion *gnt_aliases; uint64_t *gnt_frame_gpas; + + uint8_t *map_track; }; struct XenGnttabState *xen_gnttab_singleton; @@ -70,13 +72,11 @@ static void xen_gnttab_realize(DeviceState *dev, Error **errp) error_setg(errp, "Xen grant table support is for Xen emulation"); return; } - s->nr_frames = 0; s->max_frames = kvm_xen_get_gnttab_max_frames(); memory_region_init_ram(&s->gnt_frames, OBJECT(dev), "xen:grant_table", XEN_PAGE_SIZE * s->max_frames, &error_abort); memory_region_set_enabled(&s->gnt_frames, true); s->entries.v1 = memory_region_get_ram_ptr(&s->gnt_frames); - memset(s->entries.v1, 0, XEN_PAGE_SIZE * s->max_frames); /* Create individual page-sizes aliases for overlays */ s->gnt_aliases = (void *)g_new0(MemoryRegion, s->max_frames); @@ -88,9 +88,18 @@ static void xen_gnttab_realize(DeviceState *dev, Error **errp) s->gnt_frame_gpas[i] = INVALID_GPA; } + s->nr_frames = 0; + memset(s->entries.v1, 0, XEN_PAGE_SIZE * s->max_frames); + s->entries.v1[GNTTAB_RESERVED_XENSTORE].flags = GTF_permit_access; + s->entries.v1[GNTTAB_RESERVED_XENSTORE].frame = XEN_SPECIAL_PFN(XENSTORE); + qemu_mutex_init(&s->gnt_lock); xen_gnttab_singleton = s; + + s->map_track = g_new0(uint8_t, s->max_frames * ENTRIES_PER_FRAME_V1); + + xen_gnttab_ops = &emu_gnttab_backend_ops; } static int xen_gnttab_post_load(void *opaque, int version_id) @@ -230,3 +239,309 @@ int xen_gnttab_query_size_op(struct gnttab_query_size *size) size->max_nr_frames = s->max_frames; return 0; } + +/* Track per-open refs, to allow close() to clean up. 
*/ +struct active_ref { + MemoryRegionSection mrs; + void *virtaddr; + uint32_t refcnt; + int prot; +}; + +static void gnt_unref(XenGnttabState *s, grant_ref_t ref, + MemoryRegionSection *mrs, int prot) +{ + if (mrs && mrs->mr) { + if (prot & PROT_WRITE) { + memory_region_set_dirty(mrs->mr, mrs->offset_within_region, + XEN_PAGE_SIZE); + } + memory_region_unref(mrs->mr); + mrs->mr = NULL; + } + assert(s->map_track[ref] != 0); + + if (--s->map_track[ref] == 0) { + grant_entry_v1_t *gnt_p = &s->entries.v1[ref]; + qatomic_and(&gnt_p->flags, (uint16_t)~(GTF_reading | GTF_writing)); + } +} + +static uint64_t gnt_ref(XenGnttabState *s, grant_ref_t ref, int prot) +{ + uint16_t mask = GTF_type_mask | GTF_sub_page; + grant_entry_v1_t gnt, *gnt_p; + int retries = 0; + + if (ref >= s->max_frames * ENTRIES_PER_FRAME_V1 || + s->map_track[ref] == UINT8_MAX) { + return INVALID_GPA; + } + + if (prot & PROT_WRITE) { + mask |= GTF_readonly; + } + + gnt_p = &s->entries.v1[ref]; + + /* + * The guest can legitimately be changing the GTF_readonly flag. Allow + * that, but don't let a malicious guest cause a livelock. 
+ */ + for (retries = 0; retries < 5; retries++) { + uint16_t new_flags; + + /* Read the entry before an atomic operation on its flags */ + gnt = *(volatile grant_entry_v1_t *)gnt_p; + + if ((gnt.flags & mask) != GTF_permit_access || + gnt.domid != DOMID_QEMU) { + return INVALID_GPA; + } + + new_flags = gnt.flags | GTF_reading; + if (prot & PROT_WRITE) { + new_flags |= GTF_writing; + } + + if (qatomic_cmpxchg(&gnt_p->flags, gnt.flags, new_flags) == gnt.flags) { + return (uint64_t)gnt.frame << XEN_PAGE_SHIFT; + } + } + + return INVALID_GPA; +} + +struct xengntdev_handle { + GHashTable *active_maps; +}; + +static int xen_be_gnttab_set_max_grants(struct xengntdev_handle *xgt, + uint32_t nr_grants) +{ + return 0; +} + +/* + * Map one grant reference for this handle and return its host address. + * Only count == 1 is accepted. A ref that is already mapped through @xgt + * reuses the existing active_ref (its refcnt is bumped, and the mapping is + * upgraded if write access is newly requested). Returns NULL with errno + * set on failure. + */ +static void *xen_be_gnttab_map_refs(struct xengntdev_handle *xgt, + uint32_t count, uint32_t domid, + uint32_t *refs, int prot) +{ + XenGnttabState *s = xen_gnttab_singleton; + struct active_ref *act; + + if (!s) { + errno = ENOTSUP; + return NULL; + } + + if (domid != xen_domid) { + errno = EINVAL; + return NULL; + } + + if (!count || count > 4096) { + errno = EINVAL; + return NULL; + } + + /* + * Making a contiguous mapping from potentially discontiguous grant + * references would be... distinctly non-trivial. We don't support it. + * Even changing the API to return an array of pointers, one per page, + * wouldn't be simple to use in PV backends because some structures + * actually cross page boundaries (e.g. 32-bit blkif_response ring + * entries are 12 bytes). 
+ */ + if (count != 1) { + errno = EINVAL; + return NULL; + } + + QEMU_LOCK_GUARD(&s->gnt_lock); + + act = g_hash_table_lookup(xgt->active_maps, GINT_TO_POINTER(refs[0])); + if (act) { + if ((prot & PROT_WRITE) && !(act->prot & PROT_WRITE)) { + if (gnt_ref(s, refs[0], prot) == INVALID_GPA) { + /* Callers read errno when NULL is returned, so set it here too */ + errno = EINVAL; + return NULL; + } + act->prot |= PROT_WRITE; + } + act->refcnt++; + } else { + uint64_t gpa = gnt_ref(s, refs[0], prot); + if (gpa == INVALID_GPA) { + errno = EINVAL; + return NULL; + } + + /* + * Count this mapping as soon as gnt_ref() has succeeded: gnt_unref() + * asserts map_track[ref] != 0 before decrementing, so the failure + * path below must only reach it once the mapping has been counted. + */ + s->map_track[refs[0]]++; + + act = g_new0(struct active_ref, 1); + act->prot = prot; + act->refcnt = 1; + act->mrs = memory_region_find(get_system_memory(), gpa, XEN_PAGE_SIZE); + + if (act->mrs.mr && + !int128_lt(act->mrs.size, int128_make64(XEN_PAGE_SIZE)) && + memory_region_get_ram_addr(act->mrs.mr) != RAM_ADDR_INVALID) { + act->virtaddr = qemu_map_ram_ptr(act->mrs.mr->ram_block, + act->mrs.offset_within_region); + } + if (!act->virtaddr) { + gnt_unref(s, refs[0], &act->mrs, 0); + g_free(act); + errno = EINVAL; + return NULL; + } + + g_hash_table_insert(xgt->active_maps, GINT_TO_POINTER(refs[0]), act); + } + + return act->virtaddr; +} + +static gboolean do_unmap(gpointer key, gpointer value, gpointer user_data) +{ + XenGnttabState *s = user_data; + grant_ref_t gref = GPOINTER_TO_INT(key); + struct active_ref *act = value; + + gnt_unref(s, gref, &act->mrs, act->prot); + g_free(act); + return true; +} + +static int xen_be_gnttab_unmap(struct xengntdev_handle *xgt, + void *start_address, uint32_t *refs, + uint32_t count) +{ + XenGnttabState *s = xen_gnttab_singleton; + struct active_ref *act; + + if (!s) { + return -ENOTSUP; + } + + if (count != 1) { + return -EINVAL; + } + + QEMU_LOCK_GUARD(&s->gnt_lock); + + act = g_hash_table_lookup(xgt->active_maps, GINT_TO_POINTER(refs[0])); + if (!act) { + return -ENOENT; + } + + if (act->virtaddr != start_address) { + return -EINVAL; + } + + if (!--act->refcnt) { + do_unmap(GINT_TO_POINTER(refs[0]), act, s); + g_hash_table_remove(xgt->active_maps, 
GINT_TO_POINTER(refs[0])); + } + + return 0; +} + +/* + * This looks a bit like the one for true Xen in xen-operations.c but + * in emulation we don't support multi-page mappings. And under Xen we + * *want* the multi-page mappings so we have fewer bounces through the + * kernel and the hypervisor. So the code paths end up being similar, + * but different. + */ +static int xen_be_gnttab_copy(struct xengntdev_handle *xgt, bool to_domain, + uint32_t domid, XenGrantCopySegment *segs, + uint32_t nr_segs, Error **errp) +{ + int prot = to_domain ? PROT_WRITE : PROT_READ; + unsigned int i; + + for (i = 0; i < nr_segs; i++) { + XenGrantCopySegment *seg = &segs[i]; + void *page; + uint32_t ref = to_domain ? seg->dest.foreign.ref : + seg->source.foreign.ref; + + page = xen_be_gnttab_map_refs(xgt, 1, domid, &ref, prot); + if (!page) { + if (errp) { + error_setg_errno(errp, errno, + "xen_be_gnttab_map_refs failed"); + } + return -errno; + } + + if (to_domain) { + memcpy(page + seg->dest.foreign.offset, seg->source.virt, + seg->len); + } else { + memcpy(seg->dest.virt, page + seg->source.foreign.offset, + seg->len); + } + + if (xen_be_gnttab_unmap(xgt, page, &ref, 1)) { + if (errp) { + error_setg_errno(errp, errno, "xen_be_gnttab_unmap failed"); + } + return -errno; + } + } + + return 0; +} + +static struct xengntdev_handle *xen_be_gnttab_open(void) +{ + struct xengntdev_handle *xgt = g_new0(struct xengntdev_handle, 1); + + xgt->active_maps = g_hash_table_new(g_direct_hash, g_direct_equal); + return xgt; +} + +static int xen_be_gnttab_close(struct xengntdev_handle *xgt) +{ + XenGnttabState *s = xen_gnttab_singleton; + + if (!s) { + return -ENOTSUP; + } + + g_hash_table_foreach_remove(xgt->active_maps, do_unmap, s); + g_hash_table_destroy(xgt->active_maps); + g_free(xgt); + return 0; +} + +static struct gnttab_backend_ops emu_gnttab_backend_ops = { + .open = xen_be_gnttab_open, + .close = xen_be_gnttab_close, + .grant_copy = xen_be_gnttab_copy, + .set_max_grants = 
xen_be_gnttab_set_max_grants, + .map_refs = xen_be_gnttab_map_refs, + .unmap = xen_be_gnttab_unmap, +}; + +int xen_gnttab_reset(void) +{ + XenGnttabState *s = xen_gnttab_singleton; + + if (!s) { + return -ENOTSUP; + } + + QEMU_LOCK_GUARD(&s->gnt_lock); + + s->nr_frames = 0; + + memset(s->entries.v1, 0, XEN_PAGE_SIZE * s->max_frames); + + s->entries.v1[GNTTAB_RESERVED_XENSTORE].flags = GTF_permit_access; + s->entries.v1[GNTTAB_RESERVED_XENSTORE].frame = XEN_SPECIAL_PFN(XENSTORE); + + memset(s->map_track, 0, s->max_frames * ENTRIES_PER_FRAME_V1); + + return 0; +} diff --git a/hw/i386/kvm/xen_gnttab.h b/hw/i386/kvm/xen_gnttab.h index 3bdbe96191..ee215239b0 100644 --- a/hw/i386/kvm/xen_gnttab.h +++ b/hw/i386/kvm/xen_gnttab.h @@ -13,6 +13,7 @@ #define QEMU_XEN_GNTTAB_H void xen_gnttab_create(void); +int xen_gnttab_reset(void); int xen_gnttab_map_page(uint64_t idx, uint64_t gfn); struct gnttab_set_version; diff --git a/hw/i386/kvm/xen_xenstore.c b/hw/i386/kvm/xen_xenstore.c index 14193ef3f9..2cadafd56a 100644 --- a/hw/i386/kvm/xen_xenstore.c +++ b/hw/i386/kvm/xen_xenstore.c @@ -21,6 +21,7 @@ #include "hw/sysbus.h" #include "hw/xen/xen.h" +#include "hw/xen/xen_backend_ops.h" #include "xen_overlay.h" #include "xen_evtchn.h" #include "xen_xenstore.h" @@ -28,15 +29,17 @@ #include "sysemu/kvm.h" #include "sysemu/kvm_xen.h" +#include "trace.h" + +#include "xenstore_impl.h" + #include "hw/xen/interface/io/xs_wire.h" #include "hw/xen/interface/event_channel.h" +#include "hw/xen/interface/grant_table.h" #define TYPE_XEN_XENSTORE "xen-xenstore" OBJECT_DECLARE_SIMPLE_TYPE(XenXenstoreState, XEN_XENSTORE) -#define XEN_PAGE_SHIFT 12 -#define XEN_PAGE_SIZE (1ULL << XEN_PAGE_SHIFT) - #define ENTRIES_PER_FRAME_V1 (XEN_PAGE_SIZE / sizeof(grant_entry_v1_t)) #define ENTRIES_PER_FRAME_V2 (XEN_PAGE_SIZE / sizeof(grant_entry_v2_t)) @@ -47,6 +50,9 @@ struct XenXenstoreState { SysBusDevice busdev; /*< public >*/ + XenstoreImplState *impl; + GList *watch_events; /* for the guest */ + MemoryRegion 
xenstore_page; struct xenstore_domain_interface *xs; uint8_t req_data[XENSTORE_HEADER_SIZE + XENSTORE_PAYLOAD_MAX]; @@ -59,15 +65,54 @@ struct XenXenstoreState { evtchn_port_t guest_port; evtchn_port_t be_port; struct xenevtchn_handle *eh; + + uint8_t *impl_state; + uint32_t impl_state_size; + + struct xengntdev_handle *gt; + void *granted_xs; }; struct XenXenstoreState *xen_xenstore_singleton; static void xen_xenstore_event(void *opaque); +static void fire_watch_cb(void *opaque, const char *path, const char *token); + +static struct xenstore_backend_ops emu_xenstore_backend_ops; + +static void G_GNUC_PRINTF (4, 5) relpath_printf(XenXenstoreState *s, + GList *perms, + const char *relpath, + const char *fmt, ...) +{ + gchar *abspath; + gchar *value; + va_list args; + GByteArray *data; + int err; + + abspath = g_strdup_printf("/local/domain/%u/%s", xen_domid, relpath); + va_start(args, fmt); + value = g_strdup_vprintf(fmt, args); + va_end(args); + + data = g_byte_array_new_take((void *)value, strlen(value)); + + err = xs_impl_write(s->impl, DOMID_QEMU, XBT_NULL, abspath, data); + assert(!err); + + g_byte_array_unref(data); + + err = xs_impl_set_perms(s->impl, DOMID_QEMU, XBT_NULL, abspath, perms); + assert(!err); + + g_free(abspath); +} static void xen_xenstore_realize(DeviceState *dev, Error **errp) { XenXenstoreState *s = XEN_XENSTORE(dev); + GList *perms; if (xen_mode != XEN_EMULATE) { error_setg(errp, "Xen xenstore support is for Xen emulation"); @@ -89,6 +134,50 @@ static void xen_xenstore_realize(DeviceState *dev, Error **errp) } aio_set_fd_handler(qemu_get_aio_context(), xen_be_evtchn_fd(s->eh), true, xen_xenstore_event, NULL, NULL, NULL, s); + + s->impl = xs_impl_create(xen_domid); + + /* Populate the default nodes */ + + /* Nodes owned by 'dom0' but readable by the guest */ + perms = g_list_append(NULL, xs_perm_as_string(XS_PERM_NONE, DOMID_QEMU)); + perms = g_list_append(perms, xs_perm_as_string(XS_PERM_READ, xen_domid)); + + relpath_printf(s, perms, "", 
"%s", ""); + + relpath_printf(s, perms, "domid", "%u", xen_domid); + + relpath_printf(s, perms, "control/platform-feature-xs_reset_watches", "%u", 1); + relpath_printf(s, perms, "control/platform-feature-multiprocessor-suspend", "%u", 1); + + relpath_printf(s, perms, "platform/acpi", "%u", 1); + relpath_printf(s, perms, "platform/acpi_s3", "%u", 1); + relpath_printf(s, perms, "platform/acpi_s4", "%u", 1); + relpath_printf(s, perms, "platform/acpi_laptop_slate", "%u", 0); + + g_list_free_full(perms, g_free); + + /* Nodes owned by the guest */ + perms = g_list_append(NULL, xs_perm_as_string(XS_PERM_NONE, xen_domid)); + + relpath_printf(s, perms, "attr", "%s", ""); + + relpath_printf(s, perms, "control/shutdown", "%s", ""); + relpath_printf(s, perms, "control/feature-poweroff", "%u", 1); + relpath_printf(s, perms, "control/feature-reboot", "%u", 1); + relpath_printf(s, perms, "control/feature-suspend", "%u", 1); + relpath_printf(s, perms, "control/feature-s3", "%u", 1); + relpath_printf(s, perms, "control/feature-s4", "%u", 1); + + relpath_printf(s, perms, "data", "%s", ""); + relpath_printf(s, perms, "device", "%s", ""); + relpath_printf(s, perms, "drivers", "%s", ""); + relpath_printf(s, perms, "error", "%s", ""); + relpath_printf(s, perms, "feature", "%s", ""); + + g_list_free_full(perms, g_free); + + xen_xenstore_ops = &emu_xenstore_backend_ops; } static bool xen_xenstore_is_needed(void *opaque) @@ -99,16 +188,26 @@ static bool xen_xenstore_is_needed(void *opaque) static int xen_xenstore_pre_save(void *opaque) { XenXenstoreState *s = opaque; + GByteArray *save; if (s->eh) { s->guest_port = xen_be_evtchn_get_guest_port(s->eh); } + + g_free(s->impl_state); + save = xs_impl_serialize(s->impl); + s->impl_state = save->data; + s->impl_state_size = save->len; + g_byte_array_free(save, false); + return 0; } static int xen_xenstore_post_load(void *opaque, int ver) { XenXenstoreState *s = opaque; + GByteArray *save; + int ret; /* * As qemu/dom0, rebind to the guest's port. 
The Windows drivers may @@ -125,11 +224,18 @@ static int xen_xenstore_post_load(void *opaque, int ver) } s->be_port = be_port; } - return 0; + + save = g_byte_array_new_take(s->impl_state, s->impl_state_size); + s->impl_state = NULL; + s->impl_state_size = 0; + + ret = xs_impl_deserialize(s->impl, save, xen_domid, fire_watch_cb, s); + return ret; } static const VMStateDescription xen_xenstore_vmstate = { .name = "xen_xenstore", + .unmigratable = 1, /* The PV back ends don't migrate yet */ .version_id = 1, .minimum_version_id = 1, .needed = xen_xenstore_is_needed, @@ -145,6 +251,10 @@ static const VMStateDescription xen_xenstore_vmstate = { VMSTATE_BOOL(rsp_pending, XenXenstoreState), VMSTATE_UINT32(guest_port, XenXenstoreState), VMSTATE_BOOL(fatal_error, XenXenstoreState), + VMSTATE_UINT32(impl_state_size, XenXenstoreState), + VMSTATE_VARRAY_UINT32_ALLOC(impl_state, XenXenstoreState, + impl_state_size, 0, + vmstate_info_uint8, uint8_t), VMSTATE_END_OF_LIST() } }; @@ -213,20 +323,761 @@ static void reset_rsp(XenXenstoreState *s) s->rsp_offset = 0; } +static void xs_error(XenXenstoreState *s, unsigned int id, + xs_transaction_t tx_id, int errnum) +{ + struct xsd_sockmsg *rsp = (struct xsd_sockmsg *)s->rsp_data; + const char *errstr = NULL; + + for (unsigned int i = 0; i < ARRAY_SIZE(xsd_errors); i++) { + struct xsd_errors *xsd_error = &xsd_errors[i]; + + if (xsd_error->errnum == errnum) { + errstr = xsd_error->errstring; + break; + } + } + assert(errstr); + + trace_xenstore_error(id, tx_id, errstr); + + rsp->type = XS_ERROR; + rsp->req_id = id; + rsp->tx_id = tx_id; + rsp->len = (uint32_t)strlen(errstr) + 1; + + memcpy(&rsp[1], errstr, rsp->len); +} + +static void xs_ok(XenXenstoreState *s, unsigned int type, unsigned int req_id, + xs_transaction_t tx_id) +{ + struct xsd_sockmsg *rsp = (struct xsd_sockmsg *)s->rsp_data; + const char *okstr = "OK"; + + rsp->type = type; + rsp->req_id = req_id; + rsp->tx_id = tx_id; + rsp->len = (uint32_t)strlen(okstr) + 1; + + 
memcpy(&rsp[1], okstr, rsp->len); +} + +/* + * The correct request and response formats are documented in xen.git: + * docs/misc/xenstore.txt. A summary is given below for convenience. + * The '|' symbol represents a NUL character. + * + * ---------- Database read, write and permissions operations ---------- + * + * READ <path>| <value|> + * WRITE <path>|<value|> + * Store and read the octet string <value> at <path>. + * WRITE creates any missing parent paths, with empty values. + * + * MKDIR <path>| + * Ensures that the <path> exists, if necessary by creating + * it and any missing parents with empty values. If <path> + * or any parent already exists, its value is left unchanged. + * + * RM <path>| + * Ensures that the <path> does not exist, by deleting + * it and all of its children. It is not an error if <path> does + * not exist, but it _is_ an error if <path>'s immediate parent + * does not exist either. + * + * DIRECTORY <path>| <child-leaf-name>|* + * Gives a list of the immediate children of <path>, as only the + * leafnames. The resulting children are each named + * <path>/<child-leaf-name>. + * + * DIRECTORY_PART <path>|<offset> <gencnt>|<child-leaf-name>|* + * Same as DIRECTORY, but to be used for children lists longer than + * XENSTORE_PAYLOAD_MAX. Inputs are <path> and the byte offset into + * the list of children to return. Return values are the generation + * count <gencnt> of the node (to be used to ensure the node hasn't + * changed between two reads: <gencnt> being the same for multiple + * reads guarantees the node hasn't changed) and the list of children + * starting at the specified <offset> of the complete list. + * + * GET_PERMS <path>| <perm-as-string>|+ + * SET_PERMS <path>|<perm-as-string>|+? 
+ * <perm-as-string> is one of the following + * w<domid> write only + * r<domid> read only + * b<domid> both read and write + * n<domid> no access + * See https://wiki.xen.org/wiki/XenBus section + * `Permissions' for details of the permissions system. + * It is possible to set permissions for the special watch paths + * "@introduceDomain" and "@releaseDomain" to enable receiving those + * watches in unprivileged domains. + * + * ---------- Watches ---------- + * + * WATCH <wpath>|<token>|? + * Adds a watch. + * + * When a <path> is modified (including path creation, removal, + * contents change or permissions change) this generates an event + * on the changed <path>. Changes made in transactions cause an + * event only if and when committed. Each occurring event is + * matched against all the watches currently set up, and each + * matching watch results in a WATCH_EVENT message (see below). + * + * The event's path matches the watch's <wpath> if it is a child + * of <wpath>. + * + * <wpath> can be a <path> to watch or @<wspecial>. In the + * latter case <wspecial> may have any syntax but it matches + * (according to the rules above) only the following special + * events which are invented by xenstored: + * @introduceDomain occurs on INTRODUCE + * @releaseDomain occurs on any domain crash or + * shutdown, and also on RELEASE + * and domain destruction + * <wspecial> events are sent to privileged callers or explicitly + * via SET_PERMS enabled domains only. + * + * When a watch is first set up it is triggered once straight + * away, with <path> equal to <wpath>. Watches may be triggered + * spuriously. The tx_id in a WATCH request is ignored. + * + * Watches are supposed to be restricted by the permissions + * system but in practice the implementation is imperfect. 
+ * Applications should not rely on being sent a notification for + * paths that they cannot read; however, an application may rely + * on being sent a watch when a path which it _is_ able to read + * is deleted even if that leaves only a nonexistent unreadable + * parent. A notification may be omitted if a node's permissions + * are changed so as to make it unreadable, in which case future + * notifications may be suppressed (and if the node is later made + * readable, some notifications may have been lost). + * + * WATCH_EVENT <epath>|<token>| + * Unsolicited `reply' generated for matching modification events + * as described above. req_id and tx_id are both 0. + * + * <epath> is the event's path, i.e. the actual path that was + * modified; however if the event was the recursive removal of a + * parent of <wpath>, <epath> is just + * <wpath> (rather than the actual path which was removed). So + * <epath> is a child of <wpath>, regardless. + * + * Iff <wpath> for the watch was specified as a relative pathname, + * the <epath> path will also be relative (with the same base, + * obviously). + * + * UNWATCH <wpath>|<token>|? + * + * RESET_WATCHES | + * Reset all watches and transactions of the caller. + * + * ---------- Transactions ---------- + * + * TRANSACTION_START | <transid>| + * <transid> is an opaque uint32_t allocated by xenstored + * represented as unsigned decimal. After this, the transaction may + * be referenced by using <transid> (as 32-bit binary) in the + * tx_id request header field. When a transaction is started the whole + * db is copied; reads and writes happen on the copy. + * It is not legal to send non-0 tx_id in TRANSACTION_START. + * + * TRANSACTION_END T| + * TRANSACTION_END F| + * tx_id must refer to an existing transaction. After this + * request the tx_id is no longer valid and may be reused by + * xenstore. If F, the transaction is discarded. If T, + * it is committed: if there were any other intervening writes + * then our END gets EAGAIN. 
+ * + * The plan is that in the future only intervening `conflicting' + * writes cause EAGAIN, meaning only writes or other commits + * which changed paths which were read or written in the + * transaction at hand. + * + */ + +static void xs_read(XenXenstoreState *s, unsigned int req_id, + xs_transaction_t tx_id, uint8_t *req_data, unsigned int len) +{ + const char *path = (const char *)req_data; + struct xsd_sockmsg *rsp = (struct xsd_sockmsg *)s->rsp_data; + uint8_t *rsp_data = (uint8_t *)&rsp[1]; + g_autoptr(GByteArray) data = g_byte_array_new(); + int err; + + if (len == 0 || req_data[len - 1] != '\0') { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + + trace_xenstore_read(tx_id, path); + err = xs_impl_read(s->impl, xen_domid, tx_id, path, data); + if (err) { + xs_error(s, req_id, tx_id, err); + return; + } + + rsp->type = XS_READ; + rsp->req_id = req_id; + rsp->tx_id = tx_id; + rsp->len = 0; + + len = data->len; + if (len > XENSTORE_PAYLOAD_MAX) { + xs_error(s, req_id, tx_id, E2BIG); + return; + } + + memcpy(&rsp_data[rsp->len], data->data, len); + rsp->len += len; +} + +static void xs_write(XenXenstoreState *s, unsigned int req_id, + xs_transaction_t tx_id, uint8_t *req_data, + unsigned int len) +{ + g_autoptr(GByteArray) data = g_byte_array_new(); + const char *path; + int err; + + if (len == 0) { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + + path = (const char *)req_data; + + while (len--) { + if (*req_data++ == '\0') { + break; + } + if (len == 0) { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + } + + g_byte_array_append(data, req_data, len); + + trace_xenstore_write(tx_id, path); + err = xs_impl_write(s->impl, xen_domid, tx_id, path, data); + if (err) { + xs_error(s, req_id, tx_id, err); + return; + } + + xs_ok(s, XS_WRITE, req_id, tx_id); +} + +static void xs_mkdir(XenXenstoreState *s, unsigned int req_id, + xs_transaction_t tx_id, uint8_t *req_data, + unsigned int len) +{ + g_autoptr(GByteArray) data = g_byte_array_new(); + const 
char *path; + int err; + + if (len == 0 || req_data[len - 1] != '\0') { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + + path = (const char *)req_data; + + trace_xenstore_mkdir(tx_id, path); + err = xs_impl_read(s->impl, xen_domid, tx_id, path, data); + if (err == ENOENT) { + err = xs_impl_write(s->impl, xen_domid, tx_id, path, data); + } + + if (!err) { + xs_error(s, req_id, tx_id, err); + return; + } + + xs_ok(s, XS_MKDIR, req_id, tx_id); +} + +static void xs_append_strings(XenXenstoreState *s, struct xsd_sockmsg *rsp, + GList *strings, unsigned int start, bool truncate) +{ + uint8_t *rsp_data = (uint8_t *)&rsp[1]; + GList *l; + + for (l = strings; l; l = l->next) { + size_t len = strlen(l->data) + 1; /* Including the NUL termination */ + char *str = l->data; + + if (rsp->len + len > XENSTORE_PAYLOAD_MAX) { + if (truncate) { + len = XENSTORE_PAYLOAD_MAX - rsp->len; + if (!len) { + return; + } + } else { + xs_error(s, rsp->req_id, rsp->tx_id, E2BIG); + return; + } + } + + if (start) { + if (start >= len) { + start -= len; + continue; + } + + str += start; + len -= start; + start = 0; + } + + memcpy(&rsp_data[rsp->len], str, len); + rsp->len += len; + } + /* XS_DIRECTORY_PART wants an extra NUL to indicate the end */ + if (truncate && rsp->len < XENSTORE_PAYLOAD_MAX) { + rsp_data[rsp->len++] = '\0'; + } +} + +static void xs_directory(XenXenstoreState *s, unsigned int req_id, + xs_transaction_t tx_id, uint8_t *req_data, + unsigned int len) +{ + struct xsd_sockmsg *rsp = (struct xsd_sockmsg *)s->rsp_data; + GList *items = NULL; + const char *path; + int err; + + if (len == 0 || req_data[len - 1] != '\0') { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + + path = (const char *)req_data; + + trace_xenstore_directory(tx_id, path); + err = xs_impl_directory(s->impl, xen_domid, tx_id, path, NULL, &items); + if (err != 0) { + xs_error(s, req_id, tx_id, err); + return; + } + + rsp->type = XS_DIRECTORY; + rsp->req_id = req_id; + rsp->tx_id = tx_id; + rsp->len = 
0; + + xs_append_strings(s, rsp, items, 0, false); + + g_list_free_full(items, g_free); +} + +static void xs_directory_part(XenXenstoreState *s, unsigned int req_id, + xs_transaction_t tx_id, uint8_t *req_data, + unsigned int len) +{ + const char *offset_str, *path = (const char *)req_data; + struct xsd_sockmsg *rsp = (struct xsd_sockmsg *)s->rsp_data; + char *rsp_data = (char *)&rsp[1]; + uint64_t gencnt = 0; + unsigned int offset; + GList *items = NULL; + int err; + + if (len == 0) { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + + while (len--) { + if (*req_data++ == '\0') { + break; + } + if (len == 0) { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + } + + offset_str = (const char *)req_data; + while (len--) { + if (*req_data++ == '\0') { + break; + } + if (len == 0) { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + } + + if (len) { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + + if (qemu_strtoui(offset_str, NULL, 10, &offset) < 0) { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + + trace_xenstore_directory_part(tx_id, path, offset); + err = xs_impl_directory(s->impl, xen_domid, tx_id, path, &gencnt, &items); + if (err != 0) { + xs_error(s, req_id, tx_id, err); + return; + } + + rsp->type = XS_DIRECTORY_PART; + rsp->req_id = req_id; + rsp->tx_id = tx_id; + rsp->len = snprintf(rsp_data, XENSTORE_PAYLOAD_MAX, "%" PRIu64, gencnt) + 1; + + xs_append_strings(s, rsp, items, offset, true); + + g_list_free_full(items, g_free); +} + +static void xs_transaction_start(XenXenstoreState *s, unsigned int req_id, + xs_transaction_t tx_id, uint8_t *req_data, + unsigned int len) +{ + struct xsd_sockmsg *rsp = (struct xsd_sockmsg *)s->rsp_data; + char *rsp_data = (char *)&rsp[1]; + int err; + + if (len != 1 || req_data[0] != '\0') { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + + rsp->type = XS_TRANSACTION_START; + rsp->req_id = req_id; + rsp->tx_id = tx_id; + rsp->len = 0; + + err = xs_impl_transaction_start(s->impl, xen_domid, &tx_id); 
+ if (err) { + xs_error(s, req_id, tx_id, err); + return; + } + + trace_xenstore_transaction_start(tx_id); + + rsp->len = snprintf(rsp_data, XENSTORE_PAYLOAD_MAX, "%u", tx_id); + assert(rsp->len < XENSTORE_PAYLOAD_MAX); + rsp->len++; +} + +static void xs_transaction_end(XenXenstoreState *s, unsigned int req_id, + xs_transaction_t tx_id, uint8_t *req_data, + unsigned int len) +{ + bool commit; + int err; + + if (len != 2 || req_data[1] != '\0') { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + + switch (req_data[0]) { + case 'T': + commit = true; + break; + case 'F': + commit = false; + break; + default: + xs_error(s, req_id, tx_id, EINVAL); + return; + } + + trace_xenstore_transaction_end(tx_id, commit); + err = xs_impl_transaction_end(s->impl, xen_domid, tx_id, commit); + if (err) { + xs_error(s, req_id, tx_id, err); + return; + } + + xs_ok(s, XS_TRANSACTION_END, req_id, tx_id); +} + +static void xs_rm(XenXenstoreState *s, unsigned int req_id, + xs_transaction_t tx_id, uint8_t *req_data, unsigned int len) +{ + const char *path = (const char *)req_data; + int err; + + if (len == 0 || req_data[len - 1] != '\0') { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + + trace_xenstore_rm(tx_id, path); + err = xs_impl_rm(s->impl, xen_domid, tx_id, path); + if (err) { + xs_error(s, req_id, tx_id, err); + return; + } + + xs_ok(s, XS_RM, req_id, tx_id); +} + +static void xs_get_perms(XenXenstoreState *s, unsigned int req_id, + xs_transaction_t tx_id, uint8_t *req_data, + unsigned int len) +{ + const char *path = (const char *)req_data; + struct xsd_sockmsg *rsp = (struct xsd_sockmsg *)s->rsp_data; + GList *perms = NULL; + int err; + + if (len == 0 || req_data[len - 1] != '\0') { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + + trace_xenstore_get_perms(tx_id, path); + err = xs_impl_get_perms(s->impl, xen_domid, tx_id, path, &perms); + if (err) { + xs_error(s, req_id, tx_id, err); + return; + } + + rsp->type = XS_GET_PERMS; + rsp->req_id = req_id; + rsp->tx_id 
= tx_id; + rsp->len = 0; + + xs_append_strings(s, rsp, perms, 0, false); + + g_list_free_full(perms, g_free); +} + +static void xs_set_perms(XenXenstoreState *s, unsigned int req_id, + xs_transaction_t tx_id, uint8_t *req_data, + unsigned int len) +{ + const char *path = (const char *)req_data; + uint8_t *perm; + GList *perms = NULL; + int err; + + if (len == 0) { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + + while (len--) { + if (*req_data++ == '\0') { + break; + } + if (len == 0) { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + } + + perm = req_data; + while (len--) { + if (*req_data++ == '\0') { + perms = g_list_append(perms, perm); + perm = req_data; + } + } + + /* + * Note that there may be trailing garbage at the end of the buffer. + * This is explicitly permitted by the '?' at the end of the definition: + * + * SET_PERMS <path>|<perm-as-string>|+? + */ + + trace_xenstore_set_perms(tx_id, path); + err = xs_impl_set_perms(s->impl, xen_domid, tx_id, path, perms); + g_list_free(perms); + if (err) { + xs_error(s, req_id, tx_id, err); + return; + } + + xs_ok(s, XS_SET_PERMS, req_id, tx_id); +} + +static void xs_watch(XenXenstoreState *s, unsigned int req_id, + xs_transaction_t tx_id, uint8_t *req_data, + unsigned int len) +{ + const char *token, *path = (const char *)req_data; + int err; + + if (len == 0) { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + + while (len--) { + if (*req_data++ == '\0') { + break; + } + if (len == 0) { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + } + + token = (const char *)req_data; + while (len--) { + if (*req_data++ == '\0') { + break; + } + if (len == 0) { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + } + + /* + * Note that there may be trailing garbage at the end of the buffer. + * This is explicitly permitted by the '?' at the end of the definition: + * + * WATCH <wpath>|<token>|? 
+ */ + + trace_xenstore_watch(path, token); + err = xs_impl_watch(s->impl, xen_domid, path, token, fire_watch_cb, s); + if (err) { + xs_error(s, req_id, tx_id, err); + return; + } + + xs_ok(s, XS_WATCH, req_id, tx_id); +} + +static void xs_unwatch(XenXenstoreState *s, unsigned int req_id, + xs_transaction_t tx_id, uint8_t *req_data, + unsigned int len) +{ + const char *token, *path = (const char *)req_data; + int err; + + if (len == 0) { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + + while (len--) { + if (*req_data++ == '\0') { + break; + } + if (len == 0) { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + } + + token = (const char *)req_data; + while (len--) { + if (*req_data++ == '\0') { + break; + } + if (len == 0) { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + } + + trace_xenstore_unwatch(path, token); + err = xs_impl_unwatch(s->impl, xen_domid, path, token, fire_watch_cb, s); + if (err) { + xs_error(s, req_id, tx_id, err); + return; + } + + xs_ok(s, XS_UNWATCH, req_id, tx_id); +} + +static void xs_reset_watches(XenXenstoreState *s, unsigned int req_id, + xs_transaction_t tx_id, uint8_t *req_data, + unsigned int len) +{ + if (len == 0 || req_data[len - 1] != '\0') { + xs_error(s, req_id, tx_id, EINVAL); + return; + } + + trace_xenstore_reset_watches(); + xs_impl_reset_watches(s->impl, xen_domid); + + xs_ok(s, XS_RESET_WATCHES, req_id, tx_id); +} + +static void xs_priv(XenXenstoreState *s, unsigned int req_id, + xs_transaction_t tx_id, uint8_t *data, + unsigned int len) +{ + xs_error(s, req_id, tx_id, EACCES); +} + +static void xs_unimpl(XenXenstoreState *s, unsigned int req_id, + xs_transaction_t tx_id, uint8_t *data, + unsigned int len) +{ + xs_error(s, req_id, tx_id, ENOSYS); +} + +typedef void (*xs_impl)(XenXenstoreState *s, unsigned int req_id, + xs_transaction_t tx_id, uint8_t *data, + unsigned int len); + +struct xsd_req { + const char *name; + xs_impl fn; +}; +#define XSD_REQ(_type, _fn) \ + [_type] = { .name = #_type, .fn = _fn } 
+ +struct xsd_req xsd_reqs[] = { + XSD_REQ(XS_READ, xs_read), + XSD_REQ(XS_WRITE, xs_write), + XSD_REQ(XS_MKDIR, xs_mkdir), + XSD_REQ(XS_DIRECTORY, xs_directory), + XSD_REQ(XS_DIRECTORY_PART, xs_directory_part), + XSD_REQ(XS_TRANSACTION_START, xs_transaction_start), + XSD_REQ(XS_TRANSACTION_END, xs_transaction_end), + XSD_REQ(XS_RM, xs_rm), + XSD_REQ(XS_GET_PERMS, xs_get_perms), + XSD_REQ(XS_SET_PERMS, xs_set_perms), + XSD_REQ(XS_WATCH, xs_watch), + XSD_REQ(XS_UNWATCH, xs_unwatch), + XSD_REQ(XS_CONTROL, xs_priv), + XSD_REQ(XS_INTRODUCE, xs_priv), + XSD_REQ(XS_RELEASE, xs_priv), + XSD_REQ(XS_IS_DOMAIN_INTRODUCED, xs_priv), + XSD_REQ(XS_RESUME, xs_priv), + XSD_REQ(XS_SET_TARGET, xs_priv), + XSD_REQ(XS_RESET_WATCHES, xs_reset_watches), +}; + static void process_req(XenXenstoreState *s) { struct xsd_sockmsg *req = (struct xsd_sockmsg *)s->req_data; - struct xsd_sockmsg *rsp = (struct xsd_sockmsg *)s->rsp_data; - const char enosys[] = "ENOSYS"; + xs_impl handler = NULL; assert(req_pending(s)); assert(!s->rsp_pending); - rsp->type = XS_ERROR; - rsp->req_id = req->req_id; - rsp->tx_id = req->tx_id; - rsp->len = sizeof(enosys); - memcpy((void *)&rsp[1], enosys, sizeof(enosys)); + if (req->type < ARRAY_SIZE(xsd_reqs)) { + handler = xsd_reqs[req->type].fn; + } + if (!handler) { + handler = &xs_unimpl; + } + + handler(s, req->req_id, req->tx_id, (uint8_t *)&req[1], req->len); s->rsp_pending = true; reset_req(s); @@ -415,6 +1266,113 @@ static unsigned int put_rsp(XenXenstoreState *s) return copylen; } +static void deliver_watch(XenXenstoreState *s, const char *path, + const char *token) +{ + struct xsd_sockmsg *rsp = (struct xsd_sockmsg *)s->rsp_data; + uint8_t *rsp_data = (uint8_t *)&rsp[1]; + unsigned int len; + + assert(!s->rsp_pending); + + trace_xenstore_watch_event(path, token); + + rsp->type = XS_WATCH_EVENT; + rsp->req_id = 0; + rsp->tx_id = 0; + rsp->len = 0; + + len = strlen(path); + + /* XENSTORE_ABS/REL_PATH_MAX should ensure there can be no overflow */ + 
assert(rsp->len + len < XENSTORE_PAYLOAD_MAX); + + memcpy(&rsp_data[rsp->len], path, len); + rsp->len += len; + rsp_data[rsp->len] = '\0'; + rsp->len++; + + len = strlen(token); + /* + * It is possible for the guest to have chosen a token that will + * not fit (along with the patch) into a watch event. We have no + * choice but to drop the event if this is the case. + */ + if (rsp->len + len >= XENSTORE_PAYLOAD_MAX) { + return; + } + + memcpy(&rsp_data[rsp->len], token, len); + rsp->len += len; + rsp_data[rsp->len] = '\0'; + rsp->len++; + + s->rsp_pending = true; +} + +struct watch_event { + char *path; + char *token; +}; + +static void free_watch_event(struct watch_event *ev) +{ + if (ev) { + g_free(ev->path); + g_free(ev->token); + g_free(ev); + } +} + +static void queue_watch(XenXenstoreState *s, const char *path, + const char *token) +{ + struct watch_event *ev = g_new0(struct watch_event, 1); + + ev->path = g_strdup(path); + ev->token = g_strdup(token); + + s->watch_events = g_list_append(s->watch_events, ev); +} + +static void fire_watch_cb(void *opaque, const char *path, const char *token) +{ + XenXenstoreState *s = opaque; + + assert(qemu_mutex_iothread_locked()); + + /* + * If there's a response pending, we obviously can't scribble over + * it. But if there's a request pending, it has dibs on the buffer + * too. + * + * In the common case of a watch firing due to backend activity + * when the ring was otherwise idle, we should be able to copy the + * strings directly into the rsp_data and thence the actual ring, + * without needing to perform any allocations and queue them. + */ + if (s->rsp_pending || req_pending(s)) { + queue_watch(s, path, token); + } else { + deliver_watch(s, path, token); + /* + * If the message was queued because there was already ring activity, + * no need to wake the guest. But if not, we need to send the evtchn. 
+ */ + xen_be_evtchn_notify(s->eh, s->be_port); + } +} + +static void process_watch_events(XenXenstoreState *s) +{ + struct watch_event *ev = s->watch_events->data; + + deliver_watch(s, ev->path, ev->token); + + s->watch_events = g_list_remove(s->watch_events, ev); + free_watch_event(ev); +} + static void xen_xenstore_event(void *opaque) { XenXenstoreState *s = opaque; @@ -433,6 +1391,10 @@ static void xen_xenstore_event(void *opaque) copied_to = copied_from = 0; processed = false; + if (!s->rsp_pending && s->watch_events) { + process_watch_events(s); + } + if (s->rsp_pending) { copied_to = put_rsp(s); } @@ -441,7 +1403,7 @@ static void xen_xenstore_event(void *opaque) copied_from = get_req(s); } - if (req_pending(s) && !s->rsp_pending) { + if (req_pending(s) && !s->rsp_pending && !s->watch_events) { process_req(s); processed = true; } @@ -496,5 +1458,270 @@ int xen_xenstore_reset(void) } s->be_port = err; + /* + * We don't actually access the guest's page through the grant, because + * this isn't real Xen, and we can just use the page we gave it in the + * first place. Map the grant anyway, mostly for cosmetic purposes so + * it *looks* like it's in use in the guest-visible grant table. 
+ */ + s->gt = qemu_xen_gnttab_open(); + uint32_t xs_gntref = GNTTAB_RESERVED_XENSTORE; + s->granted_xs = qemu_xen_gnttab_map_refs(s->gt, 1, xen_domid, &xs_gntref, + PROT_READ | PROT_WRITE); + return 0; } + +struct qemu_xs_handle { + XenstoreImplState *impl; + GList *watches; + QEMUBH *watch_bh; +}; + +struct qemu_xs_watch { + struct qemu_xs_handle *h; + char *path; + xs_watch_fn fn; + void *opaque; + GList *events; +}; + +static char *xs_be_get_domain_path(struct qemu_xs_handle *h, unsigned int domid) +{ + return g_strdup_printf("/local/domain/%u", domid); +} + +static char **xs_be_directory(struct qemu_xs_handle *h, xs_transaction_t t, + const char *path, unsigned int *num) +{ + GList *items = NULL, *l; + unsigned int i = 0; + char **items_ret; + int err; + + err = xs_impl_directory(h->impl, DOMID_QEMU, t, path, NULL, &items); + if (err) { + errno = err; + return NULL; + } + + items_ret = g_new0(char *, g_list_length(items) + 1); + *num = 0; + for (l = items; l; l = l->next) { + items_ret[i++] = l->data; + (*num)++; + } + g_list_free(items); + return items_ret; +} + +static void *xs_be_read(struct qemu_xs_handle *h, xs_transaction_t t, + const char *path, unsigned int *len) +{ + GByteArray *data = g_byte_array_new(); + bool free_segment = false; + int err; + + err = xs_impl_read(h->impl, DOMID_QEMU, t, path, data); + if (err) { + free_segment = true; + errno = err; + } else { + if (len) { + *len = data->len; + } + /* The xen-bus-helper code expects to get NUL terminated string! 
*/ + g_byte_array_append(data, (void *)"", 1); + } + + return g_byte_array_free(data, free_segment); +} + +static bool xs_be_write(struct qemu_xs_handle *h, xs_transaction_t t, + const char *path, const void *data, unsigned int len) +{ + GByteArray *gdata = g_byte_array_new(); + int err; + + g_byte_array_append(gdata, data, len); + err = xs_impl_write(h->impl, DOMID_QEMU, t, path, gdata); + g_byte_array_unref(gdata); + if (err) { + errno = err; + return false; + } + return true; +} + +static bool xs_be_create(struct qemu_xs_handle *h, xs_transaction_t t, + unsigned int owner, unsigned int domid, + unsigned int perms, const char *path) +{ + g_autoptr(GByteArray) data = g_byte_array_new(); + GList *perms_list = NULL; + int err; + + /* mkdir does this */ + err = xs_impl_read(h->impl, DOMID_QEMU, t, path, data); + if (err == ENOENT) { + err = xs_impl_write(h->impl, DOMID_QEMU, t, path, data); + } + if (err) { + errno = err; + return false; + } + + perms_list = g_list_append(perms_list, + xs_perm_as_string(XS_PERM_NONE, owner)); + perms_list = g_list_append(perms_list, + xs_perm_as_string(perms, domid)); + + err = xs_impl_set_perms(h->impl, DOMID_QEMU, t, path, perms_list); + g_list_free_full(perms_list, g_free); + if (err) { + errno = err; + return false; + } + return true; +} + +static bool xs_be_destroy(struct qemu_xs_handle *h, xs_transaction_t t, + const char *path) +{ + int err = xs_impl_rm(h->impl, DOMID_QEMU, t, path); + if (err) { + errno = err; + return false; + } + return true; +} + +static void be_watch_bh(void *_h) +{ + struct qemu_xs_handle *h = _h; + GList *l; + + for (l = h->watches; l; l = l->next) { + struct qemu_xs_watch *w = l->data; + + while (w->events) { + struct watch_event *ev = w->events->data; + + w->fn(w->opaque, ev->path); + + w->events = g_list_remove(w->events, ev); + free_watch_event(ev); + } + } +} + +static void xs_be_watch_cb(void *opaque, const char *path, const char *token) +{ + struct watch_event *ev = g_new0(struct watch_event, 1); 
+ struct qemu_xs_watch *w = opaque; + + /* We don't care about the token */ + ev->path = g_strdup(path); + w->events = g_list_append(w->events, ev); + + qemu_bh_schedule(w->h->watch_bh); +} + +static struct qemu_xs_watch *xs_be_watch(struct qemu_xs_handle *h, + const char *path, xs_watch_fn fn, + void *opaque) +{ + struct qemu_xs_watch *w = g_new0(struct qemu_xs_watch, 1); + int err; + + w->h = h; + w->fn = fn; + w->opaque = opaque; + + err = xs_impl_watch(h->impl, DOMID_QEMU, path, NULL, xs_be_watch_cb, w); + if (err) { + errno = err; + g_free(w); + return NULL; + } + + w->path = g_strdup(path); + h->watches = g_list_append(h->watches, w); + return w; +} + +static void xs_be_unwatch(struct qemu_xs_handle *h, struct qemu_xs_watch *w) +{ + xs_impl_unwatch(h->impl, DOMID_QEMU, w->path, NULL, xs_be_watch_cb, w); + + h->watches = g_list_remove(h->watches, w); + g_list_free_full(w->events, (GDestroyNotify)free_watch_event); + g_free(w->path); + g_free(w); +} + +static xs_transaction_t xs_be_transaction_start(struct qemu_xs_handle *h) +{ + unsigned int new_tx = XBT_NULL; + int err = xs_impl_transaction_start(h->impl, DOMID_QEMU, &new_tx); + if (err) { + errno = err; + return XBT_NULL; + } + return new_tx; +} + +static bool xs_be_transaction_end(struct qemu_xs_handle *h, xs_transaction_t t, + bool abort) +{ + int err = xs_impl_transaction_end(h->impl, DOMID_QEMU, t, !abort); + if (err) { + errno = err; + return false; + } + return true; +} + +static struct qemu_xs_handle *xs_be_open(void) +{ + XenXenstoreState *s = xen_xenstore_singleton; + struct qemu_xs_handle *h; + + if (!s && !s->impl) { + errno = -ENOSYS; + return NULL; + } + + h = g_new0(struct qemu_xs_handle, 1); + h->impl = s->impl; + + h->watch_bh = aio_bh_new(qemu_get_aio_context(), be_watch_bh, h); + + return h; +} + +static void xs_be_close(struct qemu_xs_handle *h) +{ + while (h->watches) { + struct qemu_xs_watch *w = h->watches->data; + xs_be_unwatch(h, w); + } + + qemu_bh_delete(h->watch_bh); + g_free(h); 
+} + +static struct xenstore_backend_ops emu_xenstore_backend_ops = { + .open = xs_be_open, + .close = xs_be_close, + .get_domain_path = xs_be_get_domain_path, + .directory = xs_be_directory, + .read = xs_be_read, + .write = xs_be_write, + .create = xs_be_create, + .destroy = xs_be_destroy, + .watch = xs_be_watch, + .unwatch = xs_be_unwatch, + .transaction_start = xs_be_transaction_start, + .transaction_end = xs_be_transaction_end, +}; diff --git a/hw/i386/kvm/xenstore_impl.c b/hw/i386/kvm/xenstore_impl.c new file mode 100644 index 0000000000..305fe75519 --- /dev/null +++ b/hw/i386/kvm/xenstore_impl.c @@ -0,0 +1,1927 @@ +/* + * QEMU Xen emulation: The actual implementation of XenStore + * + * Copyright © 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Authors: David Woodhouse <dwmw2@infradead.org>, Paul Durrant <paul@xen.org> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qom/object.h" + +#include "hw/xen/xen.h" + +#include "xen_xenstore.h" +#include "xenstore_impl.h" + +#include "hw/xen/interface/io/xs_wire.h" + +#define XS_MAX_WATCHES 128 +#define XS_MAX_DOMAIN_NODES 1000 +#define XS_MAX_NODE_SIZE 2048 +#define XS_MAX_TRANSACTIONS 10 +#define XS_MAX_PERMS_PER_NODE 5 + +#define XS_VALID_CHARS "abcdefghijklmnopqrstuvwxyz" \ + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" \ + "0123456789-/_" + +typedef struct XsNode { + uint32_t ref; + GByteArray *content; + GList *perms; + GHashTable *children; + uint64_t gencnt; + bool deleted_in_tx; + bool modified_in_tx; + unsigned int serialized_tx; +#ifdef XS_NODE_UNIT_TEST + gchar *name; /* debug only */ +#endif +} XsNode; + +typedef struct XsWatch { + struct XsWatch *next; + xs_impl_watch_fn *cb; + void *cb_opaque; + char *token; + unsigned int dom_id; + int rel_prefix; +} XsWatch; + +typedef struct XsTransaction { + XsNode *root; + unsigned int nr_nodes; + unsigned int base_tx; + unsigned 
int tx_id; + unsigned int dom_id; +} XsTransaction; + +struct XenstoreImplState { + XsNode *root; + unsigned int nr_nodes; + GHashTable *watches; + unsigned int nr_domu_watches; + GHashTable *transactions; + unsigned int nr_domu_transactions; + unsigned int root_tx; + unsigned int last_tx; + bool serialized; +}; + + +static void nobble_tx(gpointer key, gpointer value, gpointer user_data) +{ + unsigned int *new_tx_id = user_data; + XsTransaction *tx = value; + + if (tx->base_tx == *new_tx_id) { + /* Transactions based on XBT_NULL will always fail */ + tx->base_tx = XBT_NULL; + } +} + +static inline unsigned int next_tx(struct XenstoreImplState *s) +{ + unsigned int tx_id; + + /* Find the next TX id which isn't either XBT_NULL or in use. */ + do { + tx_id = ++s->last_tx; + } while (tx_id == XBT_NULL || tx_id == s->root_tx || + g_hash_table_lookup(s->transactions, GINT_TO_POINTER(tx_id))); + + /* + * It is vanishingly unlikely, but ensure that no outstanding transaction + * is based on the (previous incarnation of the) newly-allocated TX id. + */ + g_hash_table_foreach(s->transactions, nobble_tx, &tx_id); + + return tx_id; +} + +static inline XsNode *xs_node_new(void) +{ + XsNode *n = g_new0(XsNode, 1); + n->ref = 1; + +#ifdef XS_NODE_UNIT_TEST + nr_xs_nodes++; + xs_node_list = g_list_prepend(xs_node_list, n); +#endif + return n; +} + +static inline XsNode *xs_node_ref(XsNode *n) +{ + /* With just 10 transactions, it can never get anywhere near this. 
*/ + g_assert(n->ref < INT_MAX); + + g_assert(n->ref); + n->ref++; + return n; +} + +static inline void xs_node_unref(XsNode *n) +{ + if (!n) { + return; + } + g_assert(n->ref); + if (--n->ref) { + return; + } + + if (n->content) { + g_byte_array_unref(n->content); + } + if (n->perms) { + g_list_free_full(n->perms, g_free); + } + if (n->children) { + g_hash_table_unref(n->children); + } +#ifdef XS_NODE_UNIT_TEST + g_free(n->name); + nr_xs_nodes--; + xs_node_list = g_list_remove(xs_node_list, n); +#endif + g_free(n); +} + +char *xs_perm_as_string(unsigned int perm, unsigned int domid) +{ + char letter; + + switch (perm) { + case XS_PERM_READ | XS_PERM_WRITE: + letter = 'b'; + break; + case XS_PERM_READ: + letter = 'r'; + break; + case XS_PERM_WRITE: + letter = 'w'; + break; + case XS_PERM_NONE: + default: + letter = 'n'; + break; + } + + return g_strdup_printf("%c%u", letter, domid); +} + +static gpointer do_perm_copy(gconstpointer src, gpointer user_data) +{ + return g_strdup(src); +} + +static XsNode *xs_node_create(const char *name, GList *perms) +{ + XsNode *n = xs_node_new(); + +#ifdef XS_NODE_UNIT_TEST + if (name) { + n->name = g_strdup(name); + } +#endif + + n->perms = g_list_copy_deep(perms, do_perm_copy, NULL); + + return n; +} + +/* For copying from one hash table to another using g_hash_table_foreach() */ +static void do_child_insert(gpointer key, gpointer value, gpointer user_data) +{ + g_hash_table_insert(user_data, g_strdup(key), xs_node_ref(value)); +} + +static XsNode *xs_node_copy(XsNode *old) +{ + XsNode *n = xs_node_new(); + + n->gencnt = old->gencnt; + +#ifdef XS_NODE_UNIT_TEST + if (n->name) { + n->name = g_strdup(old->name); + } +#endif + + assert(old); + if (old->children) { + n->children = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, + (GDestroyNotify)xs_node_unref); + g_hash_table_foreach(old->children, do_child_insert, n->children); + } + if (old->perms) { + n->perms = g_list_copy_deep(old->perms, do_perm_copy, NULL); + } + if 
(old->content) { + n->content = g_byte_array_ref(old->content); + } + return n; +} + +/* Returns true if it made a change to the hash table */ +static bool xs_node_add_child(XsNode *n, const char *path_elem, XsNode *child) +{ + assert(!strchr(path_elem, '/')); + + if (!child) { + assert(n->children); + return g_hash_table_remove(n->children, path_elem); + } + +#ifdef XS_NODE_UNIT_TEST + g_free(child->name); + child->name = g_strdup(path_elem); +#endif + if (!n->children) { + n->children = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, + (GDestroyNotify)xs_node_unref); + } + + /* + * The documentation for g_hash_table_insert() says that it "returns a + * boolean value to indicate whether the newly added value was already + * in the hash table or not." + * + * It could perhaps be clearer that returning TRUE means it wasn't, + */ + return g_hash_table_insert(n->children, g_strdup(path_elem), child); +} + +struct walk_op { + struct XenstoreImplState *s; + char path[XENSTORE_ABS_PATH_MAX + 2]; /* Two NUL terminators */ + int (*op_fn)(XsNode **n, struct walk_op *op); + void *op_opaque; + void *op_opaque2; + + GList *watches; + unsigned int dom_id; + unsigned int tx_id; + + /* The number of nodes which will exist in the tree if this op succeeds. */ + unsigned int new_nr_nodes; + + /* + * This is maintained on the way *down* the walk to indicate + * whether nodes can be modified in place or whether COW is + * required. It starts off being true, as we're always going to + * replace the root node. If we walk into a shared subtree it + * becomes false. If we start *creating* new nodes for a write, + * it becomes true again. + * + * Do not use it on the way back up. + */ + bool inplace; + bool mutating; + bool create_dirs; + bool in_transaction; + + /* Tracking during recursion so we know which is first. 
*/ + bool deleted_in_tx; +}; + +static void fire_watches(struct walk_op *op, bool parents) +{ + GList *l = NULL; + XsWatch *w; + + if (!op->mutating || op->in_transaction) { + return; + } + + if (parents) { + l = op->watches; + } + + w = g_hash_table_lookup(op->s->watches, op->path); + while (w || l) { + if (!w) { + /* Fire the parent nodes from 'op' if asked to */ + w = l->data; + l = l->next; + continue; + } + + assert(strlen(op->path) > w->rel_prefix); + w->cb(w->cb_opaque, op->path + w->rel_prefix, w->token); + + w = w->next; + } +} + +static int xs_node_add_content(XsNode **n, struct walk_op *op) +{ + GByteArray *data = op->op_opaque; + + if (op->dom_id) { + /* + * The real XenStored includes permissions and names of child nodes + * in the calculated datasize but life's too short. For a single + * tenant internal XenStore, we don't have to be quite as pedantic. + */ + if (data->len > XS_MAX_NODE_SIZE) { + return E2BIG; + } + } + /* We *are* the node to be written. Either this or a copy. 
*/ + if (!op->inplace) { + XsNode *old = *n; + *n = xs_node_copy(old); + xs_node_unref(old); + } + + if ((*n)->content) { + g_byte_array_unref((*n)->content); + } + (*n)->content = g_byte_array_ref(data); + if (op->tx_id != XBT_NULL) { + (*n)->modified_in_tx = true; + } + return 0; +} + +static int xs_node_get_content(XsNode **n, struct walk_op *op) +{ + GByteArray *data = op->op_opaque; + GByteArray *node_data; + + assert(op->inplace); + assert(*n); + + node_data = (*n)->content; + if (node_data) { + g_byte_array_append(data, node_data->data, node_data->len); + } + + return 0; +} + +static int node_rm_recurse(gpointer key, gpointer value, gpointer user_data) +{ + struct walk_op *op = user_data; + int path_len = strlen(op->path); + int key_len = strlen(key); + XsNode *n = value; + bool this_inplace = op->inplace; + + if (n->ref != 1) { + op->inplace = 0; + } + + assert(key_len + path_len + 2 <= sizeof(op->path)); + op->path[path_len] = '/'; + memcpy(op->path + path_len + 1, key, key_len + 1); + + if (n->children) { + g_hash_table_foreach_remove(n->children, node_rm_recurse, op); + } + op->new_nr_nodes--; + + /* + * Fire watches on *this* node but not the parents because they are + * going to be deleted too, so the watch will fire for them anyway. + */ + fire_watches(op, false); + op->path[path_len] = '\0'; + + /* + * Actually deleting the child here is just an optimisation; if we + * don't then the final unref on the topmost victim will just have + * to cascade down again repeating all the g_hash_table_foreach() + * calls. + */ + return this_inplace; +} + +static XsNode *xs_node_copy_deleted(XsNode *old, struct walk_op *op); +static void copy_deleted_recurse(gpointer key, gpointer value, + gpointer user_data) +{ + struct walk_op *op = user_data; + GHashTable *siblings = op->op_opaque2; + XsNode *n = xs_node_copy_deleted(value, op); + + /* + * Reinsert the deleted_in_tx copy of the node into the parent's + * 'children' hash table. 
Having stashed it from op->op_opaque2 + * before the recursive call to xs_node_copy_deleted() scribbled + * over it. + */ + g_hash_table_insert(siblings, g_strdup(key), n); +} + +static XsNode *xs_node_copy_deleted(XsNode *old, struct walk_op *op) +{ + XsNode *n = xs_node_new(); + + n->gencnt = old->gencnt; + +#ifdef XS_NODE_UNIT_TEST + if (old->name) { + n->name = g_strdup(old->name); + } +#endif + + if (old->children) { + n->children = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, + (GDestroyNotify)xs_node_unref); + op->op_opaque2 = n->children; + g_hash_table_foreach(old->children, copy_deleted_recurse, op); + } + if (old->perms) { + n->perms = g_list_copy_deep(old->perms, do_perm_copy, NULL); + } + n->deleted_in_tx = true; + /* If it gets resurrected we only fire a watch if it lost its content */ + if (old->content) { + n->modified_in_tx = true; + } + op->new_nr_nodes--; + return n; +} + +static int xs_node_rm(XsNode **n, struct walk_op *op) +{ + bool this_inplace = op->inplace; + + if (op->tx_id != XBT_NULL) { + /* It's not trivial to do inplace handling for this one */ + XsNode *old = *n; + *n = xs_node_copy_deleted(old, op); + xs_node_unref(old); + return 0; + } + + /* Fire watches for, and count, nodes in the subtree which get deleted */ + if ((*n)->children) { + g_hash_table_foreach_remove((*n)->children, node_rm_recurse, op); + } + op->new_nr_nodes--; + + if (this_inplace) { + xs_node_unref(*n); + } + *n = NULL; + return 0; +} + +static int xs_node_get_perms(XsNode **n, struct walk_op *op) +{ + GList **perms = op->op_opaque; + + assert(op->inplace); + assert(*n); + + *perms = g_list_copy_deep((*n)->perms, do_perm_copy, NULL); + return 0; +} + +static void parse_perm(const char *perm, char *letter, unsigned int *dom_id) +{ + unsigned int n = sscanf(perm, "%c%u", letter, dom_id); + + assert(n == 2); +} + +static bool can_access(unsigned int dom_id, GList *perms, const char *letters) +{ + unsigned int i, n; + char perm_letter; + unsigned int 
perm_dom_id; + bool access; + + if (dom_id == 0) { + return true; + } + + n = g_list_length(perms); + assert(n >= 1); + + /* + * The dom_id of the first perm is the owner, and the owner always has + * read-write access. + */ + parse_perm(g_list_nth_data(perms, 0), &perm_letter, &perm_dom_id); + if (dom_id == perm_dom_id) { + return true; + } + + /* + * The letter of the first perm specified the default access for all other + * domains. + */ + access = !!strchr(letters, perm_letter); + for (i = 1; i < n; i++) { + parse_perm(g_list_nth_data(perms, i), &perm_letter, &perm_dom_id); + if (dom_id != perm_dom_id) { + continue; + } + access = !!strchr(letters, perm_letter); + } + + return access; +} + +static int xs_node_set_perms(XsNode **n, struct walk_op *op) +{ + GList *perms = op->op_opaque; + + if (op->dom_id) { + unsigned int perm_dom_id; + char perm_letter; + + /* A guest may not change permissions on nodes it does not own */ + if (!can_access(op->dom_id, (*n)->perms, "")) { + return EPERM; + } + + /* A guest may not change the owner of a node it owns. */ + parse_perm(perms->data, &perm_letter, &perm_dom_id); + if (perm_dom_id != op->dom_id) { + return EPERM; + } + + if (g_list_length(perms) > XS_MAX_PERMS_PER_NODE) { + return ENOSPC; + } + } + + /* We *are* the node to be written. Either this or a copy. */ + if (!op->inplace) { + XsNode *old = *n; + *n = xs_node_copy(old); + xs_node_unref(old); + } + + if ((*n)->perms) { + g_list_free_full((*n)->perms, g_free); + } + (*n)->perms = g_list_copy_deep(perms, do_perm_copy, NULL); + if (op->tx_id != XBT_NULL) { + (*n)->modified_in_tx = true; + } + return 0; +} + +/* + * Passed a full reference in *n which it may free if it needs to COW. + * + * When changing the tree, the op->inplace flag indicates whether this + * node may be modified in place (i.e. it and all its parents had a + * refcount of one). 
If walking down the tree we find a node whose + * refcount is higher, we must clear op->inplace and COW from there + * down. Unless we are creating new nodes as scaffolding for a write + * (which works like 'mkdir -p' does). In which case those newly + * created nodes can (and must) be modified in place again. + */ +static int xs_node_walk(XsNode **n, struct walk_op *op) +{ + char *child_name = NULL; + size_t namelen; + XsNode *old = *n, *child = NULL; + bool stole_child = false; + bool this_inplace; + XsWatch *watch; + int err; + + namelen = strlen(op->path); + watch = g_hash_table_lookup(op->s->watches, op->path); + + /* Is there a child, or do we hit the double-NUL termination? */ + if (op->path[namelen + 1]) { + char *slash; + child_name = op->path + namelen + 1; + slash = strchr(child_name, '/'); + if (slash) { + *slash = '\0'; + } + op->path[namelen] = '/'; + } + + /* If we walk into a subtree which is shared, we must COW */ + if (op->mutating && old->ref != 1) { + op->inplace = false; + } + + if (!child_name) { + const char *letters = op->mutating ? "wb" : "rb"; + + if (!can_access(op->dom_id, old->perms, letters)) { + err = EACCES; + goto out; + } + + /* This is the actual node on which the operation shall be performed */ + err = op->op_fn(n, op); + if (!err) { + fire_watches(op, true); + } + goto out; + } + + /* op->inplace will be further modified during the recursion */ + this_inplace = op->inplace; + + if (old && old->children) { + child = g_hash_table_lookup(old->children, child_name); + /* This is a *weak* reference to 'child', owned by the hash table */ + } + + if (child) { + if (child->deleted_in_tx) { + assert(child->ref == 1); + /* Cannot actually set child->deleted_in_tx = false until later */ + } + xs_node_ref(child); + /* + * Now we own it too. But if we can modify inplace, that's going to + * foil the check and force it to COW. 
We want to be the *only* owner + * so that it can be modified in place, so remove it from the hash + * table in that case. We'll add it (or its replacement) back later. + */ + if (op->mutating && this_inplace) { + g_hash_table_remove(old->children, child_name); + stole_child = true; + } + } else if (op->create_dirs) { + assert(op->mutating); + + if (!can_access(op->dom_id, old->perms, "wb")) { + err = EACCES; + goto out; + } + + if (op->dom_id && op->new_nr_nodes >= XS_MAX_DOMAIN_NODES) { + err = ENOSPC; + goto out; + } + + child = xs_node_create(child_name, old->perms); + op->new_nr_nodes++; + + /* + * If we're creating a new child, we can clearly modify it (and its + * children) in place from here on down. + */ + op->inplace = true; + } else { + err = ENOENT; + goto out; + } + + /* + * If there's a watch on this node, add it to the list to be fired + * (with the correct full pathname for the modified node) at the end. + */ + if (watch) { + op->watches = g_list_append(op->watches, watch); + } + + /* + * Except for the temporary child-stealing as noted, our node has not + * changed yet. We don't yet know the overall operation will complete. + */ + err = xs_node_walk(&child, op); + + if (watch) { + op->watches = g_list_remove(op->watches, watch); + } + + if (err || !op->mutating) { + if (stole_child) { + /* Put it back as it was. */ + g_hash_table_replace(old->children, g_strdup(child_name), child); + } else { + xs_node_unref(child); + } + goto out; + } + + /* + * Now we know the operation has completed successfully and we're on + * the way back up. Make the change, substituting 'child' in the + * node at our level. + */ + if (!this_inplace) { + *n = xs_node_copy(old); + xs_node_unref(old); + } + + /* + * If we resurrected a deleted_in_tx node, we can mark it as no longer + * deleted now that we know the overall operation has succeeded. 
+ */ + if (op->create_dirs && child && child->deleted_in_tx) { + op->new_nr_nodes++; + child->deleted_in_tx = false; + } + + /* + * The child may be NULL here, for a remove operation. Either way, + * xs_node_add_child() will do the right thing and return a value + * indicating whether it changed the parent's hash table or not. + * + * We bump the parent gencnt if it adds a child that we *didn't* + * steal from it in the first place, or if child==NULL and was + * thus removed (whether we stole it earlier and didn't put it + * back, or xs_node_add_child() actually removed it now). + */ + if ((xs_node_add_child(*n, child_name, child) && !stole_child) || !child) { + (*n)->gencnt++; + } + + out: + op->path[namelen] = '\0'; + if (!namelen) { + assert(!op->watches); + /* + * On completing the recursion back up the path walk and reaching the + * top, assign the new node count if the operation was successful. If + * the main tree was changed, bump its tx ID so that outstanding + * transactions correctly fail. But don't bump it every time; only + * if it makes a difference. + */ + if (!err && op->mutating) { + if (!op->in_transaction) { + if (op->s->root_tx != op->s->last_tx) { + op->s->root_tx = next_tx(op->s); + } + op->s->nr_nodes = op->new_nr_nodes; + } else { + XsTransaction *tx = g_hash_table_lookup(op->s->transactions, + GINT_TO_POINTER(op->tx_id)); + assert(tx); + tx->nr_nodes = op->new_nr_nodes; + } + } + } + return err; +} + +static void append_directory_item(gpointer key, gpointer value, + gpointer user_data) +{ + GList **items = user_data; + + *items = g_list_insert_sorted(*items, g_strdup(key), (GCompareFunc)strcmp); +} + +/* Populates items with char * names which caller must free. 
*/
+static int xs_node_directory(XsNode **n, struct walk_op *op)
+{
+    GList **items = op->op_opaque;
+
+    assert(op->inplace);
+    assert(*n);
+
+    if ((*n)->children) {
+        g_hash_table_foreach((*n)->children, append_directory_item, items);
+    }
+
+    /* The generation count is only reported if the caller asked for it */
+    if (op->op_opaque2) {
+        *(uint64_t *)op->op_opaque2 = (*n)->gencnt;
+    }
+
+    return 0;
+}
+
+/*
+ * Check a caller-supplied path and write its absolute form to outpath.
+ *
+ * A path starting with '/' is copied as-is; anything else is taken to be
+ * relative and is prefixed with "/local/domain/<dom_id>/".
+ *
+ * Returns 0 on success, EINVAL for a malformed path (empty, containing
+ * "//", ending in '/', or using a character outside XS_VALID_CHARS) and
+ * E2BIG if it exceeds the absolute/relative length limit.
+ */
+static int validate_path(char *outpath, const char *userpath,
+                         unsigned int dom_id)
+{
+    size_t i, pathlen = strlen(userpath);
+
+    /*
+     * The trailing-slash test has to look at the last character, not at
+     * userpath[pathlen] (which is always the NUL). The root path "/" is
+     * the one path allowed to end in a slash; transaction_commit() uses it.
+     */
+    if (!pathlen || strstr(userpath, "//") ||
+        (pathlen > 1 && userpath[pathlen - 1] == '/')) {
+        return EINVAL;
+    }
+    for (i = 0; i < pathlen; i++) {
+        if (!strchr(XS_VALID_CHARS, userpath[i])) {
+            return EINVAL;
+        }
+    }
+    if (userpath[0] == '/') {
+        if (pathlen > XENSTORE_ABS_PATH_MAX) {
+            return E2BIG;
+        }
+        memcpy(outpath, userpath, pathlen + 1);
+    } else {
+        if (pathlen > XENSTORE_REL_PATH_MAX) {
+            return E2BIG;
+        }
+        snprintf(outpath, XENSTORE_ABS_PATH_MAX, "/local/domain/%u/%s", dom_id,
+                 userpath);
+    }
+    return 0;
+}
+
+
+static int init_walk_op(XenstoreImplState *s, struct walk_op *op,
+                        xs_transaction_t tx_id, unsigned int dom_id,
+                        const char *path, XsNode ***rootp)
+{
+    int ret = validate_path(op->path, path, dom_id);
+    if (ret) {
+        return ret;
+    }
+
+    /*
+     * We use *two* NUL terminators at the end of the path, as during the walk
+     * we will temporarily turn each '/' into a NUL to allow us to use that
+     * path element for the lookup.
+ */ + op->path[strlen(op->path) + 1] = '\0'; + op->watches = NULL; + op->path[0] = '\0'; + op->inplace = true; + op->mutating = false; + op->create_dirs = false; + op->in_transaction = false; + op->dom_id = dom_id; + op->tx_id = tx_id; + op->s = s; + + if (tx_id == XBT_NULL) { + *rootp = &s->root; + op->new_nr_nodes = s->nr_nodes; + } else { + XsTransaction *tx = g_hash_table_lookup(s->transactions, + GINT_TO_POINTER(tx_id)); + if (!tx) { + return ENOENT; + } + *rootp = &tx->root; + op->new_nr_nodes = tx->nr_nodes; + op->in_transaction = true; + } + + return 0; +} + +int xs_impl_read(XenstoreImplState *s, unsigned int dom_id, + xs_transaction_t tx_id, const char *path, GByteArray *data) +{ + /* + * The data GByteArray shall exist, and will be freed by caller. + * Just g_byte_array_append() to it. + */ + struct walk_op op; + XsNode **n; + int ret; + + ret = init_walk_op(s, &op, tx_id, dom_id, path, &n); + if (ret) { + return ret; + } + op.op_fn = xs_node_get_content; + op.op_opaque = data; + return xs_node_walk(n, &op); +} + +int xs_impl_write(XenstoreImplState *s, unsigned int dom_id, + xs_transaction_t tx_id, const char *path, GByteArray *data) +{ + /* + * The data GByteArray shall exist, will be freed by caller. You are + * free to use g_byte_array_steal() and keep the data. Or just ref it. + */ + struct walk_op op; + XsNode **n; + int ret; + + ret = init_walk_op(s, &op, tx_id, dom_id, path, &n); + if (ret) { + return ret; + } + op.op_fn = xs_node_add_content; + op.op_opaque = data; + op.mutating = true; + op.create_dirs = true; + return xs_node_walk(n, &op); +} + +int xs_impl_directory(XenstoreImplState *s, unsigned int dom_id, + xs_transaction_t tx_id, const char *path, + uint64_t *gencnt, GList **items) +{ + /* + * The items are (char *) to be freed by caller. Although it's consumed + * immediately so if you want to change it to (const char *) and keep + * them, go ahead and change the caller. 
+ */ + struct walk_op op; + XsNode **n; + int ret; + + ret = init_walk_op(s, &op, tx_id, dom_id, path, &n); + if (ret) { + return ret; + } + op.op_fn = xs_node_directory; + op.op_opaque = items; + op.op_opaque2 = gencnt; + return xs_node_walk(n, &op); +} + +int xs_impl_transaction_start(XenstoreImplState *s, unsigned int dom_id, + xs_transaction_t *tx_id) +{ + XsTransaction *tx; + + if (*tx_id != XBT_NULL) { + return EINVAL; + } + + if (dom_id && s->nr_domu_transactions >= XS_MAX_TRANSACTIONS) { + return ENOSPC; + } + + tx = g_new0(XsTransaction, 1); + + tx->nr_nodes = s->nr_nodes; + tx->tx_id = next_tx(s); + tx->base_tx = s->root_tx; + tx->root = xs_node_ref(s->root); + tx->dom_id = dom_id; + + g_hash_table_insert(s->transactions, GINT_TO_POINTER(tx->tx_id), tx); + if (dom_id) { + s->nr_domu_transactions++; + } + *tx_id = tx->tx_id; + return 0; +} + +static gboolean tx_commit_walk(gpointer key, gpointer value, + gpointer user_data) +{ + struct walk_op *op = user_data; + int path_len = strlen(op->path); + int key_len = strlen(key); + bool fire_parents = true; + XsWatch *watch; + XsNode *n = value; + + if (n->ref != 1) { + return false; + } + + if (n->deleted_in_tx) { + /* + * We fire watches on our parents if we are the *first* node + * to be deleted (the topmost one). This matches the behaviour + * when deleting in the live tree. 
+ */ + fire_parents = !op->deleted_in_tx; + + /* Only used on the way down so no need to clear it later */ + op->deleted_in_tx = true; + } + + assert(key_len + path_len + 2 <= sizeof(op->path)); + op->path[path_len] = '/'; + memcpy(op->path + path_len + 1, key, key_len + 1); + + watch = g_hash_table_lookup(op->s->watches, op->path); + if (watch) { + op->watches = g_list_append(op->watches, watch); + } + + if (n->children) { + g_hash_table_foreach_remove(n->children, tx_commit_walk, op); + } + + if (watch) { + op->watches = g_list_remove(op->watches, watch); + } + + /* + * Don't fire watches if this node was only copied because a + * descendent was changed. The modified_in_tx flag indicates the + * ones which were really changed. + */ + if (n->modified_in_tx || n->deleted_in_tx) { + fire_watches(op, fire_parents); + n->modified_in_tx = false; + } + op->path[path_len] = '\0'; + + /* Deleted nodes really do get expunged when we commit */ + return n->deleted_in_tx; +} + +static int transaction_commit(XenstoreImplState *s, XsTransaction *tx) +{ + struct walk_op op; + XsNode **n; + + if (s->root_tx != tx->base_tx) { + return EAGAIN; + } + xs_node_unref(s->root); + s->root = tx->root; + tx->root = NULL; + s->root_tx = tx->tx_id; + s->nr_nodes = tx->nr_nodes; + + init_walk_op(s, &op, XBT_NULL, tx->dom_id, "/", &n); + op.deleted_in_tx = false; + op.mutating = true; + + /* + * Walk the new root and fire watches on any node which has a + * refcount of one (which is therefore unique to this transaction). 
+ */ + if (s->root->children) { + g_hash_table_foreach_remove(s->root->children, tx_commit_walk, &op); + } + + return 0; +} + +int xs_impl_transaction_end(XenstoreImplState *s, unsigned int dom_id, + xs_transaction_t tx_id, bool commit) +{ + int ret = 0; + XsTransaction *tx = g_hash_table_lookup(s->transactions, + GINT_TO_POINTER(tx_id)); + + if (!tx || tx->dom_id != dom_id) { + return ENOENT; + } + + if (commit) { + ret = transaction_commit(s, tx); + } + + g_hash_table_remove(s->transactions, GINT_TO_POINTER(tx_id)); + if (dom_id) { + assert(s->nr_domu_transactions); + s->nr_domu_transactions--; + } + return ret; +} + +int xs_impl_rm(XenstoreImplState *s, unsigned int dom_id, + xs_transaction_t tx_id, const char *path) +{ + struct walk_op op; + XsNode **n; + int ret; + + ret = init_walk_op(s, &op, tx_id, dom_id, path, &n); + if (ret) { + return ret; + } + op.op_fn = xs_node_rm; + op.mutating = true; + return xs_node_walk(n, &op); +} + +int xs_impl_get_perms(XenstoreImplState *s, unsigned int dom_id, + xs_transaction_t tx_id, const char *path, GList **perms) +{ + struct walk_op op; + XsNode **n; + int ret; + + ret = init_walk_op(s, &op, tx_id, dom_id, path, &n); + if (ret) { + return ret; + } + op.op_fn = xs_node_get_perms; + op.op_opaque = perms; + return xs_node_walk(n, &op); +} + +static void is_valid_perm(gpointer data, gpointer user_data) +{ + char *perm = data; + bool *valid = user_data; + char letter; + unsigned int dom_id; + + if (!*valid) { + return; + } + + if (sscanf(perm, "%c%u", &letter, &dom_id) != 2) { + *valid = false; + return; + } + + switch (letter) { + case 'n': + case 'r': + case 'w': + case 'b': + break; + + default: + *valid = false; + break; + } +} + +int xs_impl_set_perms(XenstoreImplState *s, unsigned int dom_id, + xs_transaction_t tx_id, const char *path, GList *perms) +{ + struct walk_op op; + XsNode **n; + bool valid = true; + int ret; + + if (!g_list_length(perms)) { + return EINVAL; + } + + g_list_foreach(perms, is_valid_perm, 
&valid); + if (!valid) { + return EINVAL; + } + + ret = init_walk_op(s, &op, tx_id, dom_id, path, &n); + if (ret) { + return ret; + } + op.op_fn = xs_node_set_perms; + op.op_opaque = perms; + op.mutating = true; + return xs_node_walk(n, &op); +} + +static int do_xs_impl_watch(XenstoreImplState *s, unsigned int dom_id, + const char *path, const char *token, + xs_impl_watch_fn fn, void *opaque) + +{ + char abspath[XENSTORE_ABS_PATH_MAX + 1]; + XsWatch *w, *l; + int ret; + + ret = validate_path(abspath, path, dom_id); + if (ret) { + return ret; + } + + /* Check for duplicates */ + l = w = g_hash_table_lookup(s->watches, abspath); + while (w) { + if (!g_strcmp0(token, w->token) && opaque == w->cb_opaque && + fn == w->cb && dom_id == w->dom_id) { + return EEXIST; + } + w = w->next; + } + + if (dom_id && s->nr_domu_watches >= XS_MAX_WATCHES) { + return E2BIG; + } + + w = g_new0(XsWatch, 1); + w->token = g_strdup(token); + w->cb = fn; + w->cb_opaque = opaque; + w->dom_id = dom_id; + w->rel_prefix = strlen(abspath) - strlen(path); + + /* l was looked up above when checking for duplicates */ + if (l) { + w->next = l->next; + l->next = w; + } else { + g_hash_table_insert(s->watches, g_strdup(abspath), w); + } + if (dom_id) { + s->nr_domu_watches++; + } + + return 0; +} + +int xs_impl_watch(XenstoreImplState *s, unsigned int dom_id, const char *path, + const char *token, xs_impl_watch_fn fn, void *opaque) +{ + int ret = do_xs_impl_watch(s, dom_id, path, token, fn, opaque); + + if (!ret) { + /* A new watch should fire immediately */ + fn(opaque, path, token); + } + + return ret; +} + +static XsWatch *free_watch(XenstoreImplState *s, XsWatch *w) +{ + XsWatch *next = w->next; + + if (w->dom_id) { + assert(s->nr_domu_watches); + s->nr_domu_watches--; + } + + g_free(w->token); + g_free(w); + + return next; +} + +int xs_impl_unwatch(XenstoreImplState *s, unsigned int dom_id, + const char *path, const char *token, + xs_impl_watch_fn fn, void *opaque) +{ + char 
abspath[XENSTORE_ABS_PATH_MAX + 1]; + XsWatch *w, **l; + int ret; + + ret = validate_path(abspath, path, dom_id); + if (ret) { + return ret; + } + + w = g_hash_table_lookup(s->watches, abspath); + if (!w) { + return ENOENT; + } + + /* + * The hash table contains the first element of a list of + * watches. Removing the first element in the list is a + * special case because we have to update the hash table to + * point to the next (or remove it if there's nothing left). + */ + if (!g_strcmp0(token, w->token) && fn == w->cb && opaque == w->cb_opaque && + dom_id == w->dom_id) { + if (w->next) { + /* Insert the previous 'next' into the hash table */ + g_hash_table_insert(s->watches, g_strdup(abspath), w->next); + } else { + /* Nothing left; remove from hash table */ + g_hash_table_remove(s->watches, abspath); + } + free_watch(s, w); + return 0; + } + + /* + * We're all done messing with the hash table because the element + * it points to has survived the cull. Now it's just a simple + * linked list removal operation. + */ + for (l = &w->next; *l; l = &w->next) { + w = *l; + + if (!g_strcmp0(token, w->token) && fn == w->cb && + opaque != w->cb_opaque && dom_id == w->dom_id) { + *l = free_watch(s, w); + return 0; + } + } + + return ENOENT; +} + +int xs_impl_reset_watches(XenstoreImplState *s, unsigned int dom_id) +{ + char **watch_paths; + guint nr_watch_paths; + guint i; + + watch_paths = (char **)g_hash_table_get_keys_as_array(s->watches, + &nr_watch_paths); + + for (i = 0; i < nr_watch_paths; i++) { + XsWatch *w1 = g_hash_table_lookup(s->watches, watch_paths[i]); + XsWatch *w2, *w, **l; + + /* + * w1 is the original list. The hash table has this pointer. + * w2 is the head of our newly-filtered list. + * w and l are temporary for processing. w is somewhat redundant + * with *l but makes my eyes bleed less. 
+ */ + + w = w2 = w1; + l = &w; + while (w) { + if (w->dom_id == dom_id) { + /* If we're freeing the head of the list, bump w2 */ + if (w2 == w) { + w2 = w->next; + } + *l = free_watch(s, w); + } else { + l = &w->next; + } + w = *l; + } + /* + * If the head of the list survived the cull, we don't need to + * touch the hash table and we're done with this path. Else... + */ + if (w1 != w2) { + g_hash_table_steal(s->watches, watch_paths[i]); + + /* + * It was already freed. (Don't worry, this whole thing is + * single-threaded and nobody saw it in the meantime). And + * having *stolen* it, we now own the watch_paths[i] string + * so if we don't give it back to the hash table, we need + * to free it. + */ + if (w2) { + g_hash_table_insert(s->watches, watch_paths[i], w2); + } else { + g_free(watch_paths[i]); + } + } + } + g_free(watch_paths); + return 0; +} + +static void xs_tx_free(void *_tx) +{ + XsTransaction *tx = _tx; + if (tx->root) { + xs_node_unref(tx->root); + } + g_free(tx); +} + +XenstoreImplState *xs_impl_create(unsigned int dom_id) +{ + XenstoreImplState *s = g_new0(XenstoreImplState, 1); + GList *perms; + + s->watches = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, NULL); + s->transactions = g_hash_table_new_full(g_direct_hash, g_direct_equal, + NULL, xs_tx_free); + + perms = g_list_append(NULL, xs_perm_as_string(XS_PERM_NONE, 0)); + s->root = xs_node_create("/", perms); + g_list_free_full(perms, g_free); + s->nr_nodes = 1; + + s->root_tx = s->last_tx = 1; + return s; +} + + +static void clear_serialized_tx(gpointer key, gpointer value, gpointer opaque) +{ + XsNode *n = value; + + n->serialized_tx = XBT_NULL; + if (n->children) { + g_hash_table_foreach(n->children, clear_serialized_tx, NULL); + } +} + +static void clear_tx_serialized_tx(gpointer key, gpointer value, + gpointer opaque) +{ + XsTransaction *t = value; + + clear_serialized_tx(NULL, t->root, NULL); +} + +static void write_be32(GByteArray *save, uint32_t val) +{ + uint32_t be = 
htonl(val); + g_byte_array_append(save, (void *)&be, sizeof(be)); +} + + +struct save_state { + GByteArray *bytes; + unsigned int tx_id; +}; + +#define MODIFIED_IN_TX (1U << 0) +#define DELETED_IN_TX (1U << 1) +#define NODE_REF (1U << 2) + +static void save_node(gpointer key, gpointer value, gpointer opaque) +{ + struct save_state *ss = opaque; + XsNode *n = value; + char *name = key; + uint8_t flag = 0; + + /* Child nodes (i.e. anything but the root) have a name */ + if (name) { + g_byte_array_append(ss->bytes, key, strlen(key) + 1); + } + + /* + * If we already wrote this node, refer to the previous copy. + * There's no rename/move in XenStore, so all we need to find + * it is the tx_id of the transation in which it exists. Which + * may be the root tx. + */ + if (n->serialized_tx != XBT_NULL) { + flag = NODE_REF; + g_byte_array_append(ss->bytes, &flag, 1); + write_be32(ss->bytes, n->serialized_tx); + } else { + GList *l; + n->serialized_tx = ss->tx_id; + + if (n->modified_in_tx) { + flag |= MODIFIED_IN_TX; + } + if (n->deleted_in_tx) { + flag |= DELETED_IN_TX; + } + g_byte_array_append(ss->bytes, &flag, 1); + + if (n->content) { + write_be32(ss->bytes, n->content->len); + g_byte_array_append(ss->bytes, n->content->data, n->content->len); + } else { + write_be32(ss->bytes, 0); + } + + for (l = n->perms; l; l = l->next) { + g_byte_array_append(ss->bytes, l->data, strlen(l->data) + 1); + } + /* NUL termination after perms */ + g_byte_array_append(ss->bytes, (void *)"", 1); + + if (n->children) { + g_hash_table_foreach(n->children, save_node, ss); + } + /* NUL termination after children (child name is NUL) */ + g_byte_array_append(ss->bytes, (void *)"", 1); + } +} + +static void save_tree(struct save_state *ss, uint32_t tx_id, XsNode *root) +{ + write_be32(ss->bytes, tx_id); + ss->tx_id = tx_id; + save_node(NULL, root, ss); +} + +static void save_tx(gpointer key, gpointer value, gpointer opaque) +{ + uint32_t tx_id = GPOINTER_TO_INT(key); + struct save_state *ss = 
opaque; + XsTransaction *n = value; + + write_be32(ss->bytes, n->base_tx); + write_be32(ss->bytes, n->dom_id); + + save_tree(ss, tx_id, n->root); +} + +static void save_watch(gpointer key, gpointer value, gpointer opaque) +{ + struct save_state *ss = opaque; + XsWatch *w = value; + + /* We only save the *guest* watches. */ + if (w->dom_id) { + gpointer relpath = key + w->rel_prefix; + g_byte_array_append(ss->bytes, relpath, strlen(relpath) + 1); + g_byte_array_append(ss->bytes, (void *)w->token, strlen(w->token) + 1); + } +} + +GByteArray *xs_impl_serialize(XenstoreImplState *s) +{ + struct save_state ss; + + ss.bytes = g_byte_array_new(); + + /* + * node = flags [ real_node / node_ref ] + * flags = uint8_t (MODIFIED_IN_TX | DELETED_IN_TX | NODE_REF) + * node_ref = tx_id (in which the original version of this node exists) + * real_node = content perms child* NUL + * content = len data + * len = uint32_t + * data = uint8_t{len} + * perms = perm* NUL + * perm = asciiz + * child = name node + * name = asciiz + * + * tree = tx_id node + * tx_id = uint32_t + * + * transaction = base_tx_id dom_id tree + * base_tx_id = uint32_t + * dom_id = uint32_t + * + * tx_list = tree transaction* XBT_NULL + * + * watch = path token + * path = asciiz + * token = asciiz + * + * watch_list = watch* NUL + * + * xs_serialize_stream = last_tx tx_list watch_list + * last_tx = uint32_t + */ + + /* Clear serialized_tx in every node. 
*/ + if (s->serialized) { + clear_serialized_tx(NULL, s->root, NULL); + g_hash_table_foreach(s->transactions, clear_tx_serialized_tx, NULL); + } + + s->serialized = true; + + write_be32(ss.bytes, s->last_tx); + save_tree(&ss, s->root_tx, s->root); + g_hash_table_foreach(s->transactions, save_tx, &ss); + + write_be32(ss.bytes, XBT_NULL); + + g_hash_table_foreach(s->watches, save_watch, &ss); + g_byte_array_append(ss.bytes, (void *)"", 1); + + return ss.bytes; +} + +struct unsave_state { + char path[XENSTORE_ABS_PATH_MAX + 1]; + XenstoreImplState *s; + GByteArray *bytes; + uint8_t *d; + size_t l; + bool root_walk; +}; + +static int consume_be32(struct unsave_state *us, unsigned int *val) +{ + uint32_t d; + + if (us->l < sizeof(d)) { + return -EINVAL; + } + memcpy(&d, us->d, sizeof(d)); + *val = ntohl(d); + us->d += sizeof(d); + us->l -= sizeof(d); + return 0; +} + +static int consume_string(struct unsave_state *us, char **str, size_t *len) +{ + size_t l; + + if (!us->l) { + return -EINVAL; + } + + l = strnlen((void *)us->d, us->l); + if (l == us->l) { + return -EINVAL; + } + + if (str) { + *str = (void *)us->d; + } + if (len) { + *len = l; + } + + us->d += l + 1; + us->l -= l + 1; + return 0; +} + +static XsNode *lookup_node(XsNode *n, char *path) +{ + char *slash = strchr(path, '/'); + XsNode *child; + + if (path[0] == '\0') { + return n; + } + + if (slash) { + *slash = '\0'; + } + + if (!n->children) { + return NULL; + } + child = g_hash_table_lookup(n->children, path); + if (!slash) { + return child; + } + + *slash = '/'; + if (!child) { + return NULL; + } + return lookup_node(child, slash + 1); +} + +static XsNode *lookup_tx_node(struct unsave_state *us, unsigned int tx_id) +{ + XsTransaction *t; + if (tx_id == us->s->root_tx) { + return lookup_node(us->s->root, us->path + 1); + } + + t = g_hash_table_lookup(us->s->transactions, GINT_TO_POINTER(tx_id)); + if (!t) { + return NULL; + } + g_assert(t->root); + return lookup_node(t->root, us->path + 1); +} + +static 
void count_child_nodes(gpointer key, gpointer value, gpointer user_data) +{ + unsigned int *nr_nodes = user_data; + XsNode *n = value; + + (*nr_nodes)++; + + if (n->children) { + g_hash_table_foreach(n->children, count_child_nodes, nr_nodes); + } +} + +static int consume_node(struct unsave_state *us, XsNode **nodep, + unsigned int *nr_nodes) +{ + XsNode *n = NULL; + uint8_t flags; + int ret; + + if (us->l < 1) { + return -EINVAL; + } + flags = us->d[0]; + us->d++; + us->l--; + + if (flags == NODE_REF) { + unsigned int tx; + + ret = consume_be32(us, &tx); + if (ret) { + return ret; + } + + n = lookup_tx_node(us, tx); + if (!n) { + return -EINVAL; + } + n->ref++; + if (n->children) { + g_hash_table_foreach(n->children, count_child_nodes, nr_nodes); + } + } else { + uint32_t datalen; + + if (flags & ~(DELETED_IN_TX | MODIFIED_IN_TX)) { + return -EINVAL; + } + n = xs_node_new(); + + if (flags & DELETED_IN_TX) { + n->deleted_in_tx = true; + } + if (flags & MODIFIED_IN_TX) { + n->modified_in_tx = true; + } + ret = consume_be32(us, &datalen); + if (ret) { + xs_node_unref(n); + return -EINVAL; + } + if (datalen) { + if (datalen > us->l) { + xs_node_unref(n); + return -EINVAL; + } + + GByteArray *node_data = g_byte_array_new(); + g_byte_array_append(node_data, us->d, datalen); + us->d += datalen; + us->l -= datalen; + n->content = node_data; + + if (us->root_walk) { + n->modified_in_tx = true; + } + } + while (1) { + char *perm = NULL; + size_t permlen = 0; + + ret = consume_string(us, &perm, &permlen); + if (ret) { + xs_node_unref(n); + return ret; + } + + if (!permlen) { + break; + } + + n->perms = g_list_append(n->perms, g_strdup(perm)); + } + + /* Now children */ + while (1) { + size_t childlen; + char *childname; + char *pathend; + XsNode *child = NULL; + + ret = consume_string(us, &childname, &childlen); + if (ret) { + xs_node_unref(n); + return ret; + } + + if (!childlen) { + break; + } + + pathend = us->path + strlen(us->path); + strncat(us->path, "/", 
sizeof(us->path) - 1); + strncat(us->path, childname, sizeof(us->path) - 1); + + ret = consume_node(us, &child, nr_nodes); + *pathend = '\0'; + if (ret) { + xs_node_unref(n); + return ret; + } + g_assert(child); + xs_node_add_child(n, childname, child); + } + + /* + * If the node has no data and no children we still want to fire + * a watch on it. + */ + if (us->root_walk && !n->children) { + n->modified_in_tx = true; + } + } + + if (!n->deleted_in_tx) { + (*nr_nodes)++; + } + + *nodep = n; + return 0; +} + +static int consume_tree(struct unsave_state *us, XsTransaction *t) +{ + int ret; + + ret = consume_be32(us, &t->tx_id); + if (ret) { + return ret; + } + + if (t->tx_id > us->s->last_tx) { + return -EINVAL; + } + + us->path[0] = '\0'; + + return consume_node(us, &t->root, &t->nr_nodes); +} + +int xs_impl_deserialize(XenstoreImplState *s, GByteArray *bytes, + unsigned int dom_id, xs_impl_watch_fn watch_fn, + void *watch_opaque) +{ + struct unsave_state us; + XsTransaction base_t = { 0 }; + int ret; + + us.s = s; + us.bytes = bytes; + us.d = bytes->data; + us.l = bytes->len; + + xs_impl_reset_watches(s, dom_id); + g_hash_table_remove_all(s->transactions); + + xs_node_unref(s->root); + s->root = NULL; + s->root_tx = s->last_tx = XBT_NULL; + + ret = consume_be32(&us, &s->last_tx); + if (ret) { + return ret; + } + + /* + * Consume the base tree into a transaction so that watches can be + * fired as we commit it. By setting us.root_walk we cause the nodes + * to be marked as 'modified_in_tx' as they are created, so that the + * watches are triggered on them. + */ + base_t.dom_id = dom_id; + base_t.base_tx = XBT_NULL; + us.root_walk = true; + ret = consume_tree(&us, &base_t); + if (ret) { + return ret; + } + us.root_walk = false; + + /* + * Commit the transaction now while the refcount on all nodes is 1. + * Note that we haven't yet reinstated the *guest* watches but that's + * OK because we don't want the guest to see any changes. 
Even any + * backend nodes which get recreated should be *precisely* as they + * were before the migration. Back ends may have been instantiated + * already, and will see the frontend magically blink into existence + * now (well, from the aio_bh which fires the watches). It's their + * responsibility to rebuild everything precisely as it was before. + */ + ret = transaction_commit(s, &base_t); + if (ret) { + return ret; + } + + while (1) { + unsigned int base_tx; + XsTransaction *t; + + ret = consume_be32(&us, &base_tx); + if (ret) { + return ret; + } + if (base_tx == XBT_NULL) { + break; + } + + t = g_new0(XsTransaction, 1); + t->base_tx = base_tx; + + ret = consume_be32(&us, &t->dom_id); + if (!ret) { + ret = consume_tree(&us, t); + } + if (ret) { + g_free(t); + return ret; + } + g_assert(t->root); + if (t->dom_id) { + s->nr_domu_transactions++; + } + g_hash_table_insert(s->transactions, GINT_TO_POINTER(t->tx_id), t); + } + + while (1) { + char *path, *token; + size_t pathlen, toklen; + + ret = consume_string(&us, &path, &pathlen); + if (ret) { + return ret; + } + if (!pathlen) { + break; + } + + ret = consume_string(&us, &token, &toklen); + if (ret) { + return ret; + } + + if (!watch_fn) { + continue; + } + + ret = do_xs_impl_watch(s, dom_id, path, token, watch_fn, watch_opaque); + if (ret) { + return ret; + } + } + + if (us.l) { + return -EINVAL; + } + + return 0; +} diff --git a/hw/i386/kvm/xenstore_impl.h b/hw/i386/kvm/xenstore_impl.h new file mode 100644 index 0000000000..0df2a91aae --- /dev/null +++ b/hw/i386/kvm/xenstore_impl.h @@ -0,0 +1,63 @@ +/* + * QEMU Xen emulation: The actual implementation of XenStore + * + * Copyright © 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Authors: David Woodhouse <dwmw2@infradead.org> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ */ + +#ifndef QEMU_XENSTORE_IMPL_H +#define QEMU_XENSTORE_IMPL_H + +#include "hw/xen/xen_backend_ops.h" + +typedef struct XenstoreImplState XenstoreImplState; + +XenstoreImplState *xs_impl_create(unsigned int dom_id); + +char *xs_perm_as_string(unsigned int perm, unsigned int domid); + +/* + * These functions return *positive* error numbers. This is a little + * unconventional but it helps to keep us honest because there is + * also a very limited set of error numbers that they are permitted + * to return (those in xsd_errors). + */ + +int xs_impl_read(XenstoreImplState *s, unsigned int dom_id, + xs_transaction_t tx_id, const char *path, GByteArray *data); +int xs_impl_write(XenstoreImplState *s, unsigned int dom_id, + xs_transaction_t tx_id, const char *path, GByteArray *data); +int xs_impl_directory(XenstoreImplState *s, unsigned int dom_id, + xs_transaction_t tx_id, const char *path, + uint64_t *gencnt, GList **items); +int xs_impl_transaction_start(XenstoreImplState *s, unsigned int dom_id, + xs_transaction_t *tx_id); +int xs_impl_transaction_end(XenstoreImplState *s, unsigned int dom_id, + xs_transaction_t tx_id, bool commit); +int xs_impl_rm(XenstoreImplState *s, unsigned int dom_id, + xs_transaction_t tx_id, const char *path); +int xs_impl_get_perms(XenstoreImplState *s, unsigned int dom_id, + xs_transaction_t tx_id, const char *path, GList **perms); +int xs_impl_set_perms(XenstoreImplState *s, unsigned int dom_id, + xs_transaction_t tx_id, const char *path, GList *perms); + +/* This differs from xs_watch_fn because it has the token */ +typedef void(xs_impl_watch_fn)(void *opaque, const char *path, + const char *token); +int xs_impl_watch(XenstoreImplState *s, unsigned int dom_id, const char *path, + const char *token, xs_impl_watch_fn fn, void *opaque); +int xs_impl_unwatch(XenstoreImplState *s, unsigned int dom_id, + const char *path, const char *token, xs_impl_watch_fn fn, + void *opaque); +int xs_impl_reset_watches(XenstoreImplState *s, unsigned int 
dom_id); + +GByteArray *xs_impl_serialize(XenstoreImplState *s); +int xs_impl_deserialize(XenstoreImplState *s, GByteArray *bytes, + unsigned int dom_id, xs_impl_watch_fn watch_fn, + void *watch_opaque); + +#endif /* QEMU_XENSTORE_IMPL_H */ diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 7bebea57e3..1489abf010 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -102,6 +102,11 @@ #include "trace.h" #include CONFIG_DEVICES +#ifdef CONFIG_XEN_EMU +#include "hw/xen/xen-legacy-backend.h" +#include "hw/xen/xen-bus.h" +#endif + /* * Helper for setting model-id for CPU models that changed model-id * depending on QEMU versions up to QEMU 2.4. @@ -1318,6 +1323,8 @@ void pc_basic_device_init(struct PCMachineState *pcms, if (pcms->bus) { pci_create_simple(pcms->bus, -1, "xen-platform"); } + xen_bus_init(); + xen_be_init(); } #endif diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index 2f16011bab..30eedd62a3 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -47,8 +47,6 @@ #include "hw/kvm/clock.h" #include "hw/sysbus.h" #include "hw/i2c/smbus_eeprom.h" -#include "hw/xen/xen-x86.h" -#include "hw/xen/xen.h" #include "exec/memory.h" #include "hw/acpi/acpi.h" #include "hw/acpi/piix4.h" @@ -60,6 +58,8 @@ #include <xen/hvm/hvm_info_table.h> #include "hw/xen/xen_pt.h" #endif +#include "hw/xen/xen-x86.h" +#include "hw/xen/xen.h" #include "migration/global_state.h" #include "migration/misc.h" #include "sysemu/numa.h" @@ -422,6 +422,7 @@ static void pc_xen_hvm_init(MachineState *machine) } pc_xen_hvm_init_pci(machine); + xen_igd_reserve_slot(pcms->bus); pci_create_simple(pcms->bus, -1, "xen-platform"); } #endif diff --git a/hw/i386/xen/xen-hvm.c b/hw/i386/xen/xen-hvm.c index e5a1dd19f4..56641a550e 100644 --- a/hw/i386/xen/xen-hvm.c +++ b/hw/i386/xen/xen-hvm.c @@ -18,7 +18,7 @@ #include "hw/irq.h" #include "hw/hw.h" #include "hw/i386/apic-msidef.h" -#include "hw/xen/xen_common.h" +#include "hw/xen/xen_native.h" #include "hw/xen/xen-legacy-backend.h" #include "hw/xen/xen-bus.h" 
#include "hw/xen/xen-x86.h" @@ -52,10 +52,11 @@ static bool xen_in_migration; /* Compatibility with older version */ -/* This allows QEMU to build on a system that has Xen 4.5 or earlier - * installed. This here (not in hw/xen/xen_common.h) because xen/hvm/ioreq.h - * needs to be included before this block and hw/xen/xen_common.h needs to - * be included before xen/hvm/ioreq.h +/* + * This allows QEMU to build on a system that has Xen 4.5 or earlier installed. + * This is here (not in hw/xen/xen_native.h) because xen/hvm/ioreq.h needs to + * be included before this block and hw/xen/xen_native.h needs to be included + * before xen/hvm/ioreq.h */ #ifndef IOREQ_TYPE_VMWARE_PORT #define IOREQ_TYPE_VMWARE_PORT 3 @@ -761,7 +762,7 @@ static ioreq_t *cpu_get_ioreq(XenIOState *state) int i; evtchn_port_t port; - port = xenevtchn_pending(state->xce_handle); + port = qemu_xen_evtchn_pending(state->xce_handle); if (port == state->bufioreq_local_port) { timer_mod(state->buffered_io_timer, BUFFER_IO_MAX_DELAY + qemu_clock_get_ms(QEMU_CLOCK_REALTIME)); @@ -780,7 +781,7 @@ static ioreq_t *cpu_get_ioreq(XenIOState *state) } /* unmask the wanted port again */ - xenevtchn_unmask(state->xce_handle, port); + qemu_xen_evtchn_unmask(state->xce_handle, port); /* get the io packet from shared memory */ state->send_vcpu = i; @@ -1147,7 +1148,7 @@ static void handle_buffered_io(void *opaque) BUFFER_IO_MAX_DELAY + qemu_clock_get_ms(QEMU_CLOCK_REALTIME)); } else { timer_del(state->buffered_io_timer); - xenevtchn_unmask(state->xce_handle, state->bufioreq_local_port); + qemu_xen_evtchn_unmask(state->xce_handle, state->bufioreq_local_port); } } @@ -1196,8 +1197,8 @@ static void cpu_handle_ioreq(void *opaque) } req->state = STATE_IORESP_READY; - xenevtchn_notify(state->xce_handle, - state->ioreq_local_port[state->send_vcpu]); + qemu_xen_evtchn_notify(state->xce_handle, + state->ioreq_local_port[state->send_vcpu]); } } @@ -1206,7 +1207,7 @@ static void xen_main_loop_prepare(XenIOState *state) int 
evtchn_fd = -1; if (state->xce_handle != NULL) { - evtchn_fd = xenevtchn_fd(state->xce_handle); + evtchn_fd = qemu_xen_evtchn_fd(state->xce_handle); } state->buffered_io_timer = timer_new_ms(QEMU_CLOCK_REALTIME, handle_buffered_io, @@ -1249,7 +1250,7 @@ static void xen_exit_notifier(Notifier *n, void *data) xenforeignmemory_unmap_resource(xen_fmem, state->fres); } - xenevtchn_close(state->xce_handle); + qemu_xen_evtchn_close(state->xce_handle); xs_daemon_close(state->xenstore); } @@ -1397,9 +1398,11 @@ void xen_hvm_init_pc(PCMachineState *pcms, MemoryRegion **ram_memory) xen_pfn_t ioreq_pfn; XenIOState *state; + setup_xen_backend_ops(); + state = g_new0(XenIOState, 1); - state->xce_handle = xenevtchn_open(NULL, 0); + state->xce_handle = qemu_xen_evtchn_open(); if (state->xce_handle == NULL) { perror("xen: event channel open"); goto err; @@ -1463,8 +1466,9 @@ void xen_hvm_init_pc(PCMachineState *pcms, MemoryRegion **ram_memory) /* FIXME: how about if we overflow the page here? */ for (i = 0; i < max_cpus; i++) { - rc = xenevtchn_bind_interdomain(state->xce_handle, xen_domid, - xen_vcpu_eport(state->shared_page, i)); + rc = qemu_xen_evtchn_bind_interdomain(state->xce_handle, xen_domid, + xen_vcpu_eport(state->shared_page, + i)); if (rc == -1) { error_report("shared evtchn %d bind error %d", i, errno); goto err; @@ -1472,8 +1476,8 @@ void xen_hvm_init_pc(PCMachineState *pcms, MemoryRegion **ram_memory) state->ioreq_local_port[i] = rc; } - rc = xenevtchn_bind_interdomain(state->xce_handle, xen_domid, - state->bufioreq_remote_port); + rc = qemu_xen_evtchn_bind_interdomain(state->xce_handle, xen_domid, + state->bufioreq_remote_port); if (rc == -1) { error_report("buffered evtchn bind error %d", errno); goto err; diff --git a/hw/i386/xen/xen-mapcache.c b/hw/i386/xen/xen-mapcache.c index 1d0879d234..f7d974677d 100644 --- a/hw/i386/xen/xen-mapcache.c +++ b/hw/i386/xen/xen-mapcache.c @@ -14,7 +14,7 @@ #include <sys/resource.h> -#include "hw/xen/xen-legacy-backend.h" 
+#include "hw/xen/xen_native.h" #include "qemu/bitmap.h" #include "sysemu/runstate.h" diff --git a/hw/i386/xen/xen_platform.c b/hw/i386/xen/xen_platform.c index 539f7da374..57f1d742c1 100644 --- a/hw/i386/xen/xen_platform.c +++ b/hw/i386/xen/xen_platform.c @@ -28,7 +28,6 @@ #include "hw/ide/pci.h" #include "hw/pci/pci.h" #include "migration/vmstate.h" -#include "hw/xen/xen.h" #include "net/net.h" #include "trace.h" #include "sysemu/xen.h" @@ -38,10 +37,12 @@ #include "qom/object.h" #ifdef CONFIG_XEN -#include "hw/xen/xen_common.h" -#include "hw/xen/xen-legacy-backend.h" +#include "hw/xen/xen_native.h" #endif +/* The rule is that xen_native.h must come first */ +#include "hw/xen/xen.h" + //#define DEBUG_PLATFORM #ifdef DEBUG_PLATFORM diff --git a/hw/intc/i8259.c b/hw/intc/i8259.c index 17910f3bcb..bbae2d87f4 100644 --- a/hw/intc/i8259.c +++ b/hw/intc/i8259.c @@ -133,7 +133,7 @@ static void pic_set_irq(void *opaque, int irq, int level) } #endif - if (s->elcr & mask) { + if (s->ltim || (s->elcr & mask)) { /* level triggered */ if (level) { s->irr |= mask; @@ -167,7 +167,7 @@ static void pic_intack(PICCommonState *s, int irq) s->isr |= (1 << irq); } /* We don't clear a level sensitive interrupt here */ - if (!(s->elcr & (1 << irq))) { + if (!s->ltim && !(s->elcr & (1 << irq))) { s->irr &= ~(1 << irq); } pic_update_irq(s); @@ -224,6 +224,7 @@ static void pic_reset(DeviceState *dev) PICCommonState *s = PIC_COMMON(dev); s->elcr = 0; + s->ltim = 0; pic_init_reset(s); } @@ -243,10 +244,7 @@ static void pic_ioport_write(void *opaque, hwaddr addr64, s->init_state = 1; s->init4 = val & 1; s->single_mode = val & 2; - if (val & 0x08) { - qemu_log_mask(LOG_UNIMP, - "i8259: level sensitive irq not supported\n"); - } + s->ltim = val & 8; } else if (val & 0x08) { if (val & 0x04) { s->poll = 1; diff --git a/hw/intc/i8259_common.c b/hw/intc/i8259_common.c index af2e4a2241..c931dc2d07 100644 --- a/hw/intc/i8259_common.c +++ b/hw/intc/i8259_common.c @@ -51,7 +51,7 @@ void 
pic_reset_common(PICCommonState *s) s->special_fully_nested_mode = 0; s->init4 = 0; s->single_mode = 0; - /* Note: ELCR is not reset */ + /* Note: ELCR and LTIM are not reset */ } static int pic_dispatch_pre_save(void *opaque) @@ -144,6 +144,24 @@ static void pic_print_info(InterruptStatsProvider *obj, Monitor *mon) s->special_fully_nested_mode); } +static bool ltim_state_needed(void *opaque) +{ + PICCommonState *s = PIC_COMMON(opaque); + + return !!s->ltim; +} + +static const VMStateDescription vmstate_pic_ltim = { + .name = "i8259/ltim", + .version_id = 1, + .minimum_version_id = 1, + .needed = ltim_state_needed, + .fields = (VMStateField[]) { + VMSTATE_UINT8(ltim, PICCommonState), + VMSTATE_END_OF_LIST() + } +}; + static const VMStateDescription vmstate_pic_common = { .name = "i8259", .version_id = 1, @@ -168,6 +186,10 @@ static const VMStateDescription vmstate_pic_common = { VMSTATE_UINT8(single_mode, PICCommonState), VMSTATE_UINT8(elcr, PICCommonState), VMSTATE_END_OF_LIST() + }, + .subsections = (const VMStateDescription*[]) { + &vmstate_pic_ltim, + NULL } }; diff --git a/hw/intc/mips_gic.c b/hw/intc/mips_gic.c index bda4549925..4bdc3b1bd1 100644 --- a/hw/intc/mips_gic.c +++ b/hw/intc/mips_gic.c @@ -439,8 +439,8 @@ static void mips_gic_realize(DeviceState *dev, Error **errp) } static Property mips_gic_properties[] = { - DEFINE_PROP_INT32("num-vp", MIPSGICState, num_vps, 1), - DEFINE_PROP_INT32("num-irq", MIPSGICState, num_irq, 256), + DEFINE_PROP_UINT32("num-vp", MIPSGICState, num_vps, 1), + DEFINE_PROP_UINT32("num-irq", MIPSGICState, num_irq, 256), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/intc/riscv_aclint.c b/hw/intc/riscv_aclint.c index eee04643cb..b466a6abaf 100644 --- a/hw/intc/riscv_aclint.c +++ b/hw/intc/riscv_aclint.c @@ -130,7 +130,7 @@ static uint64_t riscv_aclint_mtimer_read(void *opaque, hwaddr addr, addr < (mtimer->timecmp_base + (mtimer->num_harts << 3))) { size_t hartid = mtimer->hartid_base + ((addr - mtimer->timecmp_base) >> 3); - 
CPUState *cpu = qemu_get_cpu(hartid); + CPUState *cpu = cpu_by_arch_id(hartid); CPURISCVState *env = cpu ? cpu->env_ptr : NULL; if (!env) { qemu_log_mask(LOG_GUEST_ERROR, @@ -173,7 +173,7 @@ static void riscv_aclint_mtimer_write(void *opaque, hwaddr addr, addr < (mtimer->timecmp_base + (mtimer->num_harts << 3))) { size_t hartid = mtimer->hartid_base + ((addr - mtimer->timecmp_base) >> 3); - CPUState *cpu = qemu_get_cpu(hartid); + CPUState *cpu = cpu_by_arch_id(hartid); CPURISCVState *env = cpu ? cpu->env_ptr : NULL; if (!env) { qemu_log_mask(LOG_GUEST_ERROR, @@ -231,7 +231,7 @@ static void riscv_aclint_mtimer_write(void *opaque, hwaddr addr, /* Check if timer interrupt is triggered for each hart. */ for (i = 0; i < mtimer->num_harts; i++) { - CPUState *cpu = qemu_get_cpu(mtimer->hartid_base + i); + CPUState *cpu = cpu_by_arch_id(mtimer->hartid_base + i); CPURISCVState *env = cpu ? cpu->env_ptr : NULL; if (!env) { continue; @@ -292,7 +292,7 @@ static void riscv_aclint_mtimer_realize(DeviceState *dev, Error **errp) s->timecmp = g_new0(uint64_t, s->num_harts); /* Claim timer interrupt bits */ for (i = 0; i < s->num_harts; i++) { - RISCVCPU *cpu = RISCV_CPU(qemu_get_cpu(s->hartid_base + i)); + RISCVCPU *cpu = RISCV_CPU(cpu_by_arch_id(s->hartid_base + i)); if (riscv_cpu_claim_interrupts(cpu, MIP_MTIP) < 0) { error_report("MTIP already claimed"); exit(1); @@ -372,7 +372,7 @@ DeviceState *riscv_aclint_mtimer_create(hwaddr addr, hwaddr size, sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, addr); for (i = 0; i < num_harts; i++) { - CPUState *cpu = qemu_get_cpu(hartid_base + i); + CPUState *cpu = cpu_by_arch_id(hartid_base + i); RISCVCPU *rvcpu = RISCV_CPU(cpu); CPURISCVState *env = cpu ? 
cpu->env_ptr : NULL; riscv_aclint_mtimer_callback *cb = @@ -407,7 +407,7 @@ static uint64_t riscv_aclint_swi_read(void *opaque, hwaddr addr, if (addr < (swi->num_harts << 2)) { size_t hartid = swi->hartid_base + (addr >> 2); - CPUState *cpu = qemu_get_cpu(hartid); + CPUState *cpu = cpu_by_arch_id(hartid); CPURISCVState *env = cpu ? cpu->env_ptr : NULL; if (!env) { qemu_log_mask(LOG_GUEST_ERROR, @@ -430,7 +430,7 @@ static void riscv_aclint_swi_write(void *opaque, hwaddr addr, uint64_t value, if (addr < (swi->num_harts << 2)) { size_t hartid = swi->hartid_base + (addr >> 2); - CPUState *cpu = qemu_get_cpu(hartid); + CPUState *cpu = cpu_by_arch_id(hartid); CPURISCVState *env = cpu ? cpu->env_ptr : NULL; if (!env) { qemu_log_mask(LOG_GUEST_ERROR, @@ -545,7 +545,7 @@ DeviceState *riscv_aclint_swi_create(hwaddr addr, uint32_t hartid_base, sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, addr); for (i = 0; i < num_harts; i++) { - CPUState *cpu = qemu_get_cpu(hartid_base + i); + CPUState *cpu = cpu_by_arch_id(hartid_base + i); RISCVCPU *rvcpu = RISCV_CPU(cpu); qdev_connect_gpio_out(dev, i, diff --git a/hw/intc/riscv_aplic.c b/hw/intc/riscv_aplic.c index cfd007e629..cd7efc4ad4 100644 --- a/hw/intc/riscv_aplic.c +++ b/hw/intc/riscv_aplic.c @@ -833,7 +833,7 @@ static void riscv_aplic_realize(DeviceState *dev, Error **errp) /* Claim the CPU interrupt to be triggered by this APLIC */ for (i = 0; i < aplic->num_harts; i++) { - RISCVCPU *cpu = RISCV_CPU(qemu_get_cpu(aplic->hartid_base + i)); + RISCVCPU *cpu = RISCV_CPU(cpu_by_arch_id(aplic->hartid_base + i)); if (riscv_cpu_claim_interrupts(cpu, (aplic->mmode) ? 
MIP_MEIP : MIP_SEIP) < 0) { error_report("%s already claimed", @@ -966,7 +966,7 @@ DeviceState *riscv_aplic_create(hwaddr addr, hwaddr size, if (!msimode) { for (i = 0; i < num_harts; i++) { - CPUState *cpu = qemu_get_cpu(hartid_base + i); + CPUState *cpu = cpu_by_arch_id(hartid_base + i); qdev_connect_gpio_out_named(dev, NULL, i, qdev_get_gpio_in(DEVICE(cpu), diff --git a/hw/intc/riscv_imsic.c b/hw/intc/riscv_imsic.c index 4d4d5b50ca..fea3385b51 100644 --- a/hw/intc/riscv_imsic.c +++ b/hw/intc/riscv_imsic.c @@ -316,8 +316,8 @@ static const MemoryRegionOps riscv_imsic_ops = { static void riscv_imsic_realize(DeviceState *dev, Error **errp) { RISCVIMSICState *imsic = RISCV_IMSIC(dev); - RISCVCPU *rcpu = RISCV_CPU(qemu_get_cpu(imsic->hartid)); - CPUState *cpu = qemu_get_cpu(imsic->hartid); + RISCVCPU *rcpu = RISCV_CPU(cpu_by_arch_id(imsic->hartid)); + CPUState *cpu = cpu_by_arch_id(imsic->hartid); CPURISCVState *env = cpu ? cpu->env_ptr : NULL; imsic->num_eistate = imsic->num_pages * imsic->num_irqs; @@ -413,7 +413,7 @@ DeviceState *riscv_imsic_create(hwaddr addr, uint32_t hartid, bool mmode, uint32_t num_pages, uint32_t num_ids) { DeviceState *dev = qdev_new(TYPE_RISCV_IMSIC); - CPUState *cpu = qemu_get_cpu(hartid); + CPUState *cpu = cpu_by_arch_id(hartid); uint32_t i; assert(!(addr & (IMSIC_MMIO_PAGE_SZ - 1))); diff --git a/hw/isa/i82378.c b/hw/isa/i82378.c index 233059c6dc..5432ab5065 100644 --- a/hw/isa/i82378.c +++ b/hw/isa/i82378.c @@ -47,6 +47,12 @@ static const VMStateDescription vmstate_i82378 = { }, }; +static void i82378_request_out0_irq(void *opaque, int irq, int level) +{ + I82378State *s = opaque; + qemu_set_irq(s->cpu_intr, level); +} + static void i82378_request_pic_irq(void *opaque, int irq, int level) { DeviceState *dev = opaque; @@ -88,7 +94,9 @@ static void i82378_realize(PCIDevice *pci, Error **errp) */ /* 2 82C59 (irq) */ - s->isa_irqs_in = i8259_init(isabus, s->cpu_intr); + s->isa_irqs_in = i8259_init(isabus, + 
qemu_allocate_irq(i82378_request_out0_irq, + s, 0)); isa_bus_register_input_irqs(isabus, s->isa_irqs_in); /* 1 82C54 (pit) */ diff --git a/hw/isa/lpc_ich9.c b/hw/isa/lpc_ich9.c index d8303d0322..9714b0001e 100644 --- a/hw/isa/lpc_ich9.c +++ b/hw/isa/lpc_ich9.c @@ -865,6 +865,7 @@ static void ich9_lpc_class_init(ObjectClass *klass, void *data) hc->plug = ich9_pm_device_plug_cb; hc->unplug_request = ich9_pm_device_unplug_request_cb; hc->unplug = ich9_pm_device_unplug_cb; + hc->is_hotpluggable_bus = ich9_pm_is_hotpluggable_bus; adevc->ospm_status = ich9_pm_ospm_status; adevc->send_event = ich9_send_gpe; adevc->madt_cpu = pc_madt_cpu_entry; diff --git a/hw/isa/trace-events b/hw/isa/trace-events index c4567a9b47..1816e8307a 100644 --- a/hw/isa/trace-events +++ b/hw/isa/trace-events @@ -16,6 +16,7 @@ apm_io_write(uint8_t addr, uint8_t val) "write addr=0x%x val=0x%02x" # vt82c686.c via_isa_write(uint32_t addr, uint32_t val, int len) "addr 0x%x val 0x%x len 0x%x" +via_pm_read(uint32_t addr, uint32_t val, int len) "addr 0x%x val 0x%x len 0x%x" via_pm_write(uint32_t addr, uint32_t val, int len) "addr 0x%x val 0x%x len 0x%x" via_pm_io_read(uint32_t addr, uint32_t val, int len) "addr 0x%x val 0x%x len 0x%x" via_pm_io_write(uint32_t addr, uint32_t val, int len) "addr 0x%x val 0x%x len 0x%x" diff --git a/hw/isa/vt82c686.c b/hw/isa/vt82c686.c index f4c40965cd..ca89119ce0 100644 --- a/hw/isa/vt82c686.c +++ b/hw/isa/vt82c686.c @@ -554,7 +554,7 @@ struct ViaISAState { PCIIDEState ide; UHCIState uhci[2]; ViaPMState pm; - PCIDevice ac97; + ViaAC97State ac97; PCIDevice mc97; }; @@ -598,15 +598,63 @@ void via_isa_set_irq(PCIDevice *d, int n, int level) qemu_set_irq(s->isa_irqs_in[n], level); } +static void via_isa_request_i8259_irq(void *opaque, int irq, int level) +{ + ViaISAState *s = opaque; + qemu_set_irq(s->cpu_intr, level); +} + +static int via_isa_get_pci_irq(const ViaISAState *s, int irq_num) +{ + switch (irq_num) { + case 0: + return s->dev.config[0x55] >> 4; + case 1: + return 
s->dev.config[0x56] & 0xf; + case 2: + return s->dev.config[0x56] >> 4; + case 3: + return s->dev.config[0x57] >> 4; + } + return 0; +} + +static void via_isa_set_pci_irq(void *opaque, int irq_num, int level) +{ + ViaISAState *s = opaque; + PCIBus *bus = pci_get_bus(&s->dev); + int i, pic_level, pic_irq = via_isa_get_pci_irq(s, irq_num); + + /* IRQ 0: disabled, IRQ 2,8,13: reserved */ + if (!pic_irq) { + return; + } + if (unlikely(pic_irq == 2 || pic_irq == 8 || pic_irq == 13)) { + qemu_log_mask(LOG_GUEST_ERROR, "Invalid ISA IRQ routing"); + } + + /* The pic level is the logical OR of all the PCI irqs mapped to it. */ + pic_level = 0; + for (i = 0; i < PCI_NUM_PINS; i++) { + if (pic_irq == via_isa_get_pci_irq(s, i)) { + pic_level |= pci_bus_get_irq_level(bus, i); + } + } + /* Now we change the pic irq level according to the via irq mappings. */ + qemu_set_irq(s->isa_irqs_in[pic_irq], pic_level); +} + static void via_isa_realize(PCIDevice *d, Error **errp) { ViaISAState *s = VIA_ISA(d); DeviceState *dev = DEVICE(d); PCIBus *pci_bus = pci_get_bus(d); + qemu_irq *isa_irq; ISABus *isa_bus; int i; qdev_init_gpio_out(dev, &s->cpu_intr, 1); + isa_irq = qemu_allocate_irqs(via_isa_request_i8259_irq, s, 1); isa_bus = isa_bus_new(dev, pci_address_space(d), pci_address_space_io(d), errp); @@ -614,11 +662,13 @@ static void via_isa_realize(PCIDevice *d, Error **errp) return; } - s->isa_irqs_in = i8259_init(isa_bus, s->cpu_intr); + s->isa_irqs_in = i8259_init(isa_bus, *isa_irq); isa_bus_register_input_irqs(isa_bus, s->isa_irqs_in); i8254_pit_init(isa_bus, 0x40, 0, NULL); i8257_dma_init(isa_bus, 0); + qdev_init_gpio_in_named(dev, via_isa_set_pci_irq, "pirq", PCI_NUM_PINS); + /* RTC */ qdev_prop_set_int32(DEVICE(&s->rtc), "base_year", 2000); if (!qdev_realize(DEVICE(&s->rtc), BUS(isa_bus), errp)) { diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c index 217a5e639b..abe60b362c 100644 --- a/hw/mem/cxl_type3.c +++ b/hw/mem/cxl_type3.c @@ -1,6 +1,7 @@ #include "qemu/osdep.h" 
#include "qemu/units.h" #include "qemu/error-report.h" +#include "qapi/qapi-commands-cxl.h" #include "hw/mem/memory-device.h" #include "hw/mem/pc-dimm.h" #include "hw/pci/pci.h" @@ -250,6 +251,7 @@ static void ct3d_config_write(PCIDevice *pci_dev, uint32_t addr, uint32_t val, pcie_doe_write_config(&ct3d->doe_cdat, addr, val, size); pci_default_write_config(pci_dev, addr, val, size); + pcie_aer_write_config(pci_dev, addr, val, size); } /* @@ -322,6 +324,66 @@ static void hdm_decoder_commit(CXLType3Dev *ct3d, int which) ARRAY_FIELD_DP32(cache_mem, CXL_HDM_DECODER0_CTRL, COMMITTED, 1); } +static int ct3d_qmp_uncor_err_to_cxl(CxlUncorErrorType qmp_err) +{ + switch (qmp_err) { + case CXL_UNCOR_ERROR_TYPE_CACHE_DATA_PARITY: + return CXL_RAS_UNC_ERR_CACHE_DATA_PARITY; + case CXL_UNCOR_ERROR_TYPE_CACHE_ADDRESS_PARITY: + return CXL_RAS_UNC_ERR_CACHE_ADDRESS_PARITY; + case CXL_UNCOR_ERROR_TYPE_CACHE_BE_PARITY: + return CXL_RAS_UNC_ERR_CACHE_BE_PARITY; + case CXL_UNCOR_ERROR_TYPE_CACHE_DATA_ECC: + return CXL_RAS_UNC_ERR_CACHE_DATA_ECC; + case CXL_UNCOR_ERROR_TYPE_MEM_DATA_PARITY: + return CXL_RAS_UNC_ERR_MEM_DATA_PARITY; + case CXL_UNCOR_ERROR_TYPE_MEM_ADDRESS_PARITY: + return CXL_RAS_UNC_ERR_MEM_ADDRESS_PARITY; + case CXL_UNCOR_ERROR_TYPE_MEM_BE_PARITY: + return CXL_RAS_UNC_ERR_MEM_BE_PARITY; + case CXL_UNCOR_ERROR_TYPE_MEM_DATA_ECC: + return CXL_RAS_UNC_ERR_MEM_DATA_ECC; + case CXL_UNCOR_ERROR_TYPE_REINIT_THRESHOLD: + return CXL_RAS_UNC_ERR_REINIT_THRESHOLD; + case CXL_UNCOR_ERROR_TYPE_RSVD_ENCODING: + return CXL_RAS_UNC_ERR_RSVD_ENCODING; + case CXL_UNCOR_ERROR_TYPE_POISON_RECEIVED: + return CXL_RAS_UNC_ERR_POISON_RECEIVED; + case CXL_UNCOR_ERROR_TYPE_RECEIVER_OVERFLOW: + return CXL_RAS_UNC_ERR_RECEIVER_OVERFLOW; + case CXL_UNCOR_ERROR_TYPE_INTERNAL: + return CXL_RAS_UNC_ERR_INTERNAL; + case CXL_UNCOR_ERROR_TYPE_CXL_IDE_TX: + return CXL_RAS_UNC_ERR_CXL_IDE_TX; + case CXL_UNCOR_ERROR_TYPE_CXL_IDE_RX: + return CXL_RAS_UNC_ERR_CXL_IDE_RX; + default: + return -EINVAL; + } +} + 
+static int ct3d_qmp_cor_err_to_cxl(CxlCorErrorType qmp_err) +{ + switch (qmp_err) { + case CXL_COR_ERROR_TYPE_CACHE_DATA_ECC: + return CXL_RAS_COR_ERR_CACHE_DATA_ECC; + case CXL_COR_ERROR_TYPE_MEM_DATA_ECC: + return CXL_RAS_COR_ERR_MEM_DATA_ECC; + case CXL_COR_ERROR_TYPE_CRC_THRESHOLD: + return CXL_RAS_COR_ERR_CRC_THRESHOLD; + case CXL_COR_ERROR_TYPE_RETRY_THRESHOLD: + return CXL_RAS_COR_ERR_RETRY_THRESHOLD; + case CXL_COR_ERROR_TYPE_CACHE_POISON_RECEIVED: + return CXL_RAS_COR_ERR_CACHE_POISON_RECEIVED; + case CXL_COR_ERROR_TYPE_MEM_POISON_RECEIVED: + return CXL_RAS_COR_ERR_MEM_POISON_RECEIVED; + case CXL_COR_ERROR_TYPE_PHYSICAL: + return CXL_RAS_COR_ERR_PHYSICAL; + default: + return -EINVAL; + } +} + static void ct3d_reg_write(void *opaque, hwaddr offset, uint64_t value, unsigned size) { @@ -340,6 +402,83 @@ static void ct3d_reg_write(void *opaque, hwaddr offset, uint64_t value, should_commit = FIELD_EX32(value, CXL_HDM_DECODER0_CTRL, COMMIT); which_hdm = 0; break; + case A_CXL_RAS_UNC_ERR_STATUS: + { + uint32_t capctrl = ldl_le_p(cache_mem + R_CXL_RAS_ERR_CAP_CTRL); + uint32_t fe = FIELD_EX32(capctrl, CXL_RAS_ERR_CAP_CTRL, FIRST_ERROR_POINTER); + CXLError *cxl_err; + uint32_t unc_err; + + /* + * If single bit written that corresponds to the first error + * pointer being cleared, update the status and header log. + */ + if (!QTAILQ_EMPTY(&ct3d->error_list)) { + if ((1 << fe) ^ value) { + CXLError *cxl_next; + /* + * Software is using wrong flow for multiple header recording + * Following behavior in PCIe r6.0 and assuming multiple + * header support. Implementation defined choice to clear all + * matching records if more than one bit set - which corresponds + * closest to behavior of hardware not capable of multiple + * header recording. 
+ */ + QTAILQ_FOREACH_SAFE(cxl_err, &ct3d->error_list, node, cxl_next) { + if ((1 << cxl_err->type) & value) { + QTAILQ_REMOVE(&ct3d->error_list, cxl_err, node); + g_free(cxl_err); + } + } + } else { + /* Done with previous FE, so drop from list */ + cxl_err = QTAILQ_FIRST(&ct3d->error_list); + QTAILQ_REMOVE(&ct3d->error_list, cxl_err, node); + g_free(cxl_err); + } + + /* + * If there is another FE, then put that in place and update + * the header log + */ + if (!QTAILQ_EMPTY(&ct3d->error_list)) { + uint32_t *header_log = &cache_mem[R_CXL_RAS_ERR_HEADER0]; + int i; + + cxl_err = QTAILQ_FIRST(&ct3d->error_list); + for (i = 0; i < CXL_RAS_ERR_HEADER_NUM; i++) { + stl_le_p(header_log + i, cxl_err->header[i]); + } + capctrl = FIELD_DP32(capctrl, CXL_RAS_ERR_CAP_CTRL, + FIRST_ERROR_POINTER, cxl_err->type); + } else { + /* + * If no more errors, then follow recomendation of PCI spec + * r6.0 6.2.4.2 to set the first error pointer to a status + * bit that will never be used. + */ + capctrl = FIELD_DP32(capctrl, CXL_RAS_ERR_CAP_CTRL, + FIRST_ERROR_POINTER, + CXL_RAS_UNC_ERR_CXL_UNUSED); + } + stl_le_p((uint8_t *)cache_mem + A_CXL_RAS_ERR_CAP_CTRL, capctrl); + } + unc_err = 0; + QTAILQ_FOREACH(cxl_err, &ct3d->error_list, node) { + unc_err |= 1 << cxl_err->type; + } + stl_le_p((uint8_t *)cache_mem + offset, unc_err); + + return; + } + case A_CXL_RAS_COR_ERR_STATUS: + { + uint32_t rw1c = value; + uint32_t temp = ldl_le_p((uint8_t *)cache_mem + offset); + temp &= ~rw1c; + stl_le_p((uint8_t *)cache_mem + offset, temp); + return; + } default: break; } @@ -403,6 +542,8 @@ static void ct3_realize(PCIDevice *pci_dev, Error **errp) unsigned short msix_num = 1; int i, rc; + QTAILQ_INIT(&ct3d->error_list); + if (!cxl_setup_memory(ct3d, errp)) { return; } @@ -452,8 +593,19 @@ static void ct3_realize(PCIDevice *pci_dev, Error **errp) cxl_cstate->cdat.free_cdat_table = ct3_free_cdat_table; cxl_cstate->cdat.private = ct3d; cxl_doe_cdat_init(cxl_cstate, errp); + + 
pcie_cap_deverr_init(pci_dev); + /* Leave a bit of room for expansion */ + rc = pcie_aer_init(pci_dev, PCI_ERR_VER, 0x200, PCI_ERR_SIZEOF, NULL); + if (rc) { + goto err_release_cdat; + } + return; +err_release_cdat: + cxl_doe_cdat_release(cxl_cstate); + g_free(regs->special_ops); err_address_space_free: address_space_destroy(&ct3d->hostmem_as); return; @@ -465,6 +617,7 @@ static void ct3_exit(PCIDevice *pci_dev) CXLComponentState *cxl_cstate = &ct3d->cxl_cstate; ComponentRegisters *regs = &cxl_cstate->crb; + pcie_aer_exit(pci_dev); cxl_doe_cdat_release(cxl_cstate); g_free(regs->special_ops); address_space_destroy(&ct3d->hostmem_as); @@ -618,6 +771,147 @@ static void set_lsa(CXLType3Dev *ct3d, const void *buf, uint64_t size, */ } +/* For uncorrectable errors include support for multiple header recording */ +void qmp_cxl_inject_uncorrectable_errors(const char *path, + CXLUncorErrorRecordList *errors, + Error **errp) +{ + Object *obj = object_resolve_path(path, NULL); + static PCIEAERErr err = {}; + CXLType3Dev *ct3d; + CXLError *cxl_err; + uint32_t *reg_state; + uint32_t unc_err; + bool first; + + if (!obj) { + error_setg(errp, "Unable to resolve path"); + return; + } + + if (!object_dynamic_cast(obj, TYPE_CXL_TYPE3)) { + error_setg(errp, "Path does not point to a CXL type 3 device"); + return; + } + + err.status = PCI_ERR_UNC_INTN; + err.source_id = pci_requester_id(PCI_DEVICE(obj)); + err.flags = 0; + + ct3d = CXL_TYPE3(obj); + + first = QTAILQ_EMPTY(&ct3d->error_list); + reg_state = ct3d->cxl_cstate.crb.cache_mem_registers; + while (errors) { + uint32List *header = errors->value->header; + uint8_t header_count = 0; + int cxl_err_code; + + cxl_err_code = ct3d_qmp_uncor_err_to_cxl(errors->value->type); + if (cxl_err_code < 0) { + error_setg(errp, "Unknown error code"); + return; + } + + /* If the error is masked, nothing to do here */ + if (!((1 << cxl_err_code) & + ~ldl_le_p(reg_state + R_CXL_RAS_UNC_ERR_MASK))) { + errors = errors->next; + continue; + } + + 
cxl_err = g_malloc0(sizeof(*cxl_err)); + if (!cxl_err) { + return; + } + + cxl_err->type = cxl_err_code; + while (header && header_count < 32) { + cxl_err->header[header_count++] = header->value; + header = header->next; + } + if (header_count > 32) { + error_setg(errp, "Header must be 32 DWORD or less"); + return; + } + QTAILQ_INSERT_TAIL(&ct3d->error_list, cxl_err, node); + + errors = errors->next; + } + + if (first && !QTAILQ_EMPTY(&ct3d->error_list)) { + uint32_t *cache_mem = ct3d->cxl_cstate.crb.cache_mem_registers; + uint32_t capctrl = ldl_le_p(cache_mem + R_CXL_RAS_ERR_CAP_CTRL); + uint32_t *header_log = &cache_mem[R_CXL_RAS_ERR_HEADER0]; + int i; + + cxl_err = QTAILQ_FIRST(&ct3d->error_list); + for (i = 0; i < CXL_RAS_ERR_HEADER_NUM; i++) { + stl_le_p(header_log + i, cxl_err->header[i]); + } + + capctrl = FIELD_DP32(capctrl, CXL_RAS_ERR_CAP_CTRL, + FIRST_ERROR_POINTER, cxl_err->type); + stl_le_p(cache_mem + R_CXL_RAS_ERR_CAP_CTRL, capctrl); + } + + unc_err = 0; + QTAILQ_FOREACH(cxl_err, &ct3d->error_list, node) { + unc_err |= (1 << cxl_err->type); + } + if (!unc_err) { + return; + } + + stl_le_p(reg_state + R_CXL_RAS_UNC_ERR_STATUS, unc_err); + pcie_aer_inject_error(PCI_DEVICE(obj), &err); + + return; +} + +void qmp_cxl_inject_correctable_error(const char *path, CxlCorErrorType type, + Error **errp) +{ + static PCIEAERErr err = {}; + Object *obj = object_resolve_path(path, NULL); + CXLType3Dev *ct3d; + uint32_t *reg_state; + uint32_t cor_err; + int cxl_err_type; + + if (!obj) { + error_setg(errp, "Unable to resolve path"); + return; + } + if (!object_dynamic_cast(obj, TYPE_CXL_TYPE3)) { + error_setg(errp, "Path does not point to a CXL type 3 device"); + return; + } + + err.status = PCI_ERR_COR_INTERNAL; + err.source_id = pci_requester_id(PCI_DEVICE(obj)); + err.flags = PCIE_AER_ERR_IS_CORRECTABLE; + + ct3d = CXL_TYPE3(obj); + reg_state = ct3d->cxl_cstate.crb.cache_mem_registers; + cor_err = ldl_le_p(reg_state + R_CXL_RAS_COR_ERR_STATUS); + + cxl_err_type = 
ct3d_qmp_cor_err_to_cxl(type); + if (cxl_err_type < 0) { + error_setg(errp, "Invalid COR error"); + return; + } + /* If the error is masked, nothting to do here */ + if (!((1 << cxl_err_type) & ~ldl_le_p(reg_state + R_CXL_RAS_COR_ERR_MASK))) { + return; + } + + cor_err |= (1 << cxl_err_type); + stl_le_p(reg_state + R_CXL_RAS_COR_ERR_STATUS, cor_err); + + pcie_aer_inject_error(PCI_DEVICE(obj), &err); +} + static void ct3_class_init(ObjectClass *oc, void *data) { DeviceClass *dc = DEVICE_CLASS(oc); diff --git a/hw/mem/cxl_type3_stubs.c b/hw/mem/cxl_type3_stubs.c new file mode 100644 index 0000000000..d574c58f9a --- /dev/null +++ b/hw/mem/cxl_type3_stubs.c @@ -0,0 +1,17 @@ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qapi/qapi-commands-cxl.h" + +void qmp_cxl_inject_uncorrectable_errors(const char *path, + CXLUncorErrorRecordList *errors, + Error **errp) +{ + error_setg(errp, "CXL Type 3 support is not compiled in"); +} + +void qmp_cxl_inject_correctable_error(const char *path, CxlCorErrorType type, + Error **errp) +{ + error_setg(errp, "CXL Type 3 support is not compiled in"); +} diff --git a/hw/mem/meson.build b/hw/mem/meson.build index 609b2b36fc..56c2618b84 100644 --- a/hw/mem/meson.build +++ b/hw/mem/meson.build @@ -4,6 +4,8 @@ mem_ss.add(when: 'CONFIG_DIMM', if_true: files('pc-dimm.c')) mem_ss.add(when: 'CONFIG_NPCM7XX', if_true: files('npcm7xx_mc.c')) mem_ss.add(when: 'CONFIG_NVDIMM', if_true: files('nvdimm.c')) mem_ss.add(when: 'CONFIG_CXL_MEM_DEVICE', if_true: files('cxl_type3.c')) +softmmu_ss.add(when: 'CONFIG_CXL_MEM_DEVICE', if_false: files('cxl_type3_stubs.c')) +softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('cxl_type3_stubs.c')) softmmu_ss.add_all(when: 'CONFIG_MEM_DEVICE', if_true: mem_ss) diff --git a/hw/mips/boston.c b/hw/mips/boston.c index a9d87f3437..21ad844519 100644 --- a/hw/mips/boston.c +++ b/hw/mips/boston.c @@ -702,7 +702,7 @@ static void boston_mach_init(MachineState *machine) object_initialize_child(OBJECT(machine), 
"cps", &s->cps, TYPE_MIPS_CPS); object_property_set_str(OBJECT(&s->cps), "cpu-type", machine->cpu_type, &error_fatal); - object_property_set_int(OBJECT(&s->cps), "num-vp", machine->smp.cpus, + object_property_set_uint(OBJECT(&s->cps), "num-vp", machine->smp.cpus, &error_fatal); qdev_connect_clock_in(DEVICE(&s->cps), "clk-in", qdev_get_clock_out(dev, "cpu-refclk")); diff --git a/hw/mips/cps.c b/hw/mips/cps.c index 2b436700ce..2b5269ebf1 100644 --- a/hw/mips/cps.c +++ b/hw/mips/cps.c @@ -66,20 +66,17 @@ static bool cpu_mips_itu_supported(CPUMIPSState *env) static void mips_cps_realize(DeviceState *dev, Error **errp) { MIPSCPSState *s = MIPS_CPS(dev); - CPUMIPSState *env; - MIPSCPU *cpu; - int i; target_ulong gcr_base; bool itu_present = false; - bool saar_present = false; if (!clock_get(s->clock)) { error_setg(errp, "CPS input clock is not connected to an output clock"); return; } - for (i = 0; i < s->num_vp; i++) { - cpu = MIPS_CPU(object_new(s->cpu_type)); + for (int i = 0; i < s->num_vp; i++) { + MIPSCPU *cpu = MIPS_CPU(object_new(s->cpu_type)); + CPUMIPSState *env = &cpu->env; /* All VPs are halted on reset. Leave powering up to CPC. 
*/ if (!object_property_set_bool(OBJECT(cpu), "start-powered-off", true, @@ -97,7 +94,6 @@ static void mips_cps_realize(DeviceState *dev, Error **errp) cpu_mips_irq_init_cpu(cpu); cpu_mips_clock_init(cpu); - env = &cpu->env; if (cpu_mips_itu_supported(env)) { itu_present = true; /* Attach ITC Tag to the VP */ @@ -107,22 +103,15 @@ static void mips_cps_realize(DeviceState *dev, Error **errp) qemu_register_reset(main_cpu_reset, cpu); } - cpu = MIPS_CPU(first_cpu); - env = &cpu->env; - saar_present = (bool)env->saarp; - /* Inter-Thread Communication Unit */ if (itu_present) { object_initialize_child(OBJECT(dev), "itu", &s->itu, TYPE_MIPS_ITU); - object_property_set_int(OBJECT(&s->itu), "num-fifo", 16, + object_property_set_link(OBJECT(&s->itu), "cpu[0]", + OBJECT(first_cpu), &error_abort); + object_property_set_uint(OBJECT(&s->itu), "num-fifo", 16, &error_abort); - object_property_set_int(OBJECT(&s->itu), "num-semaphores", 16, + object_property_set_uint(OBJECT(&s->itu), "num-semaphores", 16, &error_abort); - object_property_set_bool(OBJECT(&s->itu), "saar-present", saar_present, - &error_abort); - if (saar_present) { - s->itu.saar = &env->CP0_SAAR; - } if (!sysbus_realize(SYS_BUS_DEVICE(&s->itu), errp)) { return; } @@ -133,7 +122,7 @@ static void mips_cps_realize(DeviceState *dev, Error **errp) /* Cluster Power Controller */ object_initialize_child(OBJECT(dev), "cpc", &s->cpc, TYPE_MIPS_CPC); - object_property_set_int(OBJECT(&s->cpc), "num-vp", s->num_vp, + object_property_set_uint(OBJECT(&s->cpc), "num-vp", s->num_vp, &error_abort); object_property_set_int(OBJECT(&s->cpc), "vp-start-running", 1, &error_abort); @@ -146,9 +135,9 @@ static void mips_cps_realize(DeviceState *dev, Error **errp) /* Global Interrupt Controller */ object_initialize_child(OBJECT(dev), "gic", &s->gic, TYPE_MIPS_GIC); - object_property_set_int(OBJECT(&s->gic), "num-vp", s->num_vp, + object_property_set_uint(OBJECT(&s->gic), "num-vp", s->num_vp, &error_abort); - 
object_property_set_int(OBJECT(&s->gic), "num-irq", 128, + object_property_set_uint(OBJECT(&s->gic), "num-irq", 128, &error_abort); if (!sysbus_realize(SYS_BUS_DEVICE(&s->gic), errp)) { return; @@ -158,10 +147,10 @@ static void mips_cps_realize(DeviceState *dev, Error **errp) sysbus_mmio_get_region(SYS_BUS_DEVICE(&s->gic), 0)); /* Global Configuration Registers */ - gcr_base = env->CP0_CMGCRBase << 4; + gcr_base = MIPS_CPU(first_cpu)->env.CP0_CMGCRBase << 4; object_initialize_child(OBJECT(dev), "gcr", &s->gcr, TYPE_MIPS_GCR); - object_property_set_int(OBJECT(&s->gcr), "num-vp", s->num_vp, + object_property_set_uint(OBJECT(&s->gcr), "num-vp", s->num_vp, &error_abort); object_property_set_int(OBJECT(&s->gcr), "gcr-rev", 0x800, &error_abort); diff --git a/hw/mips/malta.c b/hw/mips/malta.c index ec172b111a..af9021316d 100644 --- a/hw/mips/malta.c +++ b/hw/mips/malta.c @@ -1066,7 +1066,7 @@ static void create_cps(MachineState *ms, MaltaState *s, object_initialize_child(OBJECT(s), "cps", &s->cps, TYPE_MIPS_CPS); object_property_set_str(OBJECT(&s->cps), "cpu-type", ms->cpu_type, &error_fatal); - object_property_set_int(OBJECT(&s->cps), "num-vp", ms->smp.cpus, + object_property_set_uint(OBJECT(&s->cps), "num-vp", ms->smp.cpus, &error_fatal); qdev_connect_clock_in(DEVICE(&s->cps), "clk-in", s->cpuclk); sysbus_realize(SYS_BUS_DEVICE(&s->cps), &error_fatal); diff --git a/hw/misc/edu.c b/hw/misc/edu.c index e935c418d4..a1f8bc77e7 100644 --- a/hw/misc/edu.c +++ b/hw/misc/edu.c @@ -267,6 +267,8 @@ static void edu_mmio_write(void *opaque, hwaddr addr, uint64_t val, case 0x20: if (val & EDU_STATUS_IRQFACT) { qatomic_or(&edu->status, EDU_STATUS_IRQFACT); + /* Order check of the COMPUTING flag after setting IRQFACT. 
*/ + smp_mb__after_rmw(); } else { qatomic_and(&edu->status, ~EDU_STATUS_IRQFACT); } @@ -349,6 +351,9 @@ static void *edu_fact_thread(void *opaque) qemu_mutex_unlock(&edu->thr_mutex); qatomic_and(&edu->status, ~EDU_STATUS_COMPUTING); + /* Clear COMPUTING flag before checking IRQFACT. */ + smp_mb__after_rmw(); + if (qatomic_read(&edu->status) & EDU_STATUS_IRQFACT) { qemu_mutex_lock_iothread(); edu_raise_irq(edu, FACT_IRQ); diff --git a/hw/misc/mips_cmgcr.c b/hw/misc/mips_cmgcr.c index 3c8b37f700..66eb11662c 100644 --- a/hw/misc/mips_cmgcr.c +++ b/hw/misc/mips_cmgcr.c @@ -212,7 +212,7 @@ static const VMStateDescription vmstate_mips_gcr = { }; static Property mips_gcr_properties[] = { - DEFINE_PROP_INT32("num-vp", MIPSGCRState, num_vps, 1), + DEFINE_PROP_UINT32("num-vp", MIPSGCRState, num_vps, 1), DEFINE_PROP_INT32("gcr-rev", MIPSGCRState, gcr_rev, 0x800), DEFINE_PROP_UINT64("gcr-base", MIPSGCRState, gcr_base, GCR_BASE_ADDR), DEFINE_PROP_LINK("gic", MIPSGCRState, gic_mr, TYPE_MEMORY_REGION, diff --git a/hw/misc/mips_itu.c b/hw/misc/mips_itu.c index badef5c214..0eda302db4 100644 --- a/hw/misc/mips_itu.c +++ b/hw/misc/mips_itu.c @@ -93,10 +93,10 @@ void itc_reconfigure(MIPSITUState *tag) uint64_t size = (1 * KiB) + (am[1] & ITC_AM1_ADDR_MASK_MASK); bool is_enabled = (am[0] & ITC_AM0_EN_MASK) != 0; - if (tag->saar_present) { - address = ((*(uint64_t *) tag->saar) & 0xFFFFFFFFE000ULL) << 4; - size = 1ULL << ((*(uint64_t *) tag->saar >> 1) & 0x1f); - is_enabled = *(uint64_t *) tag->saar & 1; + if (tag->saar) { + address = (tag->saar[0] & 0xFFFFFFFFE000ULL) << 4; + size = 1ULL << ((tag->saar[0] >> 1) & 0x1f); + is_enabled = tag->saar[0] & 1; } memory_region_transaction_begin(); @@ -157,7 +157,7 @@ static inline ITCView get_itc_view(hwaddr addr) static inline int get_cell_stride_shift(const MIPSITUState *s) { /* Minimum interval (for EntryGain = 0) is 128 B */ - if (s->saar_present) { + if (s->saar) { return 7 + ((s->icr0 >> ITC_ICR0_BLK_GRAIN) & ITC_ICR0_BLK_GRAIN_MASK); } 
else { @@ -515,6 +515,7 @@ static void mips_itu_init(Object *obj) static void mips_itu_realize(DeviceState *dev, Error **errp) { MIPSITUState *s = MIPS_ITU(dev); + CPUMIPSState *env; if (s->num_fifo > ITC_FIFO_NUM_MAX) { error_setg(errp, "Exceed maximum number of FIFO cells: %d", @@ -526,6 +527,15 @@ static void mips_itu_realize(DeviceState *dev, Error **errp) s->num_semaphores); return; } + if (!s->cpu0) { + error_setg(errp, "Missing 'cpu[0]' property"); + return; + } + + env = &s->cpu0->env; + if (env->saarp) { + s->saar = env->CP0_SAAR; + } s->cell = g_new(ITCStorageCell, get_num_cells(s)); } @@ -534,8 +544,8 @@ static void mips_itu_reset(DeviceState *dev) { MIPSITUState *s = MIPS_ITU(dev); - if (s->saar_present) { - *(uint64_t *) s->saar = 0x11 << 1; + if (s->saar) { + s->saar[0] = 0x11 << 1; s->icr0 = get_num_cells(s) << ITC_ICR0_CELL_NUM; } else { s->ITCAddressMap[0] = 0; @@ -549,11 +559,11 @@ static void mips_itu_reset(DeviceState *dev) } static Property mips_itu_properties[] = { - DEFINE_PROP_INT32("num-fifo", MIPSITUState, num_fifo, + DEFINE_PROP_UINT32("num-fifo", MIPSITUState, num_fifo, ITC_FIFO_NUM_MAX), - DEFINE_PROP_INT32("num-semaphores", MIPSITUState, num_semaphores, + DEFINE_PROP_UINT32("num-semaphores", MIPSITUState, num_semaphores, ITC_SEMAPH_NUM_MAX), - DEFINE_PROP_BOOL("saar-present", MIPSITUState, saar_present, false), + DEFINE_PROP_LINK("cpu[0]", MIPSITUState, cpu0, TYPE_MIPS_CPU, MIPSCPU *), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/net/Kconfig b/hw/net/Kconfig index 1cc1c5775e..18c7851efe 100644 --- a/hw/net/Kconfig +++ b/hw/net/Kconfig @@ -44,6 +44,11 @@ config E1000E_PCI_EXPRESS default y if PCI_DEVICES depends on PCI_EXPRESS && MSI_NONBROKEN +config IGB_PCI_EXPRESS + bool + default y if PCI_DEVICES + depends on PCI_EXPRESS && MSI_NONBROKEN + config RTL8139_PCI bool default y if PCI_DEVICES diff --git a/hw/net/e1000.c b/hw/net/e1000.c index 7efb8a4c52..23d660619f 100644 --- a/hw/net/e1000.c +++ b/hw/net/e1000.c @@ -26,6 +26,7 @@ 
#include "qemu/osdep.h" +#include "hw/net/mii.h" #include "hw/pci/pci_device.h" #include "hw/qdev-properties.h" #include "migration/vmstate.h" @@ -38,12 +39,11 @@ #include "qemu/module.h" #include "qemu/range.h" +#include "e1000_common.h" #include "e1000x_common.h" #include "trace.h" #include "qom/object.h" -static const uint8_t bcast[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; - /* #define E1000_DEBUG */ #ifdef E1000_DEBUG @@ -66,9 +66,8 @@ static int debugflags = DBGBIT(TXERR) | DBGBIT(GENERAL); #define IOPORT_SIZE 0x40 #define PNPMMIO_SIZE 0x20000 -#define MIN_BUF_SIZE 60 /* Min. octets in an ethernet frame sans FCS */ -#define MAXIMUM_ETHERNET_HDR_LEN (14+4) +#define MAXIMUM_ETHERNET_HDR_LEN (ETH_HLEN + 4) /* * HW models: @@ -181,67 +180,73 @@ e1000_autoneg_done(E1000State *s) static bool have_autoneg(E1000State *s) { - return chkflag(AUTONEG) && (s->phy_reg[PHY_CTRL] & MII_CR_AUTO_NEG_EN); + return chkflag(AUTONEG) && (s->phy_reg[MII_BMCR] & MII_BMCR_AUTOEN); } static void set_phy_ctrl(E1000State *s, int index, uint16_t val) { - /* bits 0-5 reserved; MII_CR_[RESTART_AUTO_NEG,RESET] are self clearing */ - s->phy_reg[PHY_CTRL] = val & ~(0x3f | - MII_CR_RESET | - MII_CR_RESTART_AUTO_NEG); + /* bits 0-5 reserved; MII_BMCR_[ANRESTART,RESET] are self clearing */ + s->phy_reg[MII_BMCR] = val & ~(0x3f | + MII_BMCR_RESET | + MII_BMCR_ANRESTART); /* * QEMU 1.3 does not support link auto-negotiation emulation, so if we * migrate during auto negotiation, after migration the link will be * down. 
*/ - if (have_autoneg(s) && (val & MII_CR_RESTART_AUTO_NEG)) { + if (have_autoneg(s) && (val & MII_BMCR_ANRESTART)) { e1000x_restart_autoneg(s->mac_reg, s->phy_reg, s->autoneg_timer); } } static void (*phyreg_writeops[])(E1000State *, int, uint16_t) = { - [PHY_CTRL] = set_phy_ctrl, + [MII_BMCR] = set_phy_ctrl, }; enum { NPHYWRITEOPS = ARRAY_SIZE(phyreg_writeops) }; enum { PHY_R = 1, PHY_W = 2, PHY_RW = PHY_R | PHY_W }; static const char phy_regcap[0x20] = { - [PHY_STATUS] = PHY_R, [M88E1000_EXT_PHY_SPEC_CTRL] = PHY_RW, - [PHY_ID1] = PHY_R, [M88E1000_PHY_SPEC_CTRL] = PHY_RW, - [PHY_CTRL] = PHY_RW, [PHY_1000T_CTRL] = PHY_RW, - [PHY_LP_ABILITY] = PHY_R, [PHY_1000T_STATUS] = PHY_R, - [PHY_AUTONEG_ADV] = PHY_RW, [M88E1000_RX_ERR_CNTR] = PHY_R, - [PHY_ID2] = PHY_R, [M88E1000_PHY_SPEC_STATUS] = PHY_R, - [PHY_AUTONEG_EXP] = PHY_R, + [MII_BMSR] = PHY_R, [M88E1000_EXT_PHY_SPEC_CTRL] = PHY_RW, + [MII_PHYID1] = PHY_R, [M88E1000_PHY_SPEC_CTRL] = PHY_RW, + [MII_BMCR] = PHY_RW, [MII_CTRL1000] = PHY_RW, + [MII_ANLPAR] = PHY_R, [MII_STAT1000] = PHY_R, + [MII_ANAR] = PHY_RW, [M88E1000_RX_ERR_CNTR] = PHY_R, + [MII_PHYID2] = PHY_R, [M88E1000_PHY_SPEC_STATUS] = PHY_R, + [MII_ANER] = PHY_R, }; -/* PHY_ID2 documented in 8254x_GBe_SDM.pdf, pp. 250 */ +/* MII_PHYID2 documented in 8254x_GBe_SDM.pdf, pp. 
250 */ static const uint16_t phy_reg_init[] = { - [PHY_CTRL] = MII_CR_SPEED_SELECT_MSB | - MII_CR_FULL_DUPLEX | - MII_CR_AUTO_NEG_EN, - - [PHY_STATUS] = MII_SR_EXTENDED_CAPS | - MII_SR_LINK_STATUS | /* link initially up */ - MII_SR_AUTONEG_CAPS | - /* MII_SR_AUTONEG_COMPLETE: initially NOT completed */ - MII_SR_PREAMBLE_SUPPRESS | - MII_SR_EXTENDED_STATUS | - MII_SR_10T_HD_CAPS | - MII_SR_10T_FD_CAPS | - MII_SR_100X_HD_CAPS | - MII_SR_100X_FD_CAPS, - - [PHY_ID1] = 0x141, - /* [PHY_ID2] configured per DevId, from e1000_reset() */ - [PHY_AUTONEG_ADV] = 0xde1, - [PHY_LP_ABILITY] = 0x1e0, - [PHY_1000T_CTRL] = 0x0e00, - [PHY_1000T_STATUS] = 0x3c00, + [MII_BMCR] = MII_BMCR_SPEED1000 | + MII_BMCR_FD | + MII_BMCR_AUTOEN, + + [MII_BMSR] = MII_BMSR_EXTCAP | + MII_BMSR_LINK_ST | /* link initially up */ + MII_BMSR_AUTONEG | + /* MII_BMSR_AN_COMP: initially NOT completed */ + MII_BMSR_MFPS | + MII_BMSR_EXTSTAT | + MII_BMSR_10T_HD | + MII_BMSR_10T_FD | + MII_BMSR_100TX_HD | + MII_BMSR_100TX_FD, + + [MII_PHYID1] = 0x141, + /* [MII_PHYID2] configured per DevId, from e1000_reset() */ + [MII_ANAR] = MII_ANAR_CSMACD | MII_ANAR_10 | + MII_ANAR_10FD | MII_ANAR_TX | + MII_ANAR_TXFD | MII_ANAR_PAUSE | + MII_ANAR_PAUSE_ASYM, + [MII_ANLPAR] = MII_ANLPAR_10 | MII_ANLPAR_10FD | + MII_ANLPAR_TX | MII_ANLPAR_TXFD, + [MII_CTRL1000] = MII_CTRL1000_FULL | MII_CTRL1000_PORT | + MII_CTRL1000_MASTER, + [MII_STAT1000] = MII_STAT1000_HALF | MII_STAT1000_FULL | + MII_STAT1000_ROK | MII_STAT1000_LOK, [M88E1000_PHY_SPEC_CTRL] = 0x360, [M88E1000_PHY_SPEC_STATUS] = 0xac00, [M88E1000_EXT_PHY_SPEC_CTRL] = 0x0d60, @@ -373,9 +378,9 @@ static bool e1000_vet_init_need(void *opaque) return chkflag(VET); } -static void e1000_reset(void *opaque) +static void e1000_reset_hold(Object *obj) { - E1000State *d = opaque; + E1000State *d = E1000(obj); E1000BaseClass *edc = E1000_GET_CLASS(d); uint8_t *macaddr = d->conf.macaddr.a; @@ -386,10 +391,10 @@ static void e1000_reset(void *opaque) d->mit_irq_level = 0; d->mit_ide 
= 0; memset(d->phy_reg, 0, sizeof d->phy_reg); - memmove(d->phy_reg, phy_reg_init, sizeof phy_reg_init); - d->phy_reg[PHY_ID2] = edc->phy_id2; + memcpy(d->phy_reg, phy_reg_init, sizeof phy_reg_init); + d->phy_reg[MII_PHYID2] = edc->phy_id2; memset(d->mac_reg, 0, sizeof d->mac_reg); - memmove(d->mac_reg, mac_reg_init, sizeof mac_reg_init); + memcpy(d->mac_reg, mac_reg_init, sizeof mac_reg_init); d->rxbuf_min_shift = 1; memset(&d->tx, 0, sizeof d->tx); @@ -547,9 +552,9 @@ putsum(uint8_t *data, uint32_t n, uint32_t sloc, uint32_t css, uint32_t cse) static inline void inc_tx_bcast_or_mcast_count(E1000State *s, const unsigned char *arr) { - if (!memcmp(arr, bcast, sizeof bcast)) { + if (is_broadcast_ether_addr(arr)) { e1000x_inc_reg_if_not_full(s->mac_reg, BPTC); - } else if (arr[0] & 1) { + } else if (is_multicast_ether_addr(arr)) { e1000x_inc_reg_if_not_full(s->mac_reg, MPTC); } } @@ -561,13 +566,13 @@ e1000_send_packet(E1000State *s, const uint8_t *buf, int size) PTC1023, PTC1522 }; NetClientState *nc = qemu_get_queue(s->nic); - if (s->phy_reg[PHY_CTRL] & MII_CR_LOOPBACK) { + if (s->phy_reg[MII_BMCR] & MII_BMCR_LOOPBACK) { qemu_receive_packet(nc, buf, size); } else { qemu_send_packet(nc, buf, size); } inc_tx_bcast_or_mcast_count(s, buf); - e1000x_increase_size_stats(s->mac_reg, PTCregs, size); + e1000x_increase_size_stats(s->mac_reg, PTCregs, size + 4); } static void @@ -631,7 +636,7 @@ xmit_seg(E1000State *s) } e1000x_inc_reg_if_not_full(s->mac_reg, TPT); - e1000x_grow_8reg_if_not_full(s->mac_reg, TOTL, s->tx.size); + e1000x_grow_8reg_if_not_full(s->mac_reg, TOTL, s->tx.size + 4); s->mac_reg[GPTC] = s->mac_reg[TPT]; s->mac_reg[GOTCL] = s->mac_reg[TOTL]; s->mac_reg[GOTCH] = s->mac_reg[TOTH]; @@ -803,15 +808,18 @@ static int receive_filter(E1000State *s, const uint8_t *buf, int size) { uint32_t rctl = s->mac_reg[RCTL]; - int isbcast = !memcmp(buf, bcast, sizeof bcast), ismcast = (buf[0] & 1); + int isbcast = is_broadcast_ether_addr(buf); + int ismcast = 
is_multicast_ether_addr(buf); if (e1000x_is_vlan_packet(buf, le16_to_cpu(s->mac_reg[VET])) && e1000x_vlan_rx_filter_enabled(s->mac_reg)) { - uint16_t vid = lduw_be_p(buf + 14); - uint32_t vfta = ldl_le_p((uint32_t*)(s->mac_reg + VFTA) + - ((vid >> 5) & 0x7f)); - if ((vfta & (1 << (vid & 0x1f))) == 0) + uint16_t vid = lduw_be_p(&PKT_GET_VLAN_HDR(buf)->h_tci); + uint32_t vfta = + ldl_le_p((uint32_t *)(s->mac_reg + VFTA) + + ((vid >> E1000_VFTA_ENTRY_SHIFT) & E1000_VFTA_ENTRY_MASK)); + if ((vfta & (1 << (vid & E1000_VFTA_ENTRY_BIT_SHIFT_MASK))) == 0) { return 0; + } } if (!isbcast && !ismcast && (rctl & E1000_RCTL_UPE)) { /* promiscuous ucast */ @@ -841,7 +849,7 @@ e1000_set_link_status(NetClientState *nc) e1000x_update_regs_on_link_down(s->mac_reg, s->phy_reg); } else { if (have_autoneg(s) && - !(s->phy_reg[PHY_STATUS] & MII_SR_AUTONEG_COMPLETE)) { + !(s->phy_reg[MII_BMSR] & MII_BMSR_AN_COMP)) { e1000x_restart_autoneg(s->mac_reg, s->phy_reg, s->autoneg_timer); } else { e1000_link_up(s); @@ -907,7 +915,7 @@ e1000_receive_iov(NetClientState *nc, const struct iovec *iov, int iovcnt) uint32_t rdh_start; uint16_t vlan_special = 0; uint8_t vlan_status = 0; - uint8_t min_buf[MIN_BUF_SIZE]; + uint8_t min_buf[ETH_ZLEN]; struct iovec min_iov; uint8_t *filter_buf = iov->iov_base; size_t size = iov_size(iov, iovcnt); @@ -1061,30 +1069,6 @@ mac_readreg(E1000State *s, int index) } static uint32_t -mac_low4_read(E1000State *s, int index) -{ - return s->mac_reg[index] & 0xf; -} - -static uint32_t -mac_low11_read(E1000State *s, int index) -{ - return s->mac_reg[index] & 0x7ff; -} - -static uint32_t -mac_low13_read(E1000State *s, int index) -{ - return s->mac_reg[index] & 0x1fff; -} - -static uint32_t -mac_low16_read(E1000State *s, int index) -{ - return s->mac_reg[index] & 0xffff; -} - -static uint32_t mac_icr_read(E1000State *s, int index) { uint32_t ret = s->mac_reg[ICR]; @@ -1136,11 +1120,17 @@ set_rdt(E1000State *s, int index, uint32_t val) } } -static void -set_16bit(E1000State 
*s, int index, uint32_t val) -{ - s->mac_reg[index] = val & 0xffff; -} +#define LOW_BITS_SET_FUNC(num) \ + static void \ + set_##num##bit(E1000State *s, int index, uint32_t val) \ + { \ + s->mac_reg[index] = val & (BIT(num) - 1); \ + } + +LOW_BITS_SET_FUNC(4) +LOW_BITS_SET_FUNC(11) +LOW_BITS_SET_FUNC(13) +LOW_BITS_SET_FUNC(16) static void set_dlen(E1000State *s, int index, uint32_t val) @@ -1194,7 +1184,9 @@ static const readops macreg_readops[] = { getreg(XONRXC), getreg(XONTXC), getreg(XOFFRXC), getreg(XOFFTXC), getreg(RFC), getreg(RJC), getreg(RNBC), getreg(TSCTFC), getreg(MGTPRC), getreg(MGTPDC), getreg(MGTPTC), getreg(GORCL), - getreg(GOTCL), + getreg(GOTCL), getreg(RDFH), getreg(RDFT), getreg(RDFHS), + getreg(RDFTS), getreg(RDFPC), getreg(TDFH), getreg(TDFT), + getreg(TDFHS), getreg(TDFTS), getreg(TDFPC), getreg(AIT), [TOTH] = mac_read_clr8, [TORH] = mac_read_clr8, [GOTCH] = mac_read_clr8, [GORCH] = mac_read_clr8, @@ -1212,24 +1204,17 @@ static const readops macreg_readops[] = { [MPTC] = mac_read_clr4, [ICR] = mac_icr_read, [EECD] = get_eecd, [EERD] = flash_eerd_read, - [RDFH] = mac_low13_read, [RDFT] = mac_low13_read, - [RDFHS] = mac_low13_read, [RDFTS] = mac_low13_read, - [RDFPC] = mac_low13_read, - [TDFH] = mac_low11_read, [TDFT] = mac_low11_read, - [TDFHS] = mac_low13_read, [TDFTS] = mac_low13_read, - [TDFPC] = mac_low13_read, - [AIT] = mac_low16_read, - - [CRCERRS ... MPC] = &mac_readreg, - [IP6AT ... IP6AT+3] = &mac_readreg, [IP4AT ... IP4AT+6] = &mac_readreg, - [FFLT ... FFLT+6] = &mac_low11_read, - [RA ... RA+31] = &mac_readreg, - [WUPM ... WUPM+31] = &mac_readreg, - [MTA ... MTA+127] = &mac_readreg, - [VFTA ... VFTA+127] = &mac_readreg, - [FFMT ... FFMT+254] = &mac_low4_read, - [FFVT ... FFVT+254] = &mac_readreg, - [PBM ... PBM+16383] = &mac_readreg, + + [CRCERRS ... MPC] = &mac_readreg, + [IP6AT ... IP6AT + 3] = &mac_readreg, [IP4AT ... IP4AT + 6] = &mac_readreg, + [FFLT ... FFLT + 6] = &mac_readreg, + [RA ... RA + 31] = &mac_readreg, + [WUPM ... 
WUPM + 31] = &mac_readreg, + [MTA ... MTA + E1000_MC_TBL_SIZE - 1] = &mac_readreg, + [VFTA ... VFTA + E1000_VLAN_FILTER_TBL_SIZE - 1] = &mac_readreg, + [FFMT ... FFMT + 254] = &mac_readreg, + [FFVT ... FFVT + 254] = &mac_readreg, + [PBM ... PBM + 16383] = &mac_readreg, }; enum { NREADOPS = ARRAY_SIZE(macreg_readops) }; @@ -1239,27 +1224,28 @@ static const writeops macreg_writeops[] = { putreg(PBA), putreg(EERD), putreg(SWSM), putreg(WUFC), putreg(TDBAL), putreg(TDBAH), putreg(TXDCTL), putreg(RDBAH), putreg(RDBAL), putreg(LEDCTL), putreg(VET), putreg(FCRUC), - putreg(TDFH), putreg(TDFT), putreg(TDFHS), putreg(TDFTS), - putreg(TDFPC), putreg(RDFH), putreg(RDFT), putreg(RDFHS), - putreg(RDFTS), putreg(RDFPC), putreg(IPAV), putreg(WUC), - putreg(WUS), putreg(AIT), - - [TDLEN] = set_dlen, [RDLEN] = set_dlen, [TCTL] = set_tctl, - [TDT] = set_tctl, [MDIC] = set_mdic, [ICS] = set_ics, - [TDH] = set_16bit, [RDH] = set_16bit, [RDT] = set_rdt, - [IMC] = set_imc, [IMS] = set_ims, [ICR] = set_icr, - [EECD] = set_eecd, [RCTL] = set_rx_control, [CTRL] = set_ctrl, - [RDTR] = set_16bit, [RADV] = set_16bit, [TADV] = set_16bit, - [ITR] = set_16bit, - - [IP6AT ... IP6AT+3] = &mac_writereg, [IP4AT ... IP4AT+6] = &mac_writereg, - [FFLT ... FFLT+6] = &mac_writereg, - [RA ... RA+31] = &mac_writereg, - [WUPM ... WUPM+31] = &mac_writereg, - [MTA ... MTA+127] = &mac_writereg, - [VFTA ... VFTA+127] = &mac_writereg, - [FFMT ... FFMT+254] = &mac_writereg, [FFVT ... FFVT+254] = &mac_writereg, - [PBM ... 
PBM+16383] = &mac_writereg, + putreg(IPAV), putreg(WUC), + putreg(WUS), + + [TDLEN] = set_dlen, [RDLEN] = set_dlen, [TCTL] = set_tctl, + [TDT] = set_tctl, [MDIC] = set_mdic, [ICS] = set_ics, + [TDH] = set_16bit, [RDH] = set_16bit, [RDT] = set_rdt, + [IMC] = set_imc, [IMS] = set_ims, [ICR] = set_icr, + [EECD] = set_eecd, [RCTL] = set_rx_control, [CTRL] = set_ctrl, + [RDTR] = set_16bit, [RADV] = set_16bit, [TADV] = set_16bit, + [ITR] = set_16bit, [TDFH] = set_11bit, [TDFT] = set_11bit, + [TDFHS] = set_13bit, [TDFTS] = set_13bit, [TDFPC] = set_13bit, + [RDFH] = set_13bit, [RDFT] = set_13bit, [RDFHS] = set_13bit, + [RDFTS] = set_13bit, [RDFPC] = set_13bit, [AIT] = set_16bit, + + [IP6AT ... IP6AT + 3] = &mac_writereg, [IP4AT ... IP4AT + 6] = &mac_writereg, + [FFLT ... FFLT + 6] = &set_11bit, + [RA ... RA + 31] = &mac_writereg, + [WUPM ... WUPM + 31] = &mac_writereg, + [MTA ... MTA + E1000_MC_TBL_SIZE - 1] = &mac_writereg, + [VFTA ... VFTA + E1000_VLAN_FILTER_TBL_SIZE - 1] = &mac_writereg, + [FFMT ... FFMT + 254] = &set_4bit, [FFVT ... FFVT + 254] = &mac_writereg, + [PBM ... PBM + 16383] = &mac_writereg, }; enum { NWRITEOPS = ARRAY_SIZE(macreg_writeops) }; @@ -1415,10 +1401,10 @@ static int e1000_pre_save(void *opaque) /* * If link is down and auto-negotiation is supported and ongoing, * complete auto-negotiation immediately. This allows us to look - * at MII_SR_AUTONEG_COMPLETE to infer link status on load. + * at MII_BMSR_AN_COMP to infer link status on load. */ if (nc->link_down && have_autoneg(s)) { - s->phy_reg[PHY_STATUS] |= MII_SR_AUTONEG_COMPLETE; + s->phy_reg[MII_BMSR] |= MII_BMSR_AN_COMP; } /* Decide which set of props to migrate in the main structure */ @@ -1457,8 +1443,7 @@ static int e1000_post_load(void *opaque, int version_id) * Alternatively, restart link negotiation if it was in progress. 
*/ nc->link_down = (s->mac_reg[STATUS] & E1000_STATUS_LU) == 0; - if (have_autoneg(s) && - !(s->phy_reg[PHY_STATUS] & MII_SR_AUTONEG_COMPLETE)) { + if (have_autoneg(s) && !(s->phy_reg[MII_BMSR] & MII_BMSR_AN_COMP)) { nc->link_down = false; timer_mod(s->autoneg_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + 500); @@ -1624,8 +1609,9 @@ static const VMStateDescription vmstate_e1000 = { VMSTATE_UINT32(mac_reg[WUFC], E1000State), VMSTATE_UINT32(mac_reg[VET], E1000State), VMSTATE_UINT32_SUB_ARRAY(mac_reg, E1000State, RA, 32), - VMSTATE_UINT32_SUB_ARRAY(mac_reg, E1000State, MTA, 128), - VMSTATE_UINT32_SUB_ARRAY(mac_reg, E1000State, VFTA, 128), + VMSTATE_UINT32_SUB_ARRAY(mac_reg, E1000State, MTA, E1000_MC_TBL_SIZE), + VMSTATE_UINT32_SUB_ARRAY(mac_reg, E1000State, VFTA, + E1000_VLAN_FILTER_TBL_SIZE), VMSTATE_END_OF_LIST() }, .subsections = (const VMStateDescription*[]) { @@ -1746,12 +1732,6 @@ static void pci_e1000_realize(PCIDevice *pci_dev, Error **errp) e1000_flush_queue_timer, d); } -static void qdev_e1000_reset(DeviceState *dev) -{ - E1000State *d = E1000(dev); - e1000_reset(d); -} - static Property e1000_properties[] = { DEFINE_NIC_PROPERTIES(E1000State, conf), DEFINE_PROP_BIT("autonegotiation", E1000State, @@ -1777,6 +1757,7 @@ typedef struct E1000Info { static void e1000_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); + ResettableClass *rc = RESETTABLE_CLASS(klass); PCIDeviceClass *k = PCI_DEVICE_CLASS(klass); E1000BaseClass *e = E1000_CLASS(klass); const E1000Info *info = data; @@ -1789,9 +1770,9 @@ static void e1000_class_init(ObjectClass *klass, void *data) k->revision = info->revision; e->phy_id2 = info->phy_id2; k->class_id = PCI_CLASS_NETWORK_ETHERNET; + rc->phases.hold = e1000_reset_hold; set_bit(DEVICE_CATEGORY_NETWORK, dc->categories); dc->desc = "Intel Gigabit Ethernet"; - dc->reset = qdev_e1000_reset; dc->vmsd = &vmstate_e1000; device_class_set_props(dc, e1000_properties); } diff --git a/hw/net/e1000_common.h 
b/hw/net/e1000_common.h new file mode 100644 index 0000000000..48feda7404 --- /dev/null +++ b/hw/net/e1000_common.h @@ -0,0 +1,102 @@ +/* + * QEMU e1000(e) emulation - shared definitions + * + * Copyright (c) 2008 Qumranet + * + * Based on work done by: + * Nir Peleg, Tutis Systems Ltd. for Qumranet Inc. + * Copyright (c) 2007 Dan Aloni + * Copyright (c) 2004 Antony T Curtis + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. 
+ */ + +#ifndef HW_NET_E1000_COMMON_H +#define HW_NET_E1000_COMMON_H + +#include "e1000_regs.h" + +#define defreg(x) x = (E1000_##x >> 2) +enum { + defreg(CTRL), defreg(EECD), defreg(EERD), defreg(GPRC), + defreg(GPTC), defreg(ICR), defreg(ICS), defreg(IMC), + defreg(IMS), defreg(LEDCTL), defreg(MANC), defreg(MDIC), + defreg(MPC), defreg(PBA), defreg(RCTL), defreg(RDBAH0), + defreg(RDBAL0), defreg(RDH0), defreg(RDLEN0), defreg(RDT0), + defreg(STATUS), defreg(SWSM), defreg(TCTL), defreg(TDBAH), + defreg(TDBAL), defreg(TDH), defreg(TDLEN), defreg(TDT), + defreg(TDLEN1), defreg(TDBAL1), defreg(TDBAH1), defreg(TDH1), + defreg(TDT1), defreg(TORH), defreg(TORL), defreg(TOTH), + defreg(TOTL), defreg(TPR), defreg(TPT), defreg(TXDCTL), + defreg(WUFC), defreg(RA), defreg(MTA), defreg(CRCERRS), + defreg(VFTA), defreg(VET), defreg(RDTR), defreg(RADV), + defreg(TADV), defreg(ITR), defreg(SCC), defreg(ECOL), + defreg(MCC), defreg(LATECOL), defreg(COLC), defreg(DC), + defreg(TNCRS), defreg(SEQEC), defreg(CEXTERR), defreg(RLEC), + defreg(XONRXC), defreg(XONTXC), defreg(XOFFRXC), defreg(XOFFTXC), + defreg(FCRUC), defreg(AIT), defreg(TDFH), defreg(TDFT), + defreg(TDFHS), defreg(TDFTS), defreg(TDFPC), defreg(WUC), + defreg(WUS), defreg(POEMB), defreg(PBS), defreg(RDFH), + defreg(RDFT), defreg(RDFHS), defreg(RDFTS), defreg(RDFPC), + defreg(PBM), defreg(IPAV), defreg(IP4AT), defreg(IP6AT), + defreg(WUPM), defreg(FFLT), defreg(FFMT), defreg(FFVT), + defreg(TARC0), defreg(TARC1), defreg(IAM), defreg(EXTCNF_CTRL), + defreg(GCR), defreg(TIMINCA), defreg(EIAC), defreg(CTRL_EXT), + defreg(IVAR), defreg(MFUTP01), defreg(MFUTP23), defreg(MANC2H), + defreg(MFVAL), defreg(MDEF), defreg(FACTPS), defreg(FTFT), + defreg(RUC), defreg(ROC), defreg(RFC), defreg(RJC), + defreg(PRC64), defreg(PRC127), defreg(PRC255), defreg(PRC511), + defreg(PRC1023), defreg(PRC1522), defreg(PTC64), defreg(PTC127), + defreg(PTC255), defreg(PTC511), defreg(PTC1023), defreg(PTC1522), + defreg(GORCL), defreg(GORCH), 
defreg(GOTCL), defreg(GOTCH), + defreg(RNBC), defreg(BPRC), defreg(MPRC), defreg(RFCTL), + defreg(PSRCTL), defreg(MPTC), defreg(BPTC), defreg(TSCTFC), + defreg(IAC), defreg(MGTPRC), defreg(MGTPDC), defreg(MGTPTC), + defreg(TSCTC), defreg(RXCSUM), defreg(FUNCTAG), defreg(GSCL_1), + defreg(GSCL_2), defreg(GSCL_3), defreg(GSCL_4), defreg(GSCN_0), + defreg(GSCN_1), defreg(GSCN_2), defreg(GSCN_3), defreg(GCR2), + defreg(RAID), defreg(RSRPD), defreg(TIDV), defreg(EITR), + defreg(MRQC), defreg(RETA), defreg(RSSRK), defreg(RDBAH1), + defreg(RDBAL1), defreg(RDLEN1), defreg(RDH1), defreg(RDT1), + defreg(PBACLR), defreg(FCAL), defreg(FCAH), defreg(FCT), + defreg(FCRTH), defreg(FCRTL), defreg(FCTTV), defreg(FCRTV), + defreg(FLA), defreg(EEWR), defreg(FLOP), defreg(FLOL), + defreg(FLSWCTL), defreg(FLSWCNT), defreg(RXDCTL), defreg(RXDCTL1), + defreg(MAVTV0), defreg(MAVTV1), defreg(MAVTV2), defreg(MAVTV3), + defreg(TXSTMPL), defreg(TXSTMPH), defreg(SYSTIML), defreg(SYSTIMH), + defreg(RXCFGL), defreg(RXUDP), defreg(TIMADJL), defreg(TIMADJH), + defreg(RXSTMPH), defreg(RXSTMPL), defreg(RXSATRL), defreg(RXSATRH), + defreg(FLASHT), defreg(TIPG), defreg(RDH), defreg(RDT), + defreg(RDLEN), defreg(RDBAH), defreg(RDBAL), + defreg(TXDCTL1), + defreg(FLSWDATA), + defreg(CTRL_DUP), + defreg(EXTCNF_SIZE), + defreg(EEMNGCTL), + defreg(EEMNGDATA), + defreg(FLMNGCTL), + defreg(FLMNGDATA), + defreg(FLMNGCNT), + defreg(TSYNCRXCTL), + defreg(TSYNCTXCTL), + + /* Aliases */ + defreg(RDH0_A), defreg(RDT0_A), defreg(RDTR_A), defreg(RDFH_A), + defreg(RDFT_A), defreg(TDH_A), defreg(TDT_A), defreg(TIDV_A), + defreg(TDFH_A), defreg(TDFT_A), defreg(RA_A), defreg(RDBAL0_A), + defreg(TDBAL_A), defreg(TDLEN_A), defreg(VFTA_A), defreg(RDLEN0_A), + defreg(FCRTL_A), defreg(FCRTH_A) +}; + +#endif diff --git a/hw/net/e1000_regs.h b/hw/net/e1000_regs.h index 59e050742b..8a4ce82034 100644 --- a/hw/net/e1000_regs.h +++ b/hw/net/e1000_regs.h @@ -32,157 +32,35 @@ #ifndef HW_E1000_REGS_H #define HW_E1000_REGS_H -/* PCI 
Device IDs */ -#define E1000_DEV_ID_82542 0x1000 -#define E1000_DEV_ID_82543GC_FIBER 0x1001 -#define E1000_DEV_ID_82543GC_COPPER 0x1004 -#define E1000_DEV_ID_82544EI_COPPER 0x1008 -#define E1000_DEV_ID_82544EI_FIBER 0x1009 -#define E1000_DEV_ID_82544GC_COPPER 0x100C -#define E1000_DEV_ID_82544GC_LOM 0x100D -#define E1000_DEV_ID_82540EM 0x100E -#define E1000_DEV_ID_82540EM_LOM 0x1015 -#define E1000_DEV_ID_82540EP_LOM 0x1016 -#define E1000_DEV_ID_82540EP 0x1017 -#define E1000_DEV_ID_82540EP_LP 0x101E -#define E1000_DEV_ID_82545EM_COPPER 0x100F -#define E1000_DEV_ID_82545EM_FIBER 0x1011 -#define E1000_DEV_ID_82545GM_COPPER 0x1026 -#define E1000_DEV_ID_82545GM_FIBER 0x1027 -#define E1000_DEV_ID_82545GM_SERDES 0x1028 -#define E1000_DEV_ID_82546EB_COPPER 0x1010 -#define E1000_DEV_ID_82546EB_FIBER 0x1012 -#define E1000_DEV_ID_82546EB_QUAD_COPPER 0x101D -#define E1000_DEV_ID_82541EI 0x1013 -#define E1000_DEV_ID_82541EI_MOBILE 0x1018 -#define E1000_DEV_ID_82541ER_LOM 0x1014 -#define E1000_DEV_ID_82541ER 0x1078 -#define E1000_DEV_ID_82547GI 0x1075 -#define E1000_DEV_ID_82541GI 0x1076 -#define E1000_DEV_ID_82541GI_MOBILE 0x1077 -#define E1000_DEV_ID_82541GI_LF 0x107C -#define E1000_DEV_ID_82546GB_COPPER 0x1079 -#define E1000_DEV_ID_82546GB_FIBER 0x107A -#define E1000_DEV_ID_82546GB_SERDES 0x107B -#define E1000_DEV_ID_82546GB_PCIE 0x108A -#define E1000_DEV_ID_82546GB_QUAD_COPPER 0x1099 -#define E1000_DEV_ID_82547EI 0x1019 -#define E1000_DEV_ID_82547EI_MOBILE 0x101A -#define E1000_DEV_ID_82571EB_COPPER 0x105E -#define E1000_DEV_ID_82571EB_FIBER 0x105F -#define E1000_DEV_ID_82571EB_SERDES 0x1060 -#define E1000_DEV_ID_82571EB_QUAD_COPPER 0x10A4 -#define E1000_DEV_ID_82571PT_QUAD_COPPER 0x10D5 -#define E1000_DEV_ID_82571EB_QUAD_FIBER 0x10A5 -#define E1000_DEV_ID_82571EB_QUAD_COPPER_LOWPROFILE 0x10BC -#define E1000_DEV_ID_82571EB_SERDES_DUAL 0x10D9 -#define E1000_DEV_ID_82571EB_SERDES_QUAD 0x10DA -#define E1000_DEV_ID_82572EI_COPPER 0x107D -#define E1000_DEV_ID_82572EI_FIBER 0x107E 
-#define E1000_DEV_ID_82572EI_SERDES 0x107F -#define E1000_DEV_ID_82572EI 0x10B9 -#define E1000_DEV_ID_82573E 0x108B -#define E1000_DEV_ID_82573E_IAMT 0x108C -#define E1000_DEV_ID_82573L 0x109A -#define E1000_DEV_ID_82574L 0x10D3 -#define E1000_DEV_ID_82546GB_QUAD_COPPER_KSP3 0x10B5 -#define E1000_DEV_ID_80003ES2LAN_COPPER_DPT 0x1096 -#define E1000_DEV_ID_80003ES2LAN_SERDES_DPT 0x1098 -#define E1000_DEV_ID_80003ES2LAN_COPPER_SPT 0x10BA -#define E1000_DEV_ID_80003ES2LAN_SERDES_SPT 0x10BB +#include "e1000x_regs.h" -#define E1000_DEV_ID_ICH8_IGP_M_AMT 0x1049 -#define E1000_DEV_ID_ICH8_IGP_AMT 0x104A -#define E1000_DEV_ID_ICH8_IGP_C 0x104B -#define E1000_DEV_ID_ICH8_IFE 0x104C -#define E1000_DEV_ID_ICH8_IFE_GT 0x10C4 -#define E1000_DEV_ID_ICH8_IFE_G 0x10C5 -#define E1000_DEV_ID_ICH8_IGP_M 0x104D - -/* Device Specific Register Defaults */ -#define E1000_PHY_ID2_82541x 0x380 -#define E1000_PHY_ID2_82544x 0xC30 -#define E1000_PHY_ID2_8254xx_DEFAULT 0xC20 /* 82540x, 82545x, and 82546x */ -#define E1000_PHY_ID2_82573x 0xCC0 -#define E1000_PHY_ID2_82574x 0xCB1 - -/* Register Set. (82543, 82544) - * - * Registers are defined to be 32 bits and should be accessed as 32 bit values. - * These registers are physically located on the NIC, but are mapped into the - * host memory address space. 
- * - * RW - register is both readable and writable - * RO - register is read only - * WO - register is write only - * R/clr - register is read only and is cleared when read - * A - register array - */ -#define E1000_CTRL 0x00000 /* Device Control - RW */ -#define E1000_CTRL_DUP 0x00004 /* Device Control Duplicate (Shadow) - RW */ -#define E1000_STATUS 0x00008 /* Device Status - RO */ -#define E1000_EECD 0x00010 /* EEPROM/Flash Control - RW */ -#define E1000_EERD 0x00014 /* EEPROM Read - RW */ -#define E1000_CTRL_EXT 0x00018 /* Extended Device Control - RW */ -#define E1000_FLA 0x0001C /* Flash Access - RW */ -#define E1000_MDIC 0x00020 /* MDI Control - RW */ -#define E1000_SCTL 0x00024 /* SerDes Control - RW */ -#define E1000_FEXTNVM 0x00028 /* Future Extended NVM register */ -#define E1000_FCAL 0x00028 /* Flow Control Address Low - RW */ -#define E1000_FCAH 0x0002C /* Flow Control Address High -RW */ -#define E1000_FCT 0x00030 /* Flow Control Type - RW */ -#define E1000_VET 0x00038 /* VLAN Ether Type - RW */ -#define E1000_ICR 0x000C0 /* Interrupt Cause Read - R/clr */ #define E1000_ITR 0x000C4 /* Interrupt Throttling Rate - RW */ -#define E1000_ICS 0x000C8 /* Interrupt Cause Set - WO */ -#define E1000_IMS 0x000D0 /* Interrupt Mask Set - RW */ #define E1000_EIAC 0x000DC /* Ext. 
Interrupt Auto Clear - RW */ -#define E1000_IMC 0x000D8 /* Interrupt Mask Clear - WO */ -#define E1000_IAM 0x000E0 /* Interrupt Acknowledge Auto Mask */ #define E1000_IVAR 0x000E4 /* Interrupt Vector Allocation Register - RW */ #define E1000_EITR 0x000E8 /* Extended Interrupt Throttling Rate - RW */ -#define E1000_RCTL 0x00100 /* RX Control - RW */ -#define E1000_RDTR1 0x02820 /* RX Delay Timer (1) - RW */ #define E1000_RDBAL1 0x02900 /* RX Descriptor Base Address Low (1) - RW */ #define E1000_RDBAH1 0x02904 /* RX Descriptor Base Address High (1) - RW */ #define E1000_RDLEN1 0x02908 /* RX Descriptor Length (1) - RW */ #define E1000_RDH1 0x02910 /* RX Descriptor Head (1) - RW */ #define E1000_RDT1 0x02918 /* RX Descriptor Tail (1) - RW */ -#define E1000_FCTTV 0x00170 /* Flow Control Transmit Timer Value - RW */ #define E1000_FCRTV 0x05F40 /* Flow Control Refresh Timer Value - RW */ #define E1000_TXCW 0x00178 /* TX Configuration Word - RW */ #define E1000_RXCW 0x00180 /* RX Configuration Word - RO */ -#define E1000_TCTL 0x00400 /* TX Control - RW */ -#define E1000_TCTL_EXT 0x00404 /* Extended TX Control - RW */ -#define E1000_TIPG 0x00410 /* TX Inter-packet gap -RW */ #define E1000_TBT 0x00448 /* TX Burst Timer - RW */ #define E1000_AIT 0x00458 /* Adaptive Interframe Spacing Throttle - RW */ -#define E1000_LEDCTL 0x00E00 /* LED Control - RW */ #define E1000_EXTCNF_CTRL 0x00F00 /* Extended Configuration Control */ #define E1000_EXTCNF_SIZE 0x00F08 /* Extended Configuration Size */ #define E1000_PHY_CTRL 0x00F10 /* PHY Control Register in CSR */ -#define FEXTNVM_SW_CONFIG 0x0001 #define E1000_PBA 0x01000 /* Packet Buffer Allocation - RW */ #define E1000_PBM 0x10000 /* Packet Buffer Memory - RW */ #define E1000_PBS 0x01008 /* Packet Buffer Size - RW */ -#define E1000_EEMNGCTL 0x01010 /* MNG EEprom Control */ -#define E1000_EEMNGDATA 0x01014 /* MNG EEPROM Read/Write data */ -#define E1000_FLMNGCTL 0x01018 /* MNG Flash Control */ -#define E1000_FLMNGDATA 0x0101C /* MNG 
FLASH Read data */ -#define E1000_FLMNGCNT 0x01020 /* MNG FLASH Read Counter */ -#define E1000_FLASH_UPDATES 1000 -#define E1000_EEARBC 0x01024 /* EEPROM Auto Read Bus Control */ #define E1000_FLASHT 0x01028 /* FLASH Timer Register */ #define E1000_EEWR 0x0102C /* EEPROM Write Register - RW */ #define E1000_FLSWCTL 0x01030 /* FLASH control register */ #define E1000_FLSWDATA 0x01034 /* FLASH data register */ #define E1000_FLSWCNT 0x01038 /* FLASH Access Counter */ -#define E1000_FLOP 0x0103C /* FLASH Opcode Register */ #define E1000_FLOL 0x01050 /* FEEP Auto Load */ #define E1000_ERT 0x02008 /* Early Rx Threshold - RW */ -#define E1000_FCRTL 0x02160 /* Flow Control Receive Threshold Low - RW */ -#define E1000_FCRTL_A 0x00168 /* Alias to FCRTL */ -#define E1000_FCRTH 0x02168 /* Flow Control Receive Threshold High - RW */ #define E1000_FCRTH_A 0x00160 /* Alias to FCRTH */ #define E1000_PSRCTL 0x02170 /* Packet Split Receive Control - RW */ #define E1000_RDBAL 0x02800 /* RX Descriptor Base Address Low - RW */ @@ -208,23 +86,7 @@ #define E1000_RADV 0x0282C /* RX Interrupt Absolute Delay Timer - RW */ #define E1000_RSRPD 0x02C00 /* RX Small Packet Detect - RW */ #define E1000_RAID 0x02C08 /* Receive Ack Interrupt Delay - RW */ -#define E1000_TXDMAC 0x03000 /* TX DMA Control - RW */ -#define E1000_KABGTXD 0x03004 /* AFE Band Gap Transmit Ref Data */ #define E1000_POEMB 0x00F10 /* PHY OEM Bits Register - RW */ -#define E1000_RDFH 0x02410 /* Receive Data FIFO Head Register - RW */ -#define E1000_RDFH_A 0x08000 /* Alias to RDFH */ -#define E1000_RDFT 0x02418 /* Receive Data FIFO Tail Register - RW */ -#define E1000_RDFT_A 0x08008 /* Alias to RDFT */ -#define E1000_RDFHS 0x02420 /* Receive Data FIFO Head Saved Register - RW */ -#define E1000_RDFTS 0x02428 /* Receive Data FIFO Tail Saved Register - RW */ -#define E1000_RDFPC 0x02430 /* Receive Data FIFO Packet Count - RW */ -#define E1000_TDFH 0x03410 /* TX Data FIFO Head - RW */ -#define E1000_TDFH_A 0x08010 /* Alias to TDFH 
*/ -#define E1000_TDFT 0x03418 /* TX Data FIFO Tail - RW */ -#define E1000_TDFT_A 0x08018 /* Alias to TDFT */ -#define E1000_TDFHS 0x03420 /* TX Data FIFO Head Saved - RW */ -#define E1000_TDFTS 0x03428 /* TX Data FIFO Tail Saved - RW */ -#define E1000_TDFPC 0x03430 /* TX Data FIFO Packet Count - RW */ #define E1000_TDBAL 0x03800 /* TX Descriptor Base Address Low - RW */ #define E1000_TDBAL_A 0x00420 /* Alias to TDBAL */ #define E1000_TDBAH 0x03804 /* TX Descriptor Base Address High - RW */ @@ -248,174 +110,40 @@ #define E1000_TDT1 0x03918 /* TX Desc Tail (1) - RW */ #define E1000_TXDCTL1 0x03928 /* TX Descriptor Control (1) - RW */ #define E1000_TARC1 0x03940 /* TX Arbitration Count (1) */ -#define E1000_CRCERRS 0x04000 /* CRC Error Count - R/clr */ -#define E1000_ALGNERRC 0x04004 /* Alignment Error Count - R/clr */ -#define E1000_SYMERRS 0x04008 /* Symbol Error Count - R/clr */ -#define E1000_RXERRC 0x0400C /* Receive Error Count - R/clr */ -#define E1000_MPC 0x04010 /* Missed Packet Count - R/clr */ -#define E1000_SCC 0x04014 /* Single Collision Count - R/clr */ -#define E1000_ECOL 0x04018 /* Excessive Collision Count - R/clr */ -#define E1000_MCC 0x0401C /* Multiple Collision Count - R/clr */ -#define E1000_LATECOL 0x04020 /* Late Collision Count - R/clr */ -#define E1000_COLC 0x04028 /* Collision Count - R/clr */ -#define E1000_DC 0x04030 /* Defer Count - R/clr */ -#define E1000_TNCRS 0x04034 /* TX-No CRS - R/clr */ #define E1000_SEQEC 0x04038 /* Sequence Error Count - R/clr */ #define E1000_CEXTERR 0x0403C /* Carrier Extension Error Count - R/clr */ -#define E1000_RLEC 0x04040 /* Receive Length Error Count - R/clr */ -#define E1000_XONRXC 0x04048 /* XON RX Count - R/clr */ -#define E1000_XONTXC 0x0404C /* XON TX Count - R/clr */ -#define E1000_XOFFRXC 0x04050 /* XOFF RX Count - R/clr */ -#define E1000_XOFFTXC 0x04054 /* XOFF TX Count - R/clr */ -#define E1000_FCRUC 0x04058 /* Flow Control RX Unsupported Count- R/clr */ -#define E1000_PRC64 0x0405C /* Packets 
RX (64 bytes) - R/clr */ -#define E1000_PRC127 0x04060 /* Packets RX (65-127 bytes) - R/clr */ -#define E1000_PRC255 0x04064 /* Packets RX (128-255 bytes) - R/clr */ -#define E1000_PRC511 0x04068 /* Packets RX (255-511 bytes) - R/clr */ -#define E1000_PRC1023 0x0406C /* Packets RX (512-1023 bytes) - R/clr */ -#define E1000_PRC1522 0x04070 /* Packets RX (1024-1522 bytes) - R/clr */ -#define E1000_GPRC 0x04074 /* Good Packets RX Count - R/clr */ -#define E1000_BPRC 0x04078 /* Broadcast Packets RX Count - R/clr */ -#define E1000_MPRC 0x0407C /* Multicast Packets RX Count - R/clr */ -#define E1000_GPTC 0x04080 /* Good Packets TX Count - R/clr */ -#define E1000_GORCL 0x04088 /* Good Octets RX Count Low - R/clr */ -#define E1000_GORCH 0x0408C /* Good Octets RX Count High - R/clr */ -#define E1000_GOTCL 0x04090 /* Good Octets TX Count Low - R/clr */ -#define E1000_GOTCH 0x04094 /* Good Octets TX Count High - R/clr */ -#define E1000_RNBC 0x040A0 /* RX No Buffers Count - R/clr */ -#define E1000_RUC 0x040A4 /* RX Undersize Count - R/clr */ -#define E1000_RFC 0x040A8 /* RX Fragment Count - R/clr */ -#define E1000_ROC 0x040AC /* RX Oversize Count - R/clr */ -#define E1000_RJC 0x040B0 /* RX Jabber Count - R/clr */ -#define E1000_MGTPRC 0x040B4 /* Management Packets RX Count - R/clr */ -#define E1000_MGTPDC 0x040B8 /* Management Packets Dropped Count - R/clr */ -#define E1000_MGTPTC 0x040BC /* Management Packets TX Count - R/clr */ -#define E1000_TORL 0x040C0 /* Total Octets RX Low - R/clr */ -#define E1000_TORH 0x040C4 /* Total Octets RX High - R/clr */ -#define E1000_TOTL 0x040C8 /* Total Octets TX Low - R/clr */ -#define E1000_TOTH 0x040CC /* Total Octets TX High - R/clr */ -#define E1000_TPR 0x040D0 /* Total Packets RX - R/clr */ -#define E1000_TPT 0x040D4 /* Total Packets TX - R/clr */ -#define E1000_PTC64 0x040D8 /* Packets TX (64 bytes) - R/clr */ -#define E1000_PTC127 0x040DC /* Packets TX (65-127 bytes) - R/clr */ -#define E1000_PTC255 0x040E0 /* Packets TX (128-255 
bytes) - R/clr */ -#define E1000_PTC511 0x040E4 /* Packets TX (256-511 bytes) - R/clr */ -#define E1000_PTC1023 0x040E8 /* Packets TX (512-1023 bytes) - R/clr */ -#define E1000_PTC1522 0x040EC /* Packets TX (1024-1522 Bytes) - R/clr */ -#define E1000_MPTC 0x040F0 /* Multicast Packets TX Count - R/clr */ -#define E1000_BPTC 0x040F4 /* Broadcast Packets TX Count - R/clr */ -#define E1000_TSCTC 0x040F8 /* TCP Segmentation Context TX - R/clr */ #define E1000_TSCTFC 0x040FC /* TCP Segmentation Context TX Fail - R/clr */ -#define E1000_IAC 0x04100 /* Interrupt Assertion Count */ -#define E1000_ICRXPTC 0x04104 /* Interrupt Cause Rx Packet Timer Expire Count */ #define E1000_ICRXATC 0x04108 /* Interrupt Cause Rx Absolute Timer Expire Count */ #define E1000_ICTXPTC 0x0410C /* Interrupt Cause Tx Packet Timer Expire Count */ #define E1000_ICTXATC 0x04110 /* Interrupt Cause Tx Absolute Timer Expire Count */ #define E1000_ICTXQEC 0x04118 /* Interrupt Cause Tx Queue Empty Count */ #define E1000_ICTXQMTC 0x0411C /* Interrupt Cause Tx Queue Minimum Threshold Count */ -#define E1000_ICRXDMTC 0x04120 /* Interrupt Cause Rx Descriptor Minimum Threshold Count */ #define E1000_ICRXOC 0x04124 /* Interrupt Cause Receiver Overrun Count */ -#define E1000_RXCSUM 0x05000 /* RX Checksum Control - RW */ -#define E1000_RFCTL 0x05008 /* Receive Filter Control*/ -#define E1000_MAVTV0 0x05010 /* Management VLAN TAG Value 0 */ -#define E1000_MAVTV1 0x05014 /* Management VLAN TAG Value 1 */ -#define E1000_MAVTV2 0x05018 /* Management VLAN TAG Value 2 */ -#define E1000_MAVTV3 0x0501c /* Management VLAN TAG Value 3 */ -#define E1000_MTA 0x05200 /* Multicast Table Array - RW Array */ -#define E1000_RA 0x05400 /* Receive Address - RW Array */ -#define E1000_RA_A 0x00040 /* Alias to RA */ -#define E1000_VFTA 0x05600 /* VLAN Filter Table Array - RW Array */ -#define E1000_VFTA_A 0x00600 /* Alias to VFTA */ -#define E1000_WUC 0x05800 /* Wakeup Control - RW */ -#define E1000_WUFC 0x05808 /* Wakeup Filter 
Control - RW */ -#define E1000_WUS 0x05810 /* Wakeup Status - RO */ -#define E1000_MANC 0x05820 /* Management Control - RW */ -#define E1000_IPAV 0x05838 /* IP Address Valid - RW */ -#define E1000_IP4AT 0x05840 /* IPv4 Address Table - RW Array */ -#define E1000_IP6AT 0x05880 /* IPv6 Address Table - RW Array */ -#define E1000_WUPL 0x05900 /* Wakeup Packet Length - RW */ -#define E1000_WUPM 0x05A00 /* Wakeup Packet Memory - RO A */ #define E1000_MFUTP01 0x05828 /* Management Flex UDP/TCP Ports 0/1 - RW */ #define E1000_MFUTP23 0x05830 /* Management Flex UDP/TCP Ports 2/3 - RW */ -#define E1000_MFVAL 0x05824 /* Manageability Filters Valid - RW */ -#define E1000_MDEF 0x05890 /* Manageability Decision Filters - RW Array */ #define E1000_FFLT 0x05F00 /* Flexible Filter Length Table - RW Array */ #define E1000_HOST_IF 0x08800 /* Host Interface */ -#define E1000_FFMT 0x09000 /* Flexible Filter Mask Table - RW Array */ -#define E1000_FTFT 0x09400 /* Flexible TCO Filter Table - RW Array */ #define E1000_FFVT 0x09800 /* Flexible Filter Value Table - RW Array */ #define E1000_KUMCTRLSTA 0x00034 /* MAC-PHY interface - RW */ #define E1000_MDPHYA 0x0003C /* PHY address - RW */ -#define E1000_MANC2H 0x05860 /* Management Control To Host - RW */ -#define E1000_SW_FW_SYNC 0x05B5C /* Software-Firmware Synchronization - RW */ -#define E1000_GCR 0x05B00 /* PCI-Ex Control */ -#define E1000_FUNCTAG 0x05B08 /* Function-Tag Register */ -#define E1000_GSCL_1 0x05B10 /* PCI-Ex Statistic Control #1 */ -#define E1000_GSCL_2 0x05B14 /* PCI-Ex Statistic Control #2 */ -#define E1000_GSCL_3 0x05B18 /* PCI-Ex Statistic Control #3 */ -#define E1000_GSCL_4 0x05B1C /* PCI-Ex Statistic Control #4 */ -#define E1000_GSCN_0 0x05B20 /* 3GIO Statistic Counter Register #0 */ -#define E1000_GSCN_1 0x05B24 /* 3GIO Statistic Counter Register #1 */ -#define E1000_GSCN_2 0x05B28 /* 3GIO Statistic Counter Register #2 */ -#define E1000_GSCN_3 0x05B2C /* 3GIO Statistic Counter Register #3 */ -#define E1000_FACTPS 
0x05B30 /* Function Active and Power State to MNG */ -#define E1000_SWSM 0x05B50 /* SW Semaphore */ #define E1000_GCR2 0x05B64 /* 3GIO Control Register 2 */ -#define E1000_FWSM 0x05B54 /* FW Semaphore */ -#define E1000_PBACLR 0x05B68 /* MSI-X PBA Clear */ #define E1000_FFLT_DBG 0x05F04 /* Debug Register */ #define E1000_HICR 0x08F00 /* Host Inteface Control */ -#define E1000_TSYNCRXCTL 0x0B620 /* Rx Time Sync Control register - RW */ -#define E1000_TSYNCTXCTL 0x0B614 /* Tx Time Sync Control register - RW */ -#define E1000_TIMINCA 0x0B608 /* Increment attributes register - RW */ -#define E1000_RXSTMPL 0x0B624 /* Rx timestamp Low - RO */ -#define E1000_RXSTMPH 0x0B628 /* Rx timestamp High - RO */ -#define E1000_TXSTMPL 0x0B618 /* Tx timestamp value Low - RO */ -#define E1000_TXSTMPH 0x0B61C /* Tx timestamp value High - RO */ -#define E1000_SYSTIML 0x0B600 /* System time register Low - RO */ -#define E1000_SYSTIMH 0x0B604 /* System time register High - RO */ -#define E1000_TIMINCA 0x0B608 /* Increment attributes register - RW */ #define E1000_RXMTRL 0x0B634 /* Time sync Rx EtherType and Msg Type - RW */ #define E1000_RXUDP 0x0B638 /* Time Sync Rx UDP Port - RW */ -#define E1000_RXSATRL 0x0B62C /* Rx timestamp attribute low - RO */ -#define E1000_RXSATRH 0x0B630 /* Rx timestamp attribute high - RO */ -#define E1000_TIMADJL 0x0B60C /* Time Adjustment Offset register Low - RW */ -#define E1000_TIMADJH 0x0B610 /* Time Adjustment Offset register High - RW */ #define E1000_RXCFGL 0x0B634 /* RX Ethertype and Message Type - RW*/ -/* RSS registers */ +#define E1000_MRQC_ENABLED(mrqc) (((mrqc) & (BIT(0) | BIT(1))) == BIT(0)) + #define E1000_CPUVEC 0x02C10 /* CPU Vector Register - RW */ -#define E1000_MRQC 0x05818 /* Multiple Receive Control - RW */ -#define E1000_RETA 0x05C00 /* Redirection Table - RW Array */ -#define E1000_RSSRK 0x05C80 /* RSS Random Key - RW Array */ #define E1000_RSSIM 0x05864 /* RSS Interrupt Mask */ #define E1000_RSSIR 0x05868 /* RSS Interrupt Request */ 
-#define E1000_MRQC_ENABLED(mrqc) (((mrqc) & (BIT(0) | BIT(1))) == BIT(0)) - -#define E1000_RETA_IDX(hash) ((hash) & (BIT(7) - 1)) -#define E1000_RETA_VAL(reta, hash) (((uint8_t *)(reta))[E1000_RETA_IDX(hash)]) #define E1000_RSS_QUEUE(reta, hash) ((E1000_RETA_VAL(reta, hash) & BIT(7)) >> 7) -#define E1000_MRQC_EN_TCPIPV4(mrqc) ((mrqc) & BIT(16)) -#define E1000_MRQC_EN_IPV4(mrqc) ((mrqc) & BIT(17)) -#define E1000_MRQC_EN_TCPIPV6(mrqc) ((mrqc) & BIT(18)) -#define E1000_MRQC_EN_IPV6EX(mrqc) ((mrqc) & BIT(19)) -#define E1000_MRQC_EN_IPV6(mrqc) ((mrqc) & BIT(20)) - -#define E1000_MRQ_RSS_TYPE_NONE (0) -#define E1000_MRQ_RSS_TYPE_IPV4TCP (1) -#define E1000_MRQ_RSS_TYPE_IPV4 (2) -#define E1000_MRQ_RSS_TYPE_IPV6TCP (3) -#define E1000_MRQ_RSS_TYPE_IPV6EX (4) -#define E1000_MRQ_RSS_TYPE_IPV6 (5) - -#define E1000_ICR_ASSERTED BIT(31) -#define E1000_EIAC_MASK 0x01F00000 - /* [TR]DBAL and [TR]DLEN masks */ #define E1000_XDBAL_MASK (~(BIT(4) - 1)) #define E1000_XDLEN_MASK ((BIT(20) - 1) & (~(BIT(7) - 1))) @@ -444,18 +172,8 @@ #define E1000_IVAR_TX_INT_EVERY_WB BIT(31) -/* RFCTL register bits */ -#define E1000_RFCTL_ISCSI_DIS 0x00000001 -#define E1000_RFCTL_NFSW_DIS 0x00000040 -#define E1000_RFCTL_NFSR_DIS 0x00000080 -#define E1000_RFCTL_IPV6_DIS 0x00000400 -#define E1000_RFCTL_IPV6_XSUM_DIS 0x00000800 #define E1000_RFCTL_ACK_DIS 0x00001000 #define E1000_RFCTL_ACK_DATA_DIS 0x00002000 -#define E1000_RFCTL_IPFRSP_DIS 0x00004000 -#define E1000_RFCTL_EXTEN 0x00008000 -#define E1000_RFCTL_IPV6_EX_DIS 0x00010000 -#define E1000_RFCTL_NEW_IPV6_EXT_DIS 0x00020000 /* PSRCTL parsing */ #define E1000_PSRCTL_BSIZE0_MASK 0x0000007F @@ -470,24 +188,7 @@ #define E1000_PSRCTL_BUFFS_PER_DESC 4 -/* TARC* parsing */ -#define E1000_TARC_ENABLE BIT(10) - /* PHY 1000 MII Register/Bit Definitions */ -/* PHY Registers defined by IEEE */ -#define PHY_CTRL 0x00 /* Control Register */ -#define PHY_STATUS 0x01 /* Status Regiser */ -#define PHY_ID1 0x02 /* Phy Id Reg (word 1) */ -#define PHY_ID2 0x03 /* Phy 
Id Reg (word 2) */ -#define PHY_AUTONEG_ADV 0x04 /* Autoneg Advertisement */ -#define PHY_LP_ABILITY 0x05 /* Link Partner Ability (Base Page) */ -#define PHY_AUTONEG_EXP 0x06 /* Autoneg Expansion Reg */ -#define PHY_NEXT_PAGE_TX 0x07 /* Next Page TX */ -#define PHY_LP_NEXT_PAGE 0x08 /* Link Partner Next Page */ -#define PHY_1000T_CTRL 0x09 /* 1000Base-T Control Reg */ -#define PHY_1000T_STATUS 0x0A /* 1000Base-T Status Reg */ -#define PHY_EXT_STATUS 0x0F /* Extended Status Reg */ - /* 82574-specific registers */ #define PHY_COPPER_CTRL1 0x10 /* Copper Specific Control Register 1 */ #define PHY_COPPER_STAT1 0x11 /* Copper Specific Status Register 1 */ @@ -539,287 +240,6 @@ #define M88E1000_PHY_VCO_REG_BIT8 0x100 /* Bits 8 & 11 are adjusted for */ #define M88E1000_PHY_VCO_REG_BIT11 0x800 /* improved BER performance */ -/* PHY Control Register */ -#define MII_CR_SPEED_SELECT_MSB 0x0040 /* bits 6,13: 10=1000, 01=100, 00=10 */ -#define MII_CR_COLL_TEST_ENABLE 0x0080 /* Collision test enable */ -#define MII_CR_FULL_DUPLEX 0x0100 /* FDX =1, half duplex =0 */ -#define MII_CR_RESTART_AUTO_NEG 0x0200 /* Restart auto negotiation */ -#define MII_CR_ISOLATE 0x0400 /* Isolate PHY from MII */ -#define MII_CR_POWER_DOWN 0x0800 /* Power down */ -#define MII_CR_AUTO_NEG_EN 0x1000 /* Auto Neg Enable */ -#define MII_CR_SPEED_SELECT_LSB 0x2000 /* bits 6,13: 10=1000, 01=100, 00=10 */ -#define MII_CR_LOOPBACK 0x4000 /* 0 = normal, 1 = loopback */ -#define MII_CR_RESET 0x8000 /* 0 = normal, 1 = PHY reset */ - -/* PHY Status Register */ -#define MII_SR_EXTENDED_CAPS 0x0001 /* Extended register capabilities */ -#define MII_SR_JABBER_DETECT 0x0002 /* Jabber Detected */ -#define MII_SR_LINK_STATUS 0x0004 /* Link Status 1 = link */ -#define MII_SR_AUTONEG_CAPS 0x0008 /* Auto Neg Capable */ -#define MII_SR_REMOTE_FAULT 0x0010 /* Remote Fault Detect */ -#define MII_SR_AUTONEG_COMPLETE 0x0020 /* Auto Neg Complete */ -#define MII_SR_PREAMBLE_SUPPRESS 0x0040 /* Preamble may be suppressed */ 
-#define MII_SR_EXTENDED_STATUS 0x0100 /* Ext. status info in Reg 0x0F */ -#define MII_SR_100T2_HD_CAPS 0x0200 /* 100T2 Half Duplex Capable */ -#define MII_SR_100T2_FD_CAPS 0x0400 /* 100T2 Full Duplex Capable */ -#define MII_SR_10T_HD_CAPS 0x0800 /* 10T Half Duplex Capable */ -#define MII_SR_10T_FD_CAPS 0x1000 /* 10T Full Duplex Capable */ -#define MII_SR_100X_HD_CAPS 0x2000 /* 100X Half Duplex Capable */ -#define MII_SR_100X_FD_CAPS 0x4000 /* 100X Full Duplex Capable */ -#define MII_SR_100T4_CAPS 0x8000 /* 100T4 Capable */ - -/* PHY Link Partner Ability Register */ -#define MII_LPAR_LPACK 0x4000 /* Acked by link partner */ - -/* Interrupt Cause Read */ -#define E1000_ICR_TXDW 0x00000001 /* Transmit desc written back */ -#define E1000_ICR_TXQE 0x00000002 /* Transmit Queue empty */ -#define E1000_ICR_LSC 0x00000004 /* Link Status Change */ -#define E1000_ICR_RXSEQ 0x00000008 /* rx sequence error */ -#define E1000_ICR_RXDMT0 0x00000010 /* rx desc min. threshold (0) */ -#define E1000_ICR_RXO 0x00000040 /* rx overrun */ -#define E1000_ICR_RXT0 0x00000080 /* rx timer intr (ring 0) */ -#define E1000_ICR_MDAC 0x00000200 /* MDIO access complete */ -#define E1000_ICR_RXCFG 0x00000400 /* RX /c/ ordered set */ -#define E1000_ICR_GPI_EN0 0x00000800 /* GP Int 0 */ -#define E1000_ICR_GPI_EN1 0x00001000 /* GP Int 1 */ -#define E1000_ICR_GPI_EN2 0x00002000 /* GP Int 2 */ -#define E1000_ICR_GPI_EN3 0x00004000 /* GP Int 3 */ -#define E1000_ICR_TXD_LOW 0x00008000 -#define E1000_ICR_SRPD 0x00010000 -#define E1000_ICR_ACK 0x00020000 /* Receive Ack frame */ -#define E1000_ICR_MNG 0x00040000 /* Manageability event */ -#define E1000_ICR_DOCK 0x00080000 /* Dock/Undock */ -#define E1000_ICR_INT_ASSERTED 0x80000000 /* If this bit asserted, the driver should claim the interrupt */ -#define E1000_ICR_RXD_FIFO_PAR0 0x00100000 /* queue 0 Rx descriptor FIFO parity error */ -#define E1000_ICR_TXD_FIFO_PAR0 0x00200000 /* queue 0 Tx descriptor FIFO parity error */ -#define E1000_ICR_HOST_ARB_PAR 
0x00400000 /* host arb read buffer parity error */ -#define E1000_ICR_PB_PAR 0x00800000 /* packet buffer parity error */ -#define E1000_ICR_RXD_FIFO_PAR1 0x01000000 /* queue 1 Rx descriptor FIFO parity error */ -#define E1000_ICR_TXD_FIFO_PAR1 0x02000000 /* queue 1 Tx descriptor FIFO parity error */ -#define E1000_ICR_ALL_PARITY 0x03F00000 /* all parity error bits */ -#define E1000_ICR_DSW 0x00000020 /* FW changed the status of DISSW bit in the FWSM */ -#define E1000_ICR_PHYINT 0x00001000 /* LAN connected device generates an interrupt */ -#define E1000_ICR_EPRST 0x00100000 /* ME handware reset occurs */ -#define E1000_ICR_RXQ0 0x00100000 /* Rx Queue 0 Interrupt */ -#define E1000_ICR_RXQ1 0x00200000 /* Rx Queue 1 Interrupt */ -#define E1000_ICR_TXQ0 0x00400000 /* Tx Queue 0 Interrupt */ -#define E1000_ICR_TXQ1 0x00800000 /* Tx Queue 1 Interrupt */ -#define E1000_ICR_OTHER 0x01000000 /* Other Interrupts */ - -#define E1000_ICR_OTHER_CAUSES (E1000_ICR_LSC | \ - E1000_ICR_RXO | \ - E1000_ICR_MDAC | \ - E1000_ICR_SRPD | \ - E1000_ICR_ACK | \ - E1000_ICR_MNG) - -/* Interrupt Cause Set */ -#define E1000_ICS_TXDW E1000_ICR_TXDW /* Transmit desc written back */ -#define E1000_ICS_TXQE E1000_ICR_TXQE /* Transmit Queue empty */ -#define E1000_ICS_LSC E1000_ICR_LSC /* Link Status Change */ -#define E1000_ICS_RXSEQ E1000_ICR_RXSEQ /* rx sequence error */ -#define E1000_ICS_RXDMT0 E1000_ICR_RXDMT0 /* rx desc min. 
threshold */ -#define E1000_ICS_RXO E1000_ICR_RXO /* rx overrun */ -#define E1000_ICS_RXT0 E1000_ICR_RXT0 /* rx timer intr */ -#define E1000_ICS_MDAC E1000_ICR_MDAC /* MDIO access complete */ -#define E1000_ICS_RXCFG E1000_ICR_RXCFG /* RX /c/ ordered set */ -#define E1000_ICS_GPI_EN0 E1000_ICR_GPI_EN0 /* GP Int 0 */ -#define E1000_ICS_GPI_EN1 E1000_ICR_GPI_EN1 /* GP Int 1 */ -#define E1000_ICS_GPI_EN2 E1000_ICR_GPI_EN2 /* GP Int 2 */ -#define E1000_ICS_GPI_EN3 E1000_ICR_GPI_EN3 /* GP Int 3 */ -#define E1000_ICS_TXD_LOW E1000_ICR_TXD_LOW -#define E1000_ICS_SRPD E1000_ICR_SRPD -#define E1000_ICS_ACK E1000_ICR_ACK /* Receive Ack frame */ -#define E1000_ICS_MNG E1000_ICR_MNG /* Manageability event */ -#define E1000_ICS_DOCK E1000_ICR_DOCK /* Dock/Undock */ -#define E1000_ICS_RXD_FIFO_PAR0 E1000_ICR_RXD_FIFO_PAR0 /* queue 0 Rx descriptor FIFO parity error */ -#define E1000_ICS_TXD_FIFO_PAR0 E1000_ICR_TXD_FIFO_PAR0 /* queue 0 Tx descriptor FIFO parity error */ -#define E1000_ICS_HOST_ARB_PAR E1000_ICR_HOST_ARB_PAR /* host arb read buffer parity error */ -#define E1000_ICS_PB_PAR E1000_ICR_PB_PAR /* packet buffer parity error */ -#define E1000_ICS_RXD_FIFO_PAR1 E1000_ICR_RXD_FIFO_PAR1 /* queue 1 Rx descriptor FIFO parity error */ -#define E1000_ICS_TXD_FIFO_PAR1 E1000_ICR_TXD_FIFO_PAR1 /* queue 1 Tx descriptor FIFO parity error */ -#define E1000_ICS_DSW E1000_ICR_DSW -#define E1000_ICS_PHYINT E1000_ICR_PHYINT -#define E1000_ICS_EPRST E1000_ICR_EPRST - -/* Interrupt Mask Set */ -#define E1000_IMS_TXDW E1000_ICR_TXDW /* Transmit desc written back */ -#define E1000_IMS_TXQE E1000_ICR_TXQE /* Transmit Queue empty */ -#define E1000_IMS_LSC E1000_ICR_LSC /* Link Status Change */ -#define E1000_IMS_RXSEQ E1000_ICR_RXSEQ /* rx sequence error */ -#define E1000_IMS_RXDMT0 E1000_ICR_RXDMT0 /* rx desc min. 
threshold */ -#define E1000_IMS_RXO E1000_ICR_RXO /* rx overrun */ -#define E1000_IMS_RXT0 E1000_ICR_RXT0 /* rx timer intr */ -#define E1000_IMS_MDAC E1000_ICR_MDAC /* MDIO access complete */ -#define E1000_IMS_RXCFG E1000_ICR_RXCFG /* RX /c/ ordered set */ -#define E1000_IMS_GPI_EN0 E1000_ICR_GPI_EN0 /* GP Int 0 */ -#define E1000_IMS_GPI_EN1 E1000_ICR_GPI_EN1 /* GP Int 1 */ -#define E1000_IMS_GPI_EN2 E1000_ICR_GPI_EN2 /* GP Int 2 */ -#define E1000_IMS_GPI_EN3 E1000_ICR_GPI_EN3 /* GP Int 3 */ -#define E1000_IMS_TXD_LOW E1000_ICR_TXD_LOW -#define E1000_IMS_SRPD E1000_ICR_SRPD -#define E1000_IMS_ACK E1000_ICR_ACK /* Receive Ack frame */ -#define E1000_IMS_MNG E1000_ICR_MNG /* Manageability event */ -#define E1000_IMS_RXQ0 E1000_ICR_RXQ0 -#define E1000_IMS_RXQ1 E1000_ICR_RXQ1 -#define E1000_IMS_TXQ0 E1000_ICR_TXQ0 -#define E1000_IMS_TXQ1 E1000_ICR_TXQ1 -#define E1000_IMS_OTHER E1000_ICR_OTHER -#define E1000_IMS_DOCK E1000_ICR_DOCK /* Dock/Undock */ -#define E1000_IMS_RXD_FIFO_PAR0 E1000_ICR_RXD_FIFO_PAR0 /* queue 0 Rx descriptor FIFO parity error */ -#define E1000_IMS_TXD_FIFO_PAR0 E1000_ICR_TXD_FIFO_PAR0 /* queue 0 Tx descriptor FIFO parity error */ -#define E1000_IMS_HOST_ARB_PAR E1000_ICR_HOST_ARB_PAR /* host arb read buffer parity error */ -#define E1000_IMS_PB_PAR E1000_ICR_PB_PAR /* packet buffer parity error */ -#define E1000_IMS_RXD_FIFO_PAR1 E1000_ICR_RXD_FIFO_PAR1 /* queue 1 Rx descriptor FIFO parity error */ -#define E1000_IMS_TXD_FIFO_PAR1 E1000_ICR_TXD_FIFO_PAR1 /* queue 1 Tx descriptor FIFO parity error */ -#define E1000_IMS_DSW E1000_ICR_DSW -#define E1000_IMS_PHYINT E1000_ICR_PHYINT -#define E1000_IMS_EPRST E1000_ICR_EPRST - -/* Interrupt Mask Clear */ -#define E1000_IMC_TXDW E1000_ICR_TXDW /* Transmit desc written back */ -#define E1000_IMC_TXQE E1000_ICR_TXQE /* Transmit Queue empty */ -#define E1000_IMC_LSC E1000_ICR_LSC /* Link Status Change */ -#define E1000_IMC_RXSEQ E1000_ICR_RXSEQ /* rx sequence error */ -#define E1000_IMC_RXDMT0 
E1000_ICR_RXDMT0 /* rx desc min. threshold */ -#define E1000_IMC_RXO E1000_ICR_RXO /* rx overrun */ -#define E1000_IMC_RXT0 E1000_ICR_RXT0 /* rx timer intr */ -#define E1000_IMC_MDAC E1000_ICR_MDAC /* MDIO access complete */ -#define E1000_IMC_RXCFG E1000_ICR_RXCFG /* RX /c/ ordered set */ -#define E1000_IMC_GPI_EN0 E1000_ICR_GPI_EN0 /* GP Int 0 */ -#define E1000_IMC_GPI_EN1 E1000_ICR_GPI_EN1 /* GP Int 1 */ -#define E1000_IMC_GPI_EN2 E1000_ICR_GPI_EN2 /* GP Int 2 */ -#define E1000_IMC_GPI_EN3 E1000_ICR_GPI_EN3 /* GP Int 3 */ -#define E1000_IMC_TXD_LOW E1000_ICR_TXD_LOW -#define E1000_IMC_SRPD E1000_ICR_SRPD -#define E1000_IMC_ACK E1000_ICR_ACK /* Receive Ack frame */ -#define E1000_IMC_MNG E1000_ICR_MNG /* Manageability event */ -#define E1000_IMC_DOCK E1000_ICR_DOCK /* Dock/Undock */ -#define E1000_IMC_RXD_FIFO_PAR0 E1000_ICR_RXD_FIFO_PAR0 /* queue 0 Rx descriptor FIFO parity error */ -#define E1000_IMC_TXD_FIFO_PAR0 E1000_ICR_TXD_FIFO_PAR0 /* queue 0 Tx descriptor FIFO parity error */ -#define E1000_IMC_HOST_ARB_PAR E1000_ICR_HOST_ARB_PAR /* host arb read buffer parity error */ -#define E1000_IMC_PB_PAR E1000_ICR_PB_PAR /* packet buffer parity error */ -#define E1000_IMC_RXD_FIFO_PAR1 E1000_ICR_RXD_FIFO_PAR1 /* queue 1 Rx descriptor FIFO parity error */ -#define E1000_IMC_TXD_FIFO_PAR1 E1000_ICR_TXD_FIFO_PAR1 /* queue 1 Tx descriptor FIFO parity error */ -#define E1000_IMC_DSW E1000_ICR_DSW -#define E1000_IMC_PHYINT E1000_ICR_PHYINT -#define E1000_IMC_EPRST E1000_ICR_EPRST - -/* Receive Control */ -#define E1000_RCTL_RST 0x00000001 /* Software reset */ -#define E1000_RCTL_EN 0x00000002 /* enable */ -#define E1000_RCTL_SBP 0x00000004 /* store bad packet */ -#define E1000_RCTL_UPE 0x00000008 /* unicast promiscuous enable */ -#define E1000_RCTL_MPE 0x00000010 /* multicast promiscuous enab */ -#define E1000_RCTL_LPE 0x00000020 /* long packet enable */ -#define E1000_RCTL_LBM_NO 0x00000000 /* no loopback mode */ -#define E1000_RCTL_LBM_MAC 0x00000040 /* MAC loopback 
mode */ -#define E1000_RCTL_LBM_SLP 0x00000080 /* serial link loopback mode */ -#define E1000_RCTL_LBM_TCVR 0x000000C0 /* tcvr loopback mode */ -#define E1000_RCTL_DTYP_MASK 0x00000C00 /* Descriptor type mask */ -#define E1000_RCTL_DTYP_PS 0x00000400 /* Packet Split descriptor */ -#define E1000_RCTL_RDMTS_HALF 0x00000000 /* rx desc min threshold size */ -#define E1000_RCTL_RDMTS_QUAT 0x00000100 /* rx desc min threshold size */ -#define E1000_RCTL_RDMTS_EIGTH 0x00000200 /* rx desc min threshold size */ -#define E1000_RCTL_MO_SHIFT 12 /* multicast offset shift */ -#define E1000_RCTL_MO_0 0x00000000 /* multicast offset 11:0 */ -#define E1000_RCTL_MO_1 0x00001000 /* multicast offset 12:1 */ -#define E1000_RCTL_MO_2 0x00002000 /* multicast offset 13:2 */ -#define E1000_RCTL_MO_3 0x00003000 /* multicast offset 15:4 */ -#define E1000_RCTL_MDR 0x00004000 /* multicast desc ring 0 */ -#define E1000_RCTL_BAM 0x00008000 /* broadcast enable */ -/* these buffer sizes are valid if E1000_RCTL_BSEX is 0 */ -#define E1000_RCTL_SZ_2048 0x00000000 /* rx buffer size 2048 */ -#define E1000_RCTL_SZ_1024 0x00010000 /* rx buffer size 1024 */ -#define E1000_RCTL_SZ_512 0x00020000 /* rx buffer size 512 */ -#define E1000_RCTL_SZ_256 0x00030000 /* rx buffer size 256 */ -/* these buffer sizes are valid if E1000_RCTL_BSEX is 1 */ -#define E1000_RCTL_SZ_16384 0x00010000 /* rx buffer size 16384 */ -#define E1000_RCTL_SZ_8192 0x00020000 /* rx buffer size 8192 */ -#define E1000_RCTL_SZ_4096 0x00030000 /* rx buffer size 4096 */ -#define E1000_RCTL_VFE 0x00040000 /* vlan filter enable */ -#define E1000_RCTL_CFIEN 0x00080000 /* canonical form enable */ -#define E1000_RCTL_CFI 0x00100000 /* canonical form indicator */ -#define E1000_RCTL_DPF 0x00400000 /* discard pause frames */ -#define E1000_RCTL_PMCF 0x00800000 /* pass MAC control frames */ -#define E1000_RCTL_BSEX 0x02000000 /* Buffer size extension */ -#define E1000_RCTL_SECRC 0x04000000 /* Strip Ethernet CRC */ -#define E1000_RCTL_FLXBUF_MASK 
0x78000000 /* Flexible buffer size */ -#define E1000_RCTL_FLXBUF_SHIFT 27 /* Flexible buffer shift */ - - -#define E1000_EEPROM_SWDPIN0 0x0001 /* SWDPIN 0 EEPROM Value */ -#define E1000_EEPROM_LED_LOGIC 0x0020 /* Led Logic Word */ -#define E1000_EEPROM_RW_REG_DATA 16 /* Offset to data in EEPROM read/write registers */ -#define E1000_EEPROM_RW_REG_DONE 0x10 /* Offset to READ/WRITE done bit */ -#define E1000_EEPROM_RW_REG_START 1 /* First bit for telling part to start operation */ -#define E1000_EEPROM_RW_ADDR_SHIFT 8 /* Shift to the address bits */ -#define E1000_EEPROM_POLL_WRITE 1 /* Flag for polling for write complete */ -#define E1000_EEPROM_POLL_READ 0 /* Flag for polling for read complete */ - -/* 82574 EERD/EEWR registers layout */ -#define E1000_EERW_START BIT(0) -#define E1000_EERW_DONE BIT(1) -#define E1000_EERW_ADDR_SHIFT 2 -#define E1000_EERW_ADDR_MASK ((1L << 14) - 1) -#define E1000_EERW_DATA_SHIFT 16 -#define E1000_EERW_DATA_MASK ((1L << 16) - 1) - -/* Register Bit Masks */ -/* Device Control */ -#define E1000_CTRL_FD 0x00000001 /* Full duplex.0=half; 1=full */ -#define E1000_CTRL_BEM 0x00000002 /* Endian Mode.0=little,1=big */ -#define E1000_CTRL_PRIOR 0x00000004 /* Priority on PCI. 0=rx,1=fair */ -#define E1000_CTRL_GIO_MASTER_DISABLE 0x00000004 /*Blocks new Master requests */ -#define E1000_CTRL_LRST 0x00000008 /* Link reset. 0=normal,1=reset */ -#define E1000_CTRL_TME 0x00000010 /* Test mode. 
0=normal,1=test */ -#define E1000_CTRL_SLE 0x00000020 /* Serial Link on 0=dis,1=en */ -#define E1000_CTRL_ASDE 0x00000020 /* Auto-speed detect enable */ -#define E1000_CTRL_SLU 0x00000040 /* Set link up (Force Link) */ -#define E1000_CTRL_ILOS 0x00000080 /* Invert Loss-Of Signal */ -#define E1000_CTRL_SPD_SEL 0x00000300 /* Speed Select Mask */ -#define E1000_CTRL_SPD_10 0x00000000 /* Force 10Mb */ -#define E1000_CTRL_SPD_100 0x00000100 /* Force 100Mb */ -#define E1000_CTRL_SPD_1000 0x00000200 /* Force 1Gb */ -#define E1000_CTRL_BEM32 0x00000400 /* Big Endian 32 mode */ -#define E1000_CTRL_FRCSPD 0x00000800 /* Force Speed */ -#define E1000_CTRL_FRCDPX 0x00001000 /* Force Duplex */ -#define E1000_CTRL_D_UD_EN 0x00002000 /* Dock/Undock enable */ -#define E1000_CTRL_D_UD_POLARITY 0x00004000 /* Defined polarity of Dock/Undock indication in SDP[0] */ -#define E1000_CTRL_FORCE_PHY_RESET 0x00008000 /* Reset both PHY ports, through PHYRST_N pin */ -#define E1000_CTRL_SPD_SHIFT 8 /* Speed Select Shift */ - -#define E1000_CTRL_EXT_ASDCHK 0x00001000 /* auto speed detection check */ -#define E1000_CTRL_EXT_EE_RST 0x00002000 /* EEPROM reset */ -#define E1000_CTRL_EXT_LINK_EN 0x00010000 /* enable link status from external LINK_0 and LINK_1 pins */ -#define E1000_CTRL_EXT_DRV_LOAD 0x10000000 /* Driver loaded bit for FW */ -#define E1000_CTRL_EXT_EIAME 0x01000000 -#define E1000_CTRL_EXT_IAME 0x08000000 /* Int ACK Auto-mask */ -#define E1000_CTRL_EXT_PBA_CLR 0x80000000 /* PBA Clear */ -#define E1000_CTRL_EXT_INT_TIMERS_CLEAR_ENA 0x20000000 -#define E1000_CTRL_EXT_SPD_BYPS 0x00008000 /* Speed Select Bypass */ - -#define E1000_CTRL_SWDPIN0 0x00040000 /* SWDPIN 0 value */ -#define E1000_CTRL_SWDPIN1 0x00080000 /* SWDPIN 1 value */ -#define E1000_CTRL_SWDPIN2 0x00100000 /* SWDPIN 2 value */ -#define E1000_CTRL_SWDPIN3 0x00200000 /* SWDPIN 3 value */ -#define E1000_CTRL_SWDPIO0 0x00400000 /* SWDPIN 0 Input or output */ -#define E1000_CTRL_SWDPIO1 0x00800000 /* SWDPIN 1 input or output */ 
-#define E1000_CTRL_SWDPIO2 0x01000000 /* SWDPIN 2 input or output */ -#define E1000_CTRL_SWDPIO3 0x02000000 /* SWDPIN 3 input or output */ -#define E1000_CTRL_ADVD3WUC 0x00100000 /* D3 WUC */ -#define E1000_CTRL_RST 0x04000000 /* Global reset */ -#define E1000_CTRL_RFCE 0x08000000 /* Receive Flow Control enable */ -#define E1000_CTRL_TFCE 0x10000000 /* Transmit flow control enable */ -#define E1000_CTRL_RTE 0x20000000 /* Routing tag enable */ -#define E1000_CTRL_VME 0x40000000 /* IEEE VLAN mode enable */ -#define E1000_CTRL_PHY_RST 0x80000000 /* PHY Reset */ -#define E1000_CTRL_SW2FW_INT 0x02000000 /* Initiate an interrupt to manageability engine */ - -/* Device Status */ -#define E1000_STATUS_FD 0x00000001 /* Full duplex.0=half,1=full */ -#define E1000_STATUS_LU 0x00000002 /* Link up.0=no,1=link */ #define E1000_STATUS_FUNC_MASK 0x0000000C /* PCI Function Mask */ #define E1000_STATUS_FUNC_SHIFT 2 #define E1000_STATUS_FUNC_0 0x00000000 /* Function 0 */ @@ -827,9 +247,6 @@ #define E1000_STATUS_TXOFF 0x00000010 /* transmission paused */ #define E1000_STATUS_TBIMODE 0x00000020 /* TBI mode */ #define E1000_STATUS_SPEED_MASK 0x000000C0 -#define E1000_STATUS_SPEED_10 0x00000000 /* Speed 10Mb/s */ -#define E1000_STATUS_SPEED_100 0x00000040 /* Speed 100Mb/s */ -#define E1000_STATUS_SPEED_1000 0x00000080 /* Speed 1000Mb/s */ #define E1000_STATUS_LAN_INIT_DONE 0x00000200 /* Lan Init Completion by EEPROM/Flash */ #define E1000_STATUS_ASDV 0x00000300 /* Auto speed detect value */ @@ -837,9 +254,7 @@ #define E1000_STATUS_ASDV_100 0x00000100 /* ASDV 100Mb */ #define E1000_STATUS_ASDV_1000 0x00000200 /* ASDV 1Gb */ #define E1000_STATUS_DOCK_CI 0x00000800 /* Change in Dock/Undock state. Clear on write '0'. */ -#define E1000_STATUS_GIO_MASTER_ENABLE 0x00080000 /* Status of Master requests. 
*/ #define E1000_STATUS_MTXCKOK 0x00000400 /* MTX clock running OK */ -#define E1000_STATUS_PHYRA 0x00000400 /* PHY Reset Asserted */ #define E1000_STATUS_PCI66 0x00000800 /* In 66Mhz slot */ #define E1000_STATUS_BUS64 0x00001000 /* In 64 bit slot */ #define E1000_STATUS_PCIX_MODE 0x00002000 /* PCI-X mode */ @@ -857,111 +272,6 @@ #define E1000_STATUS_SPEED_SHIFT 6 #define E1000_STATUS_ASDV_SHIFT 8 -/* EEPROM/Flash Control */ -#define E1000_EECD_SK 0x00000001 /* EEPROM Clock */ -#define E1000_EECD_CS 0x00000002 /* EEPROM Chip Select */ -#define E1000_EECD_DI 0x00000004 /* EEPROM Data In */ -#define E1000_EECD_DO 0x00000008 /* EEPROM Data Out */ -#define E1000_EECD_FWE_MASK 0x00000030 -#define E1000_EECD_FWE_DIS 0x00000010 /* Disable FLASH writes */ -#define E1000_EECD_FWE_EN 0x00000020 /* Enable FLASH writes */ -#define E1000_EECD_FWE_SHIFT 4 -#define E1000_EECD_REQ 0x00000040 /* EEPROM Access Request */ -#define E1000_EECD_GNT 0x00000080 /* EEPROM Access Grant */ -#define E1000_EECD_PRES 0x00000100 /* EEPROM Present */ -#define E1000_EECD_SIZE 0x00000200 /* EEPROM Size (0=64 word 1=256 word) */ -#define E1000_EECD_ADDR_BITS 0x00000400 /* EEPROM Addressing bits based on type - * (0-small, 1-large) */ -#define E1000_EECD_TYPE 0x00002000 /* EEPROM Type (1-SPI, 0-Microwire) */ -#ifndef E1000_EEPROM_GRANT_ATTEMPTS -#define E1000_EEPROM_GRANT_ATTEMPTS 1000 /* EEPROM # attempts to gain grant */ -#endif -#define E1000_EECD_AUTO_RD 0x00000200 /* EEPROM Auto Read done */ -#define E1000_EECD_SIZE_EX_MASK 0x00007800 /* EEprom Size */ -#define E1000_EECD_SIZE_EX_SHIFT 11 -#define E1000_EECD_NVADDS 0x00018000 /* NVM Address Size */ -#define E1000_EECD_SELSHAD 0x00020000 /* Select Shadow RAM */ -#define E1000_EECD_INITSRAM 0x00040000 /* Initialize Shadow RAM */ -#define E1000_EECD_FLUPD 0x00080000 /* Update FLASH */ -#define E1000_EECD_AUPDEN 0x00100000 /* Enable Autonomous FLASH update */ -#define E1000_EECD_SHADV 0x00200000 /* Shadow RAM Data Valid */ -#define 
E1000_EECD_SEC1VAL 0x00400000 /* Sector One Valid */ - - -#define E1000_EECD_SECVAL_SHIFT 22 -#define E1000_STM_OPCODE 0xDB00 -#define E1000_HICR_FW_RESET 0xC0 - -#define E1000_SHADOW_RAM_WORDS 2048 -#define E1000_ICH_NVM_SIG_WORD 0x13 -#define E1000_ICH_NVM_SIG_MASK 0xC0 - -/* MDI Control */ -#define E1000_MDIC_DATA_MASK 0x0000FFFF -#define E1000_MDIC_REG_MASK 0x001F0000 -#define E1000_MDIC_REG_SHIFT 16 -#define E1000_MDIC_PHY_MASK 0x03E00000 -#define E1000_MDIC_PHY_SHIFT 21 -#define E1000_MDIC_OP_WRITE 0x04000000 -#define E1000_MDIC_OP_READ 0x08000000 -#define E1000_MDIC_READY 0x10000000 -#define E1000_MDIC_INT_EN 0x20000000 -#define E1000_MDIC_ERROR 0x40000000 - -/* Rx Interrupt Delay Timer */ -#define E1000_RDTR_FPD BIT(31) - -/* Tx Interrupt Delay Timer */ -#define E1000_TIDV_FPD BIT(31) - -/* Delay increments in nanoseconds for delayed interrupts registers */ -#define E1000_INTR_DELAY_NS_RES (1024) - -/* Delay increments in nanoseconds for interrupt throttling registers */ -#define E1000_INTR_THROTTLING_NS_RES (256) - -/* EEPROM Commands - Microwire */ -#define EEPROM_READ_OPCODE_MICROWIRE 0x6 /* EEPROM read opcode */ -#define EEPROM_WRITE_OPCODE_MICROWIRE 0x5 /* EEPROM write opcode */ -#define EEPROM_ERASE_OPCODE_MICROWIRE 0x7 /* EEPROM erase opcode */ -#define EEPROM_EWEN_OPCODE_MICROWIRE 0x13 /* EEPROM erase/write enable */ -#define EEPROM_EWDS_OPCODE_MICROWIRE 0x10 /* EEPROM erast/write disable */ - -/* EEPROM Word Offsets */ -#define EEPROM_COMPAT 0x0003 -#define EEPROM_ID_LED_SETTINGS 0x0004 -#define EEPROM_VERSION 0x0005 -#define EEPROM_SERDES_AMPLITUDE 0x0006 /* For SERDES output amplitude adjustment. 
*/ -#define EEPROM_PHY_CLASS_WORD 0x0007 -#define EEPROM_INIT_CONTROL1_REG 0x000A -#define EEPROM_INIT_CONTROL2_REG 0x000F -#define EEPROM_SWDEF_PINS_CTRL_PORT_1 0x0010 -#define EEPROM_INIT_CONTROL3_PORT_B 0x0014 -#define EEPROM_INIT_3GIO_3 0x001A -#define EEPROM_SWDEF_PINS_CTRL_PORT_0 0x0020 -#define EEPROM_INIT_CONTROL3_PORT_A 0x0024 -#define EEPROM_CFG 0x0012 -#define EEPROM_FLASH_VERSION 0x0032 -#define EEPROM_CHECKSUM_REG 0x003F - -#define E1000_EEPROM_CFG_DONE 0x00040000 /* MNG config cycle done */ -#define E1000_EEPROM_CFG_DONE_PORT_1 0x00080000 /* ...for second port */ - -/* PCI Express Control */ -/* 3GIO Control Register - GCR (0x05B00; RW) */ -#define E1000_L0S_ADJUST (1 << 9) -#define E1000_L1_ENTRY_LATENCY_MSB (1 << 23) -#define E1000_L1_ENTRY_LATENCY_LSB (1 << 25 | 1 << 26) - -#define E1000_L0S_ADJUST (1 << 9) -#define E1000_L1_ENTRY_LATENCY_MSB (1 << 23) -#define E1000_L1_ENTRY_LATENCY_LSB (1 << 25 | 1 << 26) - -#define E1000_GCR_RO_BITS (1 << 23 | 1 << 25 | 1 << 26) - -/* MSI-X PBA Clear register */ -#define E1000_PBACLR_VALID_MASK (BIT(5) - 1) - /* Transmit Descriptor */ struct e1000_tx_desc { uint64_t buffer_addr; /* Address of the descriptor's data buffer */ @@ -983,269 +293,7 @@ struct e1000_tx_desc { } upper; }; -/* Transmit Descriptor bit definitions */ -#define E1000_TXD_DTYP_D 0x00100000 /* Data Descriptor */ -#define E1000_TXD_DTYP_C 0x00000000 /* Context Descriptor */ #define E1000_TXD_POPTS_IXSM 0x01 /* Insert IP checksum */ #define E1000_TXD_POPTS_TXSM 0x02 /* Insert TCP/UDP checksum */ -#define E1000_TXD_CMD_EOP 0x01000000 /* End of Packet */ -#define E1000_TXD_CMD_IFCS 0x02000000 /* Insert FCS (Ethernet CRC) */ -#define E1000_TXD_CMD_IC 0x04000000 /* Insert Checksum */ -#define E1000_TXD_CMD_RS 0x08000000 /* Report Status */ -#define E1000_TXD_CMD_RPS 0x10000000 /* Report Packet Sent */ -#define E1000_TXD_CMD_DEXT 0x20000000 /* Descriptor extension (0 = legacy) */ -#define E1000_TXD_CMD_VLE 0x40000000 /* Add VLAN tag */ -#define 
E1000_TXD_CMD_IDE 0x80000000 /* Enable Tidv register */ -#define E1000_TXD_STAT_DD 0x00000001 /* Descriptor Done */ -#define E1000_TXD_STAT_EC 0x00000002 /* Excess Collisions */ -#define E1000_TXD_STAT_LC 0x00000004 /* Late Collisions */ -#define E1000_TXD_STAT_TU 0x00000008 /* Transmit underrun */ -#define E1000_TXD_CMD_TCP 0x01000000 /* TCP packet */ -#define E1000_TXD_CMD_IP 0x02000000 /* IP packet */ -#define E1000_TXD_CMD_TSE 0x04000000 /* TCP Seg enable */ -#define E1000_TXD_CMD_SNAP 0x40000000 /* Update SNAP header */ -#define E1000_TXD_STAT_TC 0x00000004 /* Tx Underrun */ -#define E1000_TXD_EXTCMD_TSTAMP 0x00000010 /* IEEE1588 Timestamp packet */ - -/* Transmit Control */ -#define E1000_TCTL_RST 0x00000001 /* software reset */ -#define E1000_TCTL_EN 0x00000002 /* enable tx */ -#define E1000_TCTL_BCE 0x00000004 /* busy check enable */ -#define E1000_TCTL_PSP 0x00000008 /* pad short packets */ -#define E1000_TCTL_CT 0x00000ff0 /* collision threshold */ -#define E1000_TCTL_COLD 0x003ff000 /* collision distance */ -#define E1000_TCTL_SWXOFF 0x00400000 /* SW Xoff transmission */ -#define E1000_TCTL_PBE 0x00800000 /* Packet Burst Enable */ -#define E1000_TCTL_RTLC 0x01000000 /* Re-transmit on late collision */ -#define E1000_TCTL_NRTU 0x02000000 /* No Re-transmit on underrun */ -#define E1000_TCTL_MULR 0x10000000 /* Multiple request support */ - -/* Legacy Receive Descriptor */ -struct e1000_rx_desc { - uint64_t buffer_addr; /* Address of the descriptor's data buffer */ - uint16_t length; /* Length of data DMAed into data buffer */ - uint16_t csum; /* Packet checksum */ - uint8_t status; /* Descriptor status */ - uint8_t errors; /* Descriptor Errors */ - uint16_t special; -}; - -/* Extended Receive Descriptor */ -union e1000_rx_desc_extended { - struct { - uint64_t buffer_addr; - uint64_t reserved; - } read; - struct { - struct { - uint32_t mrq; /* Multiple Rx Queues */ - union { - uint32_t rss; /* RSS Hash */ - struct { - uint16_t ip_id; /* IP id */ - uint16_t 
csum; /* Packet Checksum */ - } csum_ip; - } hi_dword; - } lower; - struct { - uint32_t status_error; /* ext status/error */ - uint16_t length; - uint16_t vlan; /* VLAN tag */ - } upper; - } wb; /* writeback */ -}; - -#define MAX_PS_BUFFERS 4 - -/* Number of packet split data buffers (not including the header buffer) */ -#define PS_PAGE_BUFFERS (MAX_PS_BUFFERS - 1) - -/* Receive Descriptor - Packet Split */ -union e1000_rx_desc_packet_split { - struct { - /* one buffer for protocol header(s), three data buffers */ - uint64_t buffer_addr[MAX_PS_BUFFERS]; - } read; - struct { - struct { - uint32_t mrq; /* Multiple Rx Queues */ - union { - uint32_t rss; /* RSS Hash */ - struct { - uint16_t ip_id; /* IP id */ - uint16_t csum; /* Packet Checksum */ - } csum_ip; - } hi_dword; - } lower; - struct { - uint32_t status_error; /* ext status/error */ - uint16_t length0; /* length of buffer 0 */ - uint16_t vlan; /* VLAN tag */ - } middle; - struct { - uint16_t header_status; - /* length of buffers 1-3 */ - uint16_t length[PS_PAGE_BUFFERS]; - } upper; - uint64_t reserved; - } wb; /* writeback */ -}; - -/* Receive Checksum Control bits */ -#define E1000_RXCSUM_IPOFLD 0x100 /* IP Checksum Offload Enable */ -#define E1000_RXCSUM_TUOFLD 0x200 /* TCP/UDP Checksum Offload Enable */ -#define E1000_RXCSUM_PCSD 0x2000 /* Packet Checksum Disable */ - -#define E1000_RING_DESC_LEN (16) -#define E1000_RING_DESC_LEN_SHIFT (4) - -#define E1000_MIN_RX_DESC_LEN E1000_RING_DESC_LEN -#define E1000_MAX_RX_DESC_LEN (sizeof(union e1000_rx_desc_packet_split)) - -/* Receive Descriptor bit definitions */ -#define E1000_RXD_STAT_DD 0x01 /* Descriptor Done */ -#define E1000_RXD_STAT_EOP 0x02 /* End of Packet */ -#define E1000_RXD_STAT_IXSM 0x04 /* Ignore checksum */ -#define E1000_RXD_STAT_VP 0x08 /* IEEE VLAN Packet */ -#define E1000_RXD_STAT_UDPCS 0x10 /* UDP xsum caculated */ -#define E1000_RXD_STAT_TCPCS 0x20 /* TCP xsum calculated */ -#define E1000_RXD_STAT_IPCS 0x40 /* IP xsum calculated */ -#define 
E1000_RXD_STAT_PIF 0x80 /* passed in-exact filter */ -#define E1000_RXD_STAT_IPIDV 0x200 /* IP identification valid */ -#define E1000_RXD_STAT_UDPV 0x400 /* Valid UDP checksum */ -#define E1000_RXD_STAT_ACK 0x8000 /* ACK Packet indication */ -#define E1000_RXD_ERR_CE 0x01 /* CRC Error */ -#define E1000_RXD_ERR_SE 0x02 /* Symbol Error */ -#define E1000_RXD_ERR_SEQ 0x04 /* Sequence Error */ -#define E1000_RXD_ERR_CXE 0x10 /* Carrier Extension Error */ -#define E1000_RXD_ERR_TCPE 0x20 /* TCP/UDP Checksum Error */ -#define E1000_RXD_ERR_IPE 0x40 /* IP Checksum Error */ -#define E1000_RXD_ERR_RXE 0x80 /* Rx Data Error */ -#define E1000_RXD_SPC_VLAN_MASK 0x0FFF /* VLAN ID is in lower 12 bits */ -#define E1000_RXD_SPC_PRI_MASK 0xE000 /* Priority is in upper 3 bits */ -#define E1000_RXD_SPC_PRI_SHIFT 13 -#define E1000_RXD_SPC_CFI_MASK 0x1000 /* CFI is bit 12 */ -#define E1000_RXD_SPC_CFI_SHIFT 12 - -/* RX packet types */ -#define E1000_RXD_PKT_MAC (0) -#define E1000_RXD_PKT_IP4 (1) -#define E1000_RXD_PKT_IP4_XDP (2) -#define E1000_RXD_PKT_IP6 (5) -#define E1000_RXD_PKT_IP6_XDP (6) - -#define E1000_RXD_PKT_TYPE(t) ((t) << 16) - -#define E1000_RXDEXT_STATERR_CE 0x01000000 -#define E1000_RXDEXT_STATERR_SE 0x02000000 -#define E1000_RXDEXT_STATERR_SEQ 0x04000000 -#define E1000_RXDEXT_STATERR_CXE 0x10000000 -#define E1000_RXDEXT_STATERR_TCPE 0x20000000 -#define E1000_RXDEXT_STATERR_IPE 0x40000000 -#define E1000_RXDEXT_STATERR_RXE 0x80000000 - -#define E1000_RXDPS_HDRSTAT_HDRSP 0x00008000 -#define E1000_RXDPS_HDRSTAT_HDRLEN_MASK 0x000003FF - -/* Receive Address */ -#define E1000_RAH_AV 0x80000000 /* Receive descriptor valid */ - -/* Offload Context Descriptor */ -struct e1000_context_desc { - union { - uint32_t ip_config; - struct { - uint8_t ipcss; /* IP checksum start */ - uint8_t ipcso; /* IP checksum offset */ - uint16_t ipcse; /* IP checksum end */ - } ip_fields; - } lower_setup; - union { - uint32_t tcp_config; - struct { - uint8_t tucss; /* TCP checksum start */ - uint8_t 
tucso; /* TCP checksum offset */ - uint16_t tucse; /* TCP checksum end */ - } tcp_fields; - } upper_setup; - uint32_t cmd_and_length; /* */ - union { - uint32_t data; - struct { - uint8_t status; /* Descriptor status */ - uint8_t hdr_len; /* Header length */ - uint16_t mss; /* Maximum segment size */ - } fields; - } tcp_seg_setup; -}; - -/* Offload data descriptor */ -struct e1000_data_desc { - uint64_t buffer_addr; /* Address of the descriptor's buffer address */ - union { - uint32_t data; - struct { - uint16_t length; /* Data buffer length */ - uint8_t typ_len_ext; /* */ - uint8_t cmd; /* */ - } flags; - } lower; - union { - uint32_t data; - struct { - uint8_t status; /* Descriptor status */ - uint8_t popts; /* Packet Options */ - uint16_t special; /* */ - } fields; - } upper; -}; - -/* Management Control */ -#define E1000_MANC_SMBUS_EN 0x00000001 /* SMBus Enabled - RO */ -#define E1000_MANC_ASF_EN 0x00000002 /* ASF Enabled - RO */ -#define E1000_MANC_R_ON_FORCE 0x00000004 /* Reset on Force TCO - RO */ -#define E1000_MANC_RMCP_EN 0x00000100 /* Enable RCMP 026Fh Filtering */ -#define E1000_MANC_0298_EN 0x00000200 /* Enable RCMP 0298h Filtering */ -#define E1000_MANC_IPV4_EN 0x00000400 /* Enable IPv4 */ -#define E1000_MANC_IPV6_EN 0x00000800 /* Enable IPv6 */ -#define E1000_MANC_SNAP_EN 0x00001000 /* Accept LLC/SNAP */ -#define E1000_MANC_ARP_EN 0x00002000 /* Enable ARP Request Filtering */ -#define E1000_MANC_NEIGHBOR_EN 0x00004000 /* Enable Neighbor Discovery - * Filtering */ -#define E1000_MANC_ARP_RES_EN 0x00008000 /* Enable ARP response Filtering */ -#define E1000_MANC_DIS_IP_CHK_ARP 0x10000000 /* Disable IP address chacking */ - /*for ARP packets - in 82574 */ -#define E1000_MANC_TCO_RESET 0x00010000 /* TCO Reset Occurred */ -#define E1000_MANC_RCV_TCO_EN 0x00020000 /* Receive TCO Packets Enabled */ -#define E1000_MANC_REPORT_STATUS 0x00040000 /* Status Reporting Enabled */ -#define E1000_MANC_RCV_ALL 0x00080000 /* Receive All Enabled */ -#define 
E1000_MANC_BLK_PHY_RST_ON_IDE 0x00040000 /* Block phy resets */ -#define E1000_MANC_EN_MAC_ADDR_FILTER 0x00100000 /* Enable MAC address - * filtering */ -#define E1000_MANC_EN_MNG2HOST 0x00200000 /* Enable MNG packets to host - * memory */ -#define E1000_MANC_EN_IP_ADDR_FILTER 0x00400000 /* Enable IP address - * filtering */ -#define E1000_MANC_EN_XSUM_FILTER 0x00800000 /* Enable checksum filtering */ -#define E1000_MANC_BR_EN 0x01000000 /* Enable broadcast filtering */ -#define E1000_MANC_SMB_REQ 0x01000000 /* SMBus Request */ -#define E1000_MANC_SMB_GNT 0x02000000 /* SMBus Grant */ -#define E1000_MANC_SMB_CLK_IN 0x04000000 /* SMBus Clock In */ -#define E1000_MANC_SMB_DATA_IN 0x08000000 /* SMBus Data In */ -#define E1000_MANC_SMB_DATA_OUT 0x10000000 /* SMBus Data Out */ -#define E1000_MANC_SMB_CLK_OUT 0x20000000 /* SMBus Clock Out */ - -#define E1000_MANC_SMB_DATA_OUT_SHIFT 28 /* SMBus Data Out Shift */ -#define E1000_MANC_SMB_CLK_OUT_SHIFT 29 /* SMBus Clock Out Shift */ - -/* FACTPS Control */ -#define E1000_FACTPS_LAN0_ON 0x00000004 /* Lan 0 enable */ - -/* For checksumming, the sum of all words in the EEPROM should equal 0xBABA. */ -#define EEPROM_SUM 0xBABA - -/* I/O-Mapped Access to Internal Registers, Memories, and Flash */ -#define E1000_IOADDR 0x00 -#define E1000_IODATA 0x04 #endif /* HW_E1000_REGS_H */ diff --git a/hw/net/e1000e.c b/hw/net/e1000e.c index 7523e9f5d2..c3848797b8 100644 --- a/hw/net/e1000e.c +++ b/hw/net/e1000e.c @@ -1,37 +1,37 @@ /* -* QEMU INTEL 82574 GbE NIC emulation -* -* Software developer's manuals: -* http://www.intel.com/content/dam/doc/datasheet/82574l-gbe-controller-datasheet.pdf -* -* Copyright (c) 2015 Ravello Systems LTD (http://ravellosystems.com) -* Developed by Daynix Computing LTD (http://www.daynix.com) -* -* Authors: -* Dmitry Fleytman <dmitry@daynix.com> -* Leonid Bloch <leonid@daynix.com> -* Yan Vugenfirer <yan@daynix.com> -* -* Based on work done by: -* Nir Peleg, Tutis Systems Ltd. for Qumranet Inc. 
-* Copyright (c) 2008 Qumranet -* Based on work done by: -* Copyright (c) 2007 Dan Aloni -* Copyright (c) 2004 Antony T Curtis -* -* This library is free software; you can redistribute it and/or -* modify it under the terms of the GNU Lesser General Public -* License as published by the Free Software Foundation; either -* version 2.1 of the License, or (at your option) any later version. -* -* This library is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -* Lesser General Public License for more details. -* -* You should have received a copy of the GNU Lesser General Public -* License along with this library; if not, see <http://www.gnu.org/licenses/>. -*/ + * QEMU INTEL 82574 GbE NIC emulation + * + * Software developer's manuals: + * http://www.intel.com/content/dam/doc/datasheet/82574l-gbe-controller-datasheet.pdf + * + * Copyright (c) 2015 Ravello Systems LTD (http://ravellosystems.com) + * Developed by Daynix Computing LTD (http://www.daynix.com) + * + * Authors: + * Dmitry Fleytman <dmitry@daynix.com> + * Leonid Bloch <leonid@daynix.com> + * Yan Vugenfirer <yan@daynix.com> + * + * Based on work done by: + * Nir Peleg, Tutis Systems Ltd. for Qumranet Inc. + * Copyright (c) 2008 Qumranet + * Based on work done by: + * Copyright (c) 2007 Dan Aloni + * Copyright (c) 2004 Antony T Curtis + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ #include "qemu/osdep.h" #include "qemu/units.h" @@ -42,13 +42,13 @@ #include "qemu/range.h" #include "sysemu/sysemu.h" #include "hw/hw.h" +#include "hw/net/mii.h" #include "hw/pci/msi.h" #include "hw/pci/msix.h" #include "hw/qdev-properties.h" #include "migration/vmstate.h" -#include "e1000_regs.h" - +#include "e1000_common.h" #include "e1000x_common.h" #include "e1000e_core.h" @@ -81,6 +81,7 @@ struct E1000EState { E1000ECore core; bool init_vet; + bool timadj; }; #define E1000E_MMIO_IDX 0 @@ -239,9 +240,9 @@ static NetClientInfo net_e1000e_info = { }; /* -* EEPROM (NVM) contents documented in Table 36, section 6.1 -* and generally 6.1.2 Software accessed words. -*/ + * EEPROM (NVM) contents documented in Table 36, section 6.1 + * and generally 6.1.2 Software accessed words. + */ static const uint16_t e1000e_eeprom_template[64] = { /* Address | Compat. | ImVer | Compat. 
*/ 0x0000, 0x0000, 0x0000, 0x0420, 0xf746, 0x2010, 0xffff, 0xffff, @@ -512,11 +513,11 @@ static void e1000e_pci_uninit(PCIDevice *pci_dev) msi_uninit(pci_dev); } -static void e1000e_qdev_reset(DeviceState *dev) +static void e1000e_qdev_reset_hold(Object *obj) { - E1000EState *s = E1000E(dev); + E1000EState *s = E1000E(obj); - trace_e1000e_cb_qdev_reset(); + trace_e1000e_cb_qdev_reset_hold(); e1000e_core_reset(&s->core); @@ -553,6 +554,12 @@ static int e1000e_post_load(void *opaque, int version_id) return e1000e_core_post_load(&s->core); } +static bool e1000e_migrate_timadj(void *opaque, int version_id) +{ + E1000EState *s = opaque; + return s->timadj; +} + static const VMStateDescription e1000e_vmstate_tx = { .name = "e1000e-tx", .version_id = 1, @@ -630,12 +637,11 @@ static const VMStateDescription e1000e_vmstate = { VMSTATE_E1000E_INTR_DELAY_TIMER(core.tidv, E1000EState), VMSTATE_E1000E_INTR_DELAY_TIMER(core.itr, E1000EState), - VMSTATE_BOOL(core.itr_intr_pending, E1000EState), + VMSTATE_UNUSED(1), VMSTATE_E1000E_INTR_DELAY_TIMER_ARRAY(core.eitr, E1000EState, E1000E_MSIX_VEC_NUM), - VMSTATE_BOOL_ARRAY(core.eitr_intr_pending, E1000EState, - E1000E_MSIX_VEC_NUM), + VMSTATE_UNUSED(E1000E_MSIX_VEC_NUM), VMSTATE_UINT32(core.itr_guest_value, E1000EState), VMSTATE_UINT32_ARRAY(core.eitr_guest_value, E1000EState, @@ -645,6 +651,9 @@ static const VMStateDescription e1000e_vmstate = { VMSTATE_STRUCT_ARRAY(core.tx, E1000EState, E1000E_NUM_QUEUES, 0, e1000e_vmstate_tx, struct e1000e_tx), + + VMSTATE_INT64_TEST(core.timadj, E1000EState, e1000e_migrate_timadj), + VMSTATE_END_OF_LIST() } }; @@ -663,12 +672,14 @@ static Property e1000e_properties[] = { DEFINE_PROP_SIGNED("subsys", E1000EState, subsys, 0, e1000e_prop_subsys, uint16_t), DEFINE_PROP_BOOL("init-vet", E1000EState, init_vet, true), + DEFINE_PROP_BOOL("migrate-timadj", E1000EState, timadj, true), DEFINE_PROP_END_OF_LIST(), }; static void e1000e_class_init(ObjectClass *class, void *data) { DeviceClass *dc = 
DEVICE_CLASS(class); + ResettableClass *rc = RESETTABLE_CLASS(class); PCIDeviceClass *c = PCI_DEVICE_CLASS(class); c->realize = e1000e_pci_realize; @@ -679,8 +690,9 @@ static void e1000e_class_init(ObjectClass *class, void *data) c->romfile = "efi-e1000e.rom"; c->class_id = PCI_CLASS_NETWORK_ETHERNET; + rc->phases.hold = e1000e_qdev_reset_hold; + dc->desc = "Intel 82574L GbE Controller"; - dc->reset = e1000e_qdev_reset; dc->vmsd = &e1000e_vmstate; e1000e_prop_disable_vnet = qdev_prop_uint8; diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c index fc9cdb4528..4d9679ca0b 100644 --- a/hw/net/e1000e_core.c +++ b/hw/net/e1000e_core.c @@ -1,42 +1,43 @@ /* -* Core code for QEMU e1000e emulation -* -* Software developer's manuals: -* http://www.intel.com/content/dam/doc/datasheet/82574l-gbe-controller-datasheet.pdf -* -* Copyright (c) 2015 Ravello Systems LTD (http://ravellosystems.com) -* Developed by Daynix Computing LTD (http://www.daynix.com) -* -* Authors: -* Dmitry Fleytman <dmitry@daynix.com> -* Leonid Bloch <leonid@daynix.com> -* Yan Vugenfirer <yan@daynix.com> -* -* Based on work done by: -* Nir Peleg, Tutis Systems Ltd. for Qumranet Inc. -* Copyright (c) 2008 Qumranet -* Based on work done by: -* Copyright (c) 2007 Dan Aloni -* Copyright (c) 2004 Antony T Curtis -* -* This library is free software; you can redistribute it and/or -* modify it under the terms of the GNU Lesser General Public -* License as published by the Free Software Foundation; either -* version 2.1 of the License, or (at your option) any later version. -* -* This library is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -* Lesser General Public License for more details. -* -* You should have received a copy of the GNU Lesser General Public -* License along with this library; if not, see <http://www.gnu.org/licenses/>. 
-*/ + * Core code for QEMU e1000e emulation + * + * Software developer's manuals: + * http://www.intel.com/content/dam/doc/datasheet/82574l-gbe-controller-datasheet.pdf + * + * Copyright (c) 2015 Ravello Systems LTD (http://ravellosystems.com) + * Developed by Daynix Computing LTD (http://www.daynix.com) + * + * Authors: + * Dmitry Fleytman <dmitry@daynix.com> + * Leonid Bloch <leonid@daynix.com> + * Yan Vugenfirer <yan@daynix.com> + * + * Based on work done by: + * Nir Peleg, Tutis Systems Ltd. for Qumranet Inc. + * Copyright (c) 2008 Qumranet + * Based on work done by: + * Copyright (c) 2007 Dan Aloni + * Copyright (c) 2004 Antony T Curtis + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. 
+ */ #include "qemu/osdep.h" #include "qemu/log.h" #include "net/net.h" #include "net/tap.h" +#include "hw/net/mii.h" #include "hw/pci/msi.h" #include "hw/pci/msix.h" #include "sysemu/runstate.h" @@ -44,18 +45,32 @@ #include "net_tx_pkt.h" #include "net_rx_pkt.h" +#include "e1000_common.h" #include "e1000x_common.h" #include "e1000e_core.h" #include "trace.h" -#define E1000E_MIN_XITR (500) /* No more then 7813 interrupts per - second according to spec 10.2.4.2 */ +/* No more then 7813 interrupts per second according to spec 10.2.4.2 */ +#define E1000E_MIN_XITR (500) + #define E1000E_MAX_TX_FRAGS (64) +union e1000_rx_desc_union { + struct e1000_rx_desc legacy; + union e1000_rx_desc_extended extended; + union e1000_rx_desc_packet_split packet_split; +}; + +static ssize_t +e1000e_receive_internal(E1000ECore *core, const struct iovec *iov, int iovcnt, + bool has_vnet); + static inline void e1000e_set_interrupt_cause(E1000ECore *core, uint32_t val); +static void e1000e_reset(E1000ECore *core, bool sw); + static inline void e1000e_process_ts_option(E1000ECore *core, struct e1000_tx_desc *dp) { @@ -148,15 +163,8 @@ e1000e_intrmgr_on_throttling_timer(void *opaque) { E1000IntrDelayTimer *timer = opaque; - assert(!msix_enabled(timer->core->owner)); - timer->running = false; - if (!timer->core->itr_intr_pending) { - trace_e1000e_irq_throttling_no_pending_interrupts(); - return; - } - if (msi_enabled(timer->core->owner)) { trace_e1000e_irq_msi_notify_postponed(); /* Clear msi_causes_pending to fire MSI eventually */ @@ -174,15 +182,8 @@ e1000e_intrmgr_on_msix_throttling_timer(void *opaque) E1000IntrDelayTimer *timer = opaque; int idx = timer - &timer->core->eitr[0]; - assert(msix_enabled(timer->core->owner)); - timer->running = false; - if (!timer->core->eitr_intr_pending[idx]) { - trace_e1000e_irq_throttling_no_pending_vec(idx); - return; - } - trace_e1000e_irq_msix_notify_postponed_vec(idx); msix_notify(timer->core->owner, idx); } @@ -282,14 +283,18 @@ 
e1000e_intrmgr_delay_rx_causes(E1000ECore *core, uint32_t *causes) core->delayed_causes |= *causes & delayable_causes; *causes &= ~delayable_causes; - /* Check if delayed RX interrupts disabled by client - or if there are causes that cannot be delayed */ + /* + * Check if delayed RX interrupts disabled by client + * or if there are causes that cannot be delayed + */ if ((rdtr == 0) || (*causes != 0)) { return false; } - /* Check if delayed RX ACK interrupts disabled by client - and there is an ACK packet received */ + /* + * Check if delayed RX ACK interrupts disabled by client + * and there is an ACK packet received + */ if ((raid == 0) && (core->delayed_causes & E1000_ICR_ACK)) { return false; } @@ -493,27 +498,27 @@ typedef struct E1000E_RSSInfo_st { static uint32_t e1000e_rss_get_hash_type(E1000ECore *core, struct NetRxPkt *pkt) { - bool isip4, isip6, isudp, istcp; + bool hasip4, hasip6; + EthL4HdrProto l4hdr_proto; assert(e1000e_rss_enabled(core)); - net_rx_pkt_get_protocols(pkt, &isip4, &isip6, &isudp, &istcp); - - if (isip4) { - bool fragment = net_rx_pkt_get_ip4_info(pkt)->fragment; + net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto); - trace_e1000e_rx_rss_ip4(fragment, istcp, core->mac[MRQC], + if (hasip4) { + trace_e1000e_rx_rss_ip4(l4hdr_proto, core->mac[MRQC], E1000_MRQC_EN_TCPIPV4(core->mac[MRQC]), E1000_MRQC_EN_IPV4(core->mac[MRQC])); - if (!fragment && istcp && E1000_MRQC_EN_TCPIPV4(core->mac[MRQC])) { + if (l4hdr_proto == ETH_L4_HDR_PROTO_TCP && + E1000_MRQC_EN_TCPIPV4(core->mac[MRQC])) { return E1000_MRQ_RSS_TYPE_IPV4TCP; } if (E1000_MRQC_EN_IPV4(core->mac[MRQC])) { return E1000_MRQ_RSS_TYPE_IPV4; } - } else if (isip6) { + } else if (hasip6) { eth_ip6_hdr_info *ip6info = net_rx_pkt_get_ip6_info(pkt); bool ex_dis = core->mac[RFCTL] & E1000_RFCTL_IPV6_EX_DIS; @@ -527,7 +532,7 @@ e1000e_rss_get_hash_type(E1000ECore *core, struct NetRxPkt *pkt) * backends like these. 
*/ trace_e1000e_rx_rss_ip6_rfctl(core->mac[RFCTL]); - trace_e1000e_rx_rss_ip6(ex_dis, new_ex_dis, istcp, + trace_e1000e_rx_rss_ip6(ex_dis, new_ex_dis, l4hdr_proto, ip6info->has_ext_hdrs, ip6info->rss_ex_dst_valid, ip6info->rss_ex_src_valid, @@ -540,7 +545,7 @@ e1000e_rss_get_hash_type(E1000ECore *core, struct NetRxPkt *pkt) (!new_ex_dis || !(ip6info->rss_ex_dst_valid || ip6info->rss_ex_src_valid))) { - if (istcp && !ip6info->fragment && + if (l4hdr_proto == ETH_L4_HDR_PROTO_TCP && E1000_MRQC_EN_TCPIPV6(core->mac[MRQC])) { return E1000_MRQ_RSS_TYPE_IPV6TCP; } @@ -625,23 +630,39 @@ e1000e_rss_parse_packet(E1000ECore *core, info->queue = E1000_RSS_QUEUE(&core->mac[RETA], info->hash); } -static void +static bool e1000e_setup_tx_offloads(E1000ECore *core, struct e1000e_tx *tx) { if (tx->props.tse && tx->cptse) { - net_tx_pkt_build_vheader(tx->tx_pkt, true, true, tx->props.mss); + if (!net_tx_pkt_build_vheader(tx->tx_pkt, true, true, tx->props.mss)) { + return false; + } + net_tx_pkt_update_ip_checksums(tx->tx_pkt); e1000x_inc_reg_if_not_full(core->mac, TSCTC); - return; + return true; } if (tx->sum_needed & E1000_TXD_POPTS_TXSM) { - net_tx_pkt_build_vheader(tx->tx_pkt, false, true, 0); + if (!net_tx_pkt_build_vheader(tx->tx_pkt, false, true, 0)) { + return false; + } } if (tx->sum_needed & E1000_TXD_POPTS_IXSM) { net_tx_pkt_update_ip_hdr_checksum(tx->tx_pkt); } + + return true; +} + +static void e1000e_tx_pkt_callback(void *core, + const struct iovec *iov, + int iovcnt, + const struct iovec *virt_iov, + int virt_iovcnt) +{ + e1000e_receive_internal(core, virt_iov, virt_iovcnt, true); } static bool @@ -650,13 +671,16 @@ e1000e_tx_pkt_send(E1000ECore *core, struct e1000e_tx *tx, int queue_index) int target_queue = MIN(core->max_queue_num, queue_index); NetClientState *queue = qemu_get_subqueue(core->owner_nic, target_queue); - e1000e_setup_tx_offloads(core, tx); + if (!e1000e_setup_tx_offloads(core, tx)) { + return false; + } net_tx_pkt_dump(tx->tx_pkt); - if 
((core->phy[0][PHY_CTRL] & MII_CR_LOOPBACK) || + if ((core->phy[0][MII_BMCR] & MII_BMCR_LOOPBACK) || ((core->mac[RCTL] & E1000_RCTL_LBM_MAC) == E1000_RCTL_LBM_MAC)) { - return net_tx_pkt_send_loopback(tx->tx_pkt, queue); + return net_tx_pkt_send_custom(tx->tx_pkt, false, + e1000e_tx_pkt_callback, core); } else { return net_tx_pkt_send(tx->tx_pkt, queue); } @@ -668,7 +692,7 @@ e1000e_on_tx_done_update_stats(E1000ECore *core, struct NetTxPkt *tx_pkt) static const int PTCregs[6] = { PTC64, PTC127, PTC255, PTC511, PTC1023, PTC1522 }; - size_t tot_len = net_tx_pkt_get_total_len(tx_pkt); + size_t tot_len = net_tx_pkt_get_total_len(tx_pkt) + 4; e1000x_increase_size_stats(core->mac, PTCregs, tot_len); e1000x_inc_reg_if_not_full(core->mac, TPT); @@ -1016,10 +1040,11 @@ e1000e_receive_filter(E1000ECore *core, const uint8_t *buf, int size) if (e1000x_is_vlan_packet(buf, core->mac[VET]) && e1000x_vlan_rx_filter_enabled(core->mac)) { - uint16_t vid = lduw_be_p(buf + 14); - uint32_t vfta = ldl_le_p((uint32_t *)(core->mac + VFTA) + - ((vid >> 5) & 0x7f)); - if ((vfta & (1 << (vid & 0x1f))) == 0) { + uint16_t vid = lduw_be_p(&PKT_GET_VLAN_HDR(buf)->h_tci); + uint32_t vfta = + ldl_le_p((uint32_t *)(core->mac + VFTA) + + ((vid >> E1000_VFTA_ENTRY_SHIFT) & E1000_VFTA_ENTRY_MASK)); + if ((vfta & (1 << (vid & E1000_VFTA_ENTRY_BIT_SHIFT_MASK))) == 0) { trace_e1000e_rx_flt_vlan_mismatch(vid); return false; } else { @@ -1054,48 +1079,47 @@ e1000e_receive_filter(E1000ECore *core, const uint8_t *buf, int size) } static inline void -e1000e_read_lgcy_rx_descr(E1000ECore *core, uint8_t *desc, hwaddr *buff_addr) +e1000e_read_lgcy_rx_descr(E1000ECore *core, struct e1000_rx_desc *desc, + hwaddr *buff_addr) { - struct e1000_rx_desc *d = (struct e1000_rx_desc *) desc; - *buff_addr = le64_to_cpu(d->buffer_addr); + *buff_addr = le64_to_cpu(desc->buffer_addr); } static inline void -e1000e_read_ext_rx_descr(E1000ECore *core, uint8_t *desc, hwaddr *buff_addr) +e1000e_read_ext_rx_descr(E1000ECore *core, 
union e1000_rx_desc_extended *desc, + hwaddr *buff_addr) { - union e1000_rx_desc_extended *d = (union e1000_rx_desc_extended *) desc; - *buff_addr = le64_to_cpu(d->read.buffer_addr); + *buff_addr = le64_to_cpu(desc->read.buffer_addr); } static inline void -e1000e_read_ps_rx_descr(E1000ECore *core, uint8_t *desc, - hwaddr (*buff_addr)[MAX_PS_BUFFERS]) +e1000e_read_ps_rx_descr(E1000ECore *core, + union e1000_rx_desc_packet_split *desc, + hwaddr buff_addr[MAX_PS_BUFFERS]) { int i; - union e1000_rx_desc_packet_split *d = - (union e1000_rx_desc_packet_split *) desc; for (i = 0; i < MAX_PS_BUFFERS; i++) { - (*buff_addr)[i] = le64_to_cpu(d->read.buffer_addr[i]); + buff_addr[i] = le64_to_cpu(desc->read.buffer_addr[i]); } - trace_e1000e_rx_desc_ps_read((*buff_addr)[0], (*buff_addr)[1], - (*buff_addr)[2], (*buff_addr)[3]); + trace_e1000e_rx_desc_ps_read(buff_addr[0], buff_addr[1], + buff_addr[2], buff_addr[3]); } static inline void -e1000e_read_rx_descr(E1000ECore *core, uint8_t *desc, - hwaddr (*buff_addr)[MAX_PS_BUFFERS]) +e1000e_read_rx_descr(E1000ECore *core, union e1000_rx_desc_union *desc, + hwaddr buff_addr[MAX_PS_BUFFERS]) { if (e1000e_rx_use_legacy_descriptor(core)) { - e1000e_read_lgcy_rx_descr(core, desc, &(*buff_addr)[0]); - (*buff_addr)[1] = (*buff_addr)[2] = (*buff_addr)[3] = 0; + e1000e_read_lgcy_rx_descr(core, &desc->legacy, &buff_addr[0]); + buff_addr[1] = buff_addr[2] = buff_addr[3] = 0; } else { if (core->mac[RCTL] & E1000_RCTL_DTYP_PS) { - e1000e_read_ps_rx_descr(core, desc, buff_addr); + e1000e_read_ps_rx_descr(core, &desc->packet_split, buff_addr); } else { - e1000e_read_ext_rx_descr(core, desc, &(*buff_addr)[0]); - (*buff_addr)[1] = (*buff_addr)[2] = (*buff_addr)[3] = 0; + e1000e_read_ext_rx_descr(core, &desc->extended, &buff_addr[0]); + buff_addr[1] = buff_addr[2] = buff_addr[3] = 0; } } } @@ -1104,7 +1128,7 @@ static void e1000e_verify_csum_in_sw(E1000ECore *core, struct NetRxPkt *pkt, uint32_t *status_flags, - bool istcp, bool isudp) + EthL4HdrProto 
l4hdr_proto) { bool csum_valid; uint32_t csum_error; @@ -1131,14 +1155,10 @@ e1000e_verify_csum_in_sw(E1000ECore *core, } csum_error = csum_valid ? 0 : E1000_RXDEXT_STATERR_TCPE; + *status_flags |= E1000_RXD_STAT_TCPCS | csum_error; - if (istcp) { - *status_flags |= E1000_RXD_STAT_TCPCS | - csum_error; - } else if (isudp) { - *status_flags |= E1000_RXD_STAT_TCPCS | - E1000_RXD_STAT_UDPCS | - csum_error; + if (l4hdr_proto == ETH_L4_HDR_PROTO_UDP) { + *status_flags |= E1000_RXD_STAT_UDPCS; } } @@ -1167,7 +1187,8 @@ e1000e_build_rx_metadata(E1000ECore *core, uint16_t *vlan_tag) { struct virtio_net_hdr *vhdr; - bool isip4, isip6, istcp, isudp; + bool hasip4, hasip6; + EthL4HdrProto l4hdr_proto; uint32_t pkt_type; *status_flags = E1000_RXD_STAT_DD; @@ -1179,8 +1200,8 @@ e1000e_build_rx_metadata(E1000ECore *core, *status_flags |= E1000_RXD_STAT_EOP; - net_rx_pkt_get_protocols(pkt, &isip4, &isip6, &isudp, &istcp); - trace_e1000e_rx_metadata_protocols(isip4, isip6, isudp, istcp); + net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto); + trace_e1000e_rx_metadata_protocols(hasip4, hasip6, l4hdr_proto); /* VLAN state */ if (net_rx_pkt_is_vlan_stripped(pkt)) { @@ -1196,24 +1217,25 @@ e1000e_build_rx_metadata(E1000ECore *core, *mrq = cpu_to_le32(rss_info->type | (rss_info->queue << 8)); trace_e1000e_rx_metadata_rss(*rss, *mrq); } - } else if (isip4) { + } else if (hasip4) { *status_flags |= E1000_RXD_STAT_IPIDV; *ip_id = cpu_to_le16(net_rx_pkt_get_ip_id(pkt)); trace_e1000e_rx_metadata_ip_id(*ip_id); } - if (istcp && e1000e_is_tcp_ack(core, pkt)) { + if (l4hdr_proto == ETH_L4_HDR_PROTO_TCP && e1000e_is_tcp_ack(core, pkt)) { *status_flags |= E1000_RXD_STAT_ACK; trace_e1000e_rx_metadata_ack(); } - if (isip6 && (core->mac[RFCTL] & E1000_RFCTL_IPV6_DIS)) { + if (hasip6 && (core->mac[RFCTL] & E1000_RFCTL_IPV6_DIS)) { trace_e1000e_rx_metadata_ipv6_filtering_disabled(); pkt_type = E1000_RXD_PKT_MAC; - } else if (istcp || isudp) { - pkt_type = isip4 ? 
E1000_RXD_PKT_IP4_XDP : E1000_RXD_PKT_IP6_XDP; - } else if (isip4 || isip6) { - pkt_type = isip4 ? E1000_RXD_PKT_IP4 : E1000_RXD_PKT_IP6; + } else if (l4hdr_proto == ETH_L4_HDR_PROTO_TCP || + l4hdr_proto == ETH_L4_HDR_PROTO_UDP) { + pkt_type = hasip4 ? E1000_RXD_PKT_IP4_XDP : E1000_RXD_PKT_IP6_XDP; + } else if (hasip4 || hasip6) { + pkt_type = hasip4 ? E1000_RXD_PKT_IP4 : E1000_RXD_PKT_IP6; } else { pkt_type = E1000_RXD_PKT_MAC; } @@ -1222,37 +1244,38 @@ e1000e_build_rx_metadata(E1000ECore *core, trace_e1000e_rx_metadata_pkt_type(pkt_type); /* RX CSO information */ - if (isip6 && (core->mac[RFCTL] & E1000_RFCTL_IPV6_XSUM_DIS)) { + if (hasip6 && (core->mac[RFCTL] & E1000_RFCTL_IPV6_XSUM_DIS)) { trace_e1000e_rx_metadata_ipv6_sum_disabled(); goto func_exit; } - if (!net_rx_pkt_has_virt_hdr(pkt)) { - trace_e1000e_rx_metadata_no_virthdr(); - e1000e_verify_csum_in_sw(core, pkt, status_flags, istcp, isudp); - goto func_exit; - } - vhdr = net_rx_pkt_get_vhdr(pkt); if (!(vhdr->flags & VIRTIO_NET_HDR_F_DATA_VALID) && !(vhdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) { trace_e1000e_rx_metadata_virthdr_no_csum_info(); - e1000e_verify_csum_in_sw(core, pkt, status_flags, istcp, isudp); + e1000e_verify_csum_in_sw(core, pkt, status_flags, l4hdr_proto); goto func_exit; } if (e1000e_rx_l3_cso_enabled(core)) { - *status_flags |= isip4 ? E1000_RXD_STAT_IPCS : 0; + *status_flags |= hasip4 ? 
E1000_RXD_STAT_IPCS : 0; } else { trace_e1000e_rx_metadata_l3_cso_disabled(); } if (e1000e_rx_l4_cso_enabled(core)) { - if (istcp) { + switch (l4hdr_proto) { + case ETH_L4_HDR_PROTO_TCP: *status_flags |= E1000_RXD_STAT_TCPCS; - } else if (isudp) { + break; + + case ETH_L4_HDR_PROTO_UDP: *status_flags |= E1000_RXD_STAT_TCPCS | E1000_RXD_STAT_UDPCS; + break; + + default: + break; } } else { trace_e1000e_rx_metadata_l4_cso_disabled(); @@ -1265,7 +1288,7 @@ func_exit: } static inline void -e1000e_write_lgcy_rx_descr(E1000ECore *core, uint8_t *desc, +e1000e_write_lgcy_rx_descr(E1000ECore *core, struct e1000_rx_desc *desc, struct NetRxPkt *pkt, const E1000E_RSSInfo *rss_info, uint16_t length) @@ -1273,71 +1296,66 @@ e1000e_write_lgcy_rx_descr(E1000ECore *core, uint8_t *desc, uint32_t status_flags, rss, mrq; uint16_t ip_id; - struct e1000_rx_desc *d = (struct e1000_rx_desc *) desc; - assert(!rss_info->enabled); - d->length = cpu_to_le16(length); - d->csum = 0; + desc->length = cpu_to_le16(length); + desc->csum = 0; e1000e_build_rx_metadata(core, pkt, pkt != NULL, rss_info, &rss, &mrq, &status_flags, &ip_id, - &d->special); - d->errors = (uint8_t) (le32_to_cpu(status_flags) >> 24); - d->status = (uint8_t) le32_to_cpu(status_flags); + &desc->special); + desc->errors = (uint8_t) (le32_to_cpu(status_flags) >> 24); + desc->status = (uint8_t) le32_to_cpu(status_flags); } static inline void -e1000e_write_ext_rx_descr(E1000ECore *core, uint8_t *desc, +e1000e_write_ext_rx_descr(E1000ECore *core, union e1000_rx_desc_extended *desc, struct NetRxPkt *pkt, const E1000E_RSSInfo *rss_info, uint16_t length) { - union e1000_rx_desc_extended *d = (union e1000_rx_desc_extended *) desc; - - memset(&d->wb, 0, sizeof(d->wb)); + memset(&desc->wb, 0, sizeof(desc->wb)); - d->wb.upper.length = cpu_to_le16(length); + desc->wb.upper.length = cpu_to_le16(length); e1000e_build_rx_metadata(core, pkt, pkt != NULL, rss_info, - &d->wb.lower.hi_dword.rss, - &d->wb.lower.mrq, - &d->wb.upper.status_error, - 
&d->wb.lower.hi_dword.csum_ip.ip_id, - &d->wb.upper.vlan); + &desc->wb.lower.hi_dword.rss, + &desc->wb.lower.mrq, + &desc->wb.upper.status_error, + &desc->wb.lower.hi_dword.csum_ip.ip_id, + &desc->wb.upper.vlan); } static inline void -e1000e_write_ps_rx_descr(E1000ECore *core, uint8_t *desc, +e1000e_write_ps_rx_descr(E1000ECore *core, + union e1000_rx_desc_packet_split *desc, struct NetRxPkt *pkt, const E1000E_RSSInfo *rss_info, size_t ps_hdr_len, uint16_t(*written)[MAX_PS_BUFFERS]) { int i; - union e1000_rx_desc_packet_split *d = - (union e1000_rx_desc_packet_split *) desc; - memset(&d->wb, 0, sizeof(d->wb)); + memset(&desc->wb, 0, sizeof(desc->wb)); - d->wb.middle.length0 = cpu_to_le16((*written)[0]); + desc->wb.middle.length0 = cpu_to_le16((*written)[0]); for (i = 0; i < PS_PAGE_BUFFERS; i++) { - d->wb.upper.length[i] = cpu_to_le16((*written)[i + 1]); + desc->wb.upper.length[i] = cpu_to_le16((*written)[i + 1]); } e1000e_build_rx_metadata(core, pkt, pkt != NULL, rss_info, - &d->wb.lower.hi_dword.rss, - &d->wb.lower.mrq, - &d->wb.middle.status_error, - &d->wb.lower.hi_dword.csum_ip.ip_id, - &d->wb.middle.vlan); + &desc->wb.lower.hi_dword.rss, + &desc->wb.lower.mrq, + &desc->wb.middle.status_error, + &desc->wb.lower.hi_dword.csum_ip.ip_id, + &desc->wb.middle.vlan); - d->wb.upper.header_status = + desc->wb.upper.header_status = cpu_to_le16(ps_hdr_len | (ps_hdr_len ? 
E1000_RXDPS_HDRSTAT_HDRSP : 0)); trace_e1000e_rx_desc_ps_write((*written)[0], (*written)[1], @@ -1345,20 +1363,21 @@ e1000e_write_ps_rx_descr(E1000ECore *core, uint8_t *desc, } static inline void -e1000e_write_rx_descr(E1000ECore *core, uint8_t *desc, +e1000e_write_rx_descr(E1000ECore *core, union e1000_rx_desc_union *desc, struct NetRxPkt *pkt, const E1000E_RSSInfo *rss_info, size_t ps_hdr_len, uint16_t(*written)[MAX_PS_BUFFERS]) { if (e1000e_rx_use_legacy_descriptor(core)) { assert(ps_hdr_len == 0); - e1000e_write_lgcy_rx_descr(core, desc, pkt, rss_info, (*written)[0]); + e1000e_write_lgcy_rx_descr(core, &desc->legacy, pkt, rss_info, + (*written)[0]); } else { if (core->mac[RCTL] & E1000_RCTL_DTYP_PS) { - e1000e_write_ps_rx_descr(core, desc, pkt, rss_info, + e1000e_write_ps_rx_descr(core, &desc->packet_split, pkt, rss_info, ps_hdr_len, written); } else { assert(ps_hdr_len == 0); - e1000e_write_ext_rx_descr(core, desc, pkt, rss_info, + e1000e_write_ext_rx_descr(core, &desc->extended, pkt, rss_info, (*written)[0]); } } @@ -1366,12 +1385,12 @@ struct NetRxPkt *pkt, const E1000E_RSSInfo *rss_info, static inline void e1000e_pci_dma_write_rx_desc(E1000ECore *core, dma_addr_t addr, - uint8_t *desc, dma_addr_t len) + union e1000_rx_desc_union *desc, dma_addr_t len) { PCIDevice *dev = core->owner; if (e1000e_rx_use_legacy_descriptor(core)) { - struct e1000_rx_desc *d = (struct e1000_rx_desc *) desc; + struct e1000_rx_desc *d = &desc->legacy; size_t offset = offsetof(struct e1000_rx_desc, status); uint8_t status = d->status; @@ -1384,8 +1403,7 @@ e1000e_pci_dma_write_rx_desc(E1000ECore *core, dma_addr_t addr, } } else { if (core->mac[RCTL] & E1000_RCTL_DTYP_PS) { - union e1000_rx_desc_packet_split *d = - (union e1000_rx_desc_packet_split *) desc; + union e1000_rx_desc_packet_split *d = &desc->packet_split; size_t offset = offsetof(union e1000_rx_desc_packet_split, wb.middle.status_error); uint32_t status = d->wb.middle.status_error; @@ -1398,8 +1416,7 @@ 
e1000e_pci_dma_write_rx_desc(E1000ECore *core, dma_addr_t addr, pci_dma_write(dev, addr + offset, &status, sizeof(status)); } } else { - union e1000_rx_desc_extended *d = - (union e1000_rx_desc_extended *) desc; + union e1000_rx_desc_extended *d = &desc->extended; size_t offset = offsetof(union e1000_rx_desc_extended, wb.upper.status_error); uint32_t status = d->wb.upper.status_error; @@ -1422,14 +1439,14 @@ typedef struct e1000e_ba_state_st { static inline void e1000e_write_hdr_to_rx_buffers(E1000ECore *core, - hwaddr (*ba)[MAX_PS_BUFFERS], + hwaddr ba[MAX_PS_BUFFERS], e1000e_ba_state *bastate, const char *data, dma_addr_t data_len) { assert(data_len <= core->rxbuf_sizes[0] - bastate->written[0]); - pci_dma_write(core->owner, (*ba)[0] + bastate->written[0], data, data_len); + pci_dma_write(core->owner, ba[0] + bastate->written[0], data, data_len); bastate->written[0] += data_len; bastate->cur_idx = 1; @@ -1437,7 +1454,7 @@ e1000e_write_hdr_to_rx_buffers(E1000ECore *core, static void e1000e_write_to_rx_buffers(E1000ECore *core, - hwaddr (*ba)[MAX_PS_BUFFERS], + hwaddr ba[MAX_PS_BUFFERS], e1000e_ba_state *bastate, const char *data, dma_addr_t data_len) @@ -1449,13 +1466,13 @@ e1000e_write_to_rx_buffers(E1000ECore *core, uint32_t bytes_to_write = MIN(data_len, cur_buf_bytes_left); trace_e1000e_rx_desc_buff_write(bastate->cur_idx, - (*ba)[bastate->cur_idx], + ba[bastate->cur_idx], bastate->written[bastate->cur_idx], data, bytes_to_write); pci_dma_write(core->owner, - (*ba)[bastate->cur_idx] + bastate->written[bastate->cur_idx], + ba[bastate->cur_idx] + bastate->written[bastate->cur_idx], data, bytes_to_write); bastate->written[bastate->cur_idx] += bytes_to_write; @@ -1501,18 +1518,19 @@ e1000e_rx_descr_threshold_hit(E1000ECore *core, const E1000E_RingInfo *rxi) static bool e1000e_do_ps(E1000ECore *core, struct NetRxPkt *pkt, size_t *hdr_len) { - bool isip4, isip6, isudp, istcp; + bool hasip4, hasip6; + EthL4HdrProto l4hdr_proto; bool fragment; if 
(!e1000e_rx_use_ps_descriptor(core)) { return false; } - net_rx_pkt_get_protocols(pkt, &isip4, &isip6, &isudp, &istcp); + net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto); - if (isip4) { + if (hasip4) { fragment = net_rx_pkt_get_ip4_info(pkt)->fragment; - } else if (isip6) { + } else if (hasip6) { fragment = net_rx_pkt_get_ip6_info(pkt)->fragment; } else { return false; @@ -1522,7 +1540,8 @@ e1000e_do_ps(E1000ECore *core, struct NetRxPkt *pkt, size_t *hdr_len) return false; } - if (!fragment && (isudp || istcp)) { + if (l4hdr_proto == ETH_L4_HDR_PROTO_TCP || + l4hdr_proto == ETH_L4_HDR_PROTO_UDP) { *hdr_len = net_rx_pkt_get_l5_hdr_offset(pkt); } else { *hdr_len = net_rx_pkt_get_l4_hdr_offset(pkt); @@ -1543,7 +1562,7 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt, { PCIDevice *d = core->owner; dma_addr_t base; - uint8_t desc[E1000_MAX_RX_DESC_LEN]; + union e1000_rx_desc_union desc; size_t desc_size; size_t desc_offset = 0; size_t iov_ofs = 0; @@ -1579,7 +1598,7 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt, trace_e1000e_rx_descr(rxi->idx, base, core->rx_desc_len); - e1000e_read_rx_descr(core, desc, &ba); + e1000e_read_rx_descr(core, &desc, ba); if (ba[0]) { if (desc_offset < size) { @@ -1598,7 +1617,7 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt, iov_copy = MIN(ps_hdr_len - ps_hdr_copied, iov->iov_len - iov_ofs); - e1000e_write_hdr_to_rx_buffers(core, &ba, &bastate, + e1000e_write_hdr_to_rx_buffers(core, ba, &bastate, iov->iov_base, iov_copy); copy_size -= iov_copy; @@ -1615,7 +1634,7 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt, } else { /* Leave buffer 0 of each descriptor except first */ /* empty as per spec 7.1.5.1 */ - e1000e_write_hdr_to_rx_buffers(core, &ba, &bastate, + e1000e_write_hdr_to_rx_buffers(core, ba, &bastate, NULL, 0); } } @@ -1624,7 +1643,7 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt, while (copy_size) { 
iov_copy = MIN(copy_size, iov->iov_len - iov_ofs); - e1000e_write_to_rx_buffers(core, &ba, &bastate, + e1000e_write_to_rx_buffers(core, ba, &bastate, iov->iov_base + iov_ofs, iov_copy); copy_size -= iov_copy; @@ -1637,7 +1656,7 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt, if (desc_offset + desc_size >= total_size) { /* Simulate FCS checksum presence in the last descriptor */ - e1000e_write_to_rx_buffers(core, &ba, &bastate, + e1000e_write_to_rx_buffers(core, ba, &bastate, (const char *) &fcs_pad, e1000x_fcs_len(core->mac)); } } @@ -1649,9 +1668,9 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt, is_last = true; } - e1000e_write_rx_descr(core, desc, is_last ? core->rx_pkt : NULL, + e1000e_write_rx_descr(core, &desc, is_last ? core->rx_pkt : NULL, rss_info, do_ps ? ps_hdr_len : 0, &bastate.written); - e1000e_pci_dma_write_rx_desc(core, base, desc, core->rx_desc_len); + e1000e_pci_dma_write_rx_desc(core, base, &desc, core->rx_desc_len); e1000e_ring_advance(core, rxi, core->rx_desc_len / E1000_MIN_RX_DESC_LEN); @@ -1664,25 +1683,27 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt, static inline void e1000e_rx_fix_l4_csum(E1000ECore *core, struct NetRxPkt *pkt) { - if (net_rx_pkt_has_virt_hdr(pkt)) { - struct virtio_net_hdr *vhdr = net_rx_pkt_get_vhdr(pkt); + struct virtio_net_hdr *vhdr = net_rx_pkt_get_vhdr(pkt); - if (vhdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { - net_rx_pkt_fix_l4_csum(pkt); - } + if (vhdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { + net_rx_pkt_fix_l4_csum(pkt); } } -/* Min. 
octets in an ethernet frame sans FCS */ -#define MIN_BUF_SIZE 60 - ssize_t e1000e_receive_iov(E1000ECore *core, const struct iovec *iov, int iovcnt) { - static const int maximum_ethernet_hdr_len = (14 + 4); + return e1000e_receive_internal(core, iov, iovcnt, core->has_vnet); +} + +static ssize_t +e1000e_receive_internal(E1000ECore *core, const struct iovec *iov, int iovcnt, + bool has_vnet) +{ + static const int maximum_ethernet_hdr_len = (ETH_HLEN + 4); uint32_t n = 0; - uint8_t min_buf[MIN_BUF_SIZE]; + uint8_t min_buf[ETH_ZLEN]; struct iovec min_iov; uint8_t *filter_buf; size_t size, orig_size; @@ -1700,9 +1721,11 @@ e1000e_receive_iov(E1000ECore *core, const struct iovec *iov, int iovcnt) } /* Pull virtio header in */ - if (core->has_vnet) { + if (has_vnet) { net_rx_pkt_set_vhdr_iovec(core->rx_pkt, iov, iovcnt); iov_ofs = sizeof(struct virtio_net_hdr); + } else { + net_rx_pkt_unset_vhdr(core->rx_pkt); } filter_buf = iov->iov_base + iov_ofs; @@ -1744,8 +1767,6 @@ e1000e_receive_iov(E1000ECore *core, const struct iovec *iov, int iovcnt) e1000e_rss_parse_packet(core, core->rx_pkt, &rss_info); e1000e_rx_ring_init(core, &rxr, rss_info.queue); - trace_e1000e_rx_rss_dispatched_to_queue(rxr.i->idx); - total_size = net_rx_pkt_get_total_len(core->rx_pkt) + e1000x_fcs_len(core->mac); @@ -1771,12 +1792,12 @@ e1000e_receive_iov(E1000ECore *core, const struct iovec *iov, int iovcnt) rdmts_hit = e1000e_rx_descr_threshold_hit(core, rxr.i); n |= e1000e_rx_wb_interrupt_cause(core, rxr.i->idx, rdmts_hit); - trace_e1000e_rx_written_to_guest(n); + trace_e1000e_rx_written_to_guest(rxr.i->idx); } else { n |= E1000_ICS_RXO; retval = 0; - trace_e1000e_rx_not_written_to_guest(n); + trace_e1000e_rx_not_written_to_guest(rxr.i->idx); } if (!e1000e_intrmgr_delay_rx_causes(core, &n)) { @@ -1792,13 +1813,13 @@ e1000e_receive_iov(E1000ECore *core, const struct iovec *iov, int iovcnt) static inline bool e1000e_have_autoneg(E1000ECore *core) { - return core->phy[0][PHY_CTRL] & MII_CR_AUTO_NEG_EN; 
+ return core->phy[0][MII_BMCR] & MII_BMCR_AUTOEN; } static void e1000e_update_flowctl_status(E1000ECore *core) { if (e1000e_have_autoneg(core) && - core->phy[0][PHY_STATUS] & MII_SR_AUTONEG_COMPLETE) { + core->phy[0][MII_BMSR] & MII_BMSR_AN_COMP) { trace_e1000e_link_autoneg_flowctl(true); core->mac[CTRL] |= E1000_CTRL_TFCE | E1000_CTRL_RFCE; } else { @@ -1816,12 +1837,12 @@ e1000e_link_down(E1000ECore *core) static inline void e1000e_set_phy_ctrl(E1000ECore *core, int index, uint16_t val) { - /* bits 0-5 reserved; MII_CR_[RESTART_AUTO_NEG,RESET] are self clearing */ - core->phy[0][PHY_CTRL] = val & ~(0x3f | - MII_CR_RESET | - MII_CR_RESTART_AUTO_NEG); + /* bits 0-5 reserved; MII_BMCR_[ANRESTART,RESET] are self clearing */ + core->phy[0][MII_BMCR] = val & ~(0x3f | + MII_BMCR_RESET | + MII_BMCR_ANRESTART); - if ((val & MII_CR_RESTART_AUTO_NEG) && + if ((val & MII_BMCR_ANRESTART) && e1000e_have_autoneg(core)) { e1000x_restart_autoneg(core->mac, core->phy[0], core->autoneg_timer); } @@ -1855,7 +1876,7 @@ e1000e_core_set_link_status(E1000ECore *core) e1000x_update_regs_on_link_down(core->mac, core->phy[0]); } else { if (e1000e_have_autoneg(core) && - !(core->phy[0][PHY_STATUS] & MII_SR_AUTONEG_COMPLETE)) { + !(core->phy[0][MII_BMSR] & MII_BMSR_AN_COMP)) { e1000x_restart_autoneg(core->mac, core->phy[0], core->autoneg_timer); } else { @@ -1888,7 +1909,7 @@ e1000e_set_ctrl(E1000ECore *core, int index, uint32_t val) if (val & E1000_CTRL_RST) { trace_e1000e_core_ctrl_sw_reset(); - e1000x_reset_mac_addr(core->owner_nic, core->mac, core->permanent_mac); + e1000e_reset(core, true); } if (val & E1000_CTRL_PHY_RST) { @@ -1997,7 +2018,7 @@ static void(*e1000e_phyreg_writeops[E1000E_PHY_PAGES][E1000E_PHY_PAGE_SIZE]) (E1000ECore *, int, uint16_t) = { [0] = { - [PHY_CTRL] = e1000e_set_phy_ctrl, + [MII_BMCR] = e1000e_set_phy_ctrl, [PHY_PAGE] = e1000e_set_phy_page, [PHY_OEM_BITS] = e1000e_set_phy_oem_bits } @@ -2011,13 +2032,11 @@ e1000e_clear_ims_bits(E1000ECore *core, uint32_t bits) 
} static inline bool -e1000e_postpone_interrupt(bool *interrupt_pending, - E1000IntrDelayTimer *timer) +e1000e_postpone_interrupt(E1000IntrDelayTimer *timer) { if (timer->running) { trace_e1000e_irq_postponed_by_xitr(timer->delay_reg << 2); - *interrupt_pending = true; return true; } @@ -2031,14 +2050,13 @@ e1000e_postpone_interrupt(bool *interrupt_pending, static inline bool e1000e_itr_should_postpone(E1000ECore *core) { - return e1000e_postpone_interrupt(&core->itr_intr_pending, &core->itr); + return e1000e_postpone_interrupt(&core->itr); } static inline bool e1000e_eitr_should_postpone(E1000ECore *core, int idx) { - return e1000e_postpone_interrupt(&core->eitr_intr_pending[idx], - &core->eitr[idx]); + return e1000e_postpone_interrupt(&core->eitr[idx]); } static void @@ -2269,19 +2287,19 @@ e1000e_get_reg_index_with_offset(const uint16_t *mac_reg_access, hwaddr addr) static const char e1000e_phy_regcap[E1000E_PHY_PAGES][0x20] = { [0] = { - [PHY_CTRL] = PHY_ANYPAGE | PHY_RW, - [PHY_STATUS] = PHY_ANYPAGE | PHY_R, - [PHY_ID1] = PHY_ANYPAGE | PHY_R, - [PHY_ID2] = PHY_ANYPAGE | PHY_R, - [PHY_AUTONEG_ADV] = PHY_ANYPAGE | PHY_RW, - [PHY_LP_ABILITY] = PHY_ANYPAGE | PHY_R, - [PHY_AUTONEG_EXP] = PHY_ANYPAGE | PHY_R, - [PHY_NEXT_PAGE_TX] = PHY_ANYPAGE | PHY_RW, - [PHY_LP_NEXT_PAGE] = PHY_ANYPAGE | PHY_R, - [PHY_1000T_CTRL] = PHY_ANYPAGE | PHY_RW, - [PHY_1000T_STATUS] = PHY_ANYPAGE | PHY_R, - [PHY_EXT_STATUS] = PHY_ANYPAGE | PHY_R, - [PHY_PAGE] = PHY_ANYPAGE | PHY_RW, + [MII_BMCR] = PHY_ANYPAGE | PHY_RW, + [MII_BMSR] = PHY_ANYPAGE | PHY_R, + [MII_PHYID1] = PHY_ANYPAGE | PHY_R, + [MII_PHYID2] = PHY_ANYPAGE | PHY_R, + [MII_ANAR] = PHY_ANYPAGE | PHY_RW, + [MII_ANLPAR] = PHY_ANYPAGE | PHY_R, + [MII_ANER] = PHY_ANYPAGE | PHY_R, + [MII_ANNP] = PHY_ANYPAGE | PHY_RW, + [MII_ANLPRNP] = PHY_ANYPAGE | PHY_R, + [MII_CTRL1000] = PHY_ANYPAGE | PHY_RW, + [MII_STAT1000] = PHY_ANYPAGE | PHY_R, + [MII_EXTSTAT] = PHY_ANYPAGE | PHY_R, + [PHY_PAGE] = PHY_ANYPAGE | PHY_RW, [PHY_COPPER_CTRL1] = 
PHY_RW, [PHY_COPPER_STAT1] = PHY_R, @@ -2434,17 +2452,19 @@ e1000e_set_fcrtl(E1000ECore *core, int index, uint32_t val) core->mac[FCRTL] = val & 0x8000FFF8; } -static inline void -e1000e_set_16bit(E1000ECore *core, int index, uint32_t val) -{ - core->mac[index] = val & 0xffff; -} +#define E1000E_LOW_BITS_SET_FUNC(num) \ + static void \ + e1000e_set_##num##bit(E1000ECore *core, int index, uint32_t val) \ + { \ + core->mac[index] = val & (BIT(num) - 1); \ + } -static void -e1000e_set_12bit(E1000ECore *core, int index, uint32_t val) -{ - core->mac[index] = val & 0xfff; -} +E1000E_LOW_BITS_SET_FUNC(4) +E1000E_LOW_BITS_SET_FUNC(6) +E1000E_LOW_BITS_SET_FUNC(11) +E1000E_LOW_BITS_SET_FUNC(12) +E1000E_LOW_BITS_SET_FUNC(13) +E1000E_LOW_BITS_SET_FUNC(16) static void e1000e_set_vet(E1000ECore *core, int index, uint32_t val) @@ -2515,7 +2535,8 @@ e1000e_set_icr(E1000ECore *core, int index, uint32_t val) } icr = core->mac[ICR] & ~val; - /* Windows driver expects that the "receive overrun" bit and other + /* + * Windows driver expects that the "receive overrun" bit and other * ones to be cleared when the "Other" bit (#24) is cleared. */ icr = (val & E1000_ICR_OTHER) ? 
(icr & ~E1000_ICR_OTHER_CAUSES) : icr; @@ -2614,27 +2635,11 @@ e1000e_mac_ims_read(E1000ECore *core, int index) return core->mac[IMS]; } -#define E1000E_LOW_BITS_READ_FUNC(num) \ - static uint32_t \ - e1000e_mac_low##num##_read(E1000ECore *core, int index) \ - { \ - return core->mac[index] & (BIT(num) - 1); \ - } \ - -#define E1000E_LOW_BITS_READ(num) \ - e1000e_mac_low##num##_read - -E1000E_LOW_BITS_READ_FUNC(4); -E1000E_LOW_BITS_READ_FUNC(6); -E1000E_LOW_BITS_READ_FUNC(11); -E1000E_LOW_BITS_READ_FUNC(13); -E1000E_LOW_BITS_READ_FUNC(16); - static uint32_t e1000e_mac_swsm_read(E1000ECore *core, int index) { uint32_t val = core->mac[SWSM]; - core->mac[SWSM] = val | 1; + core->mac[SWSM] = val | E1000_SWSM_SMBI; return val; } @@ -2908,6 +2913,35 @@ e1000e_set_gcr(E1000ECore *core, int index, uint32_t val) core->mac[GCR] = (val & ~E1000_GCR_RO_BITS) | ro_bits; } +static uint32_t e1000e_get_systiml(E1000ECore *core, int index) +{ + e1000x_timestamp(core->mac, core->timadj, SYSTIML, SYSTIMH); + return core->mac[SYSTIML]; +} + +static uint32_t e1000e_get_rxsatrh(E1000ECore *core, int index) +{ + core->mac[TSYNCRXCTL] &= ~E1000_TSYNCRXCTL_VALID; + return core->mac[RXSATRH]; +} + +static uint32_t e1000e_get_txstmph(E1000ECore *core, int index) +{ + core->mac[TSYNCTXCTL] &= ~E1000_TSYNCTXCTL_VALID; + return core->mac[TXSTMPH]; +} + +static void e1000e_set_timinca(E1000ECore *core, int index, uint32_t val) +{ + e1000x_set_timinca(core->mac, &core->timadj, val); +} + +static void e1000e_set_timadjh(E1000ECore *core, int index, uint32_t val) +{ + core->mac[TIMADJH] = val; + core->timadj += core->mac[TIMADJL] | ((int64_t)core->mac[TIMADJH] << 32); +} + #define e1000e_getreg(x) [x] = e1000e_mac_readreg typedef uint32_t (*readops)(E1000ECore *, int); static const readops e1000e_macreg_readops[] = { @@ -2923,7 +2957,19 @@ static const readops e1000e_macreg_readops[] = { e1000e_getreg(LATECOL), e1000e_getreg(SEQEC), e1000e_getreg(XONTXC), + e1000e_getreg(AIT), + e1000e_getreg(TDFH), 
+ e1000e_getreg(TDFT), + e1000e_getreg(TDFHS), + e1000e_getreg(TDFTS), + e1000e_getreg(TDFPC), e1000e_getreg(WUS), + e1000e_getreg(PBS), + e1000e_getreg(RDFH), + e1000e_getreg(RDFT), + e1000e_getreg(RDFHS), + e1000e_getreg(RDFTS), + e1000e_getreg(RDFPC), e1000e_getreg(GORCL), e1000e_getreg(MGTPRC), e1000e_getreg(EERD), @@ -2951,7 +2997,6 @@ static const readops e1000e_macreg_readops[] = { e1000e_getreg(GSCL_2), e1000e_getreg(RDBAH1), e1000e_getreg(FLSWDATA), - e1000e_getreg(RXSATRH), e1000e_getreg(TIPG), e1000e_getreg(FLMNGCTL), e1000e_getreg(FLMNGCNT), @@ -2992,7 +3037,6 @@ static const readops e1000e_macreg_readops[] = { e1000e_getreg(FLSWCTL), e1000e_getreg(RXDCTL1), e1000e_getreg(RXSATRL), - e1000e_getreg(SYSTIML), e1000e_getreg(RXUDP), e1000e_getreg(TORL), e1000e_getreg(TDLEN1), @@ -3032,7 +3076,6 @@ static const readops e1000e_macreg_readops[] = { e1000e_getreg(FLOL), e1000e_getreg(RXDCTL), e1000e_getreg(RXSTMPL), - e1000e_getreg(TXSTMPH), e1000e_getreg(TIMADJH), e1000e_getreg(FCRTL), e1000e_getreg(TDBAH), @@ -3059,16 +3102,9 @@ static const readops e1000e_macreg_readops[] = { [MPTC] = e1000e_mac_read_clr4, [IAC] = e1000e_mac_read_clr4, [ICR] = e1000e_mac_icr_read, - [RDFH] = E1000E_LOW_BITS_READ(13), - [RDFHS] = E1000E_LOW_BITS_READ(13), - [RDFPC] = E1000E_LOW_BITS_READ(13), - [TDFH] = E1000E_LOW_BITS_READ(13), - [TDFHS] = E1000E_LOW_BITS_READ(13), [STATUS] = e1000e_get_status, [TARC0] = e1000e_get_tarc, - [PBS] = E1000E_LOW_BITS_READ(6), [ICS] = e1000e_mac_ics_read, - [AIT] = E1000E_LOW_BITS_READ(16), [TORH] = e1000e_mac_read_clr8, [GORCH] = e1000e_mac_read_clr8, [PRC127] = e1000e_mac_read_clr4, @@ -3084,27 +3120,25 @@ static const readops e1000e_macreg_readops[] = { [BPTC] = e1000e_mac_read_clr4, [TSCTC] = e1000e_mac_read_clr4, [ITR] = e1000e_mac_itr_read, - [RDFT] = E1000E_LOW_BITS_READ(13), - [RDFTS] = E1000E_LOW_BITS_READ(13), - [TDFPC] = E1000E_LOW_BITS_READ(13), - [TDFT] = E1000E_LOW_BITS_READ(13), - [TDFTS] = E1000E_LOW_BITS_READ(13), [CTRL] = 
e1000e_get_ctrl, [TARC1] = e1000e_get_tarc, [SWSM] = e1000e_mac_swsm_read, [IMS] = e1000e_mac_ims_read, + [SYSTIML] = e1000e_get_systiml, + [RXSATRH] = e1000e_get_rxsatrh, + [TXSTMPH] = e1000e_get_txstmph, [CRCERRS ... MPC] = e1000e_mac_readreg, [IP6AT ... IP6AT + 3] = e1000e_mac_readreg, [IP4AT ... IP4AT + 6] = e1000e_mac_readreg, [RA ... RA + 31] = e1000e_mac_readreg, [WUPM ... WUPM + 31] = e1000e_mac_readreg, - [MTA ... MTA + 127] = e1000e_mac_readreg, - [VFTA ... VFTA + 127] = e1000e_mac_readreg, - [FFMT ... FFMT + 254] = E1000E_LOW_BITS_READ(4), + [MTA ... MTA + E1000_MC_TBL_SIZE - 1] = e1000e_mac_readreg, + [VFTA ... VFTA + E1000_VLAN_FILTER_TBL_SIZE - 1] = e1000e_mac_readreg, + [FFMT ... FFMT + 254] = e1000e_mac_readreg, [FFVT ... FFVT + 254] = e1000e_mac_readreg, [MDEF ... MDEF + 7] = e1000e_mac_readreg, - [FFLT ... FFLT + 10] = E1000E_LOW_BITS_READ(11), + [FFLT ... FFLT + 10] = e1000e_mac_readreg, [FTFT ... FTFT + 254] = e1000e_mac_readreg, [PBM ... PBM + 10239] = e1000e_mac_readreg, [RETA ... 
RETA + 31] = e1000e_mac_readreg, @@ -3127,22 +3161,10 @@ static const writeops e1000e_macreg_writeops[] = { e1000e_putreg(LEDCTL), e1000e_putreg(FCAL), e1000e_putreg(FCRUC), - e1000e_putreg(AIT), - e1000e_putreg(TDFH), - e1000e_putreg(TDFT), - e1000e_putreg(TDFHS), - e1000e_putreg(TDFTS), - e1000e_putreg(TDFPC), e1000e_putreg(WUC), e1000e_putreg(WUS), - e1000e_putreg(RDFH), - e1000e_putreg(RDFT), - e1000e_putreg(RDFHS), - e1000e_putreg(RDFTS), - e1000e_putreg(RDFPC), e1000e_putreg(IPAV), e1000e_putreg(TDBAH1), - e1000e_putreg(TIMINCA), e1000e_putreg(IAM), e1000e_putreg(EIAC), e1000e_putreg(IVAR), @@ -3150,7 +3172,6 @@ static const writeops e1000e_macreg_writeops[] = { e1000e_putreg(TARC1), e1000e_putreg(FLSWDATA), e1000e_putreg(POEMB), - e1000e_putreg(PBS), e1000e_putreg(MFUTP01), e1000e_putreg(MFUTP23), e1000e_putreg(MANC), @@ -3186,7 +3207,6 @@ static const writeops e1000e_macreg_writeops[] = { e1000e_putreg(SYSTIML), e1000e_putreg(SYSTIMH), e1000e_putreg(TIMADJL), - e1000e_putreg(TIMADJH), e1000e_putreg(RXUDP), e1000e_putreg(RXCFGL), e1000e_putreg(TSYNCRXCTL), @@ -3215,6 +3235,18 @@ static const writeops e1000e_macreg_writeops[] = { [TADV] = e1000e_set_16bit, [ITR] = e1000e_set_itr, [EERD] = e1000e_set_eerd, + [AIT] = e1000e_set_16bit, + [TDFH] = e1000e_set_13bit, + [TDFT] = e1000e_set_13bit, + [TDFHS] = e1000e_set_13bit, + [TDFTS] = e1000e_set_13bit, + [TDFPC] = e1000e_set_13bit, + [RDFH] = e1000e_set_13bit, + [RDFHS] = e1000e_set_13bit, + [RDFT] = e1000e_set_13bit, + [RDFTS] = e1000e_set_13bit, + [RDFPC] = e1000e_set_13bit, + [PBS] = e1000e_set_6bit, [GCR] = e1000e_set_gcr, [PSRCTL] = e1000e_set_psrctl, [RXCSUM] = e1000e_set_rxcsum, @@ -3247,18 +3279,20 @@ static const writeops e1000e_macreg_writeops[] = { [CTRL_DUP] = e1000e_set_ctrl, [RFCTL] = e1000e_set_rfctl, [RA + 1] = e1000e_mac_setmacaddr, + [TIMINCA] = e1000e_set_timinca, + [TIMADJH] = e1000e_set_timadjh, [IP6AT ... IP6AT + 3] = e1000e_mac_writereg, [IP4AT ... 
IP4AT + 6] = e1000e_mac_writereg, [RA + 2 ... RA + 31] = e1000e_mac_writereg, [WUPM ... WUPM + 31] = e1000e_mac_writereg, - [MTA ... MTA + 127] = e1000e_mac_writereg, - [VFTA ... VFTA + 127] = e1000e_mac_writereg, - [FFMT ... FFMT + 254] = e1000e_mac_writereg, + [MTA ... MTA + E1000_MC_TBL_SIZE - 1] = e1000e_mac_writereg, + [VFTA ... VFTA + E1000_VLAN_FILTER_TBL_SIZE - 1] = e1000e_mac_writereg, + [FFMT ... FFMT + 254] = e1000e_set_4bit, [FFVT ... FFVT + 254] = e1000e_mac_writereg, [PBM ... PBM + 10239] = e1000e_mac_writereg, [MDEF ... MDEF + 7] = e1000e_mac_writereg, - [FFLT ... FFLT + 10] = e1000e_mac_writereg, + [FFLT ... FFLT + 10] = e1000e_set_11bit, [FTFT ... FTFT + 254] = e1000e_mac_writereg, [RETA ... RETA + 31] = e1000e_mac_writereg, [RSSRK ... RSSRK + 31] = e1000e_mac_writereg, @@ -3269,10 +3303,12 @@ enum { E1000E_NWRITEOPS = ARRAY_SIZE(e1000e_macreg_writeops) }; enum { MAC_ACCESS_PARTIAL = 1 }; -/* The array below combines alias offsets of the index values for the +/* + * The array below combines alias offsets of the index values for the * MAC registers that have aliases, with the indication of not fully * implemented registers (lowest bit). This combination is possible - * because all of the offsets are even. */ + * because all of the offsets are even. + */ static const uint16_t mac_reg_access[E1000E_MAC_SIZE] = { /* Alias index offsets */ [FCRTL_A] = 0x07fe, [FCRTH_A] = 0x0802, @@ -3281,7 +3317,7 @@ static const uint16_t mac_reg_access[E1000E_MAC_SIZE] = { [TDH_A] = 0x0cf8, [TDT_A] = 0x0cf8, [TIDV_A] = 0x0cf8, [TDFH_A] = 0xed00, [TDFT_A] = 0xed00, [RA_A ... RA_A + 31] = 0x14f0, - [VFTA_A ... VFTA_A + 127] = 0x1400, + [VFTA_A ... VFTA_A + E1000_VLAN_FILTER_TBL_SIZE - 1] = 0x1400, [RDBAL0_A ... RDLEN0_A] = 0x09bc, [TDBAL_A ... 
TDLEN_A] = 0x0cf8, /* Access options */ @@ -3347,7 +3383,7 @@ static void e1000e_autoneg_resume(E1000ECore *core) { if (e1000e_have_autoneg(core) && - !(core->phy[0][PHY_STATUS] & MII_SR_AUTONEG_COMPLETE)) { + !(core->phy[0][MII_BMSR] & MII_BMSR_AN_COMP)) { qemu_get_queue(core->owner_nic)->link_down = false; timer_mod(core->autoneg_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + 500); @@ -3386,11 +3422,10 @@ e1000e_core_pci_realize(E1000ECore *core, qemu_add_vm_change_state_handler(e1000e_vm_state_change, core); for (i = 0; i < E1000E_NUM_QUEUES; i++) { - net_tx_pkt_init(&core->tx[i].tx_pkt, core->owner, - E1000E_MAX_TX_FRAGS, core->has_vnet); + net_tx_pkt_init(&core->tx[i].tx_pkt, core->owner, E1000E_MAX_TX_FRAGS); } - net_rx_pkt_init(&core->rx_pkt, core->has_vnet); + net_rx_pkt_init(&core->rx_pkt); e1000x_core_prepare_eeprom(core->eeprom, eeprom_templ, @@ -3422,29 +3457,36 @@ e1000e_core_pci_uninit(E1000ECore *core) static const uint16_t e1000e_phy_reg_init[E1000E_PHY_PAGES][E1000E_PHY_PAGE_SIZE] = { [0] = { - [PHY_CTRL] = MII_CR_SPEED_SELECT_MSB | - MII_CR_FULL_DUPLEX | - MII_CR_AUTO_NEG_EN, - - [PHY_STATUS] = MII_SR_EXTENDED_CAPS | - MII_SR_LINK_STATUS | - MII_SR_AUTONEG_CAPS | - MII_SR_PREAMBLE_SUPPRESS | - MII_SR_EXTENDED_STATUS | - MII_SR_10T_HD_CAPS | - MII_SR_10T_FD_CAPS | - MII_SR_100X_HD_CAPS | - MII_SR_100X_FD_CAPS, - - [PHY_ID1] = 0x141, - [PHY_ID2] = E1000_PHY_ID2_82574x, - [PHY_AUTONEG_ADV] = 0xde1, - [PHY_LP_ABILITY] = 0x7e0, - [PHY_AUTONEG_EXP] = BIT(2), - [PHY_NEXT_PAGE_TX] = BIT(0) | BIT(13), - [PHY_1000T_CTRL] = BIT(8) | BIT(9) | BIT(10) | BIT(11), - [PHY_1000T_STATUS] = 0x3c00, - [PHY_EXT_STATUS] = BIT(12) | BIT(13), + [MII_BMCR] = MII_BMCR_SPEED1000 | + MII_BMCR_FD | + MII_BMCR_AUTOEN, + + [MII_BMSR] = MII_BMSR_EXTCAP | + MII_BMSR_LINK_ST | + MII_BMSR_AUTONEG | + MII_BMSR_MFPS | + MII_BMSR_EXTSTAT | + MII_BMSR_10T_HD | + MII_BMSR_10T_FD | + MII_BMSR_100TX_HD | + MII_BMSR_100TX_FD, + + [MII_PHYID1] = 0x141, + [MII_PHYID2] = E1000_PHY_ID2_82574x, 
+ [MII_ANAR] = MII_ANAR_CSMACD | MII_ANAR_10 | + MII_ANAR_10FD | MII_ANAR_TX | + MII_ANAR_TXFD | MII_ANAR_PAUSE | + MII_ANAR_PAUSE_ASYM, + [MII_ANLPAR] = MII_ANLPAR_10 | MII_ANLPAR_10FD | + MII_ANLPAR_TX | MII_ANLPAR_TXFD | + MII_ANLPAR_T4 | MII_ANLPAR_PAUSE, + [MII_ANER] = MII_ANER_NP | MII_ANER_NWAY, + [MII_ANNP] = 1 | MII_ANNP_MP, + [MII_CTRL1000] = MII_CTRL1000_HALF | MII_CTRL1000_FULL | + MII_CTRL1000_PORT | MII_CTRL1000_MASTER, + [MII_STAT1000] = MII_STAT1000_HALF | MII_STAT1000_FULL | + MII_STAT1000_ROK | MII_STAT1000_LOK, + [MII_EXTSTAT] = MII_EXTSTAT_1000T_HD | MII_EXTSTAT_1000T_FD, [PHY_COPPER_CTRL1] = BIT(5) | BIT(6) | BIT(8) | BIT(9) | BIT(12) | BIT(13), @@ -3501,8 +3543,7 @@ static const uint32_t e1000e_mac_reg_init[] = { [EITR...EITR + E1000E_MSIX_VEC_NUM - 1] = E1000E_MIN_XITR, }; -void -e1000e_core_reset(E1000ECore *core) +static void e1000e_reset(E1000ECore *core, bool sw) { int i; @@ -3511,9 +3552,16 @@ e1000e_core_reset(E1000ECore *core) e1000e_intrmgr_reset(core); memset(core->phy, 0, sizeof core->phy); - memmove(core->phy, e1000e_phy_reg_init, sizeof e1000e_phy_reg_init); - memset(core->mac, 0, sizeof core->mac); - memmove(core->mac, e1000e_mac_reg_init, sizeof e1000e_mac_reg_init); + memcpy(core->phy, e1000e_phy_reg_init, sizeof e1000e_phy_reg_init); + + for (i = 0; i < E1000E_MAC_SIZE; i++) { + if (sw && (i == PBA || i == PBS || i == FLA)) { + continue; + } + + core->mac[i] = i < ARRAY_SIZE(e1000e_mac_reg_init) ? + e1000e_mac_reg_init[i] : 0; + } core->rxbuf_min_shift = 1 + E1000_RING_DESC_LEN_SHIFT; @@ -3530,18 +3578,24 @@ e1000e_core_reset(E1000ECore *core) } } +void +e1000e_core_reset(E1000ECore *core) +{ + e1000e_reset(core, false); +} + void e1000e_core_pre_save(E1000ECore *core) { int i; NetClientState *nc = qemu_get_queue(core->owner_nic); /* - * If link is down and auto-negotiation is supported and ongoing, - * complete auto-negotiation immediately. This allows us to look - * at MII_SR_AUTONEG_COMPLETE to infer link status on load. 
- */ + * If link is down and auto-negotiation is supported and ongoing, + * complete auto-negotiation immediately. This allows us to look + * at MII_BMSR_AN_COMP to infer link status on load. + */ if (nc->link_down && e1000e_have_autoneg(core)) { - core->phy[0][PHY_STATUS] |= MII_SR_AUTONEG_COMPLETE; + core->phy[0][MII_BMSR] |= MII_BMSR_AN_COMP; e1000e_update_flowctl_status(core); } @@ -3557,7 +3611,8 @@ e1000e_core_post_load(E1000ECore *core) { NetClientState *nc = qemu_get_queue(core->owner_nic); - /* nc.link_down can't be migrated, so infer link_down according + /* + * nc.link_down can't be migrated, so infer link_down according * to link status bit in core.mac[STATUS]. */ nc->link_down = (core->mac[STATUS] & E1000_STATUS_LU) == 0; diff --git a/hw/net/e1000e_core.h b/hw/net/e1000e_core.h index 4ddb4d2c39..213a70530d 100644 --- a/hw/net/e1000e_core.h +++ b/hw/net/e1000e_core.h @@ -1,37 +1,37 @@ /* -* Core code for QEMU e1000e emulation -* -* Software developer's manuals: -* http://www.intel.com/content/dam/doc/datasheet/82574l-gbe-controller-datasheet.pdf -* -* Copyright (c) 2015 Ravello Systems LTD (http://ravellosystems.com) -* Developed by Daynix Computing LTD (http://www.daynix.com) -* -* Authors: -* Dmitry Fleytman <dmitry@daynix.com> -* Leonid Bloch <leonid@daynix.com> -* Yan Vugenfirer <yan@daynix.com> -* -* Based on work done by: -* Nir Peleg, Tutis Systems Ltd. for Qumranet Inc. -* Copyright (c) 2008 Qumranet -* Based on work done by: -* Copyright (c) 2007 Dan Aloni -* Copyright (c) 2004 Antony T Curtis -* -* This library is free software; you can redistribute it and/or -* modify it under the terms of the GNU Lesser General Public -* License as published by the Free Software Foundation; either -* version 2.1 of the License, or (at your option) any later version. -* -* This library is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU -* Lesser General Public License for more details. -* -* You should have received a copy of the GNU Lesser General Public -* License along with this library; if not, see <http://www.gnu.org/licenses/>. -*/ + * Core code for QEMU e1000e emulation + * + * Software developer's manuals: + * http://www.intel.com/content/dam/doc/datasheet/82574l-gbe-controller-datasheet.pdf + * + * Copyright (c) 2015 Ravello Systems LTD (http://ravellosystems.com) + * Developed by Daynix Computing LTD (http://www.daynix.com) + * + * Authors: + * Dmitry Fleytman <dmitry@daynix.com> + * Leonid Bloch <leonid@daynix.com> + * Yan Vugenfirer <yan@daynix.com> + * + * Based on work done by: + * Nir Peleg, Tutis Systems Ltd. for Qumranet Inc. + * Copyright (c) 2008 Qumranet + * Based on work done by: + * Copyright (c) 2007 Dan Aloni + * Copyright (c) 2004 Antony T Curtis + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. 
+ */ #ifndef HW_NET_E1000E_CORE_H #define HW_NET_E1000E_CORE_H @@ -95,10 +95,8 @@ struct E1000Core { E1000IntrDelayTimer tidv; E1000IntrDelayTimer itr; - bool itr_intr_pending; E1000IntrDelayTimer eitr[E1000E_MSIX_VEC_NUM]; - bool eitr_intr_pending[E1000E_MSIX_VEC_NUM]; VMChangeStateEntry *vmstate; @@ -114,6 +112,8 @@ struct E1000Core { void (*owner_start_recv)(PCIDevice *d); uint32_t msi_causes_pending; + + int64_t timadj; }; void diff --git a/hw/net/e1000x_common.c b/hw/net/e1000x_common.c index 2f43e8cd13..b844af590a 100644 --- a/hw/net/e1000x_common.c +++ b/hw/net/e1000x_common.c @@ -24,9 +24,12 @@ #include "qemu/osdep.h" #include "qemu/units.h" +#include "hw/net/mii.h" #include "hw/pci/pci_device.h" +#include "net/eth.h" #include "net/net.h" +#include "e1000_common.h" #include "e1000x_common.h" #include "trace.h" @@ -45,9 +48,9 @@ bool e1000x_rx_ready(PCIDevice *d, uint32_t *mac) return true; } -bool e1000x_is_vlan_packet(const uint8_t *buf, uint16_t vet) +bool e1000x_is_vlan_packet(const void *buf, uint16_t vet) { - uint16_t eth_proto = lduw_be_p(buf + 12); + uint16_t eth_proto = lduw_be_p(&PKT_GET_ETH_HDR(buf)->h_proto); bool res = (eth_proto == vet); trace_e1000x_vlan_is_vlan_pkt(res, eth_proto, vet); @@ -66,7 +69,7 @@ bool e1000x_rx_group_filter(uint32_t *mac, const uint8_t *buf) } ra[0] = cpu_to_le32(rp[0]); ra[1] = cpu_to_le32(rp[1]); - if (!memcmp(buf, (uint8_t *)ra, 6)) { + if (!memcmp(buf, (uint8_t *)ra, ETH_ALEN)) { trace_e1000x_rx_flt_ucast_match((int)(rp - mac - RA) / 2, MAC_ARG(buf)); return true; @@ -152,8 +155,8 @@ void e1000x_reset_mac_addr(NICState *nic, uint32_t *mac_regs, void e1000x_update_regs_on_autoneg_done(uint32_t *mac, uint16_t *phy) { e1000x_update_regs_on_link_up(mac, phy); - phy[PHY_LP_ABILITY] |= MII_LPAR_LPACK; - phy[PHY_STATUS] |= MII_SR_AUTONEG_COMPLETE; + phy[MII_ANLPAR] |= MII_ANLPAR_ACK; + phy[MII_BMSR] |= MII_BMSR_AN_COMP; trace_e1000x_link_negotiation_done(); } @@ -265,3 +268,28 @@ e1000x_read_tx_ctx_descr(struct 
e1000_context_desc *d, props->tcp = (op & E1000_TXD_CMD_TCP) ? 1 : 0; props->tse = (op & E1000_TXD_CMD_TSE) ? 1 : 0; } + +void e1000x_timestamp(uint32_t *mac, int64_t timadj, size_t lo, size_t hi) +{ + int64_t ns = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); + uint32_t timinca = mac[TIMINCA]; + uint32_t incvalue = timinca & E1000_TIMINCA_INCVALUE_MASK; + uint32_t incperiod = MAX(timinca >> E1000_TIMINCA_INCPERIOD_SHIFT, 1); + int64_t timestamp = timadj + muldiv64(ns, incvalue, incperiod * 16); + + mac[lo] = timestamp & 0xffffffff; + mac[hi] = timestamp >> 32; +} + +void e1000x_set_timinca(uint32_t *mac, int64_t *timadj, uint32_t val) +{ + int64_t ns = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); + uint32_t old_val = mac[TIMINCA]; + uint32_t old_incvalue = old_val & E1000_TIMINCA_INCVALUE_MASK; + uint32_t old_incperiod = MAX(old_val >> E1000_TIMINCA_INCPERIOD_SHIFT, 1); + uint32_t incvalue = val & E1000_TIMINCA_INCVALUE_MASK; + uint32_t incperiod = MAX(val >> E1000_TIMINCA_INCPERIOD_SHIFT, 1); + + mac[TIMINCA] = val; + *timadj += (muldiv64(ns, incvalue, incperiod) - muldiv64(ns, old_incvalue, old_incperiod)) / 16; +} diff --git a/hw/net/e1000x_common.h b/hw/net/e1000x_common.h index b7742775c4..911abd8a90 100644 --- a/hw/net/e1000x_common.h +++ b/hw/net/e1000x_common.h @@ -1,108 +1,34 @@ /* -* QEMU e1000(e) emulation - shared code -* -* Copyright (c) 2008 Qumranet -* -* Based on work done by: -* Nir Peleg, Tutis Systems Ltd. for Qumranet Inc. -* Copyright (c) 2007 Dan Aloni -* Copyright (c) 2004 Antony T Curtis -* -* This library is free software; you can redistribute it and/or -* modify it under the terms of the GNU Lesser General Public -* License as published by the Free Software Foundation; either -* version 2.1 of the License, or (at your option) any later version. -* -* This library is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU -* Lesser General Public License for more details. -* -* You should have received a copy of the GNU Lesser General Public -* License along with this library; if not, see <http://www.gnu.org/licenses/>. -*/ + * QEMU e1000(e) emulation - shared code + * + * Copyright (c) 2008 Qumranet + * + * Based on work done by: + * Nir Peleg, Tutis Systems Ltd. for Qumranet Inc. + * Copyright (c) 2007 Dan Aloni + * Copyright (c) 2004 Antony T Curtis + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. 
+ */ #ifndef HW_NET_E1000X_COMMON_H #define HW_NET_E1000X_COMMON_H -#include "e1000_regs.h" - -#define defreg(x) x = (E1000_##x >> 2) -enum { - defreg(CTRL), defreg(EECD), defreg(EERD), defreg(GPRC), - defreg(GPTC), defreg(ICR), defreg(ICS), defreg(IMC), - defreg(IMS), defreg(LEDCTL), defreg(MANC), defreg(MDIC), - defreg(MPC), defreg(PBA), defreg(RCTL), defreg(RDBAH0), - defreg(RDBAL0), defreg(RDH0), defreg(RDLEN0), defreg(RDT0), - defreg(STATUS), defreg(SWSM), defreg(TCTL), defreg(TDBAH), - defreg(TDBAL), defreg(TDH), defreg(TDLEN), defreg(TDT), - defreg(TDLEN1), defreg(TDBAL1), defreg(TDBAH1), defreg(TDH1), - defreg(TDT1), defreg(TORH), defreg(TORL), defreg(TOTH), - defreg(TOTL), defreg(TPR), defreg(TPT), defreg(TXDCTL), - defreg(WUFC), defreg(RA), defreg(MTA), defreg(CRCERRS), - defreg(VFTA), defreg(VET), defreg(RDTR), defreg(RADV), - defreg(TADV), defreg(ITR), defreg(SCC), defreg(ECOL), - defreg(MCC), defreg(LATECOL), defreg(COLC), defreg(DC), - defreg(TNCRS), defreg(SEQEC), defreg(CEXTERR), defreg(RLEC), - defreg(XONRXC), defreg(XONTXC), defreg(XOFFRXC), defreg(XOFFTXC), - defreg(FCRUC), defreg(AIT), defreg(TDFH), defreg(TDFT), - defreg(TDFHS), defreg(TDFTS), defreg(TDFPC), defreg(WUC), - defreg(WUS), defreg(POEMB), defreg(PBS), defreg(RDFH), - defreg(RDFT), defreg(RDFHS), defreg(RDFTS), defreg(RDFPC), - defreg(PBM), defreg(IPAV), defreg(IP4AT), defreg(IP6AT), - defreg(WUPM), defreg(FFLT), defreg(FFMT), defreg(FFVT), - defreg(TARC0), defreg(TARC1), defreg(IAM), defreg(EXTCNF_CTRL), - defreg(GCR), defreg(TIMINCA), defreg(EIAC), defreg(CTRL_EXT), - defreg(IVAR), defreg(MFUTP01), defreg(MFUTP23), defreg(MANC2H), - defreg(MFVAL), defreg(MDEF), defreg(FACTPS), defreg(FTFT), - defreg(RUC), defreg(ROC), defreg(RFC), defreg(RJC), - defreg(PRC64), defreg(PRC127), defreg(PRC255), defreg(PRC511), - defreg(PRC1023), defreg(PRC1522), defreg(PTC64), defreg(PTC127), - defreg(PTC255), defreg(PTC511), defreg(PTC1023), defreg(PTC1522), - defreg(GORCL), defreg(GORCH), 
defreg(GOTCL), defreg(GOTCH), - defreg(RNBC), defreg(BPRC), defreg(MPRC), defreg(RFCTL), - defreg(PSRCTL), defreg(MPTC), defreg(BPTC), defreg(TSCTFC), - defreg(IAC), defreg(MGTPRC), defreg(MGTPDC), defreg(MGTPTC), - defreg(TSCTC), defreg(RXCSUM), defreg(FUNCTAG), defreg(GSCL_1), - defreg(GSCL_2), defreg(GSCL_3), defreg(GSCL_4), defreg(GSCN_0), - defreg(GSCN_1), defreg(GSCN_2), defreg(GSCN_3), defreg(GCR2), - defreg(RAID), defreg(RSRPD), defreg(TIDV), defreg(EITR), - defreg(MRQC), defreg(RETA), defreg(RSSRK), defreg(RDBAH1), - defreg(RDBAL1), defreg(RDLEN1), defreg(RDH1), defreg(RDT1), - defreg(PBACLR), defreg(FCAL), defreg(FCAH), defreg(FCT), - defreg(FCRTH), defreg(FCRTL), defreg(FCTTV), defreg(FCRTV), - defreg(FLA), defreg(EEWR), defreg(FLOP), defreg(FLOL), - defreg(FLSWCTL), defreg(FLSWCNT), defreg(RXDCTL), defreg(RXDCTL1), - defreg(MAVTV0), defreg(MAVTV1), defreg(MAVTV2), defreg(MAVTV3), - defreg(TXSTMPL), defreg(TXSTMPH), defreg(SYSTIML), defreg(SYSTIMH), - defreg(RXCFGL), defreg(RXUDP), defreg(TIMADJL), defreg(TIMADJH), - defreg(RXSTMPH), defreg(RXSTMPL), defreg(RXSATRL), defreg(RXSATRH), - defreg(FLASHT), defreg(TIPG), defreg(RDH), defreg(RDT), - defreg(RDLEN), defreg(RDBAH), defreg(RDBAL), - defreg(TXDCTL1), - defreg(FLSWDATA), - defreg(CTRL_DUP), - defreg(EXTCNF_SIZE), - defreg(EEMNGCTL), - defreg(EEMNGDATA), - defreg(FLMNGCTL), - defreg(FLMNGDATA), - defreg(FLMNGCNT), - defreg(TSYNCRXCTL), - defreg(TSYNCTXCTL), - - /* Aliases */ - defreg(RDH0_A), defreg(RDT0_A), defreg(RDTR_A), defreg(RDFH_A), - defreg(RDFT_A), defreg(TDH_A), defreg(TDT_A), defreg(TIDV_A), - defreg(TDFH_A), defreg(TDFT_A), defreg(RA_A), defreg(RDBAL0_A), - defreg(TDBAL_A), defreg(TDLEN_A), defreg(VFTA_A), defreg(RDLEN0_A), - defreg(FCRTL_A), defreg(FCRTH_A) -}; - static inline void e1000x_inc_reg_if_not_full(uint32_t *mac, int index) { - if (mac[index] != 0xffffffff) { + if (mac[index] != UINT32_MAX) { mac[index]++; } } @@ -152,16 +78,16 @@ static inline void 
e1000x_update_regs_on_link_down(uint32_t *mac, uint16_t *phy) { mac[STATUS] &= ~E1000_STATUS_LU; - phy[PHY_STATUS] &= ~MII_SR_LINK_STATUS; - phy[PHY_STATUS] &= ~MII_SR_AUTONEG_COMPLETE; - phy[PHY_LP_ABILITY] &= ~MII_LPAR_LPACK; + phy[MII_BMSR] &= ~MII_BMSR_LINK_ST; + phy[MII_BMSR] &= ~MII_BMSR_AN_COMP; + phy[MII_ANLPAR] &= ~MII_ANLPAR_ACK; } static inline void e1000x_update_regs_on_link_up(uint32_t *mac, uint16_t *phy) { mac[STATUS] |= E1000_STATUS_LU; - phy[PHY_STATUS] |= MII_SR_LINK_STATUS; + phy[MII_BMSR] |= MII_BMSR_LINK_ST; } void e1000x_update_rx_total_stats(uint32_t *mac, @@ -178,7 +104,7 @@ uint32_t e1000x_rxbufsize(uint32_t rctl); bool e1000x_rx_ready(PCIDevice *d, uint32_t *mac); -bool e1000x_is_vlan_packet(const uint8_t *buf, uint16_t vet); +bool e1000x_is_vlan_packet(const void *buf, uint16_t vet); bool e1000x_rx_group_filter(uint32_t *mac, const uint8_t *buf); @@ -213,4 +139,7 @@ typedef struct e1000x_txd_props { void e1000x_read_tx_ctx_descr(struct e1000_context_desc *d, e1000x_txd_props *props); +void e1000x_timestamp(uint32_t *mac, int64_t timadj, size_t lo, size_t hi); +void e1000x_set_timinca(uint32_t *mac, int64_t *timadj, uint32_t val); + #endif diff --git a/hw/net/e1000x_regs.h b/hw/net/e1000x_regs.h new file mode 100644 index 0000000000..c0832fa23d --- /dev/null +++ b/hw/net/e1000x_regs.h @@ -0,0 +1,967 @@ +/******************************************************************************* + + Intel PRO/1000 Linux driver + Copyright(c) 1999 - 2006 Intel Corporation. + + This program is free software; you can redistribute it and/or modify it + under the terms and conditions of the GNU General Public License, + version 2, as published by the Free Software Foundation. + + This program is distributed in the hope it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. 
+ + You should have received a copy of the GNU General Public License along with + this program; if not, see <http://www.gnu.org/licenses/>. + + The full GNU General Public License is included in this distribution in + the file called "COPYING". + + Contact Information: + Linux NICS <linux.nics@intel.com> + e1000-devel Mailing List <e1000-devel@lists.sourceforge.net> + Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 + +*******************************************************************************/ + +/* e1000_hw.h + * Structures, enums, and macros for the MAC + */ + +#ifndef HW_E1000X_REGS_H +#define HW_E1000X_REGS_H + +/* PCI Device IDs */ +#define E1000_DEV_ID_82542 0x1000 +#define E1000_DEV_ID_82543GC_FIBER 0x1001 +#define E1000_DEV_ID_82543GC_COPPER 0x1004 +#define E1000_DEV_ID_82544EI_COPPER 0x1008 +#define E1000_DEV_ID_82544EI_FIBER 0x1009 +#define E1000_DEV_ID_82544GC_COPPER 0x100C +#define E1000_DEV_ID_82544GC_LOM 0x100D +#define E1000_DEV_ID_82540EM 0x100E +#define E1000_DEV_ID_82540EM_LOM 0x1015 +#define E1000_DEV_ID_82540EP_LOM 0x1016 +#define E1000_DEV_ID_82540EP 0x1017 +#define E1000_DEV_ID_82540EP_LP 0x101E +#define E1000_DEV_ID_82545EM_COPPER 0x100F +#define E1000_DEV_ID_82545EM_FIBER 0x1011 +#define E1000_DEV_ID_82545GM_COPPER 0x1026 +#define E1000_DEV_ID_82545GM_FIBER 0x1027 +#define E1000_DEV_ID_82545GM_SERDES 0x1028 +#define E1000_DEV_ID_82546EB_COPPER 0x1010 +#define E1000_DEV_ID_82546EB_FIBER 0x1012 +#define E1000_DEV_ID_82546EB_QUAD_COPPER 0x101D +#define E1000_DEV_ID_82541EI 0x1013 +#define E1000_DEV_ID_82541EI_MOBILE 0x1018 +#define E1000_DEV_ID_82541ER_LOM 0x1014 +#define E1000_DEV_ID_82541ER 0x1078 +#define E1000_DEV_ID_82547GI 0x1075 +#define E1000_DEV_ID_82541GI 0x1076 +#define E1000_DEV_ID_82541GI_MOBILE 0x1077 +#define E1000_DEV_ID_82541GI_LF 0x107C +#define E1000_DEV_ID_82546GB_COPPER 0x1079 +#define E1000_DEV_ID_82546GB_FIBER 0x107A +#define E1000_DEV_ID_82546GB_SERDES 0x107B +#define 
E1000_DEV_ID_82546GB_PCIE 0x108A +#define E1000_DEV_ID_82546GB_QUAD_COPPER 0x1099 +#define E1000_DEV_ID_82547EI 0x1019 +#define E1000_DEV_ID_82547EI_MOBILE 0x101A +#define E1000_DEV_ID_82571EB_COPPER 0x105E +#define E1000_DEV_ID_82571EB_FIBER 0x105F +#define E1000_DEV_ID_82571EB_SERDES 0x1060 +#define E1000_DEV_ID_82571EB_QUAD_COPPER 0x10A4 +#define E1000_DEV_ID_82571PT_QUAD_COPPER 0x10D5 +#define E1000_DEV_ID_82571EB_QUAD_FIBER 0x10A5 +#define E1000_DEV_ID_82571EB_QUAD_COPPER_LOWPROFILE 0x10BC +#define E1000_DEV_ID_82571EB_SERDES_DUAL 0x10D9 +#define E1000_DEV_ID_82571EB_SERDES_QUAD 0x10DA +#define E1000_DEV_ID_82572EI_COPPER 0x107D +#define E1000_DEV_ID_82572EI_FIBER 0x107E +#define E1000_DEV_ID_82572EI_SERDES 0x107F +#define E1000_DEV_ID_82572EI 0x10B9 +#define E1000_DEV_ID_82573E 0x108B +#define E1000_DEV_ID_82573E_IAMT 0x108C +#define E1000_DEV_ID_82573L 0x109A +#define E1000_DEV_ID_82574L 0x10D3 +#define E1000_DEV_ID_82546GB_QUAD_COPPER_KSP3 0x10B5 +#define E1000_DEV_ID_80003ES2LAN_COPPER_DPT 0x1096 +#define E1000_DEV_ID_80003ES2LAN_SERDES_DPT 0x1098 +#define E1000_DEV_ID_80003ES2LAN_COPPER_SPT 0x10BA +#define E1000_DEV_ID_80003ES2LAN_SERDES_SPT 0x10BB + +#define E1000_DEV_ID_ICH8_IGP_M_AMT 0x1049 +#define E1000_DEV_ID_ICH8_IGP_AMT 0x104A +#define E1000_DEV_ID_ICH8_IGP_C 0x104B +#define E1000_DEV_ID_ICH8_IFE 0x104C +#define E1000_DEV_ID_ICH8_IFE_GT 0x10C4 +#define E1000_DEV_ID_ICH8_IFE_G 0x10C5 +#define E1000_DEV_ID_ICH8_IGP_M 0x104D + +/* Device Specific Register Defaults */ +#define E1000_PHY_ID2_82541x 0x380 +#define E1000_PHY_ID2_82544x 0xC30 +#define E1000_PHY_ID2_8254xx_DEFAULT 0xC20 /* 82540x, 82545x, and 82546x */ +#define E1000_PHY_ID2_82573x 0xCC0 +#define E1000_PHY_ID2_82574x 0xCB1 + +/* Register Set. (82543, 82544) + * + * Registers are defined to be 32 bits and should be accessed as 32 bit values. + * These registers are physically located on the NIC, but are mapped into the + * host memory address space. 
+ * + * RW - register is both readable and writable + * RO - register is read only + * WO - register is write only + * R/clr - register is read only and is cleared when read + * A - register array + */ +#define E1000_CTRL 0x00000 /* Device Control - RW */ +#define E1000_CTRL_DUP 0x00004 /* Device Control Duplicate (Shadow) - RW */ +#define E1000_STATUS 0x00008 /* Device Status - RO */ +#define E1000_EECD 0x00010 /* EEPROM/Flash Control - RW */ +#define E1000_EERD 0x00014 /* EEPROM Read - RW */ +#define E1000_CTRL_EXT 0x00018 /* Extended Device Control - RW */ +#define E1000_FLA 0x0001C /* Flash Access - RW */ +#define E1000_MDIC 0x00020 /* MDI Control - RW */ +#define E1000_SCTL 0x00024 /* SerDes Control - RW */ +#define E1000_FCAL 0x00028 /* Flow Control Address Low - RW */ +#define E1000_FCAH 0x0002C /* Flow Control Address High -RW */ +#define E1000_FCT 0x00030 /* Flow Control Type - RW */ +#define E1000_VET 0x00038 /* VLAN Ether Type - RW */ +#define E1000_ICR 0x000C0 /* Interrupt Cause Read - R/clr */ +#define E1000_ICS 0x000C8 /* Interrupt Cause Set - WO */ +#define E1000_IMS 0x000D0 /* Interrupt Mask Set - RW */ +#define E1000_IMC 0x000D8 /* Interrupt Mask Clear - WO */ +#define E1000_IAM 0x000E0 /* Interrupt Acknowledge Auto Mask */ +#define E1000_RCTL 0x00100 /* RX Control - RW */ +#define E1000_FCTTV 0x00170 /* Flow Control Transmit Timer Value - RW */ +#define E1000_TCTL 0x00400 /* TX Control - RW */ +#define E1000_TCTL_EXT 0x00404 /* Extended TX Control - RW */ +#define E1000_TIPG 0x00410 /* TX Inter-packet gap -RW */ +#define E1000_LEDCTL 0x00E00 /* LED Control - RW */ +#define E1000_EEMNGCTL 0x01010 /* MNG EEprom Control */ +#define E1000_EEMNGDATA 0x01014 /* MNG EEPROM Read/Write data */ +#define E1000_FLMNGCTL 0x01018 /* MNG Flash Control */ +#define E1000_FLMNGDATA 0x0101C /* MNG FLASH Read data */ +#define E1000_FLMNGCNT 0x01020 /* MNG FLASH Read Counter */ +#define E1000_EEARBC 0x01024 /* EEPROM Auto Read Bus Control */ +#define E1000_FLOP 
0x0103C /* FLASH Opcode Register */ +#define E1000_FCRTL 0x02160 /* Flow Control Receive Threshold Low - RW */ +#define E1000_FCRTL_A 0x00168 /* Alias to FCRTL */ +#define E1000_FCRTH 0x02168 /* Flow Control Receive Threshold High - RW */ +#define E1000_RDFH 0x02410 /* Receive Data FIFO Head Register - RW */ +#define E1000_RDFH_A 0x08000 /* Alias to RDFH */ +#define E1000_RDFT 0x02418 /* Receive Data FIFO Tail Register - RW */ +#define E1000_RDFT_A 0x08008 /* Alias to RDFT */ +#define E1000_RDFHS 0x02420 /* Receive Data FIFO Head Saved Register - RW */ +#define E1000_RDFTS 0x02428 /* Receive Data FIFO Tail Saved Register - RW */ +#define E1000_RDFPC 0x02430 /* Receive Data FIFO Packet Count - RW */ +#define E1000_TDFH 0x03410 /* TX Data FIFO Head - RW */ +#define E1000_TDFH_A 0x08010 /* Alias to TDFH */ +#define E1000_TDFT 0x03418 /* TX Data FIFO Tail - RW */ +#define E1000_TDFT_A 0x08018 /* Alias to TDFT */ +#define E1000_TDFHS 0x03420 /* TX Data FIFO Head Saved - RW */ +#define E1000_TDFTS 0x03428 /* TX Data FIFO Tail Saved - RW */ +#define E1000_TDFPC 0x03430 /* TX Data FIFO Packet Count - RW */ +#define E1000_CRCERRS 0x04000 /* CRC Error Count - R/clr */ +#define E1000_ALGNERRC 0x04004 /* Alignment Error Count - R/clr */ +#define E1000_SYMERRS 0x04008 /* Symbol Error Count - R/clr */ +#define E1000_RXERRC 0x0400C /* Receive Error Count - R/clr */ +#define E1000_MPC 0x04010 /* Missed Packet Count - R/clr */ +#define E1000_SCC 0x04014 /* Single Collision Count - R/clr */ +#define E1000_ECOL 0x04018 /* Excessive Collision Count - R/clr */ +#define E1000_MCC 0x0401C /* Multiple Collision Count - R/clr */ +#define E1000_LATECOL 0x04020 /* Late Collision Count - R/clr */ +#define E1000_COLC 0x04028 /* Collision Count - R/clr */ +#define E1000_DC 0x04030 /* Defer Count - R/clr */ +#define E1000_TNCRS 0x04034 /* TX-No CRS - R/clr */ +#define E1000_RLEC 0x04040 /* Receive Length Error Count - R/clr */ +#define E1000_XONRXC 0x04048 /* XON RX Count - R/clr */ +#define 
E1000_XONTXC 0x0404C /* XON TX Count - R/clr */ +#define E1000_XOFFRXC 0x04050 /* XOFF RX Count - R/clr */ +#define E1000_XOFFTXC 0x04054 /* XOFF TX Count - R/clr */ +#define E1000_FCRUC 0x04058 /* Flow Control RX Unsupported Count- R/clr */ +#define E1000_PRC64 0x0405C /* Packets RX (64 bytes) - R/clr */ +#define E1000_PRC127 0x04060 /* Packets RX (65-127 bytes) - R/clr */ +#define E1000_PRC255 0x04064 /* Packets RX (128-255 bytes) - R/clr */ +#define E1000_PRC511 0x04068 /* Packets RX (256-511 bytes) - R/clr */ +#define E1000_PRC1023 0x0406C /* Packets RX (512-1023 bytes) - R/clr */ +#define E1000_PRC1522 0x04070 /* Packets RX (1024-1522 bytes) - R/clr */ +#define E1000_GPRC 0x04074 /* Good Packets RX Count - R/clr */ +#define E1000_BPRC 0x04078 /* Broadcast Packets RX Count - R/clr */ +#define E1000_MPRC 0x0407C /* Multicast Packets RX Count - R/clr */ +#define E1000_GPTC 0x04080 /* Good Packets TX Count - R/clr */ +#define E1000_GORCL 0x04088 /* Good Octets RX Count Low - R/clr */ +#define E1000_GORCH 0x0408C /* Good Octets RX Count High - R/clr */ +#define E1000_GOTCL 0x04090 /* Good Octets TX Count Low - R/clr */ +#define E1000_GOTCH 0x04094 /* Good Octets TX Count High - R/clr */ +#define E1000_RNBC 0x040A0 /* RX No Buffers Count - R/clr */ +#define E1000_RUC 0x040A4 /* RX Undersize Count - R/clr */ +#define E1000_RFC 0x040A8 /* RX Fragment Count - R/clr */ +#define E1000_ROC 0x040AC /* RX Oversize Count - R/clr */ +#define E1000_RJC 0x040B0 /* RX Jabber Count - R/clr */ +#define E1000_MGTPRC 0x040B4 /* Management Packets RX Count - R/clr */ +#define E1000_MGTPDC 0x040B8 /* Management Packets Dropped Count - R/clr */ +#define E1000_MGTPTC 0x040BC /* Management Packets TX Count - R/clr */ +#define E1000_TORL 0x040C0 /* Total Octets RX Low - R/clr */ +#define E1000_TORH 0x040C4 /* Total Octets RX High - R/clr */ +#define E1000_TOTL 0x040C8 /* Total Octets TX Low - R/clr */ +#define E1000_TOTH 0x040CC /* Total Octets TX High - R/clr */ +#define E1000_TPR
0x040D0 /* Total Packets RX - R/clr */ +#define E1000_TPT 0x040D4 /* Total Packets TX - R/clr */ +#define E1000_PTC64 0x040D8 /* Packets TX (64 bytes) - R/clr */ +#define E1000_PTC127 0x040DC /* Packets TX (65-127 bytes) - R/clr */ +#define E1000_PTC255 0x040E0 /* Packets TX (128-255 bytes) - R/clr */ +#define E1000_PTC511 0x040E4 /* Packets TX (256-511 bytes) - R/clr */ +#define E1000_PTC1023 0x040E8 /* Packets TX (512-1023 bytes) - R/clr */ +#define E1000_PTC1522 0x040EC /* Packets TX (1024-1522 Bytes) - R/clr */ +#define E1000_MPTC 0x040F0 /* Multicast Packets TX Count - R/clr */ +#define E1000_BPTC 0x040F4 /* Broadcast Packets TX Count - R/clr */ +#define E1000_TSCTC 0x040F8 /* TCP Segmentation Context TX - R/clr */ +#define E1000_IAC 0x04100 /* Interrupt Assertion Count */ +#define E1000_ICRXPTC 0x04104 /* Interrupt Cause Rx Packet Timer Expire Count */ +#define E1000_ICRXDMTC 0x04120 /* Interrupt Cause Rx Descriptor Minimum Threshold Count */ +#define E1000_RXCSUM 0x05000 /* RX Checksum Control - RW */ +#define E1000_RFCTL 0x05008 /* Receive Filter Control*/ +#define E1000_MAVTV0 0x05010 /* Management VLAN TAG Value 0 */ +#define E1000_MAVTV1 0x05014 /* Management VLAN TAG Value 1 */ +#define E1000_MAVTV2 0x05018 /* Management VLAN TAG Value 2 */ +#define E1000_MAVTV3 0x0501c /* Management VLAN TAG Value 3 */ +#define E1000_MTA 0x05200 /* Multicast Table Array - RW Array */ +#define E1000_RA 0x05400 /* Receive Address - RW Array */ +#define E1000_RA_A 0x00040 /* Alias to RA */ +#define E1000_VFTA 0x05600 /* VLAN Filter Table Array - RW Array */ +#define E1000_VFTA_A 0x00600 /* Alias to VFTA */ +#define E1000_WUC 0x05800 /* Wakeup Control - RW */ +#define E1000_WUFC 0x05808 /* Wakeup Filter Control - RW */ +#define E1000_WUS 0x05810 /* Wakeup Status - RO */ +#define E1000_MANC 0x05820 /* Management Control - RW */ +#define E1000_IPAV 0x05838 /* IP Address Valid - RW */ +#define E1000_IP4AT 0x05840 /* IPv4 Address Table - RW Array */ +#define E1000_IP6AT 
0x05880 /* IPv6 Address Table - RW Array */ +#define E1000_WUPL 0x05900 /* Wakeup Packet Length - RW */ +#define E1000_WUPM 0x05A00 /* Wakeup Packet Memory - RO A */ +#define E1000_MFVAL 0x05824 /* Manageability Filters Valid - RW */ +#define E1000_MDEF 0x05890 /* Manageability Decision Filters - RW Array */ +#define E1000_FFMT 0x09000 /* Flexible Filter Mask Table - RW Array */ +#define E1000_FTFT 0x09400 /* Flexible TCO Filter Table - RW Array */ + +#define E1000_MANC2H 0x05860 /* Management Control To Host - RW */ +#define E1000_SW_FW_SYNC 0x05B5C /* Software-Firmware Synchronization - RW */ + +#define E1000_GCR 0x05B00 /* PCI-Ex Control */ +#define E1000_FUNCTAG 0x05B08 /* Function-Tag Register */ +#define E1000_GSCL_1 0x05B10 /* PCI-Ex Statistic Control #1 */ +#define E1000_GSCL_2 0x05B14 /* PCI-Ex Statistic Control #2 */ +#define E1000_GSCL_3 0x05B18 /* PCI-Ex Statistic Control #3 */ +#define E1000_GSCL_4 0x05B1C /* PCI-Ex Statistic Control #4 */ +#define E1000_GSCN_0 0x05B20 /* 3GIO Statistic Counter Register #0 */ +#define E1000_GSCN_1 0x05B24 /* 3GIO Statistic Counter Register #1 */ +#define E1000_GSCN_2 0x05B28 /* 3GIO Statistic Counter Register #2 */ +#define E1000_GSCN_3 0x05B2C /* 3GIO Statistic Counter Register #3 */ +#define E1000_FACTPS 0x05B30 /* Function Active and Power State to MNG */ +#define E1000_SWSM 0x05B50 /* SW Semaphore */ +#define E1000_FWSM 0x05B54 /* FW Semaphore */ +#define E1000_PBACLR 0x05B68 /* MSI-X PBA Clear */ + +#define E1000_TSYNCRXCTL 0x0B620 /* Rx Time Sync Control register - RW */ +#define E1000_TSYNCTXCTL 0x0B614 /* Tx Time Sync Control register - RW */ +#define E1000_TIMINCA 0x0B608 /* Increment attributes register - RW */ +#define E1000_RXSTMPL 0x0B624 /* Rx timestamp Low - RO */ +#define E1000_RXSTMPH 0x0B628 /* Rx timestamp High - RO */ +#define E1000_TXSTMPL 0x0B618 /* Tx timestamp value Low - RO */ +#define E1000_TXSTMPH 0x0B61C /* Tx timestamp value High - RO */ +#define E1000_SYSTIML 0x0B600 /* System time 
register Low - RO */ +#define E1000_SYSTIMH 0x0B604 /* System time register High - RO */ +#define E1000_TIMINCA 0x0B608 /* Increment attributes register - RW */ +#define E1000_RXSATRL 0x0B62C /* Rx timestamp attribute low - RO */ +#define E1000_RXSATRH 0x0B630 /* Rx timestamp attribute high - RO */ +#define E1000_TIMADJL 0x0B60C /* Time Adjustment Offset register Low - RW */ +#define E1000_TIMADJH 0x0B610 /* Time Adjustment Offset register High - RW */ + +/* RSS registers */ +#define E1000_MRQC 0x05818 /* Multiple Receive Control - RW */ +#define E1000_RETA 0x05C00 /* Redirection Table - RW Array */ +#define E1000_RSSRK 0x05C80 /* RSS Random Key - RW Array */ + +#define E1000_RETA_IDX(hash) ((hash) & (BIT(7) - 1)) +#define E1000_RETA_VAL(reta, hash) (((uint8_t *)(reta))[E1000_RETA_IDX(hash)]) + +#define E1000_MRQC_EN_TCPIPV4(mrqc) ((mrqc) & BIT(16)) +#define E1000_MRQC_EN_IPV4(mrqc) ((mrqc) & BIT(17)) +#define E1000_MRQC_EN_TCPIPV6(mrqc) ((mrqc) & BIT(18)) +#define E1000_MRQC_EN_IPV6EX(mrqc) ((mrqc) & BIT(19)) +#define E1000_MRQC_EN_IPV6(mrqc) ((mrqc) & BIT(20)) + +#define E1000_MRQ_RSS_TYPE_NONE (0) +#define E1000_MRQ_RSS_TYPE_IPV4TCP (1) +#define E1000_MRQ_RSS_TYPE_IPV4 (2) +#define E1000_MRQ_RSS_TYPE_IPV6TCP (3) +#define E1000_MRQ_RSS_TYPE_IPV6EX (4) +#define E1000_MRQ_RSS_TYPE_IPV6 (5) + +#define E1000_ICR_ASSERTED BIT(31) +#define E1000_EIAC_MASK 0x01F00000 + +/* RFCTL register bits */ +#define E1000_RFCTL_ISCSI_DIS 0x00000001 +#define E1000_RFCTL_NFSW_DIS 0x00000040 +#define E1000_RFCTL_NFSR_DIS 0x00000080 +#define E1000_RFCTL_IPV6_DIS 0x00000400 +#define E1000_RFCTL_IPV6_XSUM_DIS 0x00000800 +#define E1000_RFCTL_IPFRSP_DIS 0x00004000 +#define E1000_RFCTL_EXTEN 0x00008000 +#define E1000_RFCTL_IPV6_EX_DIS 0x00010000 +#define E1000_RFCTL_NEW_IPV6_EXT_DIS 0x00020000 + +/* TARC* parsing */ +#define E1000_TARC_ENABLE BIT(10) + +/* SW Semaphore Register */ +#define E1000_SWSM_SMBI 0x00000001 /* Driver Semaphore bit */ +#define E1000_SWSM_SWESMBI 0x00000002 /* FW 
Semaphore bit */ +#define E1000_SWSM_DRV_LOAD 0x00000008 /* Driver Loaded Bit */ + +#define E1000_SWSM2_LOCK 0x00000002 /* Secondary driver semaphore bit */ + +/* Interrupt Cause Read */ +#define E1000_ICR_TXDW 0x00000001 /* Transmit desc written back */ +#define E1000_ICR_TXQE 0x00000002 /* Transmit Queue empty */ +#define E1000_ICR_LSC 0x00000004 /* Link Status Change */ +#define E1000_ICR_RXSEQ 0x00000008 /* rx sequence error */ +#define E1000_ICR_RXDMT0 0x00000010 /* rx desc min. threshold (0) */ +#define E1000_ICR_RXO 0x00000040 /* rx overrun */ +#define E1000_ICR_RXT0 0x00000080 /* rx timer intr (ring 0) */ +#define E1000_ICR_MDAC 0x00000200 /* MDIO access complete */ +#define E1000_ICR_RXCFG 0x00000400 /* RX /c/ ordered set */ +#define E1000_ICR_GPI_EN0 0x00000800 /* GP Int 0 */ +#define E1000_ICR_GPI_EN1 0x00001000 /* GP Int 1 */ +#define E1000_ICR_GPI_EN2 0x00002000 /* GP Int 2 */ +#define E1000_ICR_GPI_EN3 0x00004000 /* GP Int 3 */ +#define E1000_ICR_TXD_LOW 0x00008000 +#define E1000_ICR_SRPD 0x00010000 +#define E1000_ICR_ACK 0x00020000 /* Receive Ack frame */ +#define E1000_ICR_MNG 0x00040000 /* Manageability event */ +#define E1000_ICR_DOCK 0x00080000 /* Dock/Undock */ +#define E1000_ICR_INT_ASSERTED 0x80000000 /* If this bit asserted, the driver should claim the interrupt */ +#define E1000_ICR_RXD_FIFO_PAR0 0x00100000 /* queue 0 Rx descriptor FIFO parity error */ +#define E1000_ICR_TXD_FIFO_PAR0 0x00200000 /* queue 0 Tx descriptor FIFO parity error */ +#define E1000_ICR_HOST_ARB_PAR 0x00400000 /* host arb read buffer parity error */ +#define E1000_ICR_PB_PAR 0x00800000 /* packet buffer parity error */ +#define E1000_ICR_RXD_FIFO_PAR1 0x01000000 /* queue 1 Rx descriptor FIFO parity error */ +#define E1000_ICR_TXD_FIFO_PAR1 0x02000000 /* queue 1 Tx descriptor FIFO parity error */ +#define E1000_ICR_ALL_PARITY 0x03F00000 /* all parity error bits */ +#define E1000_ICR_DSW 0x00000020 /* FW changed the status of DISSW bit in the FWSM */ +#define 
E1000_ICR_PHYINT 0x00001000 /* LAN connected device generates an interrupt */ +#define E1000_ICR_EPRST 0x00100000 /* ME hardware reset occurs */ +#define E1000_ICR_RXQ0 0x00100000 /* Rx Queue 0 Interrupt */ +#define E1000_ICR_RXQ1 0x00200000 /* Rx Queue 1 Interrupt */ +#define E1000_ICR_TXQ0 0x00400000 /* Tx Queue 0 Interrupt */ +#define E1000_ICR_TXQ1 0x00800000 /* Tx Queue 1 Interrupt */ +#define E1000_ICR_OTHER 0x01000000 /* Other Interrupts */ + +#define E1000_ICR_OTHER_CAUSES (E1000_ICR_LSC | \ + E1000_ICR_RXO | \ + E1000_ICR_MDAC | \ + E1000_ICR_SRPD | \ + E1000_ICR_ACK | \ + E1000_ICR_MNG) + +/* Interrupt Cause Set */ +#define E1000_ICS_TXDW E1000_ICR_TXDW /* Transmit desc written back */ +#define E1000_ICS_TXQE E1000_ICR_TXQE /* Transmit Queue empty */ +#define E1000_ICS_LSC E1000_ICR_LSC /* Link Status Change */ +#define E1000_ICS_RXSEQ E1000_ICR_RXSEQ /* rx sequence error */ +#define E1000_ICS_RXDMT0 E1000_ICR_RXDMT0 /* rx desc min. threshold */ +#define E1000_ICS_RXO E1000_ICR_RXO /* rx overrun */ +#define E1000_ICS_RXT0 E1000_ICR_RXT0 /* rx timer intr */ +#define E1000_ICS_MDAC E1000_ICR_MDAC /* MDIO access complete */ +#define E1000_ICS_RXCFG E1000_ICR_RXCFG /* RX /c/ ordered set */ +#define E1000_ICS_GPI_EN0 E1000_ICR_GPI_EN0 /* GP Int 0 */ +#define E1000_ICS_GPI_EN1 E1000_ICR_GPI_EN1 /* GP Int 1 */ +#define E1000_ICS_GPI_EN2 E1000_ICR_GPI_EN2 /* GP Int 2 */ +#define E1000_ICS_GPI_EN3 E1000_ICR_GPI_EN3 /* GP Int 3 */ +#define E1000_ICS_TXD_LOW E1000_ICR_TXD_LOW +#define E1000_ICS_SRPD E1000_ICR_SRPD +#define E1000_ICS_ACK E1000_ICR_ACK /* Receive Ack frame */ +#define E1000_ICS_MNG E1000_ICR_MNG /* Manageability event */ +#define E1000_ICS_DOCK E1000_ICR_DOCK /* Dock/Undock */ +#define E1000_ICS_RXD_FIFO_PAR0 E1000_ICR_RXD_FIFO_PAR0 /* queue 0 Rx descriptor FIFO parity error */ +#define E1000_ICS_TXD_FIFO_PAR0 E1000_ICR_TXD_FIFO_PAR0 /* queue 0 Tx descriptor FIFO parity error */ +#define E1000_ICS_HOST_ARB_PAR E1000_ICR_HOST_ARB_PAR /* host arb read 
buffer parity error */ +#define E1000_ICS_PB_PAR E1000_ICR_PB_PAR /* packet buffer parity error */ +#define E1000_ICS_RXD_FIFO_PAR1 E1000_ICR_RXD_FIFO_PAR1 /* queue 1 Rx descriptor FIFO parity error */ +#define E1000_ICS_TXD_FIFO_PAR1 E1000_ICR_TXD_FIFO_PAR1 /* queue 1 Tx descriptor FIFO parity error */ +#define E1000_ICS_DSW E1000_ICR_DSW +#define E1000_ICS_PHYINT E1000_ICR_PHYINT +#define E1000_ICS_EPRST E1000_ICR_EPRST + +/* Interrupt Mask Set */ +#define E1000_IMS_TXDW E1000_ICR_TXDW /* Transmit desc written back */ +#define E1000_IMS_TXQE E1000_ICR_TXQE /* Transmit Queue empty */ +#define E1000_IMS_LSC E1000_ICR_LSC /* Link Status Change */ +#define E1000_IMS_RXSEQ E1000_ICR_RXSEQ /* rx sequence error */ +#define E1000_IMS_RXDMT0 E1000_ICR_RXDMT0 /* rx desc min. threshold */ +#define E1000_IMS_RXO E1000_ICR_RXO /* rx overrun */ +#define E1000_IMS_RXT0 E1000_ICR_RXT0 /* rx timer intr */ +#define E1000_IMS_MDAC E1000_ICR_MDAC /* MDIO access complete */ +#define E1000_IMS_RXCFG E1000_ICR_RXCFG /* RX /c/ ordered set */ +#define E1000_IMS_GPI_EN0 E1000_ICR_GPI_EN0 /* GP Int 0 */ +#define E1000_IMS_GPI_EN1 E1000_ICR_GPI_EN1 /* GP Int 1 */ +#define E1000_IMS_GPI_EN2 E1000_ICR_GPI_EN2 /* GP Int 2 */ +#define E1000_IMS_GPI_EN3 E1000_ICR_GPI_EN3 /* GP Int 3 */ +#define E1000_IMS_TXD_LOW E1000_ICR_TXD_LOW +#define E1000_IMS_SRPD E1000_ICR_SRPD +#define E1000_IMS_ACK E1000_ICR_ACK /* Receive Ack frame */ +#define E1000_IMS_MNG E1000_ICR_MNG /* Manageability event */ +#define E1000_IMS_RXQ0 E1000_ICR_RXQ0 +#define E1000_IMS_RXQ1 E1000_ICR_RXQ1 +#define E1000_IMS_TXQ0 E1000_ICR_TXQ0 +#define E1000_IMS_TXQ1 E1000_ICR_TXQ1 +#define E1000_IMS_OTHER E1000_ICR_OTHER +#define E1000_IMS_DOCK E1000_ICR_DOCK /* Dock/Undock */ +#define E1000_IMS_RXD_FIFO_PAR0 E1000_ICR_RXD_FIFO_PAR0 /* queue 0 Rx descriptor FIFO parity error */ +#define E1000_IMS_TXD_FIFO_PAR0 E1000_ICR_TXD_FIFO_PAR0 /* queue 0 Tx descriptor FIFO parity error */ +#define E1000_IMS_HOST_ARB_PAR E1000_ICR_HOST_ARB_PAR 
/* host arb read buffer parity error */ +#define E1000_IMS_PB_PAR E1000_ICR_PB_PAR /* packet buffer parity error */ +#define E1000_IMS_RXD_FIFO_PAR1 E1000_ICR_RXD_FIFO_PAR1 /* queue 1 Rx descriptor FIFO parity error */ +#define E1000_IMS_TXD_FIFO_PAR1 E1000_ICR_TXD_FIFO_PAR1 /* queue 1 Tx descriptor FIFO parity error */ +#define E1000_IMS_DSW E1000_ICR_DSW +#define E1000_IMS_PHYINT E1000_ICR_PHYINT +#define E1000_IMS_EPRST E1000_ICR_EPRST + +/* Interrupt Mask Clear */ +#define E1000_IMC_TXDW E1000_ICR_TXDW /* Transmit desc written back */ +#define E1000_IMC_TXQE E1000_ICR_TXQE /* Transmit Queue empty */ +#define E1000_IMC_LSC E1000_ICR_LSC /* Link Status Change */ +#define E1000_IMC_RXSEQ E1000_ICR_RXSEQ /* rx sequence error */ +#define E1000_IMC_RXDMT0 E1000_ICR_RXDMT0 /* rx desc min. threshold */ +#define E1000_IMC_RXO E1000_ICR_RXO /* rx overrun */ +#define E1000_IMC_RXT0 E1000_ICR_RXT0 /* rx timer intr */ +#define E1000_IMC_MDAC E1000_ICR_MDAC /* MDIO access complete */ +#define E1000_IMC_RXCFG E1000_ICR_RXCFG /* RX /c/ ordered set */ +#define E1000_IMC_GPI_EN0 E1000_ICR_GPI_EN0 /* GP Int 0 */ +#define E1000_IMC_GPI_EN1 E1000_ICR_GPI_EN1 /* GP Int 1 */ +#define E1000_IMC_GPI_EN2 E1000_ICR_GPI_EN2 /* GP Int 2 */ +#define E1000_IMC_GPI_EN3 E1000_ICR_GPI_EN3 /* GP Int 3 */ +#define E1000_IMC_TXD_LOW E1000_ICR_TXD_LOW +#define E1000_IMC_SRPD E1000_ICR_SRPD +#define E1000_IMC_ACK E1000_ICR_ACK /* Receive Ack frame */ +#define E1000_IMC_MNG E1000_ICR_MNG /* Manageability event */ +#define E1000_IMC_DOCK E1000_ICR_DOCK /* Dock/Undock */ +#define E1000_IMC_RXD_FIFO_PAR0 E1000_ICR_RXD_FIFO_PAR0 /* queue 0 Rx descriptor FIFO parity error */ +#define E1000_IMC_TXD_FIFO_PAR0 E1000_ICR_TXD_FIFO_PAR0 /* queue 0 Tx descriptor FIFO parity error */ +#define E1000_IMC_HOST_ARB_PAR E1000_ICR_HOST_ARB_PAR /* host arb read buffer parity error */ +#define E1000_IMC_PB_PAR E1000_ICR_PB_PAR /* packet buffer parity error */ +#define E1000_IMC_RXD_FIFO_PAR1 E1000_ICR_RXD_FIFO_PAR1 /* 
queue 1 Rx descriptor FIFO parity error */ +#define E1000_IMC_TXD_FIFO_PAR1 E1000_ICR_TXD_FIFO_PAR1 /* queue 1 Tx descriptor FIFO parity error */ +#define E1000_IMC_DSW E1000_ICR_DSW +#define E1000_IMC_PHYINT E1000_ICR_PHYINT +#define E1000_IMC_EPRST E1000_ICR_EPRST + +/* Receive Control */ +#define E1000_RCTL_RST 0x00000001 /* Software reset */ +#define E1000_RCTL_EN 0x00000002 /* enable */ +#define E1000_RCTL_SBP 0x00000004 /* store bad packet */ +#define E1000_RCTL_UPE 0x00000008 /* unicast promiscuous enable */ +#define E1000_RCTL_MPE 0x00000010 /* multicast promiscuous enab */ +#define E1000_RCTL_LPE 0x00000020 /* long packet enable */ +#define E1000_RCTL_LBM_NO 0x00000000 /* no loopback mode */ +#define E1000_RCTL_LBM_MAC 0x00000040 /* MAC loopback mode */ +#define E1000_RCTL_LBM_SLP 0x00000080 /* serial link loopback mode */ +#define E1000_RCTL_LBM_TCVR 0x000000C0 /* tcvr loopback mode */ +#define E1000_RCTL_DTYP_MASK 0x00000C00 /* Descriptor type mask */ +#define E1000_RCTL_DTYP_PS 0x00000400 /* Packet Split descriptor */ +#define E1000_RCTL_RDMTS_HALF 0x00000000 /* rx desc min threshold size */ +#define E1000_RCTL_RDMTS_QUAT 0x00000100 /* rx desc min threshold size */ +#define E1000_RCTL_RDMTS_EIGTH 0x00000200 /* rx desc min threshold size */ +#define E1000_RCTL_MO_SHIFT 12 /* multicast offset shift */ +#define E1000_RCTL_MO_0 0x00000000 /* multicast offset 11:0 */ +#define E1000_RCTL_MO_1 0x00001000 /* multicast offset 12:1 */ +#define E1000_RCTL_MO_2 0x00002000 /* multicast offset 13:2 */ +#define E1000_RCTL_MO_3 0x00003000 /* multicast offset 15:4 */ +#define E1000_RCTL_MDR 0x00004000 /* multicast desc ring 0 */ +#define E1000_RCTL_BAM 0x00008000 /* broadcast enable */ +/* these buffer sizes are valid if E1000_RCTL_BSEX is 0 */ +#define E1000_RCTL_SZ_2048 0x00000000 /* rx buffer size 2048 */ +#define E1000_RCTL_SZ_1024 0x00010000 /* rx buffer size 1024 */ +#define E1000_RCTL_SZ_512 0x00020000 /* rx buffer size 512 */ +#define E1000_RCTL_SZ_256 
0x00030000 /* rx buffer size 256 */ +/* these buffer sizes are valid if E1000_RCTL_BSEX is 1 */ +#define E1000_RCTL_SZ_16384 0x00010000 /* rx buffer size 16384 */ +#define E1000_RCTL_SZ_8192 0x00020000 /* rx buffer size 8192 */ +#define E1000_RCTL_SZ_4096 0x00030000 /* rx buffer size 4096 */ +#define E1000_RCTL_VFE 0x00040000 /* vlan filter enable */ +#define E1000_RCTL_CFIEN 0x00080000 /* canonical form enable */ +#define E1000_RCTL_CFI 0x00100000 /* canonical form indicator */ +#define E1000_RCTL_DPF 0x00400000 /* discard pause frames */ +#define E1000_RCTL_PMCF 0x00800000 /* pass MAC control frames */ +#define E1000_RCTL_BSEX 0x02000000 /* Buffer size extension */ +#define E1000_RCTL_SECRC 0x04000000 /* Strip Ethernet CRC */ +#define E1000_RCTL_FLXBUF_MASK 0x78000000 /* Flexible buffer size */ +#define E1000_RCTL_FLXBUF_SHIFT 27 /* Flexible buffer shift */ + + +#define E1000_EEPROM_SWDPIN0 0x0001 /* SWDPIN 0 EEPROM Value */ +#define E1000_EEPROM_LED_LOGIC 0x0020 /* Led Logic Word */ +#define E1000_EEPROM_RW_REG_DATA 16 /* Offset to data in EEPROM read/write registers */ +#define E1000_EEPROM_RW_REG_DONE 0x10 /* Offset to READ/WRITE done bit */ +#define E1000_EEPROM_RW_REG_START 1 /* First bit for telling part to start operation */ +#define E1000_EEPROM_RW_ADDR_SHIFT 8 /* Shift to the address bits */ +#define E1000_EEPROM_POLL_WRITE 1 /* Flag for polling for write complete */ +#define E1000_EEPROM_POLL_READ 0 /* Flag for polling for read complete */ + +/* 82574 EERD/EEWR registers layout */ +#define E1000_EERW_START BIT(0) +#define E1000_EERW_DONE BIT(1) +#define E1000_EERW_ADDR_SHIFT 2 +#define E1000_EERW_ADDR_MASK ((1L << 14) - 1) +#define E1000_EERW_DATA_SHIFT 16 +#define E1000_EERW_DATA_MASK ((1L << 16) - 1) + +/* Register Bit Masks */ +/* Device Control */ +#define E1000_CTRL_FD 0x00000001 /* Full duplex.0=half; 1=full */ +#define E1000_CTRL_BEM 0x00000002 /* Endian Mode.0=little,1=big */ +#define E1000_CTRL_PRIOR 0x00000004 /* Priority on PCI. 
0=rx,1=fair */ +#define E1000_CTRL_GIO_MASTER_DISABLE 0x00000004 /*Blocks new Master requests */ +#define E1000_CTRL_LRST 0x00000008 /* Link reset. 0=normal,1=reset */ +#define E1000_CTRL_TME 0x00000010 /* Test mode. 0=normal,1=test */ +#define E1000_CTRL_SLE 0x00000020 /* Serial Link on 0=dis,1=en */ +#define E1000_CTRL_ASDE 0x00000020 /* Auto-speed detect enable */ +#define E1000_CTRL_SLU 0x00000040 /* Set link up (Force Link) */ +#define E1000_CTRL_ILOS 0x00000080 /* Invert Loss-Of Signal */ +#define E1000_CTRL_SPD_SEL 0x00000300 /* Speed Select Mask */ +#define E1000_CTRL_SPD_10 0x00000000 /* Force 10Mb */ +#define E1000_CTRL_SPD_100 0x00000100 /* Force 100Mb */ +#define E1000_CTRL_SPD_1000 0x00000200 /* Force 1Gb */ +#define E1000_CTRL_BEM32 0x00000400 /* Big Endian 32 mode */ +#define E1000_CTRL_FRCSPD 0x00000800 /* Force Speed */ +#define E1000_CTRL_FRCDPX 0x00001000 /* Force Duplex */ +#define E1000_CTRL_D_UD_EN 0x00002000 /* Dock/Undock enable */ +#define E1000_CTRL_D_UD_POLARITY 0x00004000 /* Defined polarity of Dock/Undock indication in SDP[0] */ +#define E1000_CTRL_FORCE_PHY_RESET 0x00008000 /* Reset both PHY ports, through PHYRST_N pin */ +#define E1000_CTRL_SPD_SHIFT 8 /* Speed Select Shift */ + +#define E1000_CTRL_EXT_ASDCHK 0x00001000 /* auto speed detection check */ +#define E1000_CTRL_EXT_EE_RST 0x00002000 /* EEPROM reset */ +#define E1000_CTRL_EXT_LINK_EN 0x00010000 /* enable link status from external LINK_0 and LINK_1 pins */ +#define E1000_CTRL_EXT_DRV_LOAD 0x10000000 /* Driver loaded bit for FW */ +#define E1000_CTRL_EXT_EIAME 0x01000000 +#define E1000_CTRL_EXT_IAME 0x08000000 /* Int ACK Auto-mask */ +#define E1000_CTRL_EXT_PBA_CLR 0x80000000 /* PBA Clear */ +#define E1000_CTRL_EXT_INT_TIMERS_CLEAR_ENA 0x20000000 +#define E1000_CTRL_EXT_SPD_BYPS 0x00008000 /* Speed Select Bypass */ + +#define E1000_CTRL_SWDPIN0 0x00040000 /* SWDPIN 0 value */ +#define E1000_CTRL_SWDPIN1 0x00080000 /* SWDPIN 1 value */ +#define E1000_CTRL_SWDPIN2 0x00100000 /* 
SWDPIN 2 value */ +#define E1000_CTRL_SWDPIN3 0x00200000 /* SWDPIN 3 value */ +#define E1000_CTRL_SWDPIO0 0x00400000 /* SWDPIN 0 Input or output */ +#define E1000_CTRL_SWDPIO1 0x00800000 /* SWDPIN 1 input or output */ +#define E1000_CTRL_SWDPIO2 0x01000000 /* SWDPIN 2 input or output */ +#define E1000_CTRL_SWDPIO3 0x02000000 /* SWDPIN 3 input or output */ +#define E1000_CTRL_ADVD3WUC 0x00100000 /* D3 WUC */ +#define E1000_CTRL_RST 0x04000000 /* Global reset */ +#define E1000_CTRL_RFCE 0x08000000 /* Receive Flow Control enable */ +#define E1000_CTRL_TFCE 0x10000000 /* Transmit flow control enable */ +#define E1000_CTRL_RTE 0x20000000 /* Routing tag enable */ +#define E1000_CTRL_VME 0x40000000 /* IEEE VLAN mode enable */ +#define E1000_CTRL_PHY_RST 0x80000000 /* PHY Reset */ +#define E1000_CTRL_SW2FW_INT 0x02000000 /* Initiate an interrupt to manageability engine */ + +/* Device Status */ +#define E1000_STATUS_FD 0x00000001 /* Full duplex.0=half,1=full */ +#define E1000_STATUS_LU 0x00000002 /* Link up.0=no,1=link */ +#define E1000_STATUS_SPEED_10 0x00000000 /* Speed 10Mb/s */ +#define E1000_STATUS_SPEED_100 0x00000040 /* Speed 100Mb/s */ +#define E1000_STATUS_SPEED_1000 0x00000080 /* Speed 1000Mb/s */ +#define E1000_STATUS_PHYRA 0x00000400 /* PHY Reset Asserted */ +#define E1000_STATUS_GIO_MASTER_ENABLE 0x00080000 + +/* EEPROM/Flash Control */ +#define E1000_EECD_SK 0x00000001 /* EEPROM Clock */ +#define E1000_EECD_CS 0x00000002 /* EEPROM Chip Select */ +#define E1000_EECD_DI 0x00000004 /* EEPROM Data In */ +#define E1000_EECD_DO 0x00000008 /* EEPROM Data Out */ +#define E1000_EECD_FWE_MASK 0x00000030 +#define E1000_EECD_FWE_DIS 0x00000010 /* Disable FLASH writes */ +#define E1000_EECD_FWE_EN 0x00000020 /* Enable FLASH writes */ +#define E1000_EECD_FWE_SHIFT 4 +#define E1000_EECD_REQ 0x00000040 /* EEPROM Access Request */ +#define E1000_EECD_GNT 0x00000080 /* EEPROM Access Grant */ +#define E1000_EECD_PRES 0x00000100 /* EEPROM Present */ +#define E1000_EECD_SIZE 
0x00000200 /* EEPROM Size (0=64 word 1=256 word) */ +#define E1000_EECD_ADDR_BITS 0x00000400 /* EEPROM Addressing bits based on type + * (0-small, 1-large) */ +#define E1000_EECD_TYPE 0x00002000 /* EEPROM Type (1-SPI, 0-Microwire) */ +#ifndef E1000_EEPROM_GRANT_ATTEMPTS +#define E1000_EEPROM_GRANT_ATTEMPTS 1000 /* EEPROM # attempts to gain grant */ +#endif +#define E1000_EECD_AUTO_RD 0x00000200 /* EEPROM Auto Read done */ +#define E1000_EECD_SIZE_EX_MASK 0x00007800 /* EEprom Size */ +#define E1000_EECD_SIZE_EX_SHIFT 11 +#define E1000_EECD_NVADDS 0x00018000 /* NVM Address Size */ +#define E1000_EECD_SELSHAD 0x00020000 /* Select Shadow RAM */ +#define E1000_EECD_INITSRAM 0x00040000 /* Initialize Shadow RAM */ +#define E1000_EECD_FLUPD 0x00080000 /* Update FLASH */ +#define E1000_EECD_AUPDEN 0x00100000 /* Enable Autonomous FLASH update */ +#define E1000_EECD_SHADV 0x00200000 /* Shadow RAM Data Valid */ +#define E1000_EECD_SEC1VAL 0x00400000 /* Sector One Valid */ + + +#define E1000_EECD_SECVAL_SHIFT 22 +#define E1000_STM_OPCODE 0xDB00 +#define E1000_HICR_FW_RESET 0xC0 + +#define E1000_SHADOW_RAM_WORDS 2048 +#define E1000_ICH_NVM_SIG_WORD 0x13 +#define E1000_ICH_NVM_SIG_MASK 0xC0 + +/* MDI Control */ +#define E1000_MDIC_DATA_MASK 0x0000FFFF +#define E1000_MDIC_REG_MASK 0x001F0000 +#define E1000_MDIC_REG_SHIFT 16 +#define E1000_MDIC_PHY_MASK 0x03E00000 +#define E1000_MDIC_PHY_SHIFT 21 +#define E1000_MDIC_OP_WRITE 0x04000000 +#define E1000_MDIC_OP_READ 0x08000000 +#define E1000_MDIC_READY 0x10000000 +#define E1000_MDIC_INT_EN 0x20000000 +#define E1000_MDIC_ERROR 0x40000000 + +/* Rx Interrupt Delay Timer */ +#define E1000_RDTR_FPD BIT(31) + +/* Tx Interrupt Delay Timer */ +#define E1000_TIDV_FPD BIT(31) + +/* Delay increments in nanoseconds for delayed interrupts registers */ +#define E1000_INTR_DELAY_NS_RES (1024) + +/* Delay increments in nanoseconds for interrupt throttling registers */ +#define E1000_INTR_THROTTLING_NS_RES (256) + +/* EEPROM Commands - Microwire */ 
+#define EEPROM_READ_OPCODE_MICROWIRE 0x6 /* EEPROM read opcode */ +#define EEPROM_WRITE_OPCODE_MICROWIRE 0x5 /* EEPROM write opcode */ +#define EEPROM_ERASE_OPCODE_MICROWIRE 0x7 /* EEPROM erase opcode */ +#define EEPROM_EWEN_OPCODE_MICROWIRE 0x13 /* EEPROM erase/write enable */ +#define EEPROM_EWDS_OPCODE_MICROWIRE 0x10 /* EEPROM erase/write disable */ + +/* EEPROM Word Offsets */ +#define EEPROM_COMPAT 0x0003 +#define EEPROM_ID_LED_SETTINGS 0x0004 +#define EEPROM_VERSION 0x0005 +#define EEPROM_SERDES_AMPLITUDE 0x0006 /* For SERDES output amplitude adjustment. */ +#define EEPROM_PHY_CLASS_WORD 0x0007 +#define EEPROM_INIT_CONTROL1_REG 0x000A +#define EEPROM_INIT_CONTROL2_REG 0x000F +#define EEPROM_SWDEF_PINS_CTRL_PORT_1 0x0010 +#define EEPROM_INIT_CONTROL3_PORT_B 0x0014 +#define EEPROM_INIT_3GIO_3 0x001A +#define EEPROM_SWDEF_PINS_CTRL_PORT_0 0x0020 +#define EEPROM_INIT_CONTROL3_PORT_A 0x0024 +#define EEPROM_CFG 0x0012 +#define EEPROM_FLASH_VERSION 0x0032 +#define EEPROM_CHECKSUM_REG 0x003F + +#define E1000_EEPROM_CFG_DONE 0x00040000 /* MNG config cycle done */ +#define E1000_EEPROM_CFG_DONE_PORT_1 0x00080000 /* ...for second port */ + +/* HH Time Sync */ +#define E1000_TSYNCTXCTL_MAX_ALLOWED_DLY_MASK 0x0000F000 /* max delay */ +#define E1000_TSYNCTXCTL_SYNC_COMP 0x40000000 /* sync complete */ +#define E1000_TSYNCTXCTL_START_SYNC 0x80000000 /* initiate sync */ + +#define E1000_TSYNCTXCTL_VALID 0x00000001 /* Tx timestamp valid */ +#define E1000_TSYNCTXCTL_ENABLED 0x00000010 /* enable Tx timestamping */ + +#define E1000_TSYNCRXCTL_VALID 0x00000001 /* Rx timestamp valid */ +#define E1000_TSYNCRXCTL_TYPE_MASK 0x0000000E /* Rx type mask */ +#define E1000_TSYNCRXCTL_TYPE_L2_V2 0x00 +#define E1000_TSYNCRXCTL_TYPE_L4_V1 0x02 +#define E1000_TSYNCRXCTL_TYPE_L2_L4_V2 0x04 +#define E1000_TSYNCRXCTL_TYPE_ALL 0x08 +#define E1000_TSYNCRXCTL_TYPE_EVENT_V2 0x0A +#define E1000_TSYNCRXCTL_ENABLED 0x00000010 /* enable Rx timestamping */ +#define E1000_TSYNCRXCTL_SYSCFI 0x00000020 /* 
Sys clock frequency */ + +#define E1000_RXMTRL_PTP_V1_SYNC_MESSAGE 0x00000000 +#define E1000_RXMTRL_PTP_V1_DELAY_REQ_MESSAGE 0x00010000 + +#define E1000_RXMTRL_PTP_V2_SYNC_MESSAGE 0x00000000 +#define E1000_RXMTRL_PTP_V2_DELAY_REQ_MESSAGE 0x01000000 + +#define E1000_TIMINCA_INCPERIOD_SHIFT 24 +#define E1000_TIMINCA_INCVALUE_MASK 0x00FFFFFF + +/* PCI Express Control */ +/* 3GIO Control Register - GCR (0x05B00; RW) */ +#define E1000_L0S_ADJUST (1 << 9) +#define E1000_L1_ENTRY_LATENCY_MSB (1 << 23) +#define E1000_L1_ENTRY_LATENCY_LSB (1 << 25 | 1 << 26) + +#define E1000_L0S_ADJUST (1 << 9) +#define E1000_L1_ENTRY_LATENCY_MSB (1 << 23) +#define E1000_L1_ENTRY_LATENCY_LSB (1 << 25 | 1 << 26) + +#define E1000_GCR_RO_BITS (1 << 23 | 1 << 25 | 1 << 26) + +/* MSI-X PBA Clear register */ +#define E1000_PBACLR_VALID_MASK (BIT(5) - 1) + +/* Transmit Descriptor bit definitions */ +#define E1000_TXD_DTYP_D 0x00100000 /* Data Descriptor */ +#define E1000_TXD_DTYP_C 0x00000000 /* Context Descriptor */ +#define E1000_TXD_CMD_EOP 0x01000000 /* End of Packet */ +#define E1000_TXD_CMD_IFCS 0x02000000 /* Insert FCS (Ethernet CRC) */ +#define E1000_TXD_CMD_IC 0x04000000 /* Insert Checksum */ +#define E1000_TXD_CMD_RS 0x08000000 /* Report Status */ +#define E1000_TXD_CMD_RPS 0x10000000 /* Report Packet Sent */ +#define E1000_TXD_CMD_DEXT 0x20000000 /* Descriptor extension (0 = legacy) */ +#define E1000_TXD_CMD_VLE 0x40000000 /* Add VLAN tag */ +#define E1000_TXD_CMD_IDE 0x80000000 /* Enable Tidv register */ +#define E1000_TXD_STAT_DD 0x00000001 /* Descriptor Done */ +#define E1000_TXD_STAT_EC 0x00000002 /* Excess Collisions */ +#define E1000_TXD_STAT_LC 0x00000004 /* Late Collisions */ +#define E1000_TXD_STAT_TU 0x00000008 /* Transmit underrun */ +#define E1000_TXD_CMD_TCP 0x01000000 /* TCP packet */ +#define E1000_TXD_CMD_IP 0x02000000 /* IP packet */ +#define E1000_TXD_CMD_TSE 0x04000000 /* TCP Seg enable */ +#define E1000_TXD_CMD_SNAP 0x40000000 /* Update SNAP header */ +#define 
E1000_TXD_STAT_TC 0x00000004 /* Tx Underrun */ +#define E1000_TXD_EXTCMD_TSTAMP 0x00000010 /* IEEE1588 Timestamp packet */ + +/* Transmit Control */ +#define E1000_TCTL_RST 0x00000001 /* software reset */ +#define E1000_TCTL_EN 0x00000002 /* enable tx */ +#define E1000_TCTL_BCE 0x00000004 /* busy check enable */ +#define E1000_TCTL_PSP 0x00000008 /* pad short packets */ +#define E1000_TCTL_CT 0x00000ff0 /* collision threshold */ +#define E1000_TCTL_COLD 0x003ff000 /* collision distance */ +#define E1000_TCTL_SWXOFF 0x00400000 /* SW Xoff transmission */ +#define E1000_TCTL_PBE 0x00800000 /* Packet Burst Enable */ +#define E1000_TCTL_RTLC 0x01000000 /* Re-transmit on late collision */ +#define E1000_TCTL_NRTU 0x02000000 /* No Re-transmit on underrun */ +#define E1000_TCTL_MULR 0x10000000 /* Multiple request support */ + +/* Legacy Receive Descriptor */ +struct e1000_rx_desc { + uint64_t buffer_addr; /* Address of the descriptor's data buffer */ + uint16_t length; /* Length of data DMAed into data buffer */ + uint16_t csum; /* Packet checksum */ + uint8_t status; /* Descriptor status */ + uint8_t errors; /* Descriptor Errors */ + uint16_t special; +}; + +/* Extended Receive Descriptor */ +union e1000_rx_desc_extended { + struct { + uint64_t buffer_addr; + uint64_t reserved; + } read; + struct { + struct { + uint32_t mrq; /* Multiple Rx Queues */ + union { + uint32_t rss; /* RSS Hash */ + struct { + uint16_t ip_id; /* IP id */ + uint16_t csum; /* Packet Checksum */ + } csum_ip; + } hi_dword; + } lower; + struct { + uint32_t status_error; /* ext status/error */ + uint16_t length; + uint16_t vlan; /* VLAN tag */ + } upper; + } wb; /* writeback */ +}; + +#define MAX_PS_BUFFERS 4 + +/* Number of packet split data buffers (not including the header buffer) */ +#define PS_PAGE_BUFFERS (MAX_PS_BUFFERS - 1) + +/* Receive Descriptor - Packet Split */ +union e1000_rx_desc_packet_split { + struct { + /* one buffer for protocol header(s), three data buffers */ + uint64_t 
buffer_addr[MAX_PS_BUFFERS]; + } read; + struct { + struct { + uint32_t mrq; /* Multiple Rx Queues */ + union { + uint32_t rss; /* RSS Hash */ + struct { + uint16_t ip_id; /* IP id */ + uint16_t csum; /* Packet Checksum */ + } csum_ip; + } hi_dword; + } lower; + struct { + uint32_t status_error; /* ext status/error */ + uint16_t length0; /* length of buffer 0 */ + uint16_t vlan; /* VLAN tag */ + } middle; + struct { + uint16_t header_status; + /* length of buffers 1-3 */ + uint16_t length[PS_PAGE_BUFFERS]; + } upper; + uint64_t reserved; + } wb; /* writeback */ +}; + +/* Receive Checksum Control bits */ +#define E1000_RXCSUM_IPOFLD 0x100 /* IP Checksum Offload Enable */ +#define E1000_RXCSUM_TUOFLD 0x200 /* TCP/UDP Checksum Offload Enable */ +#define E1000_RXCSUM_PCSD 0x2000 /* Packet Checksum Disable */ + +#define E1000_RING_DESC_LEN (16) +#define E1000_RING_DESC_LEN_SHIFT (4) + +#define E1000_MIN_RX_DESC_LEN E1000_RING_DESC_LEN + +/* Receive Descriptor bit definitions */ +#define E1000_RXD_STAT_DD 0x01 /* Descriptor Done */ +#define E1000_RXD_STAT_EOP 0x02 /* End of Packet */ +#define E1000_RXD_STAT_IXSM 0x04 /* Ignore checksum */ +#define E1000_RXD_STAT_VP 0x08 /* IEEE VLAN Packet */ +#define E1000_RXD_STAT_UDPCS 0x10 /* UDP xsum calculated */ +#define E1000_RXD_STAT_TCPCS 0x20 /* TCP xsum calculated */ +#define E1000_RXD_STAT_IPCS 0x40 /* IP xsum calculated */ +#define E1000_RXD_STAT_PIF 0x80 /* passed in-exact filter */ +#define E1000_RXD_STAT_IPIDV 0x200 /* IP identification valid */ +#define E1000_RXD_STAT_UDPV 0x400 /* Valid UDP checksum */ +#define E1000_RXD_STAT_ACK 0x8000 /* ACK Packet indication */ +#define E1000_RXD_ERR_CE 0x01 /* CRC Error */ +#define E1000_RXD_ERR_SE 0x02 /* Symbol Error */ +#define E1000_RXD_ERR_SEQ 0x04 /* Sequence Error */ +#define E1000_RXD_ERR_CXE 0x10 /* Carrier Extension Error */ +#define E1000_RXD_ERR_TCPE 0x20 /* TCP/UDP Checksum Error */ +#define E1000_RXD_ERR_IPE 0x40 /* IP Checksum Error */ +#define E1000_RXD_ERR_RXE 0x80 
/* Rx Data Error */ +#define E1000_RXD_SPC_VLAN_MASK 0x0FFF /* VLAN ID is in lower 12 bits */ +#define E1000_RXD_SPC_PRI_MASK 0xE000 /* Priority is in upper 3 bits */ +#define E1000_RXD_SPC_PRI_SHIFT 13 +#define E1000_RXD_SPC_CFI_MASK 0x1000 /* CFI is bit 12 */ +#define E1000_RXD_SPC_CFI_SHIFT 12 + +/* RX packet types */ +#define E1000_RXD_PKT_MAC (0) +#define E1000_RXD_PKT_IP4 (1) +#define E1000_RXD_PKT_IP4_XDP (2) +#define E1000_RXD_PKT_IP6 (5) +#define E1000_RXD_PKT_IP6_XDP (6) + +#define E1000_RXD_PKT_TYPE(t) ((t) << 16) + +#define E1000_RXDEXT_STATERR_CE 0x01000000 +#define E1000_RXDEXT_STATERR_SE 0x02000000 +#define E1000_RXDEXT_STATERR_SEQ 0x04000000 +#define E1000_RXDEXT_STATERR_CXE 0x10000000 +#define E1000_RXDEXT_STATERR_TCPE 0x20000000 +#define E1000_RXDEXT_STATERR_IPE 0x40000000 +#define E1000_RXDEXT_STATERR_RXE 0x80000000 + +#define E1000_RXDPS_HDRSTAT_HDRSP 0x00008000 +#define E1000_RXDPS_HDRSTAT_HDRLEN_MASK 0x000003FF + +/* Receive Address */ +#define E1000_RAH_AV 0x80000000 /* Receive descriptor valid */ + +/* Offload Context Descriptor */ +struct e1000_context_desc { + union { + uint32_t ip_config; + struct { + uint8_t ipcss; /* IP checksum start */ + uint8_t ipcso; /* IP checksum offset */ + uint16_t ipcse; /* IP checksum end */ + } ip_fields; + } lower_setup; + union { + uint32_t tcp_config; + struct { + uint8_t tucss; /* TCP checksum start */ + uint8_t tucso; /* TCP checksum offset */ + uint16_t tucse; /* TCP checksum end */ + } tcp_fields; + } upper_setup; + uint32_t cmd_and_length; /* */ + union { + uint32_t data; + struct { + uint8_t status; /* Descriptor status */ + uint8_t hdr_len; /* Header length */ + uint16_t mss; /* Maximum segment size */ + } fields; + } tcp_seg_setup; +}; + +/* Filters */ +#define E1000_NUM_UNICAST 16 /* Unicast filter entries */ +#define E1000_MC_TBL_SIZE 128 /* Multicast Filter Table (4096 bits) */ +#define E1000_VLAN_FILTER_TBL_SIZE 128 /* VLAN Filter Table (4096 bits) */ + +/* Management Control */ +#define 
E1000_MANC_SMBUS_EN 0x00000001 /* SMBus Enabled - RO */ +#define E1000_MANC_ASF_EN 0x00000002 /* ASF Enabled - RO */ +#define E1000_MANC_R_ON_FORCE 0x00000004 /* Reset on Force TCO - RO */ +#define E1000_MANC_RMCP_EN 0x00000100 /* Enable RMCP 026Fh Filtering */ +#define E1000_MANC_0298_EN 0x00000200 /* Enable RMCP 0298h Filtering */ +#define E1000_MANC_IPV4_EN 0x00000400 /* Enable IPv4 */ +#define E1000_MANC_IPV6_EN 0x00000800 /* Enable IPv6 */ +#define E1000_MANC_SNAP_EN 0x00001000 /* Accept LLC/SNAP */ +#define E1000_MANC_ARP_EN 0x00002000 /* Enable ARP Request Filtering */ +#define E1000_MANC_NEIGHBOR_EN 0x00004000 /* Enable Neighbor Discovery + * Filtering */ +#define E1000_MANC_ARP_RES_EN 0x00008000 /* Enable ARP response Filtering */ +#define E1000_MANC_DIS_IP_CHK_ARP 0x10000000 /* Disable IP address checking */ + /*for ARP packets - in 82574 */ +#define E1000_MANC_TCO_RESET 0x00010000 /* TCO Reset Occurred */ +#define E1000_MANC_RCV_TCO_EN 0x00020000 /* Receive TCO Packets Enabled */ +#define E1000_MANC_REPORT_STATUS 0x00040000 /* Status Reporting Enabled */ +#define E1000_MANC_RCV_ALL 0x00080000 /* Receive All Enabled */ +#define E1000_MANC_BLK_PHY_RST_ON_IDE 0x00040000 /* Block phy resets */ +#define E1000_MANC_EN_MAC_ADDR_FILTER 0x00100000 /* Enable MAC address + * filtering */ +#define E1000_MANC_EN_MNG2HOST 0x00200000 /* Enable MNG packets to host + * memory */ +#define E1000_MANC_EN_IP_ADDR_FILTER 0x00400000 /* Enable IP address + * filtering */ +#define E1000_MANC_EN_XSUM_FILTER 0x00800000 /* Enable checksum filtering */ +#define E1000_MANC_BR_EN 0x01000000 /* Enable broadcast filtering */ +#define E1000_MANC_SMB_REQ 0x01000000 /* SMBus Request */ +#define E1000_MANC_SMB_GNT 0x02000000 /* SMBus Grant */ +#define E1000_MANC_SMB_CLK_IN 0x04000000 /* SMBus Clock In */ +#define E1000_MANC_SMB_DATA_IN 0x08000000 /* SMBus Data In */ +#define E1000_MANC_SMB_DATA_OUT 0x10000000 /* SMBus Data Out */ +#define E1000_MANC_SMB_CLK_OUT 0x20000000 /* SMBus Clock Out 
*/ + +#define E1000_MANC_SMB_DATA_OUT_SHIFT 28 /* SMBus Data Out Shift */ +#define E1000_MANC_SMB_CLK_OUT_SHIFT 29 /* SMBus Clock Out Shift */ + +/* FACTPS Control */ +#define E1000_FACTPS_LAN0_ON 0x00000004 /* Lan 0 enable */ + +/* For checksumming, the sum of all words in the EEPROM should equal 0xBABA. */ +#define EEPROM_SUM 0xBABA + +/* I/O-Mapped Access to Internal Registers, Memories, and Flash */ +#define E1000_IOADDR 0x00 +#define E1000_IODATA 0x04 + +#define E1000_VFTA_ENTRY_SHIFT 5 +#define E1000_VFTA_ENTRY_MASK 0x7F +#define E1000_VFTA_ENTRY_BIT_SHIFT_MASK 0x1F + +#endif /* HW_E1000_REGS_H */ diff --git a/hw/net/fsl_etsec/etsec.c b/hw/net/fsl_etsec/etsec.c index c753bfb3a8..798ea33d08 100644 --- a/hw/net/fsl_etsec/etsec.c +++ b/hw/net/fsl_etsec/etsec.c @@ -29,6 +29,7 @@ #include "qemu/osdep.h" #include "hw/sysbus.h" #include "hw/irq.h" +#include "hw/net/mii.h" #include "hw/ptimer.h" #include "hw/qdev-properties.h" #include "etsec.h" @@ -339,11 +340,11 @@ static void etsec_reset(DeviceState *d) etsec->rx_buffer_len = 0; etsec->phy_status = - MII_SR_EXTENDED_CAPS | MII_SR_LINK_STATUS | MII_SR_AUTONEG_CAPS | - MII_SR_AUTONEG_COMPLETE | MII_SR_PREAMBLE_SUPPRESS | - MII_SR_EXTENDED_STATUS | MII_SR_100T2_HD_CAPS | MII_SR_100T2_FD_CAPS | - MII_SR_10T_HD_CAPS | MII_SR_10T_FD_CAPS | MII_SR_100X_HD_CAPS | - MII_SR_100X_FD_CAPS | MII_SR_100T4_CAPS; + MII_BMSR_EXTCAP | MII_BMSR_LINK_ST | MII_BMSR_AUTONEG | + MII_BMSR_AN_COMP | MII_BMSR_MFPS | MII_BMSR_EXTSTAT | + MII_BMSR_100T2_HD | MII_BMSR_100T2_FD | + MII_BMSR_10T_HD | MII_BMSR_10T_FD | + MII_BMSR_100TX_HD | MII_BMSR_100TX_FD | MII_BMSR_100T4; etsec_update_irq(etsec); } diff --git a/hw/net/fsl_etsec/etsec.h b/hw/net/fsl_etsec/etsec.h index 3c625c955c..3860864a3f 100644 --- a/hw/net/fsl_etsec/etsec.h +++ b/hw/net/fsl_etsec/etsec.h @@ -76,23 +76,6 @@ typedef struct eTSEC_rxtx_bd { #define FCB_TX_CTU (1 << 1) #define FCB_TX_NPH (1 << 0) -/* PHY Status Register */ -#define MII_SR_EXTENDED_CAPS 0x0001 /* Extended 
register capabilities */ -#define MII_SR_JABBER_DETECT 0x0002 /* Jabber Detected */ -#define MII_SR_LINK_STATUS 0x0004 /* Link Status 1 = link */ -#define MII_SR_AUTONEG_CAPS 0x0008 /* Auto Neg Capable */ -#define MII_SR_REMOTE_FAULT 0x0010 /* Remote Fault Detect */ -#define MII_SR_AUTONEG_COMPLETE 0x0020 /* Auto Neg Complete */ -#define MII_SR_PREAMBLE_SUPPRESS 0x0040 /* Preamble may be suppressed */ -#define MII_SR_EXTENDED_STATUS 0x0100 /* Ext. status info in Reg 0x0F */ -#define MII_SR_100T2_HD_CAPS 0x0200 /* 100T2 Half Duplex Capable */ -#define MII_SR_100T2_FD_CAPS 0x0400 /* 100T2 Full Duplex Capable */ -#define MII_SR_10T_HD_CAPS 0x0800 /* 10T Half Duplex Capable */ -#define MII_SR_10T_FD_CAPS 0x1000 /* 10T Full Duplex Capable */ -#define MII_SR_100X_HD_CAPS 0x2000 /* 100X Half Duplex Capable */ -#define MII_SR_100X_FD_CAPS 0x4000 /* 100X Full Duplex Capable */ -#define MII_SR_100T4_CAPS 0x8000 /* 100T4 Capable */ - /* eTSEC */ /* Number of register in the device */ diff --git a/hw/net/fsl_etsec/miim.c b/hw/net/fsl_etsec/miim.c index 6bba01c82a..b48d2cb57b 100644 --- a/hw/net/fsl_etsec/miim.c +++ b/hw/net/fsl_etsec/miim.c @@ -23,6 +23,7 @@ */ #include "qemu/osdep.h" +#include "hw/net/mii.h" #include "etsec.h" #include "registers.h" @@ -140,8 +141,8 @@ void etsec_miim_link_status(eTSEC *etsec, NetClientState *nc) { /* Set link status */ if (nc->link_down) { - etsec->phy_status &= ~MII_SR_LINK_STATUS; + etsec->phy_status &= ~MII_BMSR_LINK_ST; } else { - etsec->phy_status |= MII_SR_LINK_STATUS; + etsec->phy_status |= MII_BMSR_LINK_ST; } } diff --git a/hw/net/igb.c b/hw/net/igb.c new file mode 100644 index 0000000000..c6d753df87 --- /dev/null +++ b/hw/net/igb.c @@ -0,0 +1,623 @@ +/* + * QEMU Intel 82576 SR/IOV Ethernet Controller Emulation + * + * Datasheet: + * https://www.intel.com/content/dam/www/public/us/en/documents/datasheets/82576eg-gbe-datasheet.pdf + * + * Copyright (c) 2020-2023 Red Hat, Inc. 
+ * Copyright (c) 2015 Ravello Systems LTD (http://ravellosystems.com) + * Developed by Daynix Computing LTD (http://www.daynix.com) + * + * Authors: + * Akihiko Odaki <akihiko.odaki@daynix.com> + * Gal Hammmer <gal.hammer@sap.com> + * Marcel Apfelbaum <marcel.apfelbaum@gmail.com> + * Dmitry Fleytman <dmitry@daynix.com> + * Leonid Bloch <leonid@daynix.com> + * Yan Vugenfirer <yan@daynix.com> + * + * Based on work done by: + * Nir Peleg, Tutis Systems Ltd. for Qumranet Inc. + * Copyright (c) 2008 Qumranet + * Based on work done by: + * Copyright (c) 2007 Dan Aloni + * Copyright (c) 2004 Antony T Curtis + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "qemu/osdep.h" +#include "qemu/units.h" +#include "net/eth.h" +#include "net/net.h" +#include "net/tap.h" +#include "qemu/module.h" +#include "qemu/range.h" +#include "sysemu/sysemu.h" +#include "hw/hw.h" +#include "hw/net/mii.h" +#include "hw/pci/pci.h" +#include "hw/pci/pcie.h" +#include "hw/pci/pcie_sriov.h" +#include "hw/pci/msi.h" +#include "hw/pci/msix.h" +#include "hw/qdev-properties.h" +#include "migration/vmstate.h" + +#include "igb_common.h" +#include "igb_core.h" + +#include "trace.h" +#include "qapi/error.h" +#include "qom/object.h" + +#define TYPE_IGB "igb" +OBJECT_DECLARE_SIMPLE_TYPE(IGBState, IGB) + +struct IGBState { + PCIDevice parent_obj; + NICState *nic; + NICConf conf; + + MemoryRegion mmio; + MemoryRegion flash; + MemoryRegion io; + MemoryRegion msix; + + uint32_t ioaddr; + + IGBCore core; +}; + +#define IGB_CAP_SRIOV_OFFSET (0x160) +#define IGB_VF_OFFSET (0x80) +#define IGB_VF_STRIDE (2) + +#define E1000E_MMIO_IDX 0 +#define E1000E_FLASH_IDX 1 +#define E1000E_IO_IDX 2 +#define E1000E_MSIX_IDX 3 + +#define E1000E_MMIO_SIZE (128 * KiB) +#define E1000E_FLASH_SIZE (128 * KiB) +#define E1000E_IO_SIZE (32) +#define E1000E_MSIX_SIZE (16 * KiB) + +static void igb_write_config(PCIDevice *dev, uint32_t addr, + uint32_t val, int len) +{ + IGBState *s = IGB(dev); + + trace_igb_write_config(addr, val, len); + pci_default_write_config(dev, addr, val, len); + + if (range_covers_byte(addr, len, PCI_COMMAND) && + (dev->config[PCI_COMMAND] & PCI_COMMAND_MASTER)) { + igb_start_recv(&s->core); + } +} + +uint64_t +igb_mmio_read(void *opaque, hwaddr addr, unsigned size) +{ + IGBState *s = opaque; + return igb_core_read(&s->core, addr, size); +} + +void +igb_mmio_write(void *opaque, hwaddr addr, uint64_t val, unsigned size) +{ + IGBState *s = opaque; + igb_core_write(&s->core, addr, val, size); +} + +static bool +igb_io_get_reg_index(IGBState *s, uint32_t *idx) +{ + if (s->ioaddr < 0x1FFFF) { + *idx = s->ioaddr; + return true; + } + + if (s->ioaddr 
< 0x7FFFF) { + trace_e1000e_wrn_io_addr_undefined(s->ioaddr); + return false; + } + + if (s->ioaddr < 0xFFFFF) { + trace_e1000e_wrn_io_addr_flash(s->ioaddr); + return false; + } + + trace_e1000e_wrn_io_addr_unknown(s->ioaddr); + return false; +} + +static uint64_t +igb_io_read(void *opaque, hwaddr addr, unsigned size) +{ + IGBState *s = opaque; + uint32_t idx = 0; + uint64_t val; + + switch (addr) { + case E1000_IOADDR: + trace_e1000e_io_read_addr(s->ioaddr); + return s->ioaddr; + case E1000_IODATA: + if (igb_io_get_reg_index(s, &idx)) { + val = igb_core_read(&s->core, idx, sizeof(val)); + trace_e1000e_io_read_data(idx, val); + return val; + } + return 0; + default: + trace_e1000e_wrn_io_read_unknown(addr); + return 0; + } +} + +static void +igb_io_write(void *opaque, hwaddr addr, uint64_t val, unsigned size) +{ + IGBState *s = opaque; + uint32_t idx = 0; + + switch (addr) { + case E1000_IOADDR: + trace_e1000e_io_write_addr(val); + s->ioaddr = (uint32_t) val; + return; + case E1000_IODATA: + if (igb_io_get_reg_index(s, &idx)) { + trace_e1000e_io_write_data(idx, val); + igb_core_write(&s->core, idx, val, sizeof(val)); + } + return; + default: + trace_e1000e_wrn_io_write_unknown(addr); + return; + } +} + +static const MemoryRegionOps mmio_ops = { + .read = igb_mmio_read, + .write = igb_mmio_write, + .endianness = DEVICE_LITTLE_ENDIAN, + .impl = { + .min_access_size = 4, + .max_access_size = 4, + }, +}; + +static const MemoryRegionOps io_ops = { + .read = igb_io_read, + .write = igb_io_write, + .endianness = DEVICE_LITTLE_ENDIAN, + .impl = { + .min_access_size = 4, + .max_access_size = 4, + }, +}; + +static bool +igb_nc_can_receive(NetClientState *nc) +{ + IGBState *s = qemu_get_nic_opaque(nc); + return igb_can_receive(&s->core); +} + +static ssize_t +igb_nc_receive_iov(NetClientState *nc, const struct iovec *iov, int iovcnt) +{ + IGBState *s = qemu_get_nic_opaque(nc); + return igb_receive_iov(&s->core, iov, iovcnt); +} + +static ssize_t +igb_nc_receive(NetClientState 
*nc, const uint8_t *buf, size_t size) +{ + IGBState *s = qemu_get_nic_opaque(nc); + return igb_receive(&s->core, buf, size); +} + +static void +igb_set_link_status(NetClientState *nc) +{ + IGBState *s = qemu_get_nic_opaque(nc); + igb_core_set_link_status(&s->core); +} + +static NetClientInfo net_igb_info = { + .type = NET_CLIENT_DRIVER_NIC, + .size = sizeof(NICState), + .can_receive = igb_nc_can_receive, + .receive = igb_nc_receive, + .receive_iov = igb_nc_receive_iov, + .link_status_changed = igb_set_link_status, +}; + +/* + * EEPROM (NVM) contents documented in section 6.1, table 6-1: + * and in 6.10 Software accessed words. + */ +static const uint16_t igb_eeprom_template[] = { + /* Address |Compat.|OEM sp.| ImRev | OEM sp. */ + 0x0000, 0x0000, 0x0000, 0x0d34, 0xffff, 0x2010, 0xffff, 0xffff, + /* PBA |ICtrl1 | SSID | SVID | DevID |-------|ICtrl2 */ + 0x1040, 0xffff, 0x002b, 0x0000, 0x8086, 0x10c9, 0x0000, 0x70c3, + /* SwPin0| DevID | EESZ |-------|ICtrl3 |PCI-tc | MSIX | APtr */ + 0x0004, 0x10c9, 0x5c00, 0x0000, 0x2880, 0x0014, 0x4a40, 0x0060, + /* PCIe Init. Conf 1,2,3 |PCICtrl| LD1,3 |DDevID |DevRev | LD0,2 */ + 0x6cfb, 0xc7b0, 0x0abe, 0x0403, 0x0783, 0x10a6, 0x0001, 0x0602, + /* SwPin1| FunC |LAN-PWR|ManHwC |ICtrl3 | IOVct |VDevID |-------*/ + 0x0004, 0x0020, 0x0000, 0x004a, 0x2080, 0x00f5, 0x10ca, 0x0000, + /*---------------| LD1,3 | LD0,2 | ROEnd | ROSta | Wdog | VPD */ + 0x0000, 0x0000, 0x4784, 0x4602, 0x0000, 0x0000, 0x1000, 0xffff, + /* PCSet0| Ccfg0 |PXEver |IBAcap |PCSet1 | Ccfg1 |iSCVer | ?? */ + 0x0100, 0x4000, 0x131f, 0x4013, 0x0100, 0x4000, 0xffff, 0xffff, + /* PCSet2| Ccfg2 |PCSet3 | Ccfg3 | ?? |AltMacP| ?? 
|CHKSUM */ + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x00e0, 0xffff, 0x0000, + /* NC-SIC */ + 0x0003, +}; + +static void igb_core_realize(IGBState *s) +{ + s->core.owner = &s->parent_obj; + s->core.owner_nic = s->nic; +} + +static void +igb_init_msix(IGBState *s) +{ + int i, res; + + res = msix_init(PCI_DEVICE(s), IGB_MSIX_VEC_NUM, + &s->msix, + E1000E_MSIX_IDX, 0, + &s->msix, + E1000E_MSIX_IDX, 0x2000, + 0x70, NULL); + + if (res < 0) { + trace_e1000e_msix_init_fail(res); + } else { + for (i = 0; i < IGB_MSIX_VEC_NUM; i++) { + msix_vector_use(PCI_DEVICE(s), i); + } + } +} + +static void +igb_cleanup_msix(IGBState *s) +{ + msix_unuse_all_vectors(PCI_DEVICE(s)); + msix_uninit(PCI_DEVICE(s), &s->msix, &s->msix); +} + +static void +igb_init_net_peer(IGBState *s, PCIDevice *pci_dev, uint8_t *macaddr) +{ + DeviceState *dev = DEVICE(pci_dev); + NetClientState *nc; + int i; + + s->nic = qemu_new_nic(&net_igb_info, &s->conf, + object_get_typename(OBJECT(s)), dev->id, s); + + s->core.max_queue_num = s->conf.peers.queues ? 
s->conf.peers.queues - 1 : 0; + + trace_e1000e_mac_set_permanent(MAC_ARG(macaddr)); + memcpy(s->core.permanent_mac, macaddr, sizeof(s->core.permanent_mac)); + + qemu_format_nic_info_str(qemu_get_queue(s->nic), macaddr); + + /* Setup virtio headers */ + for (i = 0; i < s->conf.peers.queues; i++) { + nc = qemu_get_subqueue(s->nic, i); + if (!nc->peer || !qemu_has_vnet_hdr(nc->peer)) { + trace_e1000e_cfg_support_virtio(false); + return; + } + } + + trace_e1000e_cfg_support_virtio(true); + s->core.has_vnet = true; + + for (i = 0; i < s->conf.peers.queues; i++) { + nc = qemu_get_subqueue(s->nic, i); + qemu_set_vnet_hdr_len(nc->peer, sizeof(struct virtio_net_hdr)); + qemu_using_vnet_hdr(nc->peer, true); + } +} + +static int +igb_add_pm_capability(PCIDevice *pdev, uint8_t offset, uint16_t pmc) +{ + Error *local_err = NULL; + int ret = pci_add_capability(pdev, PCI_CAP_ID_PM, offset, + PCI_PM_SIZEOF, &local_err); + + if (local_err) { + error_report_err(local_err); + return ret; + } + + pci_set_word(pdev->config + offset + PCI_PM_PMC, + PCI_PM_CAP_VER_1_1 | + pmc); + + pci_set_word(pdev->wmask + offset + PCI_PM_CTRL, + PCI_PM_CTRL_STATE_MASK | + PCI_PM_CTRL_PME_ENABLE | + PCI_PM_CTRL_DATA_SEL_MASK); + + pci_set_word(pdev->w1cmask + offset + PCI_PM_CTRL, + PCI_PM_CTRL_PME_STATUS); + + return ret; +} + +static void igb_pci_realize(PCIDevice *pci_dev, Error **errp) +{ + IGBState *s = IGB(pci_dev); + uint8_t *macaddr; + int ret; + + trace_e1000e_cb_pci_realize(); + + pci_dev->config_write = igb_write_config; + + pci_dev->config[PCI_CACHE_LINE_SIZE] = 0x10; + pci_dev->config[PCI_INTERRUPT_PIN] = 1; + + /* Define IO/MMIO regions */ + memory_region_init_io(&s->mmio, OBJECT(s), &mmio_ops, s, + "igb-mmio", E1000E_MMIO_SIZE); + pci_register_bar(pci_dev, E1000E_MMIO_IDX, + PCI_BASE_ADDRESS_SPACE_MEMORY, &s->mmio); + + /* + * We provide a dummy implementation for the flash BAR + * for drivers that may theoretically probe for its presence. 
+ */ + memory_region_init(&s->flash, OBJECT(s), + "igb-flash", E1000E_FLASH_SIZE); + pci_register_bar(pci_dev, E1000E_FLASH_IDX, + PCI_BASE_ADDRESS_SPACE_MEMORY, &s->flash); + + memory_region_init_io(&s->io, OBJECT(s), &io_ops, s, + "igb-io", E1000E_IO_SIZE); + pci_register_bar(pci_dev, E1000E_IO_IDX, + PCI_BASE_ADDRESS_SPACE_IO, &s->io); + + memory_region_init(&s->msix, OBJECT(s), "igb-msix", + E1000E_MSIX_SIZE); + pci_register_bar(pci_dev, E1000E_MSIX_IDX, + PCI_BASE_ADDRESS_MEM_TYPE_64, &s->msix); + + /* Create networking backend */ + qemu_macaddr_default_if_unset(&s->conf.macaddr); + macaddr = s->conf.macaddr.a; + + /* Add PCI capabilities in reverse order */ + assert(pcie_endpoint_cap_init(pci_dev, 0xa0) > 0); + + igb_init_msix(s); + + ret = msi_init(pci_dev, 0x50, 1, true, true, NULL); + if (ret) { + trace_e1000e_msi_init_fail(ret); + } + + if (igb_add_pm_capability(pci_dev, 0x40, PCI_PM_CAP_DSI) < 0) { + hw_error("Failed to initialize PM capability"); + } + + /* PCIe extended capabilities (in order) */ + if (pcie_aer_init(pci_dev, 1, 0x100, 0x40, errp) < 0) { + hw_error("Failed to initialize AER capability"); + } + + pcie_ari_init(pci_dev, 0x150, 1); + + pcie_sriov_pf_init(pci_dev, IGB_CAP_SRIOV_OFFSET, "igbvf", + IGB_82576_VF_DEV_ID, IGB_MAX_VF_FUNCTIONS, IGB_MAX_VF_FUNCTIONS, + IGB_VF_OFFSET, IGB_VF_STRIDE); + + pcie_sriov_pf_init_vf_bar(pci_dev, 0, + PCI_BASE_ADDRESS_MEM_TYPE_64 | PCI_BASE_ADDRESS_MEM_PREFETCH, + 16 * KiB); + pcie_sriov_pf_init_vf_bar(pci_dev, 3, + PCI_BASE_ADDRESS_MEM_TYPE_64 | PCI_BASE_ADDRESS_MEM_PREFETCH, + 16 * KiB); + + igb_init_net_peer(s, pci_dev, macaddr); + + /* Initialize core */ + igb_core_realize(s); + + igb_core_pci_realize(&s->core, + igb_eeprom_template, + sizeof(igb_eeprom_template), + macaddr); +} + +static void igb_pci_uninit(PCIDevice *pci_dev) +{ + IGBState *s = IGB(pci_dev); + + trace_e1000e_cb_pci_uninit(); + + igb_core_pci_uninit(&s->core); + + pcie_sriov_pf_exit(pci_dev); + pcie_cap_exit(pci_dev); + + 
qemu_del_nic(s->nic); + + igb_cleanup_msix(s); + msi_uninit(pci_dev); +} + +static void igb_qdev_reset_hold(Object *obj) +{ + PCIDevice *d = PCI_DEVICE(obj); + IGBState *s = IGB(obj); + + trace_e1000e_cb_qdev_reset_hold(); + + pcie_sriov_pf_disable_vfs(d); + igb_core_reset(&s->core); +} + +static int igb_pre_save(void *opaque) +{ + IGBState *s = opaque; + + trace_e1000e_cb_pre_save(); + + igb_core_pre_save(&s->core); + + return 0; +} + +static int igb_post_load(void *opaque, int version_id) +{ + IGBState *s = opaque; + + trace_e1000e_cb_post_load(); + return igb_core_post_load(&s->core); +} + +static const VMStateDescription igb_vmstate_tx = { + .name = "igb-tx", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT16(vlan, struct igb_tx), + VMSTATE_UINT16(mss, struct igb_tx), + VMSTATE_BOOL(tse, struct igb_tx), + VMSTATE_BOOL(ixsm, struct igb_tx), + VMSTATE_BOOL(txsm, struct igb_tx), + VMSTATE_BOOL(first, struct igb_tx), + VMSTATE_BOOL(skip_cp, struct igb_tx), + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription igb_vmstate_intr_timer = { + .name = "igb-intr-timer", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_TIMER_PTR(timer, IGBIntrDelayTimer), + VMSTATE_BOOL(running, IGBIntrDelayTimer), + VMSTATE_END_OF_LIST() + } +}; + +#define VMSTATE_IGB_INTR_DELAY_TIMER(_f, _s) \ + VMSTATE_STRUCT(_f, _s, 0, \ + igb_vmstate_intr_timer, IGBIntrDelayTimer) + +#define VMSTATE_IGB_INTR_DELAY_TIMER_ARRAY(_f, _s, _num) \ + VMSTATE_STRUCT_ARRAY(_f, _s, _num, 0, \ + igb_vmstate_intr_timer, IGBIntrDelayTimer) + +static const VMStateDescription igb_vmstate = { + .name = "igb", + .version_id = 1, + .minimum_version_id = 1, + .pre_save = igb_pre_save, + .post_load = igb_post_load, + .fields = (VMStateField[]) { + VMSTATE_PCI_DEVICE(parent_obj, IGBState), + VMSTATE_MSIX(parent_obj, IGBState), + + VMSTATE_UINT32(ioaddr, IGBState), + VMSTATE_UINT8(core.rx_desc_len, IGBState), + 
VMSTATE_UINT16_ARRAY(core.eeprom, IGBState, IGB_EEPROM_SIZE), + VMSTATE_UINT16_ARRAY(core.phy, IGBState, MAX_PHY_REG_ADDRESS + 1), + VMSTATE_UINT32_ARRAY(core.mac, IGBState, E1000E_MAC_SIZE), + VMSTATE_UINT8_ARRAY(core.permanent_mac, IGBState, ETH_ALEN), + + VMSTATE_IGB_INTR_DELAY_TIMER_ARRAY(core.eitr, IGBState, + IGB_INTR_NUM), + + VMSTATE_UINT32_ARRAY(core.eitr_guest_value, IGBState, IGB_INTR_NUM), + + VMSTATE_STRUCT_ARRAY(core.tx, IGBState, IGB_NUM_QUEUES, 0, + igb_vmstate_tx, struct igb_tx), + + VMSTATE_INT64(core.timadj, IGBState), + + VMSTATE_END_OF_LIST() + } +}; + +static Property igb_properties[] = { + DEFINE_NIC_PROPERTIES(IGBState, conf), + DEFINE_PROP_END_OF_LIST(), +}; + +static void igb_class_init(ObjectClass *class, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(class); + ResettableClass *rc = RESETTABLE_CLASS(class); + PCIDeviceClass *c = PCI_DEVICE_CLASS(class); + + c->realize = igb_pci_realize; + c->exit = igb_pci_uninit; + c->vendor_id = PCI_VENDOR_ID_INTEL; + c->device_id = E1000_DEV_ID_82576; + c->revision = 1; + c->class_id = PCI_CLASS_NETWORK_ETHERNET; + + rc->phases.hold = igb_qdev_reset_hold; + + dc->desc = "Intel 82576 Gigabit Ethernet Controller"; + dc->vmsd = &igb_vmstate; + + device_class_set_props(dc, igb_properties); + set_bit(DEVICE_CATEGORY_NETWORK, dc->categories); +} + +static void igb_instance_init(Object *obj) +{ + IGBState *s = IGB(obj); + device_add_bootindex_property(obj, &s->conf.bootindex, + "bootindex", "/ethernet-phy@0", + DEVICE(obj)); +} + +static const TypeInfo igb_info = { + .name = TYPE_IGB, + .parent = TYPE_PCI_DEVICE, + .instance_size = sizeof(IGBState), + .class_init = igb_class_init, + .instance_init = igb_instance_init, + .interfaces = (InterfaceInfo[]) { + { INTERFACE_PCIE_DEVICE }, + { } + }, +}; + +static void igb_register_types(void) +{ + type_register_static(&igb_info); +} + +type_init(igb_register_types) diff --git a/hw/net/igb_common.h b/hw/net/igb_common.h new file mode 100644 index 
0000000000..69ac490f75 --- /dev/null +++ b/hw/net/igb_common.h @@ -0,0 +1,146 @@ +/* + * QEMU igb emulation - shared definitions + * + * Copyright (c) 2020-2023 Red Hat, Inc. + * Copyright (c) 2008 Qumranet + * + * Based on work done by: + * Nir Peleg, Tutis Systems Ltd. for Qumranet Inc. + * Copyright (c) 2007 Dan Aloni + * Copyright (c) 2004 Antony T Curtis + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. 
+ */ + +#ifndef HW_NET_IGB_COMMON_H +#define HW_NET_IGB_COMMON_H + +#include "igb_regs.h" + +#define defreg(x) x = (E1000_##x >> 2) +#define defreg_indexed(x, i) x##i = (E1000_##x(i) >> 2) +#define defreg_indexeda(x, i) x##i##_A = (E1000_##x##_A(i) >> 2) + +#define defregd(x) defreg_indexed(x, 0), defreg_indexed(x, 1), \ + defreg_indexed(x, 2), defreg_indexed(x, 3), \ + defreg_indexed(x, 4), defreg_indexed(x, 5), \ + defreg_indexed(x, 6), defreg_indexed(x, 7), \ + defreg_indexed(x, 8), defreg_indexed(x, 9), \ + defreg_indexed(x, 10), defreg_indexed(x, 11), \ + defreg_indexed(x, 12), defreg_indexed(x, 13), \ + defreg_indexed(x, 14), defreg_indexed(x, 15), \ + defreg_indexeda(x, 0), defreg_indexeda(x, 1), \ + defreg_indexeda(x, 2), defreg_indexeda(x, 3) + +#define defregv(x) defreg_indexed(x, 0), defreg_indexed(x, 1), \ + defreg_indexed(x, 2), defreg_indexed(x, 3), \ + defreg_indexed(x, 4), defreg_indexed(x, 5), \ + defreg_indexed(x, 6), defreg_indexed(x, 7) + +enum { + defreg(CTRL), defreg(EECD), defreg(EERD), defreg(GPRC), + defreg(GPTC), defreg(ICR), defreg(ICS), defreg(IMC), + defreg(IMS), defreg(LEDCTL), defreg(MANC), defreg(MDIC), + defreg(MPC), defreg(RCTL), + defreg(STATUS), defreg(SWSM), defreg(TCTL), + defreg(TORH), defreg(TORL), defreg(TOTH), + defreg(TOTL), defreg(TPR), defreg(TPT), + defreg(WUFC), defreg(RA), defreg(MTA), defreg(CRCERRS), + defreg(VFTA), defreg(VET), + defreg(SCC), defreg(ECOL), + defreg(MCC), defreg(LATECOL), defreg(COLC), defreg(DC), + defreg(TNCRS), defreg(RLEC), + defreg(XONRXC), defreg(XONTXC), defreg(XOFFRXC), defreg(XOFFTXC), + defreg(FCRUC), defreg(TDFH), defreg(TDFT), + defreg(TDFHS), defreg(TDFTS), defreg(TDFPC), defreg(WUC), + defreg(WUS), defreg(RDFH), + defreg(RDFT), defreg(RDFHS), defreg(RDFTS), defreg(RDFPC), + defreg(IPAV), defreg(IP4AT), defreg(IP6AT), + defreg(WUPM), defreg(FFMT), + defreg(IAM), + defreg(GCR), defreg(TIMINCA), defreg(EIAC), defreg(CTRL_EXT), + defreg(IVAR0), defreg(MANC2H), + defreg(MFVAL), 
defreg(MDEF), defreg(FACTPS), defreg(FTFT), + defreg(RUC), defreg(ROC), defreg(RFC), defreg(RJC), + defreg(PRC64), defreg(PRC127), defreg(PRC255), defreg(PRC511), + defreg(PRC1023), defreg(PRC1522), defreg(PTC64), defreg(PTC127), + defreg(PTC255), defreg(PTC511), defreg(PTC1023), defreg(PTC1522), + defreg(GORCL), defreg(GORCH), defreg(GOTCL), defreg(GOTCH), + defreg(RNBC), defreg(BPRC), defreg(MPRC), defreg(RFCTL), + defreg(MPTC), defreg(BPTC), + defreg(IAC), defreg(MGTPRC), defreg(MGTPDC), defreg(MGTPTC), + defreg(TSCTC), defreg(RXCSUM), defreg(FUNCTAG), defreg(GSCL_1), + defreg(GSCL_2), defreg(GSCL_3), defreg(GSCL_4), defreg(GSCN_0), + defreg(GSCN_1), defreg(GSCN_2), defreg(GSCN_3), + defreg_indexed(EITR, 0), + defreg(MRQC), defreg(RETA), defreg(RSSRK), + defreg(PBACLR), defreg(FCAL), defreg(FCAH), defreg(FCT), + defreg(FCRTH), defreg(FCRTL), defreg(FCTTV), defreg(FCRTV), + defreg(FLA), defreg(FLOP), + defreg(MAVTV0), defreg(MAVTV1), defreg(MAVTV2), defreg(MAVTV3), + defreg(TXSTMPL), defreg(TXSTMPH), defreg(SYSTIML), defreg(SYSTIMH), + defreg(TIMADJL), defreg(TIMADJH), + defreg(RXSTMPH), defreg(RXSTMPL), defreg(RXSATRL), defreg(RXSATRH), + defreg(TIPG), + defreg(CTRL_DUP), + defreg(EEMNGCTL), + defreg(EEMNGDATA), + defreg(FLMNGCTL), + defreg(FLMNGDATA), + defreg(FLMNGCNT), + defreg(TSYNCRXCTL), + defreg(TSYNCTXCTL), + defreg(RLPML), + defreg(UTA), + + /* Aliases */ + defreg(RDFH_A), defreg(RDFT_A), defreg(TDFH_A), defreg(TDFT_A), + defreg(RA_A), defreg(VFTA_A), defreg(FCRTL_A), + + /* Additional regs used by IGB */ + defreg(FWSM), defreg(SW_FW_SYNC), + + defreg(EICS), defreg(EIMS), defreg(EIMC), defreg(EIAM), + defreg(EICR), defreg(IVAR_MISC), defreg(GPIE), + + defreg(RXPBS), defregd(RDBAL), defregd(RDBAH), defregd(RDLEN), + defregd(SRRCTL), defregd(RDH), defregd(RDT), + defregd(RXDCTL), defregd(RXCTL), defregd(RQDPC), defreg(RA2), + + defreg(TXPBS), defreg(TCTL_EXT), defreg(DTXCTL), defreg(HTCBDPC), + defregd(TDBAL), defregd(TDBAH), defregd(TDLEN), defregd(TDH), 
+ defregd(TDT), defregd(TXDCTL), defregd(TXCTL), + defregd(TDWBAL), defregd(TDWBAH), + + defreg(VT_CTL), + + defregv(P2VMAILBOX), defregv(V2PMAILBOX), defreg(MBVFICR), defreg(MBVFIMR), + defreg(VFLRE), defreg(VFRE), defreg(VFTE), defreg(WVBR), + defreg(QDE), defreg(DTXSWC), defreg_indexed(VLVF, 0), + defregv(VMOLR), defreg(RPLOLR), defregv(VMBMEM), defregv(VMVIR), + + defregv(PVTCTRL), defregv(PVTEICS), defregv(PVTEIMS), defregv(PVTEIMC), + defregv(PVTEIAC), defregv(PVTEIAM), defregv(PVTEICR), defregv(PVFGPRC), + defregv(PVFGPTC), defregv(PVFGORC), defregv(PVFGOTC), defregv(PVFMPRC), + defregv(PVFGPRLBC), defregv(PVFGPTLBC), defregv(PVFGORLBC), defregv(PVFGOTLBC), + + defreg(MTA_A), + + defreg(VTIVAR), defreg(VTIVAR_MISC), +}; + +uint64_t igb_mmio_read(void *opaque, hwaddr addr, unsigned size); +void igb_mmio_write(void *opaque, hwaddr addr, uint64_t val, unsigned size); + +#endif diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c new file mode 100644 index 0000000000..a7c7bfdc75 --- /dev/null +++ b/hw/net/igb_core.c @@ -0,0 +1,4077 @@ +/* + * Core code for QEMU igb emulation + * + * Datasheet: + * https://www.intel.com/content/dam/www/public/us/en/documents/datasheets/82576eg-gbe-datasheet.pdf + * + * Copyright (c) 2020-2023 Red Hat, Inc. + * Copyright (c) 2015 Ravello Systems LTD (http://ravellosystems.com) + * Developed by Daynix Computing LTD (http://www.daynix.com) + * + * Authors: + * Akihiko Odaki <akihiko.odaki@daynix.com> + * Gal Hammmer <gal.hammer@sap.com> + * Marcel Apfelbaum <marcel.apfelbaum@gmail.com> + * Dmitry Fleytman <dmitry@daynix.com> + * Leonid Bloch <leonid@daynix.com> + * Yan Vugenfirer <yan@daynix.com> + * + * Based on work done by: + * Nir Peleg, Tutis Systems Ltd. for Qumranet Inc. 
+ * Copyright (c) 2008 Qumranet + * Based on work done by: + * Copyright (c) 2007 Dan Aloni + * Copyright (c) 2004 Antony T Curtis + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "qemu/osdep.h" +#include "qemu/log.h" +#include "net/net.h" +#include "net/tap.h" +#include "hw/net/mii.h" +#include "hw/pci/msi.h" +#include "hw/pci/msix.h" +#include "sysemu/runstate.h" + +#include "net_tx_pkt.h" +#include "net_rx_pkt.h" + +#include "igb_common.h" +#include "e1000x_common.h" +#include "igb_core.h" + +#include "trace.h" + +#define E1000E_MAX_TX_FRAGS (64) + +union e1000_rx_desc_union { + struct e1000_rx_desc legacy; + union e1000_adv_rx_desc adv; +}; + +typedef struct IGBTxPktVmdqCallbackContext { + IGBCore *core; + NetClientState *nc; +} IGBTxPktVmdqCallbackContext; + +static ssize_t +igb_receive_internal(IGBCore *core, const struct iovec *iov, int iovcnt, + bool has_vnet, bool *external_tx); + +static inline void +igb_set_interrupt_cause(IGBCore *core, uint32_t val); + +static void igb_update_interrupt_state(IGBCore *core); +static void igb_reset(IGBCore *core, bool sw); + +static inline void +igb_raise_legacy_irq(IGBCore *core) +{ + trace_e1000e_irq_legacy_notify(true); + e1000x_inc_reg_if_not_full(core->mac, IAC); + pci_set_irq(core->owner, 1); +} + +static inline void +igb_lower_legacy_irq(IGBCore *core) +{ + 
trace_e1000e_irq_legacy_notify(false); + pci_set_irq(core->owner, 0); +} + +static void igb_msix_notify(IGBCore *core, unsigned int vector) +{ + PCIDevice *dev = core->owner; + uint16_t vfn; + + vfn = 8 - (vector + 2) / IGBVF_MSIX_VEC_NUM; + if (vfn < pcie_sriov_num_vfs(core->owner)) { + dev = pcie_sriov_get_vf_at_index(core->owner, vfn); + assert(dev); + vector = (vector + 2) % IGBVF_MSIX_VEC_NUM; + } else if (vector >= IGB_MSIX_VEC_NUM) { + qemu_log_mask(LOG_GUEST_ERROR, + "igb: Tried to use vector unavailable for PF"); + return; + } + + msix_notify(dev, vector); +} + +static inline void +igb_intrmgr_rearm_timer(IGBIntrDelayTimer *timer) +{ + int64_t delay_ns = (int64_t) timer->core->mac[timer->delay_reg] * + timer->delay_resolution_ns; + + trace_e1000e_irq_rearm_timer(timer->delay_reg << 2, delay_ns); + + timer_mod(timer->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + delay_ns); + + timer->running = true; +} + +static void +igb_intmgr_timer_resume(IGBIntrDelayTimer *timer) +{ + if (timer->running) { + igb_intrmgr_rearm_timer(timer); + } +} + +static void +igb_intmgr_timer_pause(IGBIntrDelayTimer *timer) +{ + if (timer->running) { + timer_del(timer->timer); + } +} + +static void +igb_intrmgr_on_msix_throttling_timer(void *opaque) +{ + IGBIntrDelayTimer *timer = opaque; + int idx = timer - &timer->core->eitr[0]; + + timer->running = false; + + trace_e1000e_irq_msix_notify_postponed_vec(idx); + igb_msix_notify(timer->core, idx); +} + +static void +igb_intrmgr_initialize_all_timers(IGBCore *core, bool create) +{ + int i; + + for (i = 0; i < IGB_INTR_NUM; i++) { + core->eitr[i].core = core; + core->eitr[i].delay_reg = EITR0 + i; + core->eitr[i].delay_resolution_ns = E1000_INTR_DELAY_NS_RES; + } + + if (!create) { + return; + } + + for (i = 0; i < IGB_INTR_NUM; i++) { + core->eitr[i].timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, + igb_intrmgr_on_msix_throttling_timer, + &core->eitr[i]); + } +} + +static void +igb_intrmgr_resume(IGBCore *core) +{ + int i; + + for (i = 0; i 
< IGB_INTR_NUM; i++) { + igb_intmgr_timer_resume(&core->eitr[i]); + } +} + +static void +igb_intrmgr_pause(IGBCore *core) +{ + int i; + + for (i = 0; i < IGB_INTR_NUM; i++) { + igb_intmgr_timer_pause(&core->eitr[i]); + } +} + +static void +igb_intrmgr_reset(IGBCore *core) +{ + int i; + + for (i = 0; i < IGB_INTR_NUM; i++) { + if (core->eitr[i].running) { + timer_del(core->eitr[i].timer); + igb_intrmgr_on_msix_throttling_timer(&core->eitr[i]); + } + } +} + +static void +igb_intrmgr_pci_unint(IGBCore *core) +{ + int i; + + for (i = 0; i < IGB_INTR_NUM; i++) { + timer_free(core->eitr[i].timer); + } +} + +static void +igb_intrmgr_pci_realize(IGBCore *core) +{ + igb_intrmgr_initialize_all_timers(core, true); +} + +static inline bool +igb_rx_csum_enabled(IGBCore *core) +{ + return (core->mac[RXCSUM] & E1000_RXCSUM_PCSD) ? false : true; +} + +static inline bool +igb_rx_use_legacy_descriptor(IGBCore *core) +{ + /* + * TODO: If SRRCTL[n],DESCTYPE = 000b, the 82576 uses the legacy Rx + * descriptor. 
+ */ + return false; +} + +static inline bool +igb_rss_enabled(IGBCore *core) +{ + return (core->mac[MRQC] & 3) == E1000_MRQC_ENABLE_RSS_MQ && + !igb_rx_csum_enabled(core) && + !igb_rx_use_legacy_descriptor(core); +} + +typedef struct E1000E_RSSInfo_st { + bool enabled; + uint32_t hash; + uint32_t queue; + uint32_t type; +} E1000E_RSSInfo; + +static uint32_t +igb_rss_get_hash_type(IGBCore *core, struct NetRxPkt *pkt) +{ + bool hasip4, hasip6; + EthL4HdrProto l4hdr_proto; + + assert(igb_rss_enabled(core)); + + net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto); + + if (hasip4) { + trace_e1000e_rx_rss_ip4(l4hdr_proto, core->mac[MRQC], + E1000_MRQC_EN_TCPIPV4(core->mac[MRQC]), + E1000_MRQC_EN_IPV4(core->mac[MRQC])); + + if (l4hdr_proto == ETH_L4_HDR_PROTO_TCP && + E1000_MRQC_EN_TCPIPV4(core->mac[MRQC])) { + return E1000_MRQ_RSS_TYPE_IPV4TCP; + } + + if (E1000_MRQC_EN_IPV4(core->mac[MRQC])) { + return E1000_MRQ_RSS_TYPE_IPV4; + } + } else if (hasip6) { + eth_ip6_hdr_info *ip6info = net_rx_pkt_get_ip6_info(pkt); + + bool ex_dis = core->mac[RFCTL] & E1000_RFCTL_IPV6_EX_DIS; + bool new_ex_dis = core->mac[RFCTL] & E1000_RFCTL_NEW_IPV6_EXT_DIS; + + /* + * Following two traces must not be combined because resulting + * event will have 11 arguments totally and some trace backends + * (at least "ust") have limitation of maximum 10 arguments per + * event. Events with more arguments fail to compile for + * backends like these. 
+ */ + trace_e1000e_rx_rss_ip6_rfctl(core->mac[RFCTL]); + trace_e1000e_rx_rss_ip6(ex_dis, new_ex_dis, l4hdr_proto, + ip6info->has_ext_hdrs, + ip6info->rss_ex_dst_valid, + ip6info->rss_ex_src_valid, + core->mac[MRQC], + E1000_MRQC_EN_TCPIPV6(core->mac[MRQC]), + E1000_MRQC_EN_IPV6EX(core->mac[MRQC]), + E1000_MRQC_EN_IPV6(core->mac[MRQC])); + + if ((!ex_dis || !ip6info->has_ext_hdrs) && + (!new_ex_dis || !(ip6info->rss_ex_dst_valid || + ip6info->rss_ex_src_valid))) { + + if (l4hdr_proto == ETH_L4_HDR_PROTO_TCP && + E1000_MRQC_EN_TCPIPV6(core->mac[MRQC])) { + return E1000_MRQ_RSS_TYPE_IPV6TCP; + } + + if (E1000_MRQC_EN_IPV6EX(core->mac[MRQC])) { + return E1000_MRQ_RSS_TYPE_IPV6EX; + } + + } + + if (E1000_MRQC_EN_IPV6(core->mac[MRQC])) { + return E1000_MRQ_RSS_TYPE_IPV6; + } + + } + + return E1000_MRQ_RSS_TYPE_NONE; +} + +static uint32_t +igb_rss_calc_hash(IGBCore *core, struct NetRxPkt *pkt, E1000E_RSSInfo *info) +{ + NetRxPktRssType type; + + assert(igb_rss_enabled(core)); + + switch (info->type) { + case E1000_MRQ_RSS_TYPE_IPV4: + type = NetPktRssIpV4; + break; + case E1000_MRQ_RSS_TYPE_IPV4TCP: + type = NetPktRssIpV4Tcp; + break; + case E1000_MRQ_RSS_TYPE_IPV6TCP: + type = NetPktRssIpV6TcpEx; + break; + case E1000_MRQ_RSS_TYPE_IPV6: + type = NetPktRssIpV6; + break; + case E1000_MRQ_RSS_TYPE_IPV6EX: + type = NetPktRssIpV6Ex; + break; + default: + assert(false); + return 0; + } + + return net_rx_pkt_calc_rss_hash(pkt, type, (uint8_t *) &core->mac[RSSRK]); +} + +static void +igb_rss_parse_packet(IGBCore *core, struct NetRxPkt *pkt, bool tx, + E1000E_RSSInfo *info) +{ + trace_e1000e_rx_rss_started(); + + if (tx || !igb_rss_enabled(core)) { + info->enabled = false; + info->hash = 0; + info->queue = 0; + info->type = 0; + trace_e1000e_rx_rss_disabled(); + return; + } + + info->enabled = true; + + info->type = igb_rss_get_hash_type(core, pkt); + + trace_e1000e_rx_rss_type(info->type); + + if (info->type == E1000_MRQ_RSS_TYPE_NONE) { + info->hash = 0; + info->queue = 0; + 
return; + } + + info->hash = igb_rss_calc_hash(core, pkt, info); + info->queue = E1000_RSS_QUEUE(&core->mac[RETA], info->hash); +} + +static bool +igb_setup_tx_offloads(IGBCore *core, struct igb_tx *tx) +{ + if (tx->tse) { + if (!net_tx_pkt_build_vheader(tx->tx_pkt, true, true, tx->mss)) { + return false; + } + + net_tx_pkt_update_ip_checksums(tx->tx_pkt); + e1000x_inc_reg_if_not_full(core->mac, TSCTC); + return true; + } + + if (tx->txsm) { + if (!net_tx_pkt_build_vheader(tx->tx_pkt, false, true, 0)) { + return false; + } + } + + if (tx->ixsm) { + net_tx_pkt_update_ip_hdr_checksum(tx->tx_pkt); + } + + return true; +} + +static void igb_tx_pkt_mac_callback(void *core, + const struct iovec *iov, + int iovcnt, + const struct iovec *virt_iov, + int virt_iovcnt) +{ + igb_receive_internal(core, virt_iov, virt_iovcnt, true, NULL); +} + +static void igb_tx_pkt_vmdq_callback(void *opaque, + const struct iovec *iov, + int iovcnt, + const struct iovec *virt_iov, + int virt_iovcnt) +{ + IGBTxPktVmdqCallbackContext *context = opaque; + bool external_tx; + + igb_receive_internal(context->core, virt_iov, virt_iovcnt, true, + &external_tx); + + if (external_tx) { + if (context->core->has_vnet) { + qemu_sendv_packet(context->nc, virt_iov, virt_iovcnt); + } else { + qemu_sendv_packet(context->nc, iov, iovcnt); + } + } +} + +/* TX Packets Switching (7.10.3.6) */ +static bool igb_tx_pkt_switch(IGBCore *core, struct igb_tx *tx, + NetClientState *nc) +{ + IGBTxPktVmdqCallbackContext context; + + /* TX switching is only used to serve VM to VM traffic. */ + if (!(core->mac[MRQC] & 1)) { + goto send_out; + } + + /* TX switching requires DTXSWC.Loopback_en bit enabled. 
*/ + if (!(core->mac[DTXSWC] & E1000_DTXSWC_VMDQ_LOOPBACK_EN)) { + goto send_out; + } + + context.core = core; + context.nc = nc; + + return net_tx_pkt_send_custom(tx->tx_pkt, false, + igb_tx_pkt_vmdq_callback, &context); + +send_out: + return net_tx_pkt_send(tx->tx_pkt, nc); +} + +static bool +igb_tx_pkt_send(IGBCore *core, struct igb_tx *tx, int queue_index) +{ + int target_queue = MIN(core->max_queue_num, queue_index); + NetClientState *queue = qemu_get_subqueue(core->owner_nic, target_queue); + + if (!igb_setup_tx_offloads(core, tx)) { + return false; + } + + net_tx_pkt_dump(tx->tx_pkt); + + if ((core->phy[MII_BMCR] & MII_BMCR_LOOPBACK) || + ((core->mac[RCTL] & E1000_RCTL_LBM_MAC) == E1000_RCTL_LBM_MAC)) { + return net_tx_pkt_send_custom(tx->tx_pkt, false, + igb_tx_pkt_mac_callback, core); + } else { + return igb_tx_pkt_switch(core, tx, queue); + } +} + +static void +igb_on_tx_done_update_stats(IGBCore *core, struct NetTxPkt *tx_pkt) +{ + static const int PTCregs[6] = { PTC64, PTC127, PTC255, PTC511, + PTC1023, PTC1522 }; + + size_t tot_len = net_tx_pkt_get_total_len(tx_pkt) + 4; + + e1000x_increase_size_stats(core->mac, PTCregs, tot_len); + e1000x_inc_reg_if_not_full(core->mac, TPT); + e1000x_grow_8reg_if_not_full(core->mac, TOTL, tot_len); + + switch (net_tx_pkt_get_packet_type(tx_pkt)) { + case ETH_PKT_BCAST: + e1000x_inc_reg_if_not_full(core->mac, BPTC); + break; + case ETH_PKT_MCAST: + e1000x_inc_reg_if_not_full(core->mac, MPTC); + break; + case ETH_PKT_UCAST: + break; + default: + g_assert_not_reached(); + } + + core->mac[GPTC] = core->mac[TPT]; + core->mac[GOTCL] = core->mac[TOTL]; + core->mac[GOTCH] = core->mac[TOTH]; +} + +static void +igb_process_tx_desc(IGBCore *core, + struct igb_tx *tx, + union e1000_adv_tx_desc *tx_desc, + int queue_index) +{ + struct e1000_adv_tx_context_desc *tx_ctx_desc; + uint32_t cmd_type_len; + uint32_t olinfo_status; + uint64_t buffer_addr; + uint16_t length; + + cmd_type_len = le32_to_cpu(tx_desc->read.cmd_type_len); + + 
if (cmd_type_len & E1000_ADVTXD_DCMD_DEXT) { + if ((cmd_type_len & E1000_ADVTXD_DTYP_DATA) == + E1000_ADVTXD_DTYP_DATA) { + /* advanced transmit data descriptor */ + if (tx->first) { + olinfo_status = le32_to_cpu(tx_desc->read.olinfo_status); + + tx->tse = !!(cmd_type_len & E1000_ADVTXD_DCMD_TSE); + tx->ixsm = !!(olinfo_status & E1000_ADVTXD_POTS_IXSM); + tx->txsm = !!(olinfo_status & E1000_ADVTXD_POTS_TXSM); + + tx->first = false; + } + } else if ((cmd_type_len & E1000_ADVTXD_DTYP_CTXT) == + E1000_ADVTXD_DTYP_CTXT) { + /* advanced transmit context descriptor */ + tx_ctx_desc = (struct e1000_adv_tx_context_desc *)tx_desc; + tx->vlan = le32_to_cpu(tx_ctx_desc->vlan_macip_lens) >> 16; + tx->mss = le32_to_cpu(tx_ctx_desc->mss_l4len_idx) >> 16; + return; + } else { + /* unknown descriptor type */ + return; + } + } else { + /* legacy descriptor */ + + /* TODO: Implement a support for legacy descriptors (7.2.2.1). */ + } + + buffer_addr = le64_to_cpu(tx_desc->read.buffer_addr); + length = cmd_type_len & 0xFFFF; + + if (!tx->skip_cp) { + if (!net_tx_pkt_add_raw_fragment(tx->tx_pkt, buffer_addr, length)) { + tx->skip_cp = true; + } + } + + if (cmd_type_len & E1000_TXD_CMD_EOP) { + if (!tx->skip_cp && net_tx_pkt_parse(tx->tx_pkt)) { + if (cmd_type_len & E1000_TXD_CMD_VLE) { + net_tx_pkt_setup_vlan_header_ex(tx->tx_pkt, tx->vlan, + core->mac[VET] & 0xffff); + } + if (igb_tx_pkt_send(core, tx, queue_index)) { + igb_on_tx_done_update_stats(core, tx->tx_pkt); + } + } + + tx->first = true; + tx->skip_cp = false; + net_tx_pkt_reset(tx->tx_pkt); + } +} + +static uint32_t igb_tx_wb_eic(IGBCore *core, int queue_idx) +{ + uint32_t n, ent = 0; + + n = igb_ivar_entry_tx(queue_idx); + ent = (core->mac[IVAR0 + n / 4] >> (8 * (n % 4))) & 0xff; + + return (ent & E1000_IVAR_VALID) ? 
BIT(ent & 0x1f) : 0; +} + +static uint32_t igb_rx_wb_eic(IGBCore *core, int queue_idx) +{ + uint32_t n, ent = 0; + + n = igb_ivar_entry_rx(queue_idx); + ent = (core->mac[IVAR0 + n / 4] >> (8 * (n % 4))) & 0xff; + + return (ent & E1000_IVAR_VALID) ? BIT(ent & 0x1f) : 0; +} + +typedef struct E1000E_RingInfo_st { + int dbah; + int dbal; + int dlen; + int dh; + int dt; + int idx; +} E1000E_RingInfo; + +static inline bool +igb_ring_empty(IGBCore *core, const E1000E_RingInfo *r) +{ + return core->mac[r->dh] == core->mac[r->dt] || + core->mac[r->dt] >= core->mac[r->dlen] / E1000_RING_DESC_LEN; +} + +static inline uint64_t +igb_ring_base(IGBCore *core, const E1000E_RingInfo *r) +{ + uint64_t bah = core->mac[r->dbah]; + uint64_t bal = core->mac[r->dbal]; + + return (bah << 32) + bal; +} + +static inline uint64_t +igb_ring_head_descr(IGBCore *core, const E1000E_RingInfo *r) +{ + return igb_ring_base(core, r) + E1000_RING_DESC_LEN * core->mac[r->dh]; +} + +static inline void +igb_ring_advance(IGBCore *core, const E1000E_RingInfo *r, uint32_t count) +{ + core->mac[r->dh] += count; + + if (core->mac[r->dh] * E1000_RING_DESC_LEN >= core->mac[r->dlen]) { + core->mac[r->dh] = 0; + } +} + +static inline uint32_t +igb_ring_free_descr_num(IGBCore *core, const E1000E_RingInfo *r) +{ + trace_e1000e_ring_free_space(r->idx, core->mac[r->dlen], + core->mac[r->dh], core->mac[r->dt]); + + if (core->mac[r->dh] <= core->mac[r->dt]) { + return core->mac[r->dt] - core->mac[r->dh]; + } + + if (core->mac[r->dh] > core->mac[r->dt]) { + return core->mac[r->dlen] / E1000_RING_DESC_LEN + + core->mac[r->dt] - core->mac[r->dh]; + } + + g_assert_not_reached(); + return 0; +} + +static inline bool +igb_ring_enabled(IGBCore *core, const E1000E_RingInfo *r) +{ + return core->mac[r->dlen] > 0; +} + +typedef struct IGB_TxRing_st { + const E1000E_RingInfo *i; + struct igb_tx *tx; +} IGB_TxRing; + +static inline int +igb_mq_queue_idx(int base_reg_idx, int reg_idx) +{ + return (reg_idx - base_reg_idx) / 16; +} 
+ +static inline void +igb_tx_ring_init(IGBCore *core, IGB_TxRing *txr, int idx) +{ + static const E1000E_RingInfo i[IGB_NUM_QUEUES] = { + { TDBAH0, TDBAL0, TDLEN0, TDH0, TDT0, 0 }, + { TDBAH1, TDBAL1, TDLEN1, TDH1, TDT1, 1 }, + { TDBAH2, TDBAL2, TDLEN2, TDH2, TDT2, 2 }, + { TDBAH3, TDBAL3, TDLEN3, TDH3, TDT3, 3 }, + { TDBAH4, TDBAL4, TDLEN4, TDH4, TDT4, 4 }, + { TDBAH5, TDBAL5, TDLEN5, TDH5, TDT5, 5 }, + { TDBAH6, TDBAL6, TDLEN6, TDH6, TDT6, 6 }, + { TDBAH7, TDBAL7, TDLEN7, TDH7, TDT7, 7 }, + { TDBAH8, TDBAL8, TDLEN8, TDH8, TDT8, 8 }, + { TDBAH9, TDBAL9, TDLEN9, TDH9, TDT9, 9 }, + { TDBAH10, TDBAL10, TDLEN10, TDH10, TDT10, 10 }, + { TDBAH11, TDBAL11, TDLEN11, TDH11, TDT11, 11 }, + { TDBAH12, TDBAL12, TDLEN12, TDH12, TDT12, 12 }, + { TDBAH13, TDBAL13, TDLEN13, TDH13, TDT13, 13 }, + { TDBAH14, TDBAL14, TDLEN14, TDH14, TDT14, 14 }, + { TDBAH15, TDBAL15, TDLEN15, TDH15, TDT15, 15 } + }; + + assert(idx < ARRAY_SIZE(i)); + + txr->i = &i[idx]; + txr->tx = &core->tx[idx]; +} + +typedef struct E1000E_RxRing_st { + const E1000E_RingInfo *i; +} E1000E_RxRing; + +static inline void +igb_rx_ring_init(IGBCore *core, E1000E_RxRing *rxr, int idx) +{ + static const E1000E_RingInfo i[IGB_NUM_QUEUES] = { + { RDBAH0, RDBAL0, RDLEN0, RDH0, RDT0, 0 }, + { RDBAH1, RDBAL1, RDLEN1, RDH1, RDT1, 1 }, + { RDBAH2, RDBAL2, RDLEN2, RDH2, RDT2, 2 }, + { RDBAH3, RDBAL3, RDLEN3, RDH3, RDT3, 3 }, + { RDBAH4, RDBAL4, RDLEN4, RDH4, RDT4, 4 }, + { RDBAH5, RDBAL5, RDLEN5, RDH5, RDT5, 5 }, + { RDBAH6, RDBAL6, RDLEN6, RDH6, RDT6, 6 }, + { RDBAH7, RDBAL7, RDLEN7, RDH7, RDT7, 7 }, + { RDBAH8, RDBAL8, RDLEN8, RDH8, RDT8, 8 }, + { RDBAH9, RDBAL9, RDLEN9, RDH9, RDT9, 9 }, + { RDBAH10, RDBAL10, RDLEN10, RDH10, RDT10, 10 }, + { RDBAH11, RDBAL11, RDLEN11, RDH11, RDT11, 11 }, + { RDBAH12, RDBAL12, RDLEN12, RDH12, RDT12, 12 }, + { RDBAH13, RDBAL13, RDLEN13, RDH13, RDT13, 13 }, + { RDBAH14, RDBAL14, RDLEN14, RDH14, RDT14, 14 }, + { RDBAH15, RDBAL15, RDLEN15, RDH15, RDT15, 15 } + }; + + assert(idx < ARRAY_SIZE(i)); 
+ + rxr->i = &i[idx]; +} + +static uint32_t +igb_txdesc_writeback(IGBCore *core, dma_addr_t base, + union e1000_adv_tx_desc *tx_desc, + const E1000E_RingInfo *txi) +{ + PCIDevice *d; + uint32_t cmd_type_len = le32_to_cpu(tx_desc->read.cmd_type_len); + uint64_t tdwba; + + tdwba = core->mac[E1000_TDWBAL(txi->idx) >> 2]; + tdwba |= (uint64_t)core->mac[E1000_TDWBAH(txi->idx) >> 2] << 32; + + if (!(cmd_type_len & E1000_TXD_CMD_RS)) { + return 0; + } + + d = pcie_sriov_get_vf_at_index(core->owner, txi->idx % 8); + if (!d) { + d = core->owner; + } + + if (tdwba & 1) { + uint32_t buffer = cpu_to_le32(core->mac[txi->dh]); + pci_dma_write(d, tdwba & ~3, &buffer, sizeof(buffer)); + } else { + uint32_t status = le32_to_cpu(tx_desc->wb.status) | E1000_TXD_STAT_DD; + + tx_desc->wb.status = cpu_to_le32(status); + pci_dma_write(d, base + offsetof(union e1000_adv_tx_desc, wb), + &tx_desc->wb, sizeof(tx_desc->wb)); + } + + return igb_tx_wb_eic(core, txi->idx); +} + +static void +igb_start_xmit(IGBCore *core, const IGB_TxRing *txr) +{ + PCIDevice *d; + dma_addr_t base; + union e1000_adv_tx_desc desc; + const E1000E_RingInfo *txi = txr->i; + uint32_t eic = 0; + + /* TODO: check if the queue itself is enabled too. 
*/ + if (!(core->mac[TCTL] & E1000_TCTL_EN)) { + trace_e1000e_tx_disabled(); + return; + } + + d = pcie_sriov_get_vf_at_index(core->owner, txi->idx % 8); + if (!d) { + d = core->owner; + } + + while (!igb_ring_empty(core, txi)) { + base = igb_ring_head_descr(core, txi); + + pci_dma_read(d, base, &desc, sizeof(desc)); + + trace_e1000e_tx_descr((void *)(intptr_t)desc.read.buffer_addr, + desc.read.cmd_type_len, desc.wb.status); + + igb_process_tx_desc(core, txr->tx, &desc, txi->idx); + igb_ring_advance(core, txi, 1); + eic |= igb_txdesc_writeback(core, base, &desc, txi); + } + + if (eic) { + core->mac[EICR] |= eic; + igb_set_interrupt_cause(core, E1000_ICR_TXDW); + } +} + +static uint32_t +igb_rxbufsize(IGBCore *core, const E1000E_RingInfo *r) +{ + uint32_t srrctl = core->mac[E1000_SRRCTL(r->idx) >> 2]; + uint32_t bsizepkt = srrctl & E1000_SRRCTL_BSIZEPKT_MASK; + if (bsizepkt) { + return bsizepkt << E1000_SRRCTL_BSIZEPKT_SHIFT; + } + + return e1000x_rxbufsize(core->mac[RCTL]); +} + +static bool +igb_has_rxbufs(IGBCore *core, const E1000E_RingInfo *r, size_t total_size) +{ + uint32_t bufs = igb_ring_free_descr_num(core, r); + uint32_t bufsize = igb_rxbufsize(core, r); + + trace_e1000e_rx_has_buffers(r->idx, bufs, total_size, bufsize); + + return total_size <= bufs / (core->rx_desc_len / E1000_MIN_RX_DESC_LEN) * + bufsize; +} + +void +igb_start_recv(IGBCore *core) +{ + int i; + + trace_e1000e_rx_start_recv(); + + for (i = 0; i <= core->max_queue_num; i++) { + qemu_flush_queued_packets(qemu_get_subqueue(core->owner_nic, i)); + } +} + +bool +igb_can_receive(IGBCore *core) +{ + int i; + + if (!e1000x_rx_ready(core->owner, core->mac)) { + return false; + } + + for (i = 0; i < IGB_NUM_QUEUES; i++) { + E1000E_RxRing rxr; + + igb_rx_ring_init(core, &rxr, i); + if (igb_ring_enabled(core, rxr.i) && igb_has_rxbufs(core, rxr.i, 1)) { + trace_e1000e_rx_can_recv(); + return true; + } + } + + trace_e1000e_rx_can_recv_rings_full(); + return false; +} + +ssize_t +igb_receive(IGBCore 
*core, const uint8_t *buf, size_t size) +{ + const struct iovec iov = { + .iov_base = (uint8_t *)buf, + .iov_len = size + }; + + return igb_receive_iov(core, &iov, 1); +} + +static inline bool +igb_rx_l3_cso_enabled(IGBCore *core) +{ + return !!(core->mac[RXCSUM] & E1000_RXCSUM_IPOFLD); +} + +static inline bool +igb_rx_l4_cso_enabled(IGBCore *core) +{ + return !!(core->mac[RXCSUM] & E1000_RXCSUM_TUOFLD); +} + +static uint16_t igb_receive_assign(IGBCore *core, const struct eth_header *ehdr, + E1000E_RSSInfo *rss_info, bool *external_tx) +{ + static const int ta_shift[] = { 4, 3, 2, 0 }; + uint32_t f, ra[2], *macp, rctl = core->mac[RCTL]; + uint16_t queues = 0; + uint16_t vid = lduw_be_p(&PKT_GET_VLAN_HDR(ehdr)->h_tci) & VLAN_VID_MASK; + bool accepted = false; + int i; + + memset(rss_info, 0, sizeof(E1000E_RSSInfo)); + + if (external_tx) { + *external_tx = true; + } + + if (e1000x_is_vlan_packet(ehdr, core->mac[VET] & 0xffff) && + e1000x_vlan_rx_filter_enabled(core->mac)) { + uint32_t vfta = + ldl_le_p((uint32_t *)(core->mac + VFTA) + + ((vid >> E1000_VFTA_ENTRY_SHIFT) & E1000_VFTA_ENTRY_MASK)); + if ((vfta & (1 << (vid & E1000_VFTA_ENTRY_BIT_SHIFT_MASK))) == 0) { + trace_e1000e_rx_flt_vlan_mismatch(vid); + return queues; + } else { + trace_e1000e_rx_flt_vlan_match(vid); + } + } + + if (core->mac[MRQC] & 1) { + if (is_broadcast_ether_addr(ehdr->h_dest)) { + for (i = 0; i < 8; i++) { + if (core->mac[VMOLR0 + i] & E1000_VMOLR_BAM) { + queues |= BIT(i); + } + } + } else { + for (macp = core->mac + RA; macp < core->mac + RA + 32; macp += 2) { + if (!(macp[1] & E1000_RAH_AV)) { + continue; + } + ra[0] = cpu_to_le32(macp[0]); + ra[1] = cpu_to_le32(macp[1]); + if (!memcmp(ehdr->h_dest, (uint8_t *)ra, ETH_ALEN)) { + queues |= (macp[1] & E1000_RAH_POOL_MASK) / E1000_RAH_POOL_1; + } + } + + for (macp = core->mac + RA2; macp < core->mac + RA2 + 16; macp += 2) { + if (!(macp[1] & E1000_RAH_AV)) { + continue; + } + ra[0] = cpu_to_le32(macp[0]); + ra[1] = cpu_to_le32(macp[1]); + 
if (!memcmp(ehdr->h_dest, (uint8_t *)ra, ETH_ALEN)) { + queues |= (macp[1] & E1000_RAH_POOL_MASK) / E1000_RAH_POOL_1; + } + } + + if (!queues) { + macp = core->mac + (is_multicast_ether_addr(ehdr->h_dest) ? MTA : UTA); + + f = ta_shift[(rctl >> E1000_RCTL_MO_SHIFT) & 3]; + f = (((ehdr->h_dest[5] << 8) | ehdr->h_dest[4]) >> f) & 0xfff; + if (macp[f >> 5] & (1 << (f & 0x1f))) { + for (i = 0; i < 8; i++) { + if (core->mac[VMOLR0 + i] & E1000_VMOLR_ROMPE) { + queues |= BIT(i); + } + } + } + } else if (is_unicast_ether_addr(ehdr->h_dest) && external_tx) { + *external_tx = false; + } + } + + if (e1000x_vlan_rx_filter_enabled(core->mac)) { + uint16_t mask = 0; + + if (e1000x_is_vlan_packet(ehdr, core->mac[VET] & 0xffff)) { + for (i = 0; i < E1000_VLVF_ARRAY_SIZE; i++) { + if ((core->mac[VLVF0 + i] & E1000_VLVF_VLANID_MASK) == vid && + (core->mac[VLVF0 + i] & E1000_VLVF_VLANID_ENABLE)) { + uint32_t poolsel = core->mac[VLVF0 + i] & E1000_VLVF_POOLSEL_MASK; + mask |= poolsel >> E1000_VLVF_POOLSEL_SHIFT; + } + } + } else { + for (i = 0; i < 8; i++) { + if (core->mac[VMOLR0 + i] & E1000_VMOLR_AUPE) { + mask |= BIT(i); + } + } + } + + queues &= mask; + } + + if (is_unicast_ether_addr(ehdr->h_dest) && !queues && !external_tx && + !(core->mac[VT_CTL] & E1000_VT_CTL_DISABLE_DEF_POOL)) { + uint32_t def_pl = core->mac[VT_CTL] & E1000_VT_CTL_DEFAULT_POOL_MASK; + queues = BIT(def_pl >> E1000_VT_CTL_DEFAULT_POOL_SHIFT); + } + + igb_rss_parse_packet(core, core->rx_pkt, external_tx != NULL, rss_info); + if (rss_info->queue & 1) { + queues <<= 8; + } + } else { + switch (net_rx_pkt_get_packet_type(core->rx_pkt)) { + case ETH_PKT_UCAST: + if (rctl & E1000_RCTL_UPE) { + accepted = true; /* promiscuous ucast */ + } + break; + + case ETH_PKT_BCAST: + if (rctl & E1000_RCTL_BAM) { + accepted = true; /* broadcast enabled */ + } + break; + + case ETH_PKT_MCAST: + if (rctl & E1000_RCTL_MPE) { + accepted = true; /* promiscuous mcast */ + } + break; + + default: + g_assert_not_reached(); + } + + if 
(!accepted) { + accepted = e1000x_rx_group_filter(core->mac, ehdr->h_dest); + } + + if (!accepted) { + for (macp = core->mac + RA2; macp < core->mac + RA2 + 16; macp += 2) { + if (!(macp[1] & E1000_RAH_AV)) { + continue; + } + ra[0] = cpu_to_le32(macp[0]); + ra[1] = cpu_to_le32(macp[1]); + if (!memcmp(ehdr->h_dest, (uint8_t *)ra, ETH_ALEN)) { + trace_e1000x_rx_flt_ucast_match((int)(macp - core->mac - RA2) / 2, + MAC_ARG(ehdr->h_dest)); + + accepted = true; + break; + } + } + } + + if (accepted) { + igb_rss_parse_packet(core, core->rx_pkt, false, rss_info); + queues = BIT(rss_info->queue); + } + } + + return queues; +} + +static inline void +igb_read_lgcy_rx_descr(IGBCore *core, struct e1000_rx_desc *desc, + hwaddr *buff_addr) +{ + *buff_addr = le64_to_cpu(desc->buffer_addr); +} + +static inline void +igb_read_adv_rx_descr(IGBCore *core, union e1000_adv_rx_desc *desc, + hwaddr *buff_addr) +{ + *buff_addr = le64_to_cpu(desc->read.pkt_addr); +} + +static inline void +igb_read_rx_descr(IGBCore *core, union e1000_rx_desc_union *desc, + hwaddr *buff_addr) +{ + if (igb_rx_use_legacy_descriptor(core)) { + igb_read_lgcy_rx_descr(core, &desc->legacy, buff_addr); + } else { + igb_read_adv_rx_descr(core, &desc->adv, buff_addr); + } +} + +static void +igb_verify_csum_in_sw(IGBCore *core, + struct NetRxPkt *pkt, + uint32_t *status_flags, + EthL4HdrProto l4hdr_proto) +{ + bool csum_valid; + uint32_t csum_error; + + if (igb_rx_l3_cso_enabled(core)) { + if (!net_rx_pkt_validate_l3_csum(pkt, &csum_valid)) { + trace_e1000e_rx_metadata_l3_csum_validation_failed(); + } else { + csum_error = csum_valid ? 
0 : E1000_RXDEXT_STATERR_IPE; + *status_flags |= E1000_RXD_STAT_IPCS | csum_error; + } + } else { + trace_e1000e_rx_metadata_l3_cso_disabled(); + } + + if (!igb_rx_l4_cso_enabled(core)) { + trace_e1000e_rx_metadata_l4_cso_disabled(); + return; + } + + if (!net_rx_pkt_validate_l4_csum(pkt, &csum_valid)) { + trace_e1000e_rx_metadata_l4_csum_validation_failed(); + return; + } + + csum_error = csum_valid ? 0 : E1000_RXDEXT_STATERR_TCPE; + *status_flags |= E1000_RXD_STAT_TCPCS | csum_error; + + if (l4hdr_proto == ETH_L4_HDR_PROTO_UDP) { + *status_flags |= E1000_RXD_STAT_UDPCS; + } +} + +static void +igb_build_rx_metadata(IGBCore *core, + struct NetRxPkt *pkt, + bool is_eop, + const E1000E_RSSInfo *rss_info, + uint16_t *pkt_info, uint16_t *hdr_info, + uint32_t *rss, + uint32_t *status_flags, + uint16_t *ip_id, + uint16_t *vlan_tag) +{ + struct virtio_net_hdr *vhdr; + bool hasip4, hasip6; + EthL4HdrProto l4hdr_proto; + uint32_t pkt_type; + + *status_flags = E1000_RXD_STAT_DD; + + /* No additional metadata needed for non-EOP descriptors */ + /* TODO: EOP apply only to status so don't skip whole function. 
*/ + if (!is_eop) { + goto func_exit; + } + + *status_flags |= E1000_RXD_STAT_EOP; + + net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto); + trace_e1000e_rx_metadata_protocols(hasip4, hasip6, l4hdr_proto); + + /* VLAN state */ + if (net_rx_pkt_is_vlan_stripped(pkt)) { + *status_flags |= E1000_RXD_STAT_VP; + *vlan_tag = cpu_to_le16(net_rx_pkt_get_vlan_tag(pkt)); + trace_e1000e_rx_metadata_vlan(*vlan_tag); + } + + /* Packet parsing results */ + if ((core->mac[RXCSUM] & E1000_RXCSUM_PCSD) != 0) { + if (rss_info->enabled) { + *rss = cpu_to_le32(rss_info->hash); + trace_igb_rx_metadata_rss(*rss); + } + } else if (hasip4) { + *status_flags |= E1000_RXD_STAT_IPIDV; + *ip_id = cpu_to_le16(net_rx_pkt_get_ip_id(pkt)); + trace_e1000e_rx_metadata_ip_id(*ip_id); + } + + if (l4hdr_proto == ETH_L4_HDR_PROTO_TCP && net_rx_pkt_is_tcp_ack(pkt)) { + *status_flags |= E1000_RXD_STAT_ACK; + trace_e1000e_rx_metadata_ack(); + } + + if (hasip6 && (core->mac[RFCTL] & E1000_RFCTL_IPV6_DIS)) { + trace_e1000e_rx_metadata_ipv6_filtering_disabled(); + pkt_type = E1000_RXD_PKT_MAC; + } else if (l4hdr_proto == ETH_L4_HDR_PROTO_TCP || + l4hdr_proto == ETH_L4_HDR_PROTO_UDP) { + pkt_type = hasip4 ? E1000_RXD_PKT_IP4_XDP : E1000_RXD_PKT_IP6_XDP; + } else if (hasip4 || hasip6) { + pkt_type = hasip4 ? 
E1000_RXD_PKT_IP4 : E1000_RXD_PKT_IP6; + } else { + pkt_type = E1000_RXD_PKT_MAC; + } + + trace_e1000e_rx_metadata_pkt_type(pkt_type); + + if (pkt_info) { + if (rss_info->enabled) { + *pkt_info = rss_info->type; + } + + *pkt_info |= (pkt_type << 4); + } else { + *status_flags |= E1000_RXD_PKT_TYPE(pkt_type); + } + + if (hdr_info) { + *hdr_info = 0; + } + + /* RX CSO information */ + if (hasip6 && (core->mac[RFCTL] & E1000_RFCTL_IPV6_XSUM_DIS)) { + trace_e1000e_rx_metadata_ipv6_sum_disabled(); + goto func_exit; + } + + vhdr = net_rx_pkt_get_vhdr(pkt); + + if (!(vhdr->flags & VIRTIO_NET_HDR_F_DATA_VALID) && + !(vhdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) { + trace_e1000e_rx_metadata_virthdr_no_csum_info(); + igb_verify_csum_in_sw(core, pkt, status_flags, l4hdr_proto); + goto func_exit; + } + + if (igb_rx_l3_cso_enabled(core)) { + *status_flags |= hasip4 ? E1000_RXD_STAT_IPCS : 0; + } else { + trace_e1000e_rx_metadata_l3_cso_disabled(); + } + + if (igb_rx_l4_cso_enabled(core)) { + switch (l4hdr_proto) { + case ETH_L4_HDR_PROTO_TCP: + *status_flags |= E1000_RXD_STAT_TCPCS; + break; + + case ETH_L4_HDR_PROTO_UDP: + *status_flags |= E1000_RXD_STAT_TCPCS | E1000_RXD_STAT_UDPCS; + break; + + default: + goto func_exit; + } + } else { + trace_e1000e_rx_metadata_l4_cso_disabled(); + } + + trace_e1000e_rx_metadata_status_flags(*status_flags); + +func_exit: + *status_flags = cpu_to_le32(*status_flags); +} + +static inline void +igb_write_lgcy_rx_descr(IGBCore *core, struct e1000_rx_desc *desc, + struct NetRxPkt *pkt, + const E1000E_RSSInfo *rss_info, + uint16_t length) +{ + uint32_t status_flags, rss; + uint16_t ip_id; + + assert(!rss_info->enabled); + desc->length = cpu_to_le16(length); + desc->csum = 0; + + igb_build_rx_metadata(core, pkt, pkt != NULL, + rss_info, + NULL, NULL, &rss, + &status_flags, &ip_id, + &desc->special); + desc->errors = (uint8_t) (le32_to_cpu(status_flags) >> 24); + desc->status = (uint8_t) le32_to_cpu(status_flags); +} + +static inline void 
+igb_write_adv_rx_descr(IGBCore *core, union e1000_adv_rx_desc *desc, + struct NetRxPkt *pkt, + const E1000E_RSSInfo *rss_info, + uint16_t length) +{ + memset(&desc->wb, 0, sizeof(desc->wb)); + + desc->wb.upper.length = cpu_to_le16(length); + + igb_build_rx_metadata(core, pkt, pkt != NULL, + rss_info, + &desc->wb.lower.lo_dword.pkt_info, + &desc->wb.lower.lo_dword.hdr_info, + &desc->wb.lower.hi_dword.rss, + &desc->wb.upper.status_error, + &desc->wb.lower.hi_dword.csum_ip.ip_id, + &desc->wb.upper.vlan); +} + +static inline void +igb_write_rx_descr(IGBCore *core, union e1000_rx_desc_union *desc, +struct NetRxPkt *pkt, const E1000E_RSSInfo *rss_info, uint16_t length) +{ + if (igb_rx_use_legacy_descriptor(core)) { + igb_write_lgcy_rx_descr(core, &desc->legacy, pkt, rss_info, length); + } else { + igb_write_adv_rx_descr(core, &desc->adv, pkt, rss_info, length); + } +} + +static inline void +igb_pci_dma_write_rx_desc(IGBCore *core, PCIDevice *dev, dma_addr_t addr, + union e1000_rx_desc_union *desc, dma_addr_t len) +{ + if (igb_rx_use_legacy_descriptor(core)) { + struct e1000_rx_desc *d = &desc->legacy; + size_t offset = offsetof(struct e1000_rx_desc, status); + uint8_t status = d->status; + + d->status &= ~E1000_RXD_STAT_DD; + pci_dma_write(dev, addr, desc, len); + + if (status & E1000_RXD_STAT_DD) { + d->status = status; + pci_dma_write(dev, addr + offset, &status, sizeof(status)); + } + } else { + union e1000_adv_rx_desc *d = &desc->adv; + size_t offset = + offsetof(union e1000_adv_rx_desc, wb.upper.status_error); + uint32_t status = d->wb.upper.status_error; + + d->wb.upper.status_error &= ~E1000_RXD_STAT_DD; + pci_dma_write(dev, addr, desc, len); + + if (status & E1000_RXD_STAT_DD) { + d->wb.upper.status_error = status; + pci_dma_write(dev, addr + offset, &status, sizeof(status)); + } + } +} + +static void +igb_write_to_rx_buffers(IGBCore *core, + PCIDevice *d, + hwaddr ba, + uint16_t *written, + const char *data, + dma_addr_t data_len) +{ + 
trace_igb_rx_desc_buff_write(ba, *written, data, data_len); + pci_dma_write(d, ba + *written, data, data_len); + *written += data_len; +} + +static void +igb_update_rx_stats(IGBCore *core, size_t data_size, size_t data_fcs_size) +{ + e1000x_update_rx_total_stats(core->mac, data_size, data_fcs_size); + + switch (net_rx_pkt_get_packet_type(core->rx_pkt)) { + case ETH_PKT_BCAST: + e1000x_inc_reg_if_not_full(core->mac, BPRC); + break; + + case ETH_PKT_MCAST: + e1000x_inc_reg_if_not_full(core->mac, MPRC); + break; + + default: + break; + } +} + +static inline bool +igb_rx_descr_threshold_hit(IGBCore *core, const E1000E_RingInfo *rxi) +{ + return igb_ring_free_descr_num(core, rxi) == + ((core->mac[E1000_SRRCTL(rxi->idx) >> 2] >> 20) & 31) * 16; +} + +static void +igb_write_packet_to_guest(IGBCore *core, struct NetRxPkt *pkt, + const E1000E_RxRing *rxr, + const E1000E_RSSInfo *rss_info) +{ + PCIDevice *d; + dma_addr_t base; + union e1000_rx_desc_union desc; + size_t desc_size; + size_t desc_offset = 0; + size_t iov_ofs = 0; + + struct iovec *iov = net_rx_pkt_get_iovec(pkt); + size_t size = net_rx_pkt_get_total_len(pkt); + size_t total_size = size + e1000x_fcs_len(core->mac); + const E1000E_RingInfo *rxi = rxr->i; + size_t bufsize = igb_rxbufsize(core, rxi); + + d = pcie_sriov_get_vf_at_index(core->owner, rxi->idx % 8); + if (!d) { + d = core->owner; + } + + do { + hwaddr ba; + uint16_t written = 0; + bool is_last = false; + + desc_size = total_size - desc_offset; + + if (desc_size > bufsize) { + desc_size = bufsize; + } + + if (igb_ring_empty(core, rxi)) { + return; + } + + base = igb_ring_head_descr(core, rxi); + + pci_dma_read(d, base, &desc, core->rx_desc_len); + + trace_e1000e_rx_descr(rxi->idx, base, core->rx_desc_len); + + igb_read_rx_descr(core, &desc, &ba); + + if (ba) { + if (desc_offset < size) { + static const uint32_t fcs_pad; + size_t iov_copy; + size_t copy_size = size - desc_offset; + if (copy_size > bufsize) { + copy_size = bufsize; + } + + /* Copy packet 
payload */ + while (copy_size) { + iov_copy = MIN(copy_size, iov->iov_len - iov_ofs); + + igb_write_to_rx_buffers(core, d, ba, &written, + iov->iov_base + iov_ofs, iov_copy); + + copy_size -= iov_copy; + iov_ofs += iov_copy; + if (iov_ofs == iov->iov_len) { + iov++; + iov_ofs = 0; + } + } + + if (desc_offset + desc_size >= total_size) { + /* Simulate FCS checksum presence in the last descriptor */ + igb_write_to_rx_buffers(core, d, ba, &written, + (const char *) &fcs_pad, e1000x_fcs_len(core->mac)); + } + } + } else { /* as per intel docs; skip descriptors with null buf addr */ + trace_e1000e_rx_null_descriptor(); + } + desc_offset += desc_size; + if (desc_offset >= total_size) { + is_last = true; + } + + igb_write_rx_descr(core, &desc, is_last ? core->rx_pkt : NULL, + rss_info, written); + igb_pci_dma_write_rx_desc(core, d, base, &desc, core->rx_desc_len); + + igb_ring_advance(core, rxi, core->rx_desc_len / E1000_MIN_RX_DESC_LEN); + + } while (desc_offset < total_size); + + igb_update_rx_stats(core, size, total_size); +} +/* Fix up the L4 checksum of pkt when its virtio-net header has VIRTIO_NET_HDR_F_NEEDS_CSUM set. */ +static inline void +igb_rx_fix_l4_csum(IGBCore *core, struct NetRxPkt *pkt) +{ + struct virtio_net_hdr *vhdr = net_rx_pkt_get_vhdr(pkt); + + if (vhdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { + net_rx_pkt_fix_l4_csum(pkt); + } +} +/* Receive one frame; whether it carries a virtio-net header is taken from core->has_vnet. */ +ssize_t +igb_receive_iov(IGBCore *core, const struct iovec *iov, int iovcnt) +{ + return igb_receive_internal(core, iov, iovcnt, core->has_vnet, NULL); +} +/* + * Deliver one received frame to every Rx queue selected for it. + * + * When has_vnet is true the frame starts with a struct virtio_net_hdr, which + * is handed to core->rx_pkt and skipped. Frames shorter than ETH_ZLEN are + * zero-padded. If external_tx is non-NULL it is set to true on entry and then + * passed on to igb_receive_assign(). + * + * Returns -1 when Rx is disabled, otherwise the total size of the supplied + * iovec (also when the frame is dropped as oversized or filtered out). + */ +static ssize_t +igb_receive_internal(IGBCore *core, const struct iovec *iov, int iovcnt, + bool has_vnet, bool *external_tx) +{ + static const int maximum_ethernet_hdr_len = (ETH_HLEN + 4); + + uint16_t queues = 0; + uint32_t n = 0; + uint8_t min_buf[ETH_ZLEN]; + struct iovec min_iov; + struct eth_header *ehdr; + uint8_t *filter_buf; + size_t size, orig_size; + size_t iov_ofs = 0; + E1000E_RxRing rxr; + E1000E_RSSInfo rss_info; + size_t total_size; + int i; + + trace_e1000e_rx_receive_iov(iovcnt); + + if (external_tx) { + *external_tx = true; + } + + if
(!e1000x_hw_rx_enabled(core->mac)) { + return -1; + } + + /* Pull virtio header in */ + if (has_vnet) { + net_rx_pkt_set_vhdr_iovec(core->rx_pkt, iov, iovcnt); + iov_ofs = sizeof(struct virtio_net_hdr); + } else { + net_rx_pkt_unset_vhdr(core->rx_pkt); + } + + filter_buf = iov->iov_base + iov_ofs; + orig_size = iov_size(iov, iovcnt); + size = orig_size - iov_ofs; + + /* Pad to minimum Ethernet frame length */ + if (size < sizeof(min_buf)) { + iov_to_buf(iov, iovcnt, iov_ofs, min_buf, size); + memset(&min_buf[size], 0, sizeof(min_buf) - size); + e1000x_inc_reg_if_not_full(core->mac, RUC); + min_iov.iov_base = filter_buf = min_buf; + min_iov.iov_len = size = sizeof(min_buf); + iovcnt = 1; + iov = &min_iov; + iov_ofs = 0; + } else if (iov->iov_len < iov_ofs + maximum_ethernet_hdr_len) { + /* + * The first element does not hold a whole L2 header past iov_ofs + * (the header starts after the virtio-net header, if any), so + * linearize it into min_buf instead of reading past the element. + * This is very unlikely, but may happen. + */ + iov_to_buf(iov, iovcnt, iov_ofs, min_buf, maximum_ethernet_hdr_len); + filter_buf = min_buf; + } + + /* Discard oversized packets if !LPE and !SBP. */ + if (e1000x_is_oversized(core->mac, size)) { + return orig_size; + } + + ehdr = PKT_GET_ETH_HDR(filter_buf); + net_rx_pkt_set_packet_type(core->rx_pkt, get_eth_packet_type(ehdr)); + + net_rx_pkt_attach_iovec_ex(core->rx_pkt, iov, iovcnt, iov_ofs, + e1000x_vlan_enabled(core->mac), + core->mac[VET] & 0xffff); + + queues = igb_receive_assign(core, ehdr, &rss_info, external_tx); + if (!queues) { + trace_e1000e_rx_flt_dropped(); + return orig_size; + } + + total_size = net_rx_pkt_get_total_len(core->rx_pkt) + + e1000x_fcs_len(core->mac); + + for (i = 0; i < IGB_NUM_QUEUES; i++) { + if (!(queues & BIT(i))) { + continue; + } + + igb_rx_ring_init(core, &rxr, i); + + if (!igb_has_rxbufs(core, rxr.i, total_size)) { + n |= E1000_ICS_RXO; + trace_e1000e_rx_not_written_to_guest(rxr.i->idx); + continue; + } + + n |= E1000_ICR_RXT0; + + igb_rx_fix_l4_csum(core, core->rx_pkt); + igb_write_packet_to_guest(core, core->rx_pkt, &rxr, &rss_info); + + /* Check if receive descriptor minimum threshold hit */ + if
(igb_rx_descr_threshold_hit(core, rxr.i)) { + n |= E1000_ICS_RXDMT0; + } + + core->mac[EICR] |= igb_rx_wb_eic(core, rxr.i->idx); + + trace_e1000e_rx_written_to_guest(rxr.i->idx); + } + + trace_e1000e_rx_interrupt_set(n); + igb_set_interrupt_cause(core, n); + + return orig_size; +} +/* True when MII_BMCR_AUTOEN is set in the PHY control register. */ +static inline bool +igb_have_autoneg(IGBCore *core) +{ + return core->phy[MII_BMCR] & MII_BMCR_AUTOEN; +} +/* Set TFCE and RFCE in CTRL once autonegotiation is enabled and reported complete. */ +static void igb_update_flowctl_status(IGBCore *core) +{ + if (igb_have_autoneg(core) && core->phy[MII_BMSR] & MII_BMSR_AN_COMP) { + trace_e1000e_link_autoneg_flowctl(true); + core->mac[CTRL] |= E1000_CTRL_TFCE | E1000_CTRL_RFCE; + } else { + trace_e1000e_link_autoneg_flowctl(false); + } +} +/* Update the MAC/PHY registers for link down, then refresh the flow control status. */ +static inline void +igb_link_down(IGBCore *core) +{ + e1000x_update_regs_on_link_down(core->mac, core->phy); + igb_update_flowctl_status(core); +} +/* Write PHY BMCR and restart autonegotiation when requested and enabled. */ +static inline void +igb_set_phy_ctrl(IGBCore *core, uint16_t val) +{ + /* bits 0-5 reserved; MII_BMCR_[ANRESTART,RESET] are self clearing */ + core->phy[MII_BMCR] = val & ~(0x3f | MII_BMCR_RESET | MII_BMCR_ANRESTART); + + if ((val & MII_BMCR_ANRESTART) && igb_have_autoneg(core)) { + e1000x_restart_autoneg(core->mac, core->phy, core->autoneg_timer); + } +} +/* Sync the MAC/PHY state with the backend link state and raise LSC if STATUS changed. */ +void igb_core_set_link_status(IGBCore *core) +{ + NetClientState *nc = qemu_get_queue(core->owner_nic); + uint32_t old_status = core->mac[STATUS]; + + trace_e1000e_link_status_changed(nc->link_down ?
false : true); + + if (nc->link_down) { + e1000x_update_regs_on_link_down(core->mac, core->phy); + } else { + if (igb_have_autoneg(core) && + !(core->phy[MII_BMSR] & MII_BMSR_AN_COMP)) { + e1000x_restart_autoneg(core->mac, core->phy, + core->autoneg_timer); + } else { + e1000x_update_regs_on_link_up(core->mac, core->phy); + igb_start_recv(core); + } + } + + if (core->mac[STATUS] != old_status) { + igb_set_interrupt_cause(core, E1000_ICR_LSC); + } +} + +static void +igb_set_ctrl(IGBCore *core, int index, uint32_t val) +{ + trace_e1000e_core_ctrl_write(index, val); + + /* RST is self clearing */ + core->mac[CTRL] = val & ~E1000_CTRL_RST; + core->mac[CTRL_DUP] = core->mac[CTRL]; + + trace_e1000e_link_set_params( + !!(val & E1000_CTRL_ASDE), + (val & E1000_CTRL_SPD_SEL) >> E1000_CTRL_SPD_SHIFT, + !!(val & E1000_CTRL_FRCSPD), + !!(val & E1000_CTRL_FRCDPX), + !!(val & E1000_CTRL_RFCE), + !!(val & E1000_CTRL_TFCE)); + + if (val & E1000_CTRL_RST) { + trace_e1000e_core_ctrl_sw_reset(); + igb_reset(core, true); + } + + if (val & E1000_CTRL_PHY_RST) { + trace_e1000e_core_ctrl_phy_reset(); + core->mac[STATUS] |= E1000_STATUS_PHYRA; + } +} + +static void +igb_set_rfctl(IGBCore *core, int index, uint32_t val) +{ + trace_e1000e_rx_set_rfctl(val); + + if (!(val & E1000_RFCTL_ISCSI_DIS)) { + trace_e1000e_wrn_iscsi_filtering_not_supported(); + } + + if (!(val & E1000_RFCTL_NFSW_DIS)) { + trace_e1000e_wrn_nfsw_filtering_not_supported(); + } + + if (!(val & E1000_RFCTL_NFSR_DIS)) { + trace_e1000e_wrn_nfsr_filtering_not_supported(); + } + + core->mac[RFCTL] = val; +} + +static void +igb_calc_rxdesclen(IGBCore *core) +{ + if (igb_rx_use_legacy_descriptor(core)) { + core->rx_desc_len = sizeof(struct e1000_rx_desc); + } else { + core->rx_desc_len = sizeof(union e1000_adv_rx_desc); + } + trace_e1000e_rx_desc_len(core->rx_desc_len); +} + +static void +igb_set_rx_control(IGBCore *core, int index, uint32_t val) +{ + core->mac[RCTL] = val; + trace_e1000e_rx_set_rctl(core->mac[RCTL]); + + if 
(val & E1000_RCTL_DTYP_MASK) { + qemu_log_mask(LOG_GUEST_ERROR, + "igb: RCTL.DTYP must be zero for compatibility"); + } + + if (val & E1000_RCTL_EN) { + igb_calc_rxdesclen(core); + igb_start_recv(core); + } +} + +static inline void +igb_clear_ims_bits(IGBCore *core, uint32_t bits) +{ + trace_e1000e_irq_clear_ims(bits, core->mac[IMS], core->mac[IMS] & ~bits); + core->mac[IMS] &= ~bits; +} + +static inline bool +igb_postpone_interrupt(IGBIntrDelayTimer *timer) +{ + if (timer->running) { + trace_e1000e_irq_postponed_by_xitr(timer->delay_reg << 2); + + return true; + } + + if (timer->core->mac[timer->delay_reg] != 0) { + igb_intrmgr_rearm_timer(timer); + } + + return false; +} + +static inline bool +igb_eitr_should_postpone(IGBCore *core, int idx) +{ + return igb_postpone_interrupt(&core->eitr[idx]); +} + +static void igb_send_msix(IGBCore *core) +{ + uint32_t causes = core->mac[EICR] & core->mac[EIMS]; + uint32_t effective_eiac; + int vector; + + for (vector = 0; vector < IGB_INTR_NUM; ++vector) { + if ((causes & BIT(vector)) && !igb_eitr_should_postpone(core, vector)) { + + trace_e1000e_irq_msix_notify_vec(vector); + igb_msix_notify(core, vector); + + trace_e1000e_irq_icr_clear_eiac(core->mac[EICR], core->mac[EIAC]); + effective_eiac = core->mac[EIAC] & BIT(vector); + core->mac[EICR] &= ~effective_eiac; + } + } +} + +static inline void +igb_fix_icr_asserted(IGBCore *core) +{ + core->mac[ICR] &= ~E1000_ICR_ASSERTED; + if (core->mac[ICR]) { + core->mac[ICR] |= E1000_ICR_ASSERTED; + } + + trace_e1000e_irq_fix_icr_asserted(core->mac[ICR]); +} + +static void +igb_update_interrupt_state(IGBCore *core) +{ + uint32_t icr; + uint32_t causes; + uint32_t int_alloc; + + icr = core->mac[ICR] & core->mac[IMS]; + + if (msix_enabled(core->owner)) { + if (icr) { + causes = 0; + if (icr & E1000_ICR_DRSTA) { + int_alloc = core->mac[IVAR_MISC] & 0xff; + if (int_alloc & E1000_IVAR_VALID) { + causes |= BIT(int_alloc & 0x1f); + } + } + /* Check if other bits (excluding the TCP Timer) are 
enabled. */ + if (icr & ~E1000_ICR_DRSTA) { + int_alloc = (core->mac[IVAR_MISC] >> 8) & 0xff; + if (int_alloc & E1000_IVAR_VALID) { + causes |= BIT(int_alloc & 0x1f); + } + trace_e1000e_irq_add_msi_other(core->mac[EICR]); + } + core->mac[EICR] |= causes; + } + + if ((core->mac[EICR] & core->mac[EIMS])) { + igb_send_msix(core); + } + } else { + igb_fix_icr_asserted(core); + + if (icr) { + core->mac[EICR] |= (icr & E1000_ICR_DRSTA) | E1000_EICR_OTHER; + } else { + core->mac[EICR] &= ~E1000_EICR_OTHER; + } + + trace_e1000e_irq_pending_interrupts(core->mac[ICR] & core->mac[IMS], + core->mac[ICR], core->mac[IMS]); + + if (msi_enabled(core->owner)) { + if (icr) { + msi_notify(core->owner, 0); + } + } else { + if (icr) { + igb_raise_legacy_irq(core); + } else { + igb_lower_legacy_irq(core); + } + } + } +} +/* OR val into ICR and re-evaluate the interrupt state. */ +static void +igb_set_interrupt_cause(IGBCore *core, uint32_t val) +{ + trace_e1000e_irq_set_cause_entry(val, core->mac[ICR]); + + core->mac[ICR] |= val; + + trace_e1000e_irq_set_cause_exit(val, core->mac[ICR]); + + igb_update_interrupt_state(core); +} +/* EICS write handler: ORs the mode-valid bits of val into EICS, copies EICS to EICR and re-evaluates interrupts. */ +static void igb_set_eics(IGBCore *core, int index, uint32_t val) +{ + bool msix = !!(core->mac[GPIE] & E1000_GPIE_MSIX_MODE); + + trace_igb_irq_write_eics(val, msix); + + core->mac[EICS] |= + val & (msix ? E1000_EICR_MSIX_MASK : E1000_EICR_LEGACY_MASK); + + /* + * TODO: Move to igb_update_interrupt_state if EICS is modified in other + * places. + */ + core->mac[EICR] = core->mac[EICS]; + + igb_update_interrupt_state(core); +} +/* EIMS write handler: ORs the mode-valid bits of val into EIMS and re-evaluates interrupts. */ +static void igb_set_eims(IGBCore *core, int index, uint32_t val) +{ + bool msix = !!(core->mac[GPIE] & E1000_GPIE_MSIX_MODE); + + trace_igb_irq_write_eims(val, msix); + + core->mac[EIMS] |= + val & (msix ? E1000_EICR_MSIX_MASK : E1000_EICR_LEGACY_MASK); + + igb_update_interrupt_state(core); +} +/* Reset handling for VF vfn; at present it only updates that VF's V2PMAILBOX register. */ +static void igb_vf_reset(IGBCore *core, uint16_t vfn) +{ + /* TODO: Reset of the queue enable and the interrupt registers of the VF.
*/ +/* NOTE(review): the plain assignment below overwrites the whole register, so the RSTI clear before it has no lasting effect - confirm the intent. */ + core->mac[V2PMAILBOX0 + vfn] &= ~E1000_V2PMAILBOX_RSTI; + core->mac[V2PMAILBOX0 + vfn] = E1000_V2PMAILBOX_RSTD; +} +/* Set the EICR bits derived from VF vfn's VTIVAR_MISC entry, if that entry is valid, and re-evaluate interrupts. */ +static void mailbox_interrupt_to_vf(IGBCore *core, uint16_t vfn) +{ + uint32_t ent = core->mac[VTIVAR_MISC + vfn]; + + if ((ent & E1000_IVAR_VALID)) { + core->mac[EICR] |= (ent & 0x3) << (22 - vfn * IGBVF_MSIX_VEC_NUM); + igb_update_interrupt_state(core); + } +} +/* Raise the E1000_ICR_VMMB interrupt cause. */ +static void mailbox_interrupt_to_pf(IGBCore *core) +{ + igb_set_interrupt_cause(core, E1000_ICR_VMMB); +} +/* P2VMAILBOX write handler; the VF number is index - P2VMAILBOX0. Mirrors STS/ACK/PFU into the VF's V2PMAILBOX and handles RVFU. */ +static void igb_set_pfmailbox(IGBCore *core, int index, uint32_t val) +{ + uint16_t vfn = index - P2VMAILBOX0; + + trace_igb_set_pfmailbox(vfn, val); + + if (val & E1000_P2VMAILBOX_STS) { + core->mac[V2PMAILBOX0 + vfn] |= E1000_V2PMAILBOX_PFSTS; + mailbox_interrupt_to_vf(core, vfn); + } + + if (val & E1000_P2VMAILBOX_ACK) { + core->mac[V2PMAILBOX0 + vfn] |= E1000_V2PMAILBOX_PFACK; + mailbox_interrupt_to_vf(core, vfn); + } + + /* Buffer Taken by PF (can be set only if the VFU is cleared). */ + if (val & E1000_P2VMAILBOX_PFU) { + if (!(core->mac[index] & E1000_P2VMAILBOX_VFU)) { + core->mac[index] |= E1000_P2VMAILBOX_PFU; + core->mac[V2PMAILBOX0 + vfn] |= E1000_V2PMAILBOX_PFU; + } + } else { + core->mac[index] &= ~E1000_P2VMAILBOX_PFU; + core->mac[V2PMAILBOX0 + vfn] &= ~E1000_V2PMAILBOX_PFU; + } + + if (val & E1000_P2VMAILBOX_RVFU) { + core->mac[V2PMAILBOX0 + vfn] &= ~E1000_V2PMAILBOX_VFU; + core->mac[MBVFICR] &= ~((E1000_MBVFICR_VFACK_VF1 << vfn) | + (E1000_MBVFICR_VFREQ_VF1 << vfn)); + } +} +/* V2PMAILBOX write handler; the VF number is index - V2PMAILBOX0. Records REQ/ACK in MBVFICR, notifies the PF and handles VFU. */ +static void igb_set_vfmailbox(IGBCore *core, int index, uint32_t val) +{ + uint16_t vfn = index - V2PMAILBOX0; + + trace_igb_set_vfmailbox(vfn, val); + + if (val & E1000_V2PMAILBOX_REQ) { + core->mac[MBVFICR] |= E1000_MBVFICR_VFREQ_VF1 << vfn; + mailbox_interrupt_to_pf(core); + } + + if (val & E1000_V2PMAILBOX_ACK) { + core->mac[MBVFICR] |= E1000_MBVFICR_VFACK_VF1 << vfn; + mailbox_interrupt_to_pf(core); + } + + /* Buffer Taken by VF (can be set only if the PFU is cleared).
*/ + if (val & E1000_V2PMAILBOX_VFU) { + if (!(core->mac[index] & E1000_V2PMAILBOX_PFU)) { + core->mac[index] |= E1000_V2PMAILBOX_VFU; + core->mac[P2VMAILBOX0 + vfn] |= E1000_P2VMAILBOX_VFU; + } + } else { + core->mac[index] &= ~E1000_V2PMAILBOX_VFU; + core->mac[P2VMAILBOX0 + vfn] &= ~E1000_P2VMAILBOX_VFU; + } +} +/* Write-1-to-clear handler: clears in mac[index] every bit that is set in val. */ +static void igb_w1c(IGBCore *core, int index, uint32_t val) +{ + core->mac[index] &= ~val; +} +/* EIMC write handler: clears the mode-valid bits of val in EIMS and re-evaluates interrupts. */ +static void igb_set_eimc(IGBCore *core, int index, uint32_t val) +{ + bool msix = !!(core->mac[GPIE] & E1000_GPIE_MSIX_MODE); + + /* Interrupts are disabled via a write to EIMC and reflected in EIMS. */ + core->mac[EIMS] &= + ~(val & (msix ? E1000_EICR_MSIX_MASK : E1000_EICR_LEGACY_MASK)); + + trace_igb_irq_write_eimc(val, core->mac[EIMS], msix); + igb_update_interrupt_state(core); +} +/* EIAC write handler: ignored unless GPIE selects MSI-X mode, in which case the MSI-X bits of val are ORed in. */ +static void igb_set_eiac(IGBCore *core, int index, uint32_t val) +{ + bool msix = !!(core->mac[GPIE] & E1000_GPIE_MSIX_MODE); + + if (msix) { + trace_igb_irq_write_eiac(val); + + /* + * TODO: When using IOV, the bits that correspond to MSI-X vectors + * that are assigned to a VF are read-only. + */ + core->mac[EIAC] |= (val & E1000_EICR_MSIX_MASK); + } +} +/* EIAM write handler. NOTE(review): this ORs in the complement of the masked value, so every bit not written as 1 becomes set, whereas igb_set_eims()/igb_set_eiac() OR in the masked value itself - confirm against the datasheet. */ +static void igb_set_eiam(IGBCore *core, int index, uint32_t val) +{ + bool msix = !!(core->mac[GPIE] & E1000_GPIE_MSIX_MODE); + + /* + * TODO: When using IOV, the bits that correspond to MSI-X vectors that + * are assigned to a VF are read-only. + */ + core->mac[EIAM] |= + ~(val & (msix ? E1000_EICR_MSIX_MASK : E1000_EICR_LEGACY_MASK)); + + trace_igb_irq_write_eiam(val, msix); +} +/* EICR write handler: clears the mode-valid bits of val in EICR and re-evaluates interrupts. */ +static void igb_set_eicr(IGBCore *core, int index, uint32_t val) +{ + bool msix = !!(core->mac[GPIE] & E1000_GPIE_MSIX_MODE); + + /* + * TODO: In IOV mode, only bit zero of this vector is available for the PF + * function. + */ + core->mac[EICR] &= + ~(val & (msix ?
E1000_EICR_MSIX_MASK : E1000_EICR_LEGACY_MASK)); + + trace_igb_irq_write_eicr(val, msix); + igb_update_interrupt_state(core); +} +/* VTCTRL write handler: when E1000_CTRL_RST is written, resets the VF derived from index (0x40 register slots per VF). */ +static void igb_set_vtctrl(IGBCore *core, int index, uint32_t val) +{ + uint16_t vfn; + + if (val & E1000_CTRL_RST) { + vfn = (index - PVTCTRL0) / 0x40; + igb_vf_reset(core, vfn); + } +} +/* The VTE* handlers below store val and forward its low three bits, shifted left by 22 - vfn * IGBVF_MSIX_VEC_NUM, to the matching PF register handler. */ +static void igb_set_vteics(IGBCore *core, int index, uint32_t val) +{ + uint16_t vfn = (index - PVTEICS0) / 0x40; + + core->mac[index] = val; + igb_set_eics(core, EICS, (val & 0x7) << (22 - vfn * IGBVF_MSIX_VEC_NUM)); +} + +static void igb_set_vteims(IGBCore *core, int index, uint32_t val) +{ + uint16_t vfn = (index - PVTEIMS0) / 0x40; + + core->mac[index] = val; + igb_set_eims(core, EIMS, (val & 0x7) << (22 - vfn * IGBVF_MSIX_VEC_NUM)); +} + +static void igb_set_vteimc(IGBCore *core, int index, uint32_t val) +{ + uint16_t vfn = (index - PVTEIMC0) / 0x40; + + core->mac[index] = val; + igb_set_eimc(core, EIMC, (val & 0x7) << (22 - vfn * IGBVF_MSIX_VEC_NUM)); +} + +static void igb_set_vteiac(IGBCore *core, int index, uint32_t val) +{ + uint16_t vfn = (index - PVTEIAC0) / 0x40; + + core->mac[index] = val; + igb_set_eiac(core, EIAC, (val & 0x7) << (22 - vfn * IGBVF_MSIX_VEC_NUM)); +} + +static void igb_set_vteiam(IGBCore *core, int index, uint32_t val) +{ + uint16_t vfn = (index - PVTEIAM0) / 0x40; + + core->mac[index] = val; + igb_set_eiam(core, EIAM, (val & 0x7) << (22 - vfn * IGBVF_MSIX_VEC_NUM)); +} + +static void igb_set_vteicr(IGBCore *core, int index, uint32_t val) +{ + uint16_t vfn = (index - PVTEICR0) / 0x40; + + core->mac[index] = val; + igb_set_eicr(core, EICR, (val & 0x7) << (22 - vfn * IGBVF_MSIX_VEC_NUM)); +} +/* VTIVAR write handler: stores val and ORs the valid Rx#0/Tx#0 vector assignments into the PF IVAR registers. */ +static void igb_set_vtivar(IGBCore *core, int index, uint32_t val) +{ + uint16_t vfn = (index - VTIVAR); + uint16_t qn = vfn; + uint8_t ent; + int n; + + core->mac[index] = val; + + /* Get assigned vector associated with queue Rx#0.
*/ + if ((val & E1000_IVAR_VALID)) { + n = igb_ivar_entry_rx(qn); + ent = E1000_IVAR_VALID | (24 - vfn * IGBVF_MSIX_VEC_NUM - (2 - (val & 0x7))); + core->mac[IVAR0 + n / 4] |= ent << 8 * (n % 4); + } + + /* Get assigned vector associated with queue Tx#0 */ + ent = val >> 8; + if ((ent & E1000_IVAR_VALID)) { + n = igb_ivar_entry_tx(qn); + ent = E1000_IVAR_VALID | (24 - vfn * IGBVF_MSIX_VEC_NUM - (2 - (ent & 0x7))); + core->mac[IVAR0 + n / 4] |= ent << 8 * (n % 4); + } + + /* + * Ignoring assigned vectors associated with queues Rx#1 and Tx#1 for now. + */ +} +/* Autonegotiation timer callback (opaque is the IGBCore): when the link is not down, completes autoneg, restarts reception and raises LSC. */ +static inline void +igb_autoneg_timer(void *opaque) +{ + IGBCore *core = opaque; + if (!qemu_get_queue(core->owner_nic)->link_down) { + e1000x_update_regs_on_autoneg_done(core->mac, core->phy); + igb_start_recv(core); + + igb_update_flowctl_status(core); + /* signal link status change to the guest */ + igb_set_interrupt_cause(core, E1000_ICR_LSC); + } +} +/* Convert a register address into a mac[] index, adding the offset stored in mac_reg_access[index] & 0xfffe. */ +static inline uint16_t +igb_get_reg_index_with_offset(const uint16_t *mac_reg_access, hwaddr addr) +{ + uint16_t index = (addr & 0x1ffff) >> 2; + return index + (mac_reg_access[index] & 0xfffe); +} +/* Access capabilities (PHY_R, PHY_W, PHY_RW) per PHY register; consulted by igb_set_mdic(). */ +static const char igb_phy_regcap[MAX_PHY_REG_ADDRESS + 1] = { + [MII_BMCR] = PHY_RW, + [MII_BMSR] = PHY_R, + [MII_PHYID1] = PHY_R, + [MII_PHYID2] = PHY_R, + [MII_ANAR] = PHY_RW, + [MII_ANLPAR] = PHY_R, + [MII_ANER] = PHY_R, + [MII_ANNP] = PHY_RW, + [MII_ANLPRNP] = PHY_R, + [MII_CTRL1000] = PHY_RW, + [MII_STAT1000] = PHY_R, + [MII_EXTSTAT] = PHY_R, + + [IGP01E1000_PHY_PORT_CONFIG] = PHY_RW, + [IGP01E1000_PHY_PORT_STATUS] = PHY_R, + [IGP01E1000_PHY_PORT_CTRL] = PHY_RW, + [IGP01E1000_PHY_LINK_HEALTH] = PHY_R, + [IGP02E1000_PHY_POWER_MGMT] = PHY_RW, + [IGP01E1000_PHY_PAGE_SELECT] = PHY_W +}; +/* Write a PHY register; writes to MII_BMCR go through igb_set_phy_ctrl(). */ +static void +igb_phy_reg_write(IGBCore *core, uint32_t addr, uint16_t data) +{ + assert(addr <= MAX_PHY_REG_ADDRESS); + + if (addr == MII_BMCR) { + igb_set_phy_ctrl(core, data); + } else { + core->phy[addr] = data; + } +} +/* MDIC write handler: performs the requested PHY read or write for PHY number 1, flags errors, sets READY and raises MDAC when INT_EN is set. */ +static void +igb_set_mdic(IGBCore *core, int
index, uint32_t val) +{ + uint32_t data = val & E1000_MDIC_DATA_MASK; + uint32_t addr = ((val & E1000_MDIC_REG_MASK) >> E1000_MDIC_REG_SHIFT); + + if ((val & E1000_MDIC_PHY_MASK) >> E1000_MDIC_PHY_SHIFT != 1) { /* phy # */ + val = core->mac[MDIC] | E1000_MDIC_ERROR; + } else if (val & E1000_MDIC_OP_READ) { + if (!(igb_phy_regcap[addr] & PHY_R)) { + trace_igb_core_mdic_read_unhandled(addr); + val |= E1000_MDIC_ERROR; + } else { + val = (val ^ data) | core->phy[addr]; + trace_igb_core_mdic_read(addr, val); + } + } else if (val & E1000_MDIC_OP_WRITE) { + if (!(igb_phy_regcap[addr] & PHY_W)) { + trace_igb_core_mdic_write_unhandled(addr); + val |= E1000_MDIC_ERROR; + } else { + trace_igb_core_mdic_write(addr, data); + igb_phy_reg_write(core, addr, data); + } + } + core->mac[MDIC] = val | E1000_MDIC_READY; + + if (val & E1000_MDIC_INT_EN) { + igb_set_interrupt_cause(core, E1000_ICR_MDAC); + } +} +/* RDT write handler: stores the low 16 bits of val and restarts reception. */ +static void +igb_set_rdt(IGBCore *core, int index, uint32_t val) +{ + core->mac[index] = val & 0xffff; + trace_e1000e_rx_set_rdt(igb_mq_queue_idx(RDT0, index), val); + igb_start_recv(core); +} +/* STATUS write handler: the only effect is clearing PHYRA when it is written as 0. */ +static void +igb_set_status(IGBCore *core, int index, uint32_t val) +{ + if ((val & E1000_STATUS_PHYRA) == 0) { + core->mac[index] &= ~E1000_STATUS_PHYRA; + } +} +/* CTRL_EXT write handler: stores val with the ASDCHK and EE_RST bits cleared. */ +static void +igb_set_ctrlext(IGBCore *core, int index, uint32_t val) +{ + trace_e1000e_link_set_ext_params(!!(val & E1000_CTRL_EXT_ASDCHK), + !!(val & E1000_CTRL_EXT_SPD_BYPS)); + + /* TODO: PFRSTD */ + + /* Zero self-clearing bits */ + val &= ~(E1000_CTRL_EXT_ASDCHK | E1000_CTRL_EXT_EE_RST); + core->mac[CTRL_EXT] = val; +} +/* PBACLR write handler: stores the valid bits and, with MSI-X enabled, clears the pending state of each selected vector. */ +static void +igb_set_pbaclr(IGBCore *core, int index, uint32_t val) +{ + int i; + + core->mac[PBACLR] = val & E1000_PBACLR_VALID_MASK; + + if (!msix_enabled(core->owner)) { + return; + } + + for (i = 0; i < IGB_INTR_NUM; i++) { + if (core->mac[PBACLR] & BIT(i)) { + msix_clr_pending(core->owner, i); + } + } +} + +static void +igb_set_fcrth(IGBCore *core, int index, uint32_t val) +{ + core->mac[FCRTH]
= val & 0xFFF8; +} + +static void +igb_set_fcrtl(IGBCore *core, int index, uint32_t val) +{ + core->mac[FCRTL] = val & 0x8000FFF8; +} +/* Defines igb_set_<num>bit(), a write handler that keeps only the low num bits of val. */ +#define IGB_LOW_BITS_SET_FUNC(num) \ + static void \ + igb_set_##num##bit(IGBCore *core, int index, uint32_t val) \ + { \ + core->mac[index] = val & (BIT(num) - 1); \ + } + +IGB_LOW_BITS_SET_FUNC(4) +IGB_LOW_BITS_SET_FUNC(13) +IGB_LOW_BITS_SET_FUNC(16) + +static void +igb_set_dlen(IGBCore *core, int index, uint32_t val) +{ + core->mac[index] = val & 0xffff0; +} + +static void +igb_set_dbal(IGBCore *core, int index, uint32_t val) +{ + core->mac[index] = val & E1000_XDBAL_MASK; +} +/* TDT write handler: stores the low 16 bits of val and starts transmission on the queue derived from index. */ +static void +igb_set_tdt(IGBCore *core, int index, uint32_t val) +{ + IGB_TxRing txr; + int qn = igb_mq_queue_idx(TDT0, index); + + core->mac[index] = val & 0xffff; + + igb_tx_ring_init(core, &txr, qn); + igb_start_xmit(core, &txr); +} +/* ICS write handler: raises the interrupt causes given in val. */ +static void +igb_set_ics(IGBCore *core, int index, uint32_t val) +{ + trace_e1000e_irq_write_ics(val); + igb_set_interrupt_cause(core, val); +} +/* IMC write handler: clears the bits of val in IMS and re-evaluates interrupts. */ +static void +igb_set_imc(IGBCore *core, int index, uint32_t val) +{ + trace_e1000e_irq_ims_clear_set_imc(val); + igb_clear_ims_bits(core, val); + igb_update_interrupt_state(core); +} +/* IMS write handler: ORs val, masked with 0x77D4FBFD, into IMS and re-evaluates interrupts. */ +static void +igb_set_ims(IGBCore *core, int index, uint32_t val) +{ + uint32_t valid_val = val & 0x77D4FBFD; + + trace_e1000e_irq_set_ims(val, core->mac[IMS], core->mac[IMS] | valid_val); + core->mac[IMS] |= valid_val; + igb_update_interrupt_state(core); +} +/* Called after ICR was read or written: either copies IAM into IMS or only re-evaluates interrupts. */ +static void igb_commit_icr(IGBCore *core) +{ + /* + * If GPIE.NSICR = 0, then the copy of IAM to IMS will occur only if at + * least one bit is set in the IMS and there is a true interrupt as + * reflected in ICR.INTA.
+ */ + if ((core->mac[GPIE] & E1000_GPIE_NSICR) || + (core->mac[IMS] && (core->mac[ICR] & E1000_ICR_INT_ASSERTED))) { + igb_set_ims(core, IMS, core->mac[IAM]); + } else { + igb_update_interrupt_state(core); + } +} +/* ICR write handler: clears the bits set in val, then commits the new ICR state. */ +static void igb_set_icr(IGBCore *core, int index, uint32_t val) +{ + uint32_t icr = core->mac[ICR] & ~val; + + trace_igb_irq_icr_write(val, core->mac[ICR], icr); + core->mac[ICR] = icr; + igb_commit_icr(core); +} +/* Plain read handler: returns mac[index] unchanged. */ +static uint32_t +igb_mac_readreg(IGBCore *core, int index) +{ + return core->mac[index]; +} + +static uint32_t +igb_mac_ics_read(IGBCore *core, int index) +{ + trace_e1000e_irq_read_ics(core->mac[ICS]); + return core->mac[ICS]; +} + +static uint32_t +igb_mac_ims_read(IGBCore *core, int index) +{ + trace_e1000e_irq_read_ims(core->mac[IMS]); + return core->mac[IMS]; +} +/* SWSM read handler: returns the current value and sets E1000_SWSM_SMBI in the stored register afterwards. */ +static uint32_t +igb_mac_swsm_read(IGBCore *core, int index) +{ + uint32_t val = core->mac[SWSM]; + core->mac[SWSM] = val | E1000_SWSM_SMBI; + return val; +} +/* EITR read handler: returns core->eitr_guest_value for the register rather than mac[index]. */ +static uint32_t +igb_mac_eitr_read(IGBCore *core, int index) +{ + return core->eitr_guest_value[index - EITR0]; +} +/* V2PMAILBOX read handler: returns the value, then clears PFSTS, PFACK and RSTD in the stored register. */ +static uint32_t igb_mac_vfmailbox_read(IGBCore *core, int index) +{ + uint32_t val = core->mac[index]; + + core->mac[index] &= ~(E1000_V2PMAILBOX_PFSTS | E1000_V2PMAILBOX_PFACK | + E1000_V2PMAILBOX_RSTD); + + return val; +} +/* ICR read handler: returns ICR and clears it when GPIE.NSICR is set, IMS is zero, or MSI-X is not enabled; then commits the ICR state. */ +static uint32_t +igb_mac_icr_read(IGBCore *core, int index) +{ + uint32_t ret = core->mac[ICR]; + trace_e1000e_irq_icr_read_entry(ret); + + if (core->mac[GPIE] & E1000_GPIE_NSICR) { + trace_igb_irq_icr_clear_gpie_nsicr(); + core->mac[ICR] = 0; + } else if (core->mac[IMS] == 0) { + trace_e1000e_irq_icr_clear_zero_ims(); + core->mac[ICR] = 0; + } else if (!msix_enabled(core->owner)) { + trace_e1000e_irq_icr_clear_nonmsix_icr_read(); + core->mac[ICR] = 0; + } + + trace_e1000e_irq_icr_read_exit(core->mac[ICR]); + igb_commit_icr(core); + return ret; +} +/* Read-to-clear handler for a single register. */ +static uint32_t +igb_mac_read_clr4(IGBCore *core, int index) +{ + uint32_t ret = core->mac[index]; + + core->mac[index] = 0; +
return ret; +} +/* Read-to-clear handler that also clears the register at index - 1. */ +static uint32_t +igb_mac_read_clr8(IGBCore *core, int index) +{ + uint32_t ret = core->mac[index]; + + core->mac[index] = 0; + core->mac[index - 1] = 0; + return ret; +} +/* CTRL read handler: traces the link parameters and returns CTRL. */ +static uint32_t +igb_get_ctrl(IGBCore *core, int index) +{ + uint32_t val = core->mac[CTRL]; + + trace_e1000e_link_read_params( + !!(val & E1000_CTRL_ASDE), + (val & E1000_CTRL_SPD_SEL) >> E1000_CTRL_SPD_SHIFT, + !!(val & E1000_CTRL_FRCSPD), + !!(val & E1000_CTRL_FRCDPX), + !!(val & E1000_CTRL_RFCE), + !!(val & E1000_CTRL_TFCE)); + + return val; +} +/* STATUS read handler: ORs duplex, speed, VF count/IOV mode and GIO master enable bits onto the stored STATUS value. */ +static uint32_t igb_get_status(IGBCore *core, int index) +{ + uint32_t res = core->mac[STATUS]; + uint16_t num_vfs = pcie_sriov_num_vfs(core->owner); + + if (core->mac[CTRL] & E1000_CTRL_FRCDPX) { + res |= (core->mac[CTRL] & E1000_CTRL_FD) ? E1000_STATUS_FD : 0; + } else { + res |= E1000_STATUS_FD; + } + + if ((core->mac[CTRL] & E1000_CTRL_FRCSPD) || + (core->mac[CTRL_EXT] & E1000_CTRL_EXT_SPD_BYPS)) { + switch (core->mac[CTRL] & E1000_CTRL_SPD_SEL) { + case E1000_CTRL_SPD_10: + res |= E1000_STATUS_SPEED_10; + break; + case E1000_CTRL_SPD_100: + res |= E1000_STATUS_SPEED_100; + break; + case E1000_CTRL_SPD_1000: + default: + res |= E1000_STATUS_SPEED_1000; + break; + } + } else { + res |= E1000_STATUS_SPEED_1000; + } + + if (num_vfs) { + res |= num_vfs << E1000_STATUS_NUM_VFS_SHIFT; + res |= E1000_STATUS_IOV_MODE; + } + + /* + * Windows driver 12.18.9.23 resets if E1000_STATUS_GIO_MASTER_ENABLE is + * left set after E1000_CTRL_LRST is set.
+ */ + if (!(core->mac[CTRL] & E1000_CTRL_GIO_MASTER_DISABLE) && + !(core->mac[CTRL] & E1000_CTRL_LRST)) { + res |= E1000_STATUS_GIO_MASTER_ENABLE; + } + + return res; +} +/* Plain write handler: stores val in mac[index]. */ +static void +igb_mac_writereg(IGBCore *core, int index, uint32_t val) +{ + core->mac[index] = val; +} +/* Stores val and refreshes the NIC info string from the address held in RA and RA + 1. */ +static void +igb_mac_setmacaddr(IGBCore *core, int index, uint32_t val) +{ + uint32_t macaddr[2]; + + core->mac[index] = val; + + macaddr[0] = cpu_to_le32(core->mac[RA]); + macaddr[1] = cpu_to_le32(core->mac[RA + 1]); + qemu_format_nic_info_str(qemu_get_queue(core->owner_nic), + (uint8_t *) macaddr); + + trace_e1000e_mac_set_sw(MAC_ARG(macaddr)); +} +/* EECD write handler: updates the register while keeping the PRES, AUTO_RD and SIZE_EX bits as they were. */ +static void +igb_set_eecd(IGBCore *core, int index, uint32_t val) +{ + static const uint32_t ro_bits = E1000_EECD_PRES | + E1000_EECD_AUTO_RD | + E1000_EECD_SIZE_EX_MASK; + + core->mac[EECD] = (core->mac[EECD] & ro_bits) | (val & ~ro_bits); +} +/* EERD write handler: when START is set and the address is below IGB_EEPROM_SIZE, returns the EEPROM word with DONE set; otherwise data and flags stay zero. */ +static void +igb_set_eerd(IGBCore *core, int index, uint32_t val) +{ + uint32_t addr = (val >> E1000_EERW_ADDR_SHIFT) & E1000_EERW_ADDR_MASK; + uint32_t flags = 0; + uint32_t data = 0; + + if ((addr < IGB_EEPROM_SIZE) && (val & E1000_EERW_START)) { + data = core->eeprom[addr]; + flags = E1000_EERW_DONE; + } + + core->mac[EERD] = flags | + (addr << E1000_EERW_ADDR_SHIFT) | + (data << E1000_EERW_DATA_SHIFT); +} +/* EITR write handler: keeps the guest-visible value (minus CNT_IGNR) in eitr_guest_value and stores val & 0x7FFE in the register. */ +static void +igb_set_eitr(IGBCore *core, int index, uint32_t val) +{ + uint32_t eitr_num = index - EITR0; + + trace_igb_irq_eitr_set(eitr_num, val); + + core->eitr_guest_value[eitr_num] = val & ~E1000_EITR_CNT_IGNR; + core->mac[index] = val & 0x7FFE; +} +/* Pushes the current Rx L4 checksum offload setting to the peer when the backend uses a vnet header. */ +static void +igb_update_rx_offloads(IGBCore *core) +{ + int cso_state = igb_rx_l4_cso_enabled(core); + + trace_e1000e_rx_set_cso(cso_state); + + if (core->has_vnet) { + qemu_set_offload(qemu_get_queue(core->owner_nic)->peer, + cso_state, 0, 0, 0, 0); + } +} +/* RXCSUM write handler: stores val and refreshes the Rx offload configuration. */ +static void +igb_set_rxcsum(IGBCore *core, int index, uint32_t val) +{ + core->mac[RXCSUM] = val; + igb_update_rx_offloads(core); +} +/* GCR write handler: stores val while preserving the bits in E1000_GCR_RO_BITS. */ +static void +igb_set_gcr(IGBCore *core, int
index, uint32_t val) +{ + uint32_t ro_bits = core->mac[GCR] & E1000_GCR_RO_BITS; + core->mac[GCR] = (val & ~E1000_GCR_RO_BITS) | ro_bits; +} +/* SYSTIML read handler: calls e1000x_timestamp() for SYSTIML/SYSTIMH first, then returns SYSTIML. */ +static uint32_t igb_get_systiml(IGBCore *core, int index) +{ + e1000x_timestamp(core->mac, core->timadj, SYSTIML, SYSTIMH); + return core->mac[SYSTIML]; +} +/* RXSATRH read handler: clears TSYNCRXCTL.VALID and returns RXSATRH. */ +static uint32_t igb_get_rxsatrh(IGBCore *core, int index) +{ + core->mac[TSYNCRXCTL] &= ~E1000_TSYNCRXCTL_VALID; + return core->mac[RXSATRH]; +} +/* TXSTMPH read handler: clears TSYNCTXCTL.VALID and returns TXSTMPH. */ +static uint32_t igb_get_txstmph(IGBCore *core, int index) +{ + core->mac[TSYNCTXCTL] &= ~E1000_TSYNCTXCTL_VALID; + return core->mac[TXSTMPH]; +} + +static void igb_set_timinca(IGBCore *core, int index, uint32_t val) +{ + e1000x_set_timinca(core->mac, &core->timadj, val); +} +/* TIMADJH write handler: stores val and adds the 64-bit value formed by TIMADJH:TIMADJL to core->timadj. */ +static void igb_set_timadjh(IGBCore *core, int index, uint32_t val) +{ + core->mac[TIMADJH] = val; + core->timadj += core->mac[TIMADJL] | ((int64_t)core->mac[TIMADJH] << 32); +} +/* igb_getreg(x) is a designated initializer that makes register x use the plain igb_mac_readreg() handler. */ +#define igb_getreg(x) [x] = igb_mac_readreg +typedef uint32_t (*readops)(IGBCore *, int); +static const readops igb_macreg_readops[] = { + igb_getreg(WUFC), + igb_getreg(MANC), + igb_getreg(TOTL), + igb_getreg(RDT0), + igb_getreg(RDT1), + igb_getreg(RDT2), + igb_getreg(RDT3), + igb_getreg(RDT4), + igb_getreg(RDT5), + igb_getreg(RDT6), + igb_getreg(RDT7), + igb_getreg(RDT8), + igb_getreg(RDT9), + igb_getreg(RDT10), + igb_getreg(RDT11), + igb_getreg(RDT12), + igb_getreg(RDT13), + igb_getreg(RDT14), + igb_getreg(RDT15), + igb_getreg(RDBAH0), + igb_getreg(RDBAH1), + igb_getreg(RDBAH2), + igb_getreg(RDBAH3), + igb_getreg(RDBAH4), + igb_getreg(RDBAH5), + igb_getreg(RDBAH6), + igb_getreg(RDBAH7), + igb_getreg(RDBAH8), + igb_getreg(RDBAH9), + igb_getreg(RDBAH10), + igb_getreg(RDBAH11), + igb_getreg(RDBAH12), + igb_getreg(RDBAH13), + igb_getreg(RDBAH14), + igb_getreg(RDBAH15), + igb_getreg(TDBAL0), + igb_getreg(TDBAL1), + igb_getreg(TDBAL2), + igb_getreg(TDBAL3), + igb_getreg(TDBAL4), + igb_getreg(TDBAL5), + igb_getreg(TDBAL6), + igb_getreg(TDBAL7), + igb_getreg(TDBAL8), +
igb_getreg(TDBAL9), + igb_getreg(TDBAL10), + igb_getreg(TDBAL11), + igb_getreg(TDBAL12), + igb_getreg(TDBAL13), + igb_getreg(TDBAL14), + igb_getreg(TDBAL15), + igb_getreg(RDLEN0), + igb_getreg(RDLEN1), + igb_getreg(RDLEN2), + igb_getreg(RDLEN3), + igb_getreg(RDLEN4), + igb_getreg(RDLEN5), + igb_getreg(RDLEN6), + igb_getreg(RDLEN7), + igb_getreg(RDLEN8), + igb_getreg(RDLEN9), + igb_getreg(RDLEN10), + igb_getreg(RDLEN11), + igb_getreg(RDLEN12), + igb_getreg(RDLEN13), + igb_getreg(RDLEN14), + igb_getreg(RDLEN15), + igb_getreg(SRRCTL0), + igb_getreg(SRRCTL1), + igb_getreg(SRRCTL2), + igb_getreg(SRRCTL3), + igb_getreg(SRRCTL4), + igb_getreg(SRRCTL5), + igb_getreg(SRRCTL6), + igb_getreg(SRRCTL7), + igb_getreg(SRRCTL8), + igb_getreg(SRRCTL9), + igb_getreg(SRRCTL10), + igb_getreg(SRRCTL11), + igb_getreg(SRRCTL12), + igb_getreg(SRRCTL13), + igb_getreg(SRRCTL14), + igb_getreg(SRRCTL15), + igb_getreg(LATECOL), + igb_getreg(XONTXC), + igb_getreg(TDFH), + igb_getreg(TDFT), + igb_getreg(TDFHS), + igb_getreg(TDFTS), + igb_getreg(TDFPC), + igb_getreg(WUS), + igb_getreg(RDFH), + igb_getreg(RDFT), + igb_getreg(RDFHS), + igb_getreg(RDFTS), + igb_getreg(RDFPC), + igb_getreg(GORCL), + igb_getreg(MGTPRC), + igb_getreg(EERD), + igb_getreg(EIAC), + igb_getreg(MANC2H), + igb_getreg(RXCSUM), + igb_getreg(GSCL_3), + igb_getreg(GSCN_2), + igb_getreg(FCAH), + igb_getreg(FCRTH), + igb_getreg(FLOP), + igb_getreg(RXSTMPH), + igb_getreg(TXSTMPL), + igb_getreg(TIMADJL), + igb_getreg(RDH0), + igb_getreg(RDH1), + igb_getreg(RDH2), + igb_getreg(RDH3), + igb_getreg(RDH4), + igb_getreg(RDH5), + igb_getreg(RDH6), + igb_getreg(RDH7), + igb_getreg(RDH8), + igb_getreg(RDH9), + igb_getreg(RDH10), + igb_getreg(RDH11), + igb_getreg(RDH12), + igb_getreg(RDH13), + igb_getreg(RDH14), + igb_getreg(RDH15), + igb_getreg(TDT0), + igb_getreg(TDT1), + igb_getreg(TDT2), + igb_getreg(TDT3), + igb_getreg(TDT4), + igb_getreg(TDT5), + igb_getreg(TDT6), + igb_getreg(TDT7), + igb_getreg(TDT8), + igb_getreg(TDT9), + 
igb_getreg(TDT10), + igb_getreg(TDT11), + igb_getreg(TDT12), + igb_getreg(TDT13), + igb_getreg(TDT14), + igb_getreg(TDT15), + igb_getreg(TNCRS), + igb_getreg(RJC), + igb_getreg(IAM), + igb_getreg(GSCL_2), + igb_getreg(TIPG), + igb_getreg(FLMNGCTL), + igb_getreg(FLMNGCNT), + igb_getreg(TSYNCTXCTL), + igb_getreg(EEMNGDATA), + igb_getreg(CTRL_EXT), + igb_getreg(SYSTIMH), + igb_getreg(EEMNGCTL), + igb_getreg(FLMNGDATA), + igb_getreg(TSYNCRXCTL), + igb_getreg(LEDCTL), + igb_getreg(TCTL), + igb_getreg(TCTL_EXT), + igb_getreg(DTXCTL), + igb_getreg(RXPBS), + igb_getreg(TDH0), + igb_getreg(TDH1), + igb_getreg(TDH2), + igb_getreg(TDH3), + igb_getreg(TDH4), + igb_getreg(TDH5), + igb_getreg(TDH6), + igb_getreg(TDH7), + igb_getreg(TDH8), + igb_getreg(TDH9), + igb_getreg(TDH10), + igb_getreg(TDH11), + igb_getreg(TDH12), + igb_getreg(TDH13), + igb_getreg(TDH14), + igb_getreg(TDH15), + igb_getreg(ECOL), + igb_getreg(DC), + igb_getreg(RLEC), + igb_getreg(XOFFTXC), + igb_getreg(RFC), + igb_getreg(RNBC), + igb_getreg(MGTPTC), + igb_getreg(TIMINCA), + igb_getreg(FACTPS), + igb_getreg(GSCL_1), + igb_getreg(GSCN_0), + igb_getreg(PBACLR), + igb_getreg(FCTTV), + igb_getreg(RXSATRL), + igb_getreg(TORL), + igb_getreg(TDLEN0), + igb_getreg(TDLEN1), + igb_getreg(TDLEN2), + igb_getreg(TDLEN3), + igb_getreg(TDLEN4), + igb_getreg(TDLEN5), + igb_getreg(TDLEN6), + igb_getreg(TDLEN7), + igb_getreg(TDLEN8), + igb_getreg(TDLEN9), + igb_getreg(TDLEN10), + igb_getreg(TDLEN11), + igb_getreg(TDLEN12), + igb_getreg(TDLEN13), + igb_getreg(TDLEN14), + igb_getreg(TDLEN15), + igb_getreg(MCC), + igb_getreg(WUC), + igb_getreg(EECD), + igb_getreg(FCRTV), + igb_getreg(TXDCTL0), + igb_getreg(TXDCTL1), + igb_getreg(TXDCTL2), + igb_getreg(TXDCTL3), + igb_getreg(TXDCTL4), + igb_getreg(TXDCTL5), + igb_getreg(TXDCTL6), + igb_getreg(TXDCTL7), + igb_getreg(TXDCTL8), + igb_getreg(TXDCTL9), + igb_getreg(TXDCTL10), + igb_getreg(TXDCTL11), + igb_getreg(TXDCTL12), + igb_getreg(TXDCTL13), + igb_getreg(TXDCTL14), + 
igb_getreg(TXDCTL15), + igb_getreg(TXCTL0), + igb_getreg(TXCTL1), + igb_getreg(TXCTL2), + igb_getreg(TXCTL3), + igb_getreg(TXCTL4), + igb_getreg(TXCTL5), + igb_getreg(TXCTL6), + igb_getreg(TXCTL7), + igb_getreg(TXCTL8), + igb_getreg(TXCTL9), + igb_getreg(TXCTL10), + igb_getreg(TXCTL11), + igb_getreg(TXCTL12), + igb_getreg(TXCTL13), + igb_getreg(TXCTL14), + igb_getreg(TXCTL15), + igb_getreg(TDWBAL0), + igb_getreg(TDWBAL1), + igb_getreg(TDWBAL2), + igb_getreg(TDWBAL3), + igb_getreg(TDWBAL4), + igb_getreg(TDWBAL5), + igb_getreg(TDWBAL6), + igb_getreg(TDWBAL7), + igb_getreg(TDWBAL8), + igb_getreg(TDWBAL9), + igb_getreg(TDWBAL10), + igb_getreg(TDWBAL11), + igb_getreg(TDWBAL12), + igb_getreg(TDWBAL13), + igb_getreg(TDWBAL14), + igb_getreg(TDWBAL15), + igb_getreg(TDWBAH0), + igb_getreg(TDWBAH1), + igb_getreg(TDWBAH2), + igb_getreg(TDWBAH3), + igb_getreg(TDWBAH4), + igb_getreg(TDWBAH5), + igb_getreg(TDWBAH6), + igb_getreg(TDWBAH7), + igb_getreg(TDWBAH8), + igb_getreg(TDWBAH9), + igb_getreg(TDWBAH10), + igb_getreg(TDWBAH11), + igb_getreg(TDWBAH12), + igb_getreg(TDWBAH13), + igb_getreg(TDWBAH14), + igb_getreg(TDWBAH15), + igb_getreg(PVTCTRL0), + igb_getreg(PVTCTRL1), + igb_getreg(PVTCTRL2), + igb_getreg(PVTCTRL3), + igb_getreg(PVTCTRL4), + igb_getreg(PVTCTRL5), + igb_getreg(PVTCTRL6), + igb_getreg(PVTCTRL7), + igb_getreg(PVTEIMS0), + igb_getreg(PVTEIMS1), + igb_getreg(PVTEIMS2), + igb_getreg(PVTEIMS3), + igb_getreg(PVTEIMS4), + igb_getreg(PVTEIMS5), + igb_getreg(PVTEIMS6), + igb_getreg(PVTEIMS7), + igb_getreg(PVTEIAC0), + igb_getreg(PVTEIAC1), + igb_getreg(PVTEIAC2), + igb_getreg(PVTEIAC3), + igb_getreg(PVTEIAC4), + igb_getreg(PVTEIAC5), + igb_getreg(PVTEIAC6), + igb_getreg(PVTEIAC7), + igb_getreg(PVTEIAM0), + igb_getreg(PVTEIAM1), + igb_getreg(PVTEIAM2), + igb_getreg(PVTEIAM3), + igb_getreg(PVTEIAM4), + igb_getreg(PVTEIAM5), + igb_getreg(PVTEIAM6), + igb_getreg(PVTEIAM7), + igb_getreg(PVFGPRC0), + igb_getreg(PVFGPRC1), + igb_getreg(PVFGPRC2), + igb_getreg(PVFGPRC3), + 
igb_getreg(PVFGPRC4), + igb_getreg(PVFGPRC5), + igb_getreg(PVFGPRC6), + igb_getreg(PVFGPRC7), + igb_getreg(PVFGPTC0), + igb_getreg(PVFGPTC1), + igb_getreg(PVFGPTC2), + igb_getreg(PVFGPTC3), + igb_getreg(PVFGPTC4), + igb_getreg(PVFGPTC5), + igb_getreg(PVFGPTC6), + igb_getreg(PVFGPTC7), + igb_getreg(PVFGORC0), + igb_getreg(PVFGORC1), + igb_getreg(PVFGORC2), + igb_getreg(PVFGORC3), + igb_getreg(PVFGORC4), + igb_getreg(PVFGORC5), + igb_getreg(PVFGORC6), + igb_getreg(PVFGORC7), + igb_getreg(PVFGOTC0), + igb_getreg(PVFGOTC1), + igb_getreg(PVFGOTC2), + igb_getreg(PVFGOTC3), + igb_getreg(PVFGOTC4), + igb_getreg(PVFGOTC5), + igb_getreg(PVFGOTC6), + igb_getreg(PVFGOTC7), + igb_getreg(PVFMPRC0), + igb_getreg(PVFMPRC1), + igb_getreg(PVFMPRC2), + igb_getreg(PVFMPRC3), + igb_getreg(PVFMPRC4), + igb_getreg(PVFMPRC5), + igb_getreg(PVFMPRC6), + igb_getreg(PVFMPRC7), + igb_getreg(PVFGPRLBC0), + igb_getreg(PVFGPRLBC1), + igb_getreg(PVFGPRLBC2), + igb_getreg(PVFGPRLBC3), + igb_getreg(PVFGPRLBC4), + igb_getreg(PVFGPRLBC5), + igb_getreg(PVFGPRLBC6), + igb_getreg(PVFGPRLBC7), + igb_getreg(PVFGPTLBC0), + igb_getreg(PVFGPTLBC1), + igb_getreg(PVFGPTLBC2), + igb_getreg(PVFGPTLBC3), + igb_getreg(PVFGPTLBC4), + igb_getreg(PVFGPTLBC5), + igb_getreg(PVFGPTLBC6), + igb_getreg(PVFGPTLBC7), + igb_getreg(PVFGORLBC0), + igb_getreg(PVFGORLBC1), + igb_getreg(PVFGORLBC2), + igb_getreg(PVFGORLBC3), + igb_getreg(PVFGORLBC4), + igb_getreg(PVFGORLBC5), + igb_getreg(PVFGORLBC6), + igb_getreg(PVFGORLBC7), + igb_getreg(PVFGOTLBC0), + igb_getreg(PVFGOTLBC1), + igb_getreg(PVFGOTLBC2), + igb_getreg(PVFGOTLBC3), + igb_getreg(PVFGOTLBC4), + igb_getreg(PVFGOTLBC5), + igb_getreg(PVFGOTLBC6), + igb_getreg(PVFGOTLBC7), + igb_getreg(RCTL), + igb_getreg(MDIC), + igb_getreg(FCRUC), + igb_getreg(VET), + igb_getreg(RDBAL0), + igb_getreg(RDBAL1), + igb_getreg(RDBAL2), + igb_getreg(RDBAL3), + igb_getreg(RDBAL4), + igb_getreg(RDBAL5), + igb_getreg(RDBAL6), + igb_getreg(RDBAL7), + igb_getreg(RDBAL8), + igb_getreg(RDBAL9), + 
igb_getreg(RDBAL10), + igb_getreg(RDBAL11), + igb_getreg(RDBAL12), + igb_getreg(RDBAL13), + igb_getreg(RDBAL14), + igb_getreg(RDBAL15), + igb_getreg(TDBAH0), + igb_getreg(TDBAH1), + igb_getreg(TDBAH2), + igb_getreg(TDBAH3), + igb_getreg(TDBAH4), + igb_getreg(TDBAH5), + igb_getreg(TDBAH6), + igb_getreg(TDBAH7), + igb_getreg(TDBAH8), + igb_getreg(TDBAH9), + igb_getreg(TDBAH10), + igb_getreg(TDBAH11), + igb_getreg(TDBAH12), + igb_getreg(TDBAH13), + igb_getreg(TDBAH14), + igb_getreg(TDBAH15), + igb_getreg(SCC), + igb_getreg(COLC), + igb_getreg(XOFFRXC), + igb_getreg(IPAV), + igb_getreg(GOTCL), + igb_getreg(MGTPDC), + igb_getreg(GCR), + igb_getreg(MFVAL), + igb_getreg(FUNCTAG), + igb_getreg(GSCL_4), + igb_getreg(GSCN_3), + igb_getreg(MRQC), + igb_getreg(FCT), + igb_getreg(FLA), + igb_getreg(RXDCTL0), + igb_getreg(RXDCTL1), + igb_getreg(RXDCTL2), + igb_getreg(RXDCTL3), + igb_getreg(RXDCTL4), + igb_getreg(RXDCTL5), + igb_getreg(RXDCTL6), + igb_getreg(RXDCTL7), + igb_getreg(RXDCTL8), + igb_getreg(RXDCTL9), + igb_getreg(RXDCTL10), + igb_getreg(RXDCTL11), + igb_getreg(RXDCTL12), + igb_getreg(RXDCTL13), + igb_getreg(RXDCTL14), + igb_getreg(RXDCTL15), + igb_getreg(RXSTMPL), + igb_getreg(TIMADJH), + igb_getreg(FCRTL), + igb_getreg(XONRXC), + igb_getreg(RFCTL), + igb_getreg(GSCN_1), + igb_getreg(FCAL), + igb_getreg(GPIE), + igb_getreg(TXPBS), + igb_getreg(RLPML), + + [TOTH] = igb_mac_read_clr8, + [GOTCH] = igb_mac_read_clr8, + [PRC64] = igb_mac_read_clr4, + [PRC255] = igb_mac_read_clr4, + [PRC1023] = igb_mac_read_clr4, + [PTC64] = igb_mac_read_clr4, + [PTC255] = igb_mac_read_clr4, + [PTC1023] = igb_mac_read_clr4, + [GPRC] = igb_mac_read_clr4, + [TPT] = igb_mac_read_clr4, + [RUC] = igb_mac_read_clr4, + [BPRC] = igb_mac_read_clr4, + [MPTC] = igb_mac_read_clr4, + [IAC] = igb_mac_read_clr4, + [ICR] = igb_mac_icr_read, + [STATUS] = igb_get_status, + [ICS] = igb_mac_ics_read, + /* + * 8.8.10: Reading the IMC register returns the value of the IMS register. 
+ */ + [IMC] = igb_mac_ims_read, + [TORH] = igb_mac_read_clr8, + [GORCH] = igb_mac_read_clr8, + [PRC127] = igb_mac_read_clr4, + [PRC511] = igb_mac_read_clr4, + [PRC1522] = igb_mac_read_clr4, + [PTC127] = igb_mac_read_clr4, + [PTC511] = igb_mac_read_clr4, + [PTC1522] = igb_mac_read_clr4, + [GPTC] = igb_mac_read_clr4, + [TPR] = igb_mac_read_clr4, + [ROC] = igb_mac_read_clr4, + [MPRC] = igb_mac_read_clr4, + [BPTC] = igb_mac_read_clr4, + [TSCTC] = igb_mac_read_clr4, + [CTRL] = igb_get_ctrl, + [SWSM] = igb_mac_swsm_read, + [IMS] = igb_mac_ims_read, + [SYSTIML] = igb_get_systiml, + [RXSATRH] = igb_get_rxsatrh, + [TXSTMPH] = igb_get_txstmph, + + [CRCERRS ... MPC] = igb_mac_readreg, + [IP6AT ... IP6AT + 3] = igb_mac_readreg, + [IP4AT ... IP4AT + 6] = igb_mac_readreg, + [RA ... RA + 31] = igb_mac_readreg, + [RA2 ... RA2 + 31] = igb_mac_readreg, + [WUPM ... WUPM + 31] = igb_mac_readreg, + [MTA ... MTA + E1000_MC_TBL_SIZE - 1] = igb_mac_readreg, + [VFTA ... VFTA + E1000_VLAN_FILTER_TBL_SIZE - 1] = igb_mac_readreg, + [FFMT ... FFMT + 254] = igb_mac_readreg, + [MDEF ... MDEF + 7] = igb_mac_readreg, + [FTFT ... FTFT + 254] = igb_mac_readreg, + [RETA ... RETA + 31] = igb_mac_readreg, + [RSSRK ... RSSRK + 9] = igb_mac_readreg, + [MAVTV0 ... MAVTV3] = igb_mac_readreg, + [EITR0 ... EITR0 + IGB_INTR_NUM - 1] = igb_mac_eitr_read, + [PVTEICR0] = igb_mac_read_clr4, + [PVTEICR1] = igb_mac_read_clr4, + [PVTEICR2] = igb_mac_read_clr4, + [PVTEICR3] = igb_mac_read_clr4, + [PVTEICR4] = igb_mac_read_clr4, + [PVTEICR5] = igb_mac_read_clr4, + [PVTEICR6] = igb_mac_read_clr4, + [PVTEICR7] = igb_mac_read_clr4, + + /* IGB specific: */ + [FWSM] = igb_mac_readreg, + [SW_FW_SYNC] = igb_mac_readreg, + [HTCBDPC] = igb_mac_read_clr4, + [EICR] = igb_mac_read_clr4, + [EIMS] = igb_mac_readreg, + [EIAM] = igb_mac_readreg, + [IVAR0 ... IVAR0 + 7] = igb_mac_readreg, + igb_getreg(IVAR_MISC), + igb_getreg(VT_CTL), + [P2VMAILBOX0 ... P2VMAILBOX7] = igb_mac_readreg, + [V2PMAILBOX0 ... 
V2PMAILBOX7] = igb_mac_vfmailbox_read, + igb_getreg(MBVFICR), + [VMBMEM0 ... VMBMEM0 + 127] = igb_mac_readreg, + igb_getreg(MBVFIMR), + igb_getreg(VFLRE), + igb_getreg(VFRE), + igb_getreg(VFTE), + igb_getreg(QDE), + igb_getreg(DTXSWC), + igb_getreg(RPLOLR), + [VLVF0 ... VLVF0 + E1000_VLVF_ARRAY_SIZE - 1] = igb_mac_readreg, + [VMVIR0 ... VMVIR7] = igb_mac_readreg, + [VMOLR0 ... VMOLR7] = igb_mac_readreg, + [WVBR] = igb_mac_read_clr4, + [RQDPC0] = igb_mac_read_clr4, + [RQDPC1] = igb_mac_read_clr4, + [RQDPC2] = igb_mac_read_clr4, + [RQDPC3] = igb_mac_read_clr4, + [RQDPC4] = igb_mac_read_clr4, + [RQDPC5] = igb_mac_read_clr4, + [RQDPC6] = igb_mac_read_clr4, + [RQDPC7] = igb_mac_read_clr4, + [RQDPC8] = igb_mac_read_clr4, + [RQDPC9] = igb_mac_read_clr4, + [RQDPC10] = igb_mac_read_clr4, + [RQDPC11] = igb_mac_read_clr4, + [RQDPC12] = igb_mac_read_clr4, + [RQDPC13] = igb_mac_read_clr4, + [RQDPC14] = igb_mac_read_clr4, + [RQDPC15] = igb_mac_read_clr4, + [VTIVAR ... VTIVAR + 7] = igb_mac_readreg, + [VTIVAR_MISC ... 
VTIVAR_MISC + 7] = igb_mac_readreg,
+};
+enum { IGB_NREADOPS = ARRAY_SIZE(igb_macreg_readops) };
+/*
+ * Write-side dispatch table, indexed by MAC register index.
+ *
+ * igb_core_write() looks up igb_macreg_writeops[index] and, when the slot is
+ * non-NULL, calls it as handler(core, index, val).  A register that has no
+ * entry here but does have one in igb_macreg_readops is traced by
+ * igb_core_write() as a write to a read-only register.
+ *
+ * igb_putreg(x) expands to the designated initializer
+ * "[x] = igb_mac_writereg", i.e. it selects the generic handler
+ * igb_mac_writereg for register x.
+ */
+#define igb_putreg(x) [x] = igb_mac_writereg
+typedef void (*writeops)(IGBCore *, int, uint32_t);
+static const writeops igb_macreg_writeops[] = {
+    igb_putreg(SWSM),
+    igb_putreg(WUFC),
+    igb_putreg(RDBAH0),
+    igb_putreg(RDBAH1),
+    igb_putreg(RDBAH2),
+    igb_putreg(RDBAH3),
+    igb_putreg(RDBAH4),
+    igb_putreg(RDBAH5),
+    igb_putreg(RDBAH6),
+    igb_putreg(RDBAH7),
+    igb_putreg(RDBAH8),
+    igb_putreg(RDBAH9),
+    igb_putreg(RDBAH10),
+    igb_putreg(RDBAH11),
+    igb_putreg(RDBAH12),
+    igb_putreg(RDBAH13),
+    igb_putreg(RDBAH14),
+    igb_putreg(RDBAH15),
+    igb_putreg(SRRCTL0),
+    igb_putreg(SRRCTL1),
+    igb_putreg(SRRCTL2),
+    igb_putreg(SRRCTL3),
+    igb_putreg(SRRCTL4),
+    igb_putreg(SRRCTL5),
+    igb_putreg(SRRCTL6),
+    igb_putreg(SRRCTL7),
+    igb_putreg(SRRCTL8),
+    igb_putreg(SRRCTL9),
+    igb_putreg(SRRCTL10),
+    igb_putreg(SRRCTL11),
+    igb_putreg(SRRCTL12),
+    igb_putreg(SRRCTL13),
+    igb_putreg(SRRCTL14),
+    igb_putreg(SRRCTL15),
+    igb_putreg(RXDCTL0),
+    igb_putreg(RXDCTL1),
+    igb_putreg(RXDCTL2),
+    igb_putreg(RXDCTL3),
+    igb_putreg(RXDCTL4),
+    igb_putreg(RXDCTL5),
+    igb_putreg(RXDCTL6),
+    igb_putreg(RXDCTL7),
+    igb_putreg(RXDCTL8),
+    igb_putreg(RXDCTL9),
+    igb_putreg(RXDCTL10),
+    igb_putreg(RXDCTL11),
+    igb_putreg(RXDCTL12),
+    igb_putreg(RXDCTL13),
+    igb_putreg(RXDCTL14),
+    igb_putreg(RXDCTL15),
+    igb_putreg(LEDCTL),
+    igb_putreg(TCTL),
+    igb_putreg(TCTL_EXT),
+    igb_putreg(DTXCTL),
+    igb_putreg(RXPBS),
+    igb_putreg(RQDPC0),
+    igb_putreg(FCAL),
+    igb_putreg(FCRUC),
+    igb_putreg(WUC),
+    igb_putreg(WUS),
+    igb_putreg(IPAV),
+    igb_putreg(TDBAH0),
+    igb_putreg(TDBAH1),
+    igb_putreg(TDBAH2),
+    igb_putreg(TDBAH3),
+    igb_putreg(TDBAH4),
+    igb_putreg(TDBAH5),
+    igb_putreg(TDBAH6),
+    igb_putreg(TDBAH7),
+    igb_putreg(TDBAH8),
+    igb_putreg(TDBAH9),
+    igb_putreg(TDBAH10),
+    igb_putreg(TDBAH11),
+    igb_putreg(TDBAH12),
+    igb_putreg(TDBAH13),
+    igb_putreg(TDBAH14),
+    igb_putreg(TDBAH15),
+    igb_putreg(IAM),
+ igb_putreg(MANC), + igb_putreg(MANC2H), + igb_putreg(MFVAL), + igb_putreg(FACTPS), + igb_putreg(FUNCTAG), + igb_putreg(GSCL_1), + igb_putreg(GSCL_2), + igb_putreg(GSCL_3), + igb_putreg(GSCL_4), + igb_putreg(GSCN_0), + igb_putreg(GSCN_1), + igb_putreg(GSCN_2), + igb_putreg(GSCN_3), + igb_putreg(MRQC), + igb_putreg(FLOP), + igb_putreg(FLA), + igb_putreg(TXDCTL0), + igb_putreg(TXDCTL1), + igb_putreg(TXDCTL2), + igb_putreg(TXDCTL3), + igb_putreg(TXDCTL4), + igb_putreg(TXDCTL5), + igb_putreg(TXDCTL6), + igb_putreg(TXDCTL7), + igb_putreg(TXDCTL8), + igb_putreg(TXDCTL9), + igb_putreg(TXDCTL10), + igb_putreg(TXDCTL11), + igb_putreg(TXDCTL12), + igb_putreg(TXDCTL13), + igb_putreg(TXDCTL14), + igb_putreg(TXDCTL15), + igb_putreg(TXCTL0), + igb_putreg(TXCTL1), + igb_putreg(TXCTL2), + igb_putreg(TXCTL3), + igb_putreg(TXCTL4), + igb_putreg(TXCTL5), + igb_putreg(TXCTL6), + igb_putreg(TXCTL7), + igb_putreg(TXCTL8), + igb_putreg(TXCTL9), + igb_putreg(TXCTL10), + igb_putreg(TXCTL11), + igb_putreg(TXCTL12), + igb_putreg(TXCTL13), + igb_putreg(TXCTL14), + igb_putreg(TXCTL15), + igb_putreg(TDWBAL0), + igb_putreg(TDWBAL1), + igb_putreg(TDWBAL2), + igb_putreg(TDWBAL3), + igb_putreg(TDWBAL4), + igb_putreg(TDWBAL5), + igb_putreg(TDWBAL6), + igb_putreg(TDWBAL7), + igb_putreg(TDWBAL8), + igb_putreg(TDWBAL9), + igb_putreg(TDWBAL10), + igb_putreg(TDWBAL11), + igb_putreg(TDWBAL12), + igb_putreg(TDWBAL13), + igb_putreg(TDWBAL14), + igb_putreg(TDWBAL15), + igb_putreg(TDWBAH0), + igb_putreg(TDWBAH1), + igb_putreg(TDWBAH2), + igb_putreg(TDWBAH3), + igb_putreg(TDWBAH4), + igb_putreg(TDWBAH5), + igb_putreg(TDWBAH6), + igb_putreg(TDWBAH7), + igb_putreg(TDWBAH8), + igb_putreg(TDWBAH9), + igb_putreg(TDWBAH10), + igb_putreg(TDWBAH11), + igb_putreg(TDWBAH12), + igb_putreg(TDWBAH13), + igb_putreg(TDWBAH14), + igb_putreg(TDWBAH15), + igb_putreg(TIPG), + igb_putreg(RXSTMPH), + igb_putreg(RXSTMPL), + igb_putreg(RXSATRL), + igb_putreg(RXSATRH), + igb_putreg(TXSTMPL), + igb_putreg(TXSTMPH), + 
igb_putreg(SYSTIML), + igb_putreg(SYSTIMH), + igb_putreg(TIMADJL), + igb_putreg(TSYNCRXCTL), + igb_putreg(TSYNCTXCTL), + igb_putreg(EEMNGCTL), + igb_putreg(GPIE), + igb_putreg(TXPBS), + igb_putreg(RLPML), + igb_putreg(VET), + + [TDH0] = igb_set_16bit, + [TDH1] = igb_set_16bit, + [TDH2] = igb_set_16bit, + [TDH3] = igb_set_16bit, + [TDH4] = igb_set_16bit, + [TDH5] = igb_set_16bit, + [TDH6] = igb_set_16bit, + [TDH7] = igb_set_16bit, + [TDH8] = igb_set_16bit, + [TDH9] = igb_set_16bit, + [TDH10] = igb_set_16bit, + [TDH11] = igb_set_16bit, + [TDH12] = igb_set_16bit, + [TDH13] = igb_set_16bit, + [TDH14] = igb_set_16bit, + [TDH15] = igb_set_16bit, + [TDT0] = igb_set_tdt, + [TDT1] = igb_set_tdt, + [TDT2] = igb_set_tdt, + [TDT3] = igb_set_tdt, + [TDT4] = igb_set_tdt, + [TDT5] = igb_set_tdt, + [TDT6] = igb_set_tdt, + [TDT7] = igb_set_tdt, + [TDT8] = igb_set_tdt, + [TDT9] = igb_set_tdt, + [TDT10] = igb_set_tdt, + [TDT11] = igb_set_tdt, + [TDT12] = igb_set_tdt, + [TDT13] = igb_set_tdt, + [TDT14] = igb_set_tdt, + [TDT15] = igb_set_tdt, + [MDIC] = igb_set_mdic, + [ICS] = igb_set_ics, + [RDH0] = igb_set_16bit, + [RDH1] = igb_set_16bit, + [RDH2] = igb_set_16bit, + [RDH3] = igb_set_16bit, + [RDH4] = igb_set_16bit, + [RDH5] = igb_set_16bit, + [RDH6] = igb_set_16bit, + [RDH7] = igb_set_16bit, + [RDH8] = igb_set_16bit, + [RDH9] = igb_set_16bit, + [RDH10] = igb_set_16bit, + [RDH11] = igb_set_16bit, + [RDH12] = igb_set_16bit, + [RDH13] = igb_set_16bit, + [RDH14] = igb_set_16bit, + [RDH15] = igb_set_16bit, + [RDT0] = igb_set_rdt, + [RDT1] = igb_set_rdt, + [RDT2] = igb_set_rdt, + [RDT3] = igb_set_rdt, + [RDT4] = igb_set_rdt, + [RDT5] = igb_set_rdt, + [RDT6] = igb_set_rdt, + [RDT7] = igb_set_rdt, + [RDT8] = igb_set_rdt, + [RDT9] = igb_set_rdt, + [RDT10] = igb_set_rdt, + [RDT11] = igb_set_rdt, + [RDT12] = igb_set_rdt, + [RDT13] = igb_set_rdt, + [RDT14] = igb_set_rdt, + [RDT15] = igb_set_rdt, + [IMC] = igb_set_imc, + [IMS] = igb_set_ims, + [ICR] = igb_set_icr, + [EECD] = igb_set_eecd, + 
[RCTL] = igb_set_rx_control, + [CTRL] = igb_set_ctrl, + [EERD] = igb_set_eerd, + [TDFH] = igb_set_13bit, + [TDFT] = igb_set_13bit, + [TDFHS] = igb_set_13bit, + [TDFTS] = igb_set_13bit, + [TDFPC] = igb_set_13bit, + [RDFH] = igb_set_13bit, + [RDFT] = igb_set_13bit, + [RDFHS] = igb_set_13bit, + [RDFTS] = igb_set_13bit, + [RDFPC] = igb_set_13bit, + [GCR] = igb_set_gcr, + [RXCSUM] = igb_set_rxcsum, + [TDLEN0] = igb_set_dlen, + [TDLEN1] = igb_set_dlen, + [TDLEN2] = igb_set_dlen, + [TDLEN3] = igb_set_dlen, + [TDLEN4] = igb_set_dlen, + [TDLEN5] = igb_set_dlen, + [TDLEN6] = igb_set_dlen, + [TDLEN7] = igb_set_dlen, + [TDLEN8] = igb_set_dlen, + [TDLEN9] = igb_set_dlen, + [TDLEN10] = igb_set_dlen, + [TDLEN11] = igb_set_dlen, + [TDLEN12] = igb_set_dlen, + [TDLEN13] = igb_set_dlen, + [TDLEN14] = igb_set_dlen, + [TDLEN15] = igb_set_dlen, + [RDLEN0] = igb_set_dlen, + [RDLEN1] = igb_set_dlen, + [RDLEN2] = igb_set_dlen, + [RDLEN3] = igb_set_dlen, + [RDLEN4] = igb_set_dlen, + [RDLEN5] = igb_set_dlen, + [RDLEN6] = igb_set_dlen, + [RDLEN7] = igb_set_dlen, + [RDLEN8] = igb_set_dlen, + [RDLEN9] = igb_set_dlen, + [RDLEN10] = igb_set_dlen, + [RDLEN11] = igb_set_dlen, + [RDLEN12] = igb_set_dlen, + [RDLEN13] = igb_set_dlen, + [RDLEN14] = igb_set_dlen, + [RDLEN15] = igb_set_dlen, + [TDBAL0] = igb_set_dbal, + [TDBAL1] = igb_set_dbal, + [TDBAL2] = igb_set_dbal, + [TDBAL3] = igb_set_dbal, + [TDBAL4] = igb_set_dbal, + [TDBAL5] = igb_set_dbal, + [TDBAL6] = igb_set_dbal, + [TDBAL7] = igb_set_dbal, + [TDBAL8] = igb_set_dbal, + [TDBAL9] = igb_set_dbal, + [TDBAL10] = igb_set_dbal, + [TDBAL11] = igb_set_dbal, + [TDBAL12] = igb_set_dbal, + [TDBAL13] = igb_set_dbal, + [TDBAL14] = igb_set_dbal, + [TDBAL15] = igb_set_dbal, + [RDBAL0] = igb_set_dbal, + [RDBAL1] = igb_set_dbal, + [RDBAL2] = igb_set_dbal, + [RDBAL3] = igb_set_dbal, + [RDBAL4] = igb_set_dbal, + [RDBAL5] = igb_set_dbal, + [RDBAL6] = igb_set_dbal, + [RDBAL7] = igb_set_dbal, + [RDBAL8] = igb_set_dbal, + [RDBAL9] = igb_set_dbal, + [RDBAL10] = 
igb_set_dbal, + [RDBAL11] = igb_set_dbal, + [RDBAL12] = igb_set_dbal, + [RDBAL13] = igb_set_dbal, + [RDBAL14] = igb_set_dbal, + [RDBAL15] = igb_set_dbal, + [STATUS] = igb_set_status, + [PBACLR] = igb_set_pbaclr, + [CTRL_EXT] = igb_set_ctrlext, + [FCAH] = igb_set_16bit, + [FCT] = igb_set_16bit, + [FCTTV] = igb_set_16bit, + [FCRTV] = igb_set_16bit, + [FCRTH] = igb_set_fcrth, + [FCRTL] = igb_set_fcrtl, + [CTRL_DUP] = igb_set_ctrl, + [RFCTL] = igb_set_rfctl, + [TIMINCA] = igb_set_timinca, + [TIMADJH] = igb_set_timadjh, + + [IP6AT ... IP6AT + 3] = igb_mac_writereg, + [IP4AT ... IP4AT + 6] = igb_mac_writereg, + [RA] = igb_mac_writereg, + [RA + 1] = igb_mac_setmacaddr, + [RA + 2 ... RA + 31] = igb_mac_writereg, + [RA2 ... RA2 + 31] = igb_mac_writereg, + [WUPM ... WUPM + 31] = igb_mac_writereg, + [MTA ... MTA + E1000_MC_TBL_SIZE - 1] = igb_mac_writereg, + [VFTA ... VFTA + E1000_VLAN_FILTER_TBL_SIZE - 1] = igb_mac_writereg, + [FFMT ... FFMT + 254] = igb_set_4bit, + [MDEF ... MDEF + 7] = igb_mac_writereg, + [FTFT ... FTFT + 254] = igb_mac_writereg, + [RETA ... RETA + 31] = igb_mac_writereg, + [RSSRK ... RSSRK + 9] = igb_mac_writereg, + [MAVTV0 ... MAVTV3] = igb_mac_writereg, + [EITR0 ... EITR0 + IGB_INTR_NUM - 1] = igb_set_eitr, + + /* IGB specific: */ + [FWSM] = igb_mac_writereg, + [SW_FW_SYNC] = igb_mac_writereg, + [EICR] = igb_set_eicr, + [EICS] = igb_set_eics, + [EIAC] = igb_set_eiac, + [EIAM] = igb_set_eiam, + [EIMC] = igb_set_eimc, + [EIMS] = igb_set_eims, + [IVAR0 ... IVAR0 + 7] = igb_mac_writereg, + igb_putreg(IVAR_MISC), + igb_putreg(VT_CTL), + [P2VMAILBOX0 ... P2VMAILBOX7] = igb_set_pfmailbox, + [V2PMAILBOX0 ... V2PMAILBOX7] = igb_set_vfmailbox, + [MBVFICR] = igb_w1c, + [VMBMEM0 ... VMBMEM0 + 127] = igb_mac_writereg, + igb_putreg(MBVFIMR), + [VFLRE] = igb_w1c, + igb_putreg(VFRE), + igb_putreg(VFTE), + igb_putreg(QDE), + igb_putreg(DTXSWC), + igb_putreg(RPLOLR), + [VLVF0 ... VLVF0 + E1000_VLVF_ARRAY_SIZE - 1] = igb_mac_writereg, + [VMVIR0 ... 
VMVIR7] = igb_mac_writereg, + [VMOLR0 ... VMOLR7] = igb_mac_writereg, + [UTA ... UTA + E1000_MC_TBL_SIZE - 1] = igb_mac_writereg, + [PVTCTRL0] = igb_set_vtctrl, + [PVTCTRL1] = igb_set_vtctrl, + [PVTCTRL2] = igb_set_vtctrl, + [PVTCTRL3] = igb_set_vtctrl, + [PVTCTRL4] = igb_set_vtctrl, + [PVTCTRL5] = igb_set_vtctrl, + [PVTCTRL6] = igb_set_vtctrl, + [PVTCTRL7] = igb_set_vtctrl, + [PVTEICS0] = igb_set_vteics, + [PVTEICS1] = igb_set_vteics, + [PVTEICS2] = igb_set_vteics, + [PVTEICS3] = igb_set_vteics, + [PVTEICS4] = igb_set_vteics, + [PVTEICS5] = igb_set_vteics, + [PVTEICS6] = igb_set_vteics, + [PVTEICS7] = igb_set_vteics, + [PVTEIMS0] = igb_set_vteims, + [PVTEIMS1] = igb_set_vteims, + [PVTEIMS2] = igb_set_vteims, + [PVTEIMS3] = igb_set_vteims, + [PVTEIMS4] = igb_set_vteims, + [PVTEIMS5] = igb_set_vteims, + [PVTEIMS6] = igb_set_vteims, + [PVTEIMS7] = igb_set_vteims, + [PVTEIMC0] = igb_set_vteimc, + [PVTEIMC1] = igb_set_vteimc, + [PVTEIMC2] = igb_set_vteimc, + [PVTEIMC3] = igb_set_vteimc, + [PVTEIMC4] = igb_set_vteimc, + [PVTEIMC5] = igb_set_vteimc, + [PVTEIMC6] = igb_set_vteimc, + [PVTEIMC7] = igb_set_vteimc, + [PVTEIAC0] = igb_set_vteiac, + [PVTEIAC1] = igb_set_vteiac, + [PVTEIAC2] = igb_set_vteiac, + [PVTEIAC3] = igb_set_vteiac, + [PVTEIAC4] = igb_set_vteiac, + [PVTEIAC5] = igb_set_vteiac, + [PVTEIAC6] = igb_set_vteiac, + [PVTEIAC7] = igb_set_vteiac, + [PVTEIAM0] = igb_set_vteiam, + [PVTEIAM1] = igb_set_vteiam, + [PVTEIAM2] = igb_set_vteiam, + [PVTEIAM3] = igb_set_vteiam, + [PVTEIAM4] = igb_set_vteiam, + [PVTEIAM5] = igb_set_vteiam, + [PVTEIAM6] = igb_set_vteiam, + [PVTEIAM7] = igb_set_vteiam, + [PVTEICR0] = igb_set_vteicr, + [PVTEICR1] = igb_set_vteicr, + [PVTEICR2] = igb_set_vteicr, + [PVTEICR3] = igb_set_vteicr, + [PVTEICR4] = igb_set_vteicr, + [PVTEICR5] = igb_set_vteicr, + [PVTEICR6] = igb_set_vteicr, + [PVTEICR7] = igb_set_vteicr, + [VTIVAR ... VTIVAR + 7] = igb_set_vtivar, + [VTIVAR_MISC ... 
VTIVAR_MISC + 7] = igb_mac_writereg
+};
+enum { IGB_NWRITEOPS = ARRAY_SIZE(igb_macreg_writeops) };
+
+/* Flag kept in bit 0 of a mac_reg_access[] entry (see below). */
+enum { MAC_ACCESS_PARTIAL = 1 };
+
+/*
+ * Per-register access information, indexed by MAC register index.
+ *
+ * Each entry combines two things:
+ *  - for a register alias (the *_A indices), the offset from the alias index
+ *    to the index of the register it stands for;
+ *  - MAC_ACCESS_PARTIAL in the lowest bit for registers that are not fully
+ *    implemented.
+ * Sharing one uint16_t is possible because all of the offsets are even.
+ *
+ * The per-queue aliases follow a fixed pattern: queue n of an RX register
+ * uses 0x2600 - n * 0x30 and queue n of a TX register uses 0x2A00 - n * 0x30,
+ * so every queue-3 TX alias, TXCTL3_A included, is 0x2970.
+ *
+ * NOTE(review): the offset is consumed by igb_get_reg_index_with_offset(),
+ * which is defined elsewhere -- presumably it adds the offset to the alias
+ * index; confirm there.
+ */
+static const uint16_t mac_reg_access[E1000E_MAC_SIZE] = {
+    /* Alias index offsets */
+    [FCRTL_A] = 0x07fe,
+    [RDFH_A]  = 0xe904, [RDFT_A] = 0xe904,
+    [TDFH_A]  = 0xed00, [TDFT_A] = 0xed00,
+    [RA_A ... RA_A + 31] = 0x14f0,
+    [VFTA_A ... VFTA_A + E1000_VLAN_FILTER_TBL_SIZE - 1] = 0x1400,
+
+    /* RX queue 0 aliases */
+    [RDBAL0_A]  = 0x2600,
+    [RDBAH0_A]  = 0x2600,
+    [RDLEN0_A]  = 0x2600,
+    [SRRCTL0_A] = 0x2600,
+    [RDH0_A]    = 0x2600,
+    [RDT0_A]    = 0x2600,
+    [RXDCTL0_A] = 0x2600,
+    [RXCTL0_A]  = 0x2600,
+    [RQDPC0_A]  = 0x2600,
+    /* RX queue 1..3 aliases */
+    [RDBAL1_A]  = 0x25D0,
+    [RDBAL2_A]  = 0x25A0,
+    [RDBAL3_A]  = 0x2570,
+    [RDBAH1_A]  = 0x25D0,
+    [RDBAH2_A]  = 0x25A0,
+    [RDBAH3_A]  = 0x2570,
+    [RDLEN1_A]  = 0x25D0,
+    [RDLEN2_A]  = 0x25A0,
+    [RDLEN3_A]  = 0x2570,
+    [SRRCTL1_A] = 0x25D0,
+    [SRRCTL2_A] = 0x25A0,
+    [SRRCTL3_A] = 0x2570,
+    [RDH1_A]    = 0x25D0,
+    [RDH2_A]    = 0x25A0,
+    [RDH3_A]    = 0x2570,
+    [RDT1_A]    = 0x25D0,
+    [RDT2_A]    = 0x25A0,
+    [RDT3_A]    = 0x2570,
+    [RXDCTL1_A] = 0x25D0,
+    [RXDCTL2_A] = 0x25A0,
+    [RXDCTL3_A] = 0x2570,
+    [RXCTL1_A]  = 0x25D0,
+    [RXCTL2_A]  = 0x25A0,
+    [RXCTL3_A]  = 0x2570,
+    [RQDPC1_A]  = 0x25D0,
+    [RQDPC2_A]  = 0x25A0,
+    [RQDPC3_A]  = 0x2570,
+    /* TX queue 0 aliases */
+    [TDBAL0_A]  = 0x2A00,
+    [TDBAH0_A]  = 0x2A00,
+    [TDLEN0_A]  = 0x2A00,
+    [TDH0_A]    = 0x2A00,
+    [TDT0_A]    = 0x2A00,
+    [TXCTL0_A]  = 0x2A00,
+    [TDWBAL0_A] = 0x2A00,
+    [TDWBAH0_A] = 0x2A00,
+    /* TX queue 1..3 aliases */
+    [TDBAL1_A]  = 0x29D0,
+    [TDBAL2_A]  = 0x29A0,
+    [TDBAL3_A]  = 0x2970,
+    [TDBAH1_A]  = 0x29D0,
+    [TDBAH2_A]  = 0x29A0,
+    [TDBAH3_A]  = 0x2970,
+    [TDLEN1_A]  = 0x29D0,
+    [TDLEN2_A]  = 0x29A0,
+    [TDLEN3_A]  = 0x2970,
+    [TDH1_A]    = 0x29D0,
+    [TDH2_A]    = 0x29A0,
+    [TDH3_A]    = 0x2970,
+    [TDT1_A]    = 0x29D0,
+    [TDT2_A]    = 0x29A0,
+    [TDT3_A]    = 0x2970,
+    [TXDCTL0_A] = 0x2A00,
+    [TXDCTL1_A] = 0x29D0,
+    [TXDCTL2_A] = 0x29A0,
+    [TXDCTL3_A] = 0x2970,
+    [TXCTL1_A]  = 0x29D0,
+    [TXCTL2_A]  = 0x29A0,
+    [TXCTL3_A]  = 0x2970,
+    [TDWBAL1_A] = 0x29D0,
+    [TDWBAL2_A] = 0x29A0,
+    [TDWBAL3_A] = 0x2970,
+    [TDWBAH1_A] = 0x29D0,
+    [TDWBAH2_A] = 0x29A0,
+    [TDWBAH3_A] = 0x2970,
+
+    /* Access options */
+    [RDFH]  = MAC_ACCESS_PARTIAL,    [RDFT]  = MAC_ACCESS_PARTIAL,
+    [RDFHS] = MAC_ACCESS_PARTIAL,    [RDFTS] = MAC_ACCESS_PARTIAL,
+    [RDFPC] = MAC_ACCESS_PARTIAL,
+    [TDFH]  = MAC_ACCESS_PARTIAL,    [TDFT]  = MAC_ACCESS_PARTIAL,
+    [TDFHS] = MAC_ACCESS_PARTIAL,    [TDFTS] = MAC_ACCESS_PARTIAL,
+    [TDFPC] = MAC_ACCESS_PARTIAL,    [EECD]  = MAC_ACCESS_PARTIAL,
+    [FLA]   = MAC_ACCESS_PARTIAL,
+    [FCAL]  = MAC_ACCESS_PARTIAL,    [FCAH]  = MAC_ACCESS_PARTIAL,
+    [FCT]   = MAC_ACCESS_PARTIAL,    [FCTTV] = MAC_ACCESS_PARTIAL,
+    [FCRTV] = MAC_ACCESS_PARTIAL,    [FCRTL] = MAC_ACCESS_PARTIAL,
+    [FCRTH] = MAC_ACCESS_PARTIAL,
+    [MAVTV0 ... MAVTV3] = MAC_ACCESS_PARTIAL
+};
+
+/*
+ * Handle a guest write of @val to the MAC register at @addr.
+ *
+ * The register index comes from igb_get_reg_index_with_offset().  If a write
+ * handler exists it is called; otherwise the access is only traced: as a
+ * write to a read-only register when a read handler exists, or as a write to
+ * an unknown register when neither does.  @size is used for tracing only.
+ */
+void
+igb_core_write(IGBCore *core, hwaddr addr, uint64_t val, unsigned size)
+{
+    uint16_t index = igb_get_reg_index_with_offset(mac_reg_access, addr);
+
+    if (index < IGB_NWRITEOPS && igb_macreg_writeops[index]) {
+        if (mac_reg_access[index] & MAC_ACCESS_PARTIAL) {
+            trace_e1000e_wrn_regs_write_trivial(index << 2);
+        }
+        trace_e1000e_core_write(index << 2, size, val);
+        igb_macreg_writeops[index](core, index, val);
+    } else if (index < IGB_NREADOPS && igb_macreg_readops[index]) {
+        trace_e1000e_wrn_regs_write_ro(index << 2, size, val);
+    } else {
+        trace_e1000e_wrn_regs_write_unknown(index << 2, size, val);
+    }
+}
+
+/*
+ * Handle a guest read of the MAC register at @addr.
+ *
+ * Returns the value produced by the register's read handler, or 0 (after
+ * tracing an unknown-register warning) when no read handler exists.
+ */
+uint64_t
+igb_core_read(IGBCore *core, hwaddr addr, unsigned size)
+{
+    uint64_t val;
+    uint16_t index = igb_get_reg_index_with_offset(mac_reg_access, addr);
+
+    if (index < IGB_NREADOPS && igb_macreg_readops[index]) {
+        if (mac_reg_access[index] & MAC_ACCESS_PARTIAL) {
+            trace_e1000e_wrn_regs_read_trivial(index << 2);
+        }
+        val = igb_macreg_readops[index](core, index);
+
trace_e1000e_core_read(index << 2, size, val);
+        return val;
+    } else {
+        trace_e1000e_wrn_regs_read_unknown(index << 2, size);
+    }
+    return 0;
+}
+
+/* Cancel the auto-negotiation timer. */
+static inline void
+igb_autoneg_pause(IGBCore *core)
+{
+    timer_del(core->autoneg_timer);
+}
+
+/*
+ * Re-arm the auto-negotiation timer (500 ms of virtual time from now) and
+ * mark the link as not down, but only when auto-negotiation is available
+ * and MII_BMSR_AN_COMP is not yet set in the PHY status register.
+ */
+static void
+igb_autoneg_resume(IGBCore *core)
+{
+    if (!igb_have_autoneg(core)) {
+        return;
+    }
+    if (core->phy[MII_BMSR] & MII_BMSR_AN_COMP) {
+        return;
+    }
+
+    qemu_get_queue(core->owner_nic)->link_down = false;
+    timer_mod(core->autoneg_timer,
+              qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + 500);
+}
+
+/*
+ * VM run-state callback registered in igb_core_pci_realize().
+ * On resume the interrupt manager is restarted before auto-negotiation;
+ * on stop the two are paused in the opposite order.
+ */
+static void
+igb_vm_state_change(void *opaque, bool running, RunState state)
+{
+    IGBCore *s = opaque;
+
+    if (!running) {
+        trace_e1000e_vm_state_stopped();
+        igb_autoneg_pause(s);
+        igb_intrmgr_pause(s);
+        return;
+    }
+
+    trace_e1000e_vm_state_running();
+    igb_intrmgr_resume(s);
+    igb_autoneg_resume(s);
+}
+
+/*
+ * Set up the parts of @core that live for the whole device lifetime:
+ * the auto-negotiation timer, the interrupt manager, the VM state-change
+ * handler, one TX packet per queue, the RX packet and the EEPROM image
+ * (built from @eeprom_templ / @eeprom_size, the PCI device id and @macaddr).
+ */
+void
+igb_core_pci_realize(IGBCore *core,
+                     const uint16_t *eeprom_templ,
+                     uint32_t eeprom_size,
+                     const uint8_t *macaddr)
+{
+    int q;
+
+    core->autoneg_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
+                                       igb_autoneg_timer, core);
+    igb_intrmgr_pci_realize(core);
+
+    core->vmstate = qemu_add_vm_change_state_handler(igb_vm_state_change,
+                                                     core);
+
+    for (q = 0; q < IGB_NUM_QUEUES; q++) {
+        struct igb_tx *txq = &core->tx[q];
+
+        net_tx_pkt_init(&txq->tx_pkt, core->owner, E1000E_MAX_TX_FRAGS);
+    }
+
+    net_rx_pkt_init(&core->rx_pkt);
+
+    e1000x_core_prepare_eeprom(core->eeprom,
+                               eeprom_templ,
+                               eeprom_size,
+                               PCI_DEVICE_GET_CLASS(core->owner)->device_id,
+                               macaddr);
+    igb_update_rx_offloads(core);
+}
+
+/*
+ * Release what igb_core_pci_realize() set up: the auto-negotiation timer,
+ * the interrupt manager, the VM state-change handler, and the per-queue TX
+ * packets (each reset before it is uninitialised) plus the RX packet.
+ */
+void
+igb_core_pci_uninit(IGBCore *core)
+{
+    int q;
+
+    timer_free(core->autoneg_timer);
+
+    igb_intrmgr_pci_unint(core);
+
+    qemu_del_vm_change_state_handler(core->vmstate);
+
+    for (q = 0; q < IGB_NUM_QUEUES; q++) {
+        struct NetTxPkt *pkt = core->tx[q].tx_pkt;
+
+        net_tx_pkt_reset(pkt);
+        net_tx_pkt_uninit(pkt);
+    }
+
+    net_rx_pkt_uninit(core->rx_pkt);
+}
+
+static const uint16_t
+igb_phy_reg_init[] = {
+    [MII_BMCR] = MII_BMCR_SPEED1000 |
+                 MII_BMCR_FD |
+                 MII_BMCR_AUTOEN,
+
+    [MII_BMSR] =
MII_BMSR_EXTCAP |
+                 MII_BMSR_LINK_ST |
+                 MII_BMSR_AUTONEG |
+                 MII_BMSR_MFPS |
+                 MII_BMSR_EXTSTAT |
+                 MII_BMSR_10T_HD |
+                 MII_BMSR_10T_FD |
+                 MII_BMSR_100TX_HD |
+                 MII_BMSR_100TX_FD,
+
+    [MII_PHYID1] = IGP03E1000_E_PHY_ID >> 16,
+    [MII_PHYID2] = (IGP03E1000_E_PHY_ID & 0xfff0) | 1,
+    [MII_ANAR] = MII_ANAR_CSMACD | MII_ANAR_10 |
+                 MII_ANAR_10FD | MII_ANAR_TX |
+                 MII_ANAR_TXFD | MII_ANAR_PAUSE |
+                 MII_ANAR_PAUSE_ASYM,
+    [MII_ANLPAR] = MII_ANLPAR_10 | MII_ANLPAR_10FD |
+                   MII_ANLPAR_TX | MII_ANLPAR_TXFD |
+                   MII_ANLPAR_T4 | MII_ANLPAR_PAUSE,
+    [MII_ANER] = MII_ANER_NP | MII_ANER_NWAY,
+    [MII_ANNP] = 0x1 | MII_ANNP_MP,
+    [MII_CTRL1000] = MII_CTRL1000_HALF | MII_CTRL1000_FULL |
+                     MII_CTRL1000_PORT | MII_CTRL1000_MASTER,
+    [MII_STAT1000] = MII_STAT1000_HALF | MII_STAT1000_FULL |
+                     MII_STAT1000_ROK | MII_STAT1000_LOK,
+    [MII_EXTSTAT] = MII_EXTSTAT_1000T_HD | MII_EXTSTAT_1000T_FD,
+
+    [IGP01E1000_PHY_PORT_CONFIG] = BIT(5) | BIT(8),
+    [IGP01E1000_PHY_PORT_STATUS] = IGP01E1000_PSSR_SPEED_1000MBPS,
+    [IGP02E1000_PHY_POWER_MGMT] = BIT(0) | BIT(3) | IGP02E1000_PM_D3_LPLU |
+                                  IGP01E1000_PSCFR_SMART_SPEED
+};
+/*
+ * Reset values of the MAC registers, indexed by register index.
+ *
+ * igb_reset() copies entry i into core->mac[i] for every i below
+ * ARRAY_SIZE(igb_mac_reg_init) and writes 0 to every other register, so a
+ * register that is not listed here resets to 0.
+ */
+static const uint32_t igb_mac_reg_init[] = {
+    [LEDCTL] = 2 | (3 << 8) | BIT(15) | (6 << 16) | (7 << 24),
+    [EEMNGCTL] = BIT(31),
+    /* Only RXDCTL0 carries E1000_RXDCTL_QUEUE_ENABLE in this table. */
+    [RXDCTL0] = E1000_RXDCTL_QUEUE_ENABLE | (1 << 16),
+    [RXDCTL1] = 1 << 16,
+    [RXDCTL2] = 1 << 16,
+    [RXDCTL3] = 1 << 16,
+    [RXDCTL4] = 1 << 16,
+    [RXDCTL5] = 1 << 16,
+    [RXDCTL6] = 1 << 16,
+    [RXDCTL7] = 1 << 16,
+    [RXDCTL8] = 1 << 16,
+    [RXDCTL9] = 1 << 16,
+    [RXDCTL10] = 1 << 16,
+    [RXDCTL11] = 1 << 16,
+    [RXDCTL12] = 1 << 16,
+    [RXDCTL13] = 1 << 16,
+    [RXDCTL14] = 1 << 16,
+    [RXDCTL15] = 1 << 16,
+    [TIPG] = 0x08 | (0x04 << 10) | (0x06 << 20),
+    [CTRL] = E1000_CTRL_FD | E1000_CTRL_LRST | E1000_CTRL_SPD_1000 |
+             E1000_CTRL_ADVD3WUC,
+    [STATUS] = E1000_STATUS_PHYRA | BIT(31),
+    [EECD] = E1000_EECD_FWE_DIS | E1000_EECD_PRES |
+             (2 << E1000_EECD_SIZE_EX_SHIFT),
+    [GCR] = E1000_L0S_ADJUST |
+            E1000_GCR_CMPL_TMOUT_RESEND |
+            E1000_GCR_CAP_VER2 |
+            E1000_L1_ENTRY_LATENCY_MSB |
+            E1000_L1_ENTRY_LATENCY_LSB,
+    [RXCSUM] = E1000_RXCSUM_IPOFLD | E1000_RXCSUM_TUOFLD,
+    [TXPBS] = 0x28,
+    [RXPBS] = 0x40,
+    [TCTL] = E1000_TCTL_PSP | (0xF << E1000_CT_SHIFT) |
+             (0x40 << E1000_COLD_SHIFT) | (0x1 << 26) | (0xA << 28),
+    [TCTL_EXT] = 0x40 | (0x42 << 10),
+    [DTXCTL] = E1000_DTXCTL_8023LL | E1000_DTXCTL_SPOOF_INT,
+    [VET] = ETH_P_VLAN | (ETH_P_VLAN << 16),
+
+    [V2PMAILBOX0 ... V2PMAILBOX0 + IGB_MAX_VF_FUNCTIONS - 1] = E1000_V2PMAILBOX_RSTI,
+    [MBVFIMR] = 0xFF,
+    [VFRE] = 0xFF,
+    [VFTE] = 0xFF,
+    [VMOLR0 ... VMOLR0 + 7] = 0x2600 | E1000_VMOLR_STRCRC,
+    [RPLOLR] = E1000_RPLOLR_STRCRC,
+    [RLPML] = 0x2600,
+    /* Every TXCTLn entry below has the same value. */
+    [TXCTL0] = E1000_DCA_TXCTRL_DATA_RRO_EN |
+               E1000_DCA_TXCTRL_TX_WB_RO_EN |
+               E1000_DCA_TXCTRL_DESC_RRO_EN,
+    [TXCTL1] = E1000_DCA_TXCTRL_DATA_RRO_EN |
+               E1000_DCA_TXCTRL_TX_WB_RO_EN |
+               E1000_DCA_TXCTRL_DESC_RRO_EN,
+    [TXCTL2] = E1000_DCA_TXCTRL_DATA_RRO_EN |
+               E1000_DCA_TXCTRL_TX_WB_RO_EN |
+               E1000_DCA_TXCTRL_DESC_RRO_EN,
+    [TXCTL3] = E1000_DCA_TXCTRL_DATA_RRO_EN |
+               E1000_DCA_TXCTRL_TX_WB_RO_EN |
+               E1000_DCA_TXCTRL_DESC_RRO_EN,
+    [TXCTL4] = E1000_DCA_TXCTRL_DATA_RRO_EN |
+               E1000_DCA_TXCTRL_TX_WB_RO_EN |
+               E1000_DCA_TXCTRL_DESC_RRO_EN,
+    [TXCTL5] = E1000_DCA_TXCTRL_DATA_RRO_EN |
+               E1000_DCA_TXCTRL_TX_WB_RO_EN |
+               E1000_DCA_TXCTRL_DESC_RRO_EN,
+    [TXCTL6] = E1000_DCA_TXCTRL_DATA_RRO_EN |
+               E1000_DCA_TXCTRL_TX_WB_RO_EN |
+               E1000_DCA_TXCTRL_DESC_RRO_EN,
+    [TXCTL7] = E1000_DCA_TXCTRL_DATA_RRO_EN |
+               E1000_DCA_TXCTRL_TX_WB_RO_EN |
+               E1000_DCA_TXCTRL_DESC_RRO_EN,
+    [TXCTL8] = E1000_DCA_TXCTRL_DATA_RRO_EN |
+               E1000_DCA_TXCTRL_TX_WB_RO_EN |
+               E1000_DCA_TXCTRL_DESC_RRO_EN,
+    [TXCTL9] = E1000_DCA_TXCTRL_DATA_RRO_EN |
+               E1000_DCA_TXCTRL_TX_WB_RO_EN |
+               E1000_DCA_TXCTRL_DESC_RRO_EN,
+    [TXCTL10] = E1000_DCA_TXCTRL_DATA_RRO_EN |
+                E1000_DCA_TXCTRL_TX_WB_RO_EN |
+                E1000_DCA_TXCTRL_DESC_RRO_EN,
+    [TXCTL11] = E1000_DCA_TXCTRL_DATA_RRO_EN |
+                E1000_DCA_TXCTRL_TX_WB_RO_EN |
+                E1000_DCA_TXCTRL_DESC_RRO_EN,
+    [TXCTL12] =
E1000_DCA_TXCTRL_DATA_RRO_EN | + E1000_DCA_TXCTRL_TX_WB_RO_EN | + E1000_DCA_TXCTRL_DESC_RRO_EN, + [TXCTL13] = E1000_DCA_TXCTRL_DATA_RRO_EN | + E1000_DCA_TXCTRL_TX_WB_RO_EN | + E1000_DCA_TXCTRL_DESC_RRO_EN, + [TXCTL14] = E1000_DCA_TXCTRL_DATA_RRO_EN | + E1000_DCA_TXCTRL_TX_WB_RO_EN | + E1000_DCA_TXCTRL_DESC_RRO_EN, + [TXCTL15] = E1000_DCA_TXCTRL_DATA_RRO_EN | + E1000_DCA_TXCTRL_TX_WB_RO_EN | + E1000_DCA_TXCTRL_DESC_RRO_EN, +}; + +static void igb_reset(IGBCore *core, bool sw) +{ + struct igb_tx *tx; + int i; + + timer_del(core->autoneg_timer); + + igb_intrmgr_reset(core); + + memset(core->phy, 0, sizeof core->phy); + memcpy(core->phy, igb_phy_reg_init, sizeof igb_phy_reg_init); + + for (i = 0; i < E1000E_MAC_SIZE; i++) { + if (sw && + (i == RXPBS || i == TXPBS || + (i >= EITR0 && i < EITR0 + IGB_INTR_NUM))) { + continue; + } + + core->mac[i] = i < ARRAY_SIZE(igb_mac_reg_init) ? + igb_mac_reg_init[i] : 0; + } + + if (qemu_get_queue(core->owner_nic)->link_down) { + igb_link_down(core); + } + + e1000x_reset_mac_addr(core->owner_nic, core->mac, core->permanent_mac); + + for (i = 0; i < ARRAY_SIZE(core->tx); i++) { + tx = &core->tx[i]; + net_tx_pkt_reset(tx->tx_pkt); + tx->vlan = 0; + tx->mss = 0; + tx->tse = false; + tx->ixsm = false; + tx->txsm = false; + tx->first = true; + tx->skip_cp = false; + } +} + +void +igb_core_reset(IGBCore *core) +{ + igb_reset(core, false); +} + +void igb_core_pre_save(IGBCore *core) +{ + int i; + NetClientState *nc = qemu_get_queue(core->owner_nic); + + /* + * If link is down and auto-negotiation is supported and ongoing, + * complete auto-negotiation immediately. This allows us to look + * at MII_BMSR_AN_COMP to infer link status on load. 
+ */ + if (nc->link_down && igb_have_autoneg(core)) { + core->phy[MII_BMSR] |= MII_BMSR_AN_COMP; + igb_update_flowctl_status(core); + } + + for (i = 0; i < ARRAY_SIZE(core->tx); i++) { + if (net_tx_pkt_has_fragments(core->tx[i].tx_pkt)) { + core->tx[i].skip_cp = true; + } + } +} + +int +igb_core_post_load(IGBCore *core) +{ + NetClientState *nc = qemu_get_queue(core->owner_nic); + + /* + * nc.link_down can't be migrated, so infer link_down according + * to link status bit in core.mac[STATUS]. + */ + nc->link_down = (core->mac[STATUS] & E1000_STATUS_LU) == 0; + + return 0; +} diff --git a/hw/net/igb_core.h b/hw/net/igb_core.h new file mode 100644 index 0000000000..814c1e264b --- /dev/null +++ b/hw/net/igb_core.h @@ -0,0 +1,146 @@ +/* + * Core code for QEMU igb emulation + * + * Datasheet: + * https://www.intel.com/content/dam/www/public/us/en/documents/datasheets/82576eg-gbe-datasheet.pdf + * + * Copyright (c) 2020-2023 Red Hat, Inc. + * Copyright (c) 2015 Ravello Systems LTD (http://ravellosystems.com) + * Developed by Daynix Computing LTD (http://www.daynix.com) + * + * Authors: + * Akihiko Odaki <akihiko.odaki@daynix.com> + * Gal Hammmer <gal.hammer@sap.com> + * Marcel Apfelbaum <marcel.apfelbaum@gmail.com> + * Dmitry Fleytman <dmitry@daynix.com> + * Leonid Bloch <leonid@daynix.com> + * Yan Vugenfirer <yan@daynix.com> + * + * Based on work done by: + * Nir Peleg, Tutis Systems Ltd. for Qumranet Inc. + * Copyright (c) 2008 Qumranet + * Based on work done by: + * Copyright (c) 2007 Dan Aloni + * Copyright (c) 2004 Antony T Curtis + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef HW_NET_IGB_CORE_H +#define HW_NET_IGB_CORE_H + +#define E1000E_MAC_SIZE (0x8000) +#define IGB_EEPROM_SIZE (1024) + +#define IGB_INTR_NUM (25) +#define IGB_MSIX_VEC_NUM (10) +#define IGBVF_MSIX_VEC_NUM (3) +#define IGB_NUM_QUEUES (16) + +typedef struct IGBCore IGBCore; + +enum { PHY_R = BIT(0), + PHY_W = BIT(1), + PHY_RW = PHY_R | PHY_W }; + +typedef struct IGBIntrDelayTimer_st { + QEMUTimer *timer; + bool running; + uint32_t delay_reg; + uint32_t delay_resolution_ns; + IGBCore *core; +} IGBIntrDelayTimer; + +struct IGBCore { + uint32_t mac[E1000E_MAC_SIZE]; + uint16_t phy[MAX_PHY_REG_ADDRESS + 1]; + uint16_t eeprom[IGB_EEPROM_SIZE]; + + uint8_t rx_desc_len; + + QEMUTimer *autoneg_timer; + + struct igb_tx { + uint16_t vlan; /* VLAN Tag */ + uint16_t mss; /* Maximum Segment Size */ + bool tse; /* TCP/UDP Segmentation Enable */ + bool ixsm; /* Insert IP Checksum */ + bool txsm; /* Insert TCP/UDP Checksum */ + + bool first; + bool skip_cp; + + struct NetTxPkt *tx_pkt; + } tx[IGB_NUM_QUEUES]; + + struct NetRxPkt *rx_pkt; + + bool has_vnet; + int max_queue_num; + + IGBIntrDelayTimer eitr[IGB_INTR_NUM]; + + VMChangeStateEntry *vmstate; + + uint32_t eitr_guest_value[IGB_INTR_NUM]; + + uint8_t permanent_mac[ETH_ALEN]; + + NICState *owner_nic; + PCIDevice *owner; + void (*owner_start_recv)(PCIDevice *d); + + int64_t timadj; +}; + +void +igb_core_write(IGBCore *core, hwaddr addr, uint64_t val, unsigned size); + +uint64_t +igb_core_read(IGBCore *core, hwaddr addr, unsigned size); + +void +igb_core_pci_realize(IGBCore *regs, + const uint16_t *eeprom_templ, 
+ uint32_t eeprom_size, + const uint8_t *macaddr); + +void +igb_core_reset(IGBCore *core); + +void +igb_core_pre_save(IGBCore *core); + +int +igb_core_post_load(IGBCore *core); + +void +igb_core_set_link_status(IGBCore *core); + +void +igb_core_pci_uninit(IGBCore *core); + +bool +igb_can_receive(IGBCore *core); + +ssize_t +igb_receive(IGBCore *core, const uint8_t *buf, size_t size); + +ssize_t +igb_receive_iov(IGBCore *core, const struct iovec *iov, int iovcnt); + +void +igb_start_recv(IGBCore *core); + +#endif diff --git a/hw/net/igb_regs.h b/hw/net/igb_regs.h new file mode 100644 index 0000000000..00934d4f20 --- /dev/null +++ b/hw/net/igb_regs.h @@ -0,0 +1,648 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This is copied + edited from kernel header files in + * drivers/net/ethernet/intel/igb + */ + +#ifndef HW_IGB_REGS_H_ +#define HW_IGB_REGS_H_ + +#include "e1000x_regs.h" + +/* from igb/e1000_hw.h */ + +#define E1000_DEV_ID_82576 0x10C9 +#define E1000_DEV_ID_82576_FIBER 0x10E6 +#define E1000_DEV_ID_82576_SERDES 0x10E7 +#define E1000_DEV_ID_82576_QUAD_COPPER 0x10E8 +#define E1000_DEV_ID_82576_QUAD_COPPER_ET2 0x1526 +#define E1000_DEV_ID_82576_NS 0x150A +#define E1000_DEV_ID_82576_NS_SERDES 0x1518 +#define E1000_DEV_ID_82576_SERDES_QUAD 0x150D + +/* Context Descriptor */ +struct e1000_adv_tx_context_desc { + uint32_t vlan_macip_lens; + uint32_t seqnum_seed; + uint32_t type_tucmd_mlhl; + uint32_t mss_l4len_idx; +}; + +/* Advanced Transmit Descriptor */ +union e1000_adv_tx_desc { + struct { + uint64_t buffer_addr; /* Address of descriptor's data buffer */ + uint32_t cmd_type_len; + uint32_t olinfo_status; + } read; + struct { + uint64_t rsvd; /* Reserved */ + uint32_t nxtseq_seed; + uint32_t status; + } wb; +}; + +#define E1000_ADVTXD_DTYP_CTXT 0x00200000 /* Advanced Context Descriptor */ +#define E1000_ADVTXD_DTYP_DATA 0x00300000 /* Advanced Data Descriptor */ +#define E1000_ADVTXD_DCMD_DEXT 0x20000000 /* Descriptor Extension (1=Adv) */ +#define 
E1000_ADVTXD_DCMD_TSE 0x80000000 /* TCP/UDP Segmentation Enable */ + +#define E1000_ADVTXD_POTS_IXSM 0x00000100 /* Insert IP Checksum */ +#define E1000_ADVTXD_POTS_TXSM 0x00000200 /* Insert TCP/UDP Checksum */ + +#define E1000_TXD_POPTS_IXSM 0x00000001 /* Insert IP checksum */ +#define E1000_TXD_POPTS_TXSM 0x00000002 /* Insert TCP/UDP checksum */ + +/* Receive Descriptor - Advanced */ +union e1000_adv_rx_desc { + struct { + uint64_t pkt_addr; /* Packet Buffer Address */ + uint64_t hdr_addr; /* Header Buffer Address */ + } read; + struct { + struct { + struct { + uint16_t pkt_info; /* RSS Type, Packet Type */ + uint16_t hdr_info; /* Split Head, Buffer Length */ + } lo_dword; + union { + uint32_t rss; /* RSS Hash */ + struct { + uint16_t ip_id; /* IP Id */ + uint16_t csum; /* Packet Checksum */ + } csum_ip; + } hi_dword; + } lower; + struct { + uint32_t status_error; /* Ext Status/Error */ + uint16_t length; /* Packet Length */ + uint16_t vlan; /* VLAN tag */ + } upper; + } wb; /* writeback */ +}; + +/* from igb/e1000_phy.h */ + +/* IGP01E1000 Specific Registers */ +#define IGP01E1000_PHY_PORT_CONFIG 0x10 /* Port Config */ +#define IGP01E1000_PHY_PORT_STATUS 0x11 /* Status */ +#define IGP01E1000_PHY_PORT_CTRL 0x12 /* Control */ +#define IGP01E1000_PHY_LINK_HEALTH 0x13 /* PHY Link Health */ +#define IGP02E1000_PHY_POWER_MGMT 0x19 /* Power Management */ +#define IGP01E1000_PHY_PAGE_SELECT 0x1F /* Page Select */ +#define IGP01E1000_PHY_PCS_INIT_REG 0x00B4 +#define IGP01E1000_PHY_POLARITY_MASK 0x0078 +#define IGP01E1000_PSCR_AUTO_MDIX 0x1000 +#define IGP01E1000_PSCR_FORCE_MDI_MDIX 0x2000 /* 0=MDI, 1=MDIX */ +#define IGP01E1000_PSCFR_SMART_SPEED 0x0080 + +/* Enable flexible speed on link-up */ +#define IGP02E1000_PM_D0_LPLU 0x0002 /* For D0a states */ +#define IGP02E1000_PM_D3_LPLU 0x0004 /* For all other states */ +#define IGP01E1000_PLHR_SS_DOWNGRADE 0x8000 +#define IGP01E1000_PSSR_POLARITY_REVERSED 0x0002 +#define IGP01E1000_PSSR_MDIX 0x0800 +#define
IGP01E1000_PSSR_SPEED_MASK 0xC000 +#define IGP01E1000_PSSR_SPEED_1000MBPS 0xC000 +#define IGP02E1000_PHY_CHANNEL_NUM 4 +#define IGP02E1000_PHY_AGC_A 0x11B1 +#define IGP02E1000_PHY_AGC_B 0x12B1 +#define IGP02E1000_PHY_AGC_C 0x14B1 +#define IGP02E1000_PHY_AGC_D 0x18B1 +#define IGP02E1000_AGC_LENGTH_SHIFT 9 /* Coarse - 15:13, Fine - 12:9 */ +#define IGP02E1000_AGC_LENGTH_MASK 0x7F +#define IGP02E1000_AGC_RANGE 15 + +/* from igb/igb.h */ + +#define E1000_PCS_CFG_IGN_SD 1 + +/* Interrupt defines */ +#define IGB_START_ITR 648 /* ~6000 ints/sec */ +#define IGB_4K_ITR 980 +#define IGB_20K_ITR 196 +#define IGB_70K_ITR 56 + +/* TX/RX descriptor defines */ +#define IGB_DEFAULT_TXD 256 +#define IGB_DEFAULT_TX_WORK 128 +#define IGB_MIN_TXD 80 +#define IGB_MAX_TXD 4096 + +#define IGB_DEFAULT_RXD 256 +#define IGB_MIN_RXD 80 +#define IGB_MAX_RXD 4096 + +#define IGB_DEFAULT_ITR 3 /* dynamic */ +#define IGB_MAX_ITR_USECS 10000 +#define IGB_MIN_ITR_USECS 10 +#define NON_Q_VECTORS 1 +#define MAX_Q_VECTORS 8 +#define MAX_MSIX_ENTRIES 10 + +/* Transmit and receive queues */ +#define IGB_MAX_RX_QUEUES 8 +#define IGB_MAX_RX_QUEUES_82575 4 +#define IGB_MAX_RX_QUEUES_I211 2 +#define IGB_MAX_TX_QUEUES 8 +#define IGB_MAX_VF_MC_ENTRIES 30 +#define IGB_MAX_VF_FUNCTIONS 8 +#define IGB_MAX_VFTA_ENTRIES 128 +#define IGB_82576_VF_DEV_ID 0x10CA +#define IGB_I350_VF_DEV_ID 0x1520 + +/* from igb/e1000_82575.h */ + +#define E1000_MRQC_ENABLE_RSS_MQ 0x00000002 +#define E1000_MRQC_ENABLE_VMDQ 0x00000003 +#define E1000_MRQC_RSS_FIELD_IPV4_UDP 0x00400000 +#define E1000_MRQC_ENABLE_VMDQ_RSS_MQ 0x00000005 +#define E1000_MRQC_RSS_FIELD_IPV6_UDP 0x00800000 +#define E1000_MRQC_RSS_FIELD_IPV6_UDP_EX 0x01000000 + +/* Additional Receive Descriptor Control definitions */ +#define E1000_RXDCTL_QUEUE_ENABLE 0x02000000 /* Enable specific Rx Queue */ + +/* Direct Cache Access (DCA) definitions */ +#define E1000_DCA_CTRL_DCA_MODE_DISABLE 0x01 /* DCA Disable */ +#define E1000_DCA_CTRL_DCA_MODE_CB2 0x02 /* DCA Mode CB2 */
+ +#define E1000_DCA_RXCTRL_CPUID_MASK 0x0000001F /* Rx CPUID Mask */ +#define E1000_DCA_RXCTRL_DESC_DCA_EN BIT(5) /* DCA Rx Desc enable */ +#define E1000_DCA_RXCTRL_HEAD_DCA_EN BIT(6) /* DCA Rx Desc header enable */ +#define E1000_DCA_RXCTRL_DATA_DCA_EN BIT(7) /* DCA Rx Desc payload enable */ +#define E1000_DCA_RXCTRL_DESC_RRO_EN BIT(9) /* DCA Rx rd Desc Relax Order */ + +#define E1000_DCA_TXCTRL_CPUID_MASK 0x0000001F /* Tx CPUID Mask */ +#define E1000_DCA_TXCTRL_DESC_DCA_EN BIT(5) /* DCA Tx Desc enable */ +#define E1000_DCA_TXCTRL_DESC_RRO_EN BIT(9) /* Tx rd Desc Relax Order */ +#define E1000_DCA_TXCTRL_TX_WB_RO_EN BIT(11) /* Tx Desc writeback RO bit */ +#define E1000_DCA_TXCTRL_DATA_RRO_EN BIT(13) /* Tx rd data Relax Order */ + +/* Additional DCA related definitions, note change in position of CPUID */ +#define E1000_DCA_TXCTRL_CPUID_MASK_82576 0xFF000000 /* Tx CPUID Mask */ +#define E1000_DCA_RXCTRL_CPUID_MASK_82576 0xFF000000 /* Rx CPUID Mask */ +#define E1000_DCA_TXCTRL_CPUID_SHIFT 24 /* Tx CPUID now in the last byte */ +#define E1000_DCA_RXCTRL_CPUID_SHIFT 24 /* Rx CPUID now in the last byte */ + +#define E1000_DTXSWC_MAC_SPOOF_MASK 0x000000FF /* Per VF MAC spoof control */ +#define E1000_DTXSWC_VLAN_SPOOF_MASK 0x0000FF00 /* Per VF VLAN spoof control */ +#define E1000_DTXSWC_LLE_MASK 0x00FF0000 /* Per VF Local LB enables */ +#define E1000_DTXSWC_VLAN_SPOOF_SHIFT 8 +#define E1000_DTXSWC_VMDQ_LOOPBACK_EN BIT(31) /* global VF LB enable */ + +/* Easy defines for setting default pool, would normally be left a zero */ +#define E1000_VT_CTL_DEFAULT_POOL_SHIFT 7 +#define E1000_VT_CTL_DEFAULT_POOL_MASK (0x7 << E1000_VT_CTL_DEFAULT_POOL_SHIFT) + +/* Other useful VMD_CTL register defines */ +#define E1000_VT_CTL_IGNORE_MAC BIT(28) +#define E1000_VT_CTL_DISABLE_DEF_POOL BIT(29) +#define E1000_VT_CTL_VM_REPL_EN BIT(30) + +/* Per VM Offload register setup */ +#define E1000_VMOLR_RLPML_MASK 0x00003FFF /* Long Packet Maximum Length mask */ +#define E1000_VMOLR_LPE 
0x00010000 /* Accept Long packet */ +#define E1000_VMOLR_RSSE 0x00020000 /* Enable RSS */ +#define E1000_VMOLR_AUPE 0x01000000 /* Accept untagged packets */ +#define E1000_VMOLR_ROMPE 0x02000000 /* Accept overflow multicast */ +#define E1000_VMOLR_ROPE 0x04000000 /* Accept overflow unicast */ +#define E1000_VMOLR_BAM 0x08000000 /* Accept Broadcast packets */ +#define E1000_VMOLR_MPME 0x10000000 /* Multicast promiscuous mode */ +#define E1000_VMOLR_STRVLAN 0x40000000 /* Vlan stripping enable */ +#define E1000_VMOLR_STRCRC 0x80000000 /* CRC stripping enable */ + +#define E1000_DVMOLR_HIDEVLAN 0x20000000 /* Hide vlan enable */ +#define E1000_DVMOLR_STRVLAN 0x40000000 /* Vlan stripping enable */ +#define E1000_DVMOLR_STRCRC 0x80000000 /* CRC stripping enable */ + +#define E1000_VLVF_ARRAY_SIZE 32 +#define E1000_VLVF_VLANID_MASK 0x00000FFF +#define E1000_VLVF_POOLSEL_SHIFT 12 +#define E1000_VLVF_POOLSEL_MASK (0xFF << E1000_VLVF_POOLSEL_SHIFT) +#define E1000_VLVF_LVLAN 0x00100000 +#define E1000_VLVF_VLANID_ENABLE 0x80000000 + +#define E1000_VMVIR_VLANA_DEFAULT 0x40000000 /* Always use default VLAN */ +#define E1000_VMVIR_VLANA_NEVER 0x80000000 /* Never insert VLAN tag */ + +#define E1000_IOVCTL 0x05BBC +#define E1000_IOVCTL_REUSE_VFQ 0x00000001 + +#define E1000_RPLOLR_STRVLAN 0x40000000 +#define E1000_RPLOLR_STRCRC 0x80000000 + +#define E1000_DTXCTL_8023LL 0x0004 +#define E1000_DTXCTL_VLAN_ADDED 0x0008 +#define E1000_DTXCTL_OOS_ENABLE 0x0010 +#define E1000_DTXCTL_MDP_EN 0x0020 +#define E1000_DTXCTL_SPOOF_INT 0x0040 + +/* from igb/e1000_defines.h */ + +#define E1000_IVAR_VALID 0x80 +#define E1000_GPIE_NSICR 0x00000001 +#define E1000_GPIE_MSIX_MODE 0x00000010 +#define E1000_GPIE_EIAME 0x40000000 +#define E1000_GPIE_PBA 0x80000000 + +/* Transmit Control */ +#define E1000_TCTL_EN 0x00000002 /* enable tx */ +#define E1000_TCTL_PSP 0x00000008 /* pad short packets */ +#define E1000_TCTL_CT 0x00000ff0 /* collision threshold */ +#define E1000_TCTL_COLD 0x003ff000 /* collision 
distance */ +#define E1000_TCTL_RTLC 0x01000000 /* Re-transmit on late collision */ + +/* Collision related configuration parameters */ +#define E1000_COLLISION_THRESHOLD 15 +#define E1000_CT_SHIFT 4 +#define E1000_COLLISION_DISTANCE 63 +#define E1000_COLD_SHIFT 12 + +#define E1000_RAH_POOL_MASK 0x03FC0000 +#define E1000_RAH_POOL_1 0x00040000 + +#define E1000_ICR_VMMB 0x00000100 /* VM MB event */ +#define E1000_ICR_TS 0x00080000 /* Time Sync Interrupt */ +#define E1000_ICR_DRSTA 0x40000000 /* Device Reset Asserted */ +/* If this bit asserted, the driver should claim the interrupt */ +#define E1000_ICR_INT_ASSERTED 0x80000000 +/* LAN connected device generates an interrupt */ +#define E1000_ICR_DOUTSYNC 0x10000000 /* NIC DMA out of sync */ + +/* Extended Interrupt Cause Read */ +#define E1000_EICR_RX_QUEUE0 0x00000001 /* Rx Queue 0 Interrupt */ +#define E1000_EICR_RX_QUEUE1 0x00000002 /* Rx Queue 1 Interrupt */ +#define E1000_EICR_RX_QUEUE2 0x00000004 /* Rx Queue 2 Interrupt */ +#define E1000_EICR_RX_QUEUE3 0x00000008 /* Rx Queue 3 Interrupt */ +#define E1000_EICR_TX_QUEUE0 0x00000100 /* Tx Queue 0 Interrupt */ +#define E1000_EICR_TX_QUEUE1 0x00000200 /* Tx Queue 1 Interrupt */ +#define E1000_EICR_TX_QUEUE2 0x00000400 /* Tx Queue 2 Interrupt */ +#define E1000_EICR_TX_QUEUE3 0x00000800 /* Tx Queue 3 Interrupt */ +#define E1000_EICR_OTHER 0x80000000 /* Interrupt Cause Active */ + +/* Extended Interrupt Cause Set */ +/* E1000_EITR_CNT_IGNR is only for 82576 and newer */ +#define E1000_EITR_CNT_IGNR 0x80000000 /* Don't reset counters on write */ + +/* PCI Express Control */ +#define E1000_GCR_CMPL_TMOUT_MASK 0x0000F000 +#define E1000_GCR_CMPL_TMOUT_10ms 0x00001000 +#define E1000_GCR_CMPL_TMOUT_RESEND 0x00010000 +#define E1000_GCR_CAP_VER2 0x00040000 + +#define PHY_REVISION_MASK 0xFFFFFFF0 +#define MAX_PHY_REG_ADDRESS 0x1F /* 5 bit address bus (0-0x1F) */ +#define MAX_PHY_MULTI_PAGE_REG 0xF + +#define IGP03E1000_E_PHY_ID 0x02A80390 + +/* from igb/e1000_mbox.h */ + 
+#define E1000_P2VMAILBOX_STS 0x00000001 /* Initiate message send to VF */ +#define E1000_P2VMAILBOX_ACK 0x00000002 /* Ack message recv'd from VF */ +#define E1000_P2VMAILBOX_VFU 0x00000004 /* VF owns the mailbox buffer */ +#define E1000_P2VMAILBOX_PFU 0x00000008 /* PF owns the mailbox buffer */ +#define E1000_P2VMAILBOX_RVFU 0x00000010 /* Reset VFU - used when VF stuck */ + +#define E1000_MBVFICR_VFREQ_MASK 0x000000FF /* bits for VF messages */ +#define E1000_MBVFICR_VFREQ_VF1 0x00000001 /* bit for VF 1 message */ +#define E1000_MBVFICR_VFACK_MASK 0x00FF0000 /* bits for VF acks */ +#define E1000_MBVFICR_VFACK_VF1 0x00010000 /* bit for VF 1 ack */ + +#define E1000_V2PMAILBOX_SIZE 16 /* 16 32 bit words - 64 bytes */ + +/* + * If it's a E1000_VF_* msg then it originates in the VF and is sent to the + * PF. The reverse is true if it is E1000_PF_*. + * Message ACK's are the value or'd with 0x80000000 + */ +/* Messages below or'd with this are the ACK */ +#define E1000_VT_MSGTYPE_ACK 0x80000000 +/* Messages below or'd with this are the NACK */ +#define E1000_VT_MSGTYPE_NACK 0x40000000 +/* Indicates that VF is still clear to send requests */ +#define E1000_VT_MSGTYPE_CTS 0x20000000 +#define E1000_VT_MSGINFO_SHIFT 16 +/* bits 23:16 are used for extra info for certain messages */ +#define E1000_VT_MSGINFO_MASK (0xFF << E1000_VT_MSGINFO_SHIFT) + +#define E1000_VF_RESET 0x01 /* VF requests reset */ +#define E1000_VF_SET_MAC_ADDR 0x02 /* VF requests to set MAC addr */ +/* VF requests to clear all unicast MAC filters */ +#define E1000_VF_MAC_FILTER_CLR (0x01 << E1000_VT_MSGINFO_SHIFT) +/* VF requests to add unicast MAC filter */ +#define E1000_VF_MAC_FILTER_ADD (0x02 << E1000_VT_MSGINFO_SHIFT) +#define E1000_VF_SET_MULTICAST 0x03 /* VF requests to set MC addr */ +#define E1000_VF_SET_VLAN 0x04 /* VF requests to set VLAN */ +#define E1000_VF_SET_LPE 0x05 /* VF requests to set VMOLR.LPE */ +#define E1000_VF_SET_PROMISC 0x06 /*VF requests to clear VMOLR.ROPE/MPME*/ +#define
E1000_VF_SET_PROMISC_MULTICAST (0x02 << E1000_VT_MSGINFO_SHIFT) + +#define E1000_PF_CONTROL_MSG 0x0100 /* PF control message */ + +/* from igb/e1000_regs.h */ + +#define E1000_EICR 0x01580 /* Ext. Interrupt Cause Read - R/clr */ +#define E1000_EITR(_n) (0x01680 + (0x4 * (_n))) +#define E1000_EICS 0x01520 /* Ext. Interrupt Cause Set - W0 */ +#define E1000_EIMS 0x01524 /* Ext. Interrupt Mask Set/Read - RW */ +#define E1000_EIMC 0x01528 /* Ext. Interrupt Mask Clear - WO */ +#define E1000_EIAC 0x0152C /* Ext. Interrupt Auto Clear - RW */ +#define E1000_EIAM 0x01530 /* Ext. Interrupt Ack Auto Clear Mask - RW */ +#define E1000_GPIE 0x01514 /* General Purpose Interrupt Enable; RW */ +#define E1000_IVAR0 0x01700 /* Interrupt Vector Allocation Register - RW */ +#define E1000_IVAR_MISC 0x01740 /* Interrupt Vector Allocation Register (last) - RW */ +#define E1000_FRTIMER 0x01048 /* Free Running Timer - RW */ +#define E1000_FCRTV 0x02460 /* Flow Control Refresh Timer Value - RW */ + +#define E1000_RQDPC(_n) (0x0C030 + ((_n) * 0x40)) + +#define E1000_RXPBS 0x02404 /* Rx Packet Buffer Size - RW */ +#define E1000_TXPBS 0x03404 /* Tx Packet Buffer Size - RW */ + +#define E1000_DTXCTL 0x03590 /* DMA TX Control - RW */ + +#define E1000_HTCBDPC 0x04124 /* Host TX Circuit Breaker Dropped Count */ +#define E1000_RLPML 0x05004 /* RX Long Packet Max Length */ +#define E1000_RA2 0x054E0 /* 2nd half of Rx address array - RW Array */ +#define E1000_PSRTYPE(_i) (0x05480 + ((_i) * 4)) +#define E1000_VT_CTL 0x0581C /* VMDq Control - RW */ + +/* VT Registers */ +#define E1000_MBVFICR 0x00C80 /* Mailbox VF Cause - RWC */ +#define E1000_MBVFIMR 0x00C84 /* Mailbox VF int Mask - RW */ +#define E1000_VFLRE 0x00C88 /* VF Register Events - RWC */ +#define E1000_VFRE 0x00C8C /* VF Receive Enables */ +#define E1000_VFTE 0x00C90 /* VF Transmit Enables */ +#define E1000_QDE 0x02408 /* Queue Drop Enable - RW */ +#define E1000_DTXSWC 0x03500 /* DMA Tx Switch Control - RW */ +#define E1000_WVBR 0x03554 /* VM 
Wrong Behavior - RWS */ +#define E1000_RPLOLR 0x05AF0 /* Replication Offload - RW */ +#define E1000_UTA 0x0A000 /* Unicast Table Array - RW */ +#define E1000_IOVTCL 0x05BBC /* IOV Control Register */ +#define E1000_TXSWC 0x05ACC /* Tx Switch Control */ +#define E1000_LVMMC 0x03548 /* Last VM Misbehavior cause */ +/* These act per VF so an array friendly macro is used */ +#define E1000_P2VMAILBOX(_n) (0x00C00 + (4 * (_n))) +#define E1000_VMBMEM(_n) (0x00800 + (64 * (_n))) +#define E1000_VMOLR(_n) (0x05AD0 + (4 * (_n))) +#define E1000_DVMOLR(_n) (0x0C038 + (64 * (_n))) +#define E1000_VLVF(_n) (0x05D00 + (4 * (_n))) /* VLAN VM Filter */ +#define E1000_VMVIR(_n) (0x03700 + (4 * (_n))) + +/* from igbvf/defines.h */ + +/* SRRCTL bit definitions */ +#define E1000_SRRCTL_BSIZEPKT_SHIFT 10 /* Shift _right_ */ +#define E1000_SRRCTL_BSIZEHDRSIZE_MASK 0x00000F00 +#define E1000_SRRCTL_BSIZEHDRSIZE_SHIFT 2 /* Shift _left_ */ +#define E1000_SRRCTL_DESCTYPE_ADV_ONEBUF 0x02000000 +#define E1000_SRRCTL_DESCTYPE_HDR_SPLIT_ALWAYS 0x0A000000 +#define E1000_SRRCTL_DESCTYPE_MASK 0x0E000000 +#define E1000_SRRCTL_DROP_EN 0x80000000 + +#define E1000_SRRCTL_BSIZEPKT_MASK 0x0000007F +#define E1000_SRRCTL_BSIZEHDR_MASK 0x00003F00 + +/* from igbvf/mbox.h */ + +#define E1000_V2PMAILBOX_REQ 0x00000001 /* Request for PF Ready bit */ +#define E1000_V2PMAILBOX_ACK 0x00000002 /* Ack PF message received */ +#define E1000_V2PMAILBOX_VFU 0x00000004 /* VF owns the mailbox buffer */ +#define E1000_V2PMAILBOX_PFU 0x00000008 /* PF owns the mailbox buffer */ +#define E1000_V2PMAILBOX_PFSTS 0x00000010 /* PF wrote a message in the MB */ +#define E1000_V2PMAILBOX_PFACK 0x00000020 /* PF ack the previous VF msg */ +#define E1000_V2PMAILBOX_RSTI 0x00000040 /* PF has reset indication */ +#define E1000_V2PMAILBOX_RSTD 0x00000080 /* PF has indicated reset done */ +#define E1000_V2PMAILBOX_R2C_BITS 0x000000B0 /* All read to clear bits */ + +#define E1000_VFMAILBOX_SIZE 16 /* 16 32 bit words - 64 bytes */ + +/* + * If 
it's a E1000_VF_* msg then it originates in the VF and is sent to the + * PF. The reverse is true if it is E1000_PF_*. + * Message ACK's are the value or'd with 0x80000000 + */ +/* Messages below or'd with this are the ACK */ +#define E1000_VT_MSGTYPE_ACK 0x80000000 +/* Messages below or'd with this are the NACK */ +#define E1000_VT_MSGTYPE_NACK 0x40000000 +/* Indicates that VF is still clear to send requests */ +#define E1000_VT_MSGTYPE_CTS 0x20000000 + +/* We have a total wait time of 1s for vf mailbox posted messages */ +#define E1000_VF_MBX_INIT_TIMEOUT 2000 /* retry count for mbx timeout */ +#define E1000_VF_MBX_INIT_DELAY 500 /* usec delay between retries */ + +#define E1000_VT_MSGINFO_SHIFT 16 +/* bits 23:16 are used for extra info for certain messages */ +#define E1000_VT_MSGINFO_MASK (0xFF << E1000_VT_MSGINFO_SHIFT) + +#define E1000_VF_RESET 0x01 /* VF requests reset */ +#define E1000_VF_SET_MAC_ADDR 0x02 /* VF requests PF to set MAC addr */ +/* VF requests PF to clear all unicast MAC filters */ +#define E1000_VF_MAC_FILTER_CLR (0x01 << E1000_VT_MSGINFO_SHIFT) +/* VF requests PF to add unicast MAC filter */ +#define E1000_VF_MAC_FILTER_ADD (0x02 << E1000_VT_MSGINFO_SHIFT) +#define E1000_VF_SET_MULTICAST 0x03 /* VF requests PF to set MC addr */ +#define E1000_VF_SET_VLAN 0x04 /* VF requests PF to set VLAN */ +#define E1000_VF_SET_LPE 0x05 /* VF requests PF to set VMOLR.LPE */ + +#define E1000_PF_CONTROL_MSG 0x0100 /* PF control message */ + +/* from igbvf/regs.h */ + +/* Statistics registers */ +#define E1000_VFGPRC 0x00F10 +#define E1000_VFGORC 0x00F18 +#define E1000_VFMPRC 0x00F3C +#define E1000_VFGPTC 0x00F14 +#define E1000_VFGOTC 0x00F34 +#define E1000_VFGOTLBC 0x00F50 +#define E1000_VFGPTLBC 0x00F44 +#define E1000_VFGORLBC 0x00F48 +#define E1000_VFGPRLBC 0x00F40 + +/* These act per VF so an array friendly macro is used */ +#define E1000_V2PMAILBOX(_n) (0x00C40 + (4 * (_n))) +#define E1000_VMBMEM(_n) (0x00800 + (64 * (_n))) + +/* from igbvf/vf.h */ +
+#define E1000_DEV_ID_82576_VF 0x10CA + +/* new */ + +/* Receive Registers */ + +/* RX Descriptor Base Low; RW */ +#define E1000_RDBAL(_n) (0x0C000 + (0x40 * (_n))) +#define E1000_RDBAL_A(_n) (0x02800 + (0x100 * (_n))) + +/* RX Descriptor Base High; RW */ +#define E1000_RDBAH(_n) (0x0C004 + (0x40 * (_n))) +#define E1000_RDBAH_A(_n) (0x02804 + (0x100 * (_n))) + +/* RX Descriptor Ring Length; RW */ +#define E1000_RDLEN(_n) (0x0C008 + (0x40 * (_n))) +#define E1000_RDLEN_A(_n) (0x02808 + (0x100 * (_n))) + +/* Split and Replication Receive Control; RW */ +#define E1000_SRRCTL(_n) (0x0C00C + (0x40 * (_n))) +#define E1000_SRRCTL_A(_n) (0x0280C + (0x100 * (_n))) + +/* RX Descriptor Head; RW */ +#define E1000_RDH(_n) (0x0C010 + (0x40 * (_n))) +#define E1000_RDH_A(_n) (0x02810 + (0x100 * (_n))) + +/* RX DCA Control; RW */ +#define E1000_RXCTL(_n) (0x0C014 + (0x40 * (_n))) +#define E1000_RXCTL_A(_n) (0x02814 + (0x100 * (_n))) + +/* RX Descriptor Tail; RW */ +#define E1000_RDT(_n) (0x0C018 + (0x40 * (_n))) +#define E1000_RDT_A(_n) (0x02818 + (0x100 * (_n))) + +/* RX Descriptor Control; RW */ +#define E1000_RXDCTL(_n) (0x0C028 + (0x40 * (_n))) +#define E1000_RXDCTL_A(_n) (0x02828 + (0x100 * (_n))) + +/* RX Queue Drop Packet Count; RC */ +#define E1000_RQDPC_A(_n) (0x02830 + (0x100 * (_n))) + +/* Transmit Registers */ + +/* TX Descriptor Base Low; RW */ +#define E1000_TDBAL(_n) (0x0E000 + (0x40 * (_n))) +#define E1000_TDBAL_A(_n) (0x03800 + (0x100 * (_n))) + +/* TX Descriptor Base High; RW */ +#define E1000_TDBAH(_n) (0x0E004 + (0x40 * (_n))) +#define E1000_TDBAH_A(_n) (0x03804 + (0x100 * (_n))) + +/* TX Descriptor Ring Length; RW */ +#define E1000_TDLEN(_n) (0x0E008 + (0x40 * (_n))) +#define E1000_TDLEN_A(_n) (0x03808 + (0x100 * (_n))) + +/* TX Descriptor Head; RW */ +#define E1000_TDH(_n) (0x0E010 + (0x40 * (_n))) +#define E1000_TDH_A(_n) (0x03810 + (0x100 * (_n))) + +/* TX DCA Control; RW */ +#define E1000_TXCTL(_n) (0x0E014 + (0x40 * (_n))) +#define E1000_TXCTL_A(_n) 
(0x03814 + (0x100 * (_n))) + +/* TX Descriptor Tail; RW */ +#define E1000_TDT(_n) (0x0E018 + (0x40 * (_n))) +#define E1000_TDT_A(_n) (0x03818 + (0x100 * (_n))) + +/* TX Descriptor Control; RW */ +#define E1000_TXDCTL(_n) (0x0E028 + (0x40 * (_n))) +#define E1000_TXDCTL_A(_n) (0x03828 + (0x100 * (_n))) + +/* TX Descriptor Completion Write–Back Address Low; RW */ +#define E1000_TDWBAL(_n) (0x0E038 + (0x40 * (_n))) +#define E1000_TDWBAL_A(_n) (0x03838 + (0x100 * (_n))) + +/* TX Descriptor Completion Write–Back Address High; RW */ +#define E1000_TDWBAH(_n) (0x0E03C + (0x40 * (_n))) +#define E1000_TDWBAH_A(_n) (0x0383C + (0x100 * (_n))) + +#define E1000_MTA_A 0x0200 + +#define E1000_XDBAL_MASK (~(BIT(5) - 1)) /* TDBAL and RDBAL Registers Mask */ + +#define E1000_ICR_MACSEC 0x00000020 /* MACSec */ +#define E1000_ICR_RX0 0x00000040 /* Receiver Overrun */ +#define E1000_ICR_GPI_SDP0 0x00000800 /* General Purpose, SDP0 pin */ +#define E1000_ICR_GPI_SDP1 0x00001000 /* General Purpose, SDP1 pin */ +#define E1000_ICR_GPI_SDP2 0x00002000 /* General Purpose, SDP2 pin */ +#define E1000_ICR_GPI_SDP3 0x00004000 /* General Purpose, SDP3 pin */ +#define E1000_ICR_PTRAP 0x00008000 /* Probe Trap */ +#define E1000_ICR_MNG 0x00040000 /* Management Event */ +#define E1000_ICR_OMED 0x00100000 /* Other Media Energy Detected */ +#define E1000_ICR_FER 0x00400000 /* Fatal Error */ +#define E1000_ICR_NFER 0x00800000 /* Non Fatal Error */ +#define E1000_ICR_CSRTO 0x01000000 /* CSR access Time Out Indication */ +#define E1000_ICR_SCE 0x02000000 /* Storm Control Event */ +#define E1000_ICR_SW_WD 0x04000000 /* Software Watchdog */ + +/* Extended Interrupts */ + +#define E1000_EICR_MSIX_MASK 0x01FFFFFF /* Bits used in MSI-X mode */ +#define E1000_EICR_LEGACY_MASK 0x4000FFFF /* Bits used in non MSI-X mode */ + +/* Mirror VF Control (only RST bit); RW */ +#define E1000_PVTCTRL(_n) (0x10000 + (_n) * 0x100) + +/* Mirror Good Packets Received Count; RO */ +#define E1000_PVFGPRC(_n) (0x10010 + (_n) * 
0x100) + +/* Mirror Good Packets Transmitted Count; RO */ +#define E1000_PVFGPTC(_n) (0x10014 + (_n) * 0x100) + +/* Mirror Good Octets Received Count; RO */ +#define E1000_PVFGORC(_n) (0x10018 + (_n) * 0x100) + +/* Mirror Extended Interrupt Cause Set; WO */ +#define E1000_PVTEICS(_n) (0x10020 + (_n) * 0x100) + +/* Mirror Extended Interrupt Mask Set/Read; RW */ +#define E1000_PVTEIMS(_n) (0x10024 + (_n) * 0x100) + +/* Mirror Extended Interrupt Mask Clear; WO */ +#define E1000_PVTEIMC(_n) (0x10028 + (_n) * 0x100) + +/* Mirror Extended Interrupt Auto Clear; RW */ +#define E1000_PVTEIAC(_n) (0x1002C + (_n) * 0x100) + +/* Mirror Extended Interrupt Auto Mask Enable; RW */ +#define E1000_PVTEIAM(_n) (0x10030 + (_n) * 0x100) + +/* Mirror Good Octets Transmitted Count; RO */ +#define E1000_PVFGOTC(_n) (0x10034 + (_n) * 0x100) + +/* Mirror Multicast Packets Received Count; RO */ +#define E1000_PVFMPRC(_n) (0x1003C + (_n) * 0x100) + +/* Mirror Good RX Packets loopback Count; RO */ +#define E1000_PVFGPRLBC(_n) (0x10040 + (_n) * 0x100) + +/* Mirror Good TX packets loopback Count; RO */ +#define E1000_PVFGPTLBC(_n) (0x10044 + (_n) * 0x100) + +/* Mirror Good RX Octets loopback Count; RO */ +#define E1000_PVFGORLBC(_n) (0x10048 + (_n) * 0x100) + +/* Mirror Good TX Octets loopback Count; RO */ +#define E1000_PVFGOTLBC(_n) (0x10050 + (_n) * 0x100) + +/* Mirror Extended Interrupt Cause Read; RC/W1C */ +#define E1000_PVTEICR(_n) (0x10080 + (_n) * 0x100) + +/* + * These are fake addresses that, according to the specification, the device + * is not using. They are used to distinguish between the PF and the VFs + * accessing their VTIVAR register (which is the same address, 0x1700) + */ +#define E1000_VTIVAR 0x11700 +#define E1000_VTIVAR_MISC 0x11720 + +#define E1000_RSS_QUEUE(reta, hash) (E1000_RETA_VAL(reta, hash) & 0x0F) + +#define E1000_STATUS_IOV_MODE 0x00040000 + +#define E1000_STATUS_NUM_VFS_SHIFT 14 + +static inline uint8_t igb_ivar_entry_rx(uint8_t i) +{ + return i < 8 ?
i * 4 : (i - 8) * 4 + 2; +} + +static inline uint8_t igb_ivar_entry_tx(uint8_t i) +{ + return i < 8 ? i * 4 + 1 : (i - 8) * 4 + 3; +} + +#endif diff --git a/hw/net/igbvf.c b/hw/net/igbvf.c new file mode 100644 index 0000000000..70beb7af50 --- /dev/null +++ b/hw/net/igbvf.c @@ -0,0 +1,327 @@ +/* + * QEMU Intel 82576 SR/IOV Ethernet Controller Emulation + * + * Datasheet: + * https://www.intel.com/content/dam/www/public/us/en/documents/datasheets/82576eg-gbe-datasheet.pdf + * + * Copyright (c) 2020-2023 Red Hat, Inc. + * Copyright (c) 2015 Ravello Systems LTD (http://ravellosystems.com) + * Developed by Daynix Computing LTD (http://www.daynix.com) + * + * Authors: + * Akihiko Odaki <akihiko.odaki@daynix.com> + * Gal Hammmer <gal.hammer@sap.com> + * Marcel Apfelbaum <marcel.apfelbaum@gmail.com> + * Dmitry Fleytman <dmitry@daynix.com> + * Leonid Bloch <leonid@daynix.com> + * Yan Vugenfirer <yan@daynix.com> + * + * Based on work done by: + * Nir Peleg, Tutis Systems Ltd. for Qumranet Inc. + * Copyright (c) 2008 Qumranet + * Based on work done by: + * Copyright (c) 2007 Dan Aloni + * Copyright (c) 2004 Antony T Curtis + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "qemu/osdep.h" +#include "hw/hw.h" +#include "hw/net/mii.h" +#include "hw/pci/pci_device.h" +#include "hw/pci/pcie.h" +#include "hw/pci/msix.h" +#include "net/eth.h" +#include "net/net.h" +#include "igb_common.h" +#include "igb_core.h" +#include "trace.h" +#include "qapi/error.h" + +#define TYPE_IGBVF "igbvf" +OBJECT_DECLARE_SIMPLE_TYPE(IgbVfState, IGBVF) + +#define IGBVF_MMIO_BAR_IDX (0) +#define IGBVF_MSIX_BAR_IDX (3) + +#define IGBVF_MMIO_SIZE (16 * 1024) +#define IGBVF_MSIX_SIZE (16 * 1024) + +struct IgbVfState { + PCIDevice parent_obj; + + MemoryRegion mmio; + MemoryRegion msix; +}; + +static hwaddr vf_to_pf_addr(hwaddr addr, uint16_t vfn, bool write) +{ + switch (addr) { + case E1000_CTRL: + case E1000_CTRL_DUP: + return E1000_PVTCTRL(vfn); + case E1000_EICS: + return E1000_PVTEICS(vfn); + case E1000_EIMS: + return E1000_PVTEIMS(vfn); + case E1000_EIMC: + return E1000_PVTEIMC(vfn); + case E1000_EIAC: + return E1000_PVTEIAC(vfn); + case E1000_EIAM: + return E1000_PVTEIAM(vfn); + case E1000_EICR: + return E1000_PVTEICR(vfn); + case E1000_EITR(0): + case E1000_EITR(1): + case E1000_EITR(2): + return E1000_EITR(22) + (addr - E1000_EITR(0)) - vfn * 0xC; + case E1000_IVAR0: + return E1000_VTIVAR + vfn * 4; + case E1000_IVAR_MISC: + return E1000_VTIVAR_MISC + vfn * 4; + case 0x0F04: /* PBACL */ + return E1000_PBACLR; + case 0x0F0C: /* PSRTYPE */ + return E1000_PSRTYPE(vfn); + case E1000_V2PMAILBOX(0): + return E1000_V2PMAILBOX(vfn); + case E1000_VMBMEM(0) ... 
E1000_VMBMEM(0) + 0x3F: + return addr + vfn * 0x40; + case E1000_RDBAL_A(0): + return E1000_RDBAL(vfn); + case E1000_RDBAL_A(1): + return E1000_RDBAL(vfn + IGB_MAX_VF_FUNCTIONS); + case E1000_RDBAH_A(0): + return E1000_RDBAH(vfn); + case E1000_RDBAH_A(1): + return E1000_RDBAH(vfn + IGB_MAX_VF_FUNCTIONS); + case E1000_RDLEN_A(0): + return E1000_RDLEN(vfn); + case E1000_RDLEN_A(1): + return E1000_RDLEN(vfn + IGB_MAX_VF_FUNCTIONS); + case E1000_SRRCTL_A(0): + return E1000_SRRCTL(vfn); + case E1000_SRRCTL_A(1): + return E1000_SRRCTL(vfn + IGB_MAX_VF_FUNCTIONS); + case E1000_RDH_A(0): + return E1000_RDH(vfn); + case E1000_RDH_A(1): + return E1000_RDH(vfn + IGB_MAX_VF_FUNCTIONS); + case E1000_RXCTL_A(0): + return E1000_RXCTL(vfn); + case E1000_RXCTL_A(1): + return E1000_RXCTL(vfn + IGB_MAX_VF_FUNCTIONS); + case E1000_RDT_A(0): + return E1000_RDT(vfn); + case E1000_RDT_A(1): + return E1000_RDT(vfn + IGB_MAX_VF_FUNCTIONS); + case E1000_RXDCTL_A(0): + return E1000_RXDCTL(vfn); + case E1000_RXDCTL_A(1): + return E1000_RXDCTL(vfn + IGB_MAX_VF_FUNCTIONS); + case E1000_RQDPC_A(0): + return E1000_RQDPC(vfn); + case E1000_RQDPC_A(1): + return E1000_RQDPC(vfn + IGB_MAX_VF_FUNCTIONS); + case E1000_TDBAL_A(0): + return E1000_TDBAL(vfn); + case E1000_TDBAL_A(1): + return E1000_TDBAL(vfn + IGB_MAX_VF_FUNCTIONS); + case E1000_TDBAH_A(0): + return E1000_TDBAH(vfn); + case E1000_TDBAH_A(1): + return E1000_TDBAH(vfn + IGB_MAX_VF_FUNCTIONS); + case E1000_TDLEN_A(0): + return E1000_TDLEN(vfn); + case E1000_TDLEN_A(1): + return E1000_TDLEN(vfn + IGB_MAX_VF_FUNCTIONS); + case E1000_TDH_A(0): + return E1000_TDH(vfn); + case E1000_TDH_A(1): + return E1000_TDH(vfn + IGB_MAX_VF_FUNCTIONS); + case E1000_TXCTL_A(0): + return E1000_TXCTL(vfn); + case E1000_TXCTL_A(1): + return E1000_TXCTL(vfn + IGB_MAX_VF_FUNCTIONS); + case E1000_TDT_A(0): + return E1000_TDT(vfn); + case E1000_TDT_A(1): + return E1000_TDT(vfn + IGB_MAX_VF_FUNCTIONS); + case E1000_TXDCTL_A(0): + return E1000_TXDCTL(vfn); + case 
E1000_TXDCTL_A(1): + return E1000_TXDCTL(vfn + IGB_MAX_VF_FUNCTIONS); + case E1000_TDWBAL_A(0): + return E1000_TDWBAL(vfn); + case E1000_TDWBAL_A(1): + return E1000_TDWBAL(vfn + IGB_MAX_VF_FUNCTIONS); + case E1000_TDWBAH_A(0): + return E1000_TDWBAH(vfn); + case E1000_TDWBAH_A(1): + return E1000_TDWBAH(vfn + IGB_MAX_VF_FUNCTIONS); + case E1000_VFGPRC: + return E1000_PVFGPRC(vfn); + case E1000_VFGPTC: + return E1000_PVFGPTC(vfn); + case E1000_VFGORC: + return E1000_PVFGORC(vfn); + case E1000_VFGOTC: + return E1000_PVFGOTC(vfn); + case E1000_VFMPRC: + return E1000_PVFMPRC(vfn); + case E1000_VFGPRLBC: + return E1000_PVFGPRLBC(vfn); + case E1000_VFGPTLBC: + return E1000_PVFGPTLBC(vfn); + case E1000_VFGORLBC: + return E1000_PVFGORLBC(vfn); + case E1000_VFGOTLBC: + return E1000_PVFGOTLBC(vfn); + case E1000_STATUS: + case E1000_FRTIMER: + if (write) { + return HWADDR_MAX; + } + /* fallthrough */ + case 0x34E8: /* PBTWAC */ + case 0x24E8: /* PBRWAC */ + return addr; + } + + trace_igbvf_wrn_io_addr_unknown(addr); + + return HWADDR_MAX; +} + +static void igbvf_write_config(PCIDevice *dev, uint32_t addr, uint32_t val, + int len) +{ + trace_igbvf_write_config(addr, val, len); + pci_default_write_config(dev, addr, val, len); +} + +static uint64_t igbvf_mmio_read(void *opaque, hwaddr addr, unsigned size) +{ + PCIDevice *vf = PCI_DEVICE(opaque); + PCIDevice *pf = pcie_sriov_get_pf(vf); + + addr = vf_to_pf_addr(addr, pcie_sriov_vf_number(vf), false); + return addr == HWADDR_MAX ? 
0 : igb_mmio_read(pf, addr, size); +} + +static void igbvf_mmio_write(void *opaque, hwaddr addr, uint64_t val, + unsigned size) +{ + PCIDevice *vf = PCI_DEVICE(opaque); + PCIDevice *pf = pcie_sriov_get_pf(vf); + + addr = vf_to_pf_addr(addr, pcie_sriov_vf_number(vf), true); + if (addr != HWADDR_MAX) { + igb_mmio_write(pf, addr, val, size); + } +} + +static const MemoryRegionOps mmio_ops = { + .read = igbvf_mmio_read, + .write = igbvf_mmio_write, + .endianness = DEVICE_LITTLE_ENDIAN, + .impl = { + .min_access_size = 4, + .max_access_size = 4, + }, +}; + +static void igbvf_pci_realize(PCIDevice *dev, Error **errp) +{ + IgbVfState *s = IGBVF(dev); + int ret; + int i; + + dev->config_write = igbvf_write_config; + + memory_region_init_io(&s->mmio, OBJECT(dev), &mmio_ops, s, "igbvf-mmio", + IGBVF_MMIO_SIZE); + pcie_sriov_vf_register_bar(dev, IGBVF_MMIO_BAR_IDX, &s->mmio); + + memory_region_init(&s->msix, OBJECT(dev), "igbvf-msix", IGBVF_MSIX_SIZE); + pcie_sriov_vf_register_bar(dev, IGBVF_MSIX_BAR_IDX, &s->msix); + + ret = msix_init(dev, IGBVF_MSIX_VEC_NUM, &s->msix, IGBVF_MSIX_BAR_IDX, 0, + &s->msix, IGBVF_MSIX_BAR_IDX, 0x2000, 0x70, errp); + if (ret) { + return; + } + + for (i = 0; i < IGBVF_MSIX_VEC_NUM; i++) { + msix_vector_use(dev, i); + } + + if (pcie_endpoint_cap_init(dev, 0xa0) < 0) { + hw_error("Failed to initialize PCIe capability"); + } + + if (pcie_aer_init(dev, 1, 0x100, 0x40, errp) < 0) { + hw_error("Failed to initialize AER capability"); + } + + pcie_ari_init(dev, 0x150, 1); +} + +static void igbvf_pci_uninit(PCIDevice *dev) +{ + IgbVfState *s = IGBVF(dev); + + pcie_aer_exit(dev); + pcie_cap_exit(dev); + msix_unuse_all_vectors(dev); + msix_uninit(dev, &s->msix, &s->msix); +} + +static void igbvf_class_init(ObjectClass *class, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(class); + PCIDeviceClass *c = PCI_DEVICE_CLASS(class); + + c->realize = igbvf_pci_realize; + c->exit = igbvf_pci_uninit; + c->vendor_id = PCI_VENDOR_ID_INTEL; + c->device_id = 
E1000_DEV_ID_82576_VF; + c->revision = 1; + c->class_id = PCI_CLASS_NETWORK_ETHERNET; + + dc->desc = "Intel 82576 Virtual Function"; + dc->user_creatable = false; + + set_bit(DEVICE_CATEGORY_NETWORK, dc->categories); +} + +static const TypeInfo igbvf_info = { + .name = TYPE_IGBVF, + .parent = TYPE_PCI_DEVICE, + .instance_size = sizeof(IgbVfState), + .class_init = igbvf_class_init, + .interfaces = (InterfaceInfo[]) { + { INTERFACE_PCIE_DEVICE }, + { } + }, +}; + +static void igb_register_types(void) +{ + type_register_static(&igbvf_info); +} + +type_init(igb_register_types) diff --git a/hw/net/meson.build b/hw/net/meson.build index 4285145715..e2be0654a1 100644 --- a/hw/net/meson.build +++ b/hw/net/meson.build @@ -10,6 +10,8 @@ softmmu_ss.add(when: 'CONFIG_PCNET_COMMON', if_true: files('pcnet.c')) softmmu_ss.add(when: 'CONFIG_E1000_PCI', if_true: files('e1000.c', 'e1000x_common.c')) softmmu_ss.add(when: 'CONFIG_E1000E_PCI_EXPRESS', if_true: files('net_tx_pkt.c', 'net_rx_pkt.c')) softmmu_ss.add(when: 'CONFIG_E1000E_PCI_EXPRESS', if_true: files('e1000e.c', 'e1000e_core.c', 'e1000x_common.c')) +softmmu_ss.add(when: 'CONFIG_IGB_PCI_EXPRESS', if_true: files('net_tx_pkt.c', 'net_rx_pkt.c')) +softmmu_ss.add(when: 'CONFIG_IGB_PCI_EXPRESS', if_true: files('igb.c', 'igbvf.c', 'igb_core.c')) softmmu_ss.add(when: 'CONFIG_RTL8139_PCI', if_true: files('rtl8139.c')) softmmu_ss.add(when: 'CONFIG_TULIP', if_true: files('tulip.c')) softmmu_ss.add(when: 'CONFIG_VMXNET3_PCI', if_true: files('net_tx_pkt.c', 'net_rx_pkt.c')) diff --git a/hw/net/net_rx_pkt.c b/hw/net/net_rx_pkt.c index 1e1c504e42..39cdea06de 100644 --- a/hw/net/net_rx_pkt.c +++ b/hw/net/net_rx_pkt.c @@ -30,14 +30,11 @@ struct NetRxPkt { uint32_t tot_len; uint16_t tci; size_t ehdr_buf_len; - bool has_virt_hdr; eth_pkt_types_e packet_type; /* Analysis results */ - bool isip4; - bool isip6; - bool isudp; - bool istcp; + bool hasip4; + bool hasip6; size_t l3hdr_off; size_t l4hdr_off; @@ -48,10 +45,9 @@ struct NetRxPkt { 
eth_l4_hdr_info l4hdr_info; }; -void net_rx_pkt_init(struct NetRxPkt **pkt, bool has_virt_hdr) +void net_rx_pkt_init(struct NetRxPkt **pkt) { struct NetRxPkt *p = g_malloc0(sizeof *p); - p->has_virt_hdr = has_virt_hdr; p->vec = NULL; p->vec_len_total = 0; *pkt = p; @@ -107,12 +103,11 @@ net_rx_pkt_pull_data(struct NetRxPkt *pkt, iov, iovcnt, ploff, pkt->tot_len); } - eth_get_protocols(pkt->vec, pkt->vec_len, &pkt->isip4, &pkt->isip6, - &pkt->isudp, &pkt->istcp, + eth_get_protocols(pkt->vec, pkt->vec_len, &pkt->hasip4, &pkt->hasip6, &pkt->l3hdr_off, &pkt->l4hdr_off, &pkt->l5hdr_off, &pkt->ip6hdr_info, &pkt->ip4hdr_info, &pkt->l4hdr_info); - trace_net_rx_pkt_parsed(pkt->isip4, pkt->isip6, pkt->isudp, pkt->istcp, + trace_net_rx_pkt_parsed(pkt->hasip4, pkt->hasip6, pkt->l4hdr_info.proto, pkt->l3hdr_off, pkt->l4hdr_off, pkt->l5hdr_off); } @@ -201,22 +196,20 @@ void net_rx_pkt_set_protocols(struct NetRxPkt *pkt, const void *data, assert(pkt); - eth_get_protocols(&iov, 1, &pkt->isip4, &pkt->isip6, - &pkt->isudp, &pkt->istcp, + eth_get_protocols(&iov, 1, &pkt->hasip4, &pkt->hasip6, &pkt->l3hdr_off, &pkt->l4hdr_off, &pkt->l5hdr_off, &pkt->ip6hdr_info, &pkt->ip4hdr_info, &pkt->l4hdr_info); } void net_rx_pkt_get_protocols(struct NetRxPkt *pkt, - bool *isip4, bool *isip6, - bool *isudp, bool *istcp) + bool *hasip4, bool *hasip6, + EthL4HdrProto *l4hdr_proto) { assert(pkt); - *isip4 = pkt->isip4; - *isip6 = pkt->isip6; - *isudp = pkt->isudp; - *istcp = pkt->istcp; + *hasip4 = pkt->hasip4; + *hasip6 = pkt->hasip6; + *l4hdr_proto = pkt->l4hdr_info.proto; } size_t net_rx_pkt_get_l3_hdr_offset(struct NetRxPkt *pkt) @@ -333,58 +326,58 @@ net_rx_pkt_calc_rss_hash(struct NetRxPkt *pkt, switch (type) { case NetPktRssIpV4: - assert(pkt->isip4); + assert(pkt->hasip4); trace_net_rx_pkt_rss_ip4(); _net_rx_rss_prepare_ip4(&rss_input[0], pkt, &rss_length); break; case NetPktRssIpV4Tcp: - assert(pkt->isip4); - assert(pkt->istcp); + assert(pkt->hasip4); + assert(pkt->l4hdr_info.proto == 
ETH_L4_HDR_PROTO_TCP); trace_net_rx_pkt_rss_ip4_tcp(); _net_rx_rss_prepare_ip4(&rss_input[0], pkt, &rss_length); _net_rx_rss_prepare_tcp(&rss_input[0], pkt, &rss_length); break; case NetPktRssIpV6Tcp: - assert(pkt->isip6); - assert(pkt->istcp); + assert(pkt->hasip6); + assert(pkt->l4hdr_info.proto == ETH_L4_HDR_PROTO_TCP); trace_net_rx_pkt_rss_ip6_tcp(); _net_rx_rss_prepare_ip6(&rss_input[0], pkt, false, &rss_length); _net_rx_rss_prepare_tcp(&rss_input[0], pkt, &rss_length); break; case NetPktRssIpV6: - assert(pkt->isip6); + assert(pkt->hasip6); trace_net_rx_pkt_rss_ip6(); _net_rx_rss_prepare_ip6(&rss_input[0], pkt, false, &rss_length); break; case NetPktRssIpV6Ex: - assert(pkt->isip6); + assert(pkt->hasip6); trace_net_rx_pkt_rss_ip6_ex(); _net_rx_rss_prepare_ip6(&rss_input[0], pkt, true, &rss_length); break; case NetPktRssIpV6TcpEx: - assert(pkt->isip6); - assert(pkt->istcp); + assert(pkt->hasip6); + assert(pkt->l4hdr_info.proto == ETH_L4_HDR_PROTO_TCP); trace_net_rx_pkt_rss_ip6_ex_tcp(); _net_rx_rss_prepare_ip6(&rss_input[0], pkt, true, &rss_length); _net_rx_rss_prepare_tcp(&rss_input[0], pkt, &rss_length); break; case NetPktRssIpV4Udp: - assert(pkt->isip4); - assert(pkt->isudp); + assert(pkt->hasip4); + assert(pkt->l4hdr_info.proto == ETH_L4_HDR_PROTO_UDP); trace_net_rx_pkt_rss_ip4_udp(); _net_rx_rss_prepare_ip4(&rss_input[0], pkt, &rss_length); _net_rx_rss_prepare_udp(&rss_input[0], pkt, &rss_length); break; case NetPktRssIpV6Udp: - assert(pkt->isip6); - assert(pkt->isudp); + assert(pkt->hasip6); + assert(pkt->l4hdr_info.proto == ETH_L4_HDR_PROTO_UDP); trace_net_rx_pkt_rss_ip6_udp(); _net_rx_rss_prepare_ip6(&rss_input[0], pkt, false, &rss_length); _net_rx_rss_prepare_udp(&rss_input[0], pkt, &rss_length); break; case NetPktRssIpV6UdpEx: - assert(pkt->isip6); - assert(pkt->isudp); + assert(pkt->hasip6); + assert(pkt->l4hdr_info.proto == ETH_L4_HDR_PROTO_UDP); trace_net_rx_pkt_rss_ip6_ex_udp(); _net_rx_rss_prepare_ip6(&rss_input[0], pkt, true, &rss_length); 
_net_rx_rss_prepare_udp(&rss_input[0], pkt, &rss_length); @@ -406,7 +399,7 @@ uint16_t net_rx_pkt_get_ip_id(struct NetRxPkt *pkt) { assert(pkt); - if (pkt->isip4) { + if (pkt->hasip4) { return be16_to_cpu(pkt->ip4hdr_info.ip4_hdr.ip_id); } @@ -417,7 +410,7 @@ bool net_rx_pkt_is_tcp_ack(struct NetRxPkt *pkt) { assert(pkt); - if (pkt->istcp) { + if (pkt->l4hdr_info.proto == ETH_L4_HDR_PROTO_TCP) { return TCP_HEADER_FLAGS(&pkt->l4hdr_info.hdr.tcp) & TCP_FLAG_ACK; } @@ -428,7 +421,7 @@ bool net_rx_pkt_has_tcp_data(struct NetRxPkt *pkt) { assert(pkt); - if (pkt->istcp) { + if (pkt->l4hdr_info.proto == ETH_L4_HDR_PROTO_TCP) { return pkt->l4hdr_info.has_tcp_data; } @@ -465,18 +458,18 @@ void net_rx_pkt_set_vhdr_iovec(struct NetRxPkt *pkt, iov_to_buf(iov, iovcnt, 0, &pkt->virt_hdr, sizeof pkt->virt_hdr); } -bool net_rx_pkt_is_vlan_stripped(struct NetRxPkt *pkt) +void net_rx_pkt_unset_vhdr(struct NetRxPkt *pkt) { assert(pkt); - return pkt->ehdr_buf_len ? true : false; + memset(&pkt->virt_hdr, 0, sizeof(pkt->virt_hdr)); } -bool net_rx_pkt_has_virt_hdr(struct NetRxPkt *pkt) +bool net_rx_pkt_is_vlan_stripped(struct NetRxPkt *pkt) { assert(pkt); - return pkt->has_virt_hdr; + return pkt->ehdr_buf_len ? 
true : false; } uint16_t net_rx_pkt_get_vlan_tag(struct NetRxPkt *pkt) @@ -494,7 +487,7 @@ bool net_rx_pkt_validate_l3_csum(struct NetRxPkt *pkt, bool *csum_valid) trace_net_rx_pkt_l3_csum_validate_entry(); - if (!pkt->isip4) { + if (!pkt->hasip4) { trace_net_rx_pkt_l3_csum_validate_not_ip4(); return false; } @@ -525,8 +518,8 @@ _net_rx_pkt_calc_l4_csum(struct NetRxPkt *pkt) trace_net_rx_pkt_l4_csum_calc_entry(); - if (pkt->isip4) { - if (pkt->isudp) { + if (pkt->hasip4) { + if (pkt->l4hdr_info.proto == ETH_L4_HDR_PROTO_UDP) { csl = be16_to_cpu(pkt->l4hdr_info.hdr.udp.uh_ulen); trace_net_rx_pkt_l4_csum_calc_ip4_udp(); } else { @@ -539,7 +532,7 @@ _net_rx_pkt_calc_l4_csum(struct NetRxPkt *pkt) csl, &cso); trace_net_rx_pkt_l4_csum_calc_ph_csum(cntr, csl); } else { - if (pkt->isudp) { + if (pkt->l4hdr_info.proto == ETH_L4_HDR_PROTO_UDP) { csl = be16_to_cpu(pkt->l4hdr_info.hdr.udp.uh_ulen); trace_net_rx_pkt_l4_csum_calc_ip6_udp(); } else { @@ -573,17 +566,19 @@ bool net_rx_pkt_validate_l4_csum(struct NetRxPkt *pkt, bool *csum_valid) trace_net_rx_pkt_l4_csum_validate_entry(); - if (!pkt->istcp && !pkt->isudp) { + if (pkt->l4hdr_info.proto != ETH_L4_HDR_PROTO_TCP && + pkt->l4hdr_info.proto != ETH_L4_HDR_PROTO_UDP) { trace_net_rx_pkt_l4_csum_validate_not_xxp(); return false; } - if (pkt->isudp && (pkt->l4hdr_info.hdr.udp.uh_sum == 0)) { + if (pkt->l4hdr_info.proto == ETH_L4_HDR_PROTO_UDP && + pkt->l4hdr_info.hdr.udp.uh_sum == 0) { trace_net_rx_pkt_l4_csum_validate_udp_with_no_checksum(); return false; } - if (pkt->isip4 && pkt->ip4hdr_info.fragment) { + if (pkt->hasip4 && pkt->ip4hdr_info.fragment) { trace_net_rx_pkt_l4_csum_validate_ip4_fragment(); return false; } @@ -604,22 +599,27 @@ bool net_rx_pkt_fix_l4_csum(struct NetRxPkt *pkt) trace_net_rx_pkt_l4_csum_fix_entry(); - if (pkt->istcp) { + switch (pkt->l4hdr_info.proto) { + case ETH_L4_HDR_PROTO_TCP: l4_cso = offsetof(struct tcp_header, th_sum); trace_net_rx_pkt_l4_csum_fix_tcp(l4_cso); - } else if (pkt->isudp) { + 
break; + + case ETH_L4_HDR_PROTO_UDP: if (pkt->l4hdr_info.hdr.udp.uh_sum == 0) { trace_net_rx_pkt_l4_csum_fix_udp_with_no_checksum(); return false; } l4_cso = offsetof(struct udp_header, uh_sum); trace_net_rx_pkt_l4_csum_fix_udp(l4_cso); - } else { + break; + + default: trace_net_rx_pkt_l4_csum_fix_not_xxp(); return false; } - if (pkt->isip4 && pkt->ip4hdr_info.fragment) { + if (pkt->hasip4 && pkt->ip4hdr_info.fragment) { trace_net_rx_pkt_l4_csum_fix_ip4_fragment(); return false; } diff --git a/hw/net/net_rx_pkt.h b/hw/net/net_rx_pkt.h index 048e3461f0..d00b484900 100644 --- a/hw/net/net_rx_pkt.h +++ b/hw/net/net_rx_pkt.h @@ -37,10 +37,9 @@ void net_rx_pkt_uninit(struct NetRxPkt *pkt); * Init function for rx packet functionality * * @pkt: packet pointer - * @has_virt_hdr: device uses virtio header * */ -void net_rx_pkt_init(struct NetRxPkt **pkt, bool has_virt_hdr); +void net_rx_pkt_init(struct NetRxPkt **pkt); /** * returns total length of data attached to rx context @@ -67,15 +66,14 @@ void net_rx_pkt_set_protocols(struct NetRxPkt *pkt, const void *data, * fetches packet analysis results * * @pkt: packet - * @isip4: whether the packet given is IPv4 - * @isip6: whether the packet given is IPv6 - * @isudp: whether the packet given is UDP - * @istcp: whether the packet given is TCP + * @hasip4: whether the packet has an IPv4 header + * @hasip6: whether the packet has an IPv6 header + * @l4hdr_proto: protocol of L4 header * */ void net_rx_pkt_get_protocols(struct NetRxPkt *pkt, - bool *isip4, bool *isip6, - bool *isudp, bool *istcp); + bool *hasip4, bool *hasip6, + EthL4HdrProto *l4hdr_proto); /** * fetches L3 header offset @@ -215,15 +213,6 @@ uint16_t net_rx_pkt_get_vlan_tag(struct NetRxPkt *pkt); bool net_rx_pkt_is_vlan_stripped(struct NetRxPkt *pkt); /** - * notifies caller if the packet has virtio header - * - * @pkt: packet - * @ret: true if packet has virtio header, false otherwize - * - */ -bool net_rx_pkt_has_virt_hdr(struct NetRxPkt *pkt); - -/** * attach 
scatter-gather data to rx packet * * @pkt: packet @@ -323,6 +312,14 @@ void net_rx_pkt_set_vhdr_iovec(struct NetRxPkt *pkt, const struct iovec *iov, int iovcnt); /** + * unset vhdr data from packet context + * + * @pkt: packet + * + */ +void net_rx_pkt_unset_vhdr(struct NetRxPkt *pkt); + +/** * save packet type in packet context * * @pkt: packet diff --git a/hw/net/net_tx_pkt.c b/hw/net/net_tx_pkt.c index 2533ea2700..986a3adfe9 100644 --- a/hw/net/net_tx_pkt.c +++ b/hw/net/net_tx_pkt.c @@ -35,7 +35,6 @@ struct NetTxPkt { PCIDevice *pci_dev; struct virtio_net_hdr virt_hdr; - bool has_virt_hdr; struct iovec *raw; uint32_t raw_frags; @@ -54,12 +53,10 @@ struct NetTxPkt { uint16_t hdr_len; eth_pkt_types_e packet_type; uint8_t l4proto; - - bool is_loopback; }; void net_tx_pkt_init(struct NetTxPkt **pkt, PCIDevice *pci_dev, - uint32_t max_frags, bool has_virt_hdr) + uint32_t max_frags) { struct NetTxPkt *p = g_malloc0(sizeof *p); @@ -71,10 +68,8 @@ void net_tx_pkt_init(struct NetTxPkt **pkt, PCIDevice *pci_dev, p->max_payload_frags = max_frags; p->max_raw_frags = max_frags; - p->has_virt_hdr = has_virt_hdr; p->vec[NET_TX_PKT_VHDR_FRAG].iov_base = &p->virt_hdr; - p->vec[NET_TX_PKT_VHDR_FRAG].iov_len = - p->has_virt_hdr ? sizeof p->virt_hdr : 0; + p->vec[NET_TX_PKT_VHDR_FRAG].iov_len = sizeof p->virt_hdr; p->vec[NET_TX_PKT_L2HDR_FRAG].iov_base = &p->l2_hdr; p->vec[NET_TX_PKT_L3HDR_FRAG].iov_base = &p->l3_hdr; @@ -304,10 +299,11 @@ func_exit: return rc; } -void net_tx_pkt_build_vheader(struct NetTxPkt *pkt, bool tso_enable, +bool net_tx_pkt_build_vheader(struct NetTxPkt *pkt, bool tso_enable, bool csum_enable, uint32_t gso_size) { struct tcp_hdr l4hdr; + size_t bytes_read; assert(pkt); /* csum has to be enabled if tso is. 
*/ @@ -328,8 +324,13 @@ void net_tx_pkt_build_vheader(struct NetTxPkt *pkt, bool tso_enable, case VIRTIO_NET_HDR_GSO_TCPV4: case VIRTIO_NET_HDR_GSO_TCPV6: - iov_to_buf(&pkt->vec[NET_TX_PKT_PL_START_FRAG], pkt->payload_frags, - 0, &l4hdr, sizeof(l4hdr)); + bytes_read = iov_to_buf(&pkt->vec[NET_TX_PKT_PL_START_FRAG], + pkt->payload_frags, 0, &l4hdr, sizeof(l4hdr)); + if (bytes_read < sizeof(l4hdr) || + l4hdr.th_off * sizeof(uint32_t) < sizeof(l4hdr)) { + return false; + } + pkt->virt_hdr.hdr_len = pkt->hdr_len + l4hdr.th_off * sizeof(uint32_t); pkt->virt_hdr.gso_size = gso_size; break; @@ -341,11 +342,17 @@ void net_tx_pkt_build_vheader(struct NetTxPkt *pkt, bool tso_enable, if (csum_enable) { switch (pkt->l4proto) { case IP_PROTO_TCP: + if (pkt->payload_len < sizeof(struct tcp_hdr)) { + return false; + } pkt->virt_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; pkt->virt_hdr.csum_start = pkt->hdr_len; pkt->virt_hdr.csum_offset = offsetof(struct tcp_hdr, th_sum); break; case IP_PROTO_UDP: + if (pkt->payload_len < sizeof(struct udp_hdr)) { + return false; + } pkt->virt_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; pkt->virt_hdr.csum_start = pkt->hdr_len; pkt->virt_hdr.csum_offset = offsetof(struct udp_hdr, uh_sum); @@ -354,6 +361,8 @@ void net_tx_pkt_build_vheader(struct NetTxPkt *pkt, bool tso_enable, break; } } + + return true; } void net_tx_pkt_setup_vlan_header_ex(struct NetTxPkt *pkt, @@ -464,15 +473,14 @@ void net_tx_pkt_reset(struct NetTxPkt *pkt) pkt->l4proto = 0; } -static void net_tx_pkt_do_sw_csum(struct NetTxPkt *pkt) +static void net_tx_pkt_do_sw_csum(struct NetTxPkt *pkt, + struct iovec *iov, uint32_t iov_len, + uint16_t csl) { - struct iovec *iov = &pkt->vec[NET_TX_PKT_L2HDR_FRAG]; uint32_t csum_cntr; uint16_t csum = 0; uint32_t cso; /* num of iovec without vhdr */ - uint32_t iov_len = pkt->payload_frags + NET_TX_PKT_PL_START_FRAG - 1; - uint16_t csl; size_t csum_offset = pkt->virt_hdr.csum_start + pkt->virt_hdr.csum_offset; uint16_t l3_proto = 
eth_get_l3_proto(iov, 1, iov->iov_len); @@ -480,8 +488,6 @@ static void net_tx_pkt_do_sw_csum(struct NetTxPkt *pkt) iov_from_buf(iov, iov_len, csum_offset, &csum, sizeof csum); /* Calculate L4 TCP/UDP checksum */ - csl = pkt->payload_len; - csum_cntr = 0; cso = 0; /* add pseudo header to csum */ @@ -504,23 +510,16 @@ static void net_tx_pkt_do_sw_csum(struct NetTxPkt *pkt) iov_from_buf(iov, iov_len, csum_offset, &csum, sizeof csum); } -enum { - NET_TX_PKT_FRAGMENT_L2_HDR_POS = 0, - NET_TX_PKT_FRAGMENT_L3_HDR_POS, - NET_TX_PKT_FRAGMENT_HEADER_NUM -}; - #define NET_MAX_FRAG_SG_LIST (64) static size_t net_tx_pkt_fetch_fragment(struct NetTxPkt *pkt, - int *src_idx, size_t *src_offset, struct iovec *dst, int *dst_idx) + int *src_idx, size_t *src_offset, size_t src_len, + struct iovec *dst, int *dst_idx) { size_t fetched = 0; struct iovec *src = pkt->vec; - *dst_idx = NET_TX_PKT_FRAGMENT_HEADER_NUM; - - while (fetched < IP_FRAG_ALIGN_SIZE(pkt->virt_hdr.gso_size)) { + while (fetched < src_len) { /* no more place in fragment iov */ if (*dst_idx == NET_MAX_FRAG_SG_LIST) { @@ -535,7 +534,7 @@ static size_t net_tx_pkt_fetch_fragment(struct NetTxPkt *pkt, dst[*dst_idx].iov_base = src[*src_idx].iov_base + *src_offset; dst[*dst_idx].iov_len = MIN(src[*src_idx].iov_len - *src_offset, - IP_FRAG_ALIGN_SIZE(pkt->virt_hdr.gso_size) - fetched); + src_len - fetched); *src_offset += dst[*dst_idx].iov_len; fetched += dst[*dst_idx].iov_len; @@ -551,71 +550,250 @@ static size_t net_tx_pkt_fetch_fragment(struct NetTxPkt *pkt, return fetched; } -static inline void net_tx_pkt_sendv(struct NetTxPkt *pkt, - NetClientState *nc, const struct iovec *iov, int iov_cnt) +static void net_tx_pkt_sendv( + void *opaque, const struct iovec *iov, int iov_cnt, + const struct iovec *virt_iov, int virt_iov_cnt) { - if (pkt->is_loopback) { - qemu_receive_packet_iov(nc, iov, iov_cnt); + NetClientState *nc = opaque; + + if (qemu_get_using_vnet_hdr(nc->peer)) { + qemu_sendv_packet(nc, virt_iov, virt_iov_cnt); } 
else { qemu_sendv_packet(nc, iov, iov_cnt); } } +static bool net_tx_pkt_tcp_fragment_init(struct NetTxPkt *pkt, + struct iovec *fragment, + int *pl_idx, + size_t *l4hdr_len, + int *src_idx, + size_t *src_offset, + size_t *src_len) +{ + struct iovec *l4 = fragment + NET_TX_PKT_PL_START_FRAG; + size_t bytes_read = 0; + struct tcp_hdr *th; + + if (!pkt->payload_frags) { + return false; + } + + l4->iov_len = pkt->virt_hdr.hdr_len - pkt->hdr_len; + l4->iov_base = g_malloc(l4->iov_len); + + *src_idx = NET_TX_PKT_PL_START_FRAG; + while (pkt->vec[*src_idx].iov_len < l4->iov_len - bytes_read) { + memcpy((char *)l4->iov_base + bytes_read, pkt->vec[*src_idx].iov_base, + pkt->vec[*src_idx].iov_len); + + bytes_read += pkt->vec[*src_idx].iov_len; + + (*src_idx)++; + if (*src_idx >= pkt->payload_frags + NET_TX_PKT_PL_START_FRAG) { + g_free(l4->iov_base); + return false; + } + } + + *src_offset = l4->iov_len - bytes_read; + memcpy((char *)l4->iov_base + bytes_read, pkt->vec[*src_idx].iov_base, + *src_offset); + + th = l4->iov_base; + th->th_flags &= ~(TH_FIN | TH_PUSH); + + *pl_idx = NET_TX_PKT_PL_START_FRAG + 1; + *l4hdr_len = l4->iov_len; + *src_len = pkt->virt_hdr.gso_size; + + return true; +} + +static void net_tx_pkt_tcp_fragment_deinit(struct iovec *fragment) +{ + g_free(fragment[NET_TX_PKT_PL_START_FRAG].iov_base); +} + +static void net_tx_pkt_tcp_fragment_fix(struct NetTxPkt *pkt, + struct iovec *fragment, + size_t fragment_len, + uint8_t gso_type) +{ + struct iovec *l3hdr = fragment + NET_TX_PKT_L3HDR_FRAG; + struct iovec *l4hdr = fragment + NET_TX_PKT_PL_START_FRAG; + struct ip_header *ip = l3hdr->iov_base; + struct ip6_header *ip6 = l3hdr->iov_base; + size_t len = l3hdr->iov_len + l4hdr->iov_len + fragment_len; + + switch (gso_type) { + case VIRTIO_NET_HDR_GSO_TCPV4: + ip->ip_len = cpu_to_be16(len); + eth_fix_ip4_checksum(l3hdr->iov_base, l3hdr->iov_len); + break; + + case VIRTIO_NET_HDR_GSO_TCPV6: + len -= sizeof(struct ip6_header); + 
ip6->ip6_ctlun.ip6_un1.ip6_un1_plen = cpu_to_be16(len); + break; + } +} + +static void net_tx_pkt_tcp_fragment_advance(struct NetTxPkt *pkt, + struct iovec *fragment, + size_t fragment_len, + uint8_t gso_type) +{ + struct iovec *l3hdr = fragment + NET_TX_PKT_L3HDR_FRAG; + struct iovec *l4hdr = fragment + NET_TX_PKT_PL_START_FRAG; + struct ip_header *ip = l3hdr->iov_base; + struct tcp_hdr *th = l4hdr->iov_base; + + if (gso_type == VIRTIO_NET_HDR_GSO_TCPV4) { + ip->ip_id = cpu_to_be16(be16_to_cpu(ip->ip_id) + 1); + } + + th->th_seq = cpu_to_be32(be32_to_cpu(th->th_seq) + fragment_len); + th->th_flags &= ~TH_CWR; +} + +static void net_tx_pkt_udp_fragment_init(struct NetTxPkt *pkt, + int *pl_idx, + size_t *l4hdr_len, + int *src_idx, size_t *src_offset, + size_t *src_len) +{ + *pl_idx = NET_TX_PKT_PL_START_FRAG; + *l4hdr_len = 0; + *src_idx = NET_TX_PKT_PL_START_FRAG; + *src_offset = 0; + *src_len = IP_FRAG_ALIGN_SIZE(pkt->virt_hdr.gso_size); +} + +static void net_tx_pkt_udp_fragment_fix(struct NetTxPkt *pkt, + struct iovec *fragment, + size_t fragment_offset, + size_t fragment_len) +{ + bool more_frags = fragment_offset + fragment_len < pkt->payload_len; + uint16_t orig_flags; + struct iovec *l3hdr = fragment + NET_TX_PKT_L3HDR_FRAG; + struct ip_header *ip = l3hdr->iov_base; + uint16_t frag_off_units = fragment_offset / IP_FRAG_UNIT_SIZE; + uint16_t new_ip_off; + + assert(fragment_offset % IP_FRAG_UNIT_SIZE == 0); + assert((frag_off_units & ~IP_OFFMASK) == 0); + + orig_flags = be16_to_cpu(ip->ip_off) & ~(IP_OFFMASK | IP_MF); + new_ip_off = frag_off_units | orig_flags | (more_frags ? 
IP_MF : 0); + ip->ip_off = cpu_to_be16(new_ip_off); + ip->ip_len = cpu_to_be16(l3hdr->iov_len + fragment_len); + + eth_fix_ip4_checksum(l3hdr->iov_base, l3hdr->iov_len); +} + static bool net_tx_pkt_do_sw_fragmentation(struct NetTxPkt *pkt, - NetClientState *nc) + NetTxPktCallback callback, + void *context) { + uint8_t gso_type = pkt->virt_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN; + struct iovec fragment[NET_MAX_FRAG_SG_LIST]; - size_t fragment_len = 0; - bool more_frags = false; - - /* some pointers for shorter code */ - void *l2_iov_base, *l3_iov_base; - size_t l2_iov_len, l3_iov_len; - int src_idx = NET_TX_PKT_PL_START_FRAG, dst_idx; - size_t src_offset = 0; - size_t fragment_offset = 0; + size_t fragment_len; + size_t l4hdr_len; + size_t src_len; - l2_iov_base = pkt->vec[NET_TX_PKT_L2HDR_FRAG].iov_base; - l2_iov_len = pkt->vec[NET_TX_PKT_L2HDR_FRAG].iov_len; - l3_iov_base = pkt->vec[NET_TX_PKT_L3HDR_FRAG].iov_base; - l3_iov_len = pkt->vec[NET_TX_PKT_L3HDR_FRAG].iov_len; + int src_idx, dst_idx, pl_idx; + size_t src_offset; + size_t fragment_offset = 0; + struct virtio_net_hdr virt_hdr = { + .flags = pkt->virt_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM ? 
+ VIRTIO_NET_HDR_F_DATA_VALID : 0 + }; /* Copy headers */ - fragment[NET_TX_PKT_FRAGMENT_L2_HDR_POS].iov_base = l2_iov_base; - fragment[NET_TX_PKT_FRAGMENT_L2_HDR_POS].iov_len = l2_iov_len; - fragment[NET_TX_PKT_FRAGMENT_L3_HDR_POS].iov_base = l3_iov_base; - fragment[NET_TX_PKT_FRAGMENT_L3_HDR_POS].iov_len = l3_iov_len; + fragment[NET_TX_PKT_VHDR_FRAG].iov_base = &virt_hdr; + fragment[NET_TX_PKT_VHDR_FRAG].iov_len = sizeof(virt_hdr); + fragment[NET_TX_PKT_L2HDR_FRAG] = pkt->vec[NET_TX_PKT_L2HDR_FRAG]; + fragment[NET_TX_PKT_L3HDR_FRAG] = pkt->vec[NET_TX_PKT_L3HDR_FRAG]; + + switch (gso_type) { + case VIRTIO_NET_HDR_GSO_TCPV4: + case VIRTIO_NET_HDR_GSO_TCPV6: + if (!net_tx_pkt_tcp_fragment_init(pkt, fragment, &pl_idx, &l4hdr_len, + &src_idx, &src_offset, &src_len)) { + return false; + } + break; + + case VIRTIO_NET_HDR_GSO_UDP: + net_tx_pkt_do_sw_csum(pkt, &pkt->vec[NET_TX_PKT_L2HDR_FRAG], + pkt->payload_frags + NET_TX_PKT_PL_START_FRAG - 1, + pkt->payload_len); + net_tx_pkt_udp_fragment_init(pkt, &pl_idx, &l4hdr_len, + &src_idx, &src_offset, &src_len); + break; + default: + abort(); + } /* Put as much data as possible and send */ - do { - fragment_len = net_tx_pkt_fetch_fragment(pkt, &src_idx, &src_offset, - fragment, &dst_idx); + while (true) { + dst_idx = pl_idx; + fragment_len = net_tx_pkt_fetch_fragment(pkt, + &src_idx, &src_offset, src_len, fragment, &dst_idx); + if (!fragment_len) { + break; + } - more_frags = (fragment_offset + fragment_len < pkt->payload_len); + switch (gso_type) { + case VIRTIO_NET_HDR_GSO_TCPV4: + case VIRTIO_NET_HDR_GSO_TCPV6: + net_tx_pkt_tcp_fragment_fix(pkt, fragment, fragment_len, gso_type); + net_tx_pkt_do_sw_csum(pkt, fragment + NET_TX_PKT_L2HDR_FRAG, + dst_idx - NET_TX_PKT_L2HDR_FRAG, + l4hdr_len + fragment_len); + break; - eth_setup_ip4_fragmentation(l2_iov_base, l2_iov_len, l3_iov_base, - l3_iov_len, fragment_len, fragment_offset, more_frags); + case VIRTIO_NET_HDR_GSO_UDP: + net_tx_pkt_udp_fragment_fix(pkt, fragment, 
fragment_offset, + fragment_len); + break; + } - eth_fix_ip4_checksum(l3_iov_base, l3_iov_len); + callback(context, + fragment + NET_TX_PKT_L2HDR_FRAG, dst_idx - NET_TX_PKT_L2HDR_FRAG, + fragment + NET_TX_PKT_VHDR_FRAG, dst_idx - NET_TX_PKT_VHDR_FRAG); - net_tx_pkt_sendv(pkt, nc, fragment, dst_idx); + if (gso_type == VIRTIO_NET_HDR_GSO_TCPV4 || + gso_type == VIRTIO_NET_HDR_GSO_TCPV6) { + net_tx_pkt_tcp_fragment_advance(pkt, fragment, fragment_len, + gso_type); + } fragment_offset += fragment_len; + } - } while (fragment_len && more_frags); + if (gso_type == VIRTIO_NET_HDR_GSO_TCPV4 || + gso_type == VIRTIO_NET_HDR_GSO_TCPV6) { + net_tx_pkt_tcp_fragment_deinit(fragment); + } return true; } bool net_tx_pkt_send(struct NetTxPkt *pkt, NetClientState *nc) { - assert(pkt); + bool offload = qemu_get_using_vnet_hdr(nc->peer); + return net_tx_pkt_send_custom(pkt, offload, net_tx_pkt_sendv, nc); +} - if (!pkt->has_virt_hdr && - pkt->virt_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { - net_tx_pkt_do_sw_csum(pkt); - } +bool net_tx_pkt_send_custom(struct NetTxPkt *pkt, bool offload, + NetTxPktCallback callback, void *context) +{ + assert(pkt); /* * Since underlying infrastructure does not support IP datagrams longer @@ -629,26 +807,22 @@ bool net_tx_pkt_send(struct NetTxPkt *pkt, NetClientState *nc) } } - if (pkt->has_virt_hdr || - pkt->virt_hdr.gso_type == VIRTIO_NET_HDR_GSO_NONE) { + if (offload || pkt->virt_hdr.gso_type == VIRTIO_NET_HDR_GSO_NONE) { + if (!offload && pkt->virt_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { + net_tx_pkt_do_sw_csum(pkt, &pkt->vec[NET_TX_PKT_L2HDR_FRAG], + pkt->payload_frags + NET_TX_PKT_PL_START_FRAG - 1, + pkt->payload_len); + } + net_tx_pkt_fix_ip6_payload_len(pkt); - net_tx_pkt_sendv(pkt, nc, pkt->vec, - pkt->payload_frags + NET_TX_PKT_PL_START_FRAG); + callback(context, pkt->vec + NET_TX_PKT_L2HDR_FRAG, + pkt->payload_frags + NET_TX_PKT_PL_START_FRAG - NET_TX_PKT_L2HDR_FRAG, + pkt->vec + NET_TX_PKT_VHDR_FRAG, + pkt->payload_frags + 
NET_TX_PKT_PL_START_FRAG - NET_TX_PKT_VHDR_FRAG); return true; } - return net_tx_pkt_do_sw_fragmentation(pkt, nc); -} - -bool net_tx_pkt_send_loopback(struct NetTxPkt *pkt, NetClientState *nc) -{ - bool res; - - pkt->is_loopback = true; - res = net_tx_pkt_send(pkt, nc); - pkt->is_loopback = false; - - return res; + return net_tx_pkt_do_sw_fragmentation(pkt, callback, context); } void net_tx_pkt_fix_ip6_payload_len(struct NetTxPkt *pkt) diff --git a/hw/net/net_tx_pkt.h b/hw/net/net_tx_pkt.h index 4ec8bbe9bd..f57b4e034b 100644 --- a/hw/net/net_tx_pkt.h +++ b/hw/net/net_tx_pkt.h @@ -26,16 +26,17 @@ struct NetTxPkt; +typedef void (* NetTxPktCallback)(void *, const struct iovec *, int, const struct iovec *, int); + /** * Init function for tx packet functionality * * @pkt: packet pointer * @pci_dev: PCI device processing this packet * @max_frags: max tx ip fragments - * @has_virt_hdr: device uses virtio header. */ void net_tx_pkt_init(struct NetTxPkt **pkt, PCIDevice *pci_dev, - uint32_t max_frags, bool has_virt_hdr); + uint32_t max_frags); /** * Clean all tx packet resources. @@ -59,9 +60,10 @@ struct virtio_net_hdr *net_tx_pkt_get_vhdr(struct NetTxPkt *pkt); * @tso_enable: TSO enabled * @csum_enable: CSO enabled * @gso_size: MSS size for TSO + * @ret: operation result * */ -void net_tx_pkt_build_vheader(struct NetTxPkt *pkt, bool tso_enable, +bool net_tx_pkt_build_vheader(struct NetTxPkt *pkt, bool tso_enable, bool csum_enable, uint32_t gso_size); /** @@ -161,15 +163,16 @@ void net_tx_pkt_reset(struct NetTxPkt *pkt); bool net_tx_pkt_send(struct NetTxPkt *pkt, NetClientState *nc); /** -* Redirect packet directly to receive path (emulate loopback phy). -* Handles sw offloads if vhdr is not supported. -* -* @pkt: packet -* @nc: NetClientState -* @ret: operation result -* -*/ -bool net_tx_pkt_send_loopback(struct NetTxPkt *pkt, NetClientState *nc); + * Send packet with a custom function. 
+ * + * @pkt: packet + * @offload: whether the callback implements offloading + * @callback: a function to be called back for each transformed packet + * @context: a pointer to be passed to the callback. + * @ret: operation result + */ +bool net_tx_pkt_send_custom(struct NetTxPkt *pkt, bool offload, + NetTxPktCallback callback, void *context); /** * parse raw packet data and analyze offload requirements. diff --git a/hw/net/trace-events b/hw/net/trace-events index 4c0ec3fda1..65753411fc 100644 --- a/hw/net/trace-events +++ b/hw/net/trace-events @@ -61,7 +61,7 @@ pcnet_ioport_read(void *opaque, uint64_t addr, unsigned size) "opaque=%p addr=0x pcnet_ioport_write(void *opaque, uint64_t addr, uint64_t data, unsigned size) "opaque=%p addr=0x%"PRIx64" data=0x%"PRIx64" size=%d" # net_rx_pkt.c -net_rx_pkt_parsed(bool ip4, bool ip6, bool udp, bool tcp, size_t l3o, size_t l4o, size_t l5o) "RX packet parsed: ip4: %d, ip6: %d, udp: %d, tcp: %d, l3 offset: %zu, l4 offset: %zu, l5 offset: %zu" +net_rx_pkt_parsed(bool ip4, bool ip6, int l4proto, size_t l3o, size_t l4o, size_t l5o) "RX packet parsed: ip4: %d, ip6: %d, l4 protocol: %d, l3 offset: %zu, l4 offset: %zu, l5 offset: %zu" net_rx_pkt_l4_csum_validate_entry(void) "Starting L4 checksum validation" net_rx_pkt_l4_csum_validate_not_xxp(void) "Not a TCP/UDP packet" net_rx_pkt_l4_csum_validate_udp_with_no_checksum(void) "UDP packet without checksum" @@ -165,8 +165,8 @@ e1000e_rx_descr(int ridx, uint64_t base, uint8_t len) "Next RX descriptor: ring e1000e_rx_set_rctl(uint32_t rctl) "RCTL = 0x%x" e1000e_rx_receive_iov(int iovcnt) "Received vector of %d fragments" e1000e_rx_flt_dropped(void) "Received packet dropped by RX filter" -e1000e_rx_written_to_guest(uint32_t causes) "Received packet written to guest (ICR causes %u)" -e1000e_rx_not_written_to_guest(uint32_t causes) "Received packet NOT written to guest (ICR causes %u)" +e1000e_rx_written_to_guest(int queue_idx) "Received packet written to guest (queue %d)" 
+e1000e_rx_not_written_to_guest(int queue_idx) "Received packet NOT written to guest (queue %d)" e1000e_rx_interrupt_set(uint32_t causes) "Receive interrupt set (ICR causes %u)" e1000e_rx_interrupt_delayed(uint32_t causes) "Receive interrupt delayed (ICR causes %u)" e1000e_rx_set_cso(int cso_state) "RX CSO state set to %d" @@ -177,18 +177,16 @@ e1000e_rx_start_recv(void) e1000e_rx_rss_started(void) "Starting RSS processing" e1000e_rx_rss_disabled(void) "RSS is disabled" e1000e_rx_rss_type(uint32_t type) "RSS type is %u" -e1000e_rx_rss_ip4(bool isfragment, bool istcp, uint32_t mrqc, bool tcpipv4_enabled, bool ipv4_enabled) "RSS IPv4: fragment %d, tcp %d, mrqc 0x%X, tcpipv4 enabled %d, ipv4 enabled %d" +e1000e_rx_rss_ip4(int l4hdr_proto, uint32_t mrqc, bool tcpipv4_enabled, bool ipv4_enabled) "RSS IPv4: L4 header protocol %d, mrqc 0x%X, tcpipv4 enabled %d, ipv4 enabled %d" e1000e_rx_rss_ip6_rfctl(uint32_t rfctl) "RSS IPv6: rfctl 0x%X" -e1000e_rx_rss_ip6(bool ex_dis, bool new_ex_dis, bool istcp, bool has_ext_headers, bool ex_dst_valid, bool ex_src_valid, uint32_t mrqc, bool tcpipv6_enabled, bool ipv6ex_enabled, bool ipv6_enabled) "RSS IPv6: ex_dis: %d, new_ex_dis: %d, tcp %d, has_ext_headers %d, ex_dst_valid %d, ex_src_valid %d, mrqc 0x%X, tcpipv6 enabled %d, ipv6ex enabled %d, ipv6 enabled %d" -e1000e_rx_rss_dispatched_to_queue(int queue_idx) "Packet being dispatched to queue %d" +e1000e_rx_rss_ip6(bool ex_dis, bool new_ex_dis, int l4hdr_proto, bool has_ext_headers, bool ex_dst_valid, bool ex_src_valid, uint32_t mrqc, bool tcpipv6_enabled, bool ipv6ex_enabled, bool ipv6_enabled) "RSS IPv6: ex_dis: %d, new_ex_dis: %d, L4 header protocol %d, has_ext_headers %d, ex_dst_valid %d, ex_src_valid %d, mrqc 0x%X, tcpipv6 enabled %d, ipv6ex enabled %d, ipv6 enabled %d" -e1000e_rx_metadata_protocols(bool isip4, bool isip6, bool isudp, bool istcp) "protocols: ip4: %d, ip6: %d, udp: %d, tcp: %d" +e1000e_rx_metadata_protocols(bool hasip4, bool hasip6, int l4hdr_protocol) 
"protocols: ip4: %d, ip6: %d, l4hdr: %d" e1000e_rx_metadata_vlan(uint16_t vlan_tag) "VLAN tag is 0x%X" e1000e_rx_metadata_rss(uint32_t rss, uint32_t mrq) "RSS data: rss: 0x%X, mrq: 0x%X" e1000e_rx_metadata_ip_id(uint16_t ip_id) "the IPv4 ID is 0x%X" e1000e_rx_metadata_ack(void) "the packet is TCP ACK" e1000e_rx_metadata_pkt_type(uint32_t pkt_type) "the packet type is %u" -e1000e_rx_metadata_no_virthdr(void) "the packet has no virt-header" e1000e_rx_metadata_virthdr_no_csum_info(void) "virt-header does not contain checksum info" e1000e_rx_metadata_l3_cso_disabled(void) "IP4 CSO is disabled" e1000e_rx_metadata_l4_cso_disabled(void) "TCP/UDP CSO is disabled" @@ -201,10 +199,8 @@ e1000e_rx_metadata_ipv6_filtering_disabled(void) "IPv6 RX filtering disabled by e1000e_vlan_vet(uint16_t vet) "Setting VLAN ethernet type 0x%X" e1000e_irq_msi_notify(uint32_t cause) "MSI notify 0x%x" -e1000e_irq_throttling_no_pending_interrupts(void) "No pending interrupts to notify" e1000e_irq_msi_notify_postponed(void) "Sending MSI postponed by ITR" e1000e_irq_legacy_notify_postponed(void) "Raising legacy IRQ postponed by ITR" -e1000e_irq_throttling_no_pending_vec(int idx) "No pending interrupts for vector %d" e1000e_irq_msix_notify_postponed_vec(int idx) "Sending MSI-X postponed by EITR[%d]" e1000e_irq_legacy_notify(bool level) "IRQ line state: %d" e1000e_irq_msix_notify_vec(uint32_t vector) "MSI-X notify vector 0x%x" @@ -253,7 +249,7 @@ e1000e_vm_state_stopped(void) "VM state is stopped" # e1000e.c e1000e_cb_pci_realize(void) "E1000E PCI realize entry" e1000e_cb_pci_uninit(void) "E1000E PCI unit entry" -e1000e_cb_qdev_reset(void) "E1000E qdev reset entry" +e1000e_cb_qdev_reset_hold(void) "E1000E qdev reset hold" e1000e_cb_pre_save(void) "E1000E pre save entry" e1000e_cb_post_load(void) "E1000E post load entry" @@ -274,6 +270,38 @@ e1000e_msix_use_vector_fail(uint32_t vec, int32_t res) "Failed to use MSI-X vect e1000e_mac_set_permanent(uint8_t b0, uint8_t b1, uint8_t b2, uint8_t b3, uint8_t 
b4, uint8_t b5) "Set permanent MAC: %02x:%02x:%02x:%02x:%02x:%02x" e1000e_cfg_support_virtio(bool support) "Virtio header supported: %d" +# igb.c +igb_write_config(uint32_t address, uint32_t val, int len) "CONFIG write 0x%"PRIx32", value: 0x%"PRIx32", len: %"PRId32 +igbvf_write_config(uint32_t address, uint32_t val, int len) "CONFIG write 0x%"PRIx32", value: 0x%"PRIx32", len: %"PRId32 + +# igb_core.c +igb_core_mdic_read(uint32_t addr, uint32_t data) "MDIC READ: PHY[%u] = 0x%x" +igb_core_mdic_read_unhandled(uint32_t addr) "MDIC READ: PHY[%u] UNHANDLED" +igb_core_mdic_write(uint32_t addr, uint32_t data) "MDIC WRITE: PHY[%u] = 0x%x" +igb_core_mdic_write_unhandled(uint32_t addr) "MDIC WRITE: PHY[%u] UNHANDLED" + +igb_rx_desc_buff_size(uint32_t b) "buffer size: %u" +igb_rx_desc_buff_write(uint64_t addr, uint16_t offset, const void* source, uint32_t len) "addr: 0x%"PRIx64", offset: %u, from: %p, length: %u" + +igb_rx_metadata_rss(uint32_t rss) "RSS data: 0x%X" + +igb_irq_icr_clear_gpie_nsicr(void) "Clearing ICR on read due to GPIE.NSICR enabled" +igb_irq_icr_write(uint32_t bits, uint32_t old_icr, uint32_t new_icr) "Clearing ICR bits 0x%x: 0x%x --> 0x%x" +igb_irq_set_iam(uint32_t icr) "Update IAM: 0x%x" +igb_irq_read_iam(uint32_t icr) "Current IAM: 0x%x" +igb_irq_write_eics(uint32_t val, bool msix) "Update EICS: 0x%x MSI-X: %d" +igb_irq_write_eims(uint32_t val, bool msix) "Update EIMS: 0x%x MSI-X: %d" +igb_irq_write_eimc(uint32_t val, uint32_t eims, bool msix) "Update EIMC: 0x%x EIMS: 0x%x MSI-X: %d" +igb_irq_write_eiac(uint32_t val) "Update EIAC: 0x%x" +igb_irq_write_eiam(uint32_t val, bool msix) "Update EIAM: 0x%x MSI-X: %d" +igb_irq_write_eicr(uint32_t val, bool msix) "Update EICR: 0x%x MSI-X: %d" +igb_irq_eitr_set(uint32_t eitr_num, uint32_t val) "EITR[%u] = 0x%x" +igb_set_pfmailbox(uint32_t vf_num, uint32_t val) "PFMailbox[%d]: 0x%x" +igb_set_vfmailbox(uint32_t vf_num, uint32_t val) "VFMailbox[%d]: 0x%x" + +# igbvf.c +igbvf_wrn_io_addr_unknown(uint64_t addr) "IO 
unknown register 0x%"PRIx64 + # spapr_llan.c spapr_vlan_get_rx_bd_from_pool_found(int pool, int32_t count, uint32_t rx_bufs) "pool=%d count=%"PRId32" rxbufs=%"PRIu32 spapr_vlan_get_rx_bd_from_page(int buf_ptr, uint64_t bd) "use_buf_ptr=%d bd=0x%016"PRIx64 diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 09d5c7a664..53e1c32643 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -1746,39 +1746,61 @@ static int receive_filter(VirtIONet *n, const uint8_t *buf, int size) return 0; } -static uint8_t virtio_net_get_hash_type(bool isip4, - bool isip6, - bool isudp, - bool istcp, +static uint8_t virtio_net_get_hash_type(bool hasip4, + bool hasip6, + EthL4HdrProto l4hdr_proto, uint32_t types) { - if (isip4) { - if (istcp && (types & VIRTIO_NET_RSS_HASH_TYPE_TCPv4)) { - return NetPktRssIpV4Tcp; - } - if (isudp && (types & VIRTIO_NET_RSS_HASH_TYPE_UDPv4)) { - return NetPktRssIpV4Udp; + if (hasip4) { + switch (l4hdr_proto) { + case ETH_L4_HDR_PROTO_TCP: + if (types & VIRTIO_NET_RSS_HASH_TYPE_TCPv4) { + return NetPktRssIpV4Tcp; + } + break; + + case ETH_L4_HDR_PROTO_UDP: + if (types & VIRTIO_NET_RSS_HASH_TYPE_UDPv4) { + return NetPktRssIpV4Udp; + } + break; + + default: + break; } + if (types & VIRTIO_NET_RSS_HASH_TYPE_IPv4) { return NetPktRssIpV4; } - } else if (isip6) { - uint32_t mask = VIRTIO_NET_RSS_HASH_TYPE_TCP_EX | - VIRTIO_NET_RSS_HASH_TYPE_TCPv6; + } else if (hasip6) { + switch (l4hdr_proto) { + case ETH_L4_HDR_PROTO_TCP: + if (types & VIRTIO_NET_RSS_HASH_TYPE_TCP_EX) { + return NetPktRssIpV6TcpEx; + } + if (types & VIRTIO_NET_RSS_HASH_TYPE_TCPv6) { + return NetPktRssIpV6Tcp; + } + break; + + case ETH_L4_HDR_PROTO_UDP: + if (types & VIRTIO_NET_RSS_HASH_TYPE_UDP_EX) { + return NetPktRssIpV6UdpEx; + } + if (types & VIRTIO_NET_RSS_HASH_TYPE_UDPv6) { + return NetPktRssIpV6Udp; + } + break; - if (istcp && (types & mask)) { - return (types & VIRTIO_NET_RSS_HASH_TYPE_TCP_EX) ? 
- NetPktRssIpV6TcpEx : NetPktRssIpV6Tcp; + default: + break; } - mask = VIRTIO_NET_RSS_HASH_TYPE_UDP_EX | VIRTIO_NET_RSS_HASH_TYPE_UDPv6; - if (isudp && (types & mask)) { - return (types & VIRTIO_NET_RSS_HASH_TYPE_UDP_EX) ? - NetPktRssIpV6UdpEx : NetPktRssIpV6Udp; + + if (types & VIRTIO_NET_RSS_HASH_TYPE_IP_EX) { + return NetPktRssIpV6Ex; } - mask = VIRTIO_NET_RSS_HASH_TYPE_IP_EX | VIRTIO_NET_RSS_HASH_TYPE_IPv6; - if (types & mask) { - return (types & VIRTIO_NET_RSS_HASH_TYPE_IP_EX) ? - NetPktRssIpV6Ex : NetPktRssIpV6; + if (types & VIRTIO_NET_RSS_HASH_TYPE_IPv6) { + return NetPktRssIpV6; } } return 0xff; @@ -1800,7 +1822,8 @@ static int virtio_net_process_rss(NetClientState *nc, const uint8_t *buf, struct NetRxPkt *pkt = n->rx_pkt; uint8_t net_hash_type; uint32_t hash; - bool isip4, isip6, isudp, istcp; + bool hasip4, hasip6; + EthL4HdrProto l4hdr_proto; static const uint8_t reports[NetPktRssIpV6UdpEx + 1] = { VIRTIO_NET_HASH_REPORT_IPv4, VIRTIO_NET_HASH_REPORT_TCPv4, @@ -1815,14 +1838,8 @@ static int virtio_net_process_rss(NetClientState *nc, const uint8_t *buf, net_rx_pkt_set_protocols(pkt, buf + n->host_hdr_len, size - n->host_hdr_len); - net_rx_pkt_get_protocols(pkt, &isip4, &isip6, &isudp, &istcp); - if (isip4 && (net_rx_pkt_get_ip4_info(pkt)->fragment)) { - istcp = isudp = false; - } - if (isip6 && (net_rx_pkt_get_ip6_info(pkt)->fragment)) { - istcp = isudp = false; - } - net_hash_type = virtio_net_get_hash_type(isip4, isip6, isudp, istcp, + net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto); + net_hash_type = virtio_net_get_hash_type(hasip4, hasip6, l4hdr_proto, n->rss_data.hash_types); if (net_hash_type > NetPktRssIpV6UdpEx) { if (n->rss_data.populate_hash) { @@ -3718,7 +3735,7 @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp) QTAILQ_INIT(&n->rsc_chains); n->qdev = dev; - net_rx_pkt_init(&n->rx_pkt, false); + net_rx_pkt_init(&n->rx_pkt); if (virtio_has_feature(n->host_features, VIRTIO_NET_F_RSS)) { virtio_net_load_ebpf(n); 
diff --git a/hw/net/vmxnet3.c b/hw/net/vmxnet3.c index 56559cda24..1068b80868 100644 --- a/hw/net/vmxnet3.c +++ b/hw/net/vmxnet3.c @@ -440,19 +440,19 @@ vmxnet3_setup_tx_offloads(VMXNET3State *s) { switch (s->offload_mode) { case VMXNET3_OM_NONE: - net_tx_pkt_build_vheader(s->tx_pkt, false, false, 0); - break; + return net_tx_pkt_build_vheader(s->tx_pkt, false, false, 0); case VMXNET3_OM_CSUM: - net_tx_pkt_build_vheader(s->tx_pkt, false, true, 0); VMW_PKPRN("L4 CSO requested\n"); - break; + return net_tx_pkt_build_vheader(s->tx_pkt, false, true, 0); case VMXNET3_OM_TSO: - net_tx_pkt_build_vheader(s->tx_pkt, true, true, - s->cso_or_gso_size); - net_tx_pkt_update_ip_checksums(s->tx_pkt); VMW_PKPRN("GSO offload requested."); + if (!net_tx_pkt_build_vheader(s->tx_pkt, true, true, + s->cso_or_gso_size)) { + return false; + } + net_tx_pkt_update_ip_checksums(s->tx_pkt); break; default: @@ -847,21 +847,20 @@ static void vmxnet3_rx_need_csum_calculate(struct NetRxPkt *pkt, size_t pkt_len) { struct virtio_net_hdr *vhdr; - bool isip4, isip6, istcp, isudp; + bool hasip4, hasip6; + EthL4HdrProto l4hdr_proto; uint8_t *data; int len; - if (!net_rx_pkt_has_virt_hdr(pkt)) { - return; - } - vhdr = net_rx_pkt_get_vhdr(pkt); if (!VMXNET_FLAG_IS_SET(vhdr->flags, VIRTIO_NET_HDR_F_NEEDS_CSUM)) { return; } - net_rx_pkt_get_protocols(pkt, &isip4, &isip6, &isudp, &istcp); - if (!(isip4 || isip6) || !(istcp || isudp)) { + net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto); + if (!(hasip4 || hasip6) || + (l4hdr_proto != ETH_L4_HDR_PROTO_TCP && + l4hdr_proto != ETH_L4_HDR_PROTO_UDP)) { return; } @@ -889,7 +888,8 @@ static void vmxnet3_rx_update_descr(struct NetRxPkt *pkt, struct Vmxnet3_RxCompDesc *rxcd) { int csum_ok, is_gso; - bool isip4, isip6, istcp, isudp; + bool hasip4, hasip6; + EthL4HdrProto l4hdr_proto; struct virtio_net_hdr *vhdr; uint8_t offload_type; @@ -898,10 +898,6 @@ static void vmxnet3_rx_update_descr(struct NetRxPkt *pkt, rxcd->tci = net_rx_pkt_get_vlan_tag(pkt); 
} - if (!net_rx_pkt_has_virt_hdr(pkt)) { - goto nocsum; - } - vhdr = net_rx_pkt_get_vhdr(pkt); /* * Checksum is valid when lower level tell so or when lower level @@ -919,16 +915,18 @@ static void vmxnet3_rx_update_descr(struct NetRxPkt *pkt, goto nocsum; } - net_rx_pkt_get_protocols(pkt, &isip4, &isip6, &isudp, &istcp); - if ((!istcp && !isudp) || (!isip4 && !isip6)) { + net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto); + if ((l4hdr_proto != ETH_L4_HDR_PROTO_TCP && + l4hdr_proto != ETH_L4_HDR_PROTO_UDP) || + (!hasip4 && !hasip6)) { goto nocsum; } rxcd->cnc = 0; - rxcd->v4 = isip4 ? 1 : 0; - rxcd->v6 = isip6 ? 1 : 0; - rxcd->tcp = istcp ? 1 : 0; - rxcd->udp = isudp ? 1 : 0; + rxcd->v4 = hasip4 ? 1 : 0; + rxcd->v6 = hasip6 ? 1 : 0; + rxcd->tcp = l4hdr_proto == ETH_L4_HDR_PROTO_TCP; + rxcd->udp = l4hdr_proto == ETH_L4_HDR_PROTO_UDP; rxcd->fcs = rxcd->tuc = rxcd->ipc = 1; return; @@ -1521,9 +1519,8 @@ static void vmxnet3_activate_device(VMXNET3State *s) /* Preallocate TX packet wrapper */ VMW_CFPRN("Max TX fragments is %u", s->max_tx_frags); - net_tx_pkt_init(&s->tx_pkt, PCI_DEVICE(s), - s->max_tx_frags, s->peer_has_vhdr); - net_rx_pkt_init(&s->rx_pkt, s->peer_has_vhdr); + net_tx_pkt_init(&s->tx_pkt, PCI_DEVICE(s), s->max_tx_frags); + net_rx_pkt_init(&s->rx_pkt); /* Read rings memory locations for RX queues */ for (i = 0; i < s->rxq_num; i++) { @@ -2402,9 +2399,8 @@ static int vmxnet3_post_load(void *opaque, int version_id) { VMXNET3State *s = opaque; - net_tx_pkt_init(&s->tx_pkt, PCI_DEVICE(s), - s->max_tx_frags, s->peer_has_vhdr); - net_rx_pkt_init(&s->rx_pkt, s->peer_has_vhdr); + net_tx_pkt_init(&s->tx_pkt, PCI_DEVICE(s), s->max_tx_frags); + net_rx_pkt_init(&s->rx_pkt); if (s->msix_used) { vmxnet3_use_msix_vectors(s, VMXNET3_MAX_INTRS); diff --git a/hw/net/xen_nic.c b/hw/net/xen_nic.c index 7d92c2d022..9bbf6599fc 100644 --- a/hw/net/xen_nic.c +++ b/hw/net/xen_nic.c @@ -145,7 +145,7 @@ static void net_tx_packets(struct XenNetDev *netdev) continue; } - 
if ((txreq.offset + txreq.size) > XC_PAGE_SIZE) { + if ((txreq.offset + txreq.size) > XEN_PAGE_SIZE) { xen_pv_printf(&netdev->xendev, 0, "error: page crossing\n"); net_tx_error(netdev, &txreq, rc); continue; @@ -171,7 +171,7 @@ static void net_tx_packets(struct XenNetDev *netdev) if (txreq.flags & NETTXF_csum_blank) { /* have read-only mapping -> can't fill checksum in-place */ if (!tmpbuf) { - tmpbuf = g_malloc(XC_PAGE_SIZE); + tmpbuf = g_malloc(XEN_PAGE_SIZE); } memcpy(tmpbuf, page + txreq.offset, txreq.size); net_checksum_calculate(tmpbuf, txreq.size, CSUM_ALL); @@ -181,7 +181,7 @@ static void net_tx_packets(struct XenNetDev *netdev) qemu_send_packet(qemu_get_queue(netdev->nic), page + txreq.offset, txreq.size); } - xen_be_unmap_grant_ref(&netdev->xendev, page); + xen_be_unmap_grant_ref(&netdev->xendev, page, txreq.gref); net_tx_response(netdev, &txreq, NETIF_RSP_OKAY); } if (!netdev->tx_work) { @@ -243,9 +243,9 @@ static ssize_t net_rx_packet(NetClientState *nc, const uint8_t *buf, size_t size if (rc == rp || RING_REQUEST_CONS_OVERFLOW(&netdev->rx_ring, rc)) { return 0; } - if (size > XC_PAGE_SIZE - NET_IP_ALIGN) { + if (size > XEN_PAGE_SIZE - NET_IP_ALIGN) { xen_pv_printf(&netdev->xendev, 0, "packet too big (%lu > %ld)", - (unsigned long)size, XC_PAGE_SIZE - NET_IP_ALIGN); + (unsigned long)size, XEN_PAGE_SIZE - NET_IP_ALIGN); return -1; } @@ -261,7 +261,7 @@ static ssize_t net_rx_packet(NetClientState *nc, const uint8_t *buf, size_t size return -1; } memcpy(page + NET_IP_ALIGN, buf, size); - xen_be_unmap_grant_ref(&netdev->xendev, page); + xen_be_unmap_grant_ref(&netdev->xendev, page, rxreq.gref); net_rx_response(netdev, &rxreq, NETIF_RSP_OKAY, NET_IP_ALIGN, size, 0); return size; @@ -343,12 +343,13 @@ static int net_connect(struct XenLegacyDevice *xendev) netdev->rx_ring_ref, PROT_READ | PROT_WRITE); if (!netdev->rxs) { - xen_be_unmap_grant_ref(&netdev->xendev, netdev->txs); + xen_be_unmap_grant_ref(&netdev->xendev, netdev->txs, + netdev->tx_ring_ref); 
netdev->txs = NULL; return -1; } - BACK_RING_INIT(&netdev->tx_ring, netdev->txs, XC_PAGE_SIZE); - BACK_RING_INIT(&netdev->rx_ring, netdev->rxs, XC_PAGE_SIZE); + BACK_RING_INIT(&netdev->tx_ring, netdev->txs, XEN_PAGE_SIZE); + BACK_RING_INIT(&netdev->rx_ring, netdev->rxs, XEN_PAGE_SIZE); xen_be_bind_evtchn(&netdev->xendev); @@ -368,11 +369,13 @@ static void net_disconnect(struct XenLegacyDevice *xendev) xen_pv_unbind_evtchn(&netdev->xendev); if (netdev->txs) { - xen_be_unmap_grant_ref(&netdev->xendev, netdev->txs); + xen_be_unmap_grant_ref(&netdev->xendev, netdev->txs, + netdev->tx_ring_ref); netdev->txs = NULL; } if (netdev->rxs) { - xen_be_unmap_grant_ref(&netdev->xendev, netdev->rxs); + xen_be_unmap_grant_ref(&netdev->xendev, netdev->rxs, + netdev->rx_ring_ref); netdev->rxs = NULL; } } diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index f25cc2c235..49c1210fce 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -238,6 +238,8 @@ static const bool nvme_feature_support[NVME_FID_MAX] = { [NVME_TIMESTAMP] = true, [NVME_HOST_BEHAVIOR_SUPPORT] = true, [NVME_COMMAND_SET_PROFILE] = true, + [NVME_FDP_MODE] = true, + [NVME_FDP_EVENTS] = true, }; static const uint32_t nvme_feature_cap[NVME_FID_MAX] = { @@ -249,6 +251,8 @@ static const uint32_t nvme_feature_cap[NVME_FID_MAX] = { [NVME_TIMESTAMP] = NVME_FEAT_CAP_CHANGE, [NVME_HOST_BEHAVIOR_SUPPORT] = NVME_FEAT_CAP_CHANGE, [NVME_COMMAND_SET_PROFILE] = NVME_FEAT_CAP_CHANGE, + [NVME_FDP_MODE] = NVME_FEAT_CAP_CHANGE, + [NVME_FDP_EVENTS] = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS, }; static const uint32_t nvme_cse_acs[256] = { @@ -266,6 +270,8 @@ static const uint32_t nvme_cse_acs[256] = { [NVME_ADM_CMD_VIRT_MNGMT] = NVME_CMD_EFF_CSUPP, [NVME_ADM_CMD_DBBUF_CONFIG] = NVME_CMD_EFF_CSUPP, [NVME_ADM_CMD_FORMAT_NVM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, + [NVME_ADM_CMD_DIRECTIVE_RECV] = NVME_CMD_EFF_CSUPP, + [NVME_ADM_CMD_DIRECTIVE_SEND] = NVME_CMD_EFF_CSUPP, }; static const uint32_t nvme_cse_iocs_none[256]; @@ -279,6 +285,8 @@ static 
const uint32_t nvme_cse_iocs_nvm[256] = { [NVME_CMD_VERIFY] = NVME_CMD_EFF_CSUPP, [NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP, + [NVME_CMD_IO_MGMT_RECV] = NVME_CMD_EFF_CSUPP, + [NVME_CMD_IO_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, }; static const uint32_t nvme_cse_iocs_zoned[256] = { @@ -297,12 +305,66 @@ static const uint32_t nvme_cse_iocs_zoned[256] = { static void nvme_process_sq(void *opaque); static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst); +static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n); static uint16_t nvme_sqid(NvmeRequest *req) { return le16_to_cpu(req->sq->sqid); } +static inline uint16_t nvme_make_pid(NvmeNamespace *ns, uint16_t rg, + uint16_t ph) +{ + uint16_t rgif = ns->endgrp->fdp.rgif; + + if (!rgif) { + return ph; + } + + return (rg << (16 - rgif)) | ph; +} + +static inline bool nvme_ph_valid(NvmeNamespace *ns, uint16_t ph) +{ + return ph < ns->fdp.nphs; +} + +static inline bool nvme_rg_valid(NvmeEnduranceGroup *endgrp, uint16_t rg) +{ + return rg < endgrp->fdp.nrg; +} + +static inline uint16_t nvme_pid2ph(NvmeNamespace *ns, uint16_t pid) +{ + uint16_t rgif = ns->endgrp->fdp.rgif; + + if (!rgif) { + return pid; + } + + return pid & ((1 << (15 - rgif)) - 1); +} + +static inline uint16_t nvme_pid2rg(NvmeNamespace *ns, uint16_t pid) +{ + uint16_t rgif = ns->endgrp->fdp.rgif; + + if (!rgif) { + return 0; + } + + return pid >> (16 - rgif); +} + +static inline bool nvme_parse_pid(NvmeNamespace *ns, uint16_t pid, + uint16_t *ph, uint16_t *rg) +{ + *rg = nvme_pid2rg(ns, pid); + *ph = nvme_pid2ph(ns, pid); + + return nvme_ph_valid(ns, *ph) && nvme_rg_valid(ns->endgrp, *rg); +} + static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone, NvmeZoneState state) { @@ -376,6 +438,69 @@ static uint16_t nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn) return nvme_zns_check_resources(ns, act, opn, 0); } +static NvmeFdpEvent 
*nvme_fdp_alloc_event(NvmeCtrl *n, NvmeFdpEventBuffer *ebuf) +{ + NvmeFdpEvent *ret = NULL; + bool is_full = ebuf->next == ebuf->start && ebuf->nelems; + + ret = &ebuf->events[ebuf->next++]; + if (unlikely(ebuf->next == NVME_FDP_MAX_EVENTS)) { + ebuf->next = 0; + } + if (is_full) { + ebuf->start = ebuf->next; + } else { + ebuf->nelems++; + } + + memset(ret, 0, sizeof(NvmeFdpEvent)); + ret->timestamp = nvme_get_timestamp(n); + + return ret; +} + +static inline int log_event(NvmeRuHandle *ruh, uint8_t event_type) +{ + return (ruh->event_filter >> nvme_fdp_evf_shifts[event_type]) & 0x1; +} + +static bool nvme_update_ruh(NvmeCtrl *n, NvmeNamespace *ns, uint16_t pid) +{ + NvmeEnduranceGroup *endgrp = ns->endgrp; + NvmeRuHandle *ruh; + NvmeReclaimUnit *ru; + NvmeFdpEvent *e = NULL; + uint16_t ph, rg, ruhid; + + if (!nvme_parse_pid(ns, pid, &ph, &rg)) { + return false; + } + + ruhid = ns->fdp.phs[ph]; + + ruh = &endgrp->fdp.ruhs[ruhid]; + ru = &ruh->rus[rg]; + + if (ru->ruamw) { + if (log_event(ruh, FDP_EVT_RU_NOT_FULLY_WRITTEN)) { + e = nvme_fdp_alloc_event(n, &endgrp->fdp.host_events); + e->type = FDP_EVT_RU_NOT_FULLY_WRITTEN; + e->flags = FDPEF_PIV | FDPEF_NSIDV | FDPEF_LV; + e->pid = cpu_to_le16(pid); + e->nsid = cpu_to_le32(ns->params.nsid); + e->rgid = cpu_to_le16(rg); + e->ruhid = cpu_to_le16(ruhid); + } + + /* log (eventual) GC overhead of prematurely swapping the RU */ + nvme_fdp_stat_inc(&endgrp->fdp.mbmw, nvme_l2b(ns, ru->ruamw)); + } + + ru->ruamw = ruh->ruamw; + + return true; +} + static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr) { hwaddr hi, lo; @@ -3320,6 +3445,41 @@ invalid: return status | NVME_DNR; } +static void nvme_do_write_fdp(NvmeCtrl *n, NvmeRequest *req, uint64_t slba, + uint32_t nlb) +{ + NvmeNamespace *ns = req->ns; + NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; + uint64_t data_size = nvme_l2b(ns, nlb); + uint32_t dw12 = le32_to_cpu(req->cmd.cdw12); + uint8_t dtype = (dw12 >> 20) & 0xf; + uint16_t pid = le16_to_cpu(rw->dspec); + uint16_t ph, 
rg, ruhid; + NvmeReclaimUnit *ru; + + if (dtype != NVME_DIRECTIVE_DATA_PLACEMENT || + !nvme_parse_pid(ns, pid, &ph, &rg)) { + ph = 0; + rg = 0; + } + + ruhid = ns->fdp.phs[ph]; + ru = &ns->endgrp->fdp.ruhs[ruhid].rus[rg]; + + nvme_fdp_stat_inc(&ns->endgrp->fdp.hbmw, data_size); + nvme_fdp_stat_inc(&ns->endgrp->fdp.mbmw, data_size); + + while (nlb) { + if (nlb < ru->ruamw) { + ru->ruamw -= nlb; + break; + } + + nlb -= ru->ruamw; + nvme_update_ruh(n, ns, pid); + } +} + static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append, bool wrz) { @@ -3429,6 +3589,8 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append, if (!(zone->d.za & NVME_ZA_ZRWA_VALID)) { zone->w_ptr += nlb; } + } else if (ns->endgrp && ns->endgrp->fdp.enabled) { + nvme_do_write_fdp(n, req, slba, nlb); } data_offset = nvme_l2b(ns, slba); @@ -4086,6 +4248,126 @@ static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req) return status; } +static uint16_t nvme_io_mgmt_recv_ruhs(NvmeCtrl *n, NvmeRequest *req, + size_t len) +{ + NvmeNamespace *ns = req->ns; + NvmeEnduranceGroup *endgrp; + NvmeRuhStatus *hdr; + NvmeRuhStatusDescr *ruhsd; + unsigned int nruhsd; + uint16_t rg, ph, *ruhid; + size_t trans_len; + g_autofree uint8_t *buf = NULL; + + if (!n->subsys) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + if (ns->params.nsid == 0 || ns->params.nsid == 0xffffffff) { + return NVME_INVALID_NSID | NVME_DNR; + } + + if (!n->subsys->endgrp.fdp.enabled) { + return NVME_FDP_DISABLED | NVME_DNR; + } + + endgrp = ns->endgrp; + + nruhsd = ns->fdp.nphs * endgrp->fdp.nrg; + trans_len = sizeof(NvmeRuhStatus) + nruhsd * sizeof(NvmeRuhStatusDescr); + buf = g_malloc(trans_len); + + trans_len = MIN(trans_len, len); + + hdr = (NvmeRuhStatus *)buf; + ruhsd = (NvmeRuhStatusDescr *)(buf + sizeof(NvmeRuhStatus)); + + hdr->nruhsd = cpu_to_le16(nruhsd); + + ruhid = ns->fdp.phs; + + for (ph = 0; ph < ns->fdp.nphs; ph++, ruhid++) { + NvmeRuHandle *ruh = &endgrp->fdp.ruhs[*ruhid]; + + for 
(rg = 0; rg < endgrp->fdp.nrg; rg++, ruhsd++) { + uint16_t pid = nvme_make_pid(ns, rg, ph); + + ruhsd->pid = cpu_to_le16(pid); + ruhsd->ruhid = *ruhid; + ruhsd->earutr = 0; + ruhsd->ruamw = cpu_to_le64(ruh->rus[rg].ruamw); + } + } + + return nvme_c2h(n, buf, trans_len, req); +} + +static uint16_t nvme_io_mgmt_recv(NvmeCtrl *n, NvmeRequest *req) +{ + NvmeCmd *cmd = &req->cmd; + uint32_t cdw10 = le32_to_cpu(cmd->cdw10); + uint32_t numd = le32_to_cpu(cmd->cdw11); + uint8_t mo = (cdw10 & 0xff); + size_t len = (numd + 1) << 2; + + switch (mo) { + case NVME_IOMR_MO_NOP: + return 0; + case NVME_IOMR_MO_RUH_STATUS: + return nvme_io_mgmt_recv_ruhs(n, req, len); + default: + return NVME_INVALID_FIELD | NVME_DNR; + }; +} + +static uint16_t nvme_io_mgmt_send_ruh_update(NvmeCtrl *n, NvmeRequest *req) +{ + NvmeCmd *cmd = &req->cmd; + NvmeNamespace *ns = req->ns; + uint32_t cdw10 = le32_to_cpu(cmd->cdw10); + uint16_t ret = NVME_SUCCESS; + uint32_t npid = (cdw10 >> 1) + 1; + unsigned int i = 0; + g_autofree uint16_t *pids = NULL; + uint32_t maxnpid = n->subsys->endgrp.fdp.nrg * n->subsys->endgrp.fdp.nruh; + + if (unlikely(npid >= MIN(NVME_FDP_MAXPIDS, maxnpid))) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + pids = g_new(uint16_t, npid); + + ret = nvme_h2c(n, pids, npid * sizeof(uint16_t), req); + if (ret) { + return ret; + } + + for (; i < npid; i++) { + if (!nvme_update_ruh(n, ns, pids[i])) { + return NVME_INVALID_FIELD | NVME_DNR; + } + } + + return ret; +} + +static uint16_t nvme_io_mgmt_send(NvmeCtrl *n, NvmeRequest *req) +{ + NvmeCmd *cmd = &req->cmd; + uint32_t cdw10 = le32_to_cpu(cmd->cdw10); + uint8_t mo = (cdw10 & 0xff); + + switch (mo) { + case NVME_IOMS_MO_NOP: + return 0; + case NVME_IOMS_MO_RUH_UPDATE: + return nvme_io_mgmt_send_ruh_update(n, req); + default: + return NVME_INVALID_FIELD | NVME_DNR; + }; +} + static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req) { NvmeNamespace *ns; @@ -4162,6 +4444,10 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest 
*req) return nvme_zone_mgmt_send(n, req); case NVME_CMD_ZONE_MGMT_RECV: return nvme_zone_mgmt_recv(n, req); + case NVME_CMD_IO_MGMT_RECV: + return nvme_io_mgmt_recv(n, req); + case NVME_CMD_IO_MGMT_SEND: + return nvme_io_mgmt_send(n, req); default: assert(false); } @@ -4386,8 +4672,8 @@ static void nvme_set_blk_stats(NvmeNamespace *ns, struct nvme_stats *stats) { BlockAcctStats *s = blk_get_stats(ns->blkconf.blk); - stats->units_read += s->nr_bytes[BLOCK_ACCT_READ] >> BDRV_SECTOR_BITS; - stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE] >> BDRV_SECTOR_BITS; + stats->units_read += s->nr_bytes[BLOCK_ACCT_READ]; + stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE]; stats->read_commands += s->nr_ops[BLOCK_ACCT_READ]; stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE]; } @@ -4401,6 +4687,7 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, uint32_t trans_len; NvmeNamespace *ns; time_t current_ms; + uint64_t u_read, u_written; if (off >= sizeof(smart)) { return NVME_INVALID_FIELD | NVME_DNR; @@ -4427,10 +4714,11 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, trans_len = MIN(sizeof(smart) - off, buf_len); smart.critical_warning = n->smart_critical_warning; - smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_read, - 1000)); - smart.data_units_written[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_written, - 1000)); + u_read = DIV_ROUND_UP(stats.units_read >> BDRV_SECTOR_BITS, 1000); + u_written = DIV_ROUND_UP(stats.units_written >> BDRV_SECTOR_BITS, 1000); + + smart.data_units_read[0] = cpu_to_le64(u_read); + smart.data_units_written[0] = cpu_to_le64(u_written); smart.host_read_commands[0] = cpu_to_le64(stats.read_commands); smart.host_write_commands[0] = cpu_to_le64(stats.write_commands); @@ -4452,6 +4740,48 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, return nvme_c2h(n, (uint8_t *) &smart + off, trans_len, req); } +static uint16_t nvme_endgrp_info(NvmeCtrl *n, 
uint8_t rae, uint32_t buf_len, + uint64_t off, NvmeRequest *req) +{ + uint32_t dw11 = le32_to_cpu(req->cmd.cdw11); + uint16_t endgrpid = (dw11 >> 16) & 0xffff; + struct nvme_stats stats = {}; + NvmeEndGrpLog info = {}; + int i; + + if (!n->subsys || endgrpid != 0x1) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + if (off >= sizeof(info)) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { + NvmeNamespace *ns = nvme_subsys_ns(n->subsys, i); + if (!ns) { + continue; + } + + nvme_set_blk_stats(ns, &stats); + } + + info.data_units_read[0] = + cpu_to_le64(DIV_ROUND_UP(stats.units_read / 1000000000, 1000000000)); + info.data_units_written[0] = + cpu_to_le64(DIV_ROUND_UP(stats.units_written / 1000000000, 1000000000)); + info.media_units_written[0] = + cpu_to_le64(DIV_ROUND_UP(stats.units_written / 1000000000, 1000000000)); + + info.host_read_commands[0] = cpu_to_le64(stats.read_commands); + info.host_write_commands[0] = cpu_to_le64(stats.write_commands); + + buf_len = MIN(sizeof(info) - off, buf_len); + + return nvme_c2h(n, (uint8_t *)&info + off, buf_len, req); +} + + static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off, NvmeRequest *req) { @@ -4577,6 +4907,207 @@ static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len, return nvme_c2h(n, ((uint8_t *)&log) + off, trans_len, req); } +static size_t sizeof_fdp_conf_descr(size_t nruh, size_t vss) +{ + size_t entry_siz = sizeof(NvmeFdpDescrHdr) + nruh * sizeof(NvmeRuhDescr) + + vss; + return ROUND_UP(entry_siz, 8); +} + +static uint16_t nvme_fdp_confs(NvmeCtrl *n, uint32_t endgrpid, uint32_t buf_len, + uint64_t off, NvmeRequest *req) +{ + uint32_t log_size, trans_len; + g_autofree uint8_t *buf = NULL; + NvmeFdpDescrHdr *hdr; + NvmeRuhDescr *ruhd; + NvmeEnduranceGroup *endgrp; + NvmeFdpConfsHdr *log; + size_t nruh, fdp_descr_size; + int i; + + if (endgrpid != 1 || !n->subsys) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + endgrp = 
&n->subsys->endgrp; + + if (endgrp->fdp.enabled) { + nruh = endgrp->fdp.nruh; + } else { + nruh = 1; + } + + fdp_descr_size = sizeof_fdp_conf_descr(nruh, FDPVSS); + log_size = sizeof(NvmeFdpConfsHdr) + fdp_descr_size; + + if (off >= log_size) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + trans_len = MIN(log_size - off, buf_len); + + buf = g_malloc0(log_size); + log = (NvmeFdpConfsHdr *)buf; + hdr = (NvmeFdpDescrHdr *)(log + 1); + ruhd = (NvmeRuhDescr *)(buf + sizeof(*log) + sizeof(*hdr)); + + log->num_confs = cpu_to_le16(0); + log->size = cpu_to_le32(log_size); + + hdr->descr_size = cpu_to_le16(fdp_descr_size); + if (endgrp->fdp.enabled) { + hdr->fdpa = FIELD_DP8(hdr->fdpa, FDPA, VALID, 1); + hdr->fdpa = FIELD_DP8(hdr->fdpa, FDPA, RGIF, endgrp->fdp.rgif); + hdr->nrg = cpu_to_le16(endgrp->fdp.nrg); + hdr->nruh = cpu_to_le16(endgrp->fdp.nruh); + hdr->maxpids = cpu_to_le16(NVME_FDP_MAXPIDS - 1); + hdr->nnss = cpu_to_le32(NVME_MAX_NAMESPACES); + hdr->runs = cpu_to_le64(endgrp->fdp.runs); + + for (i = 0; i < nruh; i++) { + ruhd->ruht = NVME_RUHT_INITIALLY_ISOLATED; + ruhd++; + } + } else { + /* 1 bit for RUH in PIF -> 2 RUHs max. 
*/ + hdr->nrg = cpu_to_le16(1); + hdr->nruh = cpu_to_le16(1); + hdr->maxpids = cpu_to_le16(NVME_FDP_MAXPIDS - 1); + hdr->nnss = cpu_to_le32(1); + hdr->runs = cpu_to_le64(96 * MiB); + + ruhd->ruht = NVME_RUHT_INITIALLY_ISOLATED; + } + + return nvme_c2h(n, (uint8_t *)buf + off, trans_len, req); +} + +static uint16_t nvme_fdp_ruh_usage(NvmeCtrl *n, uint32_t endgrpid, + uint32_t dw10, uint32_t dw12, + uint32_t buf_len, uint64_t off, + NvmeRequest *req) +{ + NvmeRuHandle *ruh; + NvmeRuhuLog *hdr; + NvmeRuhuDescr *ruhud; + NvmeEnduranceGroup *endgrp; + g_autofree uint8_t *buf = NULL; + uint32_t log_size, trans_len; + uint16_t i; + + if (endgrpid != 1 || !n->subsys) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + endgrp = &n->subsys->endgrp; + + if (!endgrp->fdp.enabled) { + return NVME_FDP_DISABLED | NVME_DNR; + } + + log_size = sizeof(NvmeRuhuLog) + endgrp->fdp.nruh * sizeof(NvmeRuhuDescr); + + if (off >= log_size) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + trans_len = MIN(log_size - off, buf_len); + + buf = g_malloc0(log_size); + hdr = (NvmeRuhuLog *)buf; + ruhud = (NvmeRuhuDescr *)(hdr + 1); + + ruh = endgrp->fdp.ruhs; + hdr->nruh = cpu_to_le16(endgrp->fdp.nruh); + + for (i = 0; i < endgrp->fdp.nruh; i++, ruhud++, ruh++) { + ruhud->ruha = ruh->ruha; + } + + return nvme_c2h(n, (uint8_t *)buf + off, trans_len, req); +} + +static uint16_t nvme_fdp_stats(NvmeCtrl *n, uint32_t endgrpid, uint32_t buf_len, + uint64_t off, NvmeRequest *req) +{ + NvmeEnduranceGroup *endgrp; + NvmeFdpStatsLog log = {}; + uint32_t trans_len; + + if (off >= sizeof(NvmeFdpStatsLog)) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + if (endgrpid != 1 || !n->subsys) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + if (!n->subsys->endgrp.fdp.enabled) { + return NVME_FDP_DISABLED | NVME_DNR; + } + + endgrp = &n->subsys->endgrp; + + trans_len = MIN(sizeof(log) - off, buf_len); + + /* spec value is 128 bit, we only use 64 bit */ + log.hbmw[0] = cpu_to_le64(endgrp->fdp.hbmw); + log.mbmw[0] = 
cpu_to_le64(endgrp->fdp.mbmw);
+    log.mbe[0] = cpu_to_le64(endgrp->fdp.mbe);
+
+    return nvme_c2h(n, (uint8_t *)&log + off, trans_len, req);
+}
+
+/*
+ * Get Log Page handler for the FDP Events log of endurance group `endgrpid`.
+ *
+ * Bit 8 of CDW10 selects the host event buffer (set) or the controller
+ * event buffer (clear). The log is built as an NvmeFdpEventsLog header
+ * followed by the buffered NvmeFdpEvent entries; up to `buf_len` bytes of
+ * it, starting at byte offset `off`, are handed to nvme_c2h().
+ *
+ * Returns NVME_INVALID_FIELD | NVME_DNR when the endurance group is not 1,
+ * the controller has no subsystem, or `off` lies at or past the end of the
+ * log; NVME_FDP_DISABLED | NVME_DNR when FDP is not enabled; otherwise the
+ * status returned by nvme_c2h().
+ */
+static uint16_t nvme_fdp_events(NvmeCtrl *n, uint32_t endgrpid,
+                                uint32_t buf_len, uint64_t off,
+                                NvmeRequest *req)
+{
+    NvmeEnduranceGroup *endgrp;
+    NvmeCmd *cmd = &req->cmd;
+    /* command dwords are little-endian; convert before testing a bit */
+    bool host_events = (le32_to_cpu(cmd->cdw10) >> 8) & 0x1;
+    uint32_t log_size, trans_len;
+    NvmeFdpEventBuffer *ebuf;
+    g_autofree NvmeFdpEventsLog *elog = NULL;
+    NvmeFdpEvent *event;
+
+    if (endgrpid != 1 || !n->subsys) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    endgrp = &n->subsys->endgrp;
+
+    if (!endgrp->fdp.enabled) {
+        return NVME_FDP_DISABLED | NVME_DNR;
+    }
+
+    if (host_events) {
+        ebuf = &endgrp->fdp.host_events;
+    } else {
+        ebuf = &endgrp->fdp.ctrl_events;
+    }
+
+    log_size = sizeof(NvmeFdpEventsLog) + ebuf->nelems * sizeof(NvmeFdpEvent);
+
+    /*
+     * Without this check `log_size - off` wraps around for a large offset
+     * and the transfer below would start past the end of the allocation.
+     */
+    if (off >= log_size) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    trans_len = MIN(log_size - off, buf_len);
+    elog = g_malloc0(log_size);
+    elog->num_events = cpu_to_le32(ebuf->nelems);
+    event = (NvmeFdpEvent *)(elog + 1);
+
+    if (ebuf->nelems && ebuf->start == ebuf->next) {
+        unsigned int nelems = (NVME_FDP_MAX_EVENTS - ebuf->start);
+        /* wrap over, copy [start;NVME_FDP_MAX_EVENTS[ and [0; next[ */
+        memcpy(event, &ebuf->events[ebuf->start],
+               sizeof(NvmeFdpEvent) * nelems);
+        memcpy(event + nelems, ebuf->events,
+               sizeof(NvmeFdpEvent) * ebuf->next);
+    } else if (ebuf->start < ebuf->next) {
+        memcpy(event, &ebuf->events[ebuf->start],
+               sizeof(NvmeFdpEvent) * (ebuf->next - ebuf->start));
+    }
+
+    return nvme_c2h(n, (uint8_t *)elog + off, trans_len, req);
+}
+
 static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
 {
     NvmeCmd *cmd = &req->cmd;
@@ -4589,13 +5120,14 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
     uint8_t lsp = (dw10 >> 8) & 0xf;
     uint8_t rae = (dw10 >> 15) & 0x1;
     uint8_t csi = le32_to_cpu(cmd->cdw14) >> 24;
-    uint32_t numdl, numdu;
+    uint32_t numdl, numdu, lspi;
     uint64_t off, lpol, lpou;
     size_t len;
     uint16_t status;

     numdl = (dw10 >> 16);
numdu = (dw11 & 0xffff); + lspi = (dw11 >> 16); lpol = dw12; lpou = dw13; @@ -4624,6 +5156,16 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req) return nvme_changed_nslist(n, rae, len, off, req); case NVME_LOG_CMD_EFFECTS: return nvme_cmd_effects(n, csi, len, off, req); + case NVME_LOG_ENDGRP: + return nvme_endgrp_info(n, rae, len, off, req); + case NVME_LOG_FDP_CONFS: + return nvme_fdp_confs(n, lspi, len, off, req); + case NVME_LOG_FDP_RUH_USAGE: + return nvme_fdp_ruh_usage(n, lspi, dw10, dw12, len, off, req); + case NVME_LOG_FDP_STATS: + return nvme_fdp_stats(n, lspi, len, off, req); + case NVME_LOG_FDP_EVENTS: + return nvme_fdp_events(n, lspi, len, off, req); default: trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid); return NVME_INVALID_FIELD | NVME_DNR; @@ -5210,6 +5752,84 @@ static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req) return nvme_c2h(n, (uint8_t *)×tamp, sizeof(timestamp), req); } +static int nvme_get_feature_fdp(NvmeCtrl *n, uint32_t endgrpid, + uint32_t *result) +{ + *result = 0; + + if (!n->subsys || !n->subsys->endgrp.fdp.enabled) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + *result = FIELD_DP16(0, FEAT_FDP, FDPE, 1); + *result = FIELD_DP16(*result, FEAT_FDP, CONF_NDX, 0); + + return NVME_SUCCESS; +} + +static uint16_t nvme_get_feature_fdp_events(NvmeCtrl *n, NvmeNamespace *ns, + NvmeRequest *req, uint32_t *result) +{ + NvmeCmd *cmd = &req->cmd; + uint32_t cdw11 = le32_to_cpu(cmd->cdw11); + uint16_t ph = cdw11 & 0xffff; + uint8_t noet = (cdw11 >> 16) & 0xff; + uint16_t ruhid, ret; + uint32_t nentries = 0; + uint8_t s_events_ndx = 0; + size_t s_events_siz = sizeof(NvmeFdpEventDescr) * noet; + g_autofree NvmeFdpEventDescr *s_events = g_malloc0(s_events_siz); + NvmeRuHandle *ruh; + NvmeFdpEventDescr *s_event; + + if (!n->subsys || !n->subsys->endgrp.fdp.enabled) { + return NVME_FDP_DISABLED | NVME_DNR; + } + + if (!nvme_ph_valid(ns, ph)) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + ruhid = 
ns->fdp.phs[ph]; + ruh = &n->subsys->endgrp.fdp.ruhs[ruhid]; + + assert(ruh); + + if (unlikely(noet == 0)) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + for (uint8_t event_type = 0; event_type < FDP_EVT_MAX; event_type++) { + uint8_t shift = nvme_fdp_evf_shifts[event_type]; + if (!shift && event_type) { + /* + * only first entry (event_type == 0) has a shift value of 0 + * other entries are simply unpopulated. + */ + continue; + } + + nentries++; + + s_event = &s_events[s_events_ndx]; + s_event->evt = event_type; + s_event->evta = (ruh->event_filter >> shift) & 0x1; + + /* break if all `noet` entries are filled */ + if ((++s_events_ndx) == noet) { + break; + } + } + + ret = nvme_c2h(n, s_events, s_events_siz, req); + if (ret) { + return ret; + } + + *result = nentries; + return NVME_SUCCESS; +} + static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req) { NvmeCmd *cmd = &req->cmd; @@ -5222,6 +5842,7 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req) uint16_t iv; NvmeNamespace *ns; int i; + uint16_t endgrpid = 0, ret = NVME_SUCCESS; static const uint32_t nvme_feature_default[NVME_FID_MAX] = { [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT, @@ -5319,6 +5940,33 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req) case NVME_HOST_BEHAVIOR_SUPPORT: return nvme_c2h(n, (uint8_t *)&n->features.hbs, sizeof(n->features.hbs), req); + case NVME_FDP_MODE: + endgrpid = dw11 & 0xff; + + if (endgrpid != 0x1) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + ret = nvme_get_feature_fdp(n, endgrpid, &result); + if (ret) { + return ret; + } + goto out; + case NVME_FDP_EVENTS: + if (!nvme_nsid_valid(n, nsid)) { + return NVME_INVALID_NSID | NVME_DNR; + } + + ns = nvme_ns(n, nsid); + if (unlikely(!ns)) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + ret = nvme_get_feature_fdp_events(n, ns, req, &result); + if (ret) { + return ret; + } + goto out; default: break; } @@ -5352,6 +6000,20 @@ defaults: result |= NVME_INTVC_NOCOALESCING; } break; + case 
NVME_FDP_MODE:
+        endgrpid = dw11 & 0xff;
+
+        if (endgrpid != 0x1) {
+            return NVME_INVALID_FIELD | NVME_DNR;
+        }
+
+        ret = nvme_get_feature_fdp(n, endgrpid, &result);
+        if (ret) {
+            return ret;
+        }
+        goto out;
+
+        break;
     default:
         result = nvme_feature_default[fid];
         break;
@@ -5359,7 +6021,7 @@ defaults:

 out:
     req->cqe.result = cpu_to_le32(result);
-    return NVME_SUCCESS;
+    return ret;
 }

 static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
@@ -5377,6 +6039,51 @@ static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
     return NVME_SUCCESS;
 }

+/*
+ * Set Features handler for FDP Events.
+ *
+ * CDW11 carries the placement handle (bits 15:0) and the number of event
+ * types (bits 23:16); bit 0 of CDW12 says whether the listed event types
+ * are to be enabled or disabled. The event type bytes are fetched with
+ * nvme_h2c() and translated through nvme_fdp_evf_shifts[] into bits of the
+ * event_filter of the reclaim unit handle that the placement handle of
+ * `ns` refers to.
+ *
+ * Event types that have no entry in nvme_fdp_evf_shifts[] are ignored.
+ *
+ * Returns NVME_FDP_DISABLED | NVME_DNR when FDP is not enabled,
+ * NVME_INVALID_FIELD | NVME_DNR for an invalid placement handle, the
+ * status of nvme_h2c() if the transfer fails, NVME_SUCCESS otherwise.
+ */
+static uint16_t nvme_set_feature_fdp_events(NvmeCtrl *n, NvmeNamespace *ns,
+                                            NvmeRequest *req)
+{
+    NvmeCmd *cmd = &req->cmd;
+    uint32_t cdw11 = le32_to_cpu(cmd->cdw11);
+    uint16_t ph = cdw11 & 0xffff;
+    uint8_t noet = (cdw11 >> 16) & 0xff;
+    uint16_t ret, ruhid;
+    uint8_t enable = le32_to_cpu(cmd->cdw12) & 0x1;
+    /* as wide as NvmeRuHandle.event_filter; the shift table goes up to 33 */
+    uint64_t event_mask = 0;
+    unsigned int i;
+    g_autofree uint8_t *events = g_malloc0(noet);
+    NvmeRuHandle *ruh = NULL;
+
+    assert(ns);
+
+    if (!n->subsys || !n->subsys->endgrp.fdp.enabled) {
+        return NVME_FDP_DISABLED | NVME_DNR;
+    }
+
+    if (!nvme_ph_valid(ns, ph)) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    ruhid = ns->fdp.phs[ph];
+    ruh = &n->subsys->endgrp.fdp.ruhs[ruhid];
+
+    ret = nvme_h2c(n, events, noet, req);
+    if (ret) {
+        return ret;
+    }
+
+    for (i = 0; i < noet; i++) {
+        uint8_t event_type = events[i];
+        uint8_t shift;
+
+        /* the table has FDP_EVT_MAX entries; 0xff would read past its end */
+        if (event_type >= FDP_EVT_MAX) {
+            continue;
+        }
+
+        shift = nvme_fdp_evf_shifts[event_type];
+        if (!shift && event_type) {
+            /*
+             * only event type 0 legitimately has a shift of 0; any other
+             * entry with shift 0 is unpopulated and must not touch bit 0
+             */
+            continue;
+        }
+
+        /* 64-bit constant: shifting a plain int by 32 or 33 is undefined */
+        event_mask |= 1ULL << shift;
+    }
+
+    if (enable) {
+        ruh->event_filter |= event_mask;
+    } else {
+        ruh->event_filter = ruh->event_filter & ~event_mask;
+    }
+
+    return NVME_SUCCESS;
+}
+
 static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
 {
     NvmeNamespace *ns = NULL;
@@ -5536,6 +6243,11 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
             return NVME_CMD_SET_CMB_REJECTED | NVME_DNR;
         }
         break;
+    case NVME_FDP_MODE:
+        /* spec: abort with cmd seq err if there's one or more NS' in endgrp */
+        return NVME_CMD_SEQ_ERROR | NVME_DNR;
+    case NVME_FDP_EVENTS:
+
return nvme_set_feature_fdp_events(n, ns, req); default: return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR; } @@ -6104,6 +6816,61 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req) return NVME_SUCCESS; } +static uint16_t nvme_directive_send(NvmeCtrl *n, NvmeRequest *req) +{ + return NVME_INVALID_FIELD | NVME_DNR; +} + +static uint16_t nvme_directive_receive(NvmeCtrl *n, NvmeRequest *req) +{ + NvmeNamespace *ns; + uint32_t dw10 = le32_to_cpu(req->cmd.cdw10); + uint32_t dw11 = le32_to_cpu(req->cmd.cdw11); + uint32_t nsid = le32_to_cpu(req->cmd.nsid); + uint8_t doper, dtype; + uint32_t numd, trans_len; + NvmeDirectiveIdentify id = { + .supported = 1 << NVME_DIRECTIVE_IDENTIFY, + .enabled = 1 << NVME_DIRECTIVE_IDENTIFY, + }; + + numd = dw10 + 1; + doper = dw11 & 0xff; + dtype = (dw11 >> 8) & 0xff; + + trans_len = MIN(sizeof(NvmeDirectiveIdentify), numd << 2); + + if (nsid == NVME_NSID_BROADCAST || dtype != NVME_DIRECTIVE_IDENTIFY || + doper != NVME_DIRECTIVE_RETURN_PARAMS) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + ns = nvme_ns(n, nsid); + if (!ns) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + switch (dtype) { + case NVME_DIRECTIVE_IDENTIFY: + switch (doper) { + case NVME_DIRECTIVE_RETURN_PARAMS: + if (ns->endgrp->fdp.enabled) { + id.supported |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT; + id.enabled |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT; + id.persistent |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT; + } + + return nvme_c2h(n, (uint8_t *)&id, trans_len, req); + + default: + return NVME_INVALID_FIELD | NVME_DNR; + } + + default: + return NVME_INVALID_FIELD; + } +} + static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req) { trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode, @@ -6152,6 +6919,10 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req) return nvme_dbbuf_config(n, req); case NVME_ADM_CMD_FORMAT_NVM: return nvme_format(n, req); + case NVME_ADM_CMD_DIRECTIVE_SEND: + return nvme_directive_send(n, req); + case 
NVME_ADM_CMD_DIRECTIVE_RECV: + return nvme_directive_receive(n, req); default: assert(false); } @@ -7380,6 +8151,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) uint8_t *pci_conf = pci_dev->config; uint64_t cap = ldq_le_p(&n->bar.cap); NvmeSecCtrlEntry *sctrl = nvme_sctrl(n); + uint32_t ctratt; id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID)); id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID)); @@ -7390,7 +8162,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) id->cntlid = cpu_to_le16(n->cntlid); id->oaes = cpu_to_le32(NVME_OAES_NS_ATTR); - id->ctratt |= cpu_to_le32(NVME_CTRATT_ELBAS); + ctratt = NVME_CTRATT_ELBAS; id->rab = 6; @@ -7407,7 +8179,8 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) id->mdts = n->params.mdts; id->ver = cpu_to_le32(NVME_SPEC_VER); id->oacs = - cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT | NVME_OACS_DBBUF); + cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT | NVME_OACS_DBBUF | + NVME_OACS_DIRECTIVES); id->cntrltype = 0x1; /* @@ -7457,8 +8230,17 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) if (n->subsys) { id->cmic |= NVME_CMIC_MULTI_CTRL; + ctratt |= NVME_CTRATT_ENDGRPS; + + id->endgidmax = cpu_to_le16(0x1); + + if (n->subsys->endgrp.fdp.enabled) { + ctratt |= NVME_CTRATT_FDPS; + } } + id->ctratt = cpu_to_le32(ctratt); + NVME_CAP_SET_MQES(cap, 0x7ff); NVME_CAP_SET_CQR(cap, 1); NVME_CAP_SET_TO(cap, 0xf); diff --git a/hw/nvme/ns.c b/hw/nvme/ns.c index 62a1f97be0..cfac960dcf 100644 --- a/hw/nvme/ns.c +++ b/hw/nvme/ns.c @@ -14,8 +14,10 @@ #include "qemu/osdep.h" #include "qemu/units.h" +#include "qemu/cutils.h" #include "qemu/error-report.h" #include "qapi/error.h" +#include "qemu/bitops.h" #include "sysemu/sysemu.h" #include "sysemu/block-backend.h" @@ -377,6 +379,130 @@ static void nvme_zoned_ns_shutdown(NvmeNamespace *ns) assert(ns->nr_open_zones == 0); } +static NvmeRuHandle *nvme_find_ruh_by_attr(NvmeEnduranceGroup *endgrp, + uint8_t 
ruha, uint16_t *ruhid)
+{
+    for (uint16_t i = 0; i < endgrp->fdp.nruh; i++) {
+        NvmeRuHandle *ruh = &endgrp->fdp.ruhs[i];
+
+        if (ruh->ruha == ruha) {
+            *ruhid = i;
+            return ruh;
+        }
+    }
+
+    return NULL;
+}
+
+/*
+ * Set up the placement handles of a namespace in an FDP enabled endurance
+ * group.
+ *
+ * Without an "fdp.ruhs" parameter the namespace gets a single placement
+ * handle referring to the controller assigned reclaim unit handle; if no
+ * such handle exists yet, the first unused one is claimed for that role.
+ * Otherwise "fdp.ruhs" is parsed as a ';'-separated list of reclaim unit
+ * handle identifiers, one per placement handle, and every listed handle
+ * that is still unused is claimed as host assigned.
+ *
+ * A handle that is already in use is only accepted when its LBA format
+ * index matches that of the namespace; controller assigned handles cannot
+ * be listed explicitly.
+ *
+ * Returns true on success. On failure an error is stored in errp and false
+ * is returned.
+ */
+static bool nvme_ns_init_fdp(NvmeNamespace *ns, Error **errp)
+{
+    NvmeEnduranceGroup *endgrp = ns->endgrp;
+    NvmeRuHandle *ruh;
+    uint8_t lbafi = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
+    /* scratch list of parsed identifiers; released on every return path */
+    g_autofree unsigned int *ruhids = NULL;
+    unsigned int *ruhid;
+    char *r, *p, *token;
+    uint16_t *ph;
+
+    if (!ns->params.fdp.ruhs) {
+        ns->fdp.nphs = 1;
+        ph = ns->fdp.phs = g_new(uint16_t, 1);
+
+        ruh = nvme_find_ruh_by_attr(endgrp, NVME_RUHA_CTRL, ph);
+        if (!ruh) {
+            ruh = nvme_find_ruh_by_attr(endgrp, NVME_RUHA_UNUSED, ph);
+            if (!ruh) {
+                error_setg(errp, "no unused reclaim unit handles left");
+                return false;
+            }
+
+            ruh->ruha = NVME_RUHA_CTRL;
+            ruh->lbafi = lbafi;
+            ruh->ruamw = endgrp->fdp.runs >> ns->lbaf.ds;
+
+            for (uint16_t rg = 0; rg < endgrp->fdp.nrg; rg++) {
+                ruh->rus[rg].ruamw = ruh->ruamw;
+            }
+        } else if (ruh->lbafi != lbafi) {
+            error_setg(errp, "lba format index of controller assigned "
+                       "reclaim unit handle does not match namespace lba "
+                       "format index");
+            return false;
+        }
+
+        return true;
+    }
+
+    ruhid = ruhids = g_new0(unsigned int, endgrp->fdp.nruh);
+    r = p = strdup(ns->params.fdp.ruhs);
+
+    /* parse the placement handle identifiers */
+    while ((token = qemu_strsep(&p, ";")) != NULL) {
+        ns->fdp.nphs += 1;
+        /*
+         * ruhids has room for nruh entries, so a list of exactly nruh
+         * identifiers still fits; only one more than that is too many
+         */
+        if (ns->fdp.nphs > NVME_FDP_MAXPIDS ||
+            ns->fdp.nphs > endgrp->fdp.nruh) {
+            error_setg(errp, "too many placement handles");
+            free(r);
+            return false;
+        }
+
+        if (qemu_strtoui(token, NULL, 0, ruhid++) < 0) {
+            error_setg(errp, "cannot parse reclaim unit handle identifier");
+            free(r);
+            return false;
+        }
+    }
+
+    free(r);
+
+    ph = ns->fdp.phs = g_new(uint16_t, ns->fdp.nphs);
+
+    ruhid = ruhids;
+
+    /* verify the identifiers */
+    for (unsigned int i = 0; i < ns->fdp.nphs; i++, ruhid++, ph++) {
+        if (*ruhid >= endgrp->fdp.nruh) {
+            error_setg(errp, "invalid reclaim unit handle identifier");
+            return false;
+        }
+
+        ruh = &endgrp->fdp.ruhs[*ruhid];
+
+        switch (ruh->ruha) {
+        case NVME_RUHA_UNUSED:
+            ruh->ruha = NVME_RUHA_HOST;
+            ruh->lbafi = lbafi;
+            ruh->ruamw = endgrp->fdp.runs >> ns->lbaf.ds;
+
+            for (uint16_t rg = 0; rg < endgrp->fdp.nrg; rg++) {
+                ruh->rus[rg].ruamw = ruh->ruamw;
+            }
+
+            break;
+
+        case NVME_RUHA_HOST:
+            if (ruh->lbafi != lbafi) {
+                error_setg(errp, "lba format index of host assigned "
+                           "reclaim unit handle does not match namespace "
+                           "lba format index");
+                return false;
+            }
+
+            break;
+
+        case NVME_RUHA_CTRL:
+            error_setg(errp, "reclaim unit handle is controller assigned");
+            return false;
+
+        default:
+            abort();
+        }
+
+        *ph = *ruhid;
+    }
+
+    return true;
+}
+
 static int nvme_ns_check_constraints(NvmeNamespace *ns, Error **errp)
 {
     unsigned int pi_size;
@@ -417,6 +543,11 @@ static int nvme_ns_check_constraints(NvmeNamespace *ns, Error **errp)
         return -1;
     }

+    if (ns->params.zoned && ns->endgrp && ns->endgrp->fdp.enabled) {
+        error_setg(errp, "cannot be a zoned- in an FDP configuration");
+        return -1;
+    }
+
     if (ns->params.zoned) {
         if (ns->params.max_active_zones) {
             if (ns->params.max_open_zones > ns->params.max_active_zones) {
@@ -502,6 +633,12 @@ int nvme_ns_setup(NvmeNamespace *ns, Error **errp)
         nvme_ns_init_zoned(ns);
     }

+    if (ns->endgrp && ns->endgrp->fdp.enabled) {
+        if (!nvme_ns_init_fdp(ns, errp)) {
+            return -1;
+        }
+    }
+
     return 0;
 }

@@ -525,6 +662,10 @@ void nvme_ns_cleanup(NvmeNamespace *ns)
         g_free(ns->zone_array);
         g_free(ns->zd_extensions);
     }
+
+    if (ns->endgrp && ns->endgrp->fdp.enabled) {
+        g_free(ns->fdp.phs);
+    }
 }

 static void nvme_ns_unrealize(DeviceState *dev)
@@ -561,6 +702,8 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp)
         if (!qdev_set_parent_bus(dev, &subsys->bus.parent_bus, errp)) {
             return;
         }
+        ns->subsys = subsys;
+        ns->endgrp = &subsys->endgrp;
     }

     if (nvme_ns_setup(ns, errp)) {
@@ -591,6 +734,8 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp)
     if (subsys) {
subsys->namespaces[nsid] = ns; + ns->id_ns.endgid = cpu_to_le16(0x1); + if (ns->params.detached) { return; } @@ -606,6 +751,7 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp) return; } + } nvme_attach_ns(n, ns); @@ -644,6 +790,7 @@ static Property nvme_ns_props[] = { DEFINE_PROP_SIZE("zoned.zrwafg", NvmeNamespace, params.zrwafg, -1), DEFINE_PROP_BOOL("eui64-default", NvmeNamespace, params.eui64_default, false), + DEFINE_PROP_STRING("fdp.ruhs", NvmeNamespace, params.fdp.ruhs), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h index 16da27a69b..209e8f5b4c 100644 --- a/hw/nvme/nvme.h +++ b/hw/nvme/nvme.h @@ -27,6 +27,8 @@ #define NVME_MAX_CONTROLLERS 256 #define NVME_MAX_NAMESPACES 256 #define NVME_EUI64_DEFAULT ((uint64_t)0x5254000000000000) +#define NVME_FDP_MAX_EVENTS 63 +#define NVME_FDP_MAXPIDS 128 QEMU_BUILD_BUG_ON(NVME_MAX_NAMESPACES > NVME_NSID_BROADCAST - 1); @@ -45,17 +47,68 @@ typedef struct NvmeBus { OBJECT_CHECK(NvmeSubsystem, (obj), TYPE_NVME_SUBSYS) #define SUBSYS_SLOT_RSVD (void *)0xFFFF +typedef struct NvmeReclaimUnit { + uint64_t ruamw; +} NvmeReclaimUnit; + +typedef struct NvmeRuHandle { + uint8_t ruht; + uint8_t ruha; + uint64_t event_filter; + uint8_t lbafi; + uint64_t ruamw; + + /* reclaim units indexed by reclaim group */ + NvmeReclaimUnit *rus; +} NvmeRuHandle; + +typedef struct NvmeFdpEventBuffer { + NvmeFdpEvent events[NVME_FDP_MAX_EVENTS]; + unsigned int nelems; + unsigned int start; + unsigned int next; +} NvmeFdpEventBuffer; + +typedef struct NvmeEnduranceGroup { + uint8_t event_conf; + + struct { + NvmeFdpEventBuffer host_events, ctrl_events; + + uint16_t nruh; + uint16_t nrg; + uint8_t rgif; + uint64_t runs; + + uint64_t hbmw; + uint64_t mbmw; + uint64_t mbe; + + bool enabled; + + NvmeRuHandle *ruhs; + } fdp; +} NvmeEnduranceGroup; + typedef struct NvmeSubsystem { DeviceState parent_obj; NvmeBus bus; uint8_t subnqn[256]; char *serial; - NvmeCtrl *ctrls[NVME_MAX_CONTROLLERS]; - NvmeNamespace 
*namespaces[NVME_MAX_NAMESPACES + 1]; + NvmeCtrl *ctrls[NVME_MAX_CONTROLLERS]; + NvmeNamespace *namespaces[NVME_MAX_NAMESPACES + 1]; + NvmeEnduranceGroup endgrp; struct { char *nqn; + + struct { + bool enabled; + uint64_t runs; + uint16_t nruh; + uint32_t nrg; + } fdp; } params; } NvmeSubsystem; @@ -96,6 +149,21 @@ typedef struct NvmeZone { QTAILQ_ENTRY(NvmeZone) entry; } NvmeZone; +#define FDP_EVT_MAX 0xff +#define NVME_FDP_MAX_NS_RUHS 32u +#define FDPVSS 0 + +static const uint8_t nvme_fdp_evf_shifts[FDP_EVT_MAX] = { + /* Host events */ + [FDP_EVT_RU_NOT_FULLY_WRITTEN] = 0, + [FDP_EVT_RU_ATL_EXCEEDED] = 1, + [FDP_EVT_CTRL_RESET_RUH] = 2, + [FDP_EVT_INVALID_PID] = 3, + /* CTRL events */ + [FDP_EVT_MEDIA_REALLOC] = 32, + [FDP_EVT_RUH_IMPLICIT_RU_CHANGE] = 33, +}; + typedef struct NvmeNamespaceParams { bool detached; bool shared; @@ -125,6 +193,10 @@ typedef struct NvmeNamespaceParams { uint32_t numzrwa; uint64_t zrwas; uint64_t zrwafg; + + struct { + char *ruhs; + } fdp; } NvmeNamespaceParams; typedef struct NvmeNamespace { @@ -167,10 +239,18 @@ typedef struct NvmeNamespace { int32_t nr_active_zones; NvmeNamespaceParams params; + NvmeSubsystem *subsys; + NvmeEnduranceGroup *endgrp; struct { uint32_t err_rec; } features; + + struct { + uint16_t nphs; + /* reclaim unit handle identifiers indexed by placement handle */ + uint16_t *phs; + } fdp; } NvmeNamespace; static inline uint32_t nvme_nsid(NvmeNamespace *ns) @@ -274,6 +354,12 @@ static inline void nvme_aor_dec_active(NvmeNamespace *ns) assert(ns->nr_active_zones >= 0); } +static inline void nvme_fdp_stat_inc(uint64_t *a, uint64_t b) +{ + uint64_t ret = *a + b; + *a = ret < *a ? 
UINT64_MAX : ret; +} + void nvme_ns_init_format(NvmeNamespace *ns); int nvme_ns_setup(NvmeNamespace *ns, Error **errp); void nvme_ns_drain(NvmeNamespace *ns); @@ -340,7 +426,9 @@ static inline const char *nvme_adm_opc_str(uint8_t opc) case NVME_ADM_CMD_GET_FEATURES: return "NVME_ADM_CMD_GET_FEATURES"; case NVME_ADM_CMD_ASYNC_EV_REQ: return "NVME_ADM_CMD_ASYNC_EV_REQ"; case NVME_ADM_CMD_NS_ATTACHMENT: return "NVME_ADM_CMD_NS_ATTACHMENT"; + case NVME_ADM_CMD_DIRECTIVE_SEND: return "NVME_ADM_CMD_DIRECTIVE_SEND"; case NVME_ADM_CMD_VIRT_MNGMT: return "NVME_ADM_CMD_VIRT_MNGMT"; + case NVME_ADM_CMD_DIRECTIVE_RECV: return "NVME_ADM_CMD_DIRECTIVE_RECV"; case NVME_ADM_CMD_DBBUF_CONFIG: return "NVME_ADM_CMD_DBBUF_CONFIG"; case NVME_ADM_CMD_FORMAT_NVM: return "NVME_ADM_CMD_FORMAT_NVM"; default: return "NVME_ADM_CMD_UNKNOWN"; diff --git a/hw/nvme/subsys.c b/hw/nvme/subsys.c index 9d2643678b..24ddec860e 100644 --- a/hw/nvme/subsys.c +++ b/hw/nvme/subsys.c @@ -7,10 +7,13 @@ */ #include "qemu/osdep.h" +#include "qemu/units.h" #include "qapi/error.h" #include "nvme.h" +#define NVME_DEFAULT_RU_SIZE (96 * MiB) + static int nvme_subsys_reserve_cntlids(NvmeCtrl *n, int start, int num) { NvmeSubsystem *subsys = n->subsys; @@ -109,13 +112,95 @@ void nvme_subsys_unregister_ctrl(NvmeSubsystem *subsys, NvmeCtrl *n) n->cntlid = -1; } -static void nvme_subsys_setup(NvmeSubsystem *subsys) +static bool nvme_calc_rgif(uint16_t nruh, uint16_t nrg, uint8_t *rgif) +{ + uint16_t val; + unsigned int i; + + if (unlikely(nrg == 1)) { + /* PIDRG_NORGI scenario, all of pid is used for PHID */ + *rgif = 0; + return true; + } + + val = nrg; + i = 0; + while (val) { + val >>= 1; + i++; + } + *rgif = i; + + /* ensure remaining bits suffice to represent number of phids in a RG */ + if (unlikely((UINT16_MAX >> i) < nruh)) { + *rgif = 0; + return false; + } + + return true; +} + +static bool nvme_subsys_setup_fdp(NvmeSubsystem *subsys, Error **errp) +{ + NvmeEnduranceGroup *endgrp = &subsys->endgrp; + + if 
(!subsys->params.fdp.runs) { + error_setg(errp, "fdp.runs must be non-zero"); + return false; + } + + endgrp->fdp.runs = subsys->params.fdp.runs; + + if (!subsys->params.fdp.nrg) { + error_setg(errp, "fdp.nrg must be non-zero"); + return false; + } + + endgrp->fdp.nrg = subsys->params.fdp.nrg; + + if (!subsys->params.fdp.nruh) { + error_setg(errp, "fdp.nruh must be non-zero"); + return false; + } + + endgrp->fdp.nruh = subsys->params.fdp.nruh; + + if (!nvme_calc_rgif(endgrp->fdp.nruh, endgrp->fdp.nrg, &endgrp->fdp.rgif)) { + error_setg(errp, + "cannot derive a valid rgif (nruh %"PRIu16" nrg %"PRIu32")", + endgrp->fdp.nruh, endgrp->fdp.nrg); + return false; + } + + endgrp->fdp.ruhs = g_new(NvmeRuHandle, endgrp->fdp.nruh); + + for (uint16_t ruhid = 0; ruhid < endgrp->fdp.nruh; ruhid++) { + endgrp->fdp.ruhs[ruhid] = (NvmeRuHandle) { + .ruht = NVME_RUHT_INITIALLY_ISOLATED, + .ruha = NVME_RUHA_UNUSED, + }; + + endgrp->fdp.ruhs[ruhid].rus = g_new(NvmeReclaimUnit, endgrp->fdp.nrg); + } + + endgrp->fdp.enabled = true; + + return true; +} + +static bool nvme_subsys_setup(NvmeSubsystem *subsys, Error **errp) { const char *nqn = subsys->params.nqn ? 
subsys->params.nqn : subsys->parent_obj.id; snprintf((char *)subsys->subnqn, sizeof(subsys->subnqn), "nqn.2019-08.org.qemu:%s", nqn); + + if (subsys->params.fdp.enabled && !nvme_subsys_setup_fdp(subsys, errp)) { + return false; + } + + return true; } static void nvme_subsys_realize(DeviceState *dev, Error **errp) @@ -124,11 +209,16 @@ static void nvme_subsys_realize(DeviceState *dev, Error **errp) qbus_init(&subsys->bus, sizeof(NvmeBus), TYPE_NVME_BUS, dev, dev->id); - nvme_subsys_setup(subsys); + nvme_subsys_setup(subsys, errp); } static Property nvme_subsystem_props[] = { DEFINE_PROP_STRING("nqn", NvmeSubsystem, params.nqn), + DEFINE_PROP_BOOL("fdp", NvmeSubsystem, params.fdp.enabled, false), + DEFINE_PROP_SIZE("fdp.runs", NvmeSubsystem, params.fdp.runs, + NVME_DEFAULT_RU_SIZE), + DEFINE_PROP_UINT32("fdp.nrg", NvmeSubsystem, params.fdp.nrg, 1), + DEFINE_PROP_UINT16("fdp.nruh", NvmeSubsystem, params.fdp.nruh, 0), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/nvme/trace-events b/hw/nvme/trace-events index b16f2260b4..7f7837e1a2 100644 --- a/hw/nvme/trace-events +++ b/hw/nvme/trace-events @@ -117,6 +117,7 @@ pci_nvme_clear_ns_reset(uint32_t state, uint64_t slba) "zone state=%"PRIu32", sl pci_nvme_zoned_zrwa_implicit_flush(uint64_t zslba, uint32_t nlb) "zslba 0x%"PRIx64" nlb %"PRIu32"" pci_nvme_pci_reset(void) "PCI Function Level Reset" pci_nvme_virt_mngmt(uint16_t cid, uint16_t act, uint16_t cntlid, const char* rt, uint16_t nr) "cid %"PRIu16", act=0x%"PRIx16", ctrlid=%"PRIu16" %s nr=%"PRIu16"" +pci_nvme_fdp_ruh_change(uint16_t rgid, uint16_t ruhid) "change RU on RUH rgid=%"PRIu16", ruhid=%"PRIu16"" # error conditions pci_nvme_err_mdts(size_t len) "len %zu" diff --git a/hw/pci-bridge/cxl_root_port.c b/hw/pci-bridge/cxl_root_port.c index 6664783974..7dfd20aa67 100644 --- a/hw/pci-bridge/cxl_root_port.c +++ b/hw/pci-bridge/cxl_root_port.c @@ -22,6 +22,7 @@ #include "qemu/range.h" #include "hw/pci/pci_bridge.h" #include "hw/pci/pcie_port.h" +#include "hw/pci/msi.h" 
#include "hw/qdev-properties.h" #include "hw/sysbus.h" #include "qapi/error.h" @@ -29,6 +30,10 @@ #define CXL_ROOT_PORT_DID 0x7075 +#define CXL_RP_MSI_OFFSET 0x60 +#define CXL_RP_MSI_SUPPORTED_FLAGS PCI_MSI_FLAGS_MASKBIT +#define CXL_RP_MSI_NR_VECTOR 2 + /* Copied from the gen root port which we derive */ #define GEN_PCIE_ROOT_PORT_AER_OFFSET 0x100 #define GEN_PCIE_ROOT_PORT_ACS_OFFSET \ @@ -47,6 +52,49 @@ typedef struct CXLRootPort { #define TYPE_CXL_ROOT_PORT "cxl-rp" DECLARE_INSTANCE_CHECKER(CXLRootPort, CXL_ROOT_PORT, TYPE_CXL_ROOT_PORT) +/* + * If two MSI vector are allocated, Advanced Error Interrupt Message Number + * is 1. otherwise 0. + * 17.12.5.10 RPERRSTS, 32:27 bit Advanced Error Interrupt Message Number. + */ +static uint8_t cxl_rp_aer_vector(const PCIDevice *d) +{ + switch (msi_nr_vectors_allocated(d)) { + case 1: + return 0; + case 2: + return 1; + case 4: + case 8: + case 16: + case 32: + default: + break; + } + abort(); + return 0; +} + +static int cxl_rp_interrupts_init(PCIDevice *d, Error **errp) +{ + int rc; + + rc = msi_init(d, CXL_RP_MSI_OFFSET, CXL_RP_MSI_NR_VECTOR, + CXL_RP_MSI_SUPPORTED_FLAGS & PCI_MSI_FLAGS_64BIT, + CXL_RP_MSI_SUPPORTED_FLAGS & PCI_MSI_FLAGS_MASKBIT, + errp); + if (rc < 0) { + assert(rc == -ENOTSUP); + } + + return rc; +} + +static void cxl_rp_interrupts_uninit(PCIDevice *d) +{ + msi_uninit(d); +} + static void latch_registers(CXLRootPort *crp) { uint32_t *reg_state = crp->cxl_cstate.crb.cache_mem_registers; @@ -183,16 +231,29 @@ static void cxl_rp_dvsec_write_config(PCIDevice *dev, uint32_t addr, } } +static void cxl_rp_aer_vector_update(PCIDevice *d) +{ + PCIERootPortClass *rpc = PCIE_ROOT_PORT_GET_CLASS(d); + + if (rpc->aer_vector) { + pcie_aer_root_set_vector(d, rpc->aer_vector(d)); + } +} + static void cxl_rp_write_config(PCIDevice *d, uint32_t address, uint32_t val, int len) { uint16_t slt_ctl, slt_sta; + uint32_t root_cmd = + pci_get_long(d->config + d->exp.aer_cap + PCI_ERR_ROOT_COMMAND); pcie_cap_slot_get(d, 
&slt_ctl, &slt_sta); pci_bridge_write_config(d, address, val, len); + cxl_rp_aer_vector_update(d); pcie_cap_flr_write_config(d, address, val, len); pcie_cap_slot_write_config(d, slt_ctl, slt_sta, address, val, len); pcie_aer_write_config(d, address, val, len); + pcie_aer_root_write_config(d, address, val, len, root_cmd); cxl_rp_dvsec_write_config(d, address, val, len); } @@ -217,6 +278,9 @@ static void cxl_root_port_class_init(ObjectClass *oc, void *data) rpc->aer_offset = GEN_PCIE_ROOT_PORT_AER_OFFSET; rpc->acs_offset = GEN_PCIE_ROOT_PORT_ACS_OFFSET; + rpc->aer_vector = cxl_rp_aer_vector; + rpc->interrupts_init = cxl_rp_interrupts_init; + rpc->interrupts_uninit = cxl_rp_interrupts_uninit; dc->hotpluggable = false; } diff --git a/hw/pci-bridge/pci_expander_bridge.c b/hw/pci-bridge/pci_expander_bridge.c index e752a21292..ead33f0c05 100644 --- a/hw/pci-bridge/pci_expander_bridge.c +++ b/hw/pci-bridge/pci_expander_bridge.c @@ -15,6 +15,7 @@ #include "hw/pci/pci.h" #include "hw/pci/pci_bus.h" #include "hw/pci/pci_host.h" +#include "hw/pci/pcie_port.h" #include "hw/qdev-properties.h" #include "hw/pci/pci_bridge.h" #include "hw/pci-bridge/pci_expander_bridge.h" @@ -79,6 +80,13 @@ CXLComponentState *cxl_get_hb_cstate(PCIHostState *hb) return &host->cxl_cstate; } +bool cxl_get_hb_passthrough(PCIHostState *hb) +{ + CXLHost *host = PXB_CXL_HOST(hb); + + return host->passthrough; +} + static int pxb_bus_num(PCIBus *bus) { PXBDev *pxb = convert_to_pxb(bus->parent_dev); @@ -289,15 +297,32 @@ static int pxb_map_irq_fn(PCIDevice *pci_dev, int pin) return pin - PCI_SLOT(pxb->devfn); } -static void pxb_dev_reset(DeviceState *dev) +static void pxb_cxl_dev_reset(DeviceState *dev) { CXLHost *cxl = PXB_CXL_DEV(dev)->cxl.cxl_host_bridge; CXLComponentState *cxl_cstate = &cxl->cxl_cstate; + PCIHostState *hb = PCI_HOST_BRIDGE(cxl); uint32_t *reg_state = cxl_cstate->crb.cache_mem_registers; uint32_t *write_msk = cxl_cstate->crb.cache_mem_regs_write_mask; + int dsp_count = 0; 
cxl_component_register_init_common(reg_state, write_msk, CXL2_ROOT_PORT); - ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY, TARGET_COUNT, 8); + /* + * The CXL specification allows for host bridges with no HDM decoders + * if they only have a single root port. + */ + if (!PXB_DEV(dev)->hdm_for_passthrough) { + dsp_count = pcie_count_ds_ports(hb->bus); + } + /* Initial reset will have 0 dsp so wait until > 0 */ + if (dsp_count == 1) { + cxl->passthrough = true; + /* Set Capability ID in header to NONE */ + ARRAY_FIELD_DP32(reg_state, CXL_HDM_CAPABILITY_HEADER, ID, 0); + } else { + ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY, TARGET_COUNT, + 8); + } } static gint pxb_compare(gconstpointer a, gconstpointer b) @@ -481,9 +506,18 @@ static void pxb_cxl_dev_realize(PCIDevice *dev, Error **errp) } pxb_dev_realize_common(dev, CXL, errp); - pxb_dev_reset(DEVICE(dev)); + pxb_cxl_dev_reset(DEVICE(dev)); } +static Property pxb_cxl_dev_properties[] = { + /* Note: 0 is not a legal PXB bus number. */ + DEFINE_PROP_UINT8("bus_nr", PXBDev, bus_nr, 0), + DEFINE_PROP_UINT16("numa_node", PXBDev, numa_node, NUMA_NODE_UNASSIGNED), + DEFINE_PROP_BOOL("bypass_iommu", PXBDev, bypass_iommu, false), + DEFINE_PROP_BOOL("hdm_for_passthrough", PXBDev, hdm_for_passthrough, false), + DEFINE_PROP_END_OF_LIST(), +}; + static void pxb_cxl_dev_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); @@ -497,12 +531,12 @@ static void pxb_cxl_dev_class_init(ObjectClass *klass, void *data) */ dc->desc = "CXL Host Bridge"; - device_class_set_props(dc, pxb_dev_properties); + device_class_set_props(dc, pxb_cxl_dev_properties); set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories); /* Host bridges aren't hotpluggable. 
FIXME: spec reference */ dc->hotpluggable = false; - dc->reset = pxb_dev_reset; + dc->reset = pxb_cxl_dev_reset; } static const TypeInfo pxb_cxl_dev_info = { diff --git a/hw/pci-host/mv64361.c b/hw/pci-host/mv64361.c index 298564f1f5..19e8031a3f 100644 --- a/hw/pci-host/mv64361.c +++ b/hw/pci-host/mv64361.c @@ -873,10 +873,6 @@ static void mv64361_realize(DeviceState *dev, Error **errp) } sysbus_init_irq(SYS_BUS_DEVICE(dev), &s->cpu_irq); qdev_init_gpio_in_named(dev, mv64361_gpp_irq, "gpp", 32); - /* FIXME: PCI IRQ connections may be board specific */ - for (i = 0; i < PCI_NUM_PINS; i++) { - s->pci[1].irq[i] = qdev_get_gpio_in_named(dev, "gpp", 12 + i); - } } static void mv64361_reset(DeviceState *dev) diff --git a/hw/pci/pci-internal.h b/hw/pci/pci-internal.h index 2ea356bdf5..a7d6d8a732 100644 --- a/hw/pci/pci-internal.h +++ b/hw/pci/pci-internal.h @@ -20,6 +20,5 @@ void pcibus_dev_print(Monitor *mon, DeviceState *dev, int indent); int pcie_aer_parse_error_string(const char *error_name, uint32_t *status, bool *correctable); -int pcie_aer_inject_error(PCIDevice *dev, const PCIEAERErr *err); #endif diff --git a/hw/pci/pci.c b/hw/pci/pci.c index 034fe49e9a..def5000e7b 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -95,6 +95,21 @@ static const VMStateDescription vmstate_pcibus = { } }; +static gint g_cmp_uint32(gconstpointer a, gconstpointer b, gpointer user_data) +{ + return a - b; +} + +static GSequence *pci_acpi_index_list(void) +{ + static GSequence *used_acpi_index_list; + + if (!used_acpi_index_list) { + used_acpi_index_list = g_sequence_new(NULL); + } + return used_acpi_index_list; +} + static void pci_init_bus_master(PCIDevice *pci_dev) { AddressSpace *dma_as = pci_device_iommu_address_space(pci_dev); @@ -1246,6 +1261,17 @@ static void pci_qdev_unrealize(DeviceState *dev) do_pci_unregister_device(pci_dev); pci_dev->msi_trigger = NULL; + + /* + * clean up acpi-index so it could reused by another device + */ + if (pci_dev->acpi_index) { + GSequence 
*used_indexes = pci_acpi_index_list(); + + g_sequence_remove(g_sequence_lookup(used_indexes, + GINT_TO_POINTER(pci_dev->acpi_index), + g_cmp_uint32, NULL)); + } } void pci_register_bar(PCIDevice *pci_dev, int region_num, @@ -2005,6 +2031,8 @@ PCIDevice *pci_find_device(PCIBus *bus, int bus_num, uint8_t devfn) return bus->devices[devfn]; } +#define ONBOARD_INDEX_MAX (16 * 1024 - 1) + static void pci_qdev_realize(DeviceState *qdev, Error **errp) { PCIDevice *pci_dev = (PCIDevice *)qdev; @@ -2014,6 +2042,35 @@ static void pci_qdev_realize(DeviceState *qdev, Error **errp) bool is_default_rom; uint16_t class_id; + /* + * capped by systemd (see: udev-builtin-net_id.c) + * as it's the only known user honor it to avoid users + * misconfigure QEMU and then wonder why acpi-index doesn't work + */ + if (pci_dev->acpi_index > ONBOARD_INDEX_MAX) { + error_setg(errp, "acpi-index should be less or equal to %u", + ONBOARD_INDEX_MAX); + return; + } + + /* + * make sure that acpi-index is unique across all present PCI devices + */ + if (pci_dev->acpi_index) { + GSequence *used_indexes = pci_acpi_index_list(); + + if (g_sequence_lookup(used_indexes, + GINT_TO_POINTER(pci_dev->acpi_index), + g_cmp_uint32, NULL)) { + error_setg(errp, "a PCI device with acpi-index = %" PRIu32 + " already exist", pci_dev->acpi_index); + return; + } + g_sequence_insert_sorted(used_indexes, + GINT_TO_POINTER(pci_dev->acpi_index), + g_cmp_uint32, NULL); + } + if (pci_dev->romsize != -1 && !is_power_of_2(pci_dev->romsize)) { error_setg(errp, "ROM size %u is not a power of two", pci_dev->romsize); return; diff --git a/hw/pci/pcie_aer.c b/hw/pci/pcie_aer.c index 9a19be44ae..103667c368 100644 --- a/hw/pci/pcie_aer.c +++ b/hw/pci/pcie_aer.c @@ -112,6 +112,10 @@ int pcie_aer_init(PCIDevice *dev, uint8_t cap_ver, uint16_t offset, pci_set_long(dev->w1cmask + offset + PCI_ERR_UNCOR_STATUS, PCI_ERR_UNC_SUPPORTED); + pci_set_long(dev->config + offset + PCI_ERR_UNCOR_MASK, + PCI_ERR_UNC_MASK_DEFAULT); + 
pci_set_long(dev->wmask + offset + PCI_ERR_UNCOR_MASK, + PCI_ERR_UNC_SUPPORTED); pci_set_long(dev->config + offset + PCI_ERR_UNCOR_SEVER, PCI_ERR_UNC_SEVERITY_DEFAULT); @@ -188,8 +192,16 @@ static void pcie_aer_update_uncor_status(PCIDevice *dev) static bool pcie_aer_msg_alldev(PCIDevice *dev, const PCIEAERMsg *msg) { + uint16_t devctl = pci_get_word(dev->config + dev->exp.exp_cap + + PCI_EXP_DEVCTL); if (!(pcie_aer_msg_is_uncor(msg) && - (pci_get_word(dev->config + PCI_COMMAND) & PCI_COMMAND_SERR))) { + (pci_get_word(dev->config + PCI_COMMAND) & PCI_COMMAND_SERR)) && + !((msg->severity == PCI_ERR_ROOT_CMD_NONFATAL_EN) && + (devctl & PCI_EXP_DEVCTL_NFERE)) && + !((msg->severity == PCI_ERR_ROOT_CMD_COR_EN) && + (devctl & PCI_EXP_DEVCTL_CERE)) && + !((msg->severity == PCI_ERR_ROOT_CMD_FATAL_EN) && + (devctl & PCI_EXP_DEVCTL_FERE))) { return false; } diff --git a/hw/pci/pcie_port.c b/hw/pci/pcie_port.c index 65a397ad23..20ff2b39e8 100644 --- a/hw/pci/pcie_port.c +++ b/hw/pci/pcie_port.c @@ -161,6 +161,51 @@ PCIDevice *pcie_find_port_by_pn(PCIBus *bus, uint8_t pn) return NULL; } +/* Find first port in devfn number order */ +PCIDevice *pcie_find_port_first(PCIBus *bus) +{ + int devfn; + + for (devfn = 0; devfn < ARRAY_SIZE(bus->devices); devfn++) { + PCIDevice *d = bus->devices[devfn]; + + if (!d || !pci_is_express(d) || !d->exp.exp_cap) { + continue; + } + + if (object_dynamic_cast(OBJECT(d), TYPE_PCIE_PORT)) { + return d; + } + } + + return NULL; +} + +int pcie_count_ds_ports(PCIBus *bus) +{ + int dsp_count = 0; + int devfn; + + for (devfn = 0; devfn < ARRAY_SIZE(bus->devices); devfn++) { + PCIDevice *d = bus->devices[devfn]; + + if (!d || !pci_is_express(d) || !d->exp.exp_cap) { + continue; + } + if (object_dynamic_cast(OBJECT(d), TYPE_PCIE_PORT)) { + dsp_count++; + } + } + return dsp_count; +} + +static bool pcie_slot_is_hotpluggbale_bus(HotplugHandler *plug_handler, + BusState *bus) +{ + PCIESlot *s = PCIE_SLOT(bus->parent); + return s->hotplug; +} + static const 
TypeInfo pcie_port_type_info = { .name = TYPE_PCIE_PORT, .parent = TYPE_PCI_BRIDGE, @@ -188,6 +233,7 @@ static void pcie_slot_class_init(ObjectClass *oc, void *data) hc->plug = pcie_cap_slot_plug_cb; hc->unplug = pcie_cap_slot_unplug_cb; hc->unplug_request = pcie_cap_slot_unplug_request_cb; + hc->is_hotpluggable_bus = pcie_slot_is_hotpluggbale_bus; } static const TypeInfo pcie_slot_type_info = { diff --git a/hw/pci/pcie_sriov.c b/hw/pci/pcie_sriov.c index f0bd72e069..aa5a757b11 100644 --- a/hw/pci/pcie_sriov.c +++ b/hw/pci/pcie_sriov.c @@ -300,3 +300,8 @@ PCIDevice *pcie_sriov_get_vf_at_index(PCIDevice *dev, int n) } return NULL; } + +uint16_t pcie_sriov_num_vfs(PCIDevice *dev) +{ + return dev->exp.sriov_pf.num_vfs; +} diff --git a/hw/ppc/pegasos2.c b/hw/ppc/pegasos2.c index 7cc375df05..f1650be5ee 100644 --- a/hw/ppc/pegasos2.c +++ b/hw/ppc/pegasos2.c @@ -73,6 +73,8 @@ struct Pegasos2MachineState { MachineState parent_obj; PowerPCCPU *cpu; DeviceState *mv; + qemu_irq mv_pirq[PCI_NUM_PINS]; + qemu_irq via_pirq[PCI_NUM_PINS]; Vof *vof; void *fdt_blob; uint64_t kernel_addr; @@ -95,6 +97,15 @@ static void pegasos2_cpu_reset(void *opaque) } } +static void pegasos2_pci_irq(void *opaque, int n, int level) +{ + Pegasos2MachineState *pm = opaque; + + /* PCI interrupt lines are connected to both MV64361 and VT8231 */ + qemu_set_irq(pm->mv_pirq[n], level); + qemu_set_irq(pm->via_pirq[n], level); +} + static void pegasos2_init(MachineState *machine) { Pegasos2MachineState *pm = PEGASOS2_MACHINE(machine); @@ -106,7 +117,7 @@ static void pegasos2_init(MachineState *machine) I2CBus *i2c_bus; const char *fwname = machine->firmware ?: PROM_FILENAME; char *filename; - int sz; + int i, sz; uint8_t *spd_data; /* init CPU */ @@ -156,11 +167,18 @@ static void pegasos2_init(MachineState *machine) /* Marvell Discovery II system controller */ pm->mv = DEVICE(sysbus_create_simple(TYPE_MV64361, -1, qdev_get_gpio_in(DEVICE(pm->cpu), PPC6xx_INPUT_INT))); + for (i = 0; i < PCI_NUM_PINS; i++) { 
+ pm->mv_pirq[i] = qdev_get_gpio_in_named(pm->mv, "gpp", 12 + i); + } pci_bus = mv64361_get_pci_bus(pm->mv, 1); + pci_bus_irqs(pci_bus, pegasos2_pci_irq, pm, PCI_NUM_PINS); /* VIA VT8231 South Bridge (multifunction PCI device) */ via = OBJECT(pci_create_simple_multifunction(pci_bus, PCI_DEVFN(12, 0), true, TYPE_VT8231_ISA)); + for (i = 0; i < PCI_NUM_PINS; i++) { + pm->via_pirq[i] = qdev_get_gpio_in_named(DEVICE(via), "pirq", i); + } object_property_add_alias(OBJECT(machine), "rtc-time", object_resolve_path_component(via, "rtc"), "date"); @@ -267,6 +285,12 @@ static void pegasos2_machine_reset(MachineState *machine, ShutdownCause reason) PCI_INTERRUPT_LINE, 2, 0x9); pegasos2_pci_config_write(pm, 1, (PCI_DEVFN(12, 0) << 8) | 0x50, 1, 0x2); + pegasos2_pci_config_write(pm, 1, (PCI_DEVFN(12, 0) << 8) | + 0x55, 1, 0x90); + pegasos2_pci_config_write(pm, 1, (PCI_DEVFN(12, 0) << 8) | + 0x56, 1, 0x99); + pegasos2_pci_config_write(pm, 1, (PCI_DEVFN(12, 0) << 8) | + 0x57, 1, 0x90); pegasos2_pci_config_write(pm, 1, (PCI_DEVFN(12, 1) << 8) | PCI_INTERRUPT_LINE, 2, 0x109); diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c index 925ff523cc..ec4def62f8 100644 --- a/hw/ppc/spapr_hcall.c +++ b/hw/ppc/spapr_hcall.c @@ -8,6 +8,7 @@ #include "qemu/module.h" #include "qemu/error-report.h" #include "exec/exec-all.h" +#include "exec/tb-flush.h" #include "helper_regs.h" #include "hw/ppc/ppc.h" #include "hw/ppc/spapr.h" diff --git a/hw/riscv/Kconfig b/hw/riscv/Kconfig index 4550b3b938..6528ebfa3a 100644 --- a/hw/riscv/Kconfig +++ b/hw/riscv/Kconfig @@ -44,6 +44,7 @@ config RISCV_VIRT select VIRTIO_MMIO select FW_CFG_DMA select PLATFORM_BUS + select ACPI config SHAKTI_C bool diff --git a/hw/riscv/meson.build b/hw/riscv/meson.build index ab6cae57ea..2f7ee81be3 100644 --- a/hw/riscv/meson.build +++ b/hw/riscv/meson.build @@ -9,5 +9,6 @@ riscv_ss.add(when: 'CONFIG_SIFIVE_E', if_true: files('sifive_e.c')) riscv_ss.add(when: 'CONFIG_SIFIVE_U', if_true: files('sifive_u.c')) 
riscv_ss.add(when: 'CONFIG_SPIKE', if_true: files('spike.c')) riscv_ss.add(when: 'CONFIG_MICROCHIP_PFSOC', if_true: files('microchip_pfsoc.c')) +riscv_ss.add(when: 'CONFIG_ACPI', if_true: files('virt-acpi-build.c')) hw_arch += {'riscv': riscv_ss} diff --git a/hw/riscv/virt-acpi-build.c b/hw/riscv/virt-acpi-build.c new file mode 100644 index 0000000000..82da0a238c --- /dev/null +++ b/hw/riscv/virt-acpi-build.c @@ -0,0 +1,416 @@ +/* + * Support for generating ACPI tables and passing them to Guests + * + * RISC-V virt ACPI generation + * + * Copyright (C) 2008-2010 Kevin O'Connor <kevin@koconnor.net> + * Copyright (C) 2006 Fabrice Bellard + * Copyright (C) 2013 Red Hat Inc + * Copyright (c) 2015 HUAWEI TECHNOLOGIES CO.,LTD. + * Copyright (C) 2021-2023 Ventana Micro Systems Inc + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + + * You should have received a copy of the GNU General Public License along + * with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "qemu/osdep.h" +#include "hw/acpi/acpi-defs.h" +#include "hw/acpi/acpi.h" +#include "hw/acpi/aml-build.h" +#include "hw/acpi/utils.h" +#include "qapi/error.h" +#include "sysemu/reset.h" +#include "migration/vmstate.h" +#include "hw/riscv/virt.h" +#include "hw/riscv/numa.h" +#include "hw/intc/riscv_aclint.h" + +#define ACPI_BUILD_TABLE_SIZE 0x20000 + +typedef struct AcpiBuildState { + /* Copy of table in RAM (for patching) */ + MemoryRegion *table_mr; + MemoryRegion *rsdp_mr; + MemoryRegion *linker_mr; + /* Is table patched? 
*/ + bool patched; +} AcpiBuildState; + +static void acpi_align_size(GArray *blob, unsigned align) +{ + /* + * Align size to multiple of given size. This reduces the chance + * we need to change size in the future (breaking cross version migration). + */ + g_array_set_size(blob, ROUND_UP(acpi_data_len(blob), align)); +} + +static void riscv_acpi_madt_add_rintc(uint32_t uid, + const CPUArchIdList *arch_ids, + GArray *entry) +{ + uint64_t hart_id = arch_ids->cpus[uid].arch_id; + + build_append_int_noprefix(entry, 0x18, 1); /* Type */ + build_append_int_noprefix(entry, 20, 1); /* Length */ + build_append_int_noprefix(entry, 1, 1); /* Version */ + build_append_int_noprefix(entry, 0, 1); /* Reserved */ + build_append_int_noprefix(entry, 0x1, 4); /* Flags */ + build_append_int_noprefix(entry, hart_id, 8); /* Hart ID */ + build_append_int_noprefix(entry, uid, 4); /* ACPI Processor UID */ +} + +static void acpi_dsdt_add_cpus(Aml *scope, RISCVVirtState *s) +{ + MachineClass *mc = MACHINE_GET_CLASS(s); + MachineState *ms = MACHINE(s); + const CPUArchIdList *arch_ids = mc->possible_cpu_arch_ids(ms); + + for (int i = 0; i < arch_ids->len; i++) { + Aml *dev; + GArray *madt_buf = g_array_new(0, 1, 1); + + dev = aml_device("C%.03X", i); + aml_append(dev, aml_name_decl("_HID", aml_string("ACPI0007"))); + aml_append(dev, aml_name_decl("_UID", + aml_int(arch_ids->cpus[i].arch_id))); + + /* build _MAT object */ + riscv_acpi_madt_add_rintc(i, arch_ids, madt_buf); + aml_append(dev, aml_name_decl("_MAT", + aml_buffer(madt_buf->len, + (uint8_t *)madt_buf->data))); + g_array_free(madt_buf, true); + + aml_append(scope, dev); + } +} + +static void acpi_dsdt_add_fw_cfg(Aml *scope, const MemMapEntry *fw_cfg_memmap) +{ + Aml *dev = aml_device("FWCF"); + aml_append(dev, aml_name_decl("_HID", aml_string("QEMU0002"))); + + /* device present, functioning, decoding, not shown in UI */ + aml_append(dev, aml_name_decl("_STA", aml_int(0xB))); + aml_append(dev, aml_name_decl("_CCA", aml_int(1))); + + 
Aml *crs = aml_resource_template(); + aml_append(crs, aml_memory32_fixed(fw_cfg_memmap->base, + fw_cfg_memmap->size, AML_READ_WRITE)); + aml_append(dev, aml_name_decl("_CRS", crs)); + aml_append(scope, dev); +} + +/* RHCT Node[N] starts at offset 56 */ +#define RHCT_NODE_ARRAY_OFFSET 56 + +/* + * ACPI spec, Revision 6.5+ + * 5.2.36 RISC-V Hart Capabilities Table (RHCT) + * REF: https://github.com/riscv-non-isa/riscv-acpi/issues/16 + * https://drive.google.com/file/d/1nP3nFiH4jkPMp6COOxP6123DCZKR-tia/view + */ +static void build_rhct(GArray *table_data, + BIOSLinker *linker, + RISCVVirtState *s) +{ + MachineClass *mc = MACHINE_GET_CLASS(s); + MachineState *ms = MACHINE(s); + const CPUArchIdList *arch_ids = mc->possible_cpu_arch_ids(ms); + size_t len, aligned_len; + uint32_t isa_offset, num_rhct_nodes; + RISCVCPU *cpu; + char *isa; + + AcpiTable table = { .sig = "RHCT", .rev = 1, .oem_id = s->oem_id, + .oem_table_id = s->oem_table_id }; + + acpi_table_begin(&table, table_data); + + build_append_int_noprefix(table_data, 0x0, 4); /* Reserved */ + + /* Time Base Frequency */ + build_append_int_noprefix(table_data, + RISCV_ACLINT_DEFAULT_TIMEBASE_FREQ, 8); + + /* ISA + N hart info */ + num_rhct_nodes = 1 + ms->smp.cpus; + + /* Number of RHCT nodes*/ + build_append_int_noprefix(table_data, num_rhct_nodes, 4); + + /* Offset to the RHCT node array */ + build_append_int_noprefix(table_data, RHCT_NODE_ARRAY_OFFSET, 4); + + /* ISA String Node */ + isa_offset = table_data->len - table.table_offset; + build_append_int_noprefix(table_data, 0, 2); /* Type 0 */ + + cpu = &s->soc[0].harts[0]; + isa = riscv_isa_string(cpu); + len = 8 + strlen(isa) + 1; + aligned_len = (len % 2) ? 
(len + 1) : len; + + build_append_int_noprefix(table_data, aligned_len, 2); /* Length */ + build_append_int_noprefix(table_data, 0x1, 2); /* Revision */ + + /* ISA string length including NUL */ + build_append_int_noprefix(table_data, strlen(isa) + 1, 2); + g_array_append_vals(table_data, isa, strlen(isa) + 1); /* ISA string */ + + if (aligned_len != len) { + build_append_int_noprefix(table_data, 0x0, 1); /* Optional Padding */ + } + + /* Hart Info Node */ + for (int i = 0; i < arch_ids->len; i++) { + build_append_int_noprefix(table_data, 0xFFFF, 2); /* Type */ + build_append_int_noprefix(table_data, 16, 2); /* Length */ + build_append_int_noprefix(table_data, 0x1, 2); /* Revision */ + build_append_int_noprefix(table_data, 1, 2); /* Number of offsets */ + build_append_int_noprefix(table_data, i, 4); /* ACPI Processor UID */ + build_append_int_noprefix(table_data, isa_offset, 4); /* Offsets[0] */ + } + + acpi_table_end(linker, &table); +} + +/* FADT */ +static void build_fadt_rev6(GArray *table_data, + BIOSLinker *linker, + RISCVVirtState *s, + unsigned dsdt_tbl_offset) +{ + AcpiFadtData fadt = { + .rev = 6, + .minor_ver = 5, + .flags = 1 << ACPI_FADT_F_HW_REDUCED_ACPI, + .xdsdt_tbl_offset = &dsdt_tbl_offset, + }; + + build_fadt(table_data, linker, &fadt, s->oem_id, s->oem_table_id); +} + +/* DSDT */ +static void build_dsdt(GArray *table_data, + BIOSLinker *linker, + RISCVVirtState *s) +{ + Aml *scope, *dsdt; + const MemMapEntry *memmap = s->memmap; + AcpiTable table = { .sig = "DSDT", .rev = 2, .oem_id = s->oem_id, + .oem_table_id = s->oem_table_id }; + + + acpi_table_begin(&table, table_data); + dsdt = init_aml_allocator(); + + /* + * When booting the VM with UEFI, UEFI takes ownership of the RTC hardware. + * While UEFI can use libfdt to disable the RTC device node in the DTB that + * it passes to the OS, it cannot modify AML. Therefore, we won't generate + * the RTC ACPI device at all when using UEFI. 
+ */ + scope = aml_scope("\\_SB"); + acpi_dsdt_add_cpus(scope, s); + + acpi_dsdt_add_fw_cfg(scope, &memmap[VIRT_FW_CFG]); + + aml_append(dsdt, scope); + + /* copy AML table into ACPI tables blob and patch header there */ + g_array_append_vals(table_data, dsdt->buf->data, dsdt->buf->len); + + acpi_table_end(linker, &table); + free_aml_allocator(); +} + +/* + * ACPI spec, Revision 6.5+ + * 5.2.12 Multiple APIC Description Table (MADT) + * REF: https://github.com/riscv-non-isa/riscv-acpi/issues/15 + * https://drive.google.com/file/d/1R6k4MshhN3WTT-hwqAquu5nX6xSEqK2l/view + */ +static void build_madt(GArray *table_data, + BIOSLinker *linker, + RISCVVirtState *s) +{ + MachineClass *mc = MACHINE_GET_CLASS(s); + MachineState *ms = MACHINE(s); + const CPUArchIdList *arch_ids = mc->possible_cpu_arch_ids(ms); + + AcpiTable table = { .sig = "APIC", .rev = 6, .oem_id = s->oem_id, + .oem_table_id = s->oem_table_id }; + + acpi_table_begin(&table, table_data); + /* Local Interrupt Controller Address */ + build_append_int_noprefix(table_data, 0, 4); + build_append_int_noprefix(table_data, 0, 4); /* MADT Flags */ + + /* RISC-V Local INTC structures per HART */ + for (int i = 0; i < arch_ids->len; i++) { + riscv_acpi_madt_add_rintc(i, arch_ids, table_data); + } + + acpi_table_end(linker, &table); +} + +static void virt_acpi_build(RISCVVirtState *s, AcpiBuildTables *tables) +{ + GArray *table_offsets; + unsigned dsdt, xsdt; + GArray *tables_blob = tables->table_data; + + table_offsets = g_array_new(false, true, + sizeof(uint32_t)); + + bios_linker_loader_alloc(tables->linker, + ACPI_BUILD_TABLE_FILE, tables_blob, + 64, false); + + /* DSDT is pointed to by FADT */ + dsdt = tables_blob->len; + build_dsdt(tables_blob, tables->linker, s); + + /* FADT and others pointed to by XSDT */ + acpi_add_table(table_offsets, tables_blob); + build_fadt_rev6(tables_blob, tables->linker, s, dsdt); + + acpi_add_table(table_offsets, tables_blob); + build_madt(tables_blob, tables->linker, s); + + 
acpi_add_table(table_offsets, tables_blob); + build_rhct(tables_blob, tables->linker, s); + + /* XSDT is pointed to by RSDP */ + xsdt = tables_blob->len; + build_xsdt(tables_blob, tables->linker, table_offsets, s->oem_id, + s->oem_table_id); + + /* RSDP is in FSEG memory, so allocate it separately */ + { + AcpiRsdpData rsdp_data = { + .revision = 2, + .oem_id = s->oem_id, + .xsdt_tbl_offset = &xsdt, + .rsdt_tbl_offset = NULL, + }; + build_rsdp(tables->rsdp, tables->linker, &rsdp_data); + } + + /* + * The align size is 128, warn if 64k is not enough therefore + * the align size could be resized. + */ + if (tables_blob->len > ACPI_BUILD_TABLE_SIZE / 2) { + warn_report("ACPI table size %u exceeds %d bytes," + " migration may not work", + tables_blob->len, ACPI_BUILD_TABLE_SIZE / 2); + error_printf("Try removing some objects."); + } + + acpi_align_size(tables_blob, ACPI_BUILD_TABLE_SIZE); + + /* Clean up memory that's no longer used */ + g_array_free(table_offsets, true); +} + +static void acpi_ram_update(MemoryRegion *mr, GArray *data) +{ + uint32_t size = acpi_data_len(data); + + /* + * Make sure RAM size is correct - in case it got changed + * e.g. by migration + */ + memory_region_ram_resize(mr, size, &error_abort); + + memcpy(memory_region_get_ram_ptr(mr), data->data, size); + memory_region_set_dirty(mr, 0, size); +} + +static void virt_acpi_build_update(void *build_opaque) +{ + AcpiBuildState *build_state = build_opaque; + AcpiBuildTables tables; + + /* No state to update or already patched? Nothing to do. 
*/ + if (!build_state || build_state->patched) { + return; + } + + build_state->patched = true; + + acpi_build_tables_init(&tables); + + virt_acpi_build(RISCV_VIRT_MACHINE(qdev_get_machine()), &tables); + + acpi_ram_update(build_state->table_mr, tables.table_data); + acpi_ram_update(build_state->rsdp_mr, tables.rsdp); + acpi_ram_update(build_state->linker_mr, tables.linker->cmd_blob); + + acpi_build_tables_cleanup(&tables, true); +} + +static void virt_acpi_build_reset(void *build_opaque) +{ + AcpiBuildState *build_state = build_opaque; + build_state->patched = false; +} + +static const VMStateDescription vmstate_virt_acpi_build = { + .name = "virt_acpi_build", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_BOOL(patched, AcpiBuildState), + VMSTATE_END_OF_LIST() + }, +}; + +void virt_acpi_setup(RISCVVirtState *s) +{ + AcpiBuildTables tables; + AcpiBuildState *build_state; + + build_state = g_malloc0(sizeof *build_state); + + acpi_build_tables_init(&tables); + virt_acpi_build(s, &tables); + + /* Now expose it all to Guest */ + build_state->table_mr = acpi_add_rom_blob(virt_acpi_build_update, + build_state, tables.table_data, + ACPI_BUILD_TABLE_FILE); + assert(build_state->table_mr != NULL); + + build_state->linker_mr = acpi_add_rom_blob(virt_acpi_build_update, + build_state, + tables.linker->cmd_blob, + ACPI_BUILD_LOADER_FILE); + + build_state->rsdp_mr = acpi_add_rom_blob(virt_acpi_build_update, + build_state, tables.rsdp, + ACPI_BUILD_RSDP_FILE); + + qemu_register_reset(virt_acpi_build_reset, build_state); + virt_acpi_build_reset(build_state); + vmstate_register(NULL, 0, &vmstate_virt_acpi_build, build_state); + + /* + * Clean up tables but don't free the memory: we track it + * in build_state. 
+ */ + acpi_build_tables_cleanup(&tables, false); +} diff --git a/hw/riscv/virt.c b/hw/riscv/virt.c index 4f8191860b..4e3efbee16 100644 --- a/hw/riscv/virt.c +++ b/hw/riscv/virt.c @@ -49,6 +49,8 @@ #include "hw/pci/pci.h" #include "hw/pci-host/gpex.h" #include "hw/display/ramfb.h" +#include "hw/acpi/aml-build.h" +#include "qapi/qapi-visit-common.h" /* * The virt machine physical address space used by some of the devices @@ -228,8 +230,9 @@ static void create_fdt_socket_cpus(RISCVVirtState *s, int socket, int cpu; uint32_t cpu_phandle; MachineState *ms = MACHINE(s); - char *name, *cpu_name, *core_name, *intc_name; + char *name, *cpu_name, *core_name, *intc_name, *sv_name; bool is_32_bit = riscv_is_32bit(&s->soc[0]); + uint8_t satp_mode_max; for (cpu = s->soc[socket].num_harts - 1; cpu >= 0; cpu--) { RISCVCPU *cpu_ptr = &s->soc[socket].harts[cpu]; @@ -239,16 +242,29 @@ static void create_fdt_socket_cpus(RISCVVirtState *s, int socket, cpu_name = g_strdup_printf("/cpus/cpu@%d", s->soc[socket].hartid_base + cpu); qemu_fdt_add_subnode(ms->fdt, cpu_name); - if (cpu_ptr->cfg.mmu) { - qemu_fdt_setprop_string(ms->fdt, cpu_name, "mmu-type", - (is_32_bit) ? 
"riscv,sv32" : "riscv,sv48"); - } else { - qemu_fdt_setprop_string(ms->fdt, cpu_name, "mmu-type", - "riscv,none"); - } + + satp_mode_max = satp_mode_max_from_map( + s->soc[socket].harts[cpu].cfg.satp_mode.map); + sv_name = g_strdup_printf("riscv,%s", + satp_mode_str(satp_mode_max, is_32_bit)); + qemu_fdt_setprop_string(ms->fdt, cpu_name, "mmu-type", sv_name); + g_free(sv_name); + + name = riscv_isa_string(cpu_ptr); qemu_fdt_setprop_string(ms->fdt, cpu_name, "riscv,isa", name); g_free(name); + + if (cpu_ptr->cfg.ext_icbom) { + qemu_fdt_setprop_cell(ms->fdt, cpu_name, "riscv,cbom-block-size", + cpu_ptr->cfg.cbom_blocksize); + } + + if (cpu_ptr->cfg.ext_icboz) { + qemu_fdt_setprop_cell(ms->fdt, cpu_name, "riscv,cboz-block-size", + cpu_ptr->cfg.cboz_blocksize); + } + qemu_fdt_setprop_string(ms->fdt, cpu_name, "compatible", "riscv"); qemu_fdt_setprop_string(ms->fdt, cpu_name, "status", "okay"); qemu_fdt_setprop_cell(ms->fdt, cpu_name, "reg", @@ -1307,6 +1323,10 @@ static void virt_machine_done(Notifier *notifier, void *data) if (kvm_enabled()) { riscv_setup_direct_kernel(kernel_entry, fdt_load_addr); } + + if (virt_is_acpi_enabled(s)) { + virt_acpi_setup(s); + } } static void virt_machine_init(MachineState *machine) @@ -1442,6 +1462,8 @@ static void virt_machine_init(MachineState *machine) ROUND_UP(virt_high_pcie_memmap.base, virt_high_pcie_memmap.size); } + s->memmap = virt_memmap; + /* register system main memory (actual RAM) */ memory_region_add_subregion(system_memory, memmap[VIRT_DRAM].base, machine->ram); @@ -1514,6 +1536,11 @@ static void virt_machine_init(MachineState *machine) static void virt_machine_instance_init(Object *obj) { + RISCVVirtState *s = RISCV_VIRT_MACHINE(obj); + + s->oem_id = g_strndup(ACPI_BUILD_APPNAME6, 6); + s->oem_table_id = g_strndup(ACPI_BUILD_APPNAME8, 8); + s->acpi = ON_OFF_AUTO_AUTO; } static char *virt_get_aia_guests(Object *obj, Error **errp) @@ -1588,6 +1615,28 @@ static void virt_set_aclint(Object *obj, bool value, Error **errp) 
s->have_aclint = value; } +bool virt_is_acpi_enabled(RISCVVirtState *s) +{ + return s->acpi != ON_OFF_AUTO_OFF; +} + +static void virt_get_acpi(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + RISCVVirtState *s = RISCV_VIRT_MACHINE(obj); + OnOffAuto acpi = s->acpi; + + visit_type_OnOffAuto(v, name, &acpi, errp); +} + +static void virt_set_acpi(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + RISCVVirtState *s = RISCV_VIRT_MACHINE(obj); + + visit_type_OnOffAuto(v, name, &s->acpi, errp); +} + static HotplugHandler *virt_machine_get_hotplug_handler(MachineState *machine, DeviceState *dev) { @@ -1659,6 +1708,11 @@ static void virt_machine_class_init(ObjectClass *oc, void *data) sprintf(str, "Set number of guest MMIO pages for AIA IMSIC. Valid value " "should be between 0 and %d.", VIRT_IRQCHIP_MAX_GUESTS); object_class_property_set_description(oc, "aia-guests", str); + object_class_property_add(oc, "acpi", "OnOffAuto", + virt_get_acpi, virt_set_acpi, + NULL, NULL); + object_class_property_set_description(oc, "acpi", + "Enable ACPI"); } static const TypeInfo virt_machine_typeinfo = { diff --git a/hw/usb/hcd-ohci.c b/hw/usb/hcd-ohci.c index 6f8b543243..88d2b4b13c 100644 --- a/hw/usb/hcd-ohci.c +++ b/hw/usb/hcd-ohci.c @@ -1410,6 +1410,18 @@ static void ohci_set_hub_status(OHCIState *ohci, uint32_t val) } } +/* This is the one state transition the controller can do by itself */ +static bool ohci_resume(OHCIState *s) +{ + if ((s->ctl & OHCI_CTL_HCFS) == OHCI_USB_SUSPEND) { + trace_usb_ohci_remote_wakeup(s->name); + s->ctl &= ~OHCI_CTL_HCFS; + s->ctl |= OHCI_USB_RESUME; + return true; + } + return false; +} + /* * Sets a flag in a port status reg but only set it if the port is connected. * If not set ConnectStatusChange flag. If flag is enabled return 1. 
@@ -1426,7 +1438,10 @@ static int ohci_port_set_if_connected(OHCIState *ohci, int i, uint32_t val) if (!(ohci->rhport[i].ctrl & OHCI_PORT_CCS)) { ohci->rhport[i].ctrl |= OHCI_PORT_CSC; if (ohci->rhstatus & OHCI_RHS_DRWE) { - /* TODO: CSC is a wakeup event */ + /* CSC is a wakeup event */ + if (ohci_resume(ohci)) { + ohci_set_interrupt(ohci, OHCI_INTR_RD); + } } return 0; } @@ -1828,11 +1843,7 @@ static void ohci_wakeup(USBPort *port1) intr = OHCI_INTR_RHSC; } /* Note that the controller can be suspended even if this port is not */ - if ((s->ctl & OHCI_CTL_HCFS) == OHCI_USB_SUSPEND) { - trace_usb_ohci_remote_wakeup(s->name); - /* This is the one state transition the controller can do by itself */ - s->ctl &= ~OHCI_CTL_HCFS; - s->ctl |= OHCI_USB_RESUME; + if (ohci_resume(s)) { /* * In suspend mode only ResumeDetected is possible, not RHSC: * see the OHCI spec 5.1.2.3. diff --git a/hw/usb/meson.build b/hw/usb/meson.build index bdf34cbd3e..599dc24f0d 100644 --- a/hw/usb/meson.build +++ b/hw/usb/meson.build @@ -84,6 +84,6 @@ if libusb.found() hw_usb_modules += {'host': usbhost_ss} endif -softmmu_ss.add(when: ['CONFIG_USB', 'CONFIG_XEN', libusb], if_true: files('xen-usb.c')) +softmmu_ss.add(when: ['CONFIG_USB', 'CONFIG_XEN_BUS', libusb], if_true: files('xen-usb.c')) modules += { 'hw-usb': hw_usb_modules } diff --git a/hw/usb/vt82c686-uhci-pci.c b/hw/usb/vt82c686-uhci-pci.c index 46a901f56f..b4884c9011 100644 --- a/hw/usb/vt82c686-uhci-pci.c +++ b/hw/usb/vt82c686-uhci-pci.c @@ -1,17 +1,7 @@ #include "qemu/osdep.h" -#include "hw/irq.h" #include "hw/isa/vt82c686.h" #include "hcd-uhci.h" -static void uhci_isa_set_irq(void *opaque, int irq_num, int level) -{ - UHCIState *s = opaque; - uint8_t irq = pci_get_byte(s->dev.config + PCI_INTERRUPT_LINE); - if (irq > 0 && irq < 15) { - via_isa_set_irq(pci_get_function_0(&s->dev), irq, level); - } -} - static void usb_uhci_vt82c686b_realize(PCIDevice *dev, Error **errp) { UHCIState *s = UHCI(dev); @@ -25,8 +15,6 @@ static void 
usb_uhci_vt82c686b_realize(PCIDevice *dev, Error **errp) pci_set_long(pci_conf + 0xc0, 0x00002000); usb_uhci_common_realize(dev, errp); - object_unref(s->irq); - s->irq = qemu_allocate_irq(uhci_isa_set_irq, s, 0); } static UHCIInfo uhci_info[] = { diff --git a/hw/usb/xen-usb.c b/hw/usb/xen-usb.c index 0f7369e7ed..66cb3f7c24 100644 --- a/hw/usb/xen-usb.c +++ b/hw/usb/xen-usb.c @@ -101,6 +101,8 @@ struct usbback_hotplug { struct usbback_info { struct XenLegacyDevice xendev; /* must be first */ USBBus bus; + uint32_t urb_ring_ref; + uint32_t conn_ring_ref; void *urb_sring; void *conn_sring; struct usbif_urb_back_ring urb_ring; @@ -159,7 +161,7 @@ static int usbback_gnttab_map(struct usbback_req *usbback_req) for (i = 0; i < nr_segs; i++) { if ((unsigned)usbback_req->req.seg[i].offset + - (unsigned)usbback_req->req.seg[i].length > XC_PAGE_SIZE) { + (unsigned)usbback_req->req.seg[i].length > XEN_PAGE_SIZE) { xen_pv_printf(xendev, 0, "segment crosses page boundary\n"); return -EINVAL; } @@ -183,7 +185,7 @@ static int usbback_gnttab_map(struct usbback_req *usbback_req) for (i = 0; i < usbback_req->nr_buffer_segs; i++) { seg = usbback_req->req.seg + i; - addr = usbback_req->buffer + i * XC_PAGE_SIZE + seg->offset; + addr = usbback_req->buffer + i * XEN_PAGE_SIZE + seg->offset; qemu_iovec_add(&usbback_req->packet.iov, addr, seg->length); } } @@ -277,10 +279,11 @@ static int usbback_init_packet(struct usbback_req *usbback_req) static void usbback_do_response(struct usbback_req *usbback_req, int32_t status, int32_t actual_length, int32_t error_count) { + uint32_t ref[USBIF_MAX_SEGMENTS_PER_REQUEST]; struct usbback_info *usbif; struct usbif_urb_response *res; struct XenLegacyDevice *xendev; - unsigned int notify; + unsigned int notify, i; usbif = usbback_req->usbif; xendev = &usbif->xendev; @@ -293,13 +296,19 @@ static void usbback_do_response(struct usbback_req *usbback_req, int32_t status, } if (usbback_req->buffer) { - xen_be_unmap_grant_refs(xendev, usbback_req->buffer, + 
for (i = 0; i < usbback_req->nr_buffer_segs; i++) { + ref[i] = usbback_req->req.seg[i].gref; + } + xen_be_unmap_grant_refs(xendev, usbback_req->buffer, ref, usbback_req->nr_buffer_segs); usbback_req->buffer = NULL; } if (usbback_req->isoc_buffer) { - xen_be_unmap_grant_refs(xendev, usbback_req->isoc_buffer, + for (i = 0; i < usbback_req->nr_extra_segs; i++) { + ref[i] = usbback_req->req.seg[i + usbback_req->req.nr_buffer_segs].gref; + } + xen_be_unmap_grant_refs(xendev, usbback_req->isoc_buffer, ref, usbback_req->nr_extra_segs); usbback_req->isoc_buffer = NULL; } @@ -832,11 +841,11 @@ static void usbback_disconnect(struct XenLegacyDevice *xendev) xen_pv_unbind_evtchn(xendev); if (usbif->urb_sring) { - xen_be_unmap_grant_ref(xendev, usbif->urb_sring); + xen_be_unmap_grant_ref(xendev, usbif->urb_sring, usbif->urb_ring_ref); usbif->urb_sring = NULL; } if (usbif->conn_sring) { - xen_be_unmap_grant_ref(xendev, usbif->conn_sring); + xen_be_unmap_grant_ref(xendev, usbif->conn_sring, usbif->conn_ring_ref); usbif->conn_sring = NULL; } @@ -889,10 +898,12 @@ static int usbback_connect(struct XenLegacyDevice *xendev) return -1; } + usbif->urb_ring_ref = urb_ring_ref; + usbif->conn_ring_ref = conn_ring_ref; urb_sring = usbif->urb_sring; conn_sring = usbif->conn_sring; - BACK_RING_INIT(&usbif->urb_ring, urb_sring, XC_PAGE_SIZE); - BACK_RING_INIT(&usbif->conn_ring, conn_sring, XC_PAGE_SIZE); + BACK_RING_INIT(&usbif->urb_ring, urb_sring, XEN_PAGE_SIZE); + BACK_RING_INIT(&usbif->conn_ring, conn_sring, XEN_PAGE_SIZE); xen_be_bind_evtchn(xendev); diff --git a/hw/vfio/common.c b/hw/vfio/common.c index bab83c0e55..4d01ea3515 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -42,6 +42,7 @@ #include "migration/migration.h" #include "migration/misc.h" #include "migration/blocker.h" +#include "migration/qemu-file.h" #include "sysemu/tpm.h" VFIOGroupList vfio_group_list = @@ -319,6 +320,28 @@ const MemoryRegionOps vfio_region_ops = { * Device state interfaces */ +typedef struct { + 
unsigned long *bitmap; + hwaddr size; + hwaddr pages; +} VFIOBitmap; + +static int vfio_bitmap_alloc(VFIOBitmap *vbmap, hwaddr size) +{ + vbmap->pages = REAL_HOST_PAGE_ALIGN(size) / qemu_real_host_page_size(); + vbmap->size = ROUND_UP(vbmap->pages, sizeof(__u64) * BITS_PER_BYTE) / + BITS_PER_BYTE; + vbmap->bitmap = g_try_malloc0(vbmap->size); + if (!vbmap->bitmap) { + return -ENOMEM; + } + + return 0; +} + +static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, + uint64_t size, ram_addr_t ram_addr); + bool vfio_mig_active(void) { VFIOGroup *group; @@ -339,6 +362,7 @@ bool vfio_mig_active(void) } static Error *multiple_devices_migration_blocker; +static Error *giommu_migration_blocker; static unsigned int vfio_migratable_device_num(void) { @@ -390,6 +414,64 @@ void vfio_unblock_multiple_devices_migration(void) multiple_devices_migration_blocker = NULL; } +static bool vfio_viommu_preset(void) +{ + VFIOAddressSpace *space; + + QLIST_FOREACH(space, &vfio_address_spaces, list) { + if (space->as != &address_space_memory) { + return true; + } + } + + return false; +} + +int vfio_block_giommu_migration(Error **errp) +{ + int ret; + + if (giommu_migration_blocker || + !vfio_viommu_preset()) { + return 0; + } + + error_setg(&giommu_migration_blocker, + "Migration is currently not supported with vIOMMU enabled"); + ret = migrate_add_blocker(giommu_migration_blocker, errp); + if (ret < 0) { + error_free(giommu_migration_blocker); + giommu_migration_blocker = NULL; + } + + return ret; +} + +void vfio_migration_finalize(void) +{ + if (!giommu_migration_blocker || + vfio_viommu_preset()) { + return; + } + + migrate_del_blocker(giommu_migration_blocker); + error_free(giommu_migration_blocker); + giommu_migration_blocker = NULL; +} + +static void vfio_set_migration_error(int err) +{ + MigrationState *ms = migrate_get_current(); + + if (migration_is_setup_or_active(ms->state)) { + WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) { + if (ms->to_dst_file) { + 
qemu_file_set_error(ms->to_dst_file, err); + } + } + } +} + static bool vfio_devices_all_dirty_tracking(VFIOContainer *container) { VFIOGroup *group; @@ -417,6 +499,22 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainer *container) return true; } +static bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container) +{ + VFIOGroup *group; + VFIODevice *vbasedev; + + QLIST_FOREACH(group, &container->group_list, container_next) { + QLIST_FOREACH(vbasedev, &group->device_list, next) { + if (!vbasedev->dirty_pages_supported) { + return false; + } + } + } + + return true; +} + /* * Check if all VFIO devices are running and migration is active, which is * essentially equivalent to the migration being in pre-copy phase. @@ -454,9 +552,14 @@ static int vfio_dma_unmap_bitmap(VFIOContainer *container, { struct vfio_iommu_type1_dma_unmap *unmap; struct vfio_bitmap *bitmap; - uint64_t pages = REAL_HOST_PAGE_ALIGN(size) / qemu_real_host_page_size(); + VFIOBitmap vbmap; int ret; + ret = vfio_bitmap_alloc(&vbmap, size); + if (ret) { + return ret; + } + unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap)); unmap->argsz = sizeof(*unmap) + sizeof(*bitmap); @@ -470,35 +573,28 @@ static int vfio_dma_unmap_bitmap(VFIOContainer *container, * qemu_real_host_page_size to mark those dirty. Hence set bitmap_pgsize * to qemu_real_host_page_size. 
*/ - bitmap->pgsize = qemu_real_host_page_size(); - bitmap->size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) / - BITS_PER_BYTE; + bitmap->size = vbmap.size; + bitmap->data = (__u64 *)vbmap.bitmap; - if (bitmap->size > container->max_dirty_bitmap_size) { - error_report("UNMAP: Size of bitmap too big 0x%"PRIx64, - (uint64_t)bitmap->size); + if (vbmap.size > container->max_dirty_bitmap_size) { + error_report("UNMAP: Size of bitmap too big 0x%"PRIx64, vbmap.size); ret = -E2BIG; goto unmap_exit; } - bitmap->data = g_try_malloc0(bitmap->size); - if (!bitmap->data) { - ret = -ENOMEM; - goto unmap_exit; - } - ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap); if (!ret) { - cpu_physical_memory_set_dirty_lebitmap((unsigned long *)bitmap->data, - iotlb->translated_addr, pages); + cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap, + iotlb->translated_addr, vbmap.pages); } else { error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP : %m"); } - g_free(bitmap->data); unmap_exit: g_free(unmap); + g_free(vbmap.bitmap); + return ret; } @@ -515,10 +611,16 @@ static int vfio_dma_unmap(VFIOContainer *container, .iova = iova, .size = size, }; + bool need_dirty_sync = false; + int ret; - if (iotlb && container->dirty_pages_supported && - vfio_devices_all_running_and_mig_active(container)) { - return vfio_dma_unmap_bitmap(container, iova, size, iotlb); + if (iotlb && vfio_devices_all_running_and_mig_active(container)) { + if (!vfio_devices_all_device_dirty_tracking(container) && + container->dirty_pages_supported) { + return vfio_dma_unmap_bitmap(container, iova, size, iotlb); + } + + need_dirty_sync = true; } while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) { @@ -544,10 +646,12 @@ static int vfio_dma_unmap(VFIOContainer *container, return -errno; } - if (iotlb && vfio_devices_all_running_and_mig_active(container)) { - cpu_physical_memory_set_dirty_range(iotlb->translated_addr, size, - tcg_enabled() ? 
DIRTY_CLIENTS_ALL : - DIRTY_CLIENTS_NOCODE); + if (need_dirty_sync) { + ret = vfio_get_dirty_bitmap(container, iova, size, + iotlb->translated_addr); + if (ret) { + return ret; + } } return 0; @@ -680,6 +784,7 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) if (iotlb->target_as != &address_space_memory) { error_report("Wrong target AS \"%s\", only system memory is allowed", iotlb->target_as->name ? iotlb->target_as->name : "none"); + vfio_set_migration_error(-EINVAL); return; } @@ -703,17 +808,18 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) read_only); if (ret) { error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", " - "0x%"HWADDR_PRIx", %p) = %d (%m)", + "0x%"HWADDR_PRIx", %p) = %d (%s)", container, iova, - iotlb->addr_mask + 1, vaddr, ret); + iotlb->addr_mask + 1, vaddr, ret, strerror(-ret)); } } else { ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1, iotlb); if (ret) { error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " - "0x%"HWADDR_PRIx") = %d (%m)", + "0x%"HWADDR_PRIx") = %d (%s)", container, iova, - iotlb->addr_mask + 1, ret); + iotlb->addr_mask + 1, ret, strerror(-ret)); + vfio_set_migration_error(ret); } } out: @@ -868,6 +974,22 @@ static void vfio_unregister_ram_discard_listener(VFIOContainer *container, g_free(vrdl); } +static VFIOHostDMAWindow *vfio_find_hostwin(VFIOContainer *container, + hwaddr iova, hwaddr end) +{ + VFIOHostDMAWindow *hostwin; + bool hostwin_found = false; + + QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { + if (hostwin->min_iova <= iova && end <= hostwin->max_iova) { + hostwin_found = true; + break; + } + } + + return hostwin_found ? 
hostwin : NULL; +} + static bool vfio_known_safe_misalignment(MemoryRegionSection *section) { MemoryRegion *mr = section->mr; @@ -884,24 +1006,15 @@ static bool vfio_known_safe_misalignment(MemoryRegionSection *section) return true; } -static void vfio_listener_region_add(MemoryListener *listener, - MemoryRegionSection *section) +static bool vfio_listener_valid_section(MemoryRegionSection *section, + const char *name) { - VFIOContainer *container = container_of(listener, VFIOContainer, listener); - hwaddr iova, end; - Int128 llend, llsize; - void *vaddr; - int ret; - VFIOHostDMAWindow *hostwin; - bool hostwin_found; - Error *err = NULL; - if (vfio_listener_skipped_section(section)) { - trace_vfio_listener_region_add_skip( + trace_vfio_listener_region_skip(name, section->offset_within_address_space, section->offset_within_address_space + int128_get64(int128_sub(section->size, int128_one()))); - return; + return false; } if (unlikely((section->offset_within_address_space & @@ -916,15 +1029,53 @@ static void vfio_listener_region_add(MemoryListener *listener, section->offset_within_region, qemu_real_host_page_size()); } - return; + return false; } + return true; +} + +static bool vfio_get_section_iova_range(VFIOContainer *container, + MemoryRegionSection *section, + hwaddr *out_iova, hwaddr *out_end, + Int128 *out_llend) +{ + Int128 llend; + hwaddr iova; + iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space); llend = int128_make64(section->offset_within_address_space); llend = int128_add(llend, section->size); llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask())); if (int128_ge(int128_make64(iova), llend)) { + return false; + } + + *out_iova = iova; + *out_end = int128_get64(int128_sub(llend, int128_one())); + if (out_llend) { + *out_llend = llend; + } + return true; +} + +static void vfio_listener_region_add(MemoryListener *listener, + MemoryRegionSection *section) +{ + VFIOContainer *container = container_of(listener, VFIOContainer, 
listener); + hwaddr iova, end; + Int128 llend, llsize; + void *vaddr; + int ret; + VFIOHostDMAWindow *hostwin; + Error *err = NULL; + + if (!vfio_listener_valid_section(section, "region_add")) { + return; + } + + if (!vfio_get_section_iova_range(container, section, &iova, &end, &llend)) { if (memory_region_is_ram_device(section->mr)) { trace_vfio_listener_region_add_no_dma_map( memory_region_name(section->mr), @@ -934,7 +1085,6 @@ static void vfio_listener_region_add(MemoryListener *listener, } return; } - end = int128_get64(int128_sub(llend, int128_one())); if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { hwaddr pgsize = 0; @@ -994,15 +1144,8 @@ static void vfio_listener_region_add(MemoryListener *listener, #endif } - hostwin_found = false; - QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { - if (hostwin->min_iova <= iova && end <= hostwin->max_iova) { - hostwin_found = true; - break; - } - } - - if (!hostwin_found) { + hostwin = vfio_find_hostwin(container, iova, end); + if (!hostwin) { error_setg(&err, "Container %p can't map guest IOVA region" " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container, iova, end); goto fail; @@ -1095,8 +1238,9 @@ static void vfio_listener_region_add(MemoryListener *listener, vaddr, section->readonly); if (ret) { error_setg(&err, "vfio_dma_map(%p, 0x%"HWADDR_PRIx", " - "0x%"HWADDR_PRIx", %p) = %d (%m)", - container, iova, int128_get64(llsize), vaddr, ret); + "0x%"HWADDR_PRIx", %p) = %d (%s)", + container, iova, int128_get64(llsize), vaddr, ret, + strerror(-ret)); if (memory_region_is_ram_device(section->mr)) { /* Allow unexpected mappings not to be fatal for RAM devices */ error_report_err(err); @@ -1140,26 +1284,7 @@ static void vfio_listener_region_del(MemoryListener *listener, int ret; bool try_unmap = true; - if (vfio_listener_skipped_section(section)) { - trace_vfio_listener_region_del_skip( - section->offset_within_address_space, - section->offset_within_address_space + - int128_get64(int128_sub(section->size, 
int128_one()))); - return; - } - - if (unlikely((section->offset_within_address_space & - ~qemu_real_host_page_mask()) != - (section->offset_within_region & ~qemu_real_host_page_mask()))) { - if (!vfio_known_safe_misalignment(section)) { - error_report("%s received unaligned region %s iova=0x%"PRIx64 - " offset_within_region=0x%"PRIx64 - " qemu_real_host_page_size=0x%"PRIxPTR, - __func__, memory_region_name(section->mr), - section->offset_within_address_space, - section->offset_within_region, - qemu_real_host_page_size()); - } + if (!vfio_listener_valid_section(section, "region_del")) { return; } @@ -1186,15 +1311,9 @@ static void vfio_listener_region_del(MemoryListener *listener, */ } - iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space); - llend = int128_make64(section->offset_within_address_space); - llend = int128_add(llend, section->size); - llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask())); - - if (int128_ge(int128_make64(iova), llend)) { + if (!vfio_get_section_iova_range(container, section, &iova, &end, &llend)) { return; } - end = int128_get64(int128_sub(llend, int128_one())); llsize = int128_sub(llend, int128_make64(iova)); @@ -1203,15 +1322,9 @@ static void vfio_listener_region_del(MemoryListener *listener, if (memory_region_is_ram_device(section->mr)) { hwaddr pgmask; VFIOHostDMAWindow *hostwin; - bool hostwin_found = false; - QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { - if (hostwin->min_iova <= iova && end <= hostwin->max_iova) { - hostwin_found = true; - break; - } - } - assert(hostwin_found); /* or region_add() would have failed */ + hostwin = vfio_find_hostwin(container, iova, end); + assert(hostwin); /* or region_add() would have failed */ pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1; try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask)); @@ -1228,16 +1341,18 @@ static void vfio_listener_region_del(MemoryListener *listener, ret = vfio_dma_unmap(container, iova, 
int128_get64(llsize), NULL); if (ret) { error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " - "0x%"HWADDR_PRIx") = %d (%m)", - container, iova, int128_get64(llsize), ret); + "0x%"HWADDR_PRIx") = %d (%s)", + container, iova, int128_get64(llsize), ret, + strerror(-ret)); } iova += int128_get64(llsize); } ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL); if (ret) { error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " - "0x%"HWADDR_PRIx") = %d (%m)", - container, iova, int128_get64(llsize), ret); + "0x%"HWADDR_PRIx") = %d (%s)", + container, iova, int128_get64(llsize), ret, + strerror(-ret)); } } @@ -1256,7 +1371,7 @@ static void vfio_listener_region_del(MemoryListener *listener, } } -static void vfio_set_dirty_page_tracking(VFIOContainer *container, bool start) +static int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start) { int ret; struct vfio_iommu_type1_dirty_bitmap dirty = { @@ -1264,7 +1379,7 @@ static void vfio_set_dirty_page_tracking(VFIOContainer *container, bool start) }; if (!container->dirty_pages_supported) { - return; + return 0; } if (start) { @@ -1275,40 +1390,327 @@ static void vfio_set_dirty_page_tracking(VFIOContainer *container, bool start) ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty); if (ret) { + ret = -errno; error_report("Failed to set dirty tracking flag 0x%x errno: %d", dirty.flags, errno); } + + return ret; +} + +typedef struct VFIODirtyRanges { + hwaddr min32; + hwaddr max32; + hwaddr min64; + hwaddr max64; +} VFIODirtyRanges; + +typedef struct VFIODirtyRangesListener { + VFIOContainer *container; + VFIODirtyRanges ranges; + MemoryListener listener; +} VFIODirtyRangesListener; + +static void vfio_dirty_tracking_update(MemoryListener *listener, + MemoryRegionSection *section) +{ + VFIODirtyRangesListener *dirty = container_of(listener, + VFIODirtyRangesListener, + listener); + VFIODirtyRanges *range = &dirty->ranges; + hwaddr iova, end, *min, *max; + + if (!vfio_listener_valid_section(section, 
"tracking_update") || + !vfio_get_section_iova_range(dirty->container, section, + &iova, &end, NULL)) { + return; + } + + /* + * The address space passed to the dirty tracker is reduced to two ranges: + * one for 32-bit DMA ranges, and another one for 64-bit DMA ranges. + * The underlying reports of dirty will query a sub-interval of each of + * these ranges. + * + * The purpose of the dual range handling is to handle known cases of big + * holes in the address space, like the x86 AMD 1T hole. The alternative + * would be an IOVATree but that has a much bigger runtime overhead and + * unnecessary complexity. + */ + min = (end <= UINT32_MAX) ? &range->min32 : &range->min64; + max = (end <= UINT32_MAX) ? &range->max32 : &range->max64; + + if (*min > iova) { + *min = iova; + } + if (*max < end) { + *max = end; + } + + trace_vfio_device_dirty_tracking_update(iova, end, *min, *max); + return; +} + +static const MemoryListener vfio_dirty_tracking_listener = { + .name = "vfio-tracking", + .region_add = vfio_dirty_tracking_update, +}; + +static void vfio_dirty_tracking_init(VFIOContainer *container, + VFIODirtyRanges *ranges) +{ + VFIODirtyRangesListener dirty; + + memset(&dirty, 0, sizeof(dirty)); + dirty.ranges.min32 = UINT32_MAX; + dirty.ranges.min64 = UINT64_MAX; + dirty.listener = vfio_dirty_tracking_listener; + dirty.container = container; + + memory_listener_register(&dirty.listener, + container->space->as); + + *ranges = dirty.ranges; + + /* + * The memory listener is synchronous, and used to calculate the range + * to dirty tracking. Unregister it after we are done as we are not + * interested in any follow-up updates. 
+ */ + memory_listener_unregister(&dirty.listener); +} + +static void vfio_devices_dma_logging_stop(VFIOContainer *container) +{ + uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature), + sizeof(uint64_t))] = {}; + struct vfio_device_feature *feature = (struct vfio_device_feature *)buf; + VFIODevice *vbasedev; + VFIOGroup *group; + + feature->argsz = sizeof(buf); + feature->flags = VFIO_DEVICE_FEATURE_SET | + VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP; + + QLIST_FOREACH(group, &container->group_list, container_next) { + QLIST_FOREACH(vbasedev, &group->device_list, next) { + if (!vbasedev->dirty_tracking) { + continue; + } + + if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) { + warn_report("%s: Failed to stop DMA logging, err %d (%s)", + vbasedev->name, -errno, strerror(errno)); + } + vbasedev->dirty_tracking = false; + } + } +} + +static struct vfio_device_feature * +vfio_device_feature_dma_logging_start_create(VFIOContainer *container, + VFIODirtyRanges *tracking) +{ + struct vfio_device_feature *feature; + size_t feature_size; + struct vfio_device_feature_dma_logging_control *control; + struct vfio_device_feature_dma_logging_range *ranges; + + feature_size = sizeof(struct vfio_device_feature) + + sizeof(struct vfio_device_feature_dma_logging_control); + feature = g_try_malloc0(feature_size); + if (!feature) { + errno = ENOMEM; + return NULL; + } + feature->argsz = feature_size; + feature->flags = VFIO_DEVICE_FEATURE_SET | + VFIO_DEVICE_FEATURE_DMA_LOGGING_START; + + control = (struct vfio_device_feature_dma_logging_control *)feature->data; + control->page_size = qemu_real_host_page_size(); + + /* + * DMA logging uAPI guarantees to support at least a number of ranges that + * fits into a single host kernel base page. 
+ */ + control->num_ranges = !!tracking->max32 + !!tracking->max64; + ranges = g_try_new0(struct vfio_device_feature_dma_logging_range, + control->num_ranges); + if (!ranges) { + g_free(feature); + errno = ENOMEM; + + return NULL; + } + + control->ranges = (__u64)(uintptr_t)ranges; + if (tracking->max32) { + ranges->iova = tracking->min32; + ranges->length = (tracking->max32 - tracking->min32) + 1; + ranges++; + } + if (tracking->max64) { + ranges->iova = tracking->min64; + ranges->length = (tracking->max64 - tracking->min64) + 1; + } + + trace_vfio_device_dirty_tracking_start(control->num_ranges, + tracking->min32, tracking->max32, + tracking->min64, tracking->max64); + + return feature; +} + +static void vfio_device_feature_dma_logging_start_destroy( + struct vfio_device_feature *feature) +{ + struct vfio_device_feature_dma_logging_control *control = + (struct vfio_device_feature_dma_logging_control *)feature->data; + struct vfio_device_feature_dma_logging_range *ranges = + (struct vfio_device_feature_dma_logging_range *)(uintptr_t)control->ranges; + + g_free(ranges); + g_free(feature); +} + +static int vfio_devices_dma_logging_start(VFIOContainer *container) +{ + struct vfio_device_feature *feature; + VFIODirtyRanges ranges; + VFIODevice *vbasedev; + VFIOGroup *group; + int ret = 0; + + vfio_dirty_tracking_init(container, &ranges); + feature = vfio_device_feature_dma_logging_start_create(container, + &ranges); + if (!feature) { + return -errno; + } + + QLIST_FOREACH(group, &container->group_list, container_next) { + QLIST_FOREACH(vbasedev, &group->device_list, next) { + if (vbasedev->dirty_tracking) { + continue; + } + + ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature); + if (ret) { + ret = -errno; + error_report("%s: Failed to start DMA logging, err %d (%s)", + vbasedev->name, ret, strerror(errno)); + goto out; + } + vbasedev->dirty_tracking = true; + } + } + +out: + if (ret) { + vfio_devices_dma_logging_stop(container); + } + + 
vfio_device_feature_dma_logging_start_destroy(feature); + + return ret; } static void vfio_listener_log_global_start(MemoryListener *listener) { VFIOContainer *container = container_of(listener, VFIOContainer, listener); + int ret; + + if (vfio_devices_all_device_dirty_tracking(container)) { + ret = vfio_devices_dma_logging_start(container); + } else { + ret = vfio_set_dirty_page_tracking(container, true); + } - vfio_set_dirty_page_tracking(container, true); + if (ret) { + error_report("vfio: Could not start dirty page tracking, err: %d (%s)", + ret, strerror(-ret)); + vfio_set_migration_error(ret); + } } static void vfio_listener_log_global_stop(MemoryListener *listener) { VFIOContainer *container = container_of(listener, VFIOContainer, listener); + int ret = 0; + + if (vfio_devices_all_device_dirty_tracking(container)) { + vfio_devices_dma_logging_stop(container); + } else { + ret = vfio_set_dirty_page_tracking(container, false); + } - vfio_set_dirty_page_tracking(container, false); + if (ret) { + error_report("vfio: Could not stop dirty page tracking, err: %d (%s)", + ret, strerror(-ret)); + vfio_set_migration_error(ret); + } } -static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, - uint64_t size, ram_addr_t ram_addr) +static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova, + hwaddr size, void *bitmap) { - struct vfio_iommu_type1_dirty_bitmap *dbitmap; - struct vfio_iommu_type1_dirty_bitmap_get *range; - uint64_t pages; + uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) + + sizeof(struct vfio_device_feature_dma_logging_report), + sizeof(__u64))] = {}; + struct vfio_device_feature *feature = (struct vfio_device_feature *)buf; + struct vfio_device_feature_dma_logging_report *report = + (struct vfio_device_feature_dma_logging_report *)feature->data; + + report->iova = iova; + report->length = size; + report->page_size = qemu_real_host_page_size(); + report->bitmap = (__u64)(uintptr_t)bitmap; + + feature->argsz = 
sizeof(buf); + feature->flags = VFIO_DEVICE_FEATURE_GET | + VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT; + + if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) { + return -errno; + } + + return 0; +} + +static int vfio_devices_query_dirty_bitmap(VFIOContainer *container, + VFIOBitmap *vbmap, hwaddr iova, + hwaddr size) +{ + VFIODevice *vbasedev; + VFIOGroup *group; int ret; - if (!container->dirty_pages_supported) { - cpu_physical_memory_set_dirty_range(ram_addr, size, - tcg_enabled() ? DIRTY_CLIENTS_ALL : - DIRTY_CLIENTS_NOCODE); - return 0; + QLIST_FOREACH(group, &container->group_list, container_next) { + QLIST_FOREACH(vbasedev, &group->device_list, next) { + ret = vfio_device_dma_logging_report(vbasedev, iova, size, + vbmap->bitmap); + if (ret) { + error_report("%s: Failed to get DMA logging report, iova: " + "0x%" HWADDR_PRIx ", size: 0x%" HWADDR_PRIx + ", err: %d (%s)", + vbasedev->name, iova, size, ret, strerror(-ret)); + + return ret; + } + } } + return 0; +} + +static int vfio_query_dirty_bitmap(VFIOContainer *container, VFIOBitmap *vbmap, + hwaddr iova, hwaddr size) +{ + struct vfio_iommu_type1_dirty_bitmap *dbitmap; + struct vfio_iommu_type1_dirty_bitmap_get *range; + int ret; + dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range)); dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range); @@ -1323,36 +1725,63 @@ static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, * to qemu_real_host_page_size. 
*/ range->bitmap.pgsize = qemu_real_host_page_size(); - - pages = REAL_HOST_PAGE_ALIGN(range->size) / qemu_real_host_page_size(); - range->bitmap.size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) / - BITS_PER_BYTE; - range->bitmap.data = g_try_malloc0(range->bitmap.size); - if (!range->bitmap.data) { - ret = -ENOMEM; - goto err_out; - } + range->bitmap.size = vbmap->size; + range->bitmap.data = (__u64 *)vbmap->bitmap; ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap); if (ret) { + ret = -errno; error_report("Failed to get dirty bitmap for iova: 0x%"PRIx64 " size: 0x%"PRIx64" err: %d", (uint64_t)range->iova, (uint64_t)range->size, errno); - goto err_out; } - cpu_physical_memory_set_dirty_lebitmap((unsigned long *)range->bitmap.data, - ram_addr, pages); - - trace_vfio_get_dirty_bitmap(container->fd, range->iova, range->size, - range->bitmap.size, ram_addr); -err_out: - g_free(range->bitmap.data); g_free(dbitmap); return ret; } +static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, + uint64_t size, ram_addr_t ram_addr) +{ + bool all_device_dirty_tracking = + vfio_devices_all_device_dirty_tracking(container); + VFIOBitmap vbmap; + int ret; + + if (!container->dirty_pages_supported && !all_device_dirty_tracking) { + cpu_physical_memory_set_dirty_range(ram_addr, size, + tcg_enabled() ? 
DIRTY_CLIENTS_ALL : + DIRTY_CLIENTS_NOCODE); + return 0; + } + + ret = vfio_bitmap_alloc(&vbmap, size); + if (ret) { + return ret; + } + + if (all_device_dirty_tracking) { + ret = vfio_devices_query_dirty_bitmap(container, &vbmap, iova, size); + } else { + ret = vfio_query_dirty_bitmap(container, &vbmap, iova, size); + } + + if (ret) { + goto out; + } + + cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap, ram_addr, + vbmap.pages); + + trace_vfio_get_dirty_bitmap(container->fd, iova, size, vbmap.size, + ram_addr); +out: + g_free(vbmap.bitmap); + + return ret; +} + typedef struct { IOMMUNotifier n; VFIOGuestIOMMU *giommu; @@ -1366,29 +1795,33 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) VFIOContainer *container = giommu->container; hwaddr iova = iotlb->iova + giommu->iommu_offset; ram_addr_t translated_addr; + int ret = -EINVAL; trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask); if (iotlb->target_as != &address_space_memory) { error_report("Wrong target AS \"%s\", only system memory is allowed", iotlb->target_as->name ? 
iotlb->target_as->name : "none"); - return; + goto out; } rcu_read_lock(); if (vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL)) { - int ret; - ret = vfio_get_dirty_bitmap(container, iova, iotlb->addr_mask + 1, translated_addr); if (ret) { error_report("vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", " - "0x%"HWADDR_PRIx") = %d (%m)", - container, iova, - iotlb->addr_mask + 1, ret); + "0x%"HWADDR_PRIx") = %d (%s)", + container, iova, iotlb->addr_mask + 1, ret, + strerror(-ret)); } } rcu_read_unlock(); + +out: + if (ret) { + vfio_set_migration_error(ret); + } } static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section, @@ -1481,13 +1914,19 @@ static void vfio_listener_log_sync(MemoryListener *listener, MemoryRegionSection *section) { VFIOContainer *container = container_of(listener, VFIOContainer, listener); + int ret; if (vfio_listener_skipped_section(section)) { return; } if (vfio_devices_all_dirty_tracking(container)) { - vfio_sync_dirty_bitmap(container, section); + ret = vfio_sync_dirty_bitmap(container, section); + if (ret) { + error_report("vfio: Failed to sync dirty bitmap, err: %d (%s)", ret, + strerror(-ret)); + vfio_set_migration_error(ret); + } } } diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index a2c3d9bade..1a1a8659c8 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -521,7 +521,7 @@ static void vfio_migration_state_notifier(Notifier *notifier, void *data) } } -static void vfio_migration_exit(VFIODevice *vbasedev) +static void vfio_migration_free(VFIODevice *vbasedev) { g_free(vbasedev->migration); vbasedev->migration = NULL; @@ -555,6 +555,19 @@ static int vfio_migration_query_flags(VFIODevice *vbasedev, uint64_t *mig_flags) return 0; } +static bool vfio_dma_logging_supported(VFIODevice *vbasedev) +{ + uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature), + sizeof(uint64_t))] = {}; + struct vfio_device_feature *feature = (struct vfio_device_feature *)buf; + + feature->argsz = sizeof(buf); + 
feature->flags = VFIO_DEVICE_FEATURE_PROBE | + VFIO_DEVICE_FEATURE_DMA_LOGGING_START; + + return !ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature); +} + static int vfio_migration_init(VFIODevice *vbasedev) { int ret; @@ -589,6 +602,8 @@ static int vfio_migration_init(VFIODevice *vbasedev) migration->device_state = VFIO_DEVICE_STATE_RUNNING; migration->data_fd = -1; + vbasedev->dirty_pages_supported = vfio_dma_logging_supported(vbasedev); + oid = vmstate_if_get_id(VMSTATE_IF(DEVICE(obj))); if (oid) { path = g_strdup_printf("%s/vfio", oid); @@ -616,7 +631,7 @@ int64_t vfio_mig_bytes_transferred(void) return bytes_transferred; } -int vfio_migration_probe(VFIODevice *vbasedev, Error **errp) +int vfio_migration_realize(VFIODevice *vbasedev, Error **errp) { int ret = -ENOTSUP; @@ -634,6 +649,11 @@ int vfio_migration_probe(VFIODevice *vbasedev, Error **errp) return ret; } + ret = vfio_block_giommu_migration(errp); + if (ret) { + return ret; + } + trace_vfio_migration_probe(vbasedev->name); return 0; @@ -649,7 +669,7 @@ add_blocker: return ret; } -void vfio_migration_finalize(VFIODevice *vbasedev) +void vfio_migration_exit(VFIODevice *vbasedev) { if (vbasedev->migration) { VFIOMigration *migration = vbasedev->migration; @@ -657,7 +677,7 @@ void vfio_migration_finalize(VFIODevice *vbasedev) remove_migration_state_change_notifier(&migration->migration_state); qemu_del_vm_change_state_handler(migration->vm_state); unregister_savevm(VMSTATE_IF(vbasedev->dev), "vfio", vbasedev); - vfio_migration_exit(vbasedev); + vfio_migration_free(vbasedev); vfio_unblock_multiple_devices_migration(); } diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 939dcc3d4a..ec9a854361 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -3145,7 +3145,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) } if (!pdev->failover_pair_id) { - ret = vfio_migration_probe(vbasedev, errp); + ret = vfio_migration_realize(vbasedev, errp); if (ret) { error_report("%s: Migration disabled", vbasedev->name); } @@ 
-3185,6 +3185,7 @@ static void vfio_instance_finalize(Object *obj) */ vfio_put_device(vdev); vfio_put_group(group); + vfio_migration_finalize(); } static void vfio_exitfn(PCIDevice *pdev) @@ -3203,7 +3204,7 @@ static void vfio_exitfn(PCIDevice *pdev) } vfio_teardown_msi(vdev); vfio_bars_exit(vdev); - vfio_migration_finalize(&vdev->vbasedev); + vfio_migration_exit(&vdev->vbasedev); } static void vfio_pci_reset(DeviceState *dev) diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 669d9fe07c..646e42fd27 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -96,14 +96,15 @@ vfio_pci_igd_lpc_bridge_enabled(const char *name) "%s" vfio_region_write(const char *name, int index, uint64_t addr, uint64_t data, unsigned size) " (%s:region%d+0x%"PRIx64", 0x%"PRIx64 ", %d)" vfio_region_read(char *name, int index, uint64_t addr, unsigned size, uint64_t data) " (%s:region%d+0x%"PRIx64", %d) = 0x%"PRIx64 vfio_iommu_map_notify(const char *op, uint64_t iova_start, uint64_t iova_end) "iommu %s @ 0x%"PRIx64" - 0x%"PRIx64 -vfio_listener_region_add_skip(uint64_t start, uint64_t end) "SKIPPING region_add 0x%"PRIx64" - 0x%"PRIx64 +vfio_listener_region_skip(const char *name, uint64_t start, uint64_t end) "SKIPPING %s 0x%"PRIx64" - 0x%"PRIx64 vfio_spapr_group_attach(int groupfd, int tablefd) "Attached groupfd %d to liobn fd %d" vfio_listener_region_add_iommu(uint64_t start, uint64_t end) "region_add [iommu] 0x%"PRIx64" - 0x%"PRIx64 vfio_listener_region_add_ram(uint64_t iova_start, uint64_t iova_end, void *vaddr) "region_add [ram] 0x%"PRIx64" - 0x%"PRIx64" [%p]" vfio_known_safe_misalignment(const char *name, uint64_t iova, uint64_t offset_within_region, uintptr_t page_size) "Region \"%s\" iova=0x%"PRIx64" offset_within_region=0x%"PRIx64" qemu_real_host_page_size=0x%"PRIxPTR vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova, uint64_t size, uint64_t page_size) "Region \"%s\" 0x%"PRIx64" size=0x%"PRIx64" is not aligned to 0x%"PRIx64" and cannot be mapped 
for DMA" -vfio_listener_region_del_skip(uint64_t start, uint64_t end) "SKIPPING region_del 0x%"PRIx64" - 0x%"PRIx64 vfio_listener_region_del(uint64_t start, uint64_t end) "region_del 0x%"PRIx64" - 0x%"PRIx64 +vfio_device_dirty_tracking_update(uint64_t start, uint64_t end, uint64_t min, uint64_t max) "section 0x%"PRIx64" - 0x%"PRIx64" -> update [0x%"PRIx64" - 0x%"PRIx64"]" +vfio_device_dirty_tracking_start(int nr_ranges, uint64_t min32, uint64_t max32, uint64_t min64, uint64_t max64) "nr_ranges %d 32:[0x%"PRIx64" - 0x%"PRIx64"], 64:[0x%"PRIx64" - 0x%"PRIx64"]" vfio_disconnect_container(int fd) "close container->fd=%d" vfio_put_group(int fd) "close group->fd=%d" vfio_get_device(const char * name, unsigned int flags, unsigned int num_regions, unsigned int num_irqs) "Device %s flags: %u, regions: %u, irqs: %u" @@ -117,7 +118,7 @@ vfio_region_mmaps_set_enabled(const char *name, bool enabled) "Region %s mmaps e vfio_region_unmap(const char *name, unsigned long offset, unsigned long end) "Region %s unmap [0x%lx - 0x%lx]" vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Device %s region %d: %d sparse mmap entries" vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]" -vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%0x8" +vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%08x" vfio_dma_unmap_overflow_workaround(void) "" vfio_get_dirty_bitmap(int fd, uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start) "container fd=%d, iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64 vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64 diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events index a87c5f39a2..8f8d05cf9b 100644 --- a/hw/virtio/trace-events +++ b/hw/virtio/trace-events @@ -50,6 +50,7 @@ 
vhost_vdpa_set_vring_ready(void *dev) "dev: %p" vhost_vdpa_dump_config(void *dev, const char *line) "dev: %p %s" vhost_vdpa_set_config(void *dev, uint32_t offset, uint32_t size, uint32_t flags) "dev: %p offset: %"PRIu32" size: %"PRIu32" flags: 0x%"PRIx32 vhost_vdpa_get_config(void *dev, void *config, uint32_t config_len) "dev: %p config: %p config_len: %"PRIu32 +vhost_vdpa_suspend(void *dev) "dev: %p" vhost_vdpa_dev_start(void *dev, bool started) "dev: %p started: %d" vhost_vdpa_set_log_base(void *dev, uint64_t base, unsigned long long size, int refcnt, int fd, void *log) "dev: %p base: 0x%"PRIx64" size: %llu refcnt: %d fd: %d log: %p" vhost_vdpa_set_vring_addr(void *dev, unsigned int index, unsigned int flags, uint64_t desc_user_addr, uint64_t used_user_addr, uint64_t avail_user_addr, uint64_t log_guest_addr) "dev: %p index: %u flags: 0x%x desc_user_addr: 0x%"PRIx64" used_user_addr: 0x%"PRIx64" avail_user_addr: 0x%"PRIx64" log_guest_addr: 0x%"PRIx64 diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c index 515ccf870d..8361e70d1b 100644 --- a/hw/virtio/vhost-shadow-virtqueue.c +++ b/hw/virtio/vhost-shadow-virtqueue.c @@ -694,13 +694,17 @@ void vhost_svq_stop(VhostShadowVirtqueue *svq) g_autofree VirtQueueElement *elem = NULL; elem = g_steal_pointer(&svq->desc_state[i].elem); if (elem) { - virtqueue_detach_element(svq->vq, elem, 0); + /* + * TODO: This is ok for networking, but other kinds of devices + * might have problems with just unpop these. 
+ */ + virtqueue_unpop(svq->vq, elem, 0); } } next_avail_elem = g_steal_pointer(&svq->next_guest_avail_elem); if (next_avail_elem) { - virtqueue_detach_element(svq->vq, next_avail_elem, 0); + virtqueue_unpop(svq->vq, next_avail_elem, 0); } svq->vq = NULL; g_free(svq->desc_next); diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index 8968541514..e5285df4ba 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -2031,8 +2031,8 @@ static int vhost_user_backend_init(struct vhost_dev *dev, void *opaque, } else { if (virtio_has_feature(protocol_features, VHOST_USER_PROTOCOL_F_CONFIG)) { - warn_reportf_err(*errp, "vhost-user backend supports " - "VHOST_USER_PROTOCOL_F_CONFIG but QEMU does not."); + warn_report("vhost-user backend supports " + "VHOST_USER_PROTOCOL_F_CONFIG but QEMU does not."); protocol_features &= ~(1ULL << VHOST_USER_PROTOCOL_F_CONFIG); } } diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index df3a1e92ac..bc6bad23d5 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -431,16 +431,6 @@ static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp) trace_vhost_vdpa_init(dev, opaque); int ret; - /* - * Similar to VFIO, we end up pinning all guest memory and have to - * disable discarding of RAM. 
- */ - ret = ram_block_discard_disable(true); - if (ret) { - error_report("Cannot set discarding of RAM broken"); - return ret; - } - v = opaque; v->dev = dev; dev->opaque = opaque ; @@ -448,10 +438,36 @@ static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp) v->msg_type = VHOST_IOTLB_MSG_V2; vhost_vdpa_init_svq(dev, v); + error_propagate(&dev->migration_blocker, v->migration_blocker); if (!vhost_vdpa_first_dev(dev)) { return 0; } + /* + * If dev->shadow_vqs_enabled at initialization that means the device has + * been started with x-svq=on, so don't block migration + */ + if (dev->migration_blocker == NULL && !v->shadow_vqs_enabled) { + /* We don't have dev->features yet */ + uint64_t features; + ret = vhost_vdpa_get_dev_features(dev, &features); + if (unlikely(ret)) { + error_setg_errno(errp, -ret, "Could not get device features"); + return ret; + } + vhost_svq_valid_features(features, &dev->migration_blocker); + } + + /* + * Similar to VFIO, we end up pinning all guest memory and have to + * disable discarding of RAM. 
+ */ + ret = ram_block_discard_disable(true); + if (ret) { + error_report("Cannot set discarding of RAM broken"); + return ret; + } + vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE | VIRTIO_CONFIG_S_DRIVER); @@ -577,12 +593,15 @@ static int vhost_vdpa_cleanup(struct vhost_dev *dev) assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA); v = dev->opaque; trace_vhost_vdpa_cleanup(dev, v); + if (vhost_vdpa_first_dev(dev)) { + ram_block_discard_disable(false); + } + vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs); memory_listener_unregister(&v->listener); vhost_vdpa_svq_cleanup(dev); dev->opaque = NULL; - ram_block_discard_disable(false); return 0; } @@ -659,7 +678,8 @@ static int vhost_vdpa_set_backend_cap(struct vhost_dev *dev) uint64_t features; uint64_t f = 0x1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2 | 0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH | - 0x1ULL << VHOST_BACKEND_F_IOTLB_ASID; + 0x1ULL << VHOST_BACKEND_F_IOTLB_ASID | + 0x1ULL << VHOST_BACKEND_F_SUSPEND; int r; if (vhost_vdpa_call(dev, VHOST_GET_BACKEND_FEATURES, &features)) { @@ -691,11 +711,13 @@ static int vhost_vdpa_get_device_id(struct vhost_dev *dev, static int vhost_vdpa_reset_device(struct vhost_dev *dev) { + struct vhost_vdpa *v = dev->opaque; int ret; uint8_t status = 0; ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status); trace_vhost_vdpa_reset_device(dev, status); + v->suspended = false; return ret; } @@ -1094,6 +1116,29 @@ static void vhost_vdpa_svqs_stop(struct vhost_dev *dev) } } +static void vhost_vdpa_suspend(struct vhost_dev *dev) +{ + struct vhost_vdpa *v = dev->opaque; + int r; + + if (!vhost_vdpa_first_dev(dev)) { + return; + } + + if (dev->backend_cap & BIT_ULL(VHOST_BACKEND_F_SUSPEND)) { + trace_vhost_vdpa_suspend(dev); + r = ioctl(v->device_fd, VHOST_VDPA_SUSPEND); + if (unlikely(r)) { + error_report("Cannot suspend: %s(%d)", g_strerror(errno), errno); + } else { + v->suspended = true; + return; + } + } + + vhost_vdpa_reset_device(dev); +} + static int 
vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) { struct vhost_vdpa *v = dev->opaque; @@ -1108,6 +1153,7 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) } vhost_vdpa_set_vring_ready(dev); } else { + vhost_vdpa_suspend(dev); vhost_vdpa_svqs_stop(dev); vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs); } @@ -1119,14 +1165,23 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) if (started) { memory_listener_register(&v->listener, &address_space_memory); return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK); - } else { - vhost_vdpa_reset_device(dev); - vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE | - VIRTIO_CONFIG_S_DRIVER); - memory_listener_unregister(&v->listener); + } - return 0; + return 0; +} + +static void vhost_vdpa_reset_status(struct vhost_dev *dev) +{ + struct vhost_vdpa *v = dev->opaque; + + if (dev->vq_index + dev->nvqs != dev->vq_index_end) { + return; } + + vhost_vdpa_reset_device(dev); + vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE | + VIRTIO_CONFIG_S_DRIVER); + memory_listener_unregister(&v->listener); } static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base, @@ -1169,18 +1224,7 @@ static int vhost_vdpa_set_vring_base(struct vhost_dev *dev, struct vhost_vring_state *ring) { struct vhost_vdpa *v = dev->opaque; - VirtQueue *vq = virtio_get_queue(dev->vdev, ring->index); - /* - * vhost-vdpa devices does not support in-flight requests. Set all of them - * as available. - * - * TODO: This is ok for networking, but other kinds of devices might - * have problems with these retransmissions. - */ - while (virtqueue_rewind(vq, 1)) { - continue; - } if (v->shadow_vqs_enabled) { /* * Device vring base was set at device start. SVQ base is handled by @@ -1203,6 +1247,14 @@ static int vhost_vdpa_get_vring_base(struct vhost_dev *dev, return 0; } + if (!v->suspended) { + /* + * Cannot trust in value returned by device, let vhost recover used + * idx from guest. 
+ */ + return -1; + } + ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring); trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num); return ret; @@ -1227,25 +1279,24 @@ static int vhost_vdpa_set_vring_call(struct vhost_dev *dev, struct vhost_vring_file *file) { struct vhost_vdpa *v = dev->opaque; + int vdpa_idx = file->index - dev->vq_index; + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx); + /* Remember last call fd because we can switch to SVQ anytime. */ + vhost_svq_set_svq_call_fd(svq, file->fd); if (v->shadow_vqs_enabled) { - int vdpa_idx = file->index - dev->vq_index; - VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx); - - vhost_svq_set_svq_call_fd(svq, file->fd); return 0; - } else { - return vhost_vdpa_set_vring_dev_call(dev, file); } + + return vhost_vdpa_set_vring_dev_call(dev, file); } static int vhost_vdpa_get_features(struct vhost_dev *dev, uint64_t *features) { - struct vhost_vdpa *v = dev->opaque; int ret = vhost_vdpa_get_dev_features(dev, features); - if (ret == 0 && v->shadow_vqs_enabled) { + if (ret == 0) { /* Add SVQ logging capabilities */ *features |= BIT_ULL(VHOST_F_LOG_ALL); } @@ -1313,4 +1364,5 @@ const VhostOps vdpa_ops = { .vhost_vq_get_addr = vhost_vdpa_vq_get_addr, .vhost_force_iommu = vhost_vdpa_force_iommu, .vhost_set_config_call = vhost_vdpa_set_config_call, + .vhost_reset_status = vhost_vdpa_reset_status, }; diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index eb8c4c378c..a266396576 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -2049,6 +2049,9 @@ void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings) hdev->vqs + i, hdev->vq_index + i); } + if (hdev->vhost_ops->vhost_reset_status) { + hdev->vhost_ops->vhost_reset_status(hdev); + } if (vhost_dev_has_iommu(hdev)) { if (hdev->vhost_ops->vhost_set_iotlb_callback) { diff --git a/hw/virtio/virtio-crypto.c b/hw/virtio/virtio-crypto.c index 516425e26a..802e1b9659 100644 --- a/hw/virtio/virtio-crypto.c 
+++ b/hw/virtio/virtio-crypto.c @@ -462,7 +462,7 @@ static void virtio_crypto_init_request(VirtIOCrypto *vcrypto, VirtQueue *vq, req->in_iov = NULL; req->in_num = 0; req->in_len = 0; - req->flags = CRYPTODEV_BACKEND_ALG__MAX; + req->flags = QCRYPTODEV_BACKEND_ALG__MAX; memset(&req->op_info, 0x00, sizeof(req->op_info)); } @@ -472,7 +472,7 @@ static void virtio_crypto_free_request(VirtIOCryptoReq *req) return; } - if (req->flags == CRYPTODEV_BACKEND_ALG_SYM) { + if (req->flags == QCRYPTODEV_BACKEND_ALG_SYM) { size_t max_len; CryptoDevBackendSymOpInfo *op_info = req->op_info.u.sym_op_info; @@ -485,7 +485,7 @@ static void virtio_crypto_free_request(VirtIOCryptoReq *req) /* Zeroize and free request data structure */ memset(op_info, 0, sizeof(*op_info) + max_len); g_free(op_info); - } else if (req->flags == CRYPTODEV_BACKEND_ALG_ASYM) { + } else if (req->flags == QCRYPTODEV_BACKEND_ALG_ASYM) { CryptoDevBackendAsymOpInfo *op_info = req->op_info.u.asym_op_info; if (op_info) { g_free(op_info->src); @@ -570,10 +570,10 @@ static void virtio_crypto_req_complete(void *opaque, int ret) VirtIODevice *vdev = VIRTIO_DEVICE(vcrypto); uint8_t status = -ret; - if (req->flags == CRYPTODEV_BACKEND_ALG_SYM) { + if (req->flags == QCRYPTODEV_BACKEND_ALG_SYM) { virtio_crypto_sym_input_data_helper(vdev, req, status, req->op_info.u.sym_op_info); - } else if (req->flags == CRYPTODEV_BACKEND_ALG_ASYM) { + } else if (req->flags == QCRYPTODEV_BACKEND_ALG_ASYM) { virtio_crypto_akcipher_input_data_helper(vdev, req, status, req->op_info.u.asym_op_info); } @@ -871,11 +871,14 @@ virtio_crypto_handle_request(VirtIOCryptoReq *request) opcode = ldl_le_p(&req.header.opcode); op_info->session_id = ldq_le_p(&req.header.session_id); op_info->op_code = opcode; + op_info->queue_index = queue_index; + op_info->cb = virtio_crypto_req_complete; + op_info->opaque = request; switch (opcode) { case VIRTIO_CRYPTO_CIPHER_ENCRYPT: case VIRTIO_CRYPTO_CIPHER_DECRYPT: - op_info->algtype = request->flags = 
CRYPTODEV_BACKEND_ALG_SYM; + op_info->algtype = request->flags = QCRYPTODEV_BACKEND_ALG_SYM; ret = virtio_crypto_handle_sym_req(vcrypto, &req.u.sym_req, op_info, out_iov, out_num); @@ -885,7 +888,7 @@ virtio_crypto_handle_request(VirtIOCryptoReq *request) case VIRTIO_CRYPTO_AKCIPHER_DECRYPT: case VIRTIO_CRYPTO_AKCIPHER_SIGN: case VIRTIO_CRYPTO_AKCIPHER_VERIFY: - op_info->algtype = request->flags = CRYPTODEV_BACKEND_ALG_ASYM; + op_info->algtype = request->flags = QCRYPTODEV_BACKEND_ALG_ASYM; ret = virtio_crypto_handle_asym_req(vcrypto, &req.u.akcipher_req, op_info, out_iov, out_num); @@ -898,9 +901,7 @@ check_result: virtio_crypto_req_complete(request, -VIRTIO_CRYPTO_NOTSUPP); } else { ret = cryptodev_backend_crypto_operation(vcrypto->cryptodev, - request, queue_index, - virtio_crypto_req_complete, - request); + op_info); if (ret < 0) { virtio_crypto_req_complete(request, ret); } @@ -997,12 +998,35 @@ static void virtio_crypto_reset(VirtIODevice *vdev) } } +static uint32_t virtio_crypto_init_services(uint32_t qservices) +{ + uint32_t vservices = 0; + + if (qservices & (1 << QCRYPTODEV_BACKEND_SERVICE_CIPHER)) { + vservices |= (1 << VIRTIO_CRYPTO_SERVICE_CIPHER); + } + if (qservices & (1 << QCRYPTODEV_BACKEND_SERVICE_HASH)) { + vservices |= (1 << VIRTIO_CRYPTO_SERVICE_HASH); + } + if (qservices & (1 << QCRYPTODEV_BACKEND_SERVICE_MAC)) { + vservices |= (1 << VIRTIO_CRYPTO_SERVICE_MAC); + } + if (qservices & (1 << QCRYPTODEV_BACKEND_SERVICE_AEAD)) { + vservices |= (1 << VIRTIO_CRYPTO_SERVICE_AEAD); + } + if (qservices & (1 << QCRYPTODEV_BACKEND_SERVICE_AKCIPHER)) { + vservices |= (1 << VIRTIO_CRYPTO_SERVICE_AKCIPHER); + } + + return vservices; +} + static void virtio_crypto_init_config(VirtIODevice *vdev) { VirtIOCrypto *vcrypto = VIRTIO_CRYPTO(vdev); - vcrypto->conf.crypto_services = - vcrypto->conf.cryptodev->conf.crypto_services; + vcrypto->conf.crypto_services = virtio_crypto_init_services( + vcrypto->conf.cryptodev->conf.crypto_services); 
vcrypto->conf.cipher_algo_l = vcrypto->conf.cryptodev->conf.cipher_algo_l; vcrypto->conf.cipher_algo_h = diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index f35178f5fc..98c4819fcc 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -1069,7 +1069,7 @@ static void virtqueue_split_get_avail_bytes(VirtQueue *vq, VRingMemoryRegionCaches *caches) { VirtIODevice *vdev = vq->vdev; - unsigned int max, idx; + unsigned int idx; unsigned int total_bufs, in_total, out_total; MemoryRegionCache indirect_desc_cache = MEMORY_REGION_CACHE_INVALID; int64_t len = 0; @@ -1078,13 +1078,12 @@ static void virtqueue_split_get_avail_bytes(VirtQueue *vq, idx = vq->last_avail_idx; total_bufs = in_total = out_total = 0; - max = vq->vring.num; - while ((rc = virtqueue_num_heads(vq, idx)) > 0) { MemoryRegionCache *desc_cache = &caches->desc; unsigned int num_bufs; VRingDesc desc; unsigned int i; + unsigned int max = vq->vring.num; num_bufs = total_bufs; @@ -1206,7 +1205,7 @@ static void virtqueue_packed_get_avail_bytes(VirtQueue *vq, VRingMemoryRegionCaches *caches) { VirtIODevice *vdev = vq->vdev; - unsigned int max, idx; + unsigned int idx; unsigned int total_bufs, in_total, out_total; MemoryRegionCache *desc_cache; MemoryRegionCache indirect_desc_cache = MEMORY_REGION_CACHE_INVALID; @@ -1218,14 +1217,14 @@ static void virtqueue_packed_get_avail_bytes(VirtQueue *vq, wrap_counter = vq->last_avail_wrap_counter; total_bufs = in_total = out_total = 0; - max = vq->vring.num; - for (;;) { unsigned int num_bufs = total_bufs; unsigned int i = idx; int rc; + unsigned int max = vq->vring.num; desc_cache = &caches->desc; + vring_packed_desc_read(vdev, &desc, desc_cache, idx, true); if (!is_desc_avail(desc.flags, wrap_counter)) { break; diff --git a/hw/xen/meson.build b/hw/xen/meson.build index ae0ace3046..19c6aabc7c 100644 --- a/hw/xen/meson.build +++ b/hw/xen/meson.build @@ -1,4 +1,4 @@ -softmmu_ss.add(when: ['CONFIG_XEN', xen], if_true: files( +softmmu_ss.add(when: ['CONFIG_XEN_BUS'], 
if_true: files( 'xen-backend.c', 'xen-bus-helper.c', 'xen-bus.c', @@ -7,6 +7,10 @@ softmmu_ss.add(when: ['CONFIG_XEN', xen], if_true: files( 'xen_pvdev.c', )) +softmmu_ss.add(when: ['CONFIG_XEN', xen], if_true: files( + 'xen-operations.c', +)) + xen_specific_ss = ss.source_set() if have_xen_pci_passthrough xen_specific_ss.add(files( diff --git a/hw/xen/trace-events b/hw/xen/trace-events index 3da3fd8348..55c9e1df68 100644 --- a/hw/xen/trace-events +++ b/hw/xen/trace-events @@ -1,6 +1,6 @@ # See docs/devel/tracing.rst for syntax documentation. -# ../../include/hw/xen/xen_common.h +# ../../include/hw/xen/xen_native.h xen_default_ioreq_server(void) "" xen_ioreq_server_create(uint32_t id) "id: %u" xen_ioreq_server_destroy(uint32_t id) "id: %u" diff --git a/hw/xen/xen-bus-helper.c b/hw/xen/xen-bus-helper.c index 5a1e12b374..b2b2cc9c5d 100644 --- a/hw/xen/xen-bus-helper.c +++ b/hw/xen/xen-bus-helper.c @@ -10,6 +10,7 @@ #include "hw/xen/xen-bus.h" #include "hw/xen/xen-bus-helper.h" #include "qapi/error.h" +#include "trace.h" #include <glib/gprintf.h> @@ -46,34 +47,28 @@ const char *xs_strstate(enum xenbus_state state) return "INVALID"; } -void xs_node_create(struct xs_handle *xsh, xs_transaction_t tid, - const char *node, struct xs_permissions perms[], - unsigned int nr_perms, Error **errp) +void xs_node_create(struct qemu_xs_handle *h, xs_transaction_t tid, + const char *node, unsigned int owner, unsigned int domid, + unsigned int perms, Error **errp) { trace_xs_node_create(node); - if (!xs_write(xsh, tid, node, "", 0)) { + if (!qemu_xen_xs_create(h, tid, owner, domid, perms, node)) { error_setg_errno(errp, errno, "failed to create node '%s'", node); - return; - } - - if (!xs_set_permissions(xsh, tid, node, perms, nr_perms)) { - error_setg_errno(errp, errno, "failed to set node '%s' permissions", - node); } } -void xs_node_destroy(struct xs_handle *xsh, xs_transaction_t tid, +void xs_node_destroy(struct qemu_xs_handle *h, xs_transaction_t tid, const char *node, Error 
**errp) { trace_xs_node_destroy(node); - if (!xs_rm(xsh, tid, node)) { + if (!qemu_xen_xs_destroy(h, tid, node)) { error_setg_errno(errp, errno, "failed to destroy node '%s'", node); } } -void xs_node_vprintf(struct xs_handle *xsh, xs_transaction_t tid, +void xs_node_vprintf(struct qemu_xs_handle *h, xs_transaction_t tid, const char *node, const char *key, Error **errp, const char *fmt, va_list ap) { @@ -86,7 +81,7 @@ void xs_node_vprintf(struct xs_handle *xsh, xs_transaction_t tid, trace_xs_node_vprintf(path, value); - if (!xs_write(xsh, tid, path, value, len)) { + if (!qemu_xen_xs_write(h, tid, path, value, len)) { error_setg_errno(errp, errno, "failed to write '%s' to '%s'", value, path); } @@ -95,18 +90,18 @@ void xs_node_vprintf(struct xs_handle *xsh, xs_transaction_t tid, g_free(path); } -void xs_node_printf(struct xs_handle *xsh, xs_transaction_t tid, +void xs_node_printf(struct qemu_xs_handle *h, xs_transaction_t tid, const char *node, const char *key, Error **errp, const char *fmt, ...) { va_list ap; va_start(ap, fmt); - xs_node_vprintf(xsh, tid, node, key, errp, fmt, ap); + xs_node_vprintf(h, tid, node, key, errp, fmt, ap); va_end(ap); } -int xs_node_vscanf(struct xs_handle *xsh, xs_transaction_t tid, +int xs_node_vscanf(struct qemu_xs_handle *h, xs_transaction_t tid, const char *node, const char *key, Error **errp, const char *fmt, va_list ap) { @@ -115,7 +110,7 @@ int xs_node_vscanf(struct xs_handle *xsh, xs_transaction_t tid, path = (strlen(node) != 0) ? g_strdup_printf("%s/%s", node, key) : g_strdup(key); - value = xs_read(xsh, tid, path, NULL); + value = qemu_xen_xs_read(h, tid, path, NULL); trace_xs_node_vscanf(path, value); @@ -133,7 +128,7 @@ int xs_node_vscanf(struct xs_handle *xsh, xs_transaction_t tid, return rc; } -int xs_node_scanf(struct xs_handle *xsh, xs_transaction_t tid, +int xs_node_scanf(struct qemu_xs_handle *h, xs_transaction_t tid, const char *node, const char *key, Error **errp, const char *fmt, ...) 
{ @@ -141,42 +136,35 @@ int xs_node_scanf(struct xs_handle *xsh, xs_transaction_t tid, int rc; va_start(ap, fmt); - rc = xs_node_vscanf(xsh, tid, node, key, errp, fmt, ap); + rc = xs_node_vscanf(h, tid, node, key, errp, fmt, ap); va_end(ap); return rc; } -void xs_node_watch(struct xs_handle *xsh, const char *node, const char *key, - char *token, Error **errp) +struct qemu_xs_watch *xs_node_watch(struct qemu_xs_handle *h, const char *node, + const char *key, xs_watch_fn fn, + void *opaque, Error **errp) { char *path; + struct qemu_xs_watch *w; path = (strlen(node) != 0) ? g_strdup_printf("%s/%s", node, key) : g_strdup(key); trace_xs_node_watch(path); - if (!xs_watch(xsh, path, token)) { + w = qemu_xen_xs_watch(h, path, fn, opaque); + if (!w) { error_setg_errno(errp, errno, "failed to watch node '%s'", path); } g_free(path); + + return w; } -void xs_node_unwatch(struct xs_handle *xsh, const char *node, - const char *key, const char *token, Error **errp) +void xs_node_unwatch(struct qemu_xs_handle *h, struct qemu_xs_watch *w) { - char *path; - - path = (strlen(node) != 0) ? 
g_strdup_printf("%s/%s", node, key) : - g_strdup(key); - - trace_xs_node_unwatch(path); - - if (!xs_unwatch(xsh, path, token)) { - error_setg_errno(errp, errno, "failed to unwatch node '%s'", path); - } - - g_free(path); + qemu_xen_xs_unwatch(h, w); } diff --git a/hw/xen/xen-bus.c b/hw/xen/xen-bus.c index df3f6b9ae0..c59850b1de 100644 --- a/hw/xen/xen-bus.c +++ b/hw/xen/xen-bus.c @@ -62,7 +62,7 @@ static void xen_device_unplug(XenDevice *xendev, Error **errp) /* Mimic the way the Xen toolstack does an unplug */ again: - tid = xs_transaction_start(xenbus->xsh); + tid = qemu_xen_xs_transaction_start(xenbus->xsh); if (tid == XBT_NULL) { error_setg_errno(errp, errno, "failed xs_transaction_start"); return; @@ -80,7 +80,7 @@ again: goto abort; } - if (!xs_transaction_end(xenbus->xsh, tid, false)) { + if (!qemu_xen_xs_transaction_end(xenbus->xsh, tid, false)) { if (errno == EAGAIN) { goto again; } @@ -95,7 +95,7 @@ abort: * We only abort if there is already a failure so ignore any error * from ending the transaction. 
*/ - xs_transaction_end(xenbus->xsh, tid, true); + qemu_xen_xs_transaction_end(xenbus->xsh, tid, true); } static void xen_bus_print_dev(Monitor *mon, DeviceState *dev, int indent) @@ -111,143 +111,6 @@ static char *xen_bus_get_dev_path(DeviceState *dev) return xen_device_get_backend_path(XEN_DEVICE(dev)); } -struct XenWatch { - char *node, *key; - char *token; - XenWatchHandler handler; - void *opaque; - Notifier notifier; -}; - -static void watch_notify(Notifier *n, void *data) -{ - XenWatch *watch = container_of(n, XenWatch, notifier); - const char *token = data; - - if (!strcmp(watch->token, token)) { - watch->handler(watch->opaque); - } -} - -static XenWatch *new_watch(const char *node, const char *key, - XenWatchHandler handler, void *opaque) -{ - XenWatch *watch = g_new0(XenWatch, 1); - QemuUUID uuid; - - qemu_uuid_generate(&uuid); - - watch->token = qemu_uuid_unparse_strdup(&uuid); - watch->node = g_strdup(node); - watch->key = g_strdup(key); - watch->handler = handler; - watch->opaque = opaque; - watch->notifier.notify = watch_notify; - - return watch; -} - -static void free_watch(XenWatch *watch) -{ - g_free(watch->token); - g_free(watch->key); - g_free(watch->node); - - g_free(watch); -} - -struct XenWatchList { - struct xs_handle *xsh; - NotifierList notifiers; -}; - -static void watch_list_event(void *opaque) -{ - XenWatchList *watch_list = opaque; - char **v; - const char *token; - - v = xs_check_watch(watch_list->xsh); - if (!v) { - return; - } - - token = v[XS_WATCH_TOKEN]; - - notifier_list_notify(&watch_list->notifiers, (void *)token); - - free(v); -} - -static XenWatchList *watch_list_create(struct xs_handle *xsh) -{ - XenWatchList *watch_list = g_new0(XenWatchList, 1); - - g_assert(xsh); - - watch_list->xsh = xsh; - notifier_list_init(&watch_list->notifiers); - qemu_set_fd_handler(xs_fileno(watch_list->xsh), watch_list_event, NULL, - watch_list); - - return watch_list; -} - -static void watch_list_destroy(XenWatchList *watch_list) -{ - 
g_assert(notifier_list_empty(&watch_list->notifiers)); - qemu_set_fd_handler(xs_fileno(watch_list->xsh), NULL, NULL, NULL); - g_free(watch_list); -} - -static XenWatch *watch_list_add(XenWatchList *watch_list, const char *node, - const char *key, XenWatchHandler handler, - void *opaque, Error **errp) -{ - ERRP_GUARD(); - XenWatch *watch = new_watch(node, key, handler, opaque); - - notifier_list_add(&watch_list->notifiers, &watch->notifier); - - xs_node_watch(watch_list->xsh, node, key, watch->token, errp); - if (*errp) { - notifier_remove(&watch->notifier); - free_watch(watch); - - return NULL; - } - - return watch; -} - -static void watch_list_remove(XenWatchList *watch_list, XenWatch *watch, - Error **errp) -{ - xs_node_unwatch(watch_list->xsh, watch->node, watch->key, watch->token, - errp); - - notifier_remove(&watch->notifier); - free_watch(watch); -} - -static XenWatch *xen_bus_add_watch(XenBus *xenbus, const char *node, - const char *key, XenWatchHandler handler, - Error **errp) -{ - trace_xen_bus_add_watch(node, key); - - return watch_list_add(xenbus->watch_list, node, key, handler, xenbus, - errp); -} - -static void xen_bus_remove_watch(XenBus *xenbus, XenWatch *watch, - Error **errp) -{ - trace_xen_bus_remove_watch(watch->node, watch->key); - - watch_list_remove(xenbus->watch_list, watch, errp); -} - static void xen_bus_backend_create(XenBus *xenbus, const char *type, const char *name, char *path, Error **errp) @@ -261,15 +124,15 @@ static void xen_bus_backend_create(XenBus *xenbus, const char *type, trace_xen_bus_backend_create(type, path); again: - tid = xs_transaction_start(xenbus->xsh); + tid = qemu_xen_xs_transaction_start(xenbus->xsh); if (tid == XBT_NULL) { error_setg(errp, "failed xs_transaction_start"); return; } - key = xs_directory(xenbus->xsh, tid, path, &n); + key = qemu_xen_xs_directory(xenbus->xsh, tid, path, &n); if (!key) { - if (!xs_transaction_end(xenbus->xsh, tid, true)) { + if (!qemu_xen_xs_transaction_end(xenbus->xsh, tid, true)) { 
error_setg_errno(errp, errno, "failed xs_transaction_end"); } return; @@ -300,7 +163,7 @@ again: free(key); - if (!xs_transaction_end(xenbus->xsh, tid, false)) { + if (!qemu_xen_xs_transaction_end(xenbus->xsh, tid, false)) { qobject_unref(opts); if (errno == EAGAIN) { @@ -327,7 +190,7 @@ static void xen_bus_type_enumerate(XenBus *xenbus, const char *type) trace_xen_bus_type_enumerate(type); - backend = xs_directory(xenbus->xsh, XBT_NULL, domain_path, &n); + backend = qemu_xen_xs_directory(xenbus->xsh, XBT_NULL, domain_path, &n); if (!backend) { goto out; } @@ -372,7 +235,7 @@ static void xen_bus_enumerate(XenBus *xenbus) trace_xen_bus_enumerate(); - type = xs_directory(xenbus->xsh, XBT_NULL, "backend", &n); + type = qemu_xen_xs_directory(xenbus->xsh, XBT_NULL, "backend", &n); if (!type) { return; } @@ -415,7 +278,7 @@ static void xen_bus_cleanup(XenBus *xenbus) } } -static void xen_bus_backend_changed(void *opaque) +static void xen_bus_backend_changed(void *opaque, const char *path) { XenBus *xenbus = opaque; @@ -434,7 +297,7 @@ static void xen_bus_unrealize(BusState *bus) for (i = 0; i < xenbus->backend_types; i++) { if (xenbus->backend_watch[i]) { - xen_bus_remove_watch(xenbus, xenbus->backend_watch[i], NULL); + xs_node_unwatch(xenbus->xsh, xenbus->backend_watch[i]); } } @@ -442,13 +305,8 @@ static void xen_bus_unrealize(BusState *bus) xenbus->backend_watch = NULL; } - if (xenbus->watch_list) { - watch_list_destroy(xenbus->watch_list); - xenbus->watch_list = NULL; - } - if (xenbus->xsh) { - xs_close(xenbus->xsh); + qemu_xen_xs_close(xenbus->xsh); } } @@ -463,7 +321,7 @@ static void xen_bus_realize(BusState *bus, Error **errp) trace_xen_bus_realize(); - xenbus->xsh = xs_open(0); + xenbus->xsh = qemu_xen_xs_open(); if (!xenbus->xsh) { error_setg_errno(errp, errno, "failed xs_open"); goto fail; @@ -476,19 +334,18 @@ static void xen_bus_realize(BusState *bus, Error **errp) xenbus->backend_id = 0; /* Assume lack of node means dom0 */ } - xenbus->watch_list = 
watch_list_create(xenbus->xsh); - module_call_init(MODULE_INIT_XEN_BACKEND); type = xen_backend_get_types(&xenbus->backend_types); - xenbus->backend_watch = g_new(XenWatch *, xenbus->backend_types); + xenbus->backend_watch = g_new(struct qemu_xs_watch *, + xenbus->backend_types); for (i = 0; i < xenbus->backend_types; i++) { char *node = g_strdup_printf("backend/%s", type[i]); xenbus->backend_watch[i] = - xen_bus_add_watch(xenbus, node, key, xen_bus_backend_changed, - &local_err); + xs_node_watch(xenbus->xsh, node, key, xen_bus_backend_changed, + xenbus, &local_err); if (local_err) { /* This need not be treated as a hard error so don't propagate */ error_reportf_err(local_err, @@ -631,7 +488,7 @@ static bool xen_device_frontend_is_active(XenDevice *xendev) } } -static void xen_device_backend_changed(void *opaque) +static void xen_device_backend_changed(void *opaque, const char *path) { XenDevice *xendev = opaque; const char *type = object_get_typename(OBJECT(xendev)); @@ -685,66 +542,35 @@ static void xen_device_backend_changed(void *opaque) } } -static XenWatch *xen_device_add_watch(XenDevice *xendev, const char *node, - const char *key, - XenWatchHandler handler, - Error **errp) -{ - const char *type = object_get_typename(OBJECT(xendev)); - - trace_xen_device_add_watch(type, xendev->name, node, key); - - return watch_list_add(xendev->watch_list, node, key, handler, xendev, - errp); -} - -static void xen_device_remove_watch(XenDevice *xendev, XenWatch *watch, - Error **errp) -{ - const char *type = object_get_typename(OBJECT(xendev)); - - trace_xen_device_remove_watch(type, xendev->name, watch->node, - watch->key); - - watch_list_remove(xendev->watch_list, watch, errp); -} - - static void xen_device_backend_create(XenDevice *xendev, Error **errp) { ERRP_GUARD(); XenBus *xenbus = XEN_BUS(qdev_get_parent_bus(DEVICE(xendev))); - struct xs_permissions perms[2]; xendev->backend_path = xen_device_get_backend_path(xendev); - perms[0].id = xenbus->backend_id; - 
perms[0].perms = XS_PERM_NONE; - perms[1].id = xendev->frontend_id; - perms[1].perms = XS_PERM_READ; - g_assert(xenbus->xsh); - xs_node_create(xenbus->xsh, XBT_NULL, xendev->backend_path, perms, - ARRAY_SIZE(perms), errp); + xs_node_create(xenbus->xsh, XBT_NULL, xendev->backend_path, + xenbus->backend_id, xendev->frontend_id, XS_PERM_READ, errp); if (*errp) { error_prepend(errp, "failed to create backend: "); return; } xendev->backend_state_watch = - xen_device_add_watch(xendev, xendev->backend_path, - "state", xen_device_backend_changed, - errp); + xs_node_watch(xendev->xsh, xendev->backend_path, + "state", xen_device_backend_changed, xendev, + errp); if (*errp) { error_prepend(errp, "failed to watch backend state: "); return; } xendev->backend_online_watch = - xen_device_add_watch(xendev, xendev->backend_path, - "online", xen_device_backend_changed, - errp); + xs_node_watch(xendev->xsh, xendev->backend_path, + "online", xen_device_backend_changed, xendev, + errp); if (*errp) { error_prepend(errp, "failed to watch backend online: "); return; @@ -757,12 +583,12 @@ static void xen_device_backend_destroy(XenDevice *xendev) Error *local_err = NULL; if (xendev->backend_online_watch) { - xen_device_remove_watch(xendev, xendev->backend_online_watch, NULL); + xs_node_unwatch(xendev->xsh, xendev->backend_online_watch); xendev->backend_online_watch = NULL; } if (xendev->backend_state_watch) { - xen_device_remove_watch(xendev, xendev->backend_state_watch, NULL); + xs_node_unwatch(xendev->xsh, xendev->backend_state_watch); xendev->backend_state_watch = NULL; } @@ -837,7 +663,7 @@ static void xen_device_frontend_set_state(XenDevice *xendev, } } -static void xen_device_frontend_changed(void *opaque) +static void xen_device_frontend_changed(void *opaque, const char *path) { XenDevice *xendev = opaque; XenDeviceClass *xendev_class = XEN_DEVICE_GET_CLASS(xendev); @@ -885,7 +711,6 @@ static void xen_device_frontend_create(XenDevice *xendev, Error **errp) { ERRP_GUARD(); XenBus 
*xenbus = XEN_BUS(qdev_get_parent_bus(DEVICE(xendev))); - struct xs_permissions perms[2]; xendev->frontend_path = xen_device_get_frontend_path(xendev); @@ -894,15 +719,11 @@ static void xen_device_frontend_create(XenDevice *xendev, Error **errp) * toolstack. */ if (!xen_device_frontend_exists(xendev)) { - perms[0].id = xendev->frontend_id; - perms[0].perms = XS_PERM_NONE; - perms[1].id = xenbus->backend_id; - perms[1].perms = XS_PERM_READ | XS_PERM_WRITE; - g_assert(xenbus->xsh); - xs_node_create(xenbus->xsh, XBT_NULL, xendev->frontend_path, perms, - ARRAY_SIZE(perms), errp); + xs_node_create(xenbus->xsh, XBT_NULL, xendev->frontend_path, + xendev->frontend_id, xenbus->backend_id, + XS_PERM_READ | XS_PERM_WRITE, errp); if (*errp) { error_prepend(errp, "failed to create frontend: "); return; @@ -910,8 +731,8 @@ static void xen_device_frontend_create(XenDevice *xendev, Error **errp) } xendev->frontend_state_watch = - xen_device_add_watch(xendev, xendev->frontend_path, "state", - xen_device_frontend_changed, errp); + xs_node_watch(xendev->xsh, xendev->frontend_path, "state", + xen_device_frontend_changed, xendev, errp); if (*errp) { error_prepend(errp, "failed to watch frontend state: "); } @@ -923,8 +744,7 @@ static void xen_device_frontend_destroy(XenDevice *xendev) Error *local_err = NULL; if (xendev->frontend_state_watch) { - xen_device_remove_watch(xendev, xendev->frontend_state_watch, - NULL); + xs_node_unwatch(xendev->xsh, xendev->frontend_state_watch); xendev->frontend_state_watch = NULL; } @@ -947,7 +767,7 @@ static void xen_device_frontend_destroy(XenDevice *xendev) void xen_device_set_max_grant_refs(XenDevice *xendev, unsigned int nr_refs, Error **errp) { - if (xengnttab_set_max_grants(xendev->xgth, nr_refs)) { + if (qemu_xen_gnttab_set_max_grants(xendev->xgth, nr_refs)) { error_setg_errno(errp, errno, "xengnttab_set_max_grants failed"); } } @@ -956,9 +776,8 @@ void *xen_device_map_grant_refs(XenDevice *xendev, uint32_t *refs, unsigned int nr_refs, int prot, 
Error **errp) { - void *map = xengnttab_map_domain_grant_refs(xendev->xgth, nr_refs, - xendev->frontend_id, refs, - prot); + void *map = qemu_xen_gnttab_map_refs(xendev->xgth, nr_refs, + xendev->frontend_id, refs, prot); if (!map) { error_setg_errno(errp, errno, @@ -968,112 +787,20 @@ void *xen_device_map_grant_refs(XenDevice *xendev, uint32_t *refs, return map; } -void xen_device_unmap_grant_refs(XenDevice *xendev, void *map, +void xen_device_unmap_grant_refs(XenDevice *xendev, void *map, uint32_t *refs, unsigned int nr_refs, Error **errp) { - if (xengnttab_unmap(xendev->xgth, map, nr_refs)) { + if (qemu_xen_gnttab_unmap(xendev->xgth, map, refs, nr_refs)) { error_setg_errno(errp, errno, "xengnttab_unmap failed"); } } -static void compat_copy_grant_refs(XenDevice *xendev, bool to_domain, - XenDeviceGrantCopySegment segs[], - unsigned int nr_segs, Error **errp) -{ - uint32_t *refs = g_new(uint32_t, nr_segs); - int prot = to_domain ? PROT_WRITE : PROT_READ; - void *map; - unsigned int i; - - for (i = 0; i < nr_segs; i++) { - XenDeviceGrantCopySegment *seg = &segs[i]; - - refs[i] = to_domain ? 
seg->dest.foreign.ref : - seg->source.foreign.ref; - } - - map = xengnttab_map_domain_grant_refs(xendev->xgth, nr_segs, - xendev->frontend_id, refs, - prot); - if (!map) { - error_setg_errno(errp, errno, - "xengnttab_map_domain_grant_refs failed"); - goto done; - } - - for (i = 0; i < nr_segs; i++) { - XenDeviceGrantCopySegment *seg = &segs[i]; - void *page = map + (i * XC_PAGE_SIZE); - - if (to_domain) { - memcpy(page + seg->dest.foreign.offset, seg->source.virt, - seg->len); - } else { - memcpy(seg->dest.virt, page + seg->source.foreign.offset, - seg->len); - } - } - - if (xengnttab_unmap(xendev->xgth, map, nr_segs)) { - error_setg_errno(errp, errno, "xengnttab_unmap failed"); - } - -done: - g_free(refs); -} - void xen_device_copy_grant_refs(XenDevice *xendev, bool to_domain, XenDeviceGrantCopySegment segs[], unsigned int nr_segs, Error **errp) { - xengnttab_grant_copy_segment_t *xengnttab_segs; - unsigned int i; - - if (!xendev->feature_grant_copy) { - compat_copy_grant_refs(xendev, to_domain, segs, nr_segs, errp); - return; - } - - xengnttab_segs = g_new0(xengnttab_grant_copy_segment_t, nr_segs); - - for (i = 0; i < nr_segs; i++) { - XenDeviceGrantCopySegment *seg = &segs[i]; - xengnttab_grant_copy_segment_t *xengnttab_seg = &xengnttab_segs[i]; - - if (to_domain) { - xengnttab_seg->flags = GNTCOPY_dest_gref; - xengnttab_seg->dest.foreign.domid = xendev->frontend_id; - xengnttab_seg->dest.foreign.ref = seg->dest.foreign.ref; - xengnttab_seg->dest.foreign.offset = seg->dest.foreign.offset; - xengnttab_seg->source.virt = seg->source.virt; - } else { - xengnttab_seg->flags = GNTCOPY_source_gref; - xengnttab_seg->source.foreign.domid = xendev->frontend_id; - xengnttab_seg->source.foreign.ref = seg->source.foreign.ref; - xengnttab_seg->source.foreign.offset = - seg->source.foreign.offset; - xengnttab_seg->dest.virt = seg->dest.virt; - } - - xengnttab_seg->len = seg->len; - } - - if (xengnttab_grant_copy(xendev->xgth, nr_segs, xengnttab_segs)) { - 
error_setg_errno(errp, errno, "xengnttab_grant_copy failed"); - goto done; - } - - for (i = 0; i < nr_segs; i++) { - xengnttab_grant_copy_segment_t *xengnttab_seg = &xengnttab_segs[i]; - - if (xengnttab_seg->status != GNTST_okay) { - error_setg(errp, "xengnttab_grant_copy seg[%u] failed", i); - break; - } - } - -done: - g_free(xengnttab_segs); + qemu_xen_gnttab_grant_copy(xendev->xgth, to_domain, xendev->frontend_id, + (XenGrantCopySegment *)segs, nr_segs, errp); } struct XenEventChannel { @@ -1095,12 +822,12 @@ static bool xen_device_poll(void *opaque) static void xen_device_event(void *opaque) { XenEventChannel *channel = opaque; - unsigned long port = xenevtchn_pending(channel->xeh); + unsigned long port = qemu_xen_evtchn_pending(channel->xeh); if (port == channel->local_port) { xen_device_poll(channel); - xenevtchn_unmask(channel->xeh, port); + qemu_xen_evtchn_unmask(channel->xeh, port); } } @@ -1115,11 +842,11 @@ void xen_device_set_event_channel_context(XenDevice *xendev, } if (channel->ctx) - aio_set_fd_handler(channel->ctx, xenevtchn_fd(channel->xeh), true, + aio_set_fd_handler(channel->ctx, qemu_xen_evtchn_fd(channel->xeh), true, NULL, NULL, NULL, NULL, NULL); channel->ctx = ctx; - aio_set_fd_handler(channel->ctx, xenevtchn_fd(channel->xeh), true, + aio_set_fd_handler(channel->ctx, qemu_xen_evtchn_fd(channel->xeh), true, xen_device_event, NULL, xen_device_poll, NULL, channel); } @@ -1131,13 +858,13 @@ XenEventChannel *xen_device_bind_event_channel(XenDevice *xendev, XenEventChannel *channel = g_new0(XenEventChannel, 1); xenevtchn_port_or_error_t local_port; - channel->xeh = xenevtchn_open(NULL, 0); + channel->xeh = qemu_xen_evtchn_open(); if (!channel->xeh) { error_setg_errno(errp, errno, "failed xenevtchn_open"); goto fail; } - local_port = xenevtchn_bind_interdomain(channel->xeh, + local_port = qemu_xen_evtchn_bind_interdomain(channel->xeh, xendev->frontend_id, port); if (local_port < 0) { @@ -1160,7 +887,7 @@ XenEventChannel 
*xen_device_bind_event_channel(XenDevice *xendev, fail: if (channel->xeh) { - xenevtchn_close(channel->xeh); + qemu_xen_evtchn_close(channel->xeh); } g_free(channel); @@ -1177,7 +904,7 @@ void xen_device_notify_event_channel(XenDevice *xendev, return; } - if (xenevtchn_notify(channel->xeh, channel->local_port) < 0) { + if (qemu_xen_evtchn_notify(channel->xeh, channel->local_port) < 0) { error_setg_errno(errp, errno, "xenevtchn_notify failed"); } } @@ -1193,14 +920,14 @@ void xen_device_unbind_event_channel(XenDevice *xendev, QLIST_REMOVE(channel, list); - aio_set_fd_handler(channel->ctx, xenevtchn_fd(channel->xeh), true, + aio_set_fd_handler(channel->ctx, qemu_xen_evtchn_fd(channel->xeh), true, NULL, NULL, NULL, NULL, NULL); - if (xenevtchn_unbind(channel->xeh, channel->local_port) < 0) { + if (qemu_xen_evtchn_unbind(channel->xeh, channel->local_port) < 0) { error_setg_errno(errp, errno, "xenevtchn_unbind failed"); } - xenevtchn_close(channel->xeh); + qemu_xen_evtchn_close(channel->xeh); g_free(channel); } @@ -1235,17 +962,12 @@ static void xen_device_unrealize(DeviceState *dev) xen_device_backend_destroy(xendev); if (xendev->xgth) { - xengnttab_close(xendev->xgth); + qemu_xen_gnttab_close(xendev->xgth); xendev->xgth = NULL; } - if (xendev->watch_list) { - watch_list_destroy(xendev->watch_list); - xendev->watch_list = NULL; - } - if (xendev->xsh) { - xs_close(xendev->xsh); + qemu_xen_xs_close(xendev->xsh); xendev->xsh = NULL; } @@ -1290,23 +1012,18 @@ static void xen_device_realize(DeviceState *dev, Error **errp) trace_xen_device_realize(type, xendev->name); - xendev->xsh = xs_open(0); + xendev->xsh = qemu_xen_xs_open(); if (!xendev->xsh) { error_setg_errno(errp, errno, "failed xs_open"); goto unrealize; } - xendev->watch_list = watch_list_create(xendev->xsh); - - xendev->xgth = xengnttab_open(NULL, 0); + xendev->xgth = qemu_xen_gnttab_open(); if (!xendev->xgth) { error_setg_errno(errp, errno, "failed xengnttab_open"); goto unrealize; } - xendev->feature_grant_copy 
= - (xengnttab_grant_copy(xendev->xgth, 0, NULL) == 0); - xen_device_backend_create(xendev, errp); if (*errp) { goto unrealize; @@ -1317,13 +1034,6 @@ static void xen_device_realize(DeviceState *dev, Error **errp) goto unrealize; } - if (xendev_class->realize) { - xendev_class->realize(xendev, errp); - if (*errp) { - goto unrealize; - } - } - xen_device_backend_printf(xendev, "frontend", "%s", xendev->frontend_path); xen_device_backend_printf(xendev, "frontend-id", "%u", @@ -1342,6 +1052,13 @@ static void xen_device_realize(DeviceState *dev, Error **errp) xen_device_frontend_set_state(xendev, XenbusStateInitialising, true); } + if (xendev_class->realize) { + xendev_class->realize(xendev, errp); + if (*errp) { + goto unrealize; + } + } + xendev->exit.notify = xen_device_exit; qemu_add_exit_notifier(&xendev->exit); return; diff --git a/hw/xen/xen-legacy-backend.c b/hw/xen/xen-legacy-backend.c index afba71f6eb..4ded3cec23 100644 --- a/hw/xen/xen-legacy-backend.c +++ b/hw/xen/xen-legacy-backend.c @@ -39,11 +39,10 @@ BusState *xen_sysbus; /* ------------------------------------------------------------- */ /* public */ -struct xs_handle *xenstore; +struct qemu_xs_handle *xenstore; const char *xen_protocol; /* private */ -static bool xen_feature_grant_copy; static int debug; int xenstore_write_be_str(struct XenLegacyDevice *xendev, const char *node, @@ -113,7 +112,7 @@ void xen_be_set_max_grant_refs(struct XenLegacyDevice *xendev, { assert(xendev->ops->flags & DEVOPS_FLAG_NEED_GNTDEV); - if (xengnttab_set_max_grants(xendev->gnttabdev, nr_refs)) { + if (qemu_xen_gnttab_set_max_grants(xendev->gnttabdev, nr_refs)) { xen_pv_printf(xendev, 0, "xengnttab_set_max_grants failed: %s\n", strerror(errno)); } @@ -126,8 +125,8 @@ void *xen_be_map_grant_refs(struct XenLegacyDevice *xendev, uint32_t *refs, assert(xendev->ops->flags & DEVOPS_FLAG_NEED_GNTDEV); - ptr = xengnttab_map_domain_grant_refs(xendev->gnttabdev, nr_refs, - xen_domid, refs, prot); + ptr = 
qemu_xen_gnttab_map_refs(xendev->gnttabdev, nr_refs, xen_domid, refs, + prot); if (!ptr) { xen_pv_printf(xendev, 0, "xengnttab_map_domain_grant_refs failed: %s\n", @@ -138,123 +137,31 @@ void *xen_be_map_grant_refs(struct XenLegacyDevice *xendev, uint32_t *refs, } void xen_be_unmap_grant_refs(struct XenLegacyDevice *xendev, void *ptr, - unsigned int nr_refs) + uint32_t *refs, unsigned int nr_refs) { assert(xendev->ops->flags & DEVOPS_FLAG_NEED_GNTDEV); - if (xengnttab_unmap(xendev->gnttabdev, ptr, nr_refs)) { + if (qemu_xen_gnttab_unmap(xendev->gnttabdev, ptr, refs, nr_refs)) { xen_pv_printf(xendev, 0, "xengnttab_unmap failed: %s\n", strerror(errno)); } } -static int compat_copy_grant_refs(struct XenLegacyDevice *xendev, - bool to_domain, - XenGrantCopySegment segs[], - unsigned int nr_segs) -{ - uint32_t *refs = g_new(uint32_t, nr_segs); - int prot = to_domain ? PROT_WRITE : PROT_READ; - void *pages; - unsigned int i; - - for (i = 0; i < nr_segs; i++) { - XenGrantCopySegment *seg = &segs[i]; - - refs[i] = to_domain ? 
- seg->dest.foreign.ref : seg->source.foreign.ref; - } - - pages = xengnttab_map_domain_grant_refs(xendev->gnttabdev, nr_segs, - xen_domid, refs, prot); - if (!pages) { - xen_pv_printf(xendev, 0, - "xengnttab_map_domain_grant_refs failed: %s\n", - strerror(errno)); - g_free(refs); - return -1; - } - - for (i = 0; i < nr_segs; i++) { - XenGrantCopySegment *seg = &segs[i]; - void *page = pages + (i * XC_PAGE_SIZE); - - if (to_domain) { - memcpy(page + seg->dest.foreign.offset, seg->source.virt, - seg->len); - } else { - memcpy(seg->dest.virt, page + seg->source.foreign.offset, - seg->len); - } - } - - if (xengnttab_unmap(xendev->gnttabdev, pages, nr_segs)) { - xen_pv_printf(xendev, 0, "xengnttab_unmap failed: %s\n", - strerror(errno)); - } - - g_free(refs); - return 0; -} - int xen_be_copy_grant_refs(struct XenLegacyDevice *xendev, bool to_domain, XenGrantCopySegment segs[], unsigned int nr_segs) { - xengnttab_grant_copy_segment_t *xengnttab_segs; - unsigned int i; int rc; assert(xendev->ops->flags & DEVOPS_FLAG_NEED_GNTDEV); - if (!xen_feature_grant_copy) { - return compat_copy_grant_refs(xendev, to_domain, segs, nr_segs); - } - - xengnttab_segs = g_new0(xengnttab_grant_copy_segment_t, nr_segs); - - for (i = 0; i < nr_segs; i++) { - XenGrantCopySegment *seg = &segs[i]; - xengnttab_grant_copy_segment_t *xengnttab_seg = &xengnttab_segs[i]; - - if (to_domain) { - xengnttab_seg->flags = GNTCOPY_dest_gref; - xengnttab_seg->dest.foreign.domid = xen_domid; - xengnttab_seg->dest.foreign.ref = seg->dest.foreign.ref; - xengnttab_seg->dest.foreign.offset = seg->dest.foreign.offset; - xengnttab_seg->source.virt = seg->source.virt; - } else { - xengnttab_seg->flags = GNTCOPY_source_gref; - xengnttab_seg->source.foreign.domid = xen_domid; - xengnttab_seg->source.foreign.ref = seg->source.foreign.ref; - xengnttab_seg->source.foreign.offset = - seg->source.foreign.offset; - xengnttab_seg->dest.virt = seg->dest.virt; - } - - xengnttab_seg->len = seg->len; - } - - rc = 
xengnttab_grant_copy(xendev->gnttabdev, nr_segs, xengnttab_segs); - + rc = qemu_xen_gnttab_grant_copy(xendev->gnttabdev, to_domain, xen_domid, + segs, nr_segs, NULL); if (rc) { - xen_pv_printf(xendev, 0, "xengnttab_copy failed: %s\n", - strerror(errno)); - } - - for (i = 0; i < nr_segs; i++) { - xengnttab_grant_copy_segment_t *xengnttab_seg = - &xengnttab_segs[i]; - - if (xengnttab_seg->status != GNTST_okay) { - xen_pv_printf(xendev, 0, "segment[%u] status: %d\n", i, - xengnttab_seg->status); - rc = -1; - } + xen_pv_printf(xendev, 0, "xengnttab_grant_copy failed: %s\n", + strerror(-rc)); } - - g_free(xengnttab_segs); return rc; } @@ -294,13 +201,13 @@ static struct XenLegacyDevice *xen_be_get_xendev(const char *type, int dom, xendev->debug = debug; xendev->local_port = -1; - xendev->evtchndev = xenevtchn_open(NULL, 0); + xendev->evtchndev = qemu_xen_evtchn_open(); if (xendev->evtchndev == NULL) { xen_pv_printf(NULL, 0, "can't open evtchn device\n"); qdev_unplug(DEVICE(xendev), NULL); return NULL; } - qemu_set_cloexec(xenevtchn_fd(xendev->evtchndev)); + qemu_set_cloexec(qemu_xen_evtchn_fd(xendev->evtchndev)); xen_pv_insert_xendev(xendev); @@ -367,6 +274,25 @@ static void xen_be_frontend_changed(struct XenLegacyDevice *xendev, } } +static void xenstore_update_fe(void *opaque, const char *watch) +{ + struct XenLegacyDevice *xendev = opaque; + const char *node; + unsigned int len; + + len = strlen(xendev->fe); + if (strncmp(xendev->fe, watch, len) != 0) { + return; + } + if (watch[len] != '/') { + return; + } + node = watch + len + 1; + + xen_be_frontend_changed(xendev, node); + xen_be_check_state(xendev); +} + /* ------------------------------------------------------------- */ /* Check for possible state transitions and perform them. 
*/ @@ -380,7 +306,6 @@ static void xen_be_frontend_changed(struct XenLegacyDevice *xendev, */ static int xen_be_try_setup(struct XenLegacyDevice *xendev) { - char token[XEN_BUFSIZE]; int be_state; if (xenstore_read_be_int(xendev, "state", &be_state) == -1) { @@ -401,8 +326,9 @@ static int xen_be_try_setup(struct XenLegacyDevice *xendev) } /* setup frontend watch */ - snprintf(token, sizeof(token), "fe:%p", xendev); - if (!xs_watch(xenstore, xendev->fe, token)) { + xendev->watch = qemu_xen_xs_watch(xenstore, xendev->fe, xenstore_update_fe, + xendev); + if (!xendev->watch) { xen_pv_printf(xendev, 0, "watching frontend path (%s) failed\n", xendev->fe); return -1; @@ -466,7 +392,7 @@ static int xen_be_try_initialise(struct XenLegacyDevice *xendev) } if (xendev->ops->flags & DEVOPS_FLAG_NEED_GNTDEV) { - xendev->gnttabdev = xengnttab_open(NULL, 0); + xendev->gnttabdev = qemu_xen_gnttab_open(); if (xendev->gnttabdev == NULL) { xen_pv_printf(NULL, 0, "can't open gnttab device\n"); return -1; @@ -524,7 +450,7 @@ static void xen_be_disconnect(struct XenLegacyDevice *xendev, xendev->ops->disconnect(xendev); } if (xendev->gnttabdev) { - xengnttab_close(xendev->gnttabdev); + qemu_xen_gnttab_close(xendev->gnttabdev); xendev->gnttabdev = NULL; } if (xendev->be_state != state) { @@ -591,46 +517,20 @@ void xen_be_check_state(struct XenLegacyDevice *xendev) /* ------------------------------------------------------------- */ -static int xenstore_scan(const char *type, int dom, struct XenDevOps *ops) -{ - struct XenLegacyDevice *xendev; - char path[XEN_BUFSIZE], token[XEN_BUFSIZE]; - char **dev = NULL; - unsigned int cdev, j; - - /* setup watch */ - snprintf(token, sizeof(token), "be:%p:%d:%p", type, dom, ops); - snprintf(path, sizeof(path), "backend/%s/%d", type, dom); - if (!xs_watch(xenstore, path, token)) { - xen_pv_printf(NULL, 0, "xen be: watching backend path (%s) failed\n", - path); - return -1; - } - - /* look for backends */ - dev = xs_directory(xenstore, 0, path, &cdev); - 
if (!dev) { - return 0; - } - for (j = 0; j < cdev; j++) { - xendev = xen_be_get_xendev(type, dom, atoi(dev[j]), ops); - if (xendev == NULL) { - continue; - } - xen_be_check_state(xendev); - } - free(dev); - return 0; -} +struct xenstore_be { + const char *type; + int dom; + struct XenDevOps *ops; +}; -void xenstore_update_be(char *watch, char *type, int dom, - struct XenDevOps *ops) +static void xenstore_update_be(void *opaque, const char *watch) { + struct xenstore_be *be = opaque; struct XenLegacyDevice *xendev; char path[XEN_BUFSIZE], *bepath; unsigned int len, dev; - len = snprintf(path, sizeof(path), "backend/%s/%d", type, dom); + len = snprintf(path, sizeof(path), "backend/%s/%d", be->type, be->dom); if (strncmp(path, watch, len) != 0) { return; } @@ -644,9 +544,9 @@ void xenstore_update_be(char *watch, char *type, int dom, return; } - xendev = xen_be_get_xendev(type, dom, dev, ops); + xendev = xen_be_get_xendev(be->type, be->dom, dev, be->ops); if (xendev != NULL) { - bepath = xs_read(xenstore, 0, xendev->be, &len); + bepath = qemu_xen_xs_read(xenstore, 0, xendev->be, &len); if (bepath == NULL) { xen_pv_del_xendev(xendev); } else { @@ -657,23 +557,41 @@ void xenstore_update_be(char *watch, char *type, int dom, } } -void xenstore_update_fe(char *watch, struct XenLegacyDevice *xendev) +static int xenstore_scan(const char *type, int dom, struct XenDevOps *ops) { - char *node; - unsigned int len; + struct XenLegacyDevice *xendev; + char path[XEN_BUFSIZE]; + struct xenstore_be *be = g_new0(struct xenstore_be, 1); + char **dev = NULL; + unsigned int cdev, j; - len = strlen(xendev->fe); - if (strncmp(xendev->fe, watch, len) != 0) { - return; - } - if (watch[len] != '/') { - return; + /* setup watch */ + be->type = type; + be->dom = dom; + be->ops = ops; + snprintf(path, sizeof(path), "backend/%s/%d", type, dom); + if (!qemu_xen_xs_watch(xenstore, path, xenstore_update_be, be)) { + xen_pv_printf(NULL, 0, "xen be: watching backend path (%s) failed\n", + path); + 
return -1; } - node = watch + len + 1; - xen_be_frontend_changed(xendev, node); - xen_be_check_state(xendev); + /* look for backends */ + dev = qemu_xen_xs_directory(xenstore, 0, path, &cdev); + if (!dev) { + return 0; + } + for (j = 0; j < cdev; j++) { + xendev = xen_be_get_xendev(type, dom, atoi(dev[j]), ops); + if (xendev == NULL) { + continue; + } + xen_be_check_state(xendev); + } + free(dev); + return 0; } + /* -------------------------------------------------------------------- */ static void xen_set_dynamic_sysbus(void) @@ -687,29 +605,17 @@ static void xen_set_dynamic_sysbus(void) void xen_be_init(void) { - xengnttab_handle *gnttabdev; - - xenstore = xs_daemon_open(); + xenstore = qemu_xen_xs_open(); if (!xenstore) { xen_pv_printf(NULL, 0, "can't connect to xenstored\n"); exit(1); } - qemu_set_fd_handler(xs_fileno(xenstore), xenstore_update, NULL, NULL); - - if (xen_xc == NULL || xen_fmem == NULL) { + if (xen_evtchn_ops == NULL || xen_gnttab_ops == NULL) { xen_pv_printf(NULL, 0, "Xen operations not set up\n"); exit(1); } - gnttabdev = xengnttab_open(NULL, 0); - if (gnttabdev != NULL) { - if (xengnttab_grant_copy(gnttabdev, 0, NULL) == 0) { - xen_feature_grant_copy = true; - } - xengnttab_close(gnttabdev); - } - xen_sysdev = qdev_new(TYPE_XENSYSDEV); sysbus_realize_and_unref(SYS_BUS_DEVICE(xen_sysdev), &error_fatal); xen_sysbus = qbus_new(TYPE_XENSYSBUS, xen_sysdev, "xen-sysbus"); @@ -751,14 +657,14 @@ int xen_be_bind_evtchn(struct XenLegacyDevice *xendev) if (xendev->local_port != -1) { return 0; } - xendev->local_port = xenevtchn_bind_interdomain + xendev->local_port = qemu_xen_evtchn_bind_interdomain (xendev->evtchndev, xendev->dom, xendev->remote_port); if (xendev->local_port == -1) { xen_pv_printf(xendev, 0, "xenevtchn_bind_interdomain failed\n"); return -1; } xen_pv_printf(xendev, 2, "bind evtchn port %d\n", xendev->local_port); - qemu_set_fd_handler(xenevtchn_fd(xendev->evtchndev), + qemu_set_fd_handler(qemu_xen_evtchn_fd(xendev->evtchndev), 
xen_pv_evtchn_event, NULL, xendev); return 0; } diff --git a/hw/xen/xen-operations.c b/hw/xen/xen-operations.c new file mode 100644 index 0000000000..4b78fbf4bd --- /dev/null +++ b/hw/xen/xen-operations.c @@ -0,0 +1,478 @@ +/* + * QEMU Xen backend support: Operations for true Xen + * + * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Authors: David Woodhouse <dwmw2@infradead.org> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qemu/uuid.h" +#include "qapi/error.h" + +#include "hw/xen/xen_native.h" +#include "hw/xen/xen_backend_ops.h" + +/* + * If we have new enough libxenctrl then we do not want/need these compat + * interfaces, despite what the user supplied cflags might say. They + * must be undefined before including xenctrl.h + */ +#undef XC_WANT_COMPAT_EVTCHN_API +#undef XC_WANT_COMPAT_GNTTAB_API +#undef XC_WANT_COMPAT_MAP_FOREIGN_API + +#include <xenctrl.h> + +/* + * We don't support Xen prior to 4.2.0. 
+ */ + +/* Xen 4.2 through 4.6 */ +#if CONFIG_XEN_CTRL_INTERFACE_VERSION < 40701 + +typedef xc_evtchn xenevtchn_handle; +typedef evtchn_port_or_error_t xenevtchn_port_or_error_t; + +#define xenevtchn_open(l, f) xc_evtchn_open(l, f); +#define xenevtchn_close(h) xc_evtchn_close(h) +#define xenevtchn_fd(h) xc_evtchn_fd(h) +#define xenevtchn_pending(h) xc_evtchn_pending(h) +#define xenevtchn_notify(h, p) xc_evtchn_notify(h, p) +#define xenevtchn_bind_interdomain(h, d, p) xc_evtchn_bind_interdomain(h, d, p) +#define xenevtchn_unmask(h, p) xc_evtchn_unmask(h, p) +#define xenevtchn_unbind(h, p) xc_evtchn_unbind(h, p) + +typedef xc_gnttab xengnttab_handle; + +#define xengnttab_open(l, f) xc_gnttab_open(l, f) +#define xengnttab_close(h) xc_gnttab_close(h) +#define xengnttab_set_max_grants(h, n) xc_gnttab_set_max_grants(h, n) +#define xengnttab_map_grant_ref(h, d, r, p) xc_gnttab_map_grant_ref(h, d, r, p) +#define xengnttab_unmap(h, a, n) xc_gnttab_munmap(h, a, n) +#define xengnttab_map_grant_refs(h, c, d, r, p) \ + xc_gnttab_map_grant_refs(h, c, d, r, p) +#define xengnttab_map_domain_grant_refs(h, c, d, r, p) \ + xc_gnttab_map_domain_grant_refs(h, c, d, r, p) + +typedef xc_interface xenforeignmemory_handle; + +#else /* CONFIG_XEN_CTRL_INTERFACE_VERSION >= 40701 */ + +#include <xenevtchn.h> +#include <xengnttab.h> +#include <xenforeignmemory.h> + +#endif + +/* Xen before 4.8 */ + +static int libxengnttab_fallback_grant_copy(xengnttab_handle *xgt, + bool to_domain, uint32_t domid, + XenGrantCopySegment segs[], + unsigned int nr_segs, Error **errp) +{ + uint32_t *refs = g_new(uint32_t, nr_segs); + int prot = to_domain ? PROT_WRITE : PROT_READ; + void *map; + unsigned int i; + int rc = 0; + + for (i = 0; i < nr_segs; i++) { + XenGrantCopySegment *seg = &segs[i]; + + refs[i] = to_domain ? 
seg->dest.foreign.ref : + seg->source.foreign.ref; + } + map = xengnttab_map_domain_grant_refs(xgt, nr_segs, domid, refs, prot); + if (!map) { + if (errp) { + error_setg_errno(errp, errno, + "xengnttab_map_domain_grant_refs failed"); + } + rc = -errno; + goto done; + } + + for (i = 0; i < nr_segs; i++) { + XenGrantCopySegment *seg = &segs[i]; + void *page = map + (i * XEN_PAGE_SIZE); + + if (to_domain) { + memcpy(page + seg->dest.foreign.offset, seg->source.virt, + seg->len); + } else { + memcpy(seg->dest.virt, page + seg->source.foreign.offset, + seg->len); + } + } + + if (xengnttab_unmap(xgt, map, nr_segs)) { + if (errp) { + error_setg_errno(errp, errno, "xengnttab_unmap failed"); + } + rc = -errno; + } + +done: + g_free(refs); + return rc; +} + +#if CONFIG_XEN_CTRL_INTERFACE_VERSION >= 40800 + +static int libxengnttab_backend_grant_copy(xengnttab_handle *xgt, + bool to_domain, uint32_t domid, + XenGrantCopySegment *segs, + uint32_t nr_segs, Error **errp) +{ + xengnttab_grant_copy_segment_t *xengnttab_segs; + unsigned int i; + int rc; + + xengnttab_segs = g_new0(xengnttab_grant_copy_segment_t, nr_segs); + + for (i = 0; i < nr_segs; i++) { + XenGrantCopySegment *seg = &segs[i]; + xengnttab_grant_copy_segment_t *xengnttab_seg = &xengnttab_segs[i]; + + if (to_domain) { + xengnttab_seg->flags = GNTCOPY_dest_gref; + xengnttab_seg->dest.foreign.domid = domid; + xengnttab_seg->dest.foreign.ref = seg->dest.foreign.ref; + xengnttab_seg->dest.foreign.offset = seg->dest.foreign.offset; + xengnttab_seg->source.virt = seg->source.virt; + } else { + xengnttab_seg->flags = GNTCOPY_source_gref; + xengnttab_seg->source.foreign.domid = domid; + xengnttab_seg->source.foreign.ref = seg->source.foreign.ref; + xengnttab_seg->source.foreign.offset = + seg->source.foreign.offset; + xengnttab_seg->dest.virt = seg->dest.virt; + } + + xengnttab_seg->len = seg->len; + } + + if (xengnttab_grant_copy(xgt, nr_segs, xengnttab_segs)) { + if (errp) { + error_setg_errno(errp, errno, 
"xengnttab_grant_copy failed"); + } + rc = -errno; + goto done; + } + + rc = 0; + for (i = 0; i < nr_segs; i++) { + xengnttab_grant_copy_segment_t *xengnttab_seg = &xengnttab_segs[i]; + + if (xengnttab_seg->status != GNTST_okay) { + if (errp) { + error_setg(errp, "xengnttab_grant_copy seg[%u] failed", i); + } + rc = -EIO; + break; + } + } + +done: + g_free(xengnttab_segs); + return rc; +} +#endif + +static xenevtchn_handle *libxenevtchn_backend_open(void) +{ + return xenevtchn_open(NULL, 0); +} + +struct evtchn_backend_ops libxenevtchn_backend_ops = { + .open = libxenevtchn_backend_open, + .close = xenevtchn_close, + .bind_interdomain = xenevtchn_bind_interdomain, + .unbind = xenevtchn_unbind, + .get_fd = xenevtchn_fd, + .notify = xenevtchn_notify, + .unmask = xenevtchn_unmask, + .pending = xenevtchn_pending, +}; + +static xengnttab_handle *libxengnttab_backend_open(void) +{ + return xengnttab_open(NULL, 0); +} + +static int libxengnttab_backend_unmap(xengnttab_handle *xgt, + void *start_address, uint32_t *refs, + uint32_t count) +{ + return xengnttab_unmap(xgt, start_address, count); +} + + +static struct gnttab_backend_ops libxengnttab_backend_ops = { + .features = XEN_GNTTAB_OP_FEATURE_MAP_MULTIPLE, + .open = libxengnttab_backend_open, + .close = xengnttab_close, + .grant_copy = libxengnttab_fallback_grant_copy, + .set_max_grants = xengnttab_set_max_grants, + .map_refs = xengnttab_map_domain_grant_refs, + .unmap = libxengnttab_backend_unmap, +}; + +#if CONFIG_XEN_CTRL_INTERFACE_VERSION < 40701 + +static void *libxenforeignmem_backend_map(uint32_t dom, void *addr, int prot, + size_t pages, xfn_pfn_t *pfns, + int *errs) +{ + if (errs) { + return xc_map_foreign_bulk(xen_xc, dom, prot, pfns, errs, pages); + } else { + return xc_map_foreign_pages(xen_xc, dom, prot, pfns, pages); + } +} + +static int libxenforeignmem_backend_unmap(void *addr, size_t pages) +{ + return munmap(addr, pages * XC_PAGE_SIZE); +} + +#else /* CONFIG_XEN_CTRL_INTERFACE_VERSION >= 40701 */ + 
+static void *libxenforeignmem_backend_map(uint32_t dom, void *addr, int prot, + size_t pages, xen_pfn_t *pfns, + int *errs) +{ + return xenforeignmemory_map2(xen_fmem, dom, addr, prot, 0, pages, pfns, + errs); +} + +static int libxenforeignmem_backend_unmap(void *addr, size_t pages) +{ + return xenforeignmemory_unmap(xen_fmem, addr, pages); +} + +#endif + +struct foreignmem_backend_ops libxenforeignmem_backend_ops = { + .map = libxenforeignmem_backend_map, + .unmap = libxenforeignmem_backend_unmap, +}; + +struct qemu_xs_handle { + struct xs_handle *xsh; + NotifierList notifiers; +}; + +static void watch_event(void *opaque) +{ + struct qemu_xs_handle *h = opaque; + + for (;;) { + char **v = xs_check_watch(h->xsh); + + if (!v) { + break; + } + + notifier_list_notify(&h->notifiers, v); + free(v); + } +} + +static struct qemu_xs_handle *libxenstore_open(void) +{ + struct xs_handle *xsh = xs_open(0); + struct qemu_xs_handle *h = g_new0(struct qemu_xs_handle, 1); + + if (!xsh) { + return NULL; + } + + h = g_new0(struct qemu_xs_handle, 1); + h->xsh = xsh; + + notifier_list_init(&h->notifiers); + qemu_set_fd_handler(xs_fileno(h->xsh), watch_event, NULL, h); + + return h; +} + +static void libxenstore_close(struct qemu_xs_handle *h) +{ + g_assert(notifier_list_empty(&h->notifiers)); + qemu_set_fd_handler(xs_fileno(h->xsh), NULL, NULL, NULL); + xs_close(h->xsh); + g_free(h); +} + +static char *libxenstore_get_domain_path(struct qemu_xs_handle *h, + unsigned int domid) +{ + return xs_get_domain_path(h->xsh, domid); +} + +static char **libxenstore_directory(struct qemu_xs_handle *h, + xs_transaction_t t, const char *path, + unsigned int *num) +{ + return xs_directory(h->xsh, t, path, num); +} + +static void *libxenstore_read(struct qemu_xs_handle *h, xs_transaction_t t, + const char *path, unsigned int *len) +{ + return xs_read(h->xsh, t, path, len); +} + +static bool libxenstore_write(struct qemu_xs_handle *h, xs_transaction_t t, + const char *path, const void *data, + 
unsigned int len) +{ + return xs_write(h->xsh, t, path, data, len); +} + +static bool libxenstore_create(struct qemu_xs_handle *h, xs_transaction_t t, + unsigned int owner, unsigned int domid, + unsigned int perms, const char *path) +{ + struct xs_permissions perms_list[] = { + { + .id = owner, + .perms = XS_PERM_NONE, + }, + { + .id = domid, + .perms = perms, + }, + }; + + if (!xs_mkdir(h->xsh, t, path)) { + return false; + } + + return xs_set_permissions(h->xsh, t, path, perms_list, + ARRAY_SIZE(perms_list)); +} + +static bool libxenstore_destroy(struct qemu_xs_handle *h, xs_transaction_t t, + const char *path) +{ + return xs_rm(h->xsh, t, path); +} + +struct qemu_xs_watch { + char *path; + char *token; + xs_watch_fn fn; + void *opaque; + Notifier notifier; +}; + +static void watch_notify(Notifier *n, void *data) +{ + struct qemu_xs_watch *w = container_of(n, struct qemu_xs_watch, notifier); + const char **v = data; + + if (!strcmp(w->token, v[XS_WATCH_TOKEN])) { + w->fn(w->opaque, v[XS_WATCH_PATH]); + } +} + +static struct qemu_xs_watch *new_watch(const char *path, xs_watch_fn fn, + void *opaque) +{ + struct qemu_xs_watch *w = g_new0(struct qemu_xs_watch, 1); + QemuUUID uuid; + + qemu_uuid_generate(&uuid); + + w->token = qemu_uuid_unparse_strdup(&uuid); + w->path = g_strdup(path); + w->fn = fn; + w->opaque = opaque; + w->notifier.notify = watch_notify; + + return w; +} + +static void free_watch(struct qemu_xs_watch *w) +{ + g_free(w->token); + g_free(w->path); + + g_free(w); +} + +static struct qemu_xs_watch *libxenstore_watch(struct qemu_xs_handle *h, + const char *path, xs_watch_fn fn, + void *opaque) +{ + struct qemu_xs_watch *w = new_watch(path, fn, opaque); + + notifier_list_add(&h->notifiers, &w->notifier); + + if (!xs_watch(h->xsh, path, w->token)) { + notifier_remove(&w->notifier); + free_watch(w); + return NULL; + } + + return w; +} + +static void libxenstore_unwatch(struct qemu_xs_handle *h, + struct qemu_xs_watch *w) +{ + xs_unwatch(h->xsh, w->path, 
w->token); + notifier_remove(&w->notifier); + free_watch(w); +} + +static xs_transaction_t libxenstore_transaction_start(struct qemu_xs_handle *h) +{ + return xs_transaction_start(h->xsh); +} + +static bool libxenstore_transaction_end(struct qemu_xs_handle *h, + xs_transaction_t t, bool abort) +{ + return xs_transaction_end(h->xsh, t, abort); +} + +struct xenstore_backend_ops libxenstore_backend_ops = { + .open = libxenstore_open, + .close = libxenstore_close, + .get_domain_path = libxenstore_get_domain_path, + .directory = libxenstore_directory, + .read = libxenstore_read, + .write = libxenstore_write, + .create = libxenstore_create, + .destroy = libxenstore_destroy, + .watch = libxenstore_watch, + .unwatch = libxenstore_unwatch, + .transaction_start = libxenstore_transaction_start, + .transaction_end = libxenstore_transaction_end, +}; + +void setup_xen_backend_ops(void) +{ +#if CONFIG_XEN_CTRL_INTERFACE_VERSION >= 40800 + xengnttab_handle *xgt = xengnttab_open(NULL, 0); + + if (xgt) { + if (xengnttab_grant_copy(xgt, 0, NULL) == 0) { + libxengnttab_backend_ops.grant_copy = libxengnttab_backend_grant_copy; + } + xengnttab_close(xgt); + } +#endif + xen_evtchn_ops = &libxenevtchn_backend_ops; + xen_gnttab_ops = &libxengnttab_backend_ops; + xen_foreignmem_ops = &libxenforeignmem_backend_ops; + xen_xenstore_ops = &libxenstore_backend_ops; +} diff --git a/hw/xen/xen_devconfig.c b/hw/xen/xen_devconfig.c index 46ee4a7f02..9b7304e544 100644 --- a/hw/xen/xen_devconfig.c +++ b/hw/xen/xen_devconfig.c @@ -11,11 +11,11 @@ static int xen_config_dev_dirs(const char *ftype, const char *btype, int vdev, { char *dom; - dom = xs_get_domain_path(xenstore, xen_domid); + dom = qemu_xen_xs_get_domain_path(xenstore, xen_domid); snprintf(fe, len, "%s/device/%s/%d", dom, ftype, vdev); free(dom); - dom = xs_get_domain_path(xenstore, 0); + dom = qemu_xen_xs_get_domain_path(xenstore, 0); snprintf(be, len, "%s/backend/%s/%d/%d", dom, btype, xen_domid, vdev); free(dom); diff --git 
a/hw/xen/xen_pt.c b/hw/xen/xen_pt.c index 8db0532632..2d33d178ad 100644 --- a/hw/xen/xen_pt.c +++ b/hw/xen/xen_pt.c @@ -57,11 +57,12 @@ #include <sys/ioctl.h> #include "hw/pci/pci.h" +#include "hw/pci/pci_bus.h" #include "hw/qdev-properties.h" #include "hw/qdev-properties-system.h" +#include "xen_pt.h" #include "hw/xen/xen.h" #include "hw/xen/xen-legacy-backend.h" -#include "xen_pt.h" #include "qemu/range.h" static bool has_igd_gfx_passthru; @@ -780,15 +781,6 @@ static void xen_pt_realize(PCIDevice *d, Error **errp) s->hostaddr.bus, s->hostaddr.slot, s->hostaddr.function, s->dev.devfn); - xen_host_pci_device_get(&s->real_device, - s->hostaddr.domain, s->hostaddr.bus, - s->hostaddr.slot, s->hostaddr.function, - errp); - if (*errp) { - error_append_hint(errp, "Failed to \"open\" the real pci device"); - return; - } - s->is_virtfn = s->real_device.is_virtfn; if (s->is_virtfn) { XEN_PT_LOG(d, "%04x:%02x:%02x.%d is a SR-IOV Virtual Function\n", @@ -803,8 +795,10 @@ static void xen_pt_realize(PCIDevice *d, Error **errp) s->io_listener = xen_pt_io_listener; /* Setup VGA bios for passthrough GFX */ - if ((s->real_device.domain == 0) && (s->real_device.bus == 0) && - (s->real_device.dev == 2) && (s->real_device.func == 0)) { + if ((s->real_device.domain == XEN_PCI_IGD_DOMAIN) && + (s->real_device.bus == XEN_PCI_IGD_BUS) && + (s->real_device.dev == XEN_PCI_IGD_DEV) && + (s->real_device.func == XEN_PCI_IGD_FN)) { if (!is_igd_vga_passthrough(&s->real_device)) { error_setg(errp, "Need to enable igd-passthru if you're trying" " to passthrough IGD GFX"); @@ -950,11 +944,58 @@ static void xen_pci_passthrough_instance_init(Object *obj) PCI_DEVICE(obj)->cap_present |= QEMU_PCI_CAP_EXPRESS; } +void xen_igd_reserve_slot(PCIBus *pci_bus) +{ + if (!xen_igd_gfx_pt_enabled()) { + return; + } + + XEN_PT_LOG(0, "Reserving PCI slot 2 for IGD\n"); + pci_bus->slot_reserved_mask |= XEN_PCI_IGD_SLOT_MASK; +} + +static void xen_igd_clear_slot(DeviceState *qdev, Error **errp) +{ + ERRP_GUARD(); + 
PCIDevice *pci_dev = (PCIDevice *)qdev; + XenPCIPassthroughState *s = XEN_PT_DEVICE(pci_dev); + XenPTDeviceClass *xpdc = XEN_PT_DEVICE_GET_CLASS(s); + PCIBus *pci_bus = pci_get_bus(pci_dev); + + xen_host_pci_device_get(&s->real_device, + s->hostaddr.domain, s->hostaddr.bus, + s->hostaddr.slot, s->hostaddr.function, + errp); + if (*errp) { + error_append_hint(errp, "Failed to \"open\" the real pci device"); + return; + } + + if (!(pci_bus->slot_reserved_mask & XEN_PCI_IGD_SLOT_MASK)) { + xpdc->pci_qdev_realize(qdev, errp); + return; + } + + if (is_igd_vga_passthrough(&s->real_device) && + s->real_device.domain == XEN_PCI_IGD_DOMAIN && + s->real_device.bus == XEN_PCI_IGD_BUS && + s->real_device.dev == XEN_PCI_IGD_DEV && + s->real_device.func == XEN_PCI_IGD_FN && + s->real_device.vendor_id == PCI_VENDOR_ID_INTEL) { + pci_bus->slot_reserved_mask &= ~XEN_PCI_IGD_SLOT_MASK; + XEN_PT_LOG(pci_dev, "Intel IGD found, using slot 2\n"); + } + xpdc->pci_qdev_realize(qdev, errp); +} + static void xen_pci_passthrough_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); PCIDeviceClass *k = PCI_DEVICE_CLASS(klass); + XenPTDeviceClass *xpdc = XEN_PT_DEVICE_CLASS(klass); + xpdc->pci_qdev_realize = dc->realize; + dc->realize = xen_igd_clear_slot; k->realize = xen_pt_realize; k->exit = xen_pt_unregister_device; k->config_read = xen_pt_pci_read_config; @@ -977,6 +1018,7 @@ static const TypeInfo xen_pci_passthrough_info = { .instance_size = sizeof(XenPCIPassthroughState), .instance_finalize = xen_pci_passthrough_finalize, .class_init = xen_pci_passthrough_class_init, + .class_size = sizeof(XenPTDeviceClass), .instance_init = xen_pci_passthrough_instance_init, .interfaces = (InterfaceInfo[]) { { INTERFACE_CONVENTIONAL_PCI_DEVICE }, diff --git a/hw/xen/xen_pt.h b/hw/xen/xen_pt.h index cf10fc7bbf..b20744f7c7 100644 --- a/hw/xen/xen_pt.h +++ b/hw/xen/xen_pt.h @@ -1,7 +1,7 @@ #ifndef XEN_PT_H #define XEN_PT_H -#include "hw/xen/xen_common.h" +#include 
"hw/xen/xen_native.h" #include "xen-host-pci-device.h" #include "qom/object.h" @@ -40,7 +40,20 @@ typedef struct XenPTReg XenPTReg; #define TYPE_XEN_PT_DEVICE "xen-pci-passthrough" OBJECT_DECLARE_SIMPLE_TYPE(XenPCIPassthroughState, XEN_PT_DEVICE) +#define XEN_PT_DEVICE_CLASS(klass) \ + OBJECT_CLASS_CHECK(XenPTDeviceClass, klass, TYPE_XEN_PT_DEVICE) +#define XEN_PT_DEVICE_GET_CLASS(obj) \ + OBJECT_GET_CLASS(XenPTDeviceClass, obj, TYPE_XEN_PT_DEVICE) + +typedef void (*XenPTQdevRealize)(DeviceState *qdev, Error **errp); + +typedef struct XenPTDeviceClass { + PCIDeviceClass parent_class; + XenPTQdevRealize pci_qdev_realize; +} XenPTDeviceClass; + uint32_t igd_read_opregion(XenPCIPassthroughState *s); +void xen_igd_reserve_slot(PCIBus *pci_bus); void igd_write_opregion(XenPCIPassthroughState *s, uint32_t val); void xen_igd_passthrough_isa_bridge_create(XenPCIPassthroughState *s, XenHostPCIDevice *dev); @@ -75,6 +88,13 @@ typedef int (*xen_pt_conf_byte_read) #define XEN_PCI_INTEL_OPREGION 0xfc +#define XEN_PCI_IGD_DOMAIN 0 +#define XEN_PCI_IGD_BUS 0 +#define XEN_PCI_IGD_DEV 2 +#define XEN_PCI_IGD_FN 0 +#define XEN_PCI_IGD_SLOT_MASK \ + (1UL << PCI_SLOT(PCI_DEVFN(XEN_PCI_IGD_DEV, XEN_PCI_IGD_FN))) + typedef enum { XEN_PT_GRP_TYPE_HARDWIRED = 0, /* 0 Hardwired reg group */ XEN_PT_GRP_TYPE_EMU, /* emul reg group */ diff --git a/hw/xen/xen_pt_config_init.c b/hw/xen/xen_pt_config_init.c index cde898b744..2b8680b112 100644 --- a/hw/xen/xen_pt_config_init.c +++ b/hw/xen/xen_pt_config_init.c @@ -15,8 +15,8 @@ #include "qemu/osdep.h" #include "qapi/error.h" #include "qemu/timer.h" -#include "hw/xen/xen-legacy-backend.h" #include "xen_pt.h" +#include "hw/xen/xen-legacy-backend.h" #define XEN_PT_MERGE_VALUE(value, data, val_mask) \ (((value) & (val_mask)) | ((data) & ~(val_mask))) @@ -1924,7 +1924,7 @@ static void xen_pt_config_reg_init(XenPCIPassthroughState *s, if (reg->init) { uint32_t host_mask, size_mask; unsigned int offset; - uint32_t val; + uint32_t val = 0; /* initialize 
emulate register */ rc = reg->init(s, reg_entry->reg, diff --git a/hw/xen/xen_pt_graphics.c b/hw/xen/xen_pt_graphics.c index f303f67c9c..0aed3bb6fd 100644 --- a/hw/xen/xen_pt_graphics.c +++ b/hw/xen/xen_pt_graphics.c @@ -5,7 +5,6 @@ #include "qapi/error.h" #include "xen_pt.h" #include "xen-host-pci-device.h" -#include "hw/xen/xen-legacy-backend.h" static unsigned long igd_guest_opregion; static unsigned long igd_host_opregion; diff --git a/hw/xen/xen_pt_msi.c b/hw/xen/xen_pt_msi.c index b71563f98a..09cca4eecb 100644 --- a/hw/xen/xen_pt_msi.c +++ b/hw/xen/xen_pt_msi.c @@ -11,9 +11,9 @@ #include "qemu/osdep.h" -#include "hw/xen/xen-legacy-backend.h" -#include "xen_pt.h" #include "hw/i386/apic-msidef.h" +#include "xen_pt.h" +#include "hw/xen/xen-legacy-backend.h" #define XEN_PT_AUTO_ASSIGN -1 diff --git a/hw/xen/xen_pt_stub.c b/hw/xen/xen_pt_stub.c index 2d8cac8d54..5c108446a8 100644 --- a/hw/xen/xen_pt_stub.c +++ b/hw/xen/xen_pt_stub.c @@ -20,3 +20,7 @@ void xen_igd_gfx_pt_set(bool value, Error **errp) error_setg(errp, "Xen PCI passthrough support not built in"); } } + +void xen_igd_reserve_slot(PCIBus *pci_bus) +{ +} diff --git a/hw/xen/xen_pvdev.c b/hw/xen/xen_pvdev.c index 1a5177b354..be1504b82c 100644 --- a/hw/xen/xen_pvdev.c +++ b/hw/xen/xen_pvdev.c @@ -54,31 +54,17 @@ void xen_config_cleanup(void) struct xs_dirs *d; QTAILQ_FOREACH(d, &xs_cleanup, list) { - xs_rm(xenstore, 0, d->xs_dir); + qemu_xen_xs_destroy(xenstore, 0, d->xs_dir); } } int xenstore_mkdir(char *path, int p) { - struct xs_permissions perms[2] = { - { - .id = 0, /* set owner: dom0 */ - }, { - .id = xen_domid, - .perms = p, - } - }; - - if (!xs_mkdir(xenstore, 0, path)) { + if (!qemu_xen_xs_create(xenstore, 0, 0, xen_domid, p, path)) { xen_pv_printf(NULL, 0, "xs_mkdir %s: failed\n", path); return -1; } xenstore_cleanup_dir(g_strdup(path)); - - if (!xs_set_permissions(xenstore, 0, path, perms, 2)) { - xen_pv_printf(NULL, 0, "xs_set_permissions %s: failed\n", path); - return -1; - } return 0; } @@ 
-87,7 +73,7 @@ int xenstore_write_str(const char *base, const char *node, const char *val) char abspath[XEN_BUFSIZE]; snprintf(abspath, sizeof(abspath), "%s/%s", base, node); - if (!xs_write(xenstore, 0, abspath, val, strlen(val))) { + if (!qemu_xen_xs_write(xenstore, 0, abspath, val, strlen(val))) { return -1; } return 0; @@ -100,7 +86,7 @@ char *xenstore_read_str(const char *base, const char *node) char *str, *ret = NULL; snprintf(abspath, sizeof(abspath), "%s/%s", base, node); - str = xs_read(xenstore, 0, abspath, &len); + str = qemu_xen_xs_read(xenstore, 0, abspath, &len); if (str != NULL) { /* move to qemu-allocated memory to make sure * callers can savely g_free() stuff. */ @@ -152,29 +138,6 @@ int xenstore_read_uint64(const char *base, const char *node, uint64_t *uval) return rc; } -void xenstore_update(void *unused) -{ - char **vec = NULL; - intptr_t type, ops, ptr; - unsigned int dom, count; - - vec = xs_read_watch(xenstore, &count); - if (vec == NULL) { - goto cleanup; - } - - if (sscanf(vec[XS_WATCH_TOKEN], "be:%" PRIxPTR ":%d:%" PRIxPTR, - &type, &dom, &ops) == 3) { - xenstore_update_be(vec[XS_WATCH_PATH], (void *)type, dom, (void*)ops); - } - if (sscanf(vec[XS_WATCH_TOKEN], "fe:%" PRIxPTR, &ptr) == 1) { - xenstore_update_fe(vec[XS_WATCH_PATH], (void *)ptr); - } - -cleanup: - free(vec); -} - const char *xenbus_strstate(enum xenbus_state state) { static const char *const name[] = { @@ -238,14 +201,14 @@ void xen_pv_evtchn_event(void *opaque) struct XenLegacyDevice *xendev = opaque; evtchn_port_t port; - port = xenevtchn_pending(xendev->evtchndev); + port = qemu_xen_evtchn_pending(xendev->evtchndev); if (port != xendev->local_port) { xen_pv_printf(xendev, 0, "xenevtchn_pending returned %d (expected %d)\n", port, xendev->local_port); return; } - xenevtchn_unmask(xendev->evtchndev, port); + qemu_xen_evtchn_unmask(xendev->evtchndev, port); if (xendev->ops->event) { xendev->ops->event(xendev); @@ -257,15 +220,15 @@ void xen_pv_unbind_evtchn(struct 
XenLegacyDevice *xendev) if (xendev->local_port == -1) { return; } - qemu_set_fd_handler(xenevtchn_fd(xendev->evtchndev), NULL, NULL, NULL); - xenevtchn_unbind(xendev->evtchndev, xendev->local_port); + qemu_set_fd_handler(qemu_xen_evtchn_fd(xendev->evtchndev), NULL, NULL, NULL); + qemu_xen_evtchn_unbind(xendev->evtchndev, xendev->local_port); xen_pv_printf(xendev, 2, "unbind evtchn port %d\n", xendev->local_port); xendev->local_port = -1; } int xen_pv_send_notify(struct XenLegacyDevice *xendev) { - return xenevtchn_notify(xendev->evtchndev, xendev->local_port); + return qemu_xen_evtchn_notify(xendev->evtchndev, xendev->local_port); } /* ------------------------------------------------------------- */ @@ -299,17 +262,15 @@ void xen_pv_del_xendev(struct XenLegacyDevice *xendev) } if (xendev->fe) { - char token[XEN_BUFSIZE]; - snprintf(token, sizeof(token), "fe:%p", xendev); - xs_unwatch(xenstore, xendev->fe, token); + qemu_xen_xs_unwatch(xenstore, xendev->watch); g_free(xendev->fe); } if (xendev->evtchndev != NULL) { - xenevtchn_close(xendev->evtchndev); + qemu_xen_evtchn_close(xendev->evtchndev); } if (xendev->gnttabdev != NULL) { - xengnttab_close(xendev->gnttabdev); + qemu_xen_gnttab_close(xendev->gnttabdev); } QTAILQ_REMOVE(&xendevs, xendev, next); |