| author | Richard Henderson <richard.henderson@linaro.org> | 2025-10-06 08:14:03 -0700 |
|---|---|---|
| committer | Richard Henderson <richard.henderson@linaro.org> | 2025-10-06 08:14:03 -0700 |
| commit | 92a0dcbd751d771512b9dedd97e00553181b7699 (patch) | |
| tree | 9397cb6a11d637c3861e98e134925d550afbae68 /hw | |
| parent | 5cbb72eef0bd78be04cc1abbfbf5a0e00d807a23 (diff) | |
| parent | e209d4d7a31b9f82925a2205e6e19e61a3facbe0 (diff) | |
Merge tag 'for_upstream' of https://git.kernel.org/pub/scm/virt/kvm/mst/qemu into staging
virtio,pci,pc: features, fixes

users can now control VM bit in smbios.
vhost-user-device is now user-createable.
intel_iommu now supports PRI
virtio-net now supports GSO over UDP tunnel
ghes now supports error injection
amd iommu now supports dma remapping for vfio
better error messages for virtio
small fixes all over the place.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

# -----BEGIN PGP SIGNATURE-----
#
# iQFDBAABCgAtFiEEXQn9CHHI+FuUyooNKB8NuNKNVGkFAmji0s0PHG1zdEByZWRo
# YXQuY29tAAoJECgfDbjSjVRpuH4H/09h70IqAWZGHIWKGmmGGtdKOj3g54KuI0Ss
# mGECEsHvvBexOy670Qy8jdgXfaW4UuNui8BiOnJnGsBX8Y0dy+/yZori3KhkXkaY
# D57Ap9agkpHem7Vw0zgNsAF2bzDdlzTiQ6ns5oDnSq8yt82onCb5WGkWTGkPs/jL
# Gf8Jv+Ddcpt5SU4/hHPYC8pUhl7z4xPOOyl0Qp1GG21Pxf5v4sGFcWuGGB7UEPSQ
# MjZeoM0rSnLDtNg18sGwD5RPLQs13TbtgsVwijI79c3w3rcSpPNhGR5OWkdRCIYF
# 8A0Nhq0Yfo0ogTht7yt1QNPf/ktJkuoBuGVirvpDaix2tCBECes=
# =Zvq/
# -----END PGP SIGNATURE-----
# gpg: Signature made Sun 05 Oct 2025 01:19:25 PM PDT
# gpg:                using RSA key 5D09FD0871C8F85B94CA8A0D281F0DB8D28D5469
# gpg:                issuer "mst@redhat.com"
# gpg: Good signature from "Michael S. Tsirkin <mst@kernel.org>" [unknown]
# gpg:                 aka "Michael S. Tsirkin <mst@redhat.com>" [unknown]
# gpg: WARNING: The key's User ID is not certified with a trusted signature!
# gpg:          There is no indication that the signature belongs to the owner.
# Primary key fingerprint: 0270 606B 6F3C DF3D 0B17  0970 C350 3912 AFBE 8E67
#      Subkey fingerprint: 5D09 FD08 71C8 F85B 94CA  8A0D 281F 0DB8 D28D 5469

* tag 'for_upstream' of https://git.kernel.org/pub/scm/virt/kvm/mst/qemu: (75 commits)
  virtio: improve virtqueue mapping error messages
  pci: Fix wrong parameter passing to pci_device_get_iommu_bus_devfn()
  intel_iommu: Simplify caching mode check with VFIO device
  intel_iommu: Enable Enhanced Set Root Table Pointer Support (ESRTPS)
  vdpa-dev: add get_vhost() callback for vhost-vdpa device
  amd_iommu: HATDis/HATS=11 support
  intel-iommu: Move dma_translation to x86-iommu
  amd_iommu: Refactor amdvi_page_walk() to use common code for page walk
  amd_iommu: Do not assume passthrough translation when DTE[TV]=0
  amd_iommu: Toggle address translation mode on devtab entry invalidation
  amd_iommu: Add dma-remap property to AMD vIOMMU device
  amd_iommu: Set all address spaces to use passthrough mode on reset
  amd_iommu: Toggle memory regions based on address translation mode
  amd_iommu: Invalidate address translations on INVALIDATE_IOMMU_ALL
  amd_iommu: Add replay callback
  amd_iommu: Unmap all address spaces under the AMD IOMMU on reset
  amd_iommu: Use iova_tree records to determine large page size on UNMAP
  amd_iommu: Sync shadow page tables on page invalidation
  amd_iommu: Add basic structure to support IOMMU notifier updates
  amd_iommu: Add a page walker to sync shadow page tables on invalidation
  ...

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
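The GHES error-injection feature merged here is driven over QMP. A minimal session sketch follows, assuming the wire name inject-ghes-v2-error that standard QAPI convention would derive from the qmp_inject_ghes_v2_error() handler added in hw/acpi/ghes_cper.c, and assuming the argument is named cper (it carries a base64-encoded CPER record):

```
-> { "execute": "inject-ghes-v2-error",
     "arguments": { "cper": "<base64-encoded CPER record>" } }
<- { "return": {} }
```

On builds without CONFIG_GHES_CPER, the stub in hw/acpi/ghes_cper_stub.c fails the command with "GHES QMP error inject is not compiled in".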
Diffstat (limited to 'hw')
50 files changed, 2384 insertions, 582 deletions
diff --git a/hw/acpi/Kconfig b/hw/acpi/Kconfig
index 1d4e9f0845..daabbe6cd1 100644
--- a/hw/acpi/Kconfig
+++ b/hw/acpi/Kconfig
@@ -51,6 +51,11 @@ config ACPI_APEI
     bool
     depends on ACPI
 
+config GHES_CPER
+    bool
+    depends on ACPI_APEI
+    default y
+
 config ACPI_PCI
     bool
     depends on ACPI && PCI
diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c
index 1e685f982f..2d5826a8f1 100644
--- a/hw/acpi/aml-build.c
+++ b/hw/acpi/aml-build.c
@@ -2629,3 +2629,13 @@ Aml *aml_i2c_serial_bus_device(uint16_t address, const char *resource_source)
 
     return var;
 }
+
+/* ACPI 5.0b: 18.3.2.6.2 Event Notification For Generic Error Sources */
+Aml *aml_error_device(void)
+{
+    Aml *dev = aml_device(ACPI_APEI_ERROR_DEVICE);
+    aml_append(dev, aml_name_decl("_HID", aml_string("PNP0C33")));
+    aml_append(dev, aml_name_decl("_UID", aml_int(0)));
+
+    return dev;
+}
diff --git a/hw/acpi/generic_event_device.c b/hw/acpi/generic_event_device.c
index 95682b79a2..e7b773d84d 100644
--- a/hw/acpi/generic_event_device.c
+++ b/hw/acpi/generic_event_device.c
@@ -30,6 +30,7 @@ static const uint32_t ged_supported_events[] = {
     ACPI_GED_NVDIMM_HOTPLUG_EVT,
     ACPI_GED_CPU_HOTPLUG_EVT,
     ACPI_GED_PCI_HOTPLUG_EVT,
+    ACPI_GED_ERROR_EVT,
 };
 
 /*
@@ -120,6 +121,16 @@ void build_ged_aml(Aml *table, const char *name, HotplugHandler *hotplug_dev,
                        aml_notify(aml_name(ACPI_POWER_BUTTON_DEVICE),
                                   aml_int(0x80)));
             break;
+        case ACPI_GED_ERROR_EVT:
+            /*
+             * ACPI 5.0b: 5.6.6 Device Object Notifications
+             * Table 5-135 Error Device Notification Values
+             * Defines 0x80 as the value to be used on notifications
+             */
+            aml_append(if_ctx,
+                       aml_notify(aml_name(ACPI_APEI_ERROR_DEVICE),
+                                  aml_int(0x80)));
+            break;
         case ACPI_GED_NVDIMM_HOTPLUG_EVT:
             aml_append(if_ctx,
                        aml_notify(aml_name("\\_SB.NVDR"),
@@ -320,6 +331,8 @@ static void acpi_ged_send_event(AcpiDeviceIf *adev, AcpiEventStatusBits ev)
         sel = ACPI_GED_MEM_HOTPLUG_EVT;
     } else if (ev & ACPI_POWER_DOWN_STATUS) {
         sel = ACPI_GED_PWR_DOWN_EVT;
+    } else if (ev & ACPI_GENERIC_ERROR) {
+        sel = ACPI_GED_ERROR_EVT;
     } else if (ev & ACPI_NVDIMM_HOTPLUG_STATUS) {
         sel = ACPI_GED_NVDIMM_HOTPLUG_EVT;
     } else if (ev & ACPI_CPU_HOTPLUG_STATUS) {
@@ -349,6 +362,8 @@ static const Property acpi_ged_properties[] = {
                       pcihp_state.use_acpi_hotplug_bridge, 0),
     DEFINE_PROP_LINK("bus", AcpiGedState, pcihp_state.root,
                      TYPE_PCI_BUS, PCIBus *),
+    DEFINE_PROP_BOOL("x-has-hest-addr", AcpiGedState,
+                     ghes_state.use_hest_addr, true),
 };
 
 static const VMStateDescription vmstate_memhp_state = {
@@ -436,6 +451,34 @@ static const VMStateDescription vmstate_pcihp_state = {
     }
 };
 
+static const VMStateDescription vmstate_hest = {
+    .name = "acpi-hest",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .fields = (const VMStateField[]) {
+        VMSTATE_UINT64(hest_addr_le, AcpiGhesState),
+        VMSTATE_END_OF_LIST()
+    },
+};
+
+static bool hest_needed(void *opaque)
+{
+    AcpiGedState *s = opaque;
+    return s->ghes_state.hest_addr_le;
+}
+
+static const VMStateDescription vmstate_hest_state = {
+    .name = "acpi-ged/hest",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = hest_needed,
+    .fields = (const VMStateField[]) {
+        VMSTATE_STRUCT(ghes_state, AcpiGedState, 1,
+                       vmstate_hest, AcpiGhesState),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
 static const VMStateDescription vmstate_acpi_ged = {
     .name = "acpi-ged",
     .version_id = 1,
@@ -449,6 +492,7 @@ static const VMStateDescription vmstate_acpi_ged = {
         &vmstate_cpuhp_state,
         &vmstate_ghes_state,
         &vmstate_pcihp_state,
+        &vmstate_hest_state,
         NULL
     }
 };
diff --git a/hw/acpi/ghes-stub.c b/hw/acpi/ghes-stub.c
index 7cec1812da..40f660c246 100644
--- a/hw/acpi/ghes-stub.c
+++ b/hw/acpi/ghes-stub.c
@@ -11,12 +11,13 @@
 #include "qemu/osdep.h"
 #include "hw/acpi/ghes.h"
 
-int acpi_ghes_memory_errors(uint16_t source_id, uint64_t physical_address)
+int acpi_ghes_memory_errors(AcpiGhesState *ags, uint16_t source_id,
+                            uint64_t physical_address)
 {
     return -1;
 }
 
-bool acpi_ghes_present(void)
+AcpiGhesState *acpi_ghes_get_state(void)
 {
-    return false;
+    return NULL;
 }
diff --git a/hw/acpi/ghes.c b/hw/acpi/ghes.c
index b85bb48195..06555905ce 100644
--- a/hw/acpi/ghes.c
+++ b/hw/acpi/ghes.c
@@ -30,6 +30,7 @@
 
 #define ACPI_HW_ERROR_FW_CFG_FILE       "etc/hardware_errors"
 #define ACPI_HW_ERROR_ADDR_FW_CFG_FILE  "etc/hardware_errors_addr"
+#define ACPI_HEST_ADDR_FW_CFG_FILE      "etc/acpi_table_hest_addr"
 
 /* The max size in bytes for one error block */
 #define ACPI_GHES_MAX_RAW_DATA_LENGTH   (1 * KiB)
@@ -41,6 +42,12 @@
 #define GAS_ADDR_OFFSET 4
 
 /*
+ * ACPI spec 1.0b
+ * 5.2.3 System Description Table Header
+ */
+#define ACPI_DESC_HEADER_OFFSET 36
+
+/*
  * The total size of Generic Error Data Entry
  * ACPI 6.1/6.2: 18.3.2.7.1 Generic Error Data,
  * Table 18-343 Generic Error Data Entry
@@ -61,6 +68,30 @@
 #define ACPI_GHES_GESB_SIZE 20
 
 /*
+ * See the memory layout map at docs/specs/acpi_hest_ghes.rst.
+ */
+
+/*
+ * ACPI 6.1: 18.3.2.8 Generic Hardware Error Source version 2
+ * Table 18-344 Generic Hardware Error Source version 2 (GHESv2) Structure
+ */
+#define HEST_GHES_V2_ENTRY_SIZE 92
+
+/*
+ * ACPI 6.1: 18.3.2.8 Generic Hardware Error Source version 2
+ * Table 18-344 Generic Hardware Error Source version 2 (GHESv2) Structure
+ * Read Ack Register
+ */
+#define GHES_READ_ACK_ADDR_OFF 64
+
+/*
+ * ACPI 6.1: 18.3.2.7: Generic Hardware Error Source
+ * Table 18-341 Generic Hardware Error Source Structure
+ * Error Status Address
+ */
+#define GHES_ERR_STATUS_ADDR_OFF 20
+
+/*
  * Values for error_severity field
  */
 enum AcpiGenericErrorSeverity {
@@ -206,17 +237,18 @@ ghes_gen_err_data_uncorrectable_recoverable(GArray *block,
  * Initialize "etc/hardware_errors" and "etc/hardware_errors_addr" fw_cfg blobs.
  * See docs/specs/acpi_hest_ghes.rst for blobs format.
  */
-static void build_ghes_error_table(GArray *hardware_errors, BIOSLinker *linker)
+static void build_ghes_error_table(AcpiGhesState *ags, GArray *hardware_errors,
+                                   BIOSLinker *linker, int num_sources)
 {
     int i, error_status_block_offset;
 
     /* Build error_block_address */
-    for (i = 0; i < ACPI_GHES_ERROR_SOURCE_COUNT; i++) {
+    for (i = 0; i < num_sources; i++) {
         build_append_int_noprefix(hardware_errors, 0, sizeof(uint64_t));
     }
 
     /* Build read_ack_register */
-    for (i = 0; i < ACPI_GHES_ERROR_SOURCE_COUNT; i++) {
+    for (i = 0; i < num_sources; i++) {
         /*
          * Initialize the value of read_ack_register to 1, so GHES can be
          * writable after (re)boot.
@@ -231,13 +263,13 @@ static void build_ghes_error_table(GArray *hardware_errors, BIOSLinker *linker)
 
     /* Reserve space for Error Status Data Block */
     acpi_data_push(hardware_errors,
-                   ACPI_GHES_MAX_RAW_DATA_LENGTH * ACPI_GHES_ERROR_SOURCE_COUNT);
+                   ACPI_GHES_MAX_RAW_DATA_LENGTH * num_sources);
 
     /* Tell guest firmware to place hardware_errors blob into RAM */
     bios_linker_loader_alloc(linker, ACPI_HW_ERROR_FW_CFG_FILE,
                              hardware_errors, sizeof(uint64_t), false);
 
-    for (i = 0; i < ACPI_GHES_ERROR_SOURCE_COUNT; i++) {
+    for (i = 0; i < num_sources; i++) {
         /*
          * Tell firmware to patch error_block_address entries to point to
          * corresponding "Generic Error Status Block"
@@ -251,22 +283,26 @@ static void build_ghes_error_table(GArray *hardware_errors, BIOSLinker *linker)
                                        i * ACPI_GHES_MAX_RAW_DATA_LENGTH);
     }
 
-    /*
-     * tell firmware to write hardware_errors GPA into
-     * hardware_errors_addr fw_cfg, once the former has been initialized.
-     */
-    bios_linker_loader_write_pointer(linker, ACPI_HW_ERROR_ADDR_FW_CFG_FILE, 0,
-                                     sizeof(uint64_t),
-                                     ACPI_HW_ERROR_FW_CFG_FILE, 0);
+    if (!ags->use_hest_addr) {
+        /*
+         * Tell firmware to write hardware_errors GPA into
+         * hardware_errors_addr fw_cfg, once the former has been initialized.
+         */
+        bios_linker_loader_write_pointer(linker, ACPI_HW_ERROR_ADDR_FW_CFG_FILE,
+                                         0, sizeof(uint64_t),
+                                         ACPI_HW_ERROR_FW_CFG_FILE, 0);
+    }
 }
 
 /* Build Generic Hardware Error Source version 2 (GHESv2) */
-static void build_ghes_v2(GArray *table_data,
-                          BIOSLinker *linker,
-                          enum AcpiGhesNotifyType notify,
-                          uint16_t source_id)
+static void build_ghes_v2_entry(GArray *table_data,
+                                BIOSLinker *linker,
+                                const AcpiNotificationSourceId *notif_src,
+                                uint16_t index, int num_sources)
 {
     uint64_t address_offset;
+    const uint16_t notify = notif_src->notify;
+    const uint16_t source_id = notif_src->source_id;
 
     /*
      * Type:
@@ -297,7 +333,7 @@ static void build_ghes_v2(GArray *table_data,
                                    address_offset + GAS_ADDR_OFFSET,
                                    sizeof(uint64_t),
                                    ACPI_HW_ERROR_FW_CFG_FILE,
-                                   source_id * sizeof(uint64_t));
+                                   index * sizeof(uint64_t));
 
     /* Notification Structure */
     build_ghes_hw_error_notification(table_data, notify);
@@ -317,8 +353,7 @@ static void build_ghes_v2(GArray *table_data,
                                    address_offset + GAS_ADDR_OFFSET,
                                    sizeof(uint64_t),
                                    ACPI_HW_ERROR_FW_CFG_FILE,
-                                   (ACPI_GHES_ERROR_SOURCE_COUNT + source_id)
-                                   * sizeof(uint64_t));
+                                   (num_sources + index) * sizeof(uint64_t));
 
     /*
      * Read Ack Preserve field
@@ -331,23 +366,42 @@ static void build_ghes_v2(GArray *table_data,
 }
 
 /* Build Hardware Error Source Table */
-void acpi_build_hest(GArray *table_data, GArray *hardware_errors,
+void acpi_build_hest(AcpiGhesState *ags, GArray *table_data,
+                     GArray *hardware_errors,
                      BIOSLinker *linker,
+                     const AcpiNotificationSourceId *notif_source,
+                     int num_sources,
                      const char *oem_id, const char *oem_table_id)
 {
     AcpiTable table = { .sig = "HEST", .rev = 1,
                         .oem_id = oem_id, .oem_table_id = oem_table_id };
+    uint32_t hest_offset;
+    int i;
 
-    build_ghes_error_table(hardware_errors, linker);
+    hest_offset = table_data->len;
+
+    build_ghes_error_table(ags, hardware_errors, linker, num_sources);
 
     acpi_table_begin(&table, table_data);
 
     /* Error Source Count */
-    build_append_int_noprefix(table_data, ACPI_GHES_ERROR_SOURCE_COUNT, 4);
-    build_ghes_v2(table_data, linker,
-                  ACPI_GHES_NOTIFY_SEA, ACPI_HEST_SRC_ID_SEA);
+    build_append_int_noprefix(table_data, num_sources, 4);
+    for (i = 0; i < num_sources; i++) {
+        build_ghes_v2_entry(table_data, linker, &notif_source[i], i,
+                            num_sources);
+    }
 
     acpi_table_end(linker, &table);
+
+    if (ags->use_hest_addr) {
+        /*
+         * Tell firmware to write into GPA the address of HEST via fw_cfg,
+         * once initialized.
+         */
+        bios_linker_loader_write_pointer(linker,
+                                         ACPI_HEST_ADDR_FW_CFG_FILE, 0,
+                                         sizeof(uint64_t),
+                                         ACPI_BUILD_TABLE_FILE, hest_offset);
+    }
 }
 
 void acpi_ghes_add_fw_cfg(AcpiGhesState *ags, FWCfgState *s,
@@ -357,21 +411,20 @@
     fw_cfg_add_file(s, ACPI_HW_ERROR_FW_CFG_FILE, hardware_error->data,
                     hardware_error->len);
 
-    /* Create a read-write fw_cfg file for Address */
-    fw_cfg_add_file_callback(s, ACPI_HW_ERROR_ADDR_FW_CFG_FILE, NULL, NULL,
-        NULL, &(ags->hw_error_le), sizeof(ags->hw_error_le), false);
-
-    ags->present = true;
+    if (ags->use_hest_addr) {
+        fw_cfg_add_file_callback(s, ACPI_HEST_ADDR_FW_CFG_FILE, NULL, NULL,
+            NULL, &(ags->hest_addr_le), sizeof(ags->hest_addr_le), false);
+    } else {
+        /* Create a read-write fw_cfg file for Address */
+        fw_cfg_add_file_callback(s, ACPI_HW_ERROR_ADDR_FW_CFG_FILE, NULL, NULL,
+            NULL, &(ags->hw_error_le), sizeof(ags->hw_error_le), false);
+    }
 }
 
 static void get_hw_error_offsets(uint64_t ghes_addr,
                                  uint64_t *cper_addr,
                                  uint64_t *read_ack_register_addr)
 {
-    if (!ghes_addr) {
-        return;
-    }
-
     /*
      * non-HEST version supports only one source, so no need to change
      * the start offset based on the source ID. Also, we can't validate
@@ -390,35 +443,94 @@ static void get_hw_error_offsets(uint64_t ghes_addr,
     *read_ack_register_addr = ghes_addr + sizeof(uint64_t);
 }
 
-static void ghes_record_cper_errors(const void *cper, size_t len,
-                                    uint16_t source_id, Error **errp)
+static void get_ghes_source_offsets(uint16_t source_id,
+                                    uint64_t hest_addr,
+                                    uint64_t *cper_addr,
+                                    uint64_t *read_ack_start_addr,
+                                    Error **errp)
 {
-    uint64_t cper_addr = 0, read_ack_register_addr = 0, read_ack_register;
-    AcpiGedState *acpi_ged_state;
-    AcpiGhesState *ags;
+    uint64_t hest_err_block_addr, hest_read_ack_addr;
+    uint64_t err_source_entry, error_block_addr;
+    uint32_t num_sources, i;
 
-    if (len > ACPI_GHES_MAX_RAW_DATA_LENGTH) {
-        error_setg(errp, "GHES CPER record is too big: %zd", len);
-        return;
-    }
+    hest_addr += ACPI_DESC_HEADER_OFFSET;
 
-    acpi_ged_state = ACPI_GED(object_resolve_path_type("", TYPE_ACPI_GED,
-                                                       NULL));
-    if (!acpi_ged_state) {
-        error_setg(errp, "Can't find ACPI_GED object");
+    cpu_physical_memory_read(hest_addr, &num_sources,
+                             sizeof(num_sources));
+    num_sources = le32_to_cpu(num_sources);
+
+    err_source_entry = hest_addr + sizeof(num_sources);
+
+    /*
+     * Currently, HEST Error source navigates only for GHESv2 tables
+     */
+    for (i = 0; i < num_sources; i++) {
+        uint64_t addr = err_source_entry;
+        uint16_t type, src_id;
+
+        cpu_physical_memory_read(addr, &type, sizeof(type));
+        type = le16_to_cpu(type);
+
+        /* For now, we only know the size of GHESv2 table */
+        if (type != ACPI_GHES_SOURCE_GENERIC_ERROR_V2) {
+            error_setg(errp, "HEST: type %d not supported.", type);
+            return;
+        }
+
+        /* Compare CPER source ID at the GHESv2 structure */
+        addr += sizeof(type);
+        cpu_physical_memory_read(addr, &src_id, sizeof(src_id));
+        if (le16_to_cpu(src_id) == source_id) {
+            break;
+        }
+
+        err_source_entry += HEST_GHES_V2_ENTRY_SIZE;
+    }
+    if (i == num_sources) {
+        error_setg(errp, "HEST: Source %d not found.", source_id);
         return;
     }
-    ags = &acpi_ged_state->ghes_state;
 
-    assert(ACPI_GHES_ERROR_SOURCE_COUNT == 1);
-    get_hw_error_offsets(le64_to_cpu(ags->hw_error_le),
-                         &cper_addr, &read_ack_register_addr);
+    /* Navigate through table address pointers */
+    hest_err_block_addr = err_source_entry + GHES_ERR_STATUS_ADDR_OFF +
+                          GAS_ADDR_OFFSET;
+
+    cpu_physical_memory_read(hest_err_block_addr, &error_block_addr,
+                             sizeof(error_block_addr));
+    error_block_addr = le64_to_cpu(error_block_addr);
 
-    if (!cper_addr) {
-        error_setg(errp, "can not find Generic Error Status Block");
+    cpu_physical_memory_read(error_block_addr, cper_addr,
+                             sizeof(*cper_addr));
+    *cper_addr = le64_to_cpu(*cper_addr);
+
+    hest_read_ack_addr = err_source_entry + GHES_READ_ACK_ADDR_OFF +
+                         GAS_ADDR_OFFSET;
+    cpu_physical_memory_read(hest_read_ack_addr, read_ack_start_addr,
+                             sizeof(*read_ack_start_addr));
+    *read_ack_start_addr = le64_to_cpu(*read_ack_start_addr);
+}
+
+NotifierList acpi_generic_error_notifiers =
+    NOTIFIER_LIST_INITIALIZER(acpi_generic_error_notifiers);
+
+void ghes_record_cper_errors(AcpiGhesState *ags, const void *cper, size_t len,
+                             uint16_t source_id, Error **errp)
+{
+    uint64_t cper_addr = 0, read_ack_register_addr = 0, read_ack_register;
+
+    if (len > ACPI_GHES_MAX_RAW_DATA_LENGTH) {
+        error_setg(errp, "GHES CPER record is too big: %zd", len);
         return;
     }
 
+    if (!ags->use_hest_addr) {
+        get_hw_error_offsets(le64_to_cpu(ags->hw_error_le),
+                             &cper_addr, &read_ack_register_addr);
+    } else {
+        get_ghes_source_offsets(source_id, le64_to_cpu(ags->hest_addr_le),
+                                &cper_addr, &read_ack_register_addr, errp);
+    }
+
     cpu_physical_memory_read(read_ack_register_addr, &read_ack_register,
                              sizeof(read_ack_register));
@@ -440,9 +552,12 @@ static void ghes_record_cper_errors(const void *cper, size_t len,
 
     /* Write the generic error data entry into guest memory */
     cpu_physical_memory_write(cper_addr, cper, len);
+
+    notifier_list_notify(&acpi_generic_error_notifiers, &source_id);
 }
 
-int acpi_ghes_memory_errors(uint16_t source_id, uint64_t physical_address)
+int acpi_ghes_memory_errors(AcpiGhesState *ags, uint16_t source_id,
+                            uint64_t physical_address)
 {
     /* Memory Error Section Type */
     const uint8_t guid[] =
@@ -468,7 +583,7 @@ int acpi_ghes_memory_errors(uint16_t source_id, uint64_t physical_address)
     acpi_ghes_build_append_mem_cper(block, physical_address);
 
     /* Report the error */
-    ghes_record_cper_errors(block->data, block->len, source_id, &errp);
+    ghes_record_cper_errors(ags, block->data, block->len, source_id, &errp);
 
     g_array_free(block, true);
 
@@ -480,7 +595,7 @@ int acpi_ghes_memory_errors(uint16_t source_id, uint64_t physical_address)
     return 0;
 }
 
-bool acpi_ghes_present(void)
+AcpiGhesState *acpi_ghes_get_state(void)
 {
     AcpiGedState *acpi_ged_state;
     AcpiGhesState *ags;
@@ -489,8 +604,12 @@
                                                        NULL));
 
     if (!acpi_ged_state) {
-        return false;
+        return NULL;
     }
     ags = &acpi_ged_state->ghes_state;
-    return ags->present;
+
+    if (!ags->hw_error_le && !ags->hest_addr_le) {
+        return NULL;
+    }
+    return ags;
 }
diff --git a/hw/acpi/ghes_cper.c b/hw/acpi/ghes_cper.c
new file mode 100644
index 0000000000..31cb2ffabe
--- /dev/null
+++ b/hw/acpi/ghes_cper.c
@@ -0,0 +1,40 @@
+/*
+ * CPER payload parser for error injection
+ *
+ * Copyright(C) 2024-2025 Huawei LTD.
+ *
+ * This code is licensed under the GPL version 2 or later. See the
+ * COPYING file in the top-level directory.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+
+#include "qemu/base64.h"
+#include "qemu/error-report.h"
+#include "qemu/uuid.h"
+#include "qapi/qapi-commands-acpi-hest.h"
+#include "hw/acpi/ghes.h"
+
+void qmp_inject_ghes_v2_error(const char *qmp_cper, Error **errp)
+{
+    AcpiGhesState *ags;
+    uint8_t *cper;
+    size_t len;
+
+    ags = acpi_ghes_get_state();
+    if (!ags) {
+        return;
+    }
+
+    cper = qbase64_decode(qmp_cper, -1, &len, errp);
+    if (!cper) {
+        error_setg(errp, "missing GHES CPER payload");
+        return;
+    }
+
+    ghes_record_cper_errors(ags, cper, len, ACPI_HEST_SRC_ID_QMP, errp);
+
+    g_free(cper);
+}
diff --git a/hw/acpi/ghes_cper_stub.c b/hw/acpi/ghes_cper_stub.c
new file mode 100644
index 0000000000..b16be73502
--- /dev/null
+++ b/hw/acpi/ghes_cper_stub.c
@@ -0,0 +1,20 @@
+/*
+ * Stub interface for CPER payload parser for error injection
+ *
+ * Copyright(C) 2024-2025 Huawei LTD.
+ *
+ * This code is licensed under the GPL version 2 or later. See the
+ * COPYING file in the top-level directory.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qapi/qapi-commands-acpi-hest.h"
+#include "hw/acpi/ghes.h"
+
+void qmp_inject_ghes_v2_error(const char *cper, Error **errp)
+{
+    error_setg(errp, "GHES QMP error inject is not compiled in");
+}
diff --git a/hw/acpi/meson.build b/hw/acpi/meson.build
index 73f02b9691..56b5d1ec96 100644
--- a/hw/acpi/meson.build
+++ b/hw/acpi/meson.build
@@ -34,4 +34,6 @@ endif
 system_ss.add(when: 'CONFIG_ACPI', if_false: files('acpi-stub.c', 'aml-build-stub.c', 'ghes-stub.c', 'acpi_interface.c'))
 system_ss.add(when: 'CONFIG_ACPI_PCI_BRIDGE', if_false: files('pci-bridge-stub.c'))
 system_ss.add_all(when: 'CONFIG_ACPI', if_true: acpi_ss)
+system_ss.add(when: 'CONFIG_GHES_CPER', if_true: files('ghes_cper.c'))
+system_ss.add(when: 'CONFIG_GHES_CPER', if_false: files('ghes_cper_stub.c'))
 system_ss.add(files('acpi-qmp-cmds.c'))
diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c
index 96830f7c4e..8bb6b60515 100644
--- a/hw/arm/virt-acpi-build.c
+++ b/hw/arm/virt-acpi-build.c
@@ -1066,6 +1066,7 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
     }
 
     acpi_dsdt_add_power_button(scope);
+    aml_append(scope, aml_error_device());
 #ifdef CONFIG_TPM
     acpi_dsdt_add_tpm(scope, vms);
 #endif
@@ -1125,6 +1126,15 @@ static void acpi_align_size(GArray *blob, unsigned align)
     g_array_set_size(blob, ROUND_UP(acpi_data_len(blob), align));
 }
 
+static const AcpiNotificationSourceId hest_ghes_notify[] = {
+    { ACPI_HEST_SRC_ID_SYNC, ACPI_GHES_NOTIFY_SEA },
+    { ACPI_HEST_SRC_ID_QMP, ACPI_GHES_NOTIFY_GPIO },
+};
+
+static const AcpiNotificationSourceId hest_ghes_notify_10_0[] = {
+    { ACPI_HEST_SRC_ID_SYNC, ACPI_GHES_NOTIFY_SEA },
+};
+
 static void virt_acpi_build(VirtMachineState *vms, AcpiBuildTables *tables)
 {
@@ -1181,9 +1191,28 @@ void virt_acpi_build(VirtMachineState *vms, AcpiBuildTables *tables)
     build_dbg2(tables_blob, tables->linker, vms);
 
     if (vms->ras) {
-        acpi_add_table(table_offsets, tables_blob);
-        acpi_build_hest(tables_blob, tables->hardware_errors, tables->linker,
-                        vms->oem_id, vms->oem_table_id);
+        AcpiGedState *acpi_ged_state;
+        static const AcpiNotificationSourceId *notify;
+        unsigned int notify_sz;
+        AcpiGhesState *ags;
+
+        acpi_ged_state = ACPI_GED(vms->acpi_dev);
+        ags = &acpi_ged_state->ghes_state;
+        if (ags) {
+            acpi_add_table(table_offsets, tables_blob);
+
+            if (!ags->use_hest_addr) {
+                notify = hest_ghes_notify_10_0;
+                notify_sz = ARRAY_SIZE(hest_ghes_notify_10_0);
+            } else {
+                notify = hest_ghes_notify;
+                notify_sz = ARRAY_SIZE(hest_ghes_notify);
+            }
+
+            acpi_build_hest(ags, tables_blob, tables->hardware_errors,
+                            tables->linker, notify, notify_sz,
+                            vms->oem_id, vms->oem_table_id);
+        }
     }
 
     if (ms->numa_state->num_nodes > 0) {
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 02209fadcf..175023897a 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -693,7 +693,7 @@ static inline DeviceState *create_acpi_ged(VirtMachineState *vms)
     MachineState *ms = MACHINE(vms);
     SysBusDevice *sbdev;
     int irq = vms->irqmap[VIRT_ACPI_GED];
-    uint32_t event = ACPI_GED_PWR_DOWN_EVT;
+    uint32_t event = ACPI_GED_PWR_DOWN_EVT | ACPI_GED_ERROR_EVT;
     bool acpi_pcihp;
 
     if (ms->ram_slots) {
@@ -1050,6 +1050,20 @@ static void virt_powerdown_req(Notifier *n, void *opaque)
     }
 }
 
+static void virt_generic_error_req(Notifier *n, void *opaque)
+{
+    uint16_t *source_id = opaque;
+
+    /* Currently, only QMP source ID is async */
+    if (*source_id != ACPI_HEST_SRC_ID_QMP) {
+        return;
+    }
+
+    VirtMachineState *s = container_of(n, VirtMachineState,
+                                       generic_error_notifier);
+
+    acpi_send_event(s->acpi_dev, ACPI_GENERIC_ERROR);
+}
+
 static void create_gpio_keys(char *fdt, DeviceState *pl061_dev,
                              uint32_t phandle)
 {
@@ -2500,6 +2514,9 @@ static void machvirt_init(MachineState *machine)
 
     if (has_ged && aarch64 && firmware_loaded && virt_is_acpi_enabled(vms)) {
         vms->acpi_dev = create_acpi_ged(vms);
+        vms->generic_error_notifier.notify = virt_generic_error_req;
+        notifier_list_add(&acpi_generic_error_notifiers,
+                          &vms->generic_error_notifier);
     } else {
         create_gpio_devices(vms, VIRT_GPIO, sysmem);
     }
@@ -3520,6 +3537,7 @@ DEFINE_VIRT_MACHINE_AS_LATEST(10, 2)
 static void virt_machine_10_1_options(MachineClass *mc)
 {
     virt_machine_10_2_options(mc);
+    mc->smbios_memory_device_size = 2047 * TiB;
     compat_props_add(mc->compat_props, hw_compat_10_1, hw_compat_10_1_len);
 }
 DEFINE_VIRT_MACHINE(10, 1)
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index 9bab2716c1..64efce4846 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -62,11 +62,7 @@ void virtio_blk_req_complete(VirtIOBlockReq *req, unsigned char status)
     iov_discard_undo(&req->inhdr_undo);
     iov_discard_undo(&req->outhdr_undo);
     virtqueue_push(req->vq, &req->elem, req->in_len);
-    if (qemu_in_iothread()) {
-        virtio_notify_irqfd(vdev, req->vq);
-    } else {
-        virtio_notify(vdev, req->vq);
-    }
+    virtio_notify(vdev, req->vq);
 }
 
 static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error,
diff --git a/hw/core/machine.c b/hw/core/machine.c
index 38c949c4f2..681adbb7ac 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -35,9 +35,12 @@
 #include "hw/virtio/virtio-pci.h"
 #include "hw/virtio/virtio-net.h"
 #include "hw/virtio/virtio-iommu.h"
+#include "hw/acpi/generic_event_device.h"
 #include "audio/audio.h"
 
-GlobalProperty hw_compat_10_1[] = {};
+GlobalProperty hw_compat_10_1[] = {
+    { TYPE_ACPI_GED, "x-has-hest-addr", "false" },
+};
 const size_t hw_compat_10_1_len = G_N_ELEMENTS(hw_compat_10_1);
 
 GlobalProperty hw_compat_10_0[] = {
@@ -1115,8 +1118,11 @@ static void machine_class_init(ObjectClass *oc, const void *data)
      * SMBIOS 3.1.0 7.18.5 Memory Device — Extended Size
      * use max possible value that could be encoded into
      * 'Extended Size' field (2047Tb).
+     *
+     * Unfortunately (current) Windows Server 2025 and earlier do not handle
+     * 4Tb+ DIMM size.
      */
-    mc->smbios_memory_device_size = 2047 * TiB;
+    mc->smbios_memory_device_size = 2 * TiB;
 
     /* numa node memory size aligned on 8MB by default.
      * On Linux, each node's border has to be 8MB aligned
diff --git a/hw/core/qdev.c b/hw/core/qdev.c
index f600226176..fab42a7270 100644
--- a/hw/core/qdev.c
+++ b/hw/core/qdev.c
@@ -411,6 +411,35 @@ char *qdev_get_dev_path(DeviceState *dev)
     return NULL;
 }
 
+const char *qdev_get_printable_name(DeviceState *vdev)
+{
+    /*
+     * Return device ID if explicitly set
+     * (e.g. -device virtio-blk-pci,id=foo)
+     * This allows users to correlate errors with their custom device
+     * names.
+     */
+    if (vdev->id) {
+        return vdev->id;
+    }
+
+    /*
+     * Fall back to the canonical QOM device path (e.g. ID for PCI
+     * devices).
+     * This ensures the device is still uniquely and meaningfully
+     * identified.
+     */
+    const char *path = qdev_get_dev_path(vdev);
+    if (path) {
+        return path;
+    }
+
+    /*
+     * Final fallback: if all else fails, return a placeholder string.
+     * This ensures the error message always contains a valid string.
+     */
+    return "<unknown device>";
+}
+
 void qdev_add_unplug_blocker(DeviceState *dev, Error *reason)
 {
     dev->unplug_blockers = g_slist_prepend(dev->unplug_blockers, reason);
diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index 423c4959fe..9446a9f862 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -1863,7 +1863,11 @@ build_amd_iommu(GArray *table_data, BIOSLinker *linker, const char *oem_id,
     /* IOMMU info */
     build_append_int_noprefix(table_data, 0, 2);
     /* IOMMU Attributes */
-    build_append_int_noprefix(table_data, 0, 4);
+    if (!s->iommu.dma_translation) {
+        build_append_int_noprefix(table_data, (1UL << 0) /* HATDis */, 4);
+    } else {
+        build_append_int_noprefix(table_data, 0, 4);
+    }
     /* EFR Register Image */
     build_append_int_noprefix(table_data,
                               amdvi_extended_feature_register(s),
diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
index 26be69bec8..378e0cb55e 100644
--- a/hw/i386/amd_iommu.c
+++ b/hw/i386/amd_iommu.c
@@ -33,6 +33,7 @@
 #include "hw/i386/apic-msidef.h"
 #include "hw/qdev-properties.h"
 #include "kvm/kvm_i386.h"
+#include "qemu/iova-tree.h"
 
 /* used AMD-Vi MMIO registers */
 const char *amdvi_mmio_low[] = {
@@ -66,6 +67,15 @@ struct AMDVIAddressSpace {
     MemoryRegion iommu_nodma;   /* Alias of shared nodma memory region */
     MemoryRegion iommu_ir;      /* Device's interrupt remapping region */
     AddressSpace as;            /* device's corresponding address space */
+
+    /* DMA address translation support */
+    IOMMUNotifierFlag notifier_flags;
+    /* entry in list of Address spaces with registered notifiers */
+    QLIST_ENTRY(AMDVIAddressSpace) next;
+    /* Record DMA translation ranges */
+    IOVATree *iova_tree;
+    /* DMA address translation active */
+    bool addr_translation;
 };
 
 /* AMDVI cache entry */
@@ -77,12 +87,29 @@ typedef struct AMDVIIOTLBEntry {
     uint64_t page_mask;         /* physical page size */
 } AMDVIIOTLBEntry;
 
+/*
+ * These 'fault' reasons have an overloaded meaning since they are not only
+ * intended for describing reasons that generate an IO_PAGE_FAULT as per the
+ * AMD IOMMU specification, but are also used to signal internal errors in
+ * the emulation code.
+ */
+typedef enum AMDVIFaultReason {
+    AMDVI_FR_DTE_RTR_ERR = 1,  /* Failure to retrieve DTE */
+    AMDVI_FR_DTE_V,            /* DTE[V] = 0 */
+    AMDVI_FR_DTE_TV,           /* DTE[TV] = 0 */
+    AMDVI_FR_PT_ROOT_INV,      /* Page Table Root ptr invalid */
+    AMDVI_FR_PT_ENTRY_INV,     /* Failure to read PTE from guest memory */
+} AMDVIFaultReason;
+
 uint64_t amdvi_extended_feature_register(AMDVIState *s)
 {
     uint64_t feature = AMDVI_DEFAULT_EXT_FEATURES;
+
     if (s->xtsup) {
         feature |= AMDVI_FEATURE_XT;
     }
+    if (!s->iommu.dma_translation) {
+        feature |= AMDVI_HATS_MODE_RESERVED;
+    }
 
     return feature;
 }
@@ -438,18 +465,704 @@ static void amdvi_completion_wait(AMDVIState *s, uint64_t *cmd)
     trace_amdvi_completion_wait(addr, data);
 }
 
+static inline uint64_t amdvi_get_perms(uint64_t entry)
+{
+    return (entry & (AMDVI_DEV_PERM_READ | AMDVI_DEV_PERM_WRITE)) >>
+           AMDVI_DEV_PERM_SHIFT;
+}
+
+/* validate that reserved bits are honoured */
+static bool amdvi_validate_dte(AMDVIState *s, uint16_t devid,
+                               uint64_t *dte)
+{
+    uint64_t root;
+
+    if ((dte[0] & AMDVI_DTE_QUAD0_RESERVED) ||
+        (dte[1] & AMDVI_DTE_QUAD1_RESERVED) ||
+        (dte[2] & AMDVI_DTE_QUAD2_RESERVED) ||
+        (dte[3] & AMDVI_DTE_QUAD3_RESERVED)) {
+        amdvi_log_illegaldevtab_error(s, devid,
+                                      s->devtab +
+                                      devid * AMDVI_DEVTAB_ENTRY_SIZE, 0);
+        return false;
+    }
+
+    /*
+     * 1 = Host Address Translation is not supported. Value in MMIO Offset
+     * 0030h[HATS] is not meaningful. A non-zero host page table root pointer
+     * in the DTE would result in an ILLEGAL_DEV_TABLE_ENTRY event.
+     */
+    root = (dte[0] & AMDVI_DEV_PT_ROOT_MASK) >> 12;
+    if (root && !s->iommu.dma_translation) {
+        amdvi_log_illegaldevtab_error(s, devid,
+                                      s->devtab +
+                                      devid * AMDVI_DEVTAB_ENTRY_SIZE, 0);
+        return false;
+    }
+
+    return true;
+}
+
+/* get a device table entry given the devid */
+static bool amdvi_get_dte(AMDVIState *s, int devid, uint64_t *entry)
+{
+    uint32_t offset = devid * AMDVI_DEVTAB_ENTRY_SIZE;
+
+    if (dma_memory_read(&address_space_memory, s->devtab + offset, entry,
+                        AMDVI_DEVTAB_ENTRY_SIZE, MEMTXATTRS_UNSPECIFIED)) {
+        trace_amdvi_dte_get_fail(s->devtab, offset);
+        /* log error accessing dte */
+        amdvi_log_devtab_error(s, devid, s->devtab + offset, 0);
+        return false;
+    }
+
+    *entry = le64_to_cpu(*entry);
+    if (!amdvi_validate_dte(s, devid, entry)) {
+        trace_amdvi_invalid_dte(entry[0]);
+        return false;
+    }
+
+    return true;
+}
+
+/* get pte translation mode */
+static inline uint8_t get_pte_translation_mode(uint64_t pte)
+{
+    return (pte >> AMDVI_DEV_MODE_RSHIFT) & AMDVI_DEV_MODE_MASK;
+}
+
+static inline uint64_t amdvi_get_pte_entry(AMDVIState *s, uint64_t pte_addr,
+                                           uint16_t devid)
+{
+    uint64_t pte;
+
+    if (dma_memory_read(&address_space_memory, pte_addr,
+                        &pte, sizeof(pte), MEMTXATTRS_UNSPECIFIED)) {
+        trace_amdvi_get_pte_hwerror(pte_addr);
+        amdvi_log_pagetab_error(s, devid, pte_addr, 0);
+        pte = (uint64_t)-1;
+        return pte;
+    }
+
+    pte = le64_to_cpu(pte);
+    return pte;
+}
+
+static int amdvi_as_to_dte(AMDVIAddressSpace *as, uint64_t *dte)
+{
+    uint16_t devid = PCI_BUILD_BDF(as->bus_num, as->devfn);
+    AMDVIState *s = as->iommu_state;
+
+    if (!amdvi_get_dte(s, devid, dte)) {
+        /* Unable to retrieve DTE for devid */
+        return -AMDVI_FR_DTE_RTR_ERR;
+    }
+
+    if (!(dte[0] & AMDVI_DEV_VALID)) {
+        /* DTE[V] not set, address is passed untranslated for devid */
+        return -AMDVI_FR_DTE_V;
+    }
+
+    if (!(dte[0] & AMDVI_DEV_TRANSLATION_VALID)) {
+        /* DTE[TV] not set, host page table not valid for devid */
+        return -AMDVI_FR_DTE_TV;
+    }
+
+    return 0;
+}
+
+/*
+ * For a PTE encoding a large page, return the page size it encodes as
+ * described by the AMD IOMMU Specification Table 14: Example Page Size
+ * Encodings.
+ * No need to adjust the value of the PTE to point to the first PTE in the
+ * large page since the encoding guarantees all "base" PTEs in the large
+ * page are the same.
+ */
+static uint64_t large_pte_page_size(uint64_t pte)
+{
+    assert(PTE_NEXT_LEVEL(pte) == 7);
+
+    /* Determine size of the large/contiguous page encoded in the PTE */
+    return PTE_LARGE_PAGE_SIZE(pte);
+}
+
+/*
+ * Helper function to fetch a PTE using AMD v1 pgtable format.
+ * On successful page walk, returns 0 and pte parameter points to a valid PTE.
+ * On failure, returns:
+ * -AMDVI_FR_PT_ROOT_INV: A page walk is not possible due to conditions like a
+ *  DTE with invalid permissions, a Page Table Root that can not be read from
+ *  the DTE, or a larger IOVA than supported by the page table level encoded
+ *  in DTE[Mode].
+ * -AMDVI_FR_PT_ENTRY_INV: A PTE could not be read from guest memory during a
+ *  page table walk. This means that the DTE has valid data, but one of the
+ *  lower level entries in the Page Table could not be read.
+ */
+static uint64_t fetch_pte(AMDVIAddressSpace *as, hwaddr address, uint64_t dte,
+                          uint64_t *pte, hwaddr *page_size)
+{
+    IOMMUAccessFlags perms = amdvi_get_perms(dte);
+
+    uint8_t level, mode;
+    uint64_t pte_addr;
+
+    *pte = dte;
+    *page_size = 0;
+
+    if (perms == IOMMU_NONE) {
+        return -AMDVI_FR_PT_ROOT_INV;
+    }
+
+    /*
+     * The Linux kernel driver initializes the default mode to 3, corresponding
+     * to a 39-bit GPA space, where each entry in the pagetable translates to a
+     * 1GB (2^30) page size.
+     */
+    level = mode = get_pte_translation_mode(dte);
+    assert(mode > 0 && mode < 7);
+
+    /*
+     * If IOVA is larger than the max supported by the current pgtable level,
+     * there is nothing to do.
+     */
+    if (address > PT_LEVEL_MAX_ADDR(mode - 1)) {
+        /* IOVA too large for the current DTE */
+        return -AMDVI_FR_PT_ROOT_INV;
+    }
+
+    do {
+        level -= 1;
+
+        /* Update the page_size */
+        *page_size = PTE_LEVEL_PAGE_SIZE(level);
+
+        /* Permission bits are ANDed at every level, including the DTE */
+        perms &= amdvi_get_perms(*pte);
+        if (perms == IOMMU_NONE) {
+            return 0;
+        }
+
+        /* Not Present */
+        if (!IOMMU_PTE_PRESENT(*pte)) {
+            return 0;
+        }
+
+        /* Large or Leaf PTE found */
+        if (PTE_NEXT_LEVEL(*pte) == 7 || PTE_NEXT_LEVEL(*pte) == 0) {
+            /* Leaf PTE found */
+            break;
+        }
+
+        /*
+         * Index the pgtable using the IOVA bits corresponding to the current
+         * level and walk down to the lower level.
+         */
+        pte_addr = NEXT_PTE_ADDR(*pte, level, address);
+        *pte = amdvi_get_pte_entry(as->iommu_state, pte_addr, as->devfn);
+
+        if (*pte == (uint64_t)-1) {
+            /*
+             * A returned PTE of -1 indicates a failure to read the page table
+             * entry from guest memory.
+             */
+            if (level == mode - 1) {
+                /* Failure to retrieve the Page Table from Root Pointer */
+                *page_size = 0;
+                return -AMDVI_FR_PT_ROOT_INV;
+            } else {
+                /* Failure to read PTE. Page walk skips a page_size chunk */
+                return -AMDVI_FR_PT_ENTRY_INV;
+            }
+        }
+    } while (level > 0);
+
+    assert(PTE_NEXT_LEVEL(*pte) == 0 || PTE_NEXT_LEVEL(*pte) == 7 ||
+           level == 0);
+
+    /*
+     * Page walk ends when Next Level field on PTE shows that either a leaf PTE
+     * or a series of large PTEs have been reached. In the latter case, even if
+     * the range starts in the middle of a contiguous page, the returned PTE
+     * must be the first PTE of the series.
+     */
+    if (PTE_NEXT_LEVEL(*pte) == 7) {
+        /* Update page_size with the large PTE page size */
+        *page_size = large_pte_page_size(*pte);
+    }
+
+    return 0;
+}
+
+/*
+ * Invoke notifiers registered for the address space. Update record of mapped
+ * ranges in IOVA Tree.
+ */
+static void amdvi_notify_iommu(AMDVIAddressSpace *as, IOMMUTLBEvent *event)
+{
+    IOMMUTLBEntry *entry = &event->entry;
+
+    DMAMap target = {
+        .iova = entry->iova,
+        .size = entry->addr_mask,
+        .translated_addr = entry->translated_addr,
+        .perm = entry->perm,
+    };
+
+    /*
+     * Search the IOVA Tree for an existing translation for the target, and
+     * skip the notification if the mapping is already recorded.
+     * When the guest uses large pages, comparing against the record makes it
+     * possible to determine the size of the original MAP and adjust the UNMAP
+     * request to match it. This avoids failed checks against the mappings kept
+     * by the VFIO kernel driver.
+     */
+    const DMAMap *mapped = iova_tree_find(as->iova_tree, &target);
+
+    if (event->type == IOMMU_NOTIFIER_UNMAP) {
+        if (!mapped) {
+            /* No record exists of this mapping, nothing to do */
+            return;
+        }
+        /*
+         * Adjust the size based on the original record. This is essential to
+         * determine when large/contiguous pages are used, since the guest has
+         * already cleared the PTE (erasing the pagesize encoded on it) before
+         * issuing the invalidation command.
+         */
+        if (mapped->size != target.size) {
+            assert(mapped->size > target.size);
+            target.size = mapped->size;
+            /* Adjust event to invoke notifier with correct range */
+            entry->addr_mask = mapped->size;
+        }
+        iova_tree_remove(as->iova_tree, target);
+    } else { /* IOMMU_NOTIFIER_MAP */
+        if (mapped) {
+            /*
+             * If a mapping is present and matches the request, skip the
+             * notification.
+             */
+            if (!memcmp(mapped, &target, sizeof(DMAMap))) {
+                return;
+            } else {
+                /*
+                 * This should never happen unless a buggy guest OS omits or
+                 * sends incorrect invalidation(s). Report an error in the
+                 * event it does happen.
+                 */
+                error_report("Found conflicting translation. This could be due "
+                             "to an incorrect or missing invalidation command");
+            }
+        }
+        /* Record the new mapping */
+        iova_tree_insert(as->iova_tree, &target);
+    }
+
+    /* Invoke the notifiers registered for this address space */
+    memory_region_notify_iommu(&as->iommu, 0, *event);
+}
+
+/*
+ * Walk the guest page table for an IOVA and range and signal the registered
+ * notifiers to sync the shadow page tables in the host.
+ * Must be called with a valid DTE for DMA remapping i.e. V=1,TV=1
+ */
+static void amdvi_sync_shadow_page_table_range(AMDVIAddressSpace *as,
+                                               uint64_t *dte, hwaddr addr,
+                                               uint64_t size, bool send_unmap)
+{
+    IOMMUTLBEvent event;
+
+    hwaddr page_mask, pagesize;
+    hwaddr iova = addr;
+    hwaddr end = iova + size - 1;
+
+    uint64_t pte;
+    int ret;
+
+    while (iova < end) {
+
+        ret = fetch_pte(as, iova, dte[0], &pte, &pagesize);
+
+        if (ret == -AMDVI_FR_PT_ROOT_INV) {
+            /*
+             * Invalid conditions such as the IOVA being larger than supported
+             * by current page table mode as configured in the DTE, or a
+             * failure to fetch the Page Table from the Page Table Root
+             * Pointer in DTE.
+             */
+            assert(pagesize == 0);
+            return;
+        }
+        /* PTE has been validated for major errors and pagesize is set */
+        assert(pagesize);
+        page_mask = ~(pagesize - 1);
+
+        if (ret == -AMDVI_FR_PT_ENTRY_INV) {
+            /*
+             * Failure to read PTE from memory, the pagesize matches the current
+             * level. Unable to determine the region type, so a safe strategy
+             * is to skip the range and continue the page walk.
+             */
+            goto next;
+        }
+
+        event.entry.target_as = &address_space_memory;
+        event.entry.iova = iova & page_mask;
+        /* translated_addr is irrelevant for the unmap case */
+        event.entry.translated_addr = (pte & AMDVI_DEV_PT_ROOT_MASK) &
+                                      page_mask;
+        event.entry.addr_mask = ~page_mask;
+        event.entry.perm = amdvi_get_perms(pte);
+
+        /*
+         * In cases where the leaf PTE is not found, or it has invalid
+         * permissions, an UNMAP type notification is sent, but only if the
+         * caller requested it.
+         */
+        if (!IOMMU_PTE_PRESENT(pte) || (event.entry.perm == IOMMU_NONE)) {
+            if (!send_unmap) {
+                goto next;
+            }
+            event.type = IOMMU_NOTIFIER_UNMAP;
+        } else {
+            event.type = IOMMU_NOTIFIER_MAP;
+        }
+
+        /*
+         * The following call might need to adjust event.entry.size in cases
+         * where the guest unmapped a series of large pages.
+         */
+        amdvi_notify_iommu(as, &event);
+
+        /*
+         * In the special scenario where the guest is unmapping a large page,
+         * addr_mask has been adjusted before sending the notification. Update
+         * pagesize accordingly in order to correctly compute the next IOVA.
+         */
+        pagesize = event.entry.addr_mask + 1;
+
+next:
+        iova &= ~(pagesize - 1);
+
+        /* Check for 64-bit overflow and terminate walk in such cases */
+        if ((iova + pagesize) < iova) {
+            break;
+        } else {
+            iova += pagesize;
+        }
+    }
+}
+
+/*
+ * Unmap entire range that the notifier registered for i.e. the full AS.
+ *
+ * This is seemingly technically equivalent to directly calling
+ * memory_region_unmap_iommu_notifier_range(), but it allows to check for
+ * notifier boundaries and issue notifications with ranges within those
+ * bounds.
+ */
+static void amdvi_address_space_unmap(AMDVIAddressSpace *as, IOMMUNotifier *n)
+{
+    hwaddr start = n->start;
+    hwaddr end = n->end;
+    hwaddr remain;
+    DMAMap map;
+
+    assert(start <= end);
+    remain = end - start + 1;
+
+    /*
+     * Divide the notifier range into chunks that are aligned and do not
+     * exceed the notifier boundaries.
+     */
+    while (remain >= AMDVI_PAGE_SIZE) {
+
+        IOMMUTLBEvent event;
+
+        uint64_t mask = dma_aligned_pow2_mask(start, end, 64);
+
+        event.type = IOMMU_NOTIFIER_UNMAP;
+
+        IOMMUTLBEntry entry = {
+            .target_as = &address_space_memory,
+            .iova = start,
+            .translated_addr = 0,    /* irrelevant for unmap case */
+            .addr_mask = mask,
+            .perm = IOMMU_NONE,
+        };
+        event.entry = entry;
+
+        /* Call notifier registered for updates on this address space */
+        memory_region_notify_iommu_one(n, &event);
+
+        start += mask + 1;
+        remain -= mask + 1;
+    }
+
+    assert(!remain);
+
+    map.iova = n->start;
+    map.size = n->end - n->start;
+
+    iova_tree_remove(as->iova_tree, map);
+}
+
+/*
+ * For all the address spaces with notifiers registered, unmap the entire
+ * range the notifier registered for i.e. clear all the address spaces
+ * managed by the IOMMU.
+ */
+static void amdvi_address_space_unmap_all(AMDVIState *s)
+{
+    AMDVIAddressSpace *as;
+    IOMMUNotifier *n;
+
+    QLIST_FOREACH(as, &s->amdvi_as_with_notifiers, next) {
+        IOMMU_NOTIFIER_FOREACH(n, &as->iommu) {
+            amdvi_address_space_unmap(as, n);
+        }
+    }
+}
+
+/*
+ * For every translation present in the IOMMU, construct IOMMUTLBEntry data
+ * and pass it as parameter to notifier callback.
+ */
+static void amdvi_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n)
+{
+    AMDVIAddressSpace *as = container_of(iommu_mr, AMDVIAddressSpace, iommu);
+    uint64_t dte[4] = { 0 };
+
+    if (!(n->notifier_flags & IOMMU_NOTIFIER_MAP)) {
+        return;
+    }
+
+    if (amdvi_as_to_dte(as, dte)) {
+        return;
+    }
+
+    /* Dropping all mappings for the address space. Also clears the IOVA tree */
+    amdvi_address_space_unmap(as, n);
+
+    amdvi_sync_shadow_page_table_range(as, &dte[0], 0, UINT64_MAX, false);
+}
+
+static void amdvi_address_space_sync(AMDVIAddressSpace *as)
+{
+    IOMMUNotifier *n;
+    uint64_t dte[4] = { 0 };
+
+    /* If only UNMAP notifiers are registered, drop all existing mappings */
+    if (!(as->notifier_flags & IOMMU_NOTIFIER_MAP)) {
+        IOMMU_NOTIFIER_FOREACH(n, &as->iommu) {
+            /*
+             * Directly calling memory_region_unmap_iommu_notifier_range() does
+             * not guarantee that the addr_mask eventually passed as parameter
+             * to the notifier is valid. Use amdvi_address_space_unmap() which
+             * ensures the notifier range is divided into properly aligned
+             * regions, and issues notifications for each one.
+             */
+            amdvi_address_space_unmap(as, n);
+        }
+        return;
+    }
+
+    if (amdvi_as_to_dte(as, dte)) {
+        return;
+    }
+
+    amdvi_sync_shadow_page_table_range(as, &dte[0], 0, UINT64_MAX, true);
+}
+
+/*
+ * This differs from the replay() method in that it issues both MAP and UNMAP
+ * notifications since it is called after global invalidation events in order
+ * to re-sync all address spaces.
+ */
+static void amdvi_iommu_address_space_sync_all(AMDVIState *s)
+{
+    AMDVIAddressSpace *as;
+
+    QLIST_FOREACH(as, &s->amdvi_as_with_notifiers, next) {
+        amdvi_address_space_sync(as);
+    }
+}
+
+/*
+ * Toggle between address translation and passthrough modes by enabling the
+ * corresponding memory regions.
+ */
+static void amdvi_switch_address_space(AMDVIAddressSpace *amdvi_as)
+{
+    AMDVIState *s = amdvi_as->iommu_state;
+
+    if (s->dma_remap && amdvi_as->addr_translation) {
+        /* Enabling DMA region */
+        memory_region_set_enabled(&amdvi_as->iommu_nodma, false);
+        memory_region_set_enabled(MEMORY_REGION(&amdvi_as->iommu), true);
+    } else {
+        /* Disabling DMA region, using passthrough */
+        memory_region_set_enabled(MEMORY_REGION(&amdvi_as->iommu), false);
+        memory_region_set_enabled(&amdvi_as->iommu_nodma, true);
+    }
+}
+
+/*
+ * For all existing address spaces managed by the IOMMU, enable/disable the
+ * corresponding memory regions to reset the address translation mode and
+ * use passthrough by default.
+ */
+static void amdvi_reset_address_translation_all(AMDVIState *s)
+{
+    AMDVIAddressSpace **iommu_as;
+
+    for (int bus_num = 0; bus_num < PCI_BUS_MAX; bus_num++) {
+
+        /* Nothing to do if there are no devices on the current bus */
+        if (!s->address_spaces[bus_num]) {
+            continue;
+        }
+        iommu_as = s->address_spaces[bus_num];
+
+        for (int devfn = 0; devfn < PCI_DEVFN_MAX; devfn++) {
+
+            if (!iommu_as[devfn]) {
+                continue;
+            }
+            /* Use passthrough as default mode after reset */
+            iommu_as[devfn]->addr_translation = false;
+            amdvi_switch_address_space(iommu_as[devfn]);
+        }
+    }
+}
+
+static void enable_dma_mode(AMDVIAddressSpace *as, bool inval_current)
+{
+    /*
+     * When enabling DMA mode for the purpose of isolating guest devices on
+     * a failure to retrieve or invalid DTE, all existing mappings must be
+     * dropped.
+     */
+    if (inval_current) {
+        IOMMUNotifier *n;
+        IOMMU_NOTIFIER_FOREACH(n, &as->iommu) {
+            amdvi_address_space_unmap(as, n);
+        }
+    }
+
+    if (as->addr_translation) {
+        return;
+    }
+
+    /* Installing DTE enabling translation, activate region */
+    as->addr_translation = true;
+    amdvi_switch_address_space(as);
+    /* Sync shadow page tables */
+    amdvi_address_space_sync(as);
+}
+
+/*
+ * If paging was previously in use in the address space
+ * - invalidate all existing mappings
+ * - switch to no_dma memory region
+ */
+static void enable_nodma_mode(AMDVIAddressSpace *as)
+{
+    IOMMUNotifier *n;
+
+    if (!as->addr_translation) {
+        /* passthrough is already active, nothing to do */
+        return;
+    }
+
+    as->addr_translation = false;
+    IOMMU_NOTIFIER_FOREACH(n, &as->iommu) {
+        /* Drop all mappings for the address space */
+        amdvi_address_space_unmap(as, n);
+    }
+    amdvi_switch_address_space(as);
+}
+
+/*
+ * A guest driver must issue the INVALIDATE_DEVTAB_ENTRY command to the IOMMU
+ * after changing a Device Table entry. We can use this fact to detect when a
+ * Device Table entry is created for a device attached to a paging domain and
+ * enable the corresponding IOMMU memory region to allow for DMA translation
+ * if appropriate.
+ */
+static void amdvi_update_addr_translation_mode(AMDVIState *s, uint16_t devid)
+{
+    uint8_t bus_num, devfn, dte_mode;
+    AMDVIAddressSpace *as;
+    uint64_t dte[4] = { 0 };
+    int ret;
+
+    /*
+     * Convert the devid encoded in the command to a bus and devfn in
+     * order to retrieve the corresponding address space.
+     */
+    bus_num = PCI_BUS_NUM(devid);
+    devfn = devid & 0xff;
+
+    /*
+     * The main buffer of size (AMDVIAddressSpace *) * (PCI_BUS_MAX) has
+     * already been allocated within AMDVIState, but must be careful to not
+     * access unallocated devfn.
+     */
+    if (!s->address_spaces[bus_num] || !s->address_spaces[bus_num][devfn]) {
+        return;
+    }
+    as = s->address_spaces[bus_num][devfn];
+
+    ret = amdvi_as_to_dte(as, dte);
+
+    if (!ret) {
+        dte_mode = (dte[0] >> AMDVI_DEV_MODE_RSHIFT) & AMDVI_DEV_MODE_MASK;
+    }
+
+    switch (ret) {
+    case 0:
+        /* DTE was successfully retrieved */
+        if (!dte_mode) {
+            enable_nodma_mode(as); /* DTE[V]=1 && DTE[Mode]=0 => passthrough */
+        } else {
+            enable_dma_mode(as, false); /* Enable DMA translation */
+        }
+        break;
+    case -AMDVI_FR_DTE_V:
+        /* DTE[V]=0, address is passed untranslated */
+        enable_nodma_mode(as);
+        break;
+    case -AMDVI_FR_DTE_RTR_ERR:
+    case -AMDVI_FR_DTE_TV:
+        /*
+         * Enforce isolation by using DMA in rare scenarios where the DTE
+         * cannot be retrieved or DTE[TV]=0. Existing mappings are dropped.
+         */
+        enable_dma_mode(as, true);
+        break;
+    }
+}
+
 /* log error without aborting since linux seems to be using reserved bits */
 static void amdvi_inval_devtab_entry(AMDVIState *s, uint64_t *cmd)
 {
     uint16_t devid = cpu_to_le16((uint16_t)extract64(cmd[0], 0, 16));
 
+    trace_amdvi_devtab_inval(PCI_BUS_NUM(devid), PCI_SLOT(devid),
+                             PCI_FUNC(devid));
+
     /* This command should invalidate internal caches of which there isn't */
     if (extract64(cmd[0], 16, 44) || cmd[1]) {
         amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
                                    s->cmdbuf + s->cmdbuf_head);
+        return;
+    }
+
+    /*
+     * When DMA remapping capability is enabled, check if updated DTE is setup
+     * for paging or not, and configure the corresponding memory regions.
+     */
+    if (s->dma_remap) {
+        amdvi_update_addr_translation_mode(s, devid);
     }
-    trace_amdvi_devtab_inval(PCI_BUS_NUM(devid), PCI_SLOT(devid),
-                             PCI_FUNC(devid));
 }
 
 static void amdvi_complete_ppr(AMDVIState *s, uint64_t *cmd)
@@ -480,6 +1193,13 @@ static void amdvi_inval_all(AMDVIState *s, uint64_t *cmd)
     amdvi_intremap_inval_notify_all(s, true, 0, 0);
 
     amdvi_iotlb_reset(s);
+
+    /*
+     * Fully replay the address space i.e. send both UNMAP and MAP events in
+     * order to synchronize guest and host IO page tables.
+     */
+    amdvi_iommu_address_space_sync_all(s);
+
     trace_amdvi_all_inval();
 }
 
@@ -491,10 +1211,109 @@ static gboolean amdvi_iotlb_remove_by_domid(gpointer key, gpointer value,
     return entry->domid == domid;
 }
 
+/*
+ * Helper to decode the size of the range to invalidate encoded in the
+ * INVALIDATE_IOMMU_PAGES Command format.
+ * The size of the region to invalidate depends on the S bit and address.
+ * S bit value:
+ * 0 : Invalidation size is 4 Kbytes.
+ * 1 : Invalidation size is determined by first zero bit in the address
+ *     starting from Address[12].
+ *
+ * In the AMD IOMMU Linux driver, an invalidation command with address
+ * ((1 << 63) - 1) is sent when intending to clear the entire cache.
+ * However, Table 14: Example Page Size Encodings shows that an address of
+ * ((1ULL << 51) - 1) encodes the entire cache, so effectively any address
+ * with first zero at bit 51 or larger is a request to invalidate the entire
+ * address space.
+ */
+static uint64_t amdvi_decode_invalidation_size(hwaddr addr, uint16_t flags)
+{
+    uint64_t size = AMDVI_PAGE_SIZE;
+    uint8_t fzbit = 0;
+
+    if (flags & AMDVI_CMD_INVAL_IOMMU_PAGES_S) {
+        fzbit = cto64(addr | 0xFFF);
+
+        if (fzbit >= 51) {
+            size = AMDVI_INV_ALL_PAGES;
+        } else {
+            size = 1ULL << (fzbit + 1);
+        }
+    }
+    return size;
+}
+
+/*
+ * Synchronize the guest page tables with the shadow page tables kept in the
+ * host for the specified range.
+ * The invalidation command issued by the guest and intercepted by the VMM
+ * does not specify a device, but a domain, since all devices in the same
+ * domain share the same page tables. However, vIOMMU emulation creates
+ * separate address spaces per device, so it is necessary to traverse the
+ * list of all address spaces (i.e. devices) that have notifiers registered
+ * in order to propagate the changes to the host page tables.
+ * We cannot return early from this function once a matching domain has been
+ * identified and its page tables synced (based on the fact that all devices
+ * in the same domain share the page tables). The reason is that different
+ * devices (i.e. address spaces) could have different notifiers registered,
+ * and by skipping address spaces that appear later on the
+ * amdvi_as_with_notifiers list their notifiers (which could differ from the
+ * ones registered for the first device/address space) would not be invoked.
+ */
+static void amdvi_sync_domain(AMDVIState *s, uint16_t domid, uint64_t addr,
+                              uint16_t flags)
+{
+    AMDVIAddressSpace *as;
+
+    uint64_t size = amdvi_decode_invalidation_size(addr, flags);
+
+    if (size == AMDVI_INV_ALL_PAGES) {
+        addr = 0;    /* Set start address to 0 and invalidate entire AS */
+    } else {
+        addr &= ~(size - 1);
+    }
+
+    /*
+     * Call notifiers that have registered for each address space matching the
+     * domain ID, in order to sync the guest pagetable state with the host.
+     */
+    QLIST_FOREACH(as, &s->amdvi_as_with_notifiers, next) {
+
+        uint64_t dte[4] = { 0 };
+
+        /*
+         * Retrieve the Device Table entry for the devid corresponding to the
+         * current address space, and verify the DomainID matches i.e. the
+         * page tables to be synced belong to devices in the domain.
+         */
+        if (amdvi_as_to_dte(as, dte)) {
+            continue;
+        }
+
+        /* Only need to sync the Page Tables for a matching domain */
+        if (domid != (dte[1] & AMDVI_DEV_DOMID_ID_MASK)) {
+            continue;
+        }
+
+        /*
+         * We have determined that there is a valid Device Table Entry for a
+         * device matching the DomainID in the INV_IOMMU_PAGES command issued
+         * by the guest. Walk the guest page table to sync shadow page table.
+         */
+        if (as->notifier_flags & IOMMU_NOTIFIER_MAP) {
+            /* Sync guest IOMMU mappings with host */
+            amdvi_sync_shadow_page_table_range(as, &dte[0], addr, size, true);
+        }
+    }
+}
+
 /* we don't have devid - we can't remove pages by address */
 static void amdvi_inval_pages(AMDVIState *s, uint64_t *cmd)
 {
     uint16_t domid = cpu_to_le16((uint16_t)extract64(cmd[0], 32, 16));
+    uint64_t addr = cpu_to_le64(extract64(cmd[1], 12, 52)) << 12;
+    uint16_t flags = cpu_to_le16((uint16_t)extract64(cmd[1], 0, 3));
 
     if (extract64(cmd[0], 20, 12) || extract64(cmd[0], 48, 12) ||
         extract64(cmd[1], 3, 9)) {
@@ -504,6 +1323,8 @@ static void amdvi_inval_pages(AMDVIState *s, uint64_t *cmd)
 
     g_hash_table_foreach_remove(s->iotlb, amdvi_iotlb_remove_by_domid,
                                 &domid);
+
+    amdvi_sync_domain(s, domid, addr, flags);
     trace_amdvi_pages_inval(domid);
 }
 
@@ -894,150 +1715,68 @@ static void amdvi_mmio_write(void *opaque, hwaddr addr, uint64_t val,
     }
 }
 
-static inline uint64_t amdvi_get_perms(uint64_t entry)
-{
-    return (entry & (AMDVI_DEV_PERM_READ | AMDVI_DEV_PERM_WRITE)) >>
-           AMDVI_DEV_PERM_SHIFT;
-}
-
-/* validate that reserved bits are honoured */
-static bool amdvi_validate_dte(AMDVIState *s, uint16_t devid,
-                               uint64_t *dte)
-{
-    if ((dte[0] & AMDVI_DTE_QUAD0_RESERVED) ||
-        (dte[1] & AMDVI_DTE_QUAD1_RESERVED) ||
-        (dte[2] & AMDVI_DTE_QUAD2_RESERVED) ||
-        (dte[3] & AMDVI_DTE_QUAD3_RESERVED)) {
-        amdvi_log_illegaldevtab_error(s, devid,
-                                      s->devtab +
-                                      devid * AMDVI_DEVTAB_ENTRY_SIZE, 0);
-        return false;
-    }
-
-    return true;
-}
-
-/* get a device table entry given the devid */
-static bool amdvi_get_dte(AMDVIState *s, int devid, uint64_t *entry)
+static void amdvi_page_walk(AMDVIAddressSpace *as, uint64_t *dte,
+                            IOMMUTLBEntry *ret, unsigned perms,
+                            hwaddr addr)
 {
-    uint32_t offset = devid * AMDVI_DEVTAB_ENTRY_SIZE;
+    hwaddr page_mask, pagesize = 0;
+    uint8_t mode;
+    uint64_t pte;
+    int fetch_ret;
 
-    if (dma_memory_read(&address_space_memory, s->devtab + offset, entry,
-                        AMDVI_DEVTAB_ENTRY_SIZE, MEMTXATTRS_UNSPECIFIED)) {
-        trace_amdvi_dte_get_fail(s->devtab, offset);
-        /* log error accessing dte */
-        amdvi_log_devtab_error(s, devid, s->devtab + offset, 0);
-        return false;
+    /* make sure the DTE has TV = 1 */
+    if (!(dte[0] & AMDVI_DEV_TRANSLATION_VALID)) {
+        /*
+         * A DTE with V=1, TV=0 does not have a valid Page Table Root Pointer.
+         * An IOMMU processing a request that requires a table walk terminates
+         * the walk when it encounters this condition. Do the same and return
+         * instead of assuming that the address is forwarded without
+         * translation i.e. the passthrough case, as it is done for the case
+         * where DTE[V]=0.
+ */
+        return;
     }

-    *entry = le64_to_cpu(*entry);
-    if (!amdvi_validate_dte(s, devid, entry)) {
-        trace_amdvi_invalid_dte(entry[0]);
-        return false;
+    mode = get_pte_translation_mode(dte[0]);
+    if (mode >= 7) {
+        trace_amdvi_mode_invalid(mode, addr);
+        return;
     }

-    return true;
-}
-
-/* get pte translation mode */
-static inline uint8_t get_pte_translation_mode(uint64_t pte)
-{
-    return (pte >> AMDVI_DEV_MODE_RSHIFT) & AMDVI_DEV_MODE_MASK;
-}
-
-static inline uint64_t pte_override_page_mask(uint64_t pte)
-{
-    uint8_t page_mask = 13;
-    uint64_t addr = (pte & AMDVI_DEV_PT_ROOT_MASK) >> 12;
-    /* find the first zero bit */
-    while (addr & 1) {
-        page_mask++;
-        addr = addr >> 1;
+    if (mode == 0) {
+        goto no_remap;
     }
-    return ~((1ULL << page_mask) - 1);
-}
-
-static inline uint64_t pte_get_page_mask(uint64_t oldlevel)
-{
-    return ~((1UL << ((oldlevel * 9) + 3)) - 1);
-}
+    /* Attempt to fetch the PTE to determine if a valid mapping exists */
+    fetch_ret = fetch_pte(as, addr, dte[0], &pte, &pagesize);

-static inline uint64_t amdvi_get_pte_entry(AMDVIState *s, uint64_t pte_addr,
-                                           uint16_t devid)
-{
-    uint64_t pte;
+    /*
+     * If walking the page table returns an error of any type, yields an
+     * empty PTE (i.e. no mapping), or the retrieved permissions do not match
+     * the request, return: there is no translation available.
+     */
+    if (fetch_ret < 0 || !IOMMU_PTE_PRESENT(pte) ||
+        perms != (perms & amdvi_get_perms(pte))) {

-    if (dma_memory_read(&address_space_memory, pte_addr,
-                        &pte, sizeof(pte), MEMTXATTRS_UNSPECIFIED)) {
-        trace_amdvi_get_pte_hwerror(pte_addr);
-        amdvi_log_pagetab_error(s, devid, pte_addr, 0);
-        pte = 0;
-        return pte;
+        amdvi_page_fault(as->iommu_state, as->devfn, addr, perms);
+        trace_amdvi_page_fault(addr);
+        return;
     }
-    pte = le64_to_cpu(pte);
-    return pte;
-}
-
-static void amdvi_page_walk(AMDVIAddressSpace *as, uint64_t *dte,
-                            IOMMUTLBEntry *ret, unsigned perms,
-                            hwaddr addr)
-{
-    unsigned level, present, pte_perms, oldlevel;
-    uint64_t pte = dte[0], pte_addr, page_mask;
-
-    /* make sure the DTE has TV = 1 */
-    if (pte & AMDVI_DEV_TRANSLATION_VALID) {
-        level = get_pte_translation_mode(pte);
-        if (level >= 7) {
-            trace_amdvi_mode_invalid(level, addr);
-            return;
-        }
-        if (level == 0) {
-            goto no_remap;
-        }
-
-        /* we are at the leaf page table or page table encodes a huge page */
-        do {
-            pte_perms = amdvi_get_perms(pte);
-            present = pte & 1;
-            if (!present || perms != (perms & pte_perms)) {
-                amdvi_page_fault(as->iommu_state, as->devfn, addr, perms);
-                trace_amdvi_page_fault(addr);
-                return;
-            }
-
-            /* go to the next lower level */
-            pte_addr = pte & AMDVI_DEV_PT_ROOT_MASK;
-            /* add offset and load pte */
-            pte_addr += ((addr >> (3 + 9 * level)) & 0x1FF) << 3;
-            pte = amdvi_get_pte_entry(as->iommu_state, pte_addr, as->devfn);
-            if (!pte) {
-                return;
-            }
-            oldlevel = level;
-            level = get_pte_translation_mode(pte);
-        } while (level > 0 && level < 7);
+    /* A valid PTE and page size have been retrieved */
+    assert(pagesize);
+    page_mask = ~(pagesize - 1);

-        if (level == 0x7) {
-            page_mask = pte_override_page_mask(pte);
-        } else {
-            page_mask = pte_get_page_mask(oldlevel);
-        }
+    /* get access permissions from pte */
+    ret->iova = addr & page_mask;
+    ret->translated_addr = (pte & AMDVI_DEV_PT_ROOT_MASK) & page_mask;
+    ret->addr_mask = ~page_mask;
+    ret->perm = amdvi_get_perms(pte);
+    return;

-        /* get access permissions from pte */
-        ret->iova = addr & page_mask;
-        ret->translated_addr = (pte & AMDVI_DEV_PT_ROOT_MASK) & page_mask;
-        ret->addr_mask = ~page_mask;
-        ret->perm = amdvi_get_perms(pte);
-
return; - } no_remap: ret->iova = addr & AMDVI_PAGE_MASK_4K; ret->translated_addr = addr & AMDVI_PAGE_MASK_4K; ret->addr_mask = ~AMDVI_PAGE_MASK_4K; - ret->perm = amdvi_get_perms(pte); + ret->perm = amdvi_get_perms(dte[0]); } static void amdvi_do_translate(AMDVIAddressSpace *as, hwaddr addr, @@ -1047,6 +1786,7 @@ static void amdvi_do_translate(AMDVIAddressSpace *as, hwaddr addr, uint16_t devid = PCI_BUILD_BDF(as->bus_num, as->devfn); AMDVIIOTLBEntry *iotlb_entry = amdvi_iotlb_lookup(s, addr, devid); uint64_t entry[4]; + int dte_ret; if (iotlb_entry) { trace_amdvi_iotlb_hit(PCI_BUS_NUM(devid), PCI_SLOT(devid), @@ -1058,13 +1798,14 @@ static void amdvi_do_translate(AMDVIAddressSpace *as, hwaddr addr, return; } - if (!amdvi_get_dte(s, devid, entry)) { - return; - } + dte_ret = amdvi_as_to_dte(as, entry); - /* devices with V = 0 are not translated */ - if (!(entry[0] & AMDVI_DEV_VALID)) { - goto out; + if (dte_ret < 0) { + if (dte_ret == -AMDVI_FR_DTE_V) { + /* DTE[V]=0, address is passed untranslated */ + goto out; + } + return; } amdvi_page_walk(as, entry, ret, @@ -1500,6 +2241,9 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) iommu_as[devfn]->bus_num = (uint8_t)bus_num; iommu_as[devfn]->devfn = (uint8_t)devfn; iommu_as[devfn]->iommu_state = s; + iommu_as[devfn]->notifier_flags = IOMMU_NOTIFIER_NONE; + iommu_as[devfn]->iova_tree = iova_tree_new(); + iommu_as[devfn]->addr_translation = false; amdvi_dev_as = iommu_as[devfn]; @@ -1542,8 +2286,7 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) AMDVI_INT_ADDR_FIRST, &amdvi_dev_as->iommu_ir, 1); - memory_region_set_enabled(&amdvi_dev_as->iommu_nodma, false); - memory_region_set_enabled(MEMORY_REGION(&amdvi_dev_as->iommu), true); + amdvi_switch_address_space(amdvi_dev_as); } return &iommu_as[devfn]->as; } @@ -1573,14 +2316,35 @@ static int amdvi_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu, Error **errp) { AMDVIAddressSpace *as = container_of(iommu, AMDVIAddressSpace, iommu); + AMDVIState *s = as->iommu_state; - if (new & IOMMU_NOTIFIER_MAP) { - error_setg(errp, - "device %02x.%02x.%x requires iommu notifier which is not " - "currently supported", as->bus_num, PCI_SLOT(as->devfn), - PCI_FUNC(as->devfn)); - return -EINVAL; + /* + * Accurate synchronization of the vIOMMU page tables required to support + * MAP notifiers is provided by the dma-remap feature. In addition, this + * also requires that the vIOMMU presents the NpCache capability, so a guest + * driver issues invalidations for both map() and unmap() operations. The + * capability is already set by default as part of AMDVI_CAPAB_FEATURES and + * written to the configuration in amdvi_pci_realize(). + */ + if (!s->dma_remap && (new & IOMMU_NOTIFIER_MAP)) { + error_setg_errno(errp, ENOTSUP, + "device %02x.%02x.%x requires dma-remap=1", + as->bus_num, PCI_SLOT(as->devfn), PCI_FUNC(as->devfn)); + return -ENOTSUP; } + + /* + * Update notifier flags for address space and the list of address spaces + * with registered notifiers. 
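+     *
+     * Only the NONE <-> non-NONE transitions touch the list: registering
+     * the first notifier (old == IOMMU_NOTIFIER_NONE) links the address
+     * space, dropping the last one (new == IOMMU_NOTIFIER_NONE) unlinks
+     * it; flag-only changes just update notifier_flags.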
+ */ + as->notifier_flags = new; + + if (old == IOMMU_NOTIFIER_NONE) { + QLIST_INSERT_HEAD(&s->amdvi_as_with_notifiers, as, next); + } else if (new == IOMMU_NOTIFIER_NONE) { + QLIST_REMOVE(as, next); + } + return 0; } @@ -1657,6 +2421,10 @@ static void amdvi_sysbus_reset(DeviceState *dev) msi_reset(&s->pci->dev); amdvi_init(s); + + /* Discard all mappings on device reset */ + amdvi_address_space_unmap_all(s); + amdvi_reset_address_translation_all(s); } static const VMStateDescription vmstate_amdvi_sysbus_migratable = { @@ -1787,6 +2555,7 @@ static void amdvi_sysbus_realize(DeviceState *dev, Error **errp) static const Property amdvi_properties[] = { DEFINE_PROP_BOOL("xtsup", AMDVIState, xtsup, false), DEFINE_PROP_STRING("pci-id", AMDVIState, pci_id), + DEFINE_PROP_BOOL("dma-remap", AMDVIState, dma_remap, false), }; static const VMStateDescription vmstate_amdvi_sysbus = { @@ -1848,6 +2617,7 @@ static void amdvi_iommu_memory_region_class_init(ObjectClass *klass, imrc->translate = amdvi_translate; imrc->notify_flag_changed = amdvi_iommu_notify_flag_changed; + imrc->replay = amdvi_iommu_replay; } static const TypeInfo amdvi_iommu_memory_region_info = { diff --git a/hw/i386/amd_iommu.h b/hw/i386/amd_iommu.h index 2476296c49..daf82fc85f 100644 --- a/hw/i386/amd_iommu.h +++ b/hw/i386/amd_iommu.h @@ -126,6 +126,10 @@ #define AMDVI_CMD_COMPLETE_PPR_REQUEST 0x07 #define AMDVI_CMD_INVAL_AMDVI_ALL 0x08 + +#define AMDVI_CMD_INVAL_IOMMU_PAGES_S (1ULL << 0) +#define AMDVI_INV_ALL_PAGES (1ULL << 52) + #define AMDVI_DEVTAB_ENTRY_SIZE 32 /* Device table entry bits 0:63 */ @@ -173,6 +177,47 @@ /* AMDVI paging mode */ #define AMDVI_GATS_MODE (2ULL << 12) #define AMDVI_HATS_MODE (2ULL << 10) +#define AMDVI_HATS_MODE_RESERVED (3ULL << 10) + +/* Page Table format */ + +#define AMDVI_PTE_PR (1ULL << 0) +#define AMDVI_PTE_NEXT_LEVEL_MASK GENMASK64(11, 9) + +#define IOMMU_PTE_PRESENT(pte) ((pte) & AMDVI_PTE_PR) + +/* Using level=0 for leaf PTE at 4K page size */ +#define PT_LEVEL_SHIFT(level) (12 + ((level) * 9)) + +/* Return IOVA bit group used to index the Page Table at specific level */ +#define PT_LEVEL_INDEX(level, iova) (((iova) >> PT_LEVEL_SHIFT(level)) & \ + GENMASK64(8, 0)) + +/* Return the max address for a specified level i.e. max_oaddr */ +#define PT_LEVEL_MAX_ADDR(x) (((x) < 5) ? \ + ((1ULL << PT_LEVEL_SHIFT((x + 1))) - 1) : \ + (~(0ULL))) + +/* Extract the NextLevel field from PTE/PDE */ +#define PTE_NEXT_LEVEL(pte) (((pte) & AMDVI_PTE_NEXT_LEVEL_MASK) >> 9) + +/* Take page table level and return default pagetable size for level */ +#define PTE_LEVEL_PAGE_SIZE(level) (1ULL << (PT_LEVEL_SHIFT(level))) + +/* + * Return address of lower level page table encoded in PTE and specified by + * current level and corresponding IOVA bit group at such level. + */ +#define NEXT_PTE_ADDR(pte, level, iova) (((pte) & AMDVI_DEV_PT_ROOT_MASK) + \ + (PT_LEVEL_INDEX(level, iova) * 8)) + +/* + * Take a PTE value with mode=0x07 and return the page size it encodes. 
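+ * Worked example: a mode-7 PTE whose address field has bits [19:12] set and
+ * bit 20 clear gives cto64(pte | 0xfff) == 20, so the macro below yields
+ * 1ULL << 21, i.e. a 2 MiB page. Similarly, PAGE_SIZE_PTE_COUNT() for 16K
+ * is 4 replicated PTEs, while for 2M it is 1, since a 2 MiB mapping is a
+ * single entry one level up.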
+ */ +#define PTE_LARGE_PAGE_SIZE(pte) (1ULL << (1 + cto64(((pte) | 0xfffULL)))) + +/* Return number of PTEs to use for a given page size (expected power of 2) */ +#define PAGE_SIZE_PTE_COUNT(pgsz) (1ULL << ((ctz64(pgsz) - 12) % 9)) /* IOTLB */ #define AMDVI_IOTLB_MAX_SIZE 1024 @@ -365,12 +410,18 @@ struct AMDVIState { /* for each served device */ AMDVIAddressSpace **address_spaces[PCI_BUS_MAX]; + /* list of address spaces with registered notifiers */ + QLIST_HEAD(, AMDVIAddressSpace) amdvi_as_with_notifiers; + /* IOTLB */ GHashTable *iotlb; /* Interrupt remapping */ bool ga_enabled; bool xtsup; + + /* DMA address translation */ + bool dma_remap; }; uint64_t amdvi_extended_feature_register(AMDVIState *s); diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 83c5e44413..6a168d5107 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -45,6 +45,8 @@ ((ce)->val[1] & VTD_SM_CONTEXT_ENTRY_RID2PASID_MASK) #define VTD_CE_GET_PASID_DIR_TABLE(ce) \ ((ce)->val[0] & VTD_PASID_DIR_BASE_ADDR_MASK) +#define VTD_CE_GET_PRE(ce) \ + ((ce)->val[0] & VTD_SM_CONTEXT_ENTRY_PRE) /* pe operations */ #define VTD_PE_GET_TYPE(pe) ((pe)->val[0] & VTD_SM_PASID_ENTRY_PGTT) @@ -85,13 +87,6 @@ struct vtd_iotlb_key { static void vtd_address_space_refresh_all(IntelIOMMUState *s); static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n); -static void vtd_panic_require_caching_mode(void) -{ - error_report("We need to set caching-mode=on for intel-iommu to enable " - "device assignment with IOMMU protection."); - exit(1); -} - static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val, uint64_t wmask, uint64_t w1cmask) { @@ -1838,6 +1833,7 @@ static const bool vtd_qualified_faults[] = { [VTD_FR_FS_NON_CANONICAL] = true, [VTD_FR_FS_PAGING_ENTRY_US] = true, [VTD_FR_SM_WRITE] = true, + [VTD_FR_SM_PRE_ABS] = true, [VTD_FR_SM_INTERRUPT_ADDR] = true, [VTD_FR_FS_BIT_UPDATE_FAILED] = true, [VTD_FR_MAX] = false, @@ -2701,7 +2697,7 @@ static void vtd_handle_gcmd_write(IntelIOMMUState *s) uint32_t changed = status ^ val; trace_vtd_reg_write_gcmd(status, val); - if ((changed & VTD_GCMD_TE) && s->dma_translation) { + if ((changed & VTD_GCMD_TE) && x86_iommu->dma_translation) { /* Translation enable/disable */ vtd_handle_gcmd_te(s, val & VTD_GCMD_TE); } @@ -2857,7 +2853,13 @@ static bool vtd_process_wait_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) vtd_generate_completion_event(s); } - if (!(inv_desc->lo & (VTD_INV_DESC_WAIT_IF | VTD_INV_DESC_WAIT_SW))) { + /* + * SW=0, IF=0, FN=1 is also a valid descriptor (VT-d 7.10) + * Nothing to do as we process the descriptors in order + */ + + if (!(inv_desc->lo & (VTD_INV_DESC_WAIT_IF | VTD_INV_DESC_WAIT_SW | + VTD_INV_DESC_WAIT_FN))) { error_report_once("%s: invalid wait desc: hi=%"PRIx64", lo=%"PRIx64 " (unknown type)", __func__, inv_desc->hi, inv_desc->lo); @@ -3146,6 +3148,59 @@ static bool vtd_process_device_piotlb_desc(IntelIOMMUState *s, return true; } +static bool vtd_process_page_group_response_desc(IntelIOMMUState *s, + VTDInvDesc *inv_desc) +{ + VTDAddressSpace *vtd_dev_as; + bool pasid_present; + uint8_t response_code; + uint16_t rid; + uint32_t pasid; + uint16_t prgi; + IOMMUPRIResponse response; + + if ((inv_desc->lo & VTD_INV_DESC_PGRESP_RSVD_LO) || + (inv_desc->hi & VTD_INV_DESC_PGRESP_RSVD_HI)) { + error_report_once("%s: invalid page group response desc: hi=%"PRIx64 + ", lo=%"PRIx64" (reserved nonzero)", __func__, + inv_desc->hi, inv_desc->lo); + return false; + } + + pasid_present = VTD_INV_DESC_PGRESP_PP(inv_desc->lo); + 
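   /*
+     * Decode the remaining response fields; the bit layout is given by the
+     * VTD_INV_DESC_PGRESP_* masks in intel_iommu_internal.h.
+     */
+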
    response_code = VTD_INV_DESC_PGRESP_RC(inv_desc->lo);
+    rid = VTD_INV_DESC_PGRESP_RID(inv_desc->lo);
+    pasid = VTD_INV_DESC_PGRESP_PASID(inv_desc->lo);
+    prgi = VTD_INV_DESC_PGRESP_PRGI(inv_desc->hi);
+
+    if (!pasid_present) {
+        error_report_once("Page group response without PASID is"
+                          " not supported yet");
+        return false;
+    }
+
+    vtd_dev_as = vtd_get_as_by_sid_and_pasid(s, rid, pasid);
+    if (!vtd_dev_as) {
+        return true;
+    }
+
+    response.prgi = prgi;
+
+    if (response_code == 0x0u) {
+        response.response_code = IOMMU_PRI_RESP_SUCCESS;
+    } else if (response_code == 0x1u) {
+        response.response_code = IOMMU_PRI_RESP_INVALID_REQUEST;
+    } else {
+        response.response_code = IOMMU_PRI_RESP_FAILURE;
+    }
+
+    if (vtd_dev_as->pri_notifier) {
+        vtd_dev_as->pri_notifier->notify(vtd_dev_as->pri_notifier, &response);
+    }
+
+    return true;
+}
+
 static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s,
                                           VTDInvDesc *inv_desc)
 {
@@ -3246,6 +3301,13 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s)
         }
         break;

+    case VTD_INV_DESC_PGRESP:
+        trace_vtd_inv_desc("page group response", inv_desc.hi, inv_desc.lo);
+        if (!vtd_process_page_group_response_desc(s, &inv_desc)) {
+            return false;
+        }
+        break;
+
     /*
      * TODO: the entity of below two cases will be implemented in future series.
      * To make guest (which integrates scalable mode support patch set in
@@ -3380,6 +3442,27 @@ static void vtd_handle_iectl_write(IntelIOMMUState *s)
     }
 }

+static void vtd_handle_prs_write(IntelIOMMUState *s)
+{
+    uint32_t prs = vtd_get_long_raw(s, DMAR_PRS_REG);
+    if (!(prs & VTD_PR_STATUS_PPR) && !(prs & VTD_PR_STATUS_PRO)) {
+        vtd_set_clear_mask_long(s, DMAR_PECTL_REG, VTD_PR_PECTL_IP, 0);
+    }
+}
+
+static void vtd_handle_pectl_write(IntelIOMMUState *s)
+{
+    uint32_t pectl = vtd_get_long_raw(s, DMAR_PECTL_REG);
+    if ((pectl & VTD_PR_PECTL_IP) && !(pectl & VTD_PR_PECTL_IM)) {
+        /*
+         * If the IP field is set when software clears the IM field,
+         * the interrupt is generated along with clearing the IP field.
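+         * Typical sequence: hardware queues a page request while IM=1, so
+         * only IP gets set; once software unmasks the event (writes IM=0),
+         * this path clears IP and delivers the interrupt through
+         * DMAR_PEADDR_REG/DMAR_PEDATA_REG.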
+ */ + vtd_set_clear_mask_long(s, DMAR_PECTL_REG, VTD_PR_PECTL_IP, 0); + vtd_generate_interrupt(s, DMAR_PEADDR_REG, DMAR_PEDATA_REG); + } +} + static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size) { IntelIOMMUState *s = opaque; @@ -3422,6 +3505,11 @@ static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size) val = s->iq >> 32; break; + case DMAR_PEUADDR_REG: + assert(size == 4); + val = vtd_get_long_raw(s, DMAR_PEUADDR_REG); + break; + default: if (size == 4) { val = vtd_get_long(s, addr); @@ -3485,6 +3573,11 @@ static void vtd_mem_write(void *opaque, hwaddr addr, vtd_handle_iotlb_write(s); break; + case DMAR_PEUADDR_REG: + assert(size == 4); + vtd_set_long(s, addr, val); + break; + /* Invalidate Address Register, 64-bit */ case DMAR_IVA_REG: if (size == 4) { @@ -3665,6 +3758,18 @@ static void vtd_mem_write(void *opaque, hwaddr addr, vtd_set_long(s, addr, val); break; + case DMAR_PRS_REG: + assert(size == 4); + vtd_set_long(s, addr, val); + vtd_handle_prs_write(s); + break; + + case DMAR_PECTL_REG: + assert(size == 4); + vtd_set_long(s, addr, val); + vtd_handle_pectl_write(s); + break; + default: if (size == 4) { vtd_set_long(s, addr, val); @@ -3835,7 +3940,6 @@ static const Property vtd_properties[] = { DEFINE_PROP_BOOL("snoop-control", IntelIOMMUState, snoop_control, false), DEFINE_PROP_BOOL("x-pasid-mode", IntelIOMMUState, pasid, false), DEFINE_PROP_BOOL("dma-drain", IntelIOMMUState, dma_drain, true), - DEFINE_PROP_BOOL("dma-translation", IntelIOMMUState, dma_translation, true), DEFINE_PROP_BOOL("stale-tm", IntelIOMMUState, stale_tm, false), DEFINE_PROP_BOOL("fs1gp", IntelIOMMUState, fs1gp, true), }; @@ -4378,6 +4482,12 @@ static bool vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn, assert(hiod); + if (!s->caching_mode) { + error_setg(errp, "Device assignment is not allowed without enabling " + "caching-mode=on for Intel IOMMU."); + return false; + } + vtd_iommu_lock(s); if (g_hash_table_lookup(s->vtd_host_iommu_dev, &key)) { @@ -4549,11 +4659,11 @@ static void vtd_cap_init(IntelIOMMUState *s) s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS | - VTD_CAP_MGAW(s->aw_bits); + VTD_CAP_ESRTPS | VTD_CAP_MGAW(s->aw_bits); if (s->dma_drain) { s->cap |= VTD_CAP_DRAIN; } - if (s->dma_translation) { + if (x86_iommu->dma_translation) { if (s->aw_bits >= VTD_HOST_AW_39BIT) { s->cap |= VTD_CAP_SAGAW_39bit; } @@ -4716,6 +4826,18 @@ static void vtd_init(IntelIOMMUState *s) * Interrupt remapping registers. 
 */
    vtd_define_quad(s, DMAR_IRTA_REG, 0, 0xfffffffffffff80fULL, 0);
+
+    /* Page request registers */
+    if (s->ecap & VTD_ECAP_PRS) {
+        vtd_define_quad(s, DMAR_PQH_REG, 0, 0x7ffe0ULL, 0);
+        vtd_define_quad(s, DMAR_PQT_REG, 0, 0x7ffe0ULL, 0);
+        vtd_define_quad(s, DMAR_PQA_REG, 0, 0xfffffffffffff007ULL, 0);
+        vtd_define_long(s, DMAR_PRS_REG, 0, 0, 0x3UL);
+        vtd_define_long(s, DMAR_PECTL_REG, 0, 0x80000000UL, 0);
+        vtd_define_long(s, DMAR_PEDATA_REG, 0, 0xffffUL, 0);
+        vtd_define_long(s, DMAR_PEADDR_REG, 0, 0xfffffffcUL, 0);
+        vtd_define_long(s, DMAR_PEUADDR_REG, 0, 0xffffffffUL, 0);
+    }
 }

 /* Should not reset address_spaces when reset because devices will still use
@@ -4803,6 +4925,194 @@ static ssize_t vtd_ats_request_translation(PCIBus *bus, void *opaque,
     return res_index;
 }

+/* 11.4.11.3: The number of entries in the page request queue is 2^(PQS + 7) */
+static inline uint64_t vtd_prq_size(IntelIOMMUState *s)
+{
+    return 1ULL << ((vtd_get_quad(s, DMAR_PQA_REG) & VTD_PQA_SIZE) + 7);
+}
+
+/**
+ * Return true if the bit is accessible and correctly set, false otherwise
+ */
+static bool vtd_check_pre_bit(VTDAddressSpace *vtd_as, hwaddr addr,
+                              uint16_t sid, bool is_write)
+{
+    int ret;
+    IntelIOMMUState *s = vtd_as->iommu_state;
+    uint8_t bus_n = pci_bus_num(vtd_as->bus);
+    VTDContextEntry ce;
+    bool is_fpd_set = false;
+
+    ret = vtd_dev_to_context_entry(s, bus_n, vtd_as->devfn, &ce);
+
+    if (ret) {
+        goto error_report;
+    }
+
+    if (!VTD_CE_GET_PRE(&ce)) {
+        ret = -VTD_FR_SM_PRE_ABS;
+        goto error_get_fpd_and_report;
+    }
+
+    return true;
+
+error_get_fpd_and_report:
+    /* Try to get fpd (may not work but we are already on an error path) */
+    is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD;
+    vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set, vtd_as->pasid);
+error_report:
+    vtd_report_fault(s, -ret, is_fpd_set, sid, addr, is_write,
+                     vtd_as->pasid != PCI_NO_PASID, vtd_as->pasid);
+    return false;
+}
+
+/* Logic described in section 7.5 */
+static void vtd_generate_page_request_event(IntelIOMMUState *s,
+                                            uint32_t old_pr_status)
+{
+    uint32_t current_pectl = vtd_get_long(s, DMAR_PECTL_REG);
+    /*
+     * Hardware evaluates the PPR and PRO fields in the Page Request Status
+     * Register, and if either of them is set, no Page Request Event is
+     * generated.
+     */
+    if (old_pr_status & (VTD_PR_STATUS_PRO | VTD_PR_STATUS_PPR)) {
+        return;
+    }
+
+    vtd_set_clear_mask_long(s, DMAR_PECTL_REG, 0, VTD_PR_PECTL_IP);
+    if (!(current_pectl & VTD_PR_PECTL_IM)) {
+        vtd_set_clear_mask_long(s, DMAR_PECTL_REG, VTD_PR_PECTL_IP, 0);
+        vtd_generate_interrupt(s, DMAR_PEADDR_REG, DMAR_PEDATA_REG);
+    }
+}
+
+/* When calling this function, we know that we are in scalable mode */
+static int vtd_pri_perform_implicit_invalidation(VTDAddressSpace *vtd_as,
+                                                 hwaddr addr)
+{
+    IntelIOMMUState *s = vtd_as->iommu_state;
+    VTDContextEntry ce;
+    VTDPASIDEntry pe;
+    uint16_t pgtt;
+    uint16_t domain_id;
+    int ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
+                                       vtd_as->devfn, &ce);
+    if (ret) {
+        return -EINVAL;
+    }
+    ret = vtd_ce_get_rid2pasid_entry(s, &ce, &pe, vtd_as->pasid);
+    if (ret) {
+        return -EINVAL;
+    }
+    pgtt = VTD_PE_GET_TYPE(&pe);
+    domain_id = VTD_SM_PASID_ENTRY_DID(pe.val[1]);
+    ret = 0;
+    switch (pgtt) {
+    case VTD_SM_PASID_ENTRY_FLT:
+        vtd_piotlb_page_invalidate(s, domain_id, vtd_as->pasid, addr, 0);
+        break;
+    /* Room for other pgtt values */
+    default:
+        error_report_once("Translation type not supported yet: %d", pgtt);
+        ret = -EINVAL;
+        break;
+    }
+
+    return ret;
+}
+
+/* Page Request Descriptor: 7.4.1.1 */
+static int
vtd_pri_request_page(PCIBus *bus, void *opaque, int devfn,
+                     uint32_t pasid, bool priv_req, bool exec_req,
+                     hwaddr addr, bool lpig, uint16_t prgi,
+                     bool is_read, bool is_write)
+{
+    IntelIOMMUState *s = opaque;
+    VTDAddressSpace *vtd_as;
+
+    vtd_as = vtd_find_add_as(s, bus, devfn, pasid);
+
+    uint64_t queue_addr_reg = vtd_get_quad(s, DMAR_PQA_REG);
+    uint64_t queue_tail_offset_reg = vtd_get_quad(s, DMAR_PQT_REG);
+    uint64_t new_queue_tail_offset = (
+        (queue_tail_offset_reg + VTD_PQA_ENTRY_SIZE) %
        (vtd_prq_size(s) * VTD_PQA_ENTRY_SIZE));
+    uint64_t queue_head_offset_reg = vtd_get_quad(s, DMAR_PQH_REG);
+    hwaddr queue_tail = (queue_addr_reg & VTD_PQA_ADDR) + queue_tail_offset_reg;
+    uint32_t old_pr_status = vtd_get_long(s, DMAR_PRS_REG);
+    uint16_t sid = PCI_BUILD_BDF(pci_bus_num(vtd_as->bus), vtd_as->devfn);
+    VTDPRDesc desc;
+
+    if (!(s->ecap & VTD_ECAP_PRS)) {
+        return -EPERM;
+    }
+
+    /*
+     * No need to check if scalable mode is enabled, as we already know that
+     * VTD_ECAP_PRS is set (see vtd_decide_config)
+     */
+
+    /* We do not support PRI without PASID */
+    if (vtd_as->pasid == PCI_NO_PASID) {
+        return -EPERM;
+    }
+    if (exec_req && !is_read) {
+        return -EINVAL;
+    }
+
+    /* Check PRE bit in the scalable mode context entry */
+    if (!vtd_check_pre_bit(vtd_as, addr, sid, is_write)) {
+        return -EPERM;
+    }
+
+    if (old_pr_status & VTD_PR_STATUS_PRO) {
+        /*
+         * No action is taken by hardware to report a fault
+         * or generate an event
+         */
+        return -ENOSPC;
+    }
+
+    /* Check for overflow */
+    if (new_queue_tail_offset == queue_head_offset_reg) {
+        vtd_set_clear_mask_long(s, DMAR_PRS_REG, 0, VTD_PR_STATUS_PRO);
+        vtd_generate_page_request_event(s, old_pr_status);
+        return -ENOSPC;
+    }
+
+    if (vtd_pri_perform_implicit_invalidation(vtd_as, addr)) {
+        return -EINVAL;
+    }
+
+    desc.lo = VTD_PRD_TYPE | VTD_PRD_PP(true) | VTD_PRD_RID(sid) |
+              VTD_PRD_PASID(vtd_as->pasid) | VTD_PRD_PMR(priv_req);
+    desc.hi = VTD_PRD_RDR(is_read) | VTD_PRD_WRR(is_write) |
+              VTD_PRD_LPIG(lpig) | VTD_PRD_PRGI(prgi) | VTD_PRD_ADDR(addr);
+
+    desc.lo = cpu_to_le64(desc.lo);
+    desc.hi = cpu_to_le64(desc.hi);
+    if (dma_memory_write(&address_space_memory, queue_tail, &desc, sizeof(desc),
+                         MEMTXATTRS_UNSPECIFIED)) {
+        error_report_once("IO error, the PQ tail cannot be updated");
+        return -EIO;
+    }

+    /* increment the tail register and set the pending request bit */
+    vtd_set_quad(s, DMAR_PQT_REG, new_queue_tail_offset);
+    /*
+     * Read the status again so that the kernel does not miss a request.
+     * In some cases we can trigger an unnecessary interrupt, but this
+     * strategy drastically improves performance as we don't need to take
+     * a lock.
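+     * (Illustrative interleaving: the guest drains the queue and clears
+     * PPR right after the dma_memory_write() above; re-reading PRS here
+     * lets us set PPR again and raise a fresh event instead of losing the
+     * notification.)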
+ */ + old_pr_status = vtd_get_long(s, DMAR_PRS_REG); + if (!(old_pr_status & VTD_PR_STATUS_PPR)) { + vtd_set_clear_mask_long(s, DMAR_PRS_REG, 0, VTD_PR_STATUS_PPR); + vtd_generate_page_request_event(s, old_pr_status); + } + + return 0; +} + static void vtd_init_iotlb_notifier(PCIBus *bus, void *opaque, int devfn, IOMMUNotifier *n, IOMMUNotify fn, void *user_opaque) @@ -4844,6 +5154,26 @@ static void vtd_unregister_iotlb_notifier(PCIBus *bus, void *opaque, memory_region_unregister_iommu_notifier(MEMORY_REGION(&vtd_as->iommu), n); } +static void vtd_pri_register_notifier(PCIBus *bus, void *opaque, int devfn, + uint32_t pasid, IOMMUPRINotifier *notifier) +{ + IntelIOMMUState *s = opaque; + VTDAddressSpace *vtd_as; + + vtd_as = vtd_find_add_as(s, bus, devfn, pasid); + vtd_as->pri_notifier = notifier; +} + +static void vtd_pri_unregister_notifier(PCIBus *bus, void *opaque, + int devfn, uint32_t pasid) +{ + IntelIOMMUState *s = opaque; + VTDAddressSpace *vtd_as; + + vtd_as = vtd_find_add_as(s, bus, devfn, pasid); + vtd_as->pri_notifier = NULL; +} + static PCIIOMMUOps vtd_iommu_ops = { .get_address_space = vtd_host_dma_iommu, .set_iommu_device = vtd_dev_set_iommu_device, @@ -4853,6 +5183,9 @@ static PCIIOMMUOps vtd_iommu_ops = { .register_iotlb_notifier = vtd_register_iotlb_notifier, .unregister_iotlb_notifier = vtd_unregister_iotlb_notifier, .ats_request_translation = vtd_ats_request_translation, + .pri_register_notifier = vtd_pri_register_notifier, + .pri_unregister_notifier = vtd_pri_unregister_notifier, + .pri_request_page = vtd_pri_request_page, }; static bool vtd_decide_config(IntelIOMMUState *s, Error **errp) @@ -4910,32 +5243,6 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error **errp) return true; } -static int vtd_machine_done_notify_one(Object *child, void *unused) -{ - IntelIOMMUState *iommu = INTEL_IOMMU_DEVICE(x86_iommu_get_default()); - - /* - * We hard-coded here because vfio-pci is the only special case - * here. Let's be more elegant in the future when we can, but so - * far there seems to be no better way. - */ - if (object_dynamic_cast(child, "vfio-pci") && !iommu->caching_mode) { - vtd_panic_require_caching_mode(); - } - - return 0; -} - -static void vtd_machine_done_hook(Notifier *notifier, void *unused) -{ - object_child_foreach_recursive(object_get_root(), - vtd_machine_done_notify_one, NULL); -} - -static Notifier vtd_machine_done_notify = { - .notify = vtd_machine_done_hook, -}; - static void vtd_realize(DeviceState *dev, Error **errp) { MachineState *ms = MACHINE(qdev_get_machine()); @@ -4990,7 +5297,6 @@ static void vtd_realize(DeviceState *dev, Error **errp) pci_setup_iommu(bus, &vtd_iommu_ops, dev); /* Pseudo address space under root PCI bus. 
*/ x86ms->ioapic_as = vtd_host_dma_iommu(bus, s, Q35_PSEUDO_DEVFN_IOAPIC); - qemu_add_machine_init_done_notifier(&vtd_machine_done_notify); } static void vtd_class_init(ObjectClass *klass, const void *data) diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h index 360e937989..0f6a1237e4 100644 --- a/hw/i386/intel_iommu_internal.h +++ b/hw/i386/intel_iommu_internal.h @@ -190,6 +190,7 @@ #define VTD_ECAP_EIM (1ULL << 4) #define VTD_ECAP_PT (1ULL << 6) #define VTD_ECAP_SC (1ULL << 7) +#define VTD_ECAP_PRS (1ULL << 29) #define VTD_ECAP_MHMV (15ULL << 20) #define VTD_ECAP_SRS (1ULL << 31) #define VTD_ECAP_PSS (7ULL << 35) /* limit: MemTxAttrs::pid */ @@ -214,6 +215,7 @@ #define VTD_CAP_DRAIN_WRITE (1ULL << 54) #define VTD_CAP_DRAIN_READ (1ULL << 55) #define VTD_CAP_FS1GP (1ULL << 56) +#define VTD_CAP_ESRTPS (1ULL << 63) #define VTD_CAP_DRAIN (VTD_CAP_DRAIN_READ | VTD_CAP_DRAIN_WRITE) #define VTD_CAP_CM (1ULL << 7) #define VTD_PASID_ID_SHIFT 20 @@ -314,6 +316,8 @@ typedef enum VTDFaultReason { * request while disabled */ VTD_FR_IR_SID_ERR = 0x26, /* Invalid Source-ID */ + VTD_FR_SM_PRE_ABS = 0x47, /* SCT.8 : PRE bit in a present SM CE is 0 */ + /* PASID directory entry access failure */ VTD_FR_PASID_DIR_ACCESS_ERR = 0x50, /* The Present(P) field of pasid directory entry is 0 */ @@ -376,6 +380,18 @@ union VTDInvDesc { }; typedef union VTDInvDesc VTDInvDesc; +/* Page Request Descriptor */ +union VTDPRDesc { + struct { + uint64_t lo; + uint64_t hi; + }; + struct { + uint64_t val[4]; + }; +}; +typedef union VTDPRDesc VTDPRDesc; + /* Masks for struct VTDInvDesc */ #define VTD_INV_DESC_ALL_ONE -1ULL #define VTD_INV_DESC_TYPE(val) ((((val) >> 5) & 0x70ULL) | \ @@ -389,6 +405,7 @@ typedef union VTDInvDesc VTDInvDesc; #define VTD_INV_DESC_PIOTLB 0x6 /* PASID-IOTLB Invalidate Desc */ #define VTD_INV_DESC_PC 0x7 /* PASID-cache Invalidate Desc */ #define VTD_INV_DESC_DEV_PIOTLB 0x8 /* PASID-based-DIOTLB inv_desc*/ +#define VTD_INV_DESC_PGRESP 0x9 /* Page Group Response Desc */ #define VTD_INV_DESC_NONE 0 /* Not an Invalidate Descriptor */ /* Masks for Invalidation Wait Descriptor*/ @@ -440,6 +457,15 @@ typedef union VTDInvDesc VTDInvDesc; #define VTD_INV_DESC_PASID_DEVICE_IOTLB_RSVD_VAL0 0xfff000000000f000ULL #define VTD_INV_DESC_PASID_DEVICE_IOTLB_RSVD_VAL1 0x7feULL +/* Mask for Page Group Response Descriptor */ +#define VTD_INV_DESC_PGRESP_RSVD_HI 0xfffffffffffff003ULL +#define VTD_INV_DESC_PGRESP_RSVD_LO 0xfff00000000001e0ULL +#define VTD_INV_DESC_PGRESP_PP(val) (((val) >> 4) & 0x1ULL) +#define VTD_INV_DESC_PGRESP_RC(val) (((val) >> 12) & 0xfULL) +#define VTD_INV_DESC_PGRESP_RID(val) (((val) >> 16) & 0xffffULL) +#define VTD_INV_DESC_PGRESP_PASID(val) (((val) >> 32) & 0xfffffULL) +#define VTD_INV_DESC_PGRESP_PRGI(val) (((val) >> 3) & 0x1ffULL) + /* Rsvd field masks for spte */ #define VTD_SPTE_SNP 0x800ULL @@ -491,6 +517,31 @@ typedef union VTDInvDesc VTDInvDesc; #define VTD_INV_DESC_PIOTLB_RSVD_VAL0 0xfff000000000f1c0ULL #define VTD_INV_DESC_PIOTLB_RSVD_VAL1 0xf80ULL +/* Page Request Descriptor */ +/* For the low 64-bit of 128-bit */ +#define VTD_PRD_TYPE (1ULL) +#define VTD_PRD_PP(val) (((val) & 1ULL) << 8) +#define VTD_PRD_RID(val) (((val) & 0xffffULL) << 16) +#define VTD_PRD_PASID(val) (((val) & 0xfffffULL) << 32) +#define VTD_PRD_EXR(val) (((val) & 1ULL) << 52) +#define VTD_PRD_PMR(val) (((val) & 1ULL) << 53) +/* For the high 64-bit of 128-bit */ +#define VTD_PRD_RDR(val) ((val) & 1ULL) +#define VTD_PRD_WRR(val) (((val) & 1ULL) << 1) +#define VTD_PRD_LPIG(val) (((val) & 1ULL) 
<< 2)
+#define VTD_PRD_PRGI(val)       (((val) & 0x1ffULL) << 3)
+#define VTD_PRD_ADDR(val)       ((val) & 0xfffffffffffff000ULL)
+
+/* Page Request Queue constants */
+#define VTD_PQA_ENTRY_SIZE 32 /* Size of an entry in bytes */
+/* Page Request Queue masks */
+#define VTD_PQA_ADDR      0xfffffffffffff000ULL /* PR queue address */
+#define VTD_PQA_SIZE      0x7ULL                /* PR queue size */
+#define VTD_PR_STATUS_PPR 1UL          /* Pending page request */
+#define VTD_PR_STATUS_PRO 2UL          /* Page request overflow */
+#define VTD_PR_PECTL_IP   0x40000000UL /* PR control interrupt pending */
+#define VTD_PR_PECTL_IM   0x80000000UL /* PR control interrupt mask */
+
 /* Information about page-selective IOTLB invalidate */
 struct VTDIOTLBPageInvInfo {
     uint16_t domain_id;
@@ -550,6 +601,7 @@ typedef struct VTDRootEntry VTDRootEntry;
 #define VTD_SM_CONTEXT_ENTRY_RID2PASID_MASK 0xfffff
 #define VTD_SM_CONTEXT_ENTRY_RSVD_VAL0(aw)  (0x1e0ULL | ~VTD_HAW_MASK(aw))
 #define VTD_SM_CONTEXT_ENTRY_RSVD_VAL1      0xffffffffffe00000ULL
+#define VTD_SM_CONTEXT_ENTRY_PRE            0x10ULL

 /* PASID Table Related Definitions */
 #define VTD_PASID_DIR_BASE_ADDR_MASK (~0xfffULL)
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index bc048a6d13..34b00663f2 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -837,6 +837,7 @@ void pc_memory_init(PCMachineState *pcms,
     hwaddr maxphysaddr, maxusedaddr;
     hwaddr cxl_base, cxl_resv_end = 0;
     X86CPU *cpu = X86_CPU(first_cpu);
+    uint64_t res_mem_end;

     assert(machine->ram_size == x86ms->below_4g_mem_size +
                                 x86ms->above_4g_mem_size);
@@ -978,16 +979,17 @@ void pc_memory_init(PCMachineState *pcms,

     rom_set_fw(fw_cfg);

-    if (machine->device_memory) {
-        uint64_t *val = g_malloc(sizeof(*val));
-        uint64_t res_mem_end;
+    if (pcms->cxl_devices_state.is_enabled) {
+        res_mem_end = cxl_resv_end;
+    } else if (machine->device_memory) {
+        res_mem_end = machine->device_memory->base +
+                      memory_region_size(&machine->device_memory->mr);
+    } else {
+        res_mem_end = 0;
+    }

-        if (pcms->cxl_devices_state.is_enabled) {
-            res_mem_end = cxl_resv_end;
-        } else {
-            res_mem_end = machine->device_memory->base
-                + memory_region_size(&machine->device_memory->mr);
-        }
+    if (res_mem_end) {
+        uint64_t *val = g_malloc(sizeof(*val));
         *val = cpu_to_le64(ROUND_UP(res_mem_end, 1 * GiB));
         fw_cfg_add_file(fw_cfg, "etc/reserved-memory-end", val, sizeof(*val));
     }
@@ -1720,25 +1722,6 @@ static void pc_machine_wakeup(MachineState *machine)
     cpu_synchronize_all_post_reset();
 }

-static bool pc_hotplug_allowed(MachineState *ms, DeviceState *dev, Error **errp)
-{
-    X86IOMMUState *iommu = x86_iommu_get_default();
-    IntelIOMMUState *intel_iommu;
-
-    if (iommu &&
-        object_dynamic_cast((Object *)iommu, TYPE_INTEL_IOMMU_DEVICE) &&
-        object_dynamic_cast((Object *)dev, "vfio-pci")) {
-        intel_iommu = INTEL_IOMMU_DEVICE(iommu);
-        if (!intel_iommu->caching_mode) {
-            error_setg(errp, "Device assignment is not allowed without "
-                       "enabling caching-mode=on for Intel IOMMU.");
-            return false;
-        }
-    }
-
-    return true;
-}
-
 static void pc_machine_class_init(ObjectClass *oc, const void *data)
 {
     MachineClass *mc = MACHINE_CLASS(oc);
@@ -1758,7 +1741,6 @@ static void pc_machine_class_init(ObjectClass *oc, const void *data)
     x86mc->apic_xrupt_override = true;
     assert(!mc->get_hotplug_handler);
     mc->get_hotplug_handler = pc_get_hotplug_handler;
-    mc->hotplug_allowed = pc_hotplug_allowed;
     mc->auto_enable_numa_with_memhp = true;
     mc->auto_enable_numa_with_memdev = true;
     mc->has_hotpluggable_cpus = true;
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index caf8bab68e..7b3611e973 100644
--- a/hw/i386/pc_piix.c
+++ 
b/hw/i386/pc_piix.c @@ -448,6 +448,7 @@ DEFINE_I440FX_MACHINE_AS_LATEST(10, 2); static void pc_i440fx_machine_10_1_options(MachineClass *m) { pc_i440fx_machine_10_2_options(m); + m->smbios_memory_device_size = 2047 * TiB; compat_props_add(m->compat_props, hw_compat_10_1, hw_compat_10_1_len); compat_props_add(m->compat_props, pc_compat_10_1, pc_compat_10_1_len); } diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index e89951285e..6015e639d7 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -384,6 +384,7 @@ DEFINE_Q35_MACHINE_AS_LATEST(10, 2); static void pc_q35_machine_10_1_options(MachineClass *m) { pc_q35_machine_10_2_options(m); + m->smbios_memory_device_size = 2047 * TiB; compat_props_add(m->compat_props, hw_compat_10_1, hw_compat_10_1_len); compat_props_add(m->compat_props, pc_compat_10_1, pc_compat_10_1_len); } diff --git a/hw/i386/x86-iommu.c b/hw/i386/x86-iommu.c index d34a6849f4..c127a44bb4 100644 --- a/hw/i386/x86-iommu.c +++ b/hw/i386/x86-iommu.c @@ -130,6 +130,7 @@ static const Property x86_iommu_properties[] = { intr_supported, ON_OFF_AUTO_AUTO), DEFINE_PROP_BOOL("device-iotlb", X86IOMMUState, dt_supported, false), DEFINE_PROP_BOOL("pt", X86IOMMUState, pt_supported, true), + DEFINE_PROP_BOOL("dma-translation", X86IOMMUState, dma_translation, true), }; static void x86_iommu_class_init(ObjectClass *klass, const void *data) diff --git a/hw/isa/lpc_ich9.c b/hw/isa/lpc_ich9.c index 304dffac32..c9cb8f7779 100644 --- a/hw/isa/lpc_ich9.c +++ b/hw/isa/lpc_ich9.c @@ -132,6 +132,11 @@ static void ich9_cc_init(ICH9LPCState *lpc) static void ich9_cc_reset(ICH9LPCState *lpc) { uint8_t *c = lpc->chip_config; + uint32_t gcs = ICH9_CC_GCS_DEFAULT; + + if (lpc->pin_strap.spkr_hi) { + gcs |= ICH9_CC_GCS_NO_REBOOT; + } memset(lpc->chip_config, 0, sizeof(lpc->chip_config)); @@ -142,7 +147,7 @@ static void ich9_cc_reset(ICH9LPCState *lpc) pci_set_long(c + ICH9_CC_D27IR, ICH9_CC_DIR_DEFAULT); pci_set_long(c + ICH9_CC_D26IR, ICH9_CC_DIR_DEFAULT); pci_set_long(c + ICH9_CC_D25IR, ICH9_CC_DIR_DEFAULT); - pci_set_long(c + ICH9_CC_GCS, ICH9_CC_GCS_DEFAULT); + pci_set_long(c + ICH9_CC_GCS, gcs); ich9_cc_update(lpc); } diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c index 06657bb3ac..8fef598b49 100644 --- a/hw/net/e1000e_core.c +++ b/hw/net/e1000e_core.c @@ -2822,8 +2822,9 @@ e1000e_update_rx_offloads(E1000ECore *core) trace_e1000e_rx_set_cso(cso_state); if (core->has_vnet) { - qemu_set_offload(qemu_get_queue(core->owner_nic)->peer, - cso_state, 0, 0, 0, 0, 0, 0); + NetOffloads ol = { .csum = cso_state }; + + qemu_set_offload(qemu_get_queue(core->owner_nic)->peer, &ol); } } diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c index 39e3ce1c8f..45d8fd795b 100644 --- a/hw/net/igb_core.c +++ b/hw/net/igb_core.c @@ -3058,8 +3058,9 @@ igb_update_rx_offloads(IGBCore *core) trace_e1000e_rx_set_cso(cso_state); if (core->has_vnet) { - qemu_set_offload(qemu_get_queue(core->owner_nic)->peer, - cso_state, 0, 0, 0, 0, 0, 0); + NetOffloads ol = {.csum = cso_state }; + + qemu_set_offload(qemu_get_queue(core->owner_nic)->peer, &ol); } } diff --git a/hw/net/igbvf.c b/hw/net/igbvf.c index 31d72c4977..9b0db8f841 100644 --- a/hw/net/igbvf.c +++ b/hw/net/igbvf.c @@ -251,10 +251,12 @@ static void igbvf_pci_realize(PCIDevice *dev, Error **errp) memory_region_init_io(&s->mmio, OBJECT(dev), &mmio_ops, s, "igbvf-mmio", IGBVF_MMIO_SIZE); - pcie_sriov_vf_register_bar(dev, IGBVF_MMIO_BAR_IDX, &s->mmio); + pci_register_bar(dev, IGBVF_MMIO_BAR_IDX, PCI_BASE_ADDRESS_MEM_TYPE_64 | + PCI_BASE_ADDRESS_MEM_PREFETCH, &s->mmio); 
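+
+    /*
+     * The VF BARs are now registered directly as 64-bit prefetchable BARs;
+     * the pcie_sriov_vf_register_bar() helper previously used here is
+     * removed from hw/pci/pcie_sriov.c further down in this diff.
+     */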
memory_region_init(&s->msix, OBJECT(dev), "igbvf-msix", IGBVF_MSIX_SIZE); - pcie_sriov_vf_register_bar(dev, IGBVF_MSIX_BAR_IDX, &s->msix); + pci_register_bar(dev, IGBVF_MSIX_BAR_IDX, PCI_BASE_ADDRESS_MEM_TYPE_64 | + PCI_BASE_ADDRESS_MEM_PREFETCH, &s->msix); ret = msix_init(dev, IGBVF_MSIX_VEC_NUM, &s->msix, IGBVF_MSIX_BAR_IDX, 0, &s->msix, IGBVF_MSIX_BAR_IDX, 0x2000, 0x70, errp); diff --git a/hw/net/vhost_net-stub.c b/hw/net/vhost_net-stub.c index 7d49f82906..0740d5a2eb 100644 --- a/hw/net/vhost_net-stub.c +++ b/hw/net/vhost_net-stub.c @@ -46,9 +46,8 @@ void vhost_net_cleanup(struct vhost_net *net) { } -uint64_t vhost_net_get_features(struct vhost_net *net, uint64_t features) +void vhost_net_get_features_ex(struct vhost_net *net, uint64_t *features) { - return features; } int vhost_net_get_config(struct vhost_net *net, uint8_t *config, @@ -62,13 +61,12 @@ int vhost_net_set_config(struct vhost_net *net, const uint8_t *data, return 0; } -void vhost_net_ack_features(struct vhost_net *net, uint64_t features) +void vhost_net_ack_features_ex(struct vhost_net *net, const uint64_t *features) { } -uint64_t vhost_net_get_acked_features(VHostNetState *net) +void vhost_net_get_acked_features_ex(VHostNetState *net, uint64_t *features) { - return 0; } bool vhost_net_virtqueue_pending(VHostNetState *net, int idx) diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c index 540492b37d..a8ee18a912 100644 --- a/hw/net/vhost_net.c +++ b/hw/net/vhost_net.c @@ -35,10 +35,9 @@ #include "hw/virtio/virtio-bus.h" #include "linux-headers/linux/vhost.h" -uint64_t vhost_net_get_features(struct vhost_net *net, uint64_t features) +void vhost_net_get_features_ex(struct vhost_net *net, uint64_t *features) { - return vhost_get_features(&net->dev, net->feature_bits, - features); + vhost_get_features_ex(&net->dev, net->feature_bits, features); } int vhost_net_get_config(struct vhost_net *net, uint8_t *config, uint32_t config_len) @@ -51,10 +50,11 @@ int vhost_net_set_config(struct vhost_net *net, const uint8_t *data, return vhost_dev_set_config(&net->dev, data, offset, size, flags); } -void vhost_net_ack_features(struct vhost_net *net, uint64_t features) +void vhost_net_ack_features_ex(struct vhost_net *net, const uint64_t *features) { - net->dev.acked_features = net->dev.backend_features; - vhost_ack_features(&net->dev, net->feature_bits, features); + virtio_features_copy(net->dev.acked_features_ex, + net->dev.backend_features_ex); + vhost_ack_features_ex(&net->dev, net->feature_bits, features); } uint64_t vhost_net_get_max_queues(VHostNetState *net) @@ -62,9 +62,9 @@ uint64_t vhost_net_get_max_queues(VHostNetState *net) return net->dev.max_queues; } -uint64_t vhost_net_get_acked_features(VHostNetState *net) +void vhost_net_get_acked_features_ex(VHostNetState *net, uint64_t *features) { - return net->dev.acked_features; + virtio_features_copy(features, net->dev.acked_features_ex); } void vhost_net_save_acked_features(NetClientState *nc) @@ -234,7 +234,8 @@ struct vhost_net *vhost_net_init(VhostNetOptions *options) int r; bool backend_kernel = options->backend_type == VHOST_BACKEND_TYPE_KERNEL; struct vhost_net *net = g_new0(struct vhost_net, 1); - uint64_t features = 0; + uint64_t missing_features[VIRTIO_FEATURES_NU64S]; + uint64_t features[VIRTIO_FEATURES_NU64S]; Error *local_err = NULL; if (!options->net_backend) { @@ -247,6 +248,7 @@ struct vhost_net *vhost_net_init(VhostNetOptions *options) net->save_acked_features = options->save_acked_features; net->max_tx_queue_size = options->max_tx_queue_size; net->is_vhost_user = 
options->is_vhost_user; + virtio_features_clear(features); net->dev.max_queues = 1; net->dev.vqs = net->vqs; @@ -261,7 +263,7 @@ struct vhost_net *vhost_net_init(VhostNetOptions *options) net->backend = r; net->dev.protocol_features = 0; } else { - net->dev.backend_features = 0; + virtio_features_clear(net->dev.backend_features_ex); net->dev.protocol_features = 0; net->backend = -1; @@ -281,26 +283,29 @@ struct vhost_net *vhost_net_init(VhostNetOptions *options) sizeof(struct virtio_net_hdr_mrg_rxbuf))) { net->dev.features &= ~(1ULL << VIRTIO_NET_F_MRG_RXBUF); } - if (~net->dev.features & net->dev.backend_features) { - fprintf(stderr, "vhost lacks feature mask 0x%" PRIx64 - " for backend\n", - (uint64_t)(~net->dev.features & net->dev.backend_features)); + + if (virtio_features_andnot(missing_features, + net->dev.backend_features_ex, + net->dev.features_ex)) { + fprintf(stderr, "vhost lacks feature mask 0x" VIRTIO_FEATURES_FMT + " for backend\n", VIRTIO_FEATURES_PR(missing_features)); goto fail; } } /* Set sane init value. Override when guest acks. */ if (options->get_acked_features) { - features = options->get_acked_features(net->nc); - if (~net->dev.features & features) { - fprintf(stderr, "vhost lacks feature mask 0x%" PRIx64 - " for backend\n", - (uint64_t)(~net->dev.features & features)); + virtio_features_from_u64(features, + options->get_acked_features(net->nc)); + if (virtio_features_andnot(missing_features, features, + net->dev.features_ex)) { + fprintf(stderr, "vhost lacks feature mask 0x" VIRTIO_FEATURES_FMT + " for backend\n", VIRTIO_FEATURES_PR(missing_features)); goto fail; } } - vhost_net_ack_features(net, features); + vhost_net_ack_features_ex(net, features); return net; diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 7848e26278..33116712eb 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -90,6 +90,25 @@ VIRTIO_NET_RSS_HASH_TYPE_TCP_EX | \ VIRTIO_NET_RSS_HASH_TYPE_UDP_EX) +/* + * Features starting from VIRTIO_NET_FEATURES_MAP_MIN bit correspond + * to guest offloads in the VIRTIO_NET_OFFLOAD_MAP range + */ +#define VIRTIO_NET_OFFLOAD_MAP_MIN 46 +#define VIRTIO_NET_OFFLOAD_MAP_LENGTH 4 +#define VIRTIO_NET_OFFLOAD_MAP MAKE_64BIT_MASK( \ + VIRTIO_NET_OFFLOAD_MAP_MIN, \ + VIRTIO_NET_OFFLOAD_MAP_LENGTH) +#define VIRTIO_NET_FEATURES_MAP_MIN 65 +#define VIRTIO_NET_F2O_SHIFT (VIRTIO_NET_OFFLOAD_MAP_MIN - \ + VIRTIO_NET_FEATURES_MAP_MIN + 64) + +static bool virtio_has_tunnel_hdr(const uint64_t *features) +{ + return virtio_has_feature_ex(features, VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO) || + virtio_has_feature_ex(features, VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO); +} + static const VirtIOFeature feature_sizes[] = { {.flags = 1ULL << VIRTIO_NET_F_MAC, .end = endof(struct virtio_net_config, mac)}, @@ -636,8 +655,18 @@ static int peer_has_uso(VirtIONet *n) return qemu_has_uso(qemu_get_queue(n->nic)->peer); } +static bool peer_has_tunnel(VirtIONet *n) +{ + if (!peer_has_vnet_hdr(n)) { + return false; + } + + return qemu_has_tunnel(qemu_get_queue(n->nic)->peer); +} + static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int mergeable_rx_bufs, - int version_1, int hash_report) + int version_1, int hash_report, + int tunnel) { int i; NetClientState *nc; @@ -645,9 +674,11 @@ static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int mergeable_rx_bufs, n->mergeable_rx_bufs = mergeable_rx_bufs; if (version_1) { - n->guest_hdr_len = hash_report ? - sizeof(struct virtio_net_hdr_v1_hash) : - sizeof(struct virtio_net_hdr_mrg_rxbuf); + n->guest_hdr_len = tunnel ? 
+ sizeof(struct virtio_net_hdr_v1_hash_tunnel) : + (hash_report ? + sizeof(struct virtio_net_hdr_v1_hash) : + sizeof(struct virtio_net_hdr_mrg_rxbuf)); n->rss_data.populate_hash = !!hash_report; } else { n->guest_hdr_len = n->mergeable_rx_bufs ? @@ -773,17 +804,31 @@ static uint64_t virtio_net_bad_features(VirtIODevice *vdev) static void virtio_net_apply_guest_offloads(VirtIONet *n) { - qemu_set_offload(qemu_get_queue(n->nic)->peer, - !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_CSUM)), - !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_TSO4)), - !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_TSO6)), - !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_ECN)), - !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_UFO)), - !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO4)), - !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO6))); + NetOffloads ol = { + .csum = !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_CSUM)), + .tso4 = !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_TSO4)), + .tso6 = !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_TSO6)), + .ecn = !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_ECN)), + .ufo = !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_UFO)), + .uso4 = !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO4)), + .uso6 = !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO6)), + .tnl = !!(n->curr_guest_offloads & + (1ULL << VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_MAPPED)), + .tnl_csum = !!(n->curr_guest_offloads & + (1ULL << VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_CSUM_MAPPED)), + }; + + qemu_set_offload(qemu_get_queue(n->nic)->peer, &ol); +} + +static uint64_t virtio_net_features_to_offload(const uint64_t *features) +{ + return (features[0] & ~VIRTIO_NET_OFFLOAD_MAP) | + ((features[1] << VIRTIO_NET_F2O_SHIFT) & VIRTIO_NET_OFFLOAD_MAP); } -static uint64_t virtio_net_guest_offloads_by_features(uint64_t features) +static uint64_t +virtio_net_guest_offloads_by_features(const uint64_t *features) { static const uint64_t guest_offloads_mask = (1ULL << VIRTIO_NET_F_GUEST_CSUM) | @@ -792,15 +837,17 @@ static uint64_t virtio_net_guest_offloads_by_features(uint64_t features) (1ULL << VIRTIO_NET_F_GUEST_ECN) | (1ULL << VIRTIO_NET_F_GUEST_UFO) | (1ULL << VIRTIO_NET_F_GUEST_USO4) | - (1ULL << VIRTIO_NET_F_GUEST_USO6); + (1ULL << VIRTIO_NET_F_GUEST_USO6) | + (1ULL << VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_MAPPED) | + (1ULL << VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_CSUM_MAPPED); - return guest_offloads_mask & features; + return guest_offloads_mask & virtio_net_features_to_offload(features); } uint64_t virtio_net_supported_guest_offloads(const VirtIONet *n) { VirtIODevice *vdev = VIRTIO_DEVICE(n); - return virtio_net_guest_offloads_by_features(vdev->guest_features); + return virtio_net_guest_offloads_by_features(vdev->guest_features_ex); } typedef struct { @@ -879,34 +926,40 @@ static void failover_add_primary(VirtIONet *n, Error **errp) error_propagate(errp, err); } -static void virtio_net_set_features(VirtIODevice *vdev, uint64_t features) +static void virtio_net_set_features(VirtIODevice *vdev, + const uint64_t *in_features) { + uint64_t features[VIRTIO_FEATURES_NU64S]; VirtIONet *n = VIRTIO_NET(vdev); Error *err = NULL; int i; + virtio_features_copy(features, in_features); if (n->mtu_bypass_backend && !virtio_has_feature(vdev->backend_features, VIRTIO_NET_F_MTU)) { - features &= ~(1ULL << VIRTIO_NET_F_MTU); + virtio_clear_feature_ex(features, VIRTIO_NET_F_MTU); } 
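+    /*
+     * Note: from here on 'features' is the VIRTIO_FEATURES_NU64S-sized
+     * array copied above, not a plain uint64_t, so all checks below go
+     * through the _ex helpers.
+     */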
virtio_net_set_multiqueue(n, - virtio_has_feature(features, VIRTIO_NET_F_RSS) || - virtio_has_feature(features, VIRTIO_NET_F_MQ)); + virtio_has_feature_ex(features, + VIRTIO_NET_F_RSS) || + virtio_has_feature_ex(features, + VIRTIO_NET_F_MQ)); virtio_net_set_mrg_rx_bufs(n, - virtio_has_feature(features, + virtio_has_feature_ex(features, VIRTIO_NET_F_MRG_RXBUF), - virtio_has_feature(features, + virtio_has_feature_ex(features, VIRTIO_F_VERSION_1), - virtio_has_feature(features, - VIRTIO_NET_F_HASH_REPORT)); + virtio_has_feature_ex(features, + VIRTIO_NET_F_HASH_REPORT), + virtio_has_tunnel_hdr(features)); - n->rsc4_enabled = virtio_has_feature(features, VIRTIO_NET_F_RSC_EXT) && - virtio_has_feature(features, VIRTIO_NET_F_GUEST_TSO4); - n->rsc6_enabled = virtio_has_feature(features, VIRTIO_NET_F_RSC_EXT) && - virtio_has_feature(features, VIRTIO_NET_F_GUEST_TSO6); - n->rss_data.redirect = virtio_has_feature(features, VIRTIO_NET_F_RSS); + n->rsc4_enabled = virtio_has_feature_ex(features, VIRTIO_NET_F_RSC_EXT) && + virtio_has_feature_ex(features, VIRTIO_NET_F_GUEST_TSO4); + n->rsc6_enabled = virtio_has_feature_ex(features, VIRTIO_NET_F_RSC_EXT) && + virtio_has_feature_ex(features, VIRTIO_NET_F_GUEST_TSO6); + n->rss_data.redirect = virtio_has_feature_ex(features, VIRTIO_NET_F_RSS); if (n->has_vnet_hdr) { n->curr_guest_offloads = @@ -920,7 +973,7 @@ static void virtio_net_set_features(VirtIODevice *vdev, uint64_t features) if (!get_vhost_net(nc->peer)) { continue; } - vhost_net_ack_features(get_vhost_net(nc->peer), features); + vhost_net_ack_features_ex(get_vhost_net(nc->peer), features); /* * keep acked_features in NetVhostUserState up-to-date so it @@ -929,12 +982,14 @@ static void virtio_net_set_features(VirtIODevice *vdev, uint64_t features) vhost_net_save_acked_features(nc->peer); } - if (virtio_has_feature(vdev->guest_features ^ features, VIRTIO_NET_F_CTRL_VLAN)) { - bool vlan = virtio_has_feature(features, VIRTIO_NET_F_CTRL_VLAN); + if (virtio_has_feature_ex(features, VIRTIO_NET_F_CTRL_VLAN) != + virtio_has_feature_ex(vdev->guest_features_ex, + VIRTIO_NET_F_CTRL_VLAN)) { + bool vlan = virtio_has_feature_ex(features, VIRTIO_NET_F_CTRL_VLAN); memset(n->vlans, vlan ? 
0 : 0xff, MAX_VLAN >> 3); } - if (virtio_has_feature(features, VIRTIO_NET_F_STANDBY)) { + if (virtio_has_feature_ex(features, VIRTIO_NET_F_STANDBY)) { qapi_event_send_failover_negotiated(n->netclient_name); qatomic_set(&n->failover_primary_hidden, false); failover_add_primary(n, &err); @@ -1905,10 +1960,10 @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf, virtio_error(vdev, "virtio-net unexpected empty queue: " "i %zd mergeable %d offset %zd, size %zd, " "guest hdr len %zd, host hdr len %zd " - "guest features 0x%" PRIx64, + "guest features 0x" VIRTIO_FEATURES_FMT, i, n->mergeable_rx_bufs, offset, size, n->guest_hdr_len, n->host_hdr_len, - vdev->guest_features); + VIRTIO_FEATURES_PR(vdev->guest_features_ex)); } err = -1; goto err; @@ -3015,8 +3070,8 @@ static int virtio_net_pre_load_queues(VirtIODevice *vdev, uint32_t n) return 0; } -static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features, - Error **errp) +static void virtio_net_get_features(VirtIODevice *vdev, uint64_t *features, + Error **errp) { VirtIONet *n = VIRTIO_NET(vdev); NetClientState *nc = qemu_get_queue(n->nic); @@ -3030,68 +3085,83 @@ static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features, (supported_hash_types & peer_hash_types) == supported_hash_types; /* Firstly sync all virtio-net possible supported features */ - features |= n->host_features; + virtio_features_or(features, features, n->host_features_ex); - virtio_add_feature(&features, VIRTIO_NET_F_MAC); + virtio_add_feature_ex(features, VIRTIO_NET_F_MAC); if (!peer_has_vnet_hdr(n)) { - virtio_clear_feature(&features, VIRTIO_NET_F_CSUM); - virtio_clear_feature(&features, VIRTIO_NET_F_HOST_TSO4); - virtio_clear_feature(&features, VIRTIO_NET_F_HOST_TSO6); - virtio_clear_feature(&features, VIRTIO_NET_F_HOST_ECN); + virtio_clear_feature_ex(features, VIRTIO_NET_F_CSUM); + virtio_clear_feature_ex(features, VIRTIO_NET_F_HOST_TSO4); + virtio_clear_feature_ex(features, VIRTIO_NET_F_HOST_TSO6); + virtio_clear_feature_ex(features, VIRTIO_NET_F_HOST_ECN); + + virtio_clear_feature_ex(features, VIRTIO_NET_F_GUEST_CSUM); + virtio_clear_feature_ex(features, VIRTIO_NET_F_GUEST_TSO4); + virtio_clear_feature_ex(features, VIRTIO_NET_F_GUEST_TSO6); + virtio_clear_feature_ex(features, VIRTIO_NET_F_GUEST_ECN); - virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_CSUM); - virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_TSO4); - virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_TSO6); - virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_ECN); + virtio_clear_feature_ex(features, VIRTIO_NET_F_HOST_USO); + virtio_clear_feature_ex(features, VIRTIO_NET_F_GUEST_USO4); + virtio_clear_feature_ex(features, VIRTIO_NET_F_GUEST_USO6); - virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO); - virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4); - virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6); + virtio_clear_feature_ex(features, VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO); + virtio_clear_feature_ex(features, VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO); + virtio_clear_feature_ex(features, + VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_CSUM); + virtio_clear_feature_ex(features, + VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO_CSUM); - virtio_clear_feature(&features, VIRTIO_NET_F_HASH_REPORT); + virtio_clear_feature_ex(features, VIRTIO_NET_F_HASH_REPORT); } if (!peer_has_vnet_hdr(n) || !peer_has_ufo(n)) { - virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_UFO); - virtio_clear_feature(&features, VIRTIO_NET_F_HOST_UFO); + virtio_clear_feature_ex(features, 
VIRTIO_NET_F_GUEST_UFO); + virtio_clear_feature_ex(features, VIRTIO_NET_F_HOST_UFO); } - if (!peer_has_uso(n)) { - virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO); - virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4); - virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6); + virtio_clear_feature_ex(features, VIRTIO_NET_F_HOST_USO); + virtio_clear_feature_ex(features, VIRTIO_NET_F_GUEST_USO4); + virtio_clear_feature_ex(features, VIRTIO_NET_F_GUEST_USO6); + } + + if (!peer_has_tunnel(n)) { + virtio_clear_feature_ex(features, VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO); + virtio_clear_feature_ex(features, VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO); + virtio_clear_feature_ex(features, + VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_CSUM); + virtio_clear_feature_ex(features, + VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO_CSUM); } if (!get_vhost_net(nc->peer)) { if (!use_own_hash) { - virtio_clear_feature(&features, VIRTIO_NET_F_HASH_REPORT); - virtio_clear_feature(&features, VIRTIO_NET_F_RSS); - } else if (virtio_has_feature(features, VIRTIO_NET_F_RSS)) { + virtio_clear_feature_ex(features, VIRTIO_NET_F_HASH_REPORT); + virtio_clear_feature_ex(features, VIRTIO_NET_F_RSS); + } else if (virtio_has_feature_ex(features, VIRTIO_NET_F_RSS)) { virtio_net_load_ebpf(n, errp); } - return features; + return; } if (!use_peer_hash) { - virtio_clear_feature(&features, VIRTIO_NET_F_HASH_REPORT); + virtio_clear_feature_ex(features, VIRTIO_NET_F_HASH_REPORT); if (!use_own_hash || !virtio_net_attach_ebpf_to_backend(n->nic, -1)) { if (!virtio_net_load_ebpf(n, errp)) { - return features; + return; } - virtio_clear_feature(&features, VIRTIO_NET_F_RSS); + virtio_clear_feature_ex(features, VIRTIO_NET_F_RSS); } } - features = vhost_net_get_features(get_vhost_net(nc->peer), features); - vdev->backend_features = features; + vhost_net_get_features_ex(get_vhost_net(nc->peer), features); + virtio_features_copy(vdev->backend_features_ex, features); if (n->mtu_bypass_backend && (n->host_features & 1ULL << VIRTIO_NET_F_MTU)) { - features |= (1ULL << VIRTIO_NET_F_MTU); + virtio_add_feature_ex(features, VIRTIO_NET_F_MTU); } /* @@ -3106,10 +3176,8 @@ static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features, * support it. 
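 * (Without a control virtqueue the guest has no way to send the announce
 * acknowledgement command, so the feature is cleared here.)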
*/ if (!virtio_has_feature(vdev->backend_features, VIRTIO_NET_F_CTRL_VQ)) { - virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_ANNOUNCE); + virtio_clear_feature_ex(features, VIRTIO_NET_F_GUEST_ANNOUNCE); } - - return features; } static int virtio_net_post_load_device(void *opaque, int version_id) @@ -3117,13 +3185,15 @@ static int virtio_net_post_load_device(void *opaque, int version_id) VirtIONet *n = opaque; VirtIODevice *vdev = VIRTIO_DEVICE(n); int i, link_down; + bool has_tunnel_hdr = virtio_has_tunnel_hdr(vdev->guest_features_ex); trace_virtio_net_post_load_device(); virtio_net_set_mrg_rx_bufs(n, n->mergeable_rx_bufs, virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1), virtio_vdev_has_feature(vdev, - VIRTIO_NET_F_HASH_REPORT)); + VIRTIO_NET_F_HASH_REPORT), + has_tunnel_hdr); /* MAC_TABLE_ENTRIES may be different from the saved image */ if (n->mac_table.in_use > MAC_TABLE_ENTRIES) { @@ -3943,7 +4013,7 @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp) n->vqs[0].tx_waiting = 0; n->tx_burst = n->net_conf.txburst; - virtio_net_set_mrg_rx_bufs(n, 0, 0, 0); + virtio_net_set_mrg_rx_bufs(n, 0, 0, 0, 0); n->promisc = 1; /* for compatibility */ n->mac_table.macs = g_malloc0(MAC_TABLE_ENTRIES * ETH_ALEN); @@ -4227,6 +4297,22 @@ static const Property virtio_net_properties[] = { rss_data.specified_hash_types, VIRTIO_NET_HASH_REPORT_UDPv6_EX - 1, ON_OFF_AUTO_AUTO), + VIRTIO_DEFINE_PROP_FEATURE("host_tunnel", VirtIONet, + host_features_ex, + VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO, + false), + VIRTIO_DEFINE_PROP_FEATURE("host_tunnel_csum", VirtIONet, + host_features_ex, + VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO_CSUM, + false), + VIRTIO_DEFINE_PROP_FEATURE("guest_tunnel", VirtIONet, + host_features_ex, + VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO, + false), + VIRTIO_DEFINE_PROP_FEATURE("guest_tunnel_csum", VirtIONet, + host_features_ex, + VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_CSUM, + false), }; static void virtio_net_class_init(ObjectClass *klass, const void *data) @@ -4241,8 +4327,8 @@ static void virtio_net_class_init(ObjectClass *klass, const void *data) vdc->unrealize = virtio_net_device_unrealize; vdc->get_config = virtio_net_get_config; vdc->set_config = virtio_net_set_config; - vdc->get_features = virtio_net_get_features; - vdc->set_features = virtio_net_set_features; + vdc->get_features_ex = virtio_net_get_features; + vdc->set_features_ex = virtio_net_set_features; vdc->bad_features = virtio_net_bad_features; vdc->reset = virtio_net_reset; vdc->queue_reset = virtio_net_queue_reset; diff --git a/hw/net/vmxnet3.c b/hw/net/vmxnet3.c index af73aa8ef2..03732375a7 100644 --- a/hw/net/vmxnet3.c +++ b/hw/net/vmxnet3.c @@ -1322,14 +1322,11 @@ static void vmxnet3_update_features(VMXNET3State *s) s->lro_supported, rxcso_supported, s->rx_vlan_stripping); if (s->peer_has_vhdr) { - qemu_set_offload(qemu_get_queue(s->nic)->peer, - rxcso_supported, - s->lro_supported, - s->lro_supported, - 0, - 0, - 0, - 0); + NetOffloads ol = { .csum = rxcso_supported, + .tso4 = s->lro_supported, + .tso6 = s->lro_supported }; + + qemu_set_offload(qemu_get_queue(s->nic)->peer, &ol); } } diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index f5ee6bf260..cd81f73997 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -8708,12 +8708,8 @@ static bool nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) msix_table_offset); memory_region_add_subregion(&n->bar0, 0, &n->iomem); - if (pci_is_vf(pci_dev)) { - pcie_sriov_vf_register_bar(pci_dev, 0, &n->bar0); - } else { - pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY | - 
PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0); - } + pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY | + PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0); ret = msix_init(pci_dev, nr_vectors, &n->bar0, 0, msix_table_offset, diff --git a/hw/pci/pci.c b/hw/pci/pci.c index 5e2c3c6fc2..acc03fd470 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -1492,9 +1492,6 @@ void pci_register_bar(PCIDevice *pci_dev, int region_num, : pci_get_bus(pci_dev)->address_space_mem; if (pci_is_vf(pci_dev)) { - PCIDevice *pf = pci_dev->exp.sriov_vf.pf; - assert(!pf || type == pf->exp.sriov_pf.vf_bar_type[region_num]); - r->addr = pci_bar_address(pci_dev, region_num, r->type, r->size); if (r->addr != PCI_BAR_UNMAPPED) { memory_region_add_subregion_overlap(r->address_space, @@ -2968,7 +2965,7 @@ int pci_iommu_init_iotlb_notifier(PCIDevice *dev, IOMMUNotifier *n, PCIBus *iommu_bus; int devfn; - pci_device_get_iommu_bus_devfn(dev, &bus, &iommu_bus, &devfn); + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, &bus, &devfn); if (iommu_bus && iommu_bus->iommu_ops->init_iotlb_notifier) { iommu_bus->iommu_ops->init_iotlb_notifier(bus, iommu_bus->iommu_opaque, devfn, n, fn, opaque); @@ -3026,7 +3023,7 @@ int pci_pri_request_page(PCIDevice *dev, uint32_t pasid, bool priv_req, return -EPERM; } - pci_device_get_iommu_bus_devfn(dev, &bus, &iommu_bus, &devfn); + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, &bus, &devfn); if (iommu_bus && iommu_bus->iommu_ops->pri_request_page) { return iommu_bus->iommu_ops->pri_request_page(bus, iommu_bus->iommu_opaque, @@ -3050,7 +3047,7 @@ int pci_pri_register_notifier(PCIDevice *dev, uint32_t pasid, return -EPERM; } - pci_device_get_iommu_bus_devfn(dev, &bus, &iommu_bus, &devfn); + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, &bus, &devfn); if (iommu_bus && iommu_bus->iommu_ops->pri_register_notifier) { iommu_bus->iommu_ops->pri_register_notifier(bus, iommu_bus->iommu_opaque, @@ -3067,7 +3064,7 @@ void pci_pri_unregister_notifier(PCIDevice *dev, uint32_t pasid) PCIBus *iommu_bus; int devfn; - pci_device_get_iommu_bus_devfn(dev, &bus, &iommu_bus, &devfn); + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, &bus, &devfn); if (iommu_bus && iommu_bus->iommu_ops->pri_unregister_notifier) { iommu_bus->iommu_ops->pri_unregister_notifier(bus, iommu_bus->iommu_opaque, @@ -3099,7 +3096,7 @@ ssize_t pci_ats_request_translation(PCIDevice *dev, uint32_t pasid, return -EPERM; } - pci_device_get_iommu_bus_devfn(dev, &bus, &iommu_bus, &devfn); + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, &bus, &devfn); if (iommu_bus && iommu_bus->iommu_ops->ats_request_translation) { return iommu_bus->iommu_ops->ats_request_translation(bus, iommu_bus->iommu_opaque, @@ -3123,7 +3120,7 @@ int pci_iommu_register_iotlb_notifier(PCIDevice *dev, uint32_t pasid, return -EPERM; } - pci_device_get_iommu_bus_devfn(dev, &bus, &iommu_bus, &devfn); + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, &bus, &devfn); if (iommu_bus && iommu_bus->iommu_ops->register_iotlb_notifier) { iommu_bus->iommu_ops->register_iotlb_notifier(bus, iommu_bus->iommu_opaque, devfn, @@ -3145,7 +3142,7 @@ int pci_iommu_unregister_iotlb_notifier(PCIDevice *dev, uint32_t pasid, return -EPERM; } - pci_device_get_iommu_bus_devfn(dev, &bus, &iommu_bus, &devfn); + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, &bus, &devfn); if (iommu_bus && iommu_bus->iommu_ops->unregister_iotlb_notifier) { iommu_bus->iommu_ops->unregister_iotlb_notifier(bus, iommu_bus->iommu_opaque, @@ -3159,11 +3156,9 @@ int pci_iommu_unregister_iotlb_notifier(PCIDevice *dev, uint32_t pasid, int 
pci_iommu_get_iotlb_info(PCIDevice *dev, uint8_t *addr_width, uint32_t *min_page_size) { - PCIBus *bus; PCIBus *iommu_bus; - int devfn; - pci_device_get_iommu_bus_devfn(dev, &bus, &iommu_bus, &devfn); + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, NULL, NULL); if (iommu_bus && iommu_bus->iommu_ops->get_iotlb_info) { iommu_bus->iommu_ops->get_iotlb_info(iommu_bus->iommu_opaque, addr_width, min_page_size); diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c index eaeb68894e..b302de6419 100644 --- a/hw/pci/pcie.c +++ b/hw/pci/pcie.c @@ -1266,6 +1266,14 @@ void pcie_pri_init(PCIDevice *dev, uint16_t offset, uint32_t outstanding_pr_cap, dev->exp.pri_cap = offset; } +uint32_t pcie_pri_get_req_alloc(const PCIDevice *dev) +{ + if (!pcie_pri_enabled(dev)) { + return 0; + } + return pci_get_long(dev->config + dev->exp.pri_cap + PCI_PRI_ALLOC_REQ); +} + bool pcie_pri_enabled(const PCIDevice *dev) { if (!pci_is_express(dev) || !dev->exp.pri_cap) { diff --git a/hw/pci/pcie_sriov.c b/hw/pci/pcie_sriov.c index 8a4bf0d6f7..c4f88f0975 100644 --- a/hw/pci/pcie_sriov.c +++ b/hw/pci/pcie_sriov.c @@ -195,7 +195,9 @@ bool pcie_sriov_pf_init(PCIDevice *dev, uint16_t offset, void pcie_sriov_pf_exit(PCIDevice *dev) { - uint8_t *cfg = dev->config + dev->exp.sriov_cap; + if (dev->exp.sriov_cap == 0) { + return; + } if (dev->exp.sriov_pf.vf_user_created) { uint16_t ven_id = pci_get_word(dev->config + PCI_VENDOR_ID); @@ -211,6 +213,8 @@ void pcie_sriov_pf_exit(PCIDevice *dev) pci_config_set_device_id(dev->exp.sriov_pf.vf[i]->config, vf_dev_id); } } else { + uint8_t *cfg = dev->config + dev->exp.sriov_cap; + unparent_vfs(dev, pci_get_word(cfg + PCI_SRIOV_TOTAL_VF)); } } @@ -242,17 +246,6 @@ void pcie_sriov_pf_init_vf_bar(PCIDevice *dev, int region_num, dev->exp.sriov_pf.vf_bar_type[region_num] = type; } -void pcie_sriov_vf_register_bar(PCIDevice *dev, int region_num, - MemoryRegion *memory) -{ - uint8_t type; - - assert(dev->exp.sriov_vf.pf); - type = dev->exp.sriov_vf.pf->exp.sriov_pf.vf_bar_type[region_num]; - - return pci_register_bar(dev, region_num, type, memory); -} - static gint compare_vf_devfns(gconstpointer a, gconstpointer b) { return (*(PCIDevice **)a)->devfn - (*(PCIDevice **)b)->devfn; diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c index 34ae14f7bf..d817fc42b4 100644 --- a/hw/scsi/virtio-scsi.c +++ b/hw/scsi/virtio-scsi.c @@ -116,11 +116,7 @@ static void virtio_scsi_complete_req(VirtIOSCSIReq *req, QemuMutex *vq_lock) } virtqueue_push(vq, &req->elem, req->qsgl.size + req->resp_iov.size); - if (s->dataplane_started && !s->dataplane_fenced) { - virtio_notify_irqfd(vdev, vq); - } else { - virtio_notify(vdev, vq); - } + virtio_notify(vdev, vq); if (vq_lock) { qemu_mutex_unlock(vq_lock); diff --git a/hw/smbios/smbios.c b/hw/smbios/smbios.c index 1ac063cfb4..13e21a9c43 100644 --- a/hw/smbios/smbios.c +++ b/hw/smbios/smbios.c @@ -179,6 +179,10 @@ static const QemuOptDesc qemu_smbios_type0_opts[] = { .name = "uefi", .type = QEMU_OPT_BOOL, .help = "uefi support", + },{ + .name = "vm", + .type = QEMU_OPT_BOOL, + .help = "virtual machine", }, { /* end of list */ } }; @@ -574,10 +578,14 @@ static void smbios_build_type_0_table(void) t->bios_characteristics = cpu_to_le64(0x08); /* Not supported */ t->bios_characteristics_extension_bytes[0] = 0; - t->bios_characteristics_extension_bytes[1] = 0x14; /* TCD/SVVP | VM */ + + t->bios_characteristics_extension_bytes[1] = 0x04; /* TCD/SVVP */ if (smbios_type0.uefi) { t->bios_characteristics_extension_bytes[1] |= 0x08; /* |= UEFI */ } + if (smbios_type0.vm) { + 
t->bios_characteristics_extension_bytes[1] |= 0x10; /* |= VM */ + } if (smbios_type0.have_major_minor) { t->system_bios_major_release = smbios_type0.major; @@ -1405,6 +1413,7 @@ void smbios_entry_add(QemuOpts *opts, Error **errp) save_opt(&smbios_type0.version, opts, "version"); save_opt(&smbios_type0.date, opts, "date"); smbios_type0.uefi = qemu_opt_get_bool(opts, "uefi", false); + smbios_type0.vm = qemu_opt_get_bool(opts, "vm", true); val = qemu_opt_get(opts, "release"); if (val) { diff --git a/hw/virtio/Kconfig b/hw/virtio/Kconfig index 7648a2d68d..10f5c53ac0 100644 --- a/hw/virtio/Kconfig +++ b/hw/virtio/Kconfig @@ -126,3 +126,8 @@ config VHOST_USER_SCMI bool default y depends on VIRTIO && VHOST_USER && ARM + +config VHOST_USER_TEST + bool + default y + depends on VIRTIO && VHOST_USER diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build index 3ea7b3cec8..48b9fedfa5 100644 --- a/hw/virtio/meson.build +++ b/hw/virtio/meson.build @@ -22,7 +22,7 @@ if have_vhost system_virtio_ss.add(files('vhost-user-base.c')) # MMIO Stubs - system_virtio_ss.add(files('vhost-user-device.c')) + system_virtio_ss.add(when: 'CONFIG_VHOST_USER_TEST', if_true: files('vhost-user-test-device.c')) system_virtio_ss.add(when: 'CONFIG_VHOST_USER_GPIO', if_true: files('vhost-user-gpio.c')) system_virtio_ss.add(when: 'CONFIG_VHOST_USER_I2C', if_true: files('vhost-user-i2c.c')) system_virtio_ss.add(when: 'CONFIG_VHOST_USER_RNG', if_true: files('vhost-user-rng.c')) @@ -30,7 +30,8 @@ if have_vhost system_virtio_ss.add(when: 'CONFIG_VHOST_USER_INPUT', if_true: files('vhost-user-input.c')) # PCI Stubs - system_virtio_ss.add(when: 'CONFIG_VIRTIO_PCI', if_true: files('vhost-user-device-pci.c')) + system_virtio_ss.add(when: ['CONFIG_VIRTIO_PCI', 'CONFIG_VHOST_USER_TEST'], + if_true: files('vhost-user-test-device-pci.c')) system_virtio_ss.add(when: ['CONFIG_VIRTIO_PCI', 'CONFIG_VHOST_USER_GPIO'], if_true: files('vhost-user-gpio-pci.c')) system_virtio_ss.add(when: ['CONFIG_VIRTIO_PCI', 'CONFIG_VHOST_USER_I2C'], diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events index 76f0d458b2..658cc365e7 100644 --- a/hw/virtio/trace-events +++ b/hw/virtio/trace-events @@ -75,7 +75,6 @@ virtqueue_flush(void *vq, unsigned int count) "vq %p count %u" virtqueue_pop(void *vq, void *elem, unsigned int in_num, unsigned int out_num) "vq %p elem %p in_num %u out_num %u" virtio_queue_notify(void *vdev, int n, void *vq) "vdev %p n %d vq %p" virtio_notify_irqfd_deferred_fn(void *vdev, void *vq) "vdev %p vq %p" -virtio_notify_irqfd(void *vdev, void *vq) "vdev %p vq %p" virtio_notify(void *vdev, void *vq) "vdev %p vq %p" virtio_set_status(void *vdev, uint8_t val) "vdev %p val %u" diff --git a/hw/virtio/vdpa-dev.c b/hw/virtio/vdpa-dev.c index d1da40afc8..4a7b970976 100644 --- a/hw/virtio/vdpa-dev.c +++ b/hw/virtio/vdpa-dev.c @@ -338,6 +338,12 @@ static int vhost_vdpa_device_set_status(VirtIODevice *vdev, uint8_t status) return 0; } +static struct vhost_dev *vhost_vdpa_device_get_vhost(VirtIODevice *vdev) +{ + VhostVdpaDevice *s = VHOST_VDPA_DEVICE(vdev); + return &s->dev; +} + static const Property vhost_vdpa_device_properties[] = { DEFINE_PROP_STRING("vhostdev", VhostVdpaDevice, vhostdev), DEFINE_PROP_UINT16("queue-size", VhostVdpaDevice, queue_size, 0), @@ -369,6 +375,7 @@ static void vhost_vdpa_device_class_init(ObjectClass *klass, const void *data) vdc->set_config = vhost_vdpa_device_set_config; vdc->get_features = vhost_vdpa_device_get_features; vdc->set_status = vhost_vdpa_device_set_status; + vdc->get_vhost = 
vhost_vdpa_device_get_vhost; } static void vhost_vdpa_device_instance_init(Object *obj) diff --git a/hw/virtio/vhost-backend.c b/hw/virtio/vhost-backend.c index 833804dd40..4367db0d95 100644 --- a/hw/virtio/vhost-backend.c +++ b/hw/virtio/vhost-backend.c @@ -20,6 +20,11 @@ #include <linux/vhost.h> #include <sys/ioctl.h> +struct vhost_features { + uint64_t count; + uint64_t features[VIRTIO_FEATURES_NU64S]; +}; + static int vhost_kernel_call(struct vhost_dev *dev, unsigned long int request, void *arg) { @@ -182,12 +187,6 @@ static int vhost_kernel_get_vring_worker(struct vhost_dev *dev, return vhost_kernel_call(dev, VHOST_GET_VRING_WORKER, worker); } -static int vhost_kernel_set_features(struct vhost_dev *dev, - uint64_t features) -{ - return vhost_kernel_call(dev, VHOST_SET_FEATURES, &features); -} - static int vhost_kernel_set_backend_cap(struct vhost_dev *dev) { uint64_t features; @@ -210,10 +209,51 @@ static int vhost_kernel_set_backend_cap(struct vhost_dev *dev) return 0; } -static int vhost_kernel_get_features(struct vhost_dev *dev, - uint64_t *features) +static int vhost_kernel_set_features(struct vhost_dev *dev, + const uint64_t *features) { - return vhost_kernel_call(dev, VHOST_GET_FEATURES, features); + struct vhost_features farray; + bool extended_in_use; + int r; + + farray.count = VIRTIO_FEATURES_NU64S; + virtio_features_copy(farray.features, features); + extended_in_use = virtio_features_use_ex(farray.features); + + /* + * Can't check for ENOTTY: for unknown ioctls the kernel interprets + * the argument as a virtio queue id and most likely errors out validating + * such id, instead of reporting an unknown operation. + */ + r = vhost_kernel_call(dev, VHOST_SET_FEATURES_ARRAY, &farray); + if (!r) { + return 0; + } + + if (extended_in_use) { + error_report("Trying to set extended features without kernel support"); + return -EINVAL; + } + return vhost_kernel_call(dev, VHOST_SET_FEATURES, &farray.features[0]); +} + +static int vhost_kernel_get_features(struct vhost_dev *dev, uint64_t *features) +{ + struct vhost_features farray; + int r; + + farray.count = VIRTIO_FEATURES_NU64S; + r = vhost_kernel_call(dev, VHOST_GET_FEATURES_ARRAY, &farray); + if (r) { + memset(&farray, 0, sizeof(farray)); + r = vhost_kernel_call(dev, VHOST_GET_FEATURES, &farray.features[0]); + } + if (r) { + return r; + } + + virtio_features_copy(features, farray.features); + return 0; } static int vhost_kernel_set_owner(struct vhost_dev *dev) @@ -341,8 +381,8 @@ const VhostOps kernel_ops = { .vhost_attach_vring_worker = vhost_kernel_attach_vring_worker, .vhost_new_worker = vhost_kernel_new_worker, .vhost_free_worker = vhost_kernel_free_worker, - .vhost_set_features = vhost_kernel_set_features, - .vhost_get_features = vhost_kernel_get_features, + .vhost_set_features_ex = vhost_kernel_set_features, + .vhost_get_features_ex = vhost_kernel_get_features, .vhost_set_backend_cap = vhost_kernel_set_backend_cap, .vhost_set_owner = vhost_kernel_set_owner, .vhost_get_vq_index = vhost_kernel_get_vq_index, diff --git a/hw/virtio/vhost-user-device-pci.c b/hw/virtio/vhost-user-test-device-pci.c index f10bac874e..b4ed0efb50 100644 --- a/hw/virtio/vhost-user-device-pci.c +++ b/hw/virtio/vhost-user-test-device-pci.c @@ -18,13 +18,13 @@ struct VHostUserDevicePCI { VHostUserBase vub; }; -#define TYPE_VHOST_USER_DEVICE_PCI "vhost-user-device-pci-base" +#define TYPE_VHOST_USER_TEST_DEVICE_PCI "vhost-user-test-device-pci-base" -OBJECT_DECLARE_SIMPLE_TYPE(VHostUserDevicePCI, VHOST_USER_DEVICE_PCI) 
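The vhost_kernel_set_features() hunk above cannot distinguish an unknown ioctl from a bad argument (no reliable ENOTTY, as its comment notes), so it simply attempts VHOST_SET_FEATURES_ARRAY first and only falls back to the legacy single-u64 VHOST_SET_FEATURES when no extended bits are in play. A minimal standalone sketch of that probe-and-fallback order follows; do_ioctl(), the request codes, and NWORDS are illustrative stand-ins, not QEMU or kernel APIs.

    #include <errno.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define NWORDS 2
    enum { SET_FEATURES = 1, SET_FEATURES_ARRAY = 2 };  /* stand-in request codes */

    struct features_array {
        uint64_t count;
        uint64_t features[NWORDS];
    };

    /* Fake backend call simulating a kernel that lacks the array ioctl. */
    static int do_ioctl(unsigned long request, void *arg)
    {
        (void)arg;
        return request == SET_FEATURES_ARRAY ? -EINVAL : 0;
    }

    static int set_features_with_fallback(const uint64_t *features)
    {
        struct features_array farray = { .count = NWORDS };
        bool extended = false;

        memcpy(farray.features, features, sizeof(farray.features));
        for (int i = 1; i < NWORDS; i++) {
            extended |= farray.features[i] != 0;       /* bits 64+ requested? */
        }

        if (!do_ioctl(SET_FEATURES_ARRAY, &farray)) {  /* try the wide ioctl */
            return 0;
        }
        if (extended) {
            return -EINVAL;     /* legacy ioctl cannot carry the upper words */
        }
        return do_ioctl(SET_FEATURES, &farray.features[0]);
    }

    int main(void)
    {
        uint64_t plain[NWORDS] = { 0xf0, 0 };
        uint64_t wide[NWORDS]  = { 0xf0, 0x1 };        /* uses feature bit 64 */

        printf("plain: %d\n", set_features_with_fallback(plain));  /* 0 */
        printf("wide:  %d\n", set_features_with_fallback(wide));   /* -22 */
        return 0;
    }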
+OBJECT_DECLARE_SIMPLE_TYPE(VHostUserDevicePCI, VHOST_USER_TEST_DEVICE_PCI) static void vhost_user_device_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) { - VHostUserDevicePCI *dev = VHOST_USER_DEVICE_PCI(vpci_dev); + VHostUserDevicePCI *dev = VHOST_USER_TEST_DEVICE_PCI(vpci_dev); DeviceState *vdev = DEVICE(&dev->vub); vpci_dev->nvectors = 1; @@ -38,9 +38,6 @@ static void vhost_user_device_pci_class_init(ObjectClass *klass, VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass); PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass); - /* Reason: stop users confusing themselves */ - dc->user_creatable = false; - k->realize = vhost_user_device_pci_realize; set_bit(DEVICE_CATEGORY_INPUT, dc->categories); pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET; @@ -51,15 +48,15 @@ static void vhost_user_device_pci_class_init(ObjectClass *klass, static void vhost_user_device_pci_instance_init(Object *obj) { - VHostUserDevicePCI *dev = VHOST_USER_DEVICE_PCI(obj); + VHostUserDevicePCI *dev = VHOST_USER_TEST_DEVICE_PCI(obj); virtio_instance_init_common(obj, &dev->vub, sizeof(dev->vub), - TYPE_VHOST_USER_DEVICE); + TYPE_VHOST_USER_TEST_DEVICE); } static const VirtioPCIDeviceTypeInfo vhost_user_device_pci_info = { - .base_name = TYPE_VHOST_USER_DEVICE_PCI, - .non_transitional_name = "vhost-user-device-pci", + .base_name = TYPE_VHOST_USER_TEST_DEVICE_PCI, + .non_transitional_name = "vhost-user-test-device-pci", .instance_size = sizeof(VHostUserDevicePCI), .instance_init = vhost_user_device_pci_instance_init, .class_init = vhost_user_device_pci_class_init, diff --git a/hw/virtio/vhost-user-device.c b/hw/virtio/vhost-user-test-device.c index 3939bdf755..1b98ea3e48 100644 --- a/hw/virtio/vhost-user-device.c +++ b/hw/virtio/vhost-user-test-device.c @@ -1,5 +1,5 @@ /* - * Generic vhost-user-device implementation for any vhost-user-backend + * Generic vhost-user-test-device implementation for any vhost-user-backend * * This is a concrete implementation of vhost-user-base which can be * configured via properties. 
It is useful for development and @@ -25,7 +25,7 @@ */ static const VMStateDescription vud_vmstate = { - .name = "vhost-user-device", + .name = "vhost-user-test-device", .unmigratable = 1, }; @@ -41,16 +41,13 @@ static void vud_class_init(ObjectClass *klass, const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); - /* Reason: stop inexperienced users confusing themselves */ - dc->user_creatable = false; - device_class_set_props(dc, vud_properties); dc->vmsd = &vud_vmstate; set_bit(DEVICE_CATEGORY_INPUT, dc->categories); } static const TypeInfo vud_info = { - .name = TYPE_VHOST_USER_DEVICE, + .name = TYPE_VHOST_USER_TEST_DEVICE, .parent = TYPE_VHOST_USER_BASE, .class_init = vud_class_init, }; diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index 6557c58d12..c120ef38b9 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -972,20 +972,34 @@ static int vhost_virtqueue_set_addr(struct vhost_dev *dev, static int vhost_dev_set_features(struct vhost_dev *dev, bool enable_log) { - uint64_t features = dev->acked_features; + uint64_t features[VIRTIO_FEATURES_NU64S]; int r; + + virtio_features_copy(features, dev->acked_features_ex); if (enable_log) { - features |= 0x1ULL << VHOST_F_LOG_ALL; + virtio_add_feature_ex(features, VHOST_F_LOG_ALL); } if (!vhost_dev_has_iommu(dev)) { - features &= ~(0x1ULL << VIRTIO_F_IOMMU_PLATFORM); + virtio_clear_feature_ex(features, VIRTIO_F_IOMMU_PLATFORM); } if (dev->vhost_ops->vhost_force_iommu) { if (dev->vhost_ops->vhost_force_iommu(dev) == true) { - features |= 0x1ULL << VIRTIO_F_IOMMU_PLATFORM; + virtio_add_feature_ex(features, VIRTIO_F_IOMMU_PLATFORM); } } - r = dev->vhost_ops->vhost_set_features(dev, features); + + if (virtio_features_use_ex(features) && + !dev->vhost_ops->vhost_set_features_ex) { + r = -EINVAL; + VHOST_OPS_DEBUG(r, "extended features without device support"); + goto out; + } + + if (dev->vhost_ops->vhost_set_features_ex) { + r = dev->vhost_ops->vhost_set_features_ex(dev, features); + } else { + r = dev->vhost_ops->vhost_set_features(dev, features[0]); + } if (r < 0) { VHOST_OPS_DEBUG(r, "vhost_set_features failed"); goto out; @@ -1508,12 +1522,27 @@ static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq) } } +static int vhost_dev_get_features(struct vhost_dev *hdev, + uint64_t *features) +{ + uint64_t features64; + int r; + + if (hdev->vhost_ops->vhost_get_features_ex) { + return hdev->vhost_ops->vhost_get_features_ex(hdev, features); + } + + r = hdev->vhost_ops->vhost_get_features(hdev, &features64); + virtio_features_from_u64(features, features64); + return r; +} + int vhost_dev_init(struct vhost_dev *hdev, void *opaque, VhostBackendType backend_type, uint32_t busyloop_timeout, Error **errp) { + uint64_t features[VIRTIO_FEATURES_NU64S]; unsigned int used, reserved, limit; - uint64_t features; int i, r, n_initialized_vqs = 0; hdev->vdev = NULL; @@ -1533,7 +1562,7 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque, goto fail; } - r = hdev->vhost_ops->vhost_get_features(hdev, &features); + r = vhost_dev_get_features(hdev, features); if (r < 0) { error_setg_errno(errp, -r, "vhost_get_features failed"); goto fail; @@ -1571,7 +1600,7 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque, } } - hdev->features = features; + virtio_features_copy(hdev->features_ex, features); hdev->memory_listener = (MemoryListener) { .name = "vhost", @@ -1594,7 +1623,7 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque, }; if (hdev->migration_blocker == NULL) { - if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) { + if 
(!virtio_has_feature_ex(hdev->features_ex, VHOST_F_LOG_ALL)) { error_setg(&hdev->migration_blocker, "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature."); } else if (vhost_dev_log_is_shared(hdev) && !qemu_memfd_alloc_check()) { @@ -1817,7 +1846,7 @@ void vhost_config_mask(struct vhost_dev *hdev, VirtIODevice *vdev, bool mask) int r; EventNotifier *notifier = &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier; - EventNotifier *config_notifier = &vdev->config_notifier; + EventNotifier *config_notifier = virtio_config_get_guest_notifier(vdev); assert(hdev->vhost_ops); if ((hdev->started == false) || @@ -1848,39 +1877,40 @@ static void vhost_stop_config_intr(struct vhost_dev *dev) static void vhost_start_config_intr(struct vhost_dev *dev) { int r; + EventNotifier *config_notifier = + virtio_config_get_guest_notifier(dev->vdev); assert(dev->vhost_ops); - int fd = event_notifier_get_fd(&dev->vdev->config_notifier); + int fd = event_notifier_get_fd(config_notifier); if (dev->vhost_ops->vhost_set_config_call) { r = dev->vhost_ops->vhost_set_config_call(dev, fd); if (!r) { - event_notifier_set(&dev->vdev->config_notifier); + event_notifier_set(config_notifier); } } } -uint64_t vhost_get_features(struct vhost_dev *hdev, const int *feature_bits, - uint64_t features) +void vhost_get_features_ex(struct vhost_dev *hdev, + const int *feature_bits, + uint64_t *features) { const int *bit = feature_bits; + while (*bit != VHOST_INVALID_FEATURE_BIT) { - uint64_t bit_mask = (1ULL << *bit); - if (!(hdev->features & bit_mask)) { - features &= ~bit_mask; + if (!virtio_has_feature_ex(hdev->features_ex, *bit)) { + virtio_clear_feature_ex(features, *bit); } bit++; } - return features; } -void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits, - uint64_t features) +void vhost_ack_features_ex(struct vhost_dev *hdev, const int *feature_bits, + const uint64_t *features) { const int *bit = feature_bits; while (*bit != VHOST_INVALID_FEATURE_BIT) { - uint64_t bit_mask = (1ULL << *bit); - if (features & bit_mask) { - hdev->acked_features |= bit_mask; + if (virtio_has_feature_ex(features, *bit)) { + virtio_add_feature_ex(hdev->acked_features_ex, *bit); } bit++; } @@ -2139,12 +2169,13 @@ static int do_vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev, { int i; int rc = 0; + EventNotifier *config_notifier = virtio_config_get_guest_notifier(vdev); /* should only be called after backend is connected */ assert(hdev->vhost_ops); event_notifier_test_and_clear( &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier); - event_notifier_test_and_clear(&vdev->config_notifier); + event_notifier_test_and_clear(config_notifier); event_notifier_cleanup( &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier); diff --git a/hw/virtio/virtio-bus.c b/hw/virtio/virtio-bus.c index 11adfbf3ab..cef944e015 100644 --- a/hw/virtio/virtio-bus.c +++ b/hw/virtio/virtio-bus.c @@ -62,9 +62,14 @@ void virtio_bus_device_plugged(VirtIODevice *vdev, Error **errp) } /* Get the features of the plugged device. 
*/ - assert(vdc->get_features != NULL); - vdev->host_features = vdc->get_features(vdev, vdev->host_features, - &local_err); + if (vdc->get_features_ex) { + vdc->get_features_ex(vdev, vdev->host_features_ex, &local_err); + } else { + assert(vdc->get_features != NULL); + virtio_features_from_u64(vdev->host_features_ex, + vdc->get_features(vdev, vdev->host_features, + &local_err)); + } if (local_err) { error_propagate(errp, local_err); return; diff --git a/hw/virtio/virtio-hmp-cmds.c b/hw/virtio/virtio-hmp-cmds.c index 7d8677bcf0..1daae482d3 100644 --- a/hw/virtio/virtio-hmp-cmds.c +++ b/hw/virtio/virtio-hmp-cmds.c @@ -74,7 +74,8 @@ static void hmp_virtio_dump_features(Monitor *mon, } if (features->has_unknown_dev_features) { - monitor_printf(mon, " unknown-features(0x%016"PRIx64")\n", + monitor_printf(mon, " unknown-features(0x%016"PRIx64"%016"PRIx64")\n", + features->unknown_dev_features2, features->unknown_dev_features); } } diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index d2595fbd55..ab96ce1776 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -109,6 +109,29 @@ static const VMStateDescription vmstate_virtio_pci_modern_queue_state = { } }; +static bool virtio_pci_modern_state_features128_needed(void *opaque) +{ + VirtIOPCIProxy *proxy = opaque; + uint32_t features = 0; + int i; + + for (i = 2; i < ARRAY_SIZE(proxy->guest_features); ++i) { + features |= proxy->guest_features[i]; + } + return features; +} + +static const VMStateDescription vmstate_virtio_pci_modern_state_features128 = { + .name = "virtio_pci/modern_state/features128", + .version_id = 1, + .minimum_version_id = 1, + .needed = &virtio_pci_modern_state_features128_needed, + .fields = (const VMStateField[]) { + VMSTATE_UINT32_SUB_ARRAY(guest_features, VirtIOPCIProxy, 2, 2), + VMSTATE_END_OF_LIST() + } +}; + static bool virtio_pci_modern_state_needed(void *opaque) { VirtIOPCIProxy *proxy = opaque; @@ -116,6 +139,12 @@ static bool virtio_pci_modern_state_needed(void *opaque) return virtio_pci_modern(proxy); } +/* + * Avoid silently breaking migration should the feature space increase + * even more in the (far away) future + */ +QEMU_BUILD_BUG_ON(VIRTIO_FEATURES_NU32S != 4); + static const VMStateDescription vmstate_virtio_pci_modern_state_sub = { .name = "virtio_pci/modern_state", .version_id = 1, @@ -124,11 +153,15 @@ static const VMStateDescription vmstate_virtio_pci_modern_state_sub = { .fields = (const VMStateField[]) { VMSTATE_UINT32(dfselect, VirtIOPCIProxy), VMSTATE_UINT32(gfselect, VirtIOPCIProxy), - VMSTATE_UINT32_ARRAY(guest_features, VirtIOPCIProxy, 2), + VMSTATE_UINT32_SUB_ARRAY(guest_features, VirtIOPCIProxy, 0, 2), VMSTATE_STRUCT_ARRAY(vqs, VirtIOPCIProxy, VIRTIO_QUEUE_MAX, 0, vmstate_virtio_pci_modern_queue_state, VirtIOPCIQueue), VMSTATE_END_OF_LIST() + }, + .subsections = (const VMStateDescription * const []) { + &vmstate_virtio_pci_modern_state_features128, + NULL } }; @@ -1477,6 +1510,19 @@ int virtio_pci_add_shm_cap(VirtIOPCIProxy *proxy, return virtio_pci_add_mem_cap(proxy, &cap.cap); } +static int virtio_pci_select_max(const VirtIODevice *vdev) +{ + int i; + + for (i = VIRTIO_FEATURES_NU64S - 1; i > 0; i--) { + if (vdev->host_features_ex[i]) { + return (i + 1) * 2; + } + } + + return 2; +} + static uint64_t virtio_pci_common_read(void *opaque, hwaddr addr, unsigned size) { @@ -1494,18 +1540,21 @@ static uint64_t virtio_pci_common_read(void *opaque, hwaddr addr, val = proxy->dfselect; break; case VIRTIO_PCI_COMMON_DF: - if (proxy->dfselect <= 1) { + if (proxy->dfselect < 
virtio_pci_select_max(vdev)) { VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev); - val = (vdev->host_features & ~vdc->legacy_features) >> - (32 * proxy->dfselect); + val = vdev->host_features_ex[proxy->dfselect >> 1] >> + (32 * (proxy->dfselect & 1)); + if (proxy->dfselect <= 1) { + val &= (~vdc->legacy_features) >> (32 * proxy->dfselect); + } } break; case VIRTIO_PCI_COMMON_GFSELECT: val = proxy->gfselect; break; case VIRTIO_PCI_COMMON_GF: - if (proxy->gfselect < ARRAY_SIZE(proxy->guest_features)) { + if (proxy->gfselect < virtio_pci_select_max(vdev)) { val = proxy->guest_features[proxy->gfselect]; } break; @@ -1588,11 +1637,18 @@ static void virtio_pci_common_write(void *opaque, hwaddr addr, proxy->gfselect = val; break; case VIRTIO_PCI_COMMON_GF: - if (proxy->gfselect < ARRAY_SIZE(proxy->guest_features)) { + if (proxy->gfselect < virtio_pci_select_max(vdev)) { + uint64_t features[VIRTIO_FEATURES_NU64S]; + int i; + proxy->guest_features[proxy->gfselect] = val; - virtio_set_features(vdev, - (((uint64_t)proxy->guest_features[1]) << 32) | - proxy->guest_features[0]); + virtio_features_clear(features); + for (i = 0; i < ARRAY_SIZE(proxy->guest_features); ++i) { + uint64_t cur = proxy->guest_features[i]; + + features[i >> 1] |= cur << ((i & 1) * 32); + } + virtio_set_features_ex(vdev, features); } break; case VIRTIO_PCI_COMMON_MSIX: @@ -2311,6 +2367,8 @@ static void virtio_pci_reset(DeviceState *qdev) virtio_bus_reset(bus); msix_unuse_all_vectors(&proxy->pci_dev); + memset(proxy->guest_features, 0, sizeof(proxy->guest_features)); + for (i = 0; i < VIRTIO_QUEUE_MAX; i++) { proxy->vqs[i].enabled = 0; proxy->vqs[i].reset = 0; diff --git a/hw/virtio/virtio-qmp.c b/hw/virtio/virtio-qmp.c index 3b6377cf0d..b338344c6c 100644 --- a/hw/virtio/virtio-qmp.c +++ b/hw/virtio/virtio-qmp.c @@ -325,6 +325,20 @@ static const qmp_virtio_feature_map_t virtio_net_feature_map[] = { FEATURE_ENTRY(VHOST_USER_F_PROTOCOL_FEATURES, \ "VHOST_USER_F_PROTOCOL_FEATURES: Vhost-user protocol features " "negotiation supported"), + FEATURE_ENTRY(VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO, \ + "VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO: Driver can receive GSO over " + "UDP tunnel packets"), + FEATURE_ENTRY(VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_CSUM, \ + "VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_CSUM: Driver can receive GSO over " + "UDP tunnel packets requiring checksum offload for the outer " + "header"), + FEATURE_ENTRY(VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO, \ + "VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO: Device can receive GSO over " + "UDP tunnel packets"), + FEATURE_ENTRY(VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO_CSUM, \ + "VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO_CSUM: Device can receive GSO over " + "UDP tunnel packets requiring checksum offload for the outer " + "header"), { -1, "" } }; #endif @@ -510,6 +524,24 @@ static const qmp_virtio_feature_map_t virtio_gpio_feature_map[] = { list; \ }) +#define CONVERT_FEATURES_EX(type, map, bitmap) \ + ({ \ + type *list = NULL; \ + type *node; \ + for (i = 0; map[i].virtio_bit != -1; i++) { \ + bit = map[i].virtio_bit; \ + if (!virtio_has_feature_ex(bitmap, bit)) { \ + continue; \ + } \ + node = g_new0(type, 1); \ + node->value = g_strdup(map[i].feature_desc); \ + node->next = list; \ + list = node; \ + virtio_clear_feature_ex(bitmap, bit); \ + } \ + list; \ + }) + VirtioDeviceStatus *qmp_decode_status(uint8_t bitmap) { VirtioDeviceStatus *status; @@ -545,109 +577,112 @@ VhostDeviceProtocols *qmp_decode_protocols(uint64_t bitmap) return vhu_protocols; } -VirtioDeviceFeatures *qmp_decode_features(uint16_t device_id, uint64_t bitmap)
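The VIRTIO_PCI_COMMON_DF/GF handlers above expose the widened feature space through the pre-existing 32-bit select windows: window n maps to word n >> 1, half n & 1, so four windows cover two 64-bit feature words. A self-contained check of that packing and unpacking, reusing the same index arithmetic as the hunk (the surrounding program is illustrative, outside QEMU):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint32_t windows[4] = { 0x11111111, 0x22222222, 0x33333333, 0x44444444 };
        uint64_t words[2] = { 0, 0 };

        /* Same packing as the VIRTIO_PCI_COMMON_GF write handler. */
        for (int i = 0; i < 4; i++) {
            words[i >> 1] |= (uint64_t)windows[i] << ((i & 1) * 32);
        }
        assert(words[0] == 0x2222222211111111ULL);
        assert(words[1] == 0x4444444433333333ULL);

        /* And the reverse, as in the VIRTIO_PCI_COMMON_DF read path. */
        for (int i = 0; i < 4; i++) {
            assert((uint32_t)(words[i >> 1] >> (32 * (i & 1))) == windows[i]);
        }
        return 0;
    }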
+VirtioDeviceFeatures *qmp_decode_features(uint16_t device_id, + const uint64_t *bmap) { + uint64_t bitmap[VIRTIO_FEATURES_NU64S]; VirtioDeviceFeatures *features; uint64_t bit; int i; + virtio_features_copy(bitmap, bmap); features = g_new0(VirtioDeviceFeatures, 1); features->has_dev_features = true; /* transport features */ - features->transports = CONVERT_FEATURES(strList, virtio_transport_map, 0, - bitmap); + features->transports = CONVERT_FEATURES_EX(strList, virtio_transport_map, + bitmap); /* device features */ switch (device_id) { #ifdef CONFIG_VIRTIO_SERIAL case VIRTIO_ID_CONSOLE: features->dev_features = - CONVERT_FEATURES(strList, virtio_serial_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, virtio_serial_feature_map, bitmap); break; #endif #ifdef CONFIG_VIRTIO_BLK case VIRTIO_ID_BLOCK: features->dev_features = - CONVERT_FEATURES(strList, virtio_blk_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, virtio_blk_feature_map, bitmap); break; #endif #ifdef CONFIG_VIRTIO_GPU case VIRTIO_ID_GPU: features->dev_features = - CONVERT_FEATURES(strList, virtio_gpu_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, virtio_gpu_feature_map, bitmap); break; #endif #ifdef CONFIG_VIRTIO_NET case VIRTIO_ID_NET: features->dev_features = - CONVERT_FEATURES(strList, virtio_net_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, virtio_net_feature_map, bitmap); break; #endif #ifdef CONFIG_VIRTIO_SCSI case VIRTIO_ID_SCSI: features->dev_features = - CONVERT_FEATURES(strList, virtio_scsi_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, virtio_scsi_feature_map, bitmap); break; #endif #ifdef CONFIG_VIRTIO_BALLOON case VIRTIO_ID_BALLOON: features->dev_features = - CONVERT_FEATURES(strList, virtio_balloon_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, virtio_balloon_feature_map, bitmap); break; #endif #ifdef CONFIG_VIRTIO_IOMMU case VIRTIO_ID_IOMMU: features->dev_features = - CONVERT_FEATURES(strList, virtio_iommu_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, virtio_iommu_feature_map, bitmap); break; #endif #ifdef CONFIG_VIRTIO_INPUT case VIRTIO_ID_INPUT: features->dev_features = - CONVERT_FEATURES(strList, virtio_input_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, virtio_input_feature_map, bitmap); break; #endif #ifdef CONFIG_VHOST_USER_FS case VIRTIO_ID_FS: features->dev_features = - CONVERT_FEATURES(strList, virtio_fs_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, virtio_fs_feature_map, bitmap); break; #endif #ifdef CONFIG_VHOST_VSOCK case VIRTIO_ID_VSOCK: features->dev_features = - CONVERT_FEATURES(strList, virtio_vsock_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, virtio_vsock_feature_map, bitmap); break; #endif #ifdef CONFIG_VIRTIO_CRYPTO case VIRTIO_ID_CRYPTO: features->dev_features = - CONVERT_FEATURES(strList, virtio_crypto_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, virtio_crypto_feature_map, bitmap); break; #endif #ifdef CONFIG_VIRTIO_MEM case VIRTIO_ID_MEM: features->dev_features = - CONVERT_FEATURES(strList, virtio_mem_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, virtio_mem_feature_map, bitmap); break; #endif #ifdef CONFIG_VIRTIO_I2C_ADAPTER case VIRTIO_ID_I2C_ADAPTER: features->dev_features = - CONVERT_FEATURES(strList, virtio_i2c_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, virtio_i2c_feature_map, bitmap); break; #endif #ifdef CONFIG_VIRTIO_RNG case VIRTIO_ID_RNG: features->dev_features = - CONVERT_FEATURES(strList, virtio_rng_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, 
virtio_rng_feature_map, bitmap); break; #endif #ifdef CONFIG_VHOST_USER_GPIO case VIRTIO_ID_GPIO: features->dev_features = - CONVERT_FEATURES(strList, virtio_gpio_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, virtio_gpio_feature_map, bitmap); break; #endif /* No features */ @@ -680,10 +715,9 @@ VirtioDeviceFeatures *qmp_decode_features(uint16_t device_id, uint64_t bitmap) g_assert_not_reached(); } - features->has_unknown_dev_features = bitmap != 0; - if (features->has_unknown_dev_features) { - features->unknown_dev_features = bitmap; - } + features->has_unknown_dev_features = !virtio_features_empty(bitmap); + features->unknown_dev_features = bitmap[0]; + features->unknown_dev_features2 = bitmap[1]; return features; } @@ -743,11 +777,11 @@ VirtioStatus *qmp_x_query_virtio_status(const char *path, Error **errp) status->device_id = vdev->device_id; status->vhost_started = vdev->vhost_started; status->guest_features = qmp_decode_features(vdev->device_id, - vdev->guest_features); + vdev->guest_features_ex); status->host_features = qmp_decode_features(vdev->device_id, - vdev->host_features); + vdev->host_features_ex); status->backend_features = qmp_decode_features(vdev->device_id, - vdev->backend_features); + vdev->backend_features_ex); switch (vdev->device_endian) { case VIRTIO_DEVICE_ENDIAN_LITTLE: @@ -785,11 +819,12 @@ VirtioStatus *qmp_x_query_virtio_status(const char *path, Error **errp) status->vhost_dev->nvqs = hdev->nvqs; status->vhost_dev->vq_index = hdev->vq_index; status->vhost_dev->features = - qmp_decode_features(vdev->device_id, hdev->features); + qmp_decode_features(vdev->device_id, hdev->features_ex); status->vhost_dev->acked_features = - qmp_decode_features(vdev->device_id, hdev->acked_features); + qmp_decode_features(vdev->device_id, hdev->acked_features_ex); status->vhost_dev->backend_features = - qmp_decode_features(vdev->device_id, hdev->backend_features); + qmp_decode_features(vdev->device_id, hdev->backend_features_ex); + status->vhost_dev->protocol_features = qmp_decode_protocols(hdev->protocol_features); status->vhost_dev->max_queues = hdev->max_queues; diff --git a/hw/virtio/virtio-qmp.h b/hw/virtio/virtio-qmp.h index 245a446a56..e0a1e49035 100644 --- a/hw/virtio/virtio-qmp.h +++ b/hw/virtio/virtio-qmp.h @@ -18,6 +18,7 @@ VirtIODevice *qmp_find_virtio_device(const char *path); VirtioDeviceStatus *qmp_decode_status(uint8_t bitmap); VhostDeviceProtocols *qmp_decode_protocols(uint64_t bitmap); -VirtioDeviceFeatures *qmp_decode_features(uint16_t device_id, uint64_t bitmap); +VirtioDeviceFeatures *qmp_decode_features(uint16_t device_id, + const uint64_t *bitmap); #endif diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 0a68f1b6f1..be73753b59 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -31,6 +31,7 @@ #include "hw/qdev-properties.h" #include "hw/virtio/virtio-access.h" #include "system/dma.h" +#include "system/iothread.h" #include "system/runstate.h" #include "virtio-qmp.h" @@ -256,7 +257,10 @@ void virtio_init_region_cache(VirtIODevice *vdev, int n) len = address_space_cache_init(&new->desc, vdev->dma_as, addr, size, packed); if (len < size) { - virtio_error(vdev, "Cannot map desc"); + virtio_error(vdev, + "Failed to map descriptor ring for device %s: " + "invalid guest physical address or corrupted queue setup", + qdev_get_printable_name(DEVICE(vdev))); goto err_desc; } @@ -264,7 +268,10 @@ void virtio_init_region_cache(VirtIODevice *vdev, int n) len = address_space_cache_init(&new->used, vdev->dma_as, vq->vring.used, size, true); if (len < 
size) { - virtio_error(vdev, "Cannot map used"); + virtio_error(vdev, + "Failed to map used ring for device %s: " + "possible guest misconfiguration or insufficient memory", + qdev_get_printable_name(DEVICE(vdev))); goto err_used; } @@ -272,7 +279,10 @@ void virtio_init_region_cache(VirtIODevice *vdev, int n) len = address_space_cache_init(&new->avail, vdev->dma_as, vq->vring.avail, size, false); if (len < size) { - virtio_error(vdev, "Cannot map avail"); + virtio_error(vdev, + "Failed to map available ring for device %s: " + "possible queue misconfiguration or overlapping memory region", + qdev_get_printable_name(DEVICE(vdev))); goto err_avail; } @@ -2654,16 +2664,8 @@ static void virtio_notify_irqfd_deferred_fn(void *opaque) event_notifier_set(notifier); } -void virtio_notify_irqfd(VirtIODevice *vdev, VirtQueue *vq) +static void virtio_irq(VirtQueue *vq) { - WITH_RCU_READ_LOCK_GUARD() { - if (!virtio_should_notify(vdev, vq)) { - return; - } - } - - trace_virtio_notify_irqfd(vdev, vq); - /* * virtio spec 1.0 says ISR bit 0 should be ignored with MSI, but * windows drivers included in virtio-win 1.8.0 (circa 2015) are @@ -2680,13 +2682,18 @@ void virtio_notify_irqfd(VirtIODevice *vdev, VirtQueue *vq) * to an atomic operation. */ virtio_set_isr(vq->vdev, 0x1); - defer_call(virtio_notify_irqfd_deferred_fn, &vq->guest_notifier); -} -static void virtio_irq(VirtQueue *vq) -{ - virtio_set_isr(vq->vdev, 0x1); - virtio_notify_vector(vq->vdev, vq->vector); + /* + * The interrupt code path requires the Big QEMU Lock (BQL), so use the + * notifier instead when in an IOThread. This assumes that device models + * have already called ->set_guest_notifiers() sometime before calling this + * function. + */ + if (qemu_in_iothread()) { + defer_call(virtio_notify_irqfd_deferred_fn, &vq->guest_notifier); + } else { + virtio_notify_vector(vq->vdev, vq->vector); + } } void virtio_notify(VirtIODevice *vdev, VirtQueue *vq) @@ -2708,7 +2715,12 @@ void virtio_notify_config(VirtIODevice *vdev) virtio_set_isr(vdev, 0x3); vdev->generation++; - virtio_notify_vector(vdev, vdev->config_vector); + + if (qemu_in_iothread()) { + defer_call(virtio_notify_irqfd_deferred_fn, &vdev->config_notifier); + } else { + virtio_notify_vector(vdev, vdev->config_vector); + } } static bool virtio_device_endian_needed(void *opaque) @@ -2964,6 +2976,30 @@ static const VMStateDescription vmstate_virtio_disabled = { } }; +static bool virtio_128bit_features_needed(void *opaque) +{ + VirtIODevice *vdev = opaque; + + return virtio_features_use_ex(vdev->host_features_ex); +} + +static const VMStateDescription vmstate_virtio_128bit_features = { + .name = "virtio/128bit_features", + .version_id = 1, + .minimum_version_id = 1, + .needed = &virtio_128bit_features_needed, + .fields = (const VMStateField[]) { + VMSTATE_UINT64(guest_features_ex[1], VirtIODevice), + VMSTATE_END_OF_LIST() + } +}; + +/* + * Avoid silently breaking migration should the feature space increase + * even more in the (far away) future + */ +QEMU_BUILD_BUG_ON(VIRTIO_FEATURES_NU64S != 2); + static const VMStateDescription vmstate_virtio = { .name = "virtio", .version_id = 1, @@ -2973,6 +3009,7 @@ static const VMStateDescription vmstate_virtio = { }, .subsections = (const VMStateDescription * const []) { &vmstate_virtio_device_endian, + &vmstate_virtio_128bit_features, &vmstate_virtio_64bit_features, &vmstate_virtio_virtqueues, &vmstate_virtio_ringsize, @@ -3071,23 +3108,30 @@ const VMStateInfo virtio_vmstate_info = { .put = virtio_device_put, }; -static int
virtio_set_features_nocheck(VirtIODevice *vdev, uint64_t val) +static int virtio_set_features_nocheck(VirtIODevice *vdev, const uint64_t *val) { VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev); - bool bad = (val & ~(vdev->host_features)) != 0; + uint64_t tmp[VIRTIO_FEATURES_NU64S]; + bool bad; + + bad = virtio_features_andnot(tmp, val, vdev->host_features_ex); + virtio_features_and(tmp, val, vdev->host_features_ex); - val &= vdev->host_features; - if (k->set_features) { - k->set_features(vdev, val); + if (k->set_features_ex) { + k->set_features_ex(vdev, val); + } else if (k->set_features) { + bad = bad || virtio_features_use_ex(tmp); + k->set_features(vdev, tmp[0]); } - vdev->guest_features = val; + + virtio_features_copy(vdev->guest_features_ex, tmp); return bad ? -1 : 0; } typedef struct VirtioSetFeaturesNocheckData { Coroutine *co; VirtIODevice *vdev; - uint64_t val; + uint64_t val[VIRTIO_FEATURES_NU64S]; int ret; } VirtioSetFeaturesNocheckData; @@ -3100,14 +3144,15 @@ static void virtio_set_features_nocheck_bh(void *opaque) } static int coroutine_mixed_fn -virtio_set_features_nocheck_maybe_co(VirtIODevice *vdev, uint64_t val) +virtio_set_features_nocheck_maybe_co(VirtIODevice *vdev, + const uint64_t *val) { if (qemu_in_coroutine()) { VirtioSetFeaturesNocheckData data = { .co = qemu_coroutine_self(), .vdev = vdev, - .val = val, }; + virtio_features_copy(data.val, val); aio_bh_schedule_oneshot(qemu_get_current_aio_context(), virtio_set_features_nocheck_bh, &data); qemu_coroutine_yield(); @@ -3119,6 +3164,14 @@ virtio_set_features_nocheck_maybe_co(VirtIODevice *vdev, uint64_t val) int virtio_set_features(VirtIODevice *vdev, uint64_t val) { + uint64_t features[VIRTIO_FEATURES_NU64S]; + + virtio_features_from_u64(features, val); + return virtio_set_features_ex(vdev, features); +} + +int virtio_set_features_ex(VirtIODevice *vdev, const uint64_t *features) +{ int ret; /* * The driver must not attempt to set features after feature negotiation @@ -3128,13 +3181,13 @@ int virtio_set_features(VirtIODevice *vdev, uint64_t val) return -EINVAL; } - if (val & (1ull << VIRTIO_F_BAD_FEATURE)) { + if (features[0] & (1ull << VIRTIO_F_BAD_FEATURE)) { qemu_log_mask(LOG_GUEST_ERROR, "%s: guest driver for %s has enabled UNUSED(30) feature bit!\n", __func__, vdev->name); } - ret = virtio_set_features_nocheck(vdev, val); + ret = virtio_set_features_nocheck(vdev, features); if (virtio_vdev_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) { /* VIRTIO_RING_F_EVENT_IDX changes the size of the caches. */ int i; @@ -3157,6 +3210,7 @@ void virtio_reset(void *opaque) { VirtIODevice *vdev = opaque; VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev); + uint64_t features[VIRTIO_FEATURES_NU64S]; int i; virtio_set_status(vdev, 0); @@ -3183,7 +3237,8 @@ void virtio_reset(void *opaque) vdev->start_on_kick = false; vdev->started = false; vdev->broken = false; - virtio_set_features_nocheck(vdev, 0); + virtio_features_clear(features); + virtio_set_features_nocheck(vdev, features); vdev->queue_sel = 0; vdev->status = 0; vdev->disabled = false; @@ -3267,7 +3322,7 @@ virtio_load(VirtIODevice *vdev, QEMUFile *f, int version_id) * Note: devices should always test host features in future - don't create * new dependencies like this. 
*/ - vdev->guest_features = features; + virtio_features_from_u64(vdev->guest_features_ex, features); config_len = qemu_get_be32(f); @@ -3348,26 +3403,17 @@ virtio_load(VirtIODevice *vdev, QEMUFile *f, int version_id) vdev->device_endian = virtio_default_endian(); } - if (virtio_64bit_features_needed(vdev)) { - /* - * Subsection load filled vdev->guest_features. Run them - * through virtio_set_features to sanity-check them against - * host_features. - */ - uint64_t features64 = vdev->guest_features; - if (virtio_set_features_nocheck_maybe_co(vdev, features64) < 0) { - error_report("Features 0x%" PRIx64 " unsupported. " - "Allowed features: 0x%" PRIx64, - features64, vdev->host_features); - return -1; - } - } else { - if (virtio_set_features_nocheck_maybe_co(vdev, features) < 0) { - error_report("Features 0x%x unsupported. " - "Allowed features: 0x%" PRIx64, - features, vdev->host_features); - return -1; - } + /* + * guest_features_ex is fully initialized from the legacy u32 features + * word, and any upper bits have been filled in as needed by the + * subsection loads above. + */ + if (virtio_set_features_nocheck_maybe_co(vdev, + vdev->guest_features_ex) < 0) { + error_report("Features 0x" VIRTIO_FEATURES_FMT " unsupported. " + "Allowed features: 0x" VIRTIO_FEATURES_FMT, + VIRTIO_FEATURES_PR(vdev->guest_features_ex), + VIRTIO_FEATURES_PR(vdev->host_features_ex)); + return -1; } if (!virtio_device_started(vdev, vdev->status) &&
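virtio_load() above funnels both the legacy 32-bit features word and any subsection-loaded upper bits through a single sanity check: the migrated guest feature set must be a subset of what the destination device offers. That subset test reduces to an and-not across the feature words; the sketch below reimplements it standalone (features_andnot() merely mirrors the virtio_features_andnot() call seen earlier in the diff and is not QEMU's implementation):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define NWORDS 2

    /* true if any bit of a is set that is not also set in b */
    static bool features_andnot(uint64_t *res,
                                const uint64_t *a, const uint64_t *b)
    {
        uint64_t leftover = 0;
        for (int i = 0; i < NWORDS; i++) {
            res[i] = a[i] & ~b[i];
            leftover |= res[i];
        }
        return leftover != 0;
    }

    int main(void)
    {
        uint64_t host[NWORDS]  = { 0xffff, 0x1 };  /* device offers bit 64 */
        uint64_t guest[NWORDS] = { 0x00f0, 0x2 };  /* bit 65 never offered */
        uint64_t bad[NWORDS];

        if (features_andnot(bad, guest, host)) {
            printf("unsupported bits: 0x%016llx%016llx\n",
                   (unsigned long long)bad[1], (unsigned long long)bad[0]);
            return 1;
        }
        return 0;
    }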