138 files changed, 4507 insertions, 717 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index 406cef88f0..75e1fa5c30 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2173,6 +2173,16 @@ F: hw/acpi/ghes.c
 F: include/hw/acpi/ghes.h
 F: docs/specs/acpi_hest_ghes.rst

+ACPI/HEST/GHES/ARM processor CPER
+R: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
+S: Maintained
+F: hw/arm/ghes_cper.c
+F: hw/acpi/ghes_cper_stub.c
+F: qapi/acpi-hest.json
+F: scripts/ghes_inject.py
+F: scripts/arm_processor_error.py
+F: scripts/qmp_helper.py
+
 ppc4xx
 L: qemu-ppc@nongnu.org
 S: Orphan
diff --git a/docs/pcie_sriov.txt b/docs/pcie_sriov.txt
index ab2142807f..00d7bd93fd 100644
--- a/docs/pcie_sriov.txt
+++ b/docs/pcie_sriov.txt
@@ -72,8 +72,7 @@ setting up a BAR for a VF.
 2) Similarly in the implementation of the virtual function, you need to
    make it a PCI Express device and add a similar set of capabilities
    except for the SR/IOV capability. Then you need to set up the VF BARs as
-   subregions of the PFs SR/IOV VF BARs by calling
-   pcie_sriov_vf_register_bar() instead of the normal pci_register_bar() call:
+   subregions of the PF's SR/IOV VF BARs by calling pci_register_bar():

    pci_your_vf_dev_realize( ... )
    {
@@ -83,7 +82,7 @@ setting up a BAR for a VF.
       pcie_ari_init(d, 0x100);
       ...
       memory_region_init(mr, ... )
-      pcie_sriov_vf_register_bar(d, bar_nr, mr);
+      pci_register_bar(d, bar_nr, bar_type, mr);
       ...
    }

diff --git a/docs/specs/acpi_hest_ghes.rst b/docs/specs/acpi_hest_ghes.rst
index c3e9f8d9a7..aaf7b1ad11 100644
--- a/docs/specs/acpi_hest_ghes.rst
+++ b/docs/specs/acpi_hest_ghes.rst
@@ -89,12 +89,21 @@ Design Details
     addresses in the "error_block_address" fields with a pointer to the
     respective "Error Status Data Block" in the "etc/hardware_errors" blob.

-(8) QEMU defines a third and write-only fw_cfg blob which is called
-    "etc/hardware_errors_addr". Through that blob, the firmware can send back
-    the guest-side allocation addresses to QEMU. The "etc/hardware_errors_addr"
-    blob contains a 8-byte entry. QEMU generates a single WRITE_POINTER command
-    for the firmware. The firmware will write back the start address of
-    "etc/hardware_errors" blob to the fw_cfg file "etc/hardware_errors_addr".
+(8) QEMU defines a third and write-only fw_cfg blob to store the location
+    where the error block offsets, read ack registers and CPER records are
+    stored.
+
+    Up to QEMU 9.2, the location was "etc/hardware_errors_addr", which
+    contains a GPA for the beginning of "etc/hardware_errors".
+
+    Newer versions place the location at "etc/acpi_table_hest_addr",
+    pointing to the GPA of the HEST table.
+
+    Using the above-mentioned fw_cfg files, the firmware can send back the
+    guest-side allocation addresses to QEMU. They contain an 8-byte entry.
+    QEMU generates a single WRITE_POINTER command for the firmware. The
+    firmware will write back the start address of either "etc/hardware_errors"
+    or the HEST table to the corresponding fw_cfg file.

 (9) When QEMU gets a SIGBUS from the kernel, QEMU writes CPER into corresponding
     "Error Status Data Block", guest memory, and then injects platform specific
@@ -105,8 +114,5 @@ Design Details
      kernel, on receiving notification, guest APEI driver could read the CPER
      error and take appropriate action.

-(11) kvm_arch_on_sigbus_vcpu() uses source_id as index in "etc/hardware_errors" to
-     find out "Error Status Data Block" entry corresponding to error source. So supported
-     source_id values should be assigned here and not be changed afterwards to make sure
-     that guest will write error into expected "Error Status Data Block" even if guest was
-     migrated to a newer QEMU.
+(11) kvm_arch_on_sigbus_vcpu() reports RAS errors via SEA notifications
+     when a SIGBUS event is triggered.
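As an illustrative aside (not part of the patch), the "etc/hardware_errors" layout that items (7) and (8) rely on can be pictured as a C struct. The struct name and NUM_SOURCES value are invented here for illustration; the 1 KiB block size corresponds to ACPI_GHES_MAX_RAW_DATA_LENGTH in hw/acpi/ghes.c:

    #include <stdint.h>

    #define NUM_SOURCES 2      /* hypothetical source count */
    #define BLOCK_SIZE  1024   /* ACPI_GHES_MAX_RAW_DATA_LENGTH */

    /* Sketch of the "etc/hardware_errors" fw_cfg blob layout */
    struct hardware_errors_blob {
        /* patched by the firmware via ADD_POINTER commands */
        uint64_t error_block_address[NUM_SOURCES];
        /* initialized to 1 so GHES is writable after (re)boot */
        uint64_t read_ack_register[NUM_SOURCES];
        /* one "Error Status Data Block" (CPER area) per source */
        uint8_t  error_status_block[NUM_SOURCES][BLOCK_SIZE];
    };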
diff --git a/docs/system/devices/vhost-user.rst b/docs/system/devices/vhost-user.rst
index 35259d8ec7..bddf8df5ed 100644
--- a/docs/system/devices/vhost-user.rst
+++ b/docs/system/devices/vhost-user.rst
@@ -62,26 +62,20 @@ platform details for what sort of virtio bus to use.
 The referenced *daemons* are not exhaustive, any conforming backend
 implementing the device and using the vhost-user protocol should work.

-vhost-user-device
-^^^^^^^^^^^^^^^^^
+vhost-user-test-device
+^^^^^^^^^^^^^^^^^^^^^^

-The vhost-user-device is a generic development device intended for
-expert use while developing new backends. The user needs to specify
-all the required parameters including:
+The vhost-user-test-device is a generic development device intended
+for expert use while developing new backends. The user needs to
+specify all the required parameters including:

 - Device ``virtio-id``
 - The ``num_vqs`` it needs and their ``vq_size``
 - The ``config_size`` if needed

 .. note::
-  To prevent user confusion you cannot currently instantiate
-  vhost-user-device without first patching out::
-
-    /* Reason: stop inexperienced users confusing themselves */
-    dc->user_creatable = false;
-
-  in ``vhost-user-device.c`` and ``vhost-user-device-pci.c`` file and
-  rebuilding.
+  While this is a useful device for development, it is not recommended
+  for production use.

 vhost-user daemon
 =================
diff --git a/hw/acpi/Kconfig b/hw/acpi/Kconfig
index 1d4e9f0845..daabbe6cd1 100644
--- a/hw/acpi/Kconfig
+++ b/hw/acpi/Kconfig
@@ -51,6 +51,11 @@ config ACPI_APEI
     bool
     depends on ACPI

+config GHES_CPER
+    bool
+    depends on ACPI_APEI
+    default y
+
 config ACPI_PCI
     bool
     depends on ACPI && PCI
diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c
index 1e685f982f..2d5826a8f1 100644
--- a/hw/acpi/aml-build.c
+++ b/hw/acpi/aml-build.c
@@ -2629,3 +2629,13 @@ Aml *aml_i2c_serial_bus_device(uint16_t address, const char *resource_source)

     return var;
 }
+
+/* ACPI 5.0b: 18.3.2.6.2 Event Notification For Generic Error Sources */
+Aml *aml_error_device(void)
+{
+    Aml *dev = aml_device(ACPI_APEI_ERROR_DEVICE);
+    aml_append(dev, aml_name_decl("_HID", aml_string("PNP0C33")));
+    aml_append(dev, aml_name_decl("_UID", aml_int(0)));
+
+    return dev;
+}
diff --git a/hw/acpi/generic_event_device.c b/hw/acpi/generic_event_device.c
index 95682b79a2..e7b773d84d 100644
--- a/hw/acpi/generic_event_device.c
+++ b/hw/acpi/generic_event_device.c
@@ -30,6 +30,7 @@ static const uint32_t ged_supported_events[] = {
     ACPI_GED_NVDIMM_HOTPLUG_EVT,
     ACPI_GED_CPU_HOTPLUG_EVT,
     ACPI_GED_PCI_HOTPLUG_EVT,
+    ACPI_GED_ERROR_EVT,
 };

 /*
@@ -120,6 +121,16 @@ void build_ged_aml(Aml *table, const char *name, HotplugHandler *hotplug_dev,
                        aml_notify(aml_name(ACPI_POWER_BUTTON_DEVICE),
                                   aml_int(0x80)));
             break;
+        case ACPI_GED_ERROR_EVT:
+            /*
+             * ACPI 5.0b: 5.6.6 Device Object Notifications
+             * Table 5-135 Error Device Notification Values
+             * Defines 0x80 as the value to be used on notifications
+             */
+            aml_append(if_ctx,
+                       aml_notify(aml_name(ACPI_APEI_ERROR_DEVICE),
+                                  aml_int(0x80)));
+            break;
         case ACPI_GED_NVDIMM_HOTPLUG_EVT:
             aml_append(if_ctx,
                        aml_notify(aml_name("\\_SB.NVDR"),
@@ -320,6 +331,8 @@ static void acpi_ged_send_event(AcpiDeviceIf *adev,
AcpiEventStatusBits ev) sel = ACPI_GED_MEM_HOTPLUG_EVT; } else if (ev & ACPI_POWER_DOWN_STATUS) { sel = ACPI_GED_PWR_DOWN_EVT; + } else if (ev & ACPI_GENERIC_ERROR) { + sel = ACPI_GED_ERROR_EVT; } else if (ev & ACPI_NVDIMM_HOTPLUG_STATUS) { sel = ACPI_GED_NVDIMM_HOTPLUG_EVT; } else if (ev & ACPI_CPU_HOTPLUG_STATUS) { @@ -349,6 +362,8 @@ static const Property acpi_ged_properties[] = { pcihp_state.use_acpi_hotplug_bridge, 0), DEFINE_PROP_LINK("bus", AcpiGedState, pcihp_state.root, TYPE_PCI_BUS, PCIBus *), + DEFINE_PROP_BOOL("x-has-hest-addr", AcpiGedState, + ghes_state.use_hest_addr, true), }; static const VMStateDescription vmstate_memhp_state = { @@ -436,6 +451,34 @@ static const VMStateDescription vmstate_pcihp_state = { } }; +static const VMStateDescription vmstate_hest = { + .name = "acpi-hest", + .version_id = 1, + .minimum_version_id = 1, + .fields = (const VMStateField[]) { + VMSTATE_UINT64(hest_addr_le, AcpiGhesState), + VMSTATE_END_OF_LIST() + }, +}; + +static bool hest_needed(void *opaque) +{ + AcpiGedState *s = opaque; + return s->ghes_state.hest_addr_le; +} + +static const VMStateDescription vmstate_hest_state = { + .name = "acpi-ged/hest", + .version_id = 1, + .minimum_version_id = 1, + .needed = hest_needed, + .fields = (const VMStateField[]) { + VMSTATE_STRUCT(ghes_state, AcpiGedState, 1, + vmstate_hest, AcpiGhesState), + VMSTATE_END_OF_LIST() + } +}; + static const VMStateDescription vmstate_acpi_ged = { .name = "acpi-ged", .version_id = 1, @@ -449,6 +492,7 @@ static const VMStateDescription vmstate_acpi_ged = { &vmstate_cpuhp_state, &vmstate_ghes_state, &vmstate_pcihp_state, + &vmstate_hest_state, NULL } }; diff --git a/hw/acpi/ghes-stub.c b/hw/acpi/ghes-stub.c index 7cec1812da..40f660c246 100644 --- a/hw/acpi/ghes-stub.c +++ b/hw/acpi/ghes-stub.c @@ -11,12 +11,13 @@ #include "qemu/osdep.h" #include "hw/acpi/ghes.h" -int acpi_ghes_memory_errors(uint16_t source_id, uint64_t physical_address) +int acpi_ghes_memory_errors(AcpiGhesState *ags, uint16_t source_id, + uint64_t physical_address) { return -1; } -bool acpi_ghes_present(void) +AcpiGhesState *acpi_ghes_get_state(void) { - return false; + return NULL; } diff --git a/hw/acpi/ghes.c b/hw/acpi/ghes.c index b85bb48195..06555905ce 100644 --- a/hw/acpi/ghes.c +++ b/hw/acpi/ghes.c @@ -30,6 +30,7 @@ #define ACPI_HW_ERROR_FW_CFG_FILE "etc/hardware_errors" #define ACPI_HW_ERROR_ADDR_FW_CFG_FILE "etc/hardware_errors_addr" +#define ACPI_HEST_ADDR_FW_CFG_FILE "etc/acpi_table_hest_addr" /* The max size in bytes for one error block */ #define ACPI_GHES_MAX_RAW_DATA_LENGTH (1 * KiB) @@ -41,6 +42,12 @@ #define GAS_ADDR_OFFSET 4 /* + * ACPI spec 1.0b + * 5.2.3 System Description Table Header + */ +#define ACPI_DESC_HEADER_OFFSET 36 + +/* * The total size of Generic Error Data Entry * ACPI 6.1/6.2: 18.3.2.7.1 Generic Error Data, * Table 18-343 Generic Error Data Entry @@ -61,6 +68,30 @@ #define ACPI_GHES_GESB_SIZE 20 /* + * See the memory layout map at docs/specs/acpi_hest_ghes.rst. 
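+ * The constants that follow are byte offsets and sizes used to navigate
+ * the in-guest copy of the HEST table and its GHESv2 entries when looking
+ * up a source's error status block and read ack register.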
+ */ + +/* + * ACPI 6.1: 18.3.2.8 Generic Hardware Error Source version 2 + * Table 18-344 Generic Hardware Error Source version 2 (GHESv2) Structure + */ +#define HEST_GHES_V2_ENTRY_SIZE 92 + +/* + * ACPI 6.1: 18.3.2.8 Generic Hardware Error Source version 2 + * Table 18-344 Generic Hardware Error Source version 2 (GHESv2) Structure + * Read Ack Register + */ +#define GHES_READ_ACK_ADDR_OFF 64 + +/* + * ACPI 6.1: 18.3.2.7: Generic Hardware Error Source + * Table 18-341 Generic Hardware Error Source Structure + * Error Status Address + */ +#define GHES_ERR_STATUS_ADDR_OFF 20 + +/* * Values for error_severity field */ enum AcpiGenericErrorSeverity { @@ -206,17 +237,18 @@ ghes_gen_err_data_uncorrectable_recoverable(GArray *block, * Initialize "etc/hardware_errors" and "etc/hardware_errors_addr" fw_cfg blobs. * See docs/specs/acpi_hest_ghes.rst for blobs format. */ -static void build_ghes_error_table(GArray *hardware_errors, BIOSLinker *linker) +static void build_ghes_error_table(AcpiGhesState *ags, GArray *hardware_errors, + BIOSLinker *linker, int num_sources) { int i, error_status_block_offset; /* Build error_block_address */ - for (i = 0; i < ACPI_GHES_ERROR_SOURCE_COUNT; i++) { + for (i = 0; i < num_sources; i++) { build_append_int_noprefix(hardware_errors, 0, sizeof(uint64_t)); } /* Build read_ack_register */ - for (i = 0; i < ACPI_GHES_ERROR_SOURCE_COUNT; i++) { + for (i = 0; i < num_sources; i++) { /* * Initialize the value of read_ack_register to 1, so GHES can be * writable after (re)boot. @@ -231,13 +263,13 @@ static void build_ghes_error_table(GArray *hardware_errors, BIOSLinker *linker) /* Reserve space for Error Status Data Block */ acpi_data_push(hardware_errors, - ACPI_GHES_MAX_RAW_DATA_LENGTH * ACPI_GHES_ERROR_SOURCE_COUNT); + ACPI_GHES_MAX_RAW_DATA_LENGTH * num_sources); /* Tell guest firmware to place hardware_errors blob into RAM */ bios_linker_loader_alloc(linker, ACPI_HW_ERROR_FW_CFG_FILE, hardware_errors, sizeof(uint64_t), false); - for (i = 0; i < ACPI_GHES_ERROR_SOURCE_COUNT; i++) { + for (i = 0; i < num_sources; i++) { /* * Tell firmware to patch error_block_address entries to point to * corresponding "Generic Error Status Block" @@ -251,22 +283,26 @@ static void build_ghes_error_table(GArray *hardware_errors, BIOSLinker *linker) i * ACPI_GHES_MAX_RAW_DATA_LENGTH); } - /* - * tell firmware to write hardware_errors GPA into - * hardware_errors_addr fw_cfg, once the former has been initialized. - */ - bios_linker_loader_write_pointer(linker, ACPI_HW_ERROR_ADDR_FW_CFG_FILE, 0, - sizeof(uint64_t), - ACPI_HW_ERROR_FW_CFG_FILE, 0); + if (!ags->use_hest_addr) { + /* + * Tell firmware to write hardware_errors GPA into + * hardware_errors_addr fw_cfg, once the former has been initialized. 
+         */
+        bios_linker_loader_write_pointer(linker, ACPI_HW_ERROR_ADDR_FW_CFG_FILE,
+                                         0, sizeof(uint64_t),
+                                         ACPI_HW_ERROR_FW_CFG_FILE, 0);
+    }
 }

 /* Build Generic Hardware Error Source version 2 (GHESv2) */
-static void build_ghes_v2(GArray *table_data,
-                          BIOSLinker *linker,
-                          enum AcpiGhesNotifyType notify,
-                          uint16_t source_id)
+static void build_ghes_v2_entry(GArray *table_data,
+                                BIOSLinker *linker,
+                                const AcpiNotificationSourceId *notif_src,
+                                uint16_t index, int num_sources)
 {
     uint64_t address_offset;
+    const uint16_t notify = notif_src->notify;
+    const uint16_t source_id = notif_src->source_id;

     /*
      * Type:
@@ -297,7 +333,7 @@ static void build_ghes_v2(GArray *table_data,
                                        address_offset + GAS_ADDR_OFFSET,
                                        sizeof(uint64_t),
                                        ACPI_HW_ERROR_FW_CFG_FILE,
-                                       source_id * sizeof(uint64_t));
+                                       index * sizeof(uint64_t));

     /* Notification Structure */
     build_ghes_hw_error_notification(table_data, notify);
@@ -317,8 +353,7 @@ static void build_ghes_v2(GArray *table_data,
                                        address_offset + GAS_ADDR_OFFSET,
                                        sizeof(uint64_t),
                                        ACPI_HW_ERROR_FW_CFG_FILE,
-                                       (ACPI_GHES_ERROR_SOURCE_COUNT + source_id)
-                                       * sizeof(uint64_t));
+                                       (num_sources + index) * sizeof(uint64_t));

     /*
      * Read Ack Preserve field
@@ -331,23 +366,42 @@ static void build_ghes_v2(GArray *table_data,
 }

 /* Build Hardware Error Source Table */
-void acpi_build_hest(GArray *table_data, GArray *hardware_errors,
+void acpi_build_hest(AcpiGhesState *ags, GArray *table_data,
+                     GArray *hardware_errors,
                      BIOSLinker *linker,
+                     const AcpiNotificationSourceId *notif_source,
+                     int num_sources,
                      const char *oem_id, const char *oem_table_id)
 {
     AcpiTable table = { .sig = "HEST", .rev = 1,
                         .oem_id = oem_id, .oem_table_id = oem_table_id };
+    uint32_t hest_offset;
+    int i;

-    build_ghes_error_table(hardware_errors, linker);
+    hest_offset = table_data->len;
+
+    build_ghes_error_table(ags, hardware_errors, linker, num_sources);

     acpi_table_begin(&table, table_data);

     /* Error Source Count */
-    build_append_int_noprefix(table_data, ACPI_GHES_ERROR_SOURCE_COUNT, 4);
-    build_ghes_v2(table_data, linker,
-                  ACPI_GHES_NOTIFY_SEA, ACPI_HEST_SRC_ID_SEA);
+    build_append_int_noprefix(table_data, num_sources, 4);
+    for (i = 0; i < num_sources; i++) {
+        build_ghes_v2_entry(table_data, linker, &notif_source[i], i, num_sources);
+    }

     acpi_table_end(linker, &table);
+
+    if (ags->use_hest_addr) {
+        /*
+         * Tell the firmware to write the GPA of the HEST table into
+         * fw_cfg, once the ACPI tables have been initialized.
+         */
+        bios_linker_loader_write_pointer(linker,
+                                         ACPI_HEST_ADDR_FW_CFG_FILE, 0,
+                                         sizeof(uint64_t),
+                                         ACPI_BUILD_TABLE_FILE, hest_offset);
+    }
 }

 void acpi_ghes_add_fw_cfg(AcpiGhesState *ags, FWCfgState *s,
@@ -357,21 +411,20 @@ void acpi_ghes_add_fw_cfg(AcpiGhesState *ags, FWCfgState *s,
     fw_cfg_add_file(s, ACPI_HW_ERROR_FW_CFG_FILE, hardware_error->data,
                     hardware_error->len);

-    /* Create a read-write fw_cfg file for Address */
-    fw_cfg_add_file_callback(s, ACPI_HW_ERROR_ADDR_FW_CFG_FILE, NULL, NULL,
-        NULL, &(ags->hw_error_le), sizeof(ags->hw_error_le), false);
-
-    ags->present = true;
+    if (ags->use_hest_addr) {
+        fw_cfg_add_file_callback(s, ACPI_HEST_ADDR_FW_CFG_FILE, NULL, NULL,
+            NULL, &(ags->hest_addr_le), sizeof(ags->hest_addr_le), false);
+    } else {
+        /* Create a read-write fw_cfg file for Address */
+        fw_cfg_add_file_callback(s, ACPI_HW_ERROR_ADDR_FW_CFG_FILE, NULL, NULL,
+            NULL, &(ags->hw_error_le), sizeof(ags->hw_error_le), false);
+    }
 }

 static void get_hw_error_offsets(uint64_t ghes_addr,
                                  uint64_t *cper_addr,
                                  uint64_t *read_ack_register_addr)
 {
-    if (!ghes_addr) {
-        return;
-    }
-
     /*
      * non-HEST version supports only one source, so no need to change
      * the start offset based on the source ID. Also, we can't validate
@@ -390,35 +443,94 @@ static void get_hw_error_offsets(uint64_t ghes_addr,
     *read_ack_register_addr = ghes_addr + sizeof(uint64_t);
 }

-static void ghes_record_cper_errors(const void *cper, size_t len,
-                                    uint16_t source_id, Error **errp)
+static void get_ghes_source_offsets(uint16_t source_id,
+                                    uint64_t hest_addr,
+                                    uint64_t *cper_addr,
+                                    uint64_t *read_ack_start_addr,
+                                    Error **errp)
 {
-    uint64_t cper_addr = 0, read_ack_register_addr = 0, read_ack_register;
-    AcpiGedState *acpi_ged_state;
-    AcpiGhesState *ags;
+    uint64_t hest_err_block_addr, hest_read_ack_addr;
+    uint64_t err_source_entry, error_block_addr;
+    uint32_t num_sources, i;

-    if (len > ACPI_GHES_MAX_RAW_DATA_LENGTH) {
-        error_setg(errp, "GHES CPER record is too big: %zd", len);
-        return;
-    }
+    hest_addr += ACPI_DESC_HEADER_OFFSET;

-    acpi_ged_state = ACPI_GED(object_resolve_path_type("", TYPE_ACPI_GED,
-                                                       NULL));
-    if (!acpi_ged_state) {
-        error_setg(errp, "Can't find ACPI_GED object");
+    cpu_physical_memory_read(hest_addr, &num_sources,
+                             sizeof(num_sources));
+    num_sources = le32_to_cpu(num_sources);
+
+    err_source_entry = hest_addr + sizeof(num_sources);
+
+    /*
+     * Currently, HEST error source parsing supports only GHESv2 entries.
+     */
+    for (i = 0; i < num_sources; i++) {
+        uint64_t addr = err_source_entry;
+        uint16_t type, src_id;
+
+        cpu_physical_memory_read(addr, &type, sizeof(type));
+        type = le16_to_cpu(type);
+
+        /* For now, we only know the size of GHESv2 table */
+        if (type != ACPI_GHES_SOURCE_GENERIC_ERROR_V2) {
+            error_setg(errp, "HEST: type %d not supported.", type);
+            return;
+        }
+
+        /* Compare CPER source ID at the GHESv2 structure */
+        addr += sizeof(type);
+        cpu_physical_memory_read(addr, &src_id, sizeof(src_id));
+        if (le16_to_cpu(src_id) == source_id) {
+            break;
+        }
+
+        err_source_entry += HEST_GHES_V2_ENTRY_SIZE;
+    }
+    if (i == num_sources) {
+        error_setg(errp, "HEST: Source %d not found.", source_id);
         return;
     }
-    ags = &acpi_ged_state->ghes_state;

-    assert(ACPI_GHES_ERROR_SOURCE_COUNT == 1);
-    get_hw_error_offsets(le64_to_cpu(ags->hw_error_le),
-                         &cper_addr, &read_ack_register_addr);
+    /* Navigate through table address pointers */
+    hest_err_block_addr = err_source_entry + GHES_ERR_STATUS_ADDR_OFF +
+                          GAS_ADDR_OFFSET;
+
+    cpu_physical_memory_read(hest_err_block_addr, &error_block_addr,
sizeof(error_block_addr)); + error_block_addr = le64_to_cpu(error_block_addr); - if (!cper_addr) { - error_setg(errp, "can not find Generic Error Status Block"); + cpu_physical_memory_read(error_block_addr, cper_addr, + sizeof(*cper_addr)); + *cper_addr = le64_to_cpu(*cper_addr); + + hest_read_ack_addr = err_source_entry + GHES_READ_ACK_ADDR_OFF + + GAS_ADDR_OFFSET; + cpu_physical_memory_read(hest_read_ack_addr, read_ack_start_addr, + sizeof(*read_ack_start_addr)); + *read_ack_start_addr = le64_to_cpu(*read_ack_start_addr); +} + +NotifierList acpi_generic_error_notifiers = + NOTIFIER_LIST_INITIALIZER(acpi_generic_error_notifiers); + +void ghes_record_cper_errors(AcpiGhesState *ags, const void *cper, size_t len, + uint16_t source_id, Error **errp) +{ + uint64_t cper_addr = 0, read_ack_register_addr = 0, read_ack_register; + + if (len > ACPI_GHES_MAX_RAW_DATA_LENGTH) { + error_setg(errp, "GHES CPER record is too big: %zd", len); return; } + if (!ags->use_hest_addr) { + get_hw_error_offsets(le64_to_cpu(ags->hw_error_le), + &cper_addr, &read_ack_register_addr); + } else { + get_ghes_source_offsets(source_id, le64_to_cpu(ags->hest_addr_le), + &cper_addr, &read_ack_register_addr, errp); + } + cpu_physical_memory_read(read_ack_register_addr, &read_ack_register, sizeof(read_ack_register)); @@ -440,9 +552,12 @@ static void ghes_record_cper_errors(const void *cper, size_t len, /* Write the generic error data entry into guest memory */ cpu_physical_memory_write(cper_addr, cper, len); + + notifier_list_notify(&acpi_generic_error_notifiers, &source_id); } -int acpi_ghes_memory_errors(uint16_t source_id, uint64_t physical_address) +int acpi_ghes_memory_errors(AcpiGhesState *ags, uint16_t source_id, + uint64_t physical_address) { /* Memory Error Section Type */ const uint8_t guid[] = @@ -468,7 +583,7 @@ int acpi_ghes_memory_errors(uint16_t source_id, uint64_t physical_address) acpi_ghes_build_append_mem_cper(block, physical_address); /* Report the error */ - ghes_record_cper_errors(block->data, block->len, source_id, &errp); + ghes_record_cper_errors(ags, block->data, block->len, source_id, &errp); g_array_free(block, true); @@ -480,7 +595,7 @@ int acpi_ghes_memory_errors(uint16_t source_id, uint64_t physical_address) return 0; } -bool acpi_ghes_present(void) +AcpiGhesState *acpi_ghes_get_state(void) { AcpiGedState *acpi_ged_state; AcpiGhesState *ags; @@ -489,8 +604,12 @@ bool acpi_ghes_present(void) NULL)); if (!acpi_ged_state) { - return false; + return NULL; } ags = &acpi_ged_state->ghes_state; - return ags->present; + + if (!ags->hw_error_le && !ags->hest_addr_le) { + return NULL; + } + return ags; } diff --git a/hw/acpi/ghes_cper.c b/hw/acpi/ghes_cper.c new file mode 100644 index 0000000000..31cb2ffabe --- /dev/null +++ b/hw/acpi/ghes_cper.c @@ -0,0 +1,40 @@ +/* + * CPER payload parser for error injection + * + * Copyright(C) 2024-2025 Huawei LTD. + * + * This code is licensed under the GPL version 2 or later. See the + * COPYING file in the top-level directory. 
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+
+#include "qemu/base64.h"
+#include "qemu/error-report.h"
+#include "qemu/uuid.h"
+#include "qapi/qapi-commands-acpi-hest.h"
+#include "hw/acpi/ghes.h"
+
+void qmp_inject_ghes_v2_error(const char *qmp_cper, Error **errp)
+{
+    AcpiGhesState *ags;
+    uint8_t *cper;
+    size_t len;
+
+    ags = acpi_ghes_get_state();
+    if (!ags) {
+        error_setg(errp, "GHES is not available on this machine");
+        return;
+    }
+
+    cper = qbase64_decode(qmp_cper, -1, &len, errp);
+    if (!cper) {
+        /* errp has already been set by qbase64_decode() */
+        return;
+    }
+
+    ghes_record_cper_errors(ags, cper, len, ACPI_HEST_SRC_ID_QMP, errp);
+
+    g_free(cper);
+}
diff --git a/hw/acpi/ghes_cper_stub.c b/hw/acpi/ghes_cper_stub.c
new file mode 100644
index 0000000000..b16be73502
--- /dev/null
+++ b/hw/acpi/ghes_cper_stub.c
@@ -0,0 +1,20 @@
+/*
+ * Stub interface for CPER payload parser for error injection
+ *
+ * Copyright(C) 2024-2025 Huawei LTD.
+ *
+ * This code is licensed under the GPL version 2 or later. See the
+ * COPYING file in the top-level directory.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qapi/qapi-commands-acpi-hest.h"
+#include "hw/acpi/ghes.h"
+
+void qmp_inject_ghes_v2_error(const char *cper, Error **errp)
+{
+    error_setg(errp, "GHES QMP error inject is not compiled in");
+}
diff --git a/hw/acpi/meson.build b/hw/acpi/meson.build
index 73f02b9691..56b5d1ec96 100644
--- a/hw/acpi/meson.build
+++ b/hw/acpi/meson.build
@@ -34,4 +34,6 @@ endif
 system_ss.add(when: 'CONFIG_ACPI', if_false: files('acpi-stub.c', 'aml-build-stub.c', 'ghes-stub.c', 'acpi_interface.c'))
 system_ss.add(when: 'CONFIG_ACPI_PCI_BRIDGE', if_false: files('pci-bridge-stub.c'))
 system_ss.add_all(when: 'CONFIG_ACPI', if_true: acpi_ss)
+system_ss.add(when: 'CONFIG_GHES_CPER', if_true: files('ghes_cper.c'))
+system_ss.add(when: 'CONFIG_GHES_CPER', if_false: files('ghes_cper_stub.c'))
 system_ss.add(files('acpi-qmp-cmds.c'))
diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c
index 96830f7c4e..8bb6b60515 100644
--- a/hw/arm/virt-acpi-build.c
+++ b/hw/arm/virt-acpi-build.c
@@ -1066,6 +1066,7 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
     }

     acpi_dsdt_add_power_button(scope);
+    aml_append(scope, aml_error_device());
 #ifdef CONFIG_TPM
     acpi_dsdt_add_tpm(scope, vms);
 #endif
@@ -1125,6 +1126,15 @@ static void acpi_align_size(GArray *blob, unsigned align)
     g_array_set_size(blob, ROUND_UP(acpi_data_len(blob), align));
 }

+static const AcpiNotificationSourceId hest_ghes_notify[] = {
+    { ACPI_HEST_SRC_ID_SYNC, ACPI_GHES_NOTIFY_SEA },
+    { ACPI_HEST_SRC_ID_QMP, ACPI_GHES_NOTIFY_GPIO },
+};
+
+static const AcpiNotificationSourceId hest_ghes_notify_10_0[] = {
+    { ACPI_HEST_SRC_ID_SYNC, ACPI_GHES_NOTIFY_SEA },
+};
+
 static
 void virt_acpi_build(VirtMachineState *vms, AcpiBuildTables *tables)
 {
@@ -1181,9 +1191,28 @@ void virt_acpi_build(VirtMachineState *vms, AcpiBuildTables *tables)
         build_dbg2(tables_blob, tables->linker, vms);

     if (vms->ras) {
-        acpi_add_table(table_offsets, tables_blob);
-        acpi_build_hest(tables_blob, tables->hardware_errors, tables->linker,
-                        vms->oem_id, vms->oem_table_id);
+        AcpiGedState *acpi_ged_state;
+        static const AcpiNotificationSourceId *notify;
+        unsigned int notify_sz;
+        AcpiGhesState *ags;
+
+        acpi_ged_state = ACPI_GED(vms->acpi_dev);
+        ags = &acpi_ged_state->ghes_state;
+        if (ags) {
+            acpi_add_table(table_offsets, tables_blob);
+
+            if (!ags->use_hest_addr) {
+                notify = hest_ghes_notify_10_0;
+                notify_sz =
ARRAY_SIZE(hest_ghes_notify_10_0); + } else { + notify = hest_ghes_notify; + notify_sz = ARRAY_SIZE(hest_ghes_notify); + } + + acpi_build_hest(ags, tables_blob, tables->hardware_errors, + tables->linker, notify, notify_sz, + vms->oem_id, vms->oem_table_id); + } } if (ms->numa_state->num_nodes > 0) { diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 02209fadcf..175023897a 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -693,7 +693,7 @@ static inline DeviceState *create_acpi_ged(VirtMachineState *vms) MachineState *ms = MACHINE(vms); SysBusDevice *sbdev; int irq = vms->irqmap[VIRT_ACPI_GED]; - uint32_t event = ACPI_GED_PWR_DOWN_EVT; + uint32_t event = ACPI_GED_PWR_DOWN_EVT | ACPI_GED_ERROR_EVT; bool acpi_pcihp; if (ms->ram_slots) { @@ -1050,6 +1050,20 @@ static void virt_powerdown_req(Notifier *n, void *opaque) } } +static void virt_generic_error_req(Notifier *n, void *opaque) +{ + uint16_t *source_id = opaque; + + /* Currently, only QMP source ID is async */ + if (*source_id != ACPI_HEST_SRC_ID_QMP) { + return; + } + + VirtMachineState *s = container_of(n, VirtMachineState, generic_error_notifier); + + acpi_send_event(s->acpi_dev, ACPI_GENERIC_ERROR); +} + static void create_gpio_keys(char *fdt, DeviceState *pl061_dev, uint32_t phandle) { @@ -2500,6 +2514,9 @@ static void machvirt_init(MachineState *machine) if (has_ged && aarch64 && firmware_loaded && virt_is_acpi_enabled(vms)) { vms->acpi_dev = create_acpi_ged(vms); + vms->generic_error_notifier.notify = virt_generic_error_req; + notifier_list_add(&acpi_generic_error_notifiers, + &vms->generic_error_notifier); } else { create_gpio_devices(vms, VIRT_GPIO, sysmem); } @@ -3520,6 +3537,7 @@ DEFINE_VIRT_MACHINE_AS_LATEST(10, 2) static void virt_machine_10_1_options(MachineClass *mc) { virt_machine_10_2_options(mc); + mc->smbios_memory_device_size = 2047 * TiB; compat_props_add(mc->compat_props, hw_compat_10_1, hw_compat_10_1_len); } DEFINE_VIRT_MACHINE(10, 1) diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c index 9bab2716c1..64efce4846 100644 --- a/hw/block/virtio-blk.c +++ b/hw/block/virtio-blk.c @@ -62,11 +62,7 @@ void virtio_blk_req_complete(VirtIOBlockReq *req, unsigned char status) iov_discard_undo(&req->inhdr_undo); iov_discard_undo(&req->outhdr_undo); virtqueue_push(req->vq, &req->elem, req->in_len); - if (qemu_in_iothread()) { - virtio_notify_irqfd(vdev, req->vq); - } else { - virtio_notify(vdev, req->vq); - } + virtio_notify(vdev, req->vq); } static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error, diff --git a/hw/core/machine.c b/hw/core/machine.c index 38c949c4f2..681adbb7ac 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -35,9 +35,12 @@ #include "hw/virtio/virtio-pci.h" #include "hw/virtio/virtio-net.h" #include "hw/virtio/virtio-iommu.h" +#include "hw/acpi/generic_event_device.h" #include "audio/audio.h" -GlobalProperty hw_compat_10_1[] = {}; +GlobalProperty hw_compat_10_1[] = { + { TYPE_ACPI_GED, "x-has-hest-addr", "false" }, +}; const size_t hw_compat_10_1_len = G_N_ELEMENTS(hw_compat_10_1); GlobalProperty hw_compat_10_0[] = { @@ -1115,8 +1118,11 @@ static void machine_class_init(ObjectClass *oc, const void *data) * SMBIOS 3.1.0 7.18.5 Memory Device — Extended Size * use max possible value that could be encoded into * 'Extended Size' field (2047Tb). + * + * Unfortunately (current) Windows Server 2025 and earlier do not handle + * 4Tb+ DIMM size. */ - mc->smbios_memory_device_size = 2047 * TiB; + mc->smbios_memory_device_size = 2 * TiB; /* numa node memory size aligned on 8MB by default. 
 * On Linux, each node's border has to be 8MB aligned
diff --git a/hw/core/qdev.c b/hw/core/qdev.c
index f600226176..fab42a7270 100644
--- a/hw/core/qdev.c
+++ b/hw/core/qdev.c
@@ -411,6 +411,35 @@ char *qdev_get_dev_path(DeviceState *dev)
     return NULL;
 }

+const char *qdev_get_printable_name(DeviceState *vdev)
+{
+    /*
+     * Return the device ID if explicitly set
+     * (e.g. -device virtio-blk-pci,id=foo).
+     * This allows users to correlate errors with their custom device
+     * names.
+     */
+    if (vdev->id) {
+        return vdev->id;
+    }
+    /*
+     * Fall back to the canonical QOM device path (e.g. ID for PCI
+     * devices).
+     * This ensures the device is still uniquely and meaningfully
+     * identified.
+     */
+    const char *path = qdev_get_dev_path(vdev);
+    if (path) {
+        return path;
+    }
+
+    /*
+     * Final fallback: if all else fails, return a placeholder string.
+     * This ensures the error message always contains a valid string.
+     */
+    return "<unknown device>";
+}
+
 void qdev_add_unplug_blocker(DeviceState *dev, Error *reason)
 {
     dev->unplug_blockers = g_slist_prepend(dev->unplug_blockers, reason);
diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index 423c4959fe..9446a9f862 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -1863,7 +1863,11 @@ build_amd_iommu(GArray *table_data, BIOSLinker *linker, const char *oem_id,
     /* IOMMU info */
     build_append_int_noprefix(table_data, 0, 2);
     /* IOMMU Attributes */
-    build_append_int_noprefix(table_data, 0, 4);
+    if (!s->iommu.dma_translation) {
+        build_append_int_noprefix(table_data, (1UL << 0) /* HATDis */, 4);
+    } else {
+        build_append_int_noprefix(table_data, 0, 4);
+    }
     /* EFR Register Image */
     build_append_int_noprefix(table_data,
                               amdvi_extended_feature_register(s),
diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
index 26be69bec8..378e0cb55e 100644
--- a/hw/i386/amd_iommu.c
+++ b/hw/i386/amd_iommu.c
@@ -33,6 +33,7 @@
 #include "hw/i386/apic-msidef.h"
 #include "hw/qdev-properties.h"
 #include "kvm/kvm_i386.h"
+#include "qemu/iova-tree.h"

 /* used AMD-Vi MMIO registers */
 const char *amdvi_mmio_low[] = {
@@ -66,6 +67,15 @@ struct AMDVIAddressSpace {
     MemoryRegion iommu_nodma;   /* Alias of shared nodma memory region */
     MemoryRegion iommu_ir;      /* Device's interrupt remapping region */
     AddressSpace as;            /* device's corresponding address space */
+
+    /* DMA address translation support */
+    IOMMUNotifierFlag notifier_flags;
+    /* entry in list of Address spaces with registered notifiers */
+    QLIST_ENTRY(AMDVIAddressSpace) next;
+    /* Record DMA translation ranges */
+    IOVATree *iova_tree;
+    /* DMA address translation active */
+    bool addr_translation;
 };

 /* AMDVI cache entry */
@@ -77,12 +87,29 @@ typedef struct AMDVIIOTLBEntry {
     uint64_t page_mask;         /* physical page size */
 } AMDVIIOTLBEntry;

+/*
+ * These 'fault' reasons have an overloaded meaning since they are not only
+ * intended for describing reasons that generate an IO_PAGE_FAULT as per the AMD
+ * IOMMU specification, but are also used to signal internal errors in the
+ * emulation code.
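+ *
+ * For example, AMDVI_FR_PT_ENTRY_INV does not describe an architectural
+ * IO_PAGE_FAULT condition: it signals that a PTE could not be read from
+ * guest memory during a walk, letting callers skip the affected range and
+ * continue.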
+ */ +typedef enum AMDVIFaultReason { + AMDVI_FR_DTE_RTR_ERR = 1, /* Failure to retrieve DTE */ + AMDVI_FR_DTE_V, /* DTE[V] = 0 */ + AMDVI_FR_DTE_TV, /* DTE[TV] = 0 */ + AMDVI_FR_PT_ROOT_INV, /* Page Table Root ptr invalid */ + AMDVI_FR_PT_ENTRY_INV, /* Failure to read PTE from guest memory */ +} AMDVIFaultReason; + uint64_t amdvi_extended_feature_register(AMDVIState *s) { uint64_t feature = AMDVI_DEFAULT_EXT_FEATURES; if (s->xtsup) { feature |= AMDVI_FEATURE_XT; } + if (!s->iommu.dma_translation) { + feature |= AMDVI_HATS_MODE_RESERVED; + } return feature; } @@ -438,18 +465,704 @@ static void amdvi_completion_wait(AMDVIState *s, uint64_t *cmd) trace_amdvi_completion_wait(addr, data); } +static inline uint64_t amdvi_get_perms(uint64_t entry) +{ + return (entry & (AMDVI_DEV_PERM_READ | AMDVI_DEV_PERM_WRITE)) >> + AMDVI_DEV_PERM_SHIFT; +} + +/* validate that reserved bits are honoured */ +static bool amdvi_validate_dte(AMDVIState *s, uint16_t devid, + uint64_t *dte) +{ + + uint64_t root; + + if ((dte[0] & AMDVI_DTE_QUAD0_RESERVED) || + (dte[1] & AMDVI_DTE_QUAD1_RESERVED) || + (dte[2] & AMDVI_DTE_QUAD2_RESERVED) || + (dte[3] & AMDVI_DTE_QUAD3_RESERVED)) { + amdvi_log_illegaldevtab_error(s, devid, + s->devtab + + devid * AMDVI_DEVTAB_ENTRY_SIZE, 0); + return false; + } + + /* + * 1 = Host Address Translation is not supported. Value in MMIO Offset + * 0030h[HATS] is not meaningful. A non-zero host page table root pointer + * in the DTE would result in an ILLEGAL_DEV_TABLE_ENTRY event. + */ + root = (dte[0] & AMDVI_DEV_PT_ROOT_MASK) >> 12; + if (root && !s->iommu.dma_translation) { + amdvi_log_illegaldevtab_error(s, devid, + s->devtab + + devid * AMDVI_DEVTAB_ENTRY_SIZE, 0); + return false; + } + + return true; +} + +/* get a device table entry given the devid */ +static bool amdvi_get_dte(AMDVIState *s, int devid, uint64_t *entry) +{ + uint32_t offset = devid * AMDVI_DEVTAB_ENTRY_SIZE; + + if (dma_memory_read(&address_space_memory, s->devtab + offset, entry, + AMDVI_DEVTAB_ENTRY_SIZE, MEMTXATTRS_UNSPECIFIED)) { + trace_amdvi_dte_get_fail(s->devtab, offset); + /* log error accessing dte */ + amdvi_log_devtab_error(s, devid, s->devtab + offset, 0); + return false; + } + + *entry = le64_to_cpu(*entry); + if (!amdvi_validate_dte(s, devid, entry)) { + trace_amdvi_invalid_dte(entry[0]); + return false; + } + + return true; +} + +/* get pte translation mode */ +static inline uint8_t get_pte_translation_mode(uint64_t pte) +{ + return (pte >> AMDVI_DEV_MODE_RSHIFT) & AMDVI_DEV_MODE_MASK; +} + +static inline uint64_t amdvi_get_pte_entry(AMDVIState *s, uint64_t pte_addr, + uint16_t devid) +{ + uint64_t pte; + + if (dma_memory_read(&address_space_memory, pte_addr, + &pte, sizeof(pte), MEMTXATTRS_UNSPECIFIED)) { + trace_amdvi_get_pte_hwerror(pte_addr); + amdvi_log_pagetab_error(s, devid, pte_addr, 0); + pte = (uint64_t)-1; + return pte; + } + + pte = le64_to_cpu(pte); + return pte; +} + +static int amdvi_as_to_dte(AMDVIAddressSpace *as, uint64_t *dte) +{ + uint16_t devid = PCI_BUILD_BDF(as->bus_num, as->devfn); + AMDVIState *s = as->iommu_state; + + if (!amdvi_get_dte(s, devid, dte)) { + /* Unable to retrieve DTE for devid */ + return -AMDVI_FR_DTE_RTR_ERR; + } + + if (!(dte[0] & AMDVI_DEV_VALID)) { + /* DTE[V] not set, address is passed untranslated for devid */ + return -AMDVI_FR_DTE_V; + } + + if (!(dte[0] & AMDVI_DEV_TRANSLATION_VALID)) { + /* DTE[TV] not set, host page table not valid for devid */ + return -AMDVI_FR_DTE_TV; + } + return 0; +} + +/* + * For a PTE encoding a large page, return the page 
size it encodes as described + * by the AMD IOMMU Specification Table 14: Example Page Size Encodings. + * No need to adjust the value of the PTE to point to the first PTE in the large + * page since the encoding guarantees all "base" PTEs in the large page are the + * same. + */ +static uint64_t large_pte_page_size(uint64_t pte) +{ + assert(PTE_NEXT_LEVEL(pte) == 7); + + /* Determine size of the large/contiguous page encoded in the PTE */ + return PTE_LARGE_PAGE_SIZE(pte); +} + +/* + * Helper function to fetch a PTE using AMD v1 pgtable format. + * On successful page walk, returns 0 and pte parameter points to a valid PTE. + * On failure, returns: + * -AMDVI_FR_PT_ROOT_INV: A page walk is not possible due to conditions like DTE + * with invalid permissions, Page Table Root can not be read from DTE, or a + * larger IOVA than supported by page table level encoded in DTE[Mode]. + * -AMDVI_FR_PT_ENTRY_INV: A PTE could not be read from guest memory during a + * page table walk. This means that the DTE has valid data, but one of the + * lower level entries in the Page Table could not be read. + */ +static uint64_t fetch_pte(AMDVIAddressSpace *as, hwaddr address, uint64_t dte, + uint64_t *pte, hwaddr *page_size) +{ + IOMMUAccessFlags perms = amdvi_get_perms(dte); + + uint8_t level, mode; + uint64_t pte_addr; + + *pte = dte; + *page_size = 0; + + if (perms == IOMMU_NONE) { + return -AMDVI_FR_PT_ROOT_INV; + } + + /* + * The Linux kernel driver initializes the default mode to 3, corresponding + * to a 39-bit GPA space, where each entry in the pagetable translates to a + * 1GB (2^30) page size. + */ + level = mode = get_pte_translation_mode(dte); + assert(mode > 0 && mode < 7); + + /* + * If IOVA is larger than the max supported by the current pgtable level, + * there is nothing to do. + */ + if (address > PT_LEVEL_MAX_ADDR(mode - 1)) { + /* IOVA too large for the current DTE */ + return -AMDVI_FR_PT_ROOT_INV; + } + + do { + level -= 1; + + /* Update the page_size */ + *page_size = PTE_LEVEL_PAGE_SIZE(level); + + /* Permission bits are ANDed at every level, including the DTE */ + perms &= amdvi_get_perms(*pte); + if (perms == IOMMU_NONE) { + return 0; + } + + /* Not Present */ + if (!IOMMU_PTE_PRESENT(*pte)) { + return 0; + } + + /* Large or Leaf PTE found */ + if (PTE_NEXT_LEVEL(*pte) == 7 || PTE_NEXT_LEVEL(*pte) == 0) { + /* Leaf PTE found */ + break; + } + + /* + * Index the pgtable using the IOVA bits corresponding to current level + * and walk down to the lower level. + */ + pte_addr = NEXT_PTE_ADDR(*pte, level, address); + *pte = amdvi_get_pte_entry(as->iommu_state, pte_addr, as->devfn); + + if (*pte == (uint64_t)-1) { + /* + * A returned PTE of -1 indicates a failure to read the page table + * entry from guest memory. + */ + if (level == mode - 1) { + /* Failure to retrieve the Page Table from Root Pointer */ + *page_size = 0; + return -AMDVI_FR_PT_ROOT_INV; + } else { + /* Failure to read PTE. Page walk skips a page_size chunk */ + return -AMDVI_FR_PT_ENTRY_INV; + } + } + } while (level > 0); + + assert(PTE_NEXT_LEVEL(*pte) == 0 || PTE_NEXT_LEVEL(*pte) == 7 || + level == 0); + /* + * Page walk ends when Next Level field on PTE shows that either a leaf PTE + * or a series of large PTEs have been reached. In the latter case, even if + * the range starts in the middle of a contiguous page, the returned PTE + * must be the first PTE of the series. 
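+ *
+ * As a worked example, with the default mode 3 (39-bit GPA space) the walk
+ * visits levels 2, 1 and 0: a leaf PTE reached at level 2 maps a 1 GiB
+ * range, at level 1 a 2 MiB range, and at level 0 a 4 KiB page, matching
+ * PTE_LEVEL_PAGE_SIZE(level) for 512-entry (9-bit) table levels.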
+ */ + if (PTE_NEXT_LEVEL(*pte) == 7) { + /* Update page_size with the large PTE page size */ + *page_size = large_pte_page_size(*pte); + } + + return 0; +} + +/* + * Invoke notifiers registered for the address space. Update record of mapped + * ranges in IOVA Tree. + */ +static void amdvi_notify_iommu(AMDVIAddressSpace *as, IOMMUTLBEvent *event) +{ + IOMMUTLBEntry *entry = &event->entry; + + DMAMap target = { + .iova = entry->iova, + .size = entry->addr_mask, + .translated_addr = entry->translated_addr, + .perm = entry->perm, + }; + + /* + * Search the IOVA Tree for an existing translation for the target, and skip + * the notification if the mapping is already recorded. + * When the guest uses large pages, comparing against the record makes it + * possible to determine the size of the original MAP and adjust the UNMAP + * request to match it. This avoids failed checks against the mappings kept + * by the VFIO kernel driver. + */ + const DMAMap *mapped = iova_tree_find(as->iova_tree, &target); + + if (event->type == IOMMU_NOTIFIER_UNMAP) { + if (!mapped) { + /* No record exists of this mapping, nothing to do */ + return; + } + /* + * Adjust the size based on the original record. This is essential to + * determine when large/contiguous pages are used, since the guest has + * already cleared the PTE (erasing the pagesize encoded on it) before + * issuing the invalidation command. + */ + if (mapped->size != target.size) { + assert(mapped->size > target.size); + target.size = mapped->size; + /* Adjust event to invoke notifier with correct range */ + entry->addr_mask = mapped->size; + } + iova_tree_remove(as->iova_tree, target); + } else { /* IOMMU_NOTIFIER_MAP */ + if (mapped) { + /* + * If a mapping is present and matches the request, skip the + * notification. + */ + if (!memcmp(mapped, &target, sizeof(DMAMap))) { + return; + } else { + /* + * This should never happen unless a buggy guest OS omits or + * sends incorrect invalidation(s). Report an error in the event + * it does happen. + */ + error_report("Found conflicting translation. This could be due " + "to an incorrect or missing invalidation command"); + } + } + /* Record the new mapping */ + iova_tree_insert(as->iova_tree, &target); + } + + /* Invoke the notifiers registered for this address space */ + memory_region_notify_iommu(&as->iommu, 0, *event); +} + +/* + * Walk the guest page table for an IOVA and range and signal the registered + * notifiers to sync the shadow page tables in the host. + * Must be called with a valid DTE for DMA remapping i.e. V=1,TV=1 + */ +static void amdvi_sync_shadow_page_table_range(AMDVIAddressSpace *as, + uint64_t *dte, hwaddr addr, + uint64_t size, bool send_unmap) +{ + IOMMUTLBEvent event; + + hwaddr page_mask, pagesize; + hwaddr iova = addr; + hwaddr end = iova + size - 1; + + uint64_t pte; + int ret; + + while (iova < end) { + + ret = fetch_pte(as, iova, dte[0], &pte, &pagesize); + + if (ret == -AMDVI_FR_PT_ROOT_INV) { + /* + * Invalid conditions such as the IOVA being larger than supported + * by current page table mode as configured in the DTE, or a failure + * to fetch the Page Table from the Page Table Root Pointer in DTE. + */ + assert(pagesize == 0); + return; + } + /* PTE has been validated for major errors and pagesize is set */ + assert(pagesize); + page_mask = ~(pagesize - 1); + + if (ret == -AMDVI_FR_PT_ENTRY_INV) { + /* + * Failure to read PTE from memory, the pagesize matches the current + * level. 
Unable to determine the region type, so a safe strategy is + * to skip the range and continue the page walk. + */ + goto next; + } + + event.entry.target_as = &address_space_memory; + event.entry.iova = iova & page_mask; + /* translated_addr is irrelevant for the unmap case */ + event.entry.translated_addr = (pte & AMDVI_DEV_PT_ROOT_MASK) & + page_mask; + event.entry.addr_mask = ~page_mask; + event.entry.perm = amdvi_get_perms(pte); + + /* + * In cases where the leaf PTE is not found, or it has invalid + * permissions, an UNMAP type notification is sent, but only if the + * caller requested it. + */ + if (!IOMMU_PTE_PRESENT(pte) || (event.entry.perm == IOMMU_NONE)) { + if (!send_unmap) { + goto next; + } + event.type = IOMMU_NOTIFIER_UNMAP; + } else { + event.type = IOMMU_NOTIFIER_MAP; + } + + /* + * The following call might need to adjust event.entry.size in cases + * where the guest unmapped a series of large pages. + */ + amdvi_notify_iommu(as, &event); + /* + * In the special scenario where the guest is unmapping a large page, + * addr_mask has been adjusted before sending the notification. Update + * pagesize accordingly in order to correctly compute the next IOVA. + */ + pagesize = event.entry.addr_mask + 1; + +next: + iova &= ~(pagesize - 1); + + /* Check for 64-bit overflow and terminate walk in such cases */ + if ((iova + pagesize) < iova) { + break; + } else { + iova += pagesize; + } + } +} + +/* + * Unmap entire range that the notifier registered for i.e. the full AS. + * + * This is seemingly technically equivalent to directly calling + * memory_region_unmap_iommu_notifier_range(), but it allows to check for + * notifier boundaries and issue notifications with ranges within those bounds. + */ +static void amdvi_address_space_unmap(AMDVIAddressSpace *as, IOMMUNotifier *n) +{ + + hwaddr start = n->start; + hwaddr end = n->end; + hwaddr remain; + DMAMap map; + + assert(start <= end); + remain = end - start + 1; + + /* + * Divide the notifier range into chunks that are aligned and do not exceed + * the notifier boundaries. + */ + while (remain >= AMDVI_PAGE_SIZE) { + + IOMMUTLBEvent event; + + uint64_t mask = dma_aligned_pow2_mask(start, end, 64); + + event.type = IOMMU_NOTIFIER_UNMAP; + + IOMMUTLBEntry entry = { + .target_as = &address_space_memory, + .iova = start, + .translated_addr = 0, /* irrelevant for unmap case */ + .addr_mask = mask, + .perm = IOMMU_NONE, + }; + event.entry = entry; + + /* Call notifier registered for updates on this address space */ + memory_region_notify_iommu_one(n, &event); + + start += mask + 1; + remain -= mask + 1; + } + + assert(!remain); + + map.iova = n->start; + map.size = n->end - n->start; + + iova_tree_remove(as->iova_tree, map); +} + +/* + * For all the address spaces with notifiers registered, unmap the entire range + * the notifier registered for i.e. clear all the address spaces managed by the + * IOMMU. + */ +static void amdvi_address_space_unmap_all(AMDVIState *s) +{ + AMDVIAddressSpace *as; + IOMMUNotifier *n; + + QLIST_FOREACH(as, &s->amdvi_as_with_notifiers, next) { + IOMMU_NOTIFIER_FOREACH(n, &as->iommu) { + amdvi_address_space_unmap(as, n); + } + } +} + +/* + * For every translation present in the IOMMU, construct IOMMUTLBEntry data + * and pass it as parameter to notifier callback. 
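+ *
+ * This is installed as the IOMMU memory region replay callback, which runs
+ * when a notifier is first registered (e.g. by VFIO); only MAP events are
+ * relevant at that point, so UNMAP-only notifiers are skipped.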
+ */ +static void amdvi_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n) +{ + AMDVIAddressSpace *as = container_of(iommu_mr, AMDVIAddressSpace, iommu); + uint64_t dte[4] = { 0 }; + + if (!(n->notifier_flags & IOMMU_NOTIFIER_MAP)) { + return; + } + + if (amdvi_as_to_dte(as, dte)) { + return; + } + + /* Dropping all mappings for the address space. Also clears the IOVA tree */ + amdvi_address_space_unmap(as, n); + + amdvi_sync_shadow_page_table_range(as, &dte[0], 0, UINT64_MAX, false); +} + +static void amdvi_address_space_sync(AMDVIAddressSpace *as) +{ + IOMMUNotifier *n; + uint64_t dte[4] = { 0 }; + + /* If only UNMAP notifiers are registered, drop all existing mappings */ + if (!(as->notifier_flags & IOMMU_NOTIFIER_MAP)) { + IOMMU_NOTIFIER_FOREACH(n, &as->iommu) { + /* + * Directly calling memory_region_unmap_iommu_notifier_range() does + * not guarantee that the addr_mask eventually passed as parameter + * to the notifier is valid. Use amdvi_address_space_unmap() which + * ensures the notifier range is divided into properly aligned + * regions, and issues notifications for each one. + */ + amdvi_address_space_unmap(as, n); + } + return; + } + + if (amdvi_as_to_dte(as, dte)) { + return; + } + + amdvi_sync_shadow_page_table_range(as, &dte[0], 0, UINT64_MAX, true); +} + +/* + * This differs from the replay() method in that it issues both MAP and UNMAP + * notifications since it is called after global invalidation events in order to + * re-sync all address spaces. + */ +static void amdvi_iommu_address_space_sync_all(AMDVIState *s) +{ + AMDVIAddressSpace *as; + + QLIST_FOREACH(as, &s->amdvi_as_with_notifiers, next) { + amdvi_address_space_sync(as); + } +} + +/* + * Toggle between address translation and passthrough modes by enabling the + * corresponding memory regions. + */ +static void amdvi_switch_address_space(AMDVIAddressSpace *amdvi_as) +{ + AMDVIState *s = amdvi_as->iommu_state; + + if (s->dma_remap && amdvi_as->addr_translation) { + /* Enabling DMA region */ + memory_region_set_enabled(&amdvi_as->iommu_nodma, false); + memory_region_set_enabled(MEMORY_REGION(&amdvi_as->iommu), true); + } else { + /* Disabling DMA region, using passthrough */ + memory_region_set_enabled(MEMORY_REGION(&amdvi_as->iommu), false); + memory_region_set_enabled(&amdvi_as->iommu_nodma, true); + } +} + +/* + * For all existing address spaces managed by the IOMMU, enable/disable the + * corresponding memory regions to reset the address translation mode and + * use passthrough by default. + */ +static void amdvi_reset_address_translation_all(AMDVIState *s) +{ + AMDVIAddressSpace **iommu_as; + + for (int bus_num = 0; bus_num < PCI_BUS_MAX; bus_num++) { + + /* Nothing to do if there are no devices on the current bus */ + if (!s->address_spaces[bus_num]) { + continue; + } + iommu_as = s->address_spaces[bus_num]; + + for (int devfn = 0; devfn < PCI_DEVFN_MAX; devfn++) { + + if (!iommu_as[devfn]) { + continue; + } + /* Use passthrough as default mode after reset */ + iommu_as[devfn]->addr_translation = false; + amdvi_switch_address_space(iommu_as[devfn]); + } + } +} + +static void enable_dma_mode(AMDVIAddressSpace *as, bool inval_current) +{ + /* + * When enabling DMA mode for the purpose of isolating guest devices on + * a failure to retrieve or invalid DTE, all existing mappings must be + * dropped. 
+ */ + if (inval_current) { + IOMMUNotifier *n; + IOMMU_NOTIFIER_FOREACH(n, &as->iommu) { + amdvi_address_space_unmap(as, n); + } + } + + if (as->addr_translation) { + return; + } + + /* Installing DTE enabling translation, activate region */ + as->addr_translation = true; + amdvi_switch_address_space(as); + /* Sync shadow page tables */ + amdvi_address_space_sync(as); +} + +/* + * If paging was previously in use in the address space + * - invalidate all existing mappings + * - switch to no_dma memory region + */ +static void enable_nodma_mode(AMDVIAddressSpace *as) +{ + IOMMUNotifier *n; + + if (!as->addr_translation) { + /* passthrough is already active, nothing to do */ + return; + } + + as->addr_translation = false; + IOMMU_NOTIFIER_FOREACH(n, &as->iommu) { + /* Drop all mappings for the address space */ + amdvi_address_space_unmap(as, n); + } + amdvi_switch_address_space(as); +} + +/* + * A guest driver must issue the INVALIDATE_DEVTAB_ENTRY command to the IOMMU + * after changing a Device Table entry. We can use this fact to detect when a + * Device Table entry is created for a device attached to a paging domain and + * enable the corresponding IOMMU memory region to allow for DMA translation if + * appropriate. + */ +static void amdvi_update_addr_translation_mode(AMDVIState *s, uint16_t devid) +{ + uint8_t bus_num, devfn, dte_mode; + AMDVIAddressSpace *as; + uint64_t dte[4] = { 0 }; + int ret; + + /* + * Convert the devid encoded in the command to a bus and devfn in + * order to retrieve the corresponding address space. + */ + bus_num = PCI_BUS_NUM(devid); + devfn = devid & 0xff; + + /* + * The main buffer of size (AMDVIAddressSpace *) * (PCI_BUS_MAX) has already + * been allocated within AMDVIState, but must be careful to not access + * unallocated devfn. + */ + if (!s->address_spaces[bus_num] || !s->address_spaces[bus_num][devfn]) { + return; + } + as = s->address_spaces[bus_num][devfn]; + + ret = amdvi_as_to_dte(as, dte); + + if (!ret) { + dte_mode = (dte[0] >> AMDVI_DEV_MODE_RSHIFT) & AMDVI_DEV_MODE_MASK; + } + + switch (ret) { + case 0: + /* DTE was successfully retrieved */ + if (!dte_mode) { + enable_nodma_mode(as); /* DTE[V]=1 && DTE[Mode]=0 => passthrough */ + } else { + enable_dma_mode(as, false); /* Enable DMA translation */ + } + break; + case -AMDVI_FR_DTE_V: + /* DTE[V]=0, address is passed untranslated */ + enable_nodma_mode(as); + break; + case -AMDVI_FR_DTE_RTR_ERR: + case -AMDVI_FR_DTE_TV: + /* + * Enforce isolation by using DMA in rare scenarios where the DTE cannot + * be retrieved or DTE[TV]=0. Existing mappings are dropped. + */ + enable_dma_mode(as, true); + break; + } +} + /* log error without aborting since linux seems to be using reserved bits */ static void amdvi_inval_devtab_entry(AMDVIState *s, uint64_t *cmd) { uint16_t devid = cpu_to_le16((uint16_t)extract64(cmd[0], 0, 16)); + trace_amdvi_devtab_inval(PCI_BUS_NUM(devid), PCI_SLOT(devid), + PCI_FUNC(devid)); + /* This command should invalidate internal caches of which there isn't */ if (extract64(cmd[0], 16, 44) || cmd[1]) { amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4), s->cmdbuf + s->cmdbuf_head); + return; + } + + /* + * When DMA remapping capability is enabled, check if updated DTE is setup + * for paging or not, and configure the corresponding memory regions. 
+     */
+    if (s->dma_remap) {
+        amdvi_update_addr_translation_mode(s, devid);
+    }
-    trace_amdvi_devtab_inval(PCI_BUS_NUM(devid), PCI_SLOT(devid),
-                             PCI_FUNC(devid));
 }

 static void amdvi_complete_ppr(AMDVIState *s, uint64_t *cmd)
@@ -480,6 +1193,13 @@ static void amdvi_inval_all(AMDVIState *s, uint64_t *cmd)
     amdvi_intremap_inval_notify_all(s, true, 0, 0);

     amdvi_iotlb_reset(s);
+
+    /*
+     * Fully replay the address space i.e. send both UNMAP and MAP events in
+     * order to synchronize guest and host IO page tables.
+     */
+    amdvi_iommu_address_space_sync_all(s);
+
     trace_amdvi_all_inval();
 }

@@ -491,10 +1211,109 @@ static gboolean amdvi_iotlb_remove_by_domid(gpointer key, gpointer value,
     return entry->domid == domid;
 }

+/*
+ * Helper to decode the size of the range to invalidate encoded in the
+ * INVALIDATE_IOMMU_PAGES Command format.
+ * The size of the region to invalidate depends on the S bit and address.
+ * S bit value:
+ *  0 : Invalidation size is 4 Kbytes.
+ *  1 : Invalidation size is determined by first zero bit in the address
+ *      starting from Address[12].
+ *
+ * In the AMD IOMMU Linux driver, an invalidation command with address
+ * ((1 << 63) - 1) is sent when intending to clear the entire cache.
+ * However, Table 14: Example Page Size Encodings shows that an address of
+ * ((1ULL << 51) - 1) encodes the entire cache, so effectively any address with
+ * first zero at bit 51 or larger is a request to invalidate the entire address
+ * space.
+ */
+static uint64_t amdvi_decode_invalidation_size(hwaddr addr, uint16_t flags)
+{
+    uint64_t size = AMDVI_PAGE_SIZE;
+    uint8_t fzbit = 0;
+
+    if (flags & AMDVI_CMD_INVAL_IOMMU_PAGES_S) {
+        fzbit = cto64(addr | 0xFFF);
+
+        if (fzbit >= 51) {
+            size = AMDVI_INV_ALL_PAGES;
+        } else {
+            size = 1ULL << (fzbit + 1);
+        }
+    }
+    return size;
+}
+
+/*
+ * Synchronize the guest page tables with the shadow page tables kept in the
+ * host for the specified range.
+ * The invalidation command issued by the guest and intercepted by the VMM
+ * does not specify a device, but a domain, since all devices in the same domain
+ * share the same page tables. However, vIOMMU emulation creates separate
+ * address spaces per device, so it is necessary to traverse the list of all
+ * address spaces (i.e. devices) that have notifiers registered in order to
+ * propagate the changes to the host page tables.
+ * We cannot return early from this function once a matching domain has been
+ * identified and its page tables synced (based on the fact that all devices in
+ * the same domain share the page tables). The reason is that different devices
+ * (i.e. address spaces) could have different notifiers registered, and by
+ * skipping address spaces that appear later on the amdvi_as_with_notifiers list
+ * their notifiers (which could differ from the ones registered for the first
+ * device/address space) would not be invoked.
+ */
+static void amdvi_sync_domain(AMDVIState *s, uint16_t domid, uint64_t addr,
+                              uint16_t flags)
+{
+    AMDVIAddressSpace *as;
+
+    uint64_t size = amdvi_decode_invalidation_size(addr, flags);
+
+    if (size == AMDVI_INV_ALL_PAGES) {
+        addr = 0;       /* Set start address to 0 and invalidate entire AS */
+    } else {
+        addr &= ~(size - 1);
+    }
+
+    /*
+     * Call notifiers that have registered for each address space matching the
+     * domain ID, in order to sync the guest pagetable state with the host.
+ */ + QLIST_FOREACH(as, &s->amdvi_as_with_notifiers, next) { + + uint64_t dte[4] = { 0 }; + + /* + * Retrieve the Device Table entry for the devid corresponding to the + * current address space, and verify the DomainID matches i.e. the page + * tables to be synced belong to devices in the domain. + */ + if (amdvi_as_to_dte(as, dte)) { + continue; + } + + /* Only need to sync the Page Tables for a matching domain */ + if (domid != (dte[1] & AMDVI_DEV_DOMID_ID_MASK)) { + continue; + } + + /* + * We have determined that there is a valid Device Table Entry for a + * device matching the DomainID in the INV_IOMMU_PAGES command issued by + * the guest. Walk the guest page table to sync shadow page table. + */ + if (as->notifier_flags & IOMMU_NOTIFIER_MAP) { + /* Sync guest IOMMU mappings with host */ + amdvi_sync_shadow_page_table_range(as, &dte[0], addr, size, true); + } + } +} + /* we don't have devid - we can't remove pages by address */ static void amdvi_inval_pages(AMDVIState *s, uint64_t *cmd) { uint16_t domid = cpu_to_le16((uint16_t)extract64(cmd[0], 32, 16)); + uint64_t addr = cpu_to_le64(extract64(cmd[1], 12, 52)) << 12; + uint16_t flags = cpu_to_le16((uint16_t)extract64(cmd[1], 0, 3)); if (extract64(cmd[0], 20, 12) || extract64(cmd[0], 48, 12) || extract64(cmd[1], 3, 9)) { @@ -504,6 +1323,8 @@ static void amdvi_inval_pages(AMDVIState *s, uint64_t *cmd) g_hash_table_foreach_remove(s->iotlb, amdvi_iotlb_remove_by_domid, &domid); + + amdvi_sync_domain(s, domid, addr, flags); trace_amdvi_pages_inval(domid); } @@ -894,150 +1715,68 @@ static void amdvi_mmio_write(void *opaque, hwaddr addr, uint64_t val, } } -static inline uint64_t amdvi_get_perms(uint64_t entry) -{ - return (entry & (AMDVI_DEV_PERM_READ | AMDVI_DEV_PERM_WRITE)) >> - AMDVI_DEV_PERM_SHIFT; -} - -/* validate that reserved bits are honoured */ -static bool amdvi_validate_dte(AMDVIState *s, uint16_t devid, - uint64_t *dte) -{ - if ((dte[0] & AMDVI_DTE_QUAD0_RESERVED) || - (dte[1] & AMDVI_DTE_QUAD1_RESERVED) || - (dte[2] & AMDVI_DTE_QUAD2_RESERVED) || - (dte[3] & AMDVI_DTE_QUAD3_RESERVED)) { - amdvi_log_illegaldevtab_error(s, devid, - s->devtab + - devid * AMDVI_DEVTAB_ENTRY_SIZE, 0); - return false; - } - - return true; -} - -/* get a device table entry given the devid */ -static bool amdvi_get_dte(AMDVIState *s, int devid, uint64_t *entry) +static void amdvi_page_walk(AMDVIAddressSpace *as, uint64_t *dte, + IOMMUTLBEntry *ret, unsigned perms, + hwaddr addr) { - uint32_t offset = devid * AMDVI_DEVTAB_ENTRY_SIZE; + hwaddr page_mask, pagesize = 0; + uint8_t mode; + uint64_t pte; + int fetch_ret; - if (dma_memory_read(&address_space_memory, s->devtab + offset, entry, - AMDVI_DEVTAB_ENTRY_SIZE, MEMTXATTRS_UNSPECIFIED)) { - trace_amdvi_dte_get_fail(s->devtab, offset); - /* log error accessing dte */ - amdvi_log_devtab_error(s, devid, s->devtab + offset, 0); - return false; + /* make sure the DTE has TV = 1 */ + if (!(dte[0] & AMDVI_DEV_TRANSLATION_VALID)) { + /* + * A DTE with V=1, TV=0 does not have a valid Page Table Root Pointer. + * An IOMMU processing a request that requires a table walk terminates + * the walk when it encounters this condition. Do the same and return + * instead of assuming that the address is forwarded without translation + * i.e. the passthrough case, as it is done for the case where DTE[V]=0. 
+ */
+ return;
 }
- *entry = le64_to_cpu(*entry);
- if (!amdvi_validate_dte(s, devid, entry)) {
- trace_amdvi_invalid_dte(entry[0]);
- return false;
+ mode = get_pte_translation_mode(dte[0]);
+ if (mode >= 7) {
+ trace_amdvi_mode_invalid(mode, addr);
+ return;
 }
-
- return true;
-}
-
-/* get pte translation mode */
-static inline uint8_t get_pte_translation_mode(uint64_t pte)
-{
- return (pte >> AMDVI_DEV_MODE_RSHIFT) & AMDVI_DEV_MODE_MASK;
-}
-
-static inline uint64_t pte_override_page_mask(uint64_t pte)
-{
- uint8_t page_mask = 13;
- uint64_t addr = (pte & AMDVI_DEV_PT_ROOT_MASK) >> 12;
- /* find the first zero bit */
- while (addr & 1) {
- page_mask++;
- addr = addr >> 1;
+ if (mode == 0) {
+ goto no_remap;
 }
- return ~((1ULL << page_mask) - 1);
-}
-
-static inline uint64_t pte_get_page_mask(uint64_t oldlevel)
-{
- return ~((1UL << ((oldlevel * 9) + 3)) - 1);
-}
+ /* Attempt to fetch the PTE to determine if a valid mapping exists */
+ fetch_ret = fetch_pte(as, addr, dte[0], &pte, &pagesize);
-static inline uint64_t amdvi_get_pte_entry(AMDVIState *s, uint64_t pte_addr,
- uint16_t devid)
-{
- uint64_t pte;
+ /*
+ * If walking the page table results in an error of any type, yields an
+ * empty PTE (i.e. no mapping), or the permissions do not match, return,
+ * since there is no translation available.
+ */
+ if (fetch_ret < 0 || !IOMMU_PTE_PRESENT(pte) ||
+ perms != (perms & amdvi_get_perms(pte))) {
- if (dma_memory_read(&address_space_memory, pte_addr,
- &pte, sizeof(pte), MEMTXATTRS_UNSPECIFIED)) {
- trace_amdvi_get_pte_hwerror(pte_addr);
- amdvi_log_pagetab_error(s, devid, pte_addr, 0);
- pte = 0;
- return pte;
+ amdvi_page_fault(as->iommu_state, as->devfn, addr, perms);
+ trace_amdvi_page_fault(addr);
+ return;
 }
- pte = le64_to_cpu(pte);
- return pte;
-}
-
-static void amdvi_page_walk(AMDVIAddressSpace *as, uint64_t *dte,
- IOMMUTLBEntry *ret, unsigned perms,
- hwaddr addr)
-{
- unsigned level, present, pte_perms, oldlevel;
- uint64_t pte = dte[0], pte_addr, page_mask;
-
- /* make sure the DTE has TV = 1 */
- if (pte & AMDVI_DEV_TRANSLATION_VALID) {
- level = get_pte_translation_mode(pte);
- if (level >= 7) {
- trace_amdvi_mode_invalid(level, addr);
- return;
- }
- if (level == 0) {
- goto no_remap;
- }
-
- /* we are at the leaf page table or page table encodes a huge page */
- do {
- pte_perms = amdvi_get_perms(pte);
- present = pte & 1;
- if (!present || perms != (perms & pte_perms)) {
- amdvi_page_fault(as->iommu_state, as->devfn, addr, perms);
- trace_amdvi_page_fault(addr);
- return;
- }
-
- /* go to the next lower level */
- pte_addr = pte & AMDVI_DEV_PT_ROOT_MASK;
- /* add offset and load pte */
- pte_addr += ((addr >> (3 + 9 * level)) & 0x1FF) << 3;
- pte = amdvi_get_pte_entry(as->iommu_state, pte_addr, as->devfn);
- if (!pte) {
- return;
- }
- oldlevel = level;
- level = get_pte_translation_mode(pte);
- } while (level > 0 && level < 7);
+ /* A valid PTE and page size have been retrieved */
+ assert(pagesize);
+ page_mask = ~(pagesize - 1);
- if (level == 0x7) {
- page_mask = pte_override_page_mask(pte);
- } else {
- page_mask = pte_get_page_mask(oldlevel);
- }
+ /* get access permissions from pte */
+ ret->iova = addr & page_mask;
+ ret->translated_addr = (pte & AMDVI_DEV_PT_ROOT_MASK) & page_mask;
+ ret->addr_mask = ~page_mask;
+ ret->perm = amdvi_get_perms(pte);
+ return;
- /* get access permissions from pte */
- ret->iova = addr & page_mask;
- ret->translated_addr = (pte & AMDVI_DEV_PT_ROOT_MASK) & page_mask;
- ret->addr_mask = ~page_mask;
- ret->perm = amdvi_get_perms(pte);
-
return; - } no_remap: ret->iova = addr & AMDVI_PAGE_MASK_4K; ret->translated_addr = addr & AMDVI_PAGE_MASK_4K; ret->addr_mask = ~AMDVI_PAGE_MASK_4K; - ret->perm = amdvi_get_perms(pte); + ret->perm = amdvi_get_perms(dte[0]); } static void amdvi_do_translate(AMDVIAddressSpace *as, hwaddr addr, @@ -1047,6 +1786,7 @@ static void amdvi_do_translate(AMDVIAddressSpace *as, hwaddr addr, uint16_t devid = PCI_BUILD_BDF(as->bus_num, as->devfn); AMDVIIOTLBEntry *iotlb_entry = amdvi_iotlb_lookup(s, addr, devid); uint64_t entry[4]; + int dte_ret; if (iotlb_entry) { trace_amdvi_iotlb_hit(PCI_BUS_NUM(devid), PCI_SLOT(devid), @@ -1058,13 +1798,14 @@ static void amdvi_do_translate(AMDVIAddressSpace *as, hwaddr addr, return; } - if (!amdvi_get_dte(s, devid, entry)) { - return; - } + dte_ret = amdvi_as_to_dte(as, entry); - /* devices with V = 0 are not translated */ - if (!(entry[0] & AMDVI_DEV_VALID)) { - goto out; + if (dte_ret < 0) { + if (dte_ret == -AMDVI_FR_DTE_V) { + /* DTE[V]=0, address is passed untranslated */ + goto out; + } + return; } amdvi_page_walk(as, entry, ret, @@ -1500,6 +2241,9 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) iommu_as[devfn]->bus_num = (uint8_t)bus_num; iommu_as[devfn]->devfn = (uint8_t)devfn; iommu_as[devfn]->iommu_state = s; + iommu_as[devfn]->notifier_flags = IOMMU_NOTIFIER_NONE; + iommu_as[devfn]->iova_tree = iova_tree_new(); + iommu_as[devfn]->addr_translation = false; amdvi_dev_as = iommu_as[devfn]; @@ -1542,8 +2286,7 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) AMDVI_INT_ADDR_FIRST, &amdvi_dev_as->iommu_ir, 1); - memory_region_set_enabled(&amdvi_dev_as->iommu_nodma, false); - memory_region_set_enabled(MEMORY_REGION(&amdvi_dev_as->iommu), true); + amdvi_switch_address_space(amdvi_dev_as); } return &iommu_as[devfn]->as; } @@ -1573,14 +2316,35 @@ static int amdvi_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu, Error **errp) { AMDVIAddressSpace *as = container_of(iommu, AMDVIAddressSpace, iommu); + AMDVIState *s = as->iommu_state; - if (new & IOMMU_NOTIFIER_MAP) { - error_setg(errp, - "device %02x.%02x.%x requires iommu notifier which is not " - "currently supported", as->bus_num, PCI_SLOT(as->devfn), - PCI_FUNC(as->devfn)); - return -EINVAL; + /* + * Accurate synchronization of the vIOMMU page tables required to support + * MAP notifiers is provided by the dma-remap feature. In addition, this + * also requires that the vIOMMU presents the NpCache capability, so a guest + * driver issues invalidations for both map() and unmap() operations. The + * capability is already set by default as part of AMDVI_CAPAB_FEATURES and + * written to the configuration in amdvi_pci_realize(). + */ + if (!s->dma_remap && (new & IOMMU_NOTIFIER_MAP)) { + error_setg_errno(errp, ENOTSUP, + "device %02x.%02x.%x requires dma-remap=1", + as->bus_num, PCI_SLOT(as->devfn), PCI_FUNC(as->devfn)); + return -ENOTSUP; } + + /* + * Update notifier flags for address space and the list of address spaces + * with registered notifiers. 
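+ *
+ * Usage note (illustrative, based on the dma-remap property added below):
+ * consumers that register MAP notifiers, e.g. VFIO device assignment,
+ * need the vIOMMU started with something like:
+ *
+ *     -device amd-iommu,dma-remap=on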
+ */ + as->notifier_flags = new; + + if (old == IOMMU_NOTIFIER_NONE) { + QLIST_INSERT_HEAD(&s->amdvi_as_with_notifiers, as, next); + } else if (new == IOMMU_NOTIFIER_NONE) { + QLIST_REMOVE(as, next); + } + return 0; } @@ -1657,6 +2421,10 @@ static void amdvi_sysbus_reset(DeviceState *dev) msi_reset(&s->pci->dev); amdvi_init(s); + + /* Discard all mappings on device reset */ + amdvi_address_space_unmap_all(s); + amdvi_reset_address_translation_all(s); } static const VMStateDescription vmstate_amdvi_sysbus_migratable = { @@ -1787,6 +2555,7 @@ static void amdvi_sysbus_realize(DeviceState *dev, Error **errp) static const Property amdvi_properties[] = { DEFINE_PROP_BOOL("xtsup", AMDVIState, xtsup, false), DEFINE_PROP_STRING("pci-id", AMDVIState, pci_id), + DEFINE_PROP_BOOL("dma-remap", AMDVIState, dma_remap, false), }; static const VMStateDescription vmstate_amdvi_sysbus = { @@ -1848,6 +2617,7 @@ static void amdvi_iommu_memory_region_class_init(ObjectClass *klass, imrc->translate = amdvi_translate; imrc->notify_flag_changed = amdvi_iommu_notify_flag_changed; + imrc->replay = amdvi_iommu_replay; } static const TypeInfo amdvi_iommu_memory_region_info = { diff --git a/hw/i386/amd_iommu.h b/hw/i386/amd_iommu.h index 2476296c49..daf82fc85f 100644 --- a/hw/i386/amd_iommu.h +++ b/hw/i386/amd_iommu.h @@ -126,6 +126,10 @@ #define AMDVI_CMD_COMPLETE_PPR_REQUEST 0x07 #define AMDVI_CMD_INVAL_AMDVI_ALL 0x08 + +#define AMDVI_CMD_INVAL_IOMMU_PAGES_S (1ULL << 0) +#define AMDVI_INV_ALL_PAGES (1ULL << 52) + #define AMDVI_DEVTAB_ENTRY_SIZE 32 /* Device table entry bits 0:63 */ @@ -173,6 +177,47 @@ /* AMDVI paging mode */ #define AMDVI_GATS_MODE (2ULL << 12) #define AMDVI_HATS_MODE (2ULL << 10) +#define AMDVI_HATS_MODE_RESERVED (3ULL << 10) + +/* Page Table format */ + +#define AMDVI_PTE_PR (1ULL << 0) +#define AMDVI_PTE_NEXT_LEVEL_MASK GENMASK64(11, 9) + +#define IOMMU_PTE_PRESENT(pte) ((pte) & AMDVI_PTE_PR) + +/* Using level=0 for leaf PTE at 4K page size */ +#define PT_LEVEL_SHIFT(level) (12 + ((level) * 9)) + +/* Return IOVA bit group used to index the Page Table at specific level */ +#define PT_LEVEL_INDEX(level, iova) (((iova) >> PT_LEVEL_SHIFT(level)) & \ + GENMASK64(8, 0)) + +/* Return the max address for a specified level i.e. max_oaddr */ +#define PT_LEVEL_MAX_ADDR(x) (((x) < 5) ? \ + ((1ULL << PT_LEVEL_SHIFT((x + 1))) - 1) : \ + (~(0ULL))) + +/* Extract the NextLevel field from PTE/PDE */ +#define PTE_NEXT_LEVEL(pte) (((pte) & AMDVI_PTE_NEXT_LEVEL_MASK) >> 9) + +/* Take page table level and return default pagetable size for level */ +#define PTE_LEVEL_PAGE_SIZE(level) (1ULL << (PT_LEVEL_SHIFT(level))) + +/* + * Return address of lower level page table encoded in PTE and specified by + * current level and corresponding IOVA bit group at such level. + */ +#define NEXT_PTE_ADDR(pte, level, iova) (((pte) & AMDVI_DEV_PT_ROOT_MASK) + \ + (PT_LEVEL_INDEX(level, iova) * 8)) + +/* + * Take a PTE value with mode=0x07 and return the page size it encodes. 
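+ * For example (illustrative): a PTE whose address field has bits [19:12]
+ * set and bit 20 clear gives cto64(pte | 0xfff) == 20, so the macro
+ * evaluates to 1ULL << 21, i.e. a 2 Mbyte page.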
+ */ +#define PTE_LARGE_PAGE_SIZE(pte) (1ULL << (1 + cto64(((pte) | 0xfffULL)))) + +/* Return number of PTEs to use for a given page size (expected power of 2) */ +#define PAGE_SIZE_PTE_COUNT(pgsz) (1ULL << ((ctz64(pgsz) - 12) % 9)) /* IOTLB */ #define AMDVI_IOTLB_MAX_SIZE 1024 @@ -365,12 +410,18 @@ struct AMDVIState { /* for each served device */ AMDVIAddressSpace **address_spaces[PCI_BUS_MAX]; + /* list of address spaces with registered notifiers */ + QLIST_HEAD(, AMDVIAddressSpace) amdvi_as_with_notifiers; + /* IOTLB */ GHashTable *iotlb; /* Interrupt remapping */ bool ga_enabled; bool xtsup; + + /* DMA address translation */ + bool dma_remap; }; uint64_t amdvi_extended_feature_register(AMDVIState *s); diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 83c5e44413..6a168d5107 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -45,6 +45,8 @@ ((ce)->val[1] & VTD_SM_CONTEXT_ENTRY_RID2PASID_MASK) #define VTD_CE_GET_PASID_DIR_TABLE(ce) \ ((ce)->val[0] & VTD_PASID_DIR_BASE_ADDR_MASK) +#define VTD_CE_GET_PRE(ce) \ + ((ce)->val[0] & VTD_SM_CONTEXT_ENTRY_PRE) /* pe operations */ #define VTD_PE_GET_TYPE(pe) ((pe)->val[0] & VTD_SM_PASID_ENTRY_PGTT) @@ -85,13 +87,6 @@ struct vtd_iotlb_key { static void vtd_address_space_refresh_all(IntelIOMMUState *s); static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n); -static void vtd_panic_require_caching_mode(void) -{ - error_report("We need to set caching-mode=on for intel-iommu to enable " - "device assignment with IOMMU protection."); - exit(1); -} - static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val, uint64_t wmask, uint64_t w1cmask) { @@ -1838,6 +1833,7 @@ static const bool vtd_qualified_faults[] = { [VTD_FR_FS_NON_CANONICAL] = true, [VTD_FR_FS_PAGING_ENTRY_US] = true, [VTD_FR_SM_WRITE] = true, + [VTD_FR_SM_PRE_ABS] = true, [VTD_FR_SM_INTERRUPT_ADDR] = true, [VTD_FR_FS_BIT_UPDATE_FAILED] = true, [VTD_FR_MAX] = false, @@ -2701,7 +2697,7 @@ static void vtd_handle_gcmd_write(IntelIOMMUState *s) uint32_t changed = status ^ val; trace_vtd_reg_write_gcmd(status, val); - if ((changed & VTD_GCMD_TE) && s->dma_translation) { + if ((changed & VTD_GCMD_TE) && x86_iommu->dma_translation) { /* Translation enable/disable */ vtd_handle_gcmd_te(s, val & VTD_GCMD_TE); } @@ -2857,7 +2853,13 @@ static bool vtd_process_wait_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) vtd_generate_completion_event(s); } - if (!(inv_desc->lo & (VTD_INV_DESC_WAIT_IF | VTD_INV_DESC_WAIT_SW))) { + /* + * SW=0, IF=0, FN=1 is also a valid descriptor (VT-d 7.10) + * Nothing to do as we process the descriptors in order + */ + + if (!(inv_desc->lo & (VTD_INV_DESC_WAIT_IF | VTD_INV_DESC_WAIT_SW | + VTD_INV_DESC_WAIT_FN))) { error_report_once("%s: invalid wait desc: hi=%"PRIx64", lo=%"PRIx64 " (unknown type)", __func__, inv_desc->hi, inv_desc->lo); @@ -3146,6 +3148,59 @@ static bool vtd_process_device_piotlb_desc(IntelIOMMUState *s, return true; } +static bool vtd_process_page_group_response_desc(IntelIOMMUState *s, + VTDInvDesc *inv_desc) +{ + VTDAddressSpace *vtd_dev_as; + bool pasid_present; + uint8_t response_code; + uint16_t rid; + uint32_t pasid; + uint16_t prgi; + IOMMUPRIResponse response; + + if ((inv_desc->lo & VTD_INV_DESC_PGRESP_RSVD_LO) || + (inv_desc->hi & VTD_INV_DESC_PGRESP_RSVD_HI)) { + error_report_once("%s: invalid page group response desc: hi=%"PRIx64 + ", lo=%"PRIx64" (reserved nonzero)", __func__, + inv_desc->hi, inv_desc->lo); + return false; + } + + pasid_present = VTD_INV_DESC_PGRESP_PP(inv_desc->lo); + 
response_code = VTD_INV_DESC_PGRESP_RC(inv_desc->lo);
+ rid = VTD_INV_DESC_PGRESP_RID(inv_desc->lo);
+ pasid = VTD_INV_DESC_PGRESP_PASID(inv_desc->lo);
+ prgi = VTD_INV_DESC_PGRESP_PRGI(inv_desc->hi);
+
+ if (!pasid_present) {
+ error_report_once("Page group response without PASID is "
+ "not supported yet");
+ return false;
+ }
+
+ vtd_dev_as = vtd_get_as_by_sid_and_pasid(s, rid, pasid);
+ if (!vtd_dev_as) {
+ return true;
+ }
+
+ response.prgi = prgi;
+
+ if (response_code == 0x0u) {
+ response.response_code = IOMMU_PRI_RESP_SUCCESS;
+ } else if (response_code == 0x1u) {
+ response.response_code = IOMMU_PRI_RESP_INVALID_REQUEST;
+ } else {
+ response.response_code = IOMMU_PRI_RESP_FAILURE;
+ }
+
+ if (vtd_dev_as->pri_notifier) {
+ vtd_dev_as->pri_notifier->notify(vtd_dev_as->pri_notifier, &response);
+ }
+
+ return true;
+}
+
 static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s,
 VTDInvDesc *inv_desc)
 {
@@ -3246,6 +3301,13 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s)
 }
 break;
+ case VTD_INV_DESC_PGRESP:
+ trace_vtd_inv_desc("page group response", inv_desc.hi, inv_desc.lo);
+ if (!vtd_process_page_group_response_desc(s, &inv_desc)) {
+ return false;
+ }
+ break;
+
 /*
 * TODO: the entity of below two cases will be implemented in future series.
 * To make guest (which integrates scalable mode support patch set in
@@ -3380,6 +3442,27 @@ static void vtd_handle_iectl_write(IntelIOMMUState *s)
 }
 }
+static void vtd_handle_prs_write(IntelIOMMUState *s)
+{
+ uint32_t prs = vtd_get_long_raw(s, DMAR_PRS_REG);
+ if (!(prs & VTD_PR_STATUS_PPR) && !(prs & VTD_PR_STATUS_PRO)) {
+ vtd_set_clear_mask_long(s, DMAR_PECTL_REG, VTD_PR_PECTL_IP, 0);
+ }
+}
+
+static void vtd_handle_pectl_write(IntelIOMMUState *s)
+{
+ uint32_t pectl = vtd_get_long_raw(s, DMAR_PECTL_REG);
+ if ((pectl & VTD_PR_PECTL_IP) && !(pectl & VTD_PR_PECTL_IM)) {
+ /*
+ * If the IP field was 1 when software clears the IM field,
+ * the interrupt is generated along with clearing the IP field.
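+ *
+ * Illustrative sequence: an event arrives while IM=1, so hardware only
+ * sets IP; software then writes PECTL with IM=0; this handler sees
+ * IP=1 && IM=0, clears IP and raises the interrupt programmed in
+ * PEADDR/PEDATA.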
+ */ + vtd_set_clear_mask_long(s, DMAR_PECTL_REG, VTD_PR_PECTL_IP, 0); + vtd_generate_interrupt(s, DMAR_PEADDR_REG, DMAR_PEDATA_REG); + } +} + static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size) { IntelIOMMUState *s = opaque; @@ -3422,6 +3505,11 @@ static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size) val = s->iq >> 32; break; + case DMAR_PEUADDR_REG: + assert(size == 4); + val = vtd_get_long_raw(s, DMAR_PEUADDR_REG); + break; + default: if (size == 4) { val = vtd_get_long(s, addr); @@ -3485,6 +3573,11 @@ static void vtd_mem_write(void *opaque, hwaddr addr, vtd_handle_iotlb_write(s); break; + case DMAR_PEUADDR_REG: + assert(size == 4); + vtd_set_long(s, addr, val); + break; + /* Invalidate Address Register, 64-bit */ case DMAR_IVA_REG: if (size == 4) { @@ -3665,6 +3758,18 @@ static void vtd_mem_write(void *opaque, hwaddr addr, vtd_set_long(s, addr, val); break; + case DMAR_PRS_REG: + assert(size == 4); + vtd_set_long(s, addr, val); + vtd_handle_prs_write(s); + break; + + case DMAR_PECTL_REG: + assert(size == 4); + vtd_set_long(s, addr, val); + vtd_handle_pectl_write(s); + break; + default: if (size == 4) { vtd_set_long(s, addr, val); @@ -3835,7 +3940,6 @@ static const Property vtd_properties[] = { DEFINE_PROP_BOOL("snoop-control", IntelIOMMUState, snoop_control, false), DEFINE_PROP_BOOL("x-pasid-mode", IntelIOMMUState, pasid, false), DEFINE_PROP_BOOL("dma-drain", IntelIOMMUState, dma_drain, true), - DEFINE_PROP_BOOL("dma-translation", IntelIOMMUState, dma_translation, true), DEFINE_PROP_BOOL("stale-tm", IntelIOMMUState, stale_tm, false), DEFINE_PROP_BOOL("fs1gp", IntelIOMMUState, fs1gp, true), }; @@ -4378,6 +4482,12 @@ static bool vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn, assert(hiod); + if (!s->caching_mode) { + error_setg(errp, "Device assignment is not allowed without enabling " + "caching-mode=on for Intel IOMMU."); + return false; + } + vtd_iommu_lock(s); if (g_hash_table_lookup(s->vtd_host_iommu_dev, &key)) { @@ -4549,11 +4659,11 @@ static void vtd_cap_init(IntelIOMMUState *s) s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS | - VTD_CAP_MGAW(s->aw_bits); + VTD_CAP_ESRTPS | VTD_CAP_MGAW(s->aw_bits); if (s->dma_drain) { s->cap |= VTD_CAP_DRAIN; } - if (s->dma_translation) { + if (x86_iommu->dma_translation) { if (s->aw_bits >= VTD_HOST_AW_39BIT) { s->cap |= VTD_CAP_SAGAW_39bit; } @@ -4716,6 +4826,18 @@ static void vtd_init(IntelIOMMUState *s) * Interrupt remapping registers. 
 */
 vtd_define_quad(s, DMAR_IRTA_REG, 0, 0xfffffffffffff80fULL, 0);
+
+ /* Page request registers */
+ if (s->ecap & VTD_ECAP_PRS) {
+ vtd_define_quad(s, DMAR_PQH_REG, 0, 0x7ffe0ULL, 0);
+ vtd_define_quad(s, DMAR_PQT_REG, 0, 0x7ffe0ULL, 0);
+ vtd_define_quad(s, DMAR_PQA_REG, 0, 0xfffffffffffff007ULL, 0);
+ vtd_define_long(s, DMAR_PRS_REG, 0, 0, 0x3UL);
+ vtd_define_long(s, DMAR_PECTL_REG, 0, 0x80000000UL, 0);
+ vtd_define_long(s, DMAR_PEDATA_REG, 0, 0xffffUL, 0);
+ vtd_define_long(s, DMAR_PEADDR_REG, 0, 0xfffffffcUL, 0);
+ vtd_define_long(s, DMAR_PEUADDR_REG, 0, 0xffffffffUL, 0);
+ }
 }
 /* Should not reset address_spaces when reset because devices will still use
@@ -4803,6 +4925,194 @@ static ssize_t vtd_ats_request_translation(PCIBus *bus, void *opaque,
 return res_index;
 }
+/* 11.4.11.3: The number of entries in the page request queue is 2^(PQS + 7) */
+static inline uint64_t vtd_prq_size(IntelIOMMUState *s)
+{
+ return 1ULL << ((vtd_get_quad(s, DMAR_PQA_REG) & VTD_PQA_SIZE) + 7);
+}
+
+/**
+ * Return true if the PRE bit is accessible and correctly set, false otherwise
+ */
+static bool vtd_check_pre_bit(VTDAddressSpace *vtd_as, hwaddr addr,
+ uint16_t sid, bool is_write)
+{
+ int ret;
+ IntelIOMMUState *s = vtd_as->iommu_state;
+ uint8_t bus_n = pci_bus_num(vtd_as->bus);
+ VTDContextEntry ce;
+ bool is_fpd_set = false;
+
+ ret = vtd_dev_to_context_entry(s, bus_n, vtd_as->devfn, &ce);
+
+ if (ret) {
+ goto error_report;
+ }
+
+ if (!VTD_CE_GET_PRE(&ce)) {
+ ret = -VTD_FR_SM_PRE_ABS;
+ goto error_get_fpd_and_report;
+ }
+
+ return true;
+
+error_get_fpd_and_report:
+ /* Try to get fpd (may not work but we are already on an error path) */
+ is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD;
+ vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set, vtd_as->pasid);
+error_report:
+ vtd_report_fault(s, -ret, is_fpd_set, sid, addr, is_write,
+ vtd_as->pasid != PCI_NO_PASID, vtd_as->pasid);
+ return false;
+}
+
+/* Logic described in section 7.5 */
+static void vtd_generate_page_request_event(IntelIOMMUState *s,
+ uint32_t old_pr_status)
+{
+ uint32_t current_pectl = vtd_get_long(s, DMAR_PECTL_REG);
+ /*
+ * Hardware evaluates the PPR and PRO fields in the Page Request Status
+ * Register and, if either of them is set, a Page Request Event is not
+ * generated
+ */
+ if (old_pr_status & (VTD_PR_STATUS_PRO | VTD_PR_STATUS_PPR)) {
+ return;
+ }
+
+ vtd_set_clear_mask_long(s, DMAR_PECTL_REG, 0, VTD_PR_PECTL_IP);
+ if (!(current_pectl & VTD_PR_PECTL_IM)) {
+ vtd_set_clear_mask_long(s, DMAR_PECTL_REG, VTD_PR_PECTL_IP, 0);
+ vtd_generate_interrupt(s, DMAR_PEADDR_REG, DMAR_PEDATA_REG);
+ }
+}
+
+/* When calling this function, we know that we are in scalable mode */
+static int vtd_pri_perform_implicit_invalidation(VTDAddressSpace *vtd_as,
+ hwaddr addr)
+{
+ IntelIOMMUState *s = vtd_as->iommu_state;
+ VTDContextEntry ce;
+ VTDPASIDEntry pe;
+ uint16_t pgtt;
+ uint16_t domain_id;
+ int ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
+ vtd_as->devfn, &ce);
+ if (ret) {
+ return -EINVAL;
+ }
+ ret = vtd_ce_get_rid2pasid_entry(s, &ce, &pe, vtd_as->pasid);
+ if (ret) {
+ return -EINVAL;
+ }
+ pgtt = VTD_PE_GET_TYPE(&pe);
+ domain_id = VTD_SM_PASID_ENTRY_DID(pe.val[1]);
+ ret = 0;
+ switch (pgtt) {
+ case VTD_SM_PASID_ENTRY_FLT:
+ vtd_piotlb_page_invalidate(s, domain_id, vtd_as->pasid, addr, 0);
+ break;
+ /* Room for other pgtt values */
+ default:
+ error_report_once("Translation type not supported yet: %d", pgtt);
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+/* Page Request Descriptor: 7.4.1.1 */
+static int
vtd_pri_request_page(PCIBus *bus, void *opaque, int devfn,
+ uint32_t pasid, bool priv_req, bool exec_req,
+ hwaddr addr, bool lpig, uint16_t prgi,
+ bool is_read, bool is_write)
+{
+ IntelIOMMUState *s = opaque;
+ VTDAddressSpace *vtd_as;
+
+ vtd_as = vtd_find_add_as(s, bus, devfn, pasid);
+
+ uint64_t queue_addr_reg = vtd_get_quad(s, DMAR_PQA_REG);
+ uint64_t queue_tail_offset_reg = vtd_get_quad(s, DMAR_PQT_REG);
+ uint64_t new_queue_tail_offset = (
+ (queue_tail_offset_reg + VTD_PQA_ENTRY_SIZE) %
+ (vtd_prq_size(s) * VTD_PQA_ENTRY_SIZE));
+ uint64_t queue_head_offset_reg = vtd_get_quad(s, DMAR_PQH_REG);
+ hwaddr queue_tail = (queue_addr_reg & VTD_PQA_ADDR) + queue_tail_offset_reg;
+ uint32_t old_pr_status = vtd_get_long(s, DMAR_PRS_REG);
+ uint16_t sid = PCI_BUILD_BDF(pci_bus_num(vtd_as->bus), vtd_as->devfn);
+ VTDPRDesc desc;
+
+ if (!(s->ecap & VTD_ECAP_PRS)) {
+ return -EPERM;
+ }
+
+ /*
+ * No need to check if scalable mode is enabled as we already know that
+ * VTD_ECAP_PRS is set (see vtd_decide_config)
+ */
+
+ /* We do not support PRI without PASID */
+ if (vtd_as->pasid == PCI_NO_PASID) {
+ return -EPERM;
+ }
+ if (exec_req && !is_read) {
+ return -EINVAL;
+ }
+
+ /* Check the PRE bit in the scalable mode context entry */
+ if (!vtd_check_pre_bit(vtd_as, addr, sid, is_write)) {
+ return -EPERM;
+ }
+
+ if (old_pr_status & VTD_PR_STATUS_PRO) {
+ /*
+ * No action is taken by hardware to report a fault
+ * or generate an event
+ */
+ return -ENOSPC;
+ }
+
+ /* Check for overflow */
+ if (new_queue_tail_offset == queue_head_offset_reg) {
+ vtd_set_clear_mask_long(s, DMAR_PRS_REG, 0, VTD_PR_STATUS_PRO);
+ vtd_generate_page_request_event(s, old_pr_status);
+ return -ENOSPC;
+ }
+
+ if (vtd_pri_perform_implicit_invalidation(vtd_as, addr)) {
+ return -EINVAL;
+ }
+
+ desc.lo = VTD_PRD_TYPE | VTD_PRD_PP(true) | VTD_PRD_RID(sid) |
+ VTD_PRD_PASID(vtd_as->pasid) | VTD_PRD_PMR(priv_req);
+ desc.hi = VTD_PRD_RDR(is_read) | VTD_PRD_WRR(is_write) |
+ VTD_PRD_LPIG(lpig) | VTD_PRD_PRGI(prgi) | VTD_PRD_ADDR(addr);
+
+ desc.lo = cpu_to_le64(desc.lo);
+ desc.hi = cpu_to_le64(desc.hi);
+ if (dma_memory_write(&address_space_memory, queue_tail, &desc, sizeof(desc),
+ MEMTXATTRS_UNSPECIFIED)) {
+ error_report_once("IO error, the PQ tail cannot be updated");
+ return -EIO;
+ }
+
+ /* Increment the tail register and set the pending request bit */
+ vtd_set_quad(s, DMAR_PQT_REG, new_queue_tail_offset);
+ /*
+ * Read the status again so that the kernel does not miss a request.
+ * In some cases we can trigger an unnecessary interrupt, but this strategy
+ * drastically improves performance as we don't need to take a lock.
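+ *
+ * Illustrative race this avoids: if the guest cleared PPR after the read
+ * of old_pr_status above but before the tail update, re-reading PRS
+ * guarantees PPR is set again and an event is generated for the new
+ * entry; the worst case is a spurious interrupt, never a lost request.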
+ */ + old_pr_status = vtd_get_long(s, DMAR_PRS_REG); + if (!(old_pr_status & VTD_PR_STATUS_PPR)) { + vtd_set_clear_mask_long(s, DMAR_PRS_REG, 0, VTD_PR_STATUS_PPR); + vtd_generate_page_request_event(s, old_pr_status); + } + + return 0; +} + static void vtd_init_iotlb_notifier(PCIBus *bus, void *opaque, int devfn, IOMMUNotifier *n, IOMMUNotify fn, void *user_opaque) @@ -4844,6 +5154,26 @@ static void vtd_unregister_iotlb_notifier(PCIBus *bus, void *opaque, memory_region_unregister_iommu_notifier(MEMORY_REGION(&vtd_as->iommu), n); } +static void vtd_pri_register_notifier(PCIBus *bus, void *opaque, int devfn, + uint32_t pasid, IOMMUPRINotifier *notifier) +{ + IntelIOMMUState *s = opaque; + VTDAddressSpace *vtd_as; + + vtd_as = vtd_find_add_as(s, bus, devfn, pasid); + vtd_as->pri_notifier = notifier; +} + +static void vtd_pri_unregister_notifier(PCIBus *bus, void *opaque, + int devfn, uint32_t pasid) +{ + IntelIOMMUState *s = opaque; + VTDAddressSpace *vtd_as; + + vtd_as = vtd_find_add_as(s, bus, devfn, pasid); + vtd_as->pri_notifier = NULL; +} + static PCIIOMMUOps vtd_iommu_ops = { .get_address_space = vtd_host_dma_iommu, .set_iommu_device = vtd_dev_set_iommu_device, @@ -4853,6 +5183,9 @@ static PCIIOMMUOps vtd_iommu_ops = { .register_iotlb_notifier = vtd_register_iotlb_notifier, .unregister_iotlb_notifier = vtd_unregister_iotlb_notifier, .ats_request_translation = vtd_ats_request_translation, + .pri_register_notifier = vtd_pri_register_notifier, + .pri_unregister_notifier = vtd_pri_unregister_notifier, + .pri_request_page = vtd_pri_request_page, }; static bool vtd_decide_config(IntelIOMMUState *s, Error **errp) @@ -4910,32 +5243,6 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error **errp) return true; } -static int vtd_machine_done_notify_one(Object *child, void *unused) -{ - IntelIOMMUState *iommu = INTEL_IOMMU_DEVICE(x86_iommu_get_default()); - - /* - * We hard-coded here because vfio-pci is the only special case - * here. Let's be more elegant in the future when we can, but so - * far there seems to be no better way. - */ - if (object_dynamic_cast(child, "vfio-pci") && !iommu->caching_mode) { - vtd_panic_require_caching_mode(); - } - - return 0; -} - -static void vtd_machine_done_hook(Notifier *notifier, void *unused) -{ - object_child_foreach_recursive(object_get_root(), - vtd_machine_done_notify_one, NULL); -} - -static Notifier vtd_machine_done_notify = { - .notify = vtd_machine_done_hook, -}; - static void vtd_realize(DeviceState *dev, Error **errp) { MachineState *ms = MACHINE(qdev_get_machine()); @@ -4990,7 +5297,6 @@ static void vtd_realize(DeviceState *dev, Error **errp) pci_setup_iommu(bus, &vtd_iommu_ops, dev); /* Pseudo address space under root PCI bus. 
*/ x86ms->ioapic_as = vtd_host_dma_iommu(bus, s, Q35_PSEUDO_DEVFN_IOAPIC); - qemu_add_machine_init_done_notifier(&vtd_machine_done_notify); } static void vtd_class_init(ObjectClass *klass, const void *data) diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h index 360e937989..0f6a1237e4 100644 --- a/hw/i386/intel_iommu_internal.h +++ b/hw/i386/intel_iommu_internal.h @@ -190,6 +190,7 @@ #define VTD_ECAP_EIM (1ULL << 4) #define VTD_ECAP_PT (1ULL << 6) #define VTD_ECAP_SC (1ULL << 7) +#define VTD_ECAP_PRS (1ULL << 29) #define VTD_ECAP_MHMV (15ULL << 20) #define VTD_ECAP_SRS (1ULL << 31) #define VTD_ECAP_PSS (7ULL << 35) /* limit: MemTxAttrs::pid */ @@ -214,6 +215,7 @@ #define VTD_CAP_DRAIN_WRITE (1ULL << 54) #define VTD_CAP_DRAIN_READ (1ULL << 55) #define VTD_CAP_FS1GP (1ULL << 56) +#define VTD_CAP_ESRTPS (1ULL << 63) #define VTD_CAP_DRAIN (VTD_CAP_DRAIN_READ | VTD_CAP_DRAIN_WRITE) #define VTD_CAP_CM (1ULL << 7) #define VTD_PASID_ID_SHIFT 20 @@ -314,6 +316,8 @@ typedef enum VTDFaultReason { * request while disabled */ VTD_FR_IR_SID_ERR = 0x26, /* Invalid Source-ID */ + VTD_FR_SM_PRE_ABS = 0x47, /* SCT.8 : PRE bit in a present SM CE is 0 */ + /* PASID directory entry access failure */ VTD_FR_PASID_DIR_ACCESS_ERR = 0x50, /* The Present(P) field of pasid directory entry is 0 */ @@ -376,6 +380,18 @@ union VTDInvDesc { }; typedef union VTDInvDesc VTDInvDesc; +/* Page Request Descriptor */ +union VTDPRDesc { + struct { + uint64_t lo; + uint64_t hi; + }; + struct { + uint64_t val[4]; + }; +}; +typedef union VTDPRDesc VTDPRDesc; + /* Masks for struct VTDInvDesc */ #define VTD_INV_DESC_ALL_ONE -1ULL #define VTD_INV_DESC_TYPE(val) ((((val) >> 5) & 0x70ULL) | \ @@ -389,6 +405,7 @@ typedef union VTDInvDesc VTDInvDesc; #define VTD_INV_DESC_PIOTLB 0x6 /* PASID-IOTLB Invalidate Desc */ #define VTD_INV_DESC_PC 0x7 /* PASID-cache Invalidate Desc */ #define VTD_INV_DESC_DEV_PIOTLB 0x8 /* PASID-based-DIOTLB inv_desc*/ +#define VTD_INV_DESC_PGRESP 0x9 /* Page Group Response Desc */ #define VTD_INV_DESC_NONE 0 /* Not an Invalidate Descriptor */ /* Masks for Invalidation Wait Descriptor*/ @@ -440,6 +457,15 @@ typedef union VTDInvDesc VTDInvDesc; #define VTD_INV_DESC_PASID_DEVICE_IOTLB_RSVD_VAL0 0xfff000000000f000ULL #define VTD_INV_DESC_PASID_DEVICE_IOTLB_RSVD_VAL1 0x7feULL +/* Mask for Page Group Response Descriptor */ +#define VTD_INV_DESC_PGRESP_RSVD_HI 0xfffffffffffff003ULL +#define VTD_INV_DESC_PGRESP_RSVD_LO 0xfff00000000001e0ULL +#define VTD_INV_DESC_PGRESP_PP(val) (((val) >> 4) & 0x1ULL) +#define VTD_INV_DESC_PGRESP_RC(val) (((val) >> 12) & 0xfULL) +#define VTD_INV_DESC_PGRESP_RID(val) (((val) >> 16) & 0xffffULL) +#define VTD_INV_DESC_PGRESP_PASID(val) (((val) >> 32) & 0xfffffULL) +#define VTD_INV_DESC_PGRESP_PRGI(val) (((val) >> 3) & 0x1ffULL) + /* Rsvd field masks for spte */ #define VTD_SPTE_SNP 0x800ULL @@ -491,6 +517,31 @@ typedef union VTDInvDesc VTDInvDesc; #define VTD_INV_DESC_PIOTLB_RSVD_VAL0 0xfff000000000f1c0ULL #define VTD_INV_DESC_PIOTLB_RSVD_VAL1 0xf80ULL +/* Page Request Descriptor */ +/* For the low 64-bit of 128-bit */ +#define VTD_PRD_TYPE (1ULL) +#define VTD_PRD_PP(val) (((val) & 1ULL) << 8) +#define VTD_PRD_RID(val) (((val) & 0xffffULL) << 16) +#define VTD_PRD_PASID(val) (((val) & 0xfffffULL) << 32) +#define VTD_PRD_EXR(val) (((val) & 1ULL) << 52) +#define VTD_PRD_PMR(val) (((val) & 1ULL) << 53) +/* For the high 64-bit of 128-bit */ +#define VTD_PRD_RDR(val) ((val) & 1ULL) +#define VTD_PRD_WRR(val) (((val) & 1ULL) << 1) +#define VTD_PRD_LPIG(val) (((val) & 1ULL) 
<< 2)
+#define VTD_PRD_PRGI(val) (((val) & 0x1ffULL) << 3)
+#define VTD_PRD_ADDR(val) ((val) & 0xfffffffffffff000ULL)
+
+/* Page Request Queue constants */
+#define VTD_PQA_ENTRY_SIZE 32 /* Size of an entry in bytes */
+/* Page Request Queue masks */
+#define VTD_PQA_ADDR 0xfffffffffffff000ULL /* PR queue address */
+#define VTD_PQA_SIZE 0x7ULL /* PR queue size */
+#define VTD_PR_STATUS_PPR 1UL /* Pending page request */
+#define VTD_PR_STATUS_PRO 2UL /* Page request overflow */
+#define VTD_PR_PECTL_IP 0x40000000UL /* PR control interrupt pending */
+#define VTD_PR_PECTL_IM 0x80000000UL /* PR control interrupt mask */
+
 /* Information about page-selective IOTLB invalidate */
 struct VTDIOTLBPageInvInfo {
 uint16_t domain_id;
@@ -550,6 +601,7 @@ typedef struct VTDRootEntry VTDRootEntry;
 #define VTD_SM_CONTEXT_ENTRY_RID2PASID_MASK 0xfffff
 #define VTD_SM_CONTEXT_ENTRY_RSVD_VAL0(aw) (0x1e0ULL | ~VTD_HAW_MASK(aw))
 #define VTD_SM_CONTEXT_ENTRY_RSVD_VAL1 0xffffffffffe00000ULL
+#define VTD_SM_CONTEXT_ENTRY_PRE 0x10ULL
 /* PASID Table Related Definitions */
 #define VTD_PASID_DIR_BASE_ADDR_MASK (~0xfffULL)
diff --git a/hw/i386/pc.c b/hw/i386/pc.c index bc048a6d13..34b00663f2 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c
@@ -837,6 +837,7 @@ void pc_memory_init(PCMachineState *pcms,
 hwaddr maxphysaddr, maxusedaddr;
 hwaddr cxl_base, cxl_resv_end = 0;
 X86CPU *cpu = X86_CPU(first_cpu);
+ uint64_t res_mem_end;
 assert(machine->ram_size == x86ms->below_4g_mem_size + x86ms->above_4g_mem_size);
@@ -978,16 +979,17 @@ void pc_memory_init(PCMachineState *pcms,
 rom_set_fw(fw_cfg);
- if (machine->device_memory) {
- uint64_t *val = g_malloc(sizeof(*val));
- uint64_t res_mem_end;
+ if (pcms->cxl_devices_state.is_enabled) {
+ res_mem_end = cxl_resv_end;
+ } else if (machine->device_memory) {
+ res_mem_end = machine->device_memory->base +
+ memory_region_size(&machine->device_memory->mr);
+ } else {
+ res_mem_end = 0;
+ }
- if (pcms->cxl_devices_state.is_enabled) {
- res_mem_end = cxl_resv_end;
- } else {
- res_mem_end = machine->device_memory->base
- + memory_region_size(&machine->device_memory->mr);
- }
+ if (res_mem_end) {
+ uint64_t *val = g_malloc(sizeof(*val));
 *val = cpu_to_le64(ROUND_UP(res_mem_end, 1 * GiB));
 fw_cfg_add_file(fw_cfg, "etc/reserved-memory-end", val, sizeof(*val));
 }
@@ -1720,25 +1722,6 @@ static void pc_machine_wakeup(MachineState *machine)
 cpu_synchronize_all_post_reset();
 }
-static bool pc_hotplug_allowed(MachineState *ms, DeviceState *dev, Error **errp)
-{
- X86IOMMUState *iommu = x86_iommu_get_default();
- IntelIOMMUState *intel_iommu;
-
- if (iommu &&
- object_dynamic_cast((Object *)iommu, TYPE_INTEL_IOMMU_DEVICE) &&
- object_dynamic_cast((Object *)dev, "vfio-pci")) {
- intel_iommu = INTEL_IOMMU_DEVICE(iommu);
- if (!intel_iommu->caching_mode) {
- error_setg(errp, "Device assignment is not allowed without "
- "enabling caching-mode=on for Intel IOMMU.");
- return false;
- }
- }
-
- return true;
-}
-
 static void pc_machine_class_init(ObjectClass *oc, const void *data)
 {
 MachineClass *mc = MACHINE_CLASS(oc);
@@ -1758,7 +1741,6 @@ static void pc_machine_class_init(ObjectClass *oc, const void *data)
 x86mc->apic_xrupt_override = true;
 assert(!mc->get_hotplug_handler);
 mc->get_hotplug_handler = pc_get_hotplug_handler;
- mc->hotplug_allowed = pc_hotplug_allowed;
 mc->auto_enable_numa_with_memhp = true;
 mc->auto_enable_numa_with_memdev = true;
 mc->has_hotpluggable_cpus = true;
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index caf8bab68e..7b3611e973 100644 --- a/hw/i386/pc_piix.c +++
b/hw/i386/pc_piix.c @@ -448,6 +448,7 @@ DEFINE_I440FX_MACHINE_AS_LATEST(10, 2); static void pc_i440fx_machine_10_1_options(MachineClass *m) { pc_i440fx_machine_10_2_options(m); + m->smbios_memory_device_size = 2047 * TiB; compat_props_add(m->compat_props, hw_compat_10_1, hw_compat_10_1_len); compat_props_add(m->compat_props, pc_compat_10_1, pc_compat_10_1_len); } diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index e89951285e..6015e639d7 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -384,6 +384,7 @@ DEFINE_Q35_MACHINE_AS_LATEST(10, 2); static void pc_q35_machine_10_1_options(MachineClass *m) { pc_q35_machine_10_2_options(m); + m->smbios_memory_device_size = 2047 * TiB; compat_props_add(m->compat_props, hw_compat_10_1, hw_compat_10_1_len); compat_props_add(m->compat_props, pc_compat_10_1, pc_compat_10_1_len); } diff --git a/hw/i386/x86-iommu.c b/hw/i386/x86-iommu.c index d34a6849f4..c127a44bb4 100644 --- a/hw/i386/x86-iommu.c +++ b/hw/i386/x86-iommu.c @@ -130,6 +130,7 @@ static const Property x86_iommu_properties[] = { intr_supported, ON_OFF_AUTO_AUTO), DEFINE_PROP_BOOL("device-iotlb", X86IOMMUState, dt_supported, false), DEFINE_PROP_BOOL("pt", X86IOMMUState, pt_supported, true), + DEFINE_PROP_BOOL("dma-translation", X86IOMMUState, dma_translation, true), }; static void x86_iommu_class_init(ObjectClass *klass, const void *data) diff --git a/hw/isa/lpc_ich9.c b/hw/isa/lpc_ich9.c index 304dffac32..c9cb8f7779 100644 --- a/hw/isa/lpc_ich9.c +++ b/hw/isa/lpc_ich9.c @@ -132,6 +132,11 @@ static void ich9_cc_init(ICH9LPCState *lpc) static void ich9_cc_reset(ICH9LPCState *lpc) { uint8_t *c = lpc->chip_config; + uint32_t gcs = ICH9_CC_GCS_DEFAULT; + + if (lpc->pin_strap.spkr_hi) { + gcs |= ICH9_CC_GCS_NO_REBOOT; + } memset(lpc->chip_config, 0, sizeof(lpc->chip_config)); @@ -142,7 +147,7 @@ static void ich9_cc_reset(ICH9LPCState *lpc) pci_set_long(c + ICH9_CC_D27IR, ICH9_CC_DIR_DEFAULT); pci_set_long(c + ICH9_CC_D26IR, ICH9_CC_DIR_DEFAULT); pci_set_long(c + ICH9_CC_D25IR, ICH9_CC_DIR_DEFAULT); - pci_set_long(c + ICH9_CC_GCS, ICH9_CC_GCS_DEFAULT); + pci_set_long(c + ICH9_CC_GCS, gcs); ich9_cc_update(lpc); } diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c index 06657bb3ac..8fef598b49 100644 --- a/hw/net/e1000e_core.c +++ b/hw/net/e1000e_core.c @@ -2822,8 +2822,9 @@ e1000e_update_rx_offloads(E1000ECore *core) trace_e1000e_rx_set_cso(cso_state); if (core->has_vnet) { - qemu_set_offload(qemu_get_queue(core->owner_nic)->peer, - cso_state, 0, 0, 0, 0, 0, 0); + NetOffloads ol = { .csum = cso_state }; + + qemu_set_offload(qemu_get_queue(core->owner_nic)->peer, &ol); } } diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c index 39e3ce1c8f..45d8fd795b 100644 --- a/hw/net/igb_core.c +++ b/hw/net/igb_core.c @@ -3058,8 +3058,9 @@ igb_update_rx_offloads(IGBCore *core) trace_e1000e_rx_set_cso(cso_state); if (core->has_vnet) { - qemu_set_offload(qemu_get_queue(core->owner_nic)->peer, - cso_state, 0, 0, 0, 0, 0, 0); + NetOffloads ol = {.csum = cso_state }; + + qemu_set_offload(qemu_get_queue(core->owner_nic)->peer, &ol); } } diff --git a/hw/net/igbvf.c b/hw/net/igbvf.c index 31d72c4977..9b0db8f841 100644 --- a/hw/net/igbvf.c +++ b/hw/net/igbvf.c @@ -251,10 +251,12 @@ static void igbvf_pci_realize(PCIDevice *dev, Error **errp) memory_region_init_io(&s->mmio, OBJECT(dev), &mmio_ops, s, "igbvf-mmio", IGBVF_MMIO_SIZE); - pcie_sriov_vf_register_bar(dev, IGBVF_MMIO_BAR_IDX, &s->mmio); + pci_register_bar(dev, IGBVF_MMIO_BAR_IDX, PCI_BASE_ADDRESS_MEM_TYPE_64 | + PCI_BASE_ADDRESS_MEM_PREFETCH, &s->mmio); 
memory_region_init(&s->msix, OBJECT(dev), "igbvf-msix", IGBVF_MSIX_SIZE); - pcie_sriov_vf_register_bar(dev, IGBVF_MSIX_BAR_IDX, &s->msix); + pci_register_bar(dev, IGBVF_MSIX_BAR_IDX, PCI_BASE_ADDRESS_MEM_TYPE_64 | + PCI_BASE_ADDRESS_MEM_PREFETCH, &s->msix); ret = msix_init(dev, IGBVF_MSIX_VEC_NUM, &s->msix, IGBVF_MSIX_BAR_IDX, 0, &s->msix, IGBVF_MSIX_BAR_IDX, 0x2000, 0x70, errp); diff --git a/hw/net/vhost_net-stub.c b/hw/net/vhost_net-stub.c index 7d49f82906..0740d5a2eb 100644 --- a/hw/net/vhost_net-stub.c +++ b/hw/net/vhost_net-stub.c @@ -46,9 +46,8 @@ void vhost_net_cleanup(struct vhost_net *net) { } -uint64_t vhost_net_get_features(struct vhost_net *net, uint64_t features) +void vhost_net_get_features_ex(struct vhost_net *net, uint64_t *features) { - return features; } int vhost_net_get_config(struct vhost_net *net, uint8_t *config, @@ -62,13 +61,12 @@ int vhost_net_set_config(struct vhost_net *net, const uint8_t *data, return 0; } -void vhost_net_ack_features(struct vhost_net *net, uint64_t features) +void vhost_net_ack_features_ex(struct vhost_net *net, const uint64_t *features) { } -uint64_t vhost_net_get_acked_features(VHostNetState *net) +void vhost_net_get_acked_features_ex(VHostNetState *net, uint64_t *features) { - return 0; } bool vhost_net_virtqueue_pending(VHostNetState *net, int idx) diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c index 540492b37d..a8ee18a912 100644 --- a/hw/net/vhost_net.c +++ b/hw/net/vhost_net.c @@ -35,10 +35,9 @@ #include "hw/virtio/virtio-bus.h" #include "linux-headers/linux/vhost.h" -uint64_t vhost_net_get_features(struct vhost_net *net, uint64_t features) +void vhost_net_get_features_ex(struct vhost_net *net, uint64_t *features) { - return vhost_get_features(&net->dev, net->feature_bits, - features); + vhost_get_features_ex(&net->dev, net->feature_bits, features); } int vhost_net_get_config(struct vhost_net *net, uint8_t *config, uint32_t config_len) @@ -51,10 +50,11 @@ int vhost_net_set_config(struct vhost_net *net, const uint8_t *data, return vhost_dev_set_config(&net->dev, data, offset, size, flags); } -void vhost_net_ack_features(struct vhost_net *net, uint64_t features) +void vhost_net_ack_features_ex(struct vhost_net *net, const uint64_t *features) { - net->dev.acked_features = net->dev.backend_features; - vhost_ack_features(&net->dev, net->feature_bits, features); + virtio_features_copy(net->dev.acked_features_ex, + net->dev.backend_features_ex); + vhost_ack_features_ex(&net->dev, net->feature_bits, features); } uint64_t vhost_net_get_max_queues(VHostNetState *net) @@ -62,9 +62,9 @@ uint64_t vhost_net_get_max_queues(VHostNetState *net) return net->dev.max_queues; } -uint64_t vhost_net_get_acked_features(VHostNetState *net) +void vhost_net_get_acked_features_ex(VHostNetState *net, uint64_t *features) { - return net->dev.acked_features; + virtio_features_copy(features, net->dev.acked_features_ex); } void vhost_net_save_acked_features(NetClientState *nc) @@ -234,7 +234,8 @@ struct vhost_net *vhost_net_init(VhostNetOptions *options) int r; bool backend_kernel = options->backend_type == VHOST_BACKEND_TYPE_KERNEL; struct vhost_net *net = g_new0(struct vhost_net, 1); - uint64_t features = 0; + uint64_t missing_features[VIRTIO_FEATURES_NU64S]; + uint64_t features[VIRTIO_FEATURES_NU64S]; Error *local_err = NULL; if (!options->net_backend) { @@ -247,6 +248,7 @@ struct vhost_net *vhost_net_init(VhostNetOptions *options) net->save_acked_features = options->save_acked_features; net->max_tx_queue_size = options->max_tx_queue_size; net->is_vhost_user = 
options->is_vhost_user; + virtio_features_clear(features); net->dev.max_queues = 1; net->dev.vqs = net->vqs; @@ -261,7 +263,7 @@ struct vhost_net *vhost_net_init(VhostNetOptions *options) net->backend = r; net->dev.protocol_features = 0; } else { - net->dev.backend_features = 0; + virtio_features_clear(net->dev.backend_features_ex); net->dev.protocol_features = 0; net->backend = -1; @@ -281,26 +283,29 @@ struct vhost_net *vhost_net_init(VhostNetOptions *options) sizeof(struct virtio_net_hdr_mrg_rxbuf))) { net->dev.features &= ~(1ULL << VIRTIO_NET_F_MRG_RXBUF); } - if (~net->dev.features & net->dev.backend_features) { - fprintf(stderr, "vhost lacks feature mask 0x%" PRIx64 - " for backend\n", - (uint64_t)(~net->dev.features & net->dev.backend_features)); + + if (virtio_features_andnot(missing_features, + net->dev.backend_features_ex, + net->dev.features_ex)) { + fprintf(stderr, "vhost lacks feature mask 0x" VIRTIO_FEATURES_FMT + " for backend\n", VIRTIO_FEATURES_PR(missing_features)); goto fail; } } /* Set sane init value. Override when guest acks. */ if (options->get_acked_features) { - features = options->get_acked_features(net->nc); - if (~net->dev.features & features) { - fprintf(stderr, "vhost lacks feature mask 0x%" PRIx64 - " for backend\n", - (uint64_t)(~net->dev.features & features)); + virtio_features_from_u64(features, + options->get_acked_features(net->nc)); + if (virtio_features_andnot(missing_features, features, + net->dev.features_ex)) { + fprintf(stderr, "vhost lacks feature mask 0x" VIRTIO_FEATURES_FMT + " for backend\n", VIRTIO_FEATURES_PR(missing_features)); goto fail; } } - vhost_net_ack_features(net, features); + vhost_net_ack_features_ex(net, features); return net; diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 7848e26278..33116712eb 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -90,6 +90,25 @@ VIRTIO_NET_RSS_HASH_TYPE_TCP_EX | \ VIRTIO_NET_RSS_HASH_TYPE_UDP_EX) +/* + * Features starting from VIRTIO_NET_FEATURES_MAP_MIN bit correspond + * to guest offloads in the VIRTIO_NET_OFFLOAD_MAP range + */ +#define VIRTIO_NET_OFFLOAD_MAP_MIN 46 +#define VIRTIO_NET_OFFLOAD_MAP_LENGTH 4 +#define VIRTIO_NET_OFFLOAD_MAP MAKE_64BIT_MASK( \ + VIRTIO_NET_OFFLOAD_MAP_MIN, \ + VIRTIO_NET_OFFLOAD_MAP_LENGTH) +#define VIRTIO_NET_FEATURES_MAP_MIN 65 +#define VIRTIO_NET_F2O_SHIFT (VIRTIO_NET_OFFLOAD_MAP_MIN - \ + VIRTIO_NET_FEATURES_MAP_MIN + 64) + +static bool virtio_has_tunnel_hdr(const uint64_t *features) +{ + return virtio_has_feature_ex(features, VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO) || + virtio_has_feature_ex(features, VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO); +} + static const VirtIOFeature feature_sizes[] = { {.flags = 1ULL << VIRTIO_NET_F_MAC, .end = endof(struct virtio_net_config, mac)}, @@ -636,8 +655,18 @@ static int peer_has_uso(VirtIONet *n) return qemu_has_uso(qemu_get_queue(n->nic)->peer); } +static bool peer_has_tunnel(VirtIONet *n) +{ + if (!peer_has_vnet_hdr(n)) { + return false; + } + + return qemu_has_tunnel(qemu_get_queue(n->nic)->peer); +} + static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int mergeable_rx_bufs, - int version_1, int hash_report) + int version_1, int hash_report, + int tunnel) { int i; NetClientState *nc; @@ -645,9 +674,11 @@ static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int mergeable_rx_bufs, n->mergeable_rx_bufs = mergeable_rx_bufs; if (version_1) { - n->guest_hdr_len = hash_report ? - sizeof(struct virtio_net_hdr_v1_hash) : - sizeof(struct virtio_net_hdr_mrg_rxbuf); + n->guest_hdr_len = tunnel ? 
+ sizeof(struct virtio_net_hdr_v1_hash_tunnel) : + (hash_report ? + sizeof(struct virtio_net_hdr_v1_hash) : + sizeof(struct virtio_net_hdr_mrg_rxbuf)); n->rss_data.populate_hash = !!hash_report; } else { n->guest_hdr_len = n->mergeable_rx_bufs ? @@ -773,17 +804,31 @@ static uint64_t virtio_net_bad_features(VirtIODevice *vdev) static void virtio_net_apply_guest_offloads(VirtIONet *n) { - qemu_set_offload(qemu_get_queue(n->nic)->peer, - !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_CSUM)), - !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_TSO4)), - !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_TSO6)), - !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_ECN)), - !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_UFO)), - !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO4)), - !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO6))); + NetOffloads ol = { + .csum = !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_CSUM)), + .tso4 = !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_TSO4)), + .tso6 = !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_TSO6)), + .ecn = !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_ECN)), + .ufo = !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_UFO)), + .uso4 = !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO4)), + .uso6 = !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO6)), + .tnl = !!(n->curr_guest_offloads & + (1ULL << VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_MAPPED)), + .tnl_csum = !!(n->curr_guest_offloads & + (1ULL << VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_CSUM_MAPPED)), + }; + + qemu_set_offload(qemu_get_queue(n->nic)->peer, &ol); +} + +static uint64_t virtio_net_features_to_offload(const uint64_t *features) +{ + return (features[0] & ~VIRTIO_NET_OFFLOAD_MAP) | + ((features[1] << VIRTIO_NET_F2O_SHIFT) & VIRTIO_NET_OFFLOAD_MAP); } -static uint64_t virtio_net_guest_offloads_by_features(uint64_t features) +static uint64_t +virtio_net_guest_offloads_by_features(const uint64_t *features) { static const uint64_t guest_offloads_mask = (1ULL << VIRTIO_NET_F_GUEST_CSUM) | @@ -792,15 +837,17 @@ static uint64_t virtio_net_guest_offloads_by_features(uint64_t features) (1ULL << VIRTIO_NET_F_GUEST_ECN) | (1ULL << VIRTIO_NET_F_GUEST_UFO) | (1ULL << VIRTIO_NET_F_GUEST_USO4) | - (1ULL << VIRTIO_NET_F_GUEST_USO6); + (1ULL << VIRTIO_NET_F_GUEST_USO6) | + (1ULL << VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_MAPPED) | + (1ULL << VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_CSUM_MAPPED); - return guest_offloads_mask & features; + return guest_offloads_mask & virtio_net_features_to_offload(features); } uint64_t virtio_net_supported_guest_offloads(const VirtIONet *n) { VirtIODevice *vdev = VIRTIO_DEVICE(n); - return virtio_net_guest_offloads_by_features(vdev->guest_features); + return virtio_net_guest_offloads_by_features(vdev->guest_features_ex); } typedef struct { @@ -879,34 +926,40 @@ static void failover_add_primary(VirtIONet *n, Error **errp) error_propagate(errp, err); } -static void virtio_net_set_features(VirtIODevice *vdev, uint64_t features) +static void virtio_net_set_features(VirtIODevice *vdev, + const uint64_t *in_features) { + uint64_t features[VIRTIO_FEATURES_NU64S]; VirtIONet *n = VIRTIO_NET(vdev); Error *err = NULL; int i; + virtio_features_copy(features, in_features); if (n->mtu_bypass_backend && !virtio_has_feature(vdev->backend_features, VIRTIO_NET_F_MTU)) { - features &= ~(1ULL << VIRTIO_NET_F_MTU); + virtio_clear_feature_ex(features, VIRTIO_NET_F_MTU); } 
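+ /*
+  * Illustrative sketch (assumed semantics): the _ex helpers operate on
+  * arrays of VIRTIO_FEATURES_NU64S uint64_t words rather than a single
+  * uint64_t, so feature bits at position 64 and above can be addressed:
+  *
+  *     uint64_t f[VIRTIO_FEATURES_NU64S];
+  *     virtio_features_clear(f);
+  *     virtio_add_feature_ex(f, VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO);
+  *     assert(virtio_has_feature_ex(f, VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO));
+  */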
virtio_net_set_multiqueue(n, - virtio_has_feature(features, VIRTIO_NET_F_RSS) || - virtio_has_feature(features, VIRTIO_NET_F_MQ)); + virtio_has_feature_ex(features, + VIRTIO_NET_F_RSS) || + virtio_has_feature_ex(features, + VIRTIO_NET_F_MQ)); virtio_net_set_mrg_rx_bufs(n, - virtio_has_feature(features, + virtio_has_feature_ex(features, VIRTIO_NET_F_MRG_RXBUF), - virtio_has_feature(features, + virtio_has_feature_ex(features, VIRTIO_F_VERSION_1), - virtio_has_feature(features, - VIRTIO_NET_F_HASH_REPORT)); + virtio_has_feature_ex(features, + VIRTIO_NET_F_HASH_REPORT), + virtio_has_tunnel_hdr(features)); - n->rsc4_enabled = virtio_has_feature(features, VIRTIO_NET_F_RSC_EXT) && - virtio_has_feature(features, VIRTIO_NET_F_GUEST_TSO4); - n->rsc6_enabled = virtio_has_feature(features, VIRTIO_NET_F_RSC_EXT) && - virtio_has_feature(features, VIRTIO_NET_F_GUEST_TSO6); - n->rss_data.redirect = virtio_has_feature(features, VIRTIO_NET_F_RSS); + n->rsc4_enabled = virtio_has_feature_ex(features, VIRTIO_NET_F_RSC_EXT) && + virtio_has_feature_ex(features, VIRTIO_NET_F_GUEST_TSO4); + n->rsc6_enabled = virtio_has_feature_ex(features, VIRTIO_NET_F_RSC_EXT) && + virtio_has_feature_ex(features, VIRTIO_NET_F_GUEST_TSO6); + n->rss_data.redirect = virtio_has_feature_ex(features, VIRTIO_NET_F_RSS); if (n->has_vnet_hdr) { n->curr_guest_offloads = @@ -920,7 +973,7 @@ static void virtio_net_set_features(VirtIODevice *vdev, uint64_t features) if (!get_vhost_net(nc->peer)) { continue; } - vhost_net_ack_features(get_vhost_net(nc->peer), features); + vhost_net_ack_features_ex(get_vhost_net(nc->peer), features); /* * keep acked_features in NetVhostUserState up-to-date so it @@ -929,12 +982,14 @@ static void virtio_net_set_features(VirtIODevice *vdev, uint64_t features) vhost_net_save_acked_features(nc->peer); } - if (virtio_has_feature(vdev->guest_features ^ features, VIRTIO_NET_F_CTRL_VLAN)) { - bool vlan = virtio_has_feature(features, VIRTIO_NET_F_CTRL_VLAN); + if (virtio_has_feature_ex(features, VIRTIO_NET_F_CTRL_VLAN) != + virtio_has_feature_ex(vdev->guest_features_ex, + VIRTIO_NET_F_CTRL_VLAN)) { + bool vlan = virtio_has_feature_ex(features, VIRTIO_NET_F_CTRL_VLAN); memset(n->vlans, vlan ? 
0 : 0xff, MAX_VLAN >> 3); } - if (virtio_has_feature(features, VIRTIO_NET_F_STANDBY)) { + if (virtio_has_feature_ex(features, VIRTIO_NET_F_STANDBY)) { qapi_event_send_failover_negotiated(n->netclient_name); qatomic_set(&n->failover_primary_hidden, false); failover_add_primary(n, &err); @@ -1905,10 +1960,10 @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf, virtio_error(vdev, "virtio-net unexpected empty queue: " "i %zd mergeable %d offset %zd, size %zd, " "guest hdr len %zd, host hdr len %zd " - "guest features 0x%" PRIx64, + "guest features 0x" VIRTIO_FEATURES_FMT, i, n->mergeable_rx_bufs, offset, size, n->guest_hdr_len, n->host_hdr_len, - vdev->guest_features); + VIRTIO_FEATURES_PR(vdev->guest_features_ex)); } err = -1; goto err; @@ -3015,8 +3070,8 @@ static int virtio_net_pre_load_queues(VirtIODevice *vdev, uint32_t n) return 0; } -static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features, - Error **errp) +static void virtio_net_get_features(VirtIODevice *vdev, uint64_t *features, + Error **errp) { VirtIONet *n = VIRTIO_NET(vdev); NetClientState *nc = qemu_get_queue(n->nic); @@ -3030,68 +3085,83 @@ static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features, (supported_hash_types & peer_hash_types) == supported_hash_types; /* Firstly sync all virtio-net possible supported features */ - features |= n->host_features; + virtio_features_or(features, features, n->host_features_ex); - virtio_add_feature(&features, VIRTIO_NET_F_MAC); + virtio_add_feature_ex(features, VIRTIO_NET_F_MAC); if (!peer_has_vnet_hdr(n)) { - virtio_clear_feature(&features, VIRTIO_NET_F_CSUM); - virtio_clear_feature(&features, VIRTIO_NET_F_HOST_TSO4); - virtio_clear_feature(&features, VIRTIO_NET_F_HOST_TSO6); - virtio_clear_feature(&features, VIRTIO_NET_F_HOST_ECN); + virtio_clear_feature_ex(features, VIRTIO_NET_F_CSUM); + virtio_clear_feature_ex(features, VIRTIO_NET_F_HOST_TSO4); + virtio_clear_feature_ex(features, VIRTIO_NET_F_HOST_TSO6); + virtio_clear_feature_ex(features, VIRTIO_NET_F_HOST_ECN); + + virtio_clear_feature_ex(features, VIRTIO_NET_F_GUEST_CSUM); + virtio_clear_feature_ex(features, VIRTIO_NET_F_GUEST_TSO4); + virtio_clear_feature_ex(features, VIRTIO_NET_F_GUEST_TSO6); + virtio_clear_feature_ex(features, VIRTIO_NET_F_GUEST_ECN); - virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_CSUM); - virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_TSO4); - virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_TSO6); - virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_ECN); + virtio_clear_feature_ex(features, VIRTIO_NET_F_HOST_USO); + virtio_clear_feature_ex(features, VIRTIO_NET_F_GUEST_USO4); + virtio_clear_feature_ex(features, VIRTIO_NET_F_GUEST_USO6); - virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO); - virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4); - virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6); + virtio_clear_feature_ex(features, VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO); + virtio_clear_feature_ex(features, VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO); + virtio_clear_feature_ex(features, + VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_CSUM); + virtio_clear_feature_ex(features, + VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO_CSUM); - virtio_clear_feature(&features, VIRTIO_NET_F_HASH_REPORT); + virtio_clear_feature_ex(features, VIRTIO_NET_F_HASH_REPORT); } if (!peer_has_vnet_hdr(n) || !peer_has_ufo(n)) { - virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_UFO); - virtio_clear_feature(&features, VIRTIO_NET_F_HOST_UFO); + virtio_clear_feature_ex(features, 
VIRTIO_NET_F_GUEST_UFO); + virtio_clear_feature_ex(features, VIRTIO_NET_F_HOST_UFO); } - if (!peer_has_uso(n)) { - virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO); - virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4); - virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6); + virtio_clear_feature_ex(features, VIRTIO_NET_F_HOST_USO); + virtio_clear_feature_ex(features, VIRTIO_NET_F_GUEST_USO4); + virtio_clear_feature_ex(features, VIRTIO_NET_F_GUEST_USO6); + } + + if (!peer_has_tunnel(n)) { + virtio_clear_feature_ex(features, VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO); + virtio_clear_feature_ex(features, VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO); + virtio_clear_feature_ex(features, + VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_CSUM); + virtio_clear_feature_ex(features, + VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO_CSUM); } if (!get_vhost_net(nc->peer)) { if (!use_own_hash) { - virtio_clear_feature(&features, VIRTIO_NET_F_HASH_REPORT); - virtio_clear_feature(&features, VIRTIO_NET_F_RSS); - } else if (virtio_has_feature(features, VIRTIO_NET_F_RSS)) { + virtio_clear_feature_ex(features, VIRTIO_NET_F_HASH_REPORT); + virtio_clear_feature_ex(features, VIRTIO_NET_F_RSS); + } else if (virtio_has_feature_ex(features, VIRTIO_NET_F_RSS)) { virtio_net_load_ebpf(n, errp); } - return features; + return; } if (!use_peer_hash) { - virtio_clear_feature(&features, VIRTIO_NET_F_HASH_REPORT); + virtio_clear_feature_ex(features, VIRTIO_NET_F_HASH_REPORT); if (!use_own_hash || !virtio_net_attach_ebpf_to_backend(n->nic, -1)) { if (!virtio_net_load_ebpf(n, errp)) { - return features; + return; } - virtio_clear_feature(&features, VIRTIO_NET_F_RSS); + virtio_clear_feature_ex(features, VIRTIO_NET_F_RSS); } } - features = vhost_net_get_features(get_vhost_net(nc->peer), features); - vdev->backend_features = features; + vhost_net_get_features_ex(get_vhost_net(nc->peer), features); + virtio_features_copy(vdev->backend_features_ex, features); if (n->mtu_bypass_backend && (n->host_features & 1ULL << VIRTIO_NET_F_MTU)) { - features |= (1ULL << VIRTIO_NET_F_MTU); + virtio_add_feature_ex(features, VIRTIO_NET_F_MTU); } /* @@ -3106,10 +3176,8 @@ static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features, * support it. 
*/ if (!virtio_has_feature(vdev->backend_features, VIRTIO_NET_F_CTRL_VQ)) { - virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_ANNOUNCE); + virtio_clear_feature_ex(features, VIRTIO_NET_F_GUEST_ANNOUNCE); } - - return features; } static int virtio_net_post_load_device(void *opaque, int version_id) @@ -3117,13 +3185,15 @@ static int virtio_net_post_load_device(void *opaque, int version_id) VirtIONet *n = opaque; VirtIODevice *vdev = VIRTIO_DEVICE(n); int i, link_down; + bool has_tunnel_hdr = virtio_has_tunnel_hdr(vdev->guest_features_ex); trace_virtio_net_post_load_device(); virtio_net_set_mrg_rx_bufs(n, n->mergeable_rx_bufs, virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1), virtio_vdev_has_feature(vdev, - VIRTIO_NET_F_HASH_REPORT)); + VIRTIO_NET_F_HASH_REPORT), + has_tunnel_hdr); /* MAC_TABLE_ENTRIES may be different from the saved image */ if (n->mac_table.in_use > MAC_TABLE_ENTRIES) { @@ -3943,7 +4013,7 @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp) n->vqs[0].tx_waiting = 0; n->tx_burst = n->net_conf.txburst; - virtio_net_set_mrg_rx_bufs(n, 0, 0, 0); + virtio_net_set_mrg_rx_bufs(n, 0, 0, 0, 0); n->promisc = 1; /* for compatibility */ n->mac_table.macs = g_malloc0(MAC_TABLE_ENTRIES * ETH_ALEN); @@ -4227,6 +4297,22 @@ static const Property virtio_net_properties[] = { rss_data.specified_hash_types, VIRTIO_NET_HASH_REPORT_UDPv6_EX - 1, ON_OFF_AUTO_AUTO), + VIRTIO_DEFINE_PROP_FEATURE("host_tunnel", VirtIONet, + host_features_ex, + VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO, + false), + VIRTIO_DEFINE_PROP_FEATURE("host_tunnel_csum", VirtIONet, + host_features_ex, + VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO_CSUM, + false), + VIRTIO_DEFINE_PROP_FEATURE("guest_tunnel", VirtIONet, + host_features_ex, + VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO, + false), + VIRTIO_DEFINE_PROP_FEATURE("guest_tunnel_csum", VirtIONet, + host_features_ex, + VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_CSUM, + false), }; static void virtio_net_class_init(ObjectClass *klass, const void *data) @@ -4241,8 +4327,8 @@ static void virtio_net_class_init(ObjectClass *klass, const void *data) vdc->unrealize = virtio_net_device_unrealize; vdc->get_config = virtio_net_get_config; vdc->set_config = virtio_net_set_config; - vdc->get_features = virtio_net_get_features; - vdc->set_features = virtio_net_set_features; + vdc->get_features_ex = virtio_net_get_features; + vdc->set_features_ex = virtio_net_set_features; vdc->bad_features = virtio_net_bad_features; vdc->reset = virtio_net_reset; vdc->queue_reset = virtio_net_queue_reset; diff --git a/hw/net/vmxnet3.c b/hw/net/vmxnet3.c index af73aa8ef2..03732375a7 100644 --- a/hw/net/vmxnet3.c +++ b/hw/net/vmxnet3.c @@ -1322,14 +1322,11 @@ static void vmxnet3_update_features(VMXNET3State *s) s->lro_supported, rxcso_supported, s->rx_vlan_stripping); if (s->peer_has_vhdr) { - qemu_set_offload(qemu_get_queue(s->nic)->peer, - rxcso_supported, - s->lro_supported, - s->lro_supported, - 0, - 0, - 0, - 0); + NetOffloads ol = { .csum = rxcso_supported, + .tso4 = s->lro_supported, + .tso6 = s->lro_supported }; + + qemu_set_offload(qemu_get_queue(s->nic)->peer, &ol); } } diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index f5ee6bf260..cd81f73997 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -8708,12 +8708,8 @@ static bool nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) msix_table_offset); memory_region_add_subregion(&n->bar0, 0, &n->iomem); - if (pci_is_vf(pci_dev)) { - pcie_sriov_vf_register_bar(pci_dev, 0, &n->bar0); - } else { - pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY | - 
PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0); - } + pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY | + PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0); ret = msix_init(pci_dev, nr_vectors, &n->bar0, 0, msix_table_offset, diff --git a/hw/pci/pci.c b/hw/pci/pci.c index 5e2c3c6fc2..acc03fd470 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -1492,9 +1492,6 @@ void pci_register_bar(PCIDevice *pci_dev, int region_num, : pci_get_bus(pci_dev)->address_space_mem; if (pci_is_vf(pci_dev)) { - PCIDevice *pf = pci_dev->exp.sriov_vf.pf; - assert(!pf || type == pf->exp.sriov_pf.vf_bar_type[region_num]); - r->addr = pci_bar_address(pci_dev, region_num, r->type, r->size); if (r->addr != PCI_BAR_UNMAPPED) { memory_region_add_subregion_overlap(r->address_space, @@ -2968,7 +2965,7 @@ int pci_iommu_init_iotlb_notifier(PCIDevice *dev, IOMMUNotifier *n, PCIBus *iommu_bus; int devfn; - pci_device_get_iommu_bus_devfn(dev, &bus, &iommu_bus, &devfn); + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, &bus, &devfn); if (iommu_bus && iommu_bus->iommu_ops->init_iotlb_notifier) { iommu_bus->iommu_ops->init_iotlb_notifier(bus, iommu_bus->iommu_opaque, devfn, n, fn, opaque); @@ -3026,7 +3023,7 @@ int pci_pri_request_page(PCIDevice *dev, uint32_t pasid, bool priv_req, return -EPERM; } - pci_device_get_iommu_bus_devfn(dev, &bus, &iommu_bus, &devfn); + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, &bus, &devfn); if (iommu_bus && iommu_bus->iommu_ops->pri_request_page) { return iommu_bus->iommu_ops->pri_request_page(bus, iommu_bus->iommu_opaque, @@ -3050,7 +3047,7 @@ int pci_pri_register_notifier(PCIDevice *dev, uint32_t pasid, return -EPERM; } - pci_device_get_iommu_bus_devfn(dev, &bus, &iommu_bus, &devfn); + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, &bus, &devfn); if (iommu_bus && iommu_bus->iommu_ops->pri_register_notifier) { iommu_bus->iommu_ops->pri_register_notifier(bus, iommu_bus->iommu_opaque, @@ -3067,7 +3064,7 @@ void pci_pri_unregister_notifier(PCIDevice *dev, uint32_t pasid) PCIBus *iommu_bus; int devfn; - pci_device_get_iommu_bus_devfn(dev, &bus, &iommu_bus, &devfn); + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, &bus, &devfn); if (iommu_bus && iommu_bus->iommu_ops->pri_unregister_notifier) { iommu_bus->iommu_ops->pri_unregister_notifier(bus, iommu_bus->iommu_opaque, @@ -3099,7 +3096,7 @@ ssize_t pci_ats_request_translation(PCIDevice *dev, uint32_t pasid, return -EPERM; } - pci_device_get_iommu_bus_devfn(dev, &bus, &iommu_bus, &devfn); + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, &bus, &devfn); if (iommu_bus && iommu_bus->iommu_ops->ats_request_translation) { return iommu_bus->iommu_ops->ats_request_translation(bus, iommu_bus->iommu_opaque, @@ -3123,7 +3120,7 @@ int pci_iommu_register_iotlb_notifier(PCIDevice *dev, uint32_t pasid, return -EPERM; } - pci_device_get_iommu_bus_devfn(dev, &bus, &iommu_bus, &devfn); + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, &bus, &devfn); if (iommu_bus && iommu_bus->iommu_ops->register_iotlb_notifier) { iommu_bus->iommu_ops->register_iotlb_notifier(bus, iommu_bus->iommu_opaque, devfn, @@ -3145,7 +3142,7 @@ int pci_iommu_unregister_iotlb_notifier(PCIDevice *dev, uint32_t pasid, return -EPERM; } - pci_device_get_iommu_bus_devfn(dev, &bus, &iommu_bus, &devfn); + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, &bus, &devfn); if (iommu_bus && iommu_bus->iommu_ops->unregister_iotlb_notifier) { iommu_bus->iommu_ops->unregister_iotlb_notifier(bus, iommu_bus->iommu_opaque, @@ -3159,11 +3156,9 @@ int pci_iommu_unregister_iotlb_notifier(PCIDevice *dev, uint32_t pasid, int 
pci_iommu_get_iotlb_info(PCIDevice *dev, uint8_t *addr_width, uint32_t *min_page_size) { - PCIBus *bus; PCIBus *iommu_bus; - int devfn; - pci_device_get_iommu_bus_devfn(dev, &bus, &iommu_bus, &devfn); + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, NULL, NULL); if (iommu_bus && iommu_bus->iommu_ops->get_iotlb_info) { iommu_bus->iommu_ops->get_iotlb_info(iommu_bus->iommu_opaque, addr_width, min_page_size); diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c index eaeb68894e..b302de6419 100644 --- a/hw/pci/pcie.c +++ b/hw/pci/pcie.c @@ -1266,6 +1266,14 @@ void pcie_pri_init(PCIDevice *dev, uint16_t offset, uint32_t outstanding_pr_cap, dev->exp.pri_cap = offset; } +uint32_t pcie_pri_get_req_alloc(const PCIDevice *dev) +{ + if (!pcie_pri_enabled(dev)) { + return 0; + } + return pci_get_long(dev->config + dev->exp.pri_cap + PCI_PRI_ALLOC_REQ); +} + bool pcie_pri_enabled(const PCIDevice *dev) { if (!pci_is_express(dev) || !dev->exp.pri_cap) { diff --git a/hw/pci/pcie_sriov.c b/hw/pci/pcie_sriov.c index 8a4bf0d6f7..c4f88f0975 100644 --- a/hw/pci/pcie_sriov.c +++ b/hw/pci/pcie_sriov.c @@ -195,7 +195,9 @@ bool pcie_sriov_pf_init(PCIDevice *dev, uint16_t offset, void pcie_sriov_pf_exit(PCIDevice *dev) { - uint8_t *cfg = dev->config + dev->exp.sriov_cap; + if (dev->exp.sriov_cap == 0) { + return; + } if (dev->exp.sriov_pf.vf_user_created) { uint16_t ven_id = pci_get_word(dev->config + PCI_VENDOR_ID); @@ -211,6 +213,8 @@ void pcie_sriov_pf_exit(PCIDevice *dev) pci_config_set_device_id(dev->exp.sriov_pf.vf[i]->config, vf_dev_id); } } else { + uint8_t *cfg = dev->config + dev->exp.sriov_cap; + unparent_vfs(dev, pci_get_word(cfg + PCI_SRIOV_TOTAL_VF)); } } @@ -242,17 +246,6 @@ void pcie_sriov_pf_init_vf_bar(PCIDevice *dev, int region_num, dev->exp.sriov_pf.vf_bar_type[region_num] = type; } -void pcie_sriov_vf_register_bar(PCIDevice *dev, int region_num, - MemoryRegion *memory) -{ - uint8_t type; - - assert(dev->exp.sriov_vf.pf); - type = dev->exp.sriov_vf.pf->exp.sriov_pf.vf_bar_type[region_num]; - - return pci_register_bar(dev, region_num, type, memory); -} - static gint compare_vf_devfns(gconstpointer a, gconstpointer b) { return (*(PCIDevice **)a)->devfn - (*(PCIDevice **)b)->devfn; diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c index 34ae14f7bf..d817fc42b4 100644 --- a/hw/scsi/virtio-scsi.c +++ b/hw/scsi/virtio-scsi.c @@ -116,11 +116,7 @@ static void virtio_scsi_complete_req(VirtIOSCSIReq *req, QemuMutex *vq_lock) } virtqueue_push(vq, &req->elem, req->qsgl.size + req->resp_iov.size); - if (s->dataplane_started && !s->dataplane_fenced) { - virtio_notify_irqfd(vdev, vq); - } else { - virtio_notify(vdev, vq); - } + virtio_notify(vdev, vq); if (vq_lock) { qemu_mutex_unlock(vq_lock); diff --git a/hw/smbios/smbios.c b/hw/smbios/smbios.c index 1ac063cfb4..13e21a9c43 100644 --- a/hw/smbios/smbios.c +++ b/hw/smbios/smbios.c @@ -179,6 +179,10 @@ static const QemuOptDesc qemu_smbios_type0_opts[] = { .name = "uefi", .type = QEMU_OPT_BOOL, .help = "uefi support", + },{ + .name = "vm", + .type = QEMU_OPT_BOOL, + .help = "virtual machine", }, { /* end of list */ } }; @@ -574,10 +578,14 @@ static void smbios_build_type_0_table(void) t->bios_characteristics = cpu_to_le64(0x08); /* Not supported */ t->bios_characteristics_extension_bytes[0] = 0; - t->bios_characteristics_extension_bytes[1] = 0x14; /* TCD/SVVP | VM */ + + t->bios_characteristics_extension_bytes[1] = 0x04; /* TCD/SVVP */ if (smbios_type0.uefi) { t->bios_characteristics_extension_bytes[1] |= 0x08; /* |= UEFI */ } + if (smbios_type0.vm) { + 
t->bios_characteristics_extension_bytes[1] |= 0x10; /* |= VM */ + } if (smbios_type0.have_major_minor) { t->system_bios_major_release = smbios_type0.major; @@ -1405,6 +1413,7 @@ void smbios_entry_add(QemuOpts *opts, Error **errp) save_opt(&smbios_type0.version, opts, "version"); save_opt(&smbios_type0.date, opts, "date"); smbios_type0.uefi = qemu_opt_get_bool(opts, "uefi", false); + smbios_type0.vm = qemu_opt_get_bool(opts, "vm", true); val = qemu_opt_get(opts, "release"); if (val) { diff --git a/hw/virtio/Kconfig b/hw/virtio/Kconfig index 7648a2d68d..10f5c53ac0 100644 --- a/hw/virtio/Kconfig +++ b/hw/virtio/Kconfig @@ -126,3 +126,8 @@ config VHOST_USER_SCMI bool default y depends on VIRTIO && VHOST_USER && ARM + +config VHOST_USER_TEST + bool + default y + depends on VIRTIO && VHOST_USER diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build index 3ea7b3cec8..48b9fedfa5 100644 --- a/hw/virtio/meson.build +++ b/hw/virtio/meson.build @@ -22,7 +22,7 @@ if have_vhost system_virtio_ss.add(files('vhost-user-base.c')) # MMIO Stubs - system_virtio_ss.add(files('vhost-user-device.c')) + system_virtio_ss.add(when: 'CONFIG_VHOST_USER_TEST', if_true: files('vhost-user-test-device.c')) system_virtio_ss.add(when: 'CONFIG_VHOST_USER_GPIO', if_true: files('vhost-user-gpio.c')) system_virtio_ss.add(when: 'CONFIG_VHOST_USER_I2C', if_true: files('vhost-user-i2c.c')) system_virtio_ss.add(when: 'CONFIG_VHOST_USER_RNG', if_true: files('vhost-user-rng.c')) @@ -30,7 +30,8 @@ if have_vhost system_virtio_ss.add(when: 'CONFIG_VHOST_USER_INPUT', if_true: files('vhost-user-input.c')) # PCI Stubs - system_virtio_ss.add(when: 'CONFIG_VIRTIO_PCI', if_true: files('vhost-user-device-pci.c')) + system_virtio_ss.add(when: ['CONFIG_VIRTIO_PCI', 'CONFIG_VHOST_USER_TEST'], + if_true: files('vhost-user-test-device-pci.c')) system_virtio_ss.add(when: ['CONFIG_VIRTIO_PCI', 'CONFIG_VHOST_USER_GPIO'], if_true: files('vhost-user-gpio-pci.c')) system_virtio_ss.add(when: ['CONFIG_VIRTIO_PCI', 'CONFIG_VHOST_USER_I2C'], diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events index 76f0d458b2..658cc365e7 100644 --- a/hw/virtio/trace-events +++ b/hw/virtio/trace-events @@ -75,7 +75,6 @@ virtqueue_flush(void *vq, unsigned int count) "vq %p count %u" virtqueue_pop(void *vq, void *elem, unsigned int in_num, unsigned int out_num) "vq %p elem %p in_num %u out_num %u" virtio_queue_notify(void *vdev, int n, void *vq) "vdev %p n %d vq %p" virtio_notify_irqfd_deferred_fn(void *vdev, void *vq) "vdev %p vq %p" -virtio_notify_irqfd(void *vdev, void *vq) "vdev %p vq %p" virtio_notify(void *vdev, void *vq) "vdev %p vq %p" virtio_set_status(void *vdev, uint8_t val) "vdev %p val %u" diff --git a/hw/virtio/vdpa-dev.c b/hw/virtio/vdpa-dev.c index d1da40afc8..4a7b970976 100644 --- a/hw/virtio/vdpa-dev.c +++ b/hw/virtio/vdpa-dev.c @@ -338,6 +338,12 @@ static int vhost_vdpa_device_set_status(VirtIODevice *vdev, uint8_t status) return 0; } +static struct vhost_dev *vhost_vdpa_device_get_vhost(VirtIODevice *vdev) +{ + VhostVdpaDevice *s = VHOST_VDPA_DEVICE(vdev); + return &s->dev; +} + static const Property vhost_vdpa_device_properties[] = { DEFINE_PROP_STRING("vhostdev", VhostVdpaDevice, vhostdev), DEFINE_PROP_UINT16("queue-size", VhostVdpaDevice, queue_size, 0), @@ -369,6 +375,7 @@ static void vhost_vdpa_device_class_init(ObjectClass *klass, const void *data) vdc->set_config = vhost_vdpa_device_set_config; vdc->get_features = vhost_vdpa_device_get_features; vdc->set_status = vhost_vdpa_device_set_status; + vdc->get_vhost = 
vhost_vdpa_device_get_vhost; } static void vhost_vdpa_device_instance_init(Object *obj) diff --git a/hw/virtio/vhost-backend.c b/hw/virtio/vhost-backend.c index 833804dd40..4367db0d95 100644 --- a/hw/virtio/vhost-backend.c +++ b/hw/virtio/vhost-backend.c @@ -20,6 +20,11 @@ #include <linux/vhost.h> #include <sys/ioctl.h> +struct vhost_features { + uint64_t count; + uint64_t features[VIRTIO_FEATURES_NU64S]; +}; + static int vhost_kernel_call(struct vhost_dev *dev, unsigned long int request, void *arg) { @@ -182,12 +187,6 @@ static int vhost_kernel_get_vring_worker(struct vhost_dev *dev, return vhost_kernel_call(dev, VHOST_GET_VRING_WORKER, worker); } -static int vhost_kernel_set_features(struct vhost_dev *dev, - uint64_t features) -{ - return vhost_kernel_call(dev, VHOST_SET_FEATURES, &features); -} - static int vhost_kernel_set_backend_cap(struct vhost_dev *dev) { uint64_t features; @@ -210,10 +209,51 @@ static int vhost_kernel_set_backend_cap(struct vhost_dev *dev) return 0; } -static int vhost_kernel_get_features(struct vhost_dev *dev, - uint64_t *features) +static int vhost_kernel_set_features(struct vhost_dev *dev, + const uint64_t *features) { - return vhost_kernel_call(dev, VHOST_GET_FEATURES, features); + struct vhost_features farray; + bool extended_in_use; + int r; + + farray.count = VIRTIO_FEATURES_NU64S; + virtio_features_copy(farray.features, features); + extended_in_use = virtio_features_use_ex(farray.features); + + /* + * Can't check for ENOTTY: for unknown ioctls the kernel interprets + * the argument as a virtio queue id and most likely errors out validating + * such id, instead of reporting an unknown operation. + */ + r = vhost_kernel_call(dev, VHOST_SET_FEATURES_ARRAY, &farray); + if (!r) { + return 0; + } + + if (extended_in_use) { + error_report("Trying to set extended features without kernel support"); + return -EINVAL; + } + return vhost_kernel_call(dev, VHOST_SET_FEATURES, &farray.features[0]); +} + +static int vhost_kernel_get_features(struct vhost_dev *dev, uint64_t *features) +{ + struct vhost_features farray; + int r; + + farray.count = VIRTIO_FEATURES_NU64S; + r = vhost_kernel_call(dev, VHOST_GET_FEATURES_ARRAY, &farray); + if (r) { + memset(&farray, 0, sizeof(farray)); + r = vhost_kernel_call(dev, VHOST_GET_FEATURES, &farray.features[0]); + } + if (r) { + return r; + } + + virtio_features_copy(features, farray.features); + return 0; } static int vhost_kernel_set_owner(struct vhost_dev *dev) @@ -341,8 +381,8 @@ const VhostOps kernel_ops = { .vhost_attach_vring_worker = vhost_kernel_attach_vring_worker, .vhost_new_worker = vhost_kernel_new_worker, .vhost_free_worker = vhost_kernel_free_worker, - .vhost_set_features = vhost_kernel_set_features, - .vhost_get_features = vhost_kernel_get_features, + .vhost_set_features_ex = vhost_kernel_set_features, + .vhost_get_features_ex = vhost_kernel_get_features, .vhost_set_backend_cap = vhost_kernel_set_backend_cap, .vhost_set_owner = vhost_kernel_set_owner, .vhost_get_vq_index = vhost_kernel_get_vq_index, diff --git a/hw/virtio/vhost-user-device-pci.c b/hw/virtio/vhost-user-test-device-pci.c index f10bac874e..b4ed0efb50 100644 --- a/hw/virtio/vhost-user-device-pci.c +++ b/hw/virtio/vhost-user-test-device-pci.c @@ -18,13 +18,13 @@ struct VHostUserDevicePCI { VHostUserBase vub; }; -#define TYPE_VHOST_USER_DEVICE_PCI "vhost-user-device-pci-base" +#define TYPE_VHOST_USER_TEST_DEVICE_PCI "vhost-user-test-device-pci-base" -OBJECT_DECLARE_SIMPLE_TYPE(VHostUserDevicePCI, VHOST_USER_DEVICE_PCI) 
+OBJECT_DECLARE_SIMPLE_TYPE(VHostUserDevicePCI, VHOST_USER_TEST_DEVICE_PCI) static void vhost_user_device_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) { - VHostUserDevicePCI *dev = VHOST_USER_DEVICE_PCI(vpci_dev); + VHostUserDevicePCI *dev = VHOST_USER_TEST_DEVICE_PCI(vpci_dev); DeviceState *vdev = DEVICE(&dev->vub); vpci_dev->nvectors = 1; @@ -38,9 +38,6 @@ static void vhost_user_device_pci_class_init(ObjectClass *klass, VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass); PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass); - /* Reason: stop users confusing themselves */ - dc->user_creatable = false; - k->realize = vhost_user_device_pci_realize; set_bit(DEVICE_CATEGORY_INPUT, dc->categories); pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET; @@ -51,15 +48,15 @@ static void vhost_user_device_pci_class_init(ObjectClass *klass, static void vhost_user_device_pci_instance_init(Object *obj) { - VHostUserDevicePCI *dev = VHOST_USER_DEVICE_PCI(obj); + VHostUserDevicePCI *dev = VHOST_USER_TEST_DEVICE_PCI(obj); virtio_instance_init_common(obj, &dev->vub, sizeof(dev->vub), - TYPE_VHOST_USER_DEVICE); + TYPE_VHOST_USER_TEST_DEVICE); } static const VirtioPCIDeviceTypeInfo vhost_user_device_pci_info = { - .base_name = TYPE_VHOST_USER_DEVICE_PCI, - .non_transitional_name = "vhost-user-device-pci", + .base_name = TYPE_VHOST_USER_TEST_DEVICE_PCI, + .non_transitional_name = "vhost-user-test-device-pci", .instance_size = sizeof(VHostUserDevicePCI), .instance_init = vhost_user_device_pci_instance_init, .class_init = vhost_user_device_pci_class_init, diff --git a/hw/virtio/vhost-user-device.c b/hw/virtio/vhost-user-test-device.c index 3939bdf755..1b98ea3e48 100644 --- a/hw/virtio/vhost-user-device.c +++ b/hw/virtio/vhost-user-test-device.c @@ -1,5 +1,5 @@ /* - * Generic vhost-user-device implementation for any vhost-user-backend + * Generic vhost-user-test-device implementation for any vhost-user-backend * * This is a concrete implementation of vhost-user-base which can be * configured via properties. 
It is useful for development and @@ -25,7 +25,7 @@ */ static const VMStateDescription vud_vmstate = { - .name = "vhost-user-device", + .name = "vhost-user-test-device", .unmigratable = 1, }; @@ -41,16 +41,13 @@ static void vud_class_init(ObjectClass *klass, const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); - /* Reason: stop inexperienced users confusing themselves */ - dc->user_creatable = false; - device_class_set_props(dc, vud_properties); dc->vmsd = &vud_vmstate; set_bit(DEVICE_CATEGORY_INPUT, dc->categories); } static const TypeInfo vud_info = { - .name = TYPE_VHOST_USER_DEVICE, + .name = TYPE_VHOST_USER_TEST_DEVICE, .parent = TYPE_VHOST_USER_BASE, .class_init = vud_class_init, }; diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index 6557c58d12..c120ef38b9 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -972,20 +972,34 @@ static int vhost_virtqueue_set_addr(struct vhost_dev *dev, static int vhost_dev_set_features(struct vhost_dev *dev, bool enable_log) { - uint64_t features = dev->acked_features; + uint64_t features[VIRTIO_FEATURES_NU64S]; int r; + + virtio_features_copy(features, dev->acked_features_ex); if (enable_log) { - features |= 0x1ULL << VHOST_F_LOG_ALL; + virtio_add_feature_ex(features, VHOST_F_LOG_ALL); } if (!vhost_dev_has_iommu(dev)) { - features &= ~(0x1ULL << VIRTIO_F_IOMMU_PLATFORM); + virtio_clear_feature_ex(features, VIRTIO_F_IOMMU_PLATFORM); } if (dev->vhost_ops->vhost_force_iommu) { if (dev->vhost_ops->vhost_force_iommu(dev) == true) { - features |= 0x1ULL << VIRTIO_F_IOMMU_PLATFORM; + virtio_add_feature_ex(features, VIRTIO_F_IOMMU_PLATFORM); } } - r = dev->vhost_ops->vhost_set_features(dev, features); + + if (virtio_features_use_ex(features) && + !dev->vhost_ops->vhost_set_features_ex) { + r = -EINVAL; + VHOST_OPS_DEBUG(r, "extended features without device support"); + goto out; + } + + if (dev->vhost_ops->vhost_set_features_ex) { + r = dev->vhost_ops->vhost_set_features_ex(dev, features); + } else { + r = dev->vhost_ops->vhost_set_features(dev, features[0]); + } if (r < 0) { VHOST_OPS_DEBUG(r, "vhost_set_features failed"); goto out; @@ -1508,12 +1522,27 @@ static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq) } } +static int vhost_dev_get_features(struct vhost_dev *hdev, + uint64_t *features) +{ + uint64_t features64; + int r; + + if (hdev->vhost_ops->vhost_get_features_ex) { + return hdev->vhost_ops->vhost_get_features_ex(hdev, features); + } + + r = hdev->vhost_ops->vhost_get_features(hdev, &features64); + virtio_features_from_u64(features, features64); + return r; +} + int vhost_dev_init(struct vhost_dev *hdev, void *opaque, VhostBackendType backend_type, uint32_t busyloop_timeout, Error **errp) { + uint64_t features[VIRTIO_FEATURES_NU64S]; unsigned int used, reserved, limit; - uint64_t features; int i, r, n_initialized_vqs = 0; hdev->vdev = NULL; @@ -1533,7 +1562,7 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque, goto fail; } - r = hdev->vhost_ops->vhost_get_features(hdev, &features); + r = vhost_dev_get_features(hdev, features); if (r < 0) { error_setg_errno(errp, -r, "vhost_get_features failed"); goto fail; @@ -1571,7 +1600,7 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque, } } - hdev->features = features; + virtio_features_copy(hdev->features_ex, features); hdev->memory_listener = (MemoryListener) { .name = "vhost", @@ -1594,7 +1623,7 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque, }; if (hdev->migration_blocker == NULL) { - if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) { + if 
(!virtio_has_feature_ex(hdev->features_ex, VHOST_F_LOG_ALL)) { error_setg(&hdev->migration_blocker, "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature."); } else if (vhost_dev_log_is_shared(hdev) && !qemu_memfd_alloc_check()) { @@ -1817,7 +1846,7 @@ void vhost_config_mask(struct vhost_dev *hdev, VirtIODevice *vdev, bool mask) int r; EventNotifier *notifier = &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier; - EventNotifier *config_notifier = &vdev->config_notifier; + EventNotifier *config_notifier = virtio_config_get_guest_notifier(vdev); assert(hdev->vhost_ops); if ((hdev->started == false) || @@ -1848,39 +1877,40 @@ static void vhost_stop_config_intr(struct vhost_dev *dev) static void vhost_start_config_intr(struct vhost_dev *dev) { int r; + EventNotifier *config_notifier = + virtio_config_get_guest_notifier(dev->vdev); assert(dev->vhost_ops); - int fd = event_notifier_get_fd(&dev->vdev->config_notifier); + int fd = event_notifier_get_fd(config_notifier); if (dev->vhost_ops->vhost_set_config_call) { r = dev->vhost_ops->vhost_set_config_call(dev, fd); if (!r) { - event_notifier_set(&dev->vdev->config_notifier); + event_notifier_set(config_notifier); } } } -uint64_t vhost_get_features(struct vhost_dev *hdev, const int *feature_bits, - uint64_t features) +void vhost_get_features_ex(struct vhost_dev *hdev, + const int *feature_bits, + uint64_t *features) { const int *bit = feature_bits; + while (*bit != VHOST_INVALID_FEATURE_BIT) { - uint64_t bit_mask = (1ULL << *bit); - if (!(hdev->features & bit_mask)) { - features &= ~bit_mask; + if (!virtio_has_feature_ex(hdev->features_ex, *bit)) { + virtio_clear_feature_ex(features, *bit); } bit++; } - return features; } -void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits, - uint64_t features) +void vhost_ack_features_ex(struct vhost_dev *hdev, const int *feature_bits, + const uint64_t *features) { const int *bit = feature_bits; while (*bit != VHOST_INVALID_FEATURE_BIT) { - uint64_t bit_mask = (1ULL << *bit); - if (features & bit_mask) { - hdev->acked_features |= bit_mask; + if (virtio_has_feature_ex(features, *bit)) { + virtio_add_feature_ex(hdev->acked_features_ex, *bit); } bit++; } @@ -2139,12 +2169,13 @@ static int do_vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev, { int i; int rc = 0; + EventNotifier *config_notifier = virtio_config_get_guest_notifier(vdev); /* should only be called after backend is connected */ assert(hdev->vhost_ops); event_notifier_test_and_clear( &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier); - event_notifier_test_and_clear(&vdev->config_notifier); + event_notifier_test_and_clear(config_notifier); event_notifier_cleanup( &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier); diff --git a/hw/virtio/virtio-bus.c b/hw/virtio/virtio-bus.c index 11adfbf3ab..cef944e015 100644 --- a/hw/virtio/virtio-bus.c +++ b/hw/virtio/virtio-bus.c @@ -62,9 +62,14 @@ void virtio_bus_device_plugged(VirtIODevice *vdev, Error **errp) } /* Get the features of the plugged device. 
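 * Devices implementing get_features_ex negotiate over the full 128-bit
 * feature space; for legacy devices, the 64-bit result of get_features
 * is widened into host_features_ex instead.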
*/ - assert(vdc->get_features != NULL); - vdev->host_features = vdc->get_features(vdev, vdev->host_features, - &local_err); + if (vdc->get_features_ex) { + vdc->get_features_ex(vdev, vdev->host_features_ex, &local_err); + } else { + assert(vdc->get_features != NULL); + virtio_features_from_u64(vdev->host_features_ex, + vdc->get_features(vdev, vdev->host_features, + &local_err)); + } if (local_err) { error_propagate(errp, local_err); return; diff --git a/hw/virtio/virtio-hmp-cmds.c b/hw/virtio/virtio-hmp-cmds.c index 7d8677bcf0..1daae482d3 100644 --- a/hw/virtio/virtio-hmp-cmds.c +++ b/hw/virtio/virtio-hmp-cmds.c @@ -74,7 +74,8 @@ static void hmp_virtio_dump_features(Monitor *mon, } if (features->has_unknown_dev_features) { - monitor_printf(mon, " unknown-features(0x%016"PRIx64")\n", + monitor_printf(mon, " unknown-features(0x%016"PRIx64"%016"PRIx64")\n", + features->unknown_dev_features2, features->unknown_dev_features); } } diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index d2595fbd55..ab96ce1776 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -109,6 +109,29 @@ static const VMStateDescription vmstate_virtio_pci_modern_queue_state = { } }; +static bool virtio_pci_modern_state_features128_needed(void *opaque) +{ + VirtIOPCIProxy *proxy = opaque; + uint32_t features = 0; + int i; + + for (i = 2; i < ARRAY_SIZE(proxy->guest_features); ++i) { + features |= proxy->guest_features[i]; + } + return features; +} + +static const VMStateDescription vmstate_virtio_pci_modern_state_features128 = { + .name = "virtio_pci/modern_state/features128", + .version_id = 1, + .minimum_version_id = 1, + .needed = &virtio_pci_modern_state_features128_needed, + .fields = (const VMStateField[]) { + VMSTATE_UINT32_SUB_ARRAY(guest_features, VirtIOPCIProxy, 2, 2), + VMSTATE_END_OF_LIST() + } +}; + static bool virtio_pci_modern_state_needed(void *opaque) { VirtIOPCIProxy *proxy = opaque; @@ -116,6 +139,12 @@ static bool virtio_pci_modern_state_needed(void *opaque) return virtio_pci_modern(proxy); } +/* + * Avoid silently breaking migration should the feature space increase + * even more in the (far away) future + */ +QEMU_BUILD_BUG_ON(VIRTIO_FEATURES_NU32S != 4); + static const VMStateDescription vmstate_virtio_pci_modern_state_sub = { .name = "virtio_pci/modern_state", .version_id = 1, @@ -124,11 +153,15 @@ static const VMStateDescription vmstate_virtio_pci_modern_state_sub = { .fields = (const VMStateField[]) { VMSTATE_UINT32(dfselect, VirtIOPCIProxy), VMSTATE_UINT32(gfselect, VirtIOPCIProxy), - VMSTATE_UINT32_ARRAY(guest_features, VirtIOPCIProxy, 2), + VMSTATE_UINT32_SUB_ARRAY(guest_features, VirtIOPCIProxy, 0, 2), VMSTATE_STRUCT_ARRAY(vqs, VirtIOPCIProxy, VIRTIO_QUEUE_MAX, 0, vmstate_virtio_pci_modern_queue_state, VirtIOPCIQueue), VMSTATE_END_OF_LIST() + }, + .subsections = (const VMStateDescription * const []) { + &vmstate_virtio_pci_modern_state_features128, + NULL } }; @@ -1477,6 +1510,19 @@ int virtio_pci_add_shm_cap(VirtIOPCIProxy *proxy, return virtio_pci_add_mem_cap(proxy, &cap.cap); } +static int virtio_pci_select_max(const VirtIODevice *vdev) +{ + int i; + + for (i = VIRTIO_FEATURES_NU64S - 1; i > 0; i--) { + if (vdev->host_features_ex[i]) { + return (i + 1) * 2; + } + } + + return 2; +} + static uint64_t virtio_pci_common_read(void *opaque, hwaddr addr, unsigned size) { @@ -1494,18 +1540,21 @@ static uint64_t virtio_pci_common_read(void *opaque, hwaddr addr, val = proxy->dfselect; break; case VIRTIO_PCI_COMMON_DF: - if (proxy->dfselect <= 1) { + if (proxy->dfselect < 
virtio_pci_select_max(vdev)) {
            VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev);
-            val = (vdev->host_features & ~vdc->legacy_features) >>
-                (32 * proxy->dfselect);
+            val = vdev->host_features_ex[proxy->dfselect >> 1] >>
+                  (32 * (proxy->dfselect & 1));
+            if (proxy->dfselect <= 1) {
+                val &= (~vdc->legacy_features) >> (32 * proxy->dfselect);
+            }
        }
        break;
    case VIRTIO_PCI_COMMON_GFSELECT:
        val = proxy->gfselect;
        break;
    case VIRTIO_PCI_COMMON_GF:
-        if (proxy->gfselect < ARRAY_SIZE(proxy->guest_features)) {
+        if (proxy->gfselect < virtio_pci_select_max(vdev)) {
            val = proxy->guest_features[proxy->gfselect];
        }
        break;
@@ -1588,11 +1637,18 @@ static void virtio_pci_common_write(void *opaque, hwaddr addr,
        proxy->gfselect = val;
        break;
    case VIRTIO_PCI_COMMON_GF:
-        if (proxy->gfselect < ARRAY_SIZE(proxy->guest_features)) {
+        if (proxy->gfselect < virtio_pci_select_max(vdev)) {
+            uint64_t features[VIRTIO_FEATURES_NU64S];
+            int i;
+
            proxy->guest_features[proxy->gfselect] = val;
-            virtio_set_features(vdev,
-                                (((uint64_t)proxy->guest_features[1]) << 32) |
-                                proxy->guest_features[0]);
+            virtio_features_clear(features);
+            for (i = 0; i < ARRAY_SIZE(proxy->guest_features); ++i) {
+                uint64_t cur = proxy->guest_features[i];
+
+                features[i >> 1] |= cur << ((i & 1) * 32);
+            }
+            virtio_set_features_ex(vdev, features);
        }
        break;
    case VIRTIO_PCI_COMMON_MSIX:
@@ -2311,6 +2367,8 @@ static void virtio_pci_reset(DeviceState *qdev)
    virtio_bus_reset(bus);
    msix_unuse_all_vectors(&proxy->pci_dev);
+    memset(proxy->guest_features, 0, sizeof(proxy->guest_features));
+
    for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
        proxy->vqs[i].enabled = 0;
        proxy->vqs[i].reset = 0;
diff --git a/hw/virtio/virtio-qmp.c b/hw/virtio/virtio-qmp.c
index 3b6377cf0d..b338344c6c 100644
--- a/hw/virtio/virtio-qmp.c
+++ b/hw/virtio/virtio-qmp.c
@@ -325,6 +325,20 @@ static const qmp_virtio_feature_map_t virtio_net_feature_map[] = {
    FEATURE_ENTRY(VHOST_USER_F_PROTOCOL_FEATURES, \
            "VHOST_USER_F_PROTOCOL_FEATURES: Vhost-user protocol features "
            "negotiation supported"),
+    FEATURE_ENTRY(VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO, \
+            "VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO: Driver can receive GSO over "
+            "UDP tunnel packets"),
+    FEATURE_ENTRY(VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_CSUM, \
+            "VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_CSUM: Driver can receive GSO "
+            "over UDP tunnel packets requiring checksum offload for the outer "
+            "header"),
+    FEATURE_ENTRY(VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO, \
+            "VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO: Device can receive GSO over "
+            "UDP tunnel packets"),
+    FEATURE_ENTRY(VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO_CSUM, \
+            "VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO_CSUM: Device can receive GSO over "
+            "UDP tunnel packets requiring checksum offload for the outer "
+            "header"),
    { -1, "" }
};
#endif
@@ -510,6 +524,24 @@ static const qmp_virtio_feature_map_t virtio_gpio_feature_map[] = {
        list; \
    })
+#define CONVERT_FEATURES_EX(type, map, bitmap)                 \
+    ({                                                         \
+        type *list = NULL;                                     \
+        type *node;                                            \
+        for (i = 0; map[i].virtio_bit != -1; i++) {            \
+            bit = map[i].virtio_bit;                           \
+            if (!virtio_has_feature_ex(bitmap, bit)) {         \
+                continue;                                      \
+            }                                                  \
+            node = g_new0(type, 1);                            \
+            node->value = g_strdup(map[i].feature_desc);       \
+            node->next = list;                                 \
+            list = node;                                       \
+            virtio_clear_feature_ex(bitmap, bit);              \
+        }                                                      \
+        list;                                                  \
+    })
+
VirtioDeviceStatus *qmp_decode_status(uint8_t bitmap)
{
    VirtioDeviceStatus *status;
@@ -545,109 +577,112 @@ VhostDeviceProtocols *qmp_decode_protocols(uint64_t bitmap)
    return vhu_protocols;
}
-VirtioDeviceFeatures *qmp_decode_features(uint16_t device_id, uint64_t bitmap)
+VirtioDeviceFeatures *qmp_decode_features(uint16_t device_id, + const uint64_t *bmap) { + uint64_t bitmap[VIRTIO_FEATURES_NU64S]; VirtioDeviceFeatures *features; uint64_t bit; int i; + virtio_features_copy(bitmap, bmap); features = g_new0(VirtioDeviceFeatures, 1); features->has_dev_features = true; /* transport features */ - features->transports = CONVERT_FEATURES(strList, virtio_transport_map, 0, - bitmap); + features->transports = CONVERT_FEATURES_EX(strList, virtio_transport_map, + bitmap); /* device features */ switch (device_id) { #ifdef CONFIG_VIRTIO_SERIAL case VIRTIO_ID_CONSOLE: features->dev_features = - CONVERT_FEATURES(strList, virtio_serial_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, virtio_serial_feature_map, bitmap); break; #endif #ifdef CONFIG_VIRTIO_BLK case VIRTIO_ID_BLOCK: features->dev_features = - CONVERT_FEATURES(strList, virtio_blk_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, virtio_blk_feature_map, bitmap); break; #endif #ifdef CONFIG_VIRTIO_GPU case VIRTIO_ID_GPU: features->dev_features = - CONVERT_FEATURES(strList, virtio_gpu_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, virtio_gpu_feature_map, bitmap); break; #endif #ifdef CONFIG_VIRTIO_NET case VIRTIO_ID_NET: features->dev_features = - CONVERT_FEATURES(strList, virtio_net_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, virtio_net_feature_map, bitmap); break; #endif #ifdef CONFIG_VIRTIO_SCSI case VIRTIO_ID_SCSI: features->dev_features = - CONVERT_FEATURES(strList, virtio_scsi_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, virtio_scsi_feature_map, bitmap); break; #endif #ifdef CONFIG_VIRTIO_BALLOON case VIRTIO_ID_BALLOON: features->dev_features = - CONVERT_FEATURES(strList, virtio_balloon_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, virtio_balloon_feature_map, bitmap); break; #endif #ifdef CONFIG_VIRTIO_IOMMU case VIRTIO_ID_IOMMU: features->dev_features = - CONVERT_FEATURES(strList, virtio_iommu_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, virtio_iommu_feature_map, bitmap); break; #endif #ifdef CONFIG_VIRTIO_INPUT case VIRTIO_ID_INPUT: features->dev_features = - CONVERT_FEATURES(strList, virtio_input_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, virtio_input_feature_map, bitmap); break; #endif #ifdef CONFIG_VHOST_USER_FS case VIRTIO_ID_FS: features->dev_features = - CONVERT_FEATURES(strList, virtio_fs_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, virtio_fs_feature_map, bitmap); break; #endif #ifdef CONFIG_VHOST_VSOCK case VIRTIO_ID_VSOCK: features->dev_features = - CONVERT_FEATURES(strList, virtio_vsock_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, virtio_vsock_feature_map, bitmap); break; #endif #ifdef CONFIG_VIRTIO_CRYPTO case VIRTIO_ID_CRYPTO: features->dev_features = - CONVERT_FEATURES(strList, virtio_crypto_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, virtio_crypto_feature_map, bitmap); break; #endif #ifdef CONFIG_VIRTIO_MEM case VIRTIO_ID_MEM: features->dev_features = - CONVERT_FEATURES(strList, virtio_mem_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, virtio_mem_feature_map, bitmap); break; #endif #ifdef CONFIG_VIRTIO_I2C_ADAPTER case VIRTIO_ID_I2C_ADAPTER: features->dev_features = - CONVERT_FEATURES(strList, virtio_i2c_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, virtio_i2c_feature_map, bitmap); break; #endif #ifdef CONFIG_VIRTIO_RNG case VIRTIO_ID_RNG: features->dev_features = - CONVERT_FEATURES(strList, virtio_rng_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, 
virtio_rng_feature_map, bitmap); break; #endif #ifdef CONFIG_VHOST_USER_GPIO case VIRTIO_ID_GPIO: features->dev_features = - CONVERT_FEATURES(strList, virtio_gpio_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, virtio_gpio_feature_map, bitmap); break; #endif /* No features */ @@ -680,10 +715,9 @@ VirtioDeviceFeatures *qmp_decode_features(uint16_t device_id, uint64_t bitmap) g_assert_not_reached(); } - features->has_unknown_dev_features = bitmap != 0; - if (features->has_unknown_dev_features) { - features->unknown_dev_features = bitmap; - } + features->has_unknown_dev_features = !virtio_features_empty(bitmap); + features->unknown_dev_features = bitmap[0]; + features->unknown_dev_features2 = bitmap[1]; return features; } @@ -743,11 +777,11 @@ VirtioStatus *qmp_x_query_virtio_status(const char *path, Error **errp) status->device_id = vdev->device_id; status->vhost_started = vdev->vhost_started; status->guest_features = qmp_decode_features(vdev->device_id, - vdev->guest_features); + vdev->guest_features_ex); status->host_features = qmp_decode_features(vdev->device_id, - vdev->host_features); + vdev->host_features_ex); status->backend_features = qmp_decode_features(vdev->device_id, - vdev->backend_features); + vdev->backend_features_ex); switch (vdev->device_endian) { case VIRTIO_DEVICE_ENDIAN_LITTLE: @@ -785,11 +819,12 @@ VirtioStatus *qmp_x_query_virtio_status(const char *path, Error **errp) status->vhost_dev->nvqs = hdev->nvqs; status->vhost_dev->vq_index = hdev->vq_index; status->vhost_dev->features = - qmp_decode_features(vdev->device_id, hdev->features); + qmp_decode_features(vdev->device_id, hdev->features_ex); status->vhost_dev->acked_features = - qmp_decode_features(vdev->device_id, hdev->acked_features); + qmp_decode_features(vdev->device_id, hdev->acked_features_ex); status->vhost_dev->backend_features = - qmp_decode_features(vdev->device_id, hdev->backend_features); + qmp_decode_features(vdev->device_id, hdev->backend_features_ex); + status->vhost_dev->protocol_features = qmp_decode_protocols(hdev->protocol_features); status->vhost_dev->max_queues = hdev->max_queues; diff --git a/hw/virtio/virtio-qmp.h b/hw/virtio/virtio-qmp.h index 245a446a56..e0a1e49035 100644 --- a/hw/virtio/virtio-qmp.h +++ b/hw/virtio/virtio-qmp.h @@ -18,6 +18,7 @@ VirtIODevice *qmp_find_virtio_device(const char *path); VirtioDeviceStatus *qmp_decode_status(uint8_t bitmap); VhostDeviceProtocols *qmp_decode_protocols(uint64_t bitmap); -VirtioDeviceFeatures *qmp_decode_features(uint16_t device_id, uint64_t bitmap); +VirtioDeviceFeatures *qmp_decode_features(uint16_t device_id, + const uint64_t *bitmap); #endif diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 0a68f1b6f1..be73753b59 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -31,6 +31,7 @@ #include "hw/qdev-properties.h" #include "hw/virtio/virtio-access.h" #include "system/dma.h" +#include "system/iothread.h" #include "system/runstate.h" #include "virtio-qmp.h" @@ -256,7 +257,10 @@ void virtio_init_region_cache(VirtIODevice *vdev, int n) len = address_space_cache_init(&new->desc, vdev->dma_as, addr, size, packed); if (len < size) { - virtio_error(vdev, "Cannot map desc"); + virtio_error(vdev, + "Failed to map descriptor ring for device %s: " + "invalid guest physical address or corrupted queue setup", + qdev_get_printable_name(DEVICE(vdev))); goto err_desc; } @@ -264,7 +268,10 @@ void virtio_init_region_cache(VirtIODevice *vdev, int n) len = address_space_cache_init(&new->used, vdev->dma_as, vq->vring.used, size, true); if (len < 
size) {
-        virtio_error(vdev, "Cannot map used");
+        virtio_error(vdev,
+                     "Failed to map used ring for device %s: "
+                     "possible guest misconfiguration or insufficient memory",
+                     qdev_get_printable_name(DEVICE(vdev)));
        goto err_used;
    }
@@ -272,7 +279,10 @@ void virtio_init_region_cache(VirtIODevice *vdev, int n)
    len = address_space_cache_init(&new->avail, vdev->dma_as,
                                   vq->vring.avail, size, false);
    if (len < size) {
-        virtio_error(vdev, "Cannot map avail");
+        virtio_error(vdev,
+                     "Failed to map available ring for device %s: "
+                     "possible queue misconfiguration or overlapping memory region",
+                     qdev_get_printable_name(DEVICE(vdev)));
        goto err_avail;
    }
@@ -2654,16 +2664,8 @@ static void virtio_notify_irqfd_deferred_fn(void *opaque)
    event_notifier_set(notifier);
}
-void virtio_notify_irqfd(VirtIODevice *vdev, VirtQueue *vq)
+static void virtio_irq(VirtQueue *vq)
{
-    WITH_RCU_READ_LOCK_GUARD() {
-        if (!virtio_should_notify(vdev, vq)) {
-            return;
-        }
-    }
-
-    trace_virtio_notify_irqfd(vdev, vq);
-
    /*
     * virtio spec 1.0 says ISR bit 0 should be ignored with MSI, but
     * windows drivers included in virtio-win 1.8.0 (circa 2015) are
@@ -2680,13 +2682,18 @@
     * to an atomic operation.
     */
    virtio_set_isr(vq->vdev, 0x1);
-    defer_call(virtio_notify_irqfd_deferred_fn, &vq->guest_notifier);
-}
-static void virtio_irq(VirtQueue *vq)
-{
-    virtio_set_isr(vq->vdev, 0x1);
-    virtio_notify_vector(vq->vdev, vq->vector);
+
+    /*
+     * The interrupt code path requires the Big QEMU Lock (BQL), so use the
+     * notifier instead when in an IOThread. This assumes that device models
+     * have already called ->set_guest_notifiers() sometime before calling this
+     * function.
+     */
+    if (qemu_in_iothread()) {
+        defer_call(virtio_notify_irqfd_deferred_fn, &vq->guest_notifier);
+    } else {
+        virtio_notify_vector(vq->vdev, vq->vector);
+    }
}
void virtio_notify(VirtIODevice *vdev, VirtQueue *vq)
@@ -2708,7 +2715,12 @@ void virtio_notify_config(VirtIODevice *vdev)
    virtio_set_isr(vdev, 0x3);
    vdev->generation++;
-    virtio_notify_vector(vdev, vdev->config_vector);
+
+    if (qemu_in_iothread()) {
+        defer_call(virtio_notify_irqfd_deferred_fn, &vdev->config_notifier);
+    } else {
+        virtio_notify_vector(vdev, vdev->config_vector);
+    }
}
static bool virtio_device_endian_needed(void *opaque)
@@ -2964,6 +2976,30 @@ static const VMStateDescription vmstate_virtio_disabled = {
    }
};
+static bool virtio_128bit_features_needed(void *opaque)
+{
+    VirtIODevice *vdev = opaque;
+
+    return virtio_features_use_ex(vdev->host_features_ex);
+}
+
+static const VMStateDescription vmstate_virtio_128bit_features = {
+    .name = "virtio/128bit_features",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = &virtio_128bit_features_needed,
+    .fields = (const VMStateField[]) {
+        VMSTATE_UINT64(guest_features_ex[1], VirtIODevice),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+/*
+ * Avoid silently breaking migration should the feature space increase
+ * even more in the (far away) future
+ */
+QEMU_BUILD_BUG_ON(VIRTIO_FEATURES_NU64S != 2);
+
static const VMStateDescription vmstate_virtio = {
    .name = "virtio",
    .version_id = 1,
@@ -2973,6 +3009,7 @@
    },
    .subsections = (const VMStateDescription * const []) {
        &vmstate_virtio_device_endian,
+        &vmstate_virtio_128bit_features,
        &vmstate_virtio_64bit_features,
        &vmstate_virtio_virtqueues,
        &vmstate_virtio_ringsize,
@@ -3071,23 +3108,30 @@ const VMStateInfo virtio_vmstate_info = {
    .put = virtio_device_put,
};
-static int
virtio_set_features_nocheck(VirtIODevice *vdev, uint64_t val) +static int virtio_set_features_nocheck(VirtIODevice *vdev, const uint64_t *val) { VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev); - bool bad = (val & ~(vdev->host_features)) != 0; + uint64_t tmp[VIRTIO_FEATURES_NU64S]; + bool bad; + + bad = virtio_features_andnot(tmp, val, vdev->host_features_ex); + virtio_features_and(tmp, val, vdev->host_features_ex); - val &= vdev->host_features; - if (k->set_features) { - k->set_features(vdev, val); + if (k->set_features_ex) { + k->set_features_ex(vdev, val); + } else if (k->set_features) { + bad = bad || virtio_features_use_ex(tmp); + k->set_features(vdev, tmp[0]); } - vdev->guest_features = val; + + virtio_features_copy(vdev->guest_features_ex, tmp); return bad ? -1 : 0; } typedef struct VirtioSetFeaturesNocheckData { Coroutine *co; VirtIODevice *vdev; - uint64_t val; + uint64_t val[VIRTIO_FEATURES_NU64S]; int ret; } VirtioSetFeaturesNocheckData; @@ -3100,14 +3144,15 @@ static void virtio_set_features_nocheck_bh(void *opaque) } static int coroutine_mixed_fn -virtio_set_features_nocheck_maybe_co(VirtIODevice *vdev, uint64_t val) +virtio_set_features_nocheck_maybe_co(VirtIODevice *vdev, + const uint64_t *val) { if (qemu_in_coroutine()) { VirtioSetFeaturesNocheckData data = { .co = qemu_coroutine_self(), .vdev = vdev, - .val = val, }; + virtio_features_copy(data.val, val); aio_bh_schedule_oneshot(qemu_get_current_aio_context(), virtio_set_features_nocheck_bh, &data); qemu_coroutine_yield(); @@ -3119,6 +3164,14 @@ virtio_set_features_nocheck_maybe_co(VirtIODevice *vdev, uint64_t val) int virtio_set_features(VirtIODevice *vdev, uint64_t val) { + uint64_t features[VIRTIO_FEATURES_NU64S]; + + virtio_features_from_u64(features, val); + return virtio_set_features_ex(vdev, features); +} + +int virtio_set_features_ex(VirtIODevice *vdev, const uint64_t *features) +{ int ret; /* * The driver must not attempt to set features after feature negotiation @@ -3128,13 +3181,13 @@ int virtio_set_features(VirtIODevice *vdev, uint64_t val) return -EINVAL; } - if (val & (1ull << VIRTIO_F_BAD_FEATURE)) { + if (features[0] & (1ull << VIRTIO_F_BAD_FEATURE)) { qemu_log_mask(LOG_GUEST_ERROR, "%s: guest driver for %s has enabled UNUSED(30) feature bit!\n", __func__, vdev->name); } - ret = virtio_set_features_nocheck(vdev, val); + ret = virtio_set_features_nocheck(vdev, features); if (virtio_vdev_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) { /* VIRTIO_RING_F_EVENT_IDX changes the size of the caches. */ int i; @@ -3157,6 +3210,7 @@ void virtio_reset(void *opaque) { VirtIODevice *vdev = opaque; VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev); + uint64_t features[VIRTIO_FEATURES_NU64S]; int i; virtio_set_status(vdev, 0); @@ -3183,7 +3237,8 @@ void virtio_reset(void *opaque) vdev->start_on_kick = false; vdev->started = false; vdev->broken = false; - virtio_set_features_nocheck(vdev, 0); + virtio_features_clear(features); + virtio_set_features_nocheck(vdev, features); vdev->queue_sel = 0; vdev->status = 0; vdev->disabled = false; @@ -3267,7 +3322,7 @@ virtio_load(VirtIODevice *vdev, QEMUFile *f, int version_id) * Note: devices should always test host features in future - don't create * new dependencies like this. 
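 * Only the low 32 feature bits are loaded at this point; the
 * 64bit_features and 128bit_features vmstate subsections fill in the
 * upper bits later.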
*/ - vdev->guest_features = features; + virtio_features_from_u64(vdev->guest_features_ex, features); config_len = qemu_get_be32(f); @@ -3348,26 +3403,17 @@ virtio_load(VirtIODevice *vdev, QEMUFile *f, int version_id) vdev->device_endian = virtio_default_endian(); } - if (virtio_64bit_features_needed(vdev)) { - /* - * Subsection load filled vdev->guest_features. Run them - * through virtio_set_features to sanity-check them against - * host_features. - */ - uint64_t features64 = vdev->guest_features; - if (virtio_set_features_nocheck_maybe_co(vdev, features64) < 0) { - error_report("Features 0x%" PRIx64 " unsupported. " - "Allowed features: 0x%" PRIx64, - features64, vdev->host_features); - return -1; - } - } else { - if (virtio_set_features_nocheck_maybe_co(vdev, features) < 0) { - error_report("Features 0x%x unsupported. " - "Allowed features: 0x%" PRIx64, - features, vdev->host_features); - return -1; - } + /* + * guest_features_ex is fully initialized with u32 features and upper + * bits have been filled as needed by the later load. + */ + if (virtio_set_features_nocheck_maybe_co(vdev, + vdev->guest_features_ex) < 0) { + error_report("Features 0x" VIRTIO_FEATURES_FMT " unsupported. " + "Allowed features: 0x" VIRTIO_FEATURES_FMT, + VIRTIO_FEATURES_PR(vdev->guest_features_ex), + VIRTIO_FEATURES_PR(vdev->host_features_ex)); + return -1; } if (!virtio_device_started(vdev, vdev->status) && diff --git a/include/hw/acpi/acpi_dev_interface.h b/include/hw/acpi/acpi_dev_interface.h index 68d9d15f50..8294f8f0cc 100644 --- a/include/hw/acpi/acpi_dev_interface.h +++ b/include/hw/acpi/acpi_dev_interface.h @@ -13,6 +13,7 @@ typedef enum { ACPI_NVDIMM_HOTPLUG_STATUS = 16, ACPI_VMGENID_CHANGE_STATUS = 32, ACPI_POWER_DOWN_STATUS = 64, + ACPI_GENERIC_ERROR = 128, } AcpiEventStatusBits; #define TYPE_ACPI_DEVICE_IF "acpi-device-interface" diff --git a/include/hw/acpi/aml-build.h b/include/hw/acpi/aml-build.h index c18f681342..f38e129719 100644 --- a/include/hw/acpi/aml-build.h +++ b/include/hw/acpi/aml-build.h @@ -252,6 +252,7 @@ struct CrsRangeSet { /* Consumer/Producer */ #define AML_SERIAL_BUS_FLAG_CONSUME_ONLY (1 << 1) +#define ACPI_APEI_ERROR_DEVICE "GEDD" /** * init_aml_allocator: * @@ -382,6 +383,7 @@ Aml *aml_dma(AmlDmaType typ, AmlDmaBusMaster bm, AmlTransferSize sz, uint8_t channel); Aml *aml_sleep(uint64_t msec); Aml *aml_i2c_serial_bus_device(uint16_t address, const char *resource_source); +Aml *aml_error_device(void); /* Block AML object primitives */ Aml *aml_scope(const char *name_format, ...) 
G_GNUC_PRINTF(1, 2);
diff --git a/include/hw/acpi/generic_event_device.h b/include/hw/acpi/generic_event_device.h
index 2c5b055327..130c014d3f 100644
--- a/include/hw/acpi/generic_event_device.h
+++ b/include/hw/acpi/generic_event_device.h
@@ -103,6 +103,7 @@ OBJECT_DECLARE_TYPE(AcpiGedState, AcpiGedClass, ACPI_GED)
#define ACPI_GED_NVDIMM_HOTPLUG_EVT 0x4
#define ACPI_GED_CPU_HOTPLUG_EVT    0x8
#define ACPI_GED_PCI_HOTPLUG_EVT    0x10
+#define ACPI_GED_ERROR_EVT          0x20
typedef struct GEDState {
    MemoryRegion evt;
diff --git a/include/hw/acpi/ghes.h b/include/hw/acpi/ghes.h
index 578a582203..df2ecbf6e4 100644
--- a/include/hw/acpi/ghes.h
+++ b/include/hw/acpi/ghes.h
@@ -24,6 +24,9 @@
#include "hw/acpi/bios-linker-loader.h"
#include "qapi/error.h"
+#include "qemu/notify.h"
+
+extern NotifierList acpi_generic_error_notifiers;
/*
 * Values for Hardware Error Notification Type field
 */
@@ -57,30 +60,54 @@ enum AcpiGhesNotifyType {
    ACPI_GHES_NOTIFY_RESERVED = 12
};
-enum {
-    ACPI_HEST_SRC_ID_SEA = 0,
-    /* future ids go here */
-
-    ACPI_GHES_ERROR_SOURCE_COUNT
+/*
+ * ID numbers used to fill HEST source ID field
+ */
+enum AcpiGhesSourceID {
+    ACPI_HEST_SRC_ID_SYNC,
+    ACPI_HEST_SRC_ID_QMP, /* Use it only for QMP injected errors */
};
+typedef struct AcpiNotificationSourceId {
+    enum AcpiGhesSourceID source_id;
+    enum AcpiGhesNotifyType notify;
+} AcpiNotificationSourceId;
+
+/*
+ * AcpiGhesState stores the GPA values that will be used to fill HEST entries.
+ *
+ * When use_hest_addr is false, the GPA of the etc/hardware_errors firmware
+ * blob is stored in hw_error_le. This is the default on QEMU 9.x.
+ *
+ * When use_hest_addr is true, the GPA of the HEST table is stored at
+ * hest_addr_le. This is the default for QEMU 10.x and above.
+ *
+ * When both GPA values are zero, GHES is not present.
+ */
typedef struct AcpiGhesState {
+    uint64_t hest_addr_le;
    uint64_t hw_error_le;
-    bool present; /* True if GHES is present at all on this board */
+    bool use_hest_addr; /* True if HEST address is present */
} AcpiGhesState;
-void acpi_build_hest(GArray *table_data, GArray *hardware_errors,
+void acpi_build_hest(AcpiGhesState *ags, GArray *table_data,
+                     GArray *hardware_errors,
                     BIOSLinker *linker,
+                     const AcpiNotificationSourceId * const notif_source,
+                     int num_sources,
                     const char *oem_id, const char *oem_table_id);
void acpi_ghes_add_fw_cfg(AcpiGhesState *vms, FWCfgState *s,
                          GArray *hardware_errors);
-int acpi_ghes_memory_errors(uint16_t source_id, uint64_t error_physical_addr);
+int acpi_ghes_memory_errors(AcpiGhesState *ags, uint16_t source_id,
+                            uint64_t error_physical_addr);
+void ghes_record_cper_errors(AcpiGhesState *ags, const void *cper, size_t len,
+                             uint16_t source_id, Error **errp);
/**
- * acpi_ghes_present: Report whether ACPI GHES table is present
+ * acpi_ghes_get_state: Get a pointer to the ACPI GHES state
 *
- * Returns: true if the system has an ACPI GHES table and it is
- * safe to call acpi_ghes_memory_errors() to record a memory error.
+ * Returns: a pointer to the GHES state if the system has an ACPI GHES table,
+ * or NULL otherwise.
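+ *
+ * A minimal caller sketch (illustrative only; "paddr" stands for the
+ * faulting guest physical address):
+ *
+ *     AcpiGhesState *ags = acpi_ghes_get_state();
+ *
+ *     if (ags) {
+ *         acpi_ghes_memory_errors(ags, ACPI_HEST_SRC_ID_SYNC, paddr);
+ *     }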
*/ -bool acpi_ghes_present(void); +AcpiGhesState *acpi_ghes_get_state(void); #endif diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h index ea2cff05b0..04a09af354 100644 --- a/include/hw/arm/virt.h +++ b/include/hw/arm/virt.h @@ -33,6 +33,7 @@ #include "exec/hwaddr.h" #include "qemu/notify.h" #include "hw/boards.h" +#include "hw/acpi/ghes.h" #include "hw/arm/boot.h" #include "hw/arm/bsa.h" #include "hw/block/flash.h" @@ -174,6 +175,7 @@ struct VirtMachineState { DeviceState *gic; DeviceState *acpi_dev; Notifier powerdown_notifier; + Notifier generic_error_notifier; PCIBus *bus; char *oem_id; char *oem_table_id; diff --git a/include/hw/firmware/smbios.h b/include/hw/firmware/smbios.h index f066ab7262..3ea732f4e6 100644 --- a/include/hw/firmware/smbios.h +++ b/include/hw/firmware/smbios.h @@ -22,7 +22,7 @@ extern GArray *usr_blobs_sizes; typedef struct { const char *vendor, *version, *date; - bool have_major_minor, uefi; + bool have_major_minor, uefi, vm; uint8_t major, minor; } smbios_type0_t; extern smbios_type0_t smbios_type0; diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h index e95477e855..47730ac3c7 100644 --- a/include/hw/i386/intel_iommu.h +++ b/include/hw/i386/intel_iommu.h @@ -110,6 +110,7 @@ struct VTDAddressSpace { QLIST_ENTRY(VTDAddressSpace) next; /* Superset of notifier flags that this address space has */ IOMMUNotifierFlag notifier_flags; + IOMMUPRINotifier *pri_notifier; /* * @iova_tree traces mapped IOVA ranges. * diff --git a/include/hw/i386/x86-iommu.h b/include/hw/i386/x86-iommu.h index bfd21649d0..e89f55a5c2 100644 --- a/include/hw/i386/x86-iommu.h +++ b/include/hw/i386/x86-iommu.h @@ -64,6 +64,7 @@ struct X86IOMMUState { OnOffAuto intr_supported; /* Whether vIOMMU supports IR */ bool dt_supported; /* Whether vIOMMU supports DT */ bool pt_supported; /* Whether vIOMMU supports pass-through */ + bool dma_translation; /* Whether vIOMMU supports DMA translation */ QLIST_HEAD(, IEC_Notifier) iec_notifiers; /* IEC notify list */ }; diff --git a/include/hw/pci/pcie.h b/include/hw/pci/pcie.h index ff6ce08e13..42cebcd033 100644 --- a/include/hw/pci/pcie.h +++ b/include/hw/pci/pcie.h @@ -158,6 +158,7 @@ void pcie_pasid_init(PCIDevice *dev, uint16_t offset, uint8_t pasid_width, void pcie_pri_init(PCIDevice *dev, uint16_t offset, uint32_t outstanding_pr_cap, bool prg_response_pasid_req); +uint32_t pcie_pri_get_req_alloc(const PCIDevice *dev); bool pcie_pri_enabled(const PCIDevice *dev); bool pcie_pasid_enabled(const PCIDevice *dev); bool pcie_ats_enabled(const PCIDevice *dev); diff --git a/include/hw/pci/pcie_sriov.h b/include/hw/pci/pcie_sriov.h index aeaa38cf34..b0ea6a62c7 100644 --- a/include/hw/pci/pcie_sriov.h +++ b/include/hw/pci/pcie_sriov.h @@ -37,10 +37,6 @@ void pcie_sriov_pf_exit(PCIDevice *dev); void pcie_sriov_pf_init_vf_bar(PCIDevice *dev, int region_num, uint8_t type, dma_addr_t size); -/* Instantiate a bar for a VF */ -void pcie_sriov_vf_register_bar(PCIDevice *dev, int region_num, - MemoryRegion *memory); - /** * pcie_sriov_pf_init_from_user_created_vfs() - Initialize PF with user-created * VFs, adding ARI to PF diff --git a/include/hw/qdev-core.h b/include/hw/qdev-core.h index 530f3da702..a7bfb10dc7 100644 --- a/include/hw/qdev-core.h +++ b/include/hw/qdev-core.h @@ -1064,6 +1064,7 @@ bool qdev_set_parent_bus(DeviceState *dev, BusState *bus, Error **errp); extern bool qdev_hot_removed; char *qdev_get_dev_path(DeviceState *dev); +const char *qdev_get_printable_name(DeviceState *dev); void qbus_set_hotplug_handler(BusState *bus, 
Object *handler);
void qbus_set_bus_hotplug_handler(BusState *bus);
diff --git a/include/hw/southbridge/ich9.h b/include/hw/southbridge/ich9.h
index 1e231e89c9..2c35dd0484 100644
--- a/include/hw/southbridge/ich9.h
+++ b/include/hw/southbridge/ich9.h
@@ -95,7 +95,7 @@ struct ICH9LPCState {
#define ICH9_CC_OIC                             0x31FF
#define ICH9_CC_OIC_AEN                         0x1
#define ICH9_CC_GCS                             0x3410
-#define ICH9_CC_GCS_DEFAULT                     0x00000020
+#define ICH9_CC_GCS_DEFAULT                     0x00000000
#define ICH9_CC_GCS_NO_REBOOT                   (1 << 5)
/* D28:F[0-5] */
diff --git a/include/hw/virtio/vhost-backend.h b/include/hw/virtio/vhost-backend.h
index d6df209a2f..ff94fa1734 100644
--- a/include/hw/virtio/vhost-backend.h
+++ b/include/hw/virtio/vhost-backend.h
@@ -95,6 +95,10 @@ typedef int (*vhost_new_worker_op)(struct vhost_dev *dev,
                                   struct vhost_worker_state *worker);
typedef int (*vhost_free_worker_op)(struct vhost_dev *dev,
                                    struct vhost_worker_state *worker);
+typedef int (*vhost_set_features_ex_op)(struct vhost_dev *dev,
+                                        const uint64_t *features);
+typedef int (*vhost_get_features_ex_op)(struct vhost_dev *dev,
+                                        uint64_t *features);
typedef int (*vhost_set_features_op)(struct vhost_dev *dev,
                                     uint64_t features);
typedef int (*vhost_get_features_op)(struct vhost_dev *dev,
@@ -186,6 +190,8 @@ typedef struct VhostOps {
    vhost_free_worker_op vhost_free_worker;
    vhost_get_vring_worker_op vhost_get_vring_worker;
    vhost_attach_vring_worker_op vhost_attach_vring_worker;
+    vhost_set_features_ex_op vhost_set_features_ex;
+    vhost_get_features_ex_op vhost_get_features_ex;
    vhost_set_features_op vhost_set_features;
    vhost_get_features_op vhost_get_features;
    vhost_set_backend_cap_op vhost_set_backend_cap;
diff --git a/include/hw/virtio/vhost-user-base.h b/include/hw/virtio/vhost-user-base.h
index 51d0968b89..387e434b80 100644
--- a/include/hw/virtio/vhost-user-base.h
+++ b/include/hw/virtio/vhost-user-base.h
@@ -44,6 +44,6 @@ struct VHostUserBaseClass {
};
-#define TYPE_VHOST_USER_DEVICE "vhost-user-device"
+#define TYPE_VHOST_USER_TEST_DEVICE "vhost-user-test-device"
#endif /* QEMU_VHOST_USER_BASE_H */
diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
index 66be6afc88..08bbb4dfe9 100644
--- a/include/hw/virtio/vhost.h
+++ b/include/hw/virtio/vhost.h
@@ -107,9 +107,9 @@ struct vhost_dev {
     * future use should be discouraged and the variable retired as
     * it's easy to confuse with the VirtIO backend_features.
     */
-    uint64_t features;
-    uint64_t acked_features;
-    uint64_t backend_features;
+    VIRTIO_DECLARE_FEATURES(features);
+    VIRTIO_DECLARE_FEATURES(acked_features);
+    VIRTIO_DECLARE_FEATURES(backend_features);
    /**
     * @protocol_features: is the vhost-user only feature set by
@@ -321,6 +321,20 @@ void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n,
                          bool mask);
/**
+ * vhost_get_features_ex() - sanitize the extended features set
+ * @hdev: common vhost_dev structure
+ * @feature_bits: pointer to terminated table of feature bits
+ * @features: original feature set, filtered in place on return
+ *
+ * This is the extended variant of vhost_get_features(), supporting the
+ * extended feature set. The set is filtered to the intersection of what
+ * is supported by the vhost backend (hdev->features) and the supported
+ * feature_bits.
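+ *
+ * A minimal usage sketch (illustrative only; the feature table and the
+ * requested value are hypothetical):
+ *
+ *     static const int bits[] = { VIRTIO_NET_F_CSUM,
+ *                                 VHOST_INVALID_FEATURE_BIT };
+ *     uint64_t features[VIRTIO_FEATURES_NU64S];
+ *
+ *     virtio_features_from_u64(features, requested);
+ *     vhost_get_features_ex(hdev, bits, features);
+ *     (features now holds only the listed bits also offered by the backend)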
+ */
+void vhost_get_features_ex(struct vhost_dev *hdev,
+                           const int *feature_bits,
+                           uint64_t *features);
+/**
 * vhost_get_features() - return a sanitised set of feature bits
 * @hdev: common vhost_dev structure
 * @feature_bits: pointer to terminated table of feature bits
@@ -330,8 +344,28 @@ void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n,
 * is supported by the vhost backend (hdev->features), the supported
 * feature_bits and the requested feature set.
 */
-uint64_t vhost_get_features(struct vhost_dev *hdev, const int *feature_bits,
-                            uint64_t features);
+static inline uint64_t vhost_get_features(struct vhost_dev *hdev,
+                                          const int *feature_bits,
+                                          uint64_t features)
+{
+    uint64_t features_ex[VIRTIO_FEATURES_NU64S];
+
+    virtio_features_from_u64(features_ex, features);
+    vhost_get_features_ex(hdev, feature_bits, features_ex);
+    return features_ex[0];
+}
+
+/**
+ * vhost_ack_features_ex() - set the full set of vhost acked_features
+ * @hdev: common vhost_dev structure
+ * @feature_bits: pointer to terminated table of feature bits
+ * @features: requested feature set
+ *
+ * This sets the internal hdev->acked_features to the intersection of
+ * the backend's advertised features and the supported feature_bits.
+ */
+void vhost_ack_features_ex(struct vhost_dev *hdev, const int *feature_bits,
+                           const uint64_t *features);
/**
 * vhost_ack_features() - set vhost acked_features
@@ -342,8 +376,16 @@ uint64_t vhost_get_features(struct vhost_dev *hdev, const int *feature_bits,
 * This sets the internal hdev->acked_features to the intersection of
 * the backend's advertised features and the supported feature_bits.
 */
-void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits,
-                        uint64_t features);
+static inline void vhost_ack_features(struct vhost_dev *hdev,
+                                      const int *feature_bits,
+                                      uint64_t features)
+{
+    uint64_t features_ex[VIRTIO_FEATURES_NU64S];
+
+    virtio_features_from_u64(features_ex, features);
+    vhost_ack_features_ex(hdev, feature_bits, features_ex);
+}
+
unsigned int vhost_get_max_memslots(void);
unsigned int vhost_get_free_memslots(void);
diff --git a/include/hw/virtio/virtio-features.h b/include/hw/virtio/virtio-features.h
new file mode 100644
index 0000000000..e29b7fe48f
--- /dev/null
+++ b/include/hw/virtio/virtio-features.h
@@ -0,0 +1,126 @@
+/*
+ * Virtio features helpers
+ *
+ * Copyright 2025 Red Hat, Inc.
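+ *
+ * Feature sets are plain uint64_t[VIRTIO_FEATURES_NU64S] arrays, so the
+ * 128-bit feature space fits in two words. A minimal usage sketch
+ * (illustrative only; "legacy64" and bit 100 are arbitrary examples):
+ *
+ *     uint64_t features[VIRTIO_FEATURES_NU64S];
+ *
+ *     virtio_features_from_u64(features, legacy64);
+ *     virtio_add_feature_ex(features, 100);
+ *     assert(virtio_has_feature_ex(features, 100));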
+ * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef QEMU_VIRTIO_FEATURES_H +#define QEMU_VIRTIO_FEATURES_H + +#include "qemu/bitops.h" + +#define VIRTIO_FEATURES_FMT "%016"PRIx64"%016"PRIx64 +#define VIRTIO_FEATURES_PR(f) (f)[1], (f)[0] + +#define VIRTIO_FEATURES_MAX 128 +#define VIRTIO_FEATURES_BIT(b) BIT_ULL((b) % 64) +#define VIRTIO_FEATURES_U64(b) ((b) / 64) +#define VIRTIO_FEATURES_NU32S (VIRTIO_FEATURES_MAX / 32) +#define VIRTIO_FEATURES_NU64S (VIRTIO_FEATURES_MAX / 64) + +#define VIRTIO_DECLARE_FEATURES(name) \ + union { \ + uint64_t name; \ + uint64_t name##_ex[VIRTIO_FEATURES_NU64S]; \ + } + +#define VIRTIO_DEFINE_PROP_FEATURE(_name, _state, _field, _bit, _defval) \ + DEFINE_PROP_BIT64(_name, _state, _field[VIRTIO_FEATURES_U64(_bit)], \ + (_bit) % 64, _defval) + +static inline void virtio_features_clear(uint64_t *features) +{ + memset(features, 0, sizeof(features[0]) * VIRTIO_FEATURES_NU64S); +} + +static inline void virtio_features_from_u64(uint64_t *features, uint64_t from) +{ + virtio_features_clear(features); + features[0] = from; +} + +static inline bool virtio_has_feature_ex(const uint64_t *features, + unsigned int fbit) +{ + assert(fbit < VIRTIO_FEATURES_MAX); + return features[VIRTIO_FEATURES_U64(fbit)] & VIRTIO_FEATURES_BIT(fbit); +} + +static inline void virtio_add_feature_ex(uint64_t *features, + unsigned int fbit) +{ + assert(fbit < VIRTIO_FEATURES_MAX); + features[VIRTIO_FEATURES_U64(fbit)] |= VIRTIO_FEATURES_BIT(fbit); +} + +static inline void virtio_clear_feature_ex(uint64_t *features, + unsigned int fbit) +{ + assert(fbit < VIRTIO_FEATURES_MAX); + features[VIRTIO_FEATURES_U64(fbit)] &= ~VIRTIO_FEATURES_BIT(fbit); +} + +static inline bool virtio_features_equal(const uint64_t *f1, + const uint64_t *f2) +{ + return !memcmp(f1, f2, sizeof(uint64_t) * VIRTIO_FEATURES_NU64S); +} + +static inline bool virtio_features_use_ex(const uint64_t *features) +{ + int i; + + for (i = 1; i < VIRTIO_FEATURES_NU64S; ++i) { + if (features[i]) { + return true; + } + } + return false; +} + +static inline bool virtio_features_empty(const uint64_t *features) +{ + return !virtio_features_use_ex(features) && !features[0]; +} + +static inline void virtio_features_copy(uint64_t *to, const uint64_t *from) +{ + memcpy(to, from, sizeof(to[0]) * VIRTIO_FEATURES_NU64S); +} + +static inline bool virtio_features_andnot(uint64_t *to, const uint64_t *f1, + const uint64_t *f2) +{ + uint64_t diff = 0; + int i; + + for (i = 0; i < VIRTIO_FEATURES_NU64S; i++) { + to[i] = f1[i] & ~f2[i]; + diff |= to[i]; + } + return diff; +} + +static inline void virtio_features_and(uint64_t *to, const uint64_t *f1, + const uint64_t *f2) +{ + int i; + + for (i = 0; i < VIRTIO_FEATURES_NU64S; i++) { + to[i] = f1[i] & f2[i]; + } +} + +static inline void virtio_features_or(uint64_t *to, const uint64_t *f1, + const uint64_t *f2) +{ + int i; + + for (i = 0; i < VIRTIO_FEATURES_NU64S; i++) { + to[i] = f1[i] | f2[i]; + } +} + +#endif diff --git a/include/hw/virtio/virtio-net.h b/include/hw/virtio/virtio-net.h index 73fdefc0dc..5b8ab7bda7 100644 --- a/include/hw/virtio/virtio-net.h +++ b/include/hw/virtio/virtio-net.h @@ -182,7 +182,7 @@ struct VirtIONet { uint32_t has_vnet_hdr; size_t host_hdr_len; size_t guest_hdr_len; - uint64_t host_features; + VIRTIO_DECLARE_FEATURES(host_features); uint32_t rsc_timeout; uint8_t rsc4_enabled; uint8_t rsc6_enabled; diff --git a/include/hw/virtio/virtio-pci.h b/include/hw/virtio/virtio-pci.h index eab5394898..639752977e 100644 --- a/include/hw/virtio/virtio-pci.h +++ 
b/include/hw/virtio/virtio-pci.h @@ -158,7 +158,7 @@ struct VirtIOPCIProxy { uint32_t nvectors; uint32_t dfselect; uint32_t gfselect; - uint32_t guest_features[2]; + uint32_t guest_features[VIRTIO_FEATURES_NU32S]; VirtIOPCIQueue vqs[VIRTIO_QUEUE_MAX]; VirtIOIRQFD *vector_irqfd; diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h index c594764f23..d97529c3f1 100644 --- a/include/hw/virtio/virtio.h +++ b/include/hw/virtio/virtio.h @@ -16,6 +16,7 @@ #include "system/memory.h" #include "hw/qdev-core.h" +#include "hw/virtio/virtio-features.h" #include "net/net.h" #include "migration/vmstate.h" #include "qemu/event_notifier.h" @@ -121,9 +122,9 @@ struct VirtIODevice * backend (e.g. vhost) and could potentially be a subset of the * total feature set offered by QEMU. */ - uint64_t host_features; - uint64_t guest_features; - uint64_t backend_features; + VIRTIO_DECLARE_FEATURES(host_features); + VIRTIO_DECLARE_FEATURES(guest_features); + VIRTIO_DECLARE_FEATURES(backend_features); size_t config_len; void *config; @@ -177,6 +178,9 @@ struct VirtioDeviceClass { /* This is what a VirtioDevice must implement */ DeviceRealize realize; DeviceUnrealize unrealize; + void (*get_features_ex)(VirtIODevice *vdev, uint64_t *requested_features, + Error **errp); + void (*set_features_ex)(VirtIODevice *vdev, const uint64_t *val); uint64_t (*get_features)(VirtIODevice *vdev, uint64_t requested_features, Error **errp); @@ -290,7 +294,6 @@ int virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, unsigned int *out_bytes, unsigned max_in_bytes, unsigned max_out_bytes); -void virtio_notify_irqfd(VirtIODevice *vdev, VirtQueue *vq); void virtio_notify(VirtIODevice *vdev, VirtQueue *vq); int virtio_save(VirtIODevice *vdev, QEMUFile *f); @@ -372,6 +375,7 @@ void virtio_queue_reset(VirtIODevice *vdev, uint32_t queue_index); void virtio_queue_enable(VirtIODevice *vdev, uint32_t queue_index); void virtio_update_irq(VirtIODevice *vdev); int virtio_set_features(VirtIODevice *vdev, uint64_t val); +int virtio_set_features_ex(VirtIODevice *vdev, const uint64_t *val); /* Base devices. 
*/ typedef struct VirtIOBlkConf VirtIOBlkConf; diff --git a/include/net/net.h b/include/net/net.h index 84ee18e0f9..72b476ee1d 100644 --- a/include/net/net.h +++ b/include/net/net.h @@ -35,6 +35,18 @@ typedef struct NICConf { int32_t bootindex; } NICConf; +typedef struct NetOffloads { + bool csum; + bool tso4; + bool tso6; + bool ecn; + bool ufo; + bool uso4; + bool uso6; + bool tnl; + bool tnl_csum; +} NetOffloads; + #define DEFINE_NIC_PROPERTIES(_state, _conf) \ DEFINE_PROP_MACADDR("mac", _state, _conf.macaddr), \ DEFINE_PROP_NETDEV("netdev", _state, _conf.peers) @@ -55,9 +67,10 @@ typedef void (NetClientDestructor)(NetClientState *); typedef RxFilterInfo *(QueryRxFilter)(NetClientState *); typedef bool (HasUfo)(NetClientState *); typedef bool (HasUso)(NetClientState *); +typedef bool (HasTunnel)(NetClientState *); typedef bool (HasVnetHdr)(NetClientState *); typedef bool (HasVnetHdrLen)(NetClientState *, int); -typedef void (SetOffload)(NetClientState *, int, int, int, int, int, int, int); +typedef void (SetOffload)(NetClientState *, const NetOffloads *); typedef int (GetVnetHdrLen)(NetClientState *); typedef void (SetVnetHdrLen)(NetClientState *, int); typedef bool (GetVnetHashSupportedTypes)(NetClientState *, uint32_t *); @@ -85,6 +98,7 @@ typedef struct NetClientInfo { NetPoll *poll; HasUfo *has_ufo; HasUso *has_uso; + HasTunnel *has_tunnel; HasVnetHdr *has_vnet_hdr; HasVnetHdrLen *has_vnet_hdr_len; SetOffload *set_offload; @@ -187,10 +201,10 @@ void qemu_set_info_str(NetClientState *nc, void qemu_format_nic_info_str(NetClientState *nc, uint8_t macaddr[6]); bool qemu_has_ufo(NetClientState *nc); bool qemu_has_uso(NetClientState *nc); +bool qemu_has_tunnel(NetClientState *nc); bool qemu_has_vnet_hdr(NetClientState *nc); bool qemu_has_vnet_hdr_len(NetClientState *nc, int len); -void qemu_set_offload(NetClientState *nc, int csum, int tso4, int tso6, - int ecn, int ufo, int uso4, int uso6); +void qemu_set_offload(NetClientState *nc, const NetOffloads *ol); int qemu_get_vnet_hdr_len(NetClientState *nc); void qemu_set_vnet_hdr_len(NetClientState *nc, int len); bool qemu_get_vnet_hash_supported_types(NetClientState *nc, uint32_t *types); diff --git a/include/net/vhost_net.h b/include/net/vhost_net.h index 879781dad7..0225207491 100644 --- a/include/net/vhost_net.h +++ b/include/net/vhost_net.h @@ -2,6 +2,7 @@ #define VHOST_NET_H #include "net/net.h" +#include "hw/virtio/virtio-features.h" #include "hw/virtio/vhost-backend.h" struct vhost_net; @@ -33,8 +34,26 @@ void vhost_net_stop(VirtIODevice *dev, NetClientState *ncs, void vhost_net_cleanup(VHostNetState *net); -uint64_t vhost_net_get_features(VHostNetState *net, uint64_t features); -void vhost_net_ack_features(VHostNetState *net, uint64_t features); +void vhost_net_get_features_ex(VHostNetState *net, uint64_t *features); +static inline uint64_t vhost_net_get_features(VHostNetState *net, + uint64_t features) +{ + uint64_t features_array[VIRTIO_FEATURES_NU64S]; + + virtio_features_from_u64(features_array, features); + vhost_net_get_features_ex(net, features_array); + return features_array[0]; +} + +void vhost_net_ack_features_ex(VHostNetState *net, const uint64_t *features); +static inline void vhost_net_ack_features(VHostNetState *net, + uint64_t features) +{ + uint64_t features_array[VIRTIO_FEATURES_NU64S]; + + virtio_features_from_u64(features_array, features); + vhost_net_ack_features_ex(net, features_array); +} int vhost_net_get_config(struct vhost_net *net, uint8_t *config, uint32_t config_len); @@ -51,7 +70,15 @@ VHostNetState 
*get_vhost_net(NetClientState *nc); int vhost_net_set_vring_enable(NetClientState *nc, int enable); -uint64_t vhost_net_get_acked_features(VHostNetState *net); +void vhost_net_get_acked_features_ex(VHostNetState *net, uint64_t *features); +static inline uint64_t vhost_net_get_acked_features(VHostNetState *net) +{ + uint64_t features[VIRTIO_FEATURES_NU64S]; + + vhost_net_get_acked_features_ex(net, features); + assert(!virtio_features_use_ex(features)); + return features[0]; +} int vhost_net_set_mtu(struct vhost_net *net, uint16_t mtu); diff --git a/include/standard-headers/drm/drm_fourcc.h b/include/standard-headers/drm/drm_fourcc.h index c8309d378b..cef077dfb3 100644 --- a/include/standard-headers/drm/drm_fourcc.h +++ b/include/standard-headers/drm/drm_fourcc.h @@ -209,6 +209,10 @@ extern "C" { #define DRM_FORMAT_RGBA1010102 fourcc_code('R', 'A', '3', '0') /* [31:0] R:G:B:A 10:10:10:2 little endian */ #define DRM_FORMAT_BGRA1010102 fourcc_code('B', 'A', '3', '0') /* [31:0] B:G:R:A 10:10:10:2 little endian */ +/* 48 bpp RGB */ +#define DRM_FORMAT_RGB161616 fourcc_code('R', 'G', '4', '8') /* [47:0] R:G:B 16:16:16 little endian */ +#define DRM_FORMAT_BGR161616 fourcc_code('B', 'G', '4', '8') /* [47:0] B:G:R 16:16:16 little endian */ + /* 64 bpp RGB */ #define DRM_FORMAT_XRGB16161616 fourcc_code('X', 'R', '4', '8') /* [63:0] x:R:G:B 16:16:16:16 little endian */ #define DRM_FORMAT_XBGR16161616 fourcc_code('X', 'B', '4', '8') /* [63:0] x:B:G:R 16:16:16:16 little endian */ @@ -217,7 +221,7 @@ extern "C" { #define DRM_FORMAT_ABGR16161616 fourcc_code('A', 'B', '4', '8') /* [63:0] A:B:G:R 16:16:16:16 little endian */ /* - * Floating point 64bpp RGB + * Half-Floating point - 16b/component * IEEE 754-2008 binary16 half-precision float * [15:0] sign:exponent:mantissa 1:5:10 */ @@ -227,6 +231,20 @@ extern "C" { #define DRM_FORMAT_ARGB16161616F fourcc_code('A', 'R', '4', 'H') /* [63:0] A:R:G:B 16:16:16:16 little endian */ #define DRM_FORMAT_ABGR16161616F fourcc_code('A', 'B', '4', 'H') /* [63:0] A:B:G:R 16:16:16:16 little endian */ +#define DRM_FORMAT_R16F fourcc_code('R', ' ', ' ', 'H') /* [15:0] R 16 little endian */ +#define DRM_FORMAT_GR1616F fourcc_code('G', 'R', ' ', 'H') /* [31:0] G:R 16:16 little endian */ +#define DRM_FORMAT_BGR161616F fourcc_code('B', 'G', 'R', 'H') /* [47:0] B:G:R 16:16:16 little endian */ + +/* + * Floating point - 32b/component + * IEEE 754-2008 binary32 float + * [31:0] sign:exponent:mantissa 1:8:23 + */ +#define DRM_FORMAT_R32F fourcc_code('R', ' ', ' ', 'F') /* [31:0] R 32 little endian */ +#define DRM_FORMAT_GR3232F fourcc_code('G', 'R', ' ', 'F') /* [63:0] R:G 32:32 little endian */ +#define DRM_FORMAT_BGR323232F fourcc_code('B', 'G', 'R', 'F') /* [95:0] R:G:B 32:32:32 little endian */ +#define DRM_FORMAT_ABGR32323232F fourcc_code('A', 'B', '8', 'F') /* [127:0] R:G:B:A 32:32:32:32 little endian */ + /* * RGBA format with 10-bit components packed in 64-bit per pixel, with 6 bits * of unused padding per component: @@ -377,6 +395,42 @@ extern "C" { #define DRM_FORMAT_Q401 fourcc_code('Q', '4', '0', '1') /* + * 3 plane YCbCr LSB aligned + * In order to use these formats in a similar fashion to MSB aligned ones + * implementation can multiply the values by 2^6=64. For that reason the padding + * must only contain zeros. 
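+ * (Editorial example: a raw 10-bit sample 0x2a1 stored LSB-aligned reads as
+ * 0x02a1; multiplying by 2^6=64 yields 0xa840, its MSB-aligned equivalent.)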
+ * index 0 = Y plane, [15:0] z:Y [6:10] little endian + * index 1 = Cr plane, [15:0] z:Cr [6:10] little endian + * index 2 = Cb plane, [15:0] z:Cb [6:10] little endian + */ +#define DRM_FORMAT_S010 fourcc_code('S', '0', '1', '0') /* 2x2 subsampled Cb (1) and Cr (2) planes 10 bits per channel */ +#define DRM_FORMAT_S210 fourcc_code('S', '2', '1', '0') /* 2x1 subsampled Cb (1) and Cr (2) planes 10 bits per channel */ +#define DRM_FORMAT_S410 fourcc_code('S', '4', '1', '0') /* non-subsampled Cb (1) and Cr (2) planes 10 bits per channel */ + +/* + * 3 plane YCbCr LSB aligned + * In order to use these formats in a similar fashion to MSB aligned ones + * implementation can multiply the values by 2^4=16. For that reason the padding + * must only contain zeros. + * index 0 = Y plane, [15:0] z:Y [4:12] little endian + * index 1 = Cr plane, [15:0] z:Cr [4:12] little endian + * index 2 = Cb plane, [15:0] z:Cb [4:12] little endian + */ +#define DRM_FORMAT_S012 fourcc_code('S', '0', '1', '2') /* 2x2 subsampled Cb (1) and Cr (2) planes 12 bits per channel */ +#define DRM_FORMAT_S212 fourcc_code('S', '2', '1', '2') /* 2x1 subsampled Cb (1) and Cr (2) planes 12 bits per channel */ +#define DRM_FORMAT_S412 fourcc_code('S', '4', '1', '2') /* non-subsampled Cb (1) and Cr (2) planes 12 bits per channel */ + +/* + * 3 plane YCbCr + * index 0 = Y plane, [15:0] Y little endian + * index 1 = Cr plane, [15:0] Cr little endian + * index 2 = Cb plane, [15:0] Cb little endian + */ +#define DRM_FORMAT_S016 fourcc_code('S', '0', '1', '6') /* 2x2 subsampled Cb (1) and Cr (2) planes 16 bits per channel */ +#define DRM_FORMAT_S216 fourcc_code('S', '2', '1', '6') /* 2x1 subsampled Cb (1) and Cr (2) planes 16 bits per channel */ +#define DRM_FORMAT_S416 fourcc_code('S', '4', '1', '6') /* non-subsampled Cb (1) and Cr (2) planes 16 bits per channel */ + +/* * 3 plane YCbCr * index 0: Y plane, [7:0] Y * index 1: Cb plane, [7:0] Cb diff --git a/include/standard-headers/linux/ethtool.h b/include/standard-headers/linux/ethtool.h index cef0d207a6..eb80314028 100644 --- a/include/standard-headers/linux/ethtool.h +++ b/include/standard-headers/linux/ethtool.h @@ -2314,7 +2314,7 @@ enum { IPV6_USER_FLOW = 0x0e, /* spec only (usr_ip6_spec; nfc only) */ IPV4_FLOW = 0x10, /* hash only */ IPV6_FLOW = 0x11, /* hash only */ - ETHER_FLOW = 0x12, /* spec only (ether_spec) */ + ETHER_FLOW = 0x12, /* hash or spec (ether_spec) */ /* Used for GTP-U IPv4 and IPv6. 
* The format of GTP packets only includes @@ -2371,7 +2371,7 @@ enum { /* Flag to enable RSS spreading of traffic matching rule (nfc only) */ #define FLOW_RSS 0x20000000 -/* L3-L4 network traffic flow hash options */ +/* L2-L4 network traffic flow hash options */ #define RXH_L2DA (1 << 1) #define RXH_VLAN (1 << 2) #define RXH_L3_PROTO (1 << 3) diff --git a/include/standard-headers/linux/input-event-codes.h b/include/standard-headers/linux/input-event-codes.h index a82ff795e0..00dc9caac9 100644 --- a/include/standard-headers/linux/input-event-codes.h +++ b/include/standard-headers/linux/input-event-codes.h @@ -601,6 +601,11 @@ #define BTN_DPAD_LEFT 0x222 #define BTN_DPAD_RIGHT 0x223 +#define BTN_GRIPL 0x224 +#define BTN_GRIPR 0x225 +#define BTN_GRIPL2 0x226 +#define BTN_GRIPR2 0x227 + #define KEY_ALS_TOGGLE 0x230 /* Ambient light sensor */ #define KEY_ROTATE_LOCK_TOGGLE 0x231 /* Display rotation lock */ #define KEY_REFRESH_RATE_TOGGLE 0x232 /* Display refresh rate toggle */ @@ -765,6 +770,9 @@ #define KEY_KBD_LCD_MENU4 0x2bb #define KEY_KBD_LCD_MENU5 0x2bc +/* Performance Boost key (Alienware)/G-Mode key (Dell) */ +#define KEY_PERFORMANCE 0x2bd + #define BTN_TRIGGER_HAPPY 0x2c0 #define BTN_TRIGGER_HAPPY1 0x2c0 #define BTN_TRIGGER_HAPPY2 0x2c1 diff --git a/include/standard-headers/linux/input.h b/include/standard-headers/linux/input.h index 942ea6aaa9..d4512c20b5 100644 --- a/include/standard-headers/linux/input.h +++ b/include/standard-headers/linux/input.h @@ -272,6 +272,7 @@ struct input_mask { #define BUS_CEC 0x1E #define BUS_INTEL_ISHTP 0x1F #define BUS_AMD_SFH 0x20 +#define BUS_SDW 0x21 /* * MT_TOOL types diff --git a/include/standard-headers/linux/pci_regs.h b/include/standard-headers/linux/pci_regs.h index a3a3e942de..f5b17745de 100644 --- a/include/standard-headers/linux/pci_regs.h +++ b/include/standard-headers/linux/pci_regs.h @@ -745,6 +745,7 @@ #define PCI_EXT_CAP_ID_L1SS 0x1E /* L1 PM Substates */ #define PCI_EXT_CAP_ID_PTM 0x1F /* Precision Time Measurement */ #define PCI_EXT_CAP_ID_DVSEC 0x23 /* Designated Vendor-Specific */ +#define PCI_EXT_CAP_ID_VF_REBAR 0x24 /* VF Resizable BAR */ #define PCI_EXT_CAP_ID_DLF 0x25 /* Data Link Feature */ #define PCI_EXT_CAP_ID_PL_16GT 0x26 /* Physical Layer 16.0 GT/s */ #define PCI_EXT_CAP_ID_NPEM 0x29 /* Native PCIe Enclosure Management */ @@ -1141,6 +1142,14 @@ #define PCI_DVSEC_HEADER2 0x8 /* Designated Vendor-Specific Header2 */ #define PCI_DVSEC_HEADER2_ID(x) ((x) & 0xffff) +/* VF Resizable BARs, same layout as PCI_REBAR */ +#define PCI_VF_REBAR_CAP PCI_REBAR_CAP +#define PCI_VF_REBAR_CAP_SIZES PCI_REBAR_CAP_SIZES +#define PCI_VF_REBAR_CTRL PCI_REBAR_CTRL +#define PCI_VF_REBAR_CTRL_BAR_IDX PCI_REBAR_CTRL_BAR_IDX +#define PCI_VF_REBAR_CTRL_NBAR_MASK PCI_REBAR_CTRL_NBAR_MASK +#define PCI_VF_REBAR_CTRL_BAR_SIZE PCI_REBAR_CTRL_BAR_SIZE + /* Data Link Feature */ #define PCI_DLF_CAP 0x04 /* Capabilities Register */ #define PCI_DLF_EXCHANGE_ENABLE 0x80000000 /* Data Link Feature Exchange Enable */ diff --git a/include/standard-headers/linux/vhost_types.h b/include/standard-headers/linux/vhost_types.h index fd54044936..79b53a931a 100644 --- a/include/standard-headers/linux/vhost_types.h +++ b/include/standard-headers/linux/vhost_types.h @@ -110,6 +110,11 @@ struct vhost_msg_v2 { }; }; +struct vhost_features_array { + uint64_t count; /* number of entries present in features array */ + uint64_t features[] ; +}; + struct vhost_memory_region { uint64_t guest_phys_addr; uint64_t memory_size; /* bytes */ diff --git 
a/include/standard-headers/linux/virtio_net.h b/include/standard-headers/linux/virtio_net.h index 982e854f14..93abaae0b9 100644 --- a/include/standard-headers/linux/virtio_net.h +++ b/include/standard-headers/linux/virtio_net.h @@ -70,6 +70,28 @@ * with the same MAC. */ #define VIRTIO_NET_F_SPEED_DUPLEX 63 /* Device set linkspeed and duplex */ +#define VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO 65 /* Driver can receive + * GSO-over-UDP-tunnel packets + */ +#define VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_CSUM 66 /* Driver handles + * GSO-over-UDP-tunnel + * packets with partial csum + * for the outer header + */ +#define VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO 67 /* Device can receive + * GSO-over-UDP-tunnel packets + */ +#define VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO_CSUM 68 /* Device handles + * GSO-over-UDP-tunnel + * packets with partial csum + * for the outer header + */ + +/* Offloads bits corresponding to VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO{,_CSUM} + * features + */ +#define VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_MAPPED 46 +#define VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_CSUM_MAPPED 47 #ifndef VIRTIO_NET_NO_LEGACY #define VIRTIO_NET_F_GSO 6 /* Host handles pkts w/ any GSO type */ @@ -131,12 +153,17 @@ struct virtio_net_hdr_v1 { #define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 /* Use csum_start, csum_offset */ #define VIRTIO_NET_HDR_F_DATA_VALID 2 /* Csum is valid */ #define VIRTIO_NET_HDR_F_RSC_INFO 4 /* rsc info in csum_ fields */ +#define VIRTIO_NET_HDR_F_UDP_TUNNEL_CSUM 8 /* UDP tunnel csum offload */ uint8_t flags; #define VIRTIO_NET_HDR_GSO_NONE 0 /* Not a GSO frame */ #define VIRTIO_NET_HDR_GSO_TCPV4 1 /* GSO frame, IPv4 TCP (TSO) */ #define VIRTIO_NET_HDR_GSO_UDP 3 /* GSO frame, IPv4 UDP (UFO) */ #define VIRTIO_NET_HDR_GSO_TCPV6 4 /* GSO frame, IPv6 TCP */ #define VIRTIO_NET_HDR_GSO_UDP_L4 5 /* GSO frame, IPv4& IPv6 UDP (USO) */ +#define VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV4 0x20 /* UDPv4 tunnel present */ +#define VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV6 0x40 /* UDPv6 tunnel present */ +#define VIRTIO_NET_HDR_GSO_UDP_TUNNEL (VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV4 | \ + VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV6) #define VIRTIO_NET_HDR_GSO_ECN 0x80 /* TCP has ECN set */ uint8_t gso_type; __virtio16 hdr_len; /* Ethernet + IP + tcp/udp hdrs */ @@ -181,6 +208,12 @@ struct virtio_net_hdr_v1_hash { uint16_t padding; }; +struct virtio_net_hdr_v1_hash_tunnel { + struct virtio_net_hdr_v1_hash hash_hdr; + uint16_t outer_th_offset; + uint16_t inner_nh_offset; +}; + #ifndef VIRTIO_NET_NO_LEGACY /* This header comes first in the scatter-gather list. * For legacy virtio, if VIRTIO_F_ANY_LAYOUT is not negotiated, it must diff --git a/linux-headers/LICENSES/preferred/GPL-2.0 b/linux-headers/LICENSES/preferred/GPL-2.0 index ff0812fd89..ea8e93dc44 100644 --- a/linux-headers/LICENSES/preferred/GPL-2.0 +++ b/linux-headers/LICENSES/preferred/GPL-2.0 @@ -20,8 +20,8 @@ License-Text: GNU GENERAL PUBLIC LICENSE Version 2, June 1991 - Copyright (C) 1989, 1991 Free Software Foundation, Inc. - 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + <https://fsf.org/> Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. @@ -322,10 +322,8 @@ the "copyright" line and a pointer to where the full notice is found. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
- You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - + You should have received a copy of the GNU General Public License along + with this program; if not, see <https://www.gnu.org/licenses/>. Also add information on how to contact you by electronic and paper mail. diff --git a/linux-headers/asm-arm64/unistd_64.h b/linux-headers/asm-arm64/unistd_64.h index ee9aaebdf3..4ae25c2b91 100644 --- a/linux-headers/asm-arm64/unistd_64.h +++ b/linux-headers/asm-arm64/unistd_64.h @@ -324,6 +324,8 @@ #define __NR_listxattrat 465 #define __NR_removexattrat 466 #define __NR_open_tree_attr 467 +#define __NR_file_getattr 468 +#define __NR_file_setattr 469 #endif /* _ASM_UNISTD_64_H */ diff --git a/linux-headers/asm-generic/unistd.h b/linux-headers/asm-generic/unistd.h index 2892a45023..04e0077fb4 100644 --- a/linux-headers/asm-generic/unistd.h +++ b/linux-headers/asm-generic/unistd.h @@ -852,8 +852,14 @@ __SYSCALL(__NR_removexattrat, sys_removexattrat) #define __NR_open_tree_attr 467 __SYSCALL(__NR_open_tree_attr, sys_open_tree_attr) +/* fs/inode.c */ +#define __NR_file_getattr 468 +__SYSCALL(__NR_file_getattr, sys_file_getattr) +#define __NR_file_setattr 469 +__SYSCALL(__NR_file_setattr, sys_file_setattr) + #undef __NR_syscalls -#define __NR_syscalls 468 +#define __NR_syscalls 470 /* * 32 bit systems traditionally used different diff --git a/linux-headers/asm-loongarch/unistd_64.h b/linux-headers/asm-loongarch/unistd_64.h index 50d22df8f7..5033fc8f2f 100644 --- a/linux-headers/asm-loongarch/unistd_64.h +++ b/linux-headers/asm-loongarch/unistd_64.h @@ -320,6 +320,8 @@ #define __NR_listxattrat 465 #define __NR_removexattrat 466 #define __NR_open_tree_attr 467 +#define __NR_file_getattr 468 +#define __NR_file_setattr 469 #endif /* _ASM_UNISTD_64_H */ diff --git a/linux-headers/asm-mips/unistd_n32.h b/linux-headers/asm-mips/unistd_n32.h index bdcc2f460b..c99c10e5bf 100644 --- a/linux-headers/asm-mips/unistd_n32.h +++ b/linux-headers/asm-mips/unistd_n32.h @@ -396,5 +396,7 @@ #define __NR_listxattrat (__NR_Linux + 465) #define __NR_removexattrat (__NR_Linux + 466) #define __NR_open_tree_attr (__NR_Linux + 467) +#define __NR_file_getattr (__NR_Linux + 468) +#define __NR_file_setattr (__NR_Linux + 469) #endif /* _ASM_UNISTD_N32_H */ diff --git a/linux-headers/asm-mips/unistd_n64.h b/linux-headers/asm-mips/unistd_n64.h index 3b6b0193b6..0d975bb185 100644 --- a/linux-headers/asm-mips/unistd_n64.h +++ b/linux-headers/asm-mips/unistd_n64.h @@ -372,5 +372,7 @@ #define __NR_listxattrat (__NR_Linux + 465) #define __NR_removexattrat (__NR_Linux + 466) #define __NR_open_tree_attr (__NR_Linux + 467) +#define __NR_file_getattr (__NR_Linux + 468) +#define __NR_file_setattr (__NR_Linux + 469) #endif /* _ASM_UNISTD_N64_H */ diff --git a/linux-headers/asm-mips/unistd_o32.h b/linux-headers/asm-mips/unistd_o32.h index 4609a4b4d3..86ac0ac84b 100644 --- a/linux-headers/asm-mips/unistd_o32.h +++ b/linux-headers/asm-mips/unistd_o32.h @@ -442,5 +442,7 @@ #define __NR_listxattrat (__NR_Linux + 465) #define __NR_removexattrat (__NR_Linux + 466) #define __NR_open_tree_attr (__NR_Linux + 467) +#define __NR_file_getattr (__NR_Linux + 468) +#define __NR_file_setattr (__NR_Linux + 469) #endif /* _ASM_UNISTD_O32_H */ diff --git a/linux-headers/asm-powerpc/kvm.h b/linux-headers/asm-powerpc/kvm.h index eaeda00178..077c5437f5 100644 --- a/linux-headers/asm-powerpc/kvm.h +++ 
b/linux-headers/asm-powerpc/kvm.h @@ -1,18 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License, version 2, as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - * * Copyright IBM Corp. 2007 * * Authors: Hollis Blanchard <hollisb@us.ibm.com> diff --git a/linux-headers/asm-powerpc/unistd_32.h b/linux-headers/asm-powerpc/unistd_32.h index 5d38a427e0..d7a32c5e06 100644 --- a/linux-headers/asm-powerpc/unistd_32.h +++ b/linux-headers/asm-powerpc/unistd_32.h @@ -449,6 +449,8 @@ #define __NR_listxattrat 465 #define __NR_removexattrat 466 #define __NR_open_tree_attr 467 +#define __NR_file_getattr 468 +#define __NR_file_setattr 469 #endif /* _ASM_UNISTD_32_H */ diff --git a/linux-headers/asm-powerpc/unistd_64.h b/linux-headers/asm-powerpc/unistd_64.h index 860a488e4d..ff35c51fc6 100644 --- a/linux-headers/asm-powerpc/unistd_64.h +++ b/linux-headers/asm-powerpc/unistd_64.h @@ -421,6 +421,8 @@ #define __NR_listxattrat 465 #define __NR_removexattrat 466 #define __NR_open_tree_attr 467 +#define __NR_file_getattr 468 +#define __NR_file_setattr 469 #endif /* _ASM_UNISTD_64_H */ diff --git a/linux-headers/asm-riscv/kvm.h b/linux-headers/asm-riscv/kvm.h index 5f59fd226c..ef27d4289d 100644 --- a/linux-headers/asm-riscv/kvm.h +++ b/linux-headers/asm-riscv/kvm.h @@ -18,6 +18,7 @@ #define __KVM_HAVE_IRQ_LINE #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 +#define KVM_DIRTY_LOG_PAGE_OFFSET 64 #define KVM_INTERRUPT_SET -1U #define KVM_INTERRUPT_UNSET -2U diff --git a/linux-headers/asm-riscv/unistd_32.h b/linux-headers/asm-riscv/unistd_32.h index a5e769f1d9..6083373e88 100644 --- a/linux-headers/asm-riscv/unistd_32.h +++ b/linux-headers/asm-riscv/unistd_32.h @@ -315,6 +315,8 @@ #define __NR_listxattrat 465 #define __NR_removexattrat 466 #define __NR_open_tree_attr 467 +#define __NR_file_getattr 468 +#define __NR_file_setattr 469 #endif /* _ASM_UNISTD_32_H */ diff --git a/linux-headers/asm-riscv/unistd_64.h b/linux-headers/asm-riscv/unistd_64.h index 8df4d64841..f0c7585c60 100644 --- a/linux-headers/asm-riscv/unistd_64.h +++ b/linux-headers/asm-riscv/unistd_64.h @@ -325,6 +325,8 @@ #define __NR_listxattrat 465 #define __NR_removexattrat 466 #define __NR_open_tree_attr 467 +#define __NR_file_getattr 468 +#define __NR_file_setattr 469 #endif /* _ASM_UNISTD_64_H */ diff --git a/linux-headers/asm-s390/unistd_32.h b/linux-headers/asm-s390/unistd_32.h index 85eedbd18e..37b8f6f358 100644 --- a/linux-headers/asm-s390/unistd_32.h +++ b/linux-headers/asm-s390/unistd_32.h @@ -440,5 +440,7 @@ #define __NR_listxattrat 465 #define __NR_removexattrat 466 #define __NR_open_tree_attr 467 +#define __NR_file_getattr 468 +#define __NR_file_setattr 469 #endif /* _ASM_S390_UNISTD_32_H */ diff --git a/linux-headers/asm-s390/unistd_64.h b/linux-headers/asm-s390/unistd_64.h index c03b1b9701..0652ba6331 100644 --- a/linux-headers/asm-s390/unistd_64.h +++ b/linux-headers/asm-s390/unistd_64.h @@ -388,5 +388,7 @@ #define __NR_listxattrat 465 
#define __NR_removexattrat 466 #define __NR_open_tree_attr 467 +#define __NR_file_getattr 468 +#define __NR_file_setattr 469 #endif /* _ASM_S390_UNISTD_64_H */ diff --git a/linux-headers/asm-x86/unistd_32.h b/linux-headers/asm-x86/unistd_32.h index 491d6b4eb6..8f784a5634 100644 --- a/linux-headers/asm-x86/unistd_32.h +++ b/linux-headers/asm-x86/unistd_32.h @@ -458,6 +458,8 @@ #define __NR_listxattrat 465 #define __NR_removexattrat 466 #define __NR_open_tree_attr 467 +#define __NR_file_getattr 468 +#define __NR_file_setattr 469 #endif /* _ASM_UNISTD_32_H */ diff --git a/linux-headers/asm-x86/unistd_64.h b/linux-headers/asm-x86/unistd_64.h index 7cf88bf9bd..2f55bebb81 100644 --- a/linux-headers/asm-x86/unistd_64.h +++ b/linux-headers/asm-x86/unistd_64.h @@ -381,6 +381,8 @@ #define __NR_listxattrat 465 #define __NR_removexattrat 466 #define __NR_open_tree_attr 467 +#define __NR_file_getattr 468 +#define __NR_file_setattr 469 #endif /* _ASM_UNISTD_64_H */ diff --git a/linux-headers/asm-x86/unistd_x32.h b/linux-headers/asm-x86/unistd_x32.h index 82959111e6..8cc8673f15 100644 --- a/linux-headers/asm-x86/unistd_x32.h +++ b/linux-headers/asm-x86/unistd_x32.h @@ -334,6 +334,8 @@ #define __NR_listxattrat (__X32_SYSCALL_BIT + 465) #define __NR_removexattrat (__X32_SYSCALL_BIT + 466) #define __NR_open_tree_attr (__X32_SYSCALL_BIT + 467) +#define __NR_file_getattr (__X32_SYSCALL_BIT + 468) +#define __NR_file_setattr (__X32_SYSCALL_BIT + 469) #define __NR_rt_sigaction (__X32_SYSCALL_BIT + 512) #define __NR_rt_sigreturn (__X32_SYSCALL_BIT + 513) #define __NR_ioctl (__X32_SYSCALL_BIT + 514) diff --git a/linux-headers/linux/iommufd.h b/linux-headers/linux/iommufd.h index cb0f7d6b4d..2105a03955 100644 --- a/linux-headers/linux/iommufd.h +++ b/linux-headers/linux/iommufd.h @@ -56,6 +56,7 @@ enum { IOMMUFD_CMD_VDEVICE_ALLOC = 0x91, IOMMUFD_CMD_IOAS_CHANGE_PROCESS = 0x92, IOMMUFD_CMD_VEVENTQ_ALLOC = 0x93, + IOMMUFD_CMD_HW_QUEUE_ALLOC = 0x94, }; /** @@ -591,16 +592,43 @@ struct iommu_hw_info_arm_smmuv3 { }; /** + * struct iommu_hw_info_tegra241_cmdqv - NVIDIA Tegra241 CMDQV Hardware + * Information (IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV) + * + * @flags: Must be 0 + * @version: Version number for the CMDQ-V HW for PARAM bits[03:00] + * @log2vcmdqs: Log2 of the total number of VCMDQs for PARAM bits[07:04] + * @log2vsids: Log2 of the total number of SID replacements for PARAM bits[15:12] + * @__reserved: Must be 0 + * + * VMM can use these fields directly in its emulated global PARAM register. Note + * that only one Virtual Interface (VINTF) should be exposed to a VM, i.e. PARAM + * bits[11:08] should be set to 0 for log2 of the total number of VINTFs. 
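+ *
+ * Illustrative reading of these fields (editorial sketch, example values):
+ * with @version = 1, @log2vcmdqs = 2 and @log2vsids = 4, the emulated PARAM
+ * register would report bits[03:00] = 1, bits[07:04] = 2 (four VCMDQs),
+ * bits[15:12] = 4 (sixteen SID replacements) and bits[11:08] = 0 (a single
+ * VINTF).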
+ */ +struct iommu_hw_info_tegra241_cmdqv { + __u32 flags; + __u8 version; + __u8 log2vcmdqs; + __u8 log2vsids; + __u8 __reserved; +}; + +/** * enum iommu_hw_info_type - IOMMU Hardware Info Types - * @IOMMU_HW_INFO_TYPE_NONE: Used by the drivers that do not report hardware + * @IOMMU_HW_INFO_TYPE_NONE: Output by the drivers that do not report hardware * info + * @IOMMU_HW_INFO_TYPE_DEFAULT: Input to request for a default type * @IOMMU_HW_INFO_TYPE_INTEL_VTD: Intel VT-d iommu info type * @IOMMU_HW_INFO_TYPE_ARM_SMMUV3: ARM SMMUv3 iommu info type + * @IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV: NVIDIA Tegra241 CMDQV (extension for ARM + * SMMUv3) info type */ enum iommu_hw_info_type { IOMMU_HW_INFO_TYPE_NONE = 0, + IOMMU_HW_INFO_TYPE_DEFAULT = 0, IOMMU_HW_INFO_TYPE_INTEL_VTD = 1, IOMMU_HW_INFO_TYPE_ARM_SMMUV3 = 2, + IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV = 3, }; /** @@ -626,6 +654,15 @@ enum iommufd_hw_capabilities { }; /** + * enum iommufd_hw_info_flags - Flags for iommu_hw_info + * @IOMMU_HW_INFO_FLAG_INPUT_TYPE: If set, @in_data_type carries an input type + * for user space to request for a specific info + */ +enum iommufd_hw_info_flags { + IOMMU_HW_INFO_FLAG_INPUT_TYPE = 1 << 0, +}; + +/** * struct iommu_hw_info - ioctl(IOMMU_GET_HW_INFO) * @size: sizeof(struct iommu_hw_info) * @flags: Must be 0 @@ -634,6 +671,12 @@ enum iommufd_hw_capabilities { * data that kernel supports * @data_uptr: User pointer to a user-space buffer used by the kernel to fill * the iommu type specific hardware information data + * @in_data_type: This shares the same field with @out_data_type, making it be + * a bidirectional field. When IOMMU_HW_INFO_FLAG_INPUT_TYPE is + * set, an input type carried via this @in_data_type field will + * be valid, requesting for the info data to the given type. If + * IOMMU_HW_INFO_FLAG_INPUT_TYPE is unset, any input value will + * be seen as IOMMU_HW_INFO_TYPE_DEFAULT * @out_data_type: Output the iommu hardware info type as defined in the enum * iommu_hw_info_type. * @out_capabilities: Output the generic iommu capability info type as defined @@ -663,7 +706,10 @@ struct iommu_hw_info { __u32 dev_id; __u32 data_len; __aligned_u64 data_uptr; - __u32 out_data_type; + union { + __u32 in_data_type; + __u32 out_data_type; + }; __u8 out_max_pasid_log2; __u8 __reserved[3]; __aligned_u64 out_capabilities; @@ -951,10 +997,29 @@ struct iommu_fault_alloc { * enum iommu_viommu_type - Virtual IOMMU Type * @IOMMU_VIOMMU_TYPE_DEFAULT: Reserved for future use * @IOMMU_VIOMMU_TYPE_ARM_SMMUV3: ARM SMMUv3 driver specific type + * @IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV: NVIDIA Tegra241 CMDQV (extension for ARM + * SMMUv3) enabled ARM SMMUv3 type */ enum iommu_viommu_type { IOMMU_VIOMMU_TYPE_DEFAULT = 0, IOMMU_VIOMMU_TYPE_ARM_SMMUV3 = 1, + IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV = 2, +}; + +/** + * struct iommu_viommu_tegra241_cmdqv - NVIDIA Tegra241 CMDQV Virtual Interface + * (IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV) + * @out_vintf_mmap_offset: mmap offset argument for VINTF's page0 + * @out_vintf_mmap_length: mmap length argument for VINTF's page0 + * + * Both @out_vintf_mmap_offset and @out_vintf_mmap_length are reported by kernel + * for user space to mmap the VINTF page0 from the host physical address space + * to the guest physical address space so that a guest kernel can directly R/W + * access to the VINTF page0 in order to control its virtual command queues. 
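+ *
+ * Illustrative VMM usage (editorial sketch; "iommufd" is assumed to be the
+ * iommufd file descriptor the vIOMMU was allocated on):
+ *
+ *   page0 = mmap(NULL, out_vintf_mmap_length, PROT_READ | PROT_WRITE,
+ *                MAP_SHARED, iommufd, out_vintf_mmap_offset);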
+ */ +struct iommu_viommu_tegra241_cmdqv { + __aligned_u64 out_vintf_mmap_offset; + __aligned_u64 out_vintf_mmap_length; }; /** @@ -965,6 +1030,9 @@ enum iommu_viommu_type { * @dev_id: The device's physical IOMMU will be used to back the virtual IOMMU * @hwpt_id: ID of a nesting parent HWPT to associate to * @out_viommu_id: Output virtual IOMMU ID for the allocated object + * @data_len: Length of the type specific data + * @__reserved: Must be 0 + * @data_uptr: User pointer to a driver-specific virtual IOMMU data * * Allocate a virtual IOMMU object, representing the underlying physical IOMMU's * virtualization support that is a security-isolated slice of the real IOMMU HW @@ -985,6 +1053,9 @@ struct iommu_viommu_alloc { __u32 dev_id; __u32 hwpt_id; __u32 out_viommu_id; + __u32 data_len; + __u32 __reserved; + __aligned_u64 data_uptr; }; #define IOMMU_VIOMMU_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VIOMMU_ALLOC) @@ -995,10 +1066,15 @@ struct iommu_viommu_alloc { * @dev_id: The physical device to allocate a virtual instance on the vIOMMU * @out_vdevice_id: Object handle for the vDevice. Pass to IOMMU_DESTORY * @virt_id: Virtual device ID per vIOMMU, e.g. vSID of ARM SMMUv3, vDeviceID - * of AMD IOMMU, and vRID of a nested Intel VT-d to a Context Table + * of AMD IOMMU, and vRID of Intel VT-d * * Allocate a virtual device instance (for a physical device) against a vIOMMU. * This instance holds the device's information (related to its vIOMMU) in a VM. + * User should use IOMMU_DESTROY to destroy the virtual device before + * destroying the physical device (by closing vfio_cdev fd). Otherwise the + * virtual device would be forcibly destroyed on physical device destruction, + * its vdevice_id would be permanently leaked (unremovable & unreusable) until + * iommu fd closed. */ struct iommu_vdevice_alloc { __u32 size; @@ -1075,10 +1151,12 @@ struct iommufd_vevent_header { * enum iommu_veventq_type - Virtual Event Queue Type * @IOMMU_VEVENTQ_TYPE_DEFAULT: Reserved for future use * @IOMMU_VEVENTQ_TYPE_ARM_SMMUV3: ARM SMMUv3 Virtual Event Queue + * @IOMMU_VEVENTQ_TYPE_TEGRA241_CMDQV: NVIDIA Tegra241 CMDQV Extension IRQ */ enum iommu_veventq_type { IOMMU_VEVENTQ_TYPE_DEFAULT = 0, IOMMU_VEVENTQ_TYPE_ARM_SMMUV3 = 1, + IOMMU_VEVENTQ_TYPE_TEGRA241_CMDQV = 2, }; /** @@ -1103,6 +1181,19 @@ struct iommu_vevent_arm_smmuv3 { }; /** + * struct iommu_vevent_tegra241_cmdqv - Tegra241 CMDQV IRQ + * (IOMMU_VEVENTQ_TYPE_TEGRA241_CMDQV) + * @lvcmdq_err_map: 128-bit logical vcmdq error map, little-endian. + * (Refer to register LVCMDQ_ERR_MAPs per VINTF ) + * + * The 128-bit register value from HW exclusively reflect the error bits for a + * Virtual Interface represented by a vIOMMU object. Read and report directly. 
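+ *
+ * For example (illustrative): bit 3 set in lvcmdq_err_map[0] would indicate
+ * a pending error on logical VCMDQ3 of the VINTF behind this vIOMMU.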
+ */ +struct iommu_vevent_tegra241_cmdqv { + __aligned_le64 lvcmdq_err_map[2]; +}; + +/** * struct iommu_veventq_alloc - ioctl(IOMMU_VEVENTQ_ALLOC) * @size: sizeof(struct iommu_veventq_alloc) * @flags: Must be 0 @@ -1141,4 +1232,61 @@ struct iommu_veventq_alloc { __u32 __reserved; }; #define IOMMU_VEVENTQ_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VEVENTQ_ALLOC) + +/** + * enum iommu_hw_queue_type - HW Queue Type + * @IOMMU_HW_QUEUE_TYPE_DEFAULT: Reserved for future use + * @IOMMU_HW_QUEUE_TYPE_TEGRA241_CMDQV: NVIDIA Tegra241 CMDQV (extension for ARM + * SMMUv3) Virtual Command Queue (VCMDQ) + */ +enum iommu_hw_queue_type { + IOMMU_HW_QUEUE_TYPE_DEFAULT = 0, + /* + * TEGRA241_CMDQV requirements (otherwise, allocation will fail) + * - alloc starts from the lowest @index=0 in ascending order + * - destroy starts from the last allocated @index in descending order + * - @base_addr must be aligned to @length in bytes and mapped in IOAS + * - @length must be a power of 2, with a minimum 32 bytes and a maximum + * 2 ^ idr[1].CMDQS * 16 bytes (use GET_HW_INFO call to read idr[1] + * from struct iommu_hw_info_arm_smmuv3) + * - suggest to back the queue memory with contiguous physical pages or + * a single huge page with alignment of the queue size, and limit the + * emulated vSMMU's IDR1.CMDQS to log2(huge page size / 16 bytes) + */ + IOMMU_HW_QUEUE_TYPE_TEGRA241_CMDQV = 1, +}; + +/** + * struct iommu_hw_queue_alloc - ioctl(IOMMU_HW_QUEUE_ALLOC) + * @size: sizeof(struct iommu_hw_queue_alloc) + * @flags: Must be 0 + * @viommu_id: Virtual IOMMU ID to associate the HW queue with + * @type: One of enum iommu_hw_queue_type + * @index: The logical index to the HW queue per virtual IOMMU for a multi-queue + * model + * @out_hw_queue_id: The ID of the new HW queue + * @nesting_parent_iova: Base address of the queue memory in the guest physical + * address space + * @length: Length of the queue memory + * + * Allocate a HW queue object for a vIOMMU-specific HW-accelerated queue, which + * allows HW to access a guest queue memory described using @nesting_parent_iova + * and @length. + * + * A vIOMMU can allocate multiple queues, but it must use a different @index per + * type to separate each allocation, e.g:: + * + * Type1 HW queue0, Type1 HW queue1, Type2 HW queue0, ... 
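+ *
+ * Illustrative allocation call (editorial sketch; "iommufd", "viommu_id" and
+ * a length-aligned 4KiB guest queue at "base_iova" are assumptions):
+ *
+ *   struct iommu_hw_queue_alloc cmd = {
+ *       .size = sizeof(cmd),
+ *       .viommu_id = viommu_id,
+ *       .type = IOMMU_HW_QUEUE_TYPE_TEGRA241_CMDQV,
+ *       .index = 0,
+ *       .nesting_parent_iova = base_iova,
+ *       .length = 4096,
+ *   };
+ *
+ *   ioctl(iommufd, IOMMU_HW_QUEUE_ALLOC, &cmd);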
+ */ +struct iommu_hw_queue_alloc { + __u32 size; + __u32 flags; + __u32 viommu_id; + __u32 type; + __u32 index; + __u32 out_hw_queue_id; + __aligned_u64 nesting_parent_iova; + __aligned_u64 length; +}; +#define IOMMU_HW_QUEUE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HW_QUEUE_ALLOC) #endif diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index 32c5885a3c..be704965d8 100644 --- a/linux-headers/linux/kvm.h +++ b/linux-headers/linux/kvm.h @@ -636,6 +636,7 @@ struct kvm_ioeventfd { #define KVM_X86_DISABLE_EXITS_HLT (1 << 1) #define KVM_X86_DISABLE_EXITS_PAUSE (1 << 2) #define KVM_X86_DISABLE_EXITS_CSTATE (1 << 3) +#define KVM_X86_DISABLE_EXITS_APERFMPERF (1 << 4) /* for KVM_ENABLE_CAP */ struct kvm_enable_cap { @@ -952,6 +953,7 @@ struct kvm_enable_cap { #define KVM_CAP_ARM_EL2 240 #define KVM_CAP_ARM_EL2_E2H0 241 #define KVM_CAP_RISCV_MP_STATE_RESET 242 +#define KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED 243 struct kvm_irq_routing_irqchip { __u32 irqchip; diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h index 79bf8c0cc5..4d96d1fc12 100644 --- a/linux-headers/linux/vfio.h +++ b/linux-headers/linux/vfio.h @@ -905,10 +905,12 @@ struct vfio_device_feature { * VFIO_DEVICE_BIND_IOMMUFD - _IOR(VFIO_TYPE, VFIO_BASE + 18, * struct vfio_device_bind_iommufd) * @argsz: User filled size of this data. - * @flags: Must be 0. + * @flags: Must be 0 or a bit flags of VFIO_DEVICE_BIND_* * @iommufd: iommufd to bind. * @out_devid: The device id generated by this bind. devid is a handle for * this device/iommufd bond and can be used in IOMMUFD commands. + * @token_uuid_ptr: Valid if VFIO_DEVICE_BIND_FLAG_TOKEN. Points to a 16 byte + * UUID in the same format as VFIO_DEVICE_FEATURE_PCI_VF_TOKEN. * * Bind a vfio_device to the specified iommufd. * * Unbind is automatically conducted when device fd is closed. * + * A token is sometimes required to open the device. Unless this is known to be + * needed, VFIO_DEVICE_BIND_FLAG_TOKEN should not be set and token_uuid_ptr is + * ignored. The only case today is a PF/VF relationship where the VF bind must + * be provided the same token as VFIO_DEVICE_FEATURE_PCI_VF_TOKEN provided to + * the PF. + * * Return: 0 on success, -errno on failure. */ struct vfio_device_bind_iommufd { __u32 argsz; __u32 flags; +#define VFIO_DEVICE_BIND_FLAG_TOKEN (1 << 0) __s32 iommufd; __u32 out_devid; + __aligned_u64 token_uuid_ptr; }; #define VFIO_DEVICE_BIND_IOMMUFD _IO(VFIO_TYPE, VFIO_BASE + 18) diff --git a/linux-headers/linux/vhost.h b/linux-headers/linux/vhost.h index d4b3e2ae13..283348b64a 100644 --- a/linux-headers/linux/vhost.h +++ b/linux-headers/linux/vhost.h @@ -235,4 +235,39 @@ */ #define VHOST_VDPA_GET_VRING_SIZE _IOWR(VHOST_VIRTIO, 0x82, \ struct vhost_vring_state) + +/* Extended features manipulation */ +#define VHOST_GET_FEATURES_ARRAY _IOR(VHOST_VIRTIO, 0x83, \ + struct vhost_features_array) +#define VHOST_SET_FEATURES_ARRAY _IOW(VHOST_VIRTIO, 0x83, \ + struct vhost_features_array) + +/* fork_owner values for vhost */ +#define VHOST_FORK_OWNER_KTHREAD 0 +#define VHOST_FORK_OWNER_TASK 1 + +/** + * VHOST_SET_FORK_FROM_OWNER - Set the fork_owner flag for the vhost device. + * This ioctl must be called before VHOST_SET_OWNER.
+ * Only available when CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL=y + * + * @param fork_owner: An 8-bit value that determines the vhost thread mode + * + * When fork_owner is set to VHOST_FORK_OWNER_TASK (default value): + * - Vhost will create vhost workers as tasks forked from the owner, + * inheriting all of the owner's attributes. + * + * When fork_owner is set to VHOST_FORK_OWNER_KTHREAD: + * - Vhost will create vhost workers as kernel threads. + */ +#define VHOST_SET_FORK_FROM_OWNER _IOW(VHOST_VIRTIO, 0x83, __u8) + +/** + * VHOST_GET_FORK_OWNER - Get the current fork_owner flag for the vhost device. + * Only available when CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL=y + * + * @return: An 8-bit value indicating the current thread mode. + */ +#define VHOST_GET_FORK_FROM_OWNER _IOR(VHOST_VIRTIO, 0x84, __u8) + #endif diff --git a/net/net.c b/net/net.c index da275db86e..27e0d27807 100644 --- a/net/net.c +++ b/net/net.c @@ -522,6 +522,15 @@ bool qemu_has_uso(NetClientState *nc) return nc->info->has_uso(nc); } +bool qemu_has_tunnel(NetClientState *nc) +{ + if (!nc || !nc->info->has_tunnel) { + return false; + } + + return nc->info->has_tunnel(nc); +} + bool qemu_has_vnet_hdr(NetClientState *nc) { if (!nc || !nc->info->has_vnet_hdr) { @@ -540,14 +549,13 @@ bool qemu_has_vnet_hdr_len(NetClientState *nc, int len) return nc->info->has_vnet_hdr_len(nc, len); } -void qemu_set_offload(NetClientState *nc, int csum, int tso4, int tso6, - int ecn, int ufo, int uso4, int uso6) +void qemu_set_offload(NetClientState *nc, const NetOffloads *ol) { if (!nc || !nc->info->set_offload) { return; } - nc->info->set_offload(nc, csum, tso4, tso6, ecn, ufo, uso4, uso6); + nc->info->set_offload(nc, ol); } int qemu_get_vnet_hdr_len(NetClientState *nc) @@ -567,7 +575,8 @@ void qemu_set_vnet_hdr_len(NetClientState *nc, int len) assert(len == sizeof(struct virtio_net_hdr_mrg_rxbuf) || len == sizeof(struct virtio_net_hdr) || - len == sizeof(struct virtio_net_hdr_v1_hash)); + len == sizeof(struct virtio_net_hdr_v1_hash) || + len == sizeof(struct virtio_net_hdr_v1_hash_tunnel)); nc->vnet_hdr_len = len; nc->info->set_vnet_hdr_len(nc, len); diff --git a/net/netmap.c b/net/netmap.c index 297510e190..6cd8f2bdc5 100644 --- a/net/netmap.c +++ b/net/netmap.c @@ -366,8 +366,7 @@ static void netmap_set_vnet_hdr_len(NetClientState *nc, int len) } } -static void netmap_set_offload(NetClientState *nc, int csum, int tso4, int tso6, - int ecn, int ufo, int uso4, int uso6) +static void netmap_set_offload(NetClientState *nc, const NetOffloads *ol) { NetmapState *s = DO_UPCAST(NetmapState, nc, nc); diff --git a/net/tap-bsd.c b/net/tap-bsd.c index 3f98d0ea82..bbf84d1828 100644 --- a/net/tap-bsd.c +++ b/net/tap-bsd.c @@ -225,6 +225,11 @@ int tap_probe_has_uso(int fd) return 0; } +bool tap_probe_has_tunnel(int fd) +{ + return false; +} + void tap_fd_set_vnet_hdr_len(int fd, int len) { } @@ -239,8 +244,7 @@ int tap_fd_set_vnet_be(int fd, int is_be) return -EINVAL; } -void tap_fd_set_offload(int fd, int csum, int tso4, - int tso6, int ecn, int ufo, int uso4, int uso6) +void tap_fd_set_offload(int fd, const NetOffloads *ol) { } diff --git a/net/tap-linux.c b/net/tap-linux.c index e832810665..2a90b58467 100644 --- a/net/tap-linux.c +++ b/net/tap-linux.c @@ -201,6 +201,17 @@ int tap_probe_has_uso(int fd) return 1; } +bool tap_probe_has_tunnel(int fd) +{ + unsigned offload; + + offload = TUN_F_CSUM | TUN_F_TSO4 | TUN_F_UDP_TUNNEL_GSO; + if (ioctl(fd, TUNSETOFFLOAD, offload) < 0) { + return false; + } + return true; +} + void tap_fd_set_vnet_hdr_len(int fd,
int len) { if (ioctl(fd, TUNSETVNETHDRSZ, &len) == -1) { @@ -244,8 +255,7 @@ int tap_fd_set_vnet_be(int fd, int is_be) abort(); } -void tap_fd_set_offload(int fd, int csum, int tso4, - int tso6, int ecn, int ufo, int uso4, int uso6) +void tap_fd_set_offload(int fd, const NetOffloads *ol) { unsigned int offload = 0; @@ -254,22 +264,32 @@ void tap_fd_set_offload(int fd, int csum, int tso4, return; } - if (csum) { + if (ol->csum) { offload |= TUN_F_CSUM; - if (tso4) + if (ol->tso4) { offload |= TUN_F_TSO4; - if (tso6) + } + if (ol->tso6) { offload |= TUN_F_TSO6; - if ((tso4 || tso6) && ecn) + } + if ((ol->tso4 || ol->tso6) && ol->ecn) { offload |= TUN_F_TSO_ECN; - if (ufo) + } + if (ol->ufo) { offload |= TUN_F_UFO; - if (uso4) { + } + if (ol->uso4) { offload |= TUN_F_USO4; } - if (uso6) { + if (ol->uso6) { offload |= TUN_F_USO6; } + if (ol->tnl) { + offload |= TUN_F_UDP_TUNNEL_GSO; + } + if (ol->tnl_csum) { + offload |= TUN_F_UDP_TUNNEL_GSO_CSUM; + } } if (ioctl(fd, TUNSETOFFLOAD, offload) != 0) { diff --git a/net/tap-linux.h b/net/tap-linux.h index 9a58cecb7f..8cd6b5874b 100644 --- a/net/tap-linux.h +++ b/net/tap-linux.h @@ -53,4 +53,13 @@ #define TUN_F_USO4 0x20 /* I can handle USO for IPv4 packets */ #define TUN_F_USO6 0x40 /* I can handle USO for IPv6 packets */ +/* I can handle TSO/USO for UDP tunneled packets */ +#define TUN_F_UDP_TUNNEL_GSO 0x080 + +/* + * I can handle TSO/USO for UDP tunneled packets requiring csum offload for + * the outer header + */ +#define TUN_F_UDP_TUNNEL_GSO_CSUM 0x100 + #endif /* QEMU_TAP_LINUX_H */ diff --git a/net/tap-solaris.c b/net/tap-solaris.c index af2ebb16f5..75397e6c54 100644 --- a/net/tap-solaris.c +++ b/net/tap-solaris.c @@ -27,6 +27,7 @@ #include "tap_int.h" #include "qemu/ctype.h" #include "qemu/cutils.h" +#include "net/net.h" #include <sys/ethernet.h> #include <sys/sockio.h> @@ -226,6 +227,11 @@ int tap_probe_has_uso(int fd) return 0; } +bool tap_probe_has_tunnel(int fd) +{ + return false; +} + void tap_fd_set_vnet_hdr_len(int fd, int len) { } @@ -240,8 +246,7 @@ int tap_fd_set_vnet_be(int fd, int is_be) return -EINVAL; } -void tap_fd_set_offload(int fd, int csum, int tso4, - int tso6, int ecn, int ufo, int uso4, int uso6) +void tap_fd_set_offload(int fd, const NetOffloads *ol) { } diff --git a/net/tap-stub.c b/net/tap-stub.c index 38673434cb..f7a5e0c163 100644 --- a/net/tap-stub.c +++ b/net/tap-stub.c @@ -52,6 +52,11 @@ int tap_probe_has_uso(int fd) return 0; } +bool tap_probe_has_tunnel(int fd) +{ + return false; +} + void tap_fd_set_vnet_hdr_len(int fd, int len) { } @@ -66,8 +71,7 @@ int tap_fd_set_vnet_be(int fd, int is_be) return -EINVAL; } -void tap_fd_set_offload(int fd, int csum, int tso4, - int tso6, int ecn, int ufo, int uso4, int uso6) +void tap_fd_set_offload(int fd, const NetOffloads *ol) { } diff --git a/net/tap.c b/net/tap.c index f37133e301..abe3b2d036 100644 --- a/net/tap.c +++ b/net/tap.c @@ -62,6 +62,8 @@ static const int kernel_feature_bits[] = { VIRTIO_F_NOTIFICATION_DATA, VIRTIO_NET_F_RSC_EXT, VIRTIO_NET_F_HASH_REPORT, + VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO, + VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO, VHOST_INVALID_FEATURE_BIT }; @@ -76,6 +78,7 @@ typedef struct TAPState { bool using_vnet_hdr; bool has_ufo; bool has_uso; + bool has_tunnel; bool enabled; VHostNetState *vhost_net; unsigned host_vnet_hdr_len; @@ -246,6 +249,14 @@ static bool tap_has_uso(NetClientState *nc) return s->has_uso; } +static bool tap_has_tunnel(NetClientState *nc) +{ + TAPState *s = DO_UPCAST(TAPState, nc, nc); + + assert(nc->info->type == 
NET_CLIENT_DRIVER_TAP); + return s->has_tunnel; +} + static bool tap_has_vnet_hdr(NetClientState *nc) { TAPState *s = DO_UPCAST(TAPState, nc, nc); @@ -285,15 +296,14 @@ static int tap_set_vnet_be(NetClientState *nc, bool is_be) return tap_fd_set_vnet_be(s->fd, is_be); } -static void tap_set_offload(NetClientState *nc, int csum, int tso4, - int tso6, int ecn, int ufo, int uso4, int uso6) +static void tap_set_offload(NetClientState *nc, const NetOffloads *ol) { TAPState *s = DO_UPCAST(TAPState, nc, nc); if (s->fd < 0) { return; } - tap_fd_set_offload(s->fd, csum, tso4, tso6, ecn, ufo, uso4, uso6); + tap_fd_set_offload(s->fd, ol); } static void tap_exit_notify(Notifier *notifier, void *data) @@ -375,6 +385,7 @@ static NetClientInfo net_tap_info = { .cleanup = tap_cleanup, .has_ufo = tap_has_ufo, .has_uso = tap_has_uso, + .has_tunnel = tap_has_tunnel, .has_vnet_hdr = tap_has_vnet_hdr, .has_vnet_hdr_len = tap_has_vnet_hdr_len, .set_offload = tap_set_offload, @@ -391,6 +402,7 @@ static TAPState *net_tap_fd_init(NetClientState *peer, int fd, int vnet_hdr) { + NetOffloads ol = {}; NetClientState *nc; TAPState *s; @@ -403,8 +415,9 @@ static TAPState *net_tap_fd_init(NetClientState *peer, s->using_vnet_hdr = false; s->has_ufo = tap_probe_has_ufo(s->fd); s->has_uso = tap_probe_has_uso(s->fd); + s->has_tunnel = tap_probe_has_tunnel(s->fd); s->enabled = true; - tap_set_offload(&s->nc, 0, 0, 0, 0, 0, 0, 0); + tap_set_offload(&s->nc, &ol); /* * Make sure host header length is set correctly in tap: * it might have been modified by another instance of qemu. diff --git a/net/tap_int.h b/net/tap_int.h index 8857ff299d..b76a05044b 100644 --- a/net/tap_int.h +++ b/net/tap_int.h @@ -27,6 +27,7 @@ #define NET_TAP_INT_H #include "qapi/qapi-types-net.h" +#include "net/net.h" int tap_open(char *ifname, int ifname_size, int *vnet_hdr, int vnet_hdr_required, int mq_required, Error **errp); @@ -37,8 +38,8 @@ void tap_set_sndbuf(int fd, const NetdevTapOptions *tap, Error **errp); int tap_probe_vnet_hdr(int fd, Error **errp); int tap_probe_has_ufo(int fd); int tap_probe_has_uso(int fd); -void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo, - int uso4, int uso6); +bool tap_probe_has_tunnel(int fd); +void tap_fd_set_offload(int fd, const NetOffloads *ol); void tap_fd_set_vnet_hdr_len(int fd, int len); int tap_fd_set_vnet_le(int fd, int vnet_is_le); int tap_fd_set_vnet_be(int fd, int vnet_is_be); diff --git a/qapi/acpi-hest.json b/qapi/acpi-hest.json new file mode 100644 index 0000000000..28af1266a7 --- /dev/null +++ b/qapi/acpi-hest.json @@ -0,0 +1,36 @@ +# -*- Mode: Python -*- +# vim: filetype=python +# SPDX-License-Identifier: GPL-2.0-or-later + +## +# == GHESv2 CPER Error Injection +# +# Defined since ACPI Specification 6.1, +# section 18.3.2.8 Generic Hardware Error Source version 2. See: +# +# https://uefi.org/sites/default/files/resources/ACPI_6_1.pdf +## + + +## +# @inject-ghes-v2-error: +# +# Inject an error with additional ACPI 6.1 GHESv2 error information +# +# @cper: contains a base64 encoded string with raw data for a single +# CPER record with Generic Error Status Block, Generic Error Data +# Entry and generic error data payload, as described at +# https://uefi.org/specs/UEFI/2.10/Apx_N_Common_Platform_Error_Record.html#format +# +# Features: +# +# @unstable: This command is experimental. 
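+#
+# An illustrative call shape (editorial sketch; the cper value below is a
+# placeholder, not a real record):
+#
+# .. qmp-example::
+#
+#     -> { "execute": "inject-ghes-v2-error",
+#          "arguments": { "cper": "<base64-encoded CPER record>" } }
+#     <- { "return": {} }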
+ +# Since: 10.2 +## +{ 'command': 'inject-ghes-v2-error', + 'data': { + 'cper': 'str' + }, + 'features': [ 'unstable' ] +} diff --git a/qapi/meson.build b/qapi/meson.build index ca6b61a608..a46269b5a0 100644 --- a/qapi/meson.build +++ b/qapi/meson.build @@ -59,6 +59,7 @@ if have_system qapi_all_modules += [ 'accelerator', 'acpi', + 'acpi-hest', 'audio', 'cryptodev', 'qdev', diff --git a/qapi/qapi-schema.json b/qapi/qapi-schema.json index 82f111ba06..b93dd68d94 100644 --- a/qapi/qapi-schema.json +++ b/qapi/qapi-schema.json @@ -68,6 +68,7 @@ { 'include': 'misc-i386.json' } { 'include': 'audio.json' } { 'include': 'acpi.json' } +{ 'include': 'acpi-hest.json' } { 'include': 'pci.json' } { 'include': 'stats.json' } { 'include': 'virtio.json' } diff --git a/qapi/virtio.json b/qapi/virtio.json index 9d652fe4a8..05295ab665 100644 --- a/qapi/virtio.json +++ b/qapi/virtio.json @@ -247,6 +247,7 @@ # }, # "host-features": { # "unknown-dev-features": 1073741824, +# "unknown-dev-features2": 0, # "dev-features": [], # "transports": [ # "VIRTIO_RING_F_EVENT_IDX: Used & avail. event fields enabled", @@ -490,14 +491,18 @@ # unique features) # # @unknown-dev-features: Virtio device features bitmap that have not -# been decoded +# been decoded (bits 0-63) +# +# @unknown-dev-features2: Virtio device features bitmap that have not +# been decoded (bits 64-127) (since 10.2) # # Since: 7.2 ## { 'struct': 'VirtioDeviceFeatures', 'data': { 'transports': [ 'str' ], '*dev-features': [ 'str' ], - '*unknown-dev-features': 'uint64' } } + '*unknown-dev-features': 'uint64', + '*unknown-dev-features2': 'uint64' } } ## # @VirtQueueStatus: diff --git a/qemu-options.hx b/qemu-options.hx index a9d640d9e6..cc2ef4424e 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -1238,6 +1238,29 @@ SRST Accepts either the default root complex (pcie.0) or a pxb-pcie based root complex. +``-device amd-iommu[,option=...]`` + Enables emulation of an AMD-Vi I/O Memory Management Unit (IOMMU). + Only available with ``-machine q35``, it supports the following options: + + ``dma-remap=on|off`` (default: off) + Support for DMA address translation and access permission checking for + guests attaching passthrough devices to paging domains, using the AMD v1 + I/O Page Table format. This enables ``-device vfio-pci,...`` to work + correctly with a guest using the DMA remapping feature of the vIOMMU. + + ``intremap=on|off`` (default: auto) + Generic x86 IOMMU functionality implemented by the ``amd-iommu`` device. + Enables the interrupt remapping feature in guests, which is also required + to enable x2apic support. + Currently only available with ``kernel-irqchip=off|split``, it is + automatically enabled when either of those modes is in use, and disabled + with ``kernel-irqchip=on``. + + ``xtsup=on|off`` (default: off) + Interrupt remapping table supports x2apic mode, enabling the use of the + 128-bit IRTE format with a 32-bit destination field by the guest. Required + to support routing interrupts to vCPUs with APIC IDs larger than 0xff.
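+
+    For example, an illustrative invocation combining these options (an
+    editorial sketch, to be adapted to the actual guest setup) could be::
+
+        qemu-system-x86_64 -machine q35,kernel-irqchip=split \
+            -device amd-iommu,intremap=on,xtsup=on,dma-remap=on ...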
+ ERST DEF("name", HAS_ARG, QEMU_OPTION_name, @@ -2700,7 +2723,7 @@ DEF("smbios", HAS_ARG, QEMU_OPTION_smbios, "-smbios file=binary\n" " load SMBIOS entry from binary file\n" "-smbios type=0[,vendor=str][,version=str][,date=str][,release=%d.%d]\n" - " [,uefi=on|off]\n" + " [,uefi=on|off][,vm=on|off]\n" " specify SMBIOS type 0 fields\n" "-smbios type=1[,manufacturer=str][,product=str][,version=str][,serial=str]\n" " [,uuid=uuid][,sku=str][,family=str]\n" diff --git a/scripts/arm_processor_error.py b/scripts/arm_processor_error.py new file mode 100644 index 0000000000..73d069f070 --- /dev/null +++ b/scripts/arm_processor_error.py @@ -0,0 +1,476 @@ +#!/usr/bin/env python3 +# +# pylint: disable=C0301,C0114,R0903,R0912,R0913,R0914,R0915,W0511 +# SPDX-License-Identifier: GPL-2.0-or-later +# +# Copyright (C) 2024-2025 Mauro Carvalho Chehab <mchehab+huawei@kernel.org> + +# TODO: current implementation has dummy defaults. +# +# For a better implementation, a QMP addition/call is needed to +# retrieve some data for ARM Processor Error injection: +# +# - ARM registers: power_state, mpidr. + +""" +Generate an ARM processor error CPER, compatible with +UEFI 2.9A Errata. + +Injecting such errors can be done using: + + $ ./scripts/ghes_inject.py arm + Error injected. + +Produces a simple CPER register, as detected on a Linux guest: + +[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 1 +[Hardware Error]: event severity: recoverable +[Hardware Error]: Error 0, type: recoverable +[Hardware Error]: section_type: ARM processor error +[Hardware Error]: MIDR: 0x0000000000000000 +[Hardware Error]: running state: 0x0 +[Hardware Error]: Power State Coordination Interface state: 0 +[Hardware Error]: Error info structure 0: +[Hardware Error]: num errors: 2 +[Hardware Error]: error_type: 0x02: cache error +[Hardware Error]: error_info: 0x000000000091000f +[Hardware Error]: transaction type: Data Access +[Hardware Error]: cache error, operation type: Data write +[Hardware Error]: cache level: 2 +[Hardware Error]: processor context not corrupted +[Firmware Warn]: GHES: Unhandled processor error type 0x02: cache error + +The ARM Processor Error message can be customized via command line +parameters. For instance: + + $ ./scripts/ghes_inject.py arm --mpidr 0x444 --running --affinity 1 \ + --error-info 12345678 --vendor 0x13,123,4,5,1 --ctx-array 0,1,2,3,4,5 \ + -t cache tlb bus micro-arch tlb,micro-arch + Error injected. 
+ +Injects this error, as detected on a Linux guest: + +[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 1 +[Hardware Error]: event severity: recoverable +[Hardware Error]: Error 0, type: recoverable +[Hardware Error]: section_type: ARM processor error +[Hardware Error]: MIDR: 0x0000000000000000 +[Hardware Error]: Multiprocessor Affinity Register (MPIDR): 0x0000000000000000 +[Hardware Error]: error affinity level: 0 +[Hardware Error]: running state: 0x1 +[Hardware Error]: Power State Coordination Interface state: 0 +[Hardware Error]: Error info structure 0: +[Hardware Error]: num errors: 2 +[Hardware Error]: error_type: 0x02: cache error +[Hardware Error]: error_info: 0x0000000000bc614e +[Hardware Error]: cache level: 2 +[Hardware Error]: processor context not corrupted +[Hardware Error]: Error info structure 1: +[Hardware Error]: num errors: 2 +[Hardware Error]: error_type: 0x04: TLB error +[Hardware Error]: error_info: 0x000000000054007f +[Hardware Error]: transaction type: Instruction +[Hardware Error]: TLB error, operation type: Instruction fetch +[Hardware Error]: TLB level: 1 +[Hardware Error]: processor context not corrupted +[Hardware Error]: the error has not been corrected +[Hardware Error]: PC is imprecise +[Hardware Error]: Error info structure 2: +[Hardware Error]: num errors: 2 +[Hardware Error]: error_type: 0x08: bus error +[Hardware Error]: error_info: 0x00000080d6460fff +[Hardware Error]: transaction type: Generic +[Hardware Error]: bus error, operation type: Generic read (type of instruction or data request cannot be determined) +[Hardware Error]: affinity level at which the bus error occurred: 1 +[Hardware Error]: processor context corrupted +[Hardware Error]: the error has been corrected +[Hardware Error]: PC is imprecise +[Hardware Error]: Program execution can be restarted reliably at the PC associated with the error. +[Hardware Error]: participation type: Local processor observed +[Hardware Error]: request timed out +[Hardware Error]: address space: External Memory Access +[Hardware Error]: memory access attributes:0x20 +[Hardware Error]: access mode: secure +[Hardware Error]: Error info structure 3: +[Hardware Error]: num errors: 2 +[Hardware Error]: error_type: 0x10: micro-architectural error +[Hardware Error]: error_info: 0x0000000078da03ff +[Hardware Error]: Error info structure 4: +[Hardware Error]: num errors: 2 +[Hardware Error]: error_type: 0x14: TLB error|micro-architectural error +[Hardware Error]: Context info structure 0: +[Hardware Error]: register context type: AArch64 EL1 context registers +[Hardware Error]: 00000000: 00000000 00000000 +[Hardware Error]: Vendor specific error info has 5 bytes: +[Hardware Error]: 00000000: 13 7b 04 05 01 .{... +[Firmware Warn]: GHES: Unhandled processor error type 0x02: cache error +[Firmware Warn]: GHES: Unhandled processor error type 0x04: TLB error +[Firmware Warn]: GHES: Unhandled processor error type 0x08: bus error +[Firmware Warn]: GHES: Unhandled processor error type 0x10: micro-architectural error +[Firmware Warn]: GHES: Unhandled processor error type 0x14: TLB error|micro-architectural error +""" + +import argparse +import re + +from qmp_helper import qmp, util, cper_guid + + +class ArmProcessorEinj: + """ + Implements ARM Processor Error injection via GHES + """ + + DESC = """ + Generates an ARM processor error CPER, compatible with + UEFI 2.9A Errata. 
+ """ + + ACPI_GHES_ARM_CPER_LENGTH = 40 + ACPI_GHES_ARM_CPER_PEI_LENGTH = 32 + + # Context types + CONTEXT_AARCH32_EL1 = 1 + CONTEXT_AARCH64_EL1 = 5 + CONTEXT_MISC_REG = 8 + + def __init__(self, subparsers): + """Initialize the error injection class and add subparser""" + + # Valid choice values + self.arm_valid_bits = { + "mpidr": util.bit(0), + "affinity": util.bit(1), + "running": util.bit(2), + "vendor": util.bit(3), + } + + self.pei_flags = { + "first": util.bit(0), + "last": util.bit(1), + "propagated": util.bit(2), + "overflow": util.bit(3), + } + + self.pei_error_types = { + "cache": util.bit(1), + "tlb": util.bit(2), + "bus": util.bit(3), + "micro-arch": util.bit(4), + } + + self.pei_valid_bits = { + "multiple-error": util.bit(0), + "flags": util.bit(1), + "error-info": util.bit(2), + "virt-addr": util.bit(3), + "phy-addr": util.bit(4), + } + + self.data = bytearray() + + parser = subparsers.add_parser("arm", description=self.DESC) + + arm_valid_bits = ",".join(self.arm_valid_bits.keys()) + flags = ",".join(self.pei_flags.keys()) + error_types = ",".join(self.pei_error_types.keys()) + pei_valid_bits = ",".join(self.pei_valid_bits.keys()) + + # UEFI N.16 ARM Validation bits + g_arm = parser.add_argument_group("ARM processor") + g_arm.add_argument("--arm", "--arm-valid", + help=f"ARM valid bits: {arm_valid_bits}") + g_arm.add_argument("-a", "--affinity", "--level", "--affinity-level", + type=lambda x: int(x, 0), + help="Affinity level (when multiple levels apply)") + g_arm.add_argument("-l", "--mpidr", type=lambda x: int(x, 0), + help="Multiprocessor Affinity Register") + g_arm.add_argument("-i", "--midr", type=lambda x: int(x, 0), + help="Main ID Register") + g_arm.add_argument("-r", "--running", + action=argparse.BooleanOptionalAction, + default=None, + help="Indicates if the processor is running or not") + g_arm.add_argument("--psci", "--psci-state", + type=lambda x: int(x, 0), + help="Power State Coordination Interface - PSCI state") + + # TODO: Add vendor-specific support + + # UEFI N.17 bitmaps (type and flags) + g_pei = parser.add_argument_group("ARM Processor Error Info (PEI)") + g_pei.add_argument("-t", "--type", nargs="+", + help=f"one or more error types: {error_types}") + g_pei.add_argument("-f", "--flags", nargs="*", + help=f"zero or more error flags: {flags}") + g_pei.add_argument("-V", "--pei-valid", "--error-valid", nargs="*", + help=f"zero or more PEI valid bits: {pei_valid_bits}") + + # UEFI N.17 Integer values + g_pei.add_argument("-m", "--multiple-error", nargs="+", + help="Number of errors: 0: Single error, 1: Multiple errors, 2-65535: Error count if known") + g_pei.add_argument("-e", "--error-info", nargs="+", + help="Error information (UEFI 2.10 tables N.18 to N.20)") + g_pei.add_argument("-p", "--physical-address", nargs="+", + help="Physical address") + g_pei.add_argument("-v", "--virtual-address", nargs="+", + help="Virtual address") + + # UEFI N.21 Context + g_ctx = parser.add_argument_group("Processor Context") + g_ctx.add_argument("--ctx-type", "--context-type", nargs="*", + help="Type of the context (0=ARM32 GPR, 5=ARM64 EL1, other values supported)") + g_ctx.add_argument("--ctx-size", "--context-size", nargs="*", + help="Minimal size of the context") + g_ctx.add_argument("--ctx-array", "--context-array", nargs="*", + help="Comma-separated arrays for each context") + + # Vendor-specific data + g_vendor = parser.add_argument_group("Vendor-specific data") + g_vendor.add_argument("--vendor", "--vendor-specific", nargs="+", + help="Vendor-specific byte 
arrays of data") + + # Add arguments for Generic Error Data + qmp.argparse(parser) + + parser.set_defaults(func=self.send_cper) + + def send_cper(self, args): + """Parse subcommand arguments and send a CPER via QMP""" + + qmp_cmd = qmp(args.host, args.port, args.debug) + + # Handle Generic Error Data arguments if any + qmp_cmd.set_args(args) + + is_cpu_type = re.compile(r"^([\w+]+\-)?arm\-cpu$") + cpus = qmp_cmd.search_qom("/machine/unattached/device", + "type", is_cpu_type) + + cper = {} + pei = {} + ctx = {} + vendor = {} + + arg = vars(args) + + # Handle global parameters + if args.arm: + arm_valid_init = False + cper["valid"] = util.get_choice(name="valid", + value=args.arm, + choices=self.arm_valid_bits, + suffixes=["-error", "-err"]) + else: + cper["valid"] = 0 + arm_valid_init = True + + if "running" in arg: + if args.running: + cper["running-state"] = util.bit(0) + else: + cper["running-state"] = 0 + else: + cper["running-state"] = 0 + + if arm_valid_init: + if args.affinity: + cper["valid"] |= self.arm_valid_bits["affinity"] + + if args.mpidr: + cper["valid"] |= self.arm_valid_bits["mpidr"] + + if "running-state" in cper: + cper["valid"] |= self.arm_valid_bits["running"] + + if args.psci: + cper["valid"] |= self.arm_valid_bits["running"] + + # Handle PEI + if not args.type: + args.type = ["cache-error"] + + util.get_mult_choices( + pei, + name="valid", + values=args.pei_valid, + choices=self.pei_valid_bits, + suffixes=["-valid", "--addr"], + ) + util.get_mult_choices( + pei, + name="type", + values=args.type, + choices=self.pei_error_types, + suffixes=["-error", "-err"], + ) + util.get_mult_choices( + pei, + name="flags", + values=args.flags, + choices=self.pei_flags, + suffixes=["-error", "-cap"], + ) + util.get_mult_int(pei, "error-info", args.error_info) + util.get_mult_int(pei, "multiple-error", args.multiple_error) + util.get_mult_int(pei, "phy-addr", args.physical_address) + util.get_mult_int(pei, "virt-addr", args.virtual_address) + + # Handle context + util.get_mult_int(ctx, "type", args.ctx_type, allow_zero=True) + util.get_mult_int(ctx, "minimal-size", args.ctx_size, allow_zero=True) + util.get_mult_array(ctx, "register", args.ctx_array, allow_zero=True) + + util.get_mult_array(vendor, "bytes", args.vendor, max_val=255) + + # Store PEI + pei_data = bytearray() + default_flags = self.pei_flags["first"] + default_flags |= self.pei_flags["last"] + + error_info_num = 0 + + for i, p in pei.items(): # pylint: disable=W0612 + error_info_num += 1 + + # UEFI 2.10 doesn't define how to encode error information + # when multiple types are raised. 
So, provide a default only + # if a single type is there + if "error-info" not in p: + if p["type"] == util.bit(1): + p["error-info"] = 0x0091000F + if p["type"] == util.bit(2): + p["error-info"] = 0x0054007F + if p["type"] == util.bit(3): + p["error-info"] = 0x80D6460FFF + if p["type"] == util.bit(4): + p["error-info"] = 0x78DA03FF + + if "valid" not in p: + p["valid"] = 0 + if "multiple-error" in p: + p["valid"] |= self.pei_valid_bits["multiple-error"] + + if "flags" in p: + p["valid"] |= self.pei_valid_bits["flags"] + + if "error-info" in p: + p["valid"] |= self.pei_valid_bits["error-info"] + + if "phy-addr" in p: + p["valid"] |= self.pei_valid_bits["phy-addr"] + + if "virt-addr" in p: + p["valid"] |= self.pei_valid_bits["virt-addr"] + + # Version + util.data_add(pei_data, 0, 1) + + util.data_add(pei_data, + self.ACPI_GHES_ARM_CPER_PEI_LENGTH, 1) + + util.data_add(pei_data, p["valid"], 2) + util.data_add(pei_data, p["type"], 1) + util.data_add(pei_data, p.get("multiple-error", 1), 2) + util.data_add(pei_data, p.get("flags", default_flags), 1) + util.data_add(pei_data, p.get("error-info", 0), 8) + util.data_add(pei_data, p.get("virt-addr", 0xDEADBEEF), 8) + util.data_add(pei_data, p.get("phy-addr", 0xABBA0BAD), 8) + + # Store Context + ctx_data = bytearray() + context_info_num = 0 + + if ctx: + ret = qmp_cmd.send_cmd("query-target", may_open=True) + + default_ctx = self.CONTEXT_MISC_REG + + if "arch" in ret: + if ret["arch"] == "aarch64": + default_ctx = self.CONTEXT_AARCH64_EL1 + elif ret["arch"] == "arm": + default_ctx = self.CONTEXT_AARCH32_EL1 + + for k in sorted(ctx.keys()): + context_info_num += 1 + + if "type" not in ctx[k]: + ctx[k]["type"] = default_ctx + + if "register" not in ctx[k]: + ctx[k]["register"] = [] + + reg_size = len(ctx[k]["register"]) + size = 0 + + if "minimal-size" in ctx: + size = ctx[k]["minimal-size"] + + size = max(size, reg_size) + + size = (size + 1) % 0xFFFE + + # Version + util.data_add(ctx_data, 0, 2) + + util.data_add(ctx_data, ctx[k]["type"], 2) + + util.data_add(ctx_data, 8 * size, 4) + + for r in ctx[k]["register"]: + util.data_add(ctx_data, r, 8) + + for i in range(reg_size, size): # pylint: disable=W0612 + util.data_add(ctx_data, 0, 8) + + # Vendor-specific bytes are not grouped + vendor_data = bytearray() + if vendor: + for k in sorted(vendor.keys()): + for b in vendor[k]["bytes"]: + util.data_add(vendor_data, b, 1) + + # Encode ARM Processor Error + data = bytearray() + + util.data_add(data, cper["valid"], 4) + + util.data_add(data, error_info_num, 2) + util.data_add(data, context_info_num, 2) + + # Calculate the length of the CPER data + cper_length = self.ACPI_GHES_ARM_CPER_LENGTH + cper_length += len(pei_data) + cper_length += len(vendor_data) + cper_length += len(ctx_data) + util.data_add(data, cper_length, 4) + + util.data_add(data, arg.get("affinity-level", 0), 1) + + # Reserved + util.data_add(data, 0, 3) + + if "midr-el1" not in arg: + if cpus: + cmd_arg = { + 'path': cpus[0], + 'property': "midr" + } + ret = qmp_cmd.send_cmd("qom-get", cmd_arg, may_open=True) + if isinstance(ret, int): + arg["midr-el1"] = ret + + util.data_add(data, arg.get("mpidr-el1", 0), 8) + util.data_add(data, arg.get("midr-el1", 0), 8) + util.data_add(data, cper["running-state"], 4) + util.data_add(data, arg.get("psci-state", 0), 4) + + # Add PEI + data.extend(pei_data) + data.extend(ctx_data) + data.extend(vendor_data) + + self.data = data + + qmp_cmd.send_cper(cper_guid.CPER_PROC_ARM, self.data) diff --git a/scripts/ghes_inject.py b/scripts/ghes_inject.py new 
file mode 100755 index 0000000000..9a23520141 --- /dev/null +++ b/scripts/ghes_inject.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 +# +# SPDX-License-Identifier: GPL-2.0-or-later +# +# Copyright (C) 2024-2025 Mauro Carvalho Chehab <mchehab+huawei@kernel.org> + +""" +Handle ACPI GHESv2 error injection logic QEMU QMP interface. +""" + +import argparse +import sys + +from arm_processor_error import ArmProcessorEinj + +EINJ_DESC = """ +Handle ACPI GHESv2 error injection logic QEMU QMP interface. + +It allows using UEFI BIOS EINJ features to generate GHES records. + +It helps testing CPER and GHES drivers at the guest OS and how +userspace applications at the guest handle them. +""" + +def main(): + """Main program""" + + # Main parser - handle generic args like QEMU QMP TCP socket options + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, + usage="%(prog)s [options]", + description=EINJ_DESC) + + g_options = parser.add_argument_group("QEMU QMP socket options") + g_options.add_argument("-H", "--host", default="localhost", type=str, + help="host name") + g_options.add_argument("-P", "--port", default=4445, type=int, + help="TCP port number") + g_options.add_argument('-d', '--debug', action='store_true') + + subparsers = parser.add_subparsers() + + ArmProcessorEinj(subparsers) + + args = parser.parse_args() + if "func" in args: + args.func(args) + else: + sys.exit(f"Please specify a valid command for {sys.argv[0]}") + +if __name__ == "__main__": + main() diff --git a/scripts/qmp_helper.py b/scripts/qmp_helper.py new file mode 100755 index 0000000000..c1e7e0fd80 --- /dev/null +++ b/scripts/qmp_helper.py @@ -0,0 +1,703 @@ +#!/usr/bin/env python3 +# +# pylint: disable=C0103,E0213,E1135,E1136,E1137,R0902,R0903,R0912,R0913,R0917 +# SPDX-License-Identifier: GPL-2.0-or-later +# +# Copyright (C) 2024-2025 Mauro Carvalho Chehab <mchehab+huawei@kernel.org> + +""" +Helper classes to be used by ghes_inject command classes. +""" + +import json +import sys + +from datetime import datetime +from os import path as os_path + +try: + qemu_dir = os_path.abspath(os_path.dirname(os_path.dirname(__file__))) + sys.path.append(os_path.join(qemu_dir, 'python')) + + from qemu.qmp.legacy import QEMUMonitorProtocol + +except ModuleNotFoundError as exc: + print(f"Module '{exc.name}' not found.") + print("Try export PYTHONPATH=top-qemu-dir/python or run from top-qemu-dir") + sys.exit(1) + +from base64 import b64encode + +class util: + """ + Ancillary functions to deal with bitmaps, parse arguments, + generate GUID and encode data on a bytearray buffer. 
+ """ + + # + # Helper routines to handle multiple choice arguments + # + def get_choice(name, value, choices, suffixes=None, bitmask=True): + """Produce a list from multiple choice argument""" + + new_values = 0 + + if not value: + return new_values + + for val in value.split(","): + val = val.lower() + + if suffixes: + for suffix in suffixes: + val = val.removesuffix(suffix) + + if val not in choices.keys(): + if suffixes: + for suffix in suffixes: + if val + suffix in choices.keys(): + val += suffix + break + + if val not in choices.keys(): + sys.exit(f"Error on '{name}': choice '{val}' is invalid.") + + val = choices[val] + + if bitmask: + new_values |= val + else: + if new_values: + sys.exit(f"Error on '{name}': only one value is accepted.") + + new_values = val + + return new_values + + def get_array(name, values, max_val=None): + """Add numbered hashes from integer lists into an array""" + + array = [] + + for value in values: + for val in value.split(","): + try: + val = int(val, 0) + except ValueError: + sys.exit(f"Error on '{name}': {val} is not an integer") + + if val < 0: + sys.exit(f"Error on '{name}': {val} is not unsigned") + + if max_val and val > max_val: + sys.exit(f"Error on '{name}': {val} is too little") + + array.append(val) + + return array + + def get_mult_array(mult, name, values, allow_zero=False, max_val=None): + """Add numbered hashes from integer lists""" + + if not allow_zero: + if not values: + return + else: + if values is None: + return + + if not values: + i = 0 + if i not in mult: + mult[i] = {} + + mult[i][name] = [] + return + + i = 0 + for value in values: + for val in value.split(","): + try: + val = int(val, 0) + except ValueError: + sys.exit(f"Error on '{name}': {val} is not an integer") + + if val < 0: + sys.exit(f"Error on '{name}': {val} is not unsigned") + + if max_val and val > max_val: + sys.exit(f"Error on '{name}': {val} is too little") + + if i not in mult: + mult[i] = {} + + if name not in mult[i]: + mult[i][name] = [] + + mult[i][name].append(val) + + i += 1 + + + def get_mult_choices(mult, name, values, choices, + suffixes=None, allow_zero=False): + """Add numbered hashes from multiple choice arguments""" + + if not allow_zero: + if not values: + return + else: + if values is None: + return + + i = 0 + for val in values: + new_values = util.get_choice(name, val, choices, suffixes) + + if i not in mult: + mult[i] = {} + + mult[i][name] = new_values + i += 1 + + + def get_mult_int(mult, name, values, allow_zero=False): + """Add numbered hashes from integer arguments""" + if not allow_zero: + if not values: + return + else: + if values is None: + return + + i = 0 + for val in values: + try: + val = int(val, 0) + except ValueError: + sys.exit(f"Error on '{name}': {val} is not an integer") + + if val < 0: + sys.exit(f"Error on '{name}': {val} is not unsigned") + + if i not in mult: + mult[i] = {} + + mult[i][name] = val + i += 1 + + + # + # Data encode helper functions + # + def bit(b): + """Simple macro to define a bit on a bitmask""" + return 1 << b + + + def data_add(data, value, num_bytes): + """Adds bytes from value inside a bitarray""" + + data.extend(value.to_bytes(num_bytes, byteorder="little")) # pylint: disable=E1101 + + def dump_bytearray(name, data): + """Does an hexdump of a byte array, grouping in bytes""" + + print(f"{name} ({len(data)} bytes):") + + for ln_start in range(0, len(data), 16): + ln_end = min(ln_start + 16, len(data)) + print(f" {ln_start:08x} ", end="") + for i in range(ln_start, ln_end): + print(f"{data[i]:02x} 
", end="") + for i in range(ln_end, ln_start + 16): + print(" ", end="") + print(" ", end="") + for i in range(ln_start, ln_end): + if data[i] >= 32 and data[i] < 127: + print(chr(data[i]), end="") + else: + print(".", end="") + + print() + print() + + def time(string): + """Handle BCD timestamps used on Generic Error Data Block""" + + time = None + + # Formats to be used when parsing time stamps + formats = [ + "%Y-%m-%d %H:%M:%S", + ] + + if string == "now": + time = datetime.now() + + if time is None: + for fmt in formats: + try: + time = datetime.strptime(string, fmt) + break + except ValueError: + pass + + if time is None: + raise ValueError("Invalid time format") + + return time + +class guid: + """ + Simple class to handle GUID fields. + """ + + def __init__(self, time_low, time_mid, time_high, nodes): + """Initialize a GUID value""" + + assert len(nodes) == 8 + + self.time_low = time_low + self.time_mid = time_mid + self.time_high = time_high + self.nodes = nodes + + @classmethod + def UUID(cls, guid_str): + """Initialize a GUID using a string on its standard format""" + + if len(guid_str) != 36: + print("Size not 36") + raise ValueError('Invalid GUID size') + + # It is easier to parse without separators. So, drop them + guid_str = guid_str.replace('-', '') + + if len(guid_str) != 32: + print("Size not 32", guid_str, len(guid_str)) + raise ValueError('Invalid GUID hex size') + + time_low = 0 + time_mid = 0 + time_high = 0 + nodes = [] + + for i in reversed(range(16, 32, 2)): + h = guid_str[i:i + 2] + value = int(h, 16) + nodes.insert(0, value) + + time_high = int(guid_str[12:16], 16) + time_mid = int(guid_str[8:12], 16) + time_low = int(guid_str[0:8], 16) + + return cls(time_low, time_mid, time_high, nodes) + + def __str__(self): + """Output a GUID value on its default string representation""" + + clock = self.nodes[0] << 8 | self.nodes[1] + + node = 0 + for i in range(2, len(self.nodes)): + node = node << 8 | self.nodes[i] + + s = f"{self.time_low:08x}-{self.time_mid:04x}-" + s += f"{self.time_high:04x}-{clock:04x}-{node:012x}" + return s + + def to_bytes(self): + """Output a GUID value in bytes""" + + data = bytearray() + + util.data_add(data, self.time_low, 4) + util.data_add(data, self.time_mid, 2) + util.data_add(data, self.time_high, 2) + data.extend(bytearray(self.nodes)) + + return data + +class qmp: + """ + Opens a connection and send/receive QMP commands. + """ + + def send_cmd(self, command, args=None, may_open=False, return_error=True): + """Send a command to QMP, optinally opening a connection""" + + if may_open: + self._connect() + elif not self.connected: + return False + + msg = { 'execute': command } + if args: + msg['arguments'] = args + + try: + obj = self.qmp_monitor.cmd_obj(msg) + # Can we use some other exception class here? 
+ except Exception as e: # pylint: disable=W0718 + print(f"Command: {command}") + print(f"Failed to inject error: {e}.") + return None + + if "return" in obj: + if isinstance(obj.get("return"), dict): + if obj["return"]: + return obj["return"] + return "OK" + + return obj["return"] + + if isinstance(obj.get("error"), dict): + error = obj["error"] + if return_error: + print(f"Command: {msg}") + print(f'{error["class"]}: {error["desc"]}') + else: + print(json.dumps(obj)) + + return None + + def _close(self): + """Shutdown and close the socket, if opened""" + if not self.connected: + return + + self.qmp_monitor.close() + self.connected = False + + def _connect(self): + """Connect to a QMP TCP/IP port, if not connected yet""" + + if self.connected: + return True + + try: + self.qmp_monitor.connect(negotiate=True) + except ConnectionError: + sys.exit(f"Can't connect to QMP host {self.host}:{self.port}") + + self.connected = True + + return True + + BLOCK_STATUS_BITS = { + "uncorrectable": util.bit(0), + "correctable": util.bit(1), + "multi-uncorrectable": util.bit(2), + "multi-correctable": util.bit(3), + } + + ERROR_SEVERITY = { + "recoverable": 0, + "fatal": 1, + "corrected": 2, + "none": 3, + } + + VALIDATION_BITS = { + "fru-id": util.bit(0), + "fru-text": util.bit(1), + "timestamp": util.bit(2), + } + + GEDB_FLAGS_BITS = { + "recovered": util.bit(0), + "prev-error": util.bit(1), + "simulated": util.bit(2), + } + + GENERIC_DATA_SIZE = 72 + + def argparse(parser): + """Prepare a parser group to query generic error data""" + + block_status_bits = ",".join(qmp.BLOCK_STATUS_BITS.keys()) + error_severity_enum = ",".join(qmp.ERROR_SEVERITY.keys()) + validation_bits = ",".join(qmp.VALIDATION_BITS.keys()) + gedb_flags_bits = ",".join(qmp.GEDB_FLAGS_BITS.keys()) + + g_gen = parser.add_argument_group("Generic Error Data") # pylint: disable=E1101 + g_gen.add_argument("--block-status", + help=f"block status bits: {block_status_bits}") + g_gen.add_argument("--raw-data", nargs="+", + help="Raw data inside the Error Status Block") + g_gen.add_argument("--error-severity", "--severity", + help=f"error severity: {error_severity_enum}") + g_gen.add_argument("--gen-err-valid-bits", + "--generic-error-validation-bits", + help=f"validation bits: {validation_bits}") + g_gen.add_argument("--fru-id", type=guid.UUID, + help="GUID representing a physical device") + g_gen.add_argument("--fru-text", + help="ASCII string identifying the FRU hardware") + g_gen.add_argument("--timestamp", type=util.time, + help="Time when the error info was collected") + g_gen.add_argument("--precise", "--precise-timestamp", + action='store_true', + help="Marks the timestamp as precise if --timestamp is used") + g_gen.add_argument("--gedb-flags", + help=f"General Error Data Block flags: {gedb_flags_bits}") + + def set_args(self, args): + """Set the arguments optionally defined via self.argparse()""" + + if args.block_status: + self.block_status = util.get_choice(name="block-status", + value=args.block_status, + choices=self.BLOCK_STATUS_BITS, + bitmask=False) + if args.raw_data: + self.raw_data = util.get_array("raw-data", args.raw_data, + max_val=255) + print(self.raw_data) + + if args.error_severity: + self.error_severity = util.get_choice(name="error-severity", + value=args.error_severity, + choices=self.ERROR_SEVERITY, + bitmask=False) + + if args.fru_id: + self.fru_id = args.fru_id.to_bytes() + if not args.gen_err_valid_bits: + self.validation_bits |= self.VALIDATION_BITS["fru-id"] + + if args.fru_text: + text = 
bytearray(args.fru_text.encode('ascii')) + if len(text) > 20: + sys.exit("FRU text is too big to fit") + + self.fru_text = text + if not args.gen_err_valid_bits: + self.validation_bits |= self.VALIDATION_BITS["fru-text"] + + if args.timestamp: + time = args.timestamp + century = int(time.year / 100) + + bcd = bytearray() + util.data_add(bcd, (time.second // 10) << 4 | (time.second % 10), 1) + util.data_add(bcd, (time.minute // 10) << 4 | (time.minute % 10), 1) + util.data_add(bcd, (time.hour // 10) << 4 | (time.hour % 10), 1) + + if args.precise: + util.data_add(bcd, 1, 1) + else: + util.data_add(bcd, 0, 1) + + util.data_add(bcd, (time.day // 10) << 4 | (time.day % 10), 1) + util.data_add(bcd, (time.month // 10) << 4 | (time.month % 10), 1) + util.data_add(bcd, + ((time.year % 100) // 10) << 4 | (time.year % 10), 1) + util.data_add(bcd, ((century % 100) // 10) << 4 | (century % 10), 1) + + self.timestamp = bcd + if not args.gen_err_valid_bits: + self.validation_bits |= self.VALIDATION_BITS["timestamp"] + + if args.gen_err_valid_bits: + self.validation_bits = util.get_choice(name="validation", + value=args.gen_err_valid_bits, + choices=self.VALIDATION_BITS) + + def __init__(self, host, port, debug=False): + """Initialize variables used by the QMP send logic""" + + self.connected = False + self.host = host + self.port = port + self.debug = debug + + # ACPI 6.1: 18.3.2.7.1 Generic Error Data: Generic Error Status Block + self.block_status = self.BLOCK_STATUS_BITS["uncorrectable"] + self.raw_data = [] + self.error_severity = self.ERROR_SEVERITY["recoverable"] + + # ACPI 6.1: 18.3.2.7.1 Generic Error Data: Generic Error Data Entry + self.validation_bits = 0 + self.flags = 0 + self.fru_id = bytearray(16) + self.fru_text = bytearray(20) + self.timestamp = bytearray(8) + + self.qmp_monitor = QEMUMonitorProtocol(address=(self.host, self.port)) + + # + # Socket QMP send command + # + def send_cper_raw(self, cper_data): + """Send raw CPER data to QEMU through the QMP TCP socket""" + + data = b64encode(bytes(cper_data)).decode('ascii') + + cmd_arg = { + 'cper': data + } + + self._connect() + + if self.send_cmd("inject-ghes-v2-error", cmd_arg): + print("Error injected.") + + def send_cper(self, notif_type, payload): + """Send commands to QEMU through the QMP TCP socket""" + + # Fill CPER record header + + # NOTE: bits 4 to 13 of block status contain the number of + # data entries in the data section. This is currently unsupported. 
+ + cper_length = len(payload) + data_length = cper_length + len(self.raw_data) + self.GENERIC_DATA_SIZE + + # Generic Error Data Entry + gede = bytearray() + + gede.extend(notif_type.to_bytes()) + util.data_add(gede, self.error_severity, 4) + util.data_add(gede, 0x300, 2) + util.data_add(gede, self.validation_bits, 1) + util.data_add(gede, self.flags, 1) + util.data_add(gede, cper_length, 4) + gede.extend(self.fru_id) + gede.extend(self.fru_text) + gede.extend(self.timestamp) + + # Generic Error Status Block + gebs = bytearray() + + if self.raw_data: + raw_data_offset = len(gebs) + else: + raw_data_offset = 0 + + util.data_add(gebs, self.block_status, 4) + util.data_add(gebs, raw_data_offset, 4) + util.data_add(gebs, len(self.raw_data), 4) + util.data_add(gebs, data_length, 4) + util.data_add(gebs, self.error_severity, 4) + + cper_data = bytearray() + cper_data.extend(gebs) + cper_data.extend(gede) + cper_data.extend(bytearray(self.raw_data)) + cper_data.extend(bytearray(payload)) + + if self.debug: + print(f"GUID: {notif_type}") + + util.dump_bytearray("Generic Error Status Block", gebs) + util.dump_bytearray("Generic Error Data Entry", gede) + + if self.raw_data: + util.dump_bytearray("Raw data", bytearray(self.raw_data)) + + util.dump_bytearray("Payload", payload) + + self.send_cper_raw(cper_data) + + + def search_qom(self, path, prop, regex): + """ + Return a list of devices that match path array like: + + /machine/unattached/device + /machine/peripheral-anon/device + ... + """ + + found = [] + + i = 0 + while 1: + dev = f"{path}[{i}]" + args = { + 'path': dev, + 'property': prop + } + ret = self.send_cmd("qom-get", args, may_open=True, + return_error=False) + if not ret: + break + + if isinstance(ret, str): + if regex.search(ret): + found.append(dev) + + i += 1 + if i > 10000: + print("Too many objects returned by qom-get!") + break + + return found + +class cper_guid: + """ + Contains CPER GUID, as per: + https://uefi.org/specs/UEFI/2.10/Apx_N_Common_Platform_Error_Record.html + """ + + CPER_PROC_GENERIC = guid(0x9876CCAD, 0x47B4, 0x4bdb, + [0xB6, 0x5E, 0x16, 0xF1, + 0x93, 0xC4, 0xF3, 0xDB]) + + CPER_PROC_X86 = guid(0xDC3EA0B0, 0xA144, 0x4797, + [0xB9, 0x5B, 0x53, 0xFA, + 0x24, 0x2B, 0x6E, 0x1D]) + + CPER_PROC_ITANIUM = guid(0xe429faf1, 0x3cb7, 0x11d4, + [0xbc, 0xa7, 0x00, 0x80, + 0xc7, 0x3c, 0x88, 0x81]) + + CPER_PROC_ARM = guid(0xE19E3D16, 0xBC11, 0x11E4, + [0x9C, 0xAA, 0xC2, 0x05, + 0x1D, 0x5D, 0x46, 0xB0]) + + CPER_PLATFORM_MEM = guid(0xA5BC1114, 0x6F64, 0x4EDE, + [0xB8, 0x63, 0x3E, 0x83, + 0xED, 0x7C, 0x83, 0xB1]) + + CPER_PLATFORM_MEM2 = guid(0x61EC04FC, 0x48E6, 0xD813, + [0x25, 0xC9, 0x8D, 0xAA, + 0x44, 0x75, 0x0B, 0x12]) + + CPER_PCIE = guid(0xD995E954, 0xBBC1, 0x430F, + [0xAD, 0x91, 0xB4, 0x4D, + 0xCB, 0x3C, 0x6F, 0x35]) + + CPER_PCI_BUS = guid(0xC5753963, 0x3B84, 0x4095, + [0xBF, 0x78, 0xED, 0xDA, + 0xD3, 0xF9, 0xC9, 0xDD]) + + CPER_PCI_DEV = guid(0xEB5E4685, 0xCA66, 0x4769, + [0xB6, 0xA2, 0x26, 0x06, + 0x8B, 0x00, 0x13, 0x26]) + + CPER_FW_ERROR = guid(0x81212A96, 0x09ED, 0x4996, + [0x94, 0x71, 0x8D, 0x72, + 0x9C, 0x8E, 0x69, 0xED]) + + CPER_DMA_GENERIC = guid(0x5B51FEF7, 0xC79D, 0x4434, + [0x8F, 0x1B, 0xAA, 0x62, + 0xDE, 0x3E, 0x2C, 0x64]) + + CPER_DMA_VT = guid(0x71761D37, 0x32B2, 0x45cd, + [0xA7, 0xD0, 0xB0, 0xFE, + 0xDD, 0x93, 0xE8, 0xCF]) + + CPER_DMA_IOMMU = guid(0x036F84E1, 0x7F37, 0x428c, + [0xA7, 0x9E, 0x57, 0x5F, + 0xDF, 0xAA, 0x84, 0xEC]) + + CPER_CCIX_PER = guid(0x91335EF6, 0xEBFB, 0x4478, + [0xA6, 0xA6, 0x88, 0xB7, + 0x28, 0xCF, 0x75, 0xD7]) + + 
CPER_CXL_PROT_ERR = guid(0x80B9EFB4, 0x52B5, 0x4DE3, + [0xA7, 0x77, 0x68, 0x78, + 0x4B, 0x77, 0x10, 0x48]) diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh index 717c379f9e..64c0d7c4eb 100755 --- a/scripts/update-linux-headers.sh +++ b/scripts/update-linux-headers.sh @@ -90,6 +90,7 @@ cp_portable() { -e 's/<linux\/\([^>]*\)>/"standard-headers\/linux\/\1"/' \ -e "$arch_cmd" \ -e 's/__bitwise//' \ + -e 's/__counted_by(\w*)//' \ -e 's/__attribute__((packed))/QEMU_PACKED/' \ -e 's/__inline__/inline/' \ -e 's/__BITS_PER_LONG/HOST_LONG_BITS/' \ diff --git a/system/memory.c b/system/memory.c index fe8b28a096..41797ceef4 100644 --- a/system/memory.c +++ b/system/memory.c @@ -2044,13 +2044,9 @@ void memory_region_notify_iommu_one(IOMMUNotifier *notifier, return; } - if (notifier->notifier_flags & IOMMU_NOTIFIER_DEVIOTLB_UNMAP) { - /* Crop (iova, addr_mask) to range */ - tmp.iova = MAX(tmp.iova, notifier->start); - tmp.addr_mask = MIN(entry_end, notifier->end) - tmp.iova; - } else { - assert(entry->iova >= notifier->start && entry_end <= notifier->end); - } + /* Crop (iova, addr_mask) to range */ + tmp.iova = MAX(tmp.iova, notifier->start); + tmp.addr_mask = MIN(entry_end, notifier->end) - tmp.iova; if (event->type & notifier->notifier_flags) { notifier->notify(notifier, &tmp); diff --git a/target/arm/kvm.c b/target/arm/kvm.c index b8a1c071f5..4f769d69b3 100644 --- a/target/arm/kvm.c +++ b/target/arm/kvm.c @@ -2433,10 +2433,12 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr) { ram_addr_t ram_addr; hwaddr paddr; + AcpiGhesState *ags; assert(code == BUS_MCEERR_AR || code == BUS_MCEERR_AO); - if (acpi_ghes_present() && addr) { + ags = acpi_ghes_get_state(); + if (ags && addr) { ram_addr = qemu_ram_addr_from_host(addr); if (ram_addr != RAM_ADDR_INVALID && kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) { @@ -2454,7 +2456,8 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr) */ if (code == BUS_MCEERR_AR) { kvm_cpu_synchronize_state(c); - if (!acpi_ghes_memory_errors(ACPI_HEST_SRC_ID_SEA, paddr)) { + if (!acpi_ghes_memory_errors(ags, ACPI_HEST_SRC_ID_SYNC, + paddr)) { kvm_inject_arm_sea(c); } else { error_report("failed to record the error"); diff --git a/tests/data/acpi/aarch64/virt/DSDT b/tests/data/acpi/aarch64/virt/DSDT index 18d97e8f22..38f01adb61 100644 --- a/tests/data/acpi/aarch64/virt/DSDT +++ b/tests/data/acpi/aarch64/virt/DSDT Binary files differdiff --git a/tests/data/acpi/aarch64/virt/DSDT.acpihmatvirt b/tests/data/acpi/aarch64/virt/DSDT.acpihmatvirt index 2cef095bcc..37a9af713b 100644 --- a/tests/data/acpi/aarch64/virt/DSDT.acpihmatvirt +++ b/tests/data/acpi/aarch64/virt/DSDT.acpihmatvirt Binary files differdiff --git a/tests/data/acpi/aarch64/virt/DSDT.acpipcihp b/tests/data/acpi/aarch64/virt/DSDT.acpipcihp index 8d55a877a4..04427e2d8e 100644 --- a/tests/data/acpi/aarch64/virt/DSDT.acpipcihp +++ b/tests/data/acpi/aarch64/virt/DSDT.acpipcihp Binary files differdiff --git a/tests/data/acpi/aarch64/virt/DSDT.hpoffacpiindex b/tests/data/acpi/aarch64/virt/DSDT.hpoffacpiindex index 970d43f68b..43ab60496e 100644 --- a/tests/data/acpi/aarch64/virt/DSDT.hpoffacpiindex +++ b/tests/data/acpi/aarch64/virt/DSDT.hpoffacpiindex Binary files differdiff --git a/tests/data/acpi/aarch64/virt/DSDT.memhp b/tests/data/acpi/aarch64/virt/DSDT.memhp index 372ca3d7fb..3c39167444 100644 --- a/tests/data/acpi/aarch64/virt/DSDT.memhp +++ b/tests/data/acpi/aarch64/virt/DSDT.memhp Binary files differdiff --git 
a/tests/data/acpi/aarch64/virt/DSDT.pxb b/tests/data/acpi/aarch64/virt/DSDT.pxb index c277988249..71c632cedc 100644 --- a/tests/data/acpi/aarch64/virt/DSDT.pxb +++ b/tests/data/acpi/aarch64/virt/DSDT.pxb Binary files differdiff --git a/tests/data/acpi/aarch64/virt/DSDT.smmuv3-dev b/tests/data/acpi/aarch64/virt/DSDT.smmuv3-dev index 53d4c07f42..e8c2b376df 100644 --- a/tests/data/acpi/aarch64/virt/DSDT.smmuv3-dev +++ b/tests/data/acpi/aarch64/virt/DSDT.smmuv3-dev Binary files differdiff --git a/tests/data/acpi/aarch64/virt/DSDT.smmuv3-legacy b/tests/data/acpi/aarch64/virt/DSDT.smmuv3-legacy index 53d4c07f42..e8c2b376df 100644 --- a/tests/data/acpi/aarch64/virt/DSDT.smmuv3-legacy +++ b/tests/data/acpi/aarch64/virt/DSDT.smmuv3-legacy Binary files differdiff --git a/tests/data/acpi/aarch64/virt/DSDT.topology b/tests/data/acpi/aarch64/virt/DSDT.topology index ebbeedc1ed..9f22cd3dc8 100644 --- a/tests/data/acpi/aarch64/virt/DSDT.topology +++ b/tests/data/acpi/aarch64/virt/DSDT.topology Binary files differdiff --git a/tests/data/acpi/aarch64/virt/DSDT.viot b/tests/data/acpi/aarch64/virt/DSDT.viot index b897d66797..dd3775a076 100644 --- a/tests/data/acpi/aarch64/virt/DSDT.viot +++ b/tests/data/acpi/aarch64/virt/DSDT.viot Binary files differdiff --git a/tests/data/acpi/aarch64/virt/HEST b/tests/data/acpi/aarch64/virt/HEST index 4c5d8c5b5d..674272922d 100644 --- a/tests/data/acpi/aarch64/virt/HEST +++ b/tests/data/acpi/aarch64/virt/HEST Binary files differdiff --git a/tests/qtest/libqos/virtio.c b/tests/qtest/libqos/virtio.c index 5a709d0bc5..010ff40834 100644 --- a/tests/qtest/libqos/virtio.c +++ b/tests/qtest/libqos/virtio.c @@ -265,8 +265,9 @@ void qvring_init(QTestState *qts, const QGuestAllocator *alloc, QVirtQueue *vq, /* vq->avail->flags */ qvirtio_writew(vq->vdev, qts, vq->avail, 0); - /* vq->avail->idx */ - qvirtio_writew(vq->vdev, qts, vq->avail + 2, 0); + + qvirtqueue_set_avail_idx(qts, vq->vdev, vq, 0); + /* vq->avail->used_event */ qvirtio_writew(vq->vdev, qts, vq->avail + 4 + (2 * vq->size), 0); @@ -388,6 +389,13 @@ uint32_t qvirtqueue_add_indirect(QTestState *qts, QVirtQueue *vq, return vq->free_head++; /* Return and increase, in this order */ } +void qvirtqueue_set_avail_idx(QTestState *qts, QVirtioDevice *d, + QVirtQueue *vq, uint16_t idx) +{ + /* vq->avail->idx */ + qvirtio_writew(d, qts, vq->avail + 2, idx); +} + void qvirtqueue_kick(QTestState *qts, QVirtioDevice *d, QVirtQueue *vq, uint32_t free_head) { @@ -400,8 +408,8 @@ void qvirtqueue_kick(QTestState *qts, QVirtioDevice *d, QVirtQueue *vq, /* vq->avail->ring[idx % vq->size] */ qvirtio_writew(d, qts, vq->avail + 4 + (2 * (idx % vq->size)), free_head); - /* vq->avail->idx */ - qvirtio_writew(d, qts, vq->avail + 2, idx + 1); + + qvirtqueue_set_avail_idx(qts, d, vq, idx + 1); /* Must read after idx is updated */ flags = qvirtio_readw(d, qts, vq->used); diff --git a/tests/qtest/libqos/virtio.h b/tests/qtest/libqos/virtio.h index 7adc7cbd10..e238f1726f 100644 --- a/tests/qtest/libqos/virtio.h +++ b/tests/qtest/libqos/virtio.h @@ -143,6 +143,8 @@ uint32_t qvirtqueue_add(QTestState *qts, QVirtQueue *vq, uint64_t data, uint32_t len, bool write, bool next); uint32_t qvirtqueue_add_indirect(QTestState *qts, QVirtQueue *vq, QVRingIndirectDesc *indirect); +void qvirtqueue_set_avail_idx(QTestState *qts, QVirtioDevice *d, + QVirtQueue *vq, uint16_t idx); void qvirtqueue_kick(QTestState *qts, QVirtioDevice *d, QVirtQueue *vq, uint32_t free_head); bool qvirtqueue_get_buf(QTestState *qts, QVirtQueue *vq, uint32_t *desc_idx, diff --git 
a/tests/qtest/virtio-scsi-test.c b/tests/qtest/virtio-scsi-test.c index db10d572d0..e2350c52f6 100644 --- a/tests/qtest/virtio-scsi-test.c +++ b/tests/qtest/virtio-scsi-test.c @@ -311,6 +311,31 @@ fail: unlink(tmp_path); } +static void test_iothread_virtio_error(void *obj, void *data, + QGuestAllocator *t_alloc) +{ + QVirtioSCSIPCI *scsi_pci = obj; + QVirtioSCSI *scsi = &scsi_pci->scsi; + QVirtioSCSIQueues *vs; + QVirtQueue *vq; + + alloc = t_alloc; + vs = qvirtio_scsi_init(scsi->vdev); + vq = vs->vq[2]; + + /* Move avail.idx out of bounds to trigger virtio_error() */ + qvirtqueue_set_avail_idx(global_qtest, scsi->vdev, vq, vq->size * 2); + scsi->vdev->bus->virtqueue_kick(scsi->vdev, vq); + + /* + * Reset the device out of the error state. If QEMU hangs or crashes then + * this will fail. + */ + qvirtio_reset(scsi->vdev); + + qvirtio_scsi_pci_free(vs); +} + static void *virtio_scsi_hotplug_setup(GString *cmd_line, void *arg) { g_string_append(cmd_line, @@ -383,6 +408,13 @@ static void register_virtio_scsi_test(void) }; qos_add_test("iothread-attach-node", "virtio-scsi-pci", test_iothread_attach_node, &opts); + + opts.before = virtio_scsi_setup_iothread; + opts.edge = (QOSGraphEdgeOptions) { + .extra_device_opts = "iothread=thread0", + }; + qos_add_test("iothread-virtio-error", "virtio-scsi-pci", + test_iothread_virtio_error, &opts); } libqos_init(register_virtio_scsi_test);
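The inject-ghes-v2-error QMP command declared in acpi-hest.json above takes a single 'cper' string argument carrying a base64-encoded Common Platform Error Record, which is exactly what send_cper_raw() in scripts/qmp_helper.py submits. The following is a minimal sketch of driving the command directly, assuming QEMU was started with -qmp tcp:localhost:4445,server=on,wait=off and that the in-tree qemu.qmp package is importable; the payload here is a placeholder, not a well-formed CPER:

    import base64

    from qemu.qmp.legacy import QEMUMonitorProtocol

    # A real caller would build a complete record, for instance via
    # ArmProcessorEinj from scripts/arm_processor_error.py.
    cper_data = bytes(8)  # placeholder payload

    mon = QEMUMonitorProtocol(address=("localhost", 4445))
    mon.connect(negotiate=True)

    # The command is marked 'unstable', so its name and arguments may
    # still change in later QEMU releases.
    ret = mon.cmd_obj({
        "execute": "inject-ghes-v2-error",
        "arguments": {"cper": base64.b64encode(cper_data).decode("ascii")},
    })
    print(ret)
    mon.close()

In practice scripts/ghes_inject.py wraps all of this; the sketch only shows what ends up on the wire.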
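send_cper() in scripts/qmp_helper.py prefixes each section payload with a 20-byte Generic Error Status Block and a 72-byte Generic Error Data Entry, the latter being the script's GENERIC_DATA_SIZE constant. A sketch of where those sizes come from, expressing the field order used by the script as struct format strings (the format strings are illustrative; the script emits the fields one by one through util.data_add()):

    import struct

    # Generic Error Data Entry (ACPI 6.1: 18.3.2.7.1) as emitted by
    # send_cper(): section type GUID (16s), error severity (I),
    # revision 0x300 (H), validation bits (B), flags (B), error data
    # length (I), FRU id GUID (16s), FRU text (20s), BCD timestamp (8s).
    assert struct.calcsize("<16sIHBBI16s20s8s") == 72  # GENERIC_DATA_SIZE

    # Generic Error Status Block header placed in front of it:
    # block status, raw data offset, raw data length, data length and
    # error severity, i.e. five 32-bit words.
    assert struct.calcsize("<IIIII") == 20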
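The timestamp handling in qmp_helper.py's set_args() encodes the Generic Error Data Entry timestamp as packed BCD, one byte per field, with the fourth byte carrying the precise flag. A worked example of the same encoding, using a hypothetical bcd() helper that mirrors the shift-and-or pattern the script applies inline:

    from datetime import datetime

    def bcd(value):
        # Tens digit in the high nibble, units digit in the low nibble.
        return (value // 10) << 4 | (value % 10)

    now = datetime(2025, 1, 31, 12, 34, 56)
    century = now.year // 100

    # Field order used by set_args(): seconds, minutes, hours, the
    # precise flag, day, month, year within the century, century.
    stamp = bytes([bcd(now.second), bcd(now.minute), bcd(now.hour), 1,
                   bcd(now.day), bcd(now.month), bcd(now.year % 100),
                   bcd(century % 100)])
    assert stamp.hex() == "5634120131012520"

so 2025-01-31 12:34:56 with --precise becomes the eight bytes 56 34 12 01 31 01 25 20.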
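The guid class keeps the three leading GUID fields as integers and serializes them little-endian, while the eight trailing node bytes keep their textual order; this mixed-endian layout is why to_bytes() byte-swaps only part of the string representation. A quick round trip, assuming it runs from a QEMU source tree with scripts/ on sys.path (importing qmp_helper also requires the in-tree qemu.qmp package to be found):

    from qmp_helper import cper_guid, guid

    g = guid.UUID("e19e3d16-bc11-11e4-9caa-c2051d5d46b0")
    assert str(g) == str(cper_guid.CPER_PROC_ARM)

    # time_low, time_mid and time_high are byte-swapped on the wire,
    # the node bytes are not.
    assert g.to_bytes().hex() == "163d9ee111bce4119caac2051d5d46b0"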