diff options
Diffstat (limited to 'hw')
50 files changed, 2384 insertions, 582 deletions
diff --git a/hw/acpi/Kconfig b/hw/acpi/Kconfig index 1d4e9f0845..daabbe6cd1 100644 --- a/hw/acpi/Kconfig +++ b/hw/acpi/Kconfig @@ -51,6 +51,11 @@ config ACPI_APEI bool depends on ACPI +config GHES_CPER + bool + depends on ACPI_APEI + default y + config ACPI_PCI bool depends on ACPI && PCI diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c index 1e685f982f..2d5826a8f1 100644 --- a/hw/acpi/aml-build.c +++ b/hw/acpi/aml-build.c @@ -2629,3 +2629,13 @@ Aml *aml_i2c_serial_bus_device(uint16_t address, const char *resource_source) return var; } + +/* ACPI 5.0b: 18.3.2.6.2 Event Notification For Generic Error Sources */ +Aml *aml_error_device(void) +{ + Aml *dev = aml_device(ACPI_APEI_ERROR_DEVICE); + aml_append(dev, aml_name_decl("_HID", aml_string("PNP0C33"))); + aml_append(dev, aml_name_decl("_UID", aml_int(0))); + + return dev; +} diff --git a/hw/acpi/generic_event_device.c b/hw/acpi/generic_event_device.c index 95682b79a2..e7b773d84d 100644 --- a/hw/acpi/generic_event_device.c +++ b/hw/acpi/generic_event_device.c @@ -30,6 +30,7 @@ static const uint32_t ged_supported_events[] = { ACPI_GED_NVDIMM_HOTPLUG_EVT, ACPI_GED_CPU_HOTPLUG_EVT, ACPI_GED_PCI_HOTPLUG_EVT, + ACPI_GED_ERROR_EVT, }; /* @@ -120,6 +121,16 @@ void build_ged_aml(Aml *table, const char *name, HotplugHandler *hotplug_dev, aml_notify(aml_name(ACPI_POWER_BUTTON_DEVICE), aml_int(0x80))); break; + case ACPI_GED_ERROR_EVT: + /* + * ACPI 5.0b: 5.6.6 Device Object Notifications + * Table 5-135 Error Device Notification Values + * Defines 0x80 as the value to be used on notifications + */ + aml_append(if_ctx, + aml_notify(aml_name(ACPI_APEI_ERROR_DEVICE), + aml_int(0x80))); + break; case ACPI_GED_NVDIMM_HOTPLUG_EVT: aml_append(if_ctx, aml_notify(aml_name("\\_SB.NVDR"), @@ -320,6 +331,8 @@ static void acpi_ged_send_event(AcpiDeviceIf *adev, AcpiEventStatusBits ev) sel = ACPI_GED_MEM_HOTPLUG_EVT; } else if (ev & ACPI_POWER_DOWN_STATUS) { sel = ACPI_GED_PWR_DOWN_EVT; + } else if (ev & ACPI_GENERIC_ERROR) { + sel = ACPI_GED_ERROR_EVT; } else if (ev & ACPI_NVDIMM_HOTPLUG_STATUS) { sel = ACPI_GED_NVDIMM_HOTPLUG_EVT; } else if (ev & ACPI_CPU_HOTPLUG_STATUS) { @@ -349,6 +362,8 @@ static const Property acpi_ged_properties[] = { pcihp_state.use_acpi_hotplug_bridge, 0), DEFINE_PROP_LINK("bus", AcpiGedState, pcihp_state.root, TYPE_PCI_BUS, PCIBus *), + DEFINE_PROP_BOOL("x-has-hest-addr", AcpiGedState, + ghes_state.use_hest_addr, true), }; static const VMStateDescription vmstate_memhp_state = { @@ -436,6 +451,34 @@ static const VMStateDescription vmstate_pcihp_state = { } }; +static const VMStateDescription vmstate_hest = { + .name = "acpi-hest", + .version_id = 1, + .minimum_version_id = 1, + .fields = (const VMStateField[]) { + VMSTATE_UINT64(hest_addr_le, AcpiGhesState), + VMSTATE_END_OF_LIST() + }, +}; + +static bool hest_needed(void *opaque) +{ + AcpiGedState *s = opaque; + return s->ghes_state.hest_addr_le; +} + +static const VMStateDescription vmstate_hest_state = { + .name = "acpi-ged/hest", + .version_id = 1, + .minimum_version_id = 1, + .needed = hest_needed, + .fields = (const VMStateField[]) { + VMSTATE_STRUCT(ghes_state, AcpiGedState, 1, + vmstate_hest, AcpiGhesState), + VMSTATE_END_OF_LIST() + } +}; + static const VMStateDescription vmstate_acpi_ged = { .name = "acpi-ged", .version_id = 1, @@ -449,6 +492,7 @@ static const VMStateDescription vmstate_acpi_ged = { &vmstate_cpuhp_state, &vmstate_ghes_state, &vmstate_pcihp_state, + &vmstate_hest_state, NULL } }; diff --git a/hw/acpi/ghes-stub.c b/hw/acpi/ghes-stub.c index 7cec1812da..40f660c246 100644 --- a/hw/acpi/ghes-stub.c +++ b/hw/acpi/ghes-stub.c @@ -11,12 +11,13 @@ #include "qemu/osdep.h" #include "hw/acpi/ghes.h" -int acpi_ghes_memory_errors(uint16_t source_id, uint64_t physical_address) +int acpi_ghes_memory_errors(AcpiGhesState *ags, uint16_t source_id, + uint64_t physical_address) { return -1; } -bool acpi_ghes_present(void) +AcpiGhesState *acpi_ghes_get_state(void) { - return false; + return NULL; } diff --git a/hw/acpi/ghes.c b/hw/acpi/ghes.c index b85bb48195..06555905ce 100644 --- a/hw/acpi/ghes.c +++ b/hw/acpi/ghes.c @@ -30,6 +30,7 @@ #define ACPI_HW_ERROR_FW_CFG_FILE "etc/hardware_errors" #define ACPI_HW_ERROR_ADDR_FW_CFG_FILE "etc/hardware_errors_addr" +#define ACPI_HEST_ADDR_FW_CFG_FILE "etc/acpi_table_hest_addr" /* The max size in bytes for one error block */ #define ACPI_GHES_MAX_RAW_DATA_LENGTH (1 * KiB) @@ -41,6 +42,12 @@ #define GAS_ADDR_OFFSET 4 /* + * ACPI spec 1.0b + * 5.2.3 System Description Table Header + */ +#define ACPI_DESC_HEADER_OFFSET 36 + +/* * The total size of Generic Error Data Entry * ACPI 6.1/6.2: 18.3.2.7.1 Generic Error Data, * Table 18-343 Generic Error Data Entry @@ -61,6 +68,30 @@ #define ACPI_GHES_GESB_SIZE 20 /* + * See the memory layout map at docs/specs/acpi_hest_ghes.rst. + */ + +/* + * ACPI 6.1: 18.3.2.8 Generic Hardware Error Source version 2 + * Table 18-344 Generic Hardware Error Source version 2 (GHESv2) Structure + */ +#define HEST_GHES_V2_ENTRY_SIZE 92 + +/* + * ACPI 6.1: 18.3.2.8 Generic Hardware Error Source version 2 + * Table 18-344 Generic Hardware Error Source version 2 (GHESv2) Structure + * Read Ack Register + */ +#define GHES_READ_ACK_ADDR_OFF 64 + +/* + * ACPI 6.1: 18.3.2.7: Generic Hardware Error Source + * Table 18-341 Generic Hardware Error Source Structure + * Error Status Address + */ +#define GHES_ERR_STATUS_ADDR_OFF 20 + +/* * Values for error_severity field */ enum AcpiGenericErrorSeverity { @@ -206,17 +237,18 @@ ghes_gen_err_data_uncorrectable_recoverable(GArray *block, * Initialize "etc/hardware_errors" and "etc/hardware_errors_addr" fw_cfg blobs. * See docs/specs/acpi_hest_ghes.rst for blobs format. */ -static void build_ghes_error_table(GArray *hardware_errors, BIOSLinker *linker) +static void build_ghes_error_table(AcpiGhesState *ags, GArray *hardware_errors, + BIOSLinker *linker, int num_sources) { int i, error_status_block_offset; /* Build error_block_address */ - for (i = 0; i < ACPI_GHES_ERROR_SOURCE_COUNT; i++) { + for (i = 0; i < num_sources; i++) { build_append_int_noprefix(hardware_errors, 0, sizeof(uint64_t)); } /* Build read_ack_register */ - for (i = 0; i < ACPI_GHES_ERROR_SOURCE_COUNT; i++) { + for (i = 0; i < num_sources; i++) { /* * Initialize the value of read_ack_register to 1, so GHES can be * writable after (re)boot. @@ -231,13 +263,13 @@ static void build_ghes_error_table(GArray *hardware_errors, BIOSLinker *linker) /* Reserve space for Error Status Data Block */ acpi_data_push(hardware_errors, - ACPI_GHES_MAX_RAW_DATA_LENGTH * ACPI_GHES_ERROR_SOURCE_COUNT); + ACPI_GHES_MAX_RAW_DATA_LENGTH * num_sources); /* Tell guest firmware to place hardware_errors blob into RAM */ bios_linker_loader_alloc(linker, ACPI_HW_ERROR_FW_CFG_FILE, hardware_errors, sizeof(uint64_t), false); - for (i = 0; i < ACPI_GHES_ERROR_SOURCE_COUNT; i++) { + for (i = 0; i < num_sources; i++) { /* * Tell firmware to patch error_block_address entries to point to * corresponding "Generic Error Status Block" @@ -251,22 +283,26 @@ static void build_ghes_error_table(GArray *hardware_errors, BIOSLinker *linker) i * ACPI_GHES_MAX_RAW_DATA_LENGTH); } - /* - * tell firmware to write hardware_errors GPA into - * hardware_errors_addr fw_cfg, once the former has been initialized. - */ - bios_linker_loader_write_pointer(linker, ACPI_HW_ERROR_ADDR_FW_CFG_FILE, 0, - sizeof(uint64_t), - ACPI_HW_ERROR_FW_CFG_FILE, 0); + if (!ags->use_hest_addr) { + /* + * Tell firmware to write hardware_errors GPA into + * hardware_errors_addr fw_cfg, once the former has been initialized. + */ + bios_linker_loader_write_pointer(linker, ACPI_HW_ERROR_ADDR_FW_CFG_FILE, + 0, sizeof(uint64_t), + ACPI_HW_ERROR_FW_CFG_FILE, 0); + } } /* Build Generic Hardware Error Source version 2 (GHESv2) */ -static void build_ghes_v2(GArray *table_data, - BIOSLinker *linker, - enum AcpiGhesNotifyType notify, - uint16_t source_id) +static void build_ghes_v2_entry(GArray *table_data, + BIOSLinker *linker, + const AcpiNotificationSourceId *notif_src, + uint16_t index, int num_sources) { uint64_t address_offset; + const uint16_t notify = notif_src->notify; + const uint16_t source_id = notif_src->source_id; /* * Type: @@ -297,7 +333,7 @@ static void build_ghes_v2(GArray *table_data, address_offset + GAS_ADDR_OFFSET, sizeof(uint64_t), ACPI_HW_ERROR_FW_CFG_FILE, - source_id * sizeof(uint64_t)); + index * sizeof(uint64_t)); /* Notification Structure */ build_ghes_hw_error_notification(table_data, notify); @@ -317,8 +353,7 @@ static void build_ghes_v2(GArray *table_data, address_offset + GAS_ADDR_OFFSET, sizeof(uint64_t), ACPI_HW_ERROR_FW_CFG_FILE, - (ACPI_GHES_ERROR_SOURCE_COUNT + source_id) - * sizeof(uint64_t)); + (num_sources + index) * sizeof(uint64_t)); /* * Read Ack Preserve field @@ -331,23 +366,42 @@ static void build_ghes_v2(GArray *table_data, } /* Build Hardware Error Source Table */ -void acpi_build_hest(GArray *table_data, GArray *hardware_errors, +void acpi_build_hest(AcpiGhesState *ags, GArray *table_data, + GArray *hardware_errors, BIOSLinker *linker, + const AcpiNotificationSourceId *notif_source, + int num_sources, const char *oem_id, const char *oem_table_id) { AcpiTable table = { .sig = "HEST", .rev = 1, .oem_id = oem_id, .oem_table_id = oem_table_id }; + uint32_t hest_offset; + int i; - build_ghes_error_table(hardware_errors, linker); + hest_offset = table_data->len; + + build_ghes_error_table(ags, hardware_errors, linker, num_sources); acpi_table_begin(&table, table_data); /* Error Source Count */ - build_append_int_noprefix(table_data, ACPI_GHES_ERROR_SOURCE_COUNT, 4); - build_ghes_v2(table_data, linker, - ACPI_GHES_NOTIFY_SEA, ACPI_HEST_SRC_ID_SEA); + build_append_int_noprefix(table_data, num_sources, 4); + for (i = 0; i < num_sources; i++) { + build_ghes_v2_entry(table_data, linker, ¬if_source[i], i, num_sources); + } acpi_table_end(linker, &table); + + if (ags->use_hest_addr) { + /* + * Tell firmware to write into GPA the address of HEST via fw_cfg, + * once initialized. + */ + bios_linker_loader_write_pointer(linker, + ACPI_HEST_ADDR_FW_CFG_FILE, 0, + sizeof(uint64_t), + ACPI_BUILD_TABLE_FILE, hest_offset); + } } void acpi_ghes_add_fw_cfg(AcpiGhesState *ags, FWCfgState *s, @@ -357,21 +411,20 @@ void acpi_ghes_add_fw_cfg(AcpiGhesState *ags, FWCfgState *s, fw_cfg_add_file(s, ACPI_HW_ERROR_FW_CFG_FILE, hardware_error->data, hardware_error->len); - /* Create a read-write fw_cfg file for Address */ - fw_cfg_add_file_callback(s, ACPI_HW_ERROR_ADDR_FW_CFG_FILE, NULL, NULL, - NULL, &(ags->hw_error_le), sizeof(ags->hw_error_le), false); - - ags->present = true; + if (ags->use_hest_addr) { + fw_cfg_add_file_callback(s, ACPI_HEST_ADDR_FW_CFG_FILE, NULL, NULL, + NULL, &(ags->hest_addr_le), sizeof(ags->hest_addr_le), false); + } else { + /* Create a read-write fw_cfg file for Address */ + fw_cfg_add_file_callback(s, ACPI_HW_ERROR_ADDR_FW_CFG_FILE, NULL, NULL, + NULL, &(ags->hw_error_le), sizeof(ags->hw_error_le), false); + } } static void get_hw_error_offsets(uint64_t ghes_addr, uint64_t *cper_addr, uint64_t *read_ack_register_addr) { - if (!ghes_addr) { - return; - } - /* * non-HEST version supports only one source, so no need to change * the start offset based on the source ID. Also, we can't validate @@ -390,35 +443,94 @@ static void get_hw_error_offsets(uint64_t ghes_addr, *read_ack_register_addr = ghes_addr + sizeof(uint64_t); } -static void ghes_record_cper_errors(const void *cper, size_t len, - uint16_t source_id, Error **errp) +static void get_ghes_source_offsets(uint16_t source_id, + uint64_t hest_addr, + uint64_t *cper_addr, + uint64_t *read_ack_start_addr, + Error **errp) { - uint64_t cper_addr = 0, read_ack_register_addr = 0, read_ack_register; - AcpiGedState *acpi_ged_state; - AcpiGhesState *ags; + uint64_t hest_err_block_addr, hest_read_ack_addr; + uint64_t err_source_entry, error_block_addr; + uint32_t num_sources, i; - if (len > ACPI_GHES_MAX_RAW_DATA_LENGTH) { - error_setg(errp, "GHES CPER record is too big: %zd", len); - return; - } + hest_addr += ACPI_DESC_HEADER_OFFSET; - acpi_ged_state = ACPI_GED(object_resolve_path_type("", TYPE_ACPI_GED, - NULL)); - if (!acpi_ged_state) { - error_setg(errp, "Can't find ACPI_GED object"); + cpu_physical_memory_read(hest_addr, &num_sources, + sizeof(num_sources)); + num_sources = le32_to_cpu(num_sources); + + err_source_entry = hest_addr + sizeof(num_sources); + + /* + * Currently, HEST Error source navigates only for GHESv2 tables + */ + for (i = 0; i < num_sources; i++) { + uint64_t addr = err_source_entry; + uint16_t type, src_id; + + cpu_physical_memory_read(addr, &type, sizeof(type)); + type = le16_to_cpu(type); + + /* For now, we only know the size of GHESv2 table */ + if (type != ACPI_GHES_SOURCE_GENERIC_ERROR_V2) { + error_setg(errp, "HEST: type %d not supported.", type); + return; + } + + /* Compare CPER source ID at the GHESv2 structure */ + addr += sizeof(type); + cpu_physical_memory_read(addr, &src_id, sizeof(src_id)); + if (le16_to_cpu(src_id) == source_id) { + break; + } + + err_source_entry += HEST_GHES_V2_ENTRY_SIZE; + } + if (i == num_sources) { + error_setg(errp, "HEST: Source %d not found.", source_id); return; } - ags = &acpi_ged_state->ghes_state; - assert(ACPI_GHES_ERROR_SOURCE_COUNT == 1); - get_hw_error_offsets(le64_to_cpu(ags->hw_error_le), - &cper_addr, &read_ack_register_addr); + /* Navigate through table address pointers */ + hest_err_block_addr = err_source_entry + GHES_ERR_STATUS_ADDR_OFF + + GAS_ADDR_OFFSET; + + cpu_physical_memory_read(hest_err_block_addr, &error_block_addr, + sizeof(error_block_addr)); + error_block_addr = le64_to_cpu(error_block_addr); - if (!cper_addr) { - error_setg(errp, "can not find Generic Error Status Block"); + cpu_physical_memory_read(error_block_addr, cper_addr, + sizeof(*cper_addr)); + *cper_addr = le64_to_cpu(*cper_addr); + + hest_read_ack_addr = err_source_entry + GHES_READ_ACK_ADDR_OFF + + GAS_ADDR_OFFSET; + cpu_physical_memory_read(hest_read_ack_addr, read_ack_start_addr, + sizeof(*read_ack_start_addr)); + *read_ack_start_addr = le64_to_cpu(*read_ack_start_addr); +} + +NotifierList acpi_generic_error_notifiers = + NOTIFIER_LIST_INITIALIZER(acpi_generic_error_notifiers); + +void ghes_record_cper_errors(AcpiGhesState *ags, const void *cper, size_t len, + uint16_t source_id, Error **errp) +{ + uint64_t cper_addr = 0, read_ack_register_addr = 0, read_ack_register; + + if (len > ACPI_GHES_MAX_RAW_DATA_LENGTH) { + error_setg(errp, "GHES CPER record is too big: %zd", len); return; } + if (!ags->use_hest_addr) { + get_hw_error_offsets(le64_to_cpu(ags->hw_error_le), + &cper_addr, &read_ack_register_addr); + } else { + get_ghes_source_offsets(source_id, le64_to_cpu(ags->hest_addr_le), + &cper_addr, &read_ack_register_addr, errp); + } + cpu_physical_memory_read(read_ack_register_addr, &read_ack_register, sizeof(read_ack_register)); @@ -440,9 +552,12 @@ static void ghes_record_cper_errors(const void *cper, size_t len, /* Write the generic error data entry into guest memory */ cpu_physical_memory_write(cper_addr, cper, len); + + notifier_list_notify(&acpi_generic_error_notifiers, &source_id); } -int acpi_ghes_memory_errors(uint16_t source_id, uint64_t physical_address) +int acpi_ghes_memory_errors(AcpiGhesState *ags, uint16_t source_id, + uint64_t physical_address) { /* Memory Error Section Type */ const uint8_t guid[] = @@ -468,7 +583,7 @@ int acpi_ghes_memory_errors(uint16_t source_id, uint64_t physical_address) acpi_ghes_build_append_mem_cper(block, physical_address); /* Report the error */ - ghes_record_cper_errors(block->data, block->len, source_id, &errp); + ghes_record_cper_errors(ags, block->data, block->len, source_id, &errp); g_array_free(block, true); @@ -480,7 +595,7 @@ int acpi_ghes_memory_errors(uint16_t source_id, uint64_t physical_address) return 0; } -bool acpi_ghes_present(void) +AcpiGhesState *acpi_ghes_get_state(void) { AcpiGedState *acpi_ged_state; AcpiGhesState *ags; @@ -489,8 +604,12 @@ bool acpi_ghes_present(void) NULL)); if (!acpi_ged_state) { - return false; + return NULL; } ags = &acpi_ged_state->ghes_state; - return ags->present; + + if (!ags->hw_error_le && !ags->hest_addr_le) { + return NULL; + } + return ags; } diff --git a/hw/acpi/ghes_cper.c b/hw/acpi/ghes_cper.c new file mode 100644 index 0000000000..31cb2ffabe --- /dev/null +++ b/hw/acpi/ghes_cper.c @@ -0,0 +1,40 @@ +/* + * CPER payload parser for error injection + * + * Copyright(C) 2024-2025 Huawei LTD. + * + * This code is licensed under the GPL version 2 or later. See the + * COPYING file in the top-level directory. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" + +#include "qemu/base64.h" +#include "qemu/error-report.h" +#include "qemu/uuid.h" +#include "qapi/qapi-commands-acpi-hest.h" +#include "hw/acpi/ghes.h" + +void qmp_inject_ghes_v2_error(const char *qmp_cper, Error **errp) +{ + AcpiGhesState *ags; + uint8_t *cper; + size_t len; + + ags = acpi_ghes_get_state(); + if (!ags) { + return; + } + + cper = qbase64_decode(qmp_cper, -1, &len, errp); + if (!cper) { + error_setg(errp, "missing GHES CPER payload"); + return; + } + + ghes_record_cper_errors(ags, cper, len, ACPI_HEST_SRC_ID_QMP, errp); + + g_free(cper); +} diff --git a/hw/acpi/ghes_cper_stub.c b/hw/acpi/ghes_cper_stub.c new file mode 100644 index 0000000000..b16be73502 --- /dev/null +++ b/hw/acpi/ghes_cper_stub.c @@ -0,0 +1,20 @@ +/* + * Stub interface for CPER payload parser for error injection + * + * Copyright(C) 2024-2025 Huawei LTD. + * + * This code is licensed under the GPL version 2 or later. See the + * COPYING file in the top-level directory. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qapi/qapi-commands-acpi-hest.h" +#include "hw/acpi/ghes.h" + +void qmp_inject_ghes_v2_error(const char *cper, Error **errp) +{ + error_setg(errp, "GHES QMP error inject is not compiled in"); +} diff --git a/hw/acpi/meson.build b/hw/acpi/meson.build index 73f02b9691..56b5d1ec96 100644 --- a/hw/acpi/meson.build +++ b/hw/acpi/meson.build @@ -34,4 +34,6 @@ endif system_ss.add(when: 'CONFIG_ACPI', if_false: files('acpi-stub.c', 'aml-build-stub.c', 'ghes-stub.c', 'acpi_interface.c')) system_ss.add(when: 'CONFIG_ACPI_PCI_BRIDGE', if_false: files('pci-bridge-stub.c')) system_ss.add_all(when: 'CONFIG_ACPI', if_true: acpi_ss) +system_ss.add(when: 'CONFIG_GHES_CPER', if_true: files('ghes_cper.c')) +system_ss.add(when: 'CONFIG_GHES_CPER', if_false: files('ghes_cper_stub.c')) system_ss.add(files('acpi-qmp-cmds.c')) diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c index 96830f7c4e..8bb6b60515 100644 --- a/hw/arm/virt-acpi-build.c +++ b/hw/arm/virt-acpi-build.c @@ -1066,6 +1066,7 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) } acpi_dsdt_add_power_button(scope); + aml_append(scope, aml_error_device()); #ifdef CONFIG_TPM acpi_dsdt_add_tpm(scope, vms); #endif @@ -1125,6 +1126,15 @@ static void acpi_align_size(GArray *blob, unsigned align) g_array_set_size(blob, ROUND_UP(acpi_data_len(blob), align)); } +static const AcpiNotificationSourceId hest_ghes_notify[] = { + { ACPI_HEST_SRC_ID_SYNC, ACPI_GHES_NOTIFY_SEA }, + { ACPI_HEST_SRC_ID_QMP, ACPI_GHES_NOTIFY_GPIO }, +}; + +static const AcpiNotificationSourceId hest_ghes_notify_10_0[] = { + { ACPI_HEST_SRC_ID_SYNC, ACPI_GHES_NOTIFY_SEA }, +}; + static void virt_acpi_build(VirtMachineState *vms, AcpiBuildTables *tables) { @@ -1181,9 +1191,28 @@ void virt_acpi_build(VirtMachineState *vms, AcpiBuildTables *tables) build_dbg2(tables_blob, tables->linker, vms); if (vms->ras) { - acpi_add_table(table_offsets, tables_blob); - acpi_build_hest(tables_blob, tables->hardware_errors, tables->linker, - vms->oem_id, vms->oem_table_id); + AcpiGedState *acpi_ged_state; + static const AcpiNotificationSourceId *notify; + unsigned int notify_sz; + AcpiGhesState *ags; + + acpi_ged_state = ACPI_GED(vms->acpi_dev); + ags = &acpi_ged_state->ghes_state; + if (ags) { + acpi_add_table(table_offsets, tables_blob); + + if (!ags->use_hest_addr) { + notify = hest_ghes_notify_10_0; + notify_sz = ARRAY_SIZE(hest_ghes_notify_10_0); + } else { + notify = hest_ghes_notify; + notify_sz = ARRAY_SIZE(hest_ghes_notify); + } + + acpi_build_hest(ags, tables_blob, tables->hardware_errors, + tables->linker, notify, notify_sz, + vms->oem_id, vms->oem_table_id); + } } if (ms->numa_state->num_nodes > 0) { diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 02209fadcf..175023897a 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -693,7 +693,7 @@ static inline DeviceState *create_acpi_ged(VirtMachineState *vms) MachineState *ms = MACHINE(vms); SysBusDevice *sbdev; int irq = vms->irqmap[VIRT_ACPI_GED]; - uint32_t event = ACPI_GED_PWR_DOWN_EVT; + uint32_t event = ACPI_GED_PWR_DOWN_EVT | ACPI_GED_ERROR_EVT; bool acpi_pcihp; if (ms->ram_slots) { @@ -1050,6 +1050,20 @@ static void virt_powerdown_req(Notifier *n, void *opaque) } } +static void virt_generic_error_req(Notifier *n, void *opaque) +{ + uint16_t *source_id = opaque; + + /* Currently, only QMP source ID is async */ + if (*source_id != ACPI_HEST_SRC_ID_QMP) { + return; + } + + VirtMachineState *s = container_of(n, VirtMachineState, generic_error_notifier); + + acpi_send_event(s->acpi_dev, ACPI_GENERIC_ERROR); +} + static void create_gpio_keys(char *fdt, DeviceState *pl061_dev, uint32_t phandle) { @@ -2500,6 +2514,9 @@ static void machvirt_init(MachineState *machine) if (has_ged && aarch64 && firmware_loaded && virt_is_acpi_enabled(vms)) { vms->acpi_dev = create_acpi_ged(vms); + vms->generic_error_notifier.notify = virt_generic_error_req; + notifier_list_add(&acpi_generic_error_notifiers, + &vms->generic_error_notifier); } else { create_gpio_devices(vms, VIRT_GPIO, sysmem); } @@ -3520,6 +3537,7 @@ DEFINE_VIRT_MACHINE_AS_LATEST(10, 2) static void virt_machine_10_1_options(MachineClass *mc) { virt_machine_10_2_options(mc); + mc->smbios_memory_device_size = 2047 * TiB; compat_props_add(mc->compat_props, hw_compat_10_1, hw_compat_10_1_len); } DEFINE_VIRT_MACHINE(10, 1) diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c index 9bab2716c1..64efce4846 100644 --- a/hw/block/virtio-blk.c +++ b/hw/block/virtio-blk.c @@ -62,11 +62,7 @@ void virtio_blk_req_complete(VirtIOBlockReq *req, unsigned char status) iov_discard_undo(&req->inhdr_undo); iov_discard_undo(&req->outhdr_undo); virtqueue_push(req->vq, &req->elem, req->in_len); - if (qemu_in_iothread()) { - virtio_notify_irqfd(vdev, req->vq); - } else { - virtio_notify(vdev, req->vq); - } + virtio_notify(vdev, req->vq); } static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error, diff --git a/hw/core/machine.c b/hw/core/machine.c index 38c949c4f2..681adbb7ac 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -35,9 +35,12 @@ #include "hw/virtio/virtio-pci.h" #include "hw/virtio/virtio-net.h" #include "hw/virtio/virtio-iommu.h" +#include "hw/acpi/generic_event_device.h" #include "audio/audio.h" -GlobalProperty hw_compat_10_1[] = {}; +GlobalProperty hw_compat_10_1[] = { + { TYPE_ACPI_GED, "x-has-hest-addr", "false" }, +}; const size_t hw_compat_10_1_len = G_N_ELEMENTS(hw_compat_10_1); GlobalProperty hw_compat_10_0[] = { @@ -1115,8 +1118,11 @@ static void machine_class_init(ObjectClass *oc, const void *data) * SMBIOS 3.1.0 7.18.5 Memory Device — Extended Size * use max possible value that could be encoded into * 'Extended Size' field (2047Tb). + * + * Unfortunately (current) Windows Server 2025 and earlier do not handle + * 4Tb+ DIMM size. */ - mc->smbios_memory_device_size = 2047 * TiB; + mc->smbios_memory_device_size = 2 * TiB; /* numa node memory size aligned on 8MB by default. * On Linux, each node's border has to be 8MB aligned diff --git a/hw/core/qdev.c b/hw/core/qdev.c index f600226176..fab42a7270 100644 --- a/hw/core/qdev.c +++ b/hw/core/qdev.c @@ -411,6 +411,35 @@ char *qdev_get_dev_path(DeviceState *dev) return NULL; } +const char *qdev_get_printable_name(DeviceState *vdev) +{ + /* + * Return device ID if explicity set + * (e.g. -device virtio-blk-pci,id=foo) + * This allows users to correlate errors with their custom device + * names. + */ + if (vdev->id) { + return vdev->id; + } + /* + * Fall back to the canonical QOM device path (eg. ID for PCI + * devices). + * This ensures the device is still uniquely and meaningfully + * identified. + */ + const char *path = qdev_get_dev_path(vdev); + if (path) { + return path; + } + + /* + * Final fallback: if all else fails, return a placeholder string. + * This ensures the error message always contains a valid string. + */ + return "<unknown device>"; +} + void qdev_add_unplug_blocker(DeviceState *dev, Error *reason) { dev->unplug_blockers = g_slist_prepend(dev->unplug_blockers, reason); diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index 423c4959fe..9446a9f862 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -1863,7 +1863,11 @@ build_amd_iommu(GArray *table_data, BIOSLinker *linker, const char *oem_id, /* IOMMU info */ build_append_int_noprefix(table_data, 0, 2); /* IOMMU Attributes */ - build_append_int_noprefix(table_data, 0, 4); + if (!s->iommu.dma_translation) { + build_append_int_noprefix(table_data, (1UL << 0) /* HATDis */, 4); + } else { + build_append_int_noprefix(table_data, 0, 4); + } /* EFR Register Image */ build_append_int_noprefix(table_data, amdvi_extended_feature_register(s), diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c index 26be69bec8..378e0cb55e 100644 --- a/hw/i386/amd_iommu.c +++ b/hw/i386/amd_iommu.c @@ -33,6 +33,7 @@ #include "hw/i386/apic-msidef.h" #include "hw/qdev-properties.h" #include "kvm/kvm_i386.h" +#include "qemu/iova-tree.h" /* used AMD-Vi MMIO registers */ const char *amdvi_mmio_low[] = { @@ -66,6 +67,15 @@ struct AMDVIAddressSpace { MemoryRegion iommu_nodma; /* Alias of shared nodma memory region */ MemoryRegion iommu_ir; /* Device's interrupt remapping region */ AddressSpace as; /* device's corresponding address space */ + + /* DMA address translation support */ + IOMMUNotifierFlag notifier_flags; + /* entry in list of Address spaces with registered notifiers */ + QLIST_ENTRY(AMDVIAddressSpace) next; + /* Record DMA translation ranges */ + IOVATree *iova_tree; + /* DMA address translation active */ + bool addr_translation; }; /* AMDVI cache entry */ @@ -77,12 +87,29 @@ typedef struct AMDVIIOTLBEntry { uint64_t page_mask; /* physical page size */ } AMDVIIOTLBEntry; +/* + * These 'fault' reasons have an overloaded meaning since they are not only + * intended for describing reasons that generate an IO_PAGE_FAULT as per the AMD + * IOMMU specification, but are also used to signal internal errors in the + * emulation code. + */ +typedef enum AMDVIFaultReason { + AMDVI_FR_DTE_RTR_ERR = 1, /* Failure to retrieve DTE */ + AMDVI_FR_DTE_V, /* DTE[V] = 0 */ + AMDVI_FR_DTE_TV, /* DTE[TV] = 0 */ + AMDVI_FR_PT_ROOT_INV, /* Page Table Root ptr invalid */ + AMDVI_FR_PT_ENTRY_INV, /* Failure to read PTE from guest memory */ +} AMDVIFaultReason; + uint64_t amdvi_extended_feature_register(AMDVIState *s) { uint64_t feature = AMDVI_DEFAULT_EXT_FEATURES; if (s->xtsup) { feature |= AMDVI_FEATURE_XT; } + if (!s->iommu.dma_translation) { + feature |= AMDVI_HATS_MODE_RESERVED; + } return feature; } @@ -438,18 +465,704 @@ static void amdvi_completion_wait(AMDVIState *s, uint64_t *cmd) trace_amdvi_completion_wait(addr, data); } +static inline uint64_t amdvi_get_perms(uint64_t entry) +{ + return (entry & (AMDVI_DEV_PERM_READ | AMDVI_DEV_PERM_WRITE)) >> + AMDVI_DEV_PERM_SHIFT; +} + +/* validate that reserved bits are honoured */ +static bool amdvi_validate_dte(AMDVIState *s, uint16_t devid, + uint64_t *dte) +{ + + uint64_t root; + + if ((dte[0] & AMDVI_DTE_QUAD0_RESERVED) || + (dte[1] & AMDVI_DTE_QUAD1_RESERVED) || + (dte[2] & AMDVI_DTE_QUAD2_RESERVED) || + (dte[3] & AMDVI_DTE_QUAD3_RESERVED)) { + amdvi_log_illegaldevtab_error(s, devid, + s->devtab + + devid * AMDVI_DEVTAB_ENTRY_SIZE, 0); + return false; + } + + /* + * 1 = Host Address Translation is not supported. Value in MMIO Offset + * 0030h[HATS] is not meaningful. A non-zero host page table root pointer + * in the DTE would result in an ILLEGAL_DEV_TABLE_ENTRY event. + */ + root = (dte[0] & AMDVI_DEV_PT_ROOT_MASK) >> 12; + if (root && !s->iommu.dma_translation) { + amdvi_log_illegaldevtab_error(s, devid, + s->devtab + + devid * AMDVI_DEVTAB_ENTRY_SIZE, 0); + return false; + } + + return true; +} + +/* get a device table entry given the devid */ +static bool amdvi_get_dte(AMDVIState *s, int devid, uint64_t *entry) +{ + uint32_t offset = devid * AMDVI_DEVTAB_ENTRY_SIZE; + + if (dma_memory_read(&address_space_memory, s->devtab + offset, entry, + AMDVI_DEVTAB_ENTRY_SIZE, MEMTXATTRS_UNSPECIFIED)) { + trace_amdvi_dte_get_fail(s->devtab, offset); + /* log error accessing dte */ + amdvi_log_devtab_error(s, devid, s->devtab + offset, 0); + return false; + } + + *entry = le64_to_cpu(*entry); + if (!amdvi_validate_dte(s, devid, entry)) { + trace_amdvi_invalid_dte(entry[0]); + return false; + } + + return true; +} + +/* get pte translation mode */ +static inline uint8_t get_pte_translation_mode(uint64_t pte) +{ + return (pte >> AMDVI_DEV_MODE_RSHIFT) & AMDVI_DEV_MODE_MASK; +} + +static inline uint64_t amdvi_get_pte_entry(AMDVIState *s, uint64_t pte_addr, + uint16_t devid) +{ + uint64_t pte; + + if (dma_memory_read(&address_space_memory, pte_addr, + &pte, sizeof(pte), MEMTXATTRS_UNSPECIFIED)) { + trace_amdvi_get_pte_hwerror(pte_addr); + amdvi_log_pagetab_error(s, devid, pte_addr, 0); + pte = (uint64_t)-1; + return pte; + } + + pte = le64_to_cpu(pte); + return pte; +} + +static int amdvi_as_to_dte(AMDVIAddressSpace *as, uint64_t *dte) +{ + uint16_t devid = PCI_BUILD_BDF(as->bus_num, as->devfn); + AMDVIState *s = as->iommu_state; + + if (!amdvi_get_dte(s, devid, dte)) { + /* Unable to retrieve DTE for devid */ + return -AMDVI_FR_DTE_RTR_ERR; + } + + if (!(dte[0] & AMDVI_DEV_VALID)) { + /* DTE[V] not set, address is passed untranslated for devid */ + return -AMDVI_FR_DTE_V; + } + + if (!(dte[0] & AMDVI_DEV_TRANSLATION_VALID)) { + /* DTE[TV] not set, host page table not valid for devid */ + return -AMDVI_FR_DTE_TV; + } + return 0; +} + +/* + * For a PTE encoding a large page, return the page size it encodes as described + * by the AMD IOMMU Specification Table 14: Example Page Size Encodings. + * No need to adjust the value of the PTE to point to the first PTE in the large + * page since the encoding guarantees all "base" PTEs in the large page are the + * same. + */ +static uint64_t large_pte_page_size(uint64_t pte) +{ + assert(PTE_NEXT_LEVEL(pte) == 7); + + /* Determine size of the large/contiguous page encoded in the PTE */ + return PTE_LARGE_PAGE_SIZE(pte); +} + +/* + * Helper function to fetch a PTE using AMD v1 pgtable format. + * On successful page walk, returns 0 and pte parameter points to a valid PTE. + * On failure, returns: + * -AMDVI_FR_PT_ROOT_INV: A page walk is not possible due to conditions like DTE + * with invalid permissions, Page Table Root can not be read from DTE, or a + * larger IOVA than supported by page table level encoded in DTE[Mode]. + * -AMDVI_FR_PT_ENTRY_INV: A PTE could not be read from guest memory during a + * page table walk. This means that the DTE has valid data, but one of the + * lower level entries in the Page Table could not be read. + */ +static uint64_t fetch_pte(AMDVIAddressSpace *as, hwaddr address, uint64_t dte, + uint64_t *pte, hwaddr *page_size) +{ + IOMMUAccessFlags perms = amdvi_get_perms(dte); + + uint8_t level, mode; + uint64_t pte_addr; + + *pte = dte; + *page_size = 0; + + if (perms == IOMMU_NONE) { + return -AMDVI_FR_PT_ROOT_INV; + } + + /* + * The Linux kernel driver initializes the default mode to 3, corresponding + * to a 39-bit GPA space, where each entry in the pagetable translates to a + * 1GB (2^30) page size. + */ + level = mode = get_pte_translation_mode(dte); + assert(mode > 0 && mode < 7); + + /* + * If IOVA is larger than the max supported by the current pgtable level, + * there is nothing to do. + */ + if (address > PT_LEVEL_MAX_ADDR(mode - 1)) { + /* IOVA too large for the current DTE */ + return -AMDVI_FR_PT_ROOT_INV; + } + + do { + level -= 1; + + /* Update the page_size */ + *page_size = PTE_LEVEL_PAGE_SIZE(level); + + /* Permission bits are ANDed at every level, including the DTE */ + perms &= amdvi_get_perms(*pte); + if (perms == IOMMU_NONE) { + return 0; + } + + /* Not Present */ + if (!IOMMU_PTE_PRESENT(*pte)) { + return 0; + } + + /* Large or Leaf PTE found */ + if (PTE_NEXT_LEVEL(*pte) == 7 || PTE_NEXT_LEVEL(*pte) == 0) { + /* Leaf PTE found */ + break; + } + + /* + * Index the pgtable using the IOVA bits corresponding to current level + * and walk down to the lower level. + */ + pte_addr = NEXT_PTE_ADDR(*pte, level, address); + *pte = amdvi_get_pte_entry(as->iommu_state, pte_addr, as->devfn); + + if (*pte == (uint64_t)-1) { + /* + * A returned PTE of -1 indicates a failure to read the page table + * entry from guest memory. + */ + if (level == mode - 1) { + /* Failure to retrieve the Page Table from Root Pointer */ + *page_size = 0; + return -AMDVI_FR_PT_ROOT_INV; + } else { + /* Failure to read PTE. Page walk skips a page_size chunk */ + return -AMDVI_FR_PT_ENTRY_INV; + } + } + } while (level > 0); + + assert(PTE_NEXT_LEVEL(*pte) == 0 || PTE_NEXT_LEVEL(*pte) == 7 || + level == 0); + /* + * Page walk ends when Next Level field on PTE shows that either a leaf PTE + * or a series of large PTEs have been reached. In the latter case, even if + * the range starts in the middle of a contiguous page, the returned PTE + * must be the first PTE of the series. + */ + if (PTE_NEXT_LEVEL(*pte) == 7) { + /* Update page_size with the large PTE page size */ + *page_size = large_pte_page_size(*pte); + } + + return 0; +} + +/* + * Invoke notifiers registered for the address space. Update record of mapped + * ranges in IOVA Tree. + */ +static void amdvi_notify_iommu(AMDVIAddressSpace *as, IOMMUTLBEvent *event) +{ + IOMMUTLBEntry *entry = &event->entry; + + DMAMap target = { + .iova = entry->iova, + .size = entry->addr_mask, + .translated_addr = entry->translated_addr, + .perm = entry->perm, + }; + + /* + * Search the IOVA Tree for an existing translation for the target, and skip + * the notification if the mapping is already recorded. + * When the guest uses large pages, comparing against the record makes it + * possible to determine the size of the original MAP and adjust the UNMAP + * request to match it. This avoids failed checks against the mappings kept + * by the VFIO kernel driver. + */ + const DMAMap *mapped = iova_tree_find(as->iova_tree, &target); + + if (event->type == IOMMU_NOTIFIER_UNMAP) { + if (!mapped) { + /* No record exists of this mapping, nothing to do */ + return; + } + /* + * Adjust the size based on the original record. This is essential to + * determine when large/contiguous pages are used, since the guest has + * already cleared the PTE (erasing the pagesize encoded on it) before + * issuing the invalidation command. + */ + if (mapped->size != target.size) { + assert(mapped->size > target.size); + target.size = mapped->size; + /* Adjust event to invoke notifier with correct range */ + entry->addr_mask = mapped->size; + } + iova_tree_remove(as->iova_tree, target); + } else { /* IOMMU_NOTIFIER_MAP */ + if (mapped) { + /* + * If a mapping is present and matches the request, skip the + * notification. + */ + if (!memcmp(mapped, &target, sizeof(DMAMap))) { + return; + } else { + /* + * This should never happen unless a buggy guest OS omits or + * sends incorrect invalidation(s). Report an error in the event + * it does happen. + */ + error_report("Found conflicting translation. This could be due " + "to an incorrect or missing invalidation command"); + } + } + /* Record the new mapping */ + iova_tree_insert(as->iova_tree, &target); + } + + /* Invoke the notifiers registered for this address space */ + memory_region_notify_iommu(&as->iommu, 0, *event); +} + +/* + * Walk the guest page table for an IOVA and range and signal the registered + * notifiers to sync the shadow page tables in the host. + * Must be called with a valid DTE for DMA remapping i.e. V=1,TV=1 + */ +static void amdvi_sync_shadow_page_table_range(AMDVIAddressSpace *as, + uint64_t *dte, hwaddr addr, + uint64_t size, bool send_unmap) +{ + IOMMUTLBEvent event; + + hwaddr page_mask, pagesize; + hwaddr iova = addr; + hwaddr end = iova + size - 1; + + uint64_t pte; + int ret; + + while (iova < end) { + + ret = fetch_pte(as, iova, dte[0], &pte, &pagesize); + + if (ret == -AMDVI_FR_PT_ROOT_INV) { + /* + * Invalid conditions such as the IOVA being larger than supported + * by current page table mode as configured in the DTE, or a failure + * to fetch the Page Table from the Page Table Root Pointer in DTE. + */ + assert(pagesize == 0); + return; + } + /* PTE has been validated for major errors and pagesize is set */ + assert(pagesize); + page_mask = ~(pagesize - 1); + + if (ret == -AMDVI_FR_PT_ENTRY_INV) { + /* + * Failure to read PTE from memory, the pagesize matches the current + * level. Unable to determine the region type, so a safe strategy is + * to skip the range and continue the page walk. + */ + goto next; + } + + event.entry.target_as = &address_space_memory; + event.entry.iova = iova & page_mask; + /* translated_addr is irrelevant for the unmap case */ + event.entry.translated_addr = (pte & AMDVI_DEV_PT_ROOT_MASK) & + page_mask; + event.entry.addr_mask = ~page_mask; + event.entry.perm = amdvi_get_perms(pte); + + /* + * In cases where the leaf PTE is not found, or it has invalid + * permissions, an UNMAP type notification is sent, but only if the + * caller requested it. + */ + if (!IOMMU_PTE_PRESENT(pte) || (event.entry.perm == IOMMU_NONE)) { + if (!send_unmap) { + goto next; + } + event.type = IOMMU_NOTIFIER_UNMAP; + } else { + event.type = IOMMU_NOTIFIER_MAP; + } + + /* + * The following call might need to adjust event.entry.size in cases + * where the guest unmapped a series of large pages. + */ + amdvi_notify_iommu(as, &event); + /* + * In the special scenario where the guest is unmapping a large page, + * addr_mask has been adjusted before sending the notification. Update + * pagesize accordingly in order to correctly compute the next IOVA. + */ + pagesize = event.entry.addr_mask + 1; + +next: + iova &= ~(pagesize - 1); + + /* Check for 64-bit overflow and terminate walk in such cases */ + if ((iova + pagesize) < iova) { + break; + } else { + iova += pagesize; + } + } +} + +/* + * Unmap entire range that the notifier registered for i.e. the full AS. + * + * This is seemingly technically equivalent to directly calling + * memory_region_unmap_iommu_notifier_range(), but it allows to check for + * notifier boundaries and issue notifications with ranges within those bounds. + */ +static void amdvi_address_space_unmap(AMDVIAddressSpace *as, IOMMUNotifier *n) +{ + + hwaddr start = n->start; + hwaddr end = n->end; + hwaddr remain; + DMAMap map; + + assert(start <= end); + remain = end - start + 1; + + /* + * Divide the notifier range into chunks that are aligned and do not exceed + * the notifier boundaries. + */ + while (remain >= AMDVI_PAGE_SIZE) { + + IOMMUTLBEvent event; + + uint64_t mask = dma_aligned_pow2_mask(start, end, 64); + + event.type = IOMMU_NOTIFIER_UNMAP; + + IOMMUTLBEntry entry = { + .target_as = &address_space_memory, + .iova = start, + .translated_addr = 0, /* irrelevant for unmap case */ + .addr_mask = mask, + .perm = IOMMU_NONE, + }; + event.entry = entry; + + /* Call notifier registered for updates on this address space */ + memory_region_notify_iommu_one(n, &event); + + start += mask + 1; + remain -= mask + 1; + } + + assert(!remain); + + map.iova = n->start; + map.size = n->end - n->start; + + iova_tree_remove(as->iova_tree, map); +} + +/* + * For all the address spaces with notifiers registered, unmap the entire range + * the notifier registered for i.e. clear all the address spaces managed by the + * IOMMU. + */ +static void amdvi_address_space_unmap_all(AMDVIState *s) +{ + AMDVIAddressSpace *as; + IOMMUNotifier *n; + + QLIST_FOREACH(as, &s->amdvi_as_with_notifiers, next) { + IOMMU_NOTIFIER_FOREACH(n, &as->iommu) { + amdvi_address_space_unmap(as, n); + } + } +} + +/* + * For every translation present in the IOMMU, construct IOMMUTLBEntry data + * and pass it as parameter to notifier callback. + */ +static void amdvi_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n) +{ + AMDVIAddressSpace *as = container_of(iommu_mr, AMDVIAddressSpace, iommu); + uint64_t dte[4] = { 0 }; + + if (!(n->notifier_flags & IOMMU_NOTIFIER_MAP)) { + return; + } + + if (amdvi_as_to_dte(as, dte)) { + return; + } + + /* Dropping all mappings for the address space. Also clears the IOVA tree */ + amdvi_address_space_unmap(as, n); + + amdvi_sync_shadow_page_table_range(as, &dte[0], 0, UINT64_MAX, false); +} + +static void amdvi_address_space_sync(AMDVIAddressSpace *as) +{ + IOMMUNotifier *n; + uint64_t dte[4] = { 0 }; + + /* If only UNMAP notifiers are registered, drop all existing mappings */ + if (!(as->notifier_flags & IOMMU_NOTIFIER_MAP)) { + IOMMU_NOTIFIER_FOREACH(n, &as->iommu) { + /* + * Directly calling memory_region_unmap_iommu_notifier_range() does + * not guarantee that the addr_mask eventually passed as parameter + * to the notifier is valid. Use amdvi_address_space_unmap() which + * ensures the notifier range is divided into properly aligned + * regions, and issues notifications for each one. + */ + amdvi_address_space_unmap(as, n); + } + return; + } + + if (amdvi_as_to_dte(as, dte)) { + return; + } + + amdvi_sync_shadow_page_table_range(as, &dte[0], 0, UINT64_MAX, true); +} + +/* + * This differs from the replay() method in that it issues both MAP and UNMAP + * notifications since it is called after global invalidation events in order to + * re-sync all address spaces. + */ +static void amdvi_iommu_address_space_sync_all(AMDVIState *s) +{ + AMDVIAddressSpace *as; + + QLIST_FOREACH(as, &s->amdvi_as_with_notifiers, next) { + amdvi_address_space_sync(as); + } +} + +/* + * Toggle between address translation and passthrough modes by enabling the + * corresponding memory regions. + */ +static void amdvi_switch_address_space(AMDVIAddressSpace *amdvi_as) +{ + AMDVIState *s = amdvi_as->iommu_state; + + if (s->dma_remap && amdvi_as->addr_translation) { + /* Enabling DMA region */ + memory_region_set_enabled(&amdvi_as->iommu_nodma, false); + memory_region_set_enabled(MEMORY_REGION(&amdvi_as->iommu), true); + } else { + /* Disabling DMA region, using passthrough */ + memory_region_set_enabled(MEMORY_REGION(&amdvi_as->iommu), false); + memory_region_set_enabled(&amdvi_as->iommu_nodma, true); + } +} + +/* + * For all existing address spaces managed by the IOMMU, enable/disable the + * corresponding memory regions to reset the address translation mode and + * use passthrough by default. + */ +static void amdvi_reset_address_translation_all(AMDVIState *s) +{ + AMDVIAddressSpace **iommu_as; + + for (int bus_num = 0; bus_num < PCI_BUS_MAX; bus_num++) { + + /* Nothing to do if there are no devices on the current bus */ + if (!s->address_spaces[bus_num]) { + continue; + } + iommu_as = s->address_spaces[bus_num]; + + for (int devfn = 0; devfn < PCI_DEVFN_MAX; devfn++) { + + if (!iommu_as[devfn]) { + continue; + } + /* Use passthrough as default mode after reset */ + iommu_as[devfn]->addr_translation = false; + amdvi_switch_address_space(iommu_as[devfn]); + } + } +} + +static void enable_dma_mode(AMDVIAddressSpace *as, bool inval_current) +{ + /* + * When enabling DMA mode for the purpose of isolating guest devices on + * a failure to retrieve or invalid DTE, all existing mappings must be + * dropped. + */ + if (inval_current) { + IOMMUNotifier *n; + IOMMU_NOTIFIER_FOREACH(n, &as->iommu) { + amdvi_address_space_unmap(as, n); + } + } + + if (as->addr_translation) { + return; + } + + /* Installing DTE enabling translation, activate region */ + as->addr_translation = true; + amdvi_switch_address_space(as); + /* Sync shadow page tables */ + amdvi_address_space_sync(as); +} + +/* + * If paging was previously in use in the address space + * - invalidate all existing mappings + * - switch to no_dma memory region + */ +static void enable_nodma_mode(AMDVIAddressSpace *as) +{ + IOMMUNotifier *n; + + if (!as->addr_translation) { + /* passthrough is already active, nothing to do */ + return; + } + + as->addr_translation = false; + IOMMU_NOTIFIER_FOREACH(n, &as->iommu) { + /* Drop all mappings for the address space */ + amdvi_address_space_unmap(as, n); + } + amdvi_switch_address_space(as); +} + +/* + * A guest driver must issue the INVALIDATE_DEVTAB_ENTRY command to the IOMMU + * after changing a Device Table entry. We can use this fact to detect when a + * Device Table entry is created for a device attached to a paging domain and + * enable the corresponding IOMMU memory region to allow for DMA translation if + * appropriate. + */ +static void amdvi_update_addr_translation_mode(AMDVIState *s, uint16_t devid) +{ + uint8_t bus_num, devfn, dte_mode; + AMDVIAddressSpace *as; + uint64_t dte[4] = { 0 }; + int ret; + + /* + * Convert the devid encoded in the command to a bus and devfn in + * order to retrieve the corresponding address space. + */ + bus_num = PCI_BUS_NUM(devid); + devfn = devid & 0xff; + + /* + * The main buffer of size (AMDVIAddressSpace *) * (PCI_BUS_MAX) has already + * been allocated within AMDVIState, but must be careful to not access + * unallocated devfn. + */ + if (!s->address_spaces[bus_num] || !s->address_spaces[bus_num][devfn]) { + return; + } + as = s->address_spaces[bus_num][devfn]; + + ret = amdvi_as_to_dte(as, dte); + + if (!ret) { + dte_mode = (dte[0] >> AMDVI_DEV_MODE_RSHIFT) & AMDVI_DEV_MODE_MASK; + } + + switch (ret) { + case 0: + /* DTE was successfully retrieved */ + if (!dte_mode) { + enable_nodma_mode(as); /* DTE[V]=1 && DTE[Mode]=0 => passthrough */ + } else { + enable_dma_mode(as, false); /* Enable DMA translation */ + } + break; + case -AMDVI_FR_DTE_V: + /* DTE[V]=0, address is passed untranslated */ + enable_nodma_mode(as); + break; + case -AMDVI_FR_DTE_RTR_ERR: + case -AMDVI_FR_DTE_TV: + /* + * Enforce isolation by using DMA in rare scenarios where the DTE cannot + * be retrieved or DTE[TV]=0. Existing mappings are dropped. + */ + enable_dma_mode(as, true); + break; + } +} + /* log error without aborting since linux seems to be using reserved bits */ static void amdvi_inval_devtab_entry(AMDVIState *s, uint64_t *cmd) { uint16_t devid = cpu_to_le16((uint16_t)extract64(cmd[0], 0, 16)); + trace_amdvi_devtab_inval(PCI_BUS_NUM(devid), PCI_SLOT(devid), + PCI_FUNC(devid)); + /* This command should invalidate internal caches of which there isn't */ if (extract64(cmd[0], 16, 44) || cmd[1]) { amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4), s->cmdbuf + s->cmdbuf_head); + return; + } + + /* + * When DMA remapping capability is enabled, check if updated DTE is setup + * for paging or not, and configure the corresponding memory regions. + */ + if (s->dma_remap) { + amdvi_update_addr_translation_mode(s, devid); } - trace_amdvi_devtab_inval(PCI_BUS_NUM(devid), PCI_SLOT(devid), - PCI_FUNC(devid)); } static void amdvi_complete_ppr(AMDVIState *s, uint64_t *cmd) @@ -480,6 +1193,13 @@ static void amdvi_inval_all(AMDVIState *s, uint64_t *cmd) amdvi_intremap_inval_notify_all(s, true, 0, 0); amdvi_iotlb_reset(s); + + /* + * Fully replay the address space i.e. send both UNMAP and MAP events in + * order to synchronize guest and host IO page tables tables. + */ + amdvi_iommu_address_space_sync_all(s); + trace_amdvi_all_inval(); } @@ -491,10 +1211,109 @@ static gboolean amdvi_iotlb_remove_by_domid(gpointer key, gpointer value, return entry->domid == domid; } +/* + * Helper to decode the size of the range to invalidate encoded in the + * INVALIDATE_IOMMU_PAGES Command format. + * The size of the region to invalidate depends on the S bit and address. + * S bit value: + * 0 : Invalidation size is 4 Kbytes. + * 1 : Invalidation size is determined by first zero bit in the address + * starting from Address[12]. + * + * In the AMD IOMMU Linux driver, an invalidation command with address + * ((1 << 63) - 1) is sent when intending to clear the entire cache. + * However, Table 14: Example Page Size Encodings shows that an address of + * ((1ULL << 51) - 1) encodes the entire cache, so effectively any address with + * first zero at bit 51 or larger is a request to invalidate the entire address + * space. + */ +static uint64_t amdvi_decode_invalidation_size(hwaddr addr, uint16_t flags) +{ + uint64_t size = AMDVI_PAGE_SIZE; + uint8_t fzbit = 0; + + if (flags & AMDVI_CMD_INVAL_IOMMU_PAGES_S) { + fzbit = cto64(addr | 0xFFF); + + if (fzbit >= 51) { + size = AMDVI_INV_ALL_PAGES; + } else { + size = 1ULL << (fzbit + 1); + } + } + return size; +} + +/* + * Synchronize the guest page tables with the shadow page tables kept in the + * host for the specified range. + * The invalidation command issued by the guest and intercepted by the VMM + * does not specify a device, but a domain, since all devices in the same domain + * share the same page tables. However, vIOMMU emulation creates separate + * address spaces per device, so it is necessary to traverse the list of all of + * address spaces (i.e. devices) that have notifiers registered in order to + * propagate the changes to the host page tables. + * We cannot return early from this function once a matching domain has been + * identified and its page tables synced (based on the fact that all devices in + * the same domain share the page tables). The reason is that different devices + * (i.e. address spaces) could have different notifiers registered, and by + * skipping address spaces that appear later on the amdvi_as_with_notifiers list + * their notifiers (which could differ from the ones registered for the first + * device/address space) would not be invoked. + */ +static void amdvi_sync_domain(AMDVIState *s, uint16_t domid, uint64_t addr, + uint16_t flags) +{ + AMDVIAddressSpace *as; + + uint64_t size = amdvi_decode_invalidation_size(addr, flags); + + if (size == AMDVI_INV_ALL_PAGES) { + addr = 0; /* Set start address to 0 and invalidate entire AS */ + } else { + addr &= ~(size - 1); + } + + /* + * Call notifiers that have registered for each address space matching the + * domain ID, in order to sync the guest pagetable state with the host. + */ + QLIST_FOREACH(as, &s->amdvi_as_with_notifiers, next) { + + uint64_t dte[4] = { 0 }; + + /* + * Retrieve the Device Table entry for the devid corresponding to the + * current address space, and verify the DomainID matches i.e. the page + * tables to be synced belong to devices in the domain. + */ + if (amdvi_as_to_dte(as, dte)) { + continue; + } + + /* Only need to sync the Page Tables for a matching domain */ + if (domid != (dte[1] & AMDVI_DEV_DOMID_ID_MASK)) { + continue; + } + + /* + * We have determined that there is a valid Device Table Entry for a + * device matching the DomainID in the INV_IOMMU_PAGES command issued by + * the guest. Walk the guest page table to sync shadow page table. + */ + if (as->notifier_flags & IOMMU_NOTIFIER_MAP) { + /* Sync guest IOMMU mappings with host */ + amdvi_sync_shadow_page_table_range(as, &dte[0], addr, size, true); + } + } +} + /* we don't have devid - we can't remove pages by address */ static void amdvi_inval_pages(AMDVIState *s, uint64_t *cmd) { uint16_t domid = cpu_to_le16((uint16_t)extract64(cmd[0], 32, 16)); + uint64_t addr = cpu_to_le64(extract64(cmd[1], 12, 52)) << 12; + uint16_t flags = cpu_to_le16((uint16_t)extract64(cmd[1], 0, 3)); if (extract64(cmd[0], 20, 12) || extract64(cmd[0], 48, 12) || extract64(cmd[1], 3, 9)) { @@ -504,6 +1323,8 @@ static void amdvi_inval_pages(AMDVIState *s, uint64_t *cmd) g_hash_table_foreach_remove(s->iotlb, amdvi_iotlb_remove_by_domid, &domid); + + amdvi_sync_domain(s, domid, addr, flags); trace_amdvi_pages_inval(domid); } @@ -894,150 +1715,68 @@ static void amdvi_mmio_write(void *opaque, hwaddr addr, uint64_t val, } } -static inline uint64_t amdvi_get_perms(uint64_t entry) -{ - return (entry & (AMDVI_DEV_PERM_READ | AMDVI_DEV_PERM_WRITE)) >> - AMDVI_DEV_PERM_SHIFT; -} - -/* validate that reserved bits are honoured */ -static bool amdvi_validate_dte(AMDVIState *s, uint16_t devid, - uint64_t *dte) -{ - if ((dte[0] & AMDVI_DTE_QUAD0_RESERVED) || - (dte[1] & AMDVI_DTE_QUAD1_RESERVED) || - (dte[2] & AMDVI_DTE_QUAD2_RESERVED) || - (dte[3] & AMDVI_DTE_QUAD3_RESERVED)) { - amdvi_log_illegaldevtab_error(s, devid, - s->devtab + - devid * AMDVI_DEVTAB_ENTRY_SIZE, 0); - return false; - } - - return true; -} - -/* get a device table entry given the devid */ -static bool amdvi_get_dte(AMDVIState *s, int devid, uint64_t *entry) +static void amdvi_page_walk(AMDVIAddressSpace *as, uint64_t *dte, + IOMMUTLBEntry *ret, unsigned perms, + hwaddr addr) { - uint32_t offset = devid * AMDVI_DEVTAB_ENTRY_SIZE; + hwaddr page_mask, pagesize = 0; + uint8_t mode; + uint64_t pte; + int fetch_ret; - if (dma_memory_read(&address_space_memory, s->devtab + offset, entry, - AMDVI_DEVTAB_ENTRY_SIZE, MEMTXATTRS_UNSPECIFIED)) { - trace_amdvi_dte_get_fail(s->devtab, offset); - /* log error accessing dte */ - amdvi_log_devtab_error(s, devid, s->devtab + offset, 0); - return false; + /* make sure the DTE has TV = 1 */ + if (!(dte[0] & AMDVI_DEV_TRANSLATION_VALID)) { + /* + * A DTE with V=1, TV=0 does not have a valid Page Table Root Pointer. + * An IOMMU processing a request that requires a table walk terminates + * the walk when it encounters this condition. Do the same and return + * instead of assuming that the address is forwarded without translation + * i.e. the passthrough case, as it is done for the case where DTE[V]=0. + */ + return; } - *entry = le64_to_cpu(*entry); - if (!amdvi_validate_dte(s, devid, entry)) { - trace_amdvi_invalid_dte(entry[0]); - return false; + mode = get_pte_translation_mode(dte[0]); + if (mode >= 7) { + trace_amdvi_mode_invalid(mode, addr); + return; } - - return true; -} - -/* get pte translation mode */ -static inline uint8_t get_pte_translation_mode(uint64_t pte) -{ - return (pte >> AMDVI_DEV_MODE_RSHIFT) & AMDVI_DEV_MODE_MASK; -} - -static inline uint64_t pte_override_page_mask(uint64_t pte) -{ - uint8_t page_mask = 13; - uint64_t addr = (pte & AMDVI_DEV_PT_ROOT_MASK) >> 12; - /* find the first zero bit */ - while (addr & 1) { - page_mask++; - addr = addr >> 1; + if (mode == 0) { + goto no_remap; } - return ~((1ULL << page_mask) - 1); -} - -static inline uint64_t pte_get_page_mask(uint64_t oldlevel) -{ - return ~((1UL << ((oldlevel * 9) + 3)) - 1); -} + /* Attempt to fetch the PTE to determine if a valid mapping exists */ + fetch_ret = fetch_pte(as, addr, dte[0], &pte, &pagesize); -static inline uint64_t amdvi_get_pte_entry(AMDVIState *s, uint64_t pte_addr, - uint16_t devid) -{ - uint64_t pte; + /* + * If walking the page table results in an error of any type, returns an + * empty PTE i.e. no mapping, or the permissions do not match, return since + * there is no translation available. + */ + if (fetch_ret < 0 || !IOMMU_PTE_PRESENT(pte) || + perms != (perms & amdvi_get_perms(pte))) { - if (dma_memory_read(&address_space_memory, pte_addr, - &pte, sizeof(pte), MEMTXATTRS_UNSPECIFIED)) { - trace_amdvi_get_pte_hwerror(pte_addr); - amdvi_log_pagetab_error(s, devid, pte_addr, 0); - pte = 0; - return pte; + amdvi_page_fault(as->iommu_state, as->devfn, addr, perms); + trace_amdvi_page_fault(addr); + return; } - pte = le64_to_cpu(pte); - return pte; -} - -static void amdvi_page_walk(AMDVIAddressSpace *as, uint64_t *dte, - IOMMUTLBEntry *ret, unsigned perms, - hwaddr addr) -{ - unsigned level, present, pte_perms, oldlevel; - uint64_t pte = dte[0], pte_addr, page_mask; - - /* make sure the DTE has TV = 1 */ - if (pte & AMDVI_DEV_TRANSLATION_VALID) { - level = get_pte_translation_mode(pte); - if (level >= 7) { - trace_amdvi_mode_invalid(level, addr); - return; - } - if (level == 0) { - goto no_remap; - } - - /* we are at the leaf page table or page table encodes a huge page */ - do { - pte_perms = amdvi_get_perms(pte); - present = pte & 1; - if (!present || perms != (perms & pte_perms)) { - amdvi_page_fault(as->iommu_state, as->devfn, addr, perms); - trace_amdvi_page_fault(addr); - return; - } - - /* go to the next lower level */ - pte_addr = pte & AMDVI_DEV_PT_ROOT_MASK; - /* add offset and load pte */ - pte_addr += ((addr >> (3 + 9 * level)) & 0x1FF) << 3; - pte = amdvi_get_pte_entry(as->iommu_state, pte_addr, as->devfn); - if (!pte) { - return; - } - oldlevel = level; - level = get_pte_translation_mode(pte); - } while (level > 0 && level < 7); + /* A valid PTE and page size has been retrieved */ + assert(pagesize); + page_mask = ~(pagesize - 1); - if (level == 0x7) { - page_mask = pte_override_page_mask(pte); - } else { - page_mask = pte_get_page_mask(oldlevel); - } + /* get access permissions from pte */ + ret->iova = addr & page_mask; + ret->translated_addr = (pte & AMDVI_DEV_PT_ROOT_MASK) & page_mask; + ret->addr_mask = ~page_mask; + ret->perm = amdvi_get_perms(pte); + return; - /* get access permissions from pte */ - ret->iova = addr & page_mask; - ret->translated_addr = (pte & AMDVI_DEV_PT_ROOT_MASK) & page_mask; - ret->addr_mask = ~page_mask; - ret->perm = amdvi_get_perms(pte); - return; - } no_remap: ret->iova = addr & AMDVI_PAGE_MASK_4K; ret->translated_addr = addr & AMDVI_PAGE_MASK_4K; ret->addr_mask = ~AMDVI_PAGE_MASK_4K; - ret->perm = amdvi_get_perms(pte); + ret->perm = amdvi_get_perms(dte[0]); } static void amdvi_do_translate(AMDVIAddressSpace *as, hwaddr addr, @@ -1047,6 +1786,7 @@ static void amdvi_do_translate(AMDVIAddressSpace *as, hwaddr addr, uint16_t devid = PCI_BUILD_BDF(as->bus_num, as->devfn); AMDVIIOTLBEntry *iotlb_entry = amdvi_iotlb_lookup(s, addr, devid); uint64_t entry[4]; + int dte_ret; if (iotlb_entry) { trace_amdvi_iotlb_hit(PCI_BUS_NUM(devid), PCI_SLOT(devid), @@ -1058,13 +1798,14 @@ static void amdvi_do_translate(AMDVIAddressSpace *as, hwaddr addr, return; } - if (!amdvi_get_dte(s, devid, entry)) { - return; - } + dte_ret = amdvi_as_to_dte(as, entry); - /* devices with V = 0 are not translated */ - if (!(entry[0] & AMDVI_DEV_VALID)) { - goto out; + if (dte_ret < 0) { + if (dte_ret == -AMDVI_FR_DTE_V) { + /* DTE[V]=0, address is passed untranslated */ + goto out; + } + return; } amdvi_page_walk(as, entry, ret, @@ -1500,6 +2241,9 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) iommu_as[devfn]->bus_num = (uint8_t)bus_num; iommu_as[devfn]->devfn = (uint8_t)devfn; iommu_as[devfn]->iommu_state = s; + iommu_as[devfn]->notifier_flags = IOMMU_NOTIFIER_NONE; + iommu_as[devfn]->iova_tree = iova_tree_new(); + iommu_as[devfn]->addr_translation = false; amdvi_dev_as = iommu_as[devfn]; @@ -1542,8 +2286,7 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) AMDVI_INT_ADDR_FIRST, &amdvi_dev_as->iommu_ir, 1); - memory_region_set_enabled(&amdvi_dev_as->iommu_nodma, false); - memory_region_set_enabled(MEMORY_REGION(&amdvi_dev_as->iommu), true); + amdvi_switch_address_space(amdvi_dev_as); } return &iommu_as[devfn]->as; } @@ -1573,14 +2316,35 @@ static int amdvi_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu, Error **errp) { AMDVIAddressSpace *as = container_of(iommu, AMDVIAddressSpace, iommu); + AMDVIState *s = as->iommu_state; - if (new & IOMMU_NOTIFIER_MAP) { - error_setg(errp, - "device %02x.%02x.%x requires iommu notifier which is not " - "currently supported", as->bus_num, PCI_SLOT(as->devfn), - PCI_FUNC(as->devfn)); - return -EINVAL; + /* + * Accurate synchronization of the vIOMMU page tables required to support + * MAP notifiers is provided by the dma-remap feature. In addition, this + * also requires that the vIOMMU presents the NpCache capability, so a guest + * driver issues invalidations for both map() and unmap() operations. The + * capability is already set by default as part of AMDVI_CAPAB_FEATURES and + * written to the configuration in amdvi_pci_realize(). + */ + if (!s->dma_remap && (new & IOMMU_NOTIFIER_MAP)) { + error_setg_errno(errp, ENOTSUP, + "device %02x.%02x.%x requires dma-remap=1", + as->bus_num, PCI_SLOT(as->devfn), PCI_FUNC(as->devfn)); + return -ENOTSUP; } + + /* + * Update notifier flags for address space and the list of address spaces + * with registered notifiers. + */ + as->notifier_flags = new; + + if (old == IOMMU_NOTIFIER_NONE) { + QLIST_INSERT_HEAD(&s->amdvi_as_with_notifiers, as, next); + } else if (new == IOMMU_NOTIFIER_NONE) { + QLIST_REMOVE(as, next); + } + return 0; } @@ -1657,6 +2421,10 @@ static void amdvi_sysbus_reset(DeviceState *dev) msi_reset(&s->pci->dev); amdvi_init(s); + + /* Discard all mappings on device reset */ + amdvi_address_space_unmap_all(s); + amdvi_reset_address_translation_all(s); } static const VMStateDescription vmstate_amdvi_sysbus_migratable = { @@ -1787,6 +2555,7 @@ static void amdvi_sysbus_realize(DeviceState *dev, Error **errp) static const Property amdvi_properties[] = { DEFINE_PROP_BOOL("xtsup", AMDVIState, xtsup, false), DEFINE_PROP_STRING("pci-id", AMDVIState, pci_id), + DEFINE_PROP_BOOL("dma-remap", AMDVIState, dma_remap, false), }; static const VMStateDescription vmstate_amdvi_sysbus = { @@ -1848,6 +2617,7 @@ static void amdvi_iommu_memory_region_class_init(ObjectClass *klass, imrc->translate = amdvi_translate; imrc->notify_flag_changed = amdvi_iommu_notify_flag_changed; + imrc->replay = amdvi_iommu_replay; } static const TypeInfo amdvi_iommu_memory_region_info = { diff --git a/hw/i386/amd_iommu.h b/hw/i386/amd_iommu.h index 2476296c49..daf82fc85f 100644 --- a/hw/i386/amd_iommu.h +++ b/hw/i386/amd_iommu.h @@ -126,6 +126,10 @@ #define AMDVI_CMD_COMPLETE_PPR_REQUEST 0x07 #define AMDVI_CMD_INVAL_AMDVI_ALL 0x08 + +#define AMDVI_CMD_INVAL_IOMMU_PAGES_S (1ULL << 0) +#define AMDVI_INV_ALL_PAGES (1ULL << 52) + #define AMDVI_DEVTAB_ENTRY_SIZE 32 /* Device table entry bits 0:63 */ @@ -173,6 +177,47 @@ /* AMDVI paging mode */ #define AMDVI_GATS_MODE (2ULL << 12) #define AMDVI_HATS_MODE (2ULL << 10) +#define AMDVI_HATS_MODE_RESERVED (3ULL << 10) + +/* Page Table format */ + +#define AMDVI_PTE_PR (1ULL << 0) +#define AMDVI_PTE_NEXT_LEVEL_MASK GENMASK64(11, 9) + +#define IOMMU_PTE_PRESENT(pte) ((pte) & AMDVI_PTE_PR) + +/* Using level=0 for leaf PTE at 4K page size */ +#define PT_LEVEL_SHIFT(level) (12 + ((level) * 9)) + +/* Return IOVA bit group used to index the Page Table at specific level */ +#define PT_LEVEL_INDEX(level, iova) (((iova) >> PT_LEVEL_SHIFT(level)) & \ + GENMASK64(8, 0)) + +/* Return the max address for a specified level i.e. max_oaddr */ +#define PT_LEVEL_MAX_ADDR(x) (((x) < 5) ? \ + ((1ULL << PT_LEVEL_SHIFT((x + 1))) - 1) : \ + (~(0ULL))) + +/* Extract the NextLevel field from PTE/PDE */ +#define PTE_NEXT_LEVEL(pte) (((pte) & AMDVI_PTE_NEXT_LEVEL_MASK) >> 9) + +/* Take page table level and return default pagetable size for level */ +#define PTE_LEVEL_PAGE_SIZE(level) (1ULL << (PT_LEVEL_SHIFT(level))) + +/* + * Return address of lower level page table encoded in PTE and specified by + * current level and corresponding IOVA bit group at such level. + */ +#define NEXT_PTE_ADDR(pte, level, iova) (((pte) & AMDVI_DEV_PT_ROOT_MASK) + \ + (PT_LEVEL_INDEX(level, iova) * 8)) + +/* + * Take a PTE value with mode=0x07 and return the page size it encodes. + */ +#define PTE_LARGE_PAGE_SIZE(pte) (1ULL << (1 + cto64(((pte) | 0xfffULL)))) + +/* Return number of PTEs to use for a given page size (expected power of 2) */ +#define PAGE_SIZE_PTE_COUNT(pgsz) (1ULL << ((ctz64(pgsz) - 12) % 9)) /* IOTLB */ #define AMDVI_IOTLB_MAX_SIZE 1024 @@ -365,12 +410,18 @@ struct AMDVIState { /* for each served device */ AMDVIAddressSpace **address_spaces[PCI_BUS_MAX]; + /* list of address spaces with registered notifiers */ + QLIST_HEAD(, AMDVIAddressSpace) amdvi_as_with_notifiers; + /* IOTLB */ GHashTable *iotlb; /* Interrupt remapping */ bool ga_enabled; bool xtsup; + + /* DMA address translation */ + bool dma_remap; }; uint64_t amdvi_extended_feature_register(AMDVIState *s); diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 83c5e44413..6a168d5107 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -45,6 +45,8 @@ ((ce)->val[1] & VTD_SM_CONTEXT_ENTRY_RID2PASID_MASK) #define VTD_CE_GET_PASID_DIR_TABLE(ce) \ ((ce)->val[0] & VTD_PASID_DIR_BASE_ADDR_MASK) +#define VTD_CE_GET_PRE(ce) \ + ((ce)->val[0] & VTD_SM_CONTEXT_ENTRY_PRE) /* pe operations */ #define VTD_PE_GET_TYPE(pe) ((pe)->val[0] & VTD_SM_PASID_ENTRY_PGTT) @@ -85,13 +87,6 @@ struct vtd_iotlb_key { static void vtd_address_space_refresh_all(IntelIOMMUState *s); static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n); -static void vtd_panic_require_caching_mode(void) -{ - error_report("We need to set caching-mode=on for intel-iommu to enable " - "device assignment with IOMMU protection."); - exit(1); -} - static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val, uint64_t wmask, uint64_t w1cmask) { @@ -1838,6 +1833,7 @@ static const bool vtd_qualified_faults[] = { [VTD_FR_FS_NON_CANONICAL] = true, [VTD_FR_FS_PAGING_ENTRY_US] = true, [VTD_FR_SM_WRITE] = true, + [VTD_FR_SM_PRE_ABS] = true, [VTD_FR_SM_INTERRUPT_ADDR] = true, [VTD_FR_FS_BIT_UPDATE_FAILED] = true, [VTD_FR_MAX] = false, @@ -2701,7 +2697,7 @@ static void vtd_handle_gcmd_write(IntelIOMMUState *s) uint32_t changed = status ^ val; trace_vtd_reg_write_gcmd(status, val); - if ((changed & VTD_GCMD_TE) && s->dma_translation) { + if ((changed & VTD_GCMD_TE) && x86_iommu->dma_translation) { /* Translation enable/disable */ vtd_handle_gcmd_te(s, val & VTD_GCMD_TE); } @@ -2857,7 +2853,13 @@ static bool vtd_process_wait_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) vtd_generate_completion_event(s); } - if (!(inv_desc->lo & (VTD_INV_DESC_WAIT_IF | VTD_INV_DESC_WAIT_SW))) { + /* + * SW=0, IF=0, FN=1 is also a valid descriptor (VT-d 7.10) + * Nothing to do as we process the descriptors in order + */ + + if (!(inv_desc->lo & (VTD_INV_DESC_WAIT_IF | VTD_INV_DESC_WAIT_SW | + VTD_INV_DESC_WAIT_FN))) { error_report_once("%s: invalid wait desc: hi=%"PRIx64", lo=%"PRIx64 " (unknown type)", __func__, inv_desc->hi, inv_desc->lo); @@ -3146,6 +3148,59 @@ static bool vtd_process_device_piotlb_desc(IntelIOMMUState *s, return true; } +static bool vtd_process_page_group_response_desc(IntelIOMMUState *s, + VTDInvDesc *inv_desc) +{ + VTDAddressSpace *vtd_dev_as; + bool pasid_present; + uint8_t response_code; + uint16_t rid; + uint32_t pasid; + uint16_t prgi; + IOMMUPRIResponse response; + + if ((inv_desc->lo & VTD_INV_DESC_PGRESP_RSVD_LO) || + (inv_desc->hi & VTD_INV_DESC_PGRESP_RSVD_HI)) { + error_report_once("%s: invalid page group response desc: hi=%"PRIx64 + ", lo=%"PRIx64" (reserved nonzero)", __func__, + inv_desc->hi, inv_desc->lo); + return false; + } + + pasid_present = VTD_INV_DESC_PGRESP_PP(inv_desc->lo); + response_code = VTD_INV_DESC_PGRESP_RC(inv_desc->lo); + rid = VTD_INV_DESC_PGRESP_RID(inv_desc->lo); + pasid = VTD_INV_DESC_PGRESP_PASID(inv_desc->lo); + prgi = VTD_INV_DESC_PGRESP_PRGI(inv_desc->hi); + + if (!pasid_present) { + error_report_once("Page group response without PASID is" + "not supported yet"); + return false; + } + + vtd_dev_as = vtd_get_as_by_sid_and_pasid(s, rid, pasid); + if (!vtd_dev_as) { + return true; + } + + response.prgi = prgi; + + if (response_code == 0x0u) { + response.response_code = IOMMU_PRI_RESP_SUCCESS; + } else if (response_code == 0x1u) { + response.response_code = IOMMU_PRI_RESP_INVALID_REQUEST; + } else { + response.response_code = IOMMU_PRI_RESP_FAILURE; + } + + if (vtd_dev_as->pri_notifier) { + vtd_dev_as->pri_notifier->notify(vtd_dev_as->pri_notifier, &response); + } + + return true; +} + static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) { @@ -3246,6 +3301,13 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s) } break; + case VTD_INV_DESC_PGRESP: + trace_vtd_inv_desc("page group response", inv_desc.hi, inv_desc.lo); + if (!vtd_process_page_group_response_desc(s, &inv_desc)) { + return false; + } + break; + /* * TODO: the entity of below two cases will be implemented in future series. * To make guest (which integrates scalable mode support patch set in @@ -3380,6 +3442,27 @@ static void vtd_handle_iectl_write(IntelIOMMUState *s) } } +static void vtd_handle_prs_write(IntelIOMMUState *s) +{ + uint32_t prs = vtd_get_long_raw(s, DMAR_PRS_REG); + if (!(prs & VTD_PR_STATUS_PPR) && !(prs & VTD_PR_STATUS_PRO)) { + vtd_set_clear_mask_long(s, DMAR_PECTL_REG, VTD_PR_PECTL_IP, 0); + } +} + +static void vtd_handle_pectl_write(IntelIOMMUState *s) +{ + uint32_t pectl = vtd_get_long_raw(s, DMAR_PECTL_REG); + if ((pectl & VTD_PR_PECTL_IP) && !(pectl & VTD_PR_PECTL_IM)) { + /* + * If IP field was 1 when software clears the IM field, + * the interrupt is generated along with clearing the IP field. + */ + vtd_set_clear_mask_long(s, DMAR_PECTL_REG, VTD_PR_PECTL_IP, 0); + vtd_generate_interrupt(s, DMAR_PEADDR_REG, DMAR_PEDATA_REG); + } +} + static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size) { IntelIOMMUState *s = opaque; @@ -3422,6 +3505,11 @@ static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size) val = s->iq >> 32; break; + case DMAR_PEUADDR_REG: + assert(size == 4); + val = vtd_get_long_raw(s, DMAR_PEUADDR_REG); + break; + default: if (size == 4) { val = vtd_get_long(s, addr); @@ -3485,6 +3573,11 @@ static void vtd_mem_write(void *opaque, hwaddr addr, vtd_handle_iotlb_write(s); break; + case DMAR_PEUADDR_REG: + assert(size == 4); + vtd_set_long(s, addr, val); + break; + /* Invalidate Address Register, 64-bit */ case DMAR_IVA_REG: if (size == 4) { @@ -3665,6 +3758,18 @@ static void vtd_mem_write(void *opaque, hwaddr addr, vtd_set_long(s, addr, val); break; + case DMAR_PRS_REG: + assert(size == 4); + vtd_set_long(s, addr, val); + vtd_handle_prs_write(s); + break; + + case DMAR_PECTL_REG: + assert(size == 4); + vtd_set_long(s, addr, val); + vtd_handle_pectl_write(s); + break; + default: if (size == 4) { vtd_set_long(s, addr, val); @@ -3835,7 +3940,6 @@ static const Property vtd_properties[] = { DEFINE_PROP_BOOL("snoop-control", IntelIOMMUState, snoop_control, false), DEFINE_PROP_BOOL("x-pasid-mode", IntelIOMMUState, pasid, false), DEFINE_PROP_BOOL("dma-drain", IntelIOMMUState, dma_drain, true), - DEFINE_PROP_BOOL("dma-translation", IntelIOMMUState, dma_translation, true), DEFINE_PROP_BOOL("stale-tm", IntelIOMMUState, stale_tm, false), DEFINE_PROP_BOOL("fs1gp", IntelIOMMUState, fs1gp, true), }; @@ -4378,6 +4482,12 @@ static bool vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn, assert(hiod); + if (!s->caching_mode) { + error_setg(errp, "Device assignment is not allowed without enabling " + "caching-mode=on for Intel IOMMU."); + return false; + } + vtd_iommu_lock(s); if (g_hash_table_lookup(s->vtd_host_iommu_dev, &key)) { @@ -4549,11 +4659,11 @@ static void vtd_cap_init(IntelIOMMUState *s) s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS | - VTD_CAP_MGAW(s->aw_bits); + VTD_CAP_ESRTPS | VTD_CAP_MGAW(s->aw_bits); if (s->dma_drain) { s->cap |= VTD_CAP_DRAIN; } - if (s->dma_translation) { + if (x86_iommu->dma_translation) { if (s->aw_bits >= VTD_HOST_AW_39BIT) { s->cap |= VTD_CAP_SAGAW_39bit; } @@ -4716,6 +4826,18 @@ static void vtd_init(IntelIOMMUState *s) * Interrupt remapping registers. */ vtd_define_quad(s, DMAR_IRTA_REG, 0, 0xfffffffffffff80fULL, 0); + + /* Page request registers */ + if (s->ecap & VTD_ECAP_PRS) { + vtd_define_quad(s, DMAR_PQH_REG, 0, 0x7ffe0ULL, 0); + vtd_define_quad(s, DMAR_PQT_REG, 0, 0x7ffe0ULL, 0); + vtd_define_quad(s, DMAR_PQA_REG, 0, 0xfffffffffffff007ULL, 0); + vtd_define_long(s, DMAR_PRS_REG, 0, 0, 0x3UL); + vtd_define_long(s, DMAR_PECTL_REG, 0, 0x80000000UL, 0); + vtd_define_long(s, DMAR_PEDATA_REG, 0, 0xffffUL, 0); + vtd_define_long(s, DMAR_PEADDR_REG, 0, 0xfffffffcUL, 0); + vtd_define_long(s, DMAR_PEUADDR_REG, 0, 0xffffffffUL, 0); + } } /* Should not reset address_spaces when reset because devices will still use @@ -4803,6 +4925,194 @@ static ssize_t vtd_ats_request_translation(PCIBus *bus, void *opaque, return res_index; } +/* 11.4.11.3 : The number of entries in the page request queue is 2^(PQS + 7) */ +static inline uint64_t vtd_prq_size(IntelIOMMUState *s) +{ + return 1ULL << ((vtd_get_quad(s, DMAR_PQA_REG) & VTD_PQA_SIZE) + 7); +} + +/** + * Return true if the bit is accessible and correctly set, false otherwise + */ +static bool vtd_check_pre_bit(VTDAddressSpace *vtd_as, hwaddr addr, + uint16_t sid, bool is_write) +{ + int ret; + IntelIOMMUState *s = vtd_as->iommu_state; + uint8_t bus_n = pci_bus_num(vtd_as->bus); + VTDContextEntry ce; + bool is_fpd_set = false; + + ret = vtd_dev_to_context_entry(s, bus_n, vtd_as->devfn, &ce); + + if (ret) { + goto error_report; + } + + if (!VTD_CE_GET_PRE(&ce)) { + ret = -VTD_FR_SM_PRE_ABS; + goto error_get_fpd_and_report; + } + + return true; + +error_get_fpd_and_report: + /* Try to get fpd (may not work but we are already on an error path) */ + is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD; + vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set, vtd_as->pasid); +error_report: + vtd_report_fault(s, -ret, is_fpd_set, sid, addr, is_write, + vtd_as->pasid != PCI_NO_PASID, vtd_as->pasid); + return false; +} + +/* Logic described in section 7.5 */ +static void vtd_generate_page_request_event(IntelIOMMUState *s, + uint32_t old_pr_status) +{ + uint32_t current_pectl = vtd_get_long(s, DMAR_PECTL_REG); + /* + * Hardware evaluates PPR and PRO fields in the Page Request Status Register + * and if any of them is set, Page Request Event is not generated + */ + if (old_pr_status & (VTD_PR_STATUS_PRO | VTD_PR_STATUS_PPR)) { + return; + } + + vtd_set_clear_mask_long(s, DMAR_PECTL_REG, 0, VTD_PR_PECTL_IP); + if (!(current_pectl & VTD_PR_PECTL_IM)) { + vtd_set_clear_mask_long(s, DMAR_PECTL_REG, VTD_PR_PECTL_IP, 0); + vtd_generate_interrupt(s, DMAR_PEADDR_REG, DMAR_PEDATA_REG); + } +} + +/* When calling this function, we known that we are in scalable mode */ +static int vtd_pri_perform_implicit_invalidation(VTDAddressSpace *vtd_as, + hwaddr addr) +{ + IntelIOMMUState *s = vtd_as->iommu_state; + VTDContextEntry ce; + VTDPASIDEntry pe; + uint16_t pgtt; + uint16_t domain_id; + int ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), + vtd_as->devfn, &ce); + if (ret) { + return -EINVAL; + } + ret = vtd_ce_get_rid2pasid_entry(s, &ce, &pe, vtd_as->pasid); + if (ret) { + return -EINVAL; + } + pgtt = VTD_PE_GET_TYPE(&pe); + domain_id = VTD_SM_PASID_ENTRY_DID(pe.val[1]); + ret = 0; + switch (pgtt) { + case VTD_SM_PASID_ENTRY_FLT: + vtd_piotlb_page_invalidate(s, domain_id, vtd_as->pasid, addr, 0); + break; + /* Room for other pgtt values */ + default: + error_report_once("Translation type not supported yet : %d", pgtt); + ret = -EINVAL; + break; + } + + return ret; +} + +/* Page Request Descriptor : 7.4.1.1 */ +static int vtd_pri_request_page(PCIBus *bus, void *opaque, int devfn, + uint32_t pasid, bool priv_req, bool exec_req, + hwaddr addr, bool lpig, uint16_t prgi, + bool is_read, bool is_write) +{ + IntelIOMMUState *s = opaque; + VTDAddressSpace *vtd_as; + + vtd_as = vtd_find_add_as(s, bus, devfn, pasid); + + uint64_t queue_addr_reg = vtd_get_quad(s, DMAR_PQA_REG); + uint64_t queue_tail_offset_reg = vtd_get_quad(s, DMAR_PQT_REG); + uint64_t new_queue_tail_offset = ( + (queue_tail_offset_reg + VTD_PQA_ENTRY_SIZE) % + (vtd_prq_size(s) * VTD_PQA_ENTRY_SIZE)); + uint64_t queue_head_offset_reg = vtd_get_quad(s, DMAR_PQH_REG); + hwaddr queue_tail = (queue_addr_reg & VTD_PQA_ADDR) + queue_tail_offset_reg; + uint32_t old_pr_status = vtd_get_long(s, DMAR_PRS_REG); + uint16_t sid = PCI_BUILD_BDF(pci_bus_num(vtd_as->bus), vtd_as->devfn); + VTDPRDesc desc; + + if (!(s->ecap & VTD_ECAP_PRS)) { + return -EPERM; + } + + /* + * No need to check if scalable mode is enabled as we already known that + * VTD_ECAP_PRS is set (see vtd_decide_config) + */ + + /* We do not support PRI without PASID */ + if (vtd_as->pasid == PCI_NO_PASID) { + return -EPERM; + } + if (exec_req && !is_read) { + return -EINVAL; + } + + /* Check PRE bit in the scalable mode context entry */ + if (!vtd_check_pre_bit(vtd_as, addr, sid, is_write)) { + return -EPERM; + } + + if (old_pr_status & VTD_PR_STATUS_PRO) { + /* + * No action is taken by hardware to report a fault + * or generate an event + */ + return -ENOSPC; + } + + /* Check for overflow */ + if (new_queue_tail_offset == queue_head_offset_reg) { + vtd_set_clear_mask_long(s, DMAR_PRS_REG, 0, VTD_PR_STATUS_PRO); + vtd_generate_page_request_event(s, old_pr_status); + return -ENOSPC; + } + + if (vtd_pri_perform_implicit_invalidation(vtd_as, addr)) { + return -EINVAL; + } + + desc.lo = VTD_PRD_TYPE | VTD_PRD_PP(true) | VTD_PRD_RID(sid) | + VTD_PRD_PASID(vtd_as->pasid) | VTD_PRD_PMR(priv_req); + desc.hi = VTD_PRD_RDR(is_read) | VTD_PRD_WRR(is_write) | + VTD_PRD_LPIG(lpig) | VTD_PRD_PRGI(prgi) | VTD_PRD_ADDR(addr); + + desc.lo = cpu_to_le64(desc.lo); + desc.hi = cpu_to_le64(desc.hi); + if (dma_memory_write(&address_space_memory, queue_tail, &desc, sizeof(desc), + MEMTXATTRS_UNSPECIFIED)) { + error_report_once("IO error, the PQ tail cannot be updated"); + return -EIO; + } + + /* increment the tail register and set the pending request bit */ + vtd_set_quad(s, DMAR_PQT_REG, new_queue_tail_offset); + /* + * read status again so that the kernel does not miss a request. + * in some cases, we can trigger an unecessary interrupt but this strategy + * drastically improves performance as we don't need to take a lock. + */ + old_pr_status = vtd_get_long(s, DMAR_PRS_REG); + if (!(old_pr_status & VTD_PR_STATUS_PPR)) { + vtd_set_clear_mask_long(s, DMAR_PRS_REG, 0, VTD_PR_STATUS_PPR); + vtd_generate_page_request_event(s, old_pr_status); + } + + return 0; +} + static void vtd_init_iotlb_notifier(PCIBus *bus, void *opaque, int devfn, IOMMUNotifier *n, IOMMUNotify fn, void *user_opaque) @@ -4844,6 +5154,26 @@ static void vtd_unregister_iotlb_notifier(PCIBus *bus, void *opaque, memory_region_unregister_iommu_notifier(MEMORY_REGION(&vtd_as->iommu), n); } +static void vtd_pri_register_notifier(PCIBus *bus, void *opaque, int devfn, + uint32_t pasid, IOMMUPRINotifier *notifier) +{ + IntelIOMMUState *s = opaque; + VTDAddressSpace *vtd_as; + + vtd_as = vtd_find_add_as(s, bus, devfn, pasid); + vtd_as->pri_notifier = notifier; +} + +static void vtd_pri_unregister_notifier(PCIBus *bus, void *opaque, + int devfn, uint32_t pasid) +{ + IntelIOMMUState *s = opaque; + VTDAddressSpace *vtd_as; + + vtd_as = vtd_find_add_as(s, bus, devfn, pasid); + vtd_as->pri_notifier = NULL; +} + static PCIIOMMUOps vtd_iommu_ops = { .get_address_space = vtd_host_dma_iommu, .set_iommu_device = vtd_dev_set_iommu_device, @@ -4853,6 +5183,9 @@ static PCIIOMMUOps vtd_iommu_ops = { .register_iotlb_notifier = vtd_register_iotlb_notifier, .unregister_iotlb_notifier = vtd_unregister_iotlb_notifier, .ats_request_translation = vtd_ats_request_translation, + .pri_register_notifier = vtd_pri_register_notifier, + .pri_unregister_notifier = vtd_pri_unregister_notifier, + .pri_request_page = vtd_pri_request_page, }; static bool vtd_decide_config(IntelIOMMUState *s, Error **errp) @@ -4910,32 +5243,6 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error **errp) return true; } -static int vtd_machine_done_notify_one(Object *child, void *unused) -{ - IntelIOMMUState *iommu = INTEL_IOMMU_DEVICE(x86_iommu_get_default()); - - /* - * We hard-coded here because vfio-pci is the only special case - * here. Let's be more elegant in the future when we can, but so - * far there seems to be no better way. - */ - if (object_dynamic_cast(child, "vfio-pci") && !iommu->caching_mode) { - vtd_panic_require_caching_mode(); - } - - return 0; -} - -static void vtd_machine_done_hook(Notifier *notifier, void *unused) -{ - object_child_foreach_recursive(object_get_root(), - vtd_machine_done_notify_one, NULL); -} - -static Notifier vtd_machine_done_notify = { - .notify = vtd_machine_done_hook, -}; - static void vtd_realize(DeviceState *dev, Error **errp) { MachineState *ms = MACHINE(qdev_get_machine()); @@ -4990,7 +5297,6 @@ static void vtd_realize(DeviceState *dev, Error **errp) pci_setup_iommu(bus, &vtd_iommu_ops, dev); /* Pseudo address space under root PCI bus. */ x86ms->ioapic_as = vtd_host_dma_iommu(bus, s, Q35_PSEUDO_DEVFN_IOAPIC); - qemu_add_machine_init_done_notifier(&vtd_machine_done_notify); } static void vtd_class_init(ObjectClass *klass, const void *data) diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h index 360e937989..0f6a1237e4 100644 --- a/hw/i386/intel_iommu_internal.h +++ b/hw/i386/intel_iommu_internal.h @@ -190,6 +190,7 @@ #define VTD_ECAP_EIM (1ULL << 4) #define VTD_ECAP_PT (1ULL << 6) #define VTD_ECAP_SC (1ULL << 7) +#define VTD_ECAP_PRS (1ULL << 29) #define VTD_ECAP_MHMV (15ULL << 20) #define VTD_ECAP_SRS (1ULL << 31) #define VTD_ECAP_PSS (7ULL << 35) /* limit: MemTxAttrs::pid */ @@ -214,6 +215,7 @@ #define VTD_CAP_DRAIN_WRITE (1ULL << 54) #define VTD_CAP_DRAIN_READ (1ULL << 55) #define VTD_CAP_FS1GP (1ULL << 56) +#define VTD_CAP_ESRTPS (1ULL << 63) #define VTD_CAP_DRAIN (VTD_CAP_DRAIN_READ | VTD_CAP_DRAIN_WRITE) #define VTD_CAP_CM (1ULL << 7) #define VTD_PASID_ID_SHIFT 20 @@ -314,6 +316,8 @@ typedef enum VTDFaultReason { * request while disabled */ VTD_FR_IR_SID_ERR = 0x26, /* Invalid Source-ID */ + VTD_FR_SM_PRE_ABS = 0x47, /* SCT.8 : PRE bit in a present SM CE is 0 */ + /* PASID directory entry access failure */ VTD_FR_PASID_DIR_ACCESS_ERR = 0x50, /* The Present(P) field of pasid directory entry is 0 */ @@ -376,6 +380,18 @@ union VTDInvDesc { }; typedef union VTDInvDesc VTDInvDesc; +/* Page Request Descriptor */ +union VTDPRDesc { + struct { + uint64_t lo; + uint64_t hi; + }; + struct { + uint64_t val[4]; + }; +}; +typedef union VTDPRDesc VTDPRDesc; + /* Masks for struct VTDInvDesc */ #define VTD_INV_DESC_ALL_ONE -1ULL #define VTD_INV_DESC_TYPE(val) ((((val) >> 5) & 0x70ULL) | \ @@ -389,6 +405,7 @@ typedef union VTDInvDesc VTDInvDesc; #define VTD_INV_DESC_PIOTLB 0x6 /* PASID-IOTLB Invalidate Desc */ #define VTD_INV_DESC_PC 0x7 /* PASID-cache Invalidate Desc */ #define VTD_INV_DESC_DEV_PIOTLB 0x8 /* PASID-based-DIOTLB inv_desc*/ +#define VTD_INV_DESC_PGRESP 0x9 /* Page Group Response Desc */ #define VTD_INV_DESC_NONE 0 /* Not an Invalidate Descriptor */ /* Masks for Invalidation Wait Descriptor*/ @@ -440,6 +457,15 @@ typedef union VTDInvDesc VTDInvDesc; #define VTD_INV_DESC_PASID_DEVICE_IOTLB_RSVD_VAL0 0xfff000000000f000ULL #define VTD_INV_DESC_PASID_DEVICE_IOTLB_RSVD_VAL1 0x7feULL +/* Mask for Page Group Response Descriptor */ +#define VTD_INV_DESC_PGRESP_RSVD_HI 0xfffffffffffff003ULL +#define VTD_INV_DESC_PGRESP_RSVD_LO 0xfff00000000001e0ULL +#define VTD_INV_DESC_PGRESP_PP(val) (((val) >> 4) & 0x1ULL) +#define VTD_INV_DESC_PGRESP_RC(val) (((val) >> 12) & 0xfULL) +#define VTD_INV_DESC_PGRESP_RID(val) (((val) >> 16) & 0xffffULL) +#define VTD_INV_DESC_PGRESP_PASID(val) (((val) >> 32) & 0xfffffULL) +#define VTD_INV_DESC_PGRESP_PRGI(val) (((val) >> 3) & 0x1ffULL) + /* Rsvd field masks for spte */ #define VTD_SPTE_SNP 0x800ULL @@ -491,6 +517,31 @@ typedef union VTDInvDesc VTDInvDesc; #define VTD_INV_DESC_PIOTLB_RSVD_VAL0 0xfff000000000f1c0ULL #define VTD_INV_DESC_PIOTLB_RSVD_VAL1 0xf80ULL +/* Page Request Descriptor */ +/* For the low 64-bit of 128-bit */ +#define VTD_PRD_TYPE (1ULL) +#define VTD_PRD_PP(val) (((val) & 1ULL) << 8) +#define VTD_PRD_RID(val) (((val) & 0xffffULL) << 16) +#define VTD_PRD_PASID(val) (((val) & 0xfffffULL) << 32) +#define VTD_PRD_EXR(val) (((val) & 1ULL) << 52) +#define VTD_PRD_PMR(val) (((val) & 1ULL) << 53) +/* For the high 64-bit of 128-bit */ +#define VTD_PRD_RDR(val) ((val) & 1ULL) +#define VTD_PRD_WRR(val) (((val) & 1ULL) << 1) +#define VTD_PRD_LPIG(val) (((val) & 1ULL) << 2) +#define VTD_PRD_PRGI(val) (((val) & 0x1ffULL) << 3) +#define VTD_PRD_ADDR(val) ((val) & 0xfffffffffffff000ULL) + +/* Page Request Queue constants */ +#define VTD_PQA_ENTRY_SIZE 32 /* Size of an entry in bytes */ +/* Page Request Queue masks */ +#define VTD_PQA_ADDR 0xfffffffffffff000ULL /* PR queue address */ +#define VTD_PQA_SIZE 0x7ULL /* PR queue size */ +#define VTD_PR_STATUS_PPR 1UL /* Pending page request */ +#define VTD_PR_STATUS_PRO 2UL /* Page request overflow */ +#define VTD_PR_PECTL_IP 0x40000000UL /* PR control interrup pending */ +#define VTD_PR_PECTL_IM 0x80000000UL /* PR control interrup mask */ + /* Information about page-selective IOTLB invalidate */ struct VTDIOTLBPageInvInfo { uint16_t domain_id; @@ -550,6 +601,7 @@ typedef struct VTDRootEntry VTDRootEntry; #define VTD_SM_CONTEXT_ENTRY_RID2PASID_MASK 0xfffff #define VTD_SM_CONTEXT_ENTRY_RSVD_VAL0(aw) (0x1e0ULL | ~VTD_HAW_MASK(aw)) #define VTD_SM_CONTEXT_ENTRY_RSVD_VAL1 0xffffffffffe00000ULL +#define VTD_SM_CONTEXT_ENTRY_PRE 0x10ULL /* PASID Table Related Definitions */ #define VTD_PASID_DIR_BASE_ADDR_MASK (~0xfffULL) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index bc048a6d13..34b00663f2 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -837,6 +837,7 @@ void pc_memory_init(PCMachineState *pcms, hwaddr maxphysaddr, maxusedaddr; hwaddr cxl_base, cxl_resv_end = 0; X86CPU *cpu = X86_CPU(first_cpu); + uint64_t res_mem_end; assert(machine->ram_size == x86ms->below_4g_mem_size + x86ms->above_4g_mem_size); @@ -978,16 +979,17 @@ void pc_memory_init(PCMachineState *pcms, rom_set_fw(fw_cfg); - if (machine->device_memory) { - uint64_t *val = g_malloc(sizeof(*val)); - uint64_t res_mem_end; + if (pcms->cxl_devices_state.is_enabled) { + res_mem_end = cxl_resv_end; + } else if (machine->device_memory) { + res_mem_end = machine->device_memory->base + + memory_region_size(&machine->device_memory->mr); + } else { + res_mem_end = 0; + } - if (pcms->cxl_devices_state.is_enabled) { - res_mem_end = cxl_resv_end; - } else { - res_mem_end = machine->device_memory->base - + memory_region_size(&machine->device_memory->mr); - } + if (res_mem_end) { + uint64_t *val = g_malloc(sizeof(*val)); *val = cpu_to_le64(ROUND_UP(res_mem_end, 1 * GiB)); fw_cfg_add_file(fw_cfg, "etc/reserved-memory-end", val, sizeof(*val)); } @@ -1720,25 +1722,6 @@ static void pc_machine_wakeup(MachineState *machine) cpu_synchronize_all_post_reset(); } -static bool pc_hotplug_allowed(MachineState *ms, DeviceState *dev, Error **errp) -{ - X86IOMMUState *iommu = x86_iommu_get_default(); - IntelIOMMUState *intel_iommu; - - if (iommu && - object_dynamic_cast((Object *)iommu, TYPE_INTEL_IOMMU_DEVICE) && - object_dynamic_cast((Object *)dev, "vfio-pci")) { - intel_iommu = INTEL_IOMMU_DEVICE(iommu); - if (!intel_iommu->caching_mode) { - error_setg(errp, "Device assignment is not allowed without " - "enabling caching-mode=on for Intel IOMMU."); - return false; - } - } - - return true; -} - static void pc_machine_class_init(ObjectClass *oc, const void *data) { MachineClass *mc = MACHINE_CLASS(oc); @@ -1758,7 +1741,6 @@ static void pc_machine_class_init(ObjectClass *oc, const void *data) x86mc->apic_xrupt_override = true; assert(!mc->get_hotplug_handler); mc->get_hotplug_handler = pc_get_hotplug_handler; - mc->hotplug_allowed = pc_hotplug_allowed; mc->auto_enable_numa_with_memhp = true; mc->auto_enable_numa_with_memdev = true; mc->has_hotpluggable_cpus = true; diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index caf8bab68e..7b3611e973 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -448,6 +448,7 @@ DEFINE_I440FX_MACHINE_AS_LATEST(10, 2); static void pc_i440fx_machine_10_1_options(MachineClass *m) { pc_i440fx_machine_10_2_options(m); + m->smbios_memory_device_size = 2047 * TiB; compat_props_add(m->compat_props, hw_compat_10_1, hw_compat_10_1_len); compat_props_add(m->compat_props, pc_compat_10_1, pc_compat_10_1_len); } diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index e89951285e..6015e639d7 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -384,6 +384,7 @@ DEFINE_Q35_MACHINE_AS_LATEST(10, 2); static void pc_q35_machine_10_1_options(MachineClass *m) { pc_q35_machine_10_2_options(m); + m->smbios_memory_device_size = 2047 * TiB; compat_props_add(m->compat_props, hw_compat_10_1, hw_compat_10_1_len); compat_props_add(m->compat_props, pc_compat_10_1, pc_compat_10_1_len); } diff --git a/hw/i386/x86-iommu.c b/hw/i386/x86-iommu.c index d34a6849f4..c127a44bb4 100644 --- a/hw/i386/x86-iommu.c +++ b/hw/i386/x86-iommu.c @@ -130,6 +130,7 @@ static const Property x86_iommu_properties[] = { intr_supported, ON_OFF_AUTO_AUTO), DEFINE_PROP_BOOL("device-iotlb", X86IOMMUState, dt_supported, false), DEFINE_PROP_BOOL("pt", X86IOMMUState, pt_supported, true), + DEFINE_PROP_BOOL("dma-translation", X86IOMMUState, dma_translation, true), }; static void x86_iommu_class_init(ObjectClass *klass, const void *data) diff --git a/hw/isa/lpc_ich9.c b/hw/isa/lpc_ich9.c index 304dffac32..c9cb8f7779 100644 --- a/hw/isa/lpc_ich9.c +++ b/hw/isa/lpc_ich9.c @@ -132,6 +132,11 @@ static void ich9_cc_init(ICH9LPCState *lpc) static void ich9_cc_reset(ICH9LPCState *lpc) { uint8_t *c = lpc->chip_config; + uint32_t gcs = ICH9_CC_GCS_DEFAULT; + + if (lpc->pin_strap.spkr_hi) { + gcs |= ICH9_CC_GCS_NO_REBOOT; + } memset(lpc->chip_config, 0, sizeof(lpc->chip_config)); @@ -142,7 +147,7 @@ static void ich9_cc_reset(ICH9LPCState *lpc) pci_set_long(c + ICH9_CC_D27IR, ICH9_CC_DIR_DEFAULT); pci_set_long(c + ICH9_CC_D26IR, ICH9_CC_DIR_DEFAULT); pci_set_long(c + ICH9_CC_D25IR, ICH9_CC_DIR_DEFAULT); - pci_set_long(c + ICH9_CC_GCS, ICH9_CC_GCS_DEFAULT); + pci_set_long(c + ICH9_CC_GCS, gcs); ich9_cc_update(lpc); } diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c index 06657bb3ac..8fef598b49 100644 --- a/hw/net/e1000e_core.c +++ b/hw/net/e1000e_core.c @@ -2822,8 +2822,9 @@ e1000e_update_rx_offloads(E1000ECore *core) trace_e1000e_rx_set_cso(cso_state); if (core->has_vnet) { - qemu_set_offload(qemu_get_queue(core->owner_nic)->peer, - cso_state, 0, 0, 0, 0, 0, 0); + NetOffloads ol = { .csum = cso_state }; + + qemu_set_offload(qemu_get_queue(core->owner_nic)->peer, &ol); } } diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c index 39e3ce1c8f..45d8fd795b 100644 --- a/hw/net/igb_core.c +++ b/hw/net/igb_core.c @@ -3058,8 +3058,9 @@ igb_update_rx_offloads(IGBCore *core) trace_e1000e_rx_set_cso(cso_state); if (core->has_vnet) { - qemu_set_offload(qemu_get_queue(core->owner_nic)->peer, - cso_state, 0, 0, 0, 0, 0, 0); + NetOffloads ol = {.csum = cso_state }; + + qemu_set_offload(qemu_get_queue(core->owner_nic)->peer, &ol); } } diff --git a/hw/net/igbvf.c b/hw/net/igbvf.c index 31d72c4977..9b0db8f841 100644 --- a/hw/net/igbvf.c +++ b/hw/net/igbvf.c @@ -251,10 +251,12 @@ static void igbvf_pci_realize(PCIDevice *dev, Error **errp) memory_region_init_io(&s->mmio, OBJECT(dev), &mmio_ops, s, "igbvf-mmio", IGBVF_MMIO_SIZE); - pcie_sriov_vf_register_bar(dev, IGBVF_MMIO_BAR_IDX, &s->mmio); + pci_register_bar(dev, IGBVF_MMIO_BAR_IDX, PCI_BASE_ADDRESS_MEM_TYPE_64 | + PCI_BASE_ADDRESS_MEM_PREFETCH, &s->mmio); memory_region_init(&s->msix, OBJECT(dev), "igbvf-msix", IGBVF_MSIX_SIZE); - pcie_sriov_vf_register_bar(dev, IGBVF_MSIX_BAR_IDX, &s->msix); + pci_register_bar(dev, IGBVF_MSIX_BAR_IDX, PCI_BASE_ADDRESS_MEM_TYPE_64 | + PCI_BASE_ADDRESS_MEM_PREFETCH, &s->msix); ret = msix_init(dev, IGBVF_MSIX_VEC_NUM, &s->msix, IGBVF_MSIX_BAR_IDX, 0, &s->msix, IGBVF_MSIX_BAR_IDX, 0x2000, 0x70, errp); diff --git a/hw/net/vhost_net-stub.c b/hw/net/vhost_net-stub.c index 7d49f82906..0740d5a2eb 100644 --- a/hw/net/vhost_net-stub.c +++ b/hw/net/vhost_net-stub.c @@ -46,9 +46,8 @@ void vhost_net_cleanup(struct vhost_net *net) { } -uint64_t vhost_net_get_features(struct vhost_net *net, uint64_t features) +void vhost_net_get_features_ex(struct vhost_net *net, uint64_t *features) { - return features; } int vhost_net_get_config(struct vhost_net *net, uint8_t *config, @@ -62,13 +61,12 @@ int vhost_net_set_config(struct vhost_net *net, const uint8_t *data, return 0; } -void vhost_net_ack_features(struct vhost_net *net, uint64_t features) +void vhost_net_ack_features_ex(struct vhost_net *net, const uint64_t *features) { } -uint64_t vhost_net_get_acked_features(VHostNetState *net) +void vhost_net_get_acked_features_ex(VHostNetState *net, uint64_t *features) { - return 0; } bool vhost_net_virtqueue_pending(VHostNetState *net, int idx) diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c index 540492b37d..a8ee18a912 100644 --- a/hw/net/vhost_net.c +++ b/hw/net/vhost_net.c @@ -35,10 +35,9 @@ #include "hw/virtio/virtio-bus.h" #include "linux-headers/linux/vhost.h" -uint64_t vhost_net_get_features(struct vhost_net *net, uint64_t features) +void vhost_net_get_features_ex(struct vhost_net *net, uint64_t *features) { - return vhost_get_features(&net->dev, net->feature_bits, - features); + vhost_get_features_ex(&net->dev, net->feature_bits, features); } int vhost_net_get_config(struct vhost_net *net, uint8_t *config, uint32_t config_len) @@ -51,10 +50,11 @@ int vhost_net_set_config(struct vhost_net *net, const uint8_t *data, return vhost_dev_set_config(&net->dev, data, offset, size, flags); } -void vhost_net_ack_features(struct vhost_net *net, uint64_t features) +void vhost_net_ack_features_ex(struct vhost_net *net, const uint64_t *features) { - net->dev.acked_features = net->dev.backend_features; - vhost_ack_features(&net->dev, net->feature_bits, features); + virtio_features_copy(net->dev.acked_features_ex, + net->dev.backend_features_ex); + vhost_ack_features_ex(&net->dev, net->feature_bits, features); } uint64_t vhost_net_get_max_queues(VHostNetState *net) @@ -62,9 +62,9 @@ uint64_t vhost_net_get_max_queues(VHostNetState *net) return net->dev.max_queues; } -uint64_t vhost_net_get_acked_features(VHostNetState *net) +void vhost_net_get_acked_features_ex(VHostNetState *net, uint64_t *features) { - return net->dev.acked_features; + virtio_features_copy(features, net->dev.acked_features_ex); } void vhost_net_save_acked_features(NetClientState *nc) @@ -234,7 +234,8 @@ struct vhost_net *vhost_net_init(VhostNetOptions *options) int r; bool backend_kernel = options->backend_type == VHOST_BACKEND_TYPE_KERNEL; struct vhost_net *net = g_new0(struct vhost_net, 1); - uint64_t features = 0; + uint64_t missing_features[VIRTIO_FEATURES_NU64S]; + uint64_t features[VIRTIO_FEATURES_NU64S]; Error *local_err = NULL; if (!options->net_backend) { @@ -247,6 +248,7 @@ struct vhost_net *vhost_net_init(VhostNetOptions *options) net->save_acked_features = options->save_acked_features; net->max_tx_queue_size = options->max_tx_queue_size; net->is_vhost_user = options->is_vhost_user; + virtio_features_clear(features); net->dev.max_queues = 1; net->dev.vqs = net->vqs; @@ -261,7 +263,7 @@ struct vhost_net *vhost_net_init(VhostNetOptions *options) net->backend = r; net->dev.protocol_features = 0; } else { - net->dev.backend_features = 0; + virtio_features_clear(net->dev.backend_features_ex); net->dev.protocol_features = 0; net->backend = -1; @@ -281,26 +283,29 @@ struct vhost_net *vhost_net_init(VhostNetOptions *options) sizeof(struct virtio_net_hdr_mrg_rxbuf))) { net->dev.features &= ~(1ULL << VIRTIO_NET_F_MRG_RXBUF); } - if (~net->dev.features & net->dev.backend_features) { - fprintf(stderr, "vhost lacks feature mask 0x%" PRIx64 - " for backend\n", - (uint64_t)(~net->dev.features & net->dev.backend_features)); + + if (virtio_features_andnot(missing_features, + net->dev.backend_features_ex, + net->dev.features_ex)) { + fprintf(stderr, "vhost lacks feature mask 0x" VIRTIO_FEATURES_FMT + " for backend\n", VIRTIO_FEATURES_PR(missing_features)); goto fail; } } /* Set sane init value. Override when guest acks. */ if (options->get_acked_features) { - features = options->get_acked_features(net->nc); - if (~net->dev.features & features) { - fprintf(stderr, "vhost lacks feature mask 0x%" PRIx64 - " for backend\n", - (uint64_t)(~net->dev.features & features)); + virtio_features_from_u64(features, + options->get_acked_features(net->nc)); + if (virtio_features_andnot(missing_features, features, + net->dev.features_ex)) { + fprintf(stderr, "vhost lacks feature mask 0x" VIRTIO_FEATURES_FMT + " for backend\n", VIRTIO_FEATURES_PR(missing_features)); goto fail; } } - vhost_net_ack_features(net, features); + vhost_net_ack_features_ex(net, features); return net; diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 7848e26278..33116712eb 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -90,6 +90,25 @@ VIRTIO_NET_RSS_HASH_TYPE_TCP_EX | \ VIRTIO_NET_RSS_HASH_TYPE_UDP_EX) +/* + * Features starting from VIRTIO_NET_FEATURES_MAP_MIN bit correspond + * to guest offloads in the VIRTIO_NET_OFFLOAD_MAP range + */ +#define VIRTIO_NET_OFFLOAD_MAP_MIN 46 +#define VIRTIO_NET_OFFLOAD_MAP_LENGTH 4 +#define VIRTIO_NET_OFFLOAD_MAP MAKE_64BIT_MASK( \ + VIRTIO_NET_OFFLOAD_MAP_MIN, \ + VIRTIO_NET_OFFLOAD_MAP_LENGTH) +#define VIRTIO_NET_FEATURES_MAP_MIN 65 +#define VIRTIO_NET_F2O_SHIFT (VIRTIO_NET_OFFLOAD_MAP_MIN - \ + VIRTIO_NET_FEATURES_MAP_MIN + 64) + +static bool virtio_has_tunnel_hdr(const uint64_t *features) +{ + return virtio_has_feature_ex(features, VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO) || + virtio_has_feature_ex(features, VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO); +} + static const VirtIOFeature feature_sizes[] = { {.flags = 1ULL << VIRTIO_NET_F_MAC, .end = endof(struct virtio_net_config, mac)}, @@ -636,8 +655,18 @@ static int peer_has_uso(VirtIONet *n) return qemu_has_uso(qemu_get_queue(n->nic)->peer); } +static bool peer_has_tunnel(VirtIONet *n) +{ + if (!peer_has_vnet_hdr(n)) { + return false; + } + + return qemu_has_tunnel(qemu_get_queue(n->nic)->peer); +} + static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int mergeable_rx_bufs, - int version_1, int hash_report) + int version_1, int hash_report, + int tunnel) { int i; NetClientState *nc; @@ -645,9 +674,11 @@ static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int mergeable_rx_bufs, n->mergeable_rx_bufs = mergeable_rx_bufs; if (version_1) { - n->guest_hdr_len = hash_report ? - sizeof(struct virtio_net_hdr_v1_hash) : - sizeof(struct virtio_net_hdr_mrg_rxbuf); + n->guest_hdr_len = tunnel ? + sizeof(struct virtio_net_hdr_v1_hash_tunnel) : + (hash_report ? + sizeof(struct virtio_net_hdr_v1_hash) : + sizeof(struct virtio_net_hdr_mrg_rxbuf)); n->rss_data.populate_hash = !!hash_report; } else { n->guest_hdr_len = n->mergeable_rx_bufs ? @@ -773,17 +804,31 @@ static uint64_t virtio_net_bad_features(VirtIODevice *vdev) static void virtio_net_apply_guest_offloads(VirtIONet *n) { - qemu_set_offload(qemu_get_queue(n->nic)->peer, - !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_CSUM)), - !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_TSO4)), - !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_TSO6)), - !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_ECN)), - !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_UFO)), - !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO4)), - !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO6))); + NetOffloads ol = { + .csum = !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_CSUM)), + .tso4 = !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_TSO4)), + .tso6 = !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_TSO6)), + .ecn = !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_ECN)), + .ufo = !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_UFO)), + .uso4 = !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO4)), + .uso6 = !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO6)), + .tnl = !!(n->curr_guest_offloads & + (1ULL << VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_MAPPED)), + .tnl_csum = !!(n->curr_guest_offloads & + (1ULL << VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_CSUM_MAPPED)), + }; + + qemu_set_offload(qemu_get_queue(n->nic)->peer, &ol); +} + +static uint64_t virtio_net_features_to_offload(const uint64_t *features) +{ + return (features[0] & ~VIRTIO_NET_OFFLOAD_MAP) | + ((features[1] << VIRTIO_NET_F2O_SHIFT) & VIRTIO_NET_OFFLOAD_MAP); } -static uint64_t virtio_net_guest_offloads_by_features(uint64_t features) +static uint64_t +virtio_net_guest_offloads_by_features(const uint64_t *features) { static const uint64_t guest_offloads_mask = (1ULL << VIRTIO_NET_F_GUEST_CSUM) | @@ -792,15 +837,17 @@ static uint64_t virtio_net_guest_offloads_by_features(uint64_t features) (1ULL << VIRTIO_NET_F_GUEST_ECN) | (1ULL << VIRTIO_NET_F_GUEST_UFO) | (1ULL << VIRTIO_NET_F_GUEST_USO4) | - (1ULL << VIRTIO_NET_F_GUEST_USO6); + (1ULL << VIRTIO_NET_F_GUEST_USO6) | + (1ULL << VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_MAPPED) | + (1ULL << VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_CSUM_MAPPED); - return guest_offloads_mask & features; + return guest_offloads_mask & virtio_net_features_to_offload(features); } uint64_t virtio_net_supported_guest_offloads(const VirtIONet *n) { VirtIODevice *vdev = VIRTIO_DEVICE(n); - return virtio_net_guest_offloads_by_features(vdev->guest_features); + return virtio_net_guest_offloads_by_features(vdev->guest_features_ex); } typedef struct { @@ -879,34 +926,40 @@ static void failover_add_primary(VirtIONet *n, Error **errp) error_propagate(errp, err); } -static void virtio_net_set_features(VirtIODevice *vdev, uint64_t features) +static void virtio_net_set_features(VirtIODevice *vdev, + const uint64_t *in_features) { + uint64_t features[VIRTIO_FEATURES_NU64S]; VirtIONet *n = VIRTIO_NET(vdev); Error *err = NULL; int i; + virtio_features_copy(features, in_features); if (n->mtu_bypass_backend && !virtio_has_feature(vdev->backend_features, VIRTIO_NET_F_MTU)) { - features &= ~(1ULL << VIRTIO_NET_F_MTU); + virtio_clear_feature_ex(features, VIRTIO_NET_F_MTU); } virtio_net_set_multiqueue(n, - virtio_has_feature(features, VIRTIO_NET_F_RSS) || - virtio_has_feature(features, VIRTIO_NET_F_MQ)); + virtio_has_feature_ex(features, + VIRTIO_NET_F_RSS) || + virtio_has_feature_ex(features, + VIRTIO_NET_F_MQ)); virtio_net_set_mrg_rx_bufs(n, - virtio_has_feature(features, + virtio_has_feature_ex(features, VIRTIO_NET_F_MRG_RXBUF), - virtio_has_feature(features, + virtio_has_feature_ex(features, VIRTIO_F_VERSION_1), - virtio_has_feature(features, - VIRTIO_NET_F_HASH_REPORT)); + virtio_has_feature_ex(features, + VIRTIO_NET_F_HASH_REPORT), + virtio_has_tunnel_hdr(features)); - n->rsc4_enabled = virtio_has_feature(features, VIRTIO_NET_F_RSC_EXT) && - virtio_has_feature(features, VIRTIO_NET_F_GUEST_TSO4); - n->rsc6_enabled = virtio_has_feature(features, VIRTIO_NET_F_RSC_EXT) && - virtio_has_feature(features, VIRTIO_NET_F_GUEST_TSO6); - n->rss_data.redirect = virtio_has_feature(features, VIRTIO_NET_F_RSS); + n->rsc4_enabled = virtio_has_feature_ex(features, VIRTIO_NET_F_RSC_EXT) && + virtio_has_feature_ex(features, VIRTIO_NET_F_GUEST_TSO4); + n->rsc6_enabled = virtio_has_feature_ex(features, VIRTIO_NET_F_RSC_EXT) && + virtio_has_feature_ex(features, VIRTIO_NET_F_GUEST_TSO6); + n->rss_data.redirect = virtio_has_feature_ex(features, VIRTIO_NET_F_RSS); if (n->has_vnet_hdr) { n->curr_guest_offloads = @@ -920,7 +973,7 @@ static void virtio_net_set_features(VirtIODevice *vdev, uint64_t features) if (!get_vhost_net(nc->peer)) { continue; } - vhost_net_ack_features(get_vhost_net(nc->peer), features); + vhost_net_ack_features_ex(get_vhost_net(nc->peer), features); /* * keep acked_features in NetVhostUserState up-to-date so it @@ -929,12 +982,14 @@ static void virtio_net_set_features(VirtIODevice *vdev, uint64_t features) vhost_net_save_acked_features(nc->peer); } - if (virtio_has_feature(vdev->guest_features ^ features, VIRTIO_NET_F_CTRL_VLAN)) { - bool vlan = virtio_has_feature(features, VIRTIO_NET_F_CTRL_VLAN); + if (virtio_has_feature_ex(features, VIRTIO_NET_F_CTRL_VLAN) != + virtio_has_feature_ex(vdev->guest_features_ex, + VIRTIO_NET_F_CTRL_VLAN)) { + bool vlan = virtio_has_feature_ex(features, VIRTIO_NET_F_CTRL_VLAN); memset(n->vlans, vlan ? 0 : 0xff, MAX_VLAN >> 3); } - if (virtio_has_feature(features, VIRTIO_NET_F_STANDBY)) { + if (virtio_has_feature_ex(features, VIRTIO_NET_F_STANDBY)) { qapi_event_send_failover_negotiated(n->netclient_name); qatomic_set(&n->failover_primary_hidden, false); failover_add_primary(n, &err); @@ -1905,10 +1960,10 @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf, virtio_error(vdev, "virtio-net unexpected empty queue: " "i %zd mergeable %d offset %zd, size %zd, " "guest hdr len %zd, host hdr len %zd " - "guest features 0x%" PRIx64, + "guest features 0x" VIRTIO_FEATURES_FMT, i, n->mergeable_rx_bufs, offset, size, n->guest_hdr_len, n->host_hdr_len, - vdev->guest_features); + VIRTIO_FEATURES_PR(vdev->guest_features_ex)); } err = -1; goto err; @@ -3015,8 +3070,8 @@ static int virtio_net_pre_load_queues(VirtIODevice *vdev, uint32_t n) return 0; } -static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features, - Error **errp) +static void virtio_net_get_features(VirtIODevice *vdev, uint64_t *features, + Error **errp) { VirtIONet *n = VIRTIO_NET(vdev); NetClientState *nc = qemu_get_queue(n->nic); @@ -3030,68 +3085,83 @@ static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features, (supported_hash_types & peer_hash_types) == supported_hash_types; /* Firstly sync all virtio-net possible supported features */ - features |= n->host_features; + virtio_features_or(features, features, n->host_features_ex); - virtio_add_feature(&features, VIRTIO_NET_F_MAC); + virtio_add_feature_ex(features, VIRTIO_NET_F_MAC); if (!peer_has_vnet_hdr(n)) { - virtio_clear_feature(&features, VIRTIO_NET_F_CSUM); - virtio_clear_feature(&features, VIRTIO_NET_F_HOST_TSO4); - virtio_clear_feature(&features, VIRTIO_NET_F_HOST_TSO6); - virtio_clear_feature(&features, VIRTIO_NET_F_HOST_ECN); + virtio_clear_feature_ex(features, VIRTIO_NET_F_CSUM); + virtio_clear_feature_ex(features, VIRTIO_NET_F_HOST_TSO4); + virtio_clear_feature_ex(features, VIRTIO_NET_F_HOST_TSO6); + virtio_clear_feature_ex(features, VIRTIO_NET_F_HOST_ECN); + + virtio_clear_feature_ex(features, VIRTIO_NET_F_GUEST_CSUM); + virtio_clear_feature_ex(features, VIRTIO_NET_F_GUEST_TSO4); + virtio_clear_feature_ex(features, VIRTIO_NET_F_GUEST_TSO6); + virtio_clear_feature_ex(features, VIRTIO_NET_F_GUEST_ECN); - virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_CSUM); - virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_TSO4); - virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_TSO6); - virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_ECN); + virtio_clear_feature_ex(features, VIRTIO_NET_F_HOST_USO); + virtio_clear_feature_ex(features, VIRTIO_NET_F_GUEST_USO4); + virtio_clear_feature_ex(features, VIRTIO_NET_F_GUEST_USO6); - virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO); - virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4); - virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6); + virtio_clear_feature_ex(features, VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO); + virtio_clear_feature_ex(features, VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO); + virtio_clear_feature_ex(features, + VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_CSUM); + virtio_clear_feature_ex(features, + VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO_CSUM); - virtio_clear_feature(&features, VIRTIO_NET_F_HASH_REPORT); + virtio_clear_feature_ex(features, VIRTIO_NET_F_HASH_REPORT); } if (!peer_has_vnet_hdr(n) || !peer_has_ufo(n)) { - virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_UFO); - virtio_clear_feature(&features, VIRTIO_NET_F_HOST_UFO); + virtio_clear_feature_ex(features, VIRTIO_NET_F_GUEST_UFO); + virtio_clear_feature_ex(features, VIRTIO_NET_F_HOST_UFO); } - if (!peer_has_uso(n)) { - virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO); - virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4); - virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6); + virtio_clear_feature_ex(features, VIRTIO_NET_F_HOST_USO); + virtio_clear_feature_ex(features, VIRTIO_NET_F_GUEST_USO4); + virtio_clear_feature_ex(features, VIRTIO_NET_F_GUEST_USO6); + } + + if (!peer_has_tunnel(n)) { + virtio_clear_feature_ex(features, VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO); + virtio_clear_feature_ex(features, VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO); + virtio_clear_feature_ex(features, + VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_CSUM); + virtio_clear_feature_ex(features, + VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO_CSUM); } if (!get_vhost_net(nc->peer)) { if (!use_own_hash) { - virtio_clear_feature(&features, VIRTIO_NET_F_HASH_REPORT); - virtio_clear_feature(&features, VIRTIO_NET_F_RSS); - } else if (virtio_has_feature(features, VIRTIO_NET_F_RSS)) { + virtio_clear_feature_ex(features, VIRTIO_NET_F_HASH_REPORT); + virtio_clear_feature_ex(features, VIRTIO_NET_F_RSS); + } else if (virtio_has_feature_ex(features, VIRTIO_NET_F_RSS)) { virtio_net_load_ebpf(n, errp); } - return features; + return; } if (!use_peer_hash) { - virtio_clear_feature(&features, VIRTIO_NET_F_HASH_REPORT); + virtio_clear_feature_ex(features, VIRTIO_NET_F_HASH_REPORT); if (!use_own_hash || !virtio_net_attach_ebpf_to_backend(n->nic, -1)) { if (!virtio_net_load_ebpf(n, errp)) { - return features; + return; } - virtio_clear_feature(&features, VIRTIO_NET_F_RSS); + virtio_clear_feature_ex(features, VIRTIO_NET_F_RSS); } } - features = vhost_net_get_features(get_vhost_net(nc->peer), features); - vdev->backend_features = features; + vhost_net_get_features_ex(get_vhost_net(nc->peer), features); + virtio_features_copy(vdev->backend_features_ex, features); if (n->mtu_bypass_backend && (n->host_features & 1ULL << VIRTIO_NET_F_MTU)) { - features |= (1ULL << VIRTIO_NET_F_MTU); + virtio_add_feature_ex(features, VIRTIO_NET_F_MTU); } /* @@ -3106,10 +3176,8 @@ static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features, * support it. */ if (!virtio_has_feature(vdev->backend_features, VIRTIO_NET_F_CTRL_VQ)) { - virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_ANNOUNCE); + virtio_clear_feature_ex(features, VIRTIO_NET_F_GUEST_ANNOUNCE); } - - return features; } static int virtio_net_post_load_device(void *opaque, int version_id) @@ -3117,13 +3185,15 @@ static int virtio_net_post_load_device(void *opaque, int version_id) VirtIONet *n = opaque; VirtIODevice *vdev = VIRTIO_DEVICE(n); int i, link_down; + bool has_tunnel_hdr = virtio_has_tunnel_hdr(vdev->guest_features_ex); trace_virtio_net_post_load_device(); virtio_net_set_mrg_rx_bufs(n, n->mergeable_rx_bufs, virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1), virtio_vdev_has_feature(vdev, - VIRTIO_NET_F_HASH_REPORT)); + VIRTIO_NET_F_HASH_REPORT), + has_tunnel_hdr); /* MAC_TABLE_ENTRIES may be different from the saved image */ if (n->mac_table.in_use > MAC_TABLE_ENTRIES) { @@ -3943,7 +4013,7 @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp) n->vqs[0].tx_waiting = 0; n->tx_burst = n->net_conf.txburst; - virtio_net_set_mrg_rx_bufs(n, 0, 0, 0); + virtio_net_set_mrg_rx_bufs(n, 0, 0, 0, 0); n->promisc = 1; /* for compatibility */ n->mac_table.macs = g_malloc0(MAC_TABLE_ENTRIES * ETH_ALEN); @@ -4227,6 +4297,22 @@ static const Property virtio_net_properties[] = { rss_data.specified_hash_types, VIRTIO_NET_HASH_REPORT_UDPv6_EX - 1, ON_OFF_AUTO_AUTO), + VIRTIO_DEFINE_PROP_FEATURE("host_tunnel", VirtIONet, + host_features_ex, + VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO, + false), + VIRTIO_DEFINE_PROP_FEATURE("host_tunnel_csum", VirtIONet, + host_features_ex, + VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO_CSUM, + false), + VIRTIO_DEFINE_PROP_FEATURE("guest_tunnel", VirtIONet, + host_features_ex, + VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO, + false), + VIRTIO_DEFINE_PROP_FEATURE("guest_tunnel_csum", VirtIONet, + host_features_ex, + VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_CSUM, + false), }; static void virtio_net_class_init(ObjectClass *klass, const void *data) @@ -4241,8 +4327,8 @@ static void virtio_net_class_init(ObjectClass *klass, const void *data) vdc->unrealize = virtio_net_device_unrealize; vdc->get_config = virtio_net_get_config; vdc->set_config = virtio_net_set_config; - vdc->get_features = virtio_net_get_features; - vdc->set_features = virtio_net_set_features; + vdc->get_features_ex = virtio_net_get_features; + vdc->set_features_ex = virtio_net_set_features; vdc->bad_features = virtio_net_bad_features; vdc->reset = virtio_net_reset; vdc->queue_reset = virtio_net_queue_reset; diff --git a/hw/net/vmxnet3.c b/hw/net/vmxnet3.c index af73aa8ef2..03732375a7 100644 --- a/hw/net/vmxnet3.c +++ b/hw/net/vmxnet3.c @@ -1322,14 +1322,11 @@ static void vmxnet3_update_features(VMXNET3State *s) s->lro_supported, rxcso_supported, s->rx_vlan_stripping); if (s->peer_has_vhdr) { - qemu_set_offload(qemu_get_queue(s->nic)->peer, - rxcso_supported, - s->lro_supported, - s->lro_supported, - 0, - 0, - 0, - 0); + NetOffloads ol = { .csum = rxcso_supported, + .tso4 = s->lro_supported, + .tso6 = s->lro_supported }; + + qemu_set_offload(qemu_get_queue(s->nic)->peer, &ol); } } diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index f5ee6bf260..cd81f73997 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -8708,12 +8708,8 @@ static bool nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) msix_table_offset); memory_region_add_subregion(&n->bar0, 0, &n->iomem); - if (pci_is_vf(pci_dev)) { - pcie_sriov_vf_register_bar(pci_dev, 0, &n->bar0); - } else { - pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY | - PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0); - } + pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY | + PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0); ret = msix_init(pci_dev, nr_vectors, &n->bar0, 0, msix_table_offset, diff --git a/hw/pci/pci.c b/hw/pci/pci.c index 5e2c3c6fc2..acc03fd470 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -1492,9 +1492,6 @@ void pci_register_bar(PCIDevice *pci_dev, int region_num, : pci_get_bus(pci_dev)->address_space_mem; if (pci_is_vf(pci_dev)) { - PCIDevice *pf = pci_dev->exp.sriov_vf.pf; - assert(!pf || type == pf->exp.sriov_pf.vf_bar_type[region_num]); - r->addr = pci_bar_address(pci_dev, region_num, r->type, r->size); if (r->addr != PCI_BAR_UNMAPPED) { memory_region_add_subregion_overlap(r->address_space, @@ -2968,7 +2965,7 @@ int pci_iommu_init_iotlb_notifier(PCIDevice *dev, IOMMUNotifier *n, PCIBus *iommu_bus; int devfn; - pci_device_get_iommu_bus_devfn(dev, &bus, &iommu_bus, &devfn); + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, &bus, &devfn); if (iommu_bus && iommu_bus->iommu_ops->init_iotlb_notifier) { iommu_bus->iommu_ops->init_iotlb_notifier(bus, iommu_bus->iommu_opaque, devfn, n, fn, opaque); @@ -3026,7 +3023,7 @@ int pci_pri_request_page(PCIDevice *dev, uint32_t pasid, bool priv_req, return -EPERM; } - pci_device_get_iommu_bus_devfn(dev, &bus, &iommu_bus, &devfn); + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, &bus, &devfn); if (iommu_bus && iommu_bus->iommu_ops->pri_request_page) { return iommu_bus->iommu_ops->pri_request_page(bus, iommu_bus->iommu_opaque, @@ -3050,7 +3047,7 @@ int pci_pri_register_notifier(PCIDevice *dev, uint32_t pasid, return -EPERM; } - pci_device_get_iommu_bus_devfn(dev, &bus, &iommu_bus, &devfn); + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, &bus, &devfn); if (iommu_bus && iommu_bus->iommu_ops->pri_register_notifier) { iommu_bus->iommu_ops->pri_register_notifier(bus, iommu_bus->iommu_opaque, @@ -3067,7 +3064,7 @@ void pci_pri_unregister_notifier(PCIDevice *dev, uint32_t pasid) PCIBus *iommu_bus; int devfn; - pci_device_get_iommu_bus_devfn(dev, &bus, &iommu_bus, &devfn); + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, &bus, &devfn); if (iommu_bus && iommu_bus->iommu_ops->pri_unregister_notifier) { iommu_bus->iommu_ops->pri_unregister_notifier(bus, iommu_bus->iommu_opaque, @@ -3099,7 +3096,7 @@ ssize_t pci_ats_request_translation(PCIDevice *dev, uint32_t pasid, return -EPERM; } - pci_device_get_iommu_bus_devfn(dev, &bus, &iommu_bus, &devfn); + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, &bus, &devfn); if (iommu_bus && iommu_bus->iommu_ops->ats_request_translation) { return iommu_bus->iommu_ops->ats_request_translation(bus, iommu_bus->iommu_opaque, @@ -3123,7 +3120,7 @@ int pci_iommu_register_iotlb_notifier(PCIDevice *dev, uint32_t pasid, return -EPERM; } - pci_device_get_iommu_bus_devfn(dev, &bus, &iommu_bus, &devfn); + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, &bus, &devfn); if (iommu_bus && iommu_bus->iommu_ops->register_iotlb_notifier) { iommu_bus->iommu_ops->register_iotlb_notifier(bus, iommu_bus->iommu_opaque, devfn, @@ -3145,7 +3142,7 @@ int pci_iommu_unregister_iotlb_notifier(PCIDevice *dev, uint32_t pasid, return -EPERM; } - pci_device_get_iommu_bus_devfn(dev, &bus, &iommu_bus, &devfn); + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, &bus, &devfn); if (iommu_bus && iommu_bus->iommu_ops->unregister_iotlb_notifier) { iommu_bus->iommu_ops->unregister_iotlb_notifier(bus, iommu_bus->iommu_opaque, @@ -3159,11 +3156,9 @@ int pci_iommu_unregister_iotlb_notifier(PCIDevice *dev, uint32_t pasid, int pci_iommu_get_iotlb_info(PCIDevice *dev, uint8_t *addr_width, uint32_t *min_page_size) { - PCIBus *bus; PCIBus *iommu_bus; - int devfn; - pci_device_get_iommu_bus_devfn(dev, &bus, &iommu_bus, &devfn); + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, NULL, NULL); if (iommu_bus && iommu_bus->iommu_ops->get_iotlb_info) { iommu_bus->iommu_ops->get_iotlb_info(iommu_bus->iommu_opaque, addr_width, min_page_size); diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c index eaeb68894e..b302de6419 100644 --- a/hw/pci/pcie.c +++ b/hw/pci/pcie.c @@ -1266,6 +1266,14 @@ void pcie_pri_init(PCIDevice *dev, uint16_t offset, uint32_t outstanding_pr_cap, dev->exp.pri_cap = offset; } +uint32_t pcie_pri_get_req_alloc(const PCIDevice *dev) +{ + if (!pcie_pri_enabled(dev)) { + return 0; + } + return pci_get_long(dev->config + dev->exp.pri_cap + PCI_PRI_ALLOC_REQ); +} + bool pcie_pri_enabled(const PCIDevice *dev) { if (!pci_is_express(dev) || !dev->exp.pri_cap) { diff --git a/hw/pci/pcie_sriov.c b/hw/pci/pcie_sriov.c index 8a4bf0d6f7..c4f88f0975 100644 --- a/hw/pci/pcie_sriov.c +++ b/hw/pci/pcie_sriov.c @@ -195,7 +195,9 @@ bool pcie_sriov_pf_init(PCIDevice *dev, uint16_t offset, void pcie_sriov_pf_exit(PCIDevice *dev) { - uint8_t *cfg = dev->config + dev->exp.sriov_cap; + if (dev->exp.sriov_cap == 0) { + return; + } if (dev->exp.sriov_pf.vf_user_created) { uint16_t ven_id = pci_get_word(dev->config + PCI_VENDOR_ID); @@ -211,6 +213,8 @@ void pcie_sriov_pf_exit(PCIDevice *dev) pci_config_set_device_id(dev->exp.sriov_pf.vf[i]->config, vf_dev_id); } } else { + uint8_t *cfg = dev->config + dev->exp.sriov_cap; + unparent_vfs(dev, pci_get_word(cfg + PCI_SRIOV_TOTAL_VF)); } } @@ -242,17 +246,6 @@ void pcie_sriov_pf_init_vf_bar(PCIDevice *dev, int region_num, dev->exp.sriov_pf.vf_bar_type[region_num] = type; } -void pcie_sriov_vf_register_bar(PCIDevice *dev, int region_num, - MemoryRegion *memory) -{ - uint8_t type; - - assert(dev->exp.sriov_vf.pf); - type = dev->exp.sriov_vf.pf->exp.sriov_pf.vf_bar_type[region_num]; - - return pci_register_bar(dev, region_num, type, memory); -} - static gint compare_vf_devfns(gconstpointer a, gconstpointer b) { return (*(PCIDevice **)a)->devfn - (*(PCIDevice **)b)->devfn; diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c index 34ae14f7bf..d817fc42b4 100644 --- a/hw/scsi/virtio-scsi.c +++ b/hw/scsi/virtio-scsi.c @@ -116,11 +116,7 @@ static void virtio_scsi_complete_req(VirtIOSCSIReq *req, QemuMutex *vq_lock) } virtqueue_push(vq, &req->elem, req->qsgl.size + req->resp_iov.size); - if (s->dataplane_started && !s->dataplane_fenced) { - virtio_notify_irqfd(vdev, vq); - } else { - virtio_notify(vdev, vq); - } + virtio_notify(vdev, vq); if (vq_lock) { qemu_mutex_unlock(vq_lock); diff --git a/hw/smbios/smbios.c b/hw/smbios/smbios.c index 1ac063cfb4..13e21a9c43 100644 --- a/hw/smbios/smbios.c +++ b/hw/smbios/smbios.c @@ -179,6 +179,10 @@ static const QemuOptDesc qemu_smbios_type0_opts[] = { .name = "uefi", .type = QEMU_OPT_BOOL, .help = "uefi support", + },{ + .name = "vm", + .type = QEMU_OPT_BOOL, + .help = "virtual machine", }, { /* end of list */ } }; @@ -574,10 +578,14 @@ static void smbios_build_type_0_table(void) t->bios_characteristics = cpu_to_le64(0x08); /* Not supported */ t->bios_characteristics_extension_bytes[0] = 0; - t->bios_characteristics_extension_bytes[1] = 0x14; /* TCD/SVVP | VM */ + + t->bios_characteristics_extension_bytes[1] = 0x04; /* TCD/SVVP */ if (smbios_type0.uefi) { t->bios_characteristics_extension_bytes[1] |= 0x08; /* |= UEFI */ } + if (smbios_type0.vm) { + t->bios_characteristics_extension_bytes[1] |= 0x10; /* |= VM */ + } if (smbios_type0.have_major_minor) { t->system_bios_major_release = smbios_type0.major; @@ -1405,6 +1413,7 @@ void smbios_entry_add(QemuOpts *opts, Error **errp) save_opt(&smbios_type0.version, opts, "version"); save_opt(&smbios_type0.date, opts, "date"); smbios_type0.uefi = qemu_opt_get_bool(opts, "uefi", false); + smbios_type0.vm = qemu_opt_get_bool(opts, "vm", true); val = qemu_opt_get(opts, "release"); if (val) { diff --git a/hw/virtio/Kconfig b/hw/virtio/Kconfig index 7648a2d68d..10f5c53ac0 100644 --- a/hw/virtio/Kconfig +++ b/hw/virtio/Kconfig @@ -126,3 +126,8 @@ config VHOST_USER_SCMI bool default y depends on VIRTIO && VHOST_USER && ARM + +config VHOST_USER_TEST + bool + default y + depends on VIRTIO && VHOST_USER diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build index 3ea7b3cec8..48b9fedfa5 100644 --- a/hw/virtio/meson.build +++ b/hw/virtio/meson.build @@ -22,7 +22,7 @@ if have_vhost system_virtio_ss.add(files('vhost-user-base.c')) # MMIO Stubs - system_virtio_ss.add(files('vhost-user-device.c')) + system_virtio_ss.add(when: 'CONFIG_VHOST_USER_TEST', if_true: files('vhost-user-test-device.c')) system_virtio_ss.add(when: 'CONFIG_VHOST_USER_GPIO', if_true: files('vhost-user-gpio.c')) system_virtio_ss.add(when: 'CONFIG_VHOST_USER_I2C', if_true: files('vhost-user-i2c.c')) system_virtio_ss.add(when: 'CONFIG_VHOST_USER_RNG', if_true: files('vhost-user-rng.c')) @@ -30,7 +30,8 @@ if have_vhost system_virtio_ss.add(when: 'CONFIG_VHOST_USER_INPUT', if_true: files('vhost-user-input.c')) # PCI Stubs - system_virtio_ss.add(when: 'CONFIG_VIRTIO_PCI', if_true: files('vhost-user-device-pci.c')) + system_virtio_ss.add(when: ['CONFIG_VIRTIO_PCI', 'CONFIG_VHOST_USER_TEST'], + if_true: files('vhost-user-test-device-pci.c')) system_virtio_ss.add(when: ['CONFIG_VIRTIO_PCI', 'CONFIG_VHOST_USER_GPIO'], if_true: files('vhost-user-gpio-pci.c')) system_virtio_ss.add(when: ['CONFIG_VIRTIO_PCI', 'CONFIG_VHOST_USER_I2C'], diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events index 76f0d458b2..658cc365e7 100644 --- a/hw/virtio/trace-events +++ b/hw/virtio/trace-events @@ -75,7 +75,6 @@ virtqueue_flush(void *vq, unsigned int count) "vq %p count %u" virtqueue_pop(void *vq, void *elem, unsigned int in_num, unsigned int out_num) "vq %p elem %p in_num %u out_num %u" virtio_queue_notify(void *vdev, int n, void *vq) "vdev %p n %d vq %p" virtio_notify_irqfd_deferred_fn(void *vdev, void *vq) "vdev %p vq %p" -virtio_notify_irqfd(void *vdev, void *vq) "vdev %p vq %p" virtio_notify(void *vdev, void *vq) "vdev %p vq %p" virtio_set_status(void *vdev, uint8_t val) "vdev %p val %u" diff --git a/hw/virtio/vdpa-dev.c b/hw/virtio/vdpa-dev.c index d1da40afc8..4a7b970976 100644 --- a/hw/virtio/vdpa-dev.c +++ b/hw/virtio/vdpa-dev.c @@ -338,6 +338,12 @@ static int vhost_vdpa_device_set_status(VirtIODevice *vdev, uint8_t status) return 0; } +static struct vhost_dev *vhost_vdpa_device_get_vhost(VirtIODevice *vdev) +{ + VhostVdpaDevice *s = VHOST_VDPA_DEVICE(vdev); + return &s->dev; +} + static const Property vhost_vdpa_device_properties[] = { DEFINE_PROP_STRING("vhostdev", VhostVdpaDevice, vhostdev), DEFINE_PROP_UINT16("queue-size", VhostVdpaDevice, queue_size, 0), @@ -369,6 +375,7 @@ static void vhost_vdpa_device_class_init(ObjectClass *klass, const void *data) vdc->set_config = vhost_vdpa_device_set_config; vdc->get_features = vhost_vdpa_device_get_features; vdc->set_status = vhost_vdpa_device_set_status; + vdc->get_vhost = vhost_vdpa_device_get_vhost; } static void vhost_vdpa_device_instance_init(Object *obj) diff --git a/hw/virtio/vhost-backend.c b/hw/virtio/vhost-backend.c index 833804dd40..4367db0d95 100644 --- a/hw/virtio/vhost-backend.c +++ b/hw/virtio/vhost-backend.c @@ -20,6 +20,11 @@ #include <linux/vhost.h> #include <sys/ioctl.h> +struct vhost_features { + uint64_t count; + uint64_t features[VIRTIO_FEATURES_NU64S]; +}; + static int vhost_kernel_call(struct vhost_dev *dev, unsigned long int request, void *arg) { @@ -182,12 +187,6 @@ static int vhost_kernel_get_vring_worker(struct vhost_dev *dev, return vhost_kernel_call(dev, VHOST_GET_VRING_WORKER, worker); } -static int vhost_kernel_set_features(struct vhost_dev *dev, - uint64_t features) -{ - return vhost_kernel_call(dev, VHOST_SET_FEATURES, &features); -} - static int vhost_kernel_set_backend_cap(struct vhost_dev *dev) { uint64_t features; @@ -210,10 +209,51 @@ static int vhost_kernel_set_backend_cap(struct vhost_dev *dev) return 0; } -static int vhost_kernel_get_features(struct vhost_dev *dev, - uint64_t *features) +static int vhost_kernel_set_features(struct vhost_dev *dev, + const uint64_t *features) { - return vhost_kernel_call(dev, VHOST_GET_FEATURES, features); + struct vhost_features farray; + bool extended_in_use; + int r; + + farray.count = VIRTIO_FEATURES_NU64S; + virtio_features_copy(farray.features, features); + extended_in_use = virtio_features_use_ex(farray.features); + + /* + * Can't check for ENOTTY: for unknown ioctls the kernel interprets + * the argument as a virtio queue id and most likely errors out validating + * such id, instead of reporting an unknown operation. + */ + r = vhost_kernel_call(dev, VHOST_SET_FEATURES_ARRAY, &farray); + if (!r) { + return 0; + } + + if (extended_in_use) { + error_report("Trying to set extended features without kernel support"); + return -EINVAL; + } + return vhost_kernel_call(dev, VHOST_SET_FEATURES, &farray.features[0]); +} + +static int vhost_kernel_get_features(struct vhost_dev *dev, uint64_t *features) +{ + struct vhost_features farray; + int r; + + farray.count = VIRTIO_FEATURES_NU64S; + r = vhost_kernel_call(dev, VHOST_GET_FEATURES_ARRAY, &farray); + if (r) { + memset(&farray, 0, sizeof(farray)); + r = vhost_kernel_call(dev, VHOST_GET_FEATURES, &farray.features[0]); + } + if (r) { + return r; + } + + virtio_features_copy(features, farray.features); + return 0; } static int vhost_kernel_set_owner(struct vhost_dev *dev) @@ -341,8 +381,8 @@ const VhostOps kernel_ops = { .vhost_attach_vring_worker = vhost_kernel_attach_vring_worker, .vhost_new_worker = vhost_kernel_new_worker, .vhost_free_worker = vhost_kernel_free_worker, - .vhost_set_features = vhost_kernel_set_features, - .vhost_get_features = vhost_kernel_get_features, + .vhost_set_features_ex = vhost_kernel_set_features, + .vhost_get_features_ex = vhost_kernel_get_features, .vhost_set_backend_cap = vhost_kernel_set_backend_cap, .vhost_set_owner = vhost_kernel_set_owner, .vhost_get_vq_index = vhost_kernel_get_vq_index, diff --git a/hw/virtio/vhost-user-device-pci.c b/hw/virtio/vhost-user-test-device-pci.c index f10bac874e..b4ed0efb50 100644 --- a/hw/virtio/vhost-user-device-pci.c +++ b/hw/virtio/vhost-user-test-device-pci.c @@ -18,13 +18,13 @@ struct VHostUserDevicePCI { VHostUserBase vub; }; -#define TYPE_VHOST_USER_DEVICE_PCI "vhost-user-device-pci-base" +#define TYPE_VHOST_USER_TEST_DEVICE_PCI "vhost-user-test-device-pci-base" -OBJECT_DECLARE_SIMPLE_TYPE(VHostUserDevicePCI, VHOST_USER_DEVICE_PCI) +OBJECT_DECLARE_SIMPLE_TYPE(VHostUserDevicePCI, VHOST_USER_TEST_DEVICE_PCI) static void vhost_user_device_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) { - VHostUserDevicePCI *dev = VHOST_USER_DEVICE_PCI(vpci_dev); + VHostUserDevicePCI *dev = VHOST_USER_TEST_DEVICE_PCI(vpci_dev); DeviceState *vdev = DEVICE(&dev->vub); vpci_dev->nvectors = 1; @@ -38,9 +38,6 @@ static void vhost_user_device_pci_class_init(ObjectClass *klass, VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass); PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass); - /* Reason: stop users confusing themselves */ - dc->user_creatable = false; - k->realize = vhost_user_device_pci_realize; set_bit(DEVICE_CATEGORY_INPUT, dc->categories); pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET; @@ -51,15 +48,15 @@ static void vhost_user_device_pci_class_init(ObjectClass *klass, static void vhost_user_device_pci_instance_init(Object *obj) { - VHostUserDevicePCI *dev = VHOST_USER_DEVICE_PCI(obj); + VHostUserDevicePCI *dev = VHOST_USER_TEST_DEVICE_PCI(obj); virtio_instance_init_common(obj, &dev->vub, sizeof(dev->vub), - TYPE_VHOST_USER_DEVICE); + TYPE_VHOST_USER_TEST_DEVICE); } static const VirtioPCIDeviceTypeInfo vhost_user_device_pci_info = { - .base_name = TYPE_VHOST_USER_DEVICE_PCI, - .non_transitional_name = "vhost-user-device-pci", + .base_name = TYPE_VHOST_USER_TEST_DEVICE_PCI, + .non_transitional_name = "vhost-user-test-device-pci", .instance_size = sizeof(VHostUserDevicePCI), .instance_init = vhost_user_device_pci_instance_init, .class_init = vhost_user_device_pci_class_init, diff --git a/hw/virtio/vhost-user-device.c b/hw/virtio/vhost-user-test-device.c index 3939bdf755..1b98ea3e48 100644 --- a/hw/virtio/vhost-user-device.c +++ b/hw/virtio/vhost-user-test-device.c @@ -1,5 +1,5 @@ /* - * Generic vhost-user-device implementation for any vhost-user-backend + * Generic vhost-user-test-device implementation for any vhost-user-backend * * This is a concrete implementation of vhost-user-base which can be * configured via properties. It is useful for development and @@ -25,7 +25,7 @@ */ static const VMStateDescription vud_vmstate = { - .name = "vhost-user-device", + .name = "vhost-user-test-device", .unmigratable = 1, }; @@ -41,16 +41,13 @@ static void vud_class_init(ObjectClass *klass, const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); - /* Reason: stop inexperienced users confusing themselves */ - dc->user_creatable = false; - device_class_set_props(dc, vud_properties); dc->vmsd = &vud_vmstate; set_bit(DEVICE_CATEGORY_INPUT, dc->categories); } static const TypeInfo vud_info = { - .name = TYPE_VHOST_USER_DEVICE, + .name = TYPE_VHOST_USER_TEST_DEVICE, .parent = TYPE_VHOST_USER_BASE, .class_init = vud_class_init, }; diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index 6557c58d12..c120ef38b9 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -972,20 +972,34 @@ static int vhost_virtqueue_set_addr(struct vhost_dev *dev, static int vhost_dev_set_features(struct vhost_dev *dev, bool enable_log) { - uint64_t features = dev->acked_features; + uint64_t features[VIRTIO_FEATURES_NU64S]; int r; + + virtio_features_copy(features, dev->acked_features_ex); if (enable_log) { - features |= 0x1ULL << VHOST_F_LOG_ALL; + virtio_add_feature_ex(features, VHOST_F_LOG_ALL); } if (!vhost_dev_has_iommu(dev)) { - features &= ~(0x1ULL << VIRTIO_F_IOMMU_PLATFORM); + virtio_clear_feature_ex(features, VIRTIO_F_IOMMU_PLATFORM); } if (dev->vhost_ops->vhost_force_iommu) { if (dev->vhost_ops->vhost_force_iommu(dev) == true) { - features |= 0x1ULL << VIRTIO_F_IOMMU_PLATFORM; + virtio_add_feature_ex(features, VIRTIO_F_IOMMU_PLATFORM); } } - r = dev->vhost_ops->vhost_set_features(dev, features); + + if (virtio_features_use_ex(features) && + !dev->vhost_ops->vhost_set_features_ex) { + r = -EINVAL; + VHOST_OPS_DEBUG(r, "extended features without device support"); + goto out; + } + + if (dev->vhost_ops->vhost_set_features_ex) { + r = dev->vhost_ops->vhost_set_features_ex(dev, features); + } else { + r = dev->vhost_ops->vhost_set_features(dev, features[0]); + } if (r < 0) { VHOST_OPS_DEBUG(r, "vhost_set_features failed"); goto out; @@ -1508,12 +1522,27 @@ static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq) } } +static int vhost_dev_get_features(struct vhost_dev *hdev, + uint64_t *features) +{ + uint64_t features64; + int r; + + if (hdev->vhost_ops->vhost_get_features_ex) { + return hdev->vhost_ops->vhost_get_features_ex(hdev, features); + } + + r = hdev->vhost_ops->vhost_get_features(hdev, &features64); + virtio_features_from_u64(features, features64); + return r; +} + int vhost_dev_init(struct vhost_dev *hdev, void *opaque, VhostBackendType backend_type, uint32_t busyloop_timeout, Error **errp) { + uint64_t features[VIRTIO_FEATURES_NU64S]; unsigned int used, reserved, limit; - uint64_t features; int i, r, n_initialized_vqs = 0; hdev->vdev = NULL; @@ -1533,7 +1562,7 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque, goto fail; } - r = hdev->vhost_ops->vhost_get_features(hdev, &features); + r = vhost_dev_get_features(hdev, features); if (r < 0) { error_setg_errno(errp, -r, "vhost_get_features failed"); goto fail; @@ -1571,7 +1600,7 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque, } } - hdev->features = features; + virtio_features_copy(hdev->features_ex, features); hdev->memory_listener = (MemoryListener) { .name = "vhost", @@ -1594,7 +1623,7 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque, }; if (hdev->migration_blocker == NULL) { - if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) { + if (!virtio_has_feature_ex(hdev->features_ex, VHOST_F_LOG_ALL)) { error_setg(&hdev->migration_blocker, "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature."); } else if (vhost_dev_log_is_shared(hdev) && !qemu_memfd_alloc_check()) { @@ -1817,7 +1846,7 @@ void vhost_config_mask(struct vhost_dev *hdev, VirtIODevice *vdev, bool mask) int r; EventNotifier *notifier = &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier; - EventNotifier *config_notifier = &vdev->config_notifier; + EventNotifier *config_notifier = virtio_config_get_guest_notifier(vdev); assert(hdev->vhost_ops); if ((hdev->started == false) || @@ -1848,39 +1877,40 @@ static void vhost_stop_config_intr(struct vhost_dev *dev) static void vhost_start_config_intr(struct vhost_dev *dev) { int r; + EventNotifier *config_notifier = + virtio_config_get_guest_notifier(dev->vdev); assert(dev->vhost_ops); - int fd = event_notifier_get_fd(&dev->vdev->config_notifier); + int fd = event_notifier_get_fd(config_notifier); if (dev->vhost_ops->vhost_set_config_call) { r = dev->vhost_ops->vhost_set_config_call(dev, fd); if (!r) { - event_notifier_set(&dev->vdev->config_notifier); + event_notifier_set(config_notifier); } } } -uint64_t vhost_get_features(struct vhost_dev *hdev, const int *feature_bits, - uint64_t features) +void vhost_get_features_ex(struct vhost_dev *hdev, + const int *feature_bits, + uint64_t *features) { const int *bit = feature_bits; + while (*bit != VHOST_INVALID_FEATURE_BIT) { - uint64_t bit_mask = (1ULL << *bit); - if (!(hdev->features & bit_mask)) { - features &= ~bit_mask; + if (!virtio_has_feature_ex(hdev->features_ex, *bit)) { + virtio_clear_feature_ex(features, *bit); } bit++; } - return features; } -void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits, - uint64_t features) +void vhost_ack_features_ex(struct vhost_dev *hdev, const int *feature_bits, + const uint64_t *features) { const int *bit = feature_bits; while (*bit != VHOST_INVALID_FEATURE_BIT) { - uint64_t bit_mask = (1ULL << *bit); - if (features & bit_mask) { - hdev->acked_features |= bit_mask; + if (virtio_has_feature_ex(features, *bit)) { + virtio_add_feature_ex(hdev->acked_features_ex, *bit); } bit++; } @@ -2139,12 +2169,13 @@ static int do_vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev, { int i; int rc = 0; + EventNotifier *config_notifier = virtio_config_get_guest_notifier(vdev); /* should only be called after backend is connected */ assert(hdev->vhost_ops); event_notifier_test_and_clear( &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier); - event_notifier_test_and_clear(&vdev->config_notifier); + event_notifier_test_and_clear(config_notifier); event_notifier_cleanup( &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier); diff --git a/hw/virtio/virtio-bus.c b/hw/virtio/virtio-bus.c index 11adfbf3ab..cef944e015 100644 --- a/hw/virtio/virtio-bus.c +++ b/hw/virtio/virtio-bus.c @@ -62,9 +62,14 @@ void virtio_bus_device_plugged(VirtIODevice *vdev, Error **errp) } /* Get the features of the plugged device. */ - assert(vdc->get_features != NULL); - vdev->host_features = vdc->get_features(vdev, vdev->host_features, - &local_err); + if (vdc->get_features_ex) { + vdc->get_features_ex(vdev, vdev->host_features_ex, &local_err); + } else { + assert(vdc->get_features != NULL); + virtio_features_from_u64(vdev->host_features_ex, + vdc->get_features(vdev, vdev->host_features, + &local_err)); + } if (local_err) { error_propagate(errp, local_err); return; diff --git a/hw/virtio/virtio-hmp-cmds.c b/hw/virtio/virtio-hmp-cmds.c index 7d8677bcf0..1daae482d3 100644 --- a/hw/virtio/virtio-hmp-cmds.c +++ b/hw/virtio/virtio-hmp-cmds.c @@ -74,7 +74,8 @@ static void hmp_virtio_dump_features(Monitor *mon, } if (features->has_unknown_dev_features) { - monitor_printf(mon, " unknown-features(0x%016"PRIx64")\n", + monitor_printf(mon, " unknown-features(0x%016"PRIx64"%016"PRIx64")\n", + features->unknown_dev_features2, features->unknown_dev_features); } } diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index d2595fbd55..ab96ce1776 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -109,6 +109,29 @@ static const VMStateDescription vmstate_virtio_pci_modern_queue_state = { } }; +static bool virtio_pci_modern_state_features128_needed(void *opaque) +{ + VirtIOPCIProxy *proxy = opaque; + uint32_t features = 0; + int i; + + for (i = 2; i < ARRAY_SIZE(proxy->guest_features); ++i) { + features |= proxy->guest_features[i]; + } + return features; +} + +static const VMStateDescription vmstate_virtio_pci_modern_state_features128 = { + .name = "virtio_pci/modern_state/features128", + .version_id = 1, + .minimum_version_id = 1, + .needed = &virtio_pci_modern_state_features128_needed, + .fields = (const VMStateField[]) { + VMSTATE_UINT32_SUB_ARRAY(guest_features, VirtIOPCIProxy, 2, 2), + VMSTATE_END_OF_LIST() + } +}; + static bool virtio_pci_modern_state_needed(void *opaque) { VirtIOPCIProxy *proxy = opaque; @@ -116,6 +139,12 @@ static bool virtio_pci_modern_state_needed(void *opaque) return virtio_pci_modern(proxy); } +/* + * Avoid silently breaking migration should the feature space increase + * even more in the (far away) future + */ +QEMU_BUILD_BUG_ON(VIRTIO_FEATURES_NU32S != 4); + static const VMStateDescription vmstate_virtio_pci_modern_state_sub = { .name = "virtio_pci/modern_state", .version_id = 1, @@ -124,11 +153,15 @@ static const VMStateDescription vmstate_virtio_pci_modern_state_sub = { .fields = (const VMStateField[]) { VMSTATE_UINT32(dfselect, VirtIOPCIProxy), VMSTATE_UINT32(gfselect, VirtIOPCIProxy), - VMSTATE_UINT32_ARRAY(guest_features, VirtIOPCIProxy, 2), + VMSTATE_UINT32_SUB_ARRAY(guest_features, VirtIOPCIProxy, 0, 2), VMSTATE_STRUCT_ARRAY(vqs, VirtIOPCIProxy, VIRTIO_QUEUE_MAX, 0, vmstate_virtio_pci_modern_queue_state, VirtIOPCIQueue), VMSTATE_END_OF_LIST() + }, + .subsections = (const VMStateDescription * const []) { + &vmstate_virtio_pci_modern_state_features128, + NULL } }; @@ -1477,6 +1510,19 @@ int virtio_pci_add_shm_cap(VirtIOPCIProxy *proxy, return virtio_pci_add_mem_cap(proxy, &cap.cap); } +static int virtio_pci_select_max(const VirtIODevice *vdev) +{ + int i; + + for (i = VIRTIO_FEATURES_NU64S - 1; i > 0; i--) { + if (vdev->host_features_ex[i]) { + return (i + 1) * 2; + } + } + + return 2; +} + static uint64_t virtio_pci_common_read(void *opaque, hwaddr addr, unsigned size) { @@ -1494,18 +1540,21 @@ static uint64_t virtio_pci_common_read(void *opaque, hwaddr addr, val = proxy->dfselect; break; case VIRTIO_PCI_COMMON_DF: - if (proxy->dfselect <= 1) { + if (proxy->dfselect < virtio_pci_select_max(vdev)) { VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev); - val = (vdev->host_features & ~vdc->legacy_features) >> - (32 * proxy->dfselect); + val = vdev->host_features_ex[proxy->dfselect >> 1] >> + (32 * (proxy->dfselect & 1)); + if (proxy->dfselect <= 1) { + val &= (~vdc->legacy_features) >> (32 * proxy->dfselect); + } } break; case VIRTIO_PCI_COMMON_GFSELECT: val = proxy->gfselect; break; case VIRTIO_PCI_COMMON_GF: - if (proxy->gfselect < ARRAY_SIZE(proxy->guest_features)) { + if (proxy->gfselect < virtio_pci_select_max(vdev)) { val = proxy->guest_features[proxy->gfselect]; } break; @@ -1588,11 +1637,18 @@ static void virtio_pci_common_write(void *opaque, hwaddr addr, proxy->gfselect = val; break; case VIRTIO_PCI_COMMON_GF: - if (proxy->gfselect < ARRAY_SIZE(proxy->guest_features)) { + if (proxy->gfselect < virtio_pci_select_max(vdev)) { + uint64_t features[VIRTIO_FEATURES_NU64S]; + int i; + proxy->guest_features[proxy->gfselect] = val; - virtio_set_features(vdev, - (((uint64_t)proxy->guest_features[1]) << 32) | - proxy->guest_features[0]); + virtio_features_clear(features); + for (i = 0; i < ARRAY_SIZE(proxy->guest_features); ++i) { + uint64_t cur = proxy->guest_features[i]; + + features[i >> 1] |= cur << ((i & 1) * 32); + } + virtio_set_features_ex(vdev, features); } break; case VIRTIO_PCI_COMMON_MSIX: @@ -2311,6 +2367,8 @@ static void virtio_pci_reset(DeviceState *qdev) virtio_bus_reset(bus); msix_unuse_all_vectors(&proxy->pci_dev); + memset(proxy->guest_features, 0, sizeof(proxy->guest_features)); + for (i = 0; i < VIRTIO_QUEUE_MAX; i++) { proxy->vqs[i].enabled = 0; proxy->vqs[i].reset = 0; diff --git a/hw/virtio/virtio-qmp.c b/hw/virtio/virtio-qmp.c index 3b6377cf0d..b338344c6c 100644 --- a/hw/virtio/virtio-qmp.c +++ b/hw/virtio/virtio-qmp.c @@ -325,6 +325,20 @@ static const qmp_virtio_feature_map_t virtio_net_feature_map[] = { FEATURE_ENTRY(VHOST_USER_F_PROTOCOL_FEATURES, \ "VHOST_USER_F_PROTOCOL_FEATURES: Vhost-user protocol features " "negotiation supported"), + FEATURE_ENTRY(VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO, \ + "VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO: Driver can receive GSO over " + "UDP tunnel packets"), + FEATURE_ENTRY(VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_CSUM, \ + "VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO: Driver can receive GSO over " + "UDP tunnel packets requiring checksum offload for the outer " + "header"), + FEATURE_ENTRY(VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO, \ + "VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO: Device can receive GSO over " + "UDP tunnel packets"), + FEATURE_ENTRY(VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO_CSUM, \ + "VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO_CSUM: Device can receive GSO over " + "UDP tunnel packets requiring checksum offload for the outer " + "header"), { -1, "" } }; #endif @@ -510,6 +524,24 @@ static const qmp_virtio_feature_map_t virtio_gpio_feature_map[] = { list; \ }) +#define CONVERT_FEATURES_EX(type, map, bitmap) \ + ({ \ + type *list = NULL; \ + type *node; \ + for (i = 0; map[i].virtio_bit != -1; i++) { \ + bit = map[i].virtio_bit; \ + if (!virtio_has_feature_ex(bitmap, bit)) { \ + continue; \ + } \ + node = g_new0(type, 1); \ + node->value = g_strdup(map[i].feature_desc); \ + node->next = list; \ + list = node; \ + virtio_clear_feature_ex(bitmap, bit); \ + } \ + list; \ + }) + VirtioDeviceStatus *qmp_decode_status(uint8_t bitmap) { VirtioDeviceStatus *status; @@ -545,109 +577,112 @@ VhostDeviceProtocols *qmp_decode_protocols(uint64_t bitmap) return vhu_protocols; } -VirtioDeviceFeatures *qmp_decode_features(uint16_t device_id, uint64_t bitmap) +VirtioDeviceFeatures *qmp_decode_features(uint16_t device_id, + const uint64_t *bmap) { + uint64_t bitmap[VIRTIO_FEATURES_NU64S]; VirtioDeviceFeatures *features; uint64_t bit; int i; + virtio_features_copy(bitmap, bmap); features = g_new0(VirtioDeviceFeatures, 1); features->has_dev_features = true; /* transport features */ - features->transports = CONVERT_FEATURES(strList, virtio_transport_map, 0, - bitmap); + features->transports = CONVERT_FEATURES_EX(strList, virtio_transport_map, + bitmap); /* device features */ switch (device_id) { #ifdef CONFIG_VIRTIO_SERIAL case VIRTIO_ID_CONSOLE: features->dev_features = - CONVERT_FEATURES(strList, virtio_serial_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, virtio_serial_feature_map, bitmap); break; #endif #ifdef CONFIG_VIRTIO_BLK case VIRTIO_ID_BLOCK: features->dev_features = - CONVERT_FEATURES(strList, virtio_blk_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, virtio_blk_feature_map, bitmap); break; #endif #ifdef CONFIG_VIRTIO_GPU case VIRTIO_ID_GPU: features->dev_features = - CONVERT_FEATURES(strList, virtio_gpu_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, virtio_gpu_feature_map, bitmap); break; #endif #ifdef CONFIG_VIRTIO_NET case VIRTIO_ID_NET: features->dev_features = - CONVERT_FEATURES(strList, virtio_net_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, virtio_net_feature_map, bitmap); break; #endif #ifdef CONFIG_VIRTIO_SCSI case VIRTIO_ID_SCSI: features->dev_features = - CONVERT_FEATURES(strList, virtio_scsi_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, virtio_scsi_feature_map, bitmap); break; #endif #ifdef CONFIG_VIRTIO_BALLOON case VIRTIO_ID_BALLOON: features->dev_features = - CONVERT_FEATURES(strList, virtio_balloon_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, virtio_balloon_feature_map, bitmap); break; #endif #ifdef CONFIG_VIRTIO_IOMMU case VIRTIO_ID_IOMMU: features->dev_features = - CONVERT_FEATURES(strList, virtio_iommu_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, virtio_iommu_feature_map, bitmap); break; #endif #ifdef CONFIG_VIRTIO_INPUT case VIRTIO_ID_INPUT: features->dev_features = - CONVERT_FEATURES(strList, virtio_input_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, virtio_input_feature_map, bitmap); break; #endif #ifdef CONFIG_VHOST_USER_FS case VIRTIO_ID_FS: features->dev_features = - CONVERT_FEATURES(strList, virtio_fs_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, virtio_fs_feature_map, bitmap); break; #endif #ifdef CONFIG_VHOST_VSOCK case VIRTIO_ID_VSOCK: features->dev_features = - CONVERT_FEATURES(strList, virtio_vsock_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, virtio_vsock_feature_map, bitmap); break; #endif #ifdef CONFIG_VIRTIO_CRYPTO case VIRTIO_ID_CRYPTO: features->dev_features = - CONVERT_FEATURES(strList, virtio_crypto_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, virtio_crypto_feature_map, bitmap); break; #endif #ifdef CONFIG_VIRTIO_MEM case VIRTIO_ID_MEM: features->dev_features = - CONVERT_FEATURES(strList, virtio_mem_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, virtio_mem_feature_map, bitmap); break; #endif #ifdef CONFIG_VIRTIO_I2C_ADAPTER case VIRTIO_ID_I2C_ADAPTER: features->dev_features = - CONVERT_FEATURES(strList, virtio_i2c_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, virtio_i2c_feature_map, bitmap); break; #endif #ifdef CONFIG_VIRTIO_RNG case VIRTIO_ID_RNG: features->dev_features = - CONVERT_FEATURES(strList, virtio_rng_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, virtio_rng_feature_map, bitmap); break; #endif #ifdef CONFIG_VHOST_USER_GPIO case VIRTIO_ID_GPIO: features->dev_features = - CONVERT_FEATURES(strList, virtio_gpio_feature_map, 0, bitmap); + CONVERT_FEATURES_EX(strList, virtio_gpio_feature_map, bitmap); break; #endif /* No features */ @@ -680,10 +715,9 @@ VirtioDeviceFeatures *qmp_decode_features(uint16_t device_id, uint64_t bitmap) g_assert_not_reached(); } - features->has_unknown_dev_features = bitmap != 0; - if (features->has_unknown_dev_features) { - features->unknown_dev_features = bitmap; - } + features->has_unknown_dev_features = !virtio_features_empty(bitmap); + features->unknown_dev_features = bitmap[0]; + features->unknown_dev_features2 = bitmap[1]; return features; } @@ -743,11 +777,11 @@ VirtioStatus *qmp_x_query_virtio_status(const char *path, Error **errp) status->device_id = vdev->device_id; status->vhost_started = vdev->vhost_started; status->guest_features = qmp_decode_features(vdev->device_id, - vdev->guest_features); + vdev->guest_features_ex); status->host_features = qmp_decode_features(vdev->device_id, - vdev->host_features); + vdev->host_features_ex); status->backend_features = qmp_decode_features(vdev->device_id, - vdev->backend_features); + vdev->backend_features_ex); switch (vdev->device_endian) { case VIRTIO_DEVICE_ENDIAN_LITTLE: @@ -785,11 +819,12 @@ VirtioStatus *qmp_x_query_virtio_status(const char *path, Error **errp) status->vhost_dev->nvqs = hdev->nvqs; status->vhost_dev->vq_index = hdev->vq_index; status->vhost_dev->features = - qmp_decode_features(vdev->device_id, hdev->features); + qmp_decode_features(vdev->device_id, hdev->features_ex); status->vhost_dev->acked_features = - qmp_decode_features(vdev->device_id, hdev->acked_features); + qmp_decode_features(vdev->device_id, hdev->acked_features_ex); status->vhost_dev->backend_features = - qmp_decode_features(vdev->device_id, hdev->backend_features); + qmp_decode_features(vdev->device_id, hdev->backend_features_ex); + status->vhost_dev->protocol_features = qmp_decode_protocols(hdev->protocol_features); status->vhost_dev->max_queues = hdev->max_queues; diff --git a/hw/virtio/virtio-qmp.h b/hw/virtio/virtio-qmp.h index 245a446a56..e0a1e49035 100644 --- a/hw/virtio/virtio-qmp.h +++ b/hw/virtio/virtio-qmp.h @@ -18,6 +18,7 @@ VirtIODevice *qmp_find_virtio_device(const char *path); VirtioDeviceStatus *qmp_decode_status(uint8_t bitmap); VhostDeviceProtocols *qmp_decode_protocols(uint64_t bitmap); -VirtioDeviceFeatures *qmp_decode_features(uint16_t device_id, uint64_t bitmap); +VirtioDeviceFeatures *qmp_decode_features(uint16_t device_id, + const uint64_t *bitmap); #endif diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 0a68f1b6f1..be73753b59 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -31,6 +31,7 @@ #include "hw/qdev-properties.h" #include "hw/virtio/virtio-access.h" #include "system/dma.h" +#include "system/iothread.h" #include "system/runstate.h" #include "virtio-qmp.h" @@ -256,7 +257,10 @@ void virtio_init_region_cache(VirtIODevice *vdev, int n) len = address_space_cache_init(&new->desc, vdev->dma_as, addr, size, packed); if (len < size) { - virtio_error(vdev, "Cannot map desc"); + virtio_error(vdev, + "Failed to map descriptor ring for device %s: " + "invalid guest physical address or corrupted queue setup", + qdev_get_printable_name(DEVICE(vdev))); goto err_desc; } @@ -264,7 +268,10 @@ void virtio_init_region_cache(VirtIODevice *vdev, int n) len = address_space_cache_init(&new->used, vdev->dma_as, vq->vring.used, size, true); if (len < size) { - virtio_error(vdev, "Cannot map used"); + virtio_error(vdev, + "Failed to map used ring for device %s: " + "possible guest misconfiguration or insufficient memory", + qdev_get_printable_name(DEVICE(vdev))); goto err_used; } @@ -272,7 +279,10 @@ void virtio_init_region_cache(VirtIODevice *vdev, int n) len = address_space_cache_init(&new->avail, vdev->dma_as, vq->vring.avail, size, false); if (len < size) { - virtio_error(vdev, "Cannot map avail"); + virtio_error(vdev, + "Failed to map avalaible ring for device %s: " + "possible queue misconfiguration or overlapping memory region", + qdev_get_printable_name(DEVICE(vdev))); goto err_avail; } @@ -2654,16 +2664,8 @@ static void virtio_notify_irqfd_deferred_fn(void *opaque) event_notifier_set(notifier); } -void virtio_notify_irqfd(VirtIODevice *vdev, VirtQueue *vq) +static void virtio_irq(VirtQueue *vq) { - WITH_RCU_READ_LOCK_GUARD() { - if (!virtio_should_notify(vdev, vq)) { - return; - } - } - - trace_virtio_notify_irqfd(vdev, vq); - /* * virtio spec 1.0 says ISR bit 0 should be ignored with MSI, but * windows drivers included in virtio-win 1.8.0 (circa 2015) are @@ -2680,13 +2682,18 @@ void virtio_notify_irqfd(VirtIODevice *vdev, VirtQueue *vq) * to an atomic operation. */ virtio_set_isr(vq->vdev, 0x1); - defer_call(virtio_notify_irqfd_deferred_fn, &vq->guest_notifier); -} -static void virtio_irq(VirtQueue *vq) -{ - virtio_set_isr(vq->vdev, 0x1); - virtio_notify_vector(vq->vdev, vq->vector); + /* + * The interrupt code path requires the Big QEMU Lock (BQL), so use the + * notifier instead when in an IOThread. This assumes that device models + * have already called ->set_guest_notifiers() sometime before calling this + * function. + */ + if (qemu_in_iothread()) { + defer_call(virtio_notify_irqfd_deferred_fn, &vq->guest_notifier); + } else { + virtio_notify_vector(vq->vdev, vq->vector); + } } void virtio_notify(VirtIODevice *vdev, VirtQueue *vq) @@ -2708,7 +2715,12 @@ void virtio_notify_config(VirtIODevice *vdev) virtio_set_isr(vdev, 0x3); vdev->generation++; - virtio_notify_vector(vdev, vdev->config_vector); + + if (qemu_in_iothread()) { + defer_call(virtio_notify_irqfd_deferred_fn, &vdev->config_notifier); + } else { + virtio_notify_vector(vdev, vdev->config_vector); + } } static bool virtio_device_endian_needed(void *opaque) @@ -2964,6 +2976,30 @@ static const VMStateDescription vmstate_virtio_disabled = { } }; +static bool virtio_128bit_features_needed(void *opaque) +{ + VirtIODevice *vdev = opaque; + + return virtio_features_use_ex(vdev->host_features_ex); +} + +static const VMStateDescription vmstate_virtio_128bit_features = { + .name = "virtio/128bit_features", + .version_id = 1, + .minimum_version_id = 1, + .needed = &virtio_128bit_features_needed, + .fields = (const VMStateField[]) { + VMSTATE_UINT64(guest_features_ex[1], VirtIODevice), + VMSTATE_END_OF_LIST() + } +}; + +/* + * Avoid silently breaking migration should the feature space increase + * even more in the (far away) future + */ +QEMU_BUILD_BUG_ON(VIRTIO_FEATURES_NU64S != 2); + static const VMStateDescription vmstate_virtio = { .name = "virtio", .version_id = 1, @@ -2973,6 +3009,7 @@ static const VMStateDescription vmstate_virtio = { }, .subsections = (const VMStateDescription * const []) { &vmstate_virtio_device_endian, + &vmstate_virtio_128bit_features, &vmstate_virtio_64bit_features, &vmstate_virtio_virtqueues, &vmstate_virtio_ringsize, @@ -3071,23 +3108,30 @@ const VMStateInfo virtio_vmstate_info = { .put = virtio_device_put, }; -static int virtio_set_features_nocheck(VirtIODevice *vdev, uint64_t val) +static int virtio_set_features_nocheck(VirtIODevice *vdev, const uint64_t *val) { VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev); - bool bad = (val & ~(vdev->host_features)) != 0; + uint64_t tmp[VIRTIO_FEATURES_NU64S]; + bool bad; + + bad = virtio_features_andnot(tmp, val, vdev->host_features_ex); + virtio_features_and(tmp, val, vdev->host_features_ex); - val &= vdev->host_features; - if (k->set_features) { - k->set_features(vdev, val); + if (k->set_features_ex) { + k->set_features_ex(vdev, val); + } else if (k->set_features) { + bad = bad || virtio_features_use_ex(tmp); + k->set_features(vdev, tmp[0]); } - vdev->guest_features = val; + + virtio_features_copy(vdev->guest_features_ex, tmp); return bad ? -1 : 0; } typedef struct VirtioSetFeaturesNocheckData { Coroutine *co; VirtIODevice *vdev; - uint64_t val; + uint64_t val[VIRTIO_FEATURES_NU64S]; int ret; } VirtioSetFeaturesNocheckData; @@ -3100,14 +3144,15 @@ static void virtio_set_features_nocheck_bh(void *opaque) } static int coroutine_mixed_fn -virtio_set_features_nocheck_maybe_co(VirtIODevice *vdev, uint64_t val) +virtio_set_features_nocheck_maybe_co(VirtIODevice *vdev, + const uint64_t *val) { if (qemu_in_coroutine()) { VirtioSetFeaturesNocheckData data = { .co = qemu_coroutine_self(), .vdev = vdev, - .val = val, }; + virtio_features_copy(data.val, val); aio_bh_schedule_oneshot(qemu_get_current_aio_context(), virtio_set_features_nocheck_bh, &data); qemu_coroutine_yield(); @@ -3119,6 +3164,14 @@ virtio_set_features_nocheck_maybe_co(VirtIODevice *vdev, uint64_t val) int virtio_set_features(VirtIODevice *vdev, uint64_t val) { + uint64_t features[VIRTIO_FEATURES_NU64S]; + + virtio_features_from_u64(features, val); + return virtio_set_features_ex(vdev, features); +} + +int virtio_set_features_ex(VirtIODevice *vdev, const uint64_t *features) +{ int ret; /* * The driver must not attempt to set features after feature negotiation @@ -3128,13 +3181,13 @@ int virtio_set_features(VirtIODevice *vdev, uint64_t val) return -EINVAL; } - if (val & (1ull << VIRTIO_F_BAD_FEATURE)) { + if (features[0] & (1ull << VIRTIO_F_BAD_FEATURE)) { qemu_log_mask(LOG_GUEST_ERROR, "%s: guest driver for %s has enabled UNUSED(30) feature bit!\n", __func__, vdev->name); } - ret = virtio_set_features_nocheck(vdev, val); + ret = virtio_set_features_nocheck(vdev, features); if (virtio_vdev_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) { /* VIRTIO_RING_F_EVENT_IDX changes the size of the caches. */ int i; @@ -3157,6 +3210,7 @@ void virtio_reset(void *opaque) { VirtIODevice *vdev = opaque; VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev); + uint64_t features[VIRTIO_FEATURES_NU64S]; int i; virtio_set_status(vdev, 0); @@ -3183,7 +3237,8 @@ void virtio_reset(void *opaque) vdev->start_on_kick = false; vdev->started = false; vdev->broken = false; - virtio_set_features_nocheck(vdev, 0); + virtio_features_clear(features); + virtio_set_features_nocheck(vdev, features); vdev->queue_sel = 0; vdev->status = 0; vdev->disabled = false; @@ -3267,7 +3322,7 @@ virtio_load(VirtIODevice *vdev, QEMUFile *f, int version_id) * Note: devices should always test host features in future - don't create * new dependencies like this. */ - vdev->guest_features = features; + virtio_features_from_u64(vdev->guest_features_ex, features); config_len = qemu_get_be32(f); @@ -3348,26 +3403,17 @@ virtio_load(VirtIODevice *vdev, QEMUFile *f, int version_id) vdev->device_endian = virtio_default_endian(); } - if (virtio_64bit_features_needed(vdev)) { - /* - * Subsection load filled vdev->guest_features. Run them - * through virtio_set_features to sanity-check them against - * host_features. - */ - uint64_t features64 = vdev->guest_features; - if (virtio_set_features_nocheck_maybe_co(vdev, features64) < 0) { - error_report("Features 0x%" PRIx64 " unsupported. " - "Allowed features: 0x%" PRIx64, - features64, vdev->host_features); - return -1; - } - } else { - if (virtio_set_features_nocheck_maybe_co(vdev, features) < 0) { - error_report("Features 0x%x unsupported. " - "Allowed features: 0x%" PRIx64, - features, vdev->host_features); - return -1; - } + /* + * guest_features_ex is fully initialized with u32 features and upper + * bits have been filled as needed by the later load. + */ + if (virtio_set_features_nocheck_maybe_co(vdev, + vdev->guest_features_ex) < 0) { + error_report("Features 0x" VIRTIO_FEATURES_FMT " unsupported. " + "Allowed features: 0x" VIRTIO_FEATURES_FMT, + VIRTIO_FEATURES_PR(vdev->guest_features_ex), + VIRTIO_FEATURES_PR(vdev->host_features_ex)); + return -1; } if (!virtio_device_started(vdev, vdev->status) && |