From 57ac646a2ecb2967c46febcfd0f40d396868b4dc Mon Sep 17 00:00:00 2001
From: CLEMENT MATHIEU--DRIF
Date: Mon, 1 Sep 2025 11:17:20 +0000
Subject: intel_iommu: Bypass barrier wait descriptor

A wait descriptor with SW=0, IF=0, FN=1 must not be considered an
invalid descriptor, as it is used to implement section 7.10 of the
VT-d spec.

Signed-off-by: Clement Mathieu--Drif
Reviewed-by: Michael S. Tsirkin
Message-ID: <20250901111630.1018573-3-clement.mathieu--drif@eviden.com>
Signed-off-by: Michael S. Tsirkin
---
 hw/i386/intel_iommu.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 83c5e44413..4e7ad3a290 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -2857,7 +2857,13 @@ static bool vtd_process_wait_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc)
         vtd_generate_completion_event(s);
     }
 
-    if (!(inv_desc->lo & (VTD_INV_DESC_WAIT_IF | VTD_INV_DESC_WAIT_SW))) {
+    /*
+     * SW=0, IF=0, FN=1 is also a valid descriptor (VT-d 7.10).
+     * Nothing to do, as we process the descriptors in order.
+     */
+
+    if (!(inv_desc->lo & (VTD_INV_DESC_WAIT_IF | VTD_INV_DESC_WAIT_SW |
+                          VTD_INV_DESC_WAIT_FN))) {
         error_report_once("%s: invalid wait desc: hi=%"PRIx64", lo=%"PRIx64
                           " (unknown type)", __func__, inv_desc->hi,
                           inv_desc->lo);
--
cgit 1.4.1
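The rule this patch encodes is easiest to see as a check over the three flag bits of an invalidation wait descriptor: a descriptor with SW, IF and FN all clear requests no action and is malformed, while any other combination is valid, including the FN-only barrier used by the drain flow of section 7.10. A minimal sketch of that check, assuming the usual VT-d flag positions (IF at bit 4, SW at bit 5, FN at bit 6) rather than QEMU's actual macro values:

#include <stdbool.h>
#include <stdint.h>

/* Assumed flag positions in the wait descriptor's low 64 bits */
#define INV_DESC_WAIT_IF  (1ULL << 4)   /* Interrupt Flag */
#define INV_DESC_WAIT_SW  (1ULL << 5)   /* Status Write */
#define INV_DESC_WAIT_FN  (1ULL << 6)   /* Fence */

/*
 * A wait descriptor must request at least one action: raise an
 * interrupt, write a status value, or act as an ordering barrier
 * (fence). SW=0, IF=0, FN=1 is therefore valid, which is exactly the
 * case the patch stops rejecting.
 */
static bool wait_desc_is_valid(uint64_t lo)
{
    return (lo & (INV_DESC_WAIT_IF | INV_DESC_WAIT_SW |
                  INV_DESC_WAIT_FN)) != 0;
}

Because QEMU processes invalidation descriptors strictly in order, accepting the FN-only barrier requires no further action, which is why the new comment in the hunk says there is nothing to do.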
From b84e1e0730f3063fc453ce2d0a77136f101467a2 Mon Sep 17 00:00:00 2001
From: CLEMENT MATHIEU--DRIF
Date: Mon, 1 Sep 2025 11:17:23 +0000
Subject: intel_iommu: Declare registers for PRI

Signed-off-by: Clement Mathieu--Drif
Reviewed-by: Michael S. Tsirkin
Message-ID: <20250901111630.1018573-5-clement.mathieu--drif@eviden.com>
Signed-off-by: Michael S. Tsirkin
---
 hw/i386/intel_iommu.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 4e7ad3a290..d952ec1428 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3386,6 +3386,27 @@ static void vtd_handle_iectl_write(IntelIOMMUState *s)
     }
 }
 
+static void vtd_handle_prs_write(IntelIOMMUState *s)
+{
+    uint32_t prs = vtd_get_long_raw(s, DMAR_PRS_REG);
+    if (!(prs & VTD_PR_STATUS_PPR) && !(prs & VTD_PR_STATUS_PRO)) {
+        vtd_set_clear_mask_long(s, DMAR_PECTL_REG, VTD_PR_PECTL_IP, 0);
+    }
+}
+
+static void vtd_handle_pectl_write(IntelIOMMUState *s)
+{
+    uint32_t pectl = vtd_get_long_raw(s, DMAR_PECTL_REG);
+    if ((pectl & VTD_PR_PECTL_IP) && !(pectl & VTD_PR_PECTL_IM)) {
+        /*
+         * If IP field was 1 when software clears the IM field,
+         * the interrupt is generated along with clearing the IP field.
+         */
+        vtd_set_clear_mask_long(s, DMAR_PECTL_REG, VTD_PR_PECTL_IP, 0);
+        vtd_generate_interrupt(s, DMAR_PEADDR_REG, DMAR_PEDATA_REG);
+    }
+}
+
 static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size)
 {
     IntelIOMMUState *s = opaque;
@@ -3428,6 +3449,11 @@ static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size)
         val = s->iq >> 32;
         break;
 
+    case DMAR_PEUADDR_REG:
+        assert(size == 4);
+        val = vtd_get_long_raw(s, DMAR_PEUADDR_REG);
+        break;
+
     default:
         if (size == 4) {
             val = vtd_get_long(s, addr);
@@ -3491,6 +3517,11 @@ static void vtd_mem_write(void *opaque, hwaddr addr,
         vtd_handle_iotlb_write(s);
         break;
 
+    case DMAR_PEUADDR_REG:
+        assert(size == 4);
+        vtd_set_long(s, addr, val);
+        break;
+
     /* Invalidate Address Register, 64-bit */
     case DMAR_IVA_REG:
         if (size == 4) {
@@ -3671,6 +3702,18 @@ static void vtd_mem_write(void *opaque, hwaddr addr,
         vtd_set_long(s, addr, val);
         break;
 
+    case DMAR_PRS_REG:
+        assert(size == 4);
+        vtd_set_long(s, addr, val);
+        vtd_handle_prs_write(s);
+        break;
+
+    case DMAR_PECTL_REG:
+        assert(size == 4);
+        vtd_set_long(s, addr, val);
+        vtd_handle_pectl_write(s);
+        break;
+
     default:
         if (size == 4) {
             vtd_set_long(s, addr, val);
@@ -4722,6 +4765,18 @@ static void vtd_init(IntelIOMMUState *s)
      * Interrupt remapping registers.
      */
     vtd_define_quad(s, DMAR_IRTA_REG, 0, 0xfffffffffffff80fULL, 0);
+
+    /* Page request registers */
+    if (s->ecap & VTD_ECAP_PRS) {
+        vtd_define_quad(s, DMAR_PQH_REG, 0, 0x7ffe0ULL, 0);
+        vtd_define_quad(s, DMAR_PQT_REG, 0, 0x7ffe0ULL, 0);
+        vtd_define_quad(s, DMAR_PQA_REG, 0, 0xfffffffffffff007ULL, 0);
+        vtd_define_long(s, DMAR_PRS_REG, 0, 0, 0x3UL);
+        vtd_define_long(s, DMAR_PECTL_REG, 0, 0x80000000UL, 0);
+        vtd_define_long(s, DMAR_PEDATA_REG, 0, 0xffffUL, 0);
+        vtd_define_long(s, DMAR_PEADDR_REG, 0, 0xfffffffcUL, 0);
+        vtd_define_long(s, DMAR_PEUADDR_REG, 0, 0xffffffffUL, 0);
+    }
 }
 
 /* Should not reset address_spaces when reset because devices will still use
--
cgit 1.4.1
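The PRS/PECTL handlers above implement the standard status/control interrupt-posting protocol: the IP bit latches in DMAR_PECTL_REG while a page-request condition is pending, and clearing IM releases the pending interrupt. The register definitions in the last hunk pin down the layout (a write-1-to-clear mask of 0x3 on PRS, a single writable bit 31 on PECTL). A rough guest-side counterpart of the handshake, where rd32(), wr32() and drain_page_request_queue() are hypothetical helpers and the exact PPR/PRO bit assignment within that 0x3 mask is an assumption:

#include <stdint.h>

#define DMAR_PRS_REG   0xdc             /* assumed register offset */
#define PR_STATUS_PPR  (1u << 0)        /* Pending Page Request (assumed) */
#define PR_STATUS_PRO  (1u << 1)        /* Page Request Overflow (assumed) */

extern uint32_t rd32(unsigned reg);     /* hypothetical MMIO accessors */
extern void wr32(unsigned reg, uint32_t val);
extern void drain_page_request_queue(void);  /* hypothetical PQH..PQT walk */

/* Hypothetical guest-side handler for a page request event */
static void handle_page_request_event(void)
{
    uint32_t prs = rd32(DMAR_PRS_REG);

    if (prs & PR_STATUS_PPR) {
        drain_page_request_queue();
        wr32(DMAR_PRS_REG, PR_STATUS_PPR);  /* write 1 to clear */
    }
    if (prs & PR_STATUS_PRO) {
        wr32(DMAR_PRS_REG, PR_STATUS_PRO);  /* clear the overflow condition */
    }
    /*
     * Once both bits are clear, vtd_handle_prs_write() clears IP in
     * DMAR_PECTL_REG, re-arming the event for the next page request.
     */
}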
From 676757e50ce0ce206969b02dc2433c9e825e55f5 Mon Sep 17 00:00:00 2001
From: CLEMENT MATHIEU--DRIF
Date: Mon, 1 Sep 2025 11:17:24 +0000
Subject: intel_iommu: Add PRI operations support

Implement the PRI callbacks in vtd_iommu_ops.

Signed-off-by: Clement Mathieu--Drif
Reviewed-by: Michael S. Tsirkin
Message-ID: <20250901111630.1018573-6-clement.mathieu--drif@eviden.com>
Signed-off-by: Michael S. Tsirkin
---
 hw/i386/intel_iommu.c          | 274 +++++++++++++++++++++++++++++++++++++++++
 hw/i386/intel_iommu_internal.h |   2 +
 include/hw/i386/intel_iommu.h  |   1 +
 3 files changed, 277 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index d952ec1428..2cc9bd5e45 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -45,6 +45,8 @@
      ((ce)->val[1] & VTD_SM_CONTEXT_ENTRY_RID2PASID_MASK)
 #define VTD_CE_GET_PASID_DIR_TABLE(ce) \
     ((ce)->val[0] & VTD_PASID_DIR_BASE_ADDR_MASK)
+#define VTD_CE_GET_PRE(ce) \
+    ((ce)->val[0] & VTD_SM_CONTEXT_ENTRY_PRE)
 
 /* pe operations */
 #define VTD_PE_GET_TYPE(pe) ((pe)->val[0] & VTD_SM_PASID_ENTRY_PGTT)
@@ -1838,6 +1840,7 @@ static const bool vtd_qualified_faults[] = {
     [VTD_FR_FS_NON_CANONICAL] = true,
     [VTD_FR_FS_PAGING_ENTRY_US] = true,
     [VTD_FR_SM_WRITE] = true,
+    [VTD_FR_SM_PRE_ABS] = true,
     [VTD_FR_SM_INTERRUPT_ADDR] = true,
     [VTD_FR_FS_BIT_UPDATE_FAILED] = true,
     [VTD_FR_MAX] = false,
@@ -3152,6 +3155,59 @@ static bool vtd_process_device_piotlb_desc(IntelIOMMUState *s,
     return true;
 }
 
+static bool vtd_process_page_group_response_desc(IntelIOMMUState *s,
+                                                 VTDInvDesc *inv_desc)
+{
+    VTDAddressSpace *vtd_dev_as;
+    bool pasid_present;
+    uint8_t response_code;
+    uint16_t rid;
+    uint32_t pasid;
+    uint16_t prgi;
+    IOMMUPRIResponse response;
+
+    if ((inv_desc->lo & VTD_INV_DESC_PGRESP_RSVD_LO) ||
+        (inv_desc->hi & VTD_INV_DESC_PGRESP_RSVD_HI)) {
+        error_report_once("%s: invalid page group response desc: hi=%"PRIx64
+                          ", lo=%"PRIx64" (reserved nonzero)", __func__,
+                          inv_desc->hi, inv_desc->lo);
+        return false;
+    }
+
+    pasid_present = VTD_INV_DESC_PGRESP_PP(inv_desc->lo);
+    response_code = VTD_INV_DESC_PGRESP_RC(inv_desc->lo);
+    rid = VTD_INV_DESC_PGRESP_RID(inv_desc->lo);
+    pasid = VTD_INV_DESC_PGRESP_PASID(inv_desc->lo);
+    prgi = VTD_INV_DESC_PGRESP_PRGI(inv_desc->hi);
+
+    if (!pasid_present) {
+        error_report_once("Page group response without PASID is "
+                          "not supported yet");
+        return false;
+    }
+
+    vtd_dev_as = vtd_get_as_by_sid_and_pasid(s, rid, pasid);
+    if (!vtd_dev_as) {
+        return true;
+    }
+
+    response.prgi = prgi;
+
+    if (response_code == 0x0u) {
+        response.response_code = IOMMU_PRI_RESP_SUCCESS;
+    } else if (response_code == 0x1u) {
+        response.response_code = IOMMU_PRI_RESP_INVALID_REQUEST;
+    } else {
+        response.response_code = IOMMU_PRI_RESP_FAILURE;
+    }
+
+    if (vtd_dev_as->pri_notifier) {
+        vtd_dev_as->pri_notifier->notify(vtd_dev_as->pri_notifier, &response);
+    }
+
+    return true;
+}
+
 static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s,
                                           VTDInvDesc *inv_desc)
 {
@@ -3252,6 +3308,13 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s)
         }
         break;
 
+    case VTD_INV_DESC_PGRESP:
+        trace_vtd_inv_desc("page group response", inv_desc.hi, inv_desc.lo);
+        if (!vtd_process_page_group_response_desc(s, &inv_desc)) {
+            return false;
+        }
+        break;
+
     /*
      * TODO: the entity of below two cases will be implemented in future series.
      * To make guest (which integrates scalable mode support patch set in
@@ -4864,6 +4927,194 @@ static ssize_t vtd_ats_request_translation(PCIBus *bus, void *opaque,
     return res_index;
 }
 
+/* 11.4.11.3: The number of entries in the page request queue is 2^(PQS + 7) */
+static inline uint64_t vtd_prq_size(IntelIOMMUState *s)
+{
+    return 1ULL << ((vtd_get_quad(s, DMAR_PQA_REG) & VTD_PQA_SIZE) + 7);
+}
+
+/**
+ * Return true if the bit is accessible and correctly set, false otherwise
+ */
+static bool vtd_check_pre_bit(VTDAddressSpace *vtd_as, hwaddr addr,
+                              uint16_t sid, bool is_write)
+{
+    int ret;
+    IntelIOMMUState *s = vtd_as->iommu_state;
+    uint8_t bus_n = pci_bus_num(vtd_as->bus);
+    VTDContextEntry ce;
+    bool is_fpd_set = false;
+
+    ret = vtd_dev_to_context_entry(s, bus_n, vtd_as->devfn, &ce);
+
+    if (ret) {
+        goto error_report;
+    }
+
+    if (!VTD_CE_GET_PRE(&ce)) {
+        ret = -VTD_FR_SM_PRE_ABS;
+        goto error_get_fpd_and_report;
+    }
+
+    return true;
+
+error_get_fpd_and_report:
+    /* Try to get fpd (may not work but we are already on an error path) */
+    is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD;
+    vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set, vtd_as->pasid);
+error_report:
+    vtd_report_fault(s, -ret, is_fpd_set, sid, addr, is_write,
+                     vtd_as->pasid != PCI_NO_PASID, vtd_as->pasid);
+    return false;
+}
+
+/* Logic described in section 7.5 */
+static void vtd_generate_page_request_event(IntelIOMMUState *s,
+                                            uint32_t old_pr_status)
+{
+    uint32_t current_pectl = vtd_get_long(s, DMAR_PECTL_REG);
+    /*
+     * Hardware evaluates the PPR and PRO fields in the Page Request Status
+     * Register; if either of them is set, the Page Request Event is not
+     * generated.
+     */
+    if (old_pr_status & (VTD_PR_STATUS_PRO | VTD_PR_STATUS_PPR)) {
+        return;
+    }
+
+    vtd_set_clear_mask_long(s, DMAR_PECTL_REG, 0, VTD_PR_PECTL_IP);
+    if (!(current_pectl & VTD_PR_PECTL_IM)) {
+        vtd_set_clear_mask_long(s, DMAR_PECTL_REG, VTD_PR_PECTL_IP, 0);
+        vtd_generate_interrupt(s, DMAR_PEADDR_REG, DMAR_PEDATA_REG);
+    }
+}
+
+/* When calling this function, we know that we are in scalable mode */
+static int vtd_pri_perform_implicit_invalidation(VTDAddressSpace *vtd_as,
+                                                 hwaddr addr)
+{
+    IntelIOMMUState *s = vtd_as->iommu_state;
+    VTDContextEntry ce;
+    VTDPASIDEntry pe;
+    uint16_t pgtt;
+    uint16_t domain_id;
+    int ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
+                                       vtd_as->devfn, &ce);
+    if (ret) {
+        return -EINVAL;
+    }
+    ret = vtd_ce_get_rid2pasid_entry(s, &ce, &pe, vtd_as->pasid);
+    if (ret) {
+        return -EINVAL;
+    }
+    pgtt = VTD_PE_GET_TYPE(&pe);
+    domain_id = VTD_SM_PASID_ENTRY_DID(pe.val[1]);
+    ret = 0;
+    switch (pgtt) {
+    case VTD_SM_PASID_ENTRY_FLT:
+        vtd_piotlb_page_invalidate(s, domain_id, vtd_as->pasid, addr, 0);
+        break;
+    /* Room for other pgtt values */
+    default:
+        error_report_once("Translation type not supported yet: %d", pgtt);
+        ret = -EINVAL;
+        break;
+    }
+
+    return ret;
+}
+
+/* Page Request Descriptor: 7.4.1.1 */
+static int vtd_pri_request_page(PCIBus *bus, void *opaque, int devfn,
+                                uint32_t pasid, bool priv_req, bool exec_req,
+                                hwaddr addr, bool lpig, uint16_t prgi,
+                                bool is_read, bool is_write)
+{
+    IntelIOMMUState *s = opaque;
+    VTDAddressSpace *vtd_as;
+
+    vtd_as = vtd_find_add_as(s, bus, devfn, pasid);
+
+    uint64_t queue_addr_reg = vtd_get_quad(s, DMAR_PQA_REG);
+    uint64_t queue_tail_offset_reg = vtd_get_quad(s, DMAR_PQT_REG);
+    uint64_t new_queue_tail_offset = (
+        (queue_tail_offset_reg + VTD_PQA_ENTRY_SIZE) %
+        (vtd_prq_size(s) * VTD_PQA_ENTRY_SIZE));
+    uint64_t queue_head_offset_reg = vtd_get_quad(s, DMAR_PQH_REG);
+    hwaddr queue_tail = (queue_addr_reg & VTD_PQA_ADDR) + queue_tail_offset_reg;
+    uint32_t old_pr_status = vtd_get_long(s, DMAR_PRS_REG);
+    uint16_t sid = PCI_BUILD_BDF(pci_bus_num(vtd_as->bus), vtd_as->devfn);
+    VTDPRDesc desc;
+
+    if (!(s->ecap & VTD_ECAP_PRS)) {
+        return -EPERM;
+    }
+
+    /*
+     * No need to check if scalable mode is enabled, as we already know
+     * that VTD_ECAP_PRS is set (see vtd_decide_config)
+     */
+
+    /* We do not support PRI without PASID */
+    if (vtd_as->pasid == PCI_NO_PASID) {
+        return -EPERM;
+    }
+    if (exec_req && !is_read) {
+        return -EINVAL;
+    }
+
+    /* Check the PRE bit in the scalable mode context entry */
+    if (!vtd_check_pre_bit(vtd_as, addr, sid, is_write)) {
+        return -EPERM;
+    }
+
+    if (old_pr_status & VTD_PR_STATUS_PRO) {
+        /*
+         * No action is taken by hardware to report a fault
+         * or generate an event
+         */
+        return -ENOSPC;
+    }
+
+    /* Check for overflow */
+    if (new_queue_tail_offset == queue_head_offset_reg) {
+        vtd_set_clear_mask_long(s, DMAR_PRS_REG, 0, VTD_PR_STATUS_PRO);
+        vtd_generate_page_request_event(s, old_pr_status);
+        return -ENOSPC;
+    }
+
+    if (vtd_pri_perform_implicit_invalidation(vtd_as, addr)) {
+        return -EINVAL;
+    }
+
+    desc.lo = VTD_PRD_TYPE | VTD_PRD_PP(true) | VTD_PRD_RID(sid) |
+              VTD_PRD_PASID(vtd_as->pasid) | VTD_PRD_PMR(priv_req);
+    desc.hi = VTD_PRD_RDR(is_read) | VTD_PRD_WRR(is_write) |
+              VTD_PRD_LPIG(lpig) | VTD_PRD_PRGI(prgi) | VTD_PRD_ADDR(addr);
+
+    desc.lo = cpu_to_le64(desc.lo);
+    desc.hi = cpu_to_le64(desc.hi);
+    if (dma_memory_write(&address_space_memory, queue_tail, &desc, sizeof(desc),
+                         MEMTXATTRS_UNSPECIFIED)) {
+        error_report_once("IO error, the PQ tail cannot be updated");
+        return -EIO;
+    }
+
+    /* Increment the tail register and set the pending request bit */
+    vtd_set_quad(s, DMAR_PQT_REG, new_queue_tail_offset);
+    /*
+     * Read the status again so that the kernel does not miss a request.
+     * In some cases we can trigger an unnecessary interrupt, but this
+     * strategy drastically improves performance as we don't need to take
+     * a lock.
+     */
+    old_pr_status = vtd_get_long(s, DMAR_PRS_REG);
+    if (!(old_pr_status & VTD_PR_STATUS_PPR)) {
+        vtd_set_clear_mask_long(s, DMAR_PRS_REG, 0, VTD_PR_STATUS_PPR);
+        vtd_generate_page_request_event(s, old_pr_status);
+    }
+
+    return 0;
+}
+
 static void vtd_init_iotlb_notifier(PCIBus *bus, void *opaque, int devfn,
                                     IOMMUNotifier *n, IOMMUNotify fn,
                                     void *user_opaque)
@@ -4905,6 +5156,26 @@ static void vtd_unregister_iotlb_notifier(PCIBus *bus, void *opaque,
     memory_region_unregister_iommu_notifier(MEMORY_REGION(&vtd_as->iommu), n);
 }
 
+static void vtd_pri_register_notifier(PCIBus *bus, void *opaque, int devfn,
+                                      uint32_t pasid, IOMMUPRINotifier *notifier)
+{
+    IntelIOMMUState *s = opaque;
+    VTDAddressSpace *vtd_as;
+
+    vtd_as = vtd_find_add_as(s, bus, devfn, pasid);
+    vtd_as->pri_notifier = notifier;
+}
+
+static void vtd_pri_unregister_notifier(PCIBus *bus, void *opaque,
+                                        int devfn, uint32_t pasid)
+{
+    IntelIOMMUState *s = opaque;
+    VTDAddressSpace *vtd_as;
+
+    vtd_as = vtd_find_add_as(s, bus, devfn, pasid);
+    vtd_as->pri_notifier = NULL;
+}
+
 static PCIIOMMUOps vtd_iommu_ops = {
     .get_address_space = vtd_host_dma_iommu,
     .set_iommu_device = vtd_dev_set_iommu_device,
@@ -4914,6 +5185,9 @@ static PCIIOMMUOps vtd_iommu_ops = {
     .register_iotlb_notifier = vtd_register_iotlb_notifier,
     .unregister_iotlb_notifier = vtd_unregister_iotlb_notifier,
     .ats_request_translation = vtd_ats_request_translation,
+    .pri_register_notifier = vtd_pri_register_notifier,
+    .pri_unregister_notifier = vtd_pri_unregister_notifier,
+    .pri_request_page = vtd_pri_request_page,
 };
 
 static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 04a8d4c769..0d0069a612 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -315,6 +315,8 @@ typedef enum VTDFaultReason {
                                   * request while disabled */
     VTD_FR_IR_SID_ERR = 0x26,   /* Invalid Source-ID */
 
+    VTD_FR_SM_PRE_ABS = 0x47,   /* SCT.8 : PRE bit in a present SM CE is 0 */
+
     /* PASID directory entry access failure */
     VTD_FR_PASID_DIR_ACCESS_ERR = 0x50,
     /* The Present(P) field of pasid directory entry is 0 */
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index e95477e855..47730ac3c7 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -110,6 +110,7 @@ struct VTDAddressSpace {
     QLIST_ENTRY(VTDAddressSpace) next;
     /* Superset of notifier flags that this address space has */
     IOMMUNotifierFlag notifier_flags;
+    IOMMUPRINotifier *pri_notifier;
     /*
      * @iova_tree traces mapped IOVA ranges.
      *
--
cgit 1.4.1
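Taken together, the three callbacks wired into vtd_iommu_ops give a device model a complete page-fault round trip: register an IOMMUPRINotifier, post a request with pri_request_page(), and receive the guest's page-group response through the notifier once the driver answers with a page group response descriptor. A condensed sketch of a caller, using only the callback signatures and types visible in the diff above; it assumes QEMU's internal headers, `ops` stands for the PCIIOMMUOps the platform exposes, and the retry/abort helpers are hypothetical:

/* Hypothetical device-model glue around the new PRI callbacks */
static void dev_pri_response_cb(IOMMUPRINotifier *n, IOMMUPRIResponse *resp)
{
    if (resp->response_code == IOMMU_PRI_RESP_SUCCESS) {
        dev_retry_dma(resp->prgi);      /* hypothetical: re-issue the access */
    } else {
        dev_abort_dma(resp->prgi);      /* hypothetical */
    }
}

static IOMMUPRINotifier dev_pri_notifier = { .notify = dev_pri_response_cb };

static int dev_handle_translation_fault(PCIIOMMUOps *ops, PCIBus *bus,
                                        void *iommu_opaque, int devfn,
                                        uint32_t pasid, hwaddr addr,
                                        uint16_t prgi)
{
    ops->pri_register_notifier(bus, iommu_opaque, devfn, pasid,
                               &dev_pri_notifier);

    /* Last request in the group (lpig), read+write, neither priv nor exec */
    return ops->pri_request_page(bus, iommu_opaque, devfn, pasid,
                                 false /* priv_req */, false /* exec_req */,
                                 addr, true /* lpig */, prgi,
                                 true /* is_read */, true /* is_write */);
}

A -ENOSPC return maps to the overflow path in vtd_pri_request_page(): the PRO bit has been set, and the device must back off until the guest clears it.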
From b6b49c2cd6c2be8aac80f7563950398c7128f19e Mon Sep 17 00:00:00 2001
From: Joao Martins
Date: Fri, 19 Sep 2025 21:35:14 +0000
Subject: intel-iommu: Move dma_translation to x86-iommu

To be reused later by AMD, which shares a similar property.

Signed-off-by: Joao Martins
Signed-off-by: Alejandro Jimenez
Reviewed-by: Michael S. Tsirkin
Message-ID: <20250919213515.917111-22-alejandro.j.jimenez@oracle.com>
Signed-off-by: Michael S. Tsirkin
---
 hw/i386/intel_iommu.c       | 5 ++---
 hw/i386/x86-iommu.c         | 1 +
 include/hw/i386/x86-iommu.h | 1 +
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 2cc9bd5e45..fa2ad9c2eb 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -2704,7 +2704,7 @@ static void vtd_handle_gcmd_write(IntelIOMMUState *s)
     uint32_t changed = status ^ val;
 
     trace_vtd_reg_write_gcmd(status, val);
-    if ((changed & VTD_GCMD_TE) && s->dma_translation) {
+    if ((changed & VTD_GCMD_TE) && x86_iommu->dma_translation) {
         /* Translation enable/disable */
         vtd_handle_gcmd_te(s, val & VTD_GCMD_TE);
     }
@@ -3947,7 +3947,6 @@ static const Property vtd_properties[] = {
     DEFINE_PROP_BOOL("snoop-control", IntelIOMMUState, snoop_control, false),
     DEFINE_PROP_BOOL("x-pasid-mode", IntelIOMMUState, pasid, false),
     DEFINE_PROP_BOOL("dma-drain", IntelIOMMUState, dma_drain, true),
-    DEFINE_PROP_BOOL("dma-translation", IntelIOMMUState, dma_translation, true),
     DEFINE_PROP_BOOL("stale-tm", IntelIOMMUState, stale_tm, false),
     DEFINE_PROP_BOOL("fs1gp", IntelIOMMUState, fs1gp, true),
 };
@@ -4665,7 +4664,7 @@ static void vtd_cap_init(IntelIOMMUState *s)
     if (s->dma_drain) {
         s->cap |= VTD_CAP_DRAIN;
     }
-    if (s->dma_translation) {
+    if (x86_iommu->dma_translation) {
         if (s->aw_bits >= VTD_HOST_AW_39BIT) {
             s->cap |= VTD_CAP_SAGAW_39bit;
         }
diff --git a/hw/i386/x86-iommu.c b/hw/i386/x86-iommu.c
index d34a6849f4..c127a44bb4 100644
--- a/hw/i386/x86-iommu.c
+++ b/hw/i386/x86-iommu.c
@@ -130,6 +130,7 @@ static const Property x86_iommu_properties[] = {
                      intr_supported, ON_OFF_AUTO_AUTO),
     DEFINE_PROP_BOOL("device-iotlb", X86IOMMUState, dt_supported, false),
     DEFINE_PROP_BOOL("pt", X86IOMMUState, pt_supported, true),
+    DEFINE_PROP_BOOL("dma-translation", X86IOMMUState, dma_translation, true),
 };
 
 static void x86_iommu_class_init(ObjectClass *klass, const void *data)
diff --git a/include/hw/i386/x86-iommu.h b/include/hw/i386/x86-iommu.h
index bfd21649d0..e89f55a5c2 100644
--- a/include/hw/i386/x86-iommu.h
+++ b/include/hw/i386/x86-iommu.h
@@ -64,6 +64,7 @@ struct X86IOMMUState {
     OnOffAuto intr_supported;   /* Whether vIOMMU supports IR */
     bool dt_supported;          /* Whether vIOMMU supports DT */
     bool pt_supported;          /* Whether vIOMMU supports pass-through */
+    bool dma_translation;       /* Whether vIOMMU supports DMA translation */
     QLIST_HEAD(, IEC_Notifier) iec_notifiers; /* IEC notify list */
 };
--
cgit 1.4.1
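Because the property only moves from the intel-iommu device to the common x86-iommu base class, its command-line spelling for intel-iommu is unchanged; an illustrative invocation (not taken from the patch) that disables DMA remapping while keeping interrupt remapping would be:

qemu-system-x86_64 -machine q35 \
    -device intel-iommu,intremap=on,dma-translation=off

The same "dma-translation" property name becomes available to an AMD IOMMU user of the shared field once it starts reading x86_iommu->dma_translation.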
From 0305eceecdf100b98e4f230bf5a821621815a026 Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan
Date: Sun, 28 Sep 2025 23:42:04 -0400
Subject: intel_iommu: Enable Enhanced Set Root Table Pointer Support (ESRTPS)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

According to VT-d spec rev 4.1, section 6.6:

"For implementations reporting the Enhanced Set Root Table Pointer
Support (ESRTPS) field as Clear, on a 'Set Root Table Pointer'
operation, software must perform a global invalidate of the context
cache, PASID-cache (if applicable), and IOTLB, in that order. This is
required to ensure hardware references only the remapping structures
referenced by the new root table pointer and not stale cached entries.
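The guest-visible effect is that a driver honoring ESRTPS can skip the three global invalidations after programming the root table pointer. A sketch of that guest-driver path, using the bit position from the hunk above (bit 63); the cap-read and invalidation helpers are hypothetical:

#include <stdint.h>

#define CAP_ESRTPS (1ULL << 63)         /* same bit as VTD_CAP_ESRTPS above */

extern uint64_t dmar_read_cap(void);    /* hypothetical CAP_REG read */
extern void global_invalidate_context_cache(void);  /* hypothetical helpers */
extern void global_invalidate_pasid_cache(void);
extern void global_invalidate_iotlb(void);

/* Hypothetical guest-driver path after a Set Root Table Pointer operation */
static void after_set_root_table_pointer(void)
{
    if (!(dmar_read_cap() & CAP_ESRTPS)) {
        /* ESRTPS clear: VT-d 6.6 requires these, in this order */
        global_invalidate_context_cache();
        global_invalidate_pasid_cache();    /* if PASIDs are in use */
        global_invalidate_iotlb();
    }
    /* ESRTPS set: hardware already invalidated all remapping caches */
}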
For implementations reporting the Enhanced Set Root Table Pointer
Support (ESRTPS) field as Set, as part of 'Set Root Table Pointer'
operation, hardware performs global invalidation on all DMA remapping
translation caches and hence software is not required to perform
additional invalidations."

We already implement the ESRTPS behavior in vtd_handle_gcmd_srtp() by
calling vtd_reset_caches(); just set ESRTPS in DMAR_CAP_REG to avoid
unnecessary global invalidation requests of the context cache,
PASID-cache and IOTLB from the guest.

This change doesn't impact migration, as the content of DMAR_CAP_REG
is migrated too.

Signed-off-by: Zhenzhong Duan
Reviewed-by: Clément Mathieu--Drif
Reviewed-by: Michael S. Tsirkin
Message-ID: <20250929034206.439266-2-zhenzhong.duan@intel.com>
Signed-off-by: Michael S. Tsirkin
---
 hw/i386/intel_iommu.c          | 2 +-
 hw/i386/intel_iommu_internal.h | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index fa2ad9c2eb..0378551038 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -4660,7 +4660,7 @@ static void vtd_cap_init(IntelIOMMUState *s)
 
     s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND |
              VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS |
-             VTD_CAP_MGAW(s->aw_bits);
+             VTD_CAP_ESRTPS | VTD_CAP_MGAW(s->aw_bits);
     if (s->dma_drain) {
         s->cap |= VTD_CAP_DRAIN;
     }
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 0d0069a612..0f6a1237e4 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -215,6 +215,7 @@
 #define VTD_CAP_DRAIN_WRITE         (1ULL << 54)
 #define VTD_CAP_DRAIN_READ          (1ULL << 55)
 #define VTD_CAP_FS1GP               (1ULL << 56)
+#define VTD_CAP_ESRTPS              (1ULL << 63)
 #define VTD_CAP_DRAIN               (VTD_CAP_DRAIN_READ | VTD_CAP_DRAIN_WRITE)
 #define VTD_CAP_CM                  (1ULL << 7)
 #define VTD_PASID_ID_SHIFT          20
--
cgit 1.4.1
From 3cda33e06e72412512c0f376a5261e0ff6ebe73f Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan
Date: Sun, 28 Sep 2025 23:42:05 -0400
Subject: intel_iommu: Simplify caching mode check with VFIO device

In the early days, we had different tricks to ensure caching-mode=on
with a VFIO device:

28cf553afe ("intel_iommu: Sanity check vfio-pci config on machine init done")
c6cbc29d36 ("pc/q35: Disallow vfio-pci hotplug without VT-d caching mode")

There is also a patch with the same purpose, but for VDPA devices:

b8d78277c0 ("intel-iommu: fail MAP notifier without caching mode")

All of these exist because, without caching mode, the MAP notifier
won't work correctly: the guest won't send IOTLB update events when it
establishes new mappings in the I/O page tables.

Now, with the host IOMMU device interface between VFIO and the vIOMMU,
we can simplify the first two commits above with a small check in
set_iommu_device(). This also works for a future IOMMUFD-backed VDPA
implementation, which may also need caching mode on. But for legacy
VDPA we still need commit b8d78277c0, as it doesn't use the host IOMMU
device interface.

For a coldplugged VFIO device:

qemu-system-x86_64: -device vfio-pci,host=0000:3b:00.0,id=hostdev3,bus=root0,iommufd=iommufd0: vfio 0000:3b:00.0: Failed to set vIOMMU: Device assignment is not allowed without enabling caching-mode=on for Intel IOMMU.

For a hotplugged VFIO device, if "iommu=off" is configured in the
guest:

Error: vfio 0000:3b:00.0: Failed to set vIOMMU: Device assignment is not allowed without enabling caching-mode=on for Intel IOMMU.

else:

Error: vfio 0000:3b:00.0: memory listener initialization failed: Region vtd-00.0-dmar: device 01.00.0 requires caching mode: Operation not supported

The hotplug case differs because the check from commit b8d78277c0 runs
before the check in set_iommu_device().

Signed-off-by: Zhenzhong Duan
Reviewed-by: Michael S. Tsirkin
Message-ID: <20250929034206.439266-3-zhenzhong.duan@intel.com>
Signed-off-by: Michael S. Tsirkin
---
 hw/i386/intel_iommu.c | 40 ++++++----------------------------------
 hw/i386/pc.c          | 20 --------------------
 2 files changed, 6 insertions(+), 54 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 0378551038..6a168d5107 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -87,13 +87,6 @@ struct vtd_iotlb_key {
 static void vtd_address_space_refresh_all(IntelIOMMUState *s);
 static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n);
 
-static void vtd_panic_require_caching_mode(void)
-{
-    error_report("We need to set caching-mode=on for intel-iommu to enable "
-                 "device assignment with IOMMU protection.");
-    exit(1);
-}
-
 static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val,
                             uint64_t wmask, uint64_t w1cmask)
 {
@@ -4489,6 +4482,12 @@ static bool vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn,
 
     assert(hiod);
 
+    if (!s->caching_mode) {
+        error_setg(errp, "Device assignment is not allowed without enabling "
+                   "caching-mode=on for Intel IOMMU.");
+        return false;
+    }
+
     vtd_iommu_lock(s);
 
     if (g_hash_table_lookup(s->vtd_host_iommu_dev, &key)) {
@@ -5244,32 +5243,6 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
     return true;
 }
 
-static int vtd_machine_done_notify_one(Object *child, void *unused)
-{
-    IntelIOMMUState *iommu = INTEL_IOMMU_DEVICE(x86_iommu_get_default());
-
-    /*
-     * We hard-coded here because vfio-pci is the only special case
-     * here. Let's be more elegant in the future when we can, but so
-     * far there seems to be no better way.
-     */
-    if (object_dynamic_cast(child, "vfio-pci") && !iommu->caching_mode) {
-        vtd_panic_require_caching_mode();
-    }
-
-    return 0;
-}
-
-static void vtd_machine_done_hook(Notifier *notifier, void *unused)
-{
-    object_child_foreach_recursive(object_get_root(),
-                                   vtd_machine_done_notify_one, NULL);
-}
-
-static Notifier vtd_machine_done_notify = {
-    .notify = vtd_machine_done_hook,
-};
-
 static void vtd_realize(DeviceState *dev, Error **errp)
 {
     MachineState *ms = MACHINE(qdev_get_machine());
@@ -5324,7 +5297,6 @@ static void vtd_realize(DeviceState *dev, Error **errp)
     pci_setup_iommu(bus, &vtd_iommu_ops, dev);
     /* Pseudo address space under root PCI bus. */
     x86ms->ioapic_as = vtd_host_dma_iommu(bus, s, Q35_PSEUDO_DEVFN_IOAPIC);
-    qemu_add_machine_init_done_notifier(&vtd_machine_done_notify);
 }
 
 static void vtd_class_init(ObjectClass *klass, const void *data)
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index eb36d50589..34b00663f2 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1722,25 +1722,6 @@ static void pc_machine_wakeup(MachineState *machine)
     cpu_synchronize_all_post_reset();
 }
 
-static bool pc_hotplug_allowed(MachineState *ms, DeviceState *dev, Error **errp)
-{
-    X86IOMMUState *iommu = x86_iommu_get_default();
-    IntelIOMMUState *intel_iommu;
-
-    if (iommu &&
-        object_dynamic_cast((Object *)iommu, TYPE_INTEL_IOMMU_DEVICE) &&
-        object_dynamic_cast((Object *)dev, "vfio-pci")) {
-        intel_iommu = INTEL_IOMMU_DEVICE(iommu);
-        if (!intel_iommu->caching_mode) {
-            error_setg(errp, "Device assignment is not allowed without "
-                       "enabling caching-mode=on for Intel IOMMU.");
-            return false;
-        }
-    }
-
-    return true;
-}
-
 static void pc_machine_class_init(ObjectClass *oc, const void *data)
 {
     MachineClass *mc = MACHINE_CLASS(oc);
@@ -1760,7 +1741,6 @@ static void pc_machine_class_init(ObjectClass *oc, const void *data)
     x86mc->apic_xrupt_override = true;
     assert(!mc->get_hotplug_handler);
     mc->get_hotplug_handler = pc_get_hotplug_handler;
-    mc->hotplug_allowed = pc_hotplug_allowed;
     mc->auto_enable_numa_with_memhp = true;
     mc->auto_enable_numa_with_memdev = true;
     mc->has_hotpluggable_cpus = true;
--
cgit 1.4.1
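With the check now living in vtd_dev_set_iommu_device(), the supported configuration is simply to enable caching mode on the vIOMMU. An illustrative invocation modeled on the commit message's own example (the host BDF and iommufd object id are taken from it; the rest of a real command line is elided):

qemu-system-x86_64 -machine q35 \
    -device intel-iommu,caching-mode=on \
    -object iommufd,id=iommufd0 \
    -device vfio-pci,host=0000:3b:00.0,iommufd=iommufd0

Omitting caching-mode=on now fails cleanly at device-set time instead of panicking the whole VM at machine-init-done, which is the behavioral improvement this patch buys beyond the code deletion.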