diff options
Diffstat (limited to 'hw')
| -rw-r--r-- | hw/arm/virt.c | 9 | ||||
| -rw-r--r-- | hw/core/machine.c | 3 | ||||
| -rw-r--r-- | hw/i386/amd_iommu.c | 2 | ||||
| -rw-r--r-- | hw/i386/intel_iommu.c | 2 | ||||
| -rw-r--r-- | hw/i386/pc.c | 3 | ||||
| -rw-r--r-- | hw/i386/pc_piix.c | 15 | ||||
| -rw-r--r-- | hw/i386/pc_q35.c | 13 | ||||
| -rw-r--r-- | hw/pci/pci.c | 2 | ||||
| -rw-r--r-- | hw/ppc/Makefile.objs | 2 | ||||
| -rw-r--r-- | hw/ppc/prep.c | 1 | ||||
| -rw-r--r-- | hw/ppc/spapr.c | 104 | ||||
| -rw-r--r-- | hw/ppc/spapr_caps.c | 4 | ||||
| -rw-r--r-- | hw/ppc/spapr_hcall.c | 24 | ||||
| -rw-r--r-- | hw/ppc/spapr_irq.c | 44 | ||||
| -rw-r--r-- | hw/ppc/spapr_pci.c | 48 | ||||
| -rw-r--r-- | hw/ppc/spapr_pci_nvlink2.c | 450 | ||||
| -rw-r--r-- | hw/ppc/spapr_rtas.c | 2 | ||||
| -rw-r--r-- | hw/s390x/ipl.c | 61 | ||||
| -rw-r--r-- | hw/s390x/s390-ccw.c | 9 | ||||
| -rw-r--r-- | hw/s390x/s390-virtio-ccw.c | 26 | ||||
| -rw-r--r-- | hw/vfio/ccw.c | 6 | ||||
| -rw-r--r-- | hw/vfio/pci-quirks.c | 131 | ||||
| -rw-r--r-- | hw/vfio/pci.c | 14 | ||||
| -rw-r--r-- | hw/vfio/pci.h | 2 | ||||
| -rw-r--r-- | hw/vfio/spapr.c | 2 | ||||
| -rw-r--r-- | hw/vfio/trace-events | 4 |
26 files changed, 875 insertions, 108 deletions
diff --git a/hw/arm/virt.c b/hw/arm/virt.c index ce2664a30b..16ba67f7a7 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -1978,10 +1978,17 @@ static void machvirt_machine_init(void) } type_init(machvirt_machine_init); +static void virt_machine_4_1_options(MachineClass *mc) +{ +} +DEFINE_VIRT_MACHINE_AS_LATEST(4, 1) + static void virt_machine_4_0_options(MachineClass *mc) { + virt_machine_4_1_options(mc); + compat_props_add(mc->compat_props, hw_compat_4_0, hw_compat_4_0_len); } -DEFINE_VIRT_MACHINE_AS_LATEST(4, 0) +DEFINE_VIRT_MACHINE(4, 0) static void virt_machine_3_1_options(MachineClass *mc) { diff --git a/hw/core/machine.c b/hw/core/machine.c index 743fef2898..5d046a43e3 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -24,6 +24,9 @@ #include "hw/pci/pci.h" #include "hw/mem/nvdimm.h" +GlobalProperty hw_compat_4_0[] = {}; +const size_t hw_compat_4_0_len = G_N_ELEMENTS(hw_compat_4_0); + GlobalProperty hw_compat_3_1[] = { { "pcie-root-port", "x-speed", "2_5" }, { "pcie-root-port", "x-width", "1" }, diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c index 6eabdf9917..4a4e2c7fd4 100644 --- a/hw/i386/amd_iommu.c +++ b/hw/i386/amd_iommu.c @@ -1601,6 +1601,8 @@ static void amdvi_class_init(ObjectClass *klass, void* data) dc_class->int_remap = amdvi_int_remap; /* Supported by the pc-q35-* machine types */ dc->user_creatable = true; + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + dc->desc = "AMD IOMMU (AMD-Vi) DMA Remapping device"; } static const TypeInfo amdvi = { diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 2558f48fe6..44b1231157 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -3741,6 +3741,8 @@ static void vtd_class_init(ObjectClass *klass, void *data) x86_class->int_remap = vtd_int_remap; /* Supported by the pc-q35-* machine types */ dc->user_creatable = true; + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + dc->desc = "Intel IOMMU (VT-d) DMA Remapping device"; } static const TypeInfo vtd_info = { diff --git a/hw/i386/pc.c b/hw/i386/pc.c index f2c15bf1f2..d98b737b8f 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -115,6 +115,9 @@ struct hpet_fw_config hpet_cfg = {.count = UINT8_MAX}; /* Physical Address of PVH entry point read from kernel ELF NOTE */ static size_t pvh_start_addr; +GlobalProperty pc_compat_4_0[] = {}; +const size_t pc_compat_4_0_len = G_N_ELEMENTS(pc_compat_4_0); + GlobalProperty pc_compat_3_1[] = { { "intel-iommu", "dma-drain", "off" }, { "Opteron_G3" "-" TYPE_X86_CPU, "rdtscp", "off" }, diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index 8ad8e885c6..c07c4a5b38 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -428,13 +428,25 @@ static void pc_i440fx_machine_options(MachineClass *m) machine_class_allow_dynamic_sysbus_dev(m, TYPE_RAMFB_DEVICE); } -static void pc_i440fx_4_0_machine_options(MachineClass *m) +static void pc_i440fx_4_1_machine_options(MachineClass *m) { pc_i440fx_machine_options(m); m->alias = "pc"; m->is_default = 1; } +DEFINE_I440FX_MACHINE(v4_1, "pc-i440fx-4.1", NULL, + pc_i440fx_4_1_machine_options); + +static void pc_i440fx_4_0_machine_options(MachineClass *m) +{ + pc_i440fx_4_1_machine_options(m); + m->alias = NULL; + m->is_default = 0; + compat_props_add(m->compat_props, hw_compat_4_0, hw_compat_4_0_len); + compat_props_add(m->compat_props, pc_compat_4_0, pc_compat_4_0_len); +} + DEFINE_I440FX_MACHINE(v4_0, "pc-i440fx-4.0", NULL, pc_i440fx_4_0_machine_options); @@ -911,6 +923,7 @@ static void isa_bridge_class_init(ObjectClass *klass, void *data) PCIDeviceClass *k = PCI_DEVICE_CLASS(klass); dc->desc = "ISA bridge faked to support IGD PT"; + set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories); k->vendor_id = PCI_VENDOR_ID_INTEL; k->class_id = PCI_CLASS_BRIDGE_ISA; }; diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index 372c6b73be..37dd350511 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -365,12 +365,23 @@ static void pc_q35_machine_options(MachineClass *m) m->max_cpus = 288; } -static void pc_q35_4_0_machine_options(MachineClass *m) +static void pc_q35_4_1_machine_options(MachineClass *m) { pc_q35_machine_options(m); m->alias = "q35"; } +DEFINE_Q35_MACHINE(v4_1, "pc-q35-4.1", NULL, + pc_q35_4_1_machine_options); + +static void pc_q35_4_0_machine_options(MachineClass *m) +{ + pc_q35_4_1_machine_options(m); + m->alias = NULL; + compat_props_add(m->compat_props, hw_compat_4_0, hw_compat_4_0_len); + compat_props_add(m->compat_props, pc_compat_4_0, pc_compat_4_0_len); +} + DEFINE_Q35_MACHINE(v4_0, "pc-q35-4.0", NULL, pc_q35_4_0_machine_options); diff --git a/hw/pci/pci.c b/hw/pci/pci.c index 1808b242dd..a78023f669 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -1556,7 +1556,7 @@ void pci_device_set_intx_routing_notifier(PCIDevice *dev, */ int pci_swizzle_map_irq_fn(PCIDevice *pci_dev, int pin) { - return (pin + PCI_SLOT(pci_dev->devfn)) % PCI_NUM_PINS; + return pci_swizzle(PCI_SLOT(pci_dev->devfn), pin); } /***********************************************************/ diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs index 1111b218a0..636e717f20 100644 --- a/hw/ppc/Makefile.objs +++ b/hw/ppc/Makefile.objs @@ -9,7 +9,7 @@ obj-$(CONFIG_SPAPR_RNG) += spapr_rng.o # IBM PowerNV obj-$(CONFIG_POWERNV) += pnv.o pnv_xscom.o pnv_core.o pnv_lpc.o pnv_psi.o pnv_occ.o pnv_bmc.o ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy) -obj-y += spapr_pci_vfio.o +obj-y += spapr_pci_vfio.o spapr_pci_nvlink2.o endif obj-$(CONFIG_PSERIES) += spapr_rtas_ddw.o # PowerPC 4xx boards diff --git a/hw/ppc/prep.c b/hw/ppc/prep.c index 847d320465..b7f459d475 100644 --- a/hw/ppc/prep.c +++ b/hw/ppc/prep.c @@ -40,7 +40,6 @@ #include "hw/ide.h" #include "hw/loader.h" #include "hw/timer/mc146818rtc.h" -#include "hw/input/i8042.h" #include "hw/isa/pc87312.h" #include "hw/net/ne2000-isa.h" #include "sysemu/arch_init.h" diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index b52b82d298..2ef3ce4362 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -1034,12 +1034,13 @@ static void spapr_dt_rtas(SpaprMachineState *spapr, void *fdt) 0, cpu_to_be32(SPAPR_MEMORY_BLOCK_SIZE), cpu_to_be32(max_cpus / smp_threads), }; + uint32_t maxdomain = cpu_to_be32(spapr->gpu_numa_id > 1 ? 1 : 0); uint32_t maxdomains[] = { cpu_to_be32(4), - cpu_to_be32(0), - cpu_to_be32(0), - cpu_to_be32(0), - cpu_to_be32(nb_numa_nodes ? nb_numa_nodes : 1), + maxdomain, + maxdomain, + maxdomain, + cpu_to_be32(spapr->gpu_numa_id), }; _FDT(rtas = fdt_add_subnode(fdt, 0, "rtas")); @@ -1519,10 +1520,10 @@ static void spapr_unmap_hptes(PPCVirtualHypervisor *vhyp, /* Nothing to do for qemu managed HPT */ } -static void spapr_store_hpte(PPCVirtualHypervisor *vhyp, hwaddr ptex, - uint64_t pte0, uint64_t pte1) +void spapr_store_hpte(PowerPCCPU *cpu, hwaddr ptex, + uint64_t pte0, uint64_t pte1) { - SpaprMachineState *spapr = SPAPR_MACHINE(vhyp); + SpaprMachineState *spapr = SPAPR_MACHINE(cpu->vhyp); hwaddr offset = ptex * HASH_PTE_SIZE_64; if (!spapr->htab) { @@ -1550,6 +1551,38 @@ static void spapr_store_hpte(PPCVirtualHypervisor *vhyp, hwaddr ptex, } } +static void spapr_hpte_set_c(PPCVirtualHypervisor *vhyp, hwaddr ptex, + uint64_t pte1) +{ + hwaddr offset = ptex * HASH_PTE_SIZE_64 + 15; + SpaprMachineState *spapr = SPAPR_MACHINE(vhyp); + + if (!spapr->htab) { + /* There should always be a hash table when this is called */ + error_report("spapr_hpte_set_c called with no hash table !"); + return; + } + + /* The HW performs a non-atomic byte update */ + stb_p(spapr->htab + offset, (pte1 & 0xff) | 0x80); +} + +static void spapr_hpte_set_r(PPCVirtualHypervisor *vhyp, hwaddr ptex, + uint64_t pte1) +{ + hwaddr offset = ptex * HASH_PTE_SIZE_64 + 14; + SpaprMachineState *spapr = SPAPR_MACHINE(vhyp); + + if (!spapr->htab) { + /* There should always be a hash table when this is called */ + error_report("spapr_hpte_set_r called with no hash table !"); + return; + } + + /* The HW performs a non-atomic byte update */ + stb_p(spapr->htab + offset, ((pte1 >> 8) & 0xff) | 0x01); +} + int spapr_hpt_shift_for_ramsize(uint64_t ramsize) { int shift; @@ -1698,6 +1731,16 @@ static void spapr_machine_reset(void) spapr_irq_msi_reset(spapr); } + /* + * NVLink2-connected GPU RAM needs to be placed on a separate NUMA node. + * We assign a new numa ID per GPU in spapr_pci_collect_nvgpu() which is + * called from vPHB reset handler so we initialize the counter here. + * If no NUMA is configured from the QEMU side, we start from 1 as GPU RAM + * must be equally distant from any other node. + * The final value of spapr->gpu_numa_id is going to be written to + * max-associativity-domains in spapr_build_fdt(). + */ + spapr->gpu_numa_id = MAX(1, nb_numa_nodes); qemu_devices_reset(); /* @@ -3907,7 +3950,9 @@ static void spapr_phb_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, smc->phb_placement(spapr, sphb->index, &sphb->buid, &sphb->io_win_addr, &sphb->mem_win_addr, &sphb->mem64_win_addr, - windows_supported, sphb->dma_liobn, errp); + windows_supported, sphb->dma_liobn, + &sphb->nv2_gpa_win_addr, &sphb->nv2_atsd_win_addr, + errp); } static void spapr_phb_plug(HotplugHandler *hotplug_dev, DeviceState *dev, @@ -4108,7 +4153,8 @@ static const CPUArchIdList *spapr_possible_cpu_arch_ids(MachineState *machine) static void spapr_phb_placement(SpaprMachineState *spapr, uint32_t index, uint64_t *buid, hwaddr *pio, hwaddr *mmio32, hwaddr *mmio64, - unsigned n_dma, uint32_t *liobns, Error **errp) + unsigned n_dma, uint32_t *liobns, + hwaddr *nv2gpa, hwaddr *nv2atsd, Error **errp) { /* * New-style PHB window placement. @@ -4153,6 +4199,9 @@ static void spapr_phb_placement(SpaprMachineState *spapr, uint32_t index, *pio = SPAPR_PCI_BASE + index * SPAPR_PCI_IO_WIN_SIZE; *mmio32 = SPAPR_PCI_BASE + (index + 1) * SPAPR_PCI_MEM32_WIN_SIZE; *mmio64 = SPAPR_PCI_BASE + (index + 1) * SPAPR_PCI_MEM64_WIN_SIZE; + + *nv2gpa = SPAPR_PCI_NV2RAM64_WIN_BASE + index * SPAPR_PCI_NV2RAM64_WIN_SIZE; + *nv2atsd = SPAPR_PCI_NV2ATSD_WIN_BASE + index * SPAPR_PCI_NV2ATSD_WIN_SIZE; } static ICSState *spapr_ics_get(XICSFabric *dev, int irq) @@ -4274,7 +4323,8 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data) vhc->hpt_mask = spapr_hpt_mask; vhc->map_hptes = spapr_map_hptes; vhc->unmap_hptes = spapr_unmap_hptes; - vhc->store_hpte = spapr_store_hpte; + vhc->hpte_set_c = spapr_hpte_set_c; + vhc->hpte_set_r = spapr_hpte_set_r; vhc->get_pate = spapr_get_pate; vhc->encode_hpt_for_kvm_pr = spapr_encode_hpt_for_kvm_pr; xic->ics_get = spapr_ics_get; @@ -4345,18 +4395,41 @@ static const TypeInfo spapr_machine_info = { type_init(spapr_machine_register_##suffix) /* + * pseries-4.1 + */ +static void spapr_machine_4_1_class_options(MachineClass *mc) +{ + /* Defaults for the latest behaviour inherited from the base class */ +} + +DEFINE_SPAPR_MACHINE(4_1, "4.1", true); + +/* * pseries-4.0 */ static void spapr_machine_4_0_class_options(MachineClass *mc) { - /* Defaults for the latest behaviour inherited from the base class */ + spapr_machine_4_1_class_options(mc); + compat_props_add(mc->compat_props, hw_compat_4_0, hw_compat_4_0_len); } -DEFINE_SPAPR_MACHINE(4_0, "4.0", true); +DEFINE_SPAPR_MACHINE(4_0, "4.0", false); /* * pseries-3.1 */ +static void phb_placement_3_1(SpaprMachineState *spapr, uint32_t index, + uint64_t *buid, hwaddr *pio, + hwaddr *mmio32, hwaddr *mmio64, + unsigned n_dma, uint32_t *liobns, + hwaddr *nv2gpa, hwaddr *nv2atsd, Error **errp) +{ + spapr_phb_placement(spapr, index, buid, pio, mmio32, mmio64, n_dma, liobns, + nv2gpa, nv2atsd, errp); + *nv2gpa = 0; + *nv2atsd = 0; +} + static void spapr_machine_3_1_class_options(MachineClass *mc) { SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); @@ -4372,6 +4445,7 @@ static void spapr_machine_3_1_class_options(MachineClass *mc) smc->default_caps.caps[SPAPR_CAP_SBBC] = SPAPR_CAP_BROKEN; smc->default_caps.caps[SPAPR_CAP_IBS] = SPAPR_CAP_BROKEN; smc->default_caps.caps[SPAPR_CAP_LARGE_DECREMENTER] = SPAPR_CAP_OFF; + smc->phb_placement = phb_placement_3_1; } DEFINE_SPAPR_MACHINE(3_1, "3.1", false); @@ -4503,7 +4577,8 @@ DEFINE_SPAPR_MACHINE(2_8, "2.8", false); static void phb_placement_2_7(SpaprMachineState *spapr, uint32_t index, uint64_t *buid, hwaddr *pio, hwaddr *mmio32, hwaddr *mmio64, - unsigned n_dma, uint32_t *liobns, Error **errp) + unsigned n_dma, uint32_t *liobns, + hwaddr *nv2gpa, hwaddr *nv2atsd, Error **errp) { /* Legacy PHB placement for pseries-2.7 and earlier machine types */ const uint64_t base_buid = 0x800000020000000ULL; @@ -4547,6 +4622,9 @@ static void phb_placement_2_7(SpaprMachineState *spapr, uint32_t index, * fallback behaviour of automatically splitting a large "32-bit" * window into contiguous 32-bit and 64-bit windows */ + + *nv2gpa = 0; + *nv2atsd = 0; } static void spapr_machine_2_7_class_options(MachineClass *mc) diff --git a/hw/ppc/spapr_caps.c b/hw/ppc/spapr_caps.c index edc5ed0e0c..9b1c10baa6 100644 --- a/hw/ppc/spapr_caps.c +++ b/hw/ppc/spapr_caps.c @@ -347,7 +347,7 @@ static void cap_hpt_maxpagesize_apply(SpaprMachineState *spapr, warn_report("Many guests require at least 64kiB hpt-max-page-size"); } - spapr_check_pagesize(spapr, qemu_getrampagesize(), errp); + spapr_check_pagesize(spapr, qemu_minrampagesize(), errp); } static bool spapr_pagesize_cb(void *opaque, uint32_t seg_pshift, @@ -609,7 +609,7 @@ static SpaprCapabilities default_caps_with_cpu(SpaprMachineState *spapr, uint8_t mps; if (kvmppc_hpt_needs_host_contiguous_pages()) { - mps = ctz64(qemu_getrampagesize()); + mps = ctz64(qemu_minrampagesize()); } else { mps = 34; /* allow everything up to 16GiB, i.e. everything */ } diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c index 8a736797b9..6c16d2b120 100644 --- a/hw/ppc/spapr_hcall.c +++ b/hw/ppc/spapr_hcall.c @@ -118,7 +118,7 @@ static target_ulong h_enter(PowerPCCPU *cpu, SpaprMachineState *spapr, ppc_hash64_unmap_hptes(cpu, hptes, ptex, 1); } - ppc_hash64_store_hpte(cpu, ptex + slot, pteh | HPTE64_V_HPTE_DIRTY, ptel); + spapr_store_hpte(cpu, ptex + slot, pteh | HPTE64_V_HPTE_DIRTY, ptel); args[0] = ptex + slot; return H_SUCCESS; @@ -131,7 +131,8 @@ typedef enum { REMOVE_HW = 3, } RemoveResult; -static RemoveResult remove_hpte(PowerPCCPU *cpu, target_ulong ptex, +static RemoveResult remove_hpte(PowerPCCPU *cpu + , target_ulong ptex, target_ulong avpn, target_ulong flags, target_ulong *vp, target_ulong *rp) @@ -155,7 +156,7 @@ static RemoveResult remove_hpte(PowerPCCPU *cpu, target_ulong ptex, } *vp = v; *rp = r; - ppc_hash64_store_hpte(cpu, ptex, HPTE64_V_HPTE_DIRTY, 0); + spapr_store_hpte(cpu, ptex, HPTE64_V_HPTE_DIRTY, 0); ppc_hash64_tlb_flush_hpte(cpu, ptex, v, r); return REMOVE_SUCCESS; } @@ -289,13 +290,13 @@ static target_ulong h_protect(PowerPCCPU *cpu, SpaprMachineState *spapr, r |= (flags << 55) & HPTE64_R_PP0; r |= (flags << 48) & HPTE64_R_KEY_HI; r |= flags & (HPTE64_R_PP | HPTE64_R_N | HPTE64_R_KEY_LO); - ppc_hash64_store_hpte(cpu, ptex, - (v & ~HPTE64_V_VALID) | HPTE64_V_HPTE_DIRTY, 0); + spapr_store_hpte(cpu, ptex, + (v & ~HPTE64_V_VALID) | HPTE64_V_HPTE_DIRTY, 0); ppc_hash64_tlb_flush_hpte(cpu, ptex, v, r); /* Flush the tlb */ check_tlb_flush(env, true); /* Don't need a memory barrier, due to qemu's global lock */ - ppc_hash64_store_hpte(cpu, ptex, v | HPTE64_V_HPTE_DIRTY, r); + spapr_store_hpte(cpu, ptex, v | HPTE64_V_HPTE_DIRTY, r); return H_SUCCESS; } @@ -304,8 +305,8 @@ static target_ulong h_read(PowerPCCPU *cpu, SpaprMachineState *spapr, { target_ulong flags = args[0]; target_ulong ptex = args[1]; - uint8_t *hpte; int i, ridx, n_entries = 1; + const ppc_hash_pte64_t *hptes; if (!valid_ptex(cpu, ptex)) { return H_PARAMETER; @@ -317,13 +318,12 @@ static target_ulong h_read(PowerPCCPU *cpu, SpaprMachineState *spapr, n_entries = 4; } - hpte = spapr->htab + (ptex * HASH_PTE_SIZE_64); - + hptes = ppc_hash64_map_hptes(cpu, ptex, n_entries); for (i = 0, ridx = 0; i < n_entries; i++) { - args[ridx++] = ldq_p(hpte); - args[ridx++] = ldq_p(hpte + (HASH_PTE_SIZE_64/2)); - hpte += HASH_PTE_SIZE_64; + args[ridx++] = ppc_hash64_hpte0(cpu, hptes, i); + args[ridx++] = ppc_hash64_hpte1(cpu, hptes, i); } + ppc_hash64_unmap_hptes(cpu, hptes, ptex, n_entries); return H_SUCCESS; } diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c index 0a84e4cf63..b1f79ea9de 100644 --- a/hw/ppc/spapr_irq.c +++ b/hw/ppc/spapr_irq.c @@ -67,36 +67,11 @@ void spapr_irq_msi_reset(SpaprMachineState *spapr) * XICS IRQ backend. */ -static ICSState *spapr_ics_create(SpaprMachineState *spapr, - int nr_irqs, Error **errp) -{ - Error *local_err = NULL; - Object *obj; - - obj = object_new(TYPE_ICS_SIMPLE); - object_property_add_child(OBJECT(spapr), "ics", obj, &error_abort); - object_property_add_const_link(obj, ICS_PROP_XICS, OBJECT(spapr), - &error_abort); - object_property_set_int(obj, nr_irqs, "nr-irqs", &local_err); - if (local_err) { - goto error; - } - object_property_set_bool(obj, true, "realized", &local_err); - if (local_err) { - goto error; - } - - return ICS_BASE(obj); - -error: - error_propagate(errp, local_err); - return NULL; -} - static void spapr_irq_init_xics(SpaprMachineState *spapr, int nr_irqs, Error **errp) { MachineState *machine = MACHINE(spapr); + Object *obj; Error *local_err = NULL; bool xics_kvm = false; @@ -108,7 +83,8 @@ static void spapr_irq_init_xics(SpaprMachineState *spapr, int nr_irqs, if (machine_kernel_irqchip_required(machine) && !xics_kvm) { error_prepend(&local_err, "kernel_irqchip requested but unavailable: "); - goto error; + error_propagate(errp, local_err); + return; } error_free(local_err); local_err = NULL; @@ -118,10 +94,18 @@ static void spapr_irq_init_xics(SpaprMachineState *spapr, int nr_irqs, xics_spapr_init(spapr); } - spapr->ics = spapr_ics_create(spapr, nr_irqs, &local_err); + obj = object_new(TYPE_ICS_SIMPLE); + object_property_add_child(OBJECT(spapr), "ics", obj, &error_abort); + object_property_add_const_link(obj, ICS_PROP_XICS, OBJECT(spapr), + &error_fatal); + object_property_set_int(obj, nr_irqs, "nr-irqs", &error_fatal); + object_property_set_bool(obj, true, "realized", &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } -error: - error_propagate(errp, local_err); + spapr->ics = ICS_BASE(obj); } #define ICS_IRQ_FREE(ics, srcno) \ diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c index f62e6833b8..97961b0128 100644 --- a/hw/ppc/spapr_pci.c +++ b/hw/ppc/spapr_pci.c @@ -719,26 +719,10 @@ param_error_exit: rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR); } -static int pci_spapr_swizzle(int slot, int pin) -{ - return (slot + pin) % PCI_NUM_PINS; -} - -static int pci_spapr_map_irq(PCIDevice *pci_dev, int irq_num) -{ - /* - * Here we need to convert pci_dev + irq_num to some unique value - * which is less than number of IRQs on the specific bus (4). We - * use standard PCI swizzling, that is (slot number + pin number) - * % 4. - */ - return pci_spapr_swizzle(PCI_SLOT(pci_dev->devfn), irq_num); -} - static void pci_spapr_set_irq(void *opaque, int irq_num, int level) { /* - * Here we use the number returned by pci_spapr_map_irq to find a + * Here we use the number returned by pci_swizzle_map_irq_fn to find a * corresponding qemu_irq. */ SpaprPhbState *phb = opaque; @@ -1355,6 +1339,8 @@ static void spapr_populate_pci_child_dt(PCIDevice *dev, void *fdt, int offset, if (sphb->pcie_ecs && pci_is_express(dev)) { _FDT(fdt_setprop_cell(fdt, offset, "ibm,pci-config-space-type", 0x1)); } + + spapr_phb_nvgpu_populate_pcidev_dt(dev, fdt, offset, sphb); } /* create OF node for pci device and required OF DT properties */ @@ -1587,6 +1573,8 @@ static void spapr_phb_unrealize(DeviceState *dev, Error **errp) int i; const unsigned windows_supported = spapr_phb_windows_supported(sphb); + spapr_phb_nvgpu_free(sphb); + if (sphb->msi) { g_hash_table_unref(sphb->msi); sphb->msi = NULL; @@ -1762,7 +1750,7 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp) &sphb->iowindow); bus = pci_register_root_bus(dev, NULL, - pci_spapr_set_irq, pci_spapr_map_irq, sphb, + pci_spapr_set_irq, pci_swizzle_map_irq_fn, sphb, &sphb->memspace, &sphb->iospace, PCI_DEVFN(0, 0), PCI_NUM_PINS, TYPE_SPAPR_PHB_ROOT_BUS); @@ -1898,8 +1886,14 @@ void spapr_phb_dma_reset(SpaprPhbState *sphb) static void spapr_phb_reset(DeviceState *qdev) { SpaprPhbState *sphb = SPAPR_PCI_HOST_BRIDGE(qdev); + Error *errp = NULL; spapr_phb_dma_reset(sphb); + spapr_phb_nvgpu_free(sphb); + spapr_phb_nvgpu_setup(sphb, &errp); + if (errp) { + error_report_err(errp); + } /* Reset the IOMMU state */ object_child_foreach(OBJECT(qdev), spapr_phb_children_reset, NULL); @@ -1932,6 +1926,8 @@ static Property spapr_phb_properties[] = { pre_2_8_migration, false), DEFINE_PROP_BOOL("pcie-extended-configuration-space", SpaprPhbState, pcie_ecs, true), + DEFINE_PROP_UINT64("gpa", SpaprPhbState, nv2_gpa_win_addr, 0), + DEFINE_PROP_UINT64("atsd", SpaprPhbState, nv2_atsd_win_addr, 0), DEFINE_PROP_END_OF_LIST(), }; @@ -2164,7 +2160,6 @@ int spapr_populate_pci_dt(SpaprPhbState *phb, uint32_t intc_phandle, void *fdt, uint32_t nr_msis, int *node_offset) { int bus_off, i, j, ret; - gchar *nodename; uint32_t bus_range[] = { cpu_to_be32(0), cpu_to_be32(0xff) }; struct { uint32_t hi; @@ -2212,11 +2207,10 @@ int spapr_populate_pci_dt(SpaprPhbState *phb, uint32_t intc_phandle, void *fdt, PCIBus *bus = PCI_HOST_BRIDGE(phb)->bus; SpaprFdt s_fdt; SpaprDrc *drc; + Error *errp = NULL; /* Start populating the FDT */ - nodename = g_strdup_printf("pci@%" PRIx64, phb->buid); - _FDT(bus_off = fdt_add_subnode(fdt, 0, nodename)); - g_free(nodename); + _FDT(bus_off = fdt_add_subnode(fdt, 0, phb->dtbusname)); if (node_offset) { *node_offset = bus_off; } @@ -2249,14 +2243,14 @@ int spapr_populate_pci_dt(SpaprPhbState *phb, uint32_t intc_phandle, void *fdt, } /* Build the interrupt-map, this must matches what is done - * in pci_spapr_map_irq + * in pci_swizzle_map_irq_fn */ _FDT(fdt_setprop(fdt, bus_off, "interrupt-map-mask", &interrupt_map_mask, sizeof(interrupt_map_mask))); for (i = 0; i < PCI_SLOT_MAX; i++) { for (j = 0; j < PCI_NUM_PINS; j++) { uint32_t *irqmap = interrupt_map[i*PCI_NUM_PINS + j]; - int lsi_num = pci_spapr_swizzle(i, j); + int lsi_num = pci_swizzle(i, j); irqmap[0] = cpu_to_be32(b_ddddd(i)|b_fff(0)); irqmap[1] = 0; @@ -2304,6 +2298,12 @@ int spapr_populate_pci_dt(SpaprPhbState *phb, uint32_t intc_phandle, void *fdt, return ret; } + spapr_phb_nvgpu_populate_dt(phb, fdt, bus_off, &errp); + if (errp) { + error_report_err(errp); + } + spapr_phb_nvgpu_ram_populate_dt(phb, fdt); + return 0; } diff --git a/hw/ppc/spapr_pci_nvlink2.c b/hw/ppc/spapr_pci_nvlink2.c new file mode 100644 index 0000000000..eda8c752aa --- /dev/null +++ b/hw/ppc/spapr_pci_nvlink2.c @@ -0,0 +1,450 @@ +/* + * QEMU sPAPR PCI for NVLink2 pass through + * + * Copyright (c) 2019 Alexey Kardashevskiy, IBM Corporation. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qemu-common.h" +#include "hw/pci/pci.h" +#include "hw/pci-host/spapr.h" +#include "qemu/error-report.h" +#include "hw/ppc/fdt.h" +#include "hw/pci/pci_bridge.h" + +#define PHANDLE_PCIDEV(phb, pdev) (0x12000000 | \ + (((phb)->index) << 16) | ((pdev)->devfn)) +#define PHANDLE_GPURAM(phb, n) (0x110000FF | ((n) << 8) | \ + (((phb)->index) << 16)) +#define PHANDLE_NVLINK(phb, gn, nn) (0x00130000 | (((phb)->index) << 8) | \ + ((gn) << 4) | (nn)) + +#define SPAPR_GPU_NUMA_ID (cpu_to_be32(1)) + +struct spapr_phb_pci_nvgpu_config { + uint64_t nv2_ram_current; + uint64_t nv2_atsd_current; + int num; /* number of non empty (i.e. tgt!=0) entries in slots[] */ + struct spapr_phb_pci_nvgpu_slot { + uint64_t tgt; + uint64_t gpa; + unsigned numa_id; + PCIDevice *gpdev; + int linknum; + struct { + uint64_t atsd_gpa; + PCIDevice *npdev; + uint32_t link_speed; + } links[NVGPU_MAX_LINKS]; + } slots[NVGPU_MAX_NUM]; + Error *errp; +}; + +static struct spapr_phb_pci_nvgpu_slot * +spapr_nvgpu_get_slot(struct spapr_phb_pci_nvgpu_config *nvgpus, uint64_t tgt) +{ + int i; + + /* Search for partially collected "slot" */ + for (i = 0; i < nvgpus->num; ++i) { + if (nvgpus->slots[i].tgt == tgt) { + return &nvgpus->slots[i]; + } + } + + if (nvgpus->num == ARRAY_SIZE(nvgpus->slots)) { + return NULL; + } + + i = nvgpus->num; + nvgpus->slots[i].tgt = tgt; + ++nvgpus->num; + + return &nvgpus->slots[i]; +} + +static void spapr_pci_collect_nvgpu(struct spapr_phb_pci_nvgpu_config *nvgpus, + PCIDevice *pdev, uint64_t tgt, + MemoryRegion *mr, Error **errp) +{ + MachineState *machine = MACHINE(qdev_get_machine()); + SpaprMachineState *spapr = SPAPR_MACHINE(machine); + struct spapr_phb_pci_nvgpu_slot *nvslot = spapr_nvgpu_get_slot(nvgpus, tgt); + + if (!nvslot) { + error_setg(errp, "Found too many GPUs per vPHB"); + return; + } + g_assert(!nvslot->gpdev); + nvslot->gpdev = pdev; + + nvslot->gpa = nvgpus->nv2_ram_current; + nvgpus->nv2_ram_current += memory_region_size(mr); + nvslot->numa_id = spapr->gpu_numa_id; + ++spapr->gpu_numa_id; +} + +static void spapr_pci_collect_nvnpu(struct spapr_phb_pci_nvgpu_config *nvgpus, + PCIDevice *pdev, uint64_t tgt, + MemoryRegion *mr, Error **errp) +{ + struct spapr_phb_pci_nvgpu_slot *nvslot = spapr_nvgpu_get_slot(nvgpus, tgt); + int j; + + if (!nvslot) { + error_setg(errp, "Found too many NVLink bridges per vPHB"); + return; + } + + j = nvslot->linknum; + if (j == ARRAY_SIZE(nvslot->links)) { + error_setg(errp, "Found too many NVLink bridges per GPU"); + return; + } + ++nvslot->linknum; + + g_assert(!nvslot->links[j].npdev); + nvslot->links[j].npdev = pdev; + nvslot->links[j].atsd_gpa = nvgpus->nv2_atsd_current; + nvgpus->nv2_atsd_current += memory_region_size(mr); + nvslot->links[j].link_speed = + object_property_get_uint(OBJECT(pdev), "nvlink2-link-speed", NULL); +} + +static void spapr_phb_pci_collect_nvgpu(PCIBus *bus, PCIDevice *pdev, + void *opaque) +{ + PCIBus *sec_bus; + Object *po = OBJECT(pdev); + uint64_t tgt = object_property_get_uint(po, "nvlink2-tgt", NULL); + + if (tgt) { + Error *local_err = NULL; + struct spapr_phb_pci_nvgpu_config *nvgpus = opaque; + Object *mr_gpu = object_property_get_link(po, "nvlink2-mr[0]", NULL); + Object *mr_npu = object_property_get_link(po, "nvlink2-atsd-mr[0]", + NULL); + + g_assert(mr_gpu || mr_npu); + if (mr_gpu) { + spapr_pci_collect_nvgpu(nvgpus, pdev, tgt, MEMORY_REGION(mr_gpu), + &local_err); + } else { + spapr_pci_collect_nvnpu(nvgpus, pdev, tgt, MEMORY_REGION(mr_npu), + &local_err); + } + error_propagate(&nvgpus->errp, local_err); + } + if ((pci_default_read_config(pdev, PCI_HEADER_TYPE, 1) != + PCI_HEADER_TYPE_BRIDGE)) { + return; + } + + sec_bus = pci_bridge_get_sec_bus(PCI_BRIDGE(pdev)); + if (!sec_bus) { + return; + } + + pci_for_each_device(sec_bus, pci_bus_num(sec_bus), + spapr_phb_pci_collect_nvgpu, opaque); +} + +void spapr_phb_nvgpu_setup(SpaprPhbState *sphb, Error **errp) +{ + int i, j, valid_gpu_num; + PCIBus *bus; + + /* Search for GPUs and NPUs */ + if (!sphb->nv2_gpa_win_addr || !sphb->nv2_atsd_win_addr) { + return; + } + + sphb->nvgpus = g_new0(struct spapr_phb_pci_nvgpu_config, 1); + sphb->nvgpus->nv2_ram_current = sphb->nv2_gpa_win_addr; + sphb->nvgpus->nv2_atsd_current = sphb->nv2_atsd_win_addr; + + bus = PCI_HOST_BRIDGE(sphb)->bus; + pci_for_each_device(bus, pci_bus_num(bus), + spapr_phb_pci_collect_nvgpu, sphb->nvgpus); + + if (sphb->nvgpus->errp) { + error_propagate(errp, sphb->nvgpus->errp); + sphb->nvgpus->errp = NULL; + goto cleanup_exit; + } + + /* Add found GPU RAM and ATSD MRs if found */ + for (i = 0, valid_gpu_num = 0; i < sphb->nvgpus->num; ++i) { + Object *nvmrobj; + struct spapr_phb_pci_nvgpu_slot *nvslot = &sphb->nvgpus->slots[i]; + + if (!nvslot->gpdev) { + continue; + } + nvmrobj = object_property_get_link(OBJECT(nvslot->gpdev), + "nvlink2-mr[0]", NULL); + /* ATSD is pointless without GPU RAM MR so skip those */ + if (!nvmrobj) { + continue; + } + + ++valid_gpu_num; + memory_region_add_subregion(get_system_memory(), nvslot->gpa, + MEMORY_REGION(nvmrobj)); + + for (j = 0; j < nvslot->linknum; ++j) { + Object *atsdmrobj; + + atsdmrobj = object_property_get_link(OBJECT(nvslot->links[j].npdev), + "nvlink2-atsd-mr[0]", NULL); + if (!atsdmrobj) { + continue; + } + memory_region_add_subregion(get_system_memory(), + nvslot->links[j].atsd_gpa, + MEMORY_REGION(atsdmrobj)); + } + } + + if (valid_gpu_num) { + return; + } + /* We did not find any interesting GPU */ +cleanup_exit: + g_free(sphb->nvgpus); + sphb->nvgpus = NULL; +} + +void spapr_phb_nvgpu_free(SpaprPhbState *sphb) +{ + int i, j; + + if (!sphb->nvgpus) { + return; + } + + for (i = 0; i < sphb->nvgpus->num; ++i) { + struct spapr_phb_pci_nvgpu_slot *nvslot = &sphb->nvgpus->slots[i]; + Object *nv_mrobj = object_property_get_link(OBJECT(nvslot->gpdev), + "nvlink2-mr[0]", NULL); + + if (nv_mrobj) { + memory_region_del_subregion(get_system_memory(), + MEMORY_REGION(nv_mrobj)); + } + for (j = 0; j < nvslot->linknum; ++j) { + PCIDevice *npdev = nvslot->links[j].npdev; + Object *atsd_mrobj; + atsd_mrobj = object_property_get_link(OBJECT(npdev), + "nvlink2-atsd-mr[0]", NULL); + if (atsd_mrobj) { + memory_region_del_subregion(get_system_memory(), + MEMORY_REGION(atsd_mrobj)); + } + } + } + g_free(sphb->nvgpus); + sphb->nvgpus = NULL; +} + +void spapr_phb_nvgpu_populate_dt(SpaprPhbState *sphb, void *fdt, int bus_off, + Error **errp) +{ + int i, j, atsdnum = 0; + uint64_t atsd[8]; /* The existing limitation of known guests */ + + if (!sphb->nvgpus) { + return; + } + + for (i = 0; (i < sphb->nvgpus->num) && (atsdnum < ARRAY_SIZE(atsd)); ++i) { + struct spapr_phb_pci_nvgpu_slot *nvslot = &sphb->nvgpus->slots[i]; + + if (!nvslot->gpdev) { + continue; + } + for (j = 0; j < nvslot->linknum; ++j) { + if (!nvslot->links[j].atsd_gpa) { + continue; + } + + if (atsdnum == ARRAY_SIZE(atsd)) { + error_report("Only %"PRIuPTR" ATSD registers supported", + ARRAY_SIZE(atsd)); + break; + } + atsd[atsdnum] = cpu_to_be64(nvslot->links[j].atsd_gpa); + ++atsdnum; + } + } + + if (!atsdnum) { + error_setg(errp, "No ATSD registers found"); + return; + } + + if (!spapr_phb_eeh_available(sphb)) { + /* + * ibm,mmio-atsd contains ATSD registers; these belong to an NPU PHB + * which we do not emulate as a separate device. Instead we put + * ibm,mmio-atsd to the vPHB with GPU and make sure that we do not + * put GPUs from different IOMMU groups to the same vPHB to ensure + * that the guest will use ATSDs from the corresponding NPU. + */ + error_setg(errp, "ATSD requires separate vPHB per GPU IOMMU group"); + return; + } + + _FDT((fdt_setprop(fdt, bus_off, "ibm,mmio-atsd", atsd, + atsdnum * sizeof(atsd[0])))); +} + +void spapr_phb_nvgpu_ram_populate_dt(SpaprPhbState *sphb, void *fdt) +{ + int i, j, linkidx, npuoff; + char *npuname; + + if (!sphb->nvgpus) { + return; + } + + npuname = g_strdup_printf("npuphb%d", sphb->index); + npuoff = fdt_add_subnode(fdt, 0, npuname); + _FDT(npuoff); + _FDT(fdt_setprop_cell(fdt, npuoff, "#address-cells", 1)); + _FDT(fdt_setprop_cell(fdt, npuoff, "#size-cells", 0)); + /* Advertise NPU as POWER9 so the guest can enable NPU2 contexts */ + _FDT((fdt_setprop_string(fdt, npuoff, "compatible", "ibm,power9-npu"))); + g_free(npuname); + + for (i = 0, linkidx = 0; i < sphb->nvgpus->num; ++i) { + for (j = 0; j < sphb->nvgpus->slots[i].linknum; ++j) { + char *linkname = g_strdup_printf("link@%d", linkidx); + int off = fdt_add_subnode(fdt, npuoff, linkname); + + _FDT(off); + /* _FDT((fdt_setprop_cell(fdt, off, "reg", linkidx))); */ + _FDT((fdt_setprop_string(fdt, off, "compatible", + "ibm,npu-link"))); + _FDT((fdt_setprop_cell(fdt, off, "phandle", + PHANDLE_NVLINK(sphb, i, j)))); + _FDT((fdt_setprop_cell(fdt, off, "ibm,npu-link-index", linkidx))); + g_free(linkname); + ++linkidx; + } + } + + /* Add memory nodes for GPU RAM and mark them unusable */ + for (i = 0; i < sphb->nvgpus->num; ++i) { + struct spapr_phb_pci_nvgpu_slot *nvslot = &sphb->nvgpus->slots[i]; + Object *nv_mrobj = object_property_get_link(OBJECT(nvslot->gpdev), + "nvlink2-mr[0]", NULL); + uint32_t associativity[] = { + cpu_to_be32(0x4), + SPAPR_GPU_NUMA_ID, + SPAPR_GPU_NUMA_ID, + SPAPR_GPU_NUMA_ID, + cpu_to_be32(nvslot->numa_id) + }; + uint64_t size = object_property_get_uint(nv_mrobj, "size", NULL); + uint64_t mem_reg[2] = { cpu_to_be64(nvslot->gpa), cpu_to_be64(size) }; + char *mem_name = g_strdup_printf("memory@%"PRIx64, nvslot->gpa); + int off = fdt_add_subnode(fdt, 0, mem_name); + + _FDT(off); + _FDT((fdt_setprop_string(fdt, off, "device_type", "memory"))); + _FDT((fdt_setprop(fdt, off, "reg", mem_reg, sizeof(mem_reg)))); + _FDT((fdt_setprop(fdt, off, "ibm,associativity", associativity, + sizeof(associativity)))); + + _FDT((fdt_setprop_string(fdt, off, "compatible", + "ibm,coherent-device-memory"))); + + mem_reg[1] = cpu_to_be64(0); + _FDT((fdt_setprop(fdt, off, "linux,usable-memory", mem_reg, + sizeof(mem_reg)))); + _FDT((fdt_setprop_cell(fdt, off, "phandle", + PHANDLE_GPURAM(sphb, i)))); + g_free(mem_name); + } + +} + +void spapr_phb_nvgpu_populate_pcidev_dt(PCIDevice *dev, void *fdt, int offset, + SpaprPhbState *sphb) +{ + int i, j; + + if (!sphb->nvgpus) { + return; + } + + for (i = 0; i < sphb->nvgpus->num; ++i) { + struct spapr_phb_pci_nvgpu_slot *nvslot = &sphb->nvgpus->slots[i]; + + /* Skip "slot" without attached GPU */ + if (!nvslot->gpdev) { + continue; + } + if (dev == nvslot->gpdev) { + uint32_t npus[nvslot->linknum]; + + for (j = 0; j < nvslot->linknum; ++j) { + PCIDevice *npdev = nvslot->links[j].npdev; + + npus[j] = cpu_to_be32(PHANDLE_PCIDEV(sphb, npdev)); + } + _FDT(fdt_setprop(fdt, offset, "ibm,npu", npus, + j * sizeof(npus[0]))); + _FDT((fdt_setprop_cell(fdt, offset, "phandle", + PHANDLE_PCIDEV(sphb, dev)))); + continue; + } + + for (j = 0; j < nvslot->linknum; ++j) { + if (dev != nvslot->links[j].npdev) { + continue; + } + + _FDT((fdt_setprop_cell(fdt, offset, "phandle", + PHANDLE_PCIDEV(sphb, dev)))); + _FDT(fdt_setprop_cell(fdt, offset, "ibm,gpu", + PHANDLE_PCIDEV(sphb, nvslot->gpdev))); + _FDT((fdt_setprop_cell(fdt, offset, "ibm,nvlink", + PHANDLE_NVLINK(sphb, i, j)))); + /* + * If we ever want to emulate GPU RAM at the same location as on + * the host - here is the encoding GPA->TGT: + * + * gta = ((sphb->nv2_gpa >> 42) & 0x1) << 42; + * gta |= ((sphb->nv2_gpa >> 45) & 0x3) << 43; + * gta |= ((sphb->nv2_gpa >> 49) & 0x3) << 45; + * gta |= sphb->nv2_gpa & ((1UL << 43) - 1); + */ + _FDT(fdt_setprop_cell(fdt, offset, "memory-region", + PHANDLE_GPURAM(sphb, i))); + _FDT(fdt_setprop_u64(fdt, offset, "ibm,device-tgt-addr", + nvslot->tgt)); + _FDT(fdt_setprop_cell(fdt, offset, "ibm,nvlink-speed", + nvslot->links[j].link_speed)); + } + } +} diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c index 24c45b12d4..ee24212765 100644 --- a/hw/ppc/spapr_rtas.c +++ b/hw/ppc/spapr_rtas.c @@ -404,7 +404,7 @@ void spapr_rtas_register(int token, const char *name, spapr_rtas_fn fn) token -= RTAS_TOKEN_BASE; - assert(!rtas_table[token].name); + assert(!name || !rtas_table[token].name); rtas_table[token].name = name; rtas_table[token].fn = fn; diff --git a/hw/s390x/ipl.c b/hw/s390x/ipl.c index 51b272e190..d0cc06a05f 100644 --- a/hw/s390x/ipl.c +++ b/hw/s390x/ipl.c @@ -19,6 +19,7 @@ #include "hw/loader.h" #include "hw/boards.h" #include "hw/s390x/virtio-ccw.h" +#include "hw/s390x/vfio-ccw.h" #include "hw/s390x/css.h" #include "hw/s390x/ebcdic.h" #include "ipl.h" @@ -303,16 +304,36 @@ static void s390_ipl_set_boot_menu(S390IPLState *ipl) ipl->qipl.boot_menu_timeout = cpu_to_be32(splash_time); } -static CcwDevice *s390_get_ccw_device(DeviceState *dev_st) +#define CCW_DEVTYPE_NONE 0x00 +#define CCW_DEVTYPE_VIRTIO 0x01 +#define CCW_DEVTYPE_VIRTIO_NET 0x02 +#define CCW_DEVTYPE_SCSI 0x03 +#define CCW_DEVTYPE_VFIO 0x04 + +static CcwDevice *s390_get_ccw_device(DeviceState *dev_st, int *devtype) { CcwDevice *ccw_dev = NULL; + int tmp_dt = CCW_DEVTYPE_NONE; if (dev_st) { + VirtIONet *virtio_net_dev = (VirtIONet *) + object_dynamic_cast(OBJECT(dev_st), TYPE_VIRTIO_NET); VirtioCcwDevice *virtio_ccw_dev = (VirtioCcwDevice *) object_dynamic_cast(OBJECT(qdev_get_parent_bus(dev_st)->parent), TYPE_VIRTIO_CCW_DEVICE); + VFIOCCWDevice *vfio_ccw_dev = (VFIOCCWDevice *) + object_dynamic_cast(OBJECT(dev_st), TYPE_VFIO_CCW); + if (virtio_ccw_dev) { ccw_dev = CCW_DEVICE(virtio_ccw_dev); + if (virtio_net_dev) { + tmp_dt = CCW_DEVTYPE_VIRTIO_NET; + } else { + tmp_dt = CCW_DEVTYPE_VIRTIO; + } + } else if (vfio_ccw_dev) { + ccw_dev = CCW_DEVICE(vfio_ccw_dev); + tmp_dt = CCW_DEVTYPE_VFIO; } else { SCSIDevice *sd = (SCSIDevice *) object_dynamic_cast(OBJECT(dev_st), @@ -325,9 +346,13 @@ static CcwDevice *s390_get_ccw_device(DeviceState *dev_st) ccw_dev = (CcwDevice *)object_dynamic_cast(OBJECT(scsi_ccw), TYPE_CCW_DEVICE); + tmp_dt = CCW_DEVTYPE_SCSI; } } } + if (devtype) { + *devtype = tmp_dt; + } return ccw_dev; } @@ -335,20 +360,22 @@ static bool s390_gen_initial_iplb(S390IPLState *ipl) { DeviceState *dev_st; CcwDevice *ccw_dev = NULL; + SCSIDevice *sd; + int devtype; dev_st = get_boot_device(0); if (dev_st) { - ccw_dev = s390_get_ccw_device(dev_st); + ccw_dev = s390_get_ccw_device(dev_st, &devtype); } /* * Currently allow IPL only from CCW devices. */ if (ccw_dev) { - SCSIDevice *sd = (SCSIDevice *) object_dynamic_cast(OBJECT(dev_st), - TYPE_SCSI_DEVICE); - - if (sd) { + switch (devtype) { + case CCW_DEVTYPE_SCSI: + sd = (SCSIDevice *) object_dynamic_cast(OBJECT(dev_st), + TYPE_SCSI_DEVICE); ipl->iplb.len = cpu_to_be32(S390_IPLB_MIN_QEMU_SCSI_LEN); ipl->iplb.blk0_len = cpu_to_be32(S390_IPLB_MIN_QEMU_SCSI_LEN - S390_IPLB_HEADER_LEN); @@ -358,20 +385,24 @@ static bool s390_gen_initial_iplb(S390IPLState *ipl) ipl->iplb.scsi.channel = cpu_to_be16(sd->channel); ipl->iplb.scsi.devno = cpu_to_be16(ccw_dev->sch->devno); ipl->iplb.scsi.ssid = ccw_dev->sch->ssid & 3; - } else { - VirtIONet *vn = (VirtIONet *) object_dynamic_cast(OBJECT(dev_st), - TYPE_VIRTIO_NET); - + break; + case CCW_DEVTYPE_VFIO: + ipl->iplb.len = cpu_to_be32(S390_IPLB_MIN_CCW_LEN); + ipl->iplb.pbt = S390_IPL_TYPE_CCW; + ipl->iplb.ccw.devno = cpu_to_be16(ccw_dev->sch->devno); + ipl->iplb.ccw.ssid = ccw_dev->sch->ssid & 3; + break; + case CCW_DEVTYPE_VIRTIO_NET: + ipl->netboot = true; + /* Fall through to CCW_DEVTYPE_VIRTIO case */ + case CCW_DEVTYPE_VIRTIO: ipl->iplb.len = cpu_to_be32(S390_IPLB_MIN_CCW_LEN); ipl->iplb.blk0_len = cpu_to_be32(S390_IPLB_MIN_CCW_LEN - S390_IPLB_HEADER_LEN); ipl->iplb.pbt = S390_IPL_TYPE_CCW; ipl->iplb.ccw.devno = cpu_to_be16(ccw_dev->sch->devno); ipl->iplb.ccw.ssid = ccw_dev->sch->ssid & 3; - - if (vn) { - ipl->netboot = true; - } + break; } if (!s390_ipl_set_loadparm(ipl->iplb.loadparm)) { @@ -530,7 +561,7 @@ void s390_ipl_reset_request(CPUState *cs, enum s390_reset reset_type) !ipl->netboot && ipl->iplb.pbt == S390_IPL_TYPE_CCW && is_virtio_scsi_device(&ipl->iplb)) { - CcwDevice *ccw_dev = s390_get_ccw_device(get_boot_device(0)); + CcwDevice *ccw_dev = s390_get_ccw_device(get_boot_device(0), NULL); if (ccw_dev && cpu_to_be16(ccw_dev->sch->devno) == ipl->iplb.ccw.devno && diff --git a/hw/s390x/s390-ccw.c b/hw/s390x/s390-ccw.c index cad91ee626..f5f025d1b6 100644 --- a/hw/s390x/s390-ccw.c +++ b/hw/s390x/s390-ccw.c @@ -124,6 +124,14 @@ static void s390_ccw_unrealize(S390CCWDevice *cdev, Error **errp) g_free(cdev->mdevid); } +static void s390_ccw_instance_init(Object *obj) +{ + S390CCWDevice *dev = S390_CCW_DEVICE(obj); + + device_add_bootindex_property(obj, &dev->bootindex, "bootindex", + "/disk@0,0", DEVICE(obj), NULL); +} + static void s390_ccw_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); @@ -137,6 +145,7 @@ static void s390_ccw_class_init(ObjectClass *klass, void *data) static const TypeInfo s390_ccw_info = { .name = TYPE_S390_CCW, .parent = TYPE_CCW_DEVICE, + .instance_init = s390_ccw_instance_init, .instance_size = sizeof(S390CCWDevice), .class_size = sizeof(S390CCWDeviceClass), .class_init = s390_ccw_class_init, diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c index d11069b860..bbc6e8fa0b 100644 --- a/hw/s390x/s390-virtio-ccw.c +++ b/hw/s390x/s390-virtio-ccw.c @@ -15,6 +15,7 @@ #include "cpu.h" #include "hw/boards.h" #include "exec/address-spaces.h" +#include "exec/ram_addr.h" #include "hw/s390x/s390-virtio-hcall.h" #include "hw/s390x/sclp.h" #include "hw/s390x/s390_flic.h" @@ -163,6 +164,7 @@ static void s390_memory_init(ram_addr_t mem_size) MemoryRegion *sysmem = get_system_memory(); ram_addr_t chunk, offset = 0; unsigned int number = 0; + Error *local_err = NULL; gchar *name; /* allocate RAM for core */ @@ -182,6 +184,15 @@ static void s390_memory_init(ram_addr_t mem_size) } g_free(name); + /* + * Configure the maximum page size. As no memory devices were created + * yet, this is the page size of initial memory only. + */ + s390_set_max_pagesize(qemu_maxrampagesize(), &local_err); + if (local_err) { + error_report_err(local_err); + exit(EXIT_FAILURE); + } /* Initialize storage key device */ s390_skeys_init(); /* Initialize storage attributes device */ @@ -253,6 +264,7 @@ static void ccw_init(MachineState *machine) DeviceState *dev; s390_sclp_init(); + /* init memory + setup max page size. Required for the CPU model */ s390_memory_init(machine->ram_size); /* init CPUs (incl. CPU model) early so s390_has_feature() works */ @@ -646,14 +658,26 @@ bool css_migration_enabled(void) } \ type_init(ccw_machine_register_##suffix) +static void ccw_machine_4_1_instance_options(MachineState *machine) +{ +} + +static void ccw_machine_4_1_class_options(MachineClass *mc) +{ +} +DEFINE_CCW_MACHINE(4_1, "4.1", true); + static void ccw_machine_4_0_instance_options(MachineState *machine) { + ccw_machine_4_1_instance_options(machine); } static void ccw_machine_4_0_class_options(MachineClass *mc) { + ccw_machine_4_1_class_options(mc); + compat_props_add(mc->compat_props, hw_compat_4_0, hw_compat_4_0_len); } -DEFINE_CCW_MACHINE(4_0, "4.0", true); +DEFINE_CCW_MACHINE(4_0, "4.0", false); static void ccw_machine_3_1_instance_options(MachineState *machine) { diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c index c44d13cc50..31dd3a2a87 100644 --- a/hw/vfio/ccw.c +++ b/hw/vfio/ccw.c @@ -21,12 +21,12 @@ #include "hw/vfio/vfio.h" #include "hw/vfio/vfio-common.h" #include "hw/s390x/s390-ccw.h" +#include "hw/s390x/vfio-ccw.h" #include "hw/s390x/ccw-device.h" #include "exec/address-spaces.h" #include "qemu/error-report.h" -#define TYPE_VFIO_CCW "vfio-ccw" -typedef struct VFIOCCWDevice { +struct VFIOCCWDevice { S390CCWDevice cdev; VFIODevice vdev; uint64_t io_region_size; @@ -35,7 +35,7 @@ typedef struct VFIOCCWDevice { EventNotifier io_notifier; bool force_orb_pfch; bool warned_orb_pfch; -} VFIOCCWDevice; +}; static inline void warn_once_pfch(VFIOCCWDevice *vcdev, SubchDev *sch, const char *msg) diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c index 40a12001f5..29b2697fe1 100644 --- a/hw/vfio/pci-quirks.c +++ b/hw/vfio/pci-quirks.c @@ -2180,3 +2180,134 @@ int vfio_add_virt_caps(VFIOPCIDevice *vdev, Error **errp) return 0; } + +static void vfio_pci_nvlink2_get_tgt(Object *obj, Visitor *v, + const char *name, + void *opaque, Error **errp) +{ + uint64_t tgt = (uintptr_t) opaque; + visit_type_uint64(v, name, &tgt, errp); +} + +static void vfio_pci_nvlink2_get_link_speed(Object *obj, Visitor *v, + const char *name, + void *opaque, Error **errp) +{ + uint32_t link_speed = (uint32_t)(uintptr_t) opaque; + visit_type_uint32(v, name, &link_speed, errp); +} + +int vfio_pci_nvidia_v100_ram_init(VFIOPCIDevice *vdev, Error **errp) +{ + int ret; + void *p; + struct vfio_region_info *nv2reg = NULL; + struct vfio_info_cap_header *hdr; + struct vfio_region_info_cap_nvlink2_ssatgt *cap; + VFIOQuirk *quirk; + + ret = vfio_get_dev_region_info(&vdev->vbasedev, + VFIO_REGION_TYPE_PCI_VENDOR_TYPE | + PCI_VENDOR_ID_NVIDIA, + VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM, + &nv2reg); + if (ret) { + return ret; + } + + hdr = vfio_get_region_info_cap(nv2reg, VFIO_REGION_INFO_CAP_NVLINK2_SSATGT); + if (!hdr) { + ret = -ENODEV; + goto free_exit; + } + cap = (void *) hdr; + + p = mmap(NULL, nv2reg->size, PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_SHARED, vdev->vbasedev.fd, nv2reg->offset); + if (p == MAP_FAILED) { + ret = -errno; + goto free_exit; + } + + quirk = vfio_quirk_alloc(1); + memory_region_init_ram_ptr(&quirk->mem[0], OBJECT(vdev), "nvlink2-mr", + nv2reg->size, p); + QLIST_INSERT_HEAD(&vdev->bars[0].quirks, quirk, next); + + object_property_add(OBJECT(vdev), "nvlink2-tgt", "uint64", + vfio_pci_nvlink2_get_tgt, NULL, NULL, + (void *) (uintptr_t) cap->tgt, NULL); + trace_vfio_pci_nvidia_gpu_setup_quirk(vdev->vbasedev.name, cap->tgt, + nv2reg->size); +free_exit: + g_free(nv2reg); + + return ret; +} + +int vfio_pci_nvlink2_init(VFIOPCIDevice *vdev, Error **errp) +{ + int ret; + void *p; + struct vfio_region_info *atsdreg = NULL; + struct vfio_info_cap_header *hdr; + struct vfio_region_info_cap_nvlink2_ssatgt *captgt; + struct vfio_region_info_cap_nvlink2_lnkspd *capspeed; + VFIOQuirk *quirk; + + ret = vfio_get_dev_region_info(&vdev->vbasedev, + VFIO_REGION_TYPE_PCI_VENDOR_TYPE | + PCI_VENDOR_ID_IBM, + VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD, + &atsdreg); + if (ret) { + return ret; + } + + hdr = vfio_get_region_info_cap(atsdreg, + VFIO_REGION_INFO_CAP_NVLINK2_SSATGT); + if (!hdr) { + ret = -ENODEV; + goto free_exit; + } + captgt = (void *) hdr; + + hdr = vfio_get_region_info_cap(atsdreg, + VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD); + if (!hdr) { + ret = -ENODEV; + goto free_exit; + } + capspeed = (void *) hdr; + + /* Some NVLink bridges may not have assigned ATSD */ + if (atsdreg->size) { + p = mmap(NULL, atsdreg->size, PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_SHARED, vdev->vbasedev.fd, atsdreg->offset); + if (p == MAP_FAILED) { + ret = -errno; + goto free_exit; + } + + quirk = vfio_quirk_alloc(1); + memory_region_init_ram_device_ptr(&quirk->mem[0], OBJECT(vdev), + "nvlink2-atsd-mr", atsdreg->size, p); + QLIST_INSERT_HEAD(&vdev->bars[0].quirks, quirk, next); + } + + object_property_add(OBJECT(vdev), "nvlink2-tgt", "uint64", + vfio_pci_nvlink2_get_tgt, NULL, NULL, + (void *) (uintptr_t) captgt->tgt, NULL); + trace_vfio_pci_nvlink2_setup_quirk_ssatgt(vdev->vbasedev.name, captgt->tgt, + atsdreg->size); + + object_property_add(OBJECT(vdev), "nvlink2-link-speed", "uint32", + vfio_pci_nvlink2_get_link_speed, NULL, NULL, + (void *) (uintptr_t) capspeed->link_speed, NULL); + trace_vfio_pci_nvlink2_setup_quirk_lnkspd(vdev->vbasedev.name, + capspeed->link_speed); +free_exit: + g_free(atsdreg); + + return ret; +} diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 0142819ea6..8cecb53d5c 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -3086,6 +3086,20 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) } } + if (vdev->vendor_id == PCI_VENDOR_ID_NVIDIA) { + ret = vfio_pci_nvidia_v100_ram_init(vdev, errp); + if (ret && ret != -ENODEV) { + error_report("Failed to setup NVIDIA V100 GPU RAM"); + } + } + + if (vdev->vendor_id == PCI_VENDOR_ID_IBM) { + ret = vfio_pci_nvlink2_init(vdev, errp); + if (ret && ret != -ENODEV) { + error_report("Failed to setup NVlink2 bridge"); + } + } + vfio_register_err_notifier(vdev); vfio_register_req_notifier(vdev); vfio_setup_resetfn_quirk(vdev); diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index c11c3f1670..cfcd1a81b8 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -196,6 +196,8 @@ int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp); int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev, struct vfio_region_info *info, Error **errp); +int vfio_pci_nvidia_v100_ram_init(VFIOPCIDevice *vdev, Error **errp); +int vfio_pci_nvlink2_init(VFIOPCIDevice *vdev, Error **errp); void vfio_display_reset(VFIOPCIDevice *vdev); int vfio_display_probe(VFIOPCIDevice *vdev, Error **errp); diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c index 57fe758e54..96c0ad9d9b 100644 --- a/hw/vfio/spapr.c +++ b/hw/vfio/spapr.c @@ -148,7 +148,7 @@ int vfio_spapr_create_window(VFIOContainer *container, uint64_t pagesize = memory_region_iommu_get_min_page_size(iommu_mr); unsigned entries, bits_total, bits_per_level, max_levels; struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) }; - long rampagesize = qemu_getrampagesize(); + long rampagesize = qemu_minrampagesize(); /* * The host might not support the guest supported IOMMU page size, diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index eb589930a5..b1ef55a33f 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -86,6 +86,10 @@ vfio_pci_igd_opregion_enabled(const char *name) "%s" vfio_pci_igd_host_bridge_enabled(const char *name) "%s" vfio_pci_igd_lpc_bridge_enabled(const char *name) "%s" +vfio_pci_nvidia_gpu_setup_quirk(const char *name, uint64_t tgt, uint64_t size) "%s tgt=0x%"PRIx64" size=0x%"PRIx64 +vfio_pci_nvlink2_setup_quirk_ssatgt(const char *name, uint64_t tgt, uint64_t size) "%s tgt=0x%"PRIx64" size=0x%"PRIx64 +vfio_pci_nvlink2_setup_quirk_lnkspd(const char *name, uint32_t link_speed) "%s link_speed=0x%x" + # common.c vfio_region_write(const char *name, int index, uint64_t addr, uint64_t data, unsigned size) " (%s:region%d+0x%"PRIx64", 0x%"PRIx64 ", %d)" vfio_region_read(char *name, int index, uint64_t addr, unsigned size, uint64_t data) " (%s:region%d+0x%"PRIx64", %d) = 0x%"PRIx64 |