summary refs log tree commit diff stats
path: root/hw/ppc
diff options
context:
space:
mode:
Diffstat (limited to 'hw/ppc')
-rw-r--r-- hw/ppc/Makefile.objs | 2
-rw-r--r-- hw/ppc/pnv.c | 411
-rw-r--r-- hw/ppc/pnv_bmc.c | 122
-rw-r--r-- hw/ppc/pnv_core.c | 27
-rw-r--r-- hw/ppc/pnv_lpc.c | 106
-rw-r--r-- hw/ppc/pnv_occ.c | 136
-rw-r--r-- hw/ppc/pnv_psi.c | 571
-rw-r--r-- hw/ppc/spapr.c | 371
-rw-r--r-- hw/ppc/spapr_cpu_core.c | 17
-rw-r--r-- hw/ppc/spapr_events.c | 2
-rw-r--r-- hw/ppc/spapr_hcall.c | 174
-rw-r--r-- hw/ppc/spapr_iommu.c | 8
-rw-r--r-- hw/ppc/spapr_pci.c | 8
-rw-r--r-- hw/ppc/spapr_rtc.c | 41
14 files changed, 1757 insertions, 239 deletions
diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
index 001293423c..7efc686748 100644
--- a/hw/ppc/Makefile.objs
+++ b/hw/ppc/Makefile.objs
@@ -6,7 +6,7 @@ obj-$(CONFIG_PSERIES) += spapr_hcall.o spapr_iommu.o spapr_rtas.o
 obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_rtc.o spapr_drc.o spapr_rng.o
 obj-$(CONFIG_PSERIES) += spapr_cpu_core.o spapr_ovec.o
 # IBM PowerNV
-obj-$(CONFIG_POWERNV) += pnv.o pnv_xscom.o pnv_core.o pnv_lpc.o
+obj-$(CONFIG_POWERNV) += pnv.o pnv_xscom.o pnv_core.o pnv_lpc.o pnv_psi.o pnv_occ.o pnv_bmc.o
 ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy)
 obj-y += spapr_pci_vfio.o
 endif
diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
index 3fa722af82..d4bcdb027f 100644
--- a/hw/ppc/pnv.c
+++ b/hw/ppc/pnv.c
@@ -33,7 +33,11 @@
 #include "exec/address-spaces.h"
 #include "qemu/cutils.h"
 #include "qapi/visitor.h"
+#include "monitor/monitor.h"
+#include "hw/intc/intc.h"
+#include "hw/ipmi/ipmi.h"
 
+#include "hw/ppc/xics.h"
 #include "hw/ppc/pnv_xscom.h"
 
 #include "hw/isa/isa.h"
@@ -215,6 +219,55 @@ static void powernv_create_core_node(PnvChip *chip, PnvCore *pc, void *fdt)
                        servers_prop, sizeof(servers_prop))));
 }
 
+/* Add the "interrupt-controller@<addr>" node describing the ICPs of a core:
+ * one (address, 0x1000) "reg" pair per thread, and the (pir, nr_threads)
+ * pair as "ibm,interrupt-server-ranges". */
+static void powernv_populate_icp(PnvChip *chip, void *fdt, uint32_t pir,
+                                 uint32_t nr_threads)
+{
+    const char compat[] = "IBM,power8-icp\0IBM,ppc-xicp";
+    uint64_t base = PNV_ICP_BASE(chip) | (pir << 12);
+    uint32_t servers[2] = { cpu_to_be32(pir), cpu_to_be32(nr_threads) };
+    uint32_t reg_len = sizeof(uint64_t) * 2 * nr_threads;
+    uint64_t *windows = g_malloc(reg_len);
+    char *node_name = g_strdup_printf("interrupt-controller@%"PRIX64, base);
+    uint32_t t;
+    int node;
+
+    /* All cells are stored big-endian */
+    for (t = 0; t < nr_threads; t++) {
+        windows[2 * t] = cpu_to_be64(base | ((pir + t) * 0x1000));
+        windows[2 * t + 1] = cpu_to_be64(0x1000);
+    }
+
+    node = fdt_add_subnode(fdt, 0, node_name);
+    _FDT(node);
+    g_free(node_name);
+
+    _FDT((fdt_setprop(fdt, node, "compatible", compat, sizeof(compat))));
+    _FDT((fdt_setprop(fdt, node, "reg", windows, reg_len)));
+    _FDT((fdt_setprop_string(fdt, node, "device_type",
+                              "PowerPC-External-Interrupt-Presentation")));
+    _FDT((fdt_setprop(fdt, node, "interrupt-controller", NULL, 0)));
+    _FDT((fdt_setprop(fdt, node, "ibm,interrupt-server-ranges",
+                       servers, sizeof(servers))));
+    _FDT((fdt_setprop_cell(fdt, node, "#interrupt-cells", 1)));
+    _FDT((fdt_setprop_cell(fdt, node, "#address-cells", 0)));
+    g_free(windows);
+}
+
+/* Offset of the "isa" (LPC) node below this chip's "xscom" node in 'fdt' */
+static int pnv_chip_lpc_offset(PnvChip *chip, void *fdt)
+{
+    char *path = g_strdup_printf("/xscom@%" PRIx64 "/isa@%x",
+                                 (uint64_t) PNV_XSCOM_BASE(chip),
+                                 PNV_XSCOM_LPC_BASE);
+    int lpc_off = fdt_path_offset(fdt, path);
+
+    g_free(path);
+    return lpc_off;
+}
+
 static void powernv_populate_chip(PnvChip *chip, void *fdt)
 {
     PnvChipClass *pcc = PNV_CHIP_GET_CLASS(chip);
@@ -224,10 +277,24 @@ static void powernv_populate_chip(PnvChip *chip, void *fdt)
 
     pnv_xscom_populate(chip, fdt, 0);
 
+    /* The default LPC bus of a multichip system is on chip 0. It's
+     * recognized by the firmware (skiboot) using a "primary"
+     * property.
+     */
+    if (chip->chip_id == 0x0) {
+        int lpc_offset = pnv_chip_lpc_offset(chip, fdt);
+
+        _FDT((fdt_setprop(fdt, lpc_offset, "primary", NULL, 0)));
+    }
+
     for (i = 0; i < chip->nr_cores; i++) {
         PnvCore *pnv_core = PNV_CORE(chip->cores + i * typesize);
 
         powernv_create_core_node(chip, pnv_core, fdt);
+
+        /* Interrupt Control Presenters (ICP). One per core. */
+        powernv_populate_icp(chip, fdt, pnv_core->pir,
+                             CPU_CORE(pnv_core)->nr_threads);
     }
 
     if (chip->ram_size) {
@@ -237,6 +304,127 @@ static void powernv_populate_chip(PnvChip *chip, void *fdt)
     g_free(typename);
 }
 
+/* Describe the RTC ISA device 'd' as a child of the LPC node at 'lpc_off' */
+static void powernv_populate_rtc(ISADevice *d, void *fdt, int lpc_off)
+{
+    uint32_t port = d->ioport_id;
+    /* "reg" cells as written: 1, I/O port base, 2 */
+    uint32_t reg[] = {
+        cpu_to_be32(1),
+        cpu_to_be32(port),
+        cpu_to_be32(2)
+    };
+    char *unit_name = g_strdup_printf("%s@i%x", qdev_fw_name(DEVICE(d)), port);
+    int rtc_node = fdt_add_subnode(fdt, lpc_off, unit_name);
+
+    _FDT(rtc_node);
+    g_free(unit_name);
+
+    _FDT((fdt_setprop(fdt, rtc_node, "reg", reg, sizeof(reg))));
+    _FDT((fdt_setprop_string(fdt, rtc_node, "compatible", "pnpPNP,b00")));
+}
+
+/* Add the serial ISA device 'd' as a child of the LPC node at 'lpc_off' */
+static void powernv_populate_serial(ISADevice *d, void *fdt, int lpc_off)
+{
+    const char compatible[] = "ns16550\0pnpPNP,501";
+    uint32_t port = d->ioport_id;
+    /* "reg" cells as written: 1, I/O port base, 8 */
+    uint32_t reg[] = {
+        cpu_to_be32(1),
+        cpu_to_be32(port),
+        cpu_to_be32(8)
+    };
+    char *unit_name = g_strdup_printf("%s@i%x", qdev_fw_name(DEVICE(d)), port);
+    int uart = fdt_add_subnode(fdt, lpc_off, unit_name);
+
+    _FDT(uart);
+    g_free(unit_name);
+
+    _FDT((fdt_setprop(fdt, uart, "reg", reg, sizeof(reg))));
+    _FDT((fdt_setprop(fdt, uart, "compatible", compatible,
+                      sizeof(compatible))));
+
+    _FDT((fdt_setprop_cell(fdt, uart, "clock-frequency", 1843200)));
+    _FDT((fdt_setprop_cell(fdt, uart, "current-speed", 115200)));
+    _FDT((fdt_setprop_cell(fdt, uart, "interrupts", d->isairq[0])));
+    _FDT((fdt_setprop_cell(fdt, uart, "interrupt-parent",
+                           fdt_get_phandle(fdt, lpc_off))));
+
+    /* This is needed by Linux */
+    _FDT((fdt_setprop_string(fdt, uart, "device_type", "serial")));
+}
+
+/* Add the IPMI BT device 'd' as a child of the LPC node at 'lpc_off'. The I/O
+ * base and IRQ are read from the device's "ioport" and "irq" properties. */
+static void powernv_populate_ipmi_bt(ISADevice *d, void *fdt, int lpc_off)
+{
+    const char compatible[] = "bt\0ipmi-bt";
+    uint32_t io_base = object_property_get_int(OBJECT(d), "ioport",
+                                               &error_fatal);
+    uint32_t irq = object_property_get_int(OBJECT(d), "irq", &error_fatal);
+    uint32_t io_regs[] = {
+        cpu_to_be32(1),
+        cpu_to_be32(io_base),
+        cpu_to_be32(3)
+    };
+    char *name;
+    int node;
+
+    name = g_strdup_printf("%s@i%x", qdev_fw_name(DEVICE(d)), io_base);
+    node = fdt_add_subnode(fdt, lpc_off, name);
+    _FDT(node);
+    g_free(name);
+
+    /* Checked with _FDT() like every other libfdt call in this file */
+    _FDT((fdt_setprop(fdt, node, "reg", io_regs, sizeof(io_regs))));
+    _FDT((fdt_setprop(fdt, node, "compatible", compatible,
+                      sizeof(compatible))));
+
+    /* Mark it as reserved to avoid Linux trying to claim it */
+    _FDT((fdt_setprop_string(fdt, node, "status", "reserved")));
+    _FDT((fdt_setprop_cell(fdt, node, "interrupts", irq)));
+    _FDT((fdt_setprop_cell(fdt, node, "interrupt-parent",
+                           fdt_get_phandle(fdt, lpc_off))));
+}
+
+typedef struct ForeachPopulateArgs {
+    void *fdt;      /* device tree blob the nodes are added to */
+    int offset;     /* offset of the LPC node used as parent */
+} ForeachPopulateArgs;
+
+/* qbus_walk_children() callback: add the FDT node for one ISA device */
+static int powernv_populate_isa_device(DeviceState *dev, void *opaque)
+{
+    ForeachPopulateArgs *ctx = opaque;
+    ISADevice *isa = ISA_DEVICE(dev);
+
+    if (object_dynamic_cast(OBJECT(dev), TYPE_MC146818_RTC)) {
+        powernv_populate_rtc(isa, ctx->fdt, ctx->offset);
+    } else if (object_dynamic_cast(OBJECT(dev), TYPE_ISA_SERIAL)) {
+        powernv_populate_serial(isa, ctx->fdt, ctx->offset);
+    } else if (object_dynamic_cast(OBJECT(dev), "isa-ipmi-bt")) {
+        powernv_populate_ipmi_bt(isa, ctx->fdt, ctx->offset);
+    } else {
+        error_report("unknown isa device %s@i%x", qdev_fw_name(dev),
+                     isa->ioport_id);
+    }
+    return 0;   /* always 0, whatever the device type */
+}
+
+/* Add device tree nodes below 'lpc_offset' for the devices found on the
+ * ISA bus 'bus'. */
+static void powernv_populate_isa(ISABus *bus, void *fdt, int lpc_offset)
+{
+    ForeachPopulateArgs walk_ctx = { .fdt = fdt, .offset = lpc_offset };
+
+    /* object_child_foreach() can not be used here: ISA devices are not
+     * necessarily parented to the ISA bus, so the children of the bus
+     * are walked instead */
+    qbus_walk_children(BUS(bus), powernv_populate_isa_device,
+                       NULL, NULL, NULL, &walk_ctx);
+}
+
 static void *powernv_create_fdt(MachineState *machine)
 {
     const char plat_compat[] = "qemu,powernv\0ibm,powernv";
@@ -245,6 +433,7 @@ static void *powernv_create_fdt(MachineState *machine)
     char *buf;
     int off;
     int i;
+    int lpc_offset;
 
     fdt = g_malloc0(FDT_MAX_SIZE);
     _FDT((fdt_create_empty_tree(fdt, FDT_MAX_SIZE)));
@@ -284,16 +473,49 @@ static void *powernv_create_fdt(MachineState *machine)
     for (i = 0; i < pnv->num_chips; i++) {
         powernv_populate_chip(pnv->chips[i], fdt);
     }
+
+    /* Populate ISA devices on chip 0 */
+    lpc_offset = pnv_chip_lpc_offset(pnv->chips[0], fdt);
+    powernv_populate_isa(pnv->isa_bus, fdt, lpc_offset);
+
+    if (pnv->bmc) {
+        pnv_bmc_populate_sensors(pnv->bmc, fdt);
+    }
+
     return fdt;
 }
 
+/* Powerdown notifier: hand the request over to the BMC, if there is one */
+static void pnv_powerdown_notify(Notifier *n, void *opaque)
+{
+    IPMIBmc *bmc = POWERNV_MACHINE(qdev_get_machine())->bmc;
+    if (bmc) {
+        pnv_bmc_powerdown(bmc);
+    }
+}
+
 static void ppc_powernv_reset(void)
 {
     MachineState *machine = MACHINE(qdev_get_machine());
+    PnvMachineState *pnv = POWERNV_MACHINE(machine);
     void *fdt;
+    Object *obj;
 
     qemu_devices_reset();
 
+    /* OpenPOWER systems have a BMC, which can be defined on the
+     * command line with:
+     *
+     *   -device ipmi-bmc-sim,id=bmc0
+     *
+     * This is the internal simulator but it could also be an external
+     * BMC.
+     */
+    obj = object_resolve_path_type("", TYPE_IPMI_BMC, NULL);
+    if (obj) {
+        pnv->bmc = IPMI_BMC(obj);
+    }
+
     fdt = powernv_create_fdt(machine);
 
     /* Pack resulting tree */
@@ -302,29 +524,6 @@ static void ppc_powernv_reset(void)
     cpu_physical_memory_write(PNV_FDT_ADDR, fdt, fdt_totalsize(fdt));
 }
 
-/* If we don't use the built-in LPC interrupt deserializer, we need
- * to provide a set of qirqs for the ISA bus or things will go bad.
- *
- * Most machines using pre-Naples chips (without said deserializer)
- * have a CPLD that will collect the SerIRQ and shoot them as a
- * single level interrupt to the P8 chip. So let's setup a hook
- * for doing just that.
- *
- * Note: The actual interrupt input isn't emulated yet, this will
- * come with the PSI bridge model.
- */
-static void pnv_lpc_isa_irq_handler_cpld(void *opaque, int n, int level)
-{
-    /* We don't yet emulate the PSI bridge which provides the external
-     * interrupt, so just drop interrupts on the floor
-     */
-}
-
-static void pnv_lpc_isa_irq_handler(void *opaque, int n, int level)
-{
-     /* XXX TODO */
-}
-
 static ISABus *pnv_isa_create(PnvChip *chip)
 {
     PnvLpcController *lpc = &chip->lpc;
@@ -339,16 +538,7 @@ static ISABus *pnv_isa_create(PnvChip *chip)
     isa_bus = isa_bus_new(NULL, &lpc->isa_mem, &lpc->isa_io,
                           &error_fatal);
 
-    /* Not all variants have a working serial irq decoder. If not,
-     * handling of LPC interrupts becomes a platform issue (some
-     * platforms have a CPLD to do it).
-     */
-    if (pcc->chip_type == PNV_CHIP_POWER8NVL) {
-        irqs = qemu_allocate_irqs(pnv_lpc_isa_irq_handler, chip, ISA_NUM_IRQS);
-    } else {
-        irqs = qemu_allocate_irqs(pnv_lpc_isa_irq_handler_cpld, chip,
-                                  ISA_NUM_IRQS);
-    }
+    irqs = pnv_lpc_isa_irq_create(lpc, pcc->chip_type, ISA_NUM_IRQS);
 
     isa_bus_irqs(isa_bus, irqs);
     return isa_bus;
@@ -457,6 +647,11 @@ static void ppc_powernv_init(MachineState *machine)
 
     /* Create an RTC ISA device too */
     rtc_init(pnv->isa_bus, 2000, NULL);
+
+    /* OpenPOWER systems use an IPMI SEL Event message to notify the
+     * host to powerdown */
+    pnv->powerdown_notifier.notify = pnv_powerdown_notify;
+    qemu_register_powerdown_notifier(&pnv->powerdown_notifier);
 }
 
 /*
@@ -638,6 +833,52 @@ static void pnv_chip_init(Object *obj)
 
     object_initialize(&chip->lpc, sizeof(chip->lpc), TYPE_PNV_LPC);
     object_property_add_child(obj, "lpc", OBJECT(&chip->lpc), NULL);
+
+    object_initialize(&chip->psi, sizeof(chip->psi), TYPE_PNV_PSI);
+    object_property_add_child(obj, "psi", OBJECT(&chip->psi), NULL);
+    object_property_add_const_link(OBJECT(&chip->psi), "xics",
+                                   OBJECT(qdev_get_machine()), &error_abort);
+
+    object_initialize(&chip->occ, sizeof(chip->occ), TYPE_PNV_OCC);
+    object_property_add_child(obj, "occ", OBJECT(&chip->occ), NULL);
+    object_property_add_const_link(OBJECT(&chip->occ), "psi",
+                                   OBJECT(&chip->psi), &error_abort);
+
+    /* The LPC controller needs PSI to generate interrupts */
+    object_property_add_const_link(OBJECT(&chip->lpc), "psi",
+                                   OBJECT(&chip->psi), &error_abort);
+}
+
+static void pnv_chip_icp_realize(PnvChip *chip, Error **errp)
+{
+    PnvChipClass *pcc = PNV_CHIP_GET_CLASS(chip);
+    char *typename = pnv_core_typename(pcc->cpu_model);
+    size_t typesize = object_type_get_instance_size(typename);
+    int i, j;
+    char *name;
+    XICSFabric *xi = XICS_FABRIC(qdev_get_machine());
+    /* NOTE(review): 'errp' is never written in this function */
+    name = g_strdup_printf("icp-%x", chip->chip_id);
+    memory_region_init(&chip->icp_mmio, OBJECT(chip), name, PNV_ICP_SIZE);
+    sysbus_init_mmio(SYS_BUS_DEVICE(chip), &chip->icp_mmio);
+    g_free(name);
+    /* NOTE(review): index 1 presumes one MMIO region was registered before */
+    sysbus_mmio_map(SYS_BUS_DEVICE(chip), 1, PNV_ICP_BASE(chip));
+
+    /* Map each thread's ICP registers at offset (pir << 12) of icp_mmio */
+    for (i = 0; i < chip->nr_cores; i++) {
+        PnvCore *pnv_core = PNV_CORE(chip->cores + i * typesize);
+        int core_hwid = CPU_CORE(pnv_core)->core_id;
+
+        for (j = 0; j < CPU_CORE(pnv_core)->nr_threads; j++) {
+            uint32_t pir = pcc->core_pir(chip, core_hwid) + j;
+            PnvICPState *icp = PNV_ICP(xics_icp_get(xi, pir));
+
+            memory_region_add_subregion(&chip->icp_mmio, pir << 12, &icp->mmio);
+        }
+    }
+
+    g_free(typename);
 }
 
 static void pnv_chip_realize(DeviceState *dev, Error **errp)
@@ -691,6 +932,8 @@ static void pnv_chip_realize(DeviceState *dev, Error **errp)
         object_property_set_int(OBJECT(pnv_core),
                                 pcc->core_pir(chip, core_hwid),
                                 "pir", &error_fatal);
+        object_property_add_const_link(OBJECT(pnv_core), "xics",
+                                       qdev_get_machine(), &error_fatal);
         object_property_set_bool(OBJECT(pnv_core), true, "realized",
                                  &error_fatal);
         object_unref(OBJECT(pnv_core));
@@ -708,6 +951,32 @@ static void pnv_chip_realize(DeviceState *dev, Error **errp)
     object_property_set_bool(OBJECT(&chip->lpc), true, "realized",
                              &error_fatal);
     pnv_xscom_add_subregion(chip, PNV_XSCOM_LPC_BASE, &chip->lpc.xscom_regs);
+
+    /* Interrupt Management Area. This is the memory region holding
+     * all the Interrupt Control Presenter (ICP) registers */
+    pnv_chip_icp_realize(chip, &error);
+    if (error) {
+        error_propagate(errp, error);
+        return;
+    }
+
+    /* Processor Service Interface (PSI) Host Bridge */
+    object_property_set_int(OBJECT(&chip->psi), PNV_PSIHB_BASE(chip),
+                            "bar", &error_fatal);
+    object_property_set_bool(OBJECT(&chip->psi), true, "realized", &error);
+    if (error) {
+        error_propagate(errp, error);
+        return;
+    }
+    pnv_xscom_add_subregion(chip, PNV_XSCOM_PSIHB_BASE, &chip->psi.xscom_regs);
+
+    /* Create the simplified OCC model */
+    object_property_set_bool(OBJECT(&chip->occ), true, "realized", &error);
+    if (error) {
+        error_propagate(errp, error);
+        return;
+    }
+    pnv_xscom_add_subregion(chip, PNV_XSCOM_OCC_BASE, &chip->occ.xscom_regs);
 }
 
 static Property pnv_chip_properties[] = {
@@ -723,6 +992,7 @@ static void pnv_chip_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
 
+    set_bit(DEVICE_CATEGORY_CPU, dc->categories);
     dc->realize = pnv_chip_realize;
     dc->props = pnv_chip_properties;
     dc->desc = "PowerNV Chip";
@@ -737,6 +1007,70 @@ static const TypeInfo pnv_chip_info = {
     .abstract      = true,
 };
 
+/* ics_get hook: first chip PSI ICS for which 'irq' is valid, else NULL */
+static ICSState *pnv_ics_get(XICSFabric *xi, int irq)
+{
+    PnvMachineState *machine = POWERNV_MACHINE(xi);
+    int n;
+    for (n = 0; n < machine->num_chips; n++) {
+        if (ics_valid_irq(&machine->chips[n]->psi.ics, irq)) {
+            return &machine->chips[n]->psi.ics;
+        }
+    }
+    return NULL;
+}
+
+/* ics_resend hook: call ics_resend() on the PSI ICS of every chip */
+static void pnv_ics_resend(XICSFabric *xi)
+{
+    PnvMachineState *machine = POWERNV_MACHINE(xi);
+    int n;
+    for (n = 0; n < machine->num_chips; n++) {
+        ics_resend(&machine->chips[n]->psi.ics);
+    }
+}
+
+/* Find the CPU whose SPR_PIR default value equals 'pir'; NULL if none */
+static PowerPCCPU *ppc_get_vcpu_by_pir(int pir)
+{
+    CPUState *it;
+
+    CPU_FOREACH(it) {
+        PowerPCCPU *candidate = POWERPC_CPU(it);
+
+        if (candidate->env.spr_cb[SPR_PIR].default_value != pir) {
+            continue;
+        }
+        return candidate;
+    }
+    return NULL;
+}
+
+/* icp_get hook: ICP of the CPU with this PIR, NULL when no CPU matches */
+static ICPState *pnv_icp_get(XICSFabric *xi, int pir)
+{
+    PowerPCCPU *owner = ppc_get_vcpu_by_pir(pir);
+    return owner ? ICP(owner->intc) : NULL;
+}
+
+/* print_info hook: dump every CPU's ICP, then every chip's PSI ICS */
+static void pnv_pic_print_info(InterruptStatsProvider *obj, Monitor *mon)
+{
+    PnvMachineState *machine = POWERNV_MACHINE(obj);
+    CPUState *it;
+    int n;
+
+    /* Presenters first, one per CPU */
+    CPU_FOREACH(it) {
+        icp_pic_print_info(ICP(POWERPC_CPU(it)->intc), mon);
+    }
+
+    /* Then the interrupt source embedded in each chip's PSI model */
+    for (n = 0; n < machine->num_chips; n++) {
+        ics_pic_print_info(&machine->chips[n]->psi.ics, mon);
+    }
+}
+
 static void pnv_get_num_chips(Object *obj, Visitor *v, const char *name,
                               void *opaque, Error **errp)
 {
@@ -787,6 +1121,8 @@ static void powernv_machine_class_props_init(ObjectClass *oc)
 static void powernv_machine_class_init(ObjectClass *oc, void *data)
 {
     MachineClass *mc = MACHINE_CLASS(oc);
+    XICSFabricClass *xic = XICS_FABRIC_CLASS(oc);
+    InterruptStatsProviderClass *ispc = INTERRUPT_STATS_PROVIDER_CLASS(oc);
 
     mc->desc = "IBM PowerNV (Non-Virtualized)";
     mc->init = ppc_powernv_init;
@@ -797,6 +1133,10 @@ static void powernv_machine_class_init(ObjectClass *oc, void *data)
     mc->no_parallel = 1;
     mc->default_boot_order = NULL;
     mc->default_ram_size = 1 * G_BYTE;
+    xic->icp_get = pnv_icp_get;
+    xic->ics_get = pnv_ics_get;
+    xic->ics_resend = pnv_ics_resend;
+    ispc->print_info = pnv_pic_print_info;
 
     powernv_machine_class_props_init(oc);
 }
@@ -807,6 +1147,11 @@ static const TypeInfo powernv_machine_info = {
     .instance_size = sizeof(PnvMachineState),
     .instance_init = powernv_machine_initfn,
     .class_init    = powernv_machine_class_init,
+    .interfaces = (InterfaceInfo[]) {
+        { TYPE_XICS_FABRIC },
+        { TYPE_INTERRUPT_STATS_PROVIDER },
+        { },
+    },
 };
 
 static void powernv_machine_register_types(void)
diff --git a/hw/ppc/pnv_bmc.c b/hw/ppc/pnv_bmc.c
new file mode 100644
index 0000000000..7b60b4c360
--- /dev/null
+++ b/hw/ppc/pnv_bmc.c
@@ -0,0 +1,122 @@
+/*
+ * QEMU PowerNV, BMC related functions
+ *
+ * Copyright (c) 2016-2017, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "hw/hw.h"
+#include "sysemu/sysemu.h"
+#include "target/ppc/cpu.h"
+#include "qapi/error.h"
+#include "qemu/log.h"
+#include "hw/ipmi/ipmi.h"
+#include "hw/ppc/fdt.h"
+
+#include "hw/ppc/pnv.h"
+
+#include <libfdt.h>
+
+/* TODO: include definition in ipmi.h */
+#define IPMI_SDR_FULL_TYPE 1
+
+/*
+ * OEM SEL Event data packet sent by BMC in response to a Read Event
+ * Message Buffer command
+ */
+typedef struct OemSel {
+    /* SEL header (10 bytes) */
+    uint8_t id[2];
+    uint8_t type;           /* pnv_gen_oem_sel() uses 0xC0 (OEM) */
+    uint8_t timestamp[4];
+    uint8_t manuf_id[3];
+
+    /* OEM SEL data (6 bytes) follows */
+    uint8_t netfun;
+    uint8_t cmd;
+    uint8_t data[4];        /* data[0] is pnv_gen_oem_sel()'s 'reboot' code */
+} OemSel;
+
+#define SOFT_OFF        0x00
+#define SOFT_REBOOT     0x01
+
+/* Generate the 16-byte OEM SEL power event carrying 'reboot' on the BMC */
+static void pnv_gen_oem_sel(IPMIBmc *bmc, uint8_t reboot)
+{
+    OemSel event = {
+        .id        = { 0x55, 0x55 },
+        .type      = 0xC0, /* OEM */
+        .timestamp = { 0 },
+        .manuf_id  = { 0 },
+        .netfun    = 0x3A, /* IBM */
+        .cmd       = 0x04, /* AMI OEM SEL Power Notification */
+        .data      = { reboot, 0xFF, 0xFF, 0xFF },
+    };
+
+    ipmi_bmc_gen_event(bmc, (uint8_t *) &event, 0 /* do not log the event */);
+}
+
+void pnv_bmc_powerdown(IPMIBmc *bmc)
+{
+    pnv_gen_oem_sel(bmc, SOFT_OFF); /* OEM SEL event with the soft-off code */
+}
+
+/* Add /bmc/sensors to 'fdt', with one node per compact or full SDR of 'bmc' */
+void pnv_bmc_populate_sensors(IPMIBmc *bmc, void *fdt)
+{
+    int offset, i;
+    const struct ipmi_sdr_compact *sdr;
+    uint16_t nextrec;
+
+    /* A node name is a single path component: it can not contain '/' */
+    offset = fdt_add_subnode(fdt, 0, "bmc");
+    _FDT(offset);
+
+    _FDT((fdt_setprop_string(fdt, offset, "name", "bmc")));
+    _FDT((fdt_setprop_cell(fdt, offset, "#address-cells", 0x1)));
+    _FDT((fdt_setprop_cell(fdt, offset, "#size-cells", 0x0)));
+
+    offset = fdt_add_subnode(fdt, offset, "sensors");
+    _FDT(offset);
+
+    _FDT((fdt_setprop_cell(fdt, offset, "#address-cells", 0x1)));
+    _FDT((fdt_setprop_cell(fdt, offset, "#size-cells", 0x0)));
+
+    for (i = 0; !ipmi_bmc_sdr_find(bmc, i, &sdr, &nextrec); i++) {
+        int off;
+        char *name;
+
+        if (sdr->header.rec_type != IPMI_SDR_COMPACT_TYPE &&
+            sdr->header.rec_type != IPMI_SDR_FULL_TYPE) {
+            continue;
+        }
+
+        name = g_strdup_printf("sensor@%x", sdr->sensor_owner_number);
+        off = fdt_add_subnode(fdt, offset, name);
+        _FDT(off);
+        g_free(name);
+
+        _FDT((fdt_setprop_cell(fdt, off, "reg", sdr->sensor_owner_number)));
+        _FDT((fdt_setprop_string(fdt, off, "name", "sensor")));
+        _FDT((fdt_setprop_string(fdt, off, "compatible", "ibm,ipmi-sensor")));
+        _FDT((fdt_setprop_cell(fdt, off, "ipmi-sensor-reading-type",
+                               sdr->reading_type)));
+        _FDT((fdt_setprop_cell(fdt, off, "ipmi-entity-id", sdr->entity_id)));
+        _FDT((fdt_setprop_cell(fdt, off, "ipmi-entity-instance",
+                               sdr->entity_instance)));
+        _FDT((fdt_setprop_cell(fdt, off, "ipmi-sensor-type",
+                               sdr->sensor_type)));
+    }
+}
diff --git a/hw/ppc/pnv_core.c b/hw/ppc/pnv_core.c
index d79d530b48..1b7ec70f03 100644
--- a/hw/ppc/pnv_core.c
+++ b/hw/ppc/pnv_core.c
@@ -25,6 +25,7 @@
 #include "hw/ppc/pnv.h"
 #include "hw/ppc/pnv_core.h"
 #include "hw/ppc/pnv_xscom.h"
+#include "hw/ppc/xics.h"
 
 static void powernv_cpu_reset(void *opaque)
 {
@@ -110,23 +111,37 @@ static const MemoryRegionOps pnv_core_xscom_ops = {
     .endianness = DEVICE_BIG_ENDIAN,
 };
 
-static void pnv_core_realize_child(Object *child, Error **errp)
+static void pnv_core_realize_child(Object *child, XICSFabric *xi, Error **errp)
 {
     Error *local_err = NULL;
     CPUState *cs = CPU(child);
     PowerPCCPU *cpu = POWERPC_CPU(cs);
+    Object *obj;
+
+    obj = object_new(TYPE_PNV_ICP);
+    object_property_add_child(OBJECT(cpu), "icp", obj, NULL);
+    object_property_add_const_link(obj, "xics", OBJECT(xi), &error_abort);
+    object_property_set_bool(obj, true, "realized", &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
 
     object_property_set_bool(child, true, "realized", &local_err);
     if (local_err) {
+        object_unparent(obj);
         error_propagate(errp, local_err);
         return;
     }
 
     powernv_cpu_init(cpu, &local_err);
     if (local_err) {
+        object_unparent(obj);
         error_propagate(errp, local_err);
         return;
     }
+
+    xics_cpu_setup(xi, cpu, ICP(obj));
 }
 
 static void pnv_core_realize(DeviceState *dev, Error **errp)
@@ -140,6 +155,14 @@ static void pnv_core_realize(DeviceState *dev, Error **errp)
     void *obj;
     int i, j;
     char name[32];
+    Object *xi;
+
+    xi = object_property_get_link(OBJECT(dev), "xics", &local_err);
+    if (!xi) {
+        error_setg(errp, "%s: required link 'xics' not found: %s",
+                   __func__, error_get_pretty(local_err));
+        return;
+    }
 
     pc->threads = g_malloc0(size * cc->nr_threads);
     for (i = 0; i < cc->nr_threads; i++) {
@@ -160,7 +183,7 @@ static void pnv_core_realize(DeviceState *dev, Error **errp)
     for (j = 0; j < cc->nr_threads; j++) {
         obj = pc->threads + j * size;
 
-        pnv_core_realize_child(obj, &local_err);
+        pnv_core_realize_child(obj, XICS_FABRIC(xi), &local_err);
         if (local_err) {
             goto err;
         }
diff --git a/hw/ppc/pnv_lpc.c b/hw/ppc/pnv_lpc.c
index 78db52415b..f03a80a29b 100644
--- a/hw/ppc/pnv_lpc.c
+++ b/hw/ppc/pnv_lpc.c
@@ -92,14 +92,6 @@ enum {
 #define LPC_HC_REGS_OPB_SIZE    0x00001000
 
 
-/*
- * TODO: the "primary" cell should only be added on chip 0. This is
- * how skiboot chooses the default LPC controller on multichip
- * systems.
- *
- * It would be easly done if we can change the populate() interface to
- * replace the PnvXScomInterface parameter by a PnvChip one
- */
 static int pnv_lpc_populate(PnvXScomInterface *dev, void *fdt, int xscom_offset)
 {
     const char compat[] = "ibm,power8-lpc\0ibm,lpc";
@@ -119,7 +111,6 @@ static int pnv_lpc_populate(PnvXScomInterface *dev, void *fdt, int xscom_offset)
     _FDT((fdt_setprop(fdt, offset, "reg", reg, sizeof(reg))));
     _FDT((fdt_setprop_cell(fdt, offset, "#address-cells", 2)));
     _FDT((fdt_setprop_cell(fdt, offset, "#size-cells", 1)));
-    _FDT((fdt_setprop(fdt, offset, "primary", NULL, 0)));
     _FDT((fdt_setprop(fdt, offset, "compatible", compat, sizeof(compat))));
     return 0;
 }
@@ -250,6 +241,34 @@ static const MemoryRegionOps pnv_lpc_xscom_ops = {
     .endianness = DEVICE_BIG_ENDIAN,
 };
 
+/* Recompute the LPC -> OPB -> PSI interrupt chain from the register state.
+ * Used after writes to the IRQ related registers and when a SerIRQ level
+ * is latched.
+ */
+static void pnv_lpc_eval_irqs(PnvLpcController *lpc)
+{
+    bool serirq_enabled = (lpc->lpc_hc_irqser_ctrl & LPC_HC_IRQSER_EN) != 0;
+    bool unmasked = (lpc->lpc_hc_irqstat & lpc->lpc_hc_irqmask) != 0;
+
+    /* LPC controller to OPB line: raised when the SerIRQ is enabled and
+     * at least one status bit is not masked.
+     *
+     * We don't honor the polarity register, it's pointless and unused
+     * anyway
+     */
+    if (serirq_enabled && unmasked) {
+        lpc->opb_irq_input |= OPB_MASTER_IRQ_LPC;
+    } else {
+        lpc->opb_irq_input &= ~OPB_MASTER_IRQ_LPC;
+    }
+
+    /* Update OPB internal latch: bits are only ever set here */
+    lpc->opb_irq_stat |= lpc->opb_irq_input & lpc->opb_irq_mask;
+
+    /* Reflect the latch on the PSI LPC/I2C interrupt */
+    pnv_psi_irq_set(lpc->psi, PSIHB_IRQ_LPC_I2C, lpc->opb_irq_stat != 0);
+}
+
 static uint64_t lpc_hc_read(void *opaque, hwaddr addr, unsigned size)
 {
     PnvLpcController *lpc = opaque;
@@ -300,12 +319,15 @@ static void lpc_hc_write(void *opaque, hwaddr addr, uint64_t val,
         break;
     case LPC_HC_IRQSER_CTRL:
         lpc->lpc_hc_irqser_ctrl = val;
+        pnv_lpc_eval_irqs(lpc);
         break;
     case LPC_HC_IRQMASK:
         lpc->lpc_hc_irqmask = val;
+        pnv_lpc_eval_irqs(lpc);
         break;
     case LPC_HC_IRQSTAT:
         lpc->lpc_hc_irqstat &= ~val;
+        pnv_lpc_eval_irqs(lpc);
         break;
     case LPC_HC_ERROR_ADDRESS:
         break;
@@ -363,14 +385,15 @@ static void opb_master_write(void *opaque, hwaddr addr,
     switch (addr) {
     case OPB_MASTER_LS_IRQ_STAT:
         lpc->opb_irq_stat &= ~val;
+        pnv_lpc_eval_irqs(lpc);
         break;
     case OPB_MASTER_LS_IRQ_MASK:
-        /* XXX Filter out reserved bits */
         lpc->opb_irq_mask = val;
+        pnv_lpc_eval_irqs(lpc);
         break;
     case OPB_MASTER_LS_IRQ_POL:
-        /* XXX Filter out reserved bits */
         lpc->opb_irq_pol = val;
+        pnv_lpc_eval_irqs(lpc);
         break;
     case OPB_MASTER_LS_IRQ_INPUT:
         /* Read only */
@@ -398,6 +421,8 @@ static const MemoryRegionOps opb_master_ops = {
 static void pnv_lpc_realize(DeviceState *dev, Error **errp)
 {
     PnvLpcController *lpc = PNV_LPC(dev);
+    Object *obj;
+    Error *error = NULL;
 
     /* Reg inits */
     lpc->lpc_hc_fw_rd_acc_size = LPC_HC_FW_RD_4B;
@@ -441,6 +466,15 @@ static void pnv_lpc_realize(DeviceState *dev, Error **errp)
     pnv_xscom_region_init(&lpc->xscom_regs, OBJECT(dev),
                           &pnv_lpc_xscom_ops, lpc, "xscom-lpc",
                           PNV_XSCOM_LPC_SIZE);
+
+    /* get PSI object from chip */
+    obj = object_property_get_link(OBJECT(dev), "psi", &error);
+    if (!obj) {
+        error_setg(errp, "%s: required link 'psi' not found: %s",
+                   __func__, error_get_pretty(error));
+        return;
+    }
+    lpc->psi = PNV_PSI(obj);
 }
 
 static void pnv_lpc_class_init(ObjectClass *klass, void *data)
@@ -470,3 +504,53 @@ static void pnv_lpc_register_types(void)
 }
 
 type_init(pnv_lpc_register_types)
+
+/* If we don't use the built-in LPC interrupt deserializer, we need
+ * to provide a set of qirqs for the ISA bus or things will go bad.
+ *
+ * Most machines using pre-Naples chips (without said deserializer)
+ * have a CPLD that will collect the SerIRQ and shoot them as a
+ * single level interrupt to the P8 chip. So let's setup a hook
+ * for doing just that.
+ */
+static void pnv_lpc_isa_irq_handler_cpld(void *opaque, int n, int level)
+{
+    PnvMachineState *ms = POWERNV_MACHINE(qdev_get_machine());
+    PnvLpcController *lpc = PNV_LPC(opaque);
+    uint32_t before = ms->cpld_irqstate;
+
+    if (!level) {
+        ms->cpld_irqstate &= ~(1u << n);
+    } else {
+        ms->cpld_irqstate |= 1u << n;
+    }
+    /* Only a change of the collected ISA IRQ state is sent to the PSI */
+    if (ms->cpld_irqstate != before) {
+        pnv_psi_irq_set(lpc->psi, PSIHB_IRQ_EXTERNAL, ms->cpld_irqstate != 0);
+    }
+}
+
+static void pnv_lpc_isa_irq_handler(void *opaque, int n, int level)
+{
+    PnvLpcController *ctrl = PNV_LPC(opaque);
+    /* The Naples HW latches the 1 levels only; clearing is done by SW */
+    if (!level) {
+        return;
+    }
+    ctrl->lpc_hc_irqstat |= LPC_HC_IRQ_SERIRQ0 >> n;
+    pnv_lpc_eval_irqs(ctrl);
+}
+
+qemu_irq *pnv_lpc_isa_irq_create(PnvLpcController *lpc, int chip_type,
+                                 int nirqs)
+{
+    /* Not all variants have a working serial irq decoder. If not,
+     * handling of LPC interrupts becomes a platform issue (some
+     * platforms have a CPLD to do it). Hence two handlers: the LPC one
+     * for POWER8NVL chips, the CPLD one for any other chip type.
+     */
+    void (*handler)(void *, int, int) = chip_type == PNV_CHIP_POWER8NVL ?
+        pnv_lpc_isa_irq_handler : pnv_lpc_isa_irq_handler_cpld;
+
+    return qemu_allocate_irqs(handler, lpc, nirqs);
+}
diff --git a/hw/ppc/pnv_occ.c b/hw/ppc/pnv_occ.c
new file mode 100644
index 0000000000..04880f26d6
--- /dev/null
+++ b/hw/ppc/pnv_occ.c
@@ -0,0 +1,136 @@
+/*
+ * QEMU PowerPC PowerNV Emulation of a few OCC related registers
+ *
+ * Copyright (c) 2015-2017, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "hw/hw.h"
+#include "sysemu/sysemu.h"
+#include "target/ppc/cpu.h"
+#include "qapi/error.h"
+#include "qemu/log.h"
+
+#include "hw/ppc/pnv.h"
+#include "hw/ppc/pnv_xscom.h"
+#include "hw/ppc/pnv_occ.h"
+
+#define OCB_OCI_OCCMISC         0x4020
+#define OCB_OCI_OCCMISC_AND     0x4021
+#define OCB_OCI_OCCMISC_OR      0x4022
+
+static void pnv_occ_set_misc(PnvOCC *occ, uint64_t val)
+{
+    bool irq_state;
+
+    val &= 0xffff000000000000ull;   /* only the top 16 bits are kept */
+
+    occ->occmisc = val;
+    irq_state = !!(val >> 63);      /* MSB drives the OCC PSI interrupt */
+    pnv_psi_irq_set(occ->psi, PSIHB_IRQ_OCC, irq_state);
+}
+
+static uint64_t pnv_occ_xscom_read(void *opaque, hwaddr addr, unsigned size)
+{
+    PnvOCC *occ = PNV_OCC(opaque);
+    uint32_t offset = addr >> 3;    /* register index, 8 bytes per register */
+    uint64_t val = 0;               /* returned for unimplemented registers */
+
+    switch (offset) {
+    case OCB_OCI_OCCMISC:
+        val = occ->occmisc;
+        break;
+    default:
+        qemu_log_mask(LOG_UNIMP, "OCC Unimplemented register: 0x%"
+                      HWADDR_PRIx "\n", addr);
+    }
+    return val;
+}
+
+static void pnv_occ_xscom_write(void *opaque, hwaddr addr,
+                                uint64_t val, unsigned size)
+{
+    PnvOCC *occ = PNV_OCC(opaque);
+    uint32_t offset = addr >> 3;    /* register index, 8 bytes per register */
+
+    switch (offset) {
+    case OCB_OCI_OCCMISC_AND:       /* AND 'val' into the current OCCMISC */
+        pnv_occ_set_misc(occ, occ->occmisc & val);
+        break;
+    case OCB_OCI_OCCMISC_OR:        /* OR 'val' into the current OCCMISC */
+        pnv_occ_set_misc(occ, occ->occmisc | val);
+        break;
+    case OCB_OCI_OCCMISC:           /* plain overwrite */
+        pnv_occ_set_misc(occ, val);
+        break;
+    default:
+        qemu_log_mask(LOG_UNIMP, "OCC Unimplemented register: 0x%"
+                      HWADDR_PRIx "\n", addr);
+    }
+}
+
+static const MemoryRegionOps pnv_occ_xscom_ops = {
+    .read = pnv_occ_xscom_read,
+    .write = pnv_occ_xscom_write,
+    .valid.min_access_size = 8,     /* only 8-byte accesses are accepted */
+    .valid.max_access_size = 8,
+    .impl.min_access_size = 8,
+    .impl.max_access_size = 8,
+    .endianness = DEVICE_BIG_ENDIAN,
+};
+
+
+static void pnv_occ_realize(DeviceState *dev, Error **errp)
+{
+    PnvOCC *occ = PNV_OCC(dev);
+    Object *obj;
+    Error *error = NULL;            /* handed over to errp on failure */
+
+    occ->occmisc = 0;
+
+    /* get PSI object from chip */
+    obj = object_property_get_link(OBJECT(dev), "psi", &error);
+    if (!obj) {
+        error_propagate(errp, error);
+        error_prepend(errp, "%s: required link 'psi' not found: ", __func__);
+        return;
+    }
+    occ->psi = PNV_PSI(obj);
+
+    /* XScom region for OCC registers */
+    pnv_xscom_region_init(&occ->xscom_regs, OBJECT(dev), &pnv_occ_xscom_ops,
+                  occ, "xscom-occ", PNV_XSCOM_OCC_SIZE);
+}
+
+static void pnv_occ_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+
+    dc->realize = pnv_occ_realize;  /* links PSI, sets up XSCOM region */
+}
+
+static const TypeInfo pnv_occ_type_info = {
+    .name          = TYPE_PNV_OCC,
+    .parent        = TYPE_DEVICE,
+    .instance_size = sizeof(PnvOCC),
+    .class_init    = pnv_occ_class_init,
+};
+
+static void pnv_occ_register_types(void)
+{
+    type_register_static(&pnv_occ_type_info);   /* run through type_init() */
+}
+
+type_init(pnv_occ_register_types)
diff --git a/hw/ppc/pnv_psi.c b/hw/ppc/pnv_psi.c
new file mode 100644
index 0000000000..2bf5bfe3fd
--- /dev/null
+++ b/hw/ppc/pnv_psi.c
@@ -0,0 +1,571 @@
+/*
+ * QEMU PowerPC PowerNV Processor Service Interface (PSI) model
+ *
+ * Copyright (c) 2015-2017, IBM Corporation.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "hw/hw.h"
+#include "target/ppc/cpu.h"
+#include "qemu/log.h"
+#include "qapi/error.h"
+
+#include "exec/address-spaces.h"
+
+#include "hw/ppc/fdt.h"
+#include "hw/ppc/pnv.h"
+#include "hw/ppc/pnv_xscom.h"
+#include "hw/ppc/pnv_psi.h"
+
+#include <libfdt.h>
+
+#define PSIHB_XSCOM_FIR_RW      0x00
+#define PSIHB_XSCOM_FIR_AND     0x01
+#define PSIHB_XSCOM_FIR_OR      0x02
+#define PSIHB_XSCOM_FIRMASK_RW  0x03
+#define PSIHB_XSCOM_FIRMASK_AND 0x04
+#define PSIHB_XSCOM_FIRMASK_OR  0x05
+#define PSIHB_XSCOM_FIRACT0     0x06
+#define PSIHB_XSCOM_FIRACT1     0x07
+
+/* Host Bridge Base Address Register */
+#define PSIHB_XSCOM_BAR         0x0a
+#define   PSIHB_BAR_EN                  0x0000000000000001ull
+
+/* FSP Base Address Register */
+#define PSIHB_XSCOM_FSPBAR      0x0b
+
+/* PSI Host Bridge Control/Status Register */
+#define PSIHB_XSCOM_CR          0x0e
+#define   PSIHB_CR_FSP_CMD_ENABLE       0x8000000000000000ull
+#define   PSIHB_CR_FSP_MMIO_ENABLE      0x4000000000000000ull
+#define   PSIHB_CR_FSP_IRQ_ENABLE       0x1000000000000000ull
+#define   PSIHB_CR_FSP_ERR_RSP_ENABLE   0x0800000000000000ull
+#define   PSIHB_CR_PSI_LINK_ENABLE      0x0400000000000000ull
+#define   PSIHB_CR_FSP_RESET            0x0200000000000000ull
+#define   PSIHB_CR_PSIHB_RESET          0x0100000000000000ull
+#define   PSIHB_CR_PSI_IRQ              0x0000800000000000ull
+#define   PSIHB_CR_FSP_IRQ              0x0000400000000000ull
+#define   PSIHB_CR_FSP_LINK_ACTIVE      0x0000200000000000ull
+#define   PSIHB_CR_IRQ_CMD_EXPECT       0x0000010000000000ull
+          /* and more ... */
+
+/* PSIHB Status / Error Mask Register */
+#define PSIHB_XSCOM_SEMR        0x0f
+
+/* XIVR, to signal interrupts to the CEC firmware. more XIVR below. */
+#define PSIHB_XSCOM_XIVR_FSP    0x10
+#define   PSIHB_XIVR_SERVER_SH          40
+#define   PSIHB_XIVR_SERVER_MSK         (0xffffull << PSIHB_XIVR_SERVER_SH)
+#define   PSIHB_XIVR_PRIO_SH            32
+#define   PSIHB_XIVR_PRIO_MSK           (0xffull << PSIHB_XIVR_PRIO_SH)
+#define   PSIHB_XIVR_SRC_SH             29
+#define   PSIHB_XIVR_SRC_MSK            (0x7ull << PSIHB_XIVR_SRC_SH)
+#define   PSIHB_XIVR_PENDING            0x01000000ull
+
+/* PSI Host Bridge Set Control/ Status Register */
+#define PSIHB_XSCOM_SCR         0x12
+
+/* PSI Host Bridge Clear Control/ Status Register */
+#define PSIHB_XSCOM_CCR         0x13
+
+/* DMA Upper Address Register */
+#define PSIHB_XSCOM_DMA_UPADD   0x14
+
+/* Interrupt Status */
+#define PSIHB_XSCOM_IRQ_STAT    0x15
+#define   PSIHB_IRQ_STAT_OCC            0x0000001000000000ull
+#define   PSIHB_IRQ_STAT_FSI            0x0000000800000000ull
+#define   PSIHB_IRQ_STAT_LPCI2C         0x0000000400000000ull
+#define   PSIHB_IRQ_STAT_LOCERR         0x0000000200000000ull
+#define   PSIHB_IRQ_STAT_EXT            0x0000000100000000ull
+
+/* remaining XIVR */
+#define PSIHB_XSCOM_XIVR_OCC    0x16
+#define PSIHB_XSCOM_XIVR_FSI    0x17
+#define PSIHB_XSCOM_XIVR_LPCI2C 0x18
+#define PSIHB_XSCOM_XIVR_LOCERR 0x19
+#define PSIHB_XSCOM_XIVR_EXT    0x1a
+
+/* Interrupt Requester Source Compare Register */
+#define PSIHB_XSCOM_IRSN        0x1b
+#define   PSIHB_IRSN_COMP_SH            45
+#define   PSIHB_IRSN_COMP_MSK           (0x7ffffull << PSIHB_IRSN_COMP_SH)
+#define   PSIHB_IRSN_IRQ_MUX            0x0000000800000000ull
+#define   PSIHB_IRSN_IRQ_RESET          0x0000000400000000ull
+#define   PSIHB_IRSN_DOWNSTREAM_EN      0x0000000200000000ull
+#define   PSIHB_IRSN_UPSTREAM_EN        0x0000000100000000ull
+#define   PSIHB_IRSN_COMPMASK_SH        13
+#define   PSIHB_IRSN_COMPMASK_MSK       (0x7ffffull << PSIHB_IRSN_COMPMASK_SH)
+
+#define PSIHB_BAR_MASK                  0x0003fffffff00000ull
+#define PSIHB_FSPBAR_MASK               0x0003ffff00000000ull
+
+static void pnv_psi_set_bar(PnvPsi *psi, uint64_t bar)
+{
+    MemoryRegion *sysmem = get_system_memory();
+    uint64_t old = psi->regs[PSIHB_XSCOM_BAR];
+    /* Latch the new BAR, keeping only the address and enable bits */
+    psi->regs[PSIHB_XSCOM_BAR] = bar & (PSIHB_BAR_MASK | PSIHB_BAR_EN);
+
+    /* Update MR, always remove it first (mapped only if it was enabled) */
+    if (old & PSIHB_BAR_EN) {
+        memory_region_del_subregion(sysmem, &psi->regs_mr);
+    }
+
+    /* Then add it back if needed, at the address held in the new BAR */
+    if (bar & PSIHB_BAR_EN) {
+        uint64_t addr = bar & PSIHB_BAR_MASK;
+        memory_region_add_subregion(sysmem, addr, &psi->regs_mr);
+    }
+}
+
+static void pnv_psi_update_fsp_mr(PnvPsi *psi)
+{
+    /* TODO: Update FSP MR if/when we support FSP BAR (a no-op for now) */
+}
+
+static void pnv_psi_set_cr(PnvPsi *psi, uint64_t cr)
+{
+    uint64_t old = psi->regs[PSIHB_XSCOM_CR];
+
+    psi->regs[PSIHB_XSCOM_CR] = cr;     /* stored unmasked */
+
+    /* Check some bit changes: only an FSP_MMIO_ENABLE toggle is acted on */
+    if ((old ^ psi->regs[PSIHB_XSCOM_CR]) & PSIHB_CR_FSP_MMIO_ENABLE) {
+        pnv_psi_update_fsp_mr(psi);
+    }
+}
+
+static void pnv_psi_set_irsn(PnvPsi *psi, uint64_t val)
+{
+    ICSState *ics = &psi->ics;
+
+    /* In this model we ignore the up/down enable bits for now
+     * as SW doesn't use them (other than setting them at boot).
+     * We ignore IRQ_MUX, its meaning isn't clear and we don't use
+     * it and finally we ignore reset (XXX fix that ?)
+     */
+    psi->regs[PSIHB_XSCOM_IRSN] = val & (PSIHB_IRSN_COMP_MSK |
+                                         PSIHB_IRSN_IRQ_MUX |
+                                         PSIHB_IRSN_IRQ_RESET |
+                                         PSIHB_IRSN_DOWNSTREAM_EN |
+                                         PSIHB_IRSN_UPSTREAM_EN);
+
+    /* We ignore the compare mask as well, our ICS emulation is too
+     * simplistic to make any use of it, and we extract the offset
+     * from the compare value
+     */
+    ics->offset = (val & PSIHB_IRSN_COMP_MSK) >> PSIHB_IRSN_COMP_SH;
+}
+
+/*
+ * XIVR register of each PSI irq; FSP and PSI are muxed under the same number.
+ */
+static const uint32_t xivr_regs[] = {
+    [PSIHB_IRQ_PSI]       = PSIHB_XSCOM_XIVR_FSP,
+    [PSIHB_IRQ_FSP]       = PSIHB_XSCOM_XIVR_FSP,
+    [PSIHB_IRQ_OCC]       = PSIHB_XSCOM_XIVR_OCC,
+    [PSIHB_IRQ_FSI]       = PSIHB_XSCOM_XIVR_FSI,
+    [PSIHB_IRQ_LPC_I2C]   = PSIHB_XSCOM_XIVR_LPCI2C,
+    [PSIHB_IRQ_LOCAL_ERR] = PSIHB_XSCOM_XIVR_LOCERR,
+    [PSIHB_IRQ_EXTERNAL]  = PSIHB_XSCOM_XIVR_EXT,
+};
+
+static const uint32_t stat_regs[] = {   /* register holding each irq status */
+    [PSIHB_IRQ_PSI]       = PSIHB_XSCOM_CR,
+    [PSIHB_IRQ_FSP]       = PSIHB_XSCOM_CR,
+    [PSIHB_IRQ_OCC]       = PSIHB_XSCOM_IRQ_STAT,
+    [PSIHB_IRQ_FSI]       = PSIHB_XSCOM_IRQ_STAT,
+    [PSIHB_IRQ_LPC_I2C]   = PSIHB_XSCOM_IRQ_STAT,
+    [PSIHB_IRQ_LOCAL_ERR] = PSIHB_XSCOM_IRQ_STAT,
+    [PSIHB_IRQ_EXTERNAL]  = PSIHB_XSCOM_IRQ_STAT,
+};
+
+static const uint64_t stat_bits[] = {   /* status bit within stat_regs[irq] */
+    [PSIHB_IRQ_PSI]       = PSIHB_CR_PSI_IRQ,
+    [PSIHB_IRQ_FSP]       = PSIHB_CR_FSP_IRQ,
+    [PSIHB_IRQ_OCC]       = PSIHB_IRQ_STAT_OCC,
+    [PSIHB_IRQ_FSI]       = PSIHB_IRQ_STAT_FSI,
+    [PSIHB_IRQ_LPC_I2C]   = PSIHB_IRQ_STAT_LPCI2C,
+    [PSIHB_IRQ_LOCAL_ERR] = PSIHB_IRQ_STAT_LOCERR,
+    [PSIHB_IRQ_EXTERNAL]  = PSIHB_IRQ_STAT_EXT,
+};
+
+void pnv_psi_irq_set(PnvPsi *psi, PnvPsiIrq irq, bool state)
+{
+    ICSState *ics = &psi->ics;
+    uint32_t xivr_reg;
+    uint32_t stat_reg;
+    uint32_t src;
+    bool masked;
+
+    if (irq > PSIHB_IRQ_EXTERNAL) {     /* also bounds the table lookups */
+        qemu_log_mask(LOG_GUEST_ERROR, "PSI: Unsupported irq %d\n", irq);
+        return;
+    }
+
+    xivr_reg = xivr_regs[irq];
+    stat_reg = stat_regs[irq];
+    /* NOTE(review): SRC is 3 bits wide; assumed to stay within qirqs[] */
+    src = (psi->regs[xivr_reg] & PSIHB_XIVR_SRC_MSK) >> PSIHB_XIVR_SRC_SH;
+    if (state) {
+        psi->regs[stat_reg] |= stat_bits[irq];
+        /* TODO: optimization, check mask here. That means
+         * re-evaluating when unmasking
+         */
+        qemu_irq_raise(ics->qirqs[src]);
+    } else {
+        psi->regs[stat_reg] &= ~stat_bits[irq];
+
+        /* FSP and PSI are muxed so don't lower if either is still set */
+        if (stat_reg != PSIHB_XSCOM_CR ||
+            !(psi->regs[stat_reg] & (PSIHB_CR_PSI_IRQ | PSIHB_CR_FSP_IRQ))) {
+            qemu_irq_lower(ics->qirqs[src]);
+        } else {
+            state = true;               /* muxed line is still asserted */
+        }
+    }
+
+    /* Note about the emulation of the pending bit: This isn't
+     * entirely correct. The pending bit should be cleared when the
+     * EOI has been received. However, we don't have callbacks on EOI
+     * (especially not under KVM) so no way to emulate that properly,
+     * so instead we just set that bit as the logical "output" of the
+     * XIVR (ie pending & !masked)
+     *
+     * CLG: We could define a new ICS object with a custom eoi()
+     * handler to clear the pending bit. But I am not sure this would
+     * be useful for the software anyhow.
+     */
+    masked = (psi->regs[xivr_reg] & PSIHB_XIVR_PRIO_MSK) == PSIHB_XIVR_PRIO_MSK;
+    if (state && !masked) {
+        psi->regs[xivr_reg] |= PSIHB_XIVR_PENDING;
+    } else {
+        psi->regs[xivr_reg] &= ~PSIHB_XIVR_PENDING;
+    }
+}
+
+static void pnv_psi_set_xivr(PnvPsi *psi, uint32_t reg, uint64_t val)
+{
+    ICSState *ics = &psi->ics;
+    uint16_t server;
+    uint8_t prio;
+    uint8_t src;
+
+    val = (psi->regs[reg] & PSIHB_XIVR_PENDING) |
+            (val & (PSIHB_XIVR_SERVER_MSK |
+                    PSIHB_XIVR_PRIO_MSK |
+                    PSIHB_XIVR_SRC_MSK));
+    server = (val & PSIHB_XIVR_SERVER_MSK) >> PSIHB_XIVR_SERVER_SH;
+    prio = (val & PSIHB_XIVR_PRIO_MSK) >> PSIHB_XIVR_PRIO_SH;
+    src = (val & PSIHB_XIVR_SRC_MSK) >> PSIHB_XIVR_SRC_SH;
+
+    if (src >= PSI_NUM_INTERRUPTS) {
+        qemu_log_mask(LOG_GUEST_ERROR, "PSI: Unsupported irq %d\n", src);
+        return;
+    }
+    psi->regs[reg] = val;   /* latch only a SRC that can index ics->qirqs[] */
+
+    /* Remove pending bit if the IRQ is masked */
+    if ((psi->regs[reg] & PSIHB_XIVR_PRIO_MSK) == PSIHB_XIVR_PRIO_MSK) {
+        psi->regs[reg] &= ~PSIHB_XIVR_PENDING;
+    }
+
+    /* The low order 2 bits are the link pointer (Type II interrupts).
+     * Shift back to get a valid IRQ server.
+     */
+    server >>= 2;
+
+    /* Now because of source remapping, weird things can happen
+     * if you change the source number dynamically, our simple ICS
+     * doesn't deal with remapping. So we just poke a different
+     * ICS entry based on what source number was written. This will
+     * do for now but a more accurate implementation would instead
+     * use a fixed server/prio and a remapper of the generated irq.
+     */
+    ics_simple_write_xive(ics, src, server, prio, prio);
+}
+
+static uint64_t pnv_psi_reg_read(PnvPsi *psi, uint32_t offset, bool mmio)
+{
+    uint64_t val = 0xffffffffffffffffull; /* returned for unhandled offsets */
+
+    switch (offset) {
+    case PSIHB_XSCOM_FIR_RW:
+    case PSIHB_XSCOM_FIRACT0:
+    case PSIHB_XSCOM_FIRACT1:
+    case PSIHB_XSCOM_BAR:
+    case PSIHB_XSCOM_FSPBAR:
+    case PSIHB_XSCOM_CR:
+    case PSIHB_XSCOM_XIVR_FSP:
+    case PSIHB_XSCOM_XIVR_OCC:
+    case PSIHB_XSCOM_XIVR_FSI:
+    case PSIHB_XSCOM_XIVR_LPCI2C:
+    case PSIHB_XSCOM_XIVR_LOCERR:
+    case PSIHB_XSCOM_XIVR_EXT:
+    case PSIHB_XSCOM_IRQ_STAT:
+    case PSIHB_XSCOM_SEMR:
+    case PSIHB_XSCOM_DMA_UPADD:
+    case PSIHB_XSCOM_IRSN:
+        val = psi->regs[offset];        /* all readable as stored */
+        break;
+    default:
+        qemu_log_mask(LOG_UNIMP, "PSI: read at 0x%" PRIx32 "\n", offset);
+    }
+    return val;
+}
+
+static void pnv_psi_reg_write(PnvPsi *psi, uint32_t offset, uint64_t val,
+                              bool mmio)
+{
+    switch (offset) {
+    case PSIHB_XSCOM_FIR_RW:
+    case PSIHB_XSCOM_FIRACT0:
+    case PSIHB_XSCOM_FIRACT1:
+    case PSIHB_XSCOM_SEMR:
+    case PSIHB_XSCOM_DMA_UPADD:
+        psi->regs[offset] = val;        /* plain storage, no side effect */
+        break;
+    case PSIHB_XSCOM_FIR_OR:
+        psi->regs[PSIHB_XSCOM_FIR_RW] |= val;
+        break;
+    case PSIHB_XSCOM_FIR_AND:
+        psi->regs[PSIHB_XSCOM_FIR_RW] &= val;
+        break;
+    case PSIHB_XSCOM_BAR:
+        /* Only XSCOM can write this one */
+        if (!mmio) {
+            pnv_psi_set_bar(psi, val);
+        } else {
+            qemu_log_mask(LOG_GUEST_ERROR, "PSI: invalid write of BAR\n");
+        }
+        break;
+    case PSIHB_XSCOM_FSPBAR:
+        psi->regs[PSIHB_XSCOM_FSPBAR] = val & PSIHB_FSPBAR_MASK;
+        pnv_psi_update_fsp_mr(psi);
+        break;
+    case PSIHB_XSCOM_CR:
+        pnv_psi_set_cr(psi, val);
+        break;
+    case PSIHB_XSCOM_SCR:               /* set the CR bits given in 'val' */
+        pnv_psi_set_cr(psi, psi->regs[PSIHB_XSCOM_CR] | val);
+        break;
+    case PSIHB_XSCOM_CCR:               /* clear the CR bits given in 'val' */
+        pnv_psi_set_cr(psi, psi->regs[PSIHB_XSCOM_CR] & ~val);
+        break;
+    case PSIHB_XSCOM_XIVR_FSP:
+    case PSIHB_XSCOM_XIVR_OCC:
+    case PSIHB_XSCOM_XIVR_FSI:
+    case PSIHB_XSCOM_XIVR_LPCI2C:
+    case PSIHB_XSCOM_XIVR_LOCERR:
+    case PSIHB_XSCOM_XIVR_EXT:
+        pnv_psi_set_xivr(psi, offset, val);
+        break;
+    case PSIHB_XSCOM_IRQ_STAT:
+        /* Read only */
+        qemu_log_mask(LOG_GUEST_ERROR, "PSI: invalid write of IRQ_STAT\n");
+        break;
+    case PSIHB_XSCOM_IRSN:
+        pnv_psi_set_irsn(psi, val);
+        break;
+    default:
+        qemu_log_mask(LOG_UNIMP, "PSI: write at 0x%" PRIx32 "\n", offset);
+    }
+}
+
+/*
+ * MMIO offset 0 maps to PSIHB_XSCOM_BAR (0x0a): xscom = (mmio + 0x50) >> 3.
+ * 'mmio' is passed as true so the register code can tell the two paths apart.
+ */
+static uint64_t pnv_psi_mmio_read(void *opaque, hwaddr addr, unsigned size)
+{
+    return pnv_psi_reg_read(opaque, (addr >> 3) + PSIHB_XSCOM_BAR, true);
+}
+
+static void pnv_psi_mmio_write(void *opaque, hwaddr addr,
+                              uint64_t val, unsigned size)
+{
+    pnv_psi_reg_write(opaque, (addr >> 3) + PSIHB_XSCOM_BAR, val, true);
+}
+
+static const MemoryRegionOps psi_mmio_ops = {
+    .read = pnv_psi_mmio_read,
+    .write = pnv_psi_mmio_write,
+    .endianness = DEVICE_BIG_ENDIAN,
+    .valid = {                      /* only 8-byte accesses are accepted */
+        .min_access_size = 8,
+        .max_access_size = 8,
+    },
+    .impl = {
+        .min_access_size = 8,
+        .max_access_size = 8,
+    },
+};
+
+static uint64_t pnv_psi_xscom_read(void *opaque, hwaddr addr, unsigned size)
+{
+    return pnv_psi_reg_read(opaque, addr >> 3, false);  /* false: not MMIO */
+}
+
+static void pnv_psi_xscom_write(void *opaque, hwaddr addr,
+                                uint64_t val, unsigned size)
+{
+    pnv_psi_reg_write(opaque, addr >> 3, val, false);   /* false: not MMIO */
+}
+
+static const MemoryRegionOps pnv_psi_xscom_ops = {
+    .read = pnv_psi_xscom_read,
+    .write = pnv_psi_xscom_write,
+    .endianness = DEVICE_BIG_ENDIAN,
+    .valid = {                      /* only 8-byte accesses are accepted */
+        .min_access_size = 8,
+        .max_access_size = 8,
+    },
+    .impl = {
+        .min_access_size = 8,
+        .max_access_size = 8,
+    }
+};
+
+static void pnv_psi_init(Object *obj)
+{
+    PnvPsi *psi = PNV_PSI(obj);
+    /* The ICS is embedded in PnvPsi and exposed as the child "ics-psi" */
+    object_initialize(&psi->ics, sizeof(psi->ics), TYPE_ICS_SIMPLE);
+    object_property_add_child(obj, "ics-psi", OBJECT(&psi->ics), NULL);
+}
+
+static const uint8_t irq_to_xivr[] = {  /* XIVR of each default source number */
+    PSIHB_XSCOM_XIVR_FSP,
+    PSIHB_XSCOM_XIVR_OCC,
+    PSIHB_XSCOM_XIVR_FSI,
+    PSIHB_XSCOM_XIVR_LPCI2C,
+    PSIHB_XSCOM_XIVR_LOCERR,
+    PSIHB_XSCOM_XIVR_EXT,
+};
+
+static void pnv_psi_realize(DeviceState *dev, Error **errp)
+{
+    PnvPsi *psi = PNV_PSI(dev);
+    ICSState *ics = &psi->ics;
+    Object *obj;
+    Error *err = NULL;              /* handed over to errp on failure */
+    unsigned int i;
+
+    obj = object_property_get_link(OBJECT(dev), "xics", &err);
+    if (!obj) {
+        error_propagate(errp, err);
+        error_prepend(errp, "%s: required link 'xics' not found: ", __func__);
+        return;
+    }
+
+    /* Create PSI interrupt control source */
+    object_property_add_const_link(OBJECT(ics), "xics", obj,  &error_abort);
+    object_property_set_int(OBJECT(ics), PSI_NUM_INTERRUPTS, "nr-irqs", &err);
+    if (err) {
+        error_propagate(errp, err);
+        return;
+    }
+    object_property_set_bool(OBJECT(ics), true, "realized",  &err);
+    if (err) {
+        error_propagate(errp, err);
+        return;
+    }
+
+    for (i = 0; i < ics->nr_irqs; i++) {
+        ics_set_irq_type(ics, i, true);
+    }
+
+    /* XSCOM region for PSI registers */
+    pnv_xscom_region_init(&psi->xscom_regs, OBJECT(dev), &pnv_psi_xscom_ops,
+                psi, "xscom-psi", PNV_XSCOM_PSIHB_SIZE);
+
+    /* Initialize MMIO region */
+    memory_region_init_io(&psi->regs_mr, OBJECT(dev), &psi_mmio_ops, psi,
+                          "psihb", PNV_PSIHB_SIZE);
+
+    /* Default BAR for MMIO region: the "bar" property, force-enabled */
+    pnv_psi_set_bar(psi, psi->bar | PSIHB_BAR_EN);
+
+    /* Default sources in XIVR: source i, masked (all priority bits set) */
+    for (i = 0; i < PSI_NUM_INTERRUPTS; i++) {
+        uint8_t xivr = irq_to_xivr[i];
+        psi->regs[xivr] = PSIHB_XIVR_PRIO_MSK |
+            ((uint64_t) i << PSIHB_XIVR_SRC_SH);
+    }
+}
+
+static int pnv_psi_populate(PnvXScomInterface *dev, void *fdt, int xscom_offset)
+{
+    const char compat[] = "ibm,power8-psihb-x\0ibm,psihb-x";
+    char *name;
+    int offset;
+    uint32_t lpc_pcba = PNV_XSCOM_PSIHB_BASE; /* PSIHB base, despite the name */
+    uint32_t reg[] = {
+        cpu_to_be32(lpc_pcba),
+        cpu_to_be32(PNV_XSCOM_PSIHB_SIZE)
+    };
+
+    name = g_strdup_printf("psihb@%x", lpc_pcba);
+    offset = fdt_add_subnode(fdt, xscom_offset, name);
+    _FDT(offset);       /* presumably bails out on a negative offset */
+    g_free(name);
+
+    _FDT((fdt_setprop(fdt, offset, "reg", reg, sizeof(reg))));
+
+    _FDT((fdt_setprop_cell(fdt, offset, "#address-cells", 2)));
+    _FDT((fdt_setprop_cell(fdt, offset, "#size-cells", 1)));
+    _FDT((fdt_setprop(fdt, offset, "compatible", compat,
+                      sizeof(compat))));    /* both strings and their NULs */
+    return 0;
+}
+
+static Property pnv_psi_properties[] = {
+    DEFINE_PROP_UINT64("bar", PnvPsi, bar, 0),          /* default MMIO BAR */
+    DEFINE_PROP_UINT64("fsp-bar", PnvPsi, fsp_bar, 0),  /* not read here */
+    DEFINE_PROP_END_OF_LIST(),
+};
+
+static void pnv_psi_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    PnvXScomInterfaceClass *xdc = PNV_XSCOM_INTERFACE_CLASS(klass);
+
+    xdc->populate = pnv_psi_populate;   /* adds the psihb device tree node */
+
+    dc->realize = pnv_psi_realize;
+    dc->props = pnv_psi_properties;
+}
+
+static const TypeInfo pnv_psi_info = {
+    .name          = TYPE_PNV_PSI,
+    .parent        = TYPE_SYS_BUS_DEVICE,
+    .instance_size = sizeof(PnvPsi),
+    .instance_init = pnv_psi_init,
+    .class_init    = pnv_psi_class_init,
+    .interfaces    = (InterfaceInfo[]) {
+        { TYPE_PNV_XSCOM_INTERFACE },
+        { }
+    }
+};
+
+static void pnv_psi_register_types(void)
+{
+    type_register_static(&pnv_psi_info);    /* run through type_init() */
+}
+
+type_init(pnv_psi_register_types)
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 35db949dbc..80d12d005c 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -40,6 +40,7 @@
 #include "kvm_ppc.h"
 #include "migration/migration.h"
 #include "mmu-hash64.h"
+#include "mmu-book3s-v3.h"
 #include "qom/cpu.h"
 
 #include "hw/boards.h"
@@ -96,66 +97,40 @@
 
 #define HTAB_SIZE(spapr)        (1ULL << ((spapr)->htab_shift))
 
-static int try_create_xics(sPAPRMachineState *spapr, const char *type_ics,
-                           const char *type_icp, int nr_servers,
-                           int nr_irqs, Error **errp)
+static ICSState *spapr_ics_create(sPAPRMachineState *spapr,
+                                  const char *type_ics,
+                                  int nr_irqs, Error **errp)
 {
-    XICSFabric *xi = XICS_FABRIC(spapr);
     Error *err = NULL, *local_err = NULL;
-    ICSState *ics = NULL;
-    int i;
+    Object *obj;
 
-    ics = ICS_SIMPLE(object_new(type_ics));
-    object_property_add_child(OBJECT(spapr), "ics", OBJECT(ics), NULL);
-    object_property_set_int(OBJECT(ics), nr_irqs, "nr-irqs", &err);
-    object_property_add_const_link(OBJECT(ics), "xics", OBJECT(xi), NULL);
-    object_property_set_bool(OBJECT(ics), true, "realized", &local_err);
+    obj = object_new(type_ics);
+    object_property_add_child(OBJECT(spapr), "ics", obj, NULL);
+    object_property_add_const_link(obj, "xics", OBJECT(spapr), &error_abort);
+    object_property_set_int(obj, nr_irqs, "nr-irqs", &err);
+    object_property_set_bool(obj, true, "realized", &local_err);
     error_propagate(&err, local_err);
     if (err) {
-        goto error;
-    }
-
-    spapr->icps = g_malloc0(nr_servers * sizeof(ICPState));
-    spapr->nr_servers = nr_servers;
-
-    for (i = 0; i < nr_servers; i++) {
-        ICPState *icp = &spapr->icps[i];
-
-        object_initialize(icp, sizeof(*icp), type_icp);
-        object_property_add_child(OBJECT(spapr), "icp[*]", OBJECT(icp), NULL);
-        object_property_add_const_link(OBJECT(icp), "xics", OBJECT(xi), NULL);
-        object_property_set_bool(OBJECT(icp), true, "realized", &err);
-        if (err) {
-            goto error;
-        }
-        object_unref(OBJECT(icp));
+        error_propagate(errp, err);
+        return NULL;
     }
 
-    spapr->ics = ics;
-    return 0;
-
-error:
-    error_propagate(errp, err);
-    if (ics) {
-        object_unparent(OBJECT(ics));
-    }
-    return -1;
+    return ICS_SIMPLE(obj);
 }
 
-static int xics_system_init(MachineState *machine,
-                            int nr_servers, int nr_irqs, Error **errp)
+static void xics_system_init(MachineState *machine, int nr_irqs, Error **errp)
 {
-    int rc = -1;
+    sPAPRMachineState *spapr = SPAPR_MACHINE(machine);
 
     if (kvm_enabled()) {
         Error *err = NULL;
 
         if (machine_kernel_irqchip_allowed(machine) &&
-            !xics_kvm_init(SPAPR_MACHINE(machine), errp)) {
-            rc = try_create_xics(SPAPR_MACHINE(machine), TYPE_ICS_KVM,
-                                 TYPE_KVM_ICP, nr_servers, nr_irqs, &err);
+            !xics_kvm_init(spapr, errp)) {
+            spapr->icp_type = TYPE_KVM_ICP;
+            spapr->ics = spapr_ics_create(spapr, TYPE_ICS_KVM, nr_irqs, &err);
         }
-        if (machine_kernel_irqchip_required(machine) && rc < 0) {
+        if (machine_kernel_irqchip_required(machine) && !spapr->ics) {
             error_reportf_err(err,
                               "kernel_irqchip requested but unavailable: ");
         } else {
@@ -163,13 +138,11 @@ static int xics_system_init(MachineState *machine,
         }
     }
 
-    if (rc < 0) {
-        xics_spapr_init(SPAPR_MACHINE(machine), errp);
-        rc = try_create_xics(SPAPR_MACHINE(machine), TYPE_ICS_SIMPLE,
-                               TYPE_ICP, nr_servers, nr_irqs, errp);
+    if (!spapr->ics) {
+        xics_spapr_init(spapr, errp);
+        spapr->icp_type = TYPE_ICP;
+        spapr->ics = spapr_ics_create(spapr, TYPE_ICS_SIMPLE, nr_irqs, errp);
     }
-
-    return rc;
 }
 
 static int spapr_fixup_cpu_smt_dt(void *fdt, int offset, PowerPCCPU *cpu,
@@ -226,6 +199,85 @@ static int spapr_fixup_cpu_numa_dt(void *fdt, int offset, CPUState *cs)
     return ret;
 }
 
+/* Populate the "ibm,pa-features" property */
+static void spapr_populate_pa_features(CPUPPCState *env, void *fdt, int offset,
+                                      bool legacy_guest)
+{
+    uint8_t pa_features_206[] = { 6, 0,
+        0xf6, 0x1f, 0xc7, 0x00, 0x80, 0xc0 };
+    uint8_t pa_features_207[] = { 24, 0,
+        0xf6, 0x1f, 0xc7, 0xc0, 0x80, 0xf0,
+        0x80, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x80, 0x00,
+        0x80, 0x00, 0x80, 0x00, 0x00, 0x00 };
+    uint8_t pa_features_300[] = { 66, 0,
+        /* 0: MMU|FPU|SLB|RUN|DABR|NX, 1: fri[nzpm]|DABRX|SPRG3|SLB0|PP110 */
+        /* 2: VPM|DS205|PPR|DS202|DS206, 3: LSD|URG, SSO, 5: LE|CFAR|EB|LSQ */
+        0xf6, 0x1f, 0xc7, 0xc0, 0x80, 0xf0, /* 0 - 5 */
+        /* 6: DS207 */
+        0x80, 0x00, 0x00, 0x00, 0x00, 0x00, /* 6 - 11 */
+        /* 16: Vector */
+        0x00, 0x00, 0x00, 0x00, 0x80, 0x00, /* 12 - 17 */
+        /* 18: Vec. Scalar, 20: Vec. XOR, 22: HTM */
+        0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 18 - 23 */
+        /* 24: Ext. Dec, 26: 64 bit ftrs, 28: PM ftrs */
+        0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 24 - 29 */
+        /* 30: MMR, 32: LE atomic, 34: EBB + ext EBB */
+        0x80, 0x00, 0x80, 0x00, 0xC0, 0x00, /* 30 - 35 */
+        /* 36: SPR SO, 38: Copy/Paste, 40: Radix MMU */
+        0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 36 - 41 */
+        /* 42: PM, 44: PC RA, 46: SC vec'd */
+        0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 42 - 47 */
+        /* 48: SIMD, 50: QP BFP, 52: String */
+        0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 48 - 53 */
+        /* 54: DecFP, 56: DecI, 58: SHA */
+        0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 54 - 59 */
+        /* 60: NM atomic, 62: RNG */
+        0x80, 0x00, 0x80, 0x00, 0x00, 0x00, /* 60 - 65 */
+    };
+    uint8_t *pa_features;
+    size_t pa_size;
+
+    switch (POWERPC_MMU_VER(env->mmu_model)) {
+    case POWERPC_MMU_VER_2_06:
+        pa_features = pa_features_206;
+        pa_size = sizeof(pa_features_206);
+        break;
+    case POWERPC_MMU_VER_2_07:
+        pa_features = pa_features_207;
+        pa_size = sizeof(pa_features_207);
+        break;
+    case POWERPC_MMU_VER_3_00:
+        pa_features = pa_features_300;
+        pa_size = sizeof(pa_features_300);
+        break;
+    default:
+        return;
+    }
+
+    if (env->ci_large_pages) {
+        /*
+         * Note: we keep CI large pages off by default because a 64K capable
+         * guest provisioned with large pages might otherwise try to map a qemu
+         * framebuffer (or other kind of memory mapped PCI BAR) using 64K pages
+         * even if that qemu runs on a 4k host.
+         * We add this bit back here if we are confident this is not an issue
+         */
+        pa_features[3] |= 0x20;
+    }
+    if (kvmppc_has_cap_htm() && pa_size > 24) {
+        pa_features[24] |= 0x80;    /* Transactional memory support */
+    }
+    if (legacy_guest && pa_size > 40) { /* NOTE(review): writes index 42 */
+        /* Workaround for broken kernels that attempt (guest) radix
+         * mode when they can't handle it, if they see the radix bit set
+         * in pa-features. So hide it from them. */
+        pa_features[40 + 2] &= ~0x80; /* Radix MMU */
+    }
+
+    _FDT((fdt_setprop(fdt, offset, "ibm,pa-features", pa_features, pa_size)));
+}
+
 static int spapr_fixup_cpu_dt(void *fdt, sPAPRMachineState *spapr)
 {
     int ret = 0, offset, cpus_offset;
@@ -236,6 +288,7 @@ static int spapr_fixup_cpu_dt(void *fdt, sPAPRMachineState *spapr)
 
     CPU_FOREACH(cs) {
         PowerPCCPU *cpu = POWERPC_CPU(cs);
+        CPUPPCState *env = &cpu->env;
         DeviceClass *dc = DEVICE_GET_CLASS(cs);
         int index = ppc_get_vcpu_dt_id(cpu);
         int compat_smt = MIN(smp_threads, ppc_compat_max_threads(cpu));
@@ -277,6 +330,9 @@ static int spapr_fixup_cpu_dt(void *fdt, sPAPRMachineState *spapr)
         if (ret < 0) {
             return ret;
         }
+
+        spapr_populate_pa_features(env, fdt, offset,
+                                         spapr->cas_legacy_guest_workaround);
     }
     return ret;
 }
@@ -378,67 +434,6 @@ static int spapr_populate_memory(sPAPRMachineState *spapr, void *fdt)
     return 0;
 }
 
-/* Populate the "ibm,pa-features" property */
-static void spapr_populate_pa_features(CPUPPCState *env, void *fdt, int offset)
-{
-    uint8_t pa_features_206[] = { 6, 0,
-        0xf6, 0x1f, 0xc7, 0x00, 0x80, 0xc0 };
-    uint8_t pa_features_207[] = { 24, 0,
-        0xf6, 0x1f, 0xc7, 0xc0, 0x80, 0xf0,
-        0x80, 0x00, 0x00, 0x00, 0x00, 0x00,
-        0x00, 0x00, 0x00, 0x00, 0x80, 0x00,
-        0x80, 0x00, 0x80, 0x00, 0x00, 0x00 };
-    /* Currently we don't advertise any of the "new" ISAv3.00 functionality */
-    uint8_t pa_features_300[] = { 64, 0,
-        0xf6, 0x1f, 0xc7, 0xc0, 0x80, 0xf0, /*  0 -  5 */
-        0x80, 0x00, 0x00, 0x00, 0x00, 0x00, /*  6 - 11 */
-        0x00, 0x00, 0x00, 0x00, 0x80, 0x00, /* 12 - 17 */
-        0x80, 0x00, 0x80, 0x00, 0x00, 0x00, /* 18 - 23 */
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 24 - 29 */
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 30 - 35 */
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 36 - 41 */
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 42 - 47 */
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 48 - 53 */
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 54 - 59 */
-        0x00, 0x00, 0x00, 0x00           }; /* 60 - 63 */
-
-    uint8_t *pa_features;
-    size_t pa_size;
-
-    switch (POWERPC_MMU_VER(env->mmu_model)) {
-    case POWERPC_MMU_VER_2_06:
-        pa_features = pa_features_206;
-        pa_size = sizeof(pa_features_206);
-        break;
-    case POWERPC_MMU_VER_2_07:
-        pa_features = pa_features_207;
-        pa_size = sizeof(pa_features_207);
-        break;
-    case POWERPC_MMU_VER_3_00:
-        pa_features = pa_features_300;
-        pa_size = sizeof(pa_features_300);
-        break;
-    default:
-        return;
-    }
-
-    if (env->ci_large_pages) {
-        /*
-         * Note: we keep CI large pages off by default because a 64K capable
-         * guest provisioned with large pages might otherwise try to map a qemu
-         * framebuffer (or other kind of memory mapped PCI BAR) using 64K pages
-         * even if that qemu runs on a 4k host.
-         * We dd this bit back here if we are confident this is not an issue
-         */
-        pa_features[3] |= 0x20;
-    }
-    if (kvmppc_has_cap_htm() && pa_size > 24) {
-        pa_features[24] |= 0x80;    /* Transactional memory support */
-    }
-
-    _FDT((fdt_setprop(fdt, offset, "ibm,pa-features", pa_features, pa_size)));
-}
-
 static void spapr_populate_cpu_dt(CPUState *cs, void *fdt, int offset,
                                   sPAPRMachineState *spapr)
 {
@@ -459,6 +454,8 @@ static void spapr_populate_cpu_dt(CPUState *cs, void *fdt, int offset,
     sPAPRDRConnector *drc;
     sPAPRDRConnectorClass *drck;
     int drc_index;
+    uint32_t radix_AP_encodings[PPC_PAGE_SIZES_MAX_SZ];
+    int i;
 
     drc = spapr_dr_connector_by_id(SPAPR_DR_CONNECTOR_TYPE_CPU, index);
     if (drc) {
@@ -533,7 +530,7 @@ static void spapr_populate_cpu_dt(CPUState *cs, void *fdt, int offset,
                           page_sizes_prop, page_sizes_prop_size)));
     }
 
-    spapr_populate_pa_features(env, fdt, offset);
+    spapr_populate_pa_features(env, fdt, offset, false);
 
     _FDT((fdt_setprop_cell(fdt, offset, "ibm,chip-id",
                            cs->cpu_index / vcpus_per_socket)));
@@ -544,6 +541,17 @@ static void spapr_populate_cpu_dt(CPUState *cs, void *fdt, int offset,
     _FDT(spapr_fixup_cpu_numa_dt(fdt, offset, cs));
 
     _FDT(spapr_fixup_cpu_smt_dt(fdt, offset, cpu, compat_smt));
+
+    if (pcc->radix_page_info) {
+        for (i = 0; i < pcc->radix_page_info->count; i++) {
+            radix_AP_encodings[i] =
+                cpu_to_be32(pcc->radix_page_info->entries[i]);
+        }
+        _FDT((fdt_setprop(fdt, offset, "ibm,processor-radix-AP-encodings",
+                          radix_AP_encodings,
+                          pcc->radix_page_info->count *
+                          sizeof(radix_AP_encodings[0]))));
+    }
 }
 
 static void spapr_populate_cpus_dt_node(void *fdt, sPAPRMachineState *spapr)
@@ -842,6 +850,33 @@ static void spapr_dt_rtas(sPAPRMachineState *spapr, void *fdt)
     spapr_dt_rtas_tokens(fdt, rtas);
 }
 
+/* Prepare ibm,arch-vec-5-platform-support, which indicates the MMU features
+ * that the guest may request and thus the valid values for bytes 24..26 of
+ * option vector 5: */
+static void spapr_dt_ov5_platform_support(void *fdt, int chosen)
+{
+    char val[2 * 3] = {
+        24, 0x00, /* Hash/Radix, filled in below. */
+        25, 0x00, /* Hash options: Segment Tables == no, GTSE == no. */
+        26, 0x40, /* Radix options: GTSE == yes. */
+    };
+
+    if (kvm_enabled()) {
+        if (kvmppc_has_cap_mmu_radix() && kvmppc_has_cap_mmu_hash_v3()) {
+            val[1] = 0x80; /* OV5_MMU_BOTH */
+        } else if (kvmppc_has_cap_mmu_radix()) {
+            val[1] = 0x40; /* OV5_MMU_RADIX_300 */
+        } else {
+            val[1] = 0x00; /* Hash */
+        }
+    } else {
+        /* TODO: TCG case, hash */
+        val[1] = 0x00;
+    }
+    _FDT(fdt_setprop(fdt, chosen, "ibm,arch-vec-5-platform-support",
+                     val, sizeof(val)));
+}
+
 static void spapr_dt_chosen(sPAPRMachineState *spapr, void *fdt)
 {
     MachineState *machine = MACHINE(spapr);
@@ -895,6 +930,8 @@ static void spapr_dt_chosen(sPAPRMachineState *spapr, void *fdt)
         _FDT(fdt_setprop_string(fdt, chosen, "linux,stdout-path", stdout_path));
     }
 
+    spapr_dt_ov5_platform_support(fdt, chosen);
+
     g_free(stdout_path);
     g_free(bootlist);
 }
@@ -933,6 +970,7 @@ static void *spapr_build_fdt(sPAPRMachineState *spapr,
     void *fdt;
     sPAPRPHBState *phb;
     char *buf;
+    int smt = kvmppc_smt_threads();
 
     fdt = g_malloc0(FDT_MAX_SIZE);
     _FDT((fdt_create_empty_tree(fdt, FDT_MAX_SIZE)));
@@ -972,7 +1010,7 @@ static void *spapr_build_fdt(sPAPRMachineState *spapr,
     _FDT(fdt_setprop_cell(fdt, 0, "#size-cells", 2));
 
     /* /interrupt controller */
-    spapr_dt_xics(spapr->nr_servers, fdt, PHANDLE_XICP);
+    spapr_dt_xics(DIV_ROUND_UP(max_cpus * smt, smp_threads), fdt, PHANDLE_XICP);
 
     ret = spapr_populate_memory(spapr, fdt);
     if (ret < 0) {
@@ -1100,7 +1138,7 @@ static int get_htab_fd(sPAPRMachineState *spapr)
     return spapr->htab_fd;
 }
 
-static void close_htab_fd(sPAPRMachineState *spapr)
+void close_htab_fd(sPAPRMachineState *spapr)
 {
     if (spapr->htab_fd >= 0) {
         close(spapr->htab_fd);
@@ -1227,6 +1265,19 @@ static void spapr_reallocate_hpt(sPAPRMachineState *spapr, int shift,
     }
 }
 
+void spapr_setup_hpt_and_vrma(sPAPRMachineState *spapr)
+{
+    spapr_reallocate_hpt(spapr,
+                     spapr_hpt_shift_for_ramsize(MACHINE(spapr)->maxram_size),
+                     &error_fatal);
+    if (spapr->vrma_adjust) {
+        spapr->rma_size = kvmppc_rma_size(spapr_node0_size(),
+                                          spapr->htab_shift);
+    }
+    /* We're setting up a hash table, so that means we're not radix */
+    spapr->patb_entry = 0;
+}
+
 static void find_unknown_sysbus_device(SysBusDevice *sbdev, void *opaque)
 {
     bool matched = false;
@@ -1255,17 +1306,14 @@ static void ppc_spapr_reset(void)
     /* Check for unknown sysbus devices */
     foreach_dynamic_sysbus_device(find_unknown_sysbus_device, NULL);
 
-    spapr->patb_entry = 0;
-
-    /* Allocate and/or reset the hash page table */
-    spapr_reallocate_hpt(spapr,
-                         spapr_hpt_shift_for_ramsize(machine->maxram_size),
-                         &error_fatal);
-
-    /* Update the RMA size if necessary */
-    if (spapr->vrma_adjust) {
-        spapr->rma_size = kvmppc_rma_size(spapr_node0_size(),
-                                          spapr->htab_shift);
+    if (kvm_enabled() && kvmppc_has_cap_mmu_radix()) {
+        /* If using KVM with radix mode available, VCPUs can be started
+         * without a HPT because KVM will start them in radix mode.
+         * Set the GR bit in PATB so that we know there is no HPT. */
+        spapr->patb_entry = PATBE1_GR;
+    } else {
+        spapr->patb_entry = 0;
+        spapr_setup_hpt_and_vrma(spapr);
     }
 
     qemu_devices_reset();
@@ -1333,13 +1381,13 @@ static void spapr_create_nvram(sPAPRMachineState *spapr)
 
 static void spapr_rtc_create(sPAPRMachineState *spapr)
 {
-    DeviceState *dev = qdev_create(NULL, TYPE_SPAPR_RTC);
-
-    qdev_init_nofail(dev);
-    spapr->rtc = dev;
-
-    object_property_add_alias(qdev_get_machine(), "rtc-time",
-                              OBJECT(spapr->rtc), "date", NULL);
+    object_initialize(&spapr->rtc, sizeof(spapr->rtc), TYPE_SPAPR_RTC);
+    object_property_add_child(OBJECT(spapr), "rtc", OBJECT(&spapr->rtc),
+                              &error_fatal);
+    object_property_set_bool(OBJECT(&spapr->rtc), true, "realized",
+                              &error_fatal);
+    object_property_add_alias(OBJECT(spapr), "rtc-time", OBJECT(&spapr->rtc),
+                              "date", &error_fatal);
 }
 
 /* Returns whether we want to use VGA or not */
@@ -1366,9 +1414,10 @@ static int spapr_post_load(void *opaque, int version_id)
     int err = 0;
 
     if (!object_dynamic_cast(OBJECT(spapr->ics), TYPE_ICS_KVM)) {
-        int i;
-        for (i = 0; i < spapr->nr_servers; i++) {
-            icp_resend(&spapr->icps[i]);
+        CPUState *cs;
+        CPU_FOREACH(cs) {
+            PowerPCCPU *cpu = POWERPC_CPU(cs);
+            icp_resend(ICP(cpu->intc));
         }
     }
 
@@ -1377,7 +1426,7 @@ static int spapr_post_load(void *opaque, int version_id)
      * So when migrating from those versions, poke the incoming offset
      * value into the RTC device */
     if (version_id < 3) {
-        err = spapr_rtc_import_offset(spapr->rtc, spapr->rtc_offset);
+        err = spapr_rtc_import_offset(&spapr->rtc, spapr->rtc_offset);
     }
 
     return err;
@@ -1990,7 +2039,6 @@ static void ppc_spapr_init(MachineState *machine)
     hwaddr node0_size = spapr_node0_size();
     long load_limit, fw_size;
     char *filename;
-    int smt = kvmppc_smt_threads();
 
     msi_nonbroken = true;
 
@@ -2041,8 +2089,7 @@ static void ppc_spapr_init(MachineState *machine)
     load_limit = MIN(spapr->rma_size, RTAS_MAX_ADDR) - FW_OVERHEAD;
 
     /* Set up Interrupt Controller before we create the VCPUs */
-    xics_system_init(machine, DIV_ROUND_UP(max_cpus * smt, smp_threads),
-                     XICS_IRQS_SPAPR, &error_fatal);
+    xics_system_init(machine, XICS_IRQS_SPAPR, &error_fatal);
 
     /* Set up containers for ibm,client-set-architecture negotiated options */
     spapr->ov5 = spapr_ovec_new();
@@ -2054,6 +2101,11 @@ static void ppc_spapr_init(MachineState *machine)
     }
 
     spapr_ovec_set(spapr->ov5, OV5_FORM1_AFFINITY);
+    if (kvmppc_has_cap_mmu_radix()) {
+        /* KVM always allows GTSE with radix... */
+        spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE);
+    }
+    /* ... but not with hash (currently). */
 
     /* advertise support for dedicated HP event source to guests */
     if (spapr->use_hotplug_event_source) {
@@ -2281,10 +2333,12 @@ static void ppc_spapr_init(MachineState *machine)
 
     qemu_register_boot_set(spapr_boot_set, spapr);
 
-    /* to stop and start vmclock */
     if (kvm_enabled()) {
+        /* to stop and start vmclock */
         qemu_add_vm_change_state_handler(cpu_ppc_clock_vm_state_change,
                                          &spapr->tb);
+
+        kvmppc_spapr_enable_inkernel_multitce();
     }
 }
 
@@ -3030,21 +3084,23 @@ static void spapr_ics_resend(XICSFabric *dev)
     ics_resend(spapr->ics);
 }
 
-static ICPState *spapr_icp_get(XICSFabric *xi, int server)
+static ICPState *spapr_icp_get(XICSFabric *xi, int cpu_dt_id)
 {
-    sPAPRMachineState *spapr = SPAPR_MACHINE(xi);
+    PowerPCCPU *cpu = ppc_get_vcpu_by_dt_id(cpu_dt_id);
 
-    return (server < spapr->nr_servers) ? &spapr->icps[server] : NULL;
+    return cpu ? ICP(cpu->intc) : NULL;
 }
 
 static void spapr_pic_print_info(InterruptStatsProvider *obj,
                                  Monitor *mon)
 {
     sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
-    int i;
+    CPUState *cs;
 
-    for (i = 0; i < spapr->nr_servers; i++) {
-        icp_pic_print_info(&spapr->icps[i], mon);
+    CPU_FOREACH(cs) {
+        PowerPCCPU *cpu = POWERPC_CPU(cs);
+
+        icp_pic_print_info(ICP(cpu->intc), mon);
     }
 
     ics_pic_print_info(spapr->ics, mon);
@@ -3158,18 +3214,37 @@ static const TypeInfo spapr_machine_info = {
     type_init(spapr_machine_register_##suffix)
 
 /*
+ * pseries-2.10
+ */
+static void spapr_machine_2_10_instance_options(MachineState *machine)
+{
+}
+
+static void spapr_machine_2_10_class_options(MachineClass *mc)
+{
+    /* Defaults for the latest behaviour inherited from the base class */
+}
+
+DEFINE_SPAPR_MACHINE(2_10, "2.10", true);
+
+/*
  * pseries-2.9
  */
+#define SPAPR_COMPAT_2_9                                               \
+    HW_COMPAT_2_9
+
 static void spapr_machine_2_9_instance_options(MachineState *machine)
 {
+    spapr_machine_2_10_instance_options(machine);
 }
 
 static void spapr_machine_2_9_class_options(MachineClass *mc)
 {
-    /* Defaults for the latest behaviour inherited from the base class */
+    spapr_machine_2_10_class_options(mc);
+    SET_MACHINE_COMPAT(mc, SPAPR_COMPAT_2_9);
 }
 
-DEFINE_SPAPR_MACHINE(2_9, "2.9", true);
+DEFINE_SPAPR_MACHINE(2_9, "2.9", false);
 
 /*
  * pseries-2.8
diff --git a/hw/ppc/spapr_cpu_core.c b/hw/ppc/spapr_cpu_core.c
index 6883f0991a..4389ef4c2a 100644
--- a/hw/ppc/spapr_cpu_core.c
+++ b/hw/ppc/spapr_cpu_core.c
@@ -80,8 +80,6 @@ static void spapr_cpu_init(sPAPRMachineState *spapr, PowerPCCPU *cpu,
         }
     }
 
-    xics_cpu_setup(XICS_FABRIC(spapr), cpu);
-
     qemu_register_reset(spapr_cpu_reset, cpu);
     spapr_cpu_reset(cpu);
 }
@@ -129,6 +127,7 @@ static void spapr_cpu_core_unrealizefn(DeviceState *dev, Error **errp)
         PowerPCCPU *cpu = POWERPC_CPU(cs);
 
         spapr_cpu_destroy(cpu);
+        object_unparent(cpu->intc);
         cpu_remove_sync(cs);
         object_unparent(obj);
     }
@@ -141,18 +140,32 @@ static void spapr_cpu_core_realize_child(Object *child, Error **errp)
     sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
     CPUState *cs = CPU(child);
     PowerPCCPU *cpu = POWERPC_CPU(cs);
+    Object *obj;
+
+    obj = object_new(spapr->icp_type);
+    object_property_add_child(OBJECT(cpu), "icp", obj, NULL);
+    object_property_add_const_link(obj, "xics", OBJECT(spapr), &error_abort);
+    object_property_set_bool(obj, true, "realized", &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
 
     object_property_set_bool(child, true, "realized", &local_err);
     if (local_err) {
+        object_unparent(obj);
         error_propagate(errp, local_err);
         return;
     }
 
     spapr_cpu_init(spapr, cpu, &local_err);
     if (local_err) {
+        object_unparent(obj);
         error_propagate(errp, local_err);
         return;
     }
+
+    xics_cpu_setup(XICS_FABRIC(spapr), cpu, ICP(obj));
 }
 
 static void spapr_cpu_core_realize(DeviceState *dev, Error **errp)
diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
index 24a5758e62..f0b28d8112 100644
--- a/hw/ppc/spapr_events.c
+++ b/hw/ppc/spapr_events.c
@@ -422,7 +422,7 @@ static void spapr_init_maina(struct rtas_event_log_v6_maina *maina,
     maina->hdr.section_id = cpu_to_be16(RTAS_LOG_V6_SECTION_ID_MAINA);
     maina->hdr.section_length = cpu_to_be16(sizeof(*maina));
     /* FIXME: section version, subtype and creator id? */
-    spapr_rtc_read(spapr->rtc, &tm, NULL);
+    spapr_rtc_read(&spapr->rtc, &tm, NULL);
     year = tm.tm_year + 1900;
     maina->creation_date = cpu_to_be32((to_bcd(year / 100) << 24)
                                        | (to_bcd(year % 100) << 16)
diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
index f05a90ed2c..9f18f75b88 100644
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -12,6 +12,8 @@
 #include "trace.h"
 #include "kvm_ppc.h"
 #include "hw/ppc/spapr_ovec.h"
+#include "qemu/error-report.h"
+#include "mmu-book3s-v3.h"
 
 struct SPRSyncState {
     int spr;
@@ -878,6 +880,137 @@ static target_ulong h_set_mode(PowerPCCPU *cpu, sPAPRMachineState *spapr,
     return ret;
 }
 
+static target_ulong h_clean_slb(PowerPCCPU *cpu, sPAPRMachineState *spapr,
+                                target_ulong opcode, target_ulong *args)
+{
+    qemu_log_mask(LOG_UNIMP, "Unimplemented SPAPR hcall 0x"TARGET_FMT_lx"%s\n",
+                  opcode, " (H_CLEAN_SLB)");
+    return H_FUNCTION;
+}
+
+static target_ulong h_invalidate_pid(PowerPCCPU *cpu, sPAPRMachineState *spapr,
+                                     target_ulong opcode, target_ulong *args)
+{
+    qemu_log_mask(LOG_UNIMP, "Unimplemented SPAPR hcall 0x"TARGET_FMT_lx"%s\n",
+                  opcode, " (H_INVALIDATE_PID)");
+    return H_FUNCTION;
+}
+
+static void spapr_check_setup_free_hpt(sPAPRMachineState *spapr,
+                                       uint64_t patbe_old, uint64_t patbe_new)
+{
+    /*
+     * We have 4 Options:
+     * HASH->HASH || RADIX->RADIX || NOTHING->RADIX : Do Nothing
+     * HASH->RADIX                                  : Free HPT
+     * RADIX->HASH                                  : Allocate HPT
+     * NOTHING->HASH                                : Allocate HPT
+     * Note: NOTHING implies the case where we said the guest could choose
+     *       later and so assumed radix and now it's called H_REG_PROC_TBL
+     */
+
+    if ((patbe_old & PATBE1_GR) == (patbe_new & PATBE1_GR)) {
+        /* We assume RADIX, so this catches all the "Do Nothing" cases */
+    } else if (!(patbe_old & PATBE1_GR)) {
+        /* HASH->RADIX : Free HPT */
+        g_free(spapr->htab);
+        spapr->htab = NULL;
+        spapr->htab_shift = 0;
+        close_htab_fd(spapr);
+    } else if (!(patbe_new & PATBE1_GR)) {
+        /* RADIX->HASH || NOTHING->HASH : Allocate HPT */
+        spapr_setup_hpt_and_vrma(spapr);
+    }
+    return;
+}
+
+#define FLAGS_MASK              0x01FULL
+#define FLAG_MODIFY             0x10
+#define FLAG_REGISTER           0x08
+#define FLAG_RADIX              0x04
+#define FLAG_HASH_PROC_TBL      0x02
+#define FLAG_GTSE               0x01
+
+static target_ulong h_register_process_table(PowerPCCPU *cpu,
+                                             sPAPRMachineState *spapr,
+                                             target_ulong opcode,
+                                             target_ulong *args)
+{
+    CPUPPCState *env = &cpu->env;
+    target_ulong flags = args[0];
+    target_ulong proc_tbl = args[1];
+    target_ulong page_size = args[2];
+    target_ulong table_size = args[3];
+    uint64_t cproc;
+
+    if (flags & ~FLAGS_MASK) { /* Check no reserved bits are set */
+        return H_PARAMETER;
+    }
+    if (flags & FLAG_MODIFY) {
+        if (flags & FLAG_REGISTER) {
+            if (flags & FLAG_RADIX) { /* Register new RADIX process table */
+                if (proc_tbl & 0xfff || proc_tbl >> 60) {
+                    return H_P2;
+                } else if (page_size) {
+                    return H_P3;
+                } else if (table_size > 24) {
+                    return H_P4;
+                }
+                cproc = PATBE1_GR | proc_tbl | table_size;
+            } else { /* Register new HPT process table */
+                if (flags & FLAG_HASH_PROC_TBL) { /* Hash with Segment Tables */
+                    /* TODO - Not Supported */
+                    /* Technically caused by flag bits => H_PARAMETER */
+                    return H_PARAMETER;
+                } else { /* Hash with SLB */
+                    if (proc_tbl >> 38) {
+                        return H_P2;
+                    } else if (page_size & ~0x7) {
+                        return H_P3;
+                    } else if (table_size > 24) {
+                        return H_P4;
+                    }
+                }
+                cproc = (proc_tbl << 25) | page_size << 5 | table_size;
+            }
+
+        } else { /* Deregister current process table */
+            /* Set to benign value: (current GR) | 0. This allows
+             * deregistration in KVM to succeed even if the radix bit in flags
+             * doesn't match the radix bit in the old PATB. */
+            cproc = spapr->patb_entry & PATBE1_GR;
+        }
+    } else { /* Maintain current registration */
+        if (!(flags & FLAG_RADIX) != !(spapr->patb_entry & PATBE1_GR)) {
+            /* Technically caused by flag bits => H_PARAMETER */
+            return H_PARAMETER; /* Existing Process Table Mismatch */
+        }
+        cproc = spapr->patb_entry;
+    }
+
+    /* Check if we need to setup OR free the hpt */
+    spapr_check_setup_free_hpt(spapr, spapr->patb_entry, cproc);
+
+    spapr->patb_entry = cproc; /* Save new process table */
+    if ((flags & FLAG_RADIX) || (flags & FLAG_HASH_PROC_TBL)) {
+        /* Use Process TBL */
+        env->spr[SPR_LPCR] |= LPCR_UPRT;
+    } else {
+        env->spr[SPR_LPCR] &= ~LPCR_UPRT;
+    }
+    if (flags & FLAG_GTSE) { /* Partition Uses Guest Translation Shootdown */
+        env->spr[SPR_LPCR] |= LPCR_GTSE;
+    } else {
+        env->spr[SPR_LPCR] &= ~LPCR_GTSE;
+    }
+
+    if (kvm_enabled()) {
+        return kvmppc_configure_v3_mmu(cpu, flags & FLAG_RADIX,
+                                       flags & FLAG_GTSE, cproc);
+    }
+    return H_SUCCESS;
+}
+
 #define H_SIGNAL_SYS_RESET_ALL         -1
 #define H_SIGNAL_SYS_RESET_ALLBUTSELF  -2
 
@@ -929,7 +1062,8 @@ static target_ulong h_client_architecture_support(PowerPCCPU *cpu,
     uint32_t max_compat = cpu->max_compat;
     uint32_t best_compat = 0;
     int i;
-    sPAPROptionVector *ov5_guest, *ov5_cas_old, *ov5_updates;
+    sPAPROptionVector *ov1_guest, *ov5_guest, *ov5_cas_old, *ov5_updates;
+    bool guest_radix;
 
     /*
      * We scan the supplied table of PVRs looking for two things
@@ -980,7 +1114,15 @@ static target_ulong h_client_architecture_support(PowerPCCPU *cpu,
     /* For the future use: here @ov_table points to the first option vector */
     ov_table = list;
 
+    ov1_guest = spapr_ovec_parse_vector(ov_table, 1);
     ov5_guest = spapr_ovec_parse_vector(ov_table, 5);
+    if (spapr_ovec_test(ov5_guest, OV5_MMU_BOTH)) {
+        error_report("guest requested hash and radix MMU, which is invalid.");
+        exit(EXIT_FAILURE);
+    }
+    /* The radix/hash bit in byte 24 requires special handling: */
+    guest_radix = spapr_ovec_test(ov5_guest, OV5_MMU_RADIX_300);
+    spapr_ovec_clear(ov5_guest, OV5_MMU_RADIX_300);
 
     /* NOTE: there are actually a number of ov5 bits where input from the
      * guest is always zero, and the platform/QEMU enables them independently
@@ -999,7 +1141,23 @@ static target_ulong h_client_architecture_support(PowerPCCPU *cpu,
     ov5_updates = spapr_ovec_new();
     spapr->cas_reboot = spapr_ovec_diff(ov5_updates,
                                         ov5_cas_old, spapr->ov5_cas);
-
+    /* Now that processing is finished, set the radix/hash bit for the
+     * guest if it requested a valid mode; otherwise terminate the boot. */
+    if (guest_radix) {
+        if (kvm_enabled() && !kvmppc_has_cap_mmu_radix()) {
+            error_report("Guest requested unavailable MMU mode (radix).");
+            exit(EXIT_FAILURE);
+        }
+        spapr_ovec_set(spapr->ov5_cas, OV5_MMU_RADIX_300);
+    } else {
+        if (kvm_enabled() && kvmppc_has_cap_mmu_radix()
+            && !kvmppc_has_cap_mmu_hash_v3()) {
+            error_report("Guest requested unavailable MMU mode (hash).");
+            exit(EXIT_FAILURE);
+        }
+    }
+    spapr->cas_legacy_guest_workaround = !spapr_ovec_test(ov1_guest,
+                                                          OV1_PPC_3_00);
     if (!spapr->cas_reboot) {
         spapr->cas_reboot =
             (spapr_h_cas_compose_response(spapr, args[1], args[2],
@@ -1009,6 +1167,13 @@ static target_ulong h_client_architecture_support(PowerPCCPU *cpu,
 
     if (spapr->cas_reboot) {
         qemu_system_reset_request();
+    } else {
+        /* If ppc_spapr_reset() did not set up a HPT but one is necessary
+         * (because the guest isn't going to use radix) then set it up here. */
+        if ((spapr->patb_entry & PATBE1_GR) && !guest_radix) {
+            /* legacy hash or new hash: */
+            spapr_setup_hpt_and_vrma(spapr);
+        }
     }
 
     return H_SUCCESS;
@@ -1084,6 +1249,11 @@ static void hypercall_register_types(void)
     spapr_register_hypercall(H_PAGE_INIT, h_page_init);
     spapr_register_hypercall(H_SET_MODE, h_set_mode);
 
+    /* In Memory Table MMU h-calls */
+    spapr_register_hypercall(H_CLEAN_SLB, h_clean_slb);
+    spapr_register_hypercall(H_INVALIDATE_PID, h_invalidate_pid);
+    spapr_register_hypercall(H_REGISTER_PROC_TBL, h_register_process_table);
+
     /* "debugger" hcalls (also used by SLOF). Note: We do -not- differenciate
      * here between the "CI" and the "CACHE" variants, they will use whatever
      * mapping attributes qemu is using. When using KVM, the kernel will
diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
index ae30bbe30f..29c80bb3c8 100644
--- a/hw/ppc/spapr_iommu.c
+++ b/hw/ppc/spapr_iommu.c
@@ -79,15 +79,16 @@ static IOMMUAccessFlags spapr_tce_iommu_access_flags(uint64_t tce)
 
 static uint64_t *spapr_tce_alloc_table(uint32_t liobn,
                                        uint32_t page_shift,
+                                       uint64_t bus_offset,
                                        uint32_t nb_table,
                                        int *fd,
                                        bool need_vfio)
 {
     uint64_t *table = NULL;
-    uint64_t window_size = (uint64_t)nb_table << page_shift;
 
-    if (kvm_enabled() && !(window_size >> 32)) {
-        table = kvmppc_create_spapr_tce(liobn, window_size, fd, need_vfio);
+    if (kvm_enabled()) {
+        table = kvmppc_create_spapr_tce(liobn, page_shift, bus_offset, nb_table,
+                                        fd, need_vfio);
     }
 
     if (!table) {
@@ -342,6 +343,7 @@ void spapr_tce_table_enable(sPAPRTCETable *tcet,
     tcet->nb_table = nb_table;
     tcet->table = spapr_tce_alloc_table(tcet->liobn,
                                         tcet->page_shift,
+                                        tcet->bus_offset,
                                         tcet->nb_table,
                                         &tcet->fd,
                                         tcet->need_vfio);
diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
index 98c52e411f..e7567e2e8f 100644
--- a/hw/ppc/spapr_pci.c
+++ b/hw/ppc/spapr_pci.c
@@ -50,8 +50,6 @@
 #include "sysemu/hostmem.h"
 #include "sysemu/numa.h"
 
-#include "hw/vfio/vfio.h"
-
 /* Copied from the kernel arch/powerpc/platforms/pseries/msi.c */
 #define RTAS_QUERY_FN           0
 #define RTAS_CHANGE_FN          1
@@ -1771,6 +1769,12 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp)
     }
 
     /* DMA setup */
+    if ((sphb->page_size_mask & qemu_getrampagesize()) == 0) {
+        error_report("System page size 0x%lx is not enabled in page_size_mask "
+                     "(0x%"PRIx64"). Performance may be slow",
+                     qemu_getrampagesize(), sphb->page_size_mask);
+    }
+
     for (i = 0; i < windows_supported; ++i) {
         tcet = spapr_tce_new_table(DEVICE(sphb), sphb->dma_liobn[i]);
         if (!tcet) {
diff --git a/hw/ppc/spapr_rtc.c b/hw/ppc/spapr_rtc.c
index 3a17ac42e4..00a4e4c717 100644
--- a/hw/ppc/spapr_rtc.c
+++ b/hw/ppc/spapr_rtc.c
@@ -33,19 +33,8 @@
 #include "qapi-event.h"
 #include "qemu/cutils.h"
 
-#define SPAPR_RTC(obj) \
-    OBJECT_CHECK(sPAPRRTCState, (obj), TYPE_SPAPR_RTC)
-
-typedef struct sPAPRRTCState sPAPRRTCState;
-struct sPAPRRTCState {
-    /*< private >*/
-    SysBusDevice parent_obj;
-    int64_t ns_offset;
-};
-
-void spapr_rtc_read(DeviceState *dev, struct tm *tm, uint32_t *ns)
+void spapr_rtc_read(sPAPRRTCState *rtc, struct tm *tm, uint32_t *ns)
 {
-    sPAPRRTCState *rtc = SPAPR_RTC(dev);
     int64_t host_ns = qemu_clock_get_ns(rtc_clock);
     int64_t guest_ns;
     time_t guest_s;
@@ -63,16 +52,12 @@ void spapr_rtc_read(DeviceState *dev, struct tm *tm, uint32_t *ns)
     }
 }
 
-int spapr_rtc_import_offset(DeviceState *dev, int64_t legacy_offset)
+int spapr_rtc_import_offset(sPAPRRTCState *rtc, int64_t legacy_offset)
 {
-    sPAPRRTCState *rtc;
-
-    if (!dev) {
+    if (!rtc) {
         return -ENODEV;
     }
 
-    rtc = SPAPR_RTC(dev);
-
     rtc->ns_offset = legacy_offset * NANOSECONDS_PER_SECOND;
 
     return 0;
@@ -91,12 +76,7 @@ static void rtas_get_time_of_day(PowerPCCPU *cpu, sPAPRMachineState *spapr,
         return;
     }
 
-    if (!spapr->rtc) {
-        rtas_st(rets, 0, RTAS_OUT_HW_ERROR);
-        return;
-    }
-
-    spapr_rtc_read(spapr->rtc, &tm, &ns);
+    spapr_rtc_read(&spapr->rtc, &tm, &ns);
 
     rtas_st(rets, 0, RTAS_OUT_SUCCESS);
     rtas_st(rets, 1, tm.tm_year + 1900);
@@ -113,7 +93,7 @@ static void rtas_set_time_of_day(PowerPCCPU *cpu, sPAPRMachineState *spapr,
                                  target_ulong args,
                                  uint32_t nret, target_ulong rets)
 {
-    sPAPRRTCState *rtc;
+    sPAPRRTCState *rtc = &spapr->rtc;
     struct tm tm;
     time_t new_s;
     int64_t host_ns;
@@ -123,11 +103,6 @@ static void rtas_set_time_of_day(PowerPCCPU *cpu, sPAPRMachineState *spapr,
         return;
     }
 
-    if (!spapr->rtc) {
-        rtas_st(rets, 0, RTAS_OUT_HW_ERROR);
-        return;
-    }
-
     tm.tm_year = rtas_ld(args, 0) - 1900;
     tm.tm_mon = rtas_ld(args, 1) - 1;
     tm.tm_mday = rtas_ld(args, 2);
@@ -144,8 +119,6 @@ static void rtas_set_time_of_day(PowerPCCPU *cpu, sPAPRMachineState *spapr,
     /* Generate a monitor event for the change */
     qapi_event_send_rtc_change(qemu_timedate_diff(&tm), &error_abort);
 
-    rtc = SPAPR_RTC(spapr->rtc);
-
     host_ns = qemu_clock_get_ns(rtc_clock);
 
     rtc->ns_offset = (new_s * NANOSECONDS_PER_SECOND) - host_ns;
@@ -155,7 +128,7 @@ static void rtas_set_time_of_day(PowerPCCPU *cpu, sPAPRMachineState *spapr,
 
 static void spapr_rtc_qom_date(Object *obj, struct tm *current_tm, Error **errp)
 {
-    spapr_rtc_read(DEVICE(obj), current_tm, NULL);
+    spapr_rtc_read(SPAPR_RTC(obj), current_tm, NULL);
 }
 
 static void spapr_rtc_realize(DeviceState *dev, Error **errp)
@@ -200,7 +173,7 @@ static void spapr_rtc_class_init(ObjectClass *oc, void *data)
 
 static const TypeInfo spapr_rtc_info = {
     .name          = TYPE_SPAPR_RTC,
-    .parent        = TYPE_SYS_BUS_DEVICE,
+    .parent        = TYPE_DEVICE,
     .instance_size = sizeof(sPAPRRTCState),
     .class_init    = spapr_rtc_class_init,
 };