summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r--MAINTAINERS2
-rw-r--r--default-configs/arm-softmmu.mak1
-rw-r--r--hw/arm/armsse.c87
-rw-r--r--hw/misc/Makefile.objs1
-rw-r--r--hw/misc/armsse-mhu.c198
-rw-r--r--hw/misc/iotkit-sysctl.c294
-rw-r--r--hw/misc/trace-events4
-rw-r--r--include/hw/arm/armsse.h3
-rw-r--r--include/hw/misc/armsse-mhu.h44
-rw-r--r--include/hw/misc/iotkit-sysctl.h25
-rw-r--r--linux-user/elfload.c2
-rw-r--r--target/arm/arm-powerctl.c56
-rw-r--r--target/arm/arm-powerctl.h16
-rw-r--r--target/arm/cpu.c32
-rw-r--r--target/arm/cpu.h76
-rw-r--r--target/arm/cpu64.c2
-rw-r--r--target/arm/helper.c27
-rw-r--r--target/arm/helper.h9
-rw-r--r--target/arm/kvm32.c23
-rw-r--r--target/arm/kvm64.c2
-rw-r--r--target/arm/machine.c2
-rw-r--r--target/arm/translate-a64.c49
-rw-r--r--target/arm/translate.c180
-rw-r--r--target/arm/vec_helper.c148
24 files changed, 1137 insertions, 146 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index 8f9f9d7c7d..0a6190ecff 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -633,6 +633,8 @@ F: hw/misc/iotkit-sysinfo.c
 F: include/hw/misc/iotkit-sysinfo.h
 F: hw/misc/armsse-cpuid.c
 F: include/hw/misc/armsse-cpuid.h
+F: hw/misc/armsse-mhu.c
+F: include/hw/misc/armsse-mhu.h
 
 Musca
 M: Peter Maydell <peter.maydell@linaro.org>
diff --git a/default-configs/arm-softmmu.mak b/default-configs/arm-softmmu.mak
index 87ad267494..bd6943b691 100644
--- a/default-configs/arm-softmmu.mak
+++ b/default-configs/arm-softmmu.mak
@@ -119,6 +119,7 @@ CONFIG_IOTKIT_SECCTL=y
 CONFIG_IOTKIT_SYSCTL=y
 CONFIG_IOTKIT_SYSINFO=y
 CONFIG_ARMSSE_CPUID=y
+CONFIG_ARMSSE_MHU=y
 
 CONFIG_VERSATILE=y
 CONFIG_VERSATILE_PCI=y
diff --git a/hw/arm/armsse.c b/hw/arm/armsse.c
index 129e7ea7fe..76cc690579 100644
--- a/hw/arm/armsse.c
+++ b/hw/arm/armsse.c
@@ -11,6 +11,7 @@
 
 #include "qemu/osdep.h"
 #include "qemu/log.h"
+#include "qemu/bitops.h"
 #include "qapi/error.h"
 #include "trace.h"
 #include "hw/sysbus.h"
@@ -29,6 +30,7 @@ struct ARMSSEInfo {
     int sram_banks;
     int num_cpus;
     uint32_t sys_version;
+    uint32_t cpuwait_rst;
     SysConfigFormat sys_config_format;
     bool has_mhus;
     bool has_ppus;
@@ -43,6 +45,7 @@ static const ARMSSEInfo armsse_variants[] = {
         .sram_banks = 1,
         .num_cpus = 1,
         .sys_version = 0x41743,
+        .cpuwait_rst = 0,
         .sys_config_format = IoTKitFormat,
         .has_mhus = false,
         .has_ppus = false,
@@ -55,6 +58,7 @@ static const ARMSSEInfo armsse_variants[] = {
         .sram_banks = 4,
         .num_cpus = 2,
         .sys_version = 0x22041743,
+        .cpuwait_rst = 2,
         .sys_config_format = SSE200Format,
         .has_mhus = true,
         .has_ppus = true,
@@ -282,9 +286,9 @@ static void armsse_init(Object *obj)
                           sizeof(s->sysinfo), TYPE_IOTKIT_SYSINFO);
     if (info->has_mhus) {
         sysbus_init_child_obj(obj, "mhu0", &s->mhu[0], sizeof(s->mhu[0]),
-                              TYPE_UNIMPLEMENTED_DEVICE);
+                              TYPE_ARMSSE_MHU);
         sysbus_init_child_obj(obj, "mhu1", &s->mhu[1], sizeof(s->mhu[1]),
-                              TYPE_UNIMPLEMENTED_DEVICE);
+                              TYPE_ARMSSE_MHU);
     }
     if (info->has_ppus) {
         for (i = 0; i < info->num_cpus; i++) {
@@ -495,30 +499,33 @@ static void armsse_realize(DeviceState *dev, Error **errp)
 
         qdev_prop_set_uint32(cpudev, "num-irq", s->exp_numirq + 32);
         /*
-         * In real hardware the initial Secure VTOR is set from the INITSVTOR0
-         * register in the IoT Kit System Control Register block, and the
-         * initial value of that is in turn specifiable by the FPGA that
-         * instantiates the IoT Kit. In QEMU we don't implement this wrinkle,
-         * and simply set the CPU's init-svtor to the IoT Kit default value.
-         * In SSE-200 the situation is similar, except that the default value
-         * is a reset-time signal input. Typically a board using the SSE-200
-         * will have a system control processor whose boot firmware initializes
-         * the INITSVTOR* registers before powering up the CPUs in any case,
-         * so the hardware's default value doesn't matter. QEMU doesn't emulate
+         * In real hardware the initial Secure VTOR is set from the INITSVTOR*
+         * registers in the IoT Kit System Control Register block. In QEMU
+         * we set the initial value here, and also the reset value of the
+         * sysctl register, from this object's QOM init-svtor property.
+         * If the guest changes the INITSVTOR* registers at runtime then the
+         * code in iotkit-sysctl.c will update the CPU init-svtor property
+         * (which will then take effect on the next CPU warm-reset).
+         *
+         * Note that typically a board using the SSE-200 will have a system
+         * control processor whose boot firmware initializes the INITSVTOR*
+         * registers before powering up the CPUs. QEMU doesn't emulate
          * the control processor, so instead we behave in the way that the
-         * firmware does. The initial value is configurable by the board code
-         * to match whatever its firmware does.
+         * firmware does: the initial value should be set by the board code
+         * (using the init-svtor property on the ARMSSE object) to match
+         * whatever its firmware does.
          */
         qdev_prop_set_uint32(cpudev, "init-svtor", s->init_svtor);
         /*
-         * Start all CPUs except CPU0 powered down. In real hardware it is
-         * a configurable property of the SSE-200 which CPUs start powered up
-         * (via the CPUWAIT0_RST and CPUWAIT1_RST parameters), but since all
-         * the boards we care about start CPU0 and leave CPU1 powered off,
-         * we hard-code that for now. We can add QOM properties for this
+         * CPUs start powered down if the corresponding bit in the CPUWAIT
+         * register is 1. In real hardware the CPUWAIT register reset value is
+         * a configurable property of the SSE-200 (via the CPUWAIT0_RST and
+         * CPUWAIT1_RST parameters), but since all the boards we care about
+         * start CPU0 and leave CPU1 powered off, we hard-code that in
+         * info->cpuwait_rst for now. We can add QOM properties for this
          * later if necessary.
          */
-        if (i > 0) {
+        if (extract32(info->cpuwait_rst, i, 1)) {
             object_property_set_bool(cpuobj, true, "start-powered-off", &err);
             if (err) {
                 error_propagate(errp, err);
@@ -766,22 +773,28 @@ static void armsse_realize(DeviceState *dev, Error **errp)
     }
 
     if (info->has_mhus) {
+        /*
+         * An SSE-200 with only one CPU should have only one MHU created,
+         * with the region where the second MHU usually is being RAZ/WI.
+         * We don't implement that SSE-200 config; if we want to support
+         * it then this code needs to be enhanced to handle creating the
+         * RAZ/WI region instead of the second MHU.
+         */
+        assert(info->num_cpus == ARRAY_SIZE(s->mhu));
+
         for (i = 0; i < ARRAY_SIZE(s->mhu); i++) {
-            char *name;
             char *port;
+            int cpunum;
+            SysBusDevice *mhu_sbd = SYS_BUS_DEVICE(&s->mhu[i]);
 
-            name = g_strdup_printf("MHU%d", i);
-            qdev_prop_set_string(DEVICE(&s->mhu[i]), "name", name);
-            qdev_prop_set_uint64(DEVICE(&s->mhu[i]), "size", 0x1000);
             object_property_set_bool(OBJECT(&s->mhu[i]), true,
                                      "realized", &err);
-            g_free(name);
             if (err) {
                 error_propagate(errp, err);
                 return;
             }
             port = g_strdup_printf("port[%d]", i + 3);
-            mr = sysbus_mmio_get_region(SYS_BUS_DEVICE(&s->mhu[i]), 0);
+            mr = sysbus_mmio_get_region(mhu_sbd, 0);
             object_property_set_link(OBJECT(&s->apb_ppc0), OBJECT(mr),
                                      port, &err);
             g_free(port);
@@ -789,6 +802,20 @@ static void armsse_realize(DeviceState *dev, Error **errp)
                 error_propagate(errp, err);
                 return;
             }
+
+            /*
+             * Each MHU has an irq line for each CPU:
+             *  MHU 0 irq line 0 -> CPU 0 IRQ 6
+             *  MHU 0 irq line 1 -> CPU 1 IRQ 6
+             *  MHU 1 irq line 0 -> CPU 0 IRQ 7
+             *  MHU 1 irq line 1 -> CPU 1 IRQ 7
+             */
+            for (cpunum = 0; cpunum < info->num_cpus; cpunum++) {
+                DeviceState *cpudev = DEVICE(&s->armv7m[cpunum]);
+
+                sysbus_connect_irq(mhu_sbd, cpunum,
+                                   qdev_get_gpio_in(cpudev, 6 + i));
+            }
         }
     }
 
@@ -977,6 +1004,14 @@ static void armsse_realize(DeviceState *dev, Error **errp)
     /* System information registers */
     sysbus_mmio_map(SYS_BUS_DEVICE(&s->sysinfo), 0, 0x40020000);
     /* System control registers */
+    object_property_set_int(OBJECT(&s->sysctl), info->sys_version,
+                            "SYS_VERSION", &err);
+    object_property_set_int(OBJECT(&s->sysctl), info->cpuwait_rst,
+                            "CPUWAIT_RST", &err);
+    object_property_set_int(OBJECT(&s->sysctl), s->init_svtor,
+                            "INITSVTOR0_RST", &err);
+    object_property_set_int(OBJECT(&s->sysctl), s->init_svtor,
+                            "INITSVTOR1_RST", &err);
     object_property_set_bool(OBJECT(&s->sysctl), true, "realized", &err);
     if (err) {
         error_propagate(errp, err);
diff --git a/hw/misc/Makefile.objs b/hw/misc/Makefile.objs
index 74c91d250c..c71e07ae35 100644
--- a/hw/misc/Makefile.objs
+++ b/hw/misc/Makefile.objs
@@ -70,6 +70,7 @@ obj-$(CONFIG_IOTKIT_SECCTL) += iotkit-secctl.o
 obj-$(CONFIG_IOTKIT_SYSCTL) += iotkit-sysctl.o
 obj-$(CONFIG_IOTKIT_SYSINFO) += iotkit-sysinfo.o
 obj-$(CONFIG_ARMSSE_CPUID) += armsse-cpuid.o
+obj-$(CONFIG_ARMSSE_MHU) += armsse-mhu.o
 
 obj-$(CONFIG_PVPANIC) += pvpanic.o
 obj-$(CONFIG_AUX) += auxbus.o
diff --git a/hw/misc/armsse-mhu.c b/hw/misc/armsse-mhu.c
new file mode 100644
index 0000000000..9ebca32e9a
--- /dev/null
+++ b/hw/misc/armsse-mhu.c
@@ -0,0 +1,198 @@
+/*
+ * ARM SSE-200 Message Handling Unit (MHU)
+ *
+ * Copyright (c) 2019 Linaro Limited
+ * Written by Peter Maydell
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 or
+ *  (at your option) any later version.
+ */
+
+/*
+ * This is a model of the Message Handling Unit (MHU) which is part of the
+ * Arm SSE-200 and documented in
+ * http://infocenter.arm.com/help/topic/com.arm.doc.101104_0100_00_en/corelink_sse200_subsystem_for_embedded_technical_reference_manual_101104_0100_00_en.pdf
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/log.h"
+#include "trace.h"
+#include "qapi/error.h"
+#include "sysemu/sysemu.h"
+#include "hw/sysbus.h"
+#include "hw/registerfields.h"
+#include "hw/misc/armsse-mhu.h"
+
+REG32(CPU0INTR_STAT, 0x0)
+REG32(CPU0INTR_SET, 0x4)
+REG32(CPU0INTR_CLR, 0x8)
+REG32(CPU1INTR_STAT, 0x10)
+REG32(CPU1INTR_SET, 0x14)
+REG32(CPU1INTR_CLR, 0x18)
+REG32(PID4, 0xfd0)
+REG32(PID5, 0xfd4)
+REG32(PID6, 0xfd8)
+REG32(PID7, 0xfdc)
+REG32(PID0, 0xfe0)
+REG32(PID1, 0xfe4)
+REG32(PID2, 0xfe8)
+REG32(PID3, 0xfec)
+REG32(CID0, 0xff0)
+REG32(CID1, 0xff4)
+REG32(CID2, 0xff8)
+REG32(CID3, 0xffc)
+
+/* Valid bits in the interrupt registers. If any are set the IRQ is raised */
+#define INTR_MASK 0xf
+
+/* PID/CID values */
+static const int armsse_mhu_id[] = {
+    0x04, 0x00, 0x00, 0x00, /* PID4..PID7 */
+    0x56, 0xb8, 0x0b, 0x00, /* PID0..PID3 */
+    0x0d, 0xf0, 0x05, 0xb1, /* CID0..CID3 */
+};
+
+static void armsse_mhu_update(ARMSSEMHU *s)
+{
+    qemu_set_irq(s->cpu0irq, s->cpu0intr != 0);
+    qemu_set_irq(s->cpu1irq, s->cpu1intr != 0);
+}
+
+static uint64_t armsse_mhu_read(void *opaque, hwaddr offset, unsigned size)
+{
+    ARMSSEMHU *s = ARMSSE_MHU(opaque);
+    uint64_t r;
+
+    switch (offset) {
+    case A_CPU0INTR_STAT:
+        r = s->cpu0intr;
+        break;
+
+    case A_CPU1INTR_STAT:
+        r = s->cpu1intr;
+        break;
+
+    case A_PID4 ... A_CID3:
+        r = armsse_mhu_id[(offset - A_PID4) / 4];
+        break;
+
+    case A_CPU0INTR_SET:
+    case A_CPU0INTR_CLR:
+    case A_CPU1INTR_SET:
+    case A_CPU1INTR_CLR:
+        qemu_log_mask(LOG_GUEST_ERROR,
+                      "SSE MHU: read of write-only register at offset 0x%x\n",
+                      (int)offset);
+        r = 0;
+        break;
+
+    default:
+        qemu_log_mask(LOG_GUEST_ERROR,
+                      "SSE MHU read: bad offset 0x%x\n", (int)offset);
+        r = 0;
+        break;
+    }
+    trace_armsse_mhu_read(offset, r, size);
+    return r;
+}
+
+static void armsse_mhu_write(void *opaque, hwaddr offset,
+                             uint64_t value, unsigned size)
+{
+    ARMSSEMHU *s = ARMSSE_MHU(opaque);
+
+    trace_armsse_mhu_write(offset, value, size);
+
+    switch (offset) {
+    case A_CPU0INTR_SET:
+        s->cpu0intr |= (value & INTR_MASK);
+        break;
+    case A_CPU0INTR_CLR:
+        s->cpu0intr &= ~(value & INTR_MASK);
+        break;
+    case A_CPU1INTR_SET:
+        s->cpu1intr |= (value & INTR_MASK);
+        break;
+    case A_CPU1INTR_CLR:
+        s->cpu1intr &= ~(value & INTR_MASK);
+        break;
+
+    case A_CPU0INTR_STAT:
+    case A_CPU1INTR_STAT:
+    case A_PID4 ... A_CID3:
+        qemu_log_mask(LOG_GUEST_ERROR,
+                      "SSE MHU: write to read-only register at offset 0x%x\n",
+                      (int)offset);
+        break;
+
+    default:
+        qemu_log_mask(LOG_GUEST_ERROR,
+                      "SSE MHU write: bad offset 0x%x\n", (int)offset);
+        break;
+    }
+
+    armsse_mhu_update(s);
+}
+
+static const MemoryRegionOps armsse_mhu_ops = {
+    .read = armsse_mhu_read,
+    .write = armsse_mhu_write,
+    .endianness = DEVICE_LITTLE_ENDIAN,
+    .valid.min_access_size = 4,
+    .valid.max_access_size = 4,
+};
+
+static void armsse_mhu_reset(DeviceState *dev)
+{
+    ARMSSEMHU *s = ARMSSE_MHU(dev);
+
+    s->cpu0intr = 0;
+    s->cpu1intr = 0;
+}
+
+static const VMStateDescription armsse_mhu_vmstate = {
+    .name = "armsse-mhu",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT32(cpu0intr, ARMSSEMHU),
+        VMSTATE_UINT32(cpu1intr, ARMSSEMHU),
+        VMSTATE_END_OF_LIST()
+    },
+};
+
+static void armsse_mhu_init(Object *obj)
+{
+    SysBusDevice *sbd = SYS_BUS_DEVICE(obj);
+    ARMSSEMHU *s = ARMSSE_MHU(obj);
+
+    memory_region_init_io(&s->iomem, obj, &armsse_mhu_ops,
+                          s, "armsse-mhu", 0x1000);
+    sysbus_init_mmio(sbd, &s->iomem);
+    sysbus_init_irq(sbd, &s->cpu0irq);
+    sysbus_init_irq(sbd, &s->cpu1irq);
+}
+
+static void armsse_mhu_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+
+    dc->reset = armsse_mhu_reset;
+    dc->vmsd = &armsse_mhu_vmstate;
+}
+
+static const TypeInfo armsse_mhu_info = {
+    .name = TYPE_ARMSSE_MHU,
+    .parent = TYPE_SYS_BUS_DEVICE,
+    .instance_size = sizeof(ARMSSEMHU),
+    .instance_init = armsse_mhu_init,
+    .class_init = armsse_mhu_class_init,
+};
+
+static void armsse_mhu_register_types(void)
+{
+    type_register_static(&armsse_mhu_info);
+}
+
+type_init(armsse_mhu_register_types);
diff --git a/hw/misc/iotkit-sysctl.c b/hw/misc/iotkit-sysctl.c
index a21d8bd678..54064a31ef 100644
--- a/hw/misc/iotkit-sysctl.c
+++ b/hw/misc/iotkit-sysctl.c
@@ -17,6 +17,7 @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/bitops.h"
 #include "qemu/log.h"
 #include "trace.h"
 #include "qapi/error.h"
@@ -24,19 +25,32 @@
 #include "hw/sysbus.h"
 #include "hw/registerfields.h"
 #include "hw/misc/iotkit-sysctl.h"
+#include "target/arm/arm-powerctl.h"
+#include "target/arm/cpu.h"
 
 REG32(SECDBGSTAT, 0x0)
 REG32(SECDBGSET, 0x4)
 REG32(SECDBGCLR, 0x8)
+REG32(SCSECCTRL, 0xc)
+REG32(FCLK_DIV, 0x10)
+REG32(SYSCLK_DIV, 0x14)
+REG32(CLOCK_FORCE, 0x18)
 REG32(RESET_SYNDROME, 0x100)
 REG32(RESET_MASK, 0x104)
 REG32(SWRESET, 0x108)
     FIELD(SWRESET, SWRESETREQ, 9, 1)
 REG32(GRETREG, 0x10c)
-REG32(INITSVRTOR0, 0x110)
+REG32(INITSVTOR0, 0x110)
+REG32(INITSVTOR1, 0x114)
 REG32(CPUWAIT, 0x118)
-REG32(BUSWAIT, 0x11c)
+REG32(NMI_ENABLE, 0x11c) /* BUSWAIT in IoTKit */
 REG32(WICCTRL, 0x120)
+REG32(EWCTRL, 0x124)
+REG32(PDCM_PD_SYS_SENSE, 0x200)
+REG32(PDCM_PD_SRAM0_SENSE, 0x20c)
+REG32(PDCM_PD_SRAM1_SENSE, 0x210)
+REG32(PDCM_PD_SRAM2_SENSE, 0x214)
+REG32(PDCM_PD_SRAM3_SENSE, 0x218)
 REG32(PID4, 0xfd0)
 REG32(PID5, 0xfd4)
 REG32(PID6, 0xfd8)
@@ -57,6 +71,21 @@ static const int sysctl_id[] = {
     0x0d, 0xf0, 0x05, 0xb1, /* CID0..CID3 */
 };
 
+/*
+ * Set the initial secure vector table offset address for the core.
+ * This will take effect when the CPU next resets.
+ */
+static void set_init_vtor(uint64_t cpuid, uint32_t vtor)
+{
+    Object *cpuobj = OBJECT(arm_get_cpu_by_id(cpuid));
+
+    if (cpuobj) {
+        if (object_property_find(cpuobj, "init-svtor", NULL)) {
+            object_property_set_uint(cpuobj, vtor, "init-svtor", &error_abort);
+        }
+    }
+}
+
 static uint64_t iotkit_sysctl_read(void *opaque, hwaddr offset,
                                     unsigned size)
 {
@@ -67,6 +96,30 @@ static uint64_t iotkit_sysctl_read(void *opaque, hwaddr offset,
     case A_SECDBGSTAT:
         r = s->secure_debug;
         break;
+    case A_SCSECCTRL:
+        if (!s->is_sse200) {
+            goto bad_offset;
+        }
+        r = s->scsecctrl;
+        break;
+    case A_FCLK_DIV:
+        if (!s->is_sse200) {
+            goto bad_offset;
+        }
+        r = s->fclk_div;
+        break;
+    case A_SYSCLK_DIV:
+        if (!s->is_sse200) {
+            goto bad_offset;
+        }
+        r = s->sysclk_div;
+        break;
+    case A_CLOCK_FORCE:
+        if (!s->is_sse200) {
+            goto bad_offset;
+        }
+        r = s->clock_force;
+        break;
     case A_RESET_SYNDROME:
         r = s->reset_syndrome;
         break;
@@ -76,19 +129,65 @@ static uint64_t iotkit_sysctl_read(void *opaque, hwaddr offset,
     case A_GRETREG:
         r = s->gretreg;
         break;
-    case A_INITSVRTOR0:
-        r = s->initsvrtor0;
+    case A_INITSVTOR0:
+        r = s->initsvtor0;
+        break;
+    case A_INITSVTOR1:
+        if (!s->is_sse200) {
+            goto bad_offset;
+        }
+        r = s->initsvtor1;
         break;
     case A_CPUWAIT:
         r = s->cpuwait;
         break;
-    case A_BUSWAIT:
-        /* In IoTKit BUSWAIT is reserved, R/O, zero */
-        r = 0;
+    case A_NMI_ENABLE:
+        /* In IoTKit this is named BUSWAIT but is marked reserved, R/O, zero */
+        if (!s->is_sse200) {
+            r = 0;
+            break;
+        }
+        r = s->nmi_enable;
         break;
     case A_WICCTRL:
         r = s->wicctrl;
         break;
+    case A_EWCTRL:
+        if (!s->is_sse200) {
+            goto bad_offset;
+        }
+        r = s->ewctrl;
+        break;
+    case A_PDCM_PD_SYS_SENSE:
+        if (!s->is_sse200) {
+            goto bad_offset;
+        }
+        r = s->pdcm_pd_sys_sense;
+        break;
+    case A_PDCM_PD_SRAM0_SENSE:
+        if (!s->is_sse200) {
+            goto bad_offset;
+        }
+        r = s->pdcm_pd_sram0_sense;
+        break;
+    case A_PDCM_PD_SRAM1_SENSE:
+        if (!s->is_sse200) {
+            goto bad_offset;
+        }
+        r = s->pdcm_pd_sram1_sense;
+        break;
+    case A_PDCM_PD_SRAM2_SENSE:
+        if (!s->is_sse200) {
+            goto bad_offset;
+        }
+        r = s->pdcm_pd_sram2_sense;
+        break;
+    case A_PDCM_PD_SRAM3_SENSE:
+        if (!s->is_sse200) {
+            goto bad_offset;
+        }
+        r = s->pdcm_pd_sram3_sense;
+        break;
     case A_PID4 ... A_CID3:
         r = sysctl_id[(offset - A_PID4) / 4];
         break;
@@ -101,6 +200,7 @@ static uint64_t iotkit_sysctl_read(void *opaque, hwaddr offset,
         r = 0;
         break;
     default:
+    bad_offset:
         qemu_log_mask(LOG_GUEST_ERROR,
                       "IoTKit SysCtl read: bad offset %x\n", (int)offset);
         r = 0;
@@ -145,12 +245,19 @@ static void iotkit_sysctl_write(void *opaque, hwaddr offset,
          */
         s->gretreg = value;
         break;
-    case A_INITSVRTOR0:
-        qemu_log_mask(LOG_UNIMP, "IoTKit SysCtl INITSVRTOR0 unimplemented\n");
-        s->initsvrtor0 = value;
+    case A_INITSVTOR0:
+        s->initsvtor0 = value;
+        set_init_vtor(0, s->initsvtor0);
         break;
     case A_CPUWAIT:
-        qemu_log_mask(LOG_UNIMP, "IoTKit SysCtl CPUWAIT unimplemented\n");
+        if ((s->cpuwait & 1) && !(value & 1)) {
+            /* Powering up CPU 0 */
+            arm_set_cpu_on_and_reset(0);
+        }
+        if ((s->cpuwait & 2) && !(value & 2)) {
+            /* Powering up CPU 1 */
+            arm_set_cpu_on_and_reset(1);
+        }
         s->cpuwait = value;
         break;
     case A_WICCTRL:
@@ -172,14 +279,105 @@ static void iotkit_sysctl_write(void *opaque, hwaddr offset,
             qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
         }
         break;
-    case A_BUSWAIT:        /* In IoTKit BUSWAIT is reserved, R/O, zero */
+    case A_SCSECCTRL:
+        if (!s->is_sse200) {
+            goto bad_offset;
+        }
+        qemu_log_mask(LOG_UNIMP, "IoTKit SysCtl SCSECCTRL unimplemented\n");
+        s->scsecctrl = value;
+        break;
+    case A_FCLK_DIV:
+        if (!s->is_sse200) {
+            goto bad_offset;
+        }
+        qemu_log_mask(LOG_UNIMP, "IoTKit SysCtl FCLK_DIV unimplemented\n");
+        s->fclk_div = value;
+        break;
+    case A_SYSCLK_DIV:
+        if (!s->is_sse200) {
+            goto bad_offset;
+        }
+        qemu_log_mask(LOG_UNIMP, "IoTKit SysCtl SYSCLK_DIV unimplemented\n");
+        s->sysclk_div = value;
+        break;
+    case A_CLOCK_FORCE:
+        if (!s->is_sse200) {
+            goto bad_offset;
+        }
+        qemu_log_mask(LOG_UNIMP, "IoTKit SysCtl CLOCK_FORCE unimplemented\n");
+        s->clock_force = value;
+        break;
+    case A_INITSVTOR1:
+        if (!s->is_sse200) {
+            goto bad_offset;
+        }
+        s->initsvtor1 = value;
+        set_init_vtor(1, s->initsvtor1);
+        break;
+    case A_EWCTRL:
+        if (!s->is_sse200) {
+            goto bad_offset;
+        }
+        qemu_log_mask(LOG_UNIMP, "IoTKit SysCtl EWCTRL unimplemented\n");
+        s->ewctrl = value;
+        break;
+    case A_PDCM_PD_SYS_SENSE:
+        if (!s->is_sse200) {
+            goto bad_offset;
+        }
+        qemu_log_mask(LOG_UNIMP,
+                      "IoTKit SysCtl PDCM_PD_SYS_SENSE unimplemented\n");
+        s->pdcm_pd_sys_sense = value;
+        break;
+    case A_PDCM_PD_SRAM0_SENSE:
+        if (!s->is_sse200) {
+            goto bad_offset;
+        }
+        qemu_log_mask(LOG_UNIMP,
+                      "IoTKit SysCtl PDCM_PD_SRAM0_SENSE unimplemented\n");
+        s->pdcm_pd_sram0_sense = value;
+        break;
+    case A_PDCM_PD_SRAM1_SENSE:
+        if (!s->is_sse200) {
+            goto bad_offset;
+        }
+        qemu_log_mask(LOG_UNIMP,
+                      "IoTKit SysCtl PDCM_PD_SRAM1_SENSE unimplemented\n");
+        s->pdcm_pd_sram1_sense = value;
+        break;
+    case A_PDCM_PD_SRAM2_SENSE:
+        if (!s->is_sse200) {
+            goto bad_offset;
+        }
+        qemu_log_mask(LOG_UNIMP,
+                      "IoTKit SysCtl PDCM_PD_SRAM2_SENSE unimplemented\n");
+        s->pdcm_pd_sram2_sense = value;
+        break;
+    case A_PDCM_PD_SRAM3_SENSE:
+        if (!s->is_sse200) {
+            goto bad_offset;
+        }
+        qemu_log_mask(LOG_UNIMP,
+                      "IoTKit SysCtl PDCM_PD_SRAM3_SENSE unimplemented\n");
+        s->pdcm_pd_sram3_sense = value;
+        break;
+    case A_NMI_ENABLE:
+        /* In IoTKit this is BUSWAIT: reserved, R/O, zero */
+        if (!s->is_sse200) {
+            goto ro_offset;
+        }
+        qemu_log_mask(LOG_UNIMP, "IoTKit SysCtl NMI_ENABLE unimplemented\n");
+        s->nmi_enable = value;
+        break;
     case A_SECDBGSTAT:
     case A_PID4 ... A_CID3:
+    ro_offset:
         qemu_log_mask(LOG_GUEST_ERROR,
                       "IoTKit SysCtl write: write of RO offset %x\n",
                       (int)offset);
         break;
     default:
+    bad_offset:
         qemu_log_mask(LOG_GUEST_ERROR,
                       "IoTKit SysCtl write: bad offset %x\n", (int)offset);
         break;
@@ -206,9 +404,21 @@ static void iotkit_sysctl_reset(DeviceState *dev)
     s->reset_syndrome = 1;
     s->reset_mask = 0;
     s->gretreg = 0;
-    s->initsvrtor0 = 0x10000000;
-    s->cpuwait = 0;
+    s->initsvtor0 = s->initsvtor0_rst;
+    s->initsvtor1 = s->initsvtor1_rst;
+    s->cpuwait = s->cpuwait_rst;
     s->wicctrl = 0;
+    s->scsecctrl = 0;
+    s->fclk_div = 0;
+    s->sysclk_div = 0;
+    s->clock_force = 0;
+    s->nmi_enable = 0;
+    s->ewctrl = 0;
+    s->pdcm_pd_sys_sense = 0x7f;
+    s->pdcm_pd_sram0_sense = 0;
+    s->pdcm_pd_sram1_sense = 0;
+    s->pdcm_pd_sram2_sense = 0;
+    s->pdcm_pd_sram3_sense = 0;
 }
 
 static void iotkit_sysctl_init(Object *obj)
@@ -221,6 +431,44 @@ static void iotkit_sysctl_init(Object *obj)
     sysbus_init_mmio(sbd, &s->iomem);
 }
 
+static void iotkit_sysctl_realize(DeviceState *dev, Error **errp)
+{
+    IoTKitSysCtl *s = IOTKIT_SYSCTL(dev);
+
+    /* The top 4 bits of the SYS_VERSION register tell us if we're an SSE-200 */
+    if (extract32(s->sys_version, 28, 4) == 2) {
+        s->is_sse200 = true;
+    }
+}
+
+static bool sse200_needed(void *opaque)
+{
+    IoTKitSysCtl *s = IOTKIT_SYSCTL(opaque);
+
+    return s->is_sse200;
+}
+
+static const VMStateDescription iotkit_sysctl_sse200_vmstate = {
+    .name = "iotkit-sysctl/sse-200",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = sse200_needed,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT32(scsecctrl, IoTKitSysCtl),
+        VMSTATE_UINT32(fclk_div, IoTKitSysCtl),
+        VMSTATE_UINT32(sysclk_div, IoTKitSysCtl),
+        VMSTATE_UINT32(clock_force, IoTKitSysCtl),
+        VMSTATE_UINT32(initsvtor1, IoTKitSysCtl),
+        VMSTATE_UINT32(nmi_enable, IoTKitSysCtl),
+        VMSTATE_UINT32(pdcm_pd_sys_sense, IoTKitSysCtl),
+        VMSTATE_UINT32(pdcm_pd_sram0_sense, IoTKitSysCtl),
+        VMSTATE_UINT32(pdcm_pd_sram1_sense, IoTKitSysCtl),
+        VMSTATE_UINT32(pdcm_pd_sram2_sense, IoTKitSysCtl),
+        VMSTATE_UINT32(pdcm_pd_sram3_sense, IoTKitSysCtl),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
 static const VMStateDescription iotkit_sysctl_vmstate = {
     .name = "iotkit-sysctl",
     .version_id = 1,
@@ -230,19 +478,35 @@ static const VMStateDescription iotkit_sysctl_vmstate = {
         VMSTATE_UINT32(reset_syndrome, IoTKitSysCtl),
         VMSTATE_UINT32(reset_mask, IoTKitSysCtl),
         VMSTATE_UINT32(gretreg, IoTKitSysCtl),
-        VMSTATE_UINT32(initsvrtor0, IoTKitSysCtl),
+        VMSTATE_UINT32(initsvtor0, IoTKitSysCtl),
         VMSTATE_UINT32(cpuwait, IoTKitSysCtl),
         VMSTATE_UINT32(wicctrl, IoTKitSysCtl),
         VMSTATE_END_OF_LIST()
+    },
+    .subsections = (const VMStateDescription*[]) {
+        &iotkit_sysctl_sse200_vmstate,
+        NULL
     }
 };
 
+static Property iotkit_sysctl_props[] = {
+    DEFINE_PROP_UINT32("SYS_VERSION", IoTKitSysCtl, sys_version, 0),
+    DEFINE_PROP_UINT32("CPUWAIT_RST", IoTKitSysCtl, cpuwait_rst, 0),
+    DEFINE_PROP_UINT32("INITSVTOR0_RST", IoTKitSysCtl, initsvtor0_rst,
+                       0x10000000),
+    DEFINE_PROP_UINT32("INITSVTOR1_RST", IoTKitSysCtl, initsvtor1_rst,
+                       0x10000000),
+    DEFINE_PROP_END_OF_LIST()
+};
+
 static void iotkit_sysctl_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
 
     dc->vmsd = &iotkit_sysctl_vmstate;
     dc->reset = iotkit_sysctl_reset;
+    dc->props = iotkit_sysctl_props;
+    dc->realize = iotkit_sysctl_realize;
 }
 
 static const TypeInfo iotkit_sysctl_info = {
diff --git a/hw/misc/trace-events b/hw/misc/trace-events
index b0701bddd3..c1795bb54b 100644
--- a/hw/misc/trace-events
+++ b/hw/misc/trace-events
@@ -136,3 +136,7 @@ iotkit_sysctl_reset(void) "IoTKit SysCtl: reset"
 # hw/misc/armsse-cpuid.c
 armsse_cpuid_read(uint64_t offset, uint64_t data, unsigned size) "SSE-200 CPU_IDENTITY read: offset 0x%" PRIx64 " data 0x%" PRIx64 " size %u"
 armsse_cpuid_write(uint64_t offset, uint64_t data, unsigned size) "SSE-200 CPU_IDENTITY write: offset 0x%" PRIx64 " data 0x%" PRIx64 " size %u"
+
+# hw/misc/armsse-mhu.c
+armsse_mhu_read(uint64_t offset, uint64_t data, unsigned size) "SSE-200 MHU read: offset 0x%" PRIx64 " data 0x%" PRIx64 " size %u"
+armsse_mhu_write(uint64_t offset, uint64_t data, unsigned size) "SSE-200 MHU write: offset 0x%" PRIx64 " data 0x%" PRIx64 " size %u"
diff --git a/include/hw/arm/armsse.h b/include/hw/arm/armsse.h
index 7ef871c7df..81e082cccf 100644
--- a/include/hw/arm/armsse.h
+++ b/include/hw/arm/armsse.h
@@ -95,6 +95,7 @@
 #include "hw/misc/iotkit-sysctl.h"
 #include "hw/misc/iotkit-sysinfo.h"
 #include "hw/misc/armsse-cpuid.h"
+#include "hw/misc/armsse-mhu.h"
 #include "hw/misc/unimp.h"
 #include "hw/or-irq.h"
 #include "hw/core/split-irq.h"
@@ -166,7 +167,7 @@ typedef struct ARMSSE {
     IoTKitSysCtl sysctl;
     IoTKitSysCtl sysinfo;
 
-    UnimplementedDeviceState mhu[2];
+    ARMSSEMHU mhu[2];
     UnimplementedDeviceState ppu[NUM_PPUS];
     UnimplementedDeviceState cachectrl[SSE_MAX_CPUS];
     UnimplementedDeviceState cpusecctrl[SSE_MAX_CPUS];
diff --git a/include/hw/misc/armsse-mhu.h b/include/hw/misc/armsse-mhu.h
new file mode 100644
index 0000000000..e57eafc252
--- /dev/null
+++ b/include/hw/misc/armsse-mhu.h
@@ -0,0 +1,44 @@
+/*
+ * ARM SSE-200 Message Handling Unit (MHU)
+ *
+ * Copyright (c) 2019 Linaro Limited
+ * Written by Peter Maydell
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 or
+ *  (at your option) any later version.
+ */
+
+/*
+ * This is a model of the Message Handling Unit (MHU) which is part of the
+ * Arm SSE-200 and documented in
+ * http://infocenter.arm.com/help/topic/com.arm.doc.101104_0100_00_en/corelink_sse200_subsystem_for_embedded_technical_reference_manual_101104_0100_00_en.pdf
+ *
+ * QEMU interface:
+ *  + sysbus MMIO region 0: the system information register bank
+ *  + sysbus IRQ 0: interrupt for CPU 0
+ *  + sysbus IRQ 1: interrupt for CPU 1
+ */
+
+#ifndef HW_MISC_SSE_MHU_H
+#define HW_MISC_SSE_MHU_H
+
+#include "hw/sysbus.h"
+
+#define TYPE_ARMSSE_MHU "armsse-mhu"
+#define ARMSSE_MHU(obj) OBJECT_CHECK(ARMSSEMHU, (obj), TYPE_ARMSSE_MHU)
+
+typedef struct ARMSSEMHU {
+    /*< private >*/
+    SysBusDevice parent_obj;
+
+    /*< public >*/
+    MemoryRegion iomem;
+    qemu_irq cpu0irq;
+    qemu_irq cpu1irq;
+
+    uint32_t cpu0intr;
+    uint32_t cpu1intr;
+} ARMSSEMHU;
+
+#endif
diff --git a/include/hw/misc/iotkit-sysctl.h b/include/hw/misc/iotkit-sysctl.h
index e36613cb5e..601c8ecc0d 100644
--- a/include/hw/misc/iotkit-sysctl.h
+++ b/include/hw/misc/iotkit-sysctl.h
@@ -17,6 +17,9 @@
  * "system control register" blocks.
  *
  * QEMU interface:
+ *  + QOM property "SYS_VERSION": value of the SYS_VERSION register of the
+ *    system information block of the SSE
+ *    (used to identify whether to provide SSE-200-only registers)
  *  + sysbus MMIO region 0: the system information register bank
  *  + sysbus MMIO region 1: the system control register bank
  */
@@ -41,9 +44,29 @@ typedef struct IoTKitSysCtl {
     uint32_t reset_syndrome;
     uint32_t reset_mask;
     uint32_t gretreg;
-    uint32_t initsvrtor0;
+    uint32_t initsvtor0;
     uint32_t cpuwait;
     uint32_t wicctrl;
+    uint32_t scsecctrl;
+    uint32_t fclk_div;
+    uint32_t sysclk_div;
+    uint32_t clock_force;
+    uint32_t initsvtor1;
+    uint32_t nmi_enable;
+    uint32_t ewctrl;
+    uint32_t pdcm_pd_sys_sense;
+    uint32_t pdcm_pd_sram0_sense;
+    uint32_t pdcm_pd_sram1_sense;
+    uint32_t pdcm_pd_sram2_sense;
+    uint32_t pdcm_pd_sram3_sense;
+
+    /* Properties */
+    uint32_t sys_version;
+    uint32_t cpuwait_rst;
+    uint32_t initsvtor0_rst;
+    uint32_t initsvtor1_rst;
+
+    bool is_sse200;
 } IoTKitSysCtl;
 
 #endif
diff --git a/linux-user/elfload.c b/linux-user/elfload.c
index 3a50d587ff..b9f7cbbdc1 100644
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@@ -602,6 +602,8 @@ static uint32_t get_elf_hwcap(void)
     GET_FEATURE_ID(aa64_fcma, ARM_HWCAP_A64_FCMA);
     GET_FEATURE_ID(aa64_sve, ARM_HWCAP_A64_SVE);
     GET_FEATURE_ID(aa64_pauth, ARM_HWCAP_A64_PACA | ARM_HWCAP_A64_PACG);
+    GET_FEATURE_ID(aa64_fhm, ARM_HWCAP_A64_ASIMDFHM);
+    GET_FEATURE_ID(aa64_jscvt, ARM_HWCAP_A64_JSCVT);
 
 #undef GET_FEATURE_ID
 
diff --git a/target/arm/arm-powerctl.c b/target/arm/arm-powerctl.c
index f9de5164e5..f77a950db6 100644
--- a/target/arm/arm-powerctl.c
+++ b/target/arm/arm-powerctl.c
@@ -228,6 +228,62 @@ int arm_set_cpu_on(uint64_t cpuid, uint64_t entry, uint64_t context_id,
     return QEMU_ARM_POWERCTL_RET_SUCCESS;
 }
 
+static void arm_set_cpu_on_and_reset_async_work(CPUState *target_cpu_state,
+                                                run_on_cpu_data data)
+{
+    ARMCPU *target_cpu = ARM_CPU(target_cpu_state);
+
+    /* Initialize the cpu we are turning on */
+    cpu_reset(target_cpu_state);
+    target_cpu_state->halted = 0;
+
+    /* Finally set the power status */
+    assert(qemu_mutex_iothread_locked());
+    target_cpu->power_state = PSCI_ON;
+}
+
+int arm_set_cpu_on_and_reset(uint64_t cpuid)
+{
+    CPUState *target_cpu_state;
+    ARMCPU *target_cpu;
+
+    assert(qemu_mutex_iothread_locked());
+
+    /* Retrieve the cpu we are powering up */
+    target_cpu_state = arm_get_cpu_by_id(cpuid);
+    if (!target_cpu_state) {
+        /* The cpu was not found */
+        return QEMU_ARM_POWERCTL_INVALID_PARAM;
+    }
+
+    target_cpu = ARM_CPU(target_cpu_state);
+    if (target_cpu->power_state == PSCI_ON) {
+        qemu_log_mask(LOG_GUEST_ERROR,
+                      "[ARM]%s: CPU %" PRId64 " is already on\n",
+                      __func__, cpuid);
+        return QEMU_ARM_POWERCTL_ALREADY_ON;
+    }
+
+    /*
+     * If another CPU has powered the target on we are in the state
+     * ON_PENDING and additional attempts to power on the CPU should
+     * fail (see 6.6 Implementation CPU_ON/CPU_OFF races in the PSCI
+     * spec)
+     */
+    if (target_cpu->power_state == PSCI_ON_PENDING) {
+        qemu_log_mask(LOG_GUEST_ERROR,
+                      "[ARM]%s: CPU %" PRId64 " is already powering on\n",
+                      __func__, cpuid);
+        return QEMU_ARM_POWERCTL_ON_PENDING;
+    }
+
+    async_run_on_cpu(target_cpu_state, arm_set_cpu_on_and_reset_async_work,
+                     RUN_ON_CPU_NULL);
+
+    /* We are good to go */
+    return QEMU_ARM_POWERCTL_RET_SUCCESS;
+}
+
 static void arm_set_cpu_off_async_work(CPUState *target_cpu_state,
                                        run_on_cpu_data data)
 {
diff --git a/target/arm/arm-powerctl.h b/target/arm/arm-powerctl.h
index 04353923c0..37c8a04f0a 100644
--- a/target/arm/arm-powerctl.h
+++ b/target/arm/arm-powerctl.h
@@ -74,4 +74,20 @@ int arm_set_cpu_off(uint64_t cpuid);
  */
 int arm_reset_cpu(uint64_t cpuid);
 
+/*
+ * arm_set_cpu_on_and_reset:
+ * @cpuid: the id of the CPU we want to star
+ *
+ * Start the cpu designated by @cpuid and put it through its normal
+ * CPU reset process. The CPU will start in the way it is architected
+ * to start after a power-on reset.
+ *
+ * Returns: QEMU_ARM_POWERCTL_RET_SUCCESS on success.
+ * QEMU_ARM_POWERCTL_INVALID_PARAM if there is no CPU with that ID.
+ * QEMU_ARM_POWERCTL_ALREADY_ON if the CPU is already on.
+ * QEMU_ARM_POWERCTL_ON_PENDING if the CPU is already partway through
+ * powering on.
+ */
+int arm_set_cpu_on_and_reset(uint64_t cpuid);
+
 #endif
diff --git a/target/arm/cpu.c b/target/arm/cpu.c
index 8ea6569088..54b61f917b 100644
--- a/target/arm/cpu.c
+++ b/target/arm/cpu.c
@@ -22,6 +22,7 @@
 #include "target/arm/idau.h"
 #include "qemu/error-report.h"
 #include "qapi/error.h"
+#include "qapi/visitor.h"
 #include "cpu.h"
 #include "internals.h"
 #include "qemu-common.h"
@@ -771,9 +772,21 @@ static Property arm_cpu_pmsav7_dregion_property =
                                            pmsav7_dregion,
                                            qdev_prop_uint32, uint32_t);
 
-/* M profile: initial value of the Secure VTOR */
-static Property arm_cpu_initsvtor_property =
-            DEFINE_PROP_UINT32("init-svtor", ARMCPU, init_svtor, 0);
+static void arm_get_init_svtor(Object *obj, Visitor *v, const char *name,
+                               void *opaque, Error **errp)
+{
+    ARMCPU *cpu = ARM_CPU(obj);
+
+    visit_type_uint32(v, name, &cpu->init_svtor, errp);
+}
+
+static void arm_set_init_svtor(Object *obj, Visitor *v, const char *name,
+                               void *opaque, Error **errp)
+{
+    ARMCPU *cpu = ARM_CPU(obj);
+
+    visit_type_uint32(v, name, &cpu->init_svtor, errp);
+}
 
 void arm_cpu_post_init(Object *obj)
 {
@@ -845,8 +858,14 @@ void arm_cpu_post_init(Object *obj)
                                  qdev_prop_allow_set_link_before_realize,
                                  OBJ_PROP_LINK_STRONG,
                                  &error_abort);
-        qdev_property_add_static(DEVICE(obj), &arm_cpu_initsvtor_property,
-                                 &error_abort);
+        /*
+         * M profile: initial value of the Secure VTOR. We can't just use
+         * a simple DEFINE_PROP_UINT32 for this because we want to permit
+         * the property to be set after realize.
+         */
+        object_property_add(obj, "init-svtor", "uint32",
+                            arm_get_init_svtor, arm_set_init_svtor,
+                            NULL, NULL, &error_abort);
     }
 
     qdev_property_add_static(DEVICE(obj), &arm_cpu_cfgend_property,
@@ -995,7 +1014,6 @@ static void arm_cpu_realizefn(DeviceState *dev, Error **errp)
     }
     if (arm_feature(env, ARM_FEATURE_VFP4)) {
         set_feature(env, ARM_FEATURE_VFP3);
-        set_feature(env, ARM_FEATURE_VFP_FP16);
     }
     if (arm_feature(env, ARM_FEATURE_VFP3)) {
         set_feature(env, ARM_FEATURE_VFP);
@@ -1656,7 +1674,6 @@ static void cortex_a9_initfn(Object *obj)
     cpu->dtb_compatible = "arm,cortex-a9";
     set_feature(&cpu->env, ARM_FEATURE_V7);
     set_feature(&cpu->env, ARM_FEATURE_VFP3);
-    set_feature(&cpu->env, ARM_FEATURE_VFP_FP16);
     set_feature(&cpu->env, ARM_FEATURE_NEON);
     set_feature(&cpu->env, ARM_FEATURE_THUMB2EE);
     set_feature(&cpu->env, ARM_FEATURE_EL3);
@@ -2003,6 +2020,7 @@ static void arm_max_initfn(Object *obj)
             t = cpu->isar.id_isar6;
             t = FIELD_DP32(t, ID_ISAR6, JSCVT, 1);
             t = FIELD_DP32(t, ID_ISAR6, DP, 1);
+            t = FIELD_DP32(t, ID_ISAR6, FHM, 1);
             cpu->isar.id_isar6 = t;
 
             t = cpu->id_mmfr4;
diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index 1eea1a408b..36cd365efa 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -1730,6 +1730,27 @@ FIELD(ID_DFR0, MPROFDBG, 20, 4)
 FIELD(ID_DFR0, PERFMON, 24, 4)
 FIELD(ID_DFR0, TRACEFILT, 28, 4)
 
+FIELD(MVFR0, SIMDREG, 0, 4)
+FIELD(MVFR0, FPSP, 4, 4)
+FIELD(MVFR0, FPDP, 8, 4)
+FIELD(MVFR0, FPTRAP, 12, 4)
+FIELD(MVFR0, FPDIVIDE, 16, 4)
+FIELD(MVFR0, FPSQRT, 20, 4)
+FIELD(MVFR0, FPSHVEC, 24, 4)
+FIELD(MVFR0, FPROUND, 28, 4)
+
+FIELD(MVFR1, FPFTZ, 0, 4)
+FIELD(MVFR1, FPDNAN, 4, 4)
+FIELD(MVFR1, SIMDLS, 8, 4)
+FIELD(MVFR1, SIMDINT, 12, 4)
+FIELD(MVFR1, SIMDSP, 16, 4)
+FIELD(MVFR1, SIMDHP, 20, 4)
+FIELD(MVFR1, FPHP, 24, 4)
+FIELD(MVFR1, SIMDFMAC, 28, 4)
+
+FIELD(MVFR2, SIMDMISC, 0, 4)
+FIELD(MVFR2, FPMISC, 4, 4)
+
 QEMU_BUILD_BUG_ON(ARRAY_SIZE(((ARMCPU *)0)->ccsidr) <= R_V7M_CSSELR_INDEX_MASK);
 
 /* If adding a feature bit which corresponds to a Linux ELF
@@ -1747,7 +1768,6 @@ enum arm_features {
     ARM_FEATURE_THUMB2,
     ARM_FEATURE_PMSA,   /* no MMU; may have Memory Protection Unit */
     ARM_FEATURE_VFP3,
-    ARM_FEATURE_VFP_FP16,
     ARM_FEATURE_NEON,
     ARM_FEATURE_M, /* Microcontroller profile.  */
     ARM_FEATURE_OMAPCP, /* OMAP specific CP15 ops handling.  */
@@ -2538,25 +2558,18 @@ bool write_list_to_cpustate(ARMCPU *cpu);
 /**
  * write_cpustate_to_list:
  * @cpu: ARMCPU
- * @kvm_sync: true if this is for syncing back to KVM
  *
  * For each register listed in the ARMCPU cpreg_indexes list, write
  * its value from the ARMCPUState structure into the cpreg_values list.
  * This is used to copy info from TCG's working data structures into
  * KVM or for outbound migration.
  *
- * @kvm_sync is true if we are doing this in order to sync the
- * register state back to KVM. In this case we will only update
- * values in the list if the previous list->cpustate sync actually
- * successfully wrote the CPU state. Otherwise we will keep the value
- * that is in the list.
- *
  * Returns: true if all register values were read correctly,
  * false if some register was unknown or could not be read.
  * Note that we do not stop early on failure -- we will attempt
  * reading all registers in the list.
  */
-bool write_cpustate_to_list(ARMCPU *cpu, bool kvm_sync);
+bool write_cpustate_to_list(ARMCPU *cpu);
 
 #define ARM_CPUID_TI915T      0x54029152
 #define ARM_CPUID_TI925T      0x54029252
@@ -3283,6 +3296,11 @@ static inline bool isar_feature_aa32_dp(const ARMISARegisters *id)
     return FIELD_EX32(id->id_isar6, ID_ISAR6, DP) != 0;
 }
 
+static inline bool isar_feature_aa32_fhm(const ARMISARegisters *id)
+{
+    return FIELD_EX32(id->id_isar6, ID_ISAR6, FHM) != 0;
+}
+
 static inline bool isar_feature_aa32_fp16_arith(const ARMISARegisters *id)
 {
     /*
@@ -3294,6 +3312,41 @@ static inline bool isar_feature_aa32_fp16_arith(const ARMISARegisters *id)
 }
 
 /*
+ * We always set the FP and SIMD FP16 fields to indicate identical
+ * levels of support (assuming SIMD is implemented at all), so
+ * we only need one set of accessors.
+ */
+static inline bool isar_feature_aa32_fp16_spconv(const ARMISARegisters *id)
+{
+    return FIELD_EX64(id->mvfr1, MVFR1, FPHP) > 0;
+}
+
+static inline bool isar_feature_aa32_fp16_dpconv(const ARMISARegisters *id)
+{
+    return FIELD_EX64(id->mvfr1, MVFR1, FPHP) > 1;
+}
+
+static inline bool isar_feature_aa32_vsel(const ARMISARegisters *id)
+{
+    return FIELD_EX64(id->mvfr2, MVFR2, FPMISC) >= 1;
+}
+
+static inline bool isar_feature_aa32_vcvt_dr(const ARMISARegisters *id)
+{
+    return FIELD_EX64(id->mvfr2, MVFR2, FPMISC) >= 2;
+}
+
+static inline bool isar_feature_aa32_vrint(const ARMISARegisters *id)
+{
+    return FIELD_EX64(id->mvfr2, MVFR2, FPMISC) >= 3;
+}
+
+static inline bool isar_feature_aa32_vminmaxnm(const ARMISARegisters *id)
+{
+    return FIELD_EX64(id->mvfr2, MVFR2, FPMISC) >= 4;
+}
+
+/*
  * 64-bit feature tests via id registers.
  */
 static inline bool isar_feature_aa64_aes(const ARMISARegisters *id)
@@ -3356,6 +3409,11 @@ static inline bool isar_feature_aa64_dp(const ARMISARegisters *id)
     return FIELD_EX64(id->id_aa64isar0, ID_AA64ISAR0, DP) != 0;
 }
 
+static inline bool isar_feature_aa64_fhm(const ARMISARegisters *id)
+{
+    return FIELD_EX64(id->id_aa64isar0, ID_AA64ISAR0, FHM) != 0;
+}
+
 static inline bool isar_feature_aa64_jscvt(const ARMISARegisters *id)
 {
     return FIELD_EX64(id->id_aa64isar1, ID_AA64ISAR1, JSCVT) != 0;
diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c
index 69e4134f79..1b0c427277 100644
--- a/target/arm/cpu64.c
+++ b/target/arm/cpu64.c
@@ -308,6 +308,7 @@ static void aarch64_max_initfn(Object *obj)
         t = FIELD_DP64(t, ID_AA64ISAR0, SM3, 1);
         t = FIELD_DP64(t, ID_AA64ISAR0, SM4, 1);
         t = FIELD_DP64(t, ID_AA64ISAR0, DP, 1);
+        t = FIELD_DP64(t, ID_AA64ISAR0, FHM, 1);
         cpu->isar.id_aa64isar0 = t;
 
         t = cpu->isar.id_aa64isar1;
@@ -347,6 +348,7 @@ static void aarch64_max_initfn(Object *obj)
         u = cpu->isar.id_isar6;
         u = FIELD_DP32(u, ID_ISAR6, JSCVT, 1);
         u = FIELD_DP32(u, ID_ISAR6, DP, 1);
+        u = FIELD_DP32(u, ID_ISAR6, FHM, 1);
         cpu->isar.id_isar6 = u;
 
         /*
diff --git a/target/arm/helper.c b/target/arm/helper.c
index fbaa801cea..1fa282a7fc 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -265,7 +265,7 @@ static bool raw_accessors_invalid(const ARMCPRegInfo *ri)
     return true;
 }
 
-bool write_cpustate_to_list(ARMCPU *cpu, bool kvm_sync)
+bool write_cpustate_to_list(ARMCPU *cpu)
 {
     /* Write the coprocessor state from cpu->env to the (index,value) list. */
     int i;
@@ -274,7 +274,6 @@ bool write_cpustate_to_list(ARMCPU *cpu, bool kvm_sync)
     for (i = 0; i < cpu->cpreg_array_len; i++) {
         uint32_t regidx = kvm_to_cpreg_id(cpu->cpreg_indexes[i]);
         const ARMCPRegInfo *ri;
-        uint64_t newval;
 
         ri = get_arm_cp_reginfo(cpu->cp_regs, regidx);
         if (!ri) {
@@ -284,29 +283,7 @@ bool write_cpustate_to_list(ARMCPU *cpu, bool kvm_sync)
         if (ri->type & ARM_CP_NO_RAW) {
             continue;
         }
-
-        newval = read_raw_cp_reg(&cpu->env, ri);
-        if (kvm_sync) {
-            /*
-             * Only sync if the previous list->cpustate sync succeeded.
-             * Rather than tracking the success/failure state for every
-             * item in the list, we just recheck "does the raw write we must
-             * have made in write_list_to_cpustate() read back OK" here.
-             */
-            uint64_t oldval = cpu->cpreg_values[i];
-
-            if (oldval == newval) {
-                continue;
-            }
-
-            write_raw_cp_reg(&cpu->env, ri, oldval);
-            if (read_raw_cp_reg(&cpu->env, ri) != oldval) {
-                continue;
-            }
-
-            write_raw_cp_reg(&cpu->env, ri, newval);
-        }
-        cpu->cpreg_values[i] = newval;
+        cpu->cpreg_values[i] = read_raw_cp_reg(&cpu->env, ri);
     }
     return ok;
 }
diff --git a/target/arm/helper.h b/target/arm/helper.h
index 747cb64d29..d363904278 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -677,6 +677,15 @@ DEF_HELPER_FLAGS_5(gvec_sqsub_s, TCG_CALL_NO_RWG,
 DEF_HELPER_FLAGS_5(gvec_sqsub_d, TCG_CALL_NO_RWG,
                    void, ptr, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_5(gvec_fmlal_a32, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fmlal_a64, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fmlal_idx_a32, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fmlal_idx_a64, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, i32)
+
 #ifdef TARGET_AARCH64
 #include "helper-a64.h"
 #include "helper-sve.h"
diff --git a/target/arm/kvm32.c b/target/arm/kvm32.c
index a75e04cc8f..50327989dc 100644
--- a/target/arm/kvm32.c
+++ b/target/arm/kvm32.c
@@ -125,9 +125,6 @@ bool kvm_arm_get_host_cpu_features(ARMHostCPUFeatures *ahcf)
     if (extract32(id_pfr0, 12, 4) == 1) {
         set_feature(&features, ARM_FEATURE_THUMB2EE);
     }
-    if (extract32(ahcf->isar.mvfr1, 20, 4) == 1) {
-        set_feature(&features, ARM_FEATURE_VFP_FP16);
-    }
     if (extract32(ahcf->isar.mvfr1, 12, 4) == 1) {
         set_feature(&features, ARM_FEATURE_NEON);
     }
@@ -387,8 +384,24 @@ int kvm_arch_put_registers(CPUState *cs, int level)
         return ret;
     }
 
-    write_cpustate_to_list(cpu, true);
-
+    /* Note that we do not call write_cpustate_to_list()
+     * here, so we are only writing the tuple list back to
+     * KVM. This is safe because nothing can change the
+     * CPUARMState cp15 fields (in particular gdb accesses cannot)
+     * and so there are no changes to sync. In fact syncing would
+     * be wrong at this point: for a constant register where TCG and
+     * KVM disagree about its value, the preceding write_list_to_cpustate()
+     * would not have had any effect on the CPUARMState value (since the
+     * register is read-only), and a write_cpustate_to_list() here would
+     * then try to write the TCG value back into KVM -- this would either
+     * fail or incorrectly change the value the guest sees.
+     *
+     * If we ever want to allow the user to modify cp15 registers via
+     * the gdb stub, we would need to be more clever here (for instance
+     * tracking the set of registers kvm_arch_get_registers() successfully
+     * managed to update the CPUARMState with, and only allowing those
+     * to be written back up into the kernel).
+     */
     if (!write_list_to_kvmstate(cpu, level)) {
         return EINVAL;
     }
diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c
index e3ba149248..089af9c5f0 100644
--- a/target/arm/kvm64.c
+++ b/target/arm/kvm64.c
@@ -838,8 +838,6 @@ int kvm_arch_put_registers(CPUState *cs, int level)
         return ret;
     }
 
-    write_cpustate_to_list(cpu, true);
-
     if (!write_list_to_kvmstate(cpu, level)) {
         return EINVAL;
     }
diff --git a/target/arm/machine.c b/target/arm/machine.c
index 124192bfc2..b292549614 100644
--- a/target/arm/machine.c
+++ b/target/arm/machine.c
@@ -630,7 +630,7 @@ static int cpu_pre_save(void *opaque)
             abort();
         }
     } else {
-        if (!write_cpustate_to_list(cpu, false)) {
+        if (!write_cpustate_to_list(cpu)) {
             /* This should never fail. */
             abort();
         }
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index c56e878787..d3c8eaf089 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -10917,9 +10917,29 @@ static void disas_simd_3same_float(DisasContext *s, uint32_t insn)
         if (!fp_access_check(s)) {
             return;
         }
-
         handle_3same_float(s, size, elements, fpopcode, rd, rn, rm);
         return;
+
+    case 0x1d: /* FMLAL  */
+    case 0x3d: /* FMLSL  */
+    case 0x59: /* FMLAL2 */
+    case 0x79: /* FMLSL2 */
+        if (size & 1 || !dc_isar_feature(aa64_fhm, s)) {
+            unallocated_encoding(s);
+            return;
+        }
+        if (fp_access_check(s)) {
+            int is_s = extract32(insn, 23, 1);
+            int is_2 = extract32(insn, 29, 1);
+            int data = (is_2 << 1) | is_s;
+            tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, rd),
+                               vec_full_reg_offset(s, rn),
+                               vec_full_reg_offset(s, rm), cpu_env,
+                               is_q ? 16 : 8, vec_full_reg_size(s),
+                               data, gen_helper_gvec_fmlal_a64);
+        }
+        return;
+
     default:
         unallocated_encoding(s);
         return;
@@ -12739,6 +12759,17 @@ static void disas_simd_indexed(DisasContext *s, uint32_t insn)
         }
         is_fp = 2;
         break;
+    case 0x00: /* FMLAL */
+    case 0x04: /* FMLSL */
+    case 0x18: /* FMLAL2 */
+    case 0x1c: /* FMLSL2 */
+        if (is_scalar || size != MO_32 || !dc_isar_feature(aa64_fhm, s)) {
+            unallocated_encoding(s);
+            return;
+        }
+        size = MO_16;
+        /* is_fp, but we pass cpu_env not fp_status.  */
+        break;
     default:
         unallocated_encoding(s);
         return;
@@ -12849,6 +12880,22 @@ static void disas_simd_indexed(DisasContext *s, uint32_t insn)
             tcg_temp_free_ptr(fpst);
         }
         return;
+
+    case 0x00: /* FMLAL */
+    case 0x04: /* FMLSL */
+    case 0x18: /* FMLAL2 */
+    case 0x1c: /* FMLSL2 */
+        {
+            int is_s = extract32(opcode, 2, 1);
+            int is_2 = u;
+            int data = (index << 2) | (is_2 << 1) | is_s;
+            tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, rd),
+                               vec_full_reg_offset(s, rn),
+                               vec_full_reg_offset(s, rm), cpu_env,
+                               is_q ? 16 : 8, vec_full_reg_size(s),
+                               data, gen_helper_gvec_fmlal_idx_a64);
+        }
+        return;
     }
 
     if (size == 3) {
diff --git a/target/arm/translate.c b/target/arm/translate.c
index c1175798ac..8f7f5b95aa 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -3357,14 +3357,10 @@ static const uint8_t fp_decode_rm[] = {
     FPROUNDING_NEGINF,
 };
 
-static int disas_vfp_v8_insn(DisasContext *s, uint32_t insn)
+static int disas_vfp_misc_insn(DisasContext *s, uint32_t insn)
 {
     uint32_t rd, rn, rm, dp = extract32(insn, 8, 1);
 
-    if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
-        return 1;
-    }
-
     if (dp) {
         VFP_DREG_D(rd, insn);
         VFP_DREG_N(rn, insn);
@@ -3375,15 +3371,18 @@ static int disas_vfp_v8_insn(DisasContext *s, uint32_t insn)
         rm = VFP_SREG_M(insn);
     }
 
-    if ((insn & 0x0f800e50) == 0x0e000a00) {
+    if ((insn & 0x0f800e50) == 0x0e000a00 && dc_isar_feature(aa32_vsel, s)) {
         return handle_vsel(insn, rd, rn, rm, dp);
-    } else if ((insn & 0x0fb00e10) == 0x0e800a00) {
+    } else if ((insn & 0x0fb00e10) == 0x0e800a00 &&
+               dc_isar_feature(aa32_vminmaxnm, s)) {
         return handle_vminmaxnm(insn, rd, rn, rm, dp);
-    } else if ((insn & 0x0fbc0ed0) == 0x0eb80a40) {
+    } else if ((insn & 0x0fbc0ed0) == 0x0eb80a40 &&
+               dc_isar_feature(aa32_vrint, s)) {
         /* VRINTA, VRINTN, VRINTP, VRINTM */
         int rounding = fp_decode_rm[extract32(insn, 16, 2)];
         return handle_vrint(insn, rd, rm, dp, rounding);
-    } else if ((insn & 0x0fbc0e50) == 0x0ebc0a40) {
+    } else if ((insn & 0x0fbc0e50) == 0x0ebc0a40 &&
+               dc_isar_feature(aa32_vcvt_dr, s)) {
         /* VCVTA, VCVTN, VCVTP, VCVTM */
         int rounding = fp_decode_rm[extract32(insn, 16, 2)];
         return handle_vcvt(insn, rd, rm, dp, rounding);
@@ -3427,10 +3426,12 @@ static int disas_vfp_insn(DisasContext *s, uint32_t insn)
     }
 
     if (extract32(insn, 28, 4) == 0xf) {
-        /* Encodings with T=1 (Thumb) or unconditional (ARM):
-         * only used in v8 and above.
+        /*
+         * Encodings with T=1 (Thumb) or unconditional (ARM):
+         * only used for the "miscellaneous VFP features" added in v8A
+         * and v7M (and gated on the MVFR2.FPMisc field).
          */
-        return disas_vfp_v8_insn(s, insn);
+        return disas_vfp_misc_insn(s, insn);
     }
 
     dp = ((insn & 0xf00) == 0xb00);
@@ -3663,17 +3664,27 @@ static int disas_vfp_insn(DisasContext *s, uint32_t insn)
                      * UNPREDICTABLE if bit 8 is set prior to ARMv8
                      * (we choose to UNDEF)
                      */
-                    if ((dp && !arm_dc_feature(s, ARM_FEATURE_V8)) ||
-                        !arm_dc_feature(s, ARM_FEATURE_VFP_FP16)) {
-                        return 1;
+                    if (dp) {
+                        if (!dc_isar_feature(aa32_fp16_dpconv, s)) {
+                            return 1;
+                        }
+                    } else {
+                        if (!dc_isar_feature(aa32_fp16_spconv, s)) {
+                            return 1;
+                        }
                     }
                     rm_is_dp = false;
                     break;
                 case 0x06: /* vcvtb.f16.f32, vcvtb.f16.f64 */
                 case 0x07: /* vcvtt.f16.f32, vcvtt.f16.f64 */
-                    if ((dp && !arm_dc_feature(s, ARM_FEATURE_V8)) ||
-                        !arm_dc_feature(s, ARM_FEATURE_VFP_FP16)) {
-                        return 1;
+                    if (dp) {
+                        if (!dc_isar_feature(aa32_fp16_dpconv, s)) {
+                            return 1;
+                        }
+                    } else {
+                        if (!dc_isar_feature(aa32_fp16_spconv, s)) {
+                            return 1;
+                        }
                     }
                     rd_is_dp = false;
                     break;
@@ -7876,7 +7887,7 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
                     TCGv_ptr fpst;
                     TCGv_i32 ahp;
 
-                    if (!arm_dc_feature(s, ARM_FEATURE_VFP_FP16) ||
+                    if (!dc_isar_feature(aa32_fp16_spconv, s) ||
                         q || (rm & 1)) {
                         return 1;
                     }
@@ -7908,7 +7919,7 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
                 {
                     TCGv_ptr fpst;
                     TCGv_i32 ahp;
-                    if (!arm_dc_feature(s, ARM_FEATURE_VFP_FP16) ||
+                    if (!dc_isar_feature(aa32_fp16_spconv, s) ||
                         q || (rd & 1)) {
                         return 1;
                     }
@@ -8372,15 +8383,9 @@ static int disas_neon_insn_3same_ext(DisasContext *s, uint32_t insn)
     gen_helper_gvec_3_ptr *fn_gvec_ptr = NULL;
     int rd, rn, rm, opr_sz;
     int data = 0;
-    bool q;
-
-    q = extract32(insn, 6, 1);
-    VFP_DREG_D(rd, insn);
-    VFP_DREG_N(rn, insn);
-    VFP_DREG_M(rm, insn);
-    if ((rd | rn | rm) & q) {
-        return 1;
-    }
+    int off_rn, off_rm;
+    bool is_long = false, q = extract32(insn, 6, 1);
+    bool ptr_is_env = false;
 
     if ((insn & 0xfe200f10) == 0xfc200800) {
         /* VCMLA -- 1111 110R R.1S .... .... 1000 ...0 .... */
@@ -8407,10 +8412,39 @@ static int disas_neon_insn_3same_ext(DisasContext *s, uint32_t insn)
             return 1;
         }
         fn_gvec = u ? gen_helper_gvec_udot_b : gen_helper_gvec_sdot_b;
+    } else if ((insn & 0xff300f10) == 0xfc200810) {
+        /* VFM[AS]L -- 1111 1100 S.10 .... .... 1000 .Q.1 .... */
+        int is_s = extract32(insn, 23, 1);
+        if (!dc_isar_feature(aa32_fhm, s)) {
+            return 1;
+        }
+        is_long = true;
+        data = is_s; /* is_2 == 0 */
+        fn_gvec_ptr = gen_helper_gvec_fmlal_a32;
+        ptr_is_env = true;
     } else {
         return 1;
     }
 
+    VFP_DREG_D(rd, insn);
+    if (rd & q) {
+        return 1;
+    }
+    if (q || !is_long) {
+        VFP_DREG_N(rn, insn);
+        VFP_DREG_M(rm, insn);
+        if ((rn | rm) & q & !is_long) {
+            return 1;
+        }
+        off_rn = vfp_reg_offset(1, rn);
+        off_rm = vfp_reg_offset(1, rm);
+    } else {
+        rn = VFP_SREG_N(insn);
+        rm = VFP_SREG_M(insn);
+        off_rn = vfp_reg_offset(0, rn);
+        off_rm = vfp_reg_offset(0, rm);
+    }
+
     if (s->fp_excp_el) {
         gen_exception_insn(s, 4, EXCP_UDEF,
                            syn_simd_access_trap(1, 0xe, false), s->fp_excp_el);
@@ -8422,16 +8456,19 @@ static int disas_neon_insn_3same_ext(DisasContext *s, uint32_t insn)
 
     opr_sz = (1 + q) * 8;
     if (fn_gvec_ptr) {
-        TCGv_ptr fpst = get_fpstatus_ptr(1);
-        tcg_gen_gvec_3_ptr(vfp_reg_offset(1, rd),
-                           vfp_reg_offset(1, rn),
-                           vfp_reg_offset(1, rm), fpst,
+        TCGv_ptr ptr;
+        if (ptr_is_env) {
+            ptr = cpu_env;
+        } else {
+            ptr = get_fpstatus_ptr(1);
+        }
+        tcg_gen_gvec_3_ptr(vfp_reg_offset(1, rd), off_rn, off_rm, ptr,
                            opr_sz, opr_sz, data, fn_gvec_ptr);
-        tcg_temp_free_ptr(fpst);
+        if (!ptr_is_env) {
+            tcg_temp_free_ptr(ptr);
+        }
     } else {
-        tcg_gen_gvec_3_ool(vfp_reg_offset(1, rd),
-                           vfp_reg_offset(1, rn),
-                           vfp_reg_offset(1, rm),
+        tcg_gen_gvec_3_ool(vfp_reg_offset(1, rd), off_rn, off_rm,
                            opr_sz, opr_sz, data, fn_gvec);
     }
     return 0;
@@ -8450,14 +8487,9 @@ static int disas_neon_insn_2reg_scalar_ext(DisasContext *s, uint32_t insn)
     gen_helper_gvec_3 *fn_gvec = NULL;
     gen_helper_gvec_3_ptr *fn_gvec_ptr = NULL;
     int rd, rn, rm, opr_sz, data;
-    bool q;
-
-    q = extract32(insn, 6, 1);
-    VFP_DREG_D(rd, insn);
-    VFP_DREG_N(rn, insn);
-    if ((rd | rn) & q) {
-        return 1;
-    }
+    int off_rn, off_rm;
+    bool is_long = false, q = extract32(insn, 6, 1);
+    bool ptr_is_env = false;
 
     if ((insn & 0xff000f10) == 0xfe000800) {
         /* VCMLA (indexed) -- 1111 1110 S.RR .... .... 1000 ...0 .... */
@@ -8486,6 +8518,7 @@ static int disas_neon_insn_2reg_scalar_ext(DisasContext *s, uint32_t insn)
     } else if ((insn & 0xffb00f00) == 0xfe200d00) {
         /* V[US]DOT -- 1111 1110 0.10 .... .... 1101 .Q.U .... */
         int u = extract32(insn, 4, 1);
+
         if (!dc_isar_feature(aa32_dp, s)) {
             return 1;
         }
@@ -8493,10 +8526,48 @@ static int disas_neon_insn_2reg_scalar_ext(DisasContext *s, uint32_t insn)
         /* rm is just Vm, and index is M.  */
         data = extract32(insn, 5, 1); /* index */
         rm = extract32(insn, 0, 4);
+    } else if ((insn & 0xffa00f10) == 0xfe000810) {
+        /* VFM[AS]L -- 1111 1110 0.0S .... .... 1000 .Q.1 .... */
+        int is_s = extract32(insn, 20, 1);
+        int vm20 = extract32(insn, 0, 3);
+        int vm3 = extract32(insn, 3, 1);
+        int m = extract32(insn, 5, 1);
+        int index;
+
+        if (!dc_isar_feature(aa32_fhm, s)) {
+            return 1;
+        }
+        if (q) {
+            rm = vm20;
+            index = m * 2 + vm3;
+        } else {
+            rm = vm20 * 2 + m;
+            index = vm3;
+        }
+        is_long = true;
+        data = (index << 2) | is_s; /* is_2 == 0 */
+        fn_gvec_ptr = gen_helper_gvec_fmlal_idx_a32;
+        ptr_is_env = true;
     } else {
         return 1;
     }
 
+    VFP_DREG_D(rd, insn);
+    if (rd & q) {
+        return 1;
+    }
+    if (q || !is_long) {
+        VFP_DREG_N(rn, insn);
+        if (rn & q & !is_long) {
+            return 1;
+        }
+        off_rn = vfp_reg_offset(1, rn);
+        off_rm = vfp_reg_offset(1, rm);
+    } else {
+        rn = VFP_SREG_N(insn);
+        off_rn = vfp_reg_offset(0, rn);
+        off_rm = vfp_reg_offset(0, rm);
+    }
     if (s->fp_excp_el) {
         gen_exception_insn(s, 4, EXCP_UDEF,
                            syn_simd_access_trap(1, 0xe, false), s->fp_excp_el);
@@ -8508,16 +8579,19 @@ static int disas_neon_insn_2reg_scalar_ext(DisasContext *s, uint32_t insn)
 
     opr_sz = (1 + q) * 8;
     if (fn_gvec_ptr) {
-        TCGv_ptr fpst = get_fpstatus_ptr(1);
-        tcg_gen_gvec_3_ptr(vfp_reg_offset(1, rd),
-                           vfp_reg_offset(1, rn),
-                           vfp_reg_offset(1, rm), fpst,
+        TCGv_ptr ptr;
+        if (ptr_is_env) {
+            ptr = cpu_env;
+        } else {
+            ptr = get_fpstatus_ptr(1);
+        }
+        tcg_gen_gvec_3_ptr(vfp_reg_offset(1, rd), off_rn, off_rm, ptr,
                            opr_sz, opr_sz, data, fn_gvec_ptr);
-        tcg_temp_free_ptr(fpst);
+        if (!ptr_is_env) {
+            tcg_temp_free_ptr(ptr);
+        }
     } else {
-        tcg_gen_gvec_3_ool(vfp_reg_offset(1, rd),
-                           vfp_reg_offset(1, rn),
-                           vfp_reg_offset(1, rm),
+        tcg_gen_gvec_3_ool(vfp_reg_offset(1, rd), off_rn, off_rm,
                            opr_sz, opr_sz, data, fn_gvec);
     }
     return 0;
diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
index dfc635cf9a..dedef62403 100644
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -898,3 +898,151 @@ void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
     }
     clear_tail(d, oprsz, simd_maxsz(desc));
 }
+
+/*
+ * Convert float16 to float32, raising no exceptions and
+ * preserving exceptional values, including SNaN.
+ * This is effectively an unpack+repack operation.
+ */
+static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
+{
+    const int f16_bias = 15;
+    const int f32_bias = 127;
+    uint32_t sign = extract32(f16, 15, 1);
+    uint32_t exp = extract32(f16, 10, 5);
+    uint32_t frac = extract32(f16, 0, 10);
+
+    if (exp == 0x1f) {
+        /* Inf or NaN */
+        exp = 0xff;
+    } else if (exp == 0) {
+        /* Zero or denormal.  */
+        if (frac != 0) {
+            if (fz16) {
+                frac = 0;
+            } else {
+                /*
+                 * Denormal; these are all normal float32.
+                 * Shift the fraction so that the msb is at bit 11,
+                 * then remove bit 11 as the implicit bit of the
+                 * normalized float32.  Note that we still go through
+                 * the shift for normal numbers below, to put the
+                 * float32 fraction at the right place.
+                 */
+                int shift = clz32(frac) - 21;
+                frac = (frac << shift) & 0x3ff;
+                exp = f32_bias - f16_bias - shift + 1;
+            }
+        }
+    } else {
+        /* Normal number; adjust the bias.  */
+        exp += f32_bias - f16_bias;
+    }
+    sign <<= 31;
+    exp <<= 23;
+    frac <<= 23 - 10;
+
+    return sign | exp | frac;
+}
+
+static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
+{
+    /*
+     * Branchless load of u32[0], u64[0], u32[1], or u64[1].
+     * Load the 2nd qword iff is_q & is_2.
+     * Shift to the 2nd dword iff !is_q & is_2.
+     * For !is_q & !is_2, the upper bits of the result are garbage.
+     */
+    return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
+}
+
+/*
+ * Note that FMLAL requires oprsz == 8 or oprsz == 16,
+ * as there is not yet SVE versions that might use blocking.
+ */
+
+static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
+                     uint32_t desc, bool fz16)
+{
+    intptr_t i, oprsz = simd_oprsz(desc);
+    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
+    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
+    int is_q = oprsz == 16;
+    uint64_t n_4, m_4;
+
+    /* Pre-load all of the f16 data, avoiding overlap issues.  */
+    n_4 = load4_f16(vn, is_q, is_2);
+    m_4 = load4_f16(vm, is_q, is_2);
+
+    /* Negate all inputs for FMLSL at once.  */
+    if (is_s) {
+        n_4 ^= 0x8000800080008000ull;
+    }
+
+    for (i = 0; i < oprsz / 4; i++) {
+        float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
+        float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
+        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
+    }
+    clear_tail(d, oprsz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
+                            void *venv, uint32_t desc)
+{
+    CPUARMState *env = venv;
+    do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
+             get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
+}
+
+void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
+                            void *venv, uint32_t desc)
+{
+    CPUARMState *env = venv;
+    do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc,
+             get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
+}
+
+static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
+                         uint32_t desc, bool fz16)
+{
+    intptr_t i, oprsz = simd_oprsz(desc);
+    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
+    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
+    int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
+    int is_q = oprsz == 16;
+    uint64_t n_4;
+    float32 m_1;
+
+    /* Pre-load all of the f16 data, avoiding overlap issues.  */
+    n_4 = load4_f16(vn, is_q, is_2);
+
+    /* Negate all inputs for FMLSL at once.  */
+    if (is_s) {
+        n_4 ^= 0x8000800080008000ull;
+    }
+
+    m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
+
+    for (i = 0; i < oprsz / 4; i++) {
+        float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
+        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
+    }
+    clear_tail(d, oprsz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
+                                void *venv, uint32_t desc)
+{
+    CPUARMState *env = venv;
+    do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
+                 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
+}
+
+void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
+                                void *venv, uint32_t desc)
+{
+    CPUARMState *env = venv;
+    do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc,
+                 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
+}