summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r--audio/audio.c2
-rw-r--r--audio/paaudio.c8
-rw-r--r--default-configs/arm-softmmu.mak1
-rw-r--r--device_tree.c4
-rw-r--r--fpu/softfloat.c38
-rw-r--r--hw/arm/Makefile.objs2
-rw-r--r--hw/arm/boot.c32
-rw-r--r--hw/arm/integratorcp.c13
-rw-r--r--hw/arm/virt.c452
-rw-r--r--hw/audio/adlib.c4
-rw-r--r--hw/audio/intel-hda.c1
-rw-r--r--hw/cpu/a9mpcore.c44
-rw-r--r--hw/display/qxl-render.c1
-rw-r--r--hw/net/cadence_gem.c278
-rw-r--r--hw/timer/Makefile.objs1
-rw-r--r--hw/timer/a9gtimer.c369
-rw-r--r--include/fpu/softfloat.h4
-rw-r--r--include/hw/arm/arm.h7
-rw-r--r--include/hw/cpu/a9mpcore.h4
-rw-r--r--include/hw/timer/a9gtimer.h97
-rw-r--r--libcacard/vscclient.c4
-rw-r--r--target-arm/cpu-qom.h11
-rw-r--r--target-arm/cpu.c59
-rw-r--r--target-arm/cpu.h13
-rw-r--r--target-arm/helper.c33
-rw-r--r--target-arm/helper.h5
-rw-r--r--target-arm/kvm-consts.h64
-rw-r--r--target-arm/kvm.c243
-rw-r--r--target-arm/kvm_arm.h55
-rw-r--r--target-arm/translate.c302
-rw-r--r--target-mips/dsp_helper.c30
-rw-r--r--target-mips/translate.c7
-rw-r--r--target-sh4/cpu.h6
33 files changed, 1957 insertions, 237 deletions
diff --git a/audio/audio.c b/audio/audio.c
index b3db67979d..fc775110af 100644
--- a/audio/audio.c
+++ b/audio/audio.c
@@ -95,7 +95,7 @@ static struct {
         }
     },
 
-    .period = { .hertz = 250 },
+    .period = { .hertz = 100 },
     .plive = 0,
     .log_to_monitor = 0,
     .try_poll_in = 1,
diff --git a/audio/paaudio.c b/audio/paaudio.c
index 8b69778ad9..90ff24500b 100644
--- a/audio/paaudio.c
+++ b/audio/paaudio.c
@@ -547,11 +547,11 @@ static int qpa_init_out (HWVoiceOut *hw, struct audsettings *as)
     ss.rate = as->freq;
 
     /*
-     * qemu audio tick runs at 250 Hz (by default), so processing
-     * data chunks worth 4 ms of sound should be a good fit.
+     * qemu audio tick runs at 100 Hz (by default), so processing
+     * data chunks worth 10 ms of sound should be a good fit.
      */
-    ba.tlength = pa_usec_to_bytes (4 * 1000, &ss);
-    ba.minreq = pa_usec_to_bytes (2 * 1000, &ss);
+    ba.tlength = pa_usec_to_bytes (10 * 1000, &ss);
+    ba.minreq = pa_usec_to_bytes (5 * 1000, &ss);
     ba.maxlength = -1;
     ba.prebuf = -1;
 
diff --git a/default-configs/arm-softmmu.mak b/default-configs/arm-softmmu.mak
index a555eefed5..e48f102af6 100644
--- a/default-configs/arm-softmmu.mak
+++ b/default-configs/arm-softmmu.mak
@@ -41,6 +41,7 @@ CONFIG_ARM_GIC=y
 CONFIG_ARM_GIC_KVM=$(CONFIG_KVM)
 CONFIG_ARM_TIMER=y
 CONFIG_ARM_MPTIMER=y
+CONFIG_A9_GTIMER=y
 CONFIG_PL011=y
 CONFIG_PL022=y
 CONFIG_PL031=y
diff --git a/device_tree.c b/device_tree.c
index ffec99ae29..391da8c45e 100644
--- a/device_tree.c
+++ b/device_tree.c
@@ -41,6 +41,10 @@ void *create_device_tree(int *sizep)
     if (ret < 0) {
         goto fail;
     }
+    ret = fdt_finish_reservemap(fdt);
+    if (ret < 0) {
+        goto fail;
+    }
     ret = fdt_begin_node(fdt, "");
     if (ret < 0) {
         goto fail;
diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index 7ba51b6f3c..dbda61bc8e 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -6705,10 +6705,17 @@ int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
 /* min() and max() functions. These can't be implemented as
  * 'compare and pick one input' because that would mishandle
  * NaNs and +0 vs -0.
+ *
+ * minnum() and maxnum() functions. These are similar to the min()
+ * and max() functions but if one of the arguments is a QNaN and
+ * the other is numerical then the numerical argument is returned.
+ * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
+ * and maxNum() operations. min() and max() are the typical min/max
+ * semantics provided by many CPUs which predate that specification.
  */
-#define MINMAX(s, nan_exp)                                              \
+#define MINMAX(s)                                                       \
 INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
-                                        int ismin STATUS_PARAM )        \
+                                        int ismin, int isieee STATUS_PARAM) \
 {                                                                       \
     flag aSign, bSign;                                                  \
     uint ## s ## _t av, bv;                                             \
@@ -6716,6 +6723,15 @@ INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
     b = float ## s ## _squash_input_denormal(b STATUS_VAR);             \
     if (float ## s ## _is_any_nan(a) ||                                 \
         float ## s ## _is_any_nan(b)) {                                 \
+        if (isieee) {                                                   \
+            if (float ## s ## _is_quiet_nan(a) &&                       \
+                !float ## s ##_is_any_nan(b)) {                         \
+                return b;                                               \
+            } else if (float ## s ## _is_quiet_nan(b) &&                \
+                       !float ## s ## _is_any_nan(a)) {                 \
+                return a;                                               \
+            }                                                           \
+        }                                                               \
         return propagateFloat ## s ## NaN(a, b STATUS_VAR);             \
     }                                                                   \
     aSign = extractFloat ## s ## Sign(a);                               \
@@ -6739,16 +6755,26 @@ INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
                                                                         \
 float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM)  \
 {                                                                       \
-    return float ## s ## _minmax(a, b, 1 STATUS_VAR);                   \
+    return float ## s ## _minmax(a, b, 1, 0 STATUS_VAR);                \
 }                                                                       \
                                                                         \
 float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM)  \
 {                                                                       \
-    return float ## s ## _minmax(a, b, 0 STATUS_VAR);                   \
+    return float ## s ## _minmax(a, b, 0, 0 STATUS_VAR);                \
+}                                                                       \
+                                                                        \
+float ## s float ## s ## _minnum(float ## s a, float ## s b STATUS_PARAM) \
+{                                                                       \
+    return float ## s ## _minmax(a, b, 1, 1 STATUS_VAR);                \
+}                                                                       \
+                                                                        \
+float ## s float ## s ## _maxnum(float ## s a, float ## s b STATUS_PARAM) \
+{                                                                       \
+    return float ## s ## _minmax(a, b, 0, 1 STATUS_VAR);                \
 }
 
-MINMAX(32, 0xff)
-MINMAX(64, 0x7ff)
+MINMAX(32)
+MINMAX(64)
 
 
 /* Multiply A by 2 raised to the power N.  */
diff --git a/hw/arm/Makefile.objs b/hw/arm/Makefile.objs
index 3671b42738..78b56149b6 100644
--- a/hw/arm/Makefile.objs
+++ b/hw/arm/Makefile.objs
@@ -1,7 +1,7 @@
 obj-y += boot.o collie.o exynos4_boards.o gumstix.o highbank.o
 obj-y += integratorcp.o kzm.o mainstone.o musicpal.o nseries.o
 obj-y += omap_sx1.o palm.o realview.o spitz.o stellaris.o
-obj-y += tosa.o versatilepb.o vexpress.o xilinx_zynq.o z2.o
+obj-y += tosa.o versatilepb.o vexpress.o virt.o xilinx_zynq.o z2.o
 
 obj-y += armv7m.o exynos4210.o pxa2xx.o pxa2xx_gpio.o pxa2xx_pic.o
 obj-y += omap1.o omap2.o strongarm.o
diff --git a/hw/arm/boot.c b/hw/arm/boot.c
index 583ec7992e..55d552f3a8 100644
--- a/hw/arm/boot.c
+++ b/hw/arm/boot.c
@@ -228,23 +228,31 @@ static void set_kernel_args_old(const struct arm_boot_info *info)
 static int load_dtb(hwaddr addr, const struct arm_boot_info *binfo)
 {
     void *fdt = NULL;
-    char *filename;
     int size, rc;
     uint32_t acells, scells;
 
-    filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, binfo->dtb_filename);
-    if (!filename) {
-        fprintf(stderr, "Couldn't open dtb file %s\n", binfo->dtb_filename);
-        goto fail;
-    }
+    if (binfo->dtb_filename) {
+        char *filename;
+        filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, binfo->dtb_filename);
+        if (!filename) {
+            fprintf(stderr, "Couldn't open dtb file %s\n", binfo->dtb_filename);
+            goto fail;
+        }
 
-    fdt = load_device_tree(filename, &size);
-    if (!fdt) {
-        fprintf(stderr, "Couldn't open dtb file %s\n", filename);
+        fdt = load_device_tree(filename, &size);
+        if (!fdt) {
+            fprintf(stderr, "Couldn't open dtb file %s\n", filename);
+            g_free(filename);
+            goto fail;
+        }
         g_free(filename);
-        goto fail;
+    } else if (binfo->get_dtb) {
+        fdt = binfo->get_dtb(binfo, &size);
+        if (!fdt) {
+            fprintf(stderr, "Board was unable to create a dtb blob\n");
+            goto fail;
+        }
     }
-    g_free(filename);
 
     acells = qemu_devtree_getprop_cell(fdt, "/", "#address-cells");
     scells = qemu_devtree_getprop_cell(fdt, "/", "#size-cells");
@@ -438,7 +446,7 @@ void arm_load_kernel(ARMCPU *cpu, struct arm_boot_info *info)
         /* for device tree boot, we pass the DTB directly in r2. Otherwise
          * we point to the kernel args.
          */
-        if (info->dtb_filename) {
+        if (info->dtb_filename || info->get_dtb) {
             /* Place the DTB after the initrd in memory. Note that some
              * kernels will trash anything in the 4K page the initrd
              * ends in, so make sure the DTB isn't caught up in that.
diff --git a/hw/arm/integratorcp.c b/hw/arm/integratorcp.c
index c44b2a499c..a759689b44 100644
--- a/hw/arm/integratorcp.c
+++ b/hw/arm/integratorcp.c
@@ -36,6 +36,7 @@ typedef struct IntegratorCMState {
     uint32_t cm_init;
     uint32_t cm_flags;
     uint32_t cm_nvflags;
+    uint32_t cm_refcnt_offset;
     uint32_t int_level;
     uint32_t irq_enabled;
     uint32_t fiq_enabled;
@@ -82,9 +83,13 @@ static uint64_t integratorcm_read(void *opaque, hwaddr offset,
         return s->cm_sdram;
     case 9: /* CM_INIT */
         return s->cm_init;
-    case 10: /* CM_REFCT */
-        /* ??? High frequency timer.  */
-        hw_error("integratorcm_read: CM_REFCT");
+    case 10: /* CM_REFCNT */
+        /* This register, CM_REFCNT, provides a 32-bit count value.
+         * The count increments at the fixed reference clock frequency of 24MHz
+         * and can be used as a real-time counter.
+         */
+        return (uint32_t)muldiv64(qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL), 24,
+                                  1000) - s->cm_refcnt_offset;
     case 12: /* CM_FLAGS */
         return s->cm_flags;
     case 14: /* CM_NVFLAGS */
@@ -257,6 +262,8 @@ static int integratorcm_init(SysBusDevice *dev)
     }
     memcpy(integrator_spd + 73, "QEMU-MEMORY", 11);
     s->cm_init = 0x00000112;
+    s->cm_refcnt_offset = muldiv64(qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL), 24,
+                                   1000);
     memory_region_init_ram(&s->flash, OBJECT(s), "integrator.flash", 0x100000);
     vmstate_register_ram_global(&s->flash);
 
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
new file mode 100644
index 0000000000..9531b5a574
--- /dev/null
+++ b/hw/arm/virt.c
@@ -0,0 +1,452 @@
+/*
+ * ARM mach-virt emulation
+ *
+ * Copyright (c) 2013 Linaro Limited
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2 or later, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Emulate a virtual board which works by passing Linux all the information
+ * it needs about what devices are present via the device tree.
+ * There are some restrictions about what we can do here:
+ *  + we can only present devices whose Linux drivers will work based
+ *    purely on the device tree with no platform data at all
+ *  + we want to present a very stripped-down minimalist platform,
+ *    both because this reduces the security attack surface from the guest
+ *    and also because it reduces our exposure to being broken when
+ *    the kernel updates its device tree bindings and requires further
+ *    information in a device binding that we aren't providing.
+ * This is essentially the same approach kvmtool uses.
+ */
+
+#include "hw/sysbus.h"
+#include "hw/arm/arm.h"
+#include "hw/arm/primecell.h"
+#include "hw/devices.h"
+#include "net/net.h"
+#include "sysemu/device_tree.h"
+#include "sysemu/sysemu.h"
+#include "sysemu/kvm.h"
+#include "hw/boards.h"
+#include "exec/address-spaces.h"
+#include "qemu/bitops.h"
+#include "qemu/error-report.h"
+
+#define NUM_VIRTIO_TRANSPORTS 32
+
+/* Number of external interrupt lines to configure the GIC with */
+#define NUM_IRQS 128
+
+#define GIC_FDT_IRQ_TYPE_SPI 0
+#define GIC_FDT_IRQ_TYPE_PPI 1
+
+#define GIC_FDT_IRQ_FLAGS_EDGE_LO_HI 1
+#define GIC_FDT_IRQ_FLAGS_EDGE_HI_LO 2
+#define GIC_FDT_IRQ_FLAGS_LEVEL_HI 4
+#define GIC_FDT_IRQ_FLAGS_LEVEL_LO 8
+
+#define GIC_FDT_IRQ_PPI_CPU_START 8
+#define GIC_FDT_IRQ_PPI_CPU_WIDTH 8
+
+enum {
+    VIRT_FLASH,
+    VIRT_MEM,
+    VIRT_CPUPERIPHS,
+    VIRT_GIC_DIST,
+    VIRT_GIC_CPU,
+    VIRT_UART,
+    VIRT_MMIO,
+};
+
+typedef struct MemMapEntry {
+    hwaddr base;
+    hwaddr size;
+} MemMapEntry;
+
+typedef struct VirtBoardInfo {
+    struct arm_boot_info bootinfo;
+    const char *cpu_model;
+    const char *qdevname;
+    const char *gic_compatible;
+    const MemMapEntry *memmap;
+    const int *irqmap;
+    int smp_cpus;
+    void *fdt;
+    int fdt_size;
+    uint32_t clock_phandle;
+} VirtBoardInfo;
+
+/* Addresses and sizes of our components.
+ * 0..128MB is space for a flash device so we can run bootrom code such as UEFI.
+ * 128MB..256MB is used for miscellaneous device I/O.
+ * 256MB..1GB is reserved for possible future PCI support (ie where the
+ * PCI memory window will go if we add a PCI host controller).
+ * 1GB and up is RAM (which may happily spill over into the
+ * high memory region beyond 4GB).
+ * This represents a compromise between how much RAM can be given to
+ * a 32 bit VM and leaving space for expansion and in particular for PCI.
+ */
+static const MemMapEntry a15memmap[] = {
+    /* Space up to 0x8000000 is reserved for a boot ROM */
+    [VIRT_FLASH] = { 0, 0x8000000 },
+    [VIRT_CPUPERIPHS] = { 0x8000000, 0x8000 },
+    /* GIC distributor and CPU interfaces sit inside the CPU peripheral space */
+    [VIRT_GIC_DIST] = { 0x8001000, 0x1000 },
+    [VIRT_GIC_CPU] = { 0x8002000, 0x1000 },
+    [VIRT_UART] = { 0x9000000, 0x1000 },
+    [VIRT_MMIO] = { 0xa000000, 0x200 },
+    /* ...repeating for a total of NUM_VIRTIO_TRANSPORTS, each of that size */
+    /* 0x10000000 .. 0x40000000 reserved for PCI */
+    [VIRT_MEM] = { 0x40000000, 30ULL * 1024 * 1024 * 1024 },
+};
+
+static const int a15irqmap[] = {
+    [VIRT_UART] = 1,
+    [VIRT_MMIO] = 16, /* ...to 16 + NUM_VIRTIO_TRANSPORTS - 1 */
+};
+
+static VirtBoardInfo machines[] = {
+    {
+        .cpu_model = "cortex-a15",
+        .qdevname = "a15mpcore_priv",
+        .gic_compatible = "arm,cortex-a15-gic",
+        .memmap = a15memmap,
+        .irqmap = a15irqmap,
+    },
+    {
+        .cpu_model = "host",
+        /* We use the A15 private peripheral model to get a V2 GIC */
+        .qdevname = "a15mpcore_priv",
+        .gic_compatible = "arm,cortex-a15-gic",
+        .memmap = a15memmap,
+        .irqmap = a15irqmap,
+    },
+};
+
+static VirtBoardInfo *find_machine_info(const char *cpu)
+{
+    int i;
+
+    for (i = 0; i < ARRAY_SIZE(machines); i++) {
+        if (strcmp(cpu, machines[i].cpu_model) == 0) {
+            return &machines[i];
+        }
+    }
+    return NULL;
+}
+
+static void create_fdt(VirtBoardInfo *vbi)
+{
+    void *fdt = create_device_tree(&vbi->fdt_size);
+
+    if (!fdt) {
+        error_report("create_device_tree() failed");
+        exit(1);
+    }
+
+    vbi->fdt = fdt;
+
+    /* Header */
+    qemu_devtree_setprop_string(fdt, "/", "compatible", "linux,dummy-virt");
+    qemu_devtree_setprop_cell(fdt, "/", "#address-cells", 0x2);
+    qemu_devtree_setprop_cell(fdt, "/", "#size-cells", 0x2);
+
+    /*
+     * /chosen and /memory nodes must exist for load_dtb
+     * to fill in necessary properties later
+     */
+    qemu_devtree_add_subnode(fdt, "/chosen");
+    qemu_devtree_add_subnode(fdt, "/memory");
+    qemu_devtree_setprop_string(fdt, "/memory", "device_type", "memory");
+
+    /* Clock node, for the benefit of the UART. The kernel device tree
+     * binding documentation claims the PL011 node clock properties are
+     * optional but in practice if you omit them the kernel refuses to
+     * probe for the device.
+     */
+    vbi->clock_phandle = qemu_devtree_alloc_phandle(fdt);
+    qemu_devtree_add_subnode(fdt, "/apb-pclk");
+    qemu_devtree_setprop_string(fdt, "/apb-pclk", "compatible", "fixed-clock");
+    qemu_devtree_setprop_cell(fdt, "/apb-pclk", "#clock-cells", 0x0);
+    qemu_devtree_setprop_cell(fdt, "/apb-pclk", "clock-frequency", 24000000);
+    qemu_devtree_setprop_string(fdt, "/apb-pclk", "clock-output-names",
+                                "clk24mhz");
+    qemu_devtree_setprop_cell(fdt, "/apb-pclk", "phandle", vbi->clock_phandle);
+
+    /* No PSCI for TCG yet */
+    if (kvm_enabled()) {
+        qemu_devtree_add_subnode(fdt, "/psci");
+        qemu_devtree_setprop_string(fdt, "/psci", "compatible", "arm,psci");
+        qemu_devtree_setprop_string(fdt, "/psci", "method", "hvc");
+        qemu_devtree_setprop_cell(fdt, "/psci", "cpu_suspend",
+                                  PSCI_FN_CPU_SUSPEND);
+        qemu_devtree_setprop_cell(fdt, "/psci", "cpu_off", PSCI_FN_CPU_OFF);
+        qemu_devtree_setprop_cell(fdt, "/psci", "cpu_on", PSCI_FN_CPU_ON);
+        qemu_devtree_setprop_cell(fdt, "/psci", "migrate", PSCI_FN_MIGRATE);
+    }
+}
+
+static void fdt_add_timer_nodes(const VirtBoardInfo *vbi)
+{
+    /* Note that on A15 h/w these interrupts are level-triggered,
+     * but for the GIC implementation provided by both QEMU and KVM
+     * they are edge-triggered.
+     */
+    uint32_t irqflags = GIC_FDT_IRQ_FLAGS_EDGE_LO_HI;
+
+    irqflags = deposit32(irqflags, GIC_FDT_IRQ_PPI_CPU_START,
+                         GIC_FDT_IRQ_PPI_CPU_WIDTH, (1 << vbi->smp_cpus) - 1);
+
+    qemu_devtree_add_subnode(vbi->fdt, "/timer");
+    qemu_devtree_setprop_string(vbi->fdt, "/timer",
+                                "compatible", "arm,armv7-timer");
+    qemu_devtree_setprop_cells(vbi->fdt, "/timer", "interrupts",
+                               GIC_FDT_IRQ_TYPE_PPI, 13, irqflags,
+                               GIC_FDT_IRQ_TYPE_PPI, 14, irqflags,
+                               GIC_FDT_IRQ_TYPE_PPI, 11, irqflags,
+                               GIC_FDT_IRQ_TYPE_PPI, 10, irqflags);
+}
+
+static void fdt_add_cpu_nodes(const VirtBoardInfo *vbi)
+{
+    int cpu;
+
+    qemu_devtree_add_subnode(vbi->fdt, "/cpus");
+    qemu_devtree_setprop_cell(vbi->fdt, "/cpus", "#address-cells", 0x1);
+    qemu_devtree_setprop_cell(vbi->fdt, "/cpus", "#size-cells", 0x0);
+
+    for (cpu = vbi->smp_cpus - 1; cpu >= 0; cpu--) {
+        char *nodename = g_strdup_printf("/cpus/cpu@%d", cpu);
+        ARMCPU *armcpu = ARM_CPU(qemu_get_cpu(cpu));
+
+        qemu_devtree_add_subnode(vbi->fdt, nodename);
+        qemu_devtree_setprop_string(vbi->fdt, nodename, "device_type", "cpu");
+        qemu_devtree_setprop_string(vbi->fdt, nodename, "compatible",
+                                    armcpu->dtb_compatible);
+
+        if (vbi->smp_cpus > 1) {
+            qemu_devtree_setprop_string(vbi->fdt, nodename,
+                                        "enable-method", "psci");
+        }
+
+        qemu_devtree_setprop_cell(vbi->fdt, nodename, "reg", cpu);
+        g_free(nodename);
+    }
+}
+
+static void fdt_add_gic_node(const VirtBoardInfo *vbi)
+{
+    uint32_t gic_phandle;
+
+    gic_phandle = qemu_devtree_alloc_phandle(vbi->fdt);
+    qemu_devtree_setprop_cell(vbi->fdt, "/", "interrupt-parent", gic_phandle);
+
+    qemu_devtree_add_subnode(vbi->fdt, "/intc");
+    qemu_devtree_setprop_string(vbi->fdt, "/intc", "compatible",
+                                vbi->gic_compatible);
+    qemu_devtree_setprop_cell(vbi->fdt, "/intc", "#interrupt-cells", 3);
+    qemu_devtree_setprop(vbi->fdt, "/intc", "interrupt-controller", NULL, 0);
+    qemu_devtree_setprop_sized_cells(vbi->fdt, "/intc", "reg",
+                                     2, vbi->memmap[VIRT_GIC_DIST].base,
+                                     2, vbi->memmap[VIRT_GIC_DIST].size,
+                                     2, vbi->memmap[VIRT_GIC_CPU].base,
+                                     2, vbi->memmap[VIRT_GIC_CPU].size);
+    qemu_devtree_setprop_cell(vbi->fdt, "/intc", "phandle", gic_phandle);
+}
+
+static void create_uart(const VirtBoardInfo *vbi, qemu_irq *pic)
+{
+    char *nodename;
+    hwaddr base = vbi->memmap[VIRT_UART].base;
+    hwaddr size = vbi->memmap[VIRT_UART].size;
+    int irq = vbi->irqmap[VIRT_UART];
+    const char compat[] = "arm,pl011\0arm,primecell";
+    const char clocknames[] = "uartclk\0apb_pclk";
+
+    sysbus_create_simple("pl011", base, pic[irq]);
+
+    nodename = g_strdup_printf("/pl011@%" PRIx64, base);
+    qemu_devtree_add_subnode(vbi->fdt, nodename);
+    /* Note that we can't use setprop_string because of the embedded NUL */
+    qemu_devtree_setprop(vbi->fdt, nodename, "compatible",
+                         compat, sizeof(compat));
+    qemu_devtree_setprop_sized_cells(vbi->fdt, nodename, "reg",
+                                     2, base, 2, size);
+    qemu_devtree_setprop_cells(vbi->fdt, nodename, "interrupts",
+                               GIC_FDT_IRQ_TYPE_SPI, irq,
+                               GIC_FDT_IRQ_FLAGS_EDGE_LO_HI);
+    qemu_devtree_setprop_cells(vbi->fdt, nodename, "clocks",
+                               vbi->clock_phandle, vbi->clock_phandle);
+    qemu_devtree_setprop(vbi->fdt, nodename, "clock-names",
+                         clocknames, sizeof(clocknames));
+    g_free(nodename);
+}
+
+static void create_virtio_devices(const VirtBoardInfo *vbi, qemu_irq *pic)
+{
+    int i;
+    hwaddr size = vbi->memmap[VIRT_MMIO].size;
+
+    /* Note that we have to create the transports in forwards order
+     * so that command line devices are inserted lowest address first,
+     * and then add dtb nodes in reverse order so that they appear in
+     * the finished device tree lowest address first.
+     */
+    for (i = 0; i < NUM_VIRTIO_TRANSPORTS; i++) {
+        int irq = vbi->irqmap[VIRT_MMIO] + i;
+        hwaddr base = vbi->memmap[VIRT_MMIO].base + i * size;
+
+        sysbus_create_simple("virtio-mmio", base, pic[irq]);
+    }
+
+    for (i = NUM_VIRTIO_TRANSPORTS - 1; i >= 0; i--) {
+        char *nodename;
+        int irq = vbi->irqmap[VIRT_MMIO] + i;
+        hwaddr base = vbi->memmap[VIRT_MMIO].base + i * size;
+
+        nodename = g_strdup_printf("/virtio_mmio@%" PRIx64, base);
+        qemu_devtree_add_subnode(vbi->fdt, nodename);
+        qemu_devtree_setprop_string(vbi->fdt, nodename,
+                                    "compatible", "virtio,mmio");
+        qemu_devtree_setprop_sized_cells(vbi->fdt, nodename, "reg",
+                                         2, base, 2, size);
+        qemu_devtree_setprop_cells(vbi->fdt, nodename, "interrupts",
+                                   GIC_FDT_IRQ_TYPE_SPI, irq,
+                                   GIC_FDT_IRQ_FLAGS_EDGE_LO_HI);
+        g_free(nodename);
+    }
+}
+
+static void *machvirt_dtb(const struct arm_boot_info *binfo, int *fdt_size)
+{
+    const VirtBoardInfo *board = (const VirtBoardInfo *)binfo;
+
+    *fdt_size = board->fdt_size;
+    return board->fdt;
+}
+
+static void machvirt_init(QEMUMachineInitArgs *args)
+{
+    qemu_irq pic[NUM_IRQS];
+    MemoryRegion *sysmem = get_system_memory();
+    int n;
+    MemoryRegion *ram = g_new(MemoryRegion, 1);
+    DeviceState *dev;
+    SysBusDevice *busdev;
+    const char *cpu_model = args->cpu_model;
+    VirtBoardInfo *vbi;
+
+    if (!cpu_model) {
+        cpu_model = "cortex-a15";
+    }
+
+    vbi = find_machine_info(cpu_model);
+
+    if (!vbi) {
+        error_report("mach-virt: CPU %s not supported", cpu_model);
+        exit(1);
+    }
+
+    vbi->smp_cpus = smp_cpus;
+
+    /*
+     * Only supported method of starting secondary CPUs is PSCI and
+     * PSCI is not yet supported with TCG, so limit smp_cpus to 1
+     * if we're not using KVM.
+     */
+    if (!kvm_enabled() && smp_cpus > 1) {
+        error_report("mach-virt: must enable KVM to use multiple CPUs");
+        exit(1);
+    }
+
+    if (args->ram_size > vbi->memmap[VIRT_MEM].size) {
+        error_report("mach-virt: cannot model more than 30GB RAM");
+        exit(1);
+    }
+
+    create_fdt(vbi);
+    fdt_add_timer_nodes(vbi);
+
+    for (n = 0; n < smp_cpus; n++) {
+        ObjectClass *oc = cpu_class_by_name(TYPE_ARM_CPU, cpu_model);
+        Object *cpuobj;
+
+        if (!oc) {
+            fprintf(stderr, "Unable to find CPU definition\n");
+            exit(1);
+        }
+        cpuobj = object_new(object_class_get_name(oc));
+
+        /* Secondary CPUs start in PSCI powered-down state */
+        if (n > 0) {
+            object_property_set_bool(cpuobj, true, "start-powered-off", NULL);
+        }
+        object_property_set_bool(cpuobj, true, "realized", NULL);
+    }
+    fdt_add_cpu_nodes(vbi);
+
+    memory_region_init_ram(ram, NULL, "mach-virt.ram", args->ram_size);
+    vmstate_register_ram_global(ram);
+    memory_region_add_subregion(sysmem, vbi->memmap[VIRT_MEM].base, ram);
+
+    dev = qdev_create(NULL, vbi->qdevname);
+    qdev_prop_set_uint32(dev, "num-cpu", smp_cpus);
+    /* Note that the num-irq property counts both internal and external
+     * interrupts; there are always 32 of the former (mandated by GIC spec).
+     */
+    qdev_prop_set_uint32(dev, "num-irq", NUM_IRQS + 32);
+    qdev_init_nofail(dev);
+    busdev = SYS_BUS_DEVICE(dev);
+    sysbus_mmio_map(busdev, 0, vbi->memmap[VIRT_CPUPERIPHS].base);
+    fdt_add_gic_node(vbi);
+    for (n = 0; n < smp_cpus; n++) {
+        DeviceState *cpudev = DEVICE(qemu_get_cpu(n));
+
+        sysbus_connect_irq(busdev, n, qdev_get_gpio_in(cpudev, ARM_CPU_IRQ));
+    }
+
+    for (n = 0; n < NUM_IRQS; n++) {
+        pic[n] = qdev_get_gpio_in(dev, n);
+    }
+
+    create_uart(vbi, pic);
+
+    /* Create mmio transports, so the user can create virtio backends
+     * (which will be automatically plugged in to the transports). If
+     * no backend is created the transport will just sit harmlessly idle.
+     */
+    create_virtio_devices(vbi, pic);
+
+    vbi->bootinfo.ram_size = args->ram_size;
+    vbi->bootinfo.kernel_filename = args->kernel_filename;
+    vbi->bootinfo.kernel_cmdline = args->kernel_cmdline;
+    vbi->bootinfo.initrd_filename = args->initrd_filename;
+    vbi->bootinfo.nb_cpus = smp_cpus;
+    vbi->bootinfo.board_id = -1;
+    vbi->bootinfo.loader_start = vbi->memmap[VIRT_MEM].base;
+    vbi->bootinfo.get_dtb = machvirt_dtb;
+    arm_load_kernel(ARM_CPU(first_cpu), &vbi->bootinfo);
+}
+
+static QEMUMachine machvirt_a15_machine = {
+    .name = "virt",
+    .desc = "ARM Virtual Machine",
+    .init = machvirt_init,
+    .max_cpus = 4,
+};
+
+static void machvirt_machine_init(void)
+{
+    qemu_register_machine(&machvirt_a15_machine);
+}
+
+machine_init(machvirt_machine_init);
diff --git a/hw/audio/adlib.c b/hw/audio/adlib.c
index bd8e9d9815..e88d2dd845 100644
--- a/hw/audio/adlib.c
+++ b/hw/audio/adlib.c
@@ -347,8 +347,8 @@ static void adlib_realizefn (DeviceState *dev, Error **errp)
     s->samples = AUD_get_buffer_size_out (s->voice) >> SHIFT;
     s->mixbuf = g_malloc0 (s->samples << SHIFT);
 
-    adlib_portio_list[1].offset = s->port;
-    adlib_portio_list[2].offset = s->port + 8;
+    adlib_portio_list[0].offset = s->port;
+    adlib_portio_list[1].offset = s->port + 8;
     portio_list_init (port_list, OBJECT(s), adlib_portio_list, s, "adlib");
     portio_list_add (port_list, isa_address_space_io(&s->parent_obj), 0);
 }
diff --git a/hw/audio/intel-hda.c b/hw/audio/intel-hda.c
index 4327264394..6ab8c245d3 100644
--- a/hw/audio/intel-hda.c
+++ b/hw/audio/intel-hda.c
@@ -444,6 +444,7 @@ static bool intel_hda_xfer(HDACodecDevice *dev, uint32_t stnr, bool output,
         }
     }
     if (d->dp_lbase & 0x01) {
+        s = st - d->st;
         addr = intel_hda_addr(d->dp_lbase & ~0x01, d->dp_ubase);
         stl_le_pci_dma(&d->pci, addr + 8*s, st->lpib);
     }
diff --git a/hw/cpu/a9mpcore.c b/hw/cpu/a9mpcore.c
index 918a7d1291..c09358c6e7 100644
--- a/hw/cpu/a9mpcore.c
+++ b/hw/cpu/a9mpcore.c
@@ -24,11 +24,14 @@ static void a9mp_priv_initfn(Object *obj)
     memory_region_init(&s->container, obj, "a9mp-priv-container", 0x2000);
     sysbus_init_mmio(SYS_BUS_DEVICE(obj), &s->container);
 
+    object_initialize(&s->scu, sizeof(s->scu), TYPE_A9_SCU);
+    qdev_set_parent_bus(DEVICE(&s->scu), sysbus_get_default());
+
     object_initialize(&s->gic, sizeof(s->gic), TYPE_ARM_GIC);
     qdev_set_parent_bus(DEVICE(&s->gic), sysbus_get_default());
 
-    object_initialize(&s->scu, sizeof(s->scu), TYPE_A9_SCU);
-    qdev_set_parent_bus(DEVICE(&s->scu), sysbus_get_default());
+    object_initialize(&s->gtimer, sizeof(s->gtimer), TYPE_A9_GTIMER);
+    qdev_set_parent_bus(DEVICE(&s->gtimer), sysbus_get_default());
 
     object_initialize(&s->mptimer, sizeof(s->mptimer), TYPE_ARM_MPTIMER);
     qdev_set_parent_bus(DEVICE(&s->mptimer), sysbus_get_default());
@@ -41,11 +44,21 @@ static void a9mp_priv_realize(DeviceState *dev, Error **errp)
 {
     SysBusDevice *sbd = SYS_BUS_DEVICE(dev);
     A9MPPrivState *s = A9MPCORE_PRIV(dev);
-    DeviceState *gicdev, *scudev, *mptimerdev, *wdtdev;
-    SysBusDevice *timerbusdev, *wdtbusdev, *gicbusdev, *scubusdev;
+    DeviceState *scudev, *gicdev, *gtimerdev, *mptimerdev, *wdtdev;
+    SysBusDevice *scubusdev, *gicbusdev, *gtimerbusdev, *mptimerbusdev,
+                 *wdtbusdev;
     Error *err = NULL;
     int i;
 
+    scudev = DEVICE(&s->scu);
+    qdev_prop_set_uint32(scudev, "num-cpu", s->num_cpu);
+    object_property_set_bool(OBJECT(&s->scu), true, "realized", &err);
+    if (err != NULL) {
+        error_propagate(errp, err);
+        return;
+    }
+    scubusdev = SYS_BUS_DEVICE(&s->scu);
+
     gicdev = DEVICE(&s->gic);
     qdev_prop_set_uint32(gicdev, "num-cpu", s->num_cpu);
     qdev_prop_set_uint32(gicdev, "num-irq", s->num_irq);
@@ -62,14 +75,14 @@ static void a9mp_priv_realize(DeviceState *dev, Error **errp)
     /* Pass through inbound GPIO lines to the GIC */
     qdev_init_gpio_in(dev, a9mp_priv_set_irq, s->num_irq - 32);
 
-    scudev = DEVICE(&s->scu);
-    qdev_prop_set_uint32(scudev, "num-cpu", s->num_cpu);
-    object_property_set_bool(OBJECT(&s->scu), true, "realized", &err);
+    gtimerdev = DEVICE(&s->gtimer);
+    qdev_prop_set_uint32(gtimerdev, "num-cpu", s->num_cpu);
+    object_property_set_bool(OBJECT(&s->gtimer), true, "realized", &err);
     if (err != NULL) {
         error_propagate(errp, err);
         return;
     }
-    scubusdev = SYS_BUS_DEVICE(&s->scu);
+    gtimerbusdev = SYS_BUS_DEVICE(&s->gtimer);
 
     mptimerdev = DEVICE(&s->mptimer);
     qdev_prop_set_uint32(mptimerdev, "num-cpu", s->num_cpu);
@@ -78,7 +91,7 @@ static void a9mp_priv_realize(DeviceState *dev, Error **errp)
         error_propagate(errp, err);
         return;
     }
-    timerbusdev = SYS_BUS_DEVICE(&s->mptimer);
+    mptimerbusdev = SYS_BUS_DEVICE(&s->mptimer);
 
     wdtdev = DEVICE(&s->wdt);
     qdev_prop_set_uint32(wdtdev, "num-cpu", s->num_cpu);
@@ -97,30 +110,33 @@ static void a9mp_priv_realize(DeviceState *dev, Error **errp)
      *  0x0600-0x06ff -- private timers and watchdogs
      *  0x0700-0x0fff -- nothing
      *  0x1000-0x1fff -- GIC Distributor
-     *
-     * We should implement the global timer but don't currently do so.
      */
     memory_region_add_subregion(&s->container, 0,
                                 sysbus_mmio_get_region(scubusdev, 0));
     /* GIC CPU interface */
     memory_region_add_subregion(&s->container, 0x100,
                                 sysbus_mmio_get_region(gicbusdev, 1));
+    memory_region_add_subregion(&s->container, 0x200,
+                                sysbus_mmio_get_region(gtimerbusdev, 0));
     /* Note that the A9 exposes only the "timer/watchdog for this core"
      * memory region, not the "timer/watchdog for core X" ones 11MPcore has.
      */
     memory_region_add_subregion(&s->container, 0x600,
-                                sysbus_mmio_get_region(timerbusdev, 0));
+                                sysbus_mmio_get_region(mptimerbusdev, 0));
     memory_region_add_subregion(&s->container, 0x620,
                                 sysbus_mmio_get_region(wdtbusdev, 0));
     memory_region_add_subregion(&s->container, 0x1000,
                                 sysbus_mmio_get_region(gicbusdev, 0));
 
     /* Wire up the interrupt from each watchdog and timer.
-     * For each core the timer is PPI 29 and the watchdog PPI 30.
+     * For each core the global timer is PPI 27, the private
+     * timer is PPI 29 and the watchdog PPI 30.
      */
     for (i = 0; i < s->num_cpu; i++) {
         int ppibase = (s->num_irq - 32) + i * 32;
-        sysbus_connect_irq(timerbusdev, i,
+        sysbus_connect_irq(gtimerbusdev, i,
+                           qdev_get_gpio_in(gicdev, ppibase + 27));
+        sysbus_connect_irq(mptimerbusdev, i,
                            qdev_get_gpio_in(gicdev, ppibase + 29));
         sysbus_connect_irq(wdtbusdev, i,
                            qdev_get_gpio_in(gicdev, ppibase + 30));
diff --git a/hw/display/qxl-render.c b/hw/display/qxl-render.c
index d34b0c4170..84f1367716 100644
--- a/hw/display/qxl-render.c
+++ b/hw/display/qxl-render.c
@@ -20,6 +20,7 @@
  */
 
 #include "qxl.h"
+#include "trace.h"
 
 static void qxl_blit(PCIQXLDevice *qxl, QXLRect *rect)
 {
diff --git a/hw/net/cadence_gem.c b/hw/net/cadence_gem.c
index 4a355bbbef..92dc2f21fa 100644
--- a/hw/net/cadence_gem.c
+++ b/hw/net/cadence_gem.c
@@ -222,8 +222,13 @@
 #define PHY_REG_INT_ST_ENERGY   0x0010
 
 /***********************************************************************/
-#define GEM_RX_REJECT  1
-#define GEM_RX_ACCEPT  0
+#define GEM_RX_REJECT                   (-1)
+#define GEM_RX_PROMISCUOUS_ACCEPT       (-2)
+#define GEM_RX_BROADCAST_ACCEPT         (-3)
+#define GEM_RX_MULTICAST_HASH_ACCEPT    (-4)
+#define GEM_RX_UNICAST_HASH_ACCEPT      (-5)
+
+#define GEM_RX_SAR_ACCEPT               0
 
 /***********************************************************************/
 
@@ -236,6 +241,13 @@
 #define DESC_0_RX_WRAP 0x00000002
 #define DESC_0_RX_OWNERSHIP 0x00000001
 
+#define R_DESC_1_RX_SAR_SHIFT           25
+#define R_DESC_1_RX_SAR_LENGTH          2
+#define R_DESC_1_RX_SAR_MATCH           (1 << 27)
+#define R_DESC_1_RX_UNICAST_HASH        (1 << 29)
+#define R_DESC_1_RX_MULTICAST_HASH      (1 << 30)
+#define R_DESC_1_RX_BROADCAST           (1 << 31)
+
 #define DESC_1_RX_SOF 0x00004000
 #define DESC_1_RX_EOF 0x00008000
 
@@ -315,6 +327,28 @@ static inline void rx_desc_set_length(unsigned *desc, unsigned len)
     desc[1] |= len;
 }
 
+static inline void rx_desc_set_broadcast(unsigned *desc)
+{
+    desc[1] |= R_DESC_1_RX_BROADCAST;
+}
+
+static inline void rx_desc_set_unicast_hash(unsigned *desc)
+{
+    desc[1] |= R_DESC_1_RX_UNICAST_HASH;
+}
+
+static inline void rx_desc_set_multicast_hash(unsigned *desc)
+{
+    desc[1] |= R_DESC_1_RX_MULTICAST_HASH;
+}
+
+static inline void rx_desc_set_sar(unsigned *desc, int sar_idx)
+{
+    desc[1] = deposit32(desc[1], R_DESC_1_RX_SAR_SHIFT, R_DESC_1_RX_SAR_LENGTH,
+                        sar_idx);
+    desc[1] |= R_DESC_1_RX_SAR_MATCH;
+}
+
 #define TYPE_CADENCE_GEM "cadence_gem"
 #define GEM(obj) OBJECT_CHECK(GemState, (obj), TYPE_CADENCE_GEM)
 
@@ -346,6 +380,11 @@ typedef struct GemState {
     uint32_t rx_desc_addr;
     uint32_t tx_desc_addr;
 
+    uint8_t can_rx_state; /* Debug only */
+
+    unsigned rx_desc[2];
+
+    bool sar_active[4];
 } GemState;
 
 /* The broadcast MAC address: 0xFFFFFFFFFFFF */
@@ -415,13 +454,28 @@ static int gem_can_receive(NetClientState *nc)
 
     s = qemu_get_nic_opaque(nc);
 
-    DB_PRINT("\n");
-
     /* Do nothing if receive is not enabled. */
     if (!(s->regs[GEM_NWCTRL] & GEM_NWCTRL_RXENA)) {
+        if (s->can_rx_state != 1) {
+            s->can_rx_state = 1;
+            DB_PRINT("can't receive - no enable\n");
+        }
         return 0;
     }
 
+    if (rx_desc_get_ownership(s->rx_desc) == 1) {
+        if (s->can_rx_state != 2) {
+            s->can_rx_state = 2;
+            DB_PRINT("can't receive - busy buffer descriptor 0x%x\n",
+                     s->rx_desc_addr);
+        }
+        return 0;
+    }
+
+    if (s->can_rx_state != 0) {
+        s->can_rx_state = 0;
+        DB_PRINT("can receive 0x%x\n", s->rx_desc_addr);
+    }
     return 1;
 }
 
@@ -527,7 +581,10 @@ static unsigned calc_mac_hash(const uint8_t *mac)
  * Accept or reject this destination address?
  * Returns:
  * GEM_RX_REJECT: reject
- * GEM_RX_ACCEPT: accept
+ * >= 0: Specific address accept (which matched SAR is returned)
+ * others for various other modes of accept:
+ * GEM_RM_PROMISCUOUS_ACCEPT, GEM_RX_BROADCAST_ACCEPT,
+ * GEM_RX_MULTICAST_HASH_ACCEPT or GEM_RX_UNICAST_HASH_ACCEPT
  */
 static int gem_mac_address_filter(GemState *s, const uint8_t *packet)
 {
@@ -536,7 +593,7 @@ static int gem_mac_address_filter(GemState *s, const uint8_t *packet)
 
     /* Promiscuous mode? */
     if (s->regs[GEM_NWCFG] & GEM_NWCFG_PROMISC) {
-        return GEM_RX_ACCEPT;
+        return GEM_RX_PROMISCUOUS_ACCEPT;
     }
 
     if (!memcmp(packet, broadcast_addr, 6)) {
@@ -544,7 +601,7 @@ static int gem_mac_address_filter(GemState *s, const uint8_t *packet)
         if (s->regs[GEM_NWCFG] & GEM_NWCFG_BCAST_REJ) {
             return GEM_RX_REJECT;
         }
-        return GEM_RX_ACCEPT;
+        return GEM_RX_BROADCAST_ACCEPT;
     }
 
     /* Accept packets -w- hash match? */
@@ -555,53 +612,67 @@ static int gem_mac_address_filter(GemState *s, const uint8_t *packet)
         hash_index = calc_mac_hash(packet);
         if (hash_index < 32) {
             if (s->regs[GEM_HASHLO] & (1<<hash_index)) {
-                return GEM_RX_ACCEPT;
+                return packet[0] == 0x01 ? GEM_RX_MULTICAST_HASH_ACCEPT :
+                                           GEM_RX_UNICAST_HASH_ACCEPT;
             }
         } else {
             hash_index -= 32;
             if (s->regs[GEM_HASHHI] & (1<<hash_index)) {
-                return GEM_RX_ACCEPT;
+                return packet[0] == 0x01 ? GEM_RX_MULTICAST_HASH_ACCEPT :
+                                           GEM_RX_UNICAST_HASH_ACCEPT;
             }
         }
     }
 
     /* Check all 4 specific addresses */
     gem_spaddr = (uint8_t *)&(s->regs[GEM_SPADDR1LO]);
-    for (i = 0; i < 4; i++) {
-        if (!memcmp(packet, gem_spaddr, 6)) {
-            return GEM_RX_ACCEPT;
+    for (i = 3; i >= 0; i--) {
+        if (s->sar_active[i] && !memcmp(packet, gem_spaddr + 8 * i, 6)) {
+            return GEM_RX_SAR_ACCEPT + i;
         }
-
-        gem_spaddr += 8;
     }
 
     /* No address match; reject the packet */
     return GEM_RX_REJECT;
 }
 
+static void gem_get_rx_desc(GemState *s)
+{
+    DB_PRINT("read descriptor 0x%x\n", (unsigned)s->rx_desc_addr);
+    /* read current descriptor */
+    cpu_physical_memory_read(s->rx_desc_addr,
+                             (uint8_t *)s->rx_desc, sizeof(s->rx_desc));
+
+    /* Descriptor owned by software ? */
+    if (rx_desc_get_ownership(s->rx_desc) == 1) {
+        DB_PRINT("descriptor 0x%x owned by sw.\n",
+                 (unsigned)s->rx_desc_addr);
+        s->regs[GEM_RXSTATUS] |= GEM_RXSTATUS_NOBUF;
+        s->regs[GEM_ISR] |= GEM_INT_RXUSED & ~(s->regs[GEM_IMR]);
+        /* Handle interrupt consequences */
+        gem_update_int_status(s);
+    }
+}
+
 /*
  * gem_receive:
  * Fit a packet handed to us by QEMU into the receive descriptor ring.
  */
 static ssize_t gem_receive(NetClientState *nc, const uint8_t *buf, size_t size)
 {
-    unsigned    desc[2];
-    hwaddr packet_desc_addr, last_desc_addr;
     GemState *s;
     unsigned   rxbufsize, bytes_to_copy;
     unsigned   rxbuf_offset;
     uint8_t    rxbuf[2048];
     uint8_t   *rxbuf_ptr;
+    bool first_desc = true;
+    int maf;
 
     s = qemu_get_nic_opaque(nc);
 
-    /* Do nothing if receive is not enabled. */
-    if (!gem_can_receive(nc)) {
-        return -1;
-    }
-
     /* Is this destination MAC address "for us" ? */
-    if (gem_mac_address_filter(s, buf) == GEM_RX_REJECT) {
+    maf = gem_mac_address_filter(s, buf);
+    if (maf == GEM_RX_REJECT) {
         return -1;
     }
 
@@ -633,6 +704,14 @@ static ssize_t gem_receive(NetClientState *nc, const uint8_t *buf, size_t size)
                  GEM_DMACFG_RBUFSZ_S) * GEM_DMACFG_RBUFSZ_MUL;
     bytes_to_copy = size;
 
+    /* Pad to minimum length. Assume FCS field is stripped, logic
+     * below will increment it to the real minimum of 64 when
+     * not FCS stripping
+     */
+    if (size < 60) {
+        size = 60;
+    }
+
     /* Strip of FCS field ? (usually yes) */
     if (s->regs[GEM_NWCFG] & GEM_NWCFG_STRIP_FCS) {
         rxbuf_ptr = (void *)buf;
@@ -659,95 +738,71 @@ static ssize_t gem_receive(NetClientState *nc, const uint8_t *buf, size_t size)
         size += 4;
     }
 
-    /* Pad to minimum length */
-    if (size < 64) {
-        size = 64;
-    }
-
     DB_PRINT("config bufsize: %d packet size: %ld\n", rxbufsize, size);
 
-    packet_desc_addr = s->rx_desc_addr;
-    while (1) {
-        DB_PRINT("read descriptor 0x%x\n", (unsigned)packet_desc_addr);
-        /* read current descriptor */
-        cpu_physical_memory_read(packet_desc_addr,
-                                 (uint8_t *)&desc[0], sizeof(desc));
-
-        /* Descriptor owned by software ? */
-        if (rx_desc_get_ownership(desc) == 1) {
-            DB_PRINT("descriptor 0x%x owned by sw.\n",
-                     (unsigned)packet_desc_addr);
-            s->regs[GEM_RXSTATUS] |= GEM_RXSTATUS_NOBUF;
-            s->regs[GEM_ISR] |= GEM_INT_RXUSED & ~(s->regs[GEM_IMR]);
-            /* Handle interrupt consequences */
-            gem_update_int_status(s);
+    while (bytes_to_copy) {
+        /* Do nothing if receive is not enabled. */
+        if (!gem_can_receive(nc)) {
+            assert(!first_desc);
             return -1;
         }
 
         DB_PRINT("copy %d bytes to 0x%x\n", MIN(bytes_to_copy, rxbufsize),
-                rx_desc_get_buffer(desc));
-
-        /*
-         * Let's have QEMU lend a helping hand.
-         */
-        if (rx_desc_get_buffer(desc) == 0) {
-            DB_PRINT("Invalid RX buffer (NULL) for descriptor 0x%x\n",
-                     (unsigned)packet_desc_addr);
-            break;
-        }
+                rx_desc_get_buffer(s->rx_desc));
 
         /* Copy packet data to emulated DMA buffer */
-        cpu_physical_memory_write(rx_desc_get_buffer(desc) + rxbuf_offset,
+        cpu_physical_memory_write(rx_desc_get_buffer(s->rx_desc) + rxbuf_offset,
                                   rxbuf_ptr, MIN(bytes_to_copy, rxbufsize));
-        bytes_to_copy -= MIN(bytes_to_copy, rxbufsize);
         rxbuf_ptr += MIN(bytes_to_copy, rxbufsize);
+        bytes_to_copy -= MIN(bytes_to_copy, rxbufsize);
+
+        /* Update the descriptor.  */
+        if (first_desc) {
+            rx_desc_set_sof(s->rx_desc);
+            first_desc = false;
+        }
         if (bytes_to_copy == 0) {
+            rx_desc_set_eof(s->rx_desc);
+            rx_desc_set_length(s->rx_desc, size);
+        }
+        rx_desc_set_ownership(s->rx_desc);
+
+        switch (maf) {
+        case GEM_RX_PROMISCUOUS_ACCEPT:
+            break;
+        case GEM_RX_BROADCAST_ACCEPT:
+            rx_desc_set_broadcast(s->rx_desc);
+            break;
+        case GEM_RX_UNICAST_HASH_ACCEPT:
+            rx_desc_set_unicast_hash(s->rx_desc);
+            break;
+        case GEM_RX_MULTICAST_HASH_ACCEPT:
+            rx_desc_set_multicast_hash(s->rx_desc);
             break;
+        case GEM_RX_REJECT:
+            abort();
+        default: /* SAR */
+            rx_desc_set_sar(s->rx_desc, maf);
         }
 
+        /* Descriptor write-back.  */
+        cpu_physical_memory_write(s->rx_desc_addr,
+                                  (uint8_t *)s->rx_desc, sizeof(s->rx_desc));
+
         /* Next descriptor */
-        if (rx_desc_get_wrap(desc)) {
-            packet_desc_addr = s->regs[GEM_RXQBASE];
+        if (rx_desc_get_wrap(s->rx_desc)) {
+            DB_PRINT("wrapping RX descriptor list\n");
+            s->rx_desc_addr = s->regs[GEM_RXQBASE];
         } else {
-            packet_desc_addr += 8;
+            DB_PRINT("incrementing RX descriptor list\n");
+            s->rx_desc_addr += 8;
         }
+        gem_get_rx_desc(s);
     }
 
-    DB_PRINT("set length: %ld, EOF on descriptor 0x%x\n", size,
-            (unsigned)packet_desc_addr);
-
-    /* Update last descriptor with EOF and total length */
-    rx_desc_set_eof(desc);
-    rx_desc_set_length(desc, size);
-    cpu_physical_memory_write(packet_desc_addr,
-                              (uint8_t *)&desc[0], sizeof(desc));
-
-    /* Advance RX packet descriptor Q */
-    last_desc_addr = packet_desc_addr;
-    packet_desc_addr = s->rx_desc_addr;
-    s->rx_desc_addr = last_desc_addr;
-    if (rx_desc_get_wrap(desc)) {
-        s->rx_desc_addr = s->regs[GEM_RXQBASE];
-        DB_PRINT("wrapping RX descriptor list\n");
-    } else {
-        DB_PRINT("incrementing RX descriptor list\n");
-        s->rx_desc_addr += 8;
-    }
-
-    DB_PRINT("set SOF, OWN on descriptor 0x%08x\n", (unsigned)packet_desc_addr);
-
     /* Count it */
     gem_receive_updatestats(s, buf, size);
 
-    /* Update first descriptor (which could also be the last) */
-    /* read descriptor */
-    cpu_physical_memory_read(packet_desc_addr,
-                             (uint8_t *)&desc[0], sizeof(desc));
-    rx_desc_set_sof(desc);
-    rx_desc_set_ownership(desc);
-    cpu_physical_memory_write(packet_desc_addr,
-                              (uint8_t *)&desc[0], sizeof(desc));
-
     s->regs[GEM_RXSTATUS] |= GEM_RXSTATUS_FRMRCVD;
     s->regs[GEM_ISR] |= GEM_INT_RXCMPL & ~(s->regs[GEM_IMR]);
 
@@ -893,7 +948,7 @@ static void gem_transmit(GemState *s)
             gem_transmit_updatestats(s, tx_packet, total_bytes);
 
             /* Send the packet somewhere */
-            if (s->phy_loop) {
+            if (s->phy_loop || (s->regs[GEM_NWCTRL] & GEM_NWCTRL_LOCALLOOP)) {
                 gem_receive(qemu_get_queue(s->nic), tx_packet, total_bytes);
             } else {
                 qemu_send_packet(qemu_get_queue(s->nic), tx_packet,
@@ -949,6 +1004,7 @@ static void gem_phy_reset(GemState *s)
 
 static void gem_reset(DeviceState *d)
 {
+    int i;
     GemState *s = GEM(d);
 
     DB_PRINT("\n");
@@ -968,6 +1024,10 @@ static void gem_reset(DeviceState *d)
     s->regs[GEM_DESCONF5] = 0x002f2145;
     s->regs[GEM_DESCONF6] = 0x00000200;
 
+    for (i = 0; i < 4; i++) {
+        s->sar_active[i] = false;
+    }
+
     gem_phy_reset(s);
 
     gem_update_int_status(s);
@@ -1069,19 +1129,21 @@ static void gem_write(void *opaque, hwaddr offset, uint64_t val,
 
     /* Squash bits which are read only in write value */
     val &= ~(s->regs_ro[offset]);
-    /* Preserve (only) bits which are read only in register */
-    readonly = s->regs[offset];
-    readonly &= s->regs_ro[offset];
-
-    /* Squash bits which are write 1 to clear */
-    val &= ~(s->regs_w1c[offset] & val);
+    /* Preserve (only) bits which are read only and wtc in register */
+    readonly = s->regs[offset] & (s->regs_ro[offset] | s->regs_w1c[offset]);
 
     /* Copy register write to backing store */
-    s->regs[offset] = val | readonly;
+    s->regs[offset] = (val & ~s->regs_w1c[offset]) | readonly;
+
+    /* do w1c */
+    s->regs[offset] &= ~(s->regs_w1c[offset] & val);
 
     /* Handle register write side effects */
     switch (offset) {
     case GEM_NWCTRL:
+        if (val & GEM_NWCTRL_RXENA) {
+            gem_get_rx_desc(s);
+        }
         if (val & GEM_NWCTRL_TXSTART) {
             gem_transmit(s);
         }
@@ -1089,7 +1151,7 @@ static void gem_write(void *opaque, hwaddr offset, uint64_t val,
             /* Reset to start of Q when transmit disabled. */
             s->tx_desc_addr = s->regs[GEM_TXQBASE];
         }
-        if (val & GEM_NWCTRL_RXENA) {
+        if (gem_can_receive(qemu_get_queue(s->nic))) {
             qemu_flush_queued_packets(qemu_get_queue(s->nic));
         }
         break;
@@ -1114,6 +1176,18 @@ static void gem_write(void *opaque, hwaddr offset, uint64_t val,
         s->regs[GEM_IMR] |= val;
         gem_update_int_status(s);
         break;
+    case GEM_SPADDR1LO:
+    case GEM_SPADDR2LO:
+    case GEM_SPADDR3LO:
+    case GEM_SPADDR4LO:
+        s->sar_active[(offset - GEM_SPADDR1LO) / 2] = false;
+        break;
+    case GEM_SPADDR1HI:
+    case GEM_SPADDR2HI:
+    case GEM_SPADDR3HI:
+    case GEM_SPADDR4HI:
+        s->sar_active[(offset - GEM_SPADDR1HI) / 2] = true;
+        break;
     case GEM_PHYMNTNC:
         if (val & GEM_PHYMNTNC_OP_W) {
             uint32_t phy_addr, reg_num;
@@ -1181,15 +1255,17 @@ static int gem_init(SysBusDevice *sbd)
 
 static const VMStateDescription vmstate_cadence_gem = {
     .name = "cadence_gem",
-    .version_id = 1,
-    .minimum_version_id = 1,
-    .minimum_version_id_old = 1,
+    .version_id = 2,
+    .minimum_version_id = 2,
+    .minimum_version_id_old = 2,
     .fields      = (VMStateField[]) {
         VMSTATE_UINT32_ARRAY(regs, GemState, GEM_MAXREG),
         VMSTATE_UINT16_ARRAY(phy_regs, GemState, 32),
         VMSTATE_UINT8(phy_loop, GemState),
         VMSTATE_UINT32(rx_desc_addr, GemState),
         VMSTATE_UINT32(tx_desc_addr, GemState),
+        VMSTATE_BOOL_ARRAY(sar_active, GemState, 4),
+        VMSTATE_END_OF_LIST(),
     }
 };
 
diff --git a/hw/timer/Makefile.objs b/hw/timer/Makefile.objs
index eca590570e..3ae091c95e 100644
--- a/hw/timer/Makefile.objs
+++ b/hw/timer/Makefile.objs
@@ -1,5 +1,6 @@
 common-obj-$(CONFIG_ARM_TIMER) += arm_timer.o
 common-obj-$(CONFIG_ARM_MPTIMER) += arm_mptimer.o
+common-obj-$(CONFIG_A9_GTIMER) += a9gtimer.o
 common-obj-$(CONFIG_CADENCE) += cadence_ttc.o
 common-obj-$(CONFIG_DS1338) += ds1338.o
 common-obj-$(CONFIG_HPET) += hpet.o
diff --git a/hw/timer/a9gtimer.c b/hw/timer/a9gtimer.c
new file mode 100644
index 0000000000..a0656d58a1
--- /dev/null
+++ b/hw/timer/a9gtimer.c
@@ -0,0 +1,369 @@
+/*
+ * Global peripheral timer block for ARM A9MP
+ *
+ * (C) 2013 Xilinx Inc.
+ *
+ * Written by François LEGAL
+ * Written by Peter Crosthwaite <peter.crosthwaite@xilinx.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "hw/timer/a9gtimer.h"
+#include "qemu/timer.h"
+#include "qemu/bitops.h"
+#include "qemu/log.h"
+
+#ifndef A9_GTIMER_ERR_DEBUG
+#define A9_GTIMER_ERR_DEBUG 0
+#endif
+
+#define DB_PRINT_L(level, ...) do { \
+    if (A9_GTIMER_ERR_DEBUG > (level)) { \
+        fprintf(stderr,  ": %s: ", __func__); \
+        fprintf(stderr, ## __VA_ARGS__); \
+    } \
+} while (0);
+
+#define DB_PRINT(...) DB_PRINT_L(0, ## __VA_ARGS__)
+
+static inline int a9_gtimer_get_current_cpu(A9GTimerState *s)
+{
+    if (current_cpu->cpu_index >= s->num_cpu) {
+        hw_error("a9gtimer: num-cpu %d but this cpu is %d!\n",
+                 s->num_cpu, current_cpu->cpu_index);
+    }
+    return current_cpu->cpu_index;
+}
+
+static inline uint64_t a9_gtimer_get_conv(A9GTimerState *s)
+{
+    uint64_t prescale = extract32(s->control, R_CONTROL_PRESCALER_SHIFT,
+                                  R_CONTROL_PRESCALER_LEN);
+
+    return (prescale + 1) * 10;
+}
+
+static A9GTimerUpdate a9_gtimer_get_update(A9GTimerState *s)
+{
+    A9GTimerUpdate ret;
+
+    ret.now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
+    ret.new = s->ref_counter +
+              (ret.now - s->cpu_ref_time) / a9_gtimer_get_conv(s);
+    return ret;
+}
+
+static void a9_gtimer_update(A9GTimerState *s, bool sync)
+{
+
+    A9GTimerUpdate update = a9_gtimer_get_update(s);
+    int i;
+    int64_t next_cdiff = 0;
+
+    for (i = 0; i < s->num_cpu; ++i) {
+        A9GTimerPerCPU *gtb = &s->per_cpu[i];
+        int64_t cdiff = 0;
+
+        if ((s->control & R_CONTROL_TIMER_ENABLE) &&
+                (gtb->control & R_CONTROL_COMP_ENABLE)) {
+            /* R2p0+, where the compare function is >= */
+            while (gtb->compare < update.new) {
+                DB_PRINT("Compare event happened for CPU %d\n", i);
+                gtb->status = 1;
+                if (gtb->control & R_CONTROL_AUTO_INCREMENT) {
+                    DB_PRINT("Auto incrementing timer compare by %" PRId32 "\n",
+                             gtb->inc);
+                    gtb->compare += gtb->inc;
+                } else {
+                    break;
+                }
+            }
+            cdiff = (int64_t)gtb->compare - (int64_t)update.new + 1;
+            if (cdiff > 0 && (cdiff < next_cdiff || !next_cdiff)) {
+                next_cdiff = cdiff;
+            }
+        }
+
+        qemu_set_irq(gtb->irq,
+                     gtb->status && (gtb->control & R_CONTROL_IRQ_ENABLE));
+    }
+
+    timer_del(s->timer);
+    if (next_cdiff) {
+        DB_PRINT("scheduling qemu_timer to fire again in %"
+                 PRIx64 " cycles\n", next_cdiff);
+        timer_mod(s->timer, update.now + next_cdiff * a9_gtimer_get_conv(s));
+    }
+
+    if (s->control & R_CONTROL_TIMER_ENABLE) {
+        s->counter = update.new;
+    }
+
+    if (sync) {
+        s->cpu_ref_time = update.now;
+        s->ref_counter = s->counter;
+    }
+}
+
+static void a9_gtimer_update_no_sync(void *opaque)
+{
+    A9GTimerState *s = A9_GTIMER(opaque);
+
+    return a9_gtimer_update(s, false);
+}
+
+static uint64_t a9_gtimer_read(void *opaque, hwaddr addr, unsigned size)
+{
+    A9GTimerPerCPU *gtb = (A9GTimerPerCPU *)opaque;
+    A9GTimerState *s = gtb->parent;
+    A9GTimerUpdate update;
+    uint64_t ret = 0;
+    int shift = 0;
+
+    switch (addr) {
+    case R_COUNTER_HI:
+        shift = 32;
+        /* fallthrough */
+    case R_COUNTER_LO:
+        update = a9_gtimer_get_update(s);
+        ret = extract64(update.new, shift, 32);
+        break;
+    case R_CONTROL:
+        ret = s->control | gtb->control;
+        break;
+    case R_INTERRUPT_STATUS:
+        ret = gtb->status;
+        break;
+    case R_COMPARATOR_HI:
+        shift = 32;
+        /* fallthrough */
+    case R_COMPARATOR_LO:
+        ret = extract64(gtb->compare, shift, 32);
+        break;
+    case R_AUTO_INCREMENT:
+        ret =  gtb->inc;
+        break;
+    default:
+        qemu_log_mask(LOG_GUEST_ERROR, "bad a9gtimer register: %x\n",
+                      (unsigned)addr);
+        return 0;
+    }
+
+    DB_PRINT("addr:%#x data:%#08" PRIx64 "\n", (unsigned)addr, ret);
+    return ret;
+}
+
+static void a9_gtimer_write(void *opaque, hwaddr addr, uint64_t value,
+                            unsigned size)
+{
+    A9GTimerPerCPU *gtb = (A9GTimerPerCPU *)opaque;
+    A9GTimerState *s = gtb->parent;
+    int shift = 0;
+
+    DB_PRINT("addr:%#x data:%#08" PRIx64 "\n", (unsigned)addr, value);
+
+    switch (addr) {
+    case R_COUNTER_HI:
+        shift = 32;
+        /* fallthrough */
+    case R_COUNTER_LO:
+        /*
+         * Keep it simple - ARM docco explicitly says to disable timer before
+         * modding it, so dont bother trying to do all the difficult on the fly
+         * timer modifications - (if they even work in real hardware??).
+         */
+        if (s->control & R_CONTROL_TIMER_ENABLE) {
+            qemu_log_mask(LOG_GUEST_ERROR, "Cannot mod running ARM gtimer\n");
+            return;
+        }
+        s->counter = deposit64(s->counter, shift, 32, value);
+        return;
+    case R_CONTROL:
+        a9_gtimer_update(s, (value ^ s->control) & R_CONTROL_NEEDS_SYNC);
+        gtb->control = value & R_CONTROL_BANKED;
+        s->control = value & ~R_CONTROL_BANKED;
+        break;
+    case R_INTERRUPT_STATUS:
+        a9_gtimer_update(s, false);
+        gtb->status &= ~value;
+        break;
+    case R_COMPARATOR_HI:
+        shift = 32;
+        /* fallthrough */
+    case R_COMPARATOR_LO:
+        a9_gtimer_update(s, false);
+        gtb->compare = deposit64(gtb->compare, shift, 32, value);
+        break;
+    case R_AUTO_INCREMENT:
+        gtb->inc = value;
+        return;
+    default:
+        return;
+    }
+
+    a9_gtimer_update(s, false);
+}
+
+/* Wrapper functions to implement the "read global timer for
+ * the current CPU" memory regions.
+ */
+static uint64_t a9_gtimer_this_read(void *opaque, hwaddr addr,
+                                    unsigned size)
+{
+    A9GTimerState *s = A9_GTIMER(opaque);
+    int id = a9_gtimer_get_current_cpu(s);
+
+    /* no \n so concatenates with message from read fn */
+    DB_PRINT("CPU:%d:", id);
+
+    return a9_gtimer_read(&s->per_cpu[id], addr, size);
+}
+
+static void a9_gtimer_this_write(void *opaque, hwaddr addr,
+                                 uint64_t value, unsigned size)
+{
+    A9GTimerState *s = A9_GTIMER(opaque);
+    int id = a9_gtimer_get_current_cpu(s);
+
+    /* no \n so concatenates with message from write fn */
+    DB_PRINT("CPU:%d:", id);
+
+    a9_gtimer_write(&s->per_cpu[id], addr, value, size);
+}
+
+static const MemoryRegionOps a9_gtimer_this_ops = {
+    .read = a9_gtimer_this_read,
+    .write = a9_gtimer_this_write,
+    .valid = {
+        .min_access_size = 4,
+        .max_access_size = 4,
+    },
+    .endianness = DEVICE_NATIVE_ENDIAN,
+};
+
+static const MemoryRegionOps a9_gtimer_ops = {
+    .read = a9_gtimer_read,
+    .write = a9_gtimer_write,
+    .valid = {
+        .min_access_size = 4,
+        .max_access_size = 4,
+    },
+    .endianness = DEVICE_NATIVE_ENDIAN,
+};
+
+static void a9_gtimer_reset(DeviceState *dev)
+{
+    A9GTimerState *s = A9_GTIMER(dev);
+    int i;
+
+    s->counter = 0;
+    s->control = 0;
+
+    for (i = 0; i < s->num_cpu; i++) {
+        A9GTimerPerCPU *gtb = &s->per_cpu[i];
+
+        gtb->control = 0;
+        gtb->status = 0;
+        gtb->compare = 0;
+        gtb->inc = 0;
+    }
+    a9_gtimer_update(s, false);
+}
+
+static void a9_gtimer_realize(DeviceState *dev, Error **errp)
+{
+    A9GTimerState *s = A9_GTIMER(dev);
+    SysBusDevice *sbd = SYS_BUS_DEVICE(dev);
+    int i;
+
+    if (s->num_cpu < 1 || s->num_cpu > A9_GTIMER_MAX_CPUS) {
+        error_setg(errp, "%s: num-cpu must be between 1 and %d\n",
+                   __func__, A9_GTIMER_MAX_CPUS);
+        return;
+    }
+
+    memory_region_init_io(&s->iomem, OBJECT(dev), &a9_gtimer_this_ops, s,
+                          "a9gtimer shared", 0x20);
+    sysbus_init_mmio(sbd, &s->iomem);
+    s->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, a9_gtimer_update_no_sync, s);
+
+    for (i = 0; i < s->num_cpu; i++) {
+        A9GTimerPerCPU *gtb = &s->per_cpu[i];
+
+        gtb->parent = s;
+        sysbus_init_irq(sbd, &gtb->irq);
+        memory_region_init_io(&gtb->iomem, OBJECT(dev), &a9_gtimer_ops, gtb,
+                              "a9gtimer per cpu", 0x20);
+        sysbus_init_mmio(sbd, &gtb->iomem);
+    }
+}
+
+static const VMStateDescription vmstate_a9_gtimer_per_cpu = {
+    .name = "arm.cortex-a9-global-timer.percpu",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT32(control, A9GTimerPerCPU),
+        VMSTATE_UINT64(compare, A9GTimerPerCPU),
+        VMSTATE_UINT32(status, A9GTimerPerCPU),
+        VMSTATE_UINT32(inc, A9GTimerPerCPU),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+static const VMStateDescription vmstate_a9_gtimer = {
+    .name = "arm.cortex-a9-global-timer",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .fields = (VMStateField[]) {
+        VMSTATE_TIMER(timer, A9GTimerState),
+        VMSTATE_UINT64(counter, A9GTimerState),
+        VMSTATE_UINT64(ref_counter, A9GTimerState),
+        VMSTATE_UINT64(cpu_ref_time, A9GTimerState),
+        VMSTATE_STRUCT_VARRAY_UINT32(per_cpu, A9GTimerState, num_cpu,
+                                     1, vmstate_a9_gtimer_per_cpu,
+                                     A9GTimerPerCPU),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+static Property a9_gtimer_properties[] = {
+    DEFINE_PROP_UINT32("num-cpu", A9GTimerState, num_cpu, 0),
+    DEFINE_PROP_END_OF_LIST()
+};
+
+static void a9_gtimer_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+
+    dc->realize = a9_gtimer_realize;
+    dc->vmsd = &vmstate_a9_gtimer;
+    dc->reset = a9_gtimer_reset;
+    dc->props = a9_gtimer_properties;
+}
+
+static const TypeInfo a9_gtimer_info = {
+    .name          = TYPE_A9_GTIMER,
+    .parent        = TYPE_SYS_BUS_DEVICE,
+    .instance_size = sizeof(A9GTimerState),
+    .class_init    = a9_gtimer_class_init,
+};
+
+static void a9_gtimer_register_types(void)
+{
+    type_register_static(&a9_gtimer_info);
+}
+
+type_init(a9_gtimer_register_types)
diff --git a/include/fpu/softfloat.h b/include/fpu/softfloat.h
index f3927e2419..2365274daa 100644
--- a/include/fpu/softfloat.h
+++ b/include/fpu/softfloat.h
@@ -302,6 +302,8 @@ int float32_compare( float32, float32 STATUS_PARAM );
 int float32_compare_quiet( float32, float32 STATUS_PARAM );
 float32 float32_min(float32, float32 STATUS_PARAM);
 float32 float32_max(float32, float32 STATUS_PARAM);
+float32 float32_minnum(float32, float32 STATUS_PARAM);
+float32 float32_maxnum(float32, float32 STATUS_PARAM);
 int float32_is_quiet_nan( float32 );
 int float32_is_signaling_nan( float32 );
 float32 float32_maybe_silence_nan( float32 );
@@ -408,6 +410,8 @@ int float64_compare( float64, float64 STATUS_PARAM );
 int float64_compare_quiet( float64, float64 STATUS_PARAM );
 float64 float64_min(float64, float64 STATUS_PARAM);
 float64 float64_max(float64, float64 STATUS_PARAM);
+float64 float64_minnum(float64, float64 STATUS_PARAM);
+float64 float64_maxnum(float64, float64 STATUS_PARAM);
 int float64_is_quiet_nan( float64 a );
 int float64_is_signaling_nan( float64 );
 float64 float64_maybe_silence_nan( float64 );
diff --git a/include/hw/arm/arm.h b/include/hw/arm/arm.h
index ecbbba871e..cbbf4ca4cb 100644
--- a/include/hw/arm/arm.h
+++ b/include/hw/arm/arm.h
@@ -50,6 +50,13 @@ struct arm_boot_info {
                                  const struct arm_boot_info *info);
     void (*secondary_cpu_reset_hook)(ARMCPU *cpu,
                                      const struct arm_boot_info *info);
+    /* if a board is able to create a dtb without a dtb file then it
+     * sets get_dtb. This will only be used if no dtb file is provided
+     * by the user. On success, sets *size to the length of the created
+     * dtb, and returns a pointer to it. (The caller must free this memory
+     * with g_free() when it has finished with it.) On failure, returns NULL.
+     */
+    void *(*get_dtb)(const struct arm_boot_info *info, int *size);
     /* if a board needs to be able to modify a device tree provided by
      * the user it should implement this hook.
      */
diff --git a/include/hw/cpu/a9mpcore.h b/include/hw/cpu/a9mpcore.h
index 010489b98e..5d67ca22c4 100644
--- a/include/hw/cpu/a9mpcore.h
+++ b/include/hw/cpu/a9mpcore.h
@@ -14,6 +14,7 @@
 #include "hw/intc/arm_gic.h"
 #include "hw/misc/a9scu.h"
 #include "hw/timer/arm_mptimer.h"
+#include "hw/timer/a9gtimer.h"
 
 #define TYPE_A9MPCORE_PRIV "a9mpcore_priv"
 #define A9MPCORE_PRIV(obj) \
@@ -28,8 +29,9 @@ typedef struct A9MPPrivState {
     MemoryRegion container;
     uint32_t num_irq;
 
-    GICState gic;
     A9SCUState scu;
+    GICState gic;
+    A9GTimerState gtimer;
     ARMMPTimerState mptimer;
     ARMMPTimerState wdt;
 } A9MPPrivState;
diff --git a/include/hw/timer/a9gtimer.h b/include/hw/timer/a9gtimer.h
new file mode 100644
index 0000000000..b88c02a6ef
--- /dev/null
+++ b/include/hw/timer/a9gtimer.h
@@ -0,0 +1,97 @@
+/*
+ * Global peripheral timer block for ARM A9MP
+ *
+ * (C) 2013 Xilinx Inc.
+ *
+ * Written by François LEGAL
+ * Written by Peter Crosthwaite <peter.crosthwaite@xilinx.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef HW_TIMER_A9_GTIMER_H_H
+#define HW_TIMER_A9_GTIMER_H_H
+
+#include "hw/sysbus.h"
+
+#define A9_GTIMER_MAX_CPUS 4
+
+#define TYPE_A9_GTIMER "arm.cortex-a9-global-timer"
+#define A9_GTIMER(obj) OBJECT_CHECK(A9GTimerState, (obj), TYPE_A9_GTIMER)
+
+#define R_COUNTER_LO                0x00
+#define R_COUNTER_HI                0x04
+
+#define R_CONTROL                   0x08
+#define R_CONTROL_TIMER_ENABLE      (1 << 0)
+#define R_CONTROL_COMP_ENABLE       (1 << 1)
+#define R_CONTROL_IRQ_ENABLE        (1 << 2)
+#define R_CONTROL_AUTO_INCREMENT    (1 << 2)
+#define R_CONTROL_PRESCALER_SHIFT   8
+#define R_CONTROL_PRESCALER_LEN     8
+#define R_CONTROL_PRESCALER_MASK    (((1 << R_CONTROL_PRESCALER_LEN) - 1) << \
+                                     R_CONTROL_PRESCALER_SHIFT)
+
+#define R_CONTROL_BANKED            (R_CONTROL_COMP_ENABLE | \
+                                     R_CONTROL_IRQ_ENABLE | \
+                                     R_CONTROL_AUTO_INCREMENT)
+#define R_CONTROL_NEEDS_SYNC        (R_CONTROL_TIMER_ENABLE | \
+                                     R_CONTROL_PRESCALER_MASK)
+
+#define R_INTERRUPT_STATUS          0x0C
+#define R_COMPARATOR_LO             0x10
+#define R_COMPARATOR_HI             0x14
+#define R_AUTO_INCREMENT            0x18
+
+typedef struct A9GTimerPerCPU A9GTimerPerCPU;
+typedef struct A9GTimerState A9GTimerState;
+
+struct A9GTimerPerCPU {
+    A9GTimerState *parent;
+
+    uint32_t control; /* only per cpu banked bits valid */
+    uint64_t compare;
+    uint32_t status;
+    uint32_t inc;
+
+    MemoryRegion iomem;
+    qemu_irq irq; /* PPI interrupts */
+};
+
+struct A9GTimerState {
+    /*< private >*/
+    SysBusDevice parent_obj;
+    /*< public >*/
+
+    MemoryRegion iomem;
+    /* static props */
+    uint32_t num_cpu;
+
+    QEMUTimer *timer;
+
+    uint64_t counter; /* current timer value */
+
+    uint64_t ref_counter;
+    uint64_t cpu_ref_time; /* the cpu time as of last update of ref_counter */
+    uint32_t control; /* only non per cpu banked bits valid */
+
+    A9GTimerPerCPU per_cpu[A9_GTIMER_MAX_CPUS];
+};
+
+typedef struct A9GTimerUpdate {
+    uint64_t now;
+    uint64_t new;
+} A9GTimerUpdate;
+
+#endif /* #ifdef HW_TIMER_A9_GTIMER_H_H */
diff --git a/libcacard/vscclient.c b/libcacard/vscclient.c
index a3cb7762b5..f1d46d397a 100644
--- a/libcacard/vscclient.c
+++ b/libcacard/vscclient.c
@@ -407,7 +407,7 @@ do_socket_read(GIOChannel *source,
             }
             break;
         default:
-            g_warn_if_reached();
+            g_assert_not_reached();
             return FALSE;
         }
 
@@ -760,7 +760,7 @@ main(
 
     g_io_channel_unref(channel_stdin);
     g_io_channel_unref(channel_socket);
-    g_byte_array_unref(socket_to_send);
+    g_byte_array_free(socket_to_send, TRUE);
 
     closesocket(sock);
     return 0;
diff --git a/target-arm/cpu-qom.h b/target-arm/cpu-qom.h
index b55306a3c3..f32178a9db 100644
--- a/target-arm/cpu-qom.h
+++ b/target-arm/cpu-qom.h
@@ -91,6 +91,17 @@ typedef struct ARMCPU {
     /* GPIO outputs for generic timer */
     qemu_irq gt_timer_outputs[NUM_GTIMERS];
 
+    /* 'compatible' string for this CPU for Linux device trees */
+    const char *dtb_compatible;
+
+    /* Should CPU start in PSCI powered-off state? */
+    bool start_powered_off;
+
+    /* [QEMU_]KVM_ARM_TARGET_* constant for this CPU, or
+     * QEMU_KVM_ARM_TARGET_NONE if the kernel doesn't support this CPU type.
+     */
+    uint32_t kvm_target;
+
     /* The instance init functions for implementation-specific subclasses
      * set these fields to specify the implementation-dependent values of
      * various constant registers and reset values of non-constant
diff --git a/target-arm/cpu.c b/target-arm/cpu.c
index d40f2a7a4f..0635e78ec2 100644
--- a/target-arm/cpu.c
+++ b/target-arm/cpu.c
@@ -20,6 +20,7 @@
 
 #include "cpu.h"
 #include "qemu-common.h"
+#include "hw/qdev-properties.h"
 #if !defined(CONFIG_USER_ONLY)
 #include "hw/loader.h"
 #endif
@@ -217,6 +218,13 @@ static void arm_cpu_initfn(Object *obj)
                        ARRAY_SIZE(cpu->gt_timer_outputs));
 #endif
 
+    /* DTB consumers generally don't in fact care what the 'compatible'
+     * string is, so always provide some string and trust that a hypothetical
+     * picky DTB consumer will also provide a helpful error message.
+     */
+    cpu->dtb_compatible = "qemu,unknown";
+    cpu->kvm_target = QEMU_KVM_ARM_TARGET_NONE;
+
     if (tcg_enabled() && !inited) {
         inited = true;
         arm_translate_init();
@@ -318,6 +326,8 @@ static ObjectClass *arm_cpu_class_by_name(const char *cpu_model)
 static void arm926_initfn(Object *obj)
 {
     ARMCPU *cpu = ARM_CPU(obj);
+
+    cpu->dtb_compatible = "arm,arm926";
     set_feature(&cpu->env, ARM_FEATURE_V5);
     set_feature(&cpu->env, ARM_FEATURE_VFP);
     set_feature(&cpu->env, ARM_FEATURE_DUMMY_C15_REGS);
@@ -331,6 +341,8 @@ static void arm926_initfn(Object *obj)
 static void arm946_initfn(Object *obj)
 {
     ARMCPU *cpu = ARM_CPU(obj);
+
+    cpu->dtb_compatible = "arm,arm946";
     set_feature(&cpu->env, ARM_FEATURE_V5);
     set_feature(&cpu->env, ARM_FEATURE_MPU);
     set_feature(&cpu->env, ARM_FEATURE_DUMMY_C15_REGS);
@@ -342,6 +354,8 @@ static void arm946_initfn(Object *obj)
 static void arm1026_initfn(Object *obj)
 {
     ARMCPU *cpu = ARM_CPU(obj);
+
+    cpu->dtb_compatible = "arm,arm1026";
     set_feature(&cpu->env, ARM_FEATURE_V5);
     set_feature(&cpu->env, ARM_FEATURE_VFP);
     set_feature(&cpu->env, ARM_FEATURE_AUXCR);
@@ -374,6 +388,8 @@ static void arm1136_r2_initfn(Object *obj)
      * for 1136_r2 (in particular r0p2 does not actually implement most
      * of the ID registers).
      */
+
+    cpu->dtb_compatible = "arm,arm1136";
     set_feature(&cpu->env, ARM_FEATURE_V6);
     set_feature(&cpu->env, ARM_FEATURE_VFP);
     set_feature(&cpu->env, ARM_FEATURE_DUMMY_C15_REGS);
@@ -403,6 +419,8 @@ static void arm1136_r2_initfn(Object *obj)
 static void arm1136_initfn(Object *obj)
 {
     ARMCPU *cpu = ARM_CPU(obj);
+
+    cpu->dtb_compatible = "arm,arm1136";
     set_feature(&cpu->env, ARM_FEATURE_V6K);
     set_feature(&cpu->env, ARM_FEATURE_V6);
     set_feature(&cpu->env, ARM_FEATURE_VFP);
@@ -433,6 +451,8 @@ static void arm1136_initfn(Object *obj)
 static void arm1176_initfn(Object *obj)
 {
     ARMCPU *cpu = ARM_CPU(obj);
+
+    cpu->dtb_compatible = "arm,arm1176";
     set_feature(&cpu->env, ARM_FEATURE_V6K);
     set_feature(&cpu->env, ARM_FEATURE_VFP);
     set_feature(&cpu->env, ARM_FEATURE_VAPA);
@@ -463,6 +483,8 @@ static void arm1176_initfn(Object *obj)
 static void arm11mpcore_initfn(Object *obj)
 {
     ARMCPU *cpu = ARM_CPU(obj);
+
+    cpu->dtb_compatible = "arm,arm11mpcore";
     set_feature(&cpu->env, ARM_FEATURE_V6K);
     set_feature(&cpu->env, ARM_FEATURE_VFP);
     set_feature(&cpu->env, ARM_FEATURE_VAPA);
@@ -516,6 +538,8 @@ static const ARMCPRegInfo cortexa8_cp_reginfo[] = {
 static void cortex_a8_initfn(Object *obj)
 {
     ARMCPU *cpu = ARM_CPU(obj);
+
+    cpu->dtb_compatible = "arm,cortex-a8";
     set_feature(&cpu->env, ARM_FEATURE_V7);
     set_feature(&cpu->env, ARM_FEATURE_VFP3);
     set_feature(&cpu->env, ARM_FEATURE_NEON);
@@ -580,6 +604,8 @@ static const ARMCPRegInfo cortexa9_cp_reginfo[] = {
 static void cortex_a9_initfn(Object *obj)
 {
     ARMCPU *cpu = ARM_CPU(obj);
+
+    cpu->dtb_compatible = "arm,cortex-a9";
     set_feature(&cpu->env, ARM_FEATURE_V7);
     set_feature(&cpu->env, ARM_FEATURE_VFP3);
     set_feature(&cpu->env, ARM_FEATURE_VFP_FP16);
@@ -649,6 +675,8 @@ static const ARMCPRegInfo cortexa15_cp_reginfo[] = {
 static void cortex_a15_initfn(Object *obj)
 {
     ARMCPU *cpu = ARM_CPU(obj);
+
+    cpu->dtb_compatible = "arm,cortex-a15";
     set_feature(&cpu->env, ARM_FEATURE_V7);
     set_feature(&cpu->env, ARM_FEATURE_VFP4);
     set_feature(&cpu->env, ARM_FEATURE_VFP_FP16);
@@ -658,6 +686,7 @@ static void cortex_a15_initfn(Object *obj)
     set_feature(&cpu->env, ARM_FEATURE_GENERIC_TIMER);
     set_feature(&cpu->env, ARM_FEATURE_DUMMY_C15_REGS);
     set_feature(&cpu->env, ARM_FEATURE_LPAE);
+    cpu->kvm_target = QEMU_KVM_ARM_TARGET_CORTEX_A15;
     cpu->midr = 0x412fc0f1;
     cpu->reset_fpsid = 0x410430f0;
     cpu->mvfr0 = 0x10110222;
@@ -697,6 +726,8 @@ static void ti925t_initfn(Object *obj)
 static void sa1100_initfn(Object *obj)
 {
     ARMCPU *cpu = ARM_CPU(obj);
+
+    cpu->dtb_compatible = "intel,sa1100";
     set_feature(&cpu->env, ARM_FEATURE_STRONGARM);
     set_feature(&cpu->env, ARM_FEATURE_DUMMY_C15_REGS);
     cpu->midr = 0x4401A11B;
@@ -715,6 +746,8 @@ static void sa1110_initfn(Object *obj)
 static void pxa250_initfn(Object *obj)
 {
     ARMCPU *cpu = ARM_CPU(obj);
+
+    cpu->dtb_compatible = "marvell,xscale";
     set_feature(&cpu->env, ARM_FEATURE_V5);
     set_feature(&cpu->env, ARM_FEATURE_XSCALE);
     cpu->midr = 0x69052100;
@@ -725,6 +758,8 @@ static void pxa250_initfn(Object *obj)
 static void pxa255_initfn(Object *obj)
 {
     ARMCPU *cpu = ARM_CPU(obj);
+
+    cpu->dtb_compatible = "marvell,xscale";
     set_feature(&cpu->env, ARM_FEATURE_V5);
     set_feature(&cpu->env, ARM_FEATURE_XSCALE);
     cpu->midr = 0x69052d00;
@@ -735,6 +770,8 @@ static void pxa255_initfn(Object *obj)
 static void pxa260_initfn(Object *obj)
 {
     ARMCPU *cpu = ARM_CPU(obj);
+
+    cpu->dtb_compatible = "marvell,xscale";
     set_feature(&cpu->env, ARM_FEATURE_V5);
     set_feature(&cpu->env, ARM_FEATURE_XSCALE);
     cpu->midr = 0x69052903;
@@ -745,6 +782,8 @@ static void pxa260_initfn(Object *obj)
 static void pxa261_initfn(Object *obj)
 {
     ARMCPU *cpu = ARM_CPU(obj);
+
+    cpu->dtb_compatible = "marvell,xscale";
     set_feature(&cpu->env, ARM_FEATURE_V5);
     set_feature(&cpu->env, ARM_FEATURE_XSCALE);
     cpu->midr = 0x69052d05;
@@ -755,6 +794,8 @@ static void pxa261_initfn(Object *obj)
 static void pxa262_initfn(Object *obj)
 {
     ARMCPU *cpu = ARM_CPU(obj);
+
+    cpu->dtb_compatible = "marvell,xscale";
     set_feature(&cpu->env, ARM_FEATURE_V5);
     set_feature(&cpu->env, ARM_FEATURE_XSCALE);
     cpu->midr = 0x69052d06;
@@ -765,6 +806,8 @@ static void pxa262_initfn(Object *obj)
 static void pxa270a0_initfn(Object *obj)
 {
     ARMCPU *cpu = ARM_CPU(obj);
+
+    cpu->dtb_compatible = "marvell,xscale";
     set_feature(&cpu->env, ARM_FEATURE_V5);
     set_feature(&cpu->env, ARM_FEATURE_XSCALE);
     set_feature(&cpu->env, ARM_FEATURE_IWMMXT);
@@ -776,6 +819,8 @@ static void pxa270a0_initfn(Object *obj)
 static void pxa270a1_initfn(Object *obj)
 {
     ARMCPU *cpu = ARM_CPU(obj);
+
+    cpu->dtb_compatible = "marvell,xscale";
     set_feature(&cpu->env, ARM_FEATURE_V5);
     set_feature(&cpu->env, ARM_FEATURE_XSCALE);
     set_feature(&cpu->env, ARM_FEATURE_IWMMXT);
@@ -787,6 +832,8 @@ static void pxa270a1_initfn(Object *obj)
 static void pxa270b0_initfn(Object *obj)
 {
     ARMCPU *cpu = ARM_CPU(obj);
+
+    cpu->dtb_compatible = "marvell,xscale";
     set_feature(&cpu->env, ARM_FEATURE_V5);
     set_feature(&cpu->env, ARM_FEATURE_XSCALE);
     set_feature(&cpu->env, ARM_FEATURE_IWMMXT);
@@ -798,6 +845,8 @@ static void pxa270b0_initfn(Object *obj)
 static void pxa270b1_initfn(Object *obj)
 {
     ARMCPU *cpu = ARM_CPU(obj);
+
+    cpu->dtb_compatible = "marvell,xscale";
     set_feature(&cpu->env, ARM_FEATURE_V5);
     set_feature(&cpu->env, ARM_FEATURE_XSCALE);
     set_feature(&cpu->env, ARM_FEATURE_IWMMXT);
@@ -809,6 +858,8 @@ static void pxa270b1_initfn(Object *obj)
 static void pxa270c0_initfn(Object *obj)
 {
     ARMCPU *cpu = ARM_CPU(obj);
+
+    cpu->dtb_compatible = "marvell,xscale";
     set_feature(&cpu->env, ARM_FEATURE_V5);
     set_feature(&cpu->env, ARM_FEATURE_XSCALE);
     set_feature(&cpu->env, ARM_FEATURE_IWMMXT);
@@ -820,6 +871,8 @@ static void pxa270c0_initfn(Object *obj)
 static void pxa270c5_initfn(Object *obj)
 {
     ARMCPU *cpu = ARM_CPU(obj);
+
+    cpu->dtb_compatible = "marvell,xscale";
     set_feature(&cpu->env, ARM_FEATURE_V5);
     set_feature(&cpu->env, ARM_FEATURE_XSCALE);
     set_feature(&cpu->env, ARM_FEATURE_IWMMXT);
@@ -894,6 +947,11 @@ static const ARMCPUInfo arm_cpus[] = {
 #endif
 };
 
+static Property arm_cpu_properties[] = {
+    DEFINE_PROP_BOOL("start-powered-off", ARMCPU, start_powered_off, false),
+    DEFINE_PROP_END_OF_LIST()
+};
+
 static void arm_cpu_class_init(ObjectClass *oc, void *data)
 {
     ARMCPUClass *acc = ARM_CPU_CLASS(oc);
@@ -902,6 +960,7 @@ static void arm_cpu_class_init(ObjectClass *oc, void *data)
 
     acc->parent_realize = dc->realize;
     dc->realize = arm_cpu_realizefn;
+    dc->props = arm_cpu_properties;
 
     acc->parent_reset = cc->reset;
     cc->reset = arm_cpu_reset;
diff --git a/target-arm/cpu.h b/target-arm/cpu.h
index 9f110f15b6..c3f007fc53 100644
--- a/target-arm/cpu.h
+++ b/target-arm/cpu.h
@@ -21,6 +21,8 @@
 
 #include "config.h"
 
+#include "kvm-consts.h"
+
 #if defined(TARGET_AARCH64)
   /* AArch64 definitions */
 #  define TARGET_LONG_BITS 64
@@ -497,17 +499,6 @@ void armv7m_nvic_complete_irq(void *opaque, int irq);
     (((cp) << 16) | ((is64) << 15) | ((crn) << 11) |    \
      ((crm) << 7) | ((opc1) << 3) | (opc2))
 
-/* Note that these must line up with the KVM/ARM register
- * ID field definitions (kvm.c will check this, but we
- * can't just use the KVM defines here as the kvm headers
- * are unavailable to non-KVM-specific files)
- */
-#define CP_REG_SIZE_SHIFT 52
-#define CP_REG_SIZE_MASK       0x00f0000000000000ULL
-#define CP_REG_SIZE_U32        0x0020000000000000ULL
-#define CP_REG_SIZE_U64        0x0030000000000000ULL
-#define CP_REG_ARM             0x4000000000000000ULL
-
 /* Convert a full 64 bit KVM register ID to the truncated 32 bit
  * version used as a key for the coprocessor register hashtable
  */
diff --git a/target-arm/helper.c b/target-arm/helper.c
index 3445813465..5e5e5aad2b 100644
--- a/target-arm/helper.c
+++ b/target-arm/helper.c
@@ -1173,7 +1173,7 @@ static int vmsa_ttbcr_raw_write(CPUARMState *env, const ARMCPRegInfo *ri,
 {
     int maskshift = extract32(value, 0, 3);
 
-    if (arm_feature(env, ARM_FEATURE_LPAE)) {
+    if (arm_feature(env, ARM_FEATURE_LPAE) && (value & (1 << 31))) {
         value &= ~((7 << 19) | (3 << 14) | (0xf << 3));
     } else {
         value &= 7;
@@ -1842,6 +1842,12 @@ void arm_cpu_list(FILE *f, fprintf_function cpu_fprintf)
     (*cpu_fprintf)(f, "Available CPUs:\n");
     g_slist_foreach(list, arm_cpu_list_entry, &s);
     g_slist_free(list);
+#ifdef CONFIG_KVM
+    /* The 'host' CPU type is dynamically registered only if KVM is
+     * enabled, so we have to special-case it here:
+     */
+    (*cpu_fprintf)(f, "  host (only available in KVM mode)\n");
+#endif
 }
 
 static void arm_cpu_add_definition(gpointer data, gpointer user_data)
@@ -4079,3 +4085,28 @@ float64 VFP_HELPER(muladd, d)(float64 a, float64 b, float64 c, void *fpstp)
     float_status *fpst = fpstp;
     return float64_muladd(a, b, c, 0, fpst);
 }
+
+/* ARMv8 VMAXNM/VMINNM */
+float32 VFP_HELPER(maxnm, s)(float32 a, float32 b, void *fpstp)
+{
+    float_status *fpst = fpstp;
+    return float32_maxnum(a, b, fpst);
+}
+
+float64 VFP_HELPER(maxnm, d)(float64 a, float64 b, void *fpstp)
+{
+    float_status *fpst = fpstp;
+    return float64_maxnum(a, b, fpst);
+}
+
+float32 VFP_HELPER(minnm, s)(float32 a, float32 b, void *fpstp)
+{
+    float_status *fpst = fpstp;
+    return float32_minnum(a, b, fpst);
+}
+
+float64 VFP_HELPER(minnm, d)(float64 a, float64 b, void *fpstp)
+{
+    float_status *fpst = fpstp;
+    return float64_minnum(a, b, fpst);
+}
diff --git a/target-arm/helper.h b/target-arm/helper.h
index cac9564f5f..d459a39e46 100644
--- a/target-arm/helper.h
+++ b/target-arm/helper.h
@@ -132,6 +132,11 @@ DEF_HELPER_2(neon_fcvt_f32_to_f16, i32, f32, env)
 DEF_HELPER_4(vfp_muladdd, f64, f64, f64, f64, ptr)
 DEF_HELPER_4(vfp_muladds, f32, f32, f32, f32, ptr)
 
+DEF_HELPER_3(vfp_maxnmd, f64, f64, f64, ptr)
+DEF_HELPER_3(vfp_maxnms, f32, f32, f32, ptr)
+DEF_HELPER_3(vfp_minnmd, f64, f64, f64, ptr)
+DEF_HELPER_3(vfp_minnms, f32, f32, f32, ptr)
+
 DEF_HELPER_3(recps_f32, f32, f32, f32, env)
 DEF_HELPER_3(rsqrts_f32, f32, f32, f32, env)
 DEF_HELPER_2(recpe_f32, f32, f32, env)
diff --git a/target-arm/kvm-consts.h b/target-arm/kvm-consts.h
new file mode 100644
index 0000000000..2bba0bd198
--- /dev/null
+++ b/target-arm/kvm-consts.h
@@ -0,0 +1,64 @@
+/*
+ * KVM ARM ABI constant definitions
+ *
+ * Copyright (c) 2013 Linaro Limited
+ *
+ * Provide versions of KVM constant defines that can be used even
+ * when CONFIG_KVM is not set and we don't have access to the
+ * KVM headers. If CONFIG_KVM is set, we do a compile-time check
+ * that we haven't got out of sync somehow.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+#ifndef ARM_KVM_CONSTS_H
+#define ARM_KVM_CONSTS_H
+
+#ifdef CONFIG_KVM
+#include "qemu/compiler.h"
+#include <linux/kvm.h>
+
+#define MISMATCH_CHECK(X, Y) QEMU_BUILD_BUG_ON(X != Y)
+
+#else
+#define MISMATCH_CHECK(X, Y)
+#endif
+
+#define CP_REG_SIZE_SHIFT 52
+#define CP_REG_SIZE_MASK       0x00f0000000000000ULL
+#define CP_REG_SIZE_U32        0x0020000000000000ULL
+#define CP_REG_SIZE_U64        0x0030000000000000ULL
+#define CP_REG_ARM             0x4000000000000000ULL
+
+MISMATCH_CHECK(CP_REG_SIZE_SHIFT, KVM_REG_SIZE_SHIFT)
+MISMATCH_CHECK(CP_REG_SIZE_MASK, KVM_REG_SIZE_MASK)
+MISMATCH_CHECK(CP_REG_SIZE_U32, KVM_REG_SIZE_U32)
+MISMATCH_CHECK(CP_REG_SIZE_U64, KVM_REG_SIZE_U64)
+MISMATCH_CHECK(CP_REG_ARM, KVM_REG_ARM)
+
+#define PSCI_FN_BASE 0x95c1ba5e
+#define PSCI_FN(n) (PSCI_FN_BASE + (n))
+#define PSCI_FN_CPU_SUSPEND PSCI_FN(0)
+#define PSCI_FN_CPU_OFF PSCI_FN(1)
+#define PSCI_FN_CPU_ON PSCI_FN(2)
+#define PSCI_FN_MIGRATE PSCI_FN(3)
+
+MISMATCH_CHECK(PSCI_FN_CPU_SUSPEND, KVM_PSCI_FN_CPU_SUSPEND)
+MISMATCH_CHECK(PSCI_FN_CPU_OFF, KVM_PSCI_FN_CPU_OFF)
+MISMATCH_CHECK(PSCI_FN_CPU_ON, KVM_PSCI_FN_CPU_ON)
+MISMATCH_CHECK(PSCI_FN_MIGRATE, KVM_PSCI_FN_MIGRATE)
+
+#define QEMU_KVM_ARM_TARGET_CORTEX_A15 0
+
+/* There's no kernel define for this: sentinel value which
+ * matches no KVM target value for either 64 or 32 bit
+ */
+#define QEMU_KVM_ARM_TARGET_NONE UINT_MAX
+
+#ifndef TARGET_AARCH64
+MISMATCH_CHECK(QEMU_KVM_ARM_TARGET_CORTEX_A15, KVM_ARM_TARGET_CORTEX_A15)
+#endif
+
+#undef MISMATCH_CHECK
+
+#endif
diff --git a/target-arm/kvm.c b/target-arm/kvm.c
index 6e5cd36fae..f865dac871 100644
--- a/target-arm/kvm.c
+++ b/target-arm/kvm.c
@@ -23,25 +23,240 @@
 #include "cpu.h"
 #include "hw/arm/arm.h"
 
-/* Check that cpu.h's idea of coprocessor fields matches KVM's */
-#if (CP_REG_SIZE_SHIFT != KVM_REG_SIZE_SHIFT) || \
-    (CP_REG_SIZE_MASK != KVM_REG_SIZE_MASK) ||   \
-    (CP_REG_SIZE_U32 != KVM_REG_SIZE_U32) || \
-    (CP_REG_SIZE_U64 != KVM_REG_SIZE_U64) || \
-    (CP_REG_ARM != KVM_REG_ARM)
-#error mismatch between cpu.h and KVM header definitions
-#endif
-
 const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
     KVM_CAP_LAST_INFO
 };
 
+bool kvm_arm_create_scratch_host_vcpu(const uint32_t *cpus_to_try,
+                                      int *fdarray,
+                                      struct kvm_vcpu_init *init)
+{
+    int ret, kvmfd = -1, vmfd = -1, cpufd = -1;
+
+    kvmfd = qemu_open("/dev/kvm", O_RDWR);
+    if (kvmfd < 0) {
+        goto err;
+    }
+    vmfd = ioctl(kvmfd, KVM_CREATE_VM, 0);
+    if (vmfd < 0) {
+        goto err;
+    }
+    cpufd = ioctl(vmfd, KVM_CREATE_VCPU, 0);
+    if (cpufd < 0) {
+        goto err;
+    }
+
+    ret = ioctl(vmfd, KVM_ARM_PREFERRED_TARGET, init);
+    if (ret >= 0) {
+        ret = ioctl(cpufd, KVM_ARM_VCPU_INIT, init);
+        if (ret < 0) {
+            goto err;
+        }
+    } else {
+        /* Old kernel which doesn't know about the
+         * PREFERRED_TARGET ioctl: we know it will only support
+         * creating one kind of guest CPU which is its preferred
+         * CPU type.
+         */
+        while (*cpus_to_try != QEMU_KVM_ARM_TARGET_NONE) {
+            init->target = *cpus_to_try++;
+            memset(init->features, 0, sizeof(init->features));
+            ret = ioctl(cpufd, KVM_ARM_VCPU_INIT, init);
+            if (ret >= 0) {
+                break;
+            }
+        }
+        if (ret < 0) {
+            goto err;
+        }
+    }
+
+    fdarray[0] = kvmfd;
+    fdarray[1] = vmfd;
+    fdarray[2] = cpufd;
+
+    return true;
+
+err:
+    if (cpufd >= 0) {
+        close(cpufd);
+    }
+    if (vmfd >= 0) {
+        close(vmfd);
+    }
+    if (kvmfd >= 0) {
+        close(kvmfd);
+    }
+
+    return false;
+}
+
+void kvm_arm_destroy_scratch_host_vcpu(int *fdarray)
+{
+    int i;
+
+    for (i = 2; i >= 0; i--) {
+        close(fdarray[i]);
+    }
+}
+
+static inline void set_feature(uint64_t *features, int feature)
+{
+    *features |= 1ULL << feature;
+}
+
+bool kvm_arm_get_host_cpu_features(ARMHostCPUClass *ahcc)
+{
+    /* Identify the feature bits corresponding to the host CPU, and
+     * fill out the ARMHostCPUClass fields accordingly. To do this
+     * we have to create a scratch VM, create a single CPU inside it,
+     * and then query that CPU for the relevant ID registers.
+     */
+    int i, ret, fdarray[3];
+    uint32_t midr, id_pfr0, id_isar0, mvfr1;
+    uint64_t features = 0;
+    /* Old kernels may not know about the PREFERRED_TARGET ioctl: however
+     * we know these will only support creating one kind of guest CPU,
+     * which is its preferred CPU type.
+     */
+    static const uint32_t cpus_to_try[] = {
+        QEMU_KVM_ARM_TARGET_CORTEX_A15,
+        QEMU_KVM_ARM_TARGET_NONE
+    };
+    struct kvm_vcpu_init init;
+    struct kvm_one_reg idregs[] = {
+        {
+            .id = KVM_REG_ARM | KVM_REG_SIZE_U32
+            | ENCODE_CP_REG(15, 0, 0, 0, 0, 0),
+            .addr = (uintptr_t)&midr,
+        },
+        {
+            .id = KVM_REG_ARM | KVM_REG_SIZE_U32
+            | ENCODE_CP_REG(15, 0, 0, 1, 0, 0),
+            .addr = (uintptr_t)&id_pfr0,
+        },
+        {
+            .id = KVM_REG_ARM | KVM_REG_SIZE_U32
+            | ENCODE_CP_REG(15, 0, 0, 2, 0, 0),
+            .addr = (uintptr_t)&id_isar0,
+        },
+        {
+            .id = KVM_REG_ARM | KVM_REG_SIZE_U32
+            | KVM_REG_ARM_VFP | KVM_REG_ARM_VFP_MVFR1,
+            .addr = (uintptr_t)&mvfr1,
+        },
+    };
+
+    if (!kvm_arm_create_scratch_host_vcpu(cpus_to_try, fdarray, &init)) {
+        return false;
+    }
+
+    ahcc->target = init.target;
+
+    /* This is not strictly blessed by the device tree binding docs yet,
+     * but in practice the kernel does not care about this string so
+     * there is no point maintaining an KVM_ARM_TARGET_* -> string table.
+     */
+    ahcc->dtb_compatible = "arm,arm-v7";
+
+    for (i = 0; i < ARRAY_SIZE(idregs); i++) {
+        ret = ioctl(fdarray[2], KVM_GET_ONE_REG, &idregs[i]);
+        if (ret) {
+            break;
+        }
+    }
+
+    kvm_arm_destroy_scratch_host_vcpu(fdarray);
+
+    if (ret) {
+        return false;
+    }
+
+    /* Now we've retrieved all the register information we can
+     * set the feature bits based on the ID register fields.
+     * We can assume any KVM supporting CPU is at least a v7
+     * with VFPv3, LPAE and the generic timers; this in turn implies
+     * most of the other feature bits, but a few must be tested.
+     */
+    set_feature(&features, ARM_FEATURE_V7);
+    set_feature(&features, ARM_FEATURE_VFP3);
+    set_feature(&features, ARM_FEATURE_LPAE);
+    set_feature(&features, ARM_FEATURE_GENERIC_TIMER);
+
+    switch (extract32(id_isar0, 24, 4)) {
+    case 1:
+        set_feature(&features, ARM_FEATURE_THUMB_DIV);
+        break;
+    case 2:
+        set_feature(&features, ARM_FEATURE_ARM_DIV);
+        set_feature(&features, ARM_FEATURE_THUMB_DIV);
+        break;
+    default:
+        break;
+    }
+
+    if (extract32(id_pfr0, 12, 4) == 1) {
+        set_feature(&features, ARM_FEATURE_THUMB2EE);
+    }
+    if (extract32(mvfr1, 20, 4) == 1) {
+        set_feature(&features, ARM_FEATURE_VFP_FP16);
+    }
+    if (extract32(mvfr1, 12, 4) == 1) {
+        set_feature(&features, ARM_FEATURE_NEON);
+    }
+    if (extract32(mvfr1, 28, 4) == 1) {
+        /* FMAC support implies VFPv4 */
+        set_feature(&features, ARM_FEATURE_VFP4);
+    }
+
+    ahcc->features = features;
+
+    return true;
+}
+
+static void kvm_arm_host_cpu_class_init(ObjectClass *oc, void *data)
+{
+    ARMHostCPUClass *ahcc = ARM_HOST_CPU_CLASS(oc);
+
+    /* All we really need to set up for the 'host' CPU
+     * is the feature bits -- we rely on the fact that the
+     * various ID register values in ARMCPU are only used for
+     * TCG CPUs.
+     */
+    if (!kvm_arm_get_host_cpu_features(ahcc)) {
+        fprintf(stderr, "Failed to retrieve host CPU features!\n");
+        abort();
+    }
+}
+
+static void kvm_arm_host_cpu_initfn(Object *obj)
+{
+    ARMHostCPUClass *ahcc = ARM_HOST_CPU_GET_CLASS(obj);
+    ARMCPU *cpu = ARM_CPU(obj);
+    CPUARMState *env = &cpu->env;
+
+    cpu->kvm_target = ahcc->target;
+    cpu->dtb_compatible = ahcc->dtb_compatible;
+    env->features = ahcc->features;
+}
+
+static const TypeInfo host_arm_cpu_type_info = {
+    .name = TYPE_ARM_HOST_CPU,
+    .parent = TYPE_ARM_CPU,
+    .instance_init = kvm_arm_host_cpu_initfn,
+    .class_init = kvm_arm_host_cpu_class_init,
+    .class_size = sizeof(ARMHostCPUClass),
+};
+
 int kvm_arch_init(KVMState *s)
 {
     /* For ARM interrupt delivery is always asynchronous,
      * whether we are using an in-kernel VGIC or not.
      */
     kvm_async_interrupts_allowed = true;
+
+    type_register_static(&host_arm_cpu_type_info);
+
     return 0;
 }
 
@@ -86,8 +301,16 @@ int kvm_arch_init_vcpu(CPUState *cs)
     struct kvm_reg_list *rlp;
     ARMCPU *cpu = ARM_CPU(cs);
 
-    init.target = KVM_ARM_TARGET_CORTEX_A15;
+    if (cpu->kvm_target == QEMU_KVM_ARM_TARGET_NONE) {
+        fprintf(stderr, "KVM is not supported for this guest CPU type\n");
+        return -EINVAL;
+    }
+
+    init.target = cpu->kvm_target;
     memset(init.features, 0, sizeof(init.features));
+    if (cpu->start_powered_off) {
+        init.features[0] = 1 << KVM_ARM_VCPU_POWER_OFF;
+    }
     ret = kvm_vcpu_ioctl(cs, KVM_ARM_VCPU_INIT, &init);
     if (ret) {
         return ret;
diff --git a/target-arm/kvm_arm.h b/target-arm/kvm_arm.h
index 5d14887e66..cd3d13ca2d 100644
--- a/target-arm/kvm_arm.h
+++ b/target-arm/kvm_arm.h
@@ -62,4 +62,59 @@ bool write_list_to_kvmstate(ARMCPU *cpu);
  */
 bool write_kvmstate_to_list(ARMCPU *cpu);
 
+#ifdef CONFIG_KVM
+/**
+ * kvm_arm_create_scratch_host_vcpu:
+ * @cpus_to_try: array of QEMU_KVM_ARM_TARGET_* values (terminated with
+ * QEMU_KVM_ARM_TARGET_NONE) to try as fallback if the kernel does not
+ * know the PREFERRED_TARGET ioctl
+ * @fdarray: filled in with kvmfd, vmfd, cpufd file descriptors in that order
+ * @init: filled in with the necessary values for creating a host vcpu
+ *
+ * Create a scratch vcpu in its own VM of the type preferred by the host
+ * kernel (as would be used for '-cpu host'), for purposes of probing it
+ * for capabilities.
+ *
+ * Returns: true on success (and fdarray and init are filled in),
+ * false on failure (and fdarray and init are not valid).
+ */
+bool kvm_arm_create_scratch_host_vcpu(const uint32_t *cpus_to_try,
+                                      int *fdarray,
+                                      struct kvm_vcpu_init *init);
+
+/**
+ * kvm_arm_destroy_scratch_host_vcpu:
+ * @fdarray: array of fds as set up by kvm_arm_create_scratch_host_vcpu
+ *
+ * Tear down the scratch vcpu created by kvm_arm_create_scratch_host_vcpu.
+ */
+void kvm_arm_destroy_scratch_host_vcpu(int *fdarray);
+
+#define TYPE_ARM_HOST_CPU "host-" TYPE_ARM_CPU
+#define ARM_HOST_CPU_CLASS(klass) \
+    OBJECT_CLASS_CHECK(ARMHostCPUClass, (klass), TYPE_ARM_HOST_CPU)
+#define ARM_HOST_CPU_GET_CLASS(obj) \
+    OBJECT_GET_CLASS(ARMHostCPUClass, (obj), TYPE_ARM_HOST_CPU)
+
+typedef struct ARMHostCPUClass {
+    /*< private >*/
+    ARMCPUClass parent_class;
+    /*< public >*/
+
+    uint64_t features;
+    uint32_t target;
+    const char *dtb_compatible;
+} ARMHostCPUClass;
+
+/**
+ * kvm_arm_get_host_cpu_features:
+ * @ahcc: ARMHostCPUClass to fill in
+ *
+ * Probe the capabilities of the host kernel's preferred CPU and fill
+ * in the ARMHostCPUClass struct accordingly.
+ */
+bool kvm_arm_get_host_cpu_features(ARMHostCPUClass *ahcc);
+
+#endif
+
 #endif
diff --git a/target-arm/translate.c b/target-arm/translate.c
index 5f003e785e..8c479ff9a8 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -825,63 +825,57 @@ static inline void store_reg_from_load(CPUARMState *env, DisasContext *s,
  * extended if we're a 64 bit core) and  data is also
  * 32 bits unless specifically doing a 64 bit access.
  * These functions work like tcg_gen_qemu_{ld,st}* except
- * that their arguments are TCGv_i32 rather than TCGv.
+ * that the address argument is TCGv_i32 rather than TCGv.
  */
 #if TARGET_LONG_BITS == 32
 
-#define DO_GEN_LD(OP)                                                    \
-static inline void gen_aa32_##OP(TCGv_i32 val, TCGv_i32 addr, int index) \
+#define DO_GEN_LD(SUFF, OPC)                                             \
+static inline void gen_aa32_ld##SUFF(TCGv_i32 val, TCGv_i32 addr, int index) \
 {                                                                        \
-    tcg_gen_qemu_##OP(val, addr, index);                                 \
+    tcg_gen_qemu_ld_i32(val, addr, index, OPC);                          \
 }
 
-#define DO_GEN_ST(OP)                                                    \
-static inline void gen_aa32_##OP(TCGv_i32 val, TCGv_i32 addr, int index) \
+#define DO_GEN_ST(SUFF, OPC)                                             \
+static inline void gen_aa32_st##SUFF(TCGv_i32 val, TCGv_i32 addr, int index) \
 {                                                                        \
-    tcg_gen_qemu_##OP(val, addr, index);                                 \
+    tcg_gen_qemu_st_i32(val, addr, index, OPC);                          \
 }
 
 static inline void gen_aa32_ld64(TCGv_i64 val, TCGv_i32 addr, int index)
 {
-    tcg_gen_qemu_ld64(val, addr, index);
+    tcg_gen_qemu_ld_i64(val, addr, index, MO_TEQ);
 }
 
 static inline void gen_aa32_st64(TCGv_i64 val, TCGv_i32 addr, int index)
 {
-    tcg_gen_qemu_st64(val, addr, index);
+    tcg_gen_qemu_st_i64(val, addr, index, MO_TEQ);
 }
 
 #else
 
-#define DO_GEN_LD(OP)                                                    \
-static inline void gen_aa32_##OP(TCGv_i32 val, TCGv_i32 addr, int index) \
+#define DO_GEN_LD(SUFF, OPC)                                             \
+static inline void gen_aa32_ld##SUFF(TCGv_i32 val, TCGv_i32 addr, int index) \
 {                                                                        \
     TCGv addr64 = tcg_temp_new();                                        \
-    TCGv val64 = tcg_temp_new();                                         \
     tcg_gen_extu_i32_i64(addr64, addr);                                  \
-    tcg_gen_qemu_##OP(val64, addr64, index);                             \
+    tcg_gen_qemu_ld_i32(val, addr64, index, OPC);                        \
     tcg_temp_free(addr64);                                               \
-    tcg_gen_trunc_i64_i32(val, val64);                                   \
-    tcg_temp_free(val64);                                                \
 }
 
-#define DO_GEN_ST(OP)                                                    \
-static inline void gen_aa32_##OP(TCGv_i32 val, TCGv_i32 addr, int index) \
+#define DO_GEN_ST(SUFF, OPC)                                             \
+static inline void gen_aa32_st##SUFF(TCGv_i32 val, TCGv_i32 addr, int index) \
 {                                                                        \
     TCGv addr64 = tcg_temp_new();                                        \
-    TCGv val64 = tcg_temp_new();                                         \
     tcg_gen_extu_i32_i64(addr64, addr);                                  \
-    tcg_gen_extu_i32_i64(val64, val);                                    \
-    tcg_gen_qemu_##OP(val64, addr64, index);                             \
+    tcg_gen_qemu_st_i32(val, addr64, index, OPC);                        \
     tcg_temp_free(addr64);                                               \
-    tcg_temp_free(val64);                                                \
 }
 
 static inline void gen_aa32_ld64(TCGv_i64 val, TCGv_i32 addr, int index)
 {
     TCGv addr64 = tcg_temp_new();
     tcg_gen_extu_i32_i64(addr64, addr);
-    tcg_gen_qemu_ld64(val, addr64, index);
+    tcg_gen_qemu_ld_i64(val, addr64, index, MO_TEQ);
     tcg_temp_free(addr64);
 }
 
@@ -889,20 +883,20 @@ static inline void gen_aa32_st64(TCGv_i64 val, TCGv_i32 addr, int index)
 {
     TCGv addr64 = tcg_temp_new();
     tcg_gen_extu_i32_i64(addr64, addr);
-    tcg_gen_qemu_st64(val, addr64, index);
+    tcg_gen_qemu_st_i64(val, addr64, index, MO_TEQ);
     tcg_temp_free(addr64);
 }
 
 #endif
 
-DO_GEN_LD(ld8s)
-DO_GEN_LD(ld8u)
-DO_GEN_LD(ld16s)
-DO_GEN_LD(ld16u)
-DO_GEN_LD(ld32u)
-DO_GEN_ST(st8)
-DO_GEN_ST(st16)
-DO_GEN_ST(st32)
+DO_GEN_LD(8s, MO_SB)
+DO_GEN_LD(8u, MO_UB)
+DO_GEN_LD(16s, MO_TESW)
+DO_GEN_LD(16u, MO_TEUW)
+DO_GEN_LD(32u, MO_TEUL)
+DO_GEN_ST(8, MO_UB)
+DO_GEN_ST(16, MO_TEUW)
+DO_GEN_ST(32, MO_TEUL)
 
 static inline void gen_set_pc_im(DisasContext *s, target_ulong val)
 {
@@ -2614,6 +2608,189 @@ static TCGv_i32 gen_load_and_replicate(DisasContext *s, TCGv_i32 addr, int size)
     return tmp;
 }
 
+static int handle_vsel(uint32_t insn, uint32_t rd, uint32_t rn, uint32_t rm,
+                       uint32_t dp)
+{
+    uint32_t cc = extract32(insn, 20, 2);
+
+    if (dp) {
+        TCGv_i64 frn, frm, dest;
+        TCGv_i64 tmp, zero, zf, nf, vf;
+
+        zero = tcg_const_i64(0);
+
+        frn = tcg_temp_new_i64();
+        frm = tcg_temp_new_i64();
+        dest = tcg_temp_new_i64();
+
+        zf = tcg_temp_new_i64();
+        nf = tcg_temp_new_i64();
+        vf = tcg_temp_new_i64();
+
+        tcg_gen_extu_i32_i64(zf, cpu_ZF);
+        tcg_gen_ext_i32_i64(nf, cpu_NF);
+        tcg_gen_ext_i32_i64(vf, cpu_VF);
+
+        tcg_gen_ld_f64(frn, cpu_env, vfp_reg_offset(dp, rn));
+        tcg_gen_ld_f64(frm, cpu_env, vfp_reg_offset(dp, rm));
+        switch (cc) {
+        case 0: /* eq: Z */
+            tcg_gen_movcond_i64(TCG_COND_EQ, dest, zf, zero,
+                                frn, frm);
+            break;
+        case 1: /* vs: V */
+            tcg_gen_movcond_i64(TCG_COND_LT, dest, vf, zero,
+                                frn, frm);
+            break;
+        case 2: /* ge: N == V -> N ^ V == 0 */
+            tmp = tcg_temp_new_i64();
+            tcg_gen_xor_i64(tmp, vf, nf);
+            tcg_gen_movcond_i64(TCG_COND_GE, dest, tmp, zero,
+                                frn, frm);
+            tcg_temp_free_i64(tmp);
+            break;
+        case 3: /* gt: !Z && N == V */
+            tcg_gen_movcond_i64(TCG_COND_NE, dest, zf, zero,
+                                frn, frm);
+            tmp = tcg_temp_new_i64();
+            tcg_gen_xor_i64(tmp, vf, nf);
+            tcg_gen_movcond_i64(TCG_COND_GE, dest, tmp, zero,
+                                dest, frm);
+            tcg_temp_free_i64(tmp);
+            break;
+        }
+        tcg_gen_st_f64(dest, cpu_env, vfp_reg_offset(dp, rd));
+        tcg_temp_free_i64(frn);
+        tcg_temp_free_i64(frm);
+        tcg_temp_free_i64(dest);
+
+        tcg_temp_free_i64(zf);
+        tcg_temp_free_i64(nf);
+        tcg_temp_free_i64(vf);
+
+        tcg_temp_free_i64(zero);
+    } else {
+        TCGv_i32 frn, frm, dest;
+        TCGv_i32 tmp, zero;
+
+        zero = tcg_const_i32(0);
+
+        frn = tcg_temp_new_i32();
+        frm = tcg_temp_new_i32();
+        dest = tcg_temp_new_i32();
+        tcg_gen_ld_f32(frn, cpu_env, vfp_reg_offset(dp, rn));
+        tcg_gen_ld_f32(frm, cpu_env, vfp_reg_offset(dp, rm));
+        switch (cc) {
+        case 0: /* eq: Z */
+            tcg_gen_movcond_i32(TCG_COND_EQ, dest, cpu_ZF, zero,
+                                frn, frm);
+            break;
+        case 1: /* vs: V */
+            tcg_gen_movcond_i32(TCG_COND_LT, dest, cpu_VF, zero,
+                                frn, frm);
+            break;
+        case 2: /* ge: N == V -> N ^ V == 0 */
+            tmp = tcg_temp_new_i32();
+            tcg_gen_xor_i32(tmp, cpu_VF, cpu_NF);
+            tcg_gen_movcond_i32(TCG_COND_GE, dest, tmp, zero,
+                                frn, frm);
+            tcg_temp_free_i32(tmp);
+            break;
+        case 3: /* gt: !Z && N == V */
+            tcg_gen_movcond_i32(TCG_COND_NE, dest, cpu_ZF, zero,
+                                frn, frm);
+            tmp = tcg_temp_new_i32();
+            tcg_gen_xor_i32(tmp, cpu_VF, cpu_NF);
+            tcg_gen_movcond_i32(TCG_COND_GE, dest, tmp, zero,
+                                dest, frm);
+            tcg_temp_free_i32(tmp);
+            break;
+        }
+        tcg_gen_st_f32(dest, cpu_env, vfp_reg_offset(dp, rd));
+        tcg_temp_free_i32(frn);
+        tcg_temp_free_i32(frm);
+        tcg_temp_free_i32(dest);
+
+        tcg_temp_free_i32(zero);
+    }
+
+    return 0;
+}
+
+static int handle_vminmaxnm(uint32_t insn, uint32_t rd, uint32_t rn,
+                            uint32_t rm, uint32_t dp)
+{
+    uint32_t vmin = extract32(insn, 6, 1);
+    TCGv_ptr fpst = get_fpstatus_ptr(0);
+
+    if (dp) {
+        TCGv_i64 frn, frm, dest;
+
+        frn = tcg_temp_new_i64();
+        frm = tcg_temp_new_i64();
+        dest = tcg_temp_new_i64();
+
+        tcg_gen_ld_f64(frn, cpu_env, vfp_reg_offset(dp, rn));
+        tcg_gen_ld_f64(frm, cpu_env, vfp_reg_offset(dp, rm));
+        if (vmin) {
+            gen_helper_vfp_minnmd(dest, frn, frm, fpst);
+        } else {
+            gen_helper_vfp_maxnmd(dest, frn, frm, fpst);
+        }
+        tcg_gen_st_f64(dest, cpu_env, vfp_reg_offset(dp, rd));
+        tcg_temp_free_i64(frn);
+        tcg_temp_free_i64(frm);
+        tcg_temp_free_i64(dest);
+    } else {
+        TCGv_i32 frn, frm, dest;
+
+        frn = tcg_temp_new_i32();
+        frm = tcg_temp_new_i32();
+        dest = tcg_temp_new_i32();
+
+        tcg_gen_ld_f32(frn, cpu_env, vfp_reg_offset(dp, rn));
+        tcg_gen_ld_f32(frm, cpu_env, vfp_reg_offset(dp, rm));
+        if (vmin) {
+            gen_helper_vfp_minnms(dest, frn, frm, fpst);
+        } else {
+            gen_helper_vfp_maxnms(dest, frn, frm, fpst);
+        }
+        tcg_gen_st_f32(dest, cpu_env, vfp_reg_offset(dp, rd));
+        tcg_temp_free_i32(frn);
+        tcg_temp_free_i32(frm);
+        tcg_temp_free_i32(dest);
+    }
+
+    tcg_temp_free_ptr(fpst);
+    return 0;
+}
+
+static int disas_vfp_v8_insn(CPUARMState *env, DisasContext *s, uint32_t insn)
+{
+    uint32_t rd, rn, rm, dp = extract32(insn, 8, 1);
+
+    if (!arm_feature(env, ARM_FEATURE_V8)) {
+        return 1;
+    }
+
+    if (dp) {
+        VFP_DREG_D(rd, insn);
+        VFP_DREG_N(rn, insn);
+        VFP_DREG_M(rm, insn);
+    } else {
+        rd = VFP_SREG_D(insn);
+        rn = VFP_SREG_N(insn);
+        rm = VFP_SREG_M(insn);
+    }
+
+    if ((insn & 0x0f800e50) == 0x0e000a00) {
+        return handle_vsel(insn, rd, rn, rm, dp);
+    } else if ((insn & 0x0fb00e10) == 0x0e800a00) {
+        return handle_vminmaxnm(insn, rd, rn, rm, dp);
+    }
+    return 1;
+}
+
 /* Disassemble a VFP instruction.  Returns nonzero if an error occurred
    (ie. an undefined instruction).  */
 static int disas_vfp_insn(CPUARMState * env, DisasContext *s, uint32_t insn)
@@ -2636,6 +2813,14 @@ static int disas_vfp_insn(CPUARMState * env, DisasContext *s, uint32_t insn)
             && rn != ARM_VFP_MVFR1 && rn != ARM_VFP_MVFR0)
             return 1;
     }
+
+    if (extract32(insn, 28, 4) == 0xf) {
+        /* Encodings with T=1 (Thumb) or unconditional (ARM):
+         * only used in v8 and above.
+         */
+        return disas_vfp_v8_insn(env, s, insn);
+    }
+
     dp = ((insn & 0xf00) == 0xb00);
     switch ((insn >> 24) & 0xf) {
     case 0xe:
@@ -4362,7 +4547,7 @@ static void gen_neon_narrow_op(int op, int u, int size,
 #define NEON_3R_FLOAT_CMP 28 /* float VCEQ, VCGE, VCGT */
 #define NEON_3R_FLOAT_ACMP 29 /* float VACGE, VACGT, VACLE, VACLT */
 #define NEON_3R_FLOAT_MINMAX 30 /* float VMIN, VMAX */
-#define NEON_3R_VRECPS_VRSQRTS 31 /* float VRECPS, VRSQRTS */
+#define NEON_3R_FLOAT_MISC 31 /* float VRECPS, VRSQRTS, VMAXNM/MINNM */
 
 static const uint8_t neon_3r_sizes[] = {
     [NEON_3R_VHADD] = 0x7,
@@ -4395,7 +4580,7 @@ static const uint8_t neon_3r_sizes[] = {
     [NEON_3R_FLOAT_CMP] = 0x5, /* size bit 1 encodes op */
     [NEON_3R_FLOAT_ACMP] = 0x5, /* size bit 1 encodes op */
     [NEON_3R_FLOAT_MINMAX] = 0x5, /* size bit 1 encodes op */
-    [NEON_3R_VRECPS_VRSQRTS] = 0x5, /* size bit 1 encodes op */
+    [NEON_3R_FLOAT_MISC] = 0x5, /* size bit 1 encodes op */
 };
 
 /* Symbolic constants for op fields for Neon 2-register miscellaneous.
@@ -4656,8 +4841,9 @@ static int disas_neon_data_insn(CPUARMState * env, DisasContext *s, uint32_t ins
                 return 1;
             }
             break;
-        case NEON_3R_VRECPS_VRSQRTS:
-            if (u) {
+        case NEON_3R_FLOAT_MISC:
+            /* VMAXNM/VMINNM in ARMv8 */
+            if (u && !arm_feature(env, ARM_FEATURE_V8)) {
                 return 1;
             }
             break;
@@ -4946,11 +5132,23 @@ static int disas_neon_data_insn(CPUARMState * env, DisasContext *s, uint32_t ins
             tcg_temp_free_ptr(fpstatus);
             break;
         }
-        case NEON_3R_VRECPS_VRSQRTS:
-            if (size == 0)
-                gen_helper_recps_f32(tmp, tmp, tmp2, cpu_env);
-            else
-                gen_helper_rsqrts_f32(tmp, tmp, tmp2, cpu_env);
+        case NEON_3R_FLOAT_MISC:
+            if (u) {
+                /* VMAXNM/VMINNM */
+                TCGv_ptr fpstatus = get_fpstatus_ptr(1);
+                if (size == 0) {
+                    gen_helper_vfp_maxnms(tmp, tmp, tmp2, fpstatus);
+                } else {
+                    gen_helper_vfp_minnms(tmp, tmp, tmp2, fpstatus);
+                }
+                tcg_temp_free_ptr(fpstatus);
+            } else {
+                if (size == 0) {
+                    gen_helper_recps_f32(tmp, tmp, tmp2, cpu_env);
+                } else {
+                    gen_helper_rsqrts_f32(tmp, tmp, tmp2, cpu_env);
+              }
+            }
             break;
         case NEON_3R_VFM:
         {
@@ -6296,9 +6494,6 @@ static int disas_coproc_insn(CPUARMState * env, DisasContext *s, uint32_t insn)
 	    return disas_dsp_insn(env, s, insn);
 	}
 	return 1;
-    case 10:
-    case 11:
-	return disas_vfp_insn (env, s, insn);
     default:
         break;
     }
@@ -6753,6 +6948,13 @@ static void disas_arm_insn(CPUARMState * env, DisasContext *s)
                 goto illegal_op;
             return;
         }
+        if ((insn & 0x0f000e10) == 0x0e000a00) {
+            /* VFP.  */
+            if (disas_vfp_insn(env, s, insn)) {
+                goto illegal_op;
+            }
+            return;
+        }
         if (((insn & 0x0f30f000) == 0x0510f000) ||
             ((insn & 0x0f30f010) == 0x0710f000)) {
             if ((insn & (1 << 22)) == 0) {
@@ -8033,9 +8235,15 @@ static void disas_arm_insn(CPUARMState * env, DisasContext *s)
         case 0xc:
         case 0xd:
         case 0xe:
-            /* Coprocessor.  */
-            if (disas_coproc_insn(env, s, insn))
+            if (((insn >> 8) & 0xe) == 10) {
+                /* VFP.  */
+                if (disas_vfp_insn(env, s, insn)) {
+                    goto illegal_op;
+                }
+            } else if (disas_coproc_insn(env, s, insn)) {
+                /* Coprocessor.  */
                 goto illegal_op;
+            }
             break;
         case 0xf:
             /* swi */
@@ -8765,6 +8973,10 @@ static int disas_thumb2_insn(CPUARMState *env, DisasContext *s, uint16_t insn_hw
             insn = (insn & 0xe2ffffff) | ((insn & (1 << 28)) >> 4) | (1 << 28);
             if (disas_neon_data_insn(env, s, insn))
                 goto illegal_op;
+        } else if (((insn >> 8) & 0xe) == 10) {
+            if (disas_vfp_insn(env, s, insn)) {
+                goto illegal_op;
+            }
         } else {
             if (insn & (1 << 28))
                 goto illegal_op;
diff --git a/target-mips/dsp_helper.c b/target-mips/dsp_helper.c
index b088a25017..a2f46d9637 100644
--- a/target-mips/dsp_helper.c
+++ b/target-mips/dsp_helper.c
@@ -1088,12 +1088,11 @@ static inline int32_t mipsdsp_cmpu_lt(uint32_t a, uint32_t b)
 target_ulong helper_##name(target_ulong rt, CPUMIPSState *env)             \
 {                                                                          \
     DSP32Value dt;                                                         \
-    unsigned int i, n;                                                     \
+    unsigned int i;                                                     \
                                                                            \
-    n = sizeof(DSP32Value) / sizeof(dt.element[0]);                        \
     dt.sw[0] = rt;                                                         \
                                                                            \
-    for (i = 0; i < n; i++) {                                              \
+    for (i = 0; i < ARRAY_SIZE(dt.element); i++) {                         \
         dt.element[i] = mipsdsp_##func(dt.element[i], env);                \
     }                                                                      \
                                                                            \
@@ -1109,12 +1108,11 @@ MIPSDSP32_UNOP_ENV(absq_s_w, sat_abs32, sw)
 target_ulong helper_##name(target_ulong rt, CPUMIPSState *env)             \
 {                                                                          \
     DSP64Value dt;                                                         \
-    unsigned int i, n;                                                     \
+    unsigned int i;                                                        \
                                                                            \
-    n = sizeof(DSP64Value) / sizeof(dt.element[0]);                        \
     dt.sl[0] = rt;                                                         \
                                                                            \
-    for (i = 0; i < n; i++) {                                              \
+    for (i = 0; i < ARRAY_SIZE(dt.element); i++) {                         \
         dt.element[i] = mipsdsp_##func(dt.element[i], env);                \
     }                                                                      \
                                                                            \
@@ -1130,13 +1128,12 @@ MIPSDSP64_UNOP_ENV(absq_s_pw, sat_abs32, sw)
 target_ulong helper_##name(target_ulong rs, target_ulong rt)               \
 {                                                                          \
     DSP32Value ds, dt;                                                     \
-    unsigned int i, n;                                                     \
+    unsigned int i;                                                        \
                                                                            \
-    n = sizeof(DSP32Value) / sizeof(ds.element[0]);                        \
     ds.sw[0] = rs;                                                         \
     dt.sw[0] = rt;                                                         \
                                                                            \
-    for (i = 0; i < n; i++) {                                              \
+    for (i = 0; i < ARRAY_SIZE(ds.element); i++) {                         \
         ds.element[i] = mipsdsp_##func(ds.element[i], dt.element[i]);      \
     }                                                                      \
                                                                            \
@@ -1159,13 +1156,12 @@ target_ulong helper_##name(target_ulong rs, target_ulong rt,               \
                            CPUMIPSState *env)                              \
 {                                                                          \
     DSP32Value ds, dt;                                                     \
-    unsigned int i, n;                                                     \
+    unsigned int i;                                                        \
                                                                            \
-    n = sizeof(DSP32Value) / sizeof(ds.element[0]);                        \
     ds.sw[0] = rs;                                                         \
     dt.sw[0] = rt;                                                         \
                                                                            \
-    for (i = 0 ; i < n ; i++) {                                            \
+    for (i = 0 ; i < ARRAY_SIZE(ds.element); i++) {                        \
         ds.element[i] = mipsdsp_##func(ds.element[i], dt.element[i], env); \
     }                                                                      \
                                                                            \
@@ -1192,13 +1188,12 @@ MIPSDSP32_BINOP_ENV(subu_s_qb, satu8_sub, ub);
 target_ulong helper_##name(target_ulong rs, target_ulong rt)               \
 {                                                                          \
     DSP64Value ds, dt;                                                     \
-    unsigned int i, n;                                                     \
+    unsigned int i;                                                        \
                                                                            \
-    n = sizeof(DSP64Value) / sizeof(ds.element[0]);                        \
     ds.sl[0] = rs;                                                         \
     dt.sl[0] = rt;                                                         \
                                                                            \
-    for (i = 0 ; i < n ; i++) {                                            \
+    for (i = 0 ; i < ARRAY_SIZE(ds.element); i++) {                        \
         ds.element[i] = mipsdsp_##func(ds.element[i], dt.element[i]);      \
     }                                                                      \
                                                                            \
@@ -1215,13 +1210,12 @@ target_ulong helper_##name(target_ulong rs, target_ulong rt,               \
                            CPUMIPSState *env)                              \
 {                                                                          \
     DSP64Value ds, dt;                                                     \
-    unsigned int i, n;                                                     \
+    unsigned int i;                                                        \
                                                                            \
-    n = sizeof(DSP64Value) / sizeof(ds.element[0]);                        \
     ds.sl[0] = rs;                                                         \
     dt.sl[0] = rt;                                                         \
                                                                            \
-    for (i = 0 ; i < n ; i++) {                                            \
+    for (i = 0 ; i < ARRAY_SIZE(ds.element); i++) {                        \
         ds.element[i] = mipsdsp_##func(ds.element[i], dt.element[i], env); \
     }                                                                      \
                                                                            \
diff --git a/target-mips/translate.c b/target-mips/translate.c
index 67f326b205..e30273438a 100644
--- a/target-mips/translate.c
+++ b/target-mips/translate.c
@@ -15983,10 +15983,13 @@ void cpu_state_reset(CPUMIPSState *env)
     if (env->CP0_Config3 & (1 << CP0C3_DSPP)) {
         env->CP0_Status |= (1 << CP0St_MX);
     }
-    /* Enable 64-bit FPU if the target cpu supports it.  */
-    if (env->active_fpu.fcr0 & (1 << FCR0_F64)) {
+# if defined(TARGET_MIPS64)
+    /* For MIPS64, init FR bit to 1 if FPU unit is there and bit is writable. */
+    if ((env->CP0_Config1 & (1 << CP0C1_FP)) &&
+        (env->CP0_Status_rw_bitmask & (1 << CP0St_FR))) {
         env->CP0_Status |= (1 << CP0St_FR);
     }
+# endif
 #else
     if (env->hflags & MIPS_HFLAG_BMASK) {
         /* If the exception was raised from a delay slot,
diff --git a/target-sh4/cpu.h b/target-sh4/cpu.h
index 276d2955c3..c181ddacf5 100644
--- a/target-sh4/cpu.h
+++ b/target-sh4/cpu.h
@@ -157,9 +157,6 @@ typedef struct CPUSH4State {
     /* float point status register */
     float_status fp_status;
 
-    /* The features that we should emulate. See sh_features above.  */
-    uint32_t features;
-
     /* Those belong to the specific unit (SH7750) but are handled here */
     uint32_t mmucr;		/* MMU control register */
     uint32_t pteh;		/* page table entry high register */
@@ -180,6 +177,9 @@ typedef struct CPUSH4State {
 
     int id;			/* CPU model */
 
+    /* The features that we should emulate. See sh_features above.  */
+    uint32_t features;
+
     void *intc_handle;
     int in_sleep;		/* SR_BL ignored during sleep */
     memory_content *movcal_backup;