summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r--MAINTAINERS1
-rw-r--r--Makefile2
-rwxr-xr-xconfigure4
-rw-r--r--disas/i386.c8
-rw-r--r--docs/qmp/qmp-events.txt2
-rw-r--r--docs/specs/acpi_cpu_hotplug.txt4
-rw-r--r--exec.c6
-rw-r--r--hw/acpi/Makefile.objs3
-rw-r--r--hw/acpi/cpu_hotplug.c64
-rw-r--r--hw/acpi/ich9.c14
-rw-r--r--hw/acpi/pcihp.c316
-rw-r--r--hw/acpi/piix4.c155
-rw-r--r--hw/arm/boot.c9
-rw-r--r--hw/arm/xilinx_zynq.c7
-rw-r--r--hw/display/blizzard_template.h40
-rw-r--r--hw/display/pl110_template.h12
-rw-r--r--hw/display/pxa2xx_template.h22
-rw-r--r--hw/display/tc6393xb_template.h14
-rw-r--r--hw/display/xenfb.c7
-rw-r--r--hw/i386/Makefile.objs2
-rw-r--r--hw/i386/acpi-build.c364
-rw-r--r--hw/i386/acpi-dsdt-cpu-hotplug.dsl14
-rw-r--r--hw/i386/acpi-dsdt-isa.dsl11
-rw-r--r--hw/i386/acpi-dsdt-pci-crs.dsl15
-rw-r--r--hw/i386/acpi-dsdt.dsl76
-rw-r--r--hw/i386/acpi-dsdt.hex.generated217
-rw-r--r--hw/i386/pc.c1
-rw-r--r--hw/i386/pc_q35.c20
-rw-r--r--hw/i386/q35-acpi-dsdt.dsl19
-rw-r--r--hw/i386/q35-acpi-dsdt.hex.generated74
-rw-r--r--hw/i386/ssdt-pcihp.dsl11
-rw-r--r--hw/i386/ssdt-pcihp.hex.generated20
-rw-r--r--hw/i386/ssdt-proc.hex.generated6
-rw-r--r--hw/ide/core.c1
-rw-r--r--hw/intc/arm_gic.c21
-rw-r--r--hw/misc/applesmc.c1
-rw-r--r--hw/misc/vfio.c78
-rw-r--r--hw/net/lan9118.c6
-rw-r--r--hw/net/vhost_net.c2
-rw-r--r--hw/pci/pci.c48
-rw-r--r--hw/virtio/virtio-balloon.c7
-rw-r--r--hw/xen/xen_pt.c8
-rw-r--r--include/exec/exec-all.h1
-rw-r--r--include/exec/ram_addr.h2
-rw-r--r--include/hw/acpi/cpu_hotplug.h27
-rw-r--r--include/hw/acpi/cpu_hotplug_defs.h24
-rw-r--r--include/hw/acpi/ich9.h4
-rw-r--r--include/hw/acpi/pcihp.h72
-rw-r--r--include/hw/i386/pc.h7
-rw-r--r--include/hw/intc/arm_gic_common.h1
-rw-r--r--include/hw/isa/isa.h7
-rw-r--r--include/hw/pci/pci.h14
-rw-r--r--include/qemu/timer.h6
-rw-r--r--kvm-all.c1
-rw-r--r--linux-user/s390x/syscall.h2
-rw-r--r--linux-user/syscall.c4
-rw-r--r--net/net.c5
-rw-r--r--net/tap-linux.c14
-rw-r--r--[-rwxr-xr-x]pc-bios/kvmvapic.binbin9216 -> 9216 bytes
-rw-r--r--[-rwxr-xr-x]pc-bios/multiboot.binbin1024 -> 1024 bytes
-rw-r--r--[-rwxr-xr-x]pc-bios/sgabios.binbin4096 -> 4096 bytes
-rwxr-xr-xscripts/create_config4
-rw-r--r--scripts/dump-guest-memory.py339
-rw-r--r--scripts/tracetool/backend/simple.py6
-rw-r--r--target-arm/cpu.c1
-rw-r--r--target-arm/cpu.h2
-rw-r--r--target-arm/helper-a64.c31
-rw-r--r--target-arm/helper-a64.h1
-rw-r--r--target-arm/helper.c45
-rw-r--r--target-arm/helper.h1
-rw-r--r--target-arm/translate-a64.c2699
-rw-r--r--target-arm/translate.c251
-rw-r--r--tcg/i386/tcg-target.c145
-rw-r--r--tcg/tcg.c2
-rw-r--r--tests/acpi-test-data/pc/APICbin0 -> 120 bytes
-rw-r--r--tests/acpi-test-data/pc/DSDTbin0 -> 4582 bytes
-rw-r--r--tests/acpi-test-data/pc/FACPbin0 -> 116 bytes
-rw-r--r--tests/acpi-test-data/pc/FACSbin0 -> 64 bytes
-rw-r--r--tests/acpi-test-data/pc/HPETbin0 -> 56 bytes
-rw-r--r--tests/acpi-test-data/pc/SSDTbin0 -> 2200 bytes
-rw-r--r--tests/acpi-test-data/q35/APICbin0 -> 120 bytes
-rw-r--r--tests/acpi-test-data/q35/DSDTbin0 -> 7438 bytes
-rw-r--r--tests/acpi-test-data/q35/FACPbin0 -> 116 bytes
-rw-r--r--tests/acpi-test-data/q35/FACSbin0 -> 64 bytes
-rw-r--r--tests/acpi-test-data/q35/HPETbin0 -> 56 bytes
-rw-r--r--tests/acpi-test-data/q35/MCFGbin0 -> 60 bytes
-rw-r--r--tests/acpi-test-data/q35/SSDTbin0 -> 475 bytes
-rwxr-xr-xtests/acpi-test-data/rebuild-expected-aml.sh36
-rw-r--r--tests/acpi-test.c305
-rw-r--r--trace/simple.c24
-rw-r--r--translate-all.c14
-rw-r--r--vl.c14
92 files changed, 5300 insertions, 523 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index fb5324285c..adc59735a9 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -610,6 +610,7 @@ F: hw/*/*vhost*
 
 virtio
 M: Anthony Liguori <aliguori@amazon.com>
+M: Michael S. Tsirkin <mst@redhat.com>
 S: Supported
 F: hw/*/virtio*
 
diff --git a/Makefile b/Makefile
index bdff4e4684..807054b3a1 100644
--- a/Makefile
+++ b/Makefile
@@ -290,7 +290,7 @@ common  de-ch  es     fo  fr-ca  hu     ja  mk  nl-be      pt  sl     tr \
 bepo    cz
 
 ifdef INSTALL_BLOBS
-BLOBS=bios.bin sgabios.bin vgabios.bin vgabios-cirrus.bin \
+BLOBS=bios.bin bios-256k.bin sgabios.bin vgabios.bin vgabios-cirrus.bin \
 vgabios-stdvga.bin vgabios-vmware.bin vgabios-qxl.bin \
 acpi-dsdt.aml q35-acpi-dsdt.aml \
 ppc_rom.bin openbios-sparc32 openbios-sparc64 openbios-ppc QEMU,tcx.bin \
diff --git a/configure b/configure
index b472694cb2..236764a3bd 100755
--- a/configure
+++ b/configure
@@ -4774,6 +4774,10 @@ for bios_file in \
 do
     FILES="$FILES pc-bios/`basename $bios_file`"
 done
+for test_file in `find $source_path/tests/acpi-test-data -type f`
+do
+    FILES="$FILES tests/acpi-test-data`echo $test_file | sed -e 's/.*acpi-test-data//'`"
+done
 mkdir -p $DIRS
 for f in $FILES ; do
     if [ -e "$source_path/$f" ] && [ "$source_path" != `pwd` ]; then
diff --git a/disas/i386.c b/disas/i386.c
index 47f1f2ea61..044e02c032 100644
--- a/disas/i386.c
+++ b/disas/i386.c
@@ -2632,17 +2632,17 @@ static const struct dis386 prefix_user_table[][4] = {
 
   /* PREGRP87 */
   {
+    { "movbe",	{ Gv, Ev } },
     { "(bad)",	{ XX } },
-    { "(bad)",	{ XX } },
-    { "(bad)",	{ XX } },
+    { "movbe",	{ Gv, Ev } },
     { "crc32",	{ Gdq, { CRC32_Fixup, b_mode } } },
   },
 
   /* PREGRP88 */
   {
+    { "movbe",	{ Ev, Gv } },
     { "(bad)",	{ XX } },
-    { "(bad)",	{ XX } },
-    { "(bad)",	{ XX } },
+    { "movbe",	{ Ev, Gv } },
     { "crc32",	{ Gdq, { CRC32_Fixup, v_mode } } },
   },
 
diff --git a/docs/qmp/qmp-events.txt b/docs/qmp/qmp-events.txt
index 6b87e9786a..a378c87583 100644
--- a/docs/qmp/qmp-events.txt
+++ b/docs/qmp/qmp-events.txt
@@ -479,7 +479,7 @@ Data: None.
 
 Example:
 
-{ "event": "WATCHDOG",
+{ "event": "WAKEUP",
      "timestamp": { "seconds": 1344522075, "microseconds": 745528 } }
 
 WATCHDOG
diff --git a/docs/specs/acpi_cpu_hotplug.txt b/docs/specs/acpi_cpu_hotplug.txt
index f6f577457d..340b751a95 100644
--- a/docs/specs/acpi_cpu_hotplug.txt
+++ b/docs/specs/acpi_cpu_hotplug.txt
@@ -10,7 +10,9 @@ ACPI GPE block (IO ports 0xafe0-0xafe3, byte access):
 Generic ACPI GPE block. Bit 2 (GPE.2) used to notify CPU
 hot-add/remove event to ACPI BIOS, via SCI interrupt.
 
-CPU present bitmap (IO port 0xaf00-0xaf1f, 1-byte access):
+CPU present bitmap for:
+  ICH9-LPC (IO port 0x0cd8-0x0cf7, 1-byte access)
+  PIIX-PM  (IO port 0xaf00-0xaf1f, 1-byte access)
 ---------------------------------------------------------------
 One bit per CPU. Bit position reflects corresponding CPU APIC ID.
 Read-only.
diff --git a/exec.c b/exec.c
index 2435d9ecd9..9ad0a4b045 100644
--- a/exec.c
+++ b/exec.c
@@ -325,7 +325,7 @@ address_space_translate_internal(AddressSpaceDispatch *d, hwaddr addr, hwaddr *x
                                  hwaddr *plen, bool resolve_subpage)
 {
     MemoryRegionSection *section;
-    Int128 diff;
+    Int128 diff, diff_page;
 
     section = address_space_lookup_region(d, addr, resolve_subpage);
     /* Compute offset within MemoryRegionSection */
@@ -334,7 +334,9 @@ address_space_translate_internal(AddressSpaceDispatch *d, hwaddr addr, hwaddr *x
     /* Compute offset within MemoryRegion */
     *xlat = addr + section->offset_within_region;
 
+    diff_page = int128_make64(((addr & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE) - addr);
     diff = int128_sub(section->mr->size, int128_make64(addr));
+    diff = int128_min(diff, diff_page);
     *plen = int128_get64(int128_min(diff, int128_make64(*plen)));
     return section;
 }
@@ -349,7 +351,7 @@ MemoryRegion *address_space_translate(AddressSpace *as, hwaddr addr,
     hwaddr len = *plen;
 
     for (;;) {
-        section = address_space_translate_internal(as->dispatch, addr, &addr, plen, true);
+        section = address_space_translate_internal(as->dispatch, addr, &addr, &len, true);
         mr = section->mr;
 
         if (!mr->iommu_ops) {
diff --git a/hw/acpi/Makefile.objs b/hw/acpi/Makefile.objs
index a0b63b5626..397d32babd 100644
--- a/hw/acpi/Makefile.objs
+++ b/hw/acpi/Makefile.objs
@@ -1,2 +1 @@
-common-obj-$(CONFIG_ACPI) += core.o piix4.o ich9.o
-
+common-obj-$(CONFIG_ACPI) += core.o piix4.o ich9.o pcihp.o cpu_hotplug.o
diff --git a/hw/acpi/cpu_hotplug.c b/hw/acpi/cpu_hotplug.c
new file mode 100644
index 0000000000..48928dc0ea
--- /dev/null
+++ b/hw/acpi/cpu_hotplug.c
@@ -0,0 +1,64 @@
+/*
+ * QEMU ACPI hotplug utilities
+ *
+ * Copyright (C) 2013 Red Hat Inc
+ *
+ * Authors:
+ *   Igor Mammedov <imammedo@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+#include "hw/hw.h"
+#include "hw/acpi/cpu_hotplug.h"
+
+static uint64_t cpu_status_read(void *opaque, hwaddr addr, unsigned int size)
+{
+    AcpiCpuHotplug *cpus = opaque;
+    uint64_t val = cpus->sts[addr];
+
+    return val;
+}
+
+static void cpu_status_write(void *opaque, hwaddr addr, uint64_t data,
+                             unsigned int size)
+{
+    /* TODO: implement VCPU removal on guest signal that CPU can be removed */
+}
+
+static const MemoryRegionOps AcpiCpuHotplug_ops = {
+    .read = cpu_status_read,
+    .write = cpu_status_write,
+    .endianness = DEVICE_LITTLE_ENDIAN,
+    .valid = {
+        .min_access_size = 1,
+        .max_access_size = 1,
+    },
+};
+
+void AcpiCpuHotplug_add(ACPIGPE *gpe, AcpiCpuHotplug *g, CPUState *cpu)
+{
+    CPUClass *k = CPU_GET_CLASS(cpu);
+    int64_t cpu_id = k->get_arch_id(CPU(cpu));
+    /* Same bound AcpiCpuHotplug_init asserts before touching sts[]. */
+    g_assert((cpu_id / 8) < ACPI_GPE_PROC_LEN);
+    *gpe->sts = *gpe->sts | ACPI_CPU_HOTPLUG_STATUS;
+    g->sts[cpu_id / 8] |= (1 << (cpu_id % 8));
+}
+
+void AcpiCpuHotplug_init(MemoryRegion *parent, Object *owner,
+                         AcpiCpuHotplug *gpe_cpu, uint16_t base)
+{
+    CPUState *cpu;
+
+    CPU_FOREACH(cpu) {
+        CPUClass *cc = CPU_GET_CLASS(cpu);
+        int64_t id = cc->get_arch_id(cpu);
+
+        g_assert((id / 8) < ACPI_GPE_PROC_LEN);
+        gpe_cpu->sts[id / 8] |= (1 << (id % 8));
+    }
+    memory_region_init_io(&gpe_cpu->io, owner, &AcpiCpuHotplug_ops,
+                          gpe_cpu, "acpi-cpu-hotplug", ACPI_GPE_PROC_LEN);
+    memory_region_add_subregion(parent, base, &gpe_cpu->io);
+}
diff --git a/hw/acpi/ich9.c b/hw/acpi/ich9.c
index 30f0df8713..0afac425ec 100644
--- a/hw/acpi/ich9.c
+++ b/hw/acpi/ich9.c
@@ -185,6 +185,15 @@ static void pm_powerdown_req(Notifier *n, void *opaque)
     acpi_pm1_evt_power_down(&pm->acpi_regs);
 }
 
+static void ich9_cpu_added_req(Notifier *n, void *opaque)
+{
+    ICH9LPCPMRegs *pm = container_of(n, ICH9LPCPMRegs, cpu_added_notifier);
+
+    assert(pm != NULL);
+    AcpiCpuHotplug_add(&pm->acpi_regs.gpe, &pm->gpe_cpu, CPU(opaque));
+    acpi_update_sci(&pm->acpi_regs, pm->irq);
+}
+
 void ich9_pm_init(PCIDevice *lpc_pci, ICH9LPCPMRegs *pm,
                   qemu_irq sci_irq)
 {
@@ -210,6 +219,11 @@ void ich9_pm_init(PCIDevice *lpc_pci, ICH9LPCPMRegs *pm,
     qemu_register_reset(pm_reset, pm);
     pm->powerdown_notifier.notify = pm_powerdown_req;
     qemu_register_powerdown_notifier(&pm->powerdown_notifier);
+
+    AcpiCpuHotplug_init(pci_address_space_io(lpc_pci), OBJECT(lpc_pci),
+                        &pm->gpe_cpu, ICH9_CPU_HOTPLUG_IO_BASE);
+    pm->cpu_added_notifier.notify = ich9_cpu_added_req;
+    qemu_register_cpu_added_notifier(&pm->cpu_added_notifier);
 }
 
 static void ich9_pm_get_gpe0_blk(Object *obj, Visitor *v,
diff --git a/hw/acpi/pcihp.c b/hw/acpi/pcihp.c
new file mode 100644
index 0000000000..3fa3d7c290
--- /dev/null
+++ b/hw/acpi/pcihp.c
@@ -0,0 +1,316 @@
+/*
+ * QEMU<->ACPI BIOS PCI hotplug interface
+ *
+ * QEMU supports PCI hotplug via ACPI. This module
+ * implements the interface between QEMU and the ACPI BIOS.
+ * Interface specification - see docs/specs/acpi_pci_hotplug.txt
+ *
+ * Copyright (c) 2013, Red Hat Inc, Michael S. Tsirkin (mst@redhat.com)
+ * Copyright (c) 2006 Fabrice Bellard
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#include "hw/acpi/pcihp.h"
+
+#include "hw/hw.h"
+#include "hw/i386/pc.h"
+#include "hw/pci/pci.h"
+#include "hw/acpi/acpi.h"
+#include "sysemu/sysemu.h"
+#include "qemu/range.h"
+#include "exec/ioport.h"
+#include "exec/address-spaces.h"
+#include "hw/pci/pci_bus.h"
+#include "qom/qom-qobject.h"
+#include "qapi/qmp/qint.h"
+
+//#define DEBUG
+
+#ifdef DEBUG
+# define ACPI_PCIHP_DPRINTF(format, ...)     printf(format, ## __VA_ARGS__)
+#else
+# define ACPI_PCIHP_DPRINTF(format, ...)     do { } while (0)
+#endif
+
+#define PCI_HOTPLUG_ADDR 0xae00
+#define PCI_HOTPLUG_SIZE 0x0014
+#define PCI_UP_BASE 0xae00
+#define PCI_DOWN_BASE 0xae04
+#define PCI_EJ_BASE 0xae08
+#define PCI_RMV_BASE 0xae0c
+#define PCI_SEL_BASE 0xae10
+
+typedef struct AcpiPciHpFind {
+    int bsel;
+    PCIBus *bus;
+} AcpiPciHpFind;
+
+static int acpi_pcihp_get_bsel(PCIBus *bus)
+{
+    QObject *o = object_property_get_qobject(OBJECT(bus),
+                                             ACPI_PCIHP_PROP_BSEL, NULL);
+    int64_t bsel = -1;
+    if (o) {
+        bsel = qint_get_int(qobject_to_qint(o));
+    }
+    if (bsel < 0) {
+        return -1;
+    }
+    return bsel;
+}
+
+static void acpi_pcihp_test_hotplug_bus(PCIBus *bus, void *opaque)
+{
+    AcpiPciHpFind *find = opaque;
+    if (find->bsel == acpi_pcihp_get_bsel(bus)) {
+        find->bus = bus;
+    }
+}
+
+static PCIBus *acpi_pcihp_find_hotplug_bus(AcpiPciHpState *s, int bsel)
+{
+    AcpiPciHpFind find = { .bsel = bsel, .bus = NULL };
+
+    if (bsel < 0) {
+        return NULL;
+    }
+
+    pci_for_each_bus(s->root, acpi_pcihp_test_hotplug_bus, &find);
+
+    /* Make bsel 0 eject root bus if bsel property is not set,
+     * for compatibility with non acpi setups.
+     * TODO: really needed?
+     */
+    if (!bsel && !find.bus) {
+        find.bus = s->root;
+    }
+    return find.bus;
+}
+
+static bool acpi_pcihp_pc_no_hotplug(AcpiPciHpState *s, PCIDevice *dev)
+{
+    PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(dev);
+    /*
+     * ACPI doesn't allow hotplug of bridge devices.  Don't allow
+     * hot-unplug of bridge devices unless they were added by hotplug
+     * (and so, not described by acpi).
+     */
+    return (pc->is_bridge && !dev->qdev.hotplugged) || pc->no_hotplug;
+}
+
+static void acpi_pcihp_eject_slot(AcpiPciHpState *s, unsigned bsel, unsigned slots)
+{
+    BusChild *kid, *next;
+    int slot = ffs(slots) - 1;
+    bool slot_free = true;
+    PCIBus *bus = acpi_pcihp_find_hotplug_bus(s, bsel);
+    /* ffs(0) is 0, so an empty mask yields slot -1; never shift by that. */
+    if (!bus || slot < 0) {
+        return;
+    }
+
+    /* Mark request as complete */
+    s->acpi_pcihp_pci_status[bsel].down &= ~(1U << slot);
+
+    QTAILQ_FOREACH_SAFE(kid, &bus->qbus.children, sibling, next) {
+        DeviceState *qdev = kid->child;
+        PCIDevice *dev = PCI_DEVICE(qdev);
+        if (PCI_SLOT(dev->devfn) == slot) {
+            if (acpi_pcihp_pc_no_hotplug(s, dev)) {
+                slot_free = false;
+            } else {
+                object_unparent(OBJECT(qdev));
+            }
+        }
+    }
+    if (slot_free) {
+        s->acpi_pcihp_pci_status[bsel].device_present &= ~(1U << slot);
+    }
+}
+
+static void acpi_pcihp_update_hotplug_bus(AcpiPciHpState *s, int bsel)
+{
+    BusChild *kid, *next;
+    PCIBus *bus = acpi_pcihp_find_hotplug_bus(s, bsel);
+
+    /* Execute any pending removes during reset */
+    while (s->acpi_pcihp_pci_status[bsel].down) {
+        acpi_pcihp_eject_slot(s, bsel, s->acpi_pcihp_pci_status[bsel].down);
+    }
+
+    s->acpi_pcihp_pci_status[bsel].hotplug_enable = ~0;
+    s->acpi_pcihp_pci_status[bsel].device_present = 0;
+
+    if (!bus) {
+        return;
+    }
+    QTAILQ_FOREACH_SAFE(kid, &bus->qbus.children, sibling, next) {
+        DeviceState *qdev = kid->child;
+        PCIDevice *pdev = PCI_DEVICE(qdev);
+        int slot = PCI_SLOT(pdev->devfn);
+
+        if (acpi_pcihp_pc_no_hotplug(s, pdev)) {
+            s->acpi_pcihp_pci_status[bsel].hotplug_enable &= ~(1U << slot);
+        }
+
+        s->acpi_pcihp_pci_status[bsel].device_present |= (1U << slot);
+    }
+}
+
+static void acpi_pcihp_update(AcpiPciHpState *s)
+{
+    int i;
+
+    for (i = 0; i < ACPI_PCIHP_MAX_HOTPLUG_BUS; ++i) {
+        acpi_pcihp_update_hotplug_bus(s, i);
+    }
+}
+
+void acpi_pcihp_reset(AcpiPciHpState *s)
+{
+    acpi_pcihp_update(s);
+}
+
+static void enable_device(AcpiPciHpState *s, unsigned bsel, int slot)
+{
+    s->acpi_pcihp_pci_status[bsel].device_present |= (1U << slot);
+}
+
+static void disable_device(AcpiPciHpState *s, unsigned bsel, int slot)
+{
+    s->acpi_pcihp_pci_status[bsel].down |= (1U << slot);
+}
+
+int acpi_pcihp_device_hotplug(AcpiPciHpState *s, PCIDevice *dev,
+                              PCIHotplugState state)
+{
+    int slot = PCI_SLOT(dev->devfn);
+    int bsel = acpi_pcihp_get_bsel(dev->bus);
+    if (bsel < 0) {
+        return -1;
+    }
+
+    /* Don't send event when device is enabled during qemu machine creation:
+     * it is present on boot, no hotplug event is necessary. We do send an
+     * event when the device is disabled later. */
+    if (state == PCI_COLDPLUG_ENABLED) {
+        s->acpi_pcihp_pci_status[bsel].device_present |= (1U << slot);
+        return 0;
+    }
+
+    if (state == PCI_HOTPLUG_ENABLED) {
+        enable_device(s, bsel, slot);
+    } else {
+        disable_device(s, bsel, slot);
+    }
+
+    return 0;
+}
+
+static uint64_t pci_read(void *opaque, hwaddr addr, unsigned int size)
+{
+    AcpiPciHpState *s = opaque;
+    uint32_t val = 0;
+    int bsel = s->hotplug_select;
+    /* Valid bus selectors are 0..MAX-1, the same bound pci_write enforces. */
+    if (bsel < 0 || bsel >= ACPI_PCIHP_MAX_HOTPLUG_BUS) {
+        return 0;
+    }
+
+    switch (addr) {
+    case PCI_UP_BASE - PCI_HOTPLUG_ADDR:
+        /* Manufacture an "up" value to cause a device check on any hotplug
+         * slot with a device.  Extra device checks are harmless. */
+        val = s->acpi_pcihp_pci_status[bsel].device_present &
+            s->acpi_pcihp_pci_status[bsel].hotplug_enable;
+        ACPI_PCIHP_DPRINTF("pci_up_read %" PRIu32 "\n", val);
+        break;
+    case PCI_DOWN_BASE - PCI_HOTPLUG_ADDR:
+        val = s->acpi_pcihp_pci_status[bsel].down;
+        ACPI_PCIHP_DPRINTF("pci_down_read %" PRIu32 "\n", val);
+        break;
+    case PCI_EJ_BASE - PCI_HOTPLUG_ADDR:
+        /* No feature defined yet */
+        ACPI_PCIHP_DPRINTF("pci_features_read %" PRIu32 "\n", val);
+        break;
+    case PCI_RMV_BASE - PCI_HOTPLUG_ADDR:
+        val = s->acpi_pcihp_pci_status[bsel].hotplug_enable;
+        ACPI_PCIHP_DPRINTF("pci_rmv_read %" PRIu32 "\n", val);
+        break;
+    case PCI_SEL_BASE - PCI_HOTPLUG_ADDR:
+        val = s->hotplug_select;
+        ACPI_PCIHP_DPRINTF("pci_sel_read %" PRIu32 "\n", val);
+    default:
+        break;
+    }
+
+    return val;
+}
+
+static void pci_write(void *opaque, hwaddr addr, uint64_t data,
+                      unsigned int size)
+{
+    AcpiPciHpState *s = opaque;
+    switch (addr) {
+    case PCI_EJ_BASE - PCI_HOTPLUG_ADDR:
+        if (s->hotplug_select >= ACPI_PCIHP_MAX_HOTPLUG_BUS) {
+            break;
+        }
+        acpi_pcihp_eject_slot(s, s->hotplug_select, data);
+        ACPI_PCIHP_DPRINTF("pciej write %" HWADDR_PRIx " <== %" PRIu64 "\n",
+                      addr, data);
+        break;
+    case PCI_SEL_BASE - PCI_HOTPLUG_ADDR:
+        s->hotplug_select = data;
+        ACPI_PCIHP_DPRINTF("pcisel write %" HWADDR_PRIx " <== %" PRIu64 "\n",
+                      addr, data);
+    default:
+        break;
+    }
+}
+
+static const MemoryRegionOps acpi_pcihp_io_ops = {
+    .read = pci_read,
+    .write = pci_write,
+    .endianness = DEVICE_LITTLE_ENDIAN,
+    .valid = {
+        .min_access_size = 4,
+        .max_access_size = 4,
+    },
+};
+
+void acpi_pcihp_init(AcpiPciHpState *s, PCIBus *root_bus,
+                     MemoryRegion *address_space_io)
+{
+    s->root= root_bus;
+    memory_region_init_io(&s->io, NULL, &acpi_pcihp_io_ops, s,
+                          "acpi-pci-hotplug",
+                          PCI_HOTPLUG_SIZE);
+    memory_region_add_subregion(address_space_io, PCI_HOTPLUG_ADDR, &s->io);
+}
+
+const VMStateDescription vmstate_acpi_pcihp_pci_status = {
+    .name = "acpi_pcihp_pci_status",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .minimum_version_id_old = 1,
+    .fields      = (VMStateField []) {
+        VMSTATE_UINT32(up, AcpiPciHpPciStatus),
+        VMSTATE_UINT32(down, AcpiPciHpPciStatus),
+        VMSTATE_END_OF_LIST()
+    }
+};
diff --git a/hw/acpi/piix4.c b/hw/acpi/piix4.c
index 20353b983e..5d55a3c222 100644
--- a/hw/acpi/piix4.c
+++ b/hw/acpi/piix4.c
@@ -30,6 +30,8 @@
 #include "hw/nvram/fw_cfg.h"
 #include "exec/address-spaces.h"
 #include "hw/acpi/piix4.h"
+#include "hw/acpi/pcihp.h"
+#include "hw/acpi/cpu_hotplug.h"
 
 //#define DEBUG
 
@@ -49,21 +51,13 @@
 #define PCI_EJ_BASE 0xae08
 #define PCI_RMV_BASE 0xae0c
 
-#define PIIX4_PROC_BASE 0xaf00
-#define PIIX4_PROC_LEN 32
-
 #define PIIX4_PCI_HOTPLUG_STATUS 2
-#define PIIX4_CPU_HOTPLUG_STATUS 4
 
 struct pci_status {
     uint32_t up; /* deprecated, maintained for migration compatibility */
     uint32_t down;
 };
 
-typedef struct CPUStatus {
-    uint8_t sts[PIIX4_PROC_LEN];
-} CPUStatus;
-
 typedef struct PIIX4PMState {
     /*< private >*/
     PCIDevice parent_obj;
@@ -73,8 +67,6 @@ typedef struct PIIX4PMState {
     uint32_t io_base;
 
     MemoryRegion io_gpe;
-    MemoryRegion io_pci;
-    MemoryRegion io_cpu;
     ACPIREGS ar;
 
     APMState apm;
@@ -88,16 +80,21 @@ typedef struct PIIX4PMState {
     Notifier machine_ready;
     Notifier powerdown_notifier;
 
-    /* for pci hotplug */
+    /* for legacy pci hotplug (compatible with qemu 1.6 and older) */
+    MemoryRegion io_pci;
     struct pci_status pci0_status;
     uint32_t pci0_hotplug_enable;
     uint32_t pci0_slot_device_present;
 
+    /* for new pci hotplug (with PCI2PCI bridge support) */
+    AcpiPciHpState acpi_pci_hotplug;
+    bool use_acpi_pci_hotplug;
+
     uint8_t disable_s3;
     uint8_t disable_s4;
     uint8_t s4_val;
 
-    CPUStatus gpe_cpu;
+    AcpiCpuHotplug gpe_cpu;
     Notifier cpu_added_notifier;
 } PIIX4PMState;
 
@@ -263,6 +260,18 @@ static int acpi_load_old(QEMUFile *f, void *opaque, int version_id)
     return ret;
 }
 
+static bool vmstate_test_use_acpi_pci_hotplug(void *opaque, int version_id)
+{
+    PIIX4PMState *s = opaque;
+    return s->use_acpi_pci_hotplug;
+}
+
+static bool vmstate_test_no_use_acpi_pci_hotplug(void *opaque, int version_id)
+{
+    PIIX4PMState *s = opaque;
+    return !s->use_acpi_pci_hotplug;
+}
+
 /* qemu-kvm 1.2 uses version 3 but advertised as 2
  * To support incoming qemu-kvm 1.2 migration, change version_id
  * and minimum_version_id to 2 below (which breaks migration from
@@ -285,8 +294,12 @@ static const VMStateDescription vmstate_acpi = {
         VMSTATE_TIMER(ar.tmr.timer, PIIX4PMState),
         VMSTATE_INT64(ar.tmr.overflow_time, PIIX4PMState),
         VMSTATE_STRUCT(ar.gpe, PIIX4PMState, 2, vmstate_gpe, ACPIGPE),
-        VMSTATE_STRUCT(pci0_status, PIIX4PMState, 2, vmstate_pci_status,
-                       struct pci_status),
+        VMSTATE_STRUCT_TEST(pci0_status, PIIX4PMState,
+                            vmstate_test_no_use_acpi_pci_hotplug,
+                            2, vmstate_pci_status,
+                            struct pci_status),
+        VMSTATE_PCI_HOTPLUG(acpi_pci_hotplug, PIIX4PMState,
+                            vmstate_test_use_acpi_pci_hotplug),
         VMSTATE_END_OF_LIST()
     }
 };
@@ -364,7 +377,11 @@ static void piix4_reset(void *opaque)
         pci_conf[0x5B] = 0x02;
     }
     pm_io_space_update(s);
-    piix4_update_hotplug(s);
+    if (s->use_acpi_pci_hotplug) {
+        acpi_pcihp_reset(&s->acpi_pci_hotplug);
+    } else {
+        piix4_update_hotplug(s);
+    }
 }
 
 static void piix4_pm_powerdown_req(Notifier *n, void *opaque)
@@ -375,6 +392,26 @@ static void piix4_pm_powerdown_req(Notifier *n, void *opaque)
     acpi_pm1_evt_power_down(&s->ar);
 }
 
+static int piix4_acpi_pci_hotplug(DeviceState *qdev, PCIDevice *dev,
+                                  PCIHotplugState state)
+{
+    PIIX4PMState *s = PIIX4_PM(qdev);
+    int ret = acpi_pcihp_device_hotplug(&s->acpi_pci_hotplug, dev, state);
+    if (ret < 0) {
+        return ret;
+    }
+    s->ar.gpe.sts[0] |= PIIX4_PCI_HOTPLUG_STATUS;
+
+    acpi_update_sci(&s->ar, s->irq);
+    return 0;
+}
+
+static void piix4_update_bus_hotplug(PCIBus *bus, void *opaque)
+{
+    PIIX4PMState *s = opaque;
+    pci_bus_hotplug(bus, piix4_acpi_pci_hotplug, DEVICE(s));
+}
+
 static void piix4_pm_machine_ready(Notifier *n, void *opaque)
 {
     PIIX4PMState *s = container_of(n, PIIX4PMState, machine_ready);
@@ -388,6 +425,10 @@ static void piix4_pm_machine_ready(Notifier *n, void *opaque)
     pci_conf[0x63] = 0x60;
     pci_conf[0x67] = (memory_region_present(io_as, 0x3f8) ? 0x08 : 0) |
         (memory_region_present(io_as, 0x2f8) ? 0x90 : 0);
+
+    if (s->use_acpi_pci_hotplug) {
+        pci_for_each_bus(d->bus, piix4_update_bus_hotplug, s);
+    }
 }
 
 static void piix4_pm_add_propeties(PIIX4PMState *s)
@@ -509,6 +550,8 @@ static Property piix4_pm_properties[] = {
     DEFINE_PROP_UINT8(ACPI_PM_PROP_S3_DISABLED, PIIX4PMState, disable_s3, 0),
     DEFINE_PROP_UINT8(ACPI_PM_PROP_S4_DISABLED, PIIX4PMState, disable_s4, 0),
     DEFINE_PROP_UINT8(ACPI_PM_PROP_S4_VAL, PIIX4PMState, s4_val, 2),
+    DEFINE_PROP_BOOL("acpi-pci-hotplug-with-bridge-support", PIIX4PMState,
+                     use_acpi_pci_hotplug, true),
     DEFINE_PROP_END_OF_LIST(),
 };
 
@@ -632,61 +675,13 @@ static const MemoryRegionOps piix4_pci_ops = {
     },
 };
 
-static uint64_t cpu_status_read(void *opaque, hwaddr addr, unsigned int size)
-{
-    PIIX4PMState *s = opaque;
-    CPUStatus *cpus = &s->gpe_cpu;
-    uint64_t val = cpus->sts[addr];
-
-    return val;
-}
-
-static void cpu_status_write(void *opaque, hwaddr addr, uint64_t data,
-                             unsigned int size)
-{
-    /* TODO: implement VCPU removal on guest signal that CPU can be removed */
-}
-
-static const MemoryRegionOps cpu_hotplug_ops = {
-    .read = cpu_status_read,
-    .write = cpu_status_write,
-    .endianness = DEVICE_LITTLE_ENDIAN,
-    .valid = {
-        .min_access_size = 1,
-        .max_access_size = 1,
-    },
-};
-
-typedef enum {
-    PLUG,
-    UNPLUG,
-} HotplugEventType;
-
-static void piix4_cpu_hotplug_req(PIIX4PMState *s, CPUState *cpu,
-                                  HotplugEventType action)
-{
-    CPUStatus *g = &s->gpe_cpu;
-    ACPIGPE *gpe = &s->ar.gpe;
-    CPUClass *k = CPU_GET_CLASS(cpu);
-    int64_t cpu_id;
-
-    assert(s != NULL);
-
-    *gpe->sts = *gpe->sts | PIIX4_CPU_HOTPLUG_STATUS;
-    cpu_id = k->get_arch_id(CPU(cpu));
-    if (action == PLUG) {
-        g->sts[cpu_id / 8] |= (1 << (cpu_id % 8));
-    } else {
-        g->sts[cpu_id / 8] &= ~(1 << (cpu_id % 8));
-    }
-    acpi_update_sci(&s->ar, s->irq);
-}
-
 static void piix4_cpu_added_req(Notifier *n, void *opaque)
 {
     PIIX4PMState *s = container_of(n, PIIX4PMState, cpu_added_notifier);
 
-    piix4_cpu_hotplug_req(s, CPU(opaque), PLUG);
+    assert(s != NULL);
+    AcpiCpuHotplug_add(&s->ar.gpe, &s->gpe_cpu, CPU(opaque));
+    acpi_update_sci(&s->ar, s->irq);
 }
 
 static int piix4_device_hotplug(DeviceState *qdev, PCIDevice *dev,
@@ -695,28 +690,22 @@ static int piix4_device_hotplug(DeviceState *qdev, PCIDevice *dev,
 static void piix4_acpi_system_hot_add_init(MemoryRegion *parent,
                                            PCIBus *bus, PIIX4PMState *s)
 {
-    CPUState *cpu;
-
     memory_region_init_io(&s->io_gpe, OBJECT(s), &piix4_gpe_ops, s,
                           "acpi-gpe0", GPE_LEN);
     memory_region_add_subregion(parent, GPE_BASE, &s->io_gpe);
 
-    memory_region_init_io(&s->io_pci, OBJECT(s), &piix4_pci_ops, s,
-                          "acpi-pci-hotplug", PCI_HOTPLUG_SIZE);
-    memory_region_add_subregion(parent, PCI_HOTPLUG_ADDR,
-                                &s->io_pci);
-    pci_bus_hotplug(bus, piix4_device_hotplug, DEVICE(s));
-
-    CPU_FOREACH(cpu) {
-        CPUClass *cc = CPU_GET_CLASS(cpu);
-        int64_t id = cc->get_arch_id(cpu);
-
-        g_assert((id / 8) < PIIX4_PROC_LEN);
-        s->gpe_cpu.sts[id / 8] |= (1 << (id % 8));
+    if (s->use_acpi_pci_hotplug) {
+        acpi_pcihp_init(&s->acpi_pci_hotplug, bus, parent);
+    } else {
+        memory_region_init_io(&s->io_pci, OBJECT(s), &piix4_pci_ops, s,
+                              "acpi-pci-hotplug", PCI_HOTPLUG_SIZE);
+        memory_region_add_subregion(parent, PCI_HOTPLUG_ADDR,
+                                    &s->io_pci);
+        pci_bus_hotplug(bus, piix4_device_hotplug, DEVICE(s));
     }
-    memory_region_init_io(&s->io_cpu, OBJECT(s), &cpu_hotplug_ops, s,
-                          "acpi-cpu-hotplug", PIIX4_PROC_LEN);
-    memory_region_add_subregion(parent, PIIX4_PROC_BASE, &s->io_cpu);
+
+    AcpiCpuHotplug_init(parent, OBJECT(s), &s->gpe_cpu,
+                        PIIX4_CPU_HOTPLUG_IO_BASE);
     s->cpu_added_notifier.notify = piix4_cpu_added_req;
     qemu_register_cpu_added_notifier(&s->cpu_added_notifier);
 }
diff --git a/hw/arm/boot.c b/hw/arm/boot.c
index 1c1b0e5258..4036262f50 100644
--- a/hw/arm/boot.c
+++ b/hw/arm/boot.c
@@ -173,6 +173,11 @@ static void default_reset_secondary(ARMCPU *cpu,
     env->regs[15] = info->smp_loader_start;
 }
 
+static inline bool have_dtb(const struct arm_boot_info *info)
+{
+    return info->dtb_filename || info->get_dtb;
+}
+
 #define WRITE_WORD(p, value) do { \
     stl_phys_notdirty(p, value);  \
     p += 4;                       \
@@ -421,7 +426,7 @@ static void do_cpu_reset(void *opaque)
                     env->regs[15] = info->loader_start;
                 }
 
-                if (!info->dtb_filename) {
+                if (!have_dtb(info)) {
                     if (old_param) {
                         set_kernel_args_old(info);
                     } else {
@@ -542,7 +547,7 @@ void arm_load_kernel(ARMCPU *cpu, struct arm_boot_info *info)
         /* for device tree boot, we pass the DTB directly in r2. Otherwise
          * we point to the kernel args.
          */
-        if (info->dtb_filename || info->get_dtb) {
+        if (have_dtb(info)) {
             /* Place the DTB after the initrd in memory. Note that some
              * kernels will trash anything in the 4K page the initrd
              * ends in, so make sure the DTB isn't caught up in that.
diff --git a/hw/arm/xilinx_zynq.c b/hw/arm/xilinx_zynq.c
index 98e0958a77..9ee21e726a 100644
--- a/hw/arm/xilinx_zynq.c
+++ b/hw/arm/xilinx_zynq.c
@@ -37,6 +37,7 @@
 #define IRQ_OFFSET 32 /* pic interrupts start from index 32 */
 
 #define MPCORE_PERIPHBASE 0xF8F00000
+#define ZYNQ_BOARD_MIDR 0x413FC090
 
 static const int dma_irqs[8] = {
     46, 47, 48, 49, 72, 73, 74, 75
@@ -125,6 +126,12 @@ static void zynq_init(QEMUMachineInitArgs *args)
 
     cpu = ARM_CPU(object_new(object_class_get_name(cpu_oc)));
 
+    object_property_set_int(OBJECT(cpu), ZYNQ_BOARD_MIDR, "midr", &err);
+    if (err) {
+        error_report("%s", error_get_pretty(err));
+        exit(1);
+    }
+
     object_property_set_int(OBJECT(cpu), MPCORE_PERIPHBASE, "reset-cbar", &err);
     if (err) {
         error_report("%s", error_get_pretty(err));
diff --git a/hw/display/blizzard_template.h b/hw/display/blizzard_template.h
index a8a8899478..b7ef27c808 100644
--- a/hw/display/blizzard_template.h
+++ b/hw/display/blizzard_template.h
@@ -18,25 +18,35 @@
  * with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
-#define SKIP_PIXEL(to)		to += deststep
+#define SKIP_PIXEL(to)         (to += deststep)
 #if DEPTH == 8
-# define PIXEL_TYPE		uint8_t
-# define COPY_PIXEL(to, from)	*to = from; SKIP_PIXEL(to)
-# define COPY_PIXEL1(to, from)	*to ++ = from
+# define PIXEL_TYPE            uint8_t
+# define COPY_PIXEL(to, from)  do { *to = from; SKIP_PIXEL(to); } while (0)
+# define COPY_PIXEL1(to, from) (*to++ = from)
 #elif DEPTH == 15 || DEPTH == 16
-# define PIXEL_TYPE		uint16_t
-# define COPY_PIXEL(to, from)	*to = from; SKIP_PIXEL(to)
-# define COPY_PIXEL1(to, from)	*to ++ = from
+# define PIXEL_TYPE            uint16_t
+# define COPY_PIXEL(to, from)  do { *to = from; SKIP_PIXEL(to); } while (0)
+# define COPY_PIXEL1(to, from) (*to++ = from)
 #elif DEPTH == 24
-# define PIXEL_TYPE		uint8_t
-# define COPY_PIXEL(to, from)	\
-    to[0] = from; to[1] = (from) >> 8; to[2] = (from) >> 16; SKIP_PIXEL(to)
-# define COPY_PIXEL1(to, from)	\
-    *to ++ = from; *to ++ = (from) >> 8; *to ++ = (from) >> 16
+# define PIXEL_TYPE            uint8_t
+# define COPY_PIXEL(to, from) \
+    do {                      \
+        to[0] = from;         \
+        to[1] = (from) >> 8;  \
+        to[2] = (from) >> 16; \
+        SKIP_PIXEL(to);       \
+    } while (0)
+
+# define COPY_PIXEL1(to, from) \
+    do {                       \
+        *to++ = from;          \
+        *to++ = (from) >> 8;   \
+        *to++ = (from) >> 16;  \
+    } while (0)
 #elif DEPTH == 32
-# define PIXEL_TYPE		uint32_t
-# define COPY_PIXEL(to, from)	*to = from; SKIP_PIXEL(to)
-# define COPY_PIXEL1(to, from)	*to ++ = from
+# define PIXEL_TYPE            uint32_t
+# define COPY_PIXEL(to, from)  do { *to = from; SKIP_PIXEL(to); } while (0)
+# define COPY_PIXEL1(to, from) (*to++ = from)
 #else
 # error unknown bit depth
 #endif
diff --git a/hw/display/pl110_template.h b/hw/display/pl110_template.h
index e738e4a241..36ba791c6f 100644
--- a/hw/display/pl110_template.h
+++ b/hw/display/pl110_template.h
@@ -14,12 +14,16 @@
 #if BITS == 8
 #define COPY_PIXEL(to, from) *(to++) = from
 #elif BITS == 15 || BITS == 16
-#define COPY_PIXEL(to, from) *(uint16_t *)to = from; to += 2;
+#define COPY_PIXEL(to, from) do { *(uint16_t *)to = from; to += 2; } while (0)
 #elif BITS == 24
-#define COPY_PIXEL(to, from) \
-  *(to++) = from; *(to++) = (from) >> 8; *(to++) = (from) >> 16
+#define COPY_PIXEL(to, from)    \
+    do {                        \
+        *(to++) = from;         \
+        *(to++) = (from) >> 8;  \
+        *(to++) = (from) >> 16; \
+    } while (0)
 #elif BITS == 32
-#define COPY_PIXEL(to, from) *(uint32_t *)to = from; to += 4;
+#define COPY_PIXEL(to, from) do { *(uint32_t *)to = from; to += 4; } while (0)
 #else
 #error unknown bit depth
 #endif
diff --git a/hw/display/pxa2xx_template.h b/hw/display/pxa2xx_template.h
index 1cbe36cb80..c64eebc4b6 100644
--- a/hw/display/pxa2xx_template.h
+++ b/hw/display/pxa2xx_template.h
@@ -11,14 +11,26 @@
 
 # define SKIP_PIXEL(to)		to += deststep
 #if BITS == 8
-# define COPY_PIXEL(to, from)	*to = from; SKIP_PIXEL(to)
+# define COPY_PIXEL(to, from)  do { *to = from; SKIP_PIXEL(to); } while (0)
 #elif BITS == 15 || BITS == 16
-# define COPY_PIXEL(to, from)	*(uint16_t *) to = from; SKIP_PIXEL(to)
+# define COPY_PIXEL(to, from)    \
+    do {                         \
+        *(uint16_t *) to = from; \
+        SKIP_PIXEL(to);          \
+    } while (0)
 #elif BITS == 24
-# define COPY_PIXEL(to, from)	\
-	*(uint16_t *) to = from; *(to + 2) = (from) >> 16; SKIP_PIXEL(to)
+# define COPY_PIXEL(to, from)     \
+    do {                          \
+        *(uint16_t *) to = from;  \
+        *(to + 2) = (from) >> 16; \
+        SKIP_PIXEL(to);           \
+    } while (0)
 #elif BITS == 32
-# define COPY_PIXEL(to, from)	*(uint32_t *) to = from; SKIP_PIXEL(to)
+# define COPY_PIXEL(to, from)    \
+    do {                         \
+        *(uint32_t *) to = from; \
+        SKIP_PIXEL(to);          \
+    } while (0)
 #else
 # error unknown bit depth
 #endif
diff --git a/hw/display/tc6393xb_template.h b/hw/display/tc6393xb_template.h
index 154aafd400..78629c07f9 100644
--- a/hw/display/tc6393xb_template.h
+++ b/hw/display/tc6393xb_template.h
@@ -22,14 +22,18 @@
  */
 
 #if BITS == 8
-# define SET_PIXEL(addr, color)	*(uint8_t*)addr = color;
+# define SET_PIXEL(addr, color)  (*(uint8_t *)addr = color)
 #elif BITS == 15 || BITS == 16
-# define SET_PIXEL(addr, color)	*(uint16_t*)addr = color;
+# define SET_PIXEL(addr, color)  (*(uint16_t *)addr = color)
 #elif BITS == 24
-# define SET_PIXEL(addr, color)	\
-    addr[0] = color; addr[1] = (color) >> 8; addr[2] = (color) >> 16;
+# define SET_PIXEL(addr, color)  \
+    do {                         \
+        addr[0] = color;         \
+        addr[1] = (color) >> 8;  \
+        addr[2] = (color) >> 16; \
+    } while (0)
 #elif BITS == 32
-# define SET_PIXEL(addr, color)	*(uint32_t*)addr = color;
+# define SET_PIXEL(addr, color)  (*(uint32_t *)addr = color)
 #else
 # error unknown bit depth
 #endif
diff --git a/hw/display/xenfb.c b/hw/display/xenfb.c
index f0333a0cad..cb9d456814 100644
--- a/hw/display/xenfb.c
+++ b/hw/display/xenfb.c
@@ -495,7 +495,7 @@ static int xenfb_map_fb(struct XenFB *xenfb)
     munmap(map, n_fbdirs * XC_PAGE_SIZE);
 
     xenfb->pixels = xc_map_foreign_pages(xen_xc, xenfb->c.xendev.dom,
-					 PROT_READ | PROT_WRITE, fbmfns, xenfb->fbpages);
+            PROT_READ, fbmfns, xenfb->fbpages);
     if (xenfb->pixels == NULL)
 	goto out;
 
@@ -903,6 +903,11 @@ static void fb_disconnect(struct XenDevice *xendev)
     fb->pixels = mmap(fb->pixels, fb->fbpages * XC_PAGE_SIZE,
                       PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON,
                       -1, 0);
+    if (fb->pixels == MAP_FAILED) {
+        xen_be_printf(xendev, 0,
+                "Couldn't replace the framebuffer with anonymous memory errno=%d\n",
+                errno);
+    }
     common_unbind(&fb->c);
     fb->feature_update = 0;
     fb->bug_trigger    = 0;
diff --git a/hw/i386/Makefile.objs b/hw/i386/Makefile.objs
index 09ac433cf9..3df1612651 100644
--- a/hw/i386/Makefile.objs
+++ b/hw/i386/Makefile.objs
@@ -17,7 +17,7 @@ iasl-option=$(shell if test -z "`$(1) $(2) 2>&1 > /dev/null`" \
 ifdef IASL
 #IASL Present. Generate hex files from .dsl
 hw/i386/%.hex: $(SRC_PATH)/hw/i386/%.dsl $(SRC_PATH)/scripts/acpi_extract_preprocess.py $(SRC_PATH)/scripts/acpi_extract.py
-	$(call quiet-command, cpp -P $< -o $*.dsl.i.orig, "  CPP $(TARGET_DIR)$*.dsl.i.orig")
+	$(call quiet-command, cpp -P $(QEMU_DGFLAGS) $(QEMU_INCLUDES) $< -o $*.dsl.i.orig, "  CPP $(TARGET_DIR)$*.dsl.i.orig")
 	$(call quiet-command, $(PYTHON) $(SRC_PATH)/scripts/acpi_extract_preprocess.py $*.dsl.i.orig > $*.dsl.i, "  ACPI_PREPROCESS $(TARGET_DIR)$*.dsl.i")
 	$(call quiet-command, $(IASL) $(call iasl-option,$(IASL),-Pn,) -vs -l -tc -p $* $*.dsl.i $(if $(V), , > /dev/null) 2>&1 ,"  IASL $(TARGET_DIR)$*.dsl.i")
 	$(call quiet-command, $(PYTHON) $(SRC_PATH)/scripts/acpi_extract.py $*.lst > $*.off, "  ACPI_EXTRACT $(TARGET_DIR)$*.off")
diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index 48312f5a83..50e83f3b46 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -36,9 +36,11 @@
 #include "hw/nvram/fw_cfg.h"
 #include "bios-linker-loader.h"
 #include "hw/loader.h"
+#include "hw/isa/isa.h"
 
 /* Supported chipsets: */
 #include "hw/acpi/piix4.h"
+#include "hw/acpi/pcihp.h"
 #include "hw/i386/ich9.h"
 #include "hw/pci/pci_bus.h"
 #include "hw/pci-host/q35.h"
@@ -78,8 +80,15 @@ typedef struct AcpiMiscInfo {
     uint16_t pvpanic_port;
 } AcpiMiscInfo;
 
+typedef struct AcpiBuildPciBusHotplugState { /* per-bus AML build state */
+    GArray *device_table; /* descriptions appended by this bus's child buses */
+    GArray *notify_table; /* PCNT invocations appended by child buses */
+    struct AcpiBuildPciBusHotplugState *parent; /* NULL for the top-level state */
+} AcpiBuildPciBusHotplugState;
+
 static void acpi_get_dsdt(AcpiMiscInfo *info)
 {
+    uint16_t *applesmc_sta;
     Object *piix = piix4_pm_find();
     Object *lpc = ich9_lpc_find();
     assert(!!piix != !!lpc);
@@ -87,11 +96,17 @@ static void acpi_get_dsdt(AcpiMiscInfo *info)
     if (piix) {
         info->dsdt_code = AcpiDsdtAmlCode;
         info->dsdt_size = sizeof AcpiDsdtAmlCode;
+        applesmc_sta = piix_dsdt_applesmc_sta;
     }
     if (lpc) {
         info->dsdt_code = Q35AcpiDsdtAmlCode;
         info->dsdt_size = sizeof Q35AcpiDsdtAmlCode;
+        applesmc_sta = q35_dsdt_applesmc_sta;
     }
+
+    /* Patch in appropriate value for AppleSMC _STA */
+    *(uint8_t *)(info->dsdt_code + *applesmc_sta) =
+        applesmc_find() ? 0x0b : 0x00;
 }
 
 static
@@ -171,38 +186,6 @@ static void acpi_get_pm_info(AcpiPmInfo *pm)
                                                NULL);
 }
 
-static void acpi_get_hotplug_info(AcpiMiscInfo *misc)
-{
-    int i;
-    PCIBus *bus = find_i440fx();
-
-    if (!bus) {
-        /* Only PIIX supports ACPI hotplug */
-        memset(misc->slot_hotplug_enable, 0, sizeof misc->slot_hotplug_enable);
-        return;
-    }
-
-    memset(misc->slot_hotplug_enable, 0xff,
-           DIV_ROUND_UP(PCI_SLOT_MAX, BITS_PER_BYTE));
-
-    for (i = 0; i < ARRAY_SIZE(bus->devices); ++i) {
-        PCIDeviceClass *pc;
-        PCIDevice *pdev = bus->devices[i];
-
-        if (!pdev) {
-            continue;
-        }
-
-        pc = PCI_DEVICE_GET_CLASS(pdev);
-
-        if (pc->no_hotplug) {
-            int slot = PCI_SLOT(i);
-
-            clear_bit(slot, misc->slot_hotplug_enable);
-        }
-    }
-}
-
 static void acpi_get_misc_info(AcpiMiscInfo *info)
 {
     info->has_hpet = hpet_find();
@@ -368,6 +351,12 @@ static void build_package(GArray *package, uint8_t op, unsigned min_bytes)
     build_prepend_byte(package, op);
 }
 
+static void build_extop_package(GArray *package, uint8_t op)
+{
+    build_package(package, op, 1); /* package under op first ... */
+    build_prepend_byte(package, 0x5B); /* ExtOpPrefix */ /* ... then prefix it */
+}
+
 static void build_append_value(GArray *table, uint32_t value, int size)
 {
     uint8_t prefix;
@@ -394,8 +383,44 @@ static void build_append_value(GArray *table, uint32_t value, int size)
     }
 }
 
-static void build_append_notify_target(GArray *method, GArray *target_name,
-                                       uint32_t value, int size)
+static void build_append_int(GArray *table, uint32_t value)
+{
+    if (value == 0x00) { /* pick the shortest encoding that can hold value */
+        build_append_byte(table, 0x00); /* ZeroOp */
+    } else if (value == 0x01) {
+        build_append_byte(table, 0x01); /* OneOp */
+    } else if (value <= 0xFF) {
+        build_append_value(table, value, 1);
+    } else if (value <= 0xFFFF) { /* 16-bit bound: size 2 cannot hold more */
+        build_append_value(table, value, 2);
+    } else {
+        build_append_value(table, value, 4); /* everything else needs size 4 */
+    }
+}
+
+static GArray *build_alloc_method(const char *name, uint8_t arg_count)
+{
+    GArray *method = build_alloc_array(); /* fresh array, returned to caller */
+
+    build_append_nameseg(method, "%s", name); /* contents start with the name */
+    build_append_byte(method, arg_count); /* MethodFlags: ArgCount */
+
+    return method; /* caller appends the method contents after the flags byte */
+}
+
+static void build_append_and_cleanup_method(GArray *device, GArray *method)
+{
+    uint8_t op = 0x14; /* MethodOp */
+
+    build_package(method, op, 0); /* package the accumulated method contents */
+
+    build_append_array(device, method); /* add the packaged method to device */
+    build_free_array(method); /* method is freed here; do not use it afterwards */
+}
+
+static void build_append_notify_target_ifequal(GArray *method,
+                                               GArray *target_name,
+                                               uint32_t value, int size)
 {
     GArray *notify = build_alloc_array();
     uint8_t op = 0xA0; /* IfOp */
@@ -415,6 +440,7 @@ static void build_append_notify_target(GArray *method, GArray *target_name,
     build_free_array(notify);
 }
 
+/* End here */
 #define ACPI_PORT_SMI_CMD           0x00b2 /* TODO: this is APM_CNT_IOPORT */
 
 static inline void *acpi_data_push(GArray *table_data, unsigned size)
@@ -624,44 +650,236 @@ static inline char acpi_get_hex(uint32_t val)
 #include "hw/i386/ssdt-pcihp.hex"
 
 static void
-build_append_notify(GArray *device, const char *name,
-                    const char *format, int skip, int count)
+build_append_notify_method(GArray *device, const char *name,
+                           const char *format, int count)
 {
     int i;
-    GArray *method = build_alloc_array();
-    uint8_t op = 0x14; /* MethodOp */
+    GArray *method = build_alloc_method(name, 2);
 
-    build_append_nameseg(method, "%s", name);
-    build_append_byte(method, 0x02); /* MethodFlags: ArgCount */
-    for (i = skip; i < count; i++) {
+    for (i = 0; i < count; i++) {
         GArray *target = build_alloc_array();
         build_append_nameseg(target, format, i);
         assert(i < 256); /* Fits in 1 byte */
-        build_append_notify_target(method, target, i, 1);
+        build_append_notify_target_ifequal(method, target, i, 1);
         build_free_array(target);
     }
-    build_package(method, op, 2);
 
-    build_append_array(device, method);
-    build_free_array(method);
+    build_append_and_cleanup_method(device, method);
 }
 
-static void patch_pcihp(int slot, uint8_t *ssdt_ptr, uint32_t eject)
+static void patch_pcihp(int slot, uint8_t *ssdt_ptr)
 {
-    ssdt_ptr[ACPI_PCIHP_OFFSET_HEX] = acpi_get_hex(slot >> 4);
-    ssdt_ptr[ACPI_PCIHP_OFFSET_HEX + 1] = acpi_get_hex(slot);
+    unsigned devfn = PCI_DEVFN(slot, 0);
+
+    ssdt_ptr[ACPI_PCIHP_OFFSET_HEX] = acpi_get_hex(devfn >> 4);
+    ssdt_ptr[ACPI_PCIHP_OFFSET_HEX + 1] = acpi_get_hex(devfn);
     ssdt_ptr[ACPI_PCIHP_OFFSET_ID] = slot;
     ssdt_ptr[ACPI_PCIHP_OFFSET_ADR + 2] = slot;
+}
+
+/* Assign a BSEL property to every bus that allows hotplug, numbering the
+ * buses in visit order with the counter that @opaque points to.
+ */
+static void *acpi_set_bsel(PCIBus *bus, void *opaque)
+{
+    unsigned *bsel_alloc = opaque; /* shared counter: next BSEL to hand out */
+    unsigned *bus_bsel;
+
+    if (bus->qbus.allow_hotplug) {
+        bus_bsel = g_malloc(sizeof *bus_bsel); /* storage the property refers to */
 
-    /* Runtime patching of ACPI_EJ0: to disable hotplug for a slot,
-     * replace the method name: _EJ0 by ACPI_EJ0_.
+        *bus_bsel = (*bsel_alloc)++;
+        object_property_add_uint32_ptr(OBJECT(bus), ACPI_PCIHP_PROP_BSEL,
+                                       bus_bsel, NULL);
+    }
+
+    return bsel_alloc;
+}
+
+static void acpi_set_pci_info(void)
+{
+    PCIBus *bus = find_i440fx(); /* TODO: Q35 support */
+    unsigned bsel_alloc = 0; /* BSEL counter advanced by acpi_set_bsel() */
+
+    if (bus) { /* nothing is assigned when find_i440fx() returns NULL */
+        /* Scan all PCI buses. Set property to enable acpi based hotplug. */
+        pci_for_each_bus_depth_first(bus, acpi_set_bsel, NULL, &bsel_alloc);
+    }
+}
+
+static void build_pci_bus_state_init(AcpiBuildPciBusHotplugState *state,
+                                     AcpiBuildPciBusHotplugState *parent)
+{
+    state->parent = parent; /* stored as given; may be NULL */
+    state->device_table = build_alloc_array(); /* both arrays are released ... */
+    state->notify_table = build_alloc_array(); /* ... by the matching cleanup */
+}
+
+static void build_pci_bus_state_cleanup(AcpiBuildPciBusHotplugState *state)
+{
+    build_free_array(state->device_table); /* frees the two arrays only; */
+    build_free_array(state->notify_table); /* state itself is left to the caller */
+}
+
+static void *build_pci_bus_begin(PCIBus *bus, void *parent_state)
+{
+    AcpiBuildPciBusHotplugState *parent = parent_state;
+    AcpiBuildPciBusHotplugState *child = g_malloc(sizeof *child); /* heap state */
+
+    build_pci_bus_state_init(child, parent); /* link the new state to its parent */
+
+    return child; /* build_pci_bus_end() g_free()s a state it is handed */
+}
+
+/* End-of-bus callback: emit this bus's hotplug AML into its parent's tables. */
+static void build_pci_bus_end(PCIBus *bus, void *bus_state)
+{
+    AcpiBuildPciBusHotplugState *child = bus_state;
+    AcpiBuildPciBusHotplugState *parent = child->parent;
+    GArray *bus_table = build_alloc_array();
+    DECLARE_BITMAP(slot_hotplug_enable, PCI_SLOT_MAX);
+    uint8_t op;
+    int i;
+    QObject *bsel;
+    GArray *method;
+    bool bus_hotplug_support = false;
+
+    if (bus->parent_dev) {
+        op = 0x82; /* DeviceOp */
+        build_append_nameseg(bus_table, "S%.02X_",
+                             bus->parent_dev->devfn);
+        build_append_byte(bus_table, 0x08); /* NameOp */
+        build_append_nameseg(bus_table, "_SUN");
+        build_append_value(bus_table, PCI_SLOT(bus->parent_dev->devfn), 1);
+        build_append_byte(bus_table, 0x08); /* NameOp */
+        build_append_nameseg(bus_table, "_ADR");
+        build_append_value(bus_table, (PCI_SLOT(bus->parent_dev->devfn) << 16) |
+                           PCI_FUNC(bus->parent_dev->devfn), 4);
+    } else {
+        op = 0x10; /* ScopeOp */
+        build_append_nameseg(bus_table, "PCI0");
+    }
+
+    bsel = object_property_get_qobject(OBJECT(bus), ACPI_PCIHP_PROP_BSEL, NULL);
+    if (bsel) {
+        build_append_byte(bus_table, 0x08); /* NameOp */
+        build_append_nameseg(bus_table, "BSEL");
+        build_append_int(bus_table, qint_get_int(qobject_to_qint(bsel)));
+
+        memset(slot_hotplug_enable, 0xff, sizeof slot_hotplug_enable);
+
+        for (i = 0; i < ARRAY_SIZE(bus->devices); ++i) {
+            PCIDeviceClass *pc;
+            PCIDevice *pdev = bus->devices[i];
+
+            if (!pdev) {
+                continue;
+            }
+
+            pc = PCI_DEVICE_GET_CLASS(pdev);
+
+            if (pc->no_hotplug || pc->is_bridge) {
+                int slot = PCI_SLOT(i);
+
+                clear_bit(slot, slot_hotplug_enable);
+            }
+        }
+
+        /* Append Device object for each slot which supports eject */
+        for (i = 0; i < PCI_SLOT_MAX; i++) {
+            bool can_eject = test_bit(i, slot_hotplug_enable);
+            if (can_eject) {
+                void *pcihp = acpi_data_push(bus_table,
+                                             ACPI_PCIHP_SIZEOF);
+                memcpy(pcihp, ACPI_PCIHP_AML, ACPI_PCIHP_SIZEOF);
+                patch_pcihp(i, pcihp);
+                bus_hotplug_support = true;
+            }
+        }
+
+        method = build_alloc_method("DVNT", 2);
+
+        for (i = 0; i < PCI_SLOT_MAX; i++) {
+            GArray *notify;
+
+            if (!test_bit(i, slot_hotplug_enable)) {
+                continue;
+            }
+
+            notify = build_alloc_array();
+
+            build_append_byte(notify, 0x7B); /* AndOp */
+            build_append_byte(notify, 0x68); /* Arg0Op */
+            build_append_int(notify, 0x1U << i); /* unsigned: i may reach 31 */
+            build_append_byte(notify, 0x00); /* NullName */
+            build_append_byte(notify, 0x86); /* NotifyOp */
+            build_append_nameseg(notify, "S%.02X_", PCI_DEVFN(i, 0));
+            build_append_byte(notify, 0x69); /* Arg1Op */
+
+            /* Pack it up; the outer op is still needed for the bus itself */
+            build_package(notify, 0xA0 /* IfOp */, 0);
+
+            build_append_array(method, notify);
+
+            build_free_array(notify);
+        }
+
+        build_append_and_cleanup_method(bus_table, method);
+    }
+
+    /* Append PCNT method to notify about events on local and child buses.
+     * Add unconditionally for root since DSDT expects it.
      */
-    /* Sanity check */
-    assert(!memcmp(ssdt_ptr + ACPI_PCIHP_OFFSET_EJ0, "_EJ0", 4));
+    if (bus_hotplug_support || child->notify_table->len || !bus->parent_dev) {
+        method = build_alloc_method("PCNT", 0);
+
+        /* If bus supports hotplug select it and notify about local events */
+        if (bsel) {
+            build_append_byte(method, 0x70); /* StoreOp */
+            build_append_int(method, qint_get_int(qobject_to_qint(bsel)));
+            build_append_nameseg(method, "BNUM");
+            build_append_nameseg(method, "DVNT");
+            build_append_nameseg(method, "PCIU");
+            build_append_int(method, 1); /* Device Check */
+            build_append_nameseg(method, "DVNT");
+            build_append_nameseg(method, "PCID");
+            build_append_int(method, 3); /* Eject Request */
+        }
+
+        /* Notify about child bus events in any case */
+        build_append_array(method, child->notify_table);
+
+        build_append_and_cleanup_method(bus_table, method);
+
+        /* Append description of child buses */
+        build_append_array(bus_table, child->device_table);
+
+        /* Pack it up */
+        if (bus->parent_dev) {
+            build_extop_package(bus_table, op);
+        } else {
+            build_package(bus_table, op, 0);
+        }
 
-    if (!eject) {
-        memcpy(ssdt_ptr + ACPI_PCIHP_OFFSET_EJ0, "EJ0_", 4);
+        /* Append our bus description to parent table */
+        build_append_array(parent->device_table, bus_table);
+
+        /* Also tell parent how to notify us, invoking PCNT method.
+         * At the moment this is not needed for root as we have a single root.
+         */
+        if (bus->parent_dev) {
+            build_append_byte(parent->notify_table, '^'); /* ParentPrefixChar */
+            build_append_byte(parent->notify_table, 0x2E); /* DualNamePrefix */
+            build_append_nameseg(parent->notify_table, "S%.02X_",
+                                 bus->parent_dev->devfn);
+            build_append_nameseg(parent->notify_table, "PCNT");
+        }
     }
+
+    qobject_decref(bsel); /* release the reference obtained by the lookup above */
+    build_free_array(bus_table);
+    build_pci_bus_state_cleanup(child);
+    g_free(child);
 }
 
 static void patch_pci_windows(PcPciInfo *pci, uint8_t *start, unsigned size)
@@ -733,7 +951,7 @@ build_ssdt(GArray *table_data, GArray *linker,
          *   Method(NTFY, 2) {If (LEqual(Arg0, 0x00)) {Notify(CP00, Arg1)} ...}
          */
         /* Arg0 = Processor ID = APIC ID */
-        build_append_notify(sb_scope, "NTFY", "CP%0.02X", 0, acpi_cpus);
+        build_append_notify_method(sb_scope, "NTFY", "CP%0.02X", acpi_cpus);
 
         /* build "Name(CPON, Package() { One, One, ..., Zero, Zero, ... })" */
         build_append_byte(sb_scope, 0x08); /* NameOp */
@@ -755,24 +973,19 @@ build_ssdt(GArray *table_data, GArray *linker,
         }
 
         {
-            GArray *pci0 = build_alloc_array();
-            uint8_t op = 0x10; /* ScopeOp */;
+            AcpiBuildPciBusHotplugState hotplug_state;
+            PCIBus *bus = find_i440fx(); /* TODO: Q35 support */
 
-            build_append_nameseg(pci0, "PCI0");
+            build_pci_bus_state_init(&hotplug_state, NULL);
 
-            /* build Device object for each slot */
-            for (i = 1; i < PCI_SLOT_MAX; i++) {
-                bool eject = test_bit(i, misc->slot_hotplug_enable);
-                void *pcihp = acpi_data_push(pci0, ACPI_PCIHP_SIZEOF);
-
-                memcpy(pcihp, ACPI_PCIHP_AML, ACPI_PCIHP_SIZEOF);
-                patch_pcihp(i, pcihp, eject);
+            if (bus) {
+                /* Scan all PCI buses. Generate tables to support hotplug. */
+                pci_for_each_bus_depth_first(bus, build_pci_bus_begin,
+                                             build_pci_bus_end, &hotplug_state);
             }
 
-            build_append_notify(pci0, "PCNT", "S%0.02X_", 1, PCI_SLOT_MAX);
-            build_package(pci0, op, 3);
-            build_append_array(sb_scope, pci0);
-            build_free_array(pci0);
+            build_append_array(sb_scope, hotplug_state.device_table);
+            build_pci_bus_state_cleanup(&hotplug_state);
         }
 
         build_package(sb_scope, op, 3);
@@ -867,16 +1080,16 @@ build_srat(GArray *table_data, GArray *linker,
         next_base = mem_base + mem_len;
 
         /* Cut out the ACPI_PCI hole */
-        if (mem_base <= guest_info->ram_size &&
-            next_base > guest_info->ram_size) {
-            mem_len -= next_base - guest_info->ram_size;
+        if (mem_base <= guest_info->ram_size_below_4g &&
+            next_base > guest_info->ram_size_below_4g) {
+            mem_len -= next_base - guest_info->ram_size_below_4g;
             if (mem_len > 0) {
                 numamem = acpi_data_push(table_data, sizeof *numamem);
                 acpi_build_srat_memory(numamem, mem_base, mem_len, i-1, 1);
             }
             mem_base = 1ULL << 32;
-            mem_len = next_base - guest_info->ram_size;
-            next_base += (1ULL << 32) - guest_info->ram_size;
+            mem_len = next_base - guest_info->ram_size_below_4g;
+            next_base += (1ULL << 32) - guest_info->ram_size_below_4g;
         }
         numamem = acpi_data_push(table_data, sizeof *numamem);
         acpi_build_srat_memory(numamem, mem_base, mem_len, i - 1, 1);
@@ -1055,7 +1268,6 @@ void acpi_build(PcGuestInfo *guest_info, AcpiBuildTables *tables)
     acpi_get_cpu_info(&cpu);
     acpi_get_pm_info(&pm);
     acpi_get_dsdt(&misc);
-    acpi_get_hotplug_info(&misc);
     acpi_get_misc_info(&misc);
     acpi_get_pci_info(&pci);
 
@@ -1200,6 +1412,8 @@ void acpi_setup(PcGuestInfo *guest_info)
 
     build_state->guest_info = guest_info;
 
+    acpi_set_pci_info();
+
     acpi_build_tables_init(&tables);
     acpi_build(build_state->guest_info, &tables);
 
diff --git a/hw/i386/acpi-dsdt-cpu-hotplug.dsl b/hw/i386/acpi-dsdt-cpu-hotplug.dsl
index 995b415bae..dee4843cde 100644
--- a/hw/i386/acpi-dsdt-cpu-hotplug.dsl
+++ b/hw/i386/acpi-dsdt-cpu-hotplug.dsl
@@ -16,6 +16,7 @@
 /****************************************************************
  * CPU hotplug
  ****************************************************************/
+#define CPU_HOTPLUG_RESOURCE_DEVICE PRES
 
 Scope(\_SB) {
     /* Objects filled in by run-time generated SSDT */
@@ -52,7 +53,8 @@ Scope(\_SB) {
         Sleep(200)
     }
 
-    OperationRegion(PRST, SystemIO, 0xaf00, 32)
+#define CPU_STATUS_LEN ACPI_GPE_PROC_LEN
+    OperationRegion(PRST, SystemIO, CPU_STATUS_BASE, CPU_STATUS_LEN)
     Field(PRST, ByteAcc, NoLock, Preserve) {
         PRS, 256
     }
@@ -89,4 +91,14 @@ Scope(\_SB) {
             Increment(Local0)
         }
     }
+
+    Device(CPU_HOTPLUG_RESOURCE_DEVICE) {
+        Name(_HID, "ACPI0004")
+
+        Name(_CRS, ResourceTemplate() {
+            IO(Decode16, CPU_STATUS_BASE, CPU_STATUS_BASE, 0, CPU_STATUS_LEN)
+        })
+
+        Name(_STA, 0xB) /* present, functioning, decoding, not shown in UI */
+    }
 }
diff --git a/hw/i386/acpi-dsdt-isa.dsl b/hw/i386/acpi-dsdt-isa.dsl
index 89caa1649d..deb37de92e 100644
--- a/hw/i386/acpi-dsdt-isa.dsl
+++ b/hw/i386/acpi-dsdt-isa.dsl
@@ -16,6 +16,17 @@
 /* Common legacy ISA style devices. */
 Scope(\_SB.PCI0.ISA) {
 
+    Device (SMC) {
+        Name(_HID, EisaId("APP0001"))
+        /* _STA will be patched to 0x0B if AppleSMC is present */
+        ACPI_EXTRACT_NAME_BYTE_CONST DSDT_APPLESMC_STA
+        Name(_STA, 0xF0)
+        Name(_CRS, ResourceTemplate () {
+            IO (Decode16, 0x0300, 0x0300, 0x01, 0x20)
+            IRQNoFlags() { 6 }
+        })
+    }
+
     Device(RTC) {
         Name(_HID, EisaId("PNP0B00"))
         Name(_CRS, ResourceTemplate() {
diff --git a/hw/i386/acpi-dsdt-pci-crs.dsl b/hw/i386/acpi-dsdt-pci-crs.dsl
index b375a19cf6..4648e90366 100644
--- a/hw/i386/acpi-dsdt-pci-crs.dsl
+++ b/hw/i386/acpi-dsdt-pci-crs.dsl
@@ -30,20 +30,7 @@ Scope(\_SB.PCI0) {
             0x01,               // Address Alignment
             0x08,               // Address Length
             )
-        WordIO(ResourceProducer, MinFixed, MaxFixed, PosDecode, EntireRange,
-            0x0000,             // Address Space Granularity
-            0x0000,             // Address Range Minimum
-            0x0CF7,             // Address Range Maximum
-            0x0000,             // Address Translation Offset
-            0x0CF8,             // Address Length
-            ,, , TypeStatic)
-        WordIO(ResourceProducer, MinFixed, MaxFixed, PosDecode, EntireRange,
-            0x0000,             // Address Space Granularity
-            0x0D00,             // Address Range Minimum
-            0xFFFF,             // Address Range Maximum
-            0x0000,             // Address Translation Offset
-            0xF300,             // Address Length
-            ,, , TypeStatic)
+        BOARD_SPECIFIC_PCI_RESOURSES
         DWordMemory(ResourceProducer, PosDecode, MinFixed, MaxFixed, Cacheable, ReadWrite,
             0x00000000,         // Address Space Granularity
             0x000A0000,         // Address Range Minimum
diff --git a/hw/i386/acpi-dsdt.dsl b/hw/i386/acpi-dsdt.dsl
index a377424f39..b23d5e0eac 100644
--- a/hw/i386/acpi-dsdt.dsl
+++ b/hw/i386/acpi-dsdt.dsl
@@ -35,6 +35,45 @@ DefinitionBlock (
 /****************************************************************
  * PCI Bus definition
  ****************************************************************/
+#define BOARD_SPECIFIC_PCI_RESOURSES \
+     WordIO(ResourceProducer, MinFixed, MaxFixed, PosDecode, EntireRange, \
+         0x0000, \
+         0x0000, \
+         0x0CF7, \
+         0x0000, \
+         0x0CF8, \
+         ,, , TypeStatic) \
+     WordIO(ResourceProducer, MinFixed, MaxFixed, PosDecode, EntireRange, \
+         0x0000, \
+         0x0D00, \
+         0xADFF, \
+         0x0000, \
+         0xA100, \
+         ,, , TypeStatic) \
+     /* 0xae00-0xae0e hole for PCI hotplug, hw/acpi/piix4.c:PCI_HOTPLUG_ADDR */ \
+     WordIO(ResourceProducer, MinFixed, MaxFixed, PosDecode, EntireRange, \
+         0x0000, \
+         0xAE0F, \
+         0xAEFF, \
+         0x0000, \
+         0x00F1, \
+         ,, , TypeStatic) \
+     /* 0xaf00-0xaf1f hole for CPU hotplug, hw/acpi/piix4.c:PIIX4_PROC_BASE */ \
+     WordIO(ResourceProducer, MinFixed, MaxFixed, PosDecode, EntireRange, \
+         0x0000, \
+         0xAF20, \
+         0xAFDF, \
+         0x0000, \
+         0x00C0, \
+         ,, , TypeStatic) \
+     /* 0xafe0-0xafe3 hole for ACPI.GPE0, hw/acpi/piix4.c:GPE_BASE */ \
+     WordIO(ResourceProducer, MinFixed, MaxFixed, PosDecode, EntireRange, \
+         0x0000, \
+         0xAFE4, \
+         0xFFFF, \
+         0x0000, \
+         0x501C, \
+         ,, , TypeStatic)
 
     Scope(\_SB) {
         Device(PCI0) {
@@ -114,6 +153,7 @@ DefinitionBlock (
         }
     }
 
+#define DSDT_APPLESMC_STA piix_dsdt_applesmc_sta
 #include "acpi-dsdt-isa.dsl"
 
 
@@ -133,32 +173,28 @@ DefinitionBlock (
             B0EJ, 32,
         }
 
+        OperationRegion(BNMR, SystemIO, 0xae10, 0x04)
+        Field(BNMR, DWordAcc, NoLock, WriteAsZeros) {
+            BNUM, 32,
+        }
+
+        /* Lock to protect access to fields above. */
+        Mutex(BLCK, 0)
+
         /* Methods called by bulk generated PCI devices below */
 
         /* Methods called by hotplug devices */
-        Method(PCEJ, 1, NotSerialized) {
+        Method(PCEJ, 2, NotSerialized) {
             // _EJ0 method - eject callback
-            Store(ShiftLeft(1, Arg0), B0EJ)
+            Acquire(BLCK, 0xFFFF)
+            Store(Arg0, BNUM)
+            Store(ShiftLeft(1, Arg1), B0EJ)
+            Release(BLCK)
             Return (0x0)
         }
 
         /* Hotplug notification method supplied by SSDT */
         External(\_SB.PCI0.PCNT, MethodObj)
-
-        /* PCI hotplug notify method */
-        Method(PCNF, 0) {
-            // Local0 = iterator
-            Store(Zero, Local0)
-            While (LLess(Local0, 31)) {
-                Increment(Local0)
-                If (And(PCIU, ShiftLeft(1, Local0))) {
-                    PCNT(Local0, 1)
-                }
-                If (And(PCID, ShiftLeft(1, Local0))) {
-                    PCNT(Local0, 3)
-                }
-            }
-        }
     }
 
 
@@ -293,6 +329,8 @@ DefinitionBlock (
         }
     }
 
+#include "hw/acpi/cpu_hotplug_defs.h"
+#define CPU_STATUS_BASE PIIX4_CPU_HOTPLUG_IO_BASE
 #include "acpi-dsdt-cpu-hotplug.dsl"
 
 
@@ -307,7 +345,9 @@ DefinitionBlock (
         }
         Method(_E01) {
             // PCI hotplug event
-            \_SB.PCI0.PCNF()
+            Acquire(\_SB.PCI0.BLCK, 0xFFFF)
+            \_SB.PCI0.PCNT()
+            Release(\_SB.PCI0.BLCK)
         }
         Method(_E02) {
             // CPU hotplug event
diff --git a/hw/i386/acpi-dsdt.hex.generated b/hw/i386/acpi-dsdt.hex.generated
index f8bd4ea1b5..1e58801b2a 100644
--- a/hw/i386/acpi-dsdt.hex.generated
+++ b/hw/i386/acpi-dsdt.hex.generated
@@ -3,12 +3,12 @@ static unsigned char AcpiDsdtAmlCode[] = {
 0x53,
 0x44,
 0x54,
-0x37,
+0x87,
 0x11,
 0x0,
 0x0,
 0x1,
-0xd8,
+0xb8,
 0x42,
 0x58,
 0x50,
@@ -860,8 +860,8 @@ static unsigned char AcpiDsdtAmlCode[] = {
 0x4e,
 0x1,
 0x10,
-0x4c,
-0x1b,
+0x4b,
+0x1e,
 0x2f,
 0x3,
 0x5f,
@@ -879,6 +879,53 @@ static unsigned char AcpiDsdtAmlCode[] = {
 0x5b,
 0x82,
 0x2d,
+0x53,
+0x4d,
+0x43,
+0x5f,
+0x8,
+0x5f,
+0x48,
+0x49,
+0x44,
+0xc,
+0x6,
+0x10,
+0x0,
+0x1,
+0x8,
+0x5f,
+0x53,
+0x54,
+0x41,
+0xb,
+0x0,
+0xff,
+0x8,
+0x5f,
+0x43,
+0x52,
+0x53,
+0x11,
+0x10,
+0xa,
+0xd,
+0x47,
+0x1,
+0x0,
+0x3,
+0x0,
+0x3,
+0x1,
+0x20,
+0x22,
+0x40,
+0x0,
+0x79,
+0x0,
+0x5b,
+0x82,
+0x2d,
 0x52,
 0x54,
 0x43,
@@ -1305,7 +1352,7 @@ static unsigned char AcpiDsdtAmlCode[] = {
 0x79,
 0x0,
 0x10,
-0x4b,
+0x48,
 0x8,
 0x2e,
 0x5f,
@@ -1371,79 +1418,76 @@ static unsigned char AcpiDsdtAmlCode[] = {
 0x45,
 0x4a,
 0x20,
+0x5b,
+0x80,
+0x42,
+0x4e,
+0x4d,
+0x52,
+0x1,
+0xb,
+0x10,
+0xae,
+0xa,
+0x4,
+0x5b,
+0x81,
+0xb,
+0x42,
+0x4e,
+0x4d,
+0x52,
+0x43,
+0x42,
+0x4e,
+0x55,
+0x4d,
+0x20,
+0x5b,
+0x1,
+0x42,
+0x4c,
+0x43,
+0x4b,
+0x0,
 0x14,
-0x11,
+0x25,
 0x50,
 0x43,
 0x45,
 0x4a,
-0x1,
+0x2,
+0x5b,
+0x23,
+0x42,
+0x4c,
+0x43,
+0x4b,
+0xff,
+0xff,
 0x70,
-0x79,
-0x1,
 0x68,
-0x0,
 0x42,
-0x30,
-0x45,
-0x4a,
-0xa4,
-0x0,
-0x14,
-0x36,
-0x50,
-0x43,
 0x4e,
-0x46,
-0x0,
-0x70,
-0x0,
-0x60,
-0xa2,
-0x2c,
-0x95,
-0x60,
-0xa,
-0x1f,
-0x75,
-0x60,
-0xa0,
-0x11,
-0x7b,
-0x50,
-0x43,
-0x49,
 0x55,
+0x4d,
+0x70,
 0x79,
 0x1,
-0x60,
+0x69,
 0x0,
-0x0,
-0x50,
-0x43,
-0x4e,
-0x54,
-0x60,
-0x1,
-0xa0,
-0x12,
-0x7b,
-0x50,
+0x42,
+0x30,
+0x45,
+0x4a,
+0x5b,
+0x27,
+0x42,
+0x4c,
 0x43,
-0x49,
-0x44,
-0x79,
-0x1,
-0x60,
-0x0,
+0x4b,
+0xa4,
 0x0,
-0x50,
-0x43,
-0x4e,
-0x54,
-0x60,
-0xa,
-0x3,
 0x10,
 0x4a,
 0xa0,
@@ -4248,8 +4292,8 @@ static unsigned char AcpiDsdtAmlCode[] = {
 0x75,
 0x60,
 0x10,
-0x4e,
-0x9,
+0x42,
+0xc,
 0x5f,
 0x47,
 0x50,
@@ -4277,12 +4321,31 @@ static unsigned char AcpiDsdtAmlCode[] = {
 0x30,
 0x0,
 0x14,
-0x15,
+0x39,
 0x5f,
 0x45,
 0x30,
 0x31,
 0x0,
+0x5b,
+0x23,
+0x5c,
+0x2f,
+0x3,
+0x5f,
+0x53,
+0x42,
+0x5f,
+0x50,
+0x43,
+0x49,
+0x30,
+0x42,
+0x4c,
+0x43,
+0x4b,
+0xff,
+0xff,
 0x5c,
 0x2f,
 0x3,
@@ -4297,7 +4360,24 @@ static unsigned char AcpiDsdtAmlCode[] = {
 0x50,
 0x43,
 0x4e,
-0x46,
+0x54,
+0x5b,
+0x27,
+0x5c,
+0x2f,
+0x3,
+0x5f,
+0x53,
+0x42,
+0x5f,
+0x50,
+0x43,
+0x49,
+0x30,
+0x42,
+0x4c,
+0x43,
+0x4b,
 0x14,
 0x10,
 0x5f,
@@ -4407,3 +4487,6 @@ static unsigned char AcpiDsdtAmlCode[] = {
 0x46,
 0x0
 };
+static unsigned short piix_dsdt_applesmc_sta[] = {
+0x384
+};
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 6f0be37d8b..348b15f267 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1072,6 +1072,7 @@ PcGuestInfo *pc_guest_info_init(ram_addr_t below_4g_mem_size,
     PcGuestInfo *guest_info = &guest_info_state->info;
     int i, j;
 
+    guest_info->ram_size_below_4g = below_4g_mem_size;
     guest_info->ram_size = below_4g_mem_size + above_4g_mem_size;
     guest_info->apic_id_limit = pc_apic_id_limit(max_cpus);
     guest_info->apic_xrupt_override = kvm_allows_irq0_override();
diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index 07f38ff704..a7f626096a 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -51,6 +51,11 @@
 static bool has_pci_info;
 static bool has_acpi_build = true;
 static bool smbios_type1_defaults = true;
+/* Make sure that guest addresses aligned at 1Gbyte boundaries get mapped to
+ * host addresses aligned at 1Gbyte boundaries.  This way we can use 1GByte
+ * pages in the host.
+ */
+static bool gigabyte_align = true;
 
 /* PC hardware initialisation */
 static void pc_q35_init(QEMUMachineInitArgs *args)
@@ -92,9 +97,19 @@ static void pc_q35_init(QEMUMachineInitArgs *args)
 
     kvmclock_create();
 
+    /* Check whether RAM fits below 4G (leaving 1/2 GByte for IO memory
+     * and 256 Mbytes for PCI Express Enhanced Configuration Access Mapping
+     * also known as MMCFG).
+     * If it doesn't, we need to split it in chunks below and above 4G.
+     * In any case, try to make sure that guest addresses aligned at
+     * 1G boundaries get mapped to host addresses aligned at 1G boundaries.
+     * For old machine types, use whatever split we used historically to avoid
+     * breaking migration.
+     */
     if (args->ram_size >= 0xb0000000) {
-        above_4g_mem_size = args->ram_size - 0xb0000000;
-        below_4g_mem_size = 0xb0000000;
+        ram_addr_t lowmem = gigabyte_align ? 0x80000000 : 0xb0000000;
+        above_4g_mem_size = args->ram_size - lowmem;
+        below_4g_mem_size = lowmem;
     } else {
         above_4g_mem_size = 0;
         below_4g_mem_size = args->ram_size;
@@ -228,6 +243,7 @@ static void pc_q35_init(QEMUMachineInitArgs *args)
 static void pc_compat_1_7(QEMUMachineInitArgs *args)
 {
     smbios_type1_defaults = false;
+    gigabyte_align = false;
 }
 
 static void pc_compat_1_6(QEMUMachineInitArgs *args)
diff --git a/hw/i386/q35-acpi-dsdt.dsl b/hw/i386/q35-acpi-dsdt.dsl
index 7934a9ddfb..d618e9e2d2 100644
--- a/hw/i386/q35-acpi-dsdt.dsl
+++ b/hw/i386/q35-acpi-dsdt.dsl
@@ -48,6 +48,22 @@ DefinitionBlock (
 /****************************************************************
  * PCI Bus definition
  ****************************************************************/
+#define BOARD_SPECIFIC_PCI_RESOURSES \
+     WordIO(ResourceProducer, MinFixed, MaxFixed, PosDecode, EntireRange, \
+         0x0000, \
+         0x0000, \
+         0x0CD7, \
+         0x0000, \
+         0x0CD8, \
+         ,, , TypeStatic) \
+     /* 0xcd8-0xcf7 hole for CPU hotplug, hw/acpi/ich9.c:ICH9_PROC_BASE */ \
+     WordIO(ResourceProducer, MinFixed, MaxFixed, PosDecode, EntireRange, \
+         0x0000, \
+         0x0D00, \
+         0xFFFF, \
+         0x0000, \
+         0xF300, \
+         ,, , TypeStatic)
 
     Scope(\_SB) {
         Device(PCI0) {
@@ -171,6 +187,7 @@ DefinitionBlock (
         }
     }
 
+#define DSDT_APPLESMC_STA q35_dsdt_applesmc_sta
 #include "acpi-dsdt-isa.dsl"
 
 
@@ -404,6 +421,8 @@ DefinitionBlock (
         define_gsi_link(GSIH, 0, 0x17)
     }
 
+#include "hw/acpi/cpu_hotplug_defs.h"
+#define CPU_STATUS_BASE ICH9_CPU_HOTPLUG_IO_BASE
 #include "acpi-dsdt-cpu-hotplug.dsl"
 
 
diff --git a/hw/i386/q35-acpi-dsdt.hex.generated b/hw/i386/q35-acpi-dsdt.hex.generated
index 111ad3e9c2..6d885a9055 100644
--- a/hw/i386/q35-acpi-dsdt.hex.generated
+++ b/hw/i386/q35-acpi-dsdt.hex.generated
@@ -3,12 +3,12 @@ static unsigned char Q35AcpiDsdtAmlCode[] = {
 0x53,
 0x44,
 0x54,
-0xb0,
+0xdf,
 0x1c,
 0x0,
 0x0,
 0x1,
-0xfe,
+0xff,
 0x42,
 0x58,
 0x50,
@@ -1033,8 +1033,8 @@ static unsigned char Q35AcpiDsdtAmlCode[] = {
 0x4e,
 0x1,
 0x10,
-0x4c,
-0x1b,
+0x4b,
+0x1e,
 0x2f,
 0x3,
 0x5f,
@@ -1052,6 +1052,53 @@ static unsigned char Q35AcpiDsdtAmlCode[] = {
 0x5b,
 0x82,
 0x2d,
+0x53,
+0x4d,
+0x43,
+0x5f,
+0x8,
+0x5f,
+0x48,
+0x49,
+0x44,
+0xc,
+0x6,
+0x10,
+0x0,
+0x1,
+0x8,
+0x5f,
+0x53,
+0x54,
+0x41,
+0xb,
+0x0,
+0xff,
+0x8,
+0x5f,
+0x43,
+0x52,
+0x53,
+0x11,
+0x10,
+0xa,
+0xd,
+0x47,
+0x1,
+0x0,
+0x3,
+0x0,
+0x3,
+0x1,
+0x20,
+0x22,
+0x40,
+0x0,
+0x79,
+0x0,
+0x5b,
+0x82,
+0x2d,
 0x52,
 0x54,
 0x43,
@@ -7229,12 +7276,19 @@ static unsigned char Q35AcpiDsdtAmlCode[] = {
 0x30,
 0x0,
 0x14,
-0x10,
+0x6,
 0x5f,
 0x4c,
 0x30,
 0x31,
 0x0,
+0x14,
+0x10,
+0x5f,
+0x45,
+0x30,
+0x32,
+0x0,
 0x5c,
 0x2e,
 0x5f,
@@ -7250,13 +7304,6 @@ static unsigned char Q35AcpiDsdtAmlCode[] = {
 0x5f,
 0x4c,
 0x30,
-0x32,
-0x0,
-0x14,
-0x6,
-0x5f,
-0x4c,
-0x30,
 0x33,
 0x0,
 0x14,
@@ -7344,3 +7391,6 @@ static unsigned char Q35AcpiDsdtAmlCode[] = {
 0x46,
 0x0
 };
+static unsigned short q35_dsdt_applesmc_sta[] = {
+0x431
+};
diff --git a/hw/i386/ssdt-pcihp.dsl b/hw/i386/ssdt-pcihp.dsl
index d29a5b95d2..cc245c3e7c 100644
--- a/hw/i386/ssdt-pcihp.dsl
+++ b/hw/i386/ssdt-pcihp.dsl
@@ -25,6 +25,7 @@ DefinitionBlock ("ssdt-pcihp.aml", "SSDT", 0x01, "BXPC", "BXSSDTPCIHP", 0x1)
     /* Objects supplied by DSDT */
     External(\_SB.PCI0, DeviceObj)
     External(\_SB.PCI0.PCEJ, MethodObj)
+    External(BSEL, IntObj)
 
     Scope(\_SB.PCI0) {
 
@@ -33,19 +34,17 @@ DefinitionBlock ("ssdt-pcihp.aml", "SSDT", 0x01, "BXPC", "BXSSDTPCIHP", 0x1)
         ACPI_EXTRACT_DEVICE_END ssdt_pcihp_end
         ACPI_EXTRACT_DEVICE_STRING ssdt_pcihp_name
 
-        // Method _EJ0 can be patched by BIOS to EJ0_
-        // at runtime, if the slot is detected to not support hotplug.
-        // Extract the offset of the address dword and the
-        // _EJ0 name to allow this patching.
+        // Extract the offsets of the device name, address dword and the slot
+        // name byte - we fill them in for each device.
         Device(SAA) {
             ACPI_EXTRACT_NAME_BYTE_CONST ssdt_pcihp_id
             Name(_SUN, 0xAA)
             ACPI_EXTRACT_NAME_DWORD_CONST ssdt_pcihp_adr
             Name(_ADR, 0xAA0000)
-            ACPI_EXTRACT_METHOD_STRING ssdt_pcihp_ej0
             Method(_EJ0, 1) {
-                Return (PCEJ(_SUN))
+                PCEJ(BSEL, _SUN)
             }
         }
+
     }
 }
diff --git a/hw/i386/ssdt-pcihp.hex.generated b/hw/i386/ssdt-pcihp.hex.generated
index b3c2cd5cf9..610a631fd1 100644
--- a/hw/i386/ssdt-pcihp.hex.generated
+++ b/hw/i386/ssdt-pcihp.hex.generated
@@ -5,19 +5,19 @@ static unsigned char ssdt_pcihp_adr[] = {
 0x44
 };
 static unsigned char ssdt_pcihp_end[] = {
-0x58
+0x5b
 };
 static unsigned char ssdp_pcihp_aml[] = {
 0x53,
 0x53,
 0x44,
 0x54,
-0x58,
+0x5b,
 0x0,
 0x0,
 0x0,
 0x1,
-0x76,
+0xe8,
 0x42,
 0x58,
 0x50,
@@ -45,7 +45,7 @@ static unsigned char ssdp_pcihp_aml[] = {
 0x13,
 0x20,
 0x10,
-0x33,
+0x36,
 0x5c,
 0x2e,
 0x5f,
@@ -58,7 +58,7 @@ static unsigned char ssdp_pcihp_aml[] = {
 0x30,
 0x5b,
 0x82,
-0x26,
+0x29,
 0x53,
 0x41,
 0x41,
@@ -81,17 +81,20 @@ static unsigned char ssdp_pcihp_aml[] = {
 0xaa,
 0x0,
 0x14,
-0xf,
+0x12,
 0x5f,
 0x45,
 0x4a,
 0x30,
 0x1,
-0xa4,
 0x50,
 0x43,
 0x45,
 0x4a,
+0x42,
+0x53,
+0x45,
+0x4c,
 0x5f,
 0x53,
 0x55,
@@ -103,6 +106,3 @@ static unsigned char ssdt_pcihp_start[] = {
 static unsigned char ssdt_pcihp_id[] = {
 0x3d
 };
-static unsigned char ssdt_pcihp_ej0[] = {
-0x4a
-};
diff --git a/hw/i386/ssdt-proc.hex.generated b/hw/i386/ssdt-proc.hex.generated
index bb9920d3c9..97e28d4820 100644
--- a/hw/i386/ssdt-proc.hex.generated
+++ b/hw/i386/ssdt-proc.hex.generated
@@ -11,7 +11,7 @@ static unsigned char ssdp_proc_aml[] = {
 0x0,
 0x0,
 0x1,
-0xb8,
+0x78,
 0x42,
 0x58,
 0x50,
@@ -47,8 +47,8 @@ static unsigned char ssdp_proc_aml[] = {
 0x41,
 0x41,
 0xaa,
-0x10,
-0xb0,
+0x0,
+0x0,
 0x0,
 0x0,
 0x0,
diff --git a/hw/ide/core.c b/hw/ide/core.c
index 036cd4a6d1..e1dfe54df6 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -1321,6 +1321,7 @@ static bool cmd_exec_dev_diagnostic(IDEState *s, uint8_t cmd)
         s->status = 0; /* ATAPI spec (v6) section 9.10 defines packet
                         * devices to return a clear status register
                         * with READY_STAT *not* set. */
+        s->error = 0x01;
     } else {
         s->status = READY_STAT | SEEK_STAT;
         /* The bits of the error register are not as usual for this command!
diff --git a/hw/intc/arm_gic.c b/hw/intc/arm_gic.c
index 9409684ce8..1c4a1143af 100644
--- a/hw/intc/arm_gic.c
+++ b/hw/intc/arm_gic.c
@@ -380,8 +380,10 @@ static void gic_dist_writeb(void *opaque, hwaddr offset,
         irq = (offset - 0x100) * 8 + GIC_BASE_IRQ;
         if (irq >= s->num_irq)
             goto bad_reg;
-        if (irq < 16)
-          value = 0xff;
+        if (irq < GIC_NR_SGIS) {
+            value = 0xff;
+        }
+
         for (i = 0; i < 8; i++) {
             if (value & (1 << i)) {
                 int mask =
@@ -406,8 +408,10 @@ static void gic_dist_writeb(void *opaque, hwaddr offset,
         irq = (offset - 0x180) * 8 + GIC_BASE_IRQ;
         if (irq >= s->num_irq)
             goto bad_reg;
-        if (irq < 16)
-          value = 0;
+        if (irq < GIC_NR_SGIS) {
+            value = 0;
+        }
+
         for (i = 0; i < 8; i++) {
             if (value & (1 << i)) {
                 int cm = (irq < GIC_INTERNAL) ? (1 << cpu) : ALL_CPU_MASK;
@@ -423,8 +427,9 @@ static void gic_dist_writeb(void *opaque, hwaddr offset,
         irq = (offset - 0x200) * 8 + GIC_BASE_IRQ;
         if (irq >= s->num_irq)
             goto bad_reg;
-        if (irq < 16)
-          irq = 0;
+        if (irq < GIC_NR_SGIS) {
+            value = 0;
+        }
 
         for (i = 0; i < 8; i++) {
             if (value & (1 << i)) {
@@ -436,6 +441,10 @@ static void gic_dist_writeb(void *opaque, hwaddr offset,
         irq = (offset - 0x280) * 8 + GIC_BASE_IRQ;
         if (irq >= s->num_irq)
             goto bad_reg;
+        if (irq < GIC_NR_SGIS) {
+            value = 0;
+        }
+
         for (i = 0; i < 8; i++) {
             /* ??? This currently clears the pending bit for all CPUs, even
                for per-CPU interrupts.  It's unclear whether this is the
diff --git a/hw/misc/applesmc.c b/hw/misc/applesmc.c
index 1e8d183e7f..627adb97c9 100644
--- a/hw/misc/applesmc.c
+++ b/hw/misc/applesmc.c
@@ -66,7 +66,6 @@ struct AppleSMCData {
     QLIST_ENTRY(AppleSMCData) node;
 };
 
-#define TYPE_APPLE_SMC "isa-applesmc"
 #define APPLE_SMC(obj) OBJECT_CHECK(AppleSMCState, (obj), TYPE_APPLE_SMC)
 
 typedef struct AppleSMCState AppleSMCState;
diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
index 9aecaa82bc..8db182fa3d 100644
--- a/hw/misc/vfio.c
+++ b/hw/misc/vfio.c
@@ -135,12 +135,18 @@ enum {
 
 struct VFIOGroup;
 
+typedef struct VFIOType1 {
+    MemoryListener listener;
+    int error;
+    bool initialized;
+} VFIOType1;
+
 typedef struct VFIOContainer {
     int fd; /* /dev/vfio/vfio, empowered by the attached groups */
     struct {
         /* enable abstraction to support various iommu backends */
         union {
-            MemoryListener listener; /* Used by type1 iommu */
+            VFIOType1 type1;
         };
         void (*release)(struct VFIOContainer *);
     } iommu_data;
@@ -191,6 +197,7 @@ typedef struct VFIODevice {
     bool has_flr;
     bool has_pm_reset;
     bool needs_reset;
+    bool rom_read_failed;
 } VFIODevice;
 
 typedef struct VFIOGroup {
@@ -592,7 +599,7 @@ static void vfio_msi_interrupt(void *opaque)
         return;
     }
 
-#ifdef VFIO_DEBUG
+#ifdef DEBUG_VFIO
     MSIMessage msg;
 
     if (vdev->interrupt == VFIO_INT_MSIX) {
@@ -1125,6 +1132,14 @@ static void vfio_pci_load_rom(VFIODevice *vdev)
     vdev->rom_offset = reg_info.offset;
 
     if (!vdev->rom_size) {
+        vdev->rom_read_failed = true;
+        error_report("vfio-pci: Cannot read device rom at "
+                    "%04x:%02x:%02x.%x\n",
+                    vdev->host.domain, vdev->host.bus, vdev->host.slot,
+                    vdev->host.function);
+        error_printf("Device option ROM contents are probably invalid "
+                    "(check dmesg).\nSkip option ROM probe with rombar=0, "
+                    "or load from file with romfile=\n");
         return;
     }
 
@@ -1156,6 +1171,9 @@ static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size)
     /* Load the ROM lazily when the guest tries to read it */
     if (unlikely(!vdev->rom)) {
         vfio_pci_load_rom(vdev);
+        if (unlikely(!vdev->rom && !vdev->rom_read_failed)) {
+            vfio_pci_load_rom(vdev);
+        }
     }
 
     memcpy(&val, vdev->rom + addr,
@@ -1223,6 +1241,7 @@ static void vfio_pci_size_rom(VFIODevice *vdev)
                      PCI_BASE_ADDRESS_SPACE_MEMORY, &vdev->pdev.rom);
 
     vdev->pdev.has_rom = true;
+    vdev->rom_read_failed = false;
 }
 
 static void vfio_vga_write(void *opaque, hwaddr addr,
@@ -1968,6 +1987,7 @@ static void vfio_vga_quirk_teardown(VFIODevice *vdev)
         while (!QLIST_EMPTY(&vdev->vga.region[i].quirks)) {
             VFIOQuirk *quirk = QLIST_FIRST(&vdev->vga.region[i].quirks);
             memory_region_del_subregion(&vdev->vga.region[i].mem, &quirk->mem);
+            memory_region_destroy(&quirk->mem);
             QLIST_REMOVE(quirk, next);
             g_free(quirk);
         }
@@ -1990,6 +2010,7 @@ static void vfio_bar_quirk_teardown(VFIODevice *vdev, int nr)
     while (!QLIST_EMPTY(&bar->quirks)) {
         VFIOQuirk *quirk = QLIST_FIRST(&bar->quirks);
         memory_region_del_subregion(&bar->mem, &quirk->mem);
+        memory_region_destroy(&quirk->mem);
         QLIST_REMOVE(quirk, next);
         g_free(quirk);
     }
@@ -2141,14 +2162,21 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
 
 static bool vfio_listener_skipped_section(MemoryRegionSection *section)
 {
-    return !memory_region_is_ram(section->mr);
+    return !memory_region_is_ram(section->mr) ||
+           /*
+            * Sizing an enabled 64-bit BAR can cause spurious mappings to
+            * addresses in the upper part of the 64-bit address space.  These
+            * are never accessed by the CPU and beyond the address width of
+            * some IOMMU hardware.  TODO: VFIO should tell us the IOMMU width.
+            */
+           section->offset_within_address_space & (1ULL << 63);
 }
 
 static void vfio_listener_region_add(MemoryListener *listener,
                                      MemoryRegionSection *section)
 {
     VFIOContainer *container = container_of(listener, VFIOContainer,
-                                            iommu_data.listener);
+                                            iommu_data.type1.listener);
     hwaddr iova, end;
     void *vaddr;
     int ret;
@@ -2190,6 +2218,19 @@ static void vfio_listener_region_add(MemoryListener *listener,
         error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
                      "0x%"HWADDR_PRIx", %p) = %d (%m)",
                      container, iova, end - iova, vaddr, ret);
+
+        /*
+         * On the initfn path, store the first error in the container so we
+         * can gracefully fail.  At runtime, there's not much we can do other
+         * than throw a hardware error.
+         */
+        if (!container->iommu_data.type1.initialized) {
+            if (!container->iommu_data.type1.error) {
+                container->iommu_data.type1.error = ret;
+            }
+        } else {
+            hw_error("vfio: DMA mapping failed, unable to continue\n");
+        }
     }
 }
 
@@ -2197,7 +2238,7 @@ static void vfio_listener_region_del(MemoryListener *listener,
                                      MemoryRegionSection *section)
 {
     VFIOContainer *container = container_of(listener, VFIOContainer,
-                                            iommu_data.listener);
+                                            iommu_data.type1.listener);
     hwaddr iova, end;
     int ret;
 
@@ -2242,7 +2283,7 @@ static MemoryListener vfio_memory_listener = {
 
 static void vfio_listener_release(VFIOContainer *container)
 {
-    memory_listener_unregister(&container->iommu_data.listener);
+    memory_listener_unregister(&container->iommu_data.type1.listener);
 }
 
 /*
@@ -2412,10 +2453,12 @@ static void vfio_unmap_bar(VFIODevice *vdev, int nr)
 
     memory_region_del_subregion(&bar->mem, &bar->mmap_mem);
     munmap(bar->mmap, memory_region_size(&bar->mmap_mem));
+    memory_region_destroy(&bar->mmap_mem);
 
     if (vdev->msix && vdev->msix->table_bar == nr) {
         memory_region_del_subregion(&bar->mem, &vdev->msix->mmap_mem);
         munmap(vdev->msix->mmap, memory_region_size(&vdev->msix->mmap_mem));
+        memory_region_destroy(&vdev->msix->mmap_mem);
     }
 
     memory_region_destroy(&bar->mem);
@@ -2501,7 +2544,7 @@ static void vfio_map_bar(VFIODevice *vdev, int nr)
      * potentially insert a direct-mapped subregion before and after it.
      */
     if (vdev->msix && vdev->msix->table_bar == nr) {
-        size = vdev->msix->table_offset & TARGET_PAGE_MASK;
+        size = vdev->msix->table_offset & qemu_host_page_mask;
     }
 
     strncat(name, " mmap", sizeof(name) - strlen(name) - 1);
@@ -2513,8 +2556,8 @@ static void vfio_map_bar(VFIODevice *vdev, int nr)
     if (vdev->msix && vdev->msix->table_bar == nr) {
         unsigned start;
 
-        start = TARGET_PAGE_ALIGN(vdev->msix->table_offset +
-                                  (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE));
+        start = HOST_PAGE_ALIGN(vdev->msix->table_offset +
+                                (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE));
 
         size = start < bar->size ? bar->size - start : 0;
         strncat(name, " msix-hi", sizeof(name) - strlen(name) - 1);
@@ -3212,10 +3255,23 @@ static int vfio_connect_container(VFIOGroup *group)
             return -errno;
         }
 
-        container->iommu_data.listener = vfio_memory_listener;
+        container->iommu_data.type1.listener = vfio_memory_listener;
         container->iommu_data.release = vfio_listener_release;
 
-        memory_listener_register(&container->iommu_data.listener, &address_space_memory);
+        memory_listener_register(&container->iommu_data.type1.listener,
+                                 &address_space_memory);
+
+        if (container->iommu_data.type1.error) {
+            ret = container->iommu_data.type1.error;
+            vfio_listener_release(container);
+            g_free(container);
+            close(fd);
+            error_report("vfio: memory listener initialization failed for container\n");
+            return ret;
+        }
+
+        container->iommu_data.type1.initialized = true;
+
     } else {
         error_report("vfio: No available IOMMU models");
         g_free(container);
diff --git a/hw/net/lan9118.c b/hw/net/lan9118.c
index 2315f996d4..e528290b41 100644
--- a/hw/net/lan9118.c
+++ b/hw/net/lan9118.c
@@ -727,14 +727,14 @@ static void tx_fifo_push(lan9118_state *s, uint32_t val)
         s->txp->cmd_a = val & 0x831f37ff;
         s->txp->fifo_used++;
         s->txp->state = TX_B;
+        s->txp->buffer_size = extract32(s->txp->cmd_a, 0, 11);
+        s->txp->offset = extract32(s->txp->cmd_a, 16, 5);
         break;
     case TX_B:
         if (s->txp->cmd_a & 0x2000) {
             /* First segment */
             s->txp->cmd_b = val;
             s->txp->fifo_used++;
-            s->txp->buffer_size = s->txp->cmd_a & 0x7ff;
-            s->txp->offset = (s->txp->cmd_a >> 16) & 0x1f;
             /* End alignment does not include command words.  */
             n = (s->txp->buffer_size + s->txp->offset + 3) >> 2;
             switch ((n >> 24) & 3) {
@@ -763,7 +763,7 @@ static void tx_fifo_push(lan9118_state *s, uint32_t val)
         if (s->txp->buffer_size <= 0 && s->txp->pad != 0) {
             s->txp->pad--;
         } else {
-            n = 4;
+            n = MIN(4, s->txp->buffer_size + s->txp->offset);
             while (s->txp->offset) {
                 val >>= 8;
                 n--;
diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c
index 006576db31..854997d9ba 100644
--- a/hw/net/vhost_net.c
+++ b/hw/net/vhost_net.c
@@ -321,7 +321,7 @@ void vhost_net_ack_features(struct vhost_net *net, unsigned features)
 
 bool vhost_net_virtqueue_pending(VHostNetState *net, int idx)
 {
-    return -ENOSYS;
+    return false;
 }
 
 void vhost_net_virtqueue_mask(VHostNetState *net, VirtIODevice *dev,
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index aa2a395499..1221f32847 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -793,6 +793,15 @@ static void pci_config_free(PCIDevice *pci_dev)
     g_free(pci_dev->used);
 }
 
+static void do_pci_unregister_device(PCIDevice *pci_dev)
+{
+    pci_dev->bus->devices[pci_dev->devfn] = NULL;
+    pci_config_free(pci_dev);
+
+    address_space_destroy(&pci_dev->bus_master_as);
+    memory_region_destroy(&pci_dev->bus_master_enable_region);
+}
+
 /* -1 for devfn means auto assign */
 static PCIDevice *do_pci_register_device(PCIDevice *pci_dev, PCIBus *bus,
                                          const char *name, int devfn)
@@ -858,7 +867,7 @@ static PCIDevice *do_pci_register_device(PCIDevice *pci_dev, PCIBus *bus,
         pci_init_mask_bridge(pci_dev);
     }
     if (pci_init_multifunction(bus, pci_dev)) {
-        pci_config_free(pci_dev);
+        do_pci_unregister_device(pci_dev);
         return NULL;
     }
 
@@ -873,15 +882,6 @@ static PCIDevice *do_pci_register_device(PCIDevice *pci_dev, PCIBus *bus,
     return pci_dev;
 }
 
-static void do_pci_unregister_device(PCIDevice *pci_dev)
-{
-    pci_dev->bus->devices[pci_dev->devfn] = NULL;
-    pci_config_free(pci_dev);
-
-    address_space_destroy(&pci_dev->bus_master_as);
-    memory_region_destroy(&pci_dev->bus_master_enable_region);
-}
-
 static void pci_unregister_io_regions(PCIDevice *pci_dev)
 {
     PCIIORegion *r;
@@ -1704,6 +1704,34 @@ static PCIBus *pci_find_bus_nr(PCIBus *bus, int bus_num)
     return NULL;
 }
 
+void pci_for_each_bus_depth_first(PCIBus *bus,
+                                  void *(*begin)(PCIBus *bus, void *parent_state),
+                                  void (*end)(PCIBus *bus, void *state),
+                                  void *parent_state)
+{
+    PCIBus *sec;
+    void *state;
+
+    if (!bus) {
+        return;
+    }
+
+    if (begin) {
+        state = begin(bus, parent_state);
+    } else {
+        state = parent_state;
+    }
+
+    QLIST_FOREACH(sec, &bus->child, sibling) {
+        pci_for_each_bus_depth_first(sec, begin, end, state);
+    }
+
+    if (end) {
+        end(bus, state);
+    }
+}
+
+
 PCIDevice *pci_find_device(PCIBus *bus, int bus_num, uint8_t devfn)
 {
     bus = pci_find_bus_nr(bus, bus_num);
diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
index d9754dbd33..a470a0b3a6 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@@ -263,7 +263,7 @@ static void virtio_balloon_get_config(VirtIODevice *vdev, uint8_t *config_data)
     config.num_pages = cpu_to_le32(dev->num_pages);
     config.actual = cpu_to_le32(dev->actual);
 
-    memcpy(config_data, &config, 8);
+    memcpy(config_data, &config, sizeof(struct virtio_balloon_config));
 }
 
 static void virtio_balloon_set_config(VirtIODevice *vdev,
@@ -272,7 +272,7 @@ static void virtio_balloon_set_config(VirtIODevice *vdev,
     VirtIOBalloon *dev = VIRTIO_BALLOON(vdev);
     struct virtio_balloon_config config;
     uint32_t oldactual = dev->actual;
-    memcpy(&config, config_data, 8);
+    memcpy(&config, config_data, sizeof(struct virtio_balloon_config));
     dev->actual = le32_to_cpu(config.actual);
     if (dev->actual != oldactual) {
         qemu_balloon_changed(ram_size -
@@ -343,7 +343,8 @@ static void virtio_balloon_device_realize(DeviceState *dev, Error **errp)
     VirtIOBalloon *s = VIRTIO_BALLOON(dev);
     int ret;
 
-    virtio_init(vdev, "virtio-balloon", VIRTIO_ID_BALLOON, 8);
+    virtio_init(vdev, "virtio-balloon", VIRTIO_ID_BALLOON,
+                sizeof(struct virtio_balloon_config));
 
     ret = qemu_add_balloon_handler(virtio_balloon_to_target,
                                    virtio_balloon_stat, s);
diff --git a/hw/xen/xen_pt.c b/hw/xen/xen_pt.c
index d58cb616b1..be4220b415 100644
--- a/hw/xen/xen_pt.c
+++ b/hw/xen/xen_pt.c
@@ -420,8 +420,8 @@ static int xen_pt_register_regions(XenPCIPassthroughState *s)
                               "xen-pci-pt-bar", r->size);
         pci_register_bar(&s->dev, i, type, &s->bar[i]);
 
-        XEN_PT_LOG(&s->dev, "IO region %i registered (size=0x%lx"PRIx64
-                   " base_addr=0x%lx"PRIx64" type: %#x)\n",
+        XEN_PT_LOG(&s->dev, "IO region %i registered (size=0x%08"PRIx64
+                   " base_addr=0x%08"PRIx64" type: %#x)\n",
                    i, r->size, r->base_addr, type);
     }
 
@@ -440,8 +440,8 @@ static int xen_pt_register_regions(XenPCIPassthroughState *s)
 
         s->bases[PCI_ROM_SLOT].access.maddr = d->rom.base_addr;
 
-        memory_region_init_rom_device(&s->rom, OBJECT(s), NULL, NULL,
-                                      "xen-pci-pt-rom", d->rom.size);
+        memory_region_init_io(&s->rom, OBJECT(s), &ops, &s->dev,
+                              "xen-pci-pt-rom", d->rom.size);
         pci_register_bar(&s->dev, PCI_ROM_SLOT, PCI_BASE_ADDRESS_MEM_PREFETCH,
                          &s->rom);
 
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index ea90b649d4..3b03cbfcf8 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -81,6 +81,7 @@ void cpu_gen_init(void);
 int cpu_gen_code(CPUArchState *env, struct TranslationBlock *tb,
                  int *gen_code_size_ptr);
 bool cpu_restore_state(CPUArchState *env, uintptr_t searched_pc);
+void page_size_init(void);
 
 void QEMU_NORETURN cpu_resume_from_signal(CPUArchState *env1, void *puc);
 void QEMU_NORETURN cpu_io_recompile(CPUArchState *env, uintptr_t retaddr);
diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h
index 33c8acc02e..481a447417 100644
--- a/include/exec/ram_addr.h
+++ b/include/exec/ram_addr.h
@@ -79,6 +79,7 @@ static inline void cpu_physical_memory_set_dirty_range(ram_addr_t start,
     xen_modified_memory(start, length);
 }
 
+#if !defined(_WIN32)
 static inline void cpu_physical_memory_set_dirty_lebitmap(unsigned long *bitmap,
                                                           ram_addr_t start,
                                                           ram_addr_t pages)
@@ -127,6 +128,7 @@ static inline void cpu_physical_memory_set_dirty_lebitmap(unsigned long *bitmap,
         }
     }
 }
+#endif /* not _WIN32 */
 
 static inline void cpu_physical_memory_clear_dirty_range(ram_addr_t start,
                                                          ram_addr_t length,
diff --git a/include/hw/acpi/cpu_hotplug.h b/include/hw/acpi/cpu_hotplug.h
new file mode 100644
index 0000000000..4576400fd7
--- /dev/null
+++ b/include/hw/acpi/cpu_hotplug.h
@@ -0,0 +1,27 @@
+/*
+ * QEMU ACPI hotplug utilities
+ *
+ * Copyright (C) 2013 Red Hat Inc
+ *
+ * Authors:
+ *   Igor Mammedov <imammedo@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+#ifndef ACPI_HOTPLUG_H
+#define ACPI_HOTPLUG_H
+
+#include "hw/acpi/acpi.h"
+#include "hw/acpi/cpu_hotplug_defs.h"
+
+typedef struct AcpiCpuHotplug {
+    MemoryRegion io;
+    uint8_t sts[ACPI_GPE_PROC_LEN];
+} AcpiCpuHotplug;
+
+void AcpiCpuHotplug_add(ACPIGPE *gpe, AcpiCpuHotplug *g, CPUState *cpu);
+
+void AcpiCpuHotplug_init(MemoryRegion *parent, Object *owner,
+                         AcpiCpuHotplug *gpe_cpu, uint16_t base);
+#endif
diff --git a/include/hw/acpi/cpu_hotplug_defs.h b/include/hw/acpi/cpu_hotplug_defs.h
new file mode 100644
index 0000000000..2725b50aac
--- /dev/null
+++ b/include/hw/acpi/cpu_hotplug_defs.h
@@ -0,0 +1,24 @@
+/*
+ * QEMU ACPI hotplug utilities shared defines
+ *
+ * Copyright (C) 2013 Red Hat Inc
+ *
+ * Authors:
+ *   Igor Mammedov <imammedo@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+#ifndef ACPI_HOTPLUG_DEFS_H
+#define ACPI_HOTPLUG_DEFS_H
+
+/*
+ * ONLY DEFINEs are permitted in this file since it's shared
+ * between C and ASL code.
+ */
+#define ACPI_CPU_HOTPLUG_STATUS 4
+#define ACPI_GPE_PROC_LEN 32
+#define ICH9_CPU_HOTPLUG_IO_BASE 0x0CD8
+#define PIIX4_CPU_HOTPLUG_IO_BASE 0xaf00
+
+#endif
diff --git a/include/hw/acpi/ich9.h b/include/hw/acpi/ich9.h
index 82fcf9f2eb..104f419852 100644
--- a/include/hw/acpi/ich9.h
+++ b/include/hw/acpi/ich9.h
@@ -22,6 +22,7 @@
 #define HW_ACPI_ICH9_H
 
 #include "hw/acpi/acpi.h"
+#include "hw/acpi/cpu_hotplug.h"
 
 typedef struct ICH9LPCPMRegs {
     /*
@@ -42,6 +43,9 @@ typedef struct ICH9LPCPMRegs {
 
     uint32_t pm_io_base;
     Notifier powerdown_notifier;
+
+    AcpiCpuHotplug gpe_cpu;
+    Notifier cpu_added_notifier;
 } ICH9LPCPMRegs;
 
 void ich9_pm_init(PCIDevice *lpc_pci, ICH9LPCPMRegs *pm,
diff --git a/include/hw/acpi/pcihp.h b/include/hw/acpi/pcihp.h
new file mode 100644
index 0000000000..6230e60954
--- /dev/null
+++ b/include/hw/acpi/pcihp.h
@@ -0,0 +1,72 @@
+/*
+ * QEMU<->ACPI BIOS PCI hotplug interface
+ *
+ * QEMU supports PCI hotplug via ACPI. This module
+ * implements the interface between QEMU and the ACPI BIOS.
+ * Interface specification - see docs/specs/acpi_pci_hotplug.txt
+ *
+ * Copyright (c) 2013, Red Hat Inc, Michael S. Tsirkin (mst@redhat.com)
+ * Copyright (c) 2006 Fabrice Bellard
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#ifndef HW_ACPI_PCIHP_H
+#define HW_ACPI_PCIHP_H
+
+#include <inttypes.h>
+#include <qemu/typedefs.h>
+#include "hw/pci/pci.h" /* for PCIHotplugState */
+
+typedef struct AcpiPciHpPciStatus {
+    uint32_t up; /* deprecated, maintained for migration compatibility */
+    uint32_t down;
+    uint32_t hotplug_enable;
+    uint32_t device_present;
+} AcpiPciHpPciStatus;
+
+#define ACPI_PCIHP_PROP_BSEL "acpi-pcihp-bsel"
+#define ACPI_PCIHP_MAX_HOTPLUG_BUS 256
+
+typedef struct AcpiPciHpState {
+    AcpiPciHpPciStatus acpi_pcihp_pci_status[ACPI_PCIHP_MAX_HOTPLUG_BUS];
+    uint32_t hotplug_select;
+    PCIBus *root;
+    MemoryRegion io;
+} AcpiPciHpState;
+
+void acpi_pcihp_init(AcpiPciHpState *, PCIBus *root,
+                     MemoryRegion *address_space_io);
+
+/* Invoke on device hotplug */
+int acpi_pcihp_device_hotplug(AcpiPciHpState *, PCIDevice *,
+                              PCIHotplugState state);
+
+/* Called on reset */
+void acpi_pcihp_reset(AcpiPciHpState *s);
+
+extern const VMStateDescription vmstate_acpi_pcihp_pci_status;
+
+#define VMSTATE_PCI_HOTPLUG(pcihp, state, test_pcihp) \
+        VMSTATE_UINT32_TEST(pcihp.hotplug_select, state, \
+                            test_pcihp), \
+        VMSTATE_STRUCT_ARRAY_TEST(pcihp.acpi_pcihp_pci_status, state, \
+                                  ACPI_PCIHP_MAX_HOTPLUG_BUS, \
+                                  test_pcihp, 1, \
+                                  vmstate_acpi_pcihp_pci_status, \
+                                  AcpiPciHpPciStatus)
+
+#endif
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 7fe2bd17f6..3e1e81b27b 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -35,7 +35,7 @@ typedef struct PcPciInfo {
 struct PcGuestInfo {
     bool has_pci_info;
     bool isapc_ram_fw;
-    hwaddr ram_size;
+    hwaddr ram_size, ram_size_below_4g;
     unsigned apic_id_limit;
     bool apic_xrupt_override;
     uint64_t numa_nodes;
@@ -265,6 +265,11 @@ int e820_add_entry(uint64_t, uint64_t, uint32_t);
             .driver   = TYPE_USB_DEVICE,\
             .property = "msos-desc",\
             .value    = "no",\
+        },\
+        {\
+            .driver   = "PIIX4_PM",\
+            .property = "acpi-pci-hotplug-with-bridge-support",\
+            .value    = "off",\
         }
 
 #define PC_COMPAT_1_6 \
diff --git a/include/hw/intc/arm_gic_common.h b/include/hw/intc/arm_gic_common.h
index 0d232dfb67..8a2aa00cee 100644
--- a/include/hw/intc/arm_gic_common.h
+++ b/include/hw/intc/arm_gic_common.h
@@ -27,6 +27,7 @@
 #define GIC_MAXIRQ 1020
 /* First 32 are private to each CPU (SGIs and PPIs). */
 #define GIC_INTERNAL 32
+#define GIC_NR_SGIS 16
 /* Maximum number of possible CPU interfaces, determined by GIC architecture */
 #define GIC_NCPU 8
 
diff --git a/include/hw/isa/isa.h b/include/hw/isa/isa.h
index fa45a5b094..e0c749f9e9 100644
--- a/include/hw/isa/isa.h
+++ b/include/hw/isa/isa.h
@@ -20,6 +20,13 @@
 #define TYPE_ISA_BUS "ISA"
 #define ISA_BUS(obj) OBJECT_CHECK(ISABus, (obj), TYPE_ISA_BUS)
 
+#define TYPE_APPLE_SMC "isa-applesmc"
+
+static inline bool applesmc_find(void)
+{
+    return object_resolve_path_type("", TYPE_APPLE_SMC, NULL);
+}
+
 typedef struct ISADeviceClass {
     DeviceClass parent_class;
 } ISADeviceClass;
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index 754b82de81..52523467b6 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -387,6 +387,20 @@ int pci_bus_num(PCIBus *s);
 void pci_for_each_device(PCIBus *bus, int bus_num,
                          void (*fn)(PCIBus *bus, PCIDevice *d, void *opaque),
                          void *opaque);
+void pci_for_each_bus_depth_first(PCIBus *bus,
+                                  void *(*begin)(PCIBus *bus, void *parent_state),
+                                  void (*end)(PCIBus *bus, void *state),
+                                  void *parent_state);
+
+/* Use this wrapper when specific scan order is not required. */
+static inline
+void pci_for_each_bus(PCIBus *bus,
+                      void (*fn)(PCIBus *bus, void *opaque),
+                      void *opaque)
+{
+    pci_for_each_bus_depth_first(bus, NULL, fn, opaque);
+}
+
 PCIBus *pci_find_primary_bus(void);
 PCIBus *pci_device_root_bus(const PCIDevice *d);
 const char *pci_root_bus_path(PCIDevice *dev);
diff --git a/include/qemu/timer.h b/include/qemu/timer.h
index 5afcffc3f9..7f9a074c2a 100644
--- a/include/qemu/timer.h
+++ b/include/qemu/timer.h
@@ -405,7 +405,7 @@ int64_t timerlistgroup_deadline_ns(QEMUTimerListGroup *tlg);
  * timer_init:
  * @ts: the timer to be initialised
  * @timer_list: the timer list to attach the timer to
- * @scale: the scale value for the tiemr
+ * @scale: the scale value for the timer
  * @cb: the callback to be called when the timer expires
  * @opaque: the opaque pointer to be passed to the callback
  *
@@ -422,7 +422,7 @@ void timer_init(QEMUTimer *ts,
 /**
  * timer_new_tl:
  * @timer_list: the timer list to attach the timer to
- * @scale: the scale value for the tiemr
+ * @scale: the scale value for the timer
  * @cb: the callback to be called when the timer expires
  * @opaque: the opaque pointer to be passed to the callback
  *
@@ -447,7 +447,7 @@ static inline QEMUTimer *timer_new_tl(QEMUTimerList *timer_list,
 /**
  * timer_new:
  * @type: the clock type to use
- * @scale: the scale value for the tiemr
+ * @scale: the scale value for the timer
  * @cb: the callback to be called when the timer expires
  * @opaque: the opaque pointer to be passed to the callback
  *
diff --git a/kvm-all.c b/kvm-all.c
index a3fb8de268..9588feab99 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -1360,6 +1360,7 @@ int kvm_init(void)
      * page size for the system though.
      */
     assert(TARGET_PAGE_SIZE <= getpagesize());
+    page_size_init();
 
 #ifdef KVM_CAP_SET_GUEST_DEBUG
     QTAILQ_INIT(&s->kvm_sw_breakpoints);
diff --git a/linux-user/s390x/syscall.h b/linux-user/s390x/syscall.h
index ea8c304840..e5ce30b667 100644
--- a/linux-user/s390x/syscall.h
+++ b/linux-user/s390x/syscall.h
@@ -22,4 +22,4 @@ struct target_pt_regs {
 
 #define UNAME_MACHINE "s390x"
 
-#define TARGET_CLONE_BACKWARDS
+#define TARGET_CLONE_BACKWARDS2
diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index 0ac05b85f2..bc0ac98d4f 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -2340,7 +2340,7 @@ static abi_long do_socketcall(int num, abi_ulong vptr)
             size_t len;
             abi_ulong flags;
             abi_ulong addr;
-            socklen_t addrlen;
+            abi_ulong addrlen;
 
             if (get_user_ual(sockfd, vptr)
                 || get_user_ual(msg, vptr + n)
@@ -2406,7 +2406,7 @@ static abi_long do_socketcall(int num, abi_ulong vptr)
             abi_ulong level;
             abi_ulong optname;
             abi_ulong optval;
-            socklen_t optlen;
+            abi_ulong optlen;
 
             if (get_user_ual(sockfd, vptr)
                 || get_user_ual(level, vptr + n)
diff --git a/net/net.c b/net/net.c
index f8db85f30b..2c3af202a6 100644
--- a/net/net.c
+++ b/net/net.c
@@ -164,7 +164,6 @@ void qemu_macaddr_default_if_unset(MACAddr *macaddr)
 static char *assign_name(NetClientState *nc1, const char *model)
 {
     NetClientState *nc;
-    char buf[256];
     int id = 0;
 
     QTAILQ_FOREACH(nc, &net_clients, next) {
@@ -176,9 +175,7 @@ static char *assign_name(NetClientState *nc1, const char *model)
         }
     }
 
-    snprintf(buf, sizeof(buf), "%s.%d", model, id);
-
-    return g_strdup(buf);
+    return g_strdup_printf("%s.%d", model, id);
 }
 
 static void qemu_net_client_destructor(NetClientState *nc)
diff --git a/net/tap-linux.c b/net/tap-linux.c
index 36c09e24d8..812bf2dfc6 100644
--- a/net/tap-linux.c
+++ b/net/tap-linux.c
@@ -52,14 +52,17 @@ int tap_open(char *ifname, int ifname_size, int *vnet_hdr,
     memset(&ifr, 0, sizeof(ifr));
     ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
 
-    if (ioctl(fd, TUNGETFEATURES, &features) == 0 &&
-        features & IFF_ONE_QUEUE) {
+    if (ioctl(fd, TUNGETFEATURES, &features) == -1) {
+        error_report("warning: TUNGETFEATURES failed: %s", strerror(errno));
+        features = 0;
+    }
+
+    if (features & IFF_ONE_QUEUE) {
         ifr.ifr_flags |= IFF_ONE_QUEUE;
     }
 
     if (*vnet_hdr) {
-        if (ioctl(fd, TUNGETFEATURES, &features) == 0 &&
-            features & IFF_VNET_HDR) {
+        if (features & IFF_VNET_HDR) {
             *vnet_hdr = 1;
             ifr.ifr_flags |= IFF_VNET_HDR;
         } else {
@@ -82,8 +85,7 @@ int tap_open(char *ifname, int ifname_size, int *vnet_hdr,
     }
 
     if (mq_required) {
-        if ((ioctl(fd, TUNGETFEATURES, &features) != 0) ||
-            !(features & IFF_MULTI_QUEUE)) {
+        if (!(features & IFF_MULTI_QUEUE)) {
             error_report("multiqueue required, but no kernel "
                          "support for IFF_MULTI_QUEUE available");
             close(fd);
diff --git a/pc-bios/kvmvapic.bin b/pc-bios/kvmvapic.bin
index 045f5c2884..045f5c2884 100755..100644
--- a/pc-bios/kvmvapic.bin
+++ b/pc-bios/kvmvapic.bin
Binary files differ
diff --git a/pc-bios/multiboot.bin b/pc-bios/multiboot.bin
index e772713c95..e772713c95 100755..100644
--- a/pc-bios/multiboot.bin
+++ b/pc-bios/multiboot.bin
Binary files differ
diff --git a/pc-bios/sgabios.bin b/pc-bios/sgabios.bin
index c3da4c3d0a..c3da4c3d0a 100755..100644
--- a/pc-bios/sgabios.bin
+++ b/pc-bios/sgabios.bin
Binary files differ
diff --git a/scripts/create_config b/scripts/create_config
index b1adbf5897..06f5316d9d 100755
--- a/scripts/create_config
+++ b/scripts/create_config
@@ -26,6 +26,10 @@ case $line in
     # save for the next definitions
     prefix=${line#*=}
     ;;
+ IASL=*) # iasl executable
+    value=${line#*=}
+    echo "#define CONFIG_IASL $value"
+    ;;
  CONFIG_AUDIO_DRIVERS=*)
     drivers=${line#*=}
     echo "#define CONFIG_AUDIO_DRIVERS \\"
diff --git a/scripts/dump-guest-memory.py b/scripts/dump-guest-memory.py
new file mode 100644
index 0000000000..1ed8b67883
--- /dev/null
+++ b/scripts/dump-guest-memory.py
@@ -0,0 +1,339 @@
+# This python script adds a new gdb command, "dump-guest-memory". It
+# should be loaded with "source dump-guest-memory.py" at the (gdb)
+# prompt.
+#
+# Copyright (C) 2013, Red Hat, Inc.
+#
+# Authors:
+#   Laszlo Ersek <lersek@redhat.com>
+#
+# This work is licensed under the terms of the GNU GPL, version 2 or later. See
+# the COPYING file in the top-level directory.
+#
+# The leading docstring doesn't have idiomatic Python formatting. It is
+# printed by gdb's "help" command (the first line is printed in the
+# "help data" summary), and it should match how other help texts look in
+# gdb.
+
+import struct
+
+class DumpGuestMemory(gdb.Command):
+    """Extract guest vmcore from qemu process coredump.
+
+The sole argument is FILE, identifying the target file to write the
+guest vmcore to.
+
+This GDB command reimplements the dump-guest-memory QMP command in
+python, using the representation of guest memory as captured in the qemu
+coredump. The qemu process that has been dumped must have had the
+command line option "-machine dump-guest-core=on".
+
+For simplicity, the "paging", "begin" and "end" parameters of the QMP
+command are not supported -- no attempt is made to get the guest's
+internal paging structures (ie. paging=false is hard-wired), and guest
+memory is always fully dumped.
+
+Only x86_64 guests are supported.
+
+The CORE/NT_PRSTATUS and QEMU notes (that is, the VCPUs' statuses) are
+not written to the vmcore. Preparing these would require context that is
+only present in the KVM host kernel module when the guest is alive. A
+fake ELF note is written instead, only to keep the ELF parser of "crash"
+happy.
+
+Dependent on how busted the qemu process was at the time of the
+coredump, this command might produce unpredictable results. If qemu
+deliberately called abort(), or it was dumped in response to a signal at
+a halfway fortunate point, then its coredump should be in reasonable
+shape and this command should mostly work."""
+
+    TARGET_PAGE_SIZE = 0x1000
+    TARGET_PAGE_MASK = 0xFFFFFFFFFFFFF000
+
+    # Various ELF constants
+    EM_X86_64   = 62        # AMD x86-64 target machine
+    ELFDATA2LSB = 1         # little endian
+    ELFCLASS64  = 2
+    ELFMAG      = "\x7FELF"
+    EV_CURRENT  = 1
+    ET_CORE     = 4
+    PT_LOAD     = 1
+    PT_NOTE     = 4
+
+    # Special value for e_phnum. This indicates that the real number of
+    # program headers is too large to fit into e_phnum. Instead the real
+    # value is in the field sh_info of section 0.
+    PN_XNUM = 0xFFFF
+
+    # Format strings for packing and header size calculation.
+    ELF64_EHDR = ("4s" # e_ident/magic
+                  "B"  # e_ident/class
+                  "B"  # e_ident/data
+                  "B"  # e_ident/version
+                  "B"  # e_ident/osabi
+                  "8s" # e_ident/pad
+                  "H"  # e_type
+                  "H"  # e_machine
+                  "I"  # e_version
+                  "Q"  # e_entry
+                  "Q"  # e_phoff
+                  "Q"  # e_shoff
+                  "I"  # e_flags
+                  "H"  # e_ehsize
+                  "H"  # e_phentsize
+                  "H"  # e_phnum
+                  "H"  # e_shentsize
+                  "H"  # e_shnum
+                  "H"  # e_shstrndx
+                 )
+    ELF64_PHDR = ("I"  # p_type
+                  "I"  # p_flags
+                  "Q"  # p_offset
+                  "Q"  # p_vaddr
+                  "Q"  # p_paddr
+                  "Q"  # p_filesz
+                  "Q"  # p_memsz
+                  "Q"  # p_align
+                 )
+
+    def __init__(self):
+        super(DumpGuestMemory, self).__init__("dump-guest-memory",
+                                              gdb.COMMAND_DATA,
+                                              gdb.COMPLETE_FILENAME)
+        self.uintptr_t     = gdb.lookup_type("uintptr_t")
+        self.elf64_ehdr_le = struct.Struct("<%s" % self.ELF64_EHDR)
+        self.elf64_phdr_le = struct.Struct("<%s" % self.ELF64_PHDR)
+
+    def int128_get64(self, val):
+        assert (val["hi"] == 0)
+        return val["lo"]
+
+    def qtailq_foreach(self, head, field_str):
+        var_p = head["tqh_first"]
+        while (var_p != 0):
+            var = var_p.dereference()
+            yield var
+            var_p = var[field_str]["tqe_next"]
+
+    def qemu_get_ram_block(self, ram_addr):
+        ram_blocks = gdb.parse_and_eval("ram_list.blocks")
+        for block in self.qtailq_foreach(ram_blocks, "next"):
+            if (ram_addr - block["offset"] < block["length"]):
+                return block
+        raise gdb.GdbError("Bad ram offset %x" % ram_addr)
+
+    def qemu_get_ram_ptr(self, ram_addr):
+        block = self.qemu_get_ram_block(ram_addr)
+        return block["host"] + (ram_addr - block["offset"])
+
+    def memory_region_get_ram_ptr(self, mr):
+        if (mr["alias"] != 0):
+            return (self.memory_region_get_ram_ptr(mr["alias"].dereference()) +
+                    mr["alias_offset"])
+        return self.qemu_get_ram_ptr(mr["ram_addr"] & self.TARGET_PAGE_MASK)
+
+    def guest_phys_blocks_init(self):
+        self.guest_phys_blocks = []
+
+    def guest_phys_blocks_append(self):
+        print "guest RAM blocks:"
+        print ("target_start     target_end       host_addr        message "
+               "count")
+        print ("---------------- ---------------- ---------------- ------- "
+               "-----")
+
+        current_map_p = gdb.parse_and_eval("address_space_memory.current_map")
+        current_map = current_map_p.dereference()
+        for cur in range(current_map["nr"]):
+            flat_range   = (current_map["ranges"] + cur).dereference()
+            mr           = flat_range["mr"].dereference()
+
+            # we only care about RAM
+            if (not mr["ram"]):
+                continue
+
+            section_size = self.int128_get64(flat_range["addr"]["size"])
+            target_start = self.int128_get64(flat_range["addr"]["start"])
+            target_end   = target_start + section_size
+            host_addr    = (self.memory_region_get_ram_ptr(mr) +
+                            flat_range["offset_in_region"])
+            predecessor = None
+
+            # find continuity in guest physical address space
+            if (len(self.guest_phys_blocks) > 0):
+                predecessor = self.guest_phys_blocks[-1]
+                predecessor_size = (predecessor["target_end"] -
+                                    predecessor["target_start"])
+
+                # the memory API guarantees monotonically increasing
+                # traversal
+                assert (predecessor["target_end"] <= target_start)
+
+                # we want continuity in both guest-physical and
+                # host-virtual memory
+                if (predecessor["target_end"] < target_start or
+                    predecessor["host_addr"] + predecessor_size != host_addr):
+                    predecessor = None
+
+            if (predecessor is None):
+                # isolated mapping, add it to the list
+                self.guest_phys_blocks.append({"target_start": target_start,
+                                               "target_end"  : target_end,
+                                               "host_addr"   : host_addr})
+                message = "added"
+            else:
+                # expand predecessor until @target_end; predecessor's
+                # start doesn't change
+                predecessor["target_end"] = target_end
+                message = "joined"
+
+            print ("%016x %016x %016x %-7s %5u" %
+                   (target_start, target_end, host_addr.cast(self.uintptr_t),
+                    message, len(self.guest_phys_blocks)))
+
+    def cpu_get_dump_info(self):
+        # We can't synchronize the registers with KVM post-mortem, and
+        # the bits in (first_x86_cpu->env.hflags) seem to be stale; they
+        # may not reflect long mode for example. Hence just assume the
+        # most common values. This also means that instruction pointer
+        # etc. will be bogus in the dump, but at least the RAM contents
+        # should be valid.
+        self.dump_info = {"d_machine": self.EM_X86_64,
+                          "d_endian" : self.ELFDATA2LSB,
+                          "d_class"  : self.ELFCLASS64}
+
+    def encode_elf64_ehdr_le(self):
+        return self.elf64_ehdr_le.pack(
+                                 self.ELFMAG,                 # e_ident/magic
+                                 self.dump_info["d_class"],   # e_ident/class
+                                 self.dump_info["d_endian"],  # e_ident/data
+                                 self.EV_CURRENT,             # e_ident/version
+                                 0,                           # e_ident/osabi
+                                 "",                          # e_ident/pad
+                                 self.ET_CORE,                # e_type
+                                 self.dump_info["d_machine"], # e_machine
+                                 self.EV_CURRENT,             # e_version
+                                 0,                           # e_entry
+                                 self.elf64_ehdr_le.size,     # e_phoff
+                                 0,                           # e_shoff
+                                 0,                           # e_flags
+                                 self.elf64_ehdr_le.size,     # e_ehsize
+                                 self.elf64_phdr_le.size,     # e_phentsize
+                                 self.phdr_num,               # e_phnum
+                                 0,                           # e_shentsize
+                                 0,                           # e_shnum
+                                 0                            # e_shstrndx
+                                )
+
+    def encode_elf64_note_le(self):
+        return self.elf64_phdr_le.pack(self.PT_NOTE,         # p_type
+                                       0,                    # p_flags
+                                       (self.memory_offset -
+                                        len(self.note)),     # p_offset
+                                       0,                    # p_vaddr
+                                       0,                    # p_paddr
+                                       len(self.note),       # p_filesz
+                                       len(self.note),       # p_memsz
+                                       0                     # p_align
+                                      )
+
+    def encode_elf64_load_le(self, offset, start_hwaddr, range_size):
+        return self.elf64_phdr_le.pack(self.PT_LOAD, # p_type
+                                       0,            # p_flags
+                                       offset,       # p_offset
+                                       0,            # p_vaddr
+                                       start_hwaddr, # p_paddr
+                                       range_size,   # p_filesz
+                                       range_size,   # p_memsz
+                                       0             # p_align
+                                      )
+
+    def note_init(self, name, desc, type):
+        # name must include a trailing NUL
+        namesz = (len(name) + 1 + 3) / 4 * 4
+        descsz = (len(desc)     + 3) / 4 * 4
+        fmt = ("<"   # little endian
+               "I"   # n_namesz
+               "I"   # n_descsz
+               "I"   # n_type
+               "%us" # name
+               "%us" # desc
+               % (namesz, descsz))
+        self.note = struct.pack(fmt,
+                                len(name) + 1, len(desc), type, name, desc)
+
+    def dump_init(self):
+        self.guest_phys_blocks_init()
+        self.guest_phys_blocks_append()
+        self.cpu_get_dump_info()
+        # we have no way to retrieve the VCPU status from KVM
+        # post-mortem
+        self.note_init("NONE", "EMPTY", 0)
+
+        # Account for PT_NOTE.
+        self.phdr_num = 1
+
+        # We should never reach PN_XNUM for paging=false dumps: there's
+        # just a handful of discontiguous ranges after merging.
+        self.phdr_num += len(self.guest_phys_blocks)
+        assert (self.phdr_num < self.PN_XNUM)
+
+        # Calculate the ELF file offset where the memory dump commences:
+        #
+        #   ELF header
+        #   PT_NOTE
+        #   PT_LOAD: 1
+        #   PT_LOAD: 2
+        #   ...
+        #   PT_LOAD: len(self.guest_phys_blocks)
+        #   ELF note
+        #   memory dump
+        self.memory_offset = (self.elf64_ehdr_le.size +
+                              self.elf64_phdr_le.size * self.phdr_num +
+                              len(self.note))
+
+    def dump_begin(self, vmcore):
+        vmcore.write(self.encode_elf64_ehdr_le())
+        vmcore.write(self.encode_elf64_note_le())
+        running = self.memory_offset
+        for block in self.guest_phys_blocks:
+            range_size = block["target_end"] - block["target_start"]
+            vmcore.write(self.encode_elf64_load_le(running,
+                                                   block["target_start"],
+                                                   range_size))
+            running += range_size
+        vmcore.write(self.note)
+
+    def dump_iterate(self, vmcore):
+        qemu_core = gdb.inferiors()[0]
+        for block in self.guest_phys_blocks:
+            cur  = block["host_addr"]
+            left = block["target_end"] - block["target_start"]
+            print ("dumping range at %016x for length %016x" %
+                   (cur.cast(self.uintptr_t), left))
+            while (left > 0):
+                chunk_size = min(self.TARGET_PAGE_SIZE, left)
+                chunk = qemu_core.read_memory(cur, chunk_size)
+                vmcore.write(chunk)
+                cur  += chunk_size
+                left -= chunk_size
+
+    def create_vmcore(self, filename):
+        vmcore = open(filename, "wb")
+        self.dump_begin(vmcore)
+        self.dump_iterate(vmcore)
+        vmcore.close()
+
+    def invoke(self, args, from_tty):
+        # Unwittingly pressing the Enter key after the command should
+        # not dump the same multi-gig coredump to the same file.
+        self.dont_repeat()
+
+        argv = gdb.string_to_argv(args)
+        if (len(argv) != 1):
+            raise gdb.GdbError("usage: dump-guest-memory FILE")
+
+        self.dump_init()
+        self.create_vmcore(argv[0])
+
+DumpGuestMemory()
diff --git a/scripts/tracetool/backend/simple.py b/scripts/tracetool/backend/simple.py
index 37ef599324..3dde372e46 100644
--- a/scripts/tracetool/backend/simple.py
+++ b/scripts/tracetool/backend/simple.py
@@ -56,7 +56,7 @@ def c(events):
 
 
         out('',
-            '    TraceEvent *eventp = trace_event_id(%(event_id)s);',
+            '    TraceEvent *eventp = trace_event_id(%(event_enum)s);',
             '    bool _state = trace_event_get_state_dynamic(eventp);',
             '    if (!_state) {',
             '        return;',
@@ -65,6 +65,7 @@ def c(events):
             '    if (trace_record_start(&rec, %(event_id)s, %(size_str)s)) {',
             '        return; /* Trace Buffer Full, Event Dropped ! */',
             '    }',
+            event_enum = 'TRACE_' + event.name.upper(),
             event_id = num,
             size_str = sizestr,
             )
@@ -93,9 +94,6 @@ def c(events):
 
 
 def h(events):
-    out('#include "trace/simple.h"',
-        '')
-
     for event in events:
         out('void trace_%(name)s(%(args)s);',
             name = event.name,
diff --git a/target-arm/cpu.c b/target-arm/cpu.c
index 52efd5d66f..45ad7f0260 100644
--- a/target-arm/cpu.c
+++ b/target-arm/cpu.c
@@ -982,6 +982,7 @@ static const ARMCPUInfo arm_cpus[] = {
 
 static Property arm_cpu_properties[] = {
     DEFINE_PROP_BOOL("start-powered-off", ARMCPU, start_powered_off, false),
+    DEFINE_PROP_UINT32("midr", ARMCPU, midr, 0),
     DEFINE_PROP_END_OF_LIST()
 };
 
diff --git a/target-arm/cpu.h b/target-arm/cpu.h
index 198b6b8d4e..383c58221e 100644
--- a/target-arm/cpu.h
+++ b/target-arm/cpu.h
@@ -496,6 +496,8 @@ enum arm_fprounding {
     FPROUNDING_ODD
 };
 
+int arm_rmode_to_sf(int rmode);
+
 enum arm_cpu_mode {
   ARM_CPU_MODE_USR = 0x10,
   ARM_CPU_MODE_FIQ = 0x11,
diff --git a/target-arm/helper-a64.c b/target-arm/helper-a64.c
index 4ce0d01a85..6ca958afb1 100644
--- a/target-arm/helper-a64.c
+++ b/target-arm/helper-a64.c
@@ -122,3 +122,34 @@ uint64_t HELPER(vfp_cmped_a64)(float64 x, float64 y, void *fp_status)
 {
     return float_rel_to_flags(float64_compare(x, y, fp_status));
 }
+
+uint64_t HELPER(simd_tbl)(CPUARMState *env, uint64_t result, uint64_t indices,
+                          uint32_t rn, uint32_t numregs)
+{
+    /* Helper function for SIMD TBL and TBX. We have to do the table
+     * lookup part for the 64 bits worth of indices we're passed in.
+     * result is the initial results vector (either zeroes for TBL
+     * or some guest values for TBX), rn the register number where
+     * the table starts, and numregs the number of registers in the table.
+     * We return the results of the lookups.
+     */
+    int shift;
+
+    for (shift = 0; shift < 64; shift += 8) {
+        int index = extract64(indices, shift, 8);
+        if (index < 16 * numregs) {
+            /* Convert index (a byte offset into the virtual table
+             * which is a series of 128-bit vectors concatenated)
+             * into the correct vfp.regs[] element plus a bit offset
+             * into that element, bearing in mind that the table
+             * can wrap around from V31 to V0.
+             */
+            int elt = (rn * 2 + (index >> 3)) % 64;
+            int bitidx = (index & 7) * 8;
+            uint64_t val = extract64(env->vfp.regs[elt], bitidx, 8);
+
+            result = deposit64(result, shift, 8, val);
+        }
+    }
+    return result;
+}
diff --git a/target-arm/helper-a64.h b/target-arm/helper-a64.h
index bca19f3dea..99832ee55e 100644
--- a/target-arm/helper-a64.h
+++ b/target-arm/helper-a64.h
@@ -26,3 +26,4 @@ DEF_HELPER_3(vfp_cmps_a64, i64, f32, f32, ptr)
 DEF_HELPER_3(vfp_cmpes_a64, i64, f32, f32, ptr)
 DEF_HELPER_3(vfp_cmpd_a64, i64, f64, f64, ptr)
 DEF_HELPER_3(vfp_cmped_a64, i64, f64, f64, ptr)
+DEF_HELPER_FLAGS_5(simd_tbl, TCG_CALL_NO_RWG_SE, i64, env, i64, i64, i32, i32)
diff --git a/target-arm/helper.c b/target-arm/helper.c
index c708f15e27..ca5b0000ad 100644
--- a/target-arm/helper.c
+++ b/target-arm/helper.c
@@ -4048,6 +4048,23 @@ uint32_t HELPER(set_rmode)(uint32_t rmode, CPUARMState *env)
     return prev_rmode;
 }
 
+/* Set the current fp rounding mode in the standard fp status and return
+ * the old one. This is for NEON instructions that need to change the
+ * rounding mode but wish to use the standard FPSCR values for everything
+ * else. Always set the rounding mode back to the correct value after
+ * modifying it.
+ * The argument is a softfloat float_round_ value.
+ */
+uint32_t HELPER(set_neon_rmode)(uint32_t rmode, CPUARMState *env)
+{
+    float_status *fp_status = &env->vfp.standard_fp_status;
+
+    uint32_t prev_rmode = get_float_rounding_mode(fp_status);
+    set_float_rounding_mode(rmode, fp_status);
+
+    return prev_rmode;
+}
+
 /* Half precision conversions.  */
 static float32 do_fcvt_f16_to_f32(uint32_t a, CPUARMState *env, float_status *s)
 {
@@ -4418,3 +4435,31 @@ float64 HELPER(rintd)(float64 x, void *fp_status)
 
     return ret;
 }
+
+/* Convert ARM rounding mode to softfloat */
+int arm_rmode_to_sf(int rmode)
+{
+    switch (rmode) {
+    case FPROUNDING_TIEAWAY:
+        rmode = float_round_ties_away;
+        break;
+    case FPROUNDING_ODD:
+        /* FIXME: add support for ODD; until then fall through to TIEEVEN */
+        qemu_log_mask(LOG_UNIMP, "arm: unimplemented rounding mode: %d\n",
+                      rmode);
+    case FPROUNDING_TIEEVEN:
+    default:
+        rmode = float_round_nearest_even;
+        break;
+    case FPROUNDING_POSINF:
+        rmode = float_round_up;
+        break;
+    case FPROUNDING_NEGINF:
+        rmode = float_round_down;
+        break;
+    case FPROUNDING_ZERO:
+        rmode = float_round_to_zero;
+        break;
+    }
+    return rmode;
+}
diff --git a/target-arm/helper.h b/target-arm/helper.h
index 70872dffc6..71b8411120 100644
--- a/target-arm/helper.h
+++ b/target-arm/helper.h
@@ -149,6 +149,7 @@ DEF_HELPER_3(vfp_ultod, f64, i64, i32, ptr)
 DEF_HELPER_3(vfp_uqtod, f64, i64, i32, ptr)
 
 DEF_HELPER_FLAGS_2(set_rmode, TCG_CALL_NO_RWG, i32, i32, env)
+DEF_HELPER_FLAGS_2(set_neon_rmode, TCG_CALL_NO_RWG, i32, i32, env)
 
 DEF_HELPER_2(vfp_fcvt_f16_to_f32, f32, i32, env)
 DEF_HELPER_2(vfp_fcvt_f32_to_f16, i32, f32, env)
diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index cf80c46b90..6c1ec1edc6 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -61,6 +61,20 @@ enum a64_shift_type {
     A64_SHIFT_TYPE_ROR = 3
 };
 
+/* Table based decoder typedefs - used when the relevant bits for decode
+ * are too awkwardly scattered across the instruction (eg SIMD).
+ */
+typedef void AArch64DecodeFn(DisasContext *s, uint32_t insn);
+
+typedef struct AArch64DecodeTable {
+    uint32_t pattern;           /* expected value of (insn & mask) */
+    uint32_t mask;              /* insn bits to compare; 0 ends the table */
+    AArch64DecodeFn *disas_fn;  /* handler for a matching insn */
+} AArch64DecodeTable;
+
+/* Function prototype for gen_ functions for calling Neon helpers */
+typedef void NeonGenTwoOpFn(TCGv_i32, TCGv_i32, TCGv_i32);
+
 /* initialize TCG globals.  */
 void a64_translate_init(void)
 {
@@ -308,6 +322,28 @@ static TCGv_i64 read_cpu_reg_sp(DisasContext *s, int reg, int sf)
     return v;
 }
 
+/* Compute the byte offset within CPUARMState of one element of vector
+ * register Qn ('regno'). 'size' is log2 of the element width in bytes and
+ * 'element' counts elements up from the least significant end.
+ */
+static inline int vec_reg_offset(int regno, int element, TCGMemOp size)
+{
+    const int ebytes = 1 << size;
+    int offset = offsetof(CPUARMState, vfp.regs[regno * 2]);
+#ifdef HOST_WORDS_BIGENDIAN
+    /* vfp.regs[2n] holds the low 64 bits and vfp.regs[2n+1] the high
+     * 64 bits of the vector even on a big endian host. So first work
+     * out the offset as if all 128 bits were stored big endian, then
+     * flip bit 3 to swap the two 64 bit halves back into place.
+     */
+    offset += 16 - (element + 1) * ebytes;
+    offset ^= 8;
+#else
+    offset += element * ebytes;
+#endif
+    return offset;
+}
+
 /* Return the offset into CPUARMState of a slice (from
  * the least significant end) of FP register Qn (ie
  * Dn, Sn, Hn or Bn).
@@ -661,6 +697,156 @@ static void do_fp_ld(DisasContext *s, int destidx, TCGv_i64 tcg_addr, int size)
 }
 
 /*
+ * Vector load/store helpers.
+ *
+ * The principal difference between this and a FP load is that we don't
+ * zero extend as we are filling a partial chunk of the vector register.
+ * These functions don't support 128 bit loads/stores, which would be
+ * normal load/store operations.
+ *
+ * The _i32 versions are useful when operating on 32 bit quantities
+ * (eg for floating point single or using Neon helper functions).
+ */
+
+/* Read one element of a vector register, extended to 64 bits per memop */
+static void read_vec_element(DisasContext *s, TCGv_i64 tcg_dest, int srcidx,
+                             int element, TCGMemOp memop)
+{
+    int vect_off = vec_reg_offset(srcidx, element, memop & MO_SIZE);
+    switch (memop) {
+    case MO_8:
+        tcg_gen_ld8u_i64(tcg_dest, cpu_env, vect_off);
+        break;
+    case MO_16:
+        tcg_gen_ld16u_i64(tcg_dest, cpu_env, vect_off);
+        break;
+    case MO_32:
+        tcg_gen_ld32u_i64(tcg_dest, cpu_env, vect_off);
+        break;
+    case MO_8|MO_SIGN:
+        tcg_gen_ld8s_i64(tcg_dest, cpu_env, vect_off);
+        break;
+    case MO_16|MO_SIGN:
+        tcg_gen_ld16s_i64(tcg_dest, cpu_env, vect_off);
+        break;
+    case MO_32|MO_SIGN:
+        tcg_gen_ld32s_i64(tcg_dest, cpu_env, vect_off);
+        break;
+    case MO_64:
+    case MO_64|MO_SIGN: /* full width, so signedness makes no difference */
+        tcg_gen_ld_i64(tcg_dest, cpu_env, vect_off);
+        break;
+    default: /* any other memop value is not supported here */
+        g_assert_not_reached();
+    }
+}
+
+static void read_vec_element_i32(DisasContext *s, TCGv_i32 tcg_dest, int srcidx,
+                                 int element, TCGMemOp memop)
+{
+    int vect_off = vec_reg_offset(srcidx, element, memop & MO_SIZE);
+    switch (memop) {
+    case MO_8:
+        tcg_gen_ld8u_i32(tcg_dest, cpu_env, vect_off);
+        break;
+    case MO_16:
+        tcg_gen_ld16u_i32(tcg_dest, cpu_env, vect_off);
+        break;
+    case MO_8|MO_SIGN:
+        tcg_gen_ld8s_i32(tcg_dest, cpu_env, vect_off);
+        break;
+    case MO_16|MO_SIGN:
+        tcg_gen_ld16s_i32(tcg_dest, cpu_env, vect_off);
+        break;
+    case MO_32:
+    case MO_32|MO_SIGN: /* fills the i32, so signedness makes no difference */
+        tcg_gen_ld_i32(tcg_dest, cpu_env, vect_off);
+        break;
+    default: /* includes MO_64: not handled by the 32 bit variant */
+        g_assert_not_reached();
+    }
+}
+
+/* Store the low bits of tcg_src into one element of a vector register */
+static void write_vec_element(DisasContext *s, TCGv_i64 tcg_src, int destidx,
+                              int element, TCGMemOp memop)
+{
+    int vect_off = vec_reg_offset(destidx, element, memop & MO_SIZE);
+    switch (memop) {
+    case MO_8:
+        tcg_gen_st8_i64(tcg_src, cpu_env, vect_off);
+        break;
+    case MO_16:
+        tcg_gen_st16_i64(tcg_src, cpu_env, vect_off);
+        break;
+    case MO_32:
+        tcg_gen_st32_i64(tcg_src, cpu_env, vect_off);
+        break;
+    case MO_64:
+        tcg_gen_st_i64(tcg_src, cpu_env, vect_off);
+        break;
+    default: /* memop must be a bare MO_8..MO_64 size, with no other flags */
+        g_assert_not_reached();
+    }
+}
+
+static void write_vec_element_i32(DisasContext *s, TCGv_i32 tcg_src,
+                                  int destidx, int element, TCGMemOp memop)
+{
+    int vect_off = vec_reg_offset(destidx, element, memop & MO_SIZE);
+    switch (memop) {
+    case MO_8:
+        tcg_gen_st8_i32(tcg_src, cpu_env, vect_off);
+        break;
+    case MO_16:
+        tcg_gen_st16_i32(tcg_src, cpu_env, vect_off);
+        break;
+    case MO_32:
+        tcg_gen_st_i32(tcg_src, cpu_env, vect_off);
+        break;
+    default: /* memop must be a bare MO_8..MO_32 size, with no other flags */
+        g_assert_not_reached();
+    }
+}
+
+/* Zero bits [127:64] of vector register 'rd'. Operations on the 64 bit
+ * (non-quad) form of a vector generally all have to do this.
+ */
+static void clear_vec_high(DisasContext *s, int rd)
+{
+    TCGv_i64 zero = tcg_const_i64(0);
+
+    write_vec_element(s, zero, rd, 1, MO_64);
+    tcg_temp_free_i64(zero);
+}
+
+/* Store a vector element to memory; size is MO_8..MO_64, MO_TE added here */
+static void do_vec_st(DisasContext *s, int srcidx, int element,
+                      TCGv_i64 tcg_addr, int size)
+{
+    TCGMemOp memop = MO_TE + size;
+    TCGv_i64 tcg_tmp = tcg_temp_new_i64();
+
+    read_vec_element(s, tcg_tmp, srcidx, element, size);
+    tcg_gen_qemu_st_i64(tcg_tmp, tcg_addr, get_mem_index(s), memop);
+
+    tcg_temp_free_i64(tcg_tmp);
+}
+
+/* Load a vector element from memory; size is MO_8..MO_64, MO_TE added here */
+static void do_vec_ld(DisasContext *s, int destidx, int element,
+                      TCGv_i64 tcg_addr, int size)
+{
+    TCGMemOp memop = MO_TE + size;
+    TCGv_i64 tcg_tmp = tcg_temp_new_i64();
+
+    tcg_gen_qemu_ld_i64(tcg_tmp, tcg_addr, get_mem_index(s), memop);
+    write_vec_element(s, tcg_tmp, destidx, element, size);
+
+    tcg_temp_free_i64(tcg_tmp);
+}
+
+/*
  * This utility function is for doing register extension with an
  * optional shift. You will likely want to pass a temporary for the
  * destination register. See DecodeRegExtend() in the ARM ARM.
@@ -722,6 +908,31 @@ static inline void gen_check_sp_alignment(DisasContext *s)
 }
 
 /*
+ * Simple table driven decoder lookup, for use where the bits that
+ * matter for decode are placed so awkwardly that switch/if logic
+ * would end up confusing and deeply nested. The table is searched
+ * linearly from the start, so keep tables small.
+ *
+ * Returns the disas_fn of the first entry for which
+ * (insn & mask) == pattern, or NULL when nothing matches.
+ * An entry whose mask is 0 marks the end of the table and is never
+ * itself matched.
+ */
+static inline AArch64DecodeFn *lookup_disas_fn(const AArch64DecodeTable *table,
+                                               uint32_t insn)
+{
+    const AArch64DecodeTable *entry;
+
+    for (entry = table; entry->mask != 0; entry++) {
+        if ((insn & entry->mask) != entry->pattern) {
+            continue;
+        }
+        return entry->disas_fn;
+    }
+    return NULL;
+}
+
+/*
  * the instruction disassembly implemented here matches
  * the instruction encoding classifications in chapter 3 (C3)
  * of the ARM Architecture Reference Manual (DDI0487A_a)
@@ -1835,16 +2046,278 @@ static void disas_ldst_reg(DisasContext *s, uint32_t insn)
     }
 }
 
-/* AdvSIMD load/store multiple structures */
+/* C3.3.1 AdvSIMD load/store multiple structures
+ *
+ *  31  30  29           23 22  21         16 15    12 11  10 9    5 4    0
+ * +---+---+---------------+---+-------------+--------+------+------+------+
+ * | 0 | Q | 0 0 1 1 0 0 0 | L | 0 0 0 0 0 0 | opcode | size |  Rn  |  Rt  |
+ * +---+---+---------------+---+-------------+--------+------+------+------+
+ *
+ * C3.3.2 AdvSIMD load/store multiple structures (post-indexed)
+ *
+ *  31  30  29           23 22  21  20     16 15    12 11  10 9    5 4    0
+ * +---+---+---------------+---+---+---------+--------+------+------+------+
+ * | 0 | Q | 0 0 1 1 0 0 1 | L | 0 |   Rm    | opcode | size |  Rn  |  Rt  |
+ * +---+---+---------------+---+---+---------+--------+------+------+------+
+ *
+ * Rt: first (or only) SIMD&FP register to be transferred
+ * Rn: base address or SP
+ * Rm (post-index only): post-index register (when !31) or size dependent #imm
+ */
 static void disas_ldst_multiple_struct(DisasContext *s, uint32_t insn)
 {
-    unsupported_encoding(s, insn);
+    int rt = extract32(insn, 0, 5);
+    int rn = extract32(insn, 5, 5);
+    int size = extract32(insn, 10, 2);
+    int opcode = extract32(insn, 12, 4);
+    bool is_store = !extract32(insn, 22, 1);
+    bool is_postidx = extract32(insn, 23, 1);
+    bool is_q = extract32(insn, 30, 1);
+    TCGv_i64 tcg_addr, tcg_rn;
+
+    int ebytes = 1 << size;
+    int elements = (is_q ? 128 : 64) / (8 << size);
+    int rpt;    /* num iterations */
+    int selem;  /* structure elements */
+    int r;
+
+    if (extract32(insn, 31, 1) || extract32(insn, 21, 1)) {
+        unallocated_encoding(s);
+        return;
+    }
+
+    /* opcode fixes the repeat count and the structure element count */
+    switch (opcode) {
+    case 0x0: /* LD4/ST4 */
+        rpt = 1;
+        selem = 4;
+        break;
+    case 0x2: /* LD1/ST1 (4 registers) */
+        rpt = 4;
+        selem = 1;
+        break;
+    case 0x4: /* LD3/ST3 */
+        rpt = 1;
+        selem = 3;
+        break;
+    case 0x6: /* LD1/ST1 (3 registers) */
+        rpt = 3;
+        selem = 1;
+        break;
+    case 0x7: /* LD1/ST1 (1 register) */
+        rpt = 1;
+        selem = 1;
+        break;
+    case 0x8: /* LD2/ST2 */
+        rpt = 1;
+        selem = 2;
+        break;
+    case 0xa: /* LD1/ST1 (2 registers) */
+        rpt = 2;
+        selem = 1;
+        break;
+    default:
+        unallocated_encoding(s);
+        return;
+    }
+
+    if (size == 3 && !is_q && selem != 1) {
+        /* reserved */
+        unallocated_encoding(s);
+        return;
+    }
+
+    if (rn == 31) {
+        gen_check_sp_alignment(s);
+    }
+
+    tcg_rn = cpu_reg_sp(s, rn);
+    tcg_addr = tcg_temp_new_i64();
+    tcg_gen_mov_i64(tcg_addr, tcg_rn);
+
+    for (r = 0; r < rpt; r++) {
+        int e;
+        for (e = 0; e < elements; e++) {
+            int tt = (rt + r) % 32;
+            int xs;
+            for (xs = 0; xs < selem; xs++) {
+                if (is_store) {
+                    do_vec_st(s, tt, e, tcg_addr, size);
+                } else {
+                    do_vec_ld(s, tt, e, tcg_addr, size);
+
+                    /* For non-quad operations, setting a slice of the low
+                     * 64 bits of the register clears the high 64 bits (in
+                     * the ARM ARM pseudocode this is implicit in the fact
+                     * that 'rval' is a 64 bit wide variable). We optimize
+                     * by noticing that we only need to do this the first
+                     * time we touch a register.
+                     */
+                    if (!is_q && e == 0 && (r == 0 || xs == selem - 1)) {
+                        clear_vec_high(s, tt);
+                    }
+                }
+                tcg_gen_addi_i64(tcg_addr, tcg_addr, ebytes);
+                tt = (tt + 1) % 32;
+            }
+        }
+    }
+
+    if (is_postidx) {
+        int rm = extract32(insn, 16, 5);
+        if (rm == 31) { /* immediate form: Rn ends up just past the data */
+            tcg_gen_mov_i64(tcg_rn, tcg_addr);
+        } else {
+            tcg_gen_add_i64(tcg_rn, tcg_rn, cpu_reg(s, rm));
+        }
+    }
+    tcg_temp_free_i64(tcg_addr);
 }
 
-/* AdvSIMD load/store single structure */
+/* C3.3.3 AdvSIMD load/store single structure
+ *
+ *  31  30  29           23 22 21 20       16 15 13 12  11  10 9    5 4    0
+ * +---+---+---------------+-----+-----------+-----+---+------+------+------+
+ * | 0 | Q | 0 0 1 1 0 1 0 | L R | 0 0 0 0 0 | opc | S | size |  Rn  |  Rt  |
+ * +---+---+---------------+-----+-----------+-----+---+------+------+------+
+ *
+ * C3.3.4 AdvSIMD load/store single structure (post-indexed)
+ *
+ *  31  30  29           23 22 21 20       16 15 13 12  11  10 9    5 4    0
+ * +---+---+---------------+-----+-----------+-----+---+------+------+------+
+ * | 0 | Q | 0 0 1 1 0 1 1 | L R |     Rm    | opc | S | size |  Rn  |  Rt  |
+ * +---+---+---------------+-----+-----------+-----+---+------+------+------+
+ *
+ * Rt: first (or only) SIMD&FP register to be transferred
+ * Rn: base address or SP
+ * Rm (post-index only): post-index register (when !31) or size dependent #imm
+ * index = encoded in Q:S:size dependent on size
+ *
+ * lane_size = encoded in R, opc
+ * transfer width = encoded in opc, S, size
+ */
 static void disas_ldst_single_struct(DisasContext *s, uint32_t insn)
 {
-    unsupported_encoding(s, insn);
+    int rt = extract32(insn, 0, 5);
+    int rn = extract32(insn, 5, 5);
+    int size = extract32(insn, 10, 2);
+    int S = extract32(insn, 12, 1);
+    int opc = extract32(insn, 13, 3);
+    int R = extract32(insn, 21, 1);
+    int is_load = extract32(insn, 22, 1);
+    int is_postidx = extract32(insn, 23, 1);
+    int is_q = extract32(insn, 30, 1);
+
+    int scale = extract32(opc, 1, 2);
+    int selem = (extract32(opc, 0, 1) << 1 | R) + 1;
+    bool replicate = false;
+    int index = is_q << 3 | S << 2 | size;
+    int ebytes, xs;
+    TCGv_i64 tcg_addr, tcg_rn;
+
+    switch (scale) {
+    case 3: /* load-and-replicate forms: element size comes from 'size' */
+        if (!is_load || S) {
+            unallocated_encoding(s);
+            return;
+        }
+        scale = size;
+        replicate = true;
+        break;
+    case 0:
+        break;
+    case 1:
+        if (extract32(size, 0, 1)) {
+            unallocated_encoding(s);
+            return;
+        }
+        index >>= 1;
+        break;
+    case 2:
+        if (extract32(size, 1, 1)) {
+            unallocated_encoding(s);
+            return;
+        }
+        if (!extract32(size, 0, 1)) {
+            index >>= 2;
+        } else {
+            if (S) {
+                unallocated_encoding(s);
+                return;
+            }
+            index >>= 3;
+            scale = 3;
+        }
+        break;
+    default:
+        g_assert_not_reached();
+    }
+
+    ebytes = 1 << scale;
+
+    if (rn == 31) {
+        gen_check_sp_alignment(s);
+    }
+
+    tcg_rn = cpu_reg_sp(s, rn);
+    tcg_addr = tcg_temp_new_i64();
+    tcg_gen_mov_i64(tcg_addr, tcg_rn);
+
+    for (xs = 0; xs < selem; xs++) {
+        if (replicate) {
+            /* Load and replicate to all elements */
+            uint64_t mulconst;
+            TCGv_i64 tcg_tmp = tcg_temp_new_i64();
+
+            tcg_gen_qemu_ld_i64(tcg_tmp, tcg_addr,
+                                get_mem_index(s), MO_TE + scale);
+            switch (scale) {
+            case 0:
+                mulconst = 0x0101010101010101ULL;
+                break;
+            case 1:
+                mulconst = 0x0001000100010001ULL;
+                break;
+            case 2:
+                mulconst = 0x0000000100000001ULL;
+                break;
+            case 3:
+                mulconst = 0;
+                break;
+            default:
+                g_assert_not_reached();
+            }
+            if (mulconst) {
+                tcg_gen_muli_i64(tcg_tmp, tcg_tmp, mulconst);
+            }
+            write_vec_element(s, tcg_tmp, rt, 0, MO_64);
+            if (is_q) {
+                write_vec_element(s, tcg_tmp, rt, 1, MO_64);
+            } else {
+                clear_vec_high(s, rt);
+            }
+            tcg_temp_free_i64(tcg_tmp);
+        } else {
+            /* Load/store one element per register; helpers add MO_TE */
+            if (is_load) {
+                do_vec_ld(s, rt, index, tcg_addr, scale);
+            } else {
+                do_vec_st(s, rt, index, tcg_addr, scale);
+            }
+        }
+        tcg_gen_addi_i64(tcg_addr, tcg_addr, ebytes);
+        rt = (rt + 1) % 32;
+    }
+
+    if (is_postidx) {
+        int rm = extract32(insn, 16, 5);
+        if (rm == 31) {
+            tcg_gen_mov_i64(tcg_rn, tcg_addr);
+        } else {
+            tcg_gen_add_i64(tcg_rn, tcg_rn, cpu_reg(s, rm));
+        }
+    }
+    tcg_temp_free_i64(tcg_addr);
 }
 
 /* C3.3 Loads and stores */
@@ -3186,34 +3659,6 @@ static void disas_data_proc_reg(DisasContext *s, uint32_t insn)
     }
 }
 
-/* Convert ARM rounding mode to softfloat */
-static inline int arm_rmode_to_sf(int rmode)
-{
-    switch (rmode) {
-    case FPROUNDING_TIEAWAY:
-        rmode = float_round_ties_away;
-        break;
-    case FPROUNDING_ODD:
-        /* FIXME: add support for TIEAWAY and ODD */
-        qemu_log_mask(LOG_UNIMP, "arm: unimplemented rounding mode: %d\n",
-                      rmode);
-    case FPROUNDING_TIEEVEN:
-    default:
-        rmode = float_round_nearest_even;
-        break;
-    case FPROUNDING_POSINF:
-        rmode = float_round_up;
-        break;
-    case FPROUNDING_NEGINF:
-        rmode = float_round_down;
-        break;
-    case FPROUNDING_ZERO:
-        rmode = float_round_to_zero;
-        break;
-    }
-    return rmode;
-}
-
 static void handle_fp_compare(DisasContext *s, bool is_double,
                               unsigned int rn, unsigned int rm,
                               bool cmp_with_zero, bool signal_all_nans)
@@ -4224,13 +4669,2201 @@ static void disas_data_proc_fp(DisasContext *s, uint32_t insn)
     }
 }
 
+static void do_ext64(DisasContext *s, TCGv_i64 tcg_left, TCGv_i64 tcg_right,
+                     int pos)
+{
+    /* Treat left:right as one 128 bit value built from two 64 bit
+     * vector register slices, with 'right' the least significant
+     * half, and pull out the 64 bits that begin 'pos' bits up from
+     * the bottom. The result replaces tcg_right; tcg_left is only
+     * read, never modified.
+     */
+    TCGv_i64 tcg_upper = tcg_temp_new_i64();
+    assert(pos > 0 && pos < 64);
+
+    tcg_gen_shri_i64(tcg_right, tcg_right, pos);
+    tcg_gen_shli_i64(tcg_upper, tcg_left, 64 - pos);
+    tcg_gen_or_i64(tcg_right, tcg_right, tcg_upper);
+
+    tcg_temp_free_i64(tcg_upper);
+}
+
+/* C3.6.1 EXT
+ *   31  30 29         24 23 22  21 20  16 15  14  11 10  9    5 4    0
+ * +---+---+-------------+-----+---+------+---+------+---+------+------+
+ * | 0 | Q | 1 0 1 1 1 0 | op2 | 0 |  Rm  | 0 | imm4 | 0 |  Rn  |  Rd  |
+ * +---+---+-------------+-----+---+------+---+------+---+------+------+
+ */
+static void disas_simd_ext(DisasContext *s, uint32_t insn)
+{
+    int is_q = extract32(insn, 30, 1);
+    int op2 = extract32(insn, 22, 2);
+    int imm4 = extract32(insn, 11, 4);
+    int rm = extract32(insn, 16, 5);
+    int rn = extract32(insn, 5, 5);
+    int rd = extract32(insn, 0, 5);
+    int pos = imm4 << 3; /* imm4 scaled by 8: used below as a bit position */
+    TCGv_i64 tcg_resl, tcg_resh;
+
+    if (op2 != 0 || (!is_q && extract32(imm4, 3, 1))) {
+        unallocated_encoding(s);
+        return;
+    }
+
+    tcg_resh = tcg_temp_new_i64();
+    tcg_resl = tcg_temp_new_i64();
+
+    /* Vd gets bits starting at pos bits into Vm:Vn. This is
+     * either extracting 128 bits from a 128:128 concatenation, or
+     * extracting 64 bits from a 64:64 concatenation.
+     */
+    if (!is_q) {
+        read_vec_element(s, tcg_resl, rn, 0, MO_64);
+        if (pos != 0) {
+            read_vec_element(s, tcg_resh, rm, 0, MO_64);
+            do_ext64(s, tcg_resh, tcg_resl, pos);
+        }
+        tcg_gen_movi_i64(tcg_resh, 0); /* high half of Vd is zeroed */
+    } else {
+        TCGv_i64 tcg_hh;
+        typedef struct {
+            int reg;
+            int elt;
+        } EltPosns;
+        EltPosns eltposns[] = { {rn, 0}, {rn, 1}, {rm, 0}, {rm, 1} };
+        EltPosns *elt = eltposns;
+
+        if (pos >= 64) {
+            elt++; /* start from the second 64 bit chunk of Vm:Vn */
+            pos -= 64;
+        }
+
+        read_vec_element(s, tcg_resl, elt->reg, elt->elt, MO_64);
+        elt++;
+        read_vec_element(s, tcg_resh, elt->reg, elt->elt, MO_64);
+        elt++;
+        if (pos != 0) {
+            do_ext64(s, tcg_resh, tcg_resl, pos);
+            tcg_hh = tcg_temp_new_i64();
+            read_vec_element(s, tcg_hh, elt->reg, elt->elt, MO_64);
+            do_ext64(s, tcg_hh, tcg_resh, pos);
+            tcg_temp_free_i64(tcg_hh);
+        }
+    }
+
+    write_vec_element(s, tcg_resl, rd, 0, MO_64);
+    tcg_temp_free_i64(tcg_resl);
+    write_vec_element(s, tcg_resh, rd, 1, MO_64);
+    tcg_temp_free_i64(tcg_resh);
+}
+
+/* C3.6.2 TBL/TBX
+ *   31  30 29         24 23 22  21 20  16 15  14 13  12  11 10 9    5 4    0
+ * +---+---+-------------+-----+---+------+---+-----+----+-----+------+------+
+ * | 0 | Q | 0 0 1 1 1 0 | op2 | 0 |  Rm  | 0 | len | op | 0 0 |  Rn  |  Rd  |
+ * +---+---+-------------+-----+---+------+---+-----+----+-----+------+------+
+ */
+static void disas_simd_tb(DisasContext *s, uint32_t insn)
+{
+    int op2 = extract32(insn, 22, 2);
+    int is_q = extract32(insn, 30, 1);
+    int rm = extract32(insn, 16, 5);
+    int rn = extract32(insn, 5, 5);
+    int rd = extract32(insn, 0, 5);
+    int is_tblx = extract32(insn, 12, 1); /* the 'op' bit */
+    int len = extract32(insn, 13, 2);
+    TCGv_i64 tcg_resl, tcg_resh, tcg_idx;
+    TCGv_i32 tcg_regno, tcg_numregs;
+
+    if (op2 != 0) {
+        unallocated_encoding(s);
+        return;
+    }
+
+    /* This does a table lookup: for every byte element in the input
+     * we index into a table formed from up to four vector registers,
+     * and then the output is the result of the lookups. Our helper
+     * function does the lookup operation for a single 64 bit part of
+     * the input.
+     */
+    tcg_resl = tcg_temp_new_i64();
+    tcg_resh = tcg_temp_new_i64();
+
+    if (is_tblx) { /* seed the result with the current contents of Vd */
+        read_vec_element(s, tcg_resl, rd, 0, MO_64);
+    } else {
+        tcg_gen_movi_i64(tcg_resl, 0);
+    }
+    if (is_tblx && is_q) {
+        read_vec_element(s, tcg_resh, rd, 1, MO_64);
+    } else {
+        tcg_gen_movi_i64(tcg_resh, 0);
+    }
+
+    tcg_idx = tcg_temp_new_i64();
+    tcg_regno = tcg_const_i32(rn);
+    tcg_numregs = tcg_const_i32(len + 1);
+    read_vec_element(s, tcg_idx, rm, 0, MO_64);
+    gen_helper_simd_tbl(tcg_resl, cpu_env, tcg_resl, tcg_idx,
+                        tcg_regno, tcg_numregs);
+    if (is_q) { /* second helper call covers the upper 64 bits of Vm */
+        read_vec_element(s, tcg_idx, rm, 1, MO_64);
+        gen_helper_simd_tbl(tcg_resh, cpu_env, tcg_resh, tcg_idx,
+                            tcg_regno, tcg_numregs);
+    }
+    tcg_temp_free_i64(tcg_idx);
+    tcg_temp_free_i32(tcg_regno);
+    tcg_temp_free_i32(tcg_numregs);
+
+    write_vec_element(s, tcg_resl, rd, 0, MO_64);
+    tcg_temp_free_i64(tcg_resl);
+    write_vec_element(s, tcg_resh, rd, 1, MO_64);
+    tcg_temp_free_i64(tcg_resh);
+}
+
+/* C3.6.3 ZIP/UZP/TRN
+ *   31  30 29         24 23  22  21 20   16 15 14 12 11 10 9    5 4    0
+ * +---+---+-------------+------+---+------+---+-----+-----+------+------+
+ * | 0 | Q | 0 0 1 1 1 0 | size | 0 |  Rm  | 0 | opc | 1 0 |  Rn  |  Rd  |
+ * +---+---+-------------+------+---+------+---+-----+-----+------+------+
+ */
+static void disas_simd_zip_trn(DisasContext *s, uint32_t insn)
+{
+    int rd = extract32(insn, 0, 5);
+    int rn = extract32(insn, 5, 5);
+    int rm = extract32(insn, 16, 5);
+    int size = extract32(insn, 22, 2);
+    /* opc field bits [1:0] indicate ZIP/UZP/TRN;
+     * bit 2 indicates 1 vs 2 variant of the insn.
+     */
+    int opcode = extract32(insn, 12, 2); /* opc[1:0] */
+    bool part = extract32(insn, 14, 1);  /* opc[2] */
+    bool is_q = extract32(insn, 30, 1);
+    int esize = 8 << size;
+    int i, ofs;
+    int datasize = is_q ? 128 : 64;
+    int elements = datasize / esize;
+    TCGv_i64 tcg_res, tcg_resl, tcg_resh;
+
+    if (opcode == 0 || (size == 3 && !is_q)) {
+        unallocated_encoding(s);
+        return;
+    }
+
+    tcg_resl = tcg_const_i64(0);
+    tcg_resh = tcg_const_i64(0);
+    tcg_res = tcg_temp_new_i64();
+
+    for (i = 0; i < elements; i++) {
+        switch (opcode) {
+        case 1: /* UZP1/2 */
+        {
+            int midpoint = elements / 2;
+            if (i < midpoint) {
+                read_vec_element(s, tcg_res, rn, 2 * i + part, size);
+            } else {
+                read_vec_element(s, tcg_res, rm,
+                                 2 * (i - midpoint) + part, size);
+            }
+            break;
+        }
+        case 2: /* TRN1/2 */
+            if (i & 1) {
+                read_vec_element(s, tcg_res, rm, (i & ~1) + part, size);
+            } else {
+                read_vec_element(s, tcg_res, rn, (i & ~1) + part, size);
+            }
+            break;
+        case 3: /* ZIP1/2 */
+        {
+            int base = part * elements / 2;
+            if (i & 1) {
+                read_vec_element(s, tcg_res, rm, base + (i >> 1), size);
+            } else {
+                read_vec_element(s, tcg_res, rn, base + (i >> 1), size);
+            }
+            break;
+        }
+        default:
+            g_assert_not_reached();
+        }
+
+        ofs = i * esize; /* bit position of result element i */
+        if (ofs < 64) {
+            tcg_gen_shli_i64(tcg_res, tcg_res, ofs);
+            tcg_gen_or_i64(tcg_resl, tcg_resl, tcg_res);
+        } else {
+            tcg_gen_shli_i64(tcg_res, tcg_res, ofs - 64);
+            tcg_gen_or_i64(tcg_resh, tcg_resh, tcg_res);
+        }
+    }
+
+    tcg_temp_free_i64(tcg_res);
+
+    write_vec_element(s, tcg_resl, rd, 0, MO_64);
+    tcg_temp_free_i64(tcg_resl);
+    write_vec_element(s, tcg_resh, rd, 1, MO_64);
+    tcg_temp_free_i64(tcg_resh);
+}
+
+static void do_minmaxop(DisasContext *s, TCGv_i32 tcg_elt1, TCGv_i32 tcg_elt2,
+                        int opc, bool is_min, TCGv_ptr fpst)
+{
+    /* Used by disas_simd_across_lanes: emit one single precision min or
+     * max of tcg_elt1 and tcg_elt2, leaving the result in tcg_elt1.
+     * opc 0xc selects the minnum/maxnum helpers, 0xf the plain min/max.
+     */
+    if (opc != 0xc) {
+        assert(opc == 0xf);
+        if (is_min) {
+            gen_helper_vfp_mins(tcg_elt1, tcg_elt1, tcg_elt2, fpst);
+        } else {
+            gen_helper_vfp_maxs(tcg_elt1, tcg_elt1, tcg_elt2, fpst);
+        }
+        return;
+    }
+    if (is_min) {
+        gen_helper_vfp_minnums(tcg_elt1, tcg_elt1, tcg_elt2, fpst);
+    } else {
+        gen_helper_vfp_maxnums(tcg_elt1, tcg_elt1, tcg_elt2, fpst);
+    }
+}
+
+/* C3.6.4 AdvSIMD across lanes
+ *   31  30  29 28       24 23  22 21       17 16    12 11 10 9    5 4    0
+ * +---+---+---+-----------+------+-----------+--------+-----+------+------+
+ * | 0 | Q | U | 0 1 1 1 0 | size | 1 1 0 0 0 | opcode | 1 0 |  Rn  |  Rd  |
+ * +---+---+---+-----------+------+-----------+--------+-----+------+------+
+ */
+static void disas_simd_across_lanes(DisasContext *s, uint32_t insn)
+{
+    int rd = extract32(insn, 0, 5);
+    int rn = extract32(insn, 5, 5);
+    int size = extract32(insn, 22, 2);
+    int opcode = extract32(insn, 12, 5);
+    bool is_q = extract32(insn, 30, 1);
+    bool is_u = extract32(insn, 29, 1);
+    bool is_fp = false;
+    bool is_min = false;
+    int esize;
+    int elements;
+    int i;
+    TCGv_i64 tcg_res, tcg_elt;
+
+    switch (opcode) {
+    case 0x1b: /* ADDV */
+        if (is_u) {
+            unallocated_encoding(s);
+            return;
+        }
+        /* fall through */
+    case 0x3: /* SADDLV, UADDLV */
+    case 0xa: /* SMAXV, UMAXV */
+    case 0x1a: /* SMINV, UMINV */
+        if (size == 3 || (size == 2 && !is_q)) {
+            unallocated_encoding(s);
+            return;
+        }
+        break;
+    case 0xc: /* FMAXNMV, FMINNMV */
+    case 0xf: /* FMAXV, FMINV */
+        if (!is_u || !is_q || extract32(size, 0, 1)) {
+            unallocated_encoding(s);
+            return;
+        }
+        /* Bit 1 of size field encodes min vs max, and actual size is always
+         * 32 bits: adjust the size variable so following code can rely on it
+         */
+        is_min = extract32(size, 1, 1);
+        is_fp = true;
+        size = 2;
+        break;
+    default:
+        unallocated_encoding(s);
+        return;
+    }
+
+    esize = 8 << size;
+    elements = (is_q ? 128 : 64) / esize;
+
+    tcg_res = tcg_temp_new_i64();
+    tcg_elt = tcg_temp_new_i64();
+
+    /* These instructions operate across all lanes of a vector
+     * to produce a single result. We can guarantee that a 64
+     * bit intermediate is sufficient:
+     *  + for [US]ADDLV the maximum element size is 32 bits, and
+     *    the result type is 64 bits
+     *  + for FMAX*V, FMIN*V, ADDV the intermediate type is the
+     *    same as the element size, which is 32 bits at most
+     * For the integer operations we can choose to work at 64
+     * or 32 bits and truncate at the end; for simplicity
+     * we use 64 bits always. The floating point
+     * ops do require 32 bit intermediates, though.
+     */
+    if (!is_fp) {
+        read_vec_element(s, tcg_res, rn, 0, size | (is_u ? 0 : MO_SIGN));
+
+        for (i = 1; i < elements; i++) {
+            read_vec_element(s, tcg_elt, rn, i, size | (is_u ? 0 : MO_SIGN));
+
+            switch (opcode) {
+            case 0x03: /* SADDLV / UADDLV */
+            case 0x1b: /* ADDV */
+                tcg_gen_add_i64(tcg_res, tcg_res, tcg_elt);
+                break;
+            case 0x0a: /* SMAXV / UMAXV */
+                tcg_gen_movcond_i64(is_u ? TCG_COND_GEU : TCG_COND_GE,
+                                    tcg_res,
+                                    tcg_res, tcg_elt, tcg_res, tcg_elt);
+                break;
+            case 0x1a: /* SMINV / UMINV */
+                tcg_gen_movcond_i64(is_u ? TCG_COND_LEU : TCG_COND_LE,
+                                    tcg_res,
+                                    tcg_res, tcg_elt, tcg_res, tcg_elt);
+                break;
+            default:
+                /* decode above only lets the four integer opcodes through */
+                g_assert_not_reached();
+            }
+
+        }
+    } else {
+        /* Floating point ops which work on 32 bit (single) intermediates.
+         * Note that correct NaN propagation requires that we do these
+         * operations in exactly the order specified by the pseudocode.
+         */
+        TCGv_i32 tcg_elt1 = tcg_temp_new_i32();
+        TCGv_i32 tcg_elt2 = tcg_temp_new_i32();
+        TCGv_i32 tcg_elt3 = tcg_temp_new_i32();
+        TCGv_ptr fpst = get_fpstatus_ptr();
+
+        assert(esize == 32);
+        assert(elements == 4);
+
+        read_vec_element(s, tcg_elt, rn, 0, MO_32);
+        tcg_gen_trunc_i64_i32(tcg_elt1, tcg_elt);
+        read_vec_element(s, tcg_elt, rn, 1, MO_32);
+        tcg_gen_trunc_i64_i32(tcg_elt2, tcg_elt);
+
+        do_minmaxop(s, tcg_elt1, tcg_elt2, opcode, is_min, fpst);
+
+        read_vec_element(s, tcg_elt, rn, 2, MO_32);
+        tcg_gen_trunc_i64_i32(tcg_elt2, tcg_elt);
+        read_vec_element(s, tcg_elt, rn, 3, MO_32);
+        tcg_gen_trunc_i64_i32(tcg_elt3, tcg_elt);
+
+        do_minmaxop(s, tcg_elt2, tcg_elt3, opcode, is_min, fpst);
+
+        do_minmaxop(s, tcg_elt1, tcg_elt2, opcode, is_min, fpst);
+
+        tcg_gen_extu_i32_i64(tcg_res, tcg_elt1);
+        tcg_temp_free_i32(tcg_elt1);
+        tcg_temp_free_i32(tcg_elt2);
+        tcg_temp_free_i32(tcg_elt3);
+        tcg_temp_free_ptr(fpst);
+    }
+
+    tcg_temp_free_i64(tcg_elt);
+
+    /* Now truncate the result to the width required for the final output */
+    if (opcode == 0x03) {
+        /* SADDLV, UADDLV: result is 2*esize */
+        size++;
+    }
+
+    switch (size) {
+    case 0:
+        tcg_gen_ext8u_i64(tcg_res, tcg_res);
+        break;
+    case 1:
+        tcg_gen_ext16u_i64(tcg_res, tcg_res);
+        break;
+    case 2:
+        tcg_gen_ext32u_i64(tcg_res, tcg_res);
+        break;
+    case 3:
+        break;
+    default:
+        g_assert_not_reached();
+    }
+
+    write_fp_dreg(s, rd, tcg_res);
+    tcg_temp_free_i64(tcg_res);
+}
+
+/* C6.3.31 DUP (Element, Vector)
+ *
+ *  31  30   29              21 20    16 15        10  9    5 4    0
+ * +---+---+-------------------+--------+-------------+------+------+
+ * | 0 | Q | 0 0 1 1 1 0 0 0 0 |  imm5  | 0 0 0 0 0 1 |  Rn  |  Rd  |
+ * +---+---+-------------------+--------+-------------+------+------+
+ *
+ * size: encoded in imm5 (see ARM ARM LowestSetBit())
+ *
+ * Reads element (imm5 >> (size + 1)) of register rn and writes it to
+ * every element of register rd; when is_q is clear clear_vec_high()
+ * is called on rd afterwards.
+ */
+static void handle_simd_dupe(DisasContext *s, int is_q, int rd, int rn,
+                             int imm5)
+{
+    int size = ctz32(imm5);
+    int esize, elements, index, i;
+    TCGv_i64 tmp;
+
+    /* Reject bad sizes before deriving anything from "size": an imm5
+     * with no bit set yields an out-of-range size, and "8 << size"
+     * would then be an undefined shift (and a possible divide by zero
+     * when computing the element count).
+     */
+    if (size > 3 || (size == 3 && !is_q)) {
+        unallocated_encoding(s);
+        return;
+    }
+
+    esize = 8 << size;
+    elements = (is_q ? 128 : 64) / esize;
+    index = imm5 >> (size + 1);
+
+    tmp = tcg_temp_new_i64();
+    read_vec_element(s, tmp, rn, index, size);
+
+    for (i = 0; i < elements; i++) {
+        write_vec_element(s, tmp, rd, i, size);
+    }
+
+    if (!is_q) {
+        clear_vec_high(s, rd);
+    }
+
+    tcg_temp_free_i64(tmp);
+}
+
+/* C6.3.31 DUP (element, scalar)
+ *  31                   21 20    16 15        10  9    5 4    0
+ * +-----------------------+--------+-------------+------+------+
+ * | 0 1 0 1 1 1 1 0 0 0 0 |  imm5  | 0 0 0 0 0 1 |  Rn  |  Rd  |
+ * +-----------------------+--------+-------------+------+------+
+ *
+ * The element size is the position of the lowest set bit of imm5 and
+ * the element index is taken from the imm5 bits above it.
+ */
+static void handle_simd_dupes(DisasContext *s, int rd, int rn,
+                              int imm5)
+{
+    const int size = ctz32(imm5);
+    TCGv_i64 tcg_elt;
+
+    if (size <= 3) {
+        /* Pull the selected element out of rn and store it via
+         * write_fp_dreg(), i.e. zero-extended into the bottom of the
+         * destination register.
+         */
+        tcg_elt = tcg_temp_new_i64();
+        read_vec_element(s, tcg_elt, rn, imm5 >> (size + 1), size);
+        write_fp_dreg(s, rd, tcg_elt);
+        tcg_temp_free_i64(tcg_elt);
+    } else {
+        unallocated_encoding(s);
+    }
+}
+
+/* C6.3.32 DUP (General)
+ *
+ *  31  30   29              21 20    16 15        10  9    5 4    0
+ * +---+---+-------------------+--------+-------------+------+------+
+ * | 0 | Q | 0 0 1 1 1 0 0 0 0 |  imm5  | 0 0 0 0 1 1 |  Rn  |  Rd  |
+ * +---+---+-------------------+--------+-------------+------+------+
+ *
+ * size: encoded in imm5 (see ARM ARM LowestSetBit())
+ *
+ * Writes the general register rn (as returned by cpu_reg()) to every
+ * element of vector register rd; when is_q is clear clear_vec_high()
+ * is called on rd afterwards.
+ */
+static void handle_simd_dupg(DisasContext *s, int is_q, int rd, int rn,
+                             int imm5)
+{
+    int size = ctz32(imm5);
+    int esize, elements, i;
+
+    /* Validate first: with no bit set in imm5 the size is out of range
+     * and "8 << size" would be an undefined shift.
+     */
+    if (size > 3 || ((size == 3) && !is_q)) {
+        unallocated_encoding(s);
+        return;
+    }
+
+    esize = 8 << size;
+    elements = (is_q ? 128 : 64) / esize;
+
+    for (i = 0; i < elements; i++) {
+        write_vec_element(s, cpu_reg(s, rn), rd, i, size);
+    }
+    if (!is_q) {
+        clear_vec_high(s, rd);
+    }
+}
+
+/* C6.3.150 INS (Element)
+ *
+ *  31                   21 20    16 15  14    11  10 9    5 4    0
+ * +-----------------------+--------+------------+---+------+------+
+ * | 0 1 1 0 1 1 1 0 0 0 0 |  imm5  | 0 |  imm4  | 1 |  Rn  |  Rd  |
+ * +-----------------------+--------+------------+---+------+------+
+ *
+ * size: encoded in imm5 (see ARM ARM LowestSetBit())
+ * destination index: imm5 bits from size+1 upwards
+ * source index: imm4 bits from size upwards
+ */
+static void handle_simd_inse(DisasContext *s, int rd, int rn,
+                             int imm4, int imm5)
+{
+    const int size = ctz32(imm5);
+    int from, to;
+    TCGv_i64 tcg_elt;
+
+    if (size > 3) {
+        unallocated_encoding(s);
+        return;
+    }
+
+    to = extract32(imm5, 1+size, 5);
+    from = extract32(imm4, size, 4);
+
+    /* Copy one element of rn into one element of rd via a temporary */
+    tcg_elt = tcg_temp_new_i64();
+    read_vec_element(s, tcg_elt, rn, from, size);
+    write_vec_element(s, tcg_elt, rd, to, size);
+    tcg_temp_free_i64(tcg_elt);
+}
+
+
+/* C6.3.151 INS (General)
+ *
+ *  31                   21 20    16 15        10  9    5 4    0
+ * +-----------------------+--------+-------------+------+------+
+ * | 0 1 0 0 1 1 1 0 0 0 0 |  imm5  | 0 0 0 1 1 1 |  Rn  |  Rd  |
+ * +-----------------------+--------+-------------+------+------+
+ *
+ * size: encoded in imm5 (see ARM ARM LowestSetBit())
+ * index: encoded in imm5<4:size+1>
+ *
+ * Stores general register rn into the selected element of rd.
+ */
+static void handle_simd_insg(DisasContext *s, int rd, int rn, int imm5)
+{
+    const int size = ctz32(imm5);
+
+    if (size <= 3) {
+        int element = extract32(imm5, 1 + size, 4 - size);
+
+        write_vec_element(s, cpu_reg(s, rn), rd, element, size);
+    } else {
+        unallocated_encoding(s);
+    }
+}
+
+/*
+ * C6.3.321 UMOV (General)
+ * C6.3.237 SMOV (General)
+ *
+ *  31  30   29              21 20    16 15    12   10 9    5 4    0
+ * +---+---+-------------------+--------+-------------+------+------+
+ * | 0 | Q | 0 0 1 1 1 0 0 0 0 |  imm5  | 0 0 1 U 1 1 |  Rn  |  Rd  |
+ * +---+---+-------------------+--------+-------------+------+------+
+ *
+ * U: unsigned when set
+ * size: encoded in imm5 (see ARM ARM LowestSetBit())
+ *
+ * Moves one vector element of rn into general register rd, sign
+ * extending it when is_signed is set.
+ */
+static void handle_simd_umov_smov(DisasContext *s, int is_q, int is_signed,
+                                  int rn, int rd, int imm5)
+{
+    const int size = ctz32(imm5);
+    bool unallocated;
+    TCGv_i64 tcg_dest;
+    int idx;
+
+    /* Work out whether this size/Q combination is allocated */
+    if (is_signed) {
+        unallocated = size > 2 || (size == 2 && !is_q);
+    } else {
+        /* 64-bit elements need Q set, smaller ones need it clear */
+        unallocated = size > 3 || (size == 3 ? !is_q : is_q != 0);
+    }
+    if (unallocated) {
+        unallocated_encoding(s);
+        return;
+    }
+
+    idx = extract32(imm5, 1+size, 4);
+    tcg_dest = cpu_reg(s, rd);
+
+    if (is_signed) {
+        read_vec_element(s, tcg_dest, rn, idx, size | MO_SIGN);
+        if (!is_q) {
+            /* signed move with Q clear: keep only the low 32 bits */
+            tcg_gen_ext32u_i64(tcg_dest, tcg_dest);
+        }
+    } else {
+        read_vec_element(s, tcg_dest, rn, idx, size | 0);
+    }
+}
+
+/* C3.6.5 AdvSIMD copy
+ *   31  30  29  28             21 20  16 15  14  11 10  9    5 4    0
+ * +---+---+----+-----------------+------+---+------+---+------+------+
+ * | 0 | Q | op | 0 1 1 1 0 0 0 0 | imm5 | 0 | imm4 | 1 |  Rn  |  Rd  |
+ * +---+---+----+-----------------+------+---+------+---+------+------+
+ *
+ * Decodes op/imm4/Q and dispatches to the DUP, INS, UMOV and SMOV
+ * handlers; every other combination is an unallocated encoding.
+ */
+static void disas_simd_copy(DisasContext *s, uint32_t insn)
+{
+    int is_q = extract32(insn, 30, 1);
+    int op = extract32(insn, 29, 1);
+    int imm5 = extract32(insn, 16, 5);
+    int imm4 = extract32(insn, 11, 4);
+    int rn = extract32(insn, 5, 5);
+    int rd = extract32(insn, 0, 5);
+
+    if (op) {
+        /* Only INS (element) lives here, and it requires Q set */
+        if (!is_q) {
+            unallocated_encoding(s);
+            return;
+        }
+        handle_simd_inse(s, rd, rn, imm4, imm5);
+        return;
+    }
+
+    if (imm4 == 0) {
+        /* DUP (element - vector) */
+        handle_simd_dupe(s, is_q, rd, rn, imm5);
+    } else if (imm4 == 1) {
+        /* DUP (general) */
+        handle_simd_dupg(s, is_q, rd, rn, imm5);
+    } else if (imm4 == 3 && is_q) {
+        /* INS (general) */
+        handle_simd_insg(s, rd, rn, imm5);
+    } else if (imm4 == 5 || imm4 == 7) {
+        /* UMOV/SMOV (is_q indicates 32/64; imm4 indicates signedness) */
+        handle_simd_umov_smov(s, is_q, (imm4 == 5), rn, rd, imm5);
+    } else {
+        /* includes imm4 == 3 with Q clear */
+        unallocated_encoding(s);
+    }
+}
+
+/* C3.6.6 AdvSIMD modified immediate
+ *  31  30   29  28                 19 18 16 15   12  11  10  9     5 4    0
+ * +---+---+----+---------------------+-----+-------+----+---+-------+------+
+ * | 0 | Q | op | 0 1 1 1 1 0 0 0 0 0 | abc | cmode | o2 | 1 | defgh |  Rd  |
+ * +---+---+----+---------------------+-----+-------+----+---+-------+------+
+ *
+ * There are a number of operations that can be carried out here:
+ *   MOVI - move (shifted) imm into register
+ *   MVNI - move inverted (shifted) imm into register
+ *   ORR  - bitwise OR of (shifted) imm with register
+ *   BIC  - bitwise clear of (shifted) imm with register
+ *
+ * The 8-bit immediate abc:defgh is expanded to a 64-bit constant
+ * according to cmode and op, and that constant is then stored to,
+ * ORed into or ANDed with each 64-bit half of Rd. For non-quad
+ * operations the high half is written as zero.
+ */
+static void disas_simd_mod_imm(DisasContext *s, uint32_t insn)
+{
+    int rd = extract32(insn, 0, 5);
+    int cmode = extract32(insn, 12, 4);
+    int cmode_3_1 = extract32(cmode, 1, 3);
+    int cmode_0 = extract32(cmode, 0, 1);
+    int o2 = extract32(insn, 11, 1);
+    uint64_t abcdefgh = extract32(insn, 5, 5) | (extract32(insn, 16, 3) << 5);
+    bool is_neg = extract32(insn, 29, 1);
+    bool is_q = extract32(insn, 30, 1);
+    uint64_t imm = 0;
+    TCGv_i64 tcg_rd, tcg_imm;
+    int i;
+
+    if (o2 != 0 || ((cmode == 0xf) && is_neg && !is_q)) {
+        unallocated_encoding(s);
+        return;
+    }
+
+    /* See AdvSIMDExpandImm() in ARM ARM */
+    switch (cmode_3_1) {
+    case 0: /* Replicate(Zeros(24):imm8, 2) */
+    case 1: /* Replicate(Zeros(16):imm8:Zeros(8), 2) */
+    case 2: /* Replicate(Zeros(8):imm8:Zeros(16), 2) */
+    case 3: /* Replicate(imm8:Zeros(24), 2) */
+    {
+        int shift = cmode_3_1 * 8;
+        imm = bitfield_replicate(abcdefgh << shift, 32);
+        break;
+    }
+    case 4: /* Replicate(Zeros(8):imm8, 4) */
+    case 5: /* Replicate(imm8:Zeros(8), 4) */
+    {
+        int shift = (cmode_3_1 & 0x1) * 8;
+        imm = bitfield_replicate(abcdefgh << shift, 16);
+        break;
+    }
+    case 6:
+        if (cmode_0) {
+            /* Replicate(Zeros(8):imm8:Ones(16), 2) */
+            imm = (abcdefgh << 16) | 0xffff;
+        } else {
+            /* Replicate(Zeros(16):imm8:Ones(8), 2) */
+            imm = (abcdefgh << 8) | 0xff;
+        }
+        imm = bitfield_replicate(imm, 32);
+        break;
+    case 7:
+        if (!cmode_0 && !is_neg) {
+            /* imm8 replicated into every byte */
+            imm = bitfield_replicate(abcdefgh, 8);
+        } else if (!cmode_0 && is_neg) {
+            /* each set bit i of imm8 becomes an all-ones byte i
+             * (imm is still zero here, so no reset is needed)
+             */
+            for (i = 0; i < 8; i++) {
+                if ((abcdefgh) & (1 << i)) {
+                    imm |= 0xffULL << (i * 8);
+                }
+            }
+        } else if (cmode_0) {
+            /* NOTE(review): these two arms look like the floating point
+             * immediate expansion (64-bit then 32-bit form) - confirm
+             * against the ARM ARM pseudocode.
+             */
+            if (is_neg) {
+                imm = (abcdefgh & 0x3f) << 48;
+                if (abcdefgh & 0x80) {
+                    imm |= 0x8000000000000000ULL;
+                }
+                if (abcdefgh & 0x40) {
+                    imm |= 0x3fc0000000000000ULL;
+                } else {
+                    imm |= 0x4000000000000000ULL;
+                }
+            } else {
+                imm = (abcdefgh & 0x3f) << 19;
+                if (abcdefgh & 0x80) {
+                    imm |= 0x80000000;
+                }
+                if (abcdefgh & 0x40) {
+                    imm |= 0x3e000000;
+                } else {
+                    imm |= 0x40000000;
+                }
+                imm |= (imm << 32);
+            }
+        }
+        break;
+    }
+
+    /* For every cmode<3:1> other than 7, op set means "inverted" */
+    if (cmode_3_1 != 7 && is_neg) {
+        imm = ~imm;
+    }
+
+    tcg_imm = tcg_const_i64(imm);
+    tcg_rd = new_tmp_a64(s);
+
+    /* i == 0 handles the low 64 bits of Rd, i == 1 the high 64 bits */
+    for (i = 0; i < 2; i++) {
+        int foffs = i ? fp_reg_hi_offset(rd) : fp_reg_offset(rd, MO_64);
+
+        if (i == 1 && !is_q) {
+            /* non-quad ops clear high half of vector */
+            tcg_gen_movi_i64(tcg_rd, 0);
+        } else if ((cmode & 0x9) == 0x1 || (cmode & 0xd) == 0x9) {
+            /* read-modify-write forms: combine with the current value */
+            tcg_gen_ld_i64(tcg_rd, cpu_env, foffs);
+            if (is_neg) {
+                /* AND (BIC): imm was already inverted above */
+                tcg_gen_and_i64(tcg_rd, tcg_rd, tcg_imm);
+            } else {
+                /* ORR */
+                tcg_gen_or_i64(tcg_rd, tcg_rd, tcg_imm);
+            }
+        } else {
+            /* MOVI */
+            tcg_gen_mov_i64(tcg_rd, tcg_imm);
+        }
+        tcg_gen_st_i64(tcg_rd, cpu_env, foffs);
+    }
+
+    tcg_temp_free_i64(tcg_imm);
+}
+
+/* C3.6.7 AdvSIMD scalar copy
+ *  31 30  29  28             21 20  16 15  14  11 10  9    5 4    0
+ * +-----+----+-----------------+------+---+------+---+------+------+
+ * | 0 1 | op | 1 1 1 1 0 0 0 0 | imm5 | 0 | imm4 | 1 |  Rn  |  Rd  |
+ * +-----+----+-----------------+------+---+------+---+------+------+
+ *
+ * Only op == 0 with imm4 == 0 is accepted; anything else is treated
+ * as an unallocated encoding.
+ */
+static void disas_simd_scalar_copy(DisasContext *s, uint32_t insn)
+{
+    int op = extract32(insn, 29, 1);
+    int imm4 = extract32(insn, 11, 4);
+
+    if (op == 0 && imm4 == 0) {
+        int imm5 = extract32(insn, 16, 5);
+        int rn = extract32(insn, 5, 5);
+        int rd = extract32(insn, 0, 5);
+
+        /* DUP (element, scalar) */
+        handle_simd_dupes(s, rd, rn, imm5);
+    } else {
+        unallocated_encoding(s);
+    }
+}
+
+/* C3.6.8 AdvSIMD scalar pairwise
+ *  31 30  29 28       24 23  22 21       17 16    12 11 10 9    5 4    0
+ * +-----+---+-----------+------+-----------+--------+-----+------+------+
+ * | 0 1 | U | 1 1 1 1 0 | size | 1 1 0 0 0 | opcode | 1 0 |  Rn  |  Rd  |
+ * +-----+---+-----------+------+-----------+--------+-----+------+------+
+ *
+ * Placeholder: no instruction in this group is decoded yet, so every
+ * encoding is passed to unsupported_encoding().
+ */
+static void disas_simd_scalar_pairwise(DisasContext *s, uint32_t insn)
+{
+    unsupported_encoding(s, insn);
+}
+
+/*
+ * Common SSHR[RA]/USHR[RA] - Shift right (optional rounding/accumulate)
+ *
+ * This code handles the common shifting logic and is used by both
+ * the vector and scalar code.
+ *
+ * tcg_res:    destination; also the accumulator input when accumulate
+ *             is set
+ * tcg_src:    value to shift; it is modified in place by this function
+ * tcg_rnd:    rounding constant to add before shifting, or an unused
+ *             TCGv (TCGV_IS_UNUSED_I64) when no rounding is wanted
+ * accumulate: add the shifted value to tcg_res instead of replacing it
+ * is_u:       use a logical rather than an arithmetic shift
+ * size:       element size; size == 3 together with rounding selects
+ *             the path that keeps a second 64-bit word for the carry
+ * shift:      shift amount; a value of 64 is handled explicitly
+ */
+static void handle_shri_with_rndacc(TCGv_i64 tcg_res, TCGv_i64 tcg_src,
+                                    TCGv_i64 tcg_rnd, bool accumulate,
+                                    bool is_u, int size, int shift)
+{
+    bool extended_result = false;
+    bool round = !TCGV_IS_UNUSED_I64(tcg_rnd);
+    int ext_lshift = 0;
+    TCGv_i64 tcg_src_hi;
+
+    if (round && size == 3) {
+        extended_result = true;
+        ext_lshift = 64 - shift;
+        tcg_src_hi = tcg_temp_new_i64();
+    } else if (shift == 64) {
+        if (!accumulate && is_u) {
+            /* result is zero */
+            tcg_gen_movi_i64(tcg_res, 0);
+            return;
+        }
+    }
+
+    /* Deal with the rounding step: add tcg_rnd to the source, using a
+     * double-word add (high word in tcg_src_hi) in the extended case
+     */
+    if (round) {
+        if (extended_result) {
+            TCGv_i64 tcg_zero = tcg_const_i64(0);
+            if (!is_u) {
+                /* take care of sign extending tcg_res */
+                tcg_gen_sari_i64(tcg_src_hi, tcg_src, 63);
+                tcg_gen_add2_i64(tcg_src, tcg_src_hi,
+                                 tcg_src, tcg_src_hi,
+                                 tcg_rnd, tcg_zero);
+            } else {
+                tcg_gen_add2_i64(tcg_src, tcg_src_hi,
+                                 tcg_src, tcg_zero,
+                                 tcg_rnd, tcg_zero);
+            }
+            tcg_temp_free_i64(tcg_zero);
+        } else {
+            tcg_gen_add_i64(tcg_src, tcg_src, tcg_rnd);
+        }
+    }
+
+    /* Now do the shift right */
+    if (round && extended_result) {
+        /* extended case, >64 bit precision required */
+        if (ext_lshift == 0) {
+            /* special case, only high bits matter */
+            tcg_gen_mov_i64(tcg_src, tcg_src_hi);
+        } else {
+            tcg_gen_shri_i64(tcg_src, tcg_src, shift);
+            tcg_gen_shli_i64(tcg_src_hi, tcg_src_hi, ext_lshift);
+            tcg_gen_or_i64(tcg_src, tcg_src, tcg_src_hi);
+        }
+    } else {
+        if (is_u) {
+            if (shift == 64) {
+                /* essentially shifting in 64 zeros */
+                tcg_gen_movi_i64(tcg_src, 0);
+            } else {
+                tcg_gen_shri_i64(tcg_src, tcg_src, shift);
+            }
+        } else {
+            if (shift == 64) {
+                /* effectively extending the sign-bit */
+                tcg_gen_sari_i64(tcg_src, tcg_src, 63);
+            } else {
+                tcg_gen_sari_i64(tcg_src, tcg_src, shift);
+            }
+        }
+    }
+
+    if (accumulate) {
+        tcg_gen_add_i64(tcg_res, tcg_res, tcg_src);
+    } else {
+        tcg_gen_mov_i64(tcg_res, tcg_src);
+    }
+
+    if (extended_result) {
+        tcg_temp_free_i64(tcg_src_hi);
+    }
+}
+
+/* Common SHL/SLI - Shift left with an optional insert
+ *
+ * Without insert the shifted source simply replaces tcg_res; with
+ * insert the source is deposited into tcg_res at bit position "shift",
+ * leaving the bits of tcg_res below that position untouched.
+ */
+static void handle_shli_with_ins(TCGv_i64 tcg_res, TCGv_i64 tcg_src,
+                                 bool insert, int shift)
+{
+    if (!insert) {
+        /* SHL */
+        tcg_gen_shli_i64(tcg_res, tcg_src, shift);
+        return;
+    }
+
+    /* SLI */
+    tcg_gen_deposit_i64(tcg_res, tcg_res, tcg_src, shift, 64 - shift);
+}
+
+/* SSHR[RA]/USHR[RA] - Scalar shift right (optional rounding/accumulate)
+ *
+ * The scalar forms always work on a 64-bit element, so bit 3 of immh
+ * must be set; the shift amount is 128 - immh:immb.
+ */
+static void handle_scalar_simd_shri(DisasContext *s,
+                                    bool is_u, int immh, int immb,
+                                    int opcode, int rn, int rd)
+{
+    const int size = 3;
+    const int shift = 2 * (8 << size) - (immh << 3 | immb);
+    /* 0x02: SSRA / USRA, 0x04: SRSHR / URSHR, 0x06: SRSRA / URSRA */
+    const bool accumulate = (opcode == 0x02 || opcode == 0x06);
+    const bool round = (opcode == 0x04 || opcode == 0x06);
+    TCGv_i64 tcg_src, tcg_dst, tcg_rnd;
+
+    if (!extract32(immh, 3, 1)) {
+        unallocated_encoding(s);
+        return;
+    }
+
+    if (!round) {
+        TCGV_UNUSED_I64(tcg_rnd);
+    } else {
+        /* rounding adds half of the weight of the last bit shifted out */
+        tcg_rnd = tcg_const_i64(1ULL << (shift - 1));
+    }
+
+    tcg_src = read_fp_dreg(s, rn);
+    if (accumulate) {
+        tcg_dst = read_fp_dreg(s, rd);
+    } else {
+        tcg_dst = tcg_temp_new_i64();
+    }
+
+    handle_shri_with_rndacc(tcg_dst, tcg_src, tcg_rnd,
+                            accumulate, is_u, size, shift);
+
+    write_fp_dreg(s, rd, tcg_dst);
+
+    tcg_temp_free_i64(tcg_src);
+    tcg_temp_free_i64(tcg_dst);
+    if (round) {
+        tcg_temp_free_i64(tcg_rnd);
+    }
+}
+
+/* SHL/SLI - Scalar shift left
+ *
+ * insert selects SLI (shifted value is inserted into the existing
+ * contents of rd) rather than SHL. Bit 3 of immh must be set,
+ * otherwise the encoding is unallocated. opcode is not used here.
+ */
+static void handle_scalar_simd_shli(DisasContext *s, bool insert,
+                                    int immh, int immb, int opcode,
+                                    int rn, int rd)
+{
+    int size, immhb, shift;
+    TCGv_i64 tcg_rn;
+    TCGv_i64 tcg_rd;
+
+    /* Validate before deriving size/shift: immh == 0 would give
+     * size == -1 and an undefined "8 << size".
+     */
+    if (!extract32(immh, 3, 1)) {
+        unallocated_encoding(s);
+        return;
+    }
+
+    size = 32 - clz32(immh) - 1;
+    immhb = immh << 3 | immb;
+    shift = immhb - (8 << size);
+
+    /* read_fp_dreg()/tcg_temp_new_i64() hand back fresh temporaries
+     * that are freed below, so no new_tmp_a64() scratch is needed.
+     */
+    tcg_rn = read_fp_dreg(s, rn);
+    tcg_rd = insert ? read_fp_dreg(s, rd) : tcg_temp_new_i64();
+
+    handle_shli_with_ins(tcg_rd, tcg_rn, insert, shift);
+
+    write_fp_dreg(s, rd, tcg_rd);
+
+    tcg_temp_free_i64(tcg_rn);
+    tcg_temp_free_i64(tcg_rd);
+}
+
+/* C3.6.9 AdvSIMD scalar shift by immediate
+ *  31 30  29 28         23 22  19 18  16 15    11  10 9    5 4    0
+ * +-----+---+-------------+------+------+--------+---+------+------+
+ * | 0 1 | U | 1 1 1 1 1 0 | immh | immb | opcode | 1 |  Rn  |  Rd  |
+ * +-----+---+-------------+------+------+--------+---+------+------+
+ *
+ * This is the scalar version so it works on fixed size registers.
+ * immh == 0 is never a valid shift encoding and is rejected up front;
+ * opcodes without a handler yet are reported as unsupported.
+ */
+static void disas_simd_scalar_shift_imm(DisasContext *s, uint32_t insn)
+{
+    int rd = extract32(insn, 0, 5);
+    int rn = extract32(insn, 5, 5);
+    int opcode = extract32(insn, 11, 5);
+    int immb = extract32(insn, 16, 3);
+    int immh = extract32(insn, 19, 4);
+    bool is_u = extract32(insn, 29, 1);
+
+    if (immh == 0) {
+        /* No element size is encoded at all: unallocated, not merely
+         * unimplemented.
+         */
+        unallocated_encoding(s);
+        return;
+    }
+
+    switch (opcode) {
+    case 0x00: /* SSHR / USHR */
+    case 0x02: /* SSRA / USRA */
+    case 0x04: /* SRSHR / URSHR */
+    case 0x06: /* SRSRA / URSRA */
+        handle_scalar_simd_shri(s, is_u, immh, immb, opcode, rn, rd);
+        break;
+    case 0x0a: /* SHL / SLI */
+        /* U selects the inserting form (SLI) */
+        handle_scalar_simd_shli(s, is_u, immh, immb, opcode, rn, rd);
+        break;
+    default:
+        unsupported_encoding(s, insn);
+        break;
+    }
+}
+
+/* C3.6.10 AdvSIMD scalar three different
+ *  31 30  29 28       24 23  22  21 20  16 15    12 11 10 9    5 4    0
+ * +-----+---+-----------+------+---+------+--------+-----+------+------+
+ * | 0 1 | U | 1 1 1 1 0 | size | 1 |  Rm  | opcode | 0 0 |  Rn  |  Rd  |
+ * +-----+---+-----------+------+---+------+--------+-----+------+------+
+ *
+ * Placeholder: no instruction in this group is decoded yet, so every
+ * encoding is passed to unsupported_encoding().
+ */
+static void disas_simd_scalar_three_reg_diff(DisasContext *s, uint32_t insn)
+{
+    unsupported_encoding(s, insn);
+}
+
+static void handle_3same_64(DisasContext *s, int opcode, bool u,
+                            TCGv_i64 tcg_rd, TCGv_i64 tcg_rn, TCGv_i64 tcg_rm)
+{
+    /* 64x64->64 operations common to the scalar and vector 3-same
+     * groups. The caller is expected to pass only opcodes for which
+     * size == 3 is valid; opcodes that are valid but not implemented
+     * yet (saturating and register-shift operations) hit the assertion.
+     *
+     * Integer comparisons produce all-ones for "true" and zero for
+     * "false": setcond yields 0/1 and the negation turns 1 into -1.
+     */
+    switch (opcode) {
+    case 0x6: /* CMGT, CMHI */
+        tcg_gen_setcond_i64(u ? TCG_COND_GTU : TCG_COND_GT,
+                            tcg_rd, tcg_rn, tcg_rm);
+        tcg_gen_neg_i64(tcg_rd, tcg_rd);
+        break;
+    case 0x7: /* CMGE, CMHS */
+        tcg_gen_setcond_i64(u ? TCG_COND_GEU : TCG_COND_GE,
+                            tcg_rd, tcg_rn, tcg_rm);
+        tcg_gen_neg_i64(tcg_rd, tcg_rd);
+        break;
+    case 0x11: /* CMTST, CMEQ */
+        if (u) {
+            /* CMEQ */
+            tcg_gen_setcond_i64(TCG_COND_EQ, tcg_rd, tcg_rn, tcg_rm);
+        } else {
+            /* CMTST : test is "if (X & Y != 0)". */
+            tcg_gen_and_i64(tcg_rd, tcg_rn, tcg_rm);
+            tcg_gen_setcondi_i64(TCG_COND_NE, tcg_rd, tcg_rd, 0);
+        }
+        tcg_gen_neg_i64(tcg_rd, tcg_rd);
+        break;
+    case 0x10: /* ADD, SUB */
+        if (!u) {
+            tcg_gen_add_i64(tcg_rd, tcg_rn, tcg_rm);
+        } else {
+            tcg_gen_sub_i64(tcg_rd, tcg_rn, tcg_rm);
+        }
+        break;
+    case 0x1: /* SQADD */
+    case 0x5: /* SQSUB */
+    case 0x8: /* SSHL, USHL */
+    case 0x9: /* SQSHL, UQSHL */
+    case 0xa: /* SRSHL, URSHL */
+    case 0xb: /* SQRSHL, UQRSHL */
+    default:
+        g_assert_not_reached();
+    }
+}
+
+/* Handle the 3-same-operands float operations; shared by the scalar
+ * and vector encodings. The caller must filter out any encodings
+ * not allocated for the encoding it is dealing with.
+ *
+ * size:     non-zero selects the double precision (64-bit element)
+ *           path, zero the single precision (32-bit element) path
+ * elements: number of elements to process, one per loop pass
+ * fpopcode: selects the helper to call; any value not listed in the
+ *           switch statements below trips an assertion
+ * rd/rn/rm: destination and the two source vector registers
+ *
+ * clear_vec_high() is called on rd when (elements << size) < 4.
+ */
+static void handle_3same_float(DisasContext *s, int size, int elements,
+                               int fpopcode, int rd, int rn, int rm)
+{
+    int pass;
+    TCGv_ptr fpst = get_fpstatus_ptr();
+
+    for (pass = 0; pass < elements; pass++) {
+        if (size) {
+            /* Double */
+            TCGv_i64 tcg_op1 = tcg_temp_new_i64();
+            TCGv_i64 tcg_op2 = tcg_temp_new_i64();
+            TCGv_i64 tcg_res = tcg_temp_new_i64();
+
+            read_vec_element(s, tcg_op1, rn, pass, MO_64);
+            read_vec_element(s, tcg_op2, rm, pass, MO_64);
+
+            switch (fpopcode) {
+            case 0x18: /* FMAXNM */
+                gen_helper_vfp_maxnumd(tcg_res, tcg_op1, tcg_op2, fpst);
+                break;
+            case 0x1a: /* FADD */
+                gen_helper_vfp_addd(tcg_res, tcg_op1, tcg_op2, fpst);
+                break;
+            case 0x1e: /* FMAX */
+                gen_helper_vfp_maxd(tcg_res, tcg_op1, tcg_op2, fpst);
+                break;
+            case 0x38: /* FMINNM */
+                gen_helper_vfp_minnumd(tcg_res, tcg_op1, tcg_op2, fpst);
+                break;
+            case 0x3a: /* FSUB */
+                gen_helper_vfp_subd(tcg_res, tcg_op1, tcg_op2, fpst);
+                break;
+            case 0x3e: /* FMIN */
+                gen_helper_vfp_mind(tcg_res, tcg_op1, tcg_op2, fpst);
+                break;
+            case 0x5b: /* FMUL */
+                gen_helper_vfp_muld(tcg_res, tcg_op1, tcg_op2, fpst);
+                break;
+            case 0x5f: /* FDIV */
+                gen_helper_vfp_divd(tcg_res, tcg_op1, tcg_op2, fpst);
+                break;
+            case 0x7a: /* FABD: subtract, then take the absolute value */
+                gen_helper_vfp_subd(tcg_res, tcg_op1, tcg_op2, fpst);
+                gen_helper_vfp_absd(tcg_res, tcg_res);
+                break;
+            default:
+                g_assert_not_reached();
+            }
+
+            write_vec_element(s, tcg_res, rd, pass, MO_64);
+
+            tcg_temp_free_i64(tcg_res);
+            tcg_temp_free_i64(tcg_op1);
+            tcg_temp_free_i64(tcg_op2);
+        } else {
+            /* Single */
+            TCGv_i32 tcg_op1 = tcg_temp_new_i32();
+            TCGv_i32 tcg_op2 = tcg_temp_new_i32();
+            TCGv_i32 tcg_res = tcg_temp_new_i32();
+
+            read_vec_element_i32(s, tcg_op1, rn, pass, MO_32);
+            read_vec_element_i32(s, tcg_op2, rm, pass, MO_32);
+
+            switch (fpopcode) {
+            case 0x1a: /* FADD */
+                gen_helper_vfp_adds(tcg_res, tcg_op1, tcg_op2, fpst);
+                break;
+            case 0x1e: /* FMAX */
+                gen_helper_vfp_maxs(tcg_res, tcg_op1, tcg_op2, fpst);
+                break;
+            case 0x18: /* FMAXNM */
+                gen_helper_vfp_maxnums(tcg_res, tcg_op1, tcg_op2, fpst);
+                break;
+            case 0x38: /* FMINNM */
+                gen_helper_vfp_minnums(tcg_res, tcg_op1, tcg_op2, fpst);
+                break;
+            case 0x3a: /* FSUB */
+                gen_helper_vfp_subs(tcg_res, tcg_op1, tcg_op2, fpst);
+                break;
+            case 0x3e: /* FMIN */
+                gen_helper_vfp_mins(tcg_res, tcg_op1, tcg_op2, fpst);
+                break;
+            case 0x5b: /* FMUL */
+                gen_helper_vfp_muls(tcg_res, tcg_op1, tcg_op2, fpst);
+                break;
+            case 0x5f: /* FDIV */
+                gen_helper_vfp_divs(tcg_res, tcg_op1, tcg_op2, fpst);
+                break;
+            case 0x7a: /* FABD: subtract, then take the absolute value */
+                gen_helper_vfp_subs(tcg_res, tcg_op1, tcg_op2, fpst);
+                gen_helper_vfp_abss(tcg_res, tcg_res);
+                break;
+            default:
+                g_assert_not_reached();
+            }
+
+            if (elements == 1) {
+                /* scalar single so clear high part: the result is zero
+                 * extended to 64 bits and stored as a 64-bit element
+                 */
+                TCGv_i64 tcg_tmp = tcg_temp_new_i64();
+
+                tcg_gen_extu_i32_i64(tcg_tmp, tcg_res);
+                write_vec_element(s, tcg_tmp, rd, pass, MO_64);
+                tcg_temp_free_i64(tcg_tmp);
+            } else {
+                write_vec_element_i32(s, tcg_res, rd, pass, MO_32);
+            }
+
+            tcg_temp_free_i32(tcg_res);
+            tcg_temp_free_i32(tcg_op1);
+            tcg_temp_free_i32(tcg_op2);
+        }
+    }
+
+    tcg_temp_free_ptr(fpst);
+
+    if ((elements << size) < 4) {
+        /* scalar, or non-quad vector op */
+        clear_vec_high(s, rd);
+    }
+}
+
+/* C3.6.11 AdvSIMD scalar three same
+ *  31 30  29 28       24 23  22  21 20  16 15    11  10 9    5 4    0
+ * +-----+---+-----------+------+---+------+--------+---+------+------+
+ * | 0 1 | U | 1 1 1 1 0 | size | 1 |  Rm  | opcode | 1 |  Rn  |  Rd  |
+ * +-----+---+-----------+------+---+------+--------+---+------+------+
+ *
+ * Opcodes from 0x18 upwards are floating point (only FABD is handled
+ * so far); the remaining ones are integer operations, of which only
+ * the 64-bit compare and add/sub forms are handled.
+ */
+static void disas_simd_scalar_three_reg_same(DisasContext *s, uint32_t insn)
+{
+    int rd = extract32(insn, 0, 5);
+    int rn = extract32(insn, 5, 5);
+    int opcode = extract32(insn, 11, 5);
+    int rm = extract32(insn, 16, 5);
+    int size = extract32(insn, 22, 2);
+    bool u = extract32(insn, 29, 1);
+    TCGv_i64 tcg_op1, tcg_op2, tcg_res;
+
+    if (opcode >= 0x18) {
+        /* Floating point: U, size[1] and opcode indicate operation */
+        int fpopcode = opcode | (extract32(size, 1, 1) << 5) | (u << 6);
+
+        if (fpopcode == 0x7a) {
+            /* FABD: size[0] picks single or double precision */
+            handle_3same_float(s, extract32(size, 0, 1), 1, fpopcode,
+                               rd, rn, rm);
+            return;
+        }
+
+        switch (fpopcode) {
+        case 0x1b: /* FMULX */
+        case 0x1c: /* FCMEQ */
+        case 0x1f: /* FRECPS */
+        case 0x3f: /* FRSQRTS */
+        case 0x5c: /* FCMGE */
+        case 0x5d: /* FACGE */
+        case 0x7c: /* FCMGT */
+        case 0x7d: /* FACGT */
+            unsupported_encoding(s, insn);
+            break;
+        default:
+            unallocated_encoding(s);
+            break;
+        }
+        return;
+    }
+
+    switch (opcode) {
+    case 0x6: /* CMGT, CMHI */
+    case 0x7: /* CMGE, CMHS */
+    case 0x10: /* ADD, SUB (vector) */
+    case 0x11: /* CMTST, CMEQ */
+        if (size != 3) {
+            unallocated_encoding(s);
+            return;
+        }
+        break;
+    case 0x16: /* SQDMULH, SQRDMULH (vector) */
+        if (size == 1 || size == 2) {
+            unsupported_encoding(s, insn);
+        } else {
+            unallocated_encoding(s);
+        }
+        return;
+    case 0x1: /* SQADD, UQADD */
+    case 0x5: /* SQSUB, UQSUB */
+    case 0x8: /* SSHL, USHL */
+    case 0x9: /* SQSHL, UQSHL */
+    case 0xa: /* SRSHL, URSHL */
+    case 0xb: /* SQRSHL, UQRSHL */
+        unsupported_encoding(s, insn);
+        return;
+    default:
+        unallocated_encoding(s);
+        return;
+    }
+
+    /* Only the 64-bit-wide operations get this far for now; the
+     * size != 3 cases will be dealt with once the relevant ops are
+     * implemented.
+     */
+    tcg_op1 = read_fp_dreg(s, rn);
+    tcg_op2 = read_fp_dreg(s, rm);
+    tcg_res = tcg_temp_new_i64();
+
+    handle_3same_64(s, opcode, u, tcg_res, tcg_op1, tcg_op2);
+
+    write_fp_dreg(s, rd, tcg_res);
+
+    tcg_temp_free_i64(tcg_op1);
+    tcg_temp_free_i64(tcg_op2);
+    tcg_temp_free_i64(tcg_res);
+}
+
+/* C3.6.12 AdvSIMD scalar two reg misc
+ *  31 30  29 28       24 23  22 21       17 16    12 11 10 9    5 4    0
+ * +-----+---+-----------+------+-----------+--------+-----+------+------+
+ * | 0 1 | U | 1 1 1 1 0 | size | 1 0 0 0 0 | opcode | 1 0 |  Rn  |  Rd  |
+ * +-----+---+-----------+------+-----------+--------+-----+------+------+
+ *
+ * Placeholder: no instruction in this group is decoded yet, so every
+ * encoding is passed to unsupported_encoding().
+ */
+static void disas_simd_scalar_two_reg_misc(DisasContext *s, uint32_t insn)
+{
+    unsupported_encoding(s, insn);
+}
+
+/* C3.6.13 AdvSIMD scalar x indexed element
+ *  31 30  29 28       24 23  22 21  20  19  16 15 12  11  10 9    5 4    0
+ * +-----+---+-----------+------+---+---+------+-----+---+---+------+------+
+ * | 0 1 | U | 1 1 1 1 1 | size | L | M |  Rm  | opc | H | 0 |  Rn  |  Rd  |
+ * +-----+---+-----------+------+---+---+------+-----+---+---+------+------+
+ *
+ * Placeholder: no instruction in this group is decoded yet, so every
+ * encoding is passed to unsupported_encoding().
+ */
+static void disas_simd_scalar_indexed(DisasContext *s, uint32_t insn)
+{
+    unsupported_encoding(s, insn);
+}
+
+/* SSHR[RA]/USHR[RA] - Vector shift right (optional rounding/accumulate)
+ *
+ * The element size comes from the highest set bit of immh and the
+ * shift amount is (2 * esize) - immh:immb. Each element of rn is
+ * shifted by handle_shri_with_rndacc() and written to rd.
+ */
+static void handle_vec_simd_shri(DisasContext *s, bool is_q, bool is_u,
+                                 int immh, int immb, int opcode, int rn, int rd)
+{
+    const int size = 32 - clz32(immh) - 1;
+    const int shift = 2 * (8 << size) - (immh << 3 | immb);
+    const int elements = (is_q ? 128 : 64) / (8 << size);
+    /* 0x02: SSRA / USRA, 0x04: SRSHR / URSHR, 0x06: SRSRA / URSRA */
+    const bool accumulate = (opcode == 0x02 || opcode == 0x06);
+    const bool round = (opcode == 0x04 || opcode == 0x06);
+    /* signed variants read their operands sign extended */
+    const TCGMemOp memop = size | (is_u ? 0 : MO_SIGN);
+    TCGv_i64 tcg_src = new_tmp_a64(s);
+    TCGv_i64 tcg_dst = new_tmp_a64(s);
+    TCGv_i64 tcg_rnd;
+    int pass;
+
+    /* 64-bit elements are only available in the quad form */
+    if (!is_q && (extract32(immh, 3, 1) || size > 3)) {
+        unallocated_encoding(s);
+        return;
+    }
+
+    if (!round) {
+        TCGV_UNUSED_I64(tcg_rnd);
+    } else {
+        tcg_rnd = tcg_const_i64(1ULL << (shift - 1));
+    }
+
+    for (pass = 0; pass < elements; pass++) {
+        read_vec_element(s, tcg_src, rn, pass, memop);
+        if (accumulate) {
+            read_vec_element(s, tcg_dst, rd, pass, memop);
+        }
+
+        handle_shri_with_rndacc(tcg_dst, tcg_src, tcg_rnd,
+                                accumulate, is_u, size, shift);
+
+        write_vec_element(s, tcg_dst, rd, pass, size);
+    }
+
+    if (!is_q) {
+        clear_vec_high(s, rd);
+    }
+
+    if (round) {
+        tcg_temp_free_i64(tcg_rnd);
+    }
+}
+
+/* Vector shift left by immediate (SHL), or shift left and insert (SLI)
+ * when 'insert' is set.
+ *
+ * immh:immb encodes both the element size (position of the top set bit
+ * of immh) and the shift amount; is_q selects a 128-bit rather than a
+ * 64-bit vector. 'opcode' is accepted but not consulted here.
+ */
+static void handle_vec_simd_shli(DisasContext *s, bool is_q, bool insert,
+                                int immh, int immb, int opcode, int rn, int rd)
+{
+    int size = 31 - clz32(immh);
+    int esize = 8 << size;
+    int shift = ((immh << 3) | immb) - esize;
+    int elements = (is_q ? 128 : 64) / esize;
+    TCGv_i64 tcg_rn = new_tmp_a64(s);
+    TCGv_i64 tcg_rd = new_tmp_a64(s);
+    int elt;
+
+    /* 64-bit (or wider) elements do not exist in a 64-bit vector. */
+    if (!is_q && size >= 3) {
+        unallocated_encoding(s);
+        return;
+    }
+
+    for (elt = 0; elt < elements; elt++) {
+        read_vec_element(s, tcg_rn, rn, elt, size);
+        if (insert) {
+            /* The insert form also consumes the old destination element. */
+            read_vec_element(s, tcg_rd, rd, elt, size);
+        }
+
+        handle_shli_with_ins(tcg_rd, tcg_rn, insert, shift);
+
+        write_vec_element(s, tcg_rd, rd, elt, size);
+    }
+
+    if (!is_q) {
+        clear_vec_high(s, rd);
+    }
+}
+
+/* SSHLL/USHLL - shift left by immediate with widening: every source
+ * element of width 'size' produces a destination element of width
+ * 'size + 1'. With is_q set the source is the upper 64 bits of Rn,
+ * otherwise the lower 64 bits. 'opcode' is accepted but not consulted.
+ */
+static void handle_vec_simd_wshli(DisasContext *s, bool is_q, bool is_u,
+                                 int immh, int immb, int opcode, int rn, int rd)
+{
+    int size = 31 - clz32(immh);
+    int esize = 8 << size;
+    int shift = ((immh << 3) | immb) - esize;
+    int elements = 64 / esize;
+    /* Extension selector handed to ext_and_shift_reg(): element size in
+     * the low bits, bit 2 set for the signed form (presumably requesting
+     * sign extension -- confirm against ext_and_shift_reg()).
+     */
+    int extend_option = size | (!is_u << 2);
+    int src_half = is_q ? 1 : 0;
+    TCGv_i64 tcg_rn = new_tmp_a64(s);
+    TCGv_i64 tcg_rd = new_tmp_a64(s);
+    int elt;
+
+    /* The source elements must be narrower than 64 bits to be widened. */
+    if (size >= 3) {
+        unallocated_encoding(s);
+        return;
+    }
+
+    /* Each store is wider than the matching load, so with rd == rn a
+     * per-element load would read data already overwritten. Fetch the
+     * whole 64-bit source once and peel elements off it with shifts.
+     */
+    read_vec_element(s, tcg_rn, rn, src_half, MO_64);
+
+    for (elt = 0; elt < elements; elt++) {
+        tcg_gen_shri_i64(tcg_rd, tcg_rn, elt * esize);
+        ext_and_shift_reg(tcg_rd, tcg_rd, extend_option, 0);
+        tcg_gen_shli_i64(tcg_rd, tcg_rd, shift);
+        write_vec_element(s, tcg_rd, rd, elt, size + 1);
+    }
+}
+
+
+/* C3.6.14 AdvSIMD shift by immediate
+ *  31  30   29 28         23 22  19 18  16 15    11  10 9    5 4    0
+ * +---+---+---+-------------+------+------+--------+---+------+------+
+ * | 0 | Q | U | 0 1 1 1 1 0 | immh | immb | opcode | 1 |  Rn  |  Rd  |
+ * +---+---+---+-------------+------+------+--------+---+------+------+
+ */
+static void disas_simd_shift_imm(DisasContext *s, uint32_t insn)
+{
+    /* Fields in the order they appear in the diagram above. */
+    bool is_q = extract32(insn, 30, 1);
+    bool is_u = extract32(insn, 29, 1);
+    int immh = extract32(insn, 19, 4);
+    int immb = extract32(insn, 16, 3);
+    int opcode = extract32(insn, 11, 5);
+    int rn = extract32(insn, 5, 5);
+    int rd = extract32(insn, 0, 5);
+
+    switch (opcode) {
+    case 0x00: /* SSHR / USHR */
+    case 0x02: /* SSRA / USRA (accumulate) */
+    case 0x04: /* SRSHR / URSHR (rounding) */
+    case 0x06: /* SRSRA / URSRA (accum + rounding) */
+        /* All right shifts share one handler, which re-examines opcode. */
+        handle_vec_simd_shri(s, is_q, is_u, immh, immb, opcode, rn, rd);
+        return;
+    case 0x0a: /* SHL / SLI */
+        /* The U bit is forwarded as the handler's 'insert' flag. */
+        handle_vec_simd_shli(s, is_q, is_u, immh, immb, opcode, rn, rd);
+        return;
+    case 0x14: /* SSHLL / USHLL */
+        handle_vec_simd_wshli(s, is_q, is_u, immh, immb, opcode, rn, rd);
+        return;
+    default:
+        /* Narrowing and saturating shifts are not implemented yet, and
+         * neither are the fixed-point conversions that live in this
+         * encoding group (SCVTF, FCVTZS, UCVTF, FCVTZU).
+         */
+        unsupported_encoding(s, insn);
+        return;
+    }
+}
+
+/* Widening "three different" operations. Sources come from Rn and Rm;
+ * two 64-bit result chunks are produced (one per pass) and written to
+ * elements 0 and 1 of Rd. With is_q set the 32-bit source chunks are
+ * taken from index 2 upwards instead of index 0 (the "2" mnemonics in
+ * the comments below). For accumulating opcodes the current contents of
+ * Rd are loaded first and each pass result is added to or subtracted
+ * from them.
+ *
+ * Only opcodes 5, 7, 8, 10 and 12 are handled; any other value reaches
+ * g_assert_not_reached(). size must be 0, 1 or 2.
+ */
+static void handle_3rd_widening(DisasContext *s, int is_q, int is_u, int size,
+                                int opcode, int rd, int rn, int rm)
+{
+    /* 3-reg-different widening insns: 64 x 64 -> 128 */
+    TCGv_i64 tcg_res[2];
+    int pass, accop;
+
+    tcg_res[0] = tcg_temp_new_i64();
+    tcg_res[1] = tcg_temp_new_i64();
+
+    /* Does this op do an adding accumulate, a subtracting accumulate,
+     * or no accumulate at all?
+     */
+    switch (opcode) {
+    case 5:
+    case 8:
+    case 9:
+        accop = 1;
+        break;
+    case 10:
+    case 11:
+        accop = -1;
+        break;
+    default:
+        accop = 0;
+        break;
+    }
+
+    /* Accumulating forms start from the existing destination value. */
+    if (accop != 0) {
+        read_vec_element(s, tcg_res[0], rd, 0, MO_64);
+        read_vec_element(s, tcg_res[1], rd, 1, MO_64);
+    }
+
+    /* size == 2 means two 32x32->64 operations; this is worth special
+     * casing because we can generally handle it inline.
+     */
+    if (size == 2) {
+        for (pass = 0; pass < 2; pass++) {
+            TCGv_i64 tcg_op1 = tcg_temp_new_i64();
+            TCGv_i64 tcg_op2 = tcg_temp_new_i64();
+            TCGv_i64 tcg_passres;
+            /* Signed forms load with MO_SIGN, unsigned forms without. */
+            TCGMemOp memop = MO_32 | (is_u ? 0 : MO_SIGN);
+
+            int elt = pass + is_q * 2;
+
+            read_vec_element(s, tcg_op1, rn, elt, memop);
+            read_vec_element(s, tcg_op2, rm, elt, memop);
+
+            /* Without accumulation the pass result is the final result;
+             * otherwise compute into a scratch temp and combine below.
+             */
+            if (accop == 0) {
+                tcg_passres = tcg_res[pass];
+            } else {
+                tcg_passres = tcg_temp_new_i64();
+            }
+
+            switch (opcode) {
+            case 5: /* SABAL, SABAL2, UABAL, UABAL2 */
+            case 7: /* SABDL, SABDL2, UABDL, UABDL2 */
+            {
+                TCGv_i64 tcg_tmp1 = tcg_temp_new_i64();
+                TCGv_i64 tcg_tmp2 = tcg_temp_new_i64();
+
+                /* Absolute difference: compute both op1 - op2 and
+                 * op2 - op1, then keep op1 - op2 when op1 >= op2
+                 * (unsigned or signed compare per is_u) and op2 - op1
+                 * otherwise.
+                 */
+                tcg_gen_sub_i64(tcg_tmp1, tcg_op1, tcg_op2);
+                tcg_gen_sub_i64(tcg_tmp2, tcg_op2, tcg_op1);
+                tcg_gen_movcond_i64(is_u ? TCG_COND_GEU : TCG_COND_GE,
+                                    tcg_passres,
+                                    tcg_op1, tcg_op2, tcg_tmp1, tcg_tmp2);
+                tcg_temp_free_i64(tcg_tmp1);
+                tcg_temp_free_i64(tcg_tmp2);
+                break;
+            }
+            case 8: /* SMLAL, SMLAL2, UMLAL, UMLAL2 */
+            case 10: /* SMLSL, SMLSL2, UMLSL, UMLSL2 */
+            case 12: /* UMULL, UMULL2, SMULL, SMULL2 */
+                tcg_gen_mul_i64(tcg_passres, tcg_op1, tcg_op2);
+                break;
+            default:
+                g_assert_not_reached();
+            }
+
+            if (accop > 0) {
+                tcg_gen_add_i64(tcg_res[pass], tcg_res[pass], tcg_passres);
+                tcg_temp_free_i64(tcg_passres);
+            } else if (accop < 0) {
+                tcg_gen_sub_i64(tcg_res[pass], tcg_res[pass], tcg_passres);
+                tcg_temp_free_i64(tcg_passres);
+            }
+
+            tcg_temp_free_i64(tcg_op1);
+            tcg_temp_free_i64(tcg_op2);
+        }
+    } else {
+        /* size 0 or 1, generally helper functions */
+        for (pass = 0; pass < 2; pass++) {
+            TCGv_i32 tcg_op1 = tcg_temp_new_i32();
+            TCGv_i32 tcg_op2 = tcg_temp_new_i32();
+            TCGv_i64 tcg_passres;
+            int elt = pass + is_q * 2;
+
+            /* One 32-bit chunk of each source per pass; the helpers below
+             * take two 32-bit inputs and produce a 64-bit result.
+             */
+            read_vec_element_i32(s, tcg_op1, rn, elt, MO_32);
+            read_vec_element_i32(s, tcg_op2, rm, elt, MO_32);
+
+            if (accop == 0) {
+                tcg_passres = tcg_res[pass];
+            } else {
+                tcg_passres = tcg_temp_new_i64();
+            }
+
+            switch (opcode) {
+            case 5: /* SABAL, SABAL2, UABAL, UABAL2 */
+            case 7: /* SABDL, SABDL2, UABDL, UABDL2 */
+                if (size == 0) {
+                    if (is_u) {
+                        gen_helper_neon_abdl_u16(tcg_passres, tcg_op1, tcg_op2);
+                    } else {
+                        gen_helper_neon_abdl_s16(tcg_passres, tcg_op1, tcg_op2);
+                    }
+                } else {
+                    if (is_u) {
+                        gen_helper_neon_abdl_u32(tcg_passres, tcg_op1, tcg_op2);
+                    } else {
+                        gen_helper_neon_abdl_s32(tcg_passres, tcg_op1, tcg_op2);
+                    }
+                }
+                break;
+            case 8: /* SMLAL, SMLAL2, UMLAL, UMLAL2 */
+            case 10: /* SMLSL, SMLSL2, UMLSL, UMLSL2 */
+            case 12: /* UMULL, UMULL2, SMULL, SMULL2 */
+                if (size == 0) {
+                    if (is_u) {
+                        gen_helper_neon_mull_u8(tcg_passres, tcg_op1, tcg_op2);
+                    } else {
+                        gen_helper_neon_mull_s8(tcg_passres, tcg_op1, tcg_op2);
+                    }
+                } else {
+                    if (is_u) {
+                        gen_helper_neon_mull_u16(tcg_passres, tcg_op1, tcg_op2);
+                    } else {
+                        gen_helper_neon_mull_s16(tcg_passres, tcg_op1, tcg_op2);
+                    }
+                }
+                break;
+            default:
+                g_assert_not_reached();
+            }
+            tcg_temp_free_i32(tcg_op1);
+            tcg_temp_free_i32(tcg_op2);
+
+            /* Accumulate per widened lane. The same u-named add/sub
+             * helpers are used here regardless of is_u.
+             */
+            if (accop > 0) {
+                if (size == 0) {
+                    gen_helper_neon_addl_u16(tcg_res[pass], tcg_res[pass],
+                                             tcg_passres);
+                } else {
+                    gen_helper_neon_addl_u32(tcg_res[pass], tcg_res[pass],
+                                             tcg_passres);
+                }
+                tcg_temp_free_i64(tcg_passres);
+            } else if (accop < 0) {
+                if (size == 0) {
+                    gen_helper_neon_subl_u16(tcg_res[pass], tcg_res[pass],
+                                             tcg_passres);
+                } else {
+                    gen_helper_neon_subl_u32(tcg_res[pass], tcg_res[pass],
+                                             tcg_passres);
+                }
+                tcg_temp_free_i64(tcg_passres);
+            }
+        }
+    }
+
+    /* Results are only written back once both passes have read their
+     * inputs.
+     */
+    write_vec_element(s, tcg_res[0], rd, 0, MO_64);
+    write_vec_element(s, tcg_res[1], rd, 1, MO_64);
+    tcg_temp_free_i64(tcg_res[0]);
+    tcg_temp_free_i64(tcg_res[1]);
+}
+
+/* C3.6.15 AdvSIMD three different
+ *   31  30  29 28       24 23  22  21 20  16 15    12 11 10 9    5 4    0
+ * +---+---+---+-----------+------+---+------+--------+-----+------+------+
+ * | 0 | Q | U | 0 1 1 1 0 | size | 1 |  Rm  | opcode | 0 0 |  Rn  |  Rd  |
+ * +---+---+---+-----------+------+---+------+--------+-----+------+------+
+ */
+static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn)
+{
+    /* The group contains three shapes of per-element operation:
+     *  - widening: 64 x 64 -> 128 (Vd may be an extra 128 bit input)
+     *  - wide:     64 x 128 -> 128
+     *  - narrowing: 128 x 128 -> 64
+     * This function only does the first-level decode: it rejects the
+     * unallocated encodings and forwards the rest. Only part of the
+     * widening class has a handler so far.
+     */
+    int rd = extract32(insn, 0, 5);
+    int rn = extract32(insn, 5, 5);
+    int opcode = extract32(insn, 12, 4);
+    int rm = extract32(insn, 16, 5);
+    int size = extract32(insn, 22, 2);
+    int is_u = extract32(insn, 29, 1);
+    int is_q = extract32(insn, 30, 1);
+
+    switch (opcode) {
+    case 5:
+    case 7:
+    case 8:
+    case 10:
+    case 12:
+        /* widening, 64 x 64 -> 128 */
+        if (size == 3) {
+            unallocated_encoding(s);
+        } else {
+            handle_3rd_widening(s, is_q, is_u, size, opcode, rd, rn, rm);
+        }
+        break;
+    case 9:
+    case 11:
+    case 13:
+    case 14:
+        /* These opcodes are unallocated when U is set. */
+        if (is_u) {
+            unallocated_encoding(s);
+            break;
+        }
+        /* fall through */
+    case 0:
+    case 2:
+    case 1: /* SADDW, SADDW2, UADDW, UADDW2: 64 x 128 -> 128 */
+    case 3: /* SSUBW, SSUBW2, USUBW, USUBW2: 64 x 128 -> 128 */
+    case 4: /* ADDHN, ADDHN2, RADDHN, RADDHN2: 128 x 128 -> 64 */
+    case 6: /* SUBHN, SUBHN2, RSUBHN, RSUBHN2: 128 x 128 -> 64 */
+        unsupported_encoding(s, insn);
+        break;
+    default:
+        /* opcode 15 not allocated */
+        unallocated_encoding(s);
+        break;
+    }
+}
+
+/* Logic op (opcode == 3) subgroup of C3.6.16.
+ *
+ * The U bit and the 2-bit size field together pick one of eight bitwise
+ * operations. The vector is processed 64 bits per pass: one pass for a
+ * 64-bit vector, two when Q is set. For a 64-bit vector the upper half
+ * of Rd is written as zero.
+ */
+static void disas_simd_3same_logic(DisasContext *s, uint32_t insn)
+{
+    int rd = extract32(insn, 0, 5);
+    int rn = extract32(insn, 5, 5);
+    int rm = extract32(insn, 16, 5);
+    int size = extract32(insn, 22, 2);
+    bool is_u = extract32(insn, 29, 1);
+    bool is_q = extract32(insn, 30, 1);
+    TCGv_i64 tcg_op1 = tcg_temp_new_i64();
+    TCGv_i64 tcg_op2 = tcg_temp_new_i64();
+    TCGv_i64 tcg_res[2];
+    int pass;
+
+    tcg_res[0] = tcg_temp_new_i64();
+    tcg_res[1] = tcg_temp_new_i64();
+
+    for (pass = 0; pass < (is_q ? 2 : 1); pass++) {
+        /* op1 = Rn, op2 = Rm for this 64-bit half */
+        read_vec_element(s, tcg_op1, rn, pass, MO_64);
+        read_vec_element(s, tcg_op2, rm, pass, MO_64);
+
+        if (!is_u) {
+            switch (size) {
+            case 0: /* AND */
+                tcg_gen_and_i64(tcg_res[pass], tcg_op1, tcg_op2);
+                break;
+            case 1: /* BIC */
+                tcg_gen_andc_i64(tcg_res[pass], tcg_op1, tcg_op2);
+                break;
+            case 2: /* ORR */
+                tcg_gen_or_i64(tcg_res[pass], tcg_op1, tcg_op2);
+                break;
+            case 3: /* ORN */
+                tcg_gen_orc_i64(tcg_res[pass], tcg_op1, tcg_op2);
+                break;
+            }
+        } else {
+            if (size != 0) {
+                /* B* ops need res loaded to operate on */
+                read_vec_element(s, tcg_res[pass], rd, pass, MO_64);
+            }
+
+            /* The three B* cases use the xor/and/xor select idiom and
+             * clobber tcg_op1 as scratch.
+             */
+            switch (size) {
+            case 0: /* EOR */
+                tcg_gen_xor_i64(tcg_res[pass], tcg_op1, tcg_op2);
+                break;
+            case 1: /* BSL bitwise select */
+                /* res = Rm ^ ((Rn ^ Rm) & Rd): Rn bits where the old Rd
+                 * bit is set, Rm bits elsewhere.
+                 */
+                tcg_gen_xor_i64(tcg_op1, tcg_op1, tcg_op2);
+                tcg_gen_and_i64(tcg_op1, tcg_op1, tcg_res[pass]);
+                tcg_gen_xor_i64(tcg_res[pass], tcg_op2, tcg_op1);
+                break;
+            case 2: /* BIT, bitwise insert if true */
+                /* res = Rd ^ ((Rn ^ Rd) & Rm): Rn bits where Rm is set,
+                 * old Rd bits elsewhere.
+                 */
+                tcg_gen_xor_i64(tcg_op1, tcg_op1, tcg_res[pass]);
+                tcg_gen_and_i64(tcg_op1, tcg_op1, tcg_op2);
+                tcg_gen_xor_i64(tcg_res[pass], tcg_res[pass], tcg_op1);
+                break;
+            case 3: /* BIF, bitwise insert if false */
+                /* res = Rd ^ ((Rn ^ Rd) & ~Rm): Rn bits where Rm is
+                 * clear, old Rd bits elsewhere.
+                 */
+                tcg_gen_xor_i64(tcg_op1, tcg_op1, tcg_res[pass]);
+                tcg_gen_andc_i64(tcg_op1, tcg_op1, tcg_op2);
+                tcg_gen_xor_i64(tcg_res[pass], tcg_res[pass], tcg_op1);
+                break;
+            }
+        }
+    }
+
+    write_vec_element(s, tcg_res[0], rd, 0, MO_64);
+    if (!is_q) {
+        /* Only one pass ran, so tcg_res[1] was never computed: zero it. */
+        tcg_gen_movi_i64(tcg_res[1], 0);
+    }
+    write_vec_element(s, tcg_res[1], rd, 1, MO_64);
+
+    tcg_temp_free_i64(tcg_op1);
+    tcg_temp_free_i64(tcg_op2);
+    tcg_temp_free_i64(tcg_res[0]);
+    tcg_temp_free_i64(tcg_res[1]);
+}
+
+/* Pairwise op subgroup of C3.6.16.
+ * None of these are decoded here; every encoding is handed straight to
+ * unsupported_encoding().
+ */
+static void disas_simd_3same_pair(DisasContext *s, uint32_t insn)
+{
+    unsupported_encoding(s, insn);
+}
+
+/* Floating point op subgroup of C3.6.16.
+ *
+ * First-level decode only: builds the combined operation selector,
+ * rejects unallocated encodings and forwards the implemented
+ * element-wise operations to handle_3same_float().
+ */
+static void disas_simd_3same_float(DisasContext *s, uint32_t insn)
+{
+    int rd = extract32(insn, 0, 5);
+    int rn = extract32(insn, 5, 5);
+    int rm = extract32(insn, 16, 5);
+    /* size[0]: 0 gives 32-bit elements, 1 gives 64-bit elements */
+    int size = extract32(insn, 22, 1);
+    int is_q = extract32(insn, 30, 1);
+    /* The operation is identified by U (bit 6 of the selector),
+     * size[1] (bit 5) and the opcode field (bits 4..0).
+     */
+    int fpopcode = (extract32(insn, 29, 1) << 6)
+        | (extract32(insn, 23, 1) << 5)
+        | extract32(insn, 11, 5);
+    int elements = (is_q ? 128 : 64) / (32 << size);
+
+    /* 64-bit elements in a 64-bit vector are unallocated. */
+    if (!is_q && size == 1) {
+        unallocated_encoding(s);
+        return;
+    }
+
+    switch (fpopcode) {
+    /* implemented element-wise operations */
+    case 0x18: /* FMAXNM */
+    case 0x1a: /* FADD */
+    case 0x1e: /* FMAX */
+    case 0x38: /* FMINNM */
+    case 0x3a: /* FSUB */
+    case 0x3e: /* FMIN */
+    case 0x5b: /* FMUL */
+    case 0x5f: /* FDIV */
+    case 0x7a: /* FABD */
+        handle_3same_float(s, size, elements, fpopcode, rd, rn, rm);
+        return;
+    /* pairwise ops, not implemented yet */
+    case 0x58: /* FMAXNMP */
+    case 0x5a: /* FADDP */
+    case 0x5e: /* FMAXP */
+    case 0x78: /* FMINNMP */
+    case 0x7e: /* FMINP */
+    /* remaining allocated ops, not implemented yet */
+    case 0x19: /* FMLA */
+    case 0x1b: /* FMULX */
+    case 0x1c: /* FCMEQ */
+    case 0x1f: /* FRECPS */
+    case 0x39: /* FMLS */
+    case 0x3f: /* FRSQRTS */
+    case 0x5c: /* FCMGE */
+    case 0x5d: /* FACGE */
+    case 0x7c: /* FCMGT */
+    case 0x7d: /* FACGT */
+        unsupported_encoding(s, insn);
+        return;
+    default:
+        unallocated_encoding(s);
+        return;
+    }
+}
+
+/* Integer op subgroup of C3.6.16.
+ *
+ * The first switch filters the encodings: it raises unallocated for
+ * invalid size/Q combinations and reports the operations that have no
+ * implementation yet as unsupported. Whatever breaks out of that switch
+ * is translated below, either 64 bits at a time (size == 3) or 32 bits
+ * at a time using a per-opcode function table indexed by [size][u].
+ */
+static void disas_simd_3same_int(DisasContext *s, uint32_t insn)
+{
+    int is_q = extract32(insn, 30, 1);
+    int u = extract32(insn, 29, 1);
+    int size = extract32(insn, 22, 2);
+    int opcode = extract32(insn, 11, 5);
+    int rm = extract32(insn, 16, 5);
+    int rn = extract32(insn, 5, 5);
+    int rd = extract32(insn, 0, 5);
+    int pass;
+
+    switch (opcode) {
+    case 0x13: /* MUL, PMUL */
+        /* With U set only size 0 is allocated. */
+        if (u && size != 0) {
+            unallocated_encoding(s);
+            return;
+        }
+        /* fall through */
+    case 0x0: /* SHADD, UHADD */
+    case 0x2: /* SRHADD, URHADD */
+    case 0x4: /* SHSUB, UHSUB */
+    case 0xc: /* SMAX, UMAX */
+    case 0xd: /* SMIN, UMIN */
+    case 0xe: /* SABD, UABD */
+    case 0xf: /* SABA, UABA */
+    case 0x12: /* MLA, MLS */
+        /* These have no 64-bit element form at all. */
+        if (size == 3) {
+            unallocated_encoding(s);
+            return;
+        }
+        unsupported_encoding(s, insn);
+        return;
+    case 0x1: /* SQADD */
+    case 0x5: /* SQSUB */
+    case 0x8: /* SSHL, USHL */
+    case 0x9: /* SQSHL, UQSHL */
+    case 0xa: /* SRSHL, URSHL */
+    case 0xb: /* SQRSHL, UQRSHL */
+        /* 64-bit elements require a 128-bit vector. */
+        if (size == 3 && !is_q) {
+            unallocated_encoding(s);
+            return;
+        }
+        unsupported_encoding(s, insn);
+        return;
+    case 0x16: /* SQDMULH, SQRDMULH */
+        if (size == 0 || size == 3) {
+            unallocated_encoding(s);
+            return;
+        }
+        unsupported_encoding(s, insn);
+        return;
+    default:
+        /* Everything else is translated below. */
+        if (size == 3 && !is_q) {
+            unallocated_encoding(s);
+            return;
+        }
+        break;
+    }
+
+    if (size == 3) {
+        /* 64-bit elements: one element per pass via handle_3same_64(). */
+        for (pass = 0; pass < (is_q ? 2 : 1); pass++) {
+            TCGv_i64 tcg_op1 = tcg_temp_new_i64();
+            TCGv_i64 tcg_op2 = tcg_temp_new_i64();
+            TCGv_i64 tcg_res = tcg_temp_new_i64();
+
+            read_vec_element(s, tcg_op1, rn, pass, MO_64);
+            read_vec_element(s, tcg_op2, rm, pass, MO_64);
+
+            handle_3same_64(s, opcode, u, tcg_res, tcg_op1, tcg_op2);
+
+            write_vec_element(s, tcg_res, rd, pass, MO_64);
+
+            tcg_temp_free_i64(tcg_res);
+            tcg_temp_free_i64(tcg_op1);
+            tcg_temp_free_i64(tcg_op2);
+        }
+    } else {
+        /* Narrower elements: process the vector in 32-bit chunks (two
+         * passes for a 64-bit vector, four for a 128-bit one) with a
+         * generator function selected by element size and the U bit.
+         */
+        for (pass = 0; pass < (is_q ? 4 : 2); pass++) {
+            TCGv_i32 tcg_op1 = tcg_temp_new_i32();
+            TCGv_i32 tcg_op2 = tcg_temp_new_i32();
+            TCGv_i32 tcg_res = tcg_temp_new_i32();
+            NeonGenTwoOpFn *genfn;
+
+            read_vec_element_i32(s, tcg_op1, rn, pass, MO_32);
+            read_vec_element_i32(s, tcg_op2, rm, pass, MO_32);
+
+            switch (opcode) {
+            case 0x6: /* CMGT, CMHI */
+            {
+                static NeonGenTwoOpFn * const fns[3][2] = {
+                    { gen_helper_neon_cgt_s8, gen_helper_neon_cgt_u8 },
+                    { gen_helper_neon_cgt_s16, gen_helper_neon_cgt_u16 },
+                    { gen_helper_neon_cgt_s32, gen_helper_neon_cgt_u32 },
+                };
+                genfn = fns[size][u];
+                break;
+            }
+            case 0x7: /* CMGE, CMHS */
+            {
+                static NeonGenTwoOpFn * const fns[3][2] = {
+                    { gen_helper_neon_cge_s8, gen_helper_neon_cge_u8 },
+                    { gen_helper_neon_cge_s16, gen_helper_neon_cge_u16 },
+                    { gen_helper_neon_cge_s32, gen_helper_neon_cge_u32 },
+                };
+                genfn = fns[size][u];
+                break;
+            }
+            case 0x10: /* ADD, SUB */
+            {
+                /* Here U selects the operation (column 0 add, column 1
+                 * sub) rather than signedness.
+                 */
+                static NeonGenTwoOpFn * const fns[3][2] = {
+                    { gen_helper_neon_add_u8, gen_helper_neon_sub_u8 },
+                    { gen_helper_neon_add_u16, gen_helper_neon_sub_u16 },
+                    { tcg_gen_add_i32, tcg_gen_sub_i32 },
+                };
+                genfn = fns[size][u];
+                break;
+            }
+            case 0x11: /* CMTST, CMEQ */
+            {
+                /* Here U selects test (column 0) versus equality
+                 * (column 1).
+                 */
+                static NeonGenTwoOpFn * const fns[3][2] = {
+                    { gen_helper_neon_tst_u8, gen_helper_neon_ceq_u8 },
+                    { gen_helper_neon_tst_u16, gen_helper_neon_ceq_u16 },
+                    { gen_helper_neon_tst_u32, gen_helper_neon_ceq_u32 },
+                };
+                genfn = fns[size][u];
+                break;
+            }
+            default:
+                /* Any other opcode should have returned from the filter
+                 * switch above.
+                 */
+                g_assert_not_reached();
+            }
+
+            genfn(tcg_res, tcg_op1, tcg_op2);
+
+            write_vec_element_i32(s, tcg_res, rd, pass, MO_32);
+
+            tcg_temp_free_i32(tcg_res);
+            tcg_temp_free_i32(tcg_op1);
+            tcg_temp_free_i32(tcg_op2);
+        }
+    }
+
+    if (!is_q) {
+        clear_vec_high(s, rd);
+    }
+}
+
+/* C3.6.16 AdvSIMD three same
+ *  31  30  29  28       24 23  22  21 20  16 15    11  10 9    5 4    0
+ * +---+---+---+-----------+------+---+------+--------+---+------+------+
+ * | 0 | Q | U | 0 1 1 1 0 | size | 1 |  Rm  | opcode | 1 |  Rn  |  Rd  |
+ * +---+---+---+-----------+------+---+------+--------+---+------+------+
+ *
+ * Splits the group by the 5-bit opcode field into the logic, pairwise,
+ * floating point and integer subgroups; each sub-decoder re-extracts
+ * the fields it needs from insn.
+ */
+static void disas_simd_three_reg_same(DisasContext *s, uint32_t insn)
+{
+    int opcode = extract32(insn, 11, 5);
+
+    switch (opcode) {
+    case 0x3: /* logic ops */
+        disas_simd_3same_logic(s, insn);
+        break;
+    case 0x17: /* ADDP */
+    case 0x14: /* SMAXP, UMAXP */
+    case 0x15: /* SMINP, UMINP */
+        /* Pairwise operations */
+        disas_simd_3same_pair(s, insn);
+        break;
+    case 0x18 ... 0x1f:
+        /* floating point ops, sz[1] and U are part of opcode.
+         * opcode is a 5-bit field, so 0x1f (31) is the highest value
+         * it can take.
+         */
+        disas_simd_3same_float(s, insn);
+        break;
+    default:
+        disas_simd_3same_int(s, insn);
+        break;
+    }
+}
+
+/* C3.6.17 AdvSIMD two reg misc
+ *   31  30  29 28       24 23  22 21       17 16    12 11 10 9    5 4    0
+ * +---+---+---+-----------+------+-----------+--------+-----+------+------+
+ * | 0 | Q | U | 0 1 1 1 0 | size | 1 0 0 0 0 | opcode | 1 0 |  Rn  |  Rd  |
+ * +---+---+---+-----------+------+-----------+--------+-----+------+------+
+ */
+static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
+{
+    /* Nothing in this group is decoded here; every encoding is handed
+     * straight to unsupported_encoding().
+     */
+    unsupported_encoding(s, insn);
+}
+
+/* C3.6.18 AdvSIMD vector x indexed element
+ *   31  30  29 28       24 23  22 21  20  19  16 15 12  11  10 9    5 4    0
+ * +---+---+---+-----------+------+---+---+------+-----+---+---+------+------+
+ * | 0 | Q | U | 0 1 1 1 1 | size | L | M |  Rm  | opc | H | 0 |  Rn  |  Rd  |
+ * +---+---+---+-----------+------+---+---+------+-----+---+---+------+------+
+ */
+static void disas_simd_indexed_vector(DisasContext *s, uint32_t insn)
+{
+    /* Nothing in this group is decoded here; every encoding is handed
+     * straight to unsupported_encoding().
+     */
+    unsupported_encoding(s, insn);
+}
+
+/* C3.6.19 Crypto AES
+ *  31             24 23  22 21       17 16    12 11 10 9    5 4    0
+ * +-----------------+------+-----------+--------+-----+------+------+
+ * | 0 1 0 0 1 1 1 0 | size | 1 0 1 0 0 | opcode | 1 0 |  Rn  |  Rd  |
+ * +-----------------+------+-----------+--------+-----+------+------+
+ */
+static void disas_crypto_aes(DisasContext *s, uint32_t insn)
+{
+    /* Nothing in this group is decoded here; every encoding is handed
+     * straight to unsupported_encoding().
+     */
+    unsupported_encoding(s, insn);
+}
+
+/* C3.6.20 Crypto three-reg SHA
+ *  31             24 23  22  21 20  16  15 14    12 11 10 9    5 4    0
+ * +-----------------+------+---+------+---+--------+-----+------+------+
+ * | 0 1 0 1 1 1 1 0 | size | 0 |  Rm  | 0 | opcode | 0 0 |  Rn  |  Rd  |
+ * +-----------------+------+---+------+---+--------+-----+------+------+
+ */
+static void disas_crypto_three_reg_sha(DisasContext *s, uint32_t insn)
+{
+    /* Nothing in this group is decoded here; every encoding is handed
+     * straight to unsupported_encoding().
+     */
+    unsupported_encoding(s, insn);
+}
+
+/* C3.6.21 Crypto two-reg SHA
+ *  31             24 23  22 21       17 16    12 11 10 9    5 4    0
+ * +-----------------+------+-----------+--------+-----+------+------+
+ * | 0 1 0 1 1 1 1 0 | size | 1 0 1 0 0 | opcode | 1 0 |  Rn  |  Rd  |
+ * +-----------------+------+-----------+--------+-----+------+------+
+ */
+static void disas_crypto_two_reg_sha(DisasContext *s, uint32_t insn)
+{
+    /* Nothing in this group is decoded here; every encoding is handed
+     * straight to unsupported_encoding().
+     */
+    unsupported_encoding(s, insn);
+}
+
+/* C3.6 Data processing - SIMD, inc Crypto
+ *
+ * As the decode gets a little complex we are using a table based
+ * approach for this part of the decode.
+ *
+ * Each row is { pattern, mask, fn }. Several masks overlap (see the
+ * simd_mod_imm note below), so the order of the rows is significant
+ * and must be preserved when adding entries.
+ */
+static const AArch64DecodeTable data_proc_simd[] = {
+    /* pattern  ,  mask     ,  fn                        */
+    /* vector forms */
+    { 0x0e200400, 0x9f200400, disas_simd_three_reg_same },
+    { 0x0e200000, 0x9f200c00, disas_simd_three_reg_diff },
+    { 0x0e200800, 0x9f3e0c00, disas_simd_two_reg_misc },
+    { 0x0e300800, 0x9f3e0c00, disas_simd_across_lanes },
+    { 0x0e000400, 0x9fe08400, disas_simd_copy },
+    { 0x0f000000, 0x9f000400, disas_simd_indexed_vector },
+    /* simd_mod_imm decode is a subset of simd_shift_imm, so must precede it */
+    { 0x0f000400, 0x9ff80400, disas_simd_mod_imm },
+    { 0x0f000400, 0x9f800400, disas_simd_shift_imm },
+    { 0x0e000000, 0xbf208c00, disas_simd_tb },
+    { 0x0e000800, 0xbf208c00, disas_simd_zip_trn },
+    { 0x2e000000, 0xbf208400, disas_simd_ext },
+    /* scalar forms */
+    { 0x5e200400, 0xdf200400, disas_simd_scalar_three_reg_same },
+    { 0x5e200000, 0xdf200c00, disas_simd_scalar_three_reg_diff },
+    { 0x5e200800, 0xdf3e0c00, disas_simd_scalar_two_reg_misc },
+    { 0x5e300800, 0xdf3e0c00, disas_simd_scalar_pairwise },
+    { 0x5e000400, 0xdfe08400, disas_simd_scalar_copy },
+    { 0x5f000000, 0xdf000400, disas_simd_scalar_indexed },
+    { 0x5f000400, 0xdf800400, disas_simd_scalar_shift_imm },
+    /* crypto */
+    { 0x4e280800, 0xff3e0c00, disas_crypto_aes },
+    { 0x5e000000, 0xff208c00, disas_crypto_three_reg_sha },
+    { 0x5e280800, 0xff3e0c00, disas_crypto_two_reg_sha },
+    /* all-zero row with a NULL fn; presumably the end-of-table marker
+     * for lookup_disas_fn() -- confirm against its definition
+     */
+    { 0x00000000, 0x00000000, NULL }
+};
+
 static void disas_data_proc_simd(DisasContext *s, uint32_t insn)
 {
     /* Note that this is called with all non-FP cases from
      * table C3-6 so it must UNDEF for entries not specifically
      * allocated to instructions in that table.
      */
-    unsupported_encoding(s, insn);
+    /* Look the insn up in data_proc_simd; no match means unallocated. */
+    AArch64DecodeFn *fn = lookup_disas_fn(&data_proc_simd[0], insn);
+    if (fn) {
+        fn(s, insn);
+    } else {
+        unallocated_encoding(s);
+    }
 }
 
 /* C3.6 Data processing - SIMD and floating point */
diff --git a/target-arm/translate.c b/target-arm/translate.c
index 8d240e160d..e701c0f9e1 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -2759,6 +2759,113 @@ static int handle_vminmaxnm(uint32_t insn, uint32_t rd, uint32_t rn,
     return 0;
 }
 
+/* Emit code for a round-to-integral operation that uses an explicit
+ * rounding mode: the register rm is loaded, passed through the
+ * rintd/rints helper and stored to rd, as a double when dp is non-zero
+ * and as a single otherwise. 'insn' is accepted but not consulted.
+ * Always returns 0.
+ */
+static int handle_vrint(uint32_t insn, uint32_t rd, uint32_t rm, uint32_t dp,
+                        int rounding)
+{
+    TCGv_ptr fpst = get_fpstatus_ptr(0);
+    TCGv_i32 tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rounding));
+
+    /* Install the requested rounding mode. The same temp is both the
+     * input and the output of the helper and is fed back to it after
+     * the operation, which presumably restores the previous mode --
+     * confirm against the set_rmode helper.
+     */
+    gen_helper_set_rmode(tcg_rmode, tcg_rmode, cpu_env);
+
+    if (!dp) {
+        TCGv_i32 tcg_src = tcg_temp_new_i32();
+        TCGv_i32 tcg_dst = tcg_temp_new_i32();
+
+        tcg_gen_ld_f32(tcg_src, cpu_env, vfp_reg_offset(dp, rm));
+        gen_helper_rints(tcg_dst, tcg_src, fpst);
+        tcg_gen_st_f32(tcg_dst, cpu_env, vfp_reg_offset(dp, rd));
+        tcg_temp_free_i32(tcg_src);
+        tcg_temp_free_i32(tcg_dst);
+    } else {
+        TCGv_i64 tcg_src = tcg_temp_new_i64();
+        TCGv_i64 tcg_dst = tcg_temp_new_i64();
+
+        tcg_gen_ld_f64(tcg_src, cpu_env, vfp_reg_offset(dp, rm));
+        gen_helper_rintd(tcg_dst, tcg_src, fpst);
+        tcg_gen_st_f64(tcg_dst, cpu_env, vfp_reg_offset(dp, rd));
+        tcg_temp_free_i64(tcg_src);
+        tcg_temp_free_i64(tcg_dst);
+    }
+
+    gen_helper_set_rmode(tcg_rmode, tcg_rmode, cpu_env);
+    tcg_temp_free_i32(tcg_rmode);
+
+    tcg_temp_free_ptr(fpst);
+    return 0;
+}
+
+/* Emit code for a float-to-integer conversion that uses an explicit
+ * rounding mode. The source register rm is a double when dp is
+ * non-zero and a single otherwise; the result is always stored as a
+ * 32-bit value in a single precision register. Bit 7 of insn selects
+ * the signed conversion helpers. Always returns 0.
+ */
+static int handle_vcvt(uint32_t insn, uint32_t rd, uint32_t rm, uint32_t dp,
+                       int rounding)
+{
+    bool is_signed = extract32(insn, 7, 1);
+    TCGv_ptr fpst = get_fpstatus_ptr(0);
+    /* The conversion helpers take a shift operand; it is always 0 here. */
+    TCGv_i32 tcg_shift = tcg_const_i32(0);
+    TCGv_i32 tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rounding));
+
+    /* Install the requested rounding mode. The same temp is both the
+     * input and the output of the helper and is fed back to it after
+     * the conversion, which presumably restores the previous mode --
+     * confirm against the set_rmode helper.
+     */
+    gen_helper_set_rmode(tcg_rmode, tcg_rmode, cpu_env);
+
+    if (!dp) {
+        TCGv_i32 tcg_src = tcg_temp_new_i32();
+        TCGv_i32 tcg_int = tcg_temp_new_i32();
+
+        tcg_gen_ld_f32(tcg_src, cpu_env, vfp_reg_offset(0, rm));
+        if (is_signed) {
+            gen_helper_vfp_tosls(tcg_int, tcg_src, tcg_shift, fpst);
+        } else {
+            gen_helper_vfp_touls(tcg_int, tcg_src, tcg_shift, fpst);
+        }
+        tcg_gen_st_f32(tcg_int, cpu_env, vfp_reg_offset(0, rd));
+        tcg_temp_free_i32(tcg_int);
+        tcg_temp_free_i32(tcg_src);
+    } else {
+        TCGv_i64 tcg_src = tcg_temp_new_i64();
+        TCGv_i64 tcg_wide = tcg_temp_new_i64();
+        TCGv_i32 tcg_int = tcg_temp_new_i32();
+
+        /* The destination is a single precision register even though
+         * the source is double precision, so convert the register
+         * number the caller decoded into single precision numbering.
+         */
+        rd = ((rd << 1) & 0x1e) | ((rd >> 4) & 0x1);
+
+        tcg_gen_ld_f64(tcg_src, cpu_env, vfp_reg_offset(1, rm));
+        if (is_signed) {
+            gen_helper_vfp_tosld(tcg_wide, tcg_src, tcg_shift, fpst);
+        } else {
+            gen_helper_vfp_tould(tcg_wide, tcg_src, tcg_shift, fpst);
+        }
+        /* Keep only the low 32 bits of the helper's 64-bit result. */
+        tcg_gen_trunc_i64_i32(tcg_int, tcg_wide);
+        tcg_gen_st_f32(tcg_int, cpu_env, vfp_reg_offset(0, rd));
+        tcg_temp_free_i32(tcg_int);
+        tcg_temp_free_i64(tcg_wide);
+        tcg_temp_free_i64(tcg_src);
+    }
+
+    gen_helper_set_rmode(tcg_rmode, tcg_rmode, cpu_env);
+    tcg_temp_free_i32(tcg_rmode);
+
+    tcg_temp_free_i32(tcg_shift);
+
+    tcg_temp_free_ptr(fpst);
+
+    return 0;
+}
+
+/* Maps the 2-bit rounding mode field used by most AArch32 encodings
+ * onto arm_fprounding values (whose order matches the common AArch64
+ * encoding); see ARM ARM pseudocode FPDecodeRM(). The table is indexed
+ * by the raw field value.
+ */
+static const uint8_t fp_decode_rm[] = {
+    [0] = FPROUNDING_TIEAWAY,
+    [1] = FPROUNDING_TIEEVEN,
+    [2] = FPROUNDING_POSINF,
+    [3] = FPROUNDING_NEGINF,
+};
+
 static int disas_vfp_v8_insn(CPUARMState *env, DisasContext *s, uint32_t insn)
 {
     uint32_t rd, rn, rm, dp = extract32(insn, 8, 1);
@@ -2781,6 +2888,14 @@ static int disas_vfp_v8_insn(CPUARMState *env, DisasContext *s, uint32_t insn)
         return handle_vsel(insn, rd, rn, rm, dp);
     } else if ((insn & 0x0fb00e10) == 0x0e800a00) {
         return handle_vminmaxnm(insn, rd, rn, rm, dp);
+    } else if ((insn & 0x0fbc0ed0) == 0x0eb80a40) {
+        /* VRINTA, VRINTN, VRINTP, VRINTM */
+        int rounding = fp_decode_rm[extract32(insn, 16, 2)];
+        return handle_vrint(insn, rd, rm, dp, rounding);
+    } else if ((insn & 0x0fbc0e50) == 0x0ebc0a40) {
+        /* VCVTA, VCVTN, VCVTP, VCVTM */
+        int rounding = fp_decode_rm[extract32(insn, 16, 2)];
+        return handle_vcvt(insn, rd, rm, dp, rounding);
     }
     return 1;
 }
@@ -3325,6 +3440,44 @@ static int disas_vfp_insn(CPUARMState * env, DisasContext *s, uint32_t insn)
                         gen_vfp_F1_ld0(dp);
                         gen_vfp_cmpe(dp);
                         break;
+                    case 12: /* vrintr */
+                    {
+                        TCGv_ptr fpst = get_fpstatus_ptr(0);
+                        if (dp) {
+                            gen_helper_rintd(cpu_F0d, cpu_F0d, fpst);
+                        } else {
+                            gen_helper_rints(cpu_F0s, cpu_F0s, fpst);
+                        }
+                        tcg_temp_free_ptr(fpst);
+                        break;
+                    }
+                    case 13: /* vrintz */
+                    {
+                        TCGv_ptr fpst = get_fpstatus_ptr(0);
+                        TCGv_i32 tcg_rmode;
+                        tcg_rmode = tcg_const_i32(float_round_to_zero);
+                        gen_helper_set_rmode(tcg_rmode, tcg_rmode, cpu_env);
+                        if (dp) {
+                            gen_helper_rintd(cpu_F0d, cpu_F0d, fpst);
+                        } else {
+                            gen_helper_rints(cpu_F0s, cpu_F0s, fpst);
+                        }
+                        gen_helper_set_rmode(tcg_rmode, tcg_rmode, cpu_env);
+                        tcg_temp_free_i32(tcg_rmode);
+                        tcg_temp_free_ptr(fpst);
+                        break;
+                    }
+                    case 14: /* vrintx */
+                    {
+                        TCGv_ptr fpst = get_fpstatus_ptr(0);
+                        if (dp) {
+                            gen_helper_rintd_exact(cpu_F0d, cpu_F0d, fpst);
+                        } else {
+                            gen_helper_rints_exact(cpu_F0s, cpu_F0s, fpst);
+                        }
+                        tcg_temp_free_ptr(fpst);
+                        break;
+                    }
                     case 15: /* single<->double conversion */
                         if (dp)
                             gen_helper_vfp_fcvtsd(cpu_F0s, cpu_F0d, cpu_env);
@@ -4617,8 +4770,22 @@ static const uint8_t neon_3r_sizes[] = {
 #define NEON_2RM_VMOVN 36 /* Includes VQMOVN, VQMOVUN */
 #define NEON_2RM_VQMOVN 37 /* Includes VQMOVUN */
 #define NEON_2RM_VSHLL 38
+#define NEON_2RM_VRINTN 40
+#define NEON_2RM_VRINTX 41
+#define NEON_2RM_VRINTA 42
+#define NEON_2RM_VRINTZ 43
 #define NEON_2RM_VCVT_F16_F32 44
+#define NEON_2RM_VRINTM 45
 #define NEON_2RM_VCVT_F32_F16 46
+#define NEON_2RM_VRINTP 47
+#define NEON_2RM_VCVTAU 48
+#define NEON_2RM_VCVTAS 49
+#define NEON_2RM_VCVTNU 50
+#define NEON_2RM_VCVTNS 51
+#define NEON_2RM_VCVTPU 52
+#define NEON_2RM_VCVTPS 53
+#define NEON_2RM_VCVTMU 54
+#define NEON_2RM_VCVTMS 55
 #define NEON_2RM_VRECPE 56
 #define NEON_2RM_VRSQRTE 57
 #define NEON_2RM_VRECPE_F 58
@@ -4632,6 +4799,9 @@ static int neon_2rm_is_float_op(int op)
 {
     /* Return true if this neon 2reg-misc op is float-to-float */
     return (op == NEON_2RM_VABS_F || op == NEON_2RM_VNEG_F ||
+            (op >= NEON_2RM_VRINTN && op <= NEON_2RM_VRINTZ) ||
+            op == NEON_2RM_VRINTM ||
+            (op >= NEON_2RM_VRINTP && op <= NEON_2RM_VCVTMS) ||
             op >= NEON_2RM_VRECPE_F);
 }
 
@@ -4676,8 +4846,22 @@ static const uint8_t neon_2rm_sizes[] = {
     [NEON_2RM_VMOVN] = 0x7,
     [NEON_2RM_VQMOVN] = 0x7,
     [NEON_2RM_VSHLL] = 0x7,
+    [NEON_2RM_VRINTN] = 0x4,
+    [NEON_2RM_VRINTX] = 0x4,
+    [NEON_2RM_VRINTA] = 0x4,
+    [NEON_2RM_VRINTZ] = 0x4,
     [NEON_2RM_VCVT_F16_F32] = 0x2,
+    [NEON_2RM_VRINTM] = 0x4,
     [NEON_2RM_VCVT_F32_F16] = 0x2,
+    [NEON_2RM_VRINTP] = 0x4,
+    [NEON_2RM_VCVTAU] = 0x4,
+    [NEON_2RM_VCVTAS] = 0x4,
+    [NEON_2RM_VCVTNU] = 0x4,
+    [NEON_2RM_VCVTNS] = 0x4,
+    [NEON_2RM_VCVTPU] = 0x4,
+    [NEON_2RM_VCVTPS] = 0x4,
+    [NEON_2RM_VCVTMU] = 0x4,
+    [NEON_2RM_VCVTMS] = 0x4,
     [NEON_2RM_VRECPE] = 0x4,
     [NEON_2RM_VRSQRTE] = 0x4,
     [NEON_2RM_VRECPE_F] = 0x4,
@@ -6388,6 +6572,73 @@ static int disas_neon_data_insn(CPUARMState * env, DisasContext *s, uint32_t ins
                             }
                             neon_store_reg(rm, pass, tmp2);
                             break;
+                        case NEON_2RM_VRINTN:
+                        case NEON_2RM_VRINTA:
+                        case NEON_2RM_VRINTM:
+                        case NEON_2RM_VRINTP:
+                        case NEON_2RM_VRINTZ:
+                        {
+                            TCGv_i32 tcg_rmode;
+                            TCGv_ptr fpstatus = get_fpstatus_ptr(1);
+                            int rmode;
+
+                            if (op == NEON_2RM_VRINTZ) {
+                                rmode = FPROUNDING_ZERO;
+                            } else {
+                                rmode = fp_decode_rm[((op & 0x6) >> 1) ^ 1];
+                            }
+
+                            tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rmode));
+                            gen_helper_set_neon_rmode(tcg_rmode, tcg_rmode,
+                                                      cpu_env);
+                            gen_helper_rints(cpu_F0s, cpu_F0s, fpstatus);
+                            gen_helper_set_neon_rmode(tcg_rmode, tcg_rmode,
+                                                      cpu_env);
+                            tcg_temp_free_ptr(fpstatus);
+                            tcg_temp_free_i32(tcg_rmode);
+                            break;
+                        }
+                        case NEON_2RM_VRINTX:
+                        {
+                            TCGv_ptr fpstatus = get_fpstatus_ptr(1);
+                            gen_helper_rints_exact(cpu_F0s, cpu_F0s, fpstatus);
+                            tcg_temp_free_ptr(fpstatus);
+                            break;
+                        }
+                        case NEON_2RM_VCVTAU:
+                        case NEON_2RM_VCVTAS:
+                        case NEON_2RM_VCVTNU:
+                        case NEON_2RM_VCVTNS:
+                        case NEON_2RM_VCVTPU:
+                        case NEON_2RM_VCVTPS:
+                        case NEON_2RM_VCVTMU:
+                        case NEON_2RM_VCVTMS:
+                        {
+                            bool is_signed = !extract32(insn, 7, 1);
+                            TCGv_ptr fpst = get_fpstatus_ptr(1);
+                            TCGv_i32 tcg_rmode, tcg_shift;
+                            int rmode = fp_decode_rm[extract32(insn, 8, 2)];
+
+                            tcg_shift = tcg_const_i32(0);
+                            tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rmode));
+                            gen_helper_set_neon_rmode(tcg_rmode, tcg_rmode,
+                                                      cpu_env);
+
+                            if (is_signed) {
+                                gen_helper_vfp_tosls(cpu_F0s, cpu_F0s,
+                                                     tcg_shift, fpst);
+                            } else {
+                                gen_helper_vfp_touls(cpu_F0s, cpu_F0s,
+                                                     tcg_shift, fpst);
+                            }
+
+                            gen_helper_set_neon_rmode(tcg_rmode, tcg_rmode,
+                                                      cpu_env);
+                            tcg_temp_free_i32(tcg_rmode);
+                            tcg_temp_free_i32(tcg_shift);
+                            tcg_temp_free_ptr(fpst);
+                            break;
+                        }
                         case NEON_2RM_VRECPE:
                             gen_helper_recpe_u32(tmp, tmp, cpu_env);
                             break;
diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 495b901080..5d4cf9386e 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -99,18 +99,31 @@ static const int tcg_target_call_oarg_regs[] = {
 # define TCG_REG_L1 TCG_REG_EDX
 #endif
 
+/* The host compiler should supply <cpuid.h> to enable runtime feature
+   detection, as we're not going to go so far as our own inline assembly.
+   If not available, default values will be assumed.  */
+#if defined(CONFIG_CPUID_H)
+#include <cpuid.h>
+#endif
+
 /* For 32-bit, we are going to attempt to determine at runtime whether cmov
-   is available.  However, the host compiler must supply <cpuid.h>, as we're
-   not going to go so far as our own inline assembly.  */
+   is available.  */
 #if TCG_TARGET_REG_BITS == 64
 # define have_cmov 1
 #elif defined(CONFIG_CPUID_H)
-#include <cpuid.h>
 static bool have_cmov;
 #else
 # define have_cmov 0
 #endif
 
+/* If bit_MOVBE is defined in cpuid.h (added in GCC version 4.6), we are
+   going to attempt to determine at runtime whether movbe is available.  */
+#if defined(CONFIG_CPUID_H) && defined(bit_MOVBE)
+static bool have_movbe;
+#else
+# define have_movbe 0
+#endif
+
 static uint8_t *tb_ret_addr;
 
 static void patch_reloc(uint8_t *code_ptr, int type,
@@ -240,13 +253,14 @@ static inline int tcg_target_const_match(tcg_target_long val,
 #endif
 
 #define P_EXT		0x100		/* 0x0f opcode prefix */
-#define P_DATA16	0x200		/* 0x66 opcode prefix */
+#define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
+#define P_DATA16        0x400           /* 0x66 opcode prefix */
 #if TCG_TARGET_REG_BITS == 64
-# define P_ADDR32	0x400		/* 0x67 opcode prefix */
-# define P_REXW		0x800		/* Set REX.W = 1 */
-# define P_REXB_R	0x1000		/* REG field as byte register */
-# define P_REXB_RM	0x2000		/* R/M field as byte register */
-# define P_GS           0x4000          /* gs segment override */
+# define P_ADDR32       0x800           /* 0x67 opcode prefix */
+# define P_REXW         0x1000          /* Set REX.W = 1 */
+# define P_REXB_R       0x2000          /* REG field as byte register */
+# define P_REXB_RM      0x4000          /* R/M field as byte register */
+# define P_GS           0x8000          /* gs segment override */
 #else
 # define P_ADDR32	0
 # define P_REXW		0
@@ -279,6 +293,8 @@ static inline int tcg_target_const_match(tcg_target_long val,
 #define OPC_MOVB_EvIz   (0xc6)
 #define OPC_MOVL_EvIz	(0xc7)
 #define OPC_MOVL_Iv     (0xb8)
+#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
+#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
 #define OPC_MOVSBL	(0xbe | P_EXT)
 #define OPC_MOVSWL	(0xbf | P_EXT)
 #define OPC_MOVSLQ	(0x63 | P_REXW)
@@ -381,7 +397,7 @@ static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
     }
 
     rex = 0;
-    rex |= (opc & P_REXW) >> 8;		/* REX.W */
+    rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
     rex |= (r & 8) >> 1;		/* REX.R */
     rex |= (x & 8) >> 2;		/* REX.X */
     rex |= (rm & 8) >> 3;		/* REX.B */
@@ -398,9 +414,13 @@ static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
         tcg_out8(s, (uint8_t)(rex | 0x40));
     }
 
-    if (opc & P_EXT) {
+    if (opc & (P_EXT | P_EXT38)) {
         tcg_out8(s, 0x0f);
+        if (opc & P_EXT38) {
+            tcg_out8(s, 0x38);
+        }
     }
+
     tcg_out8(s, opc);
 }
 #else
@@ -409,8 +429,11 @@ static void tcg_out_opc(TCGContext *s, int opc)
     if (opc & P_DATA16) {
         tcg_out8(s, 0x66);
     }
-    if (opc & P_EXT) {
+    if (opc & (P_EXT | P_EXT38)) {
         tcg_out8(s, 0x0f);
+        if (opc & P_EXT38) {
+            tcg_out8(s, 0x38);
+        }
     }
     tcg_out8(s, opc);
 }
@@ -1336,7 +1359,14 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                                    TCGReg base, intptr_t ofs, int seg,
                                    TCGMemOp memop)
 {
-    const TCGMemOp bswap = memop & MO_BSWAP;
+    const TCGMemOp real_bswap = memop & MO_BSWAP;
+    TCGMemOp bswap = real_bswap;
+    int movop = OPC_MOVL_GvEv;
+
+    if (have_movbe && real_bswap) {
+        bswap = 0;
+        movop = OPC_MOVBE_GyMy;
+    }
 
     switch (memop & MO_SSIZE) {
     case MO_UB:
@@ -1347,14 +1377,19 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
         break;
     case MO_UW:
         tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
-        if (bswap) {
+        if (real_bswap) {
             tcg_out_rolw_8(s, datalo);
         }
         break;
     case MO_SW:
-        if (bswap) {
-            tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
-            tcg_out_rolw_8(s, datalo);
+        if (real_bswap) {
+            if (have_movbe) {
+                tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
+                                     datalo, base, ofs);
+            } else {
+                tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
+                tcg_out_rolw_8(s, datalo);
+            }
             tcg_out_modrm(s, OPC_MOVSWL + P_REXW, datalo, datalo);
         } else {
             tcg_out_modrm_offset(s, OPC_MOVSWL + P_REXW + seg,
@@ -1362,16 +1397,18 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
         }
         break;
     case MO_UL:
-        tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, datalo, base, ofs);
+        tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
         if (bswap) {
             tcg_out_bswap32(s, datalo);
         }
         break;
 #if TCG_TARGET_REG_BITS == 64
     case MO_SL:
-        if (bswap) {
-            tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, datalo, base, ofs);
-            tcg_out_bswap32(s, datalo);
+        if (real_bswap) {
+            tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
+            if (bswap) {
+                tcg_out_bswap32(s, datalo);
+            }
             tcg_out_ext32s(s, datalo, datalo);
         } else {
             tcg_out_modrm_offset(s, OPC_MOVSLQ + seg, datalo, base, ofs);
@@ -1380,27 +1417,22 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
 #endif
     case MO_Q:
         if (TCG_TARGET_REG_BITS == 64) {
-            tcg_out_modrm_offset(s, OPC_MOVL_GvEv + P_REXW + seg,
-                                 datalo, base, ofs);
+            tcg_out_modrm_offset(s, movop + P_REXW + seg, datalo, base, ofs);
             if (bswap) {
                 tcg_out_bswap64(s, datalo);
             }
         } else {
-            if (bswap) {
+            if (real_bswap) {
                 int t = datalo;
                 datalo = datahi;
                 datahi = t;
             }
             if (base != datalo) {
-                tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
-                                     datalo, base, ofs);
-                tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
-                                     datahi, base, ofs + 4);
+                tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
+                tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs + 4);
             } else {
-                tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
-                                     datahi, base, ofs + 4);
-                tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
-                                     datalo, base, ofs);
+                tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs + 4);
+                tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
             }
             if (bswap) {
                 tcg_out_bswap32(s, datalo);
@@ -1476,13 +1508,19 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                                    TCGReg base, intptr_t ofs, int seg,
                                    TCGMemOp memop)
 {
-    const TCGMemOp bswap = memop & MO_BSWAP;
-
     /* ??? Ideally we wouldn't need a scratch register.  For user-only,
        we could perform the bswap twice to restore the original value
        instead of moving to the scratch.  But as it is, the L constraint
        means that TCG_REG_L0 is definitely free here.  */
     const TCGReg scratch = TCG_REG_L0;
+    const TCGMemOp real_bswap = memop & MO_BSWAP;
+    TCGMemOp bswap = real_bswap;
+    int movop = OPC_MOVL_EvGv;
+
+    if (have_movbe && real_bswap) {
+        bswap = 0;
+        movop = OPC_MOVBE_MyGy;
+    }
 
     switch (memop & MO_SIZE) {
     case MO_8:
@@ -1501,8 +1539,7 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
             tcg_out_rolw_8(s, scratch);
             datalo = scratch;
         }
-        tcg_out_modrm_offset(s, OPC_MOVL_EvGv + P_DATA16 + seg,
-                             datalo, base, ofs);
+        tcg_out_modrm_offset(s, movop + P_DATA16 + seg, datalo, base, ofs);
         break;
     case MO_32:
         if (bswap) {
@@ -1510,7 +1547,7 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
             tcg_out_bswap32(s, scratch);
             datalo = scratch;
         }
-        tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, datalo, base, ofs);
+        tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
         break;
     case MO_64:
         if (TCG_TARGET_REG_BITS == 64) {
@@ -1519,8 +1556,7 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                 tcg_out_bswap64(s, scratch);
                 datalo = scratch;
             }
-            tcg_out_modrm_offset(s, OPC_MOVL_EvGv + P_REXW + seg,
-                                 datalo, base, ofs);
+            tcg_out_modrm_offset(s, movop + P_REXW + seg, datalo, base, ofs);
         } else if (bswap) {
             tcg_out_mov(s, TCG_TYPE_I32, scratch, datahi);
             tcg_out_bswap32(s, scratch);
@@ -1529,8 +1565,13 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
             tcg_out_bswap32(s, scratch);
             tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, scratch, base, ofs+4);
         } else {
-            tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, datalo, base, ofs);
-            tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, datahi, base, ofs+4);
+            if (real_bswap) {
+                int t = datalo;
+                datalo = datahi;
+                datahi = t;
+            }
+            tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
+            tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs+4);
         }
         break;
     default:
@@ -1985,9 +2026,7 @@ static const TCGTargetOpDef x86_op_defs[] = {
     { INDEX_op_setcond_i32, { "q", "r", "ri" } },
 
     { INDEX_op_deposit_i32, { "Q", "0", "Q" } },
-#if TCG_TARGET_HAS_movcond_i32
     { INDEX_op_movcond_i32, { "r", "r", "ri", "r", "0" } },
-#endif
 
     { INDEX_op_mulu2_i32, { "a", "d", "a", "r" } },
     { INDEX_op_muls2_i32, { "a", "d", "a", "r" } },
@@ -2157,13 +2196,23 @@ static void tcg_target_qemu_prologue(TCGContext *s)
 
 static void tcg_target_init(TCGContext *s)
 {
-    /* For 32-bit, 99% certainty that we're running on hardware that supports
-       cmov, but we still need to check.  In case cmov is not available, we'll
-       use a small forward branch.  */
-#ifndef have_cmov
+#if !(defined(have_cmov) && defined(have_movbe))
     {
         unsigned a, b, c, d;
-        have_cmov = (__get_cpuid(1, &a, &b, &c, &d) && (d & bit_CMOV));
+        int ret = __get_cpuid(1, &a, &b, &c, &d);
+
+# ifndef have_cmov
+        /* For 32-bit, 99% certainty that we're running on hardware that
+           supports cmov, but we still need to check.  In case cmov is not
+           available, we'll use a small forward branch.  */
+        have_cmov = ret && (d & bit_CMOV);
+# endif
+
+# ifndef have_movbe
+        /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
+           need to probe for it.  */
+        have_movbe = ret && (c & bit_MOVBE);
+# endif
     }
 #endif
 
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 712438ced8..acd02b99b6 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -586,7 +586,7 @@ static void tcg_temp_free_internal(int idx)
     assert(ts->temp_allocated != 0);
     ts->temp_allocated = 0;
 
-    k = ts->type + (ts->temp_local ? TCG_TYPE_COUNT : 0);
+    k = ts->base_type + (ts->temp_local ? TCG_TYPE_COUNT : 0);
     set_bit(idx, s->free_temps[k].l);
 }
 
diff --git a/tests/acpi-test-data/pc/APIC b/tests/acpi-test-data/pc/APIC
new file mode 100644
index 0000000000..84509e0ae4
--- /dev/null
+++ b/tests/acpi-test-data/pc/APIC
Binary files differ
diff --git a/tests/acpi-test-data/pc/DSDT b/tests/acpi-test-data/pc/DSDT
new file mode 100644
index 0000000000..fbf1c3e6e8
--- /dev/null
+++ b/tests/acpi-test-data/pc/DSDT
Binary files differ
diff --git a/tests/acpi-test-data/pc/FACP b/tests/acpi-test-data/pc/FACP
new file mode 100644
index 0000000000..0639999ed1
--- /dev/null
+++ b/tests/acpi-test-data/pc/FACP
Binary files differ
diff --git a/tests/acpi-test-data/pc/FACS b/tests/acpi-test-data/pc/FACS
new file mode 100644
index 0000000000..fc67ecc407
--- /dev/null
+++ b/tests/acpi-test-data/pc/FACS
Binary files differ
diff --git a/tests/acpi-test-data/pc/HPET b/tests/acpi-test-data/pc/HPET
new file mode 100644
index 0000000000..df689b8f99
--- /dev/null
+++ b/tests/acpi-test-data/pc/HPET
Binary files differ
diff --git a/tests/acpi-test-data/pc/SSDT b/tests/acpi-test-data/pc/SSDT
new file mode 100644
index 0000000000..a51c68e21b
--- /dev/null
+++ b/tests/acpi-test-data/pc/SSDT
Binary files differ
diff --git a/tests/acpi-test-data/q35/APIC b/tests/acpi-test-data/q35/APIC
new file mode 100644
index 0000000000..84509e0ae4
--- /dev/null
+++ b/tests/acpi-test-data/q35/APIC
Binary files differ
diff --git a/tests/acpi-test-data/q35/DSDT b/tests/acpi-test-data/q35/DSDT
new file mode 100644
index 0000000000..5086b839a6
--- /dev/null
+++ b/tests/acpi-test-data/q35/DSDT
Binary files differ
diff --git a/tests/acpi-test-data/q35/FACP b/tests/acpi-test-data/q35/FACP
new file mode 100644
index 0000000000..19f3ac3ce6
--- /dev/null
+++ b/tests/acpi-test-data/q35/FACP
Binary files differ
diff --git a/tests/acpi-test-data/q35/FACS b/tests/acpi-test-data/q35/FACS
new file mode 100644
index 0000000000..fc67ecc407
--- /dev/null
+++ b/tests/acpi-test-data/q35/FACS
Binary files differ
diff --git a/tests/acpi-test-data/q35/HPET b/tests/acpi-test-data/q35/HPET
new file mode 100644
index 0000000000..df689b8f99
--- /dev/null
+++ b/tests/acpi-test-data/q35/HPET
Binary files differ
diff --git a/tests/acpi-test-data/q35/MCFG b/tests/acpi-test-data/q35/MCFG
new file mode 100644
index 0000000000..79ceb27a03
--- /dev/null
+++ b/tests/acpi-test-data/q35/MCFG
Binary files differ
diff --git a/tests/acpi-test-data/q35/SSDT b/tests/acpi-test-data/q35/SSDT
new file mode 100644
index 0000000000..9c6cad8b0b
--- /dev/null
+++ b/tests/acpi-test-data/q35/SSDT
Binary files differ
diff --git a/tests/acpi-test-data/rebuild-expected-aml.sh b/tests/acpi-test-data/rebuild-expected-aml.sh
new file mode 100755
index 0000000000..ab98498884
--- /dev/null
+++ b/tests/acpi-test-data/rebuild-expected-aml.sh
@@ -0,0 +1,36 @@
+#! /bin/bash
+
+#
+# Rebuild expected AML files for acpi unit-test
+#
+# Copyright (c) 2013 Red Hat Inc.
+#
+# Authors:
+#  Marcel Apfelbaum <marcel.a@redhat.com>
+#
+# This work is licensed under the terms of the GNU GPLv2.
+# See the COPYING.LIB file in the top-level directory.
+
+qemu=
+
+if [ -e x86_64-softmmu/qemu-system-x86_64 ]; then
+    qemu="x86_64-softmmu/qemu-system-x86_64"
+elif [ -e i386-softmmu/qemu-system-i386 ]; then
+    qemu="i386-softmmu/qemu-system-i386"
+else
+    echo "Run 'make' to build the qemu executable!"
+    echo "Run this script from the build directory."
+    exit 1;
+fi
+
+if [ ! -e "tests/acpi-test" ]; then
+    echo "Test: acpi-test is required! Run make check before this script."
+    echo "Run this script from the build directory."
+    exit 1;
+fi
+
+TEST_ACPI_REBUILD_AML=y QTEST_QEMU_BINARY=$qemu tests/acpi-test
+
+echo "The files were rebuilt and can be added to git."
+echo "However, if new files were created, please copy them manually" \
+     "to tests/acpi-test-data/pc/ or tests/acpi-test-data/q35/ ."
diff --git a/tests/acpi-test.c b/tests/acpi-test.c
index df1af83158..31f5359787 100644
--- a/tests/acpi-test.c
+++ b/tests/acpi-test.c
@@ -13,19 +13,32 @@
 #include <string.h>
 #include <stdio.h>
 #include <glib.h>
+#include <glib/gstdio.h>
 #include "qemu-common.h"
 #include "libqtest.h"
 #include "qemu/compiler.h"
 #include "hw/i386/acpi-defs.h"
 
+#define MACHINE_PC "pc"
+#define MACHINE_Q35 "q35"
+
+#define ACPI_REBUILD_EXPECTED_AML "TEST_ACPI_REBUILD_AML"
+#define ACPI_SSDT_SIGNATURE 0x54445353 /* SSDT */
+
 /* DSDT and SSDTs format */
 typedef struct {
     AcpiTableHeader header;
-    uint8_t *aml;
-    int aml_len;
-} AcpiSdtTable;
+    gchar *aml;            /* aml bytecode from guest */
+    gsize aml_len;
+    gchar *aml_file;
+    gchar *asl;            /* asl code generated from aml */
+    gsize asl_len;
+    gchar *asl_file;
+    bool asl_file_retain;   /* do not delete the temp asl */
+} QEMU_PACKED AcpiSdtTable;
 
 typedef struct {
+    const char *machine;
     uint32_t rsdp_addr;
     AcpiRsdpDescriptor rsdp_table;
     AcpiRsdtDescriptorRev1 rsdt_table;
@@ -33,8 +46,7 @@ typedef struct {
     AcpiFacsDescriptorRev1 facs_table;
     uint32_t *rsdt_tables_addr;
     int rsdt_tables_nr;
-    AcpiSdtTable dsdt_table;
-    GArray *ssdt_tables;
+    GArray *tables;
 } test_data;
 
 #define LOW(x) ((x) & 0xff)
@@ -51,13 +63,13 @@ typedef struct {
             field = readb(addr);               \
             break;                             \
         case 2:                                \
-            field = le16_to_cpu(readw(addr));  \
+            field = readw(addr);               \
             break;                             \
         case 4:                                \
-            field = le32_to_cpu(readl(addr));  \
+            field = readl(addr);               \
             break;                             \
         case 8:                                \
-            field = le64_to_cpu(readq(addr));  \
+            field = readq(addr);               \
             break;                             \
         default:                               \
             g_assert(false);                   \
@@ -91,8 +103,10 @@ typedef struct {
 
 /* Boot sector code: write SIGNATURE into memory,
  * then halt.
+ * Q35 machine requires a minimum 0x7e000 bytes disk.
+ * (bug or feature?)
  */
-static uint8_t boot_sector[0x200] = {
+static uint8_t boot_sector[0x7e000] = {
     /* 7c00: mov $0xdead,%ax */
     [0x00] = 0xb8,
     [0x01] = LOW(SIGNATURE),
@@ -117,17 +131,45 @@ static uint8_t boot_sector[0x200] = {
 };
 
 static const char *disk = "tests/acpi-test-disk.raw";
+static const char *data_dir = "tests/acpi-test-data";
+#ifdef CONFIG_IASL
+static const char *iasl = stringify(CONFIG_IASL);
+#else
+static const char *iasl;
+#endif
 
 static void free_test_data(test_data *data)
 {
+    AcpiSdtTable *temp;
     int i;
 
-    g_free(data->rsdt_tables_addr);
-    for (i = 0; i < data->ssdt_tables->len; ++i) {
-        g_free(g_array_index(data->ssdt_tables, AcpiSdtTable, i).aml);
+    if (data->rsdt_tables_addr) {
+        g_free(data->rsdt_tables_addr);
+    }
+
+    for (i = 0; i < data->tables->len; ++i) {
+        temp = &g_array_index(data->tables, AcpiSdtTable, i);
+        if (temp->aml) {
+            g_free(temp->aml);
+        }
+        if (temp->aml_file) {
+            if (g_strstr_len(temp->aml_file, -1, "aml-")) {
+                unlink(temp->aml_file);
+            }
+            g_free(temp->aml_file);
+        }
+        if (temp->asl) {
+            g_free(temp->asl);
+        }
+        if (temp->asl_file) {
+            if (!temp->asl_file_retain) {
+                unlink(temp->asl_file);
+            }
+            g_free(temp->asl_file);
+        }
     }
-    g_array_free(data->ssdt_tables, false);
-    g_free(data->dsdt_table.aml);
+
+    g_array_free(data->tables, true); /* free element storage too */
 }
 
 static uint8_t acpi_checksum(const uint8_t *data, int len)
@@ -292,34 +334,219 @@ static void test_dst_table(AcpiSdtTable *sdt_table, uint32_t addr)
     ACPI_READ_ARRAY_PTR(sdt_table->aml, sdt_table->aml_len, addr);
 
     checksum = acpi_checksum((uint8_t *)sdt_table, sizeof(AcpiTableHeader)) +
-               acpi_checksum(sdt_table->aml, sdt_table->aml_len);
+               acpi_checksum((uint8_t *)sdt_table->aml, sdt_table->aml_len);
     g_assert(!checksum);
 }
 
 static void test_acpi_dsdt_table(test_data *data)
 {
-    AcpiSdtTable *dsdt_table = &data->dsdt_table;
+    AcpiSdtTable dsdt_table;
     uint32_t addr = data->fadt_table.dsdt;
 
-    test_dst_table(dsdt_table, addr);
-    g_assert_cmphex(dsdt_table->header.signature, ==, ACPI_DSDT_SIGNATURE);
+    memset(&dsdt_table, 0, sizeof(dsdt_table));
+    data->tables = g_array_new(false, true, sizeof(AcpiSdtTable));
+
+    test_dst_table(&dsdt_table, addr);
+    g_assert_cmphex(dsdt_table.header.signature, ==, ACPI_DSDT_SIGNATURE);
+
+    /* Place DSDT first */
+    g_array_append_val(data->tables, dsdt_table);
 }
 
-static void test_acpi_ssdt_tables(test_data *data)
+static void test_acpi_tables(test_data *data)
 {
-    GArray *ssdt_tables;
-    int ssdt_tables_nr = data->rsdt_tables_nr - 1; /* fadt is first */
+    int tables_nr = data->rsdt_tables_nr - 1; /* fadt is first */
     int i;
 
-    ssdt_tables = g_array_sized_new(false, true, sizeof(AcpiSdtTable),
-                                    ssdt_tables_nr);
-    for (i = 0; i < ssdt_tables_nr; i++) {
+    for (i = 0; i < tables_nr; i++) {
         AcpiSdtTable ssdt_table;
+
+        memset(&ssdt_table, 0 , sizeof(ssdt_table));
         uint32_t addr = data->rsdt_tables_addr[i + 1]; /* fadt is first */
         test_dst_table(&ssdt_table, addr);
-        g_array_append_val(ssdt_tables, ssdt_table);
+        g_array_append_val(data->tables, ssdt_table);
+    }
+}
+
+static void dump_aml_files(test_data *data, bool rebuild)
+{
+    AcpiSdtTable *sdt;
+    GError *error = NULL;
+    gchar *aml_file = NULL;
+    gint fd;
+    ssize_t ret;
+    int i;
+
+    for (i = 0; i < data->tables->len; ++i) {
+        sdt = &g_array_index(data->tables, AcpiSdtTable, i);
+        g_assert(sdt->aml);
+
+        if (rebuild) {
+            aml_file = g_strdup_printf("%s/%s/%.4s", data_dir, data->machine,
+                                       (gchar *)&sdt->header.signature);
+            fd = g_open(aml_file, O_WRONLY|O_TRUNC|O_CREAT,
+                        S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH);
+        } else {
+            fd = g_file_open_tmp("aml-XXXXXX", &sdt->aml_file, &error);
+            g_assert_no_error(error);
+        }
+        g_assert(fd >= 0);
+
+        ret = qemu_write_full(fd, sdt, sizeof(AcpiTableHeader));
+        g_assert(ret == sizeof(AcpiTableHeader));
+        ret = qemu_write_full(fd, sdt->aml, sdt->aml_len);
+        g_assert(ret == sdt->aml_len);
+
+        close(fd);
+
+        if (aml_file) {
+            g_free(aml_file);
+        }
+    }
+}
+
+static bool compare_signature(AcpiSdtTable *sdt, uint32_t signature)
+{
+   return sdt->header.signature == signature;
+}
+
+static void load_asl(GArray *sdts, AcpiSdtTable *sdt)
+{
+    AcpiSdtTable *temp;
+    GError *error = NULL;
+    GString *command_line = g_string_new(iasl);
+    gint fd;
+    gchar *out, *out_err;
+    gboolean ret;
+    int i;
+
+    fd = g_file_open_tmp("asl-XXXXXX.dsl", &sdt->asl_file, &error);
+    g_assert_no_error(error);
+    close(fd);
+
+    /* build command line */
+    g_string_append_printf(command_line, " -p %s ", sdt->asl_file);
+    if (compare_signature(sdt, ACPI_DSDT_SIGNATURE) ||
+        compare_signature(sdt, ACPI_SSDT_SIGNATURE)) {
+        for (i = 0; i < sdts->len; ++i) {
+            temp = &g_array_index(sdts, AcpiSdtTable, i);
+            if (compare_signature(temp, ACPI_DSDT_SIGNATURE) ||
+                compare_signature(temp, ACPI_SSDT_SIGNATURE)) {
+                g_string_append_printf(command_line, "-e %s ", temp->aml_file);
+            }
+        }
     }
-    data->ssdt_tables = ssdt_tables;
+    g_string_append_printf(command_line, "-d %s", sdt->aml_file);
+
+    /* pass 'out' and 'out_err' in order to be redirected */
+    g_spawn_command_line_sync(command_line->str, &out, &out_err, NULL, &error);
+    g_assert_no_error(error);
+
+    ret = g_file_get_contents(sdt->asl_file, (gchar **)&sdt->asl,
+                              &sdt->asl_len, &error);
+    g_assert(ret);
+    g_assert_no_error(error);
+    g_assert(sdt->asl_len);
+
+    g_free(out);
+    g_free(out_err);
+    g_string_free(command_line, true);
+}
+
+#define COMMENT_END "*/"
+#define DEF_BLOCK "DefinitionBlock ("
+#define BLOCK_NAME_END ".aml"
+
+static GString *normalize_asl(gchar *asl_code)
+{
+    GString *asl = g_string_new(asl_code);
+    gchar *comment, *block_name;
+
+    /* strip comments (different generation days) */
+    comment = g_strstr_len(asl->str, asl->len, COMMENT_END);
+    if (comment) {
+        asl = g_string_erase(asl, 0, comment + sizeof(COMMENT_END) - asl->str);
+    }
+
+    /* strip def block name (it has file path in it) */
+    if (g_str_has_prefix(asl->str, DEF_BLOCK)) {
+        block_name = g_strstr_len(asl->str, asl->len, BLOCK_NAME_END);
+        g_assert(block_name);
+        asl = g_string_erase(asl, 0,
+                             block_name + sizeof(BLOCK_NAME_END) - asl->str);
+    }
+
+    return asl;
+}
+
+static GArray *load_expected_aml(test_data *data)
+{
+    int i;
+    AcpiSdtTable *sdt;
+    gchar *aml_file;
+    GError *error = NULL;
+    gboolean ret;
+
+    GArray *exp_tables = g_array_new(false, true, sizeof(AcpiSdtTable));
+    for (i = 0; i < data->tables->len; ++i) {
+        AcpiSdtTable exp_sdt;
+        sdt = &g_array_index(data->tables, AcpiSdtTable, i);
+
+        memset(&exp_sdt, 0, sizeof(exp_sdt));
+        exp_sdt.header.signature = sdt->header.signature;
+
+        aml_file = g_strdup_printf("%s/%s/%.4s", data_dir, data->machine,
+                                   (gchar *)&exp_sdt.header.signature);
+        exp_sdt.aml_file = aml_file;
+        g_assert(g_file_test(aml_file, G_FILE_TEST_EXISTS));
+        ret = g_file_get_contents(aml_file, &exp_sdt.aml,
+                                  &exp_sdt.aml_len, &error);
+        g_assert(ret);
+        g_assert_no_error(error);
+        g_assert(exp_sdt.aml);
+        g_assert(exp_sdt.aml_len);
+
+        g_array_append_val(exp_tables, exp_sdt);
+    }
+
+    return exp_tables;
+}
+
+static void test_acpi_asl(test_data *data)
+{
+    int i;
+    AcpiSdtTable *sdt, *exp_sdt;
+    test_data exp_data;
+
+    memset(&exp_data, 0, sizeof(exp_data));
+    exp_data.tables = load_expected_aml(data);
+    dump_aml_files(data, false);
+    for (i = 0; i < data->tables->len; ++i) {
+        GString *asl, *exp_asl;
+
+        sdt = &g_array_index(data->tables, AcpiSdtTable, i);
+        exp_sdt = &g_array_index(exp_data.tables, AcpiSdtTable, i);
+
+        load_asl(data->tables, sdt);
+        asl = normalize_asl(sdt->asl);
+
+        load_asl(exp_data.tables, exp_sdt);
+        exp_asl = normalize_asl(exp_sdt->asl);
+
+        if (g_strcmp0(asl->str, exp_asl->str)) {
+            sdt->asl_file_retain = true;
+            exp_sdt->asl_file_retain = true;
+            fprintf(stderr,
+                    "acpi-test: Warning! %.4s mismatch. "
+                    "Orig asl: %s, expected asl %s.\n",
+                    (gchar *)&exp_sdt->header.signature,
+                    sdt->asl_file, exp_sdt->asl_file);
+        }
+        g_string_free(asl, true);
+        g_string_free(exp_asl, true);
+    }
+
+    free_test_data(&exp_data);
 }
 
 static void test_acpi_one(const char *params, test_data *data)
@@ -329,10 +556,14 @@ static void test_acpi_one(const char *params, test_data *data)
     uint8_t signature_high;
     uint16_t signature;
     int i;
+    const char *device = "";
+
+    if (!g_strcmp0(data->machine, MACHINE_Q35)) {
+        device = ",id=hd -device ide-hd,drive=hd";
+    }
 
-    memset(data, 0, sizeof(*data));
-    args = g_strdup_printf("-net none -display none %s %s",
-                           params ? params : "", disk);
+    args = g_strdup_printf("-net none -display none %s -drive file=%s%s,",
+                           params ? params : "", disk, device);
     qtest_start(args);
 
    /* Wait at most 1 minute */
@@ -360,7 +591,15 @@ static void test_acpi_one(const char *params, test_data *data)
     test_acpi_fadt_table(data);
     test_acpi_facs_table(data);
     test_acpi_dsdt_table(data);
-    test_acpi_ssdt_tables(data);
+    test_acpi_tables(data);
+
+    if (iasl) {
+        if (getenv(ACPI_REBUILD_EXPECTED_AML)) {
+            dump_aml_files(data, true);
+        } else {
+            test_acpi_asl(data);
+        }
+    }
 
     qtest_quit(global_qtest);
     g_free(args);
@@ -373,8 +612,14 @@ static void test_acpi_tcg(void)
     /* Supplying -machine accel argument overrides the default (qtest).
      * This is to make guest actually run.
      */
+    memset(&data, 0, sizeof(data));
+    data.machine = MACHINE_PC;
     test_acpi_one("-machine accel=tcg", &data);
+    free_test_data(&data);
 
+    memset(&data, 0, sizeof(data));
+    data.machine = MACHINE_Q35;
+    test_acpi_one("-machine q35,accel=tcg", &data);
     free_test_data(&data);
 }
 
diff --git a/trace/simple.c b/trace/simple.c
index 1e3f6914c5..57572c4905 100644
--- a/trace/simple.c
+++ b/trace/simple.c
@@ -19,6 +19,7 @@
 #include "qemu/timer.h"
 #include "trace.h"
 #include "trace/control.h"
+#include "trace/simple.h"
 
 /** Trace file header event ID */
 #define HEADER_EVENT_ID (~(uint64_t)0) /* avoids conflicting with TraceEventIDs */
@@ -39,7 +40,17 @@
  * Trace records are written out by a dedicated thread.  The thread waits for
  * records to become available, writes them out, and then waits again.
  */
+#if GLIB_CHECK_VERSION(2, 32, 0)
+static GMutex trace_lock;
+#define lock_trace_lock() g_mutex_lock(&trace_lock)
+#define unlock_trace_lock() g_mutex_unlock(&trace_lock)
+#define get_trace_lock_mutex() (&trace_lock)
+#else
 static GStaticMutex trace_lock = G_STATIC_MUTEX_INIT;
+#define lock_trace_lock() g_static_mutex_lock(&trace_lock)
+#define unlock_trace_lock() g_static_mutex_unlock(&trace_lock)
+#define get_trace_lock_mutex() g_static_mutex_get_mutex(&trace_lock)
+#endif
 
 /* g_cond_new() was deprecated in glib 2.31 but we still need to support it */
 #if GLIB_CHECK_VERSION(2, 31, 0)
@@ -139,27 +150,26 @@ static bool get_trace_record(unsigned int idx, TraceRecord **recordptr)
  */
 static void flush_trace_file(bool wait)
 {
-    g_static_mutex_lock(&trace_lock);
+    lock_trace_lock();
     trace_available = true;
     g_cond_signal(trace_available_cond);
 
     if (wait) {
-        g_cond_wait(trace_empty_cond, g_static_mutex_get_mutex(&trace_lock));
+        g_cond_wait(trace_empty_cond, get_trace_lock_mutex());
     }
 
-    g_static_mutex_unlock(&trace_lock);
+    unlock_trace_lock();
 }
 
 static void wait_for_trace_records_available(void)
 {
-    g_static_mutex_lock(&trace_lock);
+    lock_trace_lock();
     while (!(trace_available && trace_writeout_enabled)) {
         g_cond_signal(trace_empty_cond);
-        g_cond_wait(trace_available_cond,
-                    g_static_mutex_get_mutex(&trace_lock));
+        g_cond_wait(trace_available_cond, get_trace_lock_mutex());
     }
     trace_available = false;
-    g_static_mutex_unlock(&trace_lock);
+    unlock_trace_lock();
 }
 
 static gpointer writeout_thread(gpointer opaque)
diff --git a/translate-all.c b/translate-all.c
index 105c25aff3..543e1ffe77 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -289,17 +289,15 @@ static inline void map_exec(void *addr, long size)
 }
 #endif
 
-static void page_init(void)
+void page_size_init(void)
 {
     /* NOTE: we can always suppose that qemu_host_page_size >=
        TARGET_PAGE_SIZE */
 #ifdef _WIN32
-    {
-        SYSTEM_INFO system_info;
+    SYSTEM_INFO system_info;
 
-        GetSystemInfo(&system_info);
-        qemu_real_host_page_size = system_info.dwPageSize;
-    }
+    GetSystemInfo(&system_info);
+    qemu_real_host_page_size = system_info.dwPageSize;
 #else
     qemu_real_host_page_size = getpagesize();
 #endif
@@ -310,7 +308,11 @@ static void page_init(void)
         qemu_host_page_size = TARGET_PAGE_SIZE;
     }
     qemu_host_page_mask = ~(qemu_host_page_size - 1);
+}
 
+static void page_init(void)
+{
+    page_size_init();
 #if defined(CONFIG_BSD) && defined(CONFIG_USER_ONLY)
     {
 #ifdef HAVE_KINFO_GETVMMAP
diff --git a/vl.c b/vl.c
index 9c2619f271..383be1b617 100644
--- a/vl.c
+++ b/vl.c
@@ -2945,7 +2945,7 @@ int main(int argc, char **argv, char **envp)
 
     bdrv_init_with_whitelist();
 
-    autostart= 1;
+    autostart = 1;
 
     /* first pass of option parsing */
     optind = 1;
@@ -3899,8 +3899,10 @@ int main(int argc, char **argv, char **envp)
         qemu_set_log(mask);
     }
 
-    if (!trace_backend_init(trace_events, trace_file)) {
-        exit(1);
+    if (!is_daemonized()) {
+        if (!trace_backend_init(trace_events, trace_file)) {
+            exit(1);
+        }
     }
 
     /* If no data_dir is specified then try to find it relative to the
@@ -4399,6 +4401,12 @@ int main(int argc, char **argv, char **envp)
 
     os_setup_post();
 
+    if (is_daemonized()) {
+        if (!trace_backend_init(trace_events, trace_file)) {
+            exit(1);
+        }
+    }
+
     main_loop();
     bdrv_close_all();
     pause_all_vcpus();