-rw-r--r--  .gitignore                       |   1
-rw-r--r--  MAINTAINERS                      |  28
-rw-r--r--  Makefile                         |  33
-rw-r--r--  block.c                          |   5
-rw-r--r--  block/mirror.c                   |   4
-rw-r--r--  blockdev.c                       |   2
-rw-r--r--  bsd-user/elfload.c               |   2
-rw-r--r--  bsd-user/main.c                  |  12
-rw-r--r--  bsd-user/qemu.h                  |   2
-rwxr-xr-x  configure                        | 156
-rw-r--r--  cpu-exec.c                       |  10
-rw-r--r--  cpus.c                           |   2
-rw-r--r--  cputlb.c                         |  97
-rw-r--r--  disas.c                          |  12
-rw-r--r--  exec.c                           |   2
-rw-r--r--  hw/arm/xlnx-zynqmp.c             |  15
-rw-r--r--  hw/char/mcf_uart.c               |   2
-rw-r--r--  hw/net/ne2000-isa.c              |   1
-rw-r--r--  hw/net/ne2000.c                  |  10
-rw-r--r--  hw/net/ne2000.h                  |   1
-rw-r--r--  hw/net/rtl8139.c                 | 111
-rw-r--r--  hw/net/vmxnet3.c                 |   5
-rw-r--r--  hw/scsi/scsi-bus.c               |   3
-rw-r--r--  hw/scsi/scsi-disk.c              |  97
-rw-r--r--  hw/scsi/scsi-generic.c           |  66
-rw-r--r--  hw/scsi/vhost-scsi.c             |   3
-rw-r--r--  hw/scsi/virtio-scsi.c            |   5
-rw-r--r--  include/block/block.h            |   3
-rw-r--r--  include/exec/cpu-all.h           |   9
-rw-r--r--  include/exec/cpu-defs.h          |  23
-rw-r--r--  include/exec/cpu_ldst.h          |   8
-rw-r--r--  include/exec/exec-all.h          |  76
-rw-r--r--  include/hw/arm/xlnx-zynqmp.h     |   6
-rw-r--r--  include/hw/i386/apic_internal.h  |   1
-rw-r--r--  include/qemu-common.h            | 103
-rw-r--r--  include/qemu/compiler.h          |  51
-rw-r--r--  include/qemu/host-utils.h        |  29
-rw-r--r--  include/qemu/osdep.h             | 111
-rw-r--r--  include/qemu/timer.h             |   1
-rw-r--r--  include/qom/cpu.h                |   4
-rw-r--r--  include/sysemu/os-win32.h        |  18
-rw-r--r--  linux-user/elfload.c             |   2
-rw-r--r--  linux-user/main.c                |  12
-rw-r--r--  linux-user/mmap.c                |  24
-rw-r--r--  monitor.c                        |   4
-rw-r--r--  qapi/qmp-event.c                 |   8
-rw-r--r--  qemu-doc.texi                    |  66
-rw-r--r--  qemu-ga.texi                     | 137
-rw-r--r--  qemu-nbd.c                       |   1
-rw-r--r--  qga/commands-posix.c             |   6
-rw-r--r--  qga/commands-win32.c             |  81
-rw-r--r--  qga/installer/qemu-ga.wxs        |  78
-rw-r--r--  qga/main.c                       | 471
-rw-r--r--  qga/qapi-schema.json             |   2
-rw-r--r--  qom/cpu.c                        |   2
-rw-r--r--  softmmu_template.h               |   4
-rw-r--r--  target-alpha/cpu.h               |   3
-rw-r--r--  target-alpha/gdbstub.c           |   4
-rw-r--r--  target-alpha/helper.c            |  63
-rw-r--r--  target-alpha/helper.h            |   3
-rw-r--r--  target-alpha/machine.c           |   4
-rw-r--r--  target-alpha/sys_helper.c        |  22
-rw-r--r--  target-alpha/translate.c         | 205
-rw-r--r--  target-arm/cpu.h                 |   3
-rw-r--r--  target-arm/helper.c              | 489
-rw-r--r--  target-arm/op_helper.c           |   8
-rw-r--r--  target-arm/translate-a64.c       |  60
-rw-r--r--  target-arm/translate.c           |  46
-rw-r--r--  target-cris/translate.c          |   4
-rw-r--r--  target-m68k/translate.c          |   2
-rw-r--r--  target-microblaze/translate.c    |   8
-rw-r--r--  target-mips/translate.c          |   4
-rw-r--r--  target-openrisc/translate.c      |  22
-rw-r--r--  target-s390x/translate.c         |  30
-rw-r--r--  target-sh4/translate.c           |   4
-rw-r--r--  target-sparc/translate.c         |  14
-rw-r--r--  target-tricore/translate.c       |  32
-rw-r--r--  target-xtensa/translate.c        |   2
-rw-r--r--  tcg/README                       |  32
-rw-r--r--  tcg/aarch64/tcg-target.c         |  59
-rw-r--r--  tcg/aarch64/tcg-target.h         |   3
-rw-r--r--  tcg/arm/tcg-target.c             |   8
-rw-r--r--  tcg/i386/tcg-target.c            |  49
-rw-r--r--  tcg/i386/tcg-target.h            |   3
-rw-r--r--  tcg/ia64/tcg-target.c            |  29
-rw-r--r--  tcg/ia64/tcg-target.h            |   3
-rw-r--r--  tcg/mips/tcg-target.c            |  16
-rw-r--r--  tcg/optimize.c                   | 253
-rw-r--r--  tcg/ppc/tcg-target.c             |  65
-rw-r--r--  tcg/ppc/tcg-target.h             |   3
-rw-r--r--  tcg/s390/tcg-target.c            |  50
-rw-r--r--  tcg/s390/tcg-target.h            |   3
-rw-r--r--  tcg/sparc/tcg-target.c           |  36
-rw-r--r--  tcg/sparc/tcg-target.h           |   3
-rw-r--r--  tcg/tcg-op.c                     |  48
-rw-r--r--  tcg/tcg-op.h                     |  12
-rw-r--r--  tcg/tcg-opc.h                    |  10
-rw-r--r--  tcg/tcg.h                        |   3
-rw-r--r--  tcg/tci/tcg-target.c             |   4
-rw-r--r--  tcg/tci/tcg-target.h             |   3
-rw-r--r--  tci.c                            |   6
-rw-r--r--  tests/Makefile                   |   2
-rw-r--r--  tests/virtio-net-test.c          | 237
-rw-r--r--  tests/virtio-scsi-test.c         | 100
-rw-r--r--  translate-all.c                  |  13
-rw-r--r--  ui/vnc.c                         |  15
-rw-r--r--  user-exec.c                      |   4
-rw-r--r--  util/rcu.c                       |  48
108 files changed, 2796 insertions, 1384 deletions
diff --git a/.gitignore b/.gitignore
index 61bc49263a..cb4b8ec137 100644
--- a/.gitignore
+++ b/.gitignore
@@ -58,6 +58,7 @@
 *.cp
 *.dvi
 *.exe
+*.msi
 *.dll
 *.so
 *.mo
diff --git a/MAINTAINERS b/MAINTAINERS
index a059d5de3b..a4ea7c39ed 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -349,13 +349,22 @@ S: Maintained
 F: hw/*/versatile*
 
 Xilinx Zynq
-M: Peter Crosthwaite <peter.crosthwaite@xilinx.com>
+M: Alistair Francis <alistair.francis@xilinx.com>
+M: Peter Crosthwaite <crosthwaite.peter@gmail.com>
 S: Maintained
 F: hw/arm/xilinx_zynq.c
 F: hw/misc/zynq_slcr.c
 F: hw/*/cadence_*
 F: hw/ssi/xilinx_spips.c
 
+Xilinx ZynqMP
+M: Alistair Francis <alistair.francis@xilinx.com>
+M: Peter Crosthwaite <crosthwaite.peter@gmail.com>
+S: Maintained
+F: hw/arm/xlnx-zynqmp.c
+F: hw/arm/xlnx-ep108.c
+F: include/hw/arm/xlnx-zynqmp.h
+
 ARM ACPI Subsystem
 M: Shannon Zhao <zhaoshenglong@huawei.com>
 M: Shannon Zhao <shannon.zhao@linaro.org>
@@ -405,7 +414,7 @@ S: Maintained
 F: hw/microblaze/petalogix_s3adsp1800_mmu.c
 
 petalogix_ml605
-M: Peter Crosthwaite <peter.crosthwaite@xilinx.com>
+M: Edgar E. Iglesias <edgar.iglesias@gmail.com>
 S: Maintained
 F: hw/microblaze/petalogix_ml605_mmu.c
 
@@ -685,10 +694,17 @@ S: Orphan
 F: hw/scsi/lsi53c895a.c
 
 SSI
-M: Peter Crosthwaite <peter.crosthwaite@xilinx.com>
+M: Peter Crosthwaite <crosthwaite.peter@gmail.com>
 S: Maintained
 F: hw/ssi/*
 F: hw/block/m25p80.c
+X: hw/ssi/xilinx_*
+
+Xilinx SPI
+M: Alistair Francis <alistair.francis@xilinx.com>
+M: Peter Crosthwaite <crosthwaite.peter@gmail.com>
+S: Maintained
+F: hw/ssi/xilinx_*
 
 USB
 M: Gerd Hoffmann <kraxel@redhat.com>
@@ -777,8 +793,9 @@ F: hw/scsi/megasas.c
 F: hw/scsi/mfi.h
 
 Xilinx EDK
-M: Peter Crosthwaite <peter.crosthwaite@xilinx.com>
 M: Edgar E. Iglesias <edgar.iglesias@gmail.com>
+M: Alistair Francis <alistair.francis@xilinx.com>
+M: Peter Crosthwaite <crosthwaite.peter@gmail.com>
 S: Maintained
 F: hw/*/xilinx_*
 F: include/hw/xilinx.h
@@ -880,7 +897,7 @@ F: include/hw/cpu/icc_bus.h
 F: hw/cpu/icc_bus.c
 
 Device Tree
-M: Peter Crosthwaite <peter.crosthwaite@xilinx.com>
+M: Peter Crosthwaite <crosthwaite.peter@gmail.com>
 M: Alexander Graf <agraf@suse.de>
 S: Maintained
 F: device_tree.[ch]
@@ -1290,6 +1307,7 @@ F: block/dmg.c
 
 parallels
 M: Stefan Hajnoczi <stefanha@redhat.com>
+M: Denis V. Lunev <den@openvz.org>
 L: qemu-block@nongnu.org
 S: Supported
 F: block/parallels.c
diff --git a/Makefile b/Makefile
index 340d9c8faa..9ce3972d84 100644
--- a/Makefile
+++ b/Makefile
@@ -88,7 +88,8 @@ LIBS+=-lz $(LIBS_TOOLS)
 HELPERS-$(CONFIG_LINUX) = qemu-bridge-helper$(EXESUF)
 
 ifdef BUILD_DOCS
-DOCS=qemu-doc.html qemu-tech.html qemu.1 qemu-img.1 qemu-nbd.8 qmp-commands.txt
+DOCS=qemu-doc.html qemu-tech.html qemu.1 qemu-img.1 qemu-nbd.8 qemu-ga.8
+DOCS+=qmp-commands.txt
 ifdef CONFIG_LINUX
 DOCS+=kvm_stat.1
 endif
@@ -289,28 +290,27 @@ $(qapi-modules) $(SRC_PATH)/scripts/qapi-commands.py $(qapi-py)
 QGALIB_GEN=$(addprefix qga/qapi-generated/, qga-qapi-types.h qga-qapi-visit.h qga-qmp-commands.h)
 $(qga-obj-y) qemu-ga.o: $(QGALIB_GEN)
 
-qemu-ga$(EXESUF): $(qga-obj-y) libqemuutil.a libqemustub.a
-	$(call LINK, $^)
+# we require QGA_VSS_PROVIDER files to be built alongside qemu-ga
+# executable since they are shipped together, but we don't want to actually
+# link against them
+qemu-ga$(EXESUF): $(qga-obj-y) libqemuutil.a libqemustub.a $(QGA_VSS_PROVIDER)
+	$(call LINK, $(filter-out $(QGA_VSS_PROVIDER), $^))
 
 ifdef QEMU_GA_MSI_ENABLED
 QEMU_GA_MSI=qemu-ga-$(ARCH).msi
 
-msi: ${QEMU_GA_MSI}
+msi: $(QEMU_GA_MSI)
 
 $(QEMU_GA_MSI): qemu-ga.exe
 
-ifdef QEMU_GA_MSI_WITH_VSS
-$(QEMU_GA_MSI): qga/vss-win32/qga-vss.dll
-endif
-
 $(QEMU_GA_MSI): config-host.mak
 
-$(QEMU_GA_MSI):  qga/installer/qemu-ga.wxs
-	$(call quiet-command,QEMU_GA_VERSION="$(QEMU_GA_VERSION)" QEMU_GA_MANUFACTURER="$(QEMU_GA_MANUFACTURER)" QEMU_GA_DISTRO="$(QEMU_GA_DISTRO)" \
+$(QEMU_GA_MSI):  $(SRC_PATH)/qga/installer/qemu-ga.wxs
+	$(call quiet-command,QEMU_GA_VERSION="$(QEMU_GA_VERSION)" QEMU_GA_MANUFACTURER="$(QEMU_GA_MANUFACTURER)" QEMU_GA_DISTRO="$(QEMU_GA_DISTRO)" BUILD_DIR="$(BUILD_DIR)" \
 	wixl -o $@ $(QEMU_GA_MSI_ARCH) $(QEMU_GA_MSI_WITH_VSS) $(QEMU_GA_MSI_MINGW_DLL_PATH) $<, "  WIXL  $@")
 else
 msi:
-	@echo MSI build not configured or dependency resolution failed (reconfigure with --enable-guest-agent-msi option)
+	@echo "MSI build not configured or dependency resolution failed (reconfigure with --enable-guest-agent-msi option)"
 endif
 
 clean:
@@ -400,6 +400,9 @@ ifneq ($(TOOLS),)
 	$(INSTALL_DIR) "$(DESTDIR)$(mandir)/man8"
 	$(INSTALL_DATA) qemu-nbd.8 "$(DESTDIR)$(mandir)/man8"
 endif
+ifneq (,$(findstring qemu-ga,$(TOOLS)))
+	$(INSTALL_DATA) qemu-ga.8 "$(DESTDIR)$(mandir)/man8"
+endif
 endif
 ifdef CONFIG_VIRTFS
 	$(INSTALL_DIR) "$(DESTDIR)$(mandir)/man1"
@@ -538,6 +541,12 @@ qemu-nbd.8: qemu-nbd.texi
 	  $(POD2MAN) --section=8 --center=" " --release=" " qemu-nbd.pod > $@, \
 	  "  GEN   $@")
 
+qemu-ga.8: qemu-ga.texi
+	$(call quiet-command, \
+	  perl -Ww -- $(SRC_PATH)/scripts/texi2pod.pl $< qemu-ga.pod && \
+	  $(POD2MAN) --section=8 --center=" " --release=" " qemu-ga.pod > $@, \
+	  "  GEN   $@")
+
 kvm_stat.1: scripts/kvm/kvm_stat.texi
 	$(call quiet-command, \
 	  perl -Ww -- $(SRC_PATH)/scripts/texi2pod.pl $< kvm_stat.pod && \
@@ -551,7 +560,7 @@ pdf: qemu-doc.pdf qemu-tech.pdf
 
 qemu-doc.dvi qemu-doc.html qemu-doc.info qemu-doc.pdf: \
 	qemu-img.texi qemu-nbd.texi qemu-options.texi \
-	qemu-monitor.texi qemu-img-cmds.texi
+	qemu-monitor.texi qemu-img-cmds.texi qemu-ga.texi
 
 ifdef CONFIG_WIN32
 
diff --git a/block.c b/block.c
index d088ee02ff..090923c312 100644
--- a/block.c
+++ b/block.c
@@ -4077,7 +4077,8 @@ bool bdrv_is_first_non_filter(BlockDriverState *candidate)
     return false;
 }
 
-BlockDriverState *check_to_replace_node(const char *node_name, Error **errp)
+BlockDriverState *check_to_replace_node(BlockDriverState *parent_bs,
+                                        const char *node_name, Error **errp)
 {
     BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
     AioContext *aio_context;
@@ -4100,7 +4101,7 @@ BlockDriverState *check_to_replace_node(const char *node_name, Error **errp)
      * Another benefit is that this tests exclude backing files which are
      * blocked by the backing blockers.
      */
-    if (!bdrv_is_first_non_filter(to_replace_bs)) {
+    if (!bdrv_recurse_is_first_non_filter(parent_bs, to_replace_bs)) {
         error_setg(errp, "Only top most non filter can be replaced");
         to_replace_bs = NULL;
         goto out;
diff --git a/block/mirror.c b/block/mirror.c
index 94744432eb..a2589261f5 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -644,9 +644,9 @@ static void mirror_complete(BlockJob *job, Error **errp)
     if (s->replaces) {
         AioContext *replace_aio_context;
 
-        s->to_replace = check_to_replace_node(s->replaces, &local_err);
+        s->to_replace = bdrv_find_node(s->replaces);
         if (!s->to_replace) {
-            error_propagate(errp, local_err);
+            error_setg(errp, "Node name '%s' not found", s->replaces);
             return;
         }
 
diff --git a/blockdev.c b/blockdev.c
index 4125ff642a..6b48be60ba 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -2760,7 +2760,7 @@ void qmp_drive_mirror(const char *device, const char *target,
             goto out;
         }
 
-        to_replace_bs = check_to_replace_node(replaces, &local_err);
+        to_replace_bs = check_to_replace_node(bs, replaces, &local_err);
 
         if (!to_replace_bs) {
             error_propagate(errp, local_err);
diff --git a/bsd-user/elfload.c b/bsd-user/elfload.c
index 2bf57eb1fc..d067779273 100644
--- a/bsd-user/elfload.c
+++ b/bsd-user/elfload.c
@@ -1371,7 +1371,6 @@ int load_elf_binary(struct linux_binprm * bprm, struct target_pt_regs * regs,
     info->mmap = 0;
     elf_entry = (abi_ulong) elf_ex.e_entry;
 
-#if defined(CONFIG_USE_GUEST_BASE)
     /*
      * In case where user has not explicitly set the guest_base, we
      * probe here that should we set it automatically.
@@ -1392,7 +1391,6 @@ int load_elf_binary(struct linux_binprm * bprm, struct target_pt_regs * regs,
             }
         }
     }
-#endif /* CONFIG_USE_GUEST_BASE */
 
     /* Do this so that we can load the interpreter, if need be.  We will
        change some of these later */
diff --git a/bsd-user/main.c b/bsd-user/main.c
index ee68daa395..f0a1268dda 100644
--- a/bsd-user/main.c
+++ b/bsd-user/main.c
@@ -35,12 +35,10 @@
 #include "qemu/envlist.h"
 
 int singlestep;
-#if defined(CONFIG_USE_GUEST_BASE)
 unsigned long mmap_min_addr;
 unsigned long guest_base;
 int have_guest_base;
 unsigned long reserved_va;
-#endif
 
 static const char *interp_prefix = CONFIG_QEMU_INTERP_PREFIX;
 const char *qemu_uname_release;
@@ -682,9 +680,7 @@ static void usage(void)
            "-drop-ld-preload  drop LD_PRELOAD for target process\n"
            "-E var=value      sets/modifies targets environment variable(s)\n"
            "-U var            unsets targets environment variable(s)\n"
-#if defined(CONFIG_USE_GUEST_BASE)
            "-B address        set guest_base address to address\n"
-#endif
            "-bsd type         select emulated BSD type FreeBSD/NetBSD/OpenBSD (default)\n"
            "\n"
            "Debug options:\n"
@@ -830,11 +826,9 @@ int main(int argc, char **argv)
 #endif
                 exit(1);
             }
-#if defined(CONFIG_USE_GUEST_BASE)
         } else if (!strcmp(r, "B")) {
            guest_base = strtol(argv[optind++], NULL, 0);
            have_guest_base = 1;
-#endif
         } else if (!strcmp(r, "drop-ld-preload")) {
             (void) envlist_unsetenv(envlist, "LD_PRELOAD");
         } else if (!strcmp(r, "bsd")) {
@@ -923,7 +917,6 @@ int main(int argc, char **argv)
     target_environ = envlist_to_environ(envlist, NULL);
     envlist_free(envlist);
 
-#if defined(CONFIG_USE_GUEST_BASE)
     /*
      * Now that page sizes are configured in cpu_init() we can do
      * proper page alignment for guest_base.
@@ -950,7 +943,6 @@ int main(int argc, char **argv)
             fclose(fp);
         }
     }
-#endif /* CONFIG_USE_GUEST_BASE */
 
     if (loader_exec(filename, argv+optind, target_environ, regs, info) != 0) {
         printf("Error loading %s\n", filename);
@@ -964,9 +956,7 @@ int main(int argc, char **argv)
     free(target_environ);
 
     if (qemu_log_enabled()) {
-#if defined(CONFIG_USE_GUEST_BASE)
         qemu_log("guest_base  0x%lx\n", guest_base);
-#endif
         log_page_dump();
 
         qemu_log("start_brk   0x" TARGET_ABI_FMT_lx "\n", info->start_brk);
@@ -986,12 +976,10 @@ int main(int argc, char **argv)
     syscall_init();
     signal_init();
 
-#if defined(CONFIG_USE_GUEST_BASE)
     /* Now that we've loaded the binary, GUEST_BASE is fixed.  Delay
        generating the prologue until now so that the prologue can take
        the real value of GUEST_BASE into account.  */
     tcg_prologue_init(&tcg_ctx);
-#endif
 
     /* build Task State */
     memset(ts, 0, sizeof(TaskState));
diff --git a/bsd-user/qemu.h b/bsd-user/qemu.h
index 5362297fe1..21cc6023ee 100644
--- a/bsd-user/qemu.h
+++ b/bsd-user/qemu.h
@@ -101,9 +101,7 @@ typedef struct TaskState {
 
 void init_task_state(TaskState *ts);
 extern const char *qemu_uname_release;
-#if defined(CONFIG_USE_GUEST_BASE)
 extern unsigned long mmap_min_addr;
-#endif
 
 /* ??? See if we can avoid exposing so much of the loader internals.  */
 /*
diff --git a/configure b/configure
index cd219d8e3b..21c4089c5c 100755
--- a/configure
+++ b/configure
@@ -293,7 +293,6 @@ cocoa="no"
 softmmu="yes"
 linux_user="no"
 bsd_user="no"
-guest_base="yes"
 aix="no"
 blobs="yes"
 pkgversion=""
@@ -733,7 +732,7 @@ if test "$mingw32" = "yes" ; then
   sysconfdir="\${prefix}"
   local_statedir=
   confsuffix=""
-  libs_qga="-lws2_32 -lwinmm -lpowrprof -liphlpapi $libs_qga"
+  libs_qga="-lws2_32 -lwinmm -lpowrprof -liphlpapi -lnetapi32 $libs_qga"
 fi
 
 werror=""
@@ -956,7 +955,6 @@ for opt do
   ;;
   --enable-cocoa)
       cocoa="yes" ;
-      sdl="no" ;
       audio_drv_list="coreaudio `echo $audio_drv_list | sed s,coreaudio,,g`"
   ;;
   --disable-system) softmmu="no"
@@ -976,10 +974,6 @@ for opt do
   ;;
   --enable-bsd-user) bsd_user="yes"
   ;;
-  --enable-guest-base) guest_base="yes"
-  ;;
-  --disable-guest-base) guest_base="no"
-  ;;
   --enable-pie) pie="yes"
   ;;
   --disable-pie) pie="no"
@@ -1315,7 +1309,6 @@ disabled with --disable-FEATURE, default is enabled if available:
   user            supported user emulation targets
   linux-user      all linux usermode emulation targets
   bsd-user        all BSD usermode emulation targets
-  guest-base      GUEST_BASE support for usermode emulation targets
   docs            build documentation
   guest-agent     build the QEMU Guest Agent
   guest-agent-msi build guest agent Windows MSI installation package
@@ -1711,6 +1704,21 @@ else
 fi
 
 ##########################################
+# cocoa implies not SDL or GTK
+# (the cocoa UI code currently assumes it is always the active UI
+# and doesn't interact well with other UI frontend code)
+if test "$cocoa" = "yes"; then
+    if test "$sdl" = "yes"; then
+        error_exit "Cocoa and SDL UIs cannot both be enabled at once"
+    fi
+    if test "$gtk" = "yes"; then
+        error_exit "Cocoa and GTK UIs cannot both be enabled at once"
+    fi
+    gtk=no
+    sdl=no
+fi
+
+##########################################
 # L2TPV3 probe
 
 cat > $TMPC <<EOF
@@ -2287,9 +2295,7 @@ EOF
     if test "$_sdlversion" -lt 121 ; then
       sdl_too_old=yes
     else
-      if test "$cocoa" = "no" ; then
-        sdl=yes
-      fi
+      sdl=yes
     fi
 
     # static link with sdl ? (note: sdl.pc's --static --libs is broken)
@@ -3845,6 +3851,7 @@ EOF
     guest_agent_with_vss="yes"
     QEMU_CFLAGS="$QEMU_CFLAGS $vss_win32_include"
     libs_qga="-lole32 -loleaut32 -lshlwapi -luuid -lstdc++ -Wl,--enable-stdcall-fixup $libs_qga"
+    qga_vss_provider="qga/vss-win32/qga-vss.dll qga/vss-win32/qga-vss.tlb"
   else
     if test "$vss_win32_sdk" != "" ; then
       echo "ERROR: Please download and install Microsoft VSS SDK:"
@@ -3899,58 +3906,6 @@ EOF
 fi
 
 ##########################################
-# Guest agent Window MSI  package
-
-if test "$guest_agent" != yes; then
-  if test "$guest_agent_msi" = yes; then
-    error_exit "MSI guest agent package requires guest agent enabled"
-  fi
-  guest_agent_msi=no
-elif test "$mingw32" != "yes"; then
-  if test "$guest_agent_msi" = "yes"; then
-    error_exit "MSI guest agent package is available only for MinGW Windows cross-compilation"
-  fi
-  guest_agent_msi=no
-elif ! has wixl; then
-  if test "$guest_agent_msi" = "yes"; then
-    error_exit "MSI guest agent package requires wixl tool installed ( usually from msitools package )"
-  fi
-  guest_agent_msi=no
-fi
-
-if test "$guest_agent_msi" != "no"; then
-  if test "$guest_agent_with_vss" = "yes"; then
-    QEMU_GA_MSI_WITH_VSS="-D InstallVss"
-  fi
-
-  if test "$QEMU_GA_MANUFACTURER" = ""; then
-    QEMU_GA_MANUFACTURER=QEMU
-  fi
-
-  if test "$QEMU_GA_DISTRO" = ""; then
-    QEMU_GA_DISTRO=Linux
-  fi
-
-  if test "$QEMU_GA_VERSION" = ""; then
-      QEMU_GA_VERSION=`cat $source_path/VERSION`
-  fi
-
-  QEMU_GA_MSI_MINGW_DLL_PATH="-D Mingw_dlls=`$pkg_config --variable=prefix glib-2.0`/bin"
-  
-  case "$cpu" in
-  x86_64)
-    QEMU_GA_MSI_ARCH="-a x64 -D Arch=64"
-    ;;
-  i386)
-    QEMU_GA_MSI_ARCH="-D Arch=32"
-    ;;
-  *)
-    error_exit "CPU $cpu not supported for building installation package"
-    ;;
-  esac
-fi
-
-##########################################
 # check if we have fdatasync
 
 fdatasync=no
@@ -4390,12 +4345,12 @@ if test "$softmmu" = yes ; then
     fi
   fi
 fi
+
+# Probe for guest agent support/options
+
 if [ "$guest_agent" != "no" ]; then
   if [ "$linux" = "yes" -o "$bsd" = "yes" -o "$solaris" = "yes" -o "$mingw32" = "yes" ] ; then
       tools="qemu-ga\$(EXESUF) $tools"
-      if [ "$mingw32" = "yes" -a "$guest_agent_with_vss" = "yes" ]; then
-        tools="qga/vss-win32/qga-vss.dll qga/vss-win32/qga-vss.tlb $tools"
-      fi
       guest_agent=yes
   elif [ "$guest_agent" != yes ]; then
       guest_agent=no
@@ -4404,6 +4359,63 @@ if [ "$guest_agent" != "no" ]; then
   fi
 fi
 
+# Guest agent Window MSI  package
+
+if test "$guest_agent" != yes; then
+  if test "$guest_agent_msi" = yes; then
+    error_exit "MSI guest agent package requires guest agent enabled"
+  fi
+  guest_agent_msi=no
+elif test "$mingw32" != "yes"; then
+  if test "$guest_agent_msi" = "yes"; then
+    error_exit "MSI guest agent package is available only for MinGW Windows cross-compilation"
+  fi
+  guest_agent_msi=no
+elif ! has wixl; then
+  if test "$guest_agent_msi" = "yes"; then
+    error_exit "MSI guest agent package requires wixl tool installed ( usually from msitools package )"
+  fi
+  guest_agent_msi=no
+else
+  # we support qemu-ga, mingw32, and wixl: default to MSI enabled if it wasn't
+  # disabled explicitly
+  if test "$guest_agent_msi" != "no"; then
+    guest_agent_msi=yes
+  fi
+fi
+
+if test "$guest_agent_msi" = "yes"; then
+  if test "$guest_agent_with_vss" = "yes"; then
+    QEMU_GA_MSI_WITH_VSS="-D InstallVss"
+  fi
+
+  if test "$QEMU_GA_MANUFACTURER" = ""; then
+    QEMU_GA_MANUFACTURER=QEMU
+  fi
+
+  if test "$QEMU_GA_DISTRO" = ""; then
+    QEMU_GA_DISTRO=Linux
+  fi
+
+  if test "$QEMU_GA_VERSION" = ""; then
+      QEMU_GA_VERSION=`cat $source_path/VERSION`
+  fi
+
+  QEMU_GA_MSI_MINGW_DLL_PATH="-D Mingw_dlls=`$pkg_config --variable=prefix glib-2.0`/bin"
+
+  case "$cpu" in
+  x86_64)
+    QEMU_GA_MSI_ARCH="-a x64 -D Arch=64"
+    ;;
+  i386)
+    QEMU_GA_MSI_ARCH="-D Arch=32"
+    ;;
+  *)
+    error_exit "CPU $cpu not supported for building installation package"
+    ;;
+  esac
+fi
+
 # Mac OS X ships with a broken assembler
 roms=
 if test \( "$cpu" = "i386" -o "$cpu" = "x86_64" \) -a \
@@ -4532,7 +4544,6 @@ fi
 echo "brlapi support    $brlapi"
 echo "bluez  support    $bluez"
 echo "Documentation     $docs"
-echo "GUEST_BASE        $guest_base"
 echo "PIE               $pie"
 echo "vde support       $vde"
 echo "netmap support    $netmap"
@@ -4572,6 +4583,7 @@ echo "libnfs support    $libnfs"
 echo "build guest agent $guest_agent"
 echo "QGA VSS support   $guest_agent_with_vss"
 echo "QGA w32 disk info $guest_agent_ntddscsi"
+echo "QGA MSI support   $guest_agent_msi"
 echo "seccomp support   $seccomp"
 echo "coroutine backend $coroutine"
 echo "coroutine pool    $coroutine_pool"
@@ -4646,12 +4658,13 @@ if test "$mingw32" = "yes" ; then
   echo "CONFIG_PRODUCTVERSION=$version_major,$version_minor,$version_subminor,$version_micro" >> $config_host_mak
   if test "$guest_agent_with_vss" = "yes" ; then
     echo "CONFIG_QGA_VSS=y" >> $config_host_mak
+    echo "QGA_VSS_PROVIDER=$qga_vss_provider" >> $config_host_mak
     echo "WIN_SDK=\"$win_sdk\"" >> $config_host_mak
   fi
   if test "$guest_agent_ntddscsi" = "yes" ; then
     echo "CONFIG_QGA_NTDDDISK=y" >> $config_host_mak
   fi
-  if test "$guest_agent_msi" != "no"; then
+  if test "$guest_agent_msi" = "yes"; then
     echo "QEMU_GA_MSI_ENABLED=yes" >> $config_host_mak  
     echo "QEMU_GA_MSI_MINGW_DLL_PATH=${QEMU_GA_MSI_MINGW_DLL_PATH}" >> $config_host_mak
     echo "QEMU_GA_MSI_WITH_VSS=${QEMU_GA_MSI_WITH_VSS}" >> $config_host_mak
@@ -5469,9 +5482,6 @@ fi
 if test "$target_user_only" = "yes" -a "$bflt" = "yes"; then
   echo "TARGET_HAS_BFLT=y" >> $config_target_mak
 fi
-if test "$target_user_only" = "yes" -a "$guest_base" = "yes"; then
-  echo "CONFIG_USE_GUEST_BASE=y" >> $config_target_mak
-fi
 if test "$target_bsd_user" = "yes" ; then
   echo "CONFIG_BSD_USER=y" >> $config_target_mak
 fi
@@ -5600,10 +5610,6 @@ if [ "$pixman" = "internal" ]; then
   echo "config-host.h: subdir-pixman" >> $config_host_mak
 fi
 
-if test "$rdma" = "yes" ; then
-echo "CONFIG_RDMA=y" >> $config_host_mak
-fi
-
 if [ "$dtc_internal" = "yes" ]; then
   echo "config-host.h: subdir-dtc" >> $config_host_mak
 fi
diff --git a/cpu-exec.c b/cpu-exec.c
index 75694f3bb3..713540fc8f 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -196,7 +196,7 @@ static inline tcg_target_ulong cpu_tb_exec(CPUState *cpu, uint8_t *tb_ptr)
     }
 #endif /* DEBUG_DISAS */
 
-    cpu->can_do_io = 0;
+    cpu->can_do_io = !use_icount;
     next_tb = tcg_qemu_tb_exec(env, tb_ptr);
     cpu->can_do_io = 1;
     trace_exec_tb_exit((void *) (next_tb & ~TB_EXIT_MASK),
@@ -231,19 +231,15 @@ static void cpu_exec_nocache(CPUState *cpu, int max_cycles,
                              TranslationBlock *orig_tb)
 {
     TranslationBlock *tb;
-    target_ulong pc = orig_tb->pc;
-    target_ulong cs_base = orig_tb->cs_base;
-    uint64_t flags = orig_tb->flags;
 
     /* Should never happen.
        We only end up here when an existing TB is too long.  */
     if (max_cycles > CF_COUNT_MASK)
         max_cycles = CF_COUNT_MASK;
 
-    /* tb_gen_code can flush our orig_tb, invalidate it now */
-    tb_phys_invalidate(orig_tb, -1);
-    tb = tb_gen_code(cpu, pc, cs_base, flags,
+    tb = tb_gen_code(cpu, orig_tb->pc, orig_tb->cs_base, orig_tb->flags,
                      max_cycles | CF_NOCACHE);
+    tb->orig_tb = tcg_ctx.tb_ctx.tb_invalidated_flag ? NULL : orig_tb;
     cpu->current_tb = tb;
     /* execute the generated code */
     trace_exec_tb_nocache(tb, tb->pc);
diff --git a/cpus.c b/cpus.c
index a822ce3d80..c1e74d9824 100644
--- a/cpus.c
+++ b/cpus.c
@@ -145,7 +145,7 @@ int64_t cpu_get_icount_raw(void)
 
     icount = timers_state.qemu_icount;
     if (cpu) {
-        if (!cpu_can_do_io(cpu)) {
+        if (!cpu->can_do_io) {
             fprintf(stderr, "Bad icount read\n");
             exit(1);
         }
diff --git a/cputlb.c b/cputlb.c
index a50608676c..4bc6c24e11 100644
--- a/cputlb.c
+++ b/cputlb.c
@@ -69,6 +69,47 @@ void tlb_flush(CPUState *cpu, int flush_global)
     tlb_flush_count++;
 }
 
+static inline void v_tlb_flush_by_mmuidx(CPUState *cpu, va_list argp)
+{
+    CPUArchState *env = cpu->env_ptr;
+
+#if defined(DEBUG_TLB)
+    printf("tlb_flush_by_mmuidx:");
+#endif
+    /* must reset current TB so that interrupts cannot modify the
+       links while we are modifying them */
+    cpu->current_tb = NULL;
+
+    for (;;) {
+        int mmu_idx = va_arg(argp, int);
+
+        if (mmu_idx < 0) {
+            break;
+        }
+
+#if defined(DEBUG_TLB)
+        printf(" %d", mmu_idx);
+#endif
+
+        memset(env->tlb_table[mmu_idx], -1, sizeof(env->tlb_table[0]));
+        memset(env->tlb_v_table[mmu_idx], -1, sizeof(env->tlb_v_table[0]));
+    }
+
+#if defined(DEBUG_TLB)
+    printf("\n");
+#endif
+
+    memset(cpu->tb_jmp_cache, 0, sizeof(cpu->tb_jmp_cache));
+}
+
+void tlb_flush_by_mmuidx(CPUState *cpu, ...)
+{
+    va_list argp;
+    va_start(argp, cpu);
+    v_tlb_flush_by_mmuidx(cpu, argp);
+    va_end(argp);
+}
+
 static inline void tlb_flush_entry(CPUTLBEntry *tlb_entry, target_ulong addr)
 {
     if (addr == (tlb_entry->addr_read &
@@ -121,6 +162,62 @@ void tlb_flush_page(CPUState *cpu, target_ulong addr)
     tb_flush_jmp_cache(cpu, addr);
 }
 
+void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, ...)
+{
+    CPUArchState *env = cpu->env_ptr;
+    int i, k;
+    va_list argp;
+
+    va_start(argp, addr);
+
+#if defined(DEBUG_TLB)
+    printf("tlb_flush_page_by_mmu_idx: " TARGET_FMT_lx, addr);
+#endif
+    /* Check if we need to flush due to large pages.  */
+    if ((addr & env->tlb_flush_mask) == env->tlb_flush_addr) {
+#if defined(DEBUG_TLB)
+        printf(" forced full flush ("
+               TARGET_FMT_lx "/" TARGET_FMT_lx ")\n",
+               env->tlb_flush_addr, env->tlb_flush_mask);
+#endif
+        v_tlb_flush_by_mmuidx(cpu, argp);
+        va_end(argp);
+        return;
+    }
+    /* must reset current TB so that interrupts cannot modify the
+       links while we are modifying them */
+    cpu->current_tb = NULL;
+
+    addr &= TARGET_PAGE_MASK;
+    i = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
+
+    for (;;) {
+        int mmu_idx = va_arg(argp, int);
+
+        if (mmu_idx < 0) {
+            break;
+        }
+
+#if defined(DEBUG_TLB)
+        printf(" %d", mmu_idx);
+#endif
+
+        tlb_flush_entry(&env->tlb_table[mmu_idx][i], addr);
+
+        /* check whether there are vltb entries that need to be flushed */
+        for (k = 0; k < CPU_VTLB_SIZE; k++) {
+            tlb_flush_entry(&env->tlb_v_table[mmu_idx][k], addr);
+        }
+    }
+    va_end(argp);
+
+#if defined(DEBUG_TLB)
+    printf("\n");
+#endif
+
+    tb_flush_jmp_cache(cpu, addr);
+}
+
 /* update the TLBs so that writes to code in the virtual page 'addr'
    can be detected */
 void tlb_protect_code(ram_addr_t ram_addr)
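
For reference, both new flush helpers take a variable-length list of MMU indexes that must be terminated by a negative value, since the va_arg loops above stop at the first index below zero. A minimal caller sketch, where MMU_IDX_KERNEL and MMU_IDX_USER are hypothetical placeholder indexes rather than constants defined by this patch:

/* Sketch only: flush two hypothetical MMU modes, either entirely or
 * for a single page.  The trailing -1 terminates the index list. */
static void flush_kernel_and_user(CPUState *cs, target_ulong addr)
{
    tlb_flush_by_mmuidx(cs, MMU_IDX_KERNEL, MMU_IDX_USER, -1);
    tlb_flush_page_by_mmuidx(cs, addr, MMU_IDX_KERNEL, MMU_IDX_USER, -1);
}
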
diff --git a/disas.c b/disas.c
index 69a6066914..0ae70c22f7 100644
--- a/disas.c
+++ b/disas.c
@@ -72,14 +72,6 @@ generic_print_address (bfd_vma addr, struct disassemble_info *info)
     (*info->fprintf_func) (info->stream, "0x%" PRIx64, addr);
 }
 
-/* Print address in hex, truncated to the width of a target virtual address. */
-static void
-generic_print_target_address(bfd_vma addr, struct disassemble_info *info)
-{
-    uint64_t mask = ~0ULL >> (64 - TARGET_VIRT_ADDR_SPACE_BITS);
-    generic_print_address(addr & mask, info);
-}
-
 /* Print address in hex, truncated to the width of a host virtual address. */
 static void
 generic_print_host_address(bfd_vma addr, struct disassemble_info *info)
@@ -201,7 +193,7 @@ void target_disas(FILE *out, CPUState *cpu, target_ulong code,
     s.info.read_memory_func = target_read_memory;
     s.info.buffer_vma = code;
     s.info.buffer_length = size;
-    s.info.print_address_func = generic_print_target_address;
+    s.info.print_address_func = generic_print_address;
 
 #ifdef TARGET_WORDS_BIGENDIAN
     s.info.endian = BFD_ENDIAN_BIG;
@@ -424,7 +416,7 @@ void monitor_disas(Monitor *mon, CPUState *cpu,
     s.cpu = cpu;
     monitor_disas_is_physical = is_physical;
     s.info.read_memory_func = monitor_read_memory;
-    s.info.print_address_func = generic_print_target_address;
+    s.info.print_address_func = generic_print_address;
 
     s.info.buffer_vma = pc;
 
diff --git a/exec.c b/exec.c
index 0a4a0c5af6..54cd70ac1e 100644
--- a/exec.c
+++ b/exec.c
@@ -1210,7 +1210,7 @@ static void *file_ram_alloc(RAMBlock *block,
     unlink(filename);
     g_free(filename);
 
-    memory = (memory+hpagesize-1) & ~(hpagesize-1);
+    memory = ROUND_UP(memory, hpagesize);
 
     /*
      * ftruncate is not supported by hugetlbfs in older
diff --git a/hw/arm/xlnx-zynqmp.c b/hw/arm/xlnx-zynqmp.c
index 62ef4ceb32..388baef76e 100644
--- a/hw/arm/xlnx-zynqmp.c
+++ b/hw/arm/xlnx-zynqmp.c
@@ -101,6 +101,21 @@ static void xlnx_zynqmp_realize(DeviceState *dev, Error **errp)
     qemu_irq gic_spi[GIC_NUM_SPI_INTR];
     Error *err = NULL;
 
+    /* Create the four OCM banks */
+    for (i = 0; i < XLNX_ZYNQMP_NUM_OCM_BANKS; i++) {
+        char *ocm_name = g_strdup_printf("zynqmp.ocm_ram_bank_%d", i);
+
+        memory_region_init_ram(&s->ocm_ram[i], NULL, ocm_name,
+                               XLNX_ZYNQMP_OCM_RAM_SIZE, &error_abort);
+        vmstate_register_ram_global(&s->ocm_ram[i]);
+        memory_region_add_subregion(get_system_memory(),
+                                    XLNX_ZYNQMP_OCM_RAM_0_ADDRESS +
+                                        i * XLNX_ZYNQMP_OCM_RAM_SIZE,
+                                    &s->ocm_ram[i]);
+
+        g_free(ocm_name);
+    }
+
     qdev_prop_set_uint32(DEVICE(&s->gic), "num-irq", GIC_NUM_SPI_INTR + 32);
     qdev_prop_set_uint32(DEVICE(&s->gic), "revision", 2);
     qdev_prop_set_uint32(DEVICE(&s->gic), "num-cpu", XLNX_ZYNQMP_NUM_APU_CPUS);
diff --git a/hw/char/mcf_uart.c b/hw/char/mcf_uart.c
index 98fd44e66a..cda22eea5d 100644
--- a/hw/char/mcf_uart.c
+++ b/hw/char/mcf_uart.c
@@ -126,7 +126,7 @@ static void mcf_uart_do_tx(mcf_uart_state *s)
 static void mcf_do_command(mcf_uart_state *s, uint8_t cmd)
 {
     /* Misc command.  */
-    switch ((cmd >> 4) & 3) {
+    switch ((cmd >> 4) & 7) {
     case 0: /* No-op.  */
         break;
     case 1: /* Reset mode register pointer.  */
diff --git a/hw/net/ne2000-isa.c b/hw/net/ne2000-isa.c
index 17e7199f70..18b064463a 100644
--- a/hw/net/ne2000-isa.c
+++ b/hw/net/ne2000-isa.c
@@ -44,7 +44,6 @@ typedef struct ISANE2000State {
 static NetClientInfo net_ne2000_isa_info = {
     .type = NET_CLIENT_OPTIONS_KIND_NIC,
     .size = sizeof(NICState),
-    .can_receive = ne2000_can_receive,
     .receive = ne2000_receive,
 };
 
diff --git a/hw/net/ne2000.c b/hw/net/ne2000.c
index 3492db3663..53c704ad41 100644
--- a/hw/net/ne2000.c
+++ b/hw/net/ne2000.c
@@ -165,15 +165,6 @@ static int ne2000_buffer_full(NE2000State *s)
     return 0;
 }
 
-int ne2000_can_receive(NetClientState *nc)
-{
-    NE2000State *s = qemu_get_nic_opaque(nc);
-
-    if (s->cmd & E8390_STOP)
-        return 1;
-    return !ne2000_buffer_full(s);
-}
-
 #define MIN_BUF_SIZE 60
 
 ssize_t ne2000_receive(NetClientState *nc, const uint8_t *buf, size_t size_)
@@ -705,7 +696,6 @@ void ne2000_setup_io(NE2000State *s, DeviceState *dev, unsigned size)
 static NetClientInfo net_ne2000_info = {
     .type = NET_CLIENT_OPTIONS_KIND_NIC,
     .size = sizeof(NICState),
-    .can_receive = ne2000_can_receive,
     .receive = ne2000_receive,
 };
 
diff --git a/hw/net/ne2000.h b/hw/net/ne2000.h
index e500306aac..d022b28fc2 100644
--- a/hw/net/ne2000.h
+++ b/hw/net/ne2000.h
@@ -34,7 +34,6 @@ typedef struct NE2000State {
 void ne2000_setup_io(NE2000State *s, DeviceState *dev, unsigned size);
 extern const VMStateDescription vmstate_ne2000;
 void ne2000_reset(NE2000State *s);
-int ne2000_can_receive(NetClientState *nc);
 ssize_t ne2000_receive(NetClientState *nc, const uint8_t *buf, size_t size_);
 
 #endif
diff --git a/hw/net/rtl8139.c b/hw/net/rtl8139.c
index edbb61ccf3..fb2c55ce0b 100644
--- a/hw/net/rtl8139.c
+++ b/hw/net/rtl8139.c
@@ -56,6 +56,7 @@
 #include "sysemu/dma.h"
 #include "qemu/timer.h"
 #include "net/net.h"
+#include "net/eth.h"
 #include "hw/loader.h"
 #include "sysemu/sysemu.h"
 #include "qemu/iov.h"
@@ -72,11 +73,8 @@
 #define MOD2(input, size) \
     ( ( input ) & ( size - 1 )  )
 
-#define ETHER_ADDR_LEN 6
 #define ETHER_TYPE_LEN 2
-#define ETH_HLEN (ETHER_ADDR_LEN * 2 + ETHER_TYPE_LEN)
-#define ETH_P_IP    0x0800      /* Internet Protocol packet */
-#define ETH_P_8021Q 0x8100      /* 802.1Q VLAN Extended Header  */
+#define ETH_HLEN (ETH_ALEN * 2 + ETHER_TYPE_LEN)
 #define ETH_MTU     1500
 
 #define VLAN_TCI_LEN 2
@@ -1016,8 +1014,8 @@ static ssize_t rtl8139_do_receive(NetClientState *nc, const uint8_t *buf, size_t
 
         /* write VLAN info to descriptor variables. */
         if (s->CpCmd & CPlusRxVLAN && be16_to_cpup((uint16_t *)
-                &buf[ETHER_ADDR_LEN * 2]) == ETH_P_8021Q) {
-            dot1q_buf = &buf[ETHER_ADDR_LEN * 2];
+                &buf[ETH_ALEN * 2]) == ETH_P_VLAN) {
+            dot1q_buf = &buf[ETH_ALEN * 2];
             size -= VLAN_HLEN;
             /* if too small buffer, use the tailroom added duing expansion */
             if (size < MIN_BUF_SIZE) {
@@ -1058,10 +1056,10 @@ static ssize_t rtl8139_do_receive(NetClientState *nc, const uint8_t *buf, size_t
 
         /* receive/copy to target memory */
         if (dot1q_buf) {
-            pci_dma_write(d, rx_addr, buf, 2 * ETHER_ADDR_LEN);
-            pci_dma_write(d, rx_addr + 2 * ETHER_ADDR_LEN,
-                          buf + 2 * ETHER_ADDR_LEN + VLAN_HLEN,
-                          size - 2 * ETHER_ADDR_LEN);
+            pci_dma_write(d, rx_addr, buf, 2 * ETH_ALEN);
+            pci_dma_write(d, rx_addr + 2 * ETH_ALEN,
+                          buf + 2 * ETH_ALEN + VLAN_HLEN,
+                          size - 2 * ETH_ALEN);
         } else {
             pci_dma_write(d, rx_addr, buf, size);
         }
@@ -1148,7 +1146,9 @@ static ssize_t rtl8139_do_receive(NetClientState *nc, const uint8_t *buf, size_t
 
         /* if receiver buffer is empty then avail == 0 */
 
-        if (avail != 0 && size + 8 >= avail)
+#define RX_ALIGN(x) (((x) + 3) & ~0x3)
+
+        if (avail != 0 && RX_ALIGN(size + 8) >= avail)
         {
             DPRINTF("rx overflow: rx buffer length %d head 0x%04x "
                 "read 0x%04x === available 0x%04x need 0x%04x\n",
@@ -1157,7 +1157,7 @@ static ssize_t rtl8139_do_receive(NetClientState *nc, const uint8_t *buf, size_t
             s->IntrStatus |= RxOverflow;
             ++s->RxMissed;
             rtl8139_update_irq(s);
-            return size_;
+            return 0;
         }
 
         packet_header |= RxStatusOK;
@@ -1176,7 +1176,7 @@ static ssize_t rtl8139_do_receive(NetClientState *nc, const uint8_t *buf, size_t
         rtl8139_write_buffer(s, (uint8_t *)&val, 4);
 
         /* correct buffer write pointer */
-        s->RxBufAddr = MOD2((s->RxBufAddr + 3) & ~0x3, s->RxBufferSize);
+        s->RxBufAddr = MOD2(RX_ALIGN(s->RxBufAddr), s->RxBufferSize);
 
         /* now we can signal we have received something */
 
@@ -1783,12 +1783,12 @@ static void rtl8139_transfer_frame(RTL8139State *s, uint8_t *buf, int size,
         return;
     }
 
-    if (dot1q_buf && size >= ETHER_ADDR_LEN * 2) {
+    if (dot1q_buf && size >= ETH_ALEN * 2) {
         iov = (struct iovec[3]) {
-            { .iov_base = buf, .iov_len = ETHER_ADDR_LEN * 2 },
+            { .iov_base = buf, .iov_len = ETH_ALEN * 2 },
             { .iov_base = (void *) dot1q_buf, .iov_len = VLAN_HLEN },
-            { .iov_base = buf + ETHER_ADDR_LEN * 2,
-                .iov_len = size - ETHER_ADDR_LEN * 2 },
+            { .iov_base = buf + ETH_ALEN * 2,
+                .iov_len = size - ETH_ALEN * 2 },
         };
 
         memcpy(vlan_iov, iov, sizeof(vlan_iov));
@@ -1868,64 +1868,12 @@ static int rtl8139_transmit_one(RTL8139State *s, int descriptor)
 }
 
 /* structures and macros for task offloading */
-typedef struct ip_header
-{
-    uint8_t  ip_ver_len;    /* version and header length */
-    uint8_t  ip_tos;        /* type of service */
-    uint16_t ip_len;        /* total length */
-    uint16_t ip_id;         /* identification */
-    uint16_t ip_off;        /* fragment offset field */
-    uint8_t  ip_ttl;        /* time to live */
-    uint8_t  ip_p;          /* protocol */
-    uint16_t ip_sum;        /* checksum */
-    uint32_t ip_src,ip_dst; /* source and dest address */
-} ip_header;
-
-#define IP_HEADER_VERSION_4 4
-#define IP_HEADER_VERSION(ip) ((ip->ip_ver_len >> 4)&0xf)
-#define IP_HEADER_LENGTH(ip) (((ip->ip_ver_len)&0xf) << 2)
-
-typedef struct tcp_header
-{
-    uint16_t th_sport;		/* source port */
-    uint16_t th_dport;		/* destination port */
-    uint32_t th_seq;			/* sequence number */
-    uint32_t th_ack;			/* acknowledgement number */
-    uint16_t th_offset_flags; /* data offset, reserved 6 bits, TCP protocol flags */
-    uint16_t th_win;			/* window */
-    uint16_t th_sum;			/* checksum */
-    uint16_t th_urp;			/* urgent pointer */
-} tcp_header;
-
-typedef struct udp_header
-{
-    uint16_t uh_sport; /* source port */
-    uint16_t uh_dport; /* destination port */
-    uint16_t uh_ulen;  /* udp length */
-    uint16_t uh_sum;   /* udp checksum */
-} udp_header;
-
-typedef struct ip_pseudo_header
-{
-    uint32_t ip_src;
-    uint32_t ip_dst;
-    uint8_t  zeros;
-    uint8_t  ip_proto;
-    uint16_t ip_payload;
-} ip_pseudo_header;
-
-#define IP_PROTO_TCP 6
-#define IP_PROTO_UDP 17
-
 #define TCP_HEADER_DATA_OFFSET(tcp) (((be16_to_cpu(tcp->th_offset_flags) >> 12)&0xf) << 2)
 #define TCP_FLAGS_ONLY(flags) ((flags)&0x3f)
 #define TCP_HEADER_FLAGS(tcp) TCP_FLAGS_ONLY(be16_to_cpu(tcp->th_offset_flags))
 
 #define TCP_HEADER_CLEAR_FLAGS(tcp, off) ((tcp)->th_offset_flags &= cpu_to_be16(~TCP_FLAGS_ONLY(off)))
 
-#define TCP_FLAG_FIN  0x01
-#define TCP_FLAG_PUSH 0x08
-
 /* produces ones' complement sum of data */
 static uint16_t ones_complement_sum(uint8_t *data, size_t len)
 {
@@ -2134,7 +2082,7 @@ static int rtl8139_cplus_transmit_one(RTL8139State *s)
                 bswap16(txdw1 & CP_TX_VLAN_TAG_MASK));
 
             dot1q_buffer = (uint16_t *) dot1q_buffer_space;
-            dot1q_buffer[0] = cpu_to_be16(ETH_P_8021Q);
+            dot1q_buffer[0] = cpu_to_be16(ETH_P_VLAN);
             /* BE + le_to_cpu() + ~cpu_to_le()~ = BE */
             dot1q_buffer[1] = cpu_to_le16(txdw1 & CP_TX_VLAN_TAG_MASK);
         } else {
@@ -2151,12 +2099,12 @@ static int rtl8139_cplus_transmit_one(RTL8139State *s)
             DPRINTF("+++ C+ mode offloaded task checksum\n");
 
             /* Large enough for Ethernet and IP headers? */
-            if (saved_size < ETH_HLEN + sizeof(ip_header)) {
+            if (saved_size < ETH_HLEN + sizeof(struct ip_header)) {
                 goto skip_offload;
             }
 
             /* ip packet header */
-            ip_header *ip = NULL;
+            struct ip_header *ip = NULL;
             int hlen = 0;
             uint8_t  ip_protocol = 0;
             uint16_t ip_data_len = 0;
@@ -2172,11 +2120,15 @@ static int rtl8139_cplus_transmit_one(RTL8139State *s)
 
             DPRINTF("+++ C+ mode has IP packet\n");
 
-            /* not aligned */
+            /* Note on memory alignment: eth_payload_data is 16-bit aligned
+             * since saved_buffer is allocated with g_malloc() and ETH_HLEN is
+             * even.  32-bit accesses must use ldl/stl wrappers to avoid
+             * unaligned accesses.
+             */
             eth_payload_data = saved_buffer + ETH_HLEN;
             eth_payload_len  = saved_size   - ETH_HLEN;
 
-            ip = (ip_header*)eth_payload_data;
+            ip = (struct ip_header*)eth_payload_data;
 
             if (IP_HEADER_VERSION(ip) != IP_HEADER_VERSION_4) {
                 DPRINTF("+++ C+ mode packet has bad IP version %d "
@@ -2185,8 +2137,8 @@ static int rtl8139_cplus_transmit_one(RTL8139State *s)
                 goto skip_offload;
             }
 
-            hlen = IP_HEADER_LENGTH(ip);
-            if (hlen < sizeof(ip_header) || hlen > eth_payload_len) {
+            hlen = IP_HDR_GET_LEN(ip);
+            if (hlen < sizeof(struct ip_header) || hlen > eth_payload_len) {
                 goto skip_offload;
             }
 
@@ -2269,7 +2221,7 @@ static int rtl8139_cplus_transmit_one(RTL8139State *s)
                     }
 
                     DPRINTF("+++ C+ mode TSO TCP seqno %08x\n",
-                        be32_to_cpu(p_tcp_hdr->th_seq));
+                            ldl_be_p(&p_tcp_hdr->th_seq));
 
                     /* add 4 TCP pseudoheader fields */
                     /* copy IP source and destination fields */
@@ -2287,7 +2239,7 @@ static int rtl8139_cplus_transmit_one(RTL8139State *s)
                     /* keep PUSH and FIN flags only for the last frame */
                     if (!is_last_frame)
                     {
-                        TCP_HEADER_CLEAR_FLAGS(p_tcp_hdr, TCP_FLAG_PUSH|TCP_FLAG_FIN);
+                        TCP_HEADER_CLEAR_FLAGS(p_tcp_hdr, TH_PUSH | TH_FIN);
                     }
 
                     /* recalculate TCP checksum */
@@ -2325,7 +2277,8 @@ static int rtl8139_cplus_transmit_one(RTL8139State *s)
                         0, (uint8_t *) dot1q_buffer);
 
                     /* add transferred count to TCP sequence number */
-                    p_tcp_hdr->th_seq = cpu_to_be32(chunk_size + be32_to_cpu(p_tcp_hdr->th_seq));
+                    stl_be_p(&p_tcp_hdr->th_seq,
+                             chunk_size + ldl_be_p(&p_tcp_hdr->th_seq));
                     ++send_count;
                 }
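
The substitution of ldl_be_p()/stl_be_p() for be32_to_cpu()/cpu_to_be32() on th_seq follows from the alignment note added above: the TCP header inside the transmit buffer is only guaranteed 16-bit alignment, so its 32-bit fields must not be dereferenced directly. A small sketch of the idiom, with advance_tcp_seq() as an illustrative name only:

/* Sketch: 'seq_field' points into a packet buffer and may not be
 * 4-byte aligned, so a plain uint32_t dereference would be undefined
 * behaviour on strict-alignment hosts.  ldl_be_p()/stl_be_p() access
 * the field safely and handle the big-endian conversion. */
static void advance_tcp_seq(void *seq_field, uint32_t chunk_size)
{
    uint32_t seq = ldl_be_p(seq_field);
    stl_be_p(seq_field, seq + chunk_size);
}
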
 
diff --git a/hw/net/vmxnet3.c b/hw/net/vmxnet3.c
index 59b06b8412..04159c8222 100644
--- a/hw/net/vmxnet3.c
+++ b/hw/net/vmxnet3.c
@@ -927,9 +927,9 @@ static void vmxnet3_rx_need_csum_calculate(struct VmxnetRxPkt *pkt,
 
     /* Validate packet len: csum_start + scum_offset + length of csum field */
     if (pkt_len < (vhdr->csum_start + vhdr->csum_offset + 2)) {
-        VMW_PKPRN("packet len:%d < csum_start(%d) + csum_offset(%d) + 2, "
+        VMW_PKPRN("packet len:%lu < csum_start(%d) + csum_offset(%d) + 2, "
                   "cannot calculate checksum",
-                  len, vhdr->csum_start, vhdr->csum_offset);
+                  pkt_len, vhdr->csum_start, vhdr->csum_offset);
         return;
     }
 
@@ -1988,7 +1988,6 @@ static void vmxnet3_set_link_status(NetClientState *nc)
 static NetClientInfo net_vmxnet3_info = {
         .type = NET_CLIENT_OPTIONS_KIND_NIC,
         .size = sizeof(NICState),
-        .can_receive = vmxnet3_can_receive,
         .receive = vmxnet3_receive,
         .link_status_changed = vmxnet3_set_link_status,
 };
diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c
index f0ae4625ff..ffac8f4bb6 100644
--- a/hw/scsi/scsi-bus.c
+++ b/hw/scsi/scsi-bus.c
@@ -136,7 +136,8 @@ static void scsi_dma_restart_cb(void *opaque, int running, RunState state)
         return;
     }
     if (!s->bh) {
-        s->bh = qemu_bh_new(scsi_dma_restart_bh, s);
+        AioContext *ctx = blk_get_aio_context(s->conf.blk);
+        s->bh = aio_bh_new(ctx, scsi_dma_restart_bh, s);
         qemu_bh_schedule(s->bh);
     }
 }
diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index 64f0694734..bada9a7f62 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -217,6 +217,8 @@ static void scsi_write_do_fua(SCSIDiskReq *r)
 {
     SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
 
+    assert(r->req.aiocb == NULL);
+
     if (r->req.io_canceled) {
         scsi_req_cancel_complete(&r->req);
         goto done;
@@ -235,15 +237,10 @@ done:
     scsi_req_unref(&r->req);
 }
 
-static void scsi_dma_complete_noio(void *opaque, int ret)
+static void scsi_dma_complete_noio(SCSIDiskReq *r, int ret)
 {
-    SCSIDiskReq *r = (SCSIDiskReq *)opaque;
-    SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
+    assert(r->req.aiocb == NULL);
 
-    if (r->req.aiocb != NULL) {
-        r->req.aiocb = NULL;
-        block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
-    }
     if (r->req.io_canceled) {
         scsi_req_cancel_complete(&r->req);
         goto done;
@@ -271,9 +268,13 @@ done:
 static void scsi_dma_complete(void *opaque, int ret)
 {
     SCSIDiskReq *r = (SCSIDiskReq *)opaque;
+    SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
 
     assert(r->req.aiocb != NULL);
-    scsi_dma_complete_noio(opaque, ret);
+    r->req.aiocb = NULL;
+
+    block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
+    scsi_dma_complete_noio(r, ret);
 }
 
 static void scsi_read_complete(void * opaque, int ret)
@@ -308,16 +309,13 @@ done:
 }
 
 /* Actually issue a read to the block device.  */
-static void scsi_do_read(void *opaque, int ret)
+static void scsi_do_read(SCSIDiskReq *r, int ret)
 {
-    SCSIDiskReq *r = opaque;
     SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
     uint32_t n;
 
-    if (r->req.aiocb != NULL) {
-        r->req.aiocb = NULL;
-        block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
-    }
+    assert (r->req.aiocb == NULL);
+
     if (r->req.io_canceled) {
         scsi_req_cancel_complete(&r->req);
         goto done;
@@ -349,6 +347,18 @@ done:
     scsi_req_unref(&r->req);
 }
 
+static void scsi_do_read_cb(void *opaque, int ret)
+{
+    SCSIDiskReq *r = (SCSIDiskReq *)opaque;
+    SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
+
+    assert (r->req.aiocb != NULL);
+    r->req.aiocb = NULL;
+
+    block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
+    scsi_do_read(opaque, ret);
+}
+
 /* Read more data from scsi device into buffer.  */
 static void scsi_read_data(SCSIRequest *req)
 {
@@ -384,7 +394,7 @@ static void scsi_read_data(SCSIRequest *req)
     if (first && scsi_is_cmd_fua(&r->req.cmd)) {
         block_acct_start(blk_get_stats(s->qdev.conf.blk), &r->acct, 0,
                          BLOCK_ACCT_FLUSH);
-        r->req.aiocb = blk_aio_flush(s->qdev.conf.blk, scsi_do_read, r);
+        r->req.aiocb = blk_aio_flush(s->qdev.conf.blk, scsi_do_read_cb, r);
     } else {
         scsi_do_read(r, 0);
     }
@@ -399,7 +409,7 @@ static void scsi_read_data(SCSIRequest *req)
  */
 static int scsi_handle_rw_error(SCSIDiskReq *r, int error)
 {
-    bool is_read = (r->req.cmd.xfer == SCSI_XFER_FROM_DEV);
+    bool is_read = (r->req.cmd.mode == SCSI_XFER_FROM_DEV);
     SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
     BlockErrorAction action = blk_get_error_action(s->qdev.conf.blk,
                                                    is_read, error);
@@ -430,16 +440,12 @@ static int scsi_handle_rw_error(SCSIDiskReq *r, int error)
     return action != BLOCK_ERROR_ACTION_IGNORE;
 }
 
-static void scsi_write_complete(void * opaque, int ret)
+static void scsi_write_complete_noio(SCSIDiskReq *r, int ret)
 {
-    SCSIDiskReq *r = (SCSIDiskReq *)opaque;
-    SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
     uint32_t n;
 
-    if (r->req.aiocb != NULL) {
-        r->req.aiocb = NULL;
-        block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
-    }
+    assert (r->req.aiocb == NULL);
+
     if (r->req.io_canceled) {
         scsi_req_cancel_complete(&r->req);
         goto done;
@@ -467,6 +473,18 @@ done:
     scsi_req_unref(&r->req);
 }
 
+static void scsi_write_complete(void * opaque, int ret)
+{
+    SCSIDiskReq *r = (SCSIDiskReq *)opaque;
+    SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
+
+    assert (r->req.aiocb != NULL);
+    r->req.aiocb = NULL;
+
+    block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
+    scsi_write_complete_noio(r, ret);
+}
+
 static void scsi_write_data(SCSIRequest *req)
 {
     SCSIDiskReq *r = DO_UPCAST(SCSIDiskReq, req, req);
@@ -480,18 +498,18 @@ static void scsi_write_data(SCSIRequest *req)
     scsi_req_ref(&r->req);
     if (r->req.cmd.mode != SCSI_XFER_TO_DEV) {
         DPRINTF("Data transfer direction invalid\n");
-        scsi_write_complete(r, -EINVAL);
+        scsi_write_complete_noio(r, -EINVAL);
         return;
     }
 
     if (!r->req.sg && !r->qiov.size) {
         /* Called for the first time.  Ask the driver to send us more data.  */
         r->started = true;
-        scsi_write_complete(r, 0);
+        scsi_write_complete_noio(r, 0);
         return;
     }
     if (s->tray_open) {
-        scsi_write_complete(r, -ENOMEDIUM);
+        scsi_write_complete_noio(r, -ENOMEDIUM);
         return;
     }
 
@@ -500,7 +518,7 @@ static void scsi_write_data(SCSIRequest *req)
         if (r->req.sg) {
             scsi_dma_complete_noio(r, 0);
         } else {
-            scsi_write_complete(r, 0);
+            scsi_write_complete_noio(r, 0);
         }
         return;
     }
@@ -1557,15 +1575,17 @@ typedef struct UnmapCBData {
     int count;
 } UnmapCBData;
 
-static void scsi_unmap_complete(void *opaque, int ret)
+static void scsi_unmap_complete(void *opaque, int ret);
+
+static void scsi_unmap_complete_noio(UnmapCBData *data, int ret)
 {
-    UnmapCBData *data = opaque;
     SCSIDiskReq *r = data->r;
     SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
     uint64_t sector_num;
     uint32_t nb_sectors;
 
-    r->req.aiocb = NULL;
+    assert(r->req.aiocb == NULL);
+
     if (r->req.io_canceled) {
         scsi_req_cancel_complete(&r->req);
         goto done;
@@ -1601,6 +1621,17 @@ done:
     g_free(data);
 }
 
+static void scsi_unmap_complete(void *opaque, int ret)
+{
+    UnmapCBData *data = opaque;
+    SCSIDiskReq *r = data->r;
+
+    assert(r->req.aiocb != NULL);
+    r->req.aiocb = NULL;
+
+    scsi_unmap_complete_noio(data, ret);
+}
+
 static void scsi_disk_emulate_unmap(SCSIDiskReq *r, uint8_t *inbuf)
 {
     SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
@@ -1638,7 +1669,7 @@ static void scsi_disk_emulate_unmap(SCSIDiskReq *r, uint8_t *inbuf)
 
     /* The matching unref is in scsi_unmap_complete, before data is freed.  */
     scsi_req_ref(&r->req);
-    scsi_unmap_complete(data, 0);
+    scsi_unmap_complete_noio(data, 0);
     return;
 
 invalid_param_len:
@@ -1683,6 +1714,10 @@ static void scsi_write_same_complete(void *opaque, int ret)
     if (data->iov.iov_len) {
         block_acct_start(blk_get_stats(s->qdev.conf.blk), &r->acct,
                          data->iov.iov_len, BLOCK_ACCT_WRITE);
+        /* blk_aio_write doesn't like the qiov size being different from
+         * nb_sectors, make sure they match.
+         */
+        qemu_iovec_init_external(&data->qiov, &data->iov, 1);
         r->req.aiocb = blk_aio_writev(s->qdev.conf.blk, data->sector,
                                       &data->qiov, data->iov.iov_len / 512,
                                       scsi_write_same_complete, data);
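
The scsi-disk changes above (and the scsi-generic ones below) repeat one refactoring pattern: each AIO completion callback is split into a thin callback that owns clearing req.aiocb and doing the block accounting, plus a *_noio helper holding the actual completion logic, which can also be called directly on paths that never issued an AIO request. Reduced to a skeleton with hypothetical names:

/* Skeleton of the pattern: foo_complete() runs only as an AIO callback,
 * while foo_complete_noio() is also reachable from code that never
 * started an AIO operation, hence the complementary assertions. */
static void foo_complete_noio(SCSIDiskReq *r, int ret)
{
    assert(r->req.aiocb == NULL);
    /* ... common completion logic ... */
}

static void foo_complete(void *opaque, int ret)
{
    SCSIDiskReq *r = opaque;

    assert(r->req.aiocb != NULL);
    r->req.aiocb = NULL;
    foo_complete_noio(r, ret);
}
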
diff --git a/hw/scsi/scsi-generic.c b/hw/scsi/scsi-generic.c
index e53470f85e..1b6350be41 100644
--- a/hw/scsi/scsi-generic.c
+++ b/hw/scsi/scsi-generic.c
@@ -88,12 +88,12 @@ static void scsi_free_request(SCSIRequest *req)
 }
 
 /* Helper function for command completion.  */
-static void scsi_command_complete(void *opaque, int ret)
+static void scsi_command_complete_noio(SCSIGenericReq *r, int ret)
 {
     int status;
-    SCSIGenericReq *r = (SCSIGenericReq *)opaque;
 
-    r->req.aiocb = NULL;
+    assert(r->req.aiocb == NULL);
+
     if (r->req.io_canceled) {
         scsi_req_cancel_complete(&r->req);
         goto done;
@@ -142,6 +142,15 @@ done:
     scsi_req_unref(&r->req);
 }
 
+static void scsi_command_complete(void *opaque, int ret)
+{
+    SCSIGenericReq *r = (SCSIGenericReq *)opaque;
+
+    assert(r->req.aiocb != NULL);
+    r->req.aiocb = NULL;
+    scsi_command_complete_noio(r, ret);
+}
+
 static int execute_command(BlockBackend *blk,
                            SCSIGenericReq *r, int direction,
                            BlockCompletionFunc *complete)
@@ -172,33 +181,37 @@ static void scsi_read_complete(void * opaque, int ret)
     SCSIDevice *s = r->req.dev;
     int len;
 
+    assert(r->req.aiocb != NULL);
     r->req.aiocb = NULL;
+
     if (ret || r->req.io_canceled) {
-        scsi_command_complete(r, ret);
+        scsi_command_complete_noio(r, ret);
         return;
     }
+
     len = r->io_header.dxfer_len - r->io_header.resid;
     DPRINTF("Data ready tag=0x%x len=%d\n", r->req.tag, len);
 
     r->len = -1;
     if (len == 0) {
-        scsi_command_complete(r, 0);
-    } else {
-        /* Snoop READ CAPACITY output to set the blocksize.  */
-        if (r->req.cmd.buf[0] == READ_CAPACITY_10 &&
-            (ldl_be_p(&r->buf[0]) != 0xffffffffU || s->max_lba == 0)) {
-            s->blocksize = ldl_be_p(&r->buf[4]);
-            s->max_lba = ldl_be_p(&r->buf[0]) & 0xffffffffULL;
-        } else if (r->req.cmd.buf[0] == SERVICE_ACTION_IN_16 &&
-                   (r->req.cmd.buf[1] & 31) == SAI_READ_CAPACITY_16) {
-            s->blocksize = ldl_be_p(&r->buf[8]);
-            s->max_lba = ldq_be_p(&r->buf[0]);
-        }
-        blk_set_guest_block_size(s->conf.blk, s->blocksize);
+        scsi_command_complete_noio(r, 0);
+        return;
+    }
 
-        scsi_req_data(&r->req, len);
-        scsi_req_unref(&r->req);
+    /* Snoop READ CAPACITY output to set the blocksize.  */
+    if (r->req.cmd.buf[0] == READ_CAPACITY_10 &&
+        (ldl_be_p(&r->buf[0]) != 0xffffffffU || s->max_lba == 0)) {
+        s->blocksize = ldl_be_p(&r->buf[4]);
+        s->max_lba = ldl_be_p(&r->buf[0]) & 0xffffffffULL;
+    } else if (r->req.cmd.buf[0] == SERVICE_ACTION_IN_16 &&
+               (r->req.cmd.buf[1] & 31) == SAI_READ_CAPACITY_16) {
+        s->blocksize = ldl_be_p(&r->buf[8]);
+        s->max_lba = ldq_be_p(&r->buf[0]);
     }
+    blk_set_guest_block_size(s->conf.blk, s->blocksize);
+
+    scsi_req_data(&r->req, len);
+    scsi_req_unref(&r->req);
 }
 
 /* Read more data from scsi device into buffer.  */
@@ -213,14 +226,14 @@ static void scsi_read_data(SCSIRequest *req)
     /* The request is used as the AIO opaque value, so add a ref.  */
     scsi_req_ref(&r->req);
     if (r->len == -1) {
-        scsi_command_complete(r, 0);
+        scsi_command_complete_noio(r, 0);
         return;
     }
 
     ret = execute_command(s->conf.blk, r, SG_DXFER_FROM_DEV,
                           scsi_read_complete);
     if (ret < 0) {
-        scsi_command_complete(r, ret);
+        scsi_command_complete_noio(r, ret);
     }
 }
 
@@ -230,9 +243,12 @@ static void scsi_write_complete(void * opaque, int ret)
     SCSIDevice *s = r->req.dev;
 
     DPRINTF("scsi_write_complete() ret = %d\n", ret);
+
+    assert(r->req.aiocb != NULL);
     r->req.aiocb = NULL;
+
     if (ret || r->req.io_canceled) {
-        scsi_command_complete(r, ret);
+        scsi_command_complete_noio(r, ret);
         return;
     }
 
@@ -242,7 +258,7 @@ static void scsi_write_complete(void * opaque, int ret)
         DPRINTF("block size %d\n", s->blocksize);
     }
 
-    scsi_command_complete(r, ret);
+    scsi_command_complete_noio(r, ret);
 }
 
 /* Write data to a scsi device.  Returns nonzero on failure.
@@ -264,7 +280,7 @@ static void scsi_write_data(SCSIRequest *req)
     scsi_req_ref(&r->req);
     ret = execute_command(s->conf.blk, r, SG_DXFER_TO_DEV, scsi_write_complete);
     if (ret < 0) {
-        scsi_command_complete(r, ret);
+        scsi_command_complete_noio(r, ret);
     }
 }
 
@@ -306,7 +322,7 @@ static int32_t scsi_send_command(SCSIRequest *req, uint8_t *cmd)
         ret = execute_command(s->conf.blk, r, SG_DXFER_NONE,
                               scsi_command_complete);
         if (ret < 0) {
-            scsi_command_complete(r, ret);
+            scsi_command_complete_noio(r, ret);
             return 0;
         }
         return 0;
diff --git a/hw/scsi/vhost-scsi.c b/hw/scsi/vhost-scsi.c
index a69918bef8..7eacca9dc5 100644
--- a/hw/scsi/vhost-scsi.c
+++ b/hw/scsi/vhost-scsi.c
@@ -118,7 +118,7 @@ static int vhost_scsi_start(VHostSCSI *s)
      * enabling/disabling irqfd.
      */
     for (i = 0; i < s->dev.nvqs; i++) {
-        vhost_virtqueue_mask(&s->dev, vdev, i, false);
+        vhost_virtqueue_mask(&s->dev, vdev, s->dev.vq_index + i, false);
     }
 
     return ret;
@@ -277,6 +277,7 @@ static void vhost_scsi_unrealize(DeviceState *dev, Error **errp)
     /* This will stop vhost backend. */
     vhost_scsi_set_status(vdev, 0);
 
+    vhost_dev_cleanup(&s->dev);
     g_free(s->dev.vqs);
 
     virtio_scsi_common_unrealize(dev, errp);
diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
index 811c3da8bd..a8bb1c66f9 100644
--- a/hw/scsi/virtio-scsi.c
+++ b/hw/scsi/virtio-scsi.c
@@ -217,6 +217,11 @@ static void *virtio_scsi_load_request(QEMUFile *f, SCSIRequest *sreq)
     assert(req->elem.in_num <= ARRAY_SIZE(req->elem.in_sg));
     assert(req->elem.out_num <= ARRAY_SIZE(req->elem.out_sg));
 
+    virtqueue_map_sg(req->elem.in_sg, req->elem.in_addr,
+                     req->elem.in_num, 1);
+    virtqueue_map_sg(req->elem.out_sg, req->elem.out_addr,
+                     req->elem.out_num, 0);
+
     if (virtio_scsi_parse_req(req, sizeof(VirtIOSCSICmdReq) + vs->cdb_size,
                               sizeof(VirtIOSCSICmdResp) + vs->sense_size) < 0) {
         error_report("invalid SCSI request migration data");
diff --git a/include/block/block.h b/include/block/block.h
index 37916f7208..608cd4e4fb 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -317,7 +317,8 @@ bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
 bool bdrv_is_first_non_filter(BlockDriverState *candidate);
 
 /* check if a named node can be replaced when doing drive-mirror */
-BlockDriverState *check_to_replace_node(const char *node_name, Error **errp);
+BlockDriverState *check_to_replace_node(BlockDriverState *parent_bs,
+                                        const char *node_name, Error **errp);
 
 /* async block I/O */
 typedef void BlockDriverDirtyHandler(BlockDriverState *bs, int64_t sector,
diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h
index ea6a9a667c..89db792767 100644
--- a/include/exec/cpu-all.h
+++ b/include/exec/cpu-all.h
@@ -160,18 +160,11 @@ static inline void tswap64s(uint64_t *s)
 /* On some host systems the guest address space is reserved on the host.
  * This allows the guest address space to be offset to a convenient location.
  */
-#if defined(CONFIG_USE_GUEST_BASE)
 extern unsigned long guest_base;
 extern int have_guest_base;
 extern unsigned long reserved_va;
-#define GUEST_BASE guest_base
-#define RESERVED_VA reserved_va
-#else
-#define GUEST_BASE 0ul
-#define RESERVED_VA 0ul
-#endif
 
-#define GUEST_ADDR_MAX (RESERVED_VA ? RESERVED_VA : \
+#define GUEST_ADDR_MAX (reserved_va ? reserved_va : \
                                     (1ul << TARGET_VIRT_ADDR_SPACE_BITS) - 1)
 #endif
 
diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
index 98b9cff310..5093be26ac 100644
--- a/include/exec/cpu-defs.h
+++ b/include/exec/cpu-defs.h
@@ -105,17 +105,18 @@ typedef struct CPUTLBEntry {
        bit 3                      : indicates that the entry is invalid
        bit 2..0                   : zero
     */
-    target_ulong addr_read;
-    target_ulong addr_write;
-    target_ulong addr_code;
-    /* Addend to virtual address to get host address.  IO accesses
-       use the corresponding iotlb value.  */
-    uintptr_t addend;
-    /* padding to get a power of two size */
-    uint8_t dummy[(1 << CPU_TLB_ENTRY_BITS) -
-                  (sizeof(target_ulong) * 3 +
-                   ((-sizeof(target_ulong) * 3) & (sizeof(uintptr_t) - 1)) +
-                   sizeof(uintptr_t))];
+    union {
+        struct {
+            target_ulong addr_read;
+            target_ulong addr_write;
+            target_ulong addr_code;
+            /* Addend to virtual address to get host address.  IO accesses
+               use the corresponding iotlb value.  */
+            uintptr_t addend;
+        };
+        /* padding to get a power of two size */
+        uint8_t dummy[1 << CPU_TLB_ENTRY_BITS];
+    };
 } CPUTLBEntry;
 
 QEMU_BUILD_BUG_ON(sizeof(CPUTLBEntry) != (1 << CPU_TLB_ENTRY_BITS));
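
Aside (not part of the patch): the anonymous-struct-plus-dummy-array union above is a generic padding idiom. A minimal stand-alone sketch, with a hypothetical 5-bit entry size, looks like this; the dummy member pins the union's size to a power of two, which a build-time assert like the QEMU_BUILD_BUG_ON line can then verify without hand-computed padding.

    /* Illustrative sketch only -- names and sizes here are hypothetical. */
    #define ENTRY_BITS 5                      /* 1 << 5 == 32-byte entries */

    typedef union {
        struct {                              /* the real payload */
            unsigned long addr_read;
            unsigned long addr_write;
            unsigned long addr_code;
            unsigned long addend;
        };
        unsigned char pad[1 << ENTRY_BITS];   /* forces sizeof() up to 32 */
    } PaddedEntry;

    /* Fails to build if the payload ever outgrows the padding. */
    typedef char assert_entry_size[sizeof(PaddedEntry) == (1 << ENTRY_BITS) ? 1 : -1];
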
diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
index 1239c60f23..26f479416a 100644
--- a/include/exec/cpu_ldst.h
+++ b/include/exec/cpu_ldst.h
@@ -49,20 +49,20 @@
 
 #if defined(CONFIG_USER_ONLY)
 /* All direct uses of g2h and h2g need to go away for usermode softmmu.  */
-#define g2h(x) ((void *)((unsigned long)(target_ulong)(x) + GUEST_BASE))
+#define g2h(x) ((void *)((unsigned long)(target_ulong)(x) + guest_base))
 
 #if HOST_LONG_BITS <= TARGET_VIRT_ADDR_SPACE_BITS
 #define h2g_valid(x) 1
 #else
 #define h2g_valid(x) ({ \
-    unsigned long __guest = (unsigned long)(x) - GUEST_BASE; \
+    unsigned long __guest = (unsigned long)(x) - guest_base; \
     (__guest < (1ul << TARGET_VIRT_ADDR_SPACE_BITS)) && \
-    (!RESERVED_VA || (__guest < RESERVED_VA)); \
+    (!reserved_va || (__guest < reserved_va)); \
 })
 #endif
 
 #define h2g_nocheck(x) ({ \
-    unsigned long __ret = (unsigned long)(x) - GUEST_BASE; \
+    unsigned long __ret = (unsigned long)(x) - guest_base; \
     (abi_ulong)__ret; \
 })
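
Aside (not part of the patch): g2h()/h2g_valid() are the user-mode address translation layer, and after this change they always go through the guest_base/reserved_va variables rather than the old macros. A hedged sketch of a caller (the helper name is made up):

    /* Illustrative only: read one byte of guest memory in user-mode emulation. */
    static uint8_t peek_guest_byte(abi_ulong guest_addr)
    {
        uint8_t *host_ptr = g2h(guest_addr);   /* guest_addr + guest_base */
        return *host_ptr;
    }
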
 
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index a6fce04f65..83b925172f 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -96,8 +96,46 @@ bool qemu_in_vcpu_thread(void);
 void cpu_reload_memory_map(CPUState *cpu);
 void tcg_cpu_address_space_init(CPUState *cpu, AddressSpace *as);
 /* cputlb.c */
+/**
+ * tlb_flush_page:
+ * @cpu: CPU whose TLB should be flushed
+ * @addr: virtual address of page to be flushed
+ *
+ * Flush one page from the TLB of the specified CPU, for all
+ * MMU indexes.
+ */
 void tlb_flush_page(CPUState *cpu, target_ulong addr);
+/**
+ * tlb_flush:
+ * @cpu: CPU whose TLB should be flushed
+ * @flush_global: ignored
+ *
+ * Flush the entire TLB for the specified CPU.
+ * The flush_global flag is in theory an indicator of whether the whole
+ * TLB should be flushed, or only those entries not marked global.
+ * In practice QEMU does not implement any global/not global flag for
+ * TLB entries, and the argument is ignored.
+ */
 void tlb_flush(CPUState *cpu, int flush_global);
+/**
+ * tlb_flush_page_by_mmuidx:
+ * @cpu: CPU whose TLB should be flushed
+ * @addr: virtual address of page to be flushed
+ * @...: list of MMU indexes to flush, terminated by a negative value
+ *
+ * Flush one page from the TLB of the specified CPU, for the specified
+ * MMU indexes.
+ */
+void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, ...);
+/**
+ * tlb_flush_by_mmuidx:
+ * @cpu: CPU whose TLB should be flushed
+ * @...: list of MMU indexes to flush, terminated by a negative value
+ *
+ * Flush all entries from the TLB of the specified CPU, for the specified
+ * MMU indexes.
+ */
+void tlb_flush_by_mmuidx(CPUState *cpu, ...);
 void tlb_set_page(CPUState *cpu, target_ulong vaddr,
                   hwaddr paddr, int prot,
                   int mmu_idx, target_ulong size);
@@ -115,6 +153,15 @@ static inline void tlb_flush_page(CPUState *cpu, target_ulong addr)
 static inline void tlb_flush(CPUState *cpu, int flush_global)
 {
 }
+
+static inline void tlb_flush_page_by_mmuidx(CPUState *cpu,
+                                            target_ulong addr, ...)
+{
+}
+
+static inline void tlb_flush_by_mmuidx(CPUState *cpu, ...)
+{
+}
 #endif
 
 #define CODE_GEN_ALIGN           16 /* must be >= of the size of a icache line */
@@ -155,6 +202,8 @@ struct TranslationBlock {
     void *tc_ptr;    /* pointer to the translated code */
     /* next matching tb for physical address. */
     struct TranslationBlock *phys_hash_next;
+    /* original tb when cflags has CF_NOCACHE */
+    struct TranslationBlock *orig_tb;
     /* first and second physical page containing code. The lower bit
        of the pointer tells the index in page_next[] */
     struct TranslationBlock *page_next[2];
@@ -308,11 +357,7 @@ extern uintptr_t tci_tb_ptr;
    to indicate the compressed mode; subtracting two works around that.  It
    is also the case that there are no host isas that contain a call insn
    smaller than 4 bytes, so we don't worry about special-casing this.  */
-#if defined(CONFIG_TCG_INTERPRETER)
-# define GETPC_ADJ   0
-#else
-# define GETPC_ADJ   2
-#endif
+#define GETPC_ADJ   2
 
 #define GETPC()  (GETRA() - GETPC_ADJ)
 
@@ -344,27 +389,6 @@ extern int singlestep;
 /* cpu-exec.c */
 extern volatile sig_atomic_t exit_request;
 
-/**
- * cpu_can_do_io:
- * @cpu: The CPU for which to check IO.
- *
- * Deterministic execution requires that IO only be performed on the last
- * instruction of a TB so that interrupts take effect immediately.
- *
- * Returns: %true if memory-mapped IO is safe, %false otherwise.
- */
-static inline bool cpu_can_do_io(CPUState *cpu)
-{
-    if (!use_icount) {
-        return true;
-    }
-    /* If not executing code then assume we are ok.  */
-    if (cpu->current_tb == NULL) {
-        return true;
-    }
-    return cpu->can_do_io != 0;
-}
-
 #if !defined(CONFIG_USER_ONLY)
 void migration_bitmap_extend(ram_addr_t old, ram_addr_t new);
 #endif
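
A hedged usage sketch for the tlb_flush_page_by_mmuidx()/tlb_flush_by_mmuidx() declarations added above (the MMU index values 0 and 1 are placeholders; a real target would pass its own indexes):

    /* Illustrative only: the variadic index list must end with a negative value. */
    static void flush_page_for_two_mmu_indexes(CPUState *cs, target_ulong addr)
    {
        tlb_flush_page_by_mmuidx(cs, addr, 0, 1, -1);
    }

    static void flush_everything_for_one_mmu_index(CPUState *cs)
    {
        tlb_flush_by_mmuidx(cs, 0, -1);
    }
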
diff --git a/include/hw/arm/xlnx-zynqmp.h b/include/hw/arm/xlnx-zynqmp.h
index c379632f2a..6ccb57b187 100644
--- a/include/hw/arm/xlnx-zynqmp.h
+++ b/include/hw/arm/xlnx-zynqmp.h
@@ -32,6 +32,10 @@
 #define XLNX_ZYNQMP_NUM_GEMS 4
 #define XLNX_ZYNQMP_NUM_UARTS 2
 
+#define XLNX_ZYNQMP_NUM_OCM_BANKS 4
+#define XLNX_ZYNQMP_OCM_RAM_0_ADDRESS 0xFFFC0000
+#define XLNX_ZYNQMP_OCM_RAM_SIZE 0x10000
+
 #define XLNX_ZYNQMP_GIC_REGIONS 2
 
 /* ZynqMP maps the ARM GIC regions (GICC, GICD ...) at consecutive 64k offsets
@@ -52,6 +56,8 @@ typedef struct XlnxZynqMPState {
     ARMCPU rpu_cpu[XLNX_ZYNQMP_NUM_RPU_CPUS];
     GICState gic;
     MemoryRegion gic_mr[XLNX_ZYNQMP_GIC_REGIONS][XLNX_ZYNQMP_GIC_ALIASES];
+    MemoryRegion ocm_ram[XLNX_ZYNQMP_NUM_OCM_BANKS];
+
     CadenceGEMState gem[XLNX_ZYNQMP_NUM_GEMS];
     CadenceUARTState uart[XLNX_ZYNQMP_NUM_UARTS];
 
diff --git a/include/hw/i386/apic_internal.h b/include/hw/i386/apic_internal.h
index dc7a89d988..26632acf37 100644
--- a/include/hw/i386/apic_internal.h
+++ b/include/hw/i386/apic_internal.h
@@ -20,6 +20,7 @@
 #ifndef QEMU_APIC_INTERNAL_H
 #define QEMU_APIC_INTERNAL_H
 
+#include "cpu.h"
 #include "exec/memory.h"
 #include "hw/cpu/icc_bus.h"
 #include "qemu/timer.h"
diff --git a/include/qemu-common.h b/include/qemu-common.h
index fb3da6ca22..bbaffd12e7 100644
--- a/include/qemu-common.h
+++ b/include/qemu-common.h
@@ -12,8 +12,7 @@
 #ifndef QEMU_COMMON_H
 #define QEMU_COMMON_H
 
-#include "qemu/compiler.h"
-#include "config-host.h"
+#include "qemu/osdep.h"
 #include "qemu/typedefs.h"
 #include "qemu/fprintf-fn.h"
 
@@ -23,60 +22,9 @@
 
 #define TFR(expr) do { if ((expr) != -1) break; } while (errno == EINTR)
 
-/* we put basic includes here to avoid repeating them in device drivers */
-#include <stdlib.h>
-#include <stdio.h>
-#include <stdarg.h>
-#include <stdbool.h>
-#include <string.h>
-#include <strings.h>
-#include <inttypes.h>
-#include <limits.h>
-#include <time.h>
-#include <ctype.h>
-#include <errno.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <sys/time.h>
-#include <assert.h>
-#include <signal.h>
 #include "glib-compat.h"
 #include "qemu/option.h"
 
-#ifdef _WIN32
-#include "sysemu/os-win32.h"
-#endif
-
-#ifdef CONFIG_POSIX
-#include "sysemu/os-posix.h"
-#endif
-
-#ifndef O_LARGEFILE
-#define O_LARGEFILE 0
-#endif
-#ifndef O_BINARY
-#define O_BINARY 0
-#endif
-#ifndef MAP_ANONYMOUS
-#define MAP_ANONYMOUS MAP_ANON
-#endif
-#ifndef ENOMEDIUM
-#define ENOMEDIUM ENODEV
-#endif
-#if !defined(ENOTSUP)
-#define ENOTSUP 4096
-#endif
-#if !defined(ECANCELED)
-#define ECANCELED 4097
-#endif
-#if !defined(EMEDIUMTYPE)
-#define EMEDIUMTYPE 4098
-#endif
-#ifndef TIME_MAX
-#define TIME_MAX LONG_MAX
-#endif
-
 /* HOST_LONG_BITS is the size of a native pointer in bits. */
 #if UINTPTR_MAX == UINT32_MAX
 # define HOST_LONG_BITS 32
@@ -86,23 +34,6 @@
 # error Unknown pointer size
 #endif
 
-#ifdef _WIN32
-#define fsync _commit
-#if !defined(lseek)
-# define lseek _lseeki64
-#endif
-int qemu_ftruncate64(int, int64_t);
-#if !defined(ftruncate)
-# define ftruncate qemu_ftruncate64
-#endif
-
-static inline char *realpath(const char *path, char *resolved_path)
-{
-    _fullpath(resolved_path, path, _MAX_PATH);
-    return resolved_path;
-}
-#endif
-
 void cpu_ticks_init(void);
 
 /* icount */
@@ -114,7 +45,6 @@ extern int64_t max_delay;
 extern int64_t max_advance;
 void dump_drift_info(FILE *f, fprintf_function cpu_fprintf);
 
-#include "qemu/osdep.h"
 #include "qemu/bswap.h"
 
 /* FIXME: Remove NEED_CPU_H.  */
@@ -481,37 +411,6 @@ static inline uint8_t from_bcd(uint8_t val)
     return ((val >> 4) * 10) + (val & 0x0f);
 }
 
-/* compute with 96 bit intermediate result: (a*b)/c */
-#ifdef CONFIG_INT128
-static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
-{
-    return (__int128_t)a * b / c;
-}
-#else
-static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
-{
-    union {
-        uint64_t ll;
-        struct {
-#ifdef HOST_WORDS_BIGENDIAN
-            uint32_t high, low;
-#else
-            uint32_t low, high;
-#endif
-        } l;
-    } u, res;
-    uint64_t rl, rh;
-
-    u.ll = a;
-    rl = (uint64_t)u.l.low * (uint64_t)b;
-    rh = (uint64_t)u.l.high * (uint64_t)b;
-    rh += (rl >> 32);
-    res.l.high = rh / c;
-    res.l.low = (((rh % c) << 32) + (rl & 0xffffffff)) / c;
-    return res.ll;
-}
-#endif
-
 /* Round number down to multiple */
 #define QEMU_ALIGN_DOWN(n, m) ((n) / (m) * (m))
 
diff --git a/include/qemu/compiler.h b/include/qemu/compiler.h
index df9dd514f1..d22eb01be4 100644
--- a/include/qemu/compiler.h
+++ b/include/qemu/compiler.h
@@ -42,10 +42,55 @@
 # define QEMU_PACKED __attribute__((packed))
 #endif
 
-#define cat(x,y) x ## y
-#define cat2(x,y) cat(x,y)
+#ifndef glue
+#define xglue(x, y) x ## y
+#define glue(x, y) xglue(x, y)
+#define stringify(s)	tostring(s)
+#define tostring(s)	#s
+#endif
+
+#ifndef likely
+#if __GNUC__ < 3
+#define __builtin_expect(x, n) (x)
+#endif
+
+#define likely(x)   __builtin_expect(!!(x), 1)
+#define unlikely(x)   __builtin_expect(!!(x), 0)
+#endif
+
+#ifndef container_of
+#define container_of(ptr, type, member) ({                      \
+        const typeof(((type *) 0)->member) *__mptr = (ptr);     \
+        (type *) ((char *) __mptr - offsetof(type, member));})
+#endif
+
+/* Convert from a base type to a parent type, with compile time checking.  */
+#ifdef __GNUC__
+#define DO_UPCAST(type, field, dev) ( __extension__ ( { \
+    char __attribute__((unused)) offset_must_be_zero[ \
+        -offsetof(type, field)]; \
+    container_of(dev, type, field);}))
+#else
+#define DO_UPCAST(type, field, dev) container_of(dev, type, field)
+#endif
+
+#define typeof_field(type, field) typeof(((type *)0)->field)
+#define type_check(t1,t2) ((t1*)0 - (t2*)0)
+
+#ifndef always_inline
+#if !((__GNUC__ < 3) || defined(__APPLE__))
+#ifdef __OPTIMIZE__
+#undef inline
+#define inline __attribute__ (( always_inline )) __inline__
+#endif
+#endif
+#else
+#undef inline
+#define inline always_inline
+#endif
+
 #define QEMU_BUILD_BUG_ON(x) \
-    typedef char cat2(qemu_build_bug_on__,__LINE__)[(x)?-1:1] __attribute__((unused));
+    typedef char glue(qemu_build_bug_on__,__LINE__)[(x)?-1:1] __attribute__((unused));
 
 #if defined __GNUC__
 # if !QEMU_GNUC_PREREQ(4, 4)
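
A minimal usage sketch for the container_of()/DO_UPCAST() macros now consolidated in compiler.h (the device structs here are hypothetical): container_of() recovers the enclosing object from a pointer to one of its members, while DO_UPCAST() additionally insists that the member sits at offset zero.

    /* Illustrative only. */
    typedef struct BaseDev { int id; } BaseDev;

    typedef struct MyDev {
        BaseDev parent;     /* must be the first member for DO_UPCAST() */
        int irq;
    } MyDev;

    static int mydev_irq(BaseDev *base)
    {
        MyDev *d = DO_UPCAST(MyDev, parent, base);  /* container_of() also works */
        return d->irq;
    }
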
diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
index d4f21c947f..c27d3dc898 100644
--- a/include/qemu/host-utils.h
+++ b/include/qemu/host-utils.h
@@ -45,6 +45,12 @@ static inline void muls64(uint64_t *plow, uint64_t *phigh,
     *phigh = r >> 64;
 }
 
+/* compute with 96 bit intermediate result: (a*b)/c */
+static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
+{
+    return (__int128_t)a * b / c;
+}
+
 static inline int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 {
     if (divisor == 0) {
@@ -75,6 +81,29 @@ void muls64(uint64_t *phigh, uint64_t *plow, int64_t a, int64_t b);
 void mulu64(uint64_t *phigh, uint64_t *plow, uint64_t a, uint64_t b);
 int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
 int divs128(int64_t *plow, int64_t *phigh, int64_t divisor);
+
+static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
+{
+    union {
+        uint64_t ll;
+        struct {
+#ifdef HOST_WORDS_BIGENDIAN
+            uint32_t high, low;
+#else
+            uint32_t low, high;
+#endif
+        } l;
+    } u, res;
+    uint64_t rl, rh;
+
+    u.ll = a;
+    rl = (uint64_t)u.l.low * (uint64_t)b;
+    rh = (uint64_t)u.l.high * (uint64_t)b;
+    rh += (rl >> 32);
+    res.l.high = rh / c;
+    res.l.low = (((rh % c) << 32) + (rl & 0xffffffff)) / c;
+    return res.ll;
+}
 #endif
 
 /**
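
A hedged example of why muldiv64() keeps a wide intermediate: scaling a 64-bit tick count by a nanoseconds-per-second ratio overflows a plain 64-bit multiply long before the tick count itself overflows, so (a*b)/c is computed through __int128 (or the split 32/64-bit fallback above). The conversion helper below is made up for illustration:

    /* Illustrative only: convert hypothetical device ticks to nanoseconds. */
    #define NS_PER_SECOND 1000000000u

    static uint64_t ticks_to_ns(uint64_t ticks, uint32_t tick_hz)
    {
        /* ticks * NS_PER_SECOND wraps once ticks exceeds roughly 1.8e10
         * (2^64 / 1e9), so defer to muldiv64() for the wide intermediate. */
        return muldiv64(ticks, NS_PER_SECOND, tick_hz);
    }
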
diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
index 3247364268..ab3c8766b4 100644
--- a/include/qemu/osdep.h
+++ b/include/qemu/osdep.h
@@ -1,12 +1,53 @@
+/*
+ * OS includes and handling of OS dependencies
+ *
+ * This header exists to pull in some common system headers that
+ * most code in QEMU will want, and to fix up some possible issues with
+ * it (missing defines, Windows weirdness, and so on).
+ *
+ * To avoid getting into possible circular include dependencies, this
+ * file should not include any other QEMU headers, with the exceptions
+ * of config-host.h, compiler.h, os-posix.h and os-win32.h, all of which
+ * are doing a similar job to this file and are under similar constraints.
+ *
+ * This header also contains prototypes for functions defined in
+ * os-*.c and util/oslib-*.c; those would probably be better split
+ * out into separate header files.
+ *
+ * In an ideal world this header would contain only:
+ *  (1) things which everybody needs
+ *  (2) things without which code would work on most platforms but
+ *      fail to compile or misbehave on a minority of host OSes
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
 #ifndef QEMU_OSDEP_H
 #define QEMU_OSDEP_H
 
 #include "config-host.h"
+#include "qemu/compiler.h"
 #include <stdarg.h>
 #include <stddef.h>
 #include <stdbool.h>
 #include <stdint.h>
 #include <sys/types.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <strings.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <time.h>
+#include <ctype.h>
+#include <errno.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <assert.h>
+#include <signal.h>
+
 #ifdef __OpenBSD__
 #include <sys/signal.h>
 #endif
@@ -18,7 +59,13 @@
 #define WEXITSTATUS(x) (x)
 #endif
 
-#include <sys/time.h>
+#ifdef _WIN32
+#include "sysemu/os-win32.h"
+#endif
+
+#ifdef CONFIG_POSIX
+#include "sysemu/os-posix.h"
+#endif
 
 #if defined(CONFIG_SOLARIS) && CONFIG_SOLARIS_VERSION < 10
 /* [u]int_fast*_t not in <sys/int_types.h> */
@@ -27,40 +74,30 @@ typedef unsigned int            uint_fast16_t;
 typedef signed int              int_fast16_t;
 #endif
 
-#ifndef glue
-#define xglue(x, y) x ## y
-#define glue(x, y) xglue(x, y)
-#define stringify(s)	tostring(s)
-#define tostring(s)	#s
+#ifndef O_LARGEFILE
+#define O_LARGEFILE 0
 #endif
-
-#ifndef likely
-#if __GNUC__ < 3
-#define __builtin_expect(x, n) (x)
+#ifndef O_BINARY
+#define O_BINARY 0
 #endif
-
-#define likely(x)   __builtin_expect(!!(x), 1)
-#define unlikely(x)   __builtin_expect(!!(x), 0)
+#ifndef MAP_ANONYMOUS
+#define MAP_ANONYMOUS MAP_ANON
 #endif
-
-#ifndef container_of
-#define container_of(ptr, type, member) ({                      \
-        const typeof(((type *) 0)->member) *__mptr = (ptr);     \
-        (type *) ((char *) __mptr - offsetof(type, member));})
+#ifndef ENOMEDIUM
+#define ENOMEDIUM ENODEV
 #endif
-
-/* Convert from a base type to a parent type, with compile time checking.  */
-#ifdef __GNUC__
-#define DO_UPCAST(type, field, dev) ( __extension__ ( { \
-    char __attribute__((unused)) offset_must_be_zero[ \
-        -offsetof(type, field)]; \
-    container_of(dev, type, field);}))
-#else
-#define DO_UPCAST(type, field, dev) container_of(dev, type, field)
+#if !defined(ENOTSUP)
+#define ENOTSUP 4096
+#endif
+#if !defined(ECANCELED)
+#define ECANCELED 4097
+#endif
+#if !defined(EMEDIUMTYPE)
+#define EMEDIUMTYPE 4098
+#endif
+#ifndef TIME_MAX
+#define TIME_MAX LONG_MAX
 #endif
-
-#define typeof_field(type, field) typeof(((type *)0)->field)
-#define type_check(t1,t2) ((t1*)0 - (t2*)0)
 
 #ifndef MIN
 #define MIN(a, b) (((a) < (b)) ? (a) : (b))
@@ -87,20 +124,6 @@ typedef signed int              int_fast16_t;
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
 #endif
 
-#ifndef always_inline
-#if !((__GNUC__ < 3) || defined(__APPLE__))
-#ifdef __OPTIMIZE__
-#undef inline
-#define inline __attribute__ (( always_inline )) __inline__
-#endif
-#endif
-#else
-#undef inline
-#define inline always_inline
-#endif
-
-#define qemu_printf printf
-
 int qemu_daemon(int nochdir, int noclose);
 void *qemu_try_memalign(size_t alignment, size_t size);
 void *qemu_memalign(size_t alignment, size_t size);
diff --git a/include/qemu/timer.h b/include/qemu/timer.h
index 5923d600fd..99392464a6 100644
--- a/include/qemu/timer.h
+++ b/include/qemu/timer.h
@@ -4,6 +4,7 @@
 #include "qemu/typedefs.h"
 #include "qemu-common.h"
 #include "qemu/notify.h"
+#include "qemu/host-utils.h"
 
 #define NANOSECONDS_PER_SECOND 1000000000LL
 
diff --git a/include/qom/cpu.h b/include/qom/cpu.h
index 20aabc9cb3..39712ab7cb 100644
--- a/include/qom/cpu.h
+++ b/include/qom/cpu.h
@@ -231,7 +231,9 @@ struct kvm_run;
  * @icount_decr: Number of cycles left, with interrupt flag in high bit.
  * This allows a single read-compare-cbranch-write sequence to test
  * for both decrementer underflow and exceptions.
- * @can_do_io: Nonzero if memory-mapped IO is safe.
+ * @can_do_io: Nonzero if memory-mapped IO is safe. Deterministic execution
+ * requires that IO only be performed on the last instruction of a TB
+ * so that interrupts take effect immediately.
  * @env_ptr: Pointer to subclass-specific CPUArchState field.
  * @current_tb: Currently executing TB.
  * @gdb_regs: Additional GDB registers.
diff --git a/include/sysemu/os-win32.h b/include/sysemu/os-win32.h
index 4035c4fe54..706d85a98e 100644
--- a/include/sysemu/os-win32.h
+++ b/include/sysemu/os-win32.h
@@ -109,4 +109,22 @@ static inline int os_mlock(void)
     return -ENOSYS;
 }
 
+#define fsync _commit
+
+#if !defined(lseek)
+# define lseek _lseeki64
+#endif
+
+int qemu_ftruncate64(int, int64_t);
+
+#if !defined(ftruncate)
+# define ftruncate qemu_ftruncate64
+#endif
+
+static inline char *realpath(const char *path, char *resolved_path)
+{
+    _fullpath(resolved_path, path, _MAX_PATH);
+    return resolved_path;
+}
+
 #endif
diff --git a/linux-user/elfload.c b/linux-user/elfload.c
index 17883686f0..9c999ac139 100644
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@@ -1756,7 +1756,6 @@ static void probe_guest_base(const char *image_name,
      * it explicitly, and set guest_base appropriately.
      * In case of error we will print a suitable message and exit.
      */
-#if defined(CONFIG_USE_GUEST_BASE)
     const char *errmsg;
     if (!have_guest_base && !reserved_va) {
         unsigned long host_start, real_start, host_size;
@@ -1795,7 +1794,6 @@ static void probe_guest_base(const char *image_name,
 exit_errmsg:
     fprintf(stderr, "%s: %s\n", image_name, errmsg);
     exit(-1);
-#endif
 }
 
 
diff --git a/linux-user/main.c b/linux-user/main.c
index fdee981351..2c9658e90d 100644
--- a/linux-user/main.c
+++ b/linux-user/main.c
@@ -43,7 +43,6 @@ int gdbstub_port;
 envlist_t *envlist;
 static const char *cpu_model;
 unsigned long mmap_min_addr;
-#if defined(CONFIG_USE_GUEST_BASE)
 unsigned long guest_base;
 int have_guest_base;
 #if (TARGET_LONG_BITS == 32) && (HOST_LONG_BITS == 64)
@@ -63,7 +62,6 @@ unsigned long reserved_va = 0xf7000000;
 #else
 unsigned long reserved_va;
 #endif
-#endif
 
 static void usage(void);
 
@@ -3584,7 +3582,6 @@ static void handle_arg_cpu(const char *arg)
     }
 }
 
-#if defined(CONFIG_USE_GUEST_BASE)
 static void handle_arg_guest_base(const char *arg)
 {
     guest_base = strtol(arg, NULL, 0);
@@ -3626,7 +3623,6 @@ static void handle_arg_reserved_va(const char *arg)
         exit(1);
     }
 }
-#endif
 
 static void handle_arg_singlestep(const char *arg)
 {
@@ -3673,12 +3669,10 @@ static const struct qemu_argument arg_table[] = {
      "argv0",      "forces target process argv[0] to be 'argv0'"},
     {"r",          "QEMU_UNAME",       true,  handle_arg_uname,
      "uname",      "set qemu uname release string to 'uname'"},
-#if defined(CONFIG_USE_GUEST_BASE)
     {"B",          "QEMU_GUEST_BASE",  true,  handle_arg_guest_base,
      "address",    "set guest_base address to 'address'"},
     {"R",          "QEMU_RESERVED_VA", true,  handle_arg_reserved_va,
      "size",       "reserve 'size' bytes for guest virtual address space"},
-#endif
     {"d",          "QEMU_LOG",         true,  handle_arg_log,
      "item[,...]", "enable logging of specified items "
      "(use '-d help' for a list of items)"},
@@ -3954,7 +3948,6 @@ int main(int argc, char **argv, char **envp)
     target_environ = envlist_to_environ(envlist, NULL);
     envlist_free(envlist);
 
-#if defined(CONFIG_USE_GUEST_BASE)
     /*
      * Now that page sizes are configured in cpu_init() we can do
      * proper page alignment for guest_base.
@@ -3976,7 +3969,6 @@ int main(int argc, char **argv, char **envp)
             mmap_next_start = reserved_va;
         }
     }
-#endif /* CONFIG_USE_GUEST_BASE */
 
     /*
      * Read in mmap_min_addr kernel parameter.  This value is used
@@ -4050,9 +4042,7 @@ int main(int argc, char **argv, char **envp)
     free(target_environ);
 
     if (qemu_log_enabled()) {
-#if defined(CONFIG_USE_GUEST_BASE)
         qemu_log("guest_base  0x%lx\n", guest_base);
-#endif
         log_page_dump();
 
         qemu_log("start_brk   0x" TARGET_ABI_FMT_lx "\n", info->start_brk);
@@ -4072,12 +4062,10 @@ int main(int argc, char **argv, char **envp)
     syscall_init();
     signal_init();
 
-#if defined(CONFIG_USE_GUEST_BASE)
     /* Now that we've loaded the binary, GUEST_BASE is fixed.  Delay
        generating the prologue until now so that the prologue can take
        the real value of GUEST_BASE into account.  */
     tcg_prologue_init(&tcg_ctx);
-#endif
 
 #if defined(TARGET_I386)
     env->cr[0] = CR0_PG_MASK | CR0_WP_MASK | CR0_PE_MASK;
diff --git a/linux-user/mmap.c b/linux-user/mmap.c
index 78e1b2df43..b2126c76fa 100644
--- a/linux-user/mmap.c
+++ b/linux-user/mmap.c
@@ -206,7 +206,6 @@ abi_ulong mmap_next_start = TASK_UNMAPPED_BASE;
 
 unsigned long last_brk;
 
-#ifdef CONFIG_USE_GUEST_BASE
 /* Subroutine of mmap_find_vma, used when we have pre-allocated a chunk
    of guest address space.  */
 static abi_ulong mmap_find_vma_reserved(abi_ulong start, abi_ulong size)
@@ -216,14 +215,14 @@ static abi_ulong mmap_find_vma_reserved(abi_ulong start, abi_ulong size)
     int prot;
     int looped = 0;
 
-    if (size > RESERVED_VA) {
+    if (size > reserved_va) {
         return (abi_ulong)-1;
     }
 
     size = HOST_PAGE_ALIGN(size);
     end_addr = start + size;
-    if (end_addr > RESERVED_VA) {
-        end_addr = RESERVED_VA;
+    if (end_addr > reserved_va) {
+        end_addr = reserved_va;
     }
     addr = end_addr - qemu_host_page_size;
 
@@ -232,7 +231,7 @@ static abi_ulong mmap_find_vma_reserved(abi_ulong start, abi_ulong size)
             if (looped) {
                 return (abi_ulong)-1;
             }
-            end_addr = RESERVED_VA;
+            end_addr = reserved_va;
             addr = end_addr - qemu_host_page_size;
             looped = 1;
             continue;
@@ -253,7 +252,6 @@ static abi_ulong mmap_find_vma_reserved(abi_ulong start, abi_ulong size)
 
     return addr;
 }
-#endif
 
 /*
  * Find and reserve a free memory area of size 'size'. The search
@@ -276,11 +274,9 @@ abi_ulong mmap_find_vma(abi_ulong start, abi_ulong size)
 
     size = HOST_PAGE_ALIGN(size);
 
-#ifdef CONFIG_USE_GUEST_BASE
-    if (RESERVED_VA) {
+    if (reserved_va) {
         return mmap_find_vma_reserved(start, size);
     }
-#endif
 
     addr = start;
     wrapped = repeat = 0;
@@ -671,7 +667,7 @@ int target_munmap(abi_ulong start, abi_ulong len)
     ret = 0;
     /* unmap what we can */
     if (real_start < real_end) {
-        if (RESERVED_VA) {
+        if (reserved_va) {
             mmap_reserve(real_start, real_end - real_start);
         } else {
             ret = munmap(g2h(real_start), real_end - real_start);
@@ -701,7 +697,7 @@ abi_long target_mremap(abi_ulong old_addr, abi_ulong old_size,
                                      flags,
                                      g2h(new_addr));
 
-        if (RESERVED_VA && host_addr != MAP_FAILED) {
+        if (reserved_va && host_addr != MAP_FAILED) {
             /* If new and old addresses overlap then the above mremap will
                already have failed with EINVAL.  */
             mmap_reserve(old_addr, old_size);
@@ -719,13 +715,13 @@ abi_long target_mremap(abi_ulong old_addr, abi_ulong old_size,
                                          old_size, new_size,
                                          flags | MREMAP_FIXED,
                                          g2h(mmap_start));
-            if ( RESERVED_VA ) {
+            if (reserved_va) {
                 mmap_reserve(old_addr, old_size);
             }
         }
     } else {
         int prot = 0;
-        if (RESERVED_VA && old_size < new_size) {
+        if (reserved_va && old_size < new_size) {
             abi_ulong addr;
             for (addr = old_addr + old_size;
                  addr < old_addr + new_size;
@@ -735,7 +731,7 @@ abi_long target_mremap(abi_ulong old_addr, abi_ulong old_size,
         }
         if (prot == 0) {
             host_addr = mremap(g2h(old_addr), old_size, new_size, flags);
-            if (host_addr != MAP_FAILED && RESERVED_VA && old_size > new_size) {
+            if (host_addr != MAP_FAILED && reserved_va && old_size > new_size) {
                 mmap_reserve(old_addr + old_size, new_size - old_size);
             }
         } else {
diff --git a/monitor.c b/monitor.c
index aeea2b5f9e..fc32f12eef 100644
--- a/monitor.c
+++ b/monitor.c
@@ -678,7 +678,7 @@ static int get_str(char *buf, int buf_size, const char **pp)
                 case '\"':
                     break;
                 default:
-                    qemu_printf("unsupported escape code: '\\%c'\n", c);
+                    printf("unsupported escape code: '\\%c'\n", c);
                     goto fail;
                 }
                 if ((q - buf) < buf_size - 1) {
@@ -692,7 +692,7 @@ static int get_str(char *buf, int buf_size, const char **pp)
             }
         }
         if (*p != '\"') {
-            qemu_printf("unterminated string\n");
+            printf("unterminated string\n");
             goto fail;
         }
         p++;
diff --git a/qapi/qmp-event.c b/qapi/qmp-event.c
index 0d1ce0bd18..c0e435f994 100644
--- a/qapi/qmp-event.c
+++ b/qapi/qmp-event.c
@@ -18,14 +18,6 @@
 #include "qapi/qmp/qstring.h"
 #include "qapi/qmp/qjson.h"
 
-#ifdef _WIN32
-#include "sysemu/os-win32.h"
-#endif
-
-#ifdef CONFIG_POSIX
-#include "sysemu/os-posix.h"
-#endif
-
 static QMPEventFuncEmit qmp_emit;
 
 void qmp_event_set_func_emit(QMPEventFuncEmit emit)
diff --git a/qemu-doc.texi b/qemu-doc.texi
index 94af8c0f33..ea9b3fbfca 100644
--- a/qemu-doc.texi
+++ b/qemu-doc.texi
@@ -412,6 +412,7 @@ snapshots.
 * vm_snapshots::              VM snapshots
 * qemu_img_invocation::       qemu-img Invocation
 * qemu_nbd_invocation::       qemu-nbd Invocation
+* qemu_ga_invocation::        qemu-ga Invocation
 * disk_images_formats::       Disk image file formats
 * host_drives::               Using host drives
 * disk_images_fat_images::    Virtual FAT disk images
@@ -505,6 +506,11 @@ state is not saved or restored properly (in particular USB).
 
 @include qemu-nbd.texi
 
+@node qemu_ga_invocation
+@subsection @code{qemu-ga} Invocation
+
+@include qemu-ga.texi
+
 @node disk_images_formats
 @subsection Disk image file formats
 
@@ -3021,9 +3027,63 @@ and all other qemu-system-@var{target}.exe compiled for Win32.
 @node Mac OS X
 @section Mac OS X
 
-The Mac OS X patches are not fully merged in QEMU, so you should look
-at the QEMU mailing list archive to have all the necessary
-information.
+System Requirements:
+@itemize
+@item Mac OS 10.5 or higher
+@item The clang compiler shipped with Xcode 4.2 or higher,
+or GCC 4.3 or higher
+@end itemize
+
+Additional Requirements (install in order):
+@enumerate
+@item libffi: @uref{https://sourceware.org/libffi/}
+@item gettext: @uref{http://www.gnu.org/software/gettext/}
+@item glib: @uref{http://ftp.gnome.org/pub/GNOME/sources/glib/}
+@item pkg-config: @uref{http://www.freedesktop.org/wiki/Software/pkg-config/}
+@item autoconf: @uref{http://www.gnu.org/software/autoconf/autoconf.html}
+@item automake: @uref{http://www.gnu.org/software/automake/}
+@item libtool: @uref{http://www.gnu.org/software/libtool/}
+@item pixman: @uref{http://www.pixman.org/}
+@end enumerate
+
+You may find it easiest to get these from a third-party packager
+such as Homebrew, Macports, or Fink.
+
+After downloading the QEMU source code, double-click it to expand it.
+
+Then configure and make QEMU:
+@example
+./configure
+make
+@end example
+
+If you have a recent version of Mac OS X (OS X 10.7 or later
+with Xcode 4.2 or later), we recommend building QEMU with the
+default compiler provided by Apple for your version of Mac OS X
+(which will be 'clang'). The configure script will
+pick this automatically.
+
+Note: If after the configure step you see a message like this:
+@example
+ERROR: Your compiler does not support the __thread specifier for
+       Thread-Local Storage (TLS). Please upgrade to a version that does.
+@end example
+you may have to build your own version of gcc from source. Expect that to take
+several hours. More information can be found here:
+@uref{https://gcc.gnu.org/install/} @*
+
+These are some of the third-party binaries of GCC available for download:
+@itemize
+@item Homebrew: @uref{http://brew.sh/}
+@item @uref{https://www.litebeam.net/gcc/gcc_472.pkg}
+@item @uref{http://www.macports.org/ports.php?by=name&substr=gcc}
+@end itemize
+
+You can have several versions of GCC on your system. To specify a certain version,
+use the --cc and --cxx options.
+@example
+./configure --cxx=<path of your c++ compiler> --cc=<path of your c compiler> <other options>
+@end example
 
 @node Make targets
 @section Make targets
diff --git a/qemu-ga.texi b/qemu-ga.texi
new file mode 100644
index 0000000000..536a9b5241
--- /dev/null
+++ b/qemu-ga.texi
@@ -0,0 +1,137 @@
+@example
+@c man begin SYNOPSIS
+usage: qemu-ga [OPTIONS]
+@c man end
+@end example
+
+@c man begin DESCRIPTION
+
+The QEMU Guest Agent is a daemon intended to be run within virtual
+machines. It allows the hypervisor host to perform various operations
+in the guest, such as:
+
+@itemize
+@item
+get information from the guest
+@item
+set the guest's system time
+@item
+read/write a file
+@item
+sync and freeze the filesystems
+@item
+suspend the guest
+@item
+reconfigure guest local processors
+@item
+set user's password
+@item
+...
+@end itemize
+
+qemu-ga will read a system configuration file on startup (located at
+@file{/etc/qemu/qemu-ga.conf} by default), then parse remaining
+configuration options on the command line. For the same key, the last
+option wins, but the lists accumulate (see below for the configuration
+file format).
+
+@c man end
+
+@c man begin OPTIONS
+@table @option
+@item -m, --method=@var{method}
+  Transport method: one of @samp{unix-listen}, @samp{virtio-serial}, or
+  @samp{isa-serial} (@samp{virtio-serial} is the default).
+
+@item -p, --path=@var{path}
+  Device/socket path (the default for virtio-serial is
+  @samp{/dev/virtio-ports/org.qemu.guest_agent.0},
+  the default for isa-serial is @samp{/dev/ttyS0}).
+
+@item -l, --logfile=@var{path}
+  Set log file path (default is stderr).
+
+@item -f, --pidfile=@var{path}
+  Specify pid file (default is @samp{/var/run/qemu-ga.pid}).
+
+@item -F, --fsfreeze-hook=@var{path}
+  Enable the fsfreeze hook. Accepts an optional argument that specifies
+  the script to run on freeze/thaw. The script will be called with
+  'freeze'/'thaw' arguments accordingly (default is
+  @samp{/etc/qemu/fsfreeze-hook}). If using -F with an argument, do
+  not follow -F with a space (for example:
+  @samp{-F/var/run/fsfreezehook.sh}).
+
+@item -t, --statedir=@var{path}
+  Specify the directory to store state information (absolute paths only,
+  default is @samp{/var/run}).
+
+@item -v, --verbose
+  Log extra debugging information.
+
+@item -V, --version
+  Print version information and exit.
+
+@item -d, --daemon
+  Daemonize after startup (detach from terminal).
+
+@item -b, --blacklist=@var{list}
+  Comma-separated list of RPCs to disable (no spaces, @samp{?} to list
+  available RPCs).
+
+@item -D, --dump-conf
+  Dump the configuration in a format compatible with @file{qemu-ga.conf}
+  and exit.
+
+@item -h, --help
+  Display this help and exit.
+@end table
+
+@c man end
+
+@c man begin FILES
+
+The syntax of the @file{qemu-ga.conf} configuration file follows the
+Desktop Entry Specification; in short, it consists of
+groups of key-value pairs, interspersed with comments.
+
+@example
+# qemu-ga configuration sample
+[general]
+daemon = 0
+pidfile = /var/run/qemu-ga.pid
+verbose = 0
+method = virtio-serial
+path = /dev/virtio-ports/org.qemu.guest_agent.0
+statedir = /var/run
+@end example
+
+The list of keys follows the command line options:
+@table @option
+@item daemon= boolean
+@item method= string
+@item path= string
+@item logfile= string
+@item pidfile= string
+@item fsfreeze-hook= string
+@item statedir= string
+@item verbose= boolean
+@item blacklist= string list
+@end table
+
+@c man end
+
+@ignore
+
+@setfilename qemu-ga
+@settitle QEMU Guest Agent
+
+@c man begin AUTHOR
+Michael Roth <mdroth@linux.vnet.ibm.com>
+@c man end
+
+@c man begin SEEALSO
+qemu(1)
+@c man end
+
+@end ignore
diff --git a/qemu-nbd.c b/qemu-nbd.c
index 5106b802e6..d9644b2431 100644
--- a/qemu-nbd.c
+++ b/qemu-nbd.c
@@ -362,7 +362,6 @@ static void nbd_client_closed(NBDClient *client)
         state = TERMINATE;
     }
     nbd_update_server_fd_handler(server_fd);
-    qemu_notify_event();
     nbd_client_put(client);
 }
 
diff --git a/qga/commands-posix.c b/qga/commands-posix.c
index 675f4b4c66..fc4fc727f7 100644
--- a/qga/commands-posix.c
+++ b/qga/commands-posix.c
@@ -2454,7 +2454,7 @@ GList *ga_command_blacklist_init(GList *blacklist)
         char **p = (char **)list;
 
         while (*p) {
-            blacklist = g_list_append(blacklist, *p++);
+            blacklist = g_list_append(blacklist, g_strdup(*p++));
         }
     }
 #endif
@@ -2468,13 +2468,13 @@ GList *ga_command_blacklist_init(GList *blacklist)
         char **p = (char **)list;
 
         while (*p) {
-            blacklist = g_list_append(blacklist, *p++);
+            blacklist = g_list_append(blacklist, g_strdup(*p++));
         }
     }
 #endif
 
 #if !defined(CONFIG_FSTRIM)
-    blacklist = g_list_append(blacklist, (char *)"guest-fstrim");
+    blacklist = g_list_append(blacklist, g_strdup("guest-fstrim"));
 #endif
 
     return blacklist;
diff --git a/qga/commands-win32.c b/qga/commands-win32.c
index a7822d5ff7..cbee18644b 100644
--- a/qga/commands-win32.c
+++ b/qga/commands-win32.c
@@ -26,6 +26,8 @@
 #include <setupapi.h>
 #include <initguid.h>
 #endif
+#include <lm.h>
+
 #include "qga/guest-agent-core.h"
 #include "qga/vss-win32.h"
 #include "qga-qmp-commands.h"
@@ -1192,12 +1194,84 @@ int64_t qmp_guest_set_vcpus(GuestLogicalProcessorList *vcpus, Error **errp)
     return -1;
 }
 
+static gchar *
+get_net_error_message(gint error)
+{
+    HMODULE module = NULL;
+    gchar *retval = NULL;
+    wchar_t *msg = NULL;
+    int flags, nchars;
+
+    flags = FORMAT_MESSAGE_ALLOCATE_BUFFER
+        |FORMAT_MESSAGE_IGNORE_INSERTS
+        |FORMAT_MESSAGE_FROM_SYSTEM;
+
+    if (error >= NERR_BASE && error <= MAX_NERR) {
+        module = LoadLibraryExW(L"netmsg.dll", NULL, LOAD_LIBRARY_AS_DATAFILE);
+
+        if (module != NULL) {
+            flags |= FORMAT_MESSAGE_FROM_HMODULE;
+        }
+    }
+
+    FormatMessageW(flags, module, error, 0, (LPWSTR)&msg, 0, NULL);
+
+    if (msg != NULL) {
+        nchars = wcslen(msg);
+
+        if (nchars > 2 && msg[nchars-1] == '\n' && msg[nchars-2] == '\r') {
+            msg[nchars-2] = '\0';
+        }
+
+        retval = g_utf16_to_utf8(msg, -1, NULL, NULL, NULL);
+
+        LocalFree(msg);
+    }
+
+    if (module != NULL) {
+        FreeLibrary(module);
+    }
+
+    return retval;
+}
+
 void qmp_guest_set_user_password(const char *username,
                                  const char *password,
                                  bool crypted,
                                  Error **errp)
 {
-    error_setg(errp, QERR_UNSUPPORTED);
+    NET_API_STATUS nas;
+    char *rawpasswddata = NULL;
+    size_t rawpasswdlen;
+    wchar_t *user, *wpass;
+    USER_INFO_1003 pi1003 = { 0, };
+
+    if (crypted) {
+        error_setg(errp, QERR_UNSUPPORTED);
+        return;
+    }
+
+    rawpasswddata = (char *)g_base64_decode(password, &rawpasswdlen);
+    rawpasswddata = g_renew(char, rawpasswddata, rawpasswdlen + 1);
+    rawpasswddata[rawpasswdlen] = '\0';
+
+    user = g_utf8_to_utf16(username, -1, NULL, NULL, NULL);
+    wpass = g_utf8_to_utf16(rawpasswddata, -1, NULL, NULL, NULL);
+
+    pi1003.usri1003_password = wpass;
+    nas = NetUserSetInfo(NULL, user,
+                         1003, (LPBYTE)&pi1003,
+                         NULL);
+
+    if (nas != NERR_Success) {
+        gchar *msg = get_net_error_message(nas);
+        error_setg(errp, "failed to set password: %s", msg);
+        g_free(msg);
+    }
+
+    g_free(user);
+    g_free(wpass);
+    g_free(rawpasswddata);
 }
 
 GuestMemoryBlockList *qmp_guest_get_memory_blocks(Error **errp)
@@ -1225,7 +1299,6 @@ GList *ga_command_blacklist_init(GList *blacklist)
     const char *list_unsupported[] = {
         "guest-suspend-hybrid",
         "guest-get-vcpus", "guest-set-vcpus",
-        "guest-set-user-password",
         "guest-get-memory-blocks", "guest-set-memory-blocks",
         "guest-get-memory-block-size",
         "guest-fsfreeze-freeze-list",
@@ -1233,7 +1306,7 @@ GList *ga_command_blacklist_init(GList *blacklist)
     char **p = (char **)list_unsupported;
 
     while (*p) {
-        blacklist = g_list_append(blacklist, *p++);
+        blacklist = g_list_append(blacklist, g_strdup(*p++));
     }
 
     if (!vss_init(true)) {
@@ -1244,7 +1317,7 @@ GList *ga_command_blacklist_init(GList *blacklist)
         p = (char **)list;
 
         while (*p) {
-            blacklist = g_list_append(blacklist, *p++);
+            blacklist = g_list_append(blacklist, g_strdup(*p++));
         }
     }
 
diff --git a/qga/installer/qemu-ga.wxs b/qga/installer/qemu-ga.wxs
index 2c43f1b5aa..6804f0279f 100644
--- a/qga/installer/qemu-ga.wxs
+++ b/qga/installer/qemu-ga.wxs
@@ -42,7 +42,7 @@
   <Product
     Name="QEMU guest agent"
     Id="*"
-    UpgradeCode="{EB6B8302-C06E-4bec-ADAC-932C68A3A98D}"
+    UpgradeCode="{EB6B8302-C06E-4BEC-ADAC-932C68A3A98D}"
     Manufacturer="$(env.QEMU_GA_MANUFACTURER)"
     Version="$(env.QEMU_GA_VERSION)"
     Language="1033">
@@ -58,29 +58,15 @@
       />
     <Media Id="1" Cabinet="qemu_ga.$(env.QEMU_GA_VERSION).cab" EmbedCab="yes" />
     <Property Id="WHSLogo">1</Property>
-    <Property Id="PREVIOUSVERSIONSINSTALLED" />
-    <Upgrade Id="{EB6B8302-C06E-4bec-ADAC-932C68A3A98D}">
-      <UpgradeVersion
-        Minimum="1.0.0.0" Maximum="$(env.QEMU_GA_VERSION)"
-        Property="PREVIOUSVERSIONSINSTALLED"
-        IncludeMinimum="yes" IncludeMaximum="no" />
-    </Upgrade>
+    <MajorUpgrade
+      DowngradeErrorMessage="Error: A newer version of QEMU guest agent is already installed."
+      />
 
     <Directory Id="TARGETDIR" Name="SourceDir">
       <Directory Id="$(var.GaProgramFilesFolder)" Name="QEMU Guest Agent">
         <Directory Id="qemu_ga_directory" Name="Qemu-ga">
-          <Component Id="qemu_ga" Guid="{908B7199-DE2A-4dc6-A8D0-27A5AE444FEA}">
-            <File Id="qemu_ga.exe" Name="qemu-ga.exe" Source="../../qemu-ga.exe" KeyPath="yes" DiskId="1"/>
-            <?ifdef var.InstallVss ?>
-            <File Id="qga_vss.dll" Name="qga-vss.dll" Source="../vss-win32/qga-vss.dll" KeyPath="no" DiskId="1"/>
-            <File Id="qga_vss.tlb" Name="qga-vss.tlb" Source="../vss-win32/qga-vss.tlb" KeyPath="no" DiskId="1"/>
-            <?endif?>
-            <File Id="iconv.dll" Name="iconv.dll" Source="$(var.Mingw_bin)/iconv.dll" KeyPath="no" DiskId="1"/>
-            <File Id="libgcc_arch_lib" Name="$(var.ArchLib)" Source="$(var.Mingw_bin)/$(var.ArchLib)" KeyPath="no" DiskId="1"/>
-            <File Id="libglib_2.0_0.dll" Name="libglib-2.0-0.dll" Source="$(var.Mingw_bin)/libglib-2.0-0.dll" KeyPath="no" DiskId="1"/>
-            <File Id="libintl_8.dll" Name="libintl-8.dll" Source="$(var.Mingw_bin)/libintl-8.dll" KeyPath="no" DiskId="1"/>
-            <File Id="libssp_0.dll" Name="libssp-0.dll" Source="$(var.Mingw_bin)/libssp-0.dll" KeyPath="no" DiskId="1"/>
-            <File Id="libwinpthread_1.dll" Name="libwinpthread-1.dll" Source="$(var.Mingw_bin)/libwinpthread-1.dll" KeyPath="no" DiskId="1"/>
+          <Component Id="qemu_ga" Guid="{908B7199-DE2A-4DC6-A8D0-27A5AE444FEA}">
+            <File Id="qemu_ga.exe" Name="qemu-ga.exe" Source="$(env.BUILD_DIR)/qemu-ga.exe" KeyPath="yes" DiskId="1"/>
             <ServiceInstall
               Id="ServiceInstaller"
               Type="ownProcess"
@@ -97,8 +83,33 @@
             </ServiceInstall>
             <ServiceControl Id="StartService" Start="install" Stop="both" Remove="uninstall" Name="QEMU-GA" Wait="no" />
           </Component>
-
-          <Component Id="registry_entries" Guid="d075d109-51ca-11e3-9f8b-000c29858960">
+          <?ifdef var.InstallVss?>
+          <Component Id="qga_vss_dll" Guid="{CB19C453-FABB-4BB1-ABAB-6B74F687BFBB}">
+            <File Id="qga_vss.dll" Name="qga-vss.dll" Source="$(env.BUILD_DIR)/qga/vss-win32/qga-vss.dll" KeyPath="yes" DiskId="1"/>
+          </Component>
+          <Component Id="qga_vss_tlb" Guid="{D8D584B1-59C2-4FB7-A91F-636FF7BFA66E}">
+            <File Id="qga_vss.tlb" Name="qga-vss.tlb" Source="$(env.BUILD_DIR)/qga/vss-win32/qga-vss.tlb" KeyPath="yes" DiskId="1"/>
+          </Component>
+          <?endif?>
+          <Component Id="iconv" Guid="{35EE3558-D34B-4F0A-B8BD-430FF0775246}">
+            <File Id="iconv.dll" Name="iconv.dll" Source="$(var.Mingw_bin)/iconv.dll" KeyPath="yes" DiskId="1"/>
+          </Component>
+          <Component Id="libgcc_arch_lib" Guid="{ADD4D07D-4515-4AB6-AF3E-C904961B4BB0}">
+            <File Id="libgcc_arch_lib" Name="$(var.ArchLib)" Source="$(var.Mingw_bin)/$(var.ArchLib)" KeyPath="yes" DiskId="1"/>
+          </Component>
+          <Component Id="libglib" Guid="{D31BFD83-2773-4B65-B45A-E0D2ADA58679}">
+            <File Id="libglib_2.0_0.dll" Name="libglib-2.0-0.dll" Source="$(var.Mingw_bin)/libglib-2.0-0.dll" KeyPath="yes" DiskId="1"/>
+          </Component>
+          <Component Id="libintl" Guid="{A641BC2D-A907-4A94-9149-F30ED430878F}">
+            <File Id="libintl_8.dll" Name="libintl-8.dll" Source="$(var.Mingw_bin)/libintl-8.dll" KeyPath="yes" DiskId="1"/>
+          </Component>
+          <Component Id="libssp" Guid="{7880087B-02B4-4EF6-A5D3-D18F8E3D90E1}">
+            <File Id="libssp_0.dll" Name="libssp-0.dll" Source="$(var.Mingw_bin)/libssp-0.dll" KeyPath="yes" DiskId="1"/>
+          </Component>
+          <Component Id="libwinpthread" Guid="{6C117C78-0F47-4B07-8F34-6BEE11643829}">
+            <File Id="libwinpthread_1.dll" Name="libwinpthread-1.dll" Source="$(var.Mingw_bin)/libwinpthread-1.dll" KeyPath="yes" DiskId="1"/>
+          </Component>
+          <Component Id="registry_entries" Guid="{D075D109-51CA-11E3-9F8B-000C29858960}">
             <RegistryKey Root="HKLM"
                          Key="Software\$(env.QEMU_GA_MANUFACTURER)\$(env.QEMU_GA_DISTRO)\Tools\QemuGA">
               <RegistryValue Type="string" Name="ProductID" Value="fb0a0d66-c7fb-4e2e-a16b-c4a3bfe8d13b" />
@@ -110,10 +121,11 @@
     </Directory>
 
     <Property Id="cmd" Value="cmd.exe"/>
+    <Property Id="REINSTALLMODE" Value="amus"/>
 
-    <?ifdef var.InstallVss ?>
+    <?ifdef var.InstallVss?>
     <CustomAction Id="RegisterCom"
-             ExeCommand='/c "[qemu_ga_directory]qemu-ga.exe" -s vss-install'
+              ExeCommand='/c "[qemu_ga_directory]qemu-ga.exe" -s vss-install'
               Execute="deferred"
               Property="cmd"
               Impersonate="no"
@@ -126,19 +138,29 @@
               Property="cmd"
               Impersonate="no"
               Return="check"
-              ></CustomAction>
+              >
+    </CustomAction>
     <?endif?>
 
     <Feature Id="QEMUFeature" Title="QEMU Guest Agent" Level="1">
       <ComponentRef Id="qemu_ga" />
+      <?ifdef var.InstallVss?>
+      <ComponentRef Id="qga_vss_dll" />
+      <ComponentRef Id="qga_vss_tlb" />
+      <?endif?>
+      <ComponentRef Id="iconv" />
+      <ComponentRef Id="libgcc_arch_lib" />
+      <ComponentRef Id="libglib" />
+      <ComponentRef Id="libintl" />
+      <ComponentRef Id="libssp" />
+      <ComponentRef Id="libwinpthread" />
       <ComponentRef Id="registry_entries" />
     </Feature>
 
     <InstallExecuteSequence>
-      <RemoveExistingProducts Before="InstallInitialize" />
-      <?ifdef var.InstallVss ?>
-      <Custom Action="RegisterCom" After="InstallServices">NOT Installed</Custom>
+      <?ifdef var.InstallVss?>
       <Custom Action="UnRegisterCom" After="StopServices">Installed</Custom>
+      <Custom Action="RegisterCom" After="InstallServices">NOT REMOVE</Custom>
       <?endif?>
     </InstallExecuteSequence>
   </Product>
diff --git a/qga/main.c b/qga/main.c
index 791982ef01..d8e063a4a3 100644
--- a/qga/main.c
+++ b/qga/main.c
@@ -56,6 +56,7 @@
 #define QGA_FSFREEZE_HOOK_DEFAULT CONFIG_QEMU_CONFDIR "/fsfreeze-hook"
 #endif
 #define QGA_SENTINEL_BYTE 0xFF
+#define QGA_CONF_DEFAULT CONFIG_QEMU_CONFDIR G_DIR_SEPARATOR_S "qemu-ga.conf"
 
 static struct {
     const char *state_dir;
@@ -82,7 +83,7 @@ struct GAState {
     bool delimit_response;
     bool frozen;
     GList *blacklist;
-    const char *state_filepath_isfrozen;
+    char *state_filepath_isfrozen;
     struct {
         const char *log_filepath;
         const char *pid_filepath;
@@ -90,7 +91,7 @@ struct GAState {
 #ifdef CONFIG_FSFREEZE
     const char *fsfreeze_hook;
 #endif
-    const gchar *pstate_filepath;
+    gchar *pstate_filepath;
     GAPersistentState pstate;
 };
 
@@ -215,6 +216,8 @@ static void usage(const char *cmd)
 #endif
 "  -b, --blacklist   comma-separated list of RPCs to disable (no spaces, \"?\"\n"
 "                    to list available RPCs)\n"
+"  -D, --dump-conf   dump a qemu-ga config file based on current config\n"
+"                    options / command-line parameters to stdout\n"
 "  -h, --help        display this help and exit\n"
 "\n"
 "Report bugs to <mdroth@linux.vnet.ibm.com>\n"
@@ -658,23 +661,6 @@ static gboolean channel_init(GAState *s, const gchar *method, const gchar *path)
 {
     GAChannelMethod channel_method;
 
-    if (method == NULL) {
-        method = "virtio-serial";
-    }
-
-    if (path == NULL) {
-        if (strcmp(method, "virtio-serial") == 0 ) {
-            /* try the default path for the virtio-serial port */
-            path = QGA_VIRTIO_PATH_DEFAULT;
-        } else if (strcmp(method, "isa-serial") == 0){
-            /* try the default path for the serial port - COM1 */
-            path = QGA_SERIAL_PATH_DEFAULT;
-        } else {
-            g_critical("must specify a path for this channel");
-            return false;
-        }
-    }
-
     if (strcmp(method, "virtio-serial") == 0) {
         s->virtio = true; /* virtio requires special handling in some cases */
         channel_method = GA_CHANNEL_VIRTIO_SERIAL;
@@ -921,22 +907,164 @@ static void ga_print_cmd(QmpCommand *cmd, void *opaque)
     printf("%s\n", qmp_command_name(cmd));
 }
 
-int main(int argc, char **argv)
+static GList *split_list(const gchar *str, const gchar *delim)
 {
-    const char *sopt = "hVvdm:p:l:f:F::b:s:t:";
-    const char *method = NULL, *path = NULL;
-    const char *log_filepath = NULL;
-    const char *pid_filepath;
+    GList *list = NULL;
+    int i;
+    gchar **strv;
+
+    strv = g_strsplit(str, delim, -1);
+    for (i = 0; strv[i]; i++) {
+        list = g_list_prepend(list, strv[i]);
+    }
+    g_free(strv);
+
+    return list;
+}
+
+typedef struct GAConfig {
+    char *channel_path;
+    char *method;
+    char *log_filepath;
+    char *pid_filepath;
 #ifdef CONFIG_FSFREEZE
-    const char *fsfreeze_hook = NULL;
+    char *fsfreeze_hook;
 #endif
-    const char *state_dir;
+    char *state_dir;
 #ifdef _WIN32
-    const char *service = NULL;
+    const char *service;
+#endif
+    gchar *bliststr; /* blacklist may point to this string */
+    GList *blacklist;
+    int daemonize;
+    GLogLevelFlags log_level;
+    int dumpconf;
+} GAConfig;
+
+static void config_load(GAConfig *config)
+{
+    GError *gerr = NULL;
+    GKeyFile *keyfile;
+
+    /* read system config */
+    keyfile = g_key_file_new();
+    if (!g_key_file_load_from_file(keyfile, QGA_CONF_DEFAULT, 0, &gerr)) {
+        goto end;
+    }
+    if (g_key_file_has_key(keyfile, "general", "daemon", NULL)) {
+        config->daemonize =
+            g_key_file_get_boolean(keyfile, "general", "daemon", &gerr);
+    }
+    if (g_key_file_has_key(keyfile, "general", "method", NULL)) {
+        config->method =
+            g_key_file_get_string(keyfile, "general", "method", &gerr);
+    }
+    if (g_key_file_has_key(keyfile, "general", "path", NULL)) {
+        config->channel_path =
+            g_key_file_get_string(keyfile, "general", "path", &gerr);
+    }
+    if (g_key_file_has_key(keyfile, "general", "logfile", NULL)) {
+        config->log_filepath =
+            g_key_file_get_string(keyfile, "general", "logfile", &gerr);
+    }
+    if (g_key_file_has_key(keyfile, "general", "pidfile", NULL)) {
+        config->pid_filepath =
+            g_key_file_get_string(keyfile, "general", "pidfile", &gerr);
+    }
+#ifdef CONFIG_FSFREEZE
+    if (g_key_file_has_key(keyfile, "general", "fsfreeze-hook", NULL)) {
+        config->fsfreeze_hook =
+            g_key_file_get_string(keyfile,
+                                  "general", "fsfreeze-hook", &gerr);
+    }
+#endif
+    if (g_key_file_has_key(keyfile, "general", "statedir", NULL)) {
+        config->state_dir =
+            g_key_file_get_string(keyfile, "general", "statedir", &gerr);
+    }
+    if (g_key_file_has_key(keyfile, "general", "verbose", NULL) &&
+        g_key_file_get_boolean(keyfile, "general", "verbose", &gerr)) {
+        /* enable all log levels */
+        config->log_level = G_LOG_LEVEL_MASK;
+    }
+    if (g_key_file_has_key(keyfile, "general", "blacklist", NULL)) {
+        config->bliststr =
+            g_key_file_get_string(keyfile, "general", "blacklist", &gerr);
+        config->blacklist = g_list_concat(config->blacklist,
+                                          split_list(config->bliststr, ","));
+    }
+
+end:
+    g_key_file_free(keyfile);
+    if (gerr &&
+        !(gerr->domain == G_FILE_ERROR && gerr->code == G_FILE_ERROR_NOENT)) {
+        g_critical("error loading configuration from path: %s, %s",
+                   QGA_CONF_DEFAULT, gerr->message);
+        exit(EXIT_FAILURE);
+    }
+    g_clear_error(&gerr);
+}
+
+static gchar *list_join(GList *list, const gchar separator)
+{
+    GString *str = g_string_new("");
+
+    while (list) {
+        str = g_string_append(str, (gchar *)list->data);
+        list = g_list_next(list);
+        if (list) {
+            str = g_string_append_c(str, separator);
+        }
+    }
+
+    return g_string_free(str, FALSE);
+}
+
+static void config_dump(GAConfig *config)
+{
+    GError *error = NULL;
+    GKeyFile *keyfile;
+    gchar *tmp;
+
+    keyfile = g_key_file_new();
+    g_assert(keyfile);
+
+    g_key_file_set_boolean(keyfile, "general", "daemon", config->daemonize);
+    g_key_file_set_string(keyfile, "general", "method", config->method);
+    g_key_file_set_string(keyfile, "general", "path", config->channel_path);
+    if (config->log_filepath) {
+        g_key_file_set_string(keyfile, "general", "logfile",
+                              config->log_filepath);
+    }
+    g_key_file_set_string(keyfile, "general", "pidfile", config->pid_filepath);
+#ifdef CONFIG_FSFREEZE
+    if (config->fsfreeze_hook) {
+        g_key_file_set_string(keyfile, "general", "fsfreeze-hook",
+                              config->fsfreeze_hook);
+    }
 #endif
+    g_key_file_set_string(keyfile, "general", "statedir", config->state_dir);
+    g_key_file_set_boolean(keyfile, "general", "verbose",
+                           config->log_level == G_LOG_LEVEL_MASK);
+    tmp = list_join(config->blacklist, ',');
+    g_key_file_set_string(keyfile, "general", "blacklist", tmp);
+    g_free(tmp);
+
+    tmp = g_key_file_to_data(keyfile, NULL, &error);
+    printf("%s", tmp);
+
+    g_free(tmp);
+    g_key_file_free(keyfile);
+}
+
+static void config_parse(GAConfig *config, int argc, char **argv)
+{
+    const char *sopt = "hVvdm:p:l:f:F::b:s:t:D";
+    int opt_ind = 0, ch;
     const struct option lopt[] = {
         { "help", 0, NULL, 'h' },
         { "version", 0, NULL, 'V' },
+        { "dump-conf", 0, NULL, 'D' },
         { "logfile", 1, NULL, 'l' },
         { "pidfile", 1, NULL, 'f' },
 #ifdef CONFIG_FSFREEZE
@@ -953,141 +1081,115 @@ int main(int argc, char **argv)
         { "statedir", 1, NULL, 't' },
         { NULL, 0, NULL, 0 }
     };
-    int opt_ind = 0, ch, daemonize = 0, i, j, len;
-    GLogLevelFlags log_level = G_LOG_LEVEL_ERROR | G_LOG_LEVEL_CRITICAL;
-    GList *blacklist = NULL;
-    GAState *s;
-
-    module_call_init(MODULE_INIT_QAPI);
 
-    init_dfl_pathnames();
-    pid_filepath = dfl_pathnames.pidfile;
-    state_dir = dfl_pathnames.state_dir;
+    config->log_level = G_LOG_LEVEL_ERROR | G_LOG_LEVEL_CRITICAL;
 
     while ((ch = getopt_long(argc, argv, sopt, lopt, &opt_ind)) != -1) {
         switch (ch) {
         case 'm':
-            method = optarg;
+            g_free(config->method);
+            config->method = g_strdup(optarg);
             break;
         case 'p':
-            path = optarg;
+            g_free(config->channel_path);
+            config->channel_path = g_strdup(optarg);
             break;
         case 'l':
-            log_filepath = optarg;
+            g_free(config->log_filepath);
+            config->log_filepath = g_strdup(optarg);
             break;
         case 'f':
-            pid_filepath = optarg;
+            g_free(config->pid_filepath);
+            config->pid_filepath = g_strdup(optarg);
             break;
 #ifdef CONFIG_FSFREEZE
         case 'F':
-            fsfreeze_hook = optarg ? optarg : QGA_FSFREEZE_HOOK_DEFAULT;
+            g_free(config->fsfreeze_hook);
+            config->fsfreeze_hook = g_strdup(optarg ?: QGA_FSFREEZE_HOOK_DEFAULT);
             break;
 #endif
         case 't':
-             state_dir = optarg;
-             break;
+            g_free(config->state_dir);
+            config->state_dir = g_strdup(optarg);
+            break;
         case 'v':
             /* enable all log levels */
-            log_level = G_LOG_LEVEL_MASK;
+            config->log_level = G_LOG_LEVEL_MASK;
             break;
         case 'V':
             printf("QEMU Guest Agent %s\n", QEMU_VERSION);
-            return 0;
+            exit(EXIT_SUCCESS);
         case 'd':
-            daemonize = 1;
+            config->daemonize = 1;
+            break;
+        case 'D':
+            config->dumpconf = 1;
             break;
         case 'b': {
             if (is_help_option(optarg)) {
                 qmp_for_each_command(ga_print_cmd, NULL);
-                return 0;
-            }
-            for (j = 0, i = 0, len = strlen(optarg); i < len; i++) {
-                if (optarg[i] == ',') {
-                    optarg[i] = 0;
-                    blacklist = g_list_append(blacklist, &optarg[j]);
-                    j = i + 1;
-                }
-            }
-            if (j < i) {
-                blacklist = g_list_append(blacklist, &optarg[j]);
+                exit(EXIT_SUCCESS);
             }
+            config->blacklist = g_list_concat(config->blacklist,
+                                             split_list(optarg, ","));
             break;
         }
 #ifdef _WIN32
         case 's':
-            service = optarg;
-            if (strcmp(service, "install") == 0) {
-                const char *fixed_state_dir;
-
-                /* If the user passed the "-t" option, we save that state dir
-                 * in the service. Otherwise we let the service fetch the state
-                 * dir from the environment when it starts.
-                 */
-                fixed_state_dir = (state_dir == dfl_pathnames.state_dir) ?
-                                  NULL :
-                                  state_dir;
+            config->service = optarg;
+            if (strcmp(config->service, "install") == 0) {
                 if (ga_install_vss_provider()) {
-                    return EXIT_FAILURE;
+                    exit(EXIT_FAILURE);
                 }
-                if (ga_install_service(path, log_filepath, fixed_state_dir)) {
-                    return EXIT_FAILURE;
+                if (ga_install_service(config->channel_path,
+                                       config->log_filepath, config->state_dir)) {
+                    exit(EXIT_FAILURE);
                 }
-                return 0;
-            } else if (strcmp(service, "uninstall") == 0) {
+                exit(EXIT_SUCCESS);
+            } else if (strcmp(config->service, "uninstall") == 0) {
                 ga_uninstall_vss_provider();
-                return ga_uninstall_service();
-            } else if (strcmp(service, "vss-install") == 0) {
+                exit(ga_uninstall_service());
+            } else if (strcmp(config->service, "vss-install") == 0) {
                 if (ga_install_vss_provider()) {
-                    return EXIT_FAILURE;
+                    exit(EXIT_FAILURE);
                 }
-                return EXIT_SUCCESS;
-            } else if (strcmp(service, "vss-uninstall") == 0) {
+                exit(EXIT_SUCCESS);
+            } else if (strcmp(config->service, "vss-uninstall") == 0) {
                 ga_uninstall_vss_provider();
-                return EXIT_SUCCESS;
+                exit(EXIT_SUCCESS);
             } else {
                 printf("Unknown service command.\n");
-                return EXIT_FAILURE;
+                exit(EXIT_FAILURE);
             }
             break;
 #endif
         case 'h':
             usage(argv[0]);
-            return 0;
+            exit(EXIT_SUCCESS);
         case '?':
             g_print("Unknown option, try '%s --help' for more information.\n",
                     argv[0]);
-            return EXIT_FAILURE;
+            exit(EXIT_FAILURE);
         }
     }
+}
 
-#ifdef _WIN32
-    /* On win32 the state directory is application specific (be it the default
-     * or a user override). We got past the command line parsing; let's create
-     * the directory (with any intermediate directories). If we run into an
-     * error later on, we won't try to clean up the directory, it is considered
-     * persistent.
-     */
-    if (g_mkdir_with_parents(state_dir, S_IRWXU) == -1) {
-        g_critical("unable to create (an ancestor of) the state directory"
-                   " '%s': %s", state_dir, strerror(errno));
-        return EXIT_FAILURE;
-    }
-#endif
-
-    s = g_malloc0(sizeof(GAState));
-    s->log_level = log_level;
-    s->log_file = stderr;
+static void config_free(GAConfig *config)
+{
+    g_free(config->method);
+    g_free(config->log_filepath);
+    g_free(config->pid_filepath);
+    g_free(config->state_dir);
+    g_free(config->channel_path);
+    g_free(config->bliststr);
 #ifdef CONFIG_FSFREEZE
-    s->fsfreeze_hook = fsfreeze_hook;
+    g_free(config->fsfreeze_hook);
 #endif
-    g_log_set_default_handler(ga_log, s);
-    g_log_set_fatal_mask(NULL, G_LOG_LEVEL_ERROR);
-    ga_enable_logging(s);
-    s->state_filepath_isfrozen = g_strdup_printf("%s/qga.state.isfrozen",
-                                                 state_dir);
-    s->pstate_filepath = g_strdup_printf("%s/qga.state", state_dir);
-    s->frozen = false;
+    g_free(config);
+}
 
+static bool check_is_frozen(GAState *s)
+{
 #ifndef _WIN32
     /* check if a previous instance of qemu-ga exited with filesystems' state
      * marked as frozen. this could be a stale value (a non-qemu-ga process
@@ -1113,32 +1215,56 @@ int main(int argc, char **argv)
                   " guest-fsfreeze-thaw is issued, or filesystems are"
                   " manually unfrozen and the file %s is removed",
                   s->state_filepath_isfrozen);
-        s->frozen = true;
+        return true;
+    }
+#endif
+    return false;
+}
+
+static int run_agent(GAState *s, GAConfig *config)
+{
+    ga_state = s;
+
+    g_log_set_default_handler(ga_log, s);
+    g_log_set_fatal_mask(NULL, G_LOG_LEVEL_ERROR);
+    ga_enable_logging(s);
+
+#ifdef _WIN32
+    /* On win32 the state directory is application specific (be it the default
+     * or a user override). We got past the command line parsing; let's create
+     * the directory (with any intermediate directories). If we run into an
+     * error later on, we won't try to clean up the directory, it is considered
+     * persistent.
+     */
+    if (g_mkdir_with_parents(config->state_dir, S_IRWXU) == -1) {
+        g_critical("unable to create (an ancestor of) the state directory"
+                   " '%s': %s", config->state_dir, strerror(errno));
+        return EXIT_FAILURE;
     }
 #endif
 
     if (ga_is_frozen(s)) {
-        if (daemonize) {
+        if (config->daemonize) {
             /* delay opening/locking of pidfile till filesystems are unfrozen */
-            s->deferred_options.pid_filepath = pid_filepath;
+            s->deferred_options.pid_filepath = config->pid_filepath;
             become_daemon(NULL);
         }
-        if (log_filepath) {
+        if (config->log_filepath) {
             /* delay opening the log file till filesystems are unfrozen */
-            s->deferred_options.log_filepath = log_filepath;
+            s->deferred_options.log_filepath = config->log_filepath;
         }
         ga_disable_logging(s);
         qmp_for_each_command(ga_disable_non_whitelisted, NULL);
     } else {
-        if (daemonize) {
-            become_daemon(pid_filepath);
+        if (config->daemonize) {
+            become_daemon(config->pid_filepath);
         }
-        if (log_filepath) {
-            FILE *log_file = ga_open_logfile(log_filepath);
+        if (config->log_filepath) {
+            FILE *log_file = ga_open_logfile(config->log_filepath);
             if (!log_file) {
                 g_critical("unable to open specified log file: %s",
                            strerror(errno));
-                goto out_bad;
+                return EXIT_FAILURE;
             }
             s->log_file = log_file;
         }
@@ -1149,17 +1275,18 @@ int main(int argc, char **argv)
                                s->pstate_filepath,
                                ga_is_frozen(s))) {
         g_critical("failed to load persistent state");
-        goto out_bad;
+        return EXIT_FAILURE;
     }
 
-    blacklist = ga_command_blacklist_init(blacklist);
-    if (blacklist) {
-        s->blacklist = blacklist;
+    config->blacklist = ga_command_blacklist_init(config->blacklist);
+    if (config->blacklist) {
+        GList *l = config->blacklist;
+        s->blacklist = config->blacklist;
         do {
-            g_debug("disabling command: %s", (char *)blacklist->data);
-            qmp_disable_command(blacklist->data);
-            blacklist = g_list_next(blacklist);
-        } while (blacklist);
+            g_debug("disabling command: %s", (char *)l->data);
+            qmp_disable_command(l->data);
+            l = g_list_next(l);
+        } while (l);
     }
     s->command_state = ga_command_state_new();
     ga_command_state_init(s, s->command_state);
@@ -1169,19 +1296,19 @@ int main(int argc, char **argv)
 #ifndef _WIN32
     if (!register_signal_handlers()) {
         g_critical("failed to register signal handlers");
-        goto out_bad;
+        return EXIT_FAILURE;
     }
 #endif
 
     s->main_loop = g_main_loop_new(NULL, false);
-    if (!channel_init(ga_state, method, path)) {
+    if (!channel_init(ga_state, config->method, config->channel_path)) {
         g_critical("failed to initialize guest agent channel");
-        goto out_bad;
+        return EXIT_FAILURE;
     }
 #ifndef _WIN32
     g_main_loop_run(ga_state->main_loop);
 #else
-    if (daemonize) {
+    if (config->daemonize) {
         SERVICE_TABLE_ENTRY service_table[] = {
             { (char *)QGA_SERVICE_NAME, service_main }, { NULL, NULL } };
         StartServiceCtrlDispatcher(service_table);
@@ -1190,17 +1317,85 @@ int main(int argc, char **argv)
     }
 #endif
 
-    ga_command_state_cleanup_all(ga_state->command_state);
-    ga_channel_free(ga_state->channel);
+    return EXIT_SUCCESS;
+}
+
+static void free_blacklist_entry(gpointer entry, gpointer unused)
+{
+    g_free(entry);
+}
+
+int main(int argc, char **argv)
+{
+    int ret = EXIT_SUCCESS;
+    GAState *s = g_new0(GAState, 1);
+    GAConfig *config = g_new0(GAConfig, 1);
+
+    module_call_init(MODULE_INIT_QAPI);
 
-    if (daemonize) {
-        unlink(pid_filepath);
+    init_dfl_pathnames();
+    config_load(config);
+    config_parse(config, argc, argv);
+
+    if (config->pid_filepath == NULL) {
+        config->pid_filepath = g_strdup(dfl_pathnames.pidfile);
     }
-    return 0;
 
-out_bad:
-    if (daemonize) {
-        unlink(pid_filepath);
+    if (config->state_dir == NULL) {
+        config->state_dir = g_strdup(dfl_pathnames.state_dir);
+    }
+
+    if (config->method == NULL) {
+        config->method = g_strdup("virtio-serial");
+    }
+
+    if (config->channel_path == NULL) {
+        if (strcmp(config->method, "virtio-serial") == 0) {
+            /* try the default path for the virtio-serial port */
+            config->channel_path = g_strdup(QGA_VIRTIO_PATH_DEFAULT);
+        } else if (strcmp(config->method, "isa-serial") == 0) {
+            /* try the default path for the serial port - COM1 */
+            config->channel_path = g_strdup(QGA_SERIAL_PATH_DEFAULT);
+        } else {
+            g_critical("must specify a path for this channel");
+            ret = EXIT_FAILURE;
+            goto end;
+        }
+    }
+
+    s->log_level = config->log_level;
+    s->log_file = stderr;
+#ifdef CONFIG_FSFREEZE
+    s->fsfreeze_hook = config->fsfreeze_hook;
+#endif
+    s->pstate_filepath = g_strdup_printf("%s/qga.state", config->state_dir);
+    s->state_filepath_isfrozen = g_strdup_printf("%s/qga.state.isfrozen",
+                                                 config->state_dir);
+    s->frozen = check_is_frozen(s);
+
+    if (config->dumpconf) {
+        config_dump(config);
+        goto end;
     }
-    return EXIT_FAILURE;
+
+    ret = run_agent(s, config);
+
+end:
+    if (s->command_state) {
+        ga_command_state_cleanup_all(s->command_state);
+    }
+    if (s->channel) {
+        ga_channel_free(s->channel);
+    }
+    g_list_foreach(config->blacklist, free_blacklist_entry, NULL);
+    g_free(s->pstate_filepath);
+    g_free(s->state_filepath_isfrozen);
+
+    if (config->daemonize) {
+        unlink(config->pid_filepath);
+    }
+
+    config_free(config);
+
+    return ret;
 }
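[Editor's note: for reference, a minimal configuration file of the kind config_load() above parses might look like the sketch below. The on-disk location is whatever QGA_CONF_DEFAULT expands to for a given build (not shown in this hunk), and the values are illustrative only; the keys are the ones the code reads from the "general" group.]

    [general]
    daemon=false
    method=virtio-serial
    path=/dev/virtio-ports/org.qemu.guest_agent.0
    pidfile=/var/run/qemu-ga.pid
    statedir=/var/run
    verbose=true
    blacklist=guest-file-open,guest-file-close
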
diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json
index 18e3cc37d4..6b0bd163c3 100644
--- a/qga/qapi-schema.json
+++ b/qga/qapi-schema.json
@@ -793,7 +793,7 @@
 # scheme. Refer to the documentation of the guest operating system
 # in question to determine what is supported.
 #
-# Note all guest operating systems will support use of the
+# Not all guest operating systems will support use of the
 # @crypted flag, as they may require the clear-text password
 #
 # The @password parameter must always be base64 encoded before
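[Editor's note: assuming this hunk sits in the documentation of the guest-agent command that takes @password and @crypted (guest-set-user-password), a host-side invocation over the agent channel would look roughly like the sketch below. The password value is base64("abc123"), and crypted=false indicates a clear-text password; all names and values are illustrative.]

    {"execute": "guest-set-user-password",
     "arguments": {"username": "root",
                   "password": "YWJjMTIz",
                   "crypted": false}}
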
diff --git a/qom/cpu.c b/qom/cpu.c
index eb9cfeca18..62f4b5de44 100644
--- a/qom/cpu.c
+++ b/qom/cpu.c
@@ -247,7 +247,7 @@ static void cpu_common_reset(CPUState *cpu)
     cpu->mem_io_vaddr = 0;
     cpu->icount_extra = 0;
     cpu->icount_decr.u32 = 0;
-    cpu->can_do_io = 0;
+    cpu->can_do_io = 1;
     cpu->exception_index = -1;
     memset(cpu->tb_jmp_cache, 0, TB_JMP_CACHE_SIZE * sizeof(void *));
 }
diff --git a/softmmu_template.h b/softmmu_template.h
index d42d89d541..50dec1c510 100644
--- a/softmmu_template.h
+++ b/softmmu_template.h
@@ -154,7 +154,7 @@ static inline DATA_TYPE glue(io_read, SUFFIX)(CPUArchState *env,
 
     physaddr = (physaddr & TARGET_PAGE_MASK) + addr;
     cpu->mem_io_pc = retaddr;
-    if (mr != &io_mem_rom && mr != &io_mem_notdirty && !cpu_can_do_io(cpu)) {
+    if (mr != &io_mem_rom && mr != &io_mem_notdirty && !cpu->can_do_io) {
         cpu_io_recompile(cpu, retaddr);
     }
 
@@ -374,7 +374,7 @@ static inline void glue(io_write, SUFFIX)(CPUArchState *env,
     MemoryRegion *mr = iotlb_to_region(cpu, physaddr);
 
     physaddr = (physaddr & TARGET_PAGE_MASK) + addr;
-    if (mr != &io_mem_rom && mr != &io_mem_notdirty && !cpu_can_do_io(cpu)) {
+    if (mr != &io_mem_rom && mr != &io_mem_notdirty && !cpu->can_do_io) {
         cpu_io_recompile(cpu, retaddr);
     }
 
diff --git a/target-alpha/cpu.h b/target-alpha/cpu.h
index 91c56d6bcf..3f1ece30ee 100644
--- a/target-alpha/cpu.h
+++ b/target-alpha/cpu.h
@@ -445,8 +445,9 @@ void QEMU_NORETURN arith_excp(CPUAlphaState *, uintptr_t, int, uint64_t);
 
 uint64_t cpu_alpha_load_fpcr (CPUAlphaState *env);
 void cpu_alpha_store_fpcr (CPUAlphaState *env, uint64_t val);
+uint64_t cpu_alpha_load_gr(CPUAlphaState *env, unsigned reg);
+void cpu_alpha_store_gr(CPUAlphaState *env, unsigned reg, uint64_t val);
 #ifndef CONFIG_USER_ONLY
-void swap_shadow_regs(CPUAlphaState *env);
 QEMU_NORETURN void alpha_cpu_unassigned_access(CPUState *cpu, hwaddr addr,
                                                bool is_write, bool is_exec,
                                                int unused, unsigned size);
diff --git a/target-alpha/gdbstub.c b/target-alpha/gdbstub.c
index 980f140e72..99a4051b35 100644
--- a/target-alpha/gdbstub.c
+++ b/target-alpha/gdbstub.c
@@ -30,7 +30,7 @@ int alpha_cpu_gdb_read_register(CPUState *cs, uint8_t *mem_buf, int n)
 
     switch (n) {
     case 0 ... 30:
-        val = env->ir[n];
+        val = cpu_alpha_load_gr(env, n);
         break;
     case 32 ... 62:
         d.d = env->fir[n - 32];
@@ -66,7 +66,7 @@ int alpha_cpu_gdb_write_register(CPUState *cs, uint8_t *mem_buf, int n)
 
     switch (n) {
     case 0 ... 30:
-        env->ir[n] = tmp;
+        cpu_alpha_store_gr(env, n, tmp);
         break;
     case 32 ... 62:
         d.ll = tmp;
diff --git a/target-alpha/helper.c b/target-alpha/helper.c
index 46b8ef9141..5a85335838 100644
--- a/target-alpha/helper.c
+++ b/target-alpha/helper.c
@@ -79,6 +79,30 @@ void helper_store_fpcr(CPUAlphaState *env, uint64_t val)
     cpu_alpha_store_fpcr(env, val);
 }
 
+static uint64_t *cpu_alpha_addr_gr(CPUAlphaState *env, unsigned reg)
+{
+#ifndef CONFIG_USER_ONLY
+    if (env->pal_mode) {
+        if (reg >= 8 && reg <= 14) {
+            return &env->shadow[reg - 8];
+        } else if (reg == 25) {
+            return &env->shadow[7];
+        }
+    }
+#endif
+    return &env->ir[reg];
+}
+
+uint64_t cpu_alpha_load_gr(CPUAlphaState *env, unsigned reg)
+{
+    return *cpu_alpha_addr_gr(env, reg);
+}
+
+void cpu_alpha_store_gr(CPUAlphaState *env, unsigned reg, uint64_t val)
+{
+    *cpu_alpha_addr_gr(env, reg) = val;
+}
+
 #if defined(CONFIG_USER_ONLY)
 int alpha_cpu_handle_mmu_fault(CPUState *cs, vaddr address,
                                int rw, int mmu_idx)
@@ -90,38 +114,6 @@ int alpha_cpu_handle_mmu_fault(CPUState *cs, vaddr address,
     return 1;
 }
 #else
-void swap_shadow_regs(CPUAlphaState *env)
-{
-    uint64_t i0, i1, i2, i3, i4, i5, i6, i7;
-
-    i0 = env->ir[8];
-    i1 = env->ir[9];
-    i2 = env->ir[10];
-    i3 = env->ir[11];
-    i4 = env->ir[12];
-    i5 = env->ir[13];
-    i6 = env->ir[14];
-    i7 = env->ir[25];
-
-    env->ir[8]  = env->shadow[0];
-    env->ir[9]  = env->shadow[1];
-    env->ir[10] = env->shadow[2];
-    env->ir[11] = env->shadow[3];
-    env->ir[12] = env->shadow[4];
-    env->ir[13] = env->shadow[5];
-    env->ir[14] = env->shadow[6];
-    env->ir[25] = env->shadow[7];
-
-    env->shadow[0] = i0;
-    env->shadow[1] = i1;
-    env->shadow[2] = i2;
-    env->shadow[3] = i3;
-    env->shadow[4] = i4;
-    env->shadow[5] = i5;
-    env->shadow[6] = i6;
-    env->shadow[7] = i7;
-}
-
 /* Returns the OSF/1 entMM failure indication, or -1 on success.  */
 static int get_physical_address(CPUAlphaState *env, target_ulong addr,
                                 int prot_need, int mmu_idx,
@@ -375,10 +367,7 @@ void alpha_cpu_do_interrupt(CPUState *cs)
     env->pc = env->palbr + i;
 
     /* Switch to PALmode.  */
-    if (!env->pal_mode) {
-        env->pal_mode = 1;
-        swap_shadow_regs(env);
-    }
+    env->pal_mode = 1;
 #endif /* !USER_ONLY */
 }
 
@@ -443,7 +432,7 @@ void alpha_cpu_dump_state(CPUState *cs, FILE *f, fprintf_function cpu_fprintf,
                 env->pc, env->ps);
     for (i = 0; i < 31; i++) {
         cpu_fprintf(f, "IR%02d %s " TARGET_FMT_lx " ", i,
-                    linux_reg_names[i], env->ir[i]);
+                    linux_reg_names[i], cpu_alpha_load_gr(env, i));
         if ((i % 3) == 2)
             cpu_fprintf(f, "\n");
     }
diff --git a/target-alpha/helper.h b/target-alpha/helper.h
index 83cbe2abda..c3d8a3ee49 100644
--- a/target-alpha/helper.h
+++ b/target-alpha/helper.h
@@ -92,9 +92,6 @@ DEF_HELPER_FLAGS_2(ieee_input_cmp, TCG_CALL_NO_WG, void, env, i64)
 DEF_HELPER_FLAGS_2(ieee_input_s, TCG_CALL_NO_WG, void, env, i64)
 
 #if !defined (CONFIG_USER_ONLY)
-DEF_HELPER_2(hw_ret, void, env, i64)
-DEF_HELPER_3(call_pal, void, env, i64, i64)
-
 DEF_HELPER_2(ldl_phys, i64, env, i64)
 DEF_HELPER_2(ldq_phys, i64, env, i64)
 DEF_HELPER_2(ldl_l_phys, i64, env, i64)
diff --git a/target-alpha/machine.c b/target-alpha/machine.c
index e796bbe27d..d9bf977fc6 100644
--- a/target-alpha/machine.c
+++ b/target-alpha/machine.c
@@ -70,8 +70,8 @@ static VMStateField vmstate_env_fields[] = {
 
 static const VMStateDescription vmstate_env = {
     .name = "env",
-    .version_id = 1,
-    .minimum_version_id = 1,
+    .version_id = 2,
+    .minimum_version_id = 2,
     .fields = vmstate_env_fields,
 };
 
diff --git a/target-alpha/sys_helper.c b/target-alpha/sys_helper.c
index 1c59e108b9..1f0e1a9671 100644
--- a/target-alpha/sys_helper.c
+++ b/target-alpha/sys_helper.c
@@ -40,28 +40,6 @@ uint64_t helper_load_pcc(CPUAlphaState *env)
 
 /* PALcode support special instructions */
 #ifndef CONFIG_USER_ONLY
-void helper_hw_ret(CPUAlphaState *env, uint64_t a)
-{
-    env->pc = a & ~3;
-    env->intr_flag = 0;
-    env->lock_addr = -1;
-    if ((a & 1) == 0) {
-        env->pal_mode = 0;
-        swap_shadow_regs(env);
-    }
-}
-
-void helper_call_pal(CPUAlphaState *env, uint64_t pc, uint64_t entry_ofs)
-{
-    int pal_mode = env->pal_mode;
-    env->exc_addr = pc | pal_mode;
-    env->pc = env->palbr + entry_ofs;
-    if (!pal_mode) {
-        env->pal_mode = 1;
-        swap_shadow_regs(env);
-    }
-}
-
 void helper_tbia(CPUAlphaState *env)
 {
     tlb_flush(CPU(alpha_env_get_cpu(env)), 1);
diff --git a/target-alpha/translate.c b/target-alpha/translate.c
index b766ae3daa..206feb5746 100644
--- a/target-alpha/translate.c
+++ b/target-alpha/translate.c
@@ -42,6 +42,9 @@ typedef struct DisasContext DisasContext;
 struct DisasContext {
     struct TranslationBlock *tb;
     uint64_t pc;
+#ifndef CONFIG_USER_ONLY
+    uint64_t palbr;
+#endif
     int mem_idx;
 
     /* Current rounding mode for this TB.  */
@@ -52,6 +55,9 @@ struct DisasContext {
     /* implver value for this CPU.  */
     int implver;
 
+    /* The set of registers active in the current context.  */
+    TCGv *ir;
+
     /* Temporaries for $31 and $f31 as source and destination.  */
     TCGv zero;
     TCGv sink;
@@ -86,13 +92,17 @@ typedef enum {
 
 /* global register indexes */
 static TCGv_ptr cpu_env;
-static TCGv cpu_ir[31];
+static TCGv cpu_std_ir[31];
 static TCGv cpu_fir[31];
 static TCGv cpu_pc;
 static TCGv cpu_lock_addr;
 static TCGv cpu_lock_st_addr;
 static TCGv cpu_lock_value;
 
+#ifndef CONFIG_USER_ONLY
+static TCGv cpu_pal_ir[31];
+#endif
+
 #include "exec/gen-icount.h"
 
 void alpha_translate_init(void)
@@ -122,6 +132,12 @@ void alpha_translate_init(void)
         "f16", "f17", "f18", "f19", "f20", "f21", "f22", "f23",
         "f24", "f25", "f26", "f27", "f28", "f29", "f30"
     };
+#ifndef CONFIG_USER_ONLY
+    static const char shadow_names[8][8] = {
+        "pal_t7", "pal_s0", "pal_s1", "pal_s2",
+        "pal_s3", "pal_s4", "pal_s5", "pal_t11"
+    };
+#endif
 
     static bool done_init = 0;
     int i;
@@ -134,9 +150,9 @@ void alpha_translate_init(void)
     cpu_env = tcg_global_reg_new_ptr(TCG_AREG0, "env");
 
     for (i = 0; i < 31; i++) {
-        cpu_ir[i] = tcg_global_mem_new_i64(TCG_AREG0,
-                                           offsetof(CPUAlphaState, ir[i]),
-                                           greg_names[i]);
+        cpu_std_ir[i] = tcg_global_mem_new_i64(TCG_AREG0,
+                                               offsetof(CPUAlphaState, ir[i]),
+                                               greg_names[i]);
     }
 
     for (i = 0; i < 31; i++) {
@@ -145,6 +161,17 @@ void alpha_translate_init(void)
                                             freg_names[i]);
     }
 
+#ifndef CONFIG_USER_ONLY
+    memcpy(cpu_pal_ir, cpu_std_ir, sizeof(cpu_pal_ir));
+    for (i = 0; i < 8; i++) {
+        int r = (i == 7 ? 25 : i + 8);
+        cpu_pal_ir[r] = tcg_global_mem_new_i64(TCG_AREG0,
+                                               offsetof(CPUAlphaState,
+                                                        shadow[i]),
+                                               shadow_names[i]);
+    }
+#endif
+
     for (i = 0; i < ARRAY_SIZE(vars); ++i) {
         const GlobalVar *v = &vars[i];
         *v->var = tcg_global_mem_new_i64(TCG_AREG0, v->ofs, v->name);
@@ -170,7 +197,7 @@ static TCGv dest_sink(DisasContext *ctx)
 static TCGv load_gpr(DisasContext *ctx, unsigned reg)
 {
     if (likely(reg < 31)) {
-        return cpu_ir[reg];
+        return ctx->ir[reg];
     } else {
         return load_zero(ctx);
     }
@@ -183,7 +210,7 @@ static TCGv load_gpr_lit(DisasContext *ctx, unsigned reg,
         ctx->lit = tcg_const_i64(lit);
         return ctx->lit;
     } else if (likely(reg < 31)) {
-        return cpu_ir[reg];
+        return ctx->ir[reg];
     } else {
         return load_zero(ctx);
     }
@@ -192,7 +219,7 @@ static TCGv load_gpr_lit(DisasContext *ctx, unsigned reg,
 static TCGv dest_gpr(DisasContext *ctx, unsigned reg)
 {
     if (likely(reg < 31)) {
-        return cpu_ir[reg];
+        return ctx->ir[reg];
     } else {
         return dest_sink(ctx);
     }
@@ -304,7 +331,7 @@ static inline void gen_load_mem(DisasContext *ctx,
         addr = tmp;
     }
 
-    va = (fp ? cpu_fir[ra] : cpu_ir[ra]);
+    va = (fp ? cpu_fir[ra] : ctx->ir[ra]);
     tcg_gen_qemu_load(va, addr, ctx->mem_idx);
 
     tcg_temp_free(tmp);
@@ -399,13 +426,13 @@ static ExitStatus gen_store_conditional(DisasContext *ctx, int ra, int rb,
         tcg_gen_qemu_ld_i64(val, addr, ctx->mem_idx, quad ? MO_LEQ : MO_LESL);
         tcg_gen_brcond_i64(TCG_COND_NE, val, cpu_lock_value, lab_fail);
 
-        tcg_gen_qemu_st_i64(cpu_ir[ra], addr, ctx->mem_idx,
+        tcg_gen_qemu_st_i64(ctx->ir[ra], addr, ctx->mem_idx,
                             quad ? MO_LEQ : MO_LEUL);
-        tcg_gen_movi_i64(cpu_ir[ra], 1);
+        tcg_gen_movi_i64(ctx->ir[ra], 1);
         tcg_gen_br(lab_done);
 
         gen_set_label(lab_fail);
-        tcg_gen_movi_i64(cpu_ir[ra], 0);
+        tcg_gen_movi_i64(ctx->ir[ra], 0);
 
         gen_set_label(lab_done);
         tcg_gen_movi_i64(cpu_lock_addr, -1);
@@ -444,7 +471,7 @@ static ExitStatus gen_bdirect(DisasContext *ctx, int ra, int32_t disp)
     uint64_t dest = ctx->pc + (disp << 2);
 
     if (ra != 31) {
-        tcg_gen_movi_i64(cpu_ir[ra], ctx->pc);
+        tcg_gen_movi_i64(ctx->ir[ra], ctx->pc);
     }
 
     /* Notice branch-to-next; used to initialize RA with the PC.  */
@@ -1059,12 +1086,13 @@ static void gen_msk_l(DisasContext *ctx, TCGv vc, TCGv va, int rb, bool islit,
     }
 }
 
-static void gen_rx(int ra, int set)
+static void gen_rx(DisasContext *ctx, int ra, int set)
 {
     TCGv_i32 tmp;
 
     if (ra != 31) {
-        tcg_gen_ld8u_i64(cpu_ir[ra], cpu_env, offsetof(CPUAlphaState, intr_flag));
+        tcg_gen_ld8u_i64(ctx->ir[ra], cpu_env,
+                         offsetof(CPUAlphaState, intr_flag));
     }
 
     tmp = tcg_const_i32(set);
@@ -1086,12 +1114,12 @@ static ExitStatus gen_call_pal(DisasContext *ctx, int palcode)
             break;
         case 0x9E:
             /* RDUNIQUE */
-            tcg_gen_ld_i64(cpu_ir[IR_V0], cpu_env,
+            tcg_gen_ld_i64(ctx->ir[IR_V0], cpu_env,
                            offsetof(CPUAlphaState, unique));
             break;
         case 0x9F:
             /* WRUNIQUE */
-            tcg_gen_st_i64(cpu_ir[IR_A0], cpu_env,
+            tcg_gen_st_i64(ctx->ir[IR_A0], cpu_env,
                            offsetof(CPUAlphaState, unique));
             break;
         default:
@@ -1115,17 +1143,17 @@ static ExitStatus gen_call_pal(DisasContext *ctx, int palcode)
             break;
         case 0x2D:
             /* WRVPTPTR */
-            tcg_gen_st_i64(cpu_ir[IR_A0], cpu_env,
+            tcg_gen_st_i64(ctx->ir[IR_A0], cpu_env,
                            offsetof(CPUAlphaState, vptptr));
             break;
         case 0x31:
             /* WRVAL */
-            tcg_gen_st_i64(cpu_ir[IR_A0], cpu_env,
+            tcg_gen_st_i64(ctx->ir[IR_A0], cpu_env,
                            offsetof(CPUAlphaState, sysval));
             break;
         case 0x32:
             /* RDVAL */
-            tcg_gen_ld_i64(cpu_ir[IR_V0], cpu_env,
+            tcg_gen_ld_i64(ctx->ir[IR_V0], cpu_env,
                            offsetof(CPUAlphaState, sysval));
             break;
 
@@ -1135,12 +1163,12 @@ static ExitStatus gen_call_pal(DisasContext *ctx, int palcode)
 
             /* Note that we already know we're in kernel mode, so we know
                that PS only contains the 3 IPL bits.  */
-            tcg_gen_ld8u_i64(cpu_ir[IR_V0], cpu_env,
+            tcg_gen_ld8u_i64(ctx->ir[IR_V0], cpu_env,
                              offsetof(CPUAlphaState, ps));
 
             /* But make sure and store only the 3 IPL bits from the user.  */
             tmp = tcg_temp_new();
-            tcg_gen_andi_i64(tmp, cpu_ir[IR_A0], PS_INT_MASK);
+            tcg_gen_andi_i64(tmp, ctx->ir[IR_A0], PS_INT_MASK);
             tcg_gen_st8_i64(tmp, cpu_env, offsetof(CPUAlphaState, ps));
             tcg_temp_free(tmp);
             break;
@@ -1148,22 +1176,22 @@ static ExitStatus gen_call_pal(DisasContext *ctx, int palcode)
 
         case 0x36:
             /* RDPS */
-            tcg_gen_ld8u_i64(cpu_ir[IR_V0], cpu_env,
+            tcg_gen_ld8u_i64(ctx->ir[IR_V0], cpu_env,
                              offsetof(CPUAlphaState, ps));
             break;
         case 0x38:
             /* WRUSP */
-            tcg_gen_st_i64(cpu_ir[IR_A0], cpu_env,
+            tcg_gen_st_i64(ctx->ir[IR_A0], cpu_env,
                            offsetof(CPUAlphaState, usp));
             break;
         case 0x3A:
             /* RDUSP */
-            tcg_gen_ld_i64(cpu_ir[IR_V0], cpu_env,
+            tcg_gen_ld_i64(ctx->ir[IR_V0], cpu_env,
                            offsetof(CPUAlphaState, usp));
             break;
         case 0x3C:
             /* WHAMI */
-            tcg_gen_ld32s_i64(cpu_ir[IR_V0], cpu_env,
+            tcg_gen_ld32s_i64(ctx->ir[IR_V0], cpu_env,
                 -offsetof(AlphaCPU, env) + offsetof(CPUState, cpu_index));
             break;
 
@@ -1181,15 +1209,24 @@ static ExitStatus gen_call_pal(DisasContext *ctx, int palcode)
     return gen_excp(ctx, EXCP_CALL_PAL, palcode);
 #else
     {
-        TCGv pc = tcg_const_i64(ctx->pc);
-        TCGv entry = tcg_const_i64(palcode & 0x80
-                                   ? 0x2000 + (palcode - 0x80) * 64
-                                   : 0x1000 + palcode * 64);
+        TCGv tmp = tcg_temp_new();
+        uint64_t exc_addr = ctx->pc;
+        uint64_t entry = ctx->palbr;
+
+        if (ctx->tb->flags & TB_FLAGS_PAL_MODE) {
+            exc_addr |= 1;
+        } else {
+            tcg_gen_movi_i64(tmp, 1);
+            tcg_gen_st8_i64(tmp, cpu_env, offsetof(CPUAlphaState, pal_mode));
+        }
 
-        gen_helper_call_pal(cpu_env, pc, entry);
+        tcg_gen_movi_i64(tmp, exc_addr);
+        tcg_gen_st_i64(tmp, cpu_env, offsetof(CPUAlphaState, exc_addr));
+        tcg_temp_free(tmp);
 
-        tcg_temp_free(entry);
-        tcg_temp_free(pc);
+        entry += (palcode & 0x80
+                  ? 0x2000 + (palcode - 0x80) * 64
+                  : 0x1000 + palcode * 64);
 
         /* Since the destination is running in PALmode, we don't really
            need the page permissions check.  We'll see the existence of
@@ -1197,11 +1234,13 @@ static ExitStatus gen_call_pal(DisasContext *ctx, int palcode)
            we change the PAL base register.  */
         if (!ctx->singlestep_enabled && !(ctx->tb->cflags & CF_LAST_IO)) {
             tcg_gen_goto_tb(0);
+            tcg_gen_movi_i64(cpu_pc, entry);
             tcg_gen_exit_tb((uintptr_t)ctx->tb);
             return EXIT_GOTO_TB;
+        } else {
+            tcg_gen_movi_i64(cpu_pc, entry);
+            return EXIT_PC_UPDATED;
         }
-
-        return EXIT_PC_UPDATED;
     }
 #endif
 }
@@ -1228,8 +1267,6 @@ static int cpu_pr_data(int pr)
     case 11: return offsetof(CPUAlphaState, sysval);
     case 12: return offsetof(CPUAlphaState, usp);
 
-    case 32 ... 39:
-        return offsetof(CPUAlphaState, shadow[pr - 32]);
     case 40 ... 63:
         return offsetof(CPUAlphaState, scratch[pr - 40]);
 
@@ -1241,36 +1278,48 @@ static int cpu_pr_data(int pr)
 
 static ExitStatus gen_mfpr(DisasContext *ctx, TCGv va, int regno)
 {
-    int data = cpu_pr_data(regno);
-
-    /* Special help for VMTIME and WALLTIME.  */
-    if (regno == 250 || regno == 249) {
-	void (*helper)(TCGv) = gen_helper_get_walltime;
-	if (regno == 249) {
-		helper = gen_helper_get_vmtime;
-	}
-        if (ctx->tb->cflags & CF_USE_ICOUNT) {
+    void (*helper)(TCGv);
+    int data;
+
+    switch (regno) {
+    case 32 ... 39:
+        /* Accessing the "non-shadow" general registers.  */
+        regno = regno == 39 ? 25 : regno - 32 + 8;
+        tcg_gen_mov_i64(va, cpu_std_ir[regno]);
+        break;
+
+    case 250: /* WALLTIME */
+        helper = gen_helper_get_walltime;
+        goto do_helper;
+    case 249: /* VMTIME */
+        helper = gen_helper_get_vmtime;
+    do_helper:
+        if (use_icount) {
             gen_io_start();
             helper(va);
             gen_io_end();
             return EXIT_PC_STALE;
         } else {
             helper(va);
-            return NO_EXIT;
         }
-    }
+        break;
 
-    /* The basic registers are data only, and unknown registers
-       are read-zero, write-ignore.  */
-    if (data == 0) {
-        tcg_gen_movi_i64(va, 0);
-    } else if (data & PR_BYTE) {
-        tcg_gen_ld8u_i64(va, cpu_env, data & ~PR_BYTE);
-    } else if (data & PR_LONG) {
-        tcg_gen_ld32s_i64(va, cpu_env, data & ~PR_LONG);
-    } else {
-        tcg_gen_ld_i64(va, cpu_env, data);
+    default:
+        /* The basic registers are data only, and unknown registers
+           are read-zero, write-ignore.  */
+        data = cpu_pr_data(regno);
+        if (data == 0) {
+            tcg_gen_movi_i64(va, 0);
+        } else if (data & PR_BYTE) {
+            tcg_gen_ld8u_i64(va, cpu_env, data & ~PR_BYTE);
+        } else if (data & PR_LONG) {
+            tcg_gen_ld32s_i64(va, cpu_env, data & ~PR_LONG);
+        } else {
+            tcg_gen_ld_i64(va, cpu_env, data);
+        }
+        break;
     }
+
     return NO_EXIT;
 }
 
@@ -1316,6 +1365,12 @@ static ExitStatus gen_mtpr(DisasContext *ctx, TCGv vb, int regno)
         gen_helper_tb_flush(cpu_env);
         return EXIT_PC_STALE;
 
+    case 32 ... 39:
+        /* Accessing the "non-shadow" general registers.  */
+        regno = regno == 39 ? 25 : regno - 32 + 8;
+        tcg_gen_mov_i64(cpu_std_ir[regno], vb);
+        break;
+
     default:
         /* The basic registers are data only, and unknown registers
            are read-zero, write-ignore.  */
@@ -1957,7 +2012,7 @@ static ExitStatus translate_one(DisasContext *ctx, uint32_t insn)
             REQUIRE_REG_31(rb);
             t32 = tcg_temp_new_i32();
             va = load_gpr(ctx, ra);
-            tcg_gen_trunc_i64_i32(t32, va);
+            tcg_gen_extrl_i64_i32(t32, va);
             gen_helper_memory_to_s(vc, t32);
             tcg_temp_free_i32(t32);
             break;
@@ -1977,7 +2032,7 @@ static ExitStatus translate_one(DisasContext *ctx, uint32_t insn)
             REQUIRE_REG_31(rb);
             t32 = tcg_temp_new_i32();
             va = load_gpr(ctx, ra);
-            tcg_gen_trunc_i64_i32(t32, va);
+            tcg_gen_extrl_i64_i32(t32, va);
             gen_helper_memory_to_f(vc, t32);
             tcg_temp_free_i32(t32);
             break;
@@ -2300,14 +2355,14 @@ static ExitStatus translate_one(DisasContext *ctx, uint32_t insn)
             break;
         case 0xE000:
             /* RC */
-            gen_rx(ra, 0);
+            gen_rx(ctx, ra, 0);
             break;
         case 0xE800:
             /* ECB */
             break;
         case 0xF000:
             /* RS */
-            gen_rx(ra, 1);
+            gen_rx(ctx, ra, 1);
             break;
         case 0xF800:
             /* WH64 */
@@ -2339,7 +2394,7 @@ static ExitStatus translate_one(DisasContext *ctx, uint32_t insn)
         vb = load_gpr(ctx, rb);
         tcg_gen_andi_i64(cpu_pc, vb, ~3);
         if (ra != 31) {
-            tcg_gen_movi_i64(cpu_ir[ra], ctx->pc);
+            tcg_gen_movi_i64(ctx->ir[ra], ctx->pc);
         }
         ret = EXIT_PC_UPDATED;
         break;
@@ -2379,10 +2434,10 @@ static ExitStatus translate_one(DisasContext *ctx, uint32_t insn)
                 goto invalid_opc;
                 break;
             case 0x6:
-                /* Incpu_ir[ra]id */
+                /* Invalid */
                 goto invalid_opc;
             case 0x7:
-                /* Incpu_ir[ra]id */
+                /* Invalid */
                 goto invalid_opc;
             case 0x8:
                 /* Longword virtual access (hw_ldl) */
@@ -2585,13 +2640,18 @@ static ExitStatus translate_one(DisasContext *ctx, uint32_t insn)
             /* Pre-EV6 CPUs interpreted this as HW_REI, loading the return
                address from EXC_ADDR.  This turns out to be useful for our
                emulation PALcode, so continue to accept it.  */
-            tmp = tcg_temp_new();
-            tcg_gen_ld_i64(tmp, cpu_env, offsetof(CPUAlphaState, exc_addr));
-            gen_helper_hw_ret(cpu_env, tmp);
-            tcg_temp_free(tmp);
+            ctx->lit = vb = tcg_temp_new();
+            tcg_gen_ld_i64(vb, cpu_env, offsetof(CPUAlphaState, exc_addr));
         } else {
-            gen_helper_hw_ret(cpu_env, load_gpr(ctx, rb));
+            vb = load_gpr(ctx, rb);
         }
+        tmp = tcg_temp_new();
+        tcg_gen_movi_i64(tmp, 0);
+        tcg_gen_st8_i64(tmp, cpu_env, offsetof(CPUAlphaState, intr_flag));
+        tcg_gen_movi_i64(cpu_lock_addr, -1);
+        tcg_gen_andi_i64(tmp, vb, 1);
+        tcg_gen_st8_i64(tmp, cpu_env, offsetof(CPUAlphaState, pal_mode));
+        tcg_gen_andi_i64(cpu_pc, vb, ~3);
         ret = EXIT_PC_UPDATED;
         break;
 #else
@@ -2822,6 +2882,13 @@ static inline void gen_intermediate_code_internal(AlphaCPU *cpu,
     ctx.implver = env->implver;
     ctx.singlestep_enabled = cs->singlestep_enabled;
 
+#ifdef CONFIG_USER_ONLY
+    ctx.ir = cpu_std_ir;
+#else
+    ctx.palbr = env->palbr;
+    ctx.ir = (tb->flags & TB_FLAGS_PAL_MODE ? cpu_pal_ir : cpu_std_ir);
+#endif
+
     /* ??? Every TB begins with unset rounding mode, to be initialized on
        the first fp insn of the TB.  Alternately we could define a proper
        default for every TB (e.g. QUAL_RM_N or QUAL_RM_D) and make sure
diff --git a/target-arm/cpu.h b/target-arm/cpu.h
index 2e680da1fc..31825d34a1 100644
--- a/target-arm/cpu.h
+++ b/target-arm/cpu.h
@@ -1284,6 +1284,9 @@ typedef enum CPAccessResult {
     /* As CP_ACCESS_TRAP, but for traps directly to EL2 or EL3 */
     CP_ACCESS_TRAP_EL2 = 3,
     CP_ACCESS_TRAP_EL3 = 4,
+    /* As CP_ACCESS_TRAP_UNCATEGORIZED, but for traps directly to EL2 or EL3 */
+    CP_ACCESS_TRAP_UNCATEGORIZED_EL2 = 5,
+    CP_ACCESS_TRAP_UNCATEGORIZED_EL3 = 6,
 } CPAccessResult;
 
 /* Access functions for coprocessor registers. These cannot fail and
diff --git a/target-arm/helper.c b/target-arm/helper.c
index 1568aa6617..7df1f0684d 100644
--- a/target-arm/helper.c
+++ b/target-arm/helper.c
@@ -1022,6 +1022,10 @@ static const ARMCPRegInfo v7_cp_reginfo[] = {
       .opc0 = 3, .opc1 = 0, .crn = 10, .crm = 2, .opc2 = 0,
       .access = PL1_RW, .fieldoffset = offsetof(CPUARMState, cp15.mair_el[1]),
       .resetvalue = 0 },
+    { .name = "MAIR_EL3", .state = ARM_CP_STATE_AA64,
+      .opc0 = 3, .opc1 = 6, .crn = 10, .crm = 2, .opc2 = 0,
+      .access = PL3_RW, .fieldoffset = offsetof(CPUARMState, cp15.mair_el[3]),
+      .resetvalue = 0 },
     /* For non-long-descriptor page tables these are PRRR and NMRR;
      * regardless they still act as reads-as-written for QEMU.
      */
@@ -1715,12 +1719,17 @@ static void par_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value)
 static CPAccessResult ats_access(CPUARMState *env, const ARMCPRegInfo *ri)
 {
     if (ri->opc2 & 4) {
-        /* Other states are only available with TrustZone; in
-         * a non-TZ implementation these registers don't exist
-         * at all, which is an Uncategorized trap. This underdecoding
-         * is safe because the reginfo is NO_RAW.
+        /* The ATS12NSO* operations must trap to EL3 if executed in
+         * Secure EL1 (which can only happen if EL3 is AArch64).
+         * They are simply UNDEF if executed from NS EL1.
+         * They function normally from EL2 or EL3.
          */
-        return CP_ACCESS_TRAP_UNCATEGORIZED;
+        if (arm_current_el(env) == 1) {
+            if (arm_is_secure_below_el3(env)) {
+                return CP_ACCESS_TRAP_UNCATEGORIZED_EL3;
+            }
+            return CP_ACCESS_TRAP_UNCATEGORIZED;
+        }
     }
     return CP_ACCESS_OK;
 }
@@ -1840,6 +1849,25 @@ static void ats_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value)
     A32_BANKED_CURRENT_REG_SET(env, par, par64);
 }
 
+static void ats1h_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                        uint64_t value)
+{
+    int access_type = ri->opc2 & 1;
+    uint64_t par64;
+
+    par64 = do_ats_write(env, value, access_type, ARMMMUIdx_S2NS);
+
+    A32_BANKED_CURRENT_REG_SET(env, par, par64);
+}
+
+static CPAccessResult at_s1e2_access(CPUARMState *env, const ARMCPRegInfo *ri)
+{
+    if (arm_current_el(env) == 3 && !(env->cp15.scr_el3 & SCR_NS)) {
+        return CP_ACCESS_TRAP;
+    }
+    return CP_ACCESS_OK;
+}
+
 static void ats_write64(CPUARMState *env, const ARMCPRegInfo *ri,
                         uint64_t value)
 {
@@ -1867,10 +1895,10 @@ static void ats_write64(CPUARMState *env, const ARMCPRegInfo *ri,
         mmu_idx = secure ? ARMMMUIdx_S1SE0 : ARMMMUIdx_S1NSE0;
         break;
     case 4: /* AT S12E1R, AT S12E1W */
-        mmu_idx = ARMMMUIdx_S12NSE1;
+        mmu_idx = secure ? ARMMMUIdx_S1SE1 : ARMMMUIdx_S12NSE1;
         break;
     case 6: /* AT S12E0R, AT S12E0W */
-        mmu_idx = ARMMMUIdx_S12NSE0;
+        mmu_idx = secure ? ARMMMUIdx_S1SE0 : ARMMMUIdx_S12NSE0;
         break;
     default:
         g_assert_not_reached();
@@ -1887,6 +1915,7 @@ static const ARMCPRegInfo vapa_cp_reginfo[] = {
                              offsetoflow32(CPUARMState, cp15.par_ns) },
       .writefn = par_write },
 #ifndef CONFIG_USER_ONLY
+    /* This underdecoding is safe because the reginfo is NO_RAW. */
     { .name = "ATS", .cp = 15, .crn = 7, .crm = 8, .opc1 = 0, .opc2 = CP_ANY,
       .access = PL1_W, .accessfn = ats_access,
       .writefn = ats_write, .type = ARM_CP_NO_RAW },
@@ -2478,65 +2507,244 @@ static CPAccessResult aa64_cacheop_access(CPUARMState *env,
  * Page D4-1736 (DDI0487A.b)
  */
 
-static void tlbi_aa64_va_write(CPUARMState *env, const ARMCPRegInfo *ri,
-                               uint64_t value)
+static void tlbi_aa64_vmalle1_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                                    uint64_t value)
+{
+    ARMCPU *cpu = arm_env_get_cpu(env);
+    CPUState *cs = CPU(cpu);
+
+    if (arm_is_secure_below_el3(env)) {
+        tlb_flush_by_mmuidx(cs, ARMMMUIdx_S1SE1, ARMMMUIdx_S1SE0, -1);
+    } else {
+        tlb_flush_by_mmuidx(cs, ARMMMUIdx_S12NSE1, ARMMMUIdx_S12NSE0, -1);
+    }
+}
+
+static void tlbi_aa64_vmalle1is_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                                      uint64_t value)
 {
-    /* Invalidate by VA (AArch64 version) */
+    bool sec = arm_is_secure_below_el3(env);
+    CPUState *other_cs;
+
+    CPU_FOREACH(other_cs) {
+        if (sec) {
+            tlb_flush_by_mmuidx(other_cs, ARMMMUIdx_S1SE1, ARMMMUIdx_S1SE0, -1);
+        } else {
+            tlb_flush_by_mmuidx(other_cs, ARMMMUIdx_S12NSE1,
+                                ARMMMUIdx_S12NSE0, -1);
+        }
+    }
+}
+
+static void tlbi_aa64_alle1_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                                  uint64_t value)
+{
+    /* Note that the 'ALL' scope must invalidate both stage 1 and
+     * stage 2 translations, whereas most other scopes only invalidate
+     * stage 1 translations.
+     */
+    ARMCPU *cpu = arm_env_get_cpu(env);
+    CPUState *cs = CPU(cpu);
+
+    if (arm_is_secure_below_el3(env)) {
+        tlb_flush_by_mmuidx(cs, ARMMMUIdx_S1SE1, ARMMMUIdx_S1SE0, -1);
+    } else {
+        if (arm_feature(env, ARM_FEATURE_EL2)) {
+            tlb_flush_by_mmuidx(cs, ARMMMUIdx_S12NSE1, ARMMMUIdx_S12NSE0,
+                                ARMMMUIdx_S2NS, -1);
+        } else {
+            tlb_flush_by_mmuidx(cs, ARMMMUIdx_S12NSE1, ARMMMUIdx_S12NSE0, -1);
+        }
+    }
+}
+
+static void tlbi_aa64_alle2_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                                  uint64_t value)
+{
+    ARMCPU *cpu = arm_env_get_cpu(env);
+    CPUState *cs = CPU(cpu);
+
+    tlb_flush_by_mmuidx(cs, ARMMMUIdx_S1E2, -1);
+}
+
+static void tlbi_aa64_alle3_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                                  uint64_t value)
+{
+    ARMCPU *cpu = arm_env_get_cpu(env);
+    CPUState *cs = CPU(cpu);
+
+    tlb_flush_by_mmuidx(cs, ARMMMUIdx_S1E3, -1);
+}
+
+static void tlbi_aa64_alle1is_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                                    uint64_t value)
+{
+    /* Note that the 'ALL' scope must invalidate both stage 1 and
+     * stage 2 translations, whereas most other scopes only invalidate
+     * stage 1 translations.
+     */
+    bool sec = arm_is_secure_below_el3(env);
+    bool has_el2 = arm_feature(env, ARM_FEATURE_EL2);
+    CPUState *other_cs;
+
+    CPU_FOREACH(other_cs) {
+        if (sec) {
+            tlb_flush_by_mmuidx(other_cs, ARMMMUIdx_S1SE1, ARMMMUIdx_S1SE0, -1);
+        } else if (has_el2) {
+            tlb_flush_by_mmuidx(other_cs, ARMMMUIdx_S12NSE1,
+                                ARMMMUIdx_S12NSE0, ARMMMUIdx_S2NS, -1);
+        } else {
+            tlb_flush_by_mmuidx(other_cs, ARMMMUIdx_S12NSE1,
+                                ARMMMUIdx_S12NSE0, -1);
+        }
+    }
+}
+
+static void tlbi_aa64_alle2is_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                                    uint64_t value)
+{
+    CPUState *other_cs;
+
+    CPU_FOREACH(other_cs) {
+        tlb_flush_by_mmuidx(other_cs, ARMMMUIdx_S1E2, -1);
+    }
+}
+
+static void tlbi_aa64_alle3is_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                                    uint64_t value)
+{
+    CPUState *other_cs;
+
+    CPU_FOREACH(other_cs) {
+        tlb_flush_by_mmuidx(other_cs, ARMMMUIdx_S1E3, -1);
+    }
+}
+
+static void tlbi_aa64_vae1_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                                 uint64_t value)
+{
+    /* Invalidate by VA, EL1&0 (AArch64 version).
+     * Currently handles all of VAE1, VAAE1, VAALE1 and VALE1,
+     * since we don't support flush-for-specific-ASID-only or
+     * flush-last-level-only.
+     */
     ARMCPU *cpu = arm_env_get_cpu(env);
+    CPUState *cs = CPU(cpu);
     uint64_t pageaddr = sextract64(value << 12, 0, 56);
 
-    tlb_flush_page(CPU(cpu), pageaddr);
+    if (arm_is_secure_below_el3(env)) {
+        tlb_flush_page_by_mmuidx(cs, pageaddr, ARMMMUIdx_S1SE1,
+                                 ARMMMUIdx_S1SE0, -1);
+    } else {
+        tlb_flush_page_by_mmuidx(cs, pageaddr, ARMMMUIdx_S12NSE1,
+                                 ARMMMUIdx_S12NSE0, -1);
+    }
 }
 
-static void tlbi_aa64_vaa_write(CPUARMState *env, const ARMCPRegInfo *ri,
-                                uint64_t value)
+static void tlbi_aa64_vae2_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                                 uint64_t value)
 {
-    /* Invalidate by VA, all ASIDs (AArch64 version) */
+    /* Invalidate by VA, EL2
+     * Currently handles both VAE2 and VALE2, since we don't support
+     * flush-last-level-only.
+     */
     ARMCPU *cpu = arm_env_get_cpu(env);
+    CPUState *cs = CPU(cpu);
     uint64_t pageaddr = sextract64(value << 12, 0, 56);
 
-    tlb_flush_page(CPU(cpu), pageaddr);
+    tlb_flush_page_by_mmuidx(cs, pageaddr, ARMMMUIdx_S1E2, -1);
 }
 
-static void tlbi_aa64_asid_write(CPUARMState *env, const ARMCPRegInfo *ri,
+static void tlbi_aa64_vae3_write(CPUARMState *env, const ARMCPRegInfo *ri,
                                  uint64_t value)
 {
-    /* Invalidate by ASID (AArch64 version) */
+    /* Invalidate by VA, EL3
+     * Currently handles both VAE3 and VALE3, since we don't support
+     * flush-last-level-only.
+     */
     ARMCPU *cpu = arm_env_get_cpu(env);
-    int asid = extract64(value, 48, 16);
-    tlb_flush(CPU(cpu), asid == 0);
+    CPUState *cs = CPU(cpu);
+    uint64_t pageaddr = sextract64(value << 12, 0, 56);
+
+    tlb_flush_page_by_mmuidx(cs, pageaddr, ARMMMUIdx_S1E3, -1);
 }
 
-static void tlbi_aa64_va_is_write(CPUARMState *env, const ARMCPRegInfo *ri,
-                                  uint64_t value)
+static void tlbi_aa64_vae1is_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                                   uint64_t value)
 {
+    bool sec = arm_is_secure_below_el3(env);
     CPUState *other_cs;
     uint64_t pageaddr = sextract64(value << 12, 0, 56);
 
     CPU_FOREACH(other_cs) {
-        tlb_flush_page(other_cs, pageaddr);
+        if (sec) {
+            tlb_flush_page_by_mmuidx(other_cs, pageaddr, ARMMMUIdx_S1SE1,
+                                     ARMMMUIdx_S1SE0, -1);
+        } else {
+            tlb_flush_page_by_mmuidx(other_cs, pageaddr, ARMMMUIdx_S12NSE1,
+                                     ARMMMUIdx_S12NSE0, -1);
+        }
     }
 }
 
-static void tlbi_aa64_vaa_is_write(CPUARMState *env, const ARMCPRegInfo *ri,
-                                  uint64_t value)
+static void tlbi_aa64_vae2is_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                                   uint64_t value)
 {
     CPUState *other_cs;
     uint64_t pageaddr = sextract64(value << 12, 0, 56);
 
     CPU_FOREACH(other_cs) {
-        tlb_flush_page(other_cs, pageaddr);
+        tlb_flush_page_by_mmuidx(other_cs, pageaddr, ARMMMUIdx_S1E2, -1);
     }
 }
 
-static void tlbi_aa64_asid_is_write(CPUARMState *env, const ARMCPRegInfo *ri,
-                                  uint64_t value)
+static void tlbi_aa64_vae3is_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                                   uint64_t value)
 {
     CPUState *other_cs;
-    int asid = extract64(value, 48, 16);
+    uint64_t pageaddr = sextract64(value << 12, 0, 56);
 
     CPU_FOREACH(other_cs) {
-        tlb_flush(other_cs, asid == 0);
+        tlb_flush_page_by_mmuidx(other_cs, pageaddr, ARMMMUIdx_S1E3, -1);
+    }
+}
+
+static void tlbi_aa64_ipas2e1_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                                    uint64_t value)
+{
+    /* Invalidate by IPA. This has to invalidate any structures that
+     * contain only stage 2 translation information, but does not need
+     * to apply to structures that contain combined stage 1 and stage 2
+     * translation information.
+     * This must NOP if EL2 isn't implemented or SCR_EL3.NS is zero.
+     */
+    ARMCPU *cpu = arm_env_get_cpu(env);
+    CPUState *cs = CPU(cpu);
+    uint64_t pageaddr;
+
+    if (!arm_feature(env, ARM_FEATURE_EL2) || !(env->cp15.scr_el3 & SCR_NS)) {
+        return;
+    }
+
+    pageaddr = sextract64(value << 12, 0, 48);
+
+    tlb_flush_page_by_mmuidx(cs, pageaddr, ARMMMUIdx_S2NS, -1);
+}
+
+static void tlbi_aa64_ipas2e1is_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                                      uint64_t value)
+{
+    CPUState *other_cs;
+    uint64_t pageaddr;
+
+    if (!arm_feature(env, ARM_FEATURE_EL2) || !(env->cp15.scr_el3 & SCR_NS)) {
+        return;
+    }
+
+    pageaddr = sextract64(value << 12, 0, 48);
+
+    CPU_FOREACH(other_cs) {
+        tlb_flush_page_by_mmuidx(other_cs, pageaddr, ARMMMUIdx_S2NS, -1);
     }
 }
 
@@ -2672,62 +2880,86 @@ static const ARMCPRegInfo v8_cp_reginfo[] = {
       .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 14, .opc2 = 2,
       .access = PL1_W, .type = ARM_CP_NOP },
     /* TLBI operations */
-    { .name = "TLBI_ALLE1", .state = ARM_CP_STATE_AA64,
-      .opc0 = 1, .opc1 = 4, .crn = 8, .crm = 7, .opc2 = 4,
-      .access = PL2_W, .type = ARM_CP_NO_RAW,
-      .writefn = tlbiall_write },
-    { .name = "TLBI_ALLE1IS", .state = ARM_CP_STATE_AA64,
-      .opc0 = 1, .opc1 = 4, .crn = 8, .crm = 3, .opc2 = 4,
-      .access = PL2_W, .type = ARM_CP_NO_RAW,
-      .writefn = tlbiall_is_write },
     { .name = "TLBI_VMALLE1IS", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 0, .crn = 8, .crm = 3, .opc2 = 0,
       .access = PL1_W, .type = ARM_CP_NO_RAW,
-      .writefn = tlbiall_is_write },
+      .writefn = tlbi_aa64_vmalle1is_write },
     { .name = "TLBI_VAE1IS", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 0, .crn = 8, .crm = 3, .opc2 = 1,
       .access = PL1_W, .type = ARM_CP_NO_RAW,
-      .writefn = tlbi_aa64_va_is_write },
+      .writefn = tlbi_aa64_vae1is_write },
     { .name = "TLBI_ASIDE1IS", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 0, .crn = 8, .crm = 3, .opc2 = 2,
       .access = PL1_W, .type = ARM_CP_NO_RAW,
-      .writefn = tlbi_aa64_asid_is_write },
+      .writefn = tlbi_aa64_vmalle1is_write },
     { .name = "TLBI_VAAE1IS", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 0, .crn = 8, .crm = 3, .opc2 = 3,
       .access = PL1_W, .type = ARM_CP_NO_RAW,
-      .writefn = tlbi_aa64_vaa_is_write },
+      .writefn = tlbi_aa64_vae1is_write },
     { .name = "TLBI_VALE1IS", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 0, .crn = 8, .crm = 3, .opc2 = 5,
       .access = PL1_W, .type = ARM_CP_NO_RAW,
-      .writefn = tlbi_aa64_va_is_write },
+      .writefn = tlbi_aa64_vae1is_write },
     { .name = "TLBI_VAALE1IS", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 0, .crn = 8, .crm = 3, .opc2 = 7,
       .access = PL1_W, .type = ARM_CP_NO_RAW,
-      .writefn = tlbi_aa64_vaa_is_write },
+      .writefn = tlbi_aa64_vae1is_write },
     { .name = "TLBI_VMALLE1", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 0, .crn = 8, .crm = 7, .opc2 = 0,
       .access = PL1_W, .type = ARM_CP_NO_RAW,
-      .writefn = tlbiall_write },
+      .writefn = tlbi_aa64_vmalle1_write },
     { .name = "TLBI_VAE1", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 0, .crn = 8, .crm = 7, .opc2 = 1,
       .access = PL1_W, .type = ARM_CP_NO_RAW,
-      .writefn = tlbi_aa64_va_write },
+      .writefn = tlbi_aa64_vae1_write },
     { .name = "TLBI_ASIDE1", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 0, .crn = 8, .crm = 7, .opc2 = 2,
       .access = PL1_W, .type = ARM_CP_NO_RAW,
-      .writefn = tlbi_aa64_asid_write },
+      .writefn = tlbi_aa64_vmalle1_write },
     { .name = "TLBI_VAAE1", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 0, .crn = 8, .crm = 7, .opc2 = 3,
       .access = PL1_W, .type = ARM_CP_NO_RAW,
-      .writefn = tlbi_aa64_vaa_write },
+      .writefn = tlbi_aa64_vae1_write },
     { .name = "TLBI_VALE1", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 0, .crn = 8, .crm = 7, .opc2 = 5,
       .access = PL1_W, .type = ARM_CP_NO_RAW,
-      .writefn = tlbi_aa64_va_write },
+      .writefn = tlbi_aa64_vae1_write },
     { .name = "TLBI_VAALE1", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 0, .crn = 8, .crm = 7, .opc2 = 7,
       .access = PL1_W, .type = ARM_CP_NO_RAW,
-      .writefn = tlbi_aa64_vaa_write },
+      .writefn = tlbi_aa64_vae1_write },
+    { .name = "TLBI_IPAS2E1IS", .state = ARM_CP_STATE_AA64,
+      .opc0 = 1, .opc1 = 4, .crn = 8, .crm = 0, .opc2 = 1,
+      .access = PL2_W, .type = ARM_CP_NO_RAW,
+      .writefn = tlbi_aa64_ipas2e1is_write },
+    { .name = "TLBI_IPAS2LE1IS", .state = ARM_CP_STATE_AA64,
+      .opc0 = 1, .opc1 = 4, .crn = 8, .crm = 0, .opc2 = 5,
+      .access = PL2_W, .type = ARM_CP_NO_RAW,
+      .writefn = tlbi_aa64_ipas2e1is_write },
+    { .name = "TLBI_ALLE1IS", .state = ARM_CP_STATE_AA64,
+      .opc0 = 1, .opc1 = 4, .crn = 8, .crm = 3, .opc2 = 4,
+      .access = PL2_W, .type = ARM_CP_NO_RAW,
+      .writefn = tlbi_aa64_alle1is_write },
+    { .name = "TLBI_VMALLS12E1IS", .state = ARM_CP_STATE_AA64,
+      .opc0 = 1, .opc1 = 4, .crn = 8, .crm = 3, .opc2 = 6,
+      .access = PL2_W, .type = ARM_CP_NO_RAW,
+      .writefn = tlbi_aa64_alle1is_write },
+    { .name = "TLBI_IPAS2E1", .state = ARM_CP_STATE_AA64,
+      .opc0 = 1, .opc1 = 4, .crn = 8, .crm = 4, .opc2 = 1,
+      .access = PL2_W, .type = ARM_CP_NO_RAW,
+      .writefn = tlbi_aa64_ipas2e1_write },
+    { .name = "TLBI_IPAS2LE1", .state = ARM_CP_STATE_AA64,
+      .opc0 = 1, .opc1 = 4, .crn = 8, .crm = 4, .opc2 = 5,
+      .access = PL2_W, .type = ARM_CP_NO_RAW,
+      .writefn = tlbi_aa64_ipas2e1_write },
+    { .name = "TLBI_ALLE1", .state = ARM_CP_STATE_AA64,
+      .opc0 = 1, .opc1 = 4, .crn = 8, .crm = 7, .opc2 = 4,
+      .access = PL2_W, .type = ARM_CP_NO_RAW,
+      .writefn = tlbi_aa64_alle1_write },
+    { .name = "TLBI_VMALLS12E1", .state = ARM_CP_STATE_AA64,
+      .opc0 = 1, .opc1 = 4, .crn = 8, .crm = 7, .opc2 = 6,
+      .access = PL2_W, .type = ARM_CP_NO_RAW,
+      .writefn = tlbi_aa64_alle1is_write },
 #ifndef CONFIG_USER_ONLY
     /* 64 bit address translation operations */
     { .name = "AT_S1E1R", .state = ARM_CP_STATE_AA64,
@@ -2742,6 +2974,25 @@ static const ARMCPRegInfo v8_cp_reginfo[] = {
     { .name = "AT_S1E0W", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 8, .opc2 = 3,
       .access = PL1_W, .type = ARM_CP_NO_RAW, .writefn = ats_write64 },
+    { .name = "AT_S12E1R", .state = ARM_CP_STATE_AA64,
+      .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 8, .opc2 = 4,
+      .access = PL2_W, .type = ARM_CP_NO_RAW, .writefn = ats_write64 },
+    { .name = "AT_S12E1W", .state = ARM_CP_STATE_AA64,
+      .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 8, .opc2 = 5,
+      .access = PL2_W, .type = ARM_CP_NO_RAW, .writefn = ats_write64 },
+    { .name = "AT_S12E0R", .state = ARM_CP_STATE_AA64,
+      .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 8, .opc2 = 6,
+      .access = PL2_W, .type = ARM_CP_NO_RAW, .writefn = ats_write64 },
+    { .name = "AT_S12E0W", .state = ARM_CP_STATE_AA64,
+      .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 8, .opc2 = 7,
+      .access = PL2_W, .type = ARM_CP_NO_RAW, .writefn = ats_write64 },
+    /* AT S1E2* are elsewhere as they UNDEF from EL3 if EL2 is not present */
+    { .name = "AT_S1E3R", .state = ARM_CP_STATE_AA64,
+      .opc0 = 1, .opc1 = 6, .crn = 7, .crm = 8, .opc2 = 0,
+      .access = PL3_W, .type = ARM_CP_NO_RAW, .writefn = ats_write64 },
+    { .name = "AT_S1E3W", .state = ARM_CP_STATE_AA64,
+      .opc0 = 1, .opc1 = 6, .crn = 7, .crm = 8, .opc2 = 1,
+      .access = PL3_W, .type = ARM_CP_NO_RAW, .writefn = ats_write64 },
 #endif
     /* TLB invalidate last level of translation table walk */
     { .name = "TLBIMVALIS", .cp = 15, .opc1 = 0, .crn = 8, .crm = 3, .opc2 = 5,
@@ -2836,6 +3087,22 @@ static const ARMCPRegInfo el3_no_el2_cp_reginfo[] = {
     { .name = "HMAIR1", .state = ARM_CP_STATE_AA32,
       .opc1 = 4, .crn = 10, .crm = 2, .opc2 = 1,
       .access = PL2_RW, .type = ARM_CP_CONST, .resetvalue = 0 },
+    { .name = "AMAIR_EL2", .state = ARM_CP_STATE_BOTH,
+      .opc0 = 3, .opc1 = 4, .crn = 10, .crm = 3, .opc2 = 0,
+      .access = PL2_RW, .type = ARM_CP_CONST,
+      .resetvalue = 0 },
+    { .name = "HMAIR1", .state = ARM_CP_STATE_AA32,
+      .opc1 = 4, .crn = 10, .crm = 3, .opc2 = 1,
+      .access = PL2_RW, .type = ARM_CP_CONST,
+      .resetvalue = 0 },
+    { .name = "AFSR0_EL2", .state = ARM_CP_STATE_BOTH,
+      .opc0 = 3, .opc1 = 4, .crn = 5, .crm = 1, .opc2 = 0,
+      .access = PL2_RW, .type = ARM_CP_CONST,
+      .resetvalue = 0 },
+    { .name = "AFSR1_EL2", .state = ARM_CP_STATE_BOTH,
+      .opc0 = 3, .opc1 = 4, .crn = 5, .crm = 1, .opc2 = 1,
+      .access = PL2_RW, .type = ARM_CP_CONST,
+      .resetvalue = 0 },
     { .name = "TCR_EL2", .state = ARM_CP_STATE_BOTH,
       .opc0 = 3, .opc1 = 4, .crn = 2, .crm = 0, .opc2 = 2,
       .access = PL2_RW, .type = ARM_CP_CONST, .resetvalue = 0 },
@@ -2951,6 +3218,23 @@ static const ARMCPRegInfo el2_cp_reginfo[] = {
       .opc1 = 4, .crn = 10, .crm = 2, .opc2 = 1,
       .access = PL2_RW, .type = ARM_CP_ALIAS,
       .fieldoffset = offsetofhigh32(CPUARMState, cp15.mair_el[2]) },
+    { .name = "AMAIR_EL2", .state = ARM_CP_STATE_BOTH,
+      .opc0 = 3, .opc1 = 4, .crn = 10, .crm = 3, .opc2 = 0,
+      .access = PL2_RW, .type = ARM_CP_CONST,
+      .resetvalue = 0 },
+    /* HAMAIR1 is mapped to AMAIR_EL2[63:32] */
+    { .name = "HMAIR1", .state = ARM_CP_STATE_AA32,
+      .opc1 = 4, .crn = 10, .crm = 3, .opc2 = 1,
+      .access = PL2_RW, .type = ARM_CP_CONST,
+      .resetvalue = 0 },
+    { .name = "AFSR0_EL2", .state = ARM_CP_STATE_BOTH,
+      .opc0 = 3, .opc1 = 4, .crn = 5, .crm = 1, .opc2 = 0,
+      .access = PL2_RW, .type = ARM_CP_CONST,
+      .resetvalue = 0 },
+    { .name = "AFSR1_EL2", .state = ARM_CP_STATE_BOTH,
+      .opc0 = 3, .opc1 = 4, .crn = 5, .crm = 1, .opc2 = 1,
+      .access = PL2_RW, .type = ARM_CP_CONST,
+      .resetvalue = 0 },
     { .name = "TCR_EL2", .state = ARM_CP_STATE_BOTH,
       .opc0 = 3, .opc1 = 4, .crn = 2, .crm = 0, .opc2 = 2,
       .access = PL2_RW, .writefn = vmsa_tcr_el1_write,
@@ -2974,16 +3258,51 @@ static const ARMCPRegInfo el2_cp_reginfo[] = {
     { .name = "TLBI_ALLE2", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 4, .crn = 8, .crm = 7, .opc2 = 0,
       .type = ARM_CP_NO_RAW, .access = PL2_W,
-      .writefn = tlbiall_write },
+      .writefn = tlbi_aa64_alle2_write },
     { .name = "TLBI_VAE2", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 4, .crn = 8, .crm = 7, .opc2 = 1,
       .type = ARM_CP_NO_RAW, .access = PL2_W,
-      .writefn = tlbi_aa64_vaa_write },
+      .writefn = tlbi_aa64_vae2_write },
+    { .name = "TLBI_VALE2", .state = ARM_CP_STATE_AA64,
+      .opc0 = 1, .opc1 = 4, .crn = 8, .crm = 7, .opc2 = 5,
+      .access = PL2_W, .type = ARM_CP_NO_RAW,
+      .writefn = tlbi_aa64_vae2_write },
+    { .name = "TLBI_ALLE2IS", .state = ARM_CP_STATE_AA64,
+      .opc0 = 1, .opc1 = 4, .crn = 8, .crm = 3, .opc2 = 0,
+      .access = PL2_W, .type = ARM_CP_NO_RAW,
+      .writefn = tlbi_aa64_alle2is_write },
     { .name = "TLBI_VAE2IS", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 4, .crn = 8, .crm = 3, .opc2 = 1,
       .type = ARM_CP_NO_RAW, .access = PL2_W,
-      .writefn = tlbi_aa64_vaa_write },
+      .writefn = tlbi_aa64_vae2is_write },
+    { .name = "TLBI_VALE2IS", .state = ARM_CP_STATE_AA64,
+      .opc0 = 1, .opc1 = 4, .crn = 8, .crm = 3, .opc2 = 5,
+      .access = PL2_W, .type = ARM_CP_NO_RAW,
+      .writefn = tlbi_aa64_vae2is_write },
 #ifndef CONFIG_USER_ONLY
+    /* Unlike the other EL2-related AT operations, these must
+     * UNDEF from EL3 if EL2 is not implemented, which is why we
+     * define them here rather than with the rest of the AT ops.
+     */
+    { .name = "AT_S1E2R", .state = ARM_CP_STATE_AA64,
+      .opc0 = 1, .opc1 = 4, .crn = 7, .crm = 8, .opc2 = 0,
+      .access = PL2_W, .accessfn = at_s1e2_access,
+      .type = ARM_CP_NO_RAW, .writefn = ats_write64 },
+    { .name = "AT_S1E2W", .state = ARM_CP_STATE_AA64,
+      .opc0 = 1, .opc1 = 4, .crn = 7, .crm = 8, .opc2 = 1,
+      .access = PL2_W, .accessfn = at_s1e2_access,
+      .type = ARM_CP_NO_RAW, .writefn = ats_write64 },
+    /* The AArch32 ATS1H* operations are CONSTRAINED UNPREDICTABLE
+     * if EL2 is not implemented; we choose to UNDEF. Behaviour at EL3
+     * with SCR.NS == 0 outside Monitor mode is UNPREDICTABLE; we choose
+     * to behave as if SCR.NS was 1.
+     */
+    { .name = "ATS1HR", .cp = 15, .opc1 = 4, .crn = 7, .crm = 8, .opc2 = 0,
+      .access = PL2_W,
+      .writefn = ats1h_write, .type = ARM_CP_NO_RAW },
+    { .name = "ATS1HW", .cp = 15, .opc1 = 4, .crn = 7, .crm = 8, .opc2 = 1,
+      .access = PL2_W,
+      .writefn = ats1h_write, .type = ARM_CP_NO_RAW },
     { .name = "CNTHCTL_EL2", .state = ARM_CP_STATE_BOTH,
       .opc0 = 3, .opc1 = 4, .crn = 14, .crm = 1, .opc2 = 0,
       /* ARMv7 requires bit 0 and 1 to reset to 1. ARMv8 defines the
@@ -3089,6 +3408,46 @@ static const ARMCPRegInfo el3_cp_reginfo[] = {
       .opc0 = 3, .opc1 = 6, .crn = 1, .crm = 1, .opc2 = 2,
       .access = PL3_RW, .accessfn = cptr_access, .resetvalue = 0,
       .fieldoffset = offsetof(CPUARMState, cp15.cptr_el[3]) },
+    { .name = "TPIDR_EL3", .state = ARM_CP_STATE_AA64,
+      .opc0 = 3, .opc1 = 6, .crn = 13, .crm = 0, .opc2 = 2,
+      .access = PL3_RW, .resetvalue = 0,
+      .fieldoffset = offsetof(CPUARMState, cp15.tpidr_el[3]) },
+    { .name = "AMAIR_EL3", .state = ARM_CP_STATE_AA64,
+      .opc0 = 3, .opc1 = 6, .crn = 10, .crm = 3, .opc2 = 0,
+      .access = PL3_RW, .type = ARM_CP_CONST,
+      .resetvalue = 0 },
+    { .name = "AFSR0_EL3", .state = ARM_CP_STATE_BOTH,
+      .opc0 = 3, .opc1 = 6, .crn = 5, .crm = 1, .opc2 = 0,
+      .access = PL3_RW, .type = ARM_CP_CONST,
+      .resetvalue = 0 },
+    { .name = "AFSR1_EL3", .state = ARM_CP_STATE_BOTH,
+      .opc0 = 3, .opc1 = 6, .crn = 5, .crm = 1, .opc2 = 1,
+      .access = PL3_RW, .type = ARM_CP_CONST,
+      .resetvalue = 0 },
+    { .name = "TLBI_ALLE3IS", .state = ARM_CP_STATE_AA64,
+      .opc0 = 1, .opc1 = 6, .crn = 8, .crm = 3, .opc2 = 0,
+      .access = PL3_W, .type = ARM_CP_NO_RAW,
+      .writefn = tlbi_aa64_alle3is_write },
+    { .name = "TLBI_VAE3IS", .state = ARM_CP_STATE_AA64,
+      .opc0 = 1, .opc1 = 6, .crn = 8, .crm = 3, .opc2 = 1,
+      .access = PL3_W, .type = ARM_CP_NO_RAW,
+      .writefn = tlbi_aa64_vae3is_write },
+    { .name = "TLBI_VALE3IS", .state = ARM_CP_STATE_AA64,
+      .opc0 = 1, .opc1 = 6, .crn = 8, .crm = 3, .opc2 = 5,
+      .access = PL3_W, .type = ARM_CP_NO_RAW,
+      .writefn = tlbi_aa64_vae3is_write },
+    { .name = "TLBI_ALLE3", .state = ARM_CP_STATE_AA64,
+      .opc0 = 1, .opc1 = 6, .crn = 8, .crm = 7, .opc2 = 0,
+      .access = PL3_W, .type = ARM_CP_NO_RAW,
+      .writefn = tlbi_aa64_alle3_write },
+    { .name = "TLBI_VAE3", .state = ARM_CP_STATE_AA64,
+      .opc0 = 1, .opc1 = 6, .crn = 8, .crm = 7, .opc2 = 1,
+      .access = PL3_W, .type = ARM_CP_NO_RAW,
+      .writefn = tlbi_aa64_vae3_write },
+    { .name = "TLBI_VALE3", .state = ARM_CP_STATE_AA64,
+      .opc0 = 1, .opc1 = 6, .crn = 8, .crm = 7, .opc2 = 5,
+      .access = PL3_W, .type = ARM_CP_NO_RAW,
+      .writefn = tlbi_aa64_vae3_write },
     REGINFO_SENTINEL
 };
 
@@ -3883,13 +4242,22 @@ void register_cp_regs_for_features(ARMCPU *cpu)
     }
 
     if (arm_feature(env, ARM_FEATURE_AUXCR)) {
-        ARMCPRegInfo auxcr = {
-            .name = "ACTLR_EL1", .state = ARM_CP_STATE_BOTH,
-            .opc0 = 3, .opc1 = 0, .crn = 1, .crm = 0, .opc2 = 1,
-            .access = PL1_RW, .type = ARM_CP_CONST,
-            .resetvalue = cpu->reset_auxcr
+        ARMCPRegInfo auxcr_reginfo[] = {
+            { .name = "ACTLR_EL1", .state = ARM_CP_STATE_BOTH,
+              .opc0 = 3, .opc1 = 0, .crn = 1, .crm = 0, .opc2 = 1,
+              .access = PL1_RW, .type = ARM_CP_CONST,
+              .resetvalue = cpu->reset_auxcr },
+            { .name = "ACTLR_EL2", .state = ARM_CP_STATE_BOTH,
+              .opc0 = 3, .opc1 = 4, .crn = 1, .crm = 0, .opc2 = 1,
+              .access = PL2_RW, .type = ARM_CP_CONST,
+              .resetvalue = 0 },
+            { .name = "ACTLR_EL3", .state = ARM_CP_STATE_AA64,
+              .opc0 = 3, .opc1 = 6, .crn = 1, .crm = 0, .opc2 = 1,
+              .access = PL3_RW, .type = ARM_CP_CONST,
+              .resetvalue = 0 },
+            REGINFO_SENTINEL
         };
-        define_one_arm_cp_reg(cpu, &auxcr);
+        define_arm_cp_regs(cpu, auxcr_reginfo);
     }
 
     if (arm_feature(env, ARM_FEATURE_CBAR)) {
@@ -5937,6 +6305,11 @@ static bool get_phys_addr_lpae(CPUARMState *env, target_ulong address,
         if (el > 1) {
             ttbr1_valid = false;
         }
+    } else {
+        /* There is no TTBR1 for EL2 */
+        if (el == 2) {
+            ttbr1_valid = false;
+        }
     }
 
     /* Determine whether this address is in the region controlled by
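A minimal stand-alone sketch (editorial illustration, not part of this patch) of the page-address computation shared by the TLBI-by-VA/IPA write handlers above, sextract64(value << 12, 0, 48), with the extract helper re-implemented locally:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Local re-implementation of QEMU's sextract64() for illustration only:
     * extract LEN bits starting at START and sign-extend the result.
     * Assumes 0 < LEN < 64.
     */
    static int64_t sextract64_sketch(uint64_t value, int start, int len)
    {
        uint64_t field = (value >> start) & (((uint64_t)1 << len) - 1);
        if (field & ((uint64_t)1 << (len - 1))) {
            field |= ~(((uint64_t)1 << len) - 1);   /* sign-extend */
        }
        return (int64_t)field;
    }

    int main(void)
    {
        uint64_t xt = 0x0000000012345678ULL;   /* hypothetical TLBI Xt value */
        /* Same expression as the handlers above: keep Xt[35:0] as the page
         * number, producing a page-aligned, sign-extended 48-bit address.
         */
        int64_t pageaddr = sextract64_sketch(xt << 12, 0, 48);
        printf("pageaddr = 0x%" PRIx64 "\n", (uint64_t)pageaddr);
        return 0;
    }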
diff --git a/target-arm/op_helper.c b/target-arm/op_helper.c
index 663c05d1d2..1425a1d4bb 100644
--- a/target-arm/op_helper.c
+++ b/target-arm/op_helper.c
@@ -444,6 +444,14 @@ void HELPER(access_check_cp_reg)(CPUARMState *env, void *rip, uint32_t syndrome)
         target_el = exception_target_el(env);
         syndrome = syn_uncategorized();
         break;
+    case CP_ACCESS_TRAP_UNCATEGORIZED_EL2:
+        target_el = 2;
+        syndrome = syn_uncategorized();
+        break;
+    case CP_ACCESS_TRAP_UNCATEGORIZED_EL3:
+        target_el = 3;
+        syndrome = syn_uncategorized();
+        break;
     default:
         g_assert_not_reached();
     }
diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index 689f2be896..5c13e153d4 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -528,9 +528,9 @@ static inline void gen_set_NZ64(TCGv_i64 result)
     TCGv_i64 flag = tcg_temp_new_i64();
 
     tcg_gen_setcondi_i64(TCG_COND_NE, flag, result, 0);
-    tcg_gen_trunc_i64_i32(cpu_ZF, flag);
+    tcg_gen_extrl_i64_i32(cpu_ZF, flag);
     tcg_gen_shri_i64(flag, result, 32);
-    tcg_gen_trunc_i64_i32(cpu_NF, flag);
+    tcg_gen_extrl_i64_i32(cpu_NF, flag);
     tcg_temp_free_i64(flag);
 }
 
@@ -540,8 +540,8 @@ static inline void gen_logic_CC(int sf, TCGv_i64 result)
     if (sf) {
         gen_set_NZ64(result);
     } else {
-        tcg_gen_trunc_i64_i32(cpu_ZF, result);
-        tcg_gen_trunc_i64_i32(cpu_NF, result);
+        tcg_gen_extrl_i64_i32(cpu_ZF, result);
+        tcg_gen_extrl_i64_i32(cpu_NF, result);
     }
     tcg_gen_movi_i32(cpu_CF, 0);
     tcg_gen_movi_i32(cpu_VF, 0);
@@ -559,7 +559,7 @@ static void gen_add_CC(int sf, TCGv_i64 dest, TCGv_i64 t0, TCGv_i64 t1)
         tcg_gen_movi_i64(tmp, 0);
         tcg_gen_add2_i64(result, flag, t0, tmp, t1, tmp);
 
-        tcg_gen_trunc_i64_i32(cpu_CF, flag);
+        tcg_gen_extrl_i64_i32(cpu_CF, flag);
 
         gen_set_NZ64(result);
 
@@ -568,7 +568,7 @@ static void gen_add_CC(int sf, TCGv_i64 dest, TCGv_i64 t0, TCGv_i64 t1)
         tcg_gen_andc_i64(flag, flag, tmp);
         tcg_temp_free_i64(tmp);
         tcg_gen_shri_i64(flag, flag, 32);
-        tcg_gen_trunc_i64_i32(cpu_VF, flag);
+        tcg_gen_extrl_i64_i32(cpu_VF, flag);
 
         tcg_gen_mov_i64(dest, result);
         tcg_temp_free_i64(result);
@@ -580,8 +580,8 @@ static void gen_add_CC(int sf, TCGv_i64 dest, TCGv_i64 t0, TCGv_i64 t1)
         TCGv_i32 tmp = tcg_temp_new_i32();
 
         tcg_gen_movi_i32(tmp, 0);
-        tcg_gen_trunc_i64_i32(t0_32, t0);
-        tcg_gen_trunc_i64_i32(t1_32, t1);
+        tcg_gen_extrl_i64_i32(t0_32, t0);
+        tcg_gen_extrl_i64_i32(t1_32, t1);
         tcg_gen_add2_i32(cpu_NF, cpu_CF, t0_32, tmp, t1_32, tmp);
         tcg_gen_mov_i32(cpu_ZF, cpu_NF);
         tcg_gen_xor_i32(cpu_VF, cpu_NF, t0_32);
@@ -609,7 +609,7 @@ static void gen_sub_CC(int sf, TCGv_i64 dest, TCGv_i64 t0, TCGv_i64 t1)
         gen_set_NZ64(result);
 
         tcg_gen_setcond_i64(TCG_COND_GEU, flag, t0, t1);
-        tcg_gen_trunc_i64_i32(cpu_CF, flag);
+        tcg_gen_extrl_i64_i32(cpu_CF, flag);
 
         tcg_gen_xor_i64(flag, result, t0);
         tmp = tcg_temp_new_i64();
@@ -617,7 +617,7 @@ static void gen_sub_CC(int sf, TCGv_i64 dest, TCGv_i64 t0, TCGv_i64 t1)
         tcg_gen_and_i64(flag, flag, tmp);
         tcg_temp_free_i64(tmp);
         tcg_gen_shri_i64(flag, flag, 32);
-        tcg_gen_trunc_i64_i32(cpu_VF, flag);
+        tcg_gen_extrl_i64_i32(cpu_VF, flag);
         tcg_gen_mov_i64(dest, result);
         tcg_temp_free_i64(flag);
         tcg_temp_free_i64(result);
@@ -627,8 +627,8 @@ static void gen_sub_CC(int sf, TCGv_i64 dest, TCGv_i64 t0, TCGv_i64 t1)
         TCGv_i32 t1_32 = tcg_temp_new_i32();
         TCGv_i32 tmp;
 
-        tcg_gen_trunc_i64_i32(t0_32, t0);
-        tcg_gen_trunc_i64_i32(t1_32, t1);
+        tcg_gen_extrl_i64_i32(t0_32, t0);
+        tcg_gen_extrl_i64_i32(t1_32, t1);
         tcg_gen_sub_i32(cpu_NF, t0_32, t1_32);
         tcg_gen_mov_i32(cpu_ZF, cpu_NF);
         tcg_gen_setcond_i32(TCG_COND_GEU, cpu_CF, t0_32, t1_32);
@@ -670,14 +670,14 @@ static void gen_adc_CC(int sf, TCGv_i64 dest, TCGv_i64 t0, TCGv_i64 t1)
         tcg_gen_extu_i32_i64(cf_64, cpu_CF);
         tcg_gen_add2_i64(result, cf_64, t0, tmp, cf_64, tmp);
         tcg_gen_add2_i64(result, cf_64, result, cf_64, t1, tmp);
-        tcg_gen_trunc_i64_i32(cpu_CF, cf_64);
+        tcg_gen_extrl_i64_i32(cpu_CF, cf_64);
         gen_set_NZ64(result);
 
         tcg_gen_xor_i64(vf_64, result, t0);
         tcg_gen_xor_i64(tmp, t0, t1);
         tcg_gen_andc_i64(vf_64, vf_64, tmp);
         tcg_gen_shri_i64(vf_64, vf_64, 32);
-        tcg_gen_trunc_i64_i32(cpu_VF, vf_64);
+        tcg_gen_extrl_i64_i32(cpu_VF, vf_64);
 
         tcg_gen_mov_i64(dest, result);
 
@@ -691,8 +691,8 @@ static void gen_adc_CC(int sf, TCGv_i64 dest, TCGv_i64 t0, TCGv_i64 t1)
         t1_32 = tcg_temp_new_i32();
         tmp = tcg_const_i32(0);
 
-        tcg_gen_trunc_i64_i32(t0_32, t0);
-        tcg_gen_trunc_i64_i32(t1_32, t1);
+        tcg_gen_extrl_i64_i32(t0_32, t0);
+        tcg_gen_extrl_i64_i32(t1_32, t1);
         tcg_gen_add2_i32(cpu_NF, cpu_CF, t0_32, tmp, cpu_CF, tmp);
         tcg_gen_add2_i32(cpu_NF, cpu_CF, cpu_NF, cpu_CF, t1_32, tmp);
 
@@ -1301,7 +1301,7 @@ static void gen_set_nzcv(TCGv_i64 tcg_rt)
     TCGv_i32 nzcv = tcg_temp_new_i32();
 
     /* take NZCV from R[t] */
-    tcg_gen_trunc_i64_i32(nzcv, tcg_rt);
+    tcg_gen_extrl_i64_i32(nzcv, tcg_rt);
 
     /* bit 31, N */
     tcg_gen_andi_i32(cpu_NF, nzcv, (1U << 31));
@@ -3131,8 +3131,8 @@ static void shift_reg(TCGv_i64 dst, TCGv_i64 src, int sf,
             TCGv_i32 t0, t1;
             t0 = tcg_temp_new_i32();
             t1 = tcg_temp_new_i32();
-            tcg_gen_trunc_i64_i32(t0, src);
-            tcg_gen_trunc_i64_i32(t1, shift_amount);
+            tcg_gen_extrl_i64_i32(t0, src);
+            tcg_gen_extrl_i64_i32(t1, shift_amount);
             tcg_gen_rotr_i32(t0, t0, t1);
             tcg_gen_extu_i32_i64(dst, t0);
             tcg_temp_free_i32(t0);
@@ -3680,7 +3680,7 @@ static void handle_clz(DisasContext *s, unsigned int sf,
         gen_helper_clz64(tcg_rd, tcg_rn);
     } else {
         TCGv_i32 tcg_tmp32 = tcg_temp_new_i32();
-        tcg_gen_trunc_i64_i32(tcg_tmp32, tcg_rn);
+        tcg_gen_extrl_i64_i32(tcg_tmp32, tcg_rn);
         gen_helper_clz(tcg_tmp32, tcg_tmp32);
         tcg_gen_extu_i32_i64(tcg_rd, tcg_tmp32);
         tcg_temp_free_i32(tcg_tmp32);
@@ -3698,7 +3698,7 @@ static void handle_cls(DisasContext *s, unsigned int sf,
         gen_helper_cls64(tcg_rd, tcg_rn);
     } else {
         TCGv_i32 tcg_tmp32 = tcg_temp_new_i32();
-        tcg_gen_trunc_i64_i32(tcg_tmp32, tcg_rn);
+        tcg_gen_extrl_i64_i32(tcg_tmp32, tcg_rn);
         gen_helper_cls32(tcg_tmp32, tcg_tmp32);
         tcg_gen_extu_i32_i64(tcg_rd, tcg_tmp32);
         tcg_temp_free_i32(tcg_tmp32);
@@ -3716,7 +3716,7 @@ static void handle_rbit(DisasContext *s, unsigned int sf,
         gen_helper_rbit64(tcg_rd, tcg_rn);
     } else {
         TCGv_i32 tcg_tmp32 = tcg_temp_new_i32();
-        tcg_gen_trunc_i64_i32(tcg_tmp32, tcg_rn);
+        tcg_gen_extrl_i64_i32(tcg_tmp32, tcg_rn);
         gen_helper_rbit(tcg_tmp32, tcg_tmp32);
         tcg_gen_extu_i32_i64(tcg_rd, tcg_tmp32);
         tcg_temp_free_i32(tcg_tmp32);
@@ -5475,16 +5475,16 @@ static void disas_simd_across_lanes(DisasContext *s, uint32_t insn)
         assert(elements == 4);
 
         read_vec_element(s, tcg_elt, rn, 0, MO_32);
-        tcg_gen_trunc_i64_i32(tcg_elt1, tcg_elt);
+        tcg_gen_extrl_i64_i32(tcg_elt1, tcg_elt);
         read_vec_element(s, tcg_elt, rn, 1, MO_32);
-        tcg_gen_trunc_i64_i32(tcg_elt2, tcg_elt);
+        tcg_gen_extrl_i64_i32(tcg_elt2, tcg_elt);
 
         do_minmaxop(s, tcg_elt1, tcg_elt2, opcode, is_min, fpst);
 
         read_vec_element(s, tcg_elt, rn, 2, MO_32);
-        tcg_gen_trunc_i64_i32(tcg_elt2, tcg_elt);
+        tcg_gen_extrl_i64_i32(tcg_elt2, tcg_elt);
         read_vec_element(s, tcg_elt, rn, 3, MO_32);
-        tcg_gen_trunc_i64_i32(tcg_elt3, tcg_elt);
+        tcg_gen_extrl_i64_i32(tcg_elt3, tcg_elt);
 
         do_minmaxop(s, tcg_elt2, tcg_elt3, opcode, is_min, fpst);
 
@@ -7647,7 +7647,7 @@ static void handle_2misc_narrow(DisasContext *s, bool scalar,
             static NeonGenNarrowFn * const xtnfns[3] = {
                 gen_helper_neon_narrow_u8,
                 gen_helper_neon_narrow_u16,
-                tcg_gen_trunc_i64_i32,
+                tcg_gen_extrl_i64_i32,
             };
             static NeonGenNarrowEnvFn * const sqxtunfns[3] = {
                 gen_helper_neon_unarrow_sat8,
@@ -7681,10 +7681,10 @@ static void handle_2misc_narrow(DisasContext *s, bool scalar,
             } else {
                 TCGv_i32 tcg_lo = tcg_temp_new_i32();
                 TCGv_i32 tcg_hi = tcg_temp_new_i32();
-                tcg_gen_trunc_i64_i32(tcg_lo, tcg_op);
+                tcg_gen_extrl_i64_i32(tcg_lo, tcg_op);
                 gen_helper_vfp_fcvt_f32_to_f16(tcg_lo, tcg_lo, cpu_env);
                 tcg_gen_shri_i64(tcg_op, tcg_op, 32);
-                tcg_gen_trunc_i64_i32(tcg_hi, tcg_op);
+                tcg_gen_extrl_i64_i32(tcg_hi, tcg_op);
                 gen_helper_vfp_fcvt_f32_to_f16(tcg_hi, tcg_hi, cpu_env);
                 tcg_gen_deposit_i32(tcg_res[pass], tcg_lo, tcg_hi, 16, 16);
                 tcg_temp_free_i32(tcg_lo);
@@ -8593,7 +8593,7 @@ static void handle_3rd_wide(DisasContext *s, int is_q, int is_u, int size,
 static void do_narrow_high_u32(TCGv_i32 res, TCGv_i64 in)
 {
     tcg_gen_shri_i64(in, in, 32);
-    tcg_gen_trunc_i64_i32(res, in);
+    tcg_gen_extrl_i64_i32(res, in);
 }
 
 static void do_narrow_round_high_u32(TCGv_i32 res, TCGv_i64 in)
diff --git a/target-arm/translate.c b/target-arm/translate.c
index 69ac18c108..e27634f3c8 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -1557,7 +1557,7 @@ static inline int gen_iwmmxt_shift(uint32_t insn, uint32_t mask, TCGv_i32 dest)
     } else {
         tmp = tcg_temp_new_i32();
         iwmmxt_load_reg(cpu_V0, rd);
-        tcg_gen_trunc_i64_i32(tmp, cpu_V0);
+        tcg_gen_extrl_i64_i32(tmp, cpu_V0);
     }
     tcg_gen_andi_i32(tmp, tmp, mask);
     tcg_gen_mov_i32(dest, tmp);
@@ -1581,9 +1581,9 @@ static int disas_iwmmxt_insn(DisasContext *s, uint32_t insn)
             rdhi = (insn >> 16) & 0xf;
             if (insn & ARM_CP_RW_BIT) {			/* TMRRC */
                 iwmmxt_load_reg(cpu_V0, wrd);
-                tcg_gen_trunc_i64_i32(cpu_R[rdlo], cpu_V0);
+                tcg_gen_extrl_i64_i32(cpu_R[rdlo], cpu_V0);
                 tcg_gen_shri_i64(cpu_V0, cpu_V0, 32);
-                tcg_gen_trunc_i64_i32(cpu_R[rdhi], cpu_V0);
+                tcg_gen_extrl_i64_i32(cpu_R[rdhi], cpu_V0);
             } else {					/* TMCRR */
                 tcg_gen_concat_i32_i64(cpu_V0, cpu_R[rdlo], cpu_R[rdhi]);
                 iwmmxt_store_reg(cpu_V0, wrd);
@@ -1638,15 +1638,15 @@ static int disas_iwmmxt_insn(DisasContext *s, uint32_t insn)
                     if (insn & (1 << 22)) {		/* WSTRD */
                         gen_aa32_st64(cpu_M0, addr, get_mem_index(s));
                     } else {				/* WSTRW wRd */
-                        tcg_gen_trunc_i64_i32(tmp, cpu_M0);
+                        tcg_gen_extrl_i64_i32(tmp, cpu_M0);
                         gen_aa32_st32(tmp, addr, get_mem_index(s));
                     }
                 } else {
                     if (insn & (1 << 22)) {		/* WSTRH */
-                        tcg_gen_trunc_i64_i32(tmp, cpu_M0);
+                        tcg_gen_extrl_i64_i32(tmp, cpu_M0);
                         gen_aa32_st16(tmp, addr, get_mem_index(s));
                     } else {				/* WSTRB */
-                        tcg_gen_trunc_i64_i32(tmp, cpu_M0);
+                        tcg_gen_extrl_i64_i32(tmp, cpu_M0);
                         gen_aa32_st8(tmp, addr, get_mem_index(s));
                     }
                 }
@@ -1946,7 +1946,7 @@ static int disas_iwmmxt_insn(DisasContext *s, uint32_t insn)
         switch ((insn >> 22) & 3) {
         case 0:
             tcg_gen_shri_i64(cpu_M0, cpu_M0, (insn & 7) << 3);
-            tcg_gen_trunc_i64_i32(tmp, cpu_M0);
+            tcg_gen_extrl_i64_i32(tmp, cpu_M0);
             if (insn & 8) {
                 tcg_gen_ext8s_i32(tmp, tmp);
             } else {
@@ -1955,7 +1955,7 @@ static int disas_iwmmxt_insn(DisasContext *s, uint32_t insn)
             break;
         case 1:
             tcg_gen_shri_i64(cpu_M0, cpu_M0, (insn & 3) << 4);
-            tcg_gen_trunc_i64_i32(tmp, cpu_M0);
+            tcg_gen_extrl_i64_i32(tmp, cpu_M0);
             if (insn & 8) {
                 tcg_gen_ext16s_i32(tmp, tmp);
             } else {
@@ -1964,7 +1964,7 @@ static int disas_iwmmxt_insn(DisasContext *s, uint32_t insn)
             break;
         case 2:
             tcg_gen_shri_i64(cpu_M0, cpu_M0, (insn & 1) << 5);
-            tcg_gen_trunc_i64_i32(tmp, cpu_M0);
+            tcg_gen_extrl_i64_i32(tmp, cpu_M0);
             break;
         }
         store_reg(s, rd, tmp);
@@ -2627,9 +2627,9 @@ static int disas_dsp_insn(DisasContext *s, uint32_t insn)
 
         if (insn & ARM_CP_RW_BIT) {			/* MRA */
             iwmmxt_load_reg(cpu_V0, acc);
-            tcg_gen_trunc_i64_i32(cpu_R[rdlo], cpu_V0);
+            tcg_gen_extrl_i64_i32(cpu_R[rdlo], cpu_V0);
             tcg_gen_shri_i64(cpu_V0, cpu_V0, 32);
-            tcg_gen_trunc_i64_i32(cpu_R[rdhi], cpu_V0);
+            tcg_gen_extrl_i64_i32(cpu_R[rdhi], cpu_V0);
             tcg_gen_andi_i32(cpu_R[rdhi], cpu_R[rdhi], (1 << (40 - 32)) - 1);
         } else {					/* MAR */
             tcg_gen_concat_i32_i64(cpu_V0, cpu_R[rdlo], cpu_R[rdhi]);
@@ -2951,7 +2951,7 @@ static int handle_vcvt(uint32_t insn, uint32_t rd, uint32_t rm, uint32_t dp,
         } else {
             gen_helper_vfp_tould(tcg_res, tcg_double, tcg_shift, fpst);
         }
-        tcg_gen_trunc_i64_i32(tcg_tmp, tcg_res);
+        tcg_gen_extrl_i64_i32(tcg_tmp, tcg_res);
         tcg_gen_st_f32(tcg_tmp, cpu_env, vfp_reg_offset(0, rd));
         tcg_temp_free_i32(tcg_tmp);
         tcg_temp_free_i64(tcg_res);
@@ -4683,7 +4683,7 @@ static inline void gen_neon_narrow(int size, TCGv_i32 dest, TCGv_i64 src)
     switch (size) {
     case 0: gen_helper_neon_narrow_u8(dest, src); break;
     case 1: gen_helper_neon_narrow_u16(dest, src); break;
-    case 2: tcg_gen_trunc_i64_i32(dest, src); break;
+    case 2: tcg_gen_extrl_i64_i32(dest, src); break;
     default: abort();
     }
 }
@@ -6254,7 +6254,7 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
                                 break;
                             case 2:
                                 tcg_gen_shri_i64(cpu_V0, cpu_V0, 32);
-                                tcg_gen_trunc_i64_i32(tmp, cpu_V0);
+                                tcg_gen_extrl_i64_i32(tmp, cpu_V0);
                                 break;
                             default: abort();
                             }
@@ -6269,7 +6269,7 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
                             case 2:
                                 tcg_gen_addi_i64(cpu_V0, cpu_V0, 1u << 31);
                                 tcg_gen_shri_i64(cpu_V0, cpu_V0, 32);
-                                tcg_gen_trunc_i64_i32(tmp, cpu_V0);
+                                tcg_gen_extrl_i64_i32(tmp, cpu_V0);
                                 break;
                             default: abort();
                             }
@@ -7224,11 +7224,11 @@ static int disas_coproc_insn(DisasContext *s, uint32_t insn)
                     tcg_gen_ld_i64(tmp64, cpu_env, ri->fieldoffset);
                 }
                 tmp = tcg_temp_new_i32();
-                tcg_gen_trunc_i64_i32(tmp, tmp64);
+                tcg_gen_extrl_i64_i32(tmp, tmp64);
                 store_reg(s, rt, tmp);
                 tcg_gen_shri_i64(tmp64, tmp64, 32);
                 tmp = tcg_temp_new_i32();
-                tcg_gen_trunc_i64_i32(tmp, tmp64);
+                tcg_gen_extrl_i64_i32(tmp, tmp64);
                 tcg_temp_free_i64(tmp64);
                 store_reg(s, rt2, tmp);
             } else {
@@ -7334,11 +7334,11 @@ static void gen_storeq_reg(DisasContext *s, int rlow, int rhigh, TCGv_i64 val)
 {
     TCGv_i32 tmp;
     tmp = tcg_temp_new_i32();
-    tcg_gen_trunc_i64_i32(tmp, val);
+    tcg_gen_extrl_i64_i32(tmp, val);
     store_reg(s, rlow, tmp);
     tmp = tcg_temp_new_i32();
     tcg_gen_shri_i64(val, val, 32);
-    tcg_gen_trunc_i64_i32(tmp, val);
+    tcg_gen_extrl_i64_i32(tmp, val);
     store_reg(s, rhigh, tmp);
 }
 
@@ -8013,7 +8013,7 @@ static void disas_arm_insn(DisasContext *s, unsigned int insn)
                 tmp64 = gen_muls_i64_i32(tmp, tmp2);
                 tcg_gen_shri_i64(tmp64, tmp64, 16);
                 tmp = tcg_temp_new_i32();
-                tcg_gen_trunc_i64_i32(tmp, tmp64);
+                tcg_gen_extrl_i64_i32(tmp, tmp64);
                 tcg_temp_free_i64(tmp64);
                 if ((sh & 2) == 0) {
                     tmp2 = load_reg(s, rn);
@@ -8679,7 +8679,7 @@ static void disas_arm_insn(DisasContext *s, unsigned int insn)
                         }
                         tcg_gen_shri_i64(tmp64, tmp64, 32);
                         tmp = tcg_temp_new_i32();
-                        tcg_gen_trunc_i64_i32(tmp, tmp64);
+                        tcg_gen_extrl_i64_i32(tmp, tmp64);
                         tcg_temp_free_i64(tmp64);
                         store_reg(s, rn, tmp);
                         break;
@@ -9749,7 +9749,7 @@ static int disas_thumb2_insn(CPUARMState *env, DisasContext *s, uint16_t insn_hw
                 tmp64 = gen_muls_i64_i32(tmp, tmp2);
                 tcg_gen_shri_i64(tmp64, tmp64, 16);
                 tmp = tcg_temp_new_i32();
-                tcg_gen_trunc_i64_i32(tmp, tmp64);
+                tcg_gen_extrl_i64_i32(tmp, tmp64);
                 tcg_temp_free_i64(tmp64);
                 if (rs != 15)
                   {
@@ -9773,7 +9773,7 @@ static int disas_thumb2_insn(CPUARMState *env, DisasContext *s, uint16_t insn_hw
                 }
                 tcg_gen_shri_i64(tmp64, tmp64, 32);
                 tmp = tcg_temp_new_i32();
-                tcg_gen_trunc_i64_i32(tmp, tmp64);
+                tcg_gen_extrl_i64_i32(tmp, tmp64);
                 tcg_temp_free_i64(tmp64);
                 break;
             case 7: /* Unsigned sum of absolute differences.  */
diff --git a/target-cris/translate.c b/target-cris/translate.c
index 3e59601eb4..5699826c8b 100644
--- a/target-cris/translate.c
+++ b/target-cris/translate.c
@@ -2604,9 +2604,9 @@ static int dec_movem_mr(CPUCRISState *env, DisasContext *dc)
     tcg_temp_free(addr);
 
     for (i = 0; i < (nr >> 1); i++) {
-        tcg_gen_trunc_i64_i32(cpu_R[i * 2], tmp[i]);
+        tcg_gen_extrl_i64_i32(cpu_R[i * 2], tmp[i]);
         tcg_gen_shri_i64(tmp[i], tmp[i], 32);
-        tcg_gen_trunc_i64_i32(cpu_R[i * 2 + 1], tmp[i]);
+        tcg_gen_extrl_i64_i32(cpu_R[i * 2 + 1], tmp[i]);
         tcg_temp_free_i64(tmp[i]);
     }
     if (nr & 1) {
diff --git a/target-m68k/translate.c b/target-m68k/translate.c
index a57d2415c9..3cdf6652aa 100644
--- a/target-m68k/translate.c
+++ b/target-m68k/translate.c
@@ -2680,7 +2680,7 @@ DISAS_INSN(from_mac)
     if (s->env->macsr & MACSR_FI) {
         gen_helper_get_macf(rx, cpu_env, acc);
     } else if ((s->env->macsr & MACSR_OMC) == 0) {
-        tcg_gen_trunc_i64_i32(rx, acc);
+        tcg_gen_extrl_i64_i32(rx, acc);
     } else if (s->env->macsr & MACSR_SU) {
         gen_helper_get_macs(rx, acc);
     } else {
diff --git a/target-microblaze/translate.c b/target-microblaze/translate.c
index f4e969b29c..47ac18015e 100644
--- a/target-microblaze/translate.c
+++ b/target-microblaze/translate.c
@@ -598,9 +598,9 @@ static void t_gen_muls(TCGv d, TCGv d2, TCGv a, TCGv b)
     tcg_gen_ext_i32_i64(t1, b);
     tcg_gen_mul_i64(t0, t0, t1);
 
-    tcg_gen_trunc_i64_i32(d, t0);
+    tcg_gen_extrl_i64_i32(d, t0);
     tcg_gen_shri_i64(t0, t0, 32);
-    tcg_gen_trunc_i64_i32(d2, t0);
+    tcg_gen_extrl_i64_i32(d2, t0);
 
     tcg_temp_free_i64(t0);
     tcg_temp_free_i64(t1);
@@ -618,9 +618,9 @@ static void t_gen_mulu(TCGv d, TCGv d2, TCGv a, TCGv b)
     tcg_gen_extu_i32_i64(t1, b);
     tcg_gen_mul_i64(t0, t0, t1);
 
-    tcg_gen_trunc_i64_i32(d, t0);
+    tcg_gen_extrl_i64_i32(d, t0);
     tcg_gen_shri_i64(t0, t0, 32);
-    tcg_gen_trunc_i64_i32(d2, t0);
+    tcg_gen_extrl_i64_i32(d2, t0);
 
     tcg_temp_free_i64(t0);
     tcg_temp_free_i64(t1);
diff --git a/target-mips/translate.c b/target-mips/translate.c
index 98cf72de74..93cb4f2731 100644
--- a/target-mips/translate.c
+++ b/target-mips/translate.c
@@ -1629,7 +1629,7 @@ static void gen_load_fpr32(DisasContext *ctx, TCGv_i32 t, int reg)
     if (ctx->hflags & MIPS_HFLAG_FRE) {
         generate_exception(ctx, EXCP_RI);
     }
-    tcg_gen_trunc_i64_i32(t, fpu_f64[reg]);
+    tcg_gen_extrl_i64_i32(t, fpu_f64[reg]);
 }
 
 static void gen_store_fpr32(DisasContext *ctx, TCGv_i32 t, int reg)
@@ -1649,7 +1649,7 @@ static void gen_load_fpr32h(DisasContext *ctx, TCGv_i32 t, int reg)
     if (ctx->hflags & MIPS_HFLAG_F64) {
         TCGv_i64 t64 = tcg_temp_new_i64();
         tcg_gen_shri_i64(t64, fpu_f64[reg], 32);
-        tcg_gen_trunc_i64_i32(t, t64);
+        tcg_gen_extrl_i64_i32(t, t64);
         tcg_temp_free_i64(t64);
     } else {
         gen_load_fpr32(ctx, t, reg | 1);
diff --git a/target-openrisc/translate.c b/target-openrisc/translate.c
index a62cbf4011..aca1242bdb 100644
--- a/target-openrisc/translate.c
+++ b/target-openrisc/translate.c
@@ -279,7 +279,7 @@ static void dec_calc(DisasContext *dc, uint32_t insn)
                 tcg_gen_extu_i32_i64(ta, cpu_R[ra]);
                 tcg_gen_extu_i32_i64(tb, cpu_R[rb]);
                 tcg_gen_add_i64(td, ta, tb);
-                tcg_gen_trunc_i64_i32(res, td);
+                tcg_gen_extrl_i64_i32(res, td);
                 tcg_gen_shri_i64(td, td, 31);
                 tcg_gen_andi_i64(td, td, 0x3);
                 /* Jump to lab when no overflow.  */
@@ -324,7 +324,7 @@ static void dec_calc(DisasContext *dc, uint32_t insn)
                 tcg_gen_shri_i64(tcy, tcy, 10);
                 tcg_gen_add_i64(td, ta, tb);
                 tcg_gen_add_i64(td, td, tcy);
-                tcg_gen_trunc_i64_i32(res, td);
+                tcg_gen_extrl_i64_i32(res, td);
                 tcg_gen_shri_i64(td, td, 32);
                 tcg_gen_andi_i64(td, td, 0x3);
                 /* Jump to lab when no overflow.  */
@@ -366,7 +366,7 @@ static void dec_calc(DisasContext *dc, uint32_t insn)
                 tcg_gen_extu_i32_i64(ta, cpu_R[ra]);
                 tcg_gen_extu_i32_i64(tb, cpu_R[rb]);
                 tcg_gen_sub_i64(td, ta, tb);
-                tcg_gen_trunc_i64_i32(res, td);
+                tcg_gen_extrl_i64_i32(res, td);
                 tcg_gen_shri_i64(td, td, 31);
                 tcg_gen_andi_i64(td, td, 0x3);
                 /* Jump to lab when no overflow.  */
@@ -779,9 +779,9 @@ static void dec_misc(DisasContext *dc, uint32_t insn)
             tcg_gen_ext_i32_i64(t1, dst);
             tcg_gen_concat_i32_i64(t2, maclo, machi);
             tcg_gen_add_i64(t2, t2, t1);
-            tcg_gen_trunc_i64_i32(maclo, t2);
+            tcg_gen_extrl_i64_i32(maclo, t2);
             tcg_gen_shri_i64(t2, t2, 32);
-            tcg_gen_trunc_i64_i32(machi, t2);
+            tcg_gen_extrl_i64_i32(machi, t2);
             tcg_temp_free_i32(dst);
             tcg_temp_free(ttmp);
             tcg_temp_free_i64(t1);
@@ -898,7 +898,7 @@ static void dec_misc(DisasContext *dc, uint32_t insn)
                 TCGv_i32 sr_ove = tcg_temp_local_new_i32();
                 tcg_gen_extu_i32_i64(ta, cpu_R[ra]);
                 tcg_gen_addi_i64(td, ta, sign_extend(I16, 16));
-                tcg_gen_trunc_i64_i32(res, td);
+                tcg_gen_extrl_i64_i32(res, td);
                 tcg_gen_shri_i64(td, td, 32);
                 tcg_gen_andi_i64(td, td, 0x3);
                 /* Jump to lab when no overflow.  */
@@ -934,7 +934,7 @@ static void dec_misc(DisasContext *dc, uint32_t insn)
             tcg_gen_extu_i32_i64(tcy, sr_cy);
             tcg_gen_addi_i64(td, ta, sign_extend(I16, 16));
             tcg_gen_add_i64(td, td, tcy);
-            tcg_gen_trunc_i64_i32(res, td);
+            tcg_gen_extrl_i64_i32(res, td);
             tcg_gen_shri_i64(td, td, 32);
             tcg_gen_andi_i64(td, td, 0x3);
             /* Jump to lab when no overflow.  */
@@ -1073,9 +1073,9 @@ static void dec_mac(DisasContext *dc, uint32_t insn)
             tcg_gen_ext_i32_i64(t1, t0);
             tcg_gen_concat_i32_i64(t2, maclo, machi);
             tcg_gen_add_i64(t2, t2, t1);
-            tcg_gen_trunc_i64_i32(maclo, t2);
+            tcg_gen_extrl_i64_i32(maclo, t2);
             tcg_gen_shri_i64(t2, t2, 32);
-            tcg_gen_trunc_i64_i32(machi, t2);
+            tcg_gen_extrl_i64_i32(machi, t2);
             tcg_temp_free_i32(t0);
             tcg_temp_free_i64(t1);
             tcg_temp_free_i64(t2);
@@ -1092,9 +1092,9 @@ static void dec_mac(DisasContext *dc, uint32_t insn)
             tcg_gen_ext_i32_i64(t1, t0);
             tcg_gen_concat_i32_i64(t2, maclo, machi);
             tcg_gen_sub_i64(t2, t2, t1);
-            tcg_gen_trunc_i64_i32(maclo, t2);
+            tcg_gen_extrl_i64_i32(maclo, t2);
             tcg_gen_shri_i64(t2, t2, 32);
-            tcg_gen_trunc_i64_i32(machi, t2);
+            tcg_gen_extrl_i64_i32(machi, t2);
             tcg_temp_free_i32(t0);
             tcg_temp_free_i64(t1);
             tcg_temp_free_i64(t2);
diff --git a/target-s390x/translate.c b/target-s390x/translate.c
index c748290d5c..2bca33acca 100644
--- a/target-s390x/translate.c
+++ b/target-s390x/translate.c
@@ -811,7 +811,7 @@ static void disas_jcc(DisasContext *s, DisasCompare *c, uint32_t mask)
     case CC_OP_LTGT0_32:
         c->is_64 = false;
         c->u.s32.a = tcg_temp_new_i32();
-        tcg_gen_trunc_i64_i32(c->u.s32.a, cc_dst);
+        tcg_gen_extrl_i64_i32(c->u.s32.a, cc_dst);
         c->u.s32.b = tcg_const_i32(0);
         break;
     case CC_OP_LTGT_32:
@@ -819,9 +819,9 @@ static void disas_jcc(DisasContext *s, DisasCompare *c, uint32_t mask)
     case CC_OP_SUBU_32:
         c->is_64 = false;
         c->u.s32.a = tcg_temp_new_i32();
-        tcg_gen_trunc_i64_i32(c->u.s32.a, cc_src);
+        tcg_gen_extrl_i64_i32(c->u.s32.a, cc_src);
         c->u.s32.b = tcg_temp_new_i32();
-        tcg_gen_trunc_i64_i32(c->u.s32.b, cc_dst);
+        tcg_gen_extrl_i64_i32(c->u.s32.b, cc_dst);
         break;
 
     case CC_OP_LTGT0_64:
@@ -851,11 +851,11 @@ static void disas_jcc(DisasContext *s, DisasCompare *c, uint32_t mask)
         c->is_64 = false;
         c->u.s32.a = tcg_temp_new_i32();
         c->u.s32.b = tcg_temp_new_i32();
-        tcg_gen_trunc_i64_i32(c->u.s32.a, cc_vr);
+        tcg_gen_extrl_i64_i32(c->u.s32.a, cc_vr);
         if (cond == TCG_COND_EQ || cond == TCG_COND_NE) {
             tcg_gen_movi_i32(c->u.s32.b, 0);
         } else {
-            tcg_gen_trunc_i64_i32(c->u.s32.b, cc_src);
+            tcg_gen_extrl_i64_i32(c->u.s32.b, cc_src);
         }
         break;
 
@@ -1532,7 +1532,7 @@ static ExitStatus op_bct32(DisasContext *s, DisasOps *o)
     store_reg32_i64(r1, t);
     c.u.s32.a = tcg_temp_new_i32();
     c.u.s32.b = tcg_const_i32(0);
-    tcg_gen_trunc_i64_i32(c.u.s32.a, t);
+    tcg_gen_extrl_i64_i32(c.u.s32.a, t);
     tcg_temp_free_i64(t);
 
     return help_branch(s, &c, is_imm, imm, o->in2);
@@ -1556,7 +1556,7 @@ static ExitStatus op_bcth(DisasContext *s, DisasOps *o)
     store_reg32h_i64(r1, t);
     c.u.s32.a = tcg_temp_new_i32();
     c.u.s32.b = tcg_const_i32(0);
-    tcg_gen_trunc_i64_i32(c.u.s32.a, t);
+    tcg_gen_extrl_i64_i32(c.u.s32.a, t);
     tcg_temp_free_i64(t);
 
     return help_branch(s, &c, 1, imm, o->in2);
@@ -1599,8 +1599,8 @@ static ExitStatus op_bx32(DisasContext *s, DisasOps *o)
     tcg_gen_add_i64(t, regs[r1], regs[r3]);
     c.u.s32.a = tcg_temp_new_i32();
     c.u.s32.b = tcg_temp_new_i32();
-    tcg_gen_trunc_i64_i32(c.u.s32.a, t);
-    tcg_gen_trunc_i64_i32(c.u.s32.b, regs[r3 | 1]);
+    tcg_gen_extrl_i64_i32(c.u.s32.a, t);
+    tcg_gen_extrl_i64_i32(c.u.s32.b, regs[r3 | 1]);
     store_reg32_i64(r1, t);
     tcg_temp_free_i64(t);
 
@@ -1905,7 +1905,7 @@ static ExitStatus op_clm(DisasContext *s, DisasOps *o)
 {
     TCGv_i32 m3 = tcg_const_i32(get_field(s->fields, m3));
     TCGv_i32 t1 = tcg_temp_new_i32();
-    tcg_gen_trunc_i64_i32(t1, o->in1);
+    tcg_gen_extrl_i64_i32(t1, o->in1);
     potential_page_fault(s);
     gen_helper_clm(cc_op, cpu_env, t1, m3, o->in2);
     set_cc_static(s);
@@ -1977,7 +1977,7 @@ static ExitStatus op_cs(DisasContext *s, DisasOps *o)
 
     /* Store CC back to cc_op.  Wait until after the store so that any
        exception gets the old cc_op value.  */
-    tcg_gen_trunc_i64_i32(cc_op, cc);
+    tcg_gen_extrl_i64_i32(cc_op, cc);
     tcg_temp_free_i64(cc);
     set_cc_static(s);
     return NO_EXIT;
@@ -2027,7 +2027,7 @@ static ExitStatus op_cdsg(DisasContext *s, DisasOps *o)
     /* Save back state now that we've passed all exceptions.  */
     tcg_gen_mov_i64(regs[r1], outh);
     tcg_gen_mov_i64(regs[r1 + 1], outl);
-    tcg_gen_trunc_i64_i32(cc_op, cc);
+    tcg_gen_extrl_i64_i32(cc_op, cc);
     tcg_temp_free_i64(outh);
     tcg_temp_free_i64(outl);
     tcg_temp_free_i64(cc);
@@ -2051,7 +2051,7 @@ static ExitStatus op_cvd(DisasContext *s, DisasOps *o)
 {
     TCGv_i64 t1 = tcg_temp_new_i64();
     TCGv_i32 t2 = tcg_temp_new_i32();
-    tcg_gen_trunc_i64_i32(t2, o->in1);
+    tcg_gen_extrl_i64_i32(t2, o->in1);
     gen_helper_cvd(t1, t2);
     tcg_temp_free_i32(t2);
     tcg_gen_qemu_st64(t1, o->in2, get_mem_index(s));
@@ -3235,8 +3235,8 @@ static ExitStatus op_rll32(DisasContext *s, DisasOps *o)
     TCGv_i32 t1 = tcg_temp_new_i32();
     TCGv_i32 t2 = tcg_temp_new_i32();
     TCGv_i32 to = tcg_temp_new_i32();
-    tcg_gen_trunc_i64_i32(t1, o->in1);
-    tcg_gen_trunc_i64_i32(t2, o->in2);
+    tcg_gen_extrl_i64_i32(t1, o->in1);
+    tcg_gen_extrl_i64_i32(t2, o->in2);
     tcg_gen_rotl_i32(to, t1, t2);
     tcg_gen_extu_i32_i64(o->out, to);
     tcg_temp_free_i32(t1);
diff --git a/target-sh4/translate.c b/target-sh4/translate.c
index 3b4a1b5cea..be0cb321cf 100644
--- a/target-sh4/translate.c
+++ b/target-sh4/translate.c
@@ -288,10 +288,10 @@ static inline void gen_load_fpr64(TCGv_i64 t, int reg)
 static inline void gen_store_fpr64 (TCGv_i64 t, int reg)
 {
     TCGv_i32 tmp = tcg_temp_new_i32();
-    tcg_gen_trunc_i64_i32(tmp, t);
+    tcg_gen_extrl_i64_i32(tmp, t);
     tcg_gen_mov_i32(cpu_fregs[reg + 1], tmp);
     tcg_gen_shri_i64(t, t, 32);
-    tcg_gen_trunc_i64_i32(tmp, t);
+    tcg_gen_extrl_i64_i32(tmp, t);
     tcg_gen_mov_i32(cpu_fregs[reg], tmp);
     tcg_temp_free_i32(tmp);
 }
diff --git a/target-sparc/translate.c b/target-sparc/translate.c
index c58dd4e95b..48fc2abe63 100644
--- a/target-sparc/translate.c
+++ b/target-sparc/translate.c
@@ -164,7 +164,7 @@ static TCGv_i32 gen_load_fpr_F(DisasContext *dc, unsigned int src)
         TCGv_i64 t = tcg_temp_new_i64();
 
         tcg_gen_shri_i64(t, cpu_fpr[src / 2], 32);
-        tcg_gen_trunc_i64_i32(ret, t);
+        tcg_gen_extrl_i64_i32(ret, t);
         tcg_temp_free_i64(t);
 
         return ret;
@@ -379,8 +379,8 @@ static TCGv_i32 gen_add32_carry32(void)
 #if TARGET_LONG_BITS == 64
     cc_src1_32 = tcg_temp_new_i32();
     cc_src2_32 = tcg_temp_new_i32();
-    tcg_gen_trunc_i64_i32(cc_src1_32, cpu_cc_dst);
-    tcg_gen_trunc_i64_i32(cc_src2_32, cpu_cc_src);
+    tcg_gen_extrl_i64_i32(cc_src1_32, cpu_cc_dst);
+    tcg_gen_extrl_i64_i32(cc_src2_32, cpu_cc_src);
 #else
     cc_src1_32 = cpu_cc_dst;
     cc_src2_32 = cpu_cc_src;
@@ -405,8 +405,8 @@ static TCGv_i32 gen_sub32_carry32(void)
 #if TARGET_LONG_BITS == 64
     cc_src1_32 = tcg_temp_new_i32();
     cc_src2_32 = tcg_temp_new_i32();
-    tcg_gen_trunc_i64_i32(cc_src1_32, cpu_cc_src);
-    tcg_gen_trunc_i64_i32(cc_src2_32, cpu_cc_src2);
+    tcg_gen_extrl_i64_i32(cc_src1_32, cpu_cc_src);
+    tcg_gen_extrl_i64_i32(cc_src2_32, cpu_cc_src2);
 #else
     cc_src1_32 = cpu_cc_src;
     cc_src2_32 = cpu_cc_src2;
@@ -2254,11 +2254,11 @@ static void gen_fmovs(DisasContext *dc, DisasCompare *cmp, int rd, int rs)
        the later.  */
     c32 = tcg_temp_new_i32();
     if (cmp->is_bool) {
-        tcg_gen_trunc_i64_i32(c32, cmp->c1);
+        tcg_gen_extrl_i64_i32(c32, cmp->c1);
     } else {
         TCGv_i64 c64 = tcg_temp_new_i64();
         tcg_gen_setcond_i64(cmp->cond, c64, cmp->c1, cmp->c2);
-        tcg_gen_trunc_i64_i32(c32, c64);
+        tcg_gen_extrl_i64_i32(c32, c64);
         tcg_temp_free_i64(c64);
     }
 
diff --git a/target-tricore/translate.c b/target-tricore/translate.c
index 7dc7a325b4..f02bef41ee 100644
--- a/target-tricore/translate.c
+++ b/target-tricore/translate.c
@@ -457,11 +457,11 @@ gen_add64_d(TCGv_i64 ret, TCGv_i64 r1, TCGv_i64 r2)
     tcg_gen_xor_i64(t1, result, r1);
     tcg_gen_xor_i64(t0, r1, r2);
     tcg_gen_andc_i64(t1, t1, t0);
-    tcg_gen_trunc_shr_i64_i32(cpu_PSW_V, t1, 32);
+    tcg_gen_extrh_i64_i32(cpu_PSW_V, t1);
     /* calc SV bit */
     tcg_gen_or_tl(cpu_PSW_SV, cpu_PSW_SV, cpu_PSW_V);
     /* calc AV/SAV bits */
-    tcg_gen_trunc_shr_i64_i32(temp, result, 32);
+    tcg_gen_extrh_i64_i32(temp, result);
     tcg_gen_add_tl(cpu_PSW_AV, temp, temp);
     tcg_gen_xor_tl(cpu_PSW_AV, temp, cpu_PSW_AV);
     /* calc SAV */
@@ -540,14 +540,14 @@ static inline void gen_madd32_d(TCGv ret, TCGv r1, TCGv r2, TCGv r3)
     tcg_gen_mul_i64(t1, t1, t3);
     tcg_gen_add_i64(t1, t2, t1);
 
-    tcg_gen_trunc_i64_i32(ret, t1);
+    tcg_gen_extrl_i64_i32(ret, t1);
     /* calc V
        t1 > 0x7fffffff */
     tcg_gen_setcondi_i64(TCG_COND_GT, t3, t1, 0x7fffffffLL);
     /* t1 < -0x80000000 */
     tcg_gen_setcondi_i64(TCG_COND_LT, t2, t1, -0x80000000LL);
     tcg_gen_or_i64(t2, t2, t3);
-    tcg_gen_trunc_i64_i32(cpu_PSW_V, t2);
+    tcg_gen_extrl_i64_i32(cpu_PSW_V, t2);
     tcg_gen_shli_tl(cpu_PSW_V, cpu_PSW_V, 31);
     /* Calc SV bit */
     tcg_gen_or_tl(cpu_PSW_SV, cpu_PSW_SV, cpu_PSW_V);
@@ -621,7 +621,7 @@ gen_maddu64_d(TCGv ret_low, TCGv ret_high, TCGv r1, TCGv r2_low, TCGv r2_high,
     /* only the add overflows, if t2 < t1
        calc V bit */
     tcg_gen_setcond_i64(TCG_COND_LTU, t2, t2, t1);
-    tcg_gen_trunc_i64_i32(cpu_PSW_V, t2);
+    tcg_gen_extrl_i64_i32(cpu_PSW_V, t2);
     tcg_gen_shli_tl(cpu_PSW_V, cpu_PSW_V, 31);
     /* Calc SV bit */
     tcg_gen_or_tl(cpu_PSW_SV, cpu_PSW_SV, cpu_PSW_V);
@@ -1110,12 +1110,12 @@ gen_madd32_q(TCGv ret, TCGv arg1, TCGv arg2, TCGv arg3, uint32_t n,
     tcg_gen_sari_i64(t2, t2, up_shift);
 
     tcg_gen_add_i64(t3, t1, t2);
-    tcg_gen_trunc_i64_i32(temp3, t3);
+    tcg_gen_extrl_i64_i32(temp3, t3);
     /* calc v bit */
     tcg_gen_setcondi_i64(TCG_COND_GT, t1, t3, 0x7fffffffLL);
     tcg_gen_setcondi_i64(TCG_COND_LT, t2, t3, -0x80000000LL);
     tcg_gen_or_i64(t1, t1, t2);
-    tcg_gen_trunc_i64_i32(cpu_PSW_V, t1);
+    tcg_gen_extrl_i64_i32(cpu_PSW_V, t1);
     tcg_gen_shli_tl(cpu_PSW_V, cpu_PSW_V, 31);
     /* We produce an overflow on the host if the mul before was
        (0x80000000 * 0x80000000) << 1). If this is the
@@ -1273,7 +1273,7 @@ gen_madd64_q(TCGv rl, TCGv rh, TCGv arg1_low, TCGv arg1_high, TCGv arg2,
     tcg_gen_xor_i64(t3, t4, t1);
     tcg_gen_xor_i64(t2, t1, t2);
     tcg_gen_andc_i64(t3, t3, t2);
-    tcg_gen_trunc_shr_i64_i32(cpu_PSW_V, t3, 32);
+    tcg_gen_extrh_i64_i32(cpu_PSW_V, t3);
     /* We produce an overflow on the host if the mul before was
        (0x80000000 * 0x80000000) << 1). If this is the
        case, we negate the ovf. */
@@ -1356,14 +1356,14 @@ static inline void gen_msub32_d(TCGv ret, TCGv r1, TCGv r2, TCGv r3)
     tcg_gen_mul_i64(t1, t1, t3);
     tcg_gen_sub_i64(t1, t2, t1);
 
-    tcg_gen_trunc_i64_i32(ret, t1);
+    tcg_gen_extrl_i64_i32(ret, t1);
     /* calc V
        t2 > 0x7fffffff */
     tcg_gen_setcondi_i64(TCG_COND_GT, t3, t1, 0x7fffffffLL);
     /* result < -0x80000000 */
     tcg_gen_setcondi_i64(TCG_COND_LT, t2, t1, -0x80000000LL);
     tcg_gen_or_i64(t2, t2, t3);
-    tcg_gen_trunc_i64_i32(cpu_PSW_V, t2);
+    tcg_gen_extrl_i64_i32(cpu_PSW_V, t2);
     tcg_gen_shli_tl(cpu_PSW_V, cpu_PSW_V, 31);
 
     /* Calc SV bit */
@@ -1445,7 +1445,7 @@ gen_msubu64_d(TCGv ret_low, TCGv ret_high, TCGv r1, TCGv r2_low, TCGv r2_high,
     tcg_gen_extr_i64_i32(ret_low, ret_high, t3);
     /* calc V bit, only the sub can overflow, if t1 > t2 */
     tcg_gen_setcond_i64(TCG_COND_GTU, t1, t1, t2);
-    tcg_gen_trunc_i64_i32(cpu_PSW_V, t1);
+    tcg_gen_extrl_i64_i32(cpu_PSW_V, t1);
     tcg_gen_shli_tl(cpu_PSW_V, cpu_PSW_V, 31);
     /* Calc SV bit */
     tcg_gen_or_tl(cpu_PSW_SV, cpu_PSW_SV, cpu_PSW_V);
@@ -1630,11 +1630,11 @@ gen_sub64_d(TCGv_i64 ret, TCGv_i64 r1, TCGv_i64 r2)
     tcg_gen_xor_i64(t1, result, r1);
     tcg_gen_xor_i64(t0, r1, r2);
     tcg_gen_and_i64(t1, t1, t0);
-    tcg_gen_trunc_shr_i64_i32(cpu_PSW_V, t1, 32);
+    tcg_gen_extrh_i64_i32(cpu_PSW_V, t1);
     /* calc SV bit */
     tcg_gen_or_tl(cpu_PSW_SV, cpu_PSW_SV, cpu_PSW_V);
     /* calc AV/SAV bits */
-    tcg_gen_trunc_shr_i64_i32(temp, result, 32);
+    tcg_gen_extrh_i64_i32(temp, result);
     tcg_gen_add_tl(cpu_PSW_AV, temp, temp);
     tcg_gen_xor_tl(cpu_PSW_AV, temp, cpu_PSW_AV);
     /* calc SAV */
@@ -1973,12 +1973,12 @@ gen_msub32_q(TCGv ret, TCGv arg1, TCGv arg2, TCGv arg3, uint32_t n,
     tcg_gen_add_i64(t2, t2, t4);
 
     tcg_gen_sub_i64(t3, t1, t2);
-    tcg_gen_trunc_i64_i32(temp3, t3);
+    tcg_gen_extrl_i64_i32(temp3, t3);
     /* calc v bit */
     tcg_gen_setcondi_i64(TCG_COND_GT, t1, t3, 0x7fffffffLL);
     tcg_gen_setcondi_i64(TCG_COND_LT, t2, t3, -0x80000000LL);
     tcg_gen_or_i64(t1, t1, t2);
-    tcg_gen_trunc_i64_i32(cpu_PSW_V, t1);
+    tcg_gen_extrl_i64_i32(cpu_PSW_V, t1);
     tcg_gen_shli_tl(cpu_PSW_V, cpu_PSW_V, 31);
     /* Calc SV bit */
     tcg_gen_or_tl(cpu_PSW_SV, cpu_PSW_SV, cpu_PSW_V);
@@ -2126,7 +2126,7 @@ gen_msub64_q(TCGv rl, TCGv rh, TCGv arg1_low, TCGv arg1_high, TCGv arg2,
     tcg_gen_xor_i64(t3, t4, t1);
     tcg_gen_xor_i64(t2, t1, t2);
     tcg_gen_and_i64(t3, t3, t2);
-    tcg_gen_trunc_shr_i64_i32(cpu_PSW_V, t3, 32);
+    tcg_gen_extrh_i64_i32(cpu_PSW_V, t3);
     /* We produce an overflow on the host if the mul before was
        (0x80000000 * 0x80000000) << 1). If this is the
        case, we negate the ovf. */
diff --git a/target-xtensa/translate.c b/target-xtensa/translate.c
index f2118c24c0..a29b3e61bc 100644
--- a/target-xtensa/translate.c
+++ b/target-xtensa/translate.c
@@ -1544,7 +1544,7 @@ static void disas_xtensa_insn(CPUXtensaState *env, DisasContext *dc)
                     TCGv_i64 tmp = tcg_temp_new_i64(); \
                     tcg_gen_extu_i32_i64(tmp, reg); \
                     tcg_gen_##cmd##_i64(v, v, tmp); \
-                    tcg_gen_trunc_i64_i32(cpu_R[RRR_R], v); \
+                    tcg_gen_extrl_i64_i32(cpu_R[RRR_R], v); \
                     tcg_temp_free_i64(v); \
                     tcg_temp_free_i64(tmp); \
                 } while (0)
diff --git a/tcg/README b/tcg/README
index a550ff176d..34c0775cff 100644
--- a/tcg/README
+++ b/tcg/README
@@ -314,11 +314,17 @@ This operation would be equivalent to
 
   dest = (t1 & ~0x0f00) | ((t2 << 8) & 0x0f00)
 
-* trunc_shr_i32 t0, t1, pos
+* extrl_i64_i32 t0, t1
 
-For 64-bit hosts only, right shift the 64-bit input T1 by POS and
-truncate to 32-bit output T0.  Depending on the host, this may be
-a simple mov/shift, or may require additional canonicalization.
+For 64-bit hosts only, extract the low 32 bits of input T1 and place them
+into 32-bit output T0.  Depending on the host, this may be a simple move,
+or may require additional canonicalization.
+
+* extrh_i64_i32 t0, t1
+
+For 64-bit hosts only, extract the high 32 bits of input T1 and place them
+into 32-bit output T0.  Depending on the host, this may be a simple shift,
+or may require additional canonicalization.
 
 ********* Conditional moves
 
@@ -466,13 +472,25 @@ On a 32 bit target, all 64 bit operations are converted to 32 bits. A
 few specific operations must be implemented to allow it (see add2_i32,
 sub2_i32, brcond2_i32).
 
+On a 64 bit target, the values are transferred between 32-bit and 64-bit
+registers using the following ops:
+- trunc_shr_i64_i32
+- ext_i32_i64
+- extu_i32_i64
+
+They ensure that the values are correctly truncated or extended when
+moved from a 32-bit to a 64-bit register or vice-versa. Note that
+trunc_shr_i64_i32 is an optional op. It is not necessary to implement
+it if all the following conditions are met:
+- 64-bit registers can hold 32-bit values
+- 32-bit values in a 64-bit register do not need to stay zero or
+  sign extended
+- all 32-bit TCG ops ignore the high part of 64-bit registers
+
 Floating point operations are not supported in this version. A
 previous incarnation of the code generator had full support of them,
 but it is better to concentrate on integer operations first.
 
-On a 64 bit target, no assumption is made in TCG about the storage of
-the 32 bit values in 64 bit registers.
-
 4.2) Constraints
 
 GCC like constraints are used to define the constraints of every
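A minimal host-side sketch (editorial illustration, not part of this patch) of what the extrl_i64_i32 and extrh_i64_i32 ops documented above compute: in plain C terms, the low and high 32-bit halves of a 64-bit value.

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t t1 = 0x1122334455667788ULL;
        uint32_t lo = (uint32_t)t1;            /* extrl_i64_i32: low half  */
        uint32_t hi = (uint32_t)(t1 >> 32);    /* extrh_i64_i32: high half */
        printf("lo=0x%" PRIx32 " hi=0x%" PRIx32 "\n", lo, hi);
        return 0;
    }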
diff --git a/tcg/aarch64/tcg-target.c b/tcg/aarch64/tcg-target.c
index b7ec4f5ace..01ae610cd7 100644
--- a/tcg/aarch64/tcg-target.c
+++ b/tcg/aarch64/tcg-target.c
@@ -30,7 +30,7 @@ static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
 static const int tcg_target_reg_alloc_order[] = {
     TCG_REG_X20, TCG_REG_X21, TCG_REG_X22, TCG_REG_X23,
     TCG_REG_X24, TCG_REG_X25, TCG_REG_X26, TCG_REG_X27,
-    TCG_REG_X28, /* we will reserve this for GUEST_BASE if configured */
+    TCG_REG_X28, /* we will reserve this for guest_base if configured */
 
     TCG_REG_X8, TCG_REG_X9, TCG_REG_X10, TCG_REG_X11,
     TCG_REG_X12, TCG_REG_X13, TCG_REG_X14, TCG_REG_X15,
@@ -56,11 +56,7 @@ static const int tcg_target_call_oarg_regs[1] = {
 #define TCG_REG_TMP TCG_REG_X30
 
 #ifndef CONFIG_SOFTMMU
-# ifdef CONFIG_USE_GUEST_BASE
-#  define TCG_REG_GUEST_BASE TCG_REG_X28
-# else
-#  define TCG_REG_GUEST_BASE TCG_REG_XZR
-# endif
+#define TCG_REG_GUEST_BASE TCG_REG_X28
 #endif
 
 static inline void reloc_pc26(tcg_insn_unit *code_ptr, tcg_insn_unit *target)
@@ -1051,14 +1047,29 @@ static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOpIdx oi,
    slow path for the failure case, which will be patched later when finalizing
    the slow path. Generated code returns the host addend in X1,
    clobbers X0,X2,X3,TMP. */
-static void tcg_out_tlb_read(TCGContext *s, TCGReg addr_reg, TCGMemOp s_bits,
+static void tcg_out_tlb_read(TCGContext *s, TCGReg addr_reg, TCGMemOp opc,
                              tcg_insn_unit **label_ptr, int mem_index,
                              bool is_read)
 {
-    TCGReg base = TCG_AREG0;
     int tlb_offset = is_read ?
         offsetof(CPUArchState, tlb_table[mem_index][0].addr_read)
         : offsetof(CPUArchState, tlb_table[mem_index][0].addr_write);
+    int s_mask = (1 << (opc & MO_SIZE)) - 1;
+    TCGReg base = TCG_AREG0, x3;
+    uint64_t tlb_mask;
+
+    /* For aligned accesses, we check the first byte and include the alignment
+       bits within the address.  For unaligned access, we check that we don't
+       cross pages using the address of the last byte of the access.  */
+    if ((opc & MO_AMASK) == MO_ALIGN || s_mask == 0) {
+        tlb_mask = TARGET_PAGE_MASK | s_mask;
+        x3 = addr_reg;
+    } else {
+        tcg_out_insn(s, 3401, ADDI, TARGET_LONG_BITS == 64,
+                     TCG_REG_X3, addr_reg, s_mask);
+        tlb_mask = TARGET_PAGE_MASK;
+        x3 = TCG_REG_X3;
+    }
 
     /* Extract the TLB index from the address into X0.
        X0<CPU_TLB_BITS:0> =
@@ -1066,11 +1077,9 @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg addr_reg, TCGMemOp s_bits,
     tcg_out_ubfm(s, TARGET_LONG_BITS == 64, TCG_REG_X0, addr_reg,
                  TARGET_PAGE_BITS, TARGET_PAGE_BITS + CPU_TLB_BITS);
 
-    /* Store the page mask part of the address and the low s_bits into X3.
-       Later this allows checking for equality and alignment at the same time.
-       X3 = addr_reg & (PAGE_MASK | ((1 << s_bits) - 1)) */
-    tcg_out_logicali(s, I3404_ANDI, TARGET_LONG_BITS == 64, TCG_REG_X3,
-                     addr_reg, TARGET_PAGE_MASK | ((1 << s_bits) - 1));
+    /* Store the page mask part of the address into X3.  */
+    tcg_out_logicali(s, I3404_ANDI, TARGET_LONG_BITS == 64,
+                     TCG_REG_X3, x3, tlb_mask);
 
     /* Add any "high bits" from the tlb offset to the env address into X2,
        to take advantage of the LSL12 form of the ADDI instruction.
@@ -1207,17 +1216,16 @@ static void tcg_out_qemu_ld(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
     const TCGType otype = TARGET_LONG_BITS == 64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
 #ifdef CONFIG_SOFTMMU
     unsigned mem_index = get_mmuidx(oi);
-    TCGMemOp s_bits = memop & MO_SIZE;
     tcg_insn_unit *label_ptr;
 
-    tcg_out_tlb_read(s, addr_reg, s_bits, &label_ptr, mem_index, 1);
+    tcg_out_tlb_read(s, addr_reg, memop, &label_ptr, mem_index, 1);
     tcg_out_qemu_ld_direct(s, memop, ext, data_reg,
                            TCG_REG_X1, otype, addr_reg);
     add_qemu_ldst_label(s, true, oi, ext, data_reg, addr_reg,
                         s->code_ptr, label_ptr);
 #else /* !CONFIG_SOFTMMU */
     tcg_out_qemu_ld_direct(s, memop, ext, data_reg,
-                           GUEST_BASE ? TCG_REG_GUEST_BASE : TCG_REG_XZR,
+                           guest_base ? TCG_REG_GUEST_BASE : TCG_REG_XZR,
                            otype, addr_reg);
 #endif /* CONFIG_SOFTMMU */
 }
@@ -1229,17 +1237,16 @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
     const TCGType otype = TARGET_LONG_BITS == 64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
 #ifdef CONFIG_SOFTMMU
     unsigned mem_index = get_mmuidx(oi);
-    TCGMemOp s_bits = memop & MO_SIZE;
     tcg_insn_unit *label_ptr;
 
-    tcg_out_tlb_read(s, addr_reg, s_bits, &label_ptr, mem_index, 0);
+    tcg_out_tlb_read(s, addr_reg, memop, &label_ptr, mem_index, 0);
     tcg_out_qemu_st_direct(s, memop, data_reg,
                            TCG_REG_X1, otype, addr_reg);
-    add_qemu_ldst_label(s, false, oi, s_bits == MO_64, data_reg, addr_reg,
-                        s->code_ptr, label_ptr);
+    add_qemu_ldst_label(s, false, oi, (memop & MO_SIZE)== MO_64,
+                        data_reg, addr_reg, s->code_ptr, label_ptr);
 #else /* !CONFIG_SOFTMMU */
     tcg_out_qemu_st_direct(s, memop, data_reg,
-                           GUEST_BASE ? TCG_REG_GUEST_BASE : TCG_REG_XZR,
+                           guest_base ? TCG_REG_GUEST_BASE : TCG_REG_XZR,
                            otype, addr_reg);
 #endif /* CONFIG_SOFTMMU */
 }
@@ -1556,6 +1563,7 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
     case INDEX_op_ext16s_i32:
         tcg_out_sxt(s, ext, MO_16, a0, a1);
         break;
+    case INDEX_op_ext_i32_i64:
     case INDEX_op_ext32s_i64:
         tcg_out_sxt(s, TCG_TYPE_I64, MO_32, a0, a1);
         break;
@@ -1567,6 +1575,7 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
     case INDEX_op_ext16u_i32:
         tcg_out_uxt(s, MO_16, a0, a1);
         break;
+    case INDEX_op_extu_i32_i64:
     case INDEX_op_ext32u_i64:
         tcg_out_movr(s, TCG_TYPE_I32, a0, a1);
         break;
@@ -1712,6 +1721,8 @@ static const TCGTargetOpDef aarch64_op_defs[] = {
     { INDEX_op_ext8u_i64, { "r", "r" } },
     { INDEX_op_ext16u_i64, { "r", "r" } },
     { INDEX_op_ext32u_i64, { "r", "r" } },
+    { INDEX_op_ext_i32_i64, { "r", "r" } },
+    { INDEX_op_extu_i32_i64, { "r", "r" } },
 
     { INDEX_op_deposit_i32, { "r", "0", "rZ" } },
     { INDEX_op_deposit_i64, { "r", "0", "rZ" } },
@@ -1794,9 +1805,9 @@ static void tcg_target_qemu_prologue(TCGContext *s)
     tcg_set_frame(s, TCG_REG_SP, TCG_STATIC_CALL_ARGS_SIZE,
                   CPU_TEMP_BUF_NLONGS * sizeof(long));
 
-#if defined(CONFIG_USE_GUEST_BASE)
-    if (GUEST_BASE) {
-        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_GUEST_BASE, GUEST_BASE);
+#if !defined(CONFIG_SOFTMMU)
+    if (guest_base) {
+        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_GUEST_BASE, guest_base);
         tcg_regset_set_reg(s->reserved_regs, TCG_REG_GUEST_BASE);
     }
 #endif
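
The tcg_out_tlb_read() rework above chooses the TLB comparator
differently for aligned and unaligned accesses; the same scheme
reappears in the i386, ppc and s390 hunks further down. Below is a
minimal C sketch of the value that ends up being compared against the
TLB entry, assuming a hypothetical 4 KiB page size; it is an
illustration, not code from the patch:

    #include <stdbool.h>
    #include <stdint.h>

    #define PAGE_BITS 12                        /* assumed page size: 4 KiB */
    #define PAGE_MASK (~(((uint64_t)1 << PAGE_BITS) - 1))

    /* Value compared against tlb_table[mmu_idx][i].addr_read/addr_write. */
    static uint64_t tlb_compare_value(uint64_t addr, int size, bool align_required)
    {
        uint64_t s_mask = size - 1;

        if (align_required || s_mask == 0) {
            /* Keep the alignment bits in the comparison: a misaligned
               address can never match the page-aligned TLB entry.  */
            return addr & (PAGE_MASK | s_mask);
        }
        /* Unaligned access allowed: compare the page of the last byte,
           so an access that crosses a page boundary fails the compare
           and takes the slow path.  */
        return (addr + s_mask) & PAGE_MASK;
    }

On aarch64 the unaligned case is the ADDI into X3 followed by the ANDI
with TARGET_PAGE_MASK; on i386 it is the LEA with an s_mask displacement.
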
diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
index 8aec04d2bf..19a04a6e75 100644
--- a/tcg/aarch64/tcg-target.h
+++ b/tcg/aarch64/tcg-target.h
@@ -70,7 +70,8 @@ typedef enum {
 #define TCG_TARGET_HAS_muls2_i32        0
 #define TCG_TARGET_HAS_muluh_i32        0
 #define TCG_TARGET_HAS_mulsh_i32        0
-#define TCG_TARGET_HAS_trunc_shr_i32    0
+#define TCG_TARGET_HAS_extrl_i64_i32    0
+#define TCG_TARGET_HAS_extrh_i64_i32    0
 
 #define TCG_TARGET_HAS_div_i64          1
 #define TCG_TARGET_HAS_rem_i64          1
diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c
index ae2ec7a922..3edf6a6f97 100644
--- a/tcg/arm/tcg-target.c
+++ b/tcg/arm/tcg-target.c
@@ -1493,8 +1493,8 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
     add_qemu_ldst_label(s, true, oi, datalo, datahi, addrlo, addrhi,
                         s->code_ptr, label_ptr);
 #else /* !CONFIG_SOFTMMU */
-    if (GUEST_BASE) {
-        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP, GUEST_BASE);
+    if (guest_base) {
+        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP, guest_base);
         tcg_out_qemu_ld_index(s, opc, datalo, datahi, addrlo, TCG_REG_TMP);
     } else {
         tcg_out_qemu_ld_direct(s, opc, datalo, datahi, addrlo);
@@ -1623,8 +1623,8 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
     add_qemu_ldst_label(s, false, oi, datalo, datahi, addrlo, addrhi,
                         s->code_ptr, label_ptr);
 #else /* !CONFIG_SOFTMMU */
-    if (GUEST_BASE) {
-        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP, GUEST_BASE);
+    if (guest_base) {
+        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP, guest_base);
         tcg_out_qemu_st_index(s, COND_AL, opc, datalo,
                               datahi, addrlo, TCG_REG_TMP);
     } else {
diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 887f22f675..d2adbc4d17 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -1172,7 +1172,7 @@ static void * const qemu_st_helpers[16] = {
    First argument register is clobbered.  */
 
 static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
-                                    int mem_index, TCGMemOp s_bits,
+                                    int mem_index, TCGMemOp opc,
                                     tcg_insn_unit **label_ptr, int which)
 {
     const TCGReg r0 = TCG_REG_L0;
@@ -1180,6 +1180,8 @@ static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
     TCGType ttype = TCG_TYPE_I32;
     TCGType htype = TCG_TYPE_I32;
     int trexw = 0, hrexw = 0;
+    int s_mask = (1 << (opc & MO_SIZE)) - 1;
+    bool aligned = (opc & MO_AMASK) == MO_ALIGN || s_mask == 0;
 
     if (TCG_TARGET_REG_BITS == 64) {
         if (TARGET_LONG_BITS == 64) {
@@ -1193,13 +1195,19 @@ static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
     }
 
     tcg_out_mov(s, htype, r0, addrlo);
-    tcg_out_mov(s, ttype, r1, addrlo);
+    if (aligned) {
+        tcg_out_mov(s, ttype, r1, addrlo);
+    } else {
+        /* For unaligned access check that we don't cross pages using
+           the page address of the last byte.  */
+        tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask);
+    }
 
     tcg_out_shifti(s, SHIFT_SHR + hrexw, r0,
                    TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
 
     tgen_arithi(s, ARITH_AND + trexw, r1,
-                TARGET_PAGE_MASK | ((1 << s_bits) - 1), 0);
+                TARGET_PAGE_MASK | (aligned ? s_mask : 0), 0);
     tgen_arithi(s, ARITH_AND + hrexw, r0,
                 (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS, 0);
 
@@ -1424,7 +1432,7 @@ int arch_prctl(int code, unsigned long addr);
 static int guest_base_flags;
 static inline void setup_guest_base_seg(void)
 {
-    if (arch_prctl(ARCH_SET_GS, GUEST_BASE) == 0) {
+    if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
         guest_base_flags = P_GS;
     }
 }
@@ -1545,7 +1553,6 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
     TCGMemOp opc;
 #if defined(CONFIG_SOFTMMU)
     int mem_index;
-    TCGMemOp s_bits;
     tcg_insn_unit *label_ptr[2];
 #endif
 
@@ -1558,9 +1565,8 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
 
 #if defined(CONFIG_SOFTMMU)
     mem_index = get_mmuidx(oi);
-    s_bits = opc & MO_SIZE;
 
-    tcg_out_tlb_load(s, addrlo, addrhi, mem_index, s_bits,
+    tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
                      label_ptr, offsetof(CPUTLBEntry, addr_read));
 
     /* TLB Hit.  */
@@ -1571,7 +1577,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
                         s->code_ptr, label_ptr);
 #else
     {
-        int32_t offset = GUEST_BASE;
+        int32_t offset = guest_base;
         TCGReg base = addrlo;
         int index = -1;
         int seg = 0;
@@ -1580,7 +1586,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
            We can do this with the ADDR32 prefix if we're not using
            a guest base, or when using segmentation.  Otherwise we
            need to zero-extend manually.  */
-        if (GUEST_BASE == 0 || guest_base_flags) {
+        if (guest_base == 0 || guest_base_flags) {
             seg = guest_base_flags;
             offset = 0;
             if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
@@ -1591,8 +1597,8 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
                 tcg_out_ext32u(s, TCG_REG_L0, base);
                 base = TCG_REG_L0;
             }
-            if (offset != GUEST_BASE) {
-                tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L1, GUEST_BASE);
+            if (offset != guest_base) {
+                tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L1, guest_base);
                 index = TCG_REG_L1;
                 offset = 0;
             }
@@ -1687,7 +1693,6 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
     TCGMemOp opc;
 #if defined(CONFIG_SOFTMMU)
     int mem_index;
-    TCGMemOp s_bits;
     tcg_insn_unit *label_ptr[2];
 #endif
 
@@ -1700,9 +1705,8 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
 
 #if defined(CONFIG_SOFTMMU)
     mem_index = get_mmuidx(oi);
-    s_bits = opc & MO_SIZE;
 
-    tcg_out_tlb_load(s, addrlo, addrhi, mem_index, s_bits,
+    tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
                      label_ptr, offsetof(CPUTLBEntry, addr_write));
 
     /* TLB Hit.  */
@@ -1713,12 +1717,12 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
                         s->code_ptr, label_ptr);
 #else
     {
-        int32_t offset = GUEST_BASE;
+        int32_t offset = guest_base;
         TCGReg base = addrlo;
         int seg = 0;
 
         /* See comment in tcg_out_qemu_ld re zero-extension of addrlo.  */
-        if (GUEST_BASE == 0 || guest_base_flags) {
+        if (guest_base == 0 || guest_base_flags) {
             seg = guest_base_flags;
             offset = 0;
             if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
@@ -1727,12 +1731,12 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
         } else if (TCG_TARGET_REG_BITS == 64) {
             /* ??? Note that we can't use the same SIB addressing scheme
                as for loads, since we require L0 free for bswap.  */
-            if (offset != GUEST_BASE) {
+            if (offset != guest_base) {
                 if (TARGET_LONG_BITS == 32) {
                     tcg_out_ext32u(s, TCG_REG_L0, base);
                     base = TCG_REG_L0;
                 }
-                tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L1, GUEST_BASE);
+                tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L1, guest_base);
                 tgen_arithr(s, ARITH_ADD + P_REXW, TCG_REG_L1, base);
                 base = TCG_REG_L1;
                 offset = 0;
@@ -2064,9 +2068,11 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
     case INDEX_op_bswap64_i64:
         tcg_out_bswap64(s, args[0]);
         break;
+    case INDEX_op_extu_i32_i64:
     case INDEX_op_ext32u_i64:
         tcg_out_ext32u(s, args[0], args[1]);
         break;
+    case INDEX_op_ext_i32_i64:
     case INDEX_op_ext32s_i64:
         tcg_out_ext32s(s, args[0], args[1]);
         break;
@@ -2201,6 +2207,9 @@ static const TCGTargetOpDef x86_op_defs[] = {
     { INDEX_op_ext16u_i64, { "r", "r" } },
     { INDEX_op_ext32u_i64, { "r", "r" } },
 
+    { INDEX_op_ext_i32_i64, { "r", "r" } },
+    { INDEX_op_extu_i32_i64, { "r", "r" } },
+
     { INDEX_op_deposit_i64, { "Q", "0", "Q" } },
     { INDEX_op_movcond_i64, { "r", "r", "re", "r", "0" } },
 
@@ -2306,8 +2315,8 @@ static void tcg_target_qemu_prologue(TCGContext *s)
     tcg_out_opc(s, OPC_RET, 0, 0, 0);
 
 #if !defined(CONFIG_SOFTMMU)
-    /* Try to set up a segment register to point to GUEST_BASE.  */
-    if (GUEST_BASE) {
+    /* Try to set up a segment register to point to guest_base.  */
+    if (guest_base) {
         setup_guest_base_seg();
     }
 #endif
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index 25b513354c..92be341713 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -102,7 +102,8 @@ extern bool have_bmi1;
 #define TCG_TARGET_HAS_mulsh_i32        0
 
 #if TCG_TARGET_REG_BITS == 64
-#define TCG_TARGET_HAS_trunc_shr_i32    0
+#define TCG_TARGET_HAS_extrl_i64_i32    0
+#define TCG_TARGET_HAS_extrh_i64_i32    0
 #define TCG_TARGET_HAS_div2_i64         1
 #define TCG_TARGET_HAS_rot_i64          1
 #define TCG_TARGET_HAS_ext8s_i64        1
diff --git a/tcg/ia64/tcg-target.c b/tcg/ia64/tcg-target.c
index 81cb9f79f3..3c07017868 100644
--- a/tcg/ia64/tcg-target.c
+++ b/tcg/ia64/tcg-target.c
@@ -40,13 +40,8 @@ static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
 };
 #endif
 
-#ifdef CONFIG_USE_GUEST_BASE
+#ifndef CONFIG_SOFTMMU
 #define TCG_GUEST_BASE_REG TCG_REG_R55
-#else
-#define TCG_GUEST_BASE_REG TCG_REG_R0
-#endif
-#ifndef GUEST_BASE
-#define GUEST_BASE 0
 #endif
 
 /* Branch registers */
@@ -1765,7 +1760,7 @@ static inline void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args)
     bswap = opc & MO_BSWAP;
 
 #if TARGET_LONG_BITS == 32
-    if (GUEST_BASE != 0) {
+    if (guest_base != 0) {
         tcg_out_bundle(s, mII,
                        INSN_NOP_M,
                        tcg_opc_i29(TCG_REG_P0, OPC_ZXT4_I29,
@@ -1829,7 +1824,7 @@ static inline void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args)
         }
     }
 #else
-    if (GUEST_BASE != 0) {
+    if (guest_base != 0) {
         tcg_out_bundle(s, MmI,
                        tcg_opc_a1 (TCG_REG_P0, OPC_ADD_A1, TCG_REG_R2,
                                    TCG_GUEST_BASE_REG, addr_reg),
@@ -1889,7 +1884,7 @@ static inline void tcg_out_qemu_st(TCGContext *s, const TCGArg *args)
     bswap = opc & MO_BSWAP;
 
 #if TARGET_LONG_BITS == 32
-    if (GUEST_BASE != 0) {
+    if (guest_base != 0) {
         tcg_out_bundle(s, mII,
                        INSN_NOP_M,
                        tcg_opc_i29(TCG_REG_P0, OPC_ZXT4_I29,
@@ -1935,7 +1930,7 @@ static inline void tcg_out_qemu_st(TCGContext *s, const TCGArg *args)
                    INSN_NOP_M,
                    INSN_NOP_I);
 #else
-    if (GUEST_BASE != 0) {
+    if (guest_base != 0) {
         add_guest_base = tcg_opc_a1 (TCG_REG_P0, OPC_ADD_A1, TCG_REG_R2,
                                      TCG_GUEST_BASE_REG, addr_reg);
         addr_reg = TCG_REG_R2;
@@ -1944,7 +1939,7 @@ static inline void tcg_out_qemu_st(TCGContext *s, const TCGArg *args)
     }
 
     if (!bswap) {
-        tcg_out_bundle(s, (GUEST_BASE ? MmI : mmI),
+        tcg_out_bundle(s, (guest_base ? MmI : mmI),
                        add_guest_base,
                        tcg_opc_m4 (TCG_REG_P0, opc_st_m4[s_bits],
                                    data_reg, addr_reg),
@@ -2148,9 +2143,11 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
     case INDEX_op_ext16u_i64:
         tcg_out_ext(s, OPC_ZXT2_I29, args[0], args[1]);
         break;
+    case INDEX_op_ext_i32_i64:
     case INDEX_op_ext32s_i64:
         tcg_out_ext(s, OPC_SXT4_I29, args[0], args[1]);
         break;
+    case INDEX_op_extu_i32_i64:
     case INDEX_op_ext32u_i64:
         tcg_out_ext(s, OPC_ZXT4_I29, args[0], args[1]);
         break;
@@ -2301,6 +2298,8 @@ static const TCGTargetOpDef ia64_op_defs[] = {
     { INDEX_op_ext16u_i64, { "r", "rZ"} },
     { INDEX_op_ext32s_i64, { "r", "rZ"} },
     { INDEX_op_ext32u_i64, { "r", "rZ"} },
+    { INDEX_op_ext_i32_i64, { "r", "rZ" } },
+    { INDEX_op_extu_i32_i64, { "r", "rZ" } },
 
     { INDEX_op_bswap16_i64, { "r", "rZ" } },
     { INDEX_op_bswap32_i64, { "r", "rZ" } },
@@ -2349,14 +2348,14 @@ static void tcg_target_qemu_prologue(TCGContext *s)
                    tcg_opc_i21(TCG_REG_P0, OPC_MOV_I21,
                                TCG_REG_B6, TCG_REG_R33, 0));
 
-    /* ??? If GUEST_BASE < 0x200000, we could load the register via
+    /* ??? If guest_base < 0x200000, we could load the register via
        an ADDL in the M slot of the next bundle.  */
-    if (GUEST_BASE != 0) {
+    if (guest_base != 0) {
         tcg_out_bundle(s, mlx,
                        INSN_NOP_M,
-                       tcg_opc_l2 (GUEST_BASE),
+                       tcg_opc_l2(guest_base),
                        tcg_opc_x2 (TCG_REG_P0, OPC_MOVL_X2,
-                                   TCG_GUEST_BASE_REG, GUEST_BASE));
+                                   TCG_GUEST_BASE_REG, guest_base));
         tcg_regset_set_reg(s->reserved_regs, TCG_GUEST_BASE_REG);
     }
 
diff --git a/tcg/ia64/tcg-target.h b/tcg/ia64/tcg-target.h
index a04ed81262..ae9b79f02f 100644
--- a/tcg/ia64/tcg-target.h
+++ b/tcg/ia64/tcg-target.h
@@ -160,7 +160,8 @@ typedef enum {
 #define TCG_TARGET_HAS_muluh_i64        0
 #define TCG_TARGET_HAS_mulsh_i32        0
 #define TCG_TARGET_HAS_mulsh_i64        0
-#define TCG_TARGET_HAS_trunc_shr_i32    0
+#define TCG_TARGET_HAS_extrl_i64_i32    0
+#define TCG_TARGET_HAS_extrh_i64_i32    0
 
 #define TCG_TARGET_deposit_i32_valid(ofs, len) ((len) <= 16)
 #define TCG_TARGET_deposit_i64_valid(ofs, len) ((len) <= 16)
diff --git a/tcg/mips/tcg-target.c b/tcg/mips/tcg-target.c
index e97980df0b..c0ce520228 100644
--- a/tcg/mips/tcg-target.c
+++ b/tcg/mips/tcg-target.c
@@ -1180,12 +1180,12 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is_64)
     add_qemu_ldst_label(s, 1, oi, data_regl, data_regh, addr_regl, addr_regh,
                         s->code_ptr, label_ptr);
 #else
-    if (GUEST_BASE == 0 && data_regl != addr_regl) {
+    if (guest_base == 0 && data_regl != addr_regl) {
         base = addr_regl;
-    } else if (GUEST_BASE == (int16_t)GUEST_BASE) {
-        tcg_out_opc_imm(s, OPC_ADDIU, base, addr_regl, GUEST_BASE);
+    } else if (guest_base == (int16_t)guest_base) {
+        tcg_out_opc_imm(s, OPC_ADDIU, base, addr_regl, guest_base);
     } else {
-        tcg_out_movi(s, TCG_TYPE_PTR, base, GUEST_BASE);
+        tcg_out_movi(s, TCG_TYPE_PTR, base, guest_base);
         tcg_out_opc_reg(s, OPC_ADDU, base, base, addr_regl);
     }
     tcg_out_qemu_ld_direct(s, data_regl, data_regh, base, opc);
@@ -1314,14 +1314,14 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
     add_qemu_ldst_label(s, 0, oi, data_regl, data_regh, addr_regl, addr_regh,
                         s->code_ptr, label_ptr);
 #else
-    if (GUEST_BASE == 0) {
+    if (guest_base == 0) {
         base = addr_regl;
     } else {
         base = TCG_REG_A0;
-        if (GUEST_BASE == (int16_t)GUEST_BASE) {
-            tcg_out_opc_imm(s, OPC_ADDIU, base, addr_regl, GUEST_BASE);
+        if (guest_base == (int16_t)guest_base) {
+            tcg_out_opc_imm(s, OPC_ADDIU, base, addr_regl, guest_base);
         } else {
-            tcg_out_movi(s, TCG_TYPE_PTR, base, GUEST_BASE);
+            tcg_out_movi(s, TCG_TYPE_PTR, base, guest_base);
             tcg_out_opc_reg(s, OPC_ADDU, base, base, addr_regl);
         }
     }
diff --git a/tcg/optimize.c b/tcg/optimize.c
index 18283cfd7b..10795ec9d5 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -35,14 +35,8 @@
         glue(glue(case INDEX_op_, x), _i32):    \
         glue(glue(case INDEX_op_, x), _i64)
 
-typedef enum {
-    TCG_TEMP_UNDEF = 0,
-    TCG_TEMP_CONST,
-    TCG_TEMP_COPY,
-} tcg_temp_state;
-
 struct tcg_temp_info {
-    tcg_temp_state state;
+    bool is_const;
     uint16_t prev_copy;
     uint16_t next_copy;
     tcg_target_ulong val;
@@ -50,23 +44,47 @@ struct tcg_temp_info {
 };
 
 static struct tcg_temp_info temps[TCG_MAX_TEMPS];
+static TCGTempSet temps_used;
+
+static inline bool temp_is_const(TCGArg arg)
+{
+    return temps[arg].is_const;
+}
+
+static inline bool temp_is_copy(TCGArg arg)
+{
+    return temps[arg].next_copy != arg;
+}
 
-/* Reset TEMP's state to TCG_TEMP_UNDEF.  If TEMP only had one copy, remove
-   the copy flag from the left temp.  */
+/* Reset TEMP's state, possibly removing the temp from the list of copies.  */
 static void reset_temp(TCGArg temp)
 {
-    if (temps[temp].state == TCG_TEMP_COPY) {
-        if (temps[temp].prev_copy == temps[temp].next_copy) {
-            temps[temps[temp].next_copy].state = TCG_TEMP_UNDEF;
-        } else {
-            temps[temps[temp].next_copy].prev_copy = temps[temp].prev_copy;
-            temps[temps[temp].prev_copy].next_copy = temps[temp].next_copy;
-        }
-    }
-    temps[temp].state = TCG_TEMP_UNDEF;
+    temps[temps[temp].next_copy].prev_copy = temps[temp].prev_copy;
+    temps[temps[temp].prev_copy].next_copy = temps[temp].next_copy;
+    temps[temp].next_copy = temp;
+    temps[temp].prev_copy = temp;
+    temps[temp].is_const = false;
     temps[temp].mask = -1;
 }
 
+/* Reset all temporaries, given that there are NB_TEMPS of them.  */
+static void reset_all_temps(int nb_temps)
+{
+    bitmap_zero(temps_used.l, nb_temps);
+}
+
+/* Initialize and activate a temporary.  */
+static void init_temp_info(TCGArg temp)
+{
+    if (!test_bit(temp, temps_used.l)) {
+        temps[temp].next_copy = temp;
+        temps[temp].prev_copy = temp;
+        temps[temp].is_const = false;
+        temps[temp].mask = -1;
+        set_bit(temp, temps_used.l);
+    }
+}
+
 static TCGOp *insert_op_before(TCGContext *s, TCGOp *old_op,
                                 TCGOpcode opc, int nargs)
 {
@@ -98,16 +116,6 @@ static TCGOp *insert_op_before(TCGContext *s, TCGOp *old_op,
     return new_op;
 }
 
-/* Reset all temporaries, given that there are NB_TEMPS of them.  */
-static void reset_all_temps(int nb_temps)
-{
-    int i;
-    for (i = 0; i < nb_temps; i++) {
-        temps[i].state = TCG_TEMP_UNDEF;
-        temps[i].mask = -1;
-    }
-}
-
 static int op_bits(TCGOpcode op)
 {
     const TCGOpDef *def = &tcg_op_defs[op];
@@ -179,8 +187,7 @@ static bool temps_are_copies(TCGArg arg1, TCGArg arg2)
         return true;
     }
 
-    if (temps[arg1].state != TCG_TEMP_COPY
-        || temps[arg2].state != TCG_TEMP_COPY) {
+    if (!temp_is_copy(arg1) || !temp_is_copy(arg2)) {
         return false;
     }
 
@@ -202,7 +209,7 @@ static void tcg_opt_gen_movi(TCGContext *s, TCGOp *op, TCGArg *args,
     op->opc = new_op;
 
     reset_temp(dst);
-    temps[dst].state = TCG_TEMP_CONST;
+    temps[dst].is_const = true;
     temps[dst].val = val;
     mask = val;
     if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_movi_i32) {
@@ -223,11 +230,6 @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg *args,
         return;
     }
 
-    if (temps[src].state == TCG_TEMP_CONST) {
-        tcg_opt_gen_movi(s, op, args, dst, temps[src].val);
-        return;
-    }
-
     TCGOpcode new_op = op_to_mov(op->opc);
     tcg_target_ulong mask;
 
@@ -241,19 +243,13 @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg *args,
     }
     temps[dst].mask = mask;
 
-    assert(temps[src].state != TCG_TEMP_CONST);
-
     if (s->temps[src].type == s->temps[dst].type) {
-        if (temps[src].state != TCG_TEMP_COPY) {
-            temps[src].state = TCG_TEMP_COPY;
-            temps[src].next_copy = src;
-            temps[src].prev_copy = src;
-        }
-        temps[dst].state = TCG_TEMP_COPY;
         temps[dst].next_copy = temps[src].next_copy;
         temps[dst].prev_copy = src;
         temps[temps[dst].next_copy].prev_copy = dst;
         temps[src].next_copy = dst;
+        temps[dst].is_const = temps[src].is_const;
+        temps[dst].val = temps[src].val;
     }
 
     args[0] = dst;
@@ -292,7 +288,6 @@ static TCGArg do_constant_folding_2(TCGOpcode op, TCGArg x, TCGArg y)
     case INDEX_op_shr_i32:
         return (uint32_t)x >> (y & 31);
 
-    case INDEX_op_trunc_shr_i32:
     case INDEX_op_shr_i64:
         return (uint64_t)x >> (y & 63);
 
@@ -347,12 +342,18 @@ static TCGArg do_constant_folding_2(TCGOpcode op, TCGArg x, TCGArg y)
     CASE_OP_32_64(ext16u):
         return (uint16_t)x;
 
+    case INDEX_op_ext_i32_i64:
     case INDEX_op_ext32s_i64:
         return (int32_t)x;
 
+    case INDEX_op_extu_i32_i64:
+    case INDEX_op_extrl_i64_i32:
     case INDEX_op_ext32u_i64:
         return (uint32_t)x;
 
+    case INDEX_op_extrh_i64_i32:
+        return (uint64_t)x >> 32;
+
     case INDEX_op_muluh_i32:
         return ((uint64_t)(uint32_t)x * (uint32_t)y) >> 32;
     case INDEX_op_mulsh_i32:
@@ -395,7 +396,7 @@ static TCGArg do_constant_folding(TCGOpcode op, TCGArg x, TCGArg y)
 {
     TCGArg res = do_constant_folding_2(op, x, y);
     if (op_bits(op) == 32) {
-        res &= 0xffffffff;
+        res = (int32_t)res;
     }
     return res;
 }
@@ -481,7 +482,7 @@ static bool do_constant_folding_cond_eq(TCGCond c)
 static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
                                        TCGArg y, TCGCond c)
 {
-    if (temps[x].state == TCG_TEMP_CONST && temps[y].state == TCG_TEMP_CONST) {
+    if (temp_is_const(x) && temp_is_const(y)) {
         switch (op_bits(op)) {
         case 32:
             return do_constant_folding_cond_32(temps[x].val, temps[y].val, c);
@@ -492,7 +493,7 @@ static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
         }
     } else if (temps_are_copies(x, y)) {
         return do_constant_folding_cond_eq(c);
-    } else if (temps[y].state == TCG_TEMP_CONST && temps[y].val == 0) {
+    } else if (temp_is_const(y) && temps[y].val == 0) {
         switch (c) {
         case TCG_COND_LTU:
             return 0;
@@ -513,12 +514,10 @@ static TCGArg do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
     TCGArg al = p1[0], ah = p1[1];
     TCGArg bl = p2[0], bh = p2[1];
 
-    if (temps[bl].state == TCG_TEMP_CONST
-        && temps[bh].state == TCG_TEMP_CONST) {
+    if (temp_is_const(bl) && temp_is_const(bh)) {
         uint64_t b = ((uint64_t)temps[bh].val << 32) | (uint32_t)temps[bl].val;
 
-        if (temps[al].state == TCG_TEMP_CONST
-            && temps[ah].state == TCG_TEMP_CONST) {
+        if (temp_is_const(al) && temp_is_const(ah)) {
             uint64_t a;
             a = ((uint64_t)temps[ah].val << 32) | (uint32_t)temps[al].val;
             return do_constant_folding_cond_64(a, b, c);
@@ -544,8 +543,8 @@ static bool swap_commutative(TCGArg dest, TCGArg *p1, TCGArg *p2)
 {
     TCGArg a1 = *p1, a2 = *p2;
     int sum = 0;
-    sum += temps[a1].state == TCG_TEMP_CONST;
-    sum -= temps[a2].state == TCG_TEMP_CONST;
+    sum += temp_is_const(a1);
+    sum -= temp_is_const(a2);
 
     /* Prefer the constant in second argument, and then the form
        op a, a, b, which is better handled on non-RISC hosts. */
@@ -560,10 +559,10 @@ static bool swap_commutative(TCGArg dest, TCGArg *p1, TCGArg *p2)
 static bool swap_commutative2(TCGArg *p1, TCGArg *p2)
 {
     int sum = 0;
-    sum += temps[p1[0]].state == TCG_TEMP_CONST;
-    sum += temps[p1[1]].state == TCG_TEMP_CONST;
-    sum -= temps[p2[0]].state == TCG_TEMP_CONST;
-    sum -= temps[p2[1]].state == TCG_TEMP_CONST;
+    sum += temp_is_const(p1[0]);
+    sum += temp_is_const(p1[1]);
+    sum -= temp_is_const(p2[0]);
+    sum -= temp_is_const(p2[1]);
     if (sum > 0) {
         TCGArg t;
         t = p1[0], p1[0] = p2[0], p2[0] = t;
@@ -598,17 +597,29 @@ void tcg_optimize(TCGContext *s)
         const TCGOpDef *def = &tcg_op_defs[opc];
 
         oi_next = op->next;
+
+        /* Count the arguments, and initialize the temps that are
+           going to be used */
         if (opc == INDEX_op_call) {
             nb_oargs = op->callo;
             nb_iargs = op->calli;
+            for (i = 0; i < nb_oargs + nb_iargs; i++) {
+                tmp = args[i];
+                if (tmp != TCG_CALL_DUMMY_ARG) {
+                    init_temp_info(tmp);
+                }
+            }
         } else {
             nb_oargs = def->nb_oargs;
             nb_iargs = def->nb_iargs;
+            for (i = 0; i < nb_oargs + nb_iargs; i++) {
+                init_temp_info(args[i]);
+            }
         }
 
         /* Do copy propagation */
         for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
-            if (temps[args[i]].state == TCG_TEMP_COPY) {
+            if (temp_is_copy(args[i])) {
                 args[i] = find_better_copy(s, args[i]);
             }
         }
@@ -678,8 +689,7 @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(sar):
         CASE_OP_32_64(rotl):
         CASE_OP_32_64(rotr):
-            if (temps[args[1]].state == TCG_TEMP_CONST
-                && temps[args[1]].val == 0) {
+            if (temp_is_const(args[1]) && temps[args[1]].val == 0) {
                 tcg_opt_gen_movi(s, op, args, args[0], 0);
                 continue;
             }
@@ -689,7 +699,7 @@ void tcg_optimize(TCGContext *s)
                 TCGOpcode neg_op;
                 bool have_neg;
 
-                if (temps[args[2]].state == TCG_TEMP_CONST) {
+                if (temp_is_const(args[2])) {
                     /* Proceed with possible constant folding. */
                     break;
                 }
@@ -703,8 +713,7 @@ void tcg_optimize(TCGContext *s)
                 if (!have_neg) {
                     break;
                 }
-                if (temps[args[1]].state == TCG_TEMP_CONST
-                    && temps[args[1]].val == 0) {
+                if (temp_is_const(args[1]) && temps[args[1]].val == 0) {
                     op->opc = neg_op;
                     reset_temp(args[0]);
                     args[1] = args[2];
@@ -714,34 +723,30 @@ void tcg_optimize(TCGContext *s)
             break;
         CASE_OP_32_64(xor):
         CASE_OP_32_64(nand):
-            if (temps[args[1]].state != TCG_TEMP_CONST
-                && temps[args[2]].state == TCG_TEMP_CONST
-                && temps[args[2]].val == -1) {
+            if (!temp_is_const(args[1])
+                && temp_is_const(args[2]) && temps[args[2]].val == -1) {
                 i = 1;
                 goto try_not;
             }
             break;
         CASE_OP_32_64(nor):
-            if (temps[args[1]].state != TCG_TEMP_CONST
-                && temps[args[2]].state == TCG_TEMP_CONST
-                && temps[args[2]].val == 0) {
+            if (!temp_is_const(args[1])
+                && temp_is_const(args[2]) && temps[args[2]].val == 0) {
                 i = 1;
                 goto try_not;
             }
             break;
         CASE_OP_32_64(andc):
-            if (temps[args[2]].state != TCG_TEMP_CONST
-                && temps[args[1]].state == TCG_TEMP_CONST
-                && temps[args[1]].val == -1) {
+            if (!temp_is_const(args[2])
+                && temp_is_const(args[1]) && temps[args[1]].val == -1) {
                 i = 2;
                 goto try_not;
             }
             break;
         CASE_OP_32_64(orc):
         CASE_OP_32_64(eqv):
-            if (temps[args[2]].state != TCG_TEMP_CONST
-                && temps[args[1]].state == TCG_TEMP_CONST
-                && temps[args[1]].val == 0) {
+            if (!temp_is_const(args[2])
+                && temp_is_const(args[1]) && temps[args[1]].val == 0) {
                 i = 2;
                 goto try_not;
             }
@@ -782,9 +787,8 @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(or):
         CASE_OP_32_64(xor):
         CASE_OP_32_64(andc):
-            if (temps[args[1]].state != TCG_TEMP_CONST
-                && temps[args[2]].state == TCG_TEMP_CONST
-                && temps[args[2]].val == 0) {
+            if (!temp_is_const(args[1])
+                && temp_is_const(args[2]) && temps[args[2]].val == 0) {
                 tcg_opt_gen_mov(s, op, args, args[0], args[1]);
                 continue;
             }
@@ -792,9 +796,8 @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(and):
         CASE_OP_32_64(orc):
         CASE_OP_32_64(eqv):
-            if (temps[args[1]].state != TCG_TEMP_CONST
-                && temps[args[2]].state == TCG_TEMP_CONST
-                && temps[args[2]].val == -1) {
+            if (!temp_is_const(args[1])
+                && temp_is_const(args[2]) && temps[args[2]].val == -1) {
                 tcg_opt_gen_mov(s, op, args, args[0], args[1]);
                 continue;
             }
@@ -832,17 +835,26 @@ void tcg_optimize(TCGContext *s)
 
         CASE_OP_32_64(and):
             mask = temps[args[2]].mask;
-            if (temps[args[2]].state == TCG_TEMP_CONST) {
+            if (temp_is_const(args[2])) {
         and_const:
                 affected = temps[args[1]].mask & ~mask;
             }
             mask = temps[args[1]].mask & mask;
             break;
 
+        case INDEX_op_ext_i32_i64:
+            if ((temps[args[1]].mask & 0x80000000) != 0) {
+                break;
+            }
+        case INDEX_op_extu_i32_i64:
+            /* We do not compute affected as it is a size changing op.  */
+            mask = (uint32_t)temps[args[1]].mask;
+            break;
+
         CASE_OP_32_64(andc):
             /* Known-zeros does not imply known-ones.  Therefore unless
                args[2] is constant, we can't infer anything from it.  */
-            if (temps[args[2]].state == TCG_TEMP_CONST) {
+            if (temp_is_const(args[2])) {
                 mask = ~temps[args[2]].mask;
                 goto and_const;
             }
@@ -851,37 +863,40 @@ void tcg_optimize(TCGContext *s)
             break;
 
         case INDEX_op_sar_i32:
-            if (temps[args[2]].state == TCG_TEMP_CONST) {
+            if (temp_is_const(args[2])) {
                 tmp = temps[args[2]].val & 31;
                 mask = (int32_t)temps[args[1]].mask >> tmp;
             }
             break;
         case INDEX_op_sar_i64:
-            if (temps[args[2]].state == TCG_TEMP_CONST) {
+            if (temp_is_const(args[2])) {
                 tmp = temps[args[2]].val & 63;
                 mask = (int64_t)temps[args[1]].mask >> tmp;
             }
             break;
 
         case INDEX_op_shr_i32:
-            if (temps[args[2]].state == TCG_TEMP_CONST) {
+            if (temp_is_const(args[2])) {
                 tmp = temps[args[2]].val & 31;
                 mask = (uint32_t)temps[args[1]].mask >> tmp;
             }
             break;
         case INDEX_op_shr_i64:
-            if (temps[args[2]].state == TCG_TEMP_CONST) {
+            if (temp_is_const(args[2])) {
                 tmp = temps[args[2]].val & 63;
                 mask = (uint64_t)temps[args[1]].mask >> tmp;
             }
             break;
 
-        case INDEX_op_trunc_shr_i32:
-            mask = (uint64_t)temps[args[1]].mask >> args[2];
+        case INDEX_op_extrl_i64_i32:
+            mask = (uint32_t)temps[args[1]].mask;
+            break;
+        case INDEX_op_extrh_i64_i32:
+            mask = (uint64_t)temps[args[1]].mask >> 32;
             break;
 
         CASE_OP_32_64(shl):
-            if (temps[args[2]].state == TCG_TEMP_CONST) {
+            if (temp_is_const(args[2])) {
                 tmp = temps[args[2]].val & (TCG_TARGET_REG_BITS - 1);
                 mask = temps[args[1]].mask << tmp;
             }
@@ -962,8 +977,7 @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(mul):
         CASE_OP_32_64(muluh):
         CASE_OP_32_64(mulsh):
-            if ((temps[args[2]].state == TCG_TEMP_CONST
-                && temps[args[2]].val == 0)) {
+            if ((temp_is_const(args[2]) && temps[args[2]].val == 0)) {
                 tcg_opt_gen_movi(s, op, args, args[0], 0);
                 continue;
             }
@@ -1018,21 +1032,17 @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(ext16u):
         case INDEX_op_ext32s_i64:
         case INDEX_op_ext32u_i64:
-            if (temps[args[1]].state == TCG_TEMP_CONST) {
+        case INDEX_op_ext_i32_i64:
+        case INDEX_op_extu_i32_i64:
+        case INDEX_op_extrl_i64_i32:
+        case INDEX_op_extrh_i64_i32:
+            if (temp_is_const(args[1])) {
                 tmp = do_constant_folding(opc, temps[args[1]].val, 0);
                 tcg_opt_gen_movi(s, op, args, args[0], tmp);
                 break;
             }
             goto do_default;
 
-        case INDEX_op_trunc_shr_i32:
-            if (temps[args[1]].state == TCG_TEMP_CONST) {
-                tmp = do_constant_folding(opc, temps[args[1]].val, args[2]);
-                tcg_opt_gen_movi(s, op, args, args[0], tmp);
-                break;
-            }
-            goto do_default;
-
         CASE_OP_32_64(add):
         CASE_OP_32_64(sub):
         CASE_OP_32_64(mul):
@@ -1055,8 +1065,7 @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(divu):
         CASE_OP_32_64(rem):
         CASE_OP_32_64(remu):
-            if (temps[args[1]].state == TCG_TEMP_CONST
-                && temps[args[2]].state == TCG_TEMP_CONST) {
+            if (temp_is_const(args[1]) && temp_is_const(args[2])) {
                 tmp = do_constant_folding(opc, temps[args[1]].val,
                                           temps[args[2]].val);
                 tcg_opt_gen_movi(s, op, args, args[0], tmp);
@@ -1065,8 +1074,7 @@ void tcg_optimize(TCGContext *s)
             goto do_default;
 
         CASE_OP_32_64(deposit):
-            if (temps[args[1]].state == TCG_TEMP_CONST
-                && temps[args[2]].state == TCG_TEMP_CONST) {
+            if (temp_is_const(args[1]) && temp_is_const(args[2])) {
                 tmp = deposit64(temps[args[1]].val, args[3], args[4],
                                 temps[args[2]].val);
                 tcg_opt_gen_movi(s, op, args, args[0], tmp);
@@ -1106,10 +1114,8 @@ void tcg_optimize(TCGContext *s)
 
         case INDEX_op_add2_i32:
         case INDEX_op_sub2_i32:
-            if (temps[args[2]].state == TCG_TEMP_CONST
-                && temps[args[3]].state == TCG_TEMP_CONST
-                && temps[args[4]].state == TCG_TEMP_CONST
-                && temps[args[5]].state == TCG_TEMP_CONST) {
+            if (temp_is_const(args[2]) && temp_is_const(args[3])
+                && temp_is_const(args[4]) && temp_is_const(args[5])) {
                 uint32_t al = temps[args[2]].val;
                 uint32_t ah = temps[args[3]].val;
                 uint32_t bl = temps[args[4]].val;
@@ -1128,8 +1134,8 @@ void tcg_optimize(TCGContext *s)
 
                 rl = args[0];
                 rh = args[1];
-                tcg_opt_gen_movi(s, op, args, rl, (uint32_t)a);
-                tcg_opt_gen_movi(s, op2, args2, rh, (uint32_t)(a >> 32));
+                tcg_opt_gen_movi(s, op, args, rl, (int32_t)a);
+                tcg_opt_gen_movi(s, op2, args2, rh, (int32_t)(a >> 32));
 
                 /* We've done all we need to do with the movi.  Skip it.  */
                 oi_next = op2->next;
@@ -1138,8 +1144,7 @@ void tcg_optimize(TCGContext *s)
             goto do_default;
 
         case INDEX_op_mulu2_i32:
-            if (temps[args[2]].state == TCG_TEMP_CONST
-                && temps[args[3]].state == TCG_TEMP_CONST) {
+            if (temp_is_const(args[2]) && temp_is_const(args[3])) {
                 uint32_t a = temps[args[2]].val;
                 uint32_t b = temps[args[3]].val;
                 uint64_t r = (uint64_t)a * b;
@@ -1149,8 +1154,8 @@ void tcg_optimize(TCGContext *s)
 
                 rl = args[0];
                 rh = args[1];
-                tcg_opt_gen_movi(s, op, args, rl, (uint32_t)r);
-                tcg_opt_gen_movi(s, op2, args2, rh, (uint32_t)(r >> 32));
+                tcg_opt_gen_movi(s, op, args, rl, (int32_t)r);
+                tcg_opt_gen_movi(s, op2, args2, rh, (int32_t)(r >> 32));
 
                 /* We've done all we need to do with the movi.  Skip it.  */
                 oi_next = op2->next;
@@ -1171,10 +1176,8 @@ void tcg_optimize(TCGContext *s)
                     tcg_op_remove(s, op);
                 }
             } else if ((args[4] == TCG_COND_LT || args[4] == TCG_COND_GE)
-                       && temps[args[2]].state == TCG_TEMP_CONST
-                       && temps[args[3]].state == TCG_TEMP_CONST
-                       && temps[args[2]].val == 0
-                       && temps[args[3]].val == 0) {
+                       && temp_is_const(args[2]) && temps[args[2]].val == 0
+                       && temp_is_const(args[3]) && temps[args[3]].val == 0) {
                 /* Simplify LT/GE comparisons vs zero to a single compare
                    vs the high word of the input.  */
             do_brcond_high:
@@ -1236,10 +1239,8 @@ void tcg_optimize(TCGContext *s)
             do_setcond_const:
                 tcg_opt_gen_movi(s, op, args, args[0], tmp);
             } else if ((args[5] == TCG_COND_LT || args[5] == TCG_COND_GE)
-                       && temps[args[3]].state == TCG_TEMP_CONST
-                       && temps[args[4]].state == TCG_TEMP_CONST
-                       && temps[args[3]].val == 0
-                       && temps[args[4]].val == 0) {
+                       && temp_is_const(args[3]) && temps[args[3]].val == 0
+                       && temp_is_const(args[4]) && temps[args[4]].val == 0) {
                 /* Simplify LT/GE comparisons vs zero to a single compare
                    vs the high word of the input.  */
             do_setcond_high:
@@ -1299,7 +1300,9 @@ void tcg_optimize(TCGContext *s)
             if (!(args[nb_oargs + nb_iargs + 1]
                   & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
                 for (i = 0; i < nb_globals; i++) {
-                    reset_temp(i);
+                    if (test_bit(i, temps_used.l)) {
+                        reset_temp(i);
+                    }
                 }
             }
             goto do_reset_output;
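
The optimizer rework above replaces the three-state tcg_temp_state enum
with an is_const flag plus a circular doubly-linked list of copies, and
activates temps lazily through the temps_used bitmap. A minimal sketch
of that copy-list bookkeeping follows; the fixed array size and helper
names are assumptions for illustration, not code from the patch:

    #include <stdbool.h>
    #include <stdint.h>

    struct temp_info {
        bool is_const;
        int prev_copy, next_copy;   /* circular doubly-linked copy list */
        uint64_t val;
    };

    static struct temp_info temps[16];

    /* A temp that is not a copy simply links to itself, so no special
       cases are needed when resetting or unlinking.  */
    static void init_temp(int t)
    {
        temps[t].next_copy = temps[t].prev_copy = t;
        temps[t].is_const = false;
    }

    /* Unlink T from whatever copy list it is on and forget its value.  */
    static void reset_temp(int t)
    {
        temps[temps[t].next_copy].prev_copy = temps[t].prev_copy;
        temps[temps[t].prev_copy].next_copy = temps[t].next_copy;
        temps[t].next_copy = temps[t].prev_copy = t;
        temps[t].is_const = false;
    }

    /* Record that DST is now a copy of SRC, as tcg_opt_gen_mov does.  */
    static void link_copy(int dst, int src)
    {
        temps[dst].next_copy = temps[src].next_copy;
        temps[dst].prev_copy = src;
        temps[temps[dst].next_copy].prev_copy = dst;
        temps[src].next_copy = dst;
        temps[dst].is_const = temps[src].is_const;
        temps[dst].val = temps[src].val;
    }

Because unlinking a self-linked node is a no-op, reset_temp() no longer
needs the one-copy special case that the old TCG_TEMP_COPY code had.
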
diff --git a/tcg/ppc/tcg-target.c b/tcg/ppc/tcg-target.c
index 2b6eafa03c..92ef719e40 100644
--- a/tcg/ppc/tcg-target.c
+++ b/tcg/ppc/tcg-target.c
@@ -80,19 +80,13 @@
 
 static tcg_insn_unit *tb_ret_addr;
 
-#ifndef GUEST_BASE
-#define GUEST_BASE 0
-#endif
-
 #include "elf.h"
 static bool have_isa_2_06;
 #define HAVE_ISA_2_06  have_isa_2_06
 #define HAVE_ISEL      have_isa_2_06
 
-#ifdef CONFIG_USE_GUEST_BASE
+#ifndef CONFIG_SOFTMMU
 #define TCG_GUEST_BASE_REG 30
-#else
-#define TCG_GUEST_BASE_REG 0
 #endif
 
 #ifndef NDEBUG
@@ -1361,7 +1355,7 @@ static void * const qemu_st_helpers[16] = {
    in CR7, loads the addend of the TLB into R3, and returns the register
    containing the guest address (zero-extended into R4).  Clobbers R0 and R2. */
 
-static TCGReg tcg_out_tlb_read(TCGContext *s, TCGMemOp s_bits,
+static TCGReg tcg_out_tlb_read(TCGContext *s, TCGMemOp opc,
                                TCGReg addrlo, TCGReg addrhi,
                                int mem_index, bool is_read)
 {
@@ -1371,6 +1365,7 @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGMemOp s_bits,
            : offsetof(CPUArchState, tlb_table[mem_index][0].addr_write));
     int add_off = offsetof(CPUArchState, tlb_table[mem_index][0].addend);
     TCGReg base = TCG_AREG0;
+    TCGMemOp s_bits = opc & MO_SIZE;
 
     /* Extract the page index, shifted into place for tlb index.  */
     if (TCG_TARGET_REG_BITS == 64) {
@@ -1422,17 +1417,37 @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGMemOp s_bits,
        to minimize any load use delay.  */
     tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R3, TCG_REG_R3, add_off);
 
-    /* Clear the non-page, non-alignment bits from the address.  */
+    /* Clear the non-page, non-alignment bits from the address */
     if (TCG_TARGET_REG_BITS == 32 || TARGET_LONG_BITS == 32) {
+        /* We don't support unaligned accesses on 32-bit targets;
+         * preserve the bottom bits and thus trigger a comparison
+         * failure on unaligned accesses.
+         */
         tcg_out_rlw(s, RLWINM, TCG_REG_R0, addrlo, 0,
                     (32 - s_bits) & 31, 31 - TARGET_PAGE_BITS);
-    } else if (!s_bits) {
-        tcg_out_rld(s, RLDICR, TCG_REG_R0, addrlo,
-                    0, 63 - TARGET_PAGE_BITS);
+    } else if (s_bits) {
+        /* > byte access, we need to handle alignment */
+        if ((opc & MO_AMASK) == MO_ALIGN) {
+            /* Alignment required by the front-end, same as 32-bits */
+            tcg_out_rld(s, RLDICL, TCG_REG_R0, addrlo,
+                        64 - TARGET_PAGE_BITS, TARGET_PAGE_BITS - s_bits);
+            tcg_out_rld(s, RLDICL, TCG_REG_R0, TCG_REG_R0, TARGET_PAGE_BITS, 0);
+        } else {
+            /* We support unaligned accesses; we need to make sure we fail
+             * if we cross a page boundary. The trick is to add the
+             * access_size-1 to the address before masking the low bits.
+             * That will make the address overflow to the next page if we
+             * cross a page boundary, which will then force a mismatch of
+             * the TLB compare, since the next page cannot possibly be in
+             * the same TLB index.
+             */
+            tcg_out32(s, ADDI | TAI(TCG_REG_R0, addrlo, (1 << s_bits) - 1));
+            tcg_out_rld(s, RLDICR, TCG_REG_R0, TCG_REG_R0,
+                        0, 63 - TARGET_PAGE_BITS);
+        }
     } else {
-        tcg_out_rld(s, RLDICL, TCG_REG_R0, addrlo,
-                    64 - TARGET_PAGE_BITS, TARGET_PAGE_BITS - s_bits);
-        tcg_out_rld(s, RLDICL, TCG_REG_R0, TCG_REG_R0, TARGET_PAGE_BITS, 0);
+        /* Byte access, just chop off the bits below the page index */
+        tcg_out_rld(s, RLDICR, TCG_REG_R0, addrlo, 0, 63 - TARGET_PAGE_BITS);
     }
 
     if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
@@ -1592,7 +1607,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is_64)
 
 #ifdef CONFIG_SOFTMMU
     mem_index = get_mmuidx(oi);
-    addrlo = tcg_out_tlb_read(s, s_bits, addrlo, addrhi, mem_index, true);
+    addrlo = tcg_out_tlb_read(s, opc, addrlo, addrhi, mem_index, true);
 
     /* Load a pointer into the current opcode w/conditional branch-link. */
     label_ptr = s->code_ptr;
@@ -1600,7 +1615,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is_64)
 
     rbase = TCG_REG_R3;
 #else  /* !CONFIG_SOFTMMU */
-    rbase = GUEST_BASE ? TCG_GUEST_BASE_REG : 0;
+    rbase = guest_base ? TCG_GUEST_BASE_REG : 0;
     if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
         tcg_out_ext32u(s, TCG_REG_TMP1, addrlo);
         addrlo = TCG_REG_TMP1;
@@ -1667,7 +1682,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
 
 #ifdef CONFIG_SOFTMMU
     mem_index = get_mmuidx(oi);
-    addrlo = tcg_out_tlb_read(s, s_bits, addrlo, addrhi, mem_index, false);
+    addrlo = tcg_out_tlb_read(s, opc, addrlo, addrhi, mem_index, false);
 
     /* Load a pointer into the current opcode w/conditional branch-link. */
     label_ptr = s->code_ptr;
@@ -1675,7 +1690,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
 
     rbase = TCG_REG_R3;
 #else  /* !CONFIG_SOFTMMU */
-    rbase = GUEST_BASE ? TCG_GUEST_BASE_REG : 0;
+    rbase = guest_base ? TCG_GUEST_BASE_REG : 0;
     if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
         tcg_out_ext32u(s, TCG_REG_TMP1, addrlo);
         addrlo = TCG_REG_TMP1;
@@ -1779,9 +1794,9 @@ static void tcg_target_qemu_prologue(TCGContext *s)
     }
     tcg_out_st(s, TCG_TYPE_PTR, TCG_REG_R0, TCG_REG_R1, FRAME_SIZE+LR_OFFSET);
 
-#ifdef CONFIG_USE_GUEST_BASE
-    if (GUEST_BASE) {
-        tcg_out_movi(s, TCG_TYPE_PTR, TCG_GUEST_BASE_REG, GUEST_BASE);
+#ifndef CONFIG_SOFTMMU
+    if (guest_base) {
+        tcg_out_movi(s, TCG_TYPE_PTR, TCG_GUEST_BASE_REG, guest_base);
         tcg_regset_set_reg(s->reserved_regs, TCG_GUEST_BASE_REG);
     }
 #endif
@@ -2200,12 +2215,16 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
     case INDEX_op_ext16s_i64:
         c = EXTSH;
         goto gen_ext;
+    case INDEX_op_ext_i32_i64:
     case INDEX_op_ext32s_i64:
         c = EXTSW;
         goto gen_ext;
     gen_ext:
         tcg_out32(s, c | RS(args[1]) | RA(args[0]));
         break;
+    case INDEX_op_extu_i32_i64:
+        tcg_out_ext32u(s, args[0], args[1]);
+        break;
 
     case INDEX_op_setcond_i32:
         tcg_out_setcond(s, TCG_TYPE_I32, args[3], args[0], args[1], args[2],
@@ -2482,6 +2501,8 @@ static const TCGTargetOpDef ppc_op_defs[] = {
     { INDEX_op_ext8s_i64, { "r", "r" } },
     { INDEX_op_ext16s_i64, { "r", "r" } },
     { INDEX_op_ext32s_i64, { "r", "r" } },
+    { INDEX_op_ext_i32_i64, { "r", "r" } },
+    { INDEX_op_extu_i32_i64, { "r", "r" } },
     { INDEX_op_bswap16_i64, { "r", "r" } },
     { INDEX_op_bswap32_i64, { "r", "r" } },
     { INDEX_op_bswap64_i64, { "r", "r" } },
diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
index 7ce7048824..b4f0818762 100644
--- a/tcg/ppc/tcg-target.h
+++ b/tcg/ppc/tcg-target.h
@@ -77,7 +77,8 @@ typedef enum {
 #if TCG_TARGET_REG_BITS == 64
 #define TCG_TARGET_HAS_add2_i32         0
 #define TCG_TARGET_HAS_sub2_i32         0
-#define TCG_TARGET_HAS_trunc_shr_i32    0
+#define TCG_TARGET_HAS_extrl_i64_i32    0
+#define TCG_TARGET_HAS_extrh_i64_i32    0
 #define TCG_TARGET_HAS_div_i64          1
 #define TCG_TARGET_HAS_rem_i64          0
 #define TCG_TARGET_HAS_rot_i64          1
diff --git a/tcg/s390/tcg-target.c b/tcg/s390/tcg-target.c
index aa718eca0c..fbf97bb2e1 100644
--- a/tcg/s390/tcg-target.c
+++ b/tcg/s390/tcg-target.c
@@ -51,17 +51,10 @@
 /* A scratch register that may be used throughout the backend.  */
 #define TCG_TMP0        TCG_REG_R14
 
-#ifdef CONFIG_USE_GUEST_BASE
+#ifndef CONFIG_SOFTMMU
 #define TCG_GUEST_BASE_REG TCG_REG_R13
-#else
-#define TCG_GUEST_BASE_REG TCG_REG_R0
-#endif
-
-#ifndef GUEST_BASE
-#define GUEST_BASE 0
 #endif
 
-
 /* All of the following instructions are prefixed with their instruction
    format, and are defined as 8- or 16-bit quantities, even when the two
    halves of the 16-bit quantity may appear 32 bits apart in the insn.
@@ -1504,20 +1497,36 @@ QEMU_BUILD_BUG_ON(offsetof(CPUArchState, tlb_table[NB_MMU_MODES - 1][1])
 static TCGReg tcg_out_tlb_read(TCGContext* s, TCGReg addr_reg, TCGMemOp opc,
                                int mem_index, bool is_ld)
 {
-    TCGMemOp s_bits = opc & MO_SIZE;
-    uint64_t tlb_mask = TARGET_PAGE_MASK | ((1 << s_bits) - 1);
-    int ofs;
+    int s_mask = (1 << (opc & MO_SIZE)) - 1;
+    int ofs, a_off;
+    uint64_t tlb_mask;
+
+    /* For aligned accesses, we check the first byte and include the alignment
+       bits within the address.  For unaligned access, we check that we don't
+       cross pages using the address of the last byte of the access.  */
+    if ((opc & MO_AMASK) == MO_ALIGN || s_mask == 0) {
+        a_off = 0;
+        tlb_mask = TARGET_PAGE_MASK | s_mask;
+    } else {
+        a_off = s_mask;
+        tlb_mask = TARGET_PAGE_MASK;
+    }
 
     if (facilities & FACILITY_GEN_INST_EXT) {
         tcg_out_risbg(s, TCG_REG_R2, addr_reg,
                       64 - CPU_TLB_BITS - CPU_TLB_ENTRY_BITS,
                       63 - CPU_TLB_ENTRY_BITS,
                       64 + CPU_TLB_ENTRY_BITS - TARGET_PAGE_BITS, 1);
-        tgen_andi_risbg(s, TCG_REG_R3, addr_reg, tlb_mask);
+        if (a_off) {
+            tcg_out_insn(s, RX, LA, TCG_REG_R3, addr_reg, TCG_REG_NONE, a_off);
+            tgen_andi(s, TCG_TYPE_TL, TCG_REG_R3, tlb_mask);
+        } else {
+            tgen_andi_risbg(s, TCG_REG_R3, addr_reg, tlb_mask);
+        }
     } else {
         tcg_out_sh64(s, RSY_SRLG, TCG_REG_R2, addr_reg, TCG_REG_NONE,
                      TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
-        tcg_out_movi(s, TCG_TYPE_TL, TCG_REG_R3, addr_reg);
+        tcg_out_insn(s, RX, LA, TCG_REG_R3, addr_reg, TCG_REG_NONE, a_off);
         tgen_andi(s, TCG_TYPE_I64, TCG_REG_R2,
                   (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS);
         tgen_andi(s, TCG_TYPE_TL, TCG_REG_R3, tlb_mask);
@@ -1622,9 +1631,9 @@ static void tcg_prepare_user_ldst(TCGContext *s, TCGReg *addr_reg,
         tgen_ext32u(s, TCG_TMP0, *addr_reg);
         *addr_reg = TCG_TMP0;
     }
-    if (GUEST_BASE < 0x80000) {
+    if (guest_base < 0x80000) {
         *index_reg = TCG_REG_NONE;
-        *disp = GUEST_BASE;
+        *disp = guest_base;
     } else {
         *index_reg = TCG_GUEST_BASE_REG;
         *disp = 0;
@@ -2090,6 +2099,7 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
     case INDEX_op_ext16s_i64:
         tgen_ext16s(s, TCG_TYPE_I64, args[0], args[1]);
         break;
+    case INDEX_op_ext_i32_i64:
     case INDEX_op_ext32s_i64:
         tgen_ext32s(s, args[0], args[1]);
         break;
@@ -2099,6 +2109,7 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
     case INDEX_op_ext16u_i64:
         tgen_ext16u(s, TCG_TYPE_I64, args[0], args[1]);
         break;
+    case INDEX_op_extu_i32_i64:
     case INDEX_op_ext32u_i64:
         tgen_ext32u(s, args[0], args[1]);
         break;
@@ -2251,6 +2262,9 @@ static const TCGTargetOpDef s390_op_defs[] = {
     { INDEX_op_ext32s_i64, { "r", "r" } },
     { INDEX_op_ext32u_i64, { "r", "r" } },
 
+    { INDEX_op_ext_i32_i64, { "r", "r" } },
+    { INDEX_op_extu_i32_i64, { "r", "r" } },
+
     { INDEX_op_bswap16_i64, { "r", "r" } },
     { INDEX_op_bswap32_i64, { "r", "r" } },
     { INDEX_op_bswap64_i64, { "r", "r" } },
@@ -2328,10 +2342,12 @@ static void tcg_target_qemu_prologue(TCGContext *s)
                   TCG_STATIC_CALL_ARGS_SIZE + TCG_TARGET_CALL_STACK_OFFSET,
                   CPU_TEMP_BUF_NLONGS * sizeof(long));
 
-    if (GUEST_BASE >= 0x80000) {
-        tcg_out_movi(s, TCG_TYPE_PTR, TCG_GUEST_BASE_REG, GUEST_BASE);
+#ifndef CONFIG_SOFTMMU
+    if (guest_base >= 0x80000) {
+        tcg_out_movi(s, TCG_TYPE_PTR, TCG_GUEST_BASE_REG, guest_base);
         tcg_regset_set_reg(s->reserved_regs, TCG_GUEST_BASE_REG);
     }
+#endif
 
     tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
     /* br %r3 (go to TB) */
diff --git a/tcg/s390/tcg-target.h b/tcg/s390/tcg-target.h
index 91576d5949..d9dc038733 100644
--- a/tcg/s390/tcg-target.h
+++ b/tcg/s390/tcg-target.h
@@ -72,7 +72,8 @@ typedef enum TCGReg {
 #define TCG_TARGET_HAS_muls2_i32        0
 #define TCG_TARGET_HAS_muluh_i32        0
 #define TCG_TARGET_HAS_mulsh_i32        0
-#define TCG_TARGET_HAS_trunc_shr_i32    0
+#define TCG_TARGET_HAS_extrl_i64_i32    0
+#define TCG_TARGET_HAS_extrh_i64_i32    0
 
 #define TCG_TARGET_HAS_div2_i64         1
 #define TCG_TARGET_HAS_rot_i64          1
diff --git a/tcg/sparc/tcg-target.c b/tcg/sparc/tcg-target.c
index 1a870a81d7..54df1bc424 100644
--- a/tcg/sparc/tcg-target.c
+++ b/tcg/sparc/tcg-target.c
@@ -83,10 +83,8 @@ static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
 #define TCG_REG_T1  TCG_REG_G1
 #define TCG_REG_T2  TCG_REG_O7
 
-#ifdef CONFIG_USE_GUEST_BASE
+#ifndef CONFIG_SOFTMMU
 # define TCG_GUEST_BASE_REG TCG_REG_I5
-#else
-# define TCG_GUEST_BASE_REG TCG_REG_G0
 #endif
 
 static const int tcg_target_reg_alloc_order[] = {
@@ -955,9 +953,9 @@ static void tcg_target_qemu_prologue(TCGContext *s)
     tcg_out32(s, SAVE | INSN_RD(TCG_REG_O6) | INSN_RS1(TCG_REG_O6) |
               INSN_IMM13(-frame_size));
 
-#ifdef CONFIG_USE_GUEST_BASE
-    if (GUEST_BASE != 0) {
-        tcg_out_movi(s, TCG_TYPE_PTR, TCG_GUEST_BASE_REG, GUEST_BASE);
+#ifndef CONFIG_SOFTMMU
+    if (guest_base != 0) {
+        tcg_out_movi(s, TCG_TYPE_PTR, TCG_GUEST_BASE_REG, guest_base);
         tcg_regset_set_reg(s->reserved_regs, TCG_GUEST_BASE_REG);
     }
 #endif
@@ -1146,7 +1144,7 @@ static void tcg_out_qemu_ld(TCGContext *s, TCGReg data, TCGReg addr,
         addr = TCG_REG_T1;
     }
     tcg_out_ldst_rr(s, data, addr,
-                    (GUEST_BASE ? TCG_GUEST_BASE_REG : TCG_REG_G0),
+                    (guest_base ? TCG_GUEST_BASE_REG : TCG_REG_G0),
                     qemu_ld_opc[memop & (MO_BSWAP | MO_SSIZE)]);
 #endif /* CONFIG_SOFTMMU */
 }
@@ -1201,7 +1199,7 @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg data, TCGReg addr,
         addr = TCG_REG_T1;
     }
     tcg_out_ldst_rr(s, data, addr,
-                    (GUEST_BASE ? TCG_GUEST_BASE_REG : TCG_REG_G0),
+                    (guest_base ? TCG_GUEST_BASE_REG : TCG_REG_G0),
                     qemu_st_opc[memop & (MO_BSWAP | MO_SIZE)]);
 #endif /* CONFIG_SOFTMMU */
 }
@@ -1407,18 +1405,19 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
     case INDEX_op_divu_i64:
         c = ARITH_UDIVX;
         goto gen_arith;
+    case INDEX_op_ext_i32_i64:
     case INDEX_op_ext32s_i64:
         tcg_out_arithi(s, a0, a1, 0, SHIFT_SRA);
         break;
+    case INDEX_op_extu_i32_i64:
     case INDEX_op_ext32u_i64:
         tcg_out_arithi(s, a0, a1, 0, SHIFT_SRL);
         break;
-    case INDEX_op_trunc_shr_i32:
-        if (a2 == 0) {
-            tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
-        } else {
-            tcg_out_arithi(s, a0, a1, a2, SHIFT_SRLX);
-        }
+    case INDEX_op_extrl_i64_i32:
+        tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
+        break;
+    case INDEX_op_extrh_i64_i32:
+        tcg_out_arithi(s, a0, a1, 32, SHIFT_SRLX);
         break;
 
     case INDEX_op_brcond_i64:
@@ -1531,9 +1530,12 @@ static const TCGTargetOpDef sparc_op_defs[] = {
     { INDEX_op_neg_i64, { "R", "RJ" } },
     { INDEX_op_not_i64, { "R", "RJ" } },
 
-    { INDEX_op_ext32s_i64, { "R", "r" } },
-    { INDEX_op_ext32u_i64, { "R", "r" } },
-    { INDEX_op_trunc_shr_i32,  { "r", "R" } },
+    { INDEX_op_ext32s_i64, { "R", "R" } },
+    { INDEX_op_ext32u_i64, { "R", "R" } },
+    { INDEX_op_ext_i32_i64, { "R", "r" } },
+    { INDEX_op_extu_i32_i64, { "R", "r" } },
+    { INDEX_op_extrl_i64_i32,  { "r", "R" } },
+    { INDEX_op_extrh_i64_i32,  { "r", "R" } },
 
     { INDEX_op_brcond_i64, { "RZ", "RJ" } },
     { INDEX_op_setcond_i64, { "R", "RZ", "RJ" } },
diff --git a/tcg/sparc/tcg-target.h b/tcg/sparc/tcg-target.h
index f584de4766..2cd72d2d41 100644
--- a/tcg/sparc/tcg-target.h
+++ b/tcg/sparc/tcg-target.h
@@ -118,7 +118,8 @@ extern bool use_vis3_instructions;
 #define TCG_TARGET_HAS_muluh_i32        0
 #define TCG_TARGET_HAS_mulsh_i32        0
 
-#define TCG_TARGET_HAS_trunc_shr_i32    1
+#define TCG_TARGET_HAS_extrl_i64_i32    1
+#define TCG_TARGET_HAS_extrh_i64_i32    1
 #define TCG_TARGET_HAS_div_i64          1
 #define TCG_TARGET_HAS_rem_i64          0
 #define TCG_TARGET_HAS_rot_i64          0
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index 45098c310e..0b9dd8ff9f 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -1737,28 +1737,28 @@ void tcg_gen_muls2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 arg1, TCGv_i64 arg2)
 
 /* Size changing operations.  */
 
-void tcg_gen_trunc_shr_i64_i32(TCGv_i32 ret, TCGv_i64 arg, unsigned count)
+void tcg_gen_extrl_i64_i32(TCGv_i32 ret, TCGv_i64 arg)
 {
-    tcg_debug_assert(count < 64);
     if (TCG_TARGET_REG_BITS == 32) {
-        if (count >= 32) {
-            tcg_gen_shri_i32(ret, TCGV_HIGH(arg), count - 32);
-        } else if (count == 0) {
-            tcg_gen_mov_i32(ret, TCGV_LOW(arg));
-        } else {
-            TCGv_i64 t = tcg_temp_new_i64();
-            tcg_gen_shri_i64(t, arg, count);
-            tcg_gen_mov_i32(ret, TCGV_LOW(t));
-            tcg_temp_free_i64(t);
-        }
-    } else if (TCG_TARGET_HAS_trunc_shr_i32) {
-        tcg_gen_op3i_i32(INDEX_op_trunc_shr_i32, ret,
-                         MAKE_TCGV_I32(GET_TCGV_I64(arg)), count);
-    } else if (count == 0) {
+        tcg_gen_mov_i32(ret, TCGV_LOW(arg));
+    } else if (TCG_TARGET_HAS_extrl_i64_i32) {
+        tcg_gen_op2(&tcg_ctx, INDEX_op_extrl_i64_i32,
+                    GET_TCGV_I32(ret), GET_TCGV_I64(arg));
+    } else {
         tcg_gen_mov_i32(ret, MAKE_TCGV_I32(GET_TCGV_I64(arg)));
+    }
+}
+
+void tcg_gen_extrh_i64_i32(TCGv_i32 ret, TCGv_i64 arg)
+{
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_mov_i32(ret, TCGV_HIGH(arg));
+    } else if (TCG_TARGET_HAS_extrh_i64_i32) {
+        tcg_gen_op2(&tcg_ctx, INDEX_op_extrh_i64_i32,
+                    GET_TCGV_I32(ret), GET_TCGV_I64(arg));
     } else {
         TCGv_i64 t = tcg_temp_new_i64();
-        tcg_gen_shri_i64(t, arg, count);
+        tcg_gen_shri_i64(t, arg, 32);
         tcg_gen_mov_i32(ret, MAKE_TCGV_I32(GET_TCGV_I64(t)));
         tcg_temp_free_i64(t);
     }
@@ -1770,9 +1770,8 @@ void tcg_gen_extu_i32_i64(TCGv_i64 ret, TCGv_i32 arg)
         tcg_gen_mov_i32(TCGV_LOW(ret), arg);
         tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
     } else {
-        /* Note: we assume the target supports move between
-           32 and 64 bit registers.  */
-        tcg_gen_ext32u_i64(ret, MAKE_TCGV_I64(GET_TCGV_I32(arg)));
+        tcg_gen_op2(&tcg_ctx, INDEX_op_extu_i32_i64,
+                    GET_TCGV_I64(ret), GET_TCGV_I32(arg));
     }
 }
 
@@ -1782,9 +1781,8 @@ void tcg_gen_ext_i32_i64(TCGv_i64 ret, TCGv_i32 arg)
         tcg_gen_mov_i32(TCGV_LOW(ret), arg);
         tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_LOW(ret), 31);
     } else {
-        /* Note: we assume the target supports move between
-           32 and 64 bit registers.  */
-        tcg_gen_ext32s_i64(ret, MAKE_TCGV_I64(GET_TCGV_I32(arg)));
+        tcg_gen_op2(&tcg_ctx, INDEX_op_ext_i32_i64,
+                    GET_TCGV_I64(ret), GET_TCGV_I32(arg));
     }
 }
 
@@ -1820,8 +1818,8 @@ void tcg_gen_extr_i64_i32(TCGv_i32 lo, TCGv_i32 hi, TCGv_i64 arg)
         tcg_gen_mov_i32(lo, TCGV_LOW(arg));
         tcg_gen_mov_i32(hi, TCGV_HIGH(arg));
     } else {
-        tcg_gen_trunc_shr_i64_i32(lo, arg, 0);
-        tcg_gen_trunc_shr_i64_i32(hi, arg, 32);
+        tcg_gen_extrl_i64_i32(lo, arg);
+        tcg_gen_extrh_i64_i32(hi, arg);
     }
 }
 
diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index d1d763f6ff..6da083a1e9 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -684,7 +684,8 @@ static inline void tcg_gen_neg_i64(TCGv_i64 ret, TCGv_i64 arg)
 void tcg_gen_extu_i32_i64(TCGv_i64 ret, TCGv_i32 arg);
 void tcg_gen_ext_i32_i64(TCGv_i64 ret, TCGv_i32 arg);
 void tcg_gen_concat_i32_i64(TCGv_i64 dest, TCGv_i32 low, TCGv_i32 high);
-void tcg_gen_trunc_shr_i64_i32(TCGv_i32 ret, TCGv_i64 arg, unsigned int c);
+void tcg_gen_extrl_i64_i32(TCGv_i32 ret, TCGv_i64 arg);
+void tcg_gen_extrh_i64_i32(TCGv_i32 ret, TCGv_i64 arg);
 void tcg_gen_extr_i64_i32(TCGv_i32 lo, TCGv_i32 hi, TCGv_i64 arg);
 void tcg_gen_extr32_i64(TCGv_i64 lo, TCGv_i64 hi, TCGv_i64 arg);
 
@@ -693,11 +694,6 @@ static inline void tcg_gen_concat32_i64(TCGv_i64 ret, TCGv_i64 lo, TCGv_i64 hi)
     tcg_gen_deposit_i64(ret, lo, hi, 32, 32);
 }
 
-static inline void tcg_gen_trunc_i64_i32(TCGv_i32 ret, TCGv_i64 arg)
-{
-    tcg_gen_trunc_shr_i64_i32(ret, arg, 0);
-}
-
 /* QEMU specific operations.  */
 
 #ifndef TARGET_LONG_BITS
@@ -853,7 +849,7 @@ static inline void tcg_gen_qemu_st64(TCGv_i64 arg, TCGv addr, int mem_index)
 #define tcg_gen_divu_tl tcg_gen_divu_i64
 #define tcg_gen_remu_tl tcg_gen_remu_i64
 #define tcg_gen_discard_tl tcg_gen_discard_i64
-#define tcg_gen_trunc_tl_i32 tcg_gen_trunc_i64_i32
+#define tcg_gen_trunc_tl_i32 tcg_gen_extrl_i64_i32
 #define tcg_gen_trunc_i64_tl tcg_gen_mov_i64
 #define tcg_gen_extu_i32_tl tcg_gen_extu_i32_i64
 #define tcg_gen_ext_i32_tl tcg_gen_ext_i32_i64
@@ -932,7 +928,7 @@ static inline void tcg_gen_qemu_st64(TCGv_i64 arg, TCGv addr, int mem_index)
 #define tcg_gen_remu_tl tcg_gen_remu_i32
 #define tcg_gen_discard_tl tcg_gen_discard_i32
 #define tcg_gen_trunc_tl_i32 tcg_gen_mov_i32
-#define tcg_gen_trunc_i64_tl tcg_gen_trunc_i64_i32
+#define tcg_gen_trunc_i64_tl tcg_gen_extrl_i64_i32
 #define tcg_gen_extu_i32_tl tcg_gen_mov_i32
 #define tcg_gen_ext_i32_tl tcg_gen_mov_i32
 #define tcg_gen_extu_tl_i64 tcg_gen_extu_i32_i64
diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h
index 13ccb60a5d..02bbf30387 100644
--- a/tcg/tcg-opc.h
+++ b/tcg/tcg-opc.h
@@ -138,8 +138,14 @@ DEF(rotl_i64, 1, 2, 0, IMPL64 | IMPL(TCG_TARGET_HAS_rot_i64))
 DEF(rotr_i64, 1, 2, 0, IMPL64 | IMPL(TCG_TARGET_HAS_rot_i64))
 DEF(deposit_i64, 1, 2, 2, IMPL64 | IMPL(TCG_TARGET_HAS_deposit_i64))
 
-DEF(trunc_shr_i32, 1, 1, 1,
-    IMPL(TCG_TARGET_HAS_trunc_shr_i32)
+/* size changing ops */
+DEF(ext_i32_i64, 1, 1, 0, IMPL64)
+DEF(extu_i32_i64, 1, 1, 0, IMPL64)
+DEF(extrl_i64_i32, 1, 1, 0,
+    IMPL(TCG_TARGET_HAS_extrl_i64_i32)
+    | (TCG_TARGET_REG_BITS == 32 ? TCG_OPF_NOT_PRESENT : 0))
+DEF(extrh_i64_i32, 1, 1, 0,
+    IMPL(TCG_TARGET_HAS_extrh_i64_i32)
     | (TCG_TARGET_REG_BITS == 32 ? TCG_OPF_NOT_PRESENT : 0))
 
 DEF(brcond_i64, 0, 2, 2, TCG_OPF_BB_END | IMPL64)
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 231a781524..f437824ba9 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -66,7 +66,8 @@ typedef uint64_t TCGRegSet;
 
 #if TCG_TARGET_REG_BITS == 32
 /* Turn some undef macros into false macros.  */
-#define TCG_TARGET_HAS_trunc_shr_i32    0
+#define TCG_TARGET_HAS_extrl_i64_i32    0
+#define TCG_TARGET_HAS_extrh_i64_i32    0
 #define TCG_TARGET_HAS_div_i64          0
 #define TCG_TARGET_HAS_rem_i64          0
 #define TCG_TARGET_HAS_div2_i64         0
diff --git a/tcg/tci/tcg-target.c b/tcg/tci/tcg-target.c
index 83472dbcd8..bbb54d4e8c 100644
--- a/tcg/tci/tcg-target.c
+++ b/tcg/tci/tcg-target.c
@@ -210,6 +210,8 @@ static const TCGTargetOpDef tcg_target_op_defs[] = {
 #if TCG_TARGET_HAS_ext32u_i64
     { INDEX_op_ext32u_i64, { R, R } },
 #endif
+    { INDEX_op_ext_i32_i64, { R, R } },
+    { INDEX_op_extu_i32_i64, { R, R } },
 #if TCG_TARGET_HAS_bswap16_i64
     { INDEX_op_bswap16_i64, { R, R } },
 #endif
@@ -701,6 +703,8 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
     case INDEX_op_ext16u_i64:   /* Optional (TCG_TARGET_HAS_ext16u_i64). */
     case INDEX_op_ext32s_i64:   /* Optional (TCG_TARGET_HAS_ext32s_i64). */
     case INDEX_op_ext32u_i64:   /* Optional (TCG_TARGET_HAS_ext32u_i64). */
+    case INDEX_op_ext_i32_i64:
+    case INDEX_op_extu_i32_i64:
 #endif /* TCG_TARGET_REG_BITS == 64 */
     case INDEX_op_neg_i32:      /* Optional (TCG_TARGET_HAS_neg_i32). */
     case INDEX_op_not_i32:      /* Optional (TCG_TARGET_HAS_not_i32). */
diff --git a/tcg/tci/tcg-target.h b/tcg/tci/tcg-target.h
index cbf3f9b5a6..77e5952781 100644
--- a/tcg/tci/tcg-target.h
+++ b/tcg/tci/tcg-target.h
@@ -84,7 +84,8 @@
 #define TCG_TARGET_HAS_mulsh_i32        0
 
 #if TCG_TARGET_REG_BITS == 64
-#define TCG_TARGET_HAS_trunc_shr_i32    0
+#define TCG_TARGET_HAS_extrl_i64_i32    0
+#define TCG_TARGET_HAS_extrh_i64_i32    0
 #define TCG_TARGET_HAS_bswap16_i64      1
 #define TCG_TARGET_HAS_bswap32_i64      1
 #define TCG_TARGET_HAS_bswap64_i64      1
diff --git a/tci.c b/tci.c
index 84449489d2..3d6d17783d 100644
--- a/tci.c
+++ b/tci.c
@@ -1033,18 +1033,20 @@ uintptr_t tcg_qemu_tb_exec(CPUArchState *env, uint8_t *tb_ptr)
 #endif
 #if TCG_TARGET_HAS_ext32s_i64
         case INDEX_op_ext32s_i64:
+#endif
+        case INDEX_op_ext_i32_i64:
             t0 = *tb_ptr++;
             t1 = tci_read_r32s(&tb_ptr);
             tci_write_reg64(t0, t1);
             break;
-#endif
 #if TCG_TARGET_HAS_ext32u_i64
         case INDEX_op_ext32u_i64:
+#endif
+        case INDEX_op_extu_i32_i64:
             t0 = *tb_ptr++;
             t1 = tci_read_r32(&tb_ptr);
             tci_write_reg64(t0, t1);
             break;
-#endif
 #if TCG_TARGET_HAS_bswap16_i64
         case INDEX_op_bswap16_i64:
             TODO();
diff --git a/tests/Makefile b/tests/Makefile
index 749458224a..52711237ca 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -381,7 +381,7 @@ tests/wdt_ib700-test$(EXESUF): tests/wdt_ib700-test.o
 tests/tco-test$(EXESUF): tests/tco-test.o $(libqos-pc-obj-y)
 tests/virtio-balloon-test$(EXESUF): tests/virtio-balloon-test.o
 tests/virtio-blk-test$(EXESUF): tests/virtio-blk-test.o $(libqos-virtio-obj-y)
-tests/virtio-net-test$(EXESUF): tests/virtio-net-test.o $(libqos-pc-obj-y)
+tests/virtio-net-test$(EXESUF): tests/virtio-net-test.o $(libqos-pc-obj-y) $(libqos-virtio-obj-y)
 tests/virtio-rng-test$(EXESUF): tests/virtio-rng-test.o $(libqos-pc-obj-y)
 tests/virtio-scsi-test$(EXESUF): tests/virtio-scsi-test.o $(libqos-virtio-obj-y)
 tests/virtio-9p-test$(EXESUF): tests/virtio-9p-test.o
diff --git a/tests/virtio-net-test.c b/tests/virtio-net-test.c
index ea7478c278..982d77a14b 100644
--- a/tests/virtio-net-test.c
+++ b/tests/virtio-net-test.c
@@ -10,20 +10,242 @@
 #include <glib.h>
 #include <string.h>
 #include "libqtest.h"
+#include "qemu-common.h"
+#include "qemu/sockets.h"
 #include "qemu/osdep.h"
-#include "libqos/pci.h"
+#include "qemu/iov.h"
+#include "libqos/pci-pc.h"
+#include "libqos/virtio.h"
+#include "libqos/virtio-pci.h"
+#include "libqos/malloc.h"
+#include "libqos/malloc-pc.h"
+#include "libqos/malloc-generic.h"
+#include "qemu/bswap.h"
+#include "hw/virtio/virtio-net.h"
 
 #define PCI_SLOT_HP             0x06
+#define PCI_SLOT                0x04
+#define PCI_FN                  0x00
 
-/* Tests only initialization so far. TODO: Replace with functional tests */
-static void pci_nop(void)
+#define QVIRTIO_NET_TIMEOUT_US (30 * 1000 * 1000)
+#define VNET_HDR_SIZE sizeof(struct virtio_net_hdr_mrg_rxbuf)
+
+static void test_end(void)
+{
+    qtest_end();
+}
+
+#ifndef _WIN32
+
+static QVirtioPCIDevice *virtio_net_pci_init(QPCIBus *bus, int slot)
+{
+    QVirtioPCIDevice *dev;
+
+    dev = qvirtio_pci_device_find(bus, QVIRTIO_NET_DEVICE_ID);
+    g_assert(dev != NULL);
+    g_assert_cmphex(dev->vdev.device_type, ==, QVIRTIO_NET_DEVICE_ID);
+
+    qvirtio_pci_device_enable(dev);
+    qvirtio_reset(&qvirtio_pci, &dev->vdev);
+    qvirtio_set_acknowledge(&qvirtio_pci, &dev->vdev);
+    qvirtio_set_driver(&qvirtio_pci, &dev->vdev);
+
+    return dev;
+}
+
+static QPCIBus *pci_test_start(int socket)
+{
+    char *cmdline;
+
+    cmdline = g_strdup_printf("-netdev socket,fd=%d,id=hs0 -device "
+                              "virtio-net-pci,netdev=hs0", socket);
+    qtest_start(cmdline);
+    g_free(cmdline);
+
+    return qpci_init_pc();
+}
+
+static void driver_init(const QVirtioBus *bus, QVirtioDevice *dev)
+{
+    uint32_t features;
+
+    features = qvirtio_get_features(bus, dev);
+    features = features & ~(QVIRTIO_F_BAD_FEATURE |
+                            QVIRTIO_F_RING_INDIRECT_DESC |
+                            QVIRTIO_F_RING_EVENT_IDX);
+    qvirtio_set_features(bus, dev, features);
+
+    qvirtio_set_driver_ok(bus, dev);
+}
+
+static void rx_test(const QVirtioBus *bus, QVirtioDevice *dev,
+                    QGuestAllocator *alloc, QVirtQueue *vq,
+                    int socket)
+{
+    uint64_t req_addr;
+    uint32_t free_head;
+    char test[] = "TEST";
+    char buffer[64];
+    int len = htonl(sizeof(test));
+    struct iovec iov[] = {
+        {
+            .iov_base = &len,
+            .iov_len = sizeof(len),
+        }, {
+            .iov_base = test,
+            .iov_len = sizeof(test),
+        },
+    };
+    int ret;
+
+    req_addr = guest_alloc(alloc, 64);
+
+    free_head = qvirtqueue_add(vq, req_addr, 64, true, false);
+    qvirtqueue_kick(bus, dev, vq, free_head);
+
+    ret = iov_send(socket, iov, 2, 0, sizeof(len) + sizeof(test));
+    g_assert_cmpint(ret, ==, sizeof(test) + sizeof(len));
+
+    qvirtio_wait_queue_isr(bus, dev, vq, QVIRTIO_NET_TIMEOUT_US);
+    memread(req_addr + VNET_HDR_SIZE, buffer, sizeof(test));
+    g_assert_cmpstr(buffer, ==, "TEST");
+
+    guest_free(alloc, req_addr);
+}
+
+static void tx_test(const QVirtioBus *bus, QVirtioDevice *dev,
+                    QGuestAllocator *alloc, QVirtQueue *vq,
+                    int socket)
+{
+    uint64_t req_addr;
+    uint32_t free_head;
+    uint32_t len;
+    char buffer[64];
+    int ret;
+
+    req_addr = guest_alloc(alloc, 64);
+    memwrite(req_addr + VNET_HDR_SIZE, "TEST", 4);
+
+    free_head = qvirtqueue_add(vq, req_addr, 64, false, false);
+    qvirtqueue_kick(bus, dev, vq, free_head);
+
+    qvirtio_wait_queue_isr(bus, dev, vq, QVIRTIO_NET_TIMEOUT_US);
+    guest_free(alloc, req_addr);
+
+    ret = qemu_recv(socket, &len, sizeof(len), 0);
+    g_assert_cmpint(ret, ==, sizeof(len));
+    len = ntohl(len);
+
+    ret = qemu_recv(socket, buffer, len, 0);
+    g_assert_cmpstr(buffer, ==, "TEST");
+}
+
+static void rx_stop_cont_test(const QVirtioBus *bus, QVirtioDevice *dev,
+                              QGuestAllocator *alloc, QVirtQueue *vq,
+                              int socket)
+{
+    uint64_t req_addr;
+    uint32_t free_head;
+    char test[] = "TEST";
+    char buffer[64];
+    int len = htonl(sizeof(test));
+    struct iovec iov[] = {
+        {
+            .iov_base = &len,
+            .iov_len = sizeof(len),
+        }, {
+            .iov_base = test,
+            .iov_len = sizeof(test),
+        },
+    };
+    int ret;
+
+    req_addr = guest_alloc(alloc, 64);
+
+    free_head = qvirtqueue_add(vq, req_addr, 64, true, false);
+    qvirtqueue_kick(bus, dev, vq, free_head);
+
+    qmp("{ 'execute' : 'stop'}");
+
+    ret = iov_send(socket, iov, 2, 0, sizeof(len) + sizeof(test));
+    g_assert_cmpint(ret, ==, sizeof(test) + sizeof(len));
+
+    /* We could check the status, but this command is mainly here to
+     * ensure the packet data gets queued in QEMU before we do 'cont'.
+     */
+    qmp("{ 'execute' : 'query-status'}");
+    qmp("{ 'execute' : 'cont'}");
+
+    qvirtio_wait_queue_isr(bus, dev, vq, QVIRTIO_NET_TIMEOUT_US);
+    memread(req_addr + VNET_HDR_SIZE, buffer, sizeof(test));
+    g_assert_cmpstr(buffer, ==, "TEST");
+
+    guest_free(alloc, req_addr);
+}
+
+static void send_recv_test(const QVirtioBus *bus, QVirtioDevice *dev,
+                           QGuestAllocator *alloc, QVirtQueue *rvq,
+                           QVirtQueue *tvq, int socket)
 {
+    rx_test(bus, dev, alloc, rvq, socket);
+    tx_test(bus, dev, alloc, tvq, socket);
 }
 
+static void stop_cont_test(const QVirtioBus *bus, QVirtioDevice *dev,
+                           QGuestAllocator *alloc, QVirtQueue *rvq,
+                           QVirtQueue *tvq, int socket)
+{
+    rx_stop_cont_test(bus, dev, alloc, rvq, socket);
+}
+
+static void pci_basic(gconstpointer data)
+{
+    QVirtioPCIDevice *dev;
+    QPCIBus *bus;
+    QVirtQueuePCI *tx, *rx;
+    QGuestAllocator *alloc;
+    void (*func) (const QVirtioBus *bus,
+                  QVirtioDevice *dev,
+                  QGuestAllocator *alloc,
+                  QVirtQueue *rvq,
+                  QVirtQueue *tvq,
+                  int socket) = data;
+    int sv[2], ret;
+
+    ret = socketpair(PF_UNIX, SOCK_STREAM, 0, sv);
+    g_assert_cmpint(ret, !=, -1);
+
+    bus = pci_test_start(sv[1]);
+    dev = virtio_net_pci_init(bus, PCI_SLOT);
+
+    alloc = pc_alloc_init();
+    rx = (QVirtQueuePCI *)qvirtqueue_setup(&qvirtio_pci, &dev->vdev,
+                                           alloc, 0);
+    tx = (QVirtQueuePCI *)qvirtqueue_setup(&qvirtio_pci, &dev->vdev,
+                                           alloc, 1);
+
+    driver_init(&qvirtio_pci, &dev->vdev);
+    func(&qvirtio_pci, &dev->vdev, alloc, &rx->vq, &tx->vq, sv[0]);
+
+    /* End test */
+    close(sv[0]);
+    guest_free(alloc, tx->vq.desc);
+    pc_alloc_uninit(alloc);
+    qvirtio_pci_device_disable(dev);
+    g_free(dev);
+    qpci_free_pc(bus);
+    test_end();
+}
+#endif
+
 static void hotplug(void)
 {
+    qtest_start("-device virtio-net-pci");
+
     qpci_plug_device_test("virtio-net-pci", "net1", PCI_SLOT_HP, NULL);
     qpci_unplug_acpi_device_test("net1", PCI_SLOT_HP);
+
+    test_end();
 }
 
 int main(int argc, char **argv)
@@ -31,13 +253,14 @@ int main(int argc, char **argv)
     int ret;
 
     g_test_init(&argc, &argv, NULL);
-    qtest_add_func("/virtio/net/pci/nop", pci_nop);
+#ifndef _WIN32
+    qtest_add_data_func("/virtio/net/pci/basic", send_recv_test, pci_basic);
+    qtest_add_data_func("/virtio/net/pci/rx_stop_cont",
+                        stop_cont_test, pci_basic);
+#endif
     qtest_add_func("/virtio/net/pci/hotplug", hotplug);
 
-    qtest_start("-device virtio-net-pci");
     ret = g_test_run();
 
-    qtest_end();
-
     return ret;
 }
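
The rx/tx tests above rely on the stream framing used by "-netdev socket,fd=...": each packet crossing the socketpair is prefixed with a 4-byte big-endian length, which is why rx_test sends htonl(sizeof(test)) ahead of the payload and tx_test reads a length word back before the data. Below is a minimal sketch of that framing using plain POSIX calls; the helper name send_frame is illustrative and not part of the test.

    #include <arpa/inet.h>
    #include <stdint.h>
    #include <unistd.h>

    /* Write one framed packet: 4-byte network-order length, then the payload. */
    static int send_frame(int fd, const void *buf, uint32_t size)
    {
        uint32_t len = htonl(size);                     /* length prefix */
        if (write(fd, &len, sizeof(len)) != (ssize_t)sizeof(len)) {
            return -1;
        }
        if (write(fd, buf, size) != (ssize_t)size) {
            return -1;
        }
        return 0;
    }

The receive side does the inverse, as tx_test shows: read the 4-byte prefix, ntohl() it, then read that many payload bytes.
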
diff --git a/tests/virtio-scsi-test.c b/tests/virtio-scsi-test.c
index 11ccdd632e..66d8491e9d 100644
--- a/tests/virtio-scsi-test.c
+++ b/tests/virtio-scsi-test.c
@@ -13,6 +13,7 @@
 #include "libqtest.h"
 #include "qemu/osdep.h"
 #include <stdio.h>
+#include "block/scsi.h"
 #include "libqos/virtio.h"
 #include "libqos/virtio-pci.h"
 #include "libqos/pci-pc.h"
@@ -71,40 +72,6 @@ static void qvirtio_scsi_stop(void)
     qtest_end();
 }
 
-static QVirtIOSCSI *qvirtio_scsi_pci_init(int slot)
-{
-    QVirtIOSCSI *vs;
-    QVirtioPCIDevice *dev;
-    void *addr;
-    int i;
-
-    vs = g_new0(QVirtIOSCSI, 1);
-    vs->alloc = pc_alloc_init();
-    vs->bus = qpci_init_pc();
-
-    dev = qvirtio_pci_device_find(vs->bus, QVIRTIO_SCSI_DEVICE_ID);
-    vs->dev = (QVirtioDevice *)dev;
-    g_assert(dev != NULL);
-    g_assert_cmphex(vs->dev->device_type, ==, QVIRTIO_SCSI_DEVICE_ID);
-
-    qvirtio_pci_device_enable(dev);
-    qvirtio_reset(&qvirtio_pci, vs->dev);
-    qvirtio_set_acknowledge(&qvirtio_pci, vs->dev);
-    qvirtio_set_driver(&qvirtio_pci, vs->dev);
-
-    addr = dev->addr + QVIRTIO_PCI_DEVICE_SPECIFIC_NO_MSIX;
-    vs->num_queues = qvirtio_config_readl(&qvirtio_pci, vs->dev,
-                                          (uint64_t)(uintptr_t)addr);
-
-    g_assert_cmpint(vs->num_queues, <, MAX_NUM_QUEUES);
-
-    for (i = 0; i < vs->num_queues + 2; i++) {
-        vs->vq[i] = qvirtqueue_setup(&qvirtio_pci, vs->dev, vs->alloc, i);
-    }
-
-    return vs;
-}
-
 static void qvirtio_scsi_pci_free(QVirtIOSCSI *vs)
 {
     int i;
@@ -134,7 +101,8 @@ static uint64_t qvirtio_scsi_alloc(QVirtIOSCSI *vs, size_t alloc_size,
 static uint8_t virtio_scsi_do_command(QVirtIOSCSI *vs, const uint8_t *cdb,
                                       const uint8_t *data_in,
                                       size_t data_in_len,
-                                      uint8_t *data_out, size_t data_out_len)
+                                      uint8_t *data_out, size_t data_out_len,
+                                      QVirtIOSCSICmdResp *resp_out)
 {
     QVirtQueue *vq;
     QVirtIOSCSICmdReq req = { { 0 } };
@@ -174,6 +142,10 @@ static uint8_t virtio_scsi_do_command(QVirtIOSCSI *vs, const uint8_t *cdb,
 
     response = readb(resp_addr + offsetof(QVirtIOSCSICmdResp, response));
 
+    if (resp_out) {
+        memread(resp_addr, resp_out, sizeof(*resp_out));
+    }
+
     guest_free(vs->alloc, req_addr);
     guest_free(vs->alloc, resp_addr);
     guest_free(vs->alloc, data_in_addr);
@@ -181,6 +153,52 @@ static uint8_t virtio_scsi_do_command(QVirtIOSCSI *vs, const uint8_t *cdb,
     return response;
 }
 
+static QVirtIOSCSI *qvirtio_scsi_pci_init(int slot)
+{
+    const uint8_t test_unit_ready_cdb[CDB_SIZE] = {};
+    QVirtIOSCSI *vs;
+    QVirtioPCIDevice *dev;
+    QVirtIOSCSICmdResp resp;
+    void *addr;
+    int i;
+
+    vs = g_new0(QVirtIOSCSI, 1);
+    vs->alloc = pc_alloc_init();
+    vs->bus = qpci_init_pc();
+
+    dev = qvirtio_pci_device_find(vs->bus, QVIRTIO_SCSI_DEVICE_ID);
+    vs->dev = (QVirtioDevice *)dev;
+    g_assert(dev != NULL);
+    g_assert_cmphex(vs->dev->device_type, ==, QVIRTIO_SCSI_DEVICE_ID);
+
+    qvirtio_pci_device_enable(dev);
+    qvirtio_reset(&qvirtio_pci, vs->dev);
+    qvirtio_set_acknowledge(&qvirtio_pci, vs->dev);
+    qvirtio_set_driver(&qvirtio_pci, vs->dev);
+
+    addr = dev->addr + QVIRTIO_PCI_DEVICE_SPECIFIC_NO_MSIX;
+    vs->num_queues = qvirtio_config_readl(&qvirtio_pci, vs->dev,
+                                          (uint64_t)(uintptr_t)addr);
+
+    g_assert_cmpint(vs->num_queues, <, MAX_NUM_QUEUES);
+
+    for (i = 0; i < vs->num_queues + 2; i++) {
+        vs->vq[i] = qvirtqueue_setup(&qvirtio_pci, vs->dev, vs->alloc, i);
+    }
+
+    /* Clear the POWER ON OCCURRED unit attention */
+    g_assert_cmpint(virtio_scsi_do_command(vs, test_unit_ready_cdb,
+                                           NULL, 0, NULL, 0, &resp),
+                    ==, 0);
+    g_assert_cmpint(resp.status, ==, CHECK_CONDITION);
+    g_assert_cmpint(resp.sense[0], ==, 0x70); /* Fixed format sense buffer */
+    g_assert_cmpint(resp.sense[2], ==, UNIT_ATTENTION);
+    g_assert_cmpint(resp.sense[12], ==, 0x29); /* POWER ON */
+    g_assert_cmpint(resp.sense[13], ==, 0x00);
+
+    return vs;
+}
+
 /* Tests only initialization so far. TODO: Replace with functional tests */
 static void pci_nop(void)
 {
@@ -221,9 +239,12 @@ static void hotplug(void)
 static void test_unaligned_write_same(void)
 {
     QVirtIOSCSI *vs;
-    uint8_t buf[512] = { 0 };
-    const uint8_t write_same_cdb[CDB_SIZE] = { 0x41, 0x00, 0x00, 0x00, 0x00,
+    uint8_t buf1[512] = { 0 };
+    uint8_t buf2[512] = { 1 };
+    const uint8_t write_same_cdb_1[CDB_SIZE] = { 0x41, 0x00, 0x00, 0x00, 0x00,
                                                0x01, 0x00, 0x00, 0x02, 0x00 };
+    const uint8_t write_same_cdb_2[CDB_SIZE] = { 0x41, 0x00, 0x00, 0x00, 0x00,
+                                               0x01, 0x00, 0x33, 0x00, 0x00 };
 
     qvirtio_scsi_start("-drive file=blkdebug::null-co://,if=none,id=dr1"
                        ",format=raw,file.align=4k "
@@ -231,7 +252,10 @@ static void test_unaligned_write_same(void)
     vs = qvirtio_scsi_pci_init(PCI_SLOT);
 
     g_assert_cmphex(0, ==,
-        virtio_scsi_do_command(vs, write_same_cdb, NULL, 0, buf, 512));
+        virtio_scsi_do_command(vs, write_same_cdb_1, NULL, 0, buf1, 512, NULL));
+
+    g_assert_cmphex(0, ==,
+        virtio_scsi_do_command(vs, write_same_cdb_2, NULL, 0, buf2, 512, NULL));
 
     qvirtio_scsi_pci_free(vs);
     qvirtio_scsi_stop();
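
The unit-attention check added to qvirtio_scsi_pci_init() reads the fixed-format sense buffer directly: byte 0 is the response code (0x70 for current, fixed-format sense), the low nibble of byte 2 is the sense key (UNIT ATTENTION), and bytes 12/13 are the ASC/ASCQ pair (0x29/0x00, "power on occurred"). A small sketch of that decoding, with hypothetical local names and assuming only the standard SPC field offsets:

    #include <stdint.h>

    struct sense_summary {
        uint8_t response_code;   /* sense[0] & 0x7f; 0x70 = current, fixed format */
        uint8_t sense_key;       /* sense[2] & 0x0f; 0x06 = UNIT ATTENTION */
        uint8_t asc;             /* sense[12]; 0x29 = power on / reset occurred */
        uint8_t ascq;            /* sense[13] */
    };

    static struct sense_summary decode_fixed_sense(const uint8_t *sense)
    {
        struct sense_summary s = {
            .response_code = sense[0] & 0x7f,
            .sense_key     = sense[2] & 0x0f,
            .asc           = sense[12],
            .ascq          = sense[13],
        };
        return s;
    }
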
diff --git a/translate-all.c b/translate-all.c
index 60a3d8b2bd..2a40530bba 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -222,6 +222,7 @@ static int cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
     gen_intermediate_code_pc(env, tb);
 
     if (tb->cflags & CF_USE_ICOUNT) {
+        assert(use_icount);
         /* Reset the cycle counter to the start of the block.  */
         cpu->icount_decr.u16.low += tb->icount;
         /* Clear the IO flag.  */
@@ -687,7 +688,7 @@ void tcg_exec_init(unsigned long tb_size)
     tcg_ctx.code_gen_ptr = tcg_ctx.code_gen_buffer;
     tcg_register_jit(tcg_ctx.code_gen_buffer, tcg_ctx.code_gen_buffer_size);
     page_init();
-#if !defined(CONFIG_USER_ONLY) || !defined(CONFIG_USE_GUEST_BASE)
+#if defined(CONFIG_SOFTMMU)
     /* There's no guest base to take into account, so go ahead and
        initialize the prologue now.  */
     tcg_prologue_init(&tcg_ctx);
@@ -1470,7 +1471,7 @@ static void tcg_handle_interrupt(CPUState *cpu, int mask)
 
     if (use_icount) {
         cpu->icount_decr.u16.high = 0xffff;
-        if (!cpu_can_do_io(cpu)
+        if (!cpu->can_do_io
             && (mask & ~old_mask) != 0) {
             cpu_abort(cpu, "Raised interrupt while not in I/O function");
         }
@@ -1533,6 +1534,14 @@ void cpu_io_recompile(CPUState *cpu, uintptr_t retaddr)
     cs_base = tb->cs_base;
     flags = tb->flags;
     tb_phys_invalidate(tb, -1);
+    if (tb->cflags & CF_NOCACHE) {
+        if (tb->orig_tb) {
+            /* Invalidate original TB if this TB was generated in
+             * cpu_exec_nocache() */
+            tb_phys_invalidate(tb->orig_tb, -1);
+        }
+        tb_free(tb);
+    }
     /* FIXME: In theory this could raise an exception.  In practice
        we have already translated the block once so it's probably ok.  */
     tb_gen_code(cpu, pc, cs_base, flags, cflags);
diff --git a/ui/vnc.c b/ui/vnc.c
index e26973a2b6..caf82f56f1 100644
--- a/ui/vnc.c
+++ b/ui/vnc.c
@@ -2872,7 +2872,7 @@ static int vnc_refresh_server_surface(VncDisplay *vd)
                     pixman_image_get_width(vd->server));
     int height = MIN(pixman_image_get_height(vd->guest.fb),
                      pixman_image_get_height(vd->server));
-    int cmp_bytes, server_stride, min_stride, guest_stride, y = 0;
+    int cmp_bytes, server_stride, line_bytes, guest_ll, guest_stride, y = 0;
     uint8_t *guest_row0 = NULL, *server_row0;
     VncState *vs;
     int has_dirty = 0;
@@ -2891,17 +2891,21 @@ static int vnc_refresh_server_surface(VncDisplay *vd)
      * Update server dirty map.
      */
     server_row0 = (uint8_t *)pixman_image_get_data(vd->server);
-    server_stride = guest_stride = pixman_image_get_stride(vd->server);
+    server_stride = guest_stride = guest_ll =
+        pixman_image_get_stride(vd->server);
     cmp_bytes = MIN(VNC_DIRTY_PIXELS_PER_BIT * VNC_SERVER_FB_BYTES,
                     server_stride);
     if (vd->guest.format != VNC_SERVER_FB_FORMAT) {
         int width = pixman_image_get_width(vd->server);
         tmpbuf = qemu_pixman_linebuf_create(VNC_SERVER_FB_FORMAT, width);
     } else {
+        int guest_bpp =
+            PIXMAN_FORMAT_BPP(pixman_image_get_format(vd->guest.fb));
         guest_row0 = (uint8_t *)pixman_image_get_data(vd->guest.fb);
         guest_stride = pixman_image_get_stride(vd->guest.fb);
+        guest_ll = pixman_image_get_width(vd->guest.fb) * ((guest_bpp + 7) / 8);
     }
-    min_stride = MIN(server_stride, guest_stride);
+    line_bytes = MIN(server_stride, guest_ll);
 
     for (;;) {
         int x;
@@ -2932,9 +2936,10 @@ static int vnc_refresh_server_surface(VncDisplay *vd)
             if (!test_and_clear_bit(x, vd->guest.dirty[y])) {
                 continue;
             }
-            if ((x + 1) * cmp_bytes > min_stride) {
-                _cmp_bytes = min_stride - x * cmp_bytes;
+            if ((x + 1) * cmp_bytes > line_bytes) {
+                _cmp_bytes = line_bytes - x * cmp_bytes;
             }
+            assert(_cmp_bytes >= 0);
             if (memcmp(server_ptr, guest_ptr, _cmp_bytes) == 0) {
                 continue;
             }
diff --git a/user-exec.c b/user-exec.c
index ed9a07f159..8ad89a466b 100644
--- a/user-exec.c
+++ b/user-exec.c
@@ -92,8 +92,8 @@ static inline int handle_cpu_signal(uintptr_t pc, unsigned long address,
     int ret;
 
 #if defined(DEBUG_SIGNAL)
-    qemu_printf("qemu: SIGSEGV pc=0x%08lx address=%08lx w=%d oldset=0x%08lx\n",
-                pc, address, is_write, *(unsigned long *)old_set);
+    printf("qemu: SIGSEGV pc=0x%08lx address=%08lx w=%d oldset=0x%08lx\n",
+           pc, address, is_write, *(unsigned long *)old_set);
 #endif
     /* XXX: locking issue */
     if (is_write && h2g_valid(address)
diff --git a/util/rcu.c b/util/rcu.c
index cdcad678b4..8ba304dc44 100644
--- a/util/rcu.c
+++ b/util/rcu.c
@@ -47,7 +47,8 @@
 unsigned long rcu_gp_ctr = RCU_GP_LOCKED;
 
 QemuEvent rcu_gp_event;
-static QemuMutex rcu_gp_lock;
+static QemuMutex rcu_registry_lock;
+static QemuMutex rcu_sync_lock;
 
 /*
  * Check whether a quiescent state was crossed between the beginning of
@@ -66,7 +67,7 @@ static inline int rcu_gp_ongoing(unsigned long *ctr)
  */
 __thread struct rcu_reader_data rcu_reader;
 
-/* Protected by rcu_gp_lock.  */
+/* Protected by rcu_registry_lock.  */
 typedef QLIST_HEAD(, rcu_reader_data) ThreadList;
 static ThreadList registry = QLIST_HEAD_INITIALIZER(registry);
 
@@ -114,10 +115,26 @@ static void wait_for_readers(void)
             break;
         }
 
-        /* Wait for one thread to report a quiescent state and
-         * try again.
+        /* Wait for one thread to report a quiescent state and try again.
+         * Release rcu_registry_lock, so rcu_(un)register_thread() doesn't
+         * have to wait too long.
+         *
+         * rcu_register_thread() may add nodes to &registry; it will not
+         * wake up synchronize_rcu, but that is okay because at least another
+         * thread must exit its RCU read-side critical section before
+         * synchronize_rcu is done.  The next iteration of the loop will
+         * move the new thread's rcu_reader from &registry to &qsreaders,
+         * because rcu_gp_ongoing() will return false.
+         *
+         * rcu_unregister_thread() may remove nodes from &qsreaders instead
+         * of &registry if it runs during qemu_event_wait.  That's okay;
+         * the node then will not be added back to &registry by QLIST_SWAP
+         * below.  The invariant is that the node is part of one list when
+         * rcu_registry_lock is released.
          */
+        qemu_mutex_unlock(&rcu_registry_lock);
         qemu_event_wait(&rcu_gp_event);
+        qemu_mutex_lock(&rcu_registry_lock);
     }
 
     /* put back the reader list in the registry */
@@ -126,7 +143,8 @@ static void wait_for_readers(void)
 
 void synchronize_rcu(void)
 {
-    qemu_mutex_lock(&rcu_gp_lock);
+    qemu_mutex_lock(&rcu_sync_lock);
+    qemu_mutex_lock(&rcu_registry_lock);
 
     if (!QLIST_EMPTY(&registry)) {
         /* In either case, the atomic_mb_set below blocks stores that free
@@ -149,7 +167,8 @@ void synchronize_rcu(void)
         wait_for_readers();
     }
 
-    qemu_mutex_unlock(&rcu_gp_lock);
+    qemu_mutex_unlock(&rcu_registry_lock);
+    qemu_mutex_unlock(&rcu_sync_lock);
 }
 
 
@@ -273,23 +292,24 @@ void call_rcu1(struct rcu_head *node, void (*func)(struct rcu_head *node))
 void rcu_register_thread(void)
 {
     assert(rcu_reader.ctr == 0);
-    qemu_mutex_lock(&rcu_gp_lock);
+    qemu_mutex_lock(&rcu_registry_lock);
     QLIST_INSERT_HEAD(&registry, &rcu_reader, node);
-    qemu_mutex_unlock(&rcu_gp_lock);
+    qemu_mutex_unlock(&rcu_registry_lock);
 }
 
 void rcu_unregister_thread(void)
 {
-    qemu_mutex_lock(&rcu_gp_lock);
+    qemu_mutex_lock(&rcu_registry_lock);
     QLIST_REMOVE(&rcu_reader, node);
-    qemu_mutex_unlock(&rcu_gp_lock);
+    qemu_mutex_unlock(&rcu_registry_lock);
 }
 
 static void rcu_init_complete(void)
 {
     QemuThread thread;
 
-    qemu_mutex_init(&rcu_gp_lock);
+    qemu_mutex_init(&rcu_registry_lock);
+    qemu_mutex_init(&rcu_sync_lock);
     qemu_event_init(&rcu_gp_event, true);
 
     qemu_event_init(&rcu_call_ready_event, false);
@@ -306,12 +326,14 @@ static void rcu_init_complete(void)
 #ifdef CONFIG_POSIX
 static void rcu_init_lock(void)
 {
-    qemu_mutex_lock(&rcu_gp_lock);
+    qemu_mutex_lock(&rcu_sync_lock);
+    qemu_mutex_lock(&rcu_registry_lock);
 }
 
 static void rcu_init_unlock(void)
 {
-    qemu_mutex_unlock(&rcu_gp_lock);
+    qemu_mutex_unlock(&rcu_registry_lock);
+    qemu_mutex_unlock(&rcu_sync_lock);
 }
 #endif
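
The rcu.c change splits the old rcu_gp_lock in two: rcu_sync_lock serializes writers for the whole grace period, while rcu_registry_lock only guards the reader registry and is dropped around qemu_event_wait() in wait_for_readers(), so rcu_register_thread()/rcu_unregister_thread() never block for a full grace period. A compilable sketch of that discipline follows, using pthread mutexes and a condition variable in place of QemuMutex/QemuEvent and a simple reader count in place of the registry walk; all names here are illustrative.

    #include <pthread.h>

    static pthread_mutex_t registry_lock = PTHREAD_MUTEX_INITIALIZER; /* reader registry */
    static pthread_mutex_t sync_lock = PTHREAD_MUTEX_INITIALIZER;     /* one grace period at a time */
    static pthread_cond_t reader_quiescent = PTHREAD_COND_INITIALIZER;
    static int active_readers;

    static void reader_enter(void)
    {
        pthread_mutex_lock(&registry_lock);    /* short critical section only */
        active_readers++;
        pthread_mutex_unlock(&registry_lock);
    }

    static void reader_exit(void)
    {
        pthread_mutex_lock(&registry_lock);
        active_readers--;
        pthread_cond_signal(&reader_quiescent);
        pthread_mutex_unlock(&registry_lock);
    }

    static void synchronize(void)
    {
        pthread_mutex_lock(&sync_lock);        /* writers serialize here */
        pthread_mutex_lock(&registry_lock);
        while (active_readers > 0) {
            /* cond_wait releases registry_lock while sleeping, mirroring the
             * unlock/wait/lock added around qemu_event_wait() above, so
             * reader_enter()/reader_exit() are never blocked for long. */
            pthread_cond_wait(&reader_quiescent, &registry_lock);
        }
        pthread_mutex_unlock(&registry_lock);
        pthread_mutex_unlock(&sync_lock);
    }
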