-rw-r--r--  COPYING.PYTHON  270
-rw-r--r--  MAINTAINERS  6
-rw-r--r--  block.c  96
-rw-r--r--  block/backup.c  2
-rw-r--r--  block/block-backend.c  5
-rw-r--r--  block/dirty-bitmap.c  57
-rw-r--r--  block/io.c  332
-rw-r--r--  block/mirror.c  613
-rw-r--r--  block/vvfat.c  1
-rw-r--r--  blockdev.c  9
-rw-r--r--  blockjob.c  23
-rwxr-xr-x  configure  11
-rw-r--r--  default-configs/ppc-softmmu.mak  2
-rw-r--r--  docs/devel/testing.rst  192
-rw-r--r--  hmp.c  7
-rw-r--r--  hw/arm/sysbus-fdt.c  7
-rw-r--r--  hw/arm/virt.c  2
-rw-r--r--  hw/core/bus.c  1
-rw-r--r--  hw/display/Makefile.objs  3
-rw-r--r--  hw/display/ramfb-standalone.c  62
-rw-r--r--  hw/display/ramfb.c  95
-rw-r--r--  hw/display/sm501.c  15
-rw-r--r--  hw/i386/pc_piix.c  2
-rw-r--r--  hw/i386/pc_q35.c  2
-rw-r--r--  hw/input/adb-kbd.c  29
-rw-r--r--  hw/input/adb-mouse.c  41
-rw-r--r--  hw/input/adb.c  7
-rw-r--r--  hw/input/ps2.c  2
-rw-r--r--  hw/intc/xics_kvm.c  10
-rw-r--r--  hw/isa/smc37c669-superio.c  2
-rw-r--r--  hw/misc/macio/Makefile.objs  2
-rw-r--r--  hw/misc/macio/gpio.c  231
-rw-r--r--  hw/misc/macio/macio.c  89
-rw-r--r--  hw/misc/macio/pmu.c  871
-rw-r--r--  hw/misc/macio/trace-events  28
-rw-r--r--  hw/misc/mos6522.c  5
-rw-r--r--  hw/ppc/mac.h  20
-rw-r--r--  hw/ppc/mac_newworld.c  84
-rw-r--r--  hw/ppc/pnv.c  36
-rw-r--r--  hw/ppc/pnv_core.c  93
-rw-r--r--  hw/ppc/spapr.c  22
-rw-r--r--  hw/ppc/spapr_caps.c  6
-rw-r--r--  hw/ppc/spapr_cpu_core.c  163
-rw-r--r--  hw/ppc/spapr_hcall.c  78
-rw-r--r--  hw/s390x/css.c  12
-rw-r--r--  hw/s390x/ipl.c  27
-rw-r--r--  hw/s390x/virtio-ccw.c  13
-rw-r--r--  hw/sparc/sun4m.c  67
-rw-r--r--  hw/sparc64/sun4u.c  18
-rw-r--r--  hw/usb/dev-smartcard-reader.c  1
-rw-r--r--  hw/usb/dev-storage.c  16
-rw-r--r--  hw/usb/dev-uas.c  2
-rw-r--r--  hw/vfio/ccw.c  35
-rw-r--r--  include/block/aio-wait.h  25
-rw-r--r--  include/block/block.h  31
-rw-r--r--  include/block/block_int.h  18
-rw-r--r--  include/block/blockjob_int.h  8
-rw-r--r--  include/block/dirty-bitmap.h  2
-rw-r--r--  include/exec/ramlist.h  4
-rw-r--r--  include/hw/display/ramfb.h  12
-rw-r--r--  include/hw/input/adb.h  1
-rw-r--r--  include/hw/misc/macio/gpio.h  47
-rw-r--r--  include/hw/misc/macio/macio.h  7
-rw-r--r--  include/hw/misc/macio/pmu.h  237
-rw-r--r--  include/hw/misc/mos6522.h  1
-rw-r--r--  include/hw/ppc/pnv_core.h  2
-rw-r--r--  include/hw/ppc/ppc.h  1
-rw-r--r--  include/hw/ppc/spapr_cpu_core.h  11
-rw-r--r--  include/migration/vmstate.h  2
-rw-r--r--  include/qemu/hbitmap.h  5
-rw-r--r--  include/qemu/job.h  15
-rw-r--r--  include/qemu/typedefs.h  1
-rw-r--r--  job.c  5
-rw-r--r--  migration/block-dirty-bitmap.c  3
-rw-r--r--  migration/migration.c  73
-rw-r--r--  migration/migration.h  11
-rw-r--r--  migration/qjson.h  2
-rw-r--r--  migration/ram.c  53
-rw-r--r--  migration/rdma.c  2
-rw-r--r--  migration/trace-events  2
-rw-r--r--  monitor.c  168
-rw-r--r--  pc-bios/openbios-ppc  bin  754936 -> 763128 bytes
-rw-r--r--  pc-bios/openbios-sparc32  bin  382048 -> 382048 bytes
-rw-r--r--  pc-bios/openbios-sparc64  bin  1593408 -> 1593408 bytes
-rw-r--r--  pc-bios/s390-ccw/Makefile  1
-rw-r--r--  pc-bios/s390-ccw/iplb.h  4
-rw-r--r--  pc-bios/s390-ccw/main.c  8
-rw-r--r--  pc-bios/s390-ccw/netboot.mak  13
-rw-r--r--  pc-bios/s390-ccw/netmain.c  226
-rw-r--r--  pc-bios/s390-ccw/sclp.c  2
-rw-r--r--  pc-bios/s390-ccw/sclp.h  2
-rw-r--r--  pc-bios/s390-netboot.img  bin  87872 -> 54944 bytes
-rw-r--r--  qapi/block-core.json  29
-rw-r--r--  qapi/migration.json  19
m---------  roms/SLOF  0
m---------  roms/openbios  0
-rw-r--r--  scripts/qemu.py  103
-rw-r--r--  stubs/fdset.c  2
-rw-r--r--  target/ppc/cpu.h  9
-rw-r--r--  target/ppc/kvm.c  58
-rw-r--r--  target/ppc/translate_init.inc.c  8
-rw-r--r--  target/s390x/cpu_models.c  1
-rw-r--r--  target/sparc/translate.c  111
-rwxr-xr-x  tests/qemu-iotests/151  120
-rw-r--r--  tests/qemu-iotests/151.out  5
-rw-r--r--  tests/qemu-iotests/group  1
-rw-r--r--  tests/test-bdrv-drain.c  705
-rw-r--r--  tests/test-hbitmap.c  38
-rw-r--r--  util/hbitmap.c  10
-rw-r--r--  util/osdep.c  3
-rw-r--r--  vl.c  7
116 files changed, 5135 insertions, 1119 deletions
diff --git a/COPYING.PYTHON b/COPYING.PYTHON
deleted file mode 100644
index 4d3f1ef276..0000000000
--- a/COPYING.PYTHON
+++ /dev/null
@@ -1,270 +0,0 @@
-A. HISTORY OF THE SOFTWARE
-==========================
-
-Python was created in the early 1990s by Guido van Rossum at Stichting
-Mathematisch Centrum (CWI, see http://www.cwi.nl) in the Netherlands
-as a successor of a language called ABC.  Guido remains Python's
-principal author, although it includes many contributions from others.
-
-In 1995, Guido continued his work on Python at the Corporation for
-National Research Initiatives (CNRI, see http://www.cnri.reston.va.us)
-in Reston, Virginia where he released several versions of the
-software.
-
-In May 2000, Guido and the Python core development team moved to
-BeOpen.com to form the BeOpen PythonLabs team.  In October of the same
-year, the PythonLabs team moved to Digital Creations (now Zope
-Corporation, see http://www.zope.com).  In 2001, the Python Software
-Foundation (PSF, see http://www.python.org/psf/) was formed, a
-non-profit organization created specifically to own Python-related
-Intellectual Property.  Zope Corporation is a sponsoring member of
-the PSF.
-
-All Python releases are Open Source (see http://www.opensource.org for
-the Open Source Definition).  Historically, most, but not all, Python
-releases have also been GPL-compatible; the table below summarizes
-the various releases.
-
-    Release         Derived     Year        Owner       GPL-
-                    from                                compatible? (1)
-
-    0.9.0 thru 1.2              1991-1995   CWI         yes
-    1.3 thru 1.5.2  1.2         1995-1999   CNRI        yes
-    1.6             1.5.2       2000        CNRI        no
-    2.0             1.6         2000        BeOpen.com  no
-    1.6.1           1.6         2001        CNRI        yes (2)
-    2.1             2.0+1.6.1   2001        PSF         no
-    2.0.1           2.0+1.6.1   2001        PSF         yes
-    2.1.1           2.1+2.0.1   2001        PSF         yes
-    2.2             2.1.1       2001        PSF         yes
-    2.1.2           2.1.1       2002        PSF         yes
-    2.1.3           2.1.2       2002        PSF         yes
-    2.2.1           2.2         2002        PSF         yes
-    2.2.2           2.2.1       2002        PSF         yes
-    2.2.3           2.2.2       2003        PSF         yes
-    2.3             2.2.2       2002-2003   PSF         yes
-    2.3.1           2.3         2002-2003   PSF         yes
-    2.3.2           2.3.1       2002-2003   PSF         yes
-    2.3.3           2.3.2       2002-2003   PSF         yes
-    2.3.4           2.3.3       2004        PSF         yes
-    2.3.5           2.3.4       2005        PSF         yes
-    2.4             2.3         2004        PSF         yes
-    2.4.1           2.4         2005        PSF         yes
-    2.4.2           2.4.1       2005        PSF         yes
-    2.4.3           2.4.2       2006        PSF         yes
-    2.5             2.4         2006        PSF         yes
-    2.7             2.6         2010        PSF         yes
-
-Footnotes:
-
-(1) GPL-compatible doesn't mean that we're distributing Python under
-    the GPL.  All Python licenses, unlike the GPL, let you distribute
-    a modified version without making your changes open source.  The
-    GPL-compatible licenses make it possible to combine Python with
-    other software that is released under the GPL; the others don't.
-
-(2) According to Richard Stallman, 1.6.1 is not GPL-compatible,
-    because its license has a choice of law clause.  According to
-    CNRI, however, Stallman's lawyer has told CNRI's lawyer that 1.6.1
-    is "not incompatible" with the GPL.
-
-Thanks to the many outside volunteers who have worked under Guido's
-direction to make these releases possible.
-
-
-B. TERMS AND CONDITIONS FOR ACCESSING OR OTHERWISE USING PYTHON
-===============================================================
-
-PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2
---------------------------------------------
-
-1. This LICENSE AGREEMENT is between the Python Software Foundation
-("PSF"), and the Individual or Organization ("Licensee") accessing and
-otherwise using this software ("Python") in source or binary form and
-its associated documentation.
-
-2. Subject to the terms and conditions of this License Agreement, PSF
-hereby grants Licensee a nonexclusive, royalty-free, world-wide
-license to reproduce, analyze, test, perform and/or display publicly,
-prepare derivative works, distribute, and otherwise use Python
-alone or in any derivative version, provided, however, that PSF's
-License Agreement and PSF's notice of copyright, i.e., "Copyright (c)
-2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation; All Rights
-Reserved" are retained in Python alone or in any derivative version 
-prepared by Licensee.
-
-3. In the event Licensee prepares a derivative work that is based on
-or incorporates Python or any part thereof, and wants to make
-the derivative work available to others as provided herein, then
-Licensee hereby agrees to include in any such work a brief summary of
-the changes made to Python.
-
-4. PSF is making Python available to Licensee on an "AS IS"
-basis.  PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
-IMPLIED.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND
-DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS
-FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT
-INFRINGE ANY THIRD PARTY RIGHTS.
-
-5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON
-FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS
-A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON,
-OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF.
-
-6. This License Agreement will automatically terminate upon a material
-breach of its terms and conditions.
-
-7. Nothing in this License Agreement shall be deemed to create any
-relationship of agency, partnership, or joint venture between PSF and
-Licensee.  This License Agreement does not grant permission to use PSF
-trademarks or trade name in a trademark sense to endorse or promote
-products or services of Licensee, or any third party.
-
-8. By copying, installing or otherwise using Python, Licensee
-agrees to be bound by the terms and conditions of this License
-Agreement.
-
-
-BEOPEN.COM LICENSE AGREEMENT FOR PYTHON 2.0
--------------------------------------------
-
-BEOPEN PYTHON OPEN SOURCE LICENSE AGREEMENT VERSION 1
-
-1. This LICENSE AGREEMENT is between BeOpen.com ("BeOpen"), having an
-office at 160 Saratoga Avenue, Santa Clara, CA 95051, and the
-Individual or Organization ("Licensee") accessing and otherwise using
-this software in source or binary form and its associated
-documentation ("the Software").
-
-2. Subject to the terms and conditions of this BeOpen Python License
-Agreement, BeOpen hereby grants Licensee a non-exclusive,
-royalty-free, world-wide license to reproduce, analyze, test, perform
-and/or display publicly, prepare derivative works, distribute, and
-otherwise use the Software alone or in any derivative version,
-provided, however, that the BeOpen Python License is retained in the
-Software, alone or in any derivative version prepared by Licensee.
-
-3. BeOpen is making the Software available to Licensee on an "AS IS"
-basis.  BEOPEN MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
-IMPLIED.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, BEOPEN MAKES NO AND
-DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS
-FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE SOFTWARE WILL NOT
-INFRINGE ANY THIRD PARTY RIGHTS.
-
-4. BEOPEN SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF THE
-SOFTWARE FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS
-AS A RESULT OF USING, MODIFYING OR DISTRIBUTING THE SOFTWARE, OR ANY
-DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF.
-
-5. This License Agreement will automatically terminate upon a material
-breach of its terms and conditions.
-
-6. This License Agreement shall be governed by and interpreted in all
-respects by the law of the State of California, excluding conflict of
-law provisions.  Nothing in this License Agreement shall be deemed to
-create any relationship of agency, partnership, or joint venture
-between BeOpen and Licensee.  This License Agreement does not grant
-permission to use BeOpen trademarks or trade names in a trademark
-sense to endorse or promote products or services of Licensee, or any
-third party.  As an exception, the "BeOpen Python" logos available at
-http://www.pythonlabs.com/logos.html may be used according to the
-permissions granted on that web page.
-
-7. By copying, installing or otherwise using the software, Licensee
-agrees to be bound by the terms and conditions of this License
-Agreement.
-
-
-CNRI LICENSE AGREEMENT FOR PYTHON 1.6.1
----------------------------------------
-
-1. This LICENSE AGREEMENT is between the Corporation for National
-Research Initiatives, having an office at 1895 Preston White Drive,
-Reston, VA 20191 ("CNRI"), and the Individual or Organization
-("Licensee") accessing and otherwise using Python 1.6.1 software in
-source or binary form and its associated documentation.
-
-2. Subject to the terms and conditions of this License Agreement, CNRI
-hereby grants Licensee a nonexclusive, royalty-free, world-wide
-license to reproduce, analyze, test, perform and/or display publicly,
-prepare derivative works, distribute, and otherwise use Python 1.6.1
-alone or in any derivative version, provided, however, that CNRI's
-License Agreement and CNRI's notice of copyright, i.e., "Copyright (c)
-1995-2001 Corporation for National Research Initiatives; All Rights
-Reserved" are retained in Python 1.6.1 alone or in any derivative
-version prepared by Licensee.  Alternately, in lieu of CNRI's License
-Agreement, Licensee may substitute the following text (omitting the
-quotes): "Python 1.6.1 is made available subject to the terms and
-conditions in CNRI's License Agreement.  This Agreement together with
-Python 1.6.1 may be located on the Internet using the following
-unique, persistent identifier (known as a handle): 1895.22/1013.  This
-Agreement may also be obtained from a proxy server on the Internet
-using the following URL: http://hdl.handle.net/1895.22/1013".
-
-3. In the event Licensee prepares a derivative work that is based on
-or incorporates Python 1.6.1 or any part thereof, and wants to make
-the derivative work available to others as provided herein, then
-Licensee hereby agrees to include in any such work a brief summary of
-the changes made to Python 1.6.1.
-
-4. CNRI is making Python 1.6.1 available to Licensee on an "AS IS"
-basis.  CNRI MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
-IMPLIED.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, CNRI MAKES NO AND
-DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS
-FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON 1.6.1 WILL NOT
-INFRINGE ANY THIRD PARTY RIGHTS.
-
-5. CNRI SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON
-1.6.1 FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS
-A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON 1.6.1,
-OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF.
-
-6. This License Agreement will automatically terminate upon a material
-breach of its terms and conditions.
-
-7. This License Agreement shall be governed by the federal
-intellectual property law of the United States, including without
-limitation the federal copyright law, and, to the extent such
-U.S. federal law does not apply, by the law of the Commonwealth of
-Virginia, excluding Virginia's conflict of law provisions.
-Notwithstanding the foregoing, with regard to derivative works based
-on Python 1.6.1 that incorporate non-separable material that was
-previously distributed under the GNU General Public License (GPL), the
-law of the Commonwealth of Virginia shall govern this License
-Agreement only as to issues arising under or with respect to
-Paragraphs 4, 5, and 7 of this License Agreement.  Nothing in this
-License Agreement shall be deemed to create any relationship of
-agency, partnership, or joint venture between CNRI and Licensee.  This
-License Agreement does not grant permission to use CNRI trademarks or
-trade name in a trademark sense to endorse or promote products or
-services of Licensee, or any third party.
-
-8. By clicking on the "ACCEPT" button where indicated, or by copying,
-installing or otherwise using Python 1.6.1, Licensee agrees to be
-bound by the terms and conditions of this License Agreement.
-
-        ACCEPT
-
-
-CWI LICENSE AGREEMENT FOR PYTHON 0.9.0 THROUGH 1.2
---------------------------------------------------
-
-Copyright (c) 1991 - 1995, Stichting Mathematisch Centrum Amsterdam,
-The Netherlands.  All rights reserved.
-
-Permission to use, copy, modify, and distribute this software and its
-documentation for any purpose and without fee is hereby granted,
-provided that the above copyright notice appear in all copies and that
-both that copyright notice and this permission notice appear in
-supporting documentation, and that the name of Stichting Mathematisch
-Centrum or CWI not be used in advertising or publicity pertaining to
-distribution of the software without specific, written prior
-permission.
-
-STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO
-THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
-FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE
-FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
-WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
-ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
-OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
diff --git a/MAINTAINERS b/MAINTAINERS
index 0fb5f38f9f..da91501c7a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1331,6 +1331,12 @@ F: hw/display/bochs-display.c
 F: include/hw/display/vga.h
 F: include/hw/display/bochs-vbe.h
 
+ramfb
+M: Gerd Hoffmann <kraxel@redhat.com>
+S: Maintained
+F: hw/display/ramfb*.c
+F: include/hw/display/ramfb.h
+
 virtio-gpu
 M: Gerd Hoffmann <kraxel@redhat.com>
 S: Maintained
diff --git a/block.c b/block.c
index afe30caac3..1b8147c1b3 100644
--- a/block.c
+++ b/block.c
@@ -333,6 +333,10 @@ BlockDriverState *bdrv_new(void)
 
     qemu_co_queue_init(&bs->flush_queue);
 
+    for (i = 0; i < bdrv_drain_all_count; i++) {
+        bdrv_drained_begin(bs);
+    }
+
     QTAILQ_INSERT_TAIL(&all_bdrv_states, bs, bs_list);
 
     return bs;
@@ -818,7 +822,13 @@ static char *bdrv_child_get_parent_desc(BdrvChild *c)
 static void bdrv_child_cb_drained_begin(BdrvChild *child)
 {
     BlockDriverState *bs = child->opaque;
-    bdrv_drained_begin(bs);
+    bdrv_do_drained_begin_quiesce(bs, NULL, false);
+}
+
+static bool bdrv_child_cb_drained_poll(BdrvChild *child)
+{
+    BlockDriverState *bs = child->opaque;
+    return bdrv_drain_poll(bs, false, NULL, false);
 }
 
 static void bdrv_child_cb_drained_end(BdrvChild *child)
@@ -902,9 +912,11 @@ static void bdrv_inherited_options(int *child_flags, QDict *child_options,
 }
 
 const BdrvChildRole child_file = {
+    .parent_is_bds   = true,
     .get_parent_desc = bdrv_child_get_parent_desc,
     .inherit_options = bdrv_inherited_options,
     .drained_begin   = bdrv_child_cb_drained_begin,
+    .drained_poll    = bdrv_child_cb_drained_poll,
     .drained_end     = bdrv_child_cb_drained_end,
     .attach          = bdrv_child_cb_attach,
     .detach          = bdrv_child_cb_detach,
@@ -926,9 +938,11 @@ static void bdrv_inherited_fmt_options(int *child_flags, QDict *child_options,
 }
 
 const BdrvChildRole child_format = {
+    .parent_is_bds   = true,
     .get_parent_desc = bdrv_child_get_parent_desc,
     .inherit_options = bdrv_inherited_fmt_options,
     .drained_begin   = bdrv_child_cb_drained_begin,
+    .drained_poll    = bdrv_child_cb_drained_poll,
     .drained_end     = bdrv_child_cb_drained_end,
     .attach          = bdrv_child_cb_attach,
     .detach          = bdrv_child_cb_detach,
@@ -1043,11 +1057,13 @@ static int bdrv_backing_update_filename(BdrvChild *c, BlockDriverState *base,
 }
 
 const BdrvChildRole child_backing = {
+    .parent_is_bds   = true,
     .get_parent_desc = bdrv_child_get_parent_desc,
     .attach          = bdrv_backing_attach,
     .detach          = bdrv_backing_detach,
     .inherit_options = bdrv_backing_options,
     .drained_begin   = bdrv_child_cb_drained_begin,
+    .drained_poll    = bdrv_child_cb_drained_poll,
     .drained_end     = bdrv_child_cb_drained_end,
     .inactivate      = bdrv_child_cb_inactivate,
     .update_filename = bdrv_backing_update_filename,
@@ -1152,7 +1168,7 @@ static int bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv,
                             int open_flags, Error **errp)
 {
     Error *local_err = NULL;
-    int ret;
+    int i, ret;
 
     bdrv_assign_node_name(bs, node_name, &local_err);
     if (local_err) {
@@ -1200,6 +1216,12 @@ static int bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv,
     assert(bdrv_min_mem_align(bs) != 0);
     assert(is_power_of_2(bs->bl.request_alignment));
 
+    for (i = 0; i < bs->quiesce_counter; i++) {
+        if (drv->bdrv_co_drain_begin) {
+            drv->bdrv_co_drain_begin(bs);
+        }
+    }
+
     return 0;
 open_failed:
     bs->drv = NULL;
@@ -2021,7 +2043,12 @@ static void bdrv_replace_child_noperm(BdrvChild *child,
             child->role->detach(child);
         }
         if (old_bs->quiesce_counter && child->role->drained_end) {
-            for (i = 0; i < old_bs->quiesce_counter; i++) {
+            int num = old_bs->quiesce_counter;
+            if (child->role->parent_is_bds) {
+                num -= bdrv_drain_all_count;
+            }
+            assert(num >= 0);
+            for (i = 0; i < num; i++) {
                 child->role->drained_end(child);
             }
         }
@@ -2033,7 +2060,12 @@ static void bdrv_replace_child_noperm(BdrvChild *child,
     if (new_bs) {
         QLIST_INSERT_HEAD(&new_bs->parents, child, next_parent);
         if (new_bs->quiesce_counter && child->role->drained_begin) {
-            for (i = 0; i < new_bs->quiesce_counter; i++) {
+            int num = new_bs->quiesce_counter;
+            if (child->role->parent_is_bds) {
+                num -= bdrv_drain_all_count;
+            }
+            assert(num >= 0);
+            for (i = 0; i < num; i++) {
                 child->role->drained_begin(child);
             }
         }
@@ -3395,16 +3427,39 @@ static bool should_update_child(BdrvChild *c, BlockDriverState *to)
         return false;
     }
 
-    if (c->role == &child_backing) {
-        /* If @from is a backing file of @to, ignore the child to avoid
-         * creating a loop. We only want to change the pointer of other
-         * parents. */
-        QLIST_FOREACH(to_c, &to->children, next) {
-            if (to_c == c) {
-                break;
-            }
-        }
-        if (to_c) {
+    /* If the child @c belongs to the BDS @to, replacing the current
+     * c->bs by @to would mean to create a loop.
+     *
+     * Such a case occurs when appending a BDS to a backing chain.
+     * For instance, imagine the following chain:
+     *
+     *   guest device -> node A -> further backing chain...
+     *
+     * Now we create a new BDS B which we want to put on top of this
+     * chain, so we first attach A as its backing node:
+     *
+     *                   node B
+     *                     |
+     *                     v
+     *   guest device -> node A -> further backing chain...
+     *
+     * Finally we want to replace A by B.  When doing that, we want to
+     * replace all pointers to A by pointers to B -- except for the
+     * pointer from B because (1) that would create a loop, and (2)
+     * that pointer should simply stay intact:
+     *
+     *   guest device -> node B
+     *                     |
+     *                     v
+     *                   node A -> further backing chain...
+     *
+     * In general, when replacing a node A (c->bs) by a node B (@to),
+     * if A is a child of B, that means we cannot replace A by B there
+     * because that would create a loop.  Silently detaching A from B
+     * is also not really an option.  So overall just leaving A in
+     * place there is the most sensible choice. */
+    QLIST_FOREACH(to_c, &to->children, next) {
+        if (to_c == c) {
             return false;
         }
     }
@@ -3430,6 +3485,7 @@ void bdrv_replace_node(BlockDriverState *from, BlockDriverState *to,
 
     /* Put all parents into @list and calculate their cumulative permissions */
     QLIST_FOREACH_SAFE(c, &from->parents, next_parent, next) {
+        assert(c->bs == from);
         if (!should_update_child(c, to)) {
             continue;
         }
@@ -4037,6 +4093,14 @@ BlockDriverState *bdrv_next_node(BlockDriverState *bs)
     return QTAILQ_NEXT(bs, node_list);
 }
 
+BlockDriverState *bdrv_next_all_states(BlockDriverState *bs)
+{
+    if (!bs) {
+        return QTAILQ_FIRST(&all_bdrv_states);
+    }
+    return QTAILQ_NEXT(bs, bs_list);
+}
+
 const char *bdrv_get_node_name(const BlockDriverState *bs)
 {
     return bs->node_name;
@@ -4948,7 +5012,7 @@ void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
     AioContext *ctx = bdrv_get_aio_context(bs);
 
     aio_disable_external(ctx);
-    bdrv_parent_drained_begin(bs, NULL);
+    bdrv_parent_drained_begin(bs, NULL, false);
     bdrv_drain(bs); /* ensure there are no in-flight requests */
 
     while (aio_poll(ctx, false)) {
@@ -4962,7 +5026,7 @@ void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
      */
     aio_context_acquire(new_context);
     bdrv_attach_aio_context(bs, new_context);
-    bdrv_parent_drained_end(bs, NULL);
+    bdrv_parent_drained_end(bs, NULL, false);
     aio_enable_external(ctx);
     aio_context_release(new_context);
 }
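
The comment added to should_update_child() above explains why, when replacing a node A by a node B, any parent pointer that originates from B itself has to be left alone to avoid a loop. As a rough illustration only (not part of the patch; permission handling and error paths are omitted, and the real work is done by bdrv_replace_node() together with the static bdrv_replace_child() helper in block.c), the core of that rule looks like this:

static void replace_parents_sketch(BlockDriverState *from, BlockDriverState *to)
{
    BdrvChild *c, *next;

    QLIST_FOREACH_SAFE(c, &from->parents, next_parent, next) {
        BdrvChild *to_c;
        bool child_of_to = false;

        /* If this parent pointer belongs to @to itself (i.e. @from is a
         * child of @to), re-pointing it at @to would create a loop, so
         * leave it in place. */
        QLIST_FOREACH(to_c, &to->children, next) {
            if (to_c == c) {
                child_of_to = true;
                break;
            }
        }
        if (child_of_to) {
            continue;
        }

        /* The real code also collects and rechecks permissions before
         * switching the child over. */
        bdrv_replace_child(c, to);
    }
}
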
diff --git a/block/backup.c b/block/backup.c
index 5661435675..d18be40caf 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -354,7 +354,7 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
     HBitmapIter hbi;
 
     hbitmap_iter_init(&hbi, job->copy_bitmap, 0);
-    while ((cluster = hbitmap_iter_next(&hbi)) != -1) {
+    while ((cluster = hbitmap_iter_next(&hbi, true)) != -1) {
         do {
             if (yield_and_check(job)) {
                 return 0;
diff --git a/block/block-backend.c b/block/block-backend.c
index 2d1a3463e8..6b75bca317 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -767,6 +767,11 @@ void blk_remove_bs(BlockBackend *blk)
 
     blk_update_root_state(blk);
 
+    /* bdrv_root_unref_child() will cause blk->root to become stale and may
+     * switch to a completion coroutine later on. Let's drain all I/O here
+     * to avoid that and a potential QEMU crash.
+     */
+    blk_drain(blk);
     bdrv_root_unref_child(blk->root);
     blk->root = NULL;
 }
diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c
index 383d742cdb..db1782ec1f 100644
--- a/block/dirty-bitmap.c
+++ b/block/dirty-bitmap.c
@@ -519,7 +519,62 @@ void bdrv_dirty_iter_free(BdrvDirtyBitmapIter *iter)
 
 int64_t bdrv_dirty_iter_next(BdrvDirtyBitmapIter *iter)
 {
-    return hbitmap_iter_next(&iter->hbi);
+    return hbitmap_iter_next(&iter->hbi, true);
+}
+
+/**
+ * Return the next consecutively dirty area in the dirty bitmap
+ * belonging to the given iterator @iter.
+ *
+ * @max_offset: Maximum value that may be returned for
+ *              *offset + *bytes
+ * @offset:     Will contain the start offset of the next dirty area
+ * @bytes:      Will contain the length of the next dirty area
+ *
+ * Returns: True if a dirty area could be found before max_offset
+ *          (which means that *offset and *bytes then contain valid
+ *          values), false otherwise.
+ *
+ * Note that @iter is never advanced if false is returned.  If an area
+ * is found (which means that true is returned), it will be advanced
+ * past that area.
+ */
+bool bdrv_dirty_iter_next_area(BdrvDirtyBitmapIter *iter, uint64_t max_offset,
+                               uint64_t *offset, int *bytes)
+{
+    uint32_t granularity = bdrv_dirty_bitmap_granularity(iter->bitmap);
+    uint64_t gran_max_offset;
+    int64_t ret;
+    int size;
+
+    if (max_offset == iter->bitmap->size) {
+        /* If max_offset points to the image end, round it up by the
+         * bitmap granularity */
+        gran_max_offset = ROUND_UP(max_offset, granularity);
+    } else {
+        gran_max_offset = max_offset;
+    }
+
+    ret = hbitmap_iter_next(&iter->hbi, false);
+    if (ret < 0 || ret + granularity > gran_max_offset) {
+        return false;
+    }
+
+    *offset = ret;
+    size = 0;
+
+    assert(granularity <= INT_MAX);
+
+    do {
+        /* Advance iterator */
+        ret = hbitmap_iter_next(&iter->hbi, true);
+        size += granularity;
+    } while (ret + granularity <= gran_max_offset &&
+             hbitmap_iter_next(&iter->hbi, false) == ret + granularity &&
+             size <= INT_MAX - granularity);
+
+    *bytes = MIN(size, max_offset - *offset);
+    return true;
 }
 
 /* Called within bdrv_dirty_bitmap_lock..unlock */
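
The new bdrv_dirty_iter_next_area() documented above returns consecutive dirty areas and only advances the iterator when an area was actually found. A minimal usage sketch (not part of the patch; handle_area() is a hypothetical consumer and the iterator is assumed to have been created by the caller):

static void walk_dirty_areas_sketch(BdrvDirtyBitmapIter *iter,
                                    uint64_t end_offset)
{
    uint64_t offset;
    int bytes;

    /* Returns false once no dirty area starts before end_offset; on
     * success the iterator has already been advanced past the area. */
    while (bdrv_dirty_iter_next_area(iter, end_offset, &offset, &bytes)) {
        handle_area(offset, bytes);
    }
}
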
diff --git a/block/io.c b/block/io.c
index b7beaeeb9f..ef4fedd364 100644
--- a/block/io.c
+++ b/block/io.c
@@ -38,15 +38,18 @@
 /* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
 #define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)
 
+static AioWait drain_all_aio_wait;
+
 static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
     int64_t offset, int bytes, BdrvRequestFlags flags);
 
-void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore)
+void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore,
+                               bool ignore_bds_parents)
 {
     BdrvChild *c, *next;
 
     QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
-        if (c == ignore) {
+        if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
             continue;
         }
         if (c->role->drained_begin) {
@@ -55,12 +58,13 @@ void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore)
     }
 }
 
-void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore)
+void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore,
+                             bool ignore_bds_parents)
 {
     BdrvChild *c, *next;
 
     QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
-        if (c == ignore) {
+        if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
             continue;
         }
         if (c->role->drained_end) {
@@ -69,6 +73,24 @@ void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore)
     }
 }
 
+static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore,
+                                     bool ignore_bds_parents)
+{
+    BdrvChild *c, *next;
+    bool busy = false;
+
+    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
+        if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
+            continue;
+        }
+        if (c->role->drained_poll) {
+            busy |= c->role->drained_poll(c);
+        }
+    }
+
+    return busy;
+}
+
 static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
 {
     dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
@@ -148,7 +170,9 @@ typedef struct {
     bool done;
     bool begin;
     bool recursive;
+    bool poll;
     BdrvChild *parent;
+    bool ignore_bds_parents;
 } BdrvCoDrainData;
 
 static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
@@ -164,67 +188,83 @@ static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
 
     /* Set data->done before reading bs->wakeup.  */
     atomic_mb_set(&data->done, true);
-    bdrv_wakeup(bs);
+    bdrv_dec_in_flight(bs);
+
+    if (data->begin) {
+        g_free(data);
+    }
 }
 
 /* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
-static void bdrv_drain_invoke(BlockDriverState *bs, bool begin, bool recursive)
+static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
 {
-    BdrvChild *child, *tmp;
-    BdrvCoDrainData data = { .bs = bs, .done = false, .begin = begin};
+    BdrvCoDrainData *data;
 
     if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
             (!begin && !bs->drv->bdrv_co_drain_end)) {
         return;
     }
 
-    data.co = qemu_coroutine_create(bdrv_drain_invoke_entry, &data);
-    bdrv_coroutine_enter(bs, data.co);
-    BDRV_POLL_WHILE(bs, !data.done);
+    data = g_new(BdrvCoDrainData, 1);
+    *data = (BdrvCoDrainData) {
+        .bs = bs,
+        .done = false,
+        .begin = begin
+    };
 
-    if (recursive) {
-        QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
-            bdrv_drain_invoke(child->bs, begin, true);
-        }
+    /* Make sure the driver callback completes during the polling phase for
+     * drain_begin. */
+    bdrv_inc_in_flight(bs);
+    data->co = qemu_coroutine_create(bdrv_drain_invoke_entry, data);
+    aio_co_schedule(bdrv_get_aio_context(bs), data->co);
+
+    if (!begin) {
+        BDRV_POLL_WHILE(bs, !data->done);
+        g_free(data);
     }
 }
 
-static bool bdrv_drain_recurse(BlockDriverState *bs)
+/* Returns true if BDRV_POLL_WHILE() should go into a blocking aio_poll() */
+bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
+                     BdrvChild *ignore_parent, bool ignore_bds_parents)
 {
-    BdrvChild *child, *tmp;
-    bool waited;
+    BdrvChild *child, *next;
 
-    /* Wait for drained requests to finish */
-    waited = BDRV_POLL_WHILE(bs, atomic_read(&bs->in_flight) > 0);
-
-    QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
-        BlockDriverState *bs = child->bs;
-        bool in_main_loop =
-            qemu_get_current_aio_context() == qemu_get_aio_context();
-        assert(bs->refcnt > 0);
-        if (in_main_loop) {
-            /* In case the recursive bdrv_drain_recurse processes a
-             * block_job_defer_to_main_loop BH and modifies the graph,
-             * let's hold a reference to bs until we are done.
-             *
-             * IOThread doesn't have such a BH, and it is not safe to call
-             * bdrv_unref without BQL, so skip doing it there.
-             */
-            bdrv_ref(bs);
-        }
-        waited |= bdrv_drain_recurse(bs);
-        if (in_main_loop) {
-            bdrv_unref(bs);
+    if (bdrv_parent_drained_poll(bs, ignore_parent, ignore_bds_parents)) {
+        return true;
+    }
+
+    if (atomic_read(&bs->in_flight)) {
+        return true;
+    }
+
+    if (recursive) {
+        assert(!ignore_bds_parents);
+        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
+            if (bdrv_drain_poll(child->bs, recursive, child, false)) {
+                return true;
+            }
         }
     }
 
-    return waited;
+    return false;
+}
+
+static bool bdrv_drain_poll_top_level(BlockDriverState *bs, bool recursive,
+                                      BdrvChild *ignore_parent)
+{
+    /* Execute pending BHs first and check everything else only after the BHs
+     * have executed. */
+    while (aio_poll(bs->aio_context, false));
+
+    return bdrv_drain_poll(bs, recursive, ignore_parent, false);
 }
 
 static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
-                                  BdrvChild *parent);
+                                  BdrvChild *parent, bool ignore_bds_parents,
+                                  bool poll);
 static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
-                                BdrvChild *parent);
+                                BdrvChild *parent, bool ignore_bds_parents);
 
 static void bdrv_co_drain_bh_cb(void *opaque)
 {
@@ -232,11 +272,18 @@ static void bdrv_co_drain_bh_cb(void *opaque)
     Coroutine *co = data->co;
     BlockDriverState *bs = data->bs;
 
-    bdrv_dec_in_flight(bs);
-    if (data->begin) {
-        bdrv_do_drained_begin(bs, data->recursive, data->parent);
+    if (bs) {
+        bdrv_dec_in_flight(bs);
+        if (data->begin) {
+            bdrv_do_drained_begin(bs, data->recursive, data->parent,
+                                  data->ignore_bds_parents, data->poll);
+        } else {
+            bdrv_do_drained_end(bs, data->recursive, data->parent,
+                                data->ignore_bds_parents);
+        }
     } else {
-        bdrv_do_drained_end(bs, data->recursive, data->parent);
+        assert(data->begin);
+        bdrv_drain_all_begin();
     }
 
     data->done = true;
@@ -245,7 +292,9 @@ static void bdrv_co_drain_bh_cb(void *opaque)
 
 static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
                                                 bool begin, bool recursive,
-                                                BdrvChild *parent)
+                                                BdrvChild *parent,
+                                                bool ignore_bds_parents,
+                                                bool poll)
 {
     BdrvCoDrainData data;
 
@@ -260,8 +309,12 @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
         .begin = begin,
         .recursive = recursive,
         .parent = parent,
+        .ignore_bds_parents = ignore_bds_parents,
+        .poll = poll,
     };
-    bdrv_inc_in_flight(bs);
+    if (bs) {
+        bdrv_inc_in_flight(bs);
+    }
     aio_bh_schedule_oneshot(bdrv_get_aio_context(bs),
                             bdrv_co_drain_bh_cb, &data);
 
@@ -271,79 +324,106 @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
     assert(data.done);
 }
 
-void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
-                           BdrvChild *parent)
+void bdrv_do_drained_begin_quiesce(BlockDriverState *bs,
+                                   BdrvChild *parent, bool ignore_bds_parents)
 {
-    BdrvChild *child, *next;
-
-    if (qemu_in_coroutine()) {
-        bdrv_co_yield_to_drain(bs, true, recursive, parent);
-        return;
-    }
+    assert(!qemu_in_coroutine());
 
     /* Stop things in parent-to-child order */
     if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
         aio_disable_external(bdrv_get_aio_context(bs));
     }
 
-    bdrv_parent_drained_begin(bs, parent);
-    bdrv_drain_invoke(bs, true, false);
-    bdrv_drain_recurse(bs);
+    bdrv_parent_drained_begin(bs, parent, ignore_bds_parents);
+    bdrv_drain_invoke(bs, true);
+}
+
+static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
+                                  BdrvChild *parent, bool ignore_bds_parents,
+                                  bool poll)
+{
+    BdrvChild *child, *next;
+
+    if (qemu_in_coroutine()) {
+        bdrv_co_yield_to_drain(bs, true, recursive, parent, ignore_bds_parents,
+                               poll);
+        return;
+    }
+
+    bdrv_do_drained_begin_quiesce(bs, parent, ignore_bds_parents);
 
     if (recursive) {
+        assert(!ignore_bds_parents);
         bs->recursive_quiesce_counter++;
         QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
-            bdrv_do_drained_begin(child->bs, true, child);
+            bdrv_do_drained_begin(child->bs, true, child, ignore_bds_parents,
+                                  false);
         }
     }
+
+    /*
+     * Wait for drained requests to finish.
+     *
+     * Calling BDRV_POLL_WHILE() only once for the top-level node is okay: The
+     * call is needed so things in this AioContext can make progress even
+     * though we don't return to the main AioContext loop - this automatically
+     * includes other nodes in the same AioContext and therefore all child
+     * nodes.
+     */
+    if (poll) {
+        assert(!ignore_bds_parents);
+        BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, recursive, parent));
+    }
 }
 
 void bdrv_drained_begin(BlockDriverState *bs)
 {
-    bdrv_do_drained_begin(bs, false, NULL);
+    bdrv_do_drained_begin(bs, false, NULL, false, true);
 }
 
 void bdrv_subtree_drained_begin(BlockDriverState *bs)
 {
-    bdrv_do_drained_begin(bs, true, NULL);
+    bdrv_do_drained_begin(bs, true, NULL, false, true);
 }
 
-void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
-                         BdrvChild *parent)
+static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
+                                BdrvChild *parent, bool ignore_bds_parents)
 {
     BdrvChild *child, *next;
     int old_quiesce_counter;
 
     if (qemu_in_coroutine()) {
-        bdrv_co_yield_to_drain(bs, false, recursive, parent);
+        bdrv_co_yield_to_drain(bs, false, recursive, parent, ignore_bds_parents,
+                               false);
         return;
     }
     assert(bs->quiesce_counter > 0);
     old_quiesce_counter = atomic_fetch_dec(&bs->quiesce_counter);
 
     /* Re-enable things in child-to-parent order */
-    bdrv_drain_invoke(bs, false, false);
-    bdrv_parent_drained_end(bs, parent);
+    bdrv_drain_invoke(bs, false);
+    bdrv_parent_drained_end(bs, parent, ignore_bds_parents);
     if (old_quiesce_counter == 1) {
         aio_enable_external(bdrv_get_aio_context(bs));
     }
 
     if (recursive) {
+        assert(!ignore_bds_parents);
         bs->recursive_quiesce_counter--;
         QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
-            bdrv_do_drained_end(child->bs, true, child);
+            bdrv_do_drained_end(child->bs, true, child, ignore_bds_parents);
         }
     }
 }
 
 void bdrv_drained_end(BlockDriverState *bs)
 {
-    bdrv_do_drained_end(bs, false, NULL);
+    bdrv_do_drained_end(bs, false, NULL, false);
 }
 
 void bdrv_subtree_drained_end(BlockDriverState *bs)
 {
-    bdrv_do_drained_end(bs, true, NULL);
+    bdrv_do_drained_end(bs, true, NULL, false);
 }
 
 void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
@@ -351,7 +431,7 @@ void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
     int i;
 
     for (i = 0; i < new_parent->recursive_quiesce_counter; i++) {
-        bdrv_do_drained_begin(child->bs, true, child);
+        bdrv_do_drained_begin(child->bs, true, child, false, true);
     }
 }
 
@@ -360,7 +440,7 @@ void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
     int i;
 
     for (i = 0; i < old_parent->recursive_quiesce_counter; i++) {
-        bdrv_do_drained_end(child->bs, true, child);
+        bdrv_do_drained_end(child->bs, true, child, false);
     }
 }
 
@@ -370,10 +450,6 @@ void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
  *
  * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
  * AioContext.
- *
- * Only this BlockDriverState's AioContext is run, so in-flight requests must
- * not depend on events in other AioContexts.  In that case, use
- * bdrv_drain_all() instead.
  */
 void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
 {
@@ -388,6 +464,39 @@ void bdrv_drain(BlockDriverState *bs)
     bdrv_drained_end(bs);
 }
 
+static void bdrv_drain_assert_idle(BlockDriverState *bs)
+{
+    BdrvChild *child, *next;
+
+    assert(atomic_read(&bs->in_flight) == 0);
+    QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
+        bdrv_drain_assert_idle(child->bs);
+    }
+}
+
+unsigned int bdrv_drain_all_count = 0;
+
+static bool bdrv_drain_all_poll(void)
+{
+    BlockDriverState *bs = NULL;
+    bool result = false;
+
+    /* Execute pending BHs first (may modify the graph) and check everything
+     * else only after the BHs have executed. */
+    while (aio_poll(qemu_get_aio_context(), false));
+
+    /* bdrv_drain_poll() can't make changes to the graph and we are holding the
+     * main AioContext lock, so iterating bdrv_next_all_states() is safe. */
+    while ((bs = bdrv_next_all_states(bs))) {
+        AioContext *aio_context = bdrv_get_aio_context(bs);
+        aio_context_acquire(aio_context);
+        result |= bdrv_drain_poll(bs, false, NULL, true);
+        aio_context_release(aio_context);
+    }
+
+    return result;
+}
+
 /*
  * Wait for pending requests to complete across all BlockDriverStates
  *
@@ -402,73 +511,51 @@ void bdrv_drain(BlockDriverState *bs)
  */
 void bdrv_drain_all_begin(void)
 {
-    /* Always run first iteration so any pending completion BHs run */
-    bool waited = true;
-    BlockDriverState *bs;
-    BdrvNextIterator it;
-    GSList *aio_ctxs = NULL, *ctx;
+    BlockDriverState *bs = NULL;
+
+    if (qemu_in_coroutine()) {
+        bdrv_co_yield_to_drain(NULL, true, false, NULL, true, true);
+        return;
+    }
 
-    /* BDRV_POLL_WHILE() for a node can only be called from its own I/O thread
-     * or the main loop AioContext. We potentially use BDRV_POLL_WHILE() on
-     * nodes in several different AioContexts, so make sure we're in the main
-     * context. */
+    /* AIO_WAIT_WHILE() with a NULL context can only be called from the main
+     * loop AioContext, so make sure we're in the main context. */
     assert(qemu_get_current_aio_context() == qemu_get_aio_context());
+    assert(bdrv_drain_all_count < INT_MAX);
+    bdrv_drain_all_count++;
 
-    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
+    /* Quiesce all nodes, without polling in-flight requests yet. The graph
+     * cannot change during this loop. */
+    while ((bs = bdrv_next_all_states(bs))) {
         AioContext *aio_context = bdrv_get_aio_context(bs);
 
-        /* Stop things in parent-to-child order */
         aio_context_acquire(aio_context);
-        aio_disable_external(aio_context);
-        bdrv_parent_drained_begin(bs, NULL);
-        bdrv_drain_invoke(bs, true, true);
+        bdrv_do_drained_begin(bs, false, NULL, true, false);
         aio_context_release(aio_context);
-
-        if (!g_slist_find(aio_ctxs, aio_context)) {
-            aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
-        }
     }
 
-    /* Note that completion of an asynchronous I/O operation can trigger any
-     * number of other I/O operations on other devices---for example a
-     * coroutine can submit an I/O request to another device in response to
-     * request completion.  Therefore we must keep looping until there was no
-     * more activity rather than simply draining each device independently.
-     */
-    while (waited) {
-        waited = false;
-
-        for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) {
-            AioContext *aio_context = ctx->data;
+    /* Now poll the in-flight requests */
+    AIO_WAIT_WHILE(&drain_all_aio_wait, NULL, bdrv_drain_all_poll());
 
-            aio_context_acquire(aio_context);
-            for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
-                if (aio_context == bdrv_get_aio_context(bs)) {
-                    waited |= bdrv_drain_recurse(bs);
-                }
-            }
-            aio_context_release(aio_context);
-        }
+    while ((bs = bdrv_next_all_states(bs))) {
+        bdrv_drain_assert_idle(bs);
     }
-
-    g_slist_free(aio_ctxs);
 }
 
 void bdrv_drain_all_end(void)
 {
-    BlockDriverState *bs;
-    BdrvNextIterator it;
+    BlockDriverState *bs = NULL;
 
-    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
+    while ((bs = bdrv_next_all_states(bs))) {
         AioContext *aio_context = bdrv_get_aio_context(bs);
 
-        /* Re-enable things in child-to-parent order */
         aio_context_acquire(aio_context);
-        bdrv_drain_invoke(bs, false, true);
-        bdrv_parent_drained_end(bs, NULL);
-        aio_enable_external(aio_context);
+        bdrv_do_drained_end(bs, false, NULL, true);
         aio_context_release(aio_context);
     }
+
+    assert(bdrv_drain_all_count > 0);
+    bdrv_drain_all_count--;
 }
 
 void bdrv_drain_all(void)
@@ -591,6 +678,7 @@ void bdrv_inc_in_flight(BlockDriverState *bs)
 void bdrv_wakeup(BlockDriverState *bs)
 {
     aio_wait_kick(bdrv_get_aio_wait(bs));
+    aio_wait_kick(&drain_all_aio_wait);
 }
 
 void bdrv_dec_in_flight(BlockDriverState *bs)
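
The reworked drain code above still serves the usual drained-section pattern: between bdrv_drained_begin() and bdrv_drained_end(), parents are quiesced and all in-flight requests have been polled to completion, so the node and the graph around it can be modified safely. A minimal sketch of that pattern (not part of the patch):

static void graph_change_sketch(BlockDriverState *bs)
{
    bdrv_drained_begin(bs);   /* quiesce parents and poll in-flight I/O */

    /* ... safely modify the node or reconfigure the graph here ... */

    bdrv_drained_end(bs);     /* resume request processing */
}
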
diff --git a/block/mirror.c b/block/mirror.c
index 435268bbbf..61bd9f3cf1 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -13,6 +13,8 @@
 
 #include "qemu/osdep.h"
 #include "qemu/cutils.h"
+#include "qemu/coroutine.h"
+#include "qemu/range.h"
 #include "trace.h"
 #include "block/blockjob_int.h"
 #include "block/block_int.h"
@@ -33,11 +35,12 @@ typedef struct MirrorBuffer {
     QSIMPLEQ_ENTRY(MirrorBuffer) next;
 } MirrorBuffer;
 
+typedef struct MirrorOp MirrorOp;
+
 typedef struct MirrorBlockJob {
     BlockJob common;
     BlockBackend *target;
     BlockDriverState *mirror_top_bs;
-    BlockDriverState *source;
     BlockDriverState *base;
 
     /* The name of the graph node to replace */
@@ -48,8 +51,12 @@ typedef struct MirrorBlockJob {
     Error *replace_blocker;
     bool is_none_mode;
     BlockMirrorBackingMode backing_mode;
+    MirrorCopyMode copy_mode;
     BlockdevOnError on_source_error, on_target_error;
     bool synced;
+    /* Set when the target is synced (dirty bitmap is clean, nothing
+     * in flight) and the job is running in active mode */
+    bool actively_synced;
     bool should_complete;
     int64_t granularity;
     size_t buf_size;
@@ -65,25 +72,47 @@ typedef struct MirrorBlockJob {
     unsigned long *in_flight_bitmap;
     int in_flight;
     int64_t bytes_in_flight;
+    QTAILQ_HEAD(MirrorOpList, MirrorOp) ops_in_flight;
     int ret;
     bool unmap;
-    bool waiting_for_io;
     int target_cluster_size;
     int max_iov;
     bool initial_zeroing_ongoing;
+    int in_active_write_counter;
 } MirrorBlockJob;
 
-typedef struct MirrorOp {
+typedef struct MirrorBDSOpaque {
+    MirrorBlockJob *job;
+} MirrorBDSOpaque;
+
+struct MirrorOp {
     MirrorBlockJob *s;
     QEMUIOVector qiov;
     int64_t offset;
     uint64_t bytes;
-} MirrorOp;
+
+    /* The pointee is set by mirror_co_read(), mirror_co_zero(), and
+     * mirror_co_discard() before yielding for the first time */
+    int64_t *bytes_handled;
+
+    bool is_pseudo_op;
+    bool is_active_write;
+    CoQueue waiting_requests;
+
+    QTAILQ_ENTRY(MirrorOp) next;
+};
+
+typedef enum MirrorMethod {
+    MIRROR_METHOD_COPY,
+    MIRROR_METHOD_ZERO,
+    MIRROR_METHOD_DISCARD,
+} MirrorMethod;
 
 static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read,
                                             int error)
 {
     s->synced = false;
+    s->actively_synced = false;
     if (read) {
         return block_job_error_action(&s->common, s->on_source_error,
                                       true, error);
@@ -93,7 +122,42 @@ static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read,
     }
 }
 
-static void mirror_iteration_done(MirrorOp *op, int ret)
+static void coroutine_fn mirror_wait_on_conflicts(MirrorOp *self,
+                                                  MirrorBlockJob *s,
+                                                  uint64_t offset,
+                                                  uint64_t bytes)
+{
+    uint64_t self_start_chunk = offset / s->granularity;
+    uint64_t self_end_chunk = DIV_ROUND_UP(offset + bytes, s->granularity);
+    uint64_t self_nb_chunks = self_end_chunk - self_start_chunk;
+
+    while (find_next_bit(s->in_flight_bitmap, self_end_chunk,
+                         self_start_chunk) < self_end_chunk &&
+           s->ret >= 0)
+    {
+        MirrorOp *op;
+
+        QTAILQ_FOREACH(op, &s->ops_in_flight, next) {
+            uint64_t op_start_chunk = op->offset / s->granularity;
+            uint64_t op_nb_chunks = DIV_ROUND_UP(op->offset + op->bytes,
+                                                 s->granularity) -
+                                    op_start_chunk;
+
+            if (op == self) {
+                continue;
+            }
+
+            if (ranges_overlap(self_start_chunk, self_nb_chunks,
+                               op_start_chunk, op_nb_chunks))
+            {
+                qemu_co_queue_wait(&op->waiting_requests, NULL);
+                break;
+            }
+        }
+    }
+}
+
+static void coroutine_fn mirror_iteration_done(MirrorOp *op, int ret)
 {
     MirrorBlockJob *s = op->s;
     struct iovec *iov;
@@ -113,7 +177,9 @@ static void mirror_iteration_done(MirrorOp *op, int ret)
 
     chunk_num = op->offset / s->granularity;
     nb_chunks = DIV_ROUND_UP(op->bytes, s->granularity);
+
     bitmap_clear(s->in_flight_bitmap, chunk_num, nb_chunks);
+    QTAILQ_REMOVE(&s->ops_in_flight, op, next);
     if (ret >= 0) {
         if (s->cow_bitmap) {
             bitmap_set(s->cow_bitmap, chunk_num, nb_chunks);
@@ -123,16 +189,13 @@ static void mirror_iteration_done(MirrorOp *op, int ret)
         }
     }
     qemu_iovec_destroy(&op->qiov);
-    g_free(op);
 
-    if (s->waiting_for_io) {
-        qemu_coroutine_enter(s->common.job.co);
-    }
+    qemu_co_queue_restart_all(&op->waiting_requests);
+    g_free(op);
 }
 
-static void mirror_write_complete(void *opaque, int ret)
+static void coroutine_fn mirror_write_complete(MirrorOp *op, int ret)
 {
-    MirrorOp *op = opaque;
     MirrorBlockJob *s = op->s;
 
     aio_context_acquire(blk_get_aio_context(s->common.blk));
@@ -149,9 +212,8 @@ static void mirror_write_complete(void *opaque, int ret)
     aio_context_release(blk_get_aio_context(s->common.blk));
 }
 
-static void mirror_read_complete(void *opaque, int ret)
+static void coroutine_fn mirror_read_complete(MirrorOp *op, int ret)
 {
-    MirrorOp *op = opaque;
     MirrorBlockJob *s = op->s;
 
     aio_context_acquire(blk_get_aio_context(s->common.blk));
@@ -166,8 +228,9 @@ static void mirror_read_complete(void *opaque, int ret)
 
         mirror_iteration_done(op, ret);
     } else {
-        blk_aio_pwritev(s->target, op->offset, &op->qiov,
-                        0, mirror_write_complete, op);
+        ret = blk_co_pwritev(s->target, op->offset,
+                             op->qiov.size, &op->qiov, 0);
+        mirror_write_complete(op, ret);
     }
     aio_context_release(blk_get_aio_context(s->common.blk));
 }
@@ -216,68 +279,80 @@ static int mirror_cow_align(MirrorBlockJob *s, int64_t *offset,
     return ret;
 }
 
-static inline void mirror_wait_for_io(MirrorBlockJob *s)
+static inline void mirror_wait_for_any_operation(MirrorBlockJob *s, bool active)
 {
-    assert(!s->waiting_for_io);
-    s->waiting_for_io = true;
-    qemu_coroutine_yield();
-    s->waiting_for_io = false;
+    MirrorOp *op;
+
+    QTAILQ_FOREACH(op, &s->ops_in_flight, next) {
+        /* Do not wait on pseudo ops, because it may in turn wait on
+         * some other operation to start, which may in fact be the
+         * caller of this function.  Since there is only one pseudo op
+         * at any given time, we will always find some real operation
+         * to wait on. */
+        if (!op->is_pseudo_op && op->is_active_write == active) {
+            qemu_co_queue_wait(&op->waiting_requests, NULL);
+            return;
+        }
+    }
+    abort();
 }
 
-/* Submit async read while handling COW.
- * Returns: The number of bytes copied after and including offset,
- *          excluding any bytes copied prior to offset due to alignment.
- *          This will be @bytes if no alignment is necessary, or
- *          (new_end - offset) if tail is rounded up or down due to
- *          alignment or buffer limit.
+static inline void mirror_wait_for_free_in_flight_slot(MirrorBlockJob *s)
+{
+    /* Only non-active operations use up in-flight slots */
+    mirror_wait_for_any_operation(s, false);
+}
+
+/* Perform a mirror copy operation.
+ *
+ * *op->bytes_handled is set to the number of bytes copied after and
+ * including offset, excluding any bytes copied prior to offset due
+ * to alignment.  This will be op->bytes if no alignment is necessary,
+ * or (new_end - op->offset) if the tail is rounded up or down due to
+ * alignment or buffer limit.
  */
-static uint64_t mirror_do_read(MirrorBlockJob *s, int64_t offset,
-                               uint64_t bytes)
+static void coroutine_fn mirror_co_read(void *opaque)
 {
-    BlockBackend *source = s->common.blk;
+    MirrorOp *op = opaque;
+    MirrorBlockJob *s = op->s;
     int nb_chunks;
     uint64_t ret;
-    MirrorOp *op;
     uint64_t max_bytes;
 
     max_bytes = s->granularity * s->max_iov;
 
     /* We can only handle as much as buf_size at a time. */
-    bytes = MIN(s->buf_size, MIN(max_bytes, bytes));
-    assert(bytes);
-    assert(bytes < BDRV_REQUEST_MAX_BYTES);
-    ret = bytes;
+    op->bytes = MIN(s->buf_size, MIN(max_bytes, op->bytes));
+    assert(op->bytes);
+    assert(op->bytes < BDRV_REQUEST_MAX_BYTES);
+    *op->bytes_handled = op->bytes;
 
     if (s->cow_bitmap) {
-        ret += mirror_cow_align(s, &offset, &bytes);
+        *op->bytes_handled += mirror_cow_align(s, &op->offset, &op->bytes);
     }
-    assert(bytes <= s->buf_size);
+    /* Cannot exceed BDRV_REQUEST_MAX_BYTES + INT_MAX */
+    assert(*op->bytes_handled <= UINT_MAX);
+    assert(op->bytes <= s->buf_size);
     /* The offset is granularity-aligned because:
      * 1) Caller passes in aligned values;
      * 2) mirror_cow_align is used only when target cluster is larger. */
-    assert(QEMU_IS_ALIGNED(offset, s->granularity));
+    assert(QEMU_IS_ALIGNED(op->offset, s->granularity));
     /* The range is sector-aligned, since bdrv_getlength() rounds up. */
-    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
-    nb_chunks = DIV_ROUND_UP(bytes, s->granularity);
+    assert(QEMU_IS_ALIGNED(op->bytes, BDRV_SECTOR_SIZE));
+    nb_chunks = DIV_ROUND_UP(op->bytes, s->granularity);
 
     while (s->buf_free_count < nb_chunks) {
-        trace_mirror_yield_in_flight(s, offset, s->in_flight);
-        mirror_wait_for_io(s);
+        trace_mirror_yield_in_flight(s, op->offset, s->in_flight);
+        mirror_wait_for_free_in_flight_slot(s);
     }
 
-    /* Allocate a MirrorOp that is used as an AIO callback.  */
-    op = g_new(MirrorOp, 1);
-    op->s = s;
-    op->offset = offset;
-    op->bytes = bytes;
-
     /* Now make a QEMUIOVector taking enough granularity-sized chunks
      * from s->buf_free.
      */
     qemu_iovec_init(&op->qiov, nb_chunks);
     while (nb_chunks-- > 0) {
         MirrorBuffer *buf = QSIMPLEQ_FIRST(&s->buf_free);
-        size_t remaining = bytes - op->qiov.size;
+        size_t remaining = op->bytes - op->qiov.size;
 
         QSIMPLEQ_REMOVE_HEAD(&s->buf_free, next);
         s->buf_free_count--;
@@ -286,44 +361,92 @@ static uint64_t mirror_do_read(MirrorBlockJob *s, int64_t offset,
 
     /* Copy the dirty cluster.  */
     s->in_flight++;
-    s->bytes_in_flight += bytes;
-    trace_mirror_one_iteration(s, offset, bytes);
+    s->bytes_in_flight += op->bytes;
+    trace_mirror_one_iteration(s, op->offset, op->bytes);
 
-    blk_aio_preadv(source, offset, &op->qiov, 0, mirror_read_complete, op);
-    return ret;
+    ret = bdrv_co_preadv(s->mirror_top_bs->backing, op->offset, op->bytes,
+                         &op->qiov, 0);
+    mirror_read_complete(op, ret);
 }
 
-static void mirror_do_zero_or_discard(MirrorBlockJob *s,
-                                      int64_t offset,
-                                      uint64_t bytes,
-                                      bool is_discard)
+static void coroutine_fn mirror_co_zero(void *opaque)
 {
-    MirrorOp *op;
+    MirrorOp *op = opaque;
+    int ret;
 
-    /* Allocate a MirrorOp that is used as an AIO callback. The qiov is zeroed
-     * so the freeing in mirror_iteration_done is nop. */
-    op = g_new0(MirrorOp, 1);
-    op->s = s;
-    op->offset = offset;
-    op->bytes = bytes;
+    op->s->in_flight++;
+    op->s->bytes_in_flight += op->bytes;
+    *op->bytes_handled = op->bytes;
 
-    s->in_flight++;
-    s->bytes_in_flight += bytes;
-    if (is_discard) {
-        blk_aio_pdiscard(s->target, offset,
-                         op->bytes, mirror_write_complete, op);
-    } else {
-        blk_aio_pwrite_zeroes(s->target, offset,
-                              op->bytes, s->unmap ? BDRV_REQ_MAY_UNMAP : 0,
-                              mirror_write_complete, op);
+    ret = blk_co_pwrite_zeroes(op->s->target, op->offset, op->bytes,
+                               op->s->unmap ? BDRV_REQ_MAY_UNMAP : 0);
+    mirror_write_complete(op, ret);
+}
+
+static void coroutine_fn mirror_co_discard(void *opaque)
+{
+    MirrorOp *op = opaque;
+    int ret;
+
+    op->s->in_flight++;
+    op->s->bytes_in_flight += op->bytes;
+    *op->bytes_handled = op->bytes;
+
+    ret = blk_co_pdiscard(op->s->target, op->offset, op->bytes);
+    mirror_write_complete(op, ret);
+}
+
+static unsigned mirror_perform(MirrorBlockJob *s, int64_t offset,
+                               unsigned bytes, MirrorMethod mirror_method)
+{
+    MirrorOp *op;
+    Coroutine *co;
+    int64_t bytes_handled = -1;
+
+    op = g_new(MirrorOp, 1);
+    *op = (MirrorOp){
+        .s              = s,
+        .offset         = offset,
+        .bytes          = bytes,
+        .bytes_handled  = &bytes_handled,
+    };
+    qemu_co_queue_init(&op->waiting_requests);
+
+    switch (mirror_method) {
+    case MIRROR_METHOD_COPY:
+        co = qemu_coroutine_create(mirror_co_read, op);
+        break;
+    case MIRROR_METHOD_ZERO:
+        co = qemu_coroutine_create(mirror_co_zero, op);
+        break;
+    case MIRROR_METHOD_DISCARD:
+        co = qemu_coroutine_create(mirror_co_discard, op);
+        break;
+    default:
+        abort();
     }
+
+    QTAILQ_INSERT_TAIL(&s->ops_in_flight, op, next);
+    qemu_coroutine_enter(co);
+    /* At this point, ownership of op has been moved to the coroutine
+     * and the object may already be freed */
+
+    /* Assert that this value has been set */
+    assert(bytes_handled >= 0);
+
+    /* Same assertion as in mirror_co_read() (and for mirror_co_zero()
+     * and mirror_co_discard(), bytes_handled == op->bytes, which
+     * is the @bytes parameter given to this function) */
+    assert(bytes_handled <= UINT_MAX);
+    return bytes_handled;
 }
 
 static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
 {
-    BlockDriverState *source = s->source;
-    int64_t offset, first_chunk;
-    uint64_t delay_ns = 0;
+    BlockDriverState *source = s->mirror_top_bs->backing->bs;
+    MirrorOp *pseudo_op;
+    int64_t offset;
+    uint64_t delay_ns = 0, ret = 0;
     /* At least the first dirty chunk is mirrored in one iteration. */
     int nb_chunks = 1;
     bool write_zeroes_ok = bdrv_can_write_zeroes_with_unmap(blk_bs(s->target));
@@ -339,11 +462,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
     }
     bdrv_dirty_bitmap_unlock(s->dirty_bitmap);
 
-    first_chunk = offset / s->granularity;
-    while (test_bit(first_chunk, s->in_flight_bitmap)) {
-        trace_mirror_yield_in_flight(s, offset, s->in_flight);
-        mirror_wait_for_io(s);
-    }
+    mirror_wait_on_conflicts(NULL, s, offset, 1);
 
     job_pause_point(&s->common.job);
 
@@ -380,16 +499,27 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
                                    nb_chunks * s->granularity);
     bdrv_dirty_bitmap_unlock(s->dirty_bitmap);
 
+    /* Before claiming an area in the in-flight bitmap, we have to
+     * create a MirrorOp for it so that conflicting requests can wait
+     * for it.  mirror_perform() will create the real MirrorOps later;
+     * for now we just create a pseudo operation that will wake up all
+     * conflicting requests once all real operations have been
+     * launched. */
+    pseudo_op = g_new(MirrorOp, 1);
+    *pseudo_op = (MirrorOp){
+        .offset         = offset,
+        .bytes          = nb_chunks * s->granularity,
+        .is_pseudo_op   = true,
+    };
+    qemu_co_queue_init(&pseudo_op->waiting_requests);
+    QTAILQ_INSERT_TAIL(&s->ops_in_flight, pseudo_op, next);
+
     bitmap_set(s->in_flight_bitmap, offset / s->granularity, nb_chunks);
     while (nb_chunks > 0 && offset < s->bdev_length) {
         int ret;
         int64_t io_bytes;
         int64_t io_bytes_acct;
-        enum MirrorMethod {
-            MIRROR_METHOD_COPY,
-            MIRROR_METHOD_ZERO,
-            MIRROR_METHOD_DISCARD
-        } mirror_method = MIRROR_METHOD_COPY;
+        MirrorMethod mirror_method = MIRROR_METHOD_COPY;
 
         assert(!(offset % s->granularity));
         ret = bdrv_block_status_above(source, NULL, offset,
@@ -419,37 +549,34 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
 
         while (s->in_flight >= MAX_IN_FLIGHT) {
             trace_mirror_yield_in_flight(s, offset, s->in_flight);
-            mirror_wait_for_io(s);
+            mirror_wait_for_free_in_flight_slot(s);
         }
 
         if (s->ret < 0) {
-            return 0;
+            ret = 0;
+            goto fail;
         }
 
         io_bytes = mirror_clip_bytes(s, offset, io_bytes);
-        switch (mirror_method) {
-        case MIRROR_METHOD_COPY:
-            io_bytes = io_bytes_acct = mirror_do_read(s, offset, io_bytes);
-            break;
-        case MIRROR_METHOD_ZERO:
-        case MIRROR_METHOD_DISCARD:
-            mirror_do_zero_or_discard(s, offset, io_bytes,
-                                      mirror_method == MIRROR_METHOD_DISCARD);
-            if (write_zeroes_ok) {
-                io_bytes_acct = 0;
-            } else {
-                io_bytes_acct = io_bytes;
-            }
-            break;
-        default:
-            abort();
+        io_bytes = mirror_perform(s, offset, io_bytes, mirror_method);
+        if (mirror_method != MIRROR_METHOD_COPY && write_zeroes_ok) {
+            io_bytes_acct = 0;
+        } else {
+            io_bytes_acct = io_bytes;
         }
         assert(io_bytes);
         offset += io_bytes;
         nb_chunks -= DIV_ROUND_UP(io_bytes, s->granularity);
         delay_ns = block_job_ratelimit_get_delay(&s->common, io_bytes_acct);
     }
-    return delay_ns;
+
+    ret = delay_ns;
+fail:
+    QTAILQ_REMOVE(&s->ops_in_flight, pseudo_op, next);
+    qemu_co_queue_restart_all(&pseudo_op->waiting_requests);
+    g_free(pseudo_op);
+
+    return ret;
 }
 
 static void mirror_free_init(MirrorBlockJob *s)
@@ -476,7 +603,7 @@ static void mirror_free_init(MirrorBlockJob *s)
 static void mirror_wait_for_all_io(MirrorBlockJob *s)
 {
     while (s->in_flight > 0) {
-        mirror_wait_for_io(s);
+        mirror_wait_for_free_in_flight_slot(s);
     }
 }
 
@@ -489,8 +616,9 @@ static void mirror_exit(Job *job, void *opaque)
     MirrorBlockJob *s = container_of(job, MirrorBlockJob, common.job);
     BlockJob *bjob = &s->common;
     MirrorExitData *data = opaque;
+    MirrorBDSOpaque *bs_opaque = s->mirror_top_bs->opaque;
     AioContext *replace_aio_context = NULL;
-    BlockDriverState *src = s->source;
+    BlockDriverState *src = s->mirror_top_bs->backing->bs;
     BlockDriverState *target_bs = blk_bs(s->target);
     BlockDriverState *mirror_top_bs = s->mirror_top_bs;
     Error *local_err = NULL;
@@ -581,6 +709,7 @@ static void mirror_exit(Job *job, void *opaque)
     blk_set_perm(bjob->blk, 0, BLK_PERM_ALL, &error_abort);
     blk_insert_bs(bjob->blk, mirror_top_bs, &error_abort);
 
+    bs_opaque->job = NULL;
     job_completed(job, data->ret, NULL);
 
     g_free(data);
@@ -605,7 +734,7 @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s)
 {
     int64_t offset;
     BlockDriverState *base = s->base;
-    BlockDriverState *bs = s->source;
+    BlockDriverState *bs = s->mirror_top_bs->backing->bs;
     BlockDriverState *target_bs = blk_bs(s->target);
     int ret;
     int64_t count;
@@ -631,11 +760,11 @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s)
             if (s->in_flight >= MAX_IN_FLIGHT) {
                 trace_mirror_yield(s, UINT64_MAX, s->buf_free_count,
                                    s->in_flight);
-                mirror_wait_for_io(s);
+                mirror_wait_for_free_in_flight_slot(s);
                 continue;
             }
 
-            mirror_do_zero_or_discard(s, offset, bytes, false);
+            mirror_perform(s, offset, bytes, MIRROR_METHOD_ZERO);
             offset += bytes;
         }
 
@@ -687,7 +816,7 @@ static void coroutine_fn mirror_run(void *opaque)
 {
     MirrorBlockJob *s = opaque;
     MirrorExitData *data;
-    BlockDriverState *bs = s->source;
+    BlockDriverState *bs = s->mirror_top_bs->backing->bs;
     BlockDriverState *target_bs = blk_bs(s->target);
     bool need_drain = true;
     int64_t length;
@@ -730,6 +859,7 @@ static void coroutine_fn mirror_run(void *opaque)
         /* Transition to the READY state and wait for complete. */
         job_transition_to_ready(&s->common.job);
         s->synced = true;
+        s->actively_synced = true;
         while (!job_is_cancelled(&s->common.job) && !s->should_complete) {
             job_yield(&s->common.job);
         }
@@ -781,6 +911,12 @@ static void coroutine_fn mirror_run(void *opaque)
         int64_t cnt, delta;
         bool should_complete;
 
+        /* Do not start passive operations while there are active
+         * writes in progress */
+        while (s->in_active_write_counter) {
+            mirror_wait_for_any_operation(s, true);
+        }
+
         if (s->ret < 0) {
             ret = s->ret;
             goto immediate_exit;
@@ -804,7 +940,7 @@ static void coroutine_fn mirror_run(void *opaque)
             if (s->in_flight >= MAX_IN_FLIGHT || s->buf_free_count == 0 ||
                 (cnt == 0 && s->in_flight > 0)) {
                 trace_mirror_yield(s, cnt, s->buf_free_count, s->in_flight);
-                mirror_wait_for_io(s);
+                mirror_wait_for_free_in_flight_slot(s);
                 continue;
             } else if (cnt != 0) {
                 delay_ns = mirror_iteration(s);
@@ -826,6 +962,9 @@ static void coroutine_fn mirror_run(void *opaque)
                  */
                 job_transition_to_ready(&s->common.job);
                 s->synced = true;
+                if (s->copy_mode != MIRROR_COPY_MODE_BACKGROUND) {
+                    s->actively_synced = true;
+                }
             }
 
             should_complete = s->should_complete ||
@@ -964,6 +1103,12 @@ static void mirror_pause(Job *job)
     mirror_wait_for_all_io(s);
 }
 
+static bool mirror_drained_poll(BlockJob *job)
+{
+    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
+    return !!s->in_flight;
+}
+
 static void mirror_attached_aio_context(BlockJob *job, AioContext *new_context)
 {
     MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
@@ -997,6 +1142,7 @@ static const BlockJobDriver mirror_job_driver = {
         .pause                  = mirror_pause,
         .complete               = mirror_complete,
     },
+    .drained_poll           = mirror_drained_poll,
     .attached_aio_context   = mirror_attached_aio_context,
     .drain                  = mirror_drain,
 };
@@ -1012,20 +1158,237 @@ static const BlockJobDriver commit_active_job_driver = {
         .pause                  = mirror_pause,
         .complete               = mirror_complete,
     },
+    .drained_poll           = mirror_drained_poll,
     .attached_aio_context   = mirror_attached_aio_context,
     .drain                  = mirror_drain,
 };
 
+static void do_sync_target_write(MirrorBlockJob *job, MirrorMethod method,
+                                 uint64_t offset, uint64_t bytes,
+                                 QEMUIOVector *qiov, int flags)
+{
+    BdrvDirtyBitmapIter *iter;
+    QEMUIOVector target_qiov;
+    uint64_t dirty_offset;
+    int dirty_bytes;
+
+    if (qiov) {
+        qemu_iovec_init(&target_qiov, qiov->niov);
+    }
+
+    iter = bdrv_dirty_iter_new(job->dirty_bitmap);
+    bdrv_set_dirty_iter(iter, offset);
+
+    while (true) {
+        bool valid_area;
+        int ret;
+
+        bdrv_dirty_bitmap_lock(job->dirty_bitmap);
+        valid_area = bdrv_dirty_iter_next_area(iter, offset + bytes,
+                                               &dirty_offset, &dirty_bytes);
+        if (!valid_area) {
+            bdrv_dirty_bitmap_unlock(job->dirty_bitmap);
+            break;
+        }
+
+        bdrv_reset_dirty_bitmap_locked(job->dirty_bitmap,
+                                       dirty_offset, dirty_bytes);
+        bdrv_dirty_bitmap_unlock(job->dirty_bitmap);
+
+        job_progress_increase_remaining(&job->common.job, dirty_bytes);
+
+        assert(dirty_offset - offset <= SIZE_MAX);
+        if (qiov) {
+            qemu_iovec_reset(&target_qiov);
+            qemu_iovec_concat(&target_qiov, qiov,
+                              dirty_offset - offset, dirty_bytes);
+        }
+
+        switch (method) {
+        case MIRROR_METHOD_COPY:
+            ret = blk_co_pwritev(job->target, dirty_offset, dirty_bytes,
+                                 qiov ? &target_qiov : NULL, flags);
+            break;
+
+        case MIRROR_METHOD_ZERO:
+            assert(!qiov);
+            ret = blk_co_pwrite_zeroes(job->target, dirty_offset, dirty_bytes,
+                                       flags);
+            break;
+
+        case MIRROR_METHOD_DISCARD:
+            assert(!qiov);
+            ret = blk_co_pdiscard(job->target, dirty_offset, dirty_bytes);
+            break;
+
+        default:
+            abort();
+        }
+
+        if (ret >= 0) {
+            job_progress_update(&job->common.job, dirty_bytes);
+        } else {
+            BlockErrorAction action;
+
+            bdrv_set_dirty_bitmap(job->dirty_bitmap, dirty_offset, dirty_bytes);
+            job->actively_synced = false;
+
+            action = mirror_error_action(job, false, -ret);
+            if (action == BLOCK_ERROR_ACTION_REPORT) {
+                if (!job->ret) {
+                    job->ret = ret;
+                }
+                break;
+            }
+        }
+    }
+
+    bdrv_dirty_iter_free(iter);
+    if (qiov) {
+        qemu_iovec_destroy(&target_qiov);
+    }
+}
+
+static MirrorOp *coroutine_fn active_write_prepare(MirrorBlockJob *s,
+                                                   uint64_t offset,
+                                                   uint64_t bytes)
+{
+    MirrorOp *op;
+    uint64_t start_chunk = offset / s->granularity;
+    uint64_t end_chunk = DIV_ROUND_UP(offset + bytes, s->granularity);
+
+    op = g_new(MirrorOp, 1);
+    *op = (MirrorOp){
+        .s                  = s,
+        .offset             = offset,
+        .bytes              = bytes,
+        .is_active_write    = true,
+    };
+    qemu_co_queue_init(&op->waiting_requests);
+    QTAILQ_INSERT_TAIL(&s->ops_in_flight, op, next);
+
+    s->in_active_write_counter++;
+
+    mirror_wait_on_conflicts(op, s, offset, bytes);
+
+    bitmap_set(s->in_flight_bitmap, start_chunk, end_chunk - start_chunk);
+
+    return op;
+}
+
+static void coroutine_fn active_write_settle(MirrorOp *op)
+{
+    uint64_t start_chunk = op->offset / op->s->granularity;
+    uint64_t end_chunk = DIV_ROUND_UP(op->offset + op->bytes,
+                                      op->s->granularity);
+
+    if (!--op->s->in_active_write_counter && op->s->actively_synced) {
+        BdrvChild *source = op->s->mirror_top_bs->backing;
+
+        if (QLIST_FIRST(&source->bs->parents) == source &&
+            QLIST_NEXT(source, next_parent) == NULL)
+        {
+            /* Assert that we are back in sync once all active write
+             * operations are settled.
+             * Note that we can only assert this if the mirror node
+             * is the source node's only parent. */
+            assert(!bdrv_get_dirty_count(op->s->dirty_bitmap));
+        }
+    }
+    bitmap_clear(op->s->in_flight_bitmap, start_chunk, end_chunk - start_chunk);
+    QTAILQ_REMOVE(&op->s->ops_in_flight, op, next);
+    qemu_co_queue_restart_all(&op->waiting_requests);
+    g_free(op);
+}
+
 static int coroutine_fn bdrv_mirror_top_preadv(BlockDriverState *bs,
     uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags)
 {
     return bdrv_co_preadv(bs->backing, offset, bytes, qiov, flags);
 }
 
+static int coroutine_fn bdrv_mirror_top_do_write(BlockDriverState *bs,
+    MirrorMethod method, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov,
+    int flags)
+{
+    MirrorOp *op = NULL;
+    MirrorBDSOpaque *s = bs->opaque;
+    int ret = 0;
+    bool copy_to_target;
+
+    copy_to_target = s->job->ret >= 0 &&
+                     s->job->copy_mode == MIRROR_COPY_MODE_WRITE_BLOCKING;
+
+    if (copy_to_target) {
+        op = active_write_prepare(s->job, offset, bytes);
+    }
+
+    switch (method) {
+    case MIRROR_METHOD_COPY:
+        ret = bdrv_co_pwritev(bs->backing, offset, bytes, qiov, flags);
+        break;
+
+    case MIRROR_METHOD_ZERO:
+        ret = bdrv_co_pwrite_zeroes(bs->backing, offset, bytes, flags);
+        break;
+
+    case MIRROR_METHOD_DISCARD:
+        ret = bdrv_co_pdiscard(bs->backing->bs, offset, bytes);
+        break;
+
+    default:
+        abort();
+    }
+
+    if (ret < 0) {
+        goto out;
+    }
+
+    if (copy_to_target) {
+        do_sync_target_write(s->job, method, offset, bytes, qiov, flags);
+    }
+
+out:
+    if (copy_to_target) {
+        active_write_settle(op);
+    }
+    return ret;
+}
+
 static int coroutine_fn bdrv_mirror_top_pwritev(BlockDriverState *bs,
     uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags)
 {
-    return bdrv_co_pwritev(bs->backing, offset, bytes, qiov, flags);
+    MirrorBDSOpaque *s = bs->opaque;
+    QEMUIOVector bounce_qiov;
+    void *bounce_buf;
+    int ret = 0;
+    bool copy_to_target;
+
+    copy_to_target = s->job->ret >= 0 &&
+                     s->job->copy_mode == MIRROR_COPY_MODE_WRITE_BLOCKING;
+
+    if (copy_to_target) {
+        /* The guest might concurrently modify the data to write; but
+         * the data on source and destination must match, so we have
+         * to use a bounce buffer if we are going to write to the
+         * target now. */
+        bounce_buf = qemu_blockalign(bs, bytes);
+        iov_to_buf_full(qiov->iov, qiov->niov, 0, bounce_buf, bytes);
+
+        qemu_iovec_init(&bounce_qiov, 1);
+        qemu_iovec_add(&bounce_qiov, bounce_buf, bytes);
+        qiov = &bounce_qiov;
+    }
+
+    ret = bdrv_mirror_top_do_write(bs, MIRROR_METHOD_COPY, offset, bytes, qiov,
+                                   flags);
+
+    if (copy_to_target) {
+        qemu_iovec_destroy(&bounce_qiov);
+        qemu_vfree(bounce_buf);
+    }
+
+    return ret;
 }
 
 static int coroutine_fn bdrv_mirror_top_flush(BlockDriverState *bs)
@@ -1040,13 +1403,15 @@ static int coroutine_fn bdrv_mirror_top_flush(BlockDriverState *bs)
 static int coroutine_fn bdrv_mirror_top_pwrite_zeroes(BlockDriverState *bs,
     int64_t offset, int bytes, BdrvRequestFlags flags)
 {
-    return bdrv_co_pwrite_zeroes(bs->backing, offset, bytes, flags);
+    return bdrv_mirror_top_do_write(bs, MIRROR_METHOD_ZERO, offset, bytes, NULL,
+                                    flags);
 }
 
 static int coroutine_fn bdrv_mirror_top_pdiscard(BlockDriverState *bs,
     int64_t offset, int bytes)
 {
-    return bdrv_co_pdiscard(bs->backing->bs, offset, bytes);
+    return bdrv_mirror_top_do_write(bs, MIRROR_METHOD_DISCARD, offset, bytes,
+                                    NULL, 0);
 }
 
 static void bdrv_mirror_top_refresh_filename(BlockDriverState *bs, QDict *opts)
@@ -1108,10 +1473,11 @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs,
                              const BlockJobDriver *driver,
                              bool is_none_mode, BlockDriverState *base,
                              bool auto_complete, const char *filter_node_name,
-                             bool is_mirror,
+                             bool is_mirror, MirrorCopyMode copy_mode,
                              Error **errp)
 {
     MirrorBlockJob *s;
+    MirrorBDSOpaque *bs_opaque;
     BlockDriverState *mirror_top_bs;
     bool target_graph_mod;
     bool target_is_backing;
@@ -1147,6 +1513,8 @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs,
     mirror_top_bs->total_sectors = bs->total_sectors;
     mirror_top_bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED;
     mirror_top_bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED;
+    bs_opaque = g_new0(MirrorBDSOpaque, 1);
+    mirror_top_bs->opaque = bs_opaque;
     bdrv_set_aio_context(mirror_top_bs, bdrv_get_aio_context(bs));
 
     /* bdrv_append takes ownership of the mirror_top_bs reference, need to keep
@@ -1171,10 +1539,11 @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs,
     if (!s) {
         goto fail;
     }
+    bs_opaque->job = s;
+
     /* The block job now has a reference to this node */
     bdrv_unref(mirror_top_bs);
 
-    s->source = bs;
     s->mirror_top_bs = mirror_top_bs;
 
     /* No resize for the target either; while the mirror is still running, a
@@ -1212,6 +1581,7 @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs,
     s->on_target_error = on_target_error;
     s->is_none_mode = is_none_mode;
     s->backing_mode = backing_mode;
+    s->copy_mode = copy_mode;
     s->base = base;
     s->granularity = granularity;
     s->buf_size = ROUND_UP(buf_size, granularity);
@@ -1247,6 +1617,8 @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs,
         }
     }
 
+    QTAILQ_INIT(&s->ops_in_flight);
+
     trace_mirror_start(bs, s, opaque);
     job_start(&s->common.job);
     return;
@@ -1259,6 +1631,7 @@ fail:
 
         g_free(s->replaces);
         blk_unref(s->target);
+        bs_opaque->job = NULL;
         job_early_fail(&s->common.job);
     }
 
@@ -1275,7 +1648,8 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
                   MirrorSyncMode mode, BlockMirrorBackingMode backing_mode,
                   BlockdevOnError on_source_error,
                   BlockdevOnError on_target_error,
-                  bool unmap, const char *filter_node_name, Error **errp)
+                  bool unmap, const char *filter_node_name,
+                  MirrorCopyMode copy_mode, Error **errp)
 {
     bool is_none_mode;
     BlockDriverState *base;
@@ -1290,7 +1664,7 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
                      speed, granularity, buf_size, backing_mode,
                      on_source_error, on_target_error, unmap, NULL, NULL,
                      &mirror_job_driver, is_none_mode, base, false,
-                     filter_node_name, true, errp);
+                     filter_node_name, true, copy_mode, errp);
 }
 
 void commit_active_start(const char *job_id, BlockDriverState *bs,
@@ -1313,7 +1687,8 @@ void commit_active_start(const char *job_id, BlockDriverState *bs,
                      MIRROR_LEAVE_BACKING_CHAIN,
                      on_error, on_error, true, cb, opaque,
                      &commit_active_job_driver, false, base, auto_complete,
-                     filter_node_name, false, &local_err);
+                     filter_node_name, false, MIRROR_COPY_MODE_BACKGROUND,
+                     &local_err);
     if (local_err) {
         error_propagate(errp, local_err);
         goto error_restore_flags;
diff --git a/block/vvfat.c b/block/vvfat.c
index 4595f335b8..c7d2ed2d96 100644
--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -3134,6 +3134,7 @@ static void vvfat_qcow_options(int *child_flags, QDict *child_options,
 }
 
 static const BdrvChildRole child_vvfat_qcow = {
+    .parent_is_bds      = true,
     .inherit_options    = vvfat_qcow_options,
 };
 
diff --git a/blockdev.c b/blockdev.c
index 7f65cd7497..58d7570932 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -3586,6 +3586,7 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
                                    bool has_unmap, bool unmap,
                                    bool has_filter_node_name,
                                    const char *filter_node_name,
+                                   bool has_copy_mode, MirrorCopyMode copy_mode,
                                    Error **errp)
 {
 
@@ -3610,6 +3611,9 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
     if (!has_filter_node_name) {
         filter_node_name = NULL;
     }
+    if (!has_copy_mode) {
+        copy_mode = MIRROR_COPY_MODE_BACKGROUND;
+    }
 
     if (granularity != 0 && (granularity < 512 || granularity > 1048576 * 64)) {
         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "granularity",
@@ -3640,7 +3644,7 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
                  has_replaces ? replaces : NULL,
                  speed, granularity, buf_size, sync, backing_mode,
                  on_source_error, on_target_error, unmap, filter_node_name,
-                 errp);
+                 copy_mode, errp);
 }
 
 void qmp_drive_mirror(DriveMirror *arg, Error **errp)
@@ -3786,6 +3790,7 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp)
                            arg->has_on_target_error, arg->on_target_error,
                            arg->has_unmap, arg->unmap,
                            false, NULL,
+                           arg->has_copy_mode, arg->copy_mode,
                            &local_err);
     bdrv_unref(target_bs);
     error_propagate(errp, local_err);
@@ -3806,6 +3811,7 @@ void qmp_blockdev_mirror(bool has_job_id, const char *job_id,
                          BlockdevOnError on_target_error,
                          bool has_filter_node_name,
                          const char *filter_node_name,
+                         bool has_copy_mode, MirrorCopyMode copy_mode,
                          Error **errp)
 {
     BlockDriverState *bs;
@@ -3838,6 +3844,7 @@ void qmp_blockdev_mirror(bool has_job_id, const char *job_id,
                            has_on_target_error, on_target_error,
                            true, true,
                            has_filter_node_name, filter_node_name,
+                           has_copy_mode, copy_mode,
                            &local_err);
     error_propagate(errp, local_err);
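
A rough sketch of how the new copy-mode argument threaded through
qmp_blockdev_mirror() above might be exercised from a Python script
(the job id and node names are placeholders, 'vm' is assumed to be a
launched QEMUMachine from scripts/qemu.py, and the 'background' /
'write-blocking' values follow the MirrorCopyMode enum added by this
series):

    # Sketch only: 'src0' and 'tgt0' must already exist as node names.
    result = vm.qmp('blockdev-mirror', **{
        'job-id': 'mirror0',
        'device': 'src0',
        'target': 'tgt0',
        'sync': 'full',
        'copy-mode': 'write-blocking',
    })
    print(result)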
 
diff --git a/blockjob.c b/blockjob.c
index 0306533a2e..be5903aa96 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -155,6 +155,28 @@ static void child_job_drained_begin(BdrvChild *c)
     job_pause(&job->job);
 }
 
+static bool child_job_drained_poll(BdrvChild *c)
+{
+    BlockJob *bjob = c->opaque;
+    Job *job = &bjob->job;
+    const BlockJobDriver *drv = block_job_driver(bjob);
+
+    /* An inactive or completed job doesn't have any pending requests. Jobs
+     * with !job->busy are either already paused or have a pause point after
+     * being reentered, so no job driver code will run before they pause. */
+    if (!job->busy || job_is_completed(job) || job->deferred_to_main_loop) {
+        return false;
+    }
+
+    /* Otherwise, assume that it isn't fully stopped yet, but allow the job to
+     * override this assumption. */
+    if (drv->drained_poll) {
+        return drv->drained_poll(bjob);
+    } else {
+        return true;
+    }
+}
+
 static void child_job_drained_end(BdrvChild *c)
 {
     BlockJob *job = c->opaque;
@@ -164,6 +186,7 @@ static void child_job_drained_end(BdrvChild *c)
 static const BdrvChildRole child_job = {
     .get_parent_desc    = child_job_get_parent_desc,
     .drained_begin      = child_job_drained_begin,
+    .drained_poll       = child_job_drained_poll,
     .drained_end        = child_job_drained_end,
     .stay_at_node       = true,
 };
diff --git a/configure b/configure
index a8c4094c87..a5fd46c9d4 100755
--- a/configure
+++ b/configure
@@ -4599,6 +4599,7 @@ int main(void) { virgl_renderer_poll(); return 0; }
 EOF
   virgl_cflags=$($pkg_config --cflags virglrenderer 2>/dev/null)
   virgl_libs=$($pkg_config --libs virglrenderer 2>/dev/null)
+  virgl_version=$($pkg_config --modversion virglrenderer 2>/dev/null)
   if $pkg_config virglrenderer >/dev/null 2>&1 && \
      compile_prog "$virgl_cflags" "$virgl_libs" ; then
     virglrenderer="yes"
@@ -5827,7 +5828,7 @@ echo "nettle            $nettle $(echo_version $nettle $nettle_version)"
 echo "nettle kdf        $nettle_kdf"
 echo "libtasn1          $tasn1"
 echo "curses support    $curses"
-echo "virgl support     $virglrenderer"
+echo "virgl support     $virglrenderer $(echo_version $virglrenderer $virgl_version)"
 echo "curl support      $curl"
 echo "mingw32 support   $mingw32"
 echo "Audio drivers     $audio_drv_list"
@@ -7239,9 +7240,11 @@ for rom in seabios vgabios ; do
 done
 
 # set up tests data directory
-if [ ! -e tests/data ]; then
-    symlink "$source_path/tests/data" tests/data
-fi
+for tests_subdir in acceptance data; do
+    if [ ! -e tests/$tests_subdir ]; then
+        symlink "$source_path/tests/$tests_subdir" tests/$tests_subdir
+    fi
+done
 
 # set up qemu-iotests in this build directory
 iotests_common_env="tests/qemu-iotests/common.env"
diff --git a/default-configs/ppc-softmmu.mak b/default-configs/ppc-softmmu.mak
index 4d7be45ac5..abeeb0418a 100644
--- a/default-configs/ppc-softmmu.mak
+++ b/default-configs/ppc-softmmu.mak
@@ -31,12 +31,14 @@ CONFIG_I2C=y
 CONFIG_MAC=y
 CONFIG_ESCC=y
 CONFIG_MACIO=y
+CONFIG_MACIO_GPIO=y
 CONFIG_SUNGEM=y
 CONFIG_MOS6522=y
 CONFIG_CUDA=y
 CONFIG_ADB=y
 CONFIG_MAC_NVRAM=y
 CONFIG_MAC_DBDMA=y
+CONFIG_MAC_PMU=y
 CONFIG_HEATHROW_PIC=y
 CONFIG_GRACKLE_PCI=y
 CONFIG_UNIN_PCI=y
diff --git a/docs/devel/testing.rst b/docs/devel/testing.rst
index 0ca1a2d4b5..f33e5a8423 100644
--- a/docs/devel/testing.rst
+++ b/docs/devel/testing.rst
@@ -484,3 +484,195 @@ supported. To start the fuzzer, run
 
 Alternatively, some command different from "qemu-img info" can be tested, by
 changing the ``-c`` option.
+
+Acceptance tests using the Avocado Framework
+============================================
+
+The ``tests/acceptance`` directory hosts functional tests, also known
+as acceptance-level tests.  They're usually higher-level tests, and
+may interact with external resources and with various guest operating
+systems.
+
+These tests are written using the Avocado Testing Framework (which must
+be installed separately) in conjunction with the ``avocado_qemu.Test``
+class, implemented at ``tests/acceptance/avocado_qemu``.
+
+Tests based on ``avocado_qemu.Test`` can easily:
+
+ * Customize the command line arguments given to the convenience
+   ``self.vm`` attribute (a QEMUMachine instance)
+
+ * Interact with the QEMU monitor, send QMP commands and check
+   their results
+
+ * Interact with the guest OS, using the convenience console device
+   (which may be useful to assert the effectiveness and correctness of
+   command line arguments or QMP commands)
+
+ * Interact with external data files that accompany the test itself
+   (see ``self.get_data()``)
+
+ * Download (and cache) remote data files, such as firmware and kernel
+   images
+
+ * Have access to a library of guest OS images (by means of the
+   ``avocado.utils.vmimage`` library)
+
+ * Make use of various other test-related utilities available in the
+   test class itself and in the utility library:
+
+   - http://avocado-framework.readthedocs.io/en/latest/api/test/avocado.html#avocado.Test
+   - http://avocado-framework.readthedocs.io/en/latest/api/utils/avocado.utils.html
+
+Installation
+------------
+
+To install Avocado and its dependencies, run:
+
+.. code::
+
+  pip install --user avocado-framework
+
+Alternatively, follow the instructions on this link:
+
+  http://avocado-framework.readthedocs.io/en/latest/GetStartedGuide.html#installing-avocado
+
+Overview
+--------
+
+This directory provides the ``avocado_qemu`` Python module, containing
+the ``avocado_qemu.Test`` class.  Here's a simple usage example:
+
+.. code::
+
+  from avocado_qemu import Test
+
+
+  class Version(Test):
+      """
+      :avocado: enable
+      :avocado: tags=quick
+      """
+      def test_qmp_human_info_version(self):
+          self.vm.launch()
+          res = self.vm.command('human-monitor-command',
+                                command_line='info version')
+          self.assertRegexpMatches(res, r'^(\d+\.\d+\.\d)')
+
+To execute your test, run:
+
+.. code::
+
+  avocado run version.py
+
+Tests may be classified according to a convention by using docstring
+directives such as ``:avocado: tags=TAG1,TAG2``.  To run all tests
+in the current directory, tagged as "quick", run:
+
+.. code::
+
+  avocado run -t quick .
+
+The ``avocado_qemu.Test`` base test class
+-----------------------------------------
+
+The ``avocado_qemu.Test`` class has a number of characteristics that
+are worth mentioning right away.
+
+First of all, it attempts to give each test a ready-to-use QEMUMachine
+instance, available as ``self.vm``.  Because many tests will tweak the
+QEMU command line, launching the QEMUMachine (by using ``self.vm.launch()``)
+is left to the test writer.
+
+At test "tear down", ``avocado_qemu.Test`` handles the QEMUMachine
+shutdown.
+
+QEMUMachine
+~~~~~~~~~~~
+
+The QEMUMachine API is already widely used in the Python iotests,
+device-crash-test and other Python scripts.  It's a wrapper around the
+execution of a QEMU binary, giving its users (see the sketch after the list):
+
+ * the ability to set command line arguments to be given to the QEMU
+   binary
+
+ * a ready-to-use QMP connection and interface, which can be used to
+   send commands and inspect their results, as well as asynchronous
+   events
+
+ * convenience methods to set commonly used command line arguments in
+   a more succinct and intuitive way
+
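+As a rough sketch (assuming ``scripts/`` is on ``sys.path`` and that an
+x86_64 system emulator has been built in-tree; the binary path below is
+only an example), QEMUMachine can also be driven directly:
+
+.. code::
+
+  from qemu import QEMUMachine
+
+  vm = QEMUMachine('x86_64-softmmu/qemu-system-x86_64')
+  vm.add_args('-machine', 'none')
+  vm.launch()
+  # command() sends a QMP command and raises on error
+  print(vm.command('query-status')['status'])
+  vm.shutdown()
+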
+QEMU binary selection
+~~~~~~~~~~~~~~~~~~~~~
+
+The QEMU binary used for the ``self.vm`` QEMUMachine instance will
+primarily depend on the value of the ``qemu_bin`` parameter.  If it's
+not explicitly set, its default value will be the result of a dynamic
+probe in the same source tree.  A suitable binary will be one that
+targets the architecture matching the host machine.
+
+Based on this description, test writers will usually rely on one of
+the following approaches:
+
+1) Set ``qemu_bin``, and use the given binary
+
+2) Do not set ``qemu_bin``, and use a QEMU binary named like
+   "${arch}-softmmu/qemu-system-${arch}", either in the current
+   working directory, or in the current source tree.
+
+The resulting ``qemu_bin`` value will be preserved in the
+``avocado_qemu.Test`` as an attribute with the same name.
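+
+As a rough sketch of approach 2 (the class and test names here are only
+illustrative), a test can rely entirely on the probed binary:
+
+.. code::
+
+  from avocado_qemu import Test
+
+
+  class ProbedBinary(Test):
+      """
+      :avocado: enable
+      """
+      def test_probed_binary_runs(self):
+          # qemu_bin was either passed as a parameter or found by the probe
+          self.assertTrue(self.qemu_bin)
+          self.vm.launch()
+          self.assertEqual(self.vm.command('query-status')['status'],
+                           'running')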
+
+Attribute reference
+-------------------
+
+Besides the attributes and methods that are part of the base
+``avocado.Test`` class, the following attributes are available on any
+``avocado_qemu.Test`` instance.
+
+vm
+~~
+
+A QEMUMachine instance, initially configured according to the given
+``qemu_bin`` parameter.
+
+qemu_bin
+~~~~~~~~
+
+The preserved value of the ``qemu_bin`` parameter or the result of the
+dynamic probe for a QEMU binary in the current working directory or
+source tree.
+
+Parameter reference
+-------------------
+
+To understand how Avocado parameters are accessed by tests, and how
+they can be passed to tests, please refer to::
+
+  http://avocado-framework.readthedocs.io/en/latest/WritingTests.html#accessing-test-parameters
+
+Parameter values can be easily seen in the log files, and will look
+like the following:
+
+.. code::
+
+  PARAMS (key=qemu_bin, path=*, default=x86_64-softmmu/qemu-system-x86_64) => 'x86_64-softmmu/qemu-system-x86_64'
+
+qemu_bin
+~~~~~~~~
+
+The exact QEMU binary to be used by the QEMUMachine instance.
+
+Uninstalling Avocado
+--------------------
+
+If you've followed the installation instructions above, you can easily
+uninstall Avocado.  Start by listing the packages you have installed::
+
+  pip list --user
+
+And remove any package you want with::
+
+  pip uninstall <package_name>
diff --git a/hmp.c b/hmp.c
index ef93f4878b..f40d8279cf 100644
--- a/hmp.c
+++ b/hmp.c
@@ -370,6 +370,9 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict)
         monitor_printf(mon, "%s: %" PRIu64 "\n",
             MigrationParameter_str(MIGRATION_PARAMETER_XBZRLE_CACHE_SIZE),
             params->xbzrle_cache_size);
+        monitor_printf(mon, "%s: %" PRIu64 "\n",
+            MigrationParameter_str(MIGRATION_PARAMETER_MAX_POSTCOPY_BANDWIDTH),
+            params->max_postcopy_bandwidth);
     }
 
     qapi_free_MigrationParameters(params);
@@ -1676,6 +1679,10 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict)
         }
         p->xbzrle_cache_size = cache_size;
         break;
+    case MIGRATION_PARAMETER_MAX_POSTCOPY_BANDWIDTH:
+        p->has_max_postcopy_bandwidth = true;
+        visit_type_size(v, param, &p->max_postcopy_bandwidth, &err);
+        break;
     default:
         assert(0);
     }
diff --git a/hw/arm/sysbus-fdt.c b/hw/arm/sysbus-fdt.c
index e4c492ea44..277ed872e7 100644
--- a/hw/arm/sysbus-fdt.c
+++ b/hw/arm/sysbus-fdt.c
@@ -36,6 +36,7 @@
 #include "hw/vfio/vfio-platform.h"
 #include "hw/vfio/vfio-calxeda-xgmac.h"
 #include "hw/vfio/vfio-amd-xgbe.h"
+#include "hw/display/ramfb.h"
 #include "hw/arm/fdt.h"
 
 /*
@@ -406,12 +407,18 @@ static int add_amd_xgbe_fdt_node(SysBusDevice *sbdev, void *opaque)
 
 #endif /* CONFIG_LINUX */
 
+static int no_fdt_node(SysBusDevice *sbdev, void *opaque)
+{
+    return 0;
+}
+
 /* list of supported dynamic sysbus devices */
 static const NodeCreationPair add_fdt_node_functions[] = {
 #ifdef CONFIG_LINUX
     {TYPE_VFIO_CALXEDA_XGMAC, add_calxeda_midway_xgmac_fdt_node},
     {TYPE_VFIO_AMD_XGBE, add_amd_xgbe_fdt_node},
 #endif
+    {TYPE_RAMFB_DEVICE, no_fdt_node},
     {"", NULL}, /* last element */
 };
 
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index f0a4fa004c..98b99cf236 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -36,6 +36,7 @@
 #include "hw/arm/virt.h"
 #include "hw/vfio/vfio-calxeda-xgmac.h"
 #include "hw/vfio/vfio-amd-xgbe.h"
+#include "hw/display/ramfb.h"
 #include "hw/devices.h"
 #include "net/net.h"
 #include "sysemu/device_tree.h"
@@ -1659,6 +1660,7 @@ static void virt_machine_class_init(ObjectClass *oc, void *data)
     mc->max_cpus = 255;
     machine_class_allow_dynamic_sysbus_dev(mc, TYPE_VFIO_CALXEDA_XGMAC);
     machine_class_allow_dynamic_sysbus_dev(mc, TYPE_VFIO_AMD_XGBE);
+    machine_class_allow_dynamic_sysbus_dev(mc, TYPE_RAMFB_DEVICE);
     mc->block_default_type = IF_VIRTIO;
     mc->no_cdrom = 1;
     mc->pci_allow_0_address = true;
diff --git a/hw/core/bus.c b/hw/core/bus.c
index ad0c9df335..4651f24486 100644
--- a/hw/core/bus.c
+++ b/hw/core/bus.c
@@ -102,6 +102,7 @@ static void qbus_realize(BusState *bus, DeviceState *parent, const char *name)
         QLIST_INSERT_HEAD(&bus->parent->child_bus, bus, sibling);
         bus->parent->num_child_bus++;
         object_property_add_child(OBJECT(bus->parent), bus->name, OBJECT(bus), NULL);
+        object_unref(OBJECT(bus));
     } else if (bus != sysbus_get_default()) {
         /* TODO: once all bus devices are qdevified,
            only reset handler for main_system_bus should be registered here. */
diff --git a/hw/display/Makefile.objs b/hw/display/Makefile.objs
index b5d97ab26d..fb8408c6d0 100644
--- a/hw/display/Makefile.objs
+++ b/hw/display/Makefile.objs
@@ -1,3 +1,6 @@
+common-obj-y += ramfb.o
+common-obj-y += ramfb-standalone.o
+
 common-obj-$(CONFIG_ADS7846) += ads7846.o
 common-obj-$(CONFIG_VGA_CIRRUS) += cirrus_vga.o
 common-obj-$(CONFIG_G364FB) += g364fb.o
diff --git a/hw/display/ramfb-standalone.c b/hw/display/ramfb-standalone.c
new file mode 100644
index 0000000000..c0d241ba01
--- /dev/null
+++ b/hw/display/ramfb-standalone.c
@@ -0,0 +1,62 @@
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "hw/loader.h"
+#include "hw/isa/isa.h"
+#include "hw/display/ramfb.h"
+#include "ui/console.h"
+#include "sysemu/sysemu.h"
+
+#define RAMFB(obj) OBJECT_CHECK(RAMFBStandaloneState, (obj), TYPE_RAMFB_DEVICE)
+
+typedef struct RAMFBStandaloneState {
+    SysBusDevice parent_obj;
+    QemuConsole *con;
+    RAMFBState *state;
+} RAMFBStandaloneState;
+
+static void display_update_wrapper(void *dev)
+{
+    RAMFBStandaloneState *ramfb = RAMFB(dev);
+
+    if (0 /* native driver active */) {
+        /* non-standalone device would run native display update here */;
+    } else {
+        ramfb_display_update(ramfb->con, ramfb->state);
+    }
+}
+
+static const GraphicHwOps wrapper_ops = {
+    .gfx_update = display_update_wrapper,
+};
+
+static void ramfb_realizefn(DeviceState *dev, Error **errp)
+{
+    RAMFBStandaloneState *ramfb = RAMFB(dev);
+
+    ramfb->con = graphic_console_init(dev, 0, &wrapper_ops, dev);
+    ramfb->state = ramfb_setup(errp);
+}
+
+static void ramfb_class_initfn(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+
+    set_bit(DEVICE_CATEGORY_DISPLAY, dc->categories);
+    dc->realize = ramfb_realizefn;
+    dc->desc = "ram framebuffer standalone device";
+    dc->user_creatable = true;
+}
+
+static const TypeInfo ramfb_info = {
+    .name          = TYPE_RAMFB_DEVICE,
+    .parent        = TYPE_SYS_BUS_DEVICE,
+    .instance_size = sizeof(RAMFBStandaloneState),
+    .class_init    = ramfb_class_initfn,
+};
+
+static void ramfb_register_types(void)
+{
+    type_register_static(&ramfb_info);
+}
+
+type_init(ramfb_register_types)
diff --git a/hw/display/ramfb.c b/hw/display/ramfb.c
new file mode 100644
index 0000000000..6867bce8ae
--- /dev/null
+++ b/hw/display/ramfb.c
@@ -0,0 +1,95 @@
+/*
+ * early boot framebuffer in guest ram
+ * configured using fw_cfg
+ *
+ * Copyright Red Hat, Inc. 2017
+ *
+ * Author:
+ *     Gerd Hoffmann <kraxel@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "hw/loader.h"
+#include "hw/display/ramfb.h"
+#include "ui/console.h"
+#include "sysemu/sysemu.h"
+
+struct QEMU_PACKED RAMFBCfg {
+    uint64_t addr;
+    uint32_t fourcc;
+    uint32_t flags;
+    uint32_t width;
+    uint32_t height;
+    uint32_t stride;
+};
+
+struct RAMFBState {
+    DisplaySurface *ds;
+    uint32_t width, height;
+    struct RAMFBCfg cfg;
+};
+
+static void ramfb_fw_cfg_write(void *dev, off_t offset, size_t len)
+{
+    RAMFBState *s = dev;
+    void *framebuffer;
+    uint32_t stride, fourcc, format;
+    hwaddr addr, length;
+
+    s->width  = be32_to_cpu(s->cfg.width);
+    s->height = be32_to_cpu(s->cfg.height);
+    stride    = be32_to_cpu(s->cfg.stride);
+    fourcc    = be32_to_cpu(s->cfg.fourcc);
+    addr      = be64_to_cpu(s->cfg.addr);
+    length    = stride * s->height;
+    format    = qemu_drm_format_to_pixman(fourcc);
+
+    fprintf(stderr, "%s: %dx%d @ 0x%" PRIx64 "\n", __func__,
+            s->width, s->height, addr);
+    framebuffer = address_space_map(&address_space_memory,
+                                    addr, &length, false,
+                                    MEMTXATTRS_UNSPECIFIED);
+    if (!framebuffer || length < stride * s->height) {
+        s->width = 0;
+        s->height = 0;
+        return;
+    }
+    s->ds = qemu_create_displaysurface_from(s->width, s->height,
+                                            format, stride, framebuffer);
+}
+
+void ramfb_display_update(QemuConsole *con, RAMFBState *s)
+{
+    if (!s->width || !s->height) {
+        return;
+    }
+
+    if (s->ds) {
+        dpy_gfx_replace_surface(con, s->ds);
+        s->ds = NULL;
+    }
+
+    /* simple full screen update */
+    dpy_gfx_update_full(con);
+}
+
+RAMFBState *ramfb_setup(Error **errp)
+{
+    FWCfgState *fw_cfg = fw_cfg_find();
+    RAMFBState *s;
+
+    if (!fw_cfg || !fw_cfg->dma_enabled) {
+        error_setg(errp, "ramfb device requires fw_cfg with DMA");
+        return NULL;
+    }
+
+    s = g_new0(RAMFBState, 1);
+
+    fw_cfg_add_file_callback(fw_cfg, "etc/ramfb",
+                             NULL, ramfb_fw_cfg_write, s,
+                             &s->cfg, sizeof(s->cfg), false);
+    return s;
+}
diff --git a/hw/display/sm501.c b/hw/display/sm501.c
index e47be99451..ca0840f6fa 100644
--- a/hw/display/sm501.c
+++ b/hw/display/sm501.c
@@ -836,27 +836,30 @@ static void sm501_system_config_write(void *opaque, hwaddr addr,
 
     switch (addr) {
     case SM501_SYSTEM_CONTROL:
-        s->system_control = value & 0xE300B8F7;
+        s->system_control &= 0x10DB0000;
+        s->system_control |= value & 0xEF00B8F7;
         break;
     case SM501_MISC_CONTROL:
-        s->misc_control = value & 0xFF7FFF20;
+        s->misc_control &= 0xEF;
+        s->misc_control |= value & 0xFF7FFF10;
         break;
     case SM501_GPIO31_0_CONTROL:
         s->gpio_31_0_control = value;
         break;
     case SM501_GPIO63_32_CONTROL:
-        s->gpio_63_32_control = value;
+        s->gpio_63_32_control = value & 0xFF80FFFF;
         break;
     case SM501_DRAM_CONTROL:
         s->local_mem_size_index = (value >> 13) & 0x7;
         /* TODO : check validity of size change */
-        s->dram_control |=  value & 0x7FFFFFC3;
+        s->dram_control &= 0x80000000;
+        s->dram_control |= value & 0x7FFFFFC3;
         break;
     case SM501_ARBTRTN_CONTROL:
-        s->arbitration_control =  value & 0x37777777;
+        s->arbitration_control = value & 0x37777777;
         break;
     case SM501_IRQ_MASK:
-        s->irq_mask = value;
+        s->irq_mask = value & 0xFFDF3F5F;
         break;
     case SM501_MISC_TIMING:
         s->misc_timing = value & 0xF31F1FFF;
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index 3b87f3cedb..e9b6f064fb 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -28,6 +28,7 @@
 #include "hw/loader.h"
 #include "hw/i386/pc.h"
 #include "hw/i386/apic.h"
+#include "hw/display/ramfb.h"
 #include "hw/smbios/smbios.h"
 #include "hw/pci/pci.h"
 #include "hw/pci/pci_ids.h"
@@ -423,6 +424,7 @@ static void pc_i440fx_machine_options(MachineClass *m)
     m->desc = "Standard PC (i440FX + PIIX, 1996)";
     m->default_machine_opts = "firmware=bios-256k.bin";
     m->default_display = "std";
+    machine_class_allow_dynamic_sysbus_dev(m, TYPE_RAMFB_DEVICE);
 }
 
 static void pc_i440fx_3_0_machine_options(MachineClass *m)
diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index 087f2630f9..1a73e1848a 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -45,6 +45,7 @@
 #include "hw/i386/ich9.h"
 #include "hw/i386/amd_iommu.h"
 #include "hw/i386/intel_iommu.h"
+#include "hw/display/ramfb.h"
 #include "hw/smbios/smbios.h"
 #include "hw/ide/pci.h"
 #include "hw/ide/ahci.h"
@@ -305,6 +306,7 @@ static void pc_q35_machine_options(MachineClass *m)
     m->no_floppy = 1;
     machine_class_allow_dynamic_sysbus_dev(m, TYPE_AMD_IOMMU_DEVICE);
     machine_class_allow_dynamic_sysbus_dev(m, TYPE_INTEL_IOMMU_DEVICE);
+    machine_class_allow_dynamic_sysbus_dev(m, TYPE_RAMFB_DEVICE);
     m->max_cpus = 288;
 }
 
diff --git a/hw/input/adb-kbd.c b/hw/input/adb-kbd.c
index 50b62712c8..b026e9d49f 100644
--- a/hw/input/adb-kbd.c
+++ b/hw/input/adb-kbd.c
@@ -261,18 +261,21 @@ static int adb_kbd_request(ADBDevice *d, uint8_t *obuf,
                 trace_adb_kbd_request_change_addr(d->devaddr);
                 break;
             default:
-                d->devaddr = buf[1] & 0xf;
-                /* we support handlers:
-                 * 1: Apple Standard Keyboard
-                 * 2: Apple Extended Keyboard (LShift = RShift)
-                 * 3: Apple Extended Keyboard (LShift != RShift)
-                 */
-                if (buf[2] == 1 || buf[2] == 2 || buf[2] == 3) {
-                    d->handler = buf[2];
+                if (!d->disable_direct_reg3_writes) {
+                    d->devaddr = buf[1] & 0xf;
+
+                    /* we support handlers:
+                     * 1: Apple Standard Keyboard
+                     * 2: Apple Extended Keyboard (LShift = RShift)
+                     * 3: Apple Extended Keyboard (LShift != RShift)
+                     */
+                    if (buf[2] == 1 || buf[2] == 2 || buf[2] == 3) {
+                        d->handler = buf[2];
+                    }
+
+                    trace_adb_kbd_request_change_addr_and_handler(d->devaddr,
+                                                                  d->handler);
                 }
-
-                trace_adb_kbd_request_change_addr_and_handler(d->devaddr,
-                                                              d->handler);
                 break;
             }
         }
@@ -290,8 +293,8 @@ static int adb_kbd_request(ADBDevice *d, uint8_t *obuf,
             olen = 2;
             break;
         case 3:
-            obuf[0] = d->handler;
-            obuf[1] = d->devaddr;
+            obuf[0] = d->devaddr;
+            obuf[1] = d->handler;
             olen = 2;
             break;
         }
diff --git a/hw/input/adb-mouse.c b/hw/input/adb-mouse.c
index 3ba6027d33..83833b0035 100644
--- a/hw/input/adb-mouse.c
+++ b/hw/input/adb-mouse.c
@@ -142,24 +142,27 @@ static int adb_mouse_request(ADBDevice *d, uint8_t *obuf,
                 trace_adb_mouse_request_change_addr(d->devaddr);
                 break;
             default:
-                d->devaddr = buf[1] & 0xf;
-                /* we support handlers:
-                 * 0x01: Classic Apple Mouse Protocol / 100 cpi operations
-                 * 0x02: Classic Apple Mouse Protocol / 200 cpi operations
-                 * we don't support handlers (at least):
-                 * 0x03: Mouse systems A3 trackball
-                 * 0x04: Extended Apple Mouse Protocol
-                 * 0x2f: Microspeed mouse
-                 * 0x42: Macally
-                 * 0x5f: Microspeed mouse
-                 * 0x66: Microspeed mouse
-                 */
-                if (buf[2] == 1 || buf[2] == 2) {
-                    d->handler = buf[2];
+                if (!d->disable_direct_reg3_writes) {
+                    d->devaddr = buf[1] & 0xf;
+
+                    /* we support handlers:
+                     * 0x01: Classic Apple Mouse Protocol / 100 cpi operations
+                     * 0x02: Classic Apple Mouse Protocol / 200 cpi operations
+                     * we don't support handlers (at least):
+                     * 0x03: Mouse systems A3 trackball
+                     * 0x04: Extended Apple Mouse Protocol
+                     * 0x2f: Microspeed mouse
+                     * 0x42: Macally
+                     * 0x5f: Microspeed mouse
+                     * 0x66: Microspeed mouse
+                     */
+                    if (buf[2] == 1 || buf[2] == 2) {
+                        d->handler = buf[2];
+                    }
+
+                    trace_adb_mouse_request_change_addr_and_handler(
+                        d->devaddr, d->handler);
                 }
-
-                trace_adb_mouse_request_change_addr_and_handler(d->devaddr,
-                                                                d->handler);
                 break;
             }
         }
@@ -172,8 +175,8 @@ static int adb_mouse_request(ADBDevice *d, uint8_t *obuf,
         case 1:
             break;
         case 3:
-            obuf[0] = d->handler;
-            obuf[1] = d->devaddr;
+            obuf[0] = d->devaddr;
+            obuf[1] = d->handler;
             olen = 2;
             break;
         }
diff --git a/hw/input/adb.c b/hw/input/adb.c
index 23ae6f0d75..bbb40aeef1 100644
--- a/hw/input/adb.c
+++ b/hw/input/adb.c
@@ -113,11 +113,18 @@ static void adb_device_realizefn(DeviceState *dev, Error **errp)
     bus->devices[bus->nb_devices++] = d;
 }
 
+static Property adb_device_properties[] = {
+    DEFINE_PROP_BOOL("disable-direct-reg3-writes", ADBDevice,
+                     disable_direct_reg3_writes, false),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
 static void adb_device_class_init(ObjectClass *oc, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(oc);
 
     dc->realize = adb_device_realizefn;
+    dc->props = adb_device_properties;
     dc->bus_type = TYPE_ADB_BUS;
 }
 
diff --git a/hw/input/ps2.c b/hw/input/ps2.c
index eeec6180d0..fdfcadf9a1 100644
--- a/hw/input/ps2.c
+++ b/hw/input/ps2.c
@@ -927,7 +927,7 @@ static void ps2_common_post_load(PS2State *s)
 
     /* reset rptr/wptr/count */
     q->rptr = 0;
-    q->wptr = size;
+    q->wptr = (size == PS2_QUEUE_SIZE) ? 0 : size;
     q->count = size;
     s->update_irq(s->update_arg, q->count != 0);
 }
diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
index 8bdf6afe82..8dba2f84e7 100644
--- a/hw/intc/xics_kvm.c
+++ b/hw/intc/xics_kvm.c
@@ -186,8 +186,7 @@ static void ics_get_kvm_state(ICSState *ics)
         kvm_device_access(kernel_xics_fd, KVM_DEV_XICS_GRP_SOURCES,
                           i + ics->offset, &state, false, &local_err);
         if (local_err) {
-            error_report("Unable to retrieve KVM interrupt controller state"
-                    " for IRQ %d: %s", i + ics->offset, strerror(errno));
+            error_report_err(local_err);
             exit(1);
         }
 
@@ -273,11 +272,10 @@ static int ics_set_kvm_state(ICSState *ics, int version_id)
                 state |= KVM_XICS_QUEUED;
         }
 
-        kvm_device_access(kernel_xics_fd, KVM_DEV_XICS_GRP_SOURCES,
-                          i + ics->offset, &state, true, &local_err);
+        ret = kvm_device_access(kernel_xics_fd, KVM_DEV_XICS_GRP_SOURCES,
+                                i + ics->offset, &state, true, &local_err);
         if (local_err) {
-            error_report("Unable to restore KVM interrupt controller state"
-                    " for IRQs %d: %s", i + ics->offset, strerror(errno));
+            error_report_err(local_err);
             return ret;
         }
     }
diff --git a/hw/isa/smc37c669-superio.c b/hw/isa/smc37c669-superio.c
index aa233c6967..64466a9373 100644
--- a/hw/isa/smc37c669-superio.c
+++ b/hw/isa/smc37c669-superio.c
@@ -37,7 +37,7 @@ static bool is_parallel_enabled(ISASuperIODevice *sio, uint8_t index)
 
 static uint16_t get_parallel_iobase(ISASuperIODevice *sio, uint8_t index)
 {
-    return 0x3bc;
+    return 0x378;
 }
 
 static unsigned int get_parallel_irq(ISASuperIODevice *sio, uint8_t index)
diff --git a/hw/misc/macio/Makefile.objs b/hw/misc/macio/Makefile.objs
index ef7ac249ec..07fdb320d4 100644
--- a/hw/misc/macio/Makefile.objs
+++ b/hw/misc/macio/Makefile.objs
@@ -1,3 +1,5 @@
 common-obj-y += macio.o
 common-obj-$(CONFIG_CUDA) += cuda.o
+common-obj-$(CONFIG_MAC_PMU) += pmu.o
 common-obj-$(CONFIG_MAC_DBDMA) += mac_dbdma.o
+common-obj-$(CONFIG_MACIO_GPIO) += gpio.o
diff --git a/hw/misc/macio/gpio.c b/hw/misc/macio/gpio.c
new file mode 100644
index 0000000000..9317df759c
--- /dev/null
+++ b/hw/misc/macio/gpio.c
@@ -0,0 +1,231 @@
+/*
+ * PowerMac NewWorld MacIO GPIO emulation
+ *
+ * Copyright (c) 2016 Benjamin Herrenschmidt
+ * Copyright (c) 2018 Mark Cave-Ayland
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "hw/hw.h"
+#include "hw/ppc/mac.h"
+#include "hw/misc/macio/macio.h"
+#include "hw/misc/macio/gpio.h"
+#include "hw/nmi.h"
+#include "qemu/log.h"
+#include "trace.h"
+
+
+void macio_set_gpio(MacIOGPIOState *s, uint32_t gpio, bool state)
+{
+    uint8_t new_reg;
+
+    trace_macio_set_gpio(gpio, state);
+
+    if (s->gpio_regs[gpio] & 4) {
+        qemu_log_mask(LOG_GUEST_ERROR,
+                      "GPIO: Setting GPIO %d while it's an output\n", gpio);
+    }
+
+    new_reg = s->gpio_regs[gpio] & ~2;
+    if (state) {
+        new_reg |= 2;
+    }
+
+    if (new_reg == s->gpio_regs[gpio]) {
+        return;
+    }
+
+    s->gpio_regs[gpio] = new_reg;
+
+    /* This will work until we fix the binding between MacIO and
+     * the MPIC properly so we can route all GPIOs and avoid going
+     * via the top-level platform code.
+     *
+     * Note that we probably need access to the MPIC config to decode
+     * polarity, since QEMU always uses "raise" regardless.
+     *
+     * For now, we hard-wire the known GPIOs.
+     */
+
+    switch (gpio) {
+    case 1:
+        /* Level low */
+        if (!state) {
+            trace_macio_gpio_irq_assert(gpio);
+            qemu_irq_raise(s->gpio_extirqs[gpio]);
+        } else {
+            trace_macio_gpio_irq_deassert(gpio);
+            qemu_irq_lower(s->gpio_extirqs[gpio]);
+        }
+        break;
+
+    case 9:
+        /* Edge, triggered by NMI below */
+        if (state) {
+            trace_macio_gpio_irq_assert(gpio);
+            qemu_irq_raise(s->gpio_extirqs[gpio]);
+        } else {
+            trace_macio_gpio_irq_deassert(gpio);
+            qemu_irq_lower(s->gpio_extirqs[gpio]);
+        }
+        break;
+
+    default:
+        qemu_log_mask(LOG_UNIMP, "GPIO: setting unimplemented GPIO %d", gpio);
+    }
+}
+
+static void macio_gpio_write(void *opaque, hwaddr addr, uint64_t value,
+                             unsigned size)
+{
+    MacIOGPIOState *s = opaque;
+    uint8_t ibit;
+
+    trace_macio_gpio_write(addr, value);
+
+    /* Levels regs are read-only */
+    if (addr < 8) {
+        return;
+    }
+
+    addr -= 8;
+    if (addr < 36) {
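+        /* As implemented here: bit 0 is the output value, bit 1 the input
+         * level (kept read-only for the guest) and bit 2 the output enable;
+         * when a pin is an output, the input level mirrors the output bit.
+         */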
+        value &= ~2;
+
+        if (value & 4) {
+            ibit = (value & 1) << 1;
+        } else {
+            ibit = s->gpio_regs[addr] & 2;
+        }
+
+        s->gpio_regs[addr] = value | ibit;
+    }
+}
+
+static uint64_t macio_gpio_read(void *opaque, hwaddr addr, unsigned size)
+{
+    MacIOGPIOState *s = opaque;
+    uint64_t val = 0;
+
+    /* Levels regs */
+    if (addr < 8) {
+        val = s->gpio_levels[addr];
+    } else {
+        addr -= 8;
+
+        if (addr < 36) {
+            val = s->gpio_regs[addr];
+        }
+    }
+
+    trace_macio_gpio_read(addr, val);
+    return val;
+}
+
+static const MemoryRegionOps macio_gpio_ops = {
+    .read = macio_gpio_read,
+    .write = macio_gpio_write,
+    .endianness = DEVICE_LITTLE_ENDIAN,
+    .impl = {
+        .min_access_size = 1,
+        .max_access_size = 1,
+    },
+};
+
+static void macio_gpio_realize(DeviceState *dev, Error **errp)
+{
+    MacIOGPIOState *s = MACIO_GPIO(dev);
+
+    s->gpio_extirqs[1] = qdev_get_gpio_in(DEVICE(s->pic),
+                                          NEWWORLD_EXTING_GPIO1);
+    s->gpio_extirqs[9] = qdev_get_gpio_in(DEVICE(s->pic),
+                                          NEWWORLD_EXTING_GPIO9);
+}
+
+static void macio_gpio_init(Object *obj)
+{
+    SysBusDevice *sbd = SYS_BUS_DEVICE(obj);
+    MacIOGPIOState *s = MACIO_GPIO(obj);
+
+    object_property_add_link(obj, "pic", TYPE_OPENPIC,
+                             (Object **) &s->pic,
+                             qdev_prop_allow_set_link_before_realize,
+                             0, NULL);
+
+    memory_region_init_io(&s->gpiomem, OBJECT(s), &macio_gpio_ops, obj,
+                          "gpio", 0x30);
+    sysbus_init_mmio(sbd, &s->gpiomem);
+}
+
+static const VMStateDescription vmstate_macio_gpio = {
+    .name = "macio_gpio",
+    .version_id = 0,
+    .minimum_version_id = 0,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT8_ARRAY(gpio_levels, MacIOGPIOState, 8),
+        VMSTATE_UINT8_ARRAY(gpio_regs, MacIOGPIOState, 36),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+static void macio_gpio_reset(DeviceState *dev)
+{
+    MacIOGPIOState *s = MACIO_GPIO(dev);
+
+    /* GPIO 1 is up by default */
+    macio_set_gpio(s, 1, true);
+}
+
+static void macio_gpio_nmi(NMIState *n, int cpu_index, Error **errp)
+{
+    macio_set_gpio(MACIO_GPIO(n), 9, true);
+    macio_set_gpio(MACIO_GPIO(n), 9, false);
+}
+
+static void macio_gpio_class_init(ObjectClass *oc, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(oc);
+    NMIClass *nc = NMI_CLASS(oc);
+
+    dc->realize = macio_gpio_realize;
+    dc->reset = macio_gpio_reset;
+    dc->vmsd = &vmstate_macio_gpio;
+    nc->nmi_monitor_handler = macio_gpio_nmi;
+}
+
+static const TypeInfo macio_gpio_init_info = {
+    .name          = TYPE_MACIO_GPIO,
+    .parent        = TYPE_SYS_BUS_DEVICE,
+    .instance_size = sizeof(MacIOGPIOState),
+    .instance_init = macio_gpio_init,
+    .class_init    = macio_gpio_class_init,
+    .interfaces = (InterfaceInfo[]) {
+        { TYPE_NMI },
+        { }
+    },
+};
+
+static void macio_gpio_register_types(void)
+{
+    type_register_static(&macio_gpio_init_info);
+}
+
+type_init(macio_gpio_register_types)
diff --git a/hw/misc/macio/macio.c b/hw/misc/macio/macio.c
index f9a40eea81..d135e3bc2b 100644
--- a/hw/misc/macio/macio.c
+++ b/hw/misc/macio/macio.c
@@ -105,17 +105,6 @@ static void macio_common_realize(PCIDevice *d, Error **errp)
     memory_region_add_subregion(&s->bar, 0x08000,
                                 sysbus_mmio_get_region(sysbus_dev, 0));
 
-    qdev_prop_set_uint64(DEVICE(&s->cuda), "timebase-frequency",
-                         s->frequency);
-    object_property_set_bool(OBJECT(&s->cuda), true, "realized", &err);
-    if (err) {
-        error_propagate(errp, err);
-        return;
-    }
-    sysbus_dev = SYS_BUS_DEVICE(&s->cuda);
-    memory_region_add_subregion(&s->bar, 0x16000,
-                                sysbus_mmio_get_region(sysbus_dev, 0));
-
     qdev_prop_set_uint32(DEVICE(&s->escc), "disabled", 0);
     qdev_prop_set_uint32(DEVICE(&s->escc), "frequency", ESCC_CLOCK);
     qdev_prop_set_uint32(DEVICE(&s->escc), "it_shift", 4);
@@ -163,7 +152,16 @@ static void macio_oldworld_realize(PCIDevice *d, Error **errp)
         return;
     }
 
+    qdev_prop_set_uint64(DEVICE(&s->cuda), "timebase-frequency",
+                         s->frequency);
+    object_property_set_bool(OBJECT(&s->cuda), true, "realized", &err);
+    if (err) {
+        error_propagate(errp, err);
+        return;
+    }
     sysbus_dev = SYS_BUS_DEVICE(&s->cuda);
+    memory_region_add_subregion(&s->bar, 0x16000,
+                                sysbus_mmio_get_region(sysbus_dev, 0));
     sysbus_connect_irq(sysbus_dev, 0, qdev_get_gpio_in(pic_dev,
                                                        OLDWORLD_CUDA_IRQ));
 
@@ -234,6 +232,10 @@ static void macio_oldworld_init(Object *obj)
                              qdev_prop_allow_set_link_before_realize,
                              0, NULL);
 
+    object_initialize(&s->cuda, sizeof(s->cuda), TYPE_CUDA);
+    qdev_set_parent_bus(DEVICE(&s->cuda), sysbus_get_default());
+    object_property_add_child(obj, "cuda", OBJECT(&s->cuda), NULL);
+
     object_initialize(&os->nvram, sizeof(os->nvram), TYPE_MACIO_NVRAM);
     dev = DEVICE(&os->nvram);
     qdev_prop_set_uint32(dev, "size", 0x2000);
@@ -293,10 +295,6 @@ static void macio_newworld_realize(PCIDevice *d, Error **errp)
         return;
     }
 
-    sysbus_dev = SYS_BUS_DEVICE(&s->cuda);
-    sysbus_connect_irq(sysbus_dev, 0, qdev_get_gpio_in(pic_dev,
-                                                       NEWWORLD_CUDA_IRQ));
-
     sysbus_dev = SYS_BUS_DEVICE(&s->escc);
     sysbus_connect_irq(sysbus_dev, 0, qdev_get_gpio_in(pic_dev,
                                                        NEWWORLD_ESCCB_IRQ));
@@ -332,6 +330,53 @@ static void macio_newworld_realize(PCIDevice *d, Error **errp)
     memory_region_init_io(timer_memory, OBJECT(s), &timer_ops, NULL, "timer",
                           0x1000);
     memory_region_add_subregion(&s->bar, 0x15000, timer_memory);
+
+    if (ns->has_pmu) {
+        /* GPIOs */
+        sysbus_dev = SYS_BUS_DEVICE(&ns->gpio);
+        object_property_set_link(OBJECT(&ns->gpio), OBJECT(pic_dev), "pic",
+                                 &error_abort);
+        memory_region_add_subregion(&s->bar, 0x50,
+                                    sysbus_mmio_get_region(sysbus_dev, 0));
+        object_property_set_bool(OBJECT(&ns->gpio), true, "realized", &err);
+
+        /* PMU */
+        object_initialize(&s->pmu, sizeof(s->pmu), TYPE_VIA_PMU);
+        object_property_set_link(OBJECT(&s->pmu), OBJECT(sysbus_dev), "gpio",
+                                 &error_abort);
+        qdev_prop_set_bit(DEVICE(&s->pmu), "has-adb", ns->has_adb);
+        qdev_set_parent_bus(DEVICE(&s->pmu), sysbus_get_default());
+        object_property_add_child(OBJECT(s), "pmu", OBJECT(&s->pmu), NULL);
+
+        object_property_set_bool(OBJECT(&s->pmu), true, "realized", &err);
+        if (err) {
+            error_propagate(errp, err);
+            return;
+        }
+        sysbus_dev = SYS_BUS_DEVICE(&s->pmu);
+        sysbus_connect_irq(sysbus_dev, 0, qdev_get_gpio_in(pic_dev,
+                                                           NEWWORLD_PMU_IRQ));
+        memory_region_add_subregion(&s->bar, 0x16000,
+                                    sysbus_mmio_get_region(sysbus_dev, 0));
+    } else {
+        /* CUDA */
+        object_initialize(&s->cuda, sizeof(s->cuda), TYPE_CUDA);
+        qdev_set_parent_bus(DEVICE(&s->cuda), sysbus_get_default());
+        object_property_add_child(OBJECT(s), "cuda", OBJECT(&s->cuda), NULL);
+        qdev_prop_set_uint64(DEVICE(&s->cuda), "timebase-frequency",
+                             s->frequency);
+
+        object_property_set_bool(OBJECT(&s->cuda), true, "realized", &err);
+        if (err) {
+            error_propagate(errp, err);
+            return;
+        }
+        sysbus_dev = SYS_BUS_DEVICE(&s->cuda);
+        sysbus_connect_irq(sysbus_dev, 0, qdev_get_gpio_in(pic_dev,
+                                                           NEWWORLD_CUDA_IRQ));
+        memory_region_add_subregion(&s->bar, 0x16000,
+                                    sysbus_mmio_get_region(sysbus_dev, 0));
+    }
 }
 
 static void macio_newworld_init(Object *obj)
@@ -345,6 +390,9 @@ static void macio_newworld_init(Object *obj)
                              qdev_prop_allow_set_link_before_realize,
                              0, NULL);
 
+    object_initialize(&ns->gpio, sizeof(ns->gpio), TYPE_MACIO_GPIO);
+    qdev_set_parent_bus(DEVICE(&ns->gpio), sysbus_get_default());
+
     for (i = 0; i < 2; i++) {
         macio_init_ide(s, &ns->ide[i], sizeof(ns->ide[i]), i);
     }
@@ -356,10 +404,6 @@ static void macio_instance_init(Object *obj)
 
     memory_region_init(&s->bar, obj, "macio", 0x80000);
 
-    object_initialize(&s->cuda, sizeof(s->cuda), TYPE_CUDA);
-    qdev_set_parent_bus(DEVICE(&s->cuda), sysbus_get_default());
-    object_property_add_child(obj, "cuda", OBJECT(&s->cuda), NULL);
-
     object_initialize(&s->dbdma, sizeof(s->dbdma), TYPE_MAC_DBDMA);
     qdev_set_parent_bus(DEVICE(&s->dbdma), sysbus_get_default());
     object_property_add_child(obj, "dbdma", OBJECT(&s->dbdma), NULL);
@@ -399,6 +443,12 @@ static const VMStateDescription vmstate_macio_newworld = {
     }
 };
 
+static Property macio_newworld_properties[] = {
+    DEFINE_PROP_BOOL("has-pmu", NewWorldMacIOState, has_pmu, false),
+    DEFINE_PROP_BOOL("has-adb", NewWorldMacIOState, has_adb, false),
+    DEFINE_PROP_END_OF_LIST()
+};
+
 static void macio_newworld_class_init(ObjectClass *oc, void *data)
 {
     PCIDeviceClass *pdc = PCI_DEVICE_CLASS(oc);
@@ -407,6 +457,7 @@ static void macio_newworld_class_init(ObjectClass *oc, void *data)
     pdc->realize = macio_newworld_realize;
     pdc->device_id = PCI_DEVICE_ID_APPLE_UNI_N_KEYL;
     dc->vmsd = &vmstate_macio_newworld;
+    dc->props = macio_newworld_properties;
 }
 
 static Property macio_properties[] = {
diff --git a/hw/misc/macio/pmu.c b/hw/misc/macio/pmu.c
new file mode 100644
index 0000000000..e246b0fd41
--- /dev/null
+++ b/hw/misc/macio/pmu.c
@@ -0,0 +1,871 @@
+/*
+ * QEMU PowerMac PMU device support
+ *
+ * Copyright (c) 2016 Benjamin Herrenschmidt, IBM Corp.
+ * Copyright (c) 2018 Mark Cave-Ayland
+ *
+ * Based on the CUDA device by:
+ *
+ * Copyright (c) 2004-2007 Fabrice Bellard
+ * Copyright (c) 2007 Jocelyn Mayer
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "hw/hw.h"
+#include "hw/ppc/mac.h"
+#include "hw/input/adb.h"
+#include "hw/misc/mos6522.h"
+#include "hw/misc/macio/gpio.h"
+#include "hw/misc/macio/pmu.h"
+#include "qemu/timer.h"
+#include "sysemu/sysemu.h"
+#include "qemu/cutils.h"
+#include "qemu/log.h"
+#include "trace.h"
+
+
+/* Bits in B data register: all active low */
+#define TACK    0x08    /* Transfer request (input) */
+#define TREQ    0x10    /* Transfer acknowledge (output) */
+
+/* The PMU RTC counts seconds from Jan 1, 1904, not the Unix epoch (1970) */
+#define RTC_OFFSET                      2082844800
+
+#define VIA_TIMER_FREQ (4700000 / 6)
+
+static void via_update_irq(PMUState *s)
+{
+    MOS6522PMUState *mps = MOS6522_PMU(&s->mos6522_pmu);
+    MOS6522State *ms = MOS6522(mps);
+
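+    /* Only the shift register and timer interrupts of the embedded 6522
+     * are forwarded to the PMU's VIA IRQ line.
+     */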
+    bool new_state = !!(ms->ifr & ms->ier & (SR_INT | T1_INT | T2_INT));
+
+    if (new_state != s->via_irq_state) {
+        s->via_irq_state = new_state;
+        qemu_set_irq(s->via_irq, new_state);
+    }
+}
+
+static void via_set_sr_int(void *opaque)
+{
+    PMUState *s = opaque;
+    MOS6522PMUState *mps = MOS6522_PMU(&s->mos6522_pmu);
+    MOS6522State *ms = MOS6522(mps);
+    MOS6522DeviceClass *mdc = MOS6522_DEVICE_GET_CLASS(ms);
+
+    mdc->set_sr_int(ms);
+}
+
+static void pmu_update_extirq(PMUState *s)
+{
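+    /* The PMU's external interrupt is wired to MacIO GPIO 1, which is
+     * treated as active low (see macio_set_gpio), so drive the line low
+     * while any unmasked interrupt bit is pending.
+     */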
+    if ((s->intbits & s->intmask) != 0) {
+        macio_set_gpio(s->gpio, 1, false);
+    } else {
+        macio_set_gpio(s->gpio, 1, true);
+    }
+}
+
+static void pmu_adb_poll(void *opaque)
+{
+    PMUState *s = opaque;
+    int olen;
+
+    if (!(s->intbits & PMU_INT_ADB)) {
+        olen = adb_poll(&s->adb_bus, s->adb_reply, s->adb_poll_mask);
+        trace_pmu_adb_poll(olen);
+
+        if (olen > 0) {
+            s->adb_reply_size = olen;
+            s->intbits |= PMU_INT_ADB | PMU_INT_ADB_AUTO;
+            pmu_update_extirq(s);
+        }
+    }
+
+    timer_mod(s->adb_poll_timer,
+              qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + 30);
+}
+
+static void pmu_one_sec_timer(void *opaque)
+{
+    PMUState *s = opaque;
+
+    trace_pmu_one_sec_timer();
+
+    s->intbits |= PMU_INT_TICK;
+    pmu_update_extirq(s);
+    s->one_sec_target += 1000;
+
+    timer_mod(s->one_sec_timer, s->one_sec_target);
+}
+
+static void pmu_cmd_int_ack(PMUState *s,
+                            const uint8_t *in_data, uint8_t in_len,
+                            uint8_t *out_data, uint8_t *out_len)
+{
+    if (in_len != 0) {
+        qemu_log_mask(LOG_GUEST_ERROR,
+                      "PMU: INT_ACK command, invalid len: %d want: 0\n",
+                      in_len);
+        return;
+    }
+
+    /* Make appropriate reply packet */
+    if (s->intbits & PMU_INT_ADB) {
+        if (!s->adb_reply_size) {
+            qemu_log_mask(LOG_GUEST_ERROR,
+                          "Odd, PMU_INT_ADB set with no reply in buffer\n");
+        }
+
+        memcpy(out_data + 1, s->adb_reply, s->adb_reply_size);
+        out_data[0] = s->intbits & (PMU_INT_ADB | PMU_INT_ADB_AUTO);
+        *out_len = s->adb_reply_size + 1;
+        s->intbits &= ~(PMU_INT_ADB | PMU_INT_ADB_AUTO);
+        s->adb_reply_size = 0;
+    } else {
+        out_data[0] = s->intbits;
+        s->intbits = 0;
+        *out_len = 1;
+    }
+
+    pmu_update_extirq(s);
+}
+
+static void pmu_cmd_set_int_mask(PMUState *s,
+                                 const uint8_t *in_data, uint8_t in_len,
+                                 uint8_t *out_data, uint8_t *out_len)
+{
+    if (in_len != 1) {
+        qemu_log_mask(LOG_GUEST_ERROR,
+                      "PMU: SET_INT_MASK command, invalid len: %d want: 1\n",
+                      in_len);
+        return;
+    }
+
+    trace_pmu_cmd_set_int_mask(s->intmask);
+    s->intmask = in_data[0];
+
+    pmu_update_extirq(s);
+}
+
+static void pmu_cmd_set_adb_autopoll(PMUState *s, uint16_t mask)
+{
+    trace_pmu_cmd_set_adb_autopoll(mask);
+
+    if (s->autopoll_mask == mask) {
+        return;
+    }
+
+    s->autopoll_mask = mask;
+    if (mask) {
+        timer_mod(s->adb_poll_timer,
+                  qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + 30);
+    } else {
+        timer_del(s->adb_poll_timer);
+    }
+}
+
+static void pmu_cmd_adb(PMUState *s,
+                        const uint8_t *in_data, uint8_t in_len,
+                        uint8_t *out_data, uint8_t *out_len)
+{
+    int len, adblen;
+    uint8_t adb_cmd[255];
+
+    if (in_len < 2) {
+        qemu_log_mask(LOG_GUEST_ERROR,
+                      "PMU: ADB PACKET, invalid len: %d want at least 2\n",
+                      in_len);
+        return;
+    }
+
+    *out_len = 0;
+
+    if (!s->has_adb) {
+        trace_pmu_cmd_adb_nobus();
+        return;
+    }
+
+    /* Set autopoll is a special form of the command */
+    if (in_data[0] == 0 && in_data[1] == 0x86) {
+        uint16_t mask;
+
+        if (in_len != 4) {
+            qemu_log_mask(LOG_GUEST_ERROR,
+                          "PMU: ADB Autopoll requires 4 bytes, got %d\n",
+                          in_len);
+            return;
+        }
+
+        mask = (in_data[2] << 8) | in_data[3];
+        pmu_cmd_set_adb_autopoll(s, mask);
+        return;
+    }
+
+    trace_pmu_cmd_adb_request(in_len, in_data[0], in_data[1], in_data[2],
+                              in_data[3], in_data[4]);
+
+    /* Check ADB len */
+    adblen = in_data[2];
+    if (adblen > (in_len - 3)) {
+        qemu_log_mask(LOG_GUEST_ERROR,
+                      "PMU: ADB len is %d > %d (in_len -3)...erroring\n",
+                      adblen, in_len - 3);
+        len = -1;
+    } else if (adblen > 252) {
+        qemu_log_mask(LOG_GUEST_ERROR, "PMU: ADB command too big!\n");
+        len = -1;
+    } else {
+        /* Format command */
+        adb_cmd[0] = in_data[0];
+        memcpy(&adb_cmd[1], &in_data[3], in_len - 3);
+        len = adb_request(&s->adb_bus, s->adb_reply + 2, adb_cmd, in_len - 2);
+
+        trace_pmu_cmd_adb_reply(len);
+    }
+
+    if (len > 0) {
+        /* XXX Check this */
+        s->adb_reply_size = len + 2;
+        s->adb_reply[0] = 0x01;
+        s->adb_reply[1] = len;
+    } else {
+        /* XXX Check this */
+        s->adb_reply_size = 1;
+        s->adb_reply[0] = 0x00;
+    }
+
+    s->intbits |= PMU_INT_ADB;
+    pmu_update_extirq(s);
+}
+
+static void pmu_cmd_adb_poll_off(PMUState *s,
+                                 const uint8_t *in_data, uint8_t in_len,
+                                 uint8_t *out_data, uint8_t *out_len)
+{
+    if (in_len != 0) {
+        qemu_log_mask(LOG_GUEST_ERROR,
+                      "PMU: ADB POLL OFF command, invalid len: %d want: 0\n",
+                      in_len);
+        return;
+    }
+
+    if (s->has_adb && s->autopoll_mask) {
+        timer_del(s->adb_poll_timer);
+        s->autopoll_mask = 0;
+    }
+}
+
+static void pmu_cmd_shutdown(PMUState *s,
+                             const uint8_t *in_data, uint8_t in_len,
+                             uint8_t *out_data, uint8_t *out_len)
+{
+    if (in_len != 4) {
+        qemu_log_mask(LOG_GUEST_ERROR,
+                      "PMU: SHUTDOWN command, invalid len: %d want: 4\n",
+                      in_len);
+        return;
+    }
+
+    *out_len = 1;
+    out_data[0] = 0;
+
+    if (in_data[0] != 'M' || in_data[1] != 'A' || in_data[2] != 'T' ||
+        in_data[3] != 'T') {
+
+        qemu_log_mask(LOG_GUEST_ERROR,
+                      "PMU: SHUTDOWN command, Bad MATT signature\n");
+        return;
+    }
+
+    qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
+}
+
+static void pmu_cmd_reset(PMUState *s,
+                          const uint8_t *in_data, uint8_t in_len,
+                          uint8_t *out_data, uint8_t *out_len)
+{
+    if (in_len != 0) {
+        qemu_log_mask(LOG_GUEST_ERROR,
+                      "PMU: RESET command, invalid len: %d want: 0\n",
+                      in_len);
+        return;
+    }
+
+    qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
+}
+
+static void pmu_cmd_get_rtc(PMUState *s,
+                            const uint8_t *in_data, uint8_t in_len,
+                            uint8_t *out_data, uint8_t *out_len)
+{
+    uint32_t ti;
+
+    if (in_len != 0) {
+        qemu_log_mask(LOG_GUEST_ERROR,
+                      "PMU: GET_RTC command, invalid len: %d want: 0\n",
+                      in_len);
+        return;
+    }
+
+    ti = s->tick_offset + (qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL)
+                           / NANOSECONDS_PER_SECOND);
+    out_data[0] = ti >> 24;
+    out_data[1] = ti >> 16;
+    out_data[2] = ti >> 8;
+    out_data[3] = ti;
+    *out_len = 4;
+}
+
+static void pmu_cmd_set_rtc(PMUState *s,
+                            const uint8_t *in_data, uint8_t in_len,
+                            uint8_t *out_data, uint8_t *out_len)
+{
+    uint32_t ti;
+
+    if (in_len != 4) {
+        qemu_log_mask(LOG_GUEST_ERROR,
+                      "PMU: SET_RTC command, invalid len: %d want: 4\n",
+                      in_len);
+        return;
+    }
+
+    ti = (((uint32_t)in_data[0]) << 24) + (((uint32_t)in_data[1]) << 16)
+         + (((uint32_t)in_data[2]) << 8) + in_data[3];
+
+    s->tick_offset = ti - (qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL)
+                           / NANOSECONDS_PER_SECOND);
+}
+
+static void pmu_cmd_system_ready(PMUState *s,
+                                 const uint8_t *in_data, uint8_t in_len,
+                                 uint8_t *out_data, uint8_t *out_len)
+{
+    /* Do nothing */
+}
+
+static void pmu_cmd_get_version(PMUState *s,
+                                const uint8_t *in_data, uint8_t in_len,
+                                uint8_t *out_data, uint8_t *out_len)
+{
+    *out_len = 1;
+    *out_data = 1; /* ??? Check what Apple does */
+}
+
+static void pmu_cmd_power_events(PMUState *s,
+                                 const uint8_t *in_data, uint8_t in_len,
+                                 uint8_t *out_data, uint8_t *out_len)
+{
+    if (in_len < 1) {
+        qemu_log_mask(LOG_GUEST_ERROR,
+                      "PMU: POWER EVENTS command, invalid len %d, want at least 1\n",
+                      in_len);
+        return;
+    }
+
+    switch (in_data[0]) {
+    /* Dummies for now */
+    case PMU_PWR_GET_POWERUP_EVENTS:
+        *out_len = 2;
+        out_data[0] = 0;
+        out_data[1] = 0;
+        break;
+    case PMU_PWR_SET_POWERUP_EVENTS:
+    case PMU_PWR_CLR_POWERUP_EVENTS:
+        break;
+    case PMU_PWR_GET_WAKEUP_EVENTS:
+        *out_len = 2;
+        out_data[0] = 0;
+        out_data[1] = 0;
+        break;
+    case PMU_PWR_SET_WAKEUP_EVENTS:
+    case PMU_PWR_CLR_WAKEUP_EVENTS:
+        break;
+    default:
+        qemu_log_mask(LOG_GUEST_ERROR,
+                      "PMU: POWER EVENTS unknown subcommand 0x%02x\n",
+                      in_data[0]);
+    }
+}
+
+static void pmu_cmd_get_cover(PMUState *s,
+                              const uint8_t *in_data, uint8_t in_len,
+                              uint8_t *out_data, uint8_t *out_len)
+{
+    /* Not 100% sure here; we would have to check what a real Mac returns.
+     * All we know is that byte 0 bit 0 means "lid closed" on laptops.
+     */
+    *out_len = 1;
+    *out_data = 0x00;
+}
+
+static void pmu_cmd_download_status(PMUState *s,
+                                    const uint8_t *in_data, uint8_t in_len,
+                                    uint8_t *out_data, uint8_t *out_len)
+{
+    /* This has to do with PMU firmware updates, as far as I can tell.
+     *
+     * We return 0x62, which is what OpenPMU expects.
+     */
+    *out_len = 1;
+    *out_data = 0x62;
+}
+
+static void pmu_cmd_read_pmu_ram(PMUState *s,
+                                 const uint8_t *in_data, uint8_t in_len,
+                                 uint8_t *out_data, uint8_t *out_len)
+{
+    if (in_len < 3) {
+        qemu_log_mask(LOG_GUEST_ERROR,
+                      "PMU: READ_PMU_RAM command, invalid len %d, expected 3\n",
+                      in_len);
+        return;
+    }
+
+    qemu_log_mask(LOG_GUEST_ERROR,
+                  "PMU: Unsupported READ_PMU_RAM, args: %02x %02x %02x\n",
+                  in_data[0], in_data[1], in_data[2]);
+
+    *out_len = 0;
+}
+
+/* description of commands */
+typedef struct PMUCmdHandler {
+    uint8_t command;
+    const char *name;
+    void (*handler)(PMUState *s,
+                    const uint8_t *in_args, uint8_t in_len,
+                    uint8_t *out_args, uint8_t *out_len);
+} PMUCmdHandler;
+
+static const PMUCmdHandler PMUCmdHandlers[] = {
+    { PMU_INT_ACK, "INT ACK", pmu_cmd_int_ack },
+    { PMU_SET_INTR_MASK, "SET INT MASK", pmu_cmd_set_int_mask },
+    { PMU_ADB_CMD, "ADB COMMAND", pmu_cmd_adb },
+    { PMU_ADB_POLL_OFF, "ADB POLL OFF", pmu_cmd_adb_poll_off },
+    { PMU_RESET, "REBOOT", pmu_cmd_reset },
+    { PMU_SHUTDOWN, "SHUTDOWN", pmu_cmd_shutdown },
+    { PMU_READ_RTC, "GET RTC", pmu_cmd_get_rtc },
+    { PMU_SET_RTC, "SET RTC", pmu_cmd_set_rtc },
+    { PMU_SYSTEM_READY, "SYSTEM READY", pmu_cmd_system_ready },
+    { PMU_GET_VERSION, "GET VERSION", pmu_cmd_get_version },
+    { PMU_POWER_EVENTS, "POWER EVENTS", pmu_cmd_power_events },
+    { PMU_GET_COVER, "GET_COVER", pmu_cmd_get_cover },
+    { PMU_DOWNLOAD_STATUS, "DOWNLOAD STATUS", pmu_cmd_download_status },
+    { PMU_READ_PMU_RAM, "READ PMGR RAM", pmu_cmd_read_pmu_ram },
+};
+
+static void pmu_dispatch_cmd(PMUState *s)
+{
+    unsigned int i;
+
+    /* No response by default */
+    s->cmd_rsp_sz = 0;
+
+    for (i = 0; i < ARRAY_SIZE(PMUCmdHandlers); i++) {
+        const PMUCmdHandler *desc = &PMUCmdHandlers[i];
+
+        if (desc->command != s->cmd) {
+            continue;
+        }
+
+        trace_pmu_dispatch_cmd(desc->name);
+        desc->handler(s, s->cmd_buf, s->cmd_buf_pos,
+                      s->cmd_rsp, &s->cmd_rsp_sz);
+
+        if (s->rsplen != -1 && s->rsplen != s->cmd_rsp_sz) {
+            trace_pmu_debug_protocol_string("QEMU internal cmd resp mismatch!");
+        } else {
+            trace_pmu_debug_protocol_resp_size(s->cmd_rsp_sz);
+        }
+
+        return;
+    }
+
+    trace_pmu_dispatch_unknown_cmd(s->cmd);
+
+    /* Manufacture fake response with 0's */
+    if (s->rsplen == -1) {
+        s->cmd_rsp_sz = 0;
+    } else {
+        s->cmd_rsp_sz = s->rsplen;
+        memset(s->cmd_rsp, 0, s->rsplen);
+    }
+}
+
+static void pmu_update(PMUState *s)
+{
+    MOS6522PMUState *mps = &s->mos6522_pmu;
+    MOS6522State *ms = MOS6522(mps);
+
+    /* Only react to changes in reg B */
+    if (ms->b == s->last_b) {
+        return;
+    }
+    s->last_b = ms->b;
+
+    /* Check the TREQ / TACK state */
+    switch (ms->b & (TREQ | TACK)) {
+    case TREQ:
+        /* This is an ack release, handle it and bail out */
+        ms->b |= TACK;
+        s->last_b = ms->b;
+
+        trace_pmu_debug_protocol_string("handshake: TREQ high, setting TACK");
+        return;
+    case TACK:
+        /* This is a valid request, handle below */
+        break;
+    case TREQ | TACK:
+        /* This is an idle state */
+        return;
+    default:
+        /* Invalid state, log and ignore */
+        trace_pmu_debug_protocol_error(ms->b);
+        return;
+    }
+
+    /* If we wanted to handle commands asynchronously, this is where
+     * we would delay the clearing of TACK until we are ready to send
+     * the response
+     */
+
+    /* We have a request, handshake TACK so we don't stay in
+     * an invalid state. If we were concurrent with the OS we
+     * should only do this after we grabbed the SR but that isn't
+     * a problem here.
+     */
+
+    trace_pmu_debug_protocol_clear_treq(s->cmd_state);
+
+    ms->b &= ~TACK;
+    s->last_b = ms->b;
+
+    /* Act according to state */
+    switch (s->cmd_state) {
+    case pmu_state_idle:
+        if (!(ms->acr & SR_OUT)) {
+            trace_pmu_debug_protocol_string("protocol error! "
+                                            "state idle, ACR reading");
+            break;
+        }
+
+        s->cmd = ms->sr;
+        via_set_sr_int(s);
+        s->cmdlen = pmu_data_len[s->cmd][0];
+        s->rsplen = pmu_data_len[s->cmd][1];
+        s->cmd_buf_pos = 0;
+        s->cmd_rsp_pos = 0;
+        s->cmd_state = pmu_state_cmd;
+
+        trace_pmu_debug_protocol_cmd(s->cmd, s->cmdlen, s->rsplen);
+        break;
+
+    case pmu_state_cmd:
+        if (!(ms->acr & SR_OUT)) {
+            trace_pmu_debug_protocol_string("protocol error! "
+                                            "state cmd, ACR reading");
+            break;
+        }
+
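+        /* A command length of -1 in pmu_data_len marks variable-length
+         * commands, which supply their own length as the next byte.
+         */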
+        if (s->cmdlen == -1) {
+            trace_pmu_debug_protocol_cmdlen(ms->sr);
+
+            s->cmdlen = ms->sr;
+            if (s->cmdlen > sizeof(s->cmd_buf)) {
+                trace_pmu_debug_protocol_cmd_toobig(s->cmdlen);
+            }
+        } else if (s->cmd_buf_pos < sizeof(s->cmd_buf)) {
+            s->cmd_buf[s->cmd_buf_pos++] = ms->sr;
+        }
+
+        via_set_sr_int(s);
+        break;
+
+    case pmu_state_rsp:
+        if (ms->acr & SR_OUT) {
+            trace_pmu_debug_protocol_string("protocol error! "
+                                            "state resp, ACR writing");
+            break;
+        }
+
+        if (s->rsplen == -1) {
+            trace_pmu_debug_protocol_cmd_send_resp_size(s->cmd_rsp_sz);
+
+            ms->sr = s->cmd_rsp_sz;
+            s->rsplen = s->cmd_rsp_sz;
+        } else if (s->cmd_rsp_pos < s->cmd_rsp_sz) {
+            trace_pmu_debug_protocol_cmd_send_resp(s->cmd_rsp_pos, s->rsplen);
+
+            ms->sr = s->cmd_rsp[s->cmd_rsp_pos++];
+        }
+
+        via_set_sr_int(s);
+        break;
+    }
+
+    /* Check for state completion */
+    if (s->cmd_state == pmu_state_cmd && s->cmdlen == s->cmd_buf_pos) {
+        trace_pmu_debug_protocol_string("Command reception complete, "
+                                        "dispatching...");
+
+        pmu_dispatch_cmd(s);
+        s->cmd_state = pmu_state_rsp;
+    }
+
+    if (s->cmd_state == pmu_state_rsp && s->rsplen == s->cmd_rsp_pos) {
+        trace_pmu_debug_protocol_cmd_resp_complete(ms->ier);
+
+        s->cmd_state = pmu_state_idle;
+    }
+}
+
+static uint64_t mos6522_pmu_read(void *opaque, hwaddr addr, unsigned size)
+{
+    PMUState *s = opaque;
+    MOS6522PMUState *mps = &s->mos6522_pmu;
+    MOS6522State *ms = MOS6522(mps);
+
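+    /* The 16 VIA registers are spaced 0x200 bytes apart within the 0x2000
+     * "via-pmu" region, hence the shift by 9 to get the register index.
+     */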
+    addr = (addr >> 9) & 0xf;
+    return mos6522_read(ms, addr, size);
+}
+
+static void mos6522_pmu_write(void *opaque, hwaddr addr, uint64_t val,
+                              unsigned size)
+{
+    PMUState *s = opaque;
+    MOS6522PMUState *mps = &s->mos6522_pmu;
+    MOS6522State *ms = MOS6522(mps);
+
+    addr = (addr >> 9) & 0xf;
+    mos6522_write(ms, addr, val, size);
+}
+
+static const MemoryRegionOps mos6522_pmu_ops = {
+    .read = mos6522_pmu_read,
+    .write = mos6522_pmu_write,
+    .endianness = DEVICE_BIG_ENDIAN,
+    .impl = {
+        .min_access_size = 1,
+        .max_access_size = 1,
+    },
+};
+
+static bool pmu_adb_state_needed(void *opaque)
+{
+    PMUState *s = opaque;
+
+    return s->has_adb;
+}
+
+static const VMStateDescription vmstate_pmu_adb = {
+    .name = "pmu/adb",
+    .version_id = 0,
+    .minimum_version_id = 0,
+    .needed = pmu_adb_state_needed,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT16(adb_poll_mask, PMUState),
+        VMSTATE_TIMER_PTR(adb_poll_timer, PMUState),
+        VMSTATE_UINT8(adb_reply_size, PMUState),
+        VMSTATE_BUFFER(adb_reply, PMUState),
+    }
+};
+
+static const VMStateDescription vmstate_pmu = {
+    .name = "pmu",
+    .version_id = 0,
+    .minimum_version_id = 0,
+    .fields = (VMStateField[]) {
+        VMSTATE_STRUCT(mos6522_pmu.parent_obj, PMUState, 0, vmstate_mos6522,
+                       MOS6522State),
+        VMSTATE_UINT8(last_b, PMUState),
+        VMSTATE_UINT8(cmd, PMUState),
+        VMSTATE_UINT32(cmdlen, PMUState),
+        VMSTATE_UINT32(rsplen, PMUState),
+        VMSTATE_UINT8(cmd_buf_pos, PMUState),
+        VMSTATE_BUFFER(cmd_buf, PMUState),
+        VMSTATE_UINT8(cmd_rsp_pos, PMUState),
+        VMSTATE_UINT8(cmd_rsp_sz, PMUState),
+        VMSTATE_BUFFER(cmd_rsp, PMUState),
+        VMSTATE_UINT8(intbits, PMUState),
+        VMSTATE_UINT8(intmask, PMUState),
+        VMSTATE_UINT8(autopoll_rate_ms, PMUState),
+        VMSTATE_UINT8(autopoll_mask, PMUState),
+        VMSTATE_UINT32(tick_offset, PMUState),
+        VMSTATE_TIMER_PTR(one_sec_timer, PMUState),
+        VMSTATE_INT64(one_sec_target, PMUState),
+        VMSTATE_END_OF_LIST()
+    },
+    .subsections = (const VMStateDescription * []) {
+        &vmstate_pmu_adb,
+    }
+};
+
+static void pmu_reset(DeviceState *dev)
+{
+    PMUState *s = VIA_PMU(dev);
+
+    /* OpenBIOS needs to do this? MacOS 9 needs it */
+    s->intmask = PMU_INT_ADB | PMU_INT_TICK;
+    s->intbits = 0;
+
+    s->cmd_state = pmu_state_idle;
+    s->autopoll_mask = 0;
+}
+
+static void pmu_realize(DeviceState *dev, Error **errp)
+{
+    PMUState *s = VIA_PMU(dev);
+    SysBusDevice *sbd;
+    MOS6522State *ms;
+    DeviceState *d;
+    struct tm tm;
+
+    /* Pass IRQ from 6522 */
+    d = DEVICE(&s->mos6522_pmu);
+    ms = MOS6522(d);
+    sbd = SYS_BUS_DEVICE(s);
+    sysbus_pass_irq(sbd, SYS_BUS_DEVICE(ms));
+
+    qemu_get_timedate(&tm, 0);
+    s->tick_offset = (uint32_t)mktimegm(&tm) + RTC_OFFSET;
+    s->one_sec_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL, pmu_one_sec_timer, s);
+    s->one_sec_target = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + 1000;
+    timer_mod(s->one_sec_timer, s->one_sec_target);
+
+    if (s->has_adb) {
+        qbus_create_inplace(&s->adb_bus, sizeof(s->adb_bus), TYPE_ADB_BUS,
+                            DEVICE(dev), "adb.0");
+        s->adb_poll_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL, pmu_adb_poll, s);
+        s->adb_poll_mask = 0xffff;
+        s->autopoll_rate_ms = 20;
+    }
+}
+
+static void pmu_init(Object *obj)
+{
+    SysBusDevice *d = SYS_BUS_DEVICE(obj);
+    PMUState *s = VIA_PMU(obj);
+
+    object_property_add_link(obj, "gpio", TYPE_MACIO_GPIO,
+                             (Object **) &s->gpio,
+                             qdev_prop_allow_set_link_before_realize,
+                             0, NULL);
+
+    object_initialize(&s->mos6522_pmu, sizeof(s->mos6522_pmu),
+                      TYPE_MOS6522_PMU);
+    qdev_set_parent_bus(DEVICE(&s->mos6522_pmu), sysbus_get_default());
+
+    memory_region_init_io(&s->mem, obj, &mos6522_pmu_ops, s, "via-pmu",
+                          0x2000);
+    sysbus_init_mmio(d, &s->mem);
+}
+
+static Property pmu_properties[] = {
+    DEFINE_PROP_BOOL("has-adb", PMUState, has_adb, true),
+    DEFINE_PROP_END_OF_LIST()
+};
+
+static void pmu_class_init(ObjectClass *oc, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(oc);
+
+    dc->realize = pmu_realize;
+    dc->reset = pmu_reset;
+    dc->vmsd = &vmstate_pmu;
+    dc->props = pmu_properties;
+    set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories);
+}
+
+static const TypeInfo pmu_type_info = {
+    .name = TYPE_VIA_PMU,
+    .parent = TYPE_SYS_BUS_DEVICE,
+    .instance_size = sizeof(PMUState),
+    .instance_init = pmu_init,
+    .class_init = pmu_class_init,
+};
+
+static void mos6522_pmu_portB_write(MOS6522State *s)
+{
+    MOS6522PMUState *mps = container_of(s, MOS6522PMUState, parent_obj);
+    PMUState *ps = container_of(mps, PMUState, mos6522_pmu);
+
+    if ((s->pcr & 0xe0) == 0x20 || (s->pcr & 0xe0) == 0x60) {
+        s->ifr &= ~CB2_INT;
+    }
+    s->ifr &= ~CB1_INT;
+
+    via_update_irq(ps);
+    pmu_update(ps);
+}
+
+static void mos6522_pmu_portA_write(MOS6522State *s)
+{
+    MOS6522PMUState *mps = container_of(s, MOS6522PMUState, parent_obj);
+    PMUState *ps = container_of(mps, PMUState, mos6522_pmu);
+
+    if ((s->pcr & 0x0e) == 0x02 || (s->pcr & 0x0e) == 0x06) {
+        s->ifr &= ~CA2_INT;
+    }
+    s->ifr &= ~CA1_INT;
+
+    via_update_irq(ps);
+}
+
+static void mos6522_pmu_reset(DeviceState *dev)
+{
+    MOS6522State *ms = MOS6522(dev);
+    MOS6522PMUState *mps = container_of(ms, MOS6522PMUState, parent_obj);
+    PMUState *s = container_of(mps, PMUState, mos6522_pmu);
+    MOS6522DeviceClass *mdc = MOS6522_DEVICE_GET_CLASS(ms);
+
+    mdc->parent_reset(dev);
+
+    ms->timers[0].frequency = VIA_TIMER_FREQ;
+    ms->timers[1].frequency = (SCALE_US * 6000) / 4700;
+
+    s->last_b = ms->b = TACK | TREQ;
+}
+
+static void mos6522_pmu_class_init(ObjectClass *oc, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(oc);
+    MOS6522DeviceClass *mdc = MOS6522_DEVICE_CLASS(oc);
+
+    dc->reset = mos6522_pmu_reset;
+    mdc->portB_write = mos6522_pmu_portB_write;
+    mdc->portA_write = mos6522_pmu_portA_write;
+}
+
+static const TypeInfo mos6522_pmu_type_info = {
+    .name = TYPE_MOS6522_PMU,
+    .parent = TYPE_MOS6522,
+    .instance_size = sizeof(MOS6522PMUState),
+    .class_init = mos6522_pmu_class_init,
+};
+
+static void pmu_register_types(void)
+{
+    type_register_static(&pmu_type_info);
+    type_register_static(&mos6522_pmu_type_info);
+}
+
+type_init(pmu_register_types)
diff --git a/hw/misc/macio/trace-events b/hw/misc/macio/trace-events
index d499d78c99..05019262fa 100644
--- a/hw/misc/macio/trace-events
+++ b/hw/misc/macio/trace-events
@@ -13,3 +13,31 @@ cuda_packet_send_data(int i, const uint8_t data) "[%d] 0x%02x"
 # hw/misc/macio/macio.c
 macio_timer_write(uint64_t addr, unsigned len, uint64_t val) "write addr 0x%"PRIx64 " len %d val 0x%"PRIx64
 macio_timer_read(uint64_t addr, unsigned len, uint32_t val) "read addr 0x%"PRIx64 " len %d val 0x%"PRIx32
+
+# hw/misc/macio/gpio.c
+macio_set_gpio(int gpio, bool state) "setting GPIO %d to %d"
+macio_gpio_irq_assert(int gpio) "asserting GPIO %d"
+macio_gpio_irq_deassert(int gpio) "deasserting GPIO %d"
+macio_gpio_write(uint64_t addr, uint64_t val) "addr: 0x%"PRIx64" value: 0x%"PRIx64
+macio_gpio_read(uint64_t addr, uint64_t val) "addr: 0x%"PRIx64" value: 0x%"PRIx64
+
+# hw/misc/macio/pmu.c
+pmu_adb_poll(int olen) "ADB autopoll, olen=%d"
+pmu_one_sec_timer(void) "PMU one sec..."
+pmu_cmd_set_int_mask(int intmask) "Setting PMU int mask to 0x%02x"
+pmu_cmd_set_adb_autopoll(int mask) "ADB set autopoll, mask=0x%04x"
+pmu_cmd_adb_nobus(void) "ADB PACKET with no ADB bus!"
+pmu_cmd_adb_request(int inlen, int indata0, int indata1, int indata2, int indata3, int indata4) "ADB request: len=%d, cmd=0x%02x, pflags=0x%02x, adblen=%d: 0x%02x 0x%02x..."
+pmu_cmd_adb_reply(int len) "ADB reply is %d bytes"
+pmu_dispatch_cmd(const char *name) "handling command %s"
+pmu_dispatch_unknown_cmd(int cmd) "Unknown PMU command 0x%02x"
+pmu_debug_protocol_string(const char *str) "%s"
+pmu_debug_protocol_resp_size(int size) "sending %d resp bytes"
+pmu_debug_protocol_error(int portB) "protocol error! portB=0x%02x"
+pmu_debug_protocol_clear_treq(int state) "TREQ cleared, clearing TACK, state: %d"
+pmu_debug_protocol_cmd(int cmd, int cmdlen, int rsplen) "Got command byte 0x%02x, clen=%d, rlen=%d"
+pmu_debug_protocol_cmdlen(int len) "got cmd length byte: %d"
+pmu_debug_protocol_cmd_toobig(int len) "command too big (%d bytes)"
+pmu_debug_protocol_cmd_send_resp_size(int len) "sending length byte: %d"
+pmu_debug_protocol_cmd_send_resp(int pos, int len) "sending byte: %d/%d"
+pmu_debug_protocol_cmd_resp_complete(int ier) "Response send complete. IER=0x%02x"
diff --git a/hw/misc/mos6522.c b/hw/misc/mos6522.c
index 44eb306cf1..14cff26c61 100644
--- a/hw/misc/mos6522.c
+++ b/hw/misc/mos6522.c
@@ -40,7 +40,7 @@ static void mos6522_timer_update(MOS6522State *s, MOS6522Timer *ti,
 
 static void mos6522_update_irq(MOS6522State *s)
 {
-    if (s->ifr & s->ier & (SR_INT | T1_INT | T2_INT)) {
+    if (s->ifr & s->ier) {
         qemu_irq_raise(s->irq);
     } else {
         qemu_irq_lower(s->irq);
@@ -241,7 +241,7 @@ uint64_t mos6522_read(void *opaque, hwaddr addr, unsigned size)
         break;
     case VIA_REG_SR:
         val = s->sr;
-        s->ifr &= ~(SR_INT | CB1_INT | CB2_INT);
+        s->ifr &= ~SR_INT;
         mos6522_update_irq(s);
         break;
     case VIA_REG_ACR:
@@ -463,6 +463,7 @@ static void mos6522_class_init(ObjectClass *oc, void *data)
     mdc->set_sr_int = mos6522_set_sr_int;
     mdc->portB_write = mos6522_portB_write;
     mdc->portA_write = mos6522_portA_write;
+    mdc->update_irq = mos6522_update_irq;
     mdc->get_timer1_counter_value = mos6522_get_counter_value;
     mdc->get_timer2_counter_value = mos6522_get_counter_value;
     mdc->get_timer1_load_time = mos6522_get_load_time;
diff --git a/hw/ppc/mac.h b/hw/ppc/mac.h
index 89fa8bbed7..c0217e66f2 100644
--- a/hw/ppc/mac.h
+++ b/hw/ppc/mac.h
@@ -27,6 +27,7 @@
 #define PPC_MAC_H
 
 #include "exec/memory.h"
+#include "hw/boards.h"
 #include "hw/sysbus.h"
 #include "hw/ide/internal.h"
 #include "hw/input/adb.h"
@@ -58,12 +59,31 @@
 
 /* New World IRQs */
 #define NEWWORLD_CUDA_IRQ      0x19
+#define NEWWORLD_PMU_IRQ       0x19
 #define NEWWORLD_ESCCB_IRQ     0x24
 #define NEWWORLD_ESCCA_IRQ     0x25
 #define NEWWORLD_IDE0_IRQ      0xd
 #define NEWWORLD_IDE0_DMA_IRQ  0x2
 #define NEWWORLD_IDE1_IRQ      0xe
 #define NEWWORLD_IDE1_DMA_IRQ  0x3
+#define NEWWORLD_EXTING_GPIO1  0x2f
+#define NEWWORLD_EXTING_GPIO9  0x37
+
+/* Core99 machine */
+#define TYPE_CORE99_MACHINE MACHINE_TYPE_NAME("mac99")
+#define CORE99_MACHINE(obj) OBJECT_CHECK(Core99MachineState, (obj), \
+                                         TYPE_CORE99_MACHINE)
+
+#define CORE99_VIA_CONFIG_CUDA     0x0
+#define CORE99_VIA_CONFIG_PMU      0x1
+#define CORE99_VIA_CONFIG_PMU_ADB  0x2
+
+typedef struct Core99MachineState {
+    /*< private >*/
+    MachineState parent;
+
+    uint8_t via_config;
+} Core99MachineState;
 
 /* MacIO */
 #define TYPE_MACIO_IDE "macio-ide"
diff --git a/hw/ppc/mac_newworld.c b/hw/ppc/mac_newworld.c
index 744acdfd2e..ff715ffffd 100644
--- a/hw/ppc/mac_newworld.c
+++ b/hw/ppc/mac_newworld.c
@@ -111,6 +111,7 @@ static void ppc_core99_init(MachineState *machine)
     const char *kernel_cmdline = machine->kernel_cmdline;
     const char *initrd_filename = machine->initrd_filename;
     const char *boot_device = machine->boot_order;
+    Core99MachineState *core99_machine = CORE99_MACHINE(machine);
     PowerPCCPU *cpu = NULL;
     CPUPPCState *env = NULL;
     char *filename;
@@ -122,6 +123,7 @@ static void ppc_core99_init(MachineState *machine)
     UNINHostState *uninorth_pci;
     PCIBus *pci_bus;
     NewWorldMacIOState *macio;
+    bool has_pmu, has_adb;
     MACIOIDEState *macio_ide;
     BusState *adb_bus;
     MacIONVRAMState *nvr;
@@ -361,6 +363,9 @@ static void ppc_core99_init(MachineState *machine)
     }
 
     machine->usb |= defaults_enabled() && !machine->usb_disabled;
+    has_pmu = (core99_machine->via_config != CORE99_VIA_CONFIG_CUDA);
+    has_adb = (core99_machine->via_config == CORE99_VIA_CONFIG_CUDA ||
+               core99_machine->via_config == CORE99_VIA_CONFIG_PMU_ADB);
 
     /* Timebase Frequency */
     if (kvm_enabled()) {
@@ -376,6 +381,8 @@ static void ppc_core99_init(MachineState *machine)
     macio = NEWWORLD_MACIO(pci_create(pci_bus, -1, TYPE_NEWWORLD_MACIO));
     dev = DEVICE(macio);
     qdev_prop_set_uint64(dev, "frequency", tbfreq);
+    qdev_prop_set_bit(dev, "has-pmu", has_pmu);
+    qdev_prop_set_bit(dev, "has-adb", has_adb);
     object_property_set_link(OBJECT(macio), OBJECT(pic_dev), "pic",
                              &error_abort);
     qdev_init_nofail(dev);
@@ -391,19 +398,29 @@ static void ppc_core99_init(MachineState *machine)
                                                         "ide[1]"));
     macio_ide_init_drives(macio_ide, &hd[MAX_IDE_DEVS]);
 
-    dev = DEVICE(object_resolve_path_component(OBJECT(macio), "cuda"));
-    adb_bus = qdev_get_child_bus(dev, "adb.0");
-    dev = qdev_create(adb_bus, TYPE_ADB_KEYBOARD);
-    qdev_init_nofail(dev);
-    dev = qdev_create(adb_bus, TYPE_ADB_MOUSE);
-    qdev_init_nofail(dev);
+    if (has_adb) {
+        if (has_pmu) {
+            dev = DEVICE(object_resolve_path_component(OBJECT(macio), "pmu"));
+        } else {
+            dev = DEVICE(object_resolve_path_component(OBJECT(macio), "cuda"));
+        }
+
+        adb_bus = qdev_get_child_bus(dev, "adb.0");
+        dev = qdev_create(adb_bus, TYPE_ADB_KEYBOARD);
+        qdev_prop_set_bit(dev, "disable-direct-reg3-writes", has_pmu);
+        qdev_init_nofail(dev);
+
+        dev = qdev_create(adb_bus, TYPE_ADB_MOUSE);
+        qdev_prop_set_bit(dev, "disable-direct-reg3-writes", has_pmu);
+        qdev_init_nofail(dev);
+    }
 
     if (machine->usb) {
         pci_create_simple(pci_bus, -1, "pci-ohci");
 
         /* U3 needs to use USB for input because Linux doesn't support via-cuda
         on PPC64 */
-        if (machine_arch == ARCH_MAC99_U3) {
+        if (!has_adb || machine_arch == ARCH_MAC99_U3) {
             USBBus *usb_bus = usb_bus_find(-1);
 
             usb_create_simple(usb_bus, "usb-kbd");
@@ -459,6 +476,8 @@ static void ppc_core99_init(MachineState *machine)
     fw_cfg_add_i16(fw_cfg, FW_CFG_PPC_HEIGHT, graphic_height);
     fw_cfg_add_i16(fw_cfg, FW_CFG_PPC_DEPTH, graphic_depth);
 
+    fw_cfg_add_i32(fw_cfg, FW_CFG_PPC_VIACONFIG, core99_machine->via_config);
+
     fw_cfg_add_i32(fw_cfg, FW_CFG_PPC_IS_KVM, kvm_enabled());
     if (kvm_enabled()) {
 #ifdef CONFIG_KVM
@@ -515,10 +534,61 @@ static void core99_machine_class_init(ObjectClass *oc, void *data)
 #endif
 }
 
+static char *core99_get_via_config(Object *obj, Error **errp)
+{
+    Core99MachineState *cms = CORE99_MACHINE(obj);
+
+    switch (cms->via_config) {
+    default:
+    case CORE99_VIA_CONFIG_CUDA:
+        return g_strdup("cuda");
+
+    case CORE99_VIA_CONFIG_PMU:
+        return g_strdup("pmu");
+
+    case CORE99_VIA_CONFIG_PMU_ADB:
+        return g_strdup("pmu-adb");
+    }
+}
+
+static void core99_set_via_config(Object *obj, const char *value, Error **errp)
+{
+    Core99MachineState *cms = CORE99_MACHINE(obj);
+
+    if (!strcmp(value, "cuda")) {
+        cms->via_config = CORE99_VIA_CONFIG_CUDA;
+    } else if (!strcmp(value, "pmu")) {
+        cms->via_config = CORE99_VIA_CONFIG_PMU;
+    } else if (!strcmp(value, "pmu-adb")) {
+        cms->via_config = CORE99_VIA_CONFIG_PMU_ADB;
+    } else {
+        error_setg(errp, "Invalid via value");
+        error_append_hint(errp, "Valid values are cuda, pmu, pmu-adb.\n");
+    }
+}
+
+static void core99_instance_init(Object *obj)
+{
+    Core99MachineState *cms = CORE99_MACHINE(obj);
+
+    /* Default via_config is CORE99_VIA_CONFIG_CUDA */
+    cms->via_config = CORE99_VIA_CONFIG_CUDA;
+    object_property_add_str(obj, "via", core99_get_via_config,
+                            core99_set_via_config, NULL);
+    object_property_set_description(obj, "via",
+                                    "Set VIA configuration. "
+                                    "Valid values are cuda, pmu and pmu-adb",
+                                    NULL);
+}
+
 static const TypeInfo core99_machine_info = {
     .name          = MACHINE_TYPE_NAME("mac99"),
     .parent        = TYPE_MACHINE,
     .class_init    = core99_machine_class_init,
+    .instance_init = core99_instance_init,
+    .instance_size = sizeof(Core99MachineState)
 };
 
 static void mac_machine_register_types(void)
diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
index 0314881316..0d2b79f798 100644
--- a/hw/ppc/pnv.c
+++ b/hw/ppc/pnv.c
@@ -121,9 +121,9 @@ static int get_cpus_node(void *fdt)
  */
 static void pnv_dt_core(PnvChip *chip, PnvCore *pc, void *fdt)
 {
-    CPUState *cs = CPU(DEVICE(pc->threads));
+    PowerPCCPU *cpu = pc->threads[0];
+    CPUState *cs = CPU(cpu);
     DeviceClass *dc = DEVICE_GET_CLASS(cs);
-    PowerPCCPU *cpu = POWERPC_CPU(cs);
     int smt_threads = CPU_CORE(pc)->nr_threads;
     CPUPPCState *env = &cpu->env;
     PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cs);
@@ -849,9 +849,8 @@ static void pnv_chip_icp_realize(PnvChip *chip, Error **errp)
     }
 }
 
-static void pnv_chip_realize(DeviceState *dev, Error **errp)
+static void pnv_chip_core_realize(PnvChip *chip, Error **errp)
 {
-    PnvChip *chip = PNV_CHIP(dev);
     Error *error = NULL;
     PnvChipClass *pcc = PNV_CHIP_GET_CLASS(chip);
     const char *typename = pnv_chip_core_typename(chip);
@@ -863,14 +862,6 @@ static void pnv_chip_realize(DeviceState *dev, Error **errp)
         return;
     }
 
-    /* XSCOM bridge */
-    pnv_xscom_realize(chip, &error);
-    if (error) {
-        error_propagate(errp, error);
-        return;
-    }
-    sysbus_mmio_map(SYS_BUS_DEVICE(chip), 0, PNV_XSCOM_BASE(chip));
-
     /* Cores */
     pnv_chip_core_sanitize(chip, &error);
     if (error) {
@@ -918,6 +909,27 @@ static void pnv_chip_realize(DeviceState *dev, Error **errp)
                                 &PNV_CORE(pnv_core)->xscom_regs);
         i++;
     }
+}
+
+static void pnv_chip_realize(DeviceState *dev, Error **errp)
+{
+    PnvChip *chip = PNV_CHIP(dev);
+    Error *error = NULL;
+
+    /* XSCOM bridge */
+    pnv_xscom_realize(chip, &error);
+    if (error) {
+        error_propagate(errp, error);
+        return;
+    }
+    sysbus_mmio_map(SYS_BUS_DEVICE(chip), 0, PNV_XSCOM_BASE(chip));
+
+    /* Cores */
+    pnv_chip_core_realize(chip, &error);
+    if (error) {
+        error_propagate(errp, error);
+        return;
+    }
 
     /* Create LPC controller */
     object_property_set_bool(OBJECT(&chip->lpc), true, "realized",
diff --git a/hw/ppc/pnv_core.c b/hw/ppc/pnv_core.c
index 13ad7d9e04..f7cf33f547 100644
--- a/hw/ppc/pnv_core.c
+++ b/hw/ppc/pnv_core.c
@@ -54,28 +54,6 @@ static void pnv_cpu_reset(void *opaque)
     env->msr |= MSR_HVB; /* Hypervisor mode */
 }
 
-static void pnv_cpu_init(PowerPCCPU *cpu, Error **errp)
-{
-    CPUPPCState *env = &cpu->env;
-    int core_pir;
-    int thread_index = 0; /* TODO: TCG supports only one thread */
-    ppc_spr_t *pir = &env->spr_cb[SPR_PIR];
-
-    core_pir = object_property_get_uint(OBJECT(cpu), "core-pir", &error_abort);
-
-    /*
-     * The PIR of a thread is the core PIR + the thread index. We will
-     * need to find a way to get the thread index when TCG supports
-     * more than 1. We could use the object name ?
-     */
-    pir->default_value = core_pir + thread_index;
-
-    /* Set time-base frequency to 512 MHz */
-    cpu_ppc_tb_init(env, PNV_TIMEBASE_FREQ);
-
-    qemu_register_reset(pnv_cpu_reset, cpu);
-}
-
 /*
  * These values are read by the PowerNV HW monitors under Linux
  */
@@ -121,29 +99,39 @@ static const MemoryRegionOps pnv_core_xscom_ops = {
     .endianness = DEVICE_BIG_ENDIAN,
 };
 
-static void pnv_core_realize_child(Object *child, XICSFabric *xi, Error **errp)
+static void pnv_realize_vcpu(PowerPCCPU *cpu, XICSFabric *xi, Error **errp)
 {
+    CPUPPCState *env = &cpu->env;
+    int core_pir;
+    int thread_index = 0; /* TODO: TCG supports only one thread */
+    ppc_spr_t *pir = &env->spr_cb[SPR_PIR];
     Error *local_err = NULL;
-    CPUState *cs = CPU(child);
-    PowerPCCPU *cpu = POWERPC_CPU(cs);
 
-    object_property_set_bool(child, true, "realized", &local_err);
+    object_property_set_bool(OBJECT(cpu), true, "realized", &local_err);
     if (local_err) {
         error_propagate(errp, local_err);
         return;
     }
 
-    cpu->intc = icp_create(child, TYPE_PNV_ICP, xi, &local_err);
+    cpu->intc = icp_create(OBJECT(cpu), TYPE_PNV_ICP, xi, &local_err);
     if (local_err) {
         error_propagate(errp, local_err);
         return;
     }
 
-    pnv_cpu_init(cpu, &local_err);
-    if (local_err) {
-        error_propagate(errp, local_err);
-        return;
-    }
+    core_pir = object_property_get_uint(OBJECT(cpu), "core-pir", &error_abort);
+
+    /*
+     * The PIR of a thread is the core PIR + the thread index. We will
+     * need to find a way to get the thread index when TCG supports
+     * more than one thread. We could use the object name?
+     */
+    pir->default_value = core_pir + thread_index;
+
+    /* Set time-base frequency to 512 MHz */
+    cpu_ppc_tb_init(env, PNV_TIMEBASE_FREQ);
+
+    qemu_register_reset(pnv_cpu_reset, cpu);
 }
 
 static void pnv_core_realize(DeviceState *dev, Error **errp)
@@ -151,7 +139,6 @@ static void pnv_core_realize(DeviceState *dev, Error **errp)
     PnvCore *pc = PNV_CORE(OBJECT(dev));
     CPUCore *cc = CPU_CORE(OBJECT(dev));
     const char *typename = pnv_core_cpu_typename(pc);
-    size_t size = object_type_get_instance_size(typename);
     Error *local_err = NULL;
     void *obj;
     int i, j;
@@ -165,26 +152,21 @@ static void pnv_core_realize(DeviceState *dev, Error **errp)
         return;
     }
 
-    pc->threads = g_malloc0(size * cc->nr_threads);
+    pc->threads = g_new(PowerPCCPU *, cc->nr_threads);
     for (i = 0; i < cc->nr_threads; i++) {
-        obj = pc->threads + i * size;
+        obj = object_new(typename);
 
-        object_initialize(obj, size, typename);
+        pc->threads[i] = POWERPC_CPU(obj);
 
         snprintf(name, sizeof(name), "thread[%d]", i);
-        object_property_add_child(OBJECT(pc), name, obj, &local_err);
+        object_property_add_child(OBJECT(pc), name, obj, &error_abort);
         object_property_add_alias(obj, "core-pir", OBJECT(pc),
-                                  "pir", &local_err);
-        if (local_err) {
-            goto err;
-        }
+                                  "pir", &error_abort);
         object_unref(obj);
     }
 
     for (j = 0; j < cc->nr_threads; j++) {
-        obj = pc->threads + j * size;
-
-        pnv_core_realize_child(obj, XICS_FABRIC(xi), &local_err);
+        pnv_realize_vcpu(pc->threads[j], XICS_FABRIC(xi), &local_err);
         if (local_err) {
             goto err;
         }
@@ -197,13 +179,33 @@ static void pnv_core_realize(DeviceState *dev, Error **errp)
 
 err:
     while (--i >= 0) {
-        obj = pc->threads + i * size;
+        obj = OBJECT(pc->threads[i]);
         object_unparent(obj);
     }
     g_free(pc->threads);
     error_propagate(errp, local_err);
 }
 
+static void pnv_unrealize_vcpu(PowerPCCPU *cpu)
+{
+    qemu_unregister_reset(pnv_cpu_reset, cpu);
+    object_unparent(cpu->intc);
+    cpu_remove_sync(CPU(cpu));
+    object_unparent(OBJECT(cpu));
+}
+
+static void pnv_core_unrealize(DeviceState *dev, Error **errp)
+{
+    PnvCore *pc = PNV_CORE(dev);
+    CPUCore *cc = CPU_CORE(dev);
+    int i;
+
+    for (i = 0; i < cc->nr_threads; i++) {
+        pnv_unrealize_vcpu(pc->threads[i]);
+    }
+    g_free(pc->threads);
+}
+
 static Property pnv_core_properties[] = {
     DEFINE_PROP_UINT32("pir", PnvCore, pir, 0),
     DEFINE_PROP_END_OF_LIST(),
@@ -214,6 +216,7 @@ static void pnv_core_class_init(ObjectClass *oc, void *data)
     DeviceClass *dc = DEVICE_CLASS(oc);
 
     dc->realize = pnv_core_realize;
+    dc->unrealize = pnv_core_unrealize;
     dc->props = pnv_core_properties;
 }
 
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index f59999daac..db0fb385d4 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -186,27 +186,33 @@ static int xics_max_server_number(sPAPRMachineState *spapr)
 static void xics_system_init(MachineState *machine, int nr_irqs, Error **errp)
 {
     sPAPRMachineState *spapr = SPAPR_MACHINE(machine);
+    Error *local_err = NULL;
 
     if (kvm_enabled()) {
         if (machine_kernel_irqchip_allowed(machine) &&
-            !xics_kvm_init(spapr, errp)) {
+            !xics_kvm_init(spapr, &local_err)) {
             spapr->icp_type = TYPE_KVM_ICP;
-            spapr->ics = spapr_ics_create(spapr, TYPE_ICS_KVM, nr_irqs, errp);
+            spapr->ics = spapr_ics_create(spapr, TYPE_ICS_KVM, nr_irqs,
+                                          &local_err);
         }
         if (machine_kernel_irqchip_required(machine) && !spapr->ics) {
-            error_prepend(errp, "kernel_irqchip requested but unavailable: ");
-            return;
+            error_prepend(&local_err,
+                          "kernel_irqchip requested but unavailable: ");
+            goto error;
         }
+        error_free(local_err);
+        local_err = NULL;
     }
 
     if (!spapr->ics) {
         xics_spapr_init(spapr);
         spapr->icp_type = TYPE_ICP;
-        spapr->ics = spapr_ics_create(spapr, TYPE_ICS_SIMPLE, nr_irqs, errp);
-        if (!spapr->ics) {
-            return;
-        }
+        spapr->ics = spapr_ics_create(spapr, TYPE_ICS_SIMPLE, nr_irqs,
+                                      &local_err);
     }
+
+error:
+    error_propagate(errp, local_err);
 }
 
 static int spapr_fixup_cpu_smt_dt(void *fdt, int offset, PowerPCCPU *cpu,
diff --git a/hw/ppc/spapr_caps.c b/hw/ppc/spapr_caps.c
index 531e145114..00e43a9ba7 100644
--- a/hw/ppc/spapr_caps.c
+++ b/hw/ppc/spapr_caps.c
@@ -335,14 +335,10 @@ static sPAPRCapabilities default_caps_with_cpu(sPAPRMachineState *spapr,
 
     caps = smc->default_caps;
 
-    if (!ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_3_00,
-                          0, spapr->max_compat_pvr)) {
-        caps.caps[SPAPR_CAP_CFPC] = SPAPR_CAP_BROKEN;
-    }
-
     if (!ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_2_07,
                           0, spapr->max_compat_pvr)) {
         caps.caps[SPAPR_CAP_HTM] = SPAPR_CAP_OFF;
+        caps.caps[SPAPR_CAP_CFPC] = SPAPR_CAP_BROKEN;
     }
 
     if (!ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_2_06_PLUS,
diff --git a/hw/ppc/spapr_cpu_core.c b/hw/ppc/spapr_cpu_core.c
index f3e9b879b2..aef3be33a3 100644
--- a/hw/ppc/spapr_cpu_core.c
+++ b/hw/ppc/spapr_cpu_core.c
@@ -28,6 +28,7 @@ static void spapr_cpu_reset(void *opaque)
     CPUState *cs = CPU(cpu);
     CPUPPCState *env = &cpu->env;
     PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu);
+    sPAPRCPUState *spapr_cpu = spapr_cpu_state(cpu);
     target_ulong lpcr;
 
     cpu_reset(cs);
@@ -69,6 +70,12 @@ static void spapr_cpu_reset(void *opaque)
 
     /* Set a full AMOR so guest can use the AMR as it sees fit */
     env->spr[SPR_AMOR] = 0xffffffffffffffffull;
+
+    spapr_cpu->vpa_addr = 0;
+    spapr_cpu->slb_shadow_addr = 0;
+    spapr_cpu->slb_shadow_size = 0;
+    spapr_cpu->dtl_addr = 0;
+    spapr_cpu->dtl_size = 0;
 }
 
 void spapr_cpu_set_entry_state(PowerPCCPU *cpu, target_ulong nip, target_ulong r3)
@@ -83,26 +90,6 @@ void spapr_cpu_set_entry_state(PowerPCCPU *cpu, target_ulong nip, target_ulong r
     ppc_store_lpcr(cpu, env->spr[SPR_LPCR] | pcc->lpcr_pm);
 }
 
-static void spapr_cpu_destroy(PowerPCCPU *cpu)
-{
-    qemu_unregister_reset(spapr_cpu_reset, cpu);
-}
-
-static void spapr_cpu_init(sPAPRMachineState *spapr, PowerPCCPU *cpu,
-                           Error **errp)
-{
-    CPUPPCState *env = &cpu->env;
-
-    /* Set time-base frequency to 512 MHz */
-    cpu_ppc_tb_init(env, SPAPR_TIMEBASE_FREQ);
-
-    cpu_ppc_set_vhyp(cpu, PPC_VIRTUAL_HYPERVISOR(spapr));
-    kvmppc_set_papr(cpu);
-
-    qemu_register_reset(spapr_cpu_reset, cpu);
-    spapr_cpu_reset(cpu);
-}
-
 /*
  * Return the sPAPR CPU core type for @model which essentially is the CPU
  * model specified with -cpu cmdline option.
@@ -122,55 +109,110 @@ const char *spapr_get_cpu_core_type(const char *cpu_type)
     return object_class_get_name(oc);
 }
 
-static void spapr_cpu_core_unrealizefn(DeviceState *dev, Error **errp)
+static void spapr_unrealize_vcpu(PowerPCCPU *cpu)
+{
+    qemu_unregister_reset(spapr_cpu_reset, cpu);
+    object_unparent(cpu->intc);
+    cpu_remove_sync(CPU(cpu));
+    object_unparent(OBJECT(cpu));
+}
+
+static void spapr_cpu_core_unrealize(DeviceState *dev, Error **errp)
 {
     sPAPRCPUCore *sc = SPAPR_CPU_CORE(OBJECT(dev));
     CPUCore *cc = CPU_CORE(dev);
     int i;
 
     for (i = 0; i < cc->nr_threads; i++) {
-        Object *obj = OBJECT(sc->threads[i]);
-        DeviceState *dev = DEVICE(obj);
-        CPUState *cs = CPU(dev);
-        PowerPCCPU *cpu = POWERPC_CPU(cs);
-
-        spapr_cpu_destroy(cpu);
-        object_unparent(cpu->intc);
-        cpu_remove_sync(cs);
-        object_unparent(obj);
+        spapr_unrealize_vcpu(sc->threads[i]);
     }
     g_free(sc->threads);
 }
 
-static void spapr_cpu_core_realize_child(Object *child,
-                                         sPAPRMachineState *spapr, Error **errp)
+static void spapr_realize_vcpu(PowerPCCPU *cpu, sPAPRMachineState *spapr,
+                               Error **errp)
 {
+    CPUPPCState *env = &cpu->env;
     Error *local_err = NULL;
-    CPUState *cs = CPU(child);
-    PowerPCCPU *cpu = POWERPC_CPU(cs);
 
-    object_property_set_bool(child, true, "realized", &local_err);
+    object_property_set_bool(OBJECT(cpu), true, "realized", &local_err);
     if (local_err) {
         goto error;
     }
 
-    spapr_cpu_init(spapr, cpu, &local_err);
-    if (local_err) {
-        goto error;
-    }
+    /* Set time-base frequency to 512 MHz */
+    cpu_ppc_tb_init(env, SPAPR_TIMEBASE_FREQ);
+
+    cpu_ppc_set_vhyp(cpu, PPC_VIRTUAL_HYPERVISOR(spapr));
+    kvmppc_set_papr(cpu);
 
-    cpu->intc = icp_create(child, spapr->icp_type, XICS_FABRIC(spapr),
+    qemu_register_reset(spapr_cpu_reset, cpu);
+    spapr_cpu_reset(cpu);
+
+    cpu->intc = icp_create(OBJECT(cpu), spapr->icp_type, XICS_FABRIC(spapr),
                            &local_err);
     if (local_err) {
-        goto error;
+        goto error_unregister;
     }
 
     return;
 
+error_unregister:
+    qemu_unregister_reset(spapr_cpu_reset, cpu);
+    cpu_remove_sync(CPU(cpu));
 error:
     error_propagate(errp, local_err);
 }
 
+static PowerPCCPU *spapr_create_vcpu(sPAPRCPUCore *sc, int i, Error **errp)
+{
+    sPAPRCPUCoreClass *scc = SPAPR_CPU_CORE_GET_CLASS(sc);
+    CPUCore *cc = CPU_CORE(sc);
+    Object *obj;
+    char *id;
+    CPUState *cs;
+    PowerPCCPU *cpu;
+    Error *local_err = NULL;
+
+    obj = object_new(scc->cpu_type);
+
+    cs = CPU(obj);
+    cpu = POWERPC_CPU(obj);
+    cs->cpu_index = cc->core_id + i;
+    spapr_set_vcpu_id(cpu, cs->cpu_index, &local_err);
+    if (local_err) {
+        goto err;
+    }
+
+    cpu->node_id = sc->node_id;
+
+    id = g_strdup_printf("thread[%d]", i);
+    object_property_add_child(OBJECT(sc), id, obj, &local_err);
+    g_free(id);
+    if (local_err) {
+        goto err;
+    }
+
+    cpu->machine_data = g_new0(sPAPRCPUState, 1);
+
+    object_unref(obj);
+    return cpu;
+
+err:
+    object_unref(obj);
+    error_propagate(errp, local_err);
+    return NULL;
+}
+
+static void spapr_delete_vcpu(PowerPCCPU *cpu)
+{
+    sPAPRCPUState *spapr_cpu = spapr_cpu_state(cpu);
+
+    cpu->machine_data = NULL;
+    g_free(spapr_cpu);
+    object_unparent(OBJECT(cpu));
+}
+
 static void spapr_cpu_core_realize(DeviceState *dev, Error **errp)
 {
     /* We don't use SPAPR_MACHINE() in order to exit gracefully if the user
@@ -180,10 +222,8 @@ static void spapr_cpu_core_realize(DeviceState *dev, Error **errp)
         (sPAPRMachineState *) object_dynamic_cast(qdev_get_machine(),
                                                   TYPE_SPAPR_MACHINE);
     sPAPRCPUCore *sc = SPAPR_CPU_CORE(OBJECT(dev));
-    sPAPRCPUCoreClass *scc = SPAPR_CPU_CORE_GET_CLASS(OBJECT(dev));
     CPUCore *cc = CPU_CORE(OBJECT(dev));
     Error *local_err = NULL;
-    Object *obj;
     int i, j;
 
     if (!spapr) {
@@ -193,46 +233,27 @@ static void spapr_cpu_core_realize(DeviceState *dev, Error **errp)
 
     sc->threads = g_new(PowerPCCPU *, cc->nr_threads);
     for (i = 0; i < cc->nr_threads; i++) {
-        char id[32];
-        CPUState *cs;
-        PowerPCCPU *cpu;
-
-        obj = object_new(scc->cpu_type);
-
-        cs = CPU(obj);
-        cpu = sc->threads[i] = POWERPC_CPU(obj);
-        cs->cpu_index = cc->core_id + i;
-        spapr_set_vcpu_id(cpu, cs->cpu_index, &local_err);
+        sc->threads[i] = spapr_create_vcpu(sc, i, &local_err);
         if (local_err) {
             goto err;
         }
-
-
-        /* Set NUMA node for the threads belonged to core  */
-        cpu->node_id = sc->node_id;
-
-        snprintf(id, sizeof(id), "thread[%d]", i);
-        object_property_add_child(OBJECT(sc), id, obj, &local_err);
-        if (local_err) {
-            goto err;
-        }
-        object_unref(obj);
     }
 
     for (j = 0; j < cc->nr_threads; j++) {
-        obj = OBJECT(sc->threads[j]);
-
-        spapr_cpu_core_realize_child(obj, spapr, &local_err);
+        spapr_realize_vcpu(sc->threads[j], spapr, &local_err);
         if (local_err) {
-            goto err;
+            goto err_unrealize;
         }
     }
     return;
 
+err_unrealize:
+    while (--j >= 0) {
+        spapr_unrealize_vcpu(sc->threads[j]);
+    }
 err:
     while (--i >= 0) {
-        obj = OBJECT(sc->threads[i]);
-        object_unparent(obj);
+        spapr_delete_vcpu(sc->threads[i]);
     }
     g_free(sc->threads);
     error_propagate(errp, local_err);
@@ -249,7 +270,7 @@ static void spapr_cpu_core_class_init(ObjectClass *oc, void *data)
     sPAPRCPUCoreClass *scc = SPAPR_CPU_CORE_CLASS(oc);
 
     dc->realize = spapr_cpu_core_realize;
-    dc->unrealize = spapr_cpu_core_unrealizefn;
+    dc->unrealize = spapr_cpu_core_unrealize;
     dc->props = spapr_cpu_core_properties;
     scc->cpu_type = data;
 }
diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
index 022f6d8101..ae913d070f 100644
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -8,6 +8,7 @@
 #include "exec/exec-all.h"
 #include "helper_regs.h"
 #include "hw/ppc/spapr.h"
+#include "hw/ppc/spapr_cpu_core.h"
 #include "mmu-hash64.h"
 #include "cpu-models.h"
 #include "trace.h"
@@ -908,9 +909,11 @@ unmap_out:
 #define VPA_SHARED_PROC_OFFSET 0x9
 #define VPA_SHARED_PROC_VAL    0x2
 
-static target_ulong register_vpa(CPUPPCState *env, target_ulong vpa)
+static target_ulong register_vpa(PowerPCCPU *cpu, target_ulong vpa)
 {
-    CPUState *cs = CPU(ppc_env_get_cpu(env));
+    CPUState *cs = CPU(cpu);
+    CPUPPCState *env = &cpu->env;
+    sPAPRCPUState *spapr_cpu = spapr_cpu_state(cpu);
     uint16_t size;
     uint8_t tmp;
 
@@ -935,32 +938,34 @@ static target_ulong register_vpa(CPUPPCState *env, target_ulong vpa)
         return H_PARAMETER;
     }
 
-    env->vpa_addr = vpa;
+    spapr_cpu->vpa_addr = vpa;
 
-    tmp = ldub_phys(cs->as, env->vpa_addr + VPA_SHARED_PROC_OFFSET);
+    tmp = ldub_phys(cs->as, spapr_cpu->vpa_addr + VPA_SHARED_PROC_OFFSET);
     tmp |= VPA_SHARED_PROC_VAL;
-    stb_phys(cs->as, env->vpa_addr + VPA_SHARED_PROC_OFFSET, tmp);
+    stb_phys(cs->as, spapr_cpu->vpa_addr + VPA_SHARED_PROC_OFFSET, tmp);
 
     return H_SUCCESS;
 }
 
-static target_ulong deregister_vpa(CPUPPCState *env, target_ulong vpa)
+static target_ulong deregister_vpa(PowerPCCPU *cpu, target_ulong vpa)
 {
-    if (env->slb_shadow_addr) {
+    sPAPRCPUState *spapr_cpu = spapr_cpu_state(cpu);
+
+    if (spapr_cpu->slb_shadow_addr) {
         return H_RESOURCE;
     }
 
-    if (env->dtl_addr) {
+    if (spapr_cpu->dtl_addr) {
         return H_RESOURCE;
     }
 
-    env->vpa_addr = 0;
+    spapr_cpu->vpa_addr = 0;
     return H_SUCCESS;
 }
 
-static target_ulong register_slb_shadow(CPUPPCState *env, target_ulong addr)
+static target_ulong register_slb_shadow(PowerPCCPU *cpu, target_ulong addr)
 {
-    CPUState *cs = CPU(ppc_env_get_cpu(env));
+    sPAPRCPUState *spapr_cpu = spapr_cpu_state(cpu);
     uint32_t size;
 
     if (addr == 0) {
@@ -968,7 +973,7 @@ static target_ulong register_slb_shadow(CPUPPCState *env, target_ulong addr)
         return H_HARDWARE;
     }
 
-    size = ldl_be_phys(cs->as, addr + 0x4);
+    size = ldl_be_phys(CPU(cpu)->as, addr + 0x4);
     if (size < 0x8) {
         return H_PARAMETER;
     }
@@ -977,26 +982,28 @@ static target_ulong register_slb_shadow(CPUPPCState *env, target_ulong addr)
         return H_PARAMETER;
     }
 
-    if (!env->vpa_addr) {
+    if (!spapr_cpu->vpa_addr) {
         return H_RESOURCE;
     }
 
-    env->slb_shadow_addr = addr;
-    env->slb_shadow_size = size;
+    spapr_cpu->slb_shadow_addr = addr;
+    spapr_cpu->slb_shadow_size = size;
 
     return H_SUCCESS;
 }
 
-static target_ulong deregister_slb_shadow(CPUPPCState *env, target_ulong addr)
+static target_ulong deregister_slb_shadow(PowerPCCPU *cpu, target_ulong addr)
 {
-    env->slb_shadow_addr = 0;
-    env->slb_shadow_size = 0;
+    sPAPRCPUState *spapr_cpu = spapr_cpu_state(cpu);
+
+    spapr_cpu->slb_shadow_addr = 0;
+    spapr_cpu->slb_shadow_size = 0;
     return H_SUCCESS;
 }
 
-static target_ulong register_dtl(CPUPPCState *env, target_ulong addr)
+static target_ulong register_dtl(PowerPCCPU *cpu, target_ulong addr)
 {
-    CPUState *cs = CPU(ppc_env_get_cpu(env));
+    sPAPRCPUState *spapr_cpu = spapr_cpu_state(cpu);
     uint32_t size;
 
     if (addr == 0) {
@@ -1004,26 +1011,28 @@ static target_ulong register_dtl(CPUPPCState *env, target_ulong addr)
         return H_HARDWARE;
     }
 
-    size = ldl_be_phys(cs->as, addr + 0x4);
+    size = ldl_be_phys(CPU(cpu)->as, addr + 0x4);
 
     if (size < 48) {
         return H_PARAMETER;
     }
 
-    if (!env->vpa_addr) {
+    if (!spapr_cpu->vpa_addr) {
         return H_RESOURCE;
     }
 
-    env->dtl_addr = addr;
-    env->dtl_size = size;
+    spapr_cpu->dtl_addr = addr;
+    spapr_cpu->dtl_size = size;
 
     return H_SUCCESS;
 }
 
-static target_ulong deregister_dtl(CPUPPCState *env, target_ulong addr)
+static target_ulong deregister_dtl(PowerPCCPU *cpu, target_ulong addr)
 {
-    env->dtl_addr = 0;
-    env->dtl_size = 0;
+    sPAPRCPUState *spapr_cpu = spapr_cpu_state(cpu);
+
+    spapr_cpu->dtl_addr = 0;
+    spapr_cpu->dtl_size = 0;
 
     return H_SUCCESS;
 }
@@ -1035,38 +1044,36 @@ static target_ulong h_register_vpa(PowerPCCPU *cpu, sPAPRMachineState *spapr,
     target_ulong procno = args[1];
     target_ulong vpa = args[2];
     target_ulong ret = H_PARAMETER;
-    CPUPPCState *tenv;
     PowerPCCPU *tcpu;
 
     tcpu = spapr_find_cpu(procno);
     if (!tcpu) {
         return H_PARAMETER;
     }
-    tenv = &tcpu->env;
 
     switch (flags) {
     case FLAGS_REGISTER_VPA:
-        ret = register_vpa(tenv, vpa);
+        ret = register_vpa(tcpu, vpa);
         break;
 
     case FLAGS_DEREGISTER_VPA:
-        ret = deregister_vpa(tenv, vpa);
+        ret = deregister_vpa(tcpu, vpa);
         break;
 
     case FLAGS_REGISTER_SLBSHADOW:
-        ret = register_slb_shadow(tenv, vpa);
+        ret = register_slb_shadow(tcpu, vpa);
         break;
 
     case FLAGS_DEREGISTER_SLBSHADOW:
-        ret = deregister_slb_shadow(tenv, vpa);
+        ret = deregister_slb_shadow(tcpu, vpa);
         break;
 
     case FLAGS_REGISTER_DTL:
-        ret = register_dtl(tenv, vpa);
+        ret = register_dtl(tcpu, vpa);
         break;
 
     case FLAGS_DEREGISTER_DTL:
-        ret = deregister_dtl(tenv, vpa);
+        ret = deregister_dtl(tcpu, vpa);
         break;
     }
 
@@ -1547,6 +1554,7 @@ static target_ulong h_client_architecture_support(PowerPCCPU *cpu,
                 error_report_err(local_err);
                 return H_HARDWARE;
             }
+            error_free(local_err);
             local_err = NULL;
         }
     }
diff --git a/hw/s390x/css.c b/hw/s390x/css.c
index 56c3fa8c89..5424ea4562 100644
--- a/hw/s390x/css.c
+++ b/hw/s390x/css.c
@@ -1199,18 +1199,6 @@ static IOInstEnding sch_handle_start_func_passthrough(SubchDev *sch)
         assert(orb != NULL);
         p->intparm = orb->intparm;
     }
-
-    /*
-     * Only support prefetch enable mode.
-     * Only support 64bit addressing idal.
-     */
-    if (!(orb->ctrl0 & ORB_CTRL0_MASK_PFCH) ||
-        !(orb->ctrl0 & ORB_CTRL0_MASK_C64)) {
-        warn_report("vfio-ccw requires PFCH and C64 flags set");
-        sch_gen_unit_exception(sch);
-        css_inject_io_interrupt(sch);
-        return IOINST_CC_EXPECTED;
-    }
     return s390_ccw_cmd_request(sch);
 }
 
diff --git a/hw/s390x/ipl.c b/hw/s390x/ipl.c
index 04245b5258..0d67349004 100644
--- a/hw/s390x/ipl.c
+++ b/hw/s390x/ipl.c
@@ -29,6 +29,7 @@
 #include "exec/exec-all.h"
 
 #define KERN_IMAGE_START                0x010000UL
+#define LINUX_MAGIC_ADDR                0x010008UL
 #define KERN_PARM_AREA                  0x010480UL
 #define INITRD_START                    0x800000UL
 #define INITRD_PARM_START               0x010408UL
@@ -105,7 +106,9 @@ static uint64_t bios_translate_addr(void *opaque, uint64_t srcaddr)
 static void s390_ipl_realize(DeviceState *dev, Error **errp)
 {
     S390IPLState *ipl = S390_IPL(dev);
-    uint64_t pentry = KERN_IMAGE_START;
+    uint32_t *ipl_psw;
+    uint64_t pentry;
+    char *magic;
     int kernel_size;
     Error *err = NULL;
 
@@ -157,10 +160,24 @@ static void s390_ipl_realize(DeviceState *dev, Error **errp)
                                NULL, 1, EM_S390, 0, 0);
         if (kernel_size < 0) {
             kernel_size = load_image_targphys(ipl->kernel, 0, ram_size);
-        }
-        if (kernel_size < 0) {
-            error_setg(&err, "could not load kernel '%s'", ipl->kernel);
-            goto error;
+            if (kernel_size < 0) {
+                error_setg(&err, "could not load kernel '%s'", ipl->kernel);
+                goto error;
+            }
+            /* if this is Linux use KERN_IMAGE_START */
+            magic = rom_ptr(LINUX_MAGIC_ADDR);
+            if (magic && !memcmp(magic, "S390EP", 6)) {
+                pentry = KERN_IMAGE_START;
+            } else {
+                /* if not Linux load the address of the (short) IPL PSW */
+                ipl_psw = rom_ptr(4);
+                if (ipl_psw) {
+                    pentry = be32_to_cpu(*ipl_psw) & 0x7fffffffUL;
+                } else {
+                    error_setg(&err, "Could not get IPL PSW");
+                    goto error;
+                }
+            }
         }
         /*
          * Is it a Linux kernel (starting at 0x10000)? If yes, we fill in the
diff --git a/hw/s390x/virtio-ccw.c b/hw/s390x/virtio-ccw.c
index 0a9bec484b..b92a85d0b0 100644
--- a/hw/s390x/virtio-ccw.c
+++ b/hw/s390x/virtio-ccw.c
@@ -1001,10 +1001,15 @@ static void virtio_ccw_notify(DeviceState *d, uint16_t vector)
     SubchDev *sch = ccw_dev->sch;
     uint64_t indicators;
 
-    /* queue indicators + secondary indicators */
-    if (vector >= VIRTIO_QUEUE_MAX + 64) {
+    if (vector == VIRTIO_NO_VECTOR) {
         return;
     }
+    /*
+     * vector < VIRTIO_QUEUE_MAX: notification for a virtqueue
+     * vector == VIRTIO_QUEUE_MAX: configuration change notification
+     * bits beyond that are unused and should never be notified for
+     */
+    assert(vector <= VIRTIO_QUEUE_MAX);
 
     if (vector < VIRTIO_QUEUE_MAX) {
         if (!dev->indicators) {
@@ -1027,6 +1032,7 @@ static void virtio_ccw_notify(DeviceState *d, uint16_t vector)
                 css_adapter_interrupt(CSS_IO_ADAPTER_VIRTIO, dev->thinint_isc);
             }
         } else {
+            assert(vector < NR_CLASSIC_INDICATOR_BITS);
             indicators = address_space_ldq(&address_space_memory,
                                            dev->indicators->addr,
                                            MEMTXATTRS_UNSPECIFIED,
@@ -1040,12 +1046,11 @@ static void virtio_ccw_notify(DeviceState *d, uint16_t vector)
         if (!dev->indicators2) {
             return;
         }
-        vector = 0;
         indicators = address_space_ldq(&address_space_memory,
                                        dev->indicators2->addr,
                                        MEMTXATTRS_UNSPECIFIED,
                                        NULL);
-        indicators |= 1ULL << vector;
+        indicators |= 1ULL;
         address_space_stq(&address_space_memory, dev->indicators2->addr,
                           indicators, MEMTXATTRS_UNSPECIFIED, NULL);
         css_conditional_io_interrupt(sch);
diff --git a/hw/sparc/sun4m.c b/hw/sparc/sun4m.c
index 0ee779fafe..b984d2da0e 100644
--- a/hw/sparc/sun4m.c
+++ b/hw/sparc/sun4m.c
@@ -572,23 +572,36 @@ typedef struct IDRegState {
     MemoryRegion mem;
 } IDRegState;
 
-static void idreg_init1(Object *obj)
+static void idreg_realize(DeviceState *ds, Error **errp)
 {
-    IDRegState *s = MACIO_ID_REGISTER(obj);
-    SysBusDevice *dev = SYS_BUS_DEVICE(obj);
+    IDRegState *s = MACIO_ID_REGISTER(ds);
+    SysBusDevice *dev = SYS_BUS_DEVICE(ds);
+    Error *local_err = NULL;
+
+    memory_region_init_ram_nomigrate(&s->mem, OBJECT(ds), "sun4m.idreg",
+                                     sizeof(idreg_data), &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
 
-    memory_region_init_ram_nomigrate(&s->mem, obj,
-                           "sun4m.idreg", sizeof(idreg_data), &error_fatal);
     vmstate_register_ram_global(&s->mem);
     memory_region_set_readonly(&s->mem, true);
     sysbus_init_mmio(dev, &s->mem);
 }
 
+static void idreg_class_init(ObjectClass *oc, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(oc);
+
+    dc->realize = idreg_realize;
+}
+
 static const TypeInfo idreg_info = {
     .name          = TYPE_MACIO_ID_REGISTER,
     .parent        = TYPE_SYS_BUS_DEVICE,
     .instance_size = sizeof(IDRegState),
-    .instance_init = idreg_init1,
+    .class_init    = idreg_class_init,
 };
 
 #define TYPE_TCX_AFX "tcx_afx"
@@ -613,21 +626,35 @@ static void afx_init(hwaddr addr)
     sysbus_mmio_map(s, 0, addr);
 }
 
-static void afx_init1(Object *obj)
+static void afx_realize(DeviceState *ds, Error **errp)
 {
-    AFXState *s = TCX_AFX(obj);
-    SysBusDevice *dev = SYS_BUS_DEVICE(obj);
+    AFXState *s = TCX_AFX(ds);
+    SysBusDevice *dev = SYS_BUS_DEVICE(ds);
+    Error *local_err = NULL;
+
+    memory_region_init_ram_nomigrate(&s->mem, OBJECT(ds), "sun4m.afx", 4,
+                                     &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
 
-    memory_region_init_ram_nomigrate(&s->mem, obj, "sun4m.afx", 4, &error_fatal);
     vmstate_register_ram_global(&s->mem);
     sysbus_init_mmio(dev, &s->mem);
 }
 
+static void afx_class_init(ObjectClass *oc, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(oc);
+
+    dc->realize = afx_realize;
+}
+
 static const TypeInfo afx_info = {
     .name          = TYPE_TCX_AFX,
     .parent        = TYPE_SYS_BUS_DEVICE,
     .instance_size = sizeof(AFXState),
-    .instance_init = afx_init1,
+    .class_init    = afx_class_init,
 };
 
 #define TYPE_OPENPROM "openprom"
@@ -680,13 +707,19 @@ static void prom_init(hwaddr addr, const char *bios_name)
     }
 }
 
-static void prom_init1(Object *obj)
+static void prom_realize(DeviceState *ds, Error **errp)
 {
-    PROMState *s = OPENPROM(obj);
-    SysBusDevice *dev = SYS_BUS_DEVICE(obj);
+    PROMState *s = OPENPROM(ds);
+    SysBusDevice *dev = SYS_BUS_DEVICE(ds);
+    Error *local_err = NULL;
+
+    memory_region_init_ram_nomigrate(&s->prom, OBJECT(ds), "sun4m.prom",
+                                     PROM_SIZE_MAX, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
 
-    memory_region_init_ram_nomigrate(&s->prom, obj, "sun4m.prom", PROM_SIZE_MAX,
-                           &error_fatal);
     vmstate_register_ram_global(&s->prom);
     memory_region_set_readonly(&s->prom, true);
     sysbus_init_mmio(dev, &s->prom);
@@ -701,6 +734,7 @@ static void prom_class_init(ObjectClass *klass, void *data)
     DeviceClass *dc = DEVICE_CLASS(klass);
 
     dc->props = prom_properties;
+    dc->realize = prom_realize;
 }
 
 static const TypeInfo prom_info = {
@@ -708,7 +742,6 @@ static const TypeInfo prom_info = {
     .parent        = TYPE_SYS_BUS_DEVICE,
     .instance_size = sizeof(PROMState),
     .class_init    = prom_class_init,
-    .instance_init = prom_init1,
 };
 
 #define TYPE_SUN4M_MEMORY "memory"
diff --git a/hw/sparc64/sun4u.c b/hw/sparc64/sun4u.c
index 1bede85370..3975a7b65a 100644
--- a/hw/sparc64/sun4u.c
+++ b/hw/sparc64/sun4u.c
@@ -425,13 +425,19 @@ static void prom_init(hwaddr addr, const char *bios_name)
     }
 }
 
-static void prom_init1(Object *obj)
+static void prom_realize(DeviceState *ds, Error **errp)
 {
-    PROMState *s = OPENPROM(obj);
-    SysBusDevice *dev = SYS_BUS_DEVICE(obj);
+    PROMState *s = OPENPROM(ds);
+    SysBusDevice *dev = SYS_BUS_DEVICE(ds);
+    Error *local_err = NULL;
+
+    memory_region_init_ram_nomigrate(&s->prom, OBJECT(ds), "sun4u.prom",
+                                     PROM_SIZE_MAX, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
 
-    memory_region_init_ram_nomigrate(&s->prom, obj, "sun4u.prom", PROM_SIZE_MAX,
-                           &error_fatal);
     vmstate_register_ram_global(&s->prom);
     memory_region_set_readonly(&s->prom, true);
     sysbus_init_mmio(dev, &s->prom);
@@ -446,6 +452,7 @@ static void prom_class_init(ObjectClass *klass, void *data)
     DeviceClass *dc = DEVICE_CLASS(klass);
 
     dc->props = prom_properties;
+    dc->realize = prom_realize;
 }
 
 static const TypeInfo prom_info = {
@@ -453,7 +460,6 @@ static const TypeInfo prom_info = {
     .parent        = TYPE_SYS_BUS_DEVICE,
     .instance_size = sizeof(PROMState),
     .class_init    = prom_class_init,
-    .instance_init = prom_init1,
 };
 
 
diff --git a/hw/usb/dev-smartcard-reader.c b/hw/usb/dev-smartcard-reader.c
index fa546fb3ce..13d0befd9c 100644
--- a/hw/usb/dev-smartcard-reader.c
+++ b/hw/usb/dev-smartcard-reader.c
@@ -1147,7 +1147,6 @@ static void ccid_unrealize(USBDevice *dev, Error **errp)
     USBCCIDState *s = USB_CCID_DEV(dev);
 
     ccid_bulk_in_clear(s);
-    object_unref(OBJECT(&s->bus));
 }
 
 static void ccid_flush_pending_answers(USBCCIDState *s)
diff --git a/hw/usb/dev-storage.c b/hw/usb/dev-storage.c
index 47b992f403..c99398b7f6 100644
--- a/hw/usb/dev-storage.c
+++ b/hw/usb/dev-storage.c
@@ -588,13 +588,6 @@ static const struct SCSIBusInfo usb_msd_scsi_info_bot = {
     .load_request = usb_msd_load_request,
 };
 
-static void usb_msd_unrealize_storage(USBDevice *dev, Error **errp)
-{
-    MSDState *s = USB_STORAGE_DEV(dev);
-
-    object_unref(OBJECT(&s->bus));
-}
-
 static void usb_msd_storage_realize(USBDevice *dev, Error **errp)
 {
     MSDState *s = USB_STORAGE_DEV(dev);
@@ -642,13 +635,6 @@ static void usb_msd_storage_realize(USBDevice *dev, Error **errp)
     s->scsi_dev = scsi_dev;
 }
 
-static void usb_msd_bot_unrealize(USBDevice *dev, Error **errp)
-{
-    MSDState *s = USB_STORAGE_DEV(dev);
-
-    object_unref(OBJECT(&s->bus));
-}
-
 static void usb_msd_bot_realize(USBDevice *dev, Error **errp)
 {
     MSDState *s = USB_STORAGE_DEV(dev);
@@ -712,7 +698,6 @@ static void usb_msd_class_storage_initfn(ObjectClass *klass, void *data)
     USBDeviceClass *uc = USB_DEVICE_CLASS(klass);
 
     uc->realize = usb_msd_storage_realize;
-    uc->unrealize = usb_msd_unrealize_storage;
     dc->props = msd_properties;
 }
 
@@ -775,7 +760,6 @@ static void usb_msd_class_bot_initfn(ObjectClass *klass, void *data)
     USBDeviceClass *uc = USB_DEVICE_CLASS(klass);
 
     uc->realize = usb_msd_bot_realize;
-    uc->unrealize = usb_msd_bot_unrealize;
     uc->attached_settable = true;
 }
 
diff --git a/hw/usb/dev-uas.c b/hw/usb/dev-uas.c
index aaf5a88095..be566cad02 100644
--- a/hw/usb/dev-uas.c
+++ b/hw/usb/dev-uas.c
@@ -896,8 +896,6 @@ static void usb_uas_unrealize(USBDevice *dev, Error **errp)
     UASDevice *uas = USB_UAS(dev);
 
     qemu_bh_delete(uas->status_bh);
-
-    object_unref(OBJECT(&uas->bus));
 }
 
 static void usb_uas_realize(USBDevice *dev, Error **errp)
diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c
index 76e4e8c652..351b305e1a 100644
--- a/hw/vfio/ccw.c
+++ b/hw/vfio/ccw.c
@@ -33,8 +33,30 @@ typedef struct VFIOCCWDevice {
     uint64_t io_region_offset;
     struct ccw_io_region *io_region;
     EventNotifier io_notifier;
+    bool force_orb_pfch;
+    bool warned_orb_pfch;
 } VFIOCCWDevice;
 
+static inline void warn_once(bool *warned, const char *fmt, ...)
+{
+    va_list ap;
+
+    if (!warned || *warned) {
+        return;
+    }
+    *warned = true;
+    va_start(ap, fmt);
+    warn_vreport(fmt, ap);
+    va_end(ap);
+}
+
+static inline void warn_once_pfch(VFIOCCWDevice *vcdev, SubchDev *sch,
+                                  const char *msg)
+{
+    warn_once(&vcdev->warned_orb_pfch, "vfio-ccw (devno %x.%x.%04x): %s",
+              sch->cssid, sch->ssid, sch->devno, msg);
+}
+
 static void vfio_ccw_compute_needs_reset(VFIODevice *vdev)
 {
     vdev->needs_reset = false;
@@ -55,6 +77,18 @@ static IOInstEnding vfio_ccw_handle_request(SubchDev *sch)
     struct ccw_io_region *region = vcdev->io_region;
     int ret;
 
+    if (!(sch->orb.ctrl0 & ORB_CTRL0_MASK_PFCH)) {
+        if (!(vcdev->force_orb_pfch)) {
+            warn_once_pfch(vcdev, sch, "requires PFCH flag set");
+            sch_gen_unit_exception(sch);
+            css_inject_io_interrupt(sch);
+            return IOINST_CC_EXPECTED;
+        } else {
+            sch->orb.ctrl0 |= ORB_CTRL0_MASK_PFCH;
+            warn_once_pfch(vcdev, sch, "PFCH flag forced");
+        }
+    }
+
     QEMU_BUILD_BUG_ON(sizeof(region->orb_area) != sizeof(ORB));
     QEMU_BUILD_BUG_ON(sizeof(region->scsw_area) != sizeof(SCSW));
     QEMU_BUILD_BUG_ON(sizeof(region->irb_area) != sizeof(IRB));
@@ -430,6 +464,7 @@ static void vfio_ccw_unrealize(DeviceState *dev, Error **errp)
 
 static Property vfio_ccw_properties[] = {
     DEFINE_PROP_STRING("sysfsdev", VFIOCCWDevice, vdev.sysfsdev),
+    DEFINE_PROP_BOOL("force-orb-pfch", VFIOCCWDevice, force_orb_pfch, false),
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/include/block/aio-wait.h b/include/block/aio-wait.h
index 8c90a2e66e..c85a62f798 100644
--- a/include/block/aio-wait.h
+++ b/include/block/aio-wait.h
@@ -57,7 +57,8 @@ typedef struct {
 /**
  * AIO_WAIT_WHILE:
  * @wait: the aio wait object
- * @ctx: the aio context
+ * @ctx: the aio context, or NULL if multiple aio contexts (for which the
+ *       caller does not hold a lock) are involved in the polling condition.
  * @cond: wait while this conditional expression is true
  *
  * Wait while a condition is true.  Use this to implement synchronous
@@ -73,29 +74,27 @@ typedef struct {
  */
 #define AIO_WAIT_WHILE(wait, ctx, cond) ({                         \
     bool waited_ = false;                                          \
-    bool busy_ = true;                                             \
     AioWait *wait_ = (wait);                                       \
     AioContext *ctx_ = (ctx);                                      \
-    if (in_aio_context_home_thread(ctx_)) {                        \
-        while ((cond) || busy_) {                                  \
-            busy_ = aio_poll(ctx_, (cond));                        \
-            waited_ |= !!(cond) | busy_;                           \
+    if (ctx_ && in_aio_context_home_thread(ctx_)) {                \
+        while ((cond)) {                                           \
+            aio_poll(ctx_, true);                                  \
+            waited_ = true;                                        \
         }                                                          \
     } else {                                                       \
         assert(qemu_get_current_aio_context() ==                   \
                qemu_get_aio_context());                            \
         /* Increment wait_->num_waiters before evaluating cond. */ \
         atomic_inc(&wait_->num_waiters);                           \
-        while (busy_) {                                            \
-            if ((cond)) {                                          \
-                waited_ = busy_ = true;                            \
+        while ((cond)) {                                           \
+            if (ctx_) {                                            \
                 aio_context_release(ctx_);                         \
-                aio_poll(qemu_get_aio_context(), true);            \
+            }                                                      \
+            aio_poll(qemu_get_aio_context(), true);                \
+            if (ctx_) {                                            \
                 aio_context_acquire(ctx_);                         \
-            } else {                                               \
-                busy_ = aio_poll(ctx_, false);                     \
-                waited_ |= busy_;                                  \
             }                                                      \
+            waited_ = true;                                        \
         }                                                          \
         atomic_dec(&wait_->num_waiters);                           \
     }                                                              \
diff --git a/include/block/block.h b/include/block/block.h
index e677080c4e..b1d6fdb97a 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -421,6 +421,7 @@ BlockDriverState *bdrv_lookup_bs(const char *device,
                                  Error **errp);
 bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base);
 BlockDriverState *bdrv_next_node(BlockDriverState *bs);
+BlockDriverState *bdrv_next_all_states(BlockDriverState *bs);
 
 typedef struct BdrvNextIterator {
     enum {
@@ -557,7 +558,8 @@ void bdrv_io_unplug(BlockDriverState *bs);
  * Begin a quiesced section of all users of @bs. This is part of
  * bdrv_drained_begin.
  */
-void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore);
+void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore,
+                               bool ignore_bds_parents);
 
 /**
  * bdrv_parent_drained_end:
@@ -565,7 +567,23 @@ void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore);
  * End a quiesced section of all users of @bs. This is part of
  * bdrv_drained_end.
  */
-void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore);
+void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore,
+                             bool ignore_bds_parents);
+
+/**
+ * bdrv_drain_poll:
+ *
+ * Poll for pending requests in @bs, its parents (except for @ignore_parent),
+ * and if @recursive is true its children as well (used for subtree drain).
+ *
+ * If @ignore_bds_parents is true, parents that are BlockDriverStates must
+ * ignore the drain request because they will be drained separately (used for
+ * drain_all).
+ *
+ * This is part of bdrv_drained_begin.
+ */
+bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
+                     BdrvChild *ignore_parent, bool ignore_bds_parents);
 
 /**
  * bdrv_drained_begin:
@@ -580,6 +598,15 @@ void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore);
 void bdrv_drained_begin(BlockDriverState *bs);
 
 /**
+ * bdrv_do_drained_begin_quiesce:
+ *
+ * Quiesces a BDS like bdrv_drained_begin(), but does not wait for already
+ * running requests to complete.
+ */
+void bdrv_do_drained_begin_quiesce(BlockDriverState *bs,
+                                   BdrvChild *parent, bool ignore_bds_parents);
+
+/**
  * Like bdrv_drained_begin, but recursively begins a quiesced section for
  * exclusive access to all child nodes as well.
  */
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 327e478a73..74646ed722 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -577,6 +577,12 @@ struct BdrvChildRole {
      * points to. */
     bool stay_at_node;
 
+    /* If true, the parent is a BlockDriverState and bdrv_next_all_states()
+     * will return it. This information is used for drain_all, where every node
+     * will be drained separately, so the drain only needs to be propagated to
+     * non-BDS parents. */
+    bool parent_is_bds;
+
     void (*inherit_options)(int *child_flags, QDict *child_options,
                             int parent_flags, QDict *parent_options);
 
@@ -605,6 +611,13 @@ struct BdrvChildRole {
     void (*drained_begin)(BdrvChild *child);
     void (*drained_end)(BdrvChild *child);
 
+    /*
+     * Returns whether the parent has pending requests for the child. This
+     * callback is polled after .drained_begin() has been called until all
+     * activity on the child has stopped.
+     */
+    bool (*drained_poll)(BdrvChild *child);
+
     /* Notifies the parent that the child has been activated/inactivated (e.g.
      * when migration is completing) and it can start/stop requesting
      * permissions and doing I/O on it. */
@@ -841,6 +854,7 @@ int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
     BdrvRequestFlags flags);
 
+extern unsigned int bdrv_drain_all_count;
 void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent);
 void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent);
 
@@ -1017,6 +1031,7 @@ void commit_active_start(const char *job_id, BlockDriverState *bs,
  * @filter_node_name: The node name that should be assigned to the filter
  * driver that the mirror job inserts into the graph above @bs. NULL means that
  * a node name should be autogenerated.
+ * @copy_mode: When to trigger writes to the target.
  * @errp: Error object.
  *
  * Start a mirroring operation on @bs.  Clusters that are allocated
@@ -1030,7 +1045,8 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
                   MirrorSyncMode mode, BlockMirrorBackingMode backing_mode,
                   BlockdevOnError on_source_error,
                   BlockdevOnError on_target_error,
-                  bool unmap, const char *filter_node_name, Error **errp);
+                  bool unmap, const char *filter_node_name,
+                  MirrorCopyMode copy_mode, Error **errp);
 
 /*
  * backup_job_create:
diff --git a/include/block/blockjob_int.h b/include/block/blockjob_int.h
index 5cd50c6639..e4a318dd15 100644
--- a/include/block/blockjob_int.h
+++ b/include/block/blockjob_int.h
@@ -39,6 +39,14 @@ struct BlockJobDriver {
     JobDriver job_driver;
 
     /*
+     * Returns whether the job has pending requests for the child or will
+     * submit new requests before the next pause point. This callback is polled
+     * in the context of draining a job node after requesting that the job be
+     * paused, until all activity on the child has stopped.
+     */
+    bool (*drained_poll)(BlockJob *job);
+
+    /*
      * If the callback is not NULL, it will be invoked before the job is
      * resumed in a new AioContext.  This is the place to move any resources
      * besides job->blk to the new AioContext.
diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h
index 02e0cbabd2..288dc6adb6 100644
--- a/include/block/dirty-bitmap.h
+++ b/include/block/dirty-bitmap.h
@@ -82,6 +82,8 @@ void bdrv_set_dirty_bitmap_locked(BdrvDirtyBitmap *bitmap,
 void bdrv_reset_dirty_bitmap_locked(BdrvDirtyBitmap *bitmap,
                                     int64_t offset, int64_t bytes);
 int64_t bdrv_dirty_iter_next(BdrvDirtyBitmapIter *iter);
+bool bdrv_dirty_iter_next_area(BdrvDirtyBitmapIter *iter, uint64_t max_offset,
+                               uint64_t *offset, int *bytes);
 void bdrv_set_dirty_iter(BdrvDirtyBitmapIter *hbi, int64_t offset);
 int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap);
 int64_t bdrv_get_meta_dirty_count(BdrvDirtyBitmap *bitmap);
diff --git a/include/exec/ramlist.h b/include/exec/ramlist.h
index 2e2ac6cb99..bc4faa1b00 100644
--- a/include/exec/ramlist.h
+++ b/include/exec/ramlist.h
@@ -56,8 +56,10 @@ typedef struct RAMList {
 extern RAMList ram_list;
 
 /* Should be holding either ram_list.mutex, or the RCU lock. */
-#define  RAMBLOCK_FOREACH(block)  \
+#define  INTERNAL_RAMBLOCK_FOREACH(block)  \
     QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
+/* Never use the INTERNAL_ version except for defining other macros */
+#define RAMBLOCK_FOREACH(block) INTERNAL_RAMBLOCK_FOREACH(block)
 
 void qemu_mutex_lock_ramlist(void);
 void qemu_mutex_unlock_ramlist(void);
diff --git a/include/hw/display/ramfb.h b/include/hw/display/ramfb.h
new file mode 100644
index 0000000000..b33a2c467b
--- /dev/null
+++ b/include/hw/display/ramfb.h
@@ -0,0 +1,12 @@
+#ifndef RAMFB_H
+#define RAMFB_H
+
+/* ramfb.c */
+typedef struct RAMFBState RAMFBState;
+void ramfb_display_update(QemuConsole *con, RAMFBState *s);
+RAMFBState *ramfb_setup(Error **errp);
+
+/* ramfb-standalone.c */
+#define TYPE_RAMFB_DEVICE "ramfb"
+
+#endif /* RAMFB_H */
diff --git a/include/hw/input/adb.h b/include/hw/input/adb.h
index 3ae8445e95..f99d478252 100644
--- a/include/hw/input/adb.h
+++ b/include/hw/input/adb.h
@@ -49,6 +49,7 @@ struct ADBDevice {
 
     int devaddr;
     int handler;
+    bool disable_direct_reg3_writes;
 };
 
 #define ADB_DEVICE_CLASS(cls) \
diff --git a/include/hw/misc/macio/gpio.h b/include/hw/misc/macio/gpio.h
new file mode 100644
index 0000000000..2838ae5fde
--- /dev/null
+++ b/include/hw/misc/macio/gpio.h
@@ -0,0 +1,47 @@
+/*
+ * PowerMac NewWorld MacIO GPIO emulation
+ *
+ * Copyright (c) 2016 Benjamin Herrenschmidt
+ * Copyright (c) 2018 Mark Cave-Ayland
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef MACIO_GPIO_H
+#define MACIO_GPIO_H
+
+#define TYPE_MACIO_GPIO "macio-gpio"
+#define MACIO_GPIO(obj) OBJECT_CHECK(MacIOGPIOState, (obj), TYPE_MACIO_GPIO)
+
+typedef struct MacIOGPIOState {
+    /*< private >*/
+    SysBusDevice parent;
+    /*< public >*/
+
+    OpenPICState *pic;
+
+    MemoryRegion gpiomem;
+    qemu_irq gpio_extirqs[10];
+    uint8_t gpio_levels[8];
+    uint8_t gpio_regs[36]; /* XXX Check count */
+} MacIOGPIOState;
+
+void macio_set_gpio(MacIOGPIOState *s, uint32_t gpio, bool state);
+
+#endif
diff --git a/include/hw/misc/macio/macio.h b/include/hw/misc/macio/macio.h
index 838eaf1db0..cfaa145500 100644
--- a/include/hw/misc/macio/macio.h
+++ b/include/hw/misc/macio/macio.h
@@ -26,8 +26,11 @@
 #ifndef MACIO_H
 #define MACIO_H
 
+#include "hw/char/escc.h"
 #include "hw/intc/heathrow_pic.h"
 #include "hw/misc/macio/cuda.h"
+#include "hw/misc/macio/gpio.h"
+#include "hw/misc/macio/pmu.h"
 #include "hw/ppc/mac_dbdma.h"
 #include "hw/ppc/openpic.h"
 
@@ -41,6 +44,7 @@ typedef struct MacIOState {
 
     MemoryRegion bar;
     CUDAState cuda;
+    PMUState pmu;
     DBDMAState dbdma;
     ESCCState escc;
     uint64_t frequency;
@@ -70,8 +74,11 @@ typedef struct NewWorldMacIOState {
     MacIOState parent_obj;
     /*< public >*/
 
+    bool has_pmu;
+    bool has_adb;
     OpenPICState *pic;
     MACIOIDEState ide[2];
+    MacIOGPIOState gpio;
 } NewWorldMacIOState;
 
 #endif /* MACIO_H */
diff --git a/include/hw/misc/macio/pmu.h b/include/hw/misc/macio/pmu.h
new file mode 100644
index 0000000000..d10895ba5f
--- /dev/null
+++ b/include/hw/misc/macio/pmu.h
@@ -0,0 +1,237 @@
+/*
+ * Definitions for talking to the PMU.  The PMU is a microcontroller
+ * which controls battery charging and system power on PowerBook 3400
+ * and 2400 models as well as the RTC and various other things.
+ *
+ * Copyright (C) 1998 Paul Mackerras.
+ * Copyright (C) 2016 Ben Herrenschmidt
+ */
+
+#ifndef PMU_H
+#define PMU_H
+
+/*
+ * PMU commands
+ */
+
+#define PMU_POWER_CTRL0            0x10  /* control power of some devices */
+#define PMU_POWER_CTRL             0x11  /* control power of some devices */
+#define PMU_ADB_CMD                0x20  /* send ADB packet */
+#define PMU_ADB_POLL_OFF           0x21  /* disable ADB auto-poll */
+#define PMU_WRITE_NVRAM            0x33  /* write non-volatile RAM */
+#define PMU_READ_NVRAM             0x3b  /* read non-volatile RAM */
+#define PMU_SET_RTC                0x30  /* set real-time clock */
+#define PMU_READ_RTC               0x38  /* read real-time clock */
+#define PMU_SET_VOLBUTTON          0x40  /* set volume up/down position */
+#define PMU_BACKLIGHT_BRIGHT       0x41  /* set backlight brightness */
+#define PMU_GET_VOLBUTTON          0x48  /* get volume up/down position */
+#define PMU_PCEJECT                0x4c  /* eject PC-card from slot */
+#define PMU_BATTERY_STATE          0x6b  /* report battery state etc. */
+#define PMU_SMART_BATTERY_STATE    0x6f  /* report battery state (new way) */
+#define PMU_SET_INTR_MASK          0x70  /* set PMU interrupt mask */
+#define PMU_INT_ACK                0x78  /* read interrupt bits */
+#define PMU_SHUTDOWN               0x7e  /* turn power off */
+#define PMU_CPU_SPEED              0x7d  /* control CPU speed on some models */
+#define PMU_SLEEP                  0x7f  /* put CPU to sleep */
+#define PMU_POWER_EVENTS           0x8f  /* Send power-event commands to PMU */
+#define PMU_I2C_CMD                0x9a  /* I2C operations */
+#define PMU_RESET                  0xd0  /* reset CPU */
+#define PMU_GET_BRIGHTBUTTON       0xd9  /* report brightness up/down pos */
+#define PMU_GET_COVER              0xdc  /* report cover open/closed */
+#define PMU_SYSTEM_READY           0xdf  /* tell PMU we are awake */
+#define PMU_DOWNLOAD_STATUS        0xe2  /* Called by MacOS during boot... */
+#define PMU_READ_PMU_RAM           0xe8  /* read the PMU RAM... ??? */
+#define PMU_GET_VERSION            0xea  /* read the PMU version */
+
+/* Bits to use with the PMU_POWER_CTRL0 command */
+#define PMU_POW0_ON            0x80    /* OR this to power ON the device */
+#define PMU_POW0_OFF           0x00    /* leave bit 7 to 0 to power it OFF */
+#define PMU_POW0_HARD_DRIVE    0x04    /* Hard drive power
+                                        * (on wallstreet/lombard ?) */
+
+/* Bits to use with the PMU_POWER_CTRL command */
+#define PMU_POW_ON             0x80    /* OR this to power ON the device */
+#define PMU_POW_OFF            0x00    /* leave bit 7 to 0 to power it OFF */
+#define PMU_POW_BACKLIGHT      0x01    /* backlight power */
+#define PMU_POW_CHARGER        0x02    /* battery charger power */
+#define PMU_POW_IRLED          0x04    /* IR led power (on wallstreet) */
+#define PMU_POW_MEDIABAY       0x08    /* media bay power
+                                        * (wallstreet/lombard ?) */
+
+/* Bits in PMU interrupt and interrupt mask bytes */
+#define PMU_INT_PCEJECT        0x04    /* PC-card eject buttons */
+#define PMU_INT_SNDBRT         0x08    /* sound/brightness up/down buttons */
+#define PMU_INT_ADB            0x10    /* ADB autopoll or reply data */
+#define PMU_INT_BATTERY        0x20    /* Battery state change */
+#define PMU_INT_ENVIRONMENT    0x40    /* Environment interrupts */
+#define PMU_INT_TICK           0x80    /* 1-second tick interrupt */
+
+/* Other bits in PMU interrupt valid when PMU_INT_ADB is set */
+#define PMU_INT_ADB_AUTO           0x04    /* ADB autopoll, when PMU_INT_ADB */
+#define PMU_INT_WAITING_CHARGER    0x01    /* ??? */
+#define PMU_INT_AUTO_SRQ_POLL      0x02    /* ??? */
+
+/* Bits in the environment message (either obtained via PMU_GET_COVER,
+ * or via PMU_INT_ENVIRONMENT on core99) */
+#define PMU_ENV_LID_CLOSED     0x01    /* The lid is closed */
+
+/* I2C related definitions */
+#define PMU_I2C_MODE_SIMPLE    0
+#define PMU_I2C_MODE_STDSUB    1
+#define PMU_I2C_MODE_COMBINED  2
+
+#define PMU_I2C_BUS_STATUS     0
+#define PMU_I2C_BUS_SYSCLK     1
+#define PMU_I2C_BUS_POWER      2
+
+#define PMU_I2C_STATUS_OK          0
+#define PMU_I2C_STATUS_DATAREAD    1
+#define PMU_I2C_STATUS_BUSY        0xfe
+
+/* Kind of PMU (model) */
+enum {
+    PMU_UNKNOWN,
+    PMU_OHARE_BASED,        /* 2400, 3400, 3500 (old G3 powerbook) */
+    PMU_HEATHROW_BASED,     /* PowerBook G3 series */
+    PMU_PADDINGTON_BASED,   /* 1999 PowerBook G3 */
+    PMU_KEYLARGO_BASED,     /* Core99 motherboard (PMU99) */
+    PMU_68K_V1,             /* 68K PMU, version 1 */
+    PMU_68K_V2,             /* 68K PMU, version 2 */
+};
+
+/* PMU PMU_POWER_EVENTS commands */
+enum {
+    PMU_PWR_GET_POWERUP_EVENTS = 0x00,
+    PMU_PWR_SET_POWERUP_EVENTS = 0x01,
+    PMU_PWR_CLR_POWERUP_EVENTS = 0x02,
+    PMU_PWR_GET_WAKEUP_EVENTS = 0x03,
+    PMU_PWR_SET_WAKEUP_EVENTS = 0x04,
+    PMU_PWR_CLR_WAKEUP_EVENTS = 0x05,
+};
+
+/* Power events wakeup bits */
+enum {
+    PMU_PWR_WAKEUP_KEY = 0x01,           /* Wake on key press */
+    PMU_PWR_WAKEUP_AC_INSERT = 0x02,     /* Wake on AC adapter plug */
+    PMU_PWR_WAKEUP_AC_CHANGE = 0x04,
+    PMU_PWR_WAKEUP_LID_OPEN = 0x08,
+    PMU_PWR_WAKEUP_RING = 0x10,
+};
+
+/*
+ * This table indicates for each PMU opcode:
+ * - the number of data bytes to be sent with the command, or -1
+ *   if a length byte should be sent,
+ * - the number of response bytes which the PMU will return, or
+ *   -1 if it will send a length byte.
+ */
+
+static const int8_t pmu_data_len[256][2] = {
+/*  0        1        2        3        4        5        6        7  */
+    {-1,  0},{-1,  0},{-1,  0},{-1,  0},{-1,  0},{-1,  0},{-1,  0},{-1,  0},
+    {-1, -1},{-1, -1},{-1, -1},{-1, -1},{-1, -1},{-1, -1},{-1, -1},{-1, -1},
+    { 1,  0},{ 1,  0},{-1,  0},{-1,  0},{-1,  0},{-1,  0},{-1,  0},{-1,  0},
+    { 0,  1},{ 0,  1},{-1, -1},{-1, -1},{-1, -1},{-1, -1},{-1, -1},{ 0,  0},
+    {-1,  0},{ 0,  0},{ 2,  0},{ 1,  0},{ 1,  0},{-1,  0},{-1,  0},{-1,  0},
+    { 0, -1},{ 0, -1},{-1, -1},{-1, -1},{-1, -1},{-1, -1},{-1, -1},{ 0, -1},
+    { 4,  0},{20,  0},{-1,  0},{ 3,  0},{-1,  0},{-1,  0},{-1,  0},{-1,  0},
+    { 0,  4},{ 0, 20},{ 2, -1},{ 2,  1},{ 3, -1},{-1, -1},{-1, -1},{ 4,  0},
+    { 1,  0},{ 1,  0},{-1,  0},{-1,  0},{-1,  0},{-1,  0},{-1,  0},{-1,  0},
+    { 0,  1},{ 0,  1},{-1, -1},{ 1,  0},{ 1,  0},{-1, -1},{-1, -1},{-1, -1},
+    { 1,  0},{ 0,  0},{ 2,  0},{ 2,  0},{-1,  0},{ 1,  0},{ 3,  0},{ 1,  0},
+    { 0,  1},{ 1,  0},{ 0,  2},{ 0,  2},{ 0, -1},{-1, -1},{-1, -1},{-1, -1},
+    { 2,  0},{-1,  0},{-1,  0},{-1,  0},{-1,  0},{-1,  0},{-1,  0},{-1,  0},
+    { 0,  3},{ 0,  3},{ 0,  2},{ 0,  8},{ 0, -1},{ 0, -1},{-1, -1},{-1, -1},
+    { 1,  0},{ 1,  0},{ 1,  0},{-1,  0},{-1,  0},{-1,  0},{-1,  0},{-1,  0},
+    { 0, -1},{ 0, -1},{-1, -1},{-1, -1},{-1, -1},{ 5,  1},{ 4,  1},{ 4,  1},
+    { 4,  0},{-1,  0},{ 0,  0},{-1,  0},{-1,  0},{-1,  0},{-1,  0},{-1,  0},
+    { 0,  5},{-1, -1},{-1, -1},{-1, -1},{-1, -1},{-1, -1},{-1, -1},{-1, -1},
+    { 1,  0},{ 2,  0},{-1,  0},{-1,  0},{-1,  0},{-1,  0},{-1,  0},{-1,  0},
+    { 0,  1},{ 0,  1},{-1, -1},{-1, -1},{-1, -1},{-1, -1},{-1, -1},{-1, -1},
+    { 2,  0},{ 2,  0},{ 2,  0},{ 4,  0},{-1,  0},{ 0,  0},{-1,  0},{-1,  0},
+    { 1,  1},{ 1,  0},{ 3,  0},{ 2,  0},{-1, -1},{-1, -1},{-1, -1},{-1, -1},
+    {-1,  0},{-1,  0},{-1,  0},{-1,  0},{-1,  0},{-1,  0},{-1,  0},{-1,  0},
+    {-1, -1},{-1, -1},{-1, -1},{-1, -1},{-1, -1},{-1, -1},{-1, -1},{-1, -1},
+    {-1,  0},{-1,  0},{-1,  0},{-1,  0},{-1,  0},{-1,  0},{-1,  0},{-1,  0},
+    {-1, -1},{-1, -1},{-1, -1},{-1, -1},{-1, -1},{-1, -1},{-1, -1},{-1, -1},
+    { 0,  0},{-1,  0},{-1,  0},{-1,  0},{-1,  0},{-1,  0},{-1,  0},{-1,  0},
+    { 1,  1},{ 1,  1},{-1, -1},{-1, -1},{ 0,  1},{ 0, -1},{-1, -1},{-1, -1},
+    {-1,  0},{ 4,  0},{ 0,  1},{-1,  0},{-1,  0},{ 4,  0},{-1,  0},{-1,  0},
+    { 3, -1},{-1, -1},{ 0,  1},{-1, -1},{ 0, -1},{-1, -1},{-1, -1},{ 0,  0},
+    {-1,  0},{-1,  0},{-1,  0},{-1,  0},{-1,  0},{-1,  0},{-1,  0},{-1,  0},
+    {-1, -1},{-1, -1},{-1, -1},{-1, -1},{-1, -1},{-1, -1},{-1, -1},{-1, -1},
+};
+
+/* Command protocol state machine */
+typedef enum {
+    pmu_state_idle, /* Waiting for command */
+    pmu_state_cmd,  /* Receiving command */
+    pmu_state_rsp,  /* Responding to command */
+} PMUCmdState;
+
+/* MOS6522 PMU */
+typedef struct MOS6522PMUState {
+    /*< private >*/
+    MOS6522State parent_obj;
+} MOS6522PMUState;
+
+#define TYPE_MOS6522_PMU "mos6522-pmu"
+#define MOS6522_PMU(obj) OBJECT_CHECK(MOS6522PMUState, (obj), \
+                                      TYPE_MOS6522_PMU)
+/**
+ * PMUState:
+ * @last_b: last value of B register
+ */
+
+typedef struct PMUState {
+    /*< private >*/
+    SysBusDevice parent_obj;
+    /*< public >*/
+
+    MemoryRegion mem;
+    uint64_t frequency;
+    qemu_irq via_irq;
+    bool via_irq_state;
+
+    /* PMU state */
+    MOS6522PMUState mos6522_pmu;
+
+    /* PMU low level protocol state */
+    PMUCmdState cmd_state;
+    uint8_t last_b;
+    uint8_t cmd;
+    uint32_t cmdlen;
+    uint32_t rsplen;
+    uint8_t cmd_buf_pos;
+    uint8_t cmd_buf[128];
+    uint8_t cmd_rsp_pos;
+    uint8_t cmd_rsp_sz;
+    uint8_t cmd_rsp[128];
+
+    /* PMU events/interrupts */
+    uint8_t intbits;
+    uint8_t intmask;
+
+    /* ADB */
+    bool has_adb;
+    ADBBusState adb_bus;
+    uint16_t adb_poll_mask;
+    uint8_t autopoll_rate_ms;
+    uint8_t autopoll_mask;
+    QEMUTimer *adb_poll_timer;
+    uint8_t adb_reply_size;
+    uint8_t adb_reply[ADB_MAX_OUT_LEN];
+
+    /* RTC */
+    uint32_t tick_offset;
+    QEMUTimer *one_sec_timer;
+    int64_t one_sec_target;
+
+    /* GPIO */
+    MacIOGPIOState *gpio;
+} PMUState;
+
+#define TYPE_VIA_PMU "via-pmu"
+#define VIA_PMU(obj) OBJECT_CHECK(PMUState, (obj), TYPE_VIA_PMU)
+
+#endif /* PMU_H */
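
[Editor's note] The three PMUCmdState values above describe a simple request/response protocol: the PMU sits idle, collects command bytes, then shifts a reply back out. As a rough illustration only (pmu_feed_byte is a made-up name; the real handler lives in hw/misc/macio/pmu.c and is more involved), a byte-at-a-time state machine of this shape could look like:

    /* Hypothetical sketch, not the code added by this patch */
    static void pmu_feed_byte(PMUState *s, uint8_t byte)
    {
        switch (s->cmd_state) {
        case pmu_state_idle:
            s->cmd = byte;                     /* first byte selects the command */
            s->cmd_buf_pos = 0;
            s->cmd_state = pmu_state_cmd;
            break;
        case pmu_state_cmd:
            if (s->cmd_buf_pos < sizeof(s->cmd_buf)) {
                s->cmd_buf[s->cmd_buf_pos++] = byte;
            }
            if (s->cmd_buf_pos == s->cmdlen) {
                s->cmd_rsp_pos = 0;
                s->cmd_state = pmu_state_rsp;  /* operands complete, build reply */
            }
            break;
        case pmu_state_rsp:
            if (s->cmd_rsp_pos >= s->cmd_rsp_sz) {
                s->cmd_state = pmu_state_idle; /* reply fully drained */
            }
            break;
        }
    }
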
diff --git a/include/hw/misc/mos6522.h b/include/hw/misc/mos6522.h
index f52b41920b..03d9f0c059 100644
--- a/include/hw/misc/mos6522.h
+++ b/include/hw/misc/mos6522.h
@@ -134,6 +134,7 @@ typedef struct MOS6522DeviceClass {
     void (*set_sr_int)(MOS6522State *dev);
     void (*portB_write)(MOS6522State *dev);
     void (*portA_write)(MOS6522State *dev);
+    void (*update_irq)(MOS6522State *dev);
     /* These are used to influence the CUDA MacOS timebase calibration */
     uint64_t (*get_timer1_counter_value)(MOS6522State *dev, MOS6522Timer *ti);
     uint64_t (*get_timer2_counter_value)(MOS6522State *dev, MOS6522Timer *ti);
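
[Editor's note] The new update_irq hook is wired up the same way as the existing portA_write/portB_write callbacks: a subclass installs its own function in class_init so the generic 6522 code can delegate interrupt-line updates. A hedged sketch of such an override (pmu_mos6522_update_irq and pmu_mos6522_class_init are illustrative names; ifr/ier are the 6522 interrupt flag/enable registers declared in MOS6522State, and the actual implementation is in hw/misc/macio/pmu.c):

    static void pmu_mos6522_update_irq(MOS6522State *dev)
    {
        MOS6522PMUState *mps = container_of(dev, MOS6522PMUState, parent_obj);
        PMUState *s = container_of(mps, PMUState, mos6522_pmu);

        /* raise the shared VIA interrupt if any enabled 6522 source is pending */
        qemu_set_irq(s->via_irq, (dev->ifr & dev->ier) != 0);
    }

    static void pmu_mos6522_class_init(ObjectClass *oc, void *data)
    {
        MOS6522DeviceClass *mdc = MOS6522_DEVICE_CLASS(oc);

        mdc->update_irq = pmu_mos6522_update_irq;
    }
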
diff --git a/include/hw/ppc/pnv_core.h b/include/hw/ppc/pnv_core.h
index e337af7a3a..447ae761f7 100644
--- a/include/hw/ppc/pnv_core.h
+++ b/include/hw/ppc/pnv_core.h
@@ -34,7 +34,7 @@ typedef struct PnvCore {
     CPUCore parent_obj;
 
     /*< public >*/
-    void *threads;
+    PowerPCCPU **threads;
     uint32_t pir;
 
     MemoryRegion xscom_regs;
diff --git a/include/hw/ppc/ppc.h b/include/hw/ppc/ppc.h
index b18ef3eefb..298ec354a8 100644
--- a/include/hw/ppc/ppc.h
+++ b/include/hw/ppc/ppc.h
@@ -101,6 +101,7 @@ enum {
 #define FW_CFG_PPC_NVRAM_ADDR   (FW_CFG_ARCH_LOCAL + 0x08)
 #define FW_CFG_PPC_BUSFREQ      (FW_CFG_ARCH_LOCAL + 0x09)
 #define FW_CFG_PPC_NVRAM_FLAT   (FW_CFG_ARCH_LOCAL + 0x0a)
+#define FW_CFG_PPC_VIACONFIG    (FW_CFG_ARCH_LOCAL + 0x0b)
 
 #define PPC_SERIAL_MM_BAUDBASE 399193
 
diff --git a/include/hw/ppc/spapr_cpu_core.h b/include/hw/ppc/spapr_cpu_core.h
index 47dcfda12b..8ceea2973a 100644
--- a/include/hw/ppc/spapr_cpu_core.h
+++ b/include/hw/ppc/spapr_cpu_core.h
@@ -41,4 +41,15 @@ typedef struct sPAPRCPUCoreClass {
 const char *spapr_get_cpu_core_type(const char *cpu_type);
 void spapr_cpu_set_entry_state(PowerPCCPU *cpu, target_ulong nip, target_ulong r3);
 
+typedef struct sPAPRCPUState {
+    uint64_t vpa_addr;
+    uint64_t slb_shadow_addr, slb_shadow_size;
+    uint64_t dtl_addr, dtl_size;
+} sPAPRCPUState;
+
+static inline sPAPRCPUState *spapr_cpu_state(PowerPCCPU *cpu)
+{
+    return (sPAPRCPUState *)cpu->machine_data;
+}
+
 #endif
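
[Editor's note] spapr_cpu_state() assumes cpu->machine_data has been pointed at a per-thread sPAPRCPUState by the core code elsewhere in this series. A hedged example of how a hypercall handler might then use it (register_vpa_example is a hypothetical helper assuming the usual H_SUCCESS return value; the real code lives in hw/ppc/spapr_hcall.c):

    static target_ulong register_vpa_example(PowerPCCPU *cpu, target_ulong vpa)
    {
        sPAPRCPUState *spapr_cpu = spapr_cpu_state(cpu);

        /* remember where this vCPU registered its Virtual Processor Area */
        spapr_cpu->vpa_addr = vpa;
        return H_SUCCESS;
    }
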
diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h
index 3747110f95..42b946ce90 100644
--- a/include/migration/vmstate.h
+++ b/include/migration/vmstate.h
@@ -27,8 +27,6 @@
 #ifndef QEMU_VMSTATE_H
 #define QEMU_VMSTATE_H
 
-typedef struct QJSON QJSON;
-
 typedef struct VMStateInfo VMStateInfo;
 typedef struct VMStateDescription VMStateDescription;
 typedef struct VMStateField VMStateField;
diff --git a/include/qemu/hbitmap.h b/include/qemu/hbitmap.h
index 6b6490ecad..ddca52c48e 100644
--- a/include/qemu/hbitmap.h
+++ b/include/qemu/hbitmap.h
@@ -324,11 +324,14 @@ void hbitmap_free_meta(HBitmap *hb);
 /**
  * hbitmap_iter_next:
  * @hbi: HBitmapIter to operate on.
+ * @advance: If true, advance the iterator.  Otherwise, the next call
+ *           of this function will return the same result (if that
+ *           position is still dirty).
  *
  * Return the next bit that is set in @hbi's associated HBitmap,
  * or -1 if all remaining bits are zero.
  */
-int64_t hbitmap_iter_next(HBitmapIter *hbi);
+int64_t hbitmap_iter_next(HBitmapIter *hbi, bool advance);
 
 /**
  * hbitmap_iter_next_word:
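
[Editor's note] With the new @advance flag a caller can peek at the next set bit without consuming it and only advance once it has decided to process that position. A minimal sketch of the peek-then-consume pattern (overlaps_in_flight() and process_chunk() are hypothetical placeholders):

    static void try_next_chunk(HBitmapIter *hbi)
    {
        int64_t next = hbitmap_iter_next(hbi, false);  /* peek, do not advance */

        if (next >= 0 && !overlaps_in_flight(next)) {
            hbitmap_iter_next(hbi, true);              /* same bit, now consume it */
            process_chunk(next);
        }
    }
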
diff --git a/include/qemu/job.h b/include/qemu/job.h
index 1d820530fa..18c9223e31 100644
--- a/include/qemu/job.h
+++ b/include/qemu/job.h
@@ -335,6 +335,21 @@ void job_progress_update(Job *job, uint64_t done);
  */
 void job_progress_set_remaining(Job *job, uint64_t remaining);
 
+/**
+ * @job: The job whose expected progress end value is updated
+ * @delta: Value which is to be added to the current expected end
+ *         value
+ *
+ * Increases the expected end value of the progress counter of a job.
+ * This is useful for parenthesis operations: If a job has to
+ * conditionally perform a high-priority operation as part of its
+ * progress, it calls this function with the expected operation's
+ * length before, and job_progress_update() afterwards.
+ * (So the operation acts as a parenthesis with regard to the main job
+ * operation running in the background.)
+ */
+void job_progress_increase_remaining(Job *job, uint64_t delta);
+
 /** To be called when a cancelled job is finalised. */
 void job_event_cancelled(Job *job);
 
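
[Editor's note] Put differently, the expected total is widened before the optional sub-operation starts and the normal progress counter is bumped once it finishes, so job->progress_current never overshoots job->progress_total. A hedged sketch of the call pattern (copy_with_progress() and do_extra_copy() are made-up placeholders):

    static void copy_with_progress(Job *job, uint64_t extra_bytes)
    {
        /* open the parenthesis: account for the conditional operation ... */
        job_progress_increase_remaining(job, extra_bytes);
        do_extra_copy(extra_bytes);
        /* ... and close it again once the operation has completed */
        job_progress_update(job, extra_bytes);
    }
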
diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h
index 325c72de33..3ec0e13a96 100644
--- a/include/qemu/typedefs.h
+++ b/include/qemu/typedefs.h
@@ -97,6 +97,7 @@ typedef struct QEMUTimer QEMUTimer;
 typedef struct QEMUTimerListGroup QEMUTimerListGroup;
 typedef struct QBool QBool;
 typedef struct QDict QDict;
+typedef struct QJSON QJSON;
 typedef struct QList QList;
 typedef struct QNull QNull;
 typedef struct QNum QNum;
diff --git a/job.c b/job.c
index 84e140238b..fa671b431a 100644
--- a/job.c
+++ b/job.c
@@ -385,6 +385,11 @@ void job_progress_set_remaining(Job *job, uint64_t remaining)
     job->progress_total = job->progress_current + remaining;
 }
 
+void job_progress_increase_remaining(Job *job, uint64_t delta)
+{
+    job->progress_total += delta;
+}
+
 void job_event_cancelled(Job *job)
 {
     notifier_list_notify(&job->on_finalize_cancelled, job);
diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c
index eeccaff34b..3bafbbdc4c 100644
--- a/migration/block-dirty-bitmap.c
+++ b/migration/block-dirty-bitmap.c
@@ -672,6 +672,9 @@ static int dirty_bitmap_load(QEMUFile *f, void *opaque, int version_id)
 
     do {
         ret = dirty_bitmap_load_header(f, &s);
+        if (ret < 0) {
+            return ret;
+        }
 
         if (s.flags & DIRTY_BITMAP_MIG_FLAG_START) {
             ret = dirty_bitmap_load_start(f, &s);
diff --git a/migration/migration.c b/migration/migration.c
index 1e99ec9b7e..e1eaa97df4 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -82,6 +82,11 @@
 #define DEFAULT_MIGRATE_MULTIFD_CHANNELS 2
 #define DEFAULT_MIGRATE_MULTIFD_PAGE_COUNT 16
 
+/* Background transfer rate for postcopy; 0 means unlimited.  Note
+ * that page requests can still exceed this limit.
+ */
+#define DEFAULT_MIGRATE_MAX_POSTCOPY_BANDWIDTH 0
+
 static NotifierList migration_state_notifiers =
     NOTIFIER_LIST_INITIALIZER(migration_state_notifiers);
 
@@ -659,6 +664,8 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp)
     params->x_multifd_page_count = s->parameters.x_multifd_page_count;
     params->has_xbzrle_cache_size = true;
     params->xbzrle_cache_size = s->parameters.xbzrle_cache_size;
+    params->has_max_postcopy_bandwidth = true;
+    params->max_postcopy_bandwidth = s->parameters.max_postcopy_bandwidth;
 
     return params;
 }
@@ -1066,6 +1073,9 @@ static void migrate_params_test_apply(MigrateSetParameters *params,
     if (params->has_xbzrle_cache_size) {
         dest->xbzrle_cache_size = params->xbzrle_cache_size;
     }
+    if (params->has_max_postcopy_bandwidth) {
+        dest->max_postcopy_bandwidth = params->max_postcopy_bandwidth;
+    }
 }
 
 static void migrate_params_apply(MigrateSetParameters *params, Error **errp)
@@ -1138,6 +1148,9 @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp)
         s->parameters.xbzrle_cache_size = params->xbzrle_cache_size;
         xbzrle_cache_resize(params->xbzrle_cache_size, errp);
     }
+    if (params->has_max_postcopy_bandwidth) {
+        s->parameters.max_postcopy_bandwidth = params->max_postcopy_bandwidth;
+    }
 }
 
 void qmp_migrate_set_parameters(MigrateSetParameters *params, Error **errp)
@@ -1887,6 +1900,16 @@ int64_t migrate_xbzrle_cache_size(void)
     return s->parameters.xbzrle_cache_size;
 }
 
+static int64_t migrate_max_postcopy_bandwidth(void)
+{
+    MigrationState *s;
+
+    s = migrate_get_current();
+
+    return s->parameters.max_postcopy_bandwidth;
+}
+
+
 bool migrate_use_block(void)
 {
     MigrationState *s;
@@ -2226,6 +2249,7 @@ static int postcopy_start(MigrationState *ms)
     QIOChannelBuffer *bioc;
     QEMUFile *fb;
     int64_t time_at_stop = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+    int64_t bandwidth = migrate_max_postcopy_bandwidth();
     bool restart_block = false;
     int cur_state = MIGRATION_STATUS_ACTIVE;
     if (!migrate_pause_before_switchover()) {
@@ -2280,7 +2304,12 @@ static int postcopy_start(MigrationState *ms)
      * will notice we're in POSTCOPY_ACTIVE and not actually
      * wrap their state up here
      */
-    qemu_file_set_rate_limit(ms->to_dst_file, INT64_MAX);
+    /* 0 max-postcopy-bandwidth means unlimited */
+    if (!bandwidth) {
+        qemu_file_set_rate_limit(ms->to_dst_file, INT64_MAX);
+    } else {
+        qemu_file_set_rate_limit(ms->to_dst_file, bandwidth / XFER_LIMIT_RATIO);
+    }
     if (migrate_postcopy_ram()) {
         /* Ping just for debugging, helps line traces up */
         qemu_savevm_send_ping(ms->to_dst_file, 2);
@@ -2717,8 +2746,7 @@ static void migration_update_counters(MigrationState *s,
      * recalculate. 10000 is a small enough number for our purposes
      */
     if (ram_counters.dirty_pages_rate && transferred > 10000) {
-        s->expected_downtime = ram_counters.dirty_pages_rate *
-            qemu_target_page_size() / bandwidth;
+        s->expected_downtime = ram_counters.remaining / bandwidth;
     }
 
     qemu_file_reset_rate_limit(s->to_dst_file);
@@ -2823,6 +2851,16 @@ static void migration_iteration_finish(MigrationState *s)
     qemu_mutex_unlock_iothread();
 }
 
+void migration_make_urgent_request(void)
+{
+    qemu_sem_post(&migrate_get_current()->rate_limit_sem);
+}
+
+void migration_consume_urgent_request(void)
+{
+    qemu_sem_wait(&migrate_get_current()->rate_limit_sem);
+}
+
 /*
  * Master migration thread on the source VM.
  * It drives the migration and pumps the data down the outgoing channel.
@@ -2832,6 +2870,7 @@ static void *migration_thread(void *opaque)
     MigrationState *s = opaque;
     int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
     MigThrError thr_error;
+    bool urgent = false;
 
     rcu_register_thread();
 
@@ -2872,7 +2911,7 @@ static void *migration_thread(void *opaque)
            s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
         int64_t current_time;
 
-        if (!qemu_file_rate_limit(s->to_dst_file)) {
+        if (urgent || !qemu_file_rate_limit(s->to_dst_file)) {
             MigIterateState iter_state = migration_iteration_run(s);
             if (iter_state == MIG_ITERATE_SKIP) {
                 continue;
@@ -2903,10 +2942,24 @@ static void *migration_thread(void *opaque)
 
         migration_update_counters(s, current_time);
 
+        urgent = false;
         if (qemu_file_rate_limit(s->to_dst_file)) {
-            /* usleep expects microseconds */
-            g_usleep((s->iteration_start_time + BUFFER_DELAY -
-                      current_time) * 1000);
+            /* Wait either for the rate-limiting delay to expire OR
+             * for something urgent to post the semaphore.
+             */
+            int ms = s->iteration_start_time + BUFFER_DELAY - current_time;
+            trace_migration_thread_ratelimit_pre(ms);
+            if (qemu_sem_timedwait(&s->rate_limit_sem, ms) == 0) {
+                /* We were woken by one or more urgent requests, but
+                 * the timedwait has consumed one of their posts.
+                 * The service routine for each urgent request does
+                 * its own wait on the semaphore for the item it
+                 * consumes, so post back the one we just ate.
+                 */
+                qemu_sem_post(&s->rate_limit_sem);
+                urgent = true;
+            }
+            trace_migration_thread_ratelimit_post(urgent);
         }
     }
 
@@ -3042,6 +3095,9 @@ static Property migration_properties[] = {
     DEFINE_PROP_SIZE("xbzrle-cache-size", MigrationState,
                       parameters.xbzrle_cache_size,
                       DEFAULT_MIGRATE_XBZRLE_CACHE_SIZE),
+    DEFINE_PROP_SIZE("max-postcopy-bandwidth", MigrationState,
+                      parameters.max_postcopy_bandwidth,
+                      DEFAULT_MIGRATE_MAX_POSTCOPY_BANDWIDTH),
 
     /* Migration capabilities */
     DEFINE_PROP_MIG_CAP("x-xbzrle", MIGRATION_CAPABILITY_XBZRLE),
@@ -3077,6 +3133,7 @@ static void migration_instance_finalize(Object *obj)
     qemu_mutex_destroy(&ms->qemu_file_lock);
     g_free(params->tls_hostname);
     g_free(params->tls_creds);
+    qemu_sem_destroy(&ms->rate_limit_sem);
     qemu_sem_destroy(&ms->pause_sem);
     qemu_sem_destroy(&ms->postcopy_pause_sem);
     qemu_sem_destroy(&ms->postcopy_pause_rp_sem);
@@ -3110,10 +3167,12 @@ static void migration_instance_init(Object *obj)
     params->has_x_multifd_channels = true;
     params->has_x_multifd_page_count = true;
     params->has_xbzrle_cache_size = true;
+    params->has_max_postcopy_bandwidth = true;
 
     qemu_sem_init(&ms->postcopy_pause_sem, 0);
     qemu_sem_init(&ms->postcopy_pause_rp_sem, 0);
     qemu_sem_init(&ms->rp_state.rp_sem, 0);
+    qemu_sem_init(&ms->rate_limit_sem, 0);
     qemu_mutex_init(&ms->qemu_file_lock);
 }
 
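
[Editor's note] To restate the rate_limit_sem handshake introduced above in one place: the producer posts the semaphore once per queued urgent page request, the sleeping migration thread's qemu_sem_timedwait() eats one of those posts when it wakes (and immediately re-posts it), and the consumer takes one post per request it dequeues. A simplified outline of the three parties, using the functions added by this patch (disjoint fragments, not a compilable unit):

    /* producer, e.g. ram_save_queue_pages(): */
    migration_make_urgent_request();         /* qemu_sem_post(&rate_limit_sem) */

    /* migration thread, while rate limited: */
    if (qemu_sem_timedwait(&s->rate_limit_sem, ms) == 0) {
        qemu_sem_post(&s->rate_limit_sem);   /* give back the post we consumed */
        urgent = true;                       /* bypass the rate limit this round */
    }

    /* consumer, e.g. unqueue_page(): */
    migration_consume_urgent_request();      /* qemu_sem_wait(&rate_limit_sem) */
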
diff --git a/migration/migration.h b/migration/migration.h
index 5af57d616c..64a7b33735 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -121,6 +121,11 @@ struct MigrationState
      */
     QemuMutex qemu_file_lock;
 
+    /*
+     * Used to allow urgent requests to override rate limiting.
+     */
+    QemuSemaphore rate_limit_sem;
+
     /* bytes already sent at the beginning of the current iteration */
     uint64_t iteration_initial_bytes;
     /* time at the start of current iteration */
@@ -284,4 +289,10 @@ void migrate_send_rp_resume_ack(MigrationIncomingState *mis, uint32_t value);
 void dirty_bitmap_mig_before_vm_start(void);
 void init_dirty_bitmap_incoming_migration(void);
 
+#define qemu_ram_foreach_block \
+  #warning "Use qemu_ram_foreach_block_migratable in migration code"
+
+void migration_make_urgent_request(void);
+void migration_consume_urgent_request(void);
+
 #endif
diff --git a/migration/qjson.h b/migration/qjson.h
index 2978b5f371..41664f2d71 100644
--- a/migration/qjson.h
+++ b/migration/qjson.h
@@ -13,8 +13,6 @@
 #ifndef QEMU_QJSON_H
 #define QEMU_QJSON_H
 
-typedef struct QJSON QJSON;
-
 QJSON *qjson_new(void);
 void qjson_destroy(QJSON *json);
 void json_prop_str(QJSON *json, const char *name, const char *str);
diff --git a/migration/ram.c b/migration/ram.c
index a500015a2f..cd5f55117d 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -159,9 +159,11 @@ out:
 
 /* Should be holding either ram_list.mutex, or the RCU lock. */
 #define RAMBLOCK_FOREACH_MIGRATABLE(block)             \
-    RAMBLOCK_FOREACH(block)                            \
+    INTERNAL_RAMBLOCK_FOREACH(block)                   \
         if (!qemu_ram_is_migratable(block)) {} else
 
+#undef RAMBLOCK_FOREACH
+
 static void ramblock_recv_map_init(void)
 {
     RAMBlock *rb;
@@ -1139,6 +1141,25 @@ uint64_t ram_pagesize_summary(void)
     return summary;
 }
 
+static void migration_update_rates(RAMState *rs, int64_t end_time)
+{
+    uint64_t iter_count = rs->iterations - rs->iterations_prev;
+
+    /* calculate period counters */
+    ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
+                / (end_time - rs->time_last_bitmap_sync);
+
+    if (!iter_count) {
+        return;
+    }
+
+    if (migrate_use_xbzrle()) {
+        xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
+            rs->xbzrle_cache_miss_prev) / iter_count;
+        rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
+    }
+}
+
 static void migration_bitmap_sync(RAMState *rs)
 {
     RAMBlock *block;
@@ -1159,6 +1180,7 @@ static void migration_bitmap_sync(RAMState *rs)
     RAMBLOCK_FOREACH_MIGRATABLE(block) {
         migration_bitmap_sync_range(rs, block, 0, block->used_length);
     }
+    ram_counters.remaining = ram_bytes_remaining();
     rcu_read_unlock();
     qemu_mutex_unlock(&rs->bitmap_mutex);
 
@@ -1168,9 +1190,6 @@ static void migration_bitmap_sync(RAMState *rs)
 
     /* more than 1 second = 1000 milliseconds */
     if (end_time > rs->time_last_bitmap_sync + 1000) {
-        /* calculate period counters */
-        ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
-            / (end_time - rs->time_last_bitmap_sync);
         bytes_xfer_now = ram_counters.transferred;
 
         /* During block migration the auto-converge logic incorrectly detects
@@ -1192,16 +1211,9 @@ static void migration_bitmap_sync(RAMState *rs)
             }
         }
 
-        if (migrate_use_xbzrle()) {
-            if (rs->iterations_prev != rs->iterations) {
-                xbzrle_counters.cache_miss_rate =
-                   (double)(xbzrle_counters.cache_miss -
-                            rs->xbzrle_cache_miss_prev) /
-                   (rs->iterations - rs->iterations_prev);
-            }
-            rs->iterations_prev = rs->iterations;
-            rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
-        }
+        migration_update_rates(rs, end_time);
+
+        rs->iterations_prev = rs->iterations;
 
         /* reset period counters */
         rs->time_last_bitmap_sync = end_time;
@@ -1536,6 +1548,7 @@ static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
             memory_region_unref(block->mr);
             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
             g_free(entry);
+            migration_consume_urgent_request();
         }
     }
     qemu_mutex_unlock(&rs->src_page_req_mutex);
@@ -1684,6 +1697,7 @@ int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
     memory_region_ref(ramblock->mr);
     qemu_mutex_lock(&rs->src_page_req_mutex);
     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
+    migration_make_urgent_request();
     qemu_mutex_unlock(&rs->src_page_req_mutex);
     rcu_read_unlock();
 
@@ -2516,7 +2530,7 @@ static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
      * about dirty page logging as well.
      */
 
-    RAMBLOCK_FOREACH(block) {
+    RAMBLOCK_FOREACH_MIGRATABLE(block) {
         pages += bitmap_count_one(block->bmap,
                                   block->used_length >> TARGET_PAGE_BITS);
     }
@@ -2632,9 +2646,14 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
 
     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
     i = 0;
-    while ((ret = qemu_file_rate_limit(f)) == 0) {
+    while ((ret = qemu_file_rate_limit(f)) == 0 ||
+            !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
         int pages;
 
+        if (qemu_file_get_error(f)) {
+            break;
+        }
+
         pages = ram_find_and_save_block(rs, false);
         /* no more pages to sent */
         if (pages == 0) {
@@ -3431,7 +3450,7 @@ static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
 
     trace_ram_dirty_bitmap_sync_start();
 
-    RAMBLOCK_FOREACH(block) {
+    RAMBLOCK_FOREACH_MIGRATABLE(block) {
         qemu_savevm_send_recv_bitmap(file, block->idstr);
         trace_ram_dirty_bitmap_request(block->idstr);
         ramblock_count++;
diff --git a/migration/rdma.c b/migration/rdma.c
index 05aee3d591..8bd7159059 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -635,7 +635,7 @@ static int qemu_rdma_init_ram_blocks(RDMAContext *rdma)
 
     assert(rdma->blockmap == NULL);
     memset(local, 0, sizeof *local);
-    qemu_ram_foreach_block(qemu_rdma_init_one_block, rdma);
+    qemu_ram_foreach_migratable_block(qemu_rdma_init_one_block, rdma);
     trace_qemu_rdma_init_ram_blocks(local->nb_blocks);
     rdma->dest_blocks = g_new0(RDMADestBlock,
                                rdma->local_ram_blocks.nb_blocks);
diff --git a/migration/trace-events b/migration/trace-events
index 4a768eaaeb..3f67758893 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -108,6 +108,8 @@ migration_return_path_end_before(void) ""
 migration_return_path_end_after(int rp_error) "%d"
 migration_thread_after_loop(void) ""
 migration_thread_file_err(void) ""
+migration_thread_ratelimit_pre(int ms) "%d ms"
+migration_thread_ratelimit_post(int urgent) "urgent: %d"
 migration_thread_setup_complete(void) ""
 open_return_path_on_source(void) ""
 open_return_path_on_source_continue(void) ""
diff --git a/monitor.c b/monitor.c
index 6d0cec552e..885e000f9b 100644
--- a/monitor.c
+++ b/monitor.c
@@ -207,22 +207,35 @@ struct Monitor {
     bool skip_flush;
     bool use_io_thr;
 
-    /* We can't access guest memory when holding the lock */
-    QemuMutex out_lock;
-    QString *outbuf;
-    guint out_watch;
-
-    /* Read under either BQL or out_lock, written with BQL+out_lock.  */
-    int mux_out;
-
+    /*
+     * State used only in the thread "owning" the monitor.
+     * If @use_io_thr, this is mon_global.mon_iothread.
+     * Else, it's the main thread.
+     * These members can be safely accessed without locks.
+     */
     ReadLineState *rs;
+
     MonitorQMP qmp;
     gchar *mon_cpu_path;
     BlockCompletionFunc *password_completion_cb;
     void *password_opaque;
     mon_cmd_t *cmd_table;
-    QLIST_HEAD(,mon_fd_t) fds;
     QTAILQ_ENTRY(Monitor) entry;
+
+    /*
+     * The per-monitor lock. We can't access guest memory when holding
+     * the lock.
+     */
+    QemuMutex mon_lock;
+
+    /*
+     * Fields that are protected by the per-monitor lock.
+     */
+    QLIST_HEAD(, mon_fd_t) fds;
+    QString *outbuf;
+    guint out_watch;
+    /* Read under either BQL or mon_lock, written with BQL+mon_lock.  */
+    int mux_out;
 };
 
 /* Let's add monitor global variables to this struct. */
@@ -253,11 +266,15 @@ typedef struct QMPRequest QMPRequest;
 /* QMP checker flags */
 #define QMP_ACCEPT_UNKNOWNS 1
 
-/* Protects mon_list, monitor_event_state.  */
+/* Protects mon_list, monitor_qapi_event_state.  */
 static QemuMutex monitor_lock;
-
+static GHashTable *monitor_qapi_event_state;
 static QTAILQ_HEAD(mon_list, Monitor) mon_list;
+
+/* Protects mon_fdsets */
+static QemuMutex mon_fdsets_lock;
 static QLIST_HEAD(mon_fdsets, MonFdset) mon_fdsets;
+
 static int mon_refcount;
 
 static mon_cmd_t mon_cmds[];
@@ -267,8 +284,6 @@ QmpCommandList qmp_commands, qmp_cap_negotiation_commands;
 
 Monitor *cur_mon;
 
-static QEMUClockType event_clock_type = QEMU_CLOCK_REALTIME;
-
 static void monitor_command_cb(void *opaque, const char *cmdline,
                                void *readline_opaque);
 
@@ -295,6 +310,19 @@ static inline bool monitor_is_hmp_non_interactive(const Monitor *mon)
     return !monitor_is_qmp(mon) && !monitor_uses_readline(mon);
 }
 
+/*
+ * Return the clock to use for recording an event's time.
+ * Beware: result is invalid before configure_accelerator().
+ */
+static inline QEMUClockType monitor_get_event_clock(void)
+{
+    /*
+     * This allows us to perform tests on the monitor queues to verify
+     * that the rate limits are enforced.
+     */
+    return qtest_enabled() ? QEMU_CLOCK_VIRTUAL : QEMU_CLOCK_REALTIME;
+}
+
 /**
  * Is the current monitor, if any, a QMP monitor?
  */
@@ -365,14 +393,14 @@ static gboolean monitor_unblocked(GIOChannel *chan, GIOCondition cond,
 {
     Monitor *mon = opaque;
 
-    qemu_mutex_lock(&mon->out_lock);
+    qemu_mutex_lock(&mon->mon_lock);
     mon->out_watch = 0;
     monitor_flush_locked(mon);
-    qemu_mutex_unlock(&mon->out_lock);
+    qemu_mutex_unlock(&mon->mon_lock);
     return FALSE;
 }
 
-/* Called with mon->out_lock held.  */
+/* Called with mon->mon_lock held.  */
 static void monitor_flush_locked(Monitor *mon)
 {
     int rc;
@@ -410,9 +438,9 @@ static void monitor_flush_locked(Monitor *mon)
 
 void monitor_flush(Monitor *mon)
 {
-    qemu_mutex_lock(&mon->out_lock);
+    qemu_mutex_lock(&mon->mon_lock);
     monitor_flush_locked(mon);
-    qemu_mutex_unlock(&mon->out_lock);
+    qemu_mutex_unlock(&mon->mon_lock);
 }
 
 /* flush at every end of line */
@@ -420,7 +448,7 @@ static void monitor_puts(Monitor *mon, const char *str)
 {
     char c;
 
-    qemu_mutex_lock(&mon->out_lock);
+    qemu_mutex_lock(&mon->mon_lock);
     for(;;) {
         c = *str++;
         if (c == '\0')
@@ -433,7 +461,7 @@ static void monitor_puts(Monitor *mon, const char *str)
             monitor_flush_locked(mon);
         }
     }
-    qemu_mutex_unlock(&mon->out_lock);
+    qemu_mutex_unlock(&mon->mon_lock);
 }
 
 void monitor_vprintf(Monitor *mon, const char *fmt, va_list ap)
@@ -558,8 +586,6 @@ static MonitorQAPIEventConf monitor_qapi_event_conf[QAPI_EVENT__MAX] = {
     [QAPI_EVENT_VSERPORT_CHANGE]   = { 1000 * SCALE_MS },
 };
 
-GHashTable *monitor_qapi_event_state;
-
 /*
  * Emits the event to every monitor instance, @event is only used for trace
  * Called with monitor_lock held.
@@ -620,7 +646,7 @@ monitor_qapi_event_queue(QAPIEvent event, QDict *qdict, Error **errp)
              * monitor_qapi_event_handler() in evconf->rate ns.  Any
              * events arriving before then will be delayed until then.
              */
-            int64_t now = qemu_clock_get_ns(event_clock_type);
+            int64_t now = qemu_clock_get_ns(monitor_get_event_clock());
 
             monitor_qapi_event_emit(event, qdict);
 
@@ -628,7 +654,7 @@ monitor_qapi_event_queue(QAPIEvent event, QDict *qdict, Error **errp)
             evstate->event = event;
             evstate->data = qobject_ref(data);
             evstate->qdict = NULL;
-            evstate->timer = timer_new_ns(event_clock_type,
+            evstate->timer = timer_new_ns(monitor_get_event_clock(),
                                           monitor_qapi_event_handler,
                                           evstate);
             g_hash_table_add(monitor_qapi_event_state, evstate);
@@ -653,7 +679,7 @@ static void monitor_qapi_event_handler(void *opaque)
     qemu_mutex_lock(&monitor_lock);
 
     if (evstate->qdict) {
-        int64_t now = qemu_clock_get_ns(event_clock_type);
+        int64_t now = qemu_clock_get_ns(monitor_get_event_clock());
 
         monitor_qapi_event_emit(evstate->event, evstate->qdict);
         qobject_unref(evstate->qdict);
@@ -709,10 +735,6 @@ static gboolean qapi_event_throttle_equal(const void *a, const void *b)
 
 static void monitor_qapi_event_init(void)
 {
-    if (qtest_enabled()) {
-        event_clock_type = QEMU_CLOCK_VIRTUAL;
-    }
-
     monitor_qapi_event_state = g_hash_table_new(qapi_event_throttle_hash,
                                                 qapi_event_throttle_equal);
     qmp_event_set_func_emit(monitor_qapi_event_queue);
@@ -724,7 +746,7 @@ static void monitor_data_init(Monitor *mon, bool skip_flush,
                               bool use_io_thr)
 {
     memset(mon, 0, sizeof(Monitor));
-    qemu_mutex_init(&mon->out_lock);
+    qemu_mutex_init(&mon->mon_lock);
     qemu_mutex_init(&mon->qmp.qmp_queue_lock);
     mon->outbuf = qstring_new();
     /* Use *mon_cmds by default. */
@@ -744,7 +766,7 @@ static void monitor_data_destroy(Monitor *mon)
     }
     readline_free(mon->rs);
     qobject_unref(mon->outbuf);
-    qemu_mutex_destroy(&mon->out_lock);
+    qemu_mutex_destroy(&mon->mon_lock);
     qemu_mutex_destroy(&mon->qmp.qmp_queue_lock);
     monitor_qmp_cleanup_req_queue_locked(mon);
     monitor_qmp_cleanup_resp_queue_locked(mon);
@@ -776,13 +798,13 @@ char *qmp_human_monitor_command(const char *command_line, bool has_cpu_index,
     handle_hmp_command(&hmp, command_line);
     cur_mon = old_mon;
 
-    qemu_mutex_lock(&hmp.out_lock);
+    qemu_mutex_lock(&hmp.mon_lock);
     if (qstring_get_length(hmp.outbuf) > 0) {
         output = g_strdup(qstring_get_str(hmp.outbuf));
     } else {
         output = g_strdup("");
     }
-    qemu_mutex_unlock(&hmp.out_lock);
+    qemu_mutex_unlock(&hmp.mon_lock);
 
 out:
     monitor_data_destroy(&hmp);
@@ -1306,7 +1328,7 @@ void qmp_qmp_capabilities(bool has_enable, QMPCapabilityList *enable,
     cur_mon->qmp.commands = &qmp_commands;
 }
 
-/* set the current CPU defined by the user */
+/* Set the current CPU defined by the user. Callers must hold BQL. */
 int monitor_set_cpu(int cpu_index)
 {
     CPUState *cpu;
@@ -1320,6 +1342,7 @@ int monitor_set_cpu(int cpu_index)
     return 0;
 }
 
+/* Callers must hold BQL. */
 static CPUState *mon_get_cpu_sync(bool synchronize)
 {
     CPUState *cpu;
@@ -2182,7 +2205,7 @@ static void hmp_acl_remove(Monitor *mon, const QDict *qdict)
 void qmp_getfd(const char *fdname, Error **errp)
 {
     mon_fd_t *monfd;
-    int fd;
+    int fd, tmp_fd;
 
     fd = qemu_chr_fe_get_msgfd(&cur_mon->chr);
     if (fd == -1) {
@@ -2197,13 +2220,17 @@ void qmp_getfd(const char *fdname, Error **errp)
         return;
     }
 
+    qemu_mutex_lock(&cur_mon->mon_lock);
     QLIST_FOREACH(monfd, &cur_mon->fds, next) {
         if (strcmp(monfd->name, fdname) != 0) {
             continue;
         }
 
-        close(monfd->fd);
+        tmp_fd = monfd->fd;
         monfd->fd = fd;
+        qemu_mutex_unlock(&cur_mon->mon_lock);
+        /* Make sure close() happens outside the critical section */
+        close(tmp_fd);
         return;
     }
 
@@ -2212,24 +2239,31 @@ void qmp_getfd(const char *fdname, Error **errp)
     monfd->fd = fd;
 
     QLIST_INSERT_HEAD(&cur_mon->fds, monfd, next);
+    qemu_mutex_unlock(&cur_mon->mon_lock);
 }
 
 void qmp_closefd(const char *fdname, Error **errp)
 {
     mon_fd_t *monfd;
+    int tmp_fd;
 
+    qemu_mutex_lock(&cur_mon->mon_lock);
     QLIST_FOREACH(monfd, &cur_mon->fds, next) {
         if (strcmp(monfd->name, fdname) != 0) {
             continue;
         }
 
         QLIST_REMOVE(monfd, next);
-        close(monfd->fd);
+        tmp_fd = monfd->fd;
         g_free(monfd->name);
         g_free(monfd);
+        qemu_mutex_unlock(&cur_mon->mon_lock);
+        /* Make sure close() happens outside the critical section */
+        close(tmp_fd);
         return;
     }
 
+    qemu_mutex_unlock(&cur_mon->mon_lock);
     error_setg(errp, QERR_FD_NOT_FOUND, fdname);
 }
 
@@ -2237,6 +2271,7 @@ int monitor_get_fd(Monitor *mon, const char *fdname, Error **errp)
 {
     mon_fd_t *monfd;
 
+    qemu_mutex_lock(&mon->mon_lock);
     QLIST_FOREACH(monfd, &mon->fds, next) {
         int fd;
 
@@ -2250,10 +2285,12 @@ int monitor_get_fd(Monitor *mon, const char *fdname, Error **errp)
         QLIST_REMOVE(monfd, next);
         g_free(monfd->name);
         g_free(monfd);
+        qemu_mutex_unlock(&mon->mon_lock);
 
         return fd;
     }
 
+    qemu_mutex_unlock(&mon->mon_lock);
     error_setg(errp, "File descriptor named '%s' has not been found", fdname);
     return -1;
 }
@@ -2285,9 +2322,11 @@ static void monitor_fdsets_cleanup(void)
     MonFdset *mon_fdset;
     MonFdset *mon_fdset_next;
 
+    qemu_mutex_lock(&mon_fdsets_lock);
     QLIST_FOREACH_SAFE(mon_fdset, &mon_fdsets, next, mon_fdset_next) {
         monitor_fdset_cleanup(mon_fdset);
     }
+    qemu_mutex_unlock(&mon_fdsets_lock);
 }
 
 AddfdInfo *qmp_add_fd(bool has_fdset_id, int64_t fdset_id, bool has_opaque,
@@ -2322,6 +2361,7 @@ void qmp_remove_fd(int64_t fdset_id, bool has_fd, int64_t fd, Error **errp)
     MonFdsetFd *mon_fdset_fd;
     char fd_str[60];
 
+    qemu_mutex_lock(&mon_fdsets_lock);
     QLIST_FOREACH(mon_fdset, &mon_fdsets, next) {
         if (mon_fdset->id != fdset_id) {
             continue;
@@ -2341,10 +2381,12 @@ void qmp_remove_fd(int64_t fdset_id, bool has_fd, int64_t fd, Error **errp)
             goto error;
         }
         monitor_fdset_cleanup(mon_fdset);
+        qemu_mutex_unlock(&mon_fdsets_lock);
         return;
     }
 
 error:
+    qemu_mutex_unlock(&mon_fdsets_lock);
     if (has_fd) {
         snprintf(fd_str, sizeof(fd_str), "fdset-id:%" PRId64 ", fd:%" PRId64,
                  fdset_id, fd);
@@ -2360,6 +2402,7 @@ FdsetInfoList *qmp_query_fdsets(Error **errp)
     MonFdsetFd *mon_fdset_fd;
     FdsetInfoList *fdset_list = NULL;
 
+    qemu_mutex_lock(&mon_fdsets_lock);
     QLIST_FOREACH(mon_fdset, &mon_fdsets, next) {
         FdsetInfoList *fdset_info = g_malloc0(sizeof(*fdset_info));
         FdsetFdInfoList *fdsetfd_list = NULL;
@@ -2389,6 +2432,7 @@ FdsetInfoList *qmp_query_fdsets(Error **errp)
         fdset_info->next = fdset_list;
         fdset_list = fdset_info;
     }
+    qemu_mutex_unlock(&mon_fdsets_lock);
 
     return fdset_list;
 }
@@ -2401,6 +2445,7 @@ AddfdInfo *monitor_fdset_add_fd(int fd, bool has_fdset_id, int64_t fdset_id,
     MonFdsetFd *mon_fdset_fd;
     AddfdInfo *fdinfo;
 
+    qemu_mutex_lock(&mon_fdsets_lock);
     if (has_fdset_id) {
         QLIST_FOREACH(mon_fdset, &mon_fdsets, next) {
             /* Break if match found or match impossible due to ordering by ID */
@@ -2421,6 +2466,7 @@ AddfdInfo *monitor_fdset_add_fd(int fd, bool has_fdset_id, int64_t fdset_id,
             if (fdset_id < 0) {
                 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "fdset-id",
                            "a non-negative value");
+                qemu_mutex_unlock(&mon_fdsets_lock);
                 return NULL;
             }
             /* Use specified fdset ID */
@@ -2471,16 +2517,21 @@ AddfdInfo *monitor_fdset_add_fd(int fd, bool has_fdset_id, int64_t fdset_id,
     fdinfo->fdset_id = mon_fdset->id;
     fdinfo->fd = mon_fdset_fd->fd;
 
+    qemu_mutex_unlock(&mon_fdsets_lock);
     return fdinfo;
 }
 
 int monitor_fdset_get_fd(int64_t fdset_id, int flags)
 {
-#ifndef _WIN32
+#ifdef _WIN32
+    return -ENOENT;
+#else
     MonFdset *mon_fdset;
     MonFdsetFd *mon_fdset_fd;
     int mon_fd_flags;
+    int ret;
 
+    qemu_mutex_lock(&mon_fdsets_lock);
     QLIST_FOREACH(mon_fdset, &mon_fdsets, next) {
         if (mon_fdset->id != fdset_id) {
             continue;
@@ -2488,20 +2539,24 @@ int monitor_fdset_get_fd(int64_t fdset_id, int flags)
         QLIST_FOREACH(mon_fdset_fd, &mon_fdset->fds, next) {
             mon_fd_flags = fcntl(mon_fdset_fd->fd, F_GETFL);
             if (mon_fd_flags == -1) {
-                return -1;
+                ret = -errno;
+                goto out;
             }
 
             if ((flags & O_ACCMODE) == (mon_fd_flags & O_ACCMODE)) {
-                return mon_fdset_fd->fd;
+                ret = mon_fdset_fd->fd;
+                goto out;
             }
         }
-        errno = EACCES;
-        return -1;
+        ret = -EACCES;
+        goto out;
     }
-#endif
+    ret = -ENOENT;
 
-    errno = ENOENT;
-    return -1;
+out:
+    qemu_mutex_unlock(&mon_fdsets_lock);
+    return ret;
+#endif
 }
 
 int monitor_fdset_dup_fd_add(int64_t fdset_id, int dup_fd)
@@ -2509,20 +2564,25 @@ int monitor_fdset_dup_fd_add(int64_t fdset_id, int dup_fd)
     MonFdset *mon_fdset;
     MonFdsetFd *mon_fdset_fd_dup;
 
+    qemu_mutex_lock(&mon_fdsets_lock);
     QLIST_FOREACH(mon_fdset, &mon_fdsets, next) {
         if (mon_fdset->id != fdset_id) {
             continue;
         }
         QLIST_FOREACH(mon_fdset_fd_dup, &mon_fdset->dup_fds, next) {
             if (mon_fdset_fd_dup->fd == dup_fd) {
-                return -1;
+                goto err;
             }
         }
         mon_fdset_fd_dup = g_malloc0(sizeof(*mon_fdset_fd_dup));
         mon_fdset_fd_dup->fd = dup_fd;
         QLIST_INSERT_HEAD(&mon_fdset->dup_fds, mon_fdset_fd_dup, next);
+        qemu_mutex_unlock(&mon_fdsets_lock);
         return 0;
     }
+
+err:
+    qemu_mutex_unlock(&mon_fdsets_lock);
     return -1;
 }
 
@@ -2531,6 +2591,7 @@ static int monitor_fdset_dup_fd_find_remove(int dup_fd, bool remove)
     MonFdset *mon_fdset;
     MonFdsetFd *mon_fdset_fd_dup;
 
+    qemu_mutex_lock(&mon_fdsets_lock);
     QLIST_FOREACH(mon_fdset, &mon_fdsets, next) {
         QLIST_FOREACH(mon_fdset_fd_dup, &mon_fdset->dup_fds, next) {
             if (mon_fdset_fd_dup->fd == dup_fd) {
@@ -2539,13 +2600,17 @@ static int monitor_fdset_dup_fd_find_remove(int dup_fd, bool remove)
                     if (QLIST_EMPTY(&mon_fdset->dup_fds)) {
                         monitor_fdset_cleanup(mon_fdset);
                     }
-                    return -1;
+                    goto err;
                 } else {
+                    qemu_mutex_unlock(&mon_fdsets_lock);
                     return mon_fdset->id;
                 }
             }
         }
     }
+
+err:
+    qemu_mutex_unlock(&mon_fdsets_lock);
     return -1;
 }
 
@@ -4381,9 +4446,9 @@ static void monitor_event(void *opaque, int event)
 
     switch (event) {
     case CHR_EVENT_MUX_IN:
-        qemu_mutex_lock(&mon->out_lock);
+        qemu_mutex_lock(&mon->mon_lock);
         mon->mux_out = 0;
-        qemu_mutex_unlock(&mon->out_lock);
+        qemu_mutex_unlock(&mon->mon_lock);
         if (mon->reset_seen) {
             readline_restart(mon->rs);
             monitor_resume(mon);
@@ -4403,9 +4468,9 @@ static void monitor_event(void *opaque, int event)
         } else {
             atomic_inc(&mon->suspend_cnt);
         }
-        qemu_mutex_lock(&mon->out_lock);
+        qemu_mutex_lock(&mon->mon_lock);
         mon->mux_out = 1;
-        qemu_mutex_unlock(&mon->out_lock);
+        qemu_mutex_unlock(&mon->mon_lock);
         break;
 
     case CHR_EVENT_OPENED:
@@ -4485,6 +4550,7 @@ void monitor_init_globals(void)
     monitor_qapi_event_init();
     sortcmdlist();
     qemu_mutex_init(&monitor_lock);
+    qemu_mutex_init(&mon_fdsets_lock);
     monitor_iothread_init();
 }
 
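
[Editor's note] monitor_fdset_get_fd() now returns a negative errno value instead of setting errno, so callers are expected to propagate the code themselves. A hedged sketch of a caller using the new convention (illustrative only; error_setg_errno() is the standard Error API):

    int fd = monitor_fdset_get_fd(fdset_id, O_RDWR);
    if (fd < 0) {
        error_setg_errno(errp, -fd, "Could not get fd from fdset %" PRId64,
                         fdset_id);
        return -1;
    }
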
diff --git a/pc-bios/openbios-ppc b/pc-bios/openbios-ppc
index af70c365ee..a39cbe57ca 100644
--- a/pc-bios/openbios-ppc
+++ b/pc-bios/openbios-ppc
Binary files differ
diff --git a/pc-bios/openbios-sparc32 b/pc-bios/openbios-sparc32
index 9e736e8838..7163ba8b3b 100644
--- a/pc-bios/openbios-sparc32
+++ b/pc-bios/openbios-sparc32
Binary files differ
diff --git a/pc-bios/openbios-sparc64 b/pc-bios/openbios-sparc64
index 82ea0f8be6..0a9a338f78 100644
--- a/pc-bios/openbios-sparc64
+++ b/pc-bios/openbios-sparc64
Binary files differ
diff --git a/pc-bios/s390-ccw/Makefile b/pc-bios/s390-ccw/Makefile
index 439e3cc9c9..1eb316b02f 100644
--- a/pc-bios/s390-ccw/Makefile
+++ b/pc-bios/s390-ccw/Makefile
@@ -15,6 +15,7 @@ OBJECTS = start.o main.o bootmap.o jump2ipl.o sclp.o menu.o \
 QEMU_CFLAGS := $(filter -W%, $(QEMU_CFLAGS))
 QEMU_CFLAGS += -ffreestanding -fno-delete-null-pointer-checks -msoft-float
 QEMU_CFLAGS += -march=z900 -fPIE -fno-strict-aliasing
+QEMU_CFLAGS += -fno-asynchronous-unwind-tables
 QEMU_CFLAGS += $(call cc-option, $(QEMU_CFLAGS), -fno-stack-protector)
 LDFLAGS += -Wl,-pie -nostdlib
 
diff --git a/pc-bios/s390-ccw/iplb.h b/pc-bios/s390-ccw/iplb.h
index ded20c834e..772d5c57c9 100644
--- a/pc-bios/s390-ccw/iplb.h
+++ b/pc-bios/s390-ccw/iplb.h
@@ -12,6 +12,8 @@
 #ifndef IPLB_H
 #define IPLB_H
 
+#define LOADPARM_LEN    8
+
 struct IplBlockCcw {
     uint8_t  reserved0[85];
     uint8_t  ssid;
@@ -61,7 +63,7 @@ struct IplParameterBlock {
     uint8_t  pbt;
     uint8_t  flags;
     uint16_t reserved01;
-    uint8_t  loadparm[8];
+    uint8_t  loadparm[LOADPARM_LEN];
     union {
         IplBlockCcw ccw;
         IplBlockFcp fcp;
diff --git a/pc-bios/s390-ccw/main.c b/pc-bios/s390-ccw/main.c
index 26f9adf84a..544851d672 100644
--- a/pc-bios/s390-ccw/main.c
+++ b/pc-bios/s390-ccw/main.c
@@ -15,7 +15,7 @@
 char stack[PAGE_SIZE * 8] __attribute__((__aligned__(PAGE_SIZE)));
 static SubChannelId blk_schid = { .one = 1 };
 IplParameterBlock iplb __attribute__((__aligned__(PAGE_SIZE)));
-static char loadparm_str[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+static char loadparm_str[LOADPARM_LEN + 1] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
 QemuIplParameters qipl;
 
 #define LOADPARM_PROMPT "PROMPT  "
@@ -80,13 +80,13 @@ static bool find_dev(Schib *schib, int dev_no)
 
 static void menu_setup(void)
 {
-    if (memcmp(loadparm_str, LOADPARM_PROMPT, 8) == 0) {
+    if (memcmp(loadparm_str, LOADPARM_PROMPT, LOADPARM_LEN) == 0) {
         menu_set_parms(QIPL_FLAG_BM_OPTS_CMD, 0);
         return;
     }
 
     /* If loadparm was set to any other value, then do not enable menu */
-    if (memcmp(loadparm_str, LOADPARM_EMPTY, 8) != 0) {
+    if (memcmp(loadparm_str, LOADPARM_EMPTY, LOADPARM_LEN) != 0) {
         return;
     }
 
@@ -117,7 +117,7 @@ static void virtio_setup(void)
     enable_mss_facility();
 
     sclp_get_loadparm_ascii(loadparm_str);
-    memcpy(ldp + 10, loadparm_str, 8);
+    memcpy(ldp + 10, loadparm_str, LOADPARM_LEN);
     sclp_print(ldp);
 
     memcpy(&qipl, early_qipl, sizeof(QemuIplParameters));
diff --git a/pc-bios/s390-ccw/netboot.mak b/pc-bios/s390-ccw/netboot.mak
index 4f64128c6c..14e96b2aa6 100644
--- a/pc-bios/s390-ccw/netboot.mak
+++ b/pc-bios/s390-ccw/netboot.mak
@@ -19,14 +19,15 @@ s390-netboot.img: s390-netboot.elf
 
 # libc files:
 
-LIBC_CFLAGS :=  $(QEMU_CFLAGS) $(LIBC_INC) $(LIBNET_INC)
+LIBC_CFLAGS :=  $(QEMU_CFLAGS) $(CFLAGS) $(LIBC_INC) $(LIBNET_INC)
 
 CTYPE_OBJS = isdigit.o isxdigit.o toupper.o
 %.o : $(SLOF_DIR)/lib/libc/ctype/%.c
 	$(call quiet-command,$(CC) $(LIBC_CFLAGS) -c -o $@ $<,"CC","$(TARGET_DIR)$@")
 
-STRING_OBJS = strcat.o strchr.o strcmp.o strcpy.o strlen.o strncmp.o strncpy.o \
-	      strstr.o memset.o memcpy.o memmove.o memcmp.o
+STRING_OBJS = strcat.o strchr.o strrchr.o strcpy.o strlen.o strncpy.o \
+	      strcmp.o strncmp.o strcasecmp.o strncasecmp.o strstr.o \
+	      memset.o memcpy.o memmove.o memcmp.o
 %.o : $(SLOF_DIR)/lib/libc/string/%.c
 	$(call quiet-command,$(CC) $(LIBC_CFLAGS) -c -o $@ $<,"CC","$(TARGET_DIR)$@")
 
@@ -34,7 +35,7 @@ STDLIB_OBJS = atoi.o atol.o strtoul.o strtol.o rand.o malloc.o free.o
 %.o : $(SLOF_DIR)/lib/libc/stdlib/%.c
 	$(call quiet-command,$(CC) $(LIBC_CFLAGS) -c -o $@ $<,"CC","$(TARGET_DIR)$@")
 
-STDIO_OBJS = sprintf.o vfprintf.o vsnprintf.o vsprintf.o fprintf.o \
+STDIO_OBJS = sprintf.o snprintf.o vfprintf.o vsnprintf.o vsprintf.o fprintf.o \
 	     printf.o putc.o puts.o putchar.o stdchnls.o fileno.o
 %.o : $(SLOF_DIR)/lib/libc/stdio/%.c
 	$(call quiet-command,$(CC) $(LIBC_CFLAGS) -c -o $@ $<,"CC","$(TARGET_DIR)$@")
@@ -50,8 +51,8 @@ libc.a: $(LIBCOBJS)
 # libnet files:
 
 LIBNETOBJS := args.o dhcp.o dns.o icmpv6.o ipv6.o tcp.o udp.o bootp.o \
-	      dhcpv6.o ethernet.o ipv4.o ndp.o tftp.o
-LIBNETCFLAGS := $(QEMU_CFLAGS) -DDHCPARCH=0x1F $(LIBC_INC) $(LIBNET_INC)
+	      dhcpv6.o ethernet.o ipv4.o ndp.o tftp.o pxelinux.o
+LIBNETCFLAGS := $(QEMU_CFLAGS) $(CFLAGS) -DDHCPARCH=0x1F $(LIBC_INC) $(LIBNET_INC)
 
 %.o : $(SLOF_DIR)/lib/libnet/%.c
 	$(call quiet-command,$(CC) $(LIBNETCFLAGS) -c -o $@ $<,"CC","$(TARGET_DIR)$@")
diff --git a/pc-bios/s390-ccw/netmain.c b/pc-bios/s390-ccw/netmain.c
index 600024155b..0392131c27 100644
--- a/pc-bios/s390-ccw/netmain.c
+++ b/pc-bios/s390-ccw/netmain.c
@@ -30,6 +30,7 @@
 #include <ipv6.h>
 #include <dns.h>
 #include <time.h>
+#include <pxelinux.h>
 
 #include "s390-ccw.h"
 #include "virtio.h"
@@ -41,13 +42,17 @@ extern char _start[];
 
 #define KERNEL_ADDR             ((void *)0L)
 #define KERNEL_MAX_SIZE         ((long)_start)
+#define ARCH_COMMAND_LINE_SIZE  896              /* Taken from Linux kernel */
+
+/* STSI 3.2.2 offset of first vmdb + offset of uuid inside vmdb */
+#define STSI322_VMDB_UUID_OFFSET ((8 + 12) * 4)
 
 char stack[PAGE_SIZE * 8] __attribute__((aligned(PAGE_SIZE)));
 IplParameterBlock iplb __attribute__((aligned(PAGE_SIZE)));
 static char cfgbuf[2048];
 
 static SubChannelId net_schid = { .one = 1 };
-static int ip_version = 4;
+static uint8_t mac[6];
 static uint64_t dest_timer;
 
 static uint64_t get_timer_ms(void)
@@ -100,10 +105,10 @@ static int dhcp(struct filename_ip *fn_ip, int retries)
             printf("\nGiving up after %d DHCP requests\n", retries);
             return -1;
         }
-        ip_version = 4;
+        fn_ip->ip_version = 4;
         rc = dhcpv4(NULL, fn_ip);
         if (rc == -1) {
-            ip_version = 6;
+            fn_ip->ip_version = 6;
             set_ipv6_address(fn_ip->fd, 0);
             rc = dhcpv6(NULL, fn_ip);
             if (rc == 0) {
@@ -137,8 +142,7 @@ static int tftp_load(filename_ip_t *fnip, void *buffer, int len)
     tftp_err_t tftp_err;
     int rc;
 
-    rc = tftp(fnip, buffer, len, DEFAULT_TFTP_RETRIES, &tftp_err, 1, 1428,
-              ip_version);
+    rc = tftp(fnip, buffer, len, DEFAULT_TFTP_RETRIES, &tftp_err);
 
     if (rc < 0) {
         /* Make sure that error messages are put into a new line */
@@ -149,61 +153,11 @@ static int tftp_load(filename_ip_t *fnip, void *buffer, int len)
         printf("  TFTP: Received %s (%d KBytes)\n", fnip->filename, rc / 1024);
     } else if (rc > 0) {
         printf("  TFTP: Received %s (%d Bytes)\n", fnip->filename, rc);
-    } else if (rc == -1) {
-        puts("unknown TFTP error");
-    } else if (rc == -2) {
-        printf("TFTP buffer of %d bytes is too small for %s\n",
-            len, fnip->filename);
-    } else if (rc == -3) {
-        printf("file not found: %s\n", fnip->filename);
-    } else if (rc == -4) {
-        puts("TFTP access violation");
-    } else if (rc == -5) {
-        puts("illegal TFTP operation");
-    } else if (rc == -6) {
-        puts("unknown TFTP transfer ID");
-    } else if (rc == -7) {
-        puts("no such TFTP user");
-    } else if (rc == -8) {
-        puts("TFTP blocksize negotiation failed");
-    } else if (rc == -9) {
-        puts("file exceeds maximum TFTP transfer size");
-    } else if (rc <= -10 && rc >= -15) {
-        const char *icmp_err_str;
-        switch (rc) {
-        case -ICMP_NET_UNREACHABLE - 10:
-            icmp_err_str = "net unreachable";
-            break;
-        case -ICMP_HOST_UNREACHABLE - 10:
-            icmp_err_str = "host unreachable";
-            break;
-        case -ICMP_PROTOCOL_UNREACHABLE - 10:
-            icmp_err_str = "protocol unreachable";
-            break;
-        case -ICMP_PORT_UNREACHABLE - 10:
-            icmp_err_str = "port unreachable";
-            break;
-        case -ICMP_FRAGMENTATION_NEEDED - 10:
-            icmp_err_str = "fragmentation needed and DF set";
-            break;
-        case -ICMP_SOURCE_ROUTE_FAILED - 10:
-            icmp_err_str = "source route failed";
-            break;
-        default:
-            icmp_err_str = " UNKNOWN";
-            break;
-        }
-        printf("ICMP ERROR \"%s\"\n", icmp_err_str);
-    } else if (rc == -40) {
-        printf("TFTP error occurred after %d bad packets received",
-            tftp_err.bad_tftp_packets);
-    } else if (rc == -41) {
-        printf("TFTP error occurred after missing %d responses",
-            tftp_err.no_packets);
-    } else if (rc == -42) {
-        printf("TFTP error missing block %d, expected block was %d",
-            tftp_err.blocks_missed,
-            tftp_err.blocks_received);
+    } else {
+        const char *errstr = NULL;
+        int ecode;
+        tftp_get_error_info(fnip, &tftp_err, rc, &errstr, &ecode);
+        printf("TFTP error: %s\n", errstr ? errstr : "unknown error");
     }
 
     return rc;
@@ -211,7 +165,6 @@ static int tftp_load(filename_ip_t *fnip, void *buffer, int len)
 
 static int net_init(filename_ip_t *fn_ip)
 {
-    uint8_t mac[6];
     int rc;
 
     memset(fn_ip, 0, sizeof(filename_ip_t));
@@ -231,7 +184,7 @@ static int net_init(filename_ip_t *fn_ip)
 
     rc = dhcp(fn_ip, DEFAULT_BOOT_RETRIES);
     if (rc >= 0) {
-        if (ip_version == 4) {
+        if (fn_ip->ip_version == 4) {
             set_ipv4_address(fn_ip->own_ip);
         }
     } else {
@@ -239,11 +192,11 @@ static int net_init(filename_ip_t *fn_ip)
         return -101;
     }
 
-    if (ip_version == 4) {
+    if (fn_ip->ip_version == 4) {
         printf("  Using IPv4 address: %d.%d.%d.%d\n",
               (fn_ip->own_ip >> 24) & 0xFF, (fn_ip->own_ip >> 16) & 0xFF,
               (fn_ip->own_ip >>  8) & 0xFF, fn_ip->own_ip & 0xFF);
-    } else if (ip_version == 6) {
+    } else if (fn_ip->ip_version == 6) {
         char ip6_str[40];
         ipv6_to_str(fn_ip->own_ip6.addr, ip6_str);
         printf("  Using IPv6 address: %s\n", ip6_str);
@@ -261,17 +214,17 @@ static int net_init(filename_ip_t *fn_ip)
     }
 
     printf("  Using TFTP server: ");
-    if (ip_version == 4) {
+    if (fn_ip->ip_version == 4) {
         printf("%d.%d.%d.%d\n",
                (fn_ip->server_ip >> 24) & 0xFF, (fn_ip->server_ip >> 16) & 0xFF,
                (fn_ip->server_ip >>  8) & 0xFF, fn_ip->server_ip & 0xFF);
-    } else if (ip_version == 6) {
+    } else if (fn_ip->ip_version == 6) {
         char ip6_str[40];
         ipv6_to_str(fn_ip->server_ip6.addr, ip6_str);
         printf("%s\n", ip6_str);
     }
 
-    if (strlen((char *)fn_ip->filename) > 0) {
+    if (strlen(fn_ip->filename) > 0) {
         printf("  Bootfile name: '%s'\n", fn_ip->filename);
     }
 
@@ -280,12 +233,123 @@ static int net_init(filename_ip_t *fn_ip)
 
 static void net_release(filename_ip_t *fn_ip)
 {
-    if (ip_version == 4) {
+    if (fn_ip->ip_version == 4) {
         dhcp_send_release(fn_ip->fd);
     }
 }
 
 /**
+ * Retrieve the Universally Unique Identifier of the VM.
+ * @return UUID string, or NULL in case of errors
+ */
+static const char *get_uuid(void)
+{
+    register int r0 asm("0");
+    register int r1 asm("1");
+    uint8_t *mem, *buf, uuid[16];
+    int i, cc, chk = 0;
+    static char uuid_str[37];
+
+    mem = malloc(2 * PAGE_SIZE);
+    if (!mem) {
+        puts("Out of memory ... can not get UUID.");
+        return NULL;
+    }
+    buf = (uint8_t *)(((uint64_t)mem + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1));
+    memset(buf, 0, PAGE_SIZE);
+
+    /* Get SYSIB 3.2.2 */
+    r0 = (3 << 28) | 2;
+    r1 = 2;
+    asm volatile(" stsi 0(%[addr])\n"
+                 " ipm  %[cc]\n"
+                 " srl  %[cc],28\n"
+                 : [cc] "=d" (cc)
+                 : "d" (r0), "d" (r1), [addr] "a" (buf)
+                 : "cc", "memory");
+    if (cc) {
+        free(mem);
+        return NULL;
+    }
+
+    for (i = 0; i < 16; i++) {
+        uuid[i] = buf[STSI322_VMDB_UUID_OFFSET + i];
+        chk |= uuid[i];
+    }
+    free(mem);
+    if (!chk) {
+        return NULL;
+    }
+
+    sprintf(uuid_str, "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-"
+            "%02x%02x%02x%02x%02x%02x", uuid[0], uuid[1], uuid[2], uuid[3],
+            uuid[4], uuid[5], uuid[6], uuid[7], uuid[8], uuid[9], uuid[10],
+            uuid[11], uuid[12], uuid[13], uuid[14], uuid[15]);
+
+    return uuid_str;
+}
+
+/**
+ * Load a kernel with initrd (i.e. with the information that we've got from
+ * a pxelinux.cfg config file)
+ */
+static int load_kernel_with_initrd(filename_ip_t *fn_ip,
+                                   struct pl_cfg_entry *entry)
+{
+    int rc;
+
+    printf("Loading pxelinux.cfg entry '%s'\n", entry->label);
+
+    if (!entry->kernel) {
+        printf("Kernel entry is missing!\n");
+        return -1;
+    }
+
+    strncpy(fn_ip->filename, entry->kernel, sizeof(fn_ip->filename));
+    rc = tftp_load(fn_ip, KERNEL_ADDR, KERNEL_MAX_SIZE);
+    if (rc < 0) {
+        return rc;
+    }
+
+    if (entry->initrd) {
+        uint64_t iaddr = (rc + 0xfff) & ~0xfffUL;
+
+        strncpy(fn_ip->filename, entry->initrd, sizeof(fn_ip->filename));
+        rc = tftp_load(fn_ip, (void *)iaddr, KERNEL_MAX_SIZE - iaddr);
+        if (rc < 0) {
+            return rc;
+        }
+        /* Patch location and size: */
+        *(uint64_t *)0x10408 = iaddr;
+        *(uint64_t *)0x10410 = rc;
+        rc += iaddr;
+    }
+
+    if (entry->append) {
+        strncpy((char *)0x10480, entry->append, ARCH_COMMAND_LINE_SIZE);
+    }
+
+    return rc;
+}
+
+#define MAX_PXELINUX_ENTRIES 16
+
+static int net_try_pxelinux_cfg(filename_ip_t *fn_ip)
+{
+    struct pl_cfg_entry entries[MAX_PXELINUX_ENTRIES];
+    int num_ent, def_ent = 0;
+
+    num_ent = pxelinux_load_parse_cfg(fn_ip, mac, get_uuid(),
+                                      DEFAULT_TFTP_RETRIES,
+                                      cfgbuf, sizeof(cfgbuf),
+                                      entries, MAX_PXELINUX_ENTRIES, &def_ent);
+    if (num_ent > 0) {
+        return load_kernel_with_initrd(fn_ip, &entries[def_ent]);
+    }
+
+    return -1;
+}
+
+/**
  * Load via information from a .INS file (which can be found on CD-ROMs
  * for example)
  */
@@ -322,7 +386,7 @@ static int handle_ins_cfg(filename_ip_t *fn_ip, char *cfg, int cfgsize)
             return -1;
         }
         *ptr = 0;
-        strncpy((char *)fn_ip->filename, insbuf, sizeof(fn_ip->filename));
+        strncpy(fn_ip->filename, insbuf, sizeof(fn_ip->filename));
         destaddr = (char *)atol(ptr + 1);
         rc = tftp_load(fn_ip, destaddr, (long)_start - (long)destaddr);
         if (rc <= 0) {
@@ -354,6 +418,25 @@ static int net_try_direct_tftp_load(filename_ip_t *fn_ip)
         if (!strncmp("* ", cfgbuf, 2)) {
             return handle_ins_cfg(fn_ip, cfgbuf, rc);
         }
+        /*
+         * pxelinux.cfg support via bootfile name is just here for developers'
+         * convenience (it eases testing with the built-in DHCP server of QEMU
+         * that does not support RFC 5071). The official way to configure a
+         * pxelinux.cfg file name is to use DHCP options 209 and 210 instead.
+         * So only use the pxelinux.cfg parser here for files that start with
+         * a magic comment string.
+         */
+        if (!strncasecmp("# pxelinux", cfgbuf, 10)) {
+            struct pl_cfg_entry entries[MAX_PXELINUX_ENTRIES];
+            int num_ent, def_ent = 0;
+
+            num_ent = pxelinux_parse_cfg(cfgbuf, sizeof(cfgbuf), entries,
+                                         MAX_PXELINUX_ENTRIES, &def_ent);
+            if (num_ent <= 0) {
+                return -1;
+            }
+            return load_kernel_with_initrd(fn_ip, &entries[def_ent]);
+        }
     }
 
     /* Move kernel to right location */
@@ -455,10 +538,13 @@ void main(void)
         panic("Network initialization failed. Halting.\n");
     }
 
-    fnlen = strlen((char *)fn_ip.filename);
+    fnlen = strlen(fn_ip.filename);
     if (fnlen > 0 && fn_ip.filename[fnlen - 1] != '/') {
         rc = net_try_direct_tftp_load(&fn_ip);
     }
+    if (rc <= 0) {
+        rc = net_try_pxelinux_cfg(&fn_ip);
+    }
 
     net_release(&fn_ip);
 
diff --git a/pc-bios/s390-ccw/sclp.c b/pc-bios/s390-ccw/sclp.c
index 3836cb4716..c0223fab0b 100644
--- a/pc-bios/s390-ccw/sclp.c
+++ b/pc-bios/s390-ccw/sclp.c
@@ -114,7 +114,7 @@ void sclp_get_loadparm_ascii(char *loadparm)
     memset((char *)_sccb, 0, sizeof(ReadInfo));
     sccb->h.length = sizeof(ReadInfo);
     if (!sclp_service_call(SCLP_CMDW_READ_SCP_INFO, sccb)) {
-        ebcdic_to_ascii((char *) sccb->loadparm, loadparm, 8);
+        ebcdic_to_ascii((char *) sccb->loadparm, loadparm, LOADPARM_LEN);
     }
 }
 
diff --git a/pc-bios/s390-ccw/sclp.h b/pc-bios/s390-ccw/sclp.h
index 0dd987ff5d..8450161ba7 100644
--- a/pc-bios/s390-ccw/sclp.h
+++ b/pc-bios/s390-ccw/sclp.h
@@ -56,7 +56,7 @@ typedef struct ReadInfo {
     uint16_t rnmax;
     uint8_t rnsize;
     uint8_t reserved[13];
-    uint8_t loadparm[8];
+    uint8_t loadparm[LOADPARM_LEN];
 } __attribute__((packed)) ReadInfo;
 
 typedef struct SCCB {
diff --git a/pc-bios/s390-netboot.img b/pc-bios/s390-netboot.img
index ef561efd2e..2c6886efb8 100644
--- a/pc-bios/s390-netboot.img
+++ b/pc-bios/s390-netboot.img
Binary files differ
diff --git a/qapi/block-core.json b/qapi/block-core.json
index ab629d1647..cc3ede0630 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -1051,6 +1051,24 @@
   'data': ['top', 'full', 'none', 'incremental'] }
 
 ##
+# @MirrorCopyMode:
+#
+# An enumeration whose values tell the mirror block job when to
+# trigger writes to the target.
+#
+# @background: copy data in background only.
+#
+# @write-blocking: when data is written to the source, write it
+#                  (synchronously) to the target as well.  In
+#                  addition, data is copied in background just like in
+#                  @background mode.
+#
+# Since: 3.0
+##
+{ 'enum': 'MirrorCopyMode',
+  'data': ['background', 'write-blocking'] }
+
+##
 # @BlockJobInfo:
 #
 # Information about a long-running block device operation.
@@ -1692,6 +1710,9 @@
 #         written. Both will result in identical contents.
 #         Default is true. (Since 2.4)
 #
+# @copy-mode: when to copy data to the destination; defaults to 'background'
+#             (Since: 3.0)
+#
 # Since: 1.3
 ##
 { 'struct': 'DriveMirror',
@@ -1701,7 +1722,7 @@
             '*speed': 'int', '*granularity': 'uint32',
             '*buf-size': 'int', '*on-source-error': 'BlockdevOnError',
             '*on-target-error': 'BlockdevOnError',
-            '*unmap': 'bool' } }
+            '*unmap': 'bool', '*copy-mode': 'MirrorCopyMode' } }
 
 ##
 # @BlockDirtyBitmap:
@@ -1964,6 +1985,9 @@
 #                    above @device. If this option is not given, a node name is
 #                    autogenerated. (Since: 2.9)
 #
+# @copy-mode: when to copy data to the destination; defaults to 'background'
+#             (Since: 3.0)
+#
 # Returns: nothing on success.
 #
 # Since: 2.6
@@ -1984,7 +2008,8 @@
             '*speed': 'int', '*granularity': 'uint32',
             '*buf-size': 'int', '*on-source-error': 'BlockdevOnError',
             '*on-target-error': 'BlockdevOnError',
-            '*filter-node-name': 'str' } }
+            '*filter-node-name': 'str',
+            '*copy-mode': 'MirrorCopyMode' } }
 
 ##
 # @block_set_io_throttle:
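Both drive-mirror and blockdev-mirror gain the optional @copy-mode argument above. As a usage illustration only (not part of this patch), starting an actively mirroring job from an iotests-style script could look roughly like this; the node names are hypothetical, and the test harness's qmp() helper converts the underscore in copy_mode to the hyphen the schema expects, as qemu-iotests case 151 further down relies on:

    # Hedged example: request synchronous (write-blocking) mirroring.
    result = vm.qmp('blockdev-mirror',
                    job_id='mirror0',
                    device='source-node',        # hypothetical source node-name
                    target='target-node',        # hypothetical target node-name
                    sync='full',
                    copy_mode='write-blocking')  # sent on the wire as 'copy-mode'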
diff --git a/qapi/migration.json b/qapi/migration.json
index f7e10ee90f..1b4c1db670 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -517,6 +517,9 @@
 #                     and a power of 2
 #                     (Since 2.11)
 #
+# @max-postcopy-bandwidth: Background transfer bandwidth during postcopy.
+#                     Defaults to 0 (unlimited).  In bytes per second.
+#                     (Since 3.0)
 # Since: 2.4
 ##
 { 'enum': 'MigrationParameter',
@@ -525,7 +528,7 @@
            'tls-creds', 'tls-hostname', 'max-bandwidth',
            'downtime-limit', 'x-checkpoint-delay', 'block-incremental',
            'x-multifd-channels', 'x-multifd-page-count',
-           'xbzrle-cache-size' ] }
+           'xbzrle-cache-size', 'max-postcopy-bandwidth' ] }
 
 ##
 # @MigrateSetParameters:
@@ -593,6 +596,10 @@
 #                     needs to be a multiple of the target page size
 #                     and a power of 2
 #                     (Since 2.11)
+#
+# @max-postcopy-bandwidth: Background transfer bandwidth during postcopy.
+#                     Defaults to 0 (unlimited).  In bytes per second.
+#                     (Since 3.0)
 # Since: 2.4
 ##
 # TODO either fuse back into MigrationParameters, or make
@@ -611,7 +618,8 @@
             '*block-incremental': 'bool',
             '*x-multifd-channels': 'int',
             '*x-multifd-page-count': 'int',
-            '*xbzrle-cache-size': 'size' } }
+            '*xbzrle-cache-size': 'size',
+            '*max-postcopy-bandwidth': 'size' } }
 
 ##
 # @migrate-set-parameters:
@@ -694,6 +702,10 @@
 #                     needs to be a multiple of the target page size
 #                     and a power of 2
 #                     (Since 2.11)
+#
+# @max-postcopy-bandwidth: Background transfer bandwidth during postcopy.
+#                     Defaults to 0 (unlimited).  In bytes per second.
+#                     (Since 3.0)
 # Since: 2.4
 ##
 { 'struct': 'MigrationParameters',
@@ -710,7 +722,8 @@
             '*block-incremental': 'bool' ,
             '*x-multifd-channels': 'uint8',
             '*x-multifd-page-count': 'uint32',
-            '*xbzrle-cache-size': 'size' } }
+            '*xbzrle-cache-size': 'size',
+            '*max-postcopy-bandwidth': 'size'  } }
 
 ##
 # @query-migrate-parameters:
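The new @max-postcopy-bandwidth parameter is a plain 'size' value in bytes per second, with 0 meaning unlimited. A hedged example of setting it from a Python test or management script (the 1 MiB/s figure is arbitrary, and the qmp() wrapper is assumed to translate underscores to hyphens as the iotests harness does):

    # Hedged example: cap background transfer during postcopy at 1 MiB/s.
    vm.qmp('migrate-set-parameters', max_postcopy_bandwidth=1024 * 1024)
    # The value is reported back by query-migrate-parameters.
    params = vm.qmp('query-migrate-parameters')['return']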
diff --git a/roms/SLOF b/roms/SLOF
-Subproject fa981320a1e0968d6fc1b8de319723ff8212b33
+Subproject 2317427ce76006723f7ae103a6998ab41dd79c6
diff --git a/roms/openbios b/roms/openbios
-Subproject 54d959d97fb331708767b2fd4a878efd2bbc41b
+Subproject 8fe6f5f96f6ca39f1f62200be7fa130e929f13f
diff --git a/scripts/qemu.py b/scripts/qemu.py
index 08a3e9af5a..f099ce7278 100644
--- a/scripts/qemu.py
+++ b/scripts/qemu.py
@@ -17,19 +17,41 @@ import logging
 import os
 import subprocess
 import qmp.qmp
+import re
 import shutil
+import socket
 import tempfile
 
 
 LOG = logging.getLogger(__name__)
 
 
+#: Maps machine types to the preferred console device types
+CONSOLE_DEV_TYPES = {
+    r'^clipper$': 'isa-serial',
+    r'^malta': 'isa-serial',
+    r'^(pc.*|q35.*|isapc)$': 'isa-serial',
+    r'^(40p|powernv|prep)$': 'isa-serial',
+    r'^pseries.*': 'spapr-vty',
+    r'^s390-ccw-virtio.*': 'sclpconsole',
+    }
+
+
 class QEMUMachineError(Exception):
     """
     Exception called when an error in QEMUMachine happens.
     """
 
 
+class QEMUMachineAddDeviceError(QEMUMachineError):
+    """
+    Exception raised when a request to add a device cannot be fulfilled
+
+    The failures are caused by limitations, lack of information or conflicting
+    requests on the QEMUMachine methods.  This exception does not represent
+    failures reported by the QEMU binary itself.
+    """
+
 class MonitorResponseError(qmp.qmp.QMPError):
     '''
     Represents erroneous QMP monitor reply
@@ -91,6 +113,10 @@ class QEMUMachine(object):
         self._test_dir = test_dir
         self._temp_dir = None
         self._launched = False
+        self._machine = None
+        self._console_device_type = None
+        self._console_address = None
+        self._console_socket = None
 
         # just in case logging wasn't configured by the main script:
         logging.basicConfig()
@@ -175,9 +201,19 @@ class QEMUMachine(object):
                 self._monitor_address[1])
         else:
             moncdev = 'socket,id=mon,path=%s' % self._vm_monitor
-        return ['-chardev', moncdev,
+        args = ['-chardev', moncdev,
                 '-mon', 'chardev=mon,mode=control',
                 '-display', 'none', '-vga', 'none']
+        if self._machine is not None:
+            args.extend(['-machine', self._machine])
+        if self._console_device_type is not None:
+            self._console_address = os.path.join(self._temp_dir,
+                                                 self._name + "-console.sock")
+            chardev = ('socket,id=console,path=%s,server,nowait' %
+                       self._console_address)
+            device = '%s,chardev=console' % self._console_device_type
+            args.extend(['-chardev', chardev, '-device', device])
+        return args
 
     def _pre_launch(self):
         self._temp_dir = tempfile.mkdtemp(dir=self._test_dir)
@@ -202,6 +238,10 @@ class QEMUMachine(object):
 
         self._qemu_log_path = None
 
+        if self._console_socket is not None:
+            self._console_socket.close()
+            self._console_socket = None
+
         if self._temp_dir is not None:
             shutil.rmtree(self._temp_dir)
             self._temp_dir = None
@@ -359,3 +399,64 @@ class QEMUMachine(object):
         of the qemu process.
         '''
         return self._iolog
+
+    def add_args(self, *args):
+        '''
+        Adds to the list of extra arguments to be given to the QEMU binary
+        '''
+        self._args.extend(args)
+
+    def set_machine(self, machine_type):
+        '''
+        Sets the machine type
+
+        If set, the machine type will be added to the base arguments
+        of the resulting QEMU command line.
+        '''
+        self._machine = machine_type
+
+    def set_console(self, device_type=None):
+        '''
+        Sets the device type for a console device
+
+        If set, the console device and a backing character device will
+        be added to the base arguments of the resulting QEMU command
+        line.
+
+        This is a convenience method that will either use the provided
+        device type or, if not given, the device type set in
+        CONSOLE_DEV_TYPES that matches the machine type.
+
+        The actual setting of command line arguments will be done at
+        machine launch time, as it depends on the temporary directory
+        to be created.
+
+        @param device_type: the device type, such as "isa-serial"
+        @raises: QEMUMachineAddDeviceError if the device type is not given
+                 and cannot be determined.
+        '''
+        if device_type is None:
+            if self._machine is None:
+                raise QEMUMachineAddDeviceError("Can not add a console device:"
+                                                " QEMU instance without a "
+                                                "defined machine type")
+            for regex, device in CONSOLE_DEV_TYPES.items():
+                if re.match(regex, self._machine):
+                    device_type = device
+                    break
+            if device_type is None:
+                raise QEMUMachineAddDeviceError("Can not add a console device:"
+                                                " no matching console device "
+                                                "type definition")
+        self._console_device_type = device_type
+
+    @property
+    def console_socket(self):
+        """
+        Returns a socket connected to the console
+        """
+        if self._console_socket is None:
+            self._console_socket = socket.socket(socket.AF_UNIX,
+                                                 socket.SOCK_STREAM)
+            self._console_socket.connect(self._console_address)
+        return self._console_socket
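With set_machine(), set_console() and the console_socket property in place, a caller no longer has to assemble the -chardev/-device arguments for a serial console by hand. A short usage sketch (binary and kernel paths are placeholders; the same pattern is used by the acceptance tests added below):

    # Hedged usage sketch of the new QEMUMachine console helpers.
    vm = QEMUMachine('/path/to/qemu-system-x86_64')  # placeholder binary path
    vm.set_machine('pc')
    vm.set_console()                 # resolves to 'isa-serial' via CONSOLE_DEV_TYPES
    vm.add_args('-kernel', '/path/to/vmlinuz',       # placeholder kernel path
                '-append', 'console=ttyS0')
    vm.launch()
    console = vm.console_socket.makefile()           # file-like reader for the console
    line = console.readline()
    vm.shutdown()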
diff --git a/stubs/fdset.c b/stubs/fdset.c
index 6020cf28c8..4f3edf2ea4 100644
--- a/stubs/fdset.c
+++ b/stubs/fdset.c
@@ -14,7 +14,7 @@ int monitor_fdset_dup_fd_find(int dup_fd)
 
 int monitor_fdset_get_fd(int64_t fdset_id, int flags)
 {
-    return -1;
+    return -ENOENT;
 }
 
 void monitor_fdset_dup_fd_remove(int dupfd)
diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index 0247c1f04c..874da6efbc 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -1091,12 +1091,6 @@ struct CPUPPCState {
     target_ulong rmls;
 #endif
 
-#if defined(TARGET_PPC64) && !defined(CONFIG_USER_ONLY)
-    uint64_t vpa_addr;
-    uint64_t slb_shadow_addr, slb_shadow_size;
-    uint64_t dtl_addr, dtl_size;
-#endif /* TARGET_PPC64 */
-
     int error_code;
     uint32_t pending_interrupts;
 #if !defined(CONFIG_USER_ONLY)
@@ -1205,6 +1199,7 @@ struct PowerPCCPU {
     uint32_t compat_pvr;
     PPCVirtualHypervisor *vhyp;
     Object *intc;
+    void *machine_data;
     int32_t node_id; /* NUMA node this CPU belongs to */
     PPCHash64Options *hash64_opts;
 
@@ -1300,8 +1295,6 @@ void ppc_store_ptcr(CPUPPCState *env, target_ulong value);
 void ppc_store_msr (CPUPPCState *env, target_ulong value);
 
 void ppc_cpu_list (FILE *f, fprintf_function cpu_fprintf);
-#if defined(TARGET_PPC64)
-#endif
 
 /* Time-base and decrementer management */
 #ifndef NO_CPU_IO_DEFS
diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
index 2c0c34e125..5c0e313ca6 100644
--- a/target/ppc/kvm.c
+++ b/target/ppc/kvm.c
@@ -829,22 +829,22 @@ static int kvm_get_fp(CPUState *cs)
 static int kvm_get_vpa(CPUState *cs)
 {
     PowerPCCPU *cpu = POWERPC_CPU(cs);
-    CPUPPCState *env = &cpu->env;
+    sPAPRCPUState *spapr_cpu = spapr_cpu_state(cpu);
     struct kvm_one_reg reg;
     int ret;
 
     reg.id = KVM_REG_PPC_VPA_ADDR;
-    reg.addr = (uintptr_t)&env->vpa_addr;
+    reg.addr = (uintptr_t)&spapr_cpu->vpa_addr;
     ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
     if (ret < 0) {
         DPRINTF("Unable to get VPA address from KVM: %s\n", strerror(errno));
         return ret;
     }
 
-    assert((uintptr_t)&env->slb_shadow_size
-           == ((uintptr_t)&env->slb_shadow_addr + 8));
+    assert((uintptr_t)&spapr_cpu->slb_shadow_size
+           == ((uintptr_t)&spapr_cpu->slb_shadow_addr + 8));
     reg.id = KVM_REG_PPC_VPA_SLB;
-    reg.addr = (uintptr_t)&env->slb_shadow_addr;
+    reg.addr = (uintptr_t)&spapr_cpu->slb_shadow_addr;
     ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
     if (ret < 0) {
         DPRINTF("Unable to get SLB shadow state from KVM: %s\n",
@@ -852,9 +852,10 @@ static int kvm_get_vpa(CPUState *cs)
         return ret;
     }
 
-    assert((uintptr_t)&env->dtl_size == ((uintptr_t)&env->dtl_addr + 8));
+    assert((uintptr_t)&spapr_cpu->dtl_size
+           == ((uintptr_t)&spapr_cpu->dtl_addr + 8));
     reg.id = KVM_REG_PPC_VPA_DTL;
-    reg.addr = (uintptr_t)&env->dtl_addr;
+    reg.addr = (uintptr_t)&spapr_cpu->dtl_addr;
     ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
     if (ret < 0) {
         DPRINTF("Unable to get dispatch trace log state from KVM: %s\n",
@@ -868,7 +869,7 @@ static int kvm_get_vpa(CPUState *cs)
 static int kvm_put_vpa(CPUState *cs)
 {
     PowerPCCPU *cpu = POWERPC_CPU(cs);
-    CPUPPCState *env = &cpu->env;
+    sPAPRCPUState *spapr_cpu = spapr_cpu_state(cpu);
     struct kvm_one_reg reg;
     int ret;
 
@@ -876,11 +877,12 @@ static int kvm_put_vpa(CPUState *cs)
      * registered.  That means when restoring state, if a VPA *is*
      * registered, we need to set that up first.  If not, we need to
      * deregister the others before deregistering the master VPA */
-    assert(env->vpa_addr || !(env->slb_shadow_addr || env->dtl_addr));
+    assert(spapr_cpu->vpa_addr
+           || !(spapr_cpu->slb_shadow_addr || spapr_cpu->dtl_addr));
 
-    if (env->vpa_addr) {
+    if (spapr_cpu->vpa_addr) {
         reg.id = KVM_REG_PPC_VPA_ADDR;
-        reg.addr = (uintptr_t)&env->vpa_addr;
+        reg.addr = (uintptr_t)&spapr_cpu->vpa_addr;
         ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
         if (ret < 0) {
             DPRINTF("Unable to set VPA address to KVM: %s\n", strerror(errno));
@@ -888,19 +890,20 @@ static int kvm_put_vpa(CPUState *cs)
         }
     }
 
-    assert((uintptr_t)&env->slb_shadow_size
-           == ((uintptr_t)&env->slb_shadow_addr + 8));
+    assert((uintptr_t)&spapr_cpu->slb_shadow_size
+           == ((uintptr_t)&spapr_cpu->slb_shadow_addr + 8));
     reg.id = KVM_REG_PPC_VPA_SLB;
-    reg.addr = (uintptr_t)&env->slb_shadow_addr;
+    reg.addr = (uintptr_t)&spapr_cpu->slb_shadow_addr;
     ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
     if (ret < 0) {
         DPRINTF("Unable to set SLB shadow state to KVM: %s\n", strerror(errno));
         return ret;
     }
 
-    assert((uintptr_t)&env->dtl_size == ((uintptr_t)&env->dtl_addr + 8));
+    assert((uintptr_t)&spapr_cpu->dtl_size
+           == ((uintptr_t)&spapr_cpu->dtl_addr + 8));
     reg.id = KVM_REG_PPC_VPA_DTL;
-    reg.addr = (uintptr_t)&env->dtl_addr;
+    reg.addr = (uintptr_t)&spapr_cpu->dtl_addr;
     ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
     if (ret < 0) {
         DPRINTF("Unable to set dispatch trace log state to KVM: %s\n",
@@ -908,9 +911,9 @@ static int kvm_put_vpa(CPUState *cs)
         return ret;
     }
 
-    if (!env->vpa_addr) {
+    if (!spapr_cpu->vpa_addr) {
         reg.id = KVM_REG_PPC_VPA_ADDR;
-        reg.addr = (uintptr_t)&env->vpa_addr;
+        reg.addr = (uintptr_t)&spapr_cpu->vpa_addr;
         ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
         if (ret < 0) {
             DPRINTF("Unable to set VPA address to KVM: %s\n", strerror(errno));
@@ -2412,11 +2415,28 @@ bool kvmppc_has_cap_mmu_hash_v3(void)
     return cap_mmu_hash_v3;
 }
 
+static bool kvmppc_power8_host(void)
+{
+    bool ret = false;
+#ifdef TARGET_PPC64
+    {
+        uint32_t base_pvr = CPU_POWERPC_POWER_SERVER_MASK & mfpvr();
+        ret = (base_pvr == CPU_POWERPC_POWER8E_BASE) ||
+              (base_pvr == CPU_POWERPC_POWER8NVL_BASE) ||
+              (base_pvr == CPU_POWERPC_POWER8_BASE);
+    }
+#endif /* TARGET_PPC64 */
+    return ret;
+}
+
 static int parse_cap_ppc_safe_cache(struct kvm_ppc_cpu_char c)
 {
+    bool l1d_thread_priv_req = !kvmppc_power8_host();
+
     if (~c.behaviour & c.behaviour_mask & H_CPU_BEHAV_L1D_FLUSH_PR) {
         return 2;
-    } else if ((c.character & c.character_mask & H_CPU_CHAR_L1D_THREAD_PRIV) &&
+    } else if ((!l1d_thread_priv_req ||
+                c.character & c.character_mask & H_CPU_CHAR_L1D_THREAD_PRIV) &&
                (c.character & c.character_mask
                 & (H_CPU_CHAR_L1D_FLUSH_ORI30 | H_CPU_CHAR_L1D_FLUSH_TRIG2))) {
         return 1;
diff --git a/target/ppc/translate_init.inc.c b/target/ppc/translate_init.inc.c
index bb9296f5a3..76d6f3fd5e 100644
--- a/target/ppc/translate_init.inc.c
+++ b/target/ppc/translate_init.inc.c
@@ -10316,14 +10316,6 @@ static void ppc_cpu_reset(CPUState *s)
     s->exception_index = POWERPC_EXCP_NONE;
     env->error_code = 0;
 
-#if defined(TARGET_PPC64) && !defined(CONFIG_USER_ONLY)
-    env->vpa_addr = 0;
-    env->slb_shadow_addr = 0;
-    env->slb_shadow_size = 0;
-    env->dtl_addr = 0;
-    env->dtl_size = 0;
-#endif /* TARGET_PPC64 */
-
     for (i = 0; i < ARRAY_SIZE(env->spr_cb); i++) {
         ppc_spr_t *spr = &env->spr_cb[i];
 
diff --git a/target/s390x/cpu_models.c b/target/s390x/cpu_models.c
index e10035aaa8..cfdbccf46d 100644
--- a/target/s390x/cpu_models.c
+++ b/target/s390x/cpu_models.c
@@ -79,6 +79,7 @@ static S390CPUDef s390_cpu_defs[] = {
     CPUDEF_INIT(0x2964, 13, 2, 47, 0x08000000U, "z13.2", "IBM z13 GA2"),
     CPUDEF_INIT(0x2965, 13, 2, 47, 0x08000000U, "z13s", "IBM z13s GA1"),
     CPUDEF_INIT(0x3906, 14, 1, 47, 0x08000000U, "z14", "IBM z14 GA1"),
+    CPUDEF_INIT(0x3907, 14, 1, 47, 0x08000000U, "z14ZR1", "IBM z14 Model ZR1 GA1"),
 };
 
 #define QEMU_MAX_CPU_TYPE 0x2827
diff --git a/target/sparc/translate.c b/target/sparc/translate.c
index f3d430c1b2..74315cdf09 100644
--- a/target/sparc/translate.c
+++ b/target/sparc/translate.c
@@ -41,6 +41,8 @@
 #define JUMP_PC     2 /* dynamic pc value which takes only two values
                          according to jump_pc[T2] */
 
+#define DISAS_EXIT  DISAS_TARGET_0
+
 /* global register indexes */
 static TCGv_ptr cpu_regwptr;
 static TCGv cpu_cc_src, cpu_cc_src2, cpu_cc_dst;
@@ -3400,11 +3402,17 @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                         r_const = tcg_const_i32(dc->mem_idx);
                         tcg_gen_ld_ptr(r_tickptr, cpu_env,
                                        offsetof(CPUSPARCState, tick));
+                        if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
+                            gen_io_start();
+                        }
                         gen_helper_tick_get_count(cpu_dst, cpu_env, r_tickptr,
                                                   r_const);
                         tcg_temp_free_ptr(r_tickptr);
                         tcg_temp_free_i32(r_const);
                         gen_store_gpr(dc, rd, cpu_dst);
+                        if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
+                            gen_io_end();
+                        }
                     }
                     break;
                 case 0x5: /* V9 rdpc */
@@ -3447,11 +3455,17 @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                         r_const = tcg_const_i32(dc->mem_idx);
                         tcg_gen_ld_ptr(r_tickptr, cpu_env,
                                        offsetof(CPUSPARCState, stick));
+                        if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
+                            gen_io_start();
+                        }
                         gen_helper_tick_get_count(cpu_dst, cpu_env, r_tickptr,
                                                   r_const);
                         tcg_temp_free_ptr(r_tickptr);
                         tcg_temp_free_i32(r_const);
                         gen_store_gpr(dc, rd, cpu_dst);
+                        if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
+                            gen_io_end();
+                        }
                     }
                     break;
                 case 0x19: /* System tick compare */
@@ -3576,10 +3590,16 @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                         r_const = tcg_const_i32(dc->mem_idx);
                         tcg_gen_ld_ptr(r_tickptr, cpu_env,
                                        offsetof(CPUSPARCState, tick));
+                        if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
+                            gen_io_start();
+                        }
                         gen_helper_tick_get_count(cpu_tmp0, cpu_env,
                                                   r_tickptr, r_const);
                         tcg_temp_free_ptr(r_tickptr);
                         tcg_temp_free_i32(r_const);
+                        if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
+                            gen_io_end();
+                        }
                     }
                     break;
                 case 5: // tba
@@ -4385,9 +4405,19 @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                                     r_tickptr = tcg_temp_new_ptr();
                                     tcg_gen_ld_ptr(r_tickptr, cpu_env,
                                                    offsetof(CPUSPARCState, tick));
+                                    if (tb_cflags(dc->base.tb) &
+                                           CF_USE_ICOUNT) {
+                                        gen_io_start();
+                                    }
                                     gen_helper_tick_set_limit(r_tickptr,
                                                               cpu_tick_cmpr);
                                     tcg_temp_free_ptr(r_tickptr);
+                                    if (tb_cflags(dc->base.tb) &
+                                           CF_USE_ICOUNT) {
+                                        gen_io_end();
+                                    }
+                                    /* End TB to handle timer interrupt */
+                                    dc->base.is_jmp = DISAS_EXIT;
                                 }
                                 break;
                             case 0x18: /* System tick */
@@ -4403,9 +4433,19 @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                                     r_tickptr = tcg_temp_new_ptr();
                                     tcg_gen_ld_ptr(r_tickptr, cpu_env,
                                                    offsetof(CPUSPARCState, stick));
+                                    if (tb_cflags(dc->base.tb) &
+                                           CF_USE_ICOUNT) {
+                                        gen_io_start();
+                                    }
                                     gen_helper_tick_set_count(r_tickptr,
                                                               cpu_tmp0);
                                     tcg_temp_free_ptr(r_tickptr);
+                                    if (tb_cflags(dc->base.tb) &
+                                           CF_USE_ICOUNT) {
+                                        gen_io_end();
+                                    }
+                                    /* End TB to handle timer interrupt */
+                                    dc->base.is_jmp = DISAS_EXIT;
                                 }
                                 break;
                             case 0x19: /* System tick compare */
@@ -4421,9 +4461,19 @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                                     r_tickptr = tcg_temp_new_ptr();
                                     tcg_gen_ld_ptr(r_tickptr, cpu_env,
                                                    offsetof(CPUSPARCState, stick));
+                                    if (tb_cflags(dc->base.tb) &
+                                           CF_USE_ICOUNT) {
+                                        gen_io_start();
+                                    }
                                     gen_helper_tick_set_limit(r_tickptr,
                                                               cpu_stick_cmpr);
                                     tcg_temp_free_ptr(r_tickptr);
+                                    if (tb_cflags(dc->base.tb) &
+                                           CF_USE_ICOUNT) {
+                                        gen_io_end();
+                                    }
+                                    /* End TB to handle timer interrupt */
+                                    dc->base.is_jmp = DISAS_EXIT;
                                 }
                                 break;
 
@@ -4531,9 +4581,19 @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                                     r_tickptr = tcg_temp_new_ptr();
                                     tcg_gen_ld_ptr(r_tickptr, cpu_env,
                                                    offsetof(CPUSPARCState, tick));
+                                    if (tb_cflags(dc->base.tb) &
+                                           CF_USE_ICOUNT) {
+                                        gen_io_start();
+                                    }
                                     gen_helper_tick_set_count(r_tickptr,
                                                               cpu_tmp0);
                                     tcg_temp_free_ptr(r_tickptr);
+                                    if (tb_cflags(dc->base.tb) &
+                                           CF_USE_ICOUNT) {
+                                        gen_io_end();
+                                    }
+                                    /* End TB to handle timer interrupt */
+                                    dc->base.is_jmp = DISAS_EXIT;
                                 }
                                 break;
                             case 5: // tba
@@ -4541,7 +4601,13 @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                                 break;
                             case 6: // pstate
                                 save_state(dc);
+                                if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
+                                    gen_io_start();
+                                }
                                 gen_helper_wrpstate(cpu_env, cpu_tmp0);
+                                if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
+                                    gen_io_end();
+                                }
                                 dc->npc = DYNAMIC_PC;
                                 break;
                             case 7: // tl
@@ -4551,7 +4617,13 @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                                 dc->npc = DYNAMIC_PC;
                                 break;
                             case 8: // pil
+                                if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
+                                    gen_io_start();
+                                }
                                 gen_helper_wrpil(cpu_env, cpu_tmp0);
+                                if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
+                                    gen_io_end();
+                                }
                                 break;
                             case 9: // cwp
                                 gen_helper_wrcwp(cpu_env, cpu_tmp0);
@@ -4642,9 +4714,19 @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                                     r_tickptr = tcg_temp_new_ptr();
                                     tcg_gen_ld_ptr(r_tickptr, cpu_env,
                                                    offsetof(CPUSPARCState, hstick));
+                                    if (tb_cflags(dc->base.tb) &
+                                           CF_USE_ICOUNT) {
+                                        gen_io_start();
+                                    }
                                     gen_helper_tick_set_limit(r_tickptr,
                                                               cpu_hstick_cmpr);
                                     tcg_temp_free_ptr(r_tickptr);
+                                    if (tb_cflags(dc->base.tb) &
+                                           CF_USE_ICOUNT) {
+                                        gen_io_end();
+                                    }
+                                    /* End TB to handle timer interrupt */
+                                    dc->base.is_jmp = DISAS_EXIT;
                                 }
                                 break;
                             case 6: // hver readonly
@@ -5265,14 +5347,26 @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                                 goto priv_insn;
                             dc->npc = DYNAMIC_PC;
                             dc->pc = DYNAMIC_PC;
+                            if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
+                                gen_io_start();
+                            }
                             gen_helper_done(cpu_env);
+                            if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
+                                gen_io_end();
+                            }
                             goto jmp_insn;
                         case 1:
                             if (!supervisor(dc))
                                 goto priv_insn;
                             dc->npc = DYNAMIC_PC;
                             dc->pc = DYNAMIC_PC;
+                            if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
+                                gen_io_start();
+                            }
                             gen_helper_retry(cpu_env);
+                            if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
+                                gen_io_end();
+                            }
                             goto jmp_insn;
                         default:
                             goto illegal_insn;
@@ -5822,7 +5916,9 @@ static void sparc_tr_tb_stop(DisasContextBase *dcbase, CPUState *cs)
 {
     DisasContext *dc = container_of(dcbase, DisasContext, base);
 
-    if (dc->base.is_jmp != DISAS_NORETURN) {
+    switch (dc->base.is_jmp) {
+    case DISAS_NEXT:
+    case DISAS_TOO_MANY:
         if (dc->pc != DYNAMIC_PC &&
             (dc->npc != DYNAMIC_PC && dc->npc != JUMP_PC)) {
             /* static PC and NPC: we can use direct chaining */
@@ -5834,6 +5930,19 @@ static void sparc_tr_tb_stop(DisasContextBase *dcbase, CPUState *cs)
             save_npc(dc);
             tcg_gen_exit_tb(NULL, 0);
         }
+        break;
+
+    case DISAS_NORETURN:
+        break;
+
+    case DISAS_EXIT:
+        /* Exit TB */
+        save_state(dc);
+        tcg_gen_exit_tb(NULL, 0);
+        break;
+
+    default:
+        g_assert_not_reached();
     }
 }
 
diff --git a/tests/acceptance/README.rst b/tests/acceptance/README.rst
new file mode 100644
index 0000000000..89260faed6
--- /dev/null
+++ b/tests/acceptance/README.rst
@@ -0,0 +1,10 @@
+============================================
+Acceptance tests using the Avocado Framework
+============================================
+
+This directory contains functional tests, also known as acceptance
+level tests.  They're usually higher level, and may interact with
+external resources and with various guest operating systems.
+
+For more information, please refer to ``docs/devel/testing.rst``,
+section "Acceptance tests using the Avocado Framework".
diff --git a/tests/acceptance/avocado_qemu/__init__.py b/tests/acceptance/avocado_qemu/__init__.py
new file mode 100644
index 0000000000..1e54fd5932
--- /dev/null
+++ b/tests/acceptance/avocado_qemu/__init__.py
@@ -0,0 +1,54 @@
+# Test class and utilities for functional tests
+#
+# Copyright (c) 2018 Red Hat, Inc.
+#
+# Author:
+#  Cleber Rosa <crosa@redhat.com>
+#
+# This work is licensed under the terms of the GNU GPL, version 2 or
+# later.  See the COPYING file in the top-level directory.
+
+import os
+import sys
+
+import avocado
+
+SRC_ROOT_DIR = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
+SRC_ROOT_DIR = os.path.abspath(os.path.dirname(SRC_ROOT_DIR))
+sys.path.append(os.path.join(SRC_ROOT_DIR, 'scripts'))
+
+from qemu import QEMUMachine
+
+def is_readable_executable_file(path):
+    return os.path.isfile(path) and os.access(path, os.R_OK | os.X_OK)
+
+
+def pick_default_qemu_bin():
+    """
+    Picks the path of a QEMU binary, starting either in the current working
+    directory or in the source tree root directory.
+    """
+    arch = os.uname()[4]
+    qemu_bin_relative_path = os.path.join("%s-softmmu" % arch,
+                                          "qemu-system-%s" % arch)
+    if is_readable_executable_file(qemu_bin_relative_path):
+        return qemu_bin_relative_path
+
+    qemu_bin_from_src_dir_path = os.path.join(SRC_ROOT_DIR,
+                                              qemu_bin_relative_path)
+    if is_readable_executable_file(qemu_bin_from_src_dir_path):
+        return qemu_bin_from_src_dir_path
+
+
+class Test(avocado.Test):
+    def setUp(self):
+        self.vm = None
+        self.qemu_bin = self.params.get('qemu_bin',
+                                        default=pick_default_qemu_bin())
+        if self.qemu_bin is None:
+            self.cancel("No QEMU binary defined or found in the source tree")
+        self.vm = QEMUMachine(self.qemu_bin)
+
+    def tearDown(self):
+        if self.vm is not None:
+            self.vm.shutdown()
diff --git a/tests/acceptance/boot_linux_console.py b/tests/acceptance/boot_linux_console.py
new file mode 100644
index 0000000000..98324f7591
--- /dev/null
+++ b/tests/acceptance/boot_linux_console.py
@@ -0,0 +1,47 @@
+# Functional test that boots a Linux kernel and checks the console
+#
+# Copyright (c) 2018 Red Hat, Inc.
+#
+# Author:
+#  Cleber Rosa <crosa@redhat.com>
+#
+# This work is licensed under the terms of the GNU GPL, version 2 or
+# later.  See the COPYING file in the top-level directory.
+
+import logging
+
+from avocado_qemu import Test
+
+
+class BootLinuxConsole(Test):
+    """
+    Boots an x86_64 Linux kernel and checks that the console is operational
+    and the kernel command line is properly passed from QEMU to the kernel
+
+    :avocado: enable
+    :avocado: tags=x86_64
+    """
+
+    timeout = 60
+
+    def test(self):
+        kernel_url = ('https://mirrors.kernel.org/fedora/releases/28/'
+                      'Everything/x86_64/os/images/pxeboot/vmlinuz')
+        kernel_hash = '238e083e114c48200f80d889f7e32eeb2793e02a'
+        kernel_path = self.fetch_asset(kernel_url, asset_hash=kernel_hash)
+
+        self.vm.set_machine('pc')
+        self.vm.set_console()
+        kernel_command_line = 'console=ttyS0'
+        self.vm.add_args('-kernel', kernel_path,
+                         '-append', kernel_command_line)
+        self.vm.launch()
+        console = self.vm.console_socket.makefile()
+        console_logger = logging.getLogger('console')
+        while True:
+            msg = console.readline()
+            console_logger.debug(msg.strip())
+            if 'Kernel command line: %s' % kernel_command_line in msg:
+                break
+            if 'Kernel panic - not syncing' in msg:
+                self.fail("Kernel panic reached")
diff --git a/tests/acceptance/version.py b/tests/acceptance/version.py
new file mode 100644
index 0000000000..13b0a7440d
--- /dev/null
+++ b/tests/acceptance/version.py
@@ -0,0 +1,24 @@
+# Version check example test
+#
+# Copyright (c) 2018 Red Hat, Inc.
+#
+# Author:
+#  Cleber Rosa <crosa@redhat.com>
+#
+# This work is licensed under the terms of the GNU GPL, version 2 or
+# later.  See the COPYING file in the top-level directory.
+
+
+from avocado_qemu import Test
+
+
+class Version(Test):
+    """
+    :avocado: enable
+    :avocado: tags=quick
+    """
+    def test_qmp_human_info_version(self):
+        self.vm.launch()
+        res = self.vm.command('human-monitor-command',
+                              command_line='info version')
+        self.assertRegexpMatches(res, r'^(\d+\.\d+\.\d)')
diff --git a/tests/acceptance/vnc.py b/tests/acceptance/vnc.py
new file mode 100644
index 0000000000..b1ef9d71b1
--- /dev/null
+++ b/tests/acceptance/vnc.py
@@ -0,0 +1,60 @@
+# Simple functional tests for VNC functionality
+#
+# Copyright (c) 2018 Red Hat, Inc.
+#
+# Author:
+#  Cleber Rosa <crosa@redhat.com>
+#
+# This work is licensed under the terms of the GNU GPL, version 2 or
+# later.  See the COPYING file in the top-level directory.
+
+from avocado_qemu import Test
+
+
+class Vnc(Test):
+    """
+    :avocado: enable
+    :avocado: tags=vnc,quick
+    """
+    def test_no_vnc(self):
+        self.vm.add_args('-nodefaults', '-S')
+        self.vm.launch()
+        self.assertFalse(self.vm.qmp('query-vnc')['return']['enabled'])
+
+    def test_no_vnc_change_password(self):
+        self.vm.add_args('-nodefaults', '-S')
+        self.vm.launch()
+        self.assertFalse(self.vm.qmp('query-vnc')['return']['enabled'])
+        set_password_response = self.vm.qmp('change',
+                                            device='vnc',
+                                            target='password',
+                                            arg='new_password')
+        self.assertIn('error', set_password_response)
+        self.assertEqual(set_password_response['error']['class'],
+                         'GenericError')
+        self.assertEqual(set_password_response['error']['desc'],
+                         'Could not set password')
+
+    def test_vnc_change_password_requires_a_password(self):
+        self.vm.add_args('-nodefaults', '-S', '-vnc', ':0')
+        self.vm.launch()
+        self.assertTrue(self.vm.qmp('query-vnc')['return']['enabled'])
+        set_password_response = self.vm.qmp('change',
+                                            device='vnc',
+                                            target='password',
+                                            arg='new_password')
+        self.assertIn('error', set_password_response)
+        self.assertEqual(set_password_response['error']['class'],
+                         'GenericError')
+        self.assertEqual(set_password_response['error']['desc'],
+                         'Could not set password')
+
+    def test_vnc_change_password(self):
+        self.vm.add_args('-nodefaults', '-S', '-vnc', ':0,password')
+        self.vm.launch()
+        self.assertTrue(self.vm.qmp('query-vnc')['return']['enabled'])
+        set_password_response = self.vm.qmp('change',
+                                            device='vnc',
+                                            target='password',
+                                            arg='new_password')
+        self.assertEqual(set_password_response['return'], {})
diff --git a/tests/qemu-iotests/151 b/tests/qemu-iotests/151
new file mode 100755
index 0000000000..fe53b9f446
--- /dev/null
+++ b/tests/qemu-iotests/151
@@ -0,0 +1,120 @@
+#!/usr/bin/env python
+#
+# Tests for active mirroring
+#
+# Copyright (C) 2018 Red Hat, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+import os
+import iotests
+from iotests import qemu_img
+
+source_img = os.path.join(iotests.test_dir, 'source.' + iotests.imgfmt)
+target_img = os.path.join(iotests.test_dir, 'target.' + iotests.imgfmt)
+
+class TestActiveMirror(iotests.QMPTestCase):
+    image_len = 128 * 1024 * 1024  # 128 MB
+    potential_writes_in_flight = True
+
+    def setUp(self):
+        qemu_img('create', '-f', iotests.imgfmt, source_img, '128M')
+        qemu_img('create', '-f', iotests.imgfmt, target_img, '128M')
+
+        blk_source = {'id': 'source',
+                      'if': 'none',
+                      'node-name': 'source-node',
+                      'driver': iotests.imgfmt,
+                      'file': {'driver': 'file',
+                               'filename': source_img}}
+
+        blk_target = {'node-name': 'target-node',
+                      'driver': iotests.imgfmt,
+                      'file': {'driver': 'file',
+                               'filename': target_img}}
+
+        self.vm = iotests.VM()
+        self.vm.add_drive_raw(self.vm.qmp_to_opts(blk_source))
+        self.vm.add_blockdev(self.vm.qmp_to_opts(blk_target))
+        self.vm.add_device('virtio-blk,drive=source')
+        self.vm.launch()
+
+    def tearDown(self):
+        self.vm.shutdown()
+
+        if not self.potential_writes_in_flight:
+            self.assertTrue(iotests.compare_images(source_img, target_img),
+                            'mirror target does not match source')
+
+        os.remove(source_img)
+        os.remove(target_img)
+
+    def doActiveIO(self, sync_source_and_target):
+        # Fill the source image
+        self.vm.hmp_qemu_io('source',
+                            'write -P 1 0 %i' % self.image_len)
+
+        # Start some background requests
+        for offset in range(1 * self.image_len / 8, 3 * self.image_len / 8, 1024 * 1024):
+            self.vm.hmp_qemu_io('source', 'aio_write -P 2 %i 1M' % offset)
+        for offset in range(2 * self.image_len / 8, 3 * self.image_len / 8, 1024 * 1024):
+            self.vm.hmp_qemu_io('source', 'aio_write -z %i 1M' % offset)
+
+        # Start the block job
+        result = self.vm.qmp('blockdev-mirror',
+                             job_id='mirror',
+                             filter_node_name='mirror-node',
+                             device='source-node',
+                             target='target-node',
+                             sync='full',
+                             copy_mode='write-blocking')
+        self.assert_qmp(result, 'return', {})
+
+        # Start some more requests
+        for offset in range(3 * self.image_len / 8, 5 * self.image_len / 8, 1024 * 1024):
+            self.vm.hmp_qemu_io('source', 'aio_write -P 3 %i 1M' % offset)
+        for offset in range(4 * self.image_len / 8, 5 * self.image_len / 8, 1024 * 1024):
+            self.vm.hmp_qemu_io('source', 'aio_write -z %i 1M' % offset)
+
+        # Wait for the READY event
+        self.wait_ready(drive='mirror')
+
+        # Now start some final requests; all of these (which land on
+        # the source) should be settled using the active mechanism.
+        # The mirror code itself asserts that the source BDS's dirty
+        # bitmap will stay clean between READY and COMPLETED.
+        for offset in range(5 * self.image_len / 8, 7 * self.image_len / 8, 1024 * 1024):
+            self.vm.hmp_qemu_io('source', 'aio_write -P 3 %i 1M' % offset)
+        for offset in range(6 * self.image_len / 8, 7 * self.image_len / 8, 1024 * 1024):
+            self.vm.hmp_qemu_io('source', 'aio_write -z %i 1M' % offset)
+
+        if sync_source_and_target:
+            # If source and target should be in sync after the mirror,
+            # we have to flush before completion
+            self.vm.hmp_qemu_io('source', 'aio_flush')
+            self.potential_writes_in_flight = False
+
+        self.complete_and_wait(drive='mirror', wait_ready=False)
+
+    def testActiveIO(self):
+        self.doActiveIO(False)
+
+    def testActiveIOFlushed(self):
+        self.doActiveIO(True)
+
+
+
+if __name__ == '__main__':
+    iotests.main(supported_fmts=['qcow2', 'raw'])
diff --git a/tests/qemu-iotests/151.out b/tests/qemu-iotests/151.out
new file mode 100644
index 0000000000..fbc63e62f8
--- /dev/null
+++ b/tests/qemu-iotests/151.out
@@ -0,0 +1,5 @@
+..
+----------------------------------------------------------------------
+Ran 2 tests
+
+OK
diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
index 937a3d0a4d..eea75819d2 100644
--- a/tests/qemu-iotests/group
+++ b/tests/qemu-iotests/group
@@ -157,6 +157,7 @@
 148 rw auto quick
 149 rw auto sudo
 150 rw auto quick
+151 rw auto
 152 rw auto quick
 153 rw auto quick
 154 rw auto backing quick
diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index a11c4cfbf2..291a050f86 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -27,15 +27,23 @@
 #include "block/blockjob_int.h"
 #include "sysemu/block-backend.h"
 #include "qapi/error.h"
+#include "iothread.h"
+
+static QemuEvent done_event;
 
 typedef struct BDRVTestState {
     int drain_count;
+    AioContext *bh_indirection_ctx;
+    bool sleep_in_drain_begin;
 } BDRVTestState;
 
 static void coroutine_fn bdrv_test_co_drain_begin(BlockDriverState *bs)
 {
     BDRVTestState *s = bs->opaque;
     s->drain_count++;
+    if (s->sleep_in_drain_begin) {
+        qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 100000);
+    }
 }
 
 static void coroutine_fn bdrv_test_co_drain_end(BlockDriverState *bs)
@@ -50,19 +58,48 @@ static void bdrv_test_close(BlockDriverState *bs)
     g_assert_cmpint(s->drain_count, >, 0);
 }
 
+static void co_reenter_bh(void *opaque)
+{
+    aio_co_wake(opaque);
+}
+
 static int coroutine_fn bdrv_test_co_preadv(BlockDriverState *bs,
                                             uint64_t offset, uint64_t bytes,
                                             QEMUIOVector *qiov, int flags)
 {
+    BDRVTestState *s = bs->opaque;
+
     /* We want this request to stay until the polling loop in drain waits for
      * it to complete. We need to sleep a while as bdrv_drain_invoke() comes
      * first and polls its result, too, but it shouldn't accidentally complete
      * this request yet. */
     qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 100000);
 
+    if (s->bh_indirection_ctx) {
+        aio_bh_schedule_oneshot(s->bh_indirection_ctx, co_reenter_bh,
+                                qemu_coroutine_self());
+        qemu_coroutine_yield();
+    }
+
     return 0;
 }
 
+static void bdrv_test_child_perm(BlockDriverState *bs, BdrvChild *c,
+                                 const BdrvChildRole *role,
+                                 BlockReopenQueue *reopen_queue,
+                                 uint64_t perm, uint64_t shared,
+                                 uint64_t *nperm, uint64_t *nshared)
+{
+    /* bdrv_format_default_perms() accepts only these two, so disguise
+     * detach_by_driver_cb_role as one of them. */
+    if (role != &child_file && role != &child_backing) {
+        role = &child_file;
+    }
+
+    bdrv_format_default_perms(bs, c, role, reopen_queue, perm, shared,
+                              nperm, nshared);
+}
+
 static BlockDriver bdrv_test = {
     .format_name            = "test",
     .instance_size          = sizeof(BDRVTestState),
@@ -73,7 +110,7 @@ static BlockDriver bdrv_test = {
     .bdrv_co_drain_begin    = bdrv_test_co_drain_begin,
     .bdrv_co_drain_end      = bdrv_test_co_drain_end,
 
-    .bdrv_child_perm        = bdrv_format_default_perms,
+    .bdrv_child_perm        = bdrv_test_child_perm,
 };
 
 static void aio_ret_cb(void *opaque, int ret)
@@ -216,6 +253,11 @@ static void test_drv_cb_drain_subtree(void)
     test_drv_cb_common(BDRV_SUBTREE_DRAIN, true);
 }
 
+static void test_drv_cb_co_drain_all(void)
+{
+    call_in_coroutine(test_drv_cb_drain_all);
+}
+
 static void test_drv_cb_co_drain(void)
 {
     call_in_coroutine(test_drv_cb_drain);
@@ -259,8 +301,7 @@ static void test_quiesce_common(enum drain_type drain_type, bool recursive)
 
 static void test_quiesce_drain_all(void)
 {
-    // XXX drain_all doesn't quiesce
-    //test_quiesce_common(BDRV_DRAIN_ALL, true);
+    test_quiesce_common(BDRV_DRAIN_ALL, true);
 }
 
 static void test_quiesce_drain(void)
@@ -273,6 +314,11 @@ static void test_quiesce_drain_subtree(void)
     test_quiesce_common(BDRV_SUBTREE_DRAIN, true);
 }
 
+static void test_quiesce_co_drain_all(void)
+{
+    call_in_coroutine(test_quiesce_drain_all);
+}
+
 static void test_quiesce_co_drain(void)
 {
     call_in_coroutine(test_quiesce_drain);
@@ -302,12 +348,7 @@ static void test_nested(void)
 
     for (outer = 0; outer < DRAIN_TYPE_MAX; outer++) {
         for (inner = 0; inner < DRAIN_TYPE_MAX; inner++) {
-            /* XXX bdrv_drain_all() doesn't increase the quiesce_counter */
-            int bs_quiesce      = (outer != BDRV_DRAIN_ALL) +
-                                  (inner != BDRV_DRAIN_ALL);
-            int backing_quiesce = (outer == BDRV_SUBTREE_DRAIN) +
-                                  (inner == BDRV_SUBTREE_DRAIN);
-            int backing_cb_cnt  = (outer != BDRV_DRAIN) +
+            int backing_quiesce = (outer != BDRV_DRAIN) +
                                   (inner != BDRV_DRAIN);
 
             g_assert_cmpint(bs->quiesce_counter, ==, 0);
@@ -318,10 +359,10 @@ static void test_nested(void)
             do_drain_begin(outer, bs);
             do_drain_begin(inner, bs);
 
-            g_assert_cmpint(bs->quiesce_counter, ==, bs_quiesce);
+            g_assert_cmpint(bs->quiesce_counter, ==, 2);
             g_assert_cmpint(backing->quiesce_counter, ==, backing_quiesce);
             g_assert_cmpint(s->drain_count, ==, 2);
-            g_assert_cmpint(backing_s->drain_count, ==, backing_cb_cnt);
+            g_assert_cmpint(backing_s->drain_count, ==, backing_quiesce);
 
             do_drain_end(inner, bs);
             do_drain_end(outer, bs);
@@ -411,7 +452,7 @@ static void test_multiparent(void)
     blk_unref(blk_b);
 }
 
-static void test_graph_change(void)
+static void test_graph_change_drain_subtree(void)
 {
     BlockBackend *blk_a, *blk_b;
     BlockDriverState *bs_a, *bs_b, *backing;
@@ -490,6 +531,221 @@ static void test_graph_change(void)
     blk_unref(blk_b);
 }
 
+static void test_graph_change_drain_all(void)
+{
+    BlockBackend *blk_a, *blk_b;
+    BlockDriverState *bs_a, *bs_b;
+    BDRVTestState *a_s, *b_s;
+
+    /* Create node A with a BlockBackend */
+    blk_a = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    bs_a = bdrv_new_open_driver(&bdrv_test, "test-node-a", BDRV_O_RDWR,
+                                &error_abort);
+    a_s = bs_a->opaque;
+    blk_insert_bs(blk_a, bs_a, &error_abort);
+
+    g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
+    g_assert_cmpint(a_s->drain_count, ==, 0);
+
+    /* Call bdrv_drain_all_begin() */
+    bdrv_drain_all_begin();
+
+    g_assert_cmpint(bs_a->quiesce_counter, ==, 1);
+    g_assert_cmpint(a_s->drain_count, ==, 1);
+
+    /* Create node B with a BlockBackend */
+    blk_b = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    bs_b = bdrv_new_open_driver(&bdrv_test, "test-node-b", BDRV_O_RDWR,
+                                &error_abort);
+    b_s = bs_b->opaque;
+    blk_insert_bs(blk_b, bs_b, &error_abort);
+
+    g_assert_cmpint(bs_a->quiesce_counter, ==, 1);
+    g_assert_cmpint(bs_b->quiesce_counter, ==, 1);
+    g_assert_cmpint(a_s->drain_count, ==, 1);
+    g_assert_cmpint(b_s->drain_count, ==, 1);
+
+    /* Unref and finally delete node A */
+    blk_unref(blk_a);
+
+    g_assert_cmpint(bs_a->quiesce_counter, ==, 1);
+    g_assert_cmpint(bs_b->quiesce_counter, ==, 1);
+    g_assert_cmpint(a_s->drain_count, ==, 1);
+    g_assert_cmpint(b_s->drain_count, ==, 1);
+
+    bdrv_unref(bs_a);
+
+    g_assert_cmpint(bs_b->quiesce_counter, ==, 1);
+    g_assert_cmpint(b_s->drain_count, ==, 1);
+
+    /* End the drained section */
+    bdrv_drain_all_end();
+
+    g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
+    g_assert_cmpint(b_s->drain_count, ==, 0);
+
+    bdrv_unref(bs_b);
+    blk_unref(blk_b);
+}
+
+struct test_iothread_data {
+    BlockDriverState *bs;
+    enum drain_type drain_type;
+    int *aio_ret;
+};
+
+static void test_iothread_drain_entry(void *opaque)
+{
+    struct test_iothread_data *data = opaque;
+
+    aio_context_acquire(bdrv_get_aio_context(data->bs));
+    do_drain_begin(data->drain_type, data->bs);
+    g_assert_cmpint(*data->aio_ret, ==, 0);
+    do_drain_end(data->drain_type, data->bs);
+    aio_context_release(bdrv_get_aio_context(data->bs));
+
+    qemu_event_set(&done_event);
+}
+
+static void test_iothread_aio_cb(void *opaque, int ret)
+{
+    int *aio_ret = opaque;
+    *aio_ret = ret;
+    qemu_event_set(&done_event);
+}
+
+/*
+ * Starts an AIO request on a BDS that runs in the AioContext of iothread 1.
+ * The request involves a BH on iothread 2 before it can complete.
+ *
+ * @drain_thread = 0 means that do_drain_begin/end are called from the main
+ * thread, @drain_thread = 1 means that they are called from iothread 1. Drain
+ * for this BDS cannot be called from iothread 2 because only the main thread
+ * may do cross-AioContext polling.
+ */
+static void test_iothread_common(enum drain_type drain_type, int drain_thread)
+{
+    BlockBackend *blk;
+    BlockDriverState *bs;
+    BDRVTestState *s;
+    BlockAIOCB *acb;
+    int aio_ret;
+    struct test_iothread_data data;
+
+    IOThread *a = iothread_new();
+    IOThread *b = iothread_new();
+    AioContext *ctx_a = iothread_get_aio_context(a);
+    AioContext *ctx_b = iothread_get_aio_context(b);
+
+    QEMUIOVector qiov;
+    struct iovec iov = {
+        .iov_base = NULL,
+        .iov_len = 0,
+    };
+    qemu_iovec_init_external(&qiov, &iov, 1);
+
+    /* bdrv_drain_all() may only be called from the main loop thread */
+    if (drain_type == BDRV_DRAIN_ALL && drain_thread != 0) {
+        goto out;
+    }
+
+    blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    bs = bdrv_new_open_driver(&bdrv_test, "test-node", BDRV_O_RDWR,
+                              &error_abort);
+    s = bs->opaque;
+    blk_insert_bs(blk, bs, &error_abort);
+
+    blk_set_aio_context(blk, ctx_a);
+    aio_context_acquire(ctx_a);
+
+    s->bh_indirection_ctx = ctx_b;
+
+    aio_ret = -EINPROGRESS;
+    if (drain_thread == 0) {
+        acb = blk_aio_preadv(blk, 0, &qiov, 0, test_iothread_aio_cb, &aio_ret);
+    } else {
+        acb = blk_aio_preadv(blk, 0, &qiov, 0, aio_ret_cb, &aio_ret);
+    }
+    g_assert(acb != NULL);
+    g_assert_cmpint(aio_ret, ==, -EINPROGRESS);
+
+    aio_context_release(ctx_a);
+
+    data = (struct test_iothread_data) {
+        .bs         = bs,
+        .drain_type = drain_type,
+        .aio_ret    = &aio_ret,
+    };
+
+    switch (drain_thread) {
+    case 0:
+        if (drain_type != BDRV_DRAIN_ALL) {
+            aio_context_acquire(ctx_a);
+        }
+
+        /* The request is running on the IOThread a. Draining its block device
+         * will make sure that it has completed as far as the BDS is concerned,
+         * but the drain in this thread can continue immediately after
+         * bdrv_dec_in_flight() and aio_ret might be assigned only slightly
+         * later. */
+        qemu_event_reset(&done_event);
+        do_drain_begin(drain_type, bs);
+        g_assert_cmpint(bs->in_flight, ==, 0);
+
+        if (drain_type != BDRV_DRAIN_ALL) {
+            aio_context_release(ctx_a);
+        }
+        qemu_event_wait(&done_event);
+        if (drain_type != BDRV_DRAIN_ALL) {
+            aio_context_acquire(ctx_a);
+        }
+
+        g_assert_cmpint(aio_ret, ==, 0);
+        do_drain_end(drain_type, bs);
+
+        if (drain_type != BDRV_DRAIN_ALL) {
+            aio_context_release(ctx_a);
+        }
+        break;
+    case 1:
+        qemu_event_reset(&done_event);
+        aio_bh_schedule_oneshot(ctx_a, test_iothread_drain_entry, &data);
+        qemu_event_wait(&done_event);
+        break;
+    default:
+        g_assert_not_reached();
+    }
+
+    aio_context_acquire(ctx_a);
+    blk_set_aio_context(blk, qemu_get_aio_context());
+    aio_context_release(ctx_a);
+
+    bdrv_unref(bs);
+    blk_unref(blk);
+
+out:
+    iothread_join(a);
+    iothread_join(b);
+}
+
+static void test_iothread_drain_all(void)
+{
+    test_iothread_common(BDRV_DRAIN_ALL, 0);
+    test_iothread_common(BDRV_DRAIN_ALL, 1);
+}
+
+static void test_iothread_drain(void)
+{
+    test_iothread_common(BDRV_DRAIN, 0);
+    test_iothread_common(BDRV_DRAIN, 1);
+}
+
+static void test_iothread_drain_subtree(void)
+{
+    test_iothread_common(BDRV_SUBTREE_DRAIN, 0);
+    test_iothread_common(BDRV_SUBTREE_DRAIN, 1);
+}
+
 
 typedef struct TestBlockJob {
     BlockJob common;
@@ -507,7 +763,11 @@ static void coroutine_fn test_job_start(void *opaque)
 
     job_transition_to_ready(&s->common.job);
     while (!s->should_complete) {
-        job_sleep_ns(&s->common.job, 100000);
+        /* Avoid block_job_sleep_ns() because it marks the job as !busy. We
+         * want to emulate some actual activity (probably some I/O) here so
+         * that drain has to wait for this activity to stop. */
+        qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 100000);
+        job_pause_point(&s->common.job);
     }
 
     job_defer_to_main_loop(&s->common.job, test_job_completed, NULL);
@@ -554,7 +814,7 @@ static void test_blockjob_common(enum drain_type drain_type)
 
     g_assert_cmpint(job->job.pause_count, ==, 0);
     g_assert_false(job->job.paused);
-    g_assert_false(job->job.busy); /* We're in job_sleep_ns() */
+    g_assert_true(job->job.busy); /* We're in qemu_co_sleep_ns() */
 
     do_drain_begin(drain_type, src);
 
@@ -564,15 +824,14 @@ static void test_blockjob_common(enum drain_type drain_type)
     } else {
         g_assert_cmpint(job->job.pause_count, ==, 1);
     }
-    /* XXX We don't wait until the job is actually paused. Is this okay? */
-    /* g_assert_true(job->job.paused); */
+    g_assert_true(job->job.paused);
     g_assert_false(job->job.busy); /* The job is paused */
 
     do_drain_end(drain_type, src);
 
     g_assert_cmpint(job->job.pause_count, ==, 0);
     g_assert_false(job->job.paused);
-    g_assert_false(job->job.busy); /* We're in job_sleep_ns() */
+    g_assert_true(job->job.busy); /* We're in qemu_co_sleep_ns() */
 
     do_drain_begin(drain_type, target);
 
@@ -582,15 +841,14 @@ static void test_blockjob_common(enum drain_type drain_type)
     } else {
         g_assert_cmpint(job->job.pause_count, ==, 1);
     }
-    /* XXX We don't wait until the job is actually paused. Is this okay? */
-    /* g_assert_true(job->job.paused); */
+    g_assert_true(job->job.paused);
     g_assert_false(job->job.busy); /* The job is paused */
 
     do_drain_end(drain_type, target);
 
     g_assert_cmpint(job->job.pause_count, ==, 0);
     g_assert_false(job->job.paused);
-    g_assert_false(job->job.busy); /* We're in job_sleep_ns() */
+    g_assert_true(job->job.busy); /* We're in qemu_co_sleep_ns() */
 
     ret = job_complete_sync(&job->job, &error_abort);
     g_assert_cmpint(ret, ==, 0);
@@ -616,19 +874,399 @@ static void test_blockjob_drain_subtree(void)
     test_blockjob_common(BDRV_SUBTREE_DRAIN);
 }
 
+
+typedef struct BDRVTestTopState {
+    BdrvChild *wait_child;
+} BDRVTestTopState;
+
+static void bdrv_test_top_close(BlockDriverState *bs)
+{
+    BdrvChild *c, *next_c;
+    QLIST_FOREACH_SAFE(c, &bs->children, next, next_c) {
+        bdrv_unref_child(bs, c);
+    }
+}
+
+static int coroutine_fn bdrv_test_top_co_preadv(BlockDriverState *bs,
+                                                uint64_t offset, uint64_t bytes,
+                                                QEMUIOVector *qiov, int flags)
+{
+    BDRVTestTopState *tts = bs->opaque;
+    return bdrv_co_preadv(tts->wait_child, offset, bytes, qiov, flags);
+}
+
+static BlockDriver bdrv_test_top_driver = {
+    .format_name            = "test_top_driver",
+    .instance_size          = sizeof(BDRVTestTopState),
+
+    .bdrv_close             = bdrv_test_top_close,
+    .bdrv_co_preadv         = bdrv_test_top_co_preadv,
+
+    .bdrv_child_perm        = bdrv_format_default_perms,
+};
+
+typedef struct TestCoDeleteByDrainData {
+    BlockBackend *blk;
+    bool detach_instead_of_delete;
+    bool done;
+} TestCoDeleteByDrainData;
+
+static void coroutine_fn test_co_delete_by_drain(void *opaque)
+{
+    TestCoDeleteByDrainData *dbdd = opaque;
+    BlockBackend *blk = dbdd->blk;
+    BlockDriverState *bs = blk_bs(blk);
+    BDRVTestTopState *tts = bs->opaque;
+    void *buffer = g_malloc(65536);
+    QEMUIOVector qiov;
+    struct iovec iov = {
+        .iov_base = buffer,
+        .iov_len  = 65536,
+    };
+
+    qemu_iovec_init_external(&qiov, &iov, 1);
+
+    /* Pretend some internal I/O operation from parent to child.
+     * Important: We have to read from the child, not from the parent!
+     * Draining works by first propagating it all up the tree to the
+     * root and then waiting for drainage from root to the leaves
+     * (protocol nodes).  If we have a request waiting on the root,
+     * everything will be drained before we go back down the tree, but
+     * we do not want that.  We want to be in the middle of draining
+     * when the following request returns. */
+    bdrv_co_preadv(tts->wait_child, 0, 65536, &qiov, 0);
+
+    g_assert_cmpint(bs->refcnt, ==, 1);
+
+    if (!dbdd->detach_instead_of_delete) {
+        blk_unref(blk);
+    } else {
+        BdrvChild *c, *next_c;
+        QLIST_FOREACH_SAFE(c, &bs->children, next, next_c) {
+            bdrv_unref_child(bs, c);
+        }
+    }
+
+    dbdd->done = true;
+}
+
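Roughly, the graph and the ordering the comment above relies on (a sketch added for orientation; node names follow do_test_delete_by_drain() below):

    /*
     * Sketch, not part of the patch:
     *
     *              blk
     *               |
     *               bs  (test_top_driver)   <- a drain propagates up to the root first,
     *             /  |  \
     *       null_bs  wait_child  null_bs    <- then requests drain from root to leaf
     *
     * The coroutine's read is parked on wait_child, a leaf, so it completes and
     * the coroutine drops the last reference to bs while the surrounding drain
     * is still in progress.
     */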
+/**
+ * Test what happens when some BDS has some children, you drain one of
+ * them and this results in the BDS being deleted.
+ *
+ * If @detach_instead_of_delete is set, the BDS is not going to be
+ * deleted but will only detach all of its children.
+ */
+static void do_test_delete_by_drain(bool detach_instead_of_delete,
+                                    enum drain_type drain_type)
+{
+    BlockBackend *blk;
+    BlockDriverState *bs, *child_bs, *null_bs;
+    BDRVTestTopState *tts;
+    TestCoDeleteByDrainData dbdd;
+    Coroutine *co;
+
+    bs = bdrv_new_open_driver(&bdrv_test_top_driver, "top", BDRV_O_RDWR,
+                              &error_abort);
+    bs->total_sectors = 65536 >> BDRV_SECTOR_BITS;
+    tts = bs->opaque;
+
+    null_bs = bdrv_open("null-co://", NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
+                        &error_abort);
+    bdrv_attach_child(bs, null_bs, "null-child", &child_file, &error_abort);
+
+    /* This child will be the one to pass requests through to, and
+     * it will stall until a drain occurs */
+    child_bs = bdrv_new_open_driver(&bdrv_test, "child", BDRV_O_RDWR,
+                                    &error_abort);
+    child_bs->total_sectors = 65536 >> BDRV_SECTOR_BITS;
+    /* Takes our reference to child_bs */
+    tts->wait_child = bdrv_attach_child(bs, child_bs, "wait-child", &child_file,
+                                        &error_abort);
+
+    /* This child is just there to be deleted
+     * (for detach_instead_of_delete == true) */
+    null_bs = bdrv_open("null-co://", NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
+                        &error_abort);
+    bdrv_attach_child(bs, null_bs, "null-child", &child_file, &error_abort);
+
+    blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    blk_insert_bs(blk, bs, &error_abort);
+
+    /* Referenced by blk now */
+    bdrv_unref(bs);
+
+    g_assert_cmpint(bs->refcnt, ==, 1);
+    g_assert_cmpint(child_bs->refcnt, ==, 1);
+    g_assert_cmpint(null_bs->refcnt, ==, 1);
+
+
+    dbdd = (TestCoDeleteByDrainData){
+        .blk = blk,
+        .detach_instead_of_delete = detach_instead_of_delete,
+        .done = false,
+    };
+    co = qemu_coroutine_create(test_co_delete_by_drain, &dbdd);
+    qemu_coroutine_enter(co);
+
+    /* Drain the child while the read operation is still pending.
+     * This should result in the operation finishing and
+     * test_co_delete_by_drain() resuming.  Thus, @bs will be deleted
+     * and the coroutine will exit while this drain operation is still
+     * in progress. */
+    switch (drain_type) {
+    case BDRV_DRAIN:
+        bdrv_ref(child_bs);
+        bdrv_drain(child_bs);
+        bdrv_unref(child_bs);
+        break;
+    case BDRV_SUBTREE_DRAIN:
+        /* Would have to ref/unref bs here for !detach_instead_of_delete, but
+         * then the whole test becomes pointless because the graph changes
+         * don't occur during the drain any more. */
+        assert(detach_instead_of_delete);
+        bdrv_subtree_drained_begin(bs);
+        bdrv_subtree_drained_end(bs);
+        break;
+    case BDRV_DRAIN_ALL:
+        bdrv_drain_all_begin();
+        bdrv_drain_all_end();
+        break;
+    default:
+        g_assert_not_reached();
+    }
+
+    while (!dbdd.done) {
+        aio_poll(qemu_get_aio_context(), true);
+    }
+
+    if (detach_instead_of_delete) {
+        /* Here, the reference has not passed over to the coroutine,
+         * so we have to delete the BB ourselves */
+        blk_unref(blk);
+    }
+}
+
+static void test_delete_by_drain(void)
+{
+    do_test_delete_by_drain(false, BDRV_DRAIN);
+}
+
+static void test_detach_by_drain_all(void)
+{
+    do_test_delete_by_drain(true, BDRV_DRAIN_ALL);
+}
+
+static void test_detach_by_drain(void)
+{
+    do_test_delete_by_drain(true, BDRV_DRAIN);
+}
+
+static void test_detach_by_drain_subtree(void)
+{
+    do_test_delete_by_drain(true, BDRV_SUBTREE_DRAIN);
+}
+
+
+struct detach_by_parent_data {
+    BlockDriverState *parent_b;
+    BdrvChild *child_b;
+    BlockDriverState *c;
+    BdrvChild *child_c;
+    bool by_parent_cb;
+};
+static struct detach_by_parent_data detach_by_parent_data;
+
+static void detach_indirect_bh(void *opaque)
+{
+    struct detach_by_parent_data *data = opaque;
+
+    bdrv_unref_child(data->parent_b, data->child_b);
+
+    bdrv_ref(data->c);
+    data->child_c = bdrv_attach_child(data->parent_b, data->c, "PB-C",
+                                      &child_file, &error_abort);
+}
+
+static void detach_by_parent_aio_cb(void *opaque, int ret)
+{
+    struct detach_by_parent_data *data = &detach_by_parent_data;
+
+    g_assert_cmpint(ret, ==, 0);
+    if (data->by_parent_cb) {
+        detach_indirect_bh(data);
+    }
+}
+
+static void detach_by_driver_cb_drained_begin(BdrvChild *child)
+{
+    aio_bh_schedule_oneshot(qemu_get_current_aio_context(),
+                            detach_indirect_bh, &detach_by_parent_data);
+    child_file.drained_begin(child);
+}
+
+static BdrvChildRole detach_by_driver_cb_role;
+
+/*
+ * Initial graph:
+ *
+ * PA     PB
+ *    \ /   \
+ *     A     B     C
+ *
+ * by_parent_cb == true:  Test that parent callbacks don't poll
+ *
+ *     PA has a pending read request whose callback changes the child nodes of
+ *     PB: It removes B and adds C instead. The subtree of PB is drained, which
+ *     will indirectly drain the read request, too.
+ *
+ * by_parent_cb == false: Test that bdrv_drain_invoke() doesn't poll
+ *
+ *     PA's BdrvChildRole has a .drained_begin callback that schedules a BH
+ *     that does the same graph change. If bdrv_drain_invoke() polls and runs
+ *     the BH itself, the state is messed up, but if the BH is only run from
+ *     the single BDRV_POLL_WHILE() at the end of the drain, this works fine.
+ */
+static void test_detach_indirect(bool by_parent_cb)
+{
+    BlockBackend *blk;
+    BlockDriverState *parent_a, *parent_b, *a, *b, *c;
+    BdrvChild *child_a, *child_b;
+    BlockAIOCB *acb;
+
+    QEMUIOVector qiov;
+    struct iovec iov = {
+        .iov_base = NULL,
+        .iov_len = 0,
+    };
+    qemu_iovec_init_external(&qiov, &iov, 1);
+
+    if (!by_parent_cb) {
+        detach_by_driver_cb_role = child_file;
+        detach_by_driver_cb_role.drained_begin =
+            detach_by_driver_cb_drained_begin;
+    }
+
+    /* Create all involved nodes */
+    parent_a = bdrv_new_open_driver(&bdrv_test, "parent-a", BDRV_O_RDWR,
+                                    &error_abort);
+    parent_b = bdrv_new_open_driver(&bdrv_test, "parent-b", 0,
+                                    &error_abort);
+
+    a = bdrv_new_open_driver(&bdrv_test, "a", BDRV_O_RDWR, &error_abort);
+    b = bdrv_new_open_driver(&bdrv_test, "b", BDRV_O_RDWR, &error_abort);
+    c = bdrv_new_open_driver(&bdrv_test, "c", BDRV_O_RDWR, &error_abort);
+
+    /* blk is a BB for parent-a */
+    blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    blk_insert_bs(blk, parent_a, &error_abort);
+    bdrv_unref(parent_a);
+
+    /* If we want to get bdrv_drain_invoke() to call aio_poll(), the driver
+     * callback must not return immediately. */
+    if (!by_parent_cb) {
+        BDRVTestState *s = parent_a->opaque;
+        s->sleep_in_drain_begin = true;
+    }
+
+    /* Set child relationships */
+    bdrv_ref(b);
+    bdrv_ref(a);
+    child_b = bdrv_attach_child(parent_b, b, "PB-B", &child_file, &error_abort);
+    child_a = bdrv_attach_child(parent_b, a, "PB-A", &child_backing, &error_abort);
+
+    bdrv_ref(a);
+    bdrv_attach_child(parent_a, a, "PA-A",
+                      by_parent_cb ? &child_file : &detach_by_driver_cb_role,
+                      &error_abort);
+
+    g_assert_cmpint(parent_a->refcnt, ==, 1);
+    g_assert_cmpint(parent_b->refcnt, ==, 1);
+    g_assert_cmpint(a->refcnt, ==, 3);
+    g_assert_cmpint(b->refcnt, ==, 2);
+    g_assert_cmpint(c->refcnt, ==, 1);
+
+    g_assert(QLIST_FIRST(&parent_b->children) == child_a);
+    g_assert(QLIST_NEXT(child_a, next) == child_b);
+    g_assert(QLIST_NEXT(child_b, next) == NULL);
+
+    /* Start the evil read request */
+    detach_by_parent_data = (struct detach_by_parent_data) {
+        .parent_b = parent_b,
+        .child_b = child_b,
+        .c = c,
+        .by_parent_cb = by_parent_cb,
+    };
+    acb = blk_aio_preadv(blk, 0, &qiov, 0, detach_by_parent_aio_cb, NULL);
+    g_assert(acb != NULL);
+
+    /* Drain and check the expected result */
+    bdrv_subtree_drained_begin(parent_b);
+
+    g_assert(detach_by_parent_data.child_c != NULL);
+
+    g_assert_cmpint(parent_a->refcnt, ==, 1);
+    g_assert_cmpint(parent_b->refcnt, ==, 1);
+    g_assert_cmpint(a->refcnt, ==, 3);
+    g_assert_cmpint(b->refcnt, ==, 1);
+    g_assert_cmpint(c->refcnt, ==, 2);
+
+    g_assert(QLIST_FIRST(&parent_b->children) == detach_by_parent_data.child_c);
+    g_assert(QLIST_NEXT(detach_by_parent_data.child_c, next) == child_a);
+    g_assert(QLIST_NEXT(child_a, next) == NULL);
+
+    g_assert_cmpint(parent_a->quiesce_counter, ==, 1);
+    g_assert_cmpint(parent_b->quiesce_counter, ==, 1);
+    g_assert_cmpint(a->quiesce_counter, ==, 1);
+    g_assert_cmpint(b->quiesce_counter, ==, 0);
+    g_assert_cmpint(c->quiesce_counter, ==, 1);
+
+    bdrv_subtree_drained_end(parent_b);
+
+    bdrv_unref(parent_b);
+    blk_unref(blk);
+
+    /* XXX Once bdrv_close() unref's children instead of just detaching them,
+     * this won't be necessary any more. */
+    bdrv_unref(a);
+    bdrv_unref(a);
+    bdrv_unref(c);
+
+    g_assert_cmpint(a->refcnt, ==, 1);
+    g_assert_cmpint(b->refcnt, ==, 1);
+    g_assert_cmpint(c->refcnt, ==, 1);
+    bdrv_unref(a);
+    bdrv_unref(b);
+    bdrv_unref(c);
+}
+
+static void test_detach_by_parent_cb(void)
+{
+    test_detach_indirect(true);
+}
+
+static void test_detach_by_driver_cb(void)
+{
+    test_detach_indirect(false);
+}
+
 int main(int argc, char **argv)
 {
+    int ret;
+
     bdrv_init();
     qemu_init_main_loop(&error_abort);
 
     g_test_init(&argc, &argv, NULL);
+    qemu_event_init(&done_event, false);
 
     g_test_add_func("/bdrv-drain/driver-cb/drain_all", test_drv_cb_drain_all);
     g_test_add_func("/bdrv-drain/driver-cb/drain", test_drv_cb_drain);
     g_test_add_func("/bdrv-drain/driver-cb/drain_subtree",
                     test_drv_cb_drain_subtree);
 
-    // XXX bdrv_drain_all() doesn't work in coroutine context
+    g_test_add_func("/bdrv-drain/driver-cb/co/drain_all",
+                    test_drv_cb_co_drain_all);
     g_test_add_func("/bdrv-drain/driver-cb/co/drain", test_drv_cb_co_drain);
     g_test_add_func("/bdrv-drain/driver-cb/co/drain_subtree",
                     test_drv_cb_co_drain_subtree);
@@ -639,19 +1277,38 @@ int main(int argc, char **argv)
     g_test_add_func("/bdrv-drain/quiesce/drain_subtree",
                     test_quiesce_drain_subtree);
 
-    // XXX bdrv_drain_all() doesn't work in coroutine context
+    g_test_add_func("/bdrv-drain/quiesce/co/drain_all",
+                    test_quiesce_co_drain_all);
     g_test_add_func("/bdrv-drain/quiesce/co/drain", test_quiesce_co_drain);
     g_test_add_func("/bdrv-drain/quiesce/co/drain_subtree",
                     test_quiesce_co_drain_subtree);
 
     g_test_add_func("/bdrv-drain/nested", test_nested);
     g_test_add_func("/bdrv-drain/multiparent", test_multiparent);
-    g_test_add_func("/bdrv-drain/graph-change", test_graph_change);
+
+    g_test_add_func("/bdrv-drain/graph-change/drain_subtree",
+                    test_graph_change_drain_subtree);
+    g_test_add_func("/bdrv-drain/graph-change/drain_all",
+                    test_graph_change_drain_all);
+
+    g_test_add_func("/bdrv-drain/iothread/drain_all", test_iothread_drain_all);
+    g_test_add_func("/bdrv-drain/iothread/drain", test_iothread_drain);
+    g_test_add_func("/bdrv-drain/iothread/drain_subtree",
+                    test_iothread_drain_subtree);
 
     g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
     g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain);
     g_test_add_func("/bdrv-drain/blockjob/drain_subtree",
                     test_blockjob_drain_subtree);
 
-    return g_test_run();
+    g_test_add_func("/bdrv-drain/deletion/drain", test_delete_by_drain);
+    g_test_add_func("/bdrv-drain/detach/drain_all", test_detach_by_drain_all);
+    g_test_add_func("/bdrv-drain/detach/drain", test_detach_by_drain);
+    g_test_add_func("/bdrv-drain/detach/drain_subtree", test_detach_by_drain_subtree);
+    g_test_add_func("/bdrv-drain/detach/parent_cb", test_detach_by_parent_cb);
+    g_test_add_func("/bdrv-drain/detach/driver_cb", test_detach_by_driver_cb);
+
+    ret = g_test_run();
+    qemu_event_destroy(&done_event);
+    return ret;
 }
diff --git a/tests/test-hbitmap.c b/tests/test-hbitmap.c
index f29631f939..5e67ac1d3a 100644
--- a/tests/test-hbitmap.c
+++ b/tests/test-hbitmap.c
@@ -30,6 +30,18 @@ typedef struct TestHBitmapData {
 } TestHBitmapData;
 
 
+static int64_t check_hbitmap_iter_next(HBitmapIter *hbi)
+{
+    int64_t next0, next1;
+
+    next0 = hbitmap_iter_next(hbi, false);
+    next1 = hbitmap_iter_next(hbi, true);
+
+    g_assert_cmpint(next0, ==, next1);
+
+    return next0;
+}
+
 /* Check that the HBitmap and the shadow bitmap contain the same data,
  * ignoring the same "first" bits.
  */
@@ -46,7 +58,7 @@ static void hbitmap_test_check(TestHBitmapData *data,
 
     i = first;
     for (;;) {
-        next = hbitmap_iter_next(&hbi);
+        next = check_hbitmap_iter_next(&hbi);
         if (next < 0) {
             next = data->size;
         }
@@ -435,25 +447,25 @@ static void test_hbitmap_iter_granularity(TestHBitmapData *data,
     /* Note that hbitmap_test_check has to be invoked manually in this test.  */
     hbitmap_test_init(data, 131072 << 7, 7);
     hbitmap_iter_init(&hbi, data->hb, 0);
-    g_assert_cmpint(hbitmap_iter_next(&hbi), <, 0);
+    g_assert_cmpint(check_hbitmap_iter_next(&hbi), <, 0);
 
     hbitmap_test_set(data, ((L2 + L1 + 1) << 7) + 8, 8);
     hbitmap_iter_init(&hbi, data->hb, 0);
-    g_assert_cmpint(hbitmap_iter_next(&hbi), ==, (L2 + L1 + 1) << 7);
-    g_assert_cmpint(hbitmap_iter_next(&hbi), <, 0);
+    g_assert_cmpint(check_hbitmap_iter_next(&hbi), ==, (L2 + L1 + 1) << 7);
+    g_assert_cmpint(check_hbitmap_iter_next(&hbi), <, 0);
 
     hbitmap_iter_init(&hbi, data->hb, (L2 + L1 + 2) << 7);
-    g_assert_cmpint(hbitmap_iter_next(&hbi), <, 0);
+    g_assert_cmpint(hbitmap_iter_next(&hbi, true), <, 0);
 
     hbitmap_test_set(data, (131072 << 7) - 8, 8);
     hbitmap_iter_init(&hbi, data->hb, 0);
-    g_assert_cmpint(hbitmap_iter_next(&hbi), ==, (L2 + L1 + 1) << 7);
-    g_assert_cmpint(hbitmap_iter_next(&hbi), ==, 131071 << 7);
-    g_assert_cmpint(hbitmap_iter_next(&hbi), <, 0);
+    g_assert_cmpint(check_hbitmap_iter_next(&hbi), ==, (L2 + L1 + 1) << 7);
+    g_assert_cmpint(check_hbitmap_iter_next(&hbi), ==, 131071 << 7);
+    g_assert_cmpint(check_hbitmap_iter_next(&hbi), <, 0);
 
     hbitmap_iter_init(&hbi, data->hb, (L2 + L1 + 2) << 7);
-    g_assert_cmpint(hbitmap_iter_next(&hbi), ==, 131071 << 7);
-    g_assert_cmpint(hbitmap_iter_next(&hbi), <, 0);
+    g_assert_cmpint(check_hbitmap_iter_next(&hbi), ==, 131071 << 7);
+    g_assert_cmpint(check_hbitmap_iter_next(&hbi), <, 0);
 }
 
 static void hbitmap_test_set_boundary_bits(TestHBitmapData *data, ssize_t diff)
@@ -893,7 +905,7 @@ static void test_hbitmap_serialize_zeroes(TestHBitmapData *data,
     for (i = 0; i < num_positions; i++) {
         hbitmap_deserialize_zeroes(data->hb, positions[i], min_l1, true);
         hbitmap_iter_init(&iter, data->hb, 0);
-        next = hbitmap_iter_next(&iter);
+        next = check_hbitmap_iter_next(&iter);
         if (i == num_positions - 1) {
             g_assert_cmpint(next, ==, -1);
         } else {
@@ -919,10 +931,10 @@ static void test_hbitmap_iter_and_reset(TestHBitmapData *data,
 
     hbitmap_iter_init(&hbi, data->hb, BITS_PER_LONG - 1);
 
-    hbitmap_iter_next(&hbi);
+    check_hbitmap_iter_next(&hbi);
 
     hbitmap_reset_all(data->hb);
-    hbitmap_iter_next(&hbi);
+    check_hbitmap_iter_next(&hbi);
 }
 
 static void test_hbitmap_next_zero_check(TestHBitmapData *data, int64_t start)
diff --git a/util/hbitmap.c b/util/hbitmap.c
index 58a2c93842..bcd304041a 100644
--- a/util/hbitmap.c
+++ b/util/hbitmap.c
@@ -141,7 +141,7 @@ unsigned long hbitmap_iter_skip_words(HBitmapIter *hbi)
     return cur;
 }
 
-int64_t hbitmap_iter_next(HBitmapIter *hbi)
+int64_t hbitmap_iter_next(HBitmapIter *hbi, bool advance)
 {
     unsigned long cur = hbi->cur[HBITMAP_LEVELS - 1] &
             hbi->hb->levels[HBITMAP_LEVELS - 1][hbi->pos];
@@ -154,8 +154,12 @@ int64_t hbitmap_iter_next(HBitmapIter *hbi)
         }
     }
 
-    /* The next call will resume work from the next bit.  */
-    hbi->cur[HBITMAP_LEVELS - 1] = cur & (cur - 1);
+    if (advance) {
+        /* The next call will resume work from the next bit.  */
+        hbi->cur[HBITMAP_LEVELS - 1] = cur & (cur - 1);
+    } else {
+        hbi->cur[HBITMAP_LEVELS - 1] = cur;
+    }
     item = ((uint64_t)hbi->pos << BITS_PER_LEVEL) + ctzl(cur);
 
     return item << hbi->granularity;
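For callers, the new @advance flag makes it possible to peek at the next set bit without consuming it. A minimal usage sketch (not part of the patch; assumes an initialized HBitmap *hb and mirrors check_hbitmap_iter_next() in the test above):

    HBitmapIter hbi;
    int64_t peeked, consumed;

    hbitmap_iter_init(&hbi, hb, 0);
    peeked   = hbitmap_iter_next(&hbi, false);  /* report the bit, do not advance */
    consumed = hbitmap_iter_next(&hbi, true);   /* same bit again, now consumed */
    assert(peeked == consumed);                 /* both are -1 if the bitmap is empty */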
diff --git a/util/osdep.c b/util/osdep.c
index a73de0e1ba..ea51d500b6 100644
--- a/util/osdep.c
+++ b/util/osdep.c
@@ -302,7 +302,8 @@ int qemu_open(const char *name, int flags, ...)
         }
 
         fd = monitor_fdset_get_fd(fdset_id, flags);
-        if (fd == -1) {
+        if (fd < 0) {
+            errno = -fd;
             return -1;
         }
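The caller-visible effect: when the fdset lookup fails, qemu_open() now returns -1 with errno set to the real cause instead of whatever errno happened to contain. A sketch (not part of the patch; the fdset path is hypothetical):

    int fd = qemu_open("/dev/fdset/1", O_RDWR);
    if (fd < 0) {
        /* errno carries the error reported by monitor_fdset_get_fd() */
        error_report("could not open /dev/fdset/1: %s", strerror(errno));
    }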
 
diff --git a/vl.c b/vl.c
index 6e34fb348d..b3426e03d0 100644
--- a/vl.c
+++ b/vl.c
@@ -2978,6 +2978,7 @@ int main(int argc, char **argv, char **envp)
 
     runstate_init();
     postcopy_infrastructure_init();
+    monitor_init_globals();
 
     if (qcrypto_init(&err) < 0) {
         error_reportf_err(err, "cannot initialize crypto: ");
@@ -4412,12 +4413,6 @@ int main(int argc, char **argv, char **envp)
     default_drive(default_floppy, snapshot, IF_FLOPPY, 0, FD_OPTS);
     default_drive(default_sdcard, snapshot, IF_SD, 0, SD_OPTS);
 
-    /*
-     * Note: qtest_enabled() (which is used in monitor_qapi_event_init())
-     * depends on configure_accelerator() above.
-     */
-    monitor_init_globals();
-
     if (qemu_opts_foreach(qemu_find_opts("mon"),
                           mon_init_func, NULL, NULL)) {
         exit(1);