summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r--MAINTAINERS19
-rw-r--r--Makefile2
-rw-r--r--accel.c1
-rw-r--r--async.c29
-rw-r--r--block.c6
-rw-r--r--block/backup.c17
-rw-r--r--block/block-backend.c30
-rw-r--r--block/commit.c2
-rw-r--r--block/io.c137
-rw-r--r--block/mirror.c70
-rw-r--r--block/nfs.c55
-rw-r--r--block/qed-table.c16
-rw-r--r--block/qed.c16
-rw-r--r--block/replication.c27
-rw-r--r--block/sheepdog.c67
-rw-r--r--blockjob.c37
-rwxr-xr-xconfigure11
-rw-r--r--cputlb.c1
-rw-r--r--docs/COLO-FT.txt189
-rw-r--r--docs/multiple-iothreads.txt40
-rw-r--r--docs/qmp-commands.txt17
-rw-r--r--gdbstub.c1
-rw-r--r--hmp-commands.hx15
-rw-r--r--hmp.c16
-rw-r--r--hmp.h1
-rw-r--r--hw/arm/cubieboard.c1
-rw-r--r--hw/arm/pxa2xx.c4
-rw-r--r--hw/arm/spitz.c13
-rw-r--r--hw/arm/tosa.c12
-rw-r--r--hw/arm/versatilepb.c9
-rw-r--r--hw/arm/virt-acpi-build.c2
-rw-r--r--hw/arm/virt.c9
-rw-r--r--hw/block/nvme.c4
-rw-r--r--hw/block/xen_disk.c65
-rw-r--r--hw/char/cadence_uart.c3
-rw-r--r--hw/char/xen_console.c30
-rw-r--r--hw/display/milkymist-tmu2.c2
-rw-r--r--hw/display/xenfb.c127
-rw-r--r--hw/gpio/imx_gpio.c2
-rw-r--r--hw/i386/acpi-build.c1
-rw-r--r--hw/microblaze/boot.c1
-rw-r--r--hw/mips/mips_malta.c1
-rw-r--r--hw/misc/milkymist-pfpu.c2
-rw-r--r--hw/net/xen_nic.c36
-rw-r--r--hw/nvram/fw_cfg.c1
-rw-r--r--hw/pci-bridge/pci_expander_bridge.c1
-rw-r--r--hw/ppc/ppc405_boards.c1
-rw-r--r--hw/ppc/spapr.c1
-rw-r--r--hw/s390x/s390-pci-bus.c10
-rw-r--r--hw/scsi/virtio-scsi-dataplane.c4
-rw-r--r--hw/timer/grlib_gptimer.c1
-rw-r--r--hw/tpm/tpm_passthrough.c6
-rw-r--r--hw/tpm/tpm_tis.c1
-rw-r--r--hw/unicore32/puv3.c1
-rw-r--r--hw/usb/ccid-card-emulated.c3
-rw-r--r--hw/usb/ccid-card-passthru.c6
-rw-r--r--hw/usb/ccid.h2
-rw-r--r--hw/usb/dev-mtp.c1
-rw-r--r--hw/usb/dev-smartcard-reader.c11
-rw-r--r--hw/usb/xen-usb.c46
-rw-r--r--hw/xen/Makefile.objs2
-rw-r--r--hw/xen/xen_backend.c348
-rw-r--r--hw/xen/xen_devconfig.c4
-rw-r--r--hw/xen/xen_pvdev.c316
-rw-r--r--include/block/aio.h24
-rw-r--r--include/block/block.h31
-rw-r--r--include/block/block_int.h27
-rw-r--r--include/block/blockjob.h7
-rw-r--r--include/hw/i386/pc.h1
-rw-r--r--include/hw/xen/xen_backend.h72
-rw-r--r--include/hw/xen/xen_pvdev.h78
-rw-r--r--include/migration/colo.h38
-rw-r--r--include/migration/failover.h26
-rw-r--r--include/migration/migration.h8
-rw-r--r--include/monitor/monitor.h2
-rw-r--r--include/qemu/rfifolock.h54
-rw-r--r--include/qemu/thread-posix.h6
-rw-r--r--include/qemu/thread-win32.h10
-rw-r--r--include/qemu/thread.h3
-rw-r--r--iothread.c33
-rw-r--r--migration/Makefile.objs2
-rw-r--r--migration/colo-comm.c72
-rw-r--r--migration/colo-failover.c83
-rw-r--r--migration/colo.c529
-rw-r--r--migration/migration.c86
-rw-r--r--migration/ram.c37
-rw-r--r--migration/trace-events6
-rw-r--r--monitor.c4
-rw-r--r--net/colo-compare.c29
-rw-r--r--net/trace-events3
-rw-r--r--qapi-schema.json100
-rw-r--r--qemu-ga.texi2
-rw-r--r--qemu-img.c6
-rw-r--r--qemu-io-cmds.c7
-rw-r--r--qemu-options.hx12
-rw-r--r--qmp.c1
-rwxr-xr-xscripts/clean-includes56
-rw-r--r--scripts/hxtool20
-rwxr-xr-xscripts/tracetool.py2
-rw-r--r--stubs/Makefile.objs2
-rw-r--r--stubs/iothread.c8
-rw-r--r--stubs/migration-colo.c46
-rw-r--r--target-arm/cpu.c15
-rw-r--r--target-arm/cpu.h1
-rw-r--r--target-arm/cpu64.c2
-rw-r--r--target-arm/kvm64.c17
-rw-r--r--target-i386/machine.c3
-rw-r--r--target-lm32/translate.c57
-rw-r--r--target-mips/machine.c1
-rw-r--r--target-ppc/machine.c1
-rw-r--r--target-ppc/mem_helper.c1
-rw-r--r--target-sparc/machine.c3
-rw-r--r--target-xtensa/translate.c1
-rw-r--r--tests/.gitignore1
-rw-r--r--tests/Makefile.include2
-rw-r--r--tests/crypto-tls-x509-helpers.h3
-rw-r--r--tests/test-aio.c22
-rw-r--r--tests/test-rfifolock.c91
-rw-r--r--tests/vhost-user-test.c2
-rw-r--r--util/Makefile.objs1
-rw-r--r--util/oslib-posix.c1
-rw-r--r--util/qemu-thread-posix.c14
-rw-r--r--util/qemu-thread-win32.c25
-rw-r--r--util/rfifolock.c78
-rw-r--r--vl.c23
-rw-r--r--xen-common.c4
126 files changed, 2617 insertions, 1197 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index 82d4d0083b..3fecf458c0 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -63,6 +63,17 @@ W: http://wiki.qemu.org/SecurityProcess
 M: Michael S. Tsirkin <mst@redhat.com>
 L: secalert@redhat.com
 
+Trivial patches
+---------------
+Trivial patches
+M: Michael Tokarev <mjt@tls.msk.ru>
+M: Laurent Vivier <laurent@vivier.eu>
+S: Maintained
+L: qemu-trivial@nongnu.org
+K: ^Subject:.*(?i)trivial
+T: git git://git.corpit.ru/qemu.git trivial-patches
+T: git git://github.com/vivier/qemu.git trivial-patches
+
 Guest CPU cores (TCG):
 ----------------------
 Overall
@@ -1415,6 +1426,14 @@ F: util/uuid.c
 F: include/qemu/uuid.h
 F: tests/test-uuid.c
 
+COLO Framework
+M: zhanghailiang <zhang.zhanghailiang@huawei.com>
+S: Maintained
+F: migration/colo*
+F: include/migration/colo.h
+F: include/migration/failover.h
+F: docs/COLO-FT.txt
+
 COLO Proxy
 M: Zhang Chen <zhangchen.fnst@cn.fujitsu.com>
 M: Li Zhijian <lizhijian@cn.fujitsu.com>
diff --git a/Makefile b/Makefile
index 11f5154c81..474cc5e66a 100644
--- a/Makefile
+++ b/Makefile
@@ -695,7 +695,7 @@ help:
 	@echo  ''
 ifdef CONFIG_WIN32
 	@echo  'Windows targets:'
-	@echo  '  installer       - Build NSIS-based installer for qemu-ga'
+	@echo  '  installer       - Build NSIS-based installer for QEMU'
 ifdef QEMU_GA_MSI_ENABLED
 	@echo  '  msi             - Build MSI-based installer for qemu-ga'
 endif
diff --git a/accel.c b/accel.c
index 403eb5e94d..664bb88422 100644
--- a/accel.c
+++ b/accel.c
@@ -33,7 +33,6 @@
 #include "sysemu/qtest.h"
 #include "hw/xen/xen.h"
 #include "qom/object.h"
-#include "hw/boards.h"
 
 int tcg_tb_size;
 static bool tcg_allowed = true;
diff --git a/async.c b/async.c
index f30d011ebc..b2de360c23 100644
--- a/async.c
+++ b/async.c
@@ -61,6 +61,7 @@ void aio_bh_schedule_oneshot(AioContext *ctx, QEMUBHFunc *cb, void *opaque)
     smp_wmb();
     ctx->first_bh = bh;
     qemu_mutex_unlock(&ctx->bh_lock);
+    aio_notify(ctx);
 }
 
 QEMUBH *aio_bh_new(AioContext *ctx, QEMUBHFunc *cb, void *opaque)
@@ -106,8 +107,8 @@ int aio_bh_poll(AioContext *ctx)
          * aio_notify again if necessary.
          */
         if (atomic_xchg(&bh->scheduled, 0)) {
-            /* Idle BHs and the notify BH don't count as progress */
-            if (!bh->idle && bh != ctx->notify_dummy_bh) {
+            /* Idle BHs don't count as progress */
+            if (!bh->idle) {
                 ret = 1;
             }
             bh->idle = 0;
@@ -259,7 +260,6 @@ aio_ctx_finalize(GSource     *source)
 {
     AioContext *ctx = (AioContext *) source;
 
-    qemu_bh_delete(ctx->notify_dummy_bh);
     thread_pool_free(ctx->thread_pool);
 
 #ifdef CONFIG_LINUX_AIO
@@ -284,7 +284,7 @@ aio_ctx_finalize(GSource     *source)
 
     aio_set_event_notifier(ctx, &ctx->notifier, false, NULL);
     event_notifier_cleanup(&ctx->notifier);
-    rfifolock_destroy(&ctx->lock);
+    qemu_rec_mutex_destroy(&ctx->lock);
     qemu_mutex_destroy(&ctx->bh_lock);
     timerlistgroup_deinit(&ctx->tlg);
 }
@@ -345,19 +345,6 @@ static void aio_timerlist_notify(void *opaque)
     aio_notify(opaque);
 }
 
-static void aio_rfifolock_cb(void *opaque)
-{
-    AioContext *ctx = opaque;
-
-    /* Kick owner thread in case they are blocked in aio_poll() */
-    qemu_bh_schedule(ctx->notify_dummy_bh);
-}
-
-static void notify_dummy_bh(void *opaque)
-{
-    /* Do nothing, we were invoked just to force the event loop to iterate */
-}
-
 static void event_notifier_dummy_cb(EventNotifier *e)
 {
 }
@@ -385,11 +372,9 @@ AioContext *aio_context_new(Error **errp)
 #endif
     ctx->thread_pool = NULL;
     qemu_mutex_init(&ctx->bh_lock);
-    rfifolock_init(&ctx->lock, aio_rfifolock_cb, ctx);
+    qemu_rec_mutex_init(&ctx->lock);
     timerlistgroup_init(&ctx->tlg, aio_timerlist_notify, ctx);
 
-    ctx->notify_dummy_bh = aio_bh_new(ctx, notify_dummy_bh, NULL);
-
     return ctx;
 fail:
     g_source_destroy(&ctx->source);
@@ -408,10 +393,10 @@ void aio_context_unref(AioContext *ctx)
 
 void aio_context_acquire(AioContext *ctx)
 {
-    rfifolock_lock(&ctx->lock);
+    qemu_rec_mutex_lock(&ctx->lock);
 }
 
 void aio_context_release(AioContext *ctx)
 {
-    rfifolock_unlock(&ctx->lock);
+    qemu_rec_mutex_unlock(&ctx->lock);
 }
diff --git a/block.c b/block.c
index 7f3e7bcdc3..a17baab1d0 100644
--- a/block.c
+++ b/block.c
@@ -2082,7 +2082,7 @@ BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
  * to all devices.
  *
  */
-int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
+int bdrv_reopen_multiple(AioContext *ctx, BlockReopenQueue *bs_queue, Error **errp)
 {
     int ret = -1;
     BlockReopenQueueEntry *bs_entry, *next;
@@ -2090,7 +2090,9 @@ int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
 
     assert(bs_queue != NULL);
 
+    aio_context_release(ctx);
     bdrv_drain_all();
+    aio_context_acquire(ctx);
 
     QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
         if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
@@ -2131,7 +2133,7 @@ int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
     Error *local_err = NULL;
     BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, NULL, bdrv_flags);
 
-    ret = bdrv_reopen_multiple(queue, &local_err);
+    ret = bdrv_reopen_multiple(bdrv_get_aio_context(bs), queue, &local_err);
     if (local_err != NULL) {
         error_propagate(errp, local_err);
     }
diff --git a/block/backup.c b/block/backup.c
index 02dbe48035..81d4042ae8 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -300,6 +300,21 @@ void backup_cow_request_end(CowRequest *req)
     cow_request_end(req);
 }
 
+static void backup_drain(BlockJob *job)
+{
+    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
+
+    /* Need to keep a reference in case blk_drain triggers execution
+     * of backup_complete...
+     */
+    if (s->target) {
+        BlockBackend *target = s->target;
+        blk_ref(target);
+        blk_drain(target);
+        blk_unref(target);
+    }
+}
+
 static const BlockJobDriver backup_job_driver = {
     .instance_size          = sizeof(BackupBlockJob),
     .job_type               = BLOCK_JOB_TYPE_BACKUP,
@@ -307,6 +322,7 @@ static const BlockJobDriver backup_job_driver = {
     .commit                 = backup_commit,
     .abort                  = backup_abort,
     .attached_aio_context   = backup_attached_aio_context,
+    .drain                  = backup_drain,
 };
 
 static BlockErrorAction backup_error_action(BackupBlockJob *job,
@@ -331,6 +347,7 @@ static void backup_complete(BlockJob *job, void *opaque)
     BackupCompleteData *data = opaque;
 
     blk_unref(s->target);
+    s->target = NULL;
 
     block_job_completed(job, data->ret);
     g_free(data);
diff --git a/block/block-backend.c b/block/block-backend.c
index c53ca30000..27a7f6f523 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -799,20 +799,25 @@ int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
                                BdrvRequestFlags flags)
 {
     int ret;
+    BlockDriverState *bs = blk_bs(blk);
 
-    trace_blk_co_preadv(blk, blk_bs(blk), offset, bytes, flags);
+    trace_blk_co_preadv(blk, bs, offset, bytes, flags);
 
     ret = blk_check_byte_request(blk, offset, bytes);
     if (ret < 0) {
         return ret;
     }
 
+    bdrv_inc_in_flight(bs);
+
     /* throttling disk I/O */
     if (blk->public.throttle_state) {
         throttle_group_co_io_limits_intercept(blk, bytes, false);
     }
 
-    return bdrv_co_preadv(blk->root, offset, bytes, qiov, flags);
+    ret = bdrv_co_preadv(blk->root, offset, bytes, qiov, flags);
+    bdrv_dec_in_flight(bs);
+    return ret;
 }
 
 int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
@@ -820,14 +825,17 @@ int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
                                 BdrvRequestFlags flags)
 {
     int ret;
+    BlockDriverState *bs = blk_bs(blk);
 
-    trace_blk_co_pwritev(blk, blk_bs(blk), offset, bytes, flags);
+    trace_blk_co_pwritev(blk, bs, offset, bytes, flags);
 
     ret = blk_check_byte_request(blk, offset, bytes);
     if (ret < 0) {
         return ret;
     }
 
+    bdrv_inc_in_flight(bs);
+
     /* throttling disk I/O */
     if (blk->public.throttle_state) {
         throttle_group_co_io_limits_intercept(blk, bytes, true);
@@ -837,7 +845,9 @@ int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
         flags |= BDRV_REQ_FUA;
     }
 
-    return bdrv_co_pwritev(blk->root, offset, bytes, qiov, flags);
+    ret = bdrv_co_pwritev(blk->root, offset, bytes, qiov, flags);
+    bdrv_dec_in_flight(bs);
+    return ret;
 }
 
 typedef struct BlkRwCo {
@@ -868,7 +878,6 @@ static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf,
                    int64_t bytes, CoroutineEntry co_entry,
                    BdrvRequestFlags flags)
 {
-    AioContext *aio_context;
     QEMUIOVector qiov;
     struct iovec iov;
     Coroutine *co;
@@ -890,11 +899,7 @@ static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf,
 
     co = qemu_coroutine_create(co_entry, &rwco);
     qemu_coroutine_enter(co);
-
-    aio_context = blk_get_aio_context(blk);
-    while (rwco.ret == NOT_DONE) {
-        aio_poll(aio_context, true);
-    }
+    BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE);
 
     return rwco.ret;
 }
@@ -930,6 +935,8 @@ int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags)
 static void error_callback_bh(void *opaque)
 {
     struct BlockBackendAIOCB *acb = opaque;
+
+    bdrv_dec_in_flight(acb->common.bs);
     acb->common.cb(acb->common.opaque, acb->ret);
     qemu_aio_unref(acb);
 }
@@ -940,6 +947,7 @@ BlockAIOCB *blk_abort_aio_request(BlockBackend *blk,
 {
     struct BlockBackendAIOCB *acb;
 
+    bdrv_inc_in_flight(blk_bs(blk));
     acb = blk_aio_get(&block_backend_aiocb_info, blk, cb, opaque);
     acb->blk = blk;
     acb->ret = ret;
@@ -962,6 +970,7 @@ static const AIOCBInfo blk_aio_em_aiocb_info = {
 static void blk_aio_complete(BlkAioEmAIOCB *acb)
 {
     if (acb->has_returned) {
+        bdrv_dec_in_flight(acb->common.bs);
         acb->common.cb(acb->common.opaque, acb->rwco.ret);
         qemu_aio_unref(acb);
     }
@@ -983,6 +992,7 @@ static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset, int bytes,
     BlkAioEmAIOCB *acb;
     Coroutine *co;
 
+    bdrv_inc_in_flight(blk_bs(blk));
     acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
     acb->rwco = (BlkRwCo) {
         .blk    = blk,
diff --git a/block/commit.c b/block/commit.c
index 9f67a8b121..499eccaeee 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -251,7 +251,7 @@ void commit_start(const char *job_id, BlockDriverState *bs,
                                          orig_overlay_flags | BDRV_O_RDWR);
     }
     if (reopen_queue) {
-        bdrv_reopen_multiple(reopen_queue, &local_err);
+        bdrv_reopen_multiple(bdrv_get_aio_context(bs), reopen_queue, &local_err);
         if (local_err != NULL) {
             error_propagate(errp, local_err);
             block_job_unref(&s->common);
diff --git a/block/io.c b/block/io.c
index 79cbbdf769..be0d862ca6 100644
--- a/block/io.c
+++ b/block/io.c
@@ -143,7 +143,7 @@ bool bdrv_requests_pending(BlockDriverState *bs)
 {
     BdrvChild *child;
 
-    if (!QLIST_EMPTY(&bs->tracked_requests)) {
+    if (atomic_read(&bs->in_flight)) {
         return true;
     }
 
@@ -156,16 +156,22 @@ bool bdrv_requests_pending(BlockDriverState *bs)
     return false;
 }
 
-static void bdrv_drain_recurse(BlockDriverState *bs)
+static bool bdrv_drain_recurse(BlockDriverState *bs)
 {
     BdrvChild *child;
+    bool waited;
+
+    waited = BDRV_POLL_WHILE(bs, atomic_read(&bs->in_flight) > 0);
 
     if (bs->drv && bs->drv->bdrv_drain) {
         bs->drv->bdrv_drain(bs);
     }
+
     QLIST_FOREACH(child, &bs->children, next) {
-        bdrv_drain_recurse(child->bs);
+        waited |= bdrv_drain_recurse(child->bs);
     }
+
+    return waited;
 }
 
 typedef struct {
@@ -174,23 +180,14 @@ typedef struct {
     bool done;
 } BdrvCoDrainData;
 
-static void bdrv_drain_poll(BlockDriverState *bs)
-{
-    bool busy = true;
-
-    while (busy) {
-        /* Keep iterating */
-        busy = bdrv_requests_pending(bs);
-        busy |= aio_poll(bdrv_get_aio_context(bs), busy);
-    }
-}
-
 static void bdrv_co_drain_bh_cb(void *opaque)
 {
     BdrvCoDrainData *data = opaque;
     Coroutine *co = data->co;
+    BlockDriverState *bs = data->bs;
 
-    bdrv_drain_poll(data->bs);
+    bdrv_dec_in_flight(bs);
+    bdrv_drained_begin(bs);
     data->done = true;
     qemu_coroutine_enter(co);
 }
@@ -209,6 +206,7 @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs)
         .bs = bs,
         .done = false,
     };
+    bdrv_inc_in_flight(bs);
     aio_bh_schedule_oneshot(bdrv_get_aio_context(bs),
                             bdrv_co_drain_bh_cb, &data);
 
@@ -220,6 +218,11 @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs)
 
 void bdrv_drained_begin(BlockDriverState *bs)
 {
+    if (qemu_in_coroutine()) {
+        bdrv_co_yield_to_drain(bs);
+        return;
+    }
+
     if (!bs->quiesce_counter++) {
         aio_disable_external(bdrv_get_aio_context(bs));
         bdrv_parent_drained_begin(bs);
@@ -227,11 +230,6 @@ void bdrv_drained_begin(BlockDriverState *bs)
 
     bdrv_io_unplugged_begin(bs);
     bdrv_drain_recurse(bs);
-    if (qemu_in_coroutine()) {
-        bdrv_co_yield_to_drain(bs);
-    } else {
-        bdrv_drain_poll(bs);
-    }
     bdrv_io_unplugged_end(bs);
 }
 
@@ -279,7 +277,7 @@ void bdrv_drain(BlockDriverState *bs)
 void bdrv_drain_all(void)
 {
     /* Always run first iteration so any pending completion BHs run */
-    bool busy = true;
+    bool waited = true;
     BlockDriverState *bs;
     BdrvNextIterator it;
     BlockJob *job = NULL;
@@ -299,7 +297,6 @@ void bdrv_drain_all(void)
         aio_context_acquire(aio_context);
         bdrv_parent_drained_begin(bs);
         bdrv_io_unplugged_begin(bs);
-        bdrv_drain_recurse(bs);
         aio_context_release(aio_context);
 
         if (!g_slist_find(aio_ctxs, aio_context)) {
@@ -313,8 +310,8 @@ void bdrv_drain_all(void)
      * request completion.  Therefore we must keep looping until there was no
      * more activity rather than simply draining each device independently.
      */
-    while (busy) {
-        busy = false;
+    while (waited) {
+        waited = false;
 
         for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) {
             AioContext *aio_context = ctx->data;
@@ -322,13 +319,9 @@ void bdrv_drain_all(void)
             aio_context_acquire(aio_context);
             for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
                 if (aio_context == bdrv_get_aio_context(bs)) {
-                    if (bdrv_requests_pending(bs)) {
-                        busy = true;
-                        aio_poll(aio_context, busy);
-                    }
+                    waited |= bdrv_drain_recurse(bs);
                 }
             }
-            busy |= aio_poll(aio_context, false);
             aio_context_release(aio_context);
         }
     }
@@ -476,6 +469,28 @@ static bool tracked_request_overlaps(BdrvTrackedRequest *req,
     return true;
 }
 
+void bdrv_inc_in_flight(BlockDriverState *bs)
+{
+    atomic_inc(&bs->in_flight);
+}
+
+static void dummy_bh_cb(void *opaque)
+{
+}
+
+void bdrv_wakeup(BlockDriverState *bs)
+{
+    if (bs->wakeup) {
+        aio_bh_schedule_oneshot(qemu_get_aio_context(), dummy_bh_cb, NULL);
+    }
+}
+
+void bdrv_dec_in_flight(BlockDriverState *bs)
+{
+    atomic_dec(&bs->in_flight);
+    bdrv_wakeup(bs);
+}
+
 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
 {
     BlockDriverState *bs = self->bs;
@@ -583,13 +598,9 @@ static int bdrv_prwv_co(BdrvChild *child, int64_t offset,
         /* Fast-path if already in coroutine context */
         bdrv_rw_co_entry(&rwco);
     } else {
-        AioContext *aio_context = bdrv_get_aio_context(child->bs);
-
         co = qemu_coroutine_create(bdrv_rw_co_entry, &rwco);
         qemu_coroutine_enter(co);
-        while (rwco.ret == NOT_DONE) {
-            aio_poll(aio_context, true);
-        }
+        BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE);
     }
     return rwco.ret;
 }
@@ -1097,6 +1108,8 @@ int coroutine_fn bdrv_co_preadv(BdrvChild *child,
         return ret;
     }
 
+    bdrv_inc_in_flight(bs);
+
     /* Don't do copy-on-read if we read data before write operation */
     if (bs->copy_on_read && !(flags & BDRV_REQ_NO_SERIALISING)) {
         flags |= BDRV_REQ_COPY_ON_READ;
@@ -1132,6 +1145,7 @@ int coroutine_fn bdrv_co_preadv(BdrvChild *child,
                               use_local_qiov ? &local_qiov : qiov,
                               flags);
     tracked_request_end(&req);
+    bdrv_dec_in_flight(bs);
 
     if (use_local_qiov) {
         qemu_iovec_destroy(&local_qiov);
@@ -1480,6 +1494,7 @@ int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
         return ret;
     }
 
+    bdrv_inc_in_flight(bs);
     /*
      * Align write if necessary by performing a read-modify-write cycle.
      * Pad qiov with the read parts and be sure to have a tracked request not
@@ -1581,6 +1596,7 @@ fail:
     qemu_vfree(tail_buf);
 out:
     tracked_request_end(&req);
+    bdrv_dec_in_flight(bs);
     return ret;
 }
 
@@ -1705,17 +1721,19 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
     }
 
     *file = NULL;
+    bdrv_inc_in_flight(bs);
     ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum,
                                             file);
     if (ret < 0) {
         *pnum = 0;
-        return ret;
+        goto out;
     }
 
     if (ret & BDRV_BLOCK_RAW) {
         assert(ret & BDRV_BLOCK_OFFSET_VALID);
-        return bdrv_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS,
-                                     *pnum, pnum, file);
+        ret = bdrv_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS,
+                                    *pnum, pnum, file);
+        goto out;
     }
 
     if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
@@ -1757,6 +1775,8 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
         }
     }
 
+out:
+    bdrv_dec_in_flight(bs);
     return ret;
 }
 
@@ -1822,14 +1842,10 @@ int64_t bdrv_get_block_status_above(BlockDriverState *bs,
         /* Fast-path if already in coroutine context */
         bdrv_get_block_status_above_co_entry(&data);
     } else {
-        AioContext *aio_context = bdrv_get_aio_context(bs);
-
         co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry,
                                    &data);
         qemu_coroutine_enter(co);
-        while (!data.done) {
-            aio_poll(aio_context, true);
-        }
+        BDRV_POLL_WHILE(bs, !data.done);
     }
     return data.ret;
 }
@@ -2102,6 +2118,7 @@ static const AIOCBInfo bdrv_em_co_aiocb_info = {
 static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
 {
     if (!acb->need_bh) {
+        bdrv_dec_in_flight(acb->common.bs);
         acb->common.cb(acb->common.opaque, acb->req.error);
         qemu_aio_unref(acb);
     }
@@ -2152,6 +2169,9 @@ static BlockAIOCB *bdrv_co_aio_prw_vector(BdrvChild *child,
     Coroutine *co;
     BlockAIOCBCoroutine *acb;
 
+    /* Matched by bdrv_co_complete's bdrv_dec_in_flight.  */
+    bdrv_inc_in_flight(child->bs);
+
     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, child->bs, cb, opaque);
     acb->child = child;
     acb->need_bh = true;
@@ -2185,6 +2205,9 @@ BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
     Coroutine *co;
     BlockAIOCBCoroutine *acb;
 
+    /* Matched by bdrv_co_complete's bdrv_dec_in_flight.  */
+    bdrv_inc_in_flight(bs);
+
     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
     acb->need_bh = true;
     acb->req.error = -EINPROGRESS;
@@ -2244,23 +2267,22 @@ static void coroutine_fn bdrv_flush_co_entry(void *opaque)
 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
 {
     int ret;
-    BdrvTrackedRequest req;
 
     if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
         bdrv_is_sg(bs)) {
         return 0;
     }
 
-    tracked_request_begin(&req, bs, 0, 0, BDRV_TRACKED_FLUSH);
+    bdrv_inc_in_flight(bs);
 
     int current_gen = bs->write_gen;
 
     /* Wait until any previous flushes are completed */
-    while (bs->active_flush_req != NULL) {
+    while (bs->active_flush_req) {
         qemu_co_queue_wait(&bs->flush_queue);
     }
 
-    bs->active_flush_req = &req;
+    bs->active_flush_req = true;
 
     /* Write back all layers by calling one driver function */
     if (bs->drv->bdrv_co_flush) {
@@ -2330,11 +2352,11 @@ flush_parent:
 out:
     /* Notify any pending flushes that we have completed */
     bs->flushed_gen = current_gen;
-    bs->active_flush_req = NULL;
+    bs->active_flush_req = false;
     /* Return value is ignored - it's ok if wait queue is empty */
     qemu_co_queue_next(&bs->flush_queue);
 
-    tracked_request_end(&req);
+    bdrv_dec_in_flight(bs);
     return ret;
 }
 
@@ -2350,13 +2372,9 @@ int bdrv_flush(BlockDriverState *bs)
         /* Fast-path if already in coroutine context */
         bdrv_flush_co_entry(&flush_co);
     } else {
-        AioContext *aio_context = bdrv_get_aio_context(bs);
-
         co = qemu_coroutine_create(bdrv_flush_co_entry, &flush_co);
         qemu_coroutine_enter(co);
-        while (flush_co.ret == NOT_DONE) {
-            aio_poll(aio_context, true);
-        }
+        BDRV_POLL_WHILE(bs, flush_co.ret == NOT_DONE);
     }
 
     return flush_co.ret;
@@ -2417,6 +2435,7 @@ int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset,
         return 0;
     }
 
+    bdrv_inc_in_flight(bs);
     tracked_request_begin(&req, bs, offset, count, BDRV_TRACKED_DISCARD);
 
     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);
@@ -2463,6 +2482,7 @@ out:
     bdrv_set_dirty(bs, req.offset >> BDRV_SECTOR_BITS,
                    req.bytes >> BDRV_SECTOR_BITS);
     tracked_request_end(&req);
+    bdrv_dec_in_flight(bs);
     return ret;
 }
 
@@ -2480,13 +2500,9 @@ int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int count)
         /* Fast-path if already in coroutine context */
         bdrv_pdiscard_co_entry(&rwco);
     } else {
-        AioContext *aio_context = bdrv_get_aio_context(bs);
-
         co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco);
         qemu_coroutine_enter(co);
-        while (rwco.ret == NOT_DONE) {
-            aio_poll(aio_context, true);
-        }
+        BDRV_POLL_WHILE(bs, rwco.ret == NOT_DONE);
     }
 
     return rwco.ret;
@@ -2495,13 +2511,12 @@ int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int count)
 int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf)
 {
     BlockDriver *drv = bs->drv;
-    BdrvTrackedRequest tracked_req;
     CoroutineIOCompletion co = {
         .coroutine = qemu_coroutine_self(),
     };
     BlockAIOCB *acb;
 
-    tracked_request_begin(&tracked_req, bs, 0, 0, BDRV_TRACKED_IOCTL);
+    bdrv_inc_in_flight(bs);
     if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) {
         co.ret = -ENOTSUP;
         goto out;
@@ -2518,7 +2533,7 @@ int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf)
         qemu_coroutine_yield();
     }
 out:
-    tracked_request_end(&tracked_req);
+    bdrv_dec_in_flight(bs);
     return co.ret;
 }
 
diff --git a/block/mirror.c b/block/mirror.c
index a433e6848c..3a0788ede3 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -469,7 +469,11 @@ static void mirror_free_init(MirrorBlockJob *s)
     }
 }
 
-static void mirror_drain(MirrorBlockJob *s)
+/* This is also used for the .pause callback. There is no matching
+ * mirror_resume() because mirror_run() will begin iterating again
+ * when the job is resumed.
+ */
+static void mirror_wait_for_all_io(MirrorBlockJob *s)
 {
     while (s->in_flight > 0) {
         mirror_wait_for_io(s);
@@ -528,6 +532,7 @@ static void mirror_exit(BlockJob *job, void *opaque)
     g_free(s->replaces);
     bdrv_op_unblock_all(target_bs, s->common.blocker);
     blk_unref(s->target);
+    s->target = NULL;
     block_job_completed(&s->common, data->ret);
     g_free(data);
     bdrv_drained_end(src);
@@ -582,7 +587,7 @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s)
             sector_num += nb_sectors;
         }
 
-        mirror_drain(s);
+        mirror_wait_for_all_io(s);
     }
 
     /* First part, loop on the sectors and initialize the dirty bitmap.  */
@@ -617,6 +622,7 @@ static void coroutine_fn mirror_run(void *opaque)
     MirrorExitData *data;
     BlockDriverState *bs = blk_bs(s->common.blk);
     BlockDriverState *target_bs = blk_bs(s->target);
+    bool need_drain = true;
     int64_t length;
     BlockDriverInfo bdi;
     char backing_filename[2]; /* we only need 2 characters because we are only
@@ -752,11 +758,26 @@ static void coroutine_fn mirror_run(void *opaque)
              * source has dirty data to copy!
              *
              * Note that I/O can be submitted by the guest while
-             * mirror_populate runs.
+             * mirror_populate runs, so pause it now.  Before deciding whether
+             * to switch to the target, check one last time whether I/O has
+             * come in the meanwhile and, if not, flush the data to disk.
              */
             trace_mirror_before_drain(s, cnt);
-            bdrv_co_drain(bs);
+
+            bdrv_drained_begin(bs);
             cnt = bdrv_get_dirty_count(s->dirty_bitmap);
+            if (cnt > 0) {
+                bdrv_drained_end(bs);
+                continue;
+            }
+
+            /* The two disks are in sync.  Exit and report successful
+             * completion.
+             */
+            assert(QLIST_EMPTY(&bs->tracked_requests));
+            s->common.cancelled = false;
+            need_drain = false;
+            break;
         }
 
         ret = 0;
@@ -769,13 +790,6 @@ static void coroutine_fn mirror_run(void *opaque)
         } else if (!should_complete) {
             delay_ns = (s->in_flight == 0 && cnt == 0 ? SLICE_TIME : 0);
             block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
-        } else if (cnt == 0) {
-            /* The two disks are in sync.  Exit and report successful
-             * completion.
-             */
-            assert(QLIST_EMPTY(&bs->tracked_requests));
-            s->common.cancelled = false;
-            break;
         }
         s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
     }
@@ -787,7 +801,8 @@ immediate_exit:
          * the target is a copy of the source.
          */
         assert(ret < 0 || (!s->synced && block_job_is_cancelled(&s->common)));
-        mirror_drain(s);
+        assert(need_drain);
+        mirror_wait_for_all_io(s);
     }
 
     assert(s->in_flight == 0);
@@ -799,9 +814,10 @@ immediate_exit:
 
     data = g_malloc(sizeof(*data));
     data->ret = ret;
-    /* Before we switch to target in mirror_exit, make sure data doesn't
-     * change. */
-    bdrv_drained_begin(bs);
+
+    if (need_drain) {
+        bdrv_drained_begin(bs);
+    }
     block_job_defer_to_main_loop(&s->common, mirror_exit, data);
 }
 
@@ -872,14 +888,11 @@ static void mirror_complete(BlockJob *job, Error **errp)
     block_job_enter(&s->common);
 }
 
-/* There is no matching mirror_resume() because mirror_run() will begin
- * iterating again when the job is resumed.
- */
-static void coroutine_fn mirror_pause(BlockJob *job)
+static void mirror_pause(BlockJob *job)
 {
     MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
 
-    mirror_drain(s);
+    mirror_wait_for_all_io(s);
 }
 
 static void mirror_attached_aio_context(BlockJob *job, AioContext *new_context)
@@ -889,6 +902,21 @@ static void mirror_attached_aio_context(BlockJob *job, AioContext *new_context)
     blk_set_aio_context(s->target, new_context);
 }
 
+static void mirror_drain(BlockJob *job)
+{
+    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
+
+    /* Need to keep a reference in case blk_drain triggers execution
+     * of mirror_complete...
+     */
+    if (s->target) {
+        BlockBackend *target = s->target;
+        blk_ref(target);
+        blk_drain(target);
+        blk_unref(target);
+    }
+}
+
 static const BlockJobDriver mirror_job_driver = {
     .instance_size          = sizeof(MirrorBlockJob),
     .job_type               = BLOCK_JOB_TYPE_MIRROR,
@@ -896,6 +924,7 @@ static const BlockJobDriver mirror_job_driver = {
     .complete               = mirror_complete,
     .pause                  = mirror_pause,
     .attached_aio_context   = mirror_attached_aio_context,
+    .drain                  = mirror_drain,
 };
 
 static const BlockJobDriver commit_active_job_driver = {
@@ -905,6 +934,7 @@ static const BlockJobDriver commit_active_job_driver = {
     .complete               = mirror_complete,
     .pause                  = mirror_pause,
     .attached_aio_context   = mirror_attached_aio_context,
+    .drain                  = mirror_drain,
 };
 
 static void mirror_start_job(const char *job_id, BlockDriverState *bs,
diff --git a/block/nfs.c b/block/nfs.c
index c3db2ec58d..88c60a9118 100644
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -52,6 +52,7 @@ typedef struct NFSClient {
 } NFSClient;
 
 typedef struct NFSRPC {
+    BlockDriverState *bs;
     int ret;
     int complete;
     QEMUIOVector *iov;
@@ -90,11 +91,12 @@ static void nfs_process_write(void *arg)
     nfs_set_events(client);
 }
 
-static void nfs_co_init_task(NFSClient *client, NFSRPC *task)
+static void nfs_co_init_task(BlockDriverState *bs, NFSRPC *task)
 {
     *task = (NFSRPC) {
         .co             = qemu_coroutine_self(),
-        .client         = client,
+        .bs             = bs,
+        .client         = bs->opaque,
     };
 }
 
@@ -111,6 +113,7 @@ nfs_co_generic_cb(int ret, struct nfs_context *nfs, void *data,
 {
     NFSRPC *task = private_data;
     task->ret = ret;
+    assert(!task->st);
     if (task->ret > 0 && task->iov) {
         if (task->ret <= task->iov->size) {
             qemu_iovec_from_buf(task->iov, 0, data, task->ret);
@@ -118,18 +121,11 @@ nfs_co_generic_cb(int ret, struct nfs_context *nfs, void *data,
             task->ret = -EIO;
         }
     }
-    if (task->ret == 0 && task->st) {
-        memcpy(task->st, data, sizeof(struct stat));
-    }
     if (task->ret < 0) {
         error_report("NFS Error: %s", nfs_get_error(nfs));
     }
-    if (task->co) {
-        aio_bh_schedule_oneshot(task->client->aio_context,
-                                nfs_co_generic_bh_cb, task);
-    } else {
-        task->complete = 1;
-    }
+    aio_bh_schedule_oneshot(task->client->aio_context,
+                            nfs_co_generic_bh_cb, task);
 }
 
 static int coroutine_fn nfs_co_readv(BlockDriverState *bs,
@@ -139,7 +135,7 @@ static int coroutine_fn nfs_co_readv(BlockDriverState *bs,
     NFSClient *client = bs->opaque;
     NFSRPC task;
 
-    nfs_co_init_task(client, &task);
+    nfs_co_init_task(bs, &task);
     task.iov = iov;
 
     if (nfs_pread_async(client->context, client->fh,
@@ -149,8 +145,8 @@ static int coroutine_fn nfs_co_readv(BlockDriverState *bs,
         return -ENOMEM;
     }
 
+    nfs_set_events(client);
     while (!task.complete) {
-        nfs_set_events(client);
         qemu_coroutine_yield();
     }
 
@@ -174,7 +170,7 @@ static int coroutine_fn nfs_co_writev(BlockDriverState *bs,
     NFSRPC task;
     char *buf = NULL;
 
-    nfs_co_init_task(client, &task);
+    nfs_co_init_task(bs, &task);
 
     buf = g_try_malloc(nb_sectors * BDRV_SECTOR_SIZE);
     if (nb_sectors && buf == NULL) {
@@ -191,8 +187,8 @@ static int coroutine_fn nfs_co_writev(BlockDriverState *bs,
         return -ENOMEM;
     }
 
+    nfs_set_events(client);
     while (!task.complete) {
-        nfs_set_events(client);
         qemu_coroutine_yield();
     }
 
@@ -210,15 +206,15 @@ static int coroutine_fn nfs_co_flush(BlockDriverState *bs)
     NFSClient *client = bs->opaque;
     NFSRPC task;
 
-    nfs_co_init_task(client, &task);
+    nfs_co_init_task(bs, &task);
 
     if (nfs_fsync_async(client->context, client->fh, nfs_co_generic_cb,
                         &task) != 0) {
         return -ENOMEM;
     }
 
+    nfs_set_events(client);
     while (!task.complete) {
-        nfs_set_events(client);
         qemu_coroutine_yield();
     }
 
@@ -496,6 +492,22 @@ static int nfs_has_zero_init(BlockDriverState *bs)
     return client->has_zero_init;
 }
 
+static void
+nfs_get_allocated_file_size_cb(int ret, struct nfs_context *nfs, void *data,
+                               void *private_data)
+{
+    NFSRPC *task = private_data;
+    task->ret = ret;
+    if (task->ret == 0) {
+        memcpy(task->st, data, sizeof(struct stat));
+    }
+    if (task->ret < 0) {
+        error_report("NFS Error: %s", nfs_get_error(nfs));
+    }
+    task->complete = 1;
+    bdrv_wakeup(task->bs);
+}
+
 static int64_t nfs_get_allocated_file_size(BlockDriverState *bs)
 {
     NFSClient *client = bs->opaque;
@@ -507,16 +519,15 @@ static int64_t nfs_get_allocated_file_size(BlockDriverState *bs)
         return client->st_blocks * 512;
     }
 
+    task.bs = bs;
     task.st = &st;
-    if (nfs_fstat_async(client->context, client->fh, nfs_co_generic_cb,
+    if (nfs_fstat_async(client->context, client->fh, nfs_get_allocated_file_size_cb,
                         &task) != 0) {
         return -ENOMEM;
     }
 
-    while (!task.complete) {
-        nfs_set_events(client);
-        aio_poll(client->aio_context, true);
-    }
+    nfs_set_events(client);
+    BDRV_POLL_WHILE(bs, !task.complete);
 
     return (task.ret < 0 ? task.ret : st.st_blocks * 512);
 }
diff --git a/block/qed-table.c b/block/qed-table.c
index 1a731dff51..ed443e2b70 100644
--- a/block/qed-table.c
+++ b/block/qed-table.c
@@ -174,9 +174,7 @@ int qed_read_l1_table_sync(BDRVQEDState *s)
 
     qed_read_table(s, s->header.l1_table_offset,
                    s->l1_table, qed_sync_cb, &ret);
-    while (ret == -EINPROGRESS) {
-        aio_poll(bdrv_get_aio_context(s->bs), true);
-    }
+    BDRV_POLL_WHILE(s->bs, ret == -EINPROGRESS);
 
     return ret;
 }
@@ -195,9 +193,7 @@ int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index,
     int ret = -EINPROGRESS;
 
     qed_write_l1_table(s, index, n, qed_sync_cb, &ret);
-    while (ret == -EINPROGRESS) {
-        aio_poll(bdrv_get_aio_context(s->bs), true);
-    }
+    BDRV_POLL_WHILE(s->bs, ret == -EINPROGRESS);
 
     return ret;
 }
@@ -268,9 +264,7 @@ int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, uint64_t offset
     int ret = -EINPROGRESS;
 
     qed_read_l2_table(s, request, offset, qed_sync_cb, &ret);
-    while (ret == -EINPROGRESS) {
-        aio_poll(bdrv_get_aio_context(s->bs), true);
-    }
+    BDRV_POLL_WHILE(s->bs, ret == -EINPROGRESS);
 
     return ret;
 }
@@ -290,9 +284,7 @@ int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request,
     int ret = -EINPROGRESS;
 
     qed_write_l2_table(s, request, index, n, flush, qed_sync_cb, &ret);
-    while (ret == -EINPROGRESS) {
-        aio_poll(bdrv_get_aio_context(s->bs), true);
-    }
+    BDRV_POLL_WHILE(s->bs, ret == -EINPROGRESS);
 
     return ret;
 }
diff --git a/block/qed.c b/block/qed.c
index 3ee879b52e..1a7ef0a9ce 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -336,7 +336,7 @@ static void qed_need_check_timer_cb(void *opaque)
     qed_plug_allocating_write_reqs(s);
 
     /* Ensure writes are on disk before clearing flag */
-    bdrv_aio_flush(s->bs, qed_clear_need_check, s);
+    bdrv_aio_flush(s->bs->file->bs, qed_clear_need_check, s);
 }
 
 static void qed_start_need_check_timer(BDRVQEDState *s)
@@ -378,6 +378,19 @@ static void bdrv_qed_attach_aio_context(BlockDriverState *bs,
     }
 }
 
+static void bdrv_qed_drain(BlockDriverState *bs)
+{
+    BDRVQEDState *s = bs->opaque;
+
+    /* Fire the timer immediately in order to start doing I/O as soon as the
+     * header is flushed.
+     */
+    if (s->need_check_timer && timer_pending(s->need_check_timer)) {
+        qed_cancel_need_check_timer(s);
+        qed_need_check_timer_cb(s);
+    }
+}
+
 static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags,
                          Error **errp)
 {
@@ -1668,6 +1681,7 @@ static BlockDriver bdrv_qed = {
     .bdrv_check               = bdrv_qed_check,
     .bdrv_detach_aio_context  = bdrv_qed_detach_aio_context,
     .bdrv_attach_aio_context  = bdrv_qed_attach_aio_context,
+    .bdrv_drain               = bdrv_qed_drain,
 };
 
 static void bdrv_qed_init(void)
diff --git a/block/replication.c b/block/replication.c
index 8bbfc8f870..02aeaaf7d0 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -138,6 +138,9 @@ static void replication_close(BlockDriverState *bs)
     if (s->replication_state == BLOCK_REPLICATION_RUNNING) {
         replication_stop(s->rs, false, NULL);
     }
+    if (s->replication_state == BLOCK_REPLICATION_FAILOVER) {
+        block_job_cancel_sync(s->active_disk->bs->job);
+    }
 
     if (s->mode == REPLICATION_MODE_SECONDARY) {
         g_free(s->top_id);
@@ -319,9 +322,10 @@ static void secondary_do_checkpoint(BDRVReplicationState *s, Error **errp)
     }
 }
 
-static void reopen_backing_file(BDRVReplicationState *s, bool writable,
+static void reopen_backing_file(BlockDriverState *bs, bool writable,
                                 Error **errp)
 {
+    BDRVReplicationState *s = bs->opaque;
     BlockReopenQueue *reopen_queue = NULL;
     int orig_hidden_flags, orig_secondary_flags;
     int new_hidden_flags, new_secondary_flags;
@@ -356,13 +360,15 @@ static void reopen_backing_file(BDRVReplicationState *s, bool writable,
     }
 
     if (reopen_queue) {
-        bdrv_reopen_multiple(reopen_queue, &local_err);
+        bdrv_reopen_multiple(bdrv_get_aio_context(bs),
+                             reopen_queue, &local_err);
         error_propagate(errp, local_err);
     }
 }
 
-static void backup_job_cleanup(BDRVReplicationState *s)
+static void backup_job_cleanup(BlockDriverState *bs)
 {
+    BDRVReplicationState *s = bs->opaque;
     BlockDriverState *top_bs;
 
     top_bs = bdrv_lookup_bs(s->top_id, s->top_id, NULL);
@@ -371,19 +377,20 @@ static void backup_job_cleanup(BDRVReplicationState *s)
     }
     bdrv_op_unblock_all(top_bs, s->blocker);
     error_free(s->blocker);
-    reopen_backing_file(s, false, NULL);
+    reopen_backing_file(bs, false, NULL);
 }
 
 static void backup_job_completed(void *opaque, int ret)
 {
-    BDRVReplicationState *s = opaque;
+    BlockDriverState *bs = opaque;
+    BDRVReplicationState *s = bs->opaque;
 
     if (s->replication_state != BLOCK_REPLICATION_FAILOVER) {
         /* The backup job is cancelled unexpectedly */
         s->error = -EIO;
     }
 
-    backup_job_cleanup(s);
+    backup_job_cleanup(bs);
 }
 
 static bool check_top_bs(BlockDriverState *top_bs, BlockDriverState *bs)
@@ -479,7 +486,7 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
         }
 
         /* reopen the backing file in r/w mode */
-        reopen_backing_file(s, true, &local_err);
+        reopen_backing_file(bs, true, &local_err);
         if (local_err) {
             error_propagate(errp, local_err);
             aio_context_release(aio_context);
@@ -494,7 +501,7 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
         if (!top_bs || !bdrv_is_root_node(top_bs) ||
             !check_top_bs(top_bs, bs)) {
             error_setg(errp, "No top_bs or it is invalid");
-            reopen_backing_file(s, false, NULL);
+            reopen_backing_file(bs, false, NULL);
             aio_context_release(aio_context);
             return;
         }
@@ -504,10 +511,10 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
         backup_start("replication-backup", s->secondary_disk->bs,
                      s->hidden_disk->bs, 0, MIRROR_SYNC_MODE_NONE, NULL, false,
                      BLOCKDEV_ON_ERROR_REPORT, BLOCKDEV_ON_ERROR_REPORT,
-                     backup_job_completed, s, NULL, &local_err);
+                     backup_job_completed, bs, NULL, &local_err);
         if (local_err) {
             error_propagate(errp, local_err);
-            backup_job_cleanup(s);
+            backup_job_cleanup(bs);
             aio_context_release(aio_context);
             return;
         }
diff --git a/block/sheepdog.c b/block/sheepdog.c
index ccbf7e1fa6..1fb917343a 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -641,6 +641,7 @@ static void restart_co_req(void *opaque)
 
 typedef struct SheepdogReqCo {
     int sockfd;
+    BlockDriverState *bs;
     AioContext *aio_context;
     SheepdogReq *hdr;
     void *data;
@@ -701,6 +702,9 @@ out:
 
     srco->ret = ret;
     srco->finished = true;
+    if (srco->bs) {
+        bdrv_wakeup(srco->bs);
+    }
 }
 
 /*
@@ -708,13 +712,14 @@ out:
  *
  * Return 0 on success, -errno in case of error.
  */
-static int do_req(int sockfd, AioContext *aio_context, SheepdogReq *hdr,
+static int do_req(int sockfd, BlockDriverState *bs, SheepdogReq *hdr,
                   void *data, unsigned int *wlen, unsigned int *rlen)
 {
     Coroutine *co;
     SheepdogReqCo srco = {
         .sockfd = sockfd,
-        .aio_context = aio_context,
+        .aio_context = bs ? bdrv_get_aio_context(bs) : qemu_get_aio_context(),
+        .bs = bs,
         .hdr = hdr,
         .data = data,
         .wlen = wlen,
@@ -727,9 +732,14 @@ static int do_req(int sockfd, AioContext *aio_context, SheepdogReq *hdr,
         do_co_req(&srco);
     } else {
         co = qemu_coroutine_create(do_co_req, &srco);
-        qemu_coroutine_enter(co);
-        while (!srco.finished) {
-            aio_poll(aio_context, true);
+        if (bs) {
+            qemu_coroutine_enter(co);
+            BDRV_POLL_WHILE(bs, !srco.finished);
+        } else {
+            qemu_coroutine_enter(co);
+            while (!srco.finished) {
+                aio_poll(qemu_get_aio_context(), true);
+            }
         }
     }
 
@@ -1125,7 +1135,7 @@ static int find_vdi_name(BDRVSheepdogState *s, const char *filename,
     hdr.snapid = snapid;
     hdr.flags = SD_FLAG_CMD_WRITE;
 
-    ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
+    ret = do_req(fd, s->bs, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
     if (ret) {
         error_setg_errno(errp, -ret, "cannot get vdi info");
         goto out;
@@ -1240,7 +1250,7 @@ out:
     qemu_co_mutex_unlock(&s->lock);
 }
 
-static int read_write_object(int fd, AioContext *aio_context, char *buf,
+static int read_write_object(int fd, BlockDriverState *bs, char *buf,
                              uint64_t oid, uint8_t copies,
                              unsigned int datalen, uint64_t offset,
                              bool write, bool create, uint32_t cache_flags)
@@ -1274,7 +1284,7 @@ static int read_write_object(int fd, AioContext *aio_context, char *buf,
     hdr.offset = offset;
     hdr.copies = copies;
 
-    ret = do_req(fd, aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
+    ret = do_req(fd, bs, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
     if (ret) {
         error_report("failed to send a request to the sheep");
         return ret;
@@ -1289,22 +1299,22 @@ static int read_write_object(int fd, AioContext *aio_context, char *buf,
     }
 }
 
-static int read_object(int fd, AioContext *aio_context, char *buf,
+static int read_object(int fd, BlockDriverState *bs, char *buf,
                        uint64_t oid, uint8_t copies,
                        unsigned int datalen, uint64_t offset,
                        uint32_t cache_flags)
 {
-    return read_write_object(fd, aio_context, buf, oid, copies,
+    return read_write_object(fd, bs, buf, oid, copies,
                              datalen, offset, false,
                              false, cache_flags);
 }
 
-static int write_object(int fd, AioContext *aio_context, char *buf,
+static int write_object(int fd, BlockDriverState *bs, char *buf,
                         uint64_t oid, uint8_t copies,
                         unsigned int datalen, uint64_t offset, bool create,
                         uint32_t cache_flags)
 {
-    return read_write_object(fd, aio_context, buf, oid, copies,
+    return read_write_object(fd, bs, buf, oid, copies,
                              datalen, offset, true,
                              create, cache_flags);
 }
@@ -1331,7 +1341,7 @@ static int reload_inode(BDRVSheepdogState *s, uint32_t snapid, const char *tag)
         goto out;
     }
 
-    ret = read_object(fd, s->aio_context, (char *)inode, vid_to_vdi_oid(vid),
+    ret = read_object(fd, s->bs, (char *)inode, vid_to_vdi_oid(vid),
                       s->inode.nr_copies, SD_INODE_HEADER_SIZE, 0,
                       s->cache_flags);
     if (ret < 0) {
@@ -1489,7 +1499,7 @@ static int sd_open(BlockDriverState *bs, QDict *options, int flags,
     }
 
     buf = g_malloc(SD_INODE_SIZE);
-    ret = read_object(fd, s->aio_context, buf, vid_to_vdi_oid(vid),
+    ret = read_object(fd, s->bs, buf, vid_to_vdi_oid(vid),
                       0, SD_INODE_SIZE, 0, s->cache_flags);
 
     closesocket(fd);
@@ -1618,7 +1628,7 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
     hdr.copies = s->inode.nr_copies;
     hdr.block_size_shift = s->inode.block_size_shift;
 
-    ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
+    ret = do_req(fd, NULL, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
 
     closesocket(fd);
 
@@ -1886,7 +1896,7 @@ static int sd_create(const char *filename, QemuOpts *opts,
         hdr.opcode = SD_OP_GET_CLUSTER_DEFAULT;
         hdr.proto_ver = SD_PROTO_VER;
 
-        ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr,
+        ret = do_req(fd, NULL, (SheepdogReq *)&hdr,
                      NULL, &wlen, &rlen);
         closesocket(fd);
         if (ret) {
@@ -1951,7 +1961,7 @@ static void sd_close(BlockDriverState *bs)
     hdr.data_length = wlen;
     hdr.flags = SD_FLAG_CMD_WRITE;
 
-    ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr,
+    ret = do_req(fd, s->bs, (SheepdogReq *)&hdr,
                  s->name, &wlen, &rlen);
 
     closesocket(fd);
@@ -2000,7 +2010,7 @@ static int sd_truncate(BlockDriverState *bs, int64_t offset)
     /* we don't need to update entire object */
     datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
     s->inode.vdi_size = offset;
-    ret = write_object(fd, s->aio_context, (char *)&s->inode,
+    ret = write_object(fd, s->bs, (char *)&s->inode,
                        vid_to_vdi_oid(s->inode.vdi_id), s->inode.nr_copies,
                        datalen, 0, false, s->cache_flags);
     close(fd);
@@ -2070,7 +2080,7 @@ static bool sd_delete(BDRVSheepdogState *s)
         return false;
     }
 
-    ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr,
+    ret = do_req(fd, s->bs, (SheepdogReq *)&hdr,
                  s->name, &wlen, &rlen);
     closesocket(fd);
     if (ret) {
@@ -2126,7 +2136,7 @@ static int sd_create_branch(BDRVSheepdogState *s)
         goto out;
     }
 
-    ret = read_object(fd, s->aio_context, buf, vid_to_vdi_oid(vid),
+    ret = read_object(fd, s->bs, buf, vid_to_vdi_oid(vid),
                       s->inode.nr_copies, SD_INODE_SIZE, 0, s->cache_flags);
 
     closesocket(fd);
@@ -2411,7 +2421,7 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
         goto cleanup;
     }
 
-    ret = write_object(fd, s->aio_context, (char *)&s->inode,
+    ret = write_object(fd, s->bs, (char *)&s->inode,
                        vid_to_vdi_oid(s->inode.vdi_id), s->inode.nr_copies,
                        datalen, 0, false, s->cache_flags);
     if (ret < 0) {
@@ -2426,7 +2436,7 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
         goto cleanup;
     }
 
-    ret = read_object(fd, s->aio_context, (char *)inode,
+    ret = read_object(fd, s->bs, (char *)inode,
                       vid_to_vdi_oid(new_vid), s->inode.nr_copies, datalen, 0,
                       s->cache_flags);
 
@@ -2528,7 +2538,7 @@ static bool remove_objects(BDRVSheepdogState *s)
             i++;
         }
 
-        ret = write_object(fd, s->aio_context,
+        ret = write_object(fd, s->bs,
                            (char *)&inode->data_vdi_id[start_idx],
                            vid_to_vdi_oid(s->inode.vdi_id), inode->nr_copies,
                            (i - start_idx) * sizeof(uint32_t),
@@ -2600,7 +2610,7 @@ static int sd_snapshot_delete(BlockDriverState *bs,
         return -1;
     }
 
-    ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr,
+    ret = do_req(fd, s->bs, (SheepdogReq *)&hdr,
                  buf, &wlen, &rlen);
     closesocket(fd);
     if (ret) {
@@ -2652,8 +2662,7 @@ static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
     req.opcode = SD_OP_READ_VDIS;
     req.data_length = max;
 
-    ret = do_req(fd, s->aio_context, &req,
-                 vdi_inuse, &wlen, &rlen);
+    ret = do_req(fd, s->bs, &req, vdi_inuse, &wlen, &rlen);
 
     closesocket(fd);
     if (ret) {
@@ -2679,7 +2688,7 @@ static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
         }
 
         /* we don't need to read entire object */
-        ret = read_object(fd, s->aio_context, (char *)&inode,
+        ret = read_object(fd, s->bs, (char *)&inode,
                           vid_to_vdi_oid(vid),
                           0, SD_INODE_SIZE - sizeof(inode.data_vdi_id), 0,
                           s->cache_flags);
@@ -2745,11 +2754,11 @@ static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
 
         create = (offset == 0);
         if (load) {
-            ret = read_object(fd, s->aio_context, (char *)data, vmstate_oid,
+            ret = read_object(fd, s->bs, (char *)data, vmstate_oid,
                               s->inode.nr_copies, data_len, offset,
                               s->cache_flags);
         } else {
-            ret = write_object(fd, s->aio_context, (char *)data, vmstate_oid,
+            ret = write_object(fd, s->bs, (char *)data, vmstate_oid,
                                s->inode.nr_copies, data_len, offset, create,
                                s->cache_flags);
         }
diff --git a/blockjob.c b/blockjob.c
index 43fecbe13e..7c88b30074 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -74,17 +74,6 @@ BlockJob *block_job_get(const char *id)
     return NULL;
 }
 
-/* Normally the job runs in its BlockBackend's AioContext.  The exception is
- * block_job_defer_to_main_loop() where it runs in the QEMU main loop.  Code
- * that supports both cases uses this helper function.
- */
-static AioContext *block_job_get_aio_context(BlockJob *job)
-{
-    return job->deferred_to_main_loop ?
-           qemu_get_aio_context() :
-           blk_get_aio_context(job->blk);
-}
-
 static void block_job_attached_aio_context(AioContext *new_context,
                                            void *opaque)
 {
@@ -97,6 +86,17 @@ static void block_job_attached_aio_context(AioContext *new_context,
     block_job_resume(job);
 }
 
+static void block_job_drain(BlockJob *job)
+{
+    /* If job is !job->busy this kicks it into the next pause point. */
+    block_job_enter(job);
+
+    blk_drain(job->blk);
+    if (job->driver->drain) {
+        job->driver->drain(job);
+    }
+}
+
 static void block_job_detach_aio_context(void *opaque)
 {
     BlockJob *job = opaque;
@@ -106,12 +106,8 @@ static void block_job_detach_aio_context(void *opaque)
 
     block_job_pause(job);
 
-    if (!job->paused) {
-        /* If job is !job->busy this kicks it into the next pause point. */
-        block_job_enter(job);
-    }
     while (!job->paused && !job->completed) {
-        aio_poll(block_job_get_aio_context(job), true);
+        block_job_drain(job);
     }
 
     block_job_unref(job);
@@ -413,14 +409,21 @@ static int block_job_finish_sync(BlockJob *job,
     assert(blk_bs(job->blk)->job == job);
 
     block_job_ref(job);
+
     finish(job, &local_err);
     if (local_err) {
         error_propagate(errp, local_err);
         block_job_unref(job);
         return -EBUSY;
     }
+    /* block_job_drain calls block_job_enter, and it should be enough to
+     * induce progress until the job completes or moves to the main thread.
+     */
+    while (!job->deferred_to_main_loop && !job->completed) {
+        block_job_drain(job);
+    }
     while (!job->completed) {
-        aio_poll(block_job_get_aio_context(job), true);
+        aio_poll(qemu_get_aio_context(), true);
     }
     ret = (job->cancelled && job->ret == 0) ? -ECANCELED : job->ret;
     block_job_unref(job);
diff --git a/configure b/configure
index f83cdf8876..6b7acb1ccb 100755
--- a/configure
+++ b/configure
@@ -230,6 +230,7 @@ vhost_net="no"
 vhost_scsi="no"
 vhost_vsock="no"
 kvm="no"
+colo="yes"
 rdma=""
 gprof="no"
 debug_tcg="no"
@@ -918,6 +919,10 @@ for opt do
   ;;
   --enable-kvm) kvm="yes"
   ;;
+  --disable-colo) colo="no"
+  ;;
+  --enable-colo) colo="yes"
+  ;;
   --disable-tcg-interpreter) tcg_interpreter="no"
   ;;
   --enable-tcg-interpreter) tcg_interpreter="yes"
@@ -1366,6 +1371,7 @@ disabled with --disable-FEATURE, default is enabled if available:
   fdt             fdt device tree
   bluez           bluez stack connectivity
   kvm             KVM acceleration support
+  colo            COarse-grain LOck-stepping VM for Non-stop Service
   rdma            RDMA-based migration support
   vde             support for vde network
   netmap          support for netmap network
@@ -5004,6 +5010,7 @@ echo "Linux AIO support $linux_aio"
 echo "ATTR/XATTR support $attr"
 echo "Install blobs     $blobs"
 echo "KVM support       $kvm"
+echo "COLO support      $colo"
 echo "RDMA support      $rdma"
 echo "TCG interpreter   $tcg_interpreter"
 echo "fdt support       $fdt"
@@ -5639,6 +5646,10 @@ if have_backend "syslog"; then
 fi
 echo "CONFIG_TRACE_FILE=$trace_file" >> $config_host_mak
 
+if test "$colo" = "yes"; then
+  echo "CONFIG_COLO=y" >> $config_host_mak
+fi
+
 if test "$rdma" = "yes" ; then
   echo "CONFIG_RDMA=y" >> $config_host_mak
 fi
diff --git a/cputlb.c b/cputlb.c
index cc4da4d7eb..813279f3bc 100644
--- a/cputlb.c
+++ b/cputlb.c
@@ -26,7 +26,6 @@
 #include "exec/cputlb.h"
 #include "exec/memory-internal.h"
 #include "exec/ram_addr.h"
-#include "exec/exec-all.h"
 #include "tcg/tcg.h"
 #include "qemu/error-report.h"
 #include "exec/log.h"
diff --git a/docs/COLO-FT.txt b/docs/COLO-FT.txt
new file mode 100644
index 0000000000..628293824c
--- /dev/null
+++ b/docs/COLO-FT.txt
@@ -0,0 +1,189 @@
+COarse-grained LOck-stepping Virtual Machines for Non-stop Service
+----------------------------------------
+Copyright (c) 2016 Intel Corporation
+Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+Copyright (c) 2016 Fujitsu, Corp.
+
+This work is licensed under the terms of the GNU GPL, version 2 or later.
+See the COPYING file in the top-level directory.
+
+This document gives an overview of COLO's design and how to use it.
+
+== Background ==
+Virtual machine (VM) replication is a well known technique for providing
+application-agnostic software-implemented hardware fault tolerance,
+also known as "non-stop service".
+
+COLO (COarse-grained LOck-stepping) is a high availability solution.
+Both primary VM (PVM) and secondary VM (SVM) run in parallel. They receive the
+same request from client, and generate response in parallel too.
+If the response packets from PVM and SVM are identical, they are released
+immediately. Otherwise, a VM checkpoint (on demand) is conducted.
+
+== Architecture ==
+
+The architecture of COLO is shown in the diagram below.
+It consists of a pair of networked physical nodes:
+The primary node running the PVM, and the secondary node running the SVM
+to maintain a valid replica of the PVM.
+PVM and SVM execute in parallel and generate output of response packets for
+client requests according to the application semantics.
+
+The incoming packets from the client or external network are received by the
+primary node, and then forwarded to the secondary node, so that both the PVM
+and the SVM are stimulated with the same requests.
+
+COLO receives the outbound packets from both the PVM and SVM and compares them
+before allowing the output to be sent to clients.
+
+The SVM is qualified as a valid replica of the PVM, as long as it generates
+identical responses to all client requests. Once the differences in the outputs
+are detected between the PVM and SVM, COLO withholds transmission of the
+outbound packets until it has successfully synchronized the PVM state to the SVM.
+
+   Primary Node                                                            Secondary Node
+ +------------+  +-----------------------+       +------------------------+  +------------+
+ |            |  |       HeartBeat       |<----->|       HeartBeat        |  |            |
+ | Primary VM |  +-----------|-----------+       +-----------|------------+  |Secondary VM|
+ |            |              |                               |               |            |
+ |            |  +-----------|-----------+       +-----------|------------+  |            |
+ |            |  |QEMU   +---v----+      |       |QEMU  +----v---+        |  |            |
+ |            |  |       |Failover|      |       |      |Failover|        |  |            |
+ |            |  |       +--------+      |       |      +--------+        |  |            |
+ |            |  |   +---------------+   |       |   +---------------+    |  |            |
+ |            |  |   | VM Checkpoint |-------------->| VM Checkpoint |    |  |            |
+ |            |  |   +---------------+   |       |   +---------------+    |  |            |
+ |            |  |                       |       |                        |  |            |
+ |Requests<---------------------------^------------------------------------------>Requests|
+ |Responses----------------------\ /--|--------------\  /------------------------Responses|
+ |            |  |               | |  |  |       |   |  |                 |  |            |
+ |            |  | +-----------+ | |  |  |       |   |  |  +------------+ |  |            |
+ |            |  | | COLO disk | | |  |  |       |   |  |  | COLO disk  | |  |            |
+ |            |  | |   Manager |-|-|--|--------------|--|->| Manager    | |  |            |
+ |            |  | +|----------+ | |  |  |       |   |  |  +-----------|+ |  |            |
+ |            |  |  |            | |  |  |       |   |  |              |  |  |            |
+ +------------+  +--|------------|-|--|--+       +---|--|--------------|--+  +------------+
+                    |            | |  |              |  |              |
+ +-------------+    | +----------v-v--|--+       +---|--v-----------+  |    +-------------+
+ |  VM Monitor |    | |  COLO Proxy      |       |    COLO Proxy    |  |    | VM Monitor  |
+ |             |    | |(compare packet)  |       | (adjust sequence)|  |    |             |
+ +-------------+    | +----------|----^--+       +------------------+  |    +-------------+
+                    |            |    |                                |
+ +------------------|------------|----|--+       +---------------------|------------------+
+ |   Kernel         |            |    |  |       |   Kernel            |                  |
+ +------------------|------------|----|--+       +---------------------|------------------+
+                    |            |    |                                |
+     +--------------v+  +--------v----|--+       +------------------+ +v-------------+
+     |   Storage     |  |External Network|       | External Network | |   Storage    |
+     +---------------+  +----------------+       +------------------+ +--------------+
+
+== Components introduction ==
+
+You can see there are several components in COLO's diagram of architecture.
+Their functions are described below.
+
+HeartBeat:
+Runs on both the primary and secondary nodes, to periodically check platform
+availability. When the primary node suffers a hardware fail-stop failure,
+the heartbeat stops responding, and the secondary node will trigger a failover
+as soon as it determines the absence.
+
+COLO disk Manager:
+When primary VM writes data into image, the colo disk manager captures this data
+and sends it to secondary VM's image, which makes sure the content of secondary
+VM's image is consistent with the content of primary VM's image.
+For more details, please refer to docs/block-replication.txt.
+
+Checkpoint/Failover Controller:
+Modifications of save/restore flow to realize continuous migration,
+to make sure the state of VM in Secondary side is always consistent with VM in
+Primary side.
+
+COLO Proxy:
+Delivers packets to Primary and Secondary, and then compares the responses from
+both sides. Then decides whether to start a checkpoint according to some rules.
+Please refer to docs/colo-proxy.txt for more information.
+
+Note:
+HeartBeat has not been implemented yet, so you need to trigger failover process
+by using 'x-colo-lost-heartbeat' command.
+
+== Test procedure ==
+1. Startup qemu
+Primary:
+# qemu-kvm -enable-kvm -m 2048 -smp 2 -qmp stdio -vnc :7 -name primary \
+  -device piix3-usb-uhci \
+  -device usb-tablet -netdev tap,id=hn0,vhost=off \
+  -device virtio-net-pci,id=net-pci0,netdev=hn0 \
+  -drive if=virtio,id=primary-disk0,driver=quorum,read-pattern=fifo,vote-threshold=1,\
+         children.0.file.filename=1.raw,\
+         children.0.driver=raw -S
+Secondary:
+# qemu-kvm -enable-kvm -m 2048 -smp 2 -qmp stdio -vnc :7 -name secondary \
+  -device piix3-usb-uhci \
+  -device usb-tablet -netdev tap,id=hn0,vhost=off \
+  -device virtio-net-pci,id=net-pci0,netdev=hn0 \
+  -drive if=none,id=secondary-disk0,file.filename=1.raw,driver=raw,node-name=node0 \
+  -drive if=virtio,id=active-disk0,driver=replication,mode=secondary,\
+         file.driver=qcow2,top-id=active-disk0,\
+         file.file.filename=/mnt/ramfs/active_disk.img,\
+         file.backing.driver=qcow2,\
+         file.backing.file.filename=/mnt/ramfs/hidden_disk.img,\
+         file.backing.backing=secondary-disk0 \
+  -incoming tcp:0:8888
+
+2. On Secondary VM's QEMU monitor, issue command
+{'execute':'qmp_capabilities'}
+{ 'execute': 'nbd-server-start',
+  'arguments': {'addr': {'type': 'inet', 'data': {'host': 'xx.xx.xx.xx', 'port': '8889'} } }
+}
+{'execute': 'nbd-server-add', 'arguments': {'device': 'secondary-disk0', 'writable': true } }
+
+Note:
+  a. The qmp command nbd-server-start and nbd-server-add must be run
+     before running the qmp command migrate on primary QEMU
+  b. Active disk, hidden disk and nbd target's length should be the
+     same.
+  c. It is better to put active disk and hidden disk in ramdisk.
+
+3. On Primary VM's QEMU monitor, issue command:
+{'execute':'qmp_capabilities'}
+{ 'execute': 'human-monitor-command',
+  'arguments': {'command-line': 'drive_add -n buddy driver=replication,mode=primary,file.driver=nbd,file.host=xx.xx.xx.xx,file.port=8889,file.export=secondary-disk0,node-name=nbd_client0'}}
+{ 'execute':'x-blockdev-change', 'arguments':{'parent': 'primary-disk0', 'node': 'nbd_client0' } }
+{ 'execute': 'migrate-set-capabilities',
+      'arguments': {'capabilities': [ {'capability': 'x-colo', 'state': true } ] } }
+{ 'execute': 'migrate', 'arguments': {'uri': 'tcp:xx.xx.xx.xx:8888' } }
+
+  Note:
+  a. There should be only one NBD Client for each primary disk.
+  b. xx.xx.xx.xx is the secondary physical machine's hostname or IP
+  c. The qmp command line must be run after running qmp command line in
+     secondary qemu.
+
+4. After the above steps, you will see, whenever you make changes to PVM, SVM will be synced.
+You can issue command '{ "execute": "migrate-set-parameters" , "arguments":{ "x-checkpoint-delay": 2000 } }'
+to change the checkpoint period time
+
+5. Failover test
+You can kill Primary VM and run 'x_colo_lost_heartbeat' in Secondary VM's
+monitor at the same time, then SVM will failover and client will not detect this
+change.
+
+Before issuing '{ "execute": "x-colo-lost-heartbeat" }' command, we have to
+issue block related command to stop block replication.
+Primary:
+  Remove the nbd child from the quorum:
+  { 'execute': 'x-blockdev-change', 'arguments': {'parent': 'colo-disk0', 'child': 'children.1'}}
+  { 'execute': 'human-monitor-command','arguments': {'command-line': 'drive_del blk-buddy0'}}
+  Note: there is no qmp command to remove the blockdev now
+
+Secondary:
+  The primary host is down, so we should do the following thing:
+  { 'execute': 'nbd-server-stop' }
+
+== TODO ==
+1. Support continuous VM replication.
+2. Support shared storage.
+3. Develop the heartbeat part.
+4. Reduce checkpoint VM’s downtime while doing checkpoint.
diff --git a/docs/multiple-iothreads.txt b/docs/multiple-iothreads.txt
index 40b8419916..0e7cdb2c28 100644
--- a/docs/multiple-iothreads.txt
+++ b/docs/multiple-iothreads.txt
@@ -105,13 +105,10 @@ a BH in the target AioContext beforehand and then call qemu_bh_schedule().  No
 acquire/release or locking is needed for the qemu_bh_schedule() call.  But be
 sure to acquire the AioContext for aio_bh_new() if necessary.
 
-The relationship between AioContext and the block layer
--------------------------------------------------------
-The AioContext originates from the QEMU block layer because it provides a
-scoped way of running event loop iterations until all work is done.  This
-feature is used to complete all in-flight block I/O requests (see
-bdrv_drain_all()).  Nowadays AioContext is a generic event loop that can be
-used by any QEMU subsystem.
+AioContext and the block layer
+------------------------------
+The AioContext originates from the QEMU block layer, even though nowadays
+AioContext is a generic event loop that can be used by any QEMU subsystem.
 
 The block layer has support for AioContext integrated.  Each BlockDriverState
 is associated with an AioContext using bdrv_set_aio_context() and
@@ -122,13 +119,22 @@ Block layer code must therefore expect to run in an IOThread and avoid using
 old APIs that implicitly use the main loop.  See the "How to program for
 IOThreads" above for information on how to do that.
 
-If main loop code such as a QMP function wishes to access a BlockDriverState it
-must first call aio_context_acquire(bdrv_get_aio_context(bs)) to ensure the
-IOThread does not run in parallel.
-
-Long-running jobs (usually in the form of coroutines) are best scheduled in the
-BlockDriverState's AioContext to avoid the need to acquire/release around each
-bdrv_*() call.  Be aware that there is currently no mechanism to get notified
-when bdrv_set_aio_context() moves this BlockDriverState to a different
-AioContext (see bdrv_detach_aio_context()/bdrv_attach_aio_context()), so you
-may need to add this if you want to support long-running jobs.
+If main loop code such as a QMP function wishes to access a BlockDriverState
+it must first call aio_context_acquire(bdrv_get_aio_context(bs)) to ensure
+that callbacks in the IOThread do not run in parallel.
+
+Code running in the monitor typically needs to ensure that past
+requests from the guest are completed.  When a block device is running
+in an IOThread, the IOThread can also process requests from the guest
+(via ioeventfd).  To achieve both objectives, wrap the code between
+bdrv_drained_begin() and bdrv_drained_end(), thus creating a "drained
+section".  The functions must be called between aio_context_acquire()
+and aio_context_release().  You can freely release and re-acquire the
+AioContext within a drained section.
+
+Long-running jobs (usually in the form of coroutines) are best scheduled in
+the BlockDriverState's AioContext to avoid the need to acquire/release around
+each bdrv_*() call.  The functions bdrv_add/remove_aio_context_notifier,
+or alternatively blk_add/remove_aio_context_notifier if you use BlockBackends,
+can be used to get a notification whenever bdrv_set_aio_context() moves a
+BlockDriverState to a different AioContext.
diff --git a/docs/qmp-commands.txt b/docs/qmp-commands.txt
index 284576d795..a4732a570c 100644
--- a/docs/qmp-commands.txt
+++ b/docs/qmp-commands.txt
@@ -554,6 +554,16 @@ Example:
 -> { "execute": "migrate_set_downtime", "arguments": { "value": 0.1 } }
 <- { "return": {} }
 
+x-colo-lost-heartbeat
+---------------------
+
+Tell COLO that heartbeat is lost, a failover or takeover is needed.
+
+Example:
+
+-> { "execute": "x-colo-lost-heartbeat" }
+<- { "return": {} }
+
 client_migrate_info
 -------------------
 
@@ -2861,6 +2871,7 @@ Enable/Disable migration capabilities
 - "compress": use multiple compression threads to accelerate live migration
 - "events": generate events for each migration state change
 - "postcopy-ram": postcopy mode for live migration
+- "x-colo": COarse-Grain LOck Stepping (COLO) for Non-stop Service
 
 Arguments:
 
@@ -2882,6 +2893,7 @@ Query current migration capabilities
          - "compress": Multiple compression threads state (json-bool)
          - "events": Migration state change event state (json-bool)
          - "postcopy-ram": postcopy ram state (json-bool)
+         - "x-colo": COarse-Grain LOck Stepping for Non-stop Service (json-bool)
 
 Arguments:
 
@@ -2895,7 +2907,8 @@ Example:
      {"state": false, "capability": "zero-blocks"},
      {"state": false, "capability": "compress"},
      {"state": true, "capability": "events"},
-     {"state": false, "capability": "postcopy-ram"}
+     {"state": false, "capability": "postcopy-ram"},
+     {"state": false, "capability": "x-colo"}
    ]}
 
 migrate-set-parameters
@@ -2913,6 +2926,8 @@ Set migration parameters
 - "max-bandwidth": set maximum speed for migrations (in bytes/sec) (json-int)
 - "downtime-limit": set maximum tolerated downtime (in milliseconds) for
                     migrations (json-int)
+- "x-checkpoint-delay": set the delay time for periodic checkpoint (json-int)
+
 Arguments:
 
 Example:
diff --git a/gdbstub.c b/gdbstub.c
index b2e1b79ca3..de62d26096 100644
--- a/gdbstub.c
+++ b/gdbstub.c
@@ -31,7 +31,6 @@
 
 #define MAX_PACKET_LENGTH 4096
 
-#include "cpu.h"
 #include "qemu/sockets.h"
 #include "sysemu/kvm.h"
 #include "exec/semihost.h"
diff --git a/hmp-commands.hx b/hmp-commands.hx
index 06bef470b9..88192817b2 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -1040,6 +1040,21 @@ migration (or once already in postcopy).
 ETEXI
 
     {
+        .name       = "x_colo_lost_heartbeat",
+        .args_type  = "",
+        .params     = "",
+        .help       = "Tell COLO that heartbeat is lost,\n\t\t\t"
+                      "a failover or takeover is needed.",
+        .cmd = hmp_x_colo_lost_heartbeat,
+    },
+
+STEXI
+@item x_colo_lost_heartbeat
+@findex x_colo_lost_heartbeat
+Tell COLO that heartbeat is lost, a failover or takeover is needed.
+ETEXI
+
+    {
         .name       = "client_migrate_info",
         .args_type  = "protocol:s,hostname:s,port:i?,tls-port:i?,cert-subject:s?",
         .params     = "protocol hostname port tls-port cert-subject",
diff --git a/hmp.c b/hmp.c
index 3d602594a2..00af4230bf 100644
--- a/hmp.c
+++ b/hmp.c
@@ -318,6 +318,9 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict)
         monitor_printf(mon, " %s: %" PRId64 " milliseconds",
             MigrationParameter_lookup[MIGRATION_PARAMETER_DOWNTIME_LIMIT],
             params->downtime_limit);
+        monitor_printf(mon, " %s: %" PRId64,
+            MigrationParameter_lookup[MIGRATION_PARAMETER_X_CHECKPOINT_DELAY],
+            params->x_checkpoint_delay);
         monitor_printf(mon, "\n");
     }
 
@@ -1386,6 +1389,10 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict)
                 p.has_downtime_limit = true;
                 use_int_value = true;
                 break;
+            case MIGRATION_PARAMETER_X_CHECKPOINT_DELAY:
+                p.has_x_checkpoint_delay = true;
+                use_int_value = true;
+                break;
             }
 
             if (use_int_value) {
@@ -1402,6 +1409,7 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict)
                 p.cpu_throttle_initial = valueint;
                 p.cpu_throttle_increment = valueint;
                 p.downtime_limit = valueint;
+                p.x_checkpoint_delay = valueint;
             }
 
             qmp_migrate_set_parameters(&p, &err);
@@ -1443,6 +1451,14 @@ void hmp_migrate_start_postcopy(Monitor *mon, const QDict *qdict)
     hmp_handle_error(mon, &err);
 }
 
+void hmp_x_colo_lost_heartbeat(Monitor *mon, const QDict *qdict)
+{
+    Error *err = NULL;
+
+    qmp_x_colo_lost_heartbeat(&err);
+    hmp_handle_error(mon, &err);
+}
+
 void hmp_set_password(Monitor *mon, const QDict *qdict)
 {
     const char *protocol  = qdict_get_str(qdict, "protocol");
diff --git a/hmp.h b/hmp.h
index 184769c13f..05daf7cd5c 100644
--- a/hmp.h
+++ b/hmp.h
@@ -72,6 +72,7 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict);
 void hmp_migrate_set_cache_size(Monitor *mon, const QDict *qdict);
 void hmp_client_migrate_info(Monitor *mon, const QDict *qdict);
 void hmp_migrate_start_postcopy(Monitor *mon, const QDict *qdict);
+void hmp_x_colo_lost_heartbeat(Monitor *mon, const QDict *qdict);
 void hmp_set_password(Monitor *mon, const QDict *qdict);
 void hmp_expire_password(Monitor *mon, const QDict *qdict);
 void hmp_eject(Monitor *mon, const QDict *qdict);
diff --git a/hw/arm/cubieboard.c b/hw/arm/cubieboard.c
index fbd78ed01c..dd19ba3c99 100644
--- a/hw/arm/cubieboard.c
+++ b/hw/arm/cubieboard.c
@@ -74,6 +74,7 @@ static void cubieboard_init(MachineState *machine)
     cubieboard_binfo.ram_size = machine->ram_size;
     cubieboard_binfo.kernel_filename = machine->kernel_filename;
     cubieboard_binfo.kernel_cmdline = machine->kernel_cmdline;
+    cubieboard_binfo.initrd_filename = machine->initrd_filename;
     arm_load_kernel(&s->a10->cpu, &cubieboard_binfo);
 }
 
diff --git a/hw/arm/pxa2xx.c b/hw/arm/pxa2xx.c
index 42cdde0478..21ea1d6210 100644
--- a/hw/arm/pxa2xx.c
+++ b/hw/arm/pxa2xx.c
@@ -2267,7 +2267,9 @@ PXA2xxState *pxa255_init(MemoryRegion *address_space, unsigned int sdram_size)
                     qdev_get_gpio_in(s->pic, PXA2XX_PIC_LCD));
 
     s->cm_base = 0x41300000;
-    s->cm_regs[CCCR >> 2] = 0x02000210;	/* 416.0 MHz */
+    s->cm_regs[CCCR >> 2] = 0x00000121;         /* from datasheet */
+    s->cm_regs[CKEN >> 2] = 0x00017def;         /* from datasheet */
+
     s->clkcfg = 0x00000009;		/* Turbo mode active */
     memory_region_init_io(&s->cm_iomem, NULL, &pxa2xx_cm_ops, s, "pxa2xx-cm", 0x1000);
     memory_region_add_subregion(address_space, s->cm_base, &s->cm_iomem);
diff --git a/hw/arm/spitz.c b/hw/arm/spitz.c
index 41cc2eeeb1..949a15ae64 100644
--- a/hw/arm/spitz.c
+++ b/hw/arm/spitz.c
@@ -29,6 +29,7 @@
 #include "sysemu/block-backend.h"
 #include "hw/sysbus.h"
 #include "exec/address-spaces.h"
+#include "sysemu/sysemu.h"
 
 #undef REG_FMT
 #define REG_FMT			"0x%02lx"
@@ -844,9 +845,18 @@ static void spitz_lcd_hsync_handler(void *opaque, int line, int level)
     spitz_hsync ^= 1;
 }
 
+static void spitz_reset(void *opaque, int line, int level)
+{
+    if (level) {
+        qemu_system_reset_request();
+    }
+}
+
 static void spitz_gpio_setup(PXA2xxState *cpu, int slots)
 {
     qemu_irq lcd_hsync;
+    qemu_irq reset;
+
     /*
      * Bad hack: We toggle the LCD hsync GPIO on every GPIO status
      * read to satisfy broken guests that poll-wait for hsync.
@@ -867,7 +877,8 @@ static void spitz_gpio_setup(PXA2xxState *cpu, int slots)
     qemu_irq_raise(qdev_get_gpio_in(cpu->gpio, SPITZ_GPIO_BAT_COVER));
 
     /* Handle reset */
-    qdev_connect_gpio_out(cpu->gpio, SPITZ_GPIO_ON_RESET, cpu->reset);
+    reset = qemu_allocate_irq(spitz_reset, cpu, 0);
+    qdev_connect_gpio_out(cpu->gpio, SPITZ_GPIO_ON_RESET, reset);
 
     /* PCMCIA signals: card's IRQ and Card-Detect */
     if (slots >= 1)
diff --git a/hw/arm/tosa.c b/hw/arm/tosa.c
index 2db66508b5..1ee12f49b3 100644
--- a/hw/arm/tosa.c
+++ b/hw/arm/tosa.c
@@ -25,6 +25,7 @@
 #include "sysemu/block-backend.h"
 #include "hw/sysbus.h"
 #include "exec/address-spaces.h"
+#include "sysemu/sysemu.h"
 
 #define TOSA_RAM    0x04000000
 #define TOSA_ROM	0x00800000
@@ -86,6 +87,12 @@ static void tosa_out_switch(void *opaque, int line, int level)
     }
 }
 
+static void tosa_reset(void *opaque, int line, int level)
+{
+    if (level) {
+        qemu_system_reset_request();
+    }
+}
 
 static void tosa_gpio_setup(PXA2xxState *cpu,
                 DeviceState *scp0,
@@ -93,13 +100,16 @@ static void tosa_gpio_setup(PXA2xxState *cpu,
                 TC6393xbState *tmio)
 {
     qemu_irq *outsignals = qemu_allocate_irqs(tosa_out_switch, cpu, 4);
+    qemu_irq reset;
+
     /* MMC/SD host */
     pxa2xx_mmci_handlers(cpu->mmc,
                     qdev_get_gpio_in(scp0, TOSA_GPIO_SD_WP),
                     qemu_irq_invert(qdev_get_gpio_in(cpu->gpio, TOSA_GPIO_nSD_DETECT)));
 
     /* Handle reset */
-    qdev_connect_gpio_out(cpu->gpio, TOSA_GPIO_ON_RESET, cpu->reset);
+    reset = qemu_allocate_irq(tosa_reset, cpu, 0);
+    qdev_connect_gpio_out(cpu->gpio, TOSA_GPIO_ON_RESET, reset);
 
     /* PCMCIA signals: card's IRQ and Card-Detect */
     pxa2xx_pcmcia_set_irq_cb(cpu->pcmcia[0],
diff --git a/hw/arm/versatilepb.c b/hw/arm/versatilepb.c
index 8ae5392bcc..7b5cb36d5a 100644
--- a/hw/arm/versatilepb.c
+++ b/hw/arm/versatilepb.c
@@ -198,6 +198,15 @@ static void versatile_init(MachineState *machine, int board_id)
     int done_smc = 0;
     DriveInfo *dinfo;
 
+    if (machine->ram_size > 0x10000000) {
+        /* Device starting at address 0x10000000,
+         * and memory cannot overlap with devices.
+         * Refuse to run rather than behaving very confusingly.
+         */
+        error_report("versatilepb: memory size must not exceed 256MB");
+        exit(1);
+    }
+
     if (!machine->cpu_model) {
         machine->cpu_model = "arm926";
     }
diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c
index 5fc10df08e..f953610018 100644
--- a/hw/arm/virt-acpi-build.c
+++ b/hw/arm/virt-acpi-build.c
@@ -594,7 +594,7 @@ build_madt(GArray *table_data, BIOSLinker *linker, VirtGuestInfo *guest_info)
         gicc->uid = i;
         gicc->flags = cpu_to_le32(ACPI_GICC_ENABLED);
 
-        if (armcpu->has_pmu) {
+        if (arm_feature(&armcpu->env, ARM_FEATURE_PMU)) {
             gicc->performance_interrupt = cpu_to_le32(PPI(VIRTUAL_PMU_IRQ));
         }
     }
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 070bbf89d4..54a8b28a58 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -85,6 +85,7 @@ typedef struct {
     VirtBoardInfo *daughterboard;
     bool disallow_affinity_adjustment;
     bool no_its;
+    bool no_pmu;
 } VirtMachineClass;
 
 typedef struct {
@@ -490,7 +491,7 @@ static void fdt_add_pmu_nodes(const VirtBoardInfo *vbi, int gictype)
 
     CPU_FOREACH(cpu) {
         armcpu = ARM_CPU(cpu);
-        if (!armcpu->has_pmu ||
+        if (!arm_feature(&armcpu->env, ARM_FEATURE_PMU) ||
             !kvm_arm_pmu_create(cpu, PPI(VIRTUAL_PMU_IRQ))) {
             return;
         }
@@ -1353,6 +1354,10 @@ static void machvirt_init(MachineState *machine)
             }
         }
 
+        if (vmc->no_pmu && object_property_find(cpuobj, "pmu", NULL)) {
+            object_property_set_bool(cpuobj, false, "pmu", NULL);
+        }
+
         if (object_property_find(cpuobj, "reset-cbar", NULL)) {
             object_property_set_int(cpuobj, vbi->memmap[VIRT_CPUPERIPHS].base,
                                     "reset-cbar", &error_abort);
@@ -1592,5 +1597,7 @@ static void virt_machine_2_6_options(MachineClass *mc)
     virt_machine_2_7_options(mc);
     SET_MACHINE_COMPAT(mc, VIRT_COMPAT_2_6);
     vmc->disallow_affinity_adjustment = true;
+    /* Disable PMU for 2.6 as PMU support was first introduced in 2.7 */
+    vmc->no_pmu = true;
 }
 DEFINE_VIRT_MACHINE(2, 6)
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index b380142028..d479fd22f5 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -375,7 +375,7 @@ static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeCmd *cmd)
     if (!cqid || nvme_check_cqid(n, cqid)) {
         return NVME_INVALID_CQID | NVME_DNR;
     }
-    if (!sqid || (sqid && !nvme_check_sqid(n, sqid))) {
+    if (!sqid || !nvme_check_sqid(n, sqid)) {
         return NVME_INVALID_QID | NVME_DNR;
     }
     if (!qsize || qsize > NVME_CAP_MQES(n->bar.cap)) {
@@ -449,7 +449,7 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeCmd *cmd)
     uint16_t qflags = le16_to_cpu(c->cq_flags);
     uint64_t prp1 = le64_to_cpu(c->prp1);
 
-    if (!cqid || (cqid && !nvme_check_cqid(n, cqid))) {
+    if (!cqid || !nvme_check_cqid(n, cqid)) {
         return NVME_INVALID_CQID | NVME_DNR;
     }
     if (!qsize || qsize > NVME_CAP_MQES(n->bar.cap)) {
diff --git a/hw/block/xen_disk.c b/hw/block/xen_disk.c
index 1292a4b459..3a7dc194e2 100644
--- a/hw/block/xen_disk.c
+++ b/hw/block/xen_disk.c
@@ -167,12 +167,12 @@ static void destroy_grant(gpointer pgnt)
     xengnttab_handle *gnt = grant->blkdev->xendev.gnttabdev;
 
     if (xengnttab_unmap(gnt, grant->page, 1) != 0) {
-        xen_be_printf(&grant->blkdev->xendev, 0,
+        xen_pv_printf(&grant->blkdev->xendev, 0,
                       "xengnttab_unmap failed: %s\n",
                       strerror(errno));
     }
     grant->blkdev->persistent_gnt_count--;
-    xen_be_printf(&grant->blkdev->xendev, 3,
+    xen_pv_printf(&grant->blkdev->xendev, 3,
                   "unmapped grant %p\n", grant->page);
     g_free(grant);
 }
@@ -184,11 +184,11 @@ static void remove_persistent_region(gpointer data, gpointer dev)
     xengnttab_handle *gnt = blkdev->xendev.gnttabdev;
 
     if (xengnttab_unmap(gnt, region->addr, region->num) != 0) {
-        xen_be_printf(&blkdev->xendev, 0,
+        xen_pv_printf(&blkdev->xendev, 0,
                       "xengnttab_unmap region %p failed: %s\n",
                       region->addr, strerror(errno));
     }
-    xen_be_printf(&blkdev->xendev, 3,
+    xen_pv_printf(&blkdev->xendev, 3,
                   "unmapped grant region %p with %d pages\n",
                   region->addr, region->num);
     g_free(region);
@@ -255,7 +255,7 @@ static int ioreq_parse(struct ioreq *ioreq)
     size_t len;
     int i;
 
-    xen_be_printf(&blkdev->xendev, 3,
+    xen_pv_printf(&blkdev->xendev, 3,
                   "op %d, nr %d, handle %d, id %" PRId64 ", sector %" PRId64 "\n",
                   ioreq->req.operation, ioreq->req.nr_segments,
                   ioreq->req.handle, ioreq->req.id, ioreq->req.sector_number);
@@ -275,28 +275,28 @@ static int ioreq_parse(struct ioreq *ioreq)
     case BLKIF_OP_DISCARD:
         return 0;
     default:
-        xen_be_printf(&blkdev->xendev, 0, "error: unknown operation (%d)\n",
+        xen_pv_printf(&blkdev->xendev, 0, "error: unknown operation (%d)\n",
                       ioreq->req.operation);
         goto err;
     };
 
     if (ioreq->req.operation != BLKIF_OP_READ && blkdev->mode[0] != 'w') {
-        xen_be_printf(&blkdev->xendev, 0, "error: write req for ro device\n");
+        xen_pv_printf(&blkdev->xendev, 0, "error: write req for ro device\n");
         goto err;
     }
 
     ioreq->start = ioreq->req.sector_number * blkdev->file_blk;
     for (i = 0; i < ioreq->req.nr_segments; i++) {
         if (i == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
-            xen_be_printf(&blkdev->xendev, 0, "error: nr_segments too big\n");
+            xen_pv_printf(&blkdev->xendev, 0, "error: nr_segments too big\n");
             goto err;
         }
         if (ioreq->req.seg[i].first_sect > ioreq->req.seg[i].last_sect) {
-            xen_be_printf(&blkdev->xendev, 0, "error: first > last sector\n");
+            xen_pv_printf(&blkdev->xendev, 0, "error: first > last sector\n");
             goto err;
         }
         if (ioreq->req.seg[i].last_sect * BLOCK_SIZE >= XC_PAGE_SIZE) {
-            xen_be_printf(&blkdev->xendev, 0, "error: page crossing\n");
+            xen_pv_printf(&blkdev->xendev, 0, "error: page crossing\n");
             goto err;
         }
 
@@ -308,7 +308,7 @@ static int ioreq_parse(struct ioreq *ioreq)
         qemu_iovec_add(&ioreq->v, (void*)mem, len);
     }
     if (ioreq->start + ioreq->v.size > blkdev->file_size) {
-        xen_be_printf(&blkdev->xendev, 0, "error: access beyond end of file\n");
+        xen_pv_printf(&blkdev->xendev, 0, "error: access beyond end of file\n");
         goto err;
     }
     return 0;
@@ -331,7 +331,7 @@ static void ioreq_unmap(struct ioreq *ioreq)
             return;
         }
         if (xengnttab_unmap(gnt, ioreq->pages, ioreq->num_unmap) != 0) {
-            xen_be_printf(&ioreq->blkdev->xendev, 0,
+            xen_pv_printf(&ioreq->blkdev->xendev, 0,
                           "xengnttab_unmap failed: %s\n",
                           strerror(errno));
         }
@@ -343,7 +343,7 @@ static void ioreq_unmap(struct ioreq *ioreq)
                 continue;
             }
             if (xengnttab_unmap(gnt, ioreq->page[i], 1) != 0) {
-                xen_be_printf(&ioreq->blkdev->xendev, 0,
+                xen_pv_printf(&ioreq->blkdev->xendev, 0,
                               "xengnttab_unmap failed: %s\n",
                               strerror(errno));
             }
@@ -381,7 +381,7 @@ static int ioreq_map(struct ioreq *ioreq)
 
             if (grant != NULL) {
                 page[i] = grant->page;
-                xen_be_printf(&ioreq->blkdev->xendev, 3,
+                xen_pv_printf(&ioreq->blkdev->xendev, 3,
                               "using persistent-grant %" PRIu32 "\n",
                               ioreq->refs[i]);
             } else {
@@ -410,7 +410,7 @@ static int ioreq_map(struct ioreq *ioreq)
         ioreq->pages = xengnttab_map_grant_refs
             (gnt, new_maps, domids, refs, ioreq->prot);
         if (ioreq->pages == NULL) {
-            xen_be_printf(&ioreq->blkdev->xendev, 0,
+            xen_pv_printf(&ioreq->blkdev->xendev, 0,
                           "can't map %d grant refs (%s, %d maps)\n",
                           new_maps, strerror(errno), ioreq->blkdev->cnt_map);
             return -1;
@@ -426,7 +426,7 @@ static int ioreq_map(struct ioreq *ioreq)
             ioreq->page[i] = xengnttab_map_grant_ref
                 (gnt, domids[i], refs[i], ioreq->prot);
             if (ioreq->page[i] == NULL) {
-                xen_be_printf(&ioreq->blkdev->xendev, 0,
+                xen_pv_printf(&ioreq->blkdev->xendev, 0,
                               "can't map grant ref %d (%s, %d maps)\n",
                               refs[i], strerror(errno), ioreq->blkdev->cnt_map);
                 ioreq->mapped = 1;
@@ -474,7 +474,7 @@ static int ioreq_map(struct ioreq *ioreq)
                 grant->page = ioreq->page[new_maps];
             }
             grant->blkdev = ioreq->blkdev;
-            xen_be_printf(&ioreq->blkdev->xendev, 3,
+            xen_pv_printf(&ioreq->blkdev->xendev, 3,
                           "adding grant %" PRIu32 " page: %p\n",
                           refs[new_maps], grant->page);
             g_tree_insert(ioreq->blkdev->persistent_gnts,
@@ -557,7 +557,7 @@ static int ioreq_grant_copy(struct ioreq *ioreq)
     rc = xengnttab_grant_copy(gnt, count, segs);
 
     if (rc) {
-        xen_be_printf(&ioreq->blkdev->xendev, 0,
+        xen_pv_printf(&ioreq->blkdev->xendev, 0,
                       "failed to copy data %d\n", rc);
         ioreq->aio_errors++;
         return -1;
@@ -565,7 +565,7 @@ static int ioreq_grant_copy(struct ioreq *ioreq)
 
     for (i = 0; i < count; i++) {
         if (segs[i].status != GNTST_okay) {
-            xen_be_printf(&ioreq->blkdev->xendev, 3,
+            xen_pv_printf(&ioreq->blkdev->xendev, 3,
                           "failed to copy data %d for gref %d, domid %d\n",
                           segs[i].status, ioreq->refs[i], ioreq->domids[i]);
             ioreq->aio_errors++;
@@ -599,7 +599,7 @@ static void qemu_aio_complete(void *opaque, int ret)
     struct ioreq *ioreq = opaque;
 
     if (ret != 0) {
-        xen_be_printf(&ioreq->blkdev->xendev, 0, "%s I/O error\n",
+        xen_pv_printf(&ioreq->blkdev->xendev, 0, "%s I/O error\n",
                       ioreq->req.operation == BLKIF_OP_READ ? "read" : "write");
         ioreq->aio_errors++;
     }
@@ -796,7 +796,7 @@ static void blk_send_response_all(struct XenBlkDev *blkdev)
         ioreq_release(ioreq, true);
     }
     if (send_notify) {
-        xen_be_send_notify(&blkdev->xendev);
+        xen_pv_send_notify(&blkdev->xendev);
     }
 }
 
@@ -866,7 +866,7 @@ static void blk_handle_requests(struct XenBlkDev *blkdev)
             };
 
             if (blk_send_response_one(ioreq)) {
-                xen_be_send_notify(&blkdev->xendev);
+                xen_pv_send_notify(&blkdev->xendev);
             }
             ioreq_release(ioreq, false);
             continue;
@@ -910,7 +910,7 @@ static void blk_alloc(struct XenDevice *xendev)
     }
     if (xengnttab_set_max_grants(xendev->gnttabdev,
             MAX_GRANTS(max_requests, BLKIF_MAX_SEGMENTS_PER_REQUEST)) < 0) {
-        xen_be_printf(xendev, 0, "xengnttab_set_max_grants failed: %s\n",
+        xen_pv_printf(xendev, 0, "xengnttab_set_max_grants failed: %s\n",
                       strerror(errno));
     }
 }
@@ -1056,11 +1056,11 @@ static int blk_connect(struct XenDevice *xendev)
         }
 
         /* setup via xenbus -> create new block driver instance */
-        xen_be_printf(&blkdev->xendev, 2, "create new bdrv (xenbus setup)\n");
+        xen_pv_printf(&blkdev->xendev, 2, "create new bdrv (xenbus setup)\n");
         blkdev->blk = blk_new_open(blkdev->filename, NULL, options,
                                    qflags, &local_err);
         if (!blkdev->blk) {
-            xen_be_printf(&blkdev->xendev, 0, "error: %s\n",
+            xen_pv_printf(&blkdev->xendev, 0, "error: %s\n",
                           error_get_pretty(local_err));
             error_free(local_err);
             return -1;
@@ -1068,10 +1068,11 @@ static int blk_connect(struct XenDevice *xendev)
         blk_set_enable_write_cache(blkdev->blk, !writethrough);
     } else {
         /* setup via qemu cmdline -> already setup for us */
-        xen_be_printf(&blkdev->xendev, 2, "get configured bdrv (cmdline setup)\n");
+        xen_pv_printf(&blkdev->xendev, 2,
+                      "get configured bdrv (cmdline setup)\n");
         blkdev->blk = blk_by_legacy_dinfo(blkdev->dinfo);
         if (blk_is_read_only(blkdev->blk) && !readonly) {
-            xen_be_printf(&blkdev->xendev, 0, "Unexpected read-only drive");
+            xen_pv_printf(&blkdev->xendev, 0, "Unexpected read-only drive");
             blkdev->blk = NULL;
             return -1;
         }
@@ -1084,13 +1085,13 @@ static int blk_connect(struct XenDevice *xendev)
     if (blkdev->file_size < 0) {
         BlockDriverState *bs = blk_bs(blkdev->blk);
         const char *drv_name = bs ? bdrv_get_format_name(bs) : NULL;
-        xen_be_printf(&blkdev->xendev, 1, "blk_getlength: %d (%s) | drv %s\n",
+        xen_pv_printf(&blkdev->xendev, 1, "blk_getlength: %d (%s) | drv %s\n",
                       (int)blkdev->file_size, strerror(-blkdev->file_size),
                       drv_name ?: "-");
         blkdev->file_size = 0;
     }
 
-    xen_be_printf(xendev, 1, "type \"%s\", fileproto \"%s\", filename \"%s\","
+    xen_pv_printf(xendev, 1, "type \"%s\", fileproto \"%s\", filename \"%s\","
                   " size %" PRId64 " (%" PRId64 " MB)\n",
                   blkdev->type, blkdev->fileproto, blkdev->filename,
                   blkdev->file_size, blkdev->file_size >> 20);
@@ -1174,10 +1175,10 @@ static int blk_connect(struct XenDevice *xendev)
     blkdev->feature_grant_copy =
                 (xengnttab_grant_copy(blkdev->xendev.gnttabdev, 0, NULL) == 0);
 
-    xen_be_printf(&blkdev->xendev, 3, "grant copy operation %s\n",
+    xen_pv_printf(&blkdev->xendev, 3, "grant copy operation %s\n",
                   blkdev->feature_grant_copy ? "enabled" : "disabled");
 
-    xen_be_printf(&blkdev->xendev, 1, "ok: proto %s, ring-ref %d, "
+    xen_pv_printf(&blkdev->xendev, 1, "ok: proto %s, ring-ref %d, "
                   "remote port %d, local port %d\n",
                   blkdev->xendev.protocol, blkdev->ring_ref,
                   blkdev->xendev.remote_port, blkdev->xendev.local_port);
@@ -1193,7 +1194,7 @@ static void blk_disconnect(struct XenDevice *xendev)
         blk_unref(blkdev->blk);
         blkdev->blk = NULL;
     }
-    xen_be_unbind_evtchn(&blkdev->xendev);
+    xen_pv_unbind_evtchn(&blkdev->xendev);
 
     if (blkdev->sring) {
         xengnttab_unmap(blkdev->xendev.gnttabdev, blkdev->sring, 1);
diff --git a/hw/char/cadence_uart.c b/hw/char/cadence_uart.c
index c2b9154305..def34cd0d2 100644
--- a/hw/char/cadence_uart.c
+++ b/hw/char/cadence_uart.c
@@ -450,7 +450,8 @@ static void cadence_uart_reset(DeviceState *dev)
     s->r[R_IMR] = 0;
     s->r[R_CISR] = 0;
     s->r[R_RTRIG] = 0x00000020;
-    s->r[R_BRGR] = 0x0000000F;
+    s->r[R_BRGR] = 0x0000028B;
+    s->r[R_BDIV] = 0x0000000F;
     s->r[R_TTRIG] = 0x00000020;
 
     uart_rx_reset(s);
diff --git a/hw/char/xen_console.c b/hw/char/xen_console.c
index 86cdc529a3..c01f41090e 100644
--- a/hw/char/xen_console.c
+++ b/hw/char/xen_console.c
@@ -74,7 +74,7 @@ static void buffer_append(struct XenConsole *con)
 
     xen_mb();
     intf->out_cons = cons;
-    xen_be_send_notify(&con->xendev);
+    xen_pv_send_notify(&con->xendev);
 
     if (buffer->max_capacity &&
 	buffer->size > buffer->max_capacity) {
@@ -142,7 +142,7 @@ static void xencons_receive(void *opaque, const uint8_t *buf, int len)
     }
     xen_wmb();
     intf->in_prod = prod;
-    xen_be_send_notify(&con->xendev);
+    xen_pv_send_notify(&con->xendev);
 }
 
 static void xencons_send(struct XenConsole *con)
@@ -158,16 +158,17 @@ static void xencons_send(struct XenConsole *con)
         len = size;
     }
     if (len < 1) {
-	if (!con->backlog) {
-	    con->backlog = 1;
-	    xen_be_printf(&con->xendev, 1, "backlog piling up, nobody listening?\n");
-	}
+        if (!con->backlog) {
+            con->backlog = 1;
+            xen_pv_printf(&con->xendev, 1,
+                          "backlog piling up, nobody listening?\n");
+        }
     } else {
-	buffer_advance(&con->buffer, len);
-	if (con->backlog && len == size) {
-	    con->backlog = 0;
-	    xen_be_printf(&con->xendev, 1, "backlog is gone\n");
-	}
+        buffer_advance(&con->buffer, len);
+        if (con->backlog && len == size) {
+            con->backlog = 0;
+            xen_pv_printf(&con->xendev, 1, "backlog is gone\n");
+        }
     }
 }
 
@@ -191,7 +192,7 @@ static int con_init(struct XenDevice *xendev)
 
     type = xenstore_read_str(con->console, "type");
     if (!type || strcmp(type, "ioemu") != 0) {
-	xen_be_printf(xendev, 1, "not for me (type=%s)\n", type);
+        xen_pv_printf(xendev, 1, "not for me (type=%s)\n", type);
         ret = -1;
         goto out;
     }
@@ -247,7 +248,8 @@ static int con_initialise(struct XenDevice *xendev)
     qemu_chr_fe_set_handlers(&con->chr, xencons_can_receive,
                              xencons_receive, NULL, con, NULL, true);
 
-    xen_be_printf(xendev, 1, "ring mfn %d, remote port %d, local port %d, limit %zd\n",
+    xen_pv_printf(xendev, 1,
+                  "ring mfn %d, remote port %d, local port %d, limit %zd\n",
 		  con->ring_ref,
 		  con->xendev.remote_port,
 		  con->xendev.local_port,
@@ -260,7 +262,7 @@ static void con_disconnect(struct XenDevice *xendev)
     struct XenConsole *con = container_of(xendev, struct XenConsole, xendev);
 
     qemu_chr_fe_deinit(&con->chr);
-    xen_be_unbind_evtchn(&con->xendev);
+    xen_pv_unbind_evtchn(&con->xendev);
 
     if (con->sring) {
         if (!xendev->dev) {
diff --git a/hw/display/milkymist-tmu2.c b/hw/display/milkymist-tmu2.c
index 9c0018448a..5c666f9b24 100644
--- a/hw/display/milkymist-tmu2.c
+++ b/hw/display/milkymist-tmu2.c
@@ -213,7 +213,7 @@ static void tmu2_start(MilkymistTMU2State *s)
     /* Read the QEMU source framebuffer into an OpenGL texture */
     glGenTextures(1, &texture);
     glBindTexture(GL_TEXTURE_2D, texture);
-    fb_len = 2*s->regs[R_TEXHRES]*s->regs[R_TEXVRES];
+    fb_len = 2ULL * s->regs[R_TEXHRES] * s->regs[R_TEXVRES];
     fb = cpu_physical_memory_map(s->regs[R_TEXFBUF], &fb_len, 0);
     if (fb == NULL) {
         glDeleteTextures(1, &texture);
diff --git a/hw/display/xenfb.c b/hw/display/xenfb.c
index 46b7d5eded..7a8727aa21 100644
--- a/hw/display/xenfb.c
+++ b/hw/display/xenfb.c
@@ -90,28 +90,29 @@ static int common_bind(struct common *c)
     xen_pfn_t mfn;
 
     if (xenstore_read_fe_uint64(&c->xendev, "page-ref", &val) == -1)
-	return -1;
+        return -1;
     mfn = (xen_pfn_t)val;
     assert(val == mfn);
 
     if (xenstore_read_fe_int(&c->xendev, "event-channel", &c->xendev.remote_port) == -1)
-	return -1;
+        return -1;
 
     c->page = xenforeignmemory_map(xen_fmem, c->xendev.dom,
                                    PROT_READ | PROT_WRITE, 1, &mfn, NULL);
     if (c->page == NULL)
-	return -1;
+        return -1;
 
     xen_be_bind_evtchn(&c->xendev);
-    xen_be_printf(&c->xendev, 1, "ring mfn %"PRI_xen_pfn", remote-port %d, local-port %d\n",
-		  mfn, c->xendev.remote_port, c->xendev.local_port);
+    xen_pv_printf(&c->xendev, 1,
+                  "ring mfn %"PRI_xen_pfn", remote-port %d, local-port %d\n",
+                  mfn, c->xendev.remote_port, c->xendev.local_port);
 
     return 0;
 }
 
 static void common_unbind(struct common *c)
 {
-    xen_be_unbind_evtchn(&c->xendev);
+    xen_pv_unbind_evtchn(&c->xendev);
     if (c->page) {
         xenforeignmemory_unmap(xen_fmem, c->page, 1);
 	c->page = NULL;
@@ -214,7 +215,7 @@ static int xenfb_kbd_event(struct XenInput *xenfb,
     XENKBD_IN_RING_REF(page, prod) = *event;
     xen_wmb();		/* ensure ring contents visible */
     page->in_prod = prod + 1;
-    return xen_be_send_notify(&xenfb->c.xendev);
+    return xen_pv_send_notify(&xenfb->c.xendev);
 }
 
 /* Send a keyboard (or mouse button) event */
@@ -345,7 +346,7 @@ static int input_initialise(struct XenDevice *xendev)
     int rc;
 
     if (!in->c.con) {
-        xen_be_printf(xendev, 1, "ds not set (yet)\n");
+        xen_pv_printf(xendev, 1, "ds not set (yet)\n");
         return -1;
     }
 
@@ -396,7 +397,7 @@ static void input_event(struct XenDevice *xendev)
     if (page->out_prod == page->out_cons)
 	return;
     page->out_cons = page->out_prod;
-    xen_be_send_notify(&xenfb->c.xendev);
+    xen_pv_send_notify(&xenfb->c.xendev);
 }
 
 /* -------------------------------------------------------------------- */
@@ -500,8 +501,8 @@ out:
 }
 
 static int xenfb_configure_fb(struct XenFB *xenfb, size_t fb_len_lim,
-			      int width, int height, int depth,
-			      size_t fb_len, int offset, int row_stride)
+                              int width, int height, int depth,
+                              size_t fb_len, int offset, int row_stride)
 {
     size_t mfn_sz = sizeof(*((struct xenfb_page *)0)->pd);
     size_t pd_len = sizeof(((struct xenfb_page *)0)->pd) / mfn_sz;
@@ -510,40 +511,47 @@ static int xenfb_configure_fb(struct XenFB *xenfb, size_t fb_len_lim,
     int max_width, max_height;
 
     if (fb_len_lim > fb_len_max) {
-	xen_be_printf(&xenfb->c.xendev, 0, "fb size limit %zu exceeds %zu, corrected\n",
-		      fb_len_lim, fb_len_max);
-	fb_len_lim = fb_len_max;
+        xen_pv_printf(&xenfb->c.xendev, 0,
+                      "fb size limit %zu exceeds %zu, corrected\n",
+                      fb_len_lim, fb_len_max);
+        fb_len_lim = fb_len_max;
     }
     if (fb_len_lim && fb_len > fb_len_lim) {
-	xen_be_printf(&xenfb->c.xendev, 0, "frontend fb size %zu limited to %zu\n",
-		      fb_len, fb_len_lim);
-	fb_len = fb_len_lim;
+        xen_pv_printf(&xenfb->c.xendev, 0,
+                      "frontend fb size %zu limited to %zu\n",
+                      fb_len, fb_len_lim);
+        fb_len = fb_len_lim;
     }
     if (depth != 8 && depth != 16 && depth != 24 && depth != 32) {
-	xen_be_printf(&xenfb->c.xendev, 0, "can't handle frontend fb depth %d\n",
-		      depth);
-	return -1;
+        xen_pv_printf(&xenfb->c.xendev, 0,
+                      "can't handle frontend fb depth %d\n",
+                      depth);
+        return -1;
     }
     if (row_stride <= 0 || row_stride > fb_len) {
-	xen_be_printf(&xenfb->c.xendev, 0, "invalid frontend stride %d\n", row_stride);
-	return -1;
+        xen_pv_printf(&xenfb->c.xendev, 0, "invalid frontend stride %d\n",
+                      row_stride);
+        return -1;
     }
     max_width = row_stride / (depth / 8);
     if (width < 0 || width > max_width) {
-	xen_be_printf(&xenfb->c.xendev, 0, "invalid frontend width %d limited to %d\n",
-		      width, max_width);
-	width = max_width;
+        xen_pv_printf(&xenfb->c.xendev, 0,
+                      "invalid frontend width %d limited to %d\n",
+                      width, max_width);
+        width = max_width;
     }
     if (offset < 0 || offset >= fb_len) {
-	xen_be_printf(&xenfb->c.xendev, 0, "invalid frontend offset %d (max %zu)\n",
-		      offset, fb_len - 1);
-	return -1;
+        xen_pv_printf(&xenfb->c.xendev, 0,
+                      "invalid frontend offset %d (max %zu)\n",
+                      offset, fb_len - 1);
+        return -1;
     }
     max_height = (fb_len - offset) / row_stride;
     if (height < 0 || height > max_height) {
-	xen_be_printf(&xenfb->c.xendev, 0, "invalid frontend height %d limited to %d\n",
-		      height, max_height);
-	height = max_height;
+        xen_pv_printf(&xenfb->c.xendev, 0,
+                      "invalid frontend height %d limited to %d\n",
+                      height, max_height);
+        height = max_height;
     }
     xenfb->fb_len = fb_len;
     xenfb->row_stride = row_stride;
@@ -553,8 +561,9 @@ static int xenfb_configure_fb(struct XenFB *xenfb, size_t fb_len_lim,
     xenfb->offset = offset;
     xenfb->up_fullscreen = 1;
     xenfb->do_resize = 1;
-    xen_be_printf(&xenfb->c.xendev, 1, "framebuffer %dx%dx%d offset %d stride %d\n",
-		  width, height, depth, offset, row_stride);
+    xen_pv_printf(&xenfb->c.xendev, 1,
+                  "framebuffer %dx%dx%d offset %d stride %d\n",
+                  width, height, depth, offset, row_stride);
     return 0;
 }
 
@@ -631,7 +640,7 @@ static void xenfb_guest_copy(struct XenFB *xenfb, int x, int y, int w, int h)
 	}
     }
     if (oops) /* should not happen */
-        xen_be_printf(&xenfb->c.xendev, 0, "%s: oops: convert %d -> %d bpp?\n",
+        xen_pv_printf(&xenfb->c.xendev, 0, "%s: oops: convert %d -> %d bpp?\n",
                       __FUNCTION__, xenfb->depth, bpp);
 
     dpy_gfx_update(xenfb->c.con, x, y, w, h);
@@ -663,7 +672,7 @@ static void xenfb_send_event(struct XenFB *xenfb, union xenfb_in_event *event)
     xen_wmb();                  /* ensure ring contents visible */
     page->in_prod = prod + 1;
 
-    xen_be_send_notify(&xenfb->c.xendev);
+    xen_pv_send_notify(&xenfb->c.xendev);
 }
 
 static void xenfb_send_refresh_period(struct XenFB *xenfb, int period)
@@ -696,9 +705,9 @@ static void xenfb_update(void *opaque)
         return;
 
     if (!xenfb->feature_update) {
-	/* we don't get update notifications, thus use the
-	 * sledge hammer approach ... */
-	xenfb->up_fullscreen = 1;
+        /* we don't get update notifications, thus use the
+         * sledge hammer approach ... */
+        xenfb->up_fullscreen = 1;
     }
 
     /* resize if needed */
@@ -721,7 +730,8 @@ static void xenfb_update(void *opaque)
             break;
         }
         dpy_gfx_replace_surface(xenfb->c.con, surface);
-        xen_be_printf(&xenfb->c.xendev, 1, "update: resizing: %dx%d @ %d bpp%s\n",
+        xen_pv_printf(&xenfb->c.xendev, 1,
+                      "update: resizing: %dx%d @ %d bpp%s\n",
                       xenfb->width, xenfb->height, xenfb->depth,
                       is_buffer_shared(surface) ? " (shared)" : "");
         xenfb->up_fullscreen = 1;
@@ -729,18 +739,19 @@ static void xenfb_update(void *opaque)
 
     /* run queued updates */
     if (xenfb->up_fullscreen) {
-	xen_be_printf(&xenfb->c.xendev, 3, "update: fullscreen\n");
-	xenfb_guest_copy(xenfb, 0, 0, xenfb->width, xenfb->height);
+        xen_pv_printf(&xenfb->c.xendev, 3, "update: fullscreen\n");
+        xenfb_guest_copy(xenfb, 0, 0, xenfb->width, xenfb->height);
     } else if (xenfb->up_count) {
-	xen_be_printf(&xenfb->c.xendev, 3, "update: %d rects\n", xenfb->up_count);
-	for (i = 0; i < xenfb->up_count; i++)
-	    xenfb_guest_copy(xenfb,
-			     xenfb->up_rects[i].x,
-			     xenfb->up_rects[i].y,
-			     xenfb->up_rects[i].w,
-			     xenfb->up_rects[i].h);
+        xen_pv_printf(&xenfb->c.xendev, 3, "update: %d rects\n",
+                      xenfb->up_count);
+        for (i = 0; i < xenfb->up_count; i++)
+            xenfb_guest_copy(xenfb,
+                             xenfb->up_rects[i].x,
+                             xenfb->up_rects[i].y,
+                             xenfb->up_rects[i].w,
+                             xenfb->up_rects[i].h);
     } else {
-	xen_be_printf(&xenfb->c.xendev, 3, "update: nothing\n");
+        xen_pv_printf(&xenfb->c.xendev, 3, "update: nothing\n");
     }
     xenfb->up_count = 0;
     xenfb->up_fullscreen = 0;
@@ -794,14 +805,14 @@ static void xenfb_handle_events(struct XenFB *xenfb)
 	    w = MIN(event->update.width, xenfb->width - x);
 	    h = MIN(event->update.height, xenfb->height - y);
 	    if (w < 0 || h < 0) {
-                xen_be_printf(&xenfb->c.xendev, 1, "bogus update ignored\n");
+                xen_pv_printf(&xenfb->c.xendev, 1, "bogus update ignored\n");
 		break;
 	    }
 	    if (x != event->update.x ||
                 y != event->update.y ||
 		w != event->update.width ||
 		h != event->update.height) {
-                xen_be_printf(&xenfb->c.xendev, 1, "bogus update clipped\n");
+                xen_pv_printf(&xenfb->c.xendev, 1, "bogus update clipped\n");
 	    }
 	    if (w == xenfb->width && h > xenfb->height / 2) {
 		/* scroll detector: updated more than 50% of the lines,
@@ -883,7 +894,7 @@ static int fb_initialise(struct XenDevice *xendev)
     if (fb->feature_update)
 	xenstore_write_be_int(xendev, "request-update", 1);
 
-    xen_be_printf(xendev, 1, "feature-update=%d, videoram=%d\n",
+    xen_pv_printf(xendev, 1, "feature-update=%d, videoram=%d\n",
 		  fb->feature_update, videoram);
     return 0;
 }
@@ -902,7 +913,7 @@ static void fb_disconnect(struct XenDevice *xendev)
                       PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON,
                       -1, 0);
     if (fb->pixels == MAP_FAILED) {
-        xen_be_printf(xendev, 0,
+        xen_pv_printf(xendev, 0,
                 "Couldn't replace the framebuffer with anonymous memory errno=%d\n",
                 errno);
     }
@@ -923,7 +934,7 @@ static void fb_frontend_changed(struct XenDevice *xendev, const char *node)
     if (fb->bug_trigger == 0 && strcmp(node, "state") == 0 &&
         xendev->fe_state == XenbusStateConnected &&
         xendev->be_state == XenbusStateConnected) {
-        xen_be_printf(xendev, 2, "re-trigger connected (frontend bug)\n");
+        xen_pv_printf(xendev, 2, "re-trigger connected (frontend bug)\n");
         xen_be_set_state(xendev, XenbusStateConnected);
         fb->bug_trigger = 1; /* only once */
     }
@@ -934,7 +945,7 @@ static void fb_event(struct XenDevice *xendev)
     struct XenFB *xenfb = container_of(xendev, struct XenFB, c.xendev);
 
     xenfb_handle_events(xenfb);
-    xen_be_send_notify(&xenfb->c.xendev);
+    xen_pv_send_notify(&xenfb->c.xendev);
 }
 
 /* -------------------------------------------------------------------- */
@@ -977,14 +988,14 @@ void xen_init_display(int domid)
 wait_more:
     i++;
     main_loop_wait(true);
-    xfb = xen_be_find_xendev("vfb", domid, 0);
-    xin = xen_be_find_xendev("vkbd", domid, 0);
+    xfb = xen_pv_find_xendev("vfb", domid, 0);
+    xin = xen_pv_find_xendev("vkbd", domid, 0);
     if (!xfb || !xin) {
         if (i < 256) {
             usleep(10000);
             goto wait_more;
         }
-        xen_be_printf(NULL, 1, "displaystate setup failed\n");
+        xen_pv_printf(NULL, 1, "displaystate setup failed\n");
         return;
     }
 
diff --git a/hw/gpio/imx_gpio.c b/hw/gpio/imx_gpio.c
index f3574aa8f3..c36c394fda 100644
--- a/hw/gpio/imx_gpio.c
+++ b/hw/gpio/imx_gpio.c
@@ -237,7 +237,7 @@ static void imx_gpio_write(void *opaque, hwaddr offset, uint64_t value,
         break;
 
     case ISR_ADDR:
-        s->isr |= ~value;
+        s->isr &= ~value;
         imx_gpio_set_all_int_lines(s);
         break;
 
diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index 93be96f89c..5cd1da9a87 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -53,7 +53,6 @@
 #include "hw/pci/pci_bus.h"
 #include "hw/pci-host/q35.h"
 #include "hw/i386/x86-iommu.h"
-#include "hw/timer/hpet.h"
 
 #include "hw/acpi/aml-build.h"
 
diff --git a/hw/microblaze/boot.c b/hw/microblaze/boot.c
index 9eebb1a521..1834d22a61 100644
--- a/hw/microblaze/boot.c
+++ b/hw/microblaze/boot.c
@@ -30,7 +30,6 @@
 #include "qemu/option.h"
 #include "qemu/config-file.h"
 #include "qemu/error-report.h"
-#include "qemu-common.h"
 #include "sysemu/device_tree.h"
 #include "sysemu/sysemu.h"
 #include "hw/loader.h"
diff --git a/hw/mips/mips_malta.c b/hw/mips/mips_malta.c
index cf9bd3eb45..cf48f420cc 100644
--- a/hw/mips/mips_malta.c
+++ b/hw/mips/mips_malta.c
@@ -47,7 +47,6 @@
 #include "elf.h"
 #include "hw/timer/mc146818rtc.h"
 #include "hw/timer/i8254.h"
-#include "sysemu/block-backend.h"
 #include "sysemu/blockdev.h"
 #include "exec/address-spaces.h"
 #include "hw/sysbus.h"             /* SysBusDevice */
diff --git a/hw/misc/milkymist-pfpu.c b/hw/misc/milkymist-pfpu.c
index 1da21a643e..3ca25894f1 100644
--- a/hw/misc/milkymist-pfpu.c
+++ b/hw/misc/milkymist-pfpu.c
@@ -137,7 +137,7 @@ struct MilkymistPFPUState {
 };
 typedef struct MilkymistPFPUState MilkymistPFPUState;
 
-static inline hwaddr
+static inline uint32_t
 get_dma_address(uint32_t base, uint32_t x, uint32_t y)
 {
     return base + 8 * (128 * y + x);
diff --git a/hw/net/xen_nic.c b/hw/net/xen_nic.c
index 6856b52999..20c43a61b3 100644
--- a/hw/net/xen_nic.c
+++ b/hw/net/xen_nic.c
@@ -69,7 +69,7 @@ static void net_tx_response(struct XenNetDev *netdev, netif_tx_request_t *txp, i
     netdev->tx_ring.rsp_prod_pvt = ++i;
     RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netdev->tx_ring, notify);
     if (notify) {
-        xen_be_send_notify(&netdev->xendev);
+        xen_pv_send_notify(&netdev->xendev);
     }
 
     if (i == netdev->tx_ring.req_cons) {
@@ -128,30 +128,32 @@ static void net_tx_packets(struct XenNetDev *netdev)
             /* should not happen in theory, we don't announce the *
              * feature-{sg,gso,whatelse} flags in xenstore (yet?) */
             if (txreq.flags & NETTXF_extra_info) {
-                xen_be_printf(&netdev->xendev, 0, "FIXME: extra info flag\n");
+                xen_pv_printf(&netdev->xendev, 0, "FIXME: extra info flag\n");
                 net_tx_error(netdev, &txreq, rc);
                 continue;
             }
             if (txreq.flags & NETTXF_more_data) {
-                xen_be_printf(&netdev->xendev, 0, "FIXME: more data flag\n");
+                xen_pv_printf(&netdev->xendev, 0, "FIXME: more data flag\n");
                 net_tx_error(netdev, &txreq, rc);
                 continue;
             }
 #endif
 
             if (txreq.size < 14) {
-                xen_be_printf(&netdev->xendev, 0, "bad packet size: %d\n", txreq.size);
+                xen_pv_printf(&netdev->xendev, 0, "bad packet size: %d\n",
+                              txreq.size);
                 net_tx_error(netdev, &txreq, rc);
                 continue;
             }
 
             if ((txreq.offset + txreq.size) > XC_PAGE_SIZE) {
-                xen_be_printf(&netdev->xendev, 0, "error: page crossing\n");
+                xen_pv_printf(&netdev->xendev, 0, "error: page crossing\n");
                 net_tx_error(netdev, &txreq, rc);
                 continue;
             }
 
-            xen_be_printf(&netdev->xendev, 3, "tx packet ref %d, off %d, len %d, flags 0x%x%s%s%s%s\n",
+            xen_pv_printf(&netdev->xendev, 3,
+                          "tx packet ref %d, off %d, len %d, flags 0x%x%s%s%s%s\n",
                           txreq.gref, txreq.offset, txreq.size, txreq.flags,
                           (txreq.flags & NETTXF_csum_blank)     ? " csum_blank"     : "",
                           (txreq.flags & NETTXF_data_validated) ? " data_validated" : "",
@@ -162,8 +164,9 @@ static void net_tx_packets(struct XenNetDev *netdev)
                                            netdev->xendev.dom,
                                            txreq.gref, PROT_READ);
             if (page == NULL) {
-                xen_be_printf(&netdev->xendev, 0, "error: tx gref dereference failed (%d)\n",
-                              txreq.gref);
+                xen_pv_printf(&netdev->xendev, 0,
+                              "error: tx gref dereference failed (%d)\n",
+                             txreq.gref);
                 net_tx_error(netdev, &txreq, rc);
                 continue;
             }
@@ -211,13 +214,14 @@ static void net_rx_response(struct XenNetDev *netdev,
         resp->status = (int16_t)st;
     }
 
-    xen_be_printf(&netdev->xendev, 3, "rx response: idx %d, status %d, flags 0x%x\n",
+    xen_pv_printf(&netdev->xendev, 3,
+                  "rx response: idx %d, status %d, flags 0x%x\n",
                   i, resp->status, resp->flags);
 
     netdev->rx_ring.rsp_prod_pvt = ++i;
     RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netdev->rx_ring, notify);
     if (notify) {
-        xen_be_send_notify(&netdev->xendev);
+        xen_pv_send_notify(&netdev->xendev);
     }
 }
 
@@ -242,7 +246,7 @@ static ssize_t net_rx_packet(NetClientState *nc, const uint8_t *buf, size_t size
         return 0;
     }
     if (size > XC_PAGE_SIZE - NET_IP_ALIGN) {
-        xen_be_printf(&netdev->xendev, 0, "packet too big (%lu > %ld)",
+        xen_pv_printf(&netdev->xendev, 0, "packet too big (%lu > %ld)",
                       (unsigned long)size, XC_PAGE_SIZE - NET_IP_ALIGN);
         return -1;
     }
@@ -254,7 +258,8 @@ static ssize_t net_rx_packet(NetClientState *nc, const uint8_t *buf, size_t size
                                    netdev->xendev.dom,
                                    rxreq.gref, PROT_WRITE);
     if (page == NULL) {
-        xen_be_printf(&netdev->xendev, 0, "error: rx gref dereference failed (%d)\n",
+        xen_pv_printf(&netdev->xendev, 0,
+                      "error: rx gref dereference failed (%d)\n",
                       rxreq.gref);
         net_rx_response(netdev, &rxreq, NETIF_RSP_ERROR, 0, 0, 0);
         return -1;
@@ -328,7 +333,8 @@ static int net_connect(struct XenDevice *xendev)
         rx_copy = 0;
     }
     if (rx_copy == 0) {
-        xen_be_printf(&netdev->xendev, 0, "frontend doesn't support rx-copy.\n");
+        xen_pv_printf(&netdev->xendev, 0,
+                      "frontend doesn't support rx-copy.\n");
         return -1;
     }
 
@@ -353,7 +359,7 @@ static int net_connect(struct XenDevice *xendev)
 
     xen_be_bind_evtchn(&netdev->xendev);
 
-    xen_be_printf(&netdev->xendev, 1, "ok: tx-ring-ref %d, rx-ring-ref %d, "
+    xen_pv_printf(&netdev->xendev, 1, "ok: tx-ring-ref %d, rx-ring-ref %d, "
                   "remote port %d, local port %d\n",
                   netdev->tx_ring_ref, netdev->rx_ring_ref,
                   netdev->xendev.remote_port, netdev->xendev.local_port);
@@ -366,7 +372,7 @@ static void net_disconnect(struct XenDevice *xendev)
 {
     struct XenNetDev *netdev = container_of(xendev, struct XenNetDev, xendev);
 
-    xen_be_unbind_evtchn(&netdev->xendev);
+    xen_pv_unbind_evtchn(&netdev->xendev);
 
     if (netdev->txs) {
         xengnttab_unmap(netdev->xendev.gnttabdev, netdev->txs, 1);
diff --git a/hw/nvram/fw_cfg.c b/hw/nvram/fw_cfg.c
index 92aa563929..1f0c3e9910 100644
--- a/hw/nvram/fw_cfg.c
+++ b/hw/nvram/fw_cfg.c
@@ -29,7 +29,6 @@
 #include "hw/isa/isa.h"
 #include "hw/nvram/fw_cfg.h"
 #include "hw/sysbus.h"
-#include "hw/boards.h"
 #include "trace.h"
 #include "qemu/error-report.h"
 #include "qemu/config-file.h"
diff --git a/hw/pci-bridge/pci_expander_bridge.c b/hw/pci-bridge/pci_expander_bridge.c
index 1cc598f7e9..6ac187fa32 100644
--- a/hw/pci-bridge/pci_expander_bridge.c
+++ b/hw/pci-bridge/pci_expander_bridge.c
@@ -15,7 +15,6 @@
 #include "hw/pci/pci.h"
 #include "hw/pci/pci_bus.h"
 #include "hw/pci/pci_host.h"
-#include "hw/pci/pci_bus.h"
 #include "hw/pci/pci_bridge.h"
 #include "hw/i386/pc.h"
 #include "qemu/range.h"
diff --git a/hw/ppc/ppc405_boards.c b/hw/ppc/ppc405_boards.c
index 4b2f07aecb..d01798f245 100644
--- a/hw/ppc/ppc405_boards.c
+++ b/hw/ppc/ppc405_boards.c
@@ -37,7 +37,6 @@
 #include "qemu/log.h"
 #include "qemu/error-report.h"
 #include "hw/loader.h"
-#include "sysemu/block-backend.h"
 #include "sysemu/blockdev.h"
 #include "exec/address-spaces.h"
 
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index c8e29212cb..807ac52aed 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -37,7 +37,6 @@
 #include "sysemu/block-backend.h"
 #include "sysemu/cpus.h"
 #include "sysemu/kvm.h"
-#include "sysemu/device_tree.h"
 #include "kvm_ppc.h"
 #include "migration/migration.h"
 #include "mmu-hash64.h"
diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c
index 58d0dd2203..63f6248f1d 100644
--- a/hw/s390x/s390-pci-bus.c
+++ b/hw/s390x/s390-pci-bus.c
@@ -807,17 +807,11 @@ static uint32_t s390_pci_generate_fid(Error **errp)
 {
     uint32_t fid = 0;
 
-    while (fid <= ZPCI_MAX_FID) {
+    do {
         if (!s390_pci_find_dev_by_fid(fid)) {
             return fid;
         }
-
-        if (fid == ZPCI_MAX_FID) {
-            break;
-        }
-
-        fid++;
-    }
+    } while (fid++ != ZPCI_MAX_FID);
 
     error_setg(errp, "no free fid could be found");
     return 0;
diff --git a/hw/scsi/virtio-scsi-dataplane.c b/hw/scsi/virtio-scsi-dataplane.c
index b173b94949..9424f0e057 100644
--- a/hw/scsi/virtio-scsi-dataplane.c
+++ b/hw/scsi/virtio-scsi-dataplane.c
@@ -189,13 +189,11 @@ void virtio_scsi_dataplane_stop(VirtIOSCSI *s)
     assert(s->ctx == iothread_get_aio_context(vs->conf.iothread));
 
     aio_context_acquire(s->ctx);
-
     virtio_scsi_clear_aio(s);
+    aio_context_release(s->ctx);
 
     blk_drain_all(); /* ensure there are no in-flight requests */
 
-    aio_context_release(s->ctx);
-
     for (i = 0; i < vs->conf.num_queues + 2; i++) {
         virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), i, false);
     }
diff --git a/hw/timer/grlib_gptimer.c b/hw/timer/grlib_gptimer.c
index 712d1aece5..4ed96e970a 100644
--- a/hw/timer/grlib_gptimer.c
+++ b/hw/timer/grlib_gptimer.c
@@ -26,7 +26,6 @@
 #include "hw/sysbus.h"
 #include "qemu/timer.h"
 #include "hw/ptimer.h"
-#include "qemu/timer.h"
 #include "qemu/main-loop.h"
 
 #include "trace.h"
diff --git a/hw/tpm/tpm_passthrough.c b/hw/tpm/tpm_passthrough.c
index e88c0d20bc..9234eb3459 100644
--- a/hw/tpm/tpm_passthrough.c
+++ b/hw/tpm/tpm_passthrough.c
@@ -165,8 +165,7 @@ static int tpm_passthrough_unix_tx_bufs(TPMPassthruState *tpm_pt,
 
     ret = tpm_passthrough_unix_write(tpm_pt->tpm_fd, in, in_len);
     if (ret != in_len) {
-        if (!tpm_pt->tpm_op_canceled ||
-            (tpm_pt->tpm_op_canceled && errno != ECANCELED)) {
+        if (!tpm_pt->tpm_op_canceled || errno != ECANCELED) {
             error_report("tpm_passthrough: error while transmitting data "
                          "to TPM: %s (%i)",
                          strerror(errno), errno);
@@ -178,8 +177,7 @@ static int tpm_passthrough_unix_tx_bufs(TPMPassthruState *tpm_pt,
 
     ret = tpm_passthrough_unix_read(tpm_pt->tpm_fd, out, out_len);
     if (ret < 0) {
-        if (!tpm_pt->tpm_op_canceled ||
-            (tpm_pt->tpm_op_canceled && errno != ECANCELED)) {
+        if (!tpm_pt->tpm_op_canceled || errno != ECANCELED) {
             error_report("tpm_passthrough: error while reading data from "
                          "TPM: %s (%i)",
                          strerror(errno), errno);
diff --git a/hw/tpm/tpm_tis.c b/hw/tpm/tpm_tis.c
index 381e7266ea..a6440fef91 100644
--- a/hw/tpm/tpm_tis.c
+++ b/hw/tpm/tpm_tis.c
@@ -34,7 +34,6 @@
 #include "qapi/error.h"
 #include "qemu-common.h"
 #include "qemu/main-loop.h"
-#include "sysemu/tpm_backend.h"
 
 #define DEBUG_TIS 0
 
diff --git a/hw/unicore32/puv3.c b/hw/unicore32/puv3.c
index 31cd171016..032078fd3e 100644
--- a/hw/unicore32/puv3.c
+++ b/hw/unicore32/puv3.c
@@ -13,7 +13,6 @@
 #include "qapi/error.h"
 #include "qemu-common.h"
 #include "cpu.h"
-#include "qemu-common.h"
 #include "ui/console.h"
 #include "elf.h"
 #include "exec/address-spaces.h"
diff --git a/hw/usb/ccid-card-emulated.c b/hw/usb/ccid-card-emulated.c
index 3213f9f8af..eceb5f3ee2 100644
--- a/hw/usb/ccid-card-emulated.c
+++ b/hw/usb/ccid-card-emulated.c
@@ -547,7 +547,7 @@ static int emulated_initfn(CCIDCardState *base)
     return 0;
 }
 
-static int emulated_exitfn(CCIDCardState *base)
+static void emulated_exitfn(CCIDCardState *base)
 {
     EmulatedState *card = EMULATED_CCID_CARD(base);
     VEvent *vevent = vevent_new(VEVENT_LAST, NULL, NULL);
@@ -564,7 +564,6 @@ static int emulated_exitfn(CCIDCardState *base)
     qemu_mutex_destroy(&card->handle_apdu_mutex);
     qemu_mutex_destroy(&card->vreader_mutex);
     qemu_mutex_destroy(&card->event_list_mutex);
-    return 0;
 }
 
 static Property emulated_card_properties[] = {
diff --git a/hw/usb/ccid-card-passthru.c b/hw/usb/ccid-card-passthru.c
index 325129a2f6..88cb6d8978 100644
--- a/hw/usb/ccid-card-passthru.c
+++ b/hw/usb/ccid-card-passthru.c
@@ -365,11 +365,6 @@ static int passthru_initfn(CCIDCardState *base)
     return 0;
 }
 
-static int passthru_exitfn(CCIDCardState *base)
-{
-    return 0;
-}
-
 static VMStateDescription passthru_vmstate = {
     .name = "ccid-card-passthru",
     .version_id = 1,
@@ -396,7 +391,6 @@ static void passthru_class_initfn(ObjectClass *klass, void *data)
     CCIDCardClass *cc = CCID_CARD_CLASS(klass);
 
     cc->initfn = passthru_initfn;
-    cc->exitfn = passthru_exitfn;
     cc->get_atr = passthru_get_atr;
     cc->apdu_from_guest = passthru_apdu_from_guest;
     set_bit(DEVICE_CATEGORY_INPUT, dc->categories);
diff --git a/hw/usb/ccid.h b/hw/usb/ccid.h
index 9334da8acd..1f070116d6 100644
--- a/hw/usb/ccid.h
+++ b/hw/usb/ccid.h
@@ -33,7 +33,7 @@ typedef struct CCIDCardClass {
     void (*apdu_from_guest)(CCIDCardState *card,
                             const uint8_t *apdu,
                             uint32_t len);
-    int (*exitfn)(CCIDCardState *card);
+    void (*exitfn)(CCIDCardState *card);
     int (*initfn)(CCIDCardState *card);
 } CCIDCardClass;
 
diff --git a/hw/usb/dev-mtp.c b/hw/usb/dev-mtp.c
index 58d95fffb2..9cb0f50750 100644
--- a/hw/usb/dev-mtp.c
+++ b/hw/usb/dev-mtp.c
@@ -17,7 +17,6 @@
 #include <sys/statvfs.h>
 #ifdef CONFIG_INOTIFY1
 #include <sys/inotify.h>
-#include "qapi/error.h"
 #include "qemu/main-loop.h"
 #endif
 
diff --git a/hw/usb/dev-smartcard-reader.c b/hw/usb/dev-smartcard-reader.c
index af4b851356..89e11b68c4 100644
--- a/hw/usb/dev-smartcard-reader.c
+++ b/hw/usb/dev-smartcard-reader.c
@@ -508,14 +508,14 @@ static void ccid_card_apdu_from_guest(CCIDCardState *card,
     }
 }
 
-static int ccid_card_exitfn(CCIDCardState *card)
+static void ccid_card_exitfn(CCIDCardState *card)
 {
     CCIDCardClass *cc = CCID_CARD_GET_CLASS(card);
 
     if (cc->exitfn) {
-        return cc->exitfn(card);
+        cc->exitfn(card);
     }
-    return 0;
+
 }
 
 static int ccid_card_initfn(CCIDCardState *card)
@@ -1279,7 +1279,6 @@ void ccid_card_card_inserted(CCIDCardState *card)
 
 static int ccid_card_exit(DeviceState *qdev)
 {
-    int ret = 0;
     CCIDCardState *card = CCID_CARD(qdev);
     USBDevice *dev = USB_DEVICE(qdev->parent_bus->parent);
     USBCCIDState *s = USB_CCID_DEV(dev);
@@ -1287,9 +1286,9 @@ static int ccid_card_exit(DeviceState *qdev)
     if (ccid_card_inserted(s)) {
         ccid_card_card_removed(card);
     }
-    ret = ccid_card_exitfn(card);
+    ccid_card_exitfn(card);
     s->card = NULL;
-    return ret;
+    return 0;
 }
 
 static int ccid_card_init(DeviceState *qdev)
diff --git a/hw/usb/xen-usb.c b/hw/usb/xen-usb.c
index de2ebd6210..1b3c2fb3c7 100644
--- a/hw/usb/xen-usb.c
+++ b/hw/usb/xen-usb.c
@@ -47,7 +47,7 @@
         struct timeval tv;                                          \
                                                                     \
         gettimeofday(&tv, NULL);                                    \
-        xen_be_printf(xendev, lvl, "%8ld.%06ld xen-usb(%s):" fmt,   \
+        xen_pv_printf(xendev, lvl, "%8ld.%06ld xen-usb(%s):" fmt,   \
                       tv.tv_sec, tv.tv_usec, __func__, ##args);     \
     }
 #define TR_BUS(xendev, fmt, args...) TR(xendev, 2, fmt, ##args)
@@ -153,7 +153,7 @@ static int usbback_gnttab_map(struct usbback_req *usbback_req)
     }
 
     if (nr_segs > USBIF_MAX_SEGMENTS_PER_REQUEST) {
-        xen_be_printf(xendev, 0, "bad number of segments in request (%d)\n",
+        xen_pv_printf(xendev, 0, "bad number of segments in request (%d)\n",
                       nr_segs);
         return -EINVAL;
     }
@@ -161,7 +161,7 @@ static int usbback_gnttab_map(struct usbback_req *usbback_req)
     for (i = 0; i < nr_segs; i++) {
         if ((unsigned)usbback_req->req.seg[i].offset +
             (unsigned)usbback_req->req.seg[i].length > XC_PAGE_SIZE) {
-            xen_be_printf(xendev, 0, "segment crosses page boundary\n");
+            xen_pv_printf(xendev, 0, "segment crosses page boundary\n");
             return -EINVAL;
         }
     }
@@ -199,7 +199,7 @@ static int usbback_gnttab_map(struct usbback_req *usbback_req)
      */
 
     if (!usbback_req->nr_extra_segs) {
-        xen_be_printf(xendev, 0, "iso request without descriptor segments\n");
+        xen_pv_printf(xendev, 0, "iso request without descriptor segments\n");
         return -EINVAL;
     }
 
@@ -314,7 +314,7 @@ static void usbback_do_response(struct usbback_req *usbback_req, int32_t status,
         RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&usbif->urb_ring, notify);
 
         if (notify) {
-            xen_be_send_notify(xendev);
+            xen_pv_send_notify(xendev);
         }
     }
 
@@ -551,14 +551,14 @@ static void usbback_dispatch(struct usbback_req *usbback_req)
 
     ret = usbback_init_packet(usbback_req);
     if (ret) {
-        xen_be_printf(&usbif->xendev, 0, "invalid request\n");
+        xen_pv_printf(&usbif->xendev, 0, "invalid request\n");
         ret = -ESHUTDOWN;
         goto fail_free_urb;
     }
 
     ret = usbback_gnttab_map(usbback_req);
     if (ret) {
-        xen_be_printf(&usbif->xendev, 0, "invalid buffer, ret=%d\n", ret);
+        xen_pv_printf(&usbif->xendev, 0, "invalid buffer, ret=%d\n", ret);
         ret = -ESHUTDOWN;
         goto fail_free_urb;
     }
@@ -590,7 +590,7 @@ static void usbback_hotplug_notify(struct usbback_info *usbif)
 
     /* Check for full ring. */
     if ((RING_SIZE(ring) - ring->rsp_prod_pvt - ring->req_cons) == 0) {
-        xen_be_send_notify(&usbif->xendev);
+        xen_pv_send_notify(&usbif->xendev);
         return;
     }
 
@@ -609,7 +609,7 @@ static void usbback_hotplug_notify(struct usbback_info *usbif)
     RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(ring, notify);
 
     if (notify) {
-        xen_be_send_notify(&usbif->xendev);
+        xen_pv_send_notify(&usbif->xendev);
     }
 
     TR_BUS(&usbif->xendev, "hotplug port %d speed %d\n", usb_hp->port,
@@ -646,7 +646,7 @@ static void usbback_bh(void *opaque)
 
     if (RING_REQUEST_PROD_OVERFLOW(urb_ring, rp)) {
         rc = urb_ring->rsp_prod_pvt;
-        xen_be_printf(&usbif->xendev, 0, "domU provided bogus ring requests "
+        xen_pv_printf(&usbif->xendev, 0, "domU provided bogus ring requests "
                       "(%#x - %#x = %u). Halting ring processing.\n",
                       rp, rc, rp - rc);
         usbif->ring_error = true;
@@ -744,7 +744,7 @@ static void usbback_portid_add(struct usbback_info *usbif, unsigned port,
 
     portname = strchr(busid, '-');
     if (!portname) {
-        xen_be_printf(&usbif->xendev, 0, "device %s illegal specification\n",
+        xen_pv_printf(&usbif->xendev, 0, "device %s illegal specification\n",
                       busid);
         return;
     }
@@ -783,7 +783,7 @@ static void usbback_portid_add(struct usbback_info *usbif, unsigned port,
         break;
     }
     if (speed == USBIF_SPEED_NONE) {
-        xen_be_printf(&usbif->xendev, 0, "device %s wrong speed\n", busid);
+        xen_pv_printf(&usbif->xendev, 0, "device %s wrong speed\n", busid);
         object_unparent(OBJECT(usbif->ports[port - 1].dev));
         usbif->ports[port - 1].dev = NULL;
         return;
@@ -800,7 +800,7 @@ static void usbback_portid_add(struct usbback_info *usbif, unsigned port,
 err:
     QDECREF(qdict);
     snprintf(p->path, sizeof(p->path), "%d", 99);
-    xen_be_printf(&usbif->xendev, 0, "device %s could not be opened\n", busid);
+    xen_pv_printf(&usbif->xendev, 0, "device %s could not be opened\n", busid);
 }
 
 static void usbback_process_port(struct usbback_info *usbif, unsigned port)
@@ -811,7 +811,7 @@ static void usbback_process_port(struct usbback_info *usbif, unsigned port)
     snprintf(node, sizeof(node), "port/%d", port);
     busid = xenstore_read_be_str(&usbif->xendev, node);
     if (busid == NULL) {
-        xen_be_printf(&usbif->xendev, 0, "xenstore_read %s failed\n", node);
+        xen_pv_printf(&usbif->xendev, 0, "xenstore_read %s failed\n", node);
         return;
     }
 
@@ -834,7 +834,7 @@ static void usbback_disconnect(struct XenDevice *xendev)
 
     usbif = container_of(xendev, struct usbback_info, xendev);
 
-    xen_be_unbind_evtchn(xendev);
+    xen_pv_unbind_evtchn(xendev);
 
     if (usbif->urb_sring) {
         xengnttab_unmap(xendev->gnttabdev, usbif->urb_sring, 1);
@@ -868,15 +868,15 @@ static int usbback_connect(struct XenDevice *xendev)
     usbif = container_of(xendev, struct usbback_info, xendev);
 
     if (xenstore_read_fe_int(xendev, "urb-ring-ref", &urb_ring_ref)) {
-        xen_be_printf(xendev, 0, "error reading urb-ring-ref\n");
+        xen_pv_printf(xendev, 0, "error reading urb-ring-ref\n");
         return -1;
     }
     if (xenstore_read_fe_int(xendev, "conn-ring-ref", &conn_ring_ref)) {
-        xen_be_printf(xendev, 0, "error reading conn-ring-ref\n");
+        xen_pv_printf(xendev, 0, "error reading conn-ring-ref\n");
         return -1;
     }
     if (xenstore_read_fe_int(xendev, "event-channel", &xendev->remote_port)) {
-        xen_be_printf(xendev, 0, "error reading event-channel\n");
+        xen_pv_printf(xendev, 0, "error reading event-channel\n");
         return -1;
     }
 
@@ -887,7 +887,7 @@ static int usbback_connect(struct XenDevice *xendev)
                                                 conn_ring_ref,
                                                 PROT_READ | PROT_WRITE);
     if (!usbif->urb_sring || !usbif->conn_sring) {
-        xen_be_printf(xendev, 0, "error mapping rings\n");
+        xen_pv_printf(xendev, 0, "error mapping rings\n");
         usbback_disconnect(xendev);
         return -1;
     }
@@ -899,7 +899,7 @@ static int usbback_connect(struct XenDevice *xendev)
 
     xen_be_bind_evtchn(xendev);
 
-    xen_be_printf(xendev, 1, "urb-ring-ref %d, conn-ring-ref %d, "
+    xen_pv_printf(xendev, 1, "urb-ring-ref %d, conn-ring-ref %d, "
                   "remote port %d, local port %d\n", urb_ring_ref,
                   conn_ring_ref, xendev->remote_port, xendev->local_port);
 
@@ -935,12 +935,12 @@ static int usbback_init(struct XenDevice *xendev)
 
     if (xenstore_read_be_int(xendev, "num-ports", &usbif->num_ports) ||
         usbif->num_ports < 1 || usbif->num_ports > USBBACK_MAXPORTS) {
-        xen_be_printf(xendev, 0, "num-ports not readable or out of bounds\n");
+        xen_pv_printf(xendev, 0, "num-ports not readable or out of bounds\n");
         return -1;
     }
     if (xenstore_read_be_int(xendev, "usb-ver", &usbif->usb_ver) ||
         (usbif->usb_ver != USB_VER_USB11 && usbif->usb_ver != USB_VER_USB20)) {
-        xen_be_printf(xendev, 0, "usb-ver not readable or out of bounds\n");
+        xen_pv_printf(xendev, 0, "usb-ver not readable or out of bounds\n");
         return -1;
     }
 
@@ -1028,7 +1028,7 @@ static void usbback_alloc(struct XenDevice *xendev)
     /* max_grants: for each request and for the rings (request and connect). */
     max_grants = USBIF_MAX_SEGMENTS_PER_REQUEST * USB_URB_RING_SIZE + 2;
     if (xengnttab_set_max_grants(xendev->gnttabdev, max_grants) < 0) {
-        xen_be_printf(xendev, 0, "xengnttab_set_max_grants failed: %s\n",
+        xen_pv_printf(xendev, 0, "xengnttab_set_max_grants failed: %s\n",
                       strerror(errno));
     }
 }
diff --git a/hw/xen/Makefile.objs b/hw/xen/Makefile.objs
index d3670940b7..591cdc229d 100644
--- a/hw/xen/Makefile.objs
+++ b/hw/xen/Makefile.objs
@@ -1,5 +1,5 @@
 # xen backend driver support
-common-obj-$(CONFIG_XEN_BACKEND) += xen_backend.o xen_devconfig.o
+common-obj-$(CONFIG_XEN_BACKEND) += xen_backend.o xen_devconfig.o xen_pvdev.o
 
 obj-$(CONFIG_XEN_PCI_PASSTHROUGH) += xen-host-pci-device.o
 obj-$(CONFIG_XEN_PCI_PASSTHROUGH) += xen_pt.o xen_pt_config_init.o xen_pt_graphics.o xen_pt_msi.o
diff --git a/hw/xen/xen_backend.c b/hw/xen/xen_backend.c
index 69a238817e..41ba5c585a 100644
--- a/hw/xen/xen_backend.c
+++ b/hw/xen/xen_backend.c
@@ -30,6 +30,7 @@
 #include "sysemu/char.h"
 #include "qemu/log.h"
 #include "hw/xen/xen_backend.h"
+#include "hw/xen/xen_pvdev.h"
 
 #include <xen/grant_table.h>
 
@@ -46,129 +47,7 @@ struct xs_handle *xenstore = NULL;
 const char *xen_protocol;
 
 /* private */
-struct xs_dirs {
-    char *xs_dir;
-    QTAILQ_ENTRY(xs_dirs) list;
-};
-static QTAILQ_HEAD(xs_dirs_head, xs_dirs) xs_cleanup =
-    QTAILQ_HEAD_INITIALIZER(xs_cleanup);
-
-static QTAILQ_HEAD(XenDeviceHead, XenDevice) xendevs = QTAILQ_HEAD_INITIALIZER(xendevs);
-static int debug = 0;
-
-/* ------------------------------------------------------------- */
-
-static void xenstore_cleanup_dir(char *dir)
-{
-    struct xs_dirs *d;
-
-    d = g_malloc(sizeof(*d));
-    d->xs_dir = dir;
-    QTAILQ_INSERT_TAIL(&xs_cleanup, d, list);
-}
-
-void xen_config_cleanup(void)
-{
-    struct xs_dirs *d;
-
-    QTAILQ_FOREACH(d, &xs_cleanup, list) {
-        xs_rm(xenstore, 0, d->xs_dir);
-    }
-}
-
-int xenstore_write_str(const char *base, const char *node, const char *val)
-{
-    char abspath[XEN_BUFSIZE];
-
-    snprintf(abspath, sizeof(abspath), "%s/%s", base, node);
-    if (!xs_write(xenstore, 0, abspath, val, strlen(val))) {
-        return -1;
-    }
-    return 0;
-}
-
-char *xenstore_read_str(const char *base, const char *node)
-{
-    char abspath[XEN_BUFSIZE];
-    unsigned int len;
-    char *str, *ret = NULL;
-
-    snprintf(abspath, sizeof(abspath), "%s/%s", base, node);
-    str = xs_read(xenstore, 0, abspath, &len);
-    if (str != NULL) {
-        /* move to qemu-allocated memory to make sure
-         * callers can savely g_free() stuff. */
-        ret = g_strdup(str);
-        free(str);
-    }
-    return ret;
-}
-
-int xenstore_mkdir(char *path, int p)
-{
-    struct xs_permissions perms[2] = {
-        {
-            .id    = 0, /* set owner: dom0 */
-        }, {
-            .id    = xen_domid,
-            .perms = p,
-        }
-    };
-
-    if (!xs_mkdir(xenstore, 0, path)) {
-        xen_be_printf(NULL, 0, "xs_mkdir %s: failed\n", path);
-        return -1;
-    }
-    xenstore_cleanup_dir(g_strdup(path));
-
-    if (!xs_set_permissions(xenstore, 0, path, perms, 2)) {
-        xen_be_printf(NULL, 0, "xs_set_permissions %s: failed\n", path);
-        return -1;
-    }
-    return 0;
-}
-
-int xenstore_write_int(const char *base, const char *node, int ival)
-{
-    char val[12];
-
-    snprintf(val, sizeof(val), "%d", ival);
-    return xenstore_write_str(base, node, val);
-}
-
-int xenstore_write_int64(const char *base, const char *node, int64_t ival)
-{
-    char val[21];
-
-    snprintf(val, sizeof(val), "%"PRId64, ival);
-    return xenstore_write_str(base, node, val);
-}
-
-int xenstore_read_int(const char *base, const char *node, int *ival)
-{
-    char *val;
-    int rc = -1;
-
-    val = xenstore_read_str(base, node);
-    if (val && 1 == sscanf(val, "%d", ival)) {
-        rc = 0;
-    }
-    g_free(val);
-    return rc;
-}
-
-int xenstore_read_uint64(const char *base, const char *node, uint64_t *uval)
-{
-    char *val;
-    int rc = -1;
-
-    val = xenstore_read_str(base, node);
-    if (val && 1 == sscanf(val, "%"SCNu64, uval)) {
-        rc = 0;
-    }
-    g_free(val);
-    return rc;
-}
+static int debug;
 
 int xenstore_write_be_str(struct XenDevice *xendev, const char *node, const char *val)
 {
@@ -205,27 +84,14 @@ int xenstore_read_fe_int(struct XenDevice *xendev, const char *node, int *ival)
     return xenstore_read_int(xendev->fe, node, ival);
 }
 
-int xenstore_read_fe_uint64(struct XenDevice *xendev, const char *node, uint64_t *uval)
+int xenstore_read_fe_uint64(struct XenDevice *xendev, const char *node,
+                            uint64_t *uval)
 {
     return xenstore_read_uint64(xendev->fe, node, uval);
 }
 
 /* ------------------------------------------------------------- */
 
-const char *xenbus_strstate(enum xenbus_state state)
-{
-    static const char *const name[] = {
-        [ XenbusStateUnknown      ] = "Unknown",
-        [ XenbusStateInitialising ] = "Initialising",
-        [ XenbusStateInitWait     ] = "InitWait",
-        [ XenbusStateInitialised  ] = "Initialised",
-        [ XenbusStateConnected    ] = "Connected",
-        [ XenbusStateClosing      ] = "Closing",
-        [ XenbusStateClosed       ] = "Closed",
-    };
-    return (state < ARRAY_SIZE(name)) ? name[state] : "INVALID";
-}
-
 int xen_be_set_state(struct XenDevice *xendev, enum xenbus_state state)
 {
     int rc;
@@ -234,33 +100,12 @@ int xen_be_set_state(struct XenDevice *xendev, enum xenbus_state state)
     if (rc < 0) {
         return rc;
     }
-    xen_be_printf(xendev, 1, "backend state: %s -> %s\n",
+    xen_pv_printf(xendev, 1, "backend state: %s -> %s\n",
                   xenbus_strstate(xendev->be_state), xenbus_strstate(state));
     xendev->be_state = state;
     return 0;
 }
 
-/* ------------------------------------------------------------- */
-
-struct XenDevice *xen_be_find_xendev(const char *type, int dom, int dev)
-{
-    struct XenDevice *xendev;
-
-    QTAILQ_FOREACH(xendev, &xendevs, next) {
-        if (xendev->dom != dom) {
-            continue;
-        }
-        if (xendev->dev != dev) {
-            continue;
-        }
-        if (strcmp(xendev->type, type) != 0) {
-            continue;
-        }
-        return xendev;
-    }
-    return NULL;
-}
-
 /*
  * get xen backend device, allocate a new one if it doesn't exist.
  */
@@ -269,7 +114,7 @@ static struct XenDevice *xen_be_get_xendev(const char *type, int dom, int dev,
 {
     struct XenDevice *xendev;
 
-    xendev = xen_be_find_xendev(type, dom, dev);
+    xendev = xen_pv_find_xendev(type, dom, dev);
     if (xendev) {
         return xendev;
     }
@@ -291,7 +136,7 @@ static struct XenDevice *xen_be_get_xendev(const char *type, int dom, int dev,
 
     xendev->evtchndev = xenevtchn_open(NULL, 0);
     if (xendev->evtchndev == NULL) {
-        xen_be_printf(NULL, 0, "can't open evtchn device\n");
+        xen_pv_printf(NULL, 0, "can't open evtchn device\n");
         g_free(xendev);
         return NULL;
     }
@@ -300,7 +145,7 @@ static struct XenDevice *xen_be_get_xendev(const char *type, int dom, int dev,
     if (ops->flags & DEVOPS_FLAG_NEED_GNTDEV) {
         xendev->gnttabdev = xengnttab_open(NULL, 0);
         if (xendev->gnttabdev == NULL) {
-            xen_be_printf(NULL, 0, "can't open gnttab device\n");
+            xen_pv_printf(NULL, 0, "can't open gnttab device\n");
             xenevtchn_close(xendev->evtchndev);
             g_free(xendev);
             return NULL;
@@ -309,7 +154,7 @@ static struct XenDevice *xen_be_get_xendev(const char *type, int dom, int dev,
         xendev->gnttabdev = NULL;
     }
 
-    QTAILQ_INSERT_TAIL(&xendevs, xendev, next);
+    xen_pv_insert_xendev(xendev);
 
     if (xendev->ops->alloc) {
         xendev->ops->alloc(xendev);
@@ -318,32 +163,6 @@ static struct XenDevice *xen_be_get_xendev(const char *type, int dom, int dev,
     return xendev;
 }
 
-/*
- * release xen backend device.
- */
-static void xen_be_del_xendev(struct XenDevice *xendev)
-{
-    if (xendev->ops->free) {
-        xendev->ops->free(xendev);
-    }
-
-    if (xendev->fe) {
-        char token[XEN_BUFSIZE];
-        snprintf(token, sizeof(token), "fe:%p", xendev);
-        xs_unwatch(xenstore, xendev->fe, token);
-        g_free(xendev->fe);
-    }
-
-    if (xendev->evtchndev != NULL) {
-        xenevtchn_close(xendev->evtchndev);
-    }
-    if (xendev->gnttabdev != NULL) {
-        xengnttab_close(xendev->gnttabdev);
-    }
-
-    QTAILQ_REMOVE(&xendevs, xendev, next);
-    g_free(xendev);
-}
 
 /*
  * Sync internal data structures on xenstore updates.
@@ -359,7 +178,7 @@ static void xen_be_backend_changed(struct XenDevice *xendev, const char *node)
     }
 
     if (node) {
-        xen_be_printf(xendev, 2, "backend update: %s\n", node);
+        xen_pv_printf(xendev, 2, "backend update: %s\n", node);
         if (xendev->ops->backend_changed) {
             xendev->ops->backend_changed(xendev, node);
         }
@@ -375,7 +194,7 @@ static void xen_be_frontend_changed(struct XenDevice *xendev, const char *node)
             fe_state = XenbusStateUnknown;
         }
         if (xendev->fe_state != fe_state) {
-            xen_be_printf(xendev, 1, "frontend state: %s -> %s\n",
+            xen_pv_printf(xendev, 1, "frontend state: %s -> %s\n",
                           xenbus_strstate(xendev->fe_state),
                           xenbus_strstate(fe_state));
         }
@@ -385,12 +204,13 @@ static void xen_be_frontend_changed(struct XenDevice *xendev, const char *node)
         g_free(xendev->protocol);
         xendev->protocol = xenstore_read_fe_str(xendev, "protocol");
         if (xendev->protocol) {
-            xen_be_printf(xendev, 1, "frontend protocol: %s\n", xendev->protocol);
+            xen_pv_printf(xendev, 1, "frontend protocol: %s\n",
+                          xendev->protocol);
         }
     }
 
     if (node) {
-        xen_be_printf(xendev, 2, "frontend update: %s\n", node);
+        xen_pv_printf(xendev, 2, "frontend update: %s\n", node);
         if (xendev->ops->frontend_changed) {
             xendev->ops->frontend_changed(xendev, node);
         }
@@ -414,26 +234,26 @@ static int xen_be_try_setup(struct XenDevice *xendev)
     int be_state;
 
     if (xenstore_read_be_int(xendev, "state", &be_state) == -1) {
-        xen_be_printf(xendev, 0, "reading backend state failed\n");
+        xen_pv_printf(xendev, 0, "reading backend state failed\n");
         return -1;
     }
 
     if (be_state != XenbusStateInitialising) {
-        xen_be_printf(xendev, 0, "initial backend state is wrong (%s)\n",
+        xen_pv_printf(xendev, 0, "initial backend state is wrong (%s)\n",
                       xenbus_strstate(be_state));
         return -1;
     }
 
     xendev->fe = xenstore_read_be_str(xendev, "frontend");
     if (xendev->fe == NULL) {
-        xen_be_printf(xendev, 0, "reading frontend path failed\n");
+        xen_pv_printf(xendev, 0, "reading frontend path failed\n");
         return -1;
     }
 
     /* setup frontend watch */
     snprintf(token, sizeof(token), "fe:%p", xendev);
     if (!xs_watch(xenstore, xendev->fe, token)) {
-        xen_be_printf(xendev, 0, "watching frontend path (%s) failed\n",
+        xen_pv_printf(xendev, 0, "watching frontend path (%s) failed\n",
                       xendev->fe);
         return -1;
     }
@@ -457,7 +277,7 @@ static int xen_be_try_init(struct XenDevice *xendev)
     int rc = 0;
 
     if (!xendev->online) {
-        xen_be_printf(xendev, 1, "not online\n");
+        xen_pv_printf(xendev, 1, "not online\n");
         return -1;
     }
 
@@ -465,7 +285,7 @@ static int xen_be_try_init(struct XenDevice *xendev)
         rc = xendev->ops->init(xendev);
     }
     if (rc != 0) {
-        xen_be_printf(xendev, 1, "init() failed\n");
+        xen_pv_printf(xendev, 1, "init() failed\n");
         return rc;
     }
 
@@ -488,9 +308,9 @@ static int xen_be_try_initialise(struct XenDevice *xendev)
     if (xendev->fe_state != XenbusStateInitialised  &&
         xendev->fe_state != XenbusStateConnected) {
         if (xendev->ops->flags & DEVOPS_FLAG_IGNORE_STATE) {
-            xen_be_printf(xendev, 2, "frontend not ready, ignoring\n");
+            xen_pv_printf(xendev, 2, "frontend not ready, ignoring\n");
         } else {
-            xen_be_printf(xendev, 2, "frontend not ready (yet)\n");
+            xen_pv_printf(xendev, 2, "frontend not ready (yet)\n");
             return -1;
         }
     }
@@ -499,7 +319,7 @@ static int xen_be_try_initialise(struct XenDevice *xendev)
         rc = xendev->ops->initialise(xendev);
     }
     if (rc != 0) {
-        xen_be_printf(xendev, 0, "initialise() failed\n");
+        xen_pv_printf(xendev, 0, "initialise() failed\n");
         return rc;
     }
 
@@ -520,9 +340,9 @@ static void xen_be_try_connected(struct XenDevice *xendev)
 
     if (xendev->fe_state != XenbusStateConnected) {
         if (xendev->ops->flags & DEVOPS_FLAG_IGNORE_STATE) {
-            xen_be_printf(xendev, 2, "frontend not ready, ignoring\n");
+            xen_pv_printf(xendev, 2, "frontend not ready, ignoring\n");
         } else {
-            xen_be_printf(xendev, 2, "frontend not ready (yet)\n");
+            xen_pv_printf(xendev, 2, "frontend not ready (yet)\n");
             return;
         }
     }
@@ -556,7 +376,7 @@ static int xen_be_try_reset(struct XenDevice *xendev)
         return -1;
     }
 
-    xen_be_printf(xendev, 1, "device reset (for re-connect)\n");
+    xen_pv_printf(xendev, 1, "device reset (for re-connect)\n");
     xen_be_set_state(xendev, XenbusStateInitialising);
     return 0;
 }
@@ -617,7 +437,8 @@ static int xenstore_scan(const char *type, int dom, struct XenDevOps *ops)
     snprintf(token, sizeof(token), "be:%p:%d:%p", type, dom, ops);
     snprintf(path, sizeof(path), "backend/%s/%d", type, dom);
     if (!xs_watch(xenstore, path, token)) {
-        xen_be_printf(NULL, 0, "xen be: watching backend path (%s) failed\n", path);
+        xen_pv_printf(NULL, 0, "xen be: watching backend path (%s) failed\n",
+                      path);
         return -1;
     }
 
@@ -637,8 +458,8 @@ static int xenstore_scan(const char *type, int dom, struct XenDevOps *ops)
     return 0;
 }
 
-static void xenstore_update_be(char *watch, char *type, int dom,
-                               struct XenDevOps *ops)
+void xenstore_update_be(char *watch, char *type, int dom,
+                        struct XenDevOps *ops)
 {
     struct XenDevice *xendev;
     char path[XEN_BUFSIZE], *bepath;
@@ -662,7 +483,7 @@ static void xenstore_update_be(char *watch, char *type, int dom,
     if (xendev != NULL) {
         bepath = xs_read(xenstore, 0, xendev->be, &len);
         if (bepath == NULL) {
-            xen_be_del_xendev(xendev);
+            xen_pv_del_xendev(xendev);
         } else {
             free(bepath);
             xen_be_backend_changed(xendev, path);
@@ -671,7 +492,7 @@ static void xenstore_update_be(char *watch, char *type, int dom,
     }
 }
 
-static void xenstore_update_fe(char *watch, struct XenDevice *xendev)
+void xenstore_update_fe(char *watch, struct XenDevice *xendev)
 {
     char *node;
     unsigned int len;
@@ -688,56 +509,13 @@ static void xenstore_update_fe(char *watch, struct XenDevice *xendev)
     xen_be_frontend_changed(xendev, node);
     xen_be_check_state(xendev);
 }
-
-static void xenstore_update(void *unused)
-{
-    char **vec = NULL;
-    intptr_t type, ops, ptr;
-    unsigned int dom, count;
-
-    vec = xs_read_watch(xenstore, &count);
-    if (vec == NULL) {
-        goto cleanup;
-    }
-
-    if (sscanf(vec[XS_WATCH_TOKEN], "be:%" PRIxPTR ":%d:%" PRIxPTR,
-               &type, &dom, &ops) == 3) {
-        xenstore_update_be(vec[XS_WATCH_PATH], (void*)type, dom, (void*)ops);
-    }
-    if (sscanf(vec[XS_WATCH_TOKEN], "fe:%" PRIxPTR, &ptr) == 1) {
-        xenstore_update_fe(vec[XS_WATCH_PATH], (void*)ptr);
-    }
-
-cleanup:
-    free(vec);
-}
-
-static void xen_be_evtchn_event(void *opaque)
-{
-    struct XenDevice *xendev = opaque;
-    evtchn_port_t port;
-
-    port = xenevtchn_pending(xendev->evtchndev);
-    if (port != xendev->local_port) {
-        xen_be_printf(xendev, 0,
-                      "xenevtchn_pending returned %d (expected %d)\n",
-                      port, xendev->local_port);
-        return;
-    }
-    xenevtchn_unmask(xendev->evtchndev, port);
-
-    if (xendev->ops->event) {
-        xendev->ops->event(xendev);
-    }
-}
-
 /* -------------------------------------------------------------------- */
 
 int xen_be_init(void)
 {
     xenstore = xs_daemon_open();
     if (!xenstore) {
-        xen_be_printf(NULL, 0, "can't connect to xenstored\n");
+        xen_pv_printf(NULL, 0, "can't connect to xenstored\n");
         return -1;
     }
 
@@ -798,69 +576,15 @@ int xen_be_bind_evtchn(struct XenDevice *xendev)
     xendev->local_port = xenevtchn_bind_interdomain
         (xendev->evtchndev, xendev->dom, xendev->remote_port);
     if (xendev->local_port == -1) {
-        xen_be_printf(xendev, 0, "xenevtchn_bind_interdomain failed\n");
+        xen_pv_printf(xendev, 0, "xenevtchn_bind_interdomain failed\n");
         return -1;
     }
-    xen_be_printf(xendev, 2, "bind evtchn port %d\n", xendev->local_port);
+    xen_pv_printf(xendev, 2, "bind evtchn port %d\n", xendev->local_port);
     qemu_set_fd_handler(xenevtchn_fd(xendev->evtchndev),
-                        xen_be_evtchn_event, NULL, xendev);
+                        xen_pv_evtchn_event, NULL, xendev);
     return 0;
 }
 
-void xen_be_unbind_evtchn(struct XenDevice *xendev)
-{
-    if (xendev->local_port == -1) {
-        return;
-    }
-    qemu_set_fd_handler(xenevtchn_fd(xendev->evtchndev), NULL, NULL, NULL);
-    xenevtchn_unbind(xendev->evtchndev, xendev->local_port);
-    xen_be_printf(xendev, 2, "unbind evtchn port %d\n", xendev->local_port);
-    xendev->local_port = -1;
-}
-
-int xen_be_send_notify(struct XenDevice *xendev)
-{
-    return xenevtchn_notify(xendev->evtchndev, xendev->local_port);
-}
-
-/*
- * msg_level:
- *  0 == errors (stderr + logfile).
- *  1 == informative debug messages (logfile only).
- *  2 == noisy debug messages (logfile only).
- *  3 == will flood your log (logfile only).
- */
-void xen_be_printf(struct XenDevice *xendev, int msg_level, const char *fmt, ...)
-{
-    va_list args;
-
-    if (xendev) {
-        if (msg_level > xendev->debug) {
-            return;
-        }
-        qemu_log("xen be: %s: ", xendev->name);
-        if (msg_level == 0) {
-            fprintf(stderr, "xen be: %s: ", xendev->name);
-        }
-    } else {
-        if (msg_level > debug) {
-            return;
-        }
-        qemu_log("xen be core: ");
-        if (msg_level == 0) {
-            fprintf(stderr, "xen be core: ");
-        }
-    }
-    va_start(args, fmt);
-    qemu_log_vprintf(fmt, args);
-    va_end(args);
-    if (msg_level == 0) {
-        va_start(args, fmt);
-        vfprintf(stderr, fmt, args);
-        va_end(args);
-    }
-    qemu_log_flush();
-}
 
 static int xen_sysdev_init(SysBusDevice *dev)
 {
diff --git a/hw/xen/xen_devconfig.c b/hw/xen/xen_devconfig.c
index b7d290df6c..a80e78c0dc 100644
--- a/hw/xen/xen_devconfig.c
+++ b/hw/xen/xen_devconfig.c
@@ -55,7 +55,7 @@ int xen_config_dev_blk(DriveInfo *disk)
     const char *filename = qemu_opt_get(disk->opts, "file");
 
     snprintf(device_name, sizeof(device_name), "xvd%c", 'a' + disk->unit);
-    xen_be_printf(NULL, 1, "config disk %d [%s]: %s\n",
+    xen_pv_printf(NULL, 1, "config disk %d [%s]: %s\n",
                   disk->unit, device_name, filename);
     xen_config_dev_dirs("vbd", "qdisk", vdev, fe, be, sizeof(fe));
 
@@ -83,7 +83,7 @@ int xen_config_dev_nic(NICInfo *nic)
     snprintf(mac, sizeof(mac), "%02x:%02x:%02x:%02x:%02x:%02x",
              nic->macaddr.a[0], nic->macaddr.a[1], nic->macaddr.a[2],
              nic->macaddr.a[3], nic->macaddr.a[4], nic->macaddr.a[5]);
-    xen_be_printf(NULL, 1, "config nic %d: mac=\"%s\"\n", vlan_id, mac);
+    xen_pv_printf(NULL, 1, "config nic %d: mac=\"%s\"\n", vlan_id, mac);
     xen_config_dev_dirs("vif", "qnic", vlan_id, fe, be, sizeof(fe));
 
     /* frontend */
diff --git a/hw/xen/xen_pvdev.c b/hw/xen/xen_pvdev.c
new file mode 100644
index 0000000000..405e15484c
--- /dev/null
+++ b/hw/xen/xen_pvdev.c
@@ -0,0 +1,316 @@
+/*
+ * Xen para-virtualization device
+ *
+ *  (c) 2008 Gerd Hoffmann <kraxel@redhat.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>
+ */
+
+#include "qemu/osdep.h"
+
+#include "hw/xen/xen_backend.h"
+#include "hw/xen/xen_pvdev.h"
+
+/* private */
+static int debug;
+
+struct xs_dirs {
+    char *xs_dir;
+    QTAILQ_ENTRY(xs_dirs) list;
+};
+
+static QTAILQ_HEAD(xs_dirs_head, xs_dirs) xs_cleanup =
+    QTAILQ_HEAD_INITIALIZER(xs_cleanup);
+
+static QTAILQ_HEAD(XenDeviceHead, XenDevice) xendevs =
+    QTAILQ_HEAD_INITIALIZER(xendevs);
+
+/* ------------------------------------------------------------- */
+
+static void xenstore_cleanup_dir(char *dir)
+{
+    struct xs_dirs *d;
+
+    d = g_malloc(sizeof(*d));
+    d->xs_dir = dir;
+    QTAILQ_INSERT_TAIL(&xs_cleanup, d, list);
+}
+
+void xen_config_cleanup(void)
+{
+    struct xs_dirs *d;
+
+    QTAILQ_FOREACH(d, &xs_cleanup, list) {
+        xs_rm(xenstore, 0, d->xs_dir);
+    }
+}
+
+int xenstore_mkdir(char *path, int p)
+{
+    struct xs_permissions perms[2] = {
+        {
+            .id    = 0, /* set owner: dom0 */
+        }, {
+            .id    = xen_domid,
+            .perms = p,
+        }
+    };
+
+    if (!xs_mkdir(xenstore, 0, path)) {
+        xen_pv_printf(NULL, 0, "xs_mkdir %s: failed\n", path);
+        return -1;
+    }
+    xenstore_cleanup_dir(g_strdup(path));
+
+    if (!xs_set_permissions(xenstore, 0, path, perms, 2)) {
+        xen_pv_printf(NULL, 0, "xs_set_permissions %s: failed\n", path);
+        return -1;
+    }
+    return 0;
+}
+
+int xenstore_write_str(const char *base, const char *node, const char *val)
+{
+    char abspath[XEN_BUFSIZE];
+
+    snprintf(abspath, sizeof(abspath), "%s/%s", base, node);
+    if (!xs_write(xenstore, 0, abspath, val, strlen(val))) {
+        return -1;
+    }
+    return 0;
+}
+
+char *xenstore_read_str(const char *base, const char *node)
+{
+    char abspath[XEN_BUFSIZE];
+    unsigned int len;
+    char *str, *ret = NULL;
+
+    snprintf(abspath, sizeof(abspath), "%s/%s", base, node);
+    str = xs_read(xenstore, 0, abspath, &len);
+    if (str != NULL) {
+        /* move to qemu-allocated memory to make sure
+         * callers can safely g_free() stuff. */
+        ret = g_strdup(str);
+        free(str);
+    }
+    return ret;
+}
+
+int xenstore_write_int(const char *base, const char *node, int ival)
+{
+    char val[12];
+
+    snprintf(val, sizeof(val), "%d", ival);
+    return xenstore_write_str(base, node, val);
+}
+
+int xenstore_write_int64(const char *base, const char *node, int64_t ival)
+{
+    char val[21];
+
+    snprintf(val, sizeof(val), "%"PRId64, ival);
+    return xenstore_write_str(base, node, val);
+}
+
+int xenstore_read_int(const char *base, const char *node, int *ival)
+{
+    char *val;
+    int rc = -1;
+
+    val = xenstore_read_str(base, node);
+    if (val && 1 == sscanf(val, "%d", ival)) {
+        rc = 0;
+    }
+    g_free(val);
+    return rc;
+}
+
+int xenstore_read_uint64(const char *base, const char *node, uint64_t *uval)
+{
+    char *val;
+    int rc = -1;
+
+    val = xenstore_read_str(base, node);
+    if (val && 1 == sscanf(val, "%"SCNu64, uval)) {
+        rc = 0;
+    }
+    g_free(val);
+    return rc;
+}
+
+/* Read one xenstore watch event and dispatch it by its token prefix. */
+void xenstore_update(void *unused)
+{
+    uintptr_t type, ops, ptr;
+    unsigned int count;
+    int dom;
+    char **vec = xs_read_watch(xenstore, &count);
+
+    if (vec == NULL) {
+        return;
+    }
+
+    if (sscanf(vec[XS_WATCH_TOKEN], "be:%" SCNxPTR ":%d:%" SCNxPTR,
+               &type, &dom, &ops) == 3) {
+        xenstore_update_be(vec[XS_WATCH_PATH], (void *)type, dom, (void *)ops);
+    }
+    if (sscanf(vec[XS_WATCH_TOKEN], "fe:%" SCNxPTR, &ptr) == 1) {
+        xenstore_update_fe(vec[XS_WATCH_PATH], (void *)ptr);
+    }
+
+    free(vec);
+}
+
+const char *xenbus_strstate(enum xenbus_state state)
+{
+    static const char *const name[] = {
+        [XenbusStateUnknown]       = "Unknown",
+        [XenbusStateInitialising]  = "Initialising",
+        [XenbusStateInitWait]      = "InitWait",
+        [XenbusStateInitialised]   = "Initialised",
+        [XenbusStateConnected]     = "Connected",
+        [XenbusStateClosing]       = "Closing",
+        [XenbusStateClosed]        = "Closed",
+    };
+    return (state < ARRAY_SIZE(name)) ? name[state] : "INVALID";
+}
+
+/*
+ * msg_level:
+ *  0 == errors (stderr + logfile).
+ *  1 == informative debug messages (logfile only).
+ *  2 == noisy debug messages (logfile only).
+ *  3 == will flood your log (logfile only).
+ */
+void xen_pv_printf(struct XenDevice *xendev, int msg_level,
+                   const char *fmt, ...)
+{
+    va_list args;
+
+    if (xendev) {
+        if (msg_level > xendev->debug) {
+            return;
+        }
+        qemu_log("xen be: %s: ", xendev->name);
+        if (msg_level == 0) {
+            fprintf(stderr, "xen be: %s: ", xendev->name);
+        }
+    } else {
+        if (msg_level > debug) {
+            return;
+        }
+        qemu_log("xen be core: ");
+        if (msg_level == 0) {
+            fprintf(stderr, "xen be core: ");
+        }
+    }
+    va_start(args, fmt);
+    qemu_log_vprintf(fmt, args);
+    va_end(args);
+    if (msg_level == 0) {
+        va_start(args, fmt);
+        vfprintf(stderr, fmt, args);
+        va_end(args);
+    }
+    qemu_log_flush();
+}
+
+void xen_pv_evtchn_event(void *opaque)
+{
+    struct XenDevice *xendev = opaque;
+    evtchn_port_t port;
+
+    port = xenevtchn_pending(xendev->evtchndev);
+    if (port != xendev->local_port) {
+        xen_pv_printf(xendev, 0,
+                      "xenevtchn_pending returned %d (expected %d)\n",
+                      port, xendev->local_port);
+        return;
+    }
+    xenevtchn_unmask(xendev->evtchndev, port);
+
+    if (xendev->ops->event) {
+        xendev->ops->event(xendev);
+    }
+}
+
+void xen_pv_unbind_evtchn(struct XenDevice *xendev)
+{
+    if (xendev->local_port == -1) {
+        return;
+    }
+    qemu_set_fd_handler(xenevtchn_fd(xendev->evtchndev), NULL, NULL, NULL);
+    xenevtchn_unbind(xendev->evtchndev, xendev->local_port);
+    xen_pv_printf(xendev, 2, "unbind evtchn port %d\n", xendev->local_port);
+    xendev->local_port = -1;
+}
+
+int xen_pv_send_notify(struct XenDevice *xendev)
+{
+    return xenevtchn_notify(xendev->evtchndev, xendev->local_port);
+}
+
+/* ------------------------------------------------------------- */
+
+struct XenDevice *xen_pv_find_xendev(const char *type, int dom, int dev)
+{
+    struct XenDevice *xendev;
+
+    QTAILQ_FOREACH(xendev, &xendevs, next) {
+        if (xendev->dom != dom) {
+            continue;
+        }
+        if (xendev->dev != dev) {
+            continue;
+        }
+        if (strcmp(xendev->type, type) != 0) {
+            continue;
+        }
+        return xendev;
+    }
+    return NULL;
+}
+
+/*
+ * release xen backend device.
+ */
+void xen_pv_del_xendev(struct XenDevice *xendev)
+{
+    if (xendev->ops->free) {
+        xendev->ops->free(xendev);
+    }
+
+    if (xendev->fe) {
+        char token[XEN_BUFSIZE];
+        snprintf(token, sizeof(token), "fe:%p", xendev);
+        xs_unwatch(xenstore, xendev->fe, token);
+        g_free(xendev->fe);
+    }
+
+    if (xendev->evtchndev != NULL) {
+        xenevtchn_close(xendev->evtchndev);
+    }
+    if (xendev->gnttabdev != NULL) {
+        xengnttab_close(xendev->gnttabdev);
+    }
+
+    QTAILQ_REMOVE(&xendevs, xendev, next);
+    g_free(xendev);
+}
+
+void xen_pv_insert_xendev(struct XenDevice *xendev)
+{
+    QTAILQ_INSERT_TAIL(&xendevs, xendev, next);
+}
diff --git a/include/block/aio.h b/include/block/aio.h
index b9fe2cb37e..c7ae27c91c 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -18,7 +18,6 @@
 #include "qemu/queue.h"
 #include "qemu/event_notifier.h"
 #include "qemu/thread.h"
-#include "qemu/rfifolock.h"
 #include "qemu/timer.h"
 
 typedef struct BlockAIOCB BlockAIOCB;
@@ -54,7 +53,7 @@ struct AioContext {
     GSource source;
 
     /* Protects all fields from multi-threaded access */
-    RFifoLock lock;
+    QemuRecMutex lock;
 
     /* The list of registered AIO handlers */
     QLIST_HEAD(, AioHandler) aio_handlers;
@@ -116,9 +115,6 @@ struct AioContext {
     bool notified;
     EventNotifier notifier;
 
-    /* Scheduling this BH forces the event loop it iterate */
-    QEMUBH *notify_dummy_bh;
-
     /* Thread pool for performing work and receiving completion callbacks */
     struct ThreadPool *thread_pool;
 
@@ -453,6 +449,24 @@ static inline bool aio_node_check(AioContext *ctx, bool is_external)
 }
 
 /**
+ * Return the AioContext whose event loop runs in the current thread.
+ *
+ * If called from an IOThread this will be the IOThread's AioContext.  If
+ * called from another thread it will be the main loop AioContext.
+ */
+AioContext *qemu_get_current_aio_context(void);
+
+/**
+ * @ctx: the aio context
+ *
+ * Return whether we are running in the I/O thread that manages @ctx.
+ */
+static inline bool aio_context_in_iothread(AioContext *ctx)
+{
+    return ctx == qemu_get_current_aio_context();
+}
+
+/**
  * aio_context_setup:
  * @ctx: the aio context
  *
diff --git a/include/block/block.h b/include/block/block.h
index 398a050176..b7dc7d54ae 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -218,7 +218,7 @@ BlockDriverState *bdrv_open(const char *filename, const char *reference,
 BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                     BlockDriverState *bs,
                                     QDict *options, int flags);
-int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp);
+int bdrv_reopen_multiple(AioContext *ctx, BlockReopenQueue *bs_queue, Error **errp);
 int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp);
 int bdrv_reopen_prepare(BDRVReopenState *reopen_state,
                         BlockReopenQueue *queue, Error **errp);
@@ -334,6 +334,35 @@ void bdrv_drain(BlockDriverState *bs);
 void coroutine_fn bdrv_co_drain(BlockDriverState *bs);
 void bdrv_drain_all(void);
 
+#define BDRV_POLL_WHILE(bs, cond) ({                       \
+    bool waited_ = false;                                  \
+    BlockDriverState *bs_ = (bs);                          \
+    AioContext *ctx_ = bdrv_get_aio_context(bs_);          \
+    if (aio_context_in_iothread(ctx_)) {                   \
+        while ((cond)) {                                   \
+            aio_poll(ctx_, true);                          \
+            waited_ = true;                                \
+        }                                                  \
+    } else {                                               \
+        assert(qemu_get_current_aio_context() ==           \
+               qemu_get_aio_context());                    \
+        /* Ask bdrv_dec_in_flight to wake up the main      \
+         * QEMU AioContext.  Extra I/O threads never take  \
+         * other I/O threads' AioContexts (see for example \
+         * block_job_defer_to_main_loop for how to do it). \
+         */                                                \
+        assert(!bs_->wakeup);                              \
+        bs_->wakeup = true;                                \
+        while ((cond)) {                                   \
+            aio_context_release(ctx_);                     \
+            aio_poll(qemu_get_aio_context(), true);        \
+            aio_context_acquire(ctx_);                     \
+            waited_ = true;                                \
+        }                                                  \
+        bs_->wakeup = false;                               \
+    }                                                      \
+    waited_; })
+
 int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int count);
 int bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset, int count);
 int bdrv_has_zero_init_1(BlockDriverState *bs);
diff --git a/include/block/block_int.h b/include/block/block_int.h
index e96e9ada57..e7ff58419c 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -62,8 +62,6 @@
 enum BdrvTrackedRequestType {
     BDRV_TRACKED_READ,
     BDRV_TRACKED_WRITE,
-    BDRV_TRACKED_FLUSH,
-    BDRV_TRACKED_IOCTL,
     BDRV_TRACKED_DISCARD,
 };
 
@@ -445,7 +443,7 @@ struct BlockDriverState {
                          note this is a reference count */
 
     CoQueue flush_queue;            /* Serializing flush queue */
-    BdrvTrackedRequest *active_flush_req; /* Flush request in flight */
+    bool active_flush_req;          /* Flush request in flight? */
     unsigned int write_gen;         /* Current data generation */
     unsigned int flushed_gen;       /* Flushed write generation */
 
@@ -473,9 +471,12 @@ struct BlockDriverState {
     /* Callback before write request is processed */
     NotifierWithReturnList before_write_notifiers;
 
-    /* number of in-flight serialising requests */
+    /* number of in-flight requests; overall and serialising */
+    unsigned int in_flight;
     unsigned int serialising_in_flight;
 
+    bool wakeup;
+
     /* Offset after the highest byte written to */
     uint64_t wr_highest_offset;
 
@@ -634,6 +635,21 @@ void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
                                       void (*aio_context_detached)(void *),
                                       void *opaque);
 
+/**
+ * bdrv_wakeup:
+ * @bs: The BlockDriverState for which an I/O operation has been completed.
+ *
+ * Wake up the main thread if it is waiting on BDRV_POLL_WHILE.  During
+ * synchronous I/O on a BlockDriverState that is attached to another
+ * I/O thread, the main thread lets the I/O thread's event loop run,
+ * waiting for the I/O operation to complete.  A bdrv_wakeup will wake
+ * up the main thread if necessary.
+ *
+ * Manual calls to bdrv_wakeup are rarely necessary, because
+ * bdrv_dec_in_flight already calls it.
+ */
+void bdrv_wakeup(BlockDriverState *bs);
+
 #ifdef _WIN32
 int is_windows_drive(const char *filename);
 #endif
@@ -787,6 +803,9 @@ bool bdrv_requests_pending(BlockDriverState *bs);
 void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out);
 void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in);
 
+void bdrv_inc_in_flight(BlockDriverState *bs);
+void bdrv_dec_in_flight(BlockDriverState *bs);
+
 void blockdev_close_all_bdrv_states(void);
 
 #endif /* BLOCK_INT_H */
diff --git a/include/block/blockjob.h b/include/block/blockjob.h
index 4ddb4ae2e1..2bb39f4d29 100644
--- a/include/block/blockjob.h
+++ b/include/block/blockjob.h
@@ -92,6 +92,13 @@ typedef struct BlockJobDriver {
      * besides job->blk to the new AioContext.
      */
     void (*attached_aio_context)(BlockJob *job, AioContext *new_context);
+
+    /*
+     * If the callback is not NULL, it will be invoked when the job has to be
+     * synchronously cancelled or completed; it should drain BlockDriverStates
+     * as required to ensure progress.
+     */
+    void (*drain)(BlockJob *job);
 } BlockJobDriver;
 
 /**
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 17fff80c8a..98dc7722c3 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -13,7 +13,6 @@
 #include "qemu/bitmap.h"
 #include "sysemu/sysemu.h"
 #include "hw/pci/pci.h"
-#include "hw/boards.h"
 #include "hw/compat.h"
 #include "hw/mem/pc-dimm.h"
 #include "hw/mem/nvdimm.h"
diff --git a/include/hw/xen/xen_backend.h b/include/hw/xen/xen_backend.h
index 0df282ab5f..cbda40ee53 100644
--- a/include/hw/xen/xen_backend.h
+++ b/include/hw/xen/xen_backend.h
@@ -2,60 +2,10 @@
 #define QEMU_HW_XEN_BACKEND_H
 
 #include "hw/xen/xen_common.h"
+#include "hw/xen/xen_pvdev.h"
 #include "sysemu/sysemu.h"
 #include "net/net.h"
 
-/* ------------------------------------------------------------- */
-
-#define XEN_BUFSIZE 1024
-
-struct XenDevice;
-
-/* driver uses grant tables  ->  open gntdev device (xendev->gnttabdev) */
-#define DEVOPS_FLAG_NEED_GNTDEV   1
-/* don't expect frontend doing correct state transitions (aka console quirk) */
-#define DEVOPS_FLAG_IGNORE_STATE  2
-
-struct XenDevOps {
-    size_t    size;
-    uint32_t  flags;
-    void      (*alloc)(struct XenDevice *xendev);
-    int       (*init)(struct XenDevice *xendev);
-    int       (*initialise)(struct XenDevice *xendev);
-    void      (*connected)(struct XenDevice *xendev);
-    void      (*event)(struct XenDevice *xendev);
-    void      (*disconnect)(struct XenDevice *xendev);
-    int       (*free)(struct XenDevice *xendev);
-    void      (*backend_changed)(struct XenDevice *xendev, const char *node);
-    void      (*frontend_changed)(struct XenDevice *xendev, const char *node);
-    int       (*backend_register)(void);
-};
-
-struct XenDevice {
-    const char         *type;
-    int                dom;
-    int                dev;
-    char               name[64];
-    int                debug;
-
-    enum xenbus_state  be_state;
-    enum xenbus_state  fe_state;
-    int                online;
-    char               be[XEN_BUFSIZE];
-    char               *fe;
-    char               *protocol;
-    int                remote_port;
-    int                local_port;
-
-    xenevtchn_handle   *evtchndev;
-    xengnttab_handle   *gnttabdev;
-
-    struct XenDevOps   *ops;
-    QTAILQ_ENTRY(XenDevice) next;
-};
-
-/* ------------------------------------------------------------- */
-
 /* variables */
 extern xc_interface *xen_xc;
 extern xenforeignmemory_handle *xen_fmem;
@@ -63,26 +13,20 @@ extern struct xs_handle *xenstore;
 extern const char *xen_protocol;
 extern DeviceState *xen_sysdev;
 
-/* xenstore helper functions */
 int xenstore_mkdir(char *path, int p);
-int xenstore_write_str(const char *base, const char *node, const char *val);
-int xenstore_write_int(const char *base, const char *node, int ival);
-int xenstore_write_int64(const char *base, const char *node, int64_t ival);
-char *xenstore_read_str(const char *base, const char *node);
-int xenstore_read_int(const char *base, const char *node, int *ival);
-
 int xenstore_write_be_str(struct XenDevice *xendev, const char *node, const char *val);
 int xenstore_write_be_int(struct XenDevice *xendev, const char *node, int ival);
 int xenstore_write_be_int64(struct XenDevice *xendev, const char *node, int64_t ival);
 char *xenstore_read_be_str(struct XenDevice *xendev, const char *node);
 int xenstore_read_be_int(struct XenDevice *xendev, const char *node, int *ival);
+void xenstore_update_fe(char *watch, struct XenDevice *xendev);
+void xenstore_update_be(char *watch, char *type, int dom,
+                        struct XenDevOps *ops);
 char *xenstore_read_fe_str(struct XenDevice *xendev, const char *node);
 int xenstore_read_fe_int(struct XenDevice *xendev, const char *node, int *ival);
-int xenstore_read_uint64(const char *base, const char *node, uint64_t *uval);
-int xenstore_read_fe_uint64(struct XenDevice *xendev, const char *node, uint64_t *uval);
+int xenstore_read_fe_uint64(struct XenDevice *xendev, const char *node,
+                            uint64_t *uval);
 
-const char *xenbus_strstate(enum xenbus_state state);
-struct XenDevice *xen_be_find_xendev(const char *type, int dom, int dev);
 void xen_be_check_state(struct XenDevice *xendev);
 
 /* xen backend driver bits */
@@ -91,10 +35,6 @@ void xen_be_register_common(void);
 int xen_be_register(const char *type, struct XenDevOps *ops);
 int xen_be_set_state(struct XenDevice *xendev, enum xenbus_state state);
 int xen_be_bind_evtchn(struct XenDevice *xendev);
-void xen_be_unbind_evtchn(struct XenDevice *xendev);
-int xen_be_send_notify(struct XenDevice *xendev);
-void xen_be_printf(struct XenDevice *xendev, int msg_level, const char *fmt, ...)
-    GCC_FMT_ATTR(3, 4);
 
 /* actual backend drivers */
 extern struct XenDevOps xen_console_ops;      /* xen_console.c     */
diff --git a/include/hw/xen/xen_pvdev.h b/include/hw/xen/xen_pvdev.h
new file mode 100644
index 0000000000..083f0a9cc7
--- /dev/null
+++ b/include/hw/xen/xen_pvdev.h
@@ -0,0 +1,78 @@
+#ifndef QEMU_HW_XEN_PVDEV_H
+#define QEMU_HW_XEN_PVDEV_H
+
+#include "hw/xen/xen_common.h"
+/* ------------------------------------------------------------- */
+
+#define XEN_BUFSIZE 1024
+
+struct XenDevice;
+
+/* driver uses grant tables  ->  open gntdev device (xendev->gnttabdev) */
+#define DEVOPS_FLAG_NEED_GNTDEV   1
+/* don't expect frontend doing correct state transitions (aka console quirk) */
+#define DEVOPS_FLAG_IGNORE_STATE  2
+
+struct XenDevOps {
+    size_t    size;
+    uint32_t  flags;
+    void      (*alloc)(struct XenDevice *xendev);
+    int       (*init)(struct XenDevice *xendev);
+    int       (*initialise)(struct XenDevice *xendev);
+    void      (*connected)(struct XenDevice *xendev);
+    void      (*event)(struct XenDevice *xendev);
+    void      (*disconnect)(struct XenDevice *xendev);
+    int       (*free)(struct XenDevice *xendev);
+    void      (*backend_changed)(struct XenDevice *xendev, const char *node);
+    void      (*frontend_changed)(struct XenDevice *xendev, const char *node);
+    int       (*backend_register)(void);
+};
+
+struct XenDevice {
+    const char         *type;
+    int                dom;
+    int                dev;
+    char               name[64];
+    int                debug;
+
+    enum xenbus_state  be_state;
+    enum xenbus_state  fe_state;
+    int                online;
+    char               be[XEN_BUFSIZE];
+    char               *fe;
+    char               *protocol;
+    int                remote_port;
+    int                local_port;
+
+    xenevtchn_handle   *evtchndev;
+    xengnttab_handle   *gnttabdev;
+
+    struct XenDevOps   *ops;
+    QTAILQ_ENTRY(XenDevice) next;
+};
+
+/* ------------------------------------------------------------- */
+
+/* xenstore helper functions */
+int xenstore_write_str(const char *base, const char *node, const char *val);
+int xenstore_write_int(const char *base, const char *node, int ival);
+int xenstore_write_int64(const char *base, const char *node, int64_t ival);
+char *xenstore_read_str(const char *base, const char *node);
+int xenstore_read_int(const char *base, const char *node, int *ival);
+int xenstore_read_uint64(const char *base, const char *node, uint64_t *uval);
+void xenstore_update(void *unused);
+
+const char *xenbus_strstate(enum xenbus_state state);
+
+void xen_pv_evtchn_event(void *opaque);
+void xen_pv_insert_xendev(struct XenDevice *xendev);
+void xen_pv_del_xendev(struct XenDevice *xendev);
+struct XenDevice *xen_pv_find_xendev(const char *type, int dom, int dev);
+
+void xen_pv_unbind_evtchn(struct XenDevice *xendev);
+int xen_pv_send_notify(struct XenDevice *xendev);
+
+void xen_pv_printf(struct XenDevice *xendev, int msg_level,
+                   const char *fmt, ...)  GCC_FMT_ATTR(3, 4);
+
+#endif /* QEMU_HW_XEN_PVDEV_H */
diff --git a/include/migration/colo.h b/include/migration/colo.h
new file mode 100644
index 0000000000..e32eef4763
--- /dev/null
+++ b/include/migration/colo.h
@@ -0,0 +1,38 @@
+/*
+ * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO)
+ * (a.k.a. Fault Tolerance or Continuous Replication)
+ *
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+ * Copyright (c) 2016 FUJITSU LIMITED
+ * Copyright (c) 2016 Intel Corporation
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later.  See the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_COLO_H
+#define QEMU_COLO_H
+
+#include "qemu-common.h"
+#include "migration/migration.h"
+#include "qemu/coroutine_int.h"
+#include "qemu/thread.h"
+#include "qemu/main-loop.h"
+
+bool colo_supported(void);
+void colo_info_init(void);
+
+void migrate_start_colo_process(MigrationState *s);
+bool migration_in_colo_state(void);
+
+/* loadvm */
+bool migration_incoming_enable_colo(void);
+void migration_incoming_exit_colo(void);
+void *colo_process_incoming_thread(void *opaque);
+bool migration_incoming_in_colo_state(void);
+
+COLOMode get_colo_mode(void);
+
+/* failover */
+void colo_do_failover(MigrationState *s);
+#endif
diff --git a/include/migration/failover.h b/include/migration/failover.h
new file mode 100644
index 0000000000..ad91ef2381
--- /dev/null
+++ b/include/migration/failover.h
@@ -0,0 +1,26 @@
+/*
+ *  COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO)
+ *  (a.k.a. Fault Tolerance or Continuous Replication)
+ *
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO.,LTD.
+ * Copyright (c) 2016 FUJITSU LIMITED
+ * Copyright (c) 2016 Intel Corporation
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later.  See the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_FAILOVER_H
+#define QEMU_FAILOVER_H
+
+#include "qemu-common.h"
+#include "qapi-types.h"
+
+void failover_init_state(void);
+FailoverStatus failover_set_state(FailoverStatus old_state,
+                                     FailoverStatus new_state);
+FailoverStatus failover_get_state(void);
+void failover_request_active(Error **errp);
+bool failover_request_is_active(void);
+
+#endif
diff --git a/include/migration/migration.h b/include/migration/migration.h
index 2791b90c00..c309d23370 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -21,6 +21,7 @@
 #include "migration/vmstate.h"
 #include "qapi-types.h"
 #include "exec/cpu-common.h"
+#include "qemu/coroutine_int.h"
 
 #define QEMU_VM_FILE_MAGIC           0x5145564d
 #define QEMU_VM_FILE_VERSION_COMPAT  0x00000002
@@ -107,6 +108,12 @@ struct MigrationIncomingState {
     QEMUBH *bh;
 
     int state;
+
+    bool have_colo_incoming_thread;
+    QemuThread colo_incoming_thread;
+    /* The coroutine we should enter (back) after failover */
+    Coroutine *migration_incoming_co;
+
     /* See savevm.c */
     LoadStateEntry_Head loadvm_handlers;
 };
@@ -298,6 +305,7 @@ int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen);
 
 int migrate_use_xbzrle(void);
 int64_t migrate_xbzrle_cache_size(void);
+bool migrate_colo_enabled(void);
 
 int64_t xbzrle_cache_resize(int64_t new_size);
 
diff --git a/include/monitor/monitor.h b/include/monitor/monitor.h
index a714d8ef80..8cc532ec0e 100644
--- a/include/monitor/monitor.h
+++ b/include/monitor/monitor.h
@@ -9,7 +9,7 @@
 extern Monitor *cur_mon;
 
 /* flags for monitor_init */
-#define MONITOR_IS_DEFAULT    0x01
+/* 0x01 unused */
 #define MONITOR_USE_READLINE  0x02
 #define MONITOR_USE_CONTROL   0x04
 #define MONITOR_USE_PRETTY    0x08
diff --git a/include/qemu/rfifolock.h b/include/qemu/rfifolock.h
deleted file mode 100644
index b23ab538a6..0000000000
--- a/include/qemu/rfifolock.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Recursive FIFO lock
- *
- * Copyright Red Hat, Inc. 2013
- *
- * Authors:
- *  Stefan Hajnoczi   <stefanha@redhat.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2 or later.
- * See the COPYING file in the top-level directory.
- *
- */
-
-#ifndef QEMU_RFIFOLOCK_H
-#define QEMU_RFIFOLOCK_H
-
-#include "qemu/thread.h"
-
-/* Recursive FIFO lock
- *
- * This lock provides more features than a plain mutex:
- *
- * 1. Fairness - enforces FIFO order.
- * 2. Nesting - can be taken recursively.
- * 3. Contention callback - optional, called when thread must wait.
- *
- * The recursive FIFO lock is heavyweight so prefer other synchronization
- * primitives if you do not need its features.
- */
-typedef struct {
-    QemuMutex lock;             /* protects all fields */
-
-    /* FIFO order */
-    unsigned int head;          /* active ticket number */
-    unsigned int tail;          /* waiting ticket number */
-    QemuCond cond;              /* used to wait for our ticket number */
-
-    /* Nesting */
-    QemuThread owner_thread;    /* thread that currently has ownership */
-    unsigned int nesting;       /* amount of nesting levels */
-
-    /* Contention callback */
-    void (*cb)(void *);         /* called when thread must wait, with ->lock
-                                 * held so it may not recursively lock/unlock
-                                 */
-    void *cb_opaque;
-} RFifoLock;
-
-void rfifolock_init(RFifoLock *r, void (*cb)(void *), void *opaque);
-void rfifolock_destroy(RFifoLock *r);
-void rfifolock_lock(RFifoLock *r);
-void rfifolock_unlock(RFifoLock *r);
-
-#endif /* QEMU_RFIFOLOCK_H */
diff --git a/include/qemu/thread-posix.h b/include/qemu/thread-posix.h
index aa03567e5e..09d1e15728 100644
--- a/include/qemu/thread-posix.h
+++ b/include/qemu/thread-posix.h
@@ -4,6 +4,13 @@
 #include <pthread.h>
 #include <semaphore.h>
 
+/* QemuRecMutex aliases QemuMutex; ops other than init map to qemu_mutex_*. */
+typedef QemuMutex QemuRecMutex;
+#define qemu_rec_mutex_destroy qemu_mutex_destroy
+#define qemu_rec_mutex_lock qemu_mutex_lock
+#define qemu_rec_mutex_trylock qemu_mutex_trylock
+#define qemu_rec_mutex_unlock qemu_mutex_unlock
+
 struct QemuMutex {
     pthread_mutex_t lock;
 };
diff --git a/include/qemu/thread-win32.h b/include/qemu/thread-win32.h
index c7ce8dcd45..5fb6541ae9 100644
--- a/include/qemu/thread-win32.h
+++ b/include/qemu/thread-win32.h
@@ -8,6 +8,16 @@ struct QemuMutex {
     LONG owner;
 };
 
+typedef struct QemuRecMutex QemuRecMutex;
+struct QemuRecMutex {
+    CRITICAL_SECTION lock;
+};
+
+void qemu_rec_mutex_destroy(QemuRecMutex *mutex);
+void qemu_rec_mutex_lock(QemuRecMutex *mutex);
+int qemu_rec_mutex_trylock(QemuRecMutex *mutex);
+void qemu_rec_mutex_unlock(QemuRecMutex *mutex);
+
 struct QemuCond {
     LONG waiters, target;
     HANDLE sema;
diff --git a/include/qemu/thread.h b/include/qemu/thread.h
index 31237e93ee..e8e665f020 100644
--- a/include/qemu/thread.h
+++ b/include/qemu/thread.h
@@ -25,6 +25,9 @@ void qemu_mutex_lock(QemuMutex *mutex);
 int qemu_mutex_trylock(QemuMutex *mutex);
 void qemu_mutex_unlock(QemuMutex *mutex);
 
+/* Prototypes for other functions are in thread-posix.h/thread-win32.h.  */
+void qemu_rec_mutex_init(QemuRecMutex *mutex);
+
 void qemu_cond_init(QemuCond *cond);
 void qemu_cond_destroy(QemuCond *cond);
 
diff --git a/iothread.c b/iothread.c
index fbeb8deb38..bd70344811 100644
--- a/iothread.c
+++ b/iothread.c
@@ -16,10 +16,12 @@
 #include "qom/object_interfaces.h"
 #include "qemu/module.h"
 #include "block/aio.h"
+#include "block/block.h"
 #include "sysemu/iothread.h"
 #include "qmp-commands.h"
 #include "qemu/error-report.h"
 #include "qemu/rcu.h"
+#include "qemu/main-loop.h"
 
 typedef ObjectClass IOThreadClass;
 
@@ -28,26 +30,27 @@ typedef ObjectClass IOThreadClass;
 #define IOTHREAD_CLASS(klass) \
    OBJECT_CLASS_CHECK(IOThreadClass, klass, TYPE_IOTHREAD)
 
+static __thread IOThread *my_iothread;
+
+AioContext *qemu_get_current_aio_context(void)
+{
+    return my_iothread ? my_iothread->ctx : qemu_get_aio_context();
+}
+
 static void *iothread_run(void *opaque)
 {
     IOThread *iothread = opaque;
-    bool blocking;
 
     rcu_register_thread();
 
+    my_iothread = iothread;
     qemu_mutex_lock(&iothread->init_done_lock);
     iothread->thread_id = qemu_get_thread_id();
     qemu_cond_signal(&iothread->init_done_cond);
     qemu_mutex_unlock(&iothread->init_done_lock);
 
-    while (!iothread->stopping) {
-        aio_context_acquire(iothread->ctx);
-        blocking = true;
-        while (!iothread->stopping && aio_poll(iothread->ctx, blocking)) {
-            /* Progress was made, keep going */
-            blocking = false;
-        }
-        aio_context_release(iothread->ctx);
+    while (!atomic_read(&iothread->stopping)) {
+        aio_poll(iothread->ctx, true);
     }
 
     rcu_unregister_thread();
@@ -190,6 +193,18 @@ IOThreadInfoList *qmp_query_iothreads(Error **errp)
 void iothread_stop_all(void)
 {
     Object *container = object_get_objects_root();
+    BlockDriverState *bs;
+    BdrvNextIterator it;
+
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
+        AioContext *ctx = bdrv_get_aio_context(bs);
+        if (ctx == qemu_get_aio_context()) {
+            continue;
+        }
+        aio_context_acquire(ctx);
+        bdrv_set_aio_context(bs, qemu_get_aio_context());
+        aio_context_release(ctx);
+    }
 
     object_child_foreach(container, iothread_stop, NULL);
 }
diff --git a/migration/Makefile.objs b/migration/Makefile.objs
index 30ad945918..3f3e237142 100644
--- a/migration/Makefile.objs
+++ b/migration/Makefile.objs
@@ -1,5 +1,7 @@
 common-obj-y += migration.o socket.o fd.o exec.o
 common-obj-y += tls.o
+common-obj-y += colo-comm.o
+common-obj-$(CONFIG_COLO) += colo.o colo-failover.o
 common-obj-y += vmstate.o
 common-obj-y += qemu-file.o
 common-obj-y += qemu-file-channel.o
diff --git a/migration/colo-comm.c b/migration/colo-comm.c
new file mode 100644
index 0000000000..20b60ec384
--- /dev/null
+++ b/migration/colo-comm.c
@@ -0,0 +1,72 @@
+/*
+ * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO)
+ * (a.k.a. Fault Tolerance or Continuous Replication)
+ *
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+ * Copyright (c) 2016 FUJITSU LIMITED
+ * Copyright (c) 2016 Intel Corporation
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later. See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include <migration/colo.h>
+#include "trace.h"
+
+typedef struct {
+     bool colo_requested;
+} COLOInfo;
+
+static COLOInfo colo_info;
+
+/* Map the current migration state onto a COLO mode (primary/secondary). */
+COLOMode get_colo_mode(void)
+{
+    /* The outgoing side is checked first, exactly as before. */
+    if (migration_in_colo_state()) {
+        return COLO_MODE_PRIMARY;
+    }
+    return migration_incoming_in_colo_state() ? COLO_MODE_SECONDARY
+                                              : COLO_MODE_UNKNOWN;
+}
+
+static void colo_info_pre_save(void *opaque)
+{
+    COLOInfo *s = opaque;
+
+    s->colo_requested = migrate_colo_enabled();
+}
+
+static bool colo_info_need(void *opaque)
+{
+   return migrate_colo_enabled();
+}
+
+static const VMStateDescription colo_state = {
+    .name = "COLOState",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .pre_save = colo_info_pre_save,
+    .needed = colo_info_need,
+    .fields = (VMStateField[]) {
+        VMSTATE_BOOL(colo_requested, COLOInfo),
+        VMSTATE_END_OF_LIST()
+    },
+};
+
+void colo_info_init(void)
+{
+    vmstate_register(NULL, 0, &colo_state, &colo_info);
+}
+
+bool migration_incoming_enable_colo(void)
+{
+    return colo_info.colo_requested;
+}
+
+void migration_incoming_exit_colo(void)
+{
+    colo_info.colo_requested = false;
+}
diff --git a/migration/colo-failover.c b/migration/colo-failover.c
new file mode 100644
index 0000000000..cc229f5ab1
--- /dev/null
+++ b/migration/colo-failover.c
@@ -0,0 +1,83 @@
+/*
+ * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO)
+ * (a.k.a. Fault Tolerance or Continuous Replication)
+ *
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+ * Copyright (c) 2016 FUJITSU LIMITED
+ * Copyright (c) 2016 Intel Corporation
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later.  See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "migration/colo.h"
+#include "migration/failover.h"
+#include "qmp-commands.h"
+#include "qapi/qmp/qerror.h"
+#include "qemu/error-report.h"
+#include "trace.h"
+
+static QEMUBH *failover_bh;
+static FailoverStatus failover_state;
+
+static void colo_failover_bh(void *opaque)
+{
+    int old_state;
+
+    qemu_bh_delete(failover_bh);
+    failover_bh = NULL;
+
+    old_state = failover_set_state(FAILOVER_STATUS_REQUIRE,
+                                   FAILOVER_STATUS_ACTIVE);
+    if (old_state != FAILOVER_STATUS_REQUIRE) {
+        error_report("Unknown error for failover, old_state = %s",
+                    FailoverStatus_lookup[old_state]);
+        return;
+    }
+
+    colo_do_failover(NULL);
+}
+
+void failover_request_active(Error **errp)
+{
+   if (failover_set_state(FAILOVER_STATUS_NONE,
+        FAILOVER_STATUS_REQUIRE) != FAILOVER_STATUS_NONE) {
+        error_setg(errp, "COLO failover is already actived");
+        return;
+    }
+    failover_bh = qemu_bh_new(colo_failover_bh, NULL);
+    qemu_bh_schedule(failover_bh);
+}
+
+void failover_init_state(void)
+{
+    failover_state = FAILOVER_STATUS_NONE;
+}
+
+FailoverStatus failover_set_state(FailoverStatus old_state,
+                    FailoverStatus new_state)
+{
+    FailoverStatus old;
+
+    old = atomic_cmpxchg(&failover_state, old_state, new_state);
+    if (old == old_state) {
+        trace_colo_failover_set_state(FailoverStatus_lookup[new_state]);
+    }
+    return old;
+}
+
+FailoverStatus failover_get_state(void)
+{
+    return atomic_read(&failover_state);
+}
+
+void qmp_x_colo_lost_heartbeat(Error **errp)
+{
+    if (get_colo_mode() == COLO_MODE_UNKNOWN) {
+        error_setg(errp, QERR_FEATURE_DISABLED, "colo");
+        return;
+    }
+
+    failover_request_active(errp);
+}
diff --git a/migration/colo.c b/migration/colo.c
new file mode 100644
index 0000000000..e7224b8a0a
--- /dev/null
+++ b/migration/colo.c
@@ -0,0 +1,529 @@
+/*
+ * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO)
+ * (a.k.a. Fault Tolerance or Continuous Replication)
+ *
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+ * Copyright (c) 2016 FUJITSU LIMITED
+ * Copyright (c) 2016 Intel Corporation
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later.  See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/timer.h"
+#include "sysemu/sysemu.h"
+#include "migration/colo.h"
+#include "io/channel-buffer.h"
+#include "trace.h"
+#include "qemu/error-report.h"
+#include "qapi/error.h"
+#include "migration/failover.h"
+
+#define COLO_BUFFER_BASE_SIZE (4 * 1024 * 1024)
+
+bool colo_supported(void)
+{
+    return true;
+}
+
+bool migration_in_colo_state(void)
+{
+    MigrationState *s = migrate_get_current();
+
+    return (s->state == MIGRATION_STATUS_COLO);
+}
+
+bool migration_incoming_in_colo_state(void)
+{
+    MigrationIncomingState *mis = migration_incoming_get_current();
+
+    return mis && (mis->state == MIGRATION_STATUS_COLO);
+}
+
+static bool colo_runstate_is_stopped(void)
+{
+    return runstate_check(RUN_STATE_COLO) || !runstate_is_running();
+}
+
+static void secondary_vm_do_failover(void)
+{
+    int old_state;
+    MigrationIncomingState *mis = migration_incoming_get_current();
+
+    migrate_set_state(&mis->state, MIGRATION_STATUS_COLO,
+                      MIGRATION_STATUS_COMPLETED);
+
+    if (!autostart) {
+        error_report("\"-S\" qemu option will be ignored in secondary side");
+        /* recover runstate to normal migration finish state */
+        autostart = true;
+    }
+
+    old_state = failover_set_state(FAILOVER_STATUS_ACTIVE,
+                                   FAILOVER_STATUS_COMPLETED);
+    if (old_state != FAILOVER_STATUS_ACTIVE) {
+        error_report("Incorrect state (%s) while doing failover for "
+                     "secondary VM", FailoverStatus_lookup[old_state]);
+        return;
+    }
+    /* For Secondary VM, jump to incoming co */
+    if (mis->migration_incoming_co) {
+        qemu_coroutine_enter(mis->migration_incoming_co);
+    }
+}
+
+static void primary_vm_do_failover(void)
+{
+    MigrationState *s = migrate_get_current();
+    int old_state;
+
+    migrate_set_state(&s->state, MIGRATION_STATUS_COLO,
+                      MIGRATION_STATUS_COMPLETED);
+
+    old_state = failover_set_state(FAILOVER_STATUS_ACTIVE,
+                                   FAILOVER_STATUS_COMPLETED);
+    if (old_state != FAILOVER_STATUS_ACTIVE) {
+        error_report("Incorrect state (%s) while doing failover for Primary VM",
+                     FailoverStatus_lookup[old_state]);
+        return;
+    }
+}
+
+void colo_do_failover(MigrationState *s)
+{
+    /* Make sure VM stopped while failover happened. */
+    if (!colo_runstate_is_stopped()) {
+        vm_stop_force_state(RUN_STATE_COLO);
+    }
+
+    if (get_colo_mode() == COLO_MODE_PRIMARY) {
+        primary_vm_do_failover();
+    } else {
+        secondary_vm_do_failover();
+    }
+}
+
+static void colo_send_message(QEMUFile *f, COLOMessage msg,
+                              Error **errp)
+{
+    int ret;
+
+    if (msg >= COLO_MESSAGE__MAX) {
+        error_setg(errp, "%s: Invalid message", __func__);
+        return;
+    }
+    qemu_put_be32(f, msg);
+    qemu_fflush(f);
+
+    ret = qemu_file_get_error(f);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "Can't send COLO message");
+    }
+    trace_colo_send_message(COLOMessage_lookup[msg]);
+}
+
+static void colo_send_message_value(QEMUFile *f, COLOMessage msg,
+                                    uint64_t value, Error **errp)
+{
+    Error *local_err = NULL;
+    int ret;
+
+    colo_send_message(f, msg, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+    qemu_put_be64(f, value);
+    qemu_fflush(f);
+
+    ret = qemu_file_get_error(f);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "Failed to send value for message:%s",
+                         COLOMessage_lookup[msg]);
+    }
+}
+
+static COLOMessage colo_receive_message(QEMUFile *f, Error **errp)
+{
+    COLOMessage msg;
+    int ret;
+
+    msg = qemu_get_be32(f);
+    ret = qemu_file_get_error(f);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "Can't receive COLO message");
+        return msg;
+    }
+    if (msg >= COLO_MESSAGE__MAX) {
+        error_setg(errp, "%s: Invalid message", __func__);
+        return msg;
+    }
+    trace_colo_receive_message(COLOMessage_lookup[msg]);
+    return msg;
+}
+
+/* Receive one COLO message from @f and fail unless it equals @expect_msg. */
+static void colo_receive_check_message(QEMUFile *f, COLOMessage expect_msg,
+                                       Error **errp)
+{
+    Error *err = NULL;
+    COLOMessage got = colo_receive_message(f, &err);
+
+    if (err) {
+        /* Reading failed; hand the receive error to the caller as is. */
+        error_propagate(errp, err);
+    } else if (got != expect_msg) {
+        /* Read succeeded but the peer sent a different message type. */
+        error_setg(errp, "Unexpected COLO message %d, expected %d",
+                          got, expect_msg);
+    }
+}
+
+static uint64_t colo_receive_message_value(QEMUFile *f, uint32_t expect_msg,
+                                           Error **errp)
+{
+    Error *local_err = NULL;
+    uint64_t value;
+    int ret;
+
+    colo_receive_check_message(f, expect_msg, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return 0;
+    }
+
+    value = qemu_get_be64(f);
+    ret = qemu_file_get_error(f);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "Failed to get value for COLO message: %s",
+                         COLOMessage_lookup[expect_msg]);
+    }
+    return value;
+}
+
+static int colo_do_checkpoint_transaction(MigrationState *s,
+                                          QIOChannelBuffer *bioc,
+                                          QEMUFile *fb)
+{
+    Error *local_err = NULL;
+    int ret = -1;
+
+    colo_send_message(s->to_dst_file, COLO_MESSAGE_CHECKPOINT_REQUEST,
+                      &local_err);
+    if (local_err) {
+        goto out;
+    }
+
+    colo_receive_check_message(s->rp_state.from_dst_file,
+                    COLO_MESSAGE_CHECKPOINT_REPLY, &local_err);
+    if (local_err) {
+        goto out;
+    }
+    /* Reset channel-buffer directly */
+    qio_channel_io_seek(QIO_CHANNEL(bioc), 0, 0, NULL);
+    bioc->usage = 0;
+
+    qemu_mutex_lock_iothread();
+    if (failover_get_state() != FAILOVER_STATUS_NONE) {
+        qemu_mutex_unlock_iothread();
+        goto out;
+    }
+    vm_stop_force_state(RUN_STATE_COLO);
+    qemu_mutex_unlock_iothread();
+    trace_colo_vm_state_change("run", "stop");
+    /*
+     * The failover request bh could be called after vm_stop_force_state(),
+     * so we need to check the failover state again.
+     */
+    if (failover_get_state() != FAILOVER_STATUS_NONE) {
+        goto out;
+    }
+
+    /* Disable block migration */
+    s->params.blk = 0;
+    s->params.shared = 0;
+    qemu_savevm_state_header(fb);
+    qemu_savevm_state_begin(fb, &s->params);
+    qemu_mutex_lock_iothread();
+    qemu_savevm_state_complete_precopy(fb, false);
+    qemu_mutex_unlock_iothread();
+
+    qemu_fflush(fb);
+
+    colo_send_message(s->to_dst_file, COLO_MESSAGE_VMSTATE_SEND, &local_err);
+    if (local_err) {
+        goto out;
+    }
+    /*
+     * The Secondary side needs the size of the VMstate data so that
+     * it can decide how much data should be read.
+     */
+    colo_send_message_value(s->to_dst_file, COLO_MESSAGE_VMSTATE_SIZE,
+                            bioc->usage, &local_err);
+    if (local_err) {
+        goto out;
+    }
+
+    qemu_put_buffer(s->to_dst_file, bioc->data, bioc->usage);
+    qemu_fflush(s->to_dst_file);
+    /* Keep ret at -1 here so the error exits below cannot return success. */
+    if (qemu_file_get_error(s->to_dst_file) < 0) {
+        goto out;
+    }
+
+    colo_receive_check_message(s->rp_state.from_dst_file,
+                       COLO_MESSAGE_VMSTATE_RECEIVED, &local_err);
+    if (local_err) {
+        goto out;
+    }
+
+    colo_receive_check_message(s->rp_state.from_dst_file,
+                       COLO_MESSAGE_VMSTATE_LOADED, &local_err);
+    if (local_err) {
+        goto out;
+    }
+
+    ret = 0;
+
+    qemu_mutex_lock_iothread();
+    vm_start();
+    qemu_mutex_unlock_iothread();
+    trace_colo_vm_state_change("stop", "run");
+
+out:
+    if (local_err) {
+        error_report_err(local_err);
+    }
+    return ret;
+}
+
+static void colo_process_checkpoint(MigrationState *s)
+{
+    QIOChannelBuffer *bioc;
+    QEMUFile *fb = NULL;
+    int64_t current_time, checkpoint_time = qemu_clock_get_ms(QEMU_CLOCK_HOST);
+    Error *local_err = NULL;
+    int ret;
+
+    failover_init_state();
+
+    s->rp_state.from_dst_file = qemu_file_get_return_path(s->to_dst_file);
+    if (!s->rp_state.from_dst_file) {
+        error_report("Open QEMUFile from_dst_file failed");
+        goto out;
+    }
+
+    /*
+     * Wait for Secondary finish loading VM states and enter COLO
+     * restore.
+     */
+    colo_receive_check_message(s->rp_state.from_dst_file,
+                       COLO_MESSAGE_CHECKPOINT_READY, &local_err);
+    if (local_err) {
+        goto out;
+    }
+    bioc = qio_channel_buffer_new(COLO_BUFFER_BASE_SIZE);
+    fb = qemu_fopen_channel_output(QIO_CHANNEL(bioc));
+    object_unref(OBJECT(bioc));
+
+    qemu_mutex_lock_iothread();
+    vm_start();
+    qemu_mutex_unlock_iothread();
+    trace_colo_vm_state_change("stop", "run");
+
+    while (s->state == MIGRATION_STATUS_COLO) {
+        if (failover_get_state() != FAILOVER_STATUS_NONE) {
+            error_report("failover request");
+            goto out;
+        }
+
+        current_time = qemu_clock_get_ms(QEMU_CLOCK_HOST);
+        if (current_time - checkpoint_time <
+            s->parameters.x_checkpoint_delay) {
+            int64_t delay_ms;
+
+            delay_ms = s->parameters.x_checkpoint_delay -
+                       (current_time - checkpoint_time);
+            g_usleep(delay_ms * 1000);
+        }
+        ret = colo_do_checkpoint_transaction(s, bioc, fb);
+        if (ret < 0) {
+            goto out;
+        }
+        checkpoint_time = qemu_clock_get_ms(QEMU_CLOCK_HOST);
+    }
+
+out:
+    /* Throw the unreported error message after exited from loop */
+    if (local_err) {
+        error_report_err(local_err);
+    }
+
+    if (fb) {
+        qemu_fclose(fb);
+    }
+
+    if (s->rp_state.from_dst_file) {
+        qemu_fclose(s->rp_state.from_dst_file);
+    }
+}
+
+void migrate_start_colo_process(MigrationState *s)
+{
+    qemu_mutex_unlock_iothread();
+    migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
+                      MIGRATION_STATUS_COLO);
+    colo_process_checkpoint(s);
+    qemu_mutex_lock_iothread();
+}
+
+static void colo_wait_handle_message(QEMUFile *f, int *checkpoint_request,
+                                     Error **errp)
+{
+    Error *err = NULL;
+    COLOMessage received = colo_receive_message(f, &err);
+
+    if (err) {
+        /* Leave *checkpoint_request untouched when the read itself failed. */
+        error_propagate(errp, err);
+        return;
+    }
+
+    /* A checkpoint request is the only message accepted here. */
+    if (received == COLO_MESSAGE_CHECKPOINT_REQUEST) {
+        *checkpoint_request = 1;
+        return;
+    }
+
+    /* Anything else clears the flag and is reported to the caller. */
+    *checkpoint_request = 0;
+    error_setg(errp, "Got unknown COLO message: %d", received);
+}
+
+void *colo_process_incoming_thread(void *opaque)
+{
+    MigrationIncomingState *mis = opaque;
+    QEMUFile *fb = NULL;
+    QIOChannelBuffer *bioc = NULL; /* Cache incoming device state */
+    uint64_t total_size;
+    uint64_t value;
+    Error *local_err = NULL;
+
+    migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
+                      MIGRATION_STATUS_COLO);
+
+    failover_init_state();
+
+    mis->to_src_file = qemu_file_get_return_path(mis->from_src_file);
+    if (!mis->to_src_file) {
+        error_report("COLO incoming thread: Open QEMUFile to_src_file failed");
+        goto out;
+    }
+    /*
+     * Note: communication between the Primary and Secondary sides must be
+     * sequential. The fd was set to non-blocking in the migration incoming
+     * coroutine; here we are in the COLO incoming thread, so it is OK to
+     * set the fd back to blocking.
+     */
+    qemu_file_set_blocking(mis->from_src_file, true);
+
+    bioc = qio_channel_buffer_new(COLO_BUFFER_BASE_SIZE);
+    fb = qemu_fopen_channel_input(QIO_CHANNEL(bioc));
+    object_unref(OBJECT(bioc));
+
+    colo_send_message(mis->to_src_file, COLO_MESSAGE_CHECKPOINT_READY,
+                      &local_err);
+    if (local_err) {
+        goto out;
+    }
+
+    while (mis->state == MIGRATION_STATUS_COLO) {
+        int request;
+
+        colo_wait_handle_message(mis->from_src_file, &request, &local_err);
+        if (local_err) {
+            goto out;
+        }
+        assert(request);
+        if (failover_get_state() != FAILOVER_STATUS_NONE) {
+            error_report("failover request");
+            goto out;
+        }
+
+        /* FIXME: This is unnecessary for periodic checkpoint mode */
+        colo_send_message(mis->to_src_file, COLO_MESSAGE_CHECKPOINT_REPLY,
+                     &local_err);
+        if (local_err) {
+            goto out;
+        }
+
+        colo_receive_check_message(mis->from_src_file,
+                           COLO_MESSAGE_VMSTATE_SEND, &local_err);
+        if (local_err) {
+            goto out;
+        }
+
+        value = colo_receive_message_value(mis->from_src_file,
+                                 COLO_MESSAGE_VMSTATE_SIZE, &local_err);
+        if (local_err) {
+            goto out;
+        }
+
+        /*
+         * Read VM device state data into channel buffer,
+         * It's better to re-use the memory allocated.
+         * Here we need to handle the channel buffer directly.
+         */
+        if (value > bioc->capacity) {
+            bioc->capacity = value;
+            bioc->data = g_realloc(bioc->data, bioc->capacity);
+        }
+        total_size = qemu_get_buffer(mis->from_src_file, bioc->data, value);
+        if (total_size != value) {
+            error_report("Got %" PRIu64 " VMState data, less than expected"
+                        " %" PRIu64, total_size, value);
+            goto out;
+        }
+        bioc->usage = total_size;
+        qio_channel_io_seek(QIO_CHANNEL(bioc), 0, 0, NULL);
+
+        colo_send_message(mis->to_src_file, COLO_MESSAGE_VMSTATE_RECEIVED,
+                     &local_err);
+        if (local_err) {
+            goto out;
+        }
+
+        qemu_mutex_lock_iothread();
+        qemu_system_reset(VMRESET_SILENT);
+        if (qemu_loadvm_state(fb) < 0) {
+            error_report("COLO: loadvm failed");
+            qemu_mutex_unlock_iothread();
+            goto out;
+        }
+        qemu_mutex_unlock_iothread();
+
+        colo_send_message(mis->to_src_file, COLO_MESSAGE_VMSTATE_LOADED,
+                     &local_err);
+        if (local_err) {
+            goto out;
+        }
+    }
+
+out:
+    /* Throw the unreported error message after exited from loop */
+    if (local_err) {
+        error_report_err(local_err);
+    }
+
+    if (fb) {
+        qemu_fclose(fb);
+    }
+
+    if (mis->to_src_file) {
+        qemu_fclose(mis->to_src_file);
+    }
+    migration_incoming_exit_colo();
+
+    return NULL;
+}
diff --git a/migration/migration.c b/migration/migration.c
index 156e70791a..e331f28382 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -36,6 +36,7 @@
 #include "exec/address-spaces.h"
 #include "io/channel-buffer.h"
 #include "io/channel-tls.h"
+#include "migration/colo.h"
 
 #define MAX_THROTTLE  (32 << 20)      /* Migration transfer speed throttling */
 
@@ -62,6 +63,11 @@
 /* Migration XBZRLE default cache size */
 #define DEFAULT_MIGRATE_CACHE_SIZE (64 * 1024 * 1024)
 
+/* The delay time (in ms) between two COLO checkpoints
+ * Note: Please change this default value to 10000 when we support hybrid mode.
+ */
+#define DEFAULT_MIGRATE_X_CHECKPOINT_DELAY 200
+
 static NotifierList migration_state_notifiers =
     NOTIFIER_LIST_INITIALIZER(migration_state_notifiers);
 
@@ -94,6 +100,7 @@ MigrationState *migrate_get_current(void)
             .cpu_throttle_increment = DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT,
             .max_bandwidth = MAX_THROTTLE,
             .downtime_limit = DEFAULT_MIGRATE_SET_DOWNTIME,
+            .x_checkpoint_delay = DEFAULT_MIGRATE_X_CHECKPOINT_DELAY,
         },
     };
 
@@ -406,6 +413,18 @@ static void process_incoming_migration_co(void *opaque)
         /* Else if something went wrong then just fall out of the normal exit */
     }
 
+    /* we get COLO info, and know if we are in COLO mode */
+    if (!ret && migration_incoming_enable_colo()) {
+        mis->migration_incoming_co = qemu_coroutine_self();
+        qemu_thread_create(&mis->colo_incoming_thread, "COLO incoming",
+             colo_process_incoming_thread, mis, QEMU_THREAD_JOINABLE);
+        mis->have_colo_incoming_thread = true;
+        qemu_coroutine_yield();
+
+        /* Wait checkpoint incoming thread exit before free resource */
+        qemu_thread_join(&mis->colo_incoming_thread);
+    }
+
     qemu_fclose(f);
     free_xbzrle_decoded_buf();
 
@@ -531,6 +550,9 @@ MigrationCapabilityStatusList *qmp_query_migrate_capabilities(Error **errp)
 
     caps = NULL; /* silence compiler warning */
     for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
+        if (i == MIGRATION_CAPABILITY_X_COLO && !colo_supported()) {
+            continue;
+        }
         if (head == NULL) {
             head = g_malloc0(sizeof(*caps));
             caps = head;
@@ -571,6 +593,9 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp)
     params->max_bandwidth = s->parameters.max_bandwidth;
     params->has_downtime_limit = true;
     params->downtime_limit = s->parameters.downtime_limit;
+    /* Set the presence flag too, as is done for downtime_limit above. */
+    params->has_x_checkpoint_delay = true;
+    params->x_checkpoint_delay = s->parameters.x_checkpoint_delay;
 
     return params;
 }
@@ -691,6 +714,10 @@ MigrationInfo *qmp_query_migrate(Error **errp)
 
         get_xbzrle_cache_stats(info);
         break;
+    case MIGRATION_STATUS_COLO:
+        info->has_status = true;
+        /* TODO: display COLO specific information (checkpoint info etc.) */
+        break;
     case MIGRATION_STATUS_COMPLETED:
         get_xbzrle_cache_stats(info);
 
@@ -733,6 +760,14 @@ void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params,
     }
 
     for (cap = params; cap; cap = cap->next) {
+        if (cap->value->capability == MIGRATION_CAPABILITY_X_COLO) {
+            if (!colo_supported()) {
+                error_setg(errp, "COLO is not currently supported, please"
+                             " configure with --enable-colo option in order to"
+                             " support COLO feature");
+                continue;
+            }
+        }
         s->enabled_capabilities[cap->value->capability] = cap->value->state;
     }
 
@@ -817,6 +852,11 @@ void qmp_migrate_set_parameters(MigrationParameters *params, Error **errp)
                    "an integer in the range of 0 to 2000000 milliseconds");
         return;
     }
+    if (params->has_x_checkpoint_delay && (params->x_checkpoint_delay < 0)) {
+        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "x_checkpoint_delay",
+                   "is invalid, it should be positive");
+        return;
+    }
 
     if (params->has_compress_level) {
         s->parameters.compress_level = params->compress_level;
@@ -851,6 +891,10 @@ void qmp_migrate_set_parameters(MigrationParameters *params, Error **errp)
     if (params->has_downtime_limit) {
         s->parameters.downtime_limit = params->downtime_limit;
     }
+
+    if (params->has_x_checkpoint_delay) {
+        s->parameters.x_checkpoint_delay = params->x_checkpoint_delay;
+    }
 }
 
 
@@ -922,7 +966,7 @@ static void migrate_fd_cleanup(void *opaque)
 
 void migrate_fd_error(MigrationState *s, const Error *error)
 {
-    trace_migrate_fd_error(error ? error_get_pretty(error) : "");
+    trace_migrate_fd_error(error_get_pretty(error));
     assert(s->to_dst_file == NULL);
     migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
                       MIGRATION_STATUS_FAILED);
@@ -1101,7 +1145,8 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk,
     params.shared = has_inc && inc;
 
     if (migration_is_setup_or_active(s->state) ||
-        s->state == MIGRATION_STATUS_CANCELLING) {
+        s->state == MIGRATION_STATUS_CANCELLING ||
+        s->state == MIGRATION_STATUS_COLO) {
         error_setg(errp, QERR_MIGRATION_ACTIVE);
         return;
     }
@@ -1649,7 +1694,11 @@ static void migration_completion(MigrationState *s, int current_active_state,
 
         if (!ret) {
             ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
-            if (ret >= 0) {
+            /*
+             * Don't mark the image with BDRV_O_INACTIVE flag if
+             * we will go into COLO stage later.
+             */
+            if (ret >= 0 && !migrate_colo_enabled()) {
                 ret = bdrv_inactivate_all();
             }
             if (ret >= 0) {
@@ -1691,8 +1740,11 @@ static void migration_completion(MigrationState *s, int current_active_state,
         goto fail_invalidate;
     }
 
-    migrate_set_state(&s->state, current_active_state,
-                      MIGRATION_STATUS_COMPLETED);
+    if (!migrate_colo_enabled()) {
+        migrate_set_state(&s->state, current_active_state,
+                          MIGRATION_STATUS_COMPLETED);
+    }
+
     return;
 
 fail_invalidate:
@@ -1713,6 +1765,12 @@ fail:
                       MIGRATION_STATUS_FAILED);
 }
 
+bool migrate_colo_enabled(void)
+{
+    MigrationState *s = migrate_get_current();
+    return s->enabled_capabilities[MIGRATION_CAPABILITY_X_COLO];
+}
+
 /*
  * Master migration thread on the source VM.
  * It drives the migration and pumps the data down the outgoing channel.
@@ -1731,6 +1789,7 @@ static void *migration_thread(void *opaque)
     bool entered_postcopy = false;
     /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */
     enum MigrationStatus current_active_state = MIGRATION_STATUS_ACTIVE;
+    bool enable_colo = migrate_colo_enabled();
 
     rcu_register_thread();
 
@@ -1839,7 +1898,13 @@ static void *migration_thread(void *opaque)
     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 
     qemu_mutex_lock_iothread();
-    qemu_savevm_state_cleanup();
+    /*
+     * The resources allocated by migration will be reused by the COLO
+     * process, so don't release them here.
+     */
+    if (!enable_colo) {
+        qemu_savevm_state_cleanup();
+    }
     if (s->state == MIGRATION_STATUS_COMPLETED) {
         uint64_t transferred_bytes = qemu_ftell(s->to_dst_file);
         s->total_time = end_time - s->total_time;
@@ -1852,6 +1917,15 @@ static void *migration_thread(void *opaque)
         }
         runstate_set(RUN_STATE_POSTMIGRATE);
     } else {
+        if (s->state == MIGRATION_STATUS_ACTIVE && enable_colo) {
+            migrate_start_colo_process(s);
+            qemu_savevm_state_cleanup();
+            /*
+             * FIXME: the VM always runs while in COLO, whatever its previous
+             * run state was; after leaving COLO we keep it running.
+             */
+            old_vm_running = true;
+        }
         if (old_vm_running && !entered_postcopy) {
             vm_start();
         } else {
diff --git a/migration/ram.c b/migration/ram.c
index d032d389c4..fb9252d722 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -43,6 +43,7 @@
 #include "trace.h"
 #include "exec/ram_addr.h"
 #include "qemu/rcu_queue.h"
+#include "migration/colo.h"
 
 #ifdef DEBUG_MIGRATION_RAM
 #define DPRINTF(fmt, ...) \
@@ -1871,16 +1872,8 @@ err:
     return ret;
 }
 
-
-/* Each of ram_save_setup, ram_save_iterate and ram_save_complete has
- * long-running RCU critical section.  When rcu-reclaims in the code
- * start to become numerous it will be necessary to reduce the
- * granularity of these critical sections.
- */
-
-static int ram_save_setup(QEMUFile *f, void *opaque)
+static int ram_save_init_globals(void)
 {
-    RAMBlock *block;
     int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */
 
     dirty_rate_high_cnt = 0;
@@ -1947,6 +1940,29 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
     migration_bitmap_sync();
     qemu_mutex_unlock_ramlist();
     qemu_mutex_unlock_iothread();
+    rcu_read_unlock();
+
+    return 0;
+}
+
+/* Each of ram_save_setup, ram_save_iterate and ram_save_complete has
+ * long-running RCU critical section.  When rcu-reclaims in the code
+ * start to become numerous it will be necessary to reduce the
+ * granularity of these critical sections.
+ */
+
+static int ram_save_setup(QEMUFile *f, void *opaque)
+{
+    RAMBlock *block;
+
+    /* Migration has already set up the bitmap; reuse it in COLO state. */
+    if (!migration_in_colo_state()) {
+        if (ram_save_init_globals() < 0) {
+            return -1;
+         }
+    }
+
+    rcu_read_lock();
 
     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
 
@@ -2048,7 +2064,8 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
     while (true) {
         int pages;
 
-        pages = ram_find_and_save_block(f, true, &bytes_transferred);
+        pages = ram_find_and_save_block(f, !migration_in_colo_state(),
+                                        &bytes_transferred);
         /* no more blocks to sent */
         if (pages == 0) {
             break;
diff --git a/migration/trace-events b/migration/trace-events
index dfee75abf4..94134f700b 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -207,3 +207,9 @@ migration_tls_outgoing_handshake_complete(void) ""
 migration_tls_incoming_handshake_start(void) ""
 migration_tls_incoming_handshake_error(const char *err) "err=%s"
 migration_tls_incoming_handshake_complete(void) ""
+
+# migration/colo.c
+colo_vm_state_change(const char *old, const char *new) "Change '%s' => '%s'"
+colo_send_message(const char *msg) "Send '%s' message"
+colo_receive_message(const char *msg) "Receive '%s' message"
+colo_failover_set_state(const char *new_state) "new state %s"
diff --git a/monitor.c b/monitor.c
index 21dcfb28c1..7b963ad1ad 100644
--- a/monitor.c
+++ b/monitor.c
@@ -59,7 +59,6 @@
 #include "qapi/qmp/json-streamer.h"
 #include "qapi/qmp/json-parser.h"
 #include "qom/object_interfaces.h"
-#include "cpu.h"
 #include "trace.h"
 #include "trace/control.h"
 #include "monitor/hmp-target.h"
@@ -76,7 +75,6 @@
 #include "qapi/qmp-event.h"
 #include "qapi-event.h"
 #include "qmp-introspect.h"
-#include "sysemu/block-backend.h"
 #include "sysemu/qtest.h"
 #include "qemu/cutils.h"
 #include "qapi/qmp/dispatch.h"
@@ -4094,7 +4092,7 @@ QemuOptsList qemu_mon_opts = {
             .name = "chardev",
             .type = QEMU_OPT_STRING,
         },{
-            .name = "default",
+            .name = "default",  /* deprecated */
             .type = QEMU_OPT_BOOL,
         },{
             .name = "pretty",
diff --git a/net/colo-compare.c b/net/colo-compare.c
index f791383dbc..9bfc736f55 100644
--- a/net/colo-compare.c
+++ b/net/colo-compare.c
@@ -92,10 +92,6 @@ typedef struct CompareClass {
     ObjectClass parent_class;
 } CompareClass;
 
-typedef struct CompareChardevProps {
-    bool is_socket;
-} CompareChardevProps;
-
 enum {
     PRIMARY_IN = 0,
     SECONDARY_IN,
@@ -218,16 +214,17 @@ static int colo_packet_compare_tcp(Packet *spkt, Packet *ppkt)
                 (spkt->size - ETH_HLEN));
 
     if (res != 0 && trace_event_get_state(TRACE_COLO_COMPARE_MISCOMPARE)) {
-        trace_colo_compare_pkt_info(inet_ntoa(ppkt->ip->ip_src),
-                                    inet_ntoa(ppkt->ip->ip_dst),
-                                    ntohl(ptcp->th_seq),
-                                    ntohl(ptcp->th_ack),
-                                    ntohl(stcp->th_seq),
-                                    ntohl(stcp->th_ack),
-                                    res, ptcp->th_flags,
-                                    stcp->th_flags,
-                                    ppkt->size,
-                                    spkt->size);
+        trace_colo_compare_pkt_info_src(inet_ntoa(ppkt->ip->ip_src),
+                                        ntohl(stcp->th_seq),
+                                        ntohl(stcp->th_ack),
+                                        res, stcp->th_flags,
+                                        spkt->size);
+
+        trace_colo_compare_pkt_info_dst(inet_ntoa(ppkt->ip->ip_dst),
+                                        ntohl(ptcp->th_seq),
+                                        ntohl(ptcp->th_ack),
+                                        res, ptcp->th_flags,
+                                        ppkt->size);
 
         qemu_hexdump((char *)ppkt->data, stderr,
                      "colo-compare ppkt", ppkt->size);
@@ -571,8 +568,6 @@ static int find_and_check_chardev(CharDriverState **chr,
                                   char *chr_name,
                                   Error **errp)
 {
-    CompareChardevProps props;
-
     *chr = qemu_chr_find(chr_name);
     if (*chr == NULL) {
         error_setg(errp, "Device '%s' not found",
@@ -580,8 +575,6 @@ static int find_and_check_chardev(CharDriverState **chr,
         return 1;
     }
 
-    memset(&props, 0, sizeof(props));
-
     if (!qemu_chr_has_feature(*chr, QEMU_CHAR_FEATURE_RECONNECTABLE)) {
         error_setg(errp, "chardev \"%s\" is not reconnectable",
                    chr_name);
diff --git a/net/trace-events b/net/trace-events
index b1913a6666..35198bc742 100644
--- a/net/trace-events
+++ b/net/trace-events
@@ -13,7 +13,8 @@ colo_compare_icmp_miscompare(const char *sta, int size) ": %s = %d"
 colo_compare_ip_info(int psize, const char *sta, const char *stb, int ssize, const char *stc, const char *std) "ppkt size = %d, ip_src = %s, ip_dst = %s, spkt size = %d, ip_src = %s, ip_dst = %s"
 colo_old_packet_check_found(int64_t old_time) "%" PRId64
 colo_compare_miscompare(void) ""
-colo_compare_pkt_info(const char *src, const char *dst, uint32_t pseq, uint32_t pack, uint32_t sseq, uint32_t sack, int res, uint32_t pflag, uint32_t sflag, int psize, int ssize) "src/dst: %s/%s p: seq/ack=%u/%u   s: seq/ack=%u/%u res=%d flags=%x/%x ppkt_size: %d spkt_size: %d\n"
+colo_compare_pkt_info_src(const char *src, uint32_t sseq, uint32_t sack, int res, uint32_t sflag, int ssize) "src/dst: %s s: seq/ack=%u/%u res=%d flags=%x spkt_size: %d\n"
+colo_compare_pkt_info_dst(const char *dst, uint32_t dseq, uint32_t dack, int res, uint32_t dflag, int dsize) "src/dst: %s d: seq/ack=%u/%u res=%d flags=%x dpkt_size: %d\n"
 
 # net/filter-rewriter.c
 colo_filter_rewriter_debug(void) ""
diff --git a/qapi-schema.json b/qapi-schema.json
index d6a43a108c..8a7b527091 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -175,12 +175,15 @@
 # @watchdog: the watchdog action is configured to pause and has been triggered
 #
 # @guest-panicked: guest has been panicked as a result of guest OS panic
+#
+# @colo: guest is paused to save/restore VM state under colo checkpoint (since
+# 2.8)
 ##
 { 'enum': 'RunState',
   'data': [ 'debug', 'inmigrate', 'internal-error', 'io-error', 'paused',
             'postmigrate', 'prelaunch', 'finish-migrate', 'restore-vm',
             'running', 'save-vm', 'shutdown', 'suspended', 'watchdog',
-            'guest-panicked' ] }
+            'guest-panicked', 'colo' ] }
 
 ##
 # @StatusInfo:
@@ -459,12 +462,14 @@
 #
 # @failed: some error occurred during migration process.
 #
+# @colo: VM is in the process of fault tolerance. (since 2.8)
+#
 # Since: 2.3
 #
 ##
 { 'enum': 'MigrationStatus',
   'data': [ 'none', 'setup', 'cancelling', 'cancelled',
-            'active', 'postcopy-active', 'completed', 'failed' ] }
+            'active', 'postcopy-active', 'completed', 'failed', 'colo' ] }
 
 ##
 # @MigrationInfo
@@ -574,11 +579,16 @@
 #          been migrated, pulling the remaining pages along as needed. NOTE: If
 #          the migration fails during postcopy the VM will fail.  (since 2.6)
 #
+# @x-colo: If enabled, migration will never end, and the state of the VM on the
+#        primary side will be migrated continuously to the VM on the secondary
+#        side. This process is called COarse-grain LOck-stepping (COLO) for
+#        Non-stop Service. (since 2.8)
+#
 # Since: 1.2
 ##
 { 'enum': 'MigrationCapability',
   'data': ['xbzrle', 'rdma-pin-all', 'auto-converge', 'zero-blocks',
-           'compress', 'events', 'postcopy-ram'] }
+           'compress', 'events', 'postcopy-ram', 'x-colo'] }
 
 ##
 # @MigrationCapabilityStatus
@@ -664,19 +674,24 @@
 # @downtime-limit: set maximum tolerated downtime for migration. maximum
 #                  downtime in milliseconds (Since 2.8)
 #
+# @x-checkpoint-delay: The delay time (in ms) between two COLO checkpoints in
+#          periodic mode. (Since 2.8)
+#
 # Since: 2.4
 ##
 { 'enum': 'MigrationParameter',
   'data': ['compress-level', 'compress-threads', 'decompress-threads',
            'cpu-throttle-initial', 'cpu-throttle-increment',
            'tls-creds', 'tls-hostname', 'max-bandwidth',
-           'downtime-limit'] }
+           'downtime-limit', 'x-checkpoint-delay' ] }
 
 #
 # @migrate-set-parameters
 #
 # Set various migration parameters.  See MigrationParameters for details.
 #
+# @x-checkpoint-delay: the delay time between two checkpoints. (Since 2.8)
+#
 # Since: 2.4
 ##
 { 'command': 'migrate-set-parameters', 'boxed': true,
@@ -725,6 +740,8 @@
 # @downtime-limit: set maximum tolerated downtime for migration. maximum
 #                  downtime in milliseconds (Since 2.8)
 #
+# @x-checkpoint-delay: the delay time between two COLO checkpoints. (Since 2.8)
+#
 # Since: 2.4
 ##
 { 'struct': 'MigrationParameters',
@@ -736,7 +753,8 @@
             '*tls-creds': 'str',
             '*tls-hostname': 'str',
             '*max-bandwidth': 'int',
-            '*downtime-limit': 'int'} }
+            '*downtime-limit': 'int',
+            '*x-checkpoint-delay': 'int'} }
 
 ##
 # @query-migrate-parameters
@@ -780,6 +798,78 @@
 { 'command': 'migrate-start-postcopy' }
 
 ##
+# @COLOMessage
+#
+# The message transmission between Primary side and Secondary side.
+#
+# @checkpoint-ready: Secondary VM (SVM) is ready for checkpointing
+#
+# @checkpoint-request: Primary VM (PVM) tells SVM to prepare for checkpointing
+#
+# @checkpoint-reply: SVM gets PVM's checkpoint request
+#
+# @vmstate-send: VM's state will be sent by PVM.
+#
+# @vmstate-size: The total size of VMstate.
+#
+# @vmstate-received: VM's state has been received by SVM.
+#
+# @vmstate-loaded: VM's state has been loaded by SVM.
+#
+# Since: 2.8
+##
+{ 'enum': 'COLOMessage',
+  'data': [ 'checkpoint-ready', 'checkpoint-request', 'checkpoint-reply',
+            'vmstate-send', 'vmstate-size', 'vmstate-received',
+            'vmstate-loaded' ] }
+
+##
+# @COLOMode
+#
+# The colo mode
+#
+# @unknown: unknown mode
+#
+# @primary: master side
+#
+# @secondary: slave side
+#
+# Since: 2.8
+##
+{ 'enum': 'COLOMode',
+  'data': [ 'unknown', 'primary', 'secondary'] }
+
+##
+# @FailoverStatus
+#
+# An enumeration of COLO failover status
+#
+# @none: no failover has ever happened
+#
+# @require: got failover requirement but not handled
+#
+# @active: in the process of doing failover
+#
+# @completed: finish the process of failover
+#
+# Since: 2.8
+##
+{ 'enum': 'FailoverStatus',
+  'data': [ 'none', 'require', 'active', 'completed'] }
+
+##
+# @x-colo-lost-heartbeat
+#
+# Tell qemu that heartbeat is lost, request it to do takeover procedures.
+# If this command is sent to the PVM, the Primary side will exit COLO mode.
+# If sent to the Secondary, the Secondary side will run failover work,
+# then take over server operation to become the service VM.
+#
+# Since: 2.8
+##
+{ 'command': 'x-colo-lost-heartbeat' }
+
+##
 # @MouseInfo:
 #
 # Information about a mouse device.
diff --git a/qemu-ga.texi b/qemu-ga.texi
index 0e53bf6b2c..4c7a8fd163 100644
--- a/qemu-ga.texi
+++ b/qemu-ga.texi
@@ -30,7 +30,7 @@ set user's password
 @end itemize
 
 qemu-ga will read a system configuration file on startup (located at
-q@file{/etc/qemu/qemu-ga.conf} by default), then parse remaining
+@file{/etc/qemu/qemu-ga.conf} by default), then parse remaining
 configuration options on the command line. For the same key, the last
 option wins, but the lists accumulate (see below for configuration
 file format).
diff --git a/qemu-img.c b/qemu-img.c
index afcd51ff18..ac7f40d91a 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -795,6 +795,7 @@ static void run_block_job(BlockJob *job, Error **errp)
 {
     AioContext *aio_context = blk_get_aio_context(job->blk);
 
+    aio_context_acquire(aio_context);
     do {
         aio_poll(aio_context, true);
         qemu_progress_print(job->len ?
@@ -802,6 +803,7 @@ static void run_block_job(BlockJob *job, Error **errp)
     } while (!job->ready);
 
     block_job_complete_sync(job, errp);
+    aio_context_release(aio_context);
 
     /* A block job may finish instantaneously without publishing any progress,
      * so just signal completion here */
@@ -819,6 +821,7 @@ static int img_commit(int argc, char **argv)
     Error *local_err = NULL;
     CommonBlockJobCBInfo cbi;
     bool image_opts = false;
+    AioContext *aio_context;
 
     fmt = NULL;
     cache = BDRV_DEFAULT_CACHE;
@@ -928,8 +931,11 @@ static int img_commit(int argc, char **argv)
         .bs   = bs,
     };
 
+    aio_context = bdrv_get_aio_context(bs);
+    aio_context_acquire(aio_context);
     commit_active_start("commit", bs, base_bs, 0, BLOCKDEV_ON_ERROR_REPORT,
                         common_block_job_cb, &cbi, &local_err, false);
+    aio_context_release(aio_context);
     if (local_err) {
         goto done;
     }
diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index 3a3838a079..95bcde1d88 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -18,7 +18,6 @@
 #include "qemu/error-report.h"
 #include "qemu/main-loop.h"
 #include "qemu/timer.h"
-#include "sysemu/block-backend.h"
 #include "qemu/cutils.h"
 
 #define CMD_NOFILE_OK   0x01
@@ -1956,7 +1955,7 @@ static int reopen_f(BlockBackend *blk, int argc, char **argv)
     qemu_opts_reset(&reopen_opts);
 
     brq = bdrv_reopen_queue(NULL, bs, opts, flags);
-    bdrv_reopen_multiple(brq, &local_err);
+    bdrv_reopen_multiple(bdrv_get_aio_context(bs), brq, &local_err);
     if (local_err) {
         error_report_err(local_err);
     } else {
@@ -2216,6 +2215,7 @@ static const cmdinfo_t help_cmd = {
 
 bool qemuio_command(BlockBackend *blk, const char *cmd)
 {
+    AioContext *ctx;
     char *input;
     const cmdinfo_t *ct;
     char **v;
@@ -2227,7 +2227,10 @@ bool qemuio_command(BlockBackend *blk, const char *cmd)
     if (c) {
         ct = find_command(v[0]);
         if (ct) {
+            ctx = blk ? blk_get_aio_context(blk) : qemu_get_aio_context();
+            aio_context_acquire(ctx);
             done = command(blk, ct, c, v);
+            aio_context_release(ctx);
         } else {
             fprintf(stderr, "command \"%s\" not found\n", v[0]);
         }
diff --git a/qemu-options.hx b/qemu-options.hx
index b1fbdb08cd..95332cc05b 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -172,7 +172,7 @@ DEF("set", HAS_ARG, QEMU_OPTION_set,
 STEXI
 @item -set @var{group}.@var{id}.@var{arg}=@var{value}
 @findex -set
-Set parameter @var{arg} for item @var{id} of type @var{group}\n"
+Set parameter @var{arg} for item @var{id} of type @var{group}
 ETEXI
 
 DEF("global", HAS_ARG, QEMU_OPTION_global,
@@ -2239,7 +2239,7 @@ two serial ports and the QEMU monitor:
 
 @example
 -chardev stdio,mux=on,id=char0 \
--mon chardev=char0,mode=readline,default \
+-mon chardev=char0,mode=readline \
 -serial chardev:char0 \
 -serial chardev:char0
 @end example
@@ -2250,7 +2250,7 @@ multiplexed between the QEMU monitor and a parallel port:
 
 @example
 -chardev stdio,mux=on,id=char0 \
--mon chardev=char0,mode=readline,default \
+-mon chardev=char0,mode=readline \
 -parallel chardev:char0 \
 -chardev tcp,...,mux=on,id=char1 \
 -serial chardev:char1 \
@@ -3112,9 +3112,9 @@ Like -qmp but uses pretty JSON formatting.
 ETEXI
 
 DEF("mon", HAS_ARG, QEMU_OPTION_mon, \
-    "-mon [chardev=]name[,mode=readline|control][,default]\n", QEMU_ARCH_ALL)
+    "-mon [chardev=]name[,mode=readline|control]\n", QEMU_ARCH_ALL)
 STEXI
-@item -mon [chardev=]name[,mode=readline|control][,default]
+@item -mon [chardev=]name[,mode=readline|control]
 @findex -mon
 Setup monitor on chardev @var{name}.
 ETEXI
@@ -3902,7 +3902,7 @@ colo secondary:
 -object filter-redirector,id=f2,netdev=hn0,queue=rx,outdev=red1
 -object filter-rewriter,id=rew0,netdev=hn0,queue=all
 
-@item -object filter-dump,id=@var{id},netdev=@var{dev},file=@var{filename}][,maxlen=@var{len}]
+@item -object filter-dump,id=@var{id},netdev=@var{dev}[,file=@var{filename}][,maxlen=@var{len}]
 
 Dump the network traffic on netdev @var{dev} to the file specified by
 @var{filename}. At most @var{len} bytes (64k by default) per packet are stored.
diff --git a/qmp.c b/qmp.c
index a06cb7ba62..0028f0b30e 100644
--- a/qmp.c
+++ b/qmp.c
@@ -36,7 +36,6 @@
 #include "qom/object_interfaces.h"
 #include "hw/mem/pc-dimm.h"
 #include "hw/acpi/acpi_dev_interface.h"
-#include "qemu/uuid.h"
 
 NameInfo *qmp_query_name(Error **errp)
 {
diff --git a/scripts/clean-includes b/scripts/clean-includes
index 4412a5590a..dd938daa3e 100755
--- a/scripts/clean-includes
+++ b/scripts/clean-includes
@@ -14,15 +14,18 @@
 # the top-level directory.
 
 # Usage:
-#   clean-includes [--git subjectprefix] file ...
+#   clean-includes [--git subjectprefix] [--check-dup-head] file ...
 # or
-#   clean-includes [--git subjectprefix] --all
+#   clean-includes [--git subjectprefix] [--check-dup-head] --all
 #
 # If the --git subjectprefix option is given, then after making
 # the changes to the files this script will create a git commit
 # with the subject line "subjectprefix: Clean up includes"
 # and a boilerplate commit message.
 #
+# If --check-dup-head is specified, additionally check for duplicate
+# header includes.
+#
 # Using --all will cause clean-includes to run on the whole source
 # tree (excluding certain directories which are known not to need
 # handling).
@@ -45,23 +48,40 @@
 
 
 GIT=no
+DUPHEAD=no
 
 # Extended regular expression defining files to ignore when using --all
 XDIRREGEX='^(tests/tcg|tests/multiboot|pc-bios|disas/libvixl)'
 
-if [ $# -ne 0 ] && [ "$1" = "--git" ]; then
-    if [ $# -eq 1 ]; then
-        echo "--git option requires an argument"
-        exit 1
-    fi
-    GITSUBJ="$2"
-    GIT=yes
-    shift
-    shift
-fi
+while true
+do
+    case $1 in
+    "--git")
+         if [ $# -eq 1 ]; then
+             echo "--git option requires an argument"
+             exit 1
+         fi
+         GITSUBJ="$2"
+         GIT=yes
+         shift
+         shift
+         ;;
+    "--check-dup-head")
+        DUPHEAD=yes
+        shift
+        ;;
+    "--")
+        shift
+        break
+        ;;
+    *)
+        break
+        ;;
+   esac
+done
 
 if [ $# -eq 0 ]; then
-    echo "Usage: clean-includes [--git subjectprefix] [--all | foo.c ...]"
+    echo "Usage: clean-includes [--git subjectprefix] [--check-dup-head] [--all | foo.c ...]"
     echo "(modifies the files in place)"
     exit 1
 fi
@@ -91,7 +111,6 @@ cat >"$COCCIFILE" <<EOT
 )
 EOT
 
-
 for f in "$@"; do
   case "$f" in
     *.inc.c)
@@ -154,6 +173,15 @@ for f in "$@"; do
 
 done
 
+if [ "$DUPHEAD" = "yes" ]; then
+    egrep "^[[:space:]]*#[[:space:]]*include" "$@" | tr -d '[:blank:]' \
+        | sort | uniq -c | awk '{if ($1 > 1) print $0}'
+    if [ $? -eq 0 ]; then
+        echo "Found duplicate header file includes. Please check the above files manually."
+        exit 1
+    fi
+fi
+
 if [ "$GIT" = "yes" ]; then
     git add -- "$@"
     git commit --signoff -F - <<EOF
diff --git a/scripts/hxtool b/scripts/hxtool
index 995bb7f08c..04f7d7b0ed 100644
--- a/scripts/hxtool
+++ b/scripts/hxtool
@@ -26,32 +26,32 @@ hxtotexi()
             ;;
             STEXI*)
             if test $flag -eq 1 ; then
-                echo "line $line: syntax error: expected ETEXI, found $str" >&2
+                printf "line %d: syntax error: expected ETEXI, found '%s'\n" "$line" "$str" >&2
                 exit 1
             fi
             flag=1
             ;;
             ETEXI*)
             if test $flag -ne 1 ; then
-                echo "line $line: syntax error: expected STEXI, found $str" >&2
+                printf "line %d: syntax error: expected STEXI, found '%s'\n" "$line" "$str" >&2
                 exit 1
             fi
             flag=0
             ;;
             SQMP*|EQMP*)
             if test $flag -eq 1 ; then
-                echo "line $line: syntax error: expected ETEXI, found $str" >&2
+                printf "line %d: syntax error: expected ETEXI, found '%s'\n" "$line" "$str" >&2
                 exit 1
             fi
             ;;
             DEFHEADING*)
-            echo "$(expr "$str" : "DEFHEADING(\(.*\))")"
+            printf '%s\n' "$(expr "$str" : "DEFHEADING(\(.*\))")"
             ;;
             ARCHHEADING*)
-            echo "$(expr "$str" : "ARCHHEADING(\(.*\),.*)")"
+            printf '%s\n' "$(expr "$str" : "ARCHHEADING(\(.*\),.*)")"
             ;;
             *)
-            test $flag -eq 1 && echo "$str"
+            test $flag -eq 1 && printf '%s\n' "$str"
             ;;
         esac
         line=$((line+1))
@@ -69,26 +69,26 @@ hxtoqmp()
             ;;
             SQMP*)
             if test $flag -eq 1 ; then
-                echo "line $line: syntax error: expected EQMP, found $str" >&2
+                printf "line %d: syntax error: expected EQMP, found '%s'\n" "$line" "$str" >&2
                 exit 1
             fi
             flag=1
             ;;
             EQMP*)
             if test $flag -ne 1 ; then
-                echo "line $line: syntax error: expected SQMP, found $str" >&2
+                printf "line %d: syntax error: expected SQMP, found '%s'\n" "$line" "$str" >&2
                 exit 1
             fi
             flag=0
             ;;
             STEXI*|ETEXI*)
             if test $flag -eq 1 ; then
-                echo "line $line: syntax error: expected EQMP, found $str" >&2
+                printf "line %d: syntax error: expected EQMP, found '%s'\n" "$line" "$str" >&2
                 exit 1
             fi
             ;;
             *)
-            test $flag -eq 1 && echo "$str"
+            test $flag -eq 1 && printf '%s\n' "$str"
             ;;
         esac
         line=$((line+1))
diff --git a/scripts/tracetool.py b/scripts/tracetool.py
index 629b2593c8..fe9c9e904b 100755
--- a/scripts/tracetool.py
+++ b/scripts/tracetool.py
@@ -70,7 +70,7 @@ def make_group_name(filename):
 
     if dirname == "":
         return "common"
-    return re.sub(r"/|-", "_", dirname)
+    return re.sub(r"[^A-Za-z0-9]", "_", dirname)
 
 def main(args):
     global _SCRIPT
diff --git a/stubs/Makefile.objs b/stubs/Makefile.objs
index c5850e858e..7f236a7c1f 100644
--- a/stubs/Makefile.objs
+++ b/stubs/Makefile.objs
@@ -17,6 +17,7 @@ stub-obj-y += gdbstub.o
 stub-obj-y += get-fd.o
 stub-obj-y += get-next-serial.o
 stub-obj-y += get-vm-name.o
+stub-obj-y += iothread.o
 stub-obj-y += iothread-lock.o
 stub-obj-y += is-daemonized.o
 stub-obj-y += machine-init-done.o
@@ -48,3 +49,4 @@ stub-obj-y += iohandler.o
 stub-obj-y += smbios_type_38.o
 stub-obj-y += ipmi.o
 stub-obj-y += pc_madt_cpu_entry.o
+stub-obj-y += migration-colo.o
diff --git a/stubs/iothread.c b/stubs/iothread.c
new file mode 100644
index 0000000000..8cc9e28c55
--- /dev/null
+++ b/stubs/iothread.c
@@ -0,0 +1,8 @@
+#include "qemu/osdep.h"
+#include "block/aio.h"
+#include "qemu/main-loop.h"
+
+AioContext *qemu_get_current_aio_context(void)
+{
+    return qemu_get_aio_context();
+}
diff --git a/stubs/migration-colo.c b/stubs/migration-colo.c
new file mode 100644
index 0000000000..7811764c4b
--- /dev/null
+++ b/stubs/migration-colo.c
@@ -0,0 +1,46 @@
+/*
+ * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO)
+ * (a.k.a. Fault Tolerance or Continuous Replication)
+ *
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+ * Copyright (c) 2016 FUJITSU LIMITED
+ * Copyright (c) 2016 Intel Corporation
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later.  See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "migration/colo.h"
+#include "qmp-commands.h"
+
+bool colo_supported(void)
+{
+    return false;
+}
+
+bool migration_in_colo_state(void)
+{
+    return false;
+}
+
+bool migration_incoming_in_colo_state(void)
+{
+    return false;
+}
+
+void migrate_start_colo_process(MigrationState *s)
+{
+}
+
+void *colo_process_incoming_thread(void *opaque)
+{
+    return NULL;
+}
+
+void qmp_x_colo_lost_heartbeat(Error **errp)
+{
+    error_setg(errp, "COLO is not supported, please rerun configure"
+                     " with --enable-colo option in order to support"
+                     " COLO feature");
+}
diff --git a/target-arm/cpu.c b/target-arm/cpu.c
index 2439ca57d0..99f0dbebb9 100644
--- a/target-arm/cpu.c
+++ b/target-arm/cpu.c
@@ -19,6 +19,7 @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/error-report.h"
 #include "qapi/error.h"
 #include "cpu.h"
 #include "internals.h"
@@ -496,6 +497,10 @@ static Property arm_cpu_rvbar_property =
 static Property arm_cpu_has_el3_property =
             DEFINE_PROP_BOOL("has_el3", ARMCPU, has_el3, true);
 
+/* use property name "pmu" to match other archs and virt tools */
+static Property arm_cpu_has_pmu_property =
+            DEFINE_PROP_BOOL("pmu", ARMCPU, has_pmu, true);
+
 static Property arm_cpu_has_mpu_property =
             DEFINE_PROP_BOOL("has-mpu", ARMCPU, has_mpu, true);
 
@@ -539,6 +544,11 @@ static void arm_cpu_post_init(Object *obj)
 #endif
     }
 
+    if (arm_feature(&cpu->env, ARM_FEATURE_PMU)) {
+        qdev_property_add_static(DEVICE(obj), &arm_cpu_has_pmu_property,
+                                 &error_abort);
+    }
+
     if (arm_feature(&cpu->env, ARM_FEATURE_MPU)) {
         qdev_property_add_static(DEVICE(obj), &arm_cpu_has_mpu_property,
                                  &error_abort);
@@ -677,6 +687,11 @@ static void arm_cpu_realizefn(DeviceState *dev, Error **errp)
         cpu->id_aa64pfr0 &= ~0xf000;
     }
 
+    if (!cpu->has_pmu || !kvm_enabled()) {
+        cpu->has_pmu = false;
+        unset_feature(env, ARM_FEATURE_PMU);
+    }
+
     if (!arm_feature(env, ARM_FEATURE_EL2)) {
         /* Disable the hypervisor feature bits in the processor feature
          * registers if we don't have EL2. These are id_pfr1[15:12] and
diff --git a/target-arm/cpu.h b/target-arm/cpu.h
index 19d967b69e..ca5c849ed6 100644
--- a/target-arm/cpu.h
+++ b/target-arm/cpu.h
@@ -1124,6 +1124,7 @@ enum arm_features {
     ARM_FEATURE_V8_SHA256, /* implements SHA256 part of v8 Crypto Extensions */
     ARM_FEATURE_V8_PMULL, /* implements PMULL part of v8 Crypto Extensions */
     ARM_FEATURE_THUMB_DSP, /* DSP insns supported in the Thumb encodings */
+    ARM_FEATURE_PMU, /* has PMU support */
 };
 
 static inline int arm_feature(CPUARMState *env, int feature)
diff --git a/target-arm/cpu64.c b/target-arm/cpu64.c
index 1635debc1a..549cb1ee93 100644
--- a/target-arm/cpu64.c
+++ b/target-arm/cpu64.c
@@ -111,6 +111,7 @@ static void aarch64_a57_initfn(Object *obj)
     set_feature(&cpu->env, ARM_FEATURE_V8_PMULL);
     set_feature(&cpu->env, ARM_FEATURE_CRC);
     set_feature(&cpu->env, ARM_FEATURE_EL3);
+    set_feature(&cpu->env, ARM_FEATURE_PMU);
     cpu->kvm_target = QEMU_KVM_ARM_TARGET_CORTEX_A57;
     cpu->midr = 0x411fd070;
     cpu->revidr = 0x00000000;
@@ -166,6 +167,7 @@ static void aarch64_a53_initfn(Object *obj)
     set_feature(&cpu->env, ARM_FEATURE_V8_PMULL);
     set_feature(&cpu->env, ARM_FEATURE_CRC);
     set_feature(&cpu->env, ARM_FEATURE_EL3);
+    set_feature(&cpu->env, ARM_FEATURE_PMU);
     cpu->kvm_target = QEMU_KVM_ARM_TARGET_CORTEX_A53;
     cpu->midr = 0x410fd034;
     cpu->revidr = 0x00000000;
diff --git a/target-arm/kvm64.c b/target-arm/kvm64.c
index 5faa76c57e..61111091ad 100644
--- a/target-arm/kvm64.c
+++ b/target-arm/kvm64.c
@@ -428,6 +428,11 @@ static inline void set_feature(uint64_t *features, int feature)
     *features |= 1ULL << feature;
 }
 
+static inline void unset_feature(uint64_t *features, int feature)
+{
+    *features &= ~(1ULL << feature);
+}
+
 bool kvm_arm_get_host_cpu_features(ARMHostCPUClass *ahcc)
 {
     /* Identify the feature bits corresponding to the host CPU, and
@@ -469,6 +474,7 @@ bool kvm_arm_get_host_cpu_features(ARMHostCPUClass *ahcc)
     set_feature(&features, ARM_FEATURE_VFP4);
     set_feature(&features, ARM_FEATURE_NEON);
     set_feature(&features, ARM_FEATURE_AARCH64);
+    set_feature(&features, ARM_FEATURE_PMU);
 
     ahcc->features = features;
 
@@ -482,6 +488,7 @@ int kvm_arch_init_vcpu(CPUState *cs)
     int ret;
     uint64_t mpidr;
     ARMCPU *cpu = ARM_CPU(cs);
+    CPUARMState *env = &cpu->env;
 
     if (cpu->kvm_target == QEMU_KVM_ARM_TARGET_NONE ||
         !object_dynamic_cast(OBJECT(cpu), TYPE_AARCH64_CPU)) {
@@ -501,10 +508,14 @@ int kvm_arch_init_vcpu(CPUState *cs)
     if (!arm_feature(&cpu->env, ARM_FEATURE_AARCH64)) {
         cpu->kvm_init_features[0] |= 1 << KVM_ARM_VCPU_EL1_32BIT;
     }
-    if (kvm_irqchip_in_kernel() &&
-        kvm_check_extension(cs->kvm_state, KVM_CAP_ARM_PMU_V3)) {
-        cpu->has_pmu = true;
+    if (!kvm_irqchip_in_kernel() ||
+        !kvm_check_extension(cs->kvm_state, KVM_CAP_ARM_PMU_V3)) {
+            cpu->has_pmu = false;
+    }
+    if (cpu->has_pmu) {
         cpu->kvm_init_features[0] |= 1 << KVM_ARM_VCPU_PMU_V3;
+    } else {
+        unset_feature(&env->features, ARM_FEATURE_PMU);
     }
 
     /* Do KVM_ARM_VCPU_INIT ioctl */
diff --git a/target-i386/machine.c b/target-i386/machine.c
index 71c0e4dc47..48037f1575 100644
--- a/target-i386/machine.c
+++ b/target-i386/machine.c
@@ -7,10 +7,7 @@
 #include "hw/i386/pc.h"
 #include "hw/isa/isa.h"
 #include "migration/cpu.h"
-#include "exec/exec-all.h"
 
-#include "cpu.h"
-#include "exec/exec-all.h"
 #include "sysemu/kvm.h"
 
 #include "qemu/error-report.h"
diff --git a/target-lm32/translate.c b/target-lm32/translate.c
index 2d8caebbfc..842af63a98 100644
--- a/target-lm32/translate.c
+++ b/target-lm32/translate.c
@@ -33,12 +33,14 @@
 #include "exec/log.h"
 
 
-#define DISAS_LM32 1
-#if DISAS_LM32
-#  define LOG_DIS(...) qemu_log_mask(CPU_LOG_TB_IN_ASM, ## __VA_ARGS__)
-#else
-#  define LOG_DIS(...) do { } while (0)
-#endif
+#define DISAS_LM32 0
+
+#define LOG_DIS(...) \
+    do { \
+        if (DISAS_LM32) { \
+            qemu_log_mask(CPU_LOG_TB_IN_ASM, ## __VA_ARGS__); \
+        } \
+    } while (0)
 
 #define EXTRACT_FIELD(src, start, end) \
             (((src) >> start) & ((1 << (end - start + 1)) - 1))
@@ -211,7 +213,7 @@ static void dec_and(DisasContext *dc)
 
 static void dec_andhi(DisasContext *dc)
 {
-    LOG_DIS("andhi r%d, r%d, %d\n", dc->r2, dc->r0, dc->imm16);
+    LOG_DIS("andhi r%d, r%d, %d\n", dc->r1, dc->r0, dc->imm16);
 
     tcg_gen_andi_tl(cpu_R[dc->r1], cpu_R[dc->r0], (dc->imm16 << 16));
 }
@@ -274,7 +276,7 @@ static inline void gen_cond_branch(DisasContext *dc, int cond)
 
 static void dec_be(DisasContext *dc)
 {
-    LOG_DIS("be r%d, r%d, %d\n", dc->r0, dc->r1,
+    LOG_DIS("be r%d, r%d, %d\n", dc->r1, dc->r0,
             sign_extend(dc->imm16, 16) * 4);
 
     gen_cond_branch(dc, TCG_COND_EQ);
@@ -282,7 +284,7 @@ static void dec_be(DisasContext *dc)
 
 static void dec_bg(DisasContext *dc)
 {
-    LOG_DIS("bg r%d, r%d, %d\n", dc->r0, dc->r1,
-            sign_extend(dc->imm16, 16 * 4));
+    LOG_DIS("bg r%d, r%d, %d\n", dc->r1, dc->r0,
+            sign_extend(dc->imm16, 16) * 4);
 
     gen_cond_branch(dc, TCG_COND_GT);
@@ -290,7 +292,7 @@ static void dec_bg(DisasContext *dc)
 
 static void dec_bge(DisasContext *dc)
 {
-    LOG_DIS("bge r%d, r%d, %d\n", dc->r0, dc->r1,
+    LOG_DIS("bge r%d, r%d, %d\n", dc->r1, dc->r0,
             sign_extend(dc->imm16, 16) * 4);
 
     gen_cond_branch(dc, TCG_COND_GE);
@@ -298,7 +300,7 @@ static void dec_bge(DisasContext *dc)
 
 static void dec_bgeu(DisasContext *dc)
 {
-    LOG_DIS("bgeu r%d, r%d, %d\n", dc->r0, dc->r1,
+    LOG_DIS("bgeu r%d, r%d, %d\n", dc->r1, dc->r0,
             sign_extend(dc->imm16, 16) * 4);
 
     gen_cond_branch(dc, TCG_COND_GEU);
@@ -306,7 +308,7 @@ static void dec_bgeu(DisasContext *dc)
 
 static void dec_bgu(DisasContext *dc)
 {
-    LOG_DIS("bgu r%d, r%d, %d\n", dc->r0, dc->r1,
+    LOG_DIS("bgu r%d, r%d, %d\n", dc->r1, dc->r0,
             sign_extend(dc->imm16, 16) * 4);
 
     gen_cond_branch(dc, TCG_COND_GTU);
@@ -314,7 +316,7 @@ static void dec_bgu(DisasContext *dc)
 
 static void dec_bne(DisasContext *dc)
 {
-    LOG_DIS("bne r%d, r%d, %d\n", dc->r0, dc->r1,
+    LOG_DIS("bne r%d, r%d, %d\n", dc->r1, dc->r0,
             sign_extend(dc->imm16, 16) * 4);
 
     gen_cond_branch(dc, TCG_COND_NE);
@@ -342,9 +344,6 @@ static void dec_calli(DisasContext *dc)
 
 static inline void gen_compare(DisasContext *dc, int cond)
 {
-    int rX = (dc->format == OP_FMT_RR) ? dc->r2 : dc->r1;
-    int rY = (dc->format == OP_FMT_RR) ? dc->r0 : dc->r0;
-    int rZ = (dc->format == OP_FMT_RR) ? dc->r1 : -1;
     int i;
 
     if (dc->format == OP_FMT_RI) {
@@ -358,16 +357,16 @@ static inline void gen_compare(DisasContext *dc, int cond)
             break;
         }
 
-        tcg_gen_setcondi_tl(cond, cpu_R[rX], cpu_R[rY], i);
+        tcg_gen_setcondi_tl(cond, cpu_R[dc->r1], cpu_R[dc->r0], i);
     } else {
-        tcg_gen_setcond_tl(cond, cpu_R[rX], cpu_R[rY], cpu_R[rZ]);
+        tcg_gen_setcond_tl(cond, cpu_R[dc->r2], cpu_R[dc->r0], cpu_R[dc->r1]);
     }
 }
 
 static void dec_cmpe(DisasContext *dc)
 {
     if (dc->format == OP_FMT_RI) {
-        LOG_DIS("cmpei r%d, r%d, %d\n", dc->r0, dc->r1,
+        LOG_DIS("cmpei r%d, r%d, %d\n", dc->r1, dc->r0,
                 sign_extend(dc->imm16, 16));
     } else {
         LOG_DIS("cmpe r%d, r%d, r%d\n", dc->r2, dc->r0, dc->r1);
@@ -379,7 +378,7 @@ static void dec_cmpe(DisasContext *dc)
 static void dec_cmpg(DisasContext *dc)
 {
     if (dc->format == OP_FMT_RI) {
-        LOG_DIS("cmpgi r%d, r%d, %d\n", dc->r0, dc->r1,
+        LOG_DIS("cmpgi r%d, r%d, %d\n", dc->r1, dc->r0,
                 sign_extend(dc->imm16, 16));
     } else {
         LOG_DIS("cmpg r%d, r%d, r%d\n", dc->r2, dc->r0, dc->r1);
@@ -391,7 +390,7 @@ static void dec_cmpg(DisasContext *dc)
 static void dec_cmpge(DisasContext *dc)
 {
     if (dc->format == OP_FMT_RI) {
-        LOG_DIS("cmpgei r%d, r%d, %d\n", dc->r0, dc->r1,
+        LOG_DIS("cmpgei r%d, r%d, %d\n", dc->r1, dc->r0,
                 sign_extend(dc->imm16, 16));
     } else {
         LOG_DIS("cmpge r%d, r%d, r%d\n", dc->r2, dc->r0, dc->r1);
@@ -403,7 +402,7 @@ static void dec_cmpge(DisasContext *dc)
 static void dec_cmpgeu(DisasContext *dc)
 {
     if (dc->format == OP_FMT_RI) {
-        LOG_DIS("cmpgeui r%d, r%d, %d\n", dc->r0, dc->r1,
+        LOG_DIS("cmpgeui r%d, r%d, %d\n", dc->r1, dc->r0,
                 zero_extend(dc->imm16, 16));
     } else {
         LOG_DIS("cmpgeu r%d, r%d, r%d\n", dc->r2, dc->r0, dc->r1);
@@ -415,7 +414,7 @@ static void dec_cmpgeu(DisasContext *dc)
 static void dec_cmpgu(DisasContext *dc)
 {
     if (dc->format == OP_FMT_RI) {
-        LOG_DIS("cmpgui r%d, r%d, %d\n", dc->r0, dc->r1,
+        LOG_DIS("cmpgui r%d, r%d, %d\n", dc->r1, dc->r0,
                 zero_extend(dc->imm16, 16));
     } else {
         LOG_DIS("cmpgu r%d, r%d, r%d\n", dc->r2, dc->r0, dc->r1);
@@ -427,7 +426,7 @@ static void dec_cmpgu(DisasContext *dc)
 static void dec_cmpne(DisasContext *dc)
 {
     if (dc->format == OP_FMT_RI) {
-        LOG_DIS("cmpnei r%d, r%d, %d\n", dc->r0, dc->r1,
+        LOG_DIS("cmpnei r%d, r%d, %d\n", dc->r1, dc->r0,
                 sign_extend(dc->imm16, 16));
     } else {
         LOG_DIS("cmpne r%d, r%d, r%d\n", dc->r2, dc->r0, dc->r1);
@@ -539,7 +538,7 @@ static void dec_modu(DisasContext *dc)
 static void dec_mul(DisasContext *dc)
 {
     if (dc->format == OP_FMT_RI) {
-        LOG_DIS("muli r%d, r%d, %d\n", dc->r0, dc->r1,
+        LOG_DIS("muli r%d, r%d, %d\n", dc->r1, dc->r0,
                 sign_extend(dc->imm16, 16));
     } else {
         LOG_DIS("mul r%d, r%d, r%d\n", dc->r2, dc->r0, dc->r1);
@@ -563,7 +562,7 @@ static void dec_mul(DisasContext *dc)
 static void dec_nor(DisasContext *dc)
 {
     if (dc->format == OP_FMT_RI) {
-        LOG_DIS("nori r%d, r%d, %d\n", dc->r0, dc->r1,
+        LOG_DIS("nori r%d, r%d, %d\n", dc->r1, dc->r0,
                 zero_extend(dc->imm16, 16));
     } else {
         LOG_DIS("nor r%d, r%d, r%d\n", dc->r2, dc->r0, dc->r1);
@@ -865,7 +864,7 @@ static void dec_wcsr(DisasContext *dc)
 {
     int no;
 
-    LOG_DIS("wcsr r%d, %d\n", dc->r1, dc->csr);
+    LOG_DIS("wcsr %d, r%d\n", dc->csr, dc->r1);
 
     switch (dc->csr) {
     case CSR_IE:
@@ -959,7 +958,7 @@ static void dec_wcsr(DisasContext *dc)
 static void dec_xnor(DisasContext *dc)
 {
     if (dc->format == OP_FMT_RI) {
-        LOG_DIS("xnori r%d, r%d, %d\n", dc->r0, dc->r1,
+        LOG_DIS("xnori r%d, r%d, %d\n", dc->r1, dc->r0,
                 zero_extend(dc->imm16, 16));
     } else {
         if (dc->r1 == R_R0) {
@@ -981,7 +980,7 @@ static void dec_xnor(DisasContext *dc)
 static void dec_xor(DisasContext *dc)
 {
     if (dc->format == OP_FMT_RI) {
-        LOG_DIS("xori r%d, r%d, %d\n", dc->r0, dc->r1,
+        LOG_DIS("xori r%d, r%d, %d\n", dc->r1, dc->r0,
                 zero_extend(dc->imm16, 16));
     } else {
         LOG_DIS("xor r%d, r%d, r%d\n", dc->r2, dc->r0, dc->r1);
diff --git a/target-mips/machine.c b/target-mips/machine.c
index a27f2f156d..d20d948457 100644
--- a/target-mips/machine.c
+++ b/target-mips/machine.c
@@ -2,7 +2,6 @@
 #include "qemu-common.h"
 #include "cpu.h"
 #include "hw/hw.h"
-#include "cpu.h"
 #include "migration/cpu.h"
 
 static int cpu_post_load(void *opaque, int version_id)
diff --git a/target-ppc/machine.c b/target-ppc/machine.c
index 4820f22377..e43cb6c39d 100644
--- a/target-ppc/machine.c
+++ b/target-ppc/machine.c
@@ -8,7 +8,6 @@
 #include "helper_regs.h"
 #include "mmu-hash64.h"
 #include "migration/cpu.h"
-#include "exec/exec-all.h"
 
 static int cpu_load_old(QEMUFile *f, void *opaque, int version_id)
 {
diff --git a/target-ppc/mem_helper.c b/target-ppc/mem_helper.c
index 6548715831..1ab8a6eab4 100644
--- a/target-ppc/mem_helper.c
+++ b/target-ppc/mem_helper.c
@@ -23,7 +23,6 @@
 #include "exec/helper-proto.h"
 
 #include "helper_regs.h"
-#include "exec/exec-all.h"
 #include "exec/cpu_ldst.h"
 
 //#define DEBUG_OP
diff --git a/target-sparc/machine.c b/target-sparc/machine.c
index 59c92f7582..aea6397861 100644
--- a/target-sparc/machine.c
+++ b/target-sparc/machine.c
@@ -6,10 +6,7 @@
 #include "hw/boards.h"
 #include "qemu/timer.h"
 
-#include "cpu.h"
-#include "exec/exec-all.h"
 #include "migration/cpu.h"
-#include "exec/exec-all.h"
 
 #ifdef TARGET_SPARC64
 static const VMStateDescription vmstate_cpu_timer = {
diff --git a/target-xtensa/translate.c b/target-xtensa/translate.c
index 4c1e48748b..fb0fa56f1e 100644
--- a/target-xtensa/translate.c
+++ b/target-xtensa/translate.c
@@ -36,7 +36,6 @@
 #include "tcg-op.h"
 #include "qemu/log.h"
 #include "sysemu/sysemu.h"
-#include "exec/exec-all.h"
 #include "exec/cpu_ldst.h"
 #include "exec/semihost.h"
 
diff --git a/tests/.gitignore b/tests/.gitignore
index 64e050e859..c0d7857538 100644
--- a/tests/.gitignore
+++ b/tests/.gitignore
@@ -67,7 +67,6 @@ test-qmp-marshal.c
 test-qobject-output-visitor
 test-rcu-list
 test-replication
-test-rfifolock
 test-string-input-visitor
 test-string-output-visitor
 test-thread-pool
diff --git a/tests/Makefile.include b/tests/Makefile.include
index 1a135d2340..de516341fd 100644
--- a/tests/Makefile.include
+++ b/tests/Makefile.include
@@ -45,7 +45,6 @@ check-unit-y += tests/test-visitor-serialization$(EXESUF)
 check-unit-y += tests/test-iov$(EXESUF)
 gcov-files-test-iov-y = util/iov.c
 check-unit-y += tests/test-aio$(EXESUF)
-check-unit-$(CONFIG_POSIX) += tests/test-rfifolock$(EXESUF)
 check-unit-y += tests/test-throttle$(EXESUF)
 gcov-files-test-aio-$(CONFIG_WIN32) = aio-win32.c
 gcov-files-test-aio-$(CONFIG_POSIX) = aio-posix.c
@@ -491,7 +490,6 @@ tests/check-qom-proplist$(EXESUF): tests/check-qom-proplist.o $(test-qom-obj-y)
 tests/test-char$(EXESUF): tests/test-char.o qemu-char.o qemu-timer.o $(test-util-obj-y) $(qtest-obj-y) $(test-io-obj-y)
 tests/test-coroutine$(EXESUF): tests/test-coroutine.o $(test-block-obj-y)
 tests/test-aio$(EXESUF): tests/test-aio.o $(test-block-obj-y)
-tests/test-rfifolock$(EXESUF): tests/test-rfifolock.o $(test-util-obj-y)
 tests/test-throttle$(EXESUF): tests/test-throttle.o $(test-block-obj-y)
 tests/test-blockjob$(EXESUF): tests/test-blockjob.o $(test-block-obj-y) $(test-util-obj-y)
 tests/test-blockjob-txn$(EXESUF): tests/test-blockjob-txn.o $(test-block-obj-y) $(test-util-obj-y)
diff --git a/tests/crypto-tls-x509-helpers.h b/tests/crypto-tls-x509-helpers.h
index a8faa92bc0..921341c649 100644
--- a/tests/crypto-tls-x509-helpers.h
+++ b/tests/crypto-tls-x509-helpers.h
@@ -21,9 +21,6 @@
 #include <gnutls/gnutls.h>
 #include <gnutls/x509.h>
 
-#include <gnutls/gnutls.h>
-#include <gnutls/x509.h>
-
 #if !(defined WIN32) && \
     defined(CONFIG_TASN1) && \
     (LIBGNUTLS_VERSION_NUMBER >= 0x020600)
diff --git a/tests/test-aio.c b/tests/test-aio.c
index 03aa846970..5be99f8287 100644
--- a/tests/test-aio.c
+++ b/tests/test-aio.c
@@ -100,6 +100,7 @@ static void event_ready_cb(EventNotifier *e)
 
 typedef struct {
     QemuMutex start_lock;
+    EventNotifier notifier;
     bool thread_acquired;
 } AcquireTestData;
 
@@ -111,6 +112,11 @@ static void *test_acquire_thread(void *opaque)
     qemu_mutex_lock(&data->start_lock);
     qemu_mutex_unlock(&data->start_lock);
 
+    /* event_notifier_set might be called either before or after
+     * the main thread's call to poll().  The test case's outcome
+     * should be the same in either case.
+     */
+    event_notifier_set(&data->notifier);
     aio_context_acquire(ctx);
     aio_context_release(ctx);
 
@@ -125,20 +131,19 @@ static void set_event_notifier(AioContext *ctx, EventNotifier *notifier,
     aio_set_event_notifier(ctx, notifier, false, handler);
 }
 
-static void dummy_notifier_read(EventNotifier *unused)
+static void dummy_notifier_read(EventNotifier *n)
 {
-    g_assert(false); /* should never be invoked */
+    event_notifier_test_and_clear(n); /* clear the test thread's kick */
 }
 
 static void test_acquire(void)
 {
     QemuThread thread;
-    EventNotifier notifier;
     AcquireTestData data;
 
     /* Dummy event notifier ensures aio_poll() will block */
-    event_notifier_init(&notifier, false);
-    set_event_notifier(ctx, &notifier, dummy_notifier_read);
+    event_notifier_init(&data.notifier, false);
+    set_event_notifier(ctx, &data.notifier, dummy_notifier_read);
     g_assert(!aio_poll(ctx, false)); /* consume aio_notify() */
 
     qemu_mutex_init(&data.start_lock);
@@ -152,12 +157,13 @@ static void test_acquire(void)
     /* Block in aio_poll(), let other thread kick us and acquire context */
     aio_context_acquire(ctx);
     qemu_mutex_unlock(&data.start_lock); /* let the thread run */
-    g_assert(!aio_poll(ctx, true));
+    g_assert(aio_poll(ctx, true));
+    g_assert(!data.thread_acquired);
     aio_context_release(ctx);
 
     qemu_thread_join(&thread);
-    set_event_notifier(ctx, &notifier, NULL);
-    event_notifier_cleanup(&notifier);
+    set_event_notifier(ctx, &data.notifier, NULL);
+    event_notifier_cleanup(&data.notifier);
 
     g_assert(data.thread_acquired);
 }
diff --git a/tests/test-rfifolock.c b/tests/test-rfifolock.c
deleted file mode 100644
index 471a81114d..0000000000
--- a/tests/test-rfifolock.c
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * RFifoLock tests
- *
- * Copyright Red Hat, Inc. 2013
- *
- * Authors:
- *  Stefan Hajnoczi    <stefanha@redhat.com>
- *
- * This work is licensed under the terms of the GNU LGPL, version 2 or later.
- * See the COPYING.LIB file in the top-level directory.
- */
-
-#include "qemu/osdep.h"
-#include "qemu-common.h"
-#include "qemu/rfifolock.h"
-
-static void test_nesting(void)
-{
-    RFifoLock lock;
-
-    /* Trivial test, ensure the lock is recursive */
-    rfifolock_init(&lock, NULL, NULL);
-    rfifolock_lock(&lock);
-    rfifolock_lock(&lock);
-    rfifolock_lock(&lock);
-    rfifolock_unlock(&lock);
-    rfifolock_unlock(&lock);
-    rfifolock_unlock(&lock);
-    rfifolock_destroy(&lock);
-}
-
-typedef struct {
-    RFifoLock lock;
-    int fd[2];
-} CallbackTestData;
-
-static void rfifolock_cb(void *opaque)
-{
-    CallbackTestData *data = opaque;
-    int ret;
-    char c = 0;
-
-    ret = write(data->fd[1], &c, sizeof(c));
-    g_assert(ret == 1);
-}
-
-static void *callback_thread(void *opaque)
-{
-    CallbackTestData *data = opaque;
-
-    /* The other thread holds the lock so the contention callback will be
-     * invoked...
-     */
-    rfifolock_lock(&data->lock);
-    rfifolock_unlock(&data->lock);
-    return NULL;
-}
-
-static void test_callback(void)
-{
-    CallbackTestData data;
-    QemuThread thread;
-    int ret;
-    char c;
-
-    rfifolock_init(&data.lock, rfifolock_cb, &data);
-    ret = qemu_pipe(data.fd);
-    g_assert(ret == 0);
-
-    /* Hold lock but allow the callback to kick us by writing to the pipe */
-    rfifolock_lock(&data.lock);
-    qemu_thread_create(&thread, "callback_thread",
-                       callback_thread, &data, QEMU_THREAD_JOINABLE);
-    ret = read(data.fd[0], &c, sizeof(c));
-    g_assert(ret == 1);
-    rfifolock_unlock(&data.lock);
-    /* If we got here then the callback was invoked, as expected */
-
-    qemu_thread_join(&thread);
-    close(data.fd[0]);
-    close(data.fd[1]);
-    rfifolock_destroy(&data.lock);
-}
-
-int main(int argc, char **argv)
-{
-    g_test_init(&argc, &argv, NULL);
-    g_test_add_func("/nesting", test_nesting);
-    g_test_add_func("/callback", test_callback);
-    return g_test_run();
-}
diff --git a/tests/vhost-user-test.c b/tests/vhost-user-test.c
index e4b2900898..96bf00eefa 100644
--- a/tests/vhost-user-test.c
+++ b/tests/vhost-user-test.c
@@ -22,8 +22,6 @@
 #include "libqos/virtio-pci.h"
 #include "qapi/error.h"
 
-#include "libqos/pci-pc.h"
-#include "libqos/virtio-pci.h"
 #include "libqos/malloc-pc.h"
 #include "hw/virtio/virtio-net.h"
 
diff --git a/util/Makefile.objs b/util/Makefile.objs
index 36c7dcc1fa..ad0f9c7fe4 100644
--- a/util/Makefile.objs
+++ b/util/Makefile.objs
@@ -25,7 +25,6 @@ util-obj-y += uuid.o
 util-obj-y += throttle.o
 util-obj-y += getauxval.o
 util-obj-y += readline.o
-util-obj-y += rfifolock.o
 util-obj-y += rcu.o
 util-obj-y += qemu-coroutine.o qemu-coroutine-lock.o qemu-coroutine-io.o
 util-obj-y += qemu-coroutine-sleep.o
diff --git a/util/oslib-posix.c b/util/oslib-posix.c
index 8ec99ccb4f..67c65893a4 100644
--- a/util/oslib-posix.c
+++ b/util/oslib-posix.c
@@ -28,7 +28,6 @@
 
 #include "qemu/osdep.h"
 #include <termios.h>
-#include <termios.h>
 
 #include <glib/gprintf.h>
 
diff --git a/util/qemu-thread-posix.c b/util/qemu-thread-posix.c
index ce51b37c1d..d20cddec0c 100644
--- a/util/qemu-thread-posix.c
+++ b/util/qemu-thread-posix.c
@@ -80,6 +80,20 @@ void qemu_mutex_unlock(QemuMutex *mutex)
         error_exit(err, __func__);
 }
 
+void qemu_rec_mutex_init(QemuRecMutex *mutex)
+{
+    int err;
+    pthread_mutexattr_t attr;
+
+    pthread_mutexattr_init(&attr);
+    pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
+    err = pthread_mutex_init(&mutex->lock, &attr);
+    pthread_mutexattr_destroy(&attr); /* attr is not used after init */
+    if (err) { /* only pthread_mutex_init's result is checked */
+        error_exit(err, __func__);
+    }
+}
+
 void qemu_cond_init(QemuCond *cond)
 {
     int err;
diff --git a/util/qemu-thread-win32.c b/util/qemu-thread-win32.c
index 072806f792..728e76b5b2 100644
--- a/util/qemu-thread-win32.c
+++ b/util/qemu-thread-win32.c
@@ -79,6 +79,31 @@ void qemu_mutex_unlock(QemuMutex *mutex)
     LeaveCriticalSection(&mutex->lock);
 }
 
+void qemu_rec_mutex_init(QemuRecMutex *mutex)
+{
+    InitializeCriticalSection(&mutex->lock); /* owner may re-enter it */
+}
+
+void qemu_rec_mutex_destroy(QemuRecMutex *mutex)
+{
+    DeleteCriticalSection(&mutex->lock);
+}
+
+void qemu_rec_mutex_lock(QemuRecMutex *mutex)
+{
+    EnterCriticalSection(&mutex->lock); /* blocks until ownership is granted */
+}
+
+int qemu_rec_mutex_trylock(QemuRecMutex *mutex)
+{
+    return !TryEnterCriticalSection(&mutex->lock); /* 0 when acquired */
+}
+
+void qemu_rec_mutex_unlock(QemuRecMutex *mutex)
+{
+    LeaveCriticalSection(&mutex->lock); /* one call per successful lock */
+}
+
 void qemu_cond_init(QemuCond *cond)
 {
     memset(cond, 0, sizeof(*cond));
diff --git a/util/rfifolock.c b/util/rfifolock.c
deleted file mode 100644
index 084c2f0ea1..0000000000
--- a/util/rfifolock.c
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Recursive FIFO lock
- *
- * Copyright Red Hat, Inc. 2013
- *
- * Authors:
- *  Stefan Hajnoczi   <stefanha@redhat.com>
- *
- * This work is licensed under the terms of the GNU LGPL, version 2 or later.
- * See the COPYING.LIB file in the top-level directory.
- *
- */
-
-#include "qemu/osdep.h"
-#include "qemu/rfifolock.h"
-
-void rfifolock_init(RFifoLock *r, void (*cb)(void *), void *opaque)
-{
-    qemu_mutex_init(&r->lock);
-    r->head = 0;
-    r->tail = 0;
-    qemu_cond_init(&r->cond);
-    r->nesting = 0;
-    r->cb = cb;
-    r->cb_opaque = opaque;
-}
-
-void rfifolock_destroy(RFifoLock *r)
-{
-    qemu_cond_destroy(&r->cond);
-    qemu_mutex_destroy(&r->lock);
-}
-
-/*
- * Theory of operation:
- *
- * In order to ensure FIFO ordering, implement a ticketlock.  Threads acquiring
- * the lock enqueue themselves by incrementing the tail index.  When the lock
- * is unlocked, the head is incremented and waiting threads are notified.
- *
- * Recursive locking does not take a ticket since the head is only incremented
- * when the outermost recursive caller unlocks.
- */
-void rfifolock_lock(RFifoLock *r)
-{
-    qemu_mutex_lock(&r->lock);
-
-    /* Take a ticket */
-    unsigned int ticket = r->tail++;
-
-    if (r->nesting > 0 && qemu_thread_is_self(&r->owner_thread)) {
-        r->tail--; /* put ticket back, we're nesting */
-    } else {
-        while (ticket != r->head) {
-            /* Invoke optional contention callback */
-            if (r->cb) {
-                r->cb(r->cb_opaque);
-            }
-            qemu_cond_wait(&r->cond, &r->lock);
-        }
-        qemu_thread_get_self(&r->owner_thread);
-    }
-
-    r->nesting++;
-    qemu_mutex_unlock(&r->lock);
-}
-
-void rfifolock_unlock(RFifoLock *r)
-{
-    qemu_mutex_lock(&r->lock);
-    assert(r->nesting > 0);
-    assert(qemu_thread_is_self(&r->owner_thread));
-    if (--r->nesting == 0) {
-        r->head++;
-        qemu_cond_broadcast(&r->cond);
-    }
-    qemu_mutex_unlock(&r->lock);
-}
diff --git a/vl.c b/vl.c
index 74dfe4eef9..368510fd8c 100644
--- a/vl.c
+++ b/vl.c
@@ -90,6 +90,7 @@ int main(int argc, char **argv)
 #include "audio/audio.h"
 #include "migration/migration.h"
 #include "sysemu/cpus.h"
+#include "migration/colo.h"
 #include "sysemu/kvm.h"
 #include "qapi/qmp/qjson.h"
 #include "qemu/option.h"
@@ -110,7 +111,6 @@ int main(int argc, char **argv)
 #include "trace.h"
 #include "trace/control.h"
 #include "qemu/queue.h"
-#include "sysemu/cpus.h"
 #include "sysemu/arch_init.h"
 
 #include "ui/qemu-spice.h"
@@ -613,6 +613,7 @@ static const RunStateTransition runstate_transitions_def[] = {
     { RUN_STATE_INMIGRATE, RUN_STATE_FINISH_MIGRATE },
     { RUN_STATE_INMIGRATE, RUN_STATE_PRELAUNCH },
     { RUN_STATE_INMIGRATE, RUN_STATE_POSTMIGRATE },
+    { RUN_STATE_INMIGRATE, RUN_STATE_COLO },
 
     { RUN_STATE_INTERNAL_ERROR, RUN_STATE_PAUSED },
     { RUN_STATE_INTERNAL_ERROR, RUN_STATE_FINISH_MIGRATE },
@@ -625,6 +626,7 @@ static const RunStateTransition runstate_transitions_def[] = {
     { RUN_STATE_PAUSED, RUN_STATE_RUNNING },
     { RUN_STATE_PAUSED, RUN_STATE_FINISH_MIGRATE },
     { RUN_STATE_PAUSED, RUN_STATE_PRELAUNCH },
+    { RUN_STATE_PAUSED, RUN_STATE_COLO },
 
     { RUN_STATE_POSTMIGRATE, RUN_STATE_RUNNING },
     { RUN_STATE_POSTMIGRATE, RUN_STATE_FINISH_MIGRATE },
@@ -637,10 +639,13 @@ static const RunStateTransition runstate_transitions_def[] = {
     { RUN_STATE_FINISH_MIGRATE, RUN_STATE_RUNNING },
     { RUN_STATE_FINISH_MIGRATE, RUN_STATE_POSTMIGRATE },
     { RUN_STATE_FINISH_MIGRATE, RUN_STATE_PRELAUNCH },
+    { RUN_STATE_FINISH_MIGRATE, RUN_STATE_COLO },
 
     { RUN_STATE_RESTORE_VM, RUN_STATE_RUNNING },
     { RUN_STATE_RESTORE_VM, RUN_STATE_PRELAUNCH },
 
+    { RUN_STATE_COLO, RUN_STATE_RUNNING },
+
     { RUN_STATE_RUNNING, RUN_STATE_DEBUG },
     { RUN_STATE_RUNNING, RUN_STATE_INTERNAL_ERROR },
     { RUN_STATE_RUNNING, RUN_STATE_IO_ERROR },
@@ -651,6 +656,7 @@ static const RunStateTransition runstate_transitions_def[] = {
     { RUN_STATE_RUNNING, RUN_STATE_SHUTDOWN },
     { RUN_STATE_RUNNING, RUN_STATE_WATCHDOG },
     { RUN_STATE_RUNNING, RUN_STATE_GUEST_PANICKED },
+    { RUN_STATE_RUNNING, RUN_STATE_COLO },
 
     { RUN_STATE_SAVE_VM, RUN_STATE_RUNNING },
 
@@ -663,10 +669,12 @@ static const RunStateTransition runstate_transitions_def[] = {
     { RUN_STATE_SUSPENDED, RUN_STATE_RUNNING },
     { RUN_STATE_SUSPENDED, RUN_STATE_FINISH_MIGRATE },
     { RUN_STATE_SUSPENDED, RUN_STATE_PRELAUNCH },
+    { RUN_STATE_SUSPENDED, RUN_STATE_COLO },
 
     { RUN_STATE_WATCHDOG, RUN_STATE_RUNNING },
     { RUN_STATE_WATCHDOG, RUN_STATE_FINISH_MIGRATE },
     { RUN_STATE_WATCHDOG, RUN_STATE_PRELAUNCH },
+    { RUN_STATE_WATCHDOG, RUN_STATE_COLO },
 
     { RUN_STATE_GUEST_PANICKED, RUN_STATE_RUNNING },
     { RUN_STATE_GUEST_PANICKED, RUN_STATE_FINISH_MIGRATE },
@@ -2408,8 +2416,9 @@ static int mon_init_func(void *opaque, QemuOpts *opts, Error **errp)
     if (qemu_opt_get_bool(opts, "pretty", 0))
         flags |= MONITOR_USE_PRETTY;
 
-    if (qemu_opt_get_bool(opts, "default", 0))
-        flags |= MONITOR_IS_DEFAULT;
+    if (qemu_opt_get_bool(opts, "default", 0)) {
+        error_report("option 'default' does nothing and is deprecated");
+    }
 
     chardev = qemu_opt_get(opts, "chardev");
     chr = qemu_chr_find(chardev);
@@ -2428,16 +2437,12 @@ static void monitor_parse(const char *optarg, const char *mode, bool pretty)
     QemuOpts *opts;
     const char *p;
     char label[32];
-    int def = 0;
 
     if (strstart(optarg, "chardev:", &p)) {
         snprintf(label, sizeof(label), "%s", p);
     } else {
         snprintf(label, sizeof(label), "compat_monitor%d",
                  monitor_device_index);
-        if (monitor_device_index == 0) {
-            def = 1;
-        }
         opts = qemu_chr_parse_compat(label, optarg);
         if (!opts) {
             error_report("parse error: %s", optarg);
@@ -2449,8 +2454,6 @@ static void monitor_parse(const char *optarg, const char *mode, bool pretty)
     qemu_opt_set(opts, "mode", mode, &error_abort);
     qemu_opt_set(opts, "chardev", label, &error_abort);
     qemu_opt_set_bool(opts, "pretty", pretty, &error_abort);
-    if (def)
-        qemu_opt_set(opts, "default", "on", &error_abort);
     monitor_device_index++;
 }
 
@@ -4426,6 +4429,8 @@ int main(int argc, char **argv, char **envp)
 #endif
     }
 
+    colo_info_init();
+
     if (net_init_clients() < 0) {
         exit(1);
     }
diff --git a/xen-common.c b/xen-common.c
index e641ad1aef..909976071c 100644
--- a/xen-common.c
+++ b/xen-common.c
@@ -116,12 +116,12 @@ static int xen_init(MachineState *ms)
 {
     xen_xc = xc_interface_open(0, 0, 0);
     if (xen_xc == NULL) {
-        xen_be_printf(NULL, 0, "can't open xen interface\n");
+        xen_pv_printf(NULL, 0, "can't open xen interface\n");
         return -1;
     }
     xen_fmem = xenforeignmemory_open(0, 0);
     if (xen_fmem == NULL) {
-        xen_be_printf(NULL, 0, "can't open xen fmem interface\n");
+        xen_pv_printf(NULL, 0, "can't open xen fmem interface\n");
         xc_interface_close(xen_xc);
         return -1;
     }