16 files changed, 203 insertions, 48 deletions
diff --git a/async.c b/async.c
index 90fe906539..5ce3633010 100644
--- a/async.c
+++ b/async.c
@@ -47,11 +47,16 @@ QEMUBH *aio_bh_new(AioContext *ctx, QEMUBHFunc *cb, void *opaque)
     bh->ctx = ctx;
     bh->cb = cb;
     bh->opaque = opaque;
+    qemu_mutex_lock(&ctx->bh_lock);
     bh->next = ctx->first_bh;
+    /* Make sure that the members are ready before putting bh into list */
+    smp_wmb();
     ctx->first_bh = bh;
+    qemu_mutex_unlock(&ctx->bh_lock);
     return bh;
 }
 
+/* Multiple occurrences of aio_bh_poll cannot be called concurrently */
 int aio_bh_poll(AioContext *ctx)
 {
     QEMUBH *bh, **bhp, *next;
@@ -61,9 +66,15 @@ int aio_bh_poll(AioContext *ctx)
 
     ret = 0;
     for (bh = ctx->first_bh; bh; bh = next) {
+        /* Make sure that fetching bh happens before accessing its members */
+        smp_read_barrier_depends();
         next = bh->next;
         if (!bh->deleted && bh->scheduled) {
             bh->scheduled = 0;
+            /* Paired with write barrier in bh schedule to ensure reading for
+             * idle & callbacks coming after bh's scheduling.
+             */
+            smp_rmb();
             if (!bh->idle)
                 ret = 1;
             bh->idle = 0;
@@ -75,6 +86,7 @@ int aio_bh_poll(AioContext *ctx)
 
     /* remove deleted bhs */
     if (!ctx->walking_bh) {
+        qemu_mutex_lock(&ctx->bh_lock);
         bhp = &ctx->first_bh;
         while (*bhp) {
             bh = *bhp;
@@ -85,6 +97,7 @@ int aio_bh_poll(AioContext *ctx)
                 bhp = &bh->next;
             }
         }
+        qemu_mutex_unlock(&ctx->bh_lock);
     }
 
     return ret;
@@ -94,24 +107,38 @@ void qemu_bh_schedule_idle(QEMUBH *bh)
 {
     if (bh->scheduled)
         return;
-    bh->scheduled = 1;
     bh->idle = 1;
+    /* Make sure that idle & any writes needed by the callback are done
+     * before the locations are read in the aio_bh_poll.
+     */
+    smp_wmb();
+    bh->scheduled = 1;
 }
 
 void qemu_bh_schedule(QEMUBH *bh)
 {
     if (bh->scheduled)
         return;
-    bh->scheduled = 1;
     bh->idle = 0;
+    /* Make sure that idle & any writes needed by the callback are done
+     * before the locations are read in the aio_bh_poll.
+     */
+    smp_wmb();
+    bh->scheduled = 1;
     aio_notify(bh->ctx);
 }
 
+
+/* This func is async.
+ */
 void qemu_bh_cancel(QEMUBH *bh)
 {
     bh->scheduled = 0;
 }
 
+/* This func is async.The bottom half will do the delete action at the finial
+ * end.
+ */
 void qemu_bh_delete(QEMUBH *bh)
 {
     bh->scheduled = 0;
@@ -176,6 +203,7 @@ aio_ctx_finalize(GSource     *source)
     thread_pool_free(ctx->thread_pool);
     aio_set_event_notifier(ctx, &ctx->notifier, NULL, NULL);
     event_notifier_cleanup(&ctx->notifier);
+    qemu_mutex_destroy(&ctx->bh_lock);
     g_array_free(ctx->pollfds, TRUE);
 }
 
@@ -211,6 +239,7 @@ AioContext *aio_context_new(void)
     ctx = (AioContext *) g_source_new(&aio_source_funcs, sizeof(AioContext));
     ctx->pollfds = g_array_new(FALSE, FALSE, sizeof(GPollFD));
     ctx->thread_pool = NULL;
+    qemu_mutex_init(&ctx->bh_lock);
     event_notifier_init(&ctx->notifier, false);
     aio_set_event_notifier(ctx, &ctx->notifier, 
                            (EventNotifierHandler *)
diff --git a/block-migration.c b/block-migration.c
index 2fd7699794..f803f2006f 100644
--- a/block-migration.c
+++ b/block-migration.c
@@ -29,6 +29,7 @@
 #define BLK_MIG_FLAG_DEVICE_BLOCK       0x01
 #define BLK_MIG_FLAG_EOS                0x02
 #define BLK_MIG_FLAG_PROGRESS           0x04
+#define BLK_MIG_FLAG_ZERO_BLOCK         0x08
 
 #define MAX_IS_ALLOCATED_SEARCH 65536
 
@@ -80,6 +81,7 @@ typedef struct BlkMigState {
     int shared_base;
     QSIMPLEQ_HEAD(bmds_list, BlkMigDevState) bmds_list;
     int64_t total_sector_sum;
+    bool zero_blocks;
 
     /* Protected by lock.  */
     QSIMPLEQ_HEAD(blk_list, BlkMigBlock) blk_list;
@@ -114,16 +116,30 @@ static void blk_mig_unlock(void)
 static void blk_send(QEMUFile *f, BlkMigBlock * blk)
 {
     int len;
+    uint64_t flags = BLK_MIG_FLAG_DEVICE_BLOCK;
+
+    if (block_mig_state.zero_blocks &&
+        buffer_is_zero(blk->buf, BLOCK_SIZE)) {
+        flags |= BLK_MIG_FLAG_ZERO_BLOCK;
+    }
 
     /* sector number and flags */
     qemu_put_be64(f, (blk->sector << BDRV_SECTOR_BITS)
-                     | BLK_MIG_FLAG_DEVICE_BLOCK);
+                     | flags);
 
     /* device name */
     len = strlen(blk->bmds->bs->device_name);
     qemu_put_byte(f, len);
     qemu_put_buffer(f, (uint8_t *)blk->bmds->bs->device_name, len);
 
+    /* if a block is zero we need to flush here since the network
+     * bandwidth is now a lot higher than the storage device bandwidth.
+     * thus if we queue zero blocks we slow down the migration */
+    if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
+        qemu_fflush(f);
+        return;
+    }
+
     qemu_put_buffer(f, blk->buf, BLOCK_SIZE);
 }
 
@@ -344,6 +360,7 @@ static void init_blk_migration(QEMUFile *f)
     block_mig_state.total_sector_sum = 0;
     block_mig_state.prev_progress = -1;
     block_mig_state.bulk_completed = 0;
+    block_mig_state.zero_blocks = migrate_zero_blocks();
 
     bdrv_iterate(init_blk_migration_it, NULL);
 }
@@ -762,12 +779,15 @@ static int block_load(QEMUFile *f, void *opaque, int version_id)
                 nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
             }
 
-            buf = g_malloc(BLOCK_SIZE);
-
-            qemu_get_buffer(f, buf, BLOCK_SIZE);
-            ret = bdrv_write(bs, addr, buf, nr_sectors);
+            if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
+                ret = bdrv_write_zeroes(bs, addr, nr_sectors);
+            } else {
+                buf = g_malloc(BLOCK_SIZE);
+                qemu_get_buffer(f, buf, BLOCK_SIZE);
+                ret = bdrv_write(bs, addr, buf, nr_sectors);
+                g_free(buf);
+            }
 
-            g_free(buf);
             if (ret < 0) {
                 return ret;
             }
diff --git a/block.c b/block.c
index b56024113b..6cd39fa146 100644
--- a/block.c
+++ b/block.c
@@ -2162,6 +2162,7 @@ typedef struct RwCo {
     QEMUIOVector *qiov;
     bool is_write;
     int ret;
+    BdrvRequestFlags flags;
 } RwCo;
 
 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
@@ -2170,10 +2171,12 @@ static void coroutine_fn bdrv_rw_co_entry(void *opaque)
 
     if (!rwco->is_write) {
         rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
-                                     rwco->nb_sectors, rwco->qiov, 0);
+                                     rwco->nb_sectors, rwco->qiov,
+                                     rwco->flags);
     } else {
         rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
-                                      rwco->nb_sectors, rwco->qiov, 0);
+                                      rwco->nb_sectors, rwco->qiov,
+                                      rwco->flags);
     }
 }
 
@@ -2181,7 +2184,8 @@ static void coroutine_fn bdrv_rw_co_entry(void *opaque)
  * Process a vectored synchronous request using coroutines
  */
 static int bdrv_rwv_co(BlockDriverState *bs, int64_t sector_num,
-                       QEMUIOVector *qiov, bool is_write)
+                       QEMUIOVector *qiov, bool is_write,
+                       BdrvRequestFlags flags)
 {
     Coroutine *co;
     RwCo rwco = {
@@ -2191,6 +2195,7 @@ static int bdrv_rwv_co(BlockDriverState *bs, int64_t sector_num,
         .qiov = qiov,
         .is_write = is_write,
         .ret = NOT_DONE,
+        .flags = flags,
     };
     assert((qiov->size & (BDRV_SECTOR_SIZE - 1)) == 0);
 
@@ -2222,7 +2227,7 @@ static int bdrv_rwv_co(BlockDriverState *bs, int64_t sector_num,
  * Process a synchronous request using coroutines
  */
 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
-                      int nb_sectors, bool is_write)
+                      int nb_sectors, bool is_write, BdrvRequestFlags flags)
 {
     QEMUIOVector qiov;
     struct iovec iov = {
@@ -2231,14 +2236,14 @@ static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
     };
 
     qemu_iovec_init_external(&qiov, &iov, 1);
-    return bdrv_rwv_co(bs, sector_num, &qiov, is_write);
+    return bdrv_rwv_co(bs, sector_num, &qiov, is_write, flags);
 }
 
 /* return < 0 if error. See bdrv_write() for the return codes */
 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
               uint8_t *buf, int nb_sectors)
 {
-    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
+    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
 }
 
 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
@@ -2250,7 +2255,7 @@ int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
 
     enabled = bs->io_limits_enabled;
     bs->io_limits_enabled = false;
-    ret = bdrv_read(bs, 0, buf, 1);
+    ret = bdrv_read(bs, sector_num, buf, nb_sectors);
     bs->io_limits_enabled = enabled;
     return ret;
 }
@@ -2264,12 +2269,18 @@ int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
                const uint8_t *buf, int nb_sectors)
 {
-    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
+    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
 }
 
 int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov)
 {
-    return bdrv_rwv_co(bs, sector_num, qiov, true);
+    return bdrv_rwv_co(bs, sector_num, qiov, true, 0);
+}
+
+int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
+{
+    return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
+                      BDRV_REQ_ZERO_WRITE);
 }
 
 int bdrv_pread(BlockDriverState *bs, int64_t offset,
diff --git a/block/gluster.c b/block/gluster.c
index 61424bcb01..6de418c0bd 100644
--- a/block/gluster.c
+++ b/block/gluster.c
@@ -532,6 +532,39 @@ out:
     return NULL;
 }
 
+#ifdef CONFIG_GLUSTERFS_DISCARD
+static BlockDriverAIOCB *qemu_gluster_aio_discard(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, BlockDriverCompletionFunc *cb,
+        void *opaque)
+{
+    int ret;
+    GlusterAIOCB *acb;
+    BDRVGlusterState *s = bs->opaque;
+    size_t size;
+    off_t offset;
+
+    offset = sector_num * BDRV_SECTOR_SIZE;
+    size = nb_sectors * BDRV_SECTOR_SIZE;
+
+    acb = qemu_aio_get(&gluster_aiocb_info, bs, cb, opaque);
+    acb->size = 0;
+    acb->ret = 0;
+    acb->finished = NULL;
+    s->qemu_aio_count++;
+
+    ret = glfs_discard_async(s->fd, offset, size, &gluster_finish_aiocb, acb);
+    if (ret < 0) {
+        goto out;
+    }
+    return &acb->common;
+
+out:
+    s->qemu_aio_count--;
+    qemu_aio_release(acb);
+    return NULL;
+}
+#endif
+
 static int64_t qemu_gluster_getlength(BlockDriverState *bs)
 {
     BDRVGlusterState *s = bs->opaque;
@@ -602,6 +635,9 @@ static BlockDriver bdrv_gluster = {
     .bdrv_aio_writev              = qemu_gluster_aio_writev,
     .bdrv_aio_flush               = qemu_gluster_aio_flush,
     .bdrv_has_zero_init           = qemu_gluster_has_zero_init,
+#ifdef CONFIG_GLUSTERFS_DISCARD
+    .bdrv_aio_discard             = qemu_gluster_aio_discard,
+#endif
     .create_options               = qemu_gluster_create_options,
 };
 
@@ -618,6 +654,9 @@ static BlockDriver bdrv_gluster_tcp = {
     .bdrv_aio_writev              = qemu_gluster_aio_writev,
     .bdrv_aio_flush               = qemu_gluster_aio_flush,
     .bdrv_has_zero_init           = qemu_gluster_has_zero_init,
+#ifdef CONFIG_GLUSTERFS_DISCARD
+    .bdrv_aio_discard             = qemu_gluster_aio_discard,
+#endif
     .create_options               = qemu_gluster_create_options,
 };
 
@@ -634,6 +673,9 @@ static BlockDriver bdrv_gluster_unix = {
     .bdrv_aio_writev              = qemu_gluster_aio_writev,
     .bdrv_aio_flush               = qemu_gluster_aio_flush,
     .bdrv_has_zero_init           = qemu_gluster_has_zero_init,
+#ifdef CONFIG_GLUSTERFS_DISCARD
+    .bdrv_aio_discard             = qemu_gluster_aio_discard,
+#endif
     .create_options               = qemu_gluster_create_options,
 };
 
@@ -650,6 +692,9 @@ static BlockDriver bdrv_gluster_rdma = {
     .bdrv_aio_writev              = qemu_gluster_aio_writev,
     .bdrv_aio_flush               = qemu_gluster_aio_flush,
     .bdrv_has_zero_init           = qemu_gluster_has_zero_init,
+#ifdef CONFIG_GLUSTERFS_DISCARD
+    .bdrv_aio_discard             = qemu_gluster_aio_discard,
+#endif
     .create_options               = qemu_gluster_create_options,
 };
 
diff --git a/block/raw.c b/block/raw.c
index ce10422006..f1682d4f32 100644
--- a/block/raw.c
+++ b/block/raw.c
@@ -42,6 +42,13 @@ static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs,
     return bdrv_co_is_allocated(bs->file, sector_num, nb_sectors, pnum);
 }
 
+static int coroutine_fn raw_co_write_zeroes(BlockDriverState *bs,
+                                            int64_t sector_num,
+                                            int nb_sectors)
+{
+    return bdrv_co_write_zeroes(bs->file, sector_num, nb_sectors);
+}
+
 static int64_t raw_getlength(BlockDriverState *bs)
 {
     return bdrv_getlength(bs->file);
@@ -114,6 +121,11 @@ static int raw_has_zero_init(BlockDriverState *bs)
     return bdrv_has_zero_init(bs->file);
 }
 
+static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
+{
+    return bdrv_get_info(bs->file, bdi);
+}
+
 static BlockDriver bdrv_raw = {
     .format_name        = "raw",
 
@@ -128,10 +140,12 @@ static BlockDriver bdrv_raw = {
     .bdrv_co_readv          = raw_co_readv,
     .bdrv_co_writev         = raw_co_writev,
     .bdrv_co_is_allocated   = raw_co_is_allocated,
+    .bdrv_co_write_zeroes   = raw_co_write_zeroes,
     .bdrv_co_discard        = raw_co_discard,
 
     .bdrv_probe         = raw_probe,
     .bdrv_getlength     = raw_getlength,
+    .bdrv_get_info      = raw_get_info,
     .bdrv_truncate      = raw_truncate,
 
     .bdrv_is_inserted   = raw_is_inserted,
diff --git a/block/vvfat.c b/block/vvfat.c
index 87b02799d0..cd3b8edd9f 100644
--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -1164,8 +1164,8 @@ DLOG(if (stderr == NULL) {
     s->sector_count = cyls * heads * secs - (s->first_sectors_number - 1);
 
     if (qemu_opt_get_bool(opts, "rw", false)) {
-        if (enable_write_target(s)) {
-            ret = -EIO;
+        ret = enable_write_target(s);
+        if (ret < 0) {
             goto fail;
         }
         bs->read_only = 0;
@@ -2917,9 +2917,7 @@ static int enable_write_target(BDRVVVFATState *s)
     s->qcow_filename = g_malloc(1024);
     ret = get_tmp_filename(s->qcow_filename, 1024);
     if (ret < 0) {
-        g_free(s->qcow_filename);
-        s->qcow_filename = NULL;
-        return ret;
+        goto err;
     }
 
     bdrv_qcow = bdrv_find_format("qcow");
@@ -2927,18 +2925,18 @@ static int enable_write_target(BDRVVVFATState *s)
     set_option_parameter_int(options, BLOCK_OPT_SIZE, s->sector_count * 512);
     set_option_parameter(options, BLOCK_OPT_BACKING_FILE, "fat:");
 
-    if (bdrv_create(bdrv_qcow, s->qcow_filename, options) < 0)
-	return -1;
+    ret = bdrv_create(bdrv_qcow, s->qcow_filename, options);
+    if (ret < 0) {
+        goto err;
+    }
 
     s->qcow = bdrv_new("");
-    if (s->qcow == NULL) {
-        return -1;
-    }
 
     ret = bdrv_open(s->qcow, s->qcow_filename, NULL,
             BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, bdrv_qcow);
     if (ret < 0) {
-	return ret;
+        bdrv_delete(s->qcow);
+        goto err;
     }
 
 #ifndef _WIN32
@@ -2951,6 +2949,11 @@ static int enable_write_target(BDRVVVFATState *s)
     *(void**)s->bs->backing_hd->opaque = s;
 
     return 0;
+
+err:
+    g_free(s->qcow_filename);
+    s->qcow_filename = NULL;
+    return ret;
 }
 
 static void vvfat_close(BlockDriverState *bs)
diff --git a/configure b/configure
index 9e1cd1975a..7c45db2fdd 100755
--- a/configure
+++ b/configure
@@ -237,6 +237,7 @@ libiscsi=""
 coroutine=""
 seccomp=""
 glusterfs=""
+glusterfs_discard="no"
 virtio_blk_data_plane=""
 gtk=""
 gtkabi="2.0"
@@ -2570,23 +2571,21 @@ fi
 ##########################################
 # glusterfs probe
 if test "$glusterfs" != "no" ; then
-  cat > $TMPC <<EOF
-#include <glusterfs/api/glfs.h>
-int main(void) {
-    (void) glfs_new("volume");
-    return 0;
-}
-EOF
-  glusterfs_libs="-lgfapi -lgfrpc -lgfxdr"
-  if compile_prog "" "$glusterfs_libs" ; then
-    glusterfs=yes
+  if $pkg_config --atleast-version=3 glusterfs-api >/dev/null 2>&1; then
+    glusterfs="yes"
+    glusterfs_cflags=`$pkg_config --cflags glusterfs-api 2>/dev/null`
+    glusterfs_libs=`$pkg_config --libs glusterfs-api 2>/dev/null`
+    CFLAGS="$CFLAGS $glusterfs_cflags"
     libs_tools="$glusterfs_libs $libs_tools"
     libs_softmmu="$glusterfs_libs $libs_softmmu"
+    if $pkg_config --atleast-version=5 glusterfs-api >/dev/null 2>&1; then
+      glusterfs_discard="yes"
+    fi
   else
     if test "$glusterfs" = "yes" ; then
       feature_not_found "GlusterFS backend support"
     fi
-    glusterfs=no
+    glusterfs="no"
   fi
 fi
 
@@ -3969,6 +3968,10 @@ if test "$glusterfs" = "yes" ; then
   echo "CONFIG_GLUSTERFS=y" >> $config_host_mak
 fi
 
+if test "$glusterfs_discard" = "yes" ; then
+  echo "CONFIG_GLUSTERFS_DISCARD=y" >> $config_host_mak
+fi
+
 if test "$libssh2" = "yes" ; then
   echo "CONFIG_LIBSSH2=y" >> $config_host_mak
 fi
diff --git a/cpus.c b/cpus.c
index 8062cdd57e..2509eb5af3 100644
--- a/cpus.c
+++ b/cpus.c
@@ -443,11 +443,12 @@ static int do_vm_stop(RunState state)
         pause_all_vcpus();
         runstate_set(state);
         vm_state_notify(0, state);
-        bdrv_drain_all();
-        ret = bdrv_flush_all();
         monitor_protocol_event(QEVENT_STOP, NULL);
     }
 
+    bdrv_drain_all();
+    ret = bdrv_flush_all();
+
     return ret;
 }
 
@@ -1126,7 +1127,9 @@ int vm_stop_force_state(RunState state)
         return vm_stop(state);
     } else {
         runstate_set(state);
-        return 0;
+        /* Make sure to return an error if the flush in a previous vm_stop()
+         * failed. */
+        return bdrv_flush_all();
     }
 }
 
diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
index 03566650c6..2faed43127 100644
--- a/hw/block/dataplane/virtio-blk.c
+++ b/hw/block/dataplane/virtio-blk.c
@@ -537,7 +537,7 @@ void virtio_blk_data_plane_stop(VirtIOBlockDataPlane *s)
     /* Clean up guest notifier (irq) */
     k->set_guest_notifiers(qbus->parent, 1, false);
 
-    vring_teardown(&s->vring);
+    vring_teardown(&s->vring, s->vdev, 0);
     s->started = false;
     s->stopping = false;
 }
diff --git a/hw/virtio/dataplane/vring.c b/hw/virtio/dataplane/vring.c
index e0d6e83625..82cc151b17 100644
--- a/hw/virtio/dataplane/vring.c
+++ b/hw/virtio/dataplane/vring.c
@@ -39,8 +39,8 @@ bool vring_setup(Vring *vring, VirtIODevice *vdev, int n)
 
     vring_init(&vring->vr, virtio_queue_get_num(vdev, n), vring_ptr, 4096);
 
-    vring->last_avail_idx = 0;
-    vring->last_used_idx = 0;
+    vring->last_avail_idx = virtio_queue_get_last_avail_idx(vdev, n);
+    vring->last_used_idx = vring->vr.used->idx;
     vring->signalled_used = 0;
     vring->signalled_used_valid = false;
 
@@ -49,8 +49,10 @@ bool vring_setup(Vring *vring, VirtIODevice *vdev, int n)
     return true;
 }
 
-void vring_teardown(Vring *vring)
+void vring_teardown(Vring *vring, VirtIODevice *vdev, int n)
 {
+    virtio_queue_set_last_avail_idx(vdev, n, vring->last_avail_idx);
+
     hostmem_finalize(&vring->hostmem);
 }
 
diff --git a/include/block/aio.h b/include/block/aio.h
index 183679374f..cc77771c46 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -17,6 +17,7 @@
 #include "qemu-common.h"
 #include "qemu/queue.h"
 #include "qemu/event_notifier.h"
+#include "qemu/thread.h"
 
 typedef struct BlockDriverAIOCB BlockDriverAIOCB;
 typedef void BlockDriverCompletionFunc(void *opaque, int ret);
@@ -53,6 +54,8 @@ typedef struct AioContext {
      */
     int walking_handlers;
 
+    /* lock to protect between bh's adders and deleter */
+    QemuMutex bh_lock;
     /* Anchor of the list of Bottom Halves belonging to the context */
     struct QEMUBH *first_bh;
 
@@ -127,6 +130,8 @@ void aio_notify(AioContext *ctx);
  * aio_bh_poll: Poll bottom halves for an AioContext.
  *
  * These are internal functions used by the QEMU main loop.
+ * And notice that multiple occurrences of aio_bh_poll cannot
+ * be called concurrently
  */
 int aio_bh_poll(AioContext *ctx);
 
@@ -163,6 +168,8 @@ void qemu_bh_cancel(QEMUBH *bh);
  * Deleting a bottom half frees the memory that was allocated for it by
  * qemu_bh_new.  It also implies canceling the bottom half if it was
  * scheduled.
+ * This func is async. The bottom half will do the delete action at the finial
+ * end.
  *
  * @bh: The bottom half to be deleted.
  */
diff --git a/include/block/block.h b/include/block/block.h
index b6b9014a9c..742fce5f7f 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -157,6 +157,8 @@ int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
                           uint8_t *buf, int nb_sectors);
 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
                const uint8_t *buf, int nb_sectors);
+int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
+               int nb_sectors);
 int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov);
 int bdrv_pread(BlockDriverState *bs, int64_t offset,
                void *buf, int count);
diff --git a/include/hw/virtio/dataplane/vring.h b/include/hw/virtio/dataplane/vring.h
index 9380cb5413..c0b69ff18f 100644
--- a/include/hw/virtio/dataplane/vring.h
+++ b/include/hw/virtio/dataplane/vring.h
@@ -50,7 +50,7 @@ static inline void vring_set_broken(Vring *vring)
 }
 
 bool vring_setup(Vring *vring, VirtIODevice *vdev, int n);
-void vring_teardown(Vring *vring);
+void vring_teardown(Vring *vring, VirtIODevice *vdev, int n);
 void vring_disable_notification(VirtIODevice *vdev, Vring *vring);
 bool vring_enable_notification(VirtIODevice *vdev, Vring *vring);
 bool vring_should_notify(VirtIODevice *vdev, Vring *vring);
diff --git a/include/migration/migration.h b/include/migration/migration.h
index bc9fde0b2a..701709a1e9 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -124,6 +124,7 @@ void migrate_add_blocker(Error *reason);
 void migrate_del_blocker(Error *reason);
 
 bool migrate_rdma_pin_all(void);
+bool migrate_zero_blocks(void);
 
 bool migrate_auto_converge(void);
 
diff --git a/migration.c b/migration.c
index 9f5a4230d1..a9c042186d 100644
--- a/migration.c
+++ b/migration.c
@@ -493,6 +493,15 @@ bool migrate_auto_converge(void)
     return s->enabled_capabilities[MIGRATION_CAPABILITY_AUTO_CONVERGE];
 }
 
+bool migrate_zero_blocks(void)
+{
+    MigrationState *s;
+
+    s = migrate_get_current();
+
+    return s->enabled_capabilities[MIGRATION_CAPABILITY_ZERO_BLOCKS];
+}
+
 int migrate_use_xbzrle(void)
 {
     MigrationState *s;
diff --git a/qapi-schema.json b/qapi-schema.json
index 8d33d527d3..592bb9c7a1 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -613,10 +613,16 @@
 #          Disabled by default. Experimental: may (or may not) be renamed after
 #          further testing is complete. (since 1.6)
 #
+# @zero-blocks: During storage migration encode blocks of zeroes efficiently. This
+#          essentially saves 1MB of zeroes per block on the wire. Enabling requires
+#          source and target VM to support this feature. To enable it is sufficient
+#          to enable the capability on the source VM. The feature is disabled by
+#          default. (since 1.6)
+#
 # Since: 1.2
 ##
 { 'enum': 'MigrationCapability',
-  'data': ['xbzrle', 'x-rdma-pin-all', 'auto-converge'] }
+  'data': ['xbzrle', 'x-rdma-pin-all', 'auto-converge', 'zero-blocks'] }
 
 ##
 # @MigrationCapabilityStatus