14 files changed, 712 insertions, 475 deletions
diff --git a/block/Makefile.objs b/block/Makefile.objs
index f43ecbc044..4e8c91ec34 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -10,7 +10,7 @@ block-obj-$(CONFIG_POSIX) += raw-posix.o
 block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
 
 ifeq ($(CONFIG_POSIX),y)
-block-obj-y += nbd.o sheepdog.o
+block-obj-y += nbd.o nbd-client.o sheepdog.o
 block-obj-$(CONFIG_LIBISCSI) += iscsi.o
 block-obj-$(CONFIG_CURL) += curl.o
 block-obj-$(CONFIG_RBD) += rbd.o
diff --git a/block/blkdebug.c b/block/blkdebug.c
index 37cf028545..957be2c76b 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -594,9 +594,9 @@ static int blkdebug_debug_breakpoint(BlockDriverState *bs, const char *event,
 static int blkdebug_debug_resume(BlockDriverState *bs, const char *tag)
 {
     BDRVBlkdebugState *s = bs->opaque;
-    BlkdebugSuspendedReq *r;
+    BlkdebugSuspendedReq *r, *next;
 
-    QLIST_FOREACH(r, &s->suspended_reqs, next) {
+    QLIST_FOREACH_SAFE(r, &s->suspended_reqs, next, next) {
         if (!strcmp(r->tag, tag)) {
             qemu_coroutine_enter(r->co, NULL);
             return 0;
@@ -609,7 +609,7 @@ static int blkdebug_debug_remove_breakpoint(BlockDriverState *bs,
                                             const char *tag)
 {
     BDRVBlkdebugState *s = bs->opaque;
-    BlkdebugSuspendedReq *r;
+    BlkdebugSuspendedReq *r, *r_next;
     BlkdebugRule *rule, *next;
     int i, ret = -ENOENT;
 
@@ -622,7 +622,7 @@ static int blkdebug_debug_remove_breakpoint(BlockDriverState *bs,
             }
         }
     }
-    QLIST_FOREACH(r, &s->suspended_reqs, next) {
+    QLIST_FOREACH_SAFE(r, &s->suspended_reqs, next, r_next) {
         if (!strcmp(r->tag, tag)) {
             qemu_coroutine_enter(r->co, NULL);
             ret = 0;
diff --git a/block/commit.c b/block/commit.c
index d4090cbf7d..acec4ac5a8 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -198,13 +198,7 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base,
         return;
     }
 
-    /* Once we support top == active layer, remove this check */
-    if (top == bs) {
-        error_setg(errp,
-                   "Top image as the active layer is currently unsupported");
-        return;
-    }
-
+    assert(top != bs);
     if (top == base) {
         error_setg(errp, "Invalid files for merge: top and base are the same");
         return;
diff --git a/block/iscsi.c b/block/iscsi.c
index b0e6eea199..56c0799f3d 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -368,7 +368,10 @@ retry:
     default:
         iTask.task = iscsi_read10_task(iscsilun->iscsi, iscsilun->lun, lba,
                                        num_sectors * iscsilun->block_size,
-                                       iscsilun->block_size, 0, 0, 0, 0, 0,
+                                       iscsilun->block_size,
+#if !defined(CONFIG_LIBISCSI_1_4) /* API change from 1.4.0 to 1.5.0 */
+                                       0, 0, 0, 0, 0,
+#endif
                                        iscsi_co_generic_cb, &iTask);
         break;
     }
diff --git a/block/mirror.c b/block/mirror.c
index 6dc27ad35d..2932bab27a 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -31,7 +31,8 @@ typedef struct MirrorBlockJob {
     BlockJob common;
     RateLimit limit;
     BlockDriverState *target;
-    MirrorSyncMode mode;
+    BlockDriverState *base;
+    bool is_none_mode;
     BlockdevOnError on_source_error, on_target_error;
     bool synced;
     bool should_complete;
@@ -335,10 +336,9 @@ static void coroutine_fn mirror_run(void *opaque)
     sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
     mirror_free_init(s);
 
-    if (s->mode != MIRROR_SYNC_MODE_NONE) {
+    if (!s->is_none_mode) {
         /* First part, loop on the sectors and initialize the dirty bitmap.  */
-        BlockDriverState *base;
-        base = s->mode == MIRROR_SYNC_MODE_FULL ? NULL : bs->backing_hd;
+        BlockDriverState *base = s->base;
         for (sector_num = 0; sector_num < end; ) {
             int64_t next = (sector_num | (sectors_per_chunk - 1)) + 1;
             ret = bdrv_is_allocated_above(bs, base,
@@ -481,8 +481,14 @@ immediate_exit:
             bdrv_reopen(s->target, bdrv_get_flags(s->common.bs), NULL);
         }
         bdrv_swap(s->target, s->common.bs);
+        if (s->common.driver->job_type == BLOCK_JOB_TYPE_COMMIT) {
+            /* drop the bs loop chain formed by the swap: break the loop then
+             * trigger the unref from the top one */
+            BlockDriverState *p = s->base->backing_hd;
+            s->base->backing_hd = NULL;
+            bdrv_unref(p);
+        }
     }
-    bdrv_close(s->target);
     bdrv_unref(s->target);
     block_job_completed(&s->common, ret);
 }
@@ -536,12 +542,24 @@ static const BlockJobDriver mirror_job_driver = {
     .complete      = mirror_complete,
 };
 
-void mirror_start(BlockDriverState *bs, BlockDriverState *target,
-                  int64_t speed, int64_t granularity, int64_t buf_size,
-                  MirrorSyncMode mode, BlockdevOnError on_source_error,
-                  BlockdevOnError on_target_error,
-                  BlockDriverCompletionFunc *cb,
-                  void *opaque, Error **errp)
+static const BlockJobDriver commit_active_job_driver = {
+    .instance_size = sizeof(MirrorBlockJob),
+    .job_type      = BLOCK_JOB_TYPE_COMMIT,
+    .set_speed     = mirror_set_speed,
+    .iostatus_reset
+                   = mirror_iostatus_reset,
+    .complete      = mirror_complete,
+};
+
+static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
+                            int64_t speed, int64_t granularity,
+                            int64_t buf_size,
+                            BlockdevOnError on_source_error,
+                            BlockdevOnError on_target_error,
+                            BlockDriverCompletionFunc *cb,
+                            void *opaque, Error **errp,
+                            const BlockJobDriver *driver,
+                            bool is_none_mode, BlockDriverState *base)
 {
     MirrorBlockJob *s;
 
@@ -566,7 +584,8 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target,
         return;
     }
 
-    s = block_job_create(&mirror_job_driver, bs, speed, cb, opaque, errp);
+
+    s = block_job_create(driver, bs, speed, cb, opaque, errp);
     if (!s) {
         return;
     }
@@ -574,7 +593,8 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target,
     s->on_source_error = on_source_error;
     s->on_target_error = on_target_error;
     s->target = target;
-    s->mode = mode;
+    s->is_none_mode = is_none_mode;
+    s->base = base;
     s->granularity = granularity;
     s->buf_size = MAX(buf_size, granularity);
 
@@ -586,3 +606,35 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target,
     trace_mirror_start(bs, s, s->common.co, opaque);
     qemu_coroutine_enter(s->common.co, s);
 }
+
+void mirror_start(BlockDriverState *bs, BlockDriverState *target,
+                  int64_t speed, int64_t granularity, int64_t buf_size,
+                  MirrorSyncMode mode, BlockdevOnError on_source_error,
+                  BlockdevOnError on_target_error,
+                  BlockDriverCompletionFunc *cb,
+                  void *opaque, Error **errp)
+{
+    bool is_none_mode;
+    BlockDriverState *base;
+
+    is_none_mode = mode == MIRROR_SYNC_MODE_NONE;
+    base = mode == MIRROR_SYNC_MODE_TOP ? bs->backing_hd : NULL;
+    mirror_start_job(bs, target, speed, granularity, buf_size,
+                     on_source_error, on_target_error, cb, opaque, errp,
+                     &mirror_job_driver, is_none_mode, base);
+}
+
+void commit_active_start(BlockDriverState *bs, BlockDriverState *base,
+                         int64_t speed,
+                         BlockdevOnError on_error,
+                         BlockDriverCompletionFunc *cb,
+                         void *opaque, Error **errp)
+{
+    if (bdrv_reopen(base, bs->open_flags, errp)) {
+        return;
+    }
+    bdrv_ref(base);
+    mirror_start_job(bs, base, speed, 0, 0,
+                     on_error, on_error, cb, opaque, errp,
+                     &commit_active_job_driver, false, base);
+}
diff --git a/block/nbd-client.c b/block/nbd-client.c
new file mode 100644
index 0000000000..0922b78292
--- /dev/null
+++ b/block/nbd-client.c
@@ -0,0 +1,385 @@
+/*
+ * QEMU Block driver for  NBD
+ *
+ * Copyright (C) 2008 Bull S.A.S.
+ *     Author: Laurent Vivier <Laurent.Vivier@bull.net>
+ *
+ * Some parts:
+ *    Copyright (C) 2007 Anthony Liguori <anthony@codemonkey.ws>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "nbd-client.h"
+#include "qemu/sockets.h"
+
+#define HANDLE_TO_INDEX(bs, handle) ((handle) ^ ((uint64_t)(intptr_t)bs))
+#define INDEX_TO_HANDLE(bs, index)  ((index)  ^ ((uint64_t)(intptr_t)bs))
+
+static void nbd_recv_coroutines_enter_all(NbdClientSession *s)
+{
+    int i;
+
+    for (i = 0; i < MAX_NBD_REQUESTS; i++) {
+        if (s->recv_coroutine[i]) {
+            qemu_coroutine_enter(s->recv_coroutine[i], NULL);
+        }
+    }
+}
+
+static void nbd_reply_ready(void *opaque)
+{
+    NbdClientSession *s = opaque;
+    uint64_t i;
+    int ret;
+
+    if (s->reply.handle == 0) {
+        /* No reply already in flight.  Fetch a header.  It is possible
+         * that another thread has done the same thing in parallel, so
+         * the socket is not readable anymore.
+         */
+        ret = nbd_receive_reply(s->sock, &s->reply);
+        if (ret == -EAGAIN) {
+            return;
+        }
+        if (ret < 0) {
+            s->reply.handle = 0;
+            goto fail;
+        }
+    }
+
+    /* There's no need for a mutex on the receive side, because the
+     * handler acts as a synchronization point and ensures that only
+     * one coroutine is called until the reply finishes.  */
+    i = HANDLE_TO_INDEX(s, s->reply.handle);
+    if (i >= MAX_NBD_REQUESTS) {
+        goto fail;
+    }
+
+    if (s->recv_coroutine[i]) {
+        qemu_coroutine_enter(s->recv_coroutine[i], NULL);
+        return;
+    }
+
+fail:
+    nbd_recv_coroutines_enter_all(s);
+}
+
+static void nbd_restart_write(void *opaque)
+{
+    NbdClientSession *s = opaque;
+
+    qemu_coroutine_enter(s->send_coroutine, NULL);
+}
+
+static int nbd_co_send_request(NbdClientSession *s,
+    struct nbd_request *request,
+    QEMUIOVector *qiov, int offset)
+{
+    int rc, ret;
+
+    qemu_co_mutex_lock(&s->send_mutex);
+    s->send_coroutine = qemu_coroutine_self();
+    qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, nbd_restart_write, s);
+    if (qiov) {
+        if (!s->is_unix) {
+            socket_set_cork(s->sock, 1);
+        }
+        rc = nbd_send_request(s->sock, request);
+        if (rc >= 0) {
+            ret = qemu_co_sendv(s->sock, qiov->iov, qiov->niov,
+                                offset, request->len);
+            if (ret != request->len) {
+                rc = -EIO;
+            }
+        }
+        if (!s->is_unix) {
+            socket_set_cork(s->sock, 0);
+        }
+    } else {
+        rc = nbd_send_request(s->sock, request);
+    }
+    qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, NULL, s);
+    s->send_coroutine = NULL;
+    qemu_co_mutex_unlock(&s->send_mutex);
+    return rc;
+}
+
+static void nbd_co_receive_reply(NbdClientSession *s,
+    struct nbd_request *request, struct nbd_reply *reply,
+    QEMUIOVector *qiov, int offset)
+{
+    int ret;
+
+    /* Wait until we're woken up by the read handler.  TODO: perhaps
+     * peek at the next reply and avoid yielding if it's ours?  */
+    qemu_coroutine_yield();
+    *reply = s->reply;
+    if (reply->handle != request->handle) {
+        reply->error = EIO;
+    } else {
+        if (qiov && reply->error == 0) {
+            ret = qemu_co_recvv(s->sock, qiov->iov, qiov->niov,
+                                offset, request->len);
+            if (ret != request->len) {
+                reply->error = EIO;
+            }
+        }
+
+        /* Tell the read handler to read another header.  */
+        s->reply.handle = 0;
+    }
+}
+
+static void nbd_coroutine_start(NbdClientSession *s,
+   struct nbd_request *request)
+{
+    int i;
+
+    /* Poor man semaphore.  The free_sema is locked when no other request
+     * can be accepted, and unlocked after receiving one reply.  */
+    if (s->in_flight >= MAX_NBD_REQUESTS - 1) {
+        qemu_co_mutex_lock(&s->free_sema);
+        assert(s->in_flight < MAX_NBD_REQUESTS);
+    }
+    s->in_flight++;
+
+    for (i = 0; i < MAX_NBD_REQUESTS; i++) {
+        if (s->recv_coroutine[i] == NULL) {
+            s->recv_coroutine[i] = qemu_coroutine_self();
+            break;
+        }
+    }
+
+    assert(i < MAX_NBD_REQUESTS);
+    request->handle = INDEX_TO_HANDLE(s, i);
+}
+
+static void nbd_coroutine_end(NbdClientSession *s,
+    struct nbd_request *request)
+{
+    int i = HANDLE_TO_INDEX(s, request->handle);
+    s->recv_coroutine[i] = NULL;
+    if (s->in_flight-- == MAX_NBD_REQUESTS) {
+        qemu_co_mutex_unlock(&s->free_sema);
+    }
+}
+
+static int nbd_co_readv_1(NbdClientSession *client, int64_t sector_num,
+                          int nb_sectors, QEMUIOVector *qiov,
+                          int offset)
+{
+    struct nbd_request request = { .type = NBD_CMD_READ };
+    struct nbd_reply reply;
+    ssize_t ret;
+
+    request.from = sector_num * 512;
+    request.len = nb_sectors * 512;
+
+    nbd_coroutine_start(client, &request);
+    ret = nbd_co_send_request(client, &request, NULL, 0);
+    if (ret < 0) {
+        reply.error = -ret;
+    } else {
+        nbd_co_receive_reply(client, &request, &reply, qiov, offset);
+    }
+    nbd_coroutine_end(client, &request);
+    return -reply.error;
+
+}
+
+static int nbd_co_writev_1(NbdClientSession *client, int64_t sector_num,
+                           int nb_sectors, QEMUIOVector *qiov,
+                           int offset)
+{
+    struct nbd_request request = { .type = NBD_CMD_WRITE };
+    struct nbd_reply reply;
+    ssize_t ret;
+
+    if (!bdrv_enable_write_cache(client->bs) &&
+        (client->nbdflags & NBD_FLAG_SEND_FUA)) {
+        request.type |= NBD_CMD_FLAG_FUA;
+    }
+
+    request.from = sector_num * 512;
+    request.len = nb_sectors * 512;
+
+    nbd_coroutine_start(client, &request);
+    ret = nbd_co_send_request(client, &request, qiov, offset);
+    if (ret < 0) {
+        reply.error = -ret;
+    } else {
+        nbd_co_receive_reply(client, &request, &reply, NULL, 0);
+    }
+    nbd_coroutine_end(client, &request);
+    return -reply.error;
+}
+
+/* qemu-nbd has a limit of slightly less than 1M per request.  Try to
+ * remain aligned to 4K. */
+#define NBD_MAX_SECTORS 2040
+
+int nbd_client_session_co_readv(NbdClientSession *client, int64_t sector_num,
+    int nb_sectors, QEMUIOVector *qiov)
+{
+    int offset = 0;
+    int ret;
+    while (nb_sectors > NBD_MAX_SECTORS) {
+        ret = nbd_co_readv_1(client, sector_num,
+                             NBD_MAX_SECTORS, qiov, offset);
+        if (ret < 0) {
+            return ret;
+        }
+        offset += NBD_MAX_SECTORS * 512;
+        sector_num += NBD_MAX_SECTORS;
+        nb_sectors -= NBD_MAX_SECTORS;
+    }
+    return nbd_co_readv_1(client, sector_num, nb_sectors, qiov, offset);
+}
+
+int nbd_client_session_co_writev(NbdClientSession *client, int64_t sector_num,
+                                 int nb_sectors, QEMUIOVector *qiov)
+{
+    int offset = 0;
+    int ret;
+    while (nb_sectors > NBD_MAX_SECTORS) {
+        ret = nbd_co_writev_1(client, sector_num,
+                              NBD_MAX_SECTORS, qiov, offset);
+        if (ret < 0) {
+            return ret;
+        }
+        offset += NBD_MAX_SECTORS * 512;
+        sector_num += NBD_MAX_SECTORS;
+        nb_sectors -= NBD_MAX_SECTORS;
+    }
+    return nbd_co_writev_1(client, sector_num, nb_sectors, qiov, offset);
+}
+
+int nbd_client_session_co_flush(NbdClientSession *client)
+{
+    struct nbd_request request = { .type = NBD_CMD_FLUSH };
+    struct nbd_reply reply;
+    ssize_t ret;
+
+    if (!(client->nbdflags & NBD_FLAG_SEND_FLUSH)) {
+        return 0;
+    }
+
+    if (client->nbdflags & NBD_FLAG_SEND_FUA) {
+        request.type |= NBD_CMD_FLAG_FUA;
+    }
+
+    request.from = 0;
+    request.len = 0;
+
+    nbd_coroutine_start(client, &request);
+    ret = nbd_co_send_request(client, &request, NULL, 0);
+    if (ret < 0) {
+        reply.error = -ret;
+    } else {
+        nbd_co_receive_reply(client, &request, &reply, NULL, 0);
+    }
+    nbd_coroutine_end(client, &request);
+    return -reply.error;
+}
+
+int nbd_client_session_co_discard(NbdClientSession *client, int64_t sector_num,
+    int nb_sectors)
+{
+    struct nbd_request request = { .type = NBD_CMD_TRIM };
+    struct nbd_reply reply;
+    ssize_t ret;
+
+    if (!(client->nbdflags & NBD_FLAG_SEND_TRIM)) {
+        return 0;
+    }
+    request.from = sector_num * 512;
+    request.len = nb_sectors * 512;
+
+    nbd_coroutine_start(client, &request);
+    ret = nbd_co_send_request(client, &request, NULL, 0);
+    if (ret < 0) {
+        reply.error = -ret;
+    } else {
+        nbd_co_receive_reply(client, &request, &reply, NULL, 0);
+    }
+    nbd_coroutine_end(client, &request);
+    return -reply.error;
+
+}
+
+static void nbd_teardown_connection(NbdClientSession *client)
+{
+    struct nbd_request request = {
+        .type = NBD_CMD_DISC,
+        .from = 0,
+        .len = 0
+    };
+
+    nbd_send_request(client->sock, &request);
+
+    /* finish any pending coroutines */
+    shutdown(client->sock, 2);
+    nbd_recv_coroutines_enter_all(client);
+
+    qemu_aio_set_fd_handler(client->sock, NULL, NULL, NULL);
+    closesocket(client->sock);
+    client->sock = -1;
+}
+
+void nbd_client_session_close(NbdClientSession *client)
+{
+    if (!client->bs) {
+        return;
+    }
+
+    nbd_teardown_connection(client);
+    client->bs = NULL;
+}
+
+int nbd_client_session_init(NbdClientSession *client, BlockDriverState *bs,
+    int sock, const char *export)
+{
+    int ret;
+
+    /* NBD handshake */
+    logout("session init %s\n", export);
+    qemu_set_block(sock);
+    ret = nbd_receive_negotiate(sock, export,
+                                &client->nbdflags, &client->size,
+                                &client->blocksize);
+    if (ret < 0) {
+        logout("Failed to negotiate with the NBD server\n");
+        closesocket(sock);
+        return ret;
+    }
+
+    qemu_co_mutex_init(&client->send_mutex);
+    qemu_co_mutex_init(&client->free_sema);
+    client->bs = bs;
+    client->sock = sock;
+
+    /* Now that we're connected, set the socket to be non-blocking and
+     * kick the reply mechanism.  */
+    qemu_set_nonblock(sock);
+    qemu_aio_set_fd_handler(sock, nbd_reply_ready, NULL, client);
+
+    logout("Established connection with NBD server\n");
+    return 0;
+}
diff --git a/block/nbd-client.h b/block/nbd-client.h
new file mode 100644
index 0000000000..f2a63378bb
--- /dev/null
+++ b/block/nbd-client.h
@@ -0,0 +1,50 @@
+#ifndef NBD_CLIENT_H
+#define NBD_CLIENT_H
+
+#include "qemu-common.h"
+#include "block/nbd.h"
+#include "block/block_int.h"
+
+/* #define DEBUG_NBD */
+
+#if defined(DEBUG_NBD)
+#define logout(fmt, ...) \
+    fprintf(stderr, "nbd\t%-24s" fmt, __func__, ##__VA_ARGS__)
+#else
+#define logout(fmt, ...) ((void)0)
+#endif
+
+#define MAX_NBD_REQUESTS    16
+
+typedef struct NbdClientSession {
+    int sock;
+    uint32_t nbdflags;
+    off_t size;
+    size_t blocksize;
+
+    CoMutex send_mutex;
+    CoMutex free_sema;
+    Coroutine *send_coroutine;
+    int in_flight;
+
+    Coroutine *recv_coroutine[MAX_NBD_REQUESTS];
+    struct nbd_reply reply;
+
+    bool is_unix;
+
+    BlockDriverState *bs;
+} NbdClientSession;
+
+int nbd_client_session_init(NbdClientSession *client, BlockDriverState *bs,
+                            int sock, const char *export_name);
+void nbd_client_session_close(NbdClientSession *client);
+
+int nbd_client_session_co_discard(NbdClientSession *client, int64_t sector_num,
+                                  int nb_sectors);
+int nbd_client_session_co_flush(NbdClientSession *client);
+int nbd_client_session_co_writev(NbdClientSession *client, int64_t sector_num,
+                                 int nb_sectors, QEMUIOVector *qiov);
+int nbd_client_session_co_readv(NbdClientSession *client, int64_t sector_num,
+                                int nb_sectors, QEMUIOVector *qiov);
+
+#endif /* NBD_CLIENT_H */
diff --git a/block/nbd.c b/block/nbd.c
index c8deeee67f..4455a134db 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -26,8 +26,7 @@
  * THE SOFTWARE.
  */
 
-#include "qemu-common.h"
-#include "block/nbd.h"
+#include "block/nbd-client.h"
 #include "qemu/uri.h"
 #include "block/block_int.h"
 #include "qemu/module.h"
@@ -40,37 +39,9 @@
 
 #define EN_OPTSTR ":exportname="
 
-/* #define DEBUG_NBD */
-
-#if defined(DEBUG_NBD)
-#define logout(fmt, ...) \
-                fprintf(stderr, "nbd\t%-24s" fmt, __func__, ##__VA_ARGS__)
-#else
-#define logout(fmt, ...) ((void)0)
-#endif
-
-#define MAX_NBD_REQUESTS	16
-#define HANDLE_TO_INDEX(bs, handle) ((handle) ^ ((uint64_t)(intptr_t)bs))
-#define INDEX_TO_HANDLE(bs, index)  ((index)  ^ ((uint64_t)(intptr_t)bs))
-
 typedef struct BDRVNBDState {
-    int sock;
-    uint32_t nbdflags;
-    off_t size;
-    size_t blocksize;
-
-    CoMutex send_mutex;
-    CoMutex free_sema;
-    Coroutine *send_coroutine;
-    int in_flight;
-
-    Coroutine *recv_coroutine[MAX_NBD_REQUESTS];
-    struct nbd_reply reply;
-
-    bool is_unix;
+    NbdClientSession client;
     QemuOpts *socket_opts;
-
-    char *export_name; /* An NBD server may export several devices */
 } BDRVNBDState;
 
 static int nbd_parse_uri(const char *filename, QDict *options)
@@ -217,7 +188,7 @@ out:
     g_free(file);
 }
 
-static int nbd_config(BDRVNBDState *s, QDict *options)
+static int nbd_config(BDRVNBDState *s, QDict *options, char **export)
 {
     Error *local_err = NULL;
 
@@ -227,9 +198,9 @@ static int nbd_config(BDRVNBDState *s, QDict *options)
                           "be used at the same time.");
             return -EINVAL;
         }
-        s->is_unix = true;
+        s->client.is_unix = true;
     } else if (qdict_haskey(options, "host")) {
-        s->is_unix = false;
+        s->client.is_unix = false;
     } else {
         return -EINVAL;
     }
@@ -247,162 +218,20 @@ static int nbd_config(BDRVNBDState *s, QDict *options)
         qemu_opt_set_number(s->socket_opts, "port", NBD_DEFAULT_PORT);
     }
 
-    s->export_name = g_strdup(qdict_get_try_str(options, "export"));
-    if (s->export_name) {
+    *export = g_strdup(qdict_get_try_str(options, "export"));
+    if (*export) {
         qdict_del(options, "export");
     }
 
     return 0;
 }
 
-
-static void nbd_coroutine_start(BDRVNBDState *s, struct nbd_request *request)
-{
-    int i;
-
-    /* Poor man semaphore.  The free_sema is locked when no other request
-     * can be accepted, and unlocked after receiving one reply.  */
-    if (s->in_flight >= MAX_NBD_REQUESTS - 1) {
-        qemu_co_mutex_lock(&s->free_sema);
-        assert(s->in_flight < MAX_NBD_REQUESTS);
-    }
-    s->in_flight++;
-
-    for (i = 0; i < MAX_NBD_REQUESTS; i++) {
-        if (s->recv_coroutine[i] == NULL) {
-            s->recv_coroutine[i] = qemu_coroutine_self();
-            break;
-        }
-    }
-
-    assert(i < MAX_NBD_REQUESTS);
-    request->handle = INDEX_TO_HANDLE(s, i);
-}
-
-static void nbd_reply_ready(void *opaque)
-{
-    BDRVNBDState *s = opaque;
-    uint64_t i;
-    int ret;
-
-    if (s->reply.handle == 0) {
-        /* No reply already in flight.  Fetch a header.  It is possible
-         * that another thread has done the same thing in parallel, so
-         * the socket is not readable anymore.
-         */
-        ret = nbd_receive_reply(s->sock, &s->reply);
-        if (ret == -EAGAIN) {
-            return;
-        }
-        if (ret < 0) {
-            s->reply.handle = 0;
-            goto fail;
-        }
-    }
-
-    /* There's no need for a mutex on the receive side, because the
-     * handler acts as a synchronization point and ensures that only
-     * one coroutine is called until the reply finishes.  */
-    i = HANDLE_TO_INDEX(s, s->reply.handle);
-    if (i >= MAX_NBD_REQUESTS) {
-        goto fail;
-    }
-
-    if (s->recv_coroutine[i]) {
-        qemu_coroutine_enter(s->recv_coroutine[i], NULL);
-        return;
-    }
-
-fail:
-    for (i = 0; i < MAX_NBD_REQUESTS; i++) {
-        if (s->recv_coroutine[i]) {
-            qemu_coroutine_enter(s->recv_coroutine[i], NULL);
-        }
-    }
-}
-
-static void nbd_restart_write(void *opaque)
-{
-    BDRVNBDState *s = opaque;
-    qemu_coroutine_enter(s->send_coroutine, NULL);
-}
-
-static int nbd_co_send_request(BDRVNBDState *s, struct nbd_request *request,
-                               QEMUIOVector *qiov, int offset)
-{
-    int rc, ret;
-
-    qemu_co_mutex_lock(&s->send_mutex);
-    s->send_coroutine = qemu_coroutine_self();
-    qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, nbd_restart_write, s);
-    if (qiov) {
-        if (!s->is_unix) {
-            socket_set_cork(s->sock, 1);
-        }
-        rc = nbd_send_request(s->sock, request);
-        if (rc >= 0) {
-            ret = qemu_co_sendv(s->sock, qiov->iov, qiov->niov,
-                                offset, request->len);
-            if (ret != request->len) {
-                rc = -EIO;
-            }
-        }
-        if (!s->is_unix) {
-            socket_set_cork(s->sock, 0);
-        }
-    } else {
-        rc = nbd_send_request(s->sock, request);
-    }
-    qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, NULL, s);
-    s->send_coroutine = NULL;
-    qemu_co_mutex_unlock(&s->send_mutex);
-    return rc;
-}
-
-static void nbd_co_receive_reply(BDRVNBDState *s, struct nbd_request *request,
-                                 struct nbd_reply *reply,
-                                 QEMUIOVector *qiov, int offset)
-{
-    int ret;
-
-    /* Wait until we're woken up by the read handler.  TODO: perhaps
-     * peek at the next reply and avoid yielding if it's ours?  */
-    qemu_coroutine_yield();
-    *reply = s->reply;
-    if (reply->handle != request->handle) {
-        reply->error = EIO;
-    } else {
-        if (qiov && reply->error == 0) {
-            ret = qemu_co_recvv(s->sock, qiov->iov, qiov->niov,
-                                offset, request->len);
-            if (ret != request->len) {
-                reply->error = EIO;
-            }
-        }
-
-        /* Tell the read handler to read another header.  */
-        s->reply.handle = 0;
-    }
-}
-
-static void nbd_coroutine_end(BDRVNBDState *s, struct nbd_request *request)
-{
-    int i = HANDLE_TO_INDEX(s, request->handle);
-    s->recv_coroutine[i] = NULL;
-    if (s->in_flight-- == MAX_NBD_REQUESTS) {
-        qemu_co_mutex_unlock(&s->free_sema);
-    }
-}
-
 static int nbd_establish_connection(BlockDriverState *bs)
 {
     BDRVNBDState *s = bs->opaque;
     int sock;
-    int ret;
-    off_t size;
-    size_t blocksize;
 
-    if (s->is_unix) {
+    if (s->client.is_unix) {
         sock = unix_socket_outgoing(qemu_opt_get(s->socket_opts, "path"));
     } else {
         sock = tcp_socket_outgoing_opts(s->socket_opts);
@@ -417,53 +246,18 @@ static int nbd_establish_connection(BlockDriverState *bs)
         return -errno;
     }
 
-    /* NBD handshake */
-    ret = nbd_receive_negotiate(sock, s->export_name, &s->nbdflags, &size,
-                                &blocksize);
-    if (ret < 0) {
-        logout("Failed to negotiate with the NBD server\n");
-        closesocket(sock);
-        return ret;
-    }
-
-    /* Now that we're connected, set the socket to be non-blocking and
-     * kick the reply mechanism.  */
-    qemu_set_nonblock(sock);
-    qemu_aio_set_fd_handler(sock, nbd_reply_ready, NULL, s);
-
-    s->sock = sock;
-    s->size = size;
-    s->blocksize = blocksize;
-
-    logout("Established connection with NBD server\n");
-    return 0;
-}
-
-static void nbd_teardown_connection(BlockDriverState *bs)
-{
-    BDRVNBDState *s = bs->opaque;
-    struct nbd_request request;
-
-    request.type = NBD_CMD_DISC;
-    request.from = 0;
-    request.len = 0;
-    nbd_send_request(s->sock, &request);
-
-    qemu_aio_set_fd_handler(s->sock, NULL, NULL, NULL);
-    closesocket(s->sock);
+    return sock;
 }
 
 static int nbd_open(BlockDriverState *bs, QDict *options, int flags,
                     Error **errp)
 {
     BDRVNBDState *s = bs->opaque;
-    int result;
-
-    qemu_co_mutex_init(&s->send_mutex);
-    qemu_co_mutex_init(&s->free_sema);
+    char *export = NULL;
+    int result, sock;
 
     /* Pop the config into our state object. Exit if invalid. */
-    result = nbd_config(s, options);
+    result = nbd_config(s, options, &export);
     if (result != 0) {
         return result;
     }
@@ -471,172 +265,64 @@ static int nbd_open(BlockDriverState *bs, QDict *options, int flags,
     /* establish TCP connection, return error if it fails
      * TODO: Configurable retry-until-timeout behaviour.
      */
-    result = nbd_establish_connection(bs);
-
-    return result;
-}
-
-static int nbd_co_readv_1(BlockDriverState *bs, int64_t sector_num,
-                          int nb_sectors, QEMUIOVector *qiov,
-                          int offset)
-{
-    BDRVNBDState *s = bs->opaque;
-    struct nbd_request request;
-    struct nbd_reply reply;
-    ssize_t ret;
-
-    request.type = NBD_CMD_READ;
-    request.from = sector_num * 512;
-    request.len = nb_sectors * 512;
-
-    nbd_coroutine_start(s, &request);
-    ret = nbd_co_send_request(s, &request, NULL, 0);
-    if (ret < 0) {
-        reply.error = -ret;
-    } else {
-        nbd_co_receive_reply(s, &request, &reply, qiov, offset);
-    }
-    nbd_coroutine_end(s, &request);
-    return -reply.error;
-
-}
-
-static int nbd_co_writev_1(BlockDriverState *bs, int64_t sector_num,
-                           int nb_sectors, QEMUIOVector *qiov,
-                           int offset)
-{
-    BDRVNBDState *s = bs->opaque;
-    struct nbd_request request;
-    struct nbd_reply reply;
-    ssize_t ret;
-
-    request.type = NBD_CMD_WRITE;
-    if (!bdrv_enable_write_cache(bs) && (s->nbdflags & NBD_FLAG_SEND_FUA)) {
-        request.type |= NBD_CMD_FLAG_FUA;
+    sock = nbd_establish_connection(bs);
+    if (sock < 0) {
+        return sock;
     }
 
-    request.from = sector_num * 512;
-    request.len = nb_sectors * 512;
-
-    nbd_coroutine_start(s, &request);
-    ret = nbd_co_send_request(s, &request, qiov, offset);
-    if (ret < 0) {
-        reply.error = -ret;
-    } else {
-        nbd_co_receive_reply(s, &request, &reply, NULL, 0);
-    }
-    nbd_coroutine_end(s, &request);
-    return -reply.error;
+    /* NBD handshake */
+    result = nbd_client_session_init(&s->client, bs, sock, export);
+    g_free(export);
+    return result;
 }
 
-/* qemu-nbd has a limit of slightly less than 1M per request.  Try to
- * remain aligned to 4K. */
-#define NBD_MAX_SECTORS 2040
-
 static int nbd_co_readv(BlockDriverState *bs, int64_t sector_num,
                         int nb_sectors, QEMUIOVector *qiov)
 {
-    int offset = 0;
-    int ret;
-    while (nb_sectors > NBD_MAX_SECTORS) {
-        ret = nbd_co_readv_1(bs, sector_num, NBD_MAX_SECTORS, qiov, offset);
-        if (ret < 0) {
-            return ret;
-        }
-        offset += NBD_MAX_SECTORS * 512;
-        sector_num += NBD_MAX_SECTORS;
-        nb_sectors -= NBD_MAX_SECTORS;
-    }
-    return nbd_co_readv_1(bs, sector_num, nb_sectors, qiov, offset);
+    BDRVNBDState *s = bs->opaque;
+
+    return nbd_client_session_co_readv(&s->client, sector_num,
+                                       nb_sectors, qiov);
 }
 
 static int nbd_co_writev(BlockDriverState *bs, int64_t sector_num,
                          int nb_sectors, QEMUIOVector *qiov)
 {
-    int offset = 0;
-    int ret;
-    while (nb_sectors > NBD_MAX_SECTORS) {
-        ret = nbd_co_writev_1(bs, sector_num, NBD_MAX_SECTORS, qiov, offset);
-        if (ret < 0) {
-            return ret;
-        }
-        offset += NBD_MAX_SECTORS * 512;
-        sector_num += NBD_MAX_SECTORS;
-        nb_sectors -= NBD_MAX_SECTORS;
-    }
-    return nbd_co_writev_1(bs, sector_num, nb_sectors, qiov, offset);
+    BDRVNBDState *s = bs->opaque;
+
+    return nbd_client_session_co_writev(&s->client, sector_num,
+                                        nb_sectors, qiov);
 }
 
 static int nbd_co_flush(BlockDriverState *bs)
 {
     BDRVNBDState *s = bs->opaque;
-    struct nbd_request request;
-    struct nbd_reply reply;
-    ssize_t ret;
-
-    if (!(s->nbdflags & NBD_FLAG_SEND_FLUSH)) {
-        return 0;
-    }
-
-    request.type = NBD_CMD_FLUSH;
-    if (s->nbdflags & NBD_FLAG_SEND_FUA) {
-        request.type |= NBD_CMD_FLAG_FUA;
-    }
-
-    request.from = 0;
-    request.len = 0;
 
-    nbd_coroutine_start(s, &request);
-    ret = nbd_co_send_request(s, &request, NULL, 0);
-    if (ret < 0) {
-        reply.error = -ret;
-    } else {
-        nbd_co_receive_reply(s, &request, &reply, NULL, 0);
-    }
-    nbd_coroutine_end(s, &request);
-    return -reply.error;
+    return nbd_client_session_co_flush(&s->client);
 }
 
 static int nbd_co_discard(BlockDriverState *bs, int64_t sector_num,
                           int nb_sectors)
 {
     BDRVNBDState *s = bs->opaque;
-    struct nbd_request request;
-    struct nbd_reply reply;
-    ssize_t ret;
 
-    if (!(s->nbdflags & NBD_FLAG_SEND_TRIM)) {
-        return 0;
-    }
-    request.type = NBD_CMD_TRIM;
-    request.from = sector_num * 512;
-    request.len = nb_sectors * 512;
-
-    nbd_coroutine_start(s, &request);
-    ret = nbd_co_send_request(s, &request, NULL, 0);
-    if (ret < 0) {
-        reply.error = -ret;
-    } else {
-        nbd_co_receive_reply(s, &request, &reply, NULL, 0);
-    }
-    nbd_coroutine_end(s, &request);
-    return -reply.error;
+    return nbd_client_session_co_discard(&s->client, sector_num,
+                                         nb_sectors);
 }
 
 static void nbd_close(BlockDriverState *bs)
 {
     BDRVNBDState *s = bs->opaque;
-    g_free(s->export_name);
-    qemu_opts_del(s->socket_opts);
 
-    nbd_teardown_connection(bs);
+    qemu_opts_del(s->socket_opts);
+    nbd_client_session_close(&s->client);
 }
 
 static int64_t nbd_getlength(BlockDriverState *bs)
 {
     BDRVNBDState *s = bs->opaque;
 
-    return s->size;
+    return s->client.size;
 }
 
 static BlockDriver bdrv_nbd = {
diff --git a/block/sheepdog.c b/block/sheepdog.c
index b4ae50f44d..ba451a97a4 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -1666,9 +1666,11 @@ static int sd_create(const char *filename, QEMUOptionParameter *options,
                 goto out;
             }
         } else if (!strcmp(options->name, BLOCK_OPT_REDUNDANCY)) {
-            ret = parse_redundancy(s, options->value.s);
-            if (ret < 0) {
-                goto out;
+            if (options->value.s) {
+                ret = parse_redundancy(s, options->value.s);
+                if (ret < 0) {
+                    goto out;
+                }
             }
         }
         options++;
@@ -2046,13 +2048,14 @@ static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
 {
     SheepdogAIOCB *acb;
     int ret;
+    int64_t offset = (sector_num + nb_sectors) * BDRV_SECTOR_SIZE;
+    BDRVSheepdogState *s = bs->opaque;
 
-    if (bs->growable && sector_num + nb_sectors > bs->total_sectors) {
-        ret = sd_truncate(bs, (sector_num + nb_sectors) * BDRV_SECTOR_SIZE);
+    if (bs->growable && offset > s->inode.vdi_size) {
+        ret = sd_truncate(bs, offset);
         if (ret < 0) {
             return ret;
         }
-        bs->total_sectors = sector_num + nb_sectors;
     }
 
     acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors);
diff --git a/block/vhdx-log.c b/block/vhdx-log.c
index ee5583c309..8c9ae0d8e7 100644
--- a/block/vhdx-log.c
+++ b/block/vhdx-log.c
@@ -706,7 +706,8 @@ exit:
  *
  * If read-only, we must replay the log in RAM (or refuse to open
  * a dirty VHDX file read-only) */
-int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s, bool *flushed)
+int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s, bool *flushed,
+                   Error **errp)
 {
     int ret = 0;
     VHDXHeader *hdr;
@@ -761,6 +762,16 @@ int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s, bool *flushed)
     }
 
     if (logs.valid) {
+        if (bs->read_only) {
+            ret = -EPERM;
+            error_setg_errno(errp, EPERM,
+                             "VHDX image file '%s' opened read-only, but "
+                             "contains a log that needs to be replayed.  To "
+                             "replay the log, execute:\n qemu-img check -r "
+                             "all '%s'",
+                             bs->filename, bs->filename);
+            goto exit;
+        }
         /* now flush the log */
         ret = vhdx_log_flush(bs, s, &logs);
         if (ret < 0) {
diff --git a/block/vhdx.c b/block/vhdx.c
index 67bbe103a1..1995778945 100644
--- a/block/vhdx.c
+++ b/block/vhdx.c
@@ -878,7 +878,6 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags,
     int ret = 0;
     uint32_t i;
     uint64_t signature;
-    bool log_flushed = false;
 
 
     s->bat = NULL;
@@ -907,7 +906,7 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags,
         goto fail;
     }
 
-    ret = vhdx_parse_log(bs, s, &log_flushed);
+    ret = vhdx_parse_log(bs, s, &s->log_replayed_on_open, errp);
     if (ret < 0) {
         goto fail;
     }
@@ -1854,6 +1853,24 @@ exit:
     return ret;
 }
 
+/* If opened r/w, the VHDX driver will automatically replay the log,
+ * if one is present, inside the vhdx_open() call.
+ *
+ * If qemu-img check -r all is called, the image is automatically opened
+ * r/w and any log has already been replayed, so there is nothing (currently)
+ * for us to do here
+ */
+static int vhdx_check(BlockDriverState *bs, BdrvCheckResult *result,
+                       BdrvCheckMode fix)
+{
+    BDRVVHDXState *s = bs->opaque;
+
+    if (s->log_replayed_on_open) {
+        result->corruptions_fixed++;
+    }
+    return 0;
+}
+
 static QEMUOptionParameter vhdx_create_options[] = {
     {
         .name = BLOCK_OPT_SIZE,
@@ -1898,6 +1915,7 @@ static BlockDriver bdrv_vhdx = {
     .bdrv_co_writev         = vhdx_co_writev,
     .bdrv_create            = vhdx_create,
     .bdrv_get_info          = vhdx_get_info,
+    .bdrv_check             = vhdx_check,
 
     .create_options         = vhdx_create_options,
 };
diff --git a/block/vhdx.h b/block/vhdx.h
index 51183b243c..2acd7c2d19 100644
--- a/block/vhdx.h
+++ b/block/vhdx.h
@@ -394,6 +394,8 @@ typedef struct BDRVVHDXState {
 
     Error *migration_blocker;
 
+    bool log_replayed_on_open;
+
     QLIST_HEAD(VHDXRegionHead, VHDXRegionEntry) regions;
 } BDRVVHDXState;
 
@@ -408,7 +410,8 @@ uint32_t vhdx_checksum_calc(uint32_t crc, uint8_t *buf, size_t size,
 
 bool vhdx_checksum_is_valid(uint8_t *buf, size_t size, int crc_offset);
 
-int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s, bool *flushed);
+int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s, bool *flushed,
+                   Error **errp);
 
 int vhdx_log_write_and_flush(BlockDriverState *bs, BDRVVHDXState *s,
                              void *data, uint32_t length, uint64_t offset);
diff --git a/block/vmdk.c b/block/vmdk.c
index 0734bc200c..c6b60b4a91 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -749,9 +749,14 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
                 return -EINVAL;
             }
         } else if (!strcmp(type, "VMFS")) {
-            flat_offset = 0;
+            if (ret == 4) {
+                flat_offset = 0;
+            } else {
+                error_setg(errp, "Invalid extent lines:\n%s", p);
+                return -EINVAL;
+            }
         } else if (ret != 4) {
-            error_setg(errp, "Invalid extent lines: \n%s", p);
+            error_setg(errp, "Invalid extent lines:\n%s", p);
             return -EINVAL;
         }
 
@@ -1447,23 +1452,33 @@ static int coroutine_fn vmdk_co_write_zeroes(BlockDriverState *bs,
 }
 
 static int vmdk_create_extent(const char *filename, int64_t filesize,
-                              bool flat, bool compress, bool zeroed_grain)
+                              bool flat, bool compress, bool zeroed_grain,
+                              Error **errp)
 {
     int ret, i;
-    int fd = 0;
+    BlockDriverState *bs = NULL;
     VMDK4Header header;
-    uint32_t tmp, magic, grains, gd_size, gt_size, gt_count;
+    Error *local_err;
+    uint32_t tmp, magic, grains, gd_sectors, gt_size, gt_count;
+    uint32_t *gd_buf = NULL;
+    int gd_buf_size;
+
+    ret = bdrv_create_file(filename, NULL, &local_err);
+    if (ret < 0) {
+        error_propagate(errp, local_err);
+        goto exit;
+    }
 
-    fd = qemu_open(filename,
-                   O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
-                   0644);
-    if (fd < 0) {
-        return -errno;
+    ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR, &local_err);
+    if (ret < 0) {
+        error_propagate(errp, local_err);
+        goto exit;
     }
+
     if (flat) {
-        ret = ftruncate(fd, filesize);
+        ret = bdrv_truncate(bs, filesize);
         if (ret < 0) {
-            ret = -errno;
+            error_setg(errp, "Could not truncate file");
         }
         goto exit;
     }
@@ -1474,24 +1489,23 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
                    | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0)
                    | (zeroed_grain ? VMDK4_FLAG_ZERO_GRAIN : 0);
     header.compressAlgorithm = compress ? VMDK4_COMPRESSION_DEFLATE : 0;
-    header.capacity = filesize / 512;
+    header.capacity = filesize / BDRV_SECTOR_SIZE;
     header.granularity = 128;
-    header.num_gtes_per_gt = 512;
+    header.num_gtes_per_gt = BDRV_SECTOR_SIZE;
 
-    grains = (filesize / 512 + header.granularity - 1) / header.granularity;
-    gt_size = ((header.num_gtes_per_gt * sizeof(uint32_t)) + 511) >> 9;
-    gt_count =
-        (grains + header.num_gtes_per_gt - 1) / header.num_gtes_per_gt;
-    gd_size = (gt_count * sizeof(uint32_t) + 511) >> 9;
+    grains = DIV_ROUND_UP(filesize / BDRV_SECTOR_SIZE, header.granularity);
+    gt_size = DIV_ROUND_UP(header.num_gtes_per_gt * sizeof(uint32_t),
+                           BDRV_SECTOR_SIZE);
+    gt_count = DIV_ROUND_UP(grains, header.num_gtes_per_gt);
+    gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE);
 
     header.desc_offset = 1;
     header.desc_size = 20;
     header.rgd_offset = header.desc_offset + header.desc_size;
-    header.gd_offset = header.rgd_offset + gd_size + (gt_size * gt_count);
+    header.gd_offset = header.rgd_offset + gd_sectors + (gt_size * gt_count);
     header.grain_offset =
-       ((header.gd_offset + gd_size + (gt_size * gt_count) +
-         header.granularity - 1) / header.granularity) *
-        header.granularity;
+        ROUND_UP(header.gd_offset + gd_sectors + (gt_size * gt_count),
+                 header.granularity);
     /* swap endianness for all header fields */
     header.version = cpu_to_le32(header.version);
     header.flags = cpu_to_le32(header.flags);
@@ -1511,48 +1525,55 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
     header.check_bytes[3] = 0xa;
 
     /* write all the data */
-    ret = qemu_write_full(fd, &magic, sizeof(magic));
-    if (ret != sizeof(magic)) {
-        ret = -errno;
+    ret = bdrv_pwrite(bs, 0, &magic, sizeof(magic));
+    if (ret < 0) {
+        error_set(errp, QERR_IO_ERROR);
         goto exit;
     }
-    ret = qemu_write_full(fd, &header, sizeof(header));
-    if (ret != sizeof(header)) {
-        ret = -errno;
+    ret = bdrv_pwrite(bs, sizeof(magic), &header, sizeof(header));
+    if (ret < 0) {
+        error_set(errp, QERR_IO_ERROR);
         goto exit;
     }
 
-    ret = ftruncate(fd, le64_to_cpu(header.grain_offset) << 9);
+    ret = bdrv_truncate(bs, le64_to_cpu(header.grain_offset) << 9);
     if (ret < 0) {
-        ret = -errno;
+        error_setg(errp, "Could not truncate file");
         goto exit;
     }
 
     /* write grain directory */
-    lseek(fd, le64_to_cpu(header.rgd_offset) << 9, SEEK_SET);
-    for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_size;
+    gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE;
+    gd_buf = g_malloc0(gd_buf_size);
+    for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors;
          i < gt_count; i++, tmp += gt_size) {
-        ret = qemu_write_full(fd, &tmp, sizeof(tmp));
-        if (ret != sizeof(tmp)) {
-            ret = -errno;
-            goto exit;
-        }
+        gd_buf[i] = cpu_to_le32(tmp);
+    }
+    ret = bdrv_pwrite(bs, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE,
+                      gd_buf, gd_buf_size);
+    if (ret < 0) {
+        error_set(errp, QERR_IO_ERROR);
+        goto exit;
     }
 
     /* write backup grain directory */
-    lseek(fd, le64_to_cpu(header.gd_offset) << 9, SEEK_SET);
-    for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_size;
+    for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors;
          i < gt_count; i++, tmp += gt_size) {
-        ret = qemu_write_full(fd, &tmp, sizeof(tmp));
-        if (ret != sizeof(tmp)) {
-            ret = -errno;
-            goto exit;
-        }
+        gd_buf[i] = cpu_to_le32(tmp);
+    }
+    ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
+                      gd_buf, gd_buf_size);
+    if (ret < 0) {
+        error_set(errp, QERR_IO_ERROR);
+        goto exit;
     }
 
     ret = 0;
- exit:
-    qemu_close(fd);
+exit:
+    if (bs) {
+        bdrv_unref(bs);
+    }
+    g_free(gd_buf);
     return ret;
 }
 
@@ -1599,7 +1620,9 @@ static int filename_decompose(const char *filename, char *path, char *prefix,
 static int vmdk_create(const char *filename, QEMUOptionParameter *options,
                        Error **errp)
 {
-    int fd, idx = 0;
+    int idx = 0;
+    BlockDriverState *new_bs = NULL;
+    Error *local_err;
     char *desc = NULL;
     int64_t total_size = 0, filesize;
     const char *adapter_type = NULL;
@@ -1616,6 +1639,7 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options,
     uint32_t parent_cid = 0xffffffff;
     uint32_t number_heads = 16;
     bool zeroed_grain = false;
+    uint32_t desc_offset = 0, desc_len;
     const char desc_template[] =
         "# Disk DescriptorFile\n"
         "version=1\n"
@@ -1749,7 +1773,7 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options,
                 path, desc_filename);
 
         if (vmdk_create_extent(ext_filename, size,
-                               flat, compress, zeroed_grain)) {
+                               flat, compress, zeroed_grain, errp)) {
             ret = -EINVAL;
             goto exit;
         }
@@ -1757,7 +1781,7 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options,
 
         /* Format description line */
         snprintf(desc_line, sizeof(desc_line),
-                    desc_extent_line, size / 512, desc_filename);
+                    desc_extent_line, size / BDRV_SECTOR_SIZE, desc_filename);
         g_string_append(ext_desc_lines, desc_line);
     }
     /* generate descriptor file */
@@ -1768,36 +1792,43 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options,
                            parent_desc_line,
                            ext_desc_lines->str,
                            (flags & BLOCK_FLAG_COMPAT6 ? 6 : 4),
-                           total_size / (int64_t)(63 * number_heads * 512),
+                           total_size /
+                               (int64_t)(63 * number_heads * BDRV_SECTOR_SIZE),
                            number_heads,
                            adapter_type);
-    if (split || flat) {
-        fd = qemu_open(filename,
-                       O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
-                       0644);
+    desc_len = strlen(desc);
+    /* the descriptor offset = 0x200 */
+    if (!split && !flat) {
+        desc_offset = 0x200;
     } else {
-        fd = qemu_open(filename,
-                       O_WRONLY | O_BINARY | O_LARGEFILE,
-                       0644);
+        ret = bdrv_create_file(filename, options, &local_err);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret, "Could not create image file");
+            goto exit;
+        }
     }
-    if (fd < 0) {
-        ret = -errno;
+    ret = bdrv_file_open(&new_bs, filename, NULL, BDRV_O_RDWR, &local_err);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "Could not write description");
         goto exit;
     }
-    /* the descriptor offset = 0x200 */
-    if (!split && !flat && 0x200 != lseek(fd, 0x200, SEEK_SET)) {
-        ret = -errno;
-        goto close_exit;
+    ret = bdrv_pwrite(new_bs, desc_offset, desc, desc_len);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "Could not write description");
+        goto exit;
     }
-    ret = qemu_write_full(fd, desc, strlen(desc));
-    if (ret != strlen(desc)) {
-        ret = -errno;
-        goto close_exit;
+    /* bdrv_pwrite write padding zeros to align to sector, we don't need that
+     * for description file */
+    if (desc_offset == 0) {
+        ret = bdrv_truncate(new_bs, desc_len);
+        if (ret < 0) {
+            error_setg(errp, "Could not truncate file");
+        }
     }
-    ret = 0;
-close_exit:
-    qemu_close(fd);
 exit:
+    if (new_bs) {
+        bdrv_unref(new_bs);
+    }
     g_free(desc);
     g_string_free(ext_desc_lines, true);
     return ret;
diff --git a/block/vvfat.c b/block/vvfat.c
index 3ddaa0bcce..1abb8ad8e4 100644
--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -266,8 +266,7 @@ typedef struct mbr_t {
 } QEMU_PACKED mbr_t;
 
 typedef struct direntry_t {
-    uint8_t name[8];
-    uint8_t extension[3];
+    uint8_t name[8 + 3];
     uint8_t attributes;
     uint8_t reserved[2];
     uint16_t ctime;
@@ -518,11 +517,9 @@ static inline uint8_t fat_chksum(const direntry_t* entry)
     uint8_t chksum=0;
     int i;
 
-    for(i=0;i<11;i++) {
-        unsigned char c;
-
-        c = (i < 8) ? entry->name[i] : entry->extension[i-8];
-        chksum=(((chksum&0xfe)>>1)|((chksum&0x01)?0x80:0)) + c;
+    for (i = 0; i < ARRAY_SIZE(entry->name); i++) {
+        chksum = (((chksum & 0xfe) >> 1) |
+                  ((chksum & 0x01) ? 0x80 : 0)) + entry->name[i];
     }
 
     return chksum;
@@ -617,7 +614,7 @@ static inline direntry_t* create_short_and_long_name(BDRVVVFATState* s,
 
     if(is_dot) {
 	entry=array_get_next(&(s->directory));
-	memset(entry->name,0x20,11);
+        memset(entry->name, 0x20, sizeof(entry->name));
 	memcpy(entry->name,filename,strlen(filename));
 	return entry;
     }
@@ -632,12 +629,14 @@ static inline direntry_t* create_short_and_long_name(BDRVVVFATState* s,
 	i = 8;
 
     entry=array_get_next(&(s->directory));
-    memset(entry->name,0x20,11);
+    memset(entry->name, 0x20, sizeof(entry->name));
     memcpy(entry->name, filename, i);
 
-    if(j > 0)
-	for (i = 0; i < 3 && filename[j+1+i]; i++)
-	    entry->extension[i] = filename[j+1+i];
+    if (j > 0) {
+        for (i = 0; i < 3 && filename[j + 1 + i]; i++) {
+            entry->name[8 + i] = filename[j + 1 + i];
+        }
+    }
 
     /* upcase & remove unwanted characters */
     for(i=10;i>=0;i--) {
@@ -861,8 +860,7 @@ static int init_directories(BDRVVVFATState* s,
     {
 	direntry_t* entry=array_get_next(&(s->directory));
 	entry->attributes=0x28; /* archive | volume label */
-	memcpy(entry->name,"QEMU VVF",8);
-	memcpy(entry->extension,"AT ",3);
+        memcpy(entry->name, "QEMU VVFAT ", sizeof(entry->name));
     }
 
     /* Now build FAT, and write back information into directory */
@@ -1591,17 +1589,20 @@ static int parse_short_name(BDRVVVFATState* s,
 	    lfn->name[i] = direntry->name[i];
     }
 
-    for (j = 2; j >= 0 && direntry->extension[j] == ' '; j--);
+    for (j = 2; j >= 0 && direntry->name[8 + j] == ' '; j--) {
+    }
     if (j >= 0) {
 	lfn->name[i++] = '.';
 	lfn->name[i + j + 1] = '\0';
 	for (;j >= 0; j--) {
-	    if (direntry->extension[j] <= ' ' || direntry->extension[j] > 0x7f)
-		return -2;
-	    else if (s->downcase_short_names)
-		lfn->name[i + j] = qemu_tolower(direntry->extension[j]);
-	    else
-		lfn->name[i + j] = direntry->extension[j];
+            uint8_t c = direntry->name[8 + j];
+            if (c <= ' ' || c > 0x7f) {
+                return -2;
+            } else if (s->downcase_short_names) {
+                lfn->name[i + j] = qemu_tolower(c);
+            } else {
+                lfn->name[i + j] = c;
+            }
 	}
     } else
 	lfn->name[i + j + 1] = '\0';