summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r--Makefile.objs2
-rw-r--r--block/blkverify.c382
-rw-r--r--block/nbd.c2
-rw-r--r--block/qcow2-cluster.c19
-rw-r--r--block/qcow2-refcount.c13
-rw-r--r--block/qcow2-snapshot.c2
-rw-r--r--block/qcow2.c115
-rw-r--r--block/qcow2.h4
-rw-r--r--block/raw-posix.c79
-rw-r--r--block/vvfat.c26
-rw-r--r--cutils.c50
-rw-r--r--docs/blkverify.txt69
-rw-r--r--hw/ide/core.c4
-rw-r--r--hw/scsi-bus.c3
-rw-r--r--hw/scsi-disk.c10
-rw-r--r--hw/scsi-generic.c21
-rw-r--r--hw/sd.c2
-rw-r--r--hw/virtio-blk.c1
-rw-r--r--nbd.c25
-rw-r--r--posix-aio-compat.c2
-rw-r--r--qemu-common.h3
-rw-r--r--qemu-io.c2
-rw-r--r--qemu-nbd.c8
23 files changed, 721 insertions, 123 deletions
diff --git a/Makefile.objs b/Makefile.objs
index 3ef6d80dfd..dad459343d 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -14,7 +14,7 @@ block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
 
 block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o
 block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o
-block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o
+block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
 block-nested-$(CONFIG_WIN32) += raw-win32.o
 block-nested-$(CONFIG_POSIX) += raw-posix.o
 block-nested-$(CONFIG_CURL) += curl.o
diff --git a/block/blkverify.c b/block/blkverify.c
new file mode 100644
index 0000000000..4202685a63
--- /dev/null
+++ b/block/blkverify.c
@@ -0,0 +1,382 @@
+/*
+ * Block protocol for block driver correctness testing
+ *
+ * Copyright (C) 2010 IBM, Corp.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include <stdarg.h>
+#include "qemu_socket.h" /* for EINPROGRESS on Windows */
+#include "block_int.h"
+
+typedef struct {
+    BlockDriverState *test_file;
+} BDRVBlkverifyState;
+
+typedef struct BlkverifyAIOCB BlkverifyAIOCB;
+struct BlkverifyAIOCB {
+    BlockDriverAIOCB common;
+    QEMUBH *bh;
+
+    /* Request metadata */
+    bool is_write;
+    int64_t sector_num;
+    int nb_sectors;
+
+    int ret;                    /* first completed request's result */
+    unsigned int done;          /* completion counter */
+    bool *finished;             /* completion signal for cancel */
+
+    QEMUIOVector *qiov;         /* user I/O vector */
+    QEMUIOVector raw_qiov;      /* cloned I/O vector for raw file */
+    void *buf;                  /* buffer for raw file I/O */
+
+    void (*verify)(BlkverifyAIOCB *acb);
+};
+
+static void blkverify_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+    BlkverifyAIOCB *acb = (BlkverifyAIOCB *)blockacb;
+    bool finished = false;
+
+    /* Wait until request completes, invokes its callback, and frees itself */
+    acb->finished = &finished;
+    while (!finished) {
+        qemu_aio_wait();
+    }
+}
+
+static AIOPool blkverify_aio_pool = {
+    .aiocb_size         = sizeof(BlkverifyAIOCB),
+    .cancel             = blkverify_aio_cancel,
+};
+
+static void blkverify_err(BlkverifyAIOCB *acb, const char *fmt, ...)
+{
+    va_list ap;
+
+    va_start(ap, fmt);
+    fprintf(stderr, "blkverify: %s sector_num=%ld nb_sectors=%d ",
+            acb->is_write ? "write" : "read", acb->sector_num,
+            acb->nb_sectors);
+    vfprintf(stderr, fmt, ap);
+    fprintf(stderr, "\n");
+    va_end(ap);
+    exit(1);
+}
+
+/* Valid blkverify filenames look like blkverify:path/to/raw_image:path/to/image */
+static int blkverify_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVBlkverifyState *s = bs->opaque;
+    int ret;
+    char *raw, *c;
+
+    /* Parse the blkverify: prefix */
+    if (strncmp(filename, "blkverify:", strlen("blkverify:"))) {
+        return -EINVAL;
+    }
+    filename += strlen("blkverify:");
+
+    /* Parse the raw image filename */
+    c = strchr(filename, ':');
+    if (c == NULL) {
+        return -EINVAL;
+    }
+
+    raw = strdup(filename);
+    raw[c - filename] = '\0';
+    ret = bdrv_file_open(&bs->file, raw, flags);
+    free(raw);
+    if (ret < 0) {
+        return ret;
+    }
+    filename = c + 1;
+
+    /* Open the test file */
+    s->test_file = bdrv_new("");
+    ret = bdrv_open(s->test_file, filename, flags, NULL);
+    if (ret < 0) {
+        bdrv_delete(s->test_file);
+        s->test_file = NULL;
+        return ret;
+    }
+
+    return 0;
+}
+
+static void blkverify_close(BlockDriverState *bs)
+{
+    BDRVBlkverifyState *s = bs->opaque;
+
+    bdrv_delete(s->test_file);
+    s->test_file = NULL;
+}
+
+static void blkverify_flush(BlockDriverState *bs)
+{
+    BDRVBlkverifyState *s = bs->opaque;
+
+    /* Only flush test file, the raw file is not important */
+    bdrv_flush(s->test_file);
+}
+
+static int64_t blkverify_getlength(BlockDriverState *bs)
+{
+    BDRVBlkverifyState *s = bs->opaque;
+
+    return bdrv_getlength(s->test_file);
+}
+
+/**
+ * Check that I/O vector contents are identical
+ *
+ * @a:          I/O vector
+ * @b:          I/O vector
+ * @ret:        Offset to first mismatching byte or -1 if match
+ */
+static ssize_t blkverify_iovec_compare(QEMUIOVector *a, QEMUIOVector *b)
+{
+    int i;
+    ssize_t offset = 0;
+
+    assert(a->niov == b->niov);
+    for (i = 0; i < a->niov; i++) {
+        size_t len = 0;
+        uint8_t *p = (uint8_t *)a->iov[i].iov_base;
+        uint8_t *q = (uint8_t *)b->iov[i].iov_base;
+
+        assert(a->iov[i].iov_len == b->iov[i].iov_len);
+        while (len < a->iov[i].iov_len && *p++ == *q++) {
+            len++;
+        }
+
+        offset += len;
+
+        if (len != a->iov[i].iov_len) {
+            return offset;
+        }
+    }
+    return -1;
+}
+
+typedef struct {
+    int src_index;
+    struct iovec *src_iov;
+    void *dest_base;
+} IOVectorSortElem;
+
+static int sortelem_cmp_src_base(const void *a, const void *b)
+{
+    const IOVectorSortElem *elem_a = a;
+    const IOVectorSortElem *elem_b = b;
+
+    /* Don't overflow */
+    if (elem_a->src_iov->iov_base < elem_b->src_iov->iov_base) {
+        return -1;
+    } else if (elem_a->src_iov->iov_base > elem_b->src_iov->iov_base) {
+        return 1;
+    } else {
+        return 0;
+    }
+}
+
+static int sortelem_cmp_src_index(const void *a, const void *b)
+{
+    const IOVectorSortElem *elem_a = a;
+    const IOVectorSortElem *elem_b = b;
+
+    return elem_a->src_index - elem_b->src_index;
+}
+
+/**
+ * Copy contents of I/O vector
+ *
+ * The relative relationships of overlapping iovecs are preserved.  This is
+ * necessary to ensure identical semantics in the cloned I/O vector.
+ */
+static void blkverify_iovec_clone(QEMUIOVector *dest, const QEMUIOVector *src,
+                                  void *buf)
+{
+    IOVectorSortElem sortelems[src->niov];
+    void *last_end;
+    int i;
+
+    /* Sort by source iovecs by base address */
+    for (i = 0; i < src->niov; i++) {
+        sortelems[i].src_index = i;
+        sortelems[i].src_iov = &src->iov[i];
+    }
+    qsort(sortelems, src->niov, sizeof(sortelems[0]), sortelem_cmp_src_base);
+
+    /* Allocate buffer space taking into account overlapping iovecs */
+    last_end = NULL;
+    for (i = 0; i < src->niov; i++) {
+        struct iovec *cur = sortelems[i].src_iov;
+        ptrdiff_t rewind = 0;
+
+        /* Detect overlap */
+        if (last_end && last_end > cur->iov_base) {
+            rewind = last_end - cur->iov_base;
+        }
+
+        sortelems[i].dest_base = buf - rewind;
+        buf += cur->iov_len - MIN(rewind, cur->iov_len);
+        last_end = MAX(cur->iov_base + cur->iov_len, last_end);
+    }
+
+    /* Sort by source iovec index and build destination iovec */
+    qsort(sortelems, src->niov, sizeof(sortelems[0]), sortelem_cmp_src_index);
+    for (i = 0; i < src->niov; i++) {
+        qemu_iovec_add(dest, sortelems[i].dest_base, src->iov[i].iov_len);
+    }
+}
+
+static BlkverifyAIOCB *blkverify_aio_get(BlockDriverState *bs, bool is_write,
+                                         int64_t sector_num, QEMUIOVector *qiov,
+                                         int nb_sectors,
+                                         BlockDriverCompletionFunc *cb,
+                                         void *opaque)
+{
+    BlkverifyAIOCB *acb = qemu_aio_get(&blkverify_aio_pool, bs, cb, opaque);
+
+    acb->bh = NULL;
+    acb->is_write = is_write;
+    acb->sector_num = sector_num;
+    acb->nb_sectors = nb_sectors;
+    acb->ret = -EINPROGRESS;
+    acb->done = 0;
+    acb->qiov = qiov;
+    acb->buf = NULL;
+    acb->verify = NULL;
+    acb->finished = NULL;
+    return acb;
+}
+
+static void blkverify_aio_bh(void *opaque)
+{
+    BlkverifyAIOCB *acb = opaque;
+
+    qemu_bh_delete(acb->bh);
+    if (acb->buf) {
+        qemu_iovec_destroy(&acb->raw_qiov);
+        qemu_vfree(acb->buf);
+    }
+    acb->common.cb(acb->common.opaque, acb->ret);
+    if (acb->finished) {
+        *acb->finished = true;
+    }
+    qemu_aio_release(acb);
+}
+
+static void blkverify_aio_cb(void *opaque, int ret)
+{
+    BlkverifyAIOCB *acb = opaque;
+
+    switch (++acb->done) {
+    case 1:
+        acb->ret = ret;
+        break;
+
+    case 2:
+        if (acb->ret != ret) {
+            blkverify_err(acb, "return value mismatch %d != %d", acb->ret, ret);
+        }
+
+        if (acb->verify) {
+            acb->verify(acb);
+        }
+
+        acb->bh = qemu_bh_new(blkverify_aio_bh, acb);
+        qemu_bh_schedule(acb->bh);
+        break;
+    }
+}
+
+static void blkverify_verify_readv(BlkverifyAIOCB *acb)
+{
+    ssize_t offset = blkverify_iovec_compare(acb->qiov, &acb->raw_qiov);
+    if (offset != -1) {
+        blkverify_err(acb, "contents mismatch in sector %ld",
+                      acb->sector_num + (offset / BDRV_SECTOR_SIZE));
+    }
+}
+
+static BlockDriverAIOCB *blkverify_aio_readv(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BDRVBlkverifyState *s = bs->opaque;
+    BlkverifyAIOCB *acb = blkverify_aio_get(bs, false, sector_num, qiov,
+                                            nb_sectors, cb, opaque);
+
+    acb->verify = blkverify_verify_readv;
+    acb->buf = qemu_blockalign(bs->file, qiov->size);
+    qemu_iovec_init(&acb->raw_qiov, acb->qiov->niov);
+    blkverify_iovec_clone(&acb->raw_qiov, qiov, acb->buf);
+
+    if (!bdrv_aio_readv(s->test_file, sector_num, qiov, nb_sectors,
+                        blkverify_aio_cb, acb)) {
+        blkverify_aio_cb(acb, -EIO);
+    }
+    if (!bdrv_aio_readv(bs->file, sector_num, &acb->raw_qiov, nb_sectors,
+                        blkverify_aio_cb, acb)) {
+        blkverify_aio_cb(acb, -EIO);
+    }
+    return &acb->common;
+}
+
+static BlockDriverAIOCB *blkverify_aio_writev(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BDRVBlkverifyState *s = bs->opaque;
+    BlkverifyAIOCB *acb = blkverify_aio_get(bs, true, sector_num, qiov,
+                                            nb_sectors, cb, opaque);
+
+    if (!bdrv_aio_writev(s->test_file, sector_num, qiov, nb_sectors,
+                         blkverify_aio_cb, acb)) {
+        blkverify_aio_cb(acb, -EIO);
+    }
+    if (!bdrv_aio_writev(bs->file, sector_num, qiov, nb_sectors,
+                         blkverify_aio_cb, acb)) {
+        blkverify_aio_cb(acb, -EIO);
+    }
+    return &acb->common;
+}
+
+static BlockDriverAIOCB *blkverify_aio_flush(BlockDriverState *bs,
+                                             BlockDriverCompletionFunc *cb,
+                                             void *opaque)
+{
+    BDRVBlkverifyState *s = bs->opaque;
+
+    /* Only flush test file, the raw file is not important */
+    return bdrv_aio_flush(s->test_file, cb, opaque);
+}
+
+static BlockDriver bdrv_blkverify = {
+    .format_name        = "blkverify",
+    .protocol_name      = "blkverify",
+
+    .instance_size      = sizeof(BDRVBlkverifyState),
+
+    .bdrv_getlength     = blkverify_getlength,
+
+    .bdrv_file_open     = blkverify_open,
+    .bdrv_close         = blkverify_close,
+    .bdrv_flush         = blkverify_flush,
+
+    .bdrv_aio_readv     = blkverify_aio_readv,
+    .bdrv_aio_writev    = blkverify_aio_writev,
+    .bdrv_aio_flush     = blkverify_aio_flush,
+};
+
+static void bdrv_blkverify_init(void)
+{
+    bdrv_register(&bdrv_blkverify);
+}
+
+block_init(bdrv_blkverify_init);
diff --git a/block/nbd.c b/block/nbd.c
index 5e9d6cb8e6..c8dc763c6b 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -95,8 +95,6 @@ static int nbd_open(BlockDriverState *bs, const char* filename, int flags)
             if (r == p) {
                 goto out;
             }
-        } else if (name == NULL) {
-            goto out;
         }
 
         sock = tcp_socket_outgoing(hostname, port);
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index f562b1602e..fb4224a669 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -60,6 +60,7 @@ int qcow2_grow_l1_table(BlockDriverState *bs, int min_size)
         qemu_free(new_l1_table);
         return new_l1_table_offset;
     }
+    bdrv_flush(bs->file);
 
     BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE);
     for(i = 0; i < s->l1_size; i++)
@@ -243,6 +244,7 @@ static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table)
     if (l2_offset < 0) {
         return l2_offset;
     }
+    bdrv_flush(bs->file);
 
     /* allocate a new entry in the l2 cache */
 
@@ -348,6 +350,8 @@ static int qcow_read(BlockDriverState *bs, int64_t sector_num,
     BDRVQcowState *s = bs->opaque;
     int ret, index_in_cluster, n, n1;
     uint64_t cluster_offset;
+    struct iovec iov;
+    QEMUIOVector qiov;
 
     while (nb_sectors > 0) {
         n = nb_sectors;
@@ -362,7 +366,11 @@ static int qcow_read(BlockDriverState *bs, int64_t sector_num,
         if (!cluster_offset) {
             if (bs->backing_hd) {
                 /* read from the base image */
-                n1 = qcow2_backing_read1(bs->backing_hd, sector_num, buf, n);
+                iov.iov_base = buf;
+                iov.iov_len = n * 512;
+                qemu_iovec_init_external(&qiov, &iov, 1);
+
+                n1 = qcow2_backing_read1(bs->backing_hd, &qiov, sector_num, n);
                 if (n1 > 0) {
                     BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING);
                     ret = bdrv_read(bs->backing_hd, sector_num, buf, n1);
@@ -413,7 +421,7 @@ static int copy_sectors(BlockDriverState *bs, uint64_t start_sect,
                         &s->aes_encrypt_key);
     }
     BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE);
-    ret = bdrv_write_sync(bs->file, (cluster_offset >> 9) + n_start,
+    ret = bdrv_write(bs->file, (cluster_offset >> 9) + n_start,
         s->cluster_data, n);
     if (ret < 0)
         return ret;
@@ -712,6 +720,13 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
                     (i << s->cluster_bits)) | QCOW_OFLAG_COPIED);
      }
 
+    /*
+     * Before we update the L2 table to actually point to the new cluster, we
+     * need to be sure that the refcounts have been increased and COW was
+     * handled.
+     */
+    bdrv_flush(bs->file);
+
     ret = write_l2_entries(bs, l2_table, l2_offset, l2_index, m->nb_clusters);
     if (ret < 0) {
         qcow2_l2_cache_reset(bs);
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 4c19e7ebd8..7082601f59 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -261,6 +261,8 @@ static int64_t alloc_refcount_block(BlockDriverState *bs, int64_t cluster_index)
             goto fail_block;
         }
 
+        bdrv_flush(bs->file);
+
         /* Initialize the new refcount block only after updating its refcount,
          * update_refcount uses the refcount cache itself */
         memset(s->refcount_block_cache, 0, s->cluster_size);
@@ -444,7 +446,7 @@ static int write_refcount_block_entries(BlockDriverState *bs,
     size = (last_index - first_index) << REFCOUNT_SHIFT;
 
     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_UPDATE_PART);
-    ret = bdrv_pwrite_sync(bs->file,
+    ret = bdrv_pwrite(bs->file,
         refcount_block_offset + (first_index << REFCOUNT_SHIFT),
         &s->refcount_block_cache[first_index], size);
     if (ret < 0) {
@@ -573,6 +575,8 @@ static int update_cluster_refcount(BlockDriverState *bs,
         return ret;
     }
 
+    bdrv_flush(bs->file);
+
     return get_refcount(bs, cluster_index);
 }
 
@@ -624,6 +628,7 @@ int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size)
     if (ret < 0) {
         return ret;
     }
+
     return offset;
 }
 
@@ -671,6 +676,8 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
             goto redo;
         }
     }
+
+    bdrv_flush(bs->file);
     return offset;
 }
 
@@ -801,6 +808,10 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
                             if (ret < 0) {
                                 goto fail;
                             }
+
+                            /* TODO Flushing once for the whole function should
+                             * be enough */
+                            bdrv_flush(bs->file);
                         }
                         /* compressed clusters are never modified */
                         refcount = 2;
diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c
index 6228612f84..bbfcaaae22 100644
--- a/block/qcow2-snapshot.c
+++ b/block/qcow2-snapshot.c
@@ -138,6 +138,7 @@ static int qcow_write_snapshots(BlockDriverState *bs)
     snapshots_size = offset;
 
     snapshots_offset = qcow2_alloc_clusters(bs, snapshots_size);
+    bdrv_flush(bs->file);
     offset = snapshots_offset;
     if (offset < 0) {
         return offset;
@@ -271,6 +272,7 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
     if (l1_table_offset < 0) {
         goto fail;
     }
+    bdrv_flush(bs->file);
 
     sn->l1_table_offset = l1_table_offset;
     sn->l1_size = s->l1_size;
diff --git a/block/qcow2.c b/block/qcow2.c
index a53014dbda..ee3481b786 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -311,8 +311,8 @@ static int qcow_is_allocated(BlockDriverState *bs, int64_t sector_num,
 }
 
 /* handle reading after the end of the backing file */
-int qcow2_backing_read1(BlockDriverState *bs,
-                  int64_t sector_num, uint8_t *buf, int nb_sectors)
+int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
+                  int64_t sector_num, int nb_sectors)
 {
     int n1;
     if ((sector_num + nb_sectors) <= bs->total_sectors)
@@ -321,7 +321,9 @@ int qcow2_backing_read1(BlockDriverState *bs,
         n1 = 0;
     else
         n1 = bs->total_sectors - sector_num;
-    memset(buf + n1 * 512, 0, 512 * (nb_sectors - n1));
+
+    qemu_iovec_memset(qiov, 0, 512 * (nb_sectors - n1));
+
     return n1;
 }
 
@@ -329,14 +331,12 @@ typedef struct QCowAIOCB {
     BlockDriverAIOCB common;
     int64_t sector_num;
     QEMUIOVector *qiov;
-    uint8_t *buf;
-    void *orig_buf;
     int remaining_sectors;
     int cur_nr_sectors;	/* number of sectors in current iteration */
+    uint64_t bytes_done;
     uint64_t cluster_offset;
     uint8_t *cluster_data;
     BlockDriverAIOCB *hd_aiocb;
-    struct iovec hd_iov;
     QEMUIOVector hd_qiov;
     QEMUBH *bh;
     QCowL2Meta l2meta;
@@ -397,15 +397,19 @@ static void qcow_aio_read_cb(void *opaque, int ret)
         /* nothing to do */
     } else {
         if (s->crypt_method) {
-            qcow2_encrypt_sectors(s, acb->sector_num, acb->buf, acb->buf,
-                            acb->cur_nr_sectors, 0,
-                            &s->aes_decrypt_key);
+            qcow2_encrypt_sectors(s, acb->sector_num,  acb->cluster_data,
+                acb->cluster_data, acb->cur_nr_sectors, 0, &s->aes_decrypt_key);
+            qemu_iovec_reset(&acb->hd_qiov);
+            qemu_iovec_copy(&acb->hd_qiov, acb->qiov, acb->bytes_done,
+                acb->cur_nr_sectors * 512);
+            qemu_iovec_from_buffer(&acb->hd_qiov, acb->cluster_data,
+                512 * acb->cur_nr_sectors);
         }
     }
 
     acb->remaining_sectors -= acb->cur_nr_sectors;
     acb->sector_num += acb->cur_nr_sectors;
-    acb->buf += acb->cur_nr_sectors * 512;
+    acb->bytes_done += acb->cur_nr_sectors * 512;
 
     if (acb->remaining_sectors == 0) {
         /* request completed */
@@ -415,6 +419,11 @@ static void qcow_aio_read_cb(void *opaque, int ret)
 
     /* prepare next AIO request */
     acb->cur_nr_sectors = acb->remaining_sectors;
+    if (s->crypt_method) {
+        acb->cur_nr_sectors = MIN(acb->cur_nr_sectors,
+            QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors);
+    }
+
     ret = qcow2_get_cluster_offset(bs, acb->sector_num << 9,
         &acb->cur_nr_sectors, &acb->cluster_offset);
     if (ret < 0) {
@@ -423,15 +432,17 @@ static void qcow_aio_read_cb(void *opaque, int ret)
 
     index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
 
+    qemu_iovec_reset(&acb->hd_qiov);
+    qemu_iovec_copy(&acb->hd_qiov, acb->qiov, acb->bytes_done,
+        acb->cur_nr_sectors * 512);
+
     if (!acb->cluster_offset) {
+
         if (bs->backing_hd) {
             /* read from the base image */
-            n1 = qcow2_backing_read1(bs->backing_hd, acb->sector_num,
-                               acb->buf, acb->cur_nr_sectors);
+            n1 = qcow2_backing_read1(bs->backing_hd, &acb->hd_qiov,
+                acb->sector_num, acb->cur_nr_sectors);
             if (n1 > 0) {
-                acb->hd_iov.iov_base = (void *)acb->buf;
-                acb->hd_iov.iov_len = acb->cur_nr_sectors * 512;
-                qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
                 BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
                 acb->hd_aiocb = bdrv_aio_readv(bs->backing_hd, acb->sector_num,
                                     &acb->hd_qiov, acb->cur_nr_sectors,
@@ -445,7 +456,7 @@ static void qcow_aio_read_cb(void *opaque, int ret)
             }
         } else {
             /* Note: in this case, no need to wait */
-            memset(acb->buf, 0, 512 * acb->cur_nr_sectors);
+            qemu_iovec_memset(&acb->hd_qiov, 0, 512 * acb->cur_nr_sectors);
             ret = qcow_schedule_bh(qcow_aio_read_bh, acb);
             if (ret < 0)
                 goto done;
@@ -454,8 +465,11 @@ static void qcow_aio_read_cb(void *opaque, int ret)
         /* add AIO support for compressed blocks ? */
         if (qcow2_decompress_cluster(bs, acb->cluster_offset) < 0)
             goto done;
-        memcpy(acb->buf, s->cluster_cache + index_in_cluster * 512,
-               512 * acb->cur_nr_sectors);
+
+        qemu_iovec_from_buffer(&acb->hd_qiov,
+            s->cluster_cache + index_in_cluster * 512,
+            512 * acb->cur_nr_sectors);
+
         ret = qcow_schedule_bh(qcow_aio_read_bh, acb);
         if (ret < 0)
             goto done;
@@ -465,9 +479,23 @@ static void qcow_aio_read_cb(void *opaque, int ret)
             goto done;
         }
 
-        acb->hd_iov.iov_base = (void *)acb->buf;
-        acb->hd_iov.iov_len = acb->cur_nr_sectors * 512;
-        qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
+        if (s->crypt_method) {
+            /*
+             * For encrypted images, read everything into a temporary
+             * contiguous buffer on which the AES functions can work.
+             */
+            if (!acb->cluster_data) {
+                acb->cluster_data =
+                    qemu_mallocz(QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
+            }
+
+            assert(acb->cur_nr_sectors <=
+                QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors);
+            qemu_iovec_reset(&acb->hd_qiov);
+            qemu_iovec_add(&acb->hd_qiov, acb->cluster_data,
+                512 * acb->cur_nr_sectors);
+        }
+
         BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
         acb->hd_aiocb = bdrv_aio_readv(bs->file,
                             (acb->cluster_offset >> 9) + index_in_cluster,
@@ -481,11 +509,8 @@ static void qcow_aio_read_cb(void *opaque, int ret)
 
     return;
 done:
-    if (acb->qiov->niov > 1) {
-        qemu_iovec_from_buffer(acb->qiov, acb->orig_buf, acb->qiov->size);
-        qemu_vfree(acb->orig_buf);
-    }
     acb->common.cb(acb->common.opaque, ret);
+    qemu_iovec_destroy(&acb->hd_qiov);
     qemu_aio_release(acb);
 }
 
@@ -501,13 +526,10 @@ static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
     acb->hd_aiocb = NULL;
     acb->sector_num = sector_num;
     acb->qiov = qiov;
-    if (qiov->niov > 1) {
-        acb->buf = acb->orig_buf = qemu_blockalign(bs, qiov->size);
-        if (is_write)
-            qemu_iovec_to_buffer(qiov, acb->buf);
-    } else {
-        acb->buf = (uint8_t *)qiov->iov->iov_base;
-    }
+
+    qemu_iovec_init(&acb->hd_qiov, qiov->niov);
+
+    acb->bytes_done = 0;
     acb->remaining_sectors = nb_sectors;
     acb->cur_nr_sectors = 0;
     acb->cluster_offset = 0;
@@ -557,7 +579,6 @@ static void qcow_aio_write_cb(void *opaque, int ret)
     BlockDriverState *bs = acb->common.bs;
     BDRVQcowState *s = bs->opaque;
     int index_in_cluster;
-    const uint8_t *src_buf;
     int n_end;
 
     acb->hd_aiocb = NULL;
@@ -573,7 +594,7 @@ static void qcow_aio_write_cb(void *opaque, int ret)
 
     acb->remaining_sectors -= acb->cur_nr_sectors;
     acb->sector_num += acb->cur_nr_sectors;
-    acb->buf += acb->cur_nr_sectors * 512;
+    acb->bytes_done += acb->cur_nr_sectors * 512;
 
     if (acb->remaining_sectors == 0) {
         /* request completed */
@@ -604,20 +625,27 @@ static void qcow_aio_write_cb(void *opaque, int ret)
 
     assert((acb->cluster_offset & 511) == 0);
 
+    qemu_iovec_reset(&acb->hd_qiov);
+    qemu_iovec_copy(&acb->hd_qiov, acb->qiov, acb->bytes_done,
+        acb->cur_nr_sectors * 512);
+
     if (s->crypt_method) {
         if (!acb->cluster_data) {
             acb->cluster_data = qemu_mallocz(QCOW_MAX_CRYPT_CLUSTERS *
                                              s->cluster_size);
         }
-        qcow2_encrypt_sectors(s, acb->sector_num, acb->cluster_data, acb->buf,
-                        acb->cur_nr_sectors, 1, &s->aes_encrypt_key);
-        src_buf = acb->cluster_data;
-    } else {
-        src_buf = acb->buf;
+
+        assert(acb->hd_qiov.size <= QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
+        qemu_iovec_to_buffer(&acb->hd_qiov, acb->cluster_data);
+
+        qcow2_encrypt_sectors(s, acb->sector_num, acb->cluster_data,
+            acb->cluster_data, acb->cur_nr_sectors, 1, &s->aes_encrypt_key);
+
+        qemu_iovec_reset(&acb->hd_qiov);
+        qemu_iovec_add(&acb->hd_qiov, acb->cluster_data,
+            acb->cur_nr_sectors * 512);
     }
-    acb->hd_iov.iov_base = (void *)src_buf;
-    acb->hd_iov.iov_len = acb->cur_nr_sectors * 512;
-    qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
+
     BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
     acb->hd_aiocb = bdrv_aio_writev(bs->file,
                                     (acb->cluster_offset >> 9) + index_in_cluster,
@@ -635,9 +663,8 @@ fail:
         QLIST_REMOVE(&acb->l2meta, next_in_flight);
     }
 done:
-    if (acb->qiov->niov > 1)
-        qemu_vfree(acb->orig_buf);
     acb->common.cb(acb->common.opaque, ret);
+    qemu_iovec_destroy(&acb->hd_qiov);
     qemu_aio_release(acb);
 }
 
diff --git a/block/qcow2.h b/block/qcow2.h
index 3ff162efcd..356a34af43 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -166,8 +166,8 @@ static inline int64_t align_offset(int64_t offset, int n)
 // FIXME Need qcow2_ prefix to global functions
 
 /* qcow2.c functions */
-int qcow2_backing_read1(BlockDriverState *bs,
-                  int64_t sector_num, uint8_t *buf, int nb_sectors);
+int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
+                  int64_t sector_num, int nb_sectors);
 
 /* qcow2-refcount.c functions */
 int qcow2_refcount_init(BlockDriverState *bs);
diff --git a/block/raw-posix.c b/block/raw-posix.c
index 813372a6c7..a5cbb7e929 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -97,12 +97,12 @@
 #define FTYPE_CD     1
 #define FTYPE_FD     2
 
-#define ALIGNED_BUFFER_SIZE (32 * 512)
-
 /* if the FD is not accessed during that time (in ms), we try to
    reopen it to see if the disk has been changed */
 #define FD_OPEN_TIMEOUT 1000
 
+#define MAX_BLOCKSIZE	4096
+
 typedef struct BDRVRawState {
     int fd;
     int type;
@@ -118,7 +118,8 @@ typedef struct BDRVRawState {
     int use_aio;
     void *aio_ctx;
 #endif
-    uint8_t* aligned_buf;
+    uint8_t *aligned_buf;
+    unsigned aligned_buf_size;
 } BDRVRawState;
 
 static int fd_open(BlockDriverState *bs);
@@ -161,7 +162,12 @@ static int raw_open_common(BlockDriverState *bs, const char *filename,
     s->aligned_buf = NULL;
 
     if ((bdrv_flags & BDRV_O_NOCACHE)) {
-        s->aligned_buf = qemu_blockalign(bs, ALIGNED_BUFFER_SIZE);
+        /*
+         * Allocate a buffer for read/modify/write cycles.  Chose the size
+         * pessimistically as we don't know the block size yet.
+         */
+        s->aligned_buf_size = 32 * MAX_BLOCKSIZE;
+        s->aligned_buf = qemu_memalign(MAX_BLOCKSIZE, s->aligned_buf_size);
         if (s->aligned_buf == NULL) {
             goto out_close;
         }
@@ -278,8 +284,9 @@ static int raw_pread_aligned(BlockDriverState *bs, int64_t offset,
 }
 
 /*
- * offset and count are in bytes, but must be multiples of 512 for files
- * opened with O_DIRECT. buf must be aligned to 512 bytes then.
+ * offset and count are in bytes, but must be multiples of the sector size
+ * for files opened with O_DIRECT. buf must be aligned to sector size bytes
+ * then.
  *
  * This function may be called without alignment if the caller ensures
  * that O_DIRECT is not in effect.
@@ -316,24 +323,25 @@ static int raw_pread(BlockDriverState *bs, int64_t offset,
                      uint8_t *buf, int count)
 {
     BDRVRawState *s = bs->opaque;
+    unsigned sector_mask = bs->buffer_alignment - 1;
     int size, ret, shift, sum;
 
     sum = 0;
 
     if (s->aligned_buf != NULL)  {
 
-        if (offset & 0x1ff) {
-            /* align offset on a 512 bytes boundary */
+        if (offset & sector_mask) {
+            /* align offset on a sector size bytes boundary */
 
-            shift = offset & 0x1ff;
-            size = (shift + count + 0x1ff) & ~0x1ff;
-            if (size > ALIGNED_BUFFER_SIZE)
-                size = ALIGNED_BUFFER_SIZE;
+            shift = offset & sector_mask;
+            size = (shift + count + sector_mask) & ~sector_mask;
+            if (size > s->aligned_buf_size)
+                size = s->aligned_buf_size;
             ret = raw_pread_aligned(bs, offset - shift, s->aligned_buf, size);
             if (ret < 0)
                 return ret;
 
-            size = 512 - shift;
+            size = bs->buffer_alignment - shift;
             if (size > count)
                 size = count;
             memcpy(buf, s->aligned_buf + shift, size);
@@ -346,15 +354,15 @@ static int raw_pread(BlockDriverState *bs, int64_t offset,
             if (count == 0)
                 return sum;
         }
-        if (count & 0x1ff || (uintptr_t) buf & 0x1ff) {
+        if (count & sector_mask || (uintptr_t) buf & sector_mask) {
 
             /* read on aligned buffer */
 
             while (count) {
 
-                size = (count + 0x1ff) & ~0x1ff;
-                if (size > ALIGNED_BUFFER_SIZE)
-                    size = ALIGNED_BUFFER_SIZE;
+                size = (count + sector_mask) & ~sector_mask;
+                if (size > s->aligned_buf_size)
+                    size = s->aligned_buf_size;
 
                 ret = raw_pread_aligned(bs, offset, s->aligned_buf, size);
                 if (ret < 0) {
@@ -404,25 +412,28 @@ static int raw_pwrite(BlockDriverState *bs, int64_t offset,
                       const uint8_t *buf, int count)
 {
     BDRVRawState *s = bs->opaque;
+    unsigned sector_mask = bs->buffer_alignment - 1;
     int size, ret, shift, sum;
 
     sum = 0;
 
     if (s->aligned_buf != NULL) {
 
-        if (offset & 0x1ff) {
-            /* align offset on a 512 bytes boundary */
-            shift = offset & 0x1ff;
-            ret = raw_pread_aligned(bs, offset - shift, s->aligned_buf, 512);
+        if (offset & sector_mask) {
+            /* align offset on a sector size bytes boundary */
+            shift = offset & sector_mask;
+            ret = raw_pread_aligned(bs, offset - shift, s->aligned_buf,
+                                    bs->buffer_alignment);
             if (ret < 0)
                 return ret;
 
-            size = 512 - shift;
+            size = bs->buffer_alignment - shift;
             if (size > count)
                 size = count;
             memcpy(s->aligned_buf + shift, buf, size);
 
-            ret = raw_pwrite_aligned(bs, offset - shift, s->aligned_buf, 512);
+            ret = raw_pwrite_aligned(bs, offset - shift, s->aligned_buf,
+                                     bs->buffer_alignment);
             if (ret < 0)
                 return ret;
 
@@ -434,12 +445,12 @@ static int raw_pwrite(BlockDriverState *bs, int64_t offset,
             if (count == 0)
                 return sum;
         }
-        if (count & 0x1ff || (uintptr_t) buf & 0x1ff) {
+        if (count & sector_mask || (uintptr_t) buf & sector_mask) {
 
-            while ((size = (count & ~0x1ff)) != 0) {
+            while ((size = (count & ~sector_mask)) != 0) {
 
-                if (size > ALIGNED_BUFFER_SIZE)
-                    size = ALIGNED_BUFFER_SIZE;
+                if (size > s->aligned_buf_size)
+                    size = s->aligned_buf_size;
 
                 memcpy(s->aligned_buf, buf, size);
 
@@ -452,14 +463,16 @@ static int raw_pwrite(BlockDriverState *bs, int64_t offset,
                 count -= ret;
                 sum += ret;
             }
-            /* here, count < 512 because (count & ~0x1ff) == 0 */
+            /* here, count < 512 because (count & ~sector_mask) == 0 */
             if (count) {
-                ret = raw_pread_aligned(bs, offset, s->aligned_buf, 512);
+                ret = raw_pread_aligned(bs, offset, s->aligned_buf,
+                                     bs->buffer_alignment);
                 if (ret < 0)
                     return ret;
                  memcpy(s->aligned_buf, buf, count);
 
-                 ret = raw_pwrite_aligned(bs, offset, s->aligned_buf, 512);
+                 ret = raw_pwrite_aligned(bs, offset, s->aligned_buf,
+                                     bs->buffer_alignment);
                  if (ret < 0)
                      return ret;
                  if (count < ret)
@@ -487,12 +500,12 @@ static int raw_write(BlockDriverState *bs, int64_t sector_num,
 /*
  * Check if all memory in this vector is sector aligned.
  */
-static int qiov_is_aligned(QEMUIOVector *qiov)
+static int qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
 {
     int i;
 
     for (i = 0; i < qiov->niov; i++) {
-        if ((uintptr_t) qiov->iov[i].iov_base % BDRV_SECTOR_SIZE) {
+        if ((uintptr_t) qiov->iov[i].iov_base % bs->buffer_alignment) {
             return 0;
         }
     }
@@ -515,7 +528,7 @@ static BlockDriverAIOCB *raw_aio_submit(BlockDriverState *bs,
      * driver that it needs to copy the buffer.
      */
     if (s->aligned_buf) {
-        if (!qiov_is_aligned(qiov)) {
+        if (!qiov_is_aligned(bs, qiov)) {
             type |= QEMU_AIO_MISALIGNED;
 #ifdef CONFIG_LINUX_AIO
         } else if (s->use_aio) {
diff --git a/block/vvfat.c b/block/vvfat.c
index 365332aa21..53e57bf228 100644
--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -2665,6 +2665,11 @@ static int vvfat_write(BlockDriverState *bs, int64_t sector_num,
 
 DLOG(checkpoint());
 
+    /* Check if we're operating in read-only mode */
+    if (s->qcow == NULL) {
+        return -EACCES;
+    }
+
     vvfat_close_current_file(s);
 
     /*
@@ -2763,12 +2768,12 @@ static int vvfat_is_allocated(BlockDriverState *bs,
 
 static int write_target_commit(BlockDriverState *bs, int64_t sector_num,
 	const uint8_t* buffer, int nb_sectors) {
-    BDRVVVFATState* s = bs->opaque;
+    BDRVVVFATState* s = *((BDRVVVFATState**) bs->opaque);
     return try_commit(s);
 }
 
 static void write_target_close(BlockDriverState *bs) {
-    BDRVVVFATState* s = bs->opaque;
+    BDRVVVFATState* s = *((BDRVVVFATState**) bs->opaque);
     bdrv_delete(s->qcow);
     free(s->qcow_filename);
 }
@@ -2783,6 +2788,7 @@ static int enable_write_target(BDRVVVFATState *s)
 {
     BlockDriver *bdrv_qcow;
     QEMUOptionParameter *options;
+    int ret;
     int size = sector2cluster(s, s->sector_count);
     s->used_clusters = calloc(size, 1);
 
@@ -2798,11 +2804,16 @@ static int enable_write_target(BDRVVVFATState *s)
 
     if (bdrv_create(bdrv_qcow, s->qcow_filename, options) < 0)
 	return -1;
+
     s->qcow = bdrv_new("");
-    if (s->qcow == NULL ||
-        bdrv_open(s->qcow, s->qcow_filename, BDRV_O_RDWR, bdrv_qcow) < 0)
-    {
-	return -1;
+    if (s->qcow == NULL) {
+        return -1;
+    }
+
+    ret = bdrv_open(s->qcow, s->qcow_filename,
+            BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, bdrv_qcow);
+    if (ret < 0) {
+	return ret;
     }
 
 #ifndef _WIN32
@@ -2811,7 +2822,8 @@ static int enable_write_target(BDRVVVFATState *s)
 
     s->bs->backing_hd = calloc(sizeof(BlockDriverState), 1);
     s->bs->backing_hd->drv = &vvfat_write_target;
-    s->bs->backing_hd->opaque = s;
+    s->bs->backing_hd->opaque = qemu_malloc(sizeof(void*));
+    *(void**)s->bs->backing_hd->opaque = s;
 
     return 0;
 }
diff --git a/cutils.c b/cutils.c
index 036ae3ce31..588373774a 100644
--- a/cutils.c
+++ b/cutils.c
@@ -168,30 +168,50 @@ void qemu_iovec_add(QEMUIOVector *qiov, void *base, size_t len)
 }
 
 /*
- * Copies iovecs from src to the end dst until src is completely copied or the
- * total size of the copied iovec reaches size. The size of the last copied
- * iovec is changed in order to fit the specified total size if it isn't a
- * perfect fit already.
+ * Copies iovecs from src to the end of dst. It starts copying after skipping
+ * the given number of bytes in src and copies until src is completely copied
+ * or the total size of the copied iovec reaches size.The size of the last
+ * copied iovec is changed in order to fit the specified total size if it isn't
+ * a perfect fit already.
  */
-void qemu_iovec_concat(QEMUIOVector *dst, QEMUIOVector *src, size_t size)
+void qemu_iovec_copy(QEMUIOVector *dst, QEMUIOVector *src, uint64_t skip,
+    size_t size)
 {
     int i;
     size_t done;
+    void *iov_base;
+    uint64_t iov_len;
 
     assert(dst->nalloc != -1);
 
     done = 0;
     for (i = 0; (i < src->niov) && (done != size); i++) {
-        if (done + src->iov[i].iov_len > size) {
-            qemu_iovec_add(dst, src->iov[i].iov_base, size - done);
+        if (skip >= src->iov[i].iov_len) {
+            /* Skip the whole iov */
+            skip -= src->iov[i].iov_len;
+            continue;
+        } else {
+            /* Skip only part (or nothing) of the iov */
+            iov_base = (uint8_t*) src->iov[i].iov_base + skip;
+            iov_len = src->iov[i].iov_len - skip;
+            skip = 0;
+        }
+
+        if (done + iov_len > size) {
+            qemu_iovec_add(dst, iov_base, size - done);
             break;
         } else {
-            qemu_iovec_add(dst, src->iov[i].iov_base, src->iov[i].iov_len);
+            qemu_iovec_add(dst, iov_base, iov_len);
         }
-        done += src->iov[i].iov_len;
+        done += iov_len;
     }
 }
 
+void qemu_iovec_concat(QEMUIOVector *dst, QEMUIOVector *src, size_t size)
+{
+    qemu_iovec_copy(dst, src, 0, size);
+}
+
 void qemu_iovec_destroy(QEMUIOVector *qiov)
 {
     assert(qiov->nalloc != -1);
@@ -234,6 +254,18 @@ void qemu_iovec_from_buffer(QEMUIOVector *qiov, const void *buf, size_t count)
     }
 }
 
+void qemu_iovec_memset(QEMUIOVector *qiov, int c, size_t count)
+{
+    size_t n;
+    int i;
+
+    for (i = 0; i < qiov->niov && count; ++i) {
+        n = MIN(count, qiov->iov[i].iov_len);
+        memset(qiov->iov[i].iov_base, c, n);
+        count -= n;
+    }
+}
+
 #ifndef _WIN32
 /* Sets a specific flag */
 int fcntl_setfl(int fd, int flag)
diff --git a/docs/blkverify.txt b/docs/blkverify.txt
new file mode 100644
index 0000000000..d556dc4e6d
--- /dev/null
+++ b/docs/blkverify.txt
@@ -0,0 +1,69 @@
+= Block driver correctness testing with blkverify =
+
+== Introduction ==
+
+This document describes how to use the blkverify protocol to test that a block
+driver is operating correctly.
+
+It is difficult to test and debug block drivers against real guests.  Often
+processes inside the guest will crash because corrupt sectors were read as part
+of the executable.  Other times obscure errors are raised by a program inside
+the guest.  These issues are extremely hard to trace back to bugs in the block
+driver.
+
+Blkverify solves this problem by catching data corruption inside QEMU the first
+time bad data is read and reporting the disk sector that is corrupted.
+
+== How it works ==
+
+The blkverify protocol has two child block devices, the "test" device and the
+"raw" device.  Read/write operations are mirrored to both devices so their
+state should always be in sync.
+
+The "raw" device is a raw image, a flat file, that has identical starting
+contents to the "test" image.  The idea is that the "raw" device will handle
+read/write operations correctly and not corrupt data.  It can be used as a
+reference for comparison against the "test" device.
+
+After a mirrored read operation completes, blkverify will compare the data and
+raise an error if it is not identical.  This makes it possible to catch the
+first instance where corrupt data is read.
+
+== Example ==
+
+Imagine raw.img has 0xcd repeated throughout its first sector:
+
+    $ ./qemu-io -c 'read -v 0 512' raw.img
+    00000000:  cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd  ................
+    00000010:  cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd  ................
+    [...]
+    000001e0:  cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd  ................
+    000001f0:  cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd  ................
+    read 512/512 bytes at offset 0
+    512.000000 bytes, 1 ops; 0.0000 sec (97.656 MiB/sec and 200000.0000 ops/sec)
+
+And test.img is corrupt, its first sector is zeroed when it shouldn't be:
+
+    $ ./qemu-io -c 'read -v 0 512' test.img
+    00000000:  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+    00000010:  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+    [...]
+    000001e0:  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+    000001f0:  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+    read 512/512 bytes at offset 0
+    512.000000 bytes, 1 ops; 0.0000 sec (81.380 MiB/sec and 166666.6667 ops/sec)
+
+This error is caught by blkverify:
+
+    $ ./qemu-io -c 'read 0 512' blkverify:a.img:b.img
+    blkverify: read sector_num=0 nb_sectors=4 contents mismatch in sector 0
+
+A more realistic scenario is verifying the installation of a guest OS:
+
+    $ ./qemu-img create raw.img 16G
+    $ ./qemu-img create -f qcow2 test.qcow2 16G
+    $ x86_64-softmmu/qemu-system-x86_64 -cdrom debian.iso \
+                                        -drive file=blkverify:raw.img:test.qcow2
+
+If the installation is aborted when blkverify detects corruption, use qemu-io
+to explore the contents of the disk image at the sector in question.
diff --git a/hw/ide/core.c b/hw/ide/core.c
index 1e466d1ff5..06b6e14e56 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -2645,6 +2645,7 @@ int ide_init_drive(IDEState *s, BlockDriverState *bs,
     if (bdrv_get_type_hint(bs) == BDRV_TYPE_CDROM) {
         s->drive_kind = IDE_CD;
         bdrv_set_change_cb(bs, cdrom_change_cb, s);
+        bs->buffer_alignment = 2048;
     } else {
         if (!bdrv_is_inserted(s->bs)) {
             error_report("Device needs media, but drive is empty");
@@ -2679,7 +2680,8 @@ static void ide_init1(IDEBus *bus, int unit)
     s->bus = bus;
     s->unit = unit;
     s->drive_serial = drive_serial++;
-    s->io_buffer = qemu_blockalign(s->bs, IDE_DMA_BUF_SECTORS*512 + 4);
+    /* we need at least 2k alignment for accessing CDROMs using O_DIRECT */
+    s->io_buffer = qemu_memalign(2048, IDE_DMA_BUF_SECTORS*512 + 4);
     s->io_buffer_total_len = IDE_DMA_BUF_SECTORS*512 + 4;
     s->smart_selftest_data = qemu_blockalign(s->bs, 512);
     s->sector_write_timer = qemu_new_timer(vm_clock,
diff --git a/hw/scsi-bus.c b/hw/scsi-bus.c
index 7aa0bcd1ee..5a3fd4b7ac 100644
--- a/hw/scsi-bus.c
+++ b/hw/scsi-bus.c
@@ -208,6 +208,8 @@ static int scsi_req_length(SCSIRequest *req, uint8_t *cmd)
     case SEEK_6:
     case WRITE_FILEMARKS:
     case SPACE:
+    case RESERVE:
+    case RELEASE:
     case ERASE:
     case ALLOW_MEDIUM_REMOVAL:
     case VERIFY:
@@ -319,7 +321,6 @@ static void scsi_req_xfer_mode(SCSIRequest *req)
     case WRITE_BUFFER:
     case FORMAT_UNIT:
     case REASSIGN_BLOCKS:
-    case RESERVE:
     case SEARCH_EQUAL:
     case SEARCH_HIGH:
     case SEARCH_LOW:
diff --git a/hw/scsi-disk.c b/hw/scsi-disk.c
index 1446ca634c..9628b39a21 100644
--- a/hw/scsi-disk.c
+++ b/hw/scsi-disk.c
@@ -70,14 +70,15 @@ struct SCSIDiskState
     char *serial;
 };
 
-static SCSIDiskReq *scsi_new_request(SCSIDevice *d, uint32_t tag, uint32_t lun)
+static SCSIDiskReq *scsi_new_request(SCSIDiskState *s, uint32_t tag,
+        uint32_t lun)
 {
     SCSIRequest *req;
     SCSIDiskReq *r;
 
-    req = scsi_req_alloc(sizeof(SCSIDiskReq), d, tag, lun);
+    req = scsi_req_alloc(sizeof(SCSIDiskReq), &s->qdev, tag, lun);
     r = DO_UPCAST(SCSIDiskReq, req, req);
-    r->iov.iov_base = qemu_memalign(512, SCSI_DMA_BUF_SIZE);
+    r->iov.iov_base = qemu_blockalign(s->bs, SCSI_DMA_BUF_SIZE);
     return r;
 }
 
@@ -939,7 +940,7 @@ static int32_t scsi_send_command(SCSIDevice *d, uint32_t tag,
     }
     /* ??? Tags are not unique for different luns.  We only implement a
        single lun, so this should not matter.  */
-    r = scsi_new_request(d, tag, lun);
+    r = scsi_new_request(s, tag, lun);
     outbuf = (uint8_t *)r->iov.iov_base;
     is_write = 0;
     DPRINTF("Command: lun=%d tag=0x%x data=0x%02x", lun, tag, buf[0]);
@@ -1177,6 +1178,7 @@ static int scsi_disk_initfn(SCSIDevice *dev)
         s->qdev.blocksize = s->qdev.conf.logical_block_size;
     }
     s->cluster_size = s->qdev.blocksize / 512;
+    s->bs->buffer_alignment = s->qdev.blocksize;
 
     s->qdev.type = TYPE_DISK;
     qemu_add_vm_change_state_handler(scsi_dma_restart_cb, s);
diff --git a/hw/scsi-generic.c b/hw/scsi-generic.c
index 953802777d..7212091695 100644
--- a/hw/scsi-generic.c
+++ b/hw/scsi-generic.c
@@ -455,15 +455,31 @@ static int get_stream_blocksize(BlockDriverState *bdrv)
     return (buf[9] << 16) | (buf[10] << 8) | buf[11];
 }
 
-static void scsi_destroy(SCSIDevice *d)
+static void scsi_generic_purge_requests(SCSIGenericState *s)
 {
-    SCSIGenericState *s = DO_UPCAST(SCSIGenericState, qdev, d);
     SCSIGenericReq *r;
 
     while (!QTAILQ_EMPTY(&s->qdev.requests)) {
         r = DO_UPCAST(SCSIGenericReq, req, QTAILQ_FIRST(&s->qdev.requests));
+        if (r->req.aiocb) {
+            bdrv_aio_cancel(r->req.aiocb);
+        }
         scsi_remove_request(r);
     }
+}
+
+static void scsi_generic_reset(DeviceState *dev)
+{
+    SCSIGenericState *s = DO_UPCAST(SCSIGenericState, qdev.qdev, dev);
+
+    scsi_generic_purge_requests(s);
+}
+
+static void scsi_destroy(SCSIDevice *d)
+{
+    SCSIGenericState *s = DO_UPCAST(SCSIGenericState, qdev, d);
+
+    scsi_generic_purge_requests(s);
     blockdev_mark_auto_del(s->qdev.conf.bs);
 }
 
@@ -537,6 +553,7 @@ static SCSIDeviceInfo scsi_generic_info = {
     .qdev.name    = "scsi-generic",
     .qdev.desc    = "pass through generic scsi device (/dev/sg*)",
     .qdev.size    = sizeof(SCSIGenericState),
+    .qdev.reset   = scsi_generic_reset,
     .init         = scsi_generic_initfn,
     .destroy      = scsi_destroy,
     .send_command = scsi_send_command,
diff --git a/hw/sd.c b/hw/sd.c
index c928120abd..4bcf1c01a6 100644
--- a/hw/sd.c
+++ b/hw/sd.c
@@ -440,7 +440,7 @@ SDState *sd_init(BlockDriverState *bs, int is_spi)
     SDState *sd;
 
     sd = (SDState *) qemu_mallocz(sizeof(SDState));
-    sd->buf = qemu_memalign(512, 512);
+    sd->buf = qemu_blockalign(bs, 512);
     sd->spi = is_spi;
     sd->enable = 1;
     sd_reset(sd, bs);
diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c
index bd6bbe6b14..a1df26dbcf 100644
--- a/hw/virtio-blk.c
+++ b/hw/virtio-blk.c
@@ -540,6 +540,7 @@ VirtIODevice *virtio_blk_init(DeviceState *dev, BlockConf *conf)
     register_savevm(dev, "virtio-blk", virtio_blk_id++, 2,
                     virtio_blk_save, virtio_blk_load, s);
     bdrv_set_removable(s->bs, 0);
+    s->bs->buffer_alignment = conf->logical_block_size;
 
     return &s->vdev;
 }
diff --git a/nbd.c b/nbd.c
index 011b50f166..4bf2eb7cb0 100644
--- a/nbd.c
+++ b/nbd.c
@@ -49,6 +49,7 @@
 
 /* This is all part of the "official" NBD API */
 
+#define NBD_REPLY_SIZE		(4 + 4 + 8)
 #define NBD_REQUEST_MAGIC       0x25609513
 #define NBD_REPLY_MAGIC         0x67446698
 
@@ -588,7 +589,7 @@ static int nbd_receive_request(int csock, struct nbd_request *request)
 
 int nbd_receive_reply(int csock, struct nbd_reply *reply)
 {
-	uint8_t buf[4 + 4 + 8];
+	uint8_t buf[NBD_REPLY_SIZE];
 	uint32_t magic;
 
 	memset(buf, 0xAA, sizeof(buf));
@@ -655,9 +656,9 @@ int nbd_trip(BlockDriverState *bs, int csock, off_t size, uint64_t dev_offset,
 	if (nbd_receive_request(csock, &request) == -1)
 		return -1;
 
-	if (request.len > data_size) {
+	if (request.len + NBD_REPLY_SIZE > data_size) {
 		LOG("len (%u) is larger than max len (%u)",
-		    request.len, data_size);
+		    request.len + NBD_REPLY_SIZE, data_size);
 		errno = EINVAL;
 		return -1;
 	}
@@ -687,7 +688,8 @@ int nbd_trip(BlockDriverState *bs, int csock, off_t size, uint64_t dev_offset,
 	case NBD_CMD_READ:
 		TRACE("Request type is READ");
 
-		if (bdrv_read(bs, (request.from + dev_offset) / 512, data,
+		if (bdrv_read(bs, (request.from + dev_offset) / 512,
+			      data + NBD_REPLY_SIZE,
 			      request.len / 512) == -1) {
 			LOG("reading from file failed");
 			errno = EINVAL;
@@ -697,12 +699,21 @@ int nbd_trip(BlockDriverState *bs, int csock, off_t size, uint64_t dev_offset,
 
 		TRACE("Read %u byte(s)", request.len);
 
-		if (nbd_send_reply(csock, &reply) == -1)
-			return -1;
+		/* Reply
+		   [ 0 ..  3]    magic   (NBD_REPLY_MAGIC)
+		   [ 4 ..  7]    error   (0 == no error)
+		   [ 7 .. 15]    handle
+		 */
+
+		cpu_to_be32w((uint32_t*)data, NBD_REPLY_MAGIC);
+		cpu_to_be32w((uint32_t*)(data + 4), reply.error);
+		cpu_to_be64w((uint64_t*)(data + 8), reply.handle);
 
 		TRACE("Sending data to client");
 
-		if (write_sync(csock, data, request.len) != request.len) {
+		if (write_sync(csock, data,
+			       request.len + NBD_REPLY_SIZE) !=
+			       request.len + NBD_REPLY_SIZE) {
 			LOG("writing to socket failed");
 			errno = EINVAL;
 			return -1;
diff --git a/posix-aio-compat.c b/posix-aio-compat.c
index 10f1f037fb..842f1a24aa 100644
--- a/posix-aio-compat.c
+++ b/posix-aio-compat.c
@@ -270,7 +270,7 @@ static ssize_t handle_aiocb_rw(struct qemu_paiocb *aiocb)
      * Ok, we have to do it the hard way, copy all segments into
      * a single aligned buffer.
      */
-    buf = qemu_memalign(512, aiocb->aio_nbytes);
+    buf = qemu_blockalign(aiocb->common.bs, aiocb->aio_nbytes);
     if (aiocb->aio_type & QEMU_AIO_WRITE) {
         char *p = buf;
         int i;
diff --git a/qemu-common.h b/qemu-common.h
index dfd3dc08a2..5544ffd8cf 100644
--- a/qemu-common.h
+++ b/qemu-common.h
@@ -278,11 +278,14 @@ typedef struct QEMUIOVector {
 void qemu_iovec_init(QEMUIOVector *qiov, int alloc_hint);
 void qemu_iovec_init_external(QEMUIOVector *qiov, struct iovec *iov, int niov);
 void qemu_iovec_add(QEMUIOVector *qiov, void *base, size_t len);
+void qemu_iovec_copy(QEMUIOVector *dst, QEMUIOVector *src, uint64_t skip,
+    size_t size);
 void qemu_iovec_concat(QEMUIOVector *dst, QEMUIOVector *src, size_t size);
 void qemu_iovec_destroy(QEMUIOVector *qiov);
 void qemu_iovec_reset(QEMUIOVector *qiov);
 void qemu_iovec_to_buffer(QEMUIOVector *qiov, void *buf);
 void qemu_iovec_from_buffer(QEMUIOVector *qiov, const void *buf, size_t count);
+void qemu_iovec_memset(QEMUIOVector *qiov, int c, size_t count);
 
 struct Monitor;
 typedef struct Monitor Monitor;
diff --git a/qemu-io.c b/qemu-io.c
index bd3bd16fdf..b4e5cc8fe6 100644
--- a/qemu-io.c
+++ b/qemu-io.c
@@ -61,7 +61,7 @@ static void *qemu_io_alloc(size_t len, int pattern)
 
 	if (misalign)
 		len += MISALIGN_OFFSET;
-	buf = qemu_memalign(512, len);
+	buf = qemu_blockalign(bs, len);
 	memset(buf, pattern, len);
 	if (misalign)
 		buf += MISALIGN_OFFSET;
diff --git a/qemu-nbd.c b/qemu-nbd.c
index 91b569f8e6..99f1d22884 100644
--- a/qemu-nbd.c
+++ b/qemu-nbd.c
@@ -44,7 +44,7 @@ static void usage(const char *name)
 "Usage: %s [OPTIONS] FILE\n"
 "QEMU Disk Network Block Device Server\n"
 "\n"
-"  -p, --port=PORT      port to listen on (default `1024')\n"
+"  -p, --port=PORT      port to listen on (default `%d')\n"
 "  -o, --offset=OFFSET  offset into the image\n"
 "  -b, --bind=IFACE     interface to bind to (default `0.0.0.0')\n"
 "  -k, --socket=PATH    path to the unix socket\n"
@@ -62,7 +62,7 @@ static void usage(const char *name)
 "  -V, --version        output version information and exit\n"
 "\n"
 "Report bugs to <anthony@codemonkey.ws>\n"
-    , name, "DEVICE");
+    , name, NBD_DEFAULT_PORT, "DEVICE");
 }
 
 static void version(const char *name)
@@ -188,7 +188,7 @@ int main(int argc, char **argv)
     bool readonly = false;
     bool disconnect = false;
     const char *bindto = "0.0.0.0";
-    int port = 1024;
+    int port = NBD_DEFAULT_PORT;
     struct sockaddr_in addr;
     socklen_t addr_len = sizeof(addr);
     off_t fd_size;
@@ -446,7 +446,7 @@ int main(int argc, char **argv)
     max_fd = sharing_fds[0];
     nb_fds++;
 
-    data = qemu_memalign(512, NBD_BUFFER_SIZE);
+    data = qemu_blockalign(bs, NBD_BUFFER_SIZE);
     if (data == NULL)
         errx(EXIT_FAILURE, "Cannot allocate data buffer");