-rw-r--r--  .gitignore                1
-rw-r--r--  Makefile                  3
-rw-r--r--  Makefile.objs            11
-rw-r--r--  async.c                  98
-rw-r--r--  balloon.c                61
-rw-r--r--  balloon.h                12
-rw-r--r--  block.c                 310
-rw-r--r--  block.h                   7
-rw-r--r--  block/qcow.c            180
-rw-r--r--  block/qcow2-cluster.c    26
-rw-r--r--  block/qcow2.c           240
-rw-r--r--  block/qcow2.h             5
-rw-r--r--  block/qed-table.c        14
-rw-r--r--  block/qed.c               4
-rw-r--r--  block/raw-posix.c        39
-rw-r--r--  block/raw-win32.c        35
-rw-r--r--  block/raw.c               7
-rw-r--r--  block/vpc.c               8
-rw-r--r--  block_int.h              10
-rw-r--r--  blockdev.c               17
-rwxr-xr-x  configure                73
-rw-r--r--  coroutine-gthread.c     131
-rw-r--r--  coroutine-ucontext.c    230
-rw-r--r--  coroutine-win32.c        92
-rw-r--r--  hw/scsi-bus.c            74
-rw-r--r--  hw/scsi-defs.h           62
-rw-r--r--  hw/scsi-disk.c           79
-rw-r--r--  hw/scsi-generic.c         2
-rw-r--r--  hw/virtio-balloon.c      76
-rw-r--r--  hw/virtio-pci.c          14
-rw-r--r--  hw/virtio.h               1
-rw-r--r--  linux-aio.c              43
-rw-r--r--  posix-aio-compat.c       30
-rw-r--r--  qemu-common.h             4
-rw-r--r--  qemu-coroutine-int.h     49
-rw-r--r--  qemu-coroutine-lock.c   117
-rw-r--r--  qemu-coroutine.c         75
-rw-r--r--  qemu-coroutine.h        159
-rw-r--r--  test-coroutine.c        192
-rw-r--r--  trace-events             16
40 files changed, 1867 insertions, 740 deletions
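
This series replaces the AsyncContext machinery with a coroutine framework: qemu-coroutine.[ch] and qemu-coroutine-lock.c provide the API, and Makefile.objs picks a backend at build time (coroutine-ucontext.c where CONFIG_UCONTEXT_COROUTINE is set, coroutine-gthread.c as the portable POSIX fallback, coroutine-win32.c on Windows). The backend's job is to give each coroutine its own stack and to switch between it and its caller. A minimal standalone sketch of the ucontext mechanism follows; the names are illustrative and this is not QEMU's coroutine-ucontext.c, only the switching primitive it builds on.

/* build: cc -o co-sketch co-sketch.c */
#include <stdio.h>
#include <ucontext.h>

static ucontext_t caller_ctx, co_ctx;
static char co_stack[64 * 1024];

static void co_entry(void)
{
    printf("in coroutine\n");
    swapcontext(&co_ctx, &caller_ctx);      /* yield back to the caller */
    printf("coroutine resumed\n");
}                                           /* returning goes to uc_link */

int main(void)
{
    getcontext(&co_ctx);                    /* initialise, then retarget */
    co_ctx.uc_stack.ss_sp = co_stack;
    co_ctx.uc_stack.ss_size = sizeof(co_stack);
    co_ctx.uc_link = &caller_ctx;           /* where to go when co_entry ends */
    makecontext(&co_ctx, co_entry, 0);

    swapcontext(&caller_ctx, &co_ctx);      /* first entry into the coroutine */
    printf("back in caller\n");
    swapcontext(&caller_ctx, &co_ctx);      /* re-enter after the yield */
    printf("coroutine finished\n");
    return 0;
}
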
diff --git a/.gitignore b/.gitignore
index 54835bcb97..59c343c414 100644
--- a/.gitignore
+++ b/.gitignore
@@ -36,6 +36,7 @@ qemu-io
 qemu-ga
 qemu-monitor.texi
 QMP/qmp-commands.txt
+test-coroutine
 .gdbinit
 *.a
 *.aux
diff --git a/Makefile b/Makefile
index 48552512d6..2becedcf88 100644
--- a/Makefile
+++ b/Makefile
@@ -151,7 +151,7 @@ qemu-io$(EXESUF): qemu-io.o cmd.o qemu-tool.o qemu-error.o $(oslib-obj-y) $(trac
 qemu-img-cmds.h: $(SRC_PATH)/qemu-img-cmds.hx
 	$(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -h < $< > $@,"  GEN   $@")
 
-check-qint.o check-qstring.o check-qdict.o check-qlist.o check-qfloat.o check-qjson.o: $(GENERATED_HEADERS)
+check-qint.o check-qstring.o check-qdict.o check-qlist.o check-qfloat.o check-qjson.o test-coroutine.o: $(GENERATED_HEADERS)
 
 CHECK_PROG_DEPS = qemu-malloc.o $(oslib-obj-y) $(trace-obj-y) qemu-tool.o
 
@@ -161,6 +161,7 @@ check-qdict: check-qdict.o qdict.o qfloat.o qint.o qstring.o qbool.o qlist.o $(C
 check-qlist: check-qlist.o qlist.o qint.o $(CHECK_PROG_DEPS)
 check-qfloat: check-qfloat.o qfloat.o $(CHECK_PROG_DEPS)
 check-qjson: check-qjson.o qfloat.o qint.o qdict.o qstring.o qlist.o qbool.o qjson.o json-streamer.o json-lexer.o json-parser.o error.o qerror.o qemu-error.o $(CHECK_PROG_DEPS)
+test-coroutine: test-coroutine.o qemu-timer-common.o async.o $(coroutine-obj-y) $(CHECK_PROG_DEPS)
 
 $(qapi-obj-y): $(GENERATED_HEADERS)
 qapi-dir := qapi-generated
diff --git a/Makefile.objs b/Makefile.objs
index eb5e1dcece..b6bcfcc169 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -11,10 +11,21 @@ oslib-obj-$(CONFIG_WIN32) += oslib-win32.o qemu-thread-win32.o
 oslib-obj-$(CONFIG_POSIX) += oslib-posix.o qemu-thread-posix.o
 
 #######################################################################
+# coroutines
+coroutine-obj-y = qemu-coroutine.o qemu-coroutine-lock.o
+ifeq ($(CONFIG_UCONTEXT_COROUTINE),y)
+coroutine-obj-$(CONFIG_POSIX) += coroutine-ucontext.o
+else
+coroutine-obj-$(CONFIG_POSIX) += coroutine-gthread.o
+endif
+coroutine-obj-$(CONFIG_WIN32) += coroutine-win32.o
+
+#######################################################################
 # block-obj-y is code used by both qemu system emulation and qemu-img
 
 block-obj-y = cutils.o cache-utils.o qemu-malloc.o qemu-option.o module.o async.o
 block-obj-y += nbd.o block.o aio.o aes.o qemu-config.o qemu-progress.o qemu-sockets.o
+block-obj-y += $(coroutine-obj-y)
 block-obj-$(CONFIG_POSIX) += posix-aio-compat.o
 block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
 
diff --git a/async.c b/async.c
index fd313dffb7..3fe70b9deb 100644
--- a/async.c
+++ b/async.c
@@ -25,92 +25,8 @@
 #include "qemu-common.h"
 #include "qemu-aio.h"
 
-/*
- * An AsyncContext protects the callbacks of AIO requests and Bottom Halves
- * against interfering with each other. A typical example is qcow2 that accepts
- * asynchronous requests, but relies for manipulation of its metadata on
- * synchronous bdrv_read/write that doesn't trigger any callbacks.
- *
- * However, these functions are often emulated using AIO which means that AIO
- * callbacks must be run - but at the same time we must not run callbacks of
- * other requests as they might start to modify metadata and corrupt the
- * internal state of the caller of bdrv_read/write.
- *
- * To achieve the desired semantics we switch into a new AsyncContext.
- * Callbacks must only be run if they belong to the current AsyncContext.
- * Otherwise they need to be queued until their own context is active again.
- * This is how you can make qemu_aio_wait() wait only for your own callbacks.
- *
- * The AsyncContexts form a stack. When you leave a AsyncContexts, you always
- * return to the old ("parent") context.
- */
-struct AsyncContext {
-    /* Consecutive number of the AsyncContext (position in the stack) */
-    int id;
-
-    /* Anchor of the list of Bottom Halves belonging to the context */
-    struct QEMUBH *first_bh;
-
-    /* Link to parent context */
-    struct AsyncContext *parent;
-};
-
-/* The currently active AsyncContext */
-static struct AsyncContext *async_context = &(struct AsyncContext) { 0 };
-
-/*
- * Enter a new AsyncContext. Already scheduled Bottom Halves and AIO callbacks
- * won't be called until this context is left again.
- */
-void async_context_push(void)
-{
-    struct AsyncContext *new = qemu_mallocz(sizeof(*new));
-    new->parent = async_context;
-    new->id = async_context->id + 1;
-    async_context = new;
-}
-
-/* Run queued AIO completions and destroy Bottom Half */
-static void bh_run_aio_completions(void *opaque)
-{
-    QEMUBH **bh = opaque;
-    qemu_bh_delete(*bh);
-    qemu_free(bh);
-    qemu_aio_process_queue();
-}
-/*
- * Leave the currently active AsyncContext. All Bottom Halves belonging to the
- * old context are executed before changing the context.
- */
-void async_context_pop(void)
-{
-    struct AsyncContext *old = async_context;
-    QEMUBH **bh;
-
-    /* Flush the bottom halves, we don't want to lose them */
-    while (qemu_bh_poll());
-
-    /* Switch back to the parent context */
-    async_context = async_context->parent;
-    qemu_free(old);
-
-    if (async_context == NULL) {
-        abort();
-    }
-
-    /* Schedule BH to run any queued AIO completions as soon as possible */
-    bh = qemu_malloc(sizeof(*bh));
-    *bh = qemu_bh_new(bh_run_aio_completions, bh);
-    qemu_bh_schedule(*bh);
-}
-
-/*
- * Returns the ID of the currently active AsyncContext
- */
-int get_async_context_id(void)
-{
-    return async_context->id;
-}
+/* Anchor of the list of Bottom Halves belonging to the context */
+static struct QEMUBH *first_bh;
 
 /***********************************************************/
 /* bottom halves (can be seen as timers which expire ASAP) */
@@ -130,8 +46,8 @@ QEMUBH *qemu_bh_new(QEMUBHFunc *cb, void *opaque)
     bh = qemu_mallocz(sizeof(QEMUBH));
     bh->cb = cb;
     bh->opaque = opaque;
-    bh->next = async_context->first_bh;
-    async_context->first_bh = bh;
+    bh->next = first_bh;
+    first_bh = bh;
     return bh;
 }
 
@@ -141,7 +57,7 @@ int qemu_bh_poll(void)
     int ret;
 
     ret = 0;
-    for (bh = async_context->first_bh; bh; bh = next) {
+    for (bh = first_bh; bh; bh = next) {
         next = bh->next;
         if (!bh->deleted && bh->scheduled) {
             bh->scheduled = 0;
@@ -153,7 +69,7 @@ int qemu_bh_poll(void)
     }
 
     /* remove deleted bhs */
-    bhp = &async_context->first_bh;
+    bhp = &first_bh;
     while (*bhp) {
         bh = *bhp;
         if (bh->deleted) {
@@ -199,7 +115,7 @@ void qemu_bh_update_timeout(int *timeout)
 {
     QEMUBH *bh;
 
-    for (bh = async_context->first_bh; bh; bh = bh->next) {
+    for (bh = first_bh; bh; bh = bh->next) {
         if (!bh->deleted && bh->scheduled) {
             if (bh->idle) {
                 /* idle bottom halves will be polled at least
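
With AsyncContext gone, bottom halves hang off a single global list instead of a per-context one, and qemu_bh_poll() simply drains whatever is scheduled on the next main-loop iteration. The standalone sketch below shows that reduced shape; it is simplified and illustrative, not QEMU's QEMUBH, which also tracks deleted and idle state and is wired into the real event loop.

#include <stdio.h>
#include <stdlib.h>

typedef struct BH {
    void (*cb)(void *opaque);
    void *opaque;
    int scheduled;
    struct BH *next;
} BH;

static BH *first_bh;                /* one global anchor, no context stack */

static BH *bh_new(void (*cb)(void *), void *opaque)
{
    BH *bh = calloc(1, sizeof(*bh));
    bh->cb = cb;
    bh->opaque = opaque;
    bh->next = first_bh;
    first_bh = bh;
    return bh;
}

static int bh_poll(void)            /* returns nonzero if any callback ran */
{
    int ret = 0;
    for (BH *bh = first_bh; bh; bh = bh->next) {
        if (bh->scheduled) {
            bh->scheduled = 0;
            bh->cb(bh->opaque);
            ret = 1;
        }
    }
    return ret;
}

static void hello(void *opaque)
{
    printf("bh ran: %s\n", (char *)opaque);
}

int main(void)
{
    BH *bh = bh_new(hello, "deferred work");
    bh->scheduled = 1;              /* the qemu_bh_schedule() equivalent */
    while (bh_poll()) {
        /* one main-loop iteration; loop ends once nothing is scheduled */
    }
    return 0;                       /* sketch leaks the BH; real code deletes it */
}
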
diff --git a/balloon.c b/balloon.c
index 248c1b50a9..f56fdc1c4b 100644
--- a/balloon.c
+++ b/balloon.c
@@ -1,7 +1,9 @@
 /*
- * QEMU System Emulator
+ * Generic Balloon handlers and management
  *
  * Copyright (c) 2003-2008 Fabrice Bellard
+ * Copyright (C) 2011 Red Hat, Inc.
+ * Copyright (C) 2011 Amit Shah <amit.shah@redhat.com>
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -30,44 +32,53 @@
 #include "balloon.h"
 #include "trace.h"
 
+static QEMUBalloonEvent *balloon_event_fn;
+static QEMUBalloonStatus *balloon_stat_fn;
+static void *balloon_opaque;
 
-static QEMUBalloonEvent *qemu_balloon_event;
-void *qemu_balloon_event_opaque;
-
-void qemu_add_balloon_handler(QEMUBalloonEvent *func, void *opaque)
+int qemu_add_balloon_handler(QEMUBalloonEvent *event_func,
+                             QEMUBalloonStatus *stat_func, void *opaque)
 {
-    qemu_balloon_event = func;
-    qemu_balloon_event_opaque = opaque;
+    if (balloon_event_fn || balloon_stat_fn || balloon_opaque) {
+        /* We're already registered one balloon handler.  How many can
+         * a guest really have?
+         */
+        error_report("Another balloon device already registered");
+        return -1;
+    }
+    balloon_event_fn = event_func;
+    balloon_stat_fn = stat_func;
+    balloon_opaque = opaque;
+    return 0;
 }
 
-int qemu_balloon(ram_addr_t target, MonitorCompletion cb, void *opaque)
+static int qemu_balloon(ram_addr_t target)
 {
-    if (qemu_balloon_event) {
-        trace_balloon_event(qemu_balloon_event_opaque, target);
-        qemu_balloon_event(qemu_balloon_event_opaque, target, cb, opaque);
-        return 1;
-    } else {
+    if (!balloon_event_fn) {
         return 0;
     }
+    trace_balloon_event(balloon_opaque, target);
+    balloon_event_fn(balloon_opaque, target);
+    return 1;
 }
 
-int qemu_balloon_status(MonitorCompletion cb, void *opaque)
+static int qemu_balloon_status(MonitorCompletion cb, void *opaque)
 {
-    if (qemu_balloon_event) {
-        qemu_balloon_event(qemu_balloon_event_opaque, 0, cb, opaque);
-        return 1;
-    } else {
+    if (!balloon_stat_fn) {
         return 0;
     }
+    balloon_stat_fn(balloon_opaque, cb, opaque);
+    return 1;
 }
 
 static void print_balloon_stat(const char *key, QObject *obj, void *opaque)
 {
     Monitor *mon = opaque;
 
-    if (strcmp(key, "actual"))
+    if (strcmp(key, "actual")) {
         monitor_printf(mon, ",%s=%" PRId64, key,
                        qint_get_int(qobject_to_qint(obj)));
+    }
 }
 
 void monitor_print_balloon(Monitor *mon, const QObject *data)
@@ -75,9 +86,9 @@ void monitor_print_balloon(Monitor *mon, const QObject *data)
     QDict *qdict;
 
     qdict = qobject_to_qdict(data);
-    if (!qdict_haskey(qdict, "actual"))
+    if (!qdict_haskey(qdict, "actual")) {
         return;
-
+    }
     monitor_printf(mon, "balloon: actual=%" PRId64,
                    qdict_get_int(qdict, "actual") >> 20);
     qdict_iter(qdict, print_balloon_stat, mon);
@@ -129,6 +140,7 @@ int do_info_balloon(Monitor *mon, MonitorCompletion cb, void *opaque)
 int do_balloon(Monitor *mon, const QDict *params,
 	       MonitorCompletion cb, void *opaque)
 {
+    int64_t target;
     int ret;
 
     if (kvm_enabled() && !kvm_has_sync_mmu()) {
@@ -136,7 +148,12 @@ int do_balloon(Monitor *mon, const QDict *params,
         return -1;
     }
 
-    ret = qemu_balloon(qdict_get_int(params, "value"), cb, opaque);
+    target = qdict_get_int(params, "value");
+    if (target <= 0) {
+        qerror_report(QERR_INVALID_PARAMETER_VALUE, "target", "a size");
+        return -1;
+    }
+    ret = qemu_balloon(target);
     if (ret == 0) {
         qerror_report(QERR_DEVICE_NOT_ACTIVE, "balloon");
         return -1;
diff --git a/balloon.h b/balloon.h
index d478e28475..3df14e645a 100644
--- a/balloon.h
+++ b/balloon.h
@@ -16,14 +16,12 @@
 
 #include "monitor.h"
 
-typedef void (QEMUBalloonEvent)(void *opaque, ram_addr_t target,
-                                MonitorCompletion cb, void *cb_data);
+typedef void (QEMUBalloonEvent)(void *opaque, ram_addr_t target);
+typedef void (QEMUBalloonStatus)(void *opaque, MonitorCompletion cb,
+                                 void *cb_data);
 
-void qemu_add_balloon_handler(QEMUBalloonEvent *func, void *opaque);
-
-int qemu_balloon(ram_addr_t target, MonitorCompletion cb, void *opaque);
-
-int qemu_balloon_status(MonitorCompletion cb, void *opaque);
+int qemu_add_balloon_handler(QEMUBalloonEvent *event_func,
+			     QEMUBalloonStatus *stat_func, void *opaque);
 
 void monitor_print_balloon(Monitor *mon, const QObject *data);
 int do_info_balloon(Monitor *mon, MonitorCompletion cb, void *opaque);
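
The balloon interface is now split: one callback resizes the balloon toward a target, a second reports statistics through the monitor, and only a single device may register. A hedged sketch of how a device model would hook in under the new prototypes; the my_* names are hypothetical and this is not hw/virtio-balloon.c.

/* Hypothetical device callbacks matching QEMUBalloonEvent and
 * QEMUBalloonStatus above.  Not standalone-compilable: ram_addr_t,
 * MonitorCompletion and qemu_add_balloon_handler() come from the QEMU tree. */
static void my_balloon_to_target(void *opaque, ram_addr_t target)
{
    /* inflate or deflate the guest balloon toward 'target' bytes */
}

static void my_balloon_stats(void *opaque, MonitorCompletion cb, void *cb_data)
{
    /* gather stats, then invoke cb(cb_data, ...) to complete the
     * monitor command asynchronously */
}

static int my_balloon_device_init(void *dev)
{
    /* returns -1 if another balloon device is already registered */
    return qemu_add_balloon_handler(my_balloon_to_target,
                                    my_balloon_stats, dev);
}
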
diff --git a/block.c b/block.c
index 9549b9eff9..26910ca143 100644
--- a/block.c
+++ b/block.c
@@ -28,6 +28,7 @@
 #include "block_int.h"
 #include "module.h"
 #include "qemu-objects.h"
+#include "qemu-coroutine.h"
 
 #ifdef CONFIG_BSD
 #include <sys/types.h>
@@ -57,6 +58,19 @@ static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
                         uint8_t *buf, int nb_sectors);
 static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
                          const uint8_t *buf, int nb_sectors);
+static BlockDriverAIOCB *bdrv_co_aio_readv_em(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque);
+static BlockDriverAIOCB *bdrv_co_aio_writev_em(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque);
+static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
+                                         int64_t sector_num, int nb_sectors,
+                                         QEMUIOVector *iov);
+static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
+                                         int64_t sector_num, int nb_sectors,
+                                         QEMUIOVector *iov);
+static int coroutine_fn bdrv_co_flush_em(BlockDriverState *bs);
 
 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
     QTAILQ_HEAD_INITIALIZER(bdrv_states);
@@ -169,14 +183,25 @@ void path_combine(char *dest, int dest_size,
 
 void bdrv_register(BlockDriver *bdrv)
 {
-    if (!bdrv->bdrv_aio_readv) {
-        /* add AIO emulation layer */
-        bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
-        bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
-    } else if (!bdrv->bdrv_read) {
-        /* add synchronous IO emulation layer */
+    if (bdrv->bdrv_co_readv) {
+        /* Emulate AIO by coroutines, and sync by AIO */
+        bdrv->bdrv_aio_readv = bdrv_co_aio_readv_em;
+        bdrv->bdrv_aio_writev = bdrv_co_aio_writev_em;
         bdrv->bdrv_read = bdrv_read_em;
         bdrv->bdrv_write = bdrv_write_em;
+     } else {
+        bdrv->bdrv_co_readv = bdrv_co_readv_em;
+        bdrv->bdrv_co_writev = bdrv_co_writev_em;
+
+        if (!bdrv->bdrv_aio_readv) {
+            /* add AIO emulation layer */
+            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
+            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
+        } else if (!bdrv->bdrv_read) {
+            /* add synchronous IO emulation layer */
+            bdrv->bdrv_read = bdrv_read_em;
+            bdrv->bdrv_write = bdrv_write_em;
+        }
     }
 
     if (!bdrv->bdrv_aio_flush)
@@ -730,6 +755,8 @@ void bdrv_detach(BlockDriverState *bs, DeviceState *qdev)
 {
     assert(bs->peer == qdev);
     bs->peer = NULL;
+    bs->change_cb = NULL;
+    bs->change_opaque = NULL;
 }
 
 DeviceState *bdrv_get_attached(BlockDriverState *bs)
@@ -920,6 +947,17 @@ static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                                    nb_sectors * BDRV_SECTOR_SIZE);
 }
 
+static inline bool bdrv_has_async_rw(BlockDriver *drv)
+{
+    return drv->bdrv_co_readv != bdrv_co_readv_em
+        || drv->bdrv_aio_readv != bdrv_aio_readv_em;
+}
+
+static inline bool bdrv_has_async_flush(BlockDriver *drv)
+{
+    return drv->bdrv_aio_flush != bdrv_aio_flush_em;
+}
+
 /* return < 0 if error. See bdrv_write() for the return codes */
 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
               uint8_t *buf, int nb_sectors)
@@ -928,6 +966,18 @@ int bdrv_read(BlockDriverState *bs, int64_t sector_num,
 
     if (!drv)
         return -ENOMEDIUM;
+
+    if (bdrv_has_async_rw(drv) && qemu_in_coroutine()) {
+        QEMUIOVector qiov;
+        struct iovec iov = {
+            .iov_base = (void *)buf,
+            .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
+        };
+
+        qemu_iovec_init_external(&qiov, &iov, 1);
+        return bdrv_co_readv(bs, sector_num, nb_sectors, &qiov);
+    }
+
     if (bdrv_check_request(bs, sector_num, nb_sectors))
         return -EIO;
 
@@ -972,8 +1022,21 @@ int bdrv_write(BlockDriverState *bs, int64_t sector_num,
                const uint8_t *buf, int nb_sectors)
 {
     BlockDriver *drv = bs->drv;
+
     if (!bs->drv)
         return -ENOMEDIUM;
+
+    if (bdrv_has_async_rw(drv) && qemu_in_coroutine()) {
+        QEMUIOVector qiov;
+        struct iovec iov = {
+            .iov_base = (void *)buf,
+            .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
+        };
+
+        qemu_iovec_init_external(&qiov, &iov, 1);
+        return bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);
+    }
+
     if (bs->read_only)
         return -EACCES;
     if (bdrv_check_request(bs, sector_num, nb_sectors))
@@ -1108,17 +1171,49 @@ int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
     return 0;
 }
 
-/*
- * Writes to the file and ensures that no writes are reordered across this
- * request (acts as a barrier)
- *
- * Returns 0 on success, -errno in error cases.
- */
-int bdrv_write_sync(BlockDriverState *bs, int64_t sector_num,
-    const uint8_t *buf, int nb_sectors)
+int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
+    int nb_sectors, QEMUIOVector *qiov)
+{
+    BlockDriver *drv = bs->drv;
+
+    trace_bdrv_co_readv(bs, sector_num, nb_sectors);
+
+    if (!drv) {
+        return -ENOMEDIUM;
+    }
+    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
+        return -EIO;
+    }
+
+    return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
+}
+
+int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
+    int nb_sectors, QEMUIOVector *qiov)
 {
-    return bdrv_pwrite_sync(bs, BDRV_SECTOR_SIZE * sector_num,
-        buf, BDRV_SECTOR_SIZE * nb_sectors);
+    BlockDriver *drv = bs->drv;
+
+    trace_bdrv_co_writev(bs, sector_num, nb_sectors);
+
+    if (!bs->drv) {
+        return -ENOMEDIUM;
+    }
+    if (bs->read_only) {
+        return -EACCES;
+    }
+    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
+        return -EIO;
+    }
+
+    if (bs->dirty_bitmap) {
+        set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
+    }
+
+    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
+        bs->wr_highest_sector = sector_num + nb_sectors - 1;
+    }
+
+    return drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
 }
 
 /**
@@ -1591,6 +1686,10 @@ int bdrv_flush(BlockDriverState *bs)
         return 0;
     }
 
+    if (bs->drv && bdrv_has_async_flush(bs->drv) && qemu_in_coroutine()) {
+        return bdrv_co_flush_em(bs);
+    }
+
     if (bs->drv && bs->drv->bdrv_flush) {
         return bs->drv->bdrv_flush(bs);
     }
@@ -2580,6 +2679,89 @@ static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
 }
 
+
+typedef struct BlockDriverAIOCBCoroutine {
+    BlockDriverAIOCB common;
+    BlockRequest req;
+    bool is_write;
+    QEMUBH* bh;
+} BlockDriverAIOCBCoroutine;
+
+static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
+{
+    qemu_aio_flush();
+}
+
+static AIOPool bdrv_em_co_aio_pool = {
+    .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
+    .cancel             = bdrv_aio_co_cancel_em,
+};
+
+static void bdrv_co_rw_bh(void *opaque)
+{
+    BlockDriverAIOCBCoroutine *acb = opaque;
+
+    acb->common.cb(acb->common.opaque, acb->req.error);
+    qemu_bh_delete(acb->bh);
+    qemu_aio_release(acb);
+}
+
+static void coroutine_fn bdrv_co_rw(void *opaque)
+{
+    BlockDriverAIOCBCoroutine *acb = opaque;
+    BlockDriverState *bs = acb->common.bs;
+
+    if (!acb->is_write) {
+        acb->req.error = bs->drv->bdrv_co_readv(bs, acb->req.sector,
+            acb->req.nb_sectors, acb->req.qiov);
+    } else {
+        acb->req.error = bs->drv->bdrv_co_writev(bs, acb->req.sector,
+            acb->req.nb_sectors, acb->req.qiov);
+    }
+
+    acb->bh = qemu_bh_new(bdrv_co_rw_bh, acb);
+    qemu_bh_schedule(acb->bh);
+}
+
+static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
+                                               int64_t sector_num,
+                                               QEMUIOVector *qiov,
+                                               int nb_sectors,
+                                               BlockDriverCompletionFunc *cb,
+                                               void *opaque,
+                                               bool is_write)
+{
+    Coroutine *co;
+    BlockDriverAIOCBCoroutine *acb;
+
+    acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
+    acb->req.sector = sector_num;
+    acb->req.nb_sectors = nb_sectors;
+    acb->req.qiov = qiov;
+    acb->is_write = is_write;
+
+    co = qemu_coroutine_create(bdrv_co_rw);
+    qemu_coroutine_enter(co, acb);
+
+    return &acb->common;
+}
+
+static BlockDriverAIOCB *bdrv_co_aio_readv_em(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque,
+                                 false);
+}
+
+static BlockDriverAIOCB *bdrv_co_aio_writev_em(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque,
+                                 true);
+}
+
 static BlockDriverAIOCB *bdrv_aio_flush_em(BlockDriverState *bs,
         BlockDriverCompletionFunc *cb, void *opaque)
 {
@@ -2636,8 +2818,6 @@ static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
     struct iovec iov;
     QEMUIOVector qiov;
 
-    async_context_push();
-
     async_ret = NOT_DONE;
     iov.iov_base = (void *)buf;
     iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE;
@@ -2655,7 +2835,6 @@ static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
 
 
 fail:
-    async_context_pop();
     return async_ret;
 }
 
@@ -2667,8 +2846,6 @@ static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
     struct iovec iov;
     QEMUIOVector qiov;
 
-    async_context_push();
-
     async_ret = NOT_DONE;
     iov.iov_base = (void *)buf;
     iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE;
@@ -2684,7 +2861,6 @@ static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
     }
 
 fail:
-    async_context_pop();
     return async_ret;
 }
 
@@ -2726,6 +2902,77 @@ void qemu_aio_release(void *p)
 }
 
 /**************************************************************/
+/* Coroutine block device emulation */
+
+typedef struct CoroutineIOCompletion {
+    Coroutine *coroutine;
+    int ret;
+} CoroutineIOCompletion;
+
+static void bdrv_co_io_em_complete(void *opaque, int ret)
+{
+    CoroutineIOCompletion *co = opaque;
+
+    co->ret = ret;
+    qemu_coroutine_enter(co->coroutine, NULL);
+}
+
+static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
+                                      int nb_sectors, QEMUIOVector *iov,
+                                      bool is_write)
+{
+    CoroutineIOCompletion co = {
+        .coroutine = qemu_coroutine_self(),
+    };
+    BlockDriverAIOCB *acb;
+
+    if (is_write) {
+        acb = bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
+                              bdrv_co_io_em_complete, &co);
+    } else {
+        acb = bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
+                             bdrv_co_io_em_complete, &co);
+    }
+
+    trace_bdrv_co_io(is_write, acb);
+    if (!acb) {
+        return -EIO;
+    }
+    qemu_coroutine_yield();
+
+    return co.ret;
+}
+
+static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
+                                         int64_t sector_num, int nb_sectors,
+                                         QEMUIOVector *iov)
+{
+    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
+}
+
+static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
+                                         int64_t sector_num, int nb_sectors,
+                                         QEMUIOVector *iov)
+{
+    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
+}
+
+static int coroutine_fn bdrv_co_flush_em(BlockDriverState *bs)
+{
+    CoroutineIOCompletion co = {
+        .coroutine = qemu_coroutine_self(),
+    };
+    BlockDriverAIOCB *acb;
+
+    acb = bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
+    if (!acb) {
+        return -EIO;
+    }
+    qemu_coroutine_yield();
+    return co.ret;
+}
+
+/**************************************************************/
 /* removable device support */
 
 /**
@@ -2768,25 +3015,16 @@ int bdrv_media_changed(BlockDriverState *bs)
 int bdrv_eject(BlockDriverState *bs, int eject_flag)
 {
     BlockDriver *drv = bs->drv;
-    int ret;
 
-    if (bs->locked) {
+    if (eject_flag && bs->locked) {
         return -EBUSY;
     }
 
-    if (!drv || !drv->bdrv_eject) {
-        ret = -ENOTSUP;
-    } else {
-        ret = drv->bdrv_eject(bs, eject_flag);
-    }
-    if (ret == -ENOTSUP) {
-        ret = 0;
+    if (drv && drv->bdrv_eject) {
+        drv->bdrv_eject(bs, eject_flag);
     }
-    if (ret >= 0) {
-        bs->tray_open = eject_flag;
-    }
-
-    return ret;
+    bs->tray_open = eject_flag;
+    return 0;
 }
 
 int bdrv_is_locked(BlockDriverState *bs)
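
The new entry points bdrv_co_readv() and bdrv_co_writev() give callers running in coroutine context synchronous-looking I/O: the request either goes straight to a driver's bdrv_co_* implementation or is emulated over AIO by yielding until the completion callback re-enters the coroutine, and bdrv_read()/bdrv_write() detect coroutine context via qemu_in_coroutine() and take this path automatically. A hedged sketch of the caller side; read_first_sector() is a hypothetical helper that mirrors the new coroutine branch in bdrv_read(), and it is not standalone-compilable outside the QEMU tree.

static int coroutine_fn read_first_sector(BlockDriverState *bs, uint8_t *buf)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = buf,
        .iov_len  = BDRV_SECTOR_SIZE,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);

    /* yields until the underlying request completes, then returns
     * 0 on success or -errno */
    return bdrv_co_readv(bs, 0, 1, &qiov);
}
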
diff --git a/block.h b/block.h
index 59cc410e3b..a3bfaafef0 100644
--- a/block.h
+++ b/block.h
@@ -4,6 +4,7 @@
 #include "qemu-aio.h"
 #include "qemu-common.h"
 #include "qemu-option.h"
+#include "qemu-coroutine.h"
 #include "qobject.h"
 
 /* block.c */
@@ -85,8 +86,10 @@ int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                 const void *buf, int count);
 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
     const void *buf, int count);
-int bdrv_write_sync(BlockDriverState *bs, int64_t sector_num,
-    const uint8_t *buf, int nb_sectors);
+int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
+    int nb_sectors, QEMUIOVector *qiov);
+int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
+    int nb_sectors, QEMUIOVector *qiov);
 int bdrv_truncate(BlockDriverState *bs, int64_t offset);
 int64_t bdrv_getlength(BlockDriverState *bs);
 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs);
diff --git a/block/qcow.c b/block/qcow.c
index 227b104e36..6447c2a1c0 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -73,6 +73,7 @@ typedef struct BDRVQcowState {
     uint32_t crypt_method_header;
     AES_KEY aes_encrypt_key;
     AES_KEY aes_decrypt_key;
+    CoMutex lock;
 } BDRVQcowState;
 
 static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset);
@@ -517,11 +518,11 @@ static AIOPool qcow_aio_pool = {
 
 static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque, int is_write)
+        int is_write)
 {
     QCowAIOCB *acb;
 
-    acb = qemu_aio_get(&qcow_aio_pool, bs, cb, opaque);
+    acb = qemu_aio_get(&qcow_aio_pool, bs, NULL, NULL);
     if (!acb)
         return NULL;
     acb->hd_aiocb = NULL;
@@ -542,48 +543,15 @@ static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
     return acb;
 }
 
-static void qcow_aio_read_cb(void *opaque, int ret);
-static void qcow_aio_write_cb(void *opaque, int ret);
-
-static void qcow_aio_rw_bh(void *opaque)
-{
-    QCowAIOCB *acb = opaque;
-    qemu_bh_delete(acb->bh);
-    acb->bh = NULL;
-
-    if (acb->is_write) {
-        qcow_aio_write_cb(opaque, 0);
-    } else {
-        qcow_aio_read_cb(opaque, 0);
-    }
-}
-
-static int qcow_schedule_bh(QEMUBHFunc *cb, QCowAIOCB *acb)
-{
-    if (acb->bh) {
-        return -EIO;
-    }
-
-    acb->bh = qemu_bh_new(cb, acb);
-    if (!acb->bh) {
-        return -EIO;
-    }
-
-    qemu_bh_schedule(acb->bh);
-
-    return 0;
-}
-
-static void qcow_aio_read_cb(void *opaque, int ret)
+static int qcow_aio_read_cb(void *opaque)
 {
     QCowAIOCB *acb = opaque;
     BlockDriverState *bs = acb->common.bs;
     BDRVQcowState *s = bs->opaque;
     int index_in_cluster;
+    int ret;
 
     acb->hd_aiocb = NULL;
-    if (ret < 0)
-        goto done;
 
  redo:
     /* post process the read buffer */
@@ -605,8 +573,7 @@ static void qcow_aio_read_cb(void *opaque, int ret)
 
     if (acb->nb_sectors == 0) {
         /* request completed */
-        ret = 0;
-        goto done;
+        return 0;
     }
 
     /* prepare next AIO request */
@@ -623,11 +590,12 @@ static void qcow_aio_read_cb(void *opaque, int ret)
             acb->hd_iov.iov_base = (void *)acb->buf;
             acb->hd_iov.iov_len = acb->n * 512;
             qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
-            acb->hd_aiocb = bdrv_aio_readv(bs->backing_hd, acb->sector_num,
-                &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb);
-            if (acb->hd_aiocb == NULL) {
-                ret = -EIO;
-                goto done;
+            qemu_co_mutex_unlock(&s->lock);
+            ret = bdrv_co_readv(bs->backing_hd, acb->sector_num,
+                                acb->n, &acb->hd_qiov);
+            qemu_co_mutex_lock(&s->lock);
+            if (ret < 0) {
+                return -EIO;
             }
         } else {
             /* Note: in this case, no need to wait */
@@ -637,64 +605,56 @@ static void qcow_aio_read_cb(void *opaque, int ret)
     } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
         /* add AIO support for compressed blocks ? */
         if (decompress_cluster(bs, acb->cluster_offset) < 0) {
-            ret = -EIO;
-            goto done;
+            return -EIO;
         }
         memcpy(acb->buf,
                s->cluster_cache + index_in_cluster * 512, 512 * acb->n);
         goto redo;
     } else {
         if ((acb->cluster_offset & 511) != 0) {
-            ret = -EIO;
-            goto done;
+            return -EIO;
         }
         acb->hd_iov.iov_base = (void *)acb->buf;
         acb->hd_iov.iov_len = acb->n * 512;
         qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
-        acb->hd_aiocb = bdrv_aio_readv(bs->file,
+        qemu_co_mutex_unlock(&s->lock);
+        ret = bdrv_co_readv(bs->file,
                             (acb->cluster_offset >> 9) + index_in_cluster,
-                            &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb);
-        if (acb->hd_aiocb == NULL) {
-            ret = -EIO;
-            goto done;
+                            acb->n, &acb->hd_qiov);
+        qemu_co_mutex_lock(&s->lock);
+        if (ret < 0) {
+            return ret;
         }
     }
 
-    return;
-
-done:
-    if (acb->qiov->niov > 1) {
-        qemu_iovec_from_buffer(acb->qiov, acb->orig_buf, acb->qiov->size);
-        qemu_vfree(acb->orig_buf);
-    }
-    acb->common.cb(acb->common.opaque, ret);
-    qemu_aio_release(acb);
+    return 1;
 }
 
-static BlockDriverAIOCB *qcow_aio_readv(BlockDriverState *bs,
-        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque)
+static int qcow_co_readv(BlockDriverState *bs, int64_t sector_num,
+                         int nb_sectors, QEMUIOVector *qiov)
 {
+    BDRVQcowState *s = bs->opaque;
     QCowAIOCB *acb;
     int ret;
 
-    acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
-    if (!acb)
-        return NULL;
+    acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, 0);
 
-    ret = qcow_schedule_bh(qcow_aio_rw_bh, acb);
-    if (ret < 0) {
-        if (acb->qiov->niov > 1) {
-            qemu_vfree(acb->orig_buf);
-        }
-        qemu_aio_release(acb);
-        return NULL;
+    qemu_co_mutex_lock(&s->lock);
+    do {
+        ret = qcow_aio_read_cb(acb);
+    } while (ret > 0);
+    qemu_co_mutex_unlock(&s->lock);
+
+    if (acb->qiov->niov > 1) {
+        qemu_iovec_from_buffer(acb->qiov, acb->orig_buf, acb->qiov->size);
+        qemu_vfree(acb->orig_buf);
     }
+    qemu_aio_release(acb);
 
-    return &acb->common;
+    return ret;
 }
 
-static void qcow_aio_write_cb(void *opaque, int ret)
+static int qcow_aio_write_cb(void *opaque)
 {
     QCowAIOCB *acb = opaque;
     BlockDriverState *bs = acb->common.bs;
@@ -702,20 +662,17 @@ static void qcow_aio_write_cb(void *opaque, int ret)
     int index_in_cluster;
     uint64_t cluster_offset;
     const uint8_t *src_buf;
+    int ret;
 
     acb->hd_aiocb = NULL;
 
-    if (ret < 0)
-        goto done;
-
     acb->nb_sectors -= acb->n;
     acb->sector_num += acb->n;
     acb->buf += acb->n * 512;
 
     if (acb->nb_sectors == 0) {
         /* request completed */
-        ret = 0;
-        goto done;
+        return 0;
     }
 
     index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
@@ -726,16 +683,11 @@ static void qcow_aio_write_cb(void *opaque, int ret)
                                         index_in_cluster,
                                         index_in_cluster + acb->n);
     if (!cluster_offset || (cluster_offset & 511) != 0) {
-        ret = -EIO;
-        goto done;
+        return -EIO;
     }
     if (s->crypt_method) {
         if (!acb->cluster_data) {
             acb->cluster_data = qemu_mallocz(s->cluster_size);
-            if (!acb->cluster_data) {
-                ret = -ENOMEM;
-                goto done;
-            }
         }
         encrypt_sectors(s, acb->sector_num, acb->cluster_data, acb->buf,
                         acb->n, 1, &s->aes_encrypt_key);
@@ -747,26 +699,19 @@ static void qcow_aio_write_cb(void *opaque, int ret)
     acb->hd_iov.iov_base = (void *)src_buf;
     acb->hd_iov.iov_len = acb->n * 512;
     qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
-    acb->hd_aiocb = bdrv_aio_writev(bs->file,
-                                    (cluster_offset >> 9) + index_in_cluster,
-                                    &acb->hd_qiov, acb->n,
-                                    qcow_aio_write_cb, acb);
-    if (acb->hd_aiocb == NULL) {
-        ret = -EIO;
-        goto done;
+    qemu_co_mutex_unlock(&s->lock);
+    ret = bdrv_co_writev(bs->file,
+                         (cluster_offset >> 9) + index_in_cluster,
+                         acb->n, &acb->hd_qiov);
+    qemu_co_mutex_lock(&s->lock);
+    if (ret < 0) {
+        return ret;
     }
-    return;
-
-done:
-    if (acb->qiov->niov > 1)
-        qemu_vfree(acb->orig_buf);
-    acb->common.cb(acb->common.opaque, ret);
-    qemu_aio_release(acb);
+    return 1;
 }
 
-static BlockDriverAIOCB *qcow_aio_writev(BlockDriverState *bs,
-        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque)
+static int qcow_co_writev(BlockDriverState *bs, int64_t sector_num,
+                          int nb_sectors, QEMUIOVector *qiov)
 {
     BDRVQcowState *s = bs->opaque;
     QCowAIOCB *acb;
@@ -774,21 +719,20 @@ static BlockDriverAIOCB *qcow_aio_writev(BlockDriverState *bs,
 
     s->cluster_cache_offset = -1; /* disable compressed cache */
 
-    acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
-    if (!acb)
-        return NULL;
+    acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, 1);
 
+    qemu_co_mutex_lock(&s->lock);
+    do {
+        ret = qcow_aio_write_cb(acb);
+    } while (ret > 0);
+    qemu_co_mutex_unlock(&s->lock);
 
-    ret = qcow_schedule_bh(qcow_aio_rw_bh, acb);
-    if (ret < 0) {
-        if (acb->qiov->niov > 1) {
-            qemu_vfree(acb->orig_buf);
-        }
-        qemu_aio_release(acb);
-        return NULL;
+    if (acb->qiov->niov > 1) {
+        qemu_vfree(acb->orig_buf);
     }
+    qemu_aio_release(acb);
 
-    return &acb->common;
+    return ret;
 }
 
 static void qcow_close(BlockDriverState *bs)
@@ -1020,8 +964,8 @@ static BlockDriver bdrv_qcow = {
     .bdrv_is_allocated	= qcow_is_allocated,
     .bdrv_set_key	= qcow_set_key,
     .bdrv_make_empty	= qcow_make_empty,
-    .bdrv_aio_readv	= qcow_aio_readv,
-    .bdrv_aio_writev	= qcow_aio_writev,
+    .bdrv_co_readv  = qcow_co_readv,
+    .bdrv_co_writev = qcow_co_writev,
     .bdrv_aio_flush	= qcow_aio_flush,
     .bdrv_write_compressed = qcow_write_compressed,
     .bdrv_get_info	= qcow_get_info,
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 882f50a80b..81cf77d83c 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -697,12 +697,12 @@ err:
  * m->depends_on is set to NULL and the other fields in m are meaningless.
  *
  * If the cluster is newly allocated, m->nb_clusters is set to the number of
- * contiguous clusters that have been allocated. This may be 0 if the request
- * conflict with another write request in flight; in this case, m->depends_on
- * is set and the remaining fields of m are meaningless.
+ * contiguous clusters that have been allocated. In this case, the other
+ * fields of m are valid and contain information about the first allocated
+ * cluster.
  *
- * If m->nb_clusters is non-zero, the other fields of m are valid and contain
- * information about the first allocated cluster.
+ * If the request conflicts with another write request in flight, the coroutine
+ * is queued and will be reentered when the dependency has completed.
  *
  * Return 0 on success and -errno in error cases
  */
@@ -721,6 +721,7 @@ int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
         return ret;
     }
 
+again:
     nb_clusters = size_to_clusters(s, n_end << 9);
 
     nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
@@ -792,12 +793,12 @@ int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
             }
 
             if (nb_clusters == 0) {
-                /* Set dependency and wait for a callback */
-                m->depends_on = old_alloc;
-                m->nb_clusters = 0;
-                *num = 0;
-
-                goto out_wait_dependency;
+                /* Wait for the dependency to complete. We need to recheck
+                 * the free/allocated clusters when we continue. */
+                qemu_co_mutex_unlock(&s->lock);
+                qemu_co_queue_wait(&old_alloc->dependent_requests);
+                qemu_co_mutex_lock(&s->lock);
+                goto again;
             }
         }
     }
@@ -834,9 +835,6 @@ out:
 
     return 0;
 
-out_wait_dependency:
-    return qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
-
 fail:
     qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
 fail_put:
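
A write that would allocate clusters already being allocated by another in-flight request no longer bounces back through callbacks; the requesting coroutine parks itself on the older request's CoQueue and retries its lookup once it is woken. Both halves of that handshake in outline, hedged and not drop-in code; the real versions live in qcow2_alloc_cluster_offset() above and run_dependent_requests() in block/qcow2.c below.

/* Waiter side (inside qcow2_alloc_cluster_offset, s->lock held):
 * drop the image lock, sleep on the queue, re-take the lock, re-scan. */
qemu_co_mutex_unlock(&s->lock);
qemu_co_queue_wait(&old_alloc->dependent_requests);
qemu_co_mutex_lock(&s->lock);
/* goto again: the cluster layout may have changed while we slept */

/* Waker side (when the older allocation finishes):
 * restart every coroutine queued behind it. */
if (!qemu_co_queue_empty(&m->dependent_requests)) {
    qemu_co_mutex_unlock(&s->lock);
    while (qemu_co_queue_next(&m->dependent_requests)) {
        /* each call re-enters one waiting coroutine */
    }
    qemu_co_mutex_lock(&s->lock);
}
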
diff --git a/block/qcow2.c b/block/qcow2.c
index 48e1b95689..f07d550a96 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -276,6 +276,9 @@ static int qcow2_open(BlockDriverState *bs, int flags)
         goto fail;
     }
 
+    /* Initialise locks */
+    qemu_co_mutex_init(&s->lock);
+
 #ifdef DEBUG_ALLOC
     qcow2_check_refcounts(bs);
 #endif
@@ -379,7 +382,6 @@ typedef struct QCowAIOCB {
     uint64_t cluster_offset;
     uint8_t *cluster_data;
     bool is_write;
-    BlockDriverAIOCB *hd_aiocb;
     QEMUIOVector hd_qiov;
     QEMUBH *bh;
     QCowL2Meta l2meta;
@@ -389,8 +391,6 @@ typedef struct QCowAIOCB {
 static void qcow2_aio_cancel(BlockDriverAIOCB *blockacb)
 {
     QCowAIOCB *acb = container_of(blockacb, QCowAIOCB, common);
-    if (acb->hd_aiocb)
-        bdrv_aio_cancel(acb->hd_aiocb);
     qemu_aio_release(acb);
 }
 
@@ -399,46 +399,16 @@ static AIOPool qcow2_aio_pool = {
     .cancel             = qcow2_aio_cancel,
 };
 
-static void qcow2_aio_read_cb(void *opaque, int ret);
-static void qcow2_aio_write_cb(void *opaque, int ret);
-
-static void qcow2_aio_rw_bh(void *opaque)
-{
-    QCowAIOCB *acb = opaque;
-    qemu_bh_delete(acb->bh);
-    acb->bh = NULL;
-
-    if (acb->is_write) {
-        qcow2_aio_write_cb(opaque, 0);
-    } else {
-        qcow2_aio_read_cb(opaque, 0);
-    }
-}
-
-static int qcow2_schedule_bh(QEMUBHFunc *cb, QCowAIOCB *acb)
-{
-    if (acb->bh)
-        return -EIO;
-
-    acb->bh = qemu_bh_new(cb, acb);
-    if (!acb->bh)
-        return -EIO;
-
-    qemu_bh_schedule(acb->bh);
-
-    return 0;
-}
-
-static void qcow2_aio_read_cb(void *opaque, int ret)
+/*
+ * Returns 0 when the request is completed successfully, 1 when there is still
+ * a part left to do and -errno in error cases.
+ */
+static int qcow2_aio_read_cb(QCowAIOCB *acb)
 {
-    QCowAIOCB *acb = opaque;
     BlockDriverState *bs = acb->common.bs;
     BDRVQcowState *s = bs->opaque;
     int index_in_cluster, n1;
-
-    acb->hd_aiocb = NULL;
-    if (ret < 0)
-        goto done;
+    int ret;
 
     /* post process the read buffer */
     if (!acb->cluster_offset) {
@@ -463,8 +433,7 @@ static void qcow2_aio_read_cb(void *opaque, int ret)
 
     if (acb->remaining_sectors == 0) {
         /* request completed */
-        ret = 0;
-        goto done;
+        return 0;
     }
 
     /* prepare next AIO request */
@@ -477,7 +446,7 @@ static void qcow2_aio_read_cb(void *opaque, int ret)
     ret = qcow2_get_cluster_offset(bs, acb->sector_num << 9,
         &acb->cur_nr_sectors, &acb->cluster_offset);
     if (ret < 0) {
-        goto done;
+        return ret;
     }
 
     index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
@@ -494,42 +463,35 @@ static void qcow2_aio_read_cb(void *opaque, int ret)
                 acb->sector_num, acb->cur_nr_sectors);
             if (n1 > 0) {
                 BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
-                acb->hd_aiocb = bdrv_aio_readv(bs->backing_hd, acb->sector_num,
-                                    &acb->hd_qiov, n1, qcow2_aio_read_cb, acb);
-                if (acb->hd_aiocb == NULL) {
-                    ret = -EIO;
-                    goto done;
+                qemu_co_mutex_unlock(&s->lock);
+                ret = bdrv_co_readv(bs->backing_hd, acb->sector_num,
+                                    n1, &acb->hd_qiov);
+                qemu_co_mutex_lock(&s->lock);
+                if (ret < 0) {
+                    return ret;
                 }
-            } else {
-                ret = qcow2_schedule_bh(qcow2_aio_rw_bh, acb);
-                if (ret < 0)
-                    goto done;
             }
+            return 1;
         } else {
             /* Note: in this case, no need to wait */
             qemu_iovec_memset(&acb->hd_qiov, 0, 512 * acb->cur_nr_sectors);
-            ret = qcow2_schedule_bh(qcow2_aio_rw_bh, acb);
-            if (ret < 0)
-                goto done;
+            return 1;
         }
     } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
         /* add AIO support for compressed blocks ? */
         ret = qcow2_decompress_cluster(bs, acb->cluster_offset);
         if (ret < 0) {
-            goto done;
+            return ret;
         }
 
         qemu_iovec_from_buffer(&acb->hd_qiov,
             s->cluster_cache + index_in_cluster * 512,
             512 * acb->cur_nr_sectors);
 
-        ret = qcow2_schedule_bh(qcow2_aio_rw_bh, acb);
-        if (ret < 0)
-            goto done;
+        return 1;
     } else {
         if ((acb->cluster_offset & 511) != 0) {
-            ret = -EIO;
-            goto done;
+            return -EIO;
         }
 
         if (s->crypt_method) {
@@ -550,21 +512,17 @@ static void qcow2_aio_read_cb(void *opaque, int ret)
         }
 
         BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
-        acb->hd_aiocb = bdrv_aio_readv(bs->file,
+        qemu_co_mutex_unlock(&s->lock);
+        ret = bdrv_co_readv(bs->file,
                             (acb->cluster_offset >> 9) + index_in_cluster,
-                            &acb->hd_qiov, acb->cur_nr_sectors,
-                            qcow2_aio_read_cb, acb);
-        if (acb->hd_aiocb == NULL) {
-            ret = -EIO;
-            goto done;
+                            acb->cur_nr_sectors, &acb->hd_qiov);
+        qemu_co_mutex_lock(&s->lock);
+        if (ret < 0) {
+            return ret;
         }
     }
 
-    return;
-done:
-    acb->common.cb(acb->common.opaque, ret);
-    qemu_iovec_destroy(&acb->hd_qiov);
-    qemu_aio_release(acb);
+    return 1;
 }
 
 static QCowAIOCB *qcow2_aio_setup(BlockDriverState *bs, int64_t sector_num,
@@ -577,7 +535,6 @@ static QCowAIOCB *qcow2_aio_setup(BlockDriverState *bs, int64_t sector_num,
     acb = qemu_aio_get(&qcow2_aio_pool, bs, cb, opaque);
     if (!acb)
         return NULL;
-    acb->hd_aiocb = NULL;
     acb->sector_num = sector_num;
     acb->qiov = qiov;
     acb->is_write = is_write;
@@ -589,70 +546,65 @@ static QCowAIOCB *qcow2_aio_setup(BlockDriverState *bs, int64_t sector_num,
     acb->cur_nr_sectors = 0;
     acb->cluster_offset = 0;
     acb->l2meta.nb_clusters = 0;
-    QLIST_INIT(&acb->l2meta.dependent_requests);
+    qemu_co_queue_init(&acb->l2meta.dependent_requests);
     return acb;
 }
 
-static BlockDriverAIOCB *qcow2_aio_readv(BlockDriverState *bs,
-                                         int64_t sector_num,
-                                         QEMUIOVector *qiov, int nb_sectors,
-                                         BlockDriverCompletionFunc *cb,
-                                         void *opaque)
+static int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
+                          int nb_sectors, QEMUIOVector *qiov)
 {
+    BDRVQcowState *s = bs->opaque;
     QCowAIOCB *acb;
     int ret;
 
-    acb = qcow2_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
-    if (!acb)
-        return NULL;
+    acb = qcow2_aio_setup(bs, sector_num, qiov, nb_sectors, NULL, NULL, 0);
 
-    ret = qcow2_schedule_bh(qcow2_aio_rw_bh, acb);
-    if (ret < 0) {
-        qemu_iovec_destroy(&acb->hd_qiov);
-        qemu_aio_release(acb);
-        return NULL;
-    }
+    qemu_co_mutex_lock(&s->lock);
+    do {
+        ret = qcow2_aio_read_cb(acb);
+    } while (ret > 0);
+    qemu_co_mutex_unlock(&s->lock);
 
-    return &acb->common;
+    qemu_iovec_destroy(&acb->hd_qiov);
+    qemu_aio_release(acb);
+
+    return ret;
 }
 
-static void run_dependent_requests(QCowL2Meta *m)
+static void run_dependent_requests(BDRVQcowState *s, QCowL2Meta *m)
 {
-    QCowAIOCB *req;
-    QCowAIOCB *next;
-
     /* Take the request off the list of running requests */
     if (m->nb_clusters != 0) {
         QLIST_REMOVE(m, next_in_flight);
     }
 
     /* Restart all dependent requests */
-    QLIST_FOREACH_SAFE(req, &m->dependent_requests, next_depend, next) {
-        qcow2_aio_write_cb(req, 0);
+    if (!qemu_co_queue_empty(&m->dependent_requests)) {
+        qemu_co_mutex_unlock(&s->lock);
+        while(qemu_co_queue_next(&m->dependent_requests));
+        qemu_co_mutex_lock(&s->lock);
     }
-
-    /* Empty the list for the next part of the request */
-    QLIST_INIT(&m->dependent_requests);
 }
 
-static void qcow2_aio_write_cb(void *opaque, int ret)
+/*
+ * Returns 0 when the request is completed successfully, 1 when there is still
+ * a part left to do and -errno in error cases.
+ */
+static int qcow2_aio_write_cb(QCowAIOCB *acb)
 {
-    QCowAIOCB *acb = opaque;
     BlockDriverState *bs = acb->common.bs;
     BDRVQcowState *s = bs->opaque;
     int index_in_cluster;
     int n_end;
+    int ret;
 
-    acb->hd_aiocb = NULL;
-
-    if (ret >= 0) {
-        ret = qcow2_alloc_cluster_link_l2(bs, &acb->l2meta);
-    }
+    ret = qcow2_alloc_cluster_link_l2(bs, &acb->l2meta);
 
-    run_dependent_requests(&acb->l2meta);
+    run_dependent_requests(s, &acb->l2meta);
 
-    if (ret < 0)
-        goto done;
+    if (ret < 0) {
+        return ret;
+    }
 
     acb->remaining_sectors -= acb->cur_nr_sectors;
     acb->sector_num += acb->cur_nr_sectors;
@@ -660,8 +612,7 @@ static void qcow2_aio_write_cb(void *opaque, int ret)
 
     if (acb->remaining_sectors == 0) {
         /* request completed */
-        ret = 0;
-        goto done;
+        return 0;
     }
 
     index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
@@ -673,18 +624,10 @@ static void qcow2_aio_write_cb(void *opaque, int ret)
     ret = qcow2_alloc_cluster_offset(bs, acb->sector_num << 9,
         index_in_cluster, n_end, &acb->cur_nr_sectors, &acb->l2meta);
     if (ret < 0) {
-        goto done;
+        return ret;
     }
 
     acb->cluster_offset = acb->l2meta.cluster_offset;
-
-    /* Need to wait for another request? If so, we are done for now. */
-    if (acb->l2meta.nb_clusters == 0 && acb->l2meta.depends_on != NULL) {
-        QLIST_INSERT_HEAD(&acb->l2meta.depends_on->dependent_requests,
-            acb, next_depend);
-        return;
-    }
-
     assert((acb->cluster_offset & 511) == 0);
 
     qemu_iovec_reset(&acb->hd_qiov);
@@ -709,51 +652,40 @@ static void qcow2_aio_write_cb(void *opaque, int ret)
     }
 
     BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
-    acb->hd_aiocb = bdrv_aio_writev(bs->file,
-                                    (acb->cluster_offset >> 9) + index_in_cluster,
-                                    &acb->hd_qiov, acb->cur_nr_sectors,
-                                    qcow2_aio_write_cb, acb);
-    if (acb->hd_aiocb == NULL) {
-        ret = -EIO;
-        goto fail;
+    qemu_co_mutex_unlock(&s->lock);
+    ret = bdrv_co_writev(bs->file,
+                         (acb->cluster_offset >> 9) + index_in_cluster,
+                         acb->cur_nr_sectors, &acb->hd_qiov);
+    qemu_co_mutex_lock(&s->lock);
+    if (ret < 0) {
+        return ret;
     }
 
-    return;
-
-fail:
-    if (acb->l2meta.nb_clusters != 0) {
-        QLIST_REMOVE(&acb->l2meta, next_in_flight);
-    }
-done:
-    acb->common.cb(acb->common.opaque, ret);
-    qemu_iovec_destroy(&acb->hd_qiov);
-    qemu_aio_release(acb);
+    return 1;
 }
 
-static BlockDriverAIOCB *qcow2_aio_writev(BlockDriverState *bs,
-                                          int64_t sector_num,
-                                          QEMUIOVector *qiov, int nb_sectors,
-                                          BlockDriverCompletionFunc *cb,
-                                          void *opaque)
+static int qcow2_co_writev(BlockDriverState *bs,
+                           int64_t sector_num,
+                           int nb_sectors,
+                           QEMUIOVector *qiov)
 {
     BDRVQcowState *s = bs->opaque;
     QCowAIOCB *acb;
     int ret;
 
+    acb = qcow2_aio_setup(bs, sector_num, qiov, nb_sectors, NULL, NULL, 1);
     s->cluster_cache_offset = -1; /* disable compressed cache */
 
-    acb = qcow2_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
-    if (!acb)
-        return NULL;
+    qemu_co_mutex_lock(&s->lock);
+    do {
+        ret = qcow2_aio_write_cb(acb);
+    } while (ret > 0);
+    qemu_co_mutex_unlock(&s->lock);
 
-    ret = qcow2_schedule_bh(qcow2_aio_rw_bh, acb);
-    if (ret < 0) {
-        qemu_iovec_destroy(&acb->hd_qiov);
-        qemu_aio_release(acb);
-        return NULL;
-    }
+    qemu_iovec_destroy(&acb->hd_qiov);
+    qemu_aio_release(acb);
 
-    return &acb->common;
+    return ret;
 }
 
 static void qcow2_close(BlockDriverState *bs)
@@ -881,7 +813,7 @@ static int preallocate(BlockDriverState *bs)
 
     nb_sectors = bdrv_getlength(bs) >> 9;
     offset = 0;
-    QLIST_INIT(&meta.dependent_requests);
+    qemu_co_queue_init(&meta.dependent_requests);
     meta.cluster_offset = 0;
 
     while (nb_sectors) {
@@ -899,7 +831,7 @@ static int preallocate(BlockDriverState *bs)
 
         /* There are no dependent requests, but we need to remove our request
          * from the list of in-flight requests */
-        run_dependent_requests(&meta);
+        run_dependent_requests(bs->opaque, &meta);
 
         /* TODO Preallocate data if requested */
 
@@ -1387,8 +1319,8 @@ static BlockDriver bdrv_qcow2 = {
     .bdrv_set_key       = qcow2_set_key,
     .bdrv_make_empty    = qcow2_make_empty,
 
-    .bdrv_aio_readv     = qcow2_aio_readv,
-    .bdrv_aio_writev    = qcow2_aio_writev,
+    .bdrv_co_readv      = qcow2_co_readv,
+    .bdrv_co_writev     = qcow2_co_writev,
     .bdrv_aio_flush     = qcow2_aio_flush,
 
     .bdrv_discard           = qcow2_discard,
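
The qcow and qcow2 conversions follow the same pattern: the request state machine that used to run as a chain of AIO callbacks now runs as a plain loop under a per-image CoMutex, with the lock dropped around every nested bdrv_co_readv()/bdrv_co_writev(), and the driver advertises .bdrv_co_readv/.bdrv_co_writev instead of the aio variants. A hedged outline of that shape; MyRequest and the my_* helpers are hypothetical, this mirrors qcow2_co_writev() rather than replacing it.

static int coroutine_fn my_co_writev(BlockDriverState *bs, int64_t sector_num,
                                     int nb_sectors, QEMUIOVector *qiov)
{
    BDRVQcowState *s = bs->opaque;
    MyRequest req;              /* hypothetical per-request state */
    int ret;

    my_request_init(&req, bs, sector_num, nb_sectors, qiov);

    qemu_co_mutex_lock(&s->lock);
    do {
        /* one step of the former AIO callback chain: 0 = done,
         * 1 = more to do, -errno = failure; any nested bdrv_co_writev()
         * drops and re-takes s->lock around the actual I/O */
        ret = my_write_step(&req);
    } while (ret > 0);
    qemu_co_mutex_unlock(&s->lock);

    my_request_cleanup(&req);
    return ret;
}
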
diff --git a/block/qcow2.h b/block/qcow2.h
index 6a0a21b694..de23abe1a4 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -26,6 +26,7 @@
 #define BLOCK_QCOW2_H
 
 #include "aes.h"
+#include "qemu-coroutine.h"
 
 //#define DEBUG_ALLOC
 //#define DEBUG_ALLOC2
@@ -114,6 +115,8 @@ typedef struct BDRVQcowState {
     int64_t free_cluster_index;
     int64_t free_byte_offset;
 
+    CoMutex lock;
+
     uint32_t crypt_method; /* current crypt method, 0 if no key yet */
     uint32_t crypt_method_header;
     AES_KEY aes_encrypt_key;
@@ -146,7 +149,7 @@ typedef struct QCowL2Meta
     int nb_available;
     int nb_clusters;
     struct QCowL2Meta *depends_on;
-    QLIST_HEAD(QCowAioDependencies, QCowAIOCB) dependent_requests;
+    CoQueue dependent_requests;
 
     QLIST_ENTRY(QCowL2Meta) next_in_flight;
 } QCowL2Meta;
diff --git a/block/qed-table.c b/block/qed-table.c
index d38c673547..d96afa81d7 100644
--- a/block/qed-table.c
+++ b/block/qed-table.c
@@ -179,16 +179,12 @@ int qed_read_l1_table_sync(BDRVQEDState *s)
 {
     int ret = -EINPROGRESS;
 
-    async_context_push();
-
     qed_read_table(s, s->header.l1_table_offset,
                    s->l1_table, qed_sync_cb, &ret);
     while (ret == -EINPROGRESS) {
         qemu_aio_wait();
     }
 
-    async_context_pop();
-
     return ret;
 }
 
@@ -205,15 +201,11 @@ int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index,
 {
     int ret = -EINPROGRESS;
 
-    async_context_push();
-
     qed_write_l1_table(s, index, n, qed_sync_cb, &ret);
     while (ret == -EINPROGRESS) {
         qemu_aio_wait();
     }
 
-    async_context_pop();
-
     return ret;
 }
 
@@ -282,14 +274,11 @@ int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, uint64_t offset
 {
     int ret = -EINPROGRESS;
 
-    async_context_push();
-
     qed_read_l2_table(s, request, offset, qed_sync_cb, &ret);
     while (ret == -EINPROGRESS) {
         qemu_aio_wait();
     }
 
-    async_context_pop();
     return ret;
 }
 
@@ -307,13 +296,10 @@ int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request,
 {
     int ret = -EINPROGRESS;
 
-    async_context_push();
-
     qed_write_l2_table(s, request, index, n, flush, qed_sync_cb, &ret);
     while (ret == -EINPROGRESS) {
         qemu_aio_wait();
     }
 
-    async_context_pop();
     return ret;
 }
diff --git a/block/qed.c b/block/qed.c
index 39703793e9..333f067582 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -680,16 +680,12 @@ static int bdrv_qed_is_allocated(BlockDriverState *bs, int64_t sector_num,
     };
     QEDRequest request = { .l2_table = NULL };
 
-    async_context_push();
-
     qed_find_cluster(s, &request, pos, len, qed_is_allocated_cb, &cb);
 
     while (cb.is_allocated == -1) {
         qemu_aio_wait();
     }
 
-    async_context_pop();
-
     qed_unref_l2_cache_entry(request.l2_table);
 
     return cb.is_allocated;
diff --git a/block/raw-posix.c b/block/raw-posix.c
index cd89c8312a..c5c99446c0 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -230,13 +230,15 @@ static int raw_open_common(BlockDriverState *bs, const char *filename,
         }
     }
 
+    /* We're falling back to POSIX AIO in some cases so init always */
+    if (paio_init() < 0) {
+        goto out_free_buf;
+    }
+
 #ifdef CONFIG_LINUX_AIO
     if ((bdrv_flags & (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) ==
                       (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) {
 
-        /* We're falling back to POSIX AIO in some cases */
-        paio_init();
-
         s->aio_ctx = laio_init();
         if (!s->aio_ctx) {
             goto out_free_buf;
@@ -245,9 +247,6 @@ static int raw_open_common(BlockDriverState *bs, const char *filename,
     } else
 #endif
     {
-        if (paio_init() < 0) {
-            goto out_free_buf;
-        }
 #ifdef CONFIG_LINUX_AIO
         s->use_aio = 0;
 #endif
@@ -587,7 +586,7 @@ static BlockDriverAIOCB *raw_aio_submit(BlockDriverState *bs,
 
     /*
      * If O_DIRECT is used the buffer needs to be aligned on a sector
-     * boundary.  Check if this is the case or telll the low-level
+     * boundary.  Check if this is the case or tell the low-level
      * driver that it needs to copy the buffer.
      */
     if (s->aligned_buf) {
@@ -1254,7 +1253,7 @@ static int floppy_media_changed(BlockDriverState *bs)
     return ret;
 }
 
-static int floppy_eject(BlockDriverState *bs, int eject_flag)
+static void floppy_eject(BlockDriverState *bs, int eject_flag)
 {
     BDRVRawState *s = bs->opaque;
     int fd;
@@ -1269,8 +1268,6 @@ static int floppy_eject(BlockDriverState *bs, int eject_flag)
             perror("FDEJECT");
         close(fd);
     }
-
-    return 0;
 }
 
 static BlockDriver bdrv_host_floppy = {
@@ -1348,7 +1345,7 @@ static int cdrom_is_inserted(BlockDriverState *bs)
     return 0;
 }
 
-static int cdrom_eject(BlockDriverState *bs, int eject_flag)
+static void cdrom_eject(BlockDriverState *bs, int eject_flag)
 {
     BDRVRawState *s = bs->opaque;
 
@@ -1359,11 +1356,9 @@ static int cdrom_eject(BlockDriverState *bs, int eject_flag)
         if (ioctl(s->fd, CDROMCLOSETRAY, NULL) < 0)
             perror("CDROMEJECT");
     }
-
-    return 0;
 }
 
-static int cdrom_set_locked(BlockDriverState *bs, int locked)
+static void cdrom_set_locked(BlockDriverState *bs, int locked)
 {
     BDRVRawState *s = bs->opaque;
 
@@ -1374,8 +1369,6 @@ static int cdrom_set_locked(BlockDriverState *bs, int locked)
          */
         /* perror("CDROM_LOCKDOOR"); */
     }
-
-    return 0;
 }
 
 static BlockDriver bdrv_host_cdrom = {
@@ -1464,12 +1457,12 @@ static int cdrom_is_inserted(BlockDriverState *bs)
     return raw_getlength(bs) > 0;
 }
 
-static int cdrom_eject(BlockDriverState *bs, int eject_flag)
+static void cdrom_eject(BlockDriverState *bs, int eject_flag)
 {
     BDRVRawState *s = bs->opaque;
 
     if (s->fd < 0)
-        return -ENOTSUP;
+        return;
 
     (void) ioctl(s->fd, CDIOCALLOW);
 
@@ -1481,17 +1474,15 @@ static int cdrom_eject(BlockDriverState *bs, int eject_flag)
             perror("CDIOCCLOSE");
     }
 
-    if (cdrom_reopen(bs) < 0)
-        return -ENOTSUP;
-    return 0;
+    cdrom_reopen(bs);
 }
 
-static int cdrom_set_locked(BlockDriverState *bs, int locked)
+static void cdrom_set_locked(BlockDriverState *bs, int locked)
 {
     BDRVRawState *s = bs->opaque;
 
     if (s->fd < 0)
-        return -ENOTSUP;
+        return;
     if (ioctl(s->fd, (locked ? CDIOCPREVENT : CDIOCALLOW)) < 0) {
         /*
          * Note: an error can happen if the distribution automatically
@@ -1499,8 +1490,6 @@ static int cdrom_set_locked(BlockDriverState *bs, int locked)
          */
         /* perror("CDROM_LOCKDOOR"); */
     }
-
-    return 0;
 }
 
 static BlockDriver bdrv_host_cdrom = {
diff --git a/block/raw-win32.c b/block/raw-win32.c
index 91067e7595..e47cfe0f4a 100644
--- a/block/raw-win32.c
+++ b/block/raw-win32.c
@@ -393,41 +393,6 @@ static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
     return 0;
 }
 
-#if 0
-/***********************************************/
-/* removable device additional commands */
-
-static int raw_is_inserted(BlockDriverState *bs)
-{
-    return 1;
-}
-
-static int raw_media_changed(BlockDriverState *bs)
-{
-    return -ENOTSUP;
-}
-
-static int raw_eject(BlockDriverState *bs, int eject_flag)
-{
-    DWORD ret_count;
-
-    if (s->type == FTYPE_FILE)
-        return -ENOTSUP;
-    if (eject_flag) {
-        DeviceIoControl(s->hfile, IOCTL_STORAGE_EJECT_MEDIA,
-                        NULL, 0, NULL, 0, &lpBytesReturned, NULL);
-    } else {
-        DeviceIoControl(s->hfile, IOCTL_STORAGE_LOAD_MEDIA,
-                        NULL, 0, NULL, 0, &lpBytesReturned, NULL);
-    }
-}
-
-static int raw_set_locked(BlockDriverState *bs, int locked)
-{
-    return -ENOTSUP;
-}
-#endif
-
 static int hdev_has_zero_init(BlockDriverState *bs)
 {
     return 0;
diff --git a/block/raw.c b/block/raw.c
index b0f72d6a62..cb6203eeca 100644
--- a/block/raw.c
+++ b/block/raw.c
@@ -75,15 +75,14 @@ static int raw_is_inserted(BlockDriverState *bs)
     return bdrv_is_inserted(bs->file);
 }
 
-static int raw_eject(BlockDriverState *bs, int eject_flag)
+static void raw_eject(BlockDriverState *bs, int eject_flag)
 {
-    return bdrv_eject(bs->file, eject_flag);
+    bdrv_eject(bs->file, eject_flag);
 }
 
-static int raw_set_locked(BlockDriverState *bs, int locked)
+static void raw_set_locked(BlockDriverState *bs, int locked)
 {
     bdrv_set_locked(bs->file, locked);
-    return 0;
 }
 
 static int raw_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
diff --git a/block/vpc.c b/block/vpc.c
index 56865da5bc..fdd5236892 100644
--- a/block/vpc.c
+++ b/block/vpc.c
@@ -156,6 +156,7 @@ static int vpc_open(BlockDriverState *bs, int flags)
     struct vhd_dyndisk_header* dyndisk_header;
     uint8_t buf[HEADER_SIZE];
     uint32_t checksum;
+    int err = -1;
 
     if (bdrv_pread(bs->file, 0, s->footer_buf, HEADER_SIZE) != HEADER_SIZE)
         goto fail;
@@ -176,6 +177,11 @@ static int vpc_open(BlockDriverState *bs, int flags)
     bs->total_sectors = (int64_t)
         be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl;
 
+    if (bs->total_sectors >= 65535 * 16 * 255) {
+        err = -EFBIG;
+        goto fail;
+    }
+
     if (bdrv_pread(bs->file, be64_to_cpu(footer->data_offset), buf, HEADER_SIZE)
             != HEADER_SIZE)
         goto fail;
@@ -222,7 +228,7 @@ static int vpc_open(BlockDriverState *bs, int flags)
 
     return 0;
  fail:
-    return -1;
+    return err;
 }
 
 /*
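
The new size check corresponds to the largest geometry the VHD footer can express: 65535 cylinders * 16 heads * 255 sectors/track = 267,382,800 sectors, roughly 127 GiB at 512 bytes per sector. Images at or above that limit now fail with -EFBIG instead of the catch-all -1 used for every other error path.
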
diff --git a/block_int.h b/block_int.h
index efb68038c4..f6d02b38a7 100644
--- a/block_int.h
+++ b/block_int.h
@@ -27,6 +27,7 @@
 #include "block.h"
 #include "qemu-option.h"
 #include "qemu-queue.h"
+#include "qemu-coroutine.h"
 
 #define BLOCK_FLAG_ENCRYPT	1
 #define BLOCK_FLAG_COMPAT6	4
@@ -77,6 +78,11 @@ struct BlockDriver {
     int (*bdrv_discard)(BlockDriverState *bs, int64_t sector_num,
                         int nb_sectors);
 
+    int coroutine_fn (*bdrv_co_readv)(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
+    int coroutine_fn (*bdrv_co_writev)(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
+
     int (*bdrv_aio_multiwrite)(BlockDriverState *bs, BlockRequest *reqs,
         int num_reqs);
     int (*bdrv_merge_requests)(BlockDriverState *bs, BlockRequest* a,
@@ -112,8 +118,8 @@ struct BlockDriver {
     /* removable device specific */
     int (*bdrv_is_inserted)(BlockDriverState *bs);
     int (*bdrv_media_changed)(BlockDriverState *bs);
-    int (*bdrv_eject)(BlockDriverState *bs, int eject_flag);
-    int (*bdrv_set_locked)(BlockDriverState *bs, int locked);
+    void (*bdrv_eject)(BlockDriverState *bs, int eject_flag);
+    void (*bdrv_set_locked)(BlockDriverState *bs, int locked);
 
     /* to control generic scsi devices */
     int (*bdrv_ioctl)(BlockDriverState *bs, unsigned long int req, void *buf);
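
The two new coroutine_fn members are the hook points for drivers that implement I/O as straight-line coroutine code instead of callback chains. As a rough sketch (the driver name is invented, and it assumes the top-level bdrv_co_readv()/bdrv_co_writev() wrappers this series introduces at the block layer), a pass-through driver in the style of block/raw.c would wire them up like this:

    /* Hypothetical pass-through driver, for illustration only; assumes
     * bdrv_co_readv()/bdrv_co_writev() wrappers exist at the block layer. */
    static int coroutine_fn passthrough_co_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
    {
        /* Runs in coroutine context and may yield while the I/O is in flight. */
        return bdrv_co_readv(bs->file, sector_num, nb_sectors, qiov);
    }

    static int coroutine_fn passthrough_co_writev(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
    {
        return bdrv_co_writev(bs->file, sector_num, nb_sectors, qiov);
    }

    static BlockDriver bdrv_passthrough = {
        .format_name    = "passthrough",
        .bdrv_co_readv  = passthrough_co_readv,
        .bdrv_co_writev = passthrough_co_writev,
    };
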
diff --git a/blockdev.c b/blockdev.c
index 0b8d3a4f83..a25367a9e3 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -646,16 +646,13 @@ out:
 
 static int eject_device(Monitor *mon, BlockDriverState *bs, int force)
 {
-    if (!force) {
-        if (!bdrv_is_removable(bs)) {
-            qerror_report(QERR_DEVICE_NOT_REMOVABLE,
-                           bdrv_get_device_name(bs));
-            return -1;
-        }
-        if (bdrv_is_locked(bs)) {
-            qerror_report(QERR_DEVICE_LOCKED, bdrv_get_device_name(bs));
-            return -1;
-        }
+    if (!bdrv_is_removable(bs)) {
+        qerror_report(QERR_DEVICE_NOT_REMOVABLE, bdrv_get_device_name(bs));
+        return -1;
+    }
+    if (!force && bdrv_is_locked(bs)) {
+        qerror_report(QERR_DEVICE_LOCKED, bdrv_get_device_name(bs));
+        return -1;
     }
     bdrv_close(bs);
     return 0;
diff --git a/configure b/configure
index 77194cf9a7..27f1fa9266 100755
--- a/configure
+++ b/configure
@@ -181,6 +181,7 @@ smartcard_nss=""
 usb_redir=""
 opengl=""
 zlib="yes"
+guest_agent="yes"
 
 # parse CC options first
 for opt do
@@ -757,6 +758,10 @@ for opt do
   ;;
   --disable-zlib-test) zlib="no"
   ;;
+  --enable-guest-agent) guest_agent="yes"
+  ;;
+  --disable-guest-agent) guest_agent="no"
+  ;;
   *) echo "ERROR: unknown option $opt"; show_help="yes"
   ;;
   esac
@@ -846,7 +851,6 @@ if [ "$softmmu" = "yes" ] ; then
     default_target_list="\
 i386-softmmu \
 x86_64-softmmu \
-alpha-softmmu \
 arm-softmmu \
 cris-softmmu \
 lm32-softmmu \
@@ -1035,6 +1039,8 @@ echo "  --disable-smartcard-nss  disable smartcard nss support"
 echo "  --enable-smartcard-nss   enable smartcard nss support"
 echo "  --disable-usb-redir      disable usb network redirection support"
 echo "  --enable-usb-redir       enable usb network redirection support"
+echo "  --disable-guest-agent    disable building of the QEMU Guest Agent"
+echo "  --enable-guest-agent     enable building of the QEMU Guest Agent"
 echo ""
 echo "NOTE: The object files are built at the place where configure is launched"
 exit 1
@@ -1094,11 +1100,13 @@ if test "$solaris" = "yes" ; then
   fi
 fi
 
-if has $python; then
-  :
-else
-  echo "Python not found. Use --python=/path/to/python"
-  exit 1
+if test "$guest_agent" != "no" ; then
+  if has $python; then
+    :
+  else
+    echo "Python not found. Use --python=/path/to/python"
+    exit 1
+  fi
 fi
 
 if test -z "$target_list" ; then
@@ -1513,11 +1521,17 @@ int main(void) {
     return 0;
 }
 EOF
+  if $pkg_config libpng --modversion >/dev/null 2>&1; then
+    vnc_png_cflags=`$pkg_config libpng --cflags 2> /dev/null`
+    vnc_png_libs=`$pkg_config libpng --libs 2> /dev/null`
+  else
     vnc_png_cflags=""
     vnc_png_libs="-lpng"
+  fi
   if compile_prog "$vnc_png_cflags" "$vnc_png_libs" ; then
     vnc_png=yes
     libs_softmmu="$vnc_png_libs $libs_softmmu"
+    QEMU_CFLAGS="$QEMU_CFLAGS $vnc_png_cflags"
   else
     if test "$vnc_png" = "yes" ; then
       feature_not_found "vnc-png"
@@ -1830,14 +1844,16 @@ fi
 
 ##########################################
 # glib support probe
-if $pkg_config --modversion glib-2.0 > /dev/null 2>&1 ; then
-    glib_cflags=`$pkg_config --cflags glib-2.0 2>/dev/null`
-    glib_libs=`$pkg_config --libs glib-2.0 2>/dev/null`
-    libs_softmmu="$glib_libs $libs_softmmu"
-    libs_tools="$glib_libs $libs_tools"
-else
-    echo "glib-2.0 required to compile QEMU"
-    exit 1
+if test "$guest_agent" != "no" ; then
+    if $pkg_config --modversion glib-2.0 > /dev/null 2>&1 ; then
+        glib_cflags=`$pkg_config --cflags glib-2.0 2>/dev/null`
+        glib_libs=`$pkg_config --libs glib-2.0 2>/dev/null`
+        libs_softmmu="$glib_libs $libs_softmmu"
+        libs_tools="$glib_libs $libs_tools"
+    else
+        echo "glib-2.0 required to compile QEMU"
+        exit 1
+    fi
 fi
 
 ##########################################
@@ -2521,7 +2537,7 @@ fi
 # __sync_fetch_and_and requires at least -march=i486. Many toolchains
 # use i686 as default anyway, but for those that don't, an explicit
 # specification is necessary
-if test $vhost_net = "yes" && test $cpu = "i386"; then
+if test "$vhost_net" = "yes" && test "$cpu" = "i386"; then
   cat > $TMPC << EOF
 int sfaa(unsigned *ptr)
 {
@@ -2541,6 +2557,20 @@ EOF
 fi
 
 ##########################################
+# check if we have makecontext
+
+ucontext_coroutine=no
+if test "$darwin" != "yes"; then
+  cat > $TMPC << EOF
+#include <ucontext.h>
+int main(void) { makecontext(0, 0, 0); }
+EOF
+  if compile_prog "" "" ; then
+      ucontext_coroutine=yes
+  fi
+fi
+
+##########################################
 # End of CC checks
 # After here, no more $cc or $ld runs
 
@@ -2597,7 +2627,9 @@ if test "$softmmu" = yes ; then
   tools="qemu-img\$(EXESUF) qemu-io\$(EXESUF) $tools"
   if [ "$linux" = "yes" -o "$bsd" = "yes" -o "$solaris" = "yes" ] ; then
       tools="qemu-nbd\$(EXESUF) $tools"
+    if [ "$guest_agent" = "yes" ]; then
       tools="qemu-ga\$(EXESUF) $tools"
+    fi
     if [ "$check_utests" = "yes" ]; then
       tools="check-qint check-qstring check-qdict check-qlist $tools"
       tools="check-qfloat check-qjson $tools"
@@ -2699,8 +2731,9 @@ echo "xfsctl support    $xfs"
 echo "nss used          $smartcard_nss"
 echo "usb net redir     $usb_redir"
 echo "OpenGL support    $opengl"
+echo "build guest agent $guest_agent"
 
-if test $sdl_too_old = "yes"; then
+if test "$sdl_too_old" = "yes"; then
 echo "-> Your SDL version is too old - please upgrade to have SDL support"
 fi
 
@@ -2788,7 +2821,7 @@ fi
 if test "$static" = "yes" ; then
   echo "CONFIG_STATIC=y" >> $config_host_mak
 fi
-if test $profiler = "yes" ; then
+if test "$profiler" = "yes" ; then
   echo "CONFIG_PROFILER=y" >> $config_host_mak
 fi
 if test "$slirp" = "yes" ; then
@@ -3015,6 +3048,10 @@ if test "$rbd" = "yes" ; then
   echo "CONFIG_RBD=y" >> $config_host_mak
 fi
 
+if test "$ucontext_coroutine" = "yes" ; then
+  echo "CONFIG_UCONTEXT_COROUTINE=y" >> $config_host_mak
+fi
+
 # USB host support
 case "$usb" in
 linux)
@@ -3342,7 +3379,7 @@ case "$target_arch2" in
       \( "$target_arch2" = "x86_64" -a "$cpu" = "i386"   \) -o \
       \( "$target_arch2" = "i386"   -a "$cpu" = "x86_64" \) \) ; then
       echo "CONFIG_KVM=y" >> $config_target_mak
-      if test $vhost_net = "yes" ; then
+      if test "$vhost_net" = "yes" ; then
         echo "CONFIG_VHOST_NET=y" >> $config_target_mak
       fi
     fi
diff --git a/coroutine-gthread.c b/coroutine-gthread.c
new file mode 100644
index 0000000000..f09877e14f
--- /dev/null
+++ b/coroutine-gthread.c
@@ -0,0 +1,131 @@
+/*
+ * GThread coroutine initialization code
+ *
+ * Copyright (C) 2006  Anthony Liguori <anthony@codemonkey.ws>
+ * Copyright (C) 2011  Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.0 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <glib.h>
+#include "qemu-common.h"
+#include "qemu-coroutine-int.h"
+
+typedef struct {
+    Coroutine base;
+    GThread *thread;
+    bool runnable;
+    CoroutineAction action;
+} CoroutineGThread;
+
+static GCond *coroutine_cond;
+static GStaticMutex coroutine_lock = G_STATIC_MUTEX_INIT;
+static GStaticPrivate coroutine_key = G_STATIC_PRIVATE_INIT;
+
+static void __attribute__((constructor)) coroutine_init(void)
+{
+    if (!g_thread_supported()) {
+        g_thread_init(NULL);
+    }
+
+    coroutine_cond = g_cond_new();
+}
+
+static void coroutine_wait_runnable_locked(CoroutineGThread *co)
+{
+    while (!co->runnable) {
+        g_cond_wait(coroutine_cond, g_static_mutex_get_mutex(&coroutine_lock));
+    }
+}
+
+static void coroutine_wait_runnable(CoroutineGThread *co)
+{
+    g_static_mutex_lock(&coroutine_lock);
+    coroutine_wait_runnable_locked(co);
+    g_static_mutex_unlock(&coroutine_lock);
+}
+
+static gpointer coroutine_thread(gpointer opaque)
+{
+    CoroutineGThread *co = opaque;
+
+    g_static_private_set(&coroutine_key, co, NULL);
+    coroutine_wait_runnable(co);
+    co->base.entry(co->base.entry_arg);
+    qemu_coroutine_switch(&co->base, co->base.caller, COROUTINE_TERMINATE);
+    return NULL;
+}
+
+Coroutine *qemu_coroutine_new(void)
+{
+    CoroutineGThread *co;
+
+    co = qemu_mallocz(sizeof(*co));
+    co->thread = g_thread_create_full(coroutine_thread, co, 0, TRUE, TRUE,
+                                      G_THREAD_PRIORITY_NORMAL, NULL);
+    if (!co->thread) {
+        qemu_free(co);
+        return NULL;
+    }
+    return &co->base;
+}
+
+void qemu_coroutine_delete(Coroutine *co_)
+{
+    CoroutineGThread *co = DO_UPCAST(CoroutineGThread, base, co_);
+
+    g_thread_join(co->thread);
+    qemu_free(co);
+}
+
+CoroutineAction qemu_coroutine_switch(Coroutine *from_,
+                                      Coroutine *to_,
+                                      CoroutineAction action)
+{
+    CoroutineGThread *from = DO_UPCAST(CoroutineGThread, base, from_);
+    CoroutineGThread *to = DO_UPCAST(CoroutineGThread, base, to_);
+
+    g_static_mutex_lock(&coroutine_lock);
+    from->runnable = false;
+    from->action = action;
+    to->runnable = true;
+    to->action = action;
+    g_cond_broadcast(coroutine_cond);
+
+    if (action != COROUTINE_TERMINATE) {
+        coroutine_wait_runnable_locked(from);
+    }
+    g_static_mutex_unlock(&coroutine_lock);
+    return from->action;
+}
+
+Coroutine *qemu_coroutine_self(void)
+{
+    CoroutineGThread *co = g_static_private_get(&coroutine_key);
+
+    if (!co) {
+        co = qemu_mallocz(sizeof(*co));
+        co->runnable = true;
+        g_static_private_set(&coroutine_key, co, (GDestroyNotify)qemu_free);
+    }
+
+    return &co->base;
+}
+
+bool qemu_in_coroutine(void)
+{
+    CoroutineGThread *co = g_static_private_get(&coroutine_key);
+
+    return co && co->base.caller;
+}
diff --git a/coroutine-ucontext.c b/coroutine-ucontext.c
new file mode 100644
index 0000000000..41c2379a2a
--- /dev/null
+++ b/coroutine-ucontext.c
@@ -0,0 +1,230 @@
+/*
+ * ucontext coroutine initialization code
+ *
+ * Copyright (C) 2006  Anthony Liguori <anthony@codemonkey.ws>
+ * Copyright (C) 2011  Kevin Wolf <kwolf@redhat.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.0 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* XXX Is there a nicer way to disable glibc's stack check for longjmp? */
+#ifdef _FORTIFY_SOURCE
+#undef _FORTIFY_SOURCE
+#endif
+#include <stdlib.h>
+#include <setjmp.h>
+#include <stdint.h>
+#include <pthread.h>
+#include <ucontext.h>
+#include "qemu-common.h"
+#include "qemu-coroutine-int.h"
+
+enum {
+    /* Maximum free pool size prevents holding too many freed coroutines */
+    POOL_MAX_SIZE = 64,
+};
+
+typedef struct {
+    Coroutine base;
+    void *stack;
+    jmp_buf env;
+} CoroutineUContext;
+
+/**
+ * Per-thread coroutine bookkeeping
+ */
+typedef struct {
+    /** Currently executing coroutine */
+    Coroutine *current;
+
+    /** Free list to speed up creation */
+    QLIST_HEAD(, Coroutine) pool;
+    unsigned int pool_size;
+
+    /** The default coroutine */
+    CoroutineUContext leader;
+} CoroutineThreadState;
+
+static pthread_key_t thread_state_key;
+
+/*
+ * va_args to makecontext() must be type 'int', so passing
+ * the pointer we need may require several int args. This
+ * union is a quick hack to let us do that
+ */
+union cc_arg {
+    void *p;
+    int i[2];
+};
+
+static CoroutineThreadState *coroutine_get_thread_state(void)
+{
+    CoroutineThreadState *s = pthread_getspecific(thread_state_key);
+
+    if (!s) {
+        s = qemu_mallocz(sizeof(*s));
+        s->current = &s->leader.base;
+        QLIST_INIT(&s->pool);
+        pthread_setspecific(thread_state_key, s);
+    }
+    return s;
+}
+
+static void qemu_coroutine_thread_cleanup(void *opaque)
+{
+    CoroutineThreadState *s = opaque;
+    Coroutine *co;
+    Coroutine *tmp;
+
+    QLIST_FOREACH_SAFE(co, &s->pool, pool_next, tmp) {
+        qemu_free(DO_UPCAST(CoroutineUContext, base, co)->stack);
+        qemu_free(co);
+    }
+    qemu_free(s);
+}
+
+static void __attribute__((constructor)) coroutine_init(void)
+{
+    int ret;
+
+    ret = pthread_key_create(&thread_state_key, qemu_coroutine_thread_cleanup);
+    if (ret != 0) {
+        fprintf(stderr, "unable to create leader key: %s\n", strerror(ret));
+        abort();
+    }
+}
+
+static void coroutine_trampoline(int i0, int i1)
+{
+    union cc_arg arg;
+    CoroutineUContext *self;
+    Coroutine *co;
+
+    arg.i[0] = i0;
+    arg.i[1] = i1;
+    self = arg.p;
+    co = &self->base;
+
+    /* Initialize longjmp environment and switch back to the caller */
+    if (!setjmp(self->env)) {
+        longjmp(*(jmp_buf *)co->entry_arg, 1);
+    }
+
+    while (true) {
+        co->entry(co->entry_arg);
+        qemu_coroutine_switch(co, co->caller, COROUTINE_TERMINATE);
+    }
+}
+
+static Coroutine *coroutine_new(void)
+{
+    const size_t stack_size = 1 << 20;
+    CoroutineUContext *co;
+    ucontext_t old_uc, uc;
+    jmp_buf old_env;
+    union cc_arg arg;
+
+    /* The ucontext functions preserve signal masks which incurs a system call
+     * overhead.  setjmp()/longjmp() does not preserve signal masks but only
+     * works on the current stack.  Since we need a way to create and switch to
+     * a new stack, use the ucontext functions for that but setjmp()/longjmp()
+     * for everything else.
+     */
+
+    if (getcontext(&uc) == -1) {
+        abort();
+    }
+
+    co = qemu_mallocz(sizeof(*co));
+    co->stack = qemu_malloc(stack_size);
+    co->base.entry_arg = &old_env; /* stash away our jmp_buf */
+
+    uc.uc_link = &old_uc;
+    uc.uc_stack.ss_sp = co->stack;
+    uc.uc_stack.ss_size = stack_size;
+    uc.uc_stack.ss_flags = 0;
+
+    arg.p = co;
+
+    makecontext(&uc, (void (*)(void))coroutine_trampoline,
+                2, arg.i[0], arg.i[1]);
+
+    /* swapcontext() in, longjmp() back out */
+    if (!setjmp(old_env)) {
+        swapcontext(&old_uc, &uc);
+    }
+    return &co->base;
+}
+
+Coroutine *qemu_coroutine_new(void)
+{
+    CoroutineThreadState *s = coroutine_get_thread_state();
+    Coroutine *co;
+
+    co = QLIST_FIRST(&s->pool);
+    if (co) {
+        QLIST_REMOVE(co, pool_next);
+        s->pool_size--;
+    } else {
+        co = coroutine_new();
+    }
+    return co;
+}
+
+void qemu_coroutine_delete(Coroutine *co_)
+{
+    CoroutineThreadState *s = coroutine_get_thread_state();
+    CoroutineUContext *co = DO_UPCAST(CoroutineUContext, base, co_);
+
+    if (s->pool_size < POOL_MAX_SIZE) {
+        QLIST_INSERT_HEAD(&s->pool, &co->base, pool_next);
+        co->base.caller = NULL;
+        s->pool_size++;
+        return;
+    }
+
+    qemu_free(co->stack);
+    qemu_free(co);
+}
+
+CoroutineAction qemu_coroutine_switch(Coroutine *from_, Coroutine *to_,
+                                      CoroutineAction action)
+{
+    CoroutineUContext *from = DO_UPCAST(CoroutineUContext, base, from_);
+    CoroutineUContext *to = DO_UPCAST(CoroutineUContext, base, to_);
+    CoroutineThreadState *s = coroutine_get_thread_state();
+    int ret;
+
+    s->current = to_;
+
+    ret = setjmp(from->env);
+    if (ret == 0) {
+        longjmp(to->env, action);
+    }
+    return ret;
+}
+
+Coroutine *qemu_coroutine_self(void)
+{
+    CoroutineThreadState *s = coroutine_get_thread_state();
+
+    return s->current;
+}
+
+bool qemu_in_coroutine(void)
+{
+    CoroutineThreadState *s = pthread_getspecific(thread_state_key);
+
+    return s && s->current->caller;
+}
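
Because makecontext() only forwards plain int arguments, coroutine_new() splits the CoroutineUContext pointer across two ints and coroutine_trampoline() reassembles it. A stand-alone round trip of that union trick (assuming the usual LP64 case of 64-bit pointers and 32-bit ints that the comment is written for):

    #include <assert.h>
    #include <stdio.h>

    union cc_arg {
        void *p;
        int i[2];
    };

    int main(void)
    {
        int value = 42;
        union cc_arg in = { .p = &value };
        union cc_arg out = { .i = { in.i[0], in.i[1] } };

        /* In the real code the two ints travel through makecontext()'s
         * argument list; copying them here is enough to show the pointer
         * survives the round trip. */
        assert(out.p == &value);
        printf("%d\n", *(int *)out.p);
        return 0;
    }
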
diff --git a/coroutine-win32.c b/coroutine-win32.c
new file mode 100644
index 0000000000..0e29448473
--- /dev/null
+++ b/coroutine-win32.c
@@ -0,0 +1,92 @@
+/*
+ * Win32 coroutine initialization code
+ *
+ * Copyright (c) 2011 Kevin Wolf <kwolf@redhat.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu-common.h"
+#include "qemu-coroutine-int.h"
+
+typedef struct
+{
+    Coroutine base;
+
+    LPVOID fiber;
+    CoroutineAction action;
+} CoroutineWin32;
+
+static __thread CoroutineWin32 leader;
+static __thread Coroutine *current;
+
+CoroutineAction qemu_coroutine_switch(Coroutine *from_, Coroutine *to_,
+                                      CoroutineAction action)
+{
+    CoroutineWin32 *from = DO_UPCAST(CoroutineWin32, base, from_);
+    CoroutineWin32 *to = DO_UPCAST(CoroutineWin32, base, to_);
+
+    current = to_;
+
+    to->action = action;
+    SwitchToFiber(to->fiber);
+    return from->action;
+}
+
+static void CALLBACK coroutine_trampoline(void *co_)
+{
+    Coroutine *co = co_;
+
+    while (true) {
+        co->entry(co->entry_arg);
+        qemu_coroutine_switch(co, co->caller, COROUTINE_TERMINATE);
+    }
+}
+
+Coroutine *qemu_coroutine_new(void)
+{
+    const size_t stack_size = 1 << 20;
+    CoroutineWin32 *co;
+
+    co = qemu_mallocz(sizeof(*co));
+    co->fiber = CreateFiber(stack_size, coroutine_trampoline, &co->base);
+    return &co->base;
+}
+
+void qemu_coroutine_delete(Coroutine *co_)
+{
+    CoroutineWin32 *co = DO_UPCAST(CoroutineWin32, base, co_);
+
+    DeleteFiber(co->fiber);
+    qemu_free(co);
+}
+
+Coroutine *qemu_coroutine_self(void)
+{
+    if (!current) {
+        current = &leader.base;
+        leader.fiber = ConvertThreadToFiber(NULL);
+    }
+    return current;
+}
+
+bool qemu_in_coroutine(void)
+{
+    return current && current->caller;
+}
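
The Win32 backend maps directly onto the fiber API: ConvertThreadToFiber() turns the calling thread into the "leader", CreateFiber() allocates a coroutine with its own stack, and SwitchToFiber() transfers control. A stand-alone sketch of that flow outside QEMU (hypothetical demo, not part of the patch):

    #include <windows.h>
    #include <stdio.h>

    static LPVOID main_fiber;

    static void CALLBACK fiber_proc(void *arg)
    {
        (void)arg;
        printf("fiber: first slice\n");
        SwitchToFiber(main_fiber);      /* yield back to the leader */
        printf("fiber: second slice\n");
        SwitchToFiber(main_fiber);      /* a fiber routine must never return */
    }

    int main(void)
    {
        LPVOID co;

        main_fiber = ConvertThreadToFiber(NULL);   /* cf. qemu_coroutine_self() */
        co = CreateFiber(1 << 20, fiber_proc, NULL);

        SwitchToFiber(co);              /* enter the coroutine */
        printf("main: fiber yielded\n");
        SwitchToFiber(co);              /* resume it */
        printf("main: fiber yielded again\n");

        DeleteFiber(co);                /* safe: the fiber is suspended here */
        return 0;
    }
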
diff --git a/hw/scsi-bus.c b/hw/scsi-bus.c
index 8b1a412210..0b0344c1fd 100644
--- a/hw/scsi-bus.c
+++ b/hw/scsi-bus.c
@@ -223,7 +223,7 @@ static int scsi_req_length(SCSIRequest *req, uint8_t *cmd)
 
     switch(cmd[0]) {
     case TEST_UNIT_READY:
-    case REZERO_UNIT:
+    case REWIND:
     case START_STOP:
     case SEEK_6:
     case WRITE_FILEMARKS:
@@ -232,24 +232,24 @@ static int scsi_req_length(SCSIRequest *req, uint8_t *cmd)
     case RELEASE:
     case ERASE:
     case ALLOW_MEDIUM_REMOVAL:
-    case VERIFY:
+    case VERIFY_10:
     case SEEK_10:
     case SYNCHRONIZE_CACHE:
     case LOCK_UNLOCK_CACHE:
     case LOAD_UNLOAD:
     case SET_CD_SPEED:
     case SET_LIMITS:
-    case WRITE_LONG:
+    case WRITE_LONG_10:
     case MOVE_MEDIUM:
     case UPDATE_BLOCK:
         req->cmd.xfer = 0;
         break;
     case MODE_SENSE:
         break;
-    case WRITE_SAME:
+    case WRITE_SAME_10:
         req->cmd.xfer = 1;
         break;
-    case READ_CAPACITY:
+    case READ_CAPACITY_10:
         req->cmd.xfer = 8;
         break;
     case READ_BLOCK_LIMITS:
@@ -265,7 +265,7 @@ static int scsi_req_length(SCSIRequest *req, uint8_t *cmd)
         req->cmd.xfer *= 8;
         break;
     case WRITE_10:
-    case WRITE_VERIFY:
+    case WRITE_VERIFY_10:
     case WRITE_6:
     case WRITE_12:
     case WRITE_VERIFY_12:
@@ -325,7 +325,7 @@ static void scsi_req_xfer_mode(SCSIRequest *req)
     switch (req->cmd.buf[0]) {
     case WRITE_6:
     case WRITE_10:
-    case WRITE_VERIFY:
+    case WRITE_VERIFY_10:
     case WRITE_12:
     case WRITE_VERIFY_12:
     case WRITE_16:
@@ -345,15 +345,13 @@ static void scsi_req_xfer_mode(SCSIRequest *req)
     case SEARCH_HIGH:
     case SEARCH_LOW:
     case UPDATE_BLOCK:
-    case WRITE_LONG:
-    case WRITE_SAME:
+    case WRITE_LONG_10:
+    case WRITE_SAME_10:
     case SEARCH_HIGH_12:
     case SEARCH_EQUAL_12:
     case SEARCH_LOW_12:
-    case SET_WINDOW:
     case MEDIUM_SCAN:
     case SEND_VOLUME_TAG:
-    case WRITE_LONG_2:
     case PERSISTENT_RESERVE_OUT:
     case MAINTENANCE_OUT:
         req->cmd.mode = SCSI_XFER_TO_DEV;
@@ -517,8 +515,7 @@ static const char *scsi_command_name(uint8_t cmd)
 {
     static const char *names[] = {
         [ TEST_UNIT_READY          ] = "TEST_UNIT_READY",
-        [ REZERO_UNIT              ] = "REZERO_UNIT",
-        /* REWIND and REZERO_UNIT use the same operation code */
+        [ REWIND                   ] = "REWIND",
         [ REQUEST_SENSE            ] = "REQUEST_SENSE",
         [ FORMAT_UNIT              ] = "FORMAT_UNIT",
         [ READ_BLOCK_LIMITS        ] = "READ_BLOCK_LIMITS",
@@ -543,14 +540,12 @@ static const char *scsi_command_name(uint8_t cmd)
         [ RECEIVE_DIAGNOSTIC       ] = "RECEIVE_DIAGNOSTIC",
         [ SEND_DIAGNOSTIC          ] = "SEND_DIAGNOSTIC",
         [ ALLOW_MEDIUM_REMOVAL     ] = "ALLOW_MEDIUM_REMOVAL",
-
-        [ SET_WINDOW               ] = "SET_WINDOW",
-        [ READ_CAPACITY            ] = "READ_CAPACITY",
+        [ READ_CAPACITY_10         ] = "READ_CAPACITY_10",
         [ READ_10                  ] = "READ_10",
         [ WRITE_10                 ] = "WRITE_10",
         [ SEEK_10                  ] = "SEEK_10",
-        [ WRITE_VERIFY             ] = "WRITE_VERIFY",
-        [ VERIFY                   ] = "VERIFY",
+        [ WRITE_VERIFY_10          ] = "WRITE_VERIFY_10",
+        [ VERIFY_10                ] = "VERIFY_10",
         [ SEARCH_HIGH              ] = "SEARCH_HIGH",
         [ SEARCH_EQUAL             ] = "SEARCH_EQUAL",
         [ SEARCH_LOW               ] = "SEARCH_LOW",
@@ -566,11 +561,14 @@ static const char *scsi_command_name(uint8_t cmd)
         [ WRITE_BUFFER             ] = "WRITE_BUFFER",
         [ READ_BUFFER              ] = "READ_BUFFER",
         [ UPDATE_BLOCK             ] = "UPDATE_BLOCK",
-        [ READ_LONG                ] = "READ_LONG",
-        [ WRITE_LONG               ] = "WRITE_LONG",
+        [ READ_LONG_10             ] = "READ_LONG_10",
+        [ WRITE_LONG_10            ] = "WRITE_LONG_10",
         [ CHANGE_DEFINITION        ] = "CHANGE_DEFINITION",
-        [ WRITE_SAME               ] = "WRITE_SAME",
+        [ WRITE_SAME_10            ] = "WRITE_SAME_10",
+        [ UNMAP                    ] = "UNMAP",
         [ READ_TOC                 ] = "READ_TOC",
+        [ REPORT_DENSITY_SUPPORT   ] = "REPORT_DENSITY_SUPPORT",
+        [ GET_CONFIGURATION        ] = "GET_CONFIGURATION",
         [ LOG_SELECT               ] = "LOG_SELECT",
         [ LOG_SENSE                ] = "LOG_SENSE",
         [ MODE_SELECT_10           ] = "MODE_SELECT_10",
@@ -579,27 +577,39 @@ static const char *scsi_command_name(uint8_t cmd)
         [ MODE_SENSE_10            ] = "MODE_SENSE_10",
         [ PERSISTENT_RESERVE_IN    ] = "PERSISTENT_RESERVE_IN",
         [ PERSISTENT_RESERVE_OUT   ] = "PERSISTENT_RESERVE_OUT",
+        [ WRITE_FILEMARKS_16       ] = "WRITE_FILEMARKS_16",
+        [ EXTENDED_COPY            ] = "EXTENDED_COPY",
+        [ ATA_PASSTHROUGH          ] = "ATA_PASSTHROUGH",
+        [ ACCESS_CONTROL_IN        ] = "ACCESS_CONTROL_IN",
+        [ ACCESS_CONTROL_OUT       ] = "ACCESS_CONTROL_OUT",
+        [ READ_16                  ] = "READ_16",
+        [ COMPARE_AND_WRITE        ] = "COMPARE_AND_WRITE",
+        [ WRITE_16                 ] = "WRITE_16",
+        [ WRITE_VERIFY_16          ] = "WRITE_VERIFY_16",
+        [ VERIFY_16                ] = "VERIFY_16",
+        [ SYNCHRONIZE_CACHE_16     ] = "SYNCHRONIZE_CACHE_16",
+        [ LOCATE_16                ] = "LOCATE_16",
+        [ WRITE_SAME_16            ] = "WRITE_SAME_16",
+        [ ERASE_16                 ] = "ERASE_16",
+        [ SERVICE_ACTION_IN        ] = "SERVICE_ACTION_IN",
+        [ WRITE_LONG_16            ] = "WRITE_LONG_16",
+        [ REPORT_LUNS              ] = "REPORT_LUNS",
+        [ BLANK                    ] = "BLANK",
+        [ MAINTENANCE_IN           ] = "MAINTENANCE_IN",
+        [ MAINTENANCE_OUT          ] = "MAINTENANCE_OUT",
         [ MOVE_MEDIUM              ] = "MOVE_MEDIUM",
+        [ LOAD_UNLOAD              ] = "LOAD_UNLOAD",
         [ READ_12                  ] = "READ_12",
         [ WRITE_12                 ] = "WRITE_12",
         [ WRITE_VERIFY_12          ] = "WRITE_VERIFY_12",
+        [ VERIFY_12                ] = "VERIFY_12",
         [ SEARCH_HIGH_12           ] = "SEARCH_HIGH_12",
         [ SEARCH_EQUAL_12          ] = "SEARCH_EQUAL_12",
         [ SEARCH_LOW_12            ] = "SEARCH_LOW_12",
         [ READ_ELEMENT_STATUS      ] = "READ_ELEMENT_STATUS",
         [ SEND_VOLUME_TAG          ] = "SEND_VOLUME_TAG",
-        [ WRITE_LONG_2             ] = "WRITE_LONG_2",
-
-        [ REPORT_DENSITY_SUPPORT   ] = "REPORT_DENSITY_SUPPORT",
-        [ GET_CONFIGURATION        ] = "GET_CONFIGURATION",
-        [ READ_16                  ] = "READ_16",
-        [ WRITE_16                 ] = "WRITE_16",
-        [ WRITE_VERIFY_16          ] = "WRITE_VERIFY_16",
-        [ SERVICE_ACTION_IN        ] = "SERVICE_ACTION_IN",
-        [ REPORT_LUNS              ] = "REPORT_LUNS",
-        [ LOAD_UNLOAD              ] = "LOAD_UNLOAD",
+        [ READ_DEFECT_DATA_12      ] = "READ_DEFECT_DATA_12",
         [ SET_CD_SPEED             ] = "SET_CD_SPEED",
-        [ BLANK                    ] = "BLANK",
     };
 
     if (cmd >= ARRAY_SIZE(names) || names[cmd] == NULL)
diff --git a/hw/scsi-defs.h b/hw/scsi-defs.h
index 413cce07b5..27010b74c0 100644
--- a/hw/scsi-defs.h
+++ b/hw/scsi-defs.h
@@ -25,7 +25,7 @@
  */
 
 #define TEST_UNIT_READY       0x00
-#define REZERO_UNIT           0x01
+#define REWIND                0x01
 #define REQUEST_SENSE         0x03
 #define FORMAT_UNIT           0x04
 #define READ_BLOCK_LIMITS     0x05
@@ -48,14 +48,13 @@
 #define RECEIVE_DIAGNOSTIC    0x1c
 #define SEND_DIAGNOSTIC       0x1d
 #define ALLOW_MEDIUM_REMOVAL  0x1e
-
-#define SET_WINDOW            0x24
-#define READ_CAPACITY         0x25
+#define READ_CAPACITY_10      0x25
 #define READ_10               0x28
 #define WRITE_10              0x2a
 #define SEEK_10               0x2b
-#define WRITE_VERIFY          0x2e
-#define VERIFY                0x2f
+#define LOCATE_10             0x2b
+#define WRITE_VERIFY_10       0x2e
+#define VERIFY_10             0x2f
 #define SEARCH_HIGH           0x30
 #define SEARCH_EQUAL          0x31
 #define SEARCH_LOW            0x32
@@ -71,11 +70,14 @@
 #define WRITE_BUFFER          0x3b
 #define READ_BUFFER           0x3c
 #define UPDATE_BLOCK          0x3d
-#define READ_LONG             0x3e
-#define WRITE_LONG            0x3f
+#define READ_LONG_10          0x3e
+#define WRITE_LONG_10         0x3f
 #define CHANGE_DEFINITION     0x40
-#define WRITE_SAME            0x41
+#define WRITE_SAME_10         0x41
+#define UNMAP                 0x42
 #define READ_TOC              0x43
+#define REPORT_DENSITY_SUPPORT 0x44
+#define GET_CONFIGURATION     0x46
 #define LOG_SELECT            0x4c
 #define LOG_SENSE             0x4d
 #define MODE_SELECT_10        0x55
@@ -84,32 +86,40 @@
 #define MODE_SENSE_10         0x5a
 #define PERSISTENT_RESERVE_IN 0x5e
 #define PERSISTENT_RESERVE_OUT 0x5f
+#define VARLENGTH_CDB         0x7f
+#define WRITE_FILEMARKS_16    0x80
+#define EXTENDED_COPY         0x83
+#define ATA_PASSTHROUGH       0x85
+#define ACCESS_CONTROL_IN     0x86
+#define ACCESS_CONTROL_OUT    0x87
+#define READ_16               0x88
+#define COMPARE_AND_WRITE     0x89
+#define WRITE_16              0x8a
+#define WRITE_VERIFY_16       0x8e
+#define VERIFY_16             0x8f
+#define SYNCHRONIZE_CACHE_16  0x91
+#define LOCATE_16             0x92
 #define WRITE_SAME_16         0x93
+#define ERASE_16              0x93
+#define SERVICE_ACTION_IN     0x9e
+#define WRITE_LONG_16         0x9f
+#define REPORT_LUNS           0xa0
+#define BLANK                 0xa1
 #define MAINTENANCE_IN        0xa3
 #define MAINTENANCE_OUT       0xa4
 #define MOVE_MEDIUM           0xa5
+#define LOAD_UNLOAD           0xa6
 #define READ_12               0xa8
 #define WRITE_12              0xaa
 #define WRITE_VERIFY_12       0xae
+#define VERIFY_12             0xaf
 #define SEARCH_HIGH_12        0xb0
 #define SEARCH_EQUAL_12       0xb1
 #define SEARCH_LOW_12         0xb2
 #define READ_ELEMENT_STATUS   0xb8
 #define SEND_VOLUME_TAG       0xb6
-#define WRITE_LONG_2          0xea
-
-/* from hw/scsi-generic.c */
-#define REWIND 0x01
-#define REPORT_DENSITY_SUPPORT 0x44
-#define GET_CONFIGURATION 0x46
-#define READ_16 0x88
-#define WRITE_16 0x8a
-#define WRITE_VERIFY_16 0x8e
-#define SERVICE_ACTION_IN 0x9e
-#define REPORT_LUNS 0xa0
-#define LOAD_UNLOAD 0xa6
-#define SET_CD_SPEED 0xbb
-#define BLANK 0xa1
+#define READ_DEFECT_DATA_12   0xb7
+#define SET_CD_SPEED          0xbb
 
 /*
  *  SAM Status codes
@@ -154,6 +164,7 @@
 
 #define TYPE_DISK           0x00
 #define TYPE_TAPE           0x01
+#define TYPE_PRINTER        0x02
 #define TYPE_PROCESSOR      0x03    /* HP scanners use this */
 #define TYPE_WORM           0x04    /* Treated as ROM by our system */
 #define TYPE_ROM            0x05
@@ -161,6 +172,9 @@
 #define TYPE_MOD            0x07    /* Magneto-optical disk -
 				     * - treated as TYPE_DISK */
 #define TYPE_MEDIUM_CHANGER 0x08
-#define TYPE_ENCLOSURE	    0x0d    /* Enclosure Services Device */
+#define TYPE_STORAGE_ARRAY  0x0c    /* Storage array device */
+#define TYPE_ENCLOSURE      0x0d    /* Enclosure Services Device */
+#define TYPE_RBC            0x0e    /* Simplified Direct-Access Device */
+#define TYPE_OSD            0x11    /* Object-storage Device */
 #define TYPE_NO_LUN         0x7f
 
diff --git a/hw/scsi-disk.c b/hw/scsi-disk.c
index f42a5d1f85..fa198f928c 100644
--- a/hw/scsi-disk.c
+++ b/hw/scsi-disk.c
@@ -59,8 +59,6 @@ typedef struct SCSIDiskReq {
     uint32_t status;
 } SCSIDiskReq;
 
-typedef enum { SCSI_HD, SCSI_CD } SCSIDriveKind;
-
 struct SCSIDiskState
 {
     SCSIDevice qdev;
@@ -74,7 +72,6 @@ struct SCSIDiskState
     char *version;
     char *serial;
     SCSISense sense;
-    SCSIDriveKind drive_kind;
 };
 
 static int scsi_handle_rw_error(SCSIDiskReq *r, int error, int type);
@@ -382,7 +379,7 @@ static int scsi_disk_emulate_inquiry(SCSIRequest *req, uint8_t *outbuf)
             return -1;
         }
 
-        if (s->drive_kind == SCSI_CD) {
+        if (s->qdev.type == TYPE_ROM) {
             outbuf[buflen++] = 5;
         } else {
             outbuf[buflen++] = 0;
@@ -401,7 +398,7 @@ static int scsi_disk_emulate_inquiry(SCSIRequest *req, uint8_t *outbuf)
             if (s->serial)
                 outbuf[buflen++] = 0x80; // unit serial number
             outbuf[buflen++] = 0x83; // device identification
-            if (s->drive_kind == SCSI_HD) {
+            if (s->qdev.type == TYPE_DISK) {
                 outbuf[buflen++] = 0xb0; // block limits
                 outbuf[buflen++] = 0xb2; // thin provisioning
             }
@@ -460,7 +457,7 @@ static int scsi_disk_emulate_inquiry(SCSIRequest *req, uint8_t *outbuf)
             unsigned int opt_io_size =
                     s->qdev.conf.opt_io_size / s->qdev.blocksize;
 
-            if (s->drive_kind == SCSI_CD) {
+            if (s->qdev.type == TYPE_ROM) {
                 DPRINTF("Inquiry (EVPD[%02X] not supported for CDROM\n",
                         page_code);
                 return -1;
@@ -526,16 +523,15 @@ static int scsi_disk_emulate_inquiry(SCSIRequest *req, uint8_t *outbuf)
     memset(outbuf, 0, buflen);
 
     if (req->lun) {
-        outbuf[0] = 0x7f;	/* LUN not supported */
+        outbuf[0] = 0x7f;       /* LUN not supported */
         return buflen;
     }
 
-    if (s->drive_kind == SCSI_CD) {
-        outbuf[0] = 5;
+    outbuf[0] = s->qdev.type & 0x1f;
+    if (s->qdev.type == TYPE_ROM) {
         outbuf[1] = 0x80;
         memcpy(&outbuf[16], "QEMU CD-ROM     ", 16);
     } else {
-        outbuf[0] = 0;
         outbuf[1] = s->removable ? 0x80 : 0;
         memcpy(&outbuf[16], "QEMU HARDDISK   ", 16);
     }
@@ -661,7 +657,7 @@ static int mode_sense_page(SCSIRequest *req, int page, uint8_t *p,
         return p[1] + 2;
 
     case 0x2a: /* CD Capabilities and Mechanical Status page. */
-        if (s->drive_kind != SCSI_CD)
+        if (s->qdev.type != TYPE_ROM)
             return 0;
         p[0] = 0x2a;
         p[1] = 0x14;
@@ -836,7 +832,7 @@ static int scsi_disk_emulate_command(SCSIDiskReq *r, uint8_t *outbuf)
     case TEST_UNIT_READY:
         if (!bdrv_is_inserted(s->bs))
             goto not_ready;
-	break;
+        break;
     case REQUEST_SENSE:
         if (req->cmd.xfer < 4)
             goto illegal_request;
@@ -848,7 +844,7 @@ static int scsi_disk_emulate_command(SCSIDiskReq *r, uint8_t *outbuf)
         buflen = scsi_disk_emulate_inquiry(req, outbuf);
         if (buflen < 0)
             goto illegal_request;
-	break;
+        break;
     case MODE_SENSE:
     case MODE_SENSE_10:
         buflen = scsi_disk_emulate_mode_sense(req, outbuf);
@@ -877,18 +873,18 @@ static int scsi_disk_emulate_command(SCSIDiskReq *r, uint8_t *outbuf)
             goto illegal_request;
         break;
     case START_STOP:
-        if (s->drive_kind == SCSI_CD && (req->cmd.buf[4] & 2)) {
+        if (s->qdev.type == TYPE_ROM && (req->cmd.buf[4] & 2)) {
             /* load/eject medium */
             bdrv_eject(s->bs, !(req->cmd.buf[4] & 1));
         }
-	break;
+        break;
     case ALLOW_MEDIUM_REMOVAL:
         bdrv_set_locked(s->bs, req->cmd.buf[4] & 1);
-	break;
-    case READ_CAPACITY:
+        break;
+    case READ_CAPACITY_10:
         /* The normal LEN field for this command is zero.  */
-	memset(outbuf, 0, 8);
-	bdrv_get_geometry(s->bs, &nb_sectors);
+        memset(outbuf, 0, 8);
+        bdrv_get_geometry(s->bs, &nb_sectors);
         if (!nb_sectors)
             goto not_ready;
         nb_sectors /= s->cluster_size;
@@ -908,7 +904,7 @@ static int scsi_disk_emulate_command(SCSIDiskReq *r, uint8_t *outbuf)
         outbuf[6] = s->cluster_size * 2;
         outbuf[7] = 0;
         buflen = 8;
-	break;
+        break;
     case SYNCHRONIZE_CACHE:
         ret = bdrv_flush(s->bs);
         if (ret < 0) {
@@ -970,13 +966,7 @@ static int scsi_disk_emulate_command(SCSIDiskReq *r, uint8_t *outbuf)
         outbuf[3] = 8;
         buflen = 16;
         break;
-    case VERIFY:
-        break;
-    case REZERO_UNIT:
-        DPRINTF("Rezero Unit\n");
-        if (!bdrv_is_inserted(s->bs)) {
-            goto not_ready;
-        }
+    case VERIFY_10:
         break;
     default:
         scsi_command_complete(r, CHECK_CONDITION, SENSE_CODE(INVALID_OPCODE));
@@ -1052,14 +1042,13 @@ static int32_t scsi_send_command(SCSIRequest *req, uint8_t *buf)
     case RELEASE_10:
     case START_STOP:
     case ALLOW_MEDIUM_REMOVAL:
-    case READ_CAPACITY:
+    case READ_CAPACITY_10:
     case SYNCHRONIZE_CACHE:
     case READ_TOC:
     case GET_CONFIGURATION:
     case SERVICE_ACTION_IN:
     case REPORT_LUNS:
-    case VERIFY:
-    case REZERO_UNIT:
+    case VERIFY_10:
         rc = scsi_disk_emulate_command(r, outbuf);
         if (rc < 0) {
             return 0;
@@ -1082,7 +1071,7 @@ static int32_t scsi_send_command(SCSIRequest *req, uint8_t *buf)
     case WRITE_10:
     case WRITE_12:
     case WRITE_16:
-    case WRITE_VERIFY:
+    case WRITE_VERIFY_10:
     case WRITE_VERIFY_12:
     case WRITE_VERIFY_16:
         len = r->req.cmd.xfer / s->qdev.blocksize;
@@ -1190,7 +1179,7 @@ static void scsi_destroy(SCSIDevice *dev)
     blockdev_mark_auto_del(s->qdev.conf.bs);
 }
 
-static int scsi_initfn(SCSIDevice *dev, SCSIDriveKind kind)
+static int scsi_initfn(SCSIDevice *dev, uint8_t scsi_type)
 {
     SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, dev);
     DriveInfo *dinfo;
@@ -1200,9 +1189,8 @@ static int scsi_initfn(SCSIDevice *dev, SCSIDriveKind kind)
         return -1;
     }
     s->bs = s->qdev.conf.bs;
-    s->drive_kind = kind;
 
-    if (kind == SCSI_HD && !bdrv_is_inserted(s->bs)) {
+    if (scsi_type == TYPE_DISK && !bdrv_is_inserted(s->bs)) {
         error_report("Device needs media, but drive is empty");
         return -1;
     }
@@ -1224,44 +1212,47 @@ static int scsi_initfn(SCSIDevice *dev, SCSIDriveKind kind)
         return -1;
     }
 
-    if (kind == SCSI_CD) {
+    if (scsi_type == TYPE_ROM) {
         s->qdev.blocksize = 2048;
-    } else {
+    } else if (scsi_type == TYPE_DISK) {
         s->qdev.blocksize = s->qdev.conf.logical_block_size;
+    } else {
+        error_report("scsi-disk: Unhandled SCSI type %02x", scsi_type);
+        return -1;
     }
     s->cluster_size = s->qdev.blocksize / 512;
     s->bs->buffer_alignment = s->qdev.blocksize;
 
-    s->qdev.type = TYPE_DISK;
+    s->qdev.type = scsi_type;
     qemu_add_vm_change_state_handler(scsi_dma_restart_cb, s);
-    bdrv_set_removable(s->bs, kind == SCSI_CD);
+    bdrv_set_removable(s->bs, scsi_type == TYPE_ROM);
     add_boot_device_path(s->qdev.conf.bootindex, &dev->qdev, ",0");
     return 0;
 }
 
 static int scsi_hd_initfn(SCSIDevice *dev)
 {
-    return scsi_initfn(dev, SCSI_HD);
+    return scsi_initfn(dev, TYPE_DISK);
 }
 
 static int scsi_cd_initfn(SCSIDevice *dev)
 {
-    return scsi_initfn(dev, SCSI_CD);
+    return scsi_initfn(dev, TYPE_ROM);
 }
 
 static int scsi_disk_initfn(SCSIDevice *dev)
 {
-    SCSIDriveKind kind;
     DriveInfo *dinfo;
+    uint8_t scsi_type;
 
     if (!dev->conf.bs) {
-        kind = SCSI_HD;         /* will die in scsi_initfn() */
+        scsi_type = TYPE_DISK;  /* will die in scsi_initfn() */
     } else {
         dinfo = drive_get_by_blockdev(dev->conf.bs);
-        kind = dinfo->media_cd ? SCSI_CD : SCSI_HD;
+        scsi_type = dinfo->media_cd ? TYPE_ROM : TYPE_DISK;
     }
 
-    return scsi_initfn(dev, kind);
+    return scsi_initfn(dev, scsi_type);
 }
 
 #define DEFINE_SCSI_DISK_PROPERTIES()                           \
diff --git a/hw/scsi-generic.c b/hw/scsi-generic.c
index 63361b3542..7b0026eb98 100644
--- a/hw/scsi-generic.c
+++ b/hw/scsi-generic.c
@@ -406,7 +406,7 @@ static int get_blocksize(BlockDriverState *bdrv)
 
     memset(cmd, 0, sizeof(cmd));
     memset(buf, 0, sizeof(buf));
-    cmd[0] = READ_CAPACITY;
+    cmd[0] = READ_CAPACITY_10;
 
     memset(&io_header, 0, sizeof(io_header));
     io_header.interface_id = 'S';
diff --git a/hw/virtio-balloon.c b/hw/virtio-balloon.c
index 70a8710343..072a88a382 100644
--- a/hw/virtio-balloon.c
+++ b/hw/virtio-balloon.c
@@ -1,7 +1,9 @@
 /*
- * Virtio Block Device
+ * Virtio Balloon Device
  *
  * Copyright IBM, Corp. 2008
+ * Copyright (C) 2011 Red Hat, Inc.
+ * Copyright (C) 2011 Amit Shah <amit.shah@redhat.com>
  *
  * Authors:
  *  Anthony Liguori   <aliguori@us.ibm.com>
@@ -43,6 +45,7 @@ typedef struct VirtIOBalloon
     size_t stats_vq_offset;
     MonitorCompletion *stats_callback;
     void *stats_opaque_callback_data;
+    DeviceState *qdev;
 } VirtIOBalloon;
 
 static VirtIOBalloon *to_virtio_balloon(VirtIODevice *vdev)
@@ -199,36 +202,44 @@ static uint32_t virtio_balloon_get_features(VirtIODevice *vdev, uint32_t f)
     return f;
 }
 
-static void virtio_balloon_to_target(void *opaque, ram_addr_t target,
-                                     MonitorCompletion cb, void *cb_data)
+static void virtio_balloon_stat(void *opaque, MonitorCompletion cb,
+                                void *cb_data)
 {
     VirtIOBalloon *dev = opaque;
 
-    if (target > ram_size)
-        target = ram_size;
+    /* For now, only allow one request at a time.  This restriction can be
+     * removed later by queueing callback and data pairs.
+     */
+    if (dev->stats_callback != NULL) {
+        return;
+    }
+    dev->stats_callback = cb;
+    dev->stats_opaque_callback_data = cb_data;
+
+    if (ENABLE_GUEST_STATS
+        && (dev->vdev.guest_features & (1 << VIRTIO_BALLOON_F_STATS_VQ))) {
+        virtqueue_push(dev->svq, &dev->stats_vq_elem, dev->stats_vq_offset);
+        virtio_notify(&dev->vdev, dev->svq);
+        return;
+    }
+
+    /* Stats are not supported.  Clear out any stale values that might
+     * have been set by a more featureful guest kernel.
+     */
+    reset_stats(dev);
+    complete_stats_request(dev);
+}
 
+static void virtio_balloon_to_target(void *opaque, ram_addr_t target)
+{
+    VirtIOBalloon *dev = opaque;
+
+    if (target > ram_size) {
+        target = ram_size;
+    }
     if (target) {
         dev->num_pages = (ram_size - target) >> VIRTIO_BALLOON_PFN_SHIFT;
         virtio_notify_config(&dev->vdev);
-    } else {
-        /* For now, only allow one request at a time.  This restriction can be
-         * removed later by queueing callback and data pairs.
-         */
-        if (dev->stats_callback != NULL) {
-            return;
-        }
-        dev->stats_callback = cb;
-        dev->stats_opaque_callback_data = cb_data; 
-        if (ENABLE_GUEST_STATS && (dev->vdev.guest_features & (1 << VIRTIO_BALLOON_F_STATS_VQ))) {
-            virtqueue_push(dev->svq, &dev->stats_vq_elem, dev->stats_vq_offset);
-            virtio_notify(&dev->vdev, dev->svq);
-        } else {
-            /* Stats are not supported.  Clear out any stale values that might
-             * have been set by a more featureful guest kernel.
-             */
-            reset_stats(dev);
-            complete_stats_request(dev);
-        }
     }
 }
 
@@ -259,6 +270,7 @@ static int virtio_balloon_load(QEMUFile *f, void *opaque, int version_id)
 VirtIODevice *virtio_balloon_init(DeviceState *dev)
 {
     VirtIOBalloon *s;
+    int ret;
 
     s = (VirtIOBalloon *)virtio_common_init("virtio-balloon",
                                             VIRTIO_ID_BALLOON,
@@ -268,15 +280,29 @@ VirtIODevice *virtio_balloon_init(DeviceState *dev)
     s->vdev.set_config = virtio_balloon_set_config;
     s->vdev.get_features = virtio_balloon_get_features;
 
+    ret = qemu_add_balloon_handler(virtio_balloon_to_target,
+                                   virtio_balloon_stat, s);
+    if (ret < 0) {
+        virtio_cleanup(&s->vdev);
+        return NULL;
+    }
+
     s->ivq = virtio_add_queue(&s->vdev, 128, virtio_balloon_handle_output);
     s->dvq = virtio_add_queue(&s->vdev, 128, virtio_balloon_handle_output);
     s->svq = virtio_add_queue(&s->vdev, 128, virtio_balloon_receive_stats);
 
     reset_stats(s);
-    qemu_add_balloon_handler(virtio_balloon_to_target, s);
 
+    s->qdev = dev;
     register_savevm(dev, "virtio-balloon", -1, 1,
                     virtio_balloon_save, virtio_balloon_load, s);
 
     return &s->vdev;
 }
+
+void virtio_balloon_exit(VirtIODevice *vdev)
+{
+    VirtIOBalloon *s = DO_UPCAST(VirtIOBalloon, vdev, vdev);
+    unregister_savevm(s->qdev, "virtio-balloon", s);
+    virtio_cleanup(vdev);
+}
diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c
index d685243728..316bf92db0 100644
--- a/hw/virtio-pci.c
+++ b/hw/virtio-pci.c
@@ -788,10 +788,22 @@ static int virtio_balloon_init_pci(PCIDevice *pci_dev)
     VirtIODevice *vdev;
 
     vdev = virtio_balloon_init(&pci_dev->qdev);
+    if (!vdev) {
+        return -1;
+    }
     virtio_init_pci(proxy, vdev);
     return 0;
 }
 
+static int virtio_balloon_exit_pci(PCIDevice *pci_dev)
+{
+    VirtIOPCIProxy *proxy = DO_UPCAST(VirtIOPCIProxy, pci_dev, pci_dev);
+
+    virtio_pci_stop_ioeventfd(proxy);
+    virtio_balloon_exit(proxy->vdev);
+    return virtio_exit_pci(pci_dev);
+}
+
 static PCIDeviceInfo virtio_info[] = {
     {
         .qdev.name = "virtio-blk-pci",
@@ -866,7 +878,7 @@ static PCIDeviceInfo virtio_info[] = {
         .qdev.alias = "virtio-balloon",
         .qdev.size = sizeof(VirtIOPCIProxy),
         .init      = virtio_balloon_init_pci,
-        .exit      = virtio_exit_pci,
+        .exit      = virtio_balloon_exit_pci,
         .vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET,
         .device_id = PCI_DEVICE_ID_VIRTIO_BALLOON,
         .revision  = VIRTIO_PCI_ABI_VERSION,
diff --git a/hw/virtio.h b/hw/virtio.h
index 0fd0bb0ac5..c1292647fe 100644
--- a/hw/virtio.h
+++ b/hw/virtio.h
@@ -213,6 +213,7 @@ VirtIODevice *virtio_9p_init(DeviceState *dev, V9fsConf *conf);
 void virtio_net_exit(VirtIODevice *vdev);
 void virtio_blk_exit(VirtIODevice *vdev);
 void virtio_serial_exit(VirtIODevice *vdev);
+void virtio_balloon_exit(VirtIODevice *vdev);
 
 #define DEFINE_VIRTIO_COMMON_FEATURES(_state, _field) \
 	DEFINE_PROP_BIT("indirect_desc", _state, _field, \
diff --git a/linux-aio.c b/linux-aio.c
index 68f4b3d757..dc3faf2499 100644
--- a/linux-aio.c
+++ b/linux-aio.c
@@ -31,7 +31,6 @@ struct qemu_laiocb {
     struct iocb iocb;
     ssize_t ret;
     size_t nbytes;
-    int async_context_id;
     QLIST_ENTRY(qemu_laiocb) node;
 };
 
@@ -39,7 +38,6 @@ struct qemu_laio_state {
     io_context_t ctx;
     int efd;
     int count;
-    QLIST_HEAD(, qemu_laiocb) completed_reqs;
 };
 
 static inline ssize_t io_event_ret(struct io_event *ev)
@@ -49,7 +47,6 @@ static inline ssize_t io_event_ret(struct io_event *ev)
 
 /*
  * Completes an AIO request (calls the callback and frees the ACB).
- * Be sure to be in the right AsyncContext before calling this function.
  */
 static void qemu_laio_process_completion(struct qemu_laio_state *s,
     struct qemu_laiocb *laiocb)
@@ -72,42 +69,12 @@ static void qemu_laio_process_completion(struct qemu_laio_state *s,
 }
 
 /*
- * Processes all queued AIO requests, i.e. requests that have return from OS
- * but their callback was not called yet. Requests that cannot have their
- * callback called in the current AsyncContext, remain in the queue.
- *
- * Returns 1 if at least one request could be completed, 0 otherwise.
+ * All requests are directly processed when they complete, so there's nothing
+ * left to do during qemu_aio_wait().
  */
 static int qemu_laio_process_requests(void *opaque)
 {
-    struct qemu_laio_state *s = opaque;
-    struct qemu_laiocb *laiocb, *next;
-    int res = 0;
-
-    QLIST_FOREACH_SAFE (laiocb, &s->completed_reqs, node, next) {
-        if (laiocb->async_context_id == get_async_context_id()) {
-            qemu_laio_process_completion(s, laiocb);
-            QLIST_REMOVE(laiocb, node);
-            res = 1;
-        }
-    }
-
-    return res;
-}
-
-/*
- * Puts a request in the completion queue so that its callback is called the
- * next time when it's possible. If we already are in the right AsyncContext,
- * the request is completed immediately instead.
- */
-static void qemu_laio_enqueue_completed(struct qemu_laio_state *s,
-    struct qemu_laiocb* laiocb)
-{
-    if (laiocb->async_context_id == get_async_context_id()) {
-        qemu_laio_process_completion(s, laiocb);
-    } else {
-        QLIST_INSERT_HEAD(&s->completed_reqs, laiocb, node);
-    }
+    return 0;
 }
 
 static void qemu_laio_completion_cb(void *opaque)
@@ -141,7 +108,7 @@ static void qemu_laio_completion_cb(void *opaque)
                     container_of(iocb, struct qemu_laiocb, iocb);
 
             laiocb->ret = io_event_ret(&events[i]);
-            qemu_laio_enqueue_completed(s, laiocb);
+            qemu_laio_process_completion(s, laiocb);
         }
     }
 }
@@ -204,7 +171,6 @@ BlockDriverAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd,
     laiocb->nbytes = nb_sectors * 512;
     laiocb->ctx = s;
     laiocb->ret = -EINPROGRESS;
-    laiocb->async_context_id = get_async_context_id();
 
     iocbs = &laiocb->iocb;
 
@@ -239,7 +205,6 @@ void *laio_init(void)
     struct qemu_laio_state *s;
 
     s = qemu_mallocz(sizeof(*s));
-    QLIST_INIT(&s->completed_reqs);
     s->efd = eventfd(0, 0);
     if (s->efd == -1)
         goto out_free_state;
diff --git a/posix-aio-compat.c b/posix-aio-compat.c
index c4116e30f2..8dc00cbb0f 100644
--- a/posix-aio-compat.c
+++ b/posix-aio-compat.c
@@ -49,8 +49,6 @@ struct qemu_paiocb {
     ssize_t ret;
     int active;
     struct qemu_paiocb *next;
-
-    int async_context_id;
 };
 
 typedef struct PosixAioState {
@@ -200,6 +198,12 @@ static ssize_t handle_aiocb_rw_vector(struct qemu_paiocb *aiocb)
     return len;
 }
 
+/*
+ * Reads/writes the data to/from a given linear buffer.
+ *
+ * Returns the number of bytes handled or -errno in case of an error. Short
+ * reads are only returned if the end of the file is reached.
+ */
 static ssize_t handle_aiocb_rw_linear(struct qemu_paiocb *aiocb, char *buf)
 {
     ssize_t offset = 0;
@@ -336,6 +340,19 @@ static void *aio_thread(void *unused)
 
         switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) {
         case QEMU_AIO_READ:
+            ret = handle_aiocb_rw(aiocb);
+            if (ret >= 0 && ret < aiocb->aio_nbytes && aiocb->common.bs->growable) {
+                /* A short read means that we have reached EOF. Pad the buffer
+                 * with zeros for bytes after EOF. */
+                QEMUIOVector qiov;
+
+                qemu_iovec_init_external(&qiov, aiocb->aio_iov,
+                                         aiocb->aio_niov);
+                qemu_iovec_memset_skip(&qiov, 0, aiocb->aio_nbytes - ret, ret);
+
+                ret = aiocb->aio_nbytes;
+            }
+            break;
         case QEMU_AIO_WRITE:
             ret = handle_aiocb_rw(aiocb);
             break;
@@ -420,7 +437,6 @@ static int posix_aio_process_queue(void *opaque)
     struct qemu_paiocb *acb, **pacb;
     int ret;
     int result = 0;
-    int async_context_id = get_async_context_id();
 
     for(;;) {
         pacb = &s->first_aio;
@@ -429,12 +445,6 @@ static int posix_aio_process_queue(void *opaque)
             if (!acb)
                 return result;
 
-            /* we're only interested in requests in the right context */
-            if (acb->async_context_id != async_context_id) {
-                pacb = &acb->next;
-                continue;
-            }
-
             ret = qemu_paio_error(acb);
             if (ret == ECANCELED) {
                 /* remove the request */
@@ -575,7 +585,6 @@ BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd,
     acb->aio_type = type;
     acb->aio_fildes = fd;
     acb->ev_signo = SIGUSR2;
-    acb->async_context_id = get_async_context_id();
 
     if (qiov) {
         acb->aio_iov = qiov->iov;
@@ -604,7 +613,6 @@ BlockDriverAIOCB *paio_ioctl(BlockDriverState *bs, int fd,
     acb->aio_type = QEMU_AIO_IOCTL;
     acb->aio_fildes = fd;
     acb->ev_signo = SIGUSR2;
-    acb->async_context_id = get_async_context_id();
     acb->aio_offset = 0;
     acb->aio_ioctl_buf = buf;
     acb->aio_ioctl_cmd = req;
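
A note on the QEMU_AIO_READ change above: when the backing file is growable and
the OS returns fewer bytes than requested, the tail of the request now reads as
zeros instead of surfacing a short read to the caller. The following standalone
sketch (not QEMU code; pad_short_read() is an invented name used only for
illustration) applies the same rule to a plain linear buffer:

#include <string.h>
#include <sys/types.h>

/* 'nread' is what the OS returned for a request of 'nbytes' bytes.
 * Negative values are -errno and are passed through unchanged; a short
 * read is treated as EOF and the rest of the buffer is zero-filled,
 * mirroring the growable-file case handled in aio_thread() above. */
static ssize_t pad_short_read(char *buf, ssize_t nread, size_t nbytes)
{
    if (nread < 0 || (size_t)nread >= nbytes) {
        return nread;                       /* error or full read */
    }
    memset(buf + nread, 0, nbytes - nread); /* bytes past EOF read as zeros */
    return nbytes;                          /* report a complete read */
}
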
diff --git a/qemu-common.h b/qemu-common.h
index c7064d3620..afbd04d321 100644
--- a/qemu-common.h
+++ b/qemu-common.h
@@ -115,10 +115,6 @@ int qemu_main(int argc, char **argv, char **envp);
 /* bottom halves */
 typedef void QEMUBHFunc(void *opaque);
 
-void async_context_push(void);
-void async_context_pop(void);
-int get_async_context_id(void);
-
 QEMUBH *qemu_bh_new(QEMUBHFunc *cb, void *opaque);
 void qemu_bh_schedule(QEMUBH *bh);
 /* Bottom halfs that are scheduled from a bottom half handler are instantly
diff --git a/qemu-coroutine-int.h b/qemu-coroutine-int.h
new file mode 100644
index 0000000000..d495615cf6
--- /dev/null
+++ b/qemu-coroutine-int.h
@@ -0,0 +1,49 @@
+/*
+ * Coroutine internals
+ *
+ * Copyright (c) 2011 Kevin Wolf <kwolf@redhat.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef QEMU_COROUTINE_INT_H
+#define QEMU_COROUTINE_INT_H
+
+#include "qemu-queue.h"
+#include "qemu-coroutine.h"
+
+typedef enum {
+    COROUTINE_YIELD = 1,
+    COROUTINE_TERMINATE = 2,
+} CoroutineAction;
+
+struct Coroutine {
+    CoroutineEntry *entry;
+    void *entry_arg;
+    Coroutine *caller;
+    QLIST_ENTRY(Coroutine) pool_next;
+    QTAILQ_ENTRY(Coroutine) co_queue_next;
+};
+
+Coroutine *qemu_coroutine_new(void);
+void qemu_coroutine_delete(Coroutine *co);
+CoroutineAction qemu_coroutine_switch(Coroutine *from, Coroutine *to,
+                                      CoroutineAction action);
+
+#endif
diff --git a/qemu-coroutine-lock.c b/qemu-coroutine-lock.c
new file mode 100644
index 0000000000..a80f437c59
--- /dev/null
+++ b/qemu-coroutine-lock.c
@@ -0,0 +1,117 @@
+/*
+ * coroutine queues and locks
+ *
+ * Copyright (c) 2011 Kevin Wolf <kwolf@redhat.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu-common.h"
+#include "qemu-coroutine.h"
+#include "qemu-coroutine-int.h"
+#include "qemu-queue.h"
+#include "trace.h"
+
+static QTAILQ_HEAD(, Coroutine) unlock_bh_queue =
+    QTAILQ_HEAD_INITIALIZER(unlock_bh_queue);
+static QEMUBH* unlock_bh;
+
+static void qemu_co_queue_next_bh(void *opaque)
+{
+    Coroutine *next;
+
+    trace_qemu_co_queue_next_bh();
+    while ((next = QTAILQ_FIRST(&unlock_bh_queue))) {
+        QTAILQ_REMOVE(&unlock_bh_queue, next, co_queue_next);
+        qemu_coroutine_enter(next, NULL);
+    }
+}
+
+void qemu_co_queue_init(CoQueue *queue)
+{
+    QTAILQ_INIT(&queue->entries);
+
+    if (!unlock_bh) {
+        unlock_bh = qemu_bh_new(qemu_co_queue_next_bh, NULL);
+    }
+}
+
+void coroutine_fn qemu_co_queue_wait(CoQueue *queue)
+{
+    Coroutine *self = qemu_coroutine_self();
+    QTAILQ_INSERT_TAIL(&queue->entries, self, co_queue_next);
+    qemu_coroutine_yield();
+    assert(qemu_in_coroutine());
+}
+
+bool qemu_co_queue_next(CoQueue *queue)
+{
+    Coroutine *next;
+
+    next = QTAILQ_FIRST(&queue->entries);
+    if (next) {
+        QTAILQ_REMOVE(&queue->entries, next, co_queue_next);
+        QTAILQ_INSERT_TAIL(&unlock_bh_queue, next, co_queue_next);
+        trace_qemu_co_queue_next(next);
+        qemu_bh_schedule(unlock_bh);
+    }
+
+    return (next != NULL);
+}
+
+bool qemu_co_queue_empty(CoQueue *queue)
+{
+    return (QTAILQ_FIRST(&queue->entries) == NULL);
+}
+
+void qemu_co_mutex_init(CoMutex *mutex)
+{
+    memset(mutex, 0, sizeof(*mutex));
+    qemu_co_queue_init(&mutex->queue);
+}
+
+void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex)
+{
+    Coroutine *self = qemu_coroutine_self();
+
+    trace_qemu_co_mutex_lock_entry(mutex, self);
+
+    while (mutex->locked) {
+        qemu_co_queue_wait(&mutex->queue);
+    }
+
+    mutex->locked = true;
+
+    trace_qemu_co_mutex_lock_return(mutex, self);
+}
+
+void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex)
+{
+    Coroutine *self = qemu_coroutine_self();
+
+    trace_qemu_co_mutex_unlock_entry(mutex, self);
+
+    assert(mutex->locked == true);
+    assert(qemu_in_coroutine());
+
+    mutex->locked = false;
+    qemu_co_queue_next(&mutex->queue);
+
+    trace_qemu_co_mutex_unlock_return(mutex, self);
+}
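
The CoMutex implemented above is cooperative: qemu_co_mutex_lock() must run in
coroutine context, and qemu_co_mutex_unlock() hands the lock over via a bottom
half, so the next waiter only resumes once the main loop runs BHs. A minimal
usage sketch, assuming it runs inside a QEMU binary with a working main loop
(table_lock, update_table() and start_update() are made-up names, not part of
this patch):

#include "qemu-coroutine.h"

static CoMutex table_lock;          /* protects some shared state */

static void coroutine_fn update_table(void *opaque)
{
    qemu_co_mutex_lock(&table_lock);
    /* ... touch the shared state; other coroutines calling
     * qemu_co_mutex_lock() queue up on table_lock until we unlock ... */
    qemu_co_mutex_unlock(&table_lock);
}

static void start_update(void)      /* called once from main-loop code */
{
    qemu_co_mutex_init(&table_lock);
    qemu_coroutine_enter(qemu_coroutine_create(update_table), NULL);
}
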
diff --git a/qemu-coroutine.c b/qemu-coroutine.c
new file mode 100644
index 0000000000..600be2643c
--- /dev/null
+++ b/qemu-coroutine.c
@@ -0,0 +1,75 @@
+/*
+ * QEMU coroutines
+ *
+ * Copyright IBM, Corp. 2011
+ *
+ * Authors:
+ *  Stefan Hajnoczi    <stefanha@linux.vnet.ibm.com>
+ *  Kevin Wolf         <kwolf@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "trace.h"
+#include "qemu-common.h"
+#include "qemu-coroutine.h"
+#include "qemu-coroutine-int.h"
+
+Coroutine *qemu_coroutine_create(CoroutineEntry *entry)
+{
+    Coroutine *co = qemu_coroutine_new();
+    co->entry = entry;
+    return co;
+}
+
+static void coroutine_swap(Coroutine *from, Coroutine *to)
+{
+    CoroutineAction ret;
+
+    ret = qemu_coroutine_switch(from, to, COROUTINE_YIELD);
+
+    switch (ret) {
+    case COROUTINE_YIELD:
+        return;
+    case COROUTINE_TERMINATE:
+        trace_qemu_coroutine_terminate(to);
+        qemu_coroutine_delete(to);
+        return;
+    default:
+        abort();
+    }
+}
+
+void qemu_coroutine_enter(Coroutine *co, void *opaque)
+{
+    Coroutine *self = qemu_coroutine_self();
+
+    trace_qemu_coroutine_enter(self, co, opaque);
+
+    if (co->caller) {
+        fprintf(stderr, "Co-routine re-entered recursively\n");
+        abort();
+    }
+
+    co->caller = self;
+    co->entry_arg = opaque;
+    coroutine_swap(self, co);
+}
+
+void coroutine_fn qemu_coroutine_yield(void)
+{
+    Coroutine *self = qemu_coroutine_self();
+    Coroutine *to = self->caller;
+
+    trace_qemu_coroutine_yield(self, to);
+
+    if (!to) {
+        fprintf(stderr, "Co-routine is yielding to no one\n");
+        abort();
+    }
+
+    self->caller = NULL;
+    coroutine_swap(self, to);
+}
diff --git a/qemu-coroutine.h b/qemu-coroutine.h
new file mode 100644
index 0000000000..2f2fd95552
--- /dev/null
+++ b/qemu-coroutine.h
@@ -0,0 +1,159 @@
+/*
+ * QEMU coroutine implementation
+ *
+ * Copyright IBM, Corp. 2011
+ *
+ * Authors:
+ *  Stefan Hajnoczi    <stefanha@linux.vnet.ibm.com>
+ *  Kevin Wolf         <kwolf@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#ifndef QEMU_COROUTINE_H
+#define QEMU_COROUTINE_H
+
+#include <stdbool.h>
+#include "qemu-queue.h"
+
+/**
+ * Coroutines are a mechanism for stack switching and can be used for
+ * cooperative userspace threading.  These functions provide a simple but
+ * useful flavor of coroutines that is suitable for writing sequential code,
+ * rather than callbacks, for operations that need to give up control while
+ * waiting for events to complete.
+ *
+ * These functions are re-entrant and may be used outside the global mutex.
+ */
+
+/**
+ * Mark a function that executes in coroutine context
+ *
+ * Functions that execute in coroutine context cannot be called directly from
+ * normal functions.  In the future it would be nice to enable compiler or
+ * static checker support for catching such errors.  This annotation might make
+ * it possible and in the meantime it serves as documentation.
+ *
+ * For example:
+ *
+ *   static void coroutine_fn foo(void) {
+ *       ....
+ *   }
+ */
+#define coroutine_fn
+
+typedef struct Coroutine Coroutine;
+
+/**
+ * Coroutine entry point
+ *
+ * When the coroutine is entered for the first time, opaque is passed in as an
+ * argument.
+ *
+ * When this function returns, the coroutine is destroyed automatically and
+ * execution continues in the caller who last entered the coroutine.
+ */
+typedef void coroutine_fn CoroutineEntry(void *opaque);
+
+/**
+ * Create a new coroutine
+ *
+ * Use qemu_coroutine_enter() to actually transfer control to the coroutine.
+ */
+Coroutine *qemu_coroutine_create(CoroutineEntry *entry);
+
+/**
+ * Transfer control to a coroutine
+ *
+ * The opaque argument is passed as the argument to the entry point when
+ * entering the coroutine for the first time.  It is subsequently ignored.
+ */
+void qemu_coroutine_enter(Coroutine *coroutine, void *opaque);
+
+/**
+ * Transfer control back to a coroutine's caller
+ *
+ * This function does not return until the coroutine is re-entered using
+ * qemu_coroutine_enter().
+ */
+void coroutine_fn qemu_coroutine_yield(void);
+
+/**
+ * Get the currently executing coroutine
+ */
+Coroutine *coroutine_fn qemu_coroutine_self(void);
+
+/**
+ * Return whether or not currently inside a coroutine
+ *
+ * This can be used to write functions that work both when in coroutine context
+ * and when not in coroutine context.  Note that such functions cannot use the
+ * coroutine_fn annotation since they work outside coroutine context.
+ */
+bool qemu_in_coroutine(void);
+
+
+
+/**
+ * CoQueues are a mechanism to queue coroutines in order to continue executing
+ * them later. They provide the fundamental primitives on which coroutine locks
+ * are built.
+ */
+typedef struct CoQueue {
+    QTAILQ_HEAD(, Coroutine) entries;
+} CoQueue;
+
+/**
+ * Initialises a CoQueue. This must be called before any other operation is
+ * performed on the CoQueue.
+ */
+void qemu_co_queue_init(CoQueue *queue);
+
+/**
+ * Adds the current coroutine to the CoQueue and transfers control to the
+ * caller of the coroutine.
+ */
+void coroutine_fn qemu_co_queue_wait(CoQueue *queue);
+
+/**
+ * Restarts the next coroutine in the CoQueue and removes it from the queue.
+ *
+ * Returns true if a coroutine was restarted, false if the queue is empty.
+ */
+bool qemu_co_queue_next(CoQueue *queue);
+
+/**
+ * Checks if the CoQueue is empty.
+ */
+bool qemu_co_queue_empty(CoQueue *queue);
+
+
+/**
+ * Provides a mutex that can be used to synchronise coroutines
+ */
+typedef struct CoMutex {
+    bool locked;
+    CoQueue queue;
+} CoMutex;
+
+/**
+ * Initialises a CoMutex. This must be called before any other operation is
+ * performed on the CoMutex.
+ */
+void qemu_co_mutex_init(CoMutex *mutex);
+
+/**
+ * Locks the mutex. If the lock cannot be taken immediately, control is
+ * transferred to the caller of the current coroutine.
+ */
+void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex);
+
+/**
+ * Unlocks the mutex and schedules the next coroutine that was waiting for
+ * this lock to run.
+ */
+void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex);
+
+#endif /* QEMU_COROUTINE_H */
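
The qemu_in_coroutine() comment above hints at a dispatch pattern where one
entry point serves both contexts: run the work directly when already inside a
coroutine, otherwise spawn a coroutine for it. A minimal sketch of that pattern
(do_work() and do_work_any_context() are invented names, not part of this API):

#include "qemu-coroutine.h"

static void do_work(void)
{
    /* The actual job; it may use coroutine_fn helpers such as
     * qemu_co_mutex_lock() because it only ever runs in coroutine context. */
}

static void coroutine_fn do_work_entry(void *opaque)
{
    do_work();
}

void do_work_any_context(void)
{
    if (qemu_in_coroutine()) {
        do_work();                              /* already inside a coroutine */
    } else {
        Coroutine *co = qemu_coroutine_create(do_work_entry);
        qemu_coroutine_enter(co, NULL);         /* runs until it yields or returns */
    }
}
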
diff --git a/test-coroutine.c b/test-coroutine.c
new file mode 100644
index 0000000000..bf9f3e91b5
--- /dev/null
+++ b/test-coroutine.c
@@ -0,0 +1,192 @@
+/*
+ * Coroutine tests
+ *
+ * Copyright IBM, Corp. 2011
+ *
+ * Authors:
+ *  Stefan Hajnoczi    <stefanha@linux.vnet.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include <glib.h>
+#include "qemu-coroutine.h"
+
+/*
+ * Check that qemu_in_coroutine() works
+ */
+
+static void coroutine_fn verify_in_coroutine(void *opaque)
+{
+    g_assert(qemu_in_coroutine());
+}
+
+static void test_in_coroutine(void)
+{
+    Coroutine *coroutine;
+
+    g_assert(!qemu_in_coroutine());
+
+    coroutine = qemu_coroutine_create(verify_in_coroutine);
+    qemu_coroutine_enter(coroutine, NULL);
+}
+
+/*
+ * Check that qemu_coroutine_self() works
+ */
+
+static void coroutine_fn verify_self(void *opaque)
+{
+    g_assert(qemu_coroutine_self() == opaque);
+}
+
+static void test_self(void)
+{
+    Coroutine *coroutine;
+
+    coroutine = qemu_coroutine_create(verify_self);
+    qemu_coroutine_enter(coroutine, coroutine);
+}
+
+/*
+ * Check that coroutines may nest multiple levels
+ */
+
+typedef struct {
+    unsigned int n_enter;   /* num coroutines entered */
+    unsigned int n_return;  /* num coroutines returned */
+    unsigned int max;       /* maximum level of nesting */
+} NestData;
+
+static void coroutine_fn nest(void *opaque)
+{
+    NestData *nd = opaque;
+
+    nd->n_enter++;
+
+    if (nd->n_enter < nd->max) {
+        Coroutine *child;
+
+        child = qemu_coroutine_create(nest);
+        qemu_coroutine_enter(child, nd);
+    }
+
+    nd->n_return++;
+}
+
+static void test_nesting(void)
+{
+    Coroutine *root;
+    NestData nd = {
+        .n_enter  = 0,
+        .n_return = 0,
+        .max      = 128,
+    };
+
+    root = qemu_coroutine_create(nest);
+    qemu_coroutine_enter(root, &nd);
+
+    /* Must enter and return from max nesting level */
+    g_assert_cmpint(nd.n_enter, ==, nd.max);
+    g_assert_cmpint(nd.n_return, ==, nd.max);
+}
+
+/*
+ * Check that yield/enter transfer control correctly
+ */
+
+static void coroutine_fn yield_5_times(void *opaque)
+{
+    bool *done = opaque;
+    int i;
+
+    for (i = 0; i < 5; i++) {
+        qemu_coroutine_yield();
+    }
+    *done = true;
+}
+
+static void test_yield(void)
+{
+    Coroutine *coroutine;
+    bool done = false;
+    int i = -1; /* one extra time to return from coroutine */
+
+    coroutine = qemu_coroutine_create(yield_5_times);
+    while (!done) {
+        qemu_coroutine_enter(coroutine, &done);
+        i++;
+    }
+    g_assert_cmpint(i, ==, 5); /* coroutine must yield 5 times */
+}
+
+/*
+ * Check that creation, enter, and return work
+ */
+
+static void coroutine_fn set_and_exit(void *opaque)
+{
+    bool *done = opaque;
+
+    *done = true;
+}
+
+static void test_lifecycle(void)
+{
+    Coroutine *coroutine;
+    bool done = false;
+
+    /* Create, enter, and return from coroutine */
+    coroutine = qemu_coroutine_create(set_and_exit);
+    qemu_coroutine_enter(coroutine, &done);
+    g_assert(done); /* expect done to be true (first time) */
+
+    /* Repeat to check that no state affects this test */
+    done = false;
+    coroutine = qemu_coroutine_create(set_and_exit);
+    qemu_coroutine_enter(coroutine, &done);
+    g_assert(done); /* expect done to be true (second time) */
+}
+
+/*
+ * Lifecycle benchmark
+ */
+
+static void coroutine_fn empty_coroutine(void *opaque)
+{
+    /* Do nothing */
+}
+
+static void perf_lifecycle(void)
+{
+    Coroutine *coroutine;
+    unsigned int i, max;
+    double duration;
+
+    max = 1000000;
+
+    g_test_timer_start();
+    for (i = 0; i < max; i++) {
+        coroutine = qemu_coroutine_create(empty_coroutine);
+        qemu_coroutine_enter(coroutine, NULL);
+    }
+    duration = g_test_timer_elapsed();
+
+    g_test_message("Lifecycle %u iterations: %f s\n", max, duration);
+}
+
+int main(int argc, char **argv)
+{
+    g_test_init(&argc, &argv, NULL);
+    g_test_add_func("/basic/lifecycle", test_lifecycle);
+    g_test_add_func("/basic/yield", test_yield);
+    g_test_add_func("/basic/nesting", test_nesting);
+    g_test_add_func("/basic/self", test_self);
+    g_test_add_func("/basic/in_coroutine", test_in_coroutine);
+    if (g_test_perf()) {
+        g_test_add_func("/perf/lifecycle", perf_lifecycle);
+    }
+    return g_test_run();
+}
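
A usage note on the tests above: the binary runs standalone (./test-coroutine)
and registers the /basic/* cases unconditionally, while /perf/lifecycle is only
added when g_test_perf() reports that GLib's performance mode is active; if I
recall the GLib test harness correctly, that mode is selected on the command
line with -m perf.
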
diff --git a/trace-events b/trace-events
index 713f042081..19d31e3541 100644
--- a/trace-events
+++ b/trace-events
@@ -66,6 +66,9 @@ disable bdrv_aio_flush(void *bs, void *opaque) "bs %p opaque %p"
 disable bdrv_aio_readv(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p"
 disable bdrv_aio_writev(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p"
 disable bdrv_set_locked(void *bs, int locked) "bs %p locked %d"
+disable bdrv_co_readv(void *bs, int64_t sector_num, int nb_sectors) "bs %p sector_num %"PRId64" nb_sectors %d"
+disable bdrv_co_writev(void *bs, int64_t sector_num, int nb_sectors) "bs %p sector_num %"PRId64" nb_sectors %d"
+disable bdrv_co_io(int is_write, void *acb) "is_write %d acb %p"
 
 # hw/virtio-blk.c
 disable virtio_blk_req_complete(void *req, int status) "req %p status %d"
@@ -425,3 +428,16 @@ disable qemu_put_ram_ptr(void* addr) "%p"
 
 # hw/xen_platform.c
 disable xen_platform_log(char *s) "xen platform: %s"
+
+# qemu-coroutine.c
+disable qemu_coroutine_enter(void *from, void *to, void *opaque) "from %p to %p opaque %p"
+disable qemu_coroutine_yield(void *from, void *to) "from %p to %p"
+disable qemu_coroutine_terminate(void *co) "self %p"
+
+# qemu-coroutine-lock.c
+disable qemu_co_queue_next_bh(void) ""
+disable qemu_co_queue_next(void *next) "next %p"
+disable qemu_co_mutex_lock_entry(void *mutex, void *self) "mutex %p self %p"
+disable qemu_co_mutex_lock_return(void *mutex, void *self) "mutex %p self %p"
+disable qemu_co_mutex_unlock_entry(void *mutex, void *self) "mutex %p self %p"
+disable qemu_co_mutex_unlock_return(void *mutex, void *self) "mutex %p self %p"