summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r--block/raw-posix-aio.h36
-rw-r--r--block/raw-posix.c275
-rw-r--r--posix-aio-compat.c325
-rw-r--r--posix-aio-compat.h68
4 files changed, 327 insertions, 377 deletions
diff --git a/block/raw-posix-aio.h b/block/raw-posix-aio.h
new file mode 100644
index 0000000000..6761cd39f3
--- /dev/null
+++ b/block/raw-posix-aio.h
@@ -0,0 +1,36 @@
+/*
+ * QEMU Posix block I/O backend AIO support
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+#ifndef QEMU_RAW_POSIX_AIO_H
+#define QEMU_RAW_POSIX_AIO_H
+
+/* AIO request types */
+#define QEMU_AIO_READ         0x0001
+#define QEMU_AIO_WRITE        0x0002
+#define QEMU_AIO_IOCTL        0x0004
+#define QEMU_AIO_TYPE_MASK \
+	(QEMU_AIO_READ|QEMU_AIO_WRITE|QEMU_AIO_IOCTL)
+
+/* AIO flags */
+#define QEMU_AIO_MISALIGNED   0x1000
+
+
+/* posix-aio-compat.c - thread pool based implementation */
+void *paio_init(void);
+BlockDriverAIOCB *paio_submit(BlockDriverState *bs, void *aio_ctx, int fd,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque, int type);
+BlockDriverAIOCB *paio_ioctl(BlockDriverState *bs, int fd,
+        unsigned long int req, void *buf,
+        BlockDriverCompletionFunc *cb, void *opaque);
+
+#endif /* QEMU_RAW_POSIX_AIO_H */
diff --git a/block/raw-posix.c b/block/raw-posix.c
index ab43589402..ca9bc616a7 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -27,7 +27,7 @@
 #include "qemu-log.h"
 #include "block_int.h"
 #include "module.h"
-#include "posix-aio-compat.h"
+#include "block/raw-posix-aio.h"
 
 #ifdef CONFIG_COCOA
 #include <paths.h>
@@ -107,6 +107,7 @@ typedef struct BDRVRawState {
     int type;
     unsigned int lseek_err_cnt;
     int open_flags;
+    void *aio_ctx;
 #if defined(__linux__)
     /* linux floppy specific */
     int64_t fd_open_time;
@@ -117,8 +118,6 @@ typedef struct BDRVRawState {
     uint8_t* aligned_buf;
 } BDRVRawState;
 
-static int posix_aio_init(void);
-
 static int fd_open(BlockDriverState *bs);
 static int64_t raw_getlength(BlockDriverState *bs);
 
@@ -132,8 +131,6 @@ static int raw_open_common(BlockDriverState *bs, const char *filename,
     BDRVRawState *s = bs->opaque;
     int fd, ret;
 
-    posix_aio_init();
-
     s->lseek_err_cnt = 0;
 
     s->open_flags = open_flags | O_BINARY;
@@ -165,12 +162,22 @@ static int raw_open_common(BlockDriverState *bs, const char *filename,
     if ((bdrv_flags & BDRV_O_NOCACHE)) {
         s->aligned_buf = qemu_blockalign(bs, ALIGNED_BUFFER_SIZE);
         if (s->aligned_buf == NULL) {
-            ret = -errno;
-            close(fd);
-            return ret;
+            goto out_close;
         }
     }
+
+    s->aio_ctx = paio_init();
+    if (!s->aio_ctx) {
+        goto out_free_buf;
+    }
+
     return 0;
+
+out_free_buf:
+    qemu_vfree(s->aligned_buf);
+out_close:
+    close(fd);
+    return -errno;
 }
 
 static int raw_open(BlockDriverState *bs, const char *filename, int flags)
@@ -487,240 +494,58 @@ static int raw_write(BlockDriverState *bs, int64_t sector_num,
     return ret;
 }
 
-/***********************************************************/
-/* Unix AIO using POSIX AIO */
-
-typedef struct RawAIOCB {
-    BlockDriverAIOCB common;
-    struct qemu_paiocb aiocb;
-    struct RawAIOCB *next;
-    int ret;
-} RawAIOCB;
-
-typedef struct PosixAioState
-{
-    int rfd, wfd;
-    RawAIOCB *first_aio;
-} PosixAioState;
-
-static void posix_aio_read(void *opaque)
-{
-    PosixAioState *s = opaque;
-    RawAIOCB *acb, **pacb;
-    int ret;
-    ssize_t len;
-
-    /* read all bytes from signal pipe */
-    for (;;) {
-        char bytes[16];
-
-        len = read(s->rfd, bytes, sizeof(bytes));
-        if (len == -1 && errno == EINTR)
-            continue; /* try again */
-        if (len == sizeof(bytes))
-            continue; /* more to read */
-        break;
-    }
-
-    for(;;) {
-        pacb = &s->first_aio;
-        for(;;) {
-            acb = *pacb;
-            if (!acb)
-                goto the_end;
-            ret = qemu_paio_error(&acb->aiocb);
-            if (ret == ECANCELED) {
-                /* remove the request */
-                *pacb = acb->next;
-                qemu_aio_release(acb);
-            } else if (ret != EINPROGRESS) {
-                /* end of aio */
-                if (ret == 0) {
-                    ret = qemu_paio_return(&acb->aiocb);
-                    if (ret == acb->aiocb.aio_nbytes)
-                        ret = 0;
-                    else
-                        ret = -EINVAL;
-                } else {
-                    ret = -ret;
-                }
-                /* remove the request */
-                *pacb = acb->next;
-                /* call the callback */
-                acb->common.cb(acb->common.opaque, ret);
-                qemu_aio_release(acb);
-                break;
-            } else {
-                pacb = &acb->next;
-            }
-        }
-    }
- the_end: ;
-}
-
-static int posix_aio_flush(void *opaque)
-{
-    PosixAioState *s = opaque;
-    return !!s->first_aio;
-}
-
-static PosixAioState *posix_aio_state;
-
-static void aio_signal_handler(int signum)
-{
-    if (posix_aio_state) {
-        char byte = 0;
-
-        write(posix_aio_state->wfd, &byte, sizeof(byte));
-    }
-
-    qemu_service_io();
-}
-
-static int posix_aio_init(void)
+/*
+ * Check if all memory in this vector is sector aligned.
+ */
+static int qiov_is_aligned(QEMUIOVector *qiov)
 {
-    struct sigaction act;
-    PosixAioState *s;
-    int fds[2];
-    struct qemu_paioinit ai;
-  
-    if (posix_aio_state)
-        return 0;
-
-    s = qemu_malloc(sizeof(PosixAioState));
-
-    sigfillset(&act.sa_mask);
-    act.sa_flags = 0; /* do not restart syscalls to interrupt select() */
-    act.sa_handler = aio_signal_handler;
-    sigaction(SIGUSR2, &act, NULL);
-
-    s->first_aio = NULL;
-    if (pipe(fds) == -1) {
-        fprintf(stderr, "failed to create pipe\n");
-        return -errno;
-    }
-
-    s->rfd = fds[0];
-    s->wfd = fds[1];
-
-    fcntl(s->rfd, F_SETFL, O_NONBLOCK);
-    fcntl(s->wfd, F_SETFL, O_NONBLOCK);
-
-    qemu_aio_set_fd_handler(s->rfd, posix_aio_read, NULL, posix_aio_flush, s);
-
-    memset(&ai, 0, sizeof(ai));
-    ai.aio_threads = 64;
-    ai.aio_num = 64;
-    qemu_paio_init(&ai);
-
-    posix_aio_state = s;
-
-    return 0;
-}
+    int i;
 
-static void raw_aio_remove(RawAIOCB *acb)
-{
-    RawAIOCB **pacb;
-
-    /* remove the callback from the queue */
-    pacb = &posix_aio_state->first_aio;
-    for(;;) {
-        if (*pacb == NULL) {
-            fprintf(stderr, "raw_aio_remove: aio request not found!\n");
-            break;
-        } else if (*pacb == acb) {
-            *pacb = acb->next;
-            qemu_aio_release(acb);
-            break;
+    for (i = 0; i < qiov->niov; i++) {
+        if ((uintptr_t) qiov->iov[i].iov_base % 512) {
+            return 0;
         }
-        pacb = &(*pacb)->next;
     }
-}
-
-static void raw_aio_cancel(BlockDriverAIOCB *blockacb)
-{
-    int ret;
-    RawAIOCB *acb = (RawAIOCB *)blockacb;
 
-    ret = qemu_paio_cancel(acb->aiocb.aio_fildes, &acb->aiocb);
-    if (ret == QEMU_PAIO_NOTCANCELED) {
-        /* fail safe: if the aio could not be canceled, we wait for
-           it */
-        while (qemu_paio_error(&acb->aiocb) == EINPROGRESS);
-    }
-
-    raw_aio_remove(acb);
+    return 1;
 }
 
-static AIOPool raw_aio_pool = {
-    .aiocb_size         = sizeof(RawAIOCB),
-    .cancel             = raw_aio_cancel,
-};
-
-static RawAIOCB *raw_aio_setup(BlockDriverState *bs, int64_t sector_num,
-        QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque)
+static BlockDriverAIOCB *raw_aio_submit(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque, int type)
 {
     BDRVRawState *s = bs->opaque;
-    RawAIOCB *acb;
 
     if (fd_open(bs) < 0)
         return NULL;
 
-    acb = qemu_aio_get(&raw_aio_pool, bs, cb, opaque);
-    if (!acb)
-        return NULL;
-    acb->aiocb.aio_fildes = s->fd;
-    acb->aiocb.ev_signo = SIGUSR2;
-    acb->aiocb.aio_iov = qiov->iov;
-    acb->aiocb.aio_niov = qiov->niov;
-    acb->aiocb.aio_nbytes = nb_sectors * 512;
-    acb->aiocb.aio_offset = sector_num * 512;
-    acb->aiocb.aio_flags = 0;
-
     /*
      * If O_DIRECT is used the buffer needs to be aligned on a sector
-     * boundary. Tell the low level code to ensure that in case it's
-     * not done yet.
+     * boundary.  Check if this is the case or telll the low-level
+     * driver that it needs to copy the buffer.
      */
-    if (s->aligned_buf)
-        acb->aiocb.aio_flags |= QEMU_AIO_SECTOR_ALIGNED;
+    if (s->aligned_buf && !qiov_is_aligned(qiov)) {
+        type |= QEMU_AIO_MISALIGNED;
+    }
 
-    acb->next = posix_aio_state->first_aio;
-    posix_aio_state->first_aio = acb;
-    return acb;
+    return paio_submit(bs, s->aio_ctx, s->fd, sector_num, qiov, nb_sectors,
+                       cb, opaque, type);
 }
 
 static BlockDriverAIOCB *raw_aio_readv(BlockDriverState *bs,
         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
         BlockDriverCompletionFunc *cb, void *opaque)
 {
-    RawAIOCB *acb;
-
-    acb = raw_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque);
-    if (!acb)
-        return NULL;
-    if (qemu_paio_read(&acb->aiocb) < 0) {
-        raw_aio_remove(acb);
-        return NULL;
-    }
-    return &acb->common;
+    return raw_aio_submit(bs, sector_num, qiov, nb_sectors,
+                          cb, opaque, QEMU_AIO_READ);
 }
 
 static BlockDriverAIOCB *raw_aio_writev(BlockDriverState *bs,
         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
         BlockDriverCompletionFunc *cb, void *opaque)
 {
-    RawAIOCB *acb;
-
-    acb = raw_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque);
-    if (!acb)
-        return NULL;
-    if (qemu_paio_write(&acb->aiocb) < 0) {
-        raw_aio_remove(acb);
-        return NULL;
-    }
-    return &acb->common;
+    return raw_aio_submit(bs, sector_num, qiov, nb_sectors,
+                          cb, opaque, QEMU_AIO_WRITE);
 }
 
 static void raw_close(BlockDriverState *bs)
@@ -1085,30 +910,10 @@ static BlockDriverAIOCB *hdev_aio_ioctl(BlockDriverState *bs,
         BlockDriverCompletionFunc *cb, void *opaque)
 {
     BDRVRawState *s = bs->opaque;
-    RawAIOCB *acb;
 
     if (fd_open(bs) < 0)
         return NULL;
-
-    acb = qemu_aio_get(&raw_aio_pool, bs, cb, opaque);
-    if (!acb)
-        return NULL;
-    acb->aiocb.aio_fildes = s->fd;
-    acb->aiocb.ev_signo = SIGUSR2;
-    acb->aiocb.aio_offset = 0;
-    acb->aiocb.aio_flags = 0;
-
-    acb->next = posix_aio_state->first_aio;
-    posix_aio_state->first_aio = acb;
-
-    acb->aiocb.aio_ioctl_buf = buf;
-    acb->aiocb.aio_ioctl_cmd = req;
-    if (qemu_paio_ioctl(&acb->aiocb) < 0) {
-        raw_aio_remove(acb);
-        return NULL;
-    }
-
-    return &acb->common;
+    return paio_ioctl(bs, s->fd, req, buf, cb, opaque);
 }
 
 #elif defined(__FreeBSD__)
@@ -1189,8 +994,6 @@ static int floppy_open(BlockDriverState *bs, const char *filename, int flags)
     BDRVRawState *s = bs->opaque;
     int ret;
 
-    posix_aio_init();
-
     s->type = FTYPE_FD;
 
     /* open will not fail even if no floppy is inserted, so add O_NONBLOCK */
diff --git a/posix-aio-compat.c b/posix-aio-compat.c
index 3d79f59854..5ea197f668 100644
--- a/posix-aio-compat.c
+++ b/posix-aio-compat.c
@@ -12,17 +12,49 @@
  */
 
 #include <sys/ioctl.h>
+#include <sys/types.h>
 #include <pthread.h>
 #include <unistd.h>
 #include <errno.h>
 #include <time.h>
+#include <signal.h>
 #include <string.h>
 #include <stdlib.h>
 #include <stdio.h>
+
+#include "sys-queue.h"
 #include "osdep.h"
 #include "qemu-common.h"
+#include "block_int.h"
+
+#include "block/raw-posix-aio.h"
+
+
+struct qemu_paiocb {
+    BlockDriverAIOCB common;
+    int aio_fildes;
+    union {
+        struct iovec *aio_iov;
+	void *aio_ioctl_buf;
+    };
+    int aio_niov;
+    size_t aio_nbytes;
+#define aio_ioctl_cmd   aio_nbytes /* for QEMU_AIO_IOCTL */
+    int ev_signo;
+    off_t aio_offset;
+
+    TAILQ_ENTRY(qemu_paiocb) node;
+    int aio_type;
+    ssize_t ret;
+    int active;
+    struct qemu_paiocb *next;
+};
+
+typedef struct PosixAioState {
+    int rfd, wfd;
+    struct qemu_paiocb *first_aio;
+} PosixAioState;
 
-#include "posix-aio-compat.h"
 
 static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
 static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
@@ -132,30 +164,13 @@ qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
 
 #endif
 
-/*
- * Check if we need to copy the data in the aiocb into a new
- * properly aligned buffer.
- */
-static int aiocb_needs_copy(struct qemu_paiocb *aiocb)
-{
-    if (aiocb->aio_flags & QEMU_AIO_SECTOR_ALIGNED) {
-        int i;
-
-        for (i = 0; i < aiocb->aio_niov; i++)
-            if ((uintptr_t) aiocb->aio_iov[i].iov_base % 512)
-                return 1;
-    }
-
-    return 0;
-}
-
 static size_t handle_aiocb_rw_vector(struct qemu_paiocb *aiocb)
 {
     size_t offset = 0;
     ssize_t len;
 
     do {
-        if (aiocb->aio_type == QEMU_PAIO_WRITE)
+        if (aiocb->aio_type & QEMU_AIO_WRITE)
             len = qemu_pwritev(aiocb->aio_fildes,
                                aiocb->aio_iov,
                                aiocb->aio_niov,
@@ -178,7 +193,7 @@ static size_t handle_aiocb_rw_linear(struct qemu_paiocb *aiocb, char *buf)
     size_t len;
 
     while (offset < aiocb->aio_nbytes) {
-         if (aiocb->aio_type == QEMU_PAIO_WRITE)
+         if (aiocb->aio_type & QEMU_AIO_WRITE)
              len = pwrite(aiocb->aio_fildes,
                           (const char *)buf + offset,
                           aiocb->aio_nbytes - offset,
@@ -208,7 +223,7 @@ static size_t handle_aiocb_rw(struct qemu_paiocb *aiocb)
     size_t nbytes;
     char *buf;
 
-    if (!aiocb_needs_copy(aiocb)) {
+    if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) {
         /*
          * If there is just a single buffer, and it is properly aligned
          * we can just use plain pread/pwrite without any problems.
@@ -243,7 +258,7 @@ static size_t handle_aiocb_rw(struct qemu_paiocb *aiocb)
      * a single aligned buffer.
      */
     buf = qemu_memalign(512, aiocb->aio_nbytes);
-    if (aiocb->aio_type == QEMU_PAIO_WRITE) {
+    if (aiocb->aio_type & QEMU_AIO_WRITE) {
         char *p = buf;
         int i;
 
@@ -254,7 +269,7 @@ static size_t handle_aiocb_rw(struct qemu_paiocb *aiocb)
     }
 
     nbytes = handle_aiocb_rw_linear(aiocb, buf);
-    if (aiocb->aio_type != QEMU_PAIO_WRITE) {
+    if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
         char *p = buf;
         size_t count = aiocb->aio_nbytes, copy;
         int i;
@@ -310,12 +325,12 @@ static void *aio_thread(void *unused)
         idle_threads--;
         mutex_unlock(&lock);
 
-        switch (aiocb->aio_type) {
-        case QEMU_PAIO_READ:
-        case QEMU_PAIO_WRITE:
+        switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) {
+        case QEMU_AIO_READ:
+        case QEMU_AIO_WRITE:
 		ret = handle_aiocb_rw(aiocb);
 		break;
-        case QEMU_PAIO_IOCTL:
+        case QEMU_AIO_IOCTL:
 		ret = handle_aiocb_ioctl(aiocb);
 		break;
 	default:
@@ -346,24 +361,8 @@ static void spawn_thread(void)
     thread_create(&thread_id, &attr, aio_thread, NULL);
 }
 
-int qemu_paio_init(struct qemu_paioinit *aioinit)
+static void qemu_paio_submit(struct qemu_paiocb *aiocb)
 {
-    int ret;
-
-    ret = pthread_attr_init(&attr);
-    if (ret) die2(ret, "pthread_attr_init");
-
-    ret = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
-    if (ret) die2(ret, "pthread_attr_setdetachstate");
-
-    TAILQ_INIT(&request_list);
-
-    return 0;
-}
-
-static int qemu_paio_submit(struct qemu_paiocb *aiocb, int type)
-{
-    aiocb->aio_type = type;
     aiocb->ret = -EINPROGRESS;
     aiocb->active = 0;
     mutex_lock(&lock);
@@ -372,26 +371,9 @@ static int qemu_paio_submit(struct qemu_paiocb *aiocb, int type)
     TAILQ_INSERT_TAIL(&request_list, aiocb, node);
     mutex_unlock(&lock);
     cond_signal(&cond);
-
-    return 0;
 }
 
-int qemu_paio_read(struct qemu_paiocb *aiocb)
-{
-    return qemu_paio_submit(aiocb, QEMU_PAIO_READ);
-}
-
-int qemu_paio_write(struct qemu_paiocb *aiocb)
-{
-    return qemu_paio_submit(aiocb, QEMU_PAIO_WRITE);
-}
-
-int qemu_paio_ioctl(struct qemu_paiocb *aiocb)
-{
-    return qemu_paio_submit(aiocb, QEMU_PAIO_IOCTL);
-}
-
-ssize_t qemu_paio_return(struct qemu_paiocb *aiocb)
+static ssize_t qemu_paio_return(struct qemu_paiocb *aiocb)
 {
     ssize_t ret;
 
@@ -402,7 +384,7 @@ ssize_t qemu_paio_return(struct qemu_paiocb *aiocb)
     return ret;
 }
 
-int qemu_paio_error(struct qemu_paiocb *aiocb)
+static int qemu_paio_error(struct qemu_paiocb *aiocb)
 {
     ssize_t ret = qemu_paio_return(aiocb);
 
@@ -414,20 +396,217 @@ int qemu_paio_error(struct qemu_paiocb *aiocb)
     return ret;
 }
 
-int qemu_paio_cancel(int fd, struct qemu_paiocb *aiocb)
+static void posix_aio_read(void *opaque)
 {
+    PosixAioState *s = opaque;
+    struct qemu_paiocb *acb, **pacb;
     int ret;
+    ssize_t len;
+
+    /* read all bytes from signal pipe */
+    for (;;) {
+        char bytes[16];
+
+        len = read(s->rfd, bytes, sizeof(bytes));
+        if (len == -1 && errno == EINTR)
+            continue; /* try again */
+        if (len == sizeof(bytes))
+            continue; /* more to read */
+        break;
+    }
+
+    for(;;) {
+        pacb = &s->first_aio;
+        for(;;) {
+            acb = *pacb;
+            if (!acb)
+                goto the_end;
+            ret = qemu_paio_error(acb);
+            if (ret == ECANCELED) {
+                /* remove the request */
+                *pacb = acb->next;
+                qemu_aio_release(acb);
+            } else if (ret != EINPROGRESS) {
+                /* end of aio */
+                if (ret == 0) {
+                    ret = qemu_paio_return(acb);
+                    if (ret == acb->aio_nbytes)
+                        ret = 0;
+                    else
+                        ret = -EINVAL;
+                } else {
+                    ret = -ret;
+                }
+                /* remove the request */
+                *pacb = acb->next;
+                /* call the callback */
+                acb->common.cb(acb->common.opaque, ret);
+                qemu_aio_release(acb);
+                break;
+            } else {
+                pacb = &acb->next;
+            }
+        }
+    }
+ the_end: ;
+}
+
+static int posix_aio_flush(void *opaque)
+{
+    PosixAioState *s = opaque;
+    return !!s->first_aio;
+}
+
+static PosixAioState *posix_aio_state;
+
+static void aio_signal_handler(int signum)
+{
+    if (posix_aio_state) {
+        char byte = 0;
+
+        write(posix_aio_state->wfd, &byte, sizeof(byte));
+    }
+
+    qemu_service_io();
+}
+
+static void paio_remove(struct qemu_paiocb *acb)
+{
+    struct qemu_paiocb **pacb;
+
+    /* remove the callback from the queue */
+    pacb = &posix_aio_state->first_aio;
+    for(;;) {
+        if (*pacb == NULL) {
+            fprintf(stderr, "paio_remove: aio request not found!\n");
+            break;
+        } else if (*pacb == acb) {
+            *pacb = acb->next;
+            qemu_aio_release(acb);
+            break;
+        }
+        pacb = &(*pacb)->next;
+    }
+}
+
+static void paio_cancel(BlockDriverAIOCB *blockacb)
+{
+    struct qemu_paiocb *acb = (struct qemu_paiocb *)blockacb;
+    int active = 0;
 
     mutex_lock(&lock);
-    if (!aiocb->active) {
-        TAILQ_REMOVE(&request_list, aiocb, node);
-        aiocb->ret = -ECANCELED;
-        ret = QEMU_PAIO_CANCELED;
-    } else if (aiocb->ret == -EINPROGRESS)
-        ret = QEMU_PAIO_NOTCANCELED;
-    else
-        ret = QEMU_PAIO_ALLDONE;
+    if (!acb->active) {
+        TAILQ_REMOVE(&request_list, acb, node);
+        acb->ret = -ECANCELED;
+    } else if (acb->ret == -EINPROGRESS) {
+        active = 1;
+    }
     mutex_unlock(&lock);
 
-    return ret;
+    if (active) {
+        /* fail safe: if the aio could not be canceled, we wait for
+           it */
+        while (qemu_paio_error(acb) == EINPROGRESS)
+            ;
+    }
+
+    paio_remove(acb);
+}
+
+static AIOPool raw_aio_pool = {
+    .aiocb_size         = sizeof(struct qemu_paiocb),
+    .cancel             = paio_cancel,
+};
+
+BlockDriverAIOCB *paio_submit(BlockDriverState *bs, void *aio_ctx, int fd,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque, int type)
+{
+    struct qemu_paiocb *acb;
+
+    acb = qemu_aio_get(&raw_aio_pool, bs, cb, opaque);
+    if (!acb)
+        return NULL;
+    acb->aio_type = type;
+    acb->aio_fildes = fd;
+    acb->ev_signo = SIGUSR2;
+    acb->aio_iov = qiov->iov;
+    acb->aio_niov = qiov->niov;
+    acb->aio_nbytes = nb_sectors * 512;
+    acb->aio_offset = sector_num * 512;
+
+    acb->next = posix_aio_state->first_aio;
+    posix_aio_state->first_aio = acb;
+
+    qemu_paio_submit(acb);
+    return &acb->common;
+}
+
+BlockDriverAIOCB *paio_ioctl(BlockDriverState *bs, int fd,
+        unsigned long int req, void *buf,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    struct qemu_paiocb *acb;
+
+    acb = qemu_aio_get(&raw_aio_pool, bs, cb, opaque);
+    if (!acb)
+        return NULL;
+    acb->aio_type = QEMU_AIO_IOCTL;
+    acb->aio_fildes = fd;
+    acb->ev_signo = SIGUSR2;
+    acb->aio_offset = 0;
+    acb->aio_ioctl_buf = buf;
+    acb->aio_ioctl_cmd = req;
+
+    acb->next = posix_aio_state->first_aio;
+    posix_aio_state->first_aio = acb;
+
+    qemu_paio_submit(acb);
+    return &acb->common;
+}
+
+void *paio_init(void)
+{
+    struct sigaction act;
+    PosixAioState *s;
+    int fds[2];
+    int ret;
+
+    if (posix_aio_state)
+        return posix_aio_state;
+
+    s = qemu_malloc(sizeof(PosixAioState));
+
+    sigfillset(&act.sa_mask);
+    act.sa_flags = 0; /* do not restart syscalls to interrupt select() */
+    act.sa_handler = aio_signal_handler;
+    sigaction(SIGUSR2, &act, NULL);
+
+    s->first_aio = NULL;
+    if (pipe(fds) == -1) {
+        fprintf(stderr, "failed to create pipe\n");
+        return NULL;
+    }
+
+    s->rfd = fds[0];
+    s->wfd = fds[1];
+
+    fcntl(s->rfd, F_SETFL, O_NONBLOCK);
+    fcntl(s->wfd, F_SETFL, O_NONBLOCK);
+
+    qemu_aio_set_fd_handler(s->rfd, posix_aio_read, NULL, posix_aio_flush, s);
+
+    ret = pthread_attr_init(&attr);
+    if (ret)
+        die2(ret, "pthread_attr_init");
+
+    ret = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
+    if (ret)
+        die2(ret, "pthread_attr_setdetachstate");
+
+    TAILQ_INIT(&request_list);
+
+    posix_aio_state = s;
+
+    return posix_aio_state;
 }
diff --git a/posix-aio-compat.h b/posix-aio-compat.h
deleted file mode 100644
index 1c5dcbd920..0000000000
--- a/posix-aio-compat.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * QEMU posix-aio emulation
- *
- * Copyright IBM, Corp. 2008
- *
- * Authors:
- *  Anthony Liguori   <aliguori@us.ibm.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2.  See
- * the COPYING file in the top-level directory.
- *
- */
-
-#ifndef QEMU_POSIX_AIO_COMPAT_H
-#define QEMU_POSIX_AIO_COMPAT_H
-
-#include <sys/types.h>
-#include <unistd.h>
-#include <signal.h>
-
-#include "sys-queue.h"
-
-#define QEMU_PAIO_CANCELED     0x01
-#define QEMU_PAIO_NOTCANCELED  0x02
-#define QEMU_PAIO_ALLDONE      0x03
-
-struct qemu_paiocb
-{
-    int aio_fildes;
-    union {
-        struct iovec *aio_iov;
-	void *aio_ioctl_buf;
-    };
-    int aio_niov;
-    size_t aio_nbytes;
-#define aio_ioctl_cmd   aio_nbytes /* for QEMU_PAIO_IOCTL */
-    int ev_signo;
-    off_t aio_offset;
-    unsigned aio_flags;
-/* 512 byte alignment required for buffer, offset and length */
-#define QEMU_AIO_SECTOR_ALIGNED	0x01
-
-    /* private */
-    TAILQ_ENTRY(qemu_paiocb) node;
-    int aio_type;
-#define QEMU_PAIO_READ         0x01
-#define QEMU_PAIO_WRITE        0x02
-#define QEMU_PAIO_IOCTL        0x03
-    ssize_t ret;
-    int active;
-};
-
-struct qemu_paioinit
-{
-    unsigned int aio_threads;
-    unsigned int aio_num;
-    unsigned int aio_idle_time;
-};
-
-int qemu_paio_init(struct qemu_paioinit *aioinit);
-int qemu_paio_read(struct qemu_paiocb *aiocb);
-int qemu_paio_write(struct qemu_paiocb *aiocb);
-int qemu_paio_ioctl(struct qemu_paiocb *aiocb);
-int qemu_paio_error(struct qemu_paiocb *aiocb);
-ssize_t qemu_paio_return(struct qemu_paiocb *aiocb);
-int qemu_paio_cancel(int fd, struct qemu_paiocb *aiocb);
-
-#endif