summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r--  Makefile            3
-rw-r--r--  Makefile.target     3
-rw-r--r--  block-raw-posix.c   109
-rwxr-xr-x  configure           20
-rw-r--r--  posix-aio-compat.c  202
-rw-r--r--  posix-aio-compat.h  56
6 files changed, 287 insertions, 106 deletions
diff --git a/Makefile b/Makefile
index a2a03ec3e2..01f012198b 100644
--- a/Makefile
+++ b/Makefile
@@ -56,6 +56,9 @@ BLOCK_OBJS+=nbd.o block.o aio.o
 ifdef CONFIG_WIN32
 BLOCK_OBJS += block-raw-win32.o
 else
+ifdef CONFIG_AIO
+BLOCK_OBJS += posix-aio-compat.o
+endif
 BLOCK_OBJS += block-raw-posix.o
 endif
 
diff --git a/Makefile.target b/Makefile.target
index 7152dff541..8c649bec78 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -564,6 +564,9 @@ endif
 ifdef CONFIG_WIN32
 OBJS+=block-raw-win32.o
 else
+ifdef CONFIG_AIO
+OBJS+=posix-aio-compat.o
+endif
 OBJS+=block-raw-posix.o
 endif
 
diff --git a/block-raw-posix.c b/block-raw-posix.c
index 0a06a12fb6..2fbb714d01 100644
--- a/block-raw-posix.c
+++ b/block-raw-posix.c
@@ -27,7 +27,7 @@
 #include "block_int.h"
 #include <assert.h>
 #ifdef CONFIG_AIO
-#include <aio.h>
+#include "posix-aio-compat.h"
 #endif
 
 #ifdef CONFIG_COCOA
@@ -93,16 +93,10 @@
    reopen it to see if the disk has been changed */
 #define FD_OPEN_TIMEOUT 1000
 
-/* posix-aio doesn't allow multiple outstanding requests to a single file
- * descriptor.  we implement a pool of dup()'d file descriptors to work
- * around this */
-#define RAW_FD_POOL_SIZE	64
-
 typedef struct BDRVRawState {
     int fd;
     int type;
     unsigned int lseek_err_cnt;
-    int fd_pool[RAW_FD_POOL_SIZE];
 #if defined(__linux__)
     /* linux floppy specific */
     int fd_open_flags;
@@ -122,7 +116,6 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags)
 {
     BDRVRawState *s = bs->opaque;
     int fd, open_flags, ret;
-    int i;
 
     posix_aio_init();
 
@@ -155,8 +148,6 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags)
         return ret;
     }
     s->fd = fd;
-    for (i = 0; i < RAW_FD_POOL_SIZE; i++)
-        s->fd_pool[i] = -1;
     s->aligned_buf = NULL;
     if ((flags & BDRV_O_NOCACHE)) {
         s->aligned_buf = qemu_memalign(512, ALIGNED_BUFFER_SIZE);
@@ -446,8 +437,7 @@ static int raw_pwrite(BlockDriverState *bs, int64_t offset,
 
 typedef struct RawAIOCB {
     BlockDriverAIOCB common;
-    int fd;
-    struct aiocb aiocb;
+    struct qemu_paiocb aiocb;
     struct RawAIOCB *next;
     int ret;
 } RawAIOCB;
@@ -458,38 +448,6 @@ typedef struct PosixAioState
     RawAIOCB *first_aio;
 } PosixAioState;
 
-static int raw_fd_pool_get(BDRVRawState *s)
-{
-    int i;
-
-    for (i = 0; i < RAW_FD_POOL_SIZE; i++) {
-        /* already in use */
-        if (s->fd_pool[i] != -1)
-            continue;
-
-        /* try to dup file descriptor */
-        s->fd_pool[i] = dup(s->fd);
-        if (s->fd_pool[i] != -1)
-            return s->fd_pool[i];
-    }
-
-    /* we couldn't dup the file descriptor so just use the main one */
-    return s->fd;
-}
-
-static void raw_fd_pool_put(RawAIOCB *acb)
-{
-    BDRVRawState *s = acb->common.bs->opaque;
-    int i;
-
-    for (i = 0; i < RAW_FD_POOL_SIZE; i++) {
-        if (s->fd_pool[i] == acb->fd) {
-            close(s->fd_pool[i]);
-            s->fd_pool[i] = -1;
-        }
-    }
-}
-
 static void posix_aio_read(void *opaque)
 {
     PosixAioState *s = opaque;
@@ -515,16 +473,15 @@ static void posix_aio_read(void *opaque)
             acb = *pacb;
             if (!acb)
                 goto the_end;
-            ret = aio_error(&acb->aiocb);
+            ret = qemu_paio_error(&acb->aiocb);
             if (ret == ECANCELED) {
                 /* remove the request */
                 *pacb = acb->next;
-                raw_fd_pool_put(acb);
                 qemu_aio_release(acb);
             } else if (ret != EINPROGRESS) {
                 /* end of aio */
                 if (ret == 0) {
-                    ret = aio_return(&acb->aiocb);
+                    ret = qemu_paio_return(&acb->aiocb);
                     if (ret == acb->aiocb.aio_nbytes)
                         ret = 0;
                     else
@@ -536,7 +493,6 @@ static void posix_aio_read(void *opaque)
                 *pacb = acb->next;
                 /* call the callback */
                 acb->common.cb(acb->common.opaque, ret);
-                raw_fd_pool_put(acb);
                 qemu_aio_release(acb);
                 break;
             } else {
@@ -571,6 +527,7 @@ static int posix_aio_init(void)
     struct sigaction act;
     PosixAioState *s;
     int fds[2];
+    struct qemu_paioinit ai;
   
     if (posix_aio_state)
         return 0;
@@ -598,24 +555,11 @@ static int posix_aio_init(void)
 
     qemu_aio_set_fd_handler(s->rfd, posix_aio_read, NULL, posix_aio_flush, s);
 
-#if defined(__linux__)
-    {
-        struct aioinit ai;
+    memset(&ai, 0, sizeof(ai));
+    ai.aio_threads = 64;
+    ai.aio_num = 64;
+    qemu_paio_init(&ai);
 
-        memset(&ai, 0, sizeof(ai));
-#if defined(__GLIBC_PREREQ) && __GLIBC_PREREQ(2, 4)
-        ai.aio_threads = 64;
-        ai.aio_num = 64;
-#else
-        /* XXX: aio thread exit seems to hang on RedHat 9 and this init
-           seems to fix the problem. */
-        ai.aio_threads = 1;
-        ai.aio_num = 1;
-        ai.aio_idle_time = 365 * 100000;
-#endif
-        aio_init(&ai);
-    }
-#endif
     posix_aio_state = s;
 
     return 0;
@@ -634,8 +578,7 @@ static RawAIOCB *raw_aio_setup(BlockDriverState *bs,
     acb = qemu_aio_get(bs, cb, opaque);
     if (!acb)
         return NULL;
-    acb->fd = raw_fd_pool_get(s);
-    acb->aiocb.aio_fildes = acb->fd;
+    acb->aiocb.aio_fildes = s->fd;
     acb->aiocb.aio_sigevent.sigev_signo = SIGUSR2;
     acb->aiocb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;
     acb->aiocb.aio_buf = buf;
@@ -680,7 +623,7 @@ static BlockDriverAIOCB *raw_aio_read(BlockDriverState *bs,
     acb = raw_aio_setup(bs, sector_num, buf, nb_sectors, cb, opaque);
     if (!acb)
         return NULL;
-    if (aio_read(&acb->aiocb) < 0) {
+    if (qemu_paio_read(&acb->aiocb) < 0) {
         qemu_aio_release(acb);
         return NULL;
     }
@@ -711,7 +654,7 @@ static BlockDriverAIOCB *raw_aio_write(BlockDriverState *bs,
     acb = raw_aio_setup(bs, sector_num, (uint8_t*)buf, nb_sectors, cb, opaque);
     if (!acb)
         return NULL;
-    if (aio_write(&acb->aiocb) < 0) {
+    if (qemu_paio_write(&acb->aiocb) < 0) {
         qemu_aio_release(acb);
         return NULL;
     }
@@ -724,11 +667,11 @@ static void raw_aio_cancel(BlockDriverAIOCB *blockacb)
     RawAIOCB *acb = (RawAIOCB *)blockacb;
     RawAIOCB **pacb;
 
-    ret = aio_cancel(acb->aiocb.aio_fildes, &acb->aiocb);
-    if (ret == AIO_NOTCANCELED) {
+    ret = qemu_paio_cancel(acb->aiocb.aio_fildes, &acb->aiocb);
+    if (ret == QEMU_PAIO_NOTCANCELED) {
         /* fail safe: if the aio could not be canceled, we wait for
            it */
-        while (aio_error(&acb->aiocb) == EINPROGRESS);
+        while (qemu_paio_error(&acb->aiocb) == EINPROGRESS);
     }
 
     /* remove the callback from the queue */
@@ -738,14 +681,12 @@ static void raw_aio_cancel(BlockDriverAIOCB *blockacb)
             break;
         } else if (*pacb == acb) {
             *pacb = acb->next;
-            raw_fd_pool_put(acb);
             qemu_aio_release(acb);
             break;
         }
         pacb = &acb->next;
     }
 }
-
 #else /* CONFIG_AIO */
 static int posix_aio_init(void)
 {
@@ -753,17 +694,6 @@ static int posix_aio_init(void)
 }
 #endif /* CONFIG_AIO */
 
-static void raw_close_fd_pool(BDRVRawState *s)
-{
-    int i;
-
-    for (i = 0; i < RAW_FD_POOL_SIZE; i++) {
-        if (s->fd_pool[i] != -1) {
-            close(s->fd_pool[i]);
-            s->fd_pool[i] = -1;
-        }
-    }
-}
 
 static void raw_close(BlockDriverState *bs)
 {
@@ -774,7 +704,6 @@ static void raw_close(BlockDriverState *bs)
         if (s->aligned_buf != NULL)
             qemu_free(s->aligned_buf);
     }
-    raw_close_fd_pool(s);
 }
 
 static int raw_truncate(BlockDriverState *bs, int64_t offset)
@@ -895,6 +824,7 @@ BlockDriver bdrv_raw = {
     .bdrv_aio_cancel = raw_aio_cancel,
     .aiocb_size = sizeof(RawAIOCB),
 #endif
+
     .bdrv_pread = raw_pread,
     .bdrv_pwrite = raw_pwrite,
     .bdrv_truncate = raw_truncate,
@@ -965,7 +895,7 @@ kern_return_t GetBSDPath( io_iterator_t mediaIterator, char *bsdPath, CFIndex ma
 static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
 {
     BDRVRawState *s = bs->opaque;
-    int fd, open_flags, ret, i;
+    int fd, open_flags, ret;
 
     posix_aio_init();
 
@@ -1032,8 +962,6 @@ static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
         return ret;
     }
     s->fd = fd;
-    for (i = 0; i < RAW_FD_POOL_SIZE; i++)
-        s->fd_pool[i] = -1;
 #if defined(__linux__)
     /* close fd so that we can reopen it as needed */
     if (s->type == FTYPE_FD) {
@@ -1061,7 +989,6 @@ static int fd_open(BlockDriverState *bs)
         (qemu_get_clock(rt_clock) - s->fd_open_time) >= FD_OPEN_TIMEOUT) {
         close(s->fd);
         s->fd = -1;
-        raw_close_fd_pool(s);
 #ifdef DEBUG_FLOPPY
         printf("Floppy closed\n");
 #endif
@@ -1162,7 +1089,6 @@ static int raw_eject(BlockDriverState *bs, int eject_flag)
             if (s->fd >= 0) {
                 close(s->fd);
                 s->fd = -1;
-                raw_close_fd_pool(s);
             }
             fd = open(bs->filename, s->fd_open_flags | O_NONBLOCK);
             if (fd >= 0) {
@@ -1252,6 +1178,7 @@ BlockDriver bdrv_host_device = {
     .bdrv_aio_cancel = raw_aio_cancel,
     .aiocb_size = sizeof(RawAIOCB),
 #endif
+
     .bdrv_pread = raw_pread,
     .bdrv_pwrite = raw_pwrite,
     .bdrv_getlength = raw_getlength,
diff --git a/configure b/configure
index 13f6358d4c..d1f0c04d80 100755
--- a/configure
+++ b/configure
@@ -149,7 +149,6 @@ FreeBSD)
 bsd="yes"
 audio_drv_list="oss"
 audio_possible_drivers="oss sdl esd pa"
-aio_lib="-lpthread"
 if [ "$cpu" = "i386" -o "$cpu" = "x86_64" ] ; then
     kqemu="yes"
 fi
@@ -159,7 +158,6 @@ bsd="yes"
 audio_drv_list="oss"
 audio_possible_drivers="oss sdl esd"
 oss_lib="-lossaudio"
-aio_lib="-lrt -lpthread"
 ;;
 OpenBSD)
 bsd="yes"
@@ -167,7 +165,6 @@ openbsd="yes"
 audio_drv_list="oss"
 audio_possible_drivers="oss sdl esd"
 oss_lib="-lossaudio"
-aio_lib="-lpthread"
 ;;
 Darwin)
 bsd="yes"
@@ -178,7 +175,6 @@ audio_drv_list="coreaudio"
 audio_possible_drivers="coreaudio sdl fmod"
 OS_CFLAGS="-mdynamic-no-pic"
 OS_LDFLAGS="-framework CoreFoundation -framework IOKit"
-aio_lib="-lpthread"
 ;;
 SunOS)
     solaris="yes"
@@ -527,15 +523,6 @@ if test "$mingw32" = "yes" ; then
     bsd_user="no"
 fi
 
-if [ "$darwin" = "yes" -o "$mingw32" = "yes" ] ; then
-    AIOLIBS=
-elif [ "$bsd" = "yes" ]; then
-    AIOLIBS="$aio_lib"
-else
-    # Some Linux architectures (e.g. s390) don't imply -lpthread automatically.
-    AIOLIBS="-lrt -lpthread"
-fi
-
 if test ! -x "$(which cgcc 2>/dev/null)"; then
     sparse="no"
 fi
@@ -954,14 +941,17 @@ fi
 
 ##########################################
 # AIO probe
+AIOLIBS=""
+
 if test "$aio" = "yes" ; then
   aio=no
   cat > $TMPC << EOF
-#include <aio.h>
-int main(void) { return aio_write(NULL); }
+#include <pthread.h>
+int main(void) { pthread_mutex_t lock; return 0; }
 EOF
   if $cc $ARCH_CFLAGS -o $TMPE $AIOLIBS $TMPC 2> /dev/null ; then
     aio=yes
+    AIOLIBS="-lpthread"
   fi
 fi
 
diff --git a/posix-aio-compat.c b/posix-aio-compat.c
new file mode 100644
index 0000000000..232b511f92
--- /dev/null
+++ b/posix-aio-compat.c
@@ -0,0 +1,202 @@
+/*
+ * QEMU posix-aio emulation
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include <pthread.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/time.h>
+#include "osdep.h"
+
+#include "posix-aio-compat.h"
+
+/* Serialises the worker threads and the submitting thread: it is taken
+   around request_list and thread-counter updates and around reads/writes
+   of a queued request's 'ret' field. */
+static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
+/* Workers wait on this (with a timeout) for new requests; it is broadcast
+   by qemu_paio_submit() after a request has been queued. */
+static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
+/* Receives the id of the most recently created worker from
+   pthread_create(); it is not read anywhere in this file. */
+static pthread_t thread_id;
+/* Upper bound checked against cur_threads before another worker is spawned. */
+static int max_threads = 64;
+/* Number of worker threads currently accounted for. */
+static int cur_threads = 0;
+/* Number of accounted workers that are not executing a request. */
+static int idle_threads = 0;
+/* Pending requests: inserted at the tail, taken from the head. */
+static TAILQ_HEAD(, qemu_paiocb) request_list;
+
+/*
+ * Worker thread body.
+ *
+ * Repeatedly takes the oldest request off request_list, performs it with
+ * pread()/pwrite() (retrying on EINTR and continuing after short
+ * transfers), stores the outcome in aiocb->ret and raises the request's
+ * completion signal with sigqueue().  A worker that finds no work for
+ * about 10 seconds gives up its slot and exits.
+ *
+ * aiocb->ret is written exactly once per request, after the transfer has
+ * finished: the number of bytes transferred (possibly short, possibly 0)
+ * or -errno on failure.
+ *
+ * The argument is unused; the return value is always NULL.
+ */
+static void *aio_thread(void *unused)
+{
+    sigset_t set;
+
+    /* Block all signals in this worker thread.  pthread_sigmask() is used
+       because the behaviour of sigprocmask() is unspecified in a
+       multi-threaded process. */
+    sigfillset(&set);
+    pthread_sigmask(SIG_BLOCK, &set, NULL);
+
+    while (1) {
+        struct qemu_paiocb *aiocb;
+        union sigval sigvalue;
+        size_t offset = 0;
+        ssize_t result;
+        int failure = 0;
+        int signo;
+        int ret = 0;
+
+        pthread_mutex_lock(&lock);
+
+        /* Sleep until a request shows up or the timeout expires. */
+        while (TAILQ_EMPTY(&request_list) && ret != ETIMEDOUT) {
+            struct timespec ts = { 0 };
+            qemu_timeval tv;
+
+            qemu_gettimeofday(&tv);
+            ts.tv_sec = tv.tv_sec + 10;
+            ret = pthread_cond_timedwait(&cond, &lock, &ts);
+        }
+
+        /* Retire only when there is really nothing to do.  A request can be
+           queued just as the wait times out; its submitter saw this thread
+           as idle and did not spawn another one, so exiting on ETIMEDOUT
+           alone would strand that request. */
+        if (TAILQ_EMPTY(&request_list))
+            break;
+
+        aiocb = TAILQ_FIRST(&request_list);
+        TAILQ_REMOVE(&request_list, aiocb, node);
+        aiocb->active = 1;
+
+        /* Copy what the completion signal needs now: once 'ret' has been
+           published the owner may recycle the request (raw_aio_cancel()
+           busy-waits on qemu_paio_error() and then releases it). */
+        signo = aiocb->aio_sigevent.sigev_signo;
+        sigvalue = aiocb->aio_sigevent.sigev_value;
+
+        idle_threads--;
+        pthread_mutex_unlock(&lock);
+
+        while (offset < aiocb->aio_nbytes) {
+            ssize_t len;
+
+            if (aiocb->is_write)
+                len = pwrite(aiocb->aio_fildes,
+                             (const char *)aiocb->aio_buf + offset,
+                             aiocb->aio_nbytes - offset,
+                             aiocb->aio_offset + offset);
+            else
+                len = pread(aiocb->aio_fildes,
+                            (char *)aiocb->aio_buf + offset,
+                            aiocb->aio_nbytes - offset,
+                            aiocb->aio_offset + offset);
+
+            if (len == -1) {
+                if (errno == EINTR)
+                    continue;
+                /* save errno before any other call can overwrite it */
+                failure = errno;
+                break;
+            }
+            if (len == 0)
+                break;          /* no progress possible: report a short count */
+
+            offset += len;
+        }
+
+        result = failure ? -failure : (ssize_t)offset;
+
+        /* Publish the final status in one step.  qemu_paio_error() treats
+           any non-negative 'ret' as "finished", so 'ret' must not hold a
+           partial count while the transfer is still running, and it must
+           be written even when nothing at all was transferred. */
+        pthread_mutex_lock(&lock);
+        aiocb->ret = result;
+        idle_threads++;
+        pthread_mutex_unlock(&lock);
+
+        sigqueue(getpid(), signo, sigvalue);
+    }
+
+    /* reached through 'break' above, with 'lock' still held */
+    idle_threads--;
+    cur_threads--;
+    pthread_mutex_unlock(&lock);
+
+    return NULL;
+}
+
+/*
+ * Start one more detached worker thread.
+ *
+ * Must be called with 'lock' held: the new thread takes 'lock' before it
+ * looks at any shared state, so it cannot observe the counters until the
+ * caller has updated them and released the lock.
+ *
+ * Returns 0 on success or the error number reported by pthread_create().
+ * The thread counters are only bumped on success; counting a thread that
+ * never started would make later submissions believe an idle worker exists
+ * and leave their requests unserved.
+ */
+static int spawn_thread(void)
+{
+    pthread_attr_t attr;
+    int ret;
+
+    pthread_attr_init(&attr);
+    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
+    ret = pthread_create(&thread_id, &attr, aio_thread, NULL);
+    pthread_attr_destroy(&attr);
+
+    if (ret == 0) {
+        cur_threads++;
+        idle_threads++;
+    }
+
+    return ret;
+}
+
+/*
+ * Initialise the emulation by resetting the pending-request list to empty.
+ * The aioinit argument is accepted (the call replaces aio_init() in
+ * block-raw-posix.c) but none of its fields are read here.  Always
+ * returns 0.
+ */
+int qemu_paio_init(struct qemu_paioinit *aioinit)
+{
+    TAILQ_INIT(&request_list);
+
+    return 0;
+}
+
+/*
+ * Queue one request for the worker threads.
+ *
+ * Initialises the private fields of aiocb, starts an extra worker when
+ * every accounted worker is busy (up to max_threads), appends the request
+ * to request_list and wakes the workers.
+ *
+ * Returns 0 once the request is queued.  Returns -1 with errno set when no
+ * worker existed and one could not be created: queuing the request then
+ * would leave it pending forever.  Callers in block-raw-posix.c already
+ * treat a negative return as a failed submission.
+ */
+static int qemu_paio_submit(struct qemu_paiocb *aiocb, int is_write)
+{
+    int had_workers;
+    int err = 0;
+
+    aiocb->is_write = is_write;
+    aiocb->ret = -EINPROGRESS;
+    aiocb->active = 0;
+
+    pthread_mutex_lock(&lock);
+    had_workers = cur_threads;      /* sampled before spawn_thread() runs */
+    if (idle_threads == 0 && cur_threads < max_threads)
+        err = spawn_thread();
+    if (err != 0 && had_workers == 0) {
+        aiocb->ret = -err;
+        pthread_mutex_unlock(&lock);
+        errno = err;                /* pthread_create() reports errors by value */
+        return -1;
+    }
+    /* With at least one live worker a failed spawn is not fatal: the
+       request simply waits until a busy worker becomes free. */
+    TAILQ_INSERT_TAIL(&request_list, aiocb, node);
+    pthread_mutex_unlock(&lock);
+    pthread_cond_broadcast(&cond);
+
+    return 0;
+}
+
+/* Queue aiocb as an asynchronous read (is_write = 0); returns the result
+   of qemu_paio_submit(). */
+int qemu_paio_read(struct qemu_paiocb *aiocb)
+{
+    return qemu_paio_submit(aiocb, 0);
+}
+
+/* Queue aiocb as an asynchronous write (is_write = 1); returns the result
+   of qemu_paio_submit(). */
+int qemu_paio_write(struct qemu_paiocb *aiocb)
+{
+    return qemu_paio_submit(aiocb, 1);
+}
+
+/*
+ * Fetch the result currently stored for a request.  The value is read with
+ * 'lock' held because the worker threads and qemu_paio_cancel() write
+ * aiocb->ret under the same lock.  It is -EINPROGRESS after submission;
+ * afterwards it is a byte count, -errno, or -ECANCELED as written by those
+ * paths.
+ */
+ssize_t qemu_paio_return(struct qemu_paiocb *aiocb)
+{
+    ssize_t ret;
+
+    pthread_mutex_lock(&lock);
+    ret = aiocb->ret;
+    pthread_mutex_unlock(&lock);
+
+    return ret;
+}
+
+/*
+ * Report the state of a request in the manner of aio_error(): 0 when the
+ * stored result is non-negative, otherwise the stored negative code with
+ * its sign flipped (for example EINPROGRESS or ECANCELED).
+ */
+int qemu_paio_error(struct qemu_paiocb *aiocb)
+{
+    ssize_t status = qemu_paio_return(aiocb);
+
+    return status < 0 ? -status : 0;
+}
+
+/*
+ * Try to cancel a request.  The fd argument is not used.
+ *
+ * Returns QEMU_PAIO_CANCELED when no worker had picked the request up yet
+ * (it is unlinked from request_list and its result becomes -ECANCELED),
+ * QEMU_PAIO_NOTCANCELED when a worker has taken it and the result is still
+ * -EINPROGRESS, and QEMU_PAIO_ALLDONE otherwise.
+ */
+int qemu_paio_cancel(int fd, struct qemu_paiocb *aiocb)
+{
+    int status = QEMU_PAIO_ALLDONE;
+
+    pthread_mutex_lock(&lock);
+    if (aiocb->active) {
+        /* already handed to a worker: too late to unlink it */
+        if (aiocb->ret == -EINPROGRESS)
+            status = QEMU_PAIO_NOTCANCELED;
+    } else {
+        /* still waiting in the queue */
+        TAILQ_REMOVE(&request_list, aiocb, node);
+        aiocb->ret = -ECANCELED;
+        status = QEMU_PAIO_CANCELED;
+    }
+    pthread_mutex_unlock(&lock);
+
+    return status;
+}
+
diff --git a/posix-aio-compat.h b/posix-aio-compat.h
new file mode 100644
index 0000000000..5dddd711a9
--- /dev/null
+++ b/posix-aio-compat.h
@@ -0,0 +1,56 @@
+/*
+ * QEMU posix-aio emulation
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef QEMU_POSIX_AIO_COMPAT_H
+#define QEMU_POSIX_AIO_COMPAT_H
+
+#include <sys/types.h>
+#include <unistd.h>
+#include <signal.h>
+
+#include "sys-queue.h"
+
+/* Result codes of qemu_paio_cancel(). */
+/* the request was removed from the queue before any worker took it */
+#define QEMU_PAIO_CANCELED     0x01
+/* a worker has taken the request and its result is still -EINPROGRESS */
+#define QEMU_PAIO_NOTCANCELED  0x02
+/* neither of the above: a result has already been stored */
+#define QEMU_PAIO_ALLDONE      0x03
+
+/*
+ * One emulated asynchronous I/O request.  The public field names follow
+ * POSIX struct aiocb, which this type replaces in block-raw-posix.c.
+ */
+struct qemu_paiocb
+{
+    /* descriptor handed to pread()/pwrite() */
+    int aio_fildes;
+    /* buffer read into or written from */
+    void *aio_buf;
+    /* total number of bytes to transfer */
+    size_t aio_nbytes;
+    /* sigev_signo and sigev_value are passed to sigqueue() on completion */
+    struct sigevent aio_sigevent;
+    /* file offset of the first byte */
+    off_t aio_offset;
+
+    /* private: initialised at submission and used by posix-aio-compat.c */
+    /* linkage in the pending-request list */
+    TAILQ_ENTRY(qemu_paiocb) node;
+    /* non-zero selects pwrite(), zero selects pread() */
+    int is_write;
+    /* -EINPROGRESS after submission; later a byte count or -errno stored by
+       a worker, or -ECANCELED stored by qemu_paio_cancel() */
+    ssize_t ret;
+    /* 0 at submission, set to 1 when a worker dequeues the request */
+    int active;
+};
+
+/*
+ * Parameters for qemu_paio_init().  The field names match the struct
+ * aioinit members that block-raw-posix.c used to set before aio_init().
+ * NOTE(review): qemu_paio_init() in posix-aio-compat.c does not read any
+ * of these fields; the meanings below are presumed from the names --
+ * confirm before relying on them.
+ */
+struct qemu_paioinit
+{
+    /* presumably the maximum number of worker threads */
+    unsigned int aio_threads;
+    /* presumably the expected number of simultaneous requests */
+    unsigned int aio_num;
+    /* presumably how long an idle worker may linger */
+    unsigned int aio_idle_time;
+};
+
+/* Reset the pending-request queue; aioinit is currently not consulted.
+   Returns 0. */
+int qemu_paio_init(struct qemu_paioinit *aioinit);
+/* Queue aiocb as a read of aio_nbytes bytes at aio_offset into aio_buf.
+   Returns 0 once the request is queued. */
+int qemu_paio_read(struct qemu_paiocb *aiocb);
+/* Queue aiocb as a write of aio_nbytes bytes from aio_buf at aio_offset.
+   Returns 0 once the request is queued. */
+int qemu_paio_write(struct qemu_paiocb *aiocb);
+/* Returns 0 when the stored result is non-negative, otherwise the positive
+   error code (EINPROGRESS while pending, ECANCELED after a cancel). */
+int qemu_paio_error(struct qemu_paiocb *aiocb);
+/* Returns the stored result: a byte count or a negated error code. */
+ssize_t qemu_paio_return(struct qemu_paiocb *aiocb);
+/* Try to cancel aiocb (fd is unused); returns one of the QEMU_PAIO_*
+   result codes. */
+int qemu_paio_cancel(int fd, struct qemu_paiocb *aiocb);
+
+#endif