Diffstat (limited to 'block')
-rw-r--r--  block/Makefile.objs     |    1
-rw-r--r--  block/blkdebug.c        |    2
-rw-r--r--  block/blkverify.c       |    4
-rw-r--r--  block/io.c              |   71
-rw-r--r--  block/qapi.c            |    8
-rw-r--r--  block/qcow2.c           |   11
-rw-r--r--  block/qcow2.h           |    5
-rw-r--r--  block/quorum.c          |   51
-rw-r--r--  block/raw-posix.c       |    5
-rw-r--r--  block/throttle-groups.c |  496
-rw-r--r--  block/vmdk.c            |   84
11 files changed, 585 insertions(+), 153 deletions(-)
diff --git a/block/Makefile.objs b/block/Makefile.objs
index 0d8c2a4ab6..c34fd7cdc2 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -10,6 +10,7 @@ block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o
 block-obj-$(CONFIG_POSIX) += raw-posix.o
 block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
 block-obj-y += null.o mirror.o io.o
+block-obj-y += throttle-groups.o
 
 block-obj-y += nbd.o nbd-client.o sheepdog.o
 block-obj-$(CONFIG_LIBISCSI) += iscsi.o
diff --git a/block/blkdebug.c b/block/blkdebug.c
index 1e92607ef3..bc247f46f5 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -429,7 +429,7 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags,
     /* Open the backing file */
     assert(bs->file == NULL);
     ret = bdrv_open_image(&bs->file, qemu_opt_get(opts, "x-image"), options, "image",
-                          flags | BDRV_O_PROTOCOL, false, &local_err);
+                          bs, &child_file, false, &local_err);
     if (ret < 0) {
         error_propagate(errp, local_err);
         goto out;
diff --git a/block/blkverify.c b/block/blkverify.c
index 438dff8bcb..d277e63220 100644
--- a/block/blkverify.c
+++ b/block/blkverify.c
@@ -125,7 +125,7 @@ static int blkverify_open(BlockDriverState *bs, QDict *options, int flags,
     /* Open the raw file */
     assert(bs->file == NULL);
     ret = bdrv_open_image(&bs->file, qemu_opt_get(opts, "x-raw"), options,
-                          "raw", flags | BDRV_O_PROTOCOL, false, &local_err);
+                          "raw", bs, &child_file, false, &local_err);
     if (ret < 0) {
         error_propagate(errp, local_err);
         goto fail;
@@ -134,7 +134,7 @@ static int blkverify_open(BlockDriverState *bs, QDict *options, int flags,
     /* Open the test file */
     assert(s->test_file == NULL);
     ret = bdrv_open_image(&s->test_file, qemu_opt_get(opts, "x-image"), options,
-                          "test", flags, false, &local_err);
+                          "test", bs, &child_format, false, &local_err);
     if (ret < 0) {
         error_propagate(errp, local_err);
         s->test_file = NULL;
diff --git a/block/io.c b/block/io.c
index e394d92626..bb4f78784e 100644
--- a/block/io.c
+++ b/block/io.c
@@ -23,9 +23,9 @@
  */
 
 #include "trace.h"
-#include "sysemu/qtest.h"
 #include "block/blockjob.h"
 #include "block/block_int.h"
+#include "block/throttle-groups.h"
 
 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
 
@@ -65,7 +65,7 @@ void bdrv_set_io_limits(BlockDriverState *bs,
 {
     int i;
 
-    throttle_config(&bs->throttle_state, cfg);
+    throttle_group_config(bs, cfg);
 
     for (i = 0; i < 2; i++) {
         qemu_co_enter_next(&bs->throttled_reqs[i]);
@@ -95,72 +95,33 @@ static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
 void bdrv_io_limits_disable(BlockDriverState *bs)
 {
     bs->io_limits_enabled = false;
-
     bdrv_start_throttled_reqs(bs);
-
-    throttle_destroy(&bs->throttle_state);
-}
-
-static void bdrv_throttle_read_timer_cb(void *opaque)
-{
-    BlockDriverState *bs = opaque;
-    qemu_co_enter_next(&bs->throttled_reqs[0]);
-}
-
-static void bdrv_throttle_write_timer_cb(void *opaque)
-{
-    BlockDriverState *bs = opaque;
-    qemu_co_enter_next(&bs->throttled_reqs[1]);
+    throttle_group_unregister_bs(bs);
 }
 
 /* should be called before bdrv_set_io_limits if a limit is set */
-void bdrv_io_limits_enable(BlockDriverState *bs)
+void bdrv_io_limits_enable(BlockDriverState *bs, const char *group)
 {
-    int clock_type = QEMU_CLOCK_REALTIME;
-
-    if (qtest_enabled()) {
-        /* For testing block IO throttling only */
-        clock_type = QEMU_CLOCK_VIRTUAL;
-    }
     assert(!bs->io_limits_enabled);
-    throttle_init(&bs->throttle_state,
-                  bdrv_get_aio_context(bs),
-                  clock_type,
-                  bdrv_throttle_read_timer_cb,
-                  bdrv_throttle_write_timer_cb,
-                  bs);
+    throttle_group_register_bs(bs, group);
     bs->io_limits_enabled = true;
 }
 
-/* This function makes an IO wait if needed
- *
- * @nb_sectors: the number of sectors of the IO
- * @is_write:   is the IO a write
- */
-static void bdrv_io_limits_intercept(BlockDriverState *bs,
-                                     unsigned int bytes,
-                                     bool is_write)
+void bdrv_io_limits_update_group(BlockDriverState *bs, const char *group)
 {
-    /* does this io must wait */
-    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);
-
-    /* if must wait or any request of this type throttled queue the IO */
-    if (must_wait ||
-        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
-        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
+    /* this bs is not part of any group */
+    if (!bs->throttle_state) {
+        return;
     }
 
-    /* the IO will be executed, do the accounting */
-    throttle_account(&bs->throttle_state, is_write, bytes);
-
-
-    /* if the next request must wait -> do nothing */
-    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
+    /* this bs is already part of the group we want */
+    if (!g_strcmp0(throttle_group_get_name(bs), group)) {
         return;
     }
 
-    /* else queue next request for execution */
-    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
+    /* need to change the group this bs belongs to */
+    bdrv_io_limits_disable(bs);
+    bdrv_io_limits_enable(bs, group);
 }
 
 void bdrv_setup_io_funcs(BlockDriver *bdrv)
@@ -967,7 +928,7 @@ static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
 
     /* throttling disk I/O */
     if (bs->io_limits_enabled) {
-        bdrv_io_limits_intercept(bs, bytes, false);
+        throttle_group_co_io_limits_intercept(bs, bytes, false);
     }
 
     /* Align read if necessary by padding qiov */
@@ -1297,7 +1258,7 @@ static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
 
     /* throttling disk I/O */
     if (bs->io_limits_enabled) {
-        bdrv_io_limits_intercept(bs, bytes, true);
+        throttle_group_co_io_limits_intercept(bs, bytes, true);
     }
 
     /*
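
With this patch the throttling state moves out of the individual BlockDriverState and into shared, named groups, so enabling limits now takes a group name. A rough sketch of the resulting call pattern, using only the functions touched in this diff (the group names and the ThrottleConfig handling are illustrative, not part of the patch):

/* Hedged sketch, not from the patch: attach a BDS to a named group,
 * apply limits, then move it to a different group. */
static void example_set_group_limits(BlockDriverState *bs, ThrottleConfig *cfg)
{
    bdrv_io_limits_enable(bs, "group0");      /* join (or create) "group0" */
    bdrv_set_io_limits(bs, cfg);              /* limits now apply group-wide */

    /* Moving to another group is disable + enable, which is exactly
     * what bdrv_io_limits_update_group() does when the name differs. */
    bdrv_io_limits_update_group(bs, "group1");
}
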
diff --git a/block/qapi.c b/block/qapi.c
index 18d2b95f54..a738148bce 100644
--- a/block/qapi.c
+++ b/block/qapi.c
@@ -24,6 +24,7 @@
 
 #include "block/qapi.h"
 #include "block/block_int.h"
+#include "block/throttle-groups.h"
 #include "block/write-threshold.h"
 #include "qmp-commands.h"
 #include "qapi-visit.h"
@@ -65,7 +66,9 @@ BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs, Error **errp)
 
     if (bs->io_limits_enabled) {
         ThrottleConfig cfg;
-        throttle_get_config(&bs->throttle_state, &cfg);
+
+        throttle_group_get_config(bs, &cfg);
+
         info->bps     = cfg.buckets[THROTTLE_BPS_TOTAL].avg;
         info->bps_rd  = cfg.buckets[THROTTLE_BPS_READ].avg;
         info->bps_wr  = cfg.buckets[THROTTLE_BPS_WRITE].avg;
@@ -90,6 +93,9 @@ BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs, Error **errp)
 
         info->has_iops_size = cfg.op_size;
         info->iops_size = cfg.op_size;
+
+        info->has_group = true;
+        info->group = g_strdup(throttle_group_get_name(bs));
     }
 
     info->write_threshold = bdrv_write_threshold_get(bs);
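
On the query side, BlockDeviceInfo now carries the name of the group a throttled device belongs to. A minimal sketch of reading the new field (the printf consumer is illustrative, not from the patch):

/* Hedged sketch: has_group/group are only filled in while io_limits_enabled. */
BlockDeviceInfo *info = bdrv_block_device_info(bs, &local_err);
if (info && info->has_group) {
    printf("throttle group: %s\n", info->group);   /* e.g. "group0" */
}
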
diff --git a/block/qcow2.c b/block/qcow2.c
index f7b4cc6a32..c4f6938a36 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -483,9 +483,11 @@ static const char *overlap_bool_option_names[QCOW2_OL_MAX_BITNR] = {
     [QCOW2_OL_INACTIVE_L2_BITNR]    = QCOW2_OPT_OVERLAP_INACTIVE_L2,
 };
 
-static void read_cache_sizes(QemuOpts *opts, uint64_t *l2_cache_size,
+static void read_cache_sizes(BlockDriverState *bs, QemuOpts *opts,
+                             uint64_t *l2_cache_size,
                              uint64_t *refcount_cache_size, Error **errp)
 {
+    BDRVQcowState *s = bs->opaque;
     uint64_t combined_cache_size;
     bool l2_cache_size_set, refcount_cache_size_set, combined_cache_size_set;
 
@@ -525,7 +527,9 @@ static void read_cache_sizes(QemuOpts *opts, uint64_t *l2_cache_size,
         }
     } else {
         if (!l2_cache_size_set && !refcount_cache_size_set) {
-            *l2_cache_size = DEFAULT_L2_CACHE_BYTE_SIZE;
+            *l2_cache_size = MAX(DEFAULT_L2_CACHE_BYTE_SIZE,
+                                 (uint64_t)DEFAULT_L2_CACHE_CLUSTERS
+                                 * s->cluster_size);
             *refcount_cache_size = *l2_cache_size
                                  / DEFAULT_L2_REFCOUNT_SIZE_RATIO;
         } else if (!l2_cache_size_set) {
@@ -803,7 +807,8 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
         goto fail;
     }
 
-    read_cache_sizes(opts, &l2_cache_size, &refcount_cache_size, &local_err);
+    read_cache_sizes(bs, opts, &l2_cache_size, &refcount_cache_size,
+                     &local_err);
     if (local_err) {
         error_propagate(errp, local_err);
         ret = -EINVAL;
diff --git a/block/qcow2.h b/block/qcow2.h
index 0076512af4..5936d299a3 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -62,11 +62,14 @@
 #define MIN_CLUSTER_BITS 9
 #define MAX_CLUSTER_BITS 21
 
-#define MIN_L2_CACHE_SIZE 1 /* cluster */
+/* Must be at least 2 to cover COW */
+#define MIN_L2_CACHE_SIZE 2 /* clusters */
 
 /* Must be at least 4 to cover all cases of refcount table growth */
 #define MIN_REFCOUNT_CACHE_SIZE 4 /* clusters */
 
+/* Whichever is more */
+#define DEFAULT_L2_CACHE_CLUSTERS 8 /* clusters */
 #define DEFAULT_L2_CACHE_BYTE_SIZE 1048576 /* bytes */
 
 /* The refblock cache needs only a fourth of the L2 cache size to cover as many
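
The new default ties the qcow2 L2 cache to the cluster size: it is the larger of 1 MB and 8 clusters, so images with large clusters always keep a minimum number of L2 tables cached. A small worked example (cluster sizes assumed for illustration; cluster_size stands for s->cluster_size in read_cache_sizes()):

/* Sketch of the new default:
 *   64 KB clusters:  MAX(1 MB, 8 * 64 KB = 512 KB) -> 1 MB  (byte-size default wins)
 *   2 MB clusters:   MAX(1 MB, 8 * 2 MB  = 16 MB)  -> 16 MB (scales with cluster size)
 * The refcount cache then defaults to l2_cache_size / DEFAULT_L2_REFCOUNT_SIZE_RATIO. */
uint64_t l2_cache_size = MAX(DEFAULT_L2_CACHE_BYTE_SIZE,
                             (uint64_t)DEFAULT_L2_CACHE_CLUSTERS * cluster_size);
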
diff --git a/block/quorum.c b/block/quorum.c
index f91ef75a84..77e55b2775 100644
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -866,25 +866,18 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
     Error *local_err = NULL;
     QemuOpts *opts = NULL;
     bool *opened;
-    QDict *sub = NULL;
-    QList *list = NULL;
-    const QListEntry *lentry;
     int i;
     int ret = 0;
 
     qdict_flatten(options);
-    qdict_extract_subqdict(options, &sub, "children.");
-    qdict_array_split(sub, &list);
 
-    if (qdict_size(sub)) {
-        error_setg(&local_err, "Invalid option children.%s",
-                   qdict_first(sub)->key);
+    /* count how many different children are present */
+    s->num_children = qdict_array_entries(options, "children.");
+    if (s->num_children < 0) {
+        error_setg(&local_err, "Option children is not a valid array");
         ret = -EINVAL;
         goto exit;
     }
-
-    /* count how many different children are present */
-    s->num_children = qlist_size(list);
     if (s->num_children < 2) {
         error_setg(&local_err,
                    "Number of provided children must be greater than 1");
@@ -937,37 +930,17 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
     s->bs = g_new0(BlockDriverState *, s->num_children);
     opened = g_new0(bool, s->num_children);
 
-    for (i = 0, lentry = qlist_first(list); lentry;
-         lentry = qlist_next(lentry), i++) {
-        QDict *d;
-        QString *string;
-
-        switch (qobject_type(lentry->value))
-        {
-            /* List of options */
-            case QTYPE_QDICT:
-                d = qobject_to_qdict(lentry->value);
-                QINCREF(d);
-                ret = bdrv_open(&s->bs[i], NULL, NULL, d, flags, NULL,
-                                &local_err);
-                break;
-
-            /* QMP reference */
-            case QTYPE_QSTRING:
-                string = qobject_to_qstring(lentry->value);
-                ret = bdrv_open(&s->bs[i], NULL, qstring_get_str(string), NULL,
-                                flags, NULL, &local_err);
-                break;
-
-            default:
-                error_setg(&local_err, "Specification of child block device %i "
-                           "is invalid", i);
-                ret = -EINVAL;
-        }
+    for (i = 0; i < s->num_children; i++) {
+        char indexstr[32];
+        ret = snprintf(indexstr, 32, "children.%d", i);
+        assert(ret < 32);
 
+        ret = bdrv_open_image(&s->bs[i], NULL, options, indexstr, bs,
+                              &child_format, false, &local_err);
         if (ret < 0) {
             goto close_exit;
         }
+
         opened[i] = true;
     }
 
@@ -990,8 +963,6 @@ exit:
     if (local_err) {
         error_propagate(errp, local_err);
     }
-    QDECREF(list);
-    QDECREF(sub);
     return ret;
 }
 
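
quorum_open() no longer splits the children array by hand: qdict_array_entries() counts the children.* entries, and each child is opened through bdrv_open_image() with its option prefix. A hedged sketch of the option keys this expects after qdict_flatten() (filenames invented for illustration):

/* Option keys after qdict_flatten(), with invented filenames:
 *   "children.0.file.filename" -> "/tmp/a.qcow2"
 *   "children.1.file.filename" -> "/tmp/b.qcow2"
 * qdict_array_entries(options, "children.") returns 2 here, and the loop
 * hands each "children.%d" prefix to bdrv_open_image(), which extracts
 * the matching sub-options (or a plain node reference) itself. */
char indexstr[32];
snprintf(indexstr, sizeof(indexstr), "children.%d", i);   /* "children.0", "children.1", ... */
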
diff --git a/block/raw-posix.c b/block/raw-posix.c
index 2990e954ae..44ade8cf4e 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -1848,8 +1848,9 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
         *pnum = nb_sectors;
         ret = BDRV_BLOCK_DATA;
     } else if (data == start) {
-        /* On a data extent, compute sectors to the end of the extent.  */
-        *pnum = MIN(nb_sectors, (hole - start) / BDRV_SECTOR_SIZE);
+        /* On a data extent, compute sectors to the end of the extent,
+         * possibly including a partial sector at EOF. */
+        *pnum = MIN(nb_sectors, DIV_ROUND_UP(hole - start, BDRV_SECTOR_SIZE));
         ret = BDRV_BLOCK_DATA;
     } else {
         /* On a hole, compute sectors to the beginning of the next extent.  */
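
The SEEK_DATA/SEEK_HOLE fix matters when a data extent ends in a partial sector at EOF: plain integer division rounded that tail away and could even report zero sectors of data. A tiny worked example (byte offsets assumed):

/* start = 0, hole (EOF) = 1300 bytes, BDRV_SECTOR_SIZE = 512:
 *   old: (hole - start) / 512            = 2  -> partial tail sector dropped
 *   new: DIV_ROUND_UP(hole - start, 512) = 3  -> tail reported as data
 * With a 300-byte file the old code yielded *pnum == 0. */
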
diff --git a/block/throttle-groups.c b/block/throttle-groups.c
new file mode 100644
index 0000000000..efc462fbc5
--- /dev/null
+++ b/block/throttle-groups.c
@@ -0,0 +1,496 @@
+/*
+ * QEMU block throttling group infrastructure
+ *
+ * Copyright (C) Nodalink, EURL. 2014
+ * Copyright (C) Igalia, S.L. 2015
+ *
+ * Authors:
+ *   BenoƮt Canet <benoit.canet@nodalink.com>
+ *   Alberto Garcia <berto@igalia.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 or
+ * (at your option) version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "block/throttle-groups.h"
+#include "qemu/queue.h"
+#include "qemu/thread.h"
+#include "sysemu/qtest.h"
+
+/* The ThrottleGroup structure (with its ThrottleState) is shared
+ * among different BlockDriverStates and is independent of the
+ * AioContext, so in order to use it from different threads it needs
+ * its own locking.
+ *
+ * This locking is however handled internally in this file, so it's
+ * mostly transparent to outside users (but see the documentation in
+ * throttle_groups_lock()).
+ *
+ * The whole ThrottleGroup structure is private and invisible to
+ * outside users, who only use it through its ThrottleState.
+ *
+ * In addition to the ThrottleGroup structure, BlockDriverState has
+ * fields that need to be accessed by other members of the group and
+ * therefore also need to be protected by this lock. Once a BDS is
+ * registered in a group those fields can be accessed by other threads
+ * any time.
+ *
+ * Again, all this is handled internally and is mostly transparent to
+ * the outside. The 'throttle_timers' field however has an additional
+ * constraint because it may be temporarily invalid (see for example
+ * bdrv_set_aio_context()). Therefore in this file a thread will
+ * access some other BDS's timers only after verifying that that BDS
+ * has throttled requests in the queue.
+ */
+typedef struct ThrottleGroup {
+    char *name; /* This is constant during the lifetime of the group */
+
+    QemuMutex lock; /* This lock protects the following four fields */
+    ThrottleState ts;
+    QLIST_HEAD(, BlockDriverState) head;
+    BlockDriverState *tokens[2];
+    bool any_timer_armed[2];
+
+    /* These two are protected by the global throttle_groups_lock */
+    unsigned refcount;
+    QTAILQ_ENTRY(ThrottleGroup) list;
+} ThrottleGroup;
+
+static QemuMutex throttle_groups_lock;
+static QTAILQ_HEAD(, ThrottleGroup) throttle_groups =
+    QTAILQ_HEAD_INITIALIZER(throttle_groups);
+
+/* Increments the reference count of a ThrottleGroup given its name.
+ *
+ * If no ThrottleGroup is found with the given name a new one is
+ * created.
+ *
+ * @name: the name of the ThrottleGroup
+ * @ret:  the ThrottleGroup
+ */
+static ThrottleGroup *throttle_group_incref(const char *name)
+{
+    ThrottleGroup *tg = NULL;
+    ThrottleGroup *iter;
+
+    qemu_mutex_lock(&throttle_groups_lock);
+
+    /* Look for an existing group with that name */
+    QTAILQ_FOREACH(iter, &throttle_groups, list) {
+        if (!strcmp(name, iter->name)) {
+            tg = iter;
+            break;
+        }
+    }
+
+    /* Create a new one if not found */
+    if (!tg) {
+        tg = g_new0(ThrottleGroup, 1);
+        tg->name = g_strdup(name);
+        qemu_mutex_init(&tg->lock);
+        throttle_init(&tg->ts);
+        QLIST_INIT(&tg->head);
+
+        QTAILQ_INSERT_TAIL(&throttle_groups, tg, list);
+    }
+
+    tg->refcount++;
+
+    qemu_mutex_unlock(&throttle_groups_lock);
+
+    return tg;
+}
+
+/* Decrease the reference count of a ThrottleGroup.
+ *
+ * When the reference count reaches zero the ThrottleGroup is
+ * destroyed.
+ *
+ * @tg:  The ThrottleGroup to unref
+ */
+static void throttle_group_unref(ThrottleGroup *tg)
+{
+    qemu_mutex_lock(&throttle_groups_lock);
+    if (--tg->refcount == 0) {
+        QTAILQ_REMOVE(&throttle_groups, tg, list);
+        qemu_mutex_destroy(&tg->lock);
+        g_free(tg->name);
+        g_free(tg);
+    }
+    qemu_mutex_unlock(&throttle_groups_lock);
+}
+
+/* Get the name from a BlockDriverState's ThrottleGroup. The name (and
+ * the pointer) is guaranteed to remain constant during the lifetime
+ * of the group.
+ *
+ * @bs:   a BlockDriverState that is member of a throttling group
+ * @ret:  the name of the group.
+ */
+const char *throttle_group_get_name(BlockDriverState *bs)
+{
+    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
+    return tg->name;
+}
+
+/* Return the next BlockDriverState in the round-robin sequence,
+ * simulating a circular list.
+ *
+ * This assumes that tg->lock is held.
+ *
+ * @bs:  the current BlockDriverState
+ * @ret: the next BlockDriverState in the sequence
+ */
+static BlockDriverState *throttle_group_next_bs(BlockDriverState *bs)
+{
+    ThrottleState *ts = bs->throttle_state;
+    ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
+    BlockDriverState *next = QLIST_NEXT(bs, round_robin);
+
+    if (!next) {
+        return QLIST_FIRST(&tg->head);
+    }
+
+    return next;
+}
+
+/* Return the next BlockDriverState in the round-robin sequence with
+ * pending I/O requests.
+ *
+ * This assumes that tg->lock is held.
+ *
+ * @bs:        the current BlockDriverState
+ * @is_write:  the type of operation (read/write)
+ * @ret:       the next BlockDriverState with pending requests, or bs
+ *             if there is none.
+ */
+static BlockDriverState *next_throttle_token(BlockDriverState *bs,
+                                             bool is_write)
+{
+    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
+    BlockDriverState *token, *start;
+
+    start = token = tg->tokens[is_write];
+
+    /* get the next bs in round-robin order */
+    token = throttle_group_next_bs(token);
+    while (token != start && !token->pending_reqs[is_write]) {
+        token = throttle_group_next_bs(token);
+    }
+
+    /* If no I/O is queued for scheduling on the next round robin token
+     * then make the current bs the token, because chances are
+     * the current bs will get the current request queued.
+     */
+    if (token == start && !token->pending_reqs[is_write]) {
+        token = bs;
+    }
+
+    return token;
+}
+
+/* Check if the next I/O request for a BlockDriverState needs to be
+ * throttled or not. If there's no timer set in this group, set one
+ * and update the token accordingly.
+ *
+ * This assumes that tg->lock is held.
+ *
+ * @bs:         the current BlockDriverState
+ * @is_write:   the type of operation (read/write)
+ * @ret:        whether the I/O request needs to be throttled or not
+ */
+static bool throttle_group_schedule_timer(BlockDriverState *bs,
+                                          bool is_write)
+{
+    ThrottleState *ts = bs->throttle_state;
+    ThrottleTimers *tt = &bs->throttle_timers;
+    ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
+    bool must_wait;
+
+    /* Check if any of the timers in this group is already armed */
+    if (tg->any_timer_armed[is_write]) {
+        return true;
+    }
+
+    must_wait = throttle_schedule_timer(ts, tt, is_write);
+
+    /* If a timer just got armed, set bs as the current token */
+    if (must_wait) {
+        tg->tokens[is_write] = bs;
+        tg->any_timer_armed[is_write] = true;
+    }
+
+    return must_wait;
+}
+
+/* Look for the next pending I/O request and schedule it.
+ *
+ * This assumes that tg->lock is held.
+ *
+ * @bs:        the current BlockDriverState
+ * @is_write:  the type of operation (read/write)
+ */
+static void schedule_next_request(BlockDriverState *bs, bool is_write)
+{
+    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
+    bool must_wait;
+    BlockDriverState *token;
+
+    /* Check if there's any pending request to schedule next */
+    token = next_throttle_token(bs, is_write);
+    if (!token->pending_reqs[is_write]) {
+        return;
+    }
+
+    /* Set a timer for the request if it needs to be throttled */
+    must_wait = throttle_group_schedule_timer(token, is_write);
+
+    /* If it doesn't have to wait, queue it for immediate execution */
+    if (!must_wait) {
+        /* Give preference to requests from the current bs */
+        if (qemu_in_coroutine() &&
+            qemu_co_queue_next(&bs->throttled_reqs[is_write])) {
+            token = bs;
+        } else {
+            ThrottleTimers *tt = &token->throttle_timers;
+            int64_t now = qemu_clock_get_ns(tt->clock_type);
+            timer_mod(tt->timers[is_write], now + 1);
+            tg->any_timer_armed[is_write] = true;
+        }
+        tg->tokens[is_write] = token;
+    }
+}
+
+/* Check if an I/O request needs to be throttled, wait and set a timer
+ * if necessary, and schedule the next request using a round robin
+ * algorithm.
+ *
+ * @bs:        the current BlockDriverState
+ * @bytes:     the number of bytes for this I/O
+ * @is_write:  the type of operation (read/write)
+ */
+void coroutine_fn throttle_group_co_io_limits_intercept(BlockDriverState *bs,
+                                                        unsigned int bytes,
+                                                        bool is_write)
+{
+    bool must_wait;
+    BlockDriverState *token;
+
+    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
+    qemu_mutex_lock(&tg->lock);
+
+    /* First we check if this I/O has to be throttled. */
+    token = next_throttle_token(bs, is_write);
+    must_wait = throttle_group_schedule_timer(token, is_write);
+
+    /* Wait if there's a timer set or queued requests of this type */
+    if (must_wait || bs->pending_reqs[is_write]) {
+        bs->pending_reqs[is_write]++;
+        qemu_mutex_unlock(&tg->lock);
+        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
+        qemu_mutex_lock(&tg->lock);
+        bs->pending_reqs[is_write]--;
+    }
+
+    /* The I/O will be executed, so do the accounting */
+    throttle_account(bs->throttle_state, is_write, bytes);
+
+    /* Schedule the next request */
+    schedule_next_request(bs, is_write);
+
+    qemu_mutex_unlock(&tg->lock);
+}
+
+/* Update the throttle configuration for a particular group. Similar
+ * to throttle_config(), but guarantees atomicity within the
+ * throttling group.
+ *
+ * @bs:  a BlockDriverState that is member of the group
+ * @cfg: the configuration to set
+ */
+void throttle_group_config(BlockDriverState *bs, ThrottleConfig *cfg)
+{
+    ThrottleTimers *tt = &bs->throttle_timers;
+    ThrottleState *ts = bs->throttle_state;
+    ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
+    qemu_mutex_lock(&tg->lock);
+    throttle_config(ts, tt, cfg);
+    /* throttle_config() cancels the timers */
+    tg->any_timer_armed[0] = tg->any_timer_armed[1] = false;
+    qemu_mutex_unlock(&tg->lock);
+}
+
+/* Get the throttle configuration from a particular group. Similar to
+ * throttle_get_config(), but guarantees atomicity within the
+ * throttling group.
+ *
+ * @bs:  a BlockDriverState that is member of the group
+ * @cfg: the configuration will be written here
+ */
+void throttle_group_get_config(BlockDriverState *bs, ThrottleConfig *cfg)
+{
+    ThrottleState *ts = bs->throttle_state;
+    ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
+    qemu_mutex_lock(&tg->lock);
+    throttle_get_config(ts, cfg);
+    qemu_mutex_unlock(&tg->lock);
+}
+
+/* ThrottleTimers callback. This wakes up a request that was waiting
+ * because it had been throttled.
+ *
+ * @bs:        the BlockDriverState whose request had been throttled
+ * @is_write:  the type of operation (read/write)
+ */
+static void timer_cb(BlockDriverState *bs, bool is_write)
+{
+    ThrottleState *ts = bs->throttle_state;
+    ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
+    bool empty_queue;
+
+    /* The timer has just been fired, so we can update the flag */
+    qemu_mutex_lock(&tg->lock);
+    tg->any_timer_armed[is_write] = false;
+    qemu_mutex_unlock(&tg->lock);
+
+    /* Run the request that was waiting for this timer */
+    empty_queue = !qemu_co_enter_next(&bs->throttled_reqs[is_write]);
+
+    /* If the request queue was empty then we have to take care of
+     * scheduling the next one */
+    if (empty_queue) {
+        qemu_mutex_lock(&tg->lock);
+        schedule_next_request(bs, is_write);
+        qemu_mutex_unlock(&tg->lock);
+    }
+}
+
+static void read_timer_cb(void *opaque)
+{
+    timer_cb(opaque, false);
+}
+
+static void write_timer_cb(void *opaque)
+{
+    timer_cb(opaque, true);
+}
+
+/* Register a BlockDriverState in the throttling group, also
+ * initializing its timers and updating its throttle_state pointer to
+ * point to it. If a throttling group with that name does not exist
+ * yet, it will be created.
+ *
+ * @bs:        the BlockDriverState to insert
+ * @groupname: the name of the group
+ */
+void throttle_group_register_bs(BlockDriverState *bs, const char *groupname)
+{
+    int i;
+    ThrottleGroup *tg = throttle_group_incref(groupname);
+    int clock_type = QEMU_CLOCK_REALTIME;
+
+    if (qtest_enabled()) {
+        /* For testing block IO throttling only */
+        clock_type = QEMU_CLOCK_VIRTUAL;
+    }
+
+    bs->throttle_state = &tg->ts;
+
+    qemu_mutex_lock(&tg->lock);
+    /* If the ThrottleGroup is new set this BlockDriverState as the token */
+    for (i = 0; i < 2; i++) {
+        if (!tg->tokens[i]) {
+            tg->tokens[i] = bs;
+        }
+    }
+
+    QLIST_INSERT_HEAD(&tg->head, bs, round_robin);
+
+    throttle_timers_init(&bs->throttle_timers,
+                         bdrv_get_aio_context(bs),
+                         clock_type,
+                         read_timer_cb,
+                         write_timer_cb,
+                         bs);
+
+    qemu_mutex_unlock(&tg->lock);
+}
+
+/* Unregister a BlockDriverState from its group, removing it from the
+ * list, destroying the timers and setting the throttle_state pointer
+ * to NULL.
+ *
+ * The group will be destroyed if it's empty after this operation.
+ *
+ * @bs: the BlockDriverState to remove
+ */
+void throttle_group_unregister_bs(BlockDriverState *bs)
+{
+    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
+    int i;
+
+    qemu_mutex_lock(&tg->lock);
+    for (i = 0; i < 2; i++) {
+        if (tg->tokens[i] == bs) {
+            BlockDriverState *token = throttle_group_next_bs(bs);
+            /* Take care of the case where this is the last bs in the group */
+            if (token == bs) {
+                token = NULL;
+            }
+            tg->tokens[i] = token;
+        }
+    }
+
+    /* remove the current bs from the list */
+    QLIST_REMOVE(bs, round_robin);
+    throttle_timers_destroy(&bs->throttle_timers);
+    qemu_mutex_unlock(&tg->lock);
+
+    throttle_group_unref(tg);
+    bs->throttle_state = NULL;
+}
+
+/* Acquire the lock of this throttling group.
+ *
+ * You won't normally need to use this. None of the functions from the
+ * ThrottleGroup API require you to acquire the lock since all of them
+ * deal with it internally.
+ *
+ * This should only be used in exceptional cases when you want to
+ * access the protected fields of a BlockDriverState directly
+ * (e.g. bdrv_swap()).
+ *
+ * @bs: a BlockDriverState that is member of the group
+ */
+void throttle_group_lock(BlockDriverState *bs)
+{
+    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
+    qemu_mutex_lock(&tg->lock);
+}
+
+/* Release the lock of this throttling group.
+ *
+ * See the comments in throttle_group_lock().
+ */
+void throttle_group_unlock(BlockDriverState *bs)
+{
+    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
+    qemu_mutex_unlock(&tg->lock);
+}
+
+static void throttle_groups_init(void)
+{
+    qemu_mutex_init(&throttle_groups_lock);
+}
+
+block_init(throttle_groups_init);
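
Taken together, the new file exposes a small lifecycle: register a BDS into a named group, configure limits through any member, let the I/O paths intercept requests, and unregister when done. A hedged sketch using only the functions defined above (bs1/bs2 and the config are illustrative):

/* Two drives sharing one ThrottleState: */
throttle_group_register_bs(bs1, "shared");   /* creates group "shared", refcount 1 */
throttle_group_register_bs(bs2, "shared");   /* joins the same group, refcount 2   */
throttle_group_config(bs1, &cfg);            /* limits now apply to bs1 and bs2    */

/* Each read/write path then calls, from coroutine context:
 *   throttle_group_co_io_limits_intercept(bs, bytes, is_write);
 * which picks the next token in round-robin order, waits if any timer
 * in the group is armed, accounts the bytes and schedules the next
 * pending request. */

throttle_group_unregister_bs(bs2);           /* refcount back to 1 */
throttle_group_unregister_bs(bs1);           /* group destroyed    */
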
diff --git a/block/vmdk.c b/block/vmdk.c
index b66745dfdd..be9263a34e 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -321,37 +321,13 @@ static int vmdk_is_cid_valid(BlockDriverState *bs)
     return 1;
 }
 
-/* Queue extents, if any, for reopen() */
+/* We have nothing to do for VMDK reopen, stubs just return success */
 static int vmdk_reopen_prepare(BDRVReopenState *state,
                                BlockReopenQueue *queue, Error **errp)
 {
-    BDRVVmdkState *s;
-    int ret = -1;
-    int i;
-    VmdkExtent *e;
-
     assert(state != NULL);
     assert(state->bs != NULL);
-
-    if (queue == NULL) {
-        error_setg(errp, "No reopen queue for VMDK extents");
-        goto exit;
-    }
-
-    s = state->bs->opaque;
-
-    assert(s != NULL);
-
-    for (i = 0; i < s->num_extents; i++) {
-        e = &s->extents[i];
-        if (e->file != state->bs->file) {
-            bdrv_reopen_queue(queue, e->file, state->flags);
-        }
-    }
-    ret = 0;
-
-exit:
-    return ret;
+    return 0;
 }
 
 static int vmdk_parent_open(BlockDriverState *bs)
@@ -543,7 +519,7 @@ static int vmdk_open_vmfs_sparse(BlockDriverState *bs,
 }
 
 static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf,
-                               Error **errp);
+                               QDict *options, Error **errp);
 
 static char *vmdk_read_desc(BlockDriverState *file, uint64_t desc_offset,
                             Error **errp)
@@ -582,7 +558,7 @@ static char *vmdk_read_desc(BlockDriverState *file, uint64_t desc_offset,
 
 static int vmdk_open_vmdk4(BlockDriverState *bs,
                            BlockDriverState *file,
-                           int flags, Error **errp)
+                           int flags, QDict *options, Error **errp)
 {
     int ret;
     uint32_t magic;
@@ -606,7 +582,7 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
             if (!buf) {
                 return -EINVAL;
             }
-            ret = vmdk_open_desc_file(bs, flags, buf, errp);
+            ret = vmdk_open_desc_file(bs, flags, buf, options, errp);
             g_free(buf);
             return ret;
         }
@@ -763,7 +739,7 @@ static int vmdk_parse_description(const char *desc, const char *opt_name,
 /* Open an extent file and append to bs array */
 static int vmdk_open_sparse(BlockDriverState *bs,
                             BlockDriverState *file, int flags,
-                            char *buf, Error **errp)
+                            char *buf, QDict *options, Error **errp)
 {
     uint32_t magic;
 
@@ -773,7 +749,7 @@ static int vmdk_open_sparse(BlockDriverState *bs,
             return vmdk_open_vmfs_sparse(bs, file, flags, errp);
             break;
         case VMDK4_MAGIC:
-            return vmdk_open_vmdk4(bs, file, flags, errp);
+            return vmdk_open_vmdk4(bs, file, flags, options, errp);
             break;
         default:
             error_setg(errp, "Image not in VMDK format");
@@ -783,7 +759,8 @@ static int vmdk_open_sparse(BlockDriverState *bs,
 }
 
 static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
-                              const char *desc_file_path, Error **errp)
+                              const char *desc_file_path, QDict *options,
+                              Error **errp)
 {
     int ret;
     int matches;
@@ -797,6 +774,7 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
     BlockDriverState *extent_file;
     BDRVVmdkState *s = bs->opaque;
     VmdkExtent *extent;
+    char extent_opt_prefix[32];
 
     while (*p) {
         /* parse extent line in one of below formats:
@@ -846,8 +824,12 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
         extent_path = g_malloc0(PATH_MAX);
         path_combine(extent_path, PATH_MAX, desc_file_path, fname);
         extent_file = NULL;
-        ret = bdrv_open(&extent_file, extent_path, NULL, NULL,
-                        bs->open_flags | BDRV_O_PROTOCOL, NULL, errp);
+
+        ret = snprintf(extent_opt_prefix, 32, "extents.%d", s->num_extents);
+        assert(ret < 32);
+
+        ret = bdrv_open_image(&extent_file, extent_path, options,
+                              extent_opt_prefix, bs, &child_file, false, errp);
         g_free(extent_path);
         if (ret) {
             return ret;
@@ -870,7 +852,8 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
             if (!buf) {
                 ret = -EINVAL;
             } else {
-                ret = vmdk_open_sparse(bs, extent_file, bs->open_flags, buf, errp);
+                ret = vmdk_open_sparse(bs, extent_file, bs->open_flags, buf,
+                                       options, errp);
             }
             g_free(buf);
             if (ret) {
@@ -898,7 +881,7 @@ next_line:
 }
 
 static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf,
-                               Error **errp)
+                               QDict *options, Error **errp)
 {
     int ret;
     char ct[128];
@@ -920,7 +903,7 @@ static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf,
     }
     s->create_type = g_strdup(ct);
     s->desc_offset = 0;
-    ret = vmdk_parse_extents(buf, bs, bs->file->exact_filename, errp);
+    ret = vmdk_parse_extents(buf, bs, bs->file->exact_filename, options, errp);
 exit:
     return ret;
 }
@@ -942,11 +925,11 @@ static int vmdk_open(BlockDriverState *bs, QDict *options, int flags,
     switch (magic) {
         case VMDK3_MAGIC:
         case VMDK4_MAGIC:
-            ret = vmdk_open_sparse(bs, bs->file, flags, buf, errp);
+            ret = vmdk_open_sparse(bs, bs->file, flags, buf, options, errp);
             s->desc_offset = 0x200;
             break;
         default:
-            ret = vmdk_open_desc_file(bs, flags, buf, errp);
+            ret = vmdk_open_desc_file(bs, flags, buf, options, errp);
             break;
     }
     if (ret) {
@@ -1248,6 +1231,17 @@ static VmdkExtent *find_extent(BDRVVmdkState *s,
     return NULL;
 }
 
+static inline uint64_t vmdk_find_index_in_cluster(VmdkExtent *extent,
+                                                  int64_t sector_num)
+{
+    uint64_t index_in_cluster, extent_begin_sector, extent_relative_sector_num;
+
+    extent_begin_sector = extent->end_sector - extent->sectors;
+    extent_relative_sector_num = sector_num - extent_begin_sector;
+    index_in_cluster = extent_relative_sector_num % extent->cluster_sectors;
+    return index_in_cluster;
+}
+
 static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs,
         int64_t sector_num, int nb_sectors, int *pnum)
 {
@@ -1285,7 +1279,7 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs,
         break;
     }
 
-    index_in_cluster = sector_num % extent->cluster_sectors;
+    index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num);
     n = extent->cluster_sectors - index_in_cluster;
     if (n > nb_sectors) {
         n = nb_sectors;
@@ -1413,7 +1407,6 @@ static int vmdk_read(BlockDriverState *bs, int64_t sector_num,
     BDRVVmdkState *s = bs->opaque;
     int ret;
     uint64_t n, index_in_cluster;
-    uint64_t extent_begin_sector, extent_relative_sector_num;
     VmdkExtent *extent = NULL;
     uint64_t cluster_offset;
 
@@ -1425,9 +1418,7 @@ static int vmdk_read(BlockDriverState *bs, int64_t sector_num,
         ret = get_cluster_offset(bs, extent, NULL,
                                  sector_num << 9, false, &cluster_offset,
                                  0, 0);
-        extent_begin_sector = extent->end_sector - extent->sectors;
-        extent_relative_sector_num = sector_num - extent_begin_sector;
-        index_in_cluster = extent_relative_sector_num % extent->cluster_sectors;
+        index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num);
         n = extent->cluster_sectors - index_in_cluster;
         if (n > nb_sectors) {
             n = nb_sectors;
@@ -1489,7 +1480,6 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
     VmdkExtent *extent = NULL;
     int ret;
     int64_t index_in_cluster, n;
-    uint64_t extent_begin_sector, extent_relative_sector_num;
     uint64_t cluster_offset;
     VmdkMetaData m_data;
 
@@ -1505,9 +1495,7 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
         if (!extent) {
             return -EIO;
         }
-        extent_begin_sector = extent->end_sector - extent->sectors;
-        extent_relative_sector_num = sector_num - extent_begin_sector;
-        index_in_cluster = extent_relative_sector_num % extent->cluster_sectors;
+        index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num);
         n = extent->cluster_sectors - index_in_cluster;
         if (n > nb_sectors) {
             n = nb_sectors;
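
The new vmdk_find_index_in_cluster() computes the cluster index relative to the extent rather than to the whole image, which only coincides with the old sector_num % cluster_sectors when the extent happens to start on a cluster boundary. A small worked example (numbers assumed):

/* Extent covering sectors [1000, 3000), cluster_sectors = 128, sector_num = 1100:
 *   extent_begin_sector        = 3000 - 2000 = 1000
 *   extent_relative_sector_num = 1100 - 1000 = 100
 *   index_in_cluster           = 100 % 128   = 100
 * The old computation gave 1100 % 128 = 76, i.e. the wrong offset inside
 * the cluster whenever the extent is not cluster-aligned in image terms. */
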