diff options
Diffstat (limited to 'block')
41 files changed, 3836 insertions, 867 deletions
diff --git a/block/Makefile.objs b/block/Makefile.objs index f9368b52b8..2aaede4ae1 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -1,5 +1,5 @@ block-obj-y += raw-format.o qcow.o vdi.o vmdk.o cloop.o bochs.o vpc.o vvfat.o dmg.o -block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o +block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o qcow2-bitmap.o block-obj-y += qed.o qed-l2-cache.o qed-table.o qed-cluster.o block-obj-y += qed-check.o block-obj-y += vhdx.o vhdx-endian.o vhdx-log.o diff --git a/block/backup.c b/block/backup.c index b69184eac5..504a089150 100644 --- a/block/backup.c +++ b/block/backup.c @@ -639,12 +639,12 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs, ret = bdrv_get_info(target, &bdi); if (ret == -ENOTSUP && !target->backing) { /* Cluster size is not defined */ - error_report("WARNING: The target block device doesn't provide " - "information about the block size and it doesn't have a " - "backing file. The default block size of %u bytes is " - "used. If the actual block size of the target exceeds " - "this default, the backup may be unusable", - BACKUP_CLUSTER_SIZE_DEFAULT); + warn_report("The target block device doesn't provide " + "information about the block size and it doesn't have a " + "backing file. The default block size of %u bytes is " + "used. If the actual block size of the target exceeds " + "this default, the backup may be unusable", + BACKUP_CLUSTER_SIZE_DEFAULT); job->cluster_size = BACKUP_CLUSTER_SIZE_DEFAULT; } else if (ret < 0 && !target->backing) { error_setg_errno(errp, -ret, diff --git a/block/blkdebug.c b/block/blkdebug.c index b25856c49c..c19ab28f07 100644 --- a/block/blkdebug.c +++ b/block/blkdebug.c @@ -821,9 +821,10 @@ static int64_t blkdebug_getlength(BlockDriverState *bs) return bdrv_getlength(bs->file->bs); } -static int blkdebug_truncate(BlockDriverState *bs, int64_t offset, Error **errp) +static int blkdebug_truncate(BlockDriverState *bs, int64_t offset, + PreallocMode prealloc, Error **errp) { - return bdrv_truncate(bs->file, offset, errp); + return bdrv_truncate(bs->file, offset, prealloc, errp); } static void blkdebug_refresh_filename(BlockDriverState *bs, QDict *options) diff --git a/block/block-backend.c b/block/block-backend.c index 0df3457a09..fe3542b3f8 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -1773,14 +1773,15 @@ int blk_pwrite_compressed(BlockBackend *blk, int64_t offset, const void *buf, BDRV_REQ_WRITE_COMPRESSED); } -int blk_truncate(BlockBackend *blk, int64_t offset, Error **errp) +int blk_truncate(BlockBackend *blk, int64_t offset, PreallocMode prealloc, + Error **errp) { if (!blk_is_available(blk)) { error_setg(errp, "No medium inserted"); return -ENOMEDIUM; } - return bdrv_truncate(blk->root, offset, errp); + return bdrv_truncate(blk->root, offset, prealloc, errp); } static void blk_pdiscard_entry(void *opaque) diff --git a/block/commit.c b/block/commit.c index 774a8a599c..13143608f8 100644 --- a/block/commit.c +++ b/block/commit.c @@ -163,7 +163,7 @@ static void coroutine_fn commit_run(void *opaque) } if (base_len < s->common.len) { - ret = blk_truncate(s->base, s->common.len, NULL); + ret = blk_truncate(s->base, s->common.len, PREALLOC_MODE_OFF, NULL); if (ret) { goto out; } @@ -521,7 +521,7 @@ int bdrv_commit(BlockDriverState *bs) * grow the backing file image if possible. If not possible, * we must return an error */ if (length > backing_length) { - ret = blk_truncate(backing, length, &local_err); + ret = blk_truncate(backing, length, PREALLOC_MODE_OFF, &local_err); if (ret < 0) { error_report_err(local_err); goto ro_cleanup; diff --git a/block/crypto.c b/block/crypto.c index 10e5ddccaa..58ef6f2f52 100644 --- a/block/crypto.c +++ b/block/crypto.c @@ -24,16 +24,10 @@ #include "sysemu/block-backend.h" #include "crypto/block.h" #include "qapi/opts-visitor.h" +#include "qapi/qobject-input-visitor.h" #include "qapi-visit.h" #include "qapi/error.h" - -#define BLOCK_CRYPTO_OPT_LUKS_KEY_SECRET "key-secret" -#define BLOCK_CRYPTO_OPT_LUKS_CIPHER_ALG "cipher-alg" -#define BLOCK_CRYPTO_OPT_LUKS_CIPHER_MODE "cipher-mode" -#define BLOCK_CRYPTO_OPT_LUKS_IVGEN_ALG "ivgen-alg" -#define BLOCK_CRYPTO_OPT_LUKS_IVGEN_HASH_ALG "ivgen-hash-alg" -#define BLOCK_CRYPTO_OPT_LUKS_HASH_ALG "hash-alg" -#define BLOCK_CRYPTO_OPT_LUKS_ITER_TIME "iter-time" +#include "block/crypto.h" typedef struct BlockCrypto BlockCrypto; @@ -135,11 +129,7 @@ static QemuOptsList block_crypto_runtime_opts_luks = { .name = "crypto", .head = QTAILQ_HEAD_INITIALIZER(block_crypto_runtime_opts_luks.head), .desc = { - { - .name = BLOCK_CRYPTO_OPT_LUKS_KEY_SECRET, - .type = QEMU_OPT_STRING, - .help = "ID of the secret that provides the encryption key", - }, + BLOCK_CRYPTO_OPT_DEF_LUKS_KEY_SECRET(""), { /* end of list */ } }, }; @@ -154,49 +144,21 @@ static QemuOptsList block_crypto_create_opts_luks = { .type = QEMU_OPT_SIZE, .help = "Virtual disk size" }, - { - .name = BLOCK_CRYPTO_OPT_LUKS_KEY_SECRET, - .type = QEMU_OPT_STRING, - .help = "ID of the secret that provides the encryption key", - }, - { - .name = BLOCK_CRYPTO_OPT_LUKS_CIPHER_ALG, - .type = QEMU_OPT_STRING, - .help = "Name of encryption cipher algorithm", - }, - { - .name = BLOCK_CRYPTO_OPT_LUKS_CIPHER_MODE, - .type = QEMU_OPT_STRING, - .help = "Name of encryption cipher mode", - }, - { - .name = BLOCK_CRYPTO_OPT_LUKS_IVGEN_ALG, - .type = QEMU_OPT_STRING, - .help = "Name of IV generator algorithm", - }, - { - .name = BLOCK_CRYPTO_OPT_LUKS_IVGEN_HASH_ALG, - .type = QEMU_OPT_STRING, - .help = "Name of IV generator hash algorithm", - }, - { - .name = BLOCK_CRYPTO_OPT_LUKS_HASH_ALG, - .type = QEMU_OPT_STRING, - .help = "Name of encryption hash algorithm", - }, - { - .name = BLOCK_CRYPTO_OPT_LUKS_ITER_TIME, - .type = QEMU_OPT_NUMBER, - .help = "Time to spend in PBKDF in milliseconds", - }, + BLOCK_CRYPTO_OPT_DEF_LUKS_KEY_SECRET(""), + BLOCK_CRYPTO_OPT_DEF_LUKS_CIPHER_ALG(""), + BLOCK_CRYPTO_OPT_DEF_LUKS_CIPHER_MODE(""), + BLOCK_CRYPTO_OPT_DEF_LUKS_IVGEN_ALG(""), + BLOCK_CRYPTO_OPT_DEF_LUKS_IVGEN_HASH_ALG(""), + BLOCK_CRYPTO_OPT_DEF_LUKS_HASH_ALG(""), + BLOCK_CRYPTO_OPT_DEF_LUKS_ITER_TIME(""), { /* end of list */ } }, }; -static QCryptoBlockOpenOptions * +QCryptoBlockOpenOptions * block_crypto_open_opts_init(QCryptoBlockFormat format, - QemuOpts *opts, + QDict *opts, Error **errp) { Visitor *v; @@ -206,7 +168,7 @@ block_crypto_open_opts_init(QCryptoBlockFormat format, ret = g_new0(QCryptoBlockOpenOptions, 1); ret->format = format; - v = opts_visitor_new(opts); + v = qobject_input_visitor_new_keyval(QOBJECT(opts)); visit_start_struct(v, NULL, NULL, 0, &local_err); if (local_err) { @@ -219,6 +181,11 @@ block_crypto_open_opts_init(QCryptoBlockFormat format, v, &ret->u.luks, &local_err); break; + case Q_CRYPTO_BLOCK_FORMAT_QCOW: + visit_type_QCryptoBlockOptionsQCow_members( + v, &ret->u.qcow, &local_err); + break; + default: error_setg(&local_err, "Unsupported block format %d", format); break; @@ -240,9 +207,9 @@ block_crypto_open_opts_init(QCryptoBlockFormat format, } -static QCryptoBlockCreateOptions * +QCryptoBlockCreateOptions * block_crypto_create_opts_init(QCryptoBlockFormat format, - QemuOpts *opts, + QDict *opts, Error **errp) { Visitor *v; @@ -252,7 +219,7 @@ block_crypto_create_opts_init(QCryptoBlockFormat format, ret = g_new0(QCryptoBlockCreateOptions, 1); ret->format = format; - v = opts_visitor_new(opts); + v = qobject_input_visitor_new_keyval(QOBJECT(opts)); visit_start_struct(v, NULL, NULL, 0, &local_err); if (local_err) { @@ -265,6 +232,11 @@ block_crypto_create_opts_init(QCryptoBlockFormat format, v, &ret->u.luks, &local_err); break; + case Q_CRYPTO_BLOCK_FORMAT_QCOW: + visit_type_QCryptoBlockOptionsQCow_members( + v, &ret->u.qcow, &local_err); + break; + default: error_setg(&local_err, "Unsupported block format %d", format); break; @@ -299,6 +271,7 @@ static int block_crypto_open_generic(QCryptoBlockFormat format, int ret = -EINVAL; QCryptoBlockOpenOptions *open_opts = NULL; unsigned int cflags = 0; + QDict *cryptoopts = NULL; bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file, false, errp); @@ -313,7 +286,9 @@ static int block_crypto_open_generic(QCryptoBlockFormat format, goto cleanup; } - open_opts = block_crypto_open_opts_init(format, opts, errp); + cryptoopts = qemu_opts_to_qdict(opts, NULL); + + open_opts = block_crypto_open_opts_init(format, cryptoopts, errp); if (!open_opts) { goto cleanup; } @@ -321,7 +296,7 @@ static int block_crypto_open_generic(QCryptoBlockFormat format, if (flags & BDRV_O_NO_IO) { cflags |= QCRYPTO_BLOCK_OPEN_NO_IO; } - crypto->block = qcrypto_block_open(open_opts, + crypto->block = qcrypto_block_open(open_opts, NULL, block_crypto_read_func, bs, cflags, @@ -333,10 +308,10 @@ static int block_crypto_open_generic(QCryptoBlockFormat format, } bs->encrypted = true; - bs->valid_key = true; ret = 0; cleanup: + QDECREF(cryptoopts); qapi_free_QCryptoBlockOpenOptions(open_opts); return ret; } @@ -356,13 +331,16 @@ static int block_crypto_create_generic(QCryptoBlockFormat format, .opts = opts, .filename = filename, }; + QDict *cryptoopts; + + cryptoopts = qemu_opts_to_qdict(opts, NULL); - create_opts = block_crypto_create_opts_init(format, opts, errp); + create_opts = block_crypto_create_opts_init(format, cryptoopts, errp); if (!create_opts) { return -1; } - crypto = qcrypto_block_create(create_opts, + crypto = qcrypto_block_create(create_opts, NULL, block_crypto_init_func, block_crypto_write_func, &data, @@ -375,6 +353,7 @@ static int block_crypto_create_generic(QCryptoBlockFormat format, ret = 0; cleanup: + QDECREF(cryptoopts); qcrypto_block_free(crypto); blk_unref(data.blk); qapi_free_QCryptoBlockCreateOptions(create_opts); @@ -382,7 +361,7 @@ static int block_crypto_create_generic(QCryptoBlockFormat format, } static int block_crypto_truncate(BlockDriverState *bs, int64_t offset, - Error **errp) + PreallocMode prealloc, Error **errp) { BlockCrypto *crypto = bs->opaque; size_t payload_offset = @@ -390,7 +369,7 @@ static int block_crypto_truncate(BlockDriverState *bs, int64_t offset, offset += payload_offset; - return bdrv_truncate(bs->file, offset, errp); + return bdrv_truncate(bs->file, offset, prealloc, errp); } static void block_crypto_close(BlockDriverState *bs) diff --git a/block/crypto.h b/block/crypto.h new file mode 100644 index 0000000000..0f985ea4e2 --- /dev/null +++ b/block/crypto.h @@ -0,0 +1,101 @@ +/* + * QEMU block full disk encryption + * + * Copyright (c) 2015-2017 Red Hat, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + * + */ + +#ifndef BLOCK_CRYPTO_H__ +#define BLOCK_CRYPTO_H__ + +#define BLOCK_CRYPTO_OPT_DEF_KEY_SECRET(prefix, helpstr) \ + { \ + .name = prefix BLOCK_CRYPTO_OPT_QCOW_KEY_SECRET, \ + .type = QEMU_OPT_STRING, \ + .help = helpstr, \ + } + +#define BLOCK_CRYPTO_OPT_QCOW_KEY_SECRET "key-secret" + +#define BLOCK_CRYPTO_OPT_DEF_QCOW_KEY_SECRET(prefix) \ + BLOCK_CRYPTO_OPT_DEF_KEY_SECRET(prefix, \ + "ID of the secret that provides the AES encryption key") + +#define BLOCK_CRYPTO_OPT_LUKS_KEY_SECRET "key-secret" +#define BLOCK_CRYPTO_OPT_LUKS_CIPHER_ALG "cipher-alg" +#define BLOCK_CRYPTO_OPT_LUKS_CIPHER_MODE "cipher-mode" +#define BLOCK_CRYPTO_OPT_LUKS_IVGEN_ALG "ivgen-alg" +#define BLOCK_CRYPTO_OPT_LUKS_IVGEN_HASH_ALG "ivgen-hash-alg" +#define BLOCK_CRYPTO_OPT_LUKS_HASH_ALG "hash-alg" +#define BLOCK_CRYPTO_OPT_LUKS_ITER_TIME "iter-time" + +#define BLOCK_CRYPTO_OPT_DEF_LUKS_KEY_SECRET(prefix) \ + BLOCK_CRYPTO_OPT_DEF_KEY_SECRET(prefix, \ + "ID of the secret that provides the keyslot passphrase") + +#define BLOCK_CRYPTO_OPT_DEF_LUKS_CIPHER_ALG(prefix) \ + { \ + .name = prefix BLOCK_CRYPTO_OPT_LUKS_CIPHER_ALG, \ + .type = QEMU_OPT_STRING, \ + .help = "Name of encryption cipher algorithm", \ + } + +#define BLOCK_CRYPTO_OPT_DEF_LUKS_CIPHER_MODE(prefix) \ + { \ + .name = prefix BLOCK_CRYPTO_OPT_LUKS_CIPHER_MODE, \ + .type = QEMU_OPT_STRING, \ + .help = "Name of encryption cipher mode", \ + } + +#define BLOCK_CRYPTO_OPT_DEF_LUKS_IVGEN_ALG(prefix) \ + { \ + .name = prefix BLOCK_CRYPTO_OPT_LUKS_IVGEN_ALG, \ + .type = QEMU_OPT_STRING, \ + .help = "Name of IV generator algorithm", \ + } + +#define BLOCK_CRYPTO_OPT_DEF_LUKS_IVGEN_HASH_ALG(prefix) \ + { \ + .name = prefix BLOCK_CRYPTO_OPT_LUKS_IVGEN_HASH_ALG, \ + .type = QEMU_OPT_STRING, \ + .help = "Name of IV generator hash algorithm", \ + } + +#define BLOCK_CRYPTO_OPT_DEF_LUKS_HASH_ALG(prefix) \ + { \ + .name = prefix BLOCK_CRYPTO_OPT_LUKS_HASH_ALG, \ + .type = QEMU_OPT_STRING, \ + .help = "Name of encryption hash algorithm", \ + } + +#define BLOCK_CRYPTO_OPT_DEF_LUKS_ITER_TIME(prefix) \ + { \ + .name = prefix BLOCK_CRYPTO_OPT_LUKS_ITER_TIME, \ + .type = QEMU_OPT_NUMBER, \ + .help = "Time to spend in PBKDF in milliseconds", \ + } + +QCryptoBlockCreateOptions * +block_crypto_create_opts_init(QCryptoBlockFormat format, + QDict *opts, + Error **errp); + +QCryptoBlockOpenOptions * +block_crypto_open_opts_init(QCryptoBlockFormat format, + QDict *opts, + Error **errp); + +#endif /* BLOCK_CRYPTO_H__ */ diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c index a04c6e4154..543bddb9b5 100644 --- a/block/dirty-bitmap.c +++ b/block/dirty-bitmap.c @@ -43,8 +43,18 @@ struct BdrvDirtyBitmap { BdrvDirtyBitmap *successor; /* Anonymous child; implies frozen status */ char *name; /* Optional non-empty unique ID */ int64_t size; /* Size of the bitmap (Number of sectors) */ - bool disabled; /* Bitmap is read-only */ + bool disabled; /* Bitmap is disabled. It ignores all writes to + the device */ int active_iterators; /* How many iterators are active */ + bool readonly; /* Bitmap is read-only. This field also + prevents the respective image from being + modified (i.e. blocks writes and discards). + Such operations must fail and both the image + and this bitmap must remain unchanged while + this flag is set. */ + bool autoload; /* For persistent bitmaps: bitmap must be + autoloaded on image opening */ + bool persistent; /* bitmap must be saved to owner disk image */ QLIST_ENTRY(BdrvDirtyBitmap) list; }; @@ -93,6 +103,8 @@ void bdrv_dirty_bitmap_make_anon(BdrvDirtyBitmap *bitmap) assert(!bdrv_dirty_bitmap_frozen(bitmap)); g_free(bitmap->name); bitmap->name = NULL; + bitmap->persistent = false; + bitmap->autoload = false; } /* Called with BQL taken. */ @@ -289,6 +301,10 @@ BdrvDirtyBitmap *bdrv_dirty_bitmap_abdicate(BlockDriverState *bs, bitmap->name = NULL; successor->name = name; bitmap->successor = NULL; + successor->persistent = bitmap->persistent; + bitmap->persistent = false; + successor->autoload = bitmap->autoload; + bitmap->autoload = false; bdrv_release_dirty_bitmap(bs, bitmap); return successor; @@ -340,15 +356,20 @@ void bdrv_dirty_bitmap_truncate(BlockDriverState *bs) bdrv_dirty_bitmaps_unlock(bs); } +static bool bdrv_dirty_bitmap_has_name(BdrvDirtyBitmap *bitmap) +{ + return !!bdrv_dirty_bitmap_name(bitmap); +} + /* Called with BQL taken. */ -static void bdrv_do_release_matching_dirty_bitmap(BlockDriverState *bs, - BdrvDirtyBitmap *bitmap, - bool only_named) +static void bdrv_do_release_matching_dirty_bitmap( + BlockDriverState *bs, BdrvDirtyBitmap *bitmap, + bool (*cond)(BdrvDirtyBitmap *bitmap)) { BdrvDirtyBitmap *bm, *next; bdrv_dirty_bitmaps_lock(bs); QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) { - if ((!bitmap || bm == bitmap) && (!only_named || bm->name)) { + if ((!bitmap || bm == bitmap) && (!cond || cond(bm))) { assert(!bm->active_iterators); assert(!bdrv_dirty_bitmap_frozen(bm)); assert(!bm->meta); @@ -373,17 +394,47 @@ out: /* Called with BQL taken. */ void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap) { - bdrv_do_release_matching_dirty_bitmap(bs, bitmap, false); + bdrv_do_release_matching_dirty_bitmap(bs, bitmap, NULL); } /** * Release all named dirty bitmaps attached to a BDS (for use in bdrv_close()). * There must not be any frozen bitmaps attached. + * This function does not remove persistent bitmaps from the storage. * Called with BQL taken. */ void bdrv_release_named_dirty_bitmaps(BlockDriverState *bs) { - bdrv_do_release_matching_dirty_bitmap(bs, NULL, true); + bdrv_do_release_matching_dirty_bitmap(bs, NULL, bdrv_dirty_bitmap_has_name); +} + +/** + * Release all persistent dirty bitmaps attached to a BDS (for use in + * bdrv_inactivate_recurse()). + * There must not be any frozen bitmaps attached. + * This function does not remove persistent bitmaps from the storage. + */ +void bdrv_release_persistent_dirty_bitmaps(BlockDriverState *bs) +{ + bdrv_do_release_matching_dirty_bitmap(bs, NULL, + bdrv_dirty_bitmap_get_persistance); +} + +/** + * Remove persistent dirty bitmap from the storage if it exists. + * Absence of bitmap is not an error, because we have the following scenario: + * BdrvDirtyBitmap can have .persistent = true but not yet saved and have no + * stored version. For such bitmap bdrv_remove_persistent_dirty_bitmap() should + * not fail. + * This function doesn't release corresponding BdrvDirtyBitmap. + */ +void bdrv_remove_persistent_dirty_bitmap(BlockDriverState *bs, + const char *name, + Error **errp) +{ + if (bs->drv && bs->drv->bdrv_remove_persistent_dirty_bitmap) { + bs->drv->bdrv_remove_persistent_dirty_bitmap(bs, name, errp); + } } /* Called with BQL taken. */ @@ -455,7 +506,7 @@ uint32_t bdrv_get_default_bitmap_granularity(BlockDriverState *bs) return granularity; } -uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap) +uint32_t bdrv_dirty_bitmap_granularity(const BdrvDirtyBitmap *bitmap) { return BDRV_SECTOR_SIZE << hbitmap_granularity(bitmap->bitmap); } @@ -504,6 +555,7 @@ void bdrv_set_dirty_bitmap_locked(BdrvDirtyBitmap *bitmap, int64_t cur_sector, int64_t nr_sectors) { assert(bdrv_dirty_bitmap_enabled(bitmap)); + assert(!bdrv_dirty_bitmap_readonly(bitmap)); hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors); } @@ -520,6 +572,7 @@ void bdrv_reset_dirty_bitmap_locked(BdrvDirtyBitmap *bitmap, int64_t cur_sector, int64_t nr_sectors) { assert(bdrv_dirty_bitmap_enabled(bitmap)); + assert(!bdrv_dirty_bitmap_readonly(bitmap)); hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors); } @@ -534,6 +587,7 @@ void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap, void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out) { assert(bdrv_dirty_bitmap_enabled(bitmap)); + assert(!bdrv_dirty_bitmap_readonly(bitmap)); bdrv_dirty_bitmap_lock(bitmap); if (!out) { hbitmap_reset_all(bitmap->bitmap); @@ -550,6 +604,7 @@ void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in) { HBitmap *tmp = bitmap->bitmap; assert(bdrv_dirty_bitmap_enabled(bitmap)); + assert(!bdrv_dirty_bitmap_readonly(bitmap)); bitmap->bitmap = in; hbitmap_free(tmp); } @@ -586,6 +641,13 @@ void bdrv_dirty_bitmap_deserialize_zeroes(BdrvDirtyBitmap *bitmap, hbitmap_deserialize_zeroes(bitmap->bitmap, start, count, finish); } +void bdrv_dirty_bitmap_deserialize_ones(BdrvDirtyBitmap *bitmap, + uint64_t start, uint64_t count, + bool finish) +{ + hbitmap_deserialize_ones(bitmap->bitmap, start, count, finish); +} + void bdrv_dirty_bitmap_deserialize_finish(BdrvDirtyBitmap *bitmap) { hbitmap_deserialize_finish(bitmap->bitmap); @@ -605,6 +667,7 @@ void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, if (!bdrv_dirty_bitmap_enabled(bitmap)) { continue; } + assert(!bdrv_dirty_bitmap_readonly(bitmap)); hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors); } bdrv_dirty_bitmaps_unlock(bs); @@ -627,3 +690,78 @@ int64_t bdrv_get_meta_dirty_count(BdrvDirtyBitmap *bitmap) { return hbitmap_count(bitmap->meta); } + +bool bdrv_dirty_bitmap_readonly(const BdrvDirtyBitmap *bitmap) +{ + return bitmap->readonly; +} + +/* Called with BQL taken. */ +void bdrv_dirty_bitmap_set_readonly(BdrvDirtyBitmap *bitmap, bool value) +{ + qemu_mutex_lock(bitmap->mutex); + bitmap->readonly = value; + qemu_mutex_unlock(bitmap->mutex); +} + +bool bdrv_has_readonly_bitmaps(BlockDriverState *bs) +{ + BdrvDirtyBitmap *bm; + QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) { + if (bm->readonly) { + return true; + } + } + + return false; +} + +/* Called with BQL taken. */ +void bdrv_dirty_bitmap_set_autoload(BdrvDirtyBitmap *bitmap, bool autoload) +{ + qemu_mutex_lock(bitmap->mutex); + bitmap->autoload = autoload; + qemu_mutex_unlock(bitmap->mutex); +} + +bool bdrv_dirty_bitmap_get_autoload(const BdrvDirtyBitmap *bitmap) +{ + return bitmap->autoload; +} + +/* Called with BQL taken. */ +void bdrv_dirty_bitmap_set_persistance(BdrvDirtyBitmap *bitmap, bool persistent) +{ + qemu_mutex_lock(bitmap->mutex); + bitmap->persistent = persistent; + qemu_mutex_unlock(bitmap->mutex); +} + +bool bdrv_dirty_bitmap_get_persistance(BdrvDirtyBitmap *bitmap) +{ + return bitmap->persistent; +} + +bool bdrv_has_changed_persistent_bitmaps(BlockDriverState *bs) +{ + BdrvDirtyBitmap *bm; + QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) { + if (bm->persistent && !bm->readonly) { + return true; + } + } + + return false; +} + +BdrvDirtyBitmap *bdrv_dirty_bitmap_next(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap) +{ + return bitmap == NULL ? QLIST_FIRST(&bs->dirty_bitmaps) : + QLIST_NEXT(bitmap, list); +} + +char *bdrv_dirty_bitmap_sha256(const BdrvDirtyBitmap *bitmap, Error **errp) +{ + return hbitmap_sha256(bitmap->bitmap, errp); +} diff --git a/block/file-posix.c b/block/file-posix.c index 3927fabf06..cfbb236f6f 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -1624,7 +1624,122 @@ static void raw_close(BlockDriverState *bs) } } -static int raw_truncate(BlockDriverState *bs, int64_t offset, Error **errp) +/** + * Truncates the given regular file @fd to @offset and, when growing, fills the + * new space according to @prealloc. + * + * Returns: 0 on success, -errno on failure. + */ +static int raw_regular_truncate(int fd, int64_t offset, PreallocMode prealloc, + Error **errp) +{ + int result = 0; + int64_t current_length = 0; + char *buf = NULL; + struct stat st; + + if (fstat(fd, &st) < 0) { + result = -errno; + error_setg_errno(errp, -result, "Could not stat file"); + return result; + } + + current_length = st.st_size; + if (current_length > offset && prealloc != PREALLOC_MODE_OFF) { + error_setg(errp, "Cannot use preallocation for shrinking files"); + return -ENOTSUP; + } + + switch (prealloc) { +#ifdef CONFIG_POSIX_FALLOCATE + case PREALLOC_MODE_FALLOC: + /* + * Truncating before posix_fallocate() makes it about twice slower on + * file systems that do not support fallocate(), trying to check if a + * block is allocated before allocating it, so don't do that here. + */ + result = -posix_fallocate(fd, current_length, offset - current_length); + if (result != 0) { + /* posix_fallocate() doesn't set errno. */ + error_setg_errno(errp, -result, + "Could not preallocate new data"); + } + goto out; +#endif + case PREALLOC_MODE_FULL: + { + int64_t num = 0, left = offset - current_length; + + /* + * Knowing the final size from the beginning could allow the file + * system driver to do less allocations and possibly avoid + * fragmentation of the file. + */ + if (ftruncate(fd, offset) != 0) { + result = -errno; + error_setg_errno(errp, -result, "Could not resize file"); + goto out; + } + + buf = g_malloc0(65536); + + result = lseek(fd, current_length, SEEK_SET); + if (result < 0) { + result = -errno; + error_setg_errno(errp, -result, + "Failed to seek to the old end of file"); + goto out; + } + + while (left > 0) { + num = MIN(left, 65536); + result = write(fd, buf, num); + if (result < 0) { + result = -errno; + error_setg_errno(errp, -result, + "Could not write zeros for preallocation"); + goto out; + } + left -= result; + } + if (result >= 0) { + result = fsync(fd); + if (result < 0) { + result = -errno; + error_setg_errno(errp, -result, + "Could not flush file to disk"); + goto out; + } + } + goto out; + } + case PREALLOC_MODE_OFF: + if (ftruncate(fd, offset) != 0) { + result = -errno; + error_setg_errno(errp, -result, "Could not resize file"); + } + return result; + default: + result = -ENOTSUP; + error_setg(errp, "Unsupported preallocation mode: %s", + PreallocMode_lookup[prealloc]); + return result; + } + +out: + if (result < 0) { + if (ftruncate(fd, current_length) < 0) { + error_report("Failed to restore old file length: %s", + strerror(errno)); + } + } + + g_free(buf); + return result; +} + +static int raw_truncate(BlockDriverState *bs, int64_t offset, + PreallocMode prealloc, Error **errp) { BDRVRawState *s = bs->opaque; struct stat st; @@ -1637,12 +1752,16 @@ static int raw_truncate(BlockDriverState *bs, int64_t offset, Error **errp) } if (S_ISREG(st.st_mode)) { - if (ftruncate(s->fd, offset) < 0) { - ret = -errno; - error_setg_errno(errp, -ret, "Failed to resize the file"); - return ret; - } - } else if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) { + return raw_regular_truncate(s->fd, offset, prealloc, errp); + } + + if (prealloc != PREALLOC_MODE_OFF) { + error_setg(errp, "Preallocation mode '%s' unsupported for this " + "non-regular file", PreallocMode_lookup[prealloc]); + return -ENOTSUP; + } + + if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) { if (offset > raw_getlength(bs)) { error_setg(errp, "Cannot grow device files"); return -EINVAL; @@ -1885,71 +2004,9 @@ static int raw_create(const char *filename, QemuOpts *opts, Error **errp) #endif } - switch (prealloc) { -#ifdef CONFIG_POSIX_FALLOCATE - case PREALLOC_MODE_FALLOC: - /* - * Truncating before posix_fallocate() makes it about twice slower on - * file systems that do not support fallocate(), trying to check if a - * block is allocated before allocating it, so don't do that here. - */ - result = -posix_fallocate(fd, 0, total_size); - if (result != 0) { - /* posix_fallocate() doesn't set errno. */ - error_setg_errno(errp, -result, - "Could not preallocate data for the new file"); - } - break; -#endif - case PREALLOC_MODE_FULL: - { - /* - * Knowing the final size from the beginning could allow the file - * system driver to do less allocations and possibly avoid - * fragmentation of the file. - */ - if (ftruncate(fd, total_size) != 0) { - result = -errno; - error_setg_errno(errp, -result, "Could not resize file"); - goto out_close; - } - - int64_t num = 0, left = total_size; - buf = g_malloc0(65536); - - while (left > 0) { - num = MIN(left, 65536); - result = write(fd, buf, num); - if (result < 0) { - result = -errno; - error_setg_errno(errp, -result, - "Could not write to the new file"); - break; - } - left -= result; - } - if (result >= 0) { - result = fsync(fd); - if (result < 0) { - result = -errno; - error_setg_errno(errp, -result, - "Could not flush new file to disk"); - } - } - g_free(buf); - break; - } - case PREALLOC_MODE_OFF: - if (ftruncate(fd, total_size) != 0) { - result = -errno; - error_setg_errno(errp, -result, "Could not resize file"); - } - break; - default: - result = -EINVAL; - error_setg(errp, "Unsupported preallocation mode: %s", - PreallocMode_lookup[prealloc]); - break; + result = raw_regular_truncate(fd, total_size, prealloc, errp); + if (result < 0) { + goto out_close; } out_close: diff --git a/block/file-win32.c b/block/file-win32.c index ef2910b03f..4706335cff 100644 --- a/block/file-win32.c +++ b/block/file-win32.c @@ -461,12 +461,19 @@ static void raw_close(BlockDriverState *bs) } } -static int raw_truncate(BlockDriverState *bs, int64_t offset, Error **errp) +static int raw_truncate(BlockDriverState *bs, int64_t offset, + PreallocMode prealloc, Error **errp) { BDRVRawState *s = bs->opaque; LONG low, high; DWORD dwPtrLow; + if (prealloc != PREALLOC_MODE_OFF) { + error_setg(errp, "Unsupported preallocation mode '%s'", + PreallocMode_lookup[prealloc]); + return -ENOTSUP; + } + low = offset; high = offset >> 32; diff --git a/block/gluster.c b/block/gluster.c index addceed6eb..3064a45047 100644 --- a/block/gluster.c +++ b/block/gluster.c @@ -345,8 +345,7 @@ static int qemu_gluster_parse_uri(BlockdevOptionsGluster *gconf, is_unix = true; } else if (!strcmp(uri->scheme, "gluster+rdma")) { gsconf->type = SOCKET_ADDRESS_TYPE_INET; - error_report("Warning: rdma feature is not supported, falling " - "back to tcp"); + warn_report("rdma feature is not supported, falling back to tcp"); } else { ret = -EINVAL; goto out; @@ -1096,11 +1095,17 @@ static coroutine_fn int qemu_gluster_co_rw(BlockDriverState *bs, } static int qemu_gluster_truncate(BlockDriverState *bs, int64_t offset, - Error **errp) + PreallocMode prealloc, Error **errp) { int ret; BDRVGlusterState *s = bs->opaque; + if (prealloc != PREALLOC_MODE_OFF) { + error_setg(errp, "Unsupported preallocation mode '%s'", + PreallocMode_lookup[prealloc]); + return -ENOTSUP; + } + ret = glfs_ftruncate(s->fd, offset); if (ret < 0) { ret = -errno; diff --git a/block/io.c b/block/io.c index 23170a57ee..aece54c015 100644 --- a/block/io.c +++ b/block/io.c @@ -149,6 +149,37 @@ bool bdrv_requests_pending(BlockDriverState *bs) return false; } +typedef struct { + Coroutine *co; + BlockDriverState *bs; + bool done; +} BdrvCoDrainData; + +static void coroutine_fn bdrv_drain_invoke_entry(void *opaque) +{ + BdrvCoDrainData *data = opaque; + BlockDriverState *bs = data->bs; + + bs->drv->bdrv_co_drain(bs); + + /* Set data->done before reading bs->wakeup. */ + atomic_mb_set(&data->done, true); + bdrv_wakeup(bs); +} + +static void bdrv_drain_invoke(BlockDriverState *bs) +{ + BdrvCoDrainData data = { .bs = bs, .done = false }; + + if (!bs->drv || !bs->drv->bdrv_co_drain) { + return; + } + + data.co = qemu_coroutine_create(bdrv_drain_invoke_entry, &data); + bdrv_coroutine_enter(bs, data.co); + BDRV_POLL_WHILE(bs, !data.done); +} + static bool bdrv_drain_recurse(BlockDriverState *bs) { BdrvChild *child, *tmp; @@ -156,9 +187,8 @@ static bool bdrv_drain_recurse(BlockDriverState *bs) waited = BDRV_POLL_WHILE(bs, atomic_read(&bs->in_flight) > 0); - if (bs->drv && bs->drv->bdrv_drain) { - bs->drv->bdrv_drain(bs); - } + /* Ensure any pending metadata writes are submitted to bs->file. */ + bdrv_drain_invoke(bs); QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) { BlockDriverState *bs = child->bs; @@ -184,12 +214,6 @@ static bool bdrv_drain_recurse(BlockDriverState *bs) return waited; } -typedef struct { - Coroutine *co; - BlockDriverState *bs; - bool done; -} BdrvCoDrainData; - static void bdrv_co_drain_bh_cb(void *opaque) { BdrvCoDrainData *data = opaque; @@ -1315,6 +1339,10 @@ static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child, uint64_t bytes_remaining = bytes; int max_transfer; + if (bdrv_has_readonly_bitmaps(bs)) { + return -EPERM; + } + assert(is_power_of_2(align)); assert((offset & (align - 1)) == 0); assert((bytes & (align - 1)) == 0); @@ -2287,6 +2315,10 @@ int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset, return -ENOMEDIUM; } + if (bdrv_has_readonly_bitmaps(bs)) { + return -EPERM; + } + ret = bdrv_check_byte_request(bs, offset, bytes); if (ret < 0) { return ret; diff --git a/block/iscsi.c b/block/iscsi.c index 54067e2620..d557c99668 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -1761,9 +1761,9 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, * filename encoded options */ filename = qdict_get_try_str(options, "filename"); if (filename) { - error_report("Warning: 'filename' option specified. " - "This is an unsupported option, and may be deprecated " - "in the future"); + warn_report("'filename' option specified. " + "This is an unsupported option, and may be deprecated " + "in the future"); iscsi_parse_filename(filename, options, &local_err); if (local_err) { ret = -EINVAL; @@ -2079,11 +2079,18 @@ static void iscsi_reopen_commit(BDRVReopenState *reopen_state) } } -static int iscsi_truncate(BlockDriverState *bs, int64_t offset, Error **errp) +static int iscsi_truncate(BlockDriverState *bs, int64_t offset, + PreallocMode prealloc, Error **errp) { IscsiLun *iscsilun = bs->opaque; Error *local_err = NULL; + if (prealloc != PREALLOC_MODE_OFF) { + error_setg(errp, "Unsupported preallocation mode '%s'", + PreallocMode_lookup[prealloc]); + return -ENOTSUP; + } + if (iscsilun->type != TYPE_DISK) { error_setg(errp, "Cannot resize non-disk iSCSI devices"); return -ENOTSUP; diff --git a/block/mirror.c b/block/mirror.c index eaf0fe7858..8583b764a0 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -739,7 +739,8 @@ static void coroutine_fn mirror_run(void *opaque) } if (s->bdev_length > base_length) { - ret = blk_truncate(s->target, s->bdev_length, NULL); + ret = blk_truncate(s->target, s->bdev_length, PREALLOC_MODE_OFF, + NULL); if (ret < 0) { goto immediate_exit; } diff --git a/block/nbd-client.c b/block/nbd-client.c index 208f907095..25dd28406b 100644 --- a/block/nbd-client.c +++ b/block/nbd-client.c @@ -242,7 +242,7 @@ int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset, ssize_t ret; if (flags & BDRV_REQ_FUA) { - assert(client->nbdflags & NBD_FLAG_SEND_FUA); + assert(client->info.flags & NBD_FLAG_SEND_FUA); request.flags |= NBD_CMD_FLAG_FUA; } @@ -270,12 +270,12 @@ int nbd_client_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, }; NBDReply reply; - if (!(client->nbdflags & NBD_FLAG_SEND_WRITE_ZEROES)) { + if (!(client->info.flags & NBD_FLAG_SEND_WRITE_ZEROES)) { return -ENOTSUP; } if (flags & BDRV_REQ_FUA) { - assert(client->nbdflags & NBD_FLAG_SEND_FUA); + assert(client->info.flags & NBD_FLAG_SEND_FUA); request.flags |= NBD_CMD_FLAG_FUA; } if (!(flags & BDRV_REQ_MAY_UNMAP)) { @@ -299,7 +299,7 @@ int nbd_client_co_flush(BlockDriverState *bs) NBDReply reply; ssize_t ret; - if (!(client->nbdflags & NBD_FLAG_SEND_FLUSH)) { + if (!(client->info.flags & NBD_FLAG_SEND_FLUSH)) { return 0; } @@ -327,7 +327,7 @@ int nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes) NBDReply reply; ssize_t ret; - if (!(client->nbdflags & NBD_FLAG_SEND_TRIM)) { + if (!(client->info.flags & NBD_FLAG_SEND_TRIM)) { return 0; } @@ -384,22 +384,24 @@ int nbd_client_init(BlockDriverState *bs, logout("session init %s\n", export); qio_channel_set_blocking(QIO_CHANNEL(sioc), true, NULL); + client->info.request_sizes = true; ret = nbd_receive_negotiate(QIO_CHANNEL(sioc), export, - &client->nbdflags, tlscreds, hostname, - &client->ioc, - &client->size, errp); + &client->ioc, &client->info, errp); if (ret < 0) { logout("Failed to negotiate with the NBD server\n"); return ret; } - if (client->nbdflags & NBD_FLAG_SEND_FUA) { + if (client->info.flags & NBD_FLAG_SEND_FUA) { bs->supported_write_flags = BDRV_REQ_FUA; bs->supported_zero_flags |= BDRV_REQ_FUA; } - if (client->nbdflags & NBD_FLAG_SEND_WRITE_ZEROES) { + if (client->info.flags & NBD_FLAG_SEND_WRITE_ZEROES) { bs->supported_zero_flags |= BDRV_REQ_MAY_UNMAP; } + if (client->info.min_block > bs->bl.request_alignment) { + bs->bl.request_alignment = client->info.min_block; + } qemu_co_mutex_init(&client->send_mutex); qemu_co_queue_init(&client->free_sema); diff --git a/block/nbd-client.h b/block/nbd-client.h index 49636bc621..df80771357 100644 --- a/block/nbd-client.h +++ b/block/nbd-client.h @@ -20,8 +20,7 @@ typedef struct NBDClientSession { QIOChannelSocket *sioc; /* The master data channel */ QIOChannel *ioc; /* The current I/O channel which may differ (eg TLS) */ - uint16_t nbdflags; - off_t size; + NBDExportInfo info; CoMutex send_mutex; CoQueue free_sema; diff --git a/block/nbd.c b/block/nbd.c index d529305330..a50d24b50a 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -472,9 +472,17 @@ static int nbd_co_flush(BlockDriverState *bs) static void nbd_refresh_limits(BlockDriverState *bs, Error **errp) { - bs->bl.max_pdiscard = NBD_MAX_BUFFER_SIZE; - bs->bl.max_pwrite_zeroes = NBD_MAX_BUFFER_SIZE; - bs->bl.max_transfer = NBD_MAX_BUFFER_SIZE; + NBDClientSession *s = nbd_get_client_session(bs); + uint32_t max = MIN_NON_ZERO(NBD_MAX_BUFFER_SIZE, s->info.max_block); + + bs->bl.max_pdiscard = max; + bs->bl.max_pwrite_zeroes = max; + bs->bl.max_transfer = max; + + if (s->info.opt_block && + s->info.opt_block > bs->bl.opt_transfer) { + bs->bl.opt_transfer = s->info.opt_block; + } } static void nbd_close(BlockDriverState *bs) @@ -492,7 +500,7 @@ static int64_t nbd_getlength(BlockDriverState *bs) { BDRVNBDState *s = bs->opaque; - return s->client.size; + return s->client.info.size; } static void nbd_detach_aio_context(BlockDriverState *bs) diff --git a/block/nfs.c b/block/nfs.c index c3c5de0113..d8db419957 100644 --- a/block/nfs.c +++ b/block/nfs.c @@ -558,8 +558,8 @@ static int64_t nfs_client_open(NFSClient *client, QDict *options, } client->readahead = qemu_opt_get_number(opts, "readahead-size", 0); if (client->readahead > QEMU_NFS_MAX_READAHEAD_SIZE) { - error_report("NFS Warning: Truncating NFS readahead " - "size to %d", QEMU_NFS_MAX_READAHEAD_SIZE); + warn_report("Truncating NFS readahead size to %d", + QEMU_NFS_MAX_READAHEAD_SIZE); client->readahead = QEMU_NFS_MAX_READAHEAD_SIZE; } nfs_set_readahead(client->context, client->readahead); @@ -579,8 +579,8 @@ static int64_t nfs_client_open(NFSClient *client, QDict *options, } client->pagecache = qemu_opt_get_number(opts, "page-cache-size", 0); if (client->pagecache > QEMU_NFS_MAX_PAGECACHE_SIZE) { - error_report("NFS Warning: Truncating NFS pagecache " - "size to %d pages", QEMU_NFS_MAX_PAGECACHE_SIZE); + warn_report("Truncating NFS pagecache size to %d pages", + QEMU_NFS_MAX_PAGECACHE_SIZE); client->pagecache = QEMU_NFS_MAX_PAGECACHE_SIZE; } nfs_set_pagecache(client->context, client->pagecache); @@ -595,8 +595,8 @@ static int64_t nfs_client_open(NFSClient *client, QDict *options, /* limit the maximum debug level to avoid potential flooding * of our log files. */ if (client->debug > QEMU_NFS_MAX_DEBUG_LEVEL) { - error_report("NFS Warning: Limiting NFS debug level " - "to %d", QEMU_NFS_MAX_DEBUG_LEVEL); + warn_report("Limiting NFS debug level to %d", + QEMU_NFS_MAX_DEBUG_LEVEL); client->debug = QEMU_NFS_MAX_DEBUG_LEVEL; } nfs_set_debug(client->context, client->debug); @@ -759,11 +759,18 @@ static int64_t nfs_get_allocated_file_size(BlockDriverState *bs) return (task.ret < 0 ? task.ret : st.st_blocks * 512); } -static int nfs_file_truncate(BlockDriverState *bs, int64_t offset, Error **errp) +static int nfs_file_truncate(BlockDriverState *bs, int64_t offset, + PreallocMode prealloc, Error **errp) { NFSClient *client = bs->opaque; int ret; + if (prealloc != PREALLOC_MODE_OFF) { + error_setg(errp, "Unsupported preallocation mode '%s'", + PreallocMode_lookup[prealloc]); + return -ENOTSUP; + } + ret = nfs_ftruncate(client->context, client->fh, offset); if (ret < 0) { error_setg_errno(errp, -ret, "Failed to truncate file"); diff --git a/block/parallels.c b/block/parallels.c index 8be46a7d48..5bbdfabb7a 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -224,7 +224,7 @@ static int64_t allocate_clusters(BlockDriverState *bs, int64_t sector_num, } else { ret = bdrv_truncate(bs->file, (s->data_end + space) << BDRV_SECTOR_BITS, - NULL); + PREALLOC_MODE_OFF, NULL); } if (ret < 0) { return ret; @@ -458,7 +458,8 @@ static int parallels_check(BlockDriverState *bs, BdrvCheckResult *res, res->leaks += count; if (fix & BDRV_FIX_LEAKS) { Error *local_err = NULL; - ret = bdrv_truncate(bs->file, res->image_end_offset, &local_err); + ret = bdrv_truncate(bs->file, res->image_end_offset, + PREALLOC_MODE_OFF, &local_err); if (ret < 0) { error_report_err(local_err); res->check_errors++; @@ -507,7 +508,7 @@ static int parallels_create(const char *filename, QemuOpts *opts, Error **errp) blk_set_allow_write_beyond_eof(file, true); - ret = blk_truncate(file, 0, errp); + ret = blk_truncate(file, 0, PREALLOC_MODE_OFF, errp); if (ret < 0) { goto exit; } @@ -699,7 +700,8 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags, } if (!(flags & BDRV_O_RESIZE) || !bdrv_has_zero_init(bs->file->bs) || - bdrv_truncate(bs->file, bdrv_getlength(bs->file->bs), NULL) != 0) { + bdrv_truncate(bs->file, bdrv_getlength(bs->file->bs), + PREALLOC_MODE_OFF, NULL) != 0) { s->prealloc_mode = PRL_PREALLOC_MODE_FALLOCATE; } @@ -742,7 +744,8 @@ static void parallels_close(BlockDriverState *bs) } if (bs->open_flags & BDRV_O_RDWR) { - bdrv_truncate(bs->file, s->data_end << BDRV_SECTOR_BITS, NULL); + bdrv_truncate(bs->file, s->data_end << BDRV_SECTOR_BITS, + PREALLOC_MODE_OFF, NULL); } g_free(s->bat_dirty_bmap); diff --git a/block/qapi.c b/block/qapi.c index 0a41d59bf3..080eb8f115 100644 --- a/block/qapi.c +++ b/block/qapi.c @@ -45,7 +45,7 @@ BlockDeviceInfo *bdrv_block_device_info(BlockBackend *blk, info->ro = bs->read_only; info->drv = g_strdup(bs->drv->format_name); info->encrypted = bs->encrypted; - info->encryption_key_missing = bdrv_key_required(bs); + info->encryption_key_missing = false; info->cache = g_new(BlockdevCacheInfo, 1); *info->cache = (BlockdevCacheInfo) { diff --git a/block/qcow.c b/block/qcow.c index 7bd94dcd46..66827d6f24 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -31,8 +31,10 @@ #include "qemu/bswap.h" #include <zlib.h> #include "qapi/qmp/qerror.h" -#include "crypto/cipher.h" +#include "qapi/qmp/qstring.h" +#include "crypto/block.h" #include "migration/blocker.h" +#include "block/crypto.h" /**************************************************************/ /* QEMU COW block driver with compression and encryption support */ @@ -77,7 +79,7 @@ typedef struct BDRVQcowState { uint8_t *cluster_cache; uint8_t *cluster_data; uint64_t cluster_cache_offset; - QCryptoCipher *cipher; /* NULL if no key yet */ + QCryptoBlock *crypto; /* Disk encryption format driver */ uint32_t crypt_method_header; CoMutex lock; Error *migration_blocker; @@ -97,6 +99,15 @@ static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename) return 0; } +static QemuOptsList qcow_runtime_opts = { + .name = "qcow", + .head = QTAILQ_HEAD_INITIALIZER(qcow_runtime_opts.head), + .desc = { + BLOCK_CRYPTO_OPT_DEF_QCOW_KEY_SECRET("encrypt."), + { /* end of list */ } + }, +}; + static int qcow_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { @@ -105,11 +116,19 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, int ret; QCowHeader header; Error *local_err = NULL; + QCryptoBlockOpenOptions *crypto_opts = NULL; + unsigned int cflags = 0; + QDict *encryptopts = NULL; + const char *encryptfmt; + + qdict_extract_subqdict(options, &encryptopts, "encrypt."); + encryptfmt = qdict_get_try_str(encryptopts, "format"); bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file, false, errp); if (!bs->file) { - return -EINVAL; + ret = -EINVAL; + goto fail; } ret = bdrv_pread(bs->file, 0, &header, sizeof(header)); @@ -155,17 +174,6 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, goto fail; } - if (header.crypt_method > QCOW_CRYPT_AES) { - error_setg(errp, "invalid encryption method in qcow header"); - ret = -EINVAL; - goto fail; - } - if (!qcrypto_cipher_supports(QCRYPTO_CIPHER_ALG_AES_128, - QCRYPTO_CIPHER_MODE_CBC)) { - error_setg(errp, "AES cipher not available"); - ret = -EINVAL; - goto fail; - } s->crypt_method_header = header.crypt_method; if (s->crypt_method_header) { if (bdrv_uses_whitelist() && @@ -181,8 +189,44 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, ret = -ENOSYS; goto fail; } + if (s->crypt_method_header == QCOW_CRYPT_AES) { + if (encryptfmt && !g_str_equal(encryptfmt, "aes")) { + error_setg(errp, + "Header reported 'aes' encryption format but " + "options specify '%s'", encryptfmt); + ret = -EINVAL; + goto fail; + } + qdict_del(encryptopts, "format"); + crypto_opts = block_crypto_open_opts_init( + Q_CRYPTO_BLOCK_FORMAT_QCOW, encryptopts, errp); + if (!crypto_opts) { + ret = -EINVAL; + goto fail; + } + if (flags & BDRV_O_NO_IO) { + cflags |= QCRYPTO_BLOCK_OPEN_NO_IO; + } + s->crypto = qcrypto_block_open(crypto_opts, "encrypt.", + NULL, NULL, cflags, errp); + if (!s->crypto) { + ret = -EINVAL; + goto fail; + } + } else { + error_setg(errp, "invalid encryption method in qcow header"); + ret = -EINVAL; + goto fail; + } bs->encrypted = true; + } else { + if (encryptfmt) { + error_setg(errp, "No encryption in image header, but options " + "specified format '%s'", encryptfmt); + ret = -EINVAL; + goto fail; + } } s->cluster_bits = header.cluster_bits; s->cluster_size = 1 << s->cluster_bits; @@ -266,6 +310,8 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, goto fail; } + QDECREF(encryptopts); + qapi_free_QCryptoBlockOpenOptions(crypto_opts); qemu_co_mutex_init(&s->lock); return 0; @@ -274,6 +320,9 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, qemu_vfree(s->l2_cache); g_free(s->cluster_cache); g_free(s->cluster_data); + qcrypto_block_free(s->crypto); + QDECREF(encryptopts); + qapi_free_QCryptoBlockOpenOptions(crypto_opts); return ret; } @@ -286,85 +335,6 @@ static int qcow_reopen_prepare(BDRVReopenState *state, return 0; } -static int qcow_set_key(BlockDriverState *bs, const char *key) -{ - BDRVQcowState *s = bs->opaque; - uint8_t keybuf[16]; - int len, i; - Error *err; - - memset(keybuf, 0, 16); - len = strlen(key); - if (len > 16) - len = 16; - /* XXX: we could compress the chars to 7 bits to increase - entropy */ - for(i = 0;i < len;i++) { - keybuf[i] = key[i]; - } - assert(bs->encrypted); - - qcrypto_cipher_free(s->cipher); - s->cipher = qcrypto_cipher_new( - QCRYPTO_CIPHER_ALG_AES_128, - QCRYPTO_CIPHER_MODE_CBC, - keybuf, G_N_ELEMENTS(keybuf), - &err); - - if (!s->cipher) { - /* XXX would be nice if errors in this method could - * be properly propagate to the caller. Would need - * the bdrv_set_key() API signature to be fixed. */ - error_free(err); - return -1; - } - return 0; -} - -/* The crypt function is compatible with the linux cryptoloop - algorithm for < 4 GB images. NOTE: out_buf == in_buf is - supported */ -static int encrypt_sectors(BDRVQcowState *s, int64_t sector_num, - uint8_t *out_buf, const uint8_t *in_buf, - int nb_sectors, bool enc, Error **errp) -{ - union { - uint64_t ll[2]; - uint8_t b[16]; - } ivec; - int i; - int ret; - - for(i = 0; i < nb_sectors; i++) { - ivec.ll[0] = cpu_to_le64(sector_num); - ivec.ll[1] = 0; - if (qcrypto_cipher_setiv(s->cipher, - ivec.b, G_N_ELEMENTS(ivec.b), - errp) < 0) { - return -1; - } - if (enc) { - ret = qcrypto_cipher_encrypt(s->cipher, - in_buf, - out_buf, - 512, - errp); - } else { - ret = qcrypto_cipher_decrypt(s->cipher, - in_buf, - out_buf, - 512, - errp); - } - if (ret < 0) { - return -1; - } - sector_num++; - in_buf += 512; - out_buf += 512; - } - return 0; -} /* 'allocate' is: * @@ -473,22 +443,23 @@ static uint64_t get_cluster_offset(BlockDriverState *bs, /* round to cluster size */ cluster_offset = (cluster_offset + s->cluster_size - 1) & ~(s->cluster_size - 1); - bdrv_truncate(bs->file, cluster_offset + s->cluster_size, NULL); + bdrv_truncate(bs->file, cluster_offset + s->cluster_size, + PREALLOC_MODE_OFF, NULL); /* if encrypted, we must initialize the cluster content which won't be written */ if (bs->encrypted && (n_end - n_start) < s->cluster_sectors) { uint64_t start_sect; - assert(s->cipher); + assert(s->crypto); start_sect = (offset & ~(s->cluster_size - 1)) >> 9; - memset(s->cluster_data + 512, 0x00, 512); for(i = 0; i < s->cluster_sectors; i++) { if (i < n_start || i >= n_end) { Error *err = NULL; - if (encrypt_sectors(s, start_sect + i, - s->cluster_data, - s->cluster_data + 512, 1, - true, &err) < 0) { + memset(s->cluster_data, 0x00, 512); + if (qcrypto_block_encrypt(s->crypto, start_sect + i, + s->cluster_data, + BDRV_SECTOR_SIZE, + &err) < 0) { error_free(err); errno = EIO; return -1; @@ -533,7 +504,7 @@ static int64_t coroutine_fn qcow_co_get_block_status(BlockDriverState *bs, if (!cluster_offset) { return 0; } - if ((cluster_offset & QCOW_OFLAG_COMPRESSED) || s->cipher) { + if ((cluster_offset & QCOW_OFLAG_COMPRESSED) || s->crypto) { return BDRV_BLOCK_DATA; } cluster_offset |= (index_in_cluster << BDRV_SECTOR_BITS); @@ -664,9 +635,9 @@ static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num, break; } if (bs->encrypted) { - assert(s->cipher); - if (encrypt_sectors(s, sector_num, buf, buf, - n, false, &err) < 0) { + assert(s->crypto); + if (qcrypto_block_decrypt(s->crypto, sector_num, buf, + n * BDRV_SECTOR_SIZE, &err) < 0) { goto fail; } } @@ -700,9 +671,7 @@ static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num, BDRVQcowState *s = bs->opaque; int index_in_cluster; uint64_t cluster_offset; - const uint8_t *src_buf; int ret = 0, n; - uint8_t *cluster_data = NULL; struct iovec hd_iov; QEMUIOVector hd_qiov; uint8_t *buf; @@ -710,7 +679,9 @@ static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num, s->cluster_cache_offset = -1; /* disable compressed cache */ - if (qiov->niov > 1) { + /* We must always copy the iov when encrypting, so we + * don't modify the original data buffer during encryption */ + if (bs->encrypted || qiov->niov > 1) { buf = orig_buf = qemu_try_blockalign(bs, qiov->size); if (buf == NULL) { return -ENOMEM; @@ -739,22 +710,16 @@ static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num, } if (bs->encrypted) { Error *err = NULL; - assert(s->cipher); - if (!cluster_data) { - cluster_data = g_malloc0(s->cluster_size); - } - if (encrypt_sectors(s, sector_num, cluster_data, buf, - n, true, &err) < 0) { + assert(s->crypto); + if (qcrypto_block_encrypt(s->crypto, sector_num, buf, + n * BDRV_SECTOR_SIZE, &err) < 0) { error_free(err); ret = -EIO; break; } - src_buf = cluster_data; - } else { - src_buf = buf; } - hd_iov.iov_base = (void *)src_buf; + hd_iov.iov_base = (void *)buf; hd_iov.iov_len = n * 512; qemu_iovec_init_external(&hd_qiov, &hd_iov, 1); qemu_co_mutex_unlock(&s->lock); @@ -773,10 +738,7 @@ static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num, } qemu_co_mutex_unlock(&s->lock); - if (qiov->niov > 1) { - qemu_vfree(orig_buf); - } - g_free(cluster_data); + qemu_vfree(orig_buf); return ret; } @@ -785,8 +747,8 @@ static void qcow_close(BlockDriverState *bs) { BDRVQcowState *s = bs->opaque; - qcrypto_cipher_free(s->cipher); - s->cipher = NULL; + qcrypto_block_free(s->crypto); + s->crypto = NULL; g_free(s->l1_table); qemu_vfree(s->l2_cache); g_free(s->cluster_cache); @@ -803,17 +765,35 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp) uint8_t *tmp; int64_t total_size = 0; char *backing_file = NULL; - int flags = 0; Error *local_err = NULL; int ret; BlockBackend *qcow_blk; + const char *encryptfmt = NULL; + QDict *options; + QDict *encryptopts = NULL; + QCryptoBlockCreateOptions *crypto_opts = NULL; + QCryptoBlock *crypto = NULL; /* Read out options */ total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), BDRV_SECTOR_SIZE); + if (total_size == 0) { + error_setg(errp, "Image size is too small, cannot be zero length"); + ret = -EINVAL; + goto cleanup; + } + backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE); - if (qemu_opt_get_bool_del(opts, BLOCK_OPT_ENCRYPT, false)) { - flags |= BLOCK_FLAG_ENCRYPT; + encryptfmt = qemu_opt_get_del(opts, BLOCK_OPT_ENCRYPT_FORMAT); + if (encryptfmt) { + if (qemu_opt_get(opts, BLOCK_OPT_ENCRYPT)) { + error_setg(errp, "Options " BLOCK_OPT_ENCRYPT " and " + BLOCK_OPT_ENCRYPT_FORMAT " are mutually exclusive"); + ret = -EINVAL; + goto cleanup; + } + } else if (qemu_opt_get_bool_del(opts, BLOCK_OPT_ENCRYPT, false)) { + encryptfmt = "aes"; } ret = bdrv_create_file(filename, opts, &local_err); @@ -833,7 +813,7 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp) blk_set_allow_write_beyond_eof(qcow_blk, true); - ret = blk_truncate(qcow_blk, 0, errp); + ret = blk_truncate(qcow_blk, 0, PREALLOC_MODE_OFF, errp); if (ret < 0) { goto exit; } @@ -867,8 +847,32 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp) l1_size = (total_size + (1LL << shift) - 1) >> shift; header.l1_table_offset = cpu_to_be64(header_size); - if (flags & BLOCK_FLAG_ENCRYPT) { + + options = qemu_opts_to_qdict(opts, NULL); + qdict_extract_subqdict(options, &encryptopts, "encrypt."); + QDECREF(options); + if (encryptfmt) { + if (!g_str_equal(encryptfmt, "aes")) { + error_setg(errp, "Unknown encryption format '%s', expected 'aes'", + encryptfmt); + ret = -EINVAL; + goto exit; + } header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES); + + crypto_opts = block_crypto_create_opts_init( + Q_CRYPTO_BLOCK_FORMAT_QCOW, encryptopts, errp); + if (!crypto_opts) { + ret = -EINVAL; + goto exit; + } + + crypto = qcrypto_block_create(crypto_opts, "encrypt.", + NULL, NULL, NULL, errp); + if (!crypto) { + ret = -EINVAL; + goto exit; + } } else { header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE); } @@ -903,6 +907,9 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp) exit: blk_unref(qcow_blk); cleanup: + QDECREF(encryptopts); + qcrypto_block_free(crypto); + qapi_free_QCryptoBlockCreateOptions(crypto_opts); g_free(backing_file); return ret; } @@ -917,7 +924,8 @@ static int qcow_make_empty(BlockDriverState *bs) if (bdrv_pwrite_sync(bs->file, s->l1_table_offset, s->l1_table, l1_length) < 0) return -1; - ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length, NULL); + ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length, + PREALLOC_MODE_OFF, NULL); if (ret < 0) return ret; @@ -1041,9 +1049,15 @@ static QemuOptsList qcow_create_opts = { { .name = BLOCK_OPT_ENCRYPT, .type = QEMU_OPT_BOOL, - .help = "Encrypt the image", - .def_value_str = "off" + .help = "Encrypt the image with format 'aes'. (Deprecated " + "in favor of " BLOCK_OPT_ENCRYPT_FORMAT "=aes)", + }, + { + .name = BLOCK_OPT_ENCRYPT_FORMAT, + .type = QEMU_OPT_STRING, + .help = "Encrypt the image, format choices: 'aes'", }, + BLOCK_CRYPTO_OPT_DEF_QCOW_KEY_SECRET("encrypt."), { /* end of list */ } } }; @@ -1064,7 +1078,6 @@ static BlockDriver bdrv_qcow = { .bdrv_co_writev = qcow_co_writev, .bdrv_co_get_block_status = qcow_co_get_block_status, - .bdrv_set_key = qcow_set_key, .bdrv_make_empty = qcow_make_empty, .bdrv_co_pwritev_compressed = qcow_co_pwritev_compressed, .bdrv_get_info = qcow_get_info, diff --git a/block/qcow2-bitmap.c b/block/qcow2-bitmap.c new file mode 100644 index 0000000000..3e8735a20d --- /dev/null +++ b/block/qcow2-bitmap.c @@ -0,0 +1,1482 @@ +/* + * Bitmaps for the QCOW version 2 format + * + * Copyright (c) 2014-2017 Vladimir Sementsov-Ogievskiy + * + * This file is derived from qcow2-snapshot.c, original copyright: + * Copyright (c) 2004-2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qemu/cutils.h" + +#include "block/block_int.h" +#include "block/qcow2.h" + +/* NOTICE: BME here means Bitmaps Extension and used as a namespace for + * _internal_ constants. Please do not use this _internal_ abbreviation for + * other needs and/or outside of this file. */ + +/* Bitmap directory entry constraints */ +#define BME_MAX_TABLE_SIZE 0x8000000 +#define BME_MAX_PHYS_SIZE 0x20000000 /* restrict BdrvDirtyBitmap size in RAM */ +#define BME_MAX_GRANULARITY_BITS 31 +#define BME_MIN_GRANULARITY_BITS 9 +#define BME_MAX_NAME_SIZE 1023 + +#if BME_MAX_TABLE_SIZE * 8ULL > INT_MAX +#error In the code bitmap table physical size assumed to fit into int +#endif + +/* Bitmap directory entry flags */ +#define BME_RESERVED_FLAGS 0xfffffffcU +#define BME_FLAG_IN_USE (1U << 0) +#define BME_FLAG_AUTO (1U << 1) + +/* bits [1, 8] U [56, 63] are reserved */ +#define BME_TABLE_ENTRY_RESERVED_MASK 0xff000000000001feULL +#define BME_TABLE_ENTRY_OFFSET_MASK 0x00fffffffffffe00ULL +#define BME_TABLE_ENTRY_FLAG_ALL_ONES (1ULL << 0) + +typedef struct QEMU_PACKED Qcow2BitmapDirEntry { + /* header is 8 byte aligned */ + uint64_t bitmap_table_offset; + + uint32_t bitmap_table_size; + uint32_t flags; + + uint8_t type; + uint8_t granularity_bits; + uint16_t name_size; + uint32_t extra_data_size; + /* extra data follows */ + /* name follows */ +} Qcow2BitmapDirEntry; + +typedef struct Qcow2BitmapTable { + uint64_t offset; + uint32_t size; /* number of 64bit entries */ + QSIMPLEQ_ENTRY(Qcow2BitmapTable) entry; +} Qcow2BitmapTable; +typedef QSIMPLEQ_HEAD(Qcow2BitmapTableList, Qcow2BitmapTable) + Qcow2BitmapTableList; + +typedef struct Qcow2Bitmap { + Qcow2BitmapTable table; + uint32_t flags; + uint8_t granularity_bits; + char *name; + + BdrvDirtyBitmap *dirty_bitmap; + + QSIMPLEQ_ENTRY(Qcow2Bitmap) entry; +} Qcow2Bitmap; +typedef QSIMPLEQ_HEAD(Qcow2BitmapList, Qcow2Bitmap) Qcow2BitmapList; + +typedef enum BitmapType { + BT_DIRTY_TRACKING_BITMAP = 1 +} BitmapType; + +static inline bool can_write(BlockDriverState *bs) +{ + return !bdrv_is_read_only(bs) && !(bdrv_get_flags(bs) & BDRV_O_INACTIVE); +} + +static int update_header_sync(BlockDriverState *bs) +{ + int ret; + + ret = qcow2_update_header(bs); + if (ret < 0) { + return ret; + } + + return bdrv_flush(bs); +} + +static inline void bitmap_table_to_be(uint64_t *bitmap_table, size_t size) +{ + size_t i; + + for (i = 0; i < size; ++i) { + cpu_to_be64s(&bitmap_table[i]); + } +} + +static int check_table_entry(uint64_t entry, int cluster_size) +{ + uint64_t offset; + + if (entry & BME_TABLE_ENTRY_RESERVED_MASK) { + return -EINVAL; + } + + offset = entry & BME_TABLE_ENTRY_OFFSET_MASK; + if (offset != 0) { + /* if offset specified, bit 0 is reserved */ + if (entry & BME_TABLE_ENTRY_FLAG_ALL_ONES) { + return -EINVAL; + } + + if (offset % cluster_size != 0) { + return -EINVAL; + } + } + + return 0; +} + +static int check_constraints_on_bitmap(BlockDriverState *bs, + const char *name, + uint32_t granularity, + Error **errp) +{ + BDRVQcow2State *s = bs->opaque; + int granularity_bits = ctz32(granularity); + int64_t len = bdrv_getlength(bs); + + assert(granularity > 0); + assert((granularity & (granularity - 1)) == 0); + + if (len < 0) { + error_setg_errno(errp, -len, "Failed to get size of '%s'", + bdrv_get_device_or_node_name(bs)); + return len; + } + + if (granularity_bits > BME_MAX_GRANULARITY_BITS) { + error_setg(errp, "Granularity exceeds maximum (%llu bytes)", + 1ULL << BME_MAX_GRANULARITY_BITS); + return -EINVAL; + } + if (granularity_bits < BME_MIN_GRANULARITY_BITS) { + error_setg(errp, "Granularity is under minimum (%llu bytes)", + 1ULL << BME_MIN_GRANULARITY_BITS); + return -EINVAL; + } + + if ((len > (uint64_t)BME_MAX_PHYS_SIZE << granularity_bits) || + (len > (uint64_t)BME_MAX_TABLE_SIZE * s->cluster_size << + granularity_bits)) + { + error_setg(errp, "Too much space will be occupied by the bitmap. " + "Use larger granularity"); + return -EINVAL; + } + + if (strlen(name) > BME_MAX_NAME_SIZE) { + error_setg(errp, "Name length exceeds maximum (%u characters)", + BME_MAX_NAME_SIZE); + return -EINVAL; + } + + return 0; +} + +static void clear_bitmap_table(BlockDriverState *bs, uint64_t *bitmap_table, + uint32_t bitmap_table_size) +{ + BDRVQcow2State *s = bs->opaque; + int i; + + for (i = 0; i < bitmap_table_size; ++i) { + uint64_t addr = bitmap_table[i] & BME_TABLE_ENTRY_OFFSET_MASK; + if (!addr) { + continue; + } + + qcow2_free_clusters(bs, addr, s->cluster_size, QCOW2_DISCARD_OTHER); + bitmap_table[i] = 0; + } +} + +static int bitmap_table_load(BlockDriverState *bs, Qcow2BitmapTable *tb, + uint64_t **bitmap_table) +{ + int ret; + BDRVQcow2State *s = bs->opaque; + uint32_t i; + uint64_t *table; + + assert(tb->size != 0); + table = g_try_new(uint64_t, tb->size); + if (table == NULL) { + return -ENOMEM; + } + + assert(tb->size <= BME_MAX_TABLE_SIZE); + ret = bdrv_pread(bs->file, tb->offset, + table, tb->size * sizeof(uint64_t)); + if (ret < 0) { + goto fail; + } + + for (i = 0; i < tb->size; ++i) { + be64_to_cpus(&table[i]); + ret = check_table_entry(table[i], s->cluster_size); + if (ret < 0) { + goto fail; + } + } + + *bitmap_table = table; + return 0; + +fail: + g_free(table); + + return ret; +} + +static int free_bitmap_clusters(BlockDriverState *bs, Qcow2BitmapTable *tb) +{ + int ret; + uint64_t *bitmap_table; + + ret = bitmap_table_load(bs, tb, &bitmap_table); + if (ret < 0) { + assert(bitmap_table == NULL); + return ret; + } + + clear_bitmap_table(bs, bitmap_table, tb->size); + qcow2_free_clusters(bs, tb->offset, tb->size * sizeof(uint64_t), + QCOW2_DISCARD_OTHER); + g_free(bitmap_table); + + tb->offset = 0; + tb->size = 0; + + return 0; +} + +/* This function returns the number of disk sectors covered by a single qcow2 + * cluster of bitmap data. */ +static uint64_t sectors_covered_by_bitmap_cluster(const BDRVQcow2State *s, + const BdrvDirtyBitmap *bitmap) +{ + uint32_t sector_granularity = + bdrv_dirty_bitmap_granularity(bitmap) >> BDRV_SECTOR_BITS; + + return (uint64_t)sector_granularity * (s->cluster_size << 3); +} + +/* load_bitmap_data + * @bitmap_table entries must satisfy specification constraints. + * @bitmap must be cleared */ +static int load_bitmap_data(BlockDriverState *bs, + const uint64_t *bitmap_table, + uint32_t bitmap_table_size, + BdrvDirtyBitmap *bitmap) +{ + int ret = 0; + BDRVQcow2State *s = bs->opaque; + uint64_t sector, sbc; + uint64_t bm_size = bdrv_dirty_bitmap_size(bitmap); + uint8_t *buf = NULL; + uint64_t i, tab_size = + size_to_clusters(s, + bdrv_dirty_bitmap_serialization_size(bitmap, 0, bm_size)); + + if (tab_size != bitmap_table_size || tab_size > BME_MAX_TABLE_SIZE) { + return -EINVAL; + } + + buf = g_malloc(s->cluster_size); + sbc = sectors_covered_by_bitmap_cluster(s, bitmap); + for (i = 0, sector = 0; i < tab_size; ++i, sector += sbc) { + uint64_t count = MIN(bm_size - sector, sbc); + uint64_t entry = bitmap_table[i]; + uint64_t offset = entry & BME_TABLE_ENTRY_OFFSET_MASK; + + assert(check_table_entry(entry, s->cluster_size) == 0); + + if (offset == 0) { + if (entry & BME_TABLE_ENTRY_FLAG_ALL_ONES) { + bdrv_dirty_bitmap_deserialize_ones(bitmap, sector, count, + false); + } else { + /* No need to deserialize zeros because the dirty bitmap is + * already cleared */ + } + } else { + ret = bdrv_pread(bs->file, offset, buf, s->cluster_size); + if (ret < 0) { + goto finish; + } + bdrv_dirty_bitmap_deserialize_part(bitmap, buf, sector, count, + false); + } + } + ret = 0; + + bdrv_dirty_bitmap_deserialize_finish(bitmap); + +finish: + g_free(buf); + + return ret; +} + +static BdrvDirtyBitmap *load_bitmap(BlockDriverState *bs, + Qcow2Bitmap *bm, Error **errp) +{ + int ret; + uint64_t *bitmap_table = NULL; + uint32_t granularity; + BdrvDirtyBitmap *bitmap = NULL; + + if (bm->flags & BME_FLAG_IN_USE) { + error_setg(errp, "Bitmap '%s' is in use", bm->name); + goto fail; + } + + ret = bitmap_table_load(bs, &bm->table, &bitmap_table); + if (ret < 0) { + error_setg_errno(errp, -ret, + "Could not read bitmap_table table from image for " + "bitmap '%s'", bm->name); + goto fail; + } + + granularity = 1U << bm->granularity_bits; + bitmap = bdrv_create_dirty_bitmap(bs, granularity, bm->name, errp); + if (bitmap == NULL) { + goto fail; + } + + ret = load_bitmap_data(bs, bitmap_table, bm->table.size, bitmap); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not read bitmap '%s' from image", + bm->name); + goto fail; + } + + g_free(bitmap_table); + return bitmap; + +fail: + g_free(bitmap_table); + if (bitmap != NULL) { + bdrv_release_dirty_bitmap(bs, bitmap); + } + + return NULL; +} + +/* + * Bitmap List + */ + +/* + * Bitmap List private functions + * Only Bitmap List knows about bitmap directory structure in Qcow2. + */ + +static inline void bitmap_dir_entry_to_cpu(Qcow2BitmapDirEntry *entry) +{ + be64_to_cpus(&entry->bitmap_table_offset); + be32_to_cpus(&entry->bitmap_table_size); + be32_to_cpus(&entry->flags); + be16_to_cpus(&entry->name_size); + be32_to_cpus(&entry->extra_data_size); +} + +static inline void bitmap_dir_entry_to_be(Qcow2BitmapDirEntry *entry) +{ + cpu_to_be64s(&entry->bitmap_table_offset); + cpu_to_be32s(&entry->bitmap_table_size); + cpu_to_be32s(&entry->flags); + cpu_to_be16s(&entry->name_size); + cpu_to_be32s(&entry->extra_data_size); +} + +static inline int calc_dir_entry_size(size_t name_size, size_t extra_data_size) +{ + return align_offset(sizeof(Qcow2BitmapDirEntry) + + name_size + extra_data_size, 8); +} + +static inline int dir_entry_size(Qcow2BitmapDirEntry *entry) +{ + return calc_dir_entry_size(entry->name_size, entry->extra_data_size); +} + +static inline const char *dir_entry_name_field(Qcow2BitmapDirEntry *entry) +{ + return (const char *)(entry + 1) + entry->extra_data_size; +} + +static inline char *dir_entry_copy_name(Qcow2BitmapDirEntry *entry) +{ + const char *name_field = dir_entry_name_field(entry); + return g_strndup(name_field, entry->name_size); +} + +static inline Qcow2BitmapDirEntry *next_dir_entry(Qcow2BitmapDirEntry *entry) +{ + return (Qcow2BitmapDirEntry *)((uint8_t *)entry + dir_entry_size(entry)); +} + +static int check_dir_entry(BlockDriverState *bs, Qcow2BitmapDirEntry *entry) +{ + BDRVQcow2State *s = bs->opaque; + uint64_t phys_bitmap_bytes; + int64_t len; + + bool fail = (entry->bitmap_table_size == 0) || + (entry->bitmap_table_offset == 0) || + (entry->bitmap_table_offset % s->cluster_size) || + (entry->bitmap_table_size > BME_MAX_TABLE_SIZE) || + (entry->granularity_bits > BME_MAX_GRANULARITY_BITS) || + (entry->granularity_bits < BME_MIN_GRANULARITY_BITS) || + (entry->flags & BME_RESERVED_FLAGS) || + (entry->name_size > BME_MAX_NAME_SIZE) || + (entry->type != BT_DIRTY_TRACKING_BITMAP); + + if (fail) { + return -EINVAL; + } + + phys_bitmap_bytes = (uint64_t)entry->bitmap_table_size * s->cluster_size; + len = bdrv_getlength(bs); + + if (len < 0) { + return len; + } + + fail = (phys_bitmap_bytes > BME_MAX_PHYS_SIZE) || + (len > ((phys_bitmap_bytes * 8) << entry->granularity_bits)); + + return fail ? -EINVAL : 0; +} + +static inline void bitmap_directory_to_be(uint8_t *dir, size_t size) +{ + uint8_t *end = dir + size; + while (dir < end) { + Qcow2BitmapDirEntry *e = (Qcow2BitmapDirEntry *)dir; + dir += dir_entry_size(e); + + bitmap_dir_entry_to_be(e); + } +} + +/* + * Bitmap List public functions + */ + +static void bitmap_free(Qcow2Bitmap *bm) +{ + g_free(bm->name); + g_free(bm); +} + +static void bitmap_list_free(Qcow2BitmapList *bm_list) +{ + Qcow2Bitmap *bm; + + if (bm_list == NULL) { + return; + } + + while ((bm = QSIMPLEQ_FIRST(bm_list)) != NULL) { + QSIMPLEQ_REMOVE_HEAD(bm_list, entry); + bitmap_free(bm); + } + + g_free(bm_list); +} + +static Qcow2BitmapList *bitmap_list_new(void) +{ + Qcow2BitmapList *bm_list = g_new(Qcow2BitmapList, 1); + QSIMPLEQ_INIT(bm_list); + + return bm_list; +} + +static uint32_t bitmap_list_count(Qcow2BitmapList *bm_list) +{ + Qcow2Bitmap *bm; + uint32_t nb_bitmaps = 0; + + QSIMPLEQ_FOREACH(bm, bm_list, entry) { + nb_bitmaps++; + } + + return nb_bitmaps; +} + +/* bitmap_list_load + * Get bitmap list from qcow2 image. Actually reads bitmap directory, + * checks it and convert to bitmap list. + */ +static Qcow2BitmapList *bitmap_list_load(BlockDriverState *bs, uint64_t offset, + uint64_t size, Error **errp) +{ + int ret; + BDRVQcow2State *s = bs->opaque; + uint8_t *dir, *dir_end; + Qcow2BitmapDirEntry *e; + uint32_t nb_dir_entries = 0; + Qcow2BitmapList *bm_list = NULL; + + if (size == 0) { + error_setg(errp, "Requested bitmap directory size is zero"); + return NULL; + } + + if (size > QCOW2_MAX_BITMAP_DIRECTORY_SIZE) { + error_setg(errp, "Requested bitmap directory size is too big"); + return NULL; + } + + dir = g_try_malloc(size); + if (dir == NULL) { + error_setg(errp, "Failed to allocate space for bitmap directory"); + return NULL; + } + dir_end = dir + size; + + ret = bdrv_pread(bs->file, offset, dir, size); + if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to read bitmap directory"); + goto fail; + } + + bm_list = bitmap_list_new(); + for (e = (Qcow2BitmapDirEntry *)dir; + e < (Qcow2BitmapDirEntry *)dir_end; + e = next_dir_entry(e)) + { + Qcow2Bitmap *bm; + + if ((uint8_t *)(e + 1) > dir_end) { + goto broken_dir; + } + + if (++nb_dir_entries > s->nb_bitmaps) { + error_setg(errp, "More bitmaps found than specified in header" + " extension"); + goto fail; + } + bitmap_dir_entry_to_cpu(e); + + if ((uint8_t *)next_dir_entry(e) > dir_end) { + goto broken_dir; + } + + if (e->extra_data_size != 0) { + error_setg(errp, "Bitmap extra data is not supported"); + goto fail; + } + + ret = check_dir_entry(bs, e); + if (ret < 0) { + error_setg(errp, "Bitmap '%.*s' doesn't satisfy the constraints", + e->name_size, dir_entry_name_field(e)); + goto fail; + } + + bm = g_new(Qcow2Bitmap, 1); + bm->table.offset = e->bitmap_table_offset; + bm->table.size = e->bitmap_table_size; + bm->flags = e->flags; + bm->granularity_bits = e->granularity_bits; + bm->name = dir_entry_copy_name(e); + QSIMPLEQ_INSERT_TAIL(bm_list, bm, entry); + } + + if (nb_dir_entries != s->nb_bitmaps) { + error_setg(errp, "Less bitmaps found than specified in header" + " extension"); + goto fail; + } + + if ((uint8_t *)e != dir_end) { + goto broken_dir; + } + + g_free(dir); + return bm_list; + +broken_dir: + ret = -EINVAL; + error_setg(errp, "Broken bitmap directory"); + +fail: + g_free(dir); + bitmap_list_free(bm_list); + + return NULL; +} + +int qcow2_check_bitmaps_refcounts(BlockDriverState *bs, BdrvCheckResult *res, + void **refcount_table, + int64_t *refcount_table_size) +{ + int ret; + BDRVQcow2State *s = bs->opaque; + Qcow2BitmapList *bm_list; + Qcow2Bitmap *bm; + + if (s->nb_bitmaps == 0) { + return 0; + } + + ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, refcount_table_size, + s->bitmap_directory_offset, + s->bitmap_directory_size); + if (ret < 0) { + return ret; + } + + bm_list = bitmap_list_load(bs, s->bitmap_directory_offset, + s->bitmap_directory_size, NULL); + if (bm_list == NULL) { + res->corruptions++; + return -EINVAL; + } + + QSIMPLEQ_FOREACH(bm, bm_list, entry) { + uint64_t *bitmap_table = NULL; + int i; + + ret = qcow2_inc_refcounts_imrt(bs, res, + refcount_table, refcount_table_size, + bm->table.offset, + bm->table.size * sizeof(uint64_t)); + if (ret < 0) { + goto out; + } + + ret = bitmap_table_load(bs, &bm->table, &bitmap_table); + if (ret < 0) { + res->corruptions++; + goto out; + } + + for (i = 0; i < bm->table.size; ++i) { + uint64_t entry = bitmap_table[i]; + uint64_t offset = entry & BME_TABLE_ENTRY_OFFSET_MASK; + + if (check_table_entry(entry, s->cluster_size) < 0) { + res->corruptions++; + continue; + } + + if (offset == 0) { + continue; + } + + ret = qcow2_inc_refcounts_imrt(bs, res, + refcount_table, refcount_table_size, + offset, s->cluster_size); + if (ret < 0) { + g_free(bitmap_table); + goto out; + } + } + + g_free(bitmap_table); + } + +out: + bitmap_list_free(bm_list); + + return ret; +} + +/* bitmap_list_store + * Store bitmap list to qcow2 image as a bitmap directory. + * Everything is checked. + */ +static int bitmap_list_store(BlockDriverState *bs, Qcow2BitmapList *bm_list, + uint64_t *offset, uint64_t *size, bool in_place) +{ + int ret; + uint8_t *dir; + int64_t dir_offset = 0; + uint64_t dir_size = 0; + Qcow2Bitmap *bm; + Qcow2BitmapDirEntry *e; + + QSIMPLEQ_FOREACH(bm, bm_list, entry) { + dir_size += calc_dir_entry_size(strlen(bm->name), 0); + } + + if (dir_size == 0 || dir_size > QCOW2_MAX_BITMAP_DIRECTORY_SIZE) { + return -EINVAL; + } + + if (in_place) { + if (*size != dir_size || *offset == 0) { + return -EINVAL; + } + + dir_offset = *offset; + } + + dir = g_try_malloc(dir_size); + if (dir == NULL) { + return -ENOMEM; + } + + e = (Qcow2BitmapDirEntry *)dir; + QSIMPLEQ_FOREACH(bm, bm_list, entry) { + e->bitmap_table_offset = bm->table.offset; + e->bitmap_table_size = bm->table.size; + e->flags = bm->flags; + e->type = BT_DIRTY_TRACKING_BITMAP; + e->granularity_bits = bm->granularity_bits; + e->name_size = strlen(bm->name); + e->extra_data_size = 0; + memcpy(e + 1, bm->name, e->name_size); + + if (check_dir_entry(bs, e) < 0) { + ret = -EINVAL; + goto fail; + } + + e = next_dir_entry(e); + } + + bitmap_directory_to_be(dir, dir_size); + + if (!in_place) { + dir_offset = qcow2_alloc_clusters(bs, dir_size); + if (dir_offset < 0) { + ret = dir_offset; + goto fail; + } + } + + ret = qcow2_pre_write_overlap_check(bs, 0, dir_offset, dir_size); + if (ret < 0) { + goto fail; + } + + ret = bdrv_pwrite(bs->file, dir_offset, dir, dir_size); + if (ret < 0) { + goto fail; + } + + g_free(dir); + + if (!in_place) { + *size = dir_size; + *offset = dir_offset; + } + + return 0; + +fail: + g_free(dir); + + if (!in_place && dir_offset > 0) { + qcow2_free_clusters(bs, dir_offset, dir_size, QCOW2_DISCARD_OTHER); + } + + return ret; +} + +/* + * Bitmap List end + */ + +static int update_ext_header_and_dir_in_place(BlockDriverState *bs, + Qcow2BitmapList *bm_list) +{ + BDRVQcow2State *s = bs->opaque; + int ret; + + if (!(s->autoclear_features & QCOW2_AUTOCLEAR_BITMAPS) || + bm_list == NULL || QSIMPLEQ_EMPTY(bm_list) || + bitmap_list_count(bm_list) != s->nb_bitmaps) + { + return -EINVAL; + } + + s->autoclear_features &= ~(uint64_t)QCOW2_AUTOCLEAR_BITMAPS; + ret = update_header_sync(bs); + if (ret < 0) { + /* Two variants are possible here: + * 1. Autoclear flag is dropped, all bitmaps will be lost. + * 2. Autoclear flag is not dropped, old state is left. + */ + return ret; + } + + /* autoclear bit is not set, so we can safely update bitmap directory */ + + ret = bitmap_list_store(bs, bm_list, &s->bitmap_directory_offset, + &s->bitmap_directory_size, true); + if (ret < 0) { + /* autoclear bit is cleared, so all leaked clusters would be removed on + * qemu-img check */ + return ret; + } + + ret = update_header_sync(bs); + if (ret < 0) { + /* autoclear bit is cleared, so all leaked clusters would be removed on + * qemu-img check */ + return ret; + } + + s->autoclear_features |= QCOW2_AUTOCLEAR_BITMAPS; + return update_header_sync(bs); + /* If final update_header_sync() fails, two variants are possible: + * 1. Autoclear flag is not set, all bitmaps will be lost. + * 2. Autoclear flag is set, header and directory are successfully updated. + */ +} + +static int update_ext_header_and_dir(BlockDriverState *bs, + Qcow2BitmapList *bm_list) +{ + BDRVQcow2State *s = bs->opaque; + int ret; + uint64_t new_offset = 0; + uint64_t new_size = 0; + uint32_t new_nb_bitmaps = 0; + uint64_t old_offset = s->bitmap_directory_offset; + uint64_t old_size = s->bitmap_directory_size; + uint32_t old_nb_bitmaps = s->nb_bitmaps; + uint64_t old_autocl = s->autoclear_features; + + if (bm_list != NULL && !QSIMPLEQ_EMPTY(bm_list)) { + new_nb_bitmaps = bitmap_list_count(bm_list); + + if (new_nb_bitmaps > QCOW2_MAX_BITMAPS) { + return -EINVAL; + } + + ret = bitmap_list_store(bs, bm_list, &new_offset, &new_size, false); + if (ret < 0) { + return ret; + } + + ret = bdrv_flush(bs->file->bs); + if (ret < 0) { + goto fail; + } + + s->autoclear_features |= QCOW2_AUTOCLEAR_BITMAPS; + } else { + s->autoclear_features &= ~(uint64_t)QCOW2_AUTOCLEAR_BITMAPS; + } + + s->bitmap_directory_offset = new_offset; + s->bitmap_directory_size = new_size; + s->nb_bitmaps = new_nb_bitmaps; + + ret = update_header_sync(bs); + if (ret < 0) { + goto fail; + } + + if (old_size > 0) { + qcow2_free_clusters(bs, old_offset, old_size, QCOW2_DISCARD_OTHER); + } + + return 0; + +fail: + if (new_offset > 0) { + qcow2_free_clusters(bs, new_offset, new_size, QCOW2_DISCARD_OTHER); + } + + s->bitmap_directory_offset = old_offset; + s->bitmap_directory_size = old_size; + s->nb_bitmaps = old_nb_bitmaps; + s->autoclear_features = old_autocl; + + return ret; +} + +/* for g_slist_foreach for GSList of BdrvDirtyBitmap* elements */ +static void release_dirty_bitmap_helper(gpointer bitmap, + gpointer bs) +{ + bdrv_release_dirty_bitmap(bs, bitmap); +} + +/* for g_slist_foreach for GSList of BdrvDirtyBitmap* elements */ +static void set_readonly_helper(gpointer bitmap, gpointer value) +{ + bdrv_dirty_bitmap_set_readonly(bitmap, (bool)value); +} + +/* qcow2_load_autoloading_dirty_bitmaps() + * Return value is a hint for caller: true means that the Qcow2 header was + * updated. (false doesn't mean that the header should be updated by the + * caller, it just means that updating was not needed or the image cannot be + * written to). + * On failure the function returns false. + */ +bool qcow2_load_autoloading_dirty_bitmaps(BlockDriverState *bs, Error **errp) +{ + BDRVQcow2State *s = bs->opaque; + Qcow2BitmapList *bm_list; + Qcow2Bitmap *bm; + GSList *created_dirty_bitmaps = NULL; + bool header_updated = false; + + if (s->nb_bitmaps == 0) { + /* No bitmaps - nothing to do */ + return false; + } + + bm_list = bitmap_list_load(bs, s->bitmap_directory_offset, + s->bitmap_directory_size, errp); + if (bm_list == NULL) { + return false; + } + + QSIMPLEQ_FOREACH(bm, bm_list, entry) { + if ((bm->flags & BME_FLAG_AUTO) && !(bm->flags & BME_FLAG_IN_USE)) { + BdrvDirtyBitmap *bitmap = load_bitmap(bs, bm, errp); + if (bitmap == NULL) { + goto fail; + } + + bdrv_dirty_bitmap_set_persistance(bitmap, true); + bdrv_dirty_bitmap_set_autoload(bitmap, true); + bm->flags |= BME_FLAG_IN_USE; + created_dirty_bitmaps = + g_slist_append(created_dirty_bitmaps, bitmap); + } + } + + if (created_dirty_bitmaps != NULL) { + if (can_write(bs)) { + /* in_use flags must be updated */ + int ret = update_ext_header_and_dir_in_place(bs, bm_list); + if (ret < 0) { + error_setg_errno(errp, -ret, "Can't update bitmap directory"); + goto fail; + } + header_updated = true; + } else { + g_slist_foreach(created_dirty_bitmaps, set_readonly_helper, + (gpointer)true); + } + } + + g_slist_free(created_dirty_bitmaps); + bitmap_list_free(bm_list); + + return header_updated; + +fail: + g_slist_foreach(created_dirty_bitmaps, release_dirty_bitmap_helper, bs); + g_slist_free(created_dirty_bitmaps); + bitmap_list_free(bm_list); + + return false; +} + +int qcow2_reopen_bitmaps_rw(BlockDriverState *bs, Error **errp) +{ + BDRVQcow2State *s = bs->opaque; + Qcow2BitmapList *bm_list; + Qcow2Bitmap *bm; + GSList *ro_dirty_bitmaps = NULL; + int ret = 0; + + if (s->nb_bitmaps == 0) { + /* No bitmaps - nothing to do */ + return 0; + } + + if (!can_write(bs)) { + error_setg(errp, "Can't write to the image on reopening bitmaps rw"); + return -EINVAL; + } + + bm_list = bitmap_list_load(bs, s->bitmap_directory_offset, + s->bitmap_directory_size, errp); + if (bm_list == NULL) { + return -EINVAL; + } + + QSIMPLEQ_FOREACH(bm, bm_list, entry) { + if (!(bm->flags & BME_FLAG_IN_USE)) { + BdrvDirtyBitmap *bitmap = bdrv_find_dirty_bitmap(bs, bm->name); + if (bitmap == NULL) { + continue; + } + + if (!bdrv_dirty_bitmap_readonly(bitmap)) { + error_setg(errp, "Bitmap %s is not readonly but not marked" + "'IN_USE' in the image. Something went wrong," + "all the bitmaps may be corrupted", bm->name); + ret = -EINVAL; + goto out; + } + + bm->flags |= BME_FLAG_IN_USE; + ro_dirty_bitmaps = g_slist_append(ro_dirty_bitmaps, bitmap); + } + } + + if (ro_dirty_bitmaps != NULL) { + /* in_use flags must be updated */ + ret = update_ext_header_and_dir_in_place(bs, bm_list); + if (ret < 0) { + error_setg_errno(errp, -ret, "Can't update bitmap directory"); + goto out; + } + g_slist_foreach(ro_dirty_bitmaps, set_readonly_helper, false); + } + +out: + g_slist_free(ro_dirty_bitmaps); + bitmap_list_free(bm_list); + + return ret; +} + +/* store_bitmap_data() + * Store bitmap to image, filling bitmap table accordingly. + */ +static uint64_t *store_bitmap_data(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, + uint32_t *bitmap_table_size, Error **errp) +{ + int ret; + BDRVQcow2State *s = bs->opaque; + int64_t sector; + uint64_t sbc; + uint64_t bm_size = bdrv_dirty_bitmap_size(bitmap); + const char *bm_name = bdrv_dirty_bitmap_name(bitmap); + uint8_t *buf = NULL; + BdrvDirtyBitmapIter *dbi; + uint64_t *tb; + uint64_t tb_size = + size_to_clusters(s, + bdrv_dirty_bitmap_serialization_size(bitmap, 0, bm_size)); + + if (tb_size > BME_MAX_TABLE_SIZE || + tb_size * s->cluster_size > BME_MAX_PHYS_SIZE) + { + error_setg(errp, "Bitmap '%s' is too big", bm_name); + return NULL; + } + + tb = g_try_new0(uint64_t, tb_size); + if (tb == NULL) { + error_setg(errp, "No memory"); + return NULL; + } + + dbi = bdrv_dirty_iter_new(bitmap, 0); + buf = g_malloc(s->cluster_size); + sbc = sectors_covered_by_bitmap_cluster(s, bitmap); + assert(DIV_ROUND_UP(bm_size, sbc) == tb_size); + + while ((sector = bdrv_dirty_iter_next(dbi)) != -1) { + uint64_t cluster = sector / sbc; + uint64_t end, write_size; + int64_t off; + + sector = cluster * sbc; + end = MIN(bm_size, sector + sbc); + write_size = + bdrv_dirty_bitmap_serialization_size(bitmap, sector, end - sector); + assert(write_size <= s->cluster_size); + + off = qcow2_alloc_clusters(bs, s->cluster_size); + if (off < 0) { + error_setg_errno(errp, -off, + "Failed to allocate clusters for bitmap '%s'", + bm_name); + goto fail; + } + tb[cluster] = off; + + bdrv_dirty_bitmap_serialize_part(bitmap, buf, sector, end - sector); + if (write_size < s->cluster_size) { + memset(buf + write_size, 0, s->cluster_size - write_size); + } + + ret = qcow2_pre_write_overlap_check(bs, 0, off, s->cluster_size); + if (ret < 0) { + error_setg_errno(errp, -ret, "Qcow2 overlap check failed"); + goto fail; + } + + ret = bdrv_pwrite(bs->file, off, buf, s->cluster_size); + if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to write bitmap '%s' to file", + bm_name); + goto fail; + } + + if (end >= bm_size) { + break; + } + + bdrv_set_dirty_iter(dbi, end); + } + + *bitmap_table_size = tb_size; + g_free(buf); + bdrv_dirty_iter_free(dbi); + + return tb; + +fail: + clear_bitmap_table(bs, tb, tb_size); + g_free(buf); + bdrv_dirty_iter_free(dbi); + g_free(tb); + + return NULL; +} + +/* store_bitmap() + * Store bm->dirty_bitmap to qcow2. + * Set bm->table_offset and bm->table_size accordingly. + */ +static int store_bitmap(BlockDriverState *bs, Qcow2Bitmap *bm, Error **errp) +{ + int ret; + uint64_t *tb; + int64_t tb_offset; + uint32_t tb_size; + BdrvDirtyBitmap *bitmap = bm->dirty_bitmap; + const char *bm_name; + + assert(bitmap != NULL); + + bm_name = bdrv_dirty_bitmap_name(bitmap); + + tb = store_bitmap_data(bs, bitmap, &tb_size, errp); + if (tb == NULL) { + return -EINVAL; + } + + assert(tb_size <= BME_MAX_TABLE_SIZE); + tb_offset = qcow2_alloc_clusters(bs, tb_size * sizeof(tb[0])); + if (tb_offset < 0) { + error_setg_errno(errp, -tb_offset, + "Failed to allocate clusters for bitmap '%s'", + bm_name); + ret = tb_offset; + goto fail; + } + + ret = qcow2_pre_write_overlap_check(bs, 0, tb_offset, + tb_size * sizeof(tb[0])); + if (ret < 0) { + error_setg_errno(errp, -ret, "Qcow2 overlap check failed"); + goto fail; + } + + bitmap_table_to_be(tb, tb_size); + ret = bdrv_pwrite(bs->file, tb_offset, tb, tb_size * sizeof(tb[0])); + if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to write bitmap '%s' to file", + bm_name); + goto fail; + } + + g_free(tb); + + bm->table.offset = tb_offset; + bm->table.size = tb_size; + + return 0; + +fail: + clear_bitmap_table(bs, tb, tb_size); + + if (tb_offset > 0) { + qcow2_free_clusters(bs, tb_offset, tb_size * sizeof(tb[0]), + QCOW2_DISCARD_OTHER); + } + + g_free(tb); + + return ret; +} + +static Qcow2Bitmap *find_bitmap_by_name(Qcow2BitmapList *bm_list, + const char *name) +{ + Qcow2Bitmap *bm; + + QSIMPLEQ_FOREACH(bm, bm_list, entry) { + if (strcmp(name, bm->name) == 0) { + return bm; + } + } + + return NULL; +} + +void qcow2_remove_persistent_dirty_bitmap(BlockDriverState *bs, + const char *name, + Error **errp) +{ + int ret; + BDRVQcow2State *s = bs->opaque; + Qcow2Bitmap *bm; + Qcow2BitmapList *bm_list; + + if (s->nb_bitmaps == 0) { + /* Absence of the bitmap is not an error: see explanation above + * bdrv_remove_persistent_dirty_bitmap() definition. */ + return; + } + + bm_list = bitmap_list_load(bs, s->bitmap_directory_offset, + s->bitmap_directory_size, errp); + if (bm_list == NULL) { + return; + } + + bm = find_bitmap_by_name(bm_list, name); + if (bm == NULL) { + goto fail; + } + + QSIMPLEQ_REMOVE(bm_list, bm, Qcow2Bitmap, entry); + + ret = update_ext_header_and_dir(bs, bm_list); + if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to update bitmap extension"); + goto fail; + } + + free_bitmap_clusters(bs, &bm->table); + +fail: + bitmap_free(bm); + bitmap_list_free(bm_list); +} + +void qcow2_store_persistent_dirty_bitmaps(BlockDriverState *bs, Error **errp) +{ + BdrvDirtyBitmap *bitmap; + BDRVQcow2State *s = bs->opaque; + uint32_t new_nb_bitmaps = s->nb_bitmaps; + uint64_t new_dir_size = s->bitmap_directory_size; + int ret; + Qcow2BitmapList *bm_list; + Qcow2Bitmap *bm; + Qcow2BitmapTableList drop_tables; + Qcow2BitmapTable *tb, *tb_next; + + if (!bdrv_has_changed_persistent_bitmaps(bs)) { + /* nothing to do */ + return; + } + + if (!can_write(bs)) { + error_setg(errp, "No write access"); + return; + } + + QSIMPLEQ_INIT(&drop_tables); + + if (s->nb_bitmaps == 0) { + bm_list = bitmap_list_new(); + } else { + bm_list = bitmap_list_load(bs, s->bitmap_directory_offset, + s->bitmap_directory_size, errp); + if (bm_list == NULL) { + return; + } + } + + /* check constraints and names */ + for (bitmap = bdrv_dirty_bitmap_next(bs, NULL); bitmap != NULL; + bitmap = bdrv_dirty_bitmap_next(bs, bitmap)) + { + const char *name = bdrv_dirty_bitmap_name(bitmap); + uint32_t granularity = bdrv_dirty_bitmap_granularity(bitmap); + Qcow2Bitmap *bm; + + if (!bdrv_dirty_bitmap_get_persistance(bitmap) || + bdrv_dirty_bitmap_readonly(bitmap)) + { + continue; + } + + if (check_constraints_on_bitmap(bs, name, granularity, errp) < 0) { + error_prepend(errp, "Bitmap '%s' doesn't satisfy the constraints: ", + name); + goto fail; + } + + bm = find_bitmap_by_name(bm_list, name); + if (bm == NULL) { + if (++new_nb_bitmaps > QCOW2_MAX_BITMAPS) { + error_setg(errp, "Too many persistent bitmaps"); + goto fail; + } + + new_dir_size += calc_dir_entry_size(strlen(name), 0); + if (new_dir_size > QCOW2_MAX_BITMAP_DIRECTORY_SIZE) { + error_setg(errp, "Bitmap directory is too large"); + goto fail; + } + + bm = g_new0(Qcow2Bitmap, 1); + bm->name = g_strdup(name); + QSIMPLEQ_INSERT_TAIL(bm_list, bm, entry); + } else { + if (!(bm->flags & BME_FLAG_IN_USE)) { + error_setg(errp, "Bitmap '%s' already exists in the image", + name); + goto fail; + } + tb = g_memdup(&bm->table, sizeof(bm->table)); + bm->table.offset = 0; + bm->table.size = 0; + QSIMPLEQ_INSERT_TAIL(&drop_tables, tb, entry); + } + bm->flags = bdrv_dirty_bitmap_get_autoload(bitmap) ? BME_FLAG_AUTO : 0; + bm->granularity_bits = ctz32(bdrv_dirty_bitmap_granularity(bitmap)); + bm->dirty_bitmap = bitmap; + } + + /* allocate clusters and store bitmaps */ + QSIMPLEQ_FOREACH(bm, bm_list, entry) { + if (bm->dirty_bitmap == NULL) { + continue; + } + + ret = store_bitmap(bs, bm, errp); + if (ret < 0) { + goto fail; + } + } + + ret = update_ext_header_and_dir(bs, bm_list); + if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to update bitmap extension"); + goto fail; + } + + /* Bitmap directory was successfully updated, so, old data can be dropped. + * TODO it is better to reuse these clusters */ + QSIMPLEQ_FOREACH_SAFE(tb, &drop_tables, entry, tb_next) { + free_bitmap_clusters(bs, tb); + g_free(tb); + } + + bitmap_list_free(bm_list); + return; + +fail: + QSIMPLEQ_FOREACH(bm, bm_list, entry) { + if (bm->dirty_bitmap == NULL || bm->table.offset == 0) { + continue; + } + + free_bitmap_clusters(bs, &bm->table); + } + + QSIMPLEQ_FOREACH_SAFE(tb, &drop_tables, entry, tb_next) { + g_free(tb); + } + + bitmap_list_free(bm_list); +} + +int qcow2_reopen_bitmaps_ro(BlockDriverState *bs, Error **errp) +{ + BdrvDirtyBitmap *bitmap; + Error *local_err = NULL; + + qcow2_store_persistent_dirty_bitmaps(bs, &local_err); + if (local_err != NULL) { + error_propagate(errp, local_err); + return -EINVAL; + } + + for (bitmap = bdrv_dirty_bitmap_next(bs, NULL); bitmap != NULL; + bitmap = bdrv_dirty_bitmap_next(bs, bitmap)) + { + if (bdrv_dirty_bitmap_get_persistance(bitmap)) { + bdrv_dirty_bitmap_set_readonly(bitmap, true); + } + } + + return 0; +} + +bool qcow2_can_store_new_dirty_bitmap(BlockDriverState *bs, + const char *name, + uint32_t granularity, + Error **errp) +{ + BDRVQcow2State *s = bs->opaque; + bool found; + Qcow2BitmapList *bm_list; + + if (check_constraints_on_bitmap(bs, name, granularity, errp) != 0) { + goto fail; + } + + if (s->nb_bitmaps == 0) { + return true; + } + + if (s->nb_bitmaps >= QCOW2_MAX_BITMAPS) { + error_setg(errp, + "Maximum number of persistent bitmaps is already reached"); + goto fail; + } + + if (s->bitmap_directory_size + calc_dir_entry_size(strlen(name), 0) > + QCOW2_MAX_BITMAP_DIRECTORY_SIZE) + { + error_setg(errp, "Not enough space in the bitmap directory"); + goto fail; + } + + bm_list = bitmap_list_load(bs, s->bitmap_directory_offset, + s->bitmap_directory_size, errp); + if (bm_list == NULL) { + goto fail; + } + + found = find_bitmap_by_name(bm_list, name); + bitmap_list_free(bm_list); + if (found) { + error_setg(errp, "Bitmap with the same name is already stored"); + goto fail; + } + + return true; + +fail: + error_prepend(errp, "Can't make bitmap '%s' persistent in '%s': ", + name, bdrv_get_device_or_node_name(bs)); + return false; +} diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 3d341fd9cb..f06c08f64c 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -357,52 +357,6 @@ static int count_contiguous_clusters_unallocated(int nb_clusters, return i; } -/* The crypt function is compatible with the linux cryptoloop - algorithm for < 4 GB images. NOTE: out_buf == in_buf is - supported */ -int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num, - uint8_t *out_buf, const uint8_t *in_buf, - int nb_sectors, bool enc, - Error **errp) -{ - union { - uint64_t ll[2]; - uint8_t b[16]; - } ivec; - int i; - int ret; - - for(i = 0; i < nb_sectors; i++) { - ivec.ll[0] = cpu_to_le64(sector_num); - ivec.ll[1] = 0; - if (qcrypto_cipher_setiv(s->cipher, - ivec.b, G_N_ELEMENTS(ivec.b), - errp) < 0) { - return -1; - } - if (enc) { - ret = qcrypto_cipher_encrypt(s->cipher, - in_buf, - out_buf, - 512, - errp); - } else { - ret = qcrypto_cipher_decrypt(s->cipher, - in_buf, - out_buf, - 512, - errp); - } - if (ret < 0) { - return -1; - } - sector_num++; - in_buf += 512; - out_buf += 512; - } - return 0; -} - static int coroutine_fn do_perform_cow_read(BlockDriverState *bs, uint64_t src_cluster_offset, unsigned offset_in_cluster, @@ -435,19 +389,22 @@ static int coroutine_fn do_perform_cow_read(BlockDriverState *bs, static bool coroutine_fn do_perform_cow_encrypt(BlockDriverState *bs, uint64_t src_cluster_offset, + uint64_t cluster_offset, unsigned offset_in_cluster, uint8_t *buffer, unsigned bytes) { if (bytes && bs->encrypted) { BDRVQcow2State *s = bs->opaque; - int64_t sector = (src_cluster_offset + offset_in_cluster) + int64_t sector = (s->crypt_physical_offset ? + (cluster_offset + offset_in_cluster) : + (src_cluster_offset + offset_in_cluster)) >> BDRV_SECTOR_BITS; - assert(s->cipher); assert((offset_in_cluster & ~BDRV_SECTOR_MASK) == 0); assert((bytes & ~BDRV_SECTOR_MASK) == 0); - if (qcow2_encrypt_sectors(s, sector, buffer, buffer, - bytes >> BDRV_SECTOR_BITS, true, NULL) < 0) { + assert(s->crypto); + if (qcrypto_block_encrypt(s->crypto, sector, buffer, + bytes, NULL) < 0) { return false; } } @@ -834,10 +791,11 @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m) /* Encrypt the data if necessary before writing it */ if (bs->encrypted) { - if (!do_perform_cow_encrypt(bs, m->offset, start->offset, - start_buffer, start->nb_bytes) || - !do_perform_cow_encrypt(bs, m->offset, end->offset, - end_buffer, end->nb_bytes)) { + if (!do_perform_cow_encrypt(bs, m->offset, m->alloc_offset, + start->offset, start_buffer, + start->nb_bytes) || + !do_perform_cow_encrypt(bs, m->offset, m->alloc_offset, + end->offset, end_buffer, end->nb_bytes)) { ret = -EIO; goto fail; } diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 7c06061aae..c9b0dcb4f3 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -281,25 +281,6 @@ int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index, return 0; } -/* - * Rounds the refcount table size up to avoid growing the table for each single - * refcount block that is allocated. - */ -static unsigned int next_refcount_table_size(BDRVQcow2State *s, - unsigned int min_size) -{ - unsigned int min_clusters = (min_size >> (s->cluster_bits - 3)) + 1; - unsigned int refcount_table_clusters = - MAX(1, s->refcount_table_size >> (s->cluster_bits - 3)); - - while (min_clusters > refcount_table_clusters) { - refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2; - } - - return refcount_table_clusters << (s->cluster_bits - 3); -} - - /* Checks if two offsets are described by the same refcount block */ static int in_same_refcount_block(BDRVQcow2State *s, uint64_t offset_a, uint64_t offset_b) @@ -321,7 +302,7 @@ static int alloc_refcount_block(BlockDriverState *bs, { BDRVQcow2State *s = bs->opaque; unsigned int refcount_table_index; - int ret; + int64_t ret; BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC); @@ -396,7 +377,7 @@ static int alloc_refcount_block(BlockDriverState *bs, ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block, refcount_block); if (ret < 0) { - goto fail_block; + goto fail; } memset(*refcount_block, 0, s->cluster_size); @@ -411,12 +392,12 @@ static int alloc_refcount_block(BlockDriverState *bs, ret = update_refcount(bs, new_block, s->cluster_size, 1, false, QCOW2_DISCARD_NEVER); if (ret < 0) { - goto fail_block; + goto fail; } ret = qcow2_cache_flush(bs, s->refcount_block_cache); if (ret < 0) { - goto fail_block; + goto fail; } /* Initialize the new refcount block only after updating its refcount, @@ -424,7 +405,7 @@ static int alloc_refcount_block(BlockDriverState *bs, ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block, refcount_block); if (ret < 0) { - goto fail_block; + goto fail; } memset(*refcount_block, 0, s->cluster_size); @@ -435,7 +416,7 @@ static int alloc_refcount_block(BlockDriverState *bs, qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache, *refcount_block); ret = qcow2_cache_flush(bs, s->refcount_block_cache); if (ret < 0) { - goto fail_block; + goto fail; } /* If the refcount table is big enough, just hook the block up there */ @@ -446,7 +427,7 @@ static int alloc_refcount_block(BlockDriverState *bs, s->refcount_table_offset + refcount_table_index * sizeof(uint64_t), &data64, sizeof(data64)); if (ret < 0) { - goto fail_block; + goto fail; } s->refcount_table[refcount_table_index] = new_block; @@ -490,74 +471,201 @@ static int alloc_refcount_block(BlockDriverState *bs, (new_block >> s->cluster_bits) + 1), s->refcount_block_size); - if (blocks_used > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) { - return -EFBIG; + /* Create the new refcount table and blocks */ + uint64_t meta_offset = (blocks_used * s->refcount_block_size) * + s->cluster_size; + + ret = qcow2_refcount_area(bs, meta_offset, 0, false, + refcount_table_index, new_block); + if (ret < 0) { + return ret; } - /* And now we need at least one block more for the new metadata */ - uint64_t table_size = next_refcount_table_size(s, blocks_used + 1); - uint64_t last_table_size; - uint64_t blocks_clusters; - do { - uint64_t table_clusters = - size_to_clusters(s, table_size * sizeof(uint64_t)); - blocks_clusters = 1 + - DIV_ROUND_UP(table_clusters, s->refcount_block_size); - uint64_t meta_clusters = table_clusters + blocks_clusters; + ret = load_refcount_block(bs, new_block, refcount_block); + if (ret < 0) { + return ret; + } - last_table_size = table_size; - table_size = next_refcount_table_size(s, blocks_used + - DIV_ROUND_UP(meta_clusters, s->refcount_block_size)); + /* If we were trying to do the initial refcount update for some cluster + * allocation, we might have used the same clusters to store newly + * allocated metadata. Make the caller search some new space. */ + return -EAGAIN; - } while (last_table_size != table_size); +fail: + if (*refcount_block != NULL) { + qcow2_cache_put(bs, s->refcount_block_cache, refcount_block); + } + return ret; +} -#ifdef DEBUG_ALLOC2 - fprintf(stderr, "qcow2: Grow refcount table %" PRId32 " => %" PRId64 "\n", - s->refcount_table_size, table_size); -#endif +/* + * Starting at @start_offset, this function creates new self-covering refcount + * structures: A new refcount table and refcount blocks which cover all of + * themselves, and a number of @additional_clusters beyond their end. + * @start_offset must be at the end of the image file, that is, there must be + * only empty space beyond it. + * If @exact_size is false, the refcount table will have 50 % more entries than + * necessary so it will not need to grow again soon. + * If @new_refblock_offset is not zero, it contains the offset of a refcount + * block that should be entered into the new refcount table at index + * @new_refblock_index. + * + * Returns: The offset after the new refcount structures (i.e. where the + * @additional_clusters may be placed) on success, -errno on error. + */ +int64_t qcow2_refcount_area(BlockDriverState *bs, uint64_t start_offset, + uint64_t additional_clusters, bool exact_size, + int new_refblock_index, + uint64_t new_refblock_offset) +{ + BDRVQcow2State *s = bs->opaque; + uint64_t total_refblock_count_u64, additional_refblock_count; + int total_refblock_count, table_size, area_reftable_index, table_clusters; + int i; + uint64_t table_offset, block_offset, end_offset; + int ret; + uint64_t *new_table; - /* Create the new refcount table and blocks */ - uint64_t meta_offset = (blocks_used * s->refcount_block_size) * - s->cluster_size; - uint64_t table_offset = meta_offset + blocks_clusters * s->cluster_size; - uint64_t *new_table = g_try_new0(uint64_t, table_size); - void *new_blocks = g_try_malloc0(blocks_clusters * s->cluster_size); + assert(!(start_offset % s->cluster_size)); + + qcow2_refcount_metadata_size(start_offset / s->cluster_size + + additional_clusters, + s->cluster_size, s->refcount_order, + !exact_size, &total_refblock_count_u64); + if (total_refblock_count_u64 > QCOW_MAX_REFTABLE_SIZE) { + return -EFBIG; + } + total_refblock_count = total_refblock_count_u64; + + /* Index in the refcount table of the first refcount block to cover the area + * of refcount structures we are about to create; we know that + * @total_refblock_count can cover @start_offset, so this will definitely + * fit into an int. */ + area_reftable_index = (start_offset / s->cluster_size) / + s->refcount_block_size; - assert(table_size > 0 && blocks_clusters > 0); - if (new_table == NULL || new_blocks == NULL) { + if (exact_size) { + table_size = total_refblock_count; + } else { + table_size = total_refblock_count + + DIV_ROUND_UP(total_refblock_count, 2); + } + /* The qcow2 file can only store the reftable size in number of clusters */ + table_size = ROUND_UP(table_size, s->cluster_size / sizeof(uint64_t)); + table_clusters = (table_size * sizeof(uint64_t)) / s->cluster_size; + + if (table_size > QCOW_MAX_REFTABLE_SIZE) { + return -EFBIG; + } + + new_table = g_try_new0(uint64_t, table_size); + + assert(table_size > 0); + if (new_table == NULL) { ret = -ENOMEM; - goto fail_table; + goto fail; } /* Fill the new refcount table */ - memcpy(new_table, s->refcount_table, - s->refcount_table_size * sizeof(uint64_t)); - new_table[refcount_table_index] = new_block; + if (table_size > s->max_refcount_table_index) { + /* We're actually growing the reftable */ + memcpy(new_table, s->refcount_table, + (s->max_refcount_table_index + 1) * sizeof(uint64_t)); + } else { + /* Improbable case: We're shrinking the reftable. However, the caller + * has assured us that there is only empty space beyond @start_offset, + * so we can simply drop all of the refblocks that won't fit into the + * new reftable. */ + memcpy(new_table, s->refcount_table, table_size * sizeof(uint64_t)); + } - int i; - for (i = 0; i < blocks_clusters; i++) { - new_table[blocks_used + i] = meta_offset + (i * s->cluster_size); + if (new_refblock_offset) { + assert(new_refblock_index < total_refblock_count); + new_table[new_refblock_index] = new_refblock_offset; } - /* Fill the refcount blocks */ - uint64_t table_clusters = size_to_clusters(s, table_size * sizeof(uint64_t)); - int block = 0; - for (i = 0; i < table_clusters + blocks_clusters; i++) { - s->set_refcount(new_blocks, block++, 1); + /* Count how many new refblocks we have to create */ + additional_refblock_count = 0; + for (i = area_reftable_index; i < total_refblock_count; i++) { + if (!new_table[i]) { + additional_refblock_count++; + } + } + + table_offset = start_offset + additional_refblock_count * s->cluster_size; + end_offset = table_offset + table_clusters * s->cluster_size; + + /* Fill the refcount blocks, and create new ones, if necessary */ + block_offset = start_offset; + for (i = area_reftable_index; i < total_refblock_count; i++) { + void *refblock_data; + uint64_t first_offset_covered; + + /* Reuse an existing refblock if possible, create a new one otherwise */ + if (new_table[i]) { + ret = qcow2_cache_get(bs, s->refcount_block_cache, new_table[i], + &refblock_data); + if (ret < 0) { + goto fail; + } + } else { + ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, + block_offset, &refblock_data); + if (ret < 0) { + goto fail; + } + memset(refblock_data, 0, s->cluster_size); + qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache, + refblock_data); + + new_table[i] = block_offset; + block_offset += s->cluster_size; + } + + /* First host offset covered by this refblock */ + first_offset_covered = (uint64_t)i * s->refcount_block_size * + s->cluster_size; + if (first_offset_covered < end_offset) { + int j, end_index; + + /* Set the refcount of all of the new refcount structures to 1 */ + + if (first_offset_covered < start_offset) { + assert(i == area_reftable_index); + j = (start_offset - first_offset_covered) / s->cluster_size; + assert(j < s->refcount_block_size); + } else { + j = 0; + } + + end_index = MIN((end_offset - first_offset_covered) / + s->cluster_size, + s->refcount_block_size); + + for (; j < end_index; j++) { + /* The caller guaranteed us this space would be empty */ + assert(s->get_refcount(refblock_data, j) == 0); + s->set_refcount(refblock_data, j, 1); + } + + qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache, + refblock_data); + } + + qcow2_cache_put(bs, s->refcount_block_cache, &refblock_data); } + assert(block_offset == table_offset); + /* Write refcount blocks to disk */ BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS); - ret = bdrv_pwrite_sync(bs->file, meta_offset, new_blocks, - blocks_clusters * s->cluster_size); - g_free(new_blocks); - new_blocks = NULL; + ret = qcow2_cache_flush(bs, s->refcount_block_cache); if (ret < 0) { - goto fail_table; + goto fail; } /* Write refcount table to disk */ - for(i = 0; i < table_size; i++) { + for (i = 0; i < total_refblock_count; i++) { cpu_to_be64s(&new_table[i]); } @@ -565,10 +673,10 @@ static int alloc_refcount_block(BlockDriverState *bs, ret = bdrv_pwrite_sync(bs->file, table_offset, new_table, table_size * sizeof(uint64_t)); if (ret < 0) { - goto fail_table; + goto fail; } - for(i = 0; i < table_size; i++) { + for (i = 0; i < total_refblock_count; i++) { be64_to_cpus(&new_table[i]); } @@ -584,7 +692,7 @@ static int alloc_refcount_block(BlockDriverState *bs, offsetof(QCowHeader, refcount_table_offset), &data, sizeof(data)); if (ret < 0) { - goto fail_table; + goto fail; } /* And switch it in memory */ @@ -601,23 +709,10 @@ static int alloc_refcount_block(BlockDriverState *bs, qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t), QCOW2_DISCARD_OTHER); - ret = load_refcount_block(bs, new_block, refcount_block); - if (ret < 0) { - return ret; - } + return end_offset; - /* If we were trying to do the initial refcount update for some cluster - * allocation, we might have used the same clusters to store newly - * allocated metadata. Make the caller search some new space. */ - return -EAGAIN; - -fail_table: - g_free(new_blocks); +fail: g_free(new_table); -fail_block: - if (*refcount_block != NULL) { - qcow2_cache_put(bs, s->refcount_block_cache, refcount_block); - } return ret; } @@ -1323,11 +1418,10 @@ static int realloc_refcount_array(BDRVQcow2State *s, void **array, * * Modifies the number of errors in res. */ -static int inc_refcounts(BlockDriverState *bs, - BdrvCheckResult *res, - void **refcount_table, - int64_t *refcount_table_size, - int64_t offset, int64_t size) +int qcow2_inc_refcounts_imrt(BlockDriverState *bs, BdrvCheckResult *res, + void **refcount_table, + int64_t *refcount_table_size, + int64_t offset, int64_t size) { BDRVQcow2State *s = bs->opaque; uint64_t start, last, cluster_offset, k, refcount; @@ -1420,8 +1514,9 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, nb_csectors = ((l2_entry >> s->csize_shift) & s->csize_mask) + 1; l2_entry &= s->cluster_offset_mask; - ret = inc_refcounts(bs, res, refcount_table, refcount_table_size, - l2_entry & ~511, nb_csectors * 512); + ret = qcow2_inc_refcounts_imrt(bs, res, + refcount_table, refcount_table_size, + l2_entry & ~511, nb_csectors * 512); if (ret < 0) { goto fail; } @@ -1454,8 +1549,9 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, } /* Mark cluster as used */ - ret = inc_refcounts(bs, res, refcount_table, refcount_table_size, - offset, s->cluster_size); + ret = qcow2_inc_refcounts_imrt(bs, res, + refcount_table, refcount_table_size, + offset, s->cluster_size); if (ret < 0) { goto fail; } @@ -1508,8 +1604,8 @@ static int check_refcounts_l1(BlockDriverState *bs, l1_size2 = l1_size * sizeof(uint64_t); /* Mark L1 table as used */ - ret = inc_refcounts(bs, res, refcount_table, refcount_table_size, - l1_table_offset, l1_size2); + ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, refcount_table_size, + l1_table_offset, l1_size2); if (ret < 0) { goto fail; } @@ -1538,8 +1634,9 @@ static int check_refcounts_l1(BlockDriverState *bs, if (l2_offset) { /* Mark L2 table as used */ l2_offset &= L1E_OFFSET_MASK; - ret = inc_refcounts(bs, res, refcount_table, refcount_table_size, - l2_offset, s->cluster_size); + ret = qcow2_inc_refcounts_imrt(bs, res, + refcount_table, refcount_table_size, + l2_offset, s->cluster_size); if (ret < 0) { goto fail; } @@ -1730,7 +1827,7 @@ static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res, } ret = bdrv_truncate(bs->file, offset + s->cluster_size, - &local_err); + PREALLOC_MODE_OFF, &local_err); if (ret < 0) { error_report_err(local_err); goto resize_fail; @@ -1757,14 +1854,15 @@ static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res, } res->corruptions_fixed++; - ret = inc_refcounts(bs, res, refcount_table, nb_clusters, - offset, s->cluster_size); + ret = qcow2_inc_refcounts_imrt(bs, res, + refcount_table, nb_clusters, + offset, s->cluster_size); if (ret < 0) { return ret; } /* No need to check whether the refcount is now greater than 1: * This area was just allocated and zeroed, so it can only be - * exactly 1 after inc_refcounts() */ + * exactly 1 after qcow2_inc_refcounts_imrt() */ continue; resize_fail: @@ -1779,8 +1877,8 @@ resize_fail: } if (offset != 0) { - ret = inc_refcounts(bs, res, refcount_table, nb_clusters, - offset, s->cluster_size); + ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters, + offset, s->cluster_size); if (ret < 0) { return ret; } @@ -1820,8 +1918,8 @@ static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res, } /* header */ - ret = inc_refcounts(bs, res, refcount_table, nb_clusters, - 0, s->cluster_size); + ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters, + 0, s->cluster_size); if (ret < 0) { return ret; } @@ -1842,16 +1940,32 @@ static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res, return ret; } } - ret = inc_refcounts(bs, res, refcount_table, nb_clusters, - s->snapshots_offset, s->snapshots_size); + ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters, + s->snapshots_offset, s->snapshots_size); if (ret < 0) { return ret; } /* refcount data */ - ret = inc_refcounts(bs, res, refcount_table, nb_clusters, - s->refcount_table_offset, - s->refcount_table_size * sizeof(uint64_t)); + ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters, + s->refcount_table_offset, + s->refcount_table_size * sizeof(uint64_t)); + if (ret < 0) { + return ret; + } + + /* encryption */ + if (s->crypto_header.length) { + ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters, + s->crypto_header.offset, + s->crypto_header.length); + if (ret < 0) { + return ret; + } + } + + /* bitmaps */ + ret = qcow2_check_bitmaps_refcounts(bs, res, refcount_table, nb_clusters); if (ret < 0) { return ret; } diff --git a/block/qcow2.c b/block/qcow2.c index 2f94f0326e..d5790af1e0 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -37,6 +37,9 @@ #include "qemu/option_int.h" #include "qemu/cutils.h" #include "qemu/bswap.h" +#include "qapi/opts-visitor.h" +#include "qapi-visit.h" +#include "block/crypto.h" /* Differences with QCOW: @@ -63,6 +66,8 @@ typedef struct { #define QCOW2_EXT_MAGIC_END 0 #define QCOW2_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA #define QCOW2_EXT_MAGIC_FEATURE_TABLE 0x6803f857 +#define QCOW2_EXT_MAGIC_CRYPTO_HEADER 0x0537be77 +#define QCOW2_EXT_MAGIC_BITMAPS 0x23852875 static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename) { @@ -77,6 +82,86 @@ static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename) } +static ssize_t qcow2_crypto_hdr_read_func(QCryptoBlock *block, size_t offset, + uint8_t *buf, size_t buflen, + void *opaque, Error **errp) +{ + BlockDriverState *bs = opaque; + BDRVQcow2State *s = bs->opaque; + ssize_t ret; + + if ((offset + buflen) > s->crypto_header.length) { + error_setg(errp, "Request for data outside of extension header"); + return -1; + } + + ret = bdrv_pread(bs->file, + s->crypto_header.offset + offset, buf, buflen); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not read encryption header"); + return -1; + } + return ret; +} + + +static ssize_t qcow2_crypto_hdr_init_func(QCryptoBlock *block, size_t headerlen, + void *opaque, Error **errp) +{ + BlockDriverState *bs = opaque; + BDRVQcow2State *s = bs->opaque; + int64_t ret; + int64_t clusterlen; + + ret = qcow2_alloc_clusters(bs, headerlen); + if (ret < 0) { + error_setg_errno(errp, -ret, + "Cannot allocate cluster for LUKS header size %zu", + headerlen); + return -1; + } + + s->crypto_header.length = headerlen; + s->crypto_header.offset = ret; + + /* Zero fill remaining space in cluster so it has predictable + * content in case of future spec changes */ + clusterlen = size_to_clusters(s, headerlen) * s->cluster_size; + ret = bdrv_pwrite_zeroes(bs->file, + ret + headerlen, + clusterlen - headerlen, 0); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not zero fill encryption header"); + return -1; + } + + return ret; +} + + +static ssize_t qcow2_crypto_hdr_write_func(QCryptoBlock *block, size_t offset, + const uint8_t *buf, size_t buflen, + void *opaque, Error **errp) +{ + BlockDriverState *bs = opaque; + BDRVQcow2State *s = bs->opaque; + ssize_t ret; + + if ((offset + buflen) > s->crypto_header.length) { + error_setg(errp, "Request for data outside of extension header"); + return -1; + } + + ret = bdrv_pwrite(bs->file, + s->crypto_header.offset + offset, buf, buflen); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not read encryption header"); + return -1; + } + return ret; +} + + /* * read qcow2 extension and fill bs * start reading from start_offset @@ -86,12 +171,18 @@ static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename) */ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, uint64_t end_offset, void **p_feature_table, + int flags, bool *need_update_header, Error **errp) { BDRVQcow2State *s = bs->opaque; QCowExtension ext; uint64_t offset; int ret; + Qcow2BitmapHeaderExt bitmaps_ext; + + if (need_update_header != NULL) { + *need_update_header = false; + } #ifdef DEBUG_EXT printf("qcow2_read_extensions: start=%ld end=%ld\n", start_offset, end_offset); @@ -162,6 +253,126 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, } break; + case QCOW2_EXT_MAGIC_CRYPTO_HEADER: { + unsigned int cflags = 0; + if (s->crypt_method_header != QCOW_CRYPT_LUKS) { + error_setg(errp, "CRYPTO header extension only " + "expected with LUKS encryption method"); + return -EINVAL; + } + if (ext.len != sizeof(Qcow2CryptoHeaderExtension)) { + error_setg(errp, "CRYPTO header extension size %u, " + "but expected size %zu", ext.len, + sizeof(Qcow2CryptoHeaderExtension)); + return -EINVAL; + } + + ret = bdrv_pread(bs->file, offset, &s->crypto_header, ext.len); + if (ret < 0) { + error_setg_errno(errp, -ret, + "Unable to read CRYPTO header extension"); + return ret; + } + be64_to_cpus(&s->crypto_header.offset); + be64_to_cpus(&s->crypto_header.length); + + if ((s->crypto_header.offset % s->cluster_size) != 0) { + error_setg(errp, "Encryption header offset '%" PRIu64 "' is " + "not a multiple of cluster size '%u'", + s->crypto_header.offset, s->cluster_size); + return -EINVAL; + } + + if (flags & BDRV_O_NO_IO) { + cflags |= QCRYPTO_BLOCK_OPEN_NO_IO; + } + s->crypto = qcrypto_block_open(s->crypto_opts, "encrypt.", + qcow2_crypto_hdr_read_func, + bs, cflags, errp); + if (!s->crypto) { + return -EINVAL; + } + } break; + + case QCOW2_EXT_MAGIC_BITMAPS: + if (ext.len != sizeof(bitmaps_ext)) { + error_setg_errno(errp, -ret, "bitmaps_ext: " + "Invalid extension length"); + return -EINVAL; + } + + if (!(s->autoclear_features & QCOW2_AUTOCLEAR_BITMAPS)) { + error_report("WARNING: a program lacking bitmap support " + "modified this file, so all bitmaps are now " + "considered inconsistent. Some clusters may be " + "leaked, run 'qemu-img check -r' on the image " + "file to fix."); + if (need_update_header != NULL) { + /* Updating is needed to drop invalid bitmap extension. */ + *need_update_header = true; + } + break; + } + + ret = bdrv_pread(bs->file, offset, &bitmaps_ext, ext.len); + if (ret < 0) { + error_setg_errno(errp, -ret, "bitmaps_ext: " + "Could not read ext header"); + return ret; + } + + if (bitmaps_ext.reserved32 != 0) { + error_setg_errno(errp, -ret, "bitmaps_ext: " + "Reserved field is not zero"); + return -EINVAL; + } + + be32_to_cpus(&bitmaps_ext.nb_bitmaps); + be64_to_cpus(&bitmaps_ext.bitmap_directory_size); + be64_to_cpus(&bitmaps_ext.bitmap_directory_offset); + + if (bitmaps_ext.nb_bitmaps > QCOW2_MAX_BITMAPS) { + error_setg(errp, + "bitmaps_ext: Image has %" PRIu32 " bitmaps, " + "exceeding the QEMU supported maximum of %d", + bitmaps_ext.nb_bitmaps, QCOW2_MAX_BITMAPS); + return -EINVAL; + } + + if (bitmaps_ext.nb_bitmaps == 0) { + error_setg(errp, "found bitmaps extension with zero bitmaps"); + return -EINVAL; + } + + if (bitmaps_ext.bitmap_directory_offset & (s->cluster_size - 1)) { + error_setg(errp, "bitmaps_ext: " + "invalid bitmap directory offset"); + return -EINVAL; + } + + if (bitmaps_ext.bitmap_directory_size > + QCOW2_MAX_BITMAP_DIRECTORY_SIZE) { + error_setg(errp, "bitmaps_ext: " + "bitmap directory size (%" PRIu64 ") exceeds " + "the maximum supported size (%d)", + bitmaps_ext.bitmap_directory_size, + QCOW2_MAX_BITMAP_DIRECTORY_SIZE); + return -EINVAL; + } + + s->nb_bitmaps = bitmaps_ext.nb_bitmaps; + s->bitmap_directory_offset = + bitmaps_ext.bitmap_directory_offset; + s->bitmap_directory_size = + bitmaps_ext.bitmap_directory_size; + +#ifdef DEBUG_EXT + printf("Qcow2: Got bitmaps extension: " + "offset=%" PRIu64 " nb_bitmaps=%" PRIu32 "\n", + s->bitmap_directory_offset, s->nb_bitmaps); +#endif + break; + default: /* unknown magic - save it in case we need to rewrite the header */ { @@ -461,6 +672,8 @@ static QemuOptsList qcow2_runtime_opts = { .type = QEMU_OPT_NUMBER, .help = "Clean unused cache entries after this time (in seconds)", }, + BLOCK_CRYPTO_OPT_DEF_KEY_SECRET("encrypt.", + "ID of secret providing qcow2 AES key or LUKS passphrase"), { /* end of list */ } }, }; @@ -585,6 +798,7 @@ typedef struct Qcow2ReopenState { int overlap_check; bool discard_passthrough[QCOW2_DISCARD_MAX]; uint64_t cache_clean_interval; + QCryptoBlockOpenOptions *crypto_opts; /* Disk encryption runtime options */ } Qcow2ReopenState; static int qcow2_update_options_prepare(BlockDriverState *bs, @@ -598,9 +812,14 @@ static int qcow2_update_options_prepare(BlockDriverState *bs, int overlap_check_template = 0; uint64_t l2_cache_size, refcount_cache_size; int i; + const char *encryptfmt; + QDict *encryptopts = NULL; Error *local_err = NULL; int ret; + qdict_extract_subqdict(options, &encryptopts, "encrypt."); + encryptfmt = qdict_get_try_str(encryptopts, "format"); + opts = qemu_opts_create(&qcow2_runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &local_err); if (local_err) { @@ -751,8 +970,55 @@ static int qcow2_update_options_prepare(BlockDriverState *bs, r->discard_passthrough[QCOW2_DISCARD_OTHER] = qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_OTHER, false); + switch (s->crypt_method_header) { + case QCOW_CRYPT_NONE: + if (encryptfmt) { + error_setg(errp, "No encryption in image header, but options " + "specified format '%s'", encryptfmt); + ret = -EINVAL; + goto fail; + } + break; + + case QCOW_CRYPT_AES: + if (encryptfmt && !g_str_equal(encryptfmt, "aes")) { + error_setg(errp, + "Header reported 'aes' encryption format but " + "options specify '%s'", encryptfmt); + ret = -EINVAL; + goto fail; + } + qdict_del(encryptopts, "format"); + r->crypto_opts = block_crypto_open_opts_init( + Q_CRYPTO_BLOCK_FORMAT_QCOW, encryptopts, errp); + break; + + case QCOW_CRYPT_LUKS: + if (encryptfmt && !g_str_equal(encryptfmt, "luks")) { + error_setg(errp, + "Header reported 'luks' encryption format but " + "options specify '%s'", encryptfmt); + ret = -EINVAL; + goto fail; + } + qdict_del(encryptopts, "format"); + r->crypto_opts = block_crypto_open_opts_init( + Q_CRYPTO_BLOCK_FORMAT_LUKS, encryptopts, errp); + break; + + default: + error_setg(errp, "Unsupported encryption method %d", + s->crypt_method_header); + break; + } + if (s->crypt_method_header != QCOW_CRYPT_NONE && !r->crypto_opts) { + ret = -EINVAL; + goto fail; + } + ret = 0; fail: + QDECREF(encryptopts); qemu_opts_del(opts); opts = NULL; return ret; @@ -785,6 +1051,9 @@ static void qcow2_update_options_commit(BlockDriverState *bs, s->cache_clean_interval = r->cache_clean_interval; cache_clean_timer_init(bs, bdrv_get_aio_context(bs)); } + + qapi_free_QCryptoBlockOpenOptions(s->crypto_opts); + s->crypto_opts = r->crypto_opts; } static void qcow2_update_options_abort(BlockDriverState *bs, @@ -796,6 +1065,7 @@ static void qcow2_update_options_abort(BlockDriverState *bs, if (r->refcount_block_cache) { qcow2_cache_destroy(bs, r->refcount_block_cache); } + qapi_free_QCryptoBlockOpenOptions(r->crypto_opts); } static int qcow2_update_options(BlockDriverState *bs, QDict *options, @@ -824,6 +1094,7 @@ static int qcow2_do_open(BlockDriverState *bs, QDict *options, int flags, Error *local_err = NULL; uint64_t ext_end; uint64_t l1_vm_state_index; + bool update_header = false; ret = bdrv_pread(bs->file, 0, &header, sizeof(header)); if (ret < 0) { @@ -929,7 +1200,7 @@ static int qcow2_do_open(BlockDriverState *bs, QDict *options, int flags, if (s->incompatible_features & ~QCOW2_INCOMPAT_MASK) { void *feature_table = NULL; qcow2_read_extensions(bs, header.header_length, ext_end, - &feature_table, NULL); + &feature_table, flags, NULL, NULL); report_unsupported_feature(errp, feature_table, s->incompatible_features & ~QCOW2_INCOMPAT_MASK); @@ -961,18 +1232,6 @@ static int qcow2_do_open(BlockDriverState *bs, QDict *options, int flags, s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1); s->refcount_max += s->refcount_max - 1; - if (header.crypt_method > QCOW_CRYPT_AES) { - error_setg(errp, "Unsupported encryption method: %" PRIu32, - header.crypt_method); - ret = -EINVAL; - goto fail; - } - if (!qcrypto_cipher_supports(QCRYPTO_CIPHER_ALG_AES_128, - QCRYPTO_CIPHER_MODE_CBC)) { - error_setg(errp, "AES cipher not available"); - ret = -EINVAL; - goto fail; - } s->crypt_method_header = header.crypt_method; if (s->crypt_method_header) { if (bdrv_uses_whitelist() && @@ -989,6 +1248,15 @@ static int qcow2_do_open(BlockDriverState *bs, QDict *options, int flags, goto fail; } + if (s->crypt_method_header == QCOW_CRYPT_AES) { + s->crypt_physical_offset = false; + } else { + /* Assuming LUKS and any future crypt methods we + * add will all use physical offsets, due to the + * fact that the alternative is insecure... */ + s->crypt_physical_offset = true; + } + bs->encrypted = true; } @@ -1116,12 +1384,36 @@ static int qcow2_do_open(BlockDriverState *bs, QDict *options, int flags, /* read qcow2 extensions */ if (qcow2_read_extensions(bs, header.header_length, ext_end, NULL, - &local_err)) { + flags, &update_header, &local_err)) { error_propagate(errp, local_err); ret = -EINVAL; goto fail; } + /* qcow2_read_extension may have set up the crypto context + * if the crypt method needs a header region, some methods + * don't need header extensions, so must check here + */ + if (s->crypt_method_header && !s->crypto) { + if (s->crypt_method_header == QCOW_CRYPT_AES) { + unsigned int cflags = 0; + if (flags & BDRV_O_NO_IO) { + cflags |= QCRYPTO_BLOCK_OPEN_NO_IO; + } + s->crypto = qcrypto_block_open(s->crypto_opts, "encrypt.", + NULL, NULL, cflags, errp); + if (!s->crypto) { + ret = -EINVAL; + goto fail; + } + } else if (!(flags & BDRV_O_NO_IO)) { + error_setg(errp, "Missing CRYPTO header for crypt method %d", + s->crypt_method_header); + ret = -EINVAL; + goto fail; + } + } + /* read the backing file name */ if (header.backing_file_offset != 0) { len = header.backing_file_size; @@ -1152,8 +1444,23 @@ static int qcow2_do_open(BlockDriverState *bs, QDict *options, int flags, } /* Clear unknown autoclear feature bits */ - if (!bs->read_only && !(flags & BDRV_O_INACTIVE) && s->autoclear_features) { - s->autoclear_features = 0; + update_header |= s->autoclear_features & ~QCOW2_AUTOCLEAR_MASK; + update_header = + update_header && !bs->read_only && !(flags & BDRV_O_INACTIVE); + if (update_header) { + s->autoclear_features &= QCOW2_AUTOCLEAR_MASK; + } + + if (qcow2_load_autoloading_dirty_bitmaps(bs, &local_err)) { + update_header = false; + } + if (local_err != NULL) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto fail; + } + + if (update_header) { ret = qcow2_update_header(bs); if (ret < 0) { error_setg_errno(errp, -ret, "Could not update qcow2 header"); @@ -1202,6 +1509,8 @@ static int qcow2_do_open(BlockDriverState *bs, QDict *options, int flags, } g_free(s->cluster_cache); qemu_vfree(s->cluster_data); + qcrypto_block_free(s->crypto); + qapi_free_QCryptoBlockOpenOptions(s->crypto_opts); return ret; } @@ -1229,41 +1538,6 @@ static void qcow2_refresh_limits(BlockDriverState *bs, Error **errp) bs->bl.pdiscard_alignment = s->cluster_size; } -static int qcow2_set_key(BlockDriverState *bs, const char *key) -{ - BDRVQcow2State *s = bs->opaque; - uint8_t keybuf[16]; - int len, i; - Error *err = NULL; - - memset(keybuf, 0, 16); - len = strlen(key); - if (len > 16) - len = 16; - /* XXX: we could compress the chars to 7 bits to increase - entropy */ - for(i = 0;i < len;i++) { - keybuf[i] = key[i]; - } - assert(bs->encrypted); - - qcrypto_cipher_free(s->cipher); - s->cipher = qcrypto_cipher_new( - QCRYPTO_CIPHER_ALG_AES_128, - QCRYPTO_CIPHER_MODE_CBC, - keybuf, G_N_ELEMENTS(keybuf), - &err); - - if (!s->cipher) { - /* XXX would be nice if errors in this method could - * be properly propagate to the caller. Would need - * the bdrv_set_key() API signature to be fixed. */ - error_free(err); - return -1; - } - return 0; -} - static int qcow2_reopen_prepare(BDRVReopenState *state, BlockReopenQueue *queue, Error **errp) { @@ -1281,6 +1555,11 @@ static int qcow2_reopen_prepare(BDRVReopenState *state, /* We need to write out any unwritten data if we reopen read-only. */ if ((state->flags & BDRV_O_RDWR) == 0) { + ret = qcow2_reopen_bitmaps_ro(state->bs, errp); + if (ret < 0) { + goto fail; + } + ret = bdrv_flush(state->bs); if (ret < 0) { goto fail; @@ -1379,7 +1658,7 @@ static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs, *pnum = bytes >> BDRV_SECTOR_BITS; if (cluster_offset != 0 && ret != QCOW2_CLUSTER_COMPRESSED && - !s->cipher) { + !s->crypto) { index_in_cluster = sector_num & (s->cluster_sectors - 1); cluster_offset |= (index_in_cluster << BDRV_SECTOR_BITS); *file = bs->file->bs; @@ -1436,7 +1715,7 @@ static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset, /* prepare next request */ cur_bytes = MIN(bytes, INT_MAX); - if (s->cipher) { + if (s->crypto) { cur_bytes = MIN(cur_bytes, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size); } @@ -1506,7 +1785,7 @@ static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset, } if (bs->encrypted) { - assert(s->cipher); + assert(s->crypto); /* * For encrypted images, read everything into a temporary @@ -1538,14 +1817,17 @@ static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset, goto fail; } if (bs->encrypted) { - assert(s->cipher); + assert(s->crypto); assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); assert((cur_bytes & (BDRV_SECTOR_SIZE - 1)) == 0); Error *err = NULL; - if (qcow2_encrypt_sectors(s, offset >> BDRV_SECTOR_BITS, - cluster_data, cluster_data, - cur_bytes >> BDRV_SECTOR_BITS, - false, &err) < 0) { + if (qcrypto_block_decrypt(s->crypto, + (s->crypt_physical_offset ? + cluster_offset + offset_in_cluster : + offset) >> BDRV_SECTOR_BITS, + cluster_data, + cur_bytes, + &err) < 0) { error_free(err); ret = -EIO; goto fail; @@ -1661,7 +1943,7 @@ static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset, if (bs->encrypted) { Error *err = NULL; - assert(s->cipher); + assert(s->crypto); if (!cluster_data) { cluster_data = qemu_try_blockalign(bs->file->bs, QCOW_MAX_CRYPT_CLUSTERS @@ -1676,10 +1958,12 @@ static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size); qemu_iovec_to_buf(&hd_qiov, 0, cluster_data, hd_qiov.size); - if (qcow2_encrypt_sectors(s, offset >> BDRV_SECTOR_BITS, - cluster_data, cluster_data, - cur_bytes >>BDRV_SECTOR_BITS, - true, &err) < 0) { + if (qcrypto_block_encrypt(s->crypto, + (s->crypt_physical_offset ? + cluster_offset + offset_in_cluster : + offset) >> BDRV_SECTOR_BITS, + cluster_data, + cur_bytes, &err) < 0) { error_free(err); ret = -EIO; goto fail; @@ -1741,8 +2025,6 @@ static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset, ret = 0; fail: - qemu_co_mutex_unlock(&s->lock); - while (l2meta != NULL) { QCowL2Meta *next; @@ -1756,6 +2038,8 @@ fail: l2meta = next; } + qemu_co_mutex_unlock(&s->lock); + qemu_iovec_destroy(&hd_qiov); qemu_vfree(cluster_data); trace_qcow2_writev_done_req(qemu_coroutine_self(), ret); @@ -1767,6 +2051,7 @@ static int qcow2_inactivate(BlockDriverState *bs) { BDRVQcow2State *s = bs->opaque; int ret, result = 0; + Error *local_err = NULL; ret = qcow2_cache_flush(bs, s->l2_table_cache); if (ret) { @@ -1782,6 +2067,14 @@ static int qcow2_inactivate(BlockDriverState *bs) strerror(-ret)); } + qcow2_store_persistent_dirty_bitmaps(bs, &local_err); + if (local_err != NULL) { + result = -EINVAL; + error_report_err(local_err); + error_report("Persistent bitmaps are lost for node '%s'", + bdrv_get_device_or_node_name(bs)); + } + if (result == 0) { qcow2_mark_clean(bs); } @@ -1804,8 +2097,8 @@ static void qcow2_close(BlockDriverState *bs) qcow2_cache_destroy(bs, s->l2_table_cache); qcow2_cache_destroy(bs, s->refcount_block_cache); - qcrypto_cipher_free(s->cipher); - s->cipher = NULL; + qcrypto_block_free(s->crypto); + s->crypto = NULL; g_free(s->unknown_header_fields); cleanup_unknown_header_ext(bs); @@ -1823,7 +2116,7 @@ static void qcow2_invalidate_cache(BlockDriverState *bs, Error **errp) { BDRVQcow2State *s = bs->opaque; int flags = s->flags; - QCryptoCipher *cipher = NULL; + QCryptoBlock *crypto = NULL; QDict *options; Error *local_err = NULL; int ret; @@ -1833,8 +2126,8 @@ static void qcow2_invalidate_cache(BlockDriverState *bs, Error **errp) * that means we don't have to worry about reopening them here. */ - cipher = s->cipher; - s->cipher = NULL; + crypto = s->crypto; + s->crypto = NULL; qcow2_close(bs); @@ -1855,7 +2148,7 @@ static void qcow2_invalidate_cache(BlockDriverState *bs, Error **errp) return; } - s->cipher = cipher; + s->crypto = crypto; } static size_t header_ext_add(char *buf, uint32_t magic, const void *s, @@ -1981,6 +2274,22 @@ int qcow2_update_header(BlockDriverState *bs) buflen -= ret; } + /* Full disk encryption header pointer extension */ + if (s->crypto_header.offset != 0) { + cpu_to_be64s(&s->crypto_header.offset); + cpu_to_be64s(&s->crypto_header.length); + ret = header_ext_add(buf, QCOW2_EXT_MAGIC_CRYPTO_HEADER, + &s->crypto_header, sizeof(s->crypto_header), + buflen); + be64_to_cpus(&s->crypto_header.offset); + be64_to_cpus(&s->crypto_header.length); + if (ret < 0) { + goto fail; + } + buf += ret; + buflen -= ret; + } + /* Feature table */ if (s->qcow_version >= 3) { Qcow2Feature features[] = { @@ -2010,6 +2319,25 @@ int qcow2_update_header(BlockDriverState *bs) buflen -= ret; } + /* Bitmap extension */ + if (s->nb_bitmaps > 0) { + Qcow2BitmapHeaderExt bitmaps_header = { + .nb_bitmaps = cpu_to_be32(s->nb_bitmaps), + .bitmap_directory_size = + cpu_to_be64(s->bitmap_directory_size), + .bitmap_directory_offset = + cpu_to_be64(s->bitmap_directory_offset) + }; + ret = header_ext_add(buf, QCOW2_EXT_MAGIC_BITMAPS, + &bitmaps_header, sizeof(bitmaps_header), + buflen); + if (ret < 0) { + goto fail; + } + buf += ret; + buflen -= ret; + } + /* Keep unknown header extensions */ QLIST_FOREACH(uext, &s->unknown_header_ext, next) { ret = header_ext_add(buf, uext->magic, uext->data, uext->len, buflen); @@ -2079,24 +2407,105 @@ static int qcow2_change_backing_file(BlockDriverState *bs, return qcow2_update_header(bs); } -static int preallocate(BlockDriverState *bs) +static int qcow2_crypt_method_from_format(const char *encryptfmt) { + if (g_str_equal(encryptfmt, "luks")) { + return QCOW_CRYPT_LUKS; + } else if (g_str_equal(encryptfmt, "aes")) { + return QCOW_CRYPT_AES; + } else { + return -EINVAL; + } +} + +static int qcow2_set_up_encryption(BlockDriverState *bs, const char *encryptfmt, + QemuOpts *opts, Error **errp) +{ + BDRVQcow2State *s = bs->opaque; + QCryptoBlockCreateOptions *cryptoopts = NULL; + QCryptoBlock *crypto = NULL; + int ret = -EINVAL; + QDict *options, *encryptopts; + int fmt; + + options = qemu_opts_to_qdict(opts, NULL); + qdict_extract_subqdict(options, &encryptopts, "encrypt."); + QDECREF(options); + + fmt = qcow2_crypt_method_from_format(encryptfmt); + + switch (fmt) { + case QCOW_CRYPT_LUKS: + cryptoopts = block_crypto_create_opts_init( + Q_CRYPTO_BLOCK_FORMAT_LUKS, encryptopts, errp); + break; + case QCOW_CRYPT_AES: + cryptoopts = block_crypto_create_opts_init( + Q_CRYPTO_BLOCK_FORMAT_QCOW, encryptopts, errp); + break; + default: + error_setg(errp, "Unknown encryption format '%s'", encryptfmt); + break; + } + if (!cryptoopts) { + ret = -EINVAL; + goto out; + } + s->crypt_method_header = fmt; + + crypto = qcrypto_block_create(cryptoopts, "encrypt.", + qcow2_crypto_hdr_init_func, + qcow2_crypto_hdr_write_func, + bs, errp); + if (!crypto) { + ret = -EINVAL; + goto out; + } + + ret = qcow2_update_header(bs); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not write encryption header"); + goto out; + } + + out: + QDECREF(encryptopts); + qcrypto_block_free(crypto); + qapi_free_QCryptoBlockCreateOptions(cryptoopts); + return ret; +} + + +/** + * Preallocates metadata structures for data clusters between @offset (in the + * guest disk) and @new_length (which is thus generally the new guest disk + * size). + * + * Returns: 0 on success, -errno on failure. + */ +static int preallocate(BlockDriverState *bs, + uint64_t offset, uint64_t new_length) +{ + BDRVQcow2State *s = bs->opaque; uint64_t bytes; - uint64_t offset; uint64_t host_offset = 0; unsigned int cur_bytes; int ret; QCowL2Meta *meta; - bytes = bdrv_getlength(bs); - offset = 0; + if (qemu_in_coroutine()) { + qemu_co_mutex_lock(&s->lock); + } + + assert(offset <= new_length); + bytes = new_length - offset; while (bytes) { cur_bytes = MIN(bytes, INT_MAX); ret = qcow2_alloc_cluster_offset(bs, offset, &cur_bytes, &host_offset, &meta); if (ret < 0) { - return ret; + goto done; } while (meta) { @@ -2106,7 +2515,7 @@ static int preallocate(BlockDriverState *bs) if (ret < 0) { qcow2_free_any_clusters(bs, meta->alloc_offset, meta->nb_clusters, QCOW2_DISCARD_NEVER); - return ret; + goto done; } /* There are no dependent requests, but we need to remove our @@ -2133,32 +2542,174 @@ static int preallocate(BlockDriverState *bs) ret = bdrv_pwrite(bs->file, (host_offset + cur_bytes) - 1, &data, 1); if (ret < 0) { - return ret; + goto done; } } - return 0; + ret = 0; + +done: + if (qemu_in_coroutine()) { + qemu_co_mutex_unlock(&s->lock); + } + return ret; } -static int qcow2_create2(const char *filename, int64_t total_size, - const char *backing_file, const char *backing_format, - int flags, size_t cluster_size, PreallocMode prealloc, - QemuOpts *opts, int version, int refcount_order, - Error **errp) +/* qcow2_refcount_metadata_size: + * @clusters: number of clusters to refcount (including data and L1/L2 tables) + * @cluster_size: size of a cluster, in bytes + * @refcount_order: refcount bits power-of-2 exponent + * @generous_increase: allow for the refcount table to be 1.5x as large as it + * needs to be + * + * Returns: Number of bytes required for refcount blocks and table metadata. + */ +int64_t qcow2_refcount_metadata_size(int64_t clusters, size_t cluster_size, + int refcount_order, bool generous_increase, + uint64_t *refblock_count) +{ + /* + * Every host cluster is reference-counted, including metadata (even + * refcount metadata is recursively included). + * + * An accurate formula for the size of refcount metadata size is difficult + * to derive. An easier method of calculation is finding the fixed point + * where no further refcount blocks or table clusters are required to + * reference count every cluster. + */ + int64_t blocks_per_table_cluster = cluster_size / sizeof(uint64_t); + int64_t refcounts_per_block = cluster_size * 8 / (1 << refcount_order); + int64_t table = 0; /* number of refcount table clusters */ + int64_t blocks = 0; /* number of refcount block clusters */ + int64_t last; + int64_t n = 0; + + do { + last = n; + blocks = DIV_ROUND_UP(clusters + table + blocks, refcounts_per_block); + table = DIV_ROUND_UP(blocks, blocks_per_table_cluster); + n = clusters + blocks + table; + + if (n == last && generous_increase) { + clusters += DIV_ROUND_UP(table, 2); + n = 0; /* force another loop */ + generous_increase = false; + } + } while (n != last); + + if (refblock_count) { + *refblock_count = blocks; + } + + return (blocks + table) * cluster_size; +} + +/** + * qcow2_calc_prealloc_size: + * @total_size: virtual disk size in bytes + * @cluster_size: cluster size in bytes + * @refcount_order: refcount bits power-of-2 exponent + * + * Returns: Total number of bytes required for the fully allocated image + * (including metadata). + */ +static int64_t qcow2_calc_prealloc_size(int64_t total_size, + size_t cluster_size, + int refcount_order) +{ + int64_t meta_size = 0; + uint64_t nl1e, nl2e; + int64_t aligned_total_size = align_offset(total_size, cluster_size); + + /* header: 1 cluster */ + meta_size += cluster_size; + + /* total size of L2 tables */ + nl2e = aligned_total_size / cluster_size; + nl2e = align_offset(nl2e, cluster_size / sizeof(uint64_t)); + meta_size += nl2e * sizeof(uint64_t); + + /* total size of L1 tables */ + nl1e = nl2e * sizeof(uint64_t) / cluster_size; + nl1e = align_offset(nl1e, cluster_size / sizeof(uint64_t)); + meta_size += nl1e * sizeof(uint64_t); + + /* total size of refcount table and blocks */ + meta_size += qcow2_refcount_metadata_size( + (meta_size + aligned_total_size) / cluster_size, + cluster_size, refcount_order, false, NULL); + + return meta_size + aligned_total_size; +} + +static size_t qcow2_opt_get_cluster_size_del(QemuOpts *opts, Error **errp) { + size_t cluster_size; int cluster_bits; - QDict *options; - /* Calculate cluster_bits */ + cluster_size = qemu_opt_get_size_del(opts, BLOCK_OPT_CLUSTER_SIZE, + DEFAULT_CLUSTER_SIZE); cluster_bits = ctz32(cluster_size); if (cluster_bits < MIN_CLUSTER_BITS || cluster_bits > MAX_CLUSTER_BITS || (1 << cluster_bits) != cluster_size) { error_setg(errp, "Cluster size must be a power of two between %d and " "%dk", 1 << MIN_CLUSTER_BITS, 1 << (MAX_CLUSTER_BITS - 10)); - return -EINVAL; + return 0; + } + return cluster_size; +} + +static int qcow2_opt_get_version_del(QemuOpts *opts, Error **errp) +{ + char *buf; + int ret; + + buf = qemu_opt_get_del(opts, BLOCK_OPT_COMPAT_LEVEL); + if (!buf) { + ret = 3; /* default */ + } else if (!strcmp(buf, "0.10")) { + ret = 2; + } else if (!strcmp(buf, "1.1")) { + ret = 3; + } else { + error_setg(errp, "Invalid compatibility level: '%s'", buf); + ret = -EINVAL; + } + g_free(buf); + return ret; +} + +static uint64_t qcow2_opt_get_refcount_bits_del(QemuOpts *opts, int version, + Error **errp) +{ + uint64_t refcount_bits; + + refcount_bits = qemu_opt_get_number_del(opts, BLOCK_OPT_REFCOUNT_BITS, 16); + if (refcount_bits > 64 || !is_power_of_2(refcount_bits)) { + error_setg(errp, "Refcount width must be a power of two and may not " + "exceed 64 bits"); + return 0; } + if (version < 3 && refcount_bits != 16) { + error_setg(errp, "Different refcount widths than 16 bits require " + "compatibility level 1.1 or above (use compat=1.1 or " + "greater)"); + return 0; + } + + return refcount_bits; +} + +static int qcow2_create2(const char *filename, int64_t total_size, + const char *backing_file, const char *backing_format, + int flags, size_t cluster_size, PreallocMode prealloc, + QemuOpts *opts, int version, int refcount_order, + const char *encryptfmt, Error **errp) +{ + QDict *options; + /* * Open the image file and write a minimal qcow2 header. * @@ -2178,65 +2729,9 @@ static int qcow2_create2(const char *filename, int64_t total_size, int ret; if (prealloc == PREALLOC_MODE_FULL || prealloc == PREALLOC_MODE_FALLOC) { - /* Note: The following calculation does not need to be exact; if it is a - * bit off, either some bytes will be "leaked" (which is fine) or we - * will need to increase the file size by some bytes (which is fine, - * too, as long as the bulk is allocated here). Therefore, using - * floating point arithmetic is fine. */ - int64_t meta_size = 0; - uint64_t nreftablee, nrefblocke, nl1e, nl2e, refblock_count; - int64_t aligned_total_size = align_offset(total_size, cluster_size); - int refblock_bits, refblock_size; - /* refcount entry size in bytes */ - double rces = (1 << refcount_order) / 8.; - - /* see qcow2_open() */ - refblock_bits = cluster_bits - (refcount_order - 3); - refblock_size = 1 << refblock_bits; - - /* header: 1 cluster */ - meta_size += cluster_size; - - /* total size of L2 tables */ - nl2e = aligned_total_size / cluster_size; - nl2e = align_offset(nl2e, cluster_size / sizeof(uint64_t)); - meta_size += nl2e * sizeof(uint64_t); - - /* total size of L1 tables */ - nl1e = nl2e * sizeof(uint64_t) / cluster_size; - nl1e = align_offset(nl1e, cluster_size / sizeof(uint64_t)); - meta_size += nl1e * sizeof(uint64_t); - - /* total size of refcount blocks - * - * note: every host cluster is reference-counted, including metadata - * (even refcount blocks are recursively included). - * Let: - * a = total_size (this is the guest disk size) - * m = meta size not including refcount blocks and refcount tables - * c = cluster size - * y1 = number of refcount blocks entries - * y2 = meta size including everything - * rces = refcount entry size in bytes - * then, - * y1 = (y2 + a)/c - * y2 = y1 * rces + y1 * rces * sizeof(u64) / c + m - * we can get y1: - * y1 = (a + m) / (c - rces - rces * sizeof(u64) / c) - */ - nrefblocke = (aligned_total_size + meta_size + cluster_size) - / (cluster_size - rces - rces * sizeof(uint64_t) - / cluster_size); - refblock_count = DIV_ROUND_UP(nrefblocke, refblock_size); - meta_size += refblock_count * cluster_size; - - /* total size of refcount tables */ - nreftablee = align_offset(refblock_count, - cluster_size / sizeof(uint64_t)); - meta_size += nreftablee * sizeof(uint64_t); - - qemu_opt_set_number(opts, BLOCK_OPT_SIZE, - aligned_total_size + meta_size, &error_abort); + int64_t prealloc_size = + qcow2_calc_prealloc_size(total_size, cluster_size, refcount_order); + qemu_opt_set_number(opts, BLOCK_OPT_SIZE, prealloc_size, &error_abort); qemu_opt_set(opts, BLOCK_OPT_PREALLOC, PreallocMode_lookup[prealloc], &error_abort); } @@ -2263,7 +2758,7 @@ static int qcow2_create2(const char *filename, int64_t total_size, *header = (QCowHeader) { .magic = cpu_to_be32(QCOW_MAGIC), .version = cpu_to_be32(version), - .cluster_bits = cpu_to_be32(cluster_bits), + .cluster_bits = cpu_to_be32(ctz32(cluster_size)), .size = cpu_to_be64(0), .l1_table_offset = cpu_to_be64(0), .l1_size = cpu_to_be32(0), @@ -2273,11 +2768,8 @@ static int qcow2_create2(const char *filename, int64_t total_size, .header_length = cpu_to_be32(sizeof(*header)), }; - if (flags & BLOCK_FLAG_ENCRYPT) { - header->crypt_method = cpu_to_be32(QCOW_CRYPT_AES); - } else { - header->crypt_method = cpu_to_be32(QCOW_CRYPT_NONE); - } + /* We'll update this to correct value later */ + header->crypt_method = cpu_to_be32(QCOW_CRYPT_NONE); if (flags & BLOCK_FLAG_LAZY_REFCOUNTS) { header->compatible_features |= @@ -2340,7 +2832,7 @@ static int qcow2_create2(const char *filename, int64_t total_size, } /* Okay, now that we have a valid image, let's give it the right size */ - ret = blk_truncate(blk, total_size, errp); + ret = blk_truncate(blk, total_size, PREALLOC_MODE_OFF, errp); if (ret < 0) { error_prepend(errp, "Could not resize image: "); goto out; @@ -2356,12 +2848,17 @@ static int qcow2_create2(const char *filename, int64_t total_size, } } + /* Want encryption? There you go. */ + if (encryptfmt) { + ret = qcow2_set_up_encryption(blk_bs(blk), encryptfmt, opts, errp); + if (ret < 0) { + goto out; + } + } + /* And if we're supposed to preallocate metadata, do that now */ if (prealloc != PREALLOC_MODE_OFF) { - BDRVQcow2State *s = blk_bs(blk)->opaque; - qemu_co_mutex_lock(&s->lock); - ret = preallocate(blk_bs(blk)); - qemu_co_mutex_unlock(&s->lock); + ret = preallocate(blk_bs(blk), 0, total_size); if (ret < 0) { error_setg_errno(errp, -ret, "Could not preallocate metadata"); goto out; @@ -2371,11 +2868,17 @@ static int qcow2_create2(const char *filename, int64_t total_size, blk_unref(blk); blk = NULL; - /* Reopen the image without BDRV_O_NO_FLUSH to flush it before returning */ + /* Reopen the image without BDRV_O_NO_FLUSH to flush it before returning. + * Using BDRV_O_NO_IO, since encryption is now setup we don't want to + * have to setup decryption context. We're not doing any I/O on the top + * level BlockDriverState, only lower layers, where BDRV_O_NO_IO does + * not have effect. + */ options = qdict_new(); qdict_put_str(options, "driver", "qcow2"); blk = blk_new_open(filename, NULL, options, - BDRV_O_RDWR | BDRV_O_NO_BACKING, &local_err); + BDRV_O_RDWR | BDRV_O_NO_BACKING | BDRV_O_NO_IO, + &local_err); if (blk == NULL) { error_propagate(errp, local_err); ret = -EIO; @@ -2399,9 +2902,10 @@ static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp) int flags = 0; size_t cluster_size = DEFAULT_CLUSTER_SIZE; PreallocMode prealloc; - int version = 3; - uint64_t refcount_bits = 16; + int version; + uint64_t refcount_bits; int refcount_order; + const char *encryptfmt = NULL; Error *local_err = NULL; int ret; @@ -2410,11 +2914,23 @@ static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp) BDRV_SECTOR_SIZE); backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE); backing_fmt = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FMT); - if (qemu_opt_get_bool_del(opts, BLOCK_OPT_ENCRYPT, false)) { - flags |= BLOCK_FLAG_ENCRYPT; + encryptfmt = qemu_opt_get_del(opts, BLOCK_OPT_ENCRYPT_FORMAT); + if (encryptfmt) { + if (qemu_opt_get_del(opts, BLOCK_OPT_ENCRYPT)) { + error_setg(errp, "Options " BLOCK_OPT_ENCRYPT " and " + BLOCK_OPT_ENCRYPT_FORMAT " are mutually exclusive"); + ret = -EINVAL; + goto finish; + } + } else if (qemu_opt_get_bool_del(opts, BLOCK_OPT_ENCRYPT, false)) { + encryptfmt = "aes"; + } + cluster_size = qcow2_opt_get_cluster_size_del(opts, &local_err); + if (local_err) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto finish; } - cluster_size = qemu_opt_get_size_del(opts, BLOCK_OPT_CLUSTER_SIZE, - DEFAULT_CLUSTER_SIZE); buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC); prealloc = qapi_enum_parse(PreallocMode_lookup, buf, PREALLOC_MODE__MAX, PREALLOC_MODE_OFF, @@ -2424,16 +2940,10 @@ static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp) ret = -EINVAL; goto finish; } - g_free(buf); - buf = qemu_opt_get_del(opts, BLOCK_OPT_COMPAT_LEVEL); - if (!buf) { - /* keep the default */ - } else if (!strcmp(buf, "0.10")) { - version = 2; - } else if (!strcmp(buf, "1.1")) { - version = 3; - } else { - error_setg(errp, "Invalid compatibility level: '%s'", buf); + + version = qcow2_opt_get_version_del(opts, &local_err); + if (local_err) { + error_propagate(errp, local_err); ret = -EINVAL; goto finish; } @@ -2456,19 +2966,9 @@ static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp) goto finish; } - refcount_bits = qemu_opt_get_number_del(opts, BLOCK_OPT_REFCOUNT_BITS, - refcount_bits); - if (refcount_bits > 64 || !is_power_of_2(refcount_bits)) { - error_setg(errp, "Refcount width must be a power of two and may not " - "exceed 64 bits"); - ret = -EINVAL; - goto finish; - } - - if (version < 3 && refcount_bits != 16) { - error_setg(errp, "Different refcount widths than 16 bits require " - "compatibility level 1.1 or above (use compat=1.1 or " - "greater)"); + refcount_bits = qcow2_opt_get_refcount_bits_del(opts, version, &local_err); + if (local_err) { + error_propagate(errp, local_err); ret = -EINVAL; goto finish; } @@ -2477,7 +2977,7 @@ static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp) ret = qcow2_create2(filename, size, backing_file, backing_fmt, flags, cluster_size, prealloc, opts, version, refcount_order, - &local_err); + encryptfmt, &local_err); error_propagate(errp, local_err); finish: @@ -2585,12 +3085,22 @@ static coroutine_fn int qcow2_co_pdiscard(BlockDriverState *bs, return ret; } -static int qcow2_truncate(BlockDriverState *bs, int64_t offset, Error **errp) +static int qcow2_truncate(BlockDriverState *bs, int64_t offset, + PreallocMode prealloc, Error **errp) { BDRVQcow2State *s = bs->opaque; + uint64_t old_length; int64_t new_l1_size; int ret; + if (prealloc != PREALLOC_MODE_OFF && prealloc != PREALLOC_MODE_METADATA && + prealloc != PREALLOC_MODE_FALLOC && prealloc != PREALLOC_MODE_FULL) + { + error_setg(errp, "Unsupported preallocation mode '%s'", + PreallocMode_lookup[prealloc]); + return -ENOTSUP; + } + if (offset & 511) { error_setg(errp, "The new size must be a multiple of 512"); return -EINVAL; @@ -2602,8 +3112,17 @@ static int qcow2_truncate(BlockDriverState *bs, int64_t offset, Error **errp) return -ENOTSUP; } + /* cannot proceed if image has bitmaps */ + if (s->nb_bitmaps) { + /* TODO: resize bitmaps in the image */ + error_setg(errp, "Can't resize an image which has bitmaps"); + return -ENOTSUP; + } + + old_length = bs->total_sectors * 512; + /* shrinking is currently not supported */ - if (offset < bs->total_sectors * 512) { + if (offset < old_length) { error_setg(errp, "qcow2 doesn't support shrinking images yet"); return -ENOTSUP; } @@ -2615,6 +3134,128 @@ static int qcow2_truncate(BlockDriverState *bs, int64_t offset, Error **errp) return ret; } + switch (prealloc) { + case PREALLOC_MODE_OFF: + break; + + case PREALLOC_MODE_METADATA: + ret = preallocate(bs, old_length, offset); + if (ret < 0) { + error_setg_errno(errp, -ret, "Preallocation failed"); + return ret; + } + break; + + case PREALLOC_MODE_FALLOC: + case PREALLOC_MODE_FULL: + { + int64_t allocation_start, host_offset, guest_offset; + int64_t clusters_allocated; + int64_t old_file_size, new_file_size; + uint64_t nb_new_data_clusters, nb_new_l2_tables; + + old_file_size = bdrv_getlength(bs->file->bs); + if (old_file_size < 0) { + error_setg_errno(errp, -old_file_size, + "Failed to inquire current file length"); + return ret; + } + + nb_new_data_clusters = DIV_ROUND_UP(offset - old_length, + s->cluster_size); + + /* This is an overestimation; we will not actually allocate space for + * these in the file but just make sure the new refcount structures are + * able to cover them so we will not have to allocate new refblocks + * while entering the data blocks in the potentially new L2 tables. + * (We do not actually care where the L2 tables are placed. Maybe they + * are already allocated or they can be placed somewhere before + * @old_file_size. It does not matter because they will be fully + * allocated automatically, so they do not need to be covered by the + * preallocation. All that matters is that we will not have to allocate + * new refcount structures for them.) */ + nb_new_l2_tables = DIV_ROUND_UP(nb_new_data_clusters, + s->cluster_size / sizeof(uint64_t)); + /* The cluster range may not be aligned to L2 boundaries, so add one L2 + * table for a potential head/tail */ + nb_new_l2_tables++; + + allocation_start = qcow2_refcount_area(bs, old_file_size, + nb_new_data_clusters + + nb_new_l2_tables, + true, 0, 0); + if (allocation_start < 0) { + error_setg_errno(errp, -allocation_start, + "Failed to resize refcount structures"); + return -allocation_start; + } + + clusters_allocated = qcow2_alloc_clusters_at(bs, allocation_start, + nb_new_data_clusters); + if (clusters_allocated < 0) { + error_setg_errno(errp, -clusters_allocated, + "Failed to allocate data clusters"); + return -clusters_allocated; + } + + assert(clusters_allocated == nb_new_data_clusters); + + /* Allocate the data area */ + new_file_size = allocation_start + + nb_new_data_clusters * s->cluster_size; + ret = bdrv_truncate(bs->file, new_file_size, prealloc, errp); + if (ret < 0) { + error_prepend(errp, "Failed to resize underlying file: "); + qcow2_free_clusters(bs, allocation_start, + nb_new_data_clusters * s->cluster_size, + QCOW2_DISCARD_OTHER); + return ret; + } + + /* Create the necessary L2 entries */ + host_offset = allocation_start; + guest_offset = old_length; + while (nb_new_data_clusters) { + int64_t guest_cluster = guest_offset >> s->cluster_bits; + int64_t nb_clusters = MIN(nb_new_data_clusters, + s->l2_size - guest_cluster % s->l2_size); + QCowL2Meta allocation = { + .offset = guest_offset, + .alloc_offset = host_offset, + .nb_clusters = nb_clusters, + }; + qemu_co_queue_init(&allocation.dependent_requests); + + ret = qcow2_alloc_cluster_link_l2(bs, &allocation); + if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to update L2 tables"); + qcow2_free_clusters(bs, host_offset, + nb_new_data_clusters * s->cluster_size, + QCOW2_DISCARD_OTHER); + return ret; + } + + guest_offset += nb_clusters * s->cluster_size; + host_offset += nb_clusters * s->cluster_size; + nb_new_data_clusters -= nb_clusters; + } + break; + } + + default: + g_assert_not_reached(); + } + + if (prealloc != PREALLOC_MODE_OFF) { + /* Flush metadata before actually changing the image size */ + ret = bdrv_flush(bs); + if (ret < 0) { + error_setg_errno(errp, -ret, + "Failed to flush the preallocated area to disk"); + return ret; + } + } + /* write updated header.size */ offset = cpu_to_be64(offset); ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, size), @@ -2646,7 +3287,7 @@ qcow2_co_pwritev_compressed(BlockDriverState *bs, uint64_t offset, /* align end of file to a sector boundary to ease reading with sector based I/Os */ cluster_offset = bdrv_getlength(bs->file->bs); - return bdrv_truncate(bs->file, cluster_offset, NULL); + return bdrv_truncate(bs->file, cluster_offset, PREALLOC_MODE_OFF, NULL); } buf = qemu_blockalign(bs, s->cluster_size); @@ -2862,7 +3503,7 @@ static int make_completely_empty(BlockDriverState *bs) } ret = bdrv_truncate(bs->file, (3 + l1_clusters) * s->cluster_size, - &local_err); + PREALLOC_MODE_OFF, &local_err); if (ret < 0) { error_report_err(local_err); goto fail; @@ -2946,6 +3587,142 @@ static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs) return 0; } +static BlockMeasureInfo *qcow2_measure(QemuOpts *opts, BlockDriverState *in_bs, + Error **errp) +{ + Error *local_err = NULL; + BlockMeasureInfo *info; + uint64_t required = 0; /* bytes that contribute to required size */ + uint64_t virtual_size; /* disk size as seen by guest */ + uint64_t refcount_bits; + uint64_t l2_tables; + size_t cluster_size; + int version; + char *optstr; + PreallocMode prealloc; + bool has_backing_file; + + /* Parse image creation options */ + cluster_size = qcow2_opt_get_cluster_size_del(opts, &local_err); + if (local_err) { + goto err; + } + + version = qcow2_opt_get_version_del(opts, &local_err); + if (local_err) { + goto err; + } + + refcount_bits = qcow2_opt_get_refcount_bits_del(opts, version, &local_err); + if (local_err) { + goto err; + } + + optstr = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC); + prealloc = qapi_enum_parse(PreallocMode_lookup, optstr, + PREALLOC_MODE__MAX, PREALLOC_MODE_OFF, + &local_err); + g_free(optstr); + if (local_err) { + goto err; + } + + optstr = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE); + has_backing_file = !!optstr; + g_free(optstr); + + virtual_size = align_offset(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), + cluster_size); + + /* Check that virtual disk size is valid */ + l2_tables = DIV_ROUND_UP(virtual_size / cluster_size, + cluster_size / sizeof(uint64_t)); + if (l2_tables * sizeof(uint64_t) > QCOW_MAX_L1_SIZE) { + error_setg(&local_err, "The image size is too large " + "(try using a larger cluster size)"); + goto err; + } + + /* Account for input image */ + if (in_bs) { + int64_t ssize = bdrv_getlength(in_bs); + if (ssize < 0) { + error_setg_errno(&local_err, -ssize, + "Unable to get image virtual_size"); + goto err; + } + + virtual_size = align_offset(ssize, cluster_size); + + if (has_backing_file) { + /* We don't how much of the backing chain is shared by the input + * image and the new image file. In the worst case the new image's + * backing file has nothing in common with the input image. Be + * conservative and assume all clusters need to be written. + */ + required = virtual_size; + } else { + int cluster_sectors = cluster_size / BDRV_SECTOR_SIZE; + int64_t sector_num; + int pnum = 0; + + for (sector_num = 0; + sector_num < ssize / BDRV_SECTOR_SIZE; + sector_num += pnum) { + int nb_sectors = MAX(ssize / BDRV_SECTOR_SIZE - sector_num, + INT_MAX); + BlockDriverState *file; + int64_t ret; + + ret = bdrv_get_block_status_above(in_bs, NULL, + sector_num, nb_sectors, + &pnum, &file); + if (ret < 0) { + error_setg_errno(&local_err, -ret, + "Unable to get block status"); + goto err; + } + + if (ret & BDRV_BLOCK_ZERO) { + /* Skip zero regions (safe with no backing file) */ + } else if ((ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED)) == + (BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED)) { + /* Extend pnum to end of cluster for next iteration */ + pnum = ROUND_UP(sector_num + pnum, cluster_sectors) - + sector_num; + + /* Count clusters we've seen */ + required += (sector_num % cluster_sectors + pnum) * + BDRV_SECTOR_SIZE; + } + } + } + } + + /* Take into account preallocation. Nothing special is needed for + * PREALLOC_MODE_METADATA since metadata is always counted. + */ + if (prealloc == PREALLOC_MODE_FULL || prealloc == PREALLOC_MODE_FALLOC) { + required = virtual_size; + } + + info = g_new(BlockMeasureInfo, 1); + info->fully_allocated = + qcow2_calc_prealloc_size(virtual_size, cluster_size, + ctz32(refcount_bits)); + + /* Remove data clusters that are not required. This overestimates the + * required size because metadata needed for the fully allocated file is + * still counted. + */ + info->required = info->fully_allocated - virtual_size + required; + return info; + +err: + error_propagate(errp, local_err); + return NULL; +} + static int qcow2_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) { BDRVQcow2State *s = bs->opaque; @@ -2959,8 +3736,14 @@ static int qcow2_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) static ImageInfoSpecific *qcow2_get_specific_info(BlockDriverState *bs) { BDRVQcow2State *s = bs->opaque; - ImageInfoSpecific *spec_info = g_new(ImageInfoSpecific, 1); + ImageInfoSpecific *spec_info; + QCryptoBlockInfo *encrypt_info = NULL; + if (s->crypto != NULL) { + encrypt_info = qcrypto_block_get_info(s->crypto, &error_abort); + } + + spec_info = g_new(ImageInfoSpecific, 1); *spec_info = (ImageInfoSpecific){ .type = IMAGE_INFO_SPECIFIC_KIND_QCOW2, .u.qcow2.data = g_new(ImageInfoSpecificQCow2, 1), @@ -2987,6 +3770,30 @@ static ImageInfoSpecific *qcow2_get_specific_info(BlockDriverState *bs) assert(false); } + if (encrypt_info) { + ImageInfoSpecificQCow2Encryption *qencrypt = + g_new(ImageInfoSpecificQCow2Encryption, 1); + switch (encrypt_info->format) { + case Q_CRYPTO_BLOCK_FORMAT_QCOW: + qencrypt->format = BLOCKDEV_QCOW2_ENCRYPTION_FORMAT_AES; + qencrypt->u.aes = encrypt_info->u.qcow; + break; + case Q_CRYPTO_BLOCK_FORMAT_LUKS: + qencrypt->format = BLOCKDEV_QCOW2_ENCRYPTION_FORMAT_LUKS; + qencrypt->u.luks = encrypt_info->u.luks; + break; + default: + abort(); + } + /* Since we did shallow copy above, erase any pointers + * in the original info */ + memset(&encrypt_info->u, 0, sizeof(encrypt_info->u)); + qapi_free_QCryptoBlockInfo(encrypt_info); + + spec_info->u.qcow2.data->has_encrypt = true; + spec_info->u.qcow2.data->encrypt = qencrypt; + } + return spec_info; } @@ -3175,6 +3982,7 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts, const char *compat = NULL; uint64_t cluster_size = s->cluster_size; bool encrypt; + int encformat; int refcount_bits = s->refcount_bits; Error *local_err = NULL; int ret; @@ -3211,12 +4019,20 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts, backing_format = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT); } else if (!strcmp(desc->name, BLOCK_OPT_ENCRYPT)) { encrypt = qemu_opt_get_bool(opts, BLOCK_OPT_ENCRYPT, - !!s->cipher); + !!s->crypto); - if (encrypt != !!s->cipher) { + if (encrypt != !!s->crypto) { error_report("Changing the encryption flag is not supported"); return -ENOTSUP; } + } else if (!strcmp(desc->name, BLOCK_OPT_ENCRYPT_FORMAT)) { + encformat = qcow2_crypt_method_from_format( + qemu_opt_get(opts, BLOCK_OPT_ENCRYPT_FORMAT)); + + if (encformat != s->crypt_method_header) { + error_report("Changing the encryption format is not supported"); + return -ENOTSUP; + } } else if (!strcmp(desc->name, BLOCK_OPT_CLUSTER_SIZE)) { cluster_size = qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, cluster_size); @@ -3333,7 +4149,7 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts, return ret; } - ret = blk_truncate(blk, new_size, &local_err); + ret = blk_truncate(blk, new_size, PREALLOC_MODE_OFF, &local_err); blk_unref(blk); if (ret < 0) { error_report_err(local_err); @@ -3431,10 +4247,23 @@ static QemuOptsList qcow2_create_opts = { { .name = BLOCK_OPT_ENCRYPT, .type = QEMU_OPT_BOOL, - .help = "Encrypt the image", - .def_value_str = "off" + .help = "Encrypt the image with format 'aes'. (Deprecated " + "in favor of " BLOCK_OPT_ENCRYPT_FORMAT "=aes)", }, { + .name = BLOCK_OPT_ENCRYPT_FORMAT, + .type = QEMU_OPT_STRING, + .help = "Encrypt the image, format choices: 'aes', 'luks'", + }, + BLOCK_CRYPTO_OPT_DEF_KEY_SECRET("encrypt.", + "ID of secret providing qcow AES key or LUKS passphrase"), + BLOCK_CRYPTO_OPT_DEF_LUKS_CIPHER_ALG("encrypt."), + BLOCK_CRYPTO_OPT_DEF_LUKS_CIPHER_MODE("encrypt."), + BLOCK_CRYPTO_OPT_DEF_LUKS_IVGEN_ALG("encrypt."), + BLOCK_CRYPTO_OPT_DEF_LUKS_IVGEN_HASH_ALG("encrypt."), + BLOCK_CRYPTO_OPT_DEF_LUKS_HASH_ALG("encrypt."), + BLOCK_CRYPTO_OPT_DEF_LUKS_ITER_TIME("encrypt."), + { .name = BLOCK_OPT_CLUSTER_SIZE, .type = QEMU_OPT_SIZE, .help = "qcow2 cluster size", @@ -3476,7 +4305,6 @@ BlockDriver bdrv_qcow2 = { .bdrv_create = qcow2_create, .bdrv_has_zero_init = bdrv_has_zero_init_1, .bdrv_co_get_block_status = qcow2_co_get_block_status, - .bdrv_set_key = qcow2_set_key, .bdrv_co_preadv = qcow2_co_preadv, .bdrv_co_pwritev = qcow2_co_pwritev, @@ -3493,6 +4321,7 @@ BlockDriver bdrv_qcow2 = { .bdrv_snapshot_delete = qcow2_snapshot_delete, .bdrv_snapshot_list = qcow2_snapshot_list, .bdrv_snapshot_load_tmp = qcow2_snapshot_load_tmp, + .bdrv_measure = qcow2_measure, .bdrv_get_info = qcow2_get_info, .bdrv_get_specific_info = qcow2_get_specific_info, @@ -3512,6 +4341,10 @@ BlockDriver bdrv_qcow2 = { .bdrv_detach_aio_context = qcow2_detach_aio_context, .bdrv_attach_aio_context = qcow2_attach_aio_context, + + .bdrv_reopen_bitmaps_rw = qcow2_reopen_bitmaps_rw, + .bdrv_can_store_new_dirty_bitmap = qcow2_can_store_new_dirty_bitmap, + .bdrv_remove_persistent_dirty_bitmap = qcow2_remove_persistent_dirty_bitmap, }; static void bdrv_qcow2_init(void) diff --git a/block/qcow2.h b/block/qcow2.h index 87b15eb4aa..96a8d43c17 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -25,7 +25,7 @@ #ifndef BLOCK_QCOW2_H #define BLOCK_QCOW2_H -#include "crypto/cipher.h" +#include "crypto/block.h" #include "qemu/coroutine.h" //#define DEBUG_ALLOC @@ -36,6 +36,7 @@ #define QCOW_CRYPT_NONE 0 #define QCOW_CRYPT_AES 1 +#define QCOW_CRYPT_LUKS 2 #define QCOW_MAX_CRYPT_CLUSTERS 32 #define QCOW_MAX_SNAPSHOTS 65536 @@ -52,6 +53,10 @@ * space for snapshot names and IDs */ #define QCOW_MAX_SNAPSHOTS_SIZE (1024 * QCOW_MAX_SNAPSHOTS) +/* Bitmap header extension constraints */ +#define QCOW2_MAX_BITMAPS 65535 +#define QCOW2_MAX_BITMAP_DIRECTORY_SIZE (1024 * QCOW2_MAX_BITMAPS) + /* indicate that the refcount of the referenced cluster is exactly one. */ #define QCOW_OFLAG_COPIED (1ULL << 63) /* indicate that the cluster is compressed (they never have the copied flag) */ @@ -163,6 +168,11 @@ typedef struct QCowSnapshot { struct Qcow2Cache; typedef struct Qcow2Cache Qcow2Cache; +typedef struct Qcow2CryptoHeaderExtension { + uint64_t offset; + uint64_t length; +} QEMU_PACKED Qcow2CryptoHeaderExtension; + typedef struct Qcow2UnknownHeaderExtension { uint32_t magic; uint32_t len; @@ -195,6 +205,14 @@ enum { QCOW2_COMPAT_FEAT_MASK = QCOW2_COMPAT_LAZY_REFCOUNTS, }; +/* Autoclear feature bits */ +enum { + QCOW2_AUTOCLEAR_BITMAPS_BITNR = 0, + QCOW2_AUTOCLEAR_BITMAPS = 1 << QCOW2_AUTOCLEAR_BITMAPS_BITNR, + + QCOW2_AUTOCLEAR_MASK = QCOW2_AUTOCLEAR_BITMAPS, +}; + enum qcow2_discard_type { QCOW2_DISCARD_NEVER = 0, QCOW2_DISCARD_ALWAYS, @@ -222,6 +240,13 @@ typedef uint64_t Qcow2GetRefcountFunc(const void *refcount_array, typedef void Qcow2SetRefcountFunc(void *refcount_array, uint64_t index, uint64_t value); +typedef struct Qcow2BitmapHeaderExt { + uint32_t nb_bitmaps; + uint32_t reserved32; + uint64_t bitmap_directory_size; + uint64_t bitmap_directory_offset; +} QEMU_PACKED Qcow2BitmapHeaderExt; + typedef struct BDRVQcow2State { int cluster_bits; int cluster_size; @@ -257,13 +282,21 @@ typedef struct BDRVQcow2State { CoMutex lock; - QCryptoCipher *cipher; /* current cipher, NULL if no key yet */ + Qcow2CryptoHeaderExtension crypto_header; /* QCow2 header extension */ + QCryptoBlockOpenOptions *crypto_opts; /* Disk encryption runtime options */ + QCryptoBlock *crypto; /* Disk encryption format driver */ + bool crypt_physical_offset; /* Whether to use virtual or physical offset + for encryption initialization vector tweak */ uint32_t crypt_method_header; uint64_t snapshots_offset; int snapshots_size; unsigned int nb_snapshots; QCowSnapshot *snapshots; + uint32_t nb_bitmaps; + uint64_t bitmap_directory_size; + uint64_t bitmap_directory_offset; + int flags; int qcow_version; bool use_lazy_refcounts; @@ -492,6 +525,10 @@ static inline uint64_t refcount_diff(uint64_t r1, uint64_t r2) int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov, int64_t sector_num, int nb_sectors); +int64_t qcow2_refcount_metadata_size(int64_t clusters, size_t cluster_size, + int refcount_order, bool generous_increase, + uint64_t *refblock_count); + int qcow2_mark_dirty(BlockDriverState *bs); int qcow2_mark_corrupt(BlockDriverState *bs); int qcow2_mark_consistent(BlockDriverState *bs); @@ -512,6 +549,11 @@ int qcow2_update_cluster_refcount(BlockDriverState *bs, int64_t cluster_index, uint64_t addend, bool decrease, enum qcow2_discard_type type); +int64_t qcow2_refcount_area(BlockDriverState *bs, uint64_t offset, + uint64_t additional_clusters, bool exact_size, + int new_refblock_index, + uint64_t new_refblock_offset); + int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size); int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, int64_t nb_clusters); @@ -534,6 +576,10 @@ int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset, int64_t size); int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset, int64_t size); +int qcow2_inc_refcounts_imrt(BlockDriverState *bs, BdrvCheckResult *res, + void **refcount_table, + int64_t *refcount_table_size, + int64_t offset, int64_t size); int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order, BlockDriverAmendStatusCB *status_cb, @@ -545,8 +591,7 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index); int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset); int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num, - uint8_t *out_buf, const uint8_t *in_buf, - int nb_sectors, bool enc, Error **errp); + uint8_t *buf, int nb_sectors, bool enc, Error **errp); int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, unsigned int *bytes, uint64_t *cluster_offset); @@ -605,4 +650,20 @@ int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, void **table); void qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table); +/* qcow2-bitmap.c functions */ +int qcow2_check_bitmaps_refcounts(BlockDriverState *bs, BdrvCheckResult *res, + void **refcount_table, + int64_t *refcount_table_size); +bool qcow2_load_autoloading_dirty_bitmaps(BlockDriverState *bs, Error **errp); +int qcow2_reopen_bitmaps_rw(BlockDriverState *bs, Error **errp); +void qcow2_store_persistent_dirty_bitmaps(BlockDriverState *bs, Error **errp); +int qcow2_reopen_bitmaps_ro(BlockDriverState *bs, Error **errp); +bool qcow2_can_store_new_dirty_bitmap(BlockDriverState *bs, + const char *name, + uint32_t granularity, + Error **errp); +void qcow2_remove_persistent_dirty_bitmap(BlockDriverState *bs, + const char *name, + Error **errp); + #endif diff --git a/block/qed-cluster.c b/block/qed-cluster.c index d8d6e66a0f..672e2e654b 100644 --- a/block/qed-cluster.c +++ b/block/qed-cluster.c @@ -85,6 +85,8 @@ static unsigned int qed_count_contiguous_clusters(BDRVQEDState *s, * * On failure QED_CLUSTER_L2 or QED_CLUSTER_L1 is returned for missing L2 or L1 * table offset, respectively. len is number of contiguous unallocated bytes. + * + * Called with table_lock held. */ int coroutine_fn qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos, size_t *len, @@ -112,7 +114,6 @@ int coroutine_fn qed_find_cluster(BDRVQEDState *s, QEDRequest *request, } ret = qed_read_l2_table(s, request, l2_offset); - qed_acquire(s); if (ret) { goto out; } @@ -137,6 +138,5 @@ int coroutine_fn qed_find_cluster(BDRVQEDState *s, QEDRequest *request, out: *img_offset = offset; - qed_release(s); return ret; } diff --git a/block/qed-l2-cache.c b/block/qed-l2-cache.c index 5cba794650..b548362398 100644 --- a/block/qed-l2-cache.c +++ b/block/qed-l2-cache.c @@ -101,6 +101,8 @@ CachedL2Table *qed_alloc_l2_cache_entry(L2TableCache *l2_cache) /** * Decrease an entry's reference count and free if necessary when the reference * count drops to zero. + * + * Called with table_lock held. */ void qed_unref_l2_cache_entry(CachedL2Table *entry) { @@ -122,6 +124,8 @@ void qed_unref_l2_cache_entry(CachedL2Table *entry) * * For a cached entry, this function increases the reference count and returns * the entry. + * + * Called with table_lock held. */ CachedL2Table *qed_find_l2_cache_entry(L2TableCache *l2_cache, uint64_t offset) { @@ -150,6 +154,8 @@ CachedL2Table *qed_find_l2_cache_entry(L2TableCache *l2_cache, uint64_t offset) * N.B. This function steals a reference to the l2_table from the caller so the * caller must obtain a new reference by issuing a call to * qed_find_l2_cache_entry(). + * + * Called with table_lock held. */ void qed_commit_l2_cache_entry(L2TableCache *l2_cache, CachedL2Table *l2_table) { diff --git a/block/qed-table.c b/block/qed-table.c index ebee2c50f0..eead8b0fc7 100644 --- a/block/qed-table.c +++ b/block/qed-table.c @@ -18,6 +18,7 @@ #include "qed.h" #include "qemu/bswap.h" +/* Called either from qed_check or with table_lock held. */ static int qed_read_table(BDRVQEDState *s, uint64_t offset, QEDTable *table) { QEMUIOVector qiov; @@ -32,18 +33,22 @@ static int qed_read_table(BDRVQEDState *s, uint64_t offset, QEDTable *table) trace_qed_read_table(s, offset, table); + if (qemu_in_coroutine()) { + qemu_co_mutex_unlock(&s->table_lock); + } ret = bdrv_preadv(s->bs->file, offset, &qiov); + if (qemu_in_coroutine()) { + qemu_co_mutex_lock(&s->table_lock); + } if (ret < 0) { goto out; } /* Byteswap offsets */ - qed_acquire(s); noffsets = qiov.size / sizeof(uint64_t); for (i = 0; i < noffsets; i++) { table->offsets[i] = le64_to_cpu(table->offsets[i]); } - qed_release(s); ret = 0; out: @@ -61,6 +66,8 @@ out: * @index: Index of first element * @n: Number of elements * @flush: Whether or not to sync to disk + * + * Called either from qed_check or with table_lock held. */ static int qed_write_table(BDRVQEDState *s, uint64_t offset, QEDTable *table, unsigned int index, unsigned int n, bool flush) @@ -97,16 +104,20 @@ static int qed_write_table(BDRVQEDState *s, uint64_t offset, QEDTable *table, /* Adjust for offset into table */ offset += start * sizeof(uint64_t); + if (qemu_in_coroutine()) { + qemu_co_mutex_unlock(&s->table_lock); + } ret = bdrv_pwritev(s->bs->file, offset, &qiov); + if (qemu_in_coroutine()) { + qemu_co_mutex_lock(&s->table_lock); + } trace_qed_write_table_cb(s, table, flush, ret); if (ret < 0) { goto out; } if (flush) { - qed_acquire(s); ret = bdrv_flush(s->bs); - qed_release(s); if (ret < 0) { goto out; } @@ -123,6 +134,7 @@ int qed_read_l1_table_sync(BDRVQEDState *s) return qed_read_table(s, s->header.l1_table_offset, s->l1_table); } +/* Called either from qed_check or with table_lock held. */ int qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n) { BLKDBG_EVENT(s->bs->file, BLKDBG_L1_UPDATE); @@ -136,6 +148,7 @@ int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index, return qed_write_l1_table(s, index, n); } +/* Called either from qed_check or with table_lock held. */ int qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset) { int ret; @@ -154,7 +167,6 @@ int qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset) BLKDBG_EVENT(s->bs->file, BLKDBG_L2_LOAD); ret = qed_read_table(s, offset, request->l2_table->table); - qed_acquire(s); if (ret) { /* can't trust loaded L2 table anymore */ qed_unref_l2_cache_entry(request->l2_table); @@ -170,7 +182,6 @@ int qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset) request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, offset); assert(request->l2_table != NULL); } - qed_release(s); return ret; } @@ -180,6 +191,7 @@ int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, uint64_t offset return qed_read_l2_table(s, request, offset); } +/* Called either from qed_check or with table_lock held. */ int qed_write_l2_table(BDRVQEDState *s, QEDRequest *request, unsigned int index, unsigned int n, bool flush) { diff --git a/block/qed.c b/block/qed.c index 385381a78a..dc54bf4a93 100644 --- a/block/qed.c +++ b/block/qed.c @@ -93,6 +93,8 @@ int qed_write_header_sync(BDRVQEDState *s) * * This function only updates known header fields in-place and does not affect * extra data after the QED header. + * + * No new allocating reqs can start while this function runs. */ static int coroutine_fn qed_write_header(BDRVQEDState *s) { @@ -109,6 +111,8 @@ static int coroutine_fn qed_write_header(BDRVQEDState *s) QEMUIOVector qiov; int ret; + assert(s->allocating_acb || s->allocating_write_reqs_plugged); + buf = qemu_blockalign(s->bs, len); iov = (struct iovec) { .iov_base = buf, @@ -219,6 +223,8 @@ static int qed_read_string(BdrvChild *file, uint64_t offset, size_t n, * This function only produces the offset where the new clusters should be * written. It updates BDRVQEDState but does not make any changes to the image * file. + * + * Called with table_lock held. */ static uint64_t qed_alloc_clusters(BDRVQEDState *s, unsigned int n) { @@ -236,6 +242,8 @@ QEDTable *qed_alloc_table(BDRVQEDState *s) /** * Allocate a new zeroed L2 table + * + * Called with table_lock held. */ static CachedL2Table *qed_new_l2_table(BDRVQEDState *s) { @@ -249,19 +257,32 @@ static CachedL2Table *qed_new_l2_table(BDRVQEDState *s) return l2_table; } -static void qed_plug_allocating_write_reqs(BDRVQEDState *s) +static bool qed_plug_allocating_write_reqs(BDRVQEDState *s) { + qemu_co_mutex_lock(&s->table_lock); + + /* No reentrancy is allowed. */ assert(!s->allocating_write_reqs_plugged); + if (s->allocating_acb != NULL) { + /* Another allocating write came concurrently. This cannot happen + * from bdrv_qed_co_drain, but it can happen when the timer runs. + */ + qemu_co_mutex_unlock(&s->table_lock); + return false; + } s->allocating_write_reqs_plugged = true; + qemu_co_mutex_unlock(&s->table_lock); + return true; } static void qed_unplug_allocating_write_reqs(BDRVQEDState *s) { + qemu_co_mutex_lock(&s->table_lock); assert(s->allocating_write_reqs_plugged); - s->allocating_write_reqs_plugged = false; - qemu_co_enter_next(&s->allocating_write_reqs); + qemu_co_queue_next(&s->allocating_write_reqs); + qemu_co_mutex_unlock(&s->table_lock); } static void coroutine_fn qed_need_check_timer_entry(void *opaque) @@ -269,17 +290,14 @@ static void coroutine_fn qed_need_check_timer_entry(void *opaque) BDRVQEDState *s = opaque; int ret; - /* The timer should only fire when allocating writes have drained */ - assert(!s->allocating_acb); - trace_qed_need_check_timer_cb(s); - qed_acquire(s); - qed_plug_allocating_write_reqs(s); + if (!qed_plug_allocating_write_reqs(s)) { + return; + } /* Ensure writes are on disk before clearing flag */ ret = bdrv_co_flush(s->bs->file->bs); - qed_release(s); if (ret < 0) { qed_unplug_allocating_write_reqs(s); return; @@ -301,16 +319,6 @@ static void qed_need_check_timer_cb(void *opaque) qemu_coroutine_enter(co); } -void qed_acquire(BDRVQEDState *s) -{ - aio_context_acquire(bdrv_get_aio_context(s->bs)); -} - -void qed_release(BDRVQEDState *s) -{ - aio_context_release(bdrv_get_aio_context(s->bs)); -} - static void qed_start_need_check_timer(BDRVQEDState *s) { trace_qed_start_need_check_timer(s); @@ -350,7 +358,7 @@ static void bdrv_qed_attach_aio_context(BlockDriverState *bs, } } -static void bdrv_qed_drain(BlockDriverState *bs) +static void coroutine_fn bdrv_qed_co_drain(BlockDriverState *bs) { BDRVQEDState *s = bs->opaque; @@ -359,10 +367,20 @@ static void bdrv_qed_drain(BlockDriverState *bs) */ if (s->need_check_timer && timer_pending(s->need_check_timer)) { qed_cancel_need_check_timer(s); - qed_need_check_timer_cb(s); + qed_need_check_timer_entry(s); } } +static void bdrv_qed_init_state(BlockDriverState *bs) +{ + BDRVQEDState *s = bs->opaque; + + memset(s, 0, sizeof(BDRVQEDState)); + s->bs = bs; + qemu_co_mutex_init(&s->table_lock); + qemu_co_queue_init(&s->allocating_write_reqs); +} + static int bdrv_qed_do_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { @@ -371,9 +389,6 @@ static int bdrv_qed_do_open(BlockDriverState *bs, QDict *options, int flags, int64_t file_size; int ret; - s->bs = bs; - qemu_co_queue_init(&s->allocating_write_reqs); - ret = bdrv_pread(bs->file, 0, &le_header, sizeof(le_header)); if (ret < 0) { return ret; @@ -507,6 +522,7 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags, return -EINVAL; } + bdrv_qed_init_state(bs); return bdrv_qed_do_open(bs, options, flags, errp); } @@ -583,7 +599,7 @@ static int qed_create(const char *filename, uint32_t cluster_size, blk_set_allow_write_beyond_eof(blk, true); /* File must start empty and grow, check truncate is supported */ - ret = blk_truncate(blk, 0, errp); + ret = blk_truncate(blk, 0, PREALLOC_MODE_OFF, errp); if (ret < 0) { goto out; } @@ -681,6 +697,7 @@ typedef struct { BlockDriverState **file; } QEDIsAllocatedCB; +/* Called with table_lock held. */ static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t len) { QEDIsAllocatedCB *cb = opaque; @@ -728,6 +745,7 @@ static int64_t coroutine_fn bdrv_qed_co_get_block_status(BlockDriverState *bs, uint64_t offset; int ret; + qemu_co_mutex_lock(&s->table_lock); ret = qed_find_cluster(s, &request, cb.pos, &len, &offset); qed_is_allocated_cb(&cb, ret, offset, len); @@ -735,6 +753,7 @@ static int64_t coroutine_fn bdrv_qed_co_get_block_status(BlockDriverState *bs, assert(cb.status != BDRV_BLOCK_OFFSET_MASK); qed_unref_l2_cache_entry(request.l2_table); + qemu_co_mutex_unlock(&s->table_lock); return cb.status; } @@ -865,6 +884,8 @@ out: * * The cluster offset may be an allocated byte offset in the image file, the * zero cluster marker, or the unallocated cluster marker. + * + * Called with table_lock held. */ static void coroutine_fn qed_update_l2_table(BDRVQEDState *s, QEDTable *table, int index, unsigned int n, @@ -880,6 +901,7 @@ static void coroutine_fn qed_update_l2_table(BDRVQEDState *s, QEDTable *table, } } +/* Called with table_lock held. */ static void coroutine_fn qed_aio_complete(QEDAIOCB *acb) { BDRVQEDState *s = acb_to_s(acb); @@ -903,7 +925,7 @@ static void coroutine_fn qed_aio_complete(QEDAIOCB *acb) if (acb == s->allocating_acb) { s->allocating_acb = NULL; if (!qemu_co_queue_empty(&s->allocating_write_reqs)) { - qemu_co_enter_next(&s->allocating_write_reqs); + qemu_co_queue_next(&s->allocating_write_reqs); } else if (s->header.features & QED_F_NEED_CHECK) { qed_start_need_check_timer(s); } @@ -912,6 +934,8 @@ static void coroutine_fn qed_aio_complete(QEDAIOCB *acb) /** * Update L1 table with new L2 table offset and write it out + * + * Called with table_lock held. */ static int coroutine_fn qed_aio_write_l1_update(QEDAIOCB *acb) { @@ -940,6 +964,8 @@ static int coroutine_fn qed_aio_write_l1_update(QEDAIOCB *acb) /** * Update L2 table with new cluster offsets and write them out + * + * Called with table_lock held. */ static int coroutine_fn qed_aio_write_l2_update(QEDAIOCB *acb, uint64_t offset) { @@ -976,50 +1002,26 @@ static int coroutine_fn qed_aio_write_l2_update(QEDAIOCB *acb, uint64_t offset) /** * Write data to the image file + * + * Called with table_lock *not* held. */ static int coroutine_fn qed_aio_write_main(QEDAIOCB *acb) { BDRVQEDState *s = acb_to_s(acb); uint64_t offset = acb->cur_cluster + qed_offset_into_cluster(s, acb->cur_pos); - int ret; trace_qed_aio_write_main(s, acb, 0, offset, acb->cur_qiov.size); BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO); - ret = bdrv_co_pwritev(s->bs->file, offset, acb->cur_qiov.size, - &acb->cur_qiov, 0); - if (ret < 0) { - return ret; - } - - if (acb->find_cluster_ret != QED_CLUSTER_FOUND) { - if (s->bs->backing) { - /* - * Flush new data clusters before updating the L2 table - * - * This flush is necessary when a backing file is in use. A crash - * during an allocating write could result in empty clusters in the - * image. If the write only touched a subregion of the cluster, - * then backing image sectors have been lost in the untouched - * region. The solution is to flush after writing a new data - * cluster and before updating the L2 table. - */ - ret = bdrv_co_flush(s->bs->file->bs); - if (ret < 0) { - return ret; - } - } - ret = qed_aio_write_l2_update(acb, acb->cur_cluster); - if (ret < 0) { - return ret; - } - } - return 0; + return bdrv_co_pwritev(s->bs->file, offset, acb->cur_qiov.size, + &acb->cur_qiov, 0); } /** * Populate untouched regions of new data cluster + * + * Called with table_lock held. */ static int coroutine_fn qed_aio_write_cow(QEDAIOCB *acb) { @@ -1027,6 +1029,8 @@ static int coroutine_fn qed_aio_write_cow(QEDAIOCB *acb) uint64_t start, len, offset; int ret; + qemu_co_mutex_unlock(&s->table_lock); + /* Populate front untouched region of new data cluster */ start = qed_start_of_cluster(s, acb->cur_pos); len = qed_offset_into_cluster(s, acb->cur_pos); @@ -1034,7 +1038,7 @@ static int coroutine_fn qed_aio_write_cow(QEDAIOCB *acb) trace_qed_aio_write_prefill(s, acb, start, len, acb->cur_cluster); ret = qed_copy_from_backing_file(s, start, len, acb->cur_cluster); if (ret < 0) { - return ret; + goto out; } /* Populate back untouched region of new data cluster */ @@ -1047,10 +1051,31 @@ static int coroutine_fn qed_aio_write_cow(QEDAIOCB *acb) trace_qed_aio_write_postfill(s, acb, start, len, offset); ret = qed_copy_from_backing_file(s, start, len, offset); if (ret < 0) { - return ret; + goto out; + } + + ret = qed_aio_write_main(acb); + if (ret < 0) { + goto out; + } + + if (s->bs->backing) { + /* + * Flush new data clusters before updating the L2 table + * + * This flush is necessary when a backing file is in use. A crash + * during an allocating write could result in empty clusters in the + * image. If the write only touched a subregion of the cluster, + * then backing image sectors have been lost in the untouched + * region. The solution is to flush after writing a new data + * cluster and before updating the L2 table. + */ + ret = bdrv_co_flush(s->bs->file->bs); } - return qed_aio_write_main(acb); +out: + qemu_co_mutex_lock(&s->table_lock); + return ret; } /** @@ -1073,6 +1098,8 @@ static bool qed_should_set_need_check(BDRVQEDState *s) * @len: Length in bytes * * This path is taken when writing to previously unallocated clusters. + * + * Called with table_lock held. */ static int coroutine_fn qed_aio_write_alloc(QEDAIOCB *acb, size_t len) { @@ -1087,7 +1114,7 @@ static int coroutine_fn qed_aio_write_alloc(QEDAIOCB *acb, size_t len) /* Freeze this request if another allocating write is in progress */ if (s->allocating_acb != acb || s->allocating_write_reqs_plugged) { if (s->allocating_acb != NULL) { - qemu_co_queue_wait(&s->allocating_write_reqs, NULL); + qemu_co_queue_wait(&s->allocating_write_reqs, &s->table_lock); assert(s->allocating_acb == NULL); } s->allocating_acb = acb; @@ -1103,6 +1130,7 @@ static int coroutine_fn qed_aio_write_alloc(QEDAIOCB *acb, size_t len) if (acb->find_cluster_ret == QED_CLUSTER_ZERO) { return 0; } + acb->cur_cluster = 1; } else { acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters); } @@ -1115,15 +1143,14 @@ static int coroutine_fn qed_aio_write_alloc(QEDAIOCB *acb, size_t len) } } - if (acb->flags & QED_AIOCB_ZERO) { - ret = qed_aio_write_l2_update(acb, 1); - } else { + if (!(acb->flags & QED_AIOCB_ZERO)) { ret = qed_aio_write_cow(acb); + if (ret < 0) { + return ret; + } } - if (ret < 0) { - return ret; - } - return 0; + + return qed_aio_write_l2_update(acb, acb->cur_cluster); } /** @@ -1134,10 +1161,17 @@ static int coroutine_fn qed_aio_write_alloc(QEDAIOCB *acb, size_t len) * @len: Length in bytes * * This path is taken when writing to already allocated clusters. + * + * Called with table_lock held. */ static int coroutine_fn qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len) { + BDRVQEDState *s = acb_to_s(acb); + int r; + + qemu_co_mutex_unlock(&s->table_lock); + /* Allocate buffer for zero writes */ if (acb->flags & QED_AIOCB_ZERO) { struct iovec *iov = acb->qiov->iov; @@ -1145,7 +1179,8 @@ static int coroutine_fn qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, if (!iov->iov_base) { iov->iov_base = qemu_try_blockalign(acb->bs, iov->iov_len); if (iov->iov_base == NULL) { - return -ENOMEM; + r = -ENOMEM; + goto out; } memset(iov->iov_base, 0, iov->iov_len); } @@ -1155,8 +1190,11 @@ static int coroutine_fn qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, acb->cur_cluster = offset; qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len); - /* Do the actual write */ - return qed_aio_write_main(acb); + /* Do the actual write. */ + r = qed_aio_write_main(acb); +out: + qemu_co_mutex_lock(&s->table_lock); + return r; } /** @@ -1166,6 +1204,8 @@ static int coroutine_fn qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, * @ret: QED_CLUSTER_FOUND, QED_CLUSTER_L2 or QED_CLUSTER_L1 * @offset: Cluster offset in bytes * @len: Length in bytes + * + * Called with table_lock held. */ static int coroutine_fn qed_aio_write_data(void *opaque, int ret, uint64_t offset, size_t len) @@ -1197,6 +1237,8 @@ static int coroutine_fn qed_aio_write_data(void *opaque, int ret, * @ret: QED_CLUSTER_FOUND, QED_CLUSTER_L2 or QED_CLUSTER_L1 * @offset: Cluster offset in bytes * @len: Length in bytes + * + * Called with table_lock held. */ static int coroutine_fn qed_aio_read_data(void *opaque, int ret, uint64_t offset, size_t len) @@ -1204,6 +1246,9 @@ static int coroutine_fn qed_aio_read_data(void *opaque, int ret, QEDAIOCB *acb = opaque; BDRVQEDState *s = acb_to_s(acb); BlockDriverState *bs = acb->bs; + int r; + + qemu_co_mutex_unlock(&s->table_lock); /* Adjust offset into cluster */ offset += qed_offset_into_cluster(s, acb->cur_pos); @@ -1212,22 +1257,23 @@ static int coroutine_fn qed_aio_read_data(void *opaque, int ret, qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len); - /* Handle zero cluster and backing file reads */ + /* Handle zero cluster and backing file reads, otherwise read + * data cluster directly. + */ if (ret == QED_CLUSTER_ZERO) { qemu_iovec_memset(&acb->cur_qiov, 0, 0, acb->cur_qiov.size); - return 0; + r = 0; } else if (ret != QED_CLUSTER_FOUND) { - return qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov, - &acb->backing_qiov); + r = qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov, + &acb->backing_qiov); + } else { + BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); + r = bdrv_co_preadv(bs->file, offset, acb->cur_qiov.size, + &acb->cur_qiov, 0); } - BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); - ret = bdrv_co_preadv(bs->file, offset, acb->cur_qiov.size, - &acb->cur_qiov, 0); - if (ret < 0) { - return ret; - } - return 0; + qemu_co_mutex_lock(&s->table_lock); + return r; } /** @@ -1240,6 +1286,7 @@ static int coroutine_fn qed_aio_next_io(QEDAIOCB *acb) size_t len; int ret; + qemu_co_mutex_lock(&s->table_lock); while (1) { trace_qed_aio_next_io(s, acb, 0, acb->cur_pos + acb->cur_qiov.size); @@ -1279,6 +1326,7 @@ static int coroutine_fn qed_aio_next_io(QEDAIOCB *acb) trace_qed_aio_complete(s, acb, ret); qed_aio_complete(acb); + qemu_co_mutex_unlock(&s->table_lock); return ret; } @@ -1342,12 +1390,19 @@ static int coroutine_fn bdrv_qed_co_pwrite_zeroes(BlockDriverState *bs, QED_AIOCB_WRITE | QED_AIOCB_ZERO); } -static int bdrv_qed_truncate(BlockDriverState *bs, int64_t offset, Error **errp) +static int bdrv_qed_truncate(BlockDriverState *bs, int64_t offset, + PreallocMode prealloc, Error **errp) { BDRVQEDState *s = bs->opaque; uint64_t old_image_size; int ret; + if (prealloc != PREALLOC_MODE_OFF) { + error_setg(errp, "Unsupported preallocation mode '%s'", + PreallocMode_lookup[prealloc]); + return -ENOTSUP; + } + if (!qed_is_image_size_valid(offset, s->header.cluster_size, s->header.table_size)) { error_setg(errp, "Invalid image size specified"); @@ -1467,8 +1522,14 @@ static void bdrv_qed_invalidate_cache(BlockDriverState *bs, Error **errp) bdrv_qed_close(bs); - memset(s, 0, sizeof(BDRVQEDState)); + bdrv_qed_init_state(bs); + if (qemu_in_coroutine()) { + qemu_co_mutex_lock(&s->table_lock); + } ret = bdrv_qed_do_open(bs, NULL, bs->open_flags, &local_err); + if (qemu_in_coroutine()) { + qemu_co_mutex_unlock(&s->table_lock); + } if (local_err) { error_propagate(errp, local_err); error_prepend(errp, "Could not reopen qed layer: "); @@ -1547,7 +1608,7 @@ static BlockDriver bdrv_qed = { .bdrv_check = bdrv_qed_check, .bdrv_detach_aio_context = bdrv_qed_detach_aio_context, .bdrv_attach_aio_context = bdrv_qed_attach_aio_context, - .bdrv_drain = bdrv_qed_drain, + .bdrv_co_drain = bdrv_qed_co_drain, }; static void bdrv_qed_init(void) diff --git a/block/qed.h b/block/qed.h index dd3a2d5519..f35341f134 100644 --- a/block/qed.h +++ b/block/qed.h @@ -151,15 +151,21 @@ typedef struct QEDAIOCB { typedef struct { BlockDriverState *bs; /* device */ - uint64_t file_size; /* length of image file, in bytes */ + /* Written only by an allocating write or the timer handler (the latter + * while allocating reqs are plugged). + */ QEDHeader header; /* always cpu-endian */ + + /* Protected by table_lock. */ + CoMutex table_lock; QEDTable *l1_table; L2TableCache l2_cache; /* l2 table cache */ uint32_t table_nelems; uint32_t l1_shift; uint32_t l2_shift; uint32_t l2_mask; + uint64_t file_size; /* length of image file, in bytes */ /* Allocating write request queue */ QEDAIOCB *allocating_acb; @@ -177,9 +183,6 @@ enum { QED_CLUSTER_L1, /* cluster missing in L1 */ }; -void qed_acquire(BDRVQEDState *s); -void qed_release(BDRVQEDState *s); - /** * Header functions */ diff --git a/block/raw-format.c b/block/raw-format.c index 1ea8c2d7ff..142649ed56 100644 --- a/block/raw-format.c +++ b/block/raw-format.c @@ -312,6 +312,31 @@ static int64_t raw_getlength(BlockDriverState *bs) return s->size; } +static BlockMeasureInfo *raw_measure(QemuOpts *opts, BlockDriverState *in_bs, + Error **errp) +{ + BlockMeasureInfo *info; + int64_t required; + + if (in_bs) { + required = bdrv_getlength(in_bs); + if (required < 0) { + error_setg_errno(errp, -required, "Unable to get image size"); + return NULL; + } + } else { + required = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), + BDRV_SECTOR_SIZE); + } + + info = g_new(BlockMeasureInfo, 1); + info->required = required; + + /* Unallocated sectors count towards the file size in raw images */ + info->fully_allocated = info->required; + return info; +} + static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) { return bdrv_get_info(bs->file->bs, bdi); @@ -327,7 +352,8 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp) } } -static int raw_truncate(BlockDriverState *bs, int64_t offset, Error **errp) +static int raw_truncate(BlockDriverState *bs, int64_t offset, + PreallocMode prealloc, Error **errp) { BDRVRawState *s = bs->opaque; @@ -343,7 +369,7 @@ static int raw_truncate(BlockDriverState *bs, int64_t offset, Error **errp) s->size = offset; offset += s->offset; - return bdrv_truncate(bs->file, offset, errp); + return bdrv_truncate(bs->file, offset, prealloc, errp); } static int raw_media_changed(BlockDriverState *bs) @@ -479,6 +505,7 @@ BlockDriver bdrv_raw = { .bdrv_truncate = &raw_truncate, .bdrv_getlength = &raw_getlength, .has_variable_length = true, + .bdrv_measure = &raw_measure, .bdrv_get_info = &raw_get_info, .bdrv_refresh_limits = &raw_refresh_limits, .bdrv_probe_blocksizes = &raw_probe_blocksizes, diff --git a/block/rbd.c b/block/rbd.c index 9da02cdceb..9c3aa638e7 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -555,9 +555,9 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags, * filename encoded options */ filename = qdict_get_try_str(options, "filename"); if (filename) { - error_report("Warning: 'filename' option specified. " - "This is an unsupported option, and may be deprecated " - "in the future"); + warn_report("'filename' option specified. " + "This is an unsupported option, and may be deprecated " + "in the future"); qemu_rbd_parse_filename(filename, options, &local_err); if (local_err) { r = -EINVAL; @@ -936,11 +936,18 @@ static int64_t qemu_rbd_getlength(BlockDriverState *bs) return info.size; } -static int qemu_rbd_truncate(BlockDriverState *bs, int64_t offset, Error **errp) +static int qemu_rbd_truncate(BlockDriverState *bs, int64_t offset, + PreallocMode prealloc, Error **errp) { BDRVRBDState *s = bs->opaque; int r; + if (prealloc != PREALLOC_MODE_OFF) { + error_setg(errp, "Unsupported preallocation mode '%s'", + PreallocMode_lookup[prealloc]); + return -ENOTSUP; + } + r = rbd_resize(s->image, offset); if (r < 0) { error_setg_errno(errp, -r, "Failed to resize file"); diff --git a/block/sheepdog.c b/block/sheepdog.c index 08d7b11e9d..abb2e79065 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -390,6 +390,7 @@ struct BDRVSheepdogState { QLIST_HEAD(inflight_aio_head, AIOReq) inflight_aio_head; QLIST_HEAD(failed_aio_head, AIOReq) failed_aio_head; + CoMutex queue_lock; CoQueue overlapping_queue; QLIST_HEAD(inflight_aiocb_head, SheepdogAIOCB) inflight_aiocb_head; }; @@ -488,7 +489,7 @@ static void wait_for_overlapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB *acb) retry: QLIST_FOREACH(cb, &s->inflight_aiocb_head, aiocb_siblings) { if (AIOCBOverlapping(acb, cb)) { - qemu_co_queue_wait(&s->overlapping_queue, NULL); + qemu_co_queue_wait(&s->overlapping_queue, &s->queue_lock); goto retry; } } @@ -525,8 +526,10 @@ static void sd_aio_setup(SheepdogAIOCB *acb, BDRVSheepdogState *s, return; } + qemu_co_mutex_lock(&s->queue_lock); wait_for_overlapping_aiocb(s, acb); QLIST_INSERT_HEAD(&s->inflight_aiocb_head, acb, aiocb_siblings); + qemu_co_mutex_unlock(&s->queue_lock); } static SocketAddress *sd_socket_address(const char *path, @@ -785,6 +788,7 @@ static coroutine_fn void reconnect_to_sdog(void *opaque) * have to move all the inflight requests to the failed queue before * resend_aioreq() is called. */ + qemu_co_mutex_lock(&s->queue_lock); QLIST_FOREACH_SAFE(aio_req, &s->inflight_aio_head, aio_siblings, next) { QLIST_REMOVE(aio_req, aio_siblings); QLIST_INSERT_HEAD(&s->failed_aio_head, aio_req, aio_siblings); @@ -794,8 +798,11 @@ static coroutine_fn void reconnect_to_sdog(void *opaque) while (!QLIST_EMPTY(&s->failed_aio_head)) { aio_req = QLIST_FIRST(&s->failed_aio_head); QLIST_REMOVE(aio_req, aio_siblings); + qemu_co_mutex_unlock(&s->queue_lock); resend_aioreq(s, aio_req); + qemu_co_mutex_lock(&s->queue_lock); } + qemu_co_mutex_unlock(&s->queue_lock); } /* @@ -887,7 +894,10 @@ static void coroutine_fn aio_read_response(void *opaque) */ s->co_recv = NULL; + qemu_co_mutex_lock(&s->queue_lock); QLIST_REMOVE(aio_req, aio_siblings); + qemu_co_mutex_unlock(&s->queue_lock); + switch (rsp.result) { case SD_RES_SUCCESS: break; @@ -1307,7 +1317,9 @@ static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, uint64_t old_oid = aio_req->base_oid; bool create = aio_req->create; + qemu_co_mutex_lock(&s->queue_lock); QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); + qemu_co_mutex_unlock(&s->queue_lock); if (!nr_copies) { error_report("bug"); @@ -1678,6 +1690,7 @@ static int sd_open(BlockDriverState *bs, QDict *options, int flags, bs->total_sectors = s->inode.vdi_size / BDRV_SECTOR_SIZE; pstrcpy(s->name, sizeof(s->name), vdi); qemu_co_mutex_init(&s->lock); + qemu_co_mutex_init(&s->queue_lock); qemu_co_queue_init(&s->overlapping_queue); qemu_opts_del(opts); g_free(buf); @@ -2153,13 +2166,20 @@ static int64_t sd_getlength(BlockDriverState *bs) return s->inode.vdi_size; } -static int sd_truncate(BlockDriverState *bs, int64_t offset, Error **errp) +static int sd_truncate(BlockDriverState *bs, int64_t offset, + PreallocMode prealloc, Error **errp) { BDRVSheepdogState *s = bs->opaque; int ret, fd; unsigned int datalen; uint64_t max_vdi_size; + if (prealloc != PREALLOC_MODE_OFF) { + error_setg(errp, "Unsupported preallocation mode '%s'", + PreallocMode_lookup[prealloc]); + return -ENOTSUP; + } + max_vdi_size = (UINT64_C(1) << s->inode.block_size_shift) * MAX_DATA_OBJS; if (offset < s->inode.vdi_size) { error_setg(errp, "shrinking is not supported"); @@ -2431,12 +2451,16 @@ static void coroutine_fn sd_co_rw_vector(SheepdogAIOCB *acb) static void sd_aio_complete(SheepdogAIOCB *acb) { + BDRVSheepdogState *s; if (acb->aiocb_type == AIOCB_FLUSH_CACHE) { return; } + s = acb->s; + qemu_co_mutex_lock(&s->queue_lock); QLIST_REMOVE(acb, aiocb_siblings); - qemu_co_queue_restart_all(&acb->s->overlapping_queue); + qemu_co_queue_restart_all(&s->overlapping_queue); + qemu_co_mutex_unlock(&s->queue_lock); } static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num, @@ -2448,7 +2472,7 @@ static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num, BDRVSheepdogState *s = bs->opaque; if (offset > s->inode.vdi_size) { - ret = sd_truncate(bs, offset, NULL); + ret = sd_truncate(bs, offset, PREALLOC_MODE_OFF, NULL); if (ret < 0) { return ret; } diff --git a/block/ssh.c b/block/ssh.c index 52964416da..e8f0404c03 100644 --- a/block/ssh.c +++ b/block/ssh.c @@ -888,13 +888,22 @@ static int ssh_has_zero_init(BlockDriverState *bs) return has_zero_init; } +typedef struct BDRVSSHRestart { + BlockDriverState *bs; + Coroutine *co; +} BDRVSSHRestart; + static void restart_coroutine(void *opaque) { - Coroutine *co = opaque; + BDRVSSHRestart *restart = opaque; + BlockDriverState *bs = restart->bs; + BDRVSSHState *s = bs->opaque; + AioContext *ctx = bdrv_get_aio_context(bs); - DPRINTF("co=%p", co); + DPRINTF("co=%p", restart->co); + aio_set_fd_handler(ctx, s->sock, false, NULL, NULL, NULL, NULL); - aio_co_wake(co); + aio_co_wake(restart->co); } /* A non-blocking call returned EAGAIN, so yield, ensuring the @@ -905,7 +914,10 @@ static coroutine_fn void co_yield(BDRVSSHState *s, BlockDriverState *bs) { int r; IOHandler *rd_handler = NULL, *wr_handler = NULL; - Coroutine *co = qemu_coroutine_self(); + BDRVSSHRestart restart = { + .bs = bs, + .co = qemu_coroutine_self() + }; r = libssh2_session_block_directions(s->session); @@ -920,11 +932,9 @@ static coroutine_fn void co_yield(BDRVSSHState *s, BlockDriverState *bs) rd_handler, wr_handler); aio_set_fd_handler(bdrv_get_aio_context(bs), s->sock, - false, rd_handler, wr_handler, NULL, co); + false, rd_handler, wr_handler, NULL, &restart); qemu_coroutine_yield(); DPRINTF("s->sock=%d - back", s->sock); - aio_set_fd_handler(bdrv_get_aio_context(bs), s->sock, false, - NULL, NULL, NULL, NULL); } /* SFTP has a function `libssh2_sftp_seek64' which seeks to a position @@ -1114,8 +1124,8 @@ static coroutine_fn int ssh_co_writev(BlockDriverState *bs, static void unsafe_flush_warning(BDRVSSHState *s, const char *what) { if (!s->unsafe_flush_warning) { - error_report("warning: ssh server %s does not support fsync", - s->inet->host); + warn_report("ssh server %s does not support fsync", + s->inet->host); if (what) { error_report("to support fsync, you need %s", what); } diff --git a/block/vdi.c b/block/vdi.c index 79af47763b..8da5dfc897 100644 --- a/block/vdi.c +++ b/block/vdi.c @@ -172,7 +172,7 @@ typedef struct { /* VDI header (converted to host endianness). */ VdiHeader header; - CoMutex write_lock; + CoRwlock bmap_lock; Error *migration_blocker; } BDRVVdiState; @@ -485,7 +485,7 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags, goto fail_free_bmap; } - qemu_co_mutex_init(&s->write_lock); + qemu_co_rwlock_init(&s->bmap_lock); return 0; @@ -557,7 +557,9 @@ vdi_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, n_bytes, offset); /* prepare next AIO request */ + qemu_co_rwlock_rdlock(&s->bmap_lock); bmap_entry = le32_to_cpu(s->bmap[block_index]); + qemu_co_rwlock_unlock(&s->bmap_lock); if (!VDI_IS_ALLOCATED(bmap_entry)) { /* Block not allocated, return zeros, no need to wait. */ qemu_iovec_memset(qiov, bytes_done, 0, n_bytes); @@ -595,6 +597,7 @@ vdi_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, uint32_t block_index; uint32_t offset_in_block; uint32_t n_bytes; + uint64_t data_offset; uint32_t bmap_first = VDI_UNALLOCATED; uint32_t bmap_last = VDI_UNALLOCATED; uint8_t *block = NULL; @@ -614,10 +617,19 @@ vdi_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, n_bytes, offset); /* prepare next AIO request */ + qemu_co_rwlock_rdlock(&s->bmap_lock); bmap_entry = le32_to_cpu(s->bmap[block_index]); if (!VDI_IS_ALLOCATED(bmap_entry)) { /* Allocate new block and write to it. */ uint64_t data_offset; + qemu_co_rwlock_upgrade(&s->bmap_lock); + bmap_entry = le32_to_cpu(s->bmap[block_index]); + if (VDI_IS_ALLOCATED(bmap_entry)) { + /* A concurrent allocation did the work for us. */ + qemu_co_rwlock_downgrade(&s->bmap_lock); + goto nonallocating_write; + } + bmap_entry = s->header.blocks_allocated; s->bmap[block_index] = cpu_to_le32(bmap_entry); s->header.blocks_allocated++; @@ -635,30 +647,18 @@ vdi_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, memset(block + offset_in_block + n_bytes, 0, s->block_size - n_bytes - offset_in_block); - /* Note that this coroutine does not yield anywhere from reading the - * bmap entry until here, so in regards to all the coroutines trying - * to write to this cluster, the one doing the allocation will - * always be the first to try to acquire the lock. - * Therefore, it is also the first that will actually be able to - * acquire the lock and thus the padded cluster is written before - * the other coroutines can write to the affected area. */ - qemu_co_mutex_lock(&s->write_lock); + /* Write the new block under CoRwLock write-side protection, + * so this full-cluster write does not overlap a partial write + * of the same cluster, issued from the "else" branch. + */ ret = bdrv_pwrite(bs->file, data_offset, block, s->block_size); - qemu_co_mutex_unlock(&s->write_lock); + qemu_co_rwlock_unlock(&s->bmap_lock); } else { - uint64_t data_offset = s->header.offset_data + - (uint64_t)bmap_entry * s->block_size + - offset_in_block; - qemu_co_mutex_lock(&s->write_lock); - /* This lock is only used to make sure the following write operation - * is executed after the write issued by the coroutine allocating - * this cluster, therefore we do not need to keep it locked. - * As stated above, the allocating coroutine will always try to lock - * the mutex before all the other concurrent accesses to that - * cluster, therefore at this point we can be absolutely certain - * that that write operation has returned (there may be other writes - * in flight, but they do not concern this very operation). */ - qemu_co_mutex_unlock(&s->write_lock); +nonallocating_write: + data_offset = s->header.offset_data + + (uint64_t)bmap_entry * s->block_size + + offset_in_block; + qemu_co_rwlock_unlock(&s->bmap_lock); qemu_iovec_reset(&local_qiov); qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes); @@ -832,7 +832,8 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) } if (image_type == VDI_TYPE_STATIC) { - ret = blk_truncate(blk, offset + blocks * block_size, errp); + ret = blk_truncate(blk, offset + blocks * block_size, + PREALLOC_MODE_OFF, errp); if (ret < 0) { error_prepend(errp, "Failed to statically allocate %s", filename); goto exit; diff --git a/block/vhdx-log.c b/block/vhdx-log.c index 3f4c2aa095..01278f3fc9 100644 --- a/block/vhdx-log.c +++ b/block/vhdx-log.c @@ -548,7 +548,7 @@ static int vhdx_log_flush(BlockDriverState *bs, BDRVVHDXState *s, if (new_file_size % (1024*1024)) { /* round up to nearest 1MB boundary */ new_file_size = ((new_file_size >> 20) + 1) << 20; - bdrv_truncate(bs->file, new_file_size, NULL); + bdrv_truncate(bs->file, new_file_size, PREALLOC_MODE_OFF, NULL); } } qemu_vfree(desc_entries); diff --git a/block/vhdx.c b/block/vhdx.c index 8b270b57c9..a9cecd2773 100644 --- a/block/vhdx.c +++ b/block/vhdx.c @@ -1171,7 +1171,8 @@ static int vhdx_allocate_block(BlockDriverState *bs, BDRVVHDXState *s, /* per the spec, the address for a block is in units of 1MB */ *new_offset = ROUND_UP(*new_offset, 1024 * 1024); - return bdrv_truncate(bs->file, *new_offset + s->block_size, NULL); + return bdrv_truncate(bs->file, *new_offset + s->block_size, + PREALLOC_MODE_OFF, NULL); } /* @@ -1607,12 +1608,13 @@ static int vhdx_create_bat(BlockBackend *blk, BDRVVHDXState *s, if (type == VHDX_TYPE_DYNAMIC) { /* All zeroes, so we can just extend the file - the end of the BAT * is the furthest thing we have written yet */ - ret = blk_truncate(blk, data_file_offset, errp); + ret = blk_truncate(blk, data_file_offset, PREALLOC_MODE_OFF, errp); if (ret < 0) { goto exit; } } else if (type == VHDX_TYPE_FIXED) { - ret = blk_truncate(blk, data_file_offset + image_size, errp); + ret = blk_truncate(blk, data_file_offset + image_size, + PREALLOC_MODE_OFF, errp); if (ret < 0) { goto exit; } diff --git a/block/vmdk.c b/block/vmdk.c index 55581b03fe..24d71b5982 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1714,7 +1714,7 @@ static int vmdk_create_extent(const char *filename, int64_t filesize, blk_set_allow_write_beyond_eof(blk, true); if (flat) { - ret = blk_truncate(blk, filesize, errp); + ret = blk_truncate(blk, filesize, PREALLOC_MODE_OFF, errp); goto exit; } magic = cpu_to_be32(VMDK4_MAGIC); @@ -1777,7 +1777,8 @@ static int vmdk_create_extent(const char *filename, int64_t filesize, goto exit; } - ret = blk_truncate(blk, le64_to_cpu(header.grain_offset) << 9, errp); + ret = blk_truncate(blk, le64_to_cpu(header.grain_offset) << 9, + PREALLOC_MODE_OFF, errp); if (ret < 0) { goto exit; } @@ -2086,7 +2087,7 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp) /* bdrv_pwrite write padding zeros to align to sector, we don't need that * for description file */ if (desc_offset == 0) { - ret = blk_truncate(new_blk, desc_len, errp); + ret = blk_truncate(new_blk, desc_len, PREALLOC_MODE_OFF, errp); } exit: if (new_blk) { diff --git a/block/vpc.c b/block/vpc.c index b313c68148..8057d42a23 100644 --- a/block/vpc.c +++ b/block/vpc.c @@ -496,12 +496,6 @@ static inline int64_t get_image_offset(BlockDriverState *bs, uint64_t offset, return block_offset; } -static inline int64_t get_sector_offset(BlockDriverState *bs, - int64_t sector_num, bool write) -{ - return get_image_offset(bs, sector_num * BDRV_SECTOR_SIZE, write); -} - /* * Writes the footer to the end of the image file. This is needed when the * file grows as it overwrites the old footer @@ -696,6 +690,7 @@ static int64_t coroutine_fn vpc_co_get_block_status(BlockDriverState *bs, VHDFooter *footer = (VHDFooter*) s->footer_buf; int64_t start, offset; bool allocated; + int64_t ret; int n; if (be32_to_cpu(footer->type) == VHD_FIXED) { @@ -705,10 +700,13 @@ static int64_t coroutine_fn vpc_co_get_block_status(BlockDriverState *bs, (sector_num << BDRV_SECTOR_BITS); } - offset = get_sector_offset(bs, sector_num, 0); + qemu_co_mutex_lock(&s->lock); + + offset = get_image_offset(bs, sector_num << BDRV_SECTOR_BITS, false); start = offset; allocated = (offset != -1); *pnum = 0; + ret = 0; do { /* All sectors in a block are contiguous (without using the bitmap) */ @@ -723,15 +721,17 @@ static int64_t coroutine_fn vpc_co_get_block_status(BlockDriverState *bs, * sectors since there is always a bitmap in between. */ if (allocated) { *file = bs->file->bs; - return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start; + ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start; + break; } if (nb_sectors == 0) { break; } - offset = get_sector_offset(bs, sector_num, 0); + offset = get_image_offset(bs, sector_num << BDRV_SECTOR_BITS, false); } while (offset == -1); - return 0; + qemu_co_mutex_unlock(&s->lock); + return ret; } /* @@ -858,7 +858,7 @@ static int create_fixed_disk(BlockBackend *blk, uint8_t *buf, /* Add footer to total size */ total_size += HEADER_SIZE; - ret = blk_truncate(blk, total_size, errp); + ret = blk_truncate(blk, total_size, PREALLOC_MODE_OFF, errp); if (ret < 0) { return ret; } diff --git a/block/vvfat.c b/block/vvfat.c index 4fd28e1e87..4dae790203 100644 --- a/block/vvfat.c +++ b/block/vvfat.c @@ -3078,8 +3078,14 @@ static int coroutine_fn write_target_commit(BlockDriverState *bs, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags) { + int ret; + BDRVVVFATState* s = *((BDRVVVFATState**) bs->opaque); - return try_commit(s); + qemu_co_mutex_lock(&s->lock); + ret = try_commit(s); + qemu_co_mutex_unlock(&s->lock); + + return ret; } static void write_target_close(BlockDriverState *bs) { |