diff options
86 files changed, 3407 insertions, 2759 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index 81e7fac2fc..3c949d5a97 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -165,6 +165,7 @@ F: hw/openrisc/ F: tests/tcg/openrisc/ PowerPC +M: David Gibson <david@gibson.dropbear.id.au> M: Alexander Graf <agraf@suse.de> L: qemu-ppc@nongnu.org S: Maintained @@ -597,7 +598,7 @@ F: hw/pci-host/grackle.c F: hw/misc/macio/ PReP -M: Andreas Färber <andreas.faerber@web.de> +L: qemu-devel@nongnu.org L: qemu-ppc@nongnu.org S: Odd Fixes F: hw/ppc/prep.c @@ -1046,7 +1047,7 @@ S: Supported F: scripts/coverity-model.c CPU -M: Andreas Färber <afaerber@suse.de> +L: qemu-devel@nongnu.org S: Supported F: qom/cpu.c F: include/qom/cpu.h @@ -1105,7 +1106,6 @@ F: ui/ F: include/ui/ Cocoa graphics -M: Andreas Färber <andreas.faerber@web.de> M: Peter Maydell <peter.maydell@linaro.org> S: Odd Fixes F: ui/cocoa.m @@ -1400,9 +1400,8 @@ S: Orphan Stable 0.15 L: qemu-stable@nongnu.org -M: Andreas Färber <afaerber@suse.de> T: git git://git.qemu-project.org/qemu-stable-0.15.git -S: Supported +S: Orphan Stable 0.14 L: qemu-stable@nongnu.org diff --git a/block.c b/block.c index 1205ef8860..736432f67e 100644 --- a/block.c +++ b/block.c @@ -64,16 +64,16 @@ static QTAILQ_HEAD(, BlockDriverState) all_bdrv_states = static QLIST_HEAD(, BlockDriver) bdrv_drivers = QLIST_HEAD_INITIALIZER(bdrv_drivers); -static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename, - const char *reference, QDict *options, int flags, - BlockDriverState *parent, - const BdrvChildRole *child_role, Error **errp); +static BlockDriverState *bdrv_open_inherit(const char *filename, + const char *reference, + QDict *options, int flags, + BlockDriverState *parent, + const BdrvChildRole *child_role, + Error **errp); /* If non-zero, use only whitelisted block drivers */ static int use_bdrv_whitelist; -static void bdrv_close(BlockDriverState *bs); - #ifdef _WIN32 static int is_windows_drive_prefix(const char *filename) { @@ -220,11 +220,6 @@ void bdrv_register(BlockDriver *bdrv) 
QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list); } -BlockDriverState *bdrv_new_root(void) -{ - return bdrv_new(); -} - BlockDriverState *bdrv_new(void) { BlockDriverState *bs; @@ -664,6 +659,18 @@ int bdrv_parse_cache_mode(const char *mode, int *flags, bool *writethrough) return 0; } +static void bdrv_child_cb_drained_begin(BdrvChild *child) +{ + BlockDriverState *bs = child->opaque; + bdrv_drained_begin(bs); +} + +static void bdrv_child_cb_drained_end(BdrvChild *child) +{ + BlockDriverState *bs = child->opaque; + bdrv_drained_end(bs); +} + /* * Returns the options and flags that a temporary snapshot should get, based on * the originally requested flags (the originally requested image will have @@ -710,6 +717,8 @@ static void bdrv_inherited_options(int *child_flags, QDict *child_options, const BdrvChildRole child_file = { .inherit_options = bdrv_inherited_options, + .drained_begin = bdrv_child_cb_drained_begin, + .drained_end = bdrv_child_cb_drained_end, }; /* @@ -728,6 +737,8 @@ static void bdrv_inherited_fmt_options(int *child_flags, QDict *child_options, const BdrvChildRole child_format = { .inherit_options = bdrv_inherited_fmt_options, + .drained_begin = bdrv_child_cb_drained_begin, + .drained_end = bdrv_child_cb_drained_end, }; /* @@ -755,6 +766,8 @@ static void bdrv_backing_options(int *child_flags, QDict *child_options, static const BdrvChildRole child_backing = { .inherit_options = bdrv_backing_options, + .drained_begin = bdrv_child_cb_drained_begin, + .drained_end = bdrv_child_cb_drained_end, }; static int bdrv_open_flags(BlockDriverState *bs, int flags) @@ -1155,18 +1168,41 @@ static int bdrv_fill_options(QDict **options, const char *filename, return 0; } +static void bdrv_replace_child(BdrvChild *child, BlockDriverState *new_bs) +{ + BlockDriverState *old_bs = child->bs; + + if (old_bs) { + if (old_bs->quiesce_counter && child->role->drained_end) { + child->role->drained_end(child); + } + QLIST_REMOVE(child, next_parent); + } + + child->bs = new_bs; + + if 
(new_bs) { + QLIST_INSERT_HEAD(&new_bs->parents, child, next_parent); + if (new_bs->quiesce_counter && child->role->drained_begin) { + child->role->drained_begin(child); + } + } +} + BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs, const char *child_name, - const BdrvChildRole *child_role) + const BdrvChildRole *child_role, + void *opaque) { BdrvChild *child = g_new(BdrvChild, 1); *child = (BdrvChild) { - .bs = child_bs, + .bs = NULL, .name = g_strdup(child_name), .role = child_role, + .opaque = opaque, }; - QLIST_INSERT_HEAD(&child_bs->parents, child, next_parent); + bdrv_replace_child(child, child_bs); return child; } @@ -1176,7 +1212,8 @@ BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs, const char *child_name, const BdrvChildRole *child_role) { - BdrvChild *child = bdrv_root_attach_child(child_bs, child_name, child_role); + BdrvChild *child = bdrv_root_attach_child(child_bs, child_name, child_role, + parent_bs); QLIST_INSERT_HEAD(&parent_bs->children, child, next); return child; } @@ -1187,7 +1224,9 @@ static void bdrv_detach_child(BdrvChild *child) QLIST_REMOVE(child, next); child->next.le_prev = NULL; } - QLIST_REMOVE(child, next_parent); + + bdrv_replace_child(child, NULL); + g_free(child->name); g_free(child); } @@ -1341,14 +1380,13 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options, qdict_put(options, "driver", qstring_from_str(bs->backing_format)); } - backing_hd = NULL; - ret = bdrv_open_inherit(&backing_hd, - *backing_filename ? backing_filename : NULL, - reference, options, 0, bs, &child_backing, - errp); - if (ret < 0) { + backing_hd = bdrv_open_inherit(*backing_filename ? 
backing_filename : NULL, + reference, options, 0, bs, &child_backing, + errp); + if (!backing_hd) { bs->open_flags |= BDRV_O_NO_BACKING; error_prepend(errp, "Could not open backing file: "); + ret = -EINVAL; goto free_exit; } @@ -1388,7 +1426,6 @@ BdrvChild *bdrv_open_child(const char *filename, BdrvChild *c = NULL; BlockDriverState *bs; QDict *image_options; - int ret; char *bdref_key_dot; const char *reference; @@ -1408,10 +1445,9 @@ BdrvChild *bdrv_open_child(const char *filename, goto done; } - bs = NULL; - ret = bdrv_open_inherit(&bs, filename, reference, image_options, 0, - parent, child_role, errp); - if (ret < 0) { + bs = bdrv_open_inherit(filename, reference, image_options, 0, + parent, child_role, errp); + if (!bs) { goto done; } @@ -1422,15 +1458,16 @@ done: return c; } -static int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, - QDict *snapshot_options, Error **errp) +static BlockDriverState *bdrv_append_temp_snapshot(BlockDriverState *bs, + int flags, + QDict *snapshot_options, + Error **errp) { /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. 
*/ char *tmp_filename = g_malloc0(PATH_MAX + 1); int64_t total_size; QemuOpts *opts = NULL; BlockDriverState *bs_snapshot; - Error *local_err = NULL; int ret; /* if snapshot, we create a temporary backing file and open it @@ -1439,7 +1476,6 @@ static int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, /* Get the required size from the image */ total_size = bdrv_getlength(bs); if (total_size < 0) { - ret = total_size; error_setg_errno(errp, -total_size, "Could not get image size"); goto out; } @@ -1470,22 +1506,26 @@ static int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, qdict_put(snapshot_options, "driver", qstring_from_str("qcow2")); - bs_snapshot = bdrv_new(); - - ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options, - flags, &local_err); + bs_snapshot = bdrv_open(NULL, NULL, snapshot_options, flags, errp); snapshot_options = NULL; - if (ret < 0) { - error_propagate(errp, local_err); + if (!bs_snapshot) { + ret = -EINVAL; goto out; } + /* bdrv_append() consumes a strong reference to bs_snapshot (i.e. it will + * call bdrv_unref() on it), so in order to be able to return one, we have + * to increase bs_snapshot's refcount here */ + bdrv_ref(bs_snapshot); bdrv_append(bs_snapshot, bs); + g_free(tmp_filename); + return bs_snapshot; + out: QDECREF(snapshot_options); g_free(tmp_filename); - return ret; + return NULL; } /* @@ -1503,10 +1543,12 @@ out: * should be opened. If specified, neither options nor a filename may be given, * nor can an existing BDS be reused (that is, *pbs has to be NULL). 
*/ -static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename, - const char *reference, QDict *options, int flags, - BlockDriverState *parent, - const BdrvChildRole *child_role, Error **errp) +static BlockDriverState *bdrv_open_inherit(const char *filename, + const char *reference, + QDict *options, int flags, + BlockDriverState *parent, + const BdrvChildRole *child_role, + Error **errp) { int ret; BdrvChild *file = NULL; @@ -1518,7 +1560,6 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename, QDict *snapshot_options = NULL; int snapshot_flags = 0; - assert(pbs); assert(!child_role || !flags); assert(!child_role == !parent); @@ -1526,33 +1567,22 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename, bool options_non_empty = options ? qdict_size(options) : false; QDECREF(options); - if (*pbs) { - error_setg(errp, "Cannot reuse an existing BDS when referencing " - "another block device"); - return -EINVAL; - } - if (filename || options_non_empty) { error_setg(errp, "Cannot reference an existing block device with " "additional options or a new filename"); - return -EINVAL; + return NULL; } bs = bdrv_lookup_bs(reference, reference, errp); if (!bs) { - return -ENODEV; + return NULL; } bdrv_ref(bs); - *pbs = bs; - return 0; + return bs; } - if (*pbs) { - bs = *pbs; - } else { - bs = bdrv_new(); - } + bs = bdrv_new(); /* NULL means an empty set of options */ if (options == NULL) { @@ -1562,7 +1592,6 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename, /* json: syntax counts as explicit options, as if in the QDict */ parse_json_protocol(options, &filename, &local_err); if (local_err) { - ret = -EINVAL; goto fail; } @@ -1589,7 +1618,6 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename, drv = bdrv_find_format(drvname); if (!drv) { error_setg(errp, "Unknown driver: '%s'", drvname); - ret = -EINVAL; goto fail; } } @@ -1619,7 +1647,6 @@ static int 
bdrv_open_inherit(BlockDriverState **pbs, const char *filename, file = bdrv_open_child(filename, options, "file", bs, &child_file, true, &local_err); if (local_err) { - ret = -EINVAL; goto fail; } } @@ -1646,7 +1673,6 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename, qdict_put(options, "driver", qstring_from_str(drv->format_name)); } else if (!drv) { error_setg(errp, "Must specify either driver or file"); - ret = -EINVAL; goto fail; } @@ -1689,7 +1715,6 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename, drv->format_name, entry->key); } - ret = -EINVAL; goto close_and_fail; } @@ -1700,25 +1725,30 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename, && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */ error_setg(errp, "Guest must be stopped for opening of encrypted image"); - ret = -EBUSY; goto close_and_fail; } QDECREF(options); - *pbs = bs; /* For snapshot=on, create a temporary qcow2 overlay. bs points to the * temporary snapshot afterwards. */ if (snapshot_flags) { - ret = bdrv_append_temp_snapshot(bs, snapshot_flags, snapshot_options, - &local_err); + BlockDriverState *snapshot_bs; + snapshot_bs = bdrv_append_temp_snapshot(bs, snapshot_flags, + snapshot_options, &local_err); snapshot_options = NULL; if (local_err) { goto close_and_fail; } + /* We are not going to return bs but the overlay on top of it + * (snapshot_bs); thus, we have to drop the strong reference to bs + * (which we obtained by calling bdrv_new()). bs will not be deleted, + * though, because the overlay still has a reference to it. */ + bdrv_unref(bs); + bs = snapshot_bs; } - return 0; + return bs; fail: if (file != NULL) { @@ -1729,36 +1759,26 @@ fail: QDECREF(bs->options); QDECREF(options); bs->options = NULL; - if (!*pbs) { - /* If *pbs is NULL, a new BDS has been created in this function and - needs to be freed now. Otherwise, it does not need to be closed, - since it has not really been opened yet. 
*/ - bdrv_unref(bs); - } + bdrv_unref(bs); if (local_err) { error_propagate(errp, local_err); } - return ret; + return NULL; close_and_fail: - /* See fail path, but now the BDS has to be always closed */ - if (*pbs) { - bdrv_close(bs); - } else { - bdrv_unref(bs); - } + bdrv_unref(bs); QDECREF(snapshot_options); QDECREF(options); if (local_err) { error_propagate(errp, local_err); } - return ret; + return NULL; } -int bdrv_open(BlockDriverState **pbs, const char *filename, - const char *reference, QDict *options, int flags, Error **errp) +BlockDriverState *bdrv_open(const char *filename, const char *reference, + QDict *options, int flags, Error **errp) { - return bdrv_open_inherit(pbs, filename, reference, options, flags, NULL, + return bdrv_open_inherit(filename, reference, options, flags, NULL, NULL, errp); } @@ -2132,6 +2152,7 @@ static void bdrv_close(BlockDriverState *bs) BdrvAioNotifier *ban, *ban_next; assert(!bs->job); + assert(!bs->refcnt); bdrv_drained_begin(bs); /* complete I/O */ bdrv_flush(bs); @@ -2140,8 +2161,6 @@ static void bdrv_close(BlockDriverState *bs) bdrv_release_named_dirty_bitmaps(bs); assert(QLIST_EMPTY(&bs->dirty_bitmaps)); - bdrv_parent_cb_change_media(bs, false); - if (bs->drv) { BdrvChild *child, *next; @@ -2190,8 +2209,7 @@ static void bdrv_close(BlockDriverState *bs) void bdrv_close_all(void) { - BlockDriverState *bs; - AioContext *aio_context; + block_job_cancel_sync_all(); /* Drop references from requests still in flight, such as canceled block * jobs whose AIO context has not been polled yet */ @@ -2200,25 +2218,7 @@ void bdrv_close_all(void) blk_remove_all_bs(); blockdev_close_all_bdrv_states(); - /* Cancel all block jobs */ - while (!QTAILQ_EMPTY(&all_bdrv_states)) { - QTAILQ_FOREACH(bs, &all_bdrv_states, bs_list) { - aio_context = bdrv_get_aio_context(bs); - - aio_context_acquire(aio_context); - if (bs->job) { - block_job_cancel_sync(bs->job); - aio_context_release(aio_context); - break; - } - aio_context_release(aio_context); - 
} - - /* All the remaining BlockDriverStates are referenced directly or - * indirectly from block jobs, so there needs to be at least one BDS - * directly used by a block job */ - assert(bs); - } + assert(QTAILQ_EMPTY(&all_bdrv_states)); } static void change_parent_backing_link(BlockDriverState *from, @@ -2228,10 +2228,8 @@ static void change_parent_backing_link(BlockDriverState *from, QLIST_FOREACH_SAFE(c, &from->parents, next_parent, next) { assert(c->role != &child_backing); - c->bs = to; - QLIST_REMOVE(c, next_parent); - QLIST_INSERT_HEAD(&to->parents, c, next_parent); bdrv_ref(to); + bdrv_replace_child(c, to); bdrv_unref(from); } } @@ -3195,9 +3193,9 @@ void bdrv_invalidate_cache_all(Error **errp) { BlockDriverState *bs; Error *local_err = NULL; - BdrvNextIterator *it = NULL; + BdrvNextIterator it; - while ((it = bdrv_next(it, &bs)) != NULL) { + for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { AioContext *aio_context = bdrv_get_aio_context(bs); aio_context_acquire(aio_context); @@ -3239,11 +3237,11 @@ static int bdrv_inactivate_recurse(BlockDriverState *bs, int bdrv_inactivate_all(void) { BlockDriverState *bs = NULL; - BdrvNextIterator *it = NULL; + BdrvNextIterator it; int ret = 0; int pass; - while ((it = bdrv_next(it, &bs)) != NULL) { + for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { aio_context_acquire(bdrv_get_aio_context(bs)); } @@ -3252,8 +3250,7 @@ int bdrv_inactivate_all(void) * the second pass sets the BDRV_O_INACTIVE flag so that no further write * is allowed. 
*/ for (pass = 0; pass < 2; pass++) { - it = NULL; - while ((it = bdrv_next(it, &bs)) != NULL) { + for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { ret = bdrv_inactivate_recurse(bs, pass); if (ret < 0) { goto out; @@ -3262,8 +3259,7 @@ int bdrv_inactivate_all(void) } out: - it = NULL; - while ((it = bdrv_next(it, &bs)) != NULL) { + for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { aio_context_release(bdrv_get_aio_context(bs)); } @@ -3547,11 +3543,10 @@ void bdrv_img_create(const char *filename, const char *fmt, qstring_from_str(backing_fmt)); } - bs = NULL; - ret = bdrv_open(&bs, full_backing, NULL, backing_options, - back_flags, &local_err); + bs = bdrv_open(full_backing, NULL, backing_options, back_flags, + &local_err); g_free(full_backing); - if (ret < 0) { + if (!bs) { goto out; } size = bdrv_getlength(bs); @@ -3753,10 +3748,10 @@ bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs, bool bdrv_is_first_non_filter(BlockDriverState *candidate) { BlockDriverState *bs; - BdrvNextIterator *it = NULL; + BdrvNextIterator it; /* walk down the bs forest recursively */ - while ((it = bdrv_next(it, &bs)) != NULL) { + for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { bool perm; /* try to recurse in this top level bs */ diff --git a/block/backup.c b/block/backup.c index fec45e8212..feeb9f8bf2 100644 --- a/block/backup.c +++ b/block/backup.c @@ -36,7 +36,7 @@ typedef struct CowRequest { typedef struct BackupBlockJob { BlockJob common; - BlockDriverState *target; + BlockBackend *target; /* bitmap for sync=incremental */ BdrvDirtyBitmap *sync_bitmap; MirrorSyncMode sync_mode; @@ -47,6 +47,7 @@ typedef struct BackupBlockJob { uint64_t sectors_read; unsigned long *done_bitmap; int64_t cluster_size; + NotifierWithReturn before_write; QLIST_HEAD(, CowRequest) inflight_reqs; } BackupBlockJob; @@ -93,12 +94,12 @@ static void cow_request_end(CowRequest *req) qemu_co_queue_restart_all(&req->wait_queue); } -static int coroutine_fn backup_do_cow(BlockDriverState *bs, 
+static int coroutine_fn backup_do_cow(BackupBlockJob *job, int64_t sector_num, int nb_sectors, bool *error_is_read, bool is_write_notifier) { - BackupBlockJob *job = (BackupBlockJob *)bs->job; + BlockBackend *blk = job->common.blk; CowRequest cow_request; struct iovec iov; QEMUIOVector bounce_qiov; @@ -131,20 +132,15 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs, start * sectors_per_cluster); if (!bounce_buffer) { - bounce_buffer = qemu_blockalign(bs, job->cluster_size); + bounce_buffer = blk_blockalign(blk, job->cluster_size); } iov.iov_base = bounce_buffer; iov.iov_len = n * BDRV_SECTOR_SIZE; qemu_iovec_init_external(&bounce_qiov, &iov, 1); - if (is_write_notifier) { - ret = bdrv_co_readv_no_serialising(bs, - start * sectors_per_cluster, - n, &bounce_qiov); - } else { - ret = bdrv_co_readv(bs, start * sectors_per_cluster, n, - &bounce_qiov); - } + ret = blk_co_preadv(blk, start * job->cluster_size, + bounce_qiov.size, &bounce_qiov, + is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0); if (ret < 0) { trace_backup_do_cow_read_fail(job, start, ret); if (error_is_read) { @@ -154,13 +150,11 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs, } if (buffer_is_zero(iov.iov_base, iov.iov_len)) { - ret = bdrv_co_write_zeroes(job->target, - start * sectors_per_cluster, - n, BDRV_REQ_MAY_UNMAP); + ret = blk_co_pwrite_zeroes(job->target, start * job->cluster_size, + bounce_qiov.size, BDRV_REQ_MAY_UNMAP); } else { - ret = bdrv_co_writev(job->target, - start * sectors_per_cluster, n, - &bounce_qiov); + ret = blk_co_pwritev(job->target, start * job->cluster_size, + bounce_qiov.size, &bounce_qiov, 0); } if (ret < 0) { trace_backup_do_cow_write_fail(job, start, ret); @@ -197,14 +191,16 @@ static int coroutine_fn backup_before_write_notify( NotifierWithReturn *notifier, void *opaque) { + BackupBlockJob *job = container_of(notifier, BackupBlockJob, before_write); BdrvTrackedRequest *req = opaque; int64_t sector_num = req->offset >> BDRV_SECTOR_BITS; int 
nb_sectors = req->bytes >> BDRV_SECTOR_BITS; + assert(req->bs == blk_bs(job->common.blk)); assert((req->offset & (BDRV_SECTOR_SIZE - 1)) == 0); assert((req->bytes & (BDRV_SECTOR_SIZE - 1)) == 0); - return backup_do_cow(req->bs, sector_num, nb_sectors, NULL, true); + return backup_do_cow(job, sector_num, nb_sectors, NULL, true); } static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp) @@ -221,7 +217,7 @@ static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp) static void backup_cleanup_sync_bitmap(BackupBlockJob *job, int ret) { BdrvDirtyBitmap *bm; - BlockDriverState *bs = job->common.bs; + BlockDriverState *bs = blk_bs(job->common.blk); if (ret < 0 || block_job_is_cancelled(&job->common)) { /* Merge the successor back into the parent, delete nothing. */ @@ -279,7 +275,7 @@ static void backup_complete(BlockJob *job, void *opaque) BackupBlockJob *s = container_of(job, BackupBlockJob, common); BackupCompleteData *data = opaque; - bdrv_unref(s->target); + blk_unref(s->target); block_job_completed(job, data->ret); g_free(data); @@ -321,7 +317,6 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job) int64_t end; int64_t last_cluster = -1; int64_t sectors_per_cluster = cluster_size_sectors(job); - BlockDriverState *bs = job->common.bs; HBitmapIter hbi; granularity = bdrv_dirty_bitmap_granularity(job->sync_bitmap); @@ -343,7 +338,7 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job) if (yield_and_check(job)) { return ret; } - ret = backup_do_cow(bs, cluster * sectors_per_cluster, + ret = backup_do_cow(job, cluster * sectors_per_cluster, sectors_per_cluster, &error_is_read, false); if ((ret < 0) && @@ -376,11 +371,8 @@ static void coroutine_fn backup_run(void *opaque) { BackupBlockJob *job = opaque; BackupCompleteData *data; - BlockDriverState *bs = job->common.bs; - BlockDriverState *target = job->target; - NotifierWithReturn before_write = { - .notify = backup_before_write_notify, - }; + 
BlockDriverState *bs = blk_bs(job->common.blk); + BlockBackend *target = job->target; int64_t start, end; int64_t sectors_per_cluster = cluster_size_sectors(job); int ret = 0; @@ -393,7 +385,8 @@ static void coroutine_fn backup_run(void *opaque) job->done_bitmap = bitmap_new(end); - bdrv_add_before_write_notifier(bs, &before_write); + job->before_write.notify = backup_before_write_notify; + bdrv_add_before_write_notifier(bs, &job->before_write); if (job->sync_mode == MIRROR_SYNC_MODE_NONE) { while (!block_job_is_cancelled(&job->common)) { @@ -445,7 +438,7 @@ static void coroutine_fn backup_run(void *opaque) } } /* FULL sync mode we copy the whole drive. */ - ret = backup_do_cow(bs, start * sectors_per_cluster, + ret = backup_do_cow(job, start * sectors_per_cluster, sectors_per_cluster, &error_is_read, false); if (ret < 0) { /* Depending on error action, fail now or retry cluster */ @@ -461,14 +454,14 @@ static void coroutine_fn backup_run(void *opaque) } } - notifier_with_return_remove(&before_write); + notifier_with_return_remove(&job->before_write); /* wait until pending backup_do_cow() calls have completed */ qemu_co_rwlock_wrlock(&job->flush_rwlock); qemu_co_rwlock_unlock(&job->flush_rwlock); g_free(job->done_bitmap); - bdrv_op_unblock_all(target, job->common.blocker); + bdrv_op_unblock_all(blk_bs(target), job->common.blocker); data = g_malloc(sizeof(*data)); data->ret = ret; @@ -485,6 +478,7 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target, { int64_t len; BlockDriverInfo bdi; + BackupBlockJob *job = NULL; int ret; assert(bs); @@ -542,15 +536,16 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target, goto error; } - BackupBlockJob *job = block_job_create(&backup_job_driver, bs, speed, - cb, opaque, errp); + job = block_job_create(&backup_job_driver, bs, speed, cb, opaque, errp); if (!job) { goto error; } + job->target = blk_new(); + blk_insert_bs(job->target, target); + job->on_source_error = on_source_error; job->on_target_error 
= on_target_error; - job->target = target; job->sync_mode = sync_mode; job->sync_bitmap = sync_mode == MIRROR_SYNC_MODE_INCREMENTAL ? sync_bitmap : NULL; @@ -558,7 +553,7 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target, /* If there is no backing file on the target, we cannot rely on COW if our * backup cluster size is smaller than the target cluster size. Even for * targets with a backing file, try to avoid COW if possible. */ - ret = bdrv_get_info(job->target, &bdi); + ret = bdrv_get_info(target, &bdi); if (ret < 0 && !target->backing) { error_setg_errno(errp, -ret, "Couldn't determine the cluster size of the target image, " @@ -584,4 +579,8 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target, if (sync_bitmap) { bdrv_reclaim_dirty_bitmap(bs, sync_bitmap, NULL); } + if (job) { + blk_unref(job->target); + block_job_unref(&job->common); + } } diff --git a/block/block-backend.c b/block/block-backend.c index 6928d61de4..34500e6080 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -19,6 +19,7 @@ #include "sysemu/sysemu.h" #include "qapi-event.h" #include "qemu/id.h" +#include "trace.h" /* Number of coroutines to reserve per attached device model */ #define COROUTINE_POOL_RESERVATION 64 @@ -119,12 +120,14 @@ static const BdrvChildRole child_root = { * Store an error through @errp on failure, unless it's null. * Return the new BlockBackend on success, null on failure. */ -BlockBackend *blk_new(Error **errp) +BlockBackend *blk_new(void) { BlockBackend *blk; blk = g_new0(BlockBackend, 1); blk->refcnt = 1; + blk_set_enable_write_cache(blk, true); + qemu_co_queue_init(&blk->public.throttled_reqs[0]); qemu_co_queue_init(&blk->public.throttled_reqs[1]); @@ -136,27 +139,7 @@ BlockBackend *blk_new(Error **errp) } /* - * Create a new BlockBackend with a new BlockDriverState attached. - * Otherwise just like blk_new(), which see. 
- */ -BlockBackend *blk_new_with_bs(Error **errp) -{ - BlockBackend *blk; - BlockDriverState *bs; - - blk = blk_new(errp); - if (!blk) { - return NULL; - } - - bs = bdrv_new_root(); - blk->root = bdrv_root_attach_child(bs, "root", &child_root); - blk->root->opaque = blk; - return blk; -} - -/* - * Calls blk_new_with_bs() and then calls bdrv_open() on the BlockDriverState. + * Creates a new BlockBackend, opens a new BlockDriverState, and connects both. * * Just as with bdrv_open(), after having called this function the reference to * @options belongs to the block layer (even on failure). @@ -171,21 +154,16 @@ BlockBackend *blk_new_open(const char *filename, const char *reference, QDict *options, int flags, Error **errp) { BlockBackend *blk; - int ret; - - blk = blk_new_with_bs(errp); - if (!blk) { - QDECREF(options); - return NULL; - } + BlockDriverState *bs; - ret = bdrv_open(&blk->root->bs, filename, reference, options, flags, errp); - if (ret < 0) { + blk = blk_new(); + bs = bdrv_open(filename, reference, options, flags, errp); + if (!bs) { blk_unref(blk); return NULL; } - blk_set_enable_write_cache(blk, true); + blk->root = bdrv_root_attach_child(bs, "root", &child_root, blk); return blk; } @@ -286,25 +264,11 @@ BlockBackend *blk_next(BlockBackend *blk) : QTAILQ_FIRST(&monitor_block_backends); } -struct BdrvNextIterator { - enum { - BDRV_NEXT_BACKEND_ROOTS, - BDRV_NEXT_MONITOR_OWNED, - } phase; - BlockBackend *blk; - BlockDriverState *bs; -}; - /* Iterates over all top-level BlockDriverStates, i.e. BDSs that are owned by * the monitor or attached to a BlockBackend */ -BdrvNextIterator *bdrv_next(BdrvNextIterator *it, BlockDriverState **bs) +BlockDriverState *bdrv_next(BdrvNextIterator *it) { - if (!it) { - it = g_new(BdrvNextIterator, 1); - *it = (BdrvNextIterator) { - .phase = BDRV_NEXT_BACKEND_ROOTS, - }; - } + BlockDriverState *bs; /* First, return all root nodes of BlockBackends. 
In order to avoid * returning a BDS twice when multiple BBs refer to it, we only return it @@ -312,11 +276,11 @@ BdrvNextIterator *bdrv_next(BdrvNextIterator *it, BlockDriverState **bs) if (it->phase == BDRV_NEXT_BACKEND_ROOTS) { do { it->blk = blk_all_next(it->blk); - *bs = it->blk ? blk_bs(it->blk) : NULL; - } while (it->blk && (*bs == NULL || bdrv_first_blk(*bs) != it->blk)); + bs = it->blk ? blk_bs(it->blk) : NULL; + } while (it->blk && (bs == NULL || bdrv_first_blk(bs) != it->blk)); - if (*bs) { - return it; + if (bs) { + return bs; } it->phase = BDRV_NEXT_MONITOR_OWNED; } @@ -326,10 +290,19 @@ BdrvNextIterator *bdrv_next(BdrvNextIterator *it, BlockDriverState **bs) * by the above block already */ do { it->bs = bdrv_next_monitor_owned(it->bs); - *bs = it->bs; - } while (*bs && bdrv_has_blk(*bs)); + bs = it->bs; + } while (bs && bdrv_has_blk(bs)); - return *bs ? it : NULL; + return bs; +} + +BlockDriverState *bdrv_first(BdrvNextIterator *it) +{ + *it = (BdrvNextIterator) { + .phase = BDRV_NEXT_BACKEND_ROOTS, + }; + + return bdrv_next(it); } /* @@ -509,8 +482,7 @@ void blk_remove_bs(BlockBackend *blk) void blk_insert_bs(BlockBackend *blk, BlockDriverState *bs) { bdrv_ref(bs); - blk->root = bdrv_root_attach_child(bs, "root", &child_root); - blk->root->opaque = blk; + blk->root = bdrv_root_attach_child(bs, "root", &child_root, blk); notifier_list_notify(&blk->insert_bs_notifiers, blk); if (blk->public.throttle_state) { @@ -770,11 +742,15 @@ static int blk_check_request(BlockBackend *blk, int64_t sector_num, nb_sectors * BDRV_SECTOR_SIZE); } -static int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset, - unsigned int bytes, QEMUIOVector *qiov, - BdrvRequestFlags flags) +int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset, + unsigned int bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags) { - int ret = blk_check_byte_request(blk, offset, bytes); + int ret; + + trace_blk_co_preadv(blk, blk_bs(blk), offset, bytes, flags); + + ret = 
blk_check_byte_request(blk, offset, bytes); if (ret < 0) { return ret; } @@ -787,12 +763,14 @@ static int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset, return bdrv_co_preadv(blk_bs(blk), offset, bytes, qiov, flags); } -static int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset, - unsigned int bytes, QEMUIOVector *qiov, - BdrvRequestFlags flags) +int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset, + unsigned int bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags) { int ret; + trace_blk_co_pwritev(blk, blk_bs(blk), offset, bytes, flags); + ret = blk_check_byte_request(blk, offset, bytes); if (ret < 0) { return ret; @@ -885,8 +863,8 @@ int blk_pread_unthrottled(BlockBackend *blk, int64_t offset, uint8_t *buf, return ret; } -int blk_write_zeroes(BlockBackend *blk, int64_t offset, - int count, BdrvRequestFlags flags) +int blk_pwrite_zeroes(BlockBackend *blk, int64_t offset, + int count, BdrvRequestFlags flags) { return blk_prw(blk, offset, NULL, count, blk_write_entry, flags | BDRV_REQ_ZERO_WRITE); @@ -1001,9 +979,9 @@ static void blk_aio_write_entry(void *opaque) blk_aio_complete(acb); } -BlockAIOCB *blk_aio_write_zeroes(BlockBackend *blk, int64_t offset, - int count, BdrvRequestFlags flags, - BlockCompletionFunc *cb, void *opaque) +BlockAIOCB *blk_aio_pwrite_zeroes(BlockBackend *blk, int64_t offset, + int count, BdrvRequestFlags flags, + BlockCompletionFunc *cb, void *opaque) { return blk_aio_prwv(blk, offset, count, NULL, blk_aio_write_entry, flags | BDRV_REQ_ZERO_WRITE, cb, opaque); @@ -1492,8 +1470,8 @@ void *blk_aio_get(const AIOCBInfo *aiocb_info, BlockBackend *blk, return qemu_aio_get(aiocb_info, blk_bs(blk), cb, opaque); } -int coroutine_fn blk_co_write_zeroes(BlockBackend *blk, int64_t offset, - int count, BdrvRequestFlags flags) +int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset, + int count, BdrvRequestFlags flags) { return blk_co_pwritev(blk, offset, count, NULL, flags | 
BDRV_REQ_ZERO_WRITE); @@ -1704,6 +1682,9 @@ static void blk_root_drained_begin(BdrvChild *child) { BlockBackend *blk = child->opaque; + /* Note that blk->root may not be accessible here yet if we are just + * attaching to a BlockDriverState that is drained. Use child instead. */ + if (blk->public.io_limits_disabled++ == 0) { throttle_group_restart_blk(blk); } diff --git a/block/commit.c b/block/commit.c index f308c8c6f0..8a00e1146c 100644 --- a/block/commit.c +++ b/block/commit.c @@ -36,28 +36,36 @@ typedef struct CommitBlockJob { BlockJob common; RateLimit limit; BlockDriverState *active; - BlockDriverState *top; - BlockDriverState *base; + BlockBackend *top; + BlockBackend *base; BlockdevOnError on_error; int base_flags; int orig_overlay_flags; char *backing_file_str; } CommitBlockJob; -static int coroutine_fn commit_populate(BlockDriverState *bs, - BlockDriverState *base, +static int coroutine_fn commit_populate(BlockBackend *bs, BlockBackend *base, int64_t sector_num, int nb_sectors, void *buf) { int ret = 0; + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = buf, + .iov_len = nb_sectors * BDRV_SECTOR_SIZE, + }; - ret = bdrv_read(bs, sector_num, buf, nb_sectors); - if (ret) { + qemu_iovec_init_external(&qiov, &iov, 1); + + ret = blk_co_preadv(bs, sector_num * BDRV_SECTOR_SIZE, + qiov.size, &qiov, 0); + if (ret < 0) { return ret; } - ret = bdrv_write(base, sector_num, buf, nb_sectors); - if (ret) { + ret = blk_co_pwritev(base, sector_num * BDRV_SECTOR_SIZE, + qiov.size, &qiov, 0); + if (ret < 0) { return ret; } @@ -73,8 +81,8 @@ static void commit_complete(BlockJob *job, void *opaque) CommitBlockJob *s = container_of(job, CommitBlockJob, common); CommitCompleteData *data = opaque; BlockDriverState *active = s->active; - BlockDriverState *top = s->top; - BlockDriverState *base = s->base; + BlockDriverState *top = blk_bs(s->top); + BlockDriverState *base = blk_bs(s->base); BlockDriverState *overlay_bs; int ret = data->ret; @@ -94,6 +102,8 @@ static void 
commit_complete(BlockJob *job, void *opaque) bdrv_reopen(overlay_bs, s->orig_overlay_flags, NULL); } g_free(s->backing_file_str); + blk_unref(s->top); + blk_unref(s->base); block_job_completed(&s->common, ret); g_free(data); } @@ -102,8 +112,6 @@ static void coroutine_fn commit_run(void *opaque) { CommitBlockJob *s = opaque; CommitCompleteData *data; - BlockDriverState *top = s->top; - BlockDriverState *base = s->base; int64_t sector_num, end; int ret = 0; int n = 0; @@ -111,27 +119,27 @@ static void coroutine_fn commit_run(void *opaque) int bytes_written = 0; int64_t base_len; - ret = s->common.len = bdrv_getlength(top); + ret = s->common.len = blk_getlength(s->top); if (s->common.len < 0) { goto out; } - ret = base_len = bdrv_getlength(base); + ret = base_len = blk_getlength(s->base); if (base_len < 0) { goto out; } if (base_len < s->common.len) { - ret = bdrv_truncate(base, s->common.len); + ret = blk_truncate(s->base, s->common.len); if (ret) { goto out; } } end = s->common.len >> BDRV_SECTOR_BITS; - buf = qemu_blockalign(top, COMMIT_BUFFER_SIZE); + buf = blk_blockalign(s->top, COMMIT_BUFFER_SIZE); for (sector_num = 0; sector_num < end; sector_num += n) { uint64_t delay_ns = 0; @@ -146,7 +154,8 @@ wait: break; } /* Copy if allocated above the base */ - ret = bdrv_is_allocated_above(top, base, sector_num, + ret = bdrv_is_allocated_above(blk_bs(s->top), blk_bs(s->base), + sector_num, COMMIT_BUFFER_SIZE / BDRV_SECTOR_SIZE, &n); copy = (ret == 1); @@ -158,7 +167,7 @@ wait: goto wait; } } - ret = commit_populate(top, base, sector_num, n, buf); + ret = commit_populate(s->top, s->base, sector_num, n, buf); bytes_written += n * BDRV_SECTOR_SIZE; } if (ret < 0) { @@ -253,8 +262,12 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base, return; } - s->base = base; - s->top = top; + s->base = blk_new(); + blk_insert_bs(s->base, base); + + s->top = blk_new(); + blk_insert_bs(s->top, top); + s->active = bs; s->base_flags = orig_base_flags; diff --git a/block/io.c 
b/block/io.c index 60a6bd8bdb..2d832aa532 100644 --- a/block/io.c +++ b/block/io.c @@ -225,6 +225,34 @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs) assert(data.done); } +void bdrv_drained_begin(BlockDriverState *bs) +{ + if (!bs->quiesce_counter++) { + aio_disable_external(bdrv_get_aio_context(bs)); + bdrv_parent_drained_begin(bs); + } + + bdrv_io_unplugged_begin(bs); + bdrv_drain_recurse(bs); + if (qemu_in_coroutine()) { + bdrv_co_yield_to_drain(bs); + } else { + bdrv_drain_poll(bs); + } + bdrv_io_unplugged_end(bs); +} + +void bdrv_drained_end(BlockDriverState *bs) +{ + assert(bs->quiesce_counter > 0); + if (--bs->quiesce_counter > 0) { + return; + } + + bdrv_parent_drained_end(bs); + aio_enable_external(bdrv_get_aio_context(bs)); +} + /* * Wait for pending requests to complete on a single BlockDriverState subtree, * and suspend block driver's internal I/O until next request arrives. @@ -238,26 +266,15 @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs) */ void coroutine_fn bdrv_co_drain(BlockDriverState *bs) { - bdrv_parent_drained_begin(bs); - bdrv_io_unplugged_begin(bs); - bdrv_drain_recurse(bs); - bdrv_co_yield_to_drain(bs); - bdrv_io_unplugged_end(bs); - bdrv_parent_drained_end(bs); + assert(qemu_in_coroutine()); + bdrv_drained_begin(bs); + bdrv_drained_end(bs); } void bdrv_drain(BlockDriverState *bs) { - bdrv_parent_drained_begin(bs); - bdrv_io_unplugged_begin(bs); - bdrv_drain_recurse(bs); - if (qemu_in_coroutine()) { - bdrv_co_yield_to_drain(bs); - } else { - bdrv_drain_poll(bs); - } - bdrv_io_unplugged_end(bs); - bdrv_parent_drained_end(bs); + bdrv_drained_begin(bs); + bdrv_drained_end(bs); } /* @@ -271,10 +288,10 @@ void bdrv_drain_all(void) /* Always run first iteration so any pending completion BHs run */ bool busy = true; BlockDriverState *bs; - BdrvNextIterator *it = NULL; + BdrvNextIterator it; GSList *aio_ctxs = NULL, *ctx; - while ((it = bdrv_next(it, &bs))) { + for (bs = bdrv_first(&it); bs; bs = 
bdrv_next(&it)) { AioContext *aio_context = bdrv_get_aio_context(bs); aio_context_acquire(aio_context); @@ -302,10 +319,9 @@ void bdrv_drain_all(void) for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) { AioContext *aio_context = ctx->data; - it = NULL; aio_context_acquire(aio_context); - while ((it = bdrv_next(it, &bs))) { + for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { if (aio_context == bdrv_get_aio_context(bs)) { if (bdrv_requests_pending(bs)) { busy = true; @@ -318,8 +334,7 @@ void bdrv_drain_all(void) } } - it = NULL; - while ((it = bdrv_next(it, &bs))) { + for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { AioContext *aio_context = bdrv_get_aio_context(bs); aio_context_acquire(aio_context); @@ -1093,24 +1108,6 @@ int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0); } -int coroutine_fn bdrv_co_readv_no_serialising(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) -{ - trace_bdrv_co_readv_no_serialising(bs, sector_num, nb_sectors); - - return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, - BDRV_REQ_NO_SERIALISING); -} - -int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) -{ - trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors); - - return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, - BDRV_REQ_COPY_ON_READ); -} - #define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, @@ -2543,23 +2540,3 @@ void bdrv_io_unplugged_end(BlockDriverState *bs) } } } - -void bdrv_drained_begin(BlockDriverState *bs) -{ - if (!bs->quiesce_counter++) { - aio_disable_external(bdrv_get_aio_context(bs)); - } - bdrv_parent_drained_begin(bs); - bdrv_drain(bs); -} - -void bdrv_drained_end(BlockDriverState *bs) -{ - bdrv_parent_drained_end(bs); - - assert(bs->quiesce_counter > 0); - if (--bs->quiesce_counter > 0) { - return; - } - 
aio_enable_external(bdrv_get_aio_context(bs)); -} diff --git a/block/mirror.c b/block/mirror.c index b9986d8218..80fd3c7469 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -20,7 +20,6 @@ #include "qapi/qmp/qerror.h" #include "qemu/ratelimit.h" #include "qemu/bitmap.h" -#include "qemu/error-report.h" #define SLICE_TIME 100000000ULL /* ns */ #define MAX_IN_FLIGHT 16 @@ -36,7 +35,7 @@ typedef struct MirrorBuffer { typedef struct MirrorBlockJob { BlockJob common; RateLimit limit; - BlockDriverState *target; + BlockBackend *target; BlockDriverState *base; /* The name of the graph node to replace */ char *replaces; @@ -157,7 +156,8 @@ static void mirror_read_complete(void *opaque, int ret) mirror_iteration_done(op, ret); return; } - bdrv_aio_writev(s->target, op->sector_num, &op->qiov, op->nb_sectors, + blk_aio_pwritev(s->target, op->sector_num * BDRV_SECTOR_SIZE, &op->qiov, + op->nb_sectors * BDRV_SECTOR_SIZE, mirror_write_complete, op); } @@ -186,7 +186,7 @@ static int mirror_cow_align(MirrorBlockJob *s, need_cow |= !test_bit((*sector_num + *nb_sectors - 1) / chunk_sectors, s->cow_bitmap); if (need_cow) { - bdrv_round_to_clusters(s->target, *sector_num, *nb_sectors, + bdrv_round_to_clusters(blk_bs(s->target), *sector_num, *nb_sectors, &align_sector_num, &align_nb_sectors); } @@ -224,7 +224,7 @@ static inline void mirror_wait_for_io(MirrorBlockJob *s) static int mirror_do_read(MirrorBlockJob *s, int64_t sector_num, int nb_sectors) { - BlockDriverState *source = s->common.bs; + BlockBackend *source = s->common.blk; int sectors_per_chunk, nb_chunks; int ret = nb_sectors; MirrorOp *op; @@ -274,7 +274,8 @@ static int mirror_do_read(MirrorBlockJob *s, int64_t sector_num, s->sectors_in_flight += nb_sectors; trace_mirror_one_iteration(s, sector_num, nb_sectors); - bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors, + blk_aio_preadv(source, sector_num * BDRV_SECTOR_SIZE, &op->qiov, + nb_sectors * BDRV_SECTOR_SIZE, mirror_read_complete, op); return ret; } @@ -296,10 
+297,11 @@ static void mirror_do_zero_or_discard(MirrorBlockJob *s, s->in_flight++; s->sectors_in_flight += nb_sectors; if (is_discard) { - bdrv_aio_discard(s->target, sector_num, op->nb_sectors, - mirror_write_complete, op); + blk_aio_discard(s->target, sector_num, op->nb_sectors, + mirror_write_complete, op); } else { - bdrv_aio_write_zeroes(s->target, sector_num, op->nb_sectors, + blk_aio_pwrite_zeroes(s->target, sector_num * BDRV_SECTOR_SIZE, + op->nb_sectors * BDRV_SECTOR_SIZE, s->unmap ? BDRV_REQ_MAY_UNMAP : 0, mirror_write_complete, op); } @@ -307,7 +309,7 @@ static void mirror_do_zero_or_discard(MirrorBlockJob *s, static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) { - BlockDriverState *source = s->common.bs; + BlockDriverState *source = blk_bs(s->common.blk); int64_t sector_num, first_chunk; uint64_t delay_ns = 0; /* At least the first dirty chunk is mirrored in one iteration. */ @@ -384,7 +386,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) } else if (ret >= 0 && !(ret & BDRV_BLOCK_DATA)) { int64_t target_sector_num; int target_nb_sectors; - bdrv_round_to_clusters(s->target, sector_num, io_sectors, + bdrv_round_to_clusters(blk_bs(s->target), sector_num, io_sectors, &target_sector_num, &target_nb_sectors); if (target_sector_num == sector_num && target_nb_sectors == io_sectors) { @@ -449,7 +451,8 @@ static void mirror_exit(BlockJob *job, void *opaque) MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); MirrorExitData *data = opaque; AioContext *replace_aio_context = NULL; - BlockDriverState *src = s->common.bs; + BlockDriverState *src = blk_bs(s->common.blk); + BlockDriverState *target_bs = blk_bs(s->target); /* Make sure that the source BDS doesn't go away before we called * block_job_completed(). 
*/ @@ -461,26 +464,25 @@ static void mirror_exit(BlockJob *job, void *opaque) } if (s->should_complete && data->ret == 0) { - BlockDriverState *to_replace = s->common.bs; + BlockDriverState *to_replace = src; if (s->to_replace) { to_replace = s->to_replace; } - /* This was checked in mirror_start_job(), but meanwhile one of the - * nodes could have been newly attached to a BlockBackend. */ - if (bdrv_has_blk(to_replace) && bdrv_has_blk(s->target)) { - error_report("block job: Can't create node with two BlockBackends"); - data->ret = -EINVAL; - goto out; + if (bdrv_get_flags(target_bs) != bdrv_get_flags(to_replace)) { + bdrv_reopen(target_bs, bdrv_get_flags(to_replace), NULL); } - if (bdrv_get_flags(s->target) != bdrv_get_flags(to_replace)) { - bdrv_reopen(s->target, bdrv_get_flags(to_replace), NULL); - } - bdrv_replace_in_backing_chain(to_replace, s->target); - } + /* The mirror job has no requests in flight any more, but we need to + * drain potential other users of the BDS before changing the graph. 
*/ + bdrv_drained_begin(target_bs); + bdrv_replace_in_backing_chain(to_replace, target_bs); + bdrv_drained_end(target_bs); -out: + /* We just changed the BDS the job BB refers to */ + blk_remove_bs(job->blk); + blk_insert_bs(job->blk, src); + } if (s->to_replace) { bdrv_op_unblock_all(s->to_replace, s->replace_blocker); error_free(s->replace_blocker); @@ -490,8 +492,8 @@ out: aio_context_release(replace_aio_context); } g_free(s->replaces); - bdrv_op_unblock_all(s->target, s->common.blocker); - bdrv_unref(s->target); + bdrv_op_unblock_all(target_bs, s->common.blocker); + blk_unref(s->target); block_job_completed(&s->common, data->ret); g_free(data); bdrv_drained_end(src); @@ -505,7 +507,8 @@ static void coroutine_fn mirror_run(void *opaque) { MirrorBlockJob *s = opaque; MirrorExitData *data; - BlockDriverState *bs = s->common.bs; + BlockDriverState *bs = blk_bs(s->common.blk); + BlockDriverState *target_bs = blk_bs(s->target); int64_t sector_num, end, length; uint64_t last_pause_ns; BlockDriverInfo bdi; @@ -541,18 +544,18 @@ static void coroutine_fn mirror_run(void *opaque) * the destination do COW. Instead, we copy sectors around the * dirty data if needed. We need a bitmap to do that. 
*/ - bdrv_get_backing_filename(s->target, backing_filename, + bdrv_get_backing_filename(target_bs, backing_filename, sizeof(backing_filename)); - if (!bdrv_get_info(s->target, &bdi) && bdi.cluster_size) { + if (!bdrv_get_info(target_bs, &bdi) && bdi.cluster_size) { target_cluster_size = bdi.cluster_size; } - if (backing_filename[0] && !s->target->backing + if (backing_filename[0] && !target_bs->backing && s->granularity < target_cluster_size) { s->buf_size = MAX(s->buf_size, target_cluster_size); s->cow_bitmap = bitmap_new(length); } s->target_cluster_sectors = target_cluster_size >> BDRV_SECTOR_BITS; - s->max_iov = MIN(s->common.bs->bl.max_iov, s->target->bl.max_iov); + s->max_iov = MIN(bs->bl.max_iov, target_bs->bl.max_iov); end = s->bdev_length / BDRV_SECTOR_SIZE; s->buf = qemu_try_blockalign(bs, s->buf_size); @@ -567,7 +570,7 @@ static void coroutine_fn mirror_run(void *opaque) if (!s->is_none_mode) { /* First part, loop on the sectors and initialize the dirty bitmap. */ BlockDriverState *base = s->base; - bool mark_all_dirty = s->base == NULL && !bdrv_has_zero_init(s->target); + bool mark_all_dirty = s->base == NULL && !bdrv_has_zero_init(target_bs); for (sector_num = 0; sector_num < end; ) { /* Just to make sure we are not exceeding int limit. */ @@ -637,7 +640,7 @@ static void coroutine_fn mirror_run(void *opaque) should_complete = false; if (s->in_flight == 0 && cnt == 0) { trace_mirror_before_flush(s); - ret = bdrv_flush(s->target); + ret = blk_flush(s->target); if (ret < 0) { if (mirror_error_action(s, false, -ret) == BLOCK_ERROR_ACTION_REPORT) { @@ -715,7 +718,7 @@ immediate_exit: data->ret = ret; /* Before we switch to target in mirror_exit, make sure data doesn't * change. */ - bdrv_drained_begin(s->common.bs); + bdrv_drained_begin(bs); if (qemu_get_aio_context() == bdrv_get_aio_context(bs)) { /* FIXME: virtio host notifiers run on iohandler_ctx, therefore the * above bdrv_drained_end isn't enough to quiesce it. 
This is ugly, we @@ -742,7 +745,8 @@ static void mirror_complete(BlockJob *job, Error **errp) Error *local_err = NULL; int ret; - ret = bdrv_open_backing_file(s->target, NULL, "backing", &local_err); + ret = bdrv_open_backing_file(blk_bs(s->target), NULL, "backing", + &local_err); if (ret < 0) { error_propagate(errp, local_err); return; @@ -804,7 +808,6 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, bool is_none_mode, BlockDriverState *base) { MirrorBlockJob *s; - BlockDriverState *replaced_bs; if (granularity == 0) { granularity = bdrv_get_default_bitmap_granularity(target); @@ -821,30 +824,17 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, buf_size = DEFAULT_MIRROR_BUF_SIZE; } - /* We can't support this case as long as the block layer can't handle - * multiple BlockBackends per BlockDriverState. */ - if (replaces) { - replaced_bs = bdrv_lookup_bs(replaces, replaces, errp); - if (replaced_bs == NULL) { - return; - } - } else { - replaced_bs = bs; - } - if (bdrv_has_blk(replaced_bs) && bdrv_has_blk(target)) { - error_setg(errp, "Can't create node with two BlockBackends"); - return; - } - s = block_job_create(driver, bs, speed, cb, opaque, errp); if (!s) { return; } + s->target = blk_new(); + blk_insert_bs(s->target, target); + s->replaces = g_strdup(replaces); s->on_source_error = on_source_error; s->on_target_error = on_target_error; - s->target = target; s->is_none_mode = is_none_mode; s->base = base; s->granularity = granularity; @@ -854,11 +844,12 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp); if (!s->dirty_bitmap) { g_free(s->replaces); + blk_unref(s->target); block_job_unref(&s->common); return; } - bdrv_op_block_all(s->target, s->common.blocker); + bdrv_op_block_all(target, s->common.blocker); s->common.co = qemu_coroutine_create(mirror_run); trace_mirror_start(bs, s, s->common.co, opaque); 
@@ -931,7 +922,6 @@ void commit_active_start(BlockDriverState *bs, BlockDriverState *base, } } - bdrv_ref(base); mirror_start_job(bs, base, NULL, speed, 0, 0, on_error, on_error, false, cb, opaque, &local_err, &commit_active_job_driver, false, base); diff --git a/block/parallels.c b/block/parallels.c index 88cfacebe3..99fc0f77ef 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -517,8 +517,8 @@ static int parallels_create(const char *filename, QemuOpts *opts, Error **errp) if (ret < 0) { goto exit; } - ret = blk_write_zeroes(file, BDRV_SECTOR_SIZE, - (bat_sectors - 1) << BDRV_SECTOR_BITS, 0); + ret = blk_pwrite_zeroes(file, BDRV_SECTOR_SIZE, + (bat_sectors - 1) << BDRV_SECTOR_BITS, 0); if (ret < 0) { goto exit; } diff --git a/block/snapshot.c b/block/snapshot.c index 3917ec5c91..6e6e34fcf4 100644 --- a/block/snapshot.c +++ b/block/snapshot.c @@ -374,9 +374,9 @@ bool bdrv_all_can_snapshot(BlockDriverState **first_bad_bs) { bool ok = true; BlockDriverState *bs; - BdrvNextIterator *it = NULL; + BdrvNextIterator it; - while (ok && (it = bdrv_next(it, &bs))) { + for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { AioContext *ctx = bdrv_get_aio_context(bs); aio_context_acquire(ctx); @@ -384,8 +384,12 @@ bool bdrv_all_can_snapshot(BlockDriverState **first_bad_bs) ok = bdrv_can_snapshot(bs); } aio_context_release(ctx); + if (!ok) { + goto fail; + } } +fail: *first_bad_bs = bs; return ok; } @@ -395,20 +399,27 @@ int bdrv_all_delete_snapshot(const char *name, BlockDriverState **first_bad_bs, { int ret = 0; BlockDriverState *bs; - BdrvNextIterator *it = NULL; + BdrvNextIterator it; QEMUSnapshotInfo sn1, *snapshot = &sn1; - while (ret == 0 && (it = bdrv_next(it, &bs))) { + for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { AioContext *ctx = bdrv_get_aio_context(bs); aio_context_acquire(ctx); if (bdrv_can_snapshot(bs) && bdrv_snapshot_find(bs, snapshot, name) >= 0) { ret = bdrv_snapshot_delete_by_id_or_name(bs, name, err); + if (ret < 0) { + goto fail; + } } 
aio_context_release(ctx); + if (ret < 0) { + goto fail; + } } +fail: *first_bad_bs = bs; return ret; } @@ -418,9 +429,9 @@ int bdrv_all_goto_snapshot(const char *name, BlockDriverState **first_bad_bs) { int err = 0; BlockDriverState *bs; - BdrvNextIterator *it = NULL; + BdrvNextIterator it; - while (err == 0 && (it = bdrv_next(it, &bs))) { + for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { AioContext *ctx = bdrv_get_aio_context(bs); aio_context_acquire(ctx); @@ -428,8 +439,12 @@ int bdrv_all_goto_snapshot(const char *name, BlockDriverState **first_bad_bs) err = bdrv_snapshot_goto(bs, name); } aio_context_release(ctx); + if (err < 0) { + goto fail; + } } +fail: *first_bad_bs = bs; return err; } @@ -439,9 +454,9 @@ int bdrv_all_find_snapshot(const char *name, BlockDriverState **first_bad_bs) QEMUSnapshotInfo sn; int err = 0; BlockDriverState *bs; - BdrvNextIterator *it = NULL; + BdrvNextIterator it; - while (err == 0 && (it = bdrv_next(it, &bs))) { + for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { AioContext *ctx = bdrv_get_aio_context(bs); aio_context_acquire(ctx); @@ -449,8 +464,12 @@ int bdrv_all_find_snapshot(const char *name, BlockDriverState **first_bad_bs) err = bdrv_snapshot_find(bs, &sn, name); } aio_context_release(ctx); + if (err < 0) { + goto fail; + } } +fail: *first_bad_bs = bs; return err; } @@ -462,9 +481,9 @@ int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn, { int err = 0; BlockDriverState *bs; - BdrvNextIterator *it = NULL; + BdrvNextIterator it; - while (err == 0 && (it = bdrv_next(it, &bs))) { + for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { AioContext *ctx = bdrv_get_aio_context(bs); aio_context_acquire(ctx); @@ -476,24 +495,32 @@ int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn, err = bdrv_snapshot_create(bs, sn); } aio_context_release(ctx); + if (err < 0) { + goto fail; + } } +fail: *first_bad_bs = bs; return err; } BlockDriverState *bdrv_all_find_vmstate_bs(void) { - bool not_found = true; BlockDriverState *bs; - 
BdrvNextIterator *it = NULL; + BdrvNextIterator it; - while (not_found && (it = bdrv_next(it, &bs))) { + for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { AioContext *ctx = bdrv_get_aio_context(bs); + bool found; aio_context_acquire(ctx); - not_found = !bdrv_can_snapshot(bs); + found = bdrv_can_snapshot(bs); aio_context_release(ctx); + + if (found) { + break; + } } return bs; } diff --git a/block/stream.c b/block/stream.c index 40aa32212e..c0efbda34e 100644 --- a/block/stream.c +++ b/block/stream.c @@ -39,7 +39,7 @@ typedef struct StreamBlockJob { char *backing_file_str; } StreamBlockJob; -static int coroutine_fn stream_populate(BlockDriverState *bs, +static int coroutine_fn stream_populate(BlockBackend *blk, int64_t sector_num, int nb_sectors, void *buf) { @@ -52,7 +52,8 @@ static int coroutine_fn stream_populate(BlockDriverState *bs, qemu_iovec_init_external(&qiov, &iov, 1); /* Copy-on-read the unallocated clusters */ - return bdrv_co_copy_on_readv(bs, sector_num, nb_sectors, &qiov); + return blk_co_preadv(blk, sector_num * BDRV_SECTOR_SIZE, qiov.size, &qiov, + BDRV_REQ_COPY_ON_READ); } typedef struct { @@ -64,6 +65,7 @@ static void stream_complete(BlockJob *job, void *opaque) { StreamBlockJob *s = container_of(job, StreamBlockJob, common); StreamCompleteData *data = opaque; + BlockDriverState *bs = blk_bs(job->blk); BlockDriverState *base = s->base; if (!block_job_is_cancelled(&s->common) && data->reached_end && @@ -75,8 +77,8 @@ static void stream_complete(BlockJob *job, void *opaque) base_fmt = base->drv->format_name; } } - data->ret = bdrv_change_backing_file(job->bs, base_id, base_fmt); - bdrv_set_backing_hd(job->bs, base); + data->ret = bdrv_change_backing_file(bs, base_id, base_fmt); + bdrv_set_backing_hd(bs, base); } g_free(s->backing_file_str); @@ -88,7 +90,8 @@ static void coroutine_fn stream_run(void *opaque) { StreamBlockJob *s = opaque; StreamCompleteData *data; - BlockDriverState *bs = s->common.bs; + BlockBackend *blk = s->common.blk; + 
BlockDriverState *bs = blk_bs(blk); BlockDriverState *base = s->base; int64_t sector_num = 0; int64_t end = -1; @@ -159,7 +162,7 @@ wait: goto wait; } } - ret = stream_populate(bs, sector_num, n, buf); + ret = stream_populate(blk, sector_num, n, buf); } if (ret < 0) { BlockErrorAction action = diff --git a/block/vvfat.c b/block/vvfat.c index 3e484a1dcc..a39dbe67e2 100644 --- a/block/vvfat.c +++ b/block/vvfat.c @@ -2998,12 +2998,12 @@ static int enable_write_target(BDRVVVFATState *s, Error **errp) goto err; } - s->qcow = NULL; options = qdict_new(); qdict_put(options, "driver", qstring_from_str("qcow")); - ret = bdrv_open(&s->qcow, s->qcow_filename, NULL, options, - BDRV_O_RDWR | BDRV_O_NO_FLUSH, errp); - if (ret < 0) { + s->qcow = bdrv_open(s->qcow_filename, NULL, options, + BDRV_O_RDWR | BDRV_O_NO_FLUSH, errp); + if (!s->qcow) { + ret = -EINVAL; goto err; } diff --git a/blockdev.c b/blockdev.c index 40e4e6fc6f..717785eb8d 100644 --- a/blockdev.c +++ b/blockdev.c @@ -567,11 +567,7 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts, if ((!file || !*file) && !qdict_size(bs_opts)) { BlockBackendRootState *blk_rs; - blk = blk_new(errp); - if (!blk) { - goto early_err; - } - + blk = blk_new(); blk_rs = blk_get_root_state(blk); blk_rs->open_flags = bdrv_flags; blk_rs->read_only = !(bdrv_flags & BDRV_O_RDWR); @@ -657,7 +653,6 @@ static BlockDriverState *bds_tree_init(QDict *bs_opts, Error **errp) QemuOpts *opts; Error *local_error = NULL; BlockdevDetectZeroesOptions detect_zeroes; - int ret; int bdrv_flags = 0; opts = qemu_opts_create(&qemu_root_bds_opts, NULL, 1, errp); @@ -688,9 +683,8 @@ static BlockDriverState *bds_tree_init(QDict *bs_opts, Error **errp) bdrv_flags |= BDRV_O_INACTIVE; } - bs = NULL; - ret = bdrv_open(&bs, NULL, NULL, bs_opts, bdrv_flags, errp); - if (ret < 0) { + bs = bdrv_open(NULL, NULL, bs_opts, bdrv_flags, errp); + if (!bs) { goto fail_no_bs_opts; } @@ -1643,7 +1637,7 @@ typedef struct ExternalSnapshotState { static void 
external_snapshot_prepare(BlkActionState *common, Error **errp) { - int flags = 0, ret; + int flags = 0; QDict *options = NULL; Error *local_err = NULL; /* Device and node name of the image to generate the snapshot from */ @@ -1768,11 +1762,10 @@ static void external_snapshot_prepare(BlkActionState *common, flags |= BDRV_O_NO_BACKING; } - assert(state->new_bs == NULL); - ret = bdrv_open(&state->new_bs, new_image_file, snapshot_ref, options, - flags, errp); + state->new_bs = bdrv_open(new_image_file, snapshot_ref, options, flags, + errp); /* We will manually add the backing_hd field to the bs later */ - if (ret != 0) { + if (!state->new_bs) { return; } @@ -2540,7 +2533,7 @@ void qmp_blockdev_change_medium(const char *device, const char *filename, { BlockBackend *blk; BlockDriverState *medium_bs = NULL; - int bdrv_flags, ret; + int bdrv_flags; QDict *options = NULL; Error *err = NULL; @@ -2584,9 +2577,8 @@ void qmp_blockdev_change_medium(const char *device, const char *filename, qdict_put(options, "driver", qstring_from_str(format)); } - assert(!medium_bs); - ret = bdrv_open(&medium_bs, filename, NULL, options, bdrv_flags, errp); - if (ret < 0) { + medium_bs = bdrv_open(filename, NULL, options, bdrv_flags, errp); + if (!medium_bs) { goto fail; } @@ -3199,7 +3191,6 @@ static void do_drive_backup(const char *device, const char *target, Error *local_err = NULL; int flags; int64_t size; - int ret; if (!has_speed) { speed = 0; @@ -3283,10 +3274,8 @@ static void do_drive_backup(const char *device, const char *target, qdict_put(options, "driver", qstring_from_str(format)); } - target_bs = NULL; - ret = bdrv_open(&target_bs, target, NULL, options, flags, &local_err); - if (ret < 0) { - error_propagate(errp, local_err); + target_bs = bdrv_open(target, NULL, options, flags, errp); + if (!target_bs) { goto out; } @@ -3304,8 +3293,8 @@ static void do_drive_backup(const char *device, const char *target, backup_start(bs, target_bs, speed, sync, bmap, on_source_error, 
on_target_error, block_job_cb, bs, txn, &local_err); + bdrv_unref(target_bs); if (local_err != NULL) { - bdrv_unref(target_bs); error_propagate(errp, local_err); goto out; } @@ -3389,12 +3378,10 @@ void do_blockdev_backup(const char *device, const char *target, } target_bs = blk_bs(target_blk); - bdrv_ref(target_bs); bdrv_set_aio_context(target_bs, aio_context); backup_start(bs, target_bs, speed, sync, NULL, on_source_error, on_target_error, block_job_cb, bs, txn, &local_err); if (local_err != NULL) { - bdrv_unref(target_bs); error_propagate(errp, local_err); } out: @@ -3470,10 +3457,6 @@ static void blockdev_mirror_common(BlockDriverState *bs, if (bdrv_op_is_blocked(target, BLOCK_OP_TYPE_MIRROR_TARGET, errp)) { return; } - if (bdrv_has_blk(target)) { - error_setg(errp, "Cannot mirror to an attached block device"); - return; - } if (!bs->backing && sync == MIRROR_SYNC_MODE_TOP) { sync = MIRROR_SYNC_MODE_FULL; @@ -3511,7 +3494,6 @@ void qmp_drive_mirror(const char *device, const char *target, QDict *options = NULL; int flags; int64_t size; - int ret; blk = blk_by_name(device); if (!blk) { @@ -3620,11 +3602,9 @@ void qmp_drive_mirror(const char *device, const char *target, /* Mirroring takes care of copy-on-write using the source's backing * file. 
*/ - target_bs = NULL; - ret = bdrv_open(&target_bs, target, NULL, options, - flags | BDRV_O_NO_BACKING, &local_err); - if (ret < 0) { - error_propagate(errp, local_err); + target_bs = bdrv_open(target, NULL, options, flags | BDRV_O_NO_BACKING, + errp); + if (!target_bs) { goto out; } @@ -3639,9 +3619,9 @@ void qmp_drive_mirror(const char *device, const char *target, has_on_target_error, on_target_error, has_unmap, unmap, &local_err); + bdrv_unref(target_bs); if (local_err) { error_propagate(errp, local_err); - bdrv_unref(target_bs); } out: aio_context_release(aio_context); @@ -3685,7 +3665,6 @@ void qmp_blockdev_mirror(const char *device, const char *target, aio_context = bdrv_get_aio_context(bs); aio_context_acquire(aio_context); - bdrv_ref(target_bs); bdrv_set_aio_context(target_bs, aio_context); blockdev_mirror_common(bs, target_bs, @@ -3699,7 +3678,6 @@ void qmp_blockdev_mirror(const char *device, const char *target, &local_err); if (local_err) { error_propagate(errp, local_err); - bdrv_unref(target_bs); } aio_context_release(aio_context); @@ -4164,9 +4142,9 @@ BlockJobInfoList *qmp_query_block_jobs(Error **errp) { BlockJobInfoList *head = NULL, **p_next = &head; BlockDriverState *bs; - BdrvNextIterator *it = NULL; + BdrvNextIterator it; - while ((it = bdrv_next(it, &bs))) { + for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { AioContext *aio_context = bdrv_get_aio_context(bs); aio_context_acquire(aio_context); diff --git a/blockjob.c b/blockjob.c index 5b840a7df6..c095cc57cb 100644 --- a/blockjob.c +++ b/blockjob.c @@ -50,17 +50,31 @@ struct BlockJobTxn { int refcnt; }; +static QLIST_HEAD(, BlockJob) block_jobs = QLIST_HEAD_INITIALIZER(block_jobs); + +BlockJob *block_job_next(BlockJob *job) +{ + if (!job) { + return QLIST_FIRST(&block_jobs); + } + return QLIST_NEXT(job, job_list); +} + void *block_job_create(const BlockJobDriver *driver, BlockDriverState *bs, int64_t speed, BlockCompletionFunc *cb, void *opaque, Error **errp) { + BlockBackend *blk; 
BlockJob *job; if (bs->job) { error_setg(errp, QERR_DEVICE_IN_USE, bdrv_get_device_name(bs)); return NULL; } - bdrv_ref(bs); + + blk = blk_new(); + blk_insert_bs(blk, bs); + job = g_malloc0(driver->instance_size); error_setg(&job->blocker, "block device is in use by block job: %s", BlockJobType_lookup[driver->job_type]); @@ -69,13 +83,15 @@ void *block_job_create(const BlockJobDriver *driver, BlockDriverState *bs, job->driver = driver; job->id = g_strdup(bdrv_get_device_name(bs)); - job->bs = bs; + job->blk = blk; job->cb = cb; job->opaque = opaque; job->busy = true; job->refcnt = 1; bs->job = job; + QLIST_INSERT_HEAD(&block_jobs, job, job_list); + /* Only set speed when necessary to avoid NotSupported error */ if (speed != 0) { Error *local_err = NULL; @@ -98,11 +114,13 @@ void block_job_ref(BlockJob *job) void block_job_unref(BlockJob *job) { if (--job->refcnt == 0) { - job->bs->job = NULL; - bdrv_op_unblock_all(job->bs, job->blocker); - bdrv_unref(job->bs); + BlockDriverState *bs = blk_bs(job->blk); + bs->job = NULL; + bdrv_op_unblock_all(bs, job->blocker); + blk_unref(job->blk); error_free(job->blocker); g_free(job->id); + QLIST_REMOVE(job, job_list); g_free(job); } } @@ -140,7 +158,7 @@ static void block_job_completed_txn_abort(BlockJob *job) txn->aborting = true; /* We are the first failed job. Cancel other jobs. 
*/ QLIST_FOREACH(other_job, &txn->jobs, txn_list) { - ctx = bdrv_get_aio_context(other_job->bs); + ctx = blk_get_aio_context(other_job->blk); aio_context_acquire(ctx); } QLIST_FOREACH(other_job, &txn->jobs, txn_list) { @@ -157,7 +175,7 @@ static void block_job_completed_txn_abort(BlockJob *job) assert(other_job->completed); } QLIST_FOREACH_SAFE(other_job, &txn->jobs, txn_list, next) { - ctx = bdrv_get_aio_context(other_job->bs); + ctx = blk_get_aio_context(other_job->blk); block_job_completed_single(other_job); aio_context_release(ctx); } @@ -179,7 +197,7 @@ static void block_job_completed_txn_success(BlockJob *job) } /* We are the last completed job, commit the transaction. */ QLIST_FOREACH_SAFE(other_job, &txn->jobs, txn_list, next) { - ctx = bdrv_get_aio_context(other_job->bs); + ctx = blk_get_aio_context(other_job->blk); aio_context_acquire(ctx); assert(other_job->ret == 0); block_job_completed_single(other_job); @@ -189,9 +207,7 @@ static void block_job_completed_txn_success(BlockJob *job) void block_job_completed(BlockJob *job, int ret) { - BlockDriverState *bs = job->bs; - - assert(bs->job == job); + assert(blk_bs(job->blk)->job == job); assert(!job->completed); job->completed = true; job->ret = ret; @@ -282,11 +298,10 @@ static int block_job_finish_sync(BlockJob *job, void (*finish)(BlockJob *, Error **errp), Error **errp) { - BlockDriverState *bs = job->bs; Error *local_err = NULL; int ret; - assert(bs->job == job); + assert(blk_bs(job->blk)->job == job); block_job_ref(job); finish(job, &local_err); @@ -297,7 +312,7 @@ static int block_job_finish_sync(BlockJob *job, } while (!job->completed) { aio_poll(job->deferred_to_main_loop ? qemu_get_aio_context() : - bdrv_get_aio_context(bs), + blk_get_aio_context(job->blk), true); } ret = (job->cancelled && job->ret == 0) ? 
-ECANCELED : job->ret; @@ -318,6 +333,19 @@ int block_job_cancel_sync(BlockJob *job) return block_job_finish_sync(job, &block_job_cancel_err, NULL); } +void block_job_cancel_sync_all(void) +{ + BlockJob *job; + AioContext *aio_context; + + while ((job = QLIST_FIRST(&block_jobs))) { + aio_context = blk_get_aio_context(job->blk); + aio_context_acquire(aio_context); + block_job_cancel_sync(job); + aio_context_release(aio_context); + } +} + int block_job_complete_sync(BlockJob *job, Error **errp) { return block_job_finish_sync(job, &block_job_complete, errp); @@ -336,7 +364,7 @@ void block_job_sleep_ns(BlockJob *job, QEMUClockType type, int64_t ns) if (block_job_is_paused(job)) { qemu_coroutine_yield(); } else { - co_aio_sleep_ns(bdrv_get_aio_context(job->bs), type, ns); + co_aio_sleep_ns(blk_get_aio_context(job->blk), type, ns); } job->busy = true; } @@ -465,7 +493,7 @@ static void block_job_defer_to_main_loop_bh(void *opaque) aio_context_acquire(data->aio_context); /* Fetch BDS AioContext again, in case it has changed */ - aio_context = bdrv_get_aio_context(data->job->bs); + aio_context = blk_get_aio_context(data->job->blk); aio_context_acquire(aio_context); data->job->deferred_to_main_loop = false; @@ -485,7 +513,7 @@ void block_job_defer_to_main_loop(BlockJob *job, BlockJobDeferToMainLoopData *data = g_malloc(sizeof(*data)); data->job = job; data->bh = qemu_bh_new(block_job_defer_to_main_loop_bh, data); - data->aio_context = bdrv_get_aio_context(job->bs); + data->aio_context = blk_get_aio_context(job->blk); data->fn = fn; data->opaque = opaque; job->deferred_to_main_loop = true; diff --git a/cpu-exec.c b/cpu-exec.c index 602d0c4d0c..f7c642f4a9 100644 --- a/cpu-exec.c +++ b/cpu-exec.c @@ -345,6 +345,15 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu, *last_tb = NULL; cpu->tb_flushed = false; } +#ifndef CONFIG_USER_ONLY + /* We don't take care of direct jumps when address mapping changes in + * system emulation. 
So it's not safe to make a direct jump to a TB + * spanning two pages because the mapping for the second page can change. + */ + if (tb->page_addr[1] != -1) { + *last_tb = NULL; + } +#endif /* See if we can patch the calling TB. */ if (*last_tb && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) { tb_add_jump(*last_tb, tb_exit, tb); diff --git a/dma-helpers.c b/dma-helpers.c index a6cc15f534..b521d84ebd 100644 --- a/dma-helpers.c +++ b/dma-helpers.c @@ -70,7 +70,7 @@ void qemu_sglist_destroy(QEMUSGList *qsg) typedef struct { BlockAIOCB common; - BlockBackend *blk; + AioContext *ctx; BlockAIOCB *acb; QEMUSGList *sg; uint64_t offset; @@ -80,6 +80,7 @@ typedef struct { QEMUIOVector iov; QEMUBH *bh; DMAIOFunc *io_func; + void *io_func_opaque; } DMAAIOCB; static void dma_blk_cb(void *opaque, int ret); @@ -154,8 +155,7 @@ static void dma_blk_cb(void *opaque, int ret) if (dbs->iov.size == 0) { trace_dma_map_wait(dbs); - dbs->bh = aio_bh_new(blk_get_aio_context(dbs->blk), - reschedule_dma, dbs); + dbs->bh = aio_bh_new(dbs->ctx, reschedule_dma, dbs); cpu_register_map_client(dbs->bh); return; } @@ -164,8 +164,8 @@ static void dma_blk_cb(void *opaque, int ret) qemu_iovec_discard_back(&dbs->iov, dbs->iov.size & ~BDRV_SECTOR_MASK); } - dbs->acb = dbs->io_func(dbs->blk, dbs->offset, &dbs->iov, 0, - dma_blk_cb, dbs); + dbs->acb = dbs->io_func(dbs->offset, &dbs->iov, + dma_blk_cb, dbs, dbs->io_func_opaque); assert(dbs->acb); } @@ -191,23 +191,25 @@ static const AIOCBInfo dma_aiocb_info = { .cancel_async = dma_aio_cancel, }; -BlockAIOCB *dma_blk_io( - BlockBackend *blk, QEMUSGList *sg, uint64_t sector_num, - DMAIOFunc *io_func, BlockCompletionFunc *cb, +BlockAIOCB *dma_blk_io(AioContext *ctx, + QEMUSGList *sg, uint64_t offset, + DMAIOFunc *io_func, void *io_func_opaque, + BlockCompletionFunc *cb, void *opaque, DMADirection dir) { - DMAAIOCB *dbs = blk_aio_get(&dma_aiocb_info, blk, cb, opaque); + DMAAIOCB *dbs = qemu_aio_get(&dma_aiocb_info, NULL, cb, opaque); - trace_dma_blk_io(dbs, blk, 
sector_num, (dir == DMA_DIRECTION_TO_DEVICE)); + trace_dma_blk_io(dbs, io_func_opaque, offset, (dir == DMA_DIRECTION_TO_DEVICE)); dbs->acb = NULL; - dbs->blk = blk; dbs->sg = sg; - dbs->offset = sector_num << BDRV_SECTOR_BITS; + dbs->ctx = ctx; + dbs->offset = offset; dbs->sg_cur_index = 0; dbs->sg_cur_byte = 0; dbs->dir = dir; dbs->io_func = io_func; + dbs->io_func_opaque = io_func_opaque; dbs->bh = NULL; qemu_iovec_init(&dbs->iov, sg->nsg); dma_blk_cb(dbs, 0); @@ -215,19 +217,39 @@ BlockAIOCB *dma_blk_io( } +static +BlockAIOCB *dma_blk_read_io_func(int64_t offset, QEMUIOVector *iov, + BlockCompletionFunc *cb, void *cb_opaque, + void *opaque) +{ + BlockBackend *blk = opaque; + return blk_aio_preadv(blk, offset, iov, 0, cb, cb_opaque); +} + BlockAIOCB *dma_blk_read(BlockBackend *blk, - QEMUSGList *sg, uint64_t sector, + QEMUSGList *sg, uint64_t offset, void (*cb)(void *opaque, int ret), void *opaque) { - return dma_blk_io(blk, sg, sector, blk_aio_preadv, cb, opaque, + return dma_blk_io(blk_get_aio_context(blk), + sg, offset, dma_blk_read_io_func, blk, cb, opaque, DMA_DIRECTION_FROM_DEVICE); } +static +BlockAIOCB *dma_blk_write_io_func(int64_t offset, QEMUIOVector *iov, + BlockCompletionFunc *cb, void *cb_opaque, + void *opaque) +{ + BlockBackend *blk = opaque; + return blk_aio_pwritev(blk, offset, iov, 0, cb, cb_opaque); +} + BlockAIOCB *dma_blk_write(BlockBackend *blk, - QEMUSGList *sg, uint64_t sector, + QEMUSGList *sg, uint64_t offset, void (*cb)(void *opaque, int ret), void *opaque) { - return dma_blk_io(blk, sg, sector, blk_aio_pwritev, cb, opaque, + return dma_blk_io(blk_get_aio_context(blk), + sg, offset, dma_blk_write_io_func, blk, cb, opaque, DMA_DIRECTION_TO_DEVICE); } diff --git a/docs/igd-assign.txt b/docs/igd-assign.txt new file mode 100644 index 0000000000..e17bb50789 --- /dev/null +++ b/docs/igd-assign.txt @@ -0,0 +1,133 @@ +Intel Graphics Device (IGD) assignment with vfio-pci +==================================================== + +IGD has two 
different modes for assignment using vfio-pci: + +1) Universal Pass-Through (UPT) mode: + + In this mode the IGD device is added as a *secondary* (i.e. non-primary) + graphics device in combination with an emulated primary graphics device. + This mode *requires* guest driver support to remove the external + dependencies generally associated with IGD (see below). Those guest + drivers only support this mode for Broadwell and newer IGD, according to + Intel. Additionally, this mode by default, and as officially supported + by Intel, does not support direct video output. The intention is to use + this mode either to provide hardware acceleration to the emulated graphics + or to use this mode in combination with guest-based remote access software, + for example VNC (see below for optional output support). This mode + theoretically has no device-specific handling dependencies on vfio-pci or + the VM firmware. + +2) "Legacy" mode: + + In this mode the IGD device is intended to be the primary and exclusive + graphics device in the VM[1]; as such, QEMU does not facilitate any sort + of remote graphics to the VM in this mode. A connected physical monitor + is the intended output device for IGD. This mode includes several + requirements and restrictions: + + * IGD must be given address 02.0 on the PCI root bus in the VM + * The host kernel must support vfio extensions for IGD (v4.6) + * vfio VGA support very likely needs to be enabled in the host kernel + * The VM firmware must support specific fw_cfg enablers for IGD + * The VM machine type must support a PCI host bridge at 00.0 (standard) + * The VM machine type must provide or allow to be created a special + ISA/LPC bridge device (vfio-pci-igd-lpc-bridge) on the root bus at + PCI address 1f.0. + * The IGD device must have a VGA ROM, either provided via the romfile + option or loaded automatically through vfio (standard). rombar=0 + will disable legacy mode support. + * Hotplug of the IGD device is not supported. 
+ * The IGD device must be a SandyBridge or newer model device. + +For either mode, depending on the host kernel, the i915 driver in the host +may generate faults and errors upon re-binding to an IGD device after it +has been assigned to a VM. It's therefore generally recommended to prevent +such driver binding unless the host driver is known to work well for this. +There are numerous ways to do this: i915 can be blacklisted on the host, +the driver_override option can be used to ensure that only vfio-pci can bind +to the device on the host[2], virsh nodedev-detach can be used to bind the +device to vfio drivers and then managed='no' set in the VM xml to prevent +re-binding to i915, etc. Also note that IGD is typically the primary +graphics in the host and special options may be required beyond simply +blacklisting i915 or using pci-stub/vfio-pci to take ownership of IGD as a +PCI class device. Lower level drivers exist that may still claim the device. +It may therefore be necessary to use kernel boot options video=vesafb:off or +video=efifb:off (depending on host BIOS/UEFI) or these can be combined to +a catch-all, video=vesafb:off,efifb:off. Error messages such as: + + Failed to mmap 0000:00:02.0 BAR <>. Performance may be slow + +are a good indicator that such a problem exists. The host files /proc/iomem +and /proc/ioports are often useful for identifying drivers consuming ranges +of the device to cause such conflicts. + +Additionally, IGD devices are known to generate small numbers of DMAR faults +when initially assigned. It is believed that this is simply the IGD attempting +to access the reserved GTT space after reset, which it no longer has access to +when accessed from userspace. So long as the DMAR faults are small in number +and, most importantly, not ongoing, these are not an indication of an error. + +Additionally++, analog VGA output (as opposed to digital outputs like HDMI, +DVI, or DisplayPort) may be unsupported in some use cases. 
In the author's +experience, even DP to VGA adapters can be troublesome while adapters between +digital formats work well. + +Usage +===== +The intention is for IGD assignment to be transparent for users and thus for +management tools like libvirt. To make use of legacy mode, simply remove all +other graphics options and use "-nographic" and either "-vga none" or +"-nodefaults", along with adding the device using vfio-pci: + + -device vfio-pci,host=00:02.0,id=hostdev0,bus=pci.0,addr=0x2 + +For UPT mode, retain the default emulated graphics and simply add the vfio-pci +device making use of any bus address other than 02.0. libvirt will +default to assigning the device a UPT-compatible address while legacy mode +users will need to manually edit the XML if using a tool like virt-manager +where the VM device address is not expressly specified. + +An experimental vfio-pci option also exists to enable OpRegion, and thus +external monitor support, for UPT mode. This can be enabled by adding +"x-igd-opregion=on" to the vfio-pci device options for the IGD device. As +with legacy mode, this requires the host to support features introduced in +the v4.6 kernel. If Intel chooses to embrace this support, the option may +be made non-experimental in the future, opening it to libvirt support. + +Developer ABI +============= +Legacy mode IGD support imposes two fw_cfg requirements on the VM firmware: + +1) "etc/igd-opregion" + + This fw_cfg file exposes the OpRegion for the IGD device. A reserved + region should be created below 4GB (recommended 4KB alignment), sized + sufficiently for the fw_cfg file size, and the content of this file copied + to it. The dword based address of this reserved memory region must also + be written to the ASLS register at offset 0xFC on the IGD device. It is + recommended that firmware should make use of this fw_cfg entry for any + PCI class VGA device with Intel vendor ID. The behavior of multiple such + devices within a VM is undefined. 
+ +2) "etc/igd-bdsm-size" + + This fw_cfg file contains an 8-byte, little endian integer indicating + the size of the reserved memory region required for IGD stolen memory. + Firmware must allocate a reserved memory below 4GB with required 1MB + alignment equal to this size. Additionally the base address of this + reserved region must be written to the dword BDSM register in PCI config + space of the IGD device at offset 0x5C. As this support is related to + running the IGD ROM, which has other dependencies on the device appearing + at guest address 00:02.0, it's expected that this fw_cfg file is only + relevant to a single PCI class VGA device with Intel vendor ID, appearing + at PCI bus address 00:02.0. + +Footnotes +========= +[1] Nothing precludes adding additional emulated or assigned graphics devices + as non-primary, other than the combination typically not working. I only + intend to set user expectations, others are welcome to find working + combinations or fix whatever issues prevent this from working in the common + case. +[2] # echo "vfio-pci" > /sys/bus/pci/devices/0000:00:02.0/driver_override diff --git a/docs/migration.txt b/docs/migration.txt index 90209ab294..6503c17685 100644 --- a/docs/migration.txt +++ b/docs/migration.txt @@ -403,8 +403,8 @@ listen thread: --- page -- page -- page -- page -- page -- On receipt of CMD_PACKAGED (1) All the data associated with the package - the ( ... ) section in the -diagram - is read into memory (into a QEMUSizedBuffer), and the main thread -recurses into qemu_loadvm_state_main to process the contents of the package (2) +diagram - is read into memory, and the main thread recurses into +qemu_loadvm_state_main to process the contents of the package (2) which contains commands (3,6) and devices (4...) On receipt of 'postcopy listen' - 3 -(i.e. 
the 1st command in the package) diff --git a/hmp-commands.hx b/hmp-commands.hx index 4f4f60a0df..98b4b1a82c 100644 --- a/hmp-commands.hx +++ b/hmp-commands.hx @@ -1008,7 +1008,7 @@ ETEXI { .name = "migrate_set_parameter", - .args_type = "parameter:s,value:i", + .args_type = "parameter:s,value:s", .params = "parameter value", .help = "Set the parameter for migration", .mhandler.cmd = hmp_migrate_set_parameter, diff --git a/hmp.c b/hmp.c index 9f9bcf9d83..a4b1d3d220 100644 --- a/hmp.c +++ b/hmp.c @@ -35,6 +35,7 @@ #include "block/qapi.h" #include "qemu-io.h" #include "qemu/cutils.h" +#include "qemu/error-report.h" #ifdef CONFIG_SPICE #include <spice/enums.h> @@ -168,8 +169,15 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict) } if (info->has_status) { - monitor_printf(mon, "Migration status: %s\n", + monitor_printf(mon, "Migration status: %s", MigrationStatus_lookup[info->status]); + if (info->status == MIGRATION_STATUS_FAILED && + info->has_error_desc) { + monitor_printf(mon, " (%s)\n", info->error_desc); + } else { + monitor_printf(mon, "\n"); + } + monitor_printf(mon, "total time: %" PRIu64 " milliseconds\n", info->total_time); if (info->has_expected_downtime) { @@ -286,6 +294,12 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict) monitor_printf(mon, " %s: %" PRId64, MigrationParameter_lookup[MIGRATION_PARAMETER_CPU_THROTTLE_INCREMENT], params->cpu_throttle_increment); + monitor_printf(mon, " %s: '%s'", + MigrationParameter_lookup[MIGRATION_PARAMETER_TLS_CREDS], + params->tls_creds ? : ""); + monitor_printf(mon, " %s: '%s'", + MigrationParameter_lookup[MIGRATION_PARAMETER_TLS_HOSTNAME], + params->tls_hostname ? 
: ""); monitor_printf(mon, "\n"); } @@ -1235,13 +1249,17 @@ void hmp_migrate_set_capability(Monitor *mon, const QDict *qdict) void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) { const char *param = qdict_get_str(qdict, "parameter"); - int value = qdict_get_int(qdict, "value"); + const char *valuestr = qdict_get_str(qdict, "value"); + long valueint = 0; Error *err = NULL; bool has_compress_level = false; bool has_compress_threads = false; bool has_decompress_threads = false; bool has_cpu_throttle_initial = false; bool has_cpu_throttle_increment = false; + bool has_tls_creds = false; + bool has_tls_hostname = false; + bool use_int_value = false; int i; for (i = 0; i < MIGRATION_PARAMETER__MAX; i++) { @@ -1249,25 +1267,46 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) switch (i) { case MIGRATION_PARAMETER_COMPRESS_LEVEL: has_compress_level = true; + use_int_value = true; break; case MIGRATION_PARAMETER_COMPRESS_THREADS: has_compress_threads = true; + use_int_value = true; break; case MIGRATION_PARAMETER_DECOMPRESS_THREADS: has_decompress_threads = true; + use_int_value = true; break; case MIGRATION_PARAMETER_CPU_THROTTLE_INITIAL: has_cpu_throttle_initial = true; + use_int_value = true; break; case MIGRATION_PARAMETER_CPU_THROTTLE_INCREMENT: has_cpu_throttle_increment = true; break; + case MIGRATION_PARAMETER_TLS_CREDS: + has_tls_creds = true; + break; + case MIGRATION_PARAMETER_TLS_HOSTNAME: + has_tls_hostname = true; + break; } - qmp_migrate_set_parameters(has_compress_level, value, - has_compress_threads, value, - has_decompress_threads, value, - has_cpu_throttle_initial, value, - has_cpu_throttle_increment, value, + + if (use_int_value) { + if (qemu_strtol(valuestr, NULL, 10, &valueint) < 0) { + error_setg(&err, "Unable to parse '%s' as an int", + valuestr); + goto cleanup; + } + } + + qmp_migrate_set_parameters(has_compress_level, valueint, + has_compress_threads, valueint, + has_decompress_threads, valueint, + 
has_cpu_throttle_initial, valueint, + has_cpu_throttle_increment, valueint, + has_tls_creds, valuestr, + has_tls_hostname, valuestr, &err); break; } @@ -1277,6 +1316,7 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) error_setg(&err, QERR_INVALID_PARAMETER, param); } + cleanup: if (err) { error_report_err(err); } @@ -1533,6 +1573,9 @@ static void hmp_migrate_status_cb(void *opaque) if (status->is_block_migration) { monitor_printf(status->mon, "\n"); } + if (info->has_error_desc) { + error_report("%s", info->error_desc); + } monitor_resume(status->mon); timer_del(status->timer); g_free(status); diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 173988ee84..9faad29fad 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -239,7 +239,7 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd, uint8_t lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas); uint8_t data_shift = ns->id_ns.lbaf[lba_index].ds; uint64_t data_size = (uint64_t)nlb << data_shift; - uint64_t aio_slba = slba << (data_shift - BDRV_SECTOR_BITS); + uint64_t data_offset = slba << data_shift; int is_write = rw->opcode == NVME_CMD_WRITE ? 1 : 0; enum BlockAcctType acct = is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ; @@ -258,8 +258,8 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd, req->has_sg = true; dma_acct_start(n->conf.blk, &req->acct, &req->qsg, acct); req->aiocb = is_write ? 
- dma_blk_write(n->conf.blk, &req->qsg, aio_slba, nvme_rw_cb, req) : - dma_blk_read(n->conf.blk, &req->qsg, aio_slba, nvme_rw_cb, req); + dma_blk_write(n->conf.blk, &req->qsg, data_offset, nvme_rw_cb, req) : + dma_blk_read(n->conf.blk, &req->qsg, data_offset, nvme_rw_cb, req); return NVME_NO_COMPLETE; } diff --git a/hw/core/Makefile.objs b/hw/core/Makefile.objs index 70951d4137..82a9ef84f8 100644 --- a/hw/core/Makefile.objs +++ b/hw/core/Makefile.objs @@ -1,5 +1,6 @@ # core qdev-related obj files, also used by *-user: common-obj-y += qdev.o qdev-properties.o +common-obj-y += bus.o common-obj-y += fw-path-provider.o # irq.o needed for qdev GPIO handling: common-obj-y += irq.o diff --git a/hw/core/bus.c b/hw/core/bus.c new file mode 100644 index 0000000000..3e3f8ac740 --- /dev/null +++ b/hw/core/bus.c @@ -0,0 +1,251 @@ +/* + * Dynamic device configuration and creation -- buses. + * + * Copyright (c) 2009 CodeSourcery + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "qemu/osdep.h" +#include "qemu-common.h" +#include "hw/qdev.h" +#include "qapi/error.h" + +static void qbus_set_hotplug_handler_internal(BusState *bus, Object *handler, + Error **errp) +{ + + object_property_set_link(OBJECT(bus), OBJECT(handler), + QDEV_HOTPLUG_HANDLER_PROPERTY, errp); +} + +void qbus_set_hotplug_handler(BusState *bus, DeviceState *handler, Error **errp) +{ + qbus_set_hotplug_handler_internal(bus, OBJECT(handler), errp); +} + +void qbus_set_bus_hotplug_handler(BusState *bus, Error **errp) +{ + qbus_set_hotplug_handler_internal(bus, OBJECT(bus), errp); +} + +int qbus_walk_children(BusState *bus, + qdev_walkerfn *pre_devfn, qbus_walkerfn *pre_busfn, + qdev_walkerfn *post_devfn, qbus_walkerfn *post_busfn, + void *opaque) +{ + BusChild *kid; + int err; + + if (pre_busfn) { + err = pre_busfn(bus, opaque); + if (err) { + return err; + } + } + + QTAILQ_FOREACH(kid, &bus->children, sibling) { + err = qdev_walk_children(kid->child, + pre_devfn, pre_busfn, + post_devfn, post_busfn, opaque); + if (err < 0) { + return err; + } + } + + if (post_busfn) { + err = post_busfn(bus, opaque); + if (err) { + return err; + } + } + + return 0; +} + +static void qbus_realize(BusState *bus, DeviceState *parent, const char *name) +{ + const char *typename = object_get_typename(OBJECT(bus)); + BusClass *bc; + char *buf; + int i, len, bus_id; + + bus->parent = parent; + + if (name) { + bus->name = g_strdup(name); + } else if (bus->parent && bus->parent->id) { + /* parent device has id -> use it plus parent-bus-id for bus name */ + bus_id = bus->parent->num_child_bus; + + len = strlen(bus->parent->id) + 16; + buf = g_malloc(len); + snprintf(buf, len, "%s.%d", bus->parent->id, bus_id); + bus->name = buf; + } else { + /* no id -> use lowercase bus type plus global bus-id for bus name */ + bc = BUS_GET_CLASS(bus); + bus_id = bc->automatic_ids++; + + len = strlen(typename) + 16; + buf = g_malloc(len); + len = snprintf(buf, len, "%s.%d", typename, bus_id); + for (i 
= 0; i < len; i++) { + buf[i] = qemu_tolower(buf[i]); + } + bus->name = buf; + } + + if (bus->parent) { + QLIST_INSERT_HEAD(&bus->parent->child_bus, bus, sibling); + bus->parent->num_child_bus++; + object_property_add_child(OBJECT(bus->parent), bus->name, OBJECT(bus), NULL); + object_unref(OBJECT(bus)); + } else if (bus != sysbus_get_default()) { + /* TODO: once all bus devices are qdevified, + only reset handler for main_system_bus should be registered here. */ + qemu_register_reset(qbus_reset_all_fn, bus); + } +} + +static void bus_unparent(Object *obj) +{ + BusState *bus = BUS(obj); + BusChild *kid; + + while ((kid = QTAILQ_FIRST(&bus->children)) != NULL) { + DeviceState *dev = kid->child; + object_unparent(OBJECT(dev)); + } + if (bus->parent) { + QLIST_REMOVE(bus, sibling); + bus->parent->num_child_bus--; + bus->parent = NULL; + } else { + assert(bus != sysbus_get_default()); /* main_system_bus is never freed */ + qemu_unregister_reset(qbus_reset_all_fn, bus); + } +} + +void qbus_create_inplace(void *bus, size_t size, const char *typename, + DeviceState *parent, const char *name) +{ + object_initialize(bus, size, typename); + qbus_realize(bus, parent, name); +} + +BusState *qbus_create(const char *typename, DeviceState *parent, const char *name) +{ + BusState *bus; + + bus = BUS(object_new(typename)); + qbus_realize(bus, parent, name); + + return bus; +} + +static bool bus_get_realized(Object *obj, Error **errp) +{ + BusState *bus = BUS(obj); + + return bus->realized; +} + +static void bus_set_realized(Object *obj, bool value, Error **errp) +{ + BusState *bus = BUS(obj); + BusClass *bc = BUS_GET_CLASS(bus); + BusChild *kid; + Error *local_err = NULL; + + if (value && !bus->realized) { + if (bc->realize) { + bc->realize(bus, &local_err); + } + + /* TODO: recursive realization */ + } else if (!value && bus->realized) { + QTAILQ_FOREACH(kid, &bus->children, sibling) { + DeviceState *dev = kid->child; + object_property_set_bool(OBJECT(dev), false, "realized", + 
&local_err); + if (local_err != NULL) { + break; + } + } + if (bc->unrealize && local_err == NULL) { + bc->unrealize(bus, &local_err); + } + } + + if (local_err != NULL) { + error_propagate(errp, local_err); + return; + } + + bus->realized = value; +} + +static void qbus_initfn(Object *obj) +{ + BusState *bus = BUS(obj); + + QTAILQ_INIT(&bus->children); + object_property_add_link(obj, QDEV_HOTPLUG_HANDLER_PROPERTY, + TYPE_HOTPLUG_HANDLER, + (Object **)&bus->hotplug_handler, + object_property_allow_set_link, + OBJ_PROP_LINK_UNREF_ON_RELEASE, + NULL); + object_property_add_bool(obj, "realized", + bus_get_realized, bus_set_realized, NULL); +} + +static char *default_bus_get_fw_dev_path(DeviceState *dev) +{ + return g_strdup(object_get_typename(OBJECT(dev))); +} + +static void bus_class_init(ObjectClass *class, void *data) +{ + BusClass *bc = BUS_CLASS(class); + + class->unparent = bus_unparent; + bc->get_fw_dev_path = default_bus_get_fw_dev_path; +} + +static void qbus_finalize(Object *obj) +{ + BusState *bus = BUS(obj); + + g_free((char *)bus->name); +} + +static const TypeInfo bus_info = { + .name = TYPE_BUS, + .parent = TYPE_OBJECT, + .instance_size = sizeof(BusState), + .abstract = true, + .class_size = sizeof(BusClass), + .instance_init = qbus_initfn, + .instance_finalize = qbus_finalize, + .class_init = bus_class_init, +}; + +static void bus_register_types(void) +{ + type_register_static(&bus_info); +} + +type_init(bus_register_types) diff --git a/hw/core/qdev.c b/hw/core/qdev.c index db41aa1f26..853162b670 100644 --- a/hw/core/qdev.c +++ b/hw/core/qdev.c @@ -109,24 +109,6 @@ void qdev_set_parent_bus(DeviceState *dev, BusState *bus) bus_add_child(bus, dev); } -static void qbus_set_hotplug_handler_internal(BusState *bus, Object *handler, - Error **errp) -{ - - object_property_set_link(OBJECT(bus), OBJECT(handler), - QDEV_HOTPLUG_HANDLER_PROPERTY, errp); -} - -void qbus_set_hotplug_handler(BusState *bus, DeviceState *handler, Error **errp) -{ - 
qbus_set_hotplug_handler_internal(bus, OBJECT(handler), errp); -} - -void qbus_set_bus_hotplug_handler(BusState *bus, Error **errp) -{ - qbus_set_hotplug_handler_internal(bus, OBJECT(bus), errp); -} - /* Create a new device. This only initializes the device state structure and allows properties to be set. The device still needs to be realized. See qdev-core.h. */ @@ -595,40 +577,6 @@ BusState *qdev_get_child_bus(DeviceState *dev, const char *name) return NULL; } -int qbus_walk_children(BusState *bus, - qdev_walkerfn *pre_devfn, qbus_walkerfn *pre_busfn, - qdev_walkerfn *post_devfn, qbus_walkerfn *post_busfn, - void *opaque) -{ - BusChild *kid; - int err; - - if (pre_busfn) { - err = pre_busfn(bus, opaque); - if (err) { - return err; - } - } - - QTAILQ_FOREACH(kid, &bus->children, sibling) { - err = qdev_walk_children(kid->child, - pre_devfn, pre_busfn, - post_devfn, post_busfn, opaque); - if (err < 0) { - return err; - } - } - - if (post_busfn) { - err = post_busfn(bus, opaque); - if (err) { - return err; - } - } - - return 0; -} - int qdev_walk_children(DeviceState *dev, qdev_walkerfn *pre_devfn, qbus_walkerfn *pre_busfn, qdev_walkerfn *post_devfn, qbus_walkerfn *post_busfn, @@ -685,129 +633,6 @@ DeviceState *qdev_find_recursive(BusState *bus, const char *id) return NULL; } -static void qbus_realize(BusState *bus, DeviceState *parent, const char *name) -{ - const char *typename = object_get_typename(OBJECT(bus)); - BusClass *bc; - char *buf; - int i, len, bus_id; - - bus->parent = parent; - - if (name) { - bus->name = g_strdup(name); - } else if (bus->parent && bus->parent->id) { - /* parent device has id -> use it plus parent-bus-id for bus name */ - bus_id = bus->parent->num_child_bus; - - len = strlen(bus->parent->id) + 16; - buf = g_malloc(len); - snprintf(buf, len, "%s.%d", bus->parent->id, bus_id); - bus->name = buf; - } else { - /* no id -> use lowercase bus type plus global bus-id for bus name */ - bc = BUS_GET_CLASS(bus); - bus_id = bc->automatic_ids++; - 
- len = strlen(typename) + 16; - buf = g_malloc(len); - len = snprintf(buf, len, "%s.%d", typename, bus_id); - for (i = 0; i < len; i++) { - buf[i] = qemu_tolower(buf[i]); - } - bus->name = buf; - } - - if (bus->parent) { - QLIST_INSERT_HEAD(&bus->parent->child_bus, bus, sibling); - bus->parent->num_child_bus++; - object_property_add_child(OBJECT(bus->parent), bus->name, OBJECT(bus), NULL); - object_unref(OBJECT(bus)); - } else if (bus != sysbus_get_default()) { - /* TODO: once all bus devices are qdevified, - only reset handler for main_system_bus should be registered here. */ - qemu_register_reset(qbus_reset_all_fn, bus); - } -} - -static void bus_unparent(Object *obj) -{ - BusState *bus = BUS(obj); - BusChild *kid; - - while ((kid = QTAILQ_FIRST(&bus->children)) != NULL) { - DeviceState *dev = kid->child; - object_unparent(OBJECT(dev)); - } - if (bus->parent) { - QLIST_REMOVE(bus, sibling); - bus->parent->num_child_bus--; - bus->parent = NULL; - } else { - assert(bus != sysbus_get_default()); /* main_system_bus is never freed */ - qemu_unregister_reset(qbus_reset_all_fn, bus); - } -} - -static bool bus_get_realized(Object *obj, Error **errp) -{ - BusState *bus = BUS(obj); - - return bus->realized; -} - -static void bus_set_realized(Object *obj, bool value, Error **errp) -{ - BusState *bus = BUS(obj); - BusClass *bc = BUS_GET_CLASS(bus); - BusChild *kid; - Error *local_err = NULL; - - if (value && !bus->realized) { - if (bc->realize) { - bc->realize(bus, &local_err); - } - - /* TODO: recursive realization */ - } else if (!value && bus->realized) { - QTAILQ_FOREACH(kid, &bus->children, sibling) { - DeviceState *dev = kid->child; - object_property_set_bool(OBJECT(dev), false, "realized", - &local_err); - if (local_err != NULL) { - break; - } - } - if (bc->unrealize && local_err == NULL) { - bc->unrealize(bus, &local_err); - } - } - - if (local_err != NULL) { - error_propagate(errp, local_err); - return; - } - - bus->realized = value; -} - -void 
qbus_create_inplace(void *bus, size_t size, const char *typename, - DeviceState *parent, const char *name) -{ - object_initialize(bus, size, typename); - qbus_realize(bus, parent, name); -} - -BusState *qbus_create(const char *typename, DeviceState *parent, const char *name) -{ - BusState *bus; - - bus = BUS(object_new(typename)); - qbus_realize(bus, parent, name); - - return bus; -} - static char *bus_get_fw_dev_path(BusState *bus, DeviceState *dev) { BusClass *bc = BUS_GET_CLASS(bus); @@ -1315,55 +1140,8 @@ static const TypeInfo device_type_info = { .class_size = sizeof(DeviceClass), }; -static void qbus_initfn(Object *obj) -{ - BusState *bus = BUS(obj); - - QTAILQ_INIT(&bus->children); - object_property_add_link(obj, QDEV_HOTPLUG_HANDLER_PROPERTY, - TYPE_HOTPLUG_HANDLER, - (Object **)&bus->hotplug_handler, - object_property_allow_set_link, - OBJ_PROP_LINK_UNREF_ON_RELEASE, - NULL); - object_property_add_bool(obj, "realized", - bus_get_realized, bus_set_realized, NULL); -} - -static char *default_bus_get_fw_dev_path(DeviceState *dev) -{ - return g_strdup(object_get_typename(OBJECT(dev))); -} - -static void bus_class_init(ObjectClass *class, void *data) -{ - BusClass *bc = BUS_CLASS(class); - - class->unparent = bus_unparent; - bc->get_fw_dev_path = default_bus_get_fw_dev_path; -} - -static void qbus_finalize(Object *obj) -{ - BusState *bus = BUS(obj); - - g_free((char *)bus->name); -} - -static const TypeInfo bus_info = { - .name = TYPE_BUS, - .parent = TYPE_OBJECT, - .instance_size = sizeof(BusState), - .abstract = true, - .class_size = sizeof(BusClass), - .instance_init = qbus_initfn, - .instance_finalize = qbus_finalize, - .class_init = bus_class_init, -}; - static void qdev_register_types(void) { - type_register_static(&bus_info); type_register_static(&device_type_info); } diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c index f244bc01c9..502d4f1c7b 100644 --- a/hw/ide/ahci.c +++ b/hw/ide/ahci.c @@ -1006,7 +1006,8 @@ static void execute_ncq_command(NCQTransferState 
*ncq_tfs) dma_acct_start(ide_state->blk, &ncq_tfs->acct, &ncq_tfs->sglist, BLOCK_ACCT_READ); ncq_tfs->aiocb = dma_blk_read(ide_state->blk, &ncq_tfs->sglist, - ncq_tfs->lba, ncq_cb, ncq_tfs); + ncq_tfs->lba << BDRV_SECTOR_BITS, + ncq_cb, ncq_tfs); break; case WRITE_FPDMA_QUEUED: DPRINTF(port, "NCQ writing %d sectors to LBA %"PRId64", tag %d\n", @@ -1018,7 +1019,8 @@ static void execute_ncq_command(NCQTransferState *ncq_tfs) dma_acct_start(ide_state->blk, &ncq_tfs->acct, &ncq_tfs->sglist, BLOCK_ACCT_WRITE); ncq_tfs->aiocb = dma_blk_write(ide_state->blk, &ncq_tfs->sglist, - ncq_tfs->lba, ncq_cb, ncq_tfs); + ncq_tfs->lba << BDRV_SECTOR_BITS, + ncq_cb, ncq_tfs); break; default: DPRINTF(port, "error: unsupported NCQ command (0x%02x) received\n", diff --git a/hw/ide/core.c b/hw/ide/core.c index fe2bfba489..029f6b9b12 100644 --- a/hw/ide/core.c +++ b/hw/ide/core.c @@ -441,13 +441,14 @@ static void ide_issue_trim_cb(void *opaque, int ret) } } -BlockAIOCB *ide_issue_trim(BlockBackend *blk, - int64_t offset, QEMUIOVector *qiov, BdrvRequestFlags flags, - BlockCompletionFunc *cb, void *opaque) +BlockAIOCB *ide_issue_trim( + int64_t offset, QEMUIOVector *qiov, + BlockCompletionFunc *cb, void *cb_opaque, void *opaque) { + BlockBackend *blk = opaque; TrimAIOCB *iocb; - iocb = blk_aio_get(&trim_aiocb_info, blk, cb, opaque); + iocb = blk_aio_get(&trim_aiocb_info, blk, cb, cb_opaque); iocb->blk = blk; iocb->bh = qemu_bh_new(ide_trim_bh_cb, iocb); iocb->ret = 0; @@ -799,6 +800,7 @@ static void ide_dma_cb(void *opaque, int ret) IDEState *s = opaque; int n; int64_t sector_num; + uint64_t offset; bool stay_active = false; if (ret == -ECANCELED) { @@ -859,18 +861,20 @@ static void ide_dma_cb(void *opaque, int ret) return; } + offset = sector_num << BDRV_SECTOR_BITS; switch (s->dma_cmd) { case IDE_DMA_READ: - s->bus->dma->aiocb = dma_blk_read(s->blk, &s->sg, sector_num, + s->bus->dma->aiocb = dma_blk_read(s->blk, &s->sg, offset, ide_dma_cb, s); break; case IDE_DMA_WRITE: - 
s->bus->dma->aiocb = dma_blk_write(s->blk, &s->sg, sector_num, + s->bus->dma->aiocb = dma_blk_write(s->blk, &s->sg, offset, ide_dma_cb, s); break; case IDE_DMA_TRIM: - s->bus->dma->aiocb = dma_blk_io(s->blk, &s->sg, sector_num, - ide_issue_trim, ide_dma_cb, s, + s->bus->dma->aiocb = dma_blk_io(blk_get_aio_context(s->blk), + &s->sg, offset, + ide_issue_trim, s->blk, ide_dma_cb, s, DMA_DIRECTION_TO_DEVICE); break; default: diff --git a/hw/ide/internal.h b/hw/ide/internal.h index ceb9e5994a..773928af77 100644 --- a/hw/ide/internal.h +++ b/hw/ide/internal.h @@ -613,9 +613,9 @@ void ide_transfer_start(IDEState *s, uint8_t *buf, int size, EndTransferFunc *end_transfer_func); void ide_transfer_stop(IDEState *s); void ide_set_inactive(IDEState *s, bool more); -BlockAIOCB *ide_issue_trim(BlockBackend *blk, - int64_t offset, QEMUIOVector *qiov, BdrvRequestFlags flags, - BlockCompletionFunc *cb, void *opaque); +BlockAIOCB *ide_issue_trim( + int64_t offset, QEMUIOVector *qiov, + BlockCompletionFunc *cb, void *cb_opaque, void *opaque); BlockAIOCB *ide_buffered_readv(IDEState *s, int64_t sector_num, QEMUIOVector *iov, int nb_sectors, BlockCompletionFunc *cb, void *opaque); diff --git a/hw/ide/macio.c b/hw/ide/macio.c index d7d9c0ff3a..42ad68a1c0 100644 --- a/hw/ide/macio.c +++ b/hw/ide/macio.c @@ -230,7 +230,7 @@ static void pmac_dma_trim(BlockBackend *blk, s->io_buffer_index += io->len; io->len = 0; - s->bus->dma->aiocb = ide_issue_trim(blk, offset, &io->iov, 0, cb, io); + s->bus->dma->aiocb = ide_issue_trim(offset, &io->iov, cb, io, blk); } static void pmac_ide_atapi_transfer_cb(void *opaque, int ret) diff --git a/hw/net/spapr_llan.c b/hw/net/spapr_llan.c index a8266f8ec7..8b2eebd4e3 100644 --- a/hw/net/spapr_llan.c +++ b/hw/net/spapr_llan.c @@ -110,6 +110,7 @@ typedef struct VIOsPAPRVLANDevice { hwaddr buf_list; uint32_t add_buf_ptr, use_buf_ptr, rx_bufs; hwaddr rxq_ptr; + QEMUTimer *rxp_timer; uint32_t compat_flags; /* Compatability flags for migration */ RxBufPool 
*rx_pool[RX_MAX_POOLS]; /* Receive buffer descriptor pools */ } VIOsPAPRVLANDevice; @@ -122,6 +123,21 @@ static int spapr_vlan_can_receive(NetClientState *nc) } /** + * The last 8 bytes of the receive buffer list page (that has been + * supplied by the guest with the H_REGISTER_LOGICAL_LAN call) contain + * a counter for frames that have been dropped because there was no + * suitable receive buffer available. This function is used to increase + * this counter by one. + */ +static void spapr_vlan_record_dropped_rx_frame(VIOsPAPRVLANDevice *dev) +{ + uint64_t cnt; + + cnt = vio_ldq(&dev->sdev, dev->buf_list + 4096 - 8); + vio_stq(&dev->sdev, dev->buf_list + 4096 - 8, cnt + 1); +} + +/** * Get buffer descriptor from one of our receive buffer pools */ static vlan_bd_t spapr_vlan_get_rx_bd_from_pool(VIOsPAPRVLANDevice *dev, @@ -206,7 +222,8 @@ static ssize_t spapr_vlan_receive(NetClientState *nc, const uint8_t *buf, } if (!dev->rx_bufs) { - return -1; + spapr_vlan_record_dropped_rx_frame(dev); + return 0; } if (dev->compat_flags & SPAPRVLAN_FLAG_RX_BUF_POOLS) { @@ -215,7 +232,8 @@ static ssize_t spapr_vlan_receive(NetClientState *nc, const uint8_t *buf, bd = spapr_vlan_get_rx_bd_from_page(dev, size); } if (!bd) { - return -1; + spapr_vlan_record_dropped_rx_frame(dev); + return 0; } dev->rx_bufs--; @@ -266,6 +284,13 @@ static NetClientInfo net_spapr_vlan_info = { .receive = spapr_vlan_receive, }; +static void spapr_vlan_flush_rx_queue(void *opaque) +{ + VIOsPAPRVLANDevice *dev = opaque; + + qemu_flush_queued_packets(qemu_get_queue(dev->nic)); +} + static void spapr_vlan_reset_rx_pool(RxBufPool *rxp) { /* @@ -302,6 +327,9 @@ static void spapr_vlan_realize(VIOsPAPRDevice *sdev, Error **errp) dev->nic = qemu_new_nic(&net_spapr_vlan_info, &dev->nicconf, object_get_typename(OBJECT(sdev)), sdev->qdev.id, dev); qemu_format_nic_info_str(qemu_get_queue(dev->nic), dev->nicconf.macaddr.a); + + dev->rxp_timer = timer_new_us(QEMU_CLOCK_VIRTUAL, spapr_vlan_flush_rx_queue, + dev); } 
static void spapr_vlan_instance_init(Object *obj) @@ -332,6 +360,11 @@ static void spapr_vlan_instance_finalize(Object *obj) dev->rx_pool[i] = NULL; } } + + if (dev->rxp_timer) { + timer_del(dev->rxp_timer); + timer_free(dev->rxp_timer); + } } void spapr_vlan_create(VIOsPAPRBus *bus, NICInfo *nd) @@ -629,7 +662,13 @@ static target_ulong h_add_logical_lan_buffer(PowerPCCPU *cpu, dev->rx_bufs++; - qemu_flush_queued_packets(qemu_get_queue(dev->nic)); + /* + * Give guest some more time to add additional RX buffers before we + * flush the receive queue, so that e.g. fragmented IP packets can + * be passed to the guest in one go later (instead of passing single + * fragments if there is only one receive buffer available). + */ + timer_mod(dev->rxp_timer, qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + 500); return H_SUCCESS; } diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index add68acfef..44e401ae99 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -1842,6 +1842,10 @@ static void ppc_spapr_init(MachineState *machine) exit(1); } spapr->rtas_size = get_image_size(filename); + if (spapr->rtas_size < 0) { + error_report("Could not get size of LPAR rtas '%s'", filename); + exit(1); + } spapr->rtas_blob = g_malloc(spapr->rtas_size); if (load_image_size(filename, spapr->rtas_blob, spapr->rtas_size) < 0) { error_report("Could not load LPAR rtas '%s'", filename); @@ -2132,15 +2136,6 @@ static void spapr_add_lmbs(DeviceState *dev, uint64_t addr, uint64_t size, int i, fdt_offset, fdt_size; void *fdt; - /* - * Check for DRC connectors and send hotplug notification to the - * guest only in case of hotplugged memory. This allows cold plugged - * memory to be specified at boot time. 
- */ - if (!dev->hotplugged) { - return; - } - for (i = 0; i < nr_lmbs; i++) { drc = spapr_dr_connector_by_id(SPAPR_DR_CONNECTOR_TYPE_LMB, addr/SPAPR_MEMORY_BLOCK_SIZE); @@ -2154,7 +2149,12 @@ static void spapr_add_lmbs(DeviceState *dev, uint64_t addr, uint64_t size, drck->attach(drc, dev, fdt, fdt_offset, !dev->hotplugged, errp); addr += SPAPR_MEMORY_BLOCK_SIZE; } - spapr_hotplug_req_add_by_count(SPAPR_DR_CONNECTOR_TYPE_LMB, nr_lmbs); + /* send hotplug notification to the + * guest only in case of hotplugged memory + */ + if (dev->hotplugged) { + spapr_hotplug_req_add_by_count(SPAPR_DR_CONNECTOR_TYPE_LMB, nr_lmbs); + } } static void spapr_memory_plug(HotplugHandler *hotplug_dev, DeviceState *dev, diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c index 722db91ffa..96bb0181a7 100644 --- a/hw/ppc/spapr_iommu.c +++ b/hw/ppc/spapr_iommu.c @@ -76,6 +76,37 @@ static IOMMUAccessFlags spapr_tce_iommu_access_flags(uint64_t tce) } } +static uint64_t *spapr_tce_alloc_table(uint32_t liobn, + uint32_t page_shift, + uint32_t nb_table, + int *fd, + bool need_vfio) +{ + uint64_t *table = NULL; + uint64_t window_size = (uint64_t)nb_table << page_shift; + + if (kvm_enabled() && !(window_size >> 32)) { + table = kvmppc_create_spapr_tce(liobn, window_size, fd, need_vfio); + } + + if (!table) { + *fd = -1; + table = g_malloc0(nb_table * sizeof(uint64_t)); + } + + trace_spapr_iommu_new_table(liobn, table, *fd); + + return table; +} + +static void spapr_tce_free_table(uint64_t *table, int fd, uint32_t nb_table) +{ + if (!kvm_enabled() || + (kvmppc_remove_spapr_tce(table, fd, nb_table) != 0)) { + g_free(table); + } +} + /* Called from RCU critical section */ static IOMMUTLBEntry spapr_tce_translate_iommu(MemoryRegion *iommu, hwaddr addr, bool is_write) @@ -142,21 +173,13 @@ static MemoryRegionIOMMUOps spapr_iommu_ops = { static int spapr_tce_table_realize(DeviceState *dev) { sPAPRTCETable *tcet = SPAPR_TCE_TABLE(dev); - uint64_t window_size = (uint64_t)tcet->nb_table << 
tcet->page_shift; - - if (kvm_enabled() && !(window_size >> 32)) { - tcet->table = kvmppc_create_spapr_tce(tcet->liobn, - window_size, - &tcet->fd, - tcet->need_vfio); - } - if (!tcet->table) { - size_t table_size = tcet->nb_table * sizeof(uint64_t); - tcet->table = g_malloc0(table_size); - } - - trace_spapr_iommu_new_table(tcet->liobn, tcet, tcet->table, tcet->fd); + tcet->fd = -1; + tcet->table = spapr_tce_alloc_table(tcet->liobn, + tcet->page_shift, + tcet->nb_table, + &tcet->fd, + tcet->need_vfio); memory_region_init_iommu(&tcet->iommu, OBJECT(dev), &spapr_iommu_ops, "iommu-spapr", @@ -242,11 +265,8 @@ static void spapr_tce_table_unrealize(DeviceState *dev, Error **errp) QLIST_REMOVE(tcet, list); - if (!kvm_enabled() || - (kvmppc_remove_spapr_tce(tcet->table, tcet->fd, - tcet->nb_table) != 0)) { - g_free(tcet->table); - } + spapr_tce_free_table(tcet->table, tcet->fd, tcet->nb_table); + tcet->fd = -1; } MemoryRegion *spapr_tce_get_iommu(sPAPRTCETable *tcet) @@ -278,7 +298,7 @@ static target_ulong put_tce_emu(sPAPRTCETable *tcet, target_ulong ioba, tcet->table[index] = tce; entry.target_as = &address_space_memory, - entry.iova = ioba & page_mask; + entry.iova = (ioba - tcet->bus_offset) & page_mask; entry.translated_addr = tce & page_mask; entry.addr_mask = ~page_mask; entry.perm = spapr_tce_iommu_access_flags(tce); diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c index e55b505c96..856aec7f51 100644 --- a/hw/ppc/spapr_pci.c +++ b/hw/ppc/spapr_pci.c @@ -1093,13 +1093,11 @@ static void spapr_phb_add_pci_device(sPAPRDRConnector *drc, spapr_tce_set_need_vfio(tcet, true); } - if (dev->hotplugged) { - fdt = create_device_tree(&fdt_size); - fdt_start_offset = spapr_create_pci_child_dt(phb, pdev, fdt, 0); - if (!fdt_start_offset) { - error_setg(errp, "Failed to create pci child device tree node"); - goto out; - } + fdt = create_device_tree(&fdt_size); + fdt_start_offset = spapr_create_pci_child_dt(phb, pdev, fdt, 0); + if (!fdt_start_offset) { + error_setg(errp, 
"Failed to create pci child device tree node"); + goto out; } drck->attach(drc, DEVICE(pdev), @@ -1816,7 +1814,7 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb, _FDT(fdt_setprop(fdt, bus_off, "interrupt-map", &interrupt_map, sizeof(interrupt_map))); - tcet = spapr_tce_find_by_liobn(SPAPR_PCI_LIOBN(phb->index, 0)); + tcet = spapr_tce_find_by_liobn(phb->dma_liobn); if (!tcet) { return -1; } diff --git a/hw/s390x/s390-skeys.c b/hw/s390x/s390-skeys.c index d772cfc7ea..e2d4e1af79 100644 --- a/hw/s390x/s390-skeys.c +++ b/hw/s390x/s390-skeys.c @@ -47,15 +47,11 @@ void s390_skeys_init(void) qdev_init_nofail(DEVICE(obj)); } -static void write_keys(QEMUFile *f, uint8_t *keys, uint64_t startgfn, +static void write_keys(FILE *f, uint8_t *keys, uint64_t startgfn, uint64_t count, Error **errp) { uint64_t curpage = startgfn; uint64_t maxpage = curpage + count - 1; - const char *fmt = "page=%03" PRIx64 ": key(%d) => ACC=%X, FP=%d, REF=%d," - " ch=%d, reserved=%d\n"; - char buf[128]; - int len; for (; curpage <= maxpage; curpage++) { uint8_t acc = (*keys & 0xF0) >> 4; @@ -64,10 +60,9 @@ static void write_keys(QEMUFile *f, uint8_t *keys, uint64_t startgfn, int ch = (*keys & 0x02); int res = (*keys & 0x01); - len = snprintf(buf, sizeof(buf), fmt, curpage, - *keys, acc, fp, ref, ch, res); - assert(len < sizeof(buf)); - qemu_put_buffer(f, (uint8_t *)buf, len); + fprintf(f, "page=%03" PRIx64 ": key(%d) => ACC=%X, FP=%d, REF=%d," + " ch=%d, reserved=%d\n", + curpage, *keys, acc, fp, ref, ch, res); keys++; } } @@ -116,7 +111,8 @@ void qmp_dump_skeys(const char *filename, Error **errp) vaddr cur_gfn = 0; uint8_t *buf; int ret; - QEMUFile *f; + int fd; + FILE *f; /* Quick check to see if guest is using storage keys*/ if (!skeyclass->skeys_enabled(ss)) { @@ -125,8 +121,14 @@ void qmp_dump_skeys(const char *filename, Error **errp) return; } - f = qemu_fopen(filename, "wb"); + fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0600); + if (fd < 0) { + error_setg_file_open(errp, errno, 
filename); + return; + } + f = fdopen(fd, "wb"); if (!f) { + close(fd); error_setg_file_open(errp, errno, filename); return; } @@ -162,7 +164,7 @@ out_free: error_propagate(errp, lerr); g_free(buf); out: - qemu_fclose(f); + fclose(f); } static void qemu_s390_skeys_init(Object *obj) diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c index ce89c98b4e..8865da53e8 100644 --- a/hw/scsi/scsi-disk.c +++ b/hw/scsi/scsi-disk.c @@ -335,7 +335,8 @@ static void scsi_do_read(SCSIDiskReq *r, int ret) if (r->req.sg) { dma_acct_start(s->qdev.conf.blk, &r->acct, r->req.sg, BLOCK_ACCT_READ); r->req.resid -= r->req.sg->size; - r->req.aiocb = dma_blk_read(s->qdev.conf.blk, r->req.sg, r->sector, + r->req.aiocb = dma_blk_read(s->qdev.conf.blk, r->req.sg, + r->sector << BDRV_SECTOR_BITS, scsi_dma_complete, r); } else { scsi_init_iovec(r, SCSI_DMA_BUF_SIZE); @@ -539,7 +540,8 @@ static void scsi_write_data(SCSIRequest *req) if (r->req.sg) { dma_acct_start(s->qdev.conf.blk, &r->acct, r->req.sg, BLOCK_ACCT_WRITE); r->req.resid -= r->req.sg->size; - r->req.aiocb = dma_blk_write(s->qdev.conf.blk, r->req.sg, r->sector, + r->req.aiocb = dma_blk_write(s->qdev.conf.blk, r->req.sg, + r->sector << BDRV_SECTOR_BITS, scsi_dma_complete, r); } else { block_acct_start(blk_get_stats(s->qdev.conf.blk), &r->acct, @@ -1778,7 +1780,7 @@ static void scsi_disk_emulate_write_same(SCSIDiskReq *r, uint8_t *inbuf) block_acct_start(blk_get_stats(s->qdev.conf.blk), &r->acct, nb_sectors * s->qdev.blocksize, BLOCK_ACCT_WRITE); - r->req.aiocb = blk_aio_write_zeroes(s->qdev.conf.blk, + r->req.aiocb = blk_aio_pwrite_zeroes(s->qdev.conf.blk, r->req.cmd.lba * s->qdev.blocksize, nb_sectors * s->qdev.blocksize, flags, scsi_aio_complete, r); diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 88154a1f03..e51ed3a348 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -260,14 +260,20 @@ static void vfio_iommu_map_notify(Notifier *n, void *data) VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); VFIOContainer 
*container = giommu->container; IOMMUTLBEntry *iotlb = data; + hwaddr iova = iotlb->iova + giommu->iommu_offset; MemoryRegion *mr; hwaddr xlat; hwaddr len = iotlb->addr_mask + 1; void *vaddr; int ret; - trace_vfio_iommu_map_notify(iotlb->iova, - iotlb->iova + iotlb->addr_mask); + trace_vfio_iommu_map_notify(iova, iova + iotlb->addr_mask); + + if (iotlb->target_as != &address_space_memory) { + error_report("Wrong target AS \"%s\", only system memory is allowed", + iotlb->target_as->name ? iotlb->target_as->name : "none"); + return; + } /* * The IOMMU TLB entry we have just covers translation through @@ -294,21 +300,21 @@ static void vfio_iommu_map_notify(Notifier *n, void *data) if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) { vaddr = memory_region_get_ram_ptr(mr) + xlat; - ret = vfio_dma_map(container, iotlb->iova, + ret = vfio_dma_map(container, iova, iotlb->addr_mask + 1, vaddr, !(iotlb->perm & IOMMU_WO) || mr->readonly); if (ret) { error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx", %p) = %d (%m)", - container, iotlb->iova, + container, iova, iotlb->addr_mask + 1, vaddr, ret); } } else { - ret = vfio_dma_unmap(container, iotlb->iova, iotlb->addr_mask + 1); + ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1); if (ret) { error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx") = %d (%m)", - container, iotlb->iova, + container, iova, iotlb->addr_mask + 1, ret); } } @@ -380,6 +386,8 @@ static void vfio_listener_region_add(MemoryListener *listener, */ giommu = g_malloc0(sizeof(*giommu)); giommu->iommu = section->mr; + giommu->iommu_offset = section->offset_within_address_space - + section->offset_within_region; giommu->container = container; giommu->n.notify = vfio_iommu_map_notify; QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next); @@ -433,6 +441,7 @@ static void vfio_listener_region_del(MemoryListener *listener, { VFIOContainer *container = container_of(listener, VFIOContainer, listener); hwaddr iova, end; + 
Int128 llend, llsize; int ret; if (vfio_listener_skipped_section(section)) { @@ -471,21 +480,25 @@ static void vfio_listener_region_del(MemoryListener *listener, } iova = TARGET_PAGE_ALIGN(section->offset_within_address_space); - end = (section->offset_within_address_space + int128_get64(section->size)) & - TARGET_PAGE_MASK; + llend = int128_make64(section->offset_within_address_space); + llend = int128_add(llend, section->size); + llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK)); - if (iova >= end) { + if (int128_ge(int128_make64(iova), llend)) { return; } + end = int128_get64(int128_sub(llend, int128_one())); + + llsize = int128_sub(llend, int128_make64(iova)); - trace_vfio_listener_region_del(iova, end - 1); + trace_vfio_listener_region_del(iova, end); - ret = vfio_dma_unmap(container, iova, end - iova); + ret = vfio_dma_unmap(container, iova, int128_get64(llsize)); memory_region_unref(section->mr); if (ret) { error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx") = %d (%m)", - container, iova, end - iova, ret); + container, iova, int128_get64(llsize), ret); } } @@ -499,6 +512,54 @@ static void vfio_listener_release(VFIOContainer *container) memory_listener_unregister(&container->listener); } +static struct vfio_info_cap_header * +vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id) +{ + struct vfio_info_cap_header *hdr; + void *ptr = info; + + if (!(info->flags & VFIO_REGION_INFO_FLAG_CAPS)) { + return NULL; + } + + for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) { + if (hdr->id == id) { + return hdr; + } + } + + return NULL; +} + +static void vfio_setup_region_sparse_mmaps(VFIORegion *region, + struct vfio_region_info *info) +{ + struct vfio_info_cap_header *hdr; + struct vfio_region_info_cap_sparse_mmap *sparse; + int i; + + hdr = vfio_get_region_info_cap(info, VFIO_REGION_INFO_CAP_SPARSE_MMAP); + if (!hdr) { + return; + } + + sparse = container_of(hdr, struct vfio_region_info_cap_sparse_mmap, 
header); + + trace_vfio_region_sparse_mmap_header(region->vbasedev->name, + region->nr, sparse->nr_areas); + + region->nr_mmaps = sparse->nr_areas; + region->mmaps = g_new0(VFIOMmap, region->nr_mmaps); + + for (i = 0; i < region->nr_mmaps; i++) { + region->mmaps[i].offset = sparse->areas[i].offset; + region->mmaps[i].size = sparse->areas[i].size; + trace_vfio_region_sparse_mmap_entry(i, region->mmaps[i].offset, + region->mmaps[i].offset + + region->mmaps[i].size); + } +} + int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region, int index, const char *name) { @@ -525,11 +586,14 @@ int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region, region->flags & VFIO_REGION_INFO_FLAG_MMAP && !(region->size & ~qemu_real_host_page_mask)) { - region->nr_mmaps = 1; - region->mmaps = g_new0(VFIOMmap, region->nr_mmaps); + vfio_setup_region_sparse_mmaps(region, info); - region->mmaps[0].offset = 0; - region->mmaps[0].size = region->size; + if (!region->nr_mmaps) { + region->nr_mmaps = 1; + region->mmaps = g_new0(VFIOMmap, region->nr_mmaps); + region->mmaps[0].offset = 0; + region->mmaps[0].size = region->size; + } } } @@ -1089,16 +1153,60 @@ int vfio_get_region_info(VFIODevice *vbasedev, int index, *info = g_malloc0(argsz); (*info)->index = index; +retry: (*info)->argsz = argsz; if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, *info)) { g_free(*info); + *info = NULL; return -errno; } + if ((*info)->argsz > argsz) { + argsz = (*info)->argsz; + *info = g_realloc(*info, argsz); + + goto retry; + } + return 0; } +int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type, + uint32_t subtype, struct vfio_region_info **info) +{ + int i; + + for (i = 0; i < vbasedev->num_regions; i++) { + struct vfio_info_cap_header *hdr; + struct vfio_region_info_cap_type *cap_type; + + if (vfio_get_region_info(vbasedev, i, info)) { + continue; + } + + hdr = vfio_get_region_info_cap(*info, VFIO_REGION_INFO_CAP_TYPE); + if (!hdr) { + g_free(*info); + 
continue; + } + + cap_type = container_of(hdr, struct vfio_region_info_cap_type, header); + + trace_vfio_get_dev_region(vbasedev->name, i, + cap_type->type, cap_type->subtype); + + if (cap_type->type == type && cap_type->subtype == subtype) { + return 0; + } + + g_free(*info); + } + + *info = NULL; + return -ENODEV; +} + /* * Interfaces for IBM EEH (Enhanced Error Handling) */ diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c index 49ecf1172a..35d32b78f4 100644 --- a/hw/vfio/pci-quirks.c +++ b/hw/vfio/pci-quirks.c @@ -11,9 +11,12 @@ */ #include "qemu/osdep.h" +#include "qemu/error-report.h" +#include "qemu/range.h" +#include "qapi/error.h" +#include "hw/nvram/fw_cfg.h" #include "pci.h" #include "trace.h" -#include "qemu/range.h" /* Use uin32_t for vendor & device so PCI_ANY_ID expands and cannot match hw */ static bool vfio_pci_is(VFIOPCIDevice *vdev, uint32_t vendor, uint32_t device) @@ -962,6 +965,643 @@ static void vfio_probe_rtl8168_bar2_quirk(VFIOPCIDevice *vdev, int nr) } /* + * Intel IGD support + * + * Obviously IGD is not a discrete device, this is evidenced not only by it + * being integrated into the CPU, but by the various chipset and BIOS + * dependencies that it brings along with it. Intel is trying to move away + * from this and Broadwell and newer devices can run in what Intel calls + * "Universal Pass-Through" mode, or UPT. Theoretically in UPT mode, nothing + * more is required beyond assigning the IGD device to a VM. There are + * however support limitations to this mode. It only supports IGD as a + * secondary graphics device in the VM and it doesn't officially support any + * physical outputs. + * + * The code here attempts to enable what we'll call legacy mode assignment, + * IGD retains most of the capabilities we expect for it to have on bare + * metal. 
To enable this mode, the IGD device must be assigned to the VM + * at PCI address 00:02.0, it must have a ROM, it very likely needs VGA + * support, we must have VM BIOS support for reserving and populating some + * of the required tables, and we need to tweak the chipset with revisions + * and IDs and an LPC/ISA bridge device. The intention is to make all of + * this happen automatically by installing the device at the correct VM PCI + * bus address. If any of the conditions are not met, we cross our fingers + * and hope the user knows better. + * + * NB - It is possible to enable physical outputs in UPT mode by supplying + * an OpRegion table. We don't do this by default because the guest driver + * behaves differently if an OpRegion is provided and no monitor is attached + * vs no OpRegion and a monitor being attached or not. Effectively, if a + * headless setup is desired, the OpRegion gets in the way of that. + */ + +/* + * This presumes the device is already known to be an Intel VGA device, so we + * take liberties in which device ID bits match which generation. This should + * not be taken as an indication that all the devices are supported, or even + * supportable, some of them don't even support VT-d. + * See linux:include/drm/i915_pciids.h for IDs. 
+ */ +static int igd_gen(VFIOPCIDevice *vdev) +{ + if ((vdev->device_id & 0xfff) == 0xa84) { + return 8; /* Broxton */ + } + + switch (vdev->device_id & 0xff00) { + /* Old, untested, unavailable, unknown */ + case 0x0000: + case 0x2500: + case 0x2700: + case 0x2900: + case 0x2a00: + case 0x2e00: + case 0x3500: + case 0xa000: + return -1; + /* SandyBridge, IvyBridge, ValleyView, Haswell */ + case 0x0100: + case 0x0400: + case 0x0a00: + case 0x0c00: + case 0x0d00: + case 0x0f00: + return 6; + /* BroadWell, CherryView, SkyLake, KabyLake */ + case 0x1600: + case 0x1900: + case 0x2200: + case 0x5900: + return 8; + } + + return 8; /* Assume newer is compatible */ +} + +typedef struct VFIOIGDQuirk { + struct VFIOPCIDevice *vdev; + uint32_t index; +} VFIOIGDQuirk; + +#define IGD_GMCH 0x50 /* Graphics Control Register */ +#define IGD_BDSM 0x5c /* Base Data of Stolen Memory */ +#define IGD_ASLS 0xfc /* ASL Storage Register */ + +/* + * The OpRegion includes the Video BIOS Table, which seems important for + * telling the driver what sort of outputs it has. Without this, the device + * may work in the guest, but we may not get output. This also requires BIOS + * support to reserve and populate a section of guest memory sufficient for + * the table and to write the base address of that memory to the ASLS register + * of the IGD device. + */ +int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev, + struct vfio_region_info *info) +{ + int ret; + + vdev->igd_opregion = g_malloc0(info->size); + ret = pread(vdev->vbasedev.fd, vdev->igd_opregion, + info->size, info->offset); + if (ret != info->size) { + error_report("vfio: Error reading IGD OpRegion"); + g_free(vdev->igd_opregion); + vdev->igd_opregion = NULL; + return -EINVAL; + } + + /* + * Provide fw_cfg with a copy of the OpRegion which the VM firmware is to + * allocate 32bit reserved memory for, copy these contents into, and write + * the reserved memory base address to the device ASLS register at 0xFC. 
+ * Alignment of this reserved region seems flexible, but using a 4k page + * alignment seems to work well. This interface assumes a single IGD + * device, which may be at VM address 00:02.0 in legacy mode or another + * address in UPT mode. + * + * NB, there may be future use cases discovered where the VM should have + * direct interaction with the host OpRegion, in which case the write to + * the ASLS register would trigger MemoryRegion setup to enable that. + */ + fw_cfg_add_file(fw_cfg_find(), "etc/igd-opregion", + vdev->igd_opregion, info->size); + + trace_vfio_pci_igd_opregion_enabled(vdev->vbasedev.name); + + pci_set_long(vdev->pdev.config + IGD_ASLS, 0); + pci_set_long(vdev->pdev.wmask + IGD_ASLS, ~0); + pci_set_long(vdev->emulated_config_bits + IGD_ASLS, ~0); + + return 0; +} + +/* + * The rather short list of registers that we copy from the host devices. + * The LPC/ISA bridge values are definitely needed to support the vBIOS, the + * host bridge values may or may not be needed depending on the guest OS. + * Since we're only munging revision and subsystem values on the host bridge, + * we don't require our own device. The LPC/ISA bridge needs to be our very + * own though. 
+ */ +typedef struct { + uint8_t offset; + uint8_t len; +} IGDHostInfo; + +static const IGDHostInfo igd_host_bridge_infos[] = { + {PCI_REVISION_ID, 2}, + {PCI_SUBSYSTEM_VENDOR_ID, 2}, + {PCI_SUBSYSTEM_ID, 2}, +}; + +static const IGDHostInfo igd_lpc_bridge_infos[] = { + {PCI_VENDOR_ID, 2}, + {PCI_DEVICE_ID, 2}, + {PCI_REVISION_ID, 2}, + {PCI_SUBSYSTEM_VENDOR_ID, 2}, + {PCI_SUBSYSTEM_ID, 2}, +}; + +static int vfio_pci_igd_copy(VFIOPCIDevice *vdev, PCIDevice *pdev, + struct vfio_region_info *info, + const IGDHostInfo *list, int len) +{ + int i, ret; + + for (i = 0; i < len; i++) { + ret = pread(vdev->vbasedev.fd, pdev->config + list[i].offset, + list[i].len, info->offset + list[i].offset); + if (ret != list[i].len) { + error_report("IGD copy failed: %m"); + return -errno; + } + } + + return 0; +} + +/* + * Stuff a few values into the host bridge. + */ +static int vfio_pci_igd_host_init(VFIOPCIDevice *vdev, + struct vfio_region_info *info) +{ + PCIBus *bus; + PCIDevice *host_bridge; + int ret; + + bus = pci_device_root_bus(&vdev->pdev); + host_bridge = pci_find_device(bus, 0, PCI_DEVFN(0, 0)); + + if (!host_bridge) { + error_report("Can't find host bridge"); + return -ENODEV; + } + + ret = vfio_pci_igd_copy(vdev, host_bridge, info, igd_host_bridge_infos, + ARRAY_SIZE(igd_host_bridge_infos)); + if (!ret) { + trace_vfio_pci_igd_host_bridge_enabled(vdev->vbasedev.name); + } + + return ret; +} + +/* + * IGD LPC/ISA bridge support code. The vBIOS needs this, but we can't write + * arbitrary values into just any bridge, so we must create our own. We try + * to handle if the user has created it for us, which they might want to do + * to enable multifuction so we don't occupy the whole PCI slot. 
+ */ +static void vfio_pci_igd_lpc_bridge_realize(PCIDevice *pdev, Error **errp) +{ + if (pdev->devfn != PCI_DEVFN(0x1f, 0)) { + error_setg(errp, "VFIO dummy ISA/LPC bridge must have address 1f.0"); + } +} + +static void vfio_pci_igd_lpc_bridge_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + PCIDeviceClass *k = PCI_DEVICE_CLASS(klass); + + dc->desc = "VFIO dummy ISA/LPC bridge for IGD assignment"; + dc->hotpluggable = false; + k->realize = vfio_pci_igd_lpc_bridge_realize; + k->class_id = PCI_CLASS_BRIDGE_ISA; +} + +static TypeInfo vfio_pci_igd_lpc_bridge_info = { + .name = "vfio-pci-igd-lpc-bridge", + .parent = TYPE_PCI_DEVICE, + .class_init = vfio_pci_igd_lpc_bridge_class_init, +}; + +static void vfio_pci_igd_register_types(void) +{ + type_register_static(&vfio_pci_igd_lpc_bridge_info); +} + +type_init(vfio_pci_igd_register_types) + +static int vfio_pci_igd_lpc_init(VFIOPCIDevice *vdev, + struct vfio_region_info *info) +{ + PCIDevice *lpc_bridge; + int ret; + + lpc_bridge = pci_find_device(pci_device_root_bus(&vdev->pdev), + 0, PCI_DEVFN(0x1f, 0)); + if (!lpc_bridge) { + lpc_bridge = pci_create_simple(pci_device_root_bus(&vdev->pdev), + PCI_DEVFN(0x1f, 0), "vfio-pci-igd-lpc-bridge"); + } + + ret = vfio_pci_igd_copy(vdev, lpc_bridge, info, igd_lpc_bridge_infos, + ARRAY_SIZE(igd_lpc_bridge_infos)); + if (!ret) { + trace_vfio_pci_igd_lpc_bridge_enabled(vdev->vbasedev.name); + } + + return ret; +} + +/* + * IGD Gen8 and newer support up to 8MB for the GTT and use a 64bit PTE + * entry, older IGDs use 2MB and 32bit. Each PTE maps a 4k page. Therefore + * we either have 2M/4k * 4 = 2k or 8M/4k * 8 = 16k as the maximum iobar index + * for programming the GTT. + * + * See linux:include/drm/i915_drm.h for shift and mask values. 
+ */ +static int vfio_igd_gtt_max(VFIOPCIDevice *vdev) +{ + uint32_t gmch = vfio_pci_read_config(&vdev->pdev, IGD_GMCH, sizeof(gmch)); + int ggms, gen = igd_gen(vdev); + + gmch = vfio_pci_read_config(&vdev->pdev, IGD_GMCH, sizeof(gmch)); + ggms = (gmch >> (gen < 8 ? 8 : 6)) & 0x3; + if (gen > 6) { + ggms = 1 << ggms; + } + + ggms *= 1024 * 1024; + + return (ggms / (4 * 1024)) * (gen < 8 ? 4 : 8); +} + +/* + * The IGD ROM will make use of stolen memory (GGMS) for support of VESA modes. + * Somehow the host stolen memory range is used for this, but how the ROM gets + * it is a mystery, perhaps it's hardcoded into the ROM. Thankfully though, it + * reprograms the GTT through the IOBAR where we can trap it and transpose the + * programming to the VM allocated buffer. That buffer gets reserved by the VM + * firmware via the fw_cfg entry added below. Here we're just monitoring the + * IOBAR address and data registers to detect a write sequence targeting the + * GTTADR. This code is developed by observed behavior and doesn't have a + * direct spec reference, unfortunately. + */ +static uint64_t vfio_igd_quirk_data_read(void *opaque, + hwaddr addr, unsigned size) +{ + VFIOIGDQuirk *igd = opaque; + VFIOPCIDevice *vdev = igd->vdev; + + igd->index = ~0; + + return vfio_region_read(&vdev->bars[4].region, addr + 4, size); +} + +static void vfio_igd_quirk_data_write(void *opaque, hwaddr addr, + uint64_t data, unsigned size) +{ + VFIOIGDQuirk *igd = opaque; + VFIOPCIDevice *vdev = igd->vdev; + uint64_t val = data; + int gen = igd_gen(vdev); + + /* + * Programming the GGMS starts at index 0x1 and uses every 4th index (ie. + * 0x1, 0x5, 0x9, 0xd,...). For pre-Gen8 each 4-byte write is a whole PTE + * entry, with 0th bit enable set. For Gen8 and up, PTEs are 64bit, so + * entries 0x5 & 0xd are the high dword, in our case zero. Each PTE points + * to a 4k page, which we translate to a page from the VM allocated region, + * pointed to by the BDSM register. If this is not set, we fail. 
+ * + * We trap writes to the full configured GTT size, but we typically only + * see the vBIOS writing up to (nearly) the 1MB barrier. In fact it often + * seems to miss the last entry for an even 1MB GTT. Doing a gratuitous + * write of that last entry does work, but is hopefully unnecessary since + * we clear the previous GTT on initialization. + */ + if ((igd->index % 4 == 1) && igd->index < vfio_igd_gtt_max(vdev)) { + if (gen < 8 || (igd->index % 8 == 1)) { + uint32_t base; + + base = pci_get_long(vdev->pdev.config + IGD_BDSM); + if (!base) { + hw_error("vfio-igd: Guest attempted to program IGD GTT before " + "BIOS reserved stolen memory. Unsupported BIOS?"); + } + + val = base | (data & ((1 << 20) - 1)); + } else { + val = 0; /* upper 32bits of pte, we only enable below 4G PTEs */ + } + + trace_vfio_pci_igd_bar4_write(vdev->vbasedev.name, + igd->index, data, val); + } + + vfio_region_write(&vdev->bars[4].region, addr + 4, val, size); + + igd->index = ~0; +} + +static const MemoryRegionOps vfio_igd_data_quirk = { + .read = vfio_igd_quirk_data_read, + .write = vfio_igd_quirk_data_write, + .endianness = DEVICE_LITTLE_ENDIAN, +}; + +static uint64_t vfio_igd_quirk_index_read(void *opaque, + hwaddr addr, unsigned size) +{ + VFIOIGDQuirk *igd = opaque; + VFIOPCIDevice *vdev = igd->vdev; + + igd->index = ~0; + + return vfio_region_read(&vdev->bars[4].region, addr, size); +} + +static void vfio_igd_quirk_index_write(void *opaque, hwaddr addr, + uint64_t data, unsigned size) +{ + VFIOIGDQuirk *igd = opaque; + VFIOPCIDevice *vdev = igd->vdev; + + igd->index = data; + + vfio_region_write(&vdev->bars[4].region, addr, data, size); +} + +static const MemoryRegionOps vfio_igd_index_quirk = { + .read = vfio_igd_quirk_index_read, + .write = vfio_igd_quirk_index_write, + .endianness = DEVICE_LITTLE_ENDIAN, +}; + +static void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr) +{ + struct vfio_region_info *rom = NULL, *opregion = NULL, + *host = NULL, *lpc = NULL; + VFIOQuirk 
*quirk; + VFIOIGDQuirk *igd; + PCIDevice *lpc_bridge; + int i, ret, ggms_mb, gms_mb = 0, gen; + uint64_t *bdsm_size; + uint32_t gmch; + uint16_t cmd_orig, cmd; + + /* + * This must be an Intel VGA device at address 00:02.0 for us to even + * consider enabling legacy mode. The vBIOS has dependencies on the + * PCI bus address. + */ + if (!vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, PCI_ANY_ID) || + !vfio_is_vga(vdev) || nr != 4 || + &vdev->pdev != pci_find_device(pci_device_root_bus(&vdev->pdev), + 0, PCI_DEVFN(0x2, 0))) { + return; + } + + /* + * We need to create an LPC/ISA bridge at PCI bus address 00:1f.0 that we + * can stuff host values into, so if there's already one there and it's not + * one we can hack on, legacy mode is no-go. Sorry Q35. + */ + lpc_bridge = pci_find_device(pci_device_root_bus(&vdev->pdev), + 0, PCI_DEVFN(0x1f, 0)); + if (lpc_bridge && !object_dynamic_cast(OBJECT(lpc_bridge), + "vfio-pci-igd-lpc-bridge")) { + error_report("IGD device %s cannot support legacy mode due to existing " + "devices at address 1f.0", vdev->vbasedev.name); + return; + } + + /* + * IGD is not a standard, they like to change their specs often. We + * only attempt to support back to SandyBridge and we hope that newer + * devices maintain compatibility with generation 8. + */ + gen = igd_gen(vdev); + if (gen != 6 && gen != 8) { + error_report("IGD device %s is unsupported in legacy mode, " + "try SandyBridge or newer", vdev->vbasedev.name); + return; + } + + /* + * Most of what we're doing here is to enable the ROM to run, so if + * there's no ROM, there's no point in setting up this quirk. + * NB. We only seem to get BIOS ROMs, so a UEFI VM would need CSM support.
+ */ + ret = vfio_get_region_info(&vdev->vbasedev, + VFIO_PCI_ROM_REGION_INDEX, &rom); + if ((ret || !rom->size) && !vdev->pdev.romfile) { + error_report("IGD device %s has no ROM, legacy mode disabled", + vdev->vbasedev.name); + goto out; + } + + /* + * Ignore the hotplug corner case, mark the ROM failed, we can't + * create the devices we need for legacy mode in the hotplug scenario. + */ + if (vdev->pdev.qdev.hotplugged) { + error_report("IGD device %s hotplugged, ROM disabled, " + "legacy mode disabled", vdev->vbasedev.name); + vdev->rom_read_failed = true; + goto out; + } + + /* + * Check whether we have all the vfio device specific regions to + * support legacy mode (added in Linux v4.6). If not, bail. + */ + ret = vfio_get_dev_region_info(&vdev->vbasedev, + VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL, + VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, &opregion); + if (ret) { + error_report("IGD device %s does not support OpRegion access," + "legacy mode disabled", vdev->vbasedev.name); + goto out; + } + + ret = vfio_get_dev_region_info(&vdev->vbasedev, + VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL, + VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG, &host); + if (ret) { + error_report("IGD device %s does not support host bridge access," + "legacy mode disabled", vdev->vbasedev.name); + goto out; + } + + ret = vfio_get_dev_region_info(&vdev->vbasedev, + VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL, + VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG, &lpc); + if (ret) { + error_report("IGD device %s does not support LPC bridge access," + "legacy mode disabled", vdev->vbasedev.name); + goto out; + } + + gmch = vfio_pci_read_config(&vdev->pdev, IGD_GMCH, 4); + + /* + * If IGD VGA Disable is clear (expected) and VGA is not already enabled, + * try to enable it. Probably shouldn't be using legacy mode without VGA, + * but also no point in us enabling VGA if disabled in hardware. 
+ */ + if (!(gmch & 0x2) && !vdev->vga && vfio_populate_vga(vdev)) { + error_report("IGD device %s failed to enable VGA access, " + "legacy mode disabled", vdev->vbasedev.name); + goto out; + } + + /* Create our LPC/ISA bridge */ + ret = vfio_pci_igd_lpc_init(vdev, lpc); + if (ret) { + error_report("IGD device %s failed to create LPC bridge, " + "legacy mode disabled", vdev->vbasedev.name); + goto out; + } + + /* Stuff some host values into the VM PCI host bridge */ + ret = vfio_pci_igd_host_init(vdev, host); + if (ret) { + error_report("IGD device %s failed to modify host bridge, " + "legacy mode disabled", vdev->vbasedev.name); + goto out; + } + + /* Setup OpRegion access */ + ret = vfio_pci_igd_opregion_init(vdev, opregion); + if (ret) { + error_report("IGD device %s failed to setup OpRegion, " + "legacy mode disabled", vdev->vbasedev.name); + goto out; + } + + /* Setup our quirk to munge GTT addresses to the VM allocated buffer */ + quirk = g_malloc0(sizeof(*quirk)); + quirk->mem = g_new0(MemoryRegion, 2); + quirk->nr_mem = 2; + igd = quirk->data = g_malloc0(sizeof(*igd)); + igd->vdev = vdev; + igd->index = ~0; + + memory_region_init_io(&quirk->mem[0], OBJECT(vdev), &vfio_igd_index_quirk, + igd, "vfio-igd-index-quirk", 4); + memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, + 0, &quirk->mem[0], 1); + + memory_region_init_io(&quirk->mem[1], OBJECT(vdev), &vfio_igd_data_quirk, + igd, "vfio-igd-data-quirk", 4); + memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, + 4, &quirk->mem[1], 1); + + QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); + + /* Determine the size of stolen memory needed for GTT */ + ggms_mb = (gmch >> (gen < 8 ? 8 : 6)) & 0x3; + if (gen > 6) { + ggms_mb = 1 << ggms_mb; + } + + /* + * Assume we have no GMS memory, but allow it to be overridden by device + * option (experimental).
The spec doesn't actually allow zero GMS when + * IVD (IGD VGA Disable) is clear, but the claim is that it's unused, + * so let's not waste VM memory for it. + */ + gmch &= ~((gen < 8 ? 0x1f : 0xff) << (gen < 8 ? 3 : 8)); + + if (vdev->igd_gms) { + if (vdev->igd_gms <= 0x10) { + gms_mb = vdev->igd_gms * 32; + gmch |= vdev->igd_gms << (gen < 8 ? 3 : 8); + } else { + error_report("Unsupported IGD GMS value 0x%x", vdev->igd_gms); + vdev->igd_gms = 0; + } + } + + /* + * Request reserved memory for stolen memory via fw_cfg. VM firmware + * must allocate a 1MB aligned reserved memory region below 4GB with + * the requested size (in bytes) for use by the Intel PCI class VGA + * device at VM address 00:02.0. The base address of this reserved + * memory region must be written to the device BDSM register at PCI + * config offset 0x5C. + */ + bdsm_size = g_malloc(sizeof(*bdsm_size)); + *bdsm_size = cpu_to_le64((ggms_mb + gms_mb) * 1024 * 1024); + fw_cfg_add_file(fw_cfg_find(), "etc/igd-bdsm-size", + bdsm_size, sizeof(*bdsm_size)); + + /* GMCH is read-only, emulated */ + pci_set_long(vdev->pdev.config + IGD_GMCH, gmch); + pci_set_long(vdev->pdev.wmask + IGD_GMCH, 0); + pci_set_long(vdev->emulated_config_bits + IGD_GMCH, ~0); + + /* BDSM is read-write, emulated. The BIOS needs to be able to write it */ + pci_set_long(vdev->pdev.config + IGD_BDSM, 0); + pci_set_long(vdev->pdev.wmask + IGD_BDSM, ~0); + pci_set_long(vdev->emulated_config_bits + IGD_BDSM, ~0); + + /* + * This IOBAR gives us access to GTTADR, which allows us to write to + * the GTT itself. So let's go ahead and write zero to all the GTT + * entries to avoid spurious DMA faults. Be sure I/O access is enabled + * before talking to the device.
+ */ + if (pread(vdev->vbasedev.fd, &cmd_orig, sizeof(cmd_orig), + vdev->config_offset + PCI_COMMAND) != sizeof(cmd_orig)) { + error_report("IGD device %s - failed to read PCI command register", + vdev->vbasedev.name); + } + + cmd = cmd_orig | PCI_COMMAND_IO; + + if (pwrite(vdev->vbasedev.fd, &cmd, sizeof(cmd), + vdev->config_offset + PCI_COMMAND) != sizeof(cmd)) { + error_report("IGD device %s - failed to write PCI command register", + vdev->vbasedev.name); + } + + for (i = 1; i < vfio_igd_gtt_max(vdev); i += 4) { + vfio_region_write(&vdev->bars[4].region, 0, i, 4); + vfio_region_write(&vdev->bars[4].region, 4, 0, 4); + } + + if (pwrite(vdev->vbasedev.fd, &cmd_orig, sizeof(cmd_orig), + vdev->config_offset + PCI_COMMAND) != sizeof(cmd_orig)) { + error_report("IGD device %s - failed to restore PCI command register", + vdev->vbasedev.name); + } + + trace_vfio_pci_igd_bdsm_enabled(vdev->vbasedev.name, ggms_mb + gms_mb); + +out: + g_free(rom); + g_free(opregion); + g_free(host); + g_free(lpc); +} + +/* * Common quirk probe entry points. 
*/ void vfio_vga_quirk_setup(VFIOPCIDevice *vdev) @@ -1010,6 +1650,7 @@ void vfio_bar_quirk_setup(VFIOPCIDevice *vdev, int nr) vfio_probe_nvidia_bar5_quirk(vdev, nr); vfio_probe_nvidia_bar0_quirk(vdev, nr); vfio_probe_rtl8168_bar2_quirk(vdev, nr); + vfio_probe_igd_bar4_quirk(vdev, nr); } void vfio_bar_quirk_exit(VFIOPCIDevice *vdev, int nr) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index d091d8cf0e..deab0c601a 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -1440,8 +1440,6 @@ static void vfio_bar_setup(VFIOPCIDevice *vdev, int nr) vdev->vbasedev.name, nr); } - vfio_bar_quirk_setup(vdev, nr); - pci_register_bar(&vdev->pdev, nr, type, bar->region.mem); } @@ -1452,29 +1450,6 @@ static void vfio_bars_setup(VFIOPCIDevice *vdev) for (i = 0; i < PCI_ROM_SLOT; i++) { vfio_bar_setup(vdev, i); } - - if (vdev->vga) { - memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_MEM].mem, - OBJECT(vdev), &vfio_vga_ops, - &vdev->vga->region[QEMU_PCI_VGA_MEM], - "vfio-vga-mmio@0xa0000", - QEMU_PCI_VGA_MEM_SIZE); - memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem, - OBJECT(vdev), &vfio_vga_ops, - &vdev->vga->region[QEMU_PCI_VGA_IO_LO], - "vfio-vga-io@0x3b0", - QEMU_PCI_VGA_IO_LO_SIZE); - memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem, - OBJECT(vdev), &vfio_vga_ops, - &vdev->vga->region[QEMU_PCI_VGA_IO_HI], - "vfio-vga-io@0x3c0", - QEMU_PCI_VGA_IO_HI_SIZE); - - pci_register_vga(&vdev->pdev, &vdev->vga->region[QEMU_PCI_VGA_MEM].mem, - &vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem, - &vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem); - vfio_vga_quirk_setup(vdev); - } } static void vfio_bars_exit(VFIOPCIDevice *vdev) @@ -2061,42 +2036,61 @@ int vfio_populate_vga(VFIOPCIDevice *vdev) struct vfio_region_info *reg_info; int ret; - if (vbasedev->num_regions > VFIO_PCI_VGA_REGION_INDEX) { - ret = vfio_get_region_info(vbasedev, - VFIO_PCI_VGA_REGION_INDEX, &reg_info); - if (ret) { - return ret; - } + ret = vfio_get_region_info(vbasedev, VFIO_PCI_VGA_REGION_INDEX,
&reg_info); + if (ret) { + return ret; + } - if (!(reg_info->flags & VFIO_REGION_INFO_FLAG_READ) || - !(reg_info->flags & VFIO_REGION_INFO_FLAG_WRITE) || - reg_info->size < 0xbffff + 1) { - error_report("vfio: Unexpected VGA info, flags 0x%lx, size 0x%lx", - (unsigned long)reg_info->flags, - (unsigned long)reg_info->size); - g_free(reg_info); - return -EINVAL; - } + if (!(reg_info->flags & VFIO_REGION_INFO_FLAG_READ) || + !(reg_info->flags & VFIO_REGION_INFO_FLAG_WRITE) || + reg_info->size < 0xbffff + 1) { + error_report("vfio: Unexpected VGA info, flags 0x%lx, size 0x%lx", + (unsigned long)reg_info->flags, + (unsigned long)reg_info->size); + g_free(reg_info); + return -EINVAL; + } - vdev->vga = g_new0(VFIOVGA, 1); + vdev->vga = g_new0(VFIOVGA, 1); - vdev->vga->fd_offset = reg_info->offset; - vdev->vga->fd = vdev->vbasedev.fd; + vdev->vga->fd_offset = reg_info->offset; + vdev->vga->fd = vdev->vbasedev.fd; - g_free(reg_info); + g_free(reg_info); - vdev->vga->region[QEMU_PCI_VGA_MEM].offset = QEMU_PCI_VGA_MEM_BASE; - vdev->vga->region[QEMU_PCI_VGA_MEM].nr = QEMU_PCI_VGA_MEM; - QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_MEM].quirks); + vdev->vga->region[QEMU_PCI_VGA_MEM].offset = QEMU_PCI_VGA_MEM_BASE; + vdev->vga->region[QEMU_PCI_VGA_MEM].nr = QEMU_PCI_VGA_MEM; + QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_MEM].quirks); - vdev->vga->region[QEMU_PCI_VGA_IO_LO].offset = QEMU_PCI_VGA_IO_LO_BASE; - vdev->vga->region[QEMU_PCI_VGA_IO_LO].nr = QEMU_PCI_VGA_IO_LO; - QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].quirks); + memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_MEM].mem, + OBJECT(vdev), &vfio_vga_ops, + &vdev->vga->region[QEMU_PCI_VGA_MEM], + "vfio-vga-mmio@0xa0000", + QEMU_PCI_VGA_MEM_SIZE); - vdev->vga->region[QEMU_PCI_VGA_IO_HI].offset = QEMU_PCI_VGA_IO_HI_BASE; - vdev->vga->region[QEMU_PCI_VGA_IO_HI].nr = QEMU_PCI_VGA_IO_HI; - QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks); - } + vdev->vga->region[QEMU_PCI_VGA_IO_LO].offset =
QEMU_PCI_VGA_IO_LO_BASE; + vdev->vga->region[QEMU_PCI_VGA_IO_LO].nr = QEMU_PCI_VGA_IO_LO; + QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].quirks); + + memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem, + OBJECT(vdev), &vfio_vga_ops, + &vdev->vga->region[QEMU_PCI_VGA_IO_LO], + "vfio-vga-io@0x3b0", + QEMU_PCI_VGA_IO_LO_SIZE); + + vdev->vga->region[QEMU_PCI_VGA_IO_HI].offset = QEMU_PCI_VGA_IO_HI_BASE; + vdev->vga->region[QEMU_PCI_VGA_IO_HI].nr = QEMU_PCI_VGA_IO_HI; + QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks); + + memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem, + OBJECT(vdev), &vfio_vga_ops, + &vdev->vga->region[QEMU_PCI_VGA_IO_HI], + "vfio-vga-io@0x3c0", + QEMU_PCI_VGA_IO_HI_SIZE); + + pci_register_vga(&vdev->pdev, &vdev->vga->region[QEMU_PCI_VGA_MEM].mem, + &vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem, + &vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem); return 0; } @@ -2398,7 +2392,7 @@ static int vfio_initfn(PCIDevice *pdev) ssize_t len; struct stat st; int groupid; - int ret; + int i, ret; if (!vdev->vbasedev.sysfsdev) { vdev->vbasedev.sysfsdev = @@ -2560,6 +2554,43 @@ static int vfio_initfn(PCIDevice *pdev) goto out_teardown; } + if (vdev->vga) { + vfio_vga_quirk_setup(vdev); + } + + for (i = 0; i < PCI_ROM_SLOT; i++) { + vfio_bar_quirk_setup(vdev, i); + } + + if (!vdev->igd_opregion && + vdev->features & VFIO_FEATURE_ENABLE_IGD_OPREGION) { + struct vfio_region_info *opregion; + + if (vdev->pdev.qdev.hotplugged) { + error_report("Cannot support IGD OpRegion feature on hotplugged " + "device %s", vdev->vbasedev.name); + ret = -EINVAL; + goto out_teardown; + } + + ret = vfio_get_dev_region_info(&vdev->vbasedev, + VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL, + VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, &opregion); + if (ret) { + error_report("Device %s does not support requested IGD OpRegion " + "feature", vdev->vbasedev.name); + goto out_teardown; + } + + ret = vfio_pci_igd_opregion_init(vdev, opregion); + 
g_free(opregion); + if (ret) { + error_report("Device %s IGD OpRegion initialization failed", + vdev->vbasedev.name); + goto out_teardown; + } + } + /* QEMU emulates all of MSI & MSIX */ if (pdev->cap_present & QEMU_PCI_CAP_MSIX) { memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff, @@ -2603,6 +2634,13 @@ static void vfio_instance_finalize(Object *obj) vfio_bars_finalize(vdev); g_free(vdev->emulated_config_bits); g_free(vdev->rom); + /* + * XXX Leaking igd_opregion is not an oversight, we can't remove the + * fw_cfg entry therefore leaking this allocation seems like the safest + * option. + * + * g_free(vdev->igd_opregion); + */ vfio_put_device(vdev); vfio_put_group(group); } @@ -2677,6 +2715,8 @@ static Property vfio_pci_dev_properties[] = { VFIO_FEATURE_ENABLE_VGA_BIT, false), DEFINE_PROP_BIT("x-req", VFIOPCIDevice, features, VFIO_FEATURE_ENABLE_REQ_BIT, true), + DEFINE_PROP_BIT("x-igd-opregion", VFIOPCIDevice, features, + VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false), DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false), DEFINE_PROP_BOOL("x-no-kvm-intx", VFIOPCIDevice, no_kvm_intx, false), DEFINE_PROP_BOOL("x-no-kvm-msi", VFIOPCIDevice, no_kvm_msi, false), @@ -2687,6 +2727,7 @@ static Property vfio_pci_dev_properties[] = { sub_vendor_id, PCI_ANY_ID), DEFINE_PROP_UINT32("x-pci-sub-device-id", VFIOPCIDevice, sub_device_id, PCI_ANY_ID), + DEFINE_PROP_UINT32("x-igd-gms", VFIOPCIDevice, igd_gms, 0), /* * TODO - support passed fds... is this necessary? 
* DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name), diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index 3976f68549..b3eb0d838e 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -115,6 +115,7 @@ typedef struct VFIOPCIDevice { int interrupt; /* Current interrupt type */ VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */ VFIOVGA *vga; /* 0xa0000, 0x3b0, 0x3c0 */ + void *igd_opregion; PCIHostDeviceAddress host; EventNotifier err_notifier; EventNotifier req_notifier; @@ -128,7 +129,11 @@ typedef struct VFIOPCIDevice { #define VFIO_FEATURE_ENABLE_VGA (1 << VFIO_FEATURE_ENABLE_VGA_BIT) #define VFIO_FEATURE_ENABLE_REQ_BIT 1 #define VFIO_FEATURE_ENABLE_REQ (1 << VFIO_FEATURE_ENABLE_REQ_BIT) +#define VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT 2 +#define VFIO_FEATURE_ENABLE_IGD_OPREGION \ + (1 << VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT) int32_t bootindex; + uint32_t igd_gms; uint8_t pm_cap; bool has_vga; bool pci_aer; @@ -159,4 +164,7 @@ void vfio_setup_resetfn_quirk(VFIOPCIDevice *vdev); int vfio_populate_vga(VFIOPCIDevice *vdev); +int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev, + struct vfio_region_info *info); + #endif /* HW_VFIO_VFIO_PCI_H */ diff --git a/include/block/block.h b/include/block/block.h index a8c15e36e7..70ea29947c 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -17,7 +17,6 @@ typedef struct BlockJob BlockJob; typedef struct BdrvChild BdrvChild; typedef struct BdrvChildRole BdrvChildRole; typedef struct BlockJobTxn BlockJobTxn; -typedef struct BdrvNextIterator BdrvNextIterator; typedef struct BlockDriverInfo { /* in bytes, 0 if irrelevant */ @@ -198,7 +197,6 @@ BlockDriver *bdrv_find_format(const char *format_name); int bdrv_create(BlockDriver *drv, const char* filename, QemuOpts *opts, Error **errp); int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp); -BlockDriverState *bdrv_new_root(void); BlockDriverState *bdrv_new(void); void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top); void 
bdrv_replace_in_backing_chain(BlockDriverState *old, @@ -214,8 +212,8 @@ BdrvChild *bdrv_open_child(const char *filename, void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd); int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options, const char *bdref_key, Error **errp); -int bdrv_open(BlockDriverState **pbs, const char *filename, - const char *reference, QDict *options, int flags, Error **errp); +BlockDriverState *bdrv_open(const char *filename, const char *reference, + QDict *options, int flags, Error **errp); BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue, BlockDriverState *bs, QDict *options, int flags); @@ -244,10 +242,6 @@ int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, const void *buf, int count); int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov); -int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov); -int coroutine_fn bdrv_co_readv_no_serialising(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov); int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov); /* @@ -402,7 +396,19 @@ BlockDriverState *bdrv_lookup_bs(const char *device, Error **errp); bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base); BlockDriverState *bdrv_next_node(BlockDriverState *bs); -BdrvNextIterator *bdrv_next(BdrvNextIterator *it, BlockDriverState **bs); + +typedef struct BdrvNextIterator { + enum { + BDRV_NEXT_BACKEND_ROOTS, + BDRV_NEXT_MONITOR_OWNED, + } phase; + BlockBackend *blk; + BlockDriverState *bs; +} BdrvNextIterator; + +BlockDriverState *bdrv_first(BdrvNextIterator *it); +BlockDriverState *bdrv_next(BdrvNextIterator *it); + BlockDriverState *bdrv_next_monitor_owned(BlockDriverState *bs); int bdrv_is_encrypted(BlockDriverState *bs); int bdrv_key_required(BlockDriverState *bs); diff --git 
a/include/block/block_int.h b/include/block/block_int.h index b6f4755725..30a97178c8 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -719,7 +719,8 @@ void hmp_drive_add_node(Monitor *mon, const char *optstr); BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs, const char *child_name, - const BdrvChildRole *child_role); + const BdrvChildRole *child_role, + void *opaque); void bdrv_root_unref_child(BdrvChild *child); const char *bdrv_get_parent_name(const BlockDriverState *bs); diff --git a/include/block/blockjob.h b/include/block/blockjob.h index 073a433cf8..86d28070b8 100644 --- a/include/block/blockjob.h +++ b/include/block/blockjob.h @@ -82,7 +82,7 @@ struct BlockJob { const BlockJobDriver *driver; /** The block device on which the job is operating. */ - BlockDriverState *bs; + BlockBackend *blk; /** * The ID of the block job. Currently the BlockBackend name of the BDS @@ -135,6 +135,9 @@ struct BlockJob { */ bool deferred_to_main_loop; + /** Element of the list of block jobs */ + QLIST_ENTRY(BlockJob) job_list; + /** Status that is published by the query-block-jobs QMP API */ BlockDeviceIoStatus iostatus; @@ -173,6 +176,17 @@ struct BlockJob { }; /** + * block_job_next: + * @job: A block job, or %NULL. + * + * Get the next element from the list of block jobs after @job, or the + * first one if @job is %NULL. + * + * Returns the requested job, or %NULL if there are no more jobs left. + */ +BlockJob *block_job_next(BlockJob *job); + +/** * block_job_create: * @job_type: The class object for the newly-created job. * @bs: The block @@ -357,6 +371,13 @@ bool block_job_is_paused(BlockJob *job); int block_job_cancel_sync(BlockJob *job); /** + * block_job_cancel_sync_all: + * + * Synchronously cancels all jobs using block_job_cancel_sync(). + */ +void block_job_cancel_sync_all(void); + +/** * block_job_complete_sync: * @job: The job to be completed. 
* @errp: Error object which may be set by block_job_complete(); this is not diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index eb0e1b0342..0610377789 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -90,6 +90,7 @@ typedef struct VFIOContainer { typedef struct VFIOGuestIOMMU { VFIOContainer *container; MemoryRegion *iommu; + hwaddr iommu_offset; Notifier n; QLIST_ENTRY(VFIOGuestIOMMU) giommu_next; } VFIOGuestIOMMU; @@ -154,5 +155,7 @@ extern QLIST_HEAD(vfio_as_head, VFIOAddressSpace) vfio_address_spaces; #ifdef CONFIG_LINUX int vfio_get_region_info(VFIODevice *vbasedev, int index, struct vfio_region_info **info); +int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type, + uint32_t subtype, struct vfio_region_info **info); #endif #endif /* !HW_VFIO_VFIO_COMMON_H */ diff --git a/include/migration/migration.h b/include/migration/migration.h index 9e36a97fc5..13b12b7e87 100644 --- a/include/migration/migration.h +++ b/include/migration/migration.h @@ -135,9 +135,12 @@ struct MigrationState QemuThread thread; QEMUBH *cleanup_bh; QEMUFile *to_dst_file; - int parameters[MIGRATION_PARAMETER__MAX]; + + /* New style params from 'migrate-set-parameters' */ + MigrationParameters parameters; int state; + /* Old style params from 'migrate' command */ MigrationParams params; /* State related to return path */ @@ -171,6 +174,9 @@ struct MigrationState QSIMPLEQ_HEAD(src_page_requests, MigrationSrcPageRequest) src_page_requests; /* The RAMBlock used in the last src_page_request */ RAMBlock *last_req_rb; + + /* The last error that occurred */ + Error *error; }; void migrate_set_state(int *state, int old_state, int new_state); @@ -179,6 +185,22 @@ void process_incoming_migration(QEMUFile *f); void qemu_start_incoming_migration(const char *uri, Error **errp); +void migration_set_incoming_channel(MigrationState *s, + QIOChannel *ioc); + +void migration_tls_set_incoming_channel(MigrationState *s, + QIOChannel *ioc, + 
Error **errp); + +void migration_set_outgoing_channel(MigrationState *s, + QIOChannel *ioc, + const char *hostname); + +void migration_tls_set_outgoing_channel(MigrationState *s, + QIOChannel *ioc, + const char *hostname, + Error **errp); + uint64_t migrate_max_downtime(void); void exec_start_incoming_migration(const char *host_port, Error **errp); @@ -201,7 +223,7 @@ void rdma_start_outgoing_migration(void *opaque, const char *host_port, Error ** void rdma_start_incoming_migration(const char *host_port, Error **errp); -void migrate_fd_error(MigrationState *s); +void migrate_fd_error(MigrationState *s, const Error *error); void migrate_fd_connect(MigrationState *s); diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h index 3f6b4ed581..2409a98967 100644 --- a/include/migration/qemu-file.h +++ b/include/migration/qemu-file.h @@ -23,16 +23,11 @@ */ #ifndef QEMU_FILE_H #define QEMU_FILE_H 1 +#include "qemu-common.h" #include "exec/cpu-common.h" +#include "io/channel.h" -/* This function writes a chunk of data to a file at the given position. - * The pos argument can be ignored if the file is only being used for - * streaming. The handler should try to write all of the data it can. - */ -typedef ssize_t (QEMUFilePutBufferFunc)(void *opaque, const uint8_t *buf, - int64_t pos, size_t size); - /* Read a chunk of data from a file at the given position. The pos argument * can be ignored if the file is only be used for streaming. The number of * bytes actually read should be returned. @@ -53,8 +48,13 @@ typedef int (QEMUFileCloseFunc)(void *opaque); */ typedef int (QEMUFileGetFD)(void *opaque); +/* Called to change the blocking mode of the file + */ +typedef int (QEMUFileSetBlocking)(void *opaque, bool enabled); + /* - * This function writes an iovec to file. + * This function writes an iovec to file. The handler must write all + * of the data or return a negative errno value. 
*/ typedef ssize_t (QEMUFileWritevBufferFunc)(void *opaque, struct iovec *iov, int iovcnt, int64_t pos); @@ -101,32 +101,25 @@ typedef QEMUFile *(QEMURetPathFunc)(void *opaque); typedef int (QEMUFileShutdownFunc)(void *opaque, bool rd, bool wr); typedef struct QEMUFileOps { - QEMUFilePutBufferFunc *put_buffer; QEMUFileGetBufferFunc *get_buffer; QEMUFileCloseFunc *close; - QEMUFileGetFD *get_fd; + QEMUFileSetBlocking *set_blocking; QEMUFileWritevBufferFunc *writev_buffer; - QEMURamHookFunc *before_ram_iterate; - QEMURamHookFunc *after_ram_iterate; - QEMURamHookFunc *hook_ram_load; - QEMURamSaveFunc *save_page; QEMURetPathFunc *get_return_path; QEMUFileShutdownFunc *shut_down; } QEMUFileOps; -struct QEMUSizedBuffer { - struct iovec *iov; - size_t n_iov; - size_t size; /* total allocated size in all iov's */ - size_t used; /* number of used bytes */ -}; +typedef struct QEMUFileHooks { + QEMURamHookFunc *before_ram_iterate; + QEMURamHookFunc *after_ram_iterate; + QEMURamHookFunc *hook_ram_load; + QEMURamSaveFunc *save_page; +} QEMUFileHooks; QEMUFile *qemu_fopen_ops(void *opaque, const QEMUFileOps *ops); -QEMUFile *qemu_fopen(const char *filename, const char *mode); -QEMUFile *qemu_fdopen(int fd, const char *mode); -QEMUFile *qemu_fopen_socket(int fd, const char *mode); -QEMUFile *qemu_popen_cmd(const char *command, const char *mode); -QEMUFile *qemu_bufopen(const char *mode, QEMUSizedBuffer *input); +QEMUFile *qemu_fopen_channel_input(QIOChannel *ioc); +QEMUFile *qemu_fopen_channel_output(QIOChannel *ioc); +void qemu_file_set_hooks(QEMUFile *f, const QEMUFileHooks *hooks); int qemu_get_fd(QEMUFile *f); int qemu_fclose(QEMUFile *f); int64_t qemu_ftell(QEMUFile *f); @@ -141,20 +134,6 @@ void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, size_t size); bool qemu_file_mode_is_not_valid(const char *mode); bool qemu_file_is_writable(QEMUFile *f); -QEMUSizedBuffer *qsb_create(const uint8_t *buffer, size_t len); -void qsb_free(QEMUSizedBuffer *); -size_t 
qsb_set_length(QEMUSizedBuffer *qsb, size_t length); -size_t qsb_get_length(const QEMUSizedBuffer *qsb); -ssize_t qsb_get_buffer(const QEMUSizedBuffer *, off_t start, size_t count, - uint8_t *buf); -ssize_t qsb_write_at(QEMUSizedBuffer *qsb, const uint8_t *buf, - off_t pos, size_t count); - - -/* - * For use on files opened with qemu_bufopen - */ -const QEMUSizedBuffer *qemu_buf_get(QEMUFile *f); static inline void qemu_put_ubyte(QEMUFile *f, unsigned int v) { diff --git a/include/qapi/error.h b/include/qapi/error.h index 11be2327c0..0576659603 100644 --- a/include/qapi/error.h +++ b/include/qapi/error.h @@ -134,7 +134,7 @@ typedef enum ErrorClass { /* * Get @err's human-readable error message. */ -const char *error_get_pretty(Error *err); +const char *error_get_pretty(const Error *err); /* * Get @err's error class. diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h index 1dcf6f5d53..b113fcf156 100644 --- a/include/qemu/typedefs.h +++ b/include/qemu/typedefs.h @@ -82,7 +82,6 @@ typedef struct QemuOpt QemuOpt; typedef struct QemuOpts QemuOpts; typedef struct QemuOptsList QemuOptsList; typedef struct QEMUSGList QEMUSGList; -typedef struct QEMUSizedBuffer QEMUSizedBuffer; typedef struct QEMUTimer QEMUTimer; typedef struct QEMUTimerListGroup QEMUTimerListGroup; typedef struct QObject QObject; diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h index 68d92b556e..c04af8ea46 100644 --- a/include/sysemu/block-backend.h +++ b/include/sysemu/block-backend.h @@ -78,8 +78,7 @@ typedef struct BlockBackendPublic { QLIST_ENTRY(BlockBackendPublic) round_robin; } BlockBackendPublic; -BlockBackend *blk_new(Error **errp); -BlockBackend *blk_new_with_bs(Error **errp); +BlockBackend *blk_new(void); BlockBackend *blk_new_open(const char *filename, const char *reference, QDict *options, int flags, Error **errp); int blk_get_refcnt(BlockBackend *blk); @@ -114,11 +113,17 @@ void *blk_get_attached_dev(BlockBackend *blk); void blk_set_dev_ops(BlockBackend 
*blk, const BlockDevOps *ops, void *opaque); int blk_pread_unthrottled(BlockBackend *blk, int64_t offset, uint8_t *buf, int count); -int blk_write_zeroes(BlockBackend *blk, int64_t offset, - int count, BdrvRequestFlags flags); -BlockAIOCB *blk_aio_write_zeroes(BlockBackend *blk, int64_t offset, - int count, BdrvRequestFlags flags, - BlockCompletionFunc *cb, void *opaque); +int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset, + unsigned int bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags); +int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset, + unsigned int bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags); +int blk_pwrite_zeroes(BlockBackend *blk, int64_t offset, + int count, BdrvRequestFlags flags); +BlockAIOCB *blk_aio_pwrite_zeroes(BlockBackend *blk, int64_t offset, + int count, BdrvRequestFlags flags, + BlockCompletionFunc *cb, void *opaque); int blk_pread(BlockBackend *blk, int64_t offset, void *buf, int count); int blk_pwrite(BlockBackend *blk, int64_t offset, const void *buf, int count, BdrvRequestFlags flags); @@ -196,8 +201,8 @@ int blk_get_open_flags_from_root_state(BlockBackend *blk); void *blk_aio_get(const AIOCBInfo *aiocb_info, BlockBackend *blk, BlockCompletionFunc *cb, void *opaque); -int coroutine_fn blk_co_write_zeroes(BlockBackend *blk, int64_t offset, - int count, BdrvRequestFlags flags); +int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset, + int count, BdrvRequestFlags flags); int blk_write_compressed(BlockBackend *blk, int64_t sector_num, const uint8_t *buf, int nb_sectors); int blk_truncate(BlockBackend *blk, int64_t offset); diff --git a/include/sysemu/dma.h b/include/sysemu/dma.h index d6e96a4298..34c8eaf64e 100644 --- a/include/sysemu/dma.h +++ b/include/sysemu/dma.h @@ -194,19 +194,19 @@ void qemu_sglist_add(QEMUSGList *qsg, dma_addr_t base, dma_addr_t len); void qemu_sglist_destroy(QEMUSGList *qsg); #endif -typedef BlockAIOCB *DMAIOFunc(BlockBackend *blk, int64_t offset, - 
QEMUIOVector *iov, BdrvRequestFlags flags, - BlockCompletionFunc *cb, void *opaque); - -BlockAIOCB *dma_blk_io(BlockBackend *blk, - QEMUSGList *sg, uint64_t sector_num, - DMAIOFunc *io_func, BlockCompletionFunc *cb, - void *opaque, DMADirection dir); +typedef BlockAIOCB *DMAIOFunc(int64_t offset, QEMUIOVector *iov, + BlockCompletionFunc *cb, void *cb_opaque, + void *opaque); + +BlockAIOCB *dma_blk_io(AioContext *ctx, + QEMUSGList *sg, uint64_t offset, + DMAIOFunc *io_func, void *io_func_opaque, + BlockCompletionFunc *cb, void *opaque, DMADirection dir); BlockAIOCB *dma_blk_read(BlockBackend *blk, - QEMUSGList *sg, uint64_t sector, + QEMUSGList *sg, uint64_t offset, BlockCompletionFunc *cb, void *opaque); BlockAIOCB *dma_blk_write(BlockBackend *blk, - QEMUSGList *sg, uint64_t sector, + QEMUSGList *sg, uint64_t offset, BlockCompletionFunc *cb, void *opaque); uint64_t dma_buf_read(uint8_t *ptr, int32_t len, QEMUSGList *sg); uint64_t dma_buf_write(uint8_t *ptr, int32_t len, QEMUSGList *sg); diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index f9f00e2e56..f357ccde91 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -345,6 +345,8 @@ int kvm_arch_init(MachineState *ms, KVMState *s); int kvm_arch_init_vcpu(CPUState *cpu); +bool kvm_vcpu_id_is_valid(int vcpu_id); + /* Returns VCPU ID to be used on KVM_CREATE_VCPU ioctl() */ unsigned long kvm_arch_vcpu_id(CPUState *cpu); diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index 618169c4d5..94281413d0 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -119,7 +119,7 @@ void qemu_savevm_command_send(QEMUFile *f, enum qemu_vm_cmd command, uint16_t len, uint8_t *data); void qemu_savevm_send_ping(QEMUFile *f, uint32_t value); void qemu_savevm_send_open_return_path(QEMUFile *f); -int qemu_savevm_send_packaged(QEMUFile *f, const QEMUSizedBuffer *qsb); +int qemu_savevm_send_packaged(QEMUFile *f, const uint8_t *buf, size_t len); void qemu_savevm_send_postcopy_advise(QEMUFile *f); 
void qemu_savevm_send_postcopy_listen(QEMUFile *f); void qemu_savevm_send_postcopy_run(QEMUFile *f); diff --git a/io/channel-buffer.c b/io/channel-buffer.c index 3e5117bf28..43d795976d 100644 --- a/io/channel-buffer.c +++ b/io/channel-buffer.c @@ -140,6 +140,7 @@ static int qio_channel_buffer_close(QIOChannel *ioc, QIOChannelBuffer *bioc = QIO_CHANNEL_BUFFER(ioc); g_free(bioc->data); + bioc->data = NULL; bioc->capacity = bioc->usage = bioc->offset = 0; return 0; diff --git a/kvm-all.c b/kvm-all.c index f9ae8f9bf8..e56f385278 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -1459,6 +1459,12 @@ static int kvm_max_vcpus(KVMState *s) return (ret) ? ret : kvm_recommended_vcpus(s); } +bool kvm_vcpu_id_is_valid(int vcpu_id) +{ + KVMState *s = KVM_STATE(current_machine->accelerator); + return vcpu_id >= 0 && vcpu_id < kvm_max_vcpus(s); +} + static int kvm_init(MachineState *ms) { MachineClass *mc = MACHINE_GET_CLASS(ms); diff --git a/migration/Makefile.objs b/migration/Makefile.objs index d25ff483eb..30ad945918 100644 --- a/migration/Makefile.objs +++ b/migration/Makefile.objs @@ -1,11 +1,12 @@ -common-obj-y += migration.o tcp.o +common-obj-y += migration.o socket.o fd.o exec.o +common-obj-y += tls.o common-obj-y += vmstate.o -common-obj-y += qemu-file.o qemu-file-buf.o qemu-file-unix.o qemu-file-stdio.o +common-obj-y += qemu-file.o +common-obj-y += qemu-file-channel.o common-obj-y += xbzrle.o postcopy-ram.o common-obj-y += qjson.o common-obj-$(CONFIG_RDMA) += rdma.o -common-obj-$(CONFIG_POSIX) += exec.o unix.o fd.o common-obj-y += block.o diff --git a/migration/block.c b/migration/block.c index a7a76a0fb9..e0628d187f 100644 --- a/migration/block.c +++ b/migration/block.c @@ -383,7 +383,7 @@ static void init_blk_migration(QEMUFile *f) BlockDriverState *bs; BlkMigDevState *bmds; int64_t sectors; - BdrvNextIterator *it = NULL; + BdrvNextIterator it; block_mig_state.submitted = 0; block_mig_state.read_done = 0; @@ -394,7 +394,7 @@ static void init_blk_migration(QEMUFile *f) 
block_mig_state.zero_blocks = migrate_zero_blocks(); - while ((it = bdrv_next(it, &bs))) { + for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { if (bdrv_is_read_only(bs)) { continue; } diff --git a/migration/exec.c b/migration/exec.c index 559420969b..1515cc3319 100644 --- a/migration/exec.c +++ b/migration/exec.c @@ -3,10 +3,12 @@ * * Copyright IBM, Corp. 2008 * Copyright Dell MessageOne 2008 + * Copyright Red Hat, Inc. 2015-2016 * * Authors: * Anthony Liguori <aliguori@us.ibm.com> * Charles Duffy <charles_duffy@messageone.com> + * Daniel P. Berrange <berrange@redhat.com> * * This work is licensed under the terms of the GNU GPL, version 2. See * the COPYING file in the top-level directory. @@ -18,53 +20,53 @@ #include "qemu/osdep.h" #include "qapi/error.h" #include "qemu-common.h" -#include "qemu/sockets.h" -#include "qemu/main-loop.h" #include "migration/migration.h" -#include "migration/qemu-file.h" -#include "block/block.h" -#include <sys/wait.h> +#include "io/channel-command.h" +#include "trace.h" -//#define DEBUG_MIGRATION_EXEC - -#ifdef DEBUG_MIGRATION_EXEC -#define DPRINTF(fmt, ...) \ - do { printf("migration-exec: " fmt, ## __VA_ARGS__); } while (0) -#else -#define DPRINTF(fmt, ...) 
\ - do { } while (0) -#endif void exec_start_outgoing_migration(MigrationState *s, const char *command, Error **errp) { - s->to_dst_file = qemu_popen_cmd(command, "w"); - if (s->to_dst_file == NULL) { - error_setg_errno(errp, errno, "failed to popen the migration target"); + QIOChannel *ioc; + const char *argv[] = { "/bin/sh", "-c", command, NULL }; + + trace_migration_exec_outgoing(command); + ioc = QIO_CHANNEL(qio_channel_command_new_spawn(argv, + O_WRONLY, + errp)); + if (!ioc) { return; } - migrate_fd_connect(s); + migration_set_outgoing_channel(s, ioc, NULL); + object_unref(OBJECT(ioc)); } -static void exec_accept_incoming_migration(void *opaque) +static gboolean exec_accept_incoming_migration(QIOChannel *ioc, + GIOCondition condition, + gpointer opaque) { - QEMUFile *f = opaque; - - qemu_set_fd_handler(qemu_get_fd(f), NULL, NULL, NULL); - process_incoming_migration(f); + migration_set_incoming_channel(migrate_get_current(), ioc); + object_unref(OBJECT(ioc)); + return FALSE; /* unregister */ } void exec_start_incoming_migration(const char *command, Error **errp) { - QEMUFile *f; + QIOChannel *ioc; + const char *argv[] = { "/bin/sh", "-c", command, NULL }; - DPRINTF("Attempting to start an incoming migration\n"); - f = qemu_popen_cmd(command, "r"); - if(f == NULL) { - error_setg_errno(errp, errno, "failed to popen the migration source"); + trace_migration_exec_incoming(command); + ioc = QIO_CHANNEL(qio_channel_command_new_spawn(argv, + O_RDONLY, + errp)); + if (!ioc) { return; } - qemu_set_fd_handler(qemu_get_fd(f), exec_accept_incoming_migration, NULL, - f); + qio_channel_add_watch(ioc, + G_IO_IN, + exec_accept_incoming_migration, + NULL, + NULL); } diff --git a/migration/fd.c b/migration/fd.c index 3d788bb297..fc5c9eee02 100644 --- a/migration/fd.c +++ b/migration/fd.c @@ -1,10 +1,11 @@ /* * QEMU live migration via generic fd * - * Copyright Red Hat, Inc. 2009 + * Copyright Red Hat, Inc. 
2009-2016 * * Authors: * Chris Lalancette <clalance@redhat.com> + * Daniel P. Berrange <berrange@redhat.com> * * This work is licensed under the terms of the GNU GPL, version 2. See * the COPYING file in the top-level directory. @@ -16,75 +17,57 @@ #include "qemu/osdep.h" #include "qapi/error.h" #include "qemu-common.h" -#include "qemu/main-loop.h" -#include "qemu/sockets.h" #include "migration/migration.h" #include "monitor/monitor.h" -#include "migration/qemu-file.h" -#include "block/block.h" +#include "io/channel-util.h" +#include "trace.h" -//#define DEBUG_MIGRATION_FD - -#ifdef DEBUG_MIGRATION_FD -#define DPRINTF(fmt, ...) \ - do { printf("migration-fd: " fmt, ## __VA_ARGS__); } while (0) -#else -#define DPRINTF(fmt, ...) \ - do { } while (0) -#endif - -static bool fd_is_socket(int fd) -{ - struct stat stat; - int ret = fstat(fd, &stat); - if (ret == -1) { - /* When in doubt say no */ - return false; - } - return S_ISSOCK(stat.st_mode); -} void fd_start_outgoing_migration(MigrationState *s, const char *fdname, Error **errp) { + QIOChannel *ioc; int fd = monitor_get_fd(cur_mon, fdname, errp); if (fd == -1) { return; } - if (fd_is_socket(fd)) { - s->to_dst_file = qemu_fopen_socket(fd, "wb"); - } else { - s->to_dst_file = qemu_fdopen(fd, "wb"); + trace_migration_fd_outgoing(fd); + ioc = qio_channel_new_fd(fd, errp); + if (!ioc) { + close(fd); + return; } - migrate_fd_connect(s); + migration_set_outgoing_channel(s, ioc, NULL); + object_unref(OBJECT(ioc)); } -static void fd_accept_incoming_migration(void *opaque) +static gboolean fd_accept_incoming_migration(QIOChannel *ioc, + GIOCondition condition, + gpointer opaque) { - QEMUFile *f = opaque; - - qemu_set_fd_handler(qemu_get_fd(f), NULL, NULL, NULL); - process_incoming_migration(f); + migration_set_incoming_channel(migrate_get_current(), ioc); + object_unref(OBJECT(ioc)); + return FALSE; /* unregister */ } void fd_start_incoming_migration(const char *infd, Error **errp) { + QIOChannel *ioc; int fd; - QEMUFile *f; 
- - DPRINTF("Attempting to start an incoming migration via fd\n"); fd = strtol(infd, NULL, 0); - if (fd_is_socket(fd)) { - f = qemu_fopen_socket(fd, "rb"); - } else { - f = qemu_fdopen(fd, "rb"); - } - if(f == NULL) { - error_setg_errno(errp, errno, "failed to open the source descriptor"); + trace_migration_fd_incoming(fd); + + ioc = qio_channel_new_fd(fd, errp); + if (!ioc) { + close(fd); return; } - qemu_set_fd_handler(fd, fd_accept_incoming_migration, NULL, f); + qio_channel_add_watch(ioc, + G_IO_IN, + fd_accept_incoming_migration, + NULL, + NULL); } diff --git a/migration/migration.c b/migration/migration.c index f5327e8c0a..7ecbadee6f 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -34,6 +34,8 @@ #include "qom/cpu.h" #include "exec/memory.h" #include "exec/address-spaces.h" +#include "io/channel-buffer.h" +#include "io/channel-tls.h" #define MAX_THROTTLE (32 << 20) /* Migration transfer speed throttling */ @@ -81,16 +83,13 @@ MigrationState *migrate_get_current(void) .bandwidth_limit = MAX_THROTTLE, .xbzrle_cache_size = DEFAULT_MIGRATE_CACHE_SIZE, .mbps = -1, - .parameters[MIGRATION_PARAMETER_COMPRESS_LEVEL] = - DEFAULT_MIGRATE_COMPRESS_LEVEL, - .parameters[MIGRATION_PARAMETER_COMPRESS_THREADS] = - DEFAULT_MIGRATE_COMPRESS_THREAD_COUNT, - .parameters[MIGRATION_PARAMETER_DECOMPRESS_THREADS] = - DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT, - .parameters[MIGRATION_PARAMETER_CPU_THROTTLE_INITIAL] = - DEFAULT_MIGRATE_CPU_THROTTLE_INITIAL, - .parameters[MIGRATION_PARAMETER_CPU_THROTTLE_INCREMENT] = - DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT, + .parameters = { + .compress_level = DEFAULT_MIGRATE_COMPRESS_LEVEL, + .compress_threads = DEFAULT_MIGRATE_COMPRESS_THREAD_COUNT, + .decompress_threads = DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT, + .cpu_throttle_initial = DEFAULT_MIGRATE_CPU_THROTTLE_INITIAL, + .cpu_throttle_increment = DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT, + }, }; if (!once) { @@ -310,14 +309,12 @@ void qemu_start_incoming_migration(const char 
*uri, Error **errp) } else if (strstart(uri, "rdma:", &p)) { rdma_start_incoming_migration(p, errp); #endif -#if !defined(WIN32) } else if (strstart(uri, "exec:", &p)) { exec_start_incoming_migration(p, errp); } else if (strstart(uri, "unix:", &p)) { unix_start_incoming_migration(p, errp); } else if (strstart(uri, "fd:", &p)) { fd_start_incoming_migration(p, errp); -#endif } else { error_setg(errp, "unknown migration protocol: %s", uri); } @@ -422,14 +419,60 @@ static void process_incoming_migration_co(void *opaque) void process_incoming_migration(QEMUFile *f) { Coroutine *co = qemu_coroutine_create(process_incoming_migration_co); - int fd = qemu_get_fd(f); - assert(fd != -1); migrate_decompress_threads_create(); - qemu_set_nonblock(fd); + qemu_file_set_blocking(f, false); qemu_coroutine_enter(co, f); } + +void migration_set_incoming_channel(MigrationState *s, + QIOChannel *ioc) +{ + trace_migration_set_incoming_channel( + ioc, object_get_typename(OBJECT(ioc))); + + if (s->parameters.tls_creds && + !object_dynamic_cast(OBJECT(ioc), + TYPE_QIO_CHANNEL_TLS)) { + Error *local_err = NULL; + migration_tls_set_incoming_channel(s, ioc, &local_err); + if (local_err) { + error_report_err(local_err); + } + } else { + QEMUFile *f = qemu_fopen_channel_input(ioc); + process_incoming_migration(f); + } +} + + +void migration_set_outgoing_channel(MigrationState *s, + QIOChannel *ioc, + const char *hostname) +{ + trace_migration_set_outgoing_channel( + ioc, object_get_typename(OBJECT(ioc)), hostname); + + if (s->parameters.tls_creds && + !object_dynamic_cast(OBJECT(ioc), + TYPE_QIO_CHANNEL_TLS)) { + Error *local_err = NULL; + migration_tls_set_outgoing_channel(s, ioc, hostname, &local_err); + if (local_err) { + migrate_fd_error(s, local_err); + error_free(local_err); + } + } else { + QEMUFile *f = qemu_fopen_channel_output(ioc); + + s->to_dst_file = f; + + migrate_fd_connect(s); + } +} + + /* * Send a message on the return channel back to the source * of the migration. 
@@ -516,15 +559,13 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp) MigrationState *s = migrate_get_current(); params = g_malloc0(sizeof(*params)); - params->compress_level = s->parameters[MIGRATION_PARAMETER_COMPRESS_LEVEL]; - params->compress_threads = - s->parameters[MIGRATION_PARAMETER_COMPRESS_THREADS]; - params->decompress_threads = - s->parameters[MIGRATION_PARAMETER_DECOMPRESS_THREADS]; - params->cpu_throttle_initial = - s->parameters[MIGRATION_PARAMETER_CPU_THROTTLE_INITIAL]; - params->cpu_throttle_increment = - s->parameters[MIGRATION_PARAMETER_CPU_THROTTLE_INCREMENT]; + params->compress_level = s->parameters.compress_level; + params->compress_threads = s->parameters.compress_threads; + params->decompress_threads = s->parameters.decompress_threads; + params->cpu_throttle_initial = s->parameters.cpu_throttle_initial; + params->cpu_throttle_increment = s->parameters.cpu_throttle_increment; + params->tls_creds = g_strdup(s->parameters.tls_creds); + params->tls_hostname = g_strdup(s->parameters.tls_hostname); return params; } @@ -672,6 +713,10 @@ MigrationInfo *qmp_query_migrate(Error **errp) break; case MIGRATION_STATUS_FAILED: info->has_status = true; + if (s->error) { + info->has_error_desc = true; + info->error_desc = g_strdup(error_get_pretty(s->error)); + } break; case MIGRATION_STATUS_CANCELLED: info->has_status = true; @@ -721,7 +766,12 @@ void qmp_migrate_set_parameters(bool has_compress_level, bool has_cpu_throttle_initial, int64_t cpu_throttle_initial, bool has_cpu_throttle_increment, - int64_t cpu_throttle_increment, Error **errp) + int64_t cpu_throttle_increment, + bool has_tls_creds, + const char *tls_creds, + bool has_tls_hostname, + const char *tls_hostname, + Error **errp) { MigrationState *s = migrate_get_current(); @@ -758,26 +808,31 @@ void qmp_migrate_set_parameters(bool has_compress_level, } if (has_compress_level) { - s->parameters[MIGRATION_PARAMETER_COMPRESS_LEVEL] = compress_level; + s->parameters.compress_level = 
compress_level; } if (has_compress_threads) { - s->parameters[MIGRATION_PARAMETER_COMPRESS_THREADS] = compress_threads; + s->parameters.compress_threads = compress_threads; } if (has_decompress_threads) { - s->parameters[MIGRATION_PARAMETER_DECOMPRESS_THREADS] = - decompress_threads; + s->parameters.decompress_threads = decompress_threads; } if (has_cpu_throttle_initial) { - s->parameters[MIGRATION_PARAMETER_CPU_THROTTLE_INITIAL] = - cpu_throttle_initial; + s->parameters.cpu_throttle_initial = cpu_throttle_initial; } - if (has_cpu_throttle_increment) { - s->parameters[MIGRATION_PARAMETER_CPU_THROTTLE_INCREMENT] = - cpu_throttle_increment; + s->parameters.cpu_throttle_increment = cpu_throttle_increment; + } + if (has_tls_creds) { + g_free(s->parameters.tls_creds); + s->parameters.tls_creds = g_strdup(tls_creds); + } + if (has_tls_hostname) { + g_free(s->parameters.tls_hostname); + s->parameters.tls_hostname = g_strdup(tls_hostname); } } + void qmp_migrate_start_postcopy(Error **errp) { MigrationState *s = migrate_get_current(); @@ -844,12 +899,15 @@ static void migrate_fd_cleanup(void *opaque) notifier_list_notify(&migration_state_notifiers, s); } -void migrate_fd_error(MigrationState *s) +void migrate_fd_error(MigrationState *s, const Error *error) { - trace_migrate_fd_error(); + trace_migrate_fd_error(error ? 
error_get_pretty(error) : ""); assert(s->to_dst_file == NULL); migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, MIGRATION_STATUS_FAILED); + if (!s->error) { + s->error = error_copy(error); + } notifier_list_notify(&migration_state_notifiers, s); } @@ -948,6 +1006,8 @@ MigrationState *migrate_init(const MigrationParams *params) s->postcopy_after_devices = false; s->migration_thread_running = false; s->last_req_rb = NULL; + error_free(s->error); + s->error = NULL; migrate_set_state(&s->state, MIGRATION_STATUS_NONE, MIGRATION_STATUS_SETUP); @@ -1040,14 +1100,12 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk, } else if (strstart(uri, "rdma:", &p)) { rdma_start_outgoing_migration(s, p, &local_err); #endif -#if !defined(WIN32) } else if (strstart(uri, "exec:", &p)) { exec_start_outgoing_migration(s, p, &local_err); } else if (strstart(uri, "unix:", &p)) { unix_start_outgoing_migration(s, p, &local_err); } else if (strstart(uri, "fd:", &p)) { fd_start_outgoing_migration(s, p, &local_err); -#endif } else { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "uri", "a valid migration protocol"); @@ -1057,7 +1115,7 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk, } if (local_err) { - migrate_fd_error(s); + migrate_fd_error(s, local_err); error_propagate(errp, local_err); return; } @@ -1170,7 +1228,7 @@ int migrate_compress_level(void) s = migrate_get_current(); - return s->parameters[MIGRATION_PARAMETER_COMPRESS_LEVEL]; + return s->parameters.compress_level; } int migrate_compress_threads(void) @@ -1179,7 +1237,7 @@ int migrate_compress_threads(void) s = migrate_get_current(); - return s->parameters[MIGRATION_PARAMETER_COMPRESS_THREADS]; + return s->parameters.compress_threads; } int migrate_decompress_threads(void) @@ -1188,7 +1246,7 @@ int migrate_decompress_threads(void) s = migrate_get_current(); - return s->parameters[MIGRATION_PARAMETER_DECOMPRESS_THREADS]; + return s->parameters.decompress_threads; } bool migrate_use_events(void) @@ -1429,7 
+1487,8 @@ static int await_return_path_close_on_source(MigrationState *ms) static int postcopy_start(MigrationState *ms, bool *old_vm_running) { int ret; - const QEMUSizedBuffer *qsb; + QIOChannelBuffer *bioc; + QEMUFile *fb; int64_t time_at_stop = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); migrate_set_state(&ms->state, MIGRATION_STATUS_ACTIVE, MIGRATION_STATUS_POSTCOPY_ACTIVE); @@ -1488,11 +1547,9 @@ static int postcopy_start(MigrationState *ms, bool *old_vm_running) * So we wrap the device state up in a package with a length at the start; * to do this we use a qemu_buf to hold the whole of the device state. */ - QEMUFile *fb = qemu_bufopen("w", NULL); - if (!fb) { - error_report("Failed to create buffered file"); - goto fail; - } + bioc = qio_channel_buffer_new(4096); + fb = qemu_fopen_channel_output(QIO_CHANNEL(bioc)); + object_unref(OBJECT(bioc)); /* * Make sure the receiver can get incoming pages before we send the rest @@ -1506,10 +1563,9 @@ static int postcopy_start(MigrationState *ms, bool *old_vm_running) qemu_savevm_send_postcopy_run(fb); /* <><> end of stuff going into the package */ - qsb = qemu_buf_get(fb); /* Now send that blob */ - if (qemu_savevm_send_packaged(ms->to_dst_file, qsb)) { + if (qemu_savevm_send_packaged(ms->to_dst_file, bioc->data, bioc->usage)) { goto fail_closefb; } qemu_fclose(fb); @@ -1793,6 +1849,7 @@ void migrate_fd_connect(MigrationState *s) s->expected_downtime = max_downtime/1000000; s->cleanup_bh = qemu_bh_new(migrate_fd_cleanup, s); + qemu_file_set_blocking(s->to_dst_file, true); qemu_file_set_rate_limit(s->to_dst_file, s->bandwidth_limit / XFER_LIMIT_RATIO); diff --git a/migration/qemu-file-buf.c b/migration/qemu-file-buf.c deleted file mode 100644 index 7b8e78e99c..0000000000 --- a/migration/qemu-file-buf.c +++ /dev/null @@ -1,464 +0,0 @@ -/* - * QEMU System Emulator - * - * Copyright (c) 2003-2008 Fabrice Bellard - * Copyright (c) 2014 IBM Corp. 
- * - * Authors: - * Stefan Berger <stefanb@linux.vnet.ibm.com> - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ -#include "qemu/osdep.h" -#include "qemu-common.h" -#include "qemu/error-report.h" -#include "qemu/iov.h" -#include "qemu/sockets.h" -#include "qemu/coroutine.h" -#include "migration/migration.h" -#include "migration/qemu-file.h" -#include "migration/qemu-file-internal.h" -#include "trace.h" - -#define QSB_CHUNK_SIZE (1 << 10) -#define QSB_MAX_CHUNK_SIZE (16 * QSB_CHUNK_SIZE) - -/** - * Create a QEMUSizedBuffer - * This type of buffer uses scatter-gather lists internally and - * can grow to any size. Any data array in the scatter-gather list - * can hold different amount of bytes. 
- * - * @buffer: Optional buffer to copy into the QSB - * @len: size of initial buffer; if @buffer is given, buffer must - * hold at least len bytes - * - * Returns a pointer to a QEMUSizedBuffer or NULL on allocation failure - */ -QEMUSizedBuffer *qsb_create(const uint8_t *buffer, size_t len) -{ - QEMUSizedBuffer *qsb; - size_t alloc_len, num_chunks, i, to_copy; - size_t chunk_size = (len > QSB_MAX_CHUNK_SIZE) - ? QSB_MAX_CHUNK_SIZE - : QSB_CHUNK_SIZE; - - num_chunks = DIV_ROUND_UP(len ? len : QSB_CHUNK_SIZE, chunk_size); - alloc_len = num_chunks * chunk_size; - - qsb = g_try_new0(QEMUSizedBuffer, 1); - if (!qsb) { - return NULL; - } - - qsb->iov = g_try_new0(struct iovec, num_chunks); - if (!qsb->iov) { - g_free(qsb); - return NULL; - } - - qsb->n_iov = num_chunks; - - for (i = 0; i < num_chunks; i++) { - qsb->iov[i].iov_base = g_try_malloc0(chunk_size); - if (!qsb->iov[i].iov_base) { - /* qsb_free is safe since g_free can cope with NULL */ - qsb_free(qsb); - return NULL; - } - - qsb->iov[i].iov_len = chunk_size; - if (buffer) { - to_copy = (len - qsb->used) > chunk_size - ? chunk_size : (len - qsb->used); - memcpy(qsb->iov[i].iov_base, &buffer[qsb->used], to_copy); - qsb->used += to_copy; - } - } - - qsb->size = alloc_len; - - return qsb; -} - -/** - * Free the QEMUSizedBuffer - * - * @qsb: The QEMUSizedBuffer to free - */ -void qsb_free(QEMUSizedBuffer *qsb) -{ - size_t i; - - if (!qsb) { - return; - } - - for (i = 0; i < qsb->n_iov; i++) { - g_free(qsb->iov[i].iov_base); - } - g_free(qsb->iov); - g_free(qsb); -} - -/** - * Get the number of used bytes in the QEMUSizedBuffer - * - * @qsb: A QEMUSizedBuffer - * - * Returns the number of bytes currently used in this buffer - */ -size_t qsb_get_length(const QEMUSizedBuffer *qsb) -{ - return qsb->used; -} - -/** - * Set the length of the buffer; the primary usage of this - * function is to truncate the number of used bytes in the buffer. 
- * The size will not be extended beyond the current number of - * allocated bytes in the QEMUSizedBuffer. - * - * @qsb: A QEMUSizedBuffer - * @new_len: The new length of bytes in the buffer - * - * Returns the number of bytes the buffer was truncated or extended - * to. - */ -size_t qsb_set_length(QEMUSizedBuffer *qsb, size_t new_len) -{ - if (new_len <= qsb->size) { - qsb->used = new_len; - } else { - qsb->used = qsb->size; - } - return qsb->used; -} - -/** - * Get the iovec that holds the data for a given position @pos. - * - * @qsb: A QEMUSizedBuffer - * @pos: The index of a byte in the buffer - * @d_off: Pointer to an offset that this function will indicate - * at what position within the returned iovec the byte - * is to be found - * - * Returns the index of the iovec that holds the byte at the given - * index @pos in the byte stream; a negative number if the iovec - * for the given position @pos does not exist. - */ -static ssize_t qsb_get_iovec(const QEMUSizedBuffer *qsb, - off_t pos, off_t *d_off) -{ - ssize_t i; - off_t curr = 0; - - if (pos > qsb->used) { - return -1; - } - - for (i = 0; i < qsb->n_iov; i++) { - if (curr + qsb->iov[i].iov_len > pos) { - *d_off = pos - curr; - return i; - } - curr += qsb->iov[i].iov_len; - } - return -1; -} - -/* - * Convert the QEMUSizedBuffer into a flat buffer. - * - * Note: If at all possible, try to avoid this function since it - * may unnecessarily copy memory around. 
- * - * @qsb: pointer to QEMUSizedBuffer - * @start: offset to start at - * @count: number of bytes to copy - * @buf: a pointer to a buffer to write into (at least @count bytes) - * - * Returns the number of bytes copied into the output buffer - */ -ssize_t qsb_get_buffer(const QEMUSizedBuffer *qsb, off_t start, - size_t count, uint8_t *buffer) -{ - const struct iovec *iov; - size_t to_copy, all_copy; - ssize_t index; - off_t s_off; - off_t d_off = 0; - char *s; - - if (start > qsb->used) { - return 0; - } - - all_copy = qsb->used - start; - if (all_copy > count) { - all_copy = count; - } else { - count = all_copy; - } - - index = qsb_get_iovec(qsb, start, &s_off); - if (index < 0) { - return 0; - } - - while (all_copy > 0) { - iov = &qsb->iov[index]; - - s = iov->iov_base; - - to_copy = iov->iov_len - s_off; - if (to_copy > all_copy) { - to_copy = all_copy; - } - memcpy(&buffer[d_off], &s[s_off], to_copy); - - d_off += to_copy; - all_copy -= to_copy; - - s_off = 0; - index++; - } - - return count; -} - -/** - * Grow the QEMUSizedBuffer to the given size and allocate - * memory for it. - * - * @qsb: A QEMUSizedBuffer - * @new_size: The new size of the buffer - * - * Return: - * a negative error code in case of memory allocation failure - * or - * the new size of the buffer. The returned size may be greater or equal - * to @new_size. - */ -static ssize_t qsb_grow(QEMUSizedBuffer *qsb, size_t new_size) -{ - size_t needed_chunks, i; - - if (qsb->size < new_size) { - struct iovec *new_iov; - size_t size_diff = new_size - qsb->size; - size_t chunk_size = (size_diff > QSB_MAX_CHUNK_SIZE) - ? 
QSB_MAX_CHUNK_SIZE : QSB_CHUNK_SIZE; - - needed_chunks = DIV_ROUND_UP(size_diff, chunk_size); - - new_iov = g_try_new(struct iovec, qsb->n_iov + needed_chunks); - if (new_iov == NULL) { - return -ENOMEM; - } - - /* Allocate new chunks as needed into new_iov */ - for (i = qsb->n_iov; i < qsb->n_iov + needed_chunks; i++) { - new_iov[i].iov_base = g_try_malloc0(chunk_size); - new_iov[i].iov_len = chunk_size; - if (!new_iov[i].iov_base) { - size_t j; - - /* Free previously allocated new chunks */ - for (j = qsb->n_iov; j < i; j++) { - g_free(new_iov[j].iov_base); - } - g_free(new_iov); - - return -ENOMEM; - } - } - - /* - * Now we can't get any allocation errors, copy over to new iov - * and switch. - */ - for (i = 0; i < qsb->n_iov; i++) { - new_iov[i] = qsb->iov[i]; - } - - qsb->n_iov += needed_chunks; - g_free(qsb->iov); - qsb->iov = new_iov; - qsb->size += (needed_chunks * chunk_size); - } - - return qsb->size; -} - -/** - * Write into the QEMUSizedBuffer at a given position and a given - * number of bytes. This function will automatically grow the - * QEMUSizedBuffer. 
- * - * @qsb: A QEMUSizedBuffer - * @source: A byte array to copy data from - * @pos: The position within the @qsb to write data to - * @size: The number of bytes to copy into the @qsb - * - * Returns @size or a negative error code in case of memory allocation failure, - * or with an invalid 'pos' - */ -ssize_t qsb_write_at(QEMUSizedBuffer *qsb, const uint8_t *source, - off_t pos, size_t count) -{ - ssize_t rc = qsb_grow(qsb, pos + count); - size_t to_copy; - size_t all_copy = count; - const struct iovec *iov; - ssize_t index; - char *dest; - off_t d_off, s_off = 0; - - if (rc < 0) { - return rc; - } - - if (pos + count > qsb->used) { - qsb->used = pos + count; - } - - index = qsb_get_iovec(qsb, pos, &d_off); - if (index < 0) { - return -EINVAL; - } - - while (all_copy > 0) { - iov = &qsb->iov[index]; - - dest = iov->iov_base; - - to_copy = iov->iov_len - d_off; - if (to_copy > all_copy) { - to_copy = all_copy; - } - - memcpy(&dest[d_off], &source[s_off], to_copy); - - s_off += to_copy; - all_copy -= to_copy; - - d_off = 0; - index++; - } - - return count; -} - -typedef struct QEMUBuffer { - QEMUSizedBuffer *qsb; - QEMUFile *file; - bool qsb_allocated; -} QEMUBuffer; - -static ssize_t buf_get_buffer(void *opaque, uint8_t *buf, int64_t pos, - size_t size) -{ - QEMUBuffer *s = opaque; - ssize_t len = qsb_get_length(s->qsb) - pos; - - if (len <= 0) { - return 0; - } - - if (len > size) { - len = size; - } - return qsb_get_buffer(s->qsb, pos, len, buf); -} - -static ssize_t buf_put_buffer(void *opaque, const uint8_t *buf, - int64_t pos, size_t size) -{ - QEMUBuffer *s = opaque; - - return qsb_write_at(s->qsb, buf, pos, size); -} - -static int buf_close(void *opaque) -{ - QEMUBuffer *s = opaque; - - if (s->qsb_allocated) { - qsb_free(s->qsb); - } - - g_free(s); - - return 0; -} - -const QEMUSizedBuffer *qemu_buf_get(QEMUFile *f) -{ - QEMUBuffer *p; - - qemu_fflush(f); - - p = f->opaque; - - return p->qsb; -} - -static const QEMUFileOps buf_read_ops = { - .get_buffer = 
buf_get_buffer, - .close = buf_close, -}; - -static const QEMUFileOps buf_write_ops = { - .put_buffer = buf_put_buffer, - .close = buf_close, -}; - -QEMUFile *qemu_bufopen(const char *mode, QEMUSizedBuffer *input) -{ - QEMUBuffer *s; - - if (mode == NULL || (mode[0] != 'r' && mode[0] != 'w') || - mode[1] != '\0') { - error_report("qemu_bufopen: Argument validity check failed"); - return NULL; - } - - s = g_new0(QEMUBuffer, 1); - s->qsb = input; - - if (s->qsb == NULL) { - s->qsb = qsb_create(NULL, 0); - s->qsb_allocated = true; - } - if (!s->qsb) { - g_free(s); - error_report("qemu_bufopen: qsb_create failed"); - return NULL; - } - - - if (mode[0] == 'r') { - s->file = qemu_fopen_ops(s, &buf_read_ops); - } else { - s->file = qemu_fopen_ops(s, &buf_write_ops); - } - return s->file; -} diff --git a/migration/qemu-file-channel.c b/migration/qemu-file-channel.c new file mode 100644 index 0000000000..45c13f1028 --- /dev/null +++ b/migration/qemu-file-channel.c @@ -0,0 +1,180 @@ +/* + * QEMUFile backend for QIOChannel objects + * + * Copyright (c) 2015-2016 Red Hat, Inc + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu/osdep.h" +#include "migration/qemu-file.h" +#include "io/channel-socket.h" +#include "qemu/iov.h" + + +static ssize_t channel_writev_buffer(void *opaque, + struct iovec *iov, + int iovcnt, + int64_t pos) +{ + QIOChannel *ioc = QIO_CHANNEL(opaque); + ssize_t done = 0; + struct iovec *local_iov = g_new(struct iovec, iovcnt); + struct iovec *local_iov_head = local_iov; + unsigned int nlocal_iov = iovcnt; + + nlocal_iov = iov_copy(local_iov, nlocal_iov, + iov, iovcnt, + 0, iov_size(iov, iovcnt)); + + while (nlocal_iov > 0) { + ssize_t len; + len = qio_channel_writev(ioc, local_iov, nlocal_iov, NULL); + if (len == QIO_CHANNEL_ERR_BLOCK) { + qio_channel_wait(ioc, G_IO_OUT); + continue; + } + if (len < 0) { + /* XXX handle Error objects */ + done = -EIO; + goto cleanup; + } + + iov_discard_front(&local_iov, &nlocal_iov, len); + done += len; + } + + cleanup: + g_free(local_iov_head); + return done; +} + + +static ssize_t channel_get_buffer(void *opaque, + uint8_t *buf, + int64_t pos, + size_t size) +{ + QIOChannel *ioc = QIO_CHANNEL(opaque); + ssize_t ret; + + do { + ret = qio_channel_read(ioc, (char *)buf, size, NULL); + if (ret < 0) { + if (ret == QIO_CHANNEL_ERR_BLOCK) { + qio_channel_yield(ioc, G_IO_IN); + } else { + /* XXX handle Error * object */ + return -EIO; + } + } + } while (ret == QIO_CHANNEL_ERR_BLOCK); + + return ret; +} + + +static int channel_close(void *opaque) +{ + QIOChannel *ioc = QIO_CHANNEL(opaque); + qio_channel_close(ioc, NULL); + object_unref(OBJECT(ioc)); + return 0; +} + + +static int channel_shutdown(void *opaque, + bool rd, + bool wr) +{ + QIOChannel *ioc = QIO_CHANNEL(opaque); + + if (qio_channel_has_feature(ioc, + QIO_CHANNEL_FEATURE_SHUTDOWN)) { + 
QIOChannelShutdown mode; + if (rd && wr) { + mode = QIO_CHANNEL_SHUTDOWN_BOTH; + } else if (rd) { + mode = QIO_CHANNEL_SHUTDOWN_READ; + } else { + mode = QIO_CHANNEL_SHUTDOWN_WRITE; + } + if (qio_channel_shutdown(ioc, mode, NULL) < 0) { + /* XXX handler Error * object */ + return -EIO; + } + } + return 0; +} + + +static int channel_set_blocking(void *opaque, + bool enabled) +{ + QIOChannel *ioc = QIO_CHANNEL(opaque); + + if (qio_channel_set_blocking(ioc, enabled, NULL) < 0) { + return -1; + } + return 0; +} + +static QEMUFile *channel_get_input_return_path(void *opaque) +{ + QIOChannel *ioc = QIO_CHANNEL(opaque); + + return qemu_fopen_channel_output(ioc); +} + +static QEMUFile *channel_get_output_return_path(void *opaque) +{ + QIOChannel *ioc = QIO_CHANNEL(opaque); + + return qemu_fopen_channel_input(ioc); +} + +static const QEMUFileOps channel_input_ops = { + .get_buffer = channel_get_buffer, + .close = channel_close, + .shut_down = channel_shutdown, + .set_blocking = channel_set_blocking, + .get_return_path = channel_get_input_return_path, +}; + + +static const QEMUFileOps channel_output_ops = { + .writev_buffer = channel_writev_buffer, + .close = channel_close, + .shut_down = channel_shutdown, + .set_blocking = channel_set_blocking, + .get_return_path = channel_get_output_return_path, +}; + + +QEMUFile *qemu_fopen_channel_input(QIOChannel *ioc) +{ + object_ref(OBJECT(ioc)); + return qemu_fopen_ops(ioc, &channel_input_ops); +} + +QEMUFile *qemu_fopen_channel_output(QIOChannel *ioc) +{ + object_ref(OBJECT(ioc)); + return qemu_fopen_ops(ioc, &channel_output_ops); +} diff --git a/migration/qemu-file-internal.h b/migration/qemu-file-internal.h deleted file mode 100644 index d95e8538e7..0000000000 --- a/migration/qemu-file-internal.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * QEMU System Emulator - * - * Copyright (c) 2003-2008 Fabrice Bellard - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated 
documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ - -#ifndef QEMU_FILE_INTERNAL_H -#define QEMU_FILE_INTERNAL_H 1 - -#include "qemu-common.h" -#include "qemu/iov.h" - -#define IO_BUF_SIZE 32768 -#define MAX_IOV_SIZE MIN(IOV_MAX, 64) - -struct QEMUFile { - const QEMUFileOps *ops; - void *opaque; - - int64_t bytes_xfer; - int64_t xfer_limit; - - int64_t pos; /* start of buffer when writing, end of buffer - when reading */ - int buf_index; - int buf_size; /* 0 when writing */ - uint8_t buf[IO_BUF_SIZE]; - - struct iovec iov[MAX_IOV_SIZE]; - unsigned int iovcnt; - - int last_error; -}; - -#endif diff --git a/migration/qemu-file-stdio.c b/migration/qemu-file-stdio.c deleted file mode 100644 index f402e8f708..0000000000 --- a/migration/qemu-file-stdio.c +++ /dev/null @@ -1,196 +0,0 @@ -/* - * QEMU System Emulator - * - * Copyright (c) 2003-2008 Fabrice Bellard - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without 
restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ -#include "qemu/osdep.h" -#include "qemu-common.h" -#include "qemu/coroutine.h" -#include "migration/qemu-file.h" - -typedef struct QEMUFileStdio { - FILE *stdio_file; - QEMUFile *file; -} QEMUFileStdio; - -static int stdio_get_fd(void *opaque) -{ - QEMUFileStdio *s = opaque; - - return fileno(s->stdio_file); -} - -static ssize_t stdio_put_buffer(void *opaque, const uint8_t *buf, int64_t pos, - size_t size) -{ - QEMUFileStdio *s = opaque; - size_t res; - - res = fwrite(buf, 1, size, s->stdio_file); - - if (res != size) { - return -errno; - } - return res; -} - -static ssize_t stdio_get_buffer(void *opaque, uint8_t *buf, int64_t pos, - size_t size) -{ - QEMUFileStdio *s = opaque; - FILE *fp = s->stdio_file; - ssize_t bytes; - - for (;;) { - clearerr(fp); - bytes = fread(buf, 1, size, fp); - if (bytes != 0 || !ferror(fp)) { - break; - } - if (errno == EAGAIN) { - yield_until_fd_readable(fileno(fp)); - } else if (errno != EINTR) { - break; - } - } - return bytes; -} - -static int stdio_pclose(void *opaque) -{ - QEMUFileStdio *s = opaque; - int ret; - ret = pclose(s->stdio_file); - 
if (ret == -1) { - ret = -errno; - } else if (!WIFEXITED(ret) || WEXITSTATUS(ret) != 0) { - /* close succeeded, but non-zero exit code: */ - ret = -EIO; /* fake errno value */ - } - g_free(s); - return ret; -} - -static int stdio_fclose(void *opaque) -{ - QEMUFileStdio *s = opaque; - int ret = 0; - - if (qemu_file_is_writable(s->file)) { - int fd = fileno(s->stdio_file); - struct stat st; - - ret = fstat(fd, &st); - if (ret == 0 && S_ISREG(st.st_mode)) { - /* - * If the file handle is a regular file make sure the - * data is flushed to disk before signaling success. - */ - ret = fsync(fd); - if (ret != 0) { - ret = -errno; - return ret; - } - } - } - if (fclose(s->stdio_file) == EOF) { - ret = -errno; - } - g_free(s); - return ret; -} - -static const QEMUFileOps stdio_pipe_read_ops = { - .get_fd = stdio_get_fd, - .get_buffer = stdio_get_buffer, - .close = stdio_pclose -}; - -static const QEMUFileOps stdio_pipe_write_ops = { - .get_fd = stdio_get_fd, - .put_buffer = stdio_put_buffer, - .close = stdio_pclose -}; - -QEMUFile *qemu_popen_cmd(const char *command, const char *mode) -{ - FILE *stdio_file; - QEMUFileStdio *s; - - if (mode == NULL || (mode[0] != 'r' && mode[0] != 'w') || mode[1] != 0) { - fprintf(stderr, "qemu_popen: Argument validity check failed\n"); - return NULL; - } - - stdio_file = popen(command, mode); - if (stdio_file == NULL) { - return NULL; - } - - s = g_new0(QEMUFileStdio, 1); - - s->stdio_file = stdio_file; - - if (mode[0] == 'r') { - s->file = qemu_fopen_ops(s, &stdio_pipe_read_ops); - } else { - s->file = qemu_fopen_ops(s, &stdio_pipe_write_ops); - } - return s->file; -} - -static const QEMUFileOps stdio_file_read_ops = { - .get_fd = stdio_get_fd, - .get_buffer = stdio_get_buffer, - .close = stdio_fclose -}; - -static const QEMUFileOps stdio_file_write_ops = { - .get_fd = stdio_get_fd, - .put_buffer = stdio_put_buffer, - .close = stdio_fclose -}; - -QEMUFile *qemu_fopen(const char *filename, const char *mode) -{ - QEMUFileStdio *s; - - if 
(qemu_file_mode_is_not_valid(mode)) { - return NULL; - } - - s = g_new0(QEMUFileStdio, 1); - - s->stdio_file = fopen(filename, mode); - if (!s->stdio_file) { - goto fail; - } - - if (mode[0] == 'w') { - s->file = qemu_fopen_ops(s, &stdio_file_write_ops); - } else { - s->file = qemu_fopen_ops(s, &stdio_file_read_ops); - } - return s->file; -fail: - g_free(s); - return NULL; -} diff --git a/migration/qemu-file-unix.c b/migration/qemu-file-unix.c deleted file mode 100644 index 4474e18ff8..0000000000 --- a/migration/qemu-file-unix.c +++ /dev/null @@ -1,323 +0,0 @@ -/* - * QEMU System Emulator - * - * Copyright (c) 2003-2008 Fabrice Bellard - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- */ -#include "qemu/osdep.h" -#include "qemu-common.h" -#include "qemu/error-report.h" -#include "qemu/iov.h" -#include "qemu/sockets.h" -#include "qemu/coroutine.h" -#include "migration/qemu-file.h" -#include "migration/qemu-file-internal.h" - -typedef struct QEMUFileSocket { - int fd; - QEMUFile *file; -} QEMUFileSocket; - -static ssize_t socket_writev_buffer(void *opaque, struct iovec *iov, int iovcnt, - int64_t pos) -{ - QEMUFileSocket *s = opaque; - ssize_t len; - ssize_t size = iov_size(iov, iovcnt); - ssize_t offset = 0; - int err; - - while (size > 0) { - len = iov_send(s->fd, iov, iovcnt, offset, size); - - if (len > 0) { - size -= len; - offset += len; - } - - if (size > 0) { - if (errno != EAGAIN && errno != EWOULDBLOCK) { - error_report("socket_writev_buffer: Got err=%d for (%zu/%zu)", - errno, (size_t)size, (size_t)len); - /* - * If I've already sent some but only just got the error, I - * could return the amount validly sent so far and wait for the - * next call to report the error, but I'd rather flag the error - * immediately. 
- */ - return -errno; - } - - /* Emulate blocking */ - GPollFD pfd; - - pfd.fd = s->fd; - pfd.events = G_IO_OUT | G_IO_ERR; - pfd.revents = 0; - TFR(err = g_poll(&pfd, 1, -1 /* no timeout */)); - /* Errors other than EINTR intentionally ignored */ - } - } - - return offset; -} - -static int socket_get_fd(void *opaque) -{ - QEMUFileSocket *s = opaque; - - return s->fd; -} - -static ssize_t socket_get_buffer(void *opaque, uint8_t *buf, int64_t pos, - size_t size) -{ - QEMUFileSocket *s = opaque; - ssize_t len; - - for (;;) { - len = qemu_recv(s->fd, buf, size, 0); - if (len != -1) { - break; - } - if (errno == EAGAIN) { - yield_until_fd_readable(s->fd); - } else if (errno != EINTR) { - break; - } - } - - if (len == -1) { - len = -errno; - } - return len; -} - -static int socket_close(void *opaque) -{ - QEMUFileSocket *s = opaque; - closesocket(s->fd); - g_free(s); - return 0; -} - -static int socket_shutdown(void *opaque, bool rd, bool wr) -{ - QEMUFileSocket *s = opaque; - - if (shutdown(s->fd, rd ? (wr ? SHUT_RDWR : SHUT_RD) : SHUT_WR)) { - return -errno; - } else { - return 0; - } -} - -static int socket_return_close(void *opaque) -{ - QEMUFileSocket *s = opaque; - /* - * Note: We don't close the socket, that should be done by the forward - * path. - */ - g_free(s); - return 0; -} - -static const QEMUFileOps socket_return_read_ops = { - .get_fd = socket_get_fd, - .get_buffer = socket_get_buffer, - .close = socket_return_close, - .shut_down = socket_shutdown, -}; - -static const QEMUFileOps socket_return_write_ops = { - .get_fd = socket_get_fd, - .writev_buffer = socket_writev_buffer, - .close = socket_return_close, - .shut_down = socket_shutdown, -}; - -/* - * Give a QEMUFile* off the same socket but data in the opposite - * direction. 
- */ -static QEMUFile *socket_get_return_path(void *opaque) -{ - QEMUFileSocket *forward = opaque; - QEMUFileSocket *reverse; - - if (qemu_file_get_error(forward->file)) { - /* If the forward file is in error, don't try and open a return */ - return NULL; - } - - reverse = g_malloc0(sizeof(QEMUFileSocket)); - reverse->fd = forward->fd; - /* I don't think there's a better way to tell which direction 'this' is */ - if (forward->file->ops->get_buffer != NULL) { - /* being called from the read side, so we need to be able to write */ - return qemu_fopen_ops(reverse, &socket_return_write_ops); - } else { - return qemu_fopen_ops(reverse, &socket_return_read_ops); - } -} - -static ssize_t unix_writev_buffer(void *opaque, struct iovec *iov, int iovcnt, - int64_t pos) -{ - QEMUFileSocket *s = opaque; - ssize_t len, offset; - ssize_t size = iov_size(iov, iovcnt); - ssize_t total = 0; - - assert(iovcnt > 0); - offset = 0; - while (size > 0) { - /* Find the next start position; skip all full-sized vector elements */ - while (offset >= iov[0].iov_len) { - offset -= iov[0].iov_len; - iov++, iovcnt--; - } - - /* skip `offset' bytes from the (now) first element, undo it on exit */ - assert(iovcnt > 0); - iov[0].iov_base += offset; - iov[0].iov_len -= offset; - - do { - len = writev(s->fd, iov, iovcnt); - } while (len == -1 && errno == EINTR); - if (len == -1) { - return -errno; - } - - /* Undo the changes above */ - iov[0].iov_base -= offset; - iov[0].iov_len += offset; - - /* Prepare for the next iteration */ - offset += len; - total += len; - size -= len; - } - - return total; -} - -static ssize_t unix_get_buffer(void *opaque, uint8_t *buf, int64_t pos, - size_t size) -{ - QEMUFileSocket *s = opaque; - ssize_t len; - - for (;;) { - len = read(s->fd, buf, size); - if (len != -1) { - break; - } - if (errno == EAGAIN) { - yield_until_fd_readable(s->fd); - } else if (errno != EINTR) { - break; - } - } - - if (len == -1) { - len = -errno; - } - return len; -} - -static int 
unix_close(void *opaque) -{ - QEMUFileSocket *s = opaque; - close(s->fd); - g_free(s); - return 0; -} - -static const QEMUFileOps unix_read_ops = { - .get_fd = socket_get_fd, - .get_buffer = unix_get_buffer, - .close = unix_close -}; - -static const QEMUFileOps unix_write_ops = { - .get_fd = socket_get_fd, - .writev_buffer = unix_writev_buffer, - .close = unix_close -}; - -QEMUFile *qemu_fdopen(int fd, const char *mode) -{ - QEMUFileSocket *s; - - if (mode == NULL || - (mode[0] != 'r' && mode[0] != 'w') || - mode[1] != 'b' || mode[2] != 0) { - fprintf(stderr, "qemu_fdopen: Argument validity check failed\n"); - return NULL; - } - - s = g_new0(QEMUFileSocket, 1); - s->fd = fd; - - if (mode[0] == 'r') { - s->file = qemu_fopen_ops(s, &unix_read_ops); - } else { - s->file = qemu_fopen_ops(s, &unix_write_ops); - } - return s->file; -} - -static const QEMUFileOps socket_read_ops = { - .get_fd = socket_get_fd, - .get_buffer = socket_get_buffer, - .close = socket_close, - .shut_down = socket_shutdown, - .get_return_path = socket_get_return_path -}; - -static const QEMUFileOps socket_write_ops = { - .get_fd = socket_get_fd, - .writev_buffer = socket_writev_buffer, - .close = socket_close, - .shut_down = socket_shutdown, - .get_return_path = socket_get_return_path -}; - -QEMUFile *qemu_fopen_socket(int fd, const char *mode) -{ - QEMUFileSocket *s; - - if (qemu_file_mode_is_not_valid(mode)) { - return NULL; - } - - s = g_new0(QEMUFileSocket, 1); - s->fd = fd; - if (mode[0] == 'w') { - qemu_set_block(s->fd); - s->file = qemu_fopen_ops(s, &socket_write_ops); - } else { - s->file = qemu_fopen_ops(s, &socket_read_ops); - } - return s->file; -} diff --git a/migration/qemu-file.c b/migration/qemu-file.c index 6f4a1299b3..8aea1c7094 100644 --- a/migration/qemu-file.c +++ b/migration/qemu-file.c @@ -30,9 +30,31 @@ #include "qemu/coroutine.h" #include "migration/migration.h" #include "migration/qemu-file.h" -#include "migration/qemu-file-internal.h" #include "trace.h" +#define 
IO_BUF_SIZE 32768 +#define MAX_IOV_SIZE MIN(IOV_MAX, 64) + +struct QEMUFile { + const QEMUFileOps *ops; + const QEMUFileHooks *hooks; + void *opaque; + + int64_t bytes_xfer; + int64_t xfer_limit; + + int64_t pos; /* start of buffer when writing, end of buffer + when reading */ + int buf_index; + int buf_size; /* 0 when writing */ + uint8_t buf[IO_BUF_SIZE]; + + struct iovec iov[MAX_IOV_SIZE]; + unsigned int iovcnt; + + int last_error; +}; + /* * Stop a file from being read/written - not all backing files can do this * typically only sockets can. @@ -80,6 +102,12 @@ QEMUFile *qemu_fopen_ops(void *opaque, const QEMUFileOps *ops) return f; } + +void qemu_file_set_hooks(QEMUFile *f, const QEMUFileHooks *hooks) +{ + f->hooks = hooks; +} + /* * Get last error for stream f * @@ -101,48 +129,49 @@ void qemu_file_set_error(QEMUFile *f, int ret) bool qemu_file_is_writable(QEMUFile *f) { - return f->ops->writev_buffer || f->ops->put_buffer; + return f->ops->writev_buffer; } /** * Flushes QEMUFile buffer * * If there is writev_buffer QEMUFileOps it uses it otherwise uses - * put_buffer ops. + * put_buffer ops. This will flush all pending data. If data was + * only partially flushed, it will set an error state. */ void qemu_fflush(QEMUFile *f) { ssize_t ret = 0; + ssize_t expect = 0; if (!qemu_file_is_writable(f)) { return; } - if (f->ops->writev_buffer) { - if (f->iovcnt > 0) { - ret = f->ops->writev_buffer(f->opaque, f->iov, f->iovcnt, f->pos); - } - } else { - if (f->buf_index > 0) { - ret = f->ops->put_buffer(f->opaque, f->buf, f->pos, f->buf_index); - } + if (f->iovcnt > 0) { + expect = iov_size(f->iov, f->iovcnt); + ret = f->ops->writev_buffer(f->opaque, f->iov, f->iovcnt, f->pos); } + if (ret >= 0) { f->pos += ret; } + /* We expect the QEMUFile write impl to send the full + * data set we requested, so sanity check that. + */ + if (ret != expect) { + qemu_file_set_error(f, ret < 0 ? 
ret : -EIO); + } f->buf_index = 0; f->iovcnt = 0; - if (ret < 0) { - qemu_file_set_error(f, ret); - } } void ram_control_before_iterate(QEMUFile *f, uint64_t flags) { int ret = 0; - if (f->ops->before_ram_iterate) { - ret = f->ops->before_ram_iterate(f, f->opaque, flags, NULL); + if (f->hooks && f->hooks->before_ram_iterate) { + ret = f->hooks->before_ram_iterate(f, f->opaque, flags, NULL); if (ret < 0) { qemu_file_set_error(f, ret); } @@ -153,8 +182,8 @@ void ram_control_after_iterate(QEMUFile *f, uint64_t flags) { int ret = 0; - if (f->ops->after_ram_iterate) { - ret = f->ops->after_ram_iterate(f, f->opaque, flags, NULL); + if (f->hooks && f->hooks->after_ram_iterate) { + ret = f->hooks->after_ram_iterate(f, f->opaque, flags, NULL); if (ret < 0) { qemu_file_set_error(f, ret); } @@ -165,8 +194,8 @@ void ram_control_load_hook(QEMUFile *f, uint64_t flags, void *data) { int ret = -EINVAL; - if (f->ops->hook_ram_load) { - ret = f->ops->hook_ram_load(f, f->opaque, flags, data); + if (f->hooks && f->hooks->hook_ram_load) { + ret = f->hooks->hook_ram_load(f, f->opaque, flags, data); if (ret < 0) { qemu_file_set_error(f, ret); } @@ -185,9 +214,9 @@ size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset, ram_addr_t offset, size_t size, uint64_t *bytes_sent) { - if (f->ops->save_page) { - int ret = f->ops->save_page(f, f->opaque, block_offset, - offset, size, bytes_sent); + if (f->hooks && f->hooks->save_page) { + int ret = f->hooks->save_page(f, f->opaque, block_offset, + offset, size, bytes_sent); if (ret != RAM_SAVE_CONTROL_DELAYED) { if (bytes_sent && *bytes_sent > 0) { @@ -239,14 +268,6 @@ static ssize_t qemu_fill_buffer(QEMUFile *f) return len; } -int qemu_get_fd(QEMUFile *f) -{ - if (f->ops->get_fd) { - return f->ops->get_fd(f->opaque); - } - return -1; -} - void qemu_update_position(QEMUFile *f, size_t size) { f->pos += size; @@ -301,11 +322,6 @@ static void add_to_iovec(QEMUFile *f, const uint8_t *buf, size_t size) void qemu_put_buffer_async(QEMUFile *f, 
const uint8_t *buf, size_t size) { - if (!f->ops->writev_buffer) { - qemu_put_buffer(f, buf, size); - return; - } - if (f->last_error) { return; } @@ -329,9 +345,7 @@ void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, size_t size) } memcpy(f->buf + f->buf_index, buf, l); f->bytes_xfer += l; - if (f->ops->writev_buffer) { - add_to_iovec(f, f->buf + f->buf_index, l); - } + add_to_iovec(f, f->buf + f->buf_index, l); f->buf_index += l; if (f->buf_index == IO_BUF_SIZE) { qemu_fflush(f); @@ -352,9 +366,7 @@ void qemu_put_byte(QEMUFile *f, int v) f->buf[f->buf_index] = v; f->bytes_xfer++; - if (f->ops->writev_buffer) { - add_to_iovec(f, f->buf + f->buf_index, 1); - } + add_to_iovec(f, f->buf + f->buf_index, 1); f->buf_index++; if (f->buf_index == IO_BUF_SIZE) { qemu_fflush(f); @@ -518,12 +530,8 @@ int64_t qemu_ftell_fast(QEMUFile *f) int64_t ret = f->pos; int i; - if (f->ops->writev_buffer) { - for (i = 0; i < f->iovcnt; i++) { - ret += f->iov[i].iov_len; - } - } else { - ret += f->buf_index; + for (i = 0; i < f->iovcnt; i++) { + ret += f->iov[i].iov_len; } return ret; @@ -670,9 +678,7 @@ size_t qemu_get_counted_string(QEMUFile *f, char buf[256]) */ void qemu_file_set_blocking(QEMUFile *f, bool block) { - if (block) { - qemu_set_block(qemu_get_fd(f)); - } else { - qemu_set_nonblock(qemu_get_fd(f)); + if (f->ops->set_blocking) { + f->ops->set_blocking(f->opaque, block); } } diff --git a/migration/ram.c b/migration/ram.c index 54e215128c..844ea4694f 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -429,10 +429,8 @@ static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset) static void mig_throttle_guest_down(void) { MigrationState *s = migrate_get_current(); - uint64_t pct_initial = - s->parameters[MIGRATION_PARAMETER_CPU_THROTTLE_INITIAL]; - uint64_t pct_icrement = - s->parameters[MIGRATION_PARAMETER_CPU_THROTTLE_INCREMENT]; + uint64_t pct_initial = s->parameters.cpu_throttle_initial; + uint64_t pct_icrement = 
s->parameters.cpu_throttle_increment; /* We have not started throttling yet. Let's start it. */ if (!cpu_throttle_active()) { diff --git a/migration/rdma.c b/migration/rdma.c index f6a9992b3e..51bafc702b 100644 --- a/migration/rdma.c +++ b/migration/rdma.c @@ -2,10 +2,12 @@ * RDMA protocol and interfaces * * Copyright IBM, Corp. 2010-2013 + * Copyright Red Hat, Inc. 2015-2016 * * Authors: * Michael R. Hines <mrhines@us.ibm.com> * Jiuxing Liu <jl@us.ibm.com> + * Daniel P. Berrange <berrange@redhat.com> * * This work is licensed under the terms of the GNU GPL, version 2 or * later. See the COPYING file in the top-level directory. @@ -374,14 +376,20 @@ typedef struct RDMAContext { GHashTable *blockmap; } RDMAContext; -/* - * Interface to the rest of the migration call stack. - */ -typedef struct QEMUFileRDMA { +#define TYPE_QIO_CHANNEL_RDMA "qio-channel-rdma" +#define QIO_CHANNEL_RDMA(obj) \ + OBJECT_CHECK(QIOChannelRDMA, (obj), TYPE_QIO_CHANNEL_RDMA) + +typedef struct QIOChannelRDMA QIOChannelRDMA; + + +struct QIOChannelRDMA { + QIOChannel parent; RDMAContext *rdma; + QEMUFile *file; size_t len; - void *file; -} QEMUFileRDMA; + bool blocking; /* XXX we don't actually honour this yet */ +}; /* * Main structure for IB Send/Recv control messages. @@ -2518,15 +2526,19 @@ static void *qemu_rdma_data_init(const char *host_port, Error **errp) * SEND messages for control only. * VM's ram is handled with regular RDMA messages. 
*/ -static ssize_t qemu_rdma_put_buffer(void *opaque, const uint8_t *buf, - int64_t pos, size_t size) -{ - QEMUFileRDMA *r = opaque; - QEMUFile *f = r->file; - RDMAContext *rdma = r->rdma; - size_t remaining = size; - uint8_t * data = (void *) buf; +static ssize_t qio_channel_rdma_writev(QIOChannel *ioc, + const struct iovec *iov, + size_t niov, + int *fds, + size_t nfds, + Error **errp) +{ + QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc); + QEMUFile *f = rioc->file; + RDMAContext *rdma = rioc->rdma; int ret; + ssize_t done = 0; + size_t i; CHECK_ERROR_STATE(); @@ -2540,27 +2552,31 @@ static ssize_t qemu_rdma_put_buffer(void *opaque, const uint8_t *buf, return ret; } - while (remaining) { - RDMAControlHeader head; + for (i = 0; i < niov; i++) { + size_t remaining = iov[i].iov_len; + uint8_t * data = (void *)iov[i].iov_base; + while (remaining) { + RDMAControlHeader head; - r->len = MIN(remaining, RDMA_SEND_INCREMENT); - remaining -= r->len; + rioc->len = MIN(remaining, RDMA_SEND_INCREMENT); + remaining -= rioc->len; - /* Guaranteed to fit due to RDMA_SEND_INCREMENT MIN above */ - head.len = (uint32_t)r->len; - head.type = RDMA_CONTROL_QEMU_FILE; + head.len = rioc->len; + head.type = RDMA_CONTROL_QEMU_FILE; - ret = qemu_rdma_exchange_send(rdma, &head, data, NULL, NULL, NULL); + ret = qemu_rdma_exchange_send(rdma, &head, data, NULL, NULL, NULL); - if (ret < 0) { - rdma->error_state = ret; - return ret; - } + if (ret < 0) { + rdma->error_state = ret; + return ret; + } - data += r->len; + data += rioc->len; + done += rioc->len; + } } - return size; + return done; } static size_t qemu_rdma_fill(RDMAContext *rdma, uint8_t *buf, @@ -2585,41 +2601,74 @@ static size_t qemu_rdma_fill(RDMAContext *rdma, uint8_t *buf, * RDMA links don't use bytestreams, so we have to * return bytes to QEMUFile opportunistically. 
*/ -static ssize_t qemu_rdma_get_buffer(void *opaque, uint8_t *buf, - int64_t pos, size_t size) -{ - QEMUFileRDMA *r = opaque; - RDMAContext *rdma = r->rdma; +static ssize_t qio_channel_rdma_readv(QIOChannel *ioc, + const struct iovec *iov, + size_t niov, + int **fds, + size_t *nfds, + Error **errp) +{ + QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc); + RDMAContext *rdma = rioc->rdma; RDMAControlHeader head; int ret = 0; + ssize_t i; + size_t done = 0; CHECK_ERROR_STATE(); - /* - * First, we hold on to the last SEND message we - * were given and dish out the bytes until we run - * out of bytes. - */ - r->len = qemu_rdma_fill(r->rdma, buf, size, 0); - if (r->len) { - return r->len; - } + for (i = 0; i < niov; i++) { + size_t want = iov[i].iov_len; + uint8_t *data = (void *)iov[i].iov_base; - /* - * Once we run out, we block and wait for another - * SEND message to arrive. - */ - ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_QEMU_FILE); + /* + * First, we hold on to the last SEND message we + * were given and dish out the bytes until we run + * out of bytes. + */ + ret = qemu_rdma_fill(rioc->rdma, data, want, 0); + done += ret; + want -= ret; + /* Got what we needed, so go to next iovec */ + if (want == 0) { + continue; + } - if (ret < 0) { - rdma->error_state = ret; - return ret; - } + /* If we got any data so far, then don't wait + * for more, just return what we have */ + if (done > 0) { + break; + } - /* - * SEND was received with new bytes, now try again. - */ - return qemu_rdma_fill(r->rdma, buf, size, 0); + + /* We've got nothing at all, so lets wait for + * more to arrive + */ + ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_QEMU_FILE); + + if (ret < 0) { + rdma->error_state = ret; + return ret; + } + + /* + * SEND was received with new bytes, now try again. 
+ */ + ret = qemu_rdma_fill(rioc->rdma, data, want, 0); + done += ret; + want -= ret; + + /* Still didn't get enough, so lets just return */ + if (want) { + if (done == 0) { + return QIO_CHANNEL_ERR_BLOCK; + } else { + break; + } + } + } + rioc->len = done; + return rioc->len; } /* @@ -2646,15 +2695,122 @@ static int qemu_rdma_drain_cq(QEMUFile *f, RDMAContext *rdma) return 0; } -static int qemu_rdma_close(void *opaque) + +static int qio_channel_rdma_set_blocking(QIOChannel *ioc, + bool blocking, + Error **errp) +{ + QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc); + /* XXX we should make readv/writev actually honour this :-) */ + rioc->blocking = blocking; + return 0; +} + + +typedef struct QIOChannelRDMASource QIOChannelRDMASource; +struct QIOChannelRDMASource { + GSource parent; + QIOChannelRDMA *rioc; + GIOCondition condition; +}; + +static gboolean +qio_channel_rdma_source_prepare(GSource *source, + gint *timeout) +{ + QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source; + RDMAContext *rdma = rsource->rioc->rdma; + GIOCondition cond = 0; + *timeout = -1; + + if (rdma->wr_data[0].control_len) { + cond |= G_IO_IN; + } + cond |= G_IO_OUT; + + return cond & rsource->condition; +} + +static gboolean +qio_channel_rdma_source_check(GSource *source) +{ + QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source; + RDMAContext *rdma = rsource->rioc->rdma; + GIOCondition cond = 0; + + if (rdma->wr_data[0].control_len) { + cond |= G_IO_IN; + } + cond |= G_IO_OUT; + + return cond & rsource->condition; +} + +static gboolean +qio_channel_rdma_source_dispatch(GSource *source, + GSourceFunc callback, + gpointer user_data) +{ + QIOChannelFunc func = (QIOChannelFunc)callback; + QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source; + RDMAContext *rdma = rsource->rioc->rdma; + GIOCondition cond = 0; + + if (rdma->wr_data[0].control_len) { + cond |= G_IO_IN; + } + cond |= G_IO_OUT; + + return (*func)(QIO_CHANNEL(rsource->rioc), + (cond & rsource->condition), 
+ user_data); +} + +static void +qio_channel_rdma_source_finalize(GSource *source) +{ + QIOChannelRDMASource *ssource = (QIOChannelRDMASource *)source; + + object_unref(OBJECT(ssource->rioc)); +} + +GSourceFuncs qio_channel_rdma_source_funcs = { + qio_channel_rdma_source_prepare, + qio_channel_rdma_source_check, + qio_channel_rdma_source_dispatch, + qio_channel_rdma_source_finalize +}; + +static GSource *qio_channel_rdma_create_watch(QIOChannel *ioc, + GIOCondition condition) +{ + QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc); + QIOChannelRDMASource *ssource; + GSource *source; + + source = g_source_new(&qio_channel_rdma_source_funcs, + sizeof(QIOChannelRDMASource)); + ssource = (QIOChannelRDMASource *)source; + + ssource->rioc = rioc; + object_ref(OBJECT(rioc)); + + ssource->condition = condition; + + return source; +} + + +static int qio_channel_rdma_close(QIOChannel *ioc, + Error **errp) { + QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc); trace_qemu_rdma_close(); - QEMUFileRDMA *r = opaque; - if (r->rdma) { - qemu_rdma_cleanup(r->rdma); - g_free(r->rdma); + if (rioc->rdma) { + qemu_rdma_cleanup(rioc->rdma); + g_free(rioc->rdma); + rioc->rdma = NULL; } - g_free(r); return 0; } @@ -2696,8 +2852,8 @@ static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque, ram_addr_t block_offset, ram_addr_t offset, size_t size, uint64_t *bytes_sent) { - QEMUFileRDMA *rfile = opaque; - RDMAContext *rdma = rfile->rdma; + QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(opaque); + RDMAContext *rdma = rioc->rdma; int ret; CHECK_ERROR_STATE(); @@ -2951,8 +3107,8 @@ static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque) }; RDMAControlHeader blocks = { .type = RDMA_CONTROL_RAM_BLOCKS_RESULT, .repeat = 1 }; - QEMUFileRDMA *rfile = opaque; - RDMAContext *rdma = rfile->rdma; + QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(opaque); + RDMAContext *rdma = rioc->rdma; RDMALocalBlocks *local = &rdma->local_ram_blocks; RDMAControlHeader head; RDMARegister *reg, *registers; @@ -3207,9 +3363,10 @@ 
out: * We've already built our local RAMBlock list, but not yet sent the list to * the source. */ -static int rdma_block_notification_handle(QEMUFileRDMA *rfile, const char *name) +static int +rdma_block_notification_handle(QIOChannelRDMA *rioc, const char *name) { - RDMAContext *rdma = rfile->rdma; + RDMAContext *rdma = rioc->rdma; int curr; int found = -1; @@ -3251,8 +3408,8 @@ static int rdma_load_hook(QEMUFile *f, void *opaque, uint64_t flags, void *data) static int qemu_rdma_registration_start(QEMUFile *f, void *opaque, uint64_t flags, void *data) { - QEMUFileRDMA *rfile = opaque; - RDMAContext *rdma = rfile->rdma; + QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(opaque); + RDMAContext *rdma = rioc->rdma; CHECK_ERROR_STATE(); @@ -3271,8 +3428,8 @@ static int qemu_rdma_registration_stop(QEMUFile *f, void *opaque, uint64_t flags, void *data) { Error *local_err = NULL, **errp = &local_err; - QEMUFileRDMA *rfile = opaque; - RDMAContext *rdma = rfile->rdma; + QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(opaque); + RDMAContext *rdma = rioc->rdma; RDMAControlHeader head = { .len = 0, .repeat = 1 }; int ret = 0; @@ -3368,47 +3525,74 @@ err: return ret; } -static int qemu_rdma_get_fd(void *opaque) -{ - QEMUFileRDMA *rfile = opaque; - RDMAContext *rdma = rfile->rdma; - - return rdma->comp_channel->fd; -} - -static const QEMUFileOps rdma_read_ops = { - .get_buffer = qemu_rdma_get_buffer, - .get_fd = qemu_rdma_get_fd, - .close = qemu_rdma_close, +static const QEMUFileHooks rdma_read_hooks = { .hook_ram_load = rdma_load_hook, }; -static const QEMUFileOps rdma_write_ops = { - .put_buffer = qemu_rdma_put_buffer, - .close = qemu_rdma_close, +static const QEMUFileHooks rdma_write_hooks = { .before_ram_iterate = qemu_rdma_registration_start, .after_ram_iterate = qemu_rdma_registration_stop, .save_page = qemu_rdma_save_page, }; -static void *qemu_fopen_rdma(RDMAContext *rdma, const char *mode) + +static void qio_channel_rdma_finalize(Object *obj) { - QEMUFileRDMA *r; + QIOChannelRDMA *rioc 
= QIO_CHANNEL_RDMA(obj); + if (rioc->rdma) { + qemu_rdma_cleanup(rioc->rdma); + g_free(rioc->rdma); + rioc->rdma = NULL; + } +} + +static void qio_channel_rdma_class_init(ObjectClass *klass, + void *class_data G_GNUC_UNUSED) +{ + QIOChannelClass *ioc_klass = QIO_CHANNEL_CLASS(klass); + + ioc_klass->io_writev = qio_channel_rdma_writev; + ioc_klass->io_readv = qio_channel_rdma_readv; + ioc_klass->io_set_blocking = qio_channel_rdma_set_blocking; + ioc_klass->io_close = qio_channel_rdma_close; + ioc_klass->io_create_watch = qio_channel_rdma_create_watch; +} + +static const TypeInfo qio_channel_rdma_info = { + .parent = TYPE_QIO_CHANNEL, + .name = TYPE_QIO_CHANNEL_RDMA, + .instance_size = sizeof(QIOChannelRDMA), + .instance_finalize = qio_channel_rdma_finalize, + .class_init = qio_channel_rdma_class_init, +}; + +static void qio_channel_rdma_register_types(void) +{ + type_register_static(&qio_channel_rdma_info); +} + +type_init(qio_channel_rdma_register_types); + +static QEMUFile *qemu_fopen_rdma(RDMAContext *rdma, const char *mode) +{ + QIOChannelRDMA *rioc; if (qemu_file_mode_is_not_valid(mode)) { return NULL; } - r = g_new0(QEMUFileRDMA, 1); - r->rdma = rdma; + rioc = QIO_CHANNEL_RDMA(object_new(TYPE_QIO_CHANNEL_RDMA)); + rioc->rdma = rdma; if (mode[0] == 'w') { - r->file = qemu_fopen_ops(r, &rdma_write_ops); + rioc->file = qemu_fopen_channel_output(QIO_CHANNEL(rioc)); + qemu_file_set_hooks(rioc->file, &rdma_write_hooks); } else { - r->file = qemu_fopen_ops(r, &rdma_read_ops); + rioc->file = qemu_fopen_channel_input(QIO_CHANNEL(rioc)); + qemu_file_set_hooks(rioc->file, &rdma_read_hooks); } - return r->file; + return rioc->file; } static void rdma_accept_incoming_migration(void *opaque) @@ -3481,16 +3665,14 @@ void rdma_start_outgoing_migration(void *opaque, const char *host_port, Error **errp) { MigrationState *s = opaque; - Error *local_err = NULL, **temp = &local_err; - RDMAContext *rdma = qemu_rdma_data_init(host_port, &local_err); + RDMAContext *rdma = 
qemu_rdma_data_init(host_port, errp); int ret = 0; if (rdma == NULL) { - ERROR(temp, "Failed to initialize RDMA data structures! %d", ret); goto err; } - ret = qemu_rdma_source_init(rdma, &local_err, + ret = qemu_rdma_source_init(rdma, errp, s->enabled_capabilities[MIGRATION_CAPABILITY_RDMA_PIN_ALL]); if (ret) { @@ -3498,7 +3680,7 @@ void rdma_start_outgoing_migration(void *opaque, } trace_rdma_start_outgoing_migration_after_rdma_source_init(); - ret = qemu_rdma_connect(rdma, &local_err); + ret = qemu_rdma_connect(rdma, errp); if (ret) { goto err; @@ -3510,7 +3692,5 @@ void rdma_start_outgoing_migration(void *opaque, migrate_fd_connect(s); return; err: - error_propagate(errp, local_err); g_free(rdma); - migrate_fd_error(s); } diff --git a/migration/savevm.c b/migration/savevm.c index 65ce0c61a3..6c21231131 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -51,6 +51,8 @@ #include "block/snapshot.h" #include "block/qapi.h" #include "qemu/cutils.h" +#include "io/channel-buffer.h" +#include "io/channel-file.h" #ifndef ETH_P_RARP #define ETH_P_RARP 0x8035 @@ -158,13 +160,6 @@ static ssize_t block_writev_buffer(void *opaque, struct iovec *iov, int iovcnt, return qiov.size; } -static ssize_t block_put_buffer(void *opaque, const uint8_t *buf, - int64_t pos, size_t size) -{ - bdrv_save_vmstate(opaque, buf, pos, size); - return size; -} - static ssize_t block_get_buffer(void *opaque, uint8_t *buf, int64_t pos, size_t size) { @@ -182,7 +177,6 @@ static const QEMUFileOps bdrv_read_ops = { }; static const QEMUFileOps bdrv_write_ops = { - .put_buffer = block_put_buffer, .writev_buffer = block_writev_buffer, .close = bdrv_fclose }; @@ -760,10 +754,8 @@ void qemu_savevm_send_open_return_path(QEMUFile *f) * 0 on success * -ve on error */ -int qemu_savevm_send_packaged(QEMUFile *f, const QEMUSizedBuffer *qsb) +int qemu_savevm_send_packaged(QEMUFile *f, const uint8_t *buf, size_t len) { - size_t cur_iov; - size_t len = qsb_get_length(qsb); uint32_t tmp; if (len > 
MAX_VM_CMD_PACKAGED_SIZE) { @@ -777,18 +769,7 @@ int qemu_savevm_send_packaged(QEMUFile *f, const QEMUSizedBuffer *qsb) trace_qemu_savevm_send_packaged(); qemu_savevm_command_send(f, MIG_CMD_PACKAGED, 4, (uint8_t *)&tmp); - /* all the data follows (concatinating the iov's) */ - for (cur_iov = 0; cur_iov < qsb->n_iov; cur_iov++) { - /* The iov entries are partially filled */ - size_t towrite = MIN(qsb->iov[cur_iov].iov_len, len); - len -= towrite; - - if (!towrite) { - break; - } - - qemu_put_buffer(f, qsb->iov[cur_iov].iov_base, towrite); - } + qemu_put_buffer(f, buf, len); return 0; } @@ -1578,39 +1559,36 @@ static int loadvm_postcopy_handle_run(MigrationIncomingState *mis) static int loadvm_handle_cmd_packaged(MigrationIncomingState *mis) { int ret; - uint8_t *buffer; - uint32_t length; - QEMUSizedBuffer *qsb; + size_t length; + QIOChannelBuffer *bioc; length = qemu_get_be32(mis->from_src_file); trace_loadvm_handle_cmd_packaged(length); if (length > MAX_VM_CMD_PACKAGED_SIZE) { - error_report("Unreasonably large packaged state: %u", length); + error_report("Unreasonably large packaged state: %zu", length); return -1; } - buffer = g_malloc0(length); - ret = qemu_get_buffer(mis->from_src_file, buffer, (int)length); + + bioc = qio_channel_buffer_new(length); + ret = qemu_get_buffer(mis->from_src_file, + bioc->data, + length); if (ret != length) { - g_free(buffer); - error_report("CMD_PACKAGED: Buffer receive fail ret=%d length=%d", + object_unref(OBJECT(bioc)); + error_report("CMD_PACKAGED: Buffer receive fail ret=%d length=%zu", ret, length); return (ret < 0) ? 
ret : -EAGAIN; } + bioc->usage += length; trace_loadvm_handle_cmd_packaged_received(ret); - /* Setup a dummy QEMUFile that actually reads from the buffer */ - qsb = qsb_create(buffer, length); - g_free(buffer); /* Because qsb_create copies */ - if (!qsb) { - error_report("Unable to create qsb"); - } - QEMUFile *packf = qemu_bufopen("r", qsb); + QEMUFile *packf = qemu_fopen_channel_input(QIO_CHANNEL(bioc)); ret = qemu_loadvm_state_main(packf, mis); trace_loadvm_handle_cmd_packaged_main(ret); qemu_fclose(packf); - qsb_free(qsb); + object_unref(OBJECT(bioc)); return ret; } @@ -2061,6 +2039,7 @@ void hmp_savevm(Monitor *mon, const QDict *qdict) void qmp_xen_save_devices_state(const char *filename, Error **errp) { QEMUFile *f; + QIOChannelFile *ioc; int saved_vm_running; int ret; @@ -2068,11 +2047,11 @@ void qmp_xen_save_devices_state(const char *filename, Error **errp) vm_stop(RUN_STATE_SAVE_VM); global_state_store_running(); - f = qemu_fopen(filename, "wb"); - if (!f) { - error_setg_file_open(errp, errno, filename); + ioc = qio_channel_file_new_path(filename, O_WRONLY | O_CREAT, 0660, errp); + if (!ioc) { goto the_end; } + f = qemu_fopen_channel_output(QIO_CHANNEL(ioc)); ret = qemu_save_device_state(f); qemu_fclose(f); if (ret < 0) { diff --git a/migration/socket.c b/migration/socket.c new file mode 100644 index 0000000000..977a8d3c1d --- /dev/null +++ b/migration/socket.c @@ -0,0 +1,183 @@ +/* + * QEMU live migration via Unix Domain Sockets + * + * Copyright Red Hat, Inc. 2009-2016 + * + * Authors: + * Chris Lalancette <clalance@redhat.com> + * Daniel P. Berrange <berrange@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. 
+ */ + +#include "qemu/osdep.h" + +#include "qemu-common.h" +#include "qemu/error-report.h" +#include "qapi/error.h" +#include "migration/migration.h" +#include "migration/qemu-file.h" +#include "io/channel-socket.h" +#include "trace.h" + + +static SocketAddress *tcp_build_address(const char *host_port, Error **errp) +{ + InetSocketAddress *iaddr = inet_parse(host_port, errp); + SocketAddress *saddr; + + if (!iaddr) { + return NULL; + } + + saddr = g_new0(SocketAddress, 1); + saddr->type = SOCKET_ADDRESS_KIND_INET; + saddr->u.inet.data = iaddr; + + return saddr; +} + + +static SocketAddress *unix_build_address(const char *path) +{ + SocketAddress *saddr; + + saddr = g_new0(SocketAddress, 1); + saddr->type = SOCKET_ADDRESS_KIND_UNIX; + saddr->u.q_unix.data = g_new0(UnixSocketAddress, 1); + saddr->u.q_unix.data->path = g_strdup(path); + + return saddr; +} + + +struct SocketConnectData { + MigrationState *s; + char *hostname; +}; + +static void socket_connect_data_free(void *opaque) +{ + struct SocketConnectData *data = opaque; + if (!data) { + return; + } + g_free(data->hostname); + g_free(data); +} + +static void socket_outgoing_migration(Object *src, + Error *err, + gpointer opaque) +{ + struct SocketConnectData *data = opaque; + QIOChannel *sioc = QIO_CHANNEL(src); + + if (err) { + trace_migration_socket_outgoing_error(error_get_pretty(err)); + data->s->to_dst_file = NULL; + migrate_fd_error(data->s, err); + } else { + trace_migration_socket_outgoing_connected(data->hostname); + migration_set_outgoing_channel(data->s, sioc, data->hostname); + } + object_unref(src); +} + +static void socket_start_outgoing_migration(MigrationState *s, + SocketAddress *saddr, + Error **errp) +{ + QIOChannelSocket *sioc = qio_channel_socket_new(); + struct SocketConnectData *data = g_new0(struct SocketConnectData, 1); + data->s = s; + if (saddr->type == SOCKET_ADDRESS_KIND_INET) { + data->hostname = g_strdup(saddr->u.inet.data->host); + } + qio_channel_socket_connect_async(sioc, + 
saddr, + socket_outgoing_migration, + data, + socket_connect_data_free); + qapi_free_SocketAddress(saddr); +} + +void tcp_start_outgoing_migration(MigrationState *s, + const char *host_port, + Error **errp) +{ + SocketAddress *saddr = tcp_build_address(host_port, errp); + socket_start_outgoing_migration(s, saddr, errp); +} + +void unix_start_outgoing_migration(MigrationState *s, + const char *path, + Error **errp) +{ + SocketAddress *saddr = unix_build_address(path); + socket_start_outgoing_migration(s, saddr, errp); +} + + +static gboolean socket_accept_incoming_migration(QIOChannel *ioc, + GIOCondition condition, + gpointer opaque) +{ + QIOChannelSocket *sioc; + Error *err = NULL; + + sioc = qio_channel_socket_accept(QIO_CHANNEL_SOCKET(ioc), + &err); + if (!sioc) { + error_report("could not accept migration connection (%s)", + error_get_pretty(err)); + goto out; + } + + trace_migration_socket_incoming_accepted(); + + migration_set_incoming_channel(migrate_get_current(), + QIO_CHANNEL(sioc)); + object_unref(OBJECT(sioc)); + +out: + /* Close listening socket as its no longer needed */ + qio_channel_close(ioc, NULL); + return FALSE; /* unregister */ +} + + +static void socket_start_incoming_migration(SocketAddress *saddr, + Error **errp) +{ + QIOChannelSocket *listen_ioc = qio_channel_socket_new(); + + if (qio_channel_socket_listen_sync(listen_ioc, saddr, errp) < 0) { + object_unref(OBJECT(listen_ioc)); + qapi_free_SocketAddress(saddr); + return; + } + + qio_channel_add_watch(QIO_CHANNEL(listen_ioc), + G_IO_IN, + socket_accept_incoming_migration, + listen_ioc, + (GDestroyNotify)object_unref); + qapi_free_SocketAddress(saddr); +} + +void tcp_start_incoming_migration(const char *host_port, Error **errp) +{ + SocketAddress *saddr = tcp_build_address(host_port, errp); + socket_start_incoming_migration(saddr, errp); +} + +void unix_start_incoming_migration(const char *path, Error **errp) +{ + SocketAddress *saddr = unix_build_address(path); + 
socket_start_incoming_migration(saddr, errp); +} diff --git a/migration/tcp.c b/migration/tcp.c deleted file mode 100644 index e1fa7f8f18..0000000000 --- a/migration/tcp.c +++ /dev/null @@ -1,102 +0,0 @@ -/* - * QEMU live migration - * - * Copyright IBM, Corp. 2008 - * - * Authors: - * Anthony Liguori <aliguori@us.ibm.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - * Contributions after 2012-01-13 are licensed under the terms of the - * GNU GPL, version 2 or (at your option) any later version. - */ - -#include "qemu/osdep.h" - -#include "qemu-common.h" -#include "qemu/error-report.h" -#include "qemu/sockets.h" -#include "migration/migration.h" -#include "migration/qemu-file.h" -#include "block/block.h" -#include "qemu/main-loop.h" - -//#define DEBUG_MIGRATION_TCP - -#ifdef DEBUG_MIGRATION_TCP -#define DPRINTF(fmt, ...) \ - do { printf("migration-tcp: " fmt, ## __VA_ARGS__); } while (0) -#else -#define DPRINTF(fmt, ...) 
\ - do { } while (0) -#endif - -static void tcp_wait_for_connect(int fd, Error *err, void *opaque) -{ - MigrationState *s = opaque; - - if (fd < 0) { - DPRINTF("migrate connect error: %s\n", error_get_pretty(err)); - s->to_dst_file = NULL; - migrate_fd_error(s); - } else { - DPRINTF("migrate connect success\n"); - s->to_dst_file = qemu_fopen_socket(fd, "wb"); - migrate_fd_connect(s); - } -} - -void tcp_start_outgoing_migration(MigrationState *s, const char *host_port, Error **errp) -{ - inet_nonblocking_connect(host_port, tcp_wait_for_connect, s, errp); -} - -static void tcp_accept_incoming_migration(void *opaque) -{ - struct sockaddr_in addr; - socklen_t addrlen = sizeof(addr); - int s = (intptr_t)opaque; - QEMUFile *f; - int c; - - do { - c = qemu_accept(s, (struct sockaddr *)&addr, &addrlen); - } while (c < 0 && errno == EINTR); - qemu_set_fd_handler(s, NULL, NULL, NULL); - closesocket(s); - - DPRINTF("accepted migration\n"); - - if (c < 0) { - error_report("could not accept migration connection (%s)", - strerror(errno)); - return; - } - - f = qemu_fopen_socket(c, "rb"); - if (f == NULL) { - error_report("could not qemu_fopen socket"); - goto out; - } - - process_incoming_migration(f); - return; - -out: - closesocket(c); -} - -void tcp_start_incoming_migration(const char *host_port, Error **errp) -{ - int s; - - s = inet_listen(host_port, NULL, 256, SOCK_STREAM, 0, errp); - if (s < 0) { - return; - } - - qemu_set_fd_handler(s, tcp_accept_incoming_migration, NULL, - (void *)(intptr_t)s); -} diff --git a/migration/tls.c b/migration/tls.c new file mode 100644 index 0000000000..75f959ff9c --- /dev/null +++ b/migration/tls.c @@ -0,0 +1,161 @@ +/* + * QEMU migration TLS support + * + * Copyright (c) 2015 Red Hat, Inc. 
+ * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + * + */ + +#include "qemu/osdep.h" +#include "migration/migration.h" +#include "io/channel-tls.h" +#include "crypto/tlscreds.h" +#include "qemu/error-report.h" +#include "qapi/error.h" +#include "trace.h" + +static QCryptoTLSCreds * +migration_tls_get_creds(MigrationState *s, + QCryptoTLSCredsEndpoint endpoint, + Error **errp) +{ + Object *creds; + QCryptoTLSCreds *ret; + + creds = object_resolve_path_component( + object_get_objects_root(), s->parameters.tls_creds); + if (!creds) { + error_setg(errp, "No TLS credentials with id '%s'", + s->parameters.tls_creds); + return NULL; + } + ret = (QCryptoTLSCreds *)object_dynamic_cast( + creds, TYPE_QCRYPTO_TLS_CREDS); + if (!ret) { + error_setg(errp, "Object with id '%s' is not TLS credentials", + s->parameters.tls_creds); + return NULL; + } + if (ret->endpoint != endpoint) { + error_setg(errp, + "Expected TLS credentials for a %s endpoint", + endpoint == QCRYPTO_TLS_CREDS_ENDPOINT_CLIENT ? 
+ "client" : "server"); + return NULL; + } + + object_ref(OBJECT(ret)); + return ret; +} + + +static void migration_tls_incoming_handshake(Object *src, + Error *err, + gpointer opaque) +{ + QIOChannel *ioc = QIO_CHANNEL(src); + + if (err) { + trace_migration_tls_incoming_handshake_error(error_get_pretty(err)); + error_report("%s", error_get_pretty(err)); + } else { + trace_migration_tls_incoming_handshake_complete(); + migration_set_incoming_channel(migrate_get_current(), ioc); + } + object_unref(OBJECT(ioc)); +} + +void migration_tls_set_incoming_channel(MigrationState *s, + QIOChannel *ioc, + Error **errp) +{ + QCryptoTLSCreds *creds; + QIOChannelTLS *tioc; + + creds = migration_tls_get_creds( + s, QCRYPTO_TLS_CREDS_ENDPOINT_SERVER, errp); + if (!creds) { + return; + } + + tioc = qio_channel_tls_new_server( + ioc, creds, + NULL, /* XXX pass ACL name */ + errp); + if (!tioc) { + return; + } + + trace_migration_tls_incoming_handshake_start(); + qio_channel_tls_handshake(tioc, + migration_tls_incoming_handshake, + NULL, + NULL); +} + + +static void migration_tls_outgoing_handshake(Object *src, + Error *err, + gpointer opaque) +{ + MigrationState *s = opaque; + QIOChannel *ioc = QIO_CHANNEL(src); + + if (err) { + trace_migration_tls_outgoing_handshake_error(error_get_pretty(err)); + s->to_dst_file = NULL; + migrate_fd_error(s, err); + } else { + trace_migration_tls_outgoing_handshake_complete(); + migration_set_outgoing_channel(s, ioc, NULL); + } + object_unref(OBJECT(ioc)); +} + + +void migration_tls_set_outgoing_channel(MigrationState *s, + QIOChannel *ioc, + const char *hostname, + Error **errp) +{ + QCryptoTLSCreds *creds; + QIOChannelTLS *tioc; + + creds = migration_tls_get_creds( + s, QCRYPTO_TLS_CREDS_ENDPOINT_CLIENT, errp); + if (!creds) { + return; + } + + if (s->parameters.tls_hostname) { + hostname = s->parameters.tls_hostname; + } + if (!hostname) { + error_setg(errp, "No hostname available for TLS"); + return; + } + + tioc = qio_channel_tls_new_client( + 
ioc, creds, hostname, errp); + if (!tioc) { + return; + } + + trace_migration_tls_outgoing_handshake_start(hostname); + qio_channel_tls_handshake(tioc, + migration_tls_outgoing_handshake, + s, + NULL); +} diff --git a/migration/unix.c b/migration/unix.c deleted file mode 100644 index d9aac36b9a..0000000000 --- a/migration/unix.c +++ /dev/null @@ -1,103 +0,0 @@ -/* - * QEMU live migration via Unix Domain Sockets - * - * Copyright Red Hat, Inc. 2009 - * - * Authors: - * Chris Lalancette <clalance@redhat.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - * Contributions after 2012-01-13 are licensed under the terms of the - * GNU GPL, version 2 or (at your option) any later version. - */ - -#include "qemu/osdep.h" - -#include "qemu-common.h" -#include "qemu/error-report.h" -#include "qemu/sockets.h" -#include "qemu/main-loop.h" -#include "migration/migration.h" -#include "migration/qemu-file.h" -#include "block/block.h" - -//#define DEBUG_MIGRATION_UNIX - -#ifdef DEBUG_MIGRATION_UNIX -#define DPRINTF(fmt, ...) \ - do { printf("migration-unix: " fmt, ## __VA_ARGS__); } while (0) -#else -#define DPRINTF(fmt, ...) 
\ - do { } while (0) -#endif - -static void unix_wait_for_connect(int fd, Error *err, void *opaque) -{ - MigrationState *s = opaque; - - if (fd < 0) { - DPRINTF("migrate connect error: %s\n", error_get_pretty(err)); - s->to_dst_file = NULL; - migrate_fd_error(s); - } else { - DPRINTF("migrate connect success\n"); - s->to_dst_file = qemu_fopen_socket(fd, "wb"); - migrate_fd_connect(s); - } -} - -void unix_start_outgoing_migration(MigrationState *s, const char *path, Error **errp) -{ - unix_nonblocking_connect(path, unix_wait_for_connect, s, errp); -} - -static void unix_accept_incoming_migration(void *opaque) -{ - struct sockaddr_un addr; - socklen_t addrlen = sizeof(addr); - int s = (intptr_t)opaque; - QEMUFile *f; - int c, err; - - do { - c = qemu_accept(s, (struct sockaddr *)&addr, &addrlen); - err = errno; - } while (c < 0 && err == EINTR); - qemu_set_fd_handler(s, NULL, NULL, NULL); - close(s); - - DPRINTF("accepted migration\n"); - - if (c < 0) { - error_report("could not accept migration connection (%s)", - strerror(err)); - return; - } - - f = qemu_fopen_socket(c, "rb"); - if (f == NULL) { - error_report("could not qemu_fopen socket"); - goto out; - } - - process_incoming_migration(f); - return; - -out: - close(c); -} - -void unix_start_incoming_migration(const char *path, Error **errp) -{ - int s; - - s = unix_listen(path, NULL, 0, errp); - if (s < 0) { - return; - } - - qemu_set_fd_handler(s, unix_accept_incoming_migration, NULL, - (void *)(intptr_t)s); -} diff --git a/monitor.c b/monitor.c index 6a32b9bf59..404d594bb3 100644 --- a/monitor.c +++ b/monitor.c @@ -3432,12 +3432,12 @@ static void vm_completion(ReadLineState *rs, const char *str) { size_t len; BlockDriverState *bs; - BdrvNextIterator *it = NULL; + BdrvNextIterator it; len = strlen(str); readline_set_completion_index(rs, len); - while ((it = bdrv_next(it, &bs))) { + for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { SnapshotInfoList *snapshots, *snapshot; AioContext *ctx = 
bdrv_get_aio_context(bs); bool ok = false; diff --git a/qapi-schema.json b/qapi-schema.json index 9a322d1836..8483bdfcce 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -484,6 +484,10 @@ # throttled during auto-converge. This is only present when auto-converge # has started throttling guest cpus. (Since 2.7) # +# @error-desc: #optional the human readable error description string, when +# @status is 'failed'. Clients should not attempt to parse the +# error strings. (Since 2.6) +# # Since: 0.14.0 ## { 'struct': 'MigrationInfo', @@ -494,7 +498,8 @@ '*expected-downtime': 'int', '*downtime': 'int', '*setup-time': 'int', - '*cpu-throttle-percentage': 'int'} } + '*cpu-throttle-percentage': 'int', + '*error-desc': 'str'} } ## # @query-migrate @@ -612,11 +617,28 @@ # @cpu-throttle-increment: throttle percentage increase each time # auto-converge detects that migration is not making # progress. The default value is 10. (Since 2.7) +# +# @tls-creds: ID of the 'tls-creds' object that provides credentials for +# establishing a TLS connection over the migration data channel. +# On the outgoing side of the migration, the credentials must +# be for a 'client' endpoint, while for the incoming side the +# credentials must be for a 'server' endpoint. Setting this +# will enable TLS for all migrations. The default is unset, +# resulting in unsecured migration at the QEMU level. (Since 2.7) +# +# @tls-hostname: hostname of the target host for the migration. This is +# required when using x509 based TLS credentials and the +# migration URI does not already include a hostname. For +# example if using fd: or exec: based migration, the +# hostname must be provided so that the server's x509 +# certificate identity canbe validated. 
(Since 2.7) +# # Since: 2.4 ## { 'enum': 'MigrationParameter', 'data': ['compress-level', 'compress-threads', 'decompress-threads', - 'cpu-throttle-initial', 'cpu-throttle-increment'] } + 'cpu-throttle-initial', 'cpu-throttle-increment', + 'tls-creds', 'tls-hostname'] } # # @migrate-set-parameters @@ -636,6 +658,22 @@ # @cpu-throttle-increment: throttle percentage increase each time # auto-converge detects that migration is not making # progress. The default value is 10. (Since 2.7) +# +# @tls-creds: ID of the 'tls-creds' object that provides credentials for +# establishing a TLS connection over the migration data channel. +# On the outgoing side of the migration, the credentials must +# be for a 'client' endpoint, while for the incoming side the +# credentials must be for a 'server' endpoint. Setting this +# will enable TLS for all migrations. The default is unset, +# resulting in unsecured migration at the QEMU level. (Since 2.7) +# +# @tls-hostname: hostname of the target host for the migration. This is +# required when using x509 based TLS credentials and the +# migration URI does not already include a hostname. For +# example if using fd: or exec: based migration, the +# hostname must be provided so that the server's x509 +# certificate identity canbe validated. (Since 2.7) +# # Since: 2.4 ## { 'command': 'migrate-set-parameters', @@ -643,7 +681,9 @@ '*compress-threads': 'int', '*decompress-threads': 'int', '*cpu-throttle-initial': 'int', - '*cpu-throttle-increment': 'int'} } + '*cpu-throttle-increment': 'int', + '*tls-creds': 'str', + '*tls-hostname': 'str'} } # # @MigrationParameters @@ -662,6 +702,21 @@ # auto-converge detects that migration is not making # progress. The default value is 10. (Since 2.7) # +# @tls-creds: ID of the 'tls-creds' object that provides credentials for +# establishing a TLS connection over the migration data channel. 
+# On the outgoing side of the migration, the credentials must +# be for a 'client' endpoint, while for the incoming side the +# credentials must be for a 'server' endpoint. Setting this +# will enable TLS for all migrations. The default is unset, +# resulting in unsecured migration at the QEMU level. (Since 2.6) +# +# @tls-hostname: hostname of the target host for the migration. This is +# required when using x509 based TLS credentials and the +# migration URI does not already include a hostname. For +# example if using fd: or exec: based migration, the +# hostname must be provided so that the server's x509 +# certificate identity canbe validated. (Since 2.6) +# # Since: 2.4 ## { 'struct': 'MigrationParameters', @@ -669,7 +724,9 @@ 'compress-threads': 'int', 'decompress-threads': 'int', 'cpu-throttle-initial': 'int', - 'cpu-throttle-increment': 'int'} } + 'cpu-throttle-increment': 'int', + 'tls-creds': 'str', + 'tls-hostname': 'str'} } ## # @query-migrate-parameters # diff --git a/qemu-img.c b/qemu-img.c index 7ed8ef21cb..4b56ad36aa 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -775,7 +775,7 @@ static void common_block_job_cb(void *opaque, int ret) static void run_block_job(BlockJob *job, Error **errp) { - AioContext *aio_context = bdrv_get_aio_context(job->bs); + AioContext *aio_context = blk_get_aio_context(job->blk); do { aio_poll(aio_context, true); @@ -1606,8 +1606,8 @@ static int convert_write(ImgConvertState *s, int64_t sector_num, int nb_sectors, if (s->has_zero_init) { break; } - ret = blk_write_zeroes(s->target, sector_num << BDRV_SECTOR_BITS, - n << BDRV_SECTOR_BITS, 0); + ret = blk_pwrite_zeroes(s->target, sector_num << BDRV_SECTOR_BITS, + n << BDRV_SECTOR_BITS, 0); if (ret < 0) { return ret; } diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c index e766791ffc..09e879f872 100644 --- a/qemu-io-cmds.c +++ b/qemu-io-cmds.c @@ -451,12 +451,12 @@ typedef struct { bool done; } CoWriteZeroes; -static void coroutine_fn co_write_zeroes_entry(void *opaque) +static void 
coroutine_fn co_pwrite_zeroes_entry(void *opaque) { CoWriteZeroes *data = opaque; - data->ret = blk_co_write_zeroes(data->blk, data->offset, data->count, - data->flags); + data->ret = blk_co_pwrite_zeroes(data->blk, data->offset, data->count, + data->flags); data->done = true; if (data->ret < 0) { *data->total = data->ret; @@ -466,8 +466,8 @@ static void coroutine_fn co_write_zeroes_entry(void *opaque) *data->total = data->count; } -static int do_co_write_zeroes(BlockBackend *blk, int64_t offset, int64_t count, - int flags, int64_t *total) +static int do_co_pwrite_zeroes(BlockBackend *blk, int64_t offset, + int64_t count, int flags, int64_t *total) { Coroutine *co; CoWriteZeroes data = { @@ -483,7 +483,7 @@ static int do_co_write_zeroes(BlockBackend *blk, int64_t offset, int64_t count, return -ERANGE; } - co = qemu_coroutine_create(co_write_zeroes_entry); + co = qemu_coroutine_create(co_pwrite_zeroes_entry); qemu_coroutine_enter(co, &data); while (!data.done) { aio_poll(blk_get_aio_context(blk), true); @@ -901,7 +901,7 @@ static void write_help(void) " -C, -- report statistics in a machine parsable format\n" " -q, -- quiet mode, do not show I/O statistics\n" " -u, -- with -z, allow unmapping\n" -" -z, -- write zeroes using blk_co_write_zeroes\n" +" -z, -- write zeroes using blk_co_pwrite_zeroes\n" "\n"); } @@ -1033,7 +1033,7 @@ static int write_f(BlockBackend *blk, int argc, char **argv) if (bflag) { cnt = do_save_vmstate(blk, buf, offset, count, &total); } else if (zflag) { - cnt = do_co_write_zeroes(blk, offset, count, flags, &total); + cnt = do_co_pwrite_zeroes(blk, offset, count, flags, &total); } else if (cflag) { cnt = do_write_compressed(blk, buf, offset, count, &total); } else { @@ -1376,7 +1376,7 @@ static void aio_write_help(void) " -i, -- treat request as invalid, for exercising stats\n" " -q, -- quiet mode, do not show I/O statistics\n" " -u, -- with -z, allow unmapping\n" -" -z, -- write zeroes using blk_aio_write_zeroes\n" +" -z, -- write zeroes using 
blk_aio_pwrite_zeroes\n" "\n"); } @@ -1475,8 +1475,8 @@ static int aio_write_f(BlockBackend *blk, int argc, char **argv) } ctx->qiov.size = count; - blk_aio_write_zeroes(blk, ctx->offset, count, flags, aio_write_done, - ctx); + blk_aio_pwrite_zeroes(blk, ctx->offset, count, flags, aio_write_done, + ctx); } else { nr_iov = argc - optind; ctx->buf = create_iovec(blk, &ctx->qiov, &argv[optind], nr_iov, diff --git a/qmp.c b/qmp.c index 8f8ae3a79d..3165f8726b 100644 --- a/qmp.c +++ b/qmp.c @@ -181,7 +181,7 @@ void qmp_cont(Error **errp) Error *local_err = NULL; BlockBackend *blk; BlockDriverState *bs; - BdrvNextIterator *it; + BdrvNextIterator it; /* if there is a dump in background, we should wait until the dump * finished */ @@ -201,8 +201,7 @@ void qmp_cont(Error **errp) blk_iostatus_reset(blk); } - it = NULL; - while ((it = bdrv_next(it, &bs))) { + for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { bdrv_add_key(bs, NULL, &local_err); if (local_err) { error_propagate(errp, local_err); diff --git a/target-ppc/kvm_ppc.h b/target-ppc/kvm_ppc.h index fc7931227d..3b2090e42e 100644 --- a/target-ppc/kvm_ppc.h +++ b/target-ppc/kvm_ppc.h @@ -163,7 +163,7 @@ static inline bool kvmppc_spapr_use_multitce(void) static inline void *kvmppc_create_spapr_tce(uint32_t liobn, uint32_t window_size, int *fd, - bool vfio_accel) + bool need_vfio) { return NULL; } diff --git a/target-ppc/mmu-hash64.c b/target-ppc/mmu-hash64.c index 04e6932fa0..17e24800cb 100644 --- a/target-ppc/mmu-hash64.c +++ b/target-ppc/mmu-hash64.c @@ -284,8 +284,6 @@ void ppc_hash64_set_external_hpt(PowerPCCPU *cpu, void *hpt, int shift, CPUPPCState *env = &cpu->env; Error *local_err = NULL; - cpu_synchronize_state(CPU(cpu)); - if (hpt) { env->external_htab = hpt; } else { diff --git a/target-ppc/translate.c b/target-ppc/translate.c index 745f4de98f..f5ceae5900 100644 --- a/target-ppc/translate.c +++ b/target-ppc/translate.c @@ -756,27 +756,20 @@ static void gen_cmpli(DisasContext *ctx) /* isel (PowerPC 2.03 
specification) */ static void gen_isel(DisasContext *ctx) { - TCGLabel *l1, *l2; uint32_t bi = rC(ctx->opcode); - uint32_t mask; - TCGv_i32 t0; + uint32_t mask = 0x08 >> (bi & 0x03); + TCGv t0 = tcg_temp_new(); + TCGv zr; - l1 = gen_new_label(); - l2 = gen_new_label(); + tcg_gen_extu_i32_tl(t0, cpu_crf[bi >> 2]); + tcg_gen_andi_tl(t0, t0, mask); - mask = 0x08 >> (bi & 0x03); - t0 = tcg_temp_new_i32(); - tcg_gen_andi_i32(t0, cpu_crf[bi >> 2], mask); - tcg_gen_brcondi_i32(TCG_COND_EQ, t0, 0, l1); - if (rA(ctx->opcode) == 0) - tcg_gen_movi_tl(cpu_gpr[rD(ctx->opcode)], 0); - else - tcg_gen_mov_tl(cpu_gpr[rD(ctx->opcode)], cpu_gpr[rA(ctx->opcode)]); - tcg_gen_br(l2); - gen_set_label(l1); - tcg_gen_mov_tl(cpu_gpr[rD(ctx->opcode)], cpu_gpr[rB(ctx->opcode)]); - gen_set_label(l2); - tcg_temp_free_i32(t0); + zr = tcg_const_tl(0); + tcg_gen_movcond_tl(TCG_COND_NE, cpu_gpr[rD(ctx->opcode)], t0, zr, + rA(ctx->opcode) ? cpu_gpr[rA(ctx->opcode)] : zr, + cpu_gpr[rB(ctx->opcode)]); + tcg_temp_free(zr); + tcg_temp_free(t0); } /* cmpb: PowerPC 2.05 specification */ @@ -1617,141 +1610,109 @@ static void gen_cntlzd(DisasContext *ctx) /* rlwimi & rlwimi. 
*/ static void gen_rlwimi(DisasContext *ctx) { - uint32_t mb, me, sh; + TCGv t_ra = cpu_gpr[rA(ctx->opcode)]; + TCGv t_rs = cpu_gpr[rS(ctx->opcode)]; + uint32_t sh = SH(ctx->opcode); + uint32_t mb = MB(ctx->opcode); + uint32_t me = ME(ctx->opcode); - mb = MB(ctx->opcode); - me = ME(ctx->opcode); - sh = SH(ctx->opcode); - if (likely(sh == (31-me) && mb <= me)) { - tcg_gen_deposit_tl(cpu_gpr[rA(ctx->opcode)], cpu_gpr[rA(ctx->opcode)], - cpu_gpr[rS(ctx->opcode)], sh, me - mb + 1); + if (sh == (31-me) && mb <= me) { + tcg_gen_deposit_tl(t_ra, t_ra, t_rs, sh, me - mb + 1); } else { target_ulong mask; + TCGv_i32 t0; TCGv t1; - TCGv t0 = tcg_temp_new(); -#if defined(TARGET_PPC64) - tcg_gen_deposit_i64(t0, cpu_gpr[rS(ctx->opcode)], - cpu_gpr[rS(ctx->opcode)], 32, 32); - tcg_gen_rotli_i64(t0, t0, sh); -#else - tcg_gen_rotli_i32(t0, cpu_gpr[rS(ctx->opcode)], sh); -#endif + #if defined(TARGET_PPC64) mb += 32; me += 32; #endif mask = MASK(mb, me); + + t0 = tcg_temp_new_i32(); t1 = tcg_temp_new(); - tcg_gen_andi_tl(t0, t0, mask); - tcg_gen_andi_tl(t1, cpu_gpr[rA(ctx->opcode)], ~mask); - tcg_gen_or_tl(cpu_gpr[rA(ctx->opcode)], t0, t1); - tcg_temp_free(t0); + tcg_gen_trunc_tl_i32(t0, t_rs); + tcg_gen_rotli_i32(t0, t0, sh); + tcg_gen_extu_i32_tl(t1, t0); + tcg_temp_free_i32(t0); + + tcg_gen_andi_tl(t1, t1, mask); + tcg_gen_andi_tl(t_ra, t_ra, ~mask); + tcg_gen_or_tl(t_ra, t_ra, t1); tcg_temp_free(t1); } - if (unlikely(Rc(ctx->opcode) != 0)) - gen_set_Rc0(ctx, cpu_gpr[rA(ctx->opcode)]); + if (unlikely(Rc(ctx->opcode) != 0)) { + gen_set_Rc0(ctx, t_ra); + } } /* rlwinm & rlwinm. 
*/ static void gen_rlwinm(DisasContext *ctx) { - uint32_t mb, me, sh; - - sh = SH(ctx->opcode); - mb = MB(ctx->opcode); - me = ME(ctx->opcode); + TCGv t_ra = cpu_gpr[rA(ctx->opcode)]; + TCGv t_rs = cpu_gpr[rS(ctx->opcode)]; + uint32_t sh = SH(ctx->opcode); + uint32_t mb = MB(ctx->opcode); + uint32_t me = ME(ctx->opcode); - if (likely(mb == 0 && me == (31 - sh))) { - if (likely(sh == 0)) { - tcg_gen_ext32u_tl(cpu_gpr[rA(ctx->opcode)], cpu_gpr[rS(ctx->opcode)]); - } else { - TCGv t0 = tcg_temp_new(); - tcg_gen_ext32u_tl(t0, cpu_gpr[rS(ctx->opcode)]); - tcg_gen_shli_tl(t0, t0, sh); - tcg_gen_ext32u_tl(cpu_gpr[rA(ctx->opcode)], t0); - tcg_temp_free(t0); - } - } else if (likely(sh != 0 && me == 31 && sh == (32 - mb))) { - TCGv t0 = tcg_temp_new(); - tcg_gen_ext32u_tl(t0, cpu_gpr[rS(ctx->opcode)]); - tcg_gen_shri_tl(t0, t0, mb); - tcg_gen_ext32u_tl(cpu_gpr[rA(ctx->opcode)], t0); - tcg_temp_free(t0); - } else if (likely(mb == 0 && me == 31)) { - TCGv_i32 t0 = tcg_temp_new_i32(); - tcg_gen_trunc_tl_i32(t0, cpu_gpr[rS(ctx->opcode)]); - tcg_gen_rotli_i32(t0, t0, sh); - tcg_gen_extu_i32_tl(cpu_gpr[rA(ctx->opcode)], t0); - tcg_temp_free_i32(t0); + if (mb == 0 && me == (31 - sh)) { + tcg_gen_shli_tl(t_ra, t_rs, sh); + tcg_gen_ext32u_tl(t_ra, t_ra); + } else if (sh != 0 && me == 31 && sh == (32 - mb)) { + tcg_gen_ext32u_tl(t_ra, t_rs); + tcg_gen_shri_tl(t_ra, t_ra, mb); } else { - TCGv t0 = tcg_temp_new(); -#if defined(TARGET_PPC64) - tcg_gen_deposit_i64(t0, cpu_gpr[rS(ctx->opcode)], - cpu_gpr[rS(ctx->opcode)], 32, 32); - tcg_gen_rotli_i64(t0, t0, sh); -#else - tcg_gen_rotli_i32(t0, cpu_gpr[rS(ctx->opcode)], sh); -#endif #if defined(TARGET_PPC64) mb += 32; me += 32; #endif - tcg_gen_andi_tl(cpu_gpr[rA(ctx->opcode)], t0, MASK(mb, me)); - tcg_temp_free(t0); + if (sh == 0) { + tcg_gen_andi_tl(t_ra, t_rs, MASK(mb, me)); + } else { + TCGv_i32 t0 = tcg_temp_new_i32(); + + tcg_gen_trunc_tl_i32(t0, t_rs); + tcg_gen_rotli_i32(t0, t0, sh); + tcg_gen_andi_i32(t0, t0, MASK(mb, me)); + 
tcg_gen_extu_i32_tl(t_ra, t0); + tcg_temp_free_i32(t0); + } + } + if (unlikely(Rc(ctx->opcode) != 0)) { + gen_set_Rc0(ctx, t_ra); } - if (unlikely(Rc(ctx->opcode) != 0)) - gen_set_Rc0(ctx, cpu_gpr[rA(ctx->opcode)]); } /* rlwnm & rlwnm. */ static void gen_rlwnm(DisasContext *ctx) { - uint32_t mb, me; - mb = MB(ctx->opcode); - me = ME(ctx->opcode); + TCGv t_ra = cpu_gpr[rA(ctx->opcode)]; + TCGv t_rs = cpu_gpr[rS(ctx->opcode)]; + TCGv t_rb = cpu_gpr[rB(ctx->opcode)]; + uint32_t mb = MB(ctx->opcode); + uint32_t me = ME(ctx->opcode); + TCGv_i32 t0, t1; - if (likely(mb == 0 && me == 31)) { - TCGv_i32 t0, t1; - t0 = tcg_temp_new_i32(); - t1 = tcg_temp_new_i32(); - tcg_gen_trunc_tl_i32(t0, cpu_gpr[rB(ctx->opcode)]); - tcg_gen_trunc_tl_i32(t1, cpu_gpr[rS(ctx->opcode)]); - tcg_gen_andi_i32(t0, t0, 0x1f); - tcg_gen_rotl_i32(t1, t1, t0); - tcg_gen_extu_i32_tl(cpu_gpr[rA(ctx->opcode)], t1); - tcg_temp_free_i32(t0); - tcg_temp_free_i32(t1); - } else { - TCGv t0; #if defined(TARGET_PPC64) - TCGv t1; + mb += 32; + me += 32; #endif - t0 = tcg_temp_new(); - tcg_gen_andi_tl(t0, cpu_gpr[rB(ctx->opcode)], 0x1f); -#if defined(TARGET_PPC64) - t1 = tcg_temp_new_i64(); - tcg_gen_deposit_i64(t1, cpu_gpr[rS(ctx->opcode)], - cpu_gpr[rS(ctx->opcode)], 32, 32); - tcg_gen_rotl_i64(t0, t1, t0); - tcg_temp_free_i64(t1); -#else - tcg_gen_rotl_i32(t0, cpu_gpr[rS(ctx->opcode)], t0); -#endif - if (unlikely(mb != 0 || me != 31)) { -#if defined(TARGET_PPC64) - mb += 32; - me += 32; -#endif - tcg_gen_andi_tl(cpu_gpr[rA(ctx->opcode)], t0, MASK(mb, me)); - } else { - tcg_gen_andi_tl(t0, t0, MASK(32, 63)); - tcg_gen_mov_tl(cpu_gpr[rA(ctx->opcode)], t0); - } - tcg_temp_free(t0); + t0 = tcg_temp_new_i32(); + t1 = tcg_temp_new_i32(); + tcg_gen_trunc_tl_i32(t0, t_rb); + tcg_gen_trunc_tl_i32(t1, t_rs); + tcg_gen_andi_i32(t0, t0, 0x1f); + tcg_gen_rotl_i32(t1, t1, t0); + tcg_temp_free_i32(t0); + + tcg_gen_andi_i32(t1, t1, MASK(mb, me)); + tcg_gen_extu_i32_tl(t_ra, t1); + tcg_temp_free_i32(t1); + + if 
(unlikely(Rc(ctx->opcode) != 0)) { + gen_set_Rc0(ctx, t_ra); } - if (unlikely(Rc(ctx->opcode) != 0)) - gen_set_Rc0(ctx, cpu_gpr[rA(ctx->opcode)]); } #if defined(TARGET_PPC64) @@ -1786,26 +1747,24 @@ static void glue(gen_, name##3)(DisasContext *ctx) \ gen_##name(ctx, 1, 1); \ } -static inline void gen_rldinm(DisasContext *ctx, uint32_t mb, uint32_t me, - uint32_t sh) +static void gen_rldinm(DisasContext *ctx, int mb, int me, int sh) { - if (likely(sh != 0 && mb == 0 && me == (63 - sh))) { - tcg_gen_shli_tl(cpu_gpr[rA(ctx->opcode)], cpu_gpr[rS(ctx->opcode)], sh); - } else if (likely(sh != 0 && me == 63 && sh == (64 - mb))) { - tcg_gen_shri_tl(cpu_gpr[rA(ctx->opcode)], cpu_gpr[rS(ctx->opcode)], mb); + TCGv t_ra = cpu_gpr[rA(ctx->opcode)]; + TCGv t_rs = cpu_gpr[rS(ctx->opcode)]; + + if (sh != 0 && mb == 0 && me == (63 - sh)) { + tcg_gen_shli_tl(t_ra, t_rs, sh); + } else if (sh != 0 && me == 63 && sh == (64 - mb)) { + tcg_gen_shri_tl(t_ra, t_rs, mb); } else { - TCGv t0 = tcg_temp_new(); - tcg_gen_rotli_tl(t0, cpu_gpr[rS(ctx->opcode)], sh); - if (likely(mb == 0 && me == 63)) { - tcg_gen_mov_tl(cpu_gpr[rA(ctx->opcode)], t0); - } else { - tcg_gen_andi_tl(cpu_gpr[rA(ctx->opcode)], t0, MASK(mb, me)); - } - tcg_temp_free(t0); + tcg_gen_rotli_tl(t_ra, t_rs, sh); + tcg_gen_andi_tl(t_ra, t_ra, MASK(mb, me)); + } + if (unlikely(Rc(ctx->opcode) != 0)) { + gen_set_Rc0(ctx, t_ra); } - if (unlikely(Rc(ctx->opcode) != 0)) - gen_set_Rc0(ctx, cpu_gpr[rA(ctx->opcode)]); } + /* rldicl - rldicl. */ static inline void gen_rldicl(DisasContext *ctx, int mbn, int shn) { @@ -1816,6 +1775,7 @@ static inline void gen_rldicl(DisasContext *ctx, int mbn, int shn) gen_rldinm(ctx, mb, 63, sh); } GEN_PPC64_R4(rldicl, 0x1E, 0x00); + /* rldicr - rldicr. */ static inline void gen_rldicr(DisasContext *ctx, int men, int shn) { @@ -1826,6 +1786,7 @@ static inline void gen_rldicr(DisasContext *ctx, int men, int shn) gen_rldinm(ctx, 0, me, sh); } GEN_PPC64_R4(rldicr, 0x1E, 0x02); + /* rldic - rldic. 
*/ static inline void gen_rldic(DisasContext *ctx, int mbn, int shn) { @@ -1837,21 +1798,22 @@ static inline void gen_rldic(DisasContext *ctx, int mbn, int shn) } GEN_PPC64_R4(rldic, 0x1E, 0x04); -static inline void gen_rldnm(DisasContext *ctx, uint32_t mb, uint32_t me) +static void gen_rldnm(DisasContext *ctx, int mb, int me) { + TCGv t_ra = cpu_gpr[rA(ctx->opcode)]; + TCGv t_rs = cpu_gpr[rS(ctx->opcode)]; + TCGv t_rb = cpu_gpr[rB(ctx->opcode)]; TCGv t0; t0 = tcg_temp_new(); - tcg_gen_andi_tl(t0, cpu_gpr[rB(ctx->opcode)], 0x3f); - tcg_gen_rotl_tl(t0, cpu_gpr[rS(ctx->opcode)], t0); - if (unlikely(mb != 0 || me != 63)) { - tcg_gen_andi_tl(cpu_gpr[rA(ctx->opcode)], t0, MASK(mb, me)); - } else { - tcg_gen_mov_tl(cpu_gpr[rA(ctx->opcode)], t0); - } + tcg_gen_andi_tl(t0, t_rb, 0x3f); + tcg_gen_rotl_tl(t_ra, t_rs, t0); tcg_temp_free(t0); - if (unlikely(Rc(ctx->opcode) != 0)) - gen_set_Rc0(ctx, cpu_gpr[rA(ctx->opcode)]); + + tcg_gen_andi_tl(t_ra, t_ra, MASK(mb, me)); + if (unlikely(Rc(ctx->opcode) != 0)) { + gen_set_Rc0(ctx, t_ra); + } } /* rldcl - rldcl. */ @@ -1863,6 +1825,7 @@ static inline void gen_rldcl(DisasContext *ctx, int mbn) gen_rldnm(ctx, mb, 63); } GEN_PPC64_R2(rldcl, 0x1E, 0x08); + /* rldcr - rldcr. */ static inline void gen_rldcr(DisasContext *ctx, int men) { @@ -1872,32 +1835,31 @@ static inline void gen_rldcr(DisasContext *ctx, int men) gen_rldnm(ctx, 0, me); } GEN_PPC64_R2(rldcr, 0x1E, 0x09); + /* rldimi - rldimi. 
*/ -static inline void gen_rldimi(DisasContext *ctx, int mbn, int shn) +static void gen_rldimi(DisasContext *ctx, int mbn, int shn) { - uint32_t sh, mb, me; + TCGv t_ra = cpu_gpr[rA(ctx->opcode)]; + TCGv t_rs = cpu_gpr[rS(ctx->opcode)]; + uint32_t sh = SH(ctx->opcode) | (shn << 5); + uint32_t mb = MB(ctx->opcode) | (mbn << 5); + uint32_t me = 63 - sh; - sh = SH(ctx->opcode) | (shn << 5); - mb = MB(ctx->opcode) | (mbn << 5); - me = 63 - sh; - if (unlikely(sh == 0 && mb == 0)) { - tcg_gen_mov_tl(cpu_gpr[rA(ctx->opcode)], cpu_gpr[rS(ctx->opcode)]); + if (mb <= me) { + tcg_gen_deposit_tl(t_ra, t_ra, t_rs, sh, me - mb + 1); } else { - TCGv t0, t1; - target_ulong mask; + target_ulong mask = MASK(mb, me); + TCGv t1 = tcg_temp_new(); - t0 = tcg_temp_new(); - tcg_gen_rotli_tl(t0, cpu_gpr[rS(ctx->opcode)], sh); - t1 = tcg_temp_new(); - mask = MASK(mb, me); - tcg_gen_andi_tl(t0, t0, mask); - tcg_gen_andi_tl(t1, cpu_gpr[rA(ctx->opcode)], ~mask); - tcg_gen_or_tl(cpu_gpr[rA(ctx->opcode)], t0, t1); - tcg_temp_free(t0); + tcg_gen_rotli_tl(t1, t_rs, sh); + tcg_gen_andi_tl(t1, t1, mask); + tcg_gen_andi_tl(t_ra, t_ra, ~mask); + tcg_gen_or_tl(t_ra, t_ra, t1); tcg_temp_free(t1); } - if (unlikely(Rc(ctx->opcode) != 0)) - gen_set_Rc0(ctx, cpu_gpr[rA(ctx->opcode)]); + if (unlikely(Rc(ctx->opcode) != 0)) { + gen_set_Rc0(ctx, t_ra); + } } GEN_PPC64_R4(rldimi, 0x1E, 0x06); #endif diff --git a/target-ppc/translate_init.c b/target-ppc/translate_init.c index 954195f5e4..a003c1029d 100644 --- a/target-ppc/translate_init.c +++ b/target-ppc/translate_init.c @@ -9231,6 +9231,14 @@ static void ppc_cpu_realizefn(DeviceState *dev, Error **errp) #if !defined(CONFIG_USER_ONLY) cpu->cpu_dt_id = (cs->cpu_index / smp_threads) * max_smt + (cs->cpu_index % smp_threads); + + if (kvm_enabled() && !kvm_vcpu_id_is_valid(cpu->cpu_dt_id)) { + error_setg(errp, "Can't create CPU with id %d in KVM", cpu->cpu_dt_id); + error_append_hint(errp, "Adjust the number of cpus to %d " + "or try to raise the number of threads 
per core\n", + cpu->cpu_dt_id * smp_threads / max_smt); + return; + } #endif if (tcg_enabled()) { diff --git a/tests/Makefile b/tests/Makefile index 1bbd1ca463..4bc041c8c7 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -434,13 +434,14 @@ tests/test-rcu-list$(EXESUF): tests/test-rcu-list.o $(test-util-obj-y) tests/test-qdev-global-props$(EXESUF): tests/test-qdev-global-props.o \ hw/core/qdev.o hw/core/qdev-properties.o hw/core/hotplug.o\ + hw/core/bus.o \ hw/core/irq.o \ hw/core/fw-path-provider.o \ $(test-qapi-obj-y) tests/test-vmstate$(EXESUF): tests/test-vmstate.o \ - migration/vmstate.o migration/qemu-file.o migration/qemu-file-buf.o \ - migration/qemu-file-unix.o migration/qjson.o \ - $(test-qom-obj-y) + migration/vmstate.o migration/qemu-file.o \ + migration/qemu-file-channel.o migration/qjson.o \ + $(test-io-obj-y) tests/test-timed-average$(EXESUF): tests/test-timed-average.o qemu-timer.o \ $(test-util-obj-y) tests/test-base64$(EXESUF): tests/test-base64.o \ diff --git a/tests/qemu-iotests/041 b/tests/qemu-iotests/041 index b1c542f99b..ed1d9d464c 100755 --- a/tests/qemu-iotests/041 +++ b/tests/qemu-iotests/041 @@ -207,33 +207,6 @@ class TestSingleBlockdev(TestSingleDrive): test_image_not_found = None test_small_buffer2 = None -class TestBlockdevAttached(iotests.QMPTestCase): - image_len = 1 * 1024 * 1024 # MB - - def setUp(self): - iotests.create_image(backing_img, self.image_len) - qemu_img('create', '-f', iotests.imgfmt, '-o', 'backing_file=%s' % backing_img, test_img) - qemu_img('create', '-f', iotests.imgfmt, '-o', 'backing_file=%s' % backing_img, target_img) - self.vm = iotests.VM().add_drive(test_img) - self.vm.launch() - - def tearDown(self): - self.vm.shutdown() - os.remove(test_img) - os.remove(target_img) - - def test_blockdev_attached(self): - self.assert_no_active_block_jobs() - args = {'options': - {'driver': iotests.imgfmt, - 'id': 'drive1', - 'file': { 'filename': target_img, 'driver': 'file' } } } - result = self.vm.qmp("blockdev-add", 
**args) - self.assert_qmp(result, 'return', {}) - result = self.vm.qmp('blockdev-mirror', device='drive0', sync='full', - target='drive1') - self.assert_qmp(result, 'error/class', 'GenericError') - class TestSingleDriveZeroLength(TestSingleDrive): image_len = 0 test_small_buffer2 = None diff --git a/tests/qemu-iotests/041.out b/tests/qemu-iotests/041.out index b67d0504a6..b0cadc8245 100644 --- a/tests/qemu-iotests/041.out +++ b/tests/qemu-iotests/041.out @@ -1,5 +1,5 @@ -............................................................................ +........................................................................... ---------------------------------------------------------------------- -Ran 76 tests +Ran 75 tests OK diff --git a/tests/test-blockjob-txn.c b/tests/test-blockjob-txn.c index 55fad9507a..828389bb45 100644 --- a/tests/test-blockjob-txn.c +++ b/tests/test-blockjob-txn.c @@ -15,6 +15,7 @@ #include "qapi/error.h" #include "qemu/main-loop.h" #include "block/blockjob.h" +#include "sysemu/block-backend.h" typedef struct { BlockJob common; @@ -30,7 +31,7 @@ static const BlockJobDriver test_block_job_driver = { static void test_block_job_complete(BlockJob *job, void *opaque) { - BlockDriverState *bs = job->bs; + BlockDriverState *bs = blk_bs(job->blk); int rc = (intptr_t)opaque; if (block_job_is_cancelled(job)) { diff --git a/tests/test-throttle.c b/tests/test-throttle.c index 5ec966c8a4..c02be805f7 100644 --- a/tests/test-throttle.c +++ b/tests/test-throttle.c @@ -578,9 +578,9 @@ static void test_groups(void) BlockBackend *blk1, *blk2, *blk3; BlockBackendPublic *blkp1, *blkp2, *blkp3; - blk1 = blk_new_with_bs(&error_abort); - blk2 = blk_new_with_bs(&error_abort); - blk3 = blk_new_with_bs(&error_abort); + blk1 = blk_new(); + blk2 = blk_new(); + blk3 = blk_new(); blkp1 = blk_get_public(blk1); blkp2 = blk_get_public(blk2); diff --git a/tests/test-vmstate.c b/tests/test-vmstate.c index 713d4443b2..d19b16a60e 100644 --- a/tests/test-vmstate.c +++ 
b/tests/test-vmstate.c @@ -29,6 +29,7 @@ #include "migration/migration.h" #include "migration/vmstate.h" #include "qemu/coroutine.h" +#include "io/channel-file.h" static char temp_file[] = "/tmp/vmst.test.XXXXXX"; static int temp_fd; @@ -44,35 +45,22 @@ void yield_until_fd_readable(int fd) select(fd + 1, &fds, NULL, NULL, NULL); } -/* - * Some tests use 'open_test_file' to work on a real fd, some use - * an in memory file (QEMUSizedBuffer+qemu_bufopen); we could pick one - * but this way we test both. - */ /* Duplicate temp_fd and seek to the beginning of the file */ static QEMUFile *open_test_file(bool write) { int fd = dup(temp_fd); + QIOChannel *ioc; lseek(fd, 0, SEEK_SET); if (write) { g_assert_cmpint(ftruncate(fd, 0), ==, 0); } - return qemu_fdopen(fd, write ? "wb" : "rb"); -} - -/* - * Check that the contents of the memory-buffered file f match - * the given size/data. - */ -static void check_mem_file(QEMUFile *f, void *data, size_t size) -{ - uint8_t *result = g_malloc(size); - const QEMUSizedBuffer *qsb = qemu_buf_get(f); - g_assert_cmpint(qsb_get_length(qsb), ==, size); - g_assert_cmpint(qsb_get_buffer(qsb, 0, size, result), ==, size); - g_assert_cmpint(memcmp(result, data, size), ==, 0); - g_free(result); + ioc = QIO_CHANNEL(qio_channel_file_new_fd(fd)); + if (write) { + return qemu_fopen_channel_output(ioc); + } else { + return qemu_fopen_channel_input(ioc); + } } #define SUCCESS(val) \ @@ -392,7 +380,7 @@ static const VMStateDescription vmstate_skipping = { static void test_save_noskip(void) { - QEMUFile *fsave = qemu_bufopen("w", NULL); + QEMUFile *fsave = open_test_file(true); TestStruct obj = { .a = 1, .b = 2, .c = 3, .d = 4, .e = 5, .f = 6, .skip_c_e = false }; vmstate_save_state(fsave, &vmstate_skipping, &obj, NULL); @@ -406,13 +394,14 @@ static void test_save_noskip(void) 0, 0, 0, 5, /* e */ 0, 0, 0, 0, 0, 0, 0, 6, /* f */ }; - check_mem_file(fsave, expected, sizeof(expected)); + qemu_fclose(fsave); + compare_vmstate(expected, sizeof(expected)); } 
static void test_save_skip(void) { - QEMUFile *fsave = qemu_bufopen("w", NULL); + QEMUFile *fsave = open_test_file(true); TestStruct obj = { .a = 1, .b = 2, .c = 3, .d = 4, .e = 5, .f = 6, .skip_c_e = true }; vmstate_save_state(fsave, &vmstate_skipping, &obj, NULL); @@ -424,13 +413,14 @@ static void test_save_skip(void) 0, 0, 0, 0, 0, 0, 0, 4, /* d */ 0, 0, 0, 0, 0, 0, 0, 6, /* f */ }; - check_mem_file(fsave, expected, sizeof(expected)); qemu_fclose(fsave); + compare_vmstate(expected, sizeof(expected)); } static void test_load_noskip(void) { + QEMUFile *fsave = open_test_file(true); uint8_t buf[] = { 0, 0, 0, 10, /* a */ 0, 0, 0, 20, /* b */ @@ -440,10 +430,10 @@ static void test_load_noskip(void) 0, 0, 0, 0, 0, 0, 0, 60, /* f */ QEMU_VM_EOF, /* just to ensure we won't get EOF reported prematurely */ }; + qemu_put_buffer(fsave, buf, sizeof(buf)); + qemu_fclose(fsave); - QEMUSizedBuffer *qsb = qsb_create(buf, sizeof(buf)); - g_assert(qsb); - QEMUFile *loading = qemu_bufopen("r", qsb); + QEMUFile *loading = open_test_file(false); TestStruct obj = { .skip_c_e = false }; vmstate_load_state(loading, &vmstate_skipping, &obj, 2); g_assert(!qemu_file_get_error(loading)); @@ -454,11 +444,11 @@ static void test_load_noskip(void) g_assert_cmpint(obj.e, ==, 50); g_assert_cmpint(obj.f, ==, 60); qemu_fclose(loading); - qsb_free(qsb); } static void test_load_skip(void) { + QEMUFile *fsave = open_test_file(true); uint8_t buf[] = { 0, 0, 0, 10, /* a */ 0, 0, 0, 20, /* b */ @@ -466,10 +456,10 @@ static void test_load_skip(void) 0, 0, 0, 0, 0, 0, 0, 60, /* f */ QEMU_VM_EOF, /* just to ensure we won't get EOF reported prematurely */ }; + qemu_put_buffer(fsave, buf, sizeof(buf)); + qemu_fclose(fsave); - QEMUSizedBuffer *qsb = qsb_create(buf, sizeof(buf)); - g_assert(qsb); - QEMUFile *loading = qemu_bufopen("r", qsb); + QEMUFile *loading = open_test_file(false); TestStruct obj = { .skip_c_e = true, .c = 300, .e = 500 }; vmstate_load_state(loading, &vmstate_skipping, &obj, 2); 
g_assert(!qemu_file_get_error(loading)); @@ -480,13 +470,14 @@ static void test_load_skip(void) g_assert_cmpint(obj.e, ==, 500); g_assert_cmpint(obj.f, ==, 60); qemu_fclose(loading); - qsb_free(qsb); } int main(int argc, char **argv) { temp_fd = mkstemp(temp_file); + module_call_init(MODULE_INIT_QOM); + g_test_init(&argc, &argv, NULL); g_test_add_func("/vmstate/simple/primitive", test_simple_primitive); g_test_add_func("/vmstate/versioned/load/v1", test_load_v1); diff --git a/trace-events b/trace-events index b53c3541a3..b27d1dab65 100644 --- a/trace-events +++ b/trace-events @@ -61,6 +61,10 @@ virtio_console_chr_event(unsigned int port, int event) "port %u, event %d" bdrv_open_common(void *bs, const char *filename, int flags, const char *format_name) "bs %p filename \"%s\" flags %#x format_name \"%s\"" bdrv_lock_medium(void *bs, bool locked) "bs %p locked %d" +# block/block-backend.c +blk_co_preadv(void *blk, void *bs, int64_t offset, unsigned int bytes, int flags) "blk %p bs %p offset %"PRId64" bytes %u flags %x" +blk_co_pwritev(void *blk, void *bs, int64_t offset, unsigned int bytes, int flags) "blk %p bs %p offset %"PRId64" bytes %u flags %x" + # block/io.c bdrv_aio_discard(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p" bdrv_aio_flush(void *bs, void *opaque) "bs %p opaque %p" @@ -68,8 +72,6 @@ bdrv_aio_readv(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs % bdrv_aio_writev(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p" bdrv_aio_write_zeroes(void *bs, int64_t sector_num, int nb_sectors, int flags, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d flags %#x opaque %p" bdrv_co_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d" -bdrv_co_copy_on_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d" -bdrv_co_readv_no_serialising(void 
*bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d" bdrv_co_writev(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d" bdrv_co_write_zeroes(void *bs, int64_t sector_num, int nb_sector, int flags) "bs %p sector_num %"PRId64" nb_sectors %d flags %#x" bdrv_co_do_copy_on_readv(void *bs, int64_t sector_num, int nb_sectors, int64_t cluster_sector_num, int cluster_nb_sectors) "bs %p sector_num %"PRId64" nb_sectors %d cluster_sector_num %"PRId64" cluster_nb_sectors %d" @@ -1143,7 +1145,7 @@ win_helper_done(uint32_t tl) "tl=%d" win_helper_retry(uint32_t tl) "tl=%d" # dma-helpers.c -dma_blk_io(void *dbs, void *bs, int64_t sector_num, bool to_dev) "dbs=%p bs=%p sector_num=%" PRId64 " to_dev=%d" +dma_blk_io(void *dbs, void *bs, int64_t offset, bool to_dev) "dbs=%p bs=%p offset=%" PRId64 " to_dev=%d" dma_aio_cancel(void *dbs) "dbs=%p" dma_complete(void *dbs, int ret, void *cb) "dbs=%p ret=%d cb=%p" dma_blk_cb(void *dbs, int ret) "dbs=%p ret=%d" @@ -1428,7 +1430,7 @@ spapr_iommu_pci_get(uint64_t liobn, uint64_t ioba, uint64_t ret, uint64_t tce) " spapr_iommu_pci_indirect(uint64_t liobn, uint64_t ioba, uint64_t tce, uint64_t iobaN, uint64_t tceN, uint64_t ret) "liobn=%"PRIx64" ioba=0x%"PRIx64" tcelist=0x%"PRIx64" iobaN=0x%"PRIx64" tceN=0x%"PRIx64" ret=%"PRId64 spapr_iommu_pci_stuff(uint64_t liobn, uint64_t ioba, uint64_t tce_value, uint64_t npages, uint64_t ret) "liobn=%"PRIx64" ioba=0x%"PRIx64" tcevalue=0x%"PRIx64" npages=%"PRId64" ret=%"PRId64 spapr_iommu_xlate(uint64_t liobn, uint64_t ioba, uint64_t tce, unsigned perm, unsigned pgsize) "liobn=%"PRIx64" 0x%"PRIx64" -> 0x%"PRIx64" perm=%u mask=%x" -spapr_iommu_new_table(uint64_t liobn, void *tcet, void *table, int fd) "liobn=%"PRIx64" tcet=%p table=%p fd=%d" +spapr_iommu_new_table(uint64_t liobn, void *table, int fd) "liobn=%"PRIx64" table=%p fd=%d" # hw/ppc/ppc.c ppc_tb_adjust(uint64_t offs1, uint64_t offs2, int64_t diff, int64_t seconds) "adjusted from 
0x%"PRIx64" to 0x%"PRIx64", diff %"PRId64" (%"PRId64"s)" @@ -1481,7 +1483,7 @@ await_return_path_close_on_source_close(void) "" await_return_path_close_on_source_joining(void) "" migrate_set_state(int new_state) "new state %d" migrate_fd_cleanup(void) "" -migrate_fd_error(void) "" +migrate_fd_error(const char *error_desc) "error=%s" migrate_fd_cancel(void) "" migrate_handle_rp_req_pages(const char *rbname, size_t start, size_t len) "in %s at %zx len %zx" migrate_pending(uint64_t size, uint64_t max, uint64_t post, uint64_t nonpost) "pending size %" PRIu64 " max %" PRIu64 " (post=%" PRIu64 " nonpost=%" PRIu64 ")" @@ -1511,6 +1513,8 @@ migrate_state_too_big(void) "" migrate_transferred(uint64_t tranferred, uint64_t time_spent, double bandwidth, uint64_t size) "transferred %" PRIu64 " time_spent %" PRIu64 " bandwidth %g max_size %" PRId64 process_incoming_migration_co_end(int ret, int ps) "ret=%d postcopy-state=%d" process_incoming_migration_co_postcopy_end_main(void) "" +migration_set_incoming_channel(void *ioc, const char *ioctype) "ioc=%p ioctype=%s" +migration_set_outgoing_channel(void *ioc, const char *ioctype, const char *hostname) "ioc=%p ioctype=%s hostname=%s" # migration/rdma.c qemu_rdma_accept_incoming_migration(void) "" @@ -1595,6 +1599,27 @@ postcopy_ram_incoming_cleanup_entry(void) "" postcopy_ram_incoming_cleanup_exit(void) "" postcopy_ram_incoming_cleanup_join(void) "" +# migration/exec.c +migration_exec_outgoing(const char *cmd) "cmd=%s" +migration_exec_incoming(const char *cmd) "cmd=%s" + +# migration/fd.c +migration_fd_outgoing(int fd) "fd=%d" +migration_fd_incoming(int fd) "fd=%d" + +# migration/socket.c +migration_socket_incoming_accepted(void) "" +migration_socket_outgoing_connected(const char *hostname) "hostname=%s" +migration_socket_outgoing_error(const char *err) "error=%s" + +# migration/tls.c +migration_tls_outgoing_handshake_start(const char *hostname) "hostname=%s" +migration_tls_outgoing_handshake_error(const char *err) "err=%s" 
+migration_tls_outgoing_handshake_complete(void) "" +migration_tls_incoming_handshake_start(void) "" +migration_tls_incoming_handshake_error(const char *err) "err=%s" +migration_tls_incoming_handshake_complete(void) "" + # kvm-all.c kvm_ioctl(int type, void *arg) "type 0x%x, arg %p" kvm_vm_ioctl(int type, void *arg) "type 0x%x, arg %p" @@ -1711,9 +1736,13 @@ vfio_quirk_ati_bonaire_reset_no_smc(const char *name) "%s" vfio_quirk_ati_bonaire_reset_timeout(const char *name) "%s" vfio_quirk_ati_bonaire_reset_done(const char *name) "%s" vfio_quirk_ati_bonaire_reset(const char *name) "%s" +vfio_pci_igd_bar4_write(const char *name, uint32_t index, uint32_t data, uint32_t base) "%s [%03x] %08x -> %08x" +vfio_pci_igd_bdsm_enabled(const char *name, int size) "%s %dMB" +vfio_pci_igd_opregion_enabled(const char *name) "%s" +vfio_pci_igd_host_bridge_enabled(const char *name) "%s" +vfio_pci_igd_lpc_bridge_enabled(const char *name) "%s" - -# hw/vfio/vfio-common.c +# hw/vfio/common.c vfio_region_write(const char *name, int index, uint64_t addr, uint64_t data, unsigned size) " (%s:region%d+0x%"PRIx64", 0x%"PRIx64 ", %d)" vfio_region_read(char *name, int index, uint64_t addr, unsigned size, uint64_t data) " (%s:region%d+0x%"PRIx64", %d) = 0x%"PRIx64 vfio_iommu_map_notify(uint64_t iova_start, uint64_t iova_end) "iommu map @ %"PRIx64" - %"PRIx64 @@ -1732,6 +1761,9 @@ vfio_region_mmap(const char *name, unsigned long offset, unsigned long end) "Reg vfio_region_exit(const char *name, int index) "Device %s, region %d" vfio_region_finalize(const char *name, int index) "Device %s, region %d" vfio_region_mmaps_set_enabled(const char *name, bool enabled) "Region %s mmaps enabled: %d" +vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Device %s region %d: %d sparse mmap entries" +vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]" +vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) 
"%s index %d, %08x/%08x" # hw/vfio/platform.c vfio_platform_base_device_init(char *name, int groupid) "%s belongs to group #%d" diff --git a/util/error.c b/util/error.c index cae2511732..9c40b1f458 100644 --- a/util/error.c +++ b/util/error.c @@ -217,7 +217,7 @@ ErrorClass error_get_class(const Error *err) return err->err_class; } -const char *error_get_pretty(Error *err) +const char *error_get_pretty(const Error *err) { return err->msg; } |