diff options
137 files changed, 5758 insertions, 1964 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index bdd9e5a558..87ddaced59 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -872,6 +872,7 @@ VFIO M: Alex Williamson <alex.williamson@redhat.com> S: Supported F: hw/vfio/* +F: include/hw/vfio/ vhost M: Michael S. Tsirkin <mst@redhat.com> diff --git a/Makefile b/Makefile index 70e3ebcd50..1d076a9d85 100644 --- a/Makefile +++ b/Makefile @@ -238,7 +238,7 @@ qemu-img$(EXESUF): qemu-img.o $(block-obj-y) $(crypto-obj-y) $(io-obj-y) $(qom-o qemu-nbd$(EXESUF): qemu-nbd.o $(block-obj-y) $(crypto-obj-y) $(io-obj-y) $(qom-obj-y) libqemuutil.a libqemustub.a qemu-io$(EXESUF): qemu-io.o $(block-obj-y) $(crypto-obj-y) $(io-obj-y) $(qom-obj-y) libqemuutil.a libqemustub.a -qemu-bridge-helper$(EXESUF): qemu-bridge-helper.o +qemu-bridge-helper$(EXESUF): qemu-bridge-helper.o libqemuutil.a libqemustub.a fsdev/virtfs-proxy-helper$(EXESUF): fsdev/virtfs-proxy-helper.o fsdev/9p-marshal.o fsdev/9p-iov-marshal.o libqemuutil.a libqemustub.a fsdev/virtfs-proxy-helper$(EXESUF): LIBS += -lcap @@ -329,7 +329,7 @@ ifneq ($(EXESUF),) qemu-ga: qemu-ga$(EXESUF) $(QGA_VSS_PROVIDER) $(QEMU_GA_MSI) endif -ivshmem-client$(EXESUF): $(ivshmem-client-obj-y) +ivshmem-client$(EXESUF): $(ivshmem-client-obj-y) libqemuutil.a libqemustub.a $(call LINK, $^) ivshmem-server$(EXESUF): $(ivshmem-server-obj-y) libqemuutil.a libqemustub.a $(call LINK, $^) diff --git a/block.c b/block.c index ba24b8e674..59a18a3a66 100644 --- a/block.c +++ b/block.c @@ -53,23 +53,6 @@ #include <windows.h> #endif -/** - * A BdrvDirtyBitmap can be in three possible states: - * (1) successor is NULL and disabled is false: full r/w mode - * (2) successor is NULL and disabled is true: read only mode ("disabled") - * (3) successor is set: frozen mode. - * A frozen bitmap cannot be renamed, deleted, anonymized, cleared, set, - * or enabled. A frozen bitmap can only abdicate() or reclaim(). - */ -struct BdrvDirtyBitmap { - HBitmap *bitmap; /* Dirty sector bitmap implementation */ - BdrvDirtyBitmap *successor; /* Anonymous child; implies frozen status */ - char *name; /* Optional non-empty unique ID */ - int64_t size; /* Size of the bitmap (Number of sectors) */ - bool disabled; /* Bitmap is read-only */ - QLIST_ENTRY(BdrvDirtyBitmap) list; -}; - #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ struct BdrvStates bdrv_states = QTAILQ_HEAD_INITIALIZER(bdrv_states); @@ -88,9 +71,6 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename, BlockDriverState *parent, const BdrvChildRole *child_role, Error **errp); -static void bdrv_dirty_bitmap_truncate(BlockDriverState *bs); -static void bdrv_release_named_dirty_bitmaps(BlockDriverState *bs); - /* If non-zero, use only whitelisted block drivers */ static int use_bdrv_whitelist; @@ -687,13 +667,19 @@ int bdrv_parse_cache_flags(const char *mode, int *flags) } /* - * Returns the flags that a temporary snapshot should get, based on the - * originally requested flags (the originally requested image will have flags - * like a backing file) + * Returns the options and flags that a temporary snapshot should get, based on + * the originally requested flags (the originally requested image will have + * flags like a backing file) */ -static int bdrv_temp_snapshot_flags(int flags) +static void bdrv_temp_snapshot_options(int *child_flags, QDict *child_options, + int parent_flags, QDict *parent_options) { - return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY; + *child_flags = (parent_flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY; + + /* For temporary files, unconditional cache=unsafe is fine */ + qdict_set_default_str(child_options, BDRV_OPT_CACHE_WB, "on"); + qdict_set_default_str(child_options, BDRV_OPT_CACHE_DIRECT, "off"); + qdict_set_default_str(child_options, BDRV_OPT_CACHE_NO_FLUSH, "on"); } /* @@ -1424,13 +1410,13 @@ done: return c; } -int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp) +static int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, + QDict *snapshot_options, Error **errp) { /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */ char *tmp_filename = g_malloc0(PATH_MAX + 1); int64_t total_size; QemuOpts *opts = NULL; - QDict *snapshot_options; BlockDriverState *bs_snapshot; Error *local_err = NULL; int ret; @@ -1464,8 +1450,7 @@ int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp) goto out; } - /* Prepare a new options QDict for the temporary file */ - snapshot_options = qdict_new(); + /* Prepare options QDict for the temporary file */ qdict_put(snapshot_options, "file.driver", qstring_from_str("file")); qdict_put(snapshot_options, "file.filename", @@ -1477,6 +1462,7 @@ int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp) ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options, flags, &local_err); + snapshot_options = NULL; if (ret < 0) { error_propagate(errp, local_err); goto out; @@ -1485,6 +1471,7 @@ int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp) bdrv_append(bs_snapshot, bs); out: + QDECREF(snapshot_options); g_free(tmp_filename); return ret; } @@ -1516,6 +1503,7 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename, const char *drvname; const char *backing; Error *local_err = NULL; + QDict *snapshot_options = NULL; int snapshot_flags = 0; assert(pbs); @@ -1607,7 +1595,9 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename, flags |= BDRV_O_ALLOW_RDWR; } if (flags & BDRV_O_SNAPSHOT) { - snapshot_flags = bdrv_temp_snapshot_flags(flags); + snapshot_options = qdict_new(); + bdrv_temp_snapshot_options(&snapshot_flags, snapshot_options, + flags, options); bdrv_backing_options(&flags, options, flags, options); } @@ -1709,7 +1699,9 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename, /* For snapshot=on, create a temporary qcow2 overlay. bs points to the * temporary snapshot afterwards. */ if (snapshot_flags) { - ret = bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err); + ret = bdrv_append_temp_snapshot(bs, snapshot_flags, snapshot_options, + &local_err); + snapshot_options = NULL; if (local_err) { goto close_and_fail; } @@ -1721,6 +1713,7 @@ fail: if (file != NULL) { bdrv_unref_child(bs, file); } + QDECREF(snapshot_options); QDECREF(bs->explicit_options); QDECREF(bs->options); QDECREF(options); @@ -1743,6 +1736,7 @@ close_and_fail: } else { bdrv_unref(bs); } + QDECREF(snapshot_options); QDECREF(options); if (local_err) { error_propagate(errp, local_err); @@ -3431,346 +3425,6 @@ void bdrv_lock_medium(BlockDriverState *bs, bool locked) } } -BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, const char *name) -{ - BdrvDirtyBitmap *bm; - - assert(name); - QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) { - if (bm->name && !strcmp(name, bm->name)) { - return bm; - } - } - return NULL; -} - -void bdrv_dirty_bitmap_make_anon(BdrvDirtyBitmap *bitmap) -{ - assert(!bdrv_dirty_bitmap_frozen(bitmap)); - g_free(bitmap->name); - bitmap->name = NULL; -} - -BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, - uint32_t granularity, - const char *name, - Error **errp) -{ - int64_t bitmap_size; - BdrvDirtyBitmap *bitmap; - uint32_t sector_granularity; - - assert((granularity & (granularity - 1)) == 0); - - if (name && bdrv_find_dirty_bitmap(bs, name)) { - error_setg(errp, "Bitmap already exists: %s", name); - return NULL; - } - sector_granularity = granularity >> BDRV_SECTOR_BITS; - assert(sector_granularity); - bitmap_size = bdrv_nb_sectors(bs); - if (bitmap_size < 0) { - error_setg_errno(errp, -bitmap_size, "could not get length of device"); - errno = -bitmap_size; - return NULL; - } - bitmap = g_new0(BdrvDirtyBitmap, 1); - bitmap->bitmap = hbitmap_alloc(bitmap_size, ctz32(sector_granularity)); - bitmap->size = bitmap_size; - bitmap->name = g_strdup(name); - bitmap->disabled = false; - QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list); - return bitmap; -} - -bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap) -{ - return bitmap->successor; -} - -bool bdrv_dirty_bitmap_enabled(BdrvDirtyBitmap *bitmap) -{ - return !(bitmap->disabled || bitmap->successor); -} - -DirtyBitmapStatus bdrv_dirty_bitmap_status(BdrvDirtyBitmap *bitmap) -{ - if (bdrv_dirty_bitmap_frozen(bitmap)) { - return DIRTY_BITMAP_STATUS_FROZEN; - } else if (!bdrv_dirty_bitmap_enabled(bitmap)) { - return DIRTY_BITMAP_STATUS_DISABLED; - } else { - return DIRTY_BITMAP_STATUS_ACTIVE; - } -} - -/** - * Create a successor bitmap destined to replace this bitmap after an operation. - * Requires that the bitmap is not frozen and has no successor. - */ -int bdrv_dirty_bitmap_create_successor(BlockDriverState *bs, - BdrvDirtyBitmap *bitmap, Error **errp) -{ - uint64_t granularity; - BdrvDirtyBitmap *child; - - if (bdrv_dirty_bitmap_frozen(bitmap)) { - error_setg(errp, "Cannot create a successor for a bitmap that is " - "currently frozen"); - return -1; - } - assert(!bitmap->successor); - - /* Create an anonymous successor */ - granularity = bdrv_dirty_bitmap_granularity(bitmap); - child = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp); - if (!child) { - return -1; - } - - /* Successor will be on or off based on our current state. */ - child->disabled = bitmap->disabled; - - /* Install the successor and freeze the parent */ - bitmap->successor = child; - return 0; -} - -/** - * For a bitmap with a successor, yield our name to the successor, - * delete the old bitmap, and return a handle to the new bitmap. - */ -BdrvDirtyBitmap *bdrv_dirty_bitmap_abdicate(BlockDriverState *bs, - BdrvDirtyBitmap *bitmap, - Error **errp) -{ - char *name; - BdrvDirtyBitmap *successor = bitmap->successor; - - if (successor == NULL) { - error_setg(errp, "Cannot relinquish control if " - "there's no successor present"); - return NULL; - } - - name = bitmap->name; - bitmap->name = NULL; - successor->name = name; - bitmap->successor = NULL; - bdrv_release_dirty_bitmap(bs, bitmap); - - return successor; -} - -/** - * In cases of failure where we can no longer safely delete the parent, - * we may wish to re-join the parent and child/successor. - * The merged parent will be un-frozen, but not explicitly re-enabled. - */ -BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap(BlockDriverState *bs, - BdrvDirtyBitmap *parent, - Error **errp) -{ - BdrvDirtyBitmap *successor = parent->successor; - - if (!successor) { - error_setg(errp, "Cannot reclaim a successor when none is present"); - return NULL; - } - - if (!hbitmap_merge(parent->bitmap, successor->bitmap)) { - error_setg(errp, "Merging of parent and successor bitmap failed"); - return NULL; - } - bdrv_release_dirty_bitmap(bs, successor); - parent->successor = NULL; - - return parent; -} - -/** - * Truncates _all_ bitmaps attached to a BDS. - */ -static void bdrv_dirty_bitmap_truncate(BlockDriverState *bs) -{ - BdrvDirtyBitmap *bitmap; - uint64_t size = bdrv_nb_sectors(bs); - - QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) { - assert(!bdrv_dirty_bitmap_frozen(bitmap)); - hbitmap_truncate(bitmap->bitmap, size); - bitmap->size = size; - } -} - -static void bdrv_do_release_matching_dirty_bitmap(BlockDriverState *bs, - BdrvDirtyBitmap *bitmap, - bool only_named) -{ - BdrvDirtyBitmap *bm, *next; - QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) { - if ((!bitmap || bm == bitmap) && (!only_named || bm->name)) { - assert(!bdrv_dirty_bitmap_frozen(bm)); - QLIST_REMOVE(bm, list); - hbitmap_free(bm->bitmap); - g_free(bm->name); - g_free(bm); - - if (bitmap) { - return; - } - } - } -} - -void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap) -{ - bdrv_do_release_matching_dirty_bitmap(bs, bitmap, false); -} - -/** - * Release all named dirty bitmaps attached to a BDS (for use in bdrv_close()). - * There must not be any frozen bitmaps attached. - */ -static void bdrv_release_named_dirty_bitmaps(BlockDriverState *bs) -{ - bdrv_do_release_matching_dirty_bitmap(bs, NULL, true); -} - -void bdrv_disable_dirty_bitmap(BdrvDirtyBitmap *bitmap) -{ - assert(!bdrv_dirty_bitmap_frozen(bitmap)); - bitmap->disabled = true; -} - -void bdrv_enable_dirty_bitmap(BdrvDirtyBitmap *bitmap) -{ - assert(!bdrv_dirty_bitmap_frozen(bitmap)); - bitmap->disabled = false; -} - -BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs) -{ - BdrvDirtyBitmap *bm; - BlockDirtyInfoList *list = NULL; - BlockDirtyInfoList **plist = &list; - - QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) { - BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1); - BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1); - info->count = bdrv_get_dirty_count(bm); - info->granularity = bdrv_dirty_bitmap_granularity(bm); - info->has_name = !!bm->name; - info->name = g_strdup(bm->name); - info->status = bdrv_dirty_bitmap_status(bm); - entry->value = info; - *plist = entry; - plist = &entry->next; - } - - return list; -} - -int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector) -{ - if (bitmap) { - return hbitmap_get(bitmap->bitmap, sector); - } else { - return 0; - } -} - -/** - * Chooses a default granularity based on the existing cluster size, - * but clamped between [4K, 64K]. Defaults to 64K in the case that there - * is no cluster size information available. - */ -uint32_t bdrv_get_default_bitmap_granularity(BlockDriverState *bs) -{ - BlockDriverInfo bdi; - uint32_t granularity; - - if (bdrv_get_info(bs, &bdi) >= 0 && bdi.cluster_size > 0) { - granularity = MAX(4096, bdi.cluster_size); - granularity = MIN(65536, granularity); - } else { - granularity = 65536; - } - - return granularity; -} - -uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap) -{ - return BDRV_SECTOR_SIZE << hbitmap_granularity(bitmap->bitmap); -} - -void bdrv_dirty_iter_init(BdrvDirtyBitmap *bitmap, HBitmapIter *hbi) -{ - hbitmap_iter_init(hbi, bitmap->bitmap, 0); -} - -void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap, - int64_t cur_sector, int nr_sectors) -{ - assert(bdrv_dirty_bitmap_enabled(bitmap)); - hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors); -} - -void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap, - int64_t cur_sector, int nr_sectors) -{ - assert(bdrv_dirty_bitmap_enabled(bitmap)); - hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors); -} - -void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out) -{ - assert(bdrv_dirty_bitmap_enabled(bitmap)); - if (!out) { - hbitmap_reset_all(bitmap->bitmap); - } else { - HBitmap *backup = bitmap->bitmap; - bitmap->bitmap = hbitmap_alloc(bitmap->size, - hbitmap_granularity(backup)); - *out = backup; - } -} - -void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in) -{ - HBitmap *tmp = bitmap->bitmap; - assert(bdrv_dirty_bitmap_enabled(bitmap)); - bitmap->bitmap = in; - hbitmap_free(tmp); -} - -void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, - int nr_sectors) -{ - BdrvDirtyBitmap *bitmap; - QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) { - if (!bdrv_dirty_bitmap_enabled(bitmap)) { - continue; - } - hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors); - } -} - -/** - * Advance an HBitmapIter to an arbitrary offset. - */ -void bdrv_set_dirty_iter(HBitmapIter *hbi, int64_t offset) -{ - assert(hbi->hb); - hbitmap_iter_init(hbi, hbi->hb, offset); -} - -int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap) -{ - return hbitmap_count(bitmap->bitmap); -} - /* Get a reference to bs */ void bdrv_ref(BlockDriverState *bs) { diff --git a/block/Makefile.objs b/block/Makefile.objs index 58ef2ef3f2..cdd865597a 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -20,7 +20,7 @@ block-obj-$(CONFIG_RBD) += rbd.o block-obj-$(CONFIG_GLUSTERFS) += gluster.o block-obj-$(CONFIG_ARCHIPELAGO) += archipelago.o block-obj-$(CONFIG_LIBSSH2) += ssh.o -block-obj-y += accounting.o +block-obj-y += accounting.o dirty-bitmap.o block-obj-y += write-threshold.o common-obj-y += stream.o diff --git a/block/backup.c b/block/backup.c index 0f1b1bc084..ab3e345e92 100644 --- a/block/backup.c +++ b/block/backup.c @@ -20,6 +20,7 @@ #include "qapi/qmp/qerror.h" #include "qemu/ratelimit.h" #include "sysemu/block-backend.h" +#include "qemu/bitmap.h" #define BACKUP_CLUSTER_SIZE_DEFAULT (1 << 16) #define SLICE_TIME 100000000ULL /* ns */ @@ -42,7 +43,7 @@ typedef struct BackupBlockJob { BlockdevOnError on_target_error; CoRwlock flush_rwlock; uint64_t sectors_read; - HBitmap *bitmap; + unsigned long *done_bitmap; int64_t cluster_size; QLIST_HEAD(, CowRequest) inflight_reqs; } BackupBlockJob; @@ -116,7 +117,7 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs, cow_request_begin(&cow_request, job, start, end); for (; start < end; start++) { - if (hbitmap_get(job->bitmap, start)) { + if (test_bit(start, job->done_bitmap)) { trace_backup_do_cow_skip(job, start); continue; /* already copied */ } @@ -167,7 +168,7 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs, goto out; } - hbitmap_set(job->bitmap, start, 1); + set_bit(start, job->done_bitmap); /* Publish progress, guest I/O counts as progress too. Note that the * offset field is an opaque progress value, it is not a disk offset. @@ -399,7 +400,7 @@ static void coroutine_fn backup_run(void *opaque) start = 0; end = DIV_ROUND_UP(job->common.len, job->cluster_size); - job->bitmap = hbitmap_alloc(end, 0); + job->done_bitmap = bitmap_new(end); bdrv_set_enable_write_cache(target, true); if (target->blk) { @@ -480,7 +481,7 @@ static void coroutine_fn backup_run(void *opaque) /* wait until pending backup_do_cow() calls have completed */ qemu_co_rwlock_wrlock(&job->flush_rwlock); qemu_co_rwlock_unlock(&job->flush_rwlock); - hbitmap_free(job->bitmap); + g_free(job->done_bitmap); if (target->blk) { blk_iostatus_disable(target->blk); diff --git a/block/block-backend.c b/block/block-backend.c index ebdf78a11c..03e71b4368 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -50,6 +50,8 @@ struct BlockBackend { bool iostatus_enabled; BlockDeviceIoStatus iostatus; + bool allow_write_beyond_eof; + NotifierList remove_bs_notifiers, insert_bs_notifiers; }; @@ -579,6 +581,11 @@ void blk_iostatus_set_err(BlockBackend *blk, int error) } } +void blk_set_allow_write_beyond_eof(BlockBackend *blk, bool allow) +{ + blk->allow_write_beyond_eof = allow; +} + static int blk_check_byte_request(BlockBackend *blk, int64_t offset, size_t size) { @@ -592,17 +599,19 @@ static int blk_check_byte_request(BlockBackend *blk, int64_t offset, return -ENOMEDIUM; } - len = blk_getlength(blk); - if (len < 0) { - return len; - } - if (offset < 0) { return -EIO; } - if (offset > len || len - offset < size) { - return -EIO; + if (!blk->allow_write_beyond_eof) { + len = blk_getlength(blk); + if (len < 0) { + return len; + } + + if (offset > len || len - offset < size) { + return -EIO; + } } return 0; diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c new file mode 100644 index 0000000000..556e1d15c4 --- /dev/null +++ b/block/dirty-bitmap.c @@ -0,0 +1,387 @@ +/* + * Block Dirty Bitmap + * + * Copyright (c) 2016 Red Hat. Inc + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu/osdep.h" +#include "config-host.h" +#include "qemu-common.h" +#include "trace.h" +#include "block/block_int.h" +#include "block/blockjob.h" + +/** + * A BdrvDirtyBitmap can be in three possible states: + * (1) successor is NULL and disabled is false: full r/w mode + * (2) successor is NULL and disabled is true: read only mode ("disabled") + * (3) successor is set: frozen mode. + * A frozen bitmap cannot be renamed, deleted, anonymized, cleared, set, + * or enabled. A frozen bitmap can only abdicate() or reclaim(). + */ +struct BdrvDirtyBitmap { + HBitmap *bitmap; /* Dirty sector bitmap implementation */ + BdrvDirtyBitmap *successor; /* Anonymous child; implies frozen status */ + char *name; /* Optional non-empty unique ID */ + int64_t size; /* Size of the bitmap (Number of sectors) */ + bool disabled; /* Bitmap is read-only */ + QLIST_ENTRY(BdrvDirtyBitmap) list; +}; + +BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, const char *name) +{ + BdrvDirtyBitmap *bm; + + assert(name); + QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) { + if (bm->name && !strcmp(name, bm->name)) { + return bm; + } + } + return NULL; +} + +void bdrv_dirty_bitmap_make_anon(BdrvDirtyBitmap *bitmap) +{ + assert(!bdrv_dirty_bitmap_frozen(bitmap)); + g_free(bitmap->name); + bitmap->name = NULL; +} + +BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, + uint32_t granularity, + const char *name, + Error **errp) +{ + int64_t bitmap_size; + BdrvDirtyBitmap *bitmap; + uint32_t sector_granularity; + + assert((granularity & (granularity - 1)) == 0); + + if (name && bdrv_find_dirty_bitmap(bs, name)) { + error_setg(errp, "Bitmap already exists: %s", name); + return NULL; + } + sector_granularity = granularity >> BDRV_SECTOR_BITS; + assert(sector_granularity); + bitmap_size = bdrv_nb_sectors(bs); + if (bitmap_size < 0) { + error_setg_errno(errp, -bitmap_size, "could not get length of device"); + errno = -bitmap_size; + return NULL; + } + bitmap = g_new0(BdrvDirtyBitmap, 1); + bitmap->bitmap = hbitmap_alloc(bitmap_size, ctz32(sector_granularity)); + bitmap->size = bitmap_size; + bitmap->name = g_strdup(name); + bitmap->disabled = false; + QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list); + return bitmap; +} + +bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap) +{ + return bitmap->successor; +} + +bool bdrv_dirty_bitmap_enabled(BdrvDirtyBitmap *bitmap) +{ + return !(bitmap->disabled || bitmap->successor); +} + +DirtyBitmapStatus bdrv_dirty_bitmap_status(BdrvDirtyBitmap *bitmap) +{ + if (bdrv_dirty_bitmap_frozen(bitmap)) { + return DIRTY_BITMAP_STATUS_FROZEN; + } else if (!bdrv_dirty_bitmap_enabled(bitmap)) { + return DIRTY_BITMAP_STATUS_DISABLED; + } else { + return DIRTY_BITMAP_STATUS_ACTIVE; + } +} + +/** + * Create a successor bitmap destined to replace this bitmap after an operation. + * Requires that the bitmap is not frozen and has no successor. + */ +int bdrv_dirty_bitmap_create_successor(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, Error **errp) +{ + uint64_t granularity; + BdrvDirtyBitmap *child; + + if (bdrv_dirty_bitmap_frozen(bitmap)) { + error_setg(errp, "Cannot create a successor for a bitmap that is " + "currently frozen"); + return -1; + } + assert(!bitmap->successor); + + /* Create an anonymous successor */ + granularity = bdrv_dirty_bitmap_granularity(bitmap); + child = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp); + if (!child) { + return -1; + } + + /* Successor will be on or off based on our current state. */ + child->disabled = bitmap->disabled; + + /* Install the successor and freeze the parent */ + bitmap->successor = child; + return 0; +} + +/** + * For a bitmap with a successor, yield our name to the successor, + * delete the old bitmap, and return a handle to the new bitmap. + */ +BdrvDirtyBitmap *bdrv_dirty_bitmap_abdicate(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, + Error **errp) +{ + char *name; + BdrvDirtyBitmap *successor = bitmap->successor; + + if (successor == NULL) { + error_setg(errp, "Cannot relinquish control if " + "there's no successor present"); + return NULL; + } + + name = bitmap->name; + bitmap->name = NULL; + successor->name = name; + bitmap->successor = NULL; + bdrv_release_dirty_bitmap(bs, bitmap); + + return successor; +} + +/** + * In cases of failure where we can no longer safely delete the parent, + * we may wish to re-join the parent and child/successor. + * The merged parent will be un-frozen, but not explicitly re-enabled. + */ +BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap(BlockDriverState *bs, + BdrvDirtyBitmap *parent, + Error **errp) +{ + BdrvDirtyBitmap *successor = parent->successor; + + if (!successor) { + error_setg(errp, "Cannot reclaim a successor when none is present"); + return NULL; + } + + if (!hbitmap_merge(parent->bitmap, successor->bitmap)) { + error_setg(errp, "Merging of parent and successor bitmap failed"); + return NULL; + } + bdrv_release_dirty_bitmap(bs, successor); + parent->successor = NULL; + + return parent; +} + +/** + * Truncates _all_ bitmaps attached to a BDS. + */ +void bdrv_dirty_bitmap_truncate(BlockDriverState *bs) +{ + BdrvDirtyBitmap *bitmap; + uint64_t size = bdrv_nb_sectors(bs); + + QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) { + assert(!bdrv_dirty_bitmap_frozen(bitmap)); + hbitmap_truncate(bitmap->bitmap, size); + bitmap->size = size; + } +} + +static void bdrv_do_release_matching_dirty_bitmap(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, + bool only_named) +{ + BdrvDirtyBitmap *bm, *next; + QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) { + if ((!bitmap || bm == bitmap) && (!only_named || bm->name)) { + assert(!bdrv_dirty_bitmap_frozen(bm)); + QLIST_REMOVE(bm, list); + hbitmap_free(bm->bitmap); + g_free(bm->name); + g_free(bm); + + if (bitmap) { + return; + } + } + } +} + +void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap) +{ + bdrv_do_release_matching_dirty_bitmap(bs, bitmap, false); +} + +/** + * Release all named dirty bitmaps attached to a BDS (for use in bdrv_close()). + * There must not be any frozen bitmaps attached. + */ +void bdrv_release_named_dirty_bitmaps(BlockDriverState *bs) +{ + bdrv_do_release_matching_dirty_bitmap(bs, NULL, true); +} + +void bdrv_disable_dirty_bitmap(BdrvDirtyBitmap *bitmap) +{ + assert(!bdrv_dirty_bitmap_frozen(bitmap)); + bitmap->disabled = true; +} + +void bdrv_enable_dirty_bitmap(BdrvDirtyBitmap *bitmap) +{ + assert(!bdrv_dirty_bitmap_frozen(bitmap)); + bitmap->disabled = false; +} + +BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs) +{ + BdrvDirtyBitmap *bm; + BlockDirtyInfoList *list = NULL; + BlockDirtyInfoList **plist = &list; + + QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) { + BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1); + BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1); + info->count = bdrv_get_dirty_count(bm); + info->granularity = bdrv_dirty_bitmap_granularity(bm); + info->has_name = !!bm->name; + info->name = g_strdup(bm->name); + info->status = bdrv_dirty_bitmap_status(bm); + entry->value = info; + *plist = entry; + plist = &entry->next; + } + + return list; +} + +int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, + int64_t sector) +{ + if (bitmap) { + return hbitmap_get(bitmap->bitmap, sector); + } else { + return 0; + } +} + +/** + * Chooses a default granularity based on the existing cluster size, + * but clamped between [4K, 64K]. Defaults to 64K in the case that there + * is no cluster size information available. + */ +uint32_t bdrv_get_default_bitmap_granularity(BlockDriverState *bs) +{ + BlockDriverInfo bdi; + uint32_t granularity; + + if (bdrv_get_info(bs, &bdi) >= 0 && bdi.cluster_size > 0) { + granularity = MAX(4096, bdi.cluster_size); + granularity = MIN(65536, granularity); + } else { + granularity = 65536; + } + + return granularity; +} + +uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap) +{ + return BDRV_SECTOR_SIZE << hbitmap_granularity(bitmap->bitmap); +} + +void bdrv_dirty_iter_init(BdrvDirtyBitmap *bitmap, HBitmapIter *hbi) +{ + hbitmap_iter_init(hbi, bitmap->bitmap, 0); +} + +void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap, + int64_t cur_sector, int nr_sectors) +{ + assert(bdrv_dirty_bitmap_enabled(bitmap)); + hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors); +} + +void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap, + int64_t cur_sector, int nr_sectors) +{ + assert(bdrv_dirty_bitmap_enabled(bitmap)); + hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors); +} + +void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out) +{ + assert(bdrv_dirty_bitmap_enabled(bitmap)); + if (!out) { + hbitmap_reset_all(bitmap->bitmap); + } else { + HBitmap *backup = bitmap->bitmap; + bitmap->bitmap = hbitmap_alloc(bitmap->size, + hbitmap_granularity(backup)); + *out = backup; + } +} + +void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in) +{ + HBitmap *tmp = bitmap->bitmap; + assert(bdrv_dirty_bitmap_enabled(bitmap)); + bitmap->bitmap = in; + hbitmap_free(tmp); +} + +void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, + int nr_sectors) +{ + BdrvDirtyBitmap *bitmap; + QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) { + if (!bdrv_dirty_bitmap_enabled(bitmap)) { + continue; + } + hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors); + } +} + +/** + * Advance an HBitmapIter to an arbitrary offset. + */ +void bdrv_set_dirty_iter(HBitmapIter *hbi, int64_t offset) +{ + assert(hbi->hb); + hbitmap_iter_init(hbi, hbi->hb, offset); +} + +int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap) +{ + return hbitmap_count(bitmap->bitmap); +} diff --git a/block/parallels.c b/block/parallels.c index 645521d783..0d1a60c972 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -30,6 +30,7 @@ #include "qemu/osdep.h" #include "qemu-common.h" #include "block/block_int.h" +#include "sysemu/block-backend.h" #include "qemu/module.h" #include "qemu/bitmap.h" #include "qapi/util.h" @@ -461,7 +462,7 @@ static int parallels_create(const char *filename, QemuOpts *opts, Error **errp) int64_t total_size, cl_size; uint8_t tmp[BDRV_SECTOR_SIZE]; Error *local_err = NULL; - BlockDriverState *file; + BlockBackend *file; uint32_t bat_entries, bat_sectors; ParallelsHeader header; int ret; @@ -477,14 +478,17 @@ static int parallels_create(const char *filename, QemuOpts *opts, Error **errp) return ret; } - file = NULL; - ret = bdrv_open(&file, filename, NULL, NULL, - BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err); - if (ret < 0) { + file = blk_new_open("image", filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, + &local_err); + if (file == NULL) { error_propagate(errp, local_err); - return ret; + return -EIO; } - ret = bdrv_truncate(file, 0); + + blk_set_allow_write_beyond_eof(file, true); + + ret = blk_truncate(file, 0); if (ret < 0) { goto exit; } @@ -508,18 +512,18 @@ static int parallels_create(const char *filename, QemuOpts *opts, Error **errp) memset(tmp, 0, sizeof(tmp)); memcpy(tmp, &header, sizeof(header)); - ret = bdrv_pwrite(file, 0, tmp, BDRV_SECTOR_SIZE); + ret = blk_pwrite(file, 0, tmp, BDRV_SECTOR_SIZE); if (ret < 0) { goto exit; } - ret = bdrv_write_zeroes(file, 1, bat_sectors - 1, 0); + ret = blk_write_zeroes(file, 1, bat_sectors - 1, 0); if (ret < 0) { goto exit; } ret = 0; done: - bdrv_unref(file); + blk_unref(file); return ret; exit: diff --git a/block/qapi.c b/block/qapi.c index db2d3fb915..6a4869a8d9 100644 --- a/block/qapi.c +++ b/block/qapi.c @@ -355,100 +355,116 @@ static void bdrv_query_info(BlockBackend *blk, BlockInfo **p_info, qapi_free_BlockInfo(info); } -static BlockStats *bdrv_query_stats(const BlockDriverState *bs, - bool query_backing) +static BlockStats *bdrv_query_stats(BlockBackend *blk, + const BlockDriverState *bs, + bool query_backing); + +static void bdrv_query_blk_stats(BlockStats *s, BlockBackend *blk) { - BlockStats *s; + BlockAcctStats *stats = blk_get_stats(blk); + BlockAcctTimedStats *ts = NULL; - s = g_malloc0(sizeof(*s)); + s->has_device = true; + s->device = g_strdup(blk_name(blk)); - if (bdrv_get_device_name(bs)[0]) { - s->has_device = true; - s->device = g_strdup(bdrv_get_device_name(bs)); - } + s->stats->rd_bytes = stats->nr_bytes[BLOCK_ACCT_READ]; + s->stats->wr_bytes = stats->nr_bytes[BLOCK_ACCT_WRITE]; + s->stats->rd_operations = stats->nr_ops[BLOCK_ACCT_READ]; + s->stats->wr_operations = stats->nr_ops[BLOCK_ACCT_WRITE]; - if (bdrv_get_node_name(bs)[0]) { - s->has_node_name = true; - s->node_name = g_strdup(bdrv_get_node_name(bs)); + s->stats->failed_rd_operations = stats->failed_ops[BLOCK_ACCT_READ]; + s->stats->failed_wr_operations = stats->failed_ops[BLOCK_ACCT_WRITE]; + s->stats->failed_flush_operations = stats->failed_ops[BLOCK_ACCT_FLUSH]; + + s->stats->invalid_rd_operations = stats->invalid_ops[BLOCK_ACCT_READ]; + s->stats->invalid_wr_operations = stats->invalid_ops[BLOCK_ACCT_WRITE]; + s->stats->invalid_flush_operations = + stats->invalid_ops[BLOCK_ACCT_FLUSH]; + + s->stats->rd_merged = stats->merged[BLOCK_ACCT_READ]; + s->stats->wr_merged = stats->merged[BLOCK_ACCT_WRITE]; + s->stats->flush_operations = stats->nr_ops[BLOCK_ACCT_FLUSH]; + s->stats->wr_total_time_ns = stats->total_time_ns[BLOCK_ACCT_WRITE]; + s->stats->rd_total_time_ns = stats->total_time_ns[BLOCK_ACCT_READ]; + s->stats->flush_total_time_ns = stats->total_time_ns[BLOCK_ACCT_FLUSH]; + + s->stats->has_idle_time_ns = stats->last_access_time_ns > 0; + if (s->stats->has_idle_time_ns) { + s->stats->idle_time_ns = block_acct_idle_time_ns(stats); } - s->stats = g_malloc0(sizeof(*s->stats)); - if (bs->blk) { - BlockAcctStats *stats = blk_get_stats(bs->blk); - BlockAcctTimedStats *ts = NULL; - - s->stats->rd_bytes = stats->nr_bytes[BLOCK_ACCT_READ]; - s->stats->wr_bytes = stats->nr_bytes[BLOCK_ACCT_WRITE]; - s->stats->rd_operations = stats->nr_ops[BLOCK_ACCT_READ]; - s->stats->wr_operations = stats->nr_ops[BLOCK_ACCT_WRITE]; - - s->stats->failed_rd_operations = stats->failed_ops[BLOCK_ACCT_READ]; - s->stats->failed_wr_operations = stats->failed_ops[BLOCK_ACCT_WRITE]; - s->stats->failed_flush_operations = stats->failed_ops[BLOCK_ACCT_FLUSH]; - - s->stats->invalid_rd_operations = stats->invalid_ops[BLOCK_ACCT_READ]; - s->stats->invalid_wr_operations = stats->invalid_ops[BLOCK_ACCT_WRITE]; - s->stats->invalid_flush_operations = - stats->invalid_ops[BLOCK_ACCT_FLUSH]; - - s->stats->rd_merged = stats->merged[BLOCK_ACCT_READ]; - s->stats->wr_merged = stats->merged[BLOCK_ACCT_WRITE]; - s->stats->flush_operations = stats->nr_ops[BLOCK_ACCT_FLUSH]; - s->stats->wr_total_time_ns = stats->total_time_ns[BLOCK_ACCT_WRITE]; - s->stats->rd_total_time_ns = stats->total_time_ns[BLOCK_ACCT_READ]; - s->stats->flush_total_time_ns = stats->total_time_ns[BLOCK_ACCT_FLUSH]; - - s->stats->has_idle_time_ns = stats->last_access_time_ns > 0; - if (s->stats->has_idle_time_ns) { - s->stats->idle_time_ns = block_acct_idle_time_ns(stats); - } + s->stats->account_invalid = stats->account_invalid; + s->stats->account_failed = stats->account_failed; - s->stats->account_invalid = stats->account_invalid; - s->stats->account_failed = stats->account_failed; + while ((ts = block_acct_interval_next(stats, ts))) { + BlockDeviceTimedStatsList *timed_stats = + g_malloc0(sizeof(*timed_stats)); + BlockDeviceTimedStats *dev_stats = g_malloc0(sizeof(*dev_stats)); + timed_stats->next = s->stats->timed_stats; + timed_stats->value = dev_stats; + s->stats->timed_stats = timed_stats; - while ((ts = block_acct_interval_next(stats, ts))) { - BlockDeviceTimedStatsList *timed_stats = - g_malloc0(sizeof(*timed_stats)); - BlockDeviceTimedStats *dev_stats = g_malloc0(sizeof(*dev_stats)); - timed_stats->next = s->stats->timed_stats; - timed_stats->value = dev_stats; - s->stats->timed_stats = timed_stats; + TimedAverage *rd = &ts->latency[BLOCK_ACCT_READ]; + TimedAverage *wr = &ts->latency[BLOCK_ACCT_WRITE]; + TimedAverage *fl = &ts->latency[BLOCK_ACCT_FLUSH]; - TimedAverage *rd = &ts->latency[BLOCK_ACCT_READ]; - TimedAverage *wr = &ts->latency[BLOCK_ACCT_WRITE]; - TimedAverage *fl = &ts->latency[BLOCK_ACCT_FLUSH]; + dev_stats->interval_length = ts->interval_length; - dev_stats->interval_length = ts->interval_length; + dev_stats->min_rd_latency_ns = timed_average_min(rd); + dev_stats->max_rd_latency_ns = timed_average_max(rd); + dev_stats->avg_rd_latency_ns = timed_average_avg(rd); - dev_stats->min_rd_latency_ns = timed_average_min(rd); - dev_stats->max_rd_latency_ns = timed_average_max(rd); - dev_stats->avg_rd_latency_ns = timed_average_avg(rd); + dev_stats->min_wr_latency_ns = timed_average_min(wr); + dev_stats->max_wr_latency_ns = timed_average_max(wr); + dev_stats->avg_wr_latency_ns = timed_average_avg(wr); - dev_stats->min_wr_latency_ns = timed_average_min(wr); - dev_stats->max_wr_latency_ns = timed_average_max(wr); - dev_stats->avg_wr_latency_ns = timed_average_avg(wr); + dev_stats->min_flush_latency_ns = timed_average_min(fl); + dev_stats->max_flush_latency_ns = timed_average_max(fl); + dev_stats->avg_flush_latency_ns = timed_average_avg(fl); - dev_stats->min_flush_latency_ns = timed_average_min(fl); - dev_stats->max_flush_latency_ns = timed_average_max(fl); - dev_stats->avg_flush_latency_ns = timed_average_avg(fl); + dev_stats->avg_rd_queue_depth = + block_acct_queue_depth(ts, BLOCK_ACCT_READ); + dev_stats->avg_wr_queue_depth = + block_acct_queue_depth(ts, BLOCK_ACCT_WRITE); + } +} - dev_stats->avg_rd_queue_depth = - block_acct_queue_depth(ts, BLOCK_ACCT_READ); - dev_stats->avg_wr_queue_depth = - block_acct_queue_depth(ts, BLOCK_ACCT_WRITE); - } +static void bdrv_query_bds_stats(BlockStats *s, const BlockDriverState *bs, + bool query_backing) +{ + if (bdrv_get_node_name(bs)[0]) { + s->has_node_name = true; + s->node_name = g_strdup(bdrv_get_node_name(bs)); } s->stats->wr_highest_offset = bs->wr_highest_offset; if (bs->file) { s->has_parent = true; - s->parent = bdrv_query_stats(bs->file->bs, query_backing); + s->parent = bdrv_query_stats(NULL, bs->file->bs, query_backing); } if (query_backing && bs->backing) { s->has_backing = true; - s->backing = bdrv_query_stats(bs->backing->bs, query_backing); + s->backing = bdrv_query_stats(NULL, bs->backing->bs, query_backing); + } + +} + +static BlockStats *bdrv_query_stats(BlockBackend *blk, + const BlockDriverState *bs, + bool query_backing) +{ + BlockStats *s; + + s = g_malloc0(sizeof(*s)); + s->stats = g_malloc0(sizeof(*s->stats)); + + if (blk) { + bdrv_query_blk_stats(s, blk); + } + if (bs) { + bdrv_query_bds_stats(s, bs, query_backing); } return s; @@ -477,22 +493,38 @@ BlockInfoList *qmp_query_block(Error **errp) return head; } +static bool next_query_bds(BlockBackend **blk, BlockDriverState **bs, + bool query_nodes) +{ + if (query_nodes) { + *bs = bdrv_next_node(*bs); + return !!*bs; + } + + *blk = blk_next(*blk); + *bs = *blk ? blk_bs(*blk) : NULL; + + return !!*blk; +} + BlockStatsList *qmp_query_blockstats(bool has_query_nodes, bool query_nodes, Error **errp) { BlockStatsList *head = NULL, **p_next = &head; + BlockBackend *blk = NULL; BlockDriverState *bs = NULL; /* Just to be safe if query_nodes is not always initialized */ query_nodes = has_query_nodes && query_nodes; - while ((bs = query_nodes ? bdrv_next_node(bs) : bdrv_next(bs))) { + while (next_query_bds(&blk, &bs, query_nodes)) { BlockStatsList *info = g_malloc0(sizeof(*info)); - AioContext *ctx = bdrv_get_aio_context(bs); + AioContext *ctx = blk ? blk_get_aio_context(blk) + : bdrv_get_aio_context(bs); aio_context_acquire(ctx); - info->value = bdrv_query_stats(bs, !query_nodes); + info->value = bdrv_query_stats(blk, bs, !query_nodes); aio_context_release(ctx); *p_next = info; diff --git a/block/qcow.c b/block/qcow.c index 251910cc9d..2fd5ee65d4 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -24,6 +24,7 @@ #include "qemu/osdep.h" #include "qemu-common.h" #include "block/block_int.h" +#include "sysemu/block-backend.h" #include "qemu/module.h" #include <zlib.h> #include "qapi/qmp/qerror.h" @@ -780,7 +781,7 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp) int flags = 0; Error *local_err = NULL; int ret; - BlockDriverState *qcow_bs; + BlockBackend *qcow_blk; /* Read out options */ total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), @@ -796,15 +797,18 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp) goto cleanup; } - qcow_bs = NULL; - ret = bdrv_open(&qcow_bs, filename, NULL, NULL, - BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err); - if (ret < 0) { + qcow_blk = blk_new_open("image", filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, + &local_err); + if (qcow_blk == NULL) { error_propagate(errp, local_err); + ret = -EIO; goto cleanup; } - ret = bdrv_truncate(qcow_bs, 0); + blk_set_allow_write_beyond_eof(qcow_blk, true); + + ret = blk_truncate(qcow_blk, 0); if (ret < 0) { goto exit; } @@ -844,13 +848,13 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp) } /* write all the data */ - ret = bdrv_pwrite(qcow_bs, 0, &header, sizeof(header)); + ret = blk_pwrite(qcow_blk, 0, &header, sizeof(header)); if (ret != sizeof(header)) { goto exit; } if (backing_file) { - ret = bdrv_pwrite(qcow_bs, sizeof(header), + ret = blk_pwrite(qcow_blk, sizeof(header), backing_file, backing_filename_len); if (ret != backing_filename_len) { goto exit; @@ -860,7 +864,7 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp) tmp = g_malloc0(BDRV_SECTOR_SIZE); for (i = 0; i < ((sizeof(uint64_t)*l1_size + BDRV_SECTOR_SIZE - 1)/ BDRV_SECTOR_SIZE); i++) { - ret = bdrv_pwrite(qcow_bs, header_size + + ret = blk_pwrite(qcow_blk, header_size + BDRV_SECTOR_SIZE*i, tmp, BDRV_SECTOR_SIZE); if (ret != BDRV_SECTOR_SIZE) { g_free(tmp); @@ -871,7 +875,7 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp) g_free(tmp); ret = 0; exit: - bdrv_unref(qcow_bs); + blk_unref(qcow_blk); cleanup: g_free(backing_file); return ret; diff --git a/block/qcow2.c b/block/qcow2.c index 8babecdab2..1ce6264011 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -24,6 +24,7 @@ #include "qemu/osdep.h" #include "qemu-common.h" #include "block/block_int.h" +#include "sysemu/block-backend.h" #include "qemu/module.h" #include <zlib.h> #include "block/qcow2.h" @@ -2097,7 +2098,7 @@ static int qcow2_create2(const char *filename, int64_t total_size, * 2 GB for 64k clusters, and we don't want to have a 2 GB initial file * size for any qcow2 image. */ - BlockDriverState* bs; + BlockBackend *blk; QCowHeader *header; uint64_t* refcount_table; Error *local_err = NULL; @@ -2172,14 +2173,16 @@ static int qcow2_create2(const char *filename, int64_t total_size, return ret; } - bs = NULL; - ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, - &local_err); - if (ret < 0) { + blk = blk_new_open("image", filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, + &local_err); + if (blk == NULL) { error_propagate(errp, local_err); - return ret; + return -EIO; } + blk_set_allow_write_beyond_eof(blk, true); + /* Write the header */ QEMU_BUILD_BUG_ON((1 << MIN_CLUSTER_BITS) < sizeof(*header)); header = g_malloc0(cluster_size); @@ -2207,7 +2210,7 @@ static int qcow2_create2(const char *filename, int64_t total_size, cpu_to_be64(QCOW2_COMPAT_LAZY_REFCOUNTS); } - ret = bdrv_pwrite(bs, 0, header, cluster_size); + ret = blk_pwrite(blk, 0, header, cluster_size); g_free(header); if (ret < 0) { error_setg_errno(errp, -ret, "Could not write qcow2 header"); @@ -2217,7 +2220,7 @@ static int qcow2_create2(const char *filename, int64_t total_size, /* Write a refcount table with one refcount block */ refcount_table = g_malloc0(2 * cluster_size); refcount_table[0] = cpu_to_be64(2 * cluster_size); - ret = bdrv_pwrite(bs, cluster_size, refcount_table, 2 * cluster_size); + ret = blk_pwrite(blk, cluster_size, refcount_table, 2 * cluster_size); g_free(refcount_table); if (ret < 0) { @@ -2225,8 +2228,8 @@ static int qcow2_create2(const char *filename, int64_t total_size, goto out; } - bdrv_unref(bs); - bs = NULL; + blk_unref(blk); + blk = NULL; /* * And now open the image and make it consistent first (i.e. increase the @@ -2235,15 +2238,16 @@ static int qcow2_create2(const char *filename, int64_t total_size, */ options = qdict_new(); qdict_put(options, "driver", qstring_from_str("qcow2")); - ret = bdrv_open(&bs, filename, NULL, options, - BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, - &local_err); - if (ret < 0) { + blk = blk_new_open("image-qcow2", filename, NULL, options, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, + &local_err); + if (blk == NULL) { error_propagate(errp, local_err); + ret = -EIO; goto out; } - ret = qcow2_alloc_clusters(bs, 3 * cluster_size); + ret = qcow2_alloc_clusters(blk_bs(blk), 3 * cluster_size); if (ret < 0) { error_setg_errno(errp, -ret, "Could not allocate clusters for qcow2 " "header and refcount table"); @@ -2255,14 +2259,14 @@ static int qcow2_create2(const char *filename, int64_t total_size, } /* Create a full header (including things like feature table) */ - ret = qcow2_update_header(bs); + ret = qcow2_update_header(blk_bs(blk)); if (ret < 0) { error_setg_errno(errp, -ret, "Could not update qcow2 header"); goto out; } /* Okay, now that we have a valid image, let's give it the right size */ - ret = bdrv_truncate(bs, total_size); + ret = blk_truncate(blk, total_size); if (ret < 0) { error_setg_errno(errp, -ret, "Could not resize image"); goto out; @@ -2270,7 +2274,7 @@ static int qcow2_create2(const char *filename, int64_t total_size, /* Want a backing file? There you go.*/ if (backing_file) { - ret = bdrv_change_backing_file(bs, backing_file, backing_format); + ret = bdrv_change_backing_file(blk_bs(blk), backing_file, backing_format); if (ret < 0) { error_setg_errno(errp, -ret, "Could not assign backing file '%s' " "with format '%s'", backing_file, backing_format); @@ -2280,9 +2284,9 @@ static int qcow2_create2(const char *filename, int64_t total_size, /* And if we're supposed to preallocate metadata, do that now */ if (prealloc != PREALLOC_MODE_OFF) { - BDRVQcow2State *s = bs->opaque; + BDRVQcow2State *s = blk_bs(blk)->opaque; qemu_co_mutex_lock(&s->lock); - ret = preallocate(bs); + ret = preallocate(blk_bs(blk)); qemu_co_mutex_unlock(&s->lock); if (ret < 0) { error_setg_errno(errp, -ret, "Could not preallocate metadata"); @@ -2290,24 +2294,25 @@ static int qcow2_create2(const char *filename, int64_t total_size, } } - bdrv_unref(bs); - bs = NULL; + blk_unref(blk); + blk = NULL; /* Reopen the image without BDRV_O_NO_FLUSH to flush it before returning */ options = qdict_new(); qdict_put(options, "driver", qstring_from_str("qcow2")); - ret = bdrv_open(&bs, filename, NULL, options, - BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_BACKING, - &local_err); - if (local_err) { + blk = blk_new_open("image-flush", filename, NULL, options, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_BACKING, + &local_err); + if (blk == NULL) { error_propagate(errp, local_err); + ret = -EIO; goto out; } ret = 0; out: - if (bs) { - bdrv_unref(bs); + if (blk) { + blk_unref(blk); } return ret; } diff --git a/block/qed.c b/block/qed.c index 404be1e9b9..8de7dd0832 100644 --- a/block/qed.c +++ b/block/qed.c @@ -18,6 +18,7 @@ #include "qed.h" #include "qapi/qmp/qerror.h" #include "migration/migration.h" +#include "sysemu/block-backend.h" static const AIOCBInfo qed_aiocb_info = { .aiocb_size = sizeof(QEDAIOCB), @@ -580,7 +581,7 @@ static int qed_create(const char *filename, uint32_t cluster_size, size_t l1_size = header.cluster_size * header.table_size; Error *local_err = NULL; int ret = 0; - BlockDriverState *bs; + BlockBackend *blk; ret = bdrv_create_file(filename, opts, &local_err); if (ret < 0) { @@ -588,17 +589,18 @@ static int qed_create(const char *filename, uint32_t cluster_size, return ret; } - bs = NULL; - ret = bdrv_open(&bs, filename, NULL, NULL, - BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, - &local_err); - if (ret < 0) { + blk = blk_new_open("image", filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, + &local_err); + if (blk == NULL) { error_propagate(errp, local_err); - return ret; + return -EIO; } + blk_set_allow_write_beyond_eof(blk, true); + /* File must start empty and grow, check truncate is supported */ - ret = bdrv_truncate(bs, 0); + ret = blk_truncate(blk, 0); if (ret < 0) { goto out; } @@ -614,18 +616,18 @@ static int qed_create(const char *filename, uint32_t cluster_size, } qed_header_cpu_to_le(&header, &le_header); - ret = bdrv_pwrite(bs, 0, &le_header, sizeof(le_header)); + ret = blk_pwrite(blk, 0, &le_header, sizeof(le_header)); if (ret < 0) { goto out; } - ret = bdrv_pwrite(bs, sizeof(le_header), backing_file, - header.backing_filename_size); + ret = blk_pwrite(blk, sizeof(le_header), backing_file, + header.backing_filename_size); if (ret < 0) { goto out; } l1_table = g_malloc0(l1_size); - ret = bdrv_pwrite(bs, header.l1_table_offset, l1_table, l1_size); + ret = blk_pwrite(blk, header.l1_table_offset, l1_table, l1_size); if (ret < 0) { goto out; } @@ -633,7 +635,7 @@ static int qed_create(const char *filename, uint32_t cluster_size, ret = 0; /* success */ out: g_free(l1_table); - bdrv_unref(bs); + blk_unref(blk); return ret; } diff --git a/block/quorum.c b/block/quorum.c index 11cc60b713..3d473515a8 100644 --- a/block/quorum.c +++ b/block/quorum.c @@ -215,14 +215,16 @@ static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s, return acb; } -static void quorum_report_bad(QuorumAIOCB *acb, char *node_name, int ret) +static void quorum_report_bad(QuorumOpType type, uint64_t sector_num, + int nb_sectors, char *node_name, int ret) { const char *msg = NULL; if (ret < 0) { msg = strerror(-ret); } - qapi_event_send_quorum_report_bad(!!msg, msg, node_name, - acb->sector_num, acb->nb_sectors, &error_abort); + + qapi_event_send_quorum_report_bad(type, !!msg, msg, node_name, + sector_num, nb_sectors, &error_abort); } static void quorum_report_failure(QuorumAIOCB *acb) @@ -282,6 +284,7 @@ static void quorum_aio_cb(void *opaque, int ret) QuorumChildRequest *sacb = opaque; QuorumAIOCB *acb = sacb->parent; BDRVQuorumState *s = acb->common.bs->opaque; + QuorumOpType type; bool rewrite = false; if (acb->is_read && s->read_pattern == QUORUM_READ_PATTERN_FIFO) { @@ -300,12 +303,14 @@ static void quorum_aio_cb(void *opaque, int ret) return; } + type = acb->is_read ? QUORUM_OP_TYPE_READ : QUORUM_OP_TYPE_WRITE; sacb->ret = ret; acb->count++; if (ret == 0) { acb->success_count++; } else { - quorum_report_bad(acb, sacb->aiocb->bs->node_name, ret); + quorum_report_bad(type, acb->sector_num, acb->nb_sectors, + sacb->aiocb->bs->node_name, ret); } assert(acb->count <= s->num_children); assert(acb->success_count <= s->num_children); @@ -338,7 +343,9 @@ static void quorum_report_bad_versions(BDRVQuorumState *s, continue; } QLIST_FOREACH(item, &version->items, next) { - quorum_report_bad(acb, s->children[item->index]->bs->node_name, 0); + quorum_report_bad(QUORUM_OP_TYPE_READ, acb->sector_num, + acb->nb_sectors, + s->children[item->index]->bs->node_name, 0); } } } @@ -648,8 +655,9 @@ static BlockAIOCB *read_quorum_children(QuorumAIOCB *acb) } for (i = 0; i < s->num_children; i++) { - bdrv_aio_readv(s->children[i]->bs, acb->sector_num, &acb->qcrs[i].qiov, - acb->nb_sectors, quorum_aio_cb, &acb->qcrs[i]); + acb->qcrs[i].aiocb = bdrv_aio_readv(s->children[i]->bs, acb->sector_num, + &acb->qcrs[i].qiov, acb->nb_sectors, + quorum_aio_cb, &acb->qcrs[i]); } return &acb->common; @@ -664,9 +672,10 @@ static BlockAIOCB *read_fifo_child(QuorumAIOCB *acb) qemu_iovec_init(&acb->qcrs[acb->child_iter].qiov, acb->qiov->niov); qemu_iovec_clone(&acb->qcrs[acb->child_iter].qiov, acb->qiov, acb->qcrs[acb->child_iter].buf); - bdrv_aio_readv(s->children[acb->child_iter]->bs, acb->sector_num, - &acb->qcrs[acb->child_iter].qiov, acb->nb_sectors, - quorum_aio_cb, &acb->qcrs[acb->child_iter]); + acb->qcrs[acb->child_iter].aiocb = + bdrv_aio_readv(s->children[acb->child_iter]->bs, acb->sector_num, + &acb->qcrs[acb->child_iter].qiov, acb->nb_sectors, + quorum_aio_cb, &acb->qcrs[acb->child_iter]); return &acb->common; } @@ -760,19 +769,30 @@ static coroutine_fn int quorum_co_flush(BlockDriverState *bs) QuorumVoteValue result_value; int i; int result = 0; + int success_count = 0; QLIST_INIT(&error_votes.vote_list); error_votes.compare = quorum_64bits_compare; for (i = 0; i < s->num_children; i++) { result = bdrv_co_flush(s->children[i]->bs); - result_value.l = result; - quorum_count_vote(&error_votes, &result_value, i); + if (result) { + quorum_report_bad(QUORUM_OP_TYPE_FLUSH, 0, + bdrv_nb_sectors(s->children[i]->bs), + s->children[i]->bs->node_name, result); + result_value.l = result; + quorum_count_vote(&error_votes, &result_value, i); + } else { + success_count++; + } } - winner = quorum_get_vote_winner(&error_votes); - result = winner->value.l; - + if (success_count >= s->threshold) { + result = 0; + } else { + winner = quorum_get_vote_winner(&error_votes); + result = winner->value.l; + } quorum_free_vote_list(&error_votes); return result; diff --git a/block/sheepdog.c b/block/sheepdog.c index 8739accddd..a6e98a5a72 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -18,6 +18,7 @@ #include "qemu/error-report.h" #include "qemu/sockets.h" #include "block/block_int.h" +#include "sysemu/block-backend.h" #include "qemu/bitops.h" #define SD_PROTO_VER 0x01 @@ -615,14 +616,13 @@ static coroutine_fn int send_co_req(int sockfd, SheepdogReq *hdr, void *data, ret = qemu_co_send(sockfd, hdr, sizeof(*hdr)); if (ret != sizeof(*hdr)) { error_report("failed to send a req, %s", strerror(errno)); - ret = -socket_error(); - return ret; + return -errno; } ret = qemu_co_send(sockfd, data, *wlen); if (ret != *wlen) { - ret = -socket_error(); error_report("failed to send a req, %s", strerror(errno)); + return -errno; } return ret; @@ -1637,7 +1637,7 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot, static int sd_prealloc(const char *filename, Error **errp) { - BlockDriverState *bs = NULL; + BlockBackend *blk = NULL; BDRVSheepdogState *base = NULL; unsigned long buf_size; uint32_t idx, max_idx; @@ -1646,19 +1646,23 @@ static int sd_prealloc(const char *filename, Error **errp) void *buf = NULL; int ret; - ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, - errp); - if (ret < 0) { + blk = blk_new_open("image-prealloc", filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, + errp); + if (blk == NULL) { + ret = -EIO; goto out_with_err_set; } - vdi_size = bdrv_getlength(bs); + blk_set_allow_write_beyond_eof(blk, true); + + vdi_size = blk_getlength(blk); if (vdi_size < 0) { ret = vdi_size; goto out; } - base = bs->opaque; + base = blk_bs(blk)->opaque; object_size = (UINT32_C(1) << base->inode.block_size_shift); buf_size = MIN(object_size, SD_DATA_OBJ_SIZE); buf = g_malloc0(buf_size); @@ -1670,23 +1674,24 @@ static int sd_prealloc(const char *filename, Error **errp) * The created image can be a cloned image, so we need to read * a data from the source image. */ - ret = bdrv_pread(bs, idx * buf_size, buf, buf_size); + ret = blk_pread(blk, idx * buf_size, buf, buf_size); if (ret < 0) { goto out; } - ret = bdrv_pwrite(bs, idx * buf_size, buf, buf_size); + ret = blk_pwrite(blk, idx * buf_size, buf, buf_size); if (ret < 0) { goto out; } } + ret = 0; out: if (ret < 0) { error_setg_errno(errp, -ret, "Can't pre-allocate"); } out_with_err_set: - if (bs) { - bdrv_unref(bs); + if (blk) { + blk_unref(blk); } g_free(buf); @@ -1826,7 +1831,7 @@ static int sd_create(const char *filename, QemuOpts *opts, } if (backing_file) { - BlockDriverState *bs; + BlockBackend *blk; BDRVSheepdogState *base; BlockDriver *drv; @@ -1838,22 +1843,23 @@ static int sd_create(const char *filename, QemuOpts *opts, goto out; } - bs = NULL; - ret = bdrv_open(&bs, backing_file, NULL, NULL, BDRV_O_PROTOCOL, errp); - if (ret < 0) { + blk = blk_new_open("backing", backing_file, NULL, NULL, + BDRV_O_PROTOCOL | BDRV_O_CACHE_WB, errp); + if (blk == NULL) { + ret = -EIO; goto out; } - base = bs->opaque; + base = blk_bs(blk)->opaque; if (!is_snapshot(&base->inode)) { error_setg(errp, "cannot clone from a non snapshot vdi"); - bdrv_unref(bs); + blk_unref(blk); ret = -EINVAL; goto out; } s->inode.vdi_id = base->inode.vdi_id; - bdrv_unref(bs); + blk_unref(blk); } s->aio_context = qemu_get_aio_context(); diff --git a/block/vdi.c b/block/vdi.c index b403243604..662d14b74e 100644 --- a/block/vdi.c +++ b/block/vdi.c @@ -52,6 +52,7 @@ #include "qemu/osdep.h" #include "qemu-common.h" #include "block/block_int.h" +#include "sysemu/block-backend.h" #include "qemu/module.h" #include "migration/migration.h" #include "qemu/coroutine.h" @@ -733,7 +734,7 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) size_t bmap_size; int64_t offset = 0; Error *local_err = NULL; - BlockDriverState *bs = NULL; + BlockBackend *blk = NULL; uint32_t *bmap = NULL; logout("\n"); @@ -766,13 +767,18 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) error_propagate(errp, local_err); goto exit; } - ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, - &local_err); - if (ret < 0) { + + blk = blk_new_open("image", filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, + &local_err); + if (blk == NULL) { error_propagate(errp, local_err); + ret = -EIO; goto exit; } + blk_set_allow_write_beyond_eof(blk, true); + /* We need enough blocks to store the given disk size, so always round up. */ blocks = DIV_ROUND_UP(bytes, block_size); @@ -802,7 +808,7 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) vdi_header_print(&header); #endif vdi_header_to_le(&header); - ret = bdrv_pwrite_sync(bs, offset, &header, sizeof(header)); + ret = blk_pwrite(blk, offset, &header, sizeof(header)); if (ret < 0) { error_setg(errp, "Error writing header to %s", filename); goto exit; @@ -823,7 +829,7 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) bmap[i] = VDI_UNALLOCATED; } } - ret = bdrv_pwrite_sync(bs, offset, bmap, bmap_size); + ret = blk_pwrite(blk, offset, bmap, bmap_size); if (ret < 0) { error_setg(errp, "Error writing bmap to %s", filename); goto exit; @@ -832,7 +838,7 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) } if (image_type == VDI_TYPE_STATIC) { - ret = bdrv_truncate(bs, offset + blocks * block_size); + ret = blk_truncate(blk, offset + blocks * block_size); if (ret < 0) { error_setg(errp, "Failed to statically allocate %s", filename); goto exit; @@ -840,7 +846,7 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) } exit: - bdrv_unref(bs); + blk_unref(blk); g_free(bmap); return ret; } diff --git a/block/vhdx.c b/block/vhdx.c index 9a51428317..e15020c9be 100644 --- a/block/vhdx.c +++ b/block/vhdx.c @@ -18,6 +18,7 @@ #include "qemu/osdep.h" #include "qemu-common.h" #include "block/block_int.h" +#include "sysemu/block-backend.h" #include "qemu/module.h" #include "qemu/crc32c.h" #include "block/vhdx.h" @@ -1772,7 +1773,7 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp) gunichar2 *creator = NULL; glong creator_items; - BlockDriverState *bs; + BlockBackend *blk; char *type = NULL; VHDXImageType image_type; Error *local_err = NULL; @@ -1837,14 +1838,17 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp) goto exit; } - bs = NULL; - ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, - &local_err); - if (ret < 0) { + blk = blk_new_open("image", filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, + &local_err); + if (blk == NULL) { error_propagate(errp, local_err); + ret = -EIO; goto exit; } + blk_set_allow_write_beyond_eof(blk, true); + /* Create (A) */ /* The creator field is optional, but may be useful for @@ -1852,13 +1856,13 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp) creator = g_utf8_to_utf16("QEMU v" QEMU_VERSION, -1, NULL, &creator_items, NULL); signature = cpu_to_le64(VHDX_FILE_SIGNATURE); - ret = bdrv_pwrite(bs, VHDX_FILE_ID_OFFSET, &signature, sizeof(signature)); + ret = blk_pwrite(blk, VHDX_FILE_ID_OFFSET, &signature, sizeof(signature)); if (ret < 0) { goto delete_and_exit; } if (creator) { - ret = bdrv_pwrite(bs, VHDX_FILE_ID_OFFSET + sizeof(signature), - creator, creator_items * sizeof(gunichar2)); + ret = blk_pwrite(blk, VHDX_FILE_ID_OFFSET + sizeof(signature), + creator, creator_items * sizeof(gunichar2)); if (ret < 0) { goto delete_and_exit; } @@ -1866,13 +1870,13 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp) /* Creates (B),(C) */ - ret = vhdx_create_new_headers(bs, image_size, log_size); + ret = vhdx_create_new_headers(blk_bs(blk), image_size, log_size); if (ret < 0) { goto delete_and_exit; } /* Creates (D),(E),(G) explicitly. (F) created as by-product */ - ret = vhdx_create_new_region_table(bs, image_size, block_size, 512, + ret = vhdx_create_new_region_table(blk_bs(blk), image_size, block_size, 512, log_size, use_zero_blocks, image_type, &metadata_offset); if (ret < 0) { @@ -1880,7 +1884,7 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp) } /* Creates (H) */ - ret = vhdx_create_new_metadata(bs, image_size, block_size, 512, + ret = vhdx_create_new_metadata(blk_bs(blk), image_size, block_size, 512, metadata_offset, image_type); if (ret < 0) { goto delete_and_exit; @@ -1888,7 +1892,7 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp) delete_and_exit: - bdrv_unref(bs); + blk_unref(blk); exit: g_free(type); g_free(creator); diff --git a/block/vmdk.c b/block/vmdk.c index a8db5d9ec2..23bd57e20e 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -26,6 +26,7 @@ #include "qemu/osdep.h" #include "qemu-common.h" #include "block/block_int.h" +#include "sysemu/block-backend.h" #include "qapi/qmp/qerror.h" #include "qemu/error-report.h" #include "qemu/module.h" @@ -242,15 +243,17 @@ static void vmdk_free_last_extent(BlockDriverState *bs) static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent) { - char desc[DESC_SIZE]; + char *desc; uint32_t cid = 0xffffffff; const char *p_name, *cid_str; size_t cid_str_size; BDRVVmdkState *s = bs->opaque; int ret; + desc = g_malloc0(DESC_SIZE); ret = bdrv_pread(bs->file->bs, s->desc_offset, desc, DESC_SIZE); if (ret < 0) { + g_free(desc); return 0; } @@ -269,41 +272,45 @@ static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent) sscanf(p_name, "%" SCNx32, &cid); } + g_free(desc); return cid; } static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid) { - char desc[DESC_SIZE], tmp_desc[DESC_SIZE]; + char *desc, *tmp_desc; char *p_name, *tmp_str; BDRVVmdkState *s = bs->opaque; - int ret; + int ret = 0; + desc = g_malloc0(DESC_SIZE); + tmp_desc = g_malloc0(DESC_SIZE); ret = bdrv_pread(bs->file->bs, s->desc_offset, desc, DESC_SIZE); if (ret < 0) { - return ret; + goto out; } desc[DESC_SIZE - 1] = '\0'; tmp_str = strstr(desc, "parentCID"); if (tmp_str == NULL) { - return -EINVAL; + ret = -EINVAL; + goto out; } - pstrcpy(tmp_desc, sizeof(tmp_desc), tmp_str); + pstrcpy(tmp_desc, DESC_SIZE, tmp_str); p_name = strstr(desc, "CID"); if (p_name != NULL) { p_name += sizeof("CID"); - snprintf(p_name, sizeof(desc) - (p_name - desc), "%" PRIx32 "\n", cid); - pstrcat(desc, sizeof(desc), tmp_desc); + snprintf(p_name, DESC_SIZE - (p_name - desc), "%" PRIx32 "\n", cid); + pstrcat(desc, DESC_SIZE, tmp_desc); } ret = bdrv_pwrite_sync(bs->file->bs, s->desc_offset, desc, DESC_SIZE); - if (ret < 0) { - return ret; - } - return 0; +out: + g_free(desc); + g_free(tmp_desc); + return ret; } static int vmdk_is_cid_valid(BlockDriverState *bs) @@ -337,15 +344,16 @@ static int vmdk_reopen_prepare(BDRVReopenState *state, static int vmdk_parent_open(BlockDriverState *bs) { char *p_name; - char desc[DESC_SIZE + 1]; + char *desc; BDRVVmdkState *s = bs->opaque; int ret; - desc[DESC_SIZE] = '\0'; + desc = g_malloc0(DESC_SIZE + 1); ret = bdrv_pread(bs->file->bs, s->desc_offset, desc, DESC_SIZE); if (ret < 0) { - return ret; + goto out; } + ret = 0; p_name = strstr(desc, "parentFileNameHint"); if (p_name != NULL) { @@ -354,16 +362,20 @@ static int vmdk_parent_open(BlockDriverState *bs) p_name += sizeof("parentFileNameHint") + 1; end_name = strchr(p_name, '\"'); if (end_name == NULL) { - return -EINVAL; + ret = -EINVAL; + goto out; } if ((end_name - p_name) > sizeof(bs->backing_file) - 1) { - return -EINVAL; + ret = -EINVAL; + goto out; } pstrcpy(bs->backing_file, end_name - p_name + 1, p_name); } - return 0; +out: + g_free(desc); + return ret; } /* Create and append extent to the extent array. Return the added VmdkExtent @@ -1639,7 +1651,7 @@ static int vmdk_create_extent(const char *filename, int64_t filesize, QemuOpts *opts, Error **errp) { int ret, i; - BlockDriverState *bs = NULL; + BlockBackend *blk = NULL; VMDK4Header header; Error *local_err = NULL; uint32_t tmp, magic, grains, gd_sectors, gt_size, gt_count; @@ -1652,16 +1664,19 @@ static int vmdk_create_extent(const char *filename, int64_t filesize, goto exit; } - assert(bs == NULL); - ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, - &local_err); - if (ret < 0) { + blk = blk_new_open("extent", filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, + &local_err); + if (blk == NULL) { error_propagate(errp, local_err); + ret = -EIO; goto exit; } + blk_set_allow_write_beyond_eof(blk, true); + if (flat) { - ret = bdrv_truncate(bs, filesize); + ret = blk_truncate(blk, filesize); if (ret < 0) { error_setg_errno(errp, -ret, "Could not truncate file"); } @@ -1716,18 +1731,18 @@ static int vmdk_create_extent(const char *filename, int64_t filesize, header.check_bytes[3] = 0xa; /* write all the data */ - ret = bdrv_pwrite(bs, 0, &magic, sizeof(magic)); + ret = blk_pwrite(blk, 0, &magic, sizeof(magic)); if (ret < 0) { error_setg(errp, QERR_IO_ERROR); goto exit; } - ret = bdrv_pwrite(bs, sizeof(magic), &header, sizeof(header)); + ret = blk_pwrite(blk, sizeof(magic), &header, sizeof(header)); if (ret < 0) { error_setg(errp, QERR_IO_ERROR); goto exit; } - ret = bdrv_truncate(bs, le64_to_cpu(header.grain_offset) << 9); + ret = blk_truncate(blk, le64_to_cpu(header.grain_offset) << 9); if (ret < 0) { error_setg_errno(errp, -ret, "Could not truncate file"); goto exit; @@ -1740,8 +1755,8 @@ static int vmdk_create_extent(const char *filename, int64_t filesize, i < gt_count; i++, tmp += gt_size) { gd_buf[i] = cpu_to_le32(tmp); } - ret = bdrv_pwrite(bs, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE, - gd_buf, gd_buf_size); + ret = blk_pwrite(blk, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE, + gd_buf, gd_buf_size); if (ret < 0) { error_setg(errp, QERR_IO_ERROR); goto exit; @@ -1752,8 +1767,8 @@ static int vmdk_create_extent(const char *filename, int64_t filesize, i < gt_count; i++, tmp += gt_size) { gd_buf[i] = cpu_to_le32(tmp); } - ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE, - gd_buf, gd_buf_size); + ret = blk_pwrite(blk, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE, + gd_buf, gd_buf_size); if (ret < 0) { error_setg(errp, QERR_IO_ERROR); goto exit; @@ -1761,8 +1776,8 @@ static int vmdk_create_extent(const char *filename, int64_t filesize, ret = 0; exit: - if (bs) { - bdrv_unref(bs); + if (blk) { + blk_unref(blk); } g_free(gd_buf); return ret; @@ -1811,7 +1826,7 @@ static int filename_decompose(const char *filename, char *path, char *prefix, static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp) { int idx = 0; - BlockDriverState *new_bs = NULL; + BlockBackend *new_blk = NULL; Error *local_err = NULL; char *desc = NULL; int64_t total_size = 0, filesize; @@ -1922,7 +1937,7 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp) goto exit; } if (backing_file) { - BlockDriverState *bs = NULL; + BlockBackend *blk; char *full_backing = g_new0(char, PATH_MAX); bdrv_get_full_backing_filename_from_filename(filename, backing_file, full_backing, PATH_MAX, @@ -1933,18 +1948,21 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp) ret = -ENOENT; goto exit; } - ret = bdrv_open(&bs, full_backing, NULL, NULL, BDRV_O_NO_BACKING, errp); + + blk = blk_new_open("backing", full_backing, NULL, NULL, + BDRV_O_NO_BACKING | BDRV_O_CACHE_WB, errp); g_free(full_backing); - if (ret != 0) { + if (blk == NULL) { + ret = -EIO; goto exit; } - if (strcmp(bs->drv->format_name, "vmdk")) { - bdrv_unref(bs); + if (strcmp(blk_bs(blk)->drv->format_name, "vmdk")) { + blk_unref(blk); ret = -EINVAL; goto exit; } - parent_cid = vmdk_read_cid(bs, 0); - bdrv_unref(bs); + parent_cid = vmdk_read_cid(blk_bs(blk), 0); + blk_unref(blk); snprintf(parent_desc_line, BUF_SIZE, "parentFileNameHint=\"%s\"", backing_file); } @@ -2002,14 +2020,19 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp) goto exit; } } - assert(new_bs == NULL); - ret = bdrv_open(&new_bs, filename, NULL, NULL, - BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err); - if (ret < 0) { + + new_blk = blk_new_open("descriptor", filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, + &local_err); + if (new_blk == NULL) { error_propagate(errp, local_err); + ret = -EIO; goto exit; } - ret = bdrv_pwrite(new_bs, desc_offset, desc, desc_len); + + blk_set_allow_write_beyond_eof(new_blk, true); + + ret = blk_pwrite(new_blk, desc_offset, desc, desc_len); if (ret < 0) { error_setg_errno(errp, -ret, "Could not write description"); goto exit; @@ -2017,14 +2040,14 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp) /* bdrv_pwrite write padding zeros to align to sector, we don't need that * for description file */ if (desc_offset == 0) { - ret = bdrv_truncate(new_bs, desc_len); + ret = blk_truncate(new_blk, desc_len); if (ret < 0) { error_setg_errno(errp, -ret, "Could not truncate file"); } } exit: - if (new_bs) { - bdrv_unref(new_bs); + if (new_blk) { + blk_unref(new_blk); } g_free(adapter_type); g_free(backing_file); diff --git a/block/vpc.c b/block/vpc.c index f504536d1c..0d1524d6f6 100644 --- a/block/vpc.c +++ b/block/vpc.c @@ -25,6 +25,7 @@ #include "qemu/osdep.h" #include "qemu-common.h" #include "block/block_int.h" +#include "sysemu/block-backend.h" #include "qemu/module.h" #include "migration/migration.h" #if defined(CONFIG_UUID) @@ -46,8 +47,14 @@ enum vhd_type { // Seconds since Jan 1, 2000 0:00:00 (UTC) #define VHD_TIMESTAMP_BASE 946684800 +#define VHD_CHS_MAX_C 65535LL +#define VHD_CHS_MAX_H 16 +#define VHD_CHS_MAX_S 255 + #define VHD_MAX_SECTORS (65535LL * 255 * 255) -#define VHD_MAX_GEOMETRY (65535LL * 16 * 255) +#define VHD_MAX_GEOMETRY (VHD_CHS_MAX_C * VHD_CHS_MAX_H * VHD_CHS_MAX_S) + +#define VPC_OPT_FORCE_SIZE "force_size" // always big-endian typedef struct vhd_footer { @@ -128,6 +135,8 @@ typedef struct BDRVVPCState { uint32_t block_size; uint32_t bitmap_size; + bool force_use_chs; + bool force_use_sz; #ifdef CACHE uint8_t *pageentry_u8; @@ -140,6 +149,22 @@ typedef struct BDRVVPCState { Error *migration_blocker; } BDRVVPCState; +#define VPC_OPT_SIZE_CALC "force_size_calc" +static QemuOptsList vpc_runtime_opts = { + .name = "vpc-runtime-opts", + .head = QTAILQ_HEAD_INITIALIZER(vpc_runtime_opts.head), + .desc = { + { + .name = VPC_OPT_SIZE_CALC, + .type = QEMU_OPT_STRING, + .help = "Force disk size calculation to use either CHS geometry, " + "or use the disk current_size specified in the VHD footer. " + "{chs, current_size}" + }, + { /* end of list */ } + } +}; + static uint32_t vpc_checksum(uint8_t* buf, size_t size) { uint32_t res = 0; @@ -159,6 +184,25 @@ static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename) return 0; } +static void vpc_parse_options(BlockDriverState *bs, QemuOpts *opts, + Error **errp) +{ + BDRVVPCState *s = bs->opaque; + const char *size_calc; + + size_calc = qemu_opt_get(opts, VPC_OPT_SIZE_CALC); + + if (!size_calc) { + /* no override, use autodetect only */ + } else if (!strcmp(size_calc, "current_size")) { + s->force_use_sz = true; + } else if (!strcmp(size_calc, "chs")) { + s->force_use_chs = true; + } else { + error_setg(errp, "Invalid size calculation mode: '%s'", size_calc); + } +} + static int vpc_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { @@ -166,6 +210,9 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags, int i; VHDFooter *footer; VHDDynDiskHeader *dyndisk_header; + QemuOpts *opts = NULL; + Error *local_err = NULL; + bool use_chs; uint8_t buf[HEADER_SIZE]; uint32_t checksum; uint64_t computed_size; @@ -173,6 +220,21 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags, int disk_type = VHD_DYNAMIC; int ret; + opts = qemu_opts_create(&vpc_runtime_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (local_err) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto fail; + } + + vpc_parse_options(bs, opts, &local_err); + if (local_err) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto fail; + } + ret = bdrv_pread(bs->file->bs, 0, s->footer_buf, HEADER_SIZE); if (ret < 0) { goto fail; @@ -218,12 +280,36 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags, bs->total_sectors = (int64_t) be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl; - /* Images that have exactly the maximum geometry are probably bigger and - * would be truncated if we adhered to the geometry for them. Rely on - * footer->current_size for them. */ - if (bs->total_sectors == VHD_MAX_GEOMETRY) { + /* Microsoft Virtual PC and Microsoft Hyper-V produce and read + * VHD image sizes differently. VPC will rely on CHS geometry, + * while Hyper-V and disk2vhd use the size specified in the footer. + * + * We use a couple of approaches to try and determine the correct method: + * look at the Creator App field, and look for images that have CHS + * geometry that is the maximum value. + * + * If the CHS geometry is the maximum CHS geometry, then we assume that + * the size is the footer->current_size to avoid truncation. Otherwise, + * we follow the table based on footer->creator_app: + * + * Known creator apps: + * 'vpc ' : CHS Virtual PC (uses disk geometry) + * 'qemu' : CHS QEMU (uses disk geometry) + * 'qem2' : current_size QEMU (uses current_size) + * 'win ' : current_size Hyper-V + * 'd2v ' : current_size Disk2vhd + * + * The user can override the table values via drive options, however + * even with an override we will still use current_size for images + * that have CHS geometry of the maximum size. + */ + use_chs = (!!strncmp(footer->creator_app, "win ", 4) && + !!strncmp(footer->creator_app, "qem2", 4) && + !!strncmp(footer->creator_app, "d2v ", 4)) || s->force_use_chs; + + if (!use_chs || bs->total_sectors == VHD_MAX_GEOMETRY || s->force_use_sz) { bs->total_sectors = be64_to_cpu(footer->current_size) / - BDRV_SECTOR_SIZE; + BDRV_SECTOR_SIZE; } /* Allow a maximum disk size of approximately 2 TB */ @@ -673,7 +759,7 @@ static int calculate_geometry(int64_t total_sectors, uint16_t* cyls, return 0; } -static int create_dynamic_disk(BlockDriverState *bs, uint8_t *buf, +static int create_dynamic_disk(BlockBackend *blk, uint8_t *buf, int64_t total_sectors) { VHDDynDiskHeader *dyndisk_header = @@ -687,13 +773,13 @@ static int create_dynamic_disk(BlockDriverState *bs, uint8_t *buf, block_size = 0x200000; num_bat_entries = (total_sectors + block_size / 512) / (block_size / 512); - ret = bdrv_pwrite_sync(bs, offset, buf, HEADER_SIZE); + ret = blk_pwrite(blk, offset, buf, HEADER_SIZE); if (ret) { goto fail; } offset = 1536 + ((num_bat_entries * 4 + 511) & ~511); - ret = bdrv_pwrite_sync(bs, offset, buf, HEADER_SIZE); + ret = blk_pwrite(blk, offset, buf, HEADER_SIZE); if (ret < 0) { goto fail; } @@ -703,7 +789,7 @@ static int create_dynamic_disk(BlockDriverState *bs, uint8_t *buf, memset(buf, 0xFF, 512); for (i = 0; i < (num_bat_entries * 4 + 511) / 512; i++) { - ret = bdrv_pwrite_sync(bs, offset, buf, 512); + ret = blk_pwrite(blk, offset, buf, 512); if (ret < 0) { goto fail; } @@ -730,7 +816,7 @@ static int create_dynamic_disk(BlockDriverState *bs, uint8_t *buf, // Write the header offset = 512; - ret = bdrv_pwrite_sync(bs, offset, buf, 1024); + ret = blk_pwrite(blk, offset, buf, 1024); if (ret < 0) { goto fail; } @@ -739,7 +825,7 @@ static int create_dynamic_disk(BlockDriverState *bs, uint8_t *buf, return ret; } -static int create_fixed_disk(BlockDriverState *bs, uint8_t *buf, +static int create_fixed_disk(BlockBackend *blk, uint8_t *buf, int64_t total_size) { int ret; @@ -747,12 +833,12 @@ static int create_fixed_disk(BlockDriverState *bs, uint8_t *buf, /* Add footer to total size */ total_size += HEADER_SIZE; - ret = bdrv_truncate(bs, total_size); + ret = blk_truncate(blk, total_size); if (ret < 0) { return ret; } - ret = bdrv_pwrite_sync(bs, total_size - HEADER_SIZE, buf, HEADER_SIZE); + ret = blk_pwrite(blk, total_size - HEADER_SIZE, buf, HEADER_SIZE); if (ret < 0) { return ret; } @@ -773,8 +859,9 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp) int64_t total_size; int disk_type; int ret = -EIO; + bool force_size; Error *local_err = NULL; - BlockDriverState *bs = NULL; + BlockBackend *blk = NULL; /* Read out options */ total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), @@ -793,30 +880,44 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp) disk_type = VHD_DYNAMIC; } + force_size = qemu_opt_get_bool_del(opts, VPC_OPT_FORCE_SIZE, false); + ret = bdrv_create_file(filename, opts, &local_err); if (ret < 0) { error_propagate(errp, local_err); goto out; } - ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, - &local_err); - if (ret < 0) { + + blk = blk_new_open("image", filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, + &local_err); + if (blk == NULL) { error_propagate(errp, local_err); + ret = -EIO; goto out; } + blk_set_allow_write_beyond_eof(blk, true); + /* * Calculate matching total_size and geometry. Increase the number of * sectors requested until we get enough (or fail). This ensures that * qemu-img convert doesn't truncate images, but rather rounds up. * - * If the image size can't be represented by a spec conform CHS geometry, + * If the image size can't be represented by a spec conformant CHS geometry, * we set the geometry to 65535 x 16 x 255 (CxHxS) sectors and use * the image size from the VHD footer to calculate total_sectors. */ - total_sectors = MIN(VHD_MAX_GEOMETRY, total_size / BDRV_SECTOR_SIZE); - for (i = 0; total_sectors > (int64_t)cyls * heads * secs_per_cyl; i++) { - calculate_geometry(total_sectors + i, &cyls, &heads, &secs_per_cyl); + if (force_size) { + /* This will force the use of total_size for sector count, below */ + cyls = VHD_CHS_MAX_C; + heads = VHD_CHS_MAX_H; + secs_per_cyl = VHD_CHS_MAX_S; + } else { + total_sectors = MIN(VHD_MAX_GEOMETRY, total_size / BDRV_SECTOR_SIZE); + for (i = 0; total_sectors > (int64_t)cyls * heads * secs_per_cyl; i++) { + calculate_geometry(total_sectors + i, &cyls, &heads, &secs_per_cyl); + } } if ((int64_t)cyls * heads * secs_per_cyl == VHD_MAX_GEOMETRY) { @@ -835,8 +936,11 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp) memset(buf, 0, 1024); memcpy(footer->creator, "conectix", 8); - /* TODO Check if "qemu" creator_app is ok for VPC */ - memcpy(footer->creator_app, "qemu", 4); + if (force_size) { + memcpy(footer->creator_app, "qem2", 4); + } else { + memcpy(footer->creator_app, "qemu", 4); + } memcpy(footer->creator_os, "Wi2k", 4); footer->features = cpu_to_be32(0x02); @@ -866,13 +970,13 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp) footer->checksum = cpu_to_be32(vpc_checksum(buf, HEADER_SIZE)); if (disk_type == VHD_DYNAMIC) { - ret = create_dynamic_disk(bs, buf, total_sectors); + ret = create_dynamic_disk(blk, buf, total_sectors); } else { - ret = create_fixed_disk(bs, buf, total_size); + ret = create_fixed_disk(blk, buf, total_size); } out: - bdrv_unref(bs); + blk_unref(blk); g_free(disk_type_param); return ret; } @@ -917,6 +1021,13 @@ static QemuOptsList vpc_create_opts = { "Type of virtual hard disk format. Supported formats are " "{dynamic (default) | fixed} " }, + { + .name = VPC_OPT_FORCE_SIZE, + .type = QEMU_OPT_BOOL, + .help = "Force disk size calculation to use the actual size " + "specified, rather than using the nearest CHS-based " + "calculation" + }, { /* end of list */ } } }; diff --git a/blockdev.c b/blockdev.c index 0f20c6511f..322ca03908 100644 --- a/blockdev.c +++ b/blockdev.c @@ -593,13 +593,6 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts, qdict_set_default_str(bs_opts, BDRV_OPT_CACHE_DIRECT, "off"); qdict_set_default_str(bs_opts, BDRV_OPT_CACHE_NO_FLUSH, "off"); - if (snapshot) { - /* always use cache=unsafe with snapshot */ - qdict_put(bs_opts, BDRV_OPT_CACHE_WB, qstring_from_str("on")); - qdict_put(bs_opts, BDRV_OPT_CACHE_DIRECT, qstring_from_str("off")); - qdict_put(bs_opts, BDRV_OPT_CACHE_NO_FLUSH, qstring_from_str("on")); - } - if (runstate_check(RUN_STATE_INMIGRATE)) { bdrv_flags |= BDRV_O_INACTIVE; } @@ -682,6 +675,13 @@ static BlockDriverState *bds_tree_init(QDict *bs_opts, Error **errp) goto fail; } + /* bdrv_open() defaults to the values in bdrv_flags (for compatibility + * with other callers) rather than what we want as the real defaults. + * Apply the defaults here instead. */ + qdict_set_default_str(bs_opts, BDRV_OPT_CACHE_WB, "on"); + qdict_set_default_str(bs_opts, BDRV_OPT_CACHE_DIRECT, "off"); + qdict_set_default_str(bs_opts, BDRV_OPT_CACHE_NO_FLUSH, "off"); + if (runstate_check(RUN_STATE_INMIGRATE)) { bdrv_flags |= BDRV_O_INACTIVE; } @@ -1732,10 +1732,15 @@ static void external_snapshot_prepare(BlkActionState *common, /* create new image w/backing file */ mode = s->has_mode ? s->mode : NEW_IMAGE_MODE_ABSOLUTE_PATHS; if (mode != NEW_IMAGE_MODE_EXISTING) { + int64_t size = bdrv_getlength(state->old_bs); + if (size < 0) { + error_setg_errno(errp, -size, "bdrv_getlength failed"); + return; + } bdrv_img_create(new_image_file, format, state->old_bs->filename, state->old_bs->drv->format_name, - NULL, -1, flags, &local_err, false); + NULL, size, flags, &local_err, false); if (local_err) { error_propagate(errp, local_err); return; @@ -2819,6 +2824,15 @@ void hmp_drive_del(Monitor *mon, const QDict *qdict) AioContext *aio_context; Error *local_err = NULL; + bs = bdrv_find_node(id); + if (bs) { + qmp_x_blockdev_del(false, NULL, true, id, &local_err); + if (local_err) { + error_report_err(local_err); + } + return; + } + blk = blk_by_name(id); if (!blk) { error_report("Device '%s' not found", id); @@ -3870,6 +3884,36 @@ out: aio_context_release(aio_context); } +void hmp_drive_add_node(Monitor *mon, const char *optstr) +{ + QemuOpts *opts; + QDict *qdict; + Error *local_err = NULL; + + opts = qemu_opts_parse_noisily(&qemu_drive_opts, optstr, false); + if (!opts) { + return; + } + + qdict = qemu_opts_to_qdict(opts, NULL); + + if (!qdict_get_try_str(qdict, "node-name")) { + error_report("'node-name' needs to be specified"); + goto out; + } + + BlockDriverState *bs = bds_tree_init(qdict, &local_err); + if (!bs) { + error_report_err(local_err); + goto out; + } + + QTAILQ_INSERT_TAIL(&monitor_bdrv_states, bs, monitor_list); + +out: + qemu_opts_del(opts); +} + void qmp_blockdev_add(BlockdevOptions *options, Error **errp) { QmpOutputVisitor *ov = qmp_output_visitor_new(); diff --git a/cpus.c b/cpus.c index bc774e2540..4052be525f 100644 --- a/cpus.c +++ b/cpus.c @@ -370,9 +370,12 @@ static void icount_warp_rt(void) } } -static void icount_dummy_timer(void *opaque) +static void icount_timer_cb(void *opaque) { - (void)opaque; + /* No need for a checkpoint because the timer already synchronizes + * with CHECKPOINT_CLOCK_VIRTUAL_RT. + */ + icount_warp_rt(); } void qtest_clock_warp(int64_t dest) @@ -396,17 +399,12 @@ void qtest_clock_warp(int64_t dest) qemu_clock_notify(QEMU_CLOCK_VIRTUAL); } -void qemu_clock_warp(QEMUClockType type) +void qemu_start_warp_timer(void) { int64_t clock; int64_t deadline; - /* - * There are too many global variables to make the "warp" behavior - * applicable to other clocks. But a clock argument removes the - * need for if statements all over the place. - */ - if (type != QEMU_CLOCK_VIRTUAL || !use_icount) { + if (!use_icount) { return; } @@ -418,29 +416,17 @@ void qemu_clock_warp(QEMUClockType type) } /* warp clock deterministically in record/replay mode */ - if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP)) { + if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) { return; } - if (icount_sleep) { - /* - * If the CPUs have been sleeping, advance QEMU_CLOCK_VIRTUAL timer now. - * This ensures that the deadline for the timer is computed correctly - * below. - * This also makes sure that the insn counter is synchronized before - * the CPU starts running, in case the CPU is woken by an event other - * than the earliest QEMU_CLOCK_VIRTUAL timer. - */ - icount_warp_rt(); - timer_del(icount_warp_timer); - } if (!all_cpu_threads_idle()) { return; } if (qtest_enabled()) { /* When testing, qtest commands advance icount. */ - return; + return; } /* We want to use the earliest deadline from ALL vm_clocks */ @@ -496,6 +482,28 @@ void qemu_clock_warp(QEMUClockType type) } } +static void qemu_account_warp_timer(void) +{ + if (!use_icount || !icount_sleep) { + return; + } + + /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers + * do not fire, so computing the deadline does not make sense. + */ + if (!runstate_is_running()) { + return; + } + + /* warp clock deterministically in record/replay mode */ + if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) { + return; + } + + timer_del(icount_warp_timer); + icount_warp_rt(); +} + static bool icount_state_needed(void *opaque) { return use_icount; @@ -624,7 +632,7 @@ void configure_icount(QemuOpts *opts, Error **errp) icount_sleep = qemu_opt_get_bool(opts, "sleep", true); if (icount_sleep) { icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT, - icount_dummy_timer, NULL); + icount_timer_cb, NULL); } icount_align_option = qemu_opt_get_bool(opts, "align", false); @@ -995,9 +1003,6 @@ static void qemu_wait_io_event_common(CPUState *cpu) static void qemu_tcg_wait_io_event(CPUState *cpu) { while (all_cpu_threads_idle()) { - /* Start accounting real time to the virtual clock if the CPUs - are idle. */ - qemu_clock_warp(QEMU_CLOCK_VIRTUAL); qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex); } @@ -1499,7 +1504,7 @@ static void tcg_exec_all(void) int r; /* Account partial waits to QEMU_CLOCK_VIRTUAL. */ - qemu_clock_warp(QEMU_CLOCK_VIRTUAL); + qemu_account_warp_timer(); if (next_cpu == NULL) { next_cpu = first_cpu; diff --git a/device-hotplug.c b/device-hotplug.c index 9a7cd669d5..3e5cdaad10 100644 --- a/device-hotplug.c +++ b/device-hotplug.c @@ -30,6 +30,7 @@ #include "qemu/config-file.h" #include "sysemu/sysemu.h" #include "monitor/monitor.h" +#include "block/block_int.h" static DriveInfo *add_init_drive(const char *optstr) { @@ -55,6 +56,12 @@ void hmp_drive_add(Monitor *mon, const QDict *qdict) { DriveInfo *dinfo = NULL; const char *opts = qdict_get_str(qdict, "opts"); + bool node = qdict_get_try_bool(qdict, "node", false); + + if (node) { + hmp_drive_add_node(mon, opts); + return; + } dinfo = add_init_drive(opts); if (!dinfo) { diff --git a/docs/migration.txt b/docs/migration.txt index fda8d61d69..90209ab294 100644 --- a/docs/migration.txt +++ b/docs/migration.txt @@ -333,7 +333,7 @@ doesn't finish in a given time the switch is made to postcopy. To enable postcopy, issue this command on the monitor prior to the start of migration: -migrate_set_capability x-postcopy-ram on +migrate_set_capability postcopy-ram on The normal commands are then used to start a migration, which is still started in precopy mode. Issuing: diff --git a/docs/qmp-events.txt b/docs/qmp-events.txt index 4e3eb9e77a..fa7574d671 100644 --- a/docs/qmp-events.txt +++ b/docs/qmp-events.txt @@ -325,6 +325,7 @@ Emitted to report a corruption of a Quorum file. Data: +- "type": Quorum operation type - "error": Error message (json-string, optional) Only present on failure. This field contains a human-readable error message. There are no semantics other than that the @@ -336,10 +337,18 @@ Data: Example: +Read operation: { "event": "QUORUM_REPORT_BAD", - "data": { "node-name": "1.raw", "sector-num": 345435, "sectors-count": 5 }, + "data": { "node-name": "node0", "sector-num": 345435, "sectors-count": 5, + "type": "read" }, "timestamp": { "seconds": 1344522075, "microseconds": 745528 } } +Flush operation: +{ "event": "QUORUM_REPORT_BAD", + "data": { "node-name": "node0", "sector-num": 0, "sectors-count": 2097120, + "type": "flush", "error": "Broken pipe" }, + "timestamp": { "seconds": 1456406829, "microseconds": 291763 } } + Note: this event is rate-limited. RESET diff --git a/docs/replay.txt b/docs/replay.txt index 149727e2a6..3cedc25b2e 100644 --- a/docs/replay.txt +++ b/docs/replay.txt @@ -107,7 +107,7 @@ at the specified moments of time. There are several kinds of timers: sources (e.g. real time clock chip). Host clock is the one of the sources of non-determinism. Host clock read operations should be logged to make the execution deterministic. - * Real time clock for icount. This clock is similar to real time clock but + * Virtual real time clock. This clock is similar to real time clock but it is used only for increasing virtual clock while virtual machine is sleeping. Due to its nature it is also non-deterministic as the host clock and has to be logged too. @@ -134,11 +134,20 @@ of time. That's why we do not process a group of timers until the checkpoint event will be read from the log. Such an event allows synchronizing CPU execution and timer events. -Another checkpoints application in record/replay is instruction counting -while the virtual machine is idle. This function (qemu_clock_warp) is called -from the wait loop. It changes virtual machine state and must be deterministic -then. That is why we added checkpoint to this function to prevent its -operation in replay mode when it does not correspond to record mode. +Two other checkpoints govern the "warping" of the virtual clock. +While the virtual machine is idle, the virtual clock increments at +1 ns per *real time* nanosecond. This is done by setting up a timer +(called the warp timer) on the virtual real time clock, so that the +timer fires at the next deadline of the virtual clock; the virtual clock +is then incremented (which is called "warping" the virtual clock) as +soon as the timer fires or the CPUs need to go out of the idle state. +Two functions are used for this purpose; because these actions change +virtual machine state and must be deterministic, each of them creates a +checkpoint. qemu_start_warp_timer checks if the CPUs are idle and if so +starts accounting real time to virtual clock. qemu_account_warp_timer +is called when the CPUs get an interrupt or when the warp timer fires, +and it warps the virtual clock by the amount of real time that has passed +since qemu_start_warp_timer. Bottom halves ------------- diff --git a/exec.c b/exec.c index f09dd4e928..f398d212f6 100644 --- a/exec.c +++ b/exec.c @@ -1228,92 +1228,83 @@ void qemu_mutex_unlock_ramlist(void) } #ifdef __linux__ - -#include <sys/vfs.h> - -#define HUGETLBFS_MAGIC 0x958458f6 - -static long gethugepagesize(const char *path, Error **errp) -{ - struct statfs fs; - int ret; - - do { - ret = statfs(path, &fs); - } while (ret != 0 && errno == EINTR); - - if (ret != 0) { - error_setg_errno(errp, errno, "failed to get page size of file %s", - path); - return 0; - } - - return fs.f_bsize; -} - static void *file_ram_alloc(RAMBlock *block, ram_addr_t memory, const char *path, Error **errp) { - struct stat st; + bool unlink_on_error = false; char *filename; char *sanitized_name; char *c; void *area; int fd; - uint64_t hpagesize; - Error *local_err = NULL; - - hpagesize = gethugepagesize(path, &local_err); - if (local_err) { - error_propagate(errp, local_err); - goto error; - } - block->mr->align = hpagesize; - - if (memory < hpagesize) { - error_setg(errp, "memory size 0x" RAM_ADDR_FMT " must be equal to " - "or larger than huge page size 0x%" PRIx64, - memory, hpagesize); - goto error; - } + int64_t page_size; if (kvm_enabled() && !kvm_has_sync_mmu()) { error_setg(errp, "host lacks kvm mmu notifiers, -mem-path unsupported"); - goto error; + return NULL; } - if (!stat(path, &st) && S_ISDIR(st.st_mode)) { - /* Make name safe to use with mkstemp by replacing '/' with '_'. */ - sanitized_name = g_strdup(memory_region_name(block->mr)); - for (c = sanitized_name; *c != '\0'; c++) { - if (*c == '/') { - *c = '_'; - } + for (;;) { + fd = open(path, O_RDWR); + if (fd >= 0) { + /* @path names an existing file, use it */ + break; } + if (errno == ENOENT) { + /* @path names a file that doesn't exist, create it */ + fd = open(path, O_RDWR | O_CREAT | O_EXCL, 0644); + if (fd >= 0) { + unlink_on_error = true; + break; + } + } else if (errno == EISDIR) { + /* @path names a directory, create a file there */ + /* Make name safe to use with mkstemp by replacing '/' with '_'. */ + sanitized_name = g_strdup(memory_region_name(block->mr)); + for (c = sanitized_name; *c != '\0'; c++) { + if (*c == '/') { + *c = '_'; + } + } - filename = g_strdup_printf("%s/qemu_back_mem.%s.XXXXXX", path, - sanitized_name); - g_free(sanitized_name); + filename = g_strdup_printf("%s/qemu_back_mem.%s.XXXXXX", path, + sanitized_name); + g_free(sanitized_name); - fd = mkstemp(filename); - if (fd >= 0) { - unlink(filename); + fd = mkstemp(filename); + if (fd >= 0) { + unlink(filename); + g_free(filename); + break; + } + g_free(filename); } - g_free(filename); - } else { - fd = open(path, O_RDWR | O_CREAT, 0644); + if (errno != EEXIST && errno != EINTR) { + error_setg_errno(errp, errno, + "can't open backing store %s for guest RAM", + path); + goto error; + } + /* + * Try again on EINTR and EEXIST. The latter happens when + * something else creates the file between our two open(). + */ } - if (fd < 0) { - error_setg_errno(errp, errno, - "unable to create backing store for hugepages"); + page_size = qemu_fd_getpagesize(fd); + block->mr->align = page_size; + + if (memory < page_size) { + error_setg(errp, "memory size 0x" RAM_ADDR_FMT " must be equal to " + "or larger than page size 0x%" PRIx64, + memory, page_size); goto error; } - memory = ROUND_UP(memory, hpagesize); + memory = ROUND_UP(memory, page_size); /* * ftruncate is not supported by hugetlbfs in older @@ -1325,10 +1316,10 @@ static void *file_ram_alloc(RAMBlock *block, perror("ftruncate"); } - area = qemu_ram_mmap(fd, memory, hpagesize, block->flags & RAM_SHARED); + area = qemu_ram_mmap(fd, memory, page_size, block->flags & RAM_SHARED); if (area == MAP_FAILED) { error_setg_errno(errp, errno, - "unable to map backing store for hugepages"); + "unable to map backing store for guest RAM"); close(fd); goto error; } @@ -1341,6 +1332,10 @@ static void *file_ram_alloc(RAMBlock *block, return area; error: + if (unlink_on_error) { + unlink(path); + } + close(fd); return NULL; } #endif @@ -1594,6 +1589,7 @@ static void ram_block_add(RAMBlock *new_block, Error **errp) if (err) { error_propagate(errp, err); qemu_mutex_unlock_ramlist(); + return; } } else { new_block->host = phys_mem_alloc(new_block->max_length, @@ -1603,6 +1599,7 @@ static void ram_block_add(RAMBlock *new_block, Error **errp) "cannot set up guest memory '%s'", memory_region_name(new_block->mr)); qemu_mutex_unlock_ramlist(); + return; } memory_try_enable_merging(new_block->host, new_block->max_length); } diff --git a/gdbstub.c b/gdbstub.c index 61c12b168e..fdcb0eea8f 100644 --- a/gdbstub.c +++ b/gdbstub.c @@ -1752,7 +1752,7 @@ int gdbserver_start(const char *device) sigaction(SIGINT, &act, NULL); } #endif - chr = qemu_chr_new("gdb", device, NULL); + chr = qemu_chr_new_noreplay("gdb", device, NULL); if (!chr) return -1; diff --git a/hmp-commands.hx b/hmp-commands.hx index 664d794f29..4f4f60a0df 100644 --- a/hmp-commands.hx +++ b/hmp-commands.hx @@ -1026,7 +1026,7 @@ ETEXI .args_type = "", .params = "", .help = "Followup to a migration command to switch the migration" - " to postcopy mode. The x-postcopy-ram capability must " + " to postcopy mode. The postcopy-ram capability must " "be set before the original migration command.", .mhandler.cmd = hmp_migrate_start_postcopy, }, @@ -1201,8 +1201,8 @@ ETEXI { .name = "drive_add", - .args_type = "pci_addr:s,opts:s", - .params = "[[<domain>:]<bus>:]<slot>\n" + .args_type = "node:-n,pci_addr:s,opts:s", + .params = "[-n] [[<domain>:]<bus>:]<slot>\n" "[file=file][,if=type][,bus=n]\n" "[,unit=m][,media=d][,index=i]\n" "[,cyls=c,heads=h,secs=s[,trans=t]]\n" diff --git a/hw/arm/sysbus-fdt.c b/hw/arm/sysbus-fdt.c index 04afeae226..49bd212d07 100644 --- a/hw/arm/sysbus-fdt.c +++ b/hw/arm/sysbus-fdt.c @@ -240,7 +240,7 @@ static int add_calxeda_midway_xgmac_fdt_node(SysBusDevice *sbdev, void *opaque) mmio_base = platform_bus_get_mmio_addr(pbus, sbdev, i); reg_attr[2 * i] = cpu_to_be32(mmio_base); reg_attr[2 * i + 1] = cpu_to_be32( - memory_region_size(&vdev->regions[i]->mem)); + memory_region_size(vdev->regions[i]->mem)); } qemu_fdt_setprop(fdt, nodename, "reg", reg_attr, vbasedev->num_regions * 2 * sizeof(uint32_t)); @@ -374,7 +374,7 @@ static int add_amd_xgbe_fdt_node(SysBusDevice *sbdev, void *opaque) mmio_base = platform_bus_get_mmio_addr(pbus, sbdev, i); reg_attr[2 * i] = cpu_to_be32(mmio_base); reg_attr[2 * i + 1] = cpu_to_be32( - memory_region_size(&vdev->regions[i]->mem)); + memory_region_size(vdev->regions[i]->mem)); } qemu_fdt_setprop(guest_fdt, nodename, "reg", reg_attr, vbasedev->num_regions * 2 * sizeof(uint32_t)); diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index 17915b05c4..9ee939b4c2 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -39,6 +39,7 @@ #include "hw/kvm/clock.h" #include "hw/pci-host/q35.h" #include "exec/address-spaces.h" +#include "hw/i386/pc.h" #include "hw/i386/ich9.h" #include "hw/smbios/smbios.h" #include "hw/ide/pci.h" @@ -146,7 +147,7 @@ static void pc_q35_init(MachineState *machine) /* irq lines */ gsi_state = g_malloc0(sizeof(*gsi_state)); - if (kvm_irqchip_in_kernel()) { + if (kvm_ioapic_in_kernel()) { kvm_pc_setup_irq_routing(pcmc->pci_enabled); gsi = qemu_allocate_irqs(kvm_pc_gsi_handler, gsi_state, GSI_NUM_PINS); @@ -193,7 +194,7 @@ static void pc_q35_init(MachineState *machine) /*end early*/ isa_bus_irqs(isa_bus, gsi); - if (kvm_irqchip_in_kernel()) { + if (kvm_pic_in_kernel()) { i8259 = kvm_i8259_init(isa_bus); } else if (xen_enabled()) { i8259 = xen_interrupt_controller_init(); diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c index 89f5d0d6a6..3d8c3c4fa8 100644 --- a/hw/s390x/s390-virtio-ccw.c +++ b/hw/s390x/s390-virtio-ccw.c @@ -22,20 +22,7 @@ #include "s390-pci-bus.h" #include "hw/s390x/storage-keys.h" #include "hw/compat.h" - -#define TYPE_S390_CCW_MACHINE "s390-ccw-machine" - -#define S390_CCW_MACHINE(obj) \ - OBJECT_CHECK(S390CcwMachineState, (obj), TYPE_S390_CCW_MACHINE) - -typedef struct S390CcwMachineState { - /*< private >*/ - MachineState parent_obj; - - /*< public >*/ - bool aes_key_wrap; - bool dea_key_wrap; -} S390CcwMachineState; +#include "hw/s390x/s390-virtio-ccw.h" static const char *const reset_dev_types[] = { "virtual-css-bridge", @@ -136,7 +123,7 @@ static void ccw_init(MachineState *machine) virtio_ccw_register_hcalls(); /* init CPUs */ - s390_init_cpus(machine->cpu_model); + s390_init_cpus(machine); if (kvm_enabled()) { kvm_s390_enable_css_support(s390_cpu_addr2state(0)); @@ -156,13 +143,54 @@ static void ccw_init(MachineState *machine) gtod_save, gtod_load, kvm_state); } +static void s390_cpu_plug(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + gchar *name; + S390CPU *cpu = S390_CPU(dev); + CPUState *cs = CPU(dev); + + name = g_strdup_printf("cpu[%i]", cpu->env.cpu_num); + object_property_set_link(OBJECT(hotplug_dev), OBJECT(cs), name, + errp); + g_free(name); +} + +static void s390_machine_device_plug(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { + s390_cpu_plug(hotplug_dev, dev, errp); + } +} + +static HotplugHandler *s390_get_hotplug_handler(MachineState *machine, + DeviceState *dev) +{ + if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { + return HOTPLUG_HANDLER(machine); + } + return NULL; +} + +static void s390_hot_add_cpu(const int64_t id, Error **errp) +{ + MachineState *machine = MACHINE(qdev_get_machine()); + Error *err = NULL; + + s390x_new_cpu(machine->cpu_model, id, &err); + error_propagate(errp, err); +} + static void ccw_machine_class_init(ObjectClass *oc, void *data) { MachineClass *mc = MACHINE_CLASS(oc); NMIClass *nc = NMI_CLASS(oc); + HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc); mc->init = ccw_init; mc->reset = s390_machine_reset; + mc->hot_add_cpu = s390_hot_add_cpu; mc->block_default_type = IF_VIRTIO; mc->no_cdrom = 1; mc->no_floppy = 1; @@ -171,6 +199,8 @@ static void ccw_machine_class_init(ObjectClass *oc, void *data) mc->no_sdcard = 1; mc->use_sclp = 1; mc->max_cpus = 255; + mc->get_hotplug_handler = s390_get_hotplug_handler; + hc->plug = s390_machine_device_plug; nc->nmi_monitor_handler = s390_nmi; } @@ -232,10 +262,40 @@ static const TypeInfo ccw_machine_info = { .class_init = ccw_machine_class_init, .interfaces = (InterfaceInfo[]) { { TYPE_NMI }, + { TYPE_HOTPLUG_HANDLER}, { } }, }; +#define DEFINE_CCW_MACHINE(suffix, verstr, latest) \ + static void ccw_machine_##suffix##_class_init(ObjectClass *oc, \ + void *data) \ + { \ + MachineClass *mc = MACHINE_CLASS(oc); \ + ccw_machine_##suffix##_class_options(mc); \ + mc->desc = "VirtIO-ccw based S390 machine v" verstr; \ + if (latest) { \ + mc->alias = "s390-ccw-virtio"; \ + mc->is_default = 1; \ + } \ + } \ + static void ccw_machine_##suffix##_instance_init(Object *obj) \ + { \ + MachineState *machine = MACHINE(obj); \ + ccw_machine_##suffix##_instance_options(machine); \ + } \ + static const TypeInfo ccw_machine_##suffix##_info = { \ + .name = MACHINE_TYPE_NAME("s390-ccw-virtio-" verstr), \ + .parent = TYPE_S390_CCW_MACHINE, \ + .class_init = ccw_machine_##suffix##_class_init, \ + .instance_init = ccw_machine_##suffix##_instance_init, \ + }; \ + static void ccw_machine_register_##suffix(void) \ + { \ + type_register_static(&ccw_machine_##suffix##_info); \ + } \ + machine_init(ccw_machine_register_##suffix) + #define CCW_COMPAT_2_5 \ HW_COMPAT_2_5 @@ -280,63 +340,39 @@ static const TypeInfo ccw_machine_info = { .value = "0",\ }, -static void ccw_machine_2_4_class_init(ObjectClass *oc, void *data) +static void ccw_machine_2_6_instance_options(MachineState *machine) { - MachineClass *mc = MACHINE_CLASS(oc); - static GlobalProperty compat_props[] = { - CCW_COMPAT_2_4 - { /* end of list */ } - }; - - mc->desc = "VirtIO-ccw based S390 machine v2.4"; - mc->compat_props = compat_props; } -static const TypeInfo ccw_machine_2_4_info = { - .name = MACHINE_TYPE_NAME("s390-ccw-virtio-2.4"), - .parent = TYPE_S390_CCW_MACHINE, - .class_init = ccw_machine_2_4_class_init, -}; - -static void ccw_machine_2_5_class_init(ObjectClass *oc, void *data) +static void ccw_machine_2_6_class_options(MachineClass *mc) { - MachineClass *mc = MACHINE_CLASS(oc); - static GlobalProperty compat_props[] = { - CCW_COMPAT_2_5 - { /* end of list */ } - }; - - mc->desc = "VirtIO-ccw based S390 machine v2.5"; - mc->compat_props = compat_props; } +DEFINE_CCW_MACHINE(2_6, "2.6", true); -static const TypeInfo ccw_machine_2_5_info = { - .name = MACHINE_TYPE_NAME("s390-ccw-virtio-2.5"), - .parent = TYPE_S390_CCW_MACHINE, - .class_init = ccw_machine_2_5_class_init, -}; +static void ccw_machine_2_5_instance_options(MachineState *machine) +{ +} -static void ccw_machine_2_6_class_init(ObjectClass *oc, void *data) +static void ccw_machine_2_5_class_options(MachineClass *mc) { - MachineClass *mc = MACHINE_CLASS(oc); + SET_MACHINE_COMPAT(mc, CCW_COMPAT_2_5); +} +DEFINE_CCW_MACHINE(2_5, "2.5", false); - mc->alias = "s390-ccw-virtio"; - mc->desc = "VirtIO-ccw based S390 machine v2.6"; - mc->is_default = 1; +static void ccw_machine_2_4_instance_options(MachineState *machine) +{ + ccw_machine_2_5_instance_options(machine); } -static const TypeInfo ccw_machine_2_6_info = { - .name = MACHINE_TYPE_NAME("s390-ccw-virtio-2.6"), - .parent = TYPE_S390_CCW_MACHINE, - .class_init = ccw_machine_2_6_class_init, -}; +static void ccw_machine_2_4_class_options(MachineClass *mc) +{ + SET_MACHINE_COMPAT(mc, CCW_COMPAT_2_4); +} +DEFINE_CCW_MACHINE(2_4, "2.4", false); static void ccw_machine_register_types(void) { type_register_static(&ccw_machine_info); - type_register_static(&ccw_machine_2_4_info); - type_register_static(&ccw_machine_2_5_info); - type_register_static(&ccw_machine_2_6_info); } type_init(ccw_machine_register_types) diff --git a/hw/s390x/s390-virtio.c b/hw/s390x/s390-virtio.c index 8e533ae88a..7c6e281af1 100644 --- a/hw/s390x/s390-virtio.c +++ b/hw/s390x/s390-virtio.c @@ -58,15 +58,16 @@ #define S390_TOD_CLOCK_VALUE_MISSING 0x00 #define S390_TOD_CLOCK_VALUE_PRESENT 0x01 -static S390CPU **ipi_states; +static S390CPU **cpu_states; S390CPU *s390_cpu_addr2state(uint16_t cpu_addr) { - if (cpu_addr >= smp_cpus) { + if (cpu_addr >= max_cpus) { return NULL; } - return ipi_states[cpu_addr]; + /* Fast lookup via CPU ID */ + return cpu_states[cpu_addr]; } void s390_init_ipl_dev(const char *kernel_filename, @@ -93,26 +94,29 @@ void s390_init_ipl_dev(const char *kernel_filename, qdev_init_nofail(dev); } -void s390_init_cpus(const char *cpu_model) +void s390_init_cpus(MachineState *machine) { int i; + gchar *name; - if (cpu_model == NULL) { - cpu_model = "host"; + if (machine->cpu_model == NULL) { + machine->cpu_model = "host"; } - ipi_states = g_malloc(sizeof(S390CPU *) * smp_cpus); + cpu_states = g_new0(S390CPU *, max_cpus); - for (i = 0; i < smp_cpus; i++) { - S390CPU *cpu; - CPUState *cs; - - cpu = cpu_s390x_init(cpu_model); - cs = CPU(cpu); + for (i = 0; i < max_cpus; i++) { + name = g_strdup_printf("cpu[%i]", i); + object_property_add_link(OBJECT(machine), name, TYPE_S390_CPU, + (Object **) &cpu_states[i], + object_property_allow_set_link, + OBJ_PROP_LINK_UNREF_ON_RELEASE, + &error_abort); + g_free(name); + } - ipi_states[i] = cpu; - cs->halted = 1; - cs->exception_index = EXCP_HLT; + for (i = 0; i < smp_cpus; i++) { + s390x_new_cpu(machine->cpu_model, i, &error_fatal); } } diff --git a/hw/s390x/s390-virtio.h b/hw/s390x/s390-virtio.h index eebce8e5e6..ffd014cb5b 100644 --- a/hw/s390x/s390-virtio.h +++ b/hw/s390x/s390-virtio.h @@ -19,7 +19,7 @@ typedef int (*s390_virtio_fn)(const uint64_t *args); void s390_register_virtio_hypercall(uint64_t code, s390_virtio_fn fn); -void s390_init_cpus(const char *cpu_model); +void s390_init_cpus(MachineState *machine); void s390_init_ipl_dev(const char *kernel_filename, const char *kernel_cmdline, const char *initrd_filename, diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 607ec70be3..96ccb797fe 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -493,46 +493,162 @@ static void vfio_listener_release(VFIOContainer *container) memory_listener_unregister(&container->listener); } -int vfio_mmap_region(Object *obj, VFIORegion *region, - MemoryRegion *mem, MemoryRegion *submem, - void **map, size_t size, off_t offset, - const char *name) +int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region, + int index, const char *name) { - int ret = 0; - VFIODevice *vbasedev = region->vbasedev; + struct vfio_region_info *info; + int ret; + + ret = vfio_get_region_info(vbasedev, index, &info); + if (ret) { + return ret; + } + + region->vbasedev = vbasedev; + region->flags = info->flags; + region->size = info->size; + region->fd_offset = info->offset; + region->nr = index; + + if (region->size) { + region->mem = g_new0(MemoryRegion, 1); + memory_region_init_io(region->mem, obj, &vfio_region_ops, + region, name, region->size); - if (!vbasedev->no_mmap && size && region->flags & - VFIO_REGION_INFO_FLAG_MMAP) { - int prot = 0; + if (!vbasedev->no_mmap && + region->flags & VFIO_REGION_INFO_FLAG_MMAP && + !(region->size & ~qemu_real_host_page_mask)) { - if (region->flags & VFIO_REGION_INFO_FLAG_READ) { - prot |= PROT_READ; + region->nr_mmaps = 1; + region->mmaps = g_new0(VFIOMmap, region->nr_mmaps); + + region->mmaps[0].offset = 0; + region->mmaps[0].size = region->size; } + } + + g_free(info); + + trace_vfio_region_setup(vbasedev->name, index, name, + region->flags, region->fd_offset, region->size); + return 0; +} + +int vfio_region_mmap(VFIORegion *region) +{ + int i, prot = 0; + char *name; - if (region->flags & VFIO_REGION_INFO_FLAG_WRITE) { - prot |= PROT_WRITE; + if (!region->mem) { + return 0; + } + + prot |= region->flags & VFIO_REGION_INFO_FLAG_READ ? PROT_READ : 0; + prot |= region->flags & VFIO_REGION_INFO_FLAG_WRITE ? PROT_WRITE : 0; + + for (i = 0; i < region->nr_mmaps; i++) { + region->mmaps[i].mmap = mmap(NULL, region->mmaps[i].size, prot, + MAP_SHARED, region->vbasedev->fd, + region->fd_offset + + region->mmaps[i].offset); + if (region->mmaps[i].mmap == MAP_FAILED) { + int ret = -errno; + + trace_vfio_region_mmap_fault(memory_region_name(region->mem), i, + region->fd_offset + + region->mmaps[i].offset, + region->fd_offset + + region->mmaps[i].offset + + region->mmaps[i].size - 1, ret); + + region->mmaps[i].mmap = NULL; + + for (i--; i >= 0; i--) { + memory_region_del_subregion(region->mem, ®ion->mmaps[i].mem); + munmap(region->mmaps[i].mmap, region->mmaps[i].size); + object_unparent(OBJECT(®ion->mmaps[i].mem)); + region->mmaps[i].mmap = NULL; + } + + return ret; } - *map = mmap(NULL, size, prot, MAP_SHARED, - vbasedev->fd, - region->fd_offset + offset); - if (*map == MAP_FAILED) { - *map = NULL; - ret = -errno; - goto empty_region; + name = g_strdup_printf("%s mmaps[%d]", + memory_region_name(region->mem), i); + memory_region_init_ram_ptr(®ion->mmaps[i].mem, + memory_region_owner(region->mem), + name, region->mmaps[i].size, + region->mmaps[i].mmap); + g_free(name); + memory_region_set_skip_dump(®ion->mmaps[i].mem); + memory_region_add_subregion(region->mem, region->mmaps[i].offset, + ®ion->mmaps[i].mem); + + trace_vfio_region_mmap(memory_region_name(®ion->mmaps[i].mem), + region->mmaps[i].offset, + region->mmaps[i].offset + + region->mmaps[i].size - 1); + } + + return 0; +} + +void vfio_region_exit(VFIORegion *region) +{ + int i; + + if (!region->mem) { + return; + } + + for (i = 0; i < region->nr_mmaps; i++) { + if (region->mmaps[i].mmap) { + memory_region_del_subregion(region->mem, ®ion->mmaps[i].mem); } + } - memory_region_init_ram_ptr(submem, obj, name, size, *map); - memory_region_set_skip_dump(submem); - } else { -empty_region: - /* Create a zero sized sub-region to make cleanup easy. */ - memory_region_init(submem, obj, name, 0); + trace_vfio_region_exit(region->vbasedev->name, region->nr); +} + +void vfio_region_finalize(VFIORegion *region) +{ + int i; + + if (!region->mem) { + return; + } + + for (i = 0; i < region->nr_mmaps; i++) { + if (region->mmaps[i].mmap) { + munmap(region->mmaps[i].mmap, region->mmaps[i].size); + object_unparent(OBJECT(®ion->mmaps[i].mem)); + } } - memory_region_add_subregion(mem, offset, submem); + object_unparent(OBJECT(region->mem)); - return ret; + g_free(region->mem); + g_free(region->mmaps); + + trace_vfio_region_finalize(region->vbasedev->name, region->nr); +} + +void vfio_region_mmaps_set_enabled(VFIORegion *region, bool enabled) +{ + int i; + + if (!region->mem) { + return; + } + + for (i = 0; i < region->nr_mmaps; i++) { + if (region->mmaps[i].mmap) { + memory_region_set_enabled(®ion->mmaps[i].mem, enabled); + } + } + + trace_vfio_region_mmaps_set_enabled(memory_region_name(region->mem), + enabled); } void vfio_reset_handler(void *opaque) @@ -959,6 +1075,24 @@ void vfio_put_base_device(VFIODevice *vbasedev) close(vbasedev->fd); } +int vfio_get_region_info(VFIODevice *vbasedev, int index, + struct vfio_region_info **info) +{ + size_t argsz = sizeof(struct vfio_region_info); + + *info = g_malloc0(argsz); + + (*info)->index = index; + (*info)->argsz = argsz; + + if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, *info)) { + g_free(*info); + return -errno; + } + + return 0; +} + static int vfio_container_do_ioctl(AddressSpace *as, int32_t groupid, int req, void *param) { diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c index 48155277c6..49ecf1172a 100644 --- a/hw/vfio/pci-quirks.c +++ b/hw/vfio/pci-quirks.c @@ -290,10 +290,10 @@ static void vfio_vga_probe_ati_3c3_quirk(VFIOPCIDevice *vdev) memory_region_init_io(quirk->mem, OBJECT(vdev), &vfio_ati_3c3_quirk, vdev, "vfio-ati-3c3-quirk", 1); - memory_region_add_subregion(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem, + memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem, 3 /* offset 3 bytes from 0x3c0 */, quirk->mem); - QLIST_INSERT_HEAD(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].quirks, + QLIST_INSERT_HEAD(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks, quirk, next); trace_vfio_quirk_ati_3c3_probe(vdev->vbasedev.name); @@ -337,14 +337,14 @@ static void vfio_probe_ati_bar4_quirk(VFIOPCIDevice *vdev, int nr) memory_region_init_io(window->addr_mem, OBJECT(vdev), &vfio_generic_window_address_quirk, window, "vfio-ati-bar4-window-address-quirk", 4); - memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem, + memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, window->address_offset, window->addr_mem, 1); memory_region_init_io(window->data_mem, OBJECT(vdev), &vfio_generic_window_data_quirk, window, "vfio-ati-bar4-window-data-quirk", 4); - memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem, + memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, window->data_offset, window->data_mem, 1); @@ -378,7 +378,7 @@ static void vfio_probe_ati_bar2_quirk(VFIOPCIDevice *vdev, int nr) memory_region_init_io(mirror->mem, OBJECT(vdev), &vfio_generic_mirror_quirk, mirror, "vfio-ati-bar2-4000-quirk", PCI_CONFIG_SPACE_SIZE); - memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem, + memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, mirror->offset, mirror->mem, 1); QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); @@ -428,7 +428,7 @@ static uint64_t vfio_nvidia_3d4_quirk_read(void *opaque, quirk->state = NONE; - return vfio_vga_read(&vdev->vga.region[QEMU_PCI_VGA_IO_HI], + return vfio_vga_read(&vdev->vga->region[QEMU_PCI_VGA_IO_HI], addr + 0x14, size); } @@ -465,7 +465,7 @@ static void vfio_nvidia_3d4_quirk_write(void *opaque, hwaddr addr, break; } - vfio_vga_write(&vdev->vga.region[QEMU_PCI_VGA_IO_HI], + vfio_vga_write(&vdev->vga->region[QEMU_PCI_VGA_IO_HI], addr + 0x14, data, size); } @@ -481,7 +481,7 @@ static uint64_t vfio_nvidia_3d0_quirk_read(void *opaque, VFIONvidia3d0Quirk *quirk = opaque; VFIOPCIDevice *vdev = quirk->vdev; VFIONvidia3d0State old_state = quirk->state; - uint64_t data = vfio_vga_read(&vdev->vga.region[QEMU_PCI_VGA_IO_HI], + uint64_t data = vfio_vga_read(&vdev->vga->region[QEMU_PCI_VGA_IO_HI], addr + 0x10, size); quirk->state = NONE; @@ -523,7 +523,7 @@ static void vfio_nvidia_3d0_quirk_write(void *opaque, hwaddr addr, } } - vfio_vga_write(&vdev->vga.region[QEMU_PCI_VGA_IO_HI], + vfio_vga_write(&vdev->vga->region[QEMU_PCI_VGA_IO_HI], addr + 0x10, data, size); } @@ -551,15 +551,15 @@ static void vfio_vga_probe_nvidia_3d0_quirk(VFIOPCIDevice *vdev) memory_region_init_io(&quirk->mem[0], OBJECT(vdev), &vfio_nvidia_3d4_quirk, data, "vfio-nvidia-3d4-quirk", 2); - memory_region_add_subregion(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem, + memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem, 0x14 /* 0x3c0 + 0x14 */, &quirk->mem[0]); memory_region_init_io(&quirk->mem[1], OBJECT(vdev), &vfio_nvidia_3d0_quirk, data, "vfio-nvidia-3d0-quirk", 2); - memory_region_add_subregion(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem, + memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem, 0x10 /* 0x3c0 + 0x10 */, &quirk->mem[1]); - QLIST_INSERT_HEAD(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].quirks, + QLIST_INSERT_HEAD(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks, quirk, next); trace_vfio_quirk_nvidia_3d0_probe(vdev->vbasedev.name); @@ -683,7 +683,7 @@ static void vfio_probe_nvidia_bar5_quirk(VFIOPCIDevice *vdev, int nr) memory_region_init_io(window->addr_mem, OBJECT(vdev), &vfio_generic_window_address_quirk, window, "vfio-nvidia-bar5-window-address-quirk", 4); - memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem, + memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, window->address_offset, window->addr_mem, 1); memory_region_set_enabled(window->addr_mem, false); @@ -691,7 +691,7 @@ static void vfio_probe_nvidia_bar5_quirk(VFIOPCIDevice *vdev, int nr) memory_region_init_io(window->data_mem, OBJECT(vdev), &vfio_generic_window_data_quirk, window, "vfio-nvidia-bar5-window-data-quirk", 4); - memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem, + memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, window->data_offset, window->data_mem, 1); memory_region_set_enabled(window->data_mem, false); @@ -699,13 +699,13 @@ static void vfio_probe_nvidia_bar5_quirk(VFIOPCIDevice *vdev, int nr) memory_region_init_io(&quirk->mem[2], OBJECT(vdev), &vfio_nvidia_bar5_quirk_master, bar5, "vfio-nvidia-bar5-master-quirk", 4); - memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem, + memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, 0, &quirk->mem[2], 1); memory_region_init_io(&quirk->mem[3], OBJECT(vdev), &vfio_nvidia_bar5_quirk_enable, bar5, "vfio-nvidia-bar5-enable-quirk", 4); - memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem, + memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, 4, &quirk->mem[3], 1); QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); @@ -767,7 +767,7 @@ static void vfio_probe_nvidia_bar0_quirk(VFIOPCIDevice *vdev, int nr) &vfio_nvidia_mirror_quirk, mirror, "vfio-nvidia-bar0-88000-mirror-quirk", vdev->config_size); - memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem, + memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, mirror->offset, mirror->mem, 1); QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); @@ -786,7 +786,7 @@ static void vfio_probe_nvidia_bar0_quirk(VFIOPCIDevice *vdev, int nr) &vfio_nvidia_mirror_quirk, mirror, "vfio-nvidia-bar0-1800-mirror-quirk", PCI_CONFIG_SPACE_SIZE); - memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem, + memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, mirror->offset, mirror->mem, 1); QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); @@ -947,13 +947,13 @@ static void vfio_probe_rtl8168_bar2_quirk(VFIOPCIDevice *vdev, int nr) memory_region_init_io(&quirk->mem[0], OBJECT(vdev), &vfio_rtl_address_quirk, rtl, "vfio-rtl8168-window-address-quirk", 4); - memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem, + memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, 0x74, &quirk->mem[0], 1); memory_region_init_io(&quirk->mem[1], OBJECT(vdev), &vfio_rtl_data_quirk, rtl, "vfio-rtl8168-window-data-quirk", 4); - memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem, + memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, 0x70, &quirk->mem[1], 1); QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); @@ -970,28 +970,28 @@ void vfio_vga_quirk_setup(VFIOPCIDevice *vdev) vfio_vga_probe_nvidia_3d0_quirk(vdev); } -void vfio_vga_quirk_teardown(VFIOPCIDevice *vdev) +void vfio_vga_quirk_exit(VFIOPCIDevice *vdev) { VFIOQuirk *quirk; int i, j; - for (i = 0; i < ARRAY_SIZE(vdev->vga.region); i++) { - QLIST_FOREACH(quirk, &vdev->vga.region[i].quirks, next) { + for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) { + QLIST_FOREACH(quirk, &vdev->vga->region[i].quirks, next) { for (j = 0; j < quirk->nr_mem; j++) { - memory_region_del_subregion(&vdev->vga.region[i].mem, + memory_region_del_subregion(&vdev->vga->region[i].mem, &quirk->mem[j]); } } } } -void vfio_vga_quirk_free(VFIOPCIDevice *vdev) +void vfio_vga_quirk_finalize(VFIOPCIDevice *vdev) { int i, j; - for (i = 0; i < ARRAY_SIZE(vdev->vga.region); i++) { - while (!QLIST_EMPTY(&vdev->vga.region[i].quirks)) { - VFIOQuirk *quirk = QLIST_FIRST(&vdev->vga.region[i].quirks); + for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) { + while (!QLIST_EMPTY(&vdev->vga->region[i].quirks)) { + VFIOQuirk *quirk = QLIST_FIRST(&vdev->vga->region[i].quirks); QLIST_REMOVE(quirk, next); for (j = 0; j < quirk->nr_mem; j++) { object_unparent(OBJECT(&quirk->mem[j])); @@ -1012,7 +1012,7 @@ void vfio_bar_quirk_setup(VFIOPCIDevice *vdev, int nr) vfio_probe_rtl8168_bar2_quirk(vdev, nr); } -void vfio_bar_quirk_teardown(VFIOPCIDevice *vdev, int nr) +void vfio_bar_quirk_exit(VFIOPCIDevice *vdev, int nr) { VFIOBAR *bar = &vdev->bars[nr]; VFIOQuirk *quirk; @@ -1020,12 +1020,12 @@ void vfio_bar_quirk_teardown(VFIOPCIDevice *vdev, int nr) QLIST_FOREACH(quirk, &bar->quirks, next) { for (i = 0; i < quirk->nr_mem; i++) { - memory_region_del_subregion(&bar->region.mem, &quirk->mem[i]); + memory_region_del_subregion(bar->region.mem, &quirk->mem[i]); } } } -void vfio_bar_quirk_free(VFIOPCIDevice *vdev, int nr) +void vfio_bar_quirk_finalize(VFIOPCIDevice *vdev, int nr) { VFIOBAR *bar = &vdev->bars[nr]; int i; diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 20b505f4ec..d091d8cf0e 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -783,25 +783,25 @@ static void vfio_update_msi(VFIOPCIDevice *vdev) static void vfio_pci_load_rom(VFIOPCIDevice *vdev) { - struct vfio_region_info reg_info = { - .argsz = sizeof(reg_info), - .index = VFIO_PCI_ROM_REGION_INDEX - }; + struct vfio_region_info *reg_info; uint64_t size; off_t off = 0; ssize_t bytes; - if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_REGION_INFO, ®_info)) { + if (vfio_get_region_info(&vdev->vbasedev, + VFIO_PCI_ROM_REGION_INDEX, ®_info)) { error_report("vfio: Error getting ROM info: %m"); return; } - trace_vfio_pci_load_rom(vdev->vbasedev.name, (unsigned long)reg_info.size, - (unsigned long)reg_info.offset, - (unsigned long)reg_info.flags); + trace_vfio_pci_load_rom(vdev->vbasedev.name, (unsigned long)reg_info->size, + (unsigned long)reg_info->offset, + (unsigned long)reg_info->flags); + + vdev->rom_size = size = reg_info->size; + vdev->rom_offset = reg_info->offset; - vdev->rom_size = size = reg_info.size; - vdev->rom_offset = reg_info.offset; + g_free(reg_info); if (!vdev->rom_size) { vdev->rom_read_failed = true; @@ -832,6 +832,36 @@ static void vfio_pci_load_rom(VFIOPCIDevice *vdev) break; } } + + /* + * Test the ROM signature against our device, if the vendor is correct + * but the device ID doesn't match, store the correct device ID and + * recompute the checksum. Intel IGD devices need this and are known + * to have bogus checksums so we can't simply adjust the checksum. + */ + if (pci_get_word(vdev->rom) == 0xaa55 && + pci_get_word(vdev->rom + 0x18) + 8 < vdev->rom_size && + !memcmp(vdev->rom + pci_get_word(vdev->rom + 0x18), "PCIR", 4)) { + uint16_t vid, did; + + vid = pci_get_word(vdev->rom + pci_get_word(vdev->rom + 0x18) + 4); + did = pci_get_word(vdev->rom + pci_get_word(vdev->rom + 0x18) + 6); + + if (vid == vdev->vendor_id && did != vdev->device_id) { + int i; + uint8_t csum, *data = vdev->rom; + + pci_set_word(vdev->rom + pci_get_word(vdev->rom + 0x18) + 6, + vdev->device_id); + data[6] = 0; + + for (csum = 0, i = 0; i < vdev->rom_size; i++) { + csum += data[i]; + } + + data[6] = -csum; + } + } } static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size) @@ -889,18 +919,14 @@ static void vfio_pci_size_rom(VFIOPCIDevice *vdev) uint32_t orig, size = cpu_to_le32((uint32_t)PCI_ROM_ADDRESS_MASK); off_t offset = vdev->config_offset + PCI_ROM_ADDRESS; DeviceState *dev = DEVICE(vdev); - char name[32]; + char *name; int fd = vdev->vbasedev.fd; if (vdev->pdev.romfile || !vdev->pdev.rom_bar) { /* Since pci handles romfile, just print a message and return */ if (vfio_blacklist_opt_rom(vdev) && vdev->pdev.romfile) { - error_printf("Warning : Device at %04x:%02x:%02x.%x " - "is known to cause system instability issues during " - "option rom execution. " - "Proceeding anyway since user specified romfile\n", - vdev->host.domain, vdev->host.bus, vdev->host.slot, - vdev->host.function); + error_printf("Warning : Device at %s is known to cause system instability issues during option rom execution. Proceeding anyway since user specified romfile\n", + vdev->vbasedev.name); } return; } @@ -913,9 +939,7 @@ static void vfio_pci_size_rom(VFIOPCIDevice *vdev) pwrite(fd, &size, 4, offset) != 4 || pread(fd, &size, 4, offset) != 4 || pwrite(fd, &orig, 4, offset) != 4) { - error_report("%s(%04x:%02x:%02x.%x) failed: %m", - __func__, vdev->host.domain, vdev->host.bus, - vdev->host.slot, vdev->host.function); + error_report("%s(%s) failed: %m", __func__, vdev->vbasedev.name); return; } @@ -927,32 +951,22 @@ static void vfio_pci_size_rom(VFIOPCIDevice *vdev) if (vfio_blacklist_opt_rom(vdev)) { if (dev->opts && qemu_opt_get(dev->opts, "rombar")) { - error_printf("Warning : Device at %04x:%02x:%02x.%x " - "is known to cause system instability issues during " - "option rom execution. " - "Proceeding anyway since user specified non zero value for " - "rombar\n", - vdev->host.domain, vdev->host.bus, vdev->host.slot, - vdev->host.function); + error_printf("Warning : Device at %s is known to cause system instability issues during option rom execution. Proceeding anyway since user specified non zero value for rombar\n", + vdev->vbasedev.name); } else { - error_printf("Warning : Rom loading for device at " - "%04x:%02x:%02x.%x has been disabled due to " - "system instability issues. " - "Specify rombar=1 or romfile to force\n", - vdev->host.domain, vdev->host.bus, vdev->host.slot, - vdev->host.function); + error_printf("Warning : Rom loading for device at %s has been disabled due to system instability issues. Specify rombar=1 or romfile to force\n", + vdev->vbasedev.name); return; } } trace_vfio_pci_size_rom(vdev->vbasedev.name, size); - snprintf(name, sizeof(name), "vfio[%04x:%02x:%02x.%x].rom", - vdev->host.domain, vdev->host.bus, vdev->host.slot, - vdev->host.function); + name = g_strdup_printf("vfio[%s].rom", vdev->vbasedev.name); memory_region_init_io(&vdev->pdev.rom, OBJECT(vdev), &vfio_rom_ops, vdev, name, size); + g_free(name); pci_register_bar(&vdev->pdev, PCI_ROM_SLOT, PCI_BASE_ADDRESS_SPACE_MEMORY, &vdev->pdev.rom); @@ -1063,9 +1077,8 @@ uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len) ret = pread(vdev->vbasedev.fd, &phys_val, len, vdev->config_offset + addr); if (ret != len) { - error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x) failed: %m", - __func__, vdev->host.domain, vdev->host.bus, - vdev->host.slot, vdev->host.function, addr, len); + error_report("%s(%s, 0x%x, 0x%x) failed: %m", + __func__, vdev->vbasedev.name, addr, len); return -errno; } phys_val = le32_to_cpu(phys_val); @@ -1089,9 +1102,8 @@ void vfio_pci_write_config(PCIDevice *pdev, /* Write everything to VFIO, let it filter out what we can't write */ if (pwrite(vdev->vbasedev.fd, &val_le, len, vdev->config_offset + addr) != len) { - error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x, 0x%x) failed: %m", - __func__, vdev->host.domain, vdev->host.bus, - vdev->host.slot, vdev->host.function, addr, val, len); + error_report("%s(%s, 0x%x, 0x%x, 0x%x) failed: %m", + __func__, vdev->vbasedev.name, addr, val, len); } /* MSI/MSI-X Enabling/Disabling */ @@ -1185,6 +1197,74 @@ static int vfio_msi_setup(VFIOPCIDevice *vdev, int pos) return 0; } +static void vfio_pci_fixup_msix_region(VFIOPCIDevice *vdev) +{ + off_t start, end; + VFIORegion *region = &vdev->bars[vdev->msix->table_bar].region; + + /* + * We expect to find a single mmap covering the whole BAR, anything else + * means it's either unsupported or already setup. + */ + if (region->nr_mmaps != 1 || region->mmaps[0].offset || + region->size != region->mmaps[0].size) { + return; + } + + /* MSI-X table start and end aligned to host page size */ + start = vdev->msix->table_offset & qemu_real_host_page_mask; + end = REAL_HOST_PAGE_ALIGN((uint64_t)vdev->msix->table_offset + + (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE)); + + /* + * Does the MSI-X table cover the beginning of the BAR? The whole BAR? + * NB - Host page size is necessarily a power of two and so is the PCI + * BAR (not counting EA yet), therefore if we have host page aligned + * @start and @end, then any remainder of the BAR before or after those + * must be at least host page sized and therefore mmap'able. + */ + if (!start) { + if (end >= region->size) { + region->nr_mmaps = 0; + g_free(region->mmaps); + region->mmaps = NULL; + trace_vfio_msix_fixup(vdev->vbasedev.name, + vdev->msix->table_bar, 0, 0); + } else { + region->mmaps[0].offset = end; + region->mmaps[0].size = region->size - end; + trace_vfio_msix_fixup(vdev->vbasedev.name, + vdev->msix->table_bar, region->mmaps[0].offset, + region->mmaps[0].offset + region->mmaps[0].size); + } + + /* Maybe it's aligned at the end of the BAR */ + } else if (end >= region->size) { + region->mmaps[0].size = start; + trace_vfio_msix_fixup(vdev->vbasedev.name, + vdev->msix->table_bar, region->mmaps[0].offset, + region->mmaps[0].offset + region->mmaps[0].size); + + /* Otherwise it must split the BAR */ + } else { + region->nr_mmaps = 2; + region->mmaps = g_renew(VFIOMmap, region->mmaps, 2); + + memcpy(®ion->mmaps[1], ®ion->mmaps[0], sizeof(VFIOMmap)); + + region->mmaps[0].size = start; + trace_vfio_msix_fixup(vdev->vbasedev.name, + vdev->msix->table_bar, region->mmaps[0].offset, + region->mmaps[0].offset + region->mmaps[0].size); + + region->mmaps[1].offset = end; + region->mmaps[1].size = region->size - end; + trace_vfio_msix_fixup(vdev->vbasedev.name, + vdev->msix->table_bar, region->mmaps[1].offset, + region->mmaps[1].offset + region->mmaps[1].size); + } +} + /* * We don't have any control over how pci_add_capability() inserts * capabilities into the chain. In order to setup MSI-X we need a @@ -1259,6 +1339,8 @@ static int vfio_msix_early_setup(VFIOPCIDevice *vdev) msix->table_offset, msix->entries); vdev->msix = msix; + vfio_pci_fixup_msix_region(vdev); + return 0; } @@ -1269,9 +1351,9 @@ static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos) vdev->msix->pending = g_malloc0(BITS_TO_LONGS(vdev->msix->entries) * sizeof(unsigned long)); ret = msix_init(&vdev->pdev, vdev->msix->entries, - &vdev->bars[vdev->msix->table_bar].region.mem, + vdev->bars[vdev->msix->table_bar].region.mem, vdev->msix->table_bar, vdev->msix->table_offset, - &vdev->bars[vdev->msix->pba_bar].region.mem, + vdev->bars[vdev->msix->pba_bar].region.mem, vdev->msix->pba_bar, vdev->msix->pba_offset, pos); if (ret < 0) { if (ret == -ENOTSUP) { @@ -1308,8 +1390,8 @@ static void vfio_teardown_msi(VFIOPCIDevice *vdev) if (vdev->msix) { msix_uninit(&vdev->pdev, - &vdev->bars[vdev->msix->table_bar].region.mem, - &vdev->bars[vdev->msix->pba_bar].region.mem); + vdev->bars[vdev->msix->table_bar].region.mem, + vdev->bars[vdev->msix->pba_bar].region.mem); g_free(vdev->msix->pending); } } @@ -1322,71 +1404,23 @@ static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled) int i; for (i = 0; i < PCI_ROM_SLOT; i++) { - VFIOBAR *bar = &vdev->bars[i]; - - if (!bar->region.size) { - continue; - } - - memory_region_set_enabled(&bar->region.mmap_mem, enabled); - if (vdev->msix && vdev->msix->table_bar == i) { - memory_region_set_enabled(&vdev->msix->mmap_mem, enabled); - } - } -} - -static void vfio_unregister_bar(VFIOPCIDevice *vdev, int nr) -{ - VFIOBAR *bar = &vdev->bars[nr]; - - if (!bar->region.size) { - return; - } - - vfio_bar_quirk_teardown(vdev, nr); - - memory_region_del_subregion(&bar->region.mem, &bar->region.mmap_mem); - - if (vdev->msix && vdev->msix->table_bar == nr) { - memory_region_del_subregion(&bar->region.mem, &vdev->msix->mmap_mem); + vfio_region_mmaps_set_enabled(&vdev->bars[i].region, enabled); } } -static void vfio_unmap_bar(VFIOPCIDevice *vdev, int nr) +static void vfio_bar_setup(VFIOPCIDevice *vdev, int nr) { VFIOBAR *bar = &vdev->bars[nr]; - if (!bar->region.size) { - return; - } - - vfio_bar_quirk_free(vdev, nr); - - munmap(bar->region.mmap, memory_region_size(&bar->region.mmap_mem)); - - if (vdev->msix && vdev->msix->table_bar == nr) { - munmap(vdev->msix->mmap, memory_region_size(&vdev->msix->mmap_mem)); - } -} - -static void vfio_map_bar(VFIOPCIDevice *vdev, int nr) -{ - VFIOBAR *bar = &vdev->bars[nr]; - uint64_t size = bar->region.size; - char name[64]; uint32_t pci_bar; uint8_t type; int ret; /* Skip both unimplemented BARs and the upper half of 64bit BARS. */ - if (!size) { + if (!bar->region.size) { return; } - snprintf(name, sizeof(name), "VFIO %04x:%02x:%02x.%x BAR %d", - vdev->host.domain, vdev->host.bus, vdev->host.slot, - vdev->host.function, nr); - /* Determine what type of BAR this is for registration */ ret = pread(vdev->vbasedev.fd, &pci_bar, sizeof(pci_bar), vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr)); @@ -1401,102 +1435,78 @@ static void vfio_map_bar(VFIOPCIDevice *vdev, int nr) type = pci_bar & (bar->ioport ? ~PCI_BASE_ADDRESS_IO_MASK : ~PCI_BASE_ADDRESS_MEM_MASK); - /* A "slow" read/write mapping underlies all BARs */ - memory_region_init_io(&bar->region.mem, OBJECT(vdev), &vfio_region_ops, - bar, name, size); - pci_register_bar(&vdev->pdev, nr, type, &bar->region.mem); - - /* - * We can't mmap areas overlapping the MSIX vector table, so we - * potentially insert a direct-mapped subregion before and after it. - */ - if (vdev->msix && vdev->msix->table_bar == nr) { - size = vdev->msix->table_offset & qemu_real_host_page_mask; - } - - strncat(name, " mmap", sizeof(name) - strlen(name) - 1); - if (vfio_mmap_region(OBJECT(vdev), &bar->region, &bar->region.mem, - &bar->region.mmap_mem, &bar->region.mmap, - size, 0, name)) { - error_report("%s unsupported. Performance may be slow", name); - } - - if (vdev->msix && vdev->msix->table_bar == nr) { - uint64_t start; - - start = REAL_HOST_PAGE_ALIGN((uint64_t)vdev->msix->table_offset + - (vdev->msix->entries * - PCI_MSIX_ENTRY_SIZE)); - - size = start < bar->region.size ? bar->region.size - start : 0; - strncat(name, " msix-hi", sizeof(name) - strlen(name) - 1); - /* VFIOMSIXInfo contains another MemoryRegion for this mapping */ - if (vfio_mmap_region(OBJECT(vdev), &bar->region, &bar->region.mem, - &vdev->msix->mmap_mem, - &vdev->msix->mmap, size, start, name)) { - error_report("%s unsupported. Performance may be slow", name); - } + if (vfio_region_mmap(&bar->region)) { + error_report("Failed to mmap %s BAR %d. Performance may be slow", + vdev->vbasedev.name, nr); } vfio_bar_quirk_setup(vdev, nr); + + pci_register_bar(&vdev->pdev, nr, type, bar->region.mem); } -static void vfio_map_bars(VFIOPCIDevice *vdev) +static void vfio_bars_setup(VFIOPCIDevice *vdev) { int i; for (i = 0; i < PCI_ROM_SLOT; i++) { - vfio_map_bar(vdev, i); + vfio_bar_setup(vdev, i); } - if (vdev->has_vga) { - memory_region_init_io(&vdev->vga.region[QEMU_PCI_VGA_MEM].mem, + if (vdev->vga) { + memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_MEM].mem, OBJECT(vdev), &vfio_vga_ops, - &vdev->vga.region[QEMU_PCI_VGA_MEM], + &vdev->vga->region[QEMU_PCI_VGA_MEM], "vfio-vga-mmio@0xa0000", QEMU_PCI_VGA_MEM_SIZE); - memory_region_init_io(&vdev->vga.region[QEMU_PCI_VGA_IO_LO].mem, + memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem, OBJECT(vdev), &vfio_vga_ops, - &vdev->vga.region[QEMU_PCI_VGA_IO_LO], + &vdev->vga->region[QEMU_PCI_VGA_IO_LO], "vfio-vga-io@0x3b0", QEMU_PCI_VGA_IO_LO_SIZE); - memory_region_init_io(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem, + memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem, OBJECT(vdev), &vfio_vga_ops, - &vdev->vga.region[QEMU_PCI_VGA_IO_HI], + &vdev->vga->region[QEMU_PCI_VGA_IO_HI], "vfio-vga-io@0x3c0", QEMU_PCI_VGA_IO_HI_SIZE); - pci_register_vga(&vdev->pdev, &vdev->vga.region[QEMU_PCI_VGA_MEM].mem, - &vdev->vga.region[QEMU_PCI_VGA_IO_LO].mem, - &vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem); + pci_register_vga(&vdev->pdev, &vdev->vga->region[QEMU_PCI_VGA_MEM].mem, + &vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem, + &vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem); vfio_vga_quirk_setup(vdev); } } -static void vfio_unregister_bars(VFIOPCIDevice *vdev) +static void vfio_bars_exit(VFIOPCIDevice *vdev) { int i; for (i = 0; i < PCI_ROM_SLOT; i++) { - vfio_unregister_bar(vdev, i); + vfio_bar_quirk_exit(vdev, i); + vfio_region_exit(&vdev->bars[i].region); } - if (vdev->has_vga) { - vfio_vga_quirk_teardown(vdev); + if (vdev->vga) { pci_unregister_vga(&vdev->pdev); + vfio_vga_quirk_exit(vdev); } } -static void vfio_unmap_bars(VFIOPCIDevice *vdev) +static void vfio_bars_finalize(VFIOPCIDevice *vdev) { int i; for (i = 0; i < PCI_ROM_SLOT; i++) { - vfio_unmap_bar(vdev, i); + vfio_bar_quirk_finalize(vdev, i); + vfio_region_finalize(&vdev->bars[i].region); } - if (vdev->has_vga) { - vfio_vga_quirk_free(vdev); + if (vdev->vga) { + vfio_vga_quirk_finalize(vdev); + for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) { + object_unparent(OBJECT(&vdev->vga->region[i].mem)); + } + g_free(vdev->vga); } } @@ -1756,9 +1766,8 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos) } if (ret < 0) { - error_report("vfio: %04x:%02x:%02x.%x Error adding PCI capability " - "0x%x[0x%x]@0x%x: %d", vdev->host.domain, - vdev->host.bus, vdev->host.slot, vdev->host.function, + error_report("vfio: %s Error adding PCI capability " + "0x%x[0x%x]@0x%x: %d", vdev->vbasedev.name, cap_id, size, pos, ret); return ret; } @@ -1820,11 +1829,14 @@ static void vfio_pci_post_reset(VFIOPCIDevice *vdev) vfio_intx_enable(vdev); } -static bool vfio_pci_host_match(PCIHostDeviceAddress *host1, - PCIHostDeviceAddress *host2) +static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name) { - return (host1->domain == host2->domain && host1->bus == host2->bus && - host1->slot == host2->slot && host1->function == host2->function); + char tmp[13]; + + sprintf(tmp, "%04x:%02x:%02x.%1x", addr->domain, + addr->bus, addr->slot, addr->function); + + return (strcmp(tmp, name) == 0); } static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) @@ -1849,9 +1861,8 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) if (ret && errno != ENOSPC) { ret = -errno; if (!vdev->has_pm_reset) { - error_report("vfio: Cannot reset device %04x:%02x:%02x.%x, " - "no available reset mechanism.", vdev->host.domain, - vdev->host.bus, vdev->host.slot, vdev->host.function); + error_report("vfio: Cannot reset device %s, " + "no available reset mechanism.", vdev->vbasedev.name); } goto out_single; } @@ -1884,7 +1895,7 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) trace_vfio_pci_hot_reset_dep_devices(host.domain, host.bus, host.slot, host.function, devices[i].group_id); - if (vfio_pci_host_match(&host, &vdev->host)) { + if (vfio_pci_host_match(&host, vdev->vbasedev.name)) { continue; } @@ -1910,7 +1921,7 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) continue; } tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev); - if (vfio_pci_host_match(&host, &tmp->host)) { + if (vfio_pci_host_match(&host, tmp->vbasedev.name)) { if (single) { ret = -EINVAL; goto out_single; @@ -1972,7 +1983,7 @@ out: host.slot = PCI_SLOT(devices[i].devfn); host.function = PCI_FUNC(devices[i].devfn); - if (vfio_pci_host_match(&host, &vdev->host)) { + if (vfio_pci_host_match(&host, vdev->vbasedev.name)) { continue; } @@ -1991,7 +2002,7 @@ out: continue; } tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev); - if (vfio_pci_host_match(&host, &tmp->host)) { + if (vfio_pci_host_match(&host, tmp->vbasedev.name)) { vfio_pci_post_reset(tmp); break; } @@ -2044,10 +2055,56 @@ static VFIODeviceOps vfio_pci_ops = { .vfio_eoi = vfio_intx_eoi, }; +int vfio_populate_vga(VFIOPCIDevice *vdev) +{ + VFIODevice *vbasedev = &vdev->vbasedev; + struct vfio_region_info *reg_info; + int ret; + + if (vbasedev->num_regions > VFIO_PCI_VGA_REGION_INDEX) { + ret = vfio_get_region_info(vbasedev, + VFIO_PCI_VGA_REGION_INDEX, ®_info); + if (ret) { + return ret; + } + + if (!(reg_info->flags & VFIO_REGION_INFO_FLAG_READ) || + !(reg_info->flags & VFIO_REGION_INFO_FLAG_WRITE) || + reg_info->size < 0xbffff + 1) { + error_report("vfio: Unexpected VGA info, flags 0x%lx, size 0x%lx", + (unsigned long)reg_info->flags, + (unsigned long)reg_info->size); + g_free(reg_info); + return -EINVAL; + } + + vdev->vga = g_new0(VFIOVGA, 1); + + vdev->vga->fd_offset = reg_info->offset; + vdev->vga->fd = vdev->vbasedev.fd; + + g_free(reg_info); + + vdev->vga->region[QEMU_PCI_VGA_MEM].offset = QEMU_PCI_VGA_MEM_BASE; + vdev->vga->region[QEMU_PCI_VGA_MEM].nr = QEMU_PCI_VGA_MEM; + QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_MEM].quirks); + + vdev->vga->region[QEMU_PCI_VGA_IO_LO].offset = QEMU_PCI_VGA_IO_LO_BASE; + vdev->vga->region[QEMU_PCI_VGA_IO_LO].nr = QEMU_PCI_VGA_IO_LO; + QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].quirks); + + vdev->vga->region[QEMU_PCI_VGA_IO_HI].offset = QEMU_PCI_VGA_IO_HI_BASE; + vdev->vga->region[QEMU_PCI_VGA_IO_HI].nr = QEMU_PCI_VGA_IO_HI; + QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks); + } + + return 0; +} + static int vfio_populate_device(VFIOPCIDevice *vdev) { VFIODevice *vbasedev = &vdev->vbasedev; - struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) }; + struct vfio_region_info *reg_info; struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) }; int i, ret = -1; @@ -2069,85 +2126,47 @@ static int vfio_populate_device(VFIOPCIDevice *vdev) } for (i = VFIO_PCI_BAR0_REGION_INDEX; i < VFIO_PCI_ROM_REGION_INDEX; i++) { - reg_info.index = i; + char *name = g_strdup_printf("%s BAR %d", vbasedev->name, i); + + ret = vfio_region_setup(OBJECT(vdev), vbasedev, + &vdev->bars[i].region, i, name); + g_free(name); - ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, ®_info); if (ret) { error_report("vfio: Error getting region %d info: %m", i); goto error; } - trace_vfio_populate_device_region(vbasedev->name, i, - (unsigned long)reg_info.size, - (unsigned long)reg_info.offset, - (unsigned long)reg_info.flags); - - vdev->bars[i].region.vbasedev = vbasedev; - vdev->bars[i].region.flags = reg_info.flags; - vdev->bars[i].region.size = reg_info.size; - vdev->bars[i].region.fd_offset = reg_info.offset; - vdev->bars[i].region.nr = i; QLIST_INIT(&vdev->bars[i].quirks); } - reg_info.index = VFIO_PCI_CONFIG_REGION_INDEX; - - ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_REGION_INFO, ®_info); + ret = vfio_get_region_info(vbasedev, + VFIO_PCI_CONFIG_REGION_INDEX, ®_info); if (ret) { error_report("vfio: Error getting config info: %m"); goto error; } trace_vfio_populate_device_config(vdev->vbasedev.name, - (unsigned long)reg_info.size, - (unsigned long)reg_info.offset, - (unsigned long)reg_info.flags); + (unsigned long)reg_info->size, + (unsigned long)reg_info->offset, + (unsigned long)reg_info->flags); - vdev->config_size = reg_info.size; + vdev->config_size = reg_info->size; if (vdev->config_size == PCI_CONFIG_SPACE_SIZE) { vdev->pdev.cap_present &= ~QEMU_PCI_CAP_EXPRESS; } - vdev->config_offset = reg_info.offset; + vdev->config_offset = reg_info->offset; - if ((vdev->features & VFIO_FEATURE_ENABLE_VGA) && - vbasedev->num_regions > VFIO_PCI_VGA_REGION_INDEX) { - struct vfio_region_info vga_info = { - .argsz = sizeof(vga_info), - .index = VFIO_PCI_VGA_REGION_INDEX, - }; + g_free(reg_info); - ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_REGION_INFO, &vga_info); + if (vdev->features & VFIO_FEATURE_ENABLE_VGA) { + ret = vfio_populate_vga(vdev); if (ret) { error_report( "vfio: Device does not support requested feature x-vga"); goto error; } - - if (!(vga_info.flags & VFIO_REGION_INFO_FLAG_READ) || - !(vga_info.flags & VFIO_REGION_INFO_FLAG_WRITE) || - vga_info.size < 0xbffff + 1) { - error_report("vfio: Unexpected VGA info, flags 0x%lx, size 0x%lx", - (unsigned long)vga_info.flags, - (unsigned long)vga_info.size); - goto error; - } - - vdev->vga.fd_offset = vga_info.offset; - vdev->vga.fd = vdev->vbasedev.fd; - - vdev->vga.region[QEMU_PCI_VGA_MEM].offset = QEMU_PCI_VGA_MEM_BASE; - vdev->vga.region[QEMU_PCI_VGA_MEM].nr = QEMU_PCI_VGA_MEM; - QLIST_INIT(&vdev->vga.region[QEMU_PCI_VGA_MEM].quirks); - - vdev->vga.region[QEMU_PCI_VGA_IO_LO].offset = QEMU_PCI_VGA_IO_LO_BASE; - vdev->vga.region[QEMU_PCI_VGA_IO_LO].nr = QEMU_PCI_VGA_IO_LO; - QLIST_INIT(&vdev->vga.region[QEMU_PCI_VGA_IO_LO].quirks); - - vdev->vga.region[QEMU_PCI_VGA_IO_HI].offset = QEMU_PCI_VGA_IO_HI_BASE; - vdev->vga.region[QEMU_PCI_VGA_IO_HI].nr = QEMU_PCI_VGA_IO_HI; - QLIST_INIT(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].quirks); - - vdev->has_vga = true; } irq_info.index = VFIO_PCI_ERR_IRQ_INDEX; @@ -2172,11 +2191,8 @@ error: static void vfio_put_device(VFIOPCIDevice *vdev) { g_free(vdev->vbasedev.name); - if (vdev->msix) { - object_unparent(OBJECT(&vdev->msix->mmap_mem)); - g_free(vdev->msix); - vdev->msix = NULL; - } + g_free(vdev->msix); + vfio_put_base_device(&vdev->vbasedev); } @@ -2197,10 +2213,7 @@ static void vfio_err_notifier_handler(void *opaque) * guest to contain the error. */ - error_report("%s(%04x:%02x:%02x.%x) Unrecoverable error detected. " - "Please collect any data possible and then kill the guest", - __func__, vdev->host.domain, vdev->host.bus, - vdev->host.slot, vdev->host.function); + error_report("%s(%s) Unrecoverable error detected. Please collect any data possible and then kill the guest", __func__, vdev->vbasedev.name); vm_stop(RUN_STATE_INTERNAL_ERROR); } @@ -2381,42 +2394,43 @@ static int vfio_initfn(PCIDevice *pdev) VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev); VFIODevice *vbasedev_iter; VFIOGroup *group; - char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name; + char *tmp, group_path[PATH_MAX], *group_name; ssize_t len; struct stat st; int groupid; int ret; - /* Check that the host device exists */ - snprintf(path, sizeof(path), - "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/", - vdev->host.domain, vdev->host.bus, vdev->host.slot, - vdev->host.function); - if (stat(path, &st) < 0) { - error_report("vfio: error: no such host device: %s", path); + if (!vdev->vbasedev.sysfsdev) { + vdev->vbasedev.sysfsdev = + g_strdup_printf("/sys/bus/pci/devices/%04x:%02x:%02x.%01x", + vdev->host.domain, vdev->host.bus, + vdev->host.slot, vdev->host.function); + } + + if (stat(vdev->vbasedev.sysfsdev, &st) < 0) { + error_report("vfio: error: no such host device: %s", + vdev->vbasedev.sysfsdev); return -errno; } + vdev->vbasedev.name = g_strdup(basename(vdev->vbasedev.sysfsdev)); vdev->vbasedev.ops = &vfio_pci_ops; - vdev->vbasedev.type = VFIO_DEVICE_TYPE_PCI; - vdev->vbasedev.name = g_strdup_printf("%04x:%02x:%02x.%01x", - vdev->host.domain, vdev->host.bus, - vdev->host.slot, vdev->host.function); - strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1); + tmp = g_strdup_printf("%s/iommu_group", vdev->vbasedev.sysfsdev); + len = readlink(tmp, group_path, sizeof(group_path)); + g_free(tmp); - len = readlink(path, iommu_group_path, sizeof(path)); - if (len <= 0 || len >= sizeof(path)) { + if (len <= 0 || len >= sizeof(group_path)) { error_report("vfio: error no iommu_group for device"); return len < 0 ? -errno : -ENAMETOOLONG; } - iommu_group_path[len] = 0; - group_name = basename(iommu_group_path); + group_path[len] = 0; + group_name = basename(group_path); if (sscanf(group_name, "%d", &groupid) != 1) { - error_report("vfio: error reading %s: %m", path); + error_report("vfio: error reading %s: %m", group_path); return -errno; } @@ -2428,21 +2442,18 @@ static int vfio_initfn(PCIDevice *pdev) return -ENOENT; } - snprintf(path, sizeof(path), "%04x:%02x:%02x.%01x", - vdev->host.domain, vdev->host.bus, vdev->host.slot, - vdev->host.function); - QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { if (strcmp(vbasedev_iter->name, vdev->vbasedev.name) == 0) { - error_report("vfio: error: device %s is already attached", path); + error_report("vfio: error: device %s is already attached", + vdev->vbasedev.name); vfio_put_group(group); return -EBUSY; } } - ret = vfio_get_device(group, path, &vdev->vbasedev); + ret = vfio_get_device(group, vdev->vbasedev.name, &vdev->vbasedev); if (ret) { - error_report("vfio: failed to get device %s", path); + error_report("vfio: failed to get device %s", vdev->vbasedev.name); vfio_put_group(group); return ret; } @@ -2542,7 +2553,7 @@ static int vfio_initfn(PCIDevice *pdev) return ret; } - vfio_map_bars(vdev); + vfio_bars_setup(vdev); ret = vfio_add_capabilities(vdev); if (ret) { @@ -2579,7 +2590,7 @@ static int vfio_initfn(PCIDevice *pdev) out_teardown: pci_device_set_intx_routing_notifier(&vdev->pdev, NULL); vfio_teardown_msi(vdev); - vfio_unregister_bars(vdev); + vfio_bars_exit(vdev); return ret; } @@ -2589,7 +2600,7 @@ static void vfio_instance_finalize(Object *obj) VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pci_dev); VFIOGroup *group = vdev->vbasedev.group; - vfio_unmap_bars(vdev); + vfio_bars_finalize(vdev); g_free(vdev->emulated_config_bits); g_free(vdev->rom); vfio_put_device(vdev); @@ -2608,7 +2619,7 @@ static void vfio_exitfn(PCIDevice *pdev) timer_free(vdev->intx.mmap_timer); } vfio_teardown_msi(vdev); - vfio_unregister_bars(vdev); + vfio_bars_exit(vdev); } static void vfio_pci_reset(DeviceState *dev) @@ -2659,6 +2670,7 @@ static void vfio_instance_init(Object *obj) static Property vfio_pci_dev_properties[] = { DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host), + DEFINE_PROP_STRING("sysfsdev", VFIOPCIDevice, vbasedev.sysfsdev), DEFINE_PROP_UINT32("x-intx-mmap-timeout-ms", VFIOPCIDevice, intx.mmap_timeout, 1100), DEFINE_PROP_BIT("x-vga", VFIOPCIDevice, features, diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index 62565878fc..3976f68549 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -114,7 +114,7 @@ typedef struct VFIOPCIDevice { int nr_vectors; /* Number of MSI/MSIX vectors currently in use */ int interrupt; /* Current interrupt type */ VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */ - VFIOVGA vga; /* 0xa0000, 0x3b0, 0x3c0 */ + VFIOVGA *vga; /* 0xa0000, 0x3b0, 0x3c0 */ PCIHostDeviceAddress host; EventNotifier err_notifier; EventNotifier req_notifier; @@ -150,11 +150,13 @@ void vfio_vga_write(void *opaque, hwaddr addr, uint64_t data, unsigned size); bool vfio_blacklist_opt_rom(VFIOPCIDevice *vdev); void vfio_vga_quirk_setup(VFIOPCIDevice *vdev); -void vfio_vga_quirk_teardown(VFIOPCIDevice *vdev); -void vfio_vga_quirk_free(VFIOPCIDevice *vdev); +void vfio_vga_quirk_exit(VFIOPCIDevice *vdev); +void vfio_vga_quirk_finalize(VFIOPCIDevice *vdev); void vfio_bar_quirk_setup(VFIOPCIDevice *vdev, int nr); -void vfio_bar_quirk_teardown(VFIOPCIDevice *vdev, int nr); -void vfio_bar_quirk_free(VFIOPCIDevice *vdev, int nr); +void vfio_bar_quirk_exit(VFIOPCIDevice *vdev, int nr); +void vfio_bar_quirk_finalize(VFIOPCIDevice *vdev, int nr); void vfio_setup_resetfn_quirk(VFIOPCIDevice *vdev); +int vfio_populate_vga(VFIOPCIDevice *vdev); + #endif /* HW_VFIO_VFIO_PCI_H */ diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c index ebc9dcbb99..a2ab75d3f2 100644 --- a/hw/vfio/platform.c +++ b/hw/vfio/platform.c @@ -143,12 +143,8 @@ static void vfio_mmap_set_enabled(VFIOPlatformDevice *vdev, bool enabled) { int i; - trace_vfio_platform_mmap_set_enabled(enabled); - for (i = 0; i < vdev->vbasedev.num_regions; i++) { - VFIORegion *region = vdev->regions[i]; - - memory_region_set_enabled(®ion->mmap_mem, enabled); + vfio_region_mmaps_set_enabled(vdev->regions[i], enabled); } } @@ -476,28 +472,16 @@ static int vfio_populate_device(VFIODevice *vbasedev) vdev->regions = g_new0(VFIORegion *, vbasedev->num_regions); for (i = 0; i < vbasedev->num_regions; i++) { - struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) }; - VFIORegion *ptr; + char *name = g_strdup_printf("VFIO %s region %d\n", vbasedev->name, i); vdev->regions[i] = g_new0(VFIORegion, 1); - ptr = vdev->regions[i]; - reg_info.index = i; - ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, ®_info); + ret = vfio_region_setup(OBJECT(vdev), vbasedev, + vdev->regions[i], i, name); + g_free(name); if (ret) { error_report("vfio: Error getting region %d info: %m", i); goto reg_error; } - ptr->flags = reg_info.flags; - ptr->size = reg_info.size; - ptr->fd_offset = reg_info.offset; - ptr->nr = i; - ptr->vbasedev = vbasedev; - - trace_vfio_platform_populate_regions(ptr->nr, - (unsigned long)ptr->flags, - (unsigned long)ptr->size, - ptr->vbasedev->fd, - (unsigned long)ptr->fd_offset); } vdev->mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL, @@ -534,6 +518,9 @@ irq_err: } reg_error: for (i = 0; i < vbasedev->num_regions; i++) { + if (vdev->regions[i]) { + vfio_region_finalize(vdev->regions[i]); + } g_free(vdev->regions[i]); } g_free(vdev->regions); @@ -560,38 +547,45 @@ static int vfio_base_device_init(VFIODevice *vbasedev) { VFIOGroup *group; VFIODevice *vbasedev_iter; - char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name; + char *tmp, group_path[PATH_MAX], *group_name; ssize_t len; struct stat st; int groupid; int ret; - /* name must be set prior to the call */ - if (!vbasedev->name || strchr(vbasedev->name, '/')) { - return -EINVAL; - } + /* @sysfsdev takes precedence over @host */ + if (vbasedev->sysfsdev) { + g_free(vbasedev->name); + vbasedev->name = g_strdup(basename(vbasedev->sysfsdev)); + } else { + if (!vbasedev->name || strchr(vbasedev->name, '/')) { + return -EINVAL; + } - /* Check that the host device exists */ - g_snprintf(path, sizeof(path), "/sys/bus/platform/devices/%s/", - vbasedev->name); + vbasedev->sysfsdev = g_strdup_printf("/sys/bus/platform/devices/%s", + vbasedev->name); + } - if (stat(path, &st) < 0) { - error_report("vfio: error: no such host device: %s", path); + if (stat(vbasedev->sysfsdev, &st) < 0) { + error_report("vfio: error: no such host device: %s", + vbasedev->sysfsdev); return -errno; } - g_strlcat(path, "iommu_group", sizeof(path)); - len = readlink(path, iommu_group_path, sizeof(iommu_group_path)); - if (len < 0 || len >= sizeof(iommu_group_path)) { + tmp = g_strdup_printf("%s/iommu_group", vbasedev->sysfsdev); + len = readlink(tmp, group_path, sizeof(group_path)); + g_free(tmp); + + if (len < 0 || len >= sizeof(group_path)) { error_report("vfio: error no iommu_group for device"); return len < 0 ? -errno : -ENAMETOOLONG; } - iommu_group_path[len] = 0; - group_name = basename(iommu_group_path); + group_path[len] = 0; + group_name = basename(group_path); if (sscanf(group_name, "%d", &groupid) != 1) { - error_report("vfio: error reading %s: %m", path); + error_report("vfio: error reading %s: %m", group_path); return -errno; } @@ -603,25 +597,24 @@ static int vfio_base_device_init(VFIODevice *vbasedev) return -ENOENT; } - g_snprintf(path, sizeof(path), "%s", vbasedev->name); - QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { if (strcmp(vbasedev_iter->name, vbasedev->name) == 0) { - error_report("vfio: error: device %s is already attached", path); + error_report("vfio: error: device %s is already attached", + vbasedev->name); vfio_put_group(group); return -EBUSY; } } - ret = vfio_get_device(group, path, vbasedev); + ret = vfio_get_device(group, vbasedev->name, vbasedev); if (ret) { - error_report("vfio: failed to get device %s", path); + error_report("vfio: failed to get device %s", vbasedev->name); vfio_put_group(group); return ret; } ret = vfio_populate_device(vbasedev); if (ret) { - error_report("vfio: failed to populate device %s", path); + error_report("vfio: failed to populate device %s", vbasedev->name); vfio_put_group(group); } @@ -629,41 +622,6 @@ static int vfio_base_device_init(VFIODevice *vbasedev) } /** - * vfio_map_region - initialize the 2 memory regions for a given - * MMIO region index - * @vdev: the VFIO platform device handle - * @nr: the index of the region - * - * Init the top memory region and the mmapped memory region beneath - * VFIOPlatformDevice is used since VFIODevice is not a QOM Object - * and could not be passed to memory region functions -*/ -static void vfio_map_region(VFIOPlatformDevice *vdev, int nr) -{ - VFIORegion *region = vdev->regions[nr]; - uint64_t size = region->size; - char name[64]; - - if (!size) { - return; - } - - g_snprintf(name, sizeof(name), "VFIO %s region %d", - vdev->vbasedev.name, nr); - - /* A "slow" read/write mapping underlies all regions */ - memory_region_init_io(®ion->mem, OBJECT(vdev), &vfio_region_ops, - region, name, size); - - g_strlcat(name, " mmap", sizeof(name)); - - if (vfio_mmap_region(OBJECT(vdev), region, ®ion->mem, - ®ion->mmap_mem, ®ion->mmap, size, 0, name)) { - error_report("%s unsupported. Performance may be slow", name); - } -} - -/** * vfio_platform_realize - the device realize function * @dev: device state pointer * @errp: error @@ -681,7 +639,9 @@ static void vfio_platform_realize(DeviceState *dev, Error **errp) vbasedev->type = VFIO_DEVICE_TYPE_PLATFORM; vbasedev->ops = &vfio_platform_ops; - trace_vfio_platform_realize(vbasedev->name, vdev->compat); + trace_vfio_platform_realize(vbasedev->sysfsdev ? + vbasedev->sysfsdev : vbasedev->name, + vdev->compat); ret = vfio_base_device_init(vbasedev); if (ret) { @@ -691,8 +651,11 @@ static void vfio_platform_realize(DeviceState *dev, Error **errp) } for (i = 0; i < vbasedev->num_regions; i++) { - vfio_map_region(vdev, i); - sysbus_init_mmio(sbdev, &vdev->regions[i]->mem); + if (vfio_region_mmap(vdev->regions[i])) { + error_report("%s mmap unsupported. Performance may be slow", + memory_region_name(vdev->regions[i]->mem)); + } + sysbus_init_mmio(sbdev, vdev->regions[i]->mem); } } @@ -703,6 +666,7 @@ static const VMStateDescription vfio_platform_vmstate = { static Property vfio_platform_dev_properties[] = { DEFINE_PROP_STRING("host", VFIOPlatformDevice, vbasedev.name), + DEFINE_PROP_STRING("sysfsdev", VFIOPlatformDevice, vbasedev.sysfsdev), DEFINE_PROP_BOOL("x-no-mmap", VFIOPlatformDevice, vbasedev.no_mmap, false), DEFINE_PROP_UINT32("mmap-timeout-ms", VFIOPlatformDevice, mmap_timeout, 1100), diff --git a/include/block/block.h b/include/block/block.h index 1c4f4d8141..eaa64262d9 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -6,8 +6,10 @@ #include "qemu/option.h" #include "qemu/coroutine.h" #include "block/accounting.h" +#include "block/dirty-bitmap.h" #include "qapi/qmp/qobject.h" #include "qapi-types.h" +#include "qemu/hbitmap.h" /* block.c */ typedef struct BlockDriver BlockDriver; @@ -215,7 +217,6 @@ BdrvChild *bdrv_open_child(const char *filename, void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd); int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options, const char *bdref_key, Error **errp); -int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp); int bdrv_open(BlockDriverState **pbs, const char *filename, const char *reference, QDict *options, int flags, Error **errp); BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue, @@ -320,8 +321,6 @@ BlockDriverState *check_to_replace_node(BlockDriverState *parent_bs, const char *node_name, Error **errp); /* async block I/O */ -typedef void BlockDriverDirtyHandler(BlockDriverState *bs, int64_t sector, - int sector_num); BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *iov, int nb_sectors, BlockCompletionFunc *cb, void *opaque); @@ -475,42 +474,6 @@ void *qemu_try_blockalign(BlockDriverState *bs, size_t size); void *qemu_try_blockalign0(BlockDriverState *bs, size_t size); bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov); -struct HBitmapIter; -typedef struct BdrvDirtyBitmap BdrvDirtyBitmap; -BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, - uint32_t granularity, - const char *name, - Error **errp); -int bdrv_dirty_bitmap_create_successor(BlockDriverState *bs, - BdrvDirtyBitmap *bitmap, - Error **errp); -BdrvDirtyBitmap *bdrv_dirty_bitmap_abdicate(BlockDriverState *bs, - BdrvDirtyBitmap *bitmap, - Error **errp); -BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap(BlockDriverState *bs, - BdrvDirtyBitmap *bitmap, - Error **errp); -BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, - const char *name); -void bdrv_dirty_bitmap_make_anon(BdrvDirtyBitmap *bitmap); -void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap); -void bdrv_disable_dirty_bitmap(BdrvDirtyBitmap *bitmap); -void bdrv_enable_dirty_bitmap(BdrvDirtyBitmap *bitmap); -BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs); -uint32_t bdrv_get_default_bitmap_granularity(BlockDriverState *bs); -uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap); -bool bdrv_dirty_bitmap_enabled(BdrvDirtyBitmap *bitmap); -bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap); -DirtyBitmapStatus bdrv_dirty_bitmap_status(BdrvDirtyBitmap *bitmap); -int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector); -void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap, - int64_t cur_sector, int nr_sectors); -void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap, - int64_t cur_sector, int nr_sectors); -void bdrv_dirty_iter_init(BdrvDirtyBitmap *bitmap, struct HBitmapIter *hbi); -void bdrv_set_dirty_iter(struct HBitmapIter *hbi, int64_t offset); -int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap); - void bdrv_enable_copy_on_read(BlockDriverState *bs); void bdrv_disable_copy_on_read(BlockDriverState *bs); diff --git a/include/block/block_int.h b/include/block/block_int.h index 9ef823a660..dda5ba0927 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -694,6 +694,8 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target, BlockCompletionFunc *cb, void *opaque, BlockJobTxn *txn, Error **errp); +void hmp_drive_add_node(Monitor *mon, const char *optstr); + void blk_set_bs(BlockBackend *blk, BlockDriverState *bs); void blk_dev_change_media_cb(BlockBackend *blk, bool load); diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h new file mode 100644 index 0000000000..80afe603f6 --- /dev/null +++ b/include/block/dirty-bitmap.h @@ -0,0 +1,44 @@ +#ifndef BLOCK_DIRTY_BITMAP_H +#define BLOCK_DIRTY_BITMAP_H + +#include "qemu-common.h" +#include "qemu/hbitmap.h" + +BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, + uint32_t granularity, + const char *name, + Error **errp); +int bdrv_dirty_bitmap_create_successor(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, + Error **errp); +BdrvDirtyBitmap *bdrv_dirty_bitmap_abdicate(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, + Error **errp); +BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, + Error **errp); +BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, + const char *name); +void bdrv_dirty_bitmap_make_anon(BdrvDirtyBitmap *bitmap); +void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap); +void bdrv_release_named_dirty_bitmaps(BlockDriverState *bs); +void bdrv_disable_dirty_bitmap(BdrvDirtyBitmap *bitmap); +void bdrv_enable_dirty_bitmap(BdrvDirtyBitmap *bitmap); +BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs); +uint32_t bdrv_get_default_bitmap_granularity(BlockDriverState *bs); +uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap); +bool bdrv_dirty_bitmap_enabled(BdrvDirtyBitmap *bitmap); +bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap); +DirtyBitmapStatus bdrv_dirty_bitmap_status(BdrvDirtyBitmap *bitmap); +int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, + int64_t sector); +void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap, + int64_t cur_sector, int nr_sectors); +void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap, + int64_t cur_sector, int nr_sectors); +void bdrv_dirty_iter_init(BdrvDirtyBitmap *bitmap, struct HBitmapIter *hbi); +void bdrv_set_dirty_iter(struct HBitmapIter *hbi, int64_t offset); +int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap); +void bdrv_dirty_bitmap_truncate(BlockDriverState *bs); + +#endif diff --git a/include/hw/s390x/s390-virtio-ccw.h b/include/hw/s390x/s390-virtio-ccw.h new file mode 100644 index 0000000000..ab08332fe1 --- /dev/null +++ b/include/hw/s390x/s390-virtio-ccw.h @@ -0,0 +1,40 @@ +/* + * virtio ccw machine definitions + * + * Copyright 2012, 2016 IBM Corp. + * Author(s): Cornelia Huck <cornelia.huck@de.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or (at + * your option) any later version. See the COPYING file in the top-level + * directory. + */ +#ifndef HW_S390X_S390_VIRTIO_CCW_H +#define HW_S390X_S390_VIRTIO_CCW_H + +#include "hw/boards.h" + +#define TYPE_S390_CCW_MACHINE "s390-ccw-machine" + +#define S390_CCW_MACHINE(obj) \ + OBJECT_CHECK(S390CcwMachineState, (obj), TYPE_S390_CCW_MACHINE) + +#define S390_MACHINE_CLASS(klass) \ + OBJECT_CLASS_CHECK(S390CcwMachineClass, (klass), TYPE_S390_CCW_MACHINE) + +typedef struct S390CcwMachineState { + /*< private >*/ + MachineState parent_obj; + + /*< public >*/ + bool aes_key_wrap; + bool dea_key_wrap; +} S390CcwMachineState; + +typedef struct S390CcwMachineClass { + /*< private >*/ + MachineClass parent_class; + + /*< public >*/ +} S390CcwMachineClass; + +#endif diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index f037f3c425..eb0e1b0342 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -25,6 +25,9 @@ #include "exec/memory.h" #include "qemu/queue.h" #include "qemu/notify.h" +#ifdef CONFIG_LINUX +#include <linux/vfio.h> +#endif /*#define DEBUG_VFIO*/ #ifdef DEBUG_VFIO @@ -40,14 +43,21 @@ enum { VFIO_DEVICE_TYPE_PLATFORM = 1, }; +typedef struct VFIOMmap { + MemoryRegion mem; + void *mmap; + off_t offset; + size_t size; +} VFIOMmap; + typedef struct VFIORegion { struct VFIODevice *vbasedev; off_t fd_offset; /* offset of region within device fd */ - MemoryRegion mem; /* slow, read/write access */ - MemoryRegion mmap_mem; /* direct mapped access */ - void *mmap; + MemoryRegion *mem; /* slow, read/write access */ size_t size; uint32_t flags; /* VFIO region flags (rd/wr/mmap) */ + uint32_t nr_mmaps; + VFIOMmap *mmaps; uint8_t nr; /* cache the region number for debug */ } VFIORegion; @@ -89,6 +99,7 @@ typedef struct VFIODeviceOps VFIODeviceOps; typedef struct VFIODevice { QLIST_ENTRY(VFIODevice) next; struct VFIOGroup *group; + char *sysfsdev; char *name; int fd; int type; @@ -124,10 +135,12 @@ void vfio_region_write(void *opaque, hwaddr addr, uint64_t data, unsigned size); uint64_t vfio_region_read(void *opaque, hwaddr addr, unsigned size); -int vfio_mmap_region(Object *vdev, VFIORegion *region, - MemoryRegion *mem, MemoryRegion *submem, - void **map, size_t size, off_t offset, - const char *name); +int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region, + int index, const char *name); +int vfio_region_mmap(VFIORegion *region); +void vfio_region_mmaps_set_enabled(VFIORegion *region, bool enabled); +void vfio_region_exit(VFIORegion *region); +void vfio_region_finalize(VFIORegion *region); void vfio_reset_handler(void *opaque); VFIOGroup *vfio_get_group(int groupid, AddressSpace *as); void vfio_put_group(VFIOGroup *group); @@ -138,4 +151,8 @@ extern const MemoryRegionOps vfio_region_ops; extern QLIST_HEAD(vfio_group_head, VFIOGroup) vfio_group_list; extern QLIST_HEAD(vfio_as_head, VFIOAddressSpace) vfio_address_spaces; +#ifdef CONFIG_LINUX +int vfio_get_region_info(VFIODevice *vbasedev, int index, + struct vfio_region_info **info); +#endif #endif /* !HW_VFIO_VFIO_COMMON_H */ diff --git a/include/io/channel-watch.h b/include/io/channel-watch.h index 656358ad64..76d764223e 100644 --- a/include/io/channel-watch.h +++ b/include/io/channel-watch.h @@ -39,7 +39,7 @@ * monitor the file descriptor @fd for the * I/O conditions in @condition. This is able * monitor block devices, character devices, - * sockets, pipes but not plain files. + * pipes but not plain files or, on Win32, sockets. * * Returns: the new main loop source */ @@ -48,6 +48,24 @@ GSource *qio_channel_create_fd_watch(QIOChannel *ioc, GIOCondition condition); /** + * qio_channel_create_socket_watch: + * @ioc: the channel object + * @fd: the file descriptor + * @condition: the I/O condition + * + * Create a new main loop source that is able to + * monitor the file descriptor @fd for the + * I/O conditions in @condition. This is equivalent + * to qio_channel_create_fd_watch on POSIX systems + * but not on Windows. + * + * Returns: the new main loop source + */ +GSource *qio_channel_create_socket_watch(QIOChannel *ioc, + int fd, + GIOCondition condition); + +/** * qio_channel_create_fd_pair_watch: * @ioc: the channel object * @fdread: the file descriptor for reading diff --git a/include/io/channel.h b/include/io/channel.h index 0a1f1ce7fc..d37acd29e0 100644 --- a/include/io/channel.h +++ b/include/io/channel.h @@ -78,6 +78,9 @@ typedef gboolean (*QIOChannelFunc)(QIOChannel *ioc, struct QIOChannel { Object parent; unsigned int features; /* bitmask of QIOChannelFeatures */ +#ifdef _WIN32 + HANDLE event; /* For use with GSource on Win32 */ +#endif }; /** diff --git a/include/qemu/sockets.h b/include/qemu/sockets.h index 0be68de87d..1bd92180f3 100644 --- a/include/qemu/sockets.h +++ b/include/qemu/sockets.h @@ -3,26 +3,9 @@ #define QEMU_SOCKET_H #ifdef _WIN32 -#include <windows.h> -#include <winsock2.h> -#include <ws2tcpip.h> - -#define socket_error() WSAGetLastError() int inet_aton(const char *cp, struct in_addr *ia); -#else - -#include <sys/socket.h> -#include <netinet/in.h> -#include <netinet/tcp.h> -#include <arpa/inet.h> -#include <netdb.h> -#include <sys/un.h> - -#define socket_error() errno -#define closesocket(s) close(s) - #endif /* !_WIN32 */ #include "qapi-types.h" diff --git a/include/qemu/timer.h b/include/qemu/timer.h index d0946cb953..7197d0859a 100644 --- a/include/qemu/timer.h +++ b/include/qemu/timer.h @@ -210,12 +210,11 @@ void qemu_clock_notify(QEMUClockType type); void qemu_clock_enable(QEMUClockType type, bool enabled); /** - * qemu_clock_warp: - * @type: the clock type + * qemu_start_warp_timer: * - * Warp a clock to a new value + * Starts a timer for virtual clock update */ -void qemu_clock_warp(QEMUClockType type); +void qemu_start_warp_timer(void); /** * qemu_clock_register_reset_notifier: diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h index 9a5ead69a1..fd039e0e81 100644 --- a/include/qemu/typedefs.h +++ b/include/qemu/typedefs.h @@ -10,6 +10,7 @@ typedef struct AddressSpace AddressSpace; typedef struct AioContext AioContext; typedef struct AllwinnerAHCIState AllwinnerAHCIState; typedef struct AudioState AudioState; +typedef struct BdrvDirtyBitmap BdrvDirtyBitmap; typedef struct BlockBackend BlockBackend; typedef struct BlockBackendRootState BlockBackendRootState; typedef struct BlockDriverState BlockDriverState; diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h index 66c5cf22e1..00d69baa07 100644 --- a/include/sysemu/block-backend.h +++ b/include/sysemu/block-backend.h @@ -78,6 +78,7 @@ void blk_insert_bs(BlockBackend *blk, BlockDriverState *bs); void blk_hide_on_behalf_of_hmp_drive_del(BlockBackend *blk); +void blk_set_allow_write_beyond_eof(BlockBackend *blk, bool allow); void blk_iostatus_enable(BlockBackend *blk); bool blk_iostatus_is_enabled(const BlockBackend *blk); BlockDeviceIoStatus blk_iostatus(const BlockBackend *blk); diff --git a/include/sysemu/char.h b/include/sysemu/char.h index e46884f367..4c2f777ad1 100644 --- a/include/sysemu/char.h +++ b/include/sysemu/char.h @@ -86,6 +86,7 @@ struct CharDriverState { int is_mux; guint fd_in_tag; QemuOpts *opts; + bool replay; QTAILQ_ENTRY(CharDriverState) next; }; @@ -139,6 +140,22 @@ CharDriverState *qemu_chr_new(const char *label, const char *filename, void (*init)(struct CharDriverState *s)); /** + * @qemu_chr_new_noreplay: + * + * Create a new character backend from a URI. + * Character device communications are not written + * into the replay log. + * + * @label the name of the backend + * @filename the URI + * @init not sure.. + * + * Returns: a new character backend + */ +CharDriverState *qemu_chr_new_noreplay(const char *label, const char *filename, + void (*init)(struct CharDriverState *s)); + +/** * @qemu_chr_delete: * * Destroy a character backend and remove it from the list of @@ -341,6 +358,15 @@ int qemu_chr_be_can_write(CharDriverState *s); */ void qemu_chr_be_write(CharDriverState *s, uint8_t *buf, int len); +/** + * @qemu_chr_be_write_impl: + * + * Implementation of back end writing. Used by replay module. + * + * @buf a buffer to receive data from the front end + * @len the number of bytes to receive from the front end + */ +void qemu_chr_be_write_impl(CharDriverState *s, uint8_t *buf, int len); /** * @qemu_chr_be_event: diff --git a/include/sysemu/os-posix.h b/include/sysemu/os-posix.h index 5b9c4d6143..07e3e5ae9b 100644 --- a/include/sysemu/os-posix.h +++ b/include/sysemu/os-posix.h @@ -26,6 +26,12 @@ #ifndef QEMU_OS_POSIX_H #define QEMU_OS_POSIX_H +#include <sys/socket.h> +#include <netinet/in.h> +#include <netinet/tcp.h> +#include <arpa/inet.h> +#include <netdb.h> +#include <sys/un.h> void os_set_line_buffering(void); void os_set_proc_name(const char *s); @@ -34,6 +40,9 @@ void os_daemonize(void); void os_setup_post(void); int os_mlock(void); +#define closesocket(s) close(s) +#define ioctlsocket(s, r, v) ioctl(s, r, v) + typedef struct timeval qemu_timeval; #define qemu_gettimeofday(tp) gettimeofday(tp, NULL) diff --git a/include/sysemu/os-win32.h b/include/sysemu/os-win32.h index fbed346716..17aad3b20f 100644 --- a/include/sysemu/os-win32.h +++ b/include/sysemu/os-win32.h @@ -28,32 +28,7 @@ #include <winsock2.h> #include <windows.h> - -/* Workaround for older versions of MinGW. */ -#ifndef ECONNREFUSED -# define ECONNREFUSED WSAECONNREFUSED -#endif -#ifndef EINPROGRESS -# define EINPROGRESS WSAEINPROGRESS -#endif -#ifndef EHOSTUNREACH -# define EHOSTUNREACH WSAEHOSTUNREACH -#endif -#ifndef EINTR -# define EINTR WSAEINTR -#endif -#ifndef EINPROGRESS -# define EINPROGRESS WSAEINPROGRESS -#endif -#ifndef ENETUNREACH -# define ENETUNREACH WSAENETUNREACH -#endif -#ifndef ENOTCONN -# define ENOTCONN WSAENOTCONN -#endif -#ifndef EWOULDBLOCK -# define EWOULDBLOCK WSAEWOULDBLOCK -#endif +#include <ws2tcpip.h> #if defined(_WIN64) /* On w64, setjmp is implemented by _setjmp which needs a second parameter. @@ -80,7 +55,6 @@ struct tm *gmtime_r(const time_t *timep, struct tm *result); struct tm *localtime_r(const time_t *timep, struct tm *result); #endif /* CONFIG_LOCALTIME_R */ - static inline void os_setup_signal_handling(void) {} static inline void os_daemonize(void) {} static inline void os_setup_post(void) {} @@ -129,4 +103,82 @@ static inline char *realpath(const char *path, char *resolved_path) return resolved_path; } + +/* We wrap all the sockets functions so that we can + * set errno based on WSAGetLastError() + */ + +#undef connect +#define connect qemu_connect_wrap +int qemu_connect_wrap(int sockfd, const struct sockaddr *addr, + socklen_t addrlen); + +#undef listen +#define listen qemu_listen_wrap +int qemu_listen_wrap(int sockfd, int backlog); + +#undef bind +#define bind qemu_bind_wrap +int qemu_bind_wrap(int sockfd, const struct sockaddr *addr, + socklen_t addrlen); + +#undef socket +#define socket qemu_socket_wrap +int qemu_socket_wrap(int domain, int type, int protocol); + +#undef accept +#define accept qemu_accept_wrap +int qemu_accept_wrap(int sockfd, struct sockaddr *addr, + socklen_t *addrlen); + +#undef shutdown +#define shutdown qemu_shutdown_wrap +int qemu_shutdown_wrap(int sockfd, int how); + +#undef ioctlsocket +#define ioctlsocket qemu_ioctlsocket_wrap +int qemu_ioctlsocket_wrap(int fd, int req, void *val); + +#undef closesocket +#define closesocket qemu_closesocket_wrap +int qemu_closesocket_wrap(int fd); + +#undef getsockopt +#define getsockopt qemu_getsockopt_wrap +int qemu_getsockopt_wrap(int sockfd, int level, int optname, + void *optval, socklen_t *optlen); + +#undef setsockopt +#define setsockopt qemu_setsockopt_wrap +int qemu_setsockopt_wrap(int sockfd, int level, int optname, + const void *optval, socklen_t optlen); + +#undef getpeername +#define getpeername qemu_getpeername_wrap +int qemu_getpeername_wrap(int sockfd, struct sockaddr *addr, + socklen_t *addrlen); + +#undef getsockname +#define getsockname qemu_getsockname_wrap +int qemu_getsockname_wrap(int sockfd, struct sockaddr *addr, + socklen_t *addrlen); + +#undef send +#define send qemu_send_wrap +ssize_t qemu_send_wrap(int sockfd, const void *buf, size_t len, int flags); + +#undef sendto +#define sendto qemu_sendto_wrap +ssize_t qemu_sendto_wrap(int sockfd, const void *buf, size_t len, int flags, + const struct sockaddr *addr, socklen_t addrlen); + +#undef recv +#define recv qemu_recv_wrap +ssize_t qemu_recv_wrap(int sockfd, void *buf, size_t len, int flags); + +#undef recvfrom +#define recvfrom qemu_recvfrom_wrap +ssize_t qemu_recvfrom_wrap(int sockfd, void *buf, size_t len, int flags, + struct sockaddr *addr, socklen_t *addrlen); + #endif diff --git a/include/sysemu/replay.h b/include/sysemu/replay.h index e4108e8b1a..e7989199fc 100644 --- a/include/sysemu/replay.h +++ b/include/sysemu/replay.h @@ -27,7 +27,8 @@ typedef enum ReplayClockKind ReplayClockKind; /* IDs of the checkpoints */ enum ReplayCheckpoint { - CHECKPOINT_CLOCK_WARP, + CHECKPOINT_CLOCK_WARP_START, + CHECKPOINT_CLOCK_WARP_ACCOUNT, CHECKPOINT_RESET_REQUESTED, CHECKPOINT_SUSPEND_REQUESTED, CHECKPOINT_CLOCK_VIRTUAL, @@ -114,4 +115,21 @@ void replay_input_event(QemuConsole *src, InputEvent *evt); /*! Adds input sync event to the queue */ void replay_input_sync_event(void); +/* Character device */ + +/*! Registers char driver to save it's events */ +void replay_register_char_driver(struct CharDriverState *chr); +/*! Saves write to char device event to the log */ +void replay_chr_be_write(struct CharDriverState *s, uint8_t *buf, int len); +/*! Writes char write return value to the replay log. */ +void replay_char_write_event_save(int res, int offset); +/*! Reads char write return value from the replay log. */ +void replay_char_write_event_load(int *res, int *offset); +/*! Reads information about read_all character event. */ +int replay_char_read_all_load(uint8_t *buf); +/*! Writes character read_all error code into the replay log. */ +void replay_char_read_all_save_error(int res); +/*! Writes character read_all execution result into the replay log. */ +void replay_char_read_all_save_buf(uint8_t *buf, int offset); + #endif diff --git a/io/channel-command.c b/io/channel-command.c index f53ce0f4f4..604514adfc 100644 --- a/io/channel-command.c +++ b/io/channel-command.c @@ -236,8 +236,7 @@ static ssize_t qio_channel_command_readv(QIOChannel *ioc, retry: ret = readv(cioc->readfd, iov, niov); if (ret < 0) { - if (errno == EAGAIN || - errno == EWOULDBLOCK) { + if (errno == EAGAIN) { return QIO_CHANNEL_ERR_BLOCK; } if (errno == EINTR) { @@ -265,8 +264,7 @@ static ssize_t qio_channel_command_writev(QIOChannel *ioc, retry: ret = writev(cioc->writefd, iov, niov); if (ret <= 0) { - if (errno == EAGAIN || - errno == EWOULDBLOCK) { + if (errno == EAGAIN) { return QIO_CHANNEL_ERR_BLOCK; } if (errno == EINTR) { diff --git a/io/channel-file.c b/io/channel-file.c index 19a432562a..f28e2b0a5c 100644 --- a/io/channel-file.c +++ b/io/channel-file.c @@ -96,8 +96,7 @@ static ssize_t qio_channel_file_readv(QIOChannel *ioc, retry: ret = readv(fioc->fd, iov, niov); if (ret < 0) { - if (errno == EAGAIN || - errno == EWOULDBLOCK) { + if (errno == EAGAIN) { return QIO_CHANNEL_ERR_BLOCK; } if (errno == EINTR) { @@ -125,8 +124,7 @@ static ssize_t qio_channel_file_writev(QIOChannel *ioc, retry: ret = writev(fioc->fd, iov, niov); if (ret <= 0) { - if (errno == EAGAIN || - errno == EWOULDBLOCK) { + if (errno == EAGAIN) { return QIO_CHANNEL_ERR_BLOCK; } if (errno == EINTR) { diff --git a/io/channel-socket.c b/io/channel-socket.c index bf66a78235..d005070584 100644 --- a/io/channel-socket.c +++ b/io/channel-socket.c @@ -55,6 +55,10 @@ qio_channel_socket_new(void) ioc = QIO_CHANNEL(sioc); ioc->features |= (1 << QIO_CHANNEL_FEATURE_SHUTDOWN); +#ifdef WIN32 + ioc->event = CreateEvent(NULL, FALSE, FALSE, NULL); +#endif + trace_qio_channel_socket_new(sioc); return sioc; @@ -78,11 +82,11 @@ qio_channel_socket_set_fd(QIOChannelSocket *sioc, if (getpeername(fd, (struct sockaddr *)&sioc->remoteAddr, &sioc->remoteAddrLen) < 0) { - if (socket_error() == ENOTCONN) { + if (errno == ENOTCONN) { memset(&sioc->remoteAddr, 0, sizeof(sioc->remoteAddr)); sioc->remoteAddrLen = sizeof(sioc->remoteAddr); } else { - error_setg_errno(errp, socket_error(), + error_setg_errno(errp, errno, "Unable to query remote socket address"); goto error; } @@ -90,7 +94,7 @@ qio_channel_socket_set_fd(QIOChannelSocket *sioc, if (getsockname(fd, (struct sockaddr *)&sioc->localAddr, &sioc->localAddrLen) < 0) { - error_setg_errno(errp, socket_error(), + error_setg_errno(errp, errno, "Unable to query local socket address"); goto error; } @@ -341,13 +345,18 @@ qio_channel_socket_accept(QIOChannelSocket *ioc, cioc->remoteAddrLen = sizeof(ioc->remoteAddr); cioc->localAddrLen = sizeof(ioc->localAddr); +#ifdef WIN32 + QIO_CHANNEL(cioc)->event = CreateEvent(NULL, FALSE, FALSE, NULL); +#endif + + retry: trace_qio_channel_socket_accept(ioc); - cioc->fd = accept(ioc->fd, (struct sockaddr *)&cioc->remoteAddr, - &cioc->remoteAddrLen); + cioc->fd = qemu_accept(ioc->fd, (struct sockaddr *)&cioc->remoteAddr, + &cioc->remoteAddrLen); if (cioc->fd < 0) { trace_qio_channel_socket_accept_fail(ioc); - if (socket_error() == EINTR) { + if (errno == EINTR) { goto retry; } goto error; @@ -355,7 +364,7 @@ qio_channel_socket_accept(QIOChannelSocket *ioc, if (getsockname(cioc->fd, (struct sockaddr *)&cioc->localAddr, &cioc->localAddrLen) < 0) { - error_setg_errno(errp, socket_error(), + error_setg_errno(errp, errno, "Unable to query local socket address"); goto error; } @@ -384,7 +393,10 @@ static void qio_channel_socket_finalize(Object *obj) { QIOChannelSocket *ioc = QIO_CHANNEL_SOCKET(obj); if (ioc->fd != -1) { - close(ioc->fd); +#ifdef WIN32 + WSAEventSelect(ioc->fd, NULL, 0); +#endif + closesocket(ioc->fd); ioc->fd = -1; } } @@ -466,15 +478,14 @@ static ssize_t qio_channel_socket_readv(QIOChannel *ioc, retry: ret = recvmsg(sioc->fd, &msg, sflags); if (ret < 0) { - if (socket_error() == EAGAIN || - socket_error() == EWOULDBLOCK) { + if (errno == EAGAIN) { return QIO_CHANNEL_ERR_BLOCK; } - if (socket_error() == EINTR) { + if (errno == EINTR) { goto retry; } - error_setg_errno(errp, socket_error(), + error_setg_errno(errp, errno, "Unable to read from socket"); return -1; } @@ -526,14 +537,13 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc, retry: ret = sendmsg(sioc->fd, &msg, 0); if (ret <= 0) { - if (socket_error() == EAGAIN || - socket_error() == EWOULDBLOCK) { + if (errno == EAGAIN) { return QIO_CHANNEL_ERR_BLOCK; } - if (socket_error() == EINTR) { + if (errno == EINTR) { goto retry; } - error_setg_errno(errp, socket_error(), + error_setg_errno(errp, errno, "Unable to write to socket"); return -1; } @@ -559,17 +569,17 @@ static ssize_t qio_channel_socket_readv(QIOChannel *ioc, iov[i].iov_len, 0); if (ret < 0) { - if (socket_error() == EAGAIN) { + if (errno == EAGAIN) { if (done) { return done; } else { return QIO_CHANNEL_ERR_BLOCK; } - } else if (socket_error() == EINTR) { + } else if (errno == EINTR) { goto retry; } else { - error_setg_errno(errp, socket_error(), - "Unable to write to socket"); + error_setg_errno(errp, errno, + "Unable to read from socket"); return -1; } } @@ -601,16 +611,16 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc, iov[i].iov_len, 0); if (ret < 0) { - if (socket_error() == EAGAIN) { + if (errno == EAGAIN) { if (done) { return done; } else { return QIO_CHANNEL_ERR_BLOCK; } - } else if (socket_error() == EINTR) { + } else if (errno == EINTR) { goto retry; } else { - error_setg_errno(errp, socket_error(), + error_setg_errno(errp, errno, "Unable to write to socket"); return -1; } @@ -636,6 +646,11 @@ qio_channel_socket_set_blocking(QIOChannel *ioc, qemu_set_block(sioc->fd); } else { qemu_set_nonblock(sioc->fd); +#ifdef WIN32 + WSAEventSelect(sioc->fd, ioc->event, + FD_READ | FD_ACCEPT | FD_CLOSE | + FD_CONNECT | FD_WRITE | FD_OOB); +#endif } return 0; } @@ -671,13 +686,18 @@ qio_channel_socket_close(QIOChannel *ioc, { QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc); - if (closesocket(sioc->fd) < 0) { + if (sioc->fd != -1) { +#ifdef WIN32 + WSAEventSelect(sioc->fd, NULL, 0); +#endif + if (closesocket(sioc->fd) < 0) { + sioc->fd = -1; + error_setg_errno(errp, errno, + "Unable to close socket"); + return -1; + } sioc->fd = -1; - error_setg_errno(errp, socket_error(), - "Unable to close socket"); - return -1; } - sioc->fd = -1; return 0; } @@ -703,7 +723,7 @@ qio_channel_socket_shutdown(QIOChannel *ioc, } if (shutdown(sioc->fd, sockhow) < 0) { - error_setg_errno(errp, socket_error(), + error_setg_errno(errp, errno, "Unable to shutdown socket"); return -1; } @@ -714,9 +734,9 @@ static GSource *qio_channel_socket_create_watch(QIOChannel *ioc, GIOCondition condition) { QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc); - return qio_channel_create_fd_watch(ioc, - sioc->fd, - condition); + return qio_channel_create_socket_watch(ioc, + sioc->fd, + condition); } static void qio_channel_socket_class_init(ObjectClass *klass, diff --git a/io/channel-watch.c b/io/channel-watch.c index 931fa4d49d..cf1cdff896 100644 --- a/io/channel-watch.c +++ b/io/channel-watch.c @@ -30,6 +30,20 @@ struct QIOChannelFDSource { }; +#ifdef CONFIG_WIN32 +typedef struct QIOChannelSocketSource QIOChannelSocketSource; +struct QIOChannelSocketSource { + GSource parent; + GPollFD fd; + QIOChannel *ioc; + SOCKET socket; + int revents; + GIOCondition condition; +}; + +#endif + + typedef struct QIOChannelFDPairSource QIOChannelFDPairSource; struct QIOChannelFDPairSource { GSource parent; @@ -82,6 +96,97 @@ qio_channel_fd_source_finalize(GSource *source) } +#ifdef CONFIG_WIN32 +static gboolean +qio_channel_socket_source_prepare(GSource *source G_GNUC_UNUSED, + gint *timeout) +{ + *timeout = -1; + + return FALSE; +} + + +/* + * NB, this impl only works when the socket is in non-blocking + * mode on Win32 + */ +static gboolean +qio_channel_socket_source_check(GSource *source) +{ + static struct timeval tv0; + + QIOChannelSocketSource *ssource = (QIOChannelSocketSource *)source; + WSANETWORKEVENTS ev; + fd_set rfds, wfds, xfds; + + if (!ssource->condition) { + return 0; + } + + WSAEnumNetworkEvents(ssource->socket, ssource->ioc->event, &ev); + + FD_ZERO(&rfds); + FD_ZERO(&wfds); + FD_ZERO(&xfds); + if (ssource->condition & G_IO_IN) { + FD_SET((SOCKET)ssource->socket, &rfds); + } + if (ssource->condition & G_IO_OUT) { + FD_SET((SOCKET)ssource->socket, &wfds); + } + if (ssource->condition & G_IO_PRI) { + FD_SET((SOCKET)ssource->socket, &xfds); + } + ssource->revents = 0; + if (select(0, &rfds, &wfds, &xfds, &tv0) == 0) { + return 0; + } + + if (FD_ISSET(ssource->socket, &rfds)) { + ssource->revents |= G_IO_IN; + } + if (FD_ISSET(ssource->socket, &wfds)) { + ssource->revents |= G_IO_OUT; + } + if (FD_ISSET(ssource->socket, &xfds)) { + ssource->revents |= G_IO_PRI; + } + + return ssource->revents; +} + + +static gboolean +qio_channel_socket_source_dispatch(GSource *source, + GSourceFunc callback, + gpointer user_data) +{ + QIOChannelFunc func = (QIOChannelFunc)callback; + QIOChannelSocketSource *ssource = (QIOChannelSocketSource *)source; + + return (*func)(ssource->ioc, ssource->revents, user_data); +} + + +static void +qio_channel_socket_source_finalize(GSource *source) +{ + QIOChannelSocketSource *ssource = (QIOChannelSocketSource *)source; + + object_unref(OBJECT(ssource->ioc)); +} + + +GSourceFuncs qio_channel_socket_source_funcs = { + qio_channel_socket_source_prepare, + qio_channel_socket_source_check, + qio_channel_socket_source_dispatch, + qio_channel_socket_source_finalize +}; +#endif + + static gboolean qio_channel_fd_pair_source_prepare(GSource *source G_GNUC_UNUSED, gint *timeout) @@ -160,7 +265,11 @@ GSource *qio_channel_create_fd_watch(QIOChannel *ioc, ssource->condition = condition; +#ifdef CONFIG_WIN32 + ssource->fd.fd = (gint64)_get_osfhandle(fd); +#else ssource->fd.fd = fd; +#endif ssource->fd.events = condition; g_source_add_poll(source, &ssource->fd); @@ -168,6 +277,40 @@ GSource *qio_channel_create_fd_watch(QIOChannel *ioc, return source; } +#ifdef CONFIG_WIN32 +GSource *qio_channel_create_socket_watch(QIOChannel *ioc, + int socket, + GIOCondition condition) +{ + GSource *source; + QIOChannelSocketSource *ssource; + + source = g_source_new(&qio_channel_socket_source_funcs, + sizeof(QIOChannelSocketSource)); + ssource = (QIOChannelSocketSource *)source; + + ssource->ioc = ioc; + object_ref(OBJECT(ioc)); + + ssource->condition = condition; + ssource->socket = socket; + ssource->revents = 0; + + ssource->fd.fd = (gintptr)ioc->event; + ssource->fd.events = G_IO_IN; + + g_source_add_poll(source, &ssource->fd); + + return source; +} +#else +GSource *qio_channel_create_socket_watch(QIOChannel *ioc, + int socket, + GIOCondition condition) +{ + return qio_channel_create_fd_watch(ioc, socket, condition); +} +#endif GSource *qio_channel_create_fd_pair_watch(QIOChannel *ioc, int fdread, @@ -186,10 +329,15 @@ GSource *qio_channel_create_fd_pair_watch(QIOChannel *ioc, ssource->condition = condition; +#ifdef CONFIG_WIN32 + ssource->fdread.fd = (gint64)_get_osfhandle(fdread); + ssource->fdwrite.fd = (gint64)_get_osfhandle(fdwrite); +#else ssource->fdread.fd = fdread; - ssource->fdread.events = condition & G_IO_IN; - ssource->fdwrite.fd = fdwrite; +#endif + + ssource->fdread.events = condition & G_IO_IN; ssource->fdwrite.events = condition & G_IO_OUT; g_source_add_poll(source, &ssource->fdread); diff --git a/io/channel.c b/io/channel.c index 3fc09f887c..dd6fc0eb28 100644 --- a/io/channel.c +++ b/io/channel.c @@ -274,10 +274,24 @@ void qio_channel_wait(QIOChannel *ioc, } +#ifdef _WIN32 +static void qio_channel_finalize(Object *obj) +{ + QIOChannel *ioc = QIO_CHANNEL(obj); + + if (ioc->event) { + CloseHandle(ioc->event); + } +} +#endif + static const TypeInfo qio_channel_info = { .parent = TYPE_OBJECT, .name = TYPE_QIO_CHANNEL, .instance_size = sizeof(QIOChannel), +#ifdef _WIN32 + .instance_finalize = qio_channel_finalize, +#endif .abstract = true, .class_size = sizeof(QIOChannelClass), }; diff --git a/linux-user/flatload.c b/linux-user/flatload.c index a25c797b73..f9139c399a 100644 --- a/linux-user/flatload.c +++ b/linux-user/flatload.c @@ -38,7 +38,6 @@ #include "qemu.h" #include "flat.h" -#define ntohl(x) be32_to_cpu(x) #include <target_flat.h> //#define DEBUG diff --git a/main-loop.c b/main-loop.c index 19beae76ad..3a7f4cdbb2 100644 --- a/main-loop.c +++ b/main-loop.c @@ -509,7 +509,7 @@ int main_loop_wait(int nonblocking) /* CPU thread can infinitely wait for event after missing the warp */ - qemu_clock_warp(QEMU_CLOCK_VIRTUAL); + qemu_start_warp_timer(); qemu_clock_run_all_timers(); return ret; diff --git a/memory.c b/memory.c index 9f5c4584d1..95f720964b 100644 --- a/memory.c +++ b/memory.c @@ -386,6 +386,14 @@ static hwaddr memory_region_to_absolute_addr(MemoryRegion *mr, hwaddr offset) return abs_addr; } +static int get_cpu_index(void) +{ + if (current_cpu) { + return current_cpu->cpu_index; + } + return -1; +} + static MemTxResult memory_region_oldmmio_read_accessor(MemoryRegion *mr, hwaddr addr, uint64_t *value, @@ -398,10 +406,15 @@ static MemTxResult memory_region_oldmmio_read_accessor(MemoryRegion *mr, tmp = mr->ops->old_mmio.read[ctz32(size)](mr->opaque, addr); if (mr->subpage) { - trace_memory_region_subpage_read(mr, addr, tmp, size); + trace_memory_region_subpage_read(get_cpu_index(), mr, addr, tmp, size); + } else if (mr == &io_mem_notdirty) { + /* Accesses to code which has previously been translated into a TB show + * up in the MMIO path, as accesses to the io_mem_notdirty + * MemoryRegion. */ + trace_memory_region_tb_read(get_cpu_index(), addr, tmp, size); } else if (TRACE_MEMORY_REGION_OPS_READ_ENABLED) { hwaddr abs_addr = memory_region_to_absolute_addr(mr, addr); - trace_memory_region_ops_read(mr, abs_addr, tmp, size); + trace_memory_region_ops_read(get_cpu_index(), mr, abs_addr, tmp, size); } *value |= (tmp & mask) << shift; return MEMTX_OK; @@ -419,10 +432,15 @@ static MemTxResult memory_region_read_accessor(MemoryRegion *mr, tmp = mr->ops->read(mr->opaque, addr, size); if (mr->subpage) { - trace_memory_region_subpage_read(mr, addr, tmp, size); + trace_memory_region_subpage_read(get_cpu_index(), mr, addr, tmp, size); + } else if (mr == &io_mem_notdirty) { + /* Accesses to code which has previously been translated into a TB show + * up in the MMIO path, as accesses to the io_mem_notdirty + * MemoryRegion. */ + trace_memory_region_tb_read(get_cpu_index(), addr, tmp, size); } else if (TRACE_MEMORY_REGION_OPS_READ_ENABLED) { hwaddr abs_addr = memory_region_to_absolute_addr(mr, addr); - trace_memory_region_ops_read(mr, abs_addr, tmp, size); + trace_memory_region_ops_read(get_cpu_index(), mr, abs_addr, tmp, size); } *value |= (tmp & mask) << shift; return MEMTX_OK; @@ -441,10 +459,15 @@ static MemTxResult memory_region_read_with_attrs_accessor(MemoryRegion *mr, r = mr->ops->read_with_attrs(mr->opaque, addr, &tmp, size, attrs); if (mr->subpage) { - trace_memory_region_subpage_read(mr, addr, tmp, size); + trace_memory_region_subpage_read(get_cpu_index(), mr, addr, tmp, size); + } else if (mr == &io_mem_notdirty) { + /* Accesses to code which has previously been translated into a TB show + * up in the MMIO path, as accesses to the io_mem_notdirty + * MemoryRegion. */ + trace_memory_region_tb_read(get_cpu_index(), addr, tmp, size); } else if (TRACE_MEMORY_REGION_OPS_READ_ENABLED) { hwaddr abs_addr = memory_region_to_absolute_addr(mr, addr); - trace_memory_region_ops_read(mr, abs_addr, tmp, size); + trace_memory_region_ops_read(get_cpu_index(), mr, abs_addr, tmp, size); } *value |= (tmp & mask) << shift; return r; @@ -462,10 +485,15 @@ static MemTxResult memory_region_oldmmio_write_accessor(MemoryRegion *mr, tmp = (*value >> shift) & mask; if (mr->subpage) { - trace_memory_region_subpage_write(mr, addr, tmp, size); + trace_memory_region_subpage_write(get_cpu_index(), mr, addr, tmp, size); + } else if (mr == &io_mem_notdirty) { + /* Accesses to code which has previously been translated into a TB show + * up in the MMIO path, as accesses to the io_mem_notdirty + * MemoryRegion. */ + trace_memory_region_tb_write(get_cpu_index(), addr, tmp, size); } else if (TRACE_MEMORY_REGION_OPS_WRITE_ENABLED) { hwaddr abs_addr = memory_region_to_absolute_addr(mr, addr); - trace_memory_region_ops_write(mr, abs_addr, tmp, size); + trace_memory_region_ops_write(get_cpu_index(), mr, abs_addr, tmp, size); } mr->ops->old_mmio.write[ctz32(size)](mr->opaque, addr, tmp); return MEMTX_OK; @@ -483,10 +511,15 @@ static MemTxResult memory_region_write_accessor(MemoryRegion *mr, tmp = (*value >> shift) & mask; if (mr->subpage) { - trace_memory_region_subpage_write(mr, addr, tmp, size); + trace_memory_region_subpage_write(get_cpu_index(), mr, addr, tmp, size); + } else if (mr == &io_mem_notdirty) { + /* Accesses to code which has previously been translated into a TB show + * up in the MMIO path, as accesses to the io_mem_notdirty + * MemoryRegion. */ + trace_memory_region_tb_write(get_cpu_index(), addr, tmp, size); } else if (TRACE_MEMORY_REGION_OPS_WRITE_ENABLED) { hwaddr abs_addr = memory_region_to_absolute_addr(mr, addr); - trace_memory_region_ops_write(mr, abs_addr, tmp, size); + trace_memory_region_ops_write(get_cpu_index(), mr, abs_addr, tmp, size); } mr->ops->write(mr->opaque, addr, tmp, size); return MEMTX_OK; @@ -504,10 +537,15 @@ static MemTxResult memory_region_write_with_attrs_accessor(MemoryRegion *mr, tmp = (*value >> shift) & mask; if (mr->subpage) { - trace_memory_region_subpage_write(mr, addr, tmp, size); + trace_memory_region_subpage_write(get_cpu_index(), mr, addr, tmp, size); + } else if (mr == &io_mem_notdirty) { + /* Accesses to code which has previously been translated into a TB show + * up in the MMIO path, as accesses to the io_mem_notdirty + * MemoryRegion. */ + trace_memory_region_tb_write(get_cpu_index(), addr, tmp, size); } else if (TRACE_MEMORY_REGION_OPS_WRITE_ENABLED) { hwaddr abs_addr = memory_region_to_absolute_addr(mr, addr); - trace_memory_region_ops_write(mr, abs_addr, tmp, size); + trace_memory_region_ops_write(get_cpu_index(), mr, abs_addr, tmp, size); } return mr->ops->write_with_attrs(mr->opaque, addr, tmp, size, attrs); } diff --git a/migration/migration.c b/migration/migration.c index 7d13377b8e..034a918d32 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -706,7 +706,7 @@ void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params, */ error_report("Postcopy is not currently compatible with " "compression"); - s->enabled_capabilities[MIGRATION_CAPABILITY_X_POSTCOPY_RAM] = + s->enabled_capabilities[MIGRATION_CAPABILITY_POSTCOPY_RAM] = false; } } @@ -1125,7 +1125,7 @@ bool migrate_postcopy_ram(void) s = migrate_get_current(); - return s->enabled_capabilities[MIGRATION_CAPABILITY_X_POSTCOPY_RAM]; + return s->enabled_capabilities[MIGRATION_CAPABILITY_POSTCOPY_RAM]; } bool migrate_auto_converge(void) @@ -1269,8 +1269,7 @@ static void *source_return_path_thread(void *opaque) MigrationState *ms = opaque; QEMUFile *rp = ms->rp_state.from_dst_file; uint16_t header_len, header_type; - const int max_len = 512; - uint8_t buf[max_len]; + uint8_t buf[512]; uint32_t tmp32, sibling_error; ram_addr_t start = 0; /* =0 to silence warning */ size_t len = 0, expected_len; @@ -1293,7 +1292,7 @@ static void *source_return_path_thread(void *opaque) if ((rp_cmd_args[header_type].len != -1 && header_len != rp_cmd_args[header_type].len) || - header_len > max_len) { + header_len > sizeof(buf)) { error_report("RP: Received '%s' message (0x%04x) with" "incorrect length %d expecting %zu", rp_cmd_args[header_type].name, header_type, header_len, diff --git a/migration/qemu-file-unix.c b/migration/qemu-file-unix.c index 61b059b25b..4474e18ff8 100644 --- a/migration/qemu-file-unix.c +++ b/migration/qemu-file-unix.c @@ -53,18 +53,16 @@ static ssize_t socket_writev_buffer(void *opaque, struct iovec *iov, int iovcnt, } if (size > 0) { - err = socket_error(); - - if (err != EAGAIN && err != EWOULDBLOCK) { + if (errno != EAGAIN && errno != EWOULDBLOCK) { error_report("socket_writev_buffer: Got err=%d for (%zu/%zu)", - err, (size_t)size, (size_t)len); + errno, (size_t)size, (size_t)len); /* * If I've already sent some but only just got the error, I * could return the amount validly sent so far and wait for the * next call to report the error, but I'd rather flag the error * immediately. */ - return -err; + return -errno; } /* Emulate blocking */ @@ -99,15 +97,15 @@ static ssize_t socket_get_buffer(void *opaque, uint8_t *buf, int64_t pos, if (len != -1) { break; } - if (socket_error() == EAGAIN) { + if (errno == EAGAIN) { yield_until_fd_readable(s->fd); - } else if (socket_error() != EINTR) { + } else if (errno != EINTR) { break; } } if (len == -1) { - len = -socket_error(); + len = -errno; } return len; } diff --git a/migration/savevm.c b/migration/savevm.c index 96e7db5967..0a33c227c5 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -1494,17 +1494,22 @@ static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis) qemu_sem_init(&mis->listen_thread_sem, 0); qemu_thread_create(&mis->listen_thread, "postcopy/listen", postcopy_ram_listen_thread, mis->from_src_file, - QEMU_THREAD_JOINABLE); + QEMU_THREAD_DETACHED); qemu_sem_wait(&mis->listen_thread_sem); qemu_sem_destroy(&mis->listen_thread_sem); return 0; } + +typedef struct { + QEMUBH *bh; +} HandleRunBhData; + static void loadvm_postcopy_handle_run_bh(void *opaque) { Error *local_err = NULL; - MigrationIncomingState *mis = opaque; + HandleRunBhData *data = opaque; /* TODO we should move all of this lot into postcopy_ram.c or a shared code * in migration.c @@ -1532,13 +1537,15 @@ static void loadvm_postcopy_handle_run_bh(void *opaque) runstate_set(RUN_STATE_PAUSED); } - qemu_bh_delete(mis->bh); + qemu_bh_delete(data->bh); + g_free(data); } /* After all discards we can start running and asking for pages */ static int loadvm_postcopy_handle_run(MigrationIncomingState *mis) { PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_RUNNING); + HandleRunBhData *data; trace_loadvm_postcopy_handle_run(); if (ps != POSTCOPY_INCOMING_LISTENING) { @@ -1546,8 +1553,9 @@ static int loadvm_postcopy_handle_run(MigrationIncomingState *mis) return -1; } - mis->bh = qemu_bh_new(loadvm_postcopy_handle_run_bh, NULL); - qemu_bh_schedule(mis->bh); + data = g_new(HandleRunBhData, 1); + data->bh = qemu_bh_new(loadvm_postcopy_handle_run_bh, data); + qemu_bh_schedule(data->bh); /* We need to finish reading the stream from the package * and also stop reading anything more from the stream that loaded the diff --git a/migration/tcp.c b/migration/tcp.c index e888a4e490..e1fa7f8f18 100644 --- a/migration/tcp.c +++ b/migration/tcp.c @@ -59,12 +59,11 @@ static void tcp_accept_incoming_migration(void *opaque) socklen_t addrlen = sizeof(addr); int s = (intptr_t)opaque; QEMUFile *f; - int c, err; + int c; do { c = qemu_accept(s, (struct sockaddr *)&addr, &addrlen); - err = socket_error(); - } while (c < 0 && err == EINTR); + } while (c < 0 && errno == EINTR); qemu_set_fd_handler(s, NULL, NULL, NULL); closesocket(s); @@ -72,7 +71,7 @@ static void tcp_accept_incoming_migration(void *opaque) if (c < 0) { error_report("could not accept migration connection (%s)", - strerror(err)); + strerror(errno)); return; } diff --git a/monitor.c b/monitor.c index e99ca8c91e..894f862dd3 100644 --- a/monitor.c +++ b/monitor.c @@ -76,6 +76,7 @@ #include "qapi-event.h" #include "qmp-introspect.h" #include "sysemu/block-backend.h" +#include "sysemu/qtest.h" /* for hmp_info_irq/pic */ #if defined(TARGET_SPARC) @@ -232,6 +233,8 @@ static const mon_cmd_t qmp_cmds[]; Monitor *cur_mon; +static QEMUClockType event_clock_type = QEMU_CLOCK_REALTIME; + static void monitor_command_cb(void *opaque, const char *cmdline, void *readline_opaque); @@ -513,7 +516,7 @@ monitor_qapi_event_queue(QAPIEvent event, QDict *qdict, Error **errp) * monitor_qapi_event_handler() in evconf->rate ns. Any * events arriving before then will be delayed until then. */ - int64_t now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); + int64_t now = qemu_clock_get_ns(event_clock_type); monitor_qapi_event_emit(event, qdict); @@ -522,7 +525,7 @@ monitor_qapi_event_queue(QAPIEvent event, QDict *qdict, Error **errp) evstate->data = data; QINCREF(evstate->data); evstate->qdict = NULL; - evstate->timer = timer_new_ns(QEMU_CLOCK_REALTIME, + evstate->timer = timer_new_ns(event_clock_type, monitor_qapi_event_handler, evstate); g_hash_table_add(monitor_qapi_event_state, evstate); @@ -547,7 +550,7 @@ static void monitor_qapi_event_handler(void *opaque) qemu_mutex_lock(&monitor_lock); if (evstate->qdict) { - int64_t now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); + int64_t now = qemu_clock_get_ns(event_clock_type); monitor_qapi_event_emit(evstate->event, evstate->qdict); QDECREF(evstate->qdict); @@ -572,6 +575,10 @@ static unsigned int qapi_event_throttle_hash(const void *key) hash += g_str_hash(qdict_get_str(evstate->data, "id")); } + if (evstate->event == QAPI_EVENT_QUORUM_REPORT_BAD) { + hash += g_str_hash(qdict_get_str(evstate->data, "node-name")); + } + return hash; } @@ -589,11 +596,20 @@ static gboolean qapi_event_throttle_equal(const void *a, const void *b) qdict_get_str(evb->data, "id")); } + if (eva->event == QAPI_EVENT_QUORUM_REPORT_BAD) { + return !strcmp(qdict_get_str(eva->data, "node-name"), + qdict_get_str(evb->data, "node-name")); + } + return TRUE; } static void monitor_qapi_event_init(void) { + if (qtest_enabled()) { + event_clock_type = QEMU_CLOCK_VIRTUAL; + } + monitor_qapi_event_state = g_hash_table_new(qapi_event_throttle_hash, qapi_event_throttle_equal); qmp_event_set_func_emit(monitor_qapi_event_queue); diff --git a/net/net.c b/net/net.c index b0c832e1f0..73cefbcb67 100644 --- a/net/net.c +++ b/net/net.c @@ -1050,6 +1050,37 @@ int net_client_init(QemuOpts *opts, int is_netdev, Error **errp) OptsVisitor *ov = opts_visitor_new(opts); Visitor *v = opts_get_visitor(ov); + { + /* Parse convenience option format ip6-net=fec0::0[/64] */ + const char *ip6_net = qemu_opt_get(opts, "ip6-net"); + + if (ip6_net) { + char buf[strlen(ip6_net) + 1]; + + if (get_str_sep(buf, sizeof(buf), &ip6_net, '/') < 0) { + /* Default 64bit prefix length. */ + qemu_opt_set(opts, "ip6-prefix", ip6_net, &error_abort); + qemu_opt_set_number(opts, "ip6-prefixlen", 64, &error_abort); + } else { + /* User-specified prefix length. */ + unsigned long len; + int err; + + qemu_opt_set(opts, "ip6-prefix", buf, &error_abort); + err = qemu_strtoul(ip6_net, NULL, 10, &len); + + if (err) { + error_setg(errp, QERR_INVALID_PARAMETER_VALUE, + "ip6-prefix", "a number"); + } else { + qemu_opt_set_number(opts, "ip6-prefixlen", len, + &error_abort); + } + } + qemu_opt_unset(opts, "ip6-net"); + } + } + if (is_netdev) { visit_type_Netdev(v, NULL, (Netdev **)&object, &err); } else { diff --git a/net/slirp.c b/net/slirp.c index 6b51fbc306..9954160de3 100644 --- a/net/slirp.c +++ b/net/slirp.c @@ -36,6 +36,7 @@ #include "qemu/error-report.h" #include "qemu/sockets.h" #include "slirp/libslirp.h" +#include "slirp/ip6.h" #include "sysemu/char.h" static int get_str_sep(char *buf, int buf_size, const char **pp, int sep) @@ -135,10 +136,13 @@ static NetClientInfo net_slirp_info = { static int net_slirp_init(NetClientState *peer, const char *model, const char *name, int restricted, const char *vnetwork, const char *vhost, + const char *vprefix6, int vprefix6_len, + const char *vhost6, const char *vhostname, const char *tftp_export, const char *bootfile, const char *vdhcp_start, - const char *vnameserver, const char *smb_export, - const char *vsmbserver, const char **dnssearch) + const char *vnameserver, const char *vnameserver6, + const char *smb_export, const char *vsmbserver, + const char **dnssearch) { /* default settings according to historic slirp */ struct in_addr net = { .s_addr = htonl(0x0a000200) }; /* 10.0.2.0 */ @@ -146,6 +150,9 @@ static int net_slirp_init(NetClientState *peer, const char *model, struct in_addr host = { .s_addr = htonl(0x0a000202) }; /* 10.0.2.2 */ struct in_addr dhcp = { .s_addr = htonl(0x0a00020f) }; /* 10.0.2.15 */ struct in_addr dns = { .s_addr = htonl(0x0a000203) }; /* 10.0.2.3 */ + struct in6_addr ip6_prefix; + struct in6_addr ip6_host; + struct in6_addr ip6_dns; #ifndef _WIN32 struct in_addr smbsrv = { .s_addr = 0 }; #endif @@ -235,6 +242,64 @@ static int net_slirp_init(NetClientState *peer, const char *model, } #endif +#if defined(_WIN32) && (_WIN32_WINNT < 0x0600) + /* No inet_pton helper before Vista... */ + if (vprefix6) { + /* Unsupported */ + return -1; + } + memset(&ip6_prefix, 0, sizeof(ip6_prefix)); + ip6_prefix.s6_addr[0] = 0xfe; + ip6_prefix.s6_addr[1] = 0xc0; +#else + if (!vprefix6) { + vprefix6 = "fec0::"; + } + if (!inet_pton(AF_INET6, vprefix6, &ip6_prefix)) { + return -1; + } +#endif + + if (!vprefix6_len) { + vprefix6_len = 64; + } + if (vprefix6_len < 0 || vprefix6_len > 126) { + return -1; + } + + if (vhost6) { +#if defined(_WIN32) && (_WIN32_WINNT < 0x0600) + return -1; +#else + if (!inet_pton(AF_INET6, vhost6, &ip6_host)) { + return -1; + } + if (!in6_equal_net(&ip6_prefix, &ip6_host, vprefix6_len)) { + return -1; + } +#endif + } else { + ip6_host = ip6_prefix; + ip6_host.s6_addr[15] |= 2; + } + + if (vnameserver6) { +#if defined(_WIN32) && (_WIN32_WINNT < 0x0600) + return -1; +#else + if (!inet_pton(AF_INET6, vnameserver6, &ip6_dns)) { + return -1; + } + if (!in6_equal_net(&ip6_prefix, &ip6_dns, vprefix6_len)) { + return -1; + } +#endif + } else { + ip6_dns = ip6_prefix; + ip6_dns.s6_addr[15] |= 3; + } + + nc = qemu_new_net_client(&net_slirp_info, peer, model, name); snprintf(nc->info_str, sizeof(nc->info_str), @@ -243,8 +308,10 @@ static int net_slirp_init(NetClientState *peer, const char *model, s = DO_UPCAST(SlirpState, nc, nc); - s->slirp = slirp_init(restricted, net, mask, host, vhostname, - tftp_export, bootfile, dhcp, dns, dnssearch, s); + s->slirp = slirp_init(restricted, net, mask, host, + ip6_prefix, vprefix6_len, ip6_host, + vhostname, tftp_export, bootfile, dhcp, + dns, ip6_dns, dnssearch, s); QTAILQ_INSERT_TAIL(&slirp_stacks, s, entry); for (config = slirp_configs; config; config = config->next) { @@ -761,8 +828,10 @@ int net_init_slirp(const NetClientOptions *opts, const char *name, net_init_slirp_configs(user->guestfwd, 0); ret = net_slirp_init(peer, "user", name, user->q_restrict, vnet, - user->host, user->hostname, user->tftp, - user->bootfile, user->dhcpstart, user->dns, user->smb, + user->host, user->ip6_prefix, user->ip6_prefixlen, + user->ip6_host, user->hostname, user->tftp, + user->bootfile, user->dhcpstart, + user->dns, user->ip6_dns, user->smb, user->smbserver, dnssearch); while (slirp_configs) { diff --git a/net/socket.c b/net/socket.c index e32e3cb996..73dc49a3a4 100644 --- a/net/socket.c +++ b/net/socket.c @@ -145,15 +145,14 @@ static void net_socket_send_completed(NetClientState *nc, ssize_t len) static void net_socket_send(void *opaque) { NetSocketState *s = opaque; - int size, err; + int size; unsigned l; uint8_t buf1[NET_BUFSIZE]; const uint8_t *buf; size = qemu_recv(s->fd, buf1, sizeof(buf1), 0); if (size < 0) { - err = socket_error(); - if (err != EWOULDBLOCK) + if (errno != EWOULDBLOCK) goto eoc; } else if (size == 0) { /* end of connection */ @@ -566,7 +565,7 @@ static int net_socket_connect_init(NetClientState *peer, const char *host_str) { NetSocketState *s; - int fd, connected, ret, err; + int fd, connected, ret; struct sockaddr_in saddr; if (parse_host_port(&saddr, host_str) < 0) @@ -583,14 +582,12 @@ static int net_socket_connect_init(NetClientState *peer, for(;;) { ret = connect(fd, (struct sockaddr *)&saddr, sizeof(saddr)); if (ret < 0) { - err = socket_error(); - if (err == EINTR || err == EWOULDBLOCK) { - } else if (err == EINPROGRESS) { - break; -#ifdef _WIN32 - } else if (err == WSAEALREADY || err == WSAEINVAL) { + if (errno == EINTR || errno == EWOULDBLOCK) { + /* continue */ + } else if (errno == EINPROGRESS || + errno == EALREADY || + errno == EINVAL) { break; -#endif } else { perror("connect"); closesocket(fd); diff --git a/pc-bios/s390-ccw/bootmap.c b/pc-bios/s390-ccw/bootmap.c index 415508b279..492530275d 100644 --- a/pc-bios/s390-ccw/bootmap.c +++ b/pc-bios/s390-ccw/bootmap.c @@ -424,7 +424,7 @@ static void ipl_scsi(void) IPL_assert(magic_match(sec, ZIPL_MAGIC), "No zIPL magic"); ns_end = sec + virtio_get_block_size(); - for (ns = (sec + pte_len); (ns + pte_len) < ns_end; ns++) { + for (ns = (sec + pte_len); (ns + pte_len) < ns_end; ns += pte_len) { prog_table_entry = (ScsiBlockPtr *)ns; if (!prog_table_entry->blockno) { break; diff --git a/qapi-schema.json b/qapi-schema.json index 362c9d816a..f253a37672 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -540,15 +540,15 @@ # @auto-converge: If enabled, QEMU will automatically throttle down the guest # to speed up convergence of RAM migration. (since 1.6) # -# @x-postcopy-ram: Start executing on the migration target before all of RAM has +# @postcopy-ram: Start executing on the migration target before all of RAM has # been migrated, pulling the remaining pages along as needed. NOTE: If -# the migration fails during postcopy the VM will fail. (since 2.5) +# the migration fails during postcopy the VM will fail. (since 2.6) # # Since: 1.2 ## { 'enum': 'MigrationCapability', 'data': ['xbzrle', 'rdma-pin-all', 'auto-converge', 'zero-blocks', - 'compress', 'events', 'x-postcopy-ram'] } + 'compress', 'events', 'postcopy-ram'] } ## # @MigrationCapabilityStatus @@ -705,7 +705,7 @@ # @migrate-start-postcopy # # Followup to a migration command to switch the migration to postcopy mode. -# The x-postcopy-ram capability must be set before the original migration +# The postcopy-ram capability must be set before the original migration # command. # # Since: 2.5 @@ -2451,6 +2451,14 @@ # @dnssearch: #optional list of DNS suffixes to search, passed as DHCP option # to the guest # +# @ip6-prefix: #optional IPv6 network prefix (default is fec0::) (since 2.6) +# +# @ip6-prefixlen: #optional IPv6 network prefix length (default is 64) (since 2.6) +# +# @ip6-host: #optional guest-visible IPv6 address of the host (since 2.6) +# +# @ip6-dns: #optional guest-visible IPv6 address of the virtual nameserver (since 2.6) +# # @smb: #optional root directory of the built-in SMB server # # @smbserver: #optional IP address of the built-in SMB server @@ -2474,6 +2482,10 @@ '*dhcpstart': 'str', '*dns': 'str', '*dnssearch': ['String'], + '*ip6-prefix': 'str', + '*ip6-prefixlen': 'int', + '*ip6-host': 'str', + '*ip6-dns': 'str', '*smb': 'str', '*smbserver': 'str', '*hostfwd': ['String'], diff --git a/qapi/block.json b/qapi/block.json index 58e6b301bf..937337dce5 100644 --- a/qapi/block.json +++ b/qapi/block.json @@ -196,3 +196,19 @@ ## { 'event': 'DEVICE_TRAY_MOVED', 'data': { 'device': 'str', 'tray-open': 'bool' } } + +## +# @QuorumOpType +# +# An enumeration of the quorum operation types +# +# @read: read operation +# +# @write: write operation +# +# @flush: flush operation +# +# Since: 2.6 +## +{ 'enum': 'QuorumOpType', + 'data': [ 'read', 'write', 'flush' ] } diff --git a/qapi/event.json b/qapi/event.json index 1a45a6cb26..8642052ebc 100644 --- a/qapi/event.json +++ b/qapi/event.json @@ -325,6 +325,8 @@ # # Emitted to report a corruption of a Quorum file # +# @type: quorum operation type (Since 2.6) +# # @error: #optional, error message. Only present on failure. This field # contains a human-readable error message. There are no semantics other # than that the block layer reported an error and clients should not @@ -339,7 +341,7 @@ # Since: 2.0 ## { 'event': 'QUORUM_REPORT_BAD', - 'data': { '*error': 'str', 'node-name': 'str', + 'data': { 'type': 'QuorumOpType', '*error': 'str', 'node-name': 'str', 'sector-num': 'int', 'sectors-count': 'int' } } ## diff --git a/qemu-char.c b/qemu-char.c index 27fbb440ac..0a14e57839 100644 --- a/qemu-char.c +++ b/qemu-char.c @@ -37,6 +37,7 @@ #include "io/channel-socket.h" #include "io/channel-file.h" #include "io/channel-tls.h" +#include "sysemu/replay.h" #include <zlib.h> @@ -234,10 +235,46 @@ static void qemu_chr_fe_write_log(CharDriverState *s, } } +static int qemu_chr_fe_write_buffer(CharDriverState *s, const uint8_t *buf, int len, int *offset) +{ + int res = 0; + *offset = 0; + + qemu_mutex_lock(&s->chr_write_lock); + while (*offset < len) { + do { + res = s->chr_write(s, buf + *offset, len - *offset); + if (res == -1 && errno == EAGAIN) { + g_usleep(100); + } + } while (res == -1 && errno == EAGAIN); + + if (res <= 0) { + break; + } + + *offset += res; + } + if (*offset > 0) { + qemu_chr_fe_write_log(s, buf, *offset); + } + qemu_mutex_unlock(&s->chr_write_lock); + + return res; +} + int qemu_chr_fe_write(CharDriverState *s, const uint8_t *buf, int len) { int ret; + if (s->replay && replay_mode == REPLAY_MODE_PLAY) { + int offset; + replay_char_write_event_load(&ret, &offset); + assert(offset <= len); + qemu_chr_fe_write_buffer(s, buf, offset, &offset); + return ret; + } + qemu_mutex_lock(&s->chr_write_lock); ret = s->chr_write(s, buf, len); @@ -246,35 +283,32 @@ int qemu_chr_fe_write(CharDriverState *s, const uint8_t *buf, int len) } qemu_mutex_unlock(&s->chr_write_lock); + + if (s->replay && replay_mode == REPLAY_MODE_RECORD) { + replay_char_write_event_save(ret, ret < 0 ? 0 : ret); + } + return ret; } int qemu_chr_fe_write_all(CharDriverState *s, const uint8_t *buf, int len) { - int offset = 0; - int res = 0; + int offset; + int res; - qemu_mutex_lock(&s->chr_write_lock); - while (offset < len) { - do { - res = s->chr_write(s, buf + offset, len - offset); - if (res == -1 && errno == EAGAIN) { - g_usleep(100); - } - } while (res == -1 && errno == EAGAIN); + if (s->replay && replay_mode == REPLAY_MODE_PLAY) { + replay_char_write_event_load(&res, &offset); + assert(offset <= len); + qemu_chr_fe_write_buffer(s, buf, offset, &offset); + return res; + } - if (res <= 0) { - break; - } + res = qemu_chr_fe_write_buffer(s, buf, len, &offset); - offset += res; - } - if (offset > 0) { - qemu_chr_fe_write_log(s, buf, offset); + if (s->replay && replay_mode == REPLAY_MODE_RECORD) { + replay_char_write_event_save(res, offset); } - qemu_mutex_unlock(&s->chr_write_lock); - if (res < 0) { return res; } @@ -289,6 +323,10 @@ int qemu_chr_fe_read_all(CharDriverState *s, uint8_t *buf, int len) if (!s->chr_sync_read) { return 0; } + + if (s->replay && replay_mode == REPLAY_MODE_PLAY) { + return replay_char_read_all_load(buf); + } while (offset < len) { do { @@ -303,6 +341,9 @@ int qemu_chr_fe_read_all(CharDriverState *s, uint8_t *buf, int len) } if (res < 0) { + if (s->replay && replay_mode == REPLAY_MODE_RECORD) { + replay_char_read_all_save_error(res); + } return res; } @@ -313,14 +354,22 @@ int qemu_chr_fe_read_all(CharDriverState *s, uint8_t *buf, int len) } } + if (s->replay && replay_mode == REPLAY_MODE_RECORD) { + replay_char_read_all_save_buf(buf, offset); + } return offset; } int qemu_chr_fe_ioctl(CharDriverState *s, int cmd, void *arg) { - if (!s->chr_ioctl) - return -ENOTSUP; - return s->chr_ioctl(s, cmd, arg); + int res; + if (!s->chr_ioctl || s->replay) { + res = -ENOTSUP; + } else { + res = s->chr_ioctl(s, cmd, arg); + } + + return res; } int qemu_chr_be_can_write(CharDriverState *s) @@ -330,17 +379,35 @@ int qemu_chr_be_can_write(CharDriverState *s) return s->chr_can_read(s->handler_opaque); } -void qemu_chr_be_write(CharDriverState *s, uint8_t *buf, int len) +void qemu_chr_be_write_impl(CharDriverState *s, uint8_t *buf, int len) { if (s->chr_read) { s->chr_read(s->handler_opaque, buf, len); } } +void qemu_chr_be_write(CharDriverState *s, uint8_t *buf, int len) +{ + if (s->replay) { + if (replay_mode == REPLAY_MODE_PLAY) { + return; + } + replay_chr_be_write(s, buf, len); + } else { + qemu_chr_be_write_impl(s, buf, len); + } +} + int qemu_chr_fe_get_msgfd(CharDriverState *s) { int fd; - return (qemu_chr_fe_get_msgfds(s, &fd, 1) == 1) ? fd : -1; + int res = (qemu_chr_fe_get_msgfds(s, &fd, 1) == 1) ? fd : -1; + if (s->replay) { + fprintf(stderr, + "Replay: get msgfd is not supported for serial devices yet\n"); + exit(1); + } + return res; } int qemu_chr_fe_get_msgfds(CharDriverState *s, int *fds, int len) @@ -3097,20 +3164,6 @@ static void tcp_chr_close(CharDriverState *chr) qemu_chr_be_event(chr, CHR_EVENT_CLOSED); } -static void qemu_chr_finish_socket_connection(CharDriverState *chr, - QIOChannelSocket *sioc) -{ - TCPCharDriver *s = chr->opaque; - - if (s->is_listen) { - s->listen_ioc = sioc; - s->listen_tag = qio_channel_add_watch( - QIO_CHANNEL(s->listen_ioc), G_IO_IN, tcp_chr_accept, chr, NULL); - } else { - tcp_chr_new_client(chr, sioc); - object_unref(OBJECT(sioc)); - } -} static void qemu_chr_socket_connected(Object *src, Error *err, void *opaque) { @@ -3125,37 +3178,11 @@ static void qemu_chr_socket_connected(Object *src, Error *err, void *opaque) } s->connect_err_reported = false; - qemu_chr_finish_socket_connection(chr, sioc); -} - -static bool qemu_chr_open_socket_fd(CharDriverState *chr, Error **errp) -{ - TCPCharDriver *s = chr->opaque; - QIOChannelSocket *sioc = qio_channel_socket_new(); - - if (s->is_listen) { - if (qio_channel_socket_listen_sync(sioc, s->addr, errp) < 0) { - goto fail; - } - qemu_chr_finish_socket_connection(chr, sioc); - } else if (s->reconnect_time) { - qio_channel_socket_connect_async(sioc, s->addr, - qemu_chr_socket_connected, - chr, NULL); - } else { - if (qio_channel_socket_connect_sync(sioc, s->addr, errp) < 0) { - goto fail; - } - qemu_chr_finish_socket_connection(chr, sioc); - } - - return true; - - fail: + tcp_chr_new_client(chr, sioc); object_unref(OBJECT(sioc)); - return false; } + /*********************************************************/ /* Ring buffer chardev */ @@ -3861,7 +3888,8 @@ err: return NULL; } -CharDriverState *qemu_chr_new(const char *label, const char *filename, void (*init)(struct CharDriverState *s)) +CharDriverState *qemu_chr_new_noreplay(const char *label, const char *filename, + void (*init)(struct CharDriverState *s)) { const char *p; CharDriverState *chr; @@ -3887,6 +3915,21 @@ CharDriverState *qemu_chr_new(const char *label, const char *filename, void (*in return chr; } +CharDriverState *qemu_chr_new(const char *label, const char *filename, void (*init)(struct CharDriverState *s)) +{ + CharDriverState *chr; + chr = qemu_chr_new_noreplay(label, filename, init); + if (chr) { + chr->replay = replay_mode != REPLAY_MODE_NONE; + if (chr->replay && chr->chr_ioctl) { + fprintf(stderr, + "Replay: ioctl is not supported for serial devices yet\n"); + } + replay_register_char_driver(chr); + } + return chr; +} + void qemu_chr_fe_set_echo(struct CharDriverState *chr, bool echo) { if (chr->chr_set_echo) { @@ -4264,19 +4307,11 @@ static CharDriverState *qmp_chardev_open_parallel(const char *id, #endif /* WIN32 */ -static void socket_try_connect(CharDriverState *chr) -{ - Error *err = NULL; - - if (!qemu_chr_open_socket_fd(chr, &err)) { - check_report_connect_error(chr, err); - } -} - static gboolean socket_reconnect_timeout(gpointer opaque) { CharDriverState *chr = opaque; TCPCharDriver *s = chr->opaque; + QIOChannelSocket *sioc; s->reconnect_timer = 0; @@ -4284,7 +4319,10 @@ static gboolean socket_reconnect_timeout(gpointer opaque) return false; } - socket_try_connect(chr); + sioc = qio_channel_socket_new(); + qio_channel_socket_connect_async(sioc, s->addr, + qemu_chr_socket_connected, + chr, NULL); return false; } @@ -4304,6 +4342,7 @@ static CharDriverState *qmp_chardev_open_socket(const char *id, bool is_waitconnect = sock->has_wait ? sock->wait : false; int64_t reconnect = sock->has_reconnect ? sock->reconnect : 0; ChardevCommon *common = qapi_ChardevSocket_base(sock); + QIOChannelSocket *sioc = NULL; chr = qemu_chr_alloc(common, errp); if (!chr) { @@ -4373,22 +4412,40 @@ static CharDriverState *qmp_chardev_open_socket(const char *id, s->reconnect_time = reconnect; } + sioc = qio_channel_socket_new(); if (s->reconnect_time) { - socket_try_connect(chr); - } else if (!qemu_chr_open_socket_fd(chr, errp)) { - goto error; - } - - if (is_listen && is_waitconnect) { - fprintf(stderr, "QEMU waiting for connection on: %s\n", - chr->filename); - tcp_chr_accept(QIO_CHANNEL(s->listen_ioc), G_IO_IN, chr); + qio_channel_socket_connect_async(sioc, s->addr, + qemu_chr_socket_connected, + chr, NULL); + } else if (s->is_listen) { + if (qio_channel_socket_listen_sync(sioc, s->addr, errp) < 0) { + goto error; + } + s->listen_ioc = sioc; + if (is_waitconnect) { + fprintf(stderr, "QEMU waiting for connection on: %s\n", + chr->filename); + tcp_chr_accept(QIO_CHANNEL(s->listen_ioc), G_IO_IN, chr); + } qio_channel_set_blocking(QIO_CHANNEL(s->listen_ioc), false, NULL); + if (!s->ioc) { + s->listen_tag = qio_channel_add_watch( + QIO_CHANNEL(s->listen_ioc), G_IO_IN, tcp_chr_accept, chr, NULL); + } + } else { + if (qio_channel_socket_connect_sync(sioc, s->addr, errp) < 0) { + goto error; + } + tcp_chr_new_client(chr, sioc); + object_unref(OBJECT(sioc)); } return chr; error: + if (sioc) { + object_unref(OBJECT(sioc)); + } if (s->tls_creds) { object_unref(OBJECT(s->tls_creds)); } @@ -4481,6 +4538,11 @@ void qmp_chardev_remove(const char *id, Error **errp) error_setg(errp, "Chardev '%s' is busy", id); return; } + if (chr->replay) { + error_setg(errp, + "Chardev '%s' cannot be unplugged in record/replay mode", id); + return; + } qemu_chr_delete(chr); } diff --git a/qemu-img.c b/qemu-img.c index 2edb139073..3103150717 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -2775,6 +2775,8 @@ static int img_snapshot(int argc, char **argv) static int img_rebase(int argc, char **argv) { BlockBackend *blk = NULL, *blk_old_backing = NULL, *blk_new_backing = NULL; + uint8_t *buf_old = NULL; + uint8_t *buf_new = NULL; BlockDriverState *bs = NULL; char *filename; const char *fmt, *cache, *src_cache, *out_basefmt, *out_baseimg; @@ -2957,8 +2959,6 @@ static int img_rebase(int argc, char **argv) int64_t new_backing_num_sectors = 0; uint64_t sector; int n; - uint8_t * buf_old; - uint8_t * buf_new; float local_progress = 0; buf_old = blk_blockalign(blk, IO_BUF_SIZE); @@ -3070,9 +3070,6 @@ static int img_rebase(int argc, char **argv) } qemu_progress_print(local_progress, 100); } - - qemu_vfree(buf_old); - qemu_vfree(buf_new); } /* @@ -3108,6 +3105,8 @@ out: blk_unref(blk_old_backing); blk_unref(blk_new_backing); } + qemu_vfree(buf_old); + qemu_vfree(buf_new); blk_unref(blk); if (ret) { diff --git a/qemu-options.hx b/qemu-options.hx index 0cf7bb96c4..732ed8cbc5 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -1560,8 +1560,9 @@ DEF("smb", HAS_ARG, QEMU_OPTION_smb, "", QEMU_ARCH_ALL) DEF("netdev", HAS_ARG, QEMU_OPTION_netdev, #ifdef CONFIG_SLIRP - "-netdev user,id=str[,net=addr[/mask]][,host=addr][,restrict=on|off]\n" - " [,hostname=host][,dhcpstart=addr][,dns=addr][,dnssearch=domain][,tftp=dir]\n" + "-netdev user,id=str[,net=addr[/mask]][,host=addr][,ip6-net=addr[/int]]\n" + " [,ip6-host=addr][,restrict=on|off][,hostname=host][,dhcpstart=addr]\n" + " [,dns=addr][,ip6-dns=addr][,dnssearch=domain][,tftp=dir]\n" " [,bootfile=f][,hostfwd=rule][,guestfwd=rule]" #ifndef _WIN32 "[,smb=dir[,smbserver=addr]]\n" @@ -1718,6 +1719,14 @@ either in the form a.b.c.d or as number of valid top-most bits. Default is Specify the guest-visible address of the host. Default is the 2nd IP in the guest network, i.e. x.x.x.2. +@item ip6-net=@var{addr}[/@var{int}] +Set IPv6 network address the guest will see. Optionally specify the prefix +size, as number of valid top-most bits. Default is fec0::/64. + +@item ip6-host=@var{addr} +Specify the guest-visible IPv6 address of the host. Default is the 2nd IPv6 in +the guest network, i.e. xxxx::2. + @item restrict=on|off If this option is enabled, the guest will be isolated, i.e. it will not be able to contact the host and no guest IP packets will be routed over the host @@ -1735,6 +1744,11 @@ Specify the guest-visible address of the virtual nameserver. The address must be different from the host address. Default is the 3rd IP in the guest network, i.e. x.x.x.3. +@item ip6-dns=@var{addr} +Specify the guest-visible address of the IPv6 virtual nameserver. The address +must be different from the host address. Default is the 3rd IP in the guest +network, i.e. xxxx::3. + @item dnssearch=@var{domain} Provides an entry for the domain-search list sent by the built-in DHCP server. More than one domain suffix can be transmitted by specifying diff --git a/qemu-timer.c b/qemu-timer.c index e98ecc9733..4441fe66ff 100644 --- a/qemu-timer.c +++ b/qemu-timer.c @@ -394,7 +394,9 @@ static bool timer_mod_ns_locked(QEMUTimerList *timer_list, static void timerlist_rearm(QEMUTimerList *timer_list) { /* Interrupt execution to force deadline recalculation. */ - qemu_clock_warp(timer_list->clock->type); + if (timer_list->clock->type == QEMU_CLOCK_VIRTUAL) { + qemu_start_warp_timer(); + } timerlist_notify(timer_list); } diff --git a/qmp-commands.hx b/qmp-commands.hx index b629673459..9e05365ccf 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -3683,7 +3683,7 @@ Enable/Disable migration capabilities - "zero-blocks": compress zero blocks during block migration - "compress": use multiple compression threads to accelerate live migration - "events": generate events for each migration state change -- "x-postcopy-ram": postcopy mode for live migration +- "postcopy-ram": postcopy mode for live migration Arguments: @@ -3713,7 +3713,7 @@ Query current migration capabilities - "zero-blocks" : Zero Blocks state (json-bool) - "compress": Multiple compression threads state (json-bool) - "events": Migration state change event state (json-bool) - - "x-postcopy-ram": postcopy ram state (json-bool) + - "postcopy-ram": postcopy ram state (json-bool) Arguments: @@ -3727,7 +3727,7 @@ Example: {"state": false, "capability": "zero-blocks"}, {"state": false, "capability": "compress"}, {"state": true, "capability": "events"}, - {"state": false, "capability": "x-postcopy-ram"} + {"state": false, "capability": "postcopy-ram"} ]} EQMP diff --git a/replay/Makefile.objs b/replay/Makefile.objs index 232193a24b..fcb3f74d60 100644 --- a/replay/Makefile.objs +++ b/replay/Makefile.objs @@ -3,3 +3,4 @@ common-obj-y += replay-internal.o common-obj-y += replay-events.o common-obj-y += replay-time.o common-obj-y += replay-input.o +common-obj-y += replay-char.o diff --git a/replay/replay-char.c b/replay/replay-char.c new file mode 100755 index 0000000000..23b6922977 --- /dev/null +++ b/replay/replay-char.c @@ -0,0 +1,168 @@ +/* + * replay-char.c + * + * Copyright (c) 2010-2016 Institute for System Programming + * of the Russian Academy of Sciences. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "qemu/osdep.h" +#include "qemu/error-report.h" +#include "sysemu/replay.h" +#include "replay-internal.h" +#include "sysemu/sysemu.h" +#include "sysemu/char.h" + +/* Char drivers that generate qemu_chr_be_write events + that should be saved into the log. */ +static CharDriverState **char_drivers; +static int drivers_count; + +/* Char event attributes. */ +typedef struct CharEvent { + int id; + uint8_t *buf; + size_t len; +} CharEvent; + +static int find_char_driver(CharDriverState *chr) +{ + int i = 0; + for ( ; i < drivers_count ; ++i) { + if (char_drivers[i] == chr) { + return i; + } + } + return -1; +} + +void replay_register_char_driver(CharDriverState *chr) +{ + if (replay_mode == REPLAY_MODE_NONE) { + return; + } + char_drivers = g_realloc(char_drivers, + sizeof(*char_drivers) * (drivers_count + 1)); + char_drivers[drivers_count++] = chr; +} + +void replay_chr_be_write(CharDriverState *s, uint8_t *buf, int len) +{ + CharEvent *event = g_malloc0(sizeof(CharEvent)); + + event->id = find_char_driver(s); + if (event->id < 0) { + fprintf(stderr, "Replay: cannot find char driver\n"); + exit(1); + } + event->buf = g_malloc(len); + memcpy(event->buf, buf, len); + event->len = len; + + replay_add_event(REPLAY_ASYNC_EVENT_CHAR_READ, event, NULL, 0); +} + +void replay_event_char_read_run(void *opaque) +{ + CharEvent *event = (CharEvent *)opaque; + + qemu_chr_be_write_impl(char_drivers[event->id], event->buf, + (int)event->len); + + g_free(event->buf); + g_free(event); +} + +void replay_event_char_read_save(void *opaque) +{ + CharEvent *event = (CharEvent *)opaque; + + replay_put_byte(event->id); + replay_put_array(event->buf, event->len); +} + +void *replay_event_char_read_load(void) +{ + CharEvent *event = g_malloc0(sizeof(CharEvent)); + + event->id = replay_get_byte(); + replay_get_array_alloc(&event->buf, &event->len); + + return event; +} + +void replay_char_write_event_save(int res, int offset) +{ + replay_save_instructions(); + replay_mutex_lock(); + replay_put_event(EVENT_CHAR_WRITE); + replay_put_dword(res); + replay_put_dword(offset); + replay_mutex_unlock(); +} + +void replay_char_write_event_load(int *res, int *offset) +{ + replay_account_executed_instructions(); + replay_mutex_lock(); + if (replay_next_event_is(EVENT_CHAR_WRITE)) { + *res = replay_get_dword(); + *offset = replay_get_dword(); + replay_finish_event(); + replay_mutex_unlock(); + } else { + replay_mutex_unlock(); + error_report("Missing character write event in the replay log"); + exit(1); + } +} + +int replay_char_read_all_load(uint8_t *buf) +{ + replay_mutex_lock(); + if (replay_next_event_is(EVENT_CHAR_READ_ALL)) { + size_t size; + int res; + replay_get_array(buf, &size); + replay_finish_event(); + replay_mutex_unlock(); + res = (int)size; + assert(res >= 0); + return res; + } else if (replay_next_event_is(EVENT_CHAR_READ_ALL_ERROR)) { + int res = replay_get_dword(); + replay_finish_event(); + replay_mutex_unlock(); + return res; + } else { + replay_mutex_unlock(); + error_report("Missing character read all event in the replay log"); + exit(1); + } +} + +void replay_char_read_all_save_error(int res) +{ + assert(res < 0); + replay_save_instructions(); + replay_mutex_lock(); + replay_put_event(EVENT_CHAR_READ_ALL_ERROR); + replay_put_dword(res); + replay_mutex_unlock(); +} + +void replay_char_read_all_save_buf(uint8_t *buf, int offset) +{ + replay_save_instructions(); + replay_mutex_lock(); + replay_put_event(EVENT_CHAR_READ_ALL); + replay_put_array(buf, offset); + replay_mutex_unlock(); +} diff --git a/replay/replay-events.c b/replay/replay-events.c index 2628109ed8..ca940f70e7 100644 --- a/replay/replay-events.c +++ b/replay/replay-events.c @@ -48,6 +48,9 @@ static void replay_run_event(Event *event) case REPLAY_ASYNC_EVENT_INPUT_SYNC: qemu_input_event_sync_impl(); break; + case REPLAY_ASYNC_EVENT_CHAR_READ: + replay_event_char_read_run(event->opaque); + break; default: error_report("Replay: invalid async event ID (%d) in the queue", event->event_kind); @@ -102,9 +105,9 @@ void replay_clear_events(void) } /*! Adds specified async event to the queue */ -static void replay_add_event(ReplayAsyncEventKind event_kind, - void *opaque, - void *opaque2, uint64_t id) +void replay_add_event(ReplayAsyncEventKind event_kind, + void *opaque, + void *opaque2, uint64_t id) { assert(event_kind < REPLAY_ASYNC_COUNT); @@ -168,6 +171,9 @@ static void replay_save_event(Event *event, int checkpoint) break; case REPLAY_ASYNC_EVENT_INPUT_SYNC: break; + case REPLAY_ASYNC_EVENT_CHAR_READ: + replay_event_char_read_save(event->opaque); + break; default: error_report("Unknown ID %d of replay event", read_event_kind); exit(1); @@ -221,6 +227,11 @@ static Event *replay_read_event(int checkpoint) event->event_kind = read_event_kind; event->opaque = 0; return event; + case REPLAY_ASYNC_EVENT_CHAR_READ: + event = g_malloc0(sizeof(Event)); + event->event_kind = read_event_kind; + event->opaque = replay_event_char_read_load(); + return event; default: error_report("Unknown ID %d of replay event", read_event_kind); exit(1); diff --git a/replay/replay-internal.h b/replay/replay-internal.h index 5438ebdb9c..11f9a85f3e 100644 --- a/replay/replay-internal.h +++ b/replay/replay-internal.h @@ -24,6 +24,11 @@ enum ReplayEvents { EVENT_ASYNC, /* for shutdown request */ EVENT_SHUTDOWN, + /* for character device write event */ + EVENT_CHAR_WRITE, + /* for character device read all event */ + EVENT_CHAR_READ_ALL, + EVENT_CHAR_READ_ALL_ERROR, /* for clock read/writes */ /* some of greater codes are reserved for clocks */ EVENT_CLOCK, @@ -43,6 +48,7 @@ enum ReplayAsyncEventKind { REPLAY_ASYNC_EVENT_BH, REPLAY_ASYNC_EVENT_INPUT, REPLAY_ASYNC_EVENT_INPUT_SYNC, + REPLAY_ASYNC_EVENT_CHAR_READ, REPLAY_ASYNC_COUNT }; @@ -124,6 +130,9 @@ bool replay_has_events(void); void replay_save_events(int checkpoint); /*! Read events from the file into the input queue */ void replay_read_events(int checkpoint); +/*! Adds specified async event to the queue */ +void replay_add_event(ReplayAsyncEventKind event_kind, void *opaque, + void *opaque2, uint64_t id); /* Input events */ @@ -136,4 +145,13 @@ void replay_add_input_event(struct InputEvent *event); /*! Adds input sync event to the queue */ void replay_add_input_sync_event(void); +/* Character devices */ + +/*! Called to run char device read event. */ +void replay_event_char_read_run(void *opaque); +/*! Writes char read event to the file. */ +void replay_event_char_read_save(void *opaque); +/*! Reads char event read from the file. */ +void *replay_event_char_read_load(void); + #endif diff --git a/replay/replay.c b/replay/replay.c index f8739c26c8..fcfde4fc93 100644 --- a/replay/replay.c +++ b/replay/replay.c @@ -20,7 +20,7 @@ /* Current version of the replay mechanism. Increase it when file format changes. */ -#define REPLAY_VERSION 0xe02002 +#define REPLAY_VERSION 0xe02003 /* Size of replay log header */ #define HEADER_SIZE (sizeof(uint32_t) + sizeof(uint64_t)) diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh index ff5b0c7033..6aa8407f12 100755 --- a/scripts/update-linux-headers.sh +++ b/scripts/update-linux-headers.sh @@ -103,7 +103,7 @@ done rm -rf "$output/linux-headers/linux" mkdir -p "$output/linux-headers/linux" for header in kvm.h kvm_para.h vfio.h vhost.h \ - psci.h; do + psci.h userfaultfd.h; do cp "$tmpdir/include/linux/$header" "$output/linux-headers/linux" done rm -rf "$output/linux-headers/asm-generic" diff --git a/slirp/Makefile.objs b/slirp/Makefile.objs index 2daa9dc58d..6748e4f60a 100644 --- a/slirp/Makefile.objs +++ b/slirp/Makefile.objs @@ -1,3 +1,5 @@ -common-obj-y = cksum.o if.o ip_icmp.o ip_input.o ip_output.o dnssearch.o +common-obj-y = cksum.o if.o ip_icmp.o ip6_icmp.o ip6_input.o ip6_output.o \ + ip_input.o ip_output.o dnssearch.o common-obj-y += slirp.o mbuf.o misc.o sbuf.o socket.o tcp_input.o tcp_output.o -common-obj-y += tcp_subr.o tcp_timer.o udp.o bootp.o tftp.o arp_table.o +common-obj-y += tcp_subr.o tcp_timer.o udp.o udp6.o bootp.o tftp.o arp_table.o \ + ndp_table.o diff --git a/slirp/cksum.c b/slirp/cksum.c index bc0d017d24..2ad0e6540d 100644 --- a/slirp/cksum.c +++ b/slirp/cksum.c @@ -138,3 +138,28 @@ cont: REDUCE; return (~sum & 0xffff); } + +int ip6_cksum(struct mbuf *m) +{ + /* TODO: Optimize this by being able to pass the ip6_pseudohdr to cksum + * separately from the mbuf */ + struct ip6 save_ip, *ip = mtod(m, struct ip6 *); + struct ip6_pseudohdr *ih = mtod(m, struct ip6_pseudohdr *); + int sum; + + save_ip = *ip; + + ih->ih_src = save_ip.ip_src; + ih->ih_dst = save_ip.ip_dst; + ih->ih_pl = htonl((uint32_t)ntohs(save_ip.ip_pl)); + ih->ih_zero_hi = 0; + ih->ih_zero_lo = 0; + ih->ih_nh = save_ip.ip_nh; + + sum = cksum(m, ((int)sizeof(struct ip6_pseudohdr)) + + ntohl(ih->ih_pl)); + + *ip = save_ip; + + return sum; +} diff --git a/slirp/if.c b/slirp/if.c index 93d7cc0b43..2e21f438e8 100644 --- a/slirp/if.c +++ b/slirp/if.c @@ -194,7 +194,7 @@ void if_start(Slirp *slirp) /* Try to send packet unless it already expired */ if (ifm->expiration_date >= now && !if_encap(slirp, ifm)) { - /* Packet is delayed due to pending ARP resolution */ + /* Packet is delayed due to pending ARP or NDP resolution */ continue; } diff --git a/slirp/if.h b/slirp/if.h index 33270239fd..c7a5c5724d 100644 --- a/slirp/if.h +++ b/slirp/if.h @@ -17,7 +17,7 @@ #define IF_MRU 1500 #define IF_COMP IF_AUTOCOMP /* Flags for compression */ -/* 2 for alignment, 14 for ethernet, 40 for TCP/IP */ -#define IF_MAXLINKHDR (2 + 14 + 40) +/* 2 for alignment, 14 for ethernet */ +#define IF_MAXLINKHDR (2 + ETH_HLEN) #endif diff --git a/slirp/ip6.h b/slirp/ip6.h new file mode 100644 index 0000000000..8ddfa242c4 --- /dev/null +++ b/slirp/ip6.h @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2013 + * Guillaume Subiron, Yann Bordenave, Serigne Modou Wagne. + */ + +#ifndef SLIRP_IP6_H_ +#define SLIRP_IP6_H_ + +#include "net/eth.h" + +#define ALLNODES_MULTICAST { .s6_addr = \ + { 0xff, 0x02, 0x00, 0x00,\ + 0x00, 0x00, 0x00, 0x00,\ + 0x00, 0x00, 0x00, 0x00,\ + 0x00, 0x00, 0x00, 0x01 } } + +#define SOLICITED_NODE_PREFIX { .s6_addr = \ + { 0xff, 0x02, 0x00, 0x00,\ + 0x00, 0x00, 0x00, 0x00,\ + 0x00, 0x00, 0x00, 0x01,\ + 0xff, 0x00, 0x00, 0x00 } } + +#define LINKLOCAL_ADDR { .s6_addr = \ + { 0xfe, 0x80, 0x00, 0x00,\ + 0x00, 0x00, 0x00, 0x00,\ + 0x00, 0x00, 0x00, 0x00,\ + 0x00, 0x00, 0x00, 0x02 } } + +static inline bool in6_equal(const struct in6_addr *a, const struct in6_addr *b) +{ + return memcmp(a, b, sizeof(*a)) == 0; +} + +static inline bool in6_equal_net(const struct in6_addr *a, + const struct in6_addr *b, + int prefix_len) +{ + if (memcmp(a, b, prefix_len / 8) != 0) { + return 0; + } + + if (prefix_len % 8 == 0) { + return 1; + } + + return a->s6_addr[prefix_len / 8] >> (8 - (prefix_len % 8)) + == b->s6_addr[prefix_len / 8] >> (8 - (prefix_len % 8)); +} + +static inline bool in6_equal_mach(const struct in6_addr *a, + const struct in6_addr *b, + int prefix_len) +{ + if (memcmp(&(a->s6_addr[(prefix_len + 7) / 8]), + &(b->s6_addr[(prefix_len + 7) / 8]), + 16 - (prefix_len + 7) / 8) != 0) { + return 0; + } + + if (prefix_len % 8 == 0) { + return 1; + } + + return (a->s6_addr[prefix_len / 8] & ((1U << (8 - (prefix_len % 8))) - 1)) + == (b->s6_addr[prefix_len / 8] & ((1U << (8 - (prefix_len % 8))) - 1)); +} + + +#define in6_equal_router(a)\ + ((in6_equal_net(a, &slirp->vprefix_addr6, slirp->vprefix_len)\ + && in6_equal_mach(a, &slirp->vhost_addr6, slirp->vprefix_len))\ + || (in6_equal_net(a, &(struct in6_addr)LINKLOCAL_ADDR, 64)\ + && in6_equal_mach(a, &slirp->vhost_addr6, 64))) + +#define in6_equal_dns(a)\ + ((in6_equal_net(a, &slirp->vprefix_addr6, slirp->vprefix_len)\ + && in6_equal_mach(a, &slirp->vnameserver_addr6, slirp->vprefix_len))\ + || (in6_equal_net(a, &(struct in6_addr)LINKLOCAL_ADDR, 64)\ + && in6_equal_mach(a, &slirp->vnameserver_addr6, 64))) + +#define in6_equal_host(a)\ + (in6_equal_router(a) || in6_equal_dns(a)) + +#define in6_solicitednode_multicast(a)\ + (in6_equal_net(a, &(struct in6_addr)SOLICITED_NODE_PREFIX, 104)) + +/* Compute emulated host MAC address from its ipv6 address */ +static inline void in6_compute_ethaddr(struct in6_addr ip, + uint8_t eth[ETH_ALEN]) +{ + eth[0] = 0x52; + eth[1] = 0x56; + memcpy(ð[2], &ip.s6_addr[16 - (ETH_ALEN - 2)], ETH_ALEN - 2); +} + +/* + * Definitions for internet protocol version 6. + * Per RFC 2460, December 1998. + */ +#define IP6VERSION 6 +#define IP6_HOP_LIMIT 255 + +/* + * Structure of an internet header, naked of options. + */ +struct ip6 { +#ifdef HOST_WORDS_BIGENDIAN + uint32_t + ip_v:4, /* version */ + ip_tc_hi:4, /* traffic class */ + ip_tc_lo:4, + ip_fl_hi:4, /* flow label */ + ip_fl_lo:16; +#else + uint32_t + ip_tc_hi:4, + ip_v:4, + ip_fl_hi:4, + ip_tc_lo:4, + ip_fl_lo:16; +#endif + uint16_t ip_pl; /* payload length */ + uint8_t ip_nh; /* next header */ + uint8_t ip_hl; /* hop limit */ + struct in6_addr ip_src, ip_dst; /* source and dest address */ +} QEMU_PACKED; + +/* + * IPv6 pseudo-header used by upper-layer protocols + */ +struct ip6_pseudohdr { + struct in6_addr ih_src; /* source internet address */ + struct in6_addr ih_dst; /* destination internet address */ + uint32_t ih_pl; /* upper-layer packet length */ + uint16_t ih_zero_hi; /* zero */ + uint8_t ih_zero_lo; /* zero */ + uint8_t ih_nh; /* next header */ +} QEMU_PACKED; + + +#endif diff --git a/slirp/ip6_icmp.c b/slirp/ip6_icmp.c new file mode 100644 index 0000000000..9d61349c7d --- /dev/null +++ b/slirp/ip6_icmp.c @@ -0,0 +1,416 @@ +/* + * Copyright (c) 2013 + * Guillaume Subiron, Yann Bordenave, Serigne Modou Wagne. + */ + +#include "qemu/osdep.h" +#include "slirp.h" +#include "ip6_icmp.h" +#include "qemu/timer.h" +#include "qemu/error-report.h" +#include "qemu/log.h" +#include <time.h> + +#define NDP_Interval g_rand_int_range(slirp->grand, \ + NDP_MinRtrAdvInterval, NDP_MaxRtrAdvInterval) + +static void ra_timer_handler(void *opaque) +{ + Slirp *slirp = opaque; + timer_mod(slirp->ra_timer, + qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + NDP_Interval); + ndp_send_ra(slirp); +} + +void icmp6_init(Slirp *slirp) +{ + slirp->ra_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL, ra_timer_handler, slirp); + timer_mod(slirp->ra_timer, + qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + NDP_Interval); +} + +void icmp6_cleanup(Slirp *slirp) +{ + timer_del(slirp->ra_timer); + timer_free(slirp->ra_timer); +} + +static void icmp6_send_echoreply(struct mbuf *m, Slirp *slirp, struct ip6 *ip, + struct icmp6 *icmp) +{ + struct mbuf *t = m_get(slirp); + t->m_len = sizeof(struct ip6) + ntohs(ip->ip_pl); + memcpy(t->m_data, m->m_data, t->m_len); + + /* IPv6 Packet */ + struct ip6 *rip = mtod(t, struct ip6 *); + rip->ip_dst = ip->ip_src; + rip->ip_src = ip->ip_dst; + + /* ICMPv6 packet */ + t->m_data += sizeof(struct ip6); + struct icmp6 *ricmp = mtod(t, struct icmp6 *); + ricmp->icmp6_type = ICMP6_ECHO_REPLY; + ricmp->icmp6_cksum = 0; + + /* Checksum */ + t->m_data -= sizeof(struct ip6); + ricmp->icmp6_cksum = ip6_cksum(t); + + ip6_output(NULL, t, 0); +} + +void icmp6_send_error(struct mbuf *m, uint8_t type, uint8_t code) +{ + Slirp *slirp = m->slirp; + struct mbuf *t; + struct ip6 *ip = mtod(m, struct ip6 *); + + DEBUG_CALL("icmp6_send_error"); + DEBUG_ARGS((dfd, " type = %d, code = %d\n", type, code)); + + if (IN6_IS_ADDR_MULTICAST(&ip->ip_src) || + IN6_IS_ADDR_UNSPECIFIED(&ip->ip_src)) { + /* TODO icmp error? */ + return; + } + + t = m_get(slirp); + + /* IPv6 packet */ + struct ip6 *rip = mtod(t, struct ip6 *); + rip->ip_src = (struct in6_addr)LINKLOCAL_ADDR; + rip->ip_dst = ip->ip_src; +#if !defined(_WIN32) || (_WIN32_WINNT >= 0x0600) + char addrstr[INET6_ADDRSTRLEN]; + inet_ntop(AF_INET6, &rip->ip_dst, addrstr, INET6_ADDRSTRLEN); + DEBUG_ARG("target = %s", addrstr); +#endif + + rip->ip_nh = IPPROTO_ICMPV6; + const int error_data_len = min(m->m_len, + IF_MTU - (sizeof(struct ip6) + ICMP6_ERROR_MINLEN)); + rip->ip_pl = htons(ICMP6_ERROR_MINLEN + error_data_len); + t->m_len = sizeof(struct ip6) + ntohs(rip->ip_pl); + + /* ICMPv6 packet */ + t->m_data += sizeof(struct ip6); + struct icmp6 *ricmp = mtod(t, struct icmp6 *); + ricmp->icmp6_type = type; + ricmp->icmp6_code = code; + ricmp->icmp6_cksum = 0; + + switch (type) { + case ICMP6_UNREACH: + case ICMP6_TIMXCEED: + ricmp->icmp6_err.unused = 0; + break; + case ICMP6_TOOBIG: + ricmp->icmp6_err.mtu = htonl(IF_MTU); + break; + case ICMP6_PARAMPROB: + /* TODO: Handle this case */ + break; + default: + g_assert_not_reached(); + break; + } + t->m_data += ICMP6_ERROR_MINLEN; + memcpy(t->m_data, m->m_data, error_data_len); + + /* Checksum */ + t->m_data -= ICMP6_ERROR_MINLEN; + t->m_data -= sizeof(struct ip6); + ricmp->icmp6_cksum = ip6_cksum(t); + + ip6_output(NULL, t, 0); +} + +/* + * Send NDP Router Advertisement + */ +void ndp_send_ra(Slirp *slirp) +{ + DEBUG_CALL("ndp_send_ra"); + + /* Build IPv6 packet */ + struct mbuf *t = m_get(slirp); + struct ip6 *rip = mtod(t, struct ip6 *); + rip->ip_src = (struct in6_addr)LINKLOCAL_ADDR; + rip->ip_dst = (struct in6_addr)ALLNODES_MULTICAST; + rip->ip_nh = IPPROTO_ICMPV6; + rip->ip_pl = htons(ICMP6_NDP_RA_MINLEN + + NDPOPT_LINKLAYER_LEN + + NDPOPT_PREFIXINFO_LEN); + t->m_len = sizeof(struct ip6) + ntohs(rip->ip_pl); + + /* Build ICMPv6 packet */ + t->m_data += sizeof(struct ip6); + struct icmp6 *ricmp = mtod(t, struct icmp6 *); + ricmp->icmp6_type = ICMP6_NDP_RA; + ricmp->icmp6_code = 0; + ricmp->icmp6_cksum = 0; + + /* NDP */ + ricmp->icmp6_nra.chl = NDP_AdvCurHopLimit; + ricmp->icmp6_nra.M = NDP_AdvManagedFlag; + ricmp->icmp6_nra.O = NDP_AdvOtherConfigFlag; + ricmp->icmp6_nra.reserved = 0; + ricmp->icmp6_nra.lifetime = htons(NDP_AdvDefaultLifetime); + ricmp->icmp6_nra.reach_time = htonl(NDP_AdvReachableTime); + ricmp->icmp6_nra.retrans_time = htonl(NDP_AdvRetransTime); + + /* Source link-layer address (NDP option) */ + t->m_data += ICMP6_NDP_RA_MINLEN; + struct ndpopt *opt = mtod(t, struct ndpopt *); + opt->ndpopt_type = NDPOPT_LINKLAYER_SOURCE; + opt->ndpopt_len = NDPOPT_LINKLAYER_LEN / 8; + in6_compute_ethaddr(rip->ip_src, opt->ndpopt_linklayer); + + /* Prefix information (NDP option) */ + t->m_data += NDPOPT_LINKLAYER_LEN; + struct ndpopt *opt2 = mtod(t, struct ndpopt *); + opt2->ndpopt_type = NDPOPT_PREFIX_INFO; + opt2->ndpopt_len = NDPOPT_PREFIXINFO_LEN / 8; + opt2->ndpopt_prefixinfo.prefix_length = slirp->vprefix_len; + opt2->ndpopt_prefixinfo.L = 1; + opt2->ndpopt_prefixinfo.A = 1; + opt2->ndpopt_prefixinfo.reserved1 = 0; + opt2->ndpopt_prefixinfo.valid_lt = htonl(NDP_AdvValidLifetime); + opt2->ndpopt_prefixinfo.pref_lt = htonl(NDP_AdvPrefLifetime); + opt2->ndpopt_prefixinfo.reserved2 = 0; + opt2->ndpopt_prefixinfo.prefix = slirp->vprefix_addr6; + + /* ICMPv6 Checksum */ + t->m_data -= NDPOPT_LINKLAYER_LEN; + t->m_data -= ICMP6_NDP_RA_MINLEN; + t->m_data -= sizeof(struct ip6); + ricmp->icmp6_cksum = ip6_cksum(t); + + ip6_output(NULL, t, 0); +} + +/* + * Send NDP Neighbor Solitication + */ +void ndp_send_ns(Slirp *slirp, struct in6_addr addr) +{ + DEBUG_CALL("ndp_send_ns"); +#if !defined(_WIN32) || (_WIN32_WINNT >= 0x0600) + char addrstr[INET6_ADDRSTRLEN]; + inet_ntop(AF_INET6, &addr, addrstr, INET6_ADDRSTRLEN); + DEBUG_ARG("target = %s", addrstr); +#endif + + /* Build IPv6 packet */ + struct mbuf *t = m_get(slirp); + struct ip6 *rip = mtod(t, struct ip6 *); + rip->ip_src = slirp->vhost_addr6; + rip->ip_dst = (struct in6_addr)SOLICITED_NODE_PREFIX; + memcpy(&rip->ip_dst.s6_addr[13], &addr.s6_addr[13], 3); + rip->ip_nh = IPPROTO_ICMPV6; + rip->ip_pl = htons(ICMP6_NDP_NS_MINLEN + NDPOPT_LINKLAYER_LEN); + t->m_len = sizeof(struct ip6) + ntohs(rip->ip_pl); + + /* Build ICMPv6 packet */ + t->m_data += sizeof(struct ip6); + struct icmp6 *ricmp = mtod(t, struct icmp6 *); + ricmp->icmp6_type = ICMP6_NDP_NS; + ricmp->icmp6_code = 0; + ricmp->icmp6_cksum = 0; + + /* NDP */ + ricmp->icmp6_nns.reserved = 0; + ricmp->icmp6_nns.target = addr; + + /* Build NDP option */ + t->m_data += ICMP6_NDP_NS_MINLEN; + struct ndpopt *opt = mtod(t, struct ndpopt *); + opt->ndpopt_type = NDPOPT_LINKLAYER_SOURCE; + opt->ndpopt_len = NDPOPT_LINKLAYER_LEN / 8; + in6_compute_ethaddr(slirp->vhost_addr6, opt->ndpopt_linklayer); + + /* ICMPv6 Checksum */ + t->m_data -= ICMP6_NDP_NA_MINLEN; + t->m_data -= sizeof(struct ip6); + ricmp->icmp6_cksum = ip6_cksum(t); + + ip6_output(NULL, t, 1); +} + +/* + * Send NDP Neighbor Advertisement + */ +static void ndp_send_na(Slirp *slirp, struct ip6 *ip, struct icmp6 *icmp) +{ + /* Build IPv6 packet */ + struct mbuf *t = m_get(slirp); + struct ip6 *rip = mtod(t, struct ip6 *); + rip->ip_src = icmp->icmp6_nns.target; + if (IN6_IS_ADDR_UNSPECIFIED(&ip->ip_src)) { + rip->ip_dst = (struct in6_addr)ALLNODES_MULTICAST; + } else { + rip->ip_dst = ip->ip_src; + } + rip->ip_nh = IPPROTO_ICMPV6; + rip->ip_pl = htons(ICMP6_NDP_NA_MINLEN + + NDPOPT_LINKLAYER_LEN); + t->m_len = sizeof(struct ip6) + ntohs(rip->ip_pl); + + /* Build ICMPv6 packet */ + t->m_data += sizeof(struct ip6); + struct icmp6 *ricmp = mtod(t, struct icmp6 *); + ricmp->icmp6_type = ICMP6_NDP_NA; + ricmp->icmp6_code = 0; + ricmp->icmp6_cksum = 0; + + /* NDP */ + ricmp->icmp6_nna.R = NDP_IsRouter; + ricmp->icmp6_nna.S = !IN6_IS_ADDR_MULTICAST(&rip->ip_dst); + ricmp->icmp6_nna.O = 1; + ricmp->icmp6_nna.reserved_hi = 0; + ricmp->icmp6_nna.reserved_lo = 0; + ricmp->icmp6_nna.target = icmp->icmp6_nns.target; + + /* Build NDP option */ + t->m_data += ICMP6_NDP_NA_MINLEN; + struct ndpopt *opt = mtod(t, struct ndpopt *); + opt->ndpopt_type = NDPOPT_LINKLAYER_TARGET; + opt->ndpopt_len = NDPOPT_LINKLAYER_LEN / 8; + in6_compute_ethaddr(ricmp->icmp6_nna.target, + opt->ndpopt_linklayer); + + /* ICMPv6 Checksum */ + t->m_data -= ICMP6_NDP_NA_MINLEN; + t->m_data -= sizeof(struct ip6); + ricmp->icmp6_cksum = ip6_cksum(t); + + ip6_output(NULL, t, 0); +} + +/* + * Process a NDP message + */ +static void ndp_input(struct mbuf *m, Slirp *slirp, struct ip6 *ip, + struct icmp6 *icmp) +{ + m->m_len += ETH_HLEN; + m->m_data -= ETH_HLEN; + struct ethhdr *eth = mtod(m, struct ethhdr *); + m->m_len -= ETH_HLEN; + m->m_data += ETH_HLEN; + + switch (icmp->icmp6_type) { + case ICMP6_NDP_RS: + DEBUG_CALL(" type = Router Solicitation"); + if (ip->ip_hl == 255 + && icmp->icmp6_code == 0 + && ntohs(ip->ip_pl) >= ICMP6_NDP_RS_MINLEN) { + /* Gratuitous NDP */ + ndp_table_add(slirp, ip->ip_src, eth->h_source); + + ndp_send_ra(slirp); + } + break; + + case ICMP6_NDP_RA: + DEBUG_CALL(" type = Router Advertisement"); + qemu_log_mask(LOG_GUEST_ERROR, + "Warning: guest sent NDP RA, but shouldn't"); + break; + + case ICMP6_NDP_NS: + DEBUG_CALL(" type = Neighbor Solicitation"); + if (ip->ip_hl == 255 + && icmp->icmp6_code == 0 + && !IN6_IS_ADDR_MULTICAST(&icmp->icmp6_nns.target) + && ntohs(ip->ip_pl) >= ICMP6_NDP_NS_MINLEN + && (!IN6_IS_ADDR_UNSPECIFIED(&ip->ip_src) + || in6_solicitednode_multicast(&ip->ip_dst))) { + if (in6_equal_host(&icmp->icmp6_nns.target)) { + /* Gratuitous NDP */ + ndp_table_add(slirp, ip->ip_src, eth->h_source); + ndp_send_na(slirp, ip, icmp); + } + } + break; + + case ICMP6_NDP_NA: + DEBUG_CALL(" type = Neighbor Advertisement"); + if (ip->ip_hl == 255 + && icmp->icmp6_code == 0 + && ntohs(ip->ip_pl) >= ICMP6_NDP_NA_MINLEN + && !IN6_IS_ADDR_MULTICAST(&icmp->icmp6_nna.target) + && (!IN6_IS_ADDR_MULTICAST(&ip->ip_dst) + || icmp->icmp6_nna.S == 0)) { + ndp_table_add(slirp, ip->ip_src, eth->h_source); + } + break; + + case ICMP6_NDP_REDIRECT: + DEBUG_CALL(" type = Redirect"); + qemu_log_mask(LOG_GUEST_ERROR, + "Warning: guest sent NDP REDIRECT, but shouldn't"); + break; + } +} + +/* + * Process a received ICMPv6 message. + */ +void icmp6_input(struct mbuf *m) +{ + struct icmp6 *icmp; + struct ip6 *ip = mtod(m, struct ip6 *); + Slirp *slirp = m->slirp; + int hlen = sizeof(struct ip6); + + DEBUG_CALL("icmp6_input"); + DEBUG_ARG("m = %lx", (long) m); + DEBUG_ARG("m_len = %d", m->m_len); + + if (ntohs(ip->ip_pl) < ICMP6_MINLEN) { + goto end; + } + + if (ip6_cksum(m)) { + goto end; + } + + m->m_len -= hlen; + m->m_data += hlen; + icmp = mtod(m, struct icmp6 *); + m->m_len += hlen; + m->m_data -= hlen; + + DEBUG_ARG("icmp6_type = %d", icmp->icmp6_type); + switch (icmp->icmp6_type) { + case ICMP6_ECHO_REQUEST: + if (in6_equal_host(&ip->ip_dst)) { + icmp6_send_echoreply(m, slirp, ip, icmp); + } else { + /* TODO */ + error_report("external icmpv6 not supported yet"); + } + break; + + case ICMP6_NDP_RS: + case ICMP6_NDP_RA: + case ICMP6_NDP_NS: + case ICMP6_NDP_NA: + case ICMP6_NDP_REDIRECT: + ndp_input(m, slirp, ip, icmp); + break; + + case ICMP6_UNREACH: + case ICMP6_TOOBIG: + case ICMP6_TIMXCEED: + case ICMP6_PARAMPROB: + /* XXX? report error? close socket? */ + default: + break; + } + +end: + m_free(m); +} diff --git a/slirp/ip6_icmp.h b/slirp/ip6_icmp.h new file mode 100644 index 0000000000..9460bf837a --- /dev/null +++ b/slirp/ip6_icmp.h @@ -0,0 +1,213 @@ +/* + * Copyright (c) 2013 + * Guillaume Subiron, Yann Bordenave, Serigne Modou Wagne. + */ + +#ifndef SLIRP_NETINET_ICMP6_H_ +#define SLIRP_NETINET_ICMP6_H_ + +/* + * Interface Control Message Protocol version 6 Definitions. + * Per RFC 4443, March 2006. + * + * Network Discover Protocol Definitions. + * Per RFC 4861, September 2007. + */ + +struct icmp6_echo { /* Echo Messages */ + uint16_t id; + uint16_t seq_num; +}; + +union icmp6_error_body { + uint32_t unused; + uint32_t pointer; + uint32_t mtu; +}; + +/* + * NDP Messages + */ +struct ndp_rs { /* Router Solicitation Message */ + uint32_t reserved; +}; + +struct ndp_ra { /* Router Advertisement Message */ + uint8_t chl; /* Cur Hop Limit */ +#ifdef HOST_WORDS_BIGENDIAN + uint8_t + M:1, + O:1, + reserved:6; +#else + uint8_t + reserved:6, + O:1, + M:1; +#endif + uint16_t lifetime; /* Router Lifetime */ + uint32_t reach_time; /* Reachable Time */ + uint32_t retrans_time; /* Retrans Timer */ +} QEMU_PACKED; + +struct ndp_ns { /* Neighbor Solicitation Message */ + uint32_t reserved; + struct in6_addr target; /* Target Address */ +} QEMU_PACKED; + +struct ndp_na { /* Neighbor Advertisement Message */ +#ifdef HOST_WORDS_BIGENDIAN + uint32_t + R:1, /* Router Flag */ + S:1, /* Solicited Flag */ + O:1, /* Override Flag */ + reserved_hi:5, + reserved_lo:24; +#else + uint32_t + reserved_hi:5, + O:1, + S:1, + R:1, + reserved_lo:24; +#endif + struct in6_addr target; /* Target Address */ +} QEMU_PACKED; + +struct ndp_redirect { + uint32_t reserved; + struct in6_addr target; /* Target Address */ + struct in6_addr dest; /* Destination Address */ +} QEMU_PACKED; + +/* + * Structure of an icmpv6 header. + */ +struct icmp6 { + uint8_t icmp6_type; /* type of message, see below */ + uint8_t icmp6_code; /* type sub code */ + uint16_t icmp6_cksum; /* ones complement cksum of struct */ + union { + union icmp6_error_body error_body; + struct icmp6_echo echo; + struct ndp_rs ndp_rs; + struct ndp_ra ndp_ra; + struct ndp_ns ndp_ns; + struct ndp_na ndp_na; + struct ndp_redirect ndp_redirect; + } icmp6_body; +#define icmp6_err icmp6_body.error_body +#define icmp6_echo icmp6_body.echo +#define icmp6_nrs icmp6_body.ndp_rs +#define icmp6_nra icmp6_body.ndp_ra +#define icmp6_nns icmp6_body.ndp_ns +#define icmp6_nna icmp6_body.ndp_na +#define icmp6_redirect icmp6_body.ndp_redirect +} QEMU_PACKED; + +#define ICMP6_MINLEN 4 +#define ICMP6_ERROR_MINLEN 8 +#define ICMP6_ECHO_MINLEN 8 +#define ICMP6_NDP_RS_MINLEN 8 +#define ICMP6_NDP_RA_MINLEN 16 +#define ICMP6_NDP_NS_MINLEN 24 +#define ICMP6_NDP_NA_MINLEN 24 +#define ICMP6_NDP_REDIRECT_MINLEN 40 + +/* + * NDP Options + */ +struct ndpopt { + uint8_t ndpopt_type; /* Option type */ + uint8_t ndpopt_len; /* /!\ In units of 8 octets */ + union { + unsigned char linklayer_addr[6]; /* Source/Target Link-layer */ + struct prefixinfo { /* Prefix Information */ + uint8_t prefix_length; +#ifdef HOST_WORDS_BIGENDIAN + uint8_t L:1, A:1, reserved1:6; +#else + uint8_t reserved1:6, A:1, L:1; +#endif + uint32_t valid_lt; /* Valid Lifetime */ + uint32_t pref_lt; /* Preferred Lifetime */ + uint32_t reserved2; + struct in6_addr prefix; + } QEMU_PACKED prefixinfo; + } ndpopt_body; +#define ndpopt_linklayer ndpopt_body.linklayer_addr +#define ndpopt_prefixinfo ndpopt_body.prefixinfo +} QEMU_PACKED; + +/* NDP options type */ +#define NDPOPT_LINKLAYER_SOURCE 1 /* Source Link-Layer Address */ +#define NDPOPT_LINKLAYER_TARGET 2 /* Target Link-Layer Address */ +#define NDPOPT_PREFIX_INFO 3 /* Prefix Information */ + +/* NDP options size, in octets. */ +#define NDPOPT_LINKLAYER_LEN 8 +#define NDPOPT_PREFIXINFO_LEN 32 + +/* + * Definition of type and code field values. + * Per https://www.iana.org/assignments/icmpv6-parameters/icmpv6-parameters.xml + * Last Updated 2012-11-12 + */ + +/* Errors */ +#define ICMP6_UNREACH 1 /* Destination Unreachable */ +#define ICMP6_UNREACH_NO_ROUTE 0 /* no route to dest */ +#define ICMP6_UNREACH_DEST_PROHIB 1 /* com with dest prohibited */ +#define ICMP6_UNREACH_SCOPE 2 /* beyond scope of src addr */ +#define ICMP6_UNREACH_ADDRESS 3 /* address unreachable */ +#define ICMP6_UNREACH_PORT 4 /* port unreachable */ +#define ICMP6_UNREACH_SRC_FAIL 5 /* src addr failed */ +#define ICMP6_UNREACH_REJECT_ROUTE 6 /* reject route to dest */ +#define ICMP6_UNREACH_SRC_HDR_ERROR 7 /* error in src routing header */ +#define ICMP6_TOOBIG 2 /* Packet Too Big */ +#define ICMP6_TIMXCEED 3 /* Time Exceeded */ +#define ICMP6_TIMXCEED_INTRANS 0 /* hop limit exceeded in transit */ +#define ICMP6_TIMXCEED_REASS 1 /* ttl=0 in reass */ +#define ICMP6_PARAMPROB 4 /* Parameter Problem */ +#define ICMP6_PARAMPROB_HDR_FIELD 0 /* err header field */ +#define ICMP6_PARAMPROB_NXTHDR_TYPE 1 /* unrecognized Next Header type */ +#define ICMP6_PARAMPROB_IPV6_OPT 2 /* unrecognized IPv6 option */ + +/* Informational Messages */ +#define ICMP6_ECHO_REQUEST 128 /* Echo Request */ +#define ICMP6_ECHO_REPLY 129 /* Echo Reply */ +#define ICMP6_NDP_RS 133 /* Router Solicitation (NDP) */ +#define ICMP6_NDP_RA 134 /* Router Advertisement (NDP) */ +#define ICMP6_NDP_NS 135 /* Neighbor Solicitation (NDP) */ +#define ICMP6_NDP_NA 136 /* Neighbor Advertisement (NDP) */ +#define ICMP6_NDP_REDIRECT 137 /* Redirect Message (NDP) */ + +/* + * Router Configuration Variables (rfc4861#section-6) + */ +#define NDP_IsRouter 1 +#define NDP_AdvSendAdvertisements 1 +#define NDP_MaxRtrAdvInterval 600000 +#define NDP_MinRtrAdvInterval ((NDP_MaxRtrAdvInterval >= 9) ? \ + NDP_MaxRtrAdvInterval / 3 : \ + NDP_MaxRtrAdvInterval) +#define NDP_AdvManagedFlag 0 +#define NDP_AdvOtherConfigFlag 0 +#define NDP_AdvLinkMTU 0 +#define NDP_AdvReachableTime 0 +#define NDP_AdvRetransTime 0 +#define NDP_AdvCurHopLimit 64 +#define NDP_AdvDefaultLifetime ((3 * NDP_MaxRtrAdvInterval) / 1000) +#define NDP_AdvValidLifetime 86400 +#define NDP_AdvOnLinkFlag 1 +#define NDP_AdvPrefLifetime 14400 +#define NDP_AdvAutonomousFlag 1 + +void icmp6_init(Slirp *slirp); +void icmp6_cleanup(Slirp *slirp); +void icmp6_input(struct mbuf *); +void icmp6_send_error(struct mbuf *m, uint8_t type, uint8_t code); +void ndp_send_ra(Slirp *slirp); +void ndp_send_ns(Slirp *slirp, struct in6_addr addr); + +#endif diff --git a/slirp/ip6_input.c b/slirp/ip6_input.c new file mode 100644 index 0000000000..c0b11e73cd --- /dev/null +++ b/slirp/ip6_input.c @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2013 + * Guillaume Subiron, Yann Bordenave, Serigne Modou Wagne. + */ + +#include "qemu/osdep.h" +#include "slirp.h" +#include "ip6_icmp.h" + +/* + * IP initialization: fill in IP protocol switch table. + * All protocols not implemented in kernel go to raw IP protocol handler. + */ +void ip6_init(Slirp *slirp) +{ + icmp6_init(slirp); +} + +void ip6_cleanup(Slirp *slirp) +{ + icmp6_cleanup(slirp); +} + +void ip6_input(struct mbuf *m) +{ + struct ip6 *ip6; + + DEBUG_CALL("ip6_input"); + DEBUG_ARG("m = %lx", (long)m); + DEBUG_ARG("m_len = %d", m->m_len); + + if (m->m_len < sizeof(struct ip6)) { + goto bad; + } + + ip6 = mtod(m, struct ip6 *); + + if (ip6->ip_v != IP6VERSION) { + goto bad; + } + + if (ntohs(ip6->ip_pl) > IF_MTU) { + icmp6_send_error(m, ICMP6_TOOBIG, 0); + goto bad; + } + + /* check ip_ttl for a correct ICMP reply */ + if (ip6->ip_hl == 0) { + icmp6_send_error(m, ICMP6_TIMXCEED, ICMP6_TIMXCEED_INTRANS); + goto bad; + } + + /* + * Switch out to protocol's input routine. + */ + switch (ip6->ip_nh) { + case IPPROTO_TCP: + NTOHS(ip6->ip_pl); + tcp_input(m, sizeof(struct ip6), (struct socket *)NULL, AF_INET6); + break; + case IPPROTO_UDP: + udp6_input(m); + break; + case IPPROTO_ICMPV6: + icmp6_input(m); + break; + default: + m_free(m); + } + return; +bad: + m_free(m); +} diff --git a/slirp/ip6_output.c b/slirp/ip6_output.c new file mode 100644 index 0000000000..762cbfe89c --- /dev/null +++ b/slirp/ip6_output.c @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2013 + * Guillaume Subiron, Yann Bordenave, Serigne Modou Wagne. + */ + +#include "qemu/osdep.h" +#include "qemu-common.h" +#include "slirp.h" + +/* Number of packets queued before we start sending + * (to prevent allocing too many mbufs) */ +#define IF6_THRESH 10 + +/* + * IPv6 output. The packet in mbuf chain m contains a IP header + */ +int ip6_output(struct socket *so, struct mbuf *m, int fast) +{ + struct ip6 *ip = mtod(m, struct ip6 *); + + DEBUG_CALL("ip6_output"); + DEBUG_ARG("so = %lx", (long)so); + DEBUG_ARG("m = %lx", (long)m); + + /* Fill IPv6 header */ + ip->ip_v = IP6VERSION; + ip->ip_hl = IP6_HOP_LIMIT; + ip->ip_tc_hi = 0; + ip->ip_tc_lo = 0; + ip->ip_fl_hi = 0; + ip->ip_fl_lo = 0; + + if (fast) { + if_encap(m->slirp, m); + } else { + if_output(so, m); + } + + return 0; +} diff --git a/slirp/ip_icmp.c b/slirp/ip_icmp.c index ace39821d9..590dada0aa 100644 --- a/slirp/ip_icmp.c +++ b/slirp/ip_icmp.c @@ -38,7 +38,7 @@ /* Be nice and tell them it's just a pseudo-ping packet */ static const char icmp_ping_msg[] = "This is a pseudo-PING packet used by Slirp to emulate ICMP ECHO-REQUEST packets.\n"; -/* list of actions for icmp_error() on RX of an icmp message */ +/* list of actions for icmp_send_error() on RX of an icmp message */ static const int icmp_flush[19] = { /* ECHO REPLY (0) */ 0, 1, @@ -101,7 +101,7 @@ static int icmp_send(struct socket *so, struct mbuf *m, int hlen) (struct sockaddr *)&addr, sizeof(addr)) == -1) { DEBUG_MISC((dfd, "icmp_input icmp sendto tx errno = %d-%s\n", errno, strerror(errno))); - icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_NET, 0, strerror(errno)); + icmp_send_error(m, ICMP_UNREACH, ICMP_UNREACH_NET, 0, strerror(errno)); icmp_detach(so); } @@ -189,7 +189,7 @@ icmp_input(struct mbuf *m, int hlen) (struct sockaddr *)&addr, sizeof(addr)) == -1) { DEBUG_MISC((dfd,"icmp_input udp sendto tx errno = %d-%s\n", errno,strerror(errno))); - icmp_error(m, ICMP_UNREACH,ICMP_UNREACH_NET, 0,strerror(errno)); + icmp_send_error(m, ICMP_UNREACH, ICMP_UNREACH_NET, 0, strerror(errno)); udp_detach(so); } } /* if ip->ip_dst.s_addr == alias_addr.s_addr */ @@ -235,7 +235,7 @@ end_error: #define ICMP_MAXDATALEN (IP_MSS-28) void -icmp_error(struct mbuf *msrc, u_char type, u_char code, int minsize, +icmp_send_error(struct mbuf *msrc, u_char type, u_char code, int minsize, const char *message) { unsigned hlen, shlen, s_ip_len; @@ -243,7 +243,7 @@ icmp_error(struct mbuf *msrc, u_char type, u_char code, int minsize, register struct icmp *icp; register struct mbuf *m; - DEBUG_CALL("icmp_error"); + DEBUG_CALL("icmp_send_error"); DEBUG_ARG("msrc = %p", msrc); DEBUG_ARG("msrc_len = %d", msrc->m_len); @@ -433,7 +433,7 @@ void icmp_receive(struct socket *so) } DEBUG_MISC((dfd, " udp icmp rx errno = %d-%s\n", errno, strerror(errno))); - icmp_error(so->so_m, ICMP_UNREACH, error_code, 0, strerror(errno)); + icmp_send_error(so->so_m, ICMP_UNREACH, error_code, 0, strerror(errno)); } else { icmp_reflect(so->so_m); so->so_m = NULL; /* Don't m_free() it again! */ diff --git a/slirp/ip_icmp.h b/slirp/ip_icmp.h index be4426b8e7..846761d08e 100644 --- a/slirp/ip_icmp.h +++ b/slirp/ip_icmp.h @@ -156,8 +156,8 @@ struct icmp { void icmp_init(Slirp *slirp); void icmp_cleanup(Slirp *slirp); void icmp_input(struct mbuf *, int); -void icmp_error(struct mbuf *msrc, u_char type, u_char code, int minsize, - const char *message); +void icmp_send_error(struct mbuf *msrc, u_char type, u_char code, int minsize, + const char *message); void icmp_reflect(struct mbuf *); void icmp_receive(struct socket *so); void icmp_detach(struct socket *so); diff --git a/slirp/ip_input.c b/slirp/ip_input.c index e4855ae0f0..12f173de6c 100644 --- a/slirp/ip_input.c +++ b/slirp/ip_input.c @@ -132,9 +132,9 @@ ip_input(struct mbuf *m) m_adj(m, ip->ip_len - m->m_len); /* check ip_ttl for a correct ICMP reply */ - if(ip->ip_ttl==0) { - icmp_error(m, ICMP_TIMXCEED,ICMP_TIMXCEED_INTRANS, 0,"ttl"); - goto bad; + if (ip->ip_ttl == 0) { + icmp_send_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, 0, "ttl"); + goto bad; } /* @@ -200,7 +200,7 @@ ip_input(struct mbuf *m) */ switch (ip->ip_p) { case IPPROTO_TCP: - tcp_input(m, hlen, (struct socket *)NULL); + tcp_input(m, hlen, (struct socket *)NULL, AF_INET); break; case IPPROTO_UDP: udp_input(m, hlen); @@ -637,7 +637,7 @@ typedef uint32_t n_time; } return (0); bad: - icmp_error(m, type, code, 0, 0); + icmp_send_error(m, type, code, 0, 0); return (1); } diff --git a/slirp/libslirp.h b/slirp/libslirp.h index 5bdcbd50f7..c4b25c90e6 100644 --- a/slirp/libslirp.h +++ b/slirp/libslirp.h @@ -10,9 +10,11 @@ int get_dns_addr(struct in_addr *pdns_addr); Slirp *slirp_init(int restricted, struct in_addr vnetwork, struct in_addr vnetmask, struct in_addr vhost, - const char *vhostname, const char *tftp_path, - const char *bootfile, struct in_addr vdhcp_start, - struct in_addr vnameserver, const char **vdnssearch, + struct in6_addr vprefix_addr6, uint8_t vprefix_len, + struct in6_addr vhost6, const char *vhostname, + const char *tftp_path, const char *bootfile, + struct in_addr vdhcp_start, struct in_addr vnameserver, + struct in6_addr vnameserver6, const char **vdnssearch, void *opaque); void slirp_cleanup(Slirp *slirp); diff --git a/slirp/mbuf.c b/slirp/mbuf.c index c959758465..d688dd43f7 100644 --- a/slirp/mbuf.c +++ b/slirp/mbuf.c @@ -22,9 +22,9 @@ /* * Find a nice value for msize - * XXX if_maxlinkhdr already in mtu */ -#define SLIRP_MSIZE (IF_MTU + IF_MAXLINKHDR + offsetof(struct mbuf, m_dat) + 6) +#define SLIRP_MSIZE\ + (offsetof(struct mbuf, m_dat) + IF_MAXLINKHDR + TCPIPHDR_DELTA + IF_MTU) void m_init(Slirp *slirp) diff --git a/slirp/ndp_table.c b/slirp/ndp_table.c new file mode 100644 index 0000000000..9d4c39b45c --- /dev/null +++ b/slirp/ndp_table.c @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2013 + * Guillaume Subiron, Yann Bordenave, Serigne Modou Wagne. + */ + +#include "qemu/osdep.h" +#include "qemu-common.h" +#include "slirp.h" + +void ndp_table_add(Slirp *slirp, struct in6_addr ip_addr, + uint8_t ethaddr[ETH_ALEN]) +{ + NdpTable *ndp_table = &slirp->ndp_table; + int i; + + DEBUG_CALL("ndp_table_add"); +#if !defined(_WIN32) || (_WIN32_WINNT >= 0x0600) + char addrstr[INET6_ADDRSTRLEN]; + inet_ntop(AF_INET6, &(ip_addr), addrstr, INET6_ADDRSTRLEN); + DEBUG_ARG("ip = %s", addrstr); +#endif + DEBUG_ARGS((dfd, " hw addr = %02x:%02x:%02x:%02x:%02x:%02x\n", + ethaddr[0], ethaddr[1], ethaddr[2], + ethaddr[3], ethaddr[4], ethaddr[5])); + + if (IN6_IS_ADDR_MULTICAST(&ip_addr) || IN6_IS_ADDR_UNSPECIFIED(&ip_addr)) { + /* Do not register multicast or unspecified addresses */ + DEBUG_CALL(" abort: do not register multicast or unspecified address"); + return; + } + + /* Search for an entry */ + for (i = 0; i < NDP_TABLE_SIZE; i++) { + if (in6_equal(&ndp_table->table[i].ip_addr, &ip_addr)) { + DEBUG_CALL(" already in table: update the entry"); + /* Update the entry */ + memcpy(ndp_table->table[i].eth_addr, ethaddr, ETH_ALEN); + return; + } + } + + /* No entry found, create a new one */ + DEBUG_CALL(" create new entry"); + ndp_table->table[ndp_table->next_victim].ip_addr = ip_addr; + memcpy(ndp_table->table[ndp_table->next_victim].eth_addr, + ethaddr, ETH_ALEN); + ndp_table->next_victim = (ndp_table->next_victim + 1) % NDP_TABLE_SIZE; +} + +bool ndp_table_search(Slirp *slirp, struct in6_addr ip_addr, + uint8_t out_ethaddr[ETH_ALEN]) +{ + NdpTable *ndp_table = &slirp->ndp_table; + int i; + + DEBUG_CALL("ndp_table_search"); +#if !defined(_WIN32) || (_WIN32_WINNT >= 0x0600) + char addrstr[INET6_ADDRSTRLEN]; + inet_ntop(AF_INET6, &(ip_addr), addrstr, INET6_ADDRSTRLEN); + DEBUG_ARG("ip = %s", addrstr); +#endif + + assert(!IN6_IS_ADDR_UNSPECIFIED(&ip_addr)); + + /* Multicast address: fec0::abcd:efgh/8 -> 33:33:ab:cd:ef:gh */ + if (IN6_IS_ADDR_MULTICAST(&ip_addr)) { + out_ethaddr[0] = 0x33; out_ethaddr[1] = 0x33; + out_ethaddr[2] = ip_addr.s6_addr[12]; + out_ethaddr[3] = ip_addr.s6_addr[13]; + out_ethaddr[4] = ip_addr.s6_addr[14]; + out_ethaddr[5] = ip_addr.s6_addr[15]; + DEBUG_ARGS((dfd, " multicast addr = %02x:%02x:%02x:%02x:%02x:%02x\n", + out_ethaddr[0], out_ethaddr[1], out_ethaddr[2], + out_ethaddr[3], out_ethaddr[4], out_ethaddr[5])); + return 1; + } + + for (i = 0; i < NDP_TABLE_SIZE; i++) { + if (in6_equal(&ndp_table->table[i].ip_addr, &ip_addr)) { + memcpy(out_ethaddr, ndp_table->table[i].eth_addr, ETH_ALEN); + DEBUG_ARGS((dfd, " found hw addr = %02x:%02x:%02x:%02x:%02x:%02x\n", + out_ethaddr[0], out_ethaddr[1], out_ethaddr[2], + out_ethaddr[3], out_ethaddr[4], out_ethaddr[5])); + return 1; + } + } + + DEBUG_CALL(" ip not found in table"); + return 0; +} diff --git a/slirp/slirp.c b/slirp/slirp.c index 0466d330da..9ccf4157d8 100644 --- a/slirp/slirp.c +++ b/slirp/slirp.c @@ -201,19 +201,23 @@ static int slirp_state_load(QEMUFile *f, void *opaque, int version_id); Slirp *slirp_init(int restricted, struct in_addr vnetwork, struct in_addr vnetmask, struct in_addr vhost, - const char *vhostname, const char *tftp_path, - const char *bootfile, struct in_addr vdhcp_start, - struct in_addr vnameserver, const char **vdnssearch, + struct in6_addr vprefix_addr6, uint8_t vprefix_len, + struct in6_addr vhost6, const char *vhostname, + const char *tftp_path, const char *bootfile, + struct in_addr vdhcp_start, struct in_addr vnameserver, + struct in6_addr vnameserver6, const char **vdnssearch, void *opaque) { Slirp *slirp = g_malloc0(sizeof(Slirp)); slirp_init_once(); + slirp->grand = g_rand_new(); slirp->restricted = restricted; if_init(slirp); ip_init(slirp); + ip6_init(slirp); /* Initialise mbufs *after* setting the MTU */ m_init(slirp); @@ -221,6 +225,9 @@ Slirp *slirp_init(int restricted, struct in_addr vnetwork, slirp->vnetwork_addr = vnetwork; slirp->vnetwork_mask = vnetmask; slirp->vhost_addr = vhost; + slirp->vprefix_addr6 = vprefix_addr6; + slirp->vprefix_len = vprefix_len; + slirp->vhost_addr6 = vhost6; if (vhostname) { pstrcpy(slirp->client_hostname, sizeof(slirp->client_hostname), vhostname); @@ -229,6 +236,7 @@ Slirp *slirp_init(int restricted, struct in_addr vnetwork, slirp->bootp_filename = g_strdup(bootfile); slirp->vdhcp_startaddr = vdhcp_start; slirp->vnameserver_addr = vnameserver; + slirp->vnameserver_addr6 = vnameserver6; if (vdnssearch) { translate_dnssearch(slirp, vdnssearch); @@ -251,8 +259,11 @@ void slirp_cleanup(Slirp *slirp) unregister_savevm(NULL, "slirp", slirp); ip_cleanup(slirp); + ip6_cleanup(slirp); m_cleanup(slirp); + g_rand_free(slirp->grand); + g_free(slirp->vdnssearch); g_free(slirp->tftp_prefix); g_free(slirp->bootp_filename); @@ -568,7 +579,8 @@ void slirp_pollfds_poll(GArray *pollfds, int select_error) /* * Continue tcp_input */ - tcp_input((struct mbuf *)NULL, sizeof(struct ip), so); + tcp_input((struct mbuf *)NULL, sizeof(struct ip), so, + so->so_ffamily); /* continue; */ } else { ret = sowrite(so); @@ -617,7 +629,8 @@ void slirp_pollfds_poll(GArray *pollfds, int select_error) } } - tcp_input((struct mbuf *)NULL, sizeof(struct ip), so); + tcp_input((struct mbuf *)NULL, sizeof(struct ip), so, + so->so_ffamily); } /* SS_ISFCONNECTING */ #endif } @@ -744,21 +757,28 @@ void slirp_input(Slirp *slirp, const uint8_t *pkt, int pkt_len) arp_input(slirp, pkt, pkt_len); break; case ETH_P_IP: + case ETH_P_IPV6: m = m_get(slirp); if (!m) return; - /* Note: we add to align the IP header */ - if (M_FREEROOM(m) < pkt_len + 2) { - m_inc(m, pkt_len + 2); + /* Note: we add 2 to align the IP header on 4 bytes, + * and add the margin for the tcpiphdr overhead */ + if (M_FREEROOM(m) < pkt_len + TCPIPHDR_DELTA + 2) { + m_inc(m, pkt_len + TCPIPHDR_DELTA + 2); } - m->m_len = pkt_len + 2; - memcpy(m->m_data + 2, pkt, pkt_len); + m->m_len = pkt_len + TCPIPHDR_DELTA + 2; + memcpy(m->m_data + TCPIPHDR_DELTA + 2, pkt, pkt_len); - m->m_data += 2 + ETH_HLEN; - m->m_len -= 2 + ETH_HLEN; + m->m_data += TCPIPHDR_DELTA + 2 + ETH_HLEN; + m->m_len -= TCPIPHDR_DELTA + 2 + ETH_HLEN; - ip_input(m); + if (proto == ETH_P_IP) { + ip_input(m); + } else if (proto == ETH_P_IPV6) { + ip6_input(m); + } break; + default: break; } @@ -826,6 +846,31 @@ static int if_encap4(Slirp *slirp, struct mbuf *ifm, struct ethhdr *eh, } } +/* Prepare the IPv6 packet to be sent to the ethernet device. Returns 1 if no + * packet should be sent, 0 if the packet must be re-queued, 2 if the packet + * is ready to go. + */ +static int if_encap6(Slirp *slirp, struct mbuf *ifm, struct ethhdr *eh, + uint8_t ethaddr[ETH_ALEN]) +{ + const struct ip6 *ip6h = mtod(ifm, const struct ip6 *); + if (!ndp_table_search(slirp, ip6h->ip_dst, ethaddr)) { + if (!ifm->resolution_requested) { + ndp_send_ns(slirp, ip6h->ip_dst); + ifm->resolution_requested = true; + ifm->expiration_date = + qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + 1000000000ULL; + } + return 0; + } else { + eh->h_proto = htons(ETH_P_IPV6); + in6_compute_ethaddr(ip6h->ip_src, eh->h_source); + + /* Send this */ + return 2; + } +} + /* Output the IP packet to the ethernet device. Returns 0 if the packet must be * re-queued. */ @@ -849,9 +894,15 @@ int if_encap(Slirp *slirp, struct mbuf *ifm) } break; + case IP6VERSION: + ret = if_encap6(slirp, ifm, eh, ethaddr); + if (ret < 2) { + return ret; + } + break; + default: - /* Do not assert while we don't manage IP6VERSION */ - /* assert(0); */ + g_assert_not_reached(); break; } diff --git a/slirp/slirp.h b/slirp/slirp.h index 07c13b4725..71f2439461 100644 --- a/slirp/slirp.h +++ b/slirp/slirp.h @@ -14,8 +14,6 @@ typedef char *caddr_t; # include <iphlpapi.h> #else -# define ioctlsocket ioctl -# define closesocket(s) close(s) # if !defined(__HAIKU__) # define O_BINARY 0 # endif @@ -113,6 +111,8 @@ void free(void *ptr); #include <sys/stropts.h> #endif +#include <glib.h> + #include "debug.h" #include "qemu/queue.h" @@ -121,12 +121,14 @@ void free(void *ptr); #include "libslirp.h" #include "ip.h" +#include "ip6.h" #include "tcp.h" #include "tcp_timer.h" #include "tcp_var.h" #include "tcpip.h" #include "udp.h" #include "ip_icmp.h" +#include "ip6_icmp.h" #include "mbuf.h" #include "sbuf.h" #include "socket.h" @@ -178,6 +180,23 @@ void arp_table_add(Slirp *slirp, uint32_t ip_addr, uint8_t ethaddr[ETH_ALEN]); bool arp_table_search(Slirp *slirp, uint32_t ip_addr, uint8_t out_ethaddr[ETH_ALEN]); +struct ndpentry { + unsigned char eth_addr[ETH_ALEN]; /* sender hardware address */ + struct in6_addr ip_addr; /* sender IP address */ +} QEMU_PACKED; + +#define NDP_TABLE_SIZE 16 + +typedef struct NdpTable { + struct ndpentry table[NDP_TABLE_SIZE]; + int next_victim; +} NdpTable; + +void ndp_table_add(Slirp *slirp, struct in6_addr ip_addr, + uint8_t ethaddr[ETH_ALEN]); +bool ndp_table_search(Slirp *slirp, struct in6_addr ip_addr, + uint8_t out_ethaddr[ETH_ALEN]); + struct Slirp { QTAILQ_ENTRY(Slirp) entry; u_int time_fasttimo; @@ -188,8 +207,12 @@ struct Slirp { struct in_addr vnetwork_addr; struct in_addr vnetwork_mask; struct in_addr vhost_addr; + struct in6_addr vprefix_addr6; + uint8_t vprefix_len; + struct in6_addr vhost_addr6; struct in_addr vdhcp_startaddr; struct in_addr vnameserver_addr; + struct in6_addr vnameserver_addr6; struct in_addr client_ipaddr; char client_hostname[33]; @@ -236,6 +259,10 @@ struct Slirp { struct tftp_session tftp_sessions[TFTP_SESSIONS_MAX]; ArpTable arp_table; + NdpTable ndp_table; + + GRand *grand; + QEMUTimer *ra_timer; void *opaque; }; @@ -278,6 +305,7 @@ int translate_dnssearch(Slirp *s, const char ** names); /* cksum.c */ int cksum(struct mbuf *m, int len); +int ip6_cksum(struct mbuf *m); /* if.c */ void if_init(Slirp *); @@ -293,8 +321,16 @@ void ip_stripoptions(register struct mbuf *, struct mbuf *); /* ip_output.c */ int ip_output(struct socket *, struct mbuf *); +/* ip6_input.c */ +void ip6_init(Slirp *); +void ip6_cleanup(Slirp *); +void ip6_input(struct mbuf *); + +/* ip6_output */ +int ip6_output(struct socket *, struct mbuf *, int fast); + /* tcp_input.c */ -void tcp_input(register struct mbuf *, int, struct socket *); +void tcp_input(register struct mbuf *, int, struct socket *, unsigned short af); int tcp_mss(register struct tcpcb *, u_int); /* tcp_output.c */ @@ -305,7 +341,8 @@ void tcp_setpersist(register struct tcpcb *); void tcp_init(Slirp *); void tcp_cleanup(Slirp *); void tcp_template(struct tcpcb *); -void tcp_respond(struct tcpcb *, register struct tcpiphdr *, register struct mbuf *, tcp_seq, tcp_seq, int); +void tcp_respond(struct tcpcb *, register struct tcpiphdr *, + register struct mbuf *, tcp_seq, tcp_seq, int, unsigned short); struct tcpcb * tcp_newtcpcb(struct socket *); struct tcpcb * tcp_close(register struct tcpcb *); void tcp_sockclosed(struct tcpcb *); diff --git a/slirp/socket.c b/slirp/socket.c index 2b5453e020..b836c42b8e 100644 --- a/slirp/socket.c +++ b/slirp/socket.c @@ -463,7 +463,7 @@ sorecvfrom(struct socket *so) DEBUG_MISC((dfd," udp icmp rx errno = %d-%s\n", errno,strerror(errno))); - icmp_error(so->so_m, ICMP_UNREACH,code, 0,strerror(errno)); + icmp_send_error(so->so_m, ICMP_UNREACH, code, 0, strerror(errno)); } else { icmp_reflect(so->so_m); so->so_m = NULL; /* Don't m_free() it again! */ @@ -483,7 +483,18 @@ sorecvfrom(struct socket *so) if (!m) { return; } - m->m_data += IF_MAXLINKHDR; + switch (so->so_ffamily) { + case AF_INET: + m->m_data += IF_MAXLINKHDR + sizeof(struct udpiphdr); + break; + case AF_INET6: + m->m_data += IF_MAXLINKHDR + sizeof(struct ip6) + + sizeof(struct udphdr); + break; + default: + g_assert_not_reached(); + break; + } /* * XXX Shouldn't FIONREAD packets destined for port 53, @@ -505,13 +516,37 @@ sorecvfrom(struct socket *so) DEBUG_MISC((dfd, " did recvfrom %d, errno = %d-%s\n", m->m_len, errno,strerror(errno))); if(m->m_len<0) { - u_char code=ICMP_UNREACH_PORT; - - if(errno == EHOSTUNREACH) code=ICMP_UNREACH_HOST; - else if(errno == ENETUNREACH) code=ICMP_UNREACH_NET; - - DEBUG_MISC((dfd," rx error, tx icmp ICMP_UNREACH:%i\n", code)); - icmp_error(so->so_m, ICMP_UNREACH,code, 0,strerror(errno)); + /* Report error as ICMP */ + switch (so->so_lfamily) { + uint8_t code; + case AF_INET: + code = ICMP_UNREACH_PORT; + + if (errno == EHOSTUNREACH) { + code = ICMP_UNREACH_HOST; + } else if (errno == ENETUNREACH) { + code = ICMP_UNREACH_NET; + } + + DEBUG_MISC((dfd, " rx error, tx icmp ICMP_UNREACH:%i\n", code)); + icmp_send_error(so->so_m, ICMP_UNREACH, code, 0, strerror(errno)); + break; + case AF_INET6: + code = ICMP6_UNREACH_PORT; + + if (errno == EHOSTUNREACH) { + code = ICMP6_UNREACH_ADDRESS; + } else if (errno == ENETUNREACH) { + code = ICMP6_UNREACH_NO_ROUTE; + } + + DEBUG_MISC((dfd, " rx error, tx icmp6 ICMP_UNREACH:%i\n", code)); + icmp6_send_error(so->so_m, ICMP6_UNREACH, code); + break; + default: + g_assert_not_reached(); + break; + } m_free(m); } else { /* @@ -541,7 +576,12 @@ sorecvfrom(struct socket *so) (struct sockaddr_in *) &daddr, so->so_iptos); break; + case AF_INET6: + udp6_output(so, m, (struct sockaddr_in6 *) &saddr, + (struct sockaddr_in6 *) &daddr); + break; default: + g_assert_not_reached(); break; } } /* rx error */ @@ -731,6 +771,7 @@ void sotranslate_out(struct socket *so, struct sockaddr_storage *addr) { Slirp *slirp = so->slirp; struct sockaddr_in *sin = (struct sockaddr_in *)addr; + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr; switch (addr->ss_family) { case AF_INET: @@ -751,6 +792,19 @@ void sotranslate_out(struct socket *so, struct sockaddr_storage *addr) ntohs(sin->sin_port), inet_ntoa(sin->sin_addr))); break; + case AF_INET6: + if (in6_equal_net(&so->so_faddr6, &slirp->vprefix_addr6, + slirp->vprefix_len)) { + if (in6_equal(&so->so_faddr6, &slirp->vnameserver_addr6)) { + /*if (get_dns_addr(&addr) < 0) {*/ /* TODO */ + sin6->sin6_addr = in6addr_loopback; + /*}*/ + } else { + sin6->sin6_addr = in6addr_loopback; + } + } + break; + default: break; } @@ -760,6 +814,7 @@ void sotranslate_in(struct socket *so, struct sockaddr_storage *addr) { Slirp *slirp = so->slirp; struct sockaddr_in *sin = (struct sockaddr_in *)addr; + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr; switch (addr->ss_family) { case AF_INET: @@ -776,6 +831,16 @@ void sotranslate_in(struct socket *so, struct sockaddr_storage *addr) } break; + case AF_INET6: + if (in6_equal_net(&so->so_faddr6, &slirp->vprefix_addr6, + slirp->vprefix_len)) { + if (in6_equal(&sin6->sin6_addr, &in6addr_loopback) + || !in6_equal(&so->so_faddr6, &slirp->vhost_addr6)) { + sin6->sin6_addr = so->so_faddr6; + } + } + break; + default: break; } @@ -797,6 +862,13 @@ void sotranslate_accept(struct socket *so) } break; + case AF_INET6: + if (in6_equal(&so->so_faddr6, &in6addr_any) || + in6_equal(&so->so_faddr6, &in6addr_loopback)) { + so->so_faddr6 = slirp->vhost_addr6; + } + break; + default: break; } diff --git a/slirp/socket.h b/slirp/socket.h index c4afc9494f..e9c9b053dc 100644 --- a/slirp/socket.h +++ b/slirp/socket.h @@ -34,17 +34,23 @@ struct socket { union { /* foreign host */ struct sockaddr_storage ss; struct sockaddr_in sin; + struct sockaddr_in6 sin6; } fhost; #define so_faddr fhost.sin.sin_addr #define so_fport fhost.sin.sin_port +#define so_faddr6 fhost.sin6.sin6_addr +#define so_fport6 fhost.sin6.sin6_port #define so_ffamily fhost.ss.ss_family union { /* local host */ struct sockaddr_storage ss; struct sockaddr_in sin; + struct sockaddr_in6 sin6; } lhost; #define so_laddr lhost.sin.sin_addr #define so_lport lhost.sin.sin_port +#define so_laddr6 lhost.sin6.sin6_addr +#define so_lport6 lhost.sin6.sin6_port #define so_lfamily lhost.ss.ss_family uint8_t so_iptos; /* Type of service */ @@ -102,6 +108,13 @@ static inline int sockaddr_equal(struct sockaddr_storage *a, return a4->sin_addr.s_addr == b4->sin_addr.s_addr && a4->sin_port == b4->sin_port; } + case AF_INET6: + { + struct sockaddr_in6 *a6 = (struct sockaddr_in6 *) a; + struct sockaddr_in6 *b6 = (struct sockaddr_in6 *) b; + return (in6_equal(&a6->sin6_addr, &b6->sin6_addr) + && a6->sin6_port == b6->sin6_port); + } default: g_assert_not_reached(); } diff --git a/slirp/tcp.h b/slirp/tcp.h index 2e2b4033a6..61befcde57 100644 --- a/slirp/tcp.h +++ b/slirp/tcp.h @@ -106,6 +106,8 @@ struct tcphdr { */ #undef TCP_MSS #define TCP_MSS 1460 +#undef TCP6_MSS +#define TCP6_MSS 1440 #undef TCP_MAXWIN #define TCP_MAXWIN 65535 /* largest value for (unscaled) window */ diff --git a/slirp/tcp_input.c b/slirp/tcp_input.c index 2027a7511d..1fcca3040e 100644 --- a/slirp/tcp_input.c +++ b/slirp/tcp_input.c @@ -214,9 +214,10 @@ present: * protocol specification dated September, 1981 very closely. */ void -tcp_input(struct mbuf *m, int iphlen, struct socket *inso) +tcp_input(struct mbuf *m, int iphlen, struct socket *inso, unsigned short af) { - struct ip save_ip, *ip; + struct ip save_ip, *ip; + struct ip6 save_ip6, *ip6; register struct tcpiphdr *ti; caddr_t optp = NULL; int optlen = 0; @@ -230,6 +231,7 @@ tcp_input(struct mbuf *m, int iphlen, struct socket *inso) int ret; struct sockaddr_storage lhost, fhost; struct sockaddr_in *lhost4, *fhost4; + struct sockaddr_in6 *lhost6, *fhost6; struct ex_list *ex_ptr; Slirp *slirp; @@ -256,37 +258,83 @@ tcp_input(struct mbuf *m, int iphlen, struct socket *inso) } slirp = m->slirp; - /* - * Get IP and TCP header together in first mbuf. - * Note: IP leaves IP header in first mbuf. - */ - ti = mtod(m, struct tcpiphdr *); - if (iphlen > sizeof(struct ip )) { - ip_stripoptions(m, (struct mbuf *)0); - iphlen=sizeof(struct ip ); - } - /* XXX Check if too short */ + ip = mtod(m, struct ip *); + ip6 = mtod(m, struct ip6 *); + switch (af) { + case AF_INET: + if (iphlen > sizeof(struct ip)) { + ip_stripoptions(m, (struct mbuf *)0); + iphlen = sizeof(struct ip); + } + /* XXX Check if too short */ - /* - * Save a copy of the IP header in case we want restore it - * for sending an ICMP error message in response. - */ - ip=mtod(m, struct ip *); - save_ip = *ip; - save_ip.ip_len+= iphlen; - /* - * Checksum extended TCP header and data. - */ - tlen = ((struct ip *)ti)->ip_len; - tcpiphdr2qlink(ti)->next = tcpiphdr2qlink(ti)->prev = NULL; - memset(&ti->ti_i.ih_mbuf, 0 , sizeof(struct mbuf_ptr)); - ti->ti_x1 = 0; - ti->ti_len = htons((uint16_t)tlen); - len = sizeof(struct ip ) + tlen; - if(cksum(m, len)) { - goto drop; + /* + * Save a copy of the IP header in case we want restore it + * for sending an ICMP error message in response. + */ + save_ip = *ip; + save_ip.ip_len += iphlen; + + /* + * Get IP and TCP header together in first mbuf. + * Note: IP leaves IP header in first mbuf. + */ + m->m_data -= sizeof(struct tcpiphdr) - sizeof(struct ip) + - sizeof(struct tcphdr); + m->m_len += sizeof(struct tcpiphdr) - sizeof(struct ip) + - sizeof(struct tcphdr); + ti = mtod(m, struct tcpiphdr *); + + /* + * Checksum extended TCP header and data. + */ + tlen = ip->ip_len; + tcpiphdr2qlink(ti)->next = tcpiphdr2qlink(ti)->prev = NULL; + memset(&ti->ih_mbuf, 0 , sizeof(struct mbuf_ptr)); + memset(&ti->ti, 0, sizeof(ti->ti)); + ti->ti_x0 = 0; + ti->ti_src = save_ip.ip_src; + ti->ti_dst = save_ip.ip_dst; + ti->ti_pr = save_ip.ip_p; + ti->ti_len = htons((uint16_t)tlen); + break; + + case AF_INET6: + /* + * Save a copy of the IP header in case we want restore it + * for sending an ICMP error message in response. + */ + save_ip6 = *ip6; + /* + * Get IP and TCP header together in first mbuf. + * Note: IP leaves IP header in first mbuf. + */ + m->m_data -= sizeof(struct tcpiphdr) - (sizeof(struct ip6) + + sizeof(struct tcphdr)); + m->m_len += sizeof(struct tcpiphdr) - (sizeof(struct ip6) + + sizeof(struct tcphdr)); + ti = mtod(m, struct tcpiphdr *); + + tlen = ip6->ip_pl; + tcpiphdr2qlink(ti)->next = tcpiphdr2qlink(ti)->prev = NULL; + memset(&ti->ih_mbuf, 0 , sizeof(struct mbuf_ptr)); + memset(&ti->ti, 0, sizeof(ti->ti)); + ti->ti_x0 = 0; + ti->ti_src6 = save_ip6.ip_src; + ti->ti_dst6 = save_ip6.ip_dst; + ti->ti_nh6 = save_ip6.ip_nh; + ti->ti_len = htons((uint16_t)tlen); + break; + + default: + g_assert_not_reached(); + } + + len = ((sizeof(struct tcpiphdr) - sizeof(struct tcphdr)) + tlen); + if (cksum(m, len)) { + goto drop; } /* @@ -323,14 +371,28 @@ tcp_input(struct mbuf *m, int iphlen, struct socket *inso) * Locate pcb for segment. */ findso: - lhost.ss_family = AF_INET; - lhost4 = (struct sockaddr_in *) &lhost; - lhost4->sin_addr = ti->ti_src; - lhost4->sin_port = ti->ti_sport; - fhost.ss_family = AF_INET; - fhost4 = (struct sockaddr_in *) &fhost; - fhost4->sin_addr = ti->ti_dst; - fhost4->sin_port = ti->ti_dport; + lhost.ss_family = af; + fhost.ss_family = af; + switch (af) { + case AF_INET: + lhost4 = (struct sockaddr_in *) &lhost; + lhost4->sin_addr = ti->ti_src; + lhost4->sin_port = ti->ti_sport; + fhost4 = (struct sockaddr_in *) &fhost; + fhost4->sin_addr = ti->ti_dst; + fhost4->sin_port = ti->ti_dport; + break; + case AF_INET6: + lhost6 = (struct sockaddr_in6 *) &lhost; + lhost6->sin6_addr = ti->ti_src6; + lhost6->sin6_port = ti->ti_sport; + fhost6 = (struct sockaddr_in6 *) &fhost; + fhost6->sin6_addr = ti->ti_dst6; + fhost6->sin6_port = ti->ti_dport; + break; + default: + g_assert_not_reached(); + } so = solookup(&slirp->tcp_last_so, &slirp->tcb, &lhost, &fhost); @@ -380,8 +442,18 @@ findso: so->lhost.ss = lhost; so->fhost.ss = fhost; - if ((so->so_iptos = tcp_tos(so)) == 0) - so->so_iptos = ((struct ip *)ti)->ip_tos; + so->so_iptos = tcp_tos(so); + if (so->so_iptos == 0) { + switch (af) { + case AF_INET: + so->so_iptos = ((struct ip *)ti)->ip_tos; + break; + case AF_INET6: + break; + default: + g_assert_not_reached(); + } + } tp = sototcpcb(so); tp->t_state = TCPS_LISTEN; @@ -560,8 +632,9 @@ findso: * If this is destined for the control address, then flag to * tcp_ctl once connected, otherwise connect */ - if ((so->so_faddr.s_addr & slirp->vnetwork_mask.s_addr) == - slirp->vnetwork_addr.s_addr) { + if (af == AF_INET && + (so->so_faddr.s_addr & slirp->vnetwork_mask.s_addr) == + slirp->vnetwork_addr.s_addr) { if (so->so_faddr.s_addr != slirp->vhost_addr.s_addr && so->so_faddr.s_addr != slirp->vnameserver_addr.s_addr) { /* May be an add exec */ @@ -586,29 +659,58 @@ findso: } if ((tcp_fconnect(so, so->so_ffamily) == -1) && -#if defined(_WIN32) - socket_error() != WSAEWOULDBLOCK -#else (errno != EINPROGRESS) && (errno != EWOULDBLOCK) -#endif ) { - u_char code=ICMP_UNREACH_NET; + uint8_t code; DEBUG_MISC((dfd, " tcp fconnect errno = %d-%s\n", errno,strerror(errno))); if(errno == ECONNREFUSED) { /* ACK the SYN, send RST to refuse the connection */ - tcp_respond(tp, ti, m, ti->ti_seq+1, (tcp_seq)0, - TH_RST|TH_ACK); + tcp_respond(tp, ti, m, ti->ti_seq + 1, (tcp_seq) 0, + TH_RST | TH_ACK, af); } else { - if(errno == EHOSTUNREACH) code=ICMP_UNREACH_HOST; + switch (af) { + case AF_INET: + code = ICMP_UNREACH_NET; + if (errno == EHOSTUNREACH) { + code = ICMP_UNREACH_HOST; + } + break; + case AF_INET6: + code = ICMP6_UNREACH_NO_ROUTE; + if (errno == EHOSTUNREACH) { + code = ICMP6_UNREACH_ADDRESS; + } + break; + default: + g_assert_not_reached(); + } HTONL(ti->ti_seq); /* restore tcp header */ HTONL(ti->ti_ack); HTONS(ti->ti_win); HTONS(ti->ti_urp); m->m_data -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); m->m_len += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); - *ip=save_ip; - icmp_error(m, ICMP_UNREACH,code, 0,strerror(errno)); + switch (af) { + case AF_INET: + m->m_data += sizeof(struct tcpiphdr) - sizeof(struct ip) + - sizeof(struct tcphdr); + m->m_len -= sizeof(struct tcpiphdr) - sizeof(struct ip) + - sizeof(struct tcphdr); + *ip = save_ip; + icmp_send_error(m, ICMP_UNREACH, code, 0, strerror(errno)); + break; + case AF_INET6: + m->m_data += sizeof(struct tcpiphdr) - (sizeof(struct ip6) + + sizeof(struct tcphdr)); + m->m_len -= sizeof(struct tcpiphdr) - (sizeof(struct ip6) + + sizeof(struct tcphdr)); + *ip6 = save_ip6; + icmp6_send_error(m, ICMP6_UNREACH, code); + break; + default: + g_assert_not_reached(); + } } tcp_close(tp); m_free(m); @@ -1280,11 +1382,11 @@ dropafterack: dropwithreset: /* reuses m if m!=NULL, m_free() unnecessary */ if (tiflags & TH_ACK) - tcp_respond(tp, ti, m, (tcp_seq)0, ti->ti_ack, TH_RST); + tcp_respond(tp, ti, m, (tcp_seq)0, ti->ti_ack, TH_RST, af); else { if (tiflags & TH_SYN) ti->ti_len++; - tcp_respond(tp, ti, m, ti->ti_seq+ti->ti_len, (tcp_seq)0, - TH_RST|TH_ACK); + tcp_respond(tp, ti, m, ti->ti_seq + ti->ti_len, (tcp_seq) 0, + TH_RST | TH_ACK, af); } return; @@ -1475,7 +1577,19 @@ tcp_mss(struct tcpcb *tp, u_int offer) DEBUG_ARG("tp = %p", tp); DEBUG_ARG("offer = %d", offer); - mss = min(IF_MTU, IF_MRU) - sizeof(struct tcpiphdr); + switch (so->so_ffamily) { + case AF_INET: + mss = min(IF_MTU, IF_MRU) - sizeof(struct tcphdr) + + sizeof(struct ip); + break; + case AF_INET6: + mss = min(IF_MTU, IF_MRU) - sizeof(struct tcphdr) + + sizeof(struct ip6); + break; + default: + g_assert_not_reached(); + } + if (offer) mss = min(mss, offer); mss = max(mss, 32); diff --git a/slirp/tcp_output.c b/slirp/tcp_output.c index 34e4d2e5d4..99b0a9b1cb 100644 --- a/slirp/tcp_output.c +++ b/slirp/tcp_output.c @@ -61,7 +61,9 @@ tcp_output(struct tcpcb *tp) register long len, win; int off, flags, error; register struct mbuf *m; - register struct tcpiphdr *ti; + register struct tcpiphdr *ti, tcpiph_save; + struct ip *ip; + struct ip6 *ip6; u_char opt[MAX_TCPOPTLEN]; unsigned optlen, hdrlen; int idle, sendalot; @@ -447,16 +449,45 @@ send: * the template, but need a way to checksum without them. */ m->m_len = hdrlen + len; /* XXX Needed? m_len should be correct */ + tcpiph_save = *mtod(m, struct tcpiphdr *); + + switch (so->so_ffamily) { + case AF_INET: + m->m_data += sizeof(struct tcpiphdr) - sizeof(struct tcphdr) + - sizeof(struct ip); + m->m_len -= sizeof(struct tcpiphdr) - sizeof(struct tcphdr) + - sizeof(struct ip); + ip = mtod(m, struct ip *); + + ip->ip_len = m->m_len; + ip->ip_dst = tcpiph_save.ti_dst; + ip->ip_src = tcpiph_save.ti_src; + ip->ip_p = tcpiph_save.ti_pr; + + ip->ip_ttl = IPDEFTTL; + ip->ip_tos = so->so_iptos; + error = ip_output(so, m); + break; + + case AF_INET6: + m->m_data += sizeof(struct tcpiphdr) - sizeof(struct tcphdr) + - sizeof(struct ip6); + m->m_len -= sizeof(struct tcpiphdr) - sizeof(struct tcphdr) + - sizeof(struct ip6); + ip6 = mtod(m, struct ip6 *); + + ip6->ip_pl = tcpiph_save.ti_len; + ip6->ip_dst = tcpiph_save.ti_dst6; + ip6->ip_src = tcpiph_save.ti_src6; + ip6->ip_nh = tcpiph_save.ti_nh6; + + error = ip6_output(so, m, 0); + break; + + default: + g_assert_not_reached(); + } - { - - ((struct ip *)ti)->ip_len = m->m_len; - - ((struct ip *)ti)->ip_ttl = IPDEFTTL; - ((struct ip *)ti)->ip_tos = so->so_iptos; - - error = ip_output(so, m); - } if (error) { out: return (error); diff --git a/slirp/tcp_subr.c b/slirp/tcp_subr.c index b1aa1f23f4..dbfd2c673b 100644 --- a/slirp/tcp_subr.c +++ b/slirp/tcp_subr.c @@ -76,13 +76,30 @@ tcp_template(struct tcpcb *tp) register struct tcpiphdr *n = &tp->t_template; n->ti_mbuf = NULL; - n->ti_x1 = 0; - n->ti_pr = IPPROTO_TCP; - n->ti_len = htons(sizeof (struct tcpiphdr) - sizeof (struct ip)); - n->ti_src = so->so_faddr; - n->ti_dst = so->so_laddr; - n->ti_sport = so->so_fport; - n->ti_dport = so->so_lport; + memset(&n->ti, 0, sizeof(n->ti)); + n->ti_x0 = 0; + switch (so->so_ffamily) { + case AF_INET: + n->ti_pr = IPPROTO_TCP; + n->ti_len = htons(sizeof(struct tcphdr)); + n->ti_src = so->so_faddr; + n->ti_dst = so->so_laddr; + n->ti_sport = so->so_fport; + n->ti_dport = so->so_lport; + break; + + case AF_INET6: + n->ti_nh6 = IPPROTO_TCP; + n->ti_len = htons(sizeof(struct tcphdr)); + n->ti_src6 = so->so_faddr6; + n->ti_dst6 = so->so_laddr6; + n->ti_sport = so->so_fport6; + n->ti_dport = so->so_lport6; + break; + + default: + g_assert_not_reached(); + } n->ti_seq = 0; n->ti_ack = 0; @@ -109,7 +126,7 @@ tcp_template(struct tcpcb *tp) */ void tcp_respond(struct tcpcb *tp, struct tcpiphdr *ti, struct mbuf *m, - tcp_seq ack, tcp_seq seq, int flags) + tcp_seq ack, tcp_seq seq, int flags, unsigned short af) { register int tlen; int win = 0; @@ -131,6 +148,7 @@ tcp_respond(struct tcpcb *tp, struct tcpiphdr *ti, struct mbuf *m, m->m_data += IF_MAXLINKHDR; *mtod(m, struct tcpiphdr *) = *ti; ti = mtod(m, struct tcpiphdr *); + memset(&ti->ti, 0, sizeof(ti->ti)); flags = TH_ACK; } else { /* @@ -142,16 +160,26 @@ tcp_respond(struct tcpcb *tp, struct tcpiphdr *ti, struct mbuf *m, m->m_len = sizeof (struct tcpiphdr); tlen = 0; #define xchg(a,b,type) { type t; t=a; a=b; b=t; } - xchg(ti->ti_dst.s_addr, ti->ti_src.s_addr, uint32_t); - xchg(ti->ti_dport, ti->ti_sport, uint16_t); + switch (af) { + case AF_INET: + xchg(ti->ti_dst.s_addr, ti->ti_src.s_addr, uint32_t); + xchg(ti->ti_dport, ti->ti_sport, uint16_t); + break; + case AF_INET6: + xchg(ti->ti_dst6, ti->ti_src6, struct in6_addr); + xchg(ti->ti_dport, ti->ti_sport, uint16_t); + break; + default: + g_assert_not_reached(); + } #undef xchg } ti->ti_len = htons((u_short)(sizeof (struct tcphdr) + tlen)); tlen += sizeof (struct tcpiphdr); m->m_len = tlen; - ti->ti_mbuf = NULL; - ti->ti_x1 = 0; + ti->ti_mbuf = NULL; + ti->ti_x0 = 0; ti->ti_seq = htonl(seq); ti->ti_ack = htonl(ack); ti->ti_x2 = 0; @@ -164,14 +192,49 @@ tcp_respond(struct tcpcb *tp, struct tcpiphdr *ti, struct mbuf *m, ti->ti_urp = 0; ti->ti_sum = 0; ti->ti_sum = cksum(m, tlen); - ((struct ip *)ti)->ip_len = tlen; - - if(flags & TH_RST) - ((struct ip *)ti)->ip_ttl = MAXTTL; - else - ((struct ip *)ti)->ip_ttl = IPDEFTTL; - (void) ip_output((struct socket *)0, m); + struct tcpiphdr tcpiph_save = *(mtod(m, struct tcpiphdr *)); + struct ip *ip; + struct ip6 *ip6; + + switch (af) { + case AF_INET: + m->m_data += sizeof(struct tcpiphdr) - sizeof(struct tcphdr) + - sizeof(struct ip); + m->m_len -= sizeof(struct tcpiphdr) - sizeof(struct tcphdr) + - sizeof(struct ip); + ip = mtod(m, struct ip *); + ip->ip_len = tlen; + ip->ip_dst = tcpiph_save.ti_dst; + ip->ip_src = tcpiph_save.ti_src; + ip->ip_p = tcpiph_save.ti_pr; + + if (flags & TH_RST) { + ip->ip_ttl = MAXTTL; + } else { + ip->ip_ttl = IPDEFTTL; + } + + ip_output(NULL, m); + break; + + case AF_INET6: + m->m_data += sizeof(struct tcpiphdr) - sizeof(struct tcphdr) + - sizeof(struct ip6); + m->m_len -= sizeof(struct tcpiphdr) - sizeof(struct tcphdr) + - sizeof(struct ip6); + ip6 = mtod(m, struct ip6 *); + ip6->ip_pl = tlen; + ip6->ip_dst = tcpiph_save.ti_dst6; + ip6->ip_src = tcpiph_save.ti_src6; + ip6->ip_nh = tcpiph_save.ti_nh6; + + ip6_output(NULL, m, 0); + break; + + default: + g_assert_not_reached(); + } } /* @@ -190,7 +253,7 @@ tcp_newtcpcb(struct socket *so) memset((char *) tp, 0, sizeof(struct tcpcb)); tp->seg_next = tp->seg_prev = (struct tcpiphdr*)tp; - tp->t_maxseg = TCP_MSS; + tp->t_maxseg = (so->so_ffamily == AF_INET) ? TCP_MSS : TCP6_MSS; tp->t_flags = TCP_DO_RFC1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0; tp->t_socket = so; @@ -375,8 +438,8 @@ void tcp_connect(struct socket *inso) { Slirp *slirp = inso->slirp; struct socket *so; - struct sockaddr_in addr; - socklen_t addrlen = sizeof(struct sockaddr_in); + struct sockaddr_storage addr; + socklen_t addrlen = sizeof(struct sockaddr_storage); struct tcpcb *tp; int s, opt; @@ -401,9 +464,8 @@ void tcp_connect(struct socket *inso) free(so); /* NOT sofree */ return; } - so->so_lfamily = AF_INET; - so->so_laddr = inso->so_laddr; - so->so_lport = inso->so_lport; + so->lhost = inso->lhost; + so->so_ffamily = inso->so_ffamily; } tcp_mss(sototcpcb(so), 0); @@ -419,7 +481,7 @@ void tcp_connect(struct socket *inso) qemu_setsockopt(s, SOL_SOCKET, SO_OOBINLINE, &opt, sizeof(int)); socket_set_nodelay(s); - so->fhost.sin = addr; + so->fhost.ss = addr; sotranslate_accept(so); /* Close the accept() socket, set right state */ diff --git a/slirp/tcp_timer.c b/slirp/tcp_timer.c index 1214c2e6fa..8f5dd772ad 100644 --- a/slirp/tcp_timer.c +++ b/slirp/tcp_timer.c @@ -278,7 +278,8 @@ tcp_timers(register struct tcpcb *tp, int timer) * correspondent TCP to respond. */ tcp_respond(tp, &tp->t_template, (struct mbuf *)NULL, - tp->rcv_nxt, tp->snd_una - 1, 0); + tp->rcv_nxt, tp->snd_una - 1, 0, + tp->t_socket->so_ffamily); tp->t_timer[TCPT_KEEP] = TCPTV_KEEPINTVL; } else tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_IDLE; diff --git a/slirp/tcpip.h b/slirp/tcpip.h index 7974ce3d52..124b4a9f62 100644 --- a/slirp/tcpip.h +++ b/slirp/tcpip.h @@ -37,15 +37,32 @@ * Tcp+ip header, after ip options removed. */ struct tcpiphdr { - struct ipovly ti_i; /* overlaid ip structure */ - struct tcphdr ti_t; /* tcp header */ + struct mbuf_ptr ih_mbuf; /* backpointer to mbuf */ + union { + struct { + struct in_addr ih_src; /* source internet address */ + struct in_addr ih_dst; /* destination internet address */ + uint8_t ih_x1; /* (unused) */ + uint8_t ih_pr; /* protocol */ + } ti_i4; + struct { + struct in6_addr ih_src; + struct in6_addr ih_dst; + uint8_t ih_x1; + uint8_t ih_nh; + } ti_i6; + } ti; + uint16_t ti_x0; + uint16_t ti_len; /* protocol length */ + struct tcphdr ti_t; /* tcp header */ }; -#define ti_mbuf ti_i.ih_mbuf.mptr -#define ti_x1 ti_i.ih_x1 -#define ti_pr ti_i.ih_pr -#define ti_len ti_i.ih_len -#define ti_src ti_i.ih_src -#define ti_dst ti_i.ih_dst +#define ti_mbuf ih_mbuf.mptr +#define ti_pr ti.ti_i4.ih_pr +#define ti_src ti.ti_i4.ih_src +#define ti_dst ti.ti_i4.ih_dst +#define ti_src6 ti.ti_i6.ih_src +#define ti_dst6 ti.ti_i6.ih_dst +#define ti_nh6 ti.ti_i6.ih_nh #define ti_sport ti_t.th_sport #define ti_dport ti_t.th_dport #define ti_seq ti_t.th_seq @@ -65,6 +82,13 @@ struct tcpiphdr { #define tcpfrag_list_end(F, T) (tcpiphdr2qlink(F) == (struct qlink*)(T)) #define tcpfrag_list_empty(T) ((T)->seg_next == (struct tcpiphdr*)(T)) +/* This is the difference between the size of a tcpiphdr structure, and the + * size of actual ip+tcp headers, rounded up since we need to align data. */ +#define TCPIPHDR_DELTA\ + (max(0,\ + (sizeof(struct tcpiphdr)\ + - sizeof(struct ip) - sizeof(struct tcphdr) + 3) & ~3)) + /* * Just a clean way to get to the first byte * of the packet diff --git a/slirp/tftp.c b/slirp/tftp.c index abb010621c..25ad6efdf8 100644 --- a/slirp/tftp.c +++ b/slirp/tftp.c @@ -46,7 +46,8 @@ static void tftp_session_terminate(struct tftp_session *spt) spt->slirp = NULL; } -static int tftp_session_allocate(Slirp *slirp, struct tftp_t *tp) +static int tftp_session_allocate(Slirp *slirp, struct sockaddr_storage *srcsas, + struct tftp_t *tp) { struct tftp_session *spt; int k; @@ -68,7 +69,7 @@ static int tftp_session_allocate(Slirp *slirp, struct tftp_t *tp) found: memset(spt, 0, sizeof(*spt)); - memcpy(&spt->client_ip, &tp->ip.ip_src, sizeof(spt->client_ip)); + spt->client_addr = *srcsas; spt->fd = -1; spt->client_port = tp->udp.uh_sport; spt->slirp = slirp; @@ -78,7 +79,8 @@ static int tftp_session_allocate(Slirp *slirp, struct tftp_t *tp) return k; } -static int tftp_session_find(Slirp *slirp, struct tftp_t *tp) +static int tftp_session_find(Slirp *slirp, struct sockaddr_storage *srcsas, + struct tftp_t *tp) { struct tftp_session *spt; int k; @@ -87,7 +89,7 @@ static int tftp_session_find(Slirp *slirp, struct tftp_t *tp) spt = &slirp->tftp_sessions[k]; if (tftp_session_in_use(spt)) { - if (!memcmp(&spt->client_ip, &tp->ip.ip_src, sizeof(spt->client_ip))) { + if (sockaddr_equal(&spt->client_addr, srcsas)) { if (spt->client_port == tp->udp.uh_sport) { return k; } @@ -120,11 +122,53 @@ static int tftp_read_data(struct tftp_session *spt, uint32_t block_nr, return bytes_read; } +static struct tftp_t *tftp_prep_mbuf_data(struct tftp_session *spt, + struct mbuf *m) +{ + struct tftp_t *tp; + + memset(m->m_data, 0, m->m_size); + + m->m_data += IF_MAXLINKHDR; + if (spt->client_addr.ss_family == AF_INET6) { + m->m_data += sizeof(struct ip6); + } else { + m->m_data += sizeof(struct ip); + } + tp = (void *)m->m_data; + m->m_data += sizeof(struct udphdr); + + return tp; +} + +static void tftp_udp_output(struct tftp_session *spt, struct mbuf *m, + struct tftp_t *recv_tp) +{ + if (spt->client_addr.ss_family == AF_INET6) { + struct sockaddr_in6 sa6, da6; + + sa6.sin6_addr = spt->slirp->vhost_addr6; + sa6.sin6_port = recv_tp->udp.uh_dport; + da6.sin6_addr = ((struct sockaddr_in6 *)&spt->client_addr)->sin6_addr; + da6.sin6_port = spt->client_port; + + udp6_output(NULL, m, &sa6, &da6); + } else { + struct sockaddr_in sa4, da4; + + sa4.sin_addr = spt->slirp->vhost_addr; + sa4.sin_port = recv_tp->udp.uh_dport; + da4.sin_addr = ((struct sockaddr_in *)&spt->client_addr)->sin_addr; + da4.sin_port = spt->client_port; + + udp_output(NULL, m, &sa4, &da4, IPTOS_LOWDELAY); + } +} + static int tftp_send_oack(struct tftp_session *spt, const char *keys[], uint32_t values[], int nb, struct tftp_t *recv_tp) { - struct sockaddr_in saddr, daddr; struct mbuf *m; struct tftp_t *tp; int i, n = 0; @@ -132,13 +176,9 @@ static int tftp_send_oack(struct tftp_session *spt, m = m_get(spt->slirp); if (!m) - return -1; - - memset(m->m_data, 0, m->m_size); + return -1; - m->m_data += IF_MAXLINKHDR; - tp = (void *)m->m_data; - m->m_data += sizeof(struct udpiphdr); + tp = tftp_prep_mbuf_data(spt, m); tp->tp_op = htons(TFTP_OACK); for (i = 0; i < nb; i++) { @@ -148,15 +188,8 @@ static int tftp_send_oack(struct tftp_session *spt, values[i]) + 1; } - saddr.sin_addr = recv_tp->ip.ip_dst; - saddr.sin_port = recv_tp->udp.uh_dport; - - daddr.sin_addr = spt->client_ip; - daddr.sin_port = spt->client_port; - - m->m_len = sizeof(struct tftp_t) - 514 + n - - sizeof(struct ip) - sizeof(struct udphdr); - udp_output(NULL, m, &saddr, &daddr, IPTOS_LOWDELAY); + m->m_len = sizeof(struct tftp_t) - 514 + n - sizeof(struct udphdr); + tftp_udp_output(spt, m, recv_tp); return 0; } @@ -165,7 +198,6 @@ static void tftp_send_error(struct tftp_session *spt, uint16_t errorcode, const char *msg, struct tftp_t *recv_tp) { - struct sockaddr_in saddr, daddr; struct mbuf *m; struct tftp_t *tp; @@ -177,24 +209,15 @@ static void tftp_send_error(struct tftp_session *spt, memset(m->m_data, 0, m->m_size); - m->m_data += IF_MAXLINKHDR; - tp = (void *)m->m_data; - m->m_data += sizeof(struct udpiphdr); + tp = tftp_prep_mbuf_data(spt, m); tp->tp_op = htons(TFTP_ERROR); tp->x.tp_error.tp_error_code = htons(errorcode); pstrcpy((char *)tp->x.tp_error.tp_msg, sizeof(tp->x.tp_error.tp_msg), msg); - saddr.sin_addr = recv_tp->ip.ip_dst; - saddr.sin_port = recv_tp->udp.uh_dport; - - daddr.sin_addr = spt->client_ip; - daddr.sin_port = spt->client_port; - - m->m_len = sizeof(struct tftp_t) - 514 + 3 + strlen(msg) - - sizeof(struct ip) - sizeof(struct udphdr); - - udp_output(NULL, m, &saddr, &daddr, IPTOS_LOWDELAY); + m->m_len = sizeof(struct tftp_t) - 514 + 3 + strlen(msg) + - sizeof(struct udphdr); + tftp_udp_output(spt, m, recv_tp); out: tftp_session_terminate(spt); @@ -203,7 +226,6 @@ out: static void tftp_send_next_block(struct tftp_session *spt, struct tftp_t *recv_tp) { - struct sockaddr_in saddr, daddr; struct mbuf *m; struct tftp_t *tp; int nobytes; @@ -216,19 +238,11 @@ static void tftp_send_next_block(struct tftp_session *spt, memset(m->m_data, 0, m->m_size); - m->m_data += IF_MAXLINKHDR; - tp = (void *)m->m_data; - m->m_data += sizeof(struct udpiphdr); + tp = tftp_prep_mbuf_data(spt, m); tp->tp_op = htons(TFTP_DATA); tp->x.tp_data.tp_block_nr = htons((spt->block_nr + 1) & 0xffff); - saddr.sin_addr = recv_tp->ip.ip_dst; - saddr.sin_port = recv_tp->udp.uh_dport; - - daddr.sin_addr = spt->client_ip; - daddr.sin_port = spt->client_port; - nobytes = tftp_read_data(spt, spt->block_nr, tp->x.tp_data.tp_buf, 512); if (nobytes < 0) { @@ -241,10 +255,8 @@ static void tftp_send_next_block(struct tftp_session *spt, return; } - m->m_len = sizeof(struct tftp_t) - (512 - nobytes) - - sizeof(struct ip) - sizeof(struct udphdr); - - udp_output(NULL, m, &saddr, &daddr, IPTOS_LOWDELAY); + m->m_len = sizeof(struct tftp_t) - (512 - nobytes) - sizeof(struct udphdr); + tftp_udp_output(spt, m, recv_tp); if (nobytes == 512) { tftp_session_update(spt); @@ -256,7 +268,8 @@ static void tftp_send_next_block(struct tftp_session *spt, spt->block_nr++; } -static void tftp_handle_rrq(Slirp *slirp, struct tftp_t *tp, int pktlen) +static void tftp_handle_rrq(Slirp *slirp, struct sockaddr_storage *srcsas, + struct tftp_t *tp, int pktlen) { struct tftp_session *spt; int s, k; @@ -267,12 +280,12 @@ static void tftp_handle_rrq(Slirp *slirp, struct tftp_t *tp, int pktlen) int nb_options = 0; /* check if a session already exists and if so terminate it */ - s = tftp_session_find(slirp, tp); + s = tftp_session_find(slirp, srcsas, tp); if (s >= 0) { tftp_session_terminate(&slirp->tftp_sessions[s]); } - s = tftp_session_allocate(slirp, tp); + s = tftp_session_allocate(slirp, srcsas, tp); if (s < 0) { return; @@ -397,11 +410,12 @@ static void tftp_handle_rrq(Slirp *slirp, struct tftp_t *tp, int pktlen) tftp_send_next_block(spt, tp); } -static void tftp_handle_ack(Slirp *slirp, struct tftp_t *tp, int pktlen) +static void tftp_handle_ack(Slirp *slirp, struct sockaddr_storage *srcsas, + struct tftp_t *tp, int pktlen) { int s; - s = tftp_session_find(slirp, tp); + s = tftp_session_find(slirp, srcsas, tp); if (s < 0) { return; @@ -410,11 +424,12 @@ static void tftp_handle_ack(Slirp *slirp, struct tftp_t *tp, int pktlen) tftp_send_next_block(&slirp->tftp_sessions[s], tp); } -static void tftp_handle_error(Slirp *slirp, struct tftp_t *tp, int pktlen) +static void tftp_handle_error(Slirp *slirp, struct sockaddr_storage *srcsas, + struct tftp_t *tp, int pktlen) { int s; - s = tftp_session_find(slirp, tp); + s = tftp_session_find(slirp, srcsas, tp); if (s < 0) { return; @@ -423,21 +438,21 @@ static void tftp_handle_error(Slirp *slirp, struct tftp_t *tp, int pktlen) tftp_session_terminate(&slirp->tftp_sessions[s]); } -void tftp_input(struct mbuf *m) +void tftp_input(struct sockaddr_storage *srcsas, struct mbuf *m) { struct tftp_t *tp = (struct tftp_t *)m->m_data; switch(ntohs(tp->tp_op)) { case TFTP_RRQ: - tftp_handle_rrq(m->slirp, tp, m->m_len); + tftp_handle_rrq(m->slirp, srcsas, tp, m->m_len); break; case TFTP_ACK: - tftp_handle_ack(m->slirp, tp, m->m_len); + tftp_handle_ack(m->slirp, srcsas, tp, m->m_len); break; case TFTP_ERROR: - tftp_handle_error(m->slirp, tp, m->m_len); + tftp_handle_error(m->slirp, srcsas, tp, m->m_len); break; } } diff --git a/slirp/tftp.h b/slirp/tftp.h index e1cc24b9bf..1cb1adf591 100644 --- a/slirp/tftp.h +++ b/slirp/tftp.h @@ -16,7 +16,6 @@ #define TFTP_FILENAME_MAX 512 struct tftp_t { - struct ip ip; struct udphdr udp; uint16_t tp_op; union { @@ -30,20 +29,20 @@ struct tftp_t { } tp_error; char tp_buf[512 + 2]; } x; -}; +} __attribute__((packed)); struct tftp_session { Slirp *slirp; char *filename; int fd; - struct in_addr client_ip; + struct sockaddr_storage client_addr; uint16_t client_port; uint32_t block_nr; int timestamp; }; -void tftp_input(struct mbuf *m); +void tftp_input(struct sockaddr_storage *srcsas, struct mbuf *m); #endif diff --git a/slirp/udp.c b/slirp/udp.c index 6b39cab0c6..247024fd86 100644 --- a/slirp/udp.c +++ b/slirp/udp.c @@ -128,6 +128,11 @@ udp_input(register struct mbuf *m, int iphlen) } } + lhost.ss_family = AF_INET; + lhost4 = (struct sockaddr_in *) &lhost; + lhost4->sin_addr = ip->ip_src; + lhost4->sin_port = uh->uh_sport; + /* * handle DHCP/BOOTP */ @@ -143,7 +148,11 @@ udp_input(register struct mbuf *m, int iphlen) */ if (ntohs(uh->uh_dport) == TFTP_SERVER && ip->ip_dst.s_addr == slirp->vhost_addr.s_addr) { - tftp_input(m); + m->m_data += iphlen; + m->m_len -= iphlen; + tftp_input(&lhost, m); + m->m_data -= iphlen; + m->m_len += iphlen; goto bad; } @@ -154,11 +163,6 @@ udp_input(register struct mbuf *m, int iphlen) /* * Locate pcb for datagram. */ - lhost.ss_family = AF_INET; - lhost4 = (struct sockaddr_in *) &lhost; - lhost4->sin_addr = ip->ip_src; - lhost4->sin_port = uh->uh_sport; - so = solookup(&slirp->udp_last_so, &slirp->udb, &lhost, NULL); if (so == NULL) { @@ -209,7 +213,8 @@ udp_input(register struct mbuf *m, int iphlen) m->m_data -= iphlen; *ip=save_ip; DEBUG_MISC((dfd,"udp tx errno = %d-%s\n",errno,strerror(errno))); - icmp_error(m, ICMP_UNREACH,ICMP_UNREACH_NET, 0,strerror(errno)); + icmp_send_error(m, ICMP_UNREACH, ICMP_UNREACH_NET, 0, + strerror(errno)); goto bad; } diff --git a/slirp/udp.h b/slirp/udp.h index 2f9de3886c..10cc7809b1 100644 --- a/slirp/udp.h +++ b/slirp/udp.h @@ -83,4 +83,9 @@ struct socket * udp_listen(Slirp *, uint32_t, u_int, uint32_t, u_int, int udp_output(struct socket *so, struct mbuf *m, struct sockaddr_in *saddr, struct sockaddr_in *daddr, int iptos); + +void udp6_input(register struct mbuf *); +int udp6_output(struct socket *so, struct mbuf *m, + struct sockaddr_in6 *saddr, struct sockaddr_in6 *daddr); + #endif diff --git a/slirp/udp6.c b/slirp/udp6.c new file mode 100644 index 0000000000..60a91c9532 --- /dev/null +++ b/slirp/udp6.c @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2013 + * Guillaume Subiron + */ + +#include "qemu/osdep.h" +#include "qemu-common.h" +#include "slirp.h" +#include "qemu/osdep.h" +#include "udp.h" + +void udp6_input(struct mbuf *m) +{ + Slirp *slirp = m->slirp; + struct ip6 *ip, save_ip; + struct udphdr *uh; + int iphlen = sizeof(struct ip6); + int len; + struct socket *so; + struct sockaddr_in6 lhost; + + DEBUG_CALL("udp6_input"); + DEBUG_ARG("m = %lx", (long)m); + + if (slirp->restricted) { + goto bad; + } + + ip = mtod(m, struct ip6 *); + m->m_len -= iphlen; + m->m_data += iphlen; + uh = mtod(m, struct udphdr *); + m->m_len += iphlen; + m->m_data -= iphlen; + + if (ip6_cksum(m)) { + goto bad; + } + + len = ntohs((uint16_t)uh->uh_ulen); + + /* + * Make mbuf data length reflect UDP length. + * If not enough data to reflect UDP length, drop. + */ + if (ntohs(ip->ip_pl) != len) { + if (len > ntohs(ip->ip_pl)) { + goto bad; + } + m_adj(m, len - ntohs(ip->ip_pl)); + ip->ip_pl = htons(len); + } + + /* + * Save a copy of the IP header in case we want restore it + * for sending an ICMP error message in response. + */ + save_ip = *ip; + + /* Locate pcb for datagram. */ + lhost.sin6_family = AF_INET6; + lhost.sin6_addr = ip->ip_src; + lhost.sin6_port = uh->uh_sport; + + /* TODO handle DHCP/BOOTP */ + + /* handle TFTP */ + if (ntohs(uh->uh_dport) == TFTP_SERVER && + !memcmp(ip->ip_dst.s6_addr, slirp->vhost_addr6.s6_addr, 16)) { + m->m_data += iphlen; + m->m_len -= iphlen; + tftp_input((struct sockaddr_storage *)&lhost, m); + m->m_data -= iphlen; + m->m_len += iphlen; + goto bad; + } + + so = solookup(&slirp->udp_last_so, &slirp->udb, + (struct sockaddr_storage *) &lhost, NULL); + + if (so == NULL) { + /* If there's no socket for this packet, create one. */ + so = socreate(slirp); + if (!so) { + goto bad; + } + if (udp_attach(so, AF_INET6) == -1) { + DEBUG_MISC((dfd, " udp6_attach errno = %d-%s\n", + errno, strerror(errno))); + sofree(so); + goto bad; + } + + /* Setup fields */ + so->so_lfamily = AF_INET6; + so->so_laddr6 = ip->ip_src; + so->so_lport6 = uh->uh_sport; + } + + so->so_ffamily = AF_INET6; + so->so_faddr6 = ip->ip_dst; /* XXX */ + so->so_fport6 = uh->uh_dport; /* XXX */ + + iphlen += sizeof(struct udphdr); + m->m_len -= iphlen; + m->m_data += iphlen; + + /* + * Now we sendto() the packet. + */ + if (sosendto(so, m) == -1) { + m->m_len += iphlen; + m->m_data -= iphlen; + *ip = save_ip; + DEBUG_MISC((dfd, "udp tx errno = %d-%s\n", errno, strerror(errno))); + /* TODO: ICMPv6 error */ + /*icmp_error(m, ICMP_UNREACH,ICMP_UNREACH_NET, 0,strerror(errno));*/ + goto bad; + } + + m_free(so->so_m); /* used for ICMP if error on sorecvfrom */ + + /* restore the orig mbuf packet */ + m->m_len += iphlen; + m->m_data -= iphlen; + *ip = save_ip; + so->so_m = m; + + return; +bad: + m_free(m); +} + +int udp6_output(struct socket *so, struct mbuf *m, + struct sockaddr_in6 *saddr, struct sockaddr_in6 *daddr) +{ + struct ip6 *ip; + struct udphdr *uh; + + DEBUG_CALL("udp6_output"); + DEBUG_ARG("so = %lx", (long)so); + DEBUG_ARG("m = %lx", (long)m); + + /* adjust for header */ + m->m_data -= sizeof(struct udphdr); + m->m_len += sizeof(struct udphdr); + uh = mtod(m, struct udphdr *); + m->m_data -= sizeof(struct ip6); + m->m_len += sizeof(struct ip6); + ip = mtod(m, struct ip6 *); + + /* Build IP header */ + ip->ip_pl = htons(m->m_len - sizeof(struct ip6)); + ip->ip_nh = IPPROTO_UDP; + ip->ip_src = saddr->sin6_addr; + ip->ip_dst = daddr->sin6_addr; + + /* Build UDP header */ + uh->uh_sport = saddr->sin6_port; + uh->uh_dport = daddr->sin6_port; + uh->uh_ulen = ip->ip_pl; + uh->uh_sum = 0; + uh->uh_sum = ip6_cksum(m); + if (uh->uh_sum == 0) { + uh->uh_sum = 0xffff; + } + + return ip6_output(so, m, 0); +} diff --git a/stubs/clock-warp.c b/stubs/clock-warp.c index 5ae32b9e6a..8acb58a775 100644 --- a/stubs/clock-warp.c +++ b/stubs/clock-warp.c @@ -2,7 +2,7 @@ #include "qemu-common.h" #include "qemu/timer.h" -void qemu_clock_warp(QEMUClockType type) +void qemu_start_warp_timer(void) { } diff --git a/stubs/replay.c b/stubs/replay.c index 00ca01f55a..2f1a6dc62e 100644 --- a/stubs/replay.c +++ b/stubs/replay.c @@ -29,3 +29,37 @@ bool replay_events_enabled(void) void replay_finish(void) { } + +void replay_register_char_driver(CharDriverState *chr) +{ +} + +void replay_chr_be_write(CharDriverState *s, uint8_t *buf, int len) +{ + abort(); +} + +void replay_char_write_event_save(int res, int offset) +{ + abort(); +} + +void replay_char_write_event_load(int *res, int *offset) +{ + abort(); +} + +int replay_char_read_all_load(uint8_t *buf) +{ + abort(); +} + +void replay_char_read_all_save_error(int res) +{ + abort(); +} + +void replay_char_read_all_save_buf(uint8_t *buf, int offset) +{ + abort(); +} diff --git a/target-i386/cpu.c b/target-i386/cpu.c index 0f38d1eae3..3ea6b294a4 100644 --- a/target-i386/cpu.c +++ b/target-i386/cpu.c @@ -2132,6 +2132,10 @@ static void x86_cpu_load_def(X86CPU *cpu, X86CPUDefinition *def, Error **errp) /* Special cases not set in the X86CPUDefinition structs: */ if (kvm_enabled()) { + if (!kvm_irqchip_in_kernel()) { + x86_cpu_change_kvm_default("x2apic", "off"); + } + x86_cpu_apply_props(cpu, kvm_default_props); } diff --git a/target-i386/kvm.c b/target-i386/kvm.c index 7974acb399..08d6444741 100644 --- a/target-i386/kvm.c +++ b/target-i386/kvm.c @@ -639,6 +639,7 @@ int kvm_arch_init_vcpu(CPUState *cs) if (cpu->hyperv_crash && has_msr_hv_crash) { c->edx |= HV_X64_GUEST_CRASH_MSR_AVAILABLE; } + c->edx |= HV_X64_CPU_DYNAMIC_PARTITIONING_AVAILABLE; if (cpu->hyperv_reset && has_msr_hv_reset) { c->eax |= HV_X64_MSR_RESET_AVAILABLE; } diff --git a/target-i386/translate.c b/target-i386/translate.c index 53dee79afd..dd8d5cc360 100644 --- a/target-i386/translate.c +++ b/target-i386/translate.c @@ -57,11 +57,17 @@ #endif /* For a switch indexed by MODRM, match all memory operands for a given OP. */ -#define CASE_MEM_OP(OP) \ +#define CASE_MODRM_MEM_OP(OP) \ case (0 << 6) | (OP << 3) | 0 ... (0 << 6) | (OP << 3) | 7: \ case (1 << 6) | (OP << 3) | 0 ... (1 << 6) | (OP << 3) | 7: \ case (2 << 6) | (OP << 3) | 0 ... (2 << 6) | (OP << 3) | 7 +#define CASE_MODRM_OP(OP) \ + case (0 << 6) | (OP << 3) | 0 ... (0 << 6) | (OP << 3) | 7: \ + case (1 << 6) | (OP << 3) | 0 ... (1 << 6) | (OP << 3) | 7: \ + case (2 << 6) | (OP << 3) | 0 ... (2 << 6) | (OP << 3) | 7: \ + case (3 << 6) | (OP << 3) | 0 ... (3 << 6) | (OP << 3) | 7 + //#define MACRO_TEST 1 /* global register indexes */ @@ -93,6 +99,7 @@ typedef struct DisasContext { int prefix; TCGMemOp aflag; TCGMemOp dflag; + target_ulong pc_start; target_ulong pc; /* pc = eip + cs_base */ int is_jmp; /* 1 = means jump (stop translation), 2 means CPU static state change (stop translation) */ @@ -460,15 +467,15 @@ static void gen_lea_v_seg(DisasContext *s, TCGMemOp aflag, TCGv a0, break; case MO_16: /* 16 bit address */ - if (ovr_seg < 0) { - ovr_seg = def_seg; - } tcg_gen_ext16u_tl(cpu_A0, a0); - /* ADDSEG will only be false in 16-bit mode for LEA. */ - if (!s->addseg) { - return; - } a0 = cpu_A0; + if (ovr_seg < 0) { + if (s->addseg) { + ovr_seg = def_seg; + } else { + return; + } + } break; default: tcg_abort(); @@ -2362,6 +2369,30 @@ static void gen_exception(DisasContext *s, int trapno, target_ulong cur_eip) s->is_jmp = DISAS_TB_JUMP; } +/* Generate #UD for the current instruction. The assumption here is that + the instruction is known, but it isn't allowed in the current cpu mode. */ +static void gen_illegal_opcode(DisasContext *s) +{ + gen_exception(s, EXCP06_ILLOP, s->pc_start - s->cs_base); +} + +/* Similarly, except that the assumption here is that we don't decode + the instruction at all -- either a missing opcode, an unimplemented + feature, or just a bogus instruction stream. */ +static void gen_unknown_opcode(CPUX86State *env, DisasContext *s) +{ + gen_illegal_opcode(s); + + if (qemu_loglevel_mask(LOG_UNIMP)) { + target_ulong pc = s->pc_start, end = s->pc; + qemu_log("ILLOPC: " TARGET_FMT_lx ":", pc); + for (; pc < end; ++pc) { + qemu_log(" %02x", cpu_ldub_code(env, pc)); + } + qemu_log("\n"); + } +} + /* an interrupt is different from an exception because of the privilege checks */ static void gen_interrupt(DisasContext *s, int intno, @@ -2409,22 +2440,29 @@ static void gen_reset_hflag(DisasContext *s, uint32_t mask) /* Clear BND registers during legacy branches. */ static void gen_bnd_jmp(DisasContext *s) { - /* Do nothing if BND prefix present, MPX is disabled, or if the - BNDREGs are known to be in INIT state already. The helper - itself will check BNDPRESERVE at runtime. */ + /* Clear the registers only if BND prefix is missing, MPX is enabled, + and if the BNDREGs are known to be in use (non-zero) already. + The helper itself will check BNDPRESERVE at runtime. */ if ((s->prefix & PREFIX_REPNZ) == 0 - && (s->flags & HF_MPX_EN_MASK) == 0 - && (s->flags & HF_MPX_IU_MASK) == 0) { + && (s->flags & HF_MPX_EN_MASK) != 0 + && (s->flags & HF_MPX_IU_MASK) != 0) { gen_helper_bnd_jmp(cpu_env); } } -/* generate a generic end of block. Trace exception is also generated - if needed */ -static void gen_eob(DisasContext *s) +/* Generate an end of block. Trace exception is also generated if needed. + If IIM, set HF_INHIBIT_IRQ_MASK if it isn't already set. */ +static void gen_eob_inhibit_irq(DisasContext *s, bool inhibit) { gen_update_cc_op(s); - gen_reset_hflag(s, HF_INHIBIT_IRQ_MASK); + + /* If several instructions disable interrupts, only the first does it. */ + if (inhibit && !(s->flags & HF_INHIBIT_IRQ_MASK)) { + gen_set_hflag(s, HF_INHIBIT_IRQ_MASK); + } else { + gen_reset_hflag(s, HF_INHIBIT_IRQ_MASK); + } + if (s->tb->flags & HF_RF_MASK) { gen_helper_reset_rf(cpu_env); } @@ -2438,6 +2476,12 @@ static void gen_eob(DisasContext *s) s->is_jmp = DISAS_TB_JUMP; } +/* End of block, resetting the inhibit irq flag. */ +static void gen_eob(DisasContext *s) +{ + gen_eob_inhibit_irq(s, false); +} + /* generate a jump to eip. No segment change must happen before as a direct call to the next block may occur */ static void gen_jmp_tb(DisasContext *s, target_ulong eip, int tb_num) @@ -2868,7 +2912,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, b1 = 0; sse_fn_epp = sse_op_table1[b][b1]; if (!sse_fn_epp) { - goto illegal_op; + goto unknown_op; } if ((b <= 0x5f && b >= 0x10) || b == 0xc6 || b == 0xc2) { is_xmm = 1; @@ -2887,15 +2931,19 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, } if (s->flags & HF_EM_MASK) { illegal_op: - gen_exception(s, EXCP06_ILLOP, pc_start - s->cs_base); + gen_illegal_opcode(s); return; } - if (is_xmm && !(s->flags & HF_OSFXSR_MASK)) - if ((b != 0x38 && b != 0x3a) || (s->prefix & PREFIX_DATA)) - goto illegal_op; + if (is_xmm + && !(s->flags & HF_OSFXSR_MASK) + && ((b != 0x38 && b != 0x3a) || (s->prefix & PREFIX_DATA))) { + goto unknown_op; + } if (b == 0x0e) { - if (!(s->cpuid_ext2_features & CPUID_EXT2_3DNOW)) - goto illegal_op; + if (!(s->cpuid_ext2_features & CPUID_EXT2_3DNOW)) { + /* If we were fully decoding this we might use illegal_op. */ + goto unknown_op; + } /* femms */ gen_helper_emms(cpu_env); return; @@ -2920,8 +2968,9 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, b |= (b1 << 8); switch(b) { case 0x0e7: /* movntq */ - if (mod == 3) + if (mod == 3) { goto illegal_op; + } gen_lea_modrm(env, s, modrm); gen_stq_env_A0(s, offsetof(CPUX86State, fpregs[reg].mmx)); break; @@ -3247,7 +3296,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, case 0x172: case 0x173: if (b1 >= 2) { - goto illegal_op; + goto unknown_op; } val = cpu_ldub_code(env, s->pc++); if (is_xmm) { @@ -3266,7 +3315,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, sse_fn_epp = sse_op_table2[((b - 1) & 3) * 8 + (((modrm >> 3)) & 7)][b1]; if (!sse_fn_epp) { - goto illegal_op; + goto unknown_op; } if (is_xmm) { rm = (modrm & 7) | REX_B(s); @@ -3490,12 +3539,12 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, reg = ((modrm >> 3) & 7) | rex_r; mod = (modrm >> 6) & 3; if (b1 >= 2) { - goto illegal_op; + goto unknown_op; } sse_fn_epp = sse_op_table6[b].op[b1]; if (!sse_fn_epp) { - goto illegal_op; + goto unknown_op; } if (!(s->cpuid_ext_features & sse_op_table6[b].ext_mask)) goto illegal_op; @@ -3545,7 +3594,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, } } if (sse_fn_epp == SSE_SPECIAL) { - goto illegal_op; + goto unknown_op; } tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op1_offset); @@ -3913,12 +3962,12 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, break; default: - goto illegal_op; + goto unknown_op; } break; default: - goto illegal_op; + goto unknown_op; } break; @@ -3930,12 +3979,12 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, reg = ((modrm >> 3) & 7) | rex_r; mod = (modrm >> 6) & 3; if (b1 >= 2) { - goto illegal_op; + goto unknown_op; } sse_fn_eppi = sse_op_table7[b].op[b1]; if (!sse_fn_eppi) { - goto illegal_op; + goto unknown_op; } if (!(s->cpuid_ext_features & sse_op_table7[b].ext_mask)) goto illegal_op; @@ -4137,12 +4186,14 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, break; default: - goto illegal_op; + goto unknown_op; } break; default: - goto illegal_op; + unknown_op: + gen_unknown_opcode(env, s); + return; } } else { /* generic MMX or SSE operation */ @@ -4218,11 +4269,12 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, } switch(b) { case 0x0f: /* 3DNow! data insns */ - if (!(s->cpuid_ext2_features & CPUID_EXT2_3DNOW)) - goto illegal_op; val = cpu_ldub_code(env, s->pc++); sse_fn_epp = sse_op_table5[val]; if (!sse_fn_epp) { + goto unknown_op; + } + if (!(s->cpuid_ext2_features & CPUID_EXT2_3DNOW)) { goto illegal_op; } tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op1_offset); @@ -4242,7 +4294,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, /* compare insns */ val = cpu_ldub_code(env, s->pc++); if (val >= 8) - goto illegal_op; + goto unknown_op; sse_fn_epp = sse_op_table4[val][b1]; tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op1_offset); @@ -4287,7 +4339,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, target_ulong next_eip, tval; int rex_w, rex_r; - s->pc = pc_start; + s->pc_start = s->pc = pc_start; prefixes = 0; s->override = -1; rex_w = -1; @@ -4400,7 +4452,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, b = 0x13a; break; default: /* Reserved for future use. */ - goto illegal_op; + goto unknown_op; } } s->vex_v = (~vex3 >> 3) & 0xf; @@ -4750,7 +4802,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, } break; default: - goto illegal_op; + goto unknown_op; } break; @@ -4763,7 +4815,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, rm = (modrm & 7) | REX_B(s); op = (modrm >> 3) & 7; if (op >= 2 && b == 0xfe) { - goto illegal_op; + goto unknown_op; } if (CODE64(s)) { if (op == 2 || op == 4) { @@ -4856,7 +4908,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, gen_push_v(s, cpu_T0); break; default: - goto illegal_op; + goto unknown_op; } break; @@ -5171,16 +5223,15 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, ot = gen_pop_T0(s); gen_movl_seg_T0(s, reg); gen_pop_update(s, ot); - if (reg == R_SS) { - /* if reg == SS, inhibit interrupts/trace. */ - /* If several instructions disable interrupts, only the - _first_ does it */ - gen_set_hflag(s, HF_INHIBIT_IRQ_MASK); - s->tf = 0; - } + /* Note that reg == R_SS in gen_movl_seg_T0 always sets is_jmp. */ if (s->is_jmp) { gen_jmp_im(s->pc - s->cs_base); - gen_eob(s); + if (reg == R_SS) { + s->tf = 0; + gen_eob_inhibit_irq(s, true); + } else { + gen_eob(s); + } } break; case 0x1a1: /* pop fs */ @@ -5238,16 +5289,15 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, goto illegal_op; gen_ldst_modrm(env, s, modrm, MO_16, OR_TMP0, 0); gen_movl_seg_T0(s, reg); - if (reg == R_SS) { - /* if reg == SS, inhibit interrupts/trace */ - /* If several instructions disable interrupts, only the - _first_ does it */ - gen_set_hflag(s, HF_INHIBIT_IRQ_MASK); - s->tf = 0; - } + /* Note that reg == R_SS in gen_movl_seg_T0 always sets is_jmp. */ if (s->is_jmp) { gen_jmp_im(s->pc - s->cs_base); - gen_eob(s); + if (reg == R_SS) { + s->tf = 0; + gen_eob_inhibit_irq(s, true); + } else { + gen_eob(s); + } } break; case 0x8c: /* mov Gv, seg */ @@ -5727,7 +5777,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, gen_helper_fpop(cpu_env); break; default: - goto illegal_op; + goto unknown_op; } } else { /* register float ops */ @@ -5751,7 +5801,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, gen_helper_fwait(cpu_env); break; default: - goto illegal_op; + goto unknown_op; } break; case 0x0c: /* grp d9/4 */ @@ -5770,7 +5820,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, gen_helper_fxam_ST0(cpu_env); break; default: - goto illegal_op; + goto unknown_op; } break; case 0x0d: /* grp d9/5 */ @@ -5805,7 +5855,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, gen_helper_fldz_ST0(cpu_env); break; default: - goto illegal_op; + goto unknown_op; } } break; @@ -5905,7 +5955,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, gen_helper_fpop(cpu_env); break; default: - goto illegal_op; + goto unknown_op; } break; case 0x1c: @@ -5923,7 +5973,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, case 4: /* fsetpm (287 only, just do nop here) */ break; default: - goto illegal_op; + goto unknown_op; } break; case 0x1d: /* fucomi */ @@ -5975,7 +6025,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, gen_helper_fpop(cpu_env); break; default: - goto illegal_op; + goto unknown_op; } break; case 0x38: /* ffreep sti, undocumented op */ @@ -5990,7 +6040,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, gen_op_mov_reg_v(MO_16, R_EAX, cpu_T0); break; default: - goto illegal_op; + goto unknown_op; } break; case 0x3d: /* fucomip */ @@ -6036,7 +6086,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, } break; default: - goto illegal_op; + goto unknown_op; } } break; @@ -6507,7 +6557,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, val = cpu_ldub_code(env, s->pc++); tcg_gen_movi_tl(cpu_T1, val); if (op < 4) - goto illegal_op; + goto unknown_op; op -= 4; goto bt_op; case 0x1a3: /* bt Gv, Ev */ @@ -6773,26 +6823,13 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, } break; case 0xfb: /* sti */ - if (!s->vm86) { - if (s->cpl <= s->iopl) { - gen_sti: - gen_helper_sti(cpu_env); - /* interruptions are enabled only the first insn after sti */ - /* If several instructions disable interrupts, only the - _first_ does it */ - gen_set_hflag(s, HF_INHIBIT_IRQ_MASK); - /* give a chance to handle pending irqs */ - gen_jmp_im(s->pc - s->cs_base); - gen_eob(s); - } else { - gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base); - } + if (s->vm86 ? s->iopl == 3 : s->cpl <= s->iopl) { + gen_helper_sti(cpu_env); + /* interruptions are enabled only the first insn after sti */ + gen_jmp_im(s->pc - s->cs_base); + gen_eob_inhibit_irq(s, true); } else { - if (s->iopl == 3) { - goto gen_sti; - } else { - gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base); - } + gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base); } break; case 0x62: /* bound */ @@ -7031,14 +7068,14 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, set_cc_op(s, CC_OP_EFLAGS); break; default: - goto illegal_op; + goto unknown_op; } break; case 0x101: modrm = cpu_ldub_code(env, s->pc++); switch (modrm) { - CASE_MEM_OP(0): /* sgdt */ + CASE_MODRM_MEM_OP(0): /* sgdt */ gen_svm_check_intercept(s, pc_start, SVM_EXIT_GDTR_READ); gen_lea_modrm(env, s, modrm); tcg_gen_ld32u_tl(cpu_T0, @@ -7094,7 +7131,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, gen_eob(s); break; - CASE_MEM_OP(1): /* sidt */ + CASE_MODRM_MEM_OP(1): /* sidt */ gen_svm_check_intercept(s, pc_start, SVM_EXIT_IDTR_READ); gen_lea_modrm(env, s, modrm); tcg_gen_ld32u_tl(cpu_T0, cpu_env, offsetof(CPUX86State, idt.limit)); @@ -7240,7 +7277,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, gen_helper_invlpga(cpu_env, tcg_const_i32(s->aflag - 1)); break; - CASE_MEM_OP(2): /* lgdt */ + CASE_MODRM_MEM_OP(2): /* lgdt */ if (s->cpl != 0) { gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base); break; @@ -7257,7 +7294,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, tcg_gen_st32_tl(cpu_T1, cpu_env, offsetof(CPUX86State, gdt.limit)); break; - CASE_MEM_OP(3): /* lidt */ + CASE_MODRM_MEM_OP(3): /* lidt */ if (s->cpl != 0) { gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base); break; @@ -7274,17 +7311,19 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, tcg_gen_st32_tl(cpu_T1, cpu_env, offsetof(CPUX86State, idt.limit)); break; - CASE_MEM_OP(4): /* smsw */ + CASE_MODRM_OP(4): /* smsw */ gen_svm_check_intercept(s, pc_start, SVM_EXIT_READ_CR0); -#if defined TARGET_X86_64 && defined HOST_WORDS_BIGENDIAN - tcg_gen_ld32u_tl(cpu_T0, cpu_env, offsetof(CPUX86State, cr[0]) + 4); -#else - tcg_gen_ld32u_tl(cpu_T0, cpu_env, offsetof(CPUX86State, cr[0])); -#endif - gen_ldst_modrm(env, s, modrm, MO_16, OR_TMP0, 1); + tcg_gen_ld_tl(cpu_T0, cpu_env, offsetof(CPUX86State, cr[0])); + if (CODE64(s)) { + mod = (modrm >> 6) & 3; + ot = (mod != 3 ? MO_16 : s->dflag); + } else { + ot = MO_16; + } + gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 1); break; - CASE_MEM_OP(6): /* lmsw */ + CASE_MODRM_OP(6): /* lmsw */ if (s->cpl != 0) { gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base); break; @@ -7296,7 +7335,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, gen_eob(s); break; - CASE_MEM_OP(7): /* invlpg */ + CASE_MODRM_MEM_OP(7): /* invlpg */ if (s->cpl != 0) { gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base); break; @@ -7343,7 +7382,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, break; default: - goto illegal_op; + goto unknown_op; } break; @@ -7467,7 +7506,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, case 3: /* prefetchnt0 */ if (mod == 3) goto illegal_op; - gen_lea_modrm(env, s, modrm); + gen_nop_modrm(env, s, modrm); /* nothing more to do */ break; default: /* nop (multi byte) */ @@ -7712,7 +7751,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, } break; default: - goto illegal_op; + goto unknown_op; } } break; @@ -7778,7 +7817,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, case 0x1ae: modrm = cpu_ldub_code(env, s->pc++); switch (modrm) { - CASE_MEM_OP(0): /* fxsave */ + CASE_MODRM_MEM_OP(0): /* fxsave */ if (!(s->cpuid_features & CPUID_FXSR) || (prefixes & PREFIX_LOCK)) { goto illegal_op; @@ -7791,7 +7830,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, gen_helper_fxsave(cpu_env, cpu_A0); break; - CASE_MEM_OP(1): /* fxrstor */ + CASE_MODRM_MEM_OP(1): /* fxrstor */ if (!(s->cpuid_features & CPUID_FXSR) || (prefixes & PREFIX_LOCK)) { goto illegal_op; @@ -7804,7 +7843,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, gen_helper_fxrstor(cpu_env, cpu_A0); break; - CASE_MEM_OP(2): /* ldmxcsr */ + CASE_MODRM_MEM_OP(2): /* ldmxcsr */ if ((s->flags & HF_EM_MASK) || !(s->flags & HF_OSFXSR_MASK)) { goto illegal_op; } @@ -7817,7 +7856,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, gen_helper_ldmxcsr(cpu_env, cpu_tmp2_i32); break; - CASE_MEM_OP(3): /* stmxcsr */ + CASE_MODRM_MEM_OP(3): /* stmxcsr */ if ((s->flags & HF_EM_MASK) || !(s->flags & HF_OSFXSR_MASK)) { goto illegal_op; } @@ -7830,7 +7869,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, gen_op_st_v(s, MO_32, cpu_T0, cpu_A0); break; - CASE_MEM_OP(4): /* xsave */ + CASE_MODRM_MEM_OP(4): /* xsave */ if ((s->cpuid_ext_features & CPUID_EXT_XSAVE) == 0 || (prefixes & (PREFIX_LOCK | PREFIX_DATA | PREFIX_REPZ | PREFIX_REPNZ))) { @@ -7842,7 +7881,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, gen_helper_xsave(cpu_env, cpu_A0, cpu_tmp1_i64); break; - CASE_MEM_OP(5): /* xrstor */ + CASE_MODRM_MEM_OP(5): /* xrstor */ if ((s->cpuid_ext_features & CPUID_EXT_XSAVE) == 0 || (prefixes & (PREFIX_LOCK | PREFIX_DATA | PREFIX_REPZ | PREFIX_REPNZ))) { @@ -7859,7 +7898,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, gen_eob(s); break; - CASE_MEM_OP(6): /* xsaveopt / clwb */ + CASE_MODRM_MEM_OP(6): /* xsaveopt / clwb */ if (prefixes & PREFIX_LOCK) { goto illegal_op; } @@ -7883,7 +7922,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, } break; - CASE_MEM_OP(7): /* clflush / clflushopt */ + CASE_MODRM_MEM_OP(7): /* clflush / clflushopt */ if (prefixes & PREFIX_LOCK) { goto illegal_op; } @@ -7934,7 +7973,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, } break; } - goto illegal_op; + goto unknown_op; case 0xf8: /* sfence / pcommit */ if (prefixes & PREFIX_DATA) { @@ -7956,7 +7995,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, break; default: - goto illegal_op; + goto unknown_op; } break; @@ -7965,8 +8004,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, mod = (modrm >> 6) & 3; if (mod == 3) goto illegal_op; - gen_lea_modrm(env, s, modrm); - /* ignore for now */ + gen_nop_modrm(env, s, modrm); break; case 0x1aa: /* rsm */ gen_svm_check_intercept(s, pc_start, SVM_EXIT_RSM); @@ -8013,7 +8051,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, gen_sse(env, s, b, pc_start, rex_r); break; default: - goto illegal_op; + goto unknown_op; } /* lock generation */ if (s->prefix & PREFIX_LOCK) @@ -8023,7 +8061,13 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, if (s->prefix & PREFIX_LOCK) gen_helper_unlock(); /* XXX: ensure that no lock was generated */ - gen_exception(s, EXCP06_ILLOP, pc_start - s->cs_base); + gen_illegal_opcode(s); + return s->pc; + unknown_op: + if (s->prefix & PREFIX_LOCK) + gen_helper_unlock(); + /* XXX: ensure that no lock was generated */ + gen_unknown_opcode(env, s); return s->pc; } diff --git a/target-s390x/cpu-qom.h b/target-s390x/cpu-qom.h index 029a44af24..1c90933965 100644 --- a/target-s390x/cpu-qom.h +++ b/target-s390x/cpu-qom.h @@ -47,6 +47,8 @@ typedef struct S390CPUClass { CPUClass parent_class; /*< public >*/ + int64_t next_cpu_id; + DeviceRealize parent_realize; void (*parent_reset)(CPUState *cpu); void (*load_normal)(CPUState *cpu); @@ -66,6 +68,7 @@ typedef struct S390CPU { /*< public >*/ CPUS390XState env; + int64_t id; /* needed for live migration */ void *irqstate; uint32_t irqstate_saved_size; diff --git a/target-s390x/cpu.c b/target-s390x/cpu.c index 73a910d2fa..1cbf70355d 100644 --- a/target-s390x/cpu.c +++ b/target-s390x/cpu.c @@ -30,8 +30,11 @@ #include "qemu/error-report.h" #include "hw/hw.h" #include "trace.h" +#include "qapi/visitor.h" #ifndef CONFIG_USER_ONLY #include "sysemu/arch_init.h" +#include "sysemu/sysemu.h" +#include "hw/s390x/sclp.h" #endif #define CR0_RESET 0xE0UL @@ -195,7 +198,39 @@ static void s390_cpu_realizefn(DeviceState *dev, Error **errp) { CPUState *cs = CPU(dev); S390CPUClass *scc = S390_CPU_GET_CLASS(dev); + S390CPU *cpu = S390_CPU(dev); + CPUS390XState *env = &cpu->env; + Error *err = NULL; + +#if !defined(CONFIG_USER_ONLY) + if (cpu->id >= max_cpus) { + error_setg(&err, "Unable to add CPU: %" PRIi64 + ", max allowed: %d", cpu->id, max_cpus - 1); + goto out; + } +#endif + if (cpu_exists(cpu->id)) { + error_setg(&err, "Unable to add CPU: %" PRIi64 + ", it already exists", cpu->id); + goto out; + } + if (cpu->id != scc->next_cpu_id) { + error_setg(&err, "Unable to add CPU: %" PRIi64 + ", The next available id is %" PRIi64, cpu->id, + scc->next_cpu_id); + goto out; + } + + cpu_exec_init(cs, &err); + if (err != NULL) { + goto out; + } + scc->next_cpu_id++; +#if !defined(CONFIG_USER_ONLY) + qemu_register_reset(s390_cpu_machine_reset_cb, cpu); +#endif + env->cpu_num = cpu->id; s390_cpu_gdb_init(cs); qemu_init_vcpu(cs); #if !defined(CONFIG_USER_ONLY) @@ -204,7 +239,55 @@ static void s390_cpu_realizefn(DeviceState *dev, Error **errp) cpu_reset(cs); #endif - scc->parent_realize(dev, errp); + scc->parent_realize(dev, &err); + +#if !defined(CONFIG_USER_ONLY) + if (dev->hotplugged) { + raise_irq_cpu_hotplug(); + } +#endif + +out: + error_propagate(errp, err); +} + +static void s390x_cpu_get_id(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + S390CPU *cpu = S390_CPU(obj); + int64_t value = cpu->id; + + visit_type_int(v, name, &value, errp); +} + +static void s390x_cpu_set_id(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + S390CPU *cpu = S390_CPU(obj); + DeviceState *dev = DEVICE(obj); + const int64_t min = 0; + const int64_t max = UINT32_MAX; + Error *err = NULL; + int64_t value; + + if (dev->realized) { + error_setg(errp, "Attempt to set property '%s' on '%s' after " + "it was realized", name, object_get_typename(obj)); + return; + } + + visit_type_int(v, name, &value, &err); + if (err) { + error_propagate(errp, err); + return; + } + if (value < min || value > max) { + error_setg(errp, "Property %s.%s doesn't take value %" PRId64 + " (minimum: %" PRId64 ", maximum: %" PRId64 ")" , + object_get_typename(obj), name, value, min, max); + return; + } + cpu->id = value; } static void s390_cpu_initfn(Object *obj) @@ -213,15 +296,16 @@ static void s390_cpu_initfn(Object *obj) S390CPU *cpu = S390_CPU(obj); CPUS390XState *env = &cpu->env; static bool inited; - static int cpu_num = 0; #if !defined(CONFIG_USER_ONLY) struct tm tm; #endif cs->env_ptr = env; - cpu_exec_init(cs, &error_abort); + cs->halted = 1; + cs->exception_index = EXCP_HLT; + object_property_add(OBJECT(cpu), "id", "int64_t", s390x_cpu_get_id, + s390x_cpu_set_id, NULL, NULL, NULL); #if !defined(CONFIG_USER_ONLY) - qemu_register_reset(s390_cpu_machine_reset_cb, cpu); qemu_get_timedate(&tm, 0); env->tod_offset = TOD_UNIX_EPOCH + (time2tod(mktimegm(&tm)) * 1000000000ULL); @@ -230,7 +314,6 @@ static void s390_cpu_initfn(Object *obj) env->cpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, s390x_cpu_timer, cpu); s390_cpu_set_state(CPU_STATE_STOPPED, cpu); #endif - env->cpu_num = cpu_num++; if (tcg_enabled() && !inited) { inited = true; @@ -337,6 +420,7 @@ static void s390_cpu_class_init(ObjectClass *oc, void *data) CPUClass *cc = CPU_CLASS(scc); DeviceClass *dc = DEVICE_CLASS(oc); + scc->next_cpu_id = 0; scc->parent_realize = dc->realize; dc->realize = s390_cpu_realizefn; @@ -369,7 +453,7 @@ static void s390_cpu_class_init(ObjectClass *oc, void *data) cc->gdb_arch_name = s390_gdb_arch_name; /* - * Reason: s390_cpu_initfn() calls cpu_exec_init(), which saves + * Reason: s390_cpu_realizefn() calls cpu_exec_init(), which saves * the object in cpus -> dangling pointer after final * object_unref(). */ diff --git a/target-s390x/cpu.h b/target-s390x/cpu.h index 49c84155be..6d97c089a4 100644 --- a/target-s390x/cpu.h +++ b/target-s390x/cpu.h @@ -413,6 +413,8 @@ void trigger_pgm_exception(CPUS390XState *env, uint32_t code, uint32_t ilen); #endif S390CPU *cpu_s390x_init(const char *cpu_model); +S390CPU *s390x_new_cpu(const char *cpu_model, int64_t id, Error **errp); +S390CPU *cpu_s390x_create(const char *cpu_model, Error **errp); void s390x_translate_init(void); int cpu_s390x_exec(CPUState *cpu); diff --git a/target-s390x/helper.c b/target-s390x/helper.c index 838bdd9e9e..76d5fbebe8 100644 --- a/target-s390x/helper.c +++ b/target-s390x/helper.c @@ -65,14 +65,51 @@ void s390x_cpu_timer(void *opaque) } #endif -S390CPU *cpu_s390x_init(const char *cpu_model) +S390CPU *cpu_s390x_create(const char *cpu_model, Error **errp) { S390CPU *cpu; cpu = S390_CPU(object_new(TYPE_S390_CPU)); - object_property_set_bool(OBJECT(cpu), true, "realized", NULL); + return cpu; +} + +S390CPU *s390x_new_cpu(const char *cpu_model, int64_t id, Error **errp) +{ + S390CPU *cpu; + Error *err = NULL; + + cpu = cpu_s390x_create(cpu_model, &err); + if (err != NULL) { + goto out; + } + + object_property_set_int(OBJECT(cpu), id, "id", &err); + if (err != NULL) { + goto out; + } + object_property_set_bool(OBJECT(cpu), true, "realized", &err); +out: + if (err) { + error_propagate(errp, err); + object_unref(OBJECT(cpu)); + cpu = NULL; + } + return cpu; +} + +S390CPU *cpu_s390x_init(const char *cpu_model) +{ + Error *err = NULL; + S390CPU *cpu; + /* Use to track CPU ID for linux-user only */ + static int64_t next_cpu_id; + + cpu = s390x_new_cpu(cpu_model, next_cpu_id++, &err); + if (err) { + error_report_err(err); + } return cpu; } diff --git a/tests/io-channel-helpers.c b/tests/io-channel-helpers.c index 844066904b..a4dedbe0ad 100644 --- a/tests/io-channel-helpers.c +++ b/tests/io-channel-helpers.c @@ -132,7 +132,7 @@ static gpointer test_io_thread_reader(gpointer opaque) if (ret == QIO_CHANNEL_ERR_BLOCK) { if (data->blocking) { - error_setg(&data->writeerr, + error_setg(&data->readerr, "Unexpected I/O blocking"); break; } else { @@ -233,11 +233,11 @@ void qio_channel_test_run_reader(QIOChannelTest *test, void qio_channel_test_validate(QIOChannelTest *test) { + g_assert(test->readerr == NULL); + g_assert(test->writeerr == NULL); g_assert_cmpint(memcmp(test->input, test->output, test->len), ==, 0); - g_assert(test->readerr == NULL); - g_assert(test->writeerr == NULL); g_free(test->inputv); g_free(test->outputv); diff --git a/tests/qemu-iotests/081.out b/tests/qemu-iotests/081.out index 70632314c8..97df69d71c 100644 --- a/tests/qemu-iotests/081.out +++ b/tests/qemu-iotests/081.out @@ -31,7 +31,7 @@ QMP_VERSION {"return": {}} {"return": {}} {"return": {}} -{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "QUORUM_REPORT_BAD", "data": {"node-name": "drive2", "sectors-count": 20480, "sector-num": 0}} +{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "QUORUM_REPORT_BAD", "data": {"node-name": "drive2", "sectors-count": 20480, "sector-num": 0, "type": "read"}} read 10485760/10485760 bytes at offset 0 10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) {"return": ""} diff --git a/tests/qemu-iotests/146 b/tests/qemu-iotests/146 new file mode 100755 index 0000000000..043711be68 --- /dev/null +++ b/tests/qemu-iotests/146 @@ -0,0 +1,165 @@ +#!/bin/bash +# +# Test VHD image format creator detection and override +# +# Copyright (C) 2016 Red Hat, Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# + +# creator +owner=jcody@redhat.com + +seq=`basename $0` +echo "QA output created by $seq" + +here=`pwd` +status=1 # failure is the default! + +_cleanup() +{ + _cleanup_qemu + _cleanup_test_img +} +trap "_cleanup; exit \$status" 0 1 2 3 15 + +# get standard environment, filters and checks +. ./common.rc +. ./common.filter +. ./common.qemu + +_supported_fmt vpc +_supported_proto file +_supported_os Linux + + +qemu_comm_method="monitor" +silent= + +echo +echo === Testing VPC Autodetect === +echo +_use_sample_img virtualpc-dynamic.vhd.bz2 + +${QEMU_IO} -c "open -o driver=vpc ${TEST_IMG}" -c 'map' + +echo +echo === Testing VPC with current_size force === +echo + +${QEMU_IO} -c "open -o driver=vpc,force_size_calc=current_size ${TEST_IMG}" -c 'map' + +echo +echo === Testing VPC with chs force === +echo + +${QEMU_IO} -c "open -o driver=vpc,force_size_calc=chs ${TEST_IMG}" -c 'map' + +_cleanup_test_img + +echo +echo === Testing Hyper-V Autodetect === +echo +_use_sample_img hyperv2012r2-dynamic.vhd.bz2 + +${QEMU_IO} -c "open -o driver=vpc ${TEST_IMG}" -c 'map' + +echo +echo === Testing Hyper-V with current_size force === +echo + +${QEMU_IO} -c "open -o driver=vpc,force_size_calc=current_size ${TEST_IMG}" -c 'map' + +echo +echo === Testing Hyper-V with chs force === +echo + +${QEMU_IO} -c "open -o driver=vpc,force_size_calc=chs ${TEST_IMG}" -c 'map' + +_cleanup_test_img + +echo +echo === Testing d2v Autodetect === +echo +_use_sample_img d2v-zerofilled.vhd.bz2 + +${QEMU_IO} -c "open -o driver=vpc ${TEST_IMG}" -c 'map' + +echo +echo === Testing d2v with current_size force === +echo + +${QEMU_IO} -c "open -o driver=vpc,force_size_calc=current_size ${TEST_IMG}" -c 'map' + +echo +echo === Testing d2v with chs force === +echo + +${QEMU_IO} -c "open -o driver=vpc,force_size_calc=chs ${TEST_IMG}" -c 'map' + +_cleanup_test_img + +echo +echo === Testing Image create, default === +echo + +TEST_IMG="${TEST_DIR}/vpc-create-test.vpc" + +_make_test_img 4G + +echo +echo === Read created image, default opts ==== +echo + +${QEMU_IO} -c "open -o driver=vpc ${TEST_IMG}" -c 'map' + +echo +echo === Read created image, force_size_calc=chs ==== +echo + +${QEMU_IO} -c "open -o driver=vpc,force_size_calc=chs ${TEST_IMG}" -c 'map' + +echo +echo === Read created image, force_size_calc=current_size ==== +echo + +${QEMU_IO} -c "open -o driver=vpc,force_size_calc=current_size ${TEST_IMG}" -c 'map' + +echo +echo === Testing Image create, force_size === +echo + +_make_test_img -o force_size 4G + +echo +echo === Read created image, default opts ==== +echo + +${QEMU_IO} -c "open -o driver=vpc ${TEST_IMG}" -c 'map' + +echo +echo === Read created image, force_size_calc=chs ==== +echo + +${QEMU_IO} -c "open -o driver=vpc,force_size_calc=chs ${TEST_IMG}" -c 'map' + +echo +echo === Read created image, force_size_calc=current_size ==== +echo + +${QEMU_IO} -c "open -o driver=vpc,force_size_calc=current_size ${TEST_IMG}" -c 'map' + +echo "*** done" +rm -f $seq.full +status=0 diff --git a/tests/qemu-iotests/146.out b/tests/qemu-iotests/146.out new file mode 100644 index 0000000000..4f334d86bc --- /dev/null +++ b/tests/qemu-iotests/146.out @@ -0,0 +1,70 @@ +QA output created by 146 + +=== Testing VPC Autodetect === + +[ 0] 266334240/ 266334240 sectors not allocated at offset 0 bytes (0) + +=== Testing VPC with current_size force === + +[ 0] 266338304/ 266338304 sectors not allocated at offset 0 bytes (0) + +=== Testing VPC with chs force === + +[ 0] 266334240/ 266334240 sectors not allocated at offset 0 bytes (0) + +=== Testing Hyper-V Autodetect === + +[ 0] 266338304/ 266338304 sectors not allocated at offset 0 bytes (0) + +=== Testing Hyper-V with current_size force === + +[ 0] 266338304/ 266338304 sectors not allocated at offset 0 bytes (0) + +=== Testing Hyper-V with chs force === + +[ 0] 266334240/ 266334240 sectors not allocated at offset 0 bytes (0) + +=== Testing d2v Autodetect === + +[ 0] 514560/ 514560 sectors allocated at offset 0 bytes (1) + +=== Testing d2v with current_size force === + +[ 0] 514560/ 514560 sectors allocated at offset 0 bytes (1) + +=== Testing d2v with chs force === + +[ 0] 514560/ 514560 sectors allocated at offset 0 bytes (1) + +=== Testing Image create, default === + +Formatting 'TEST_DIR/IMGFMT-create-test.IMGFMT', fmt=IMGFMT size=4294967296 + +=== Read created image, default opts ==== + +[ 0] 8389584/ 8389584 sectors not allocated at offset 0 bytes (0) + +=== Read created image, force_size_calc=chs ==== + +[ 0] 8389584/ 8389584 sectors not allocated at offset 0 bytes (0) + +=== Read created image, force_size_calc=current_size ==== + +[ 0] 8389584/ 8389584 sectors not allocated at offset 0 bytes (0) + +=== Testing Image create, force_size === + +Formatting 'TEST_DIR/IMGFMT-create-test.IMGFMT', fmt=IMGFMT size=4294967296 force_size=on + +=== Read created image, default opts ==== + +[ 0] 8388608/ 8388608 sectors not allocated at offset 0 bytes (0) + +=== Read created image, force_size_calc=chs ==== + +[ 0] 8388608/ 8388608 sectors not allocated at offset 0 bytes (0) + +=== Read created image, force_size_calc=current_size ==== + +[ 0] 8388608/ 8388608 sectors not allocated at offset 0 bytes (0) +*** done diff --git a/tests/qemu-iotests/148 b/tests/qemu-iotests/148 new file mode 100644 index 0000000000..30bc37958e --- /dev/null +++ b/tests/qemu-iotests/148 @@ -0,0 +1,129 @@ +#!/usr/bin/env python +# +# Test the rate limit of QMP events +# +# Copyright (C) 2016 Igalia, S.L. +# Author: Alberto Garcia <berto@igalia.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# + +import os +import iotests + +imgs = (os.path.join(iotests.test_dir, 'quorum0.img'), + os.path.join(iotests.test_dir, 'quorum1.img'), + os.path.join(iotests.test_dir, 'quorum2.img')) + +img_conf = (os.path.join(iotests.test_dir, 'quorum0.conf'), + os.path.join(iotests.test_dir, 'quorum1.conf'), + os.path.join(iotests.test_dir, 'quorum2.conf')) + +event_rate = 1000000000 +sector_size = 512 +offset = 10 + +class TestQuorumEvents(iotests.QMPTestCase): + + def create_blkdebug_file(self, blkdebug_file, bad_sector): + file = open(blkdebug_file, 'w') + file.write(''' +[inject-error] +event = "read_aio" +errno = "5" +sector = "%d" +''' % bad_sector) + file.close() + + def setUp(self): + driveopts = ['driver=quorum', 'vote-threshold=2'] + for i in range(len(imgs)): + iotests.qemu_img('create', '-f', iotests.imgfmt, imgs[i], '1M') + self.create_blkdebug_file(img_conf[i], i + offset) + driveopts.append('children.%d.driver=%s' % (i, iotests.imgfmt)) + driveopts.append('children.%d.file.driver=blkdebug' % i) + driveopts.append('children.%d.file.config=%s' % (i, img_conf[i])) + driveopts.append('children.%d.file.image.filename=%s' % (i, imgs[i])) + driveopts.append('children.%d.node-name=img%d' % (i, i)) + self.vm = iotests.VM() + self.vm.add_drive(None, opts = ','.join(driveopts)) + self.vm.launch() + + def tearDown(self): + self.vm.shutdown() + for i in range(len(imgs)): + os.remove(imgs[i]) + os.remove(img_conf[i]) + + def do_check_event(self, node, sector = 0): + if node == None: + self.assertEqual(self.vm.get_qmp_event(), None) + return + + for event in self.vm.get_qmp_events(wait=True): + if event['event'] == 'QUORUM_REPORT_BAD': + self.assert_qmp(event, 'data/node-name', node) + self.assert_qmp(event, 'data/sector-num', sector) + + def testQuorum(self): + if not 'quorum' in iotests.qemu_img_pipe('--help'): + return + + # Generate an error and get an event + self.vm.hmp_qemu_io("drive0", "aio_read %d %d" % + (offset * sector_size, sector_size)) + self.vm.qtest("clock_step 10") + self.do_check_event('img0', offset) + + # I/O errors in the same child: only one event is emitted + delay = 10 + for i in range(3): + self.vm.hmp_qemu_io("drive0", "aio_read %d %d" % + (offset * sector_size, sector_size)) + self.vm.qtest("clock_step %d" % delay) + self.do_check_event(None) + + # Wait enough so the event is finally emitted + self.vm.qtest("clock_step %d" % (2 * event_rate)) + self.do_check_event('img0', offset) + + # I/O errors in the same child: all events are emitted + delay = 2 * event_rate + for i in range(3): + self.vm.hmp_qemu_io("drive0", "aio_read %d %d" % + (offset * sector_size, sector_size)) + self.vm.qtest("clock_step %d" % delay) + self.do_check_event('img0', offset) + + # I/O errors in different children: all events are emitted + delay = 10 + for i in range(len(imgs)): + self.vm.hmp_qemu_io("drive0", "aio_read %d %d" % + ((offset + i) * sector_size, sector_size)) + self.vm.qtest("clock_step %d" % delay) + self.do_check_event('img%d' % i, offset + i) + + # I/O errors in different children: all events are emitted + delay = 2 * event_rate + for i in range(len(imgs)): + self.vm.hmp_qemu_io("drive0", "aio_read %d %d" % + ((offset + i) * sector_size, sector_size)) + self.vm.qtest("clock_step %d" % delay) + self.do_check_event('img%d' % i, offset + i) + + # No more pending events + self.do_check_event(None) + +if __name__ == '__main__': + iotests.main(supported_fmts=["raw"]) diff --git a/tests/qemu-iotests/148.out b/tests/qemu-iotests/148.out new file mode 100644 index 0000000000..ae1213e6f8 --- /dev/null +++ b/tests/qemu-iotests/148.out @@ -0,0 +1,5 @@ +. +---------------------------------------------------------------------- +Ran 1 tests + +OK diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group index 47fd40c546..faf0f21397 100644 --- a/tests/qemu-iotests/group +++ b/tests/qemu-iotests/group @@ -148,3 +148,5 @@ 143 auto quick 144 rw auto quick 145 auto quick +146 auto quick +148 rw auto quick diff --git a/tests/qemu-iotests/sample_images/d2v-zerofilled.vhd.bz2 b/tests/qemu-iotests/sample_images/d2v-zerofilled.vhd.bz2 new file mode 100644 index 0000000000..f12cb9203a --- /dev/null +++ b/tests/qemu-iotests/sample_images/d2v-zerofilled.vhd.bz2 Binary files differdiff --git a/tests/qemu-iotests/sample_images/hyperv2012r2-dynamic.vhd.bz2 b/tests/qemu-iotests/sample_images/hyperv2012r2-dynamic.vhd.bz2 new file mode 100644 index 0000000000..bfeccf7b9f --- /dev/null +++ b/tests/qemu-iotests/sample_images/hyperv2012r2-dynamic.vhd.bz2 Binary files differdiff --git a/tests/qemu-iotests/sample_images/virtualpc-dynamic.vhd.bz2 b/tests/qemu-iotests/sample_images/virtualpc-dynamic.vhd.bz2 new file mode 100644 index 0000000000..783be3c8f0 --- /dev/null +++ b/tests/qemu-iotests/sample_images/virtualpc-dynamic.vhd.bz2 Binary files differdiff --git a/tests/test-io-channel-socket.c b/tests/test-io-channel-socket.c index 8a34056670..87018acb8a 100644 --- a/tests/test-io-channel-socket.c +++ b/tests/test-io-channel-socket.c @@ -22,66 +22,77 @@ #include "io/channel-socket.h" #include "io/channel-util.h" #include "io-channel-helpers.h" -#ifdef HAVE_IFADDRS_H -#include <ifaddrs.h> + +#ifndef AI_ADDRCONFIG +# define AI_ADDRCONFIG 0 +#endif +#ifndef AI_V4MAPPED +# define AI_V4MAPPED 0 +#endif +#ifndef EAI_ADDRFAMILY +# define EAI_ADDRFAMILY 0 #endif -static int check_protocol_support(bool *has_ipv4, bool *has_ipv6) +static int check_bind(const char *hostname, bool *has_proto) { -#ifdef HAVE_IFADDRS_H - struct ifaddrs *ifaddr = NULL, *ifa; - struct addrinfo hints = { 0 }; - struct addrinfo *ai = NULL; - int gaierr; - - *has_ipv4 = *has_ipv6 = false; - - if (getifaddrs(&ifaddr) < 0) { - g_printerr("Failed to lookup interface addresses: %s\n", - strerror(errno)); - return -1; + int fd = -1; + struct addrinfo ai, *res = NULL; + int rc; + int ret = -1; + + memset(&ai, 0, sizeof(ai)); + ai.ai_flags = AI_CANONNAME | AI_V4MAPPED | AI_ADDRCONFIG; + ai.ai_family = AF_UNSPEC; + ai.ai_socktype = SOCK_STREAM; + + /* lookup */ + rc = getaddrinfo(hostname, NULL, &ai, &res); + if (rc != 0) { + if (rc == EAI_ADDRFAMILY || + rc == EAI_FAMILY) { + *has_proto = false; + goto done; + } + goto cleanup; } - for (ifa = ifaddr; ifa != NULL; ifa = ifa->ifa_next) { - if (!ifa->ifa_addr) { - continue; - } + fd = qemu_socket(res->ai_family, res->ai_socktype, res->ai_protocol); + if (fd < 0) { + goto cleanup; + } - if (ifa->ifa_addr->sa_family == AF_INET) { - *has_ipv4 = true; - } - if (ifa->ifa_addr->sa_family == AF_INET6) { - *has_ipv6 = true; + if (bind(fd, res->ai_addr, res->ai_addrlen) < 0) { + if (errno == EADDRNOTAVAIL) { + *has_proto = false; + goto done; } + goto cleanup; } - freeifaddrs(ifaddr); - - hints.ai_flags = AI_PASSIVE | AI_ADDRCONFIG; - hints.ai_family = AF_INET6; - hints.ai_socktype = SOCK_STREAM; - - gaierr = getaddrinfo("::1", NULL, &hints, &ai); - if (gaierr != 0) { - if (gaierr == EAI_ADDRFAMILY || - gaierr == EAI_FAMILY || - gaierr == EAI_NONAME) { - *has_ipv6 = false; - } else { - g_printerr("Failed to resolve ::1 address: %s\n", - gai_strerror(gaierr)); - return -1; - } + *has_proto = true; + done: + ret = 0; + + cleanup: + if (fd != -1) { + close(fd); } + if (res) { + freeaddrinfo(res); + } + return ret; +} - freeaddrinfo(ai); +static int check_protocol_support(bool *has_ipv4, bool *has_ipv6) +{ + if (check_bind("127.0.0.1", has_ipv4) < 0) { + return -1; + } + if (check_bind("::1", has_ipv6) < 0) { + return -1; + } return 0; -#else - *has_ipv4 = *has_ipv6 = false; - - return -1; -#endif } @@ -131,6 +142,7 @@ static void test_io_channel_setup_sync(SocketAddress *listen_addr, QIO_CHANNEL_SOCKET(*src), connect_addr, &error_abort); qio_channel_set_delay(*src, false); + qio_channel_wait(QIO_CHANNEL(lioc), G_IO_IN); *dst = QIO_CHANNEL(qio_channel_socket_accept(lioc, &error_abort)); g_assert(*dst); @@ -198,6 +210,7 @@ static void test_io_channel_setup_async(SocketAddress *listen_addr, g_assert(!data.err); + qio_channel_wait(QIO_CHANNEL(lioc), G_IO_IN); *dst = QIO_CHANNEL(qio_channel_socket_accept(lioc, &error_abort)); g_assert(*dst); @@ -487,10 +500,20 @@ static void test_io_channel_ipv4_fd(void) { QIOChannel *ioc; int fd = -1; + struct sockaddr_in sa = { + .sin_family = AF_INET, + .sin_addr = { + .s_addr = htonl(INADDR_LOOPBACK), + } + /* Leave port unset for auto-assign */ + }; + socklen_t salen = sizeof(sa); fd = socket(AF_INET, SOCK_STREAM, 0); g_assert_cmpint(fd, >, -1); + g_assert_cmpint(bind(fd, (struct sockaddr *)&sa, salen), ==, 0); + ioc = qio_channel_new_fd(fd, &error_abort); g_assert_cmpstr(object_get_typename(OBJECT(ioc)), @@ -506,6 +529,7 @@ int main(int argc, char **argv) bool has_ipv4, has_ipv6; module_call_init(MODULE_INIT_QOM); + socket_init(); g_test_init(&argc, &argv, NULL); diff --git a/trace-events b/trace-events index 6fba6cc474..0ad8a1c85f 100644 --- a/trace-events +++ b/trace-events @@ -1620,10 +1620,12 @@ disable exec_tb_exit(void *next_tb, unsigned int flags) "tb:%p flags=%x" translate_block(void *tb, uintptr_t pc, uint8_t *tb_code) "tb:%p, pc:0x%"PRIxPTR", tb_code:%p" # memory.c -memory_region_ops_read(void *mr, uint64_t addr, uint64_t value, unsigned size) "mr %p addr %#"PRIx64" value %#"PRIx64" size %u" -memory_region_ops_write(void *mr, uint64_t addr, uint64_t value, unsigned size) "mr %p addr %#"PRIx64" value %#"PRIx64" size %u" -memory_region_subpage_read(void *mr, uint64_t offset, uint64_t value, unsigned size) "mr %p offset %#"PRIx64" value %#"PRIx64" size %u" -memory_region_subpage_write(void *mr, uint64_t offset, uint64_t value, unsigned size) "mr %p offset %#"PRIx64" value %#"PRIx64" size %u" +memory_region_ops_read(int cpu_index, void *mr, uint64_t addr, uint64_t value, unsigned size) "cpu %d mr %p addr %#"PRIx64" value %#"PRIx64" size %u" +memory_region_ops_write(int cpu_index, void *mr, uint64_t addr, uint64_t value, unsigned size) "cpu %d mr %p addr %#"PRIx64" value %#"PRIx64" size %u" +memory_region_subpage_read(int cpu_index, void *mr, uint64_t offset, uint64_t value, unsigned size) "cpu %d mr %p offset %#"PRIx64" value %#"PRIx64" size %u" +memory_region_subpage_write(int cpu_index, void *mr, uint64_t offset, uint64_t value, unsigned size) "cpu %d mr %p offset %#"PRIx64" value %#"PRIx64" size %u" +memory_region_tb_read(int cpu_index, uint64_t addr, uint64_t value, unsigned size) "cpu %d addr %#"PRIx64" value %#"PRIx64" size %u" +memory_region_tb_write(int cpu_index, uint64_t addr, uint64_t value, unsigned size) "cpu %d addr %#"PRIx64" value %#"PRIx64" size %u" # qom/object.c object_dynamic_cast_assert(const char *type, const char *target, const char *file, int line, const char *func) "%s->%s (%s:%d:%s)" @@ -1652,6 +1654,7 @@ vfio_msix_enable(const char *name) " (%s)" vfio_msix_pba_disable(const char *name) " (%s)" vfio_msix_pba_enable(const char *name) " (%s)" vfio_msix_disable(const char *name) " (%s)" +vfio_msix_fixup(const char *name, int bar, uint64_t start, uint64_t end) " (%s) MSI-X region %d mmap fixup [0x%"PRIx64" - 0x%"PRIx64"]" vfio_msi_enable(const char *name, int nr_vectors) " (%s) Enabled %d MSI vectors" vfio_msi_disable(const char *name) " (%s)" vfio_pci_load_rom(const char *name, unsigned long size, unsigned long offset, unsigned long flags) "Device %s ROM:\n size: 0x%lx, offset: 0x%lx, flags: 0x%lx" @@ -1670,7 +1673,6 @@ vfio_pci_hot_reset(const char *name, const char *type) " (%s) %s" vfio_pci_hot_reset_has_dep_devices(const char *name) "%s: hot reset dependent devices:" vfio_pci_hot_reset_dep_devices(int domain, int bus, int slot, int function, int group_id) "\t%04x:%02x:%02x.%x group %d" vfio_pci_hot_reset_result(const char *name, const char *result) "%s hot reset: %s" -vfio_populate_device_region(const char *region_name, int index, unsigned long size, unsigned long offset, unsigned long flags) "Device %s region %d:\n size: 0x%lx, offset: 0x%lx, flags: 0x%lx" vfio_populate_device_config(const char *name, unsigned long size, unsigned long offset, unsigned long flags) "Device %s config:\n size: 0x%lx, offset: 0x%lx, flags: 0x%lx" vfio_populate_device_get_irq_info_failure(void) "VFIO_DEVICE_GET_IRQ_INFO failure: %m" vfio_initfn(const char *name, int group_id) " (%s) group %d" @@ -1726,13 +1728,17 @@ vfio_disconnect_container(int fd) "close container->fd=%d" vfio_put_group(int fd) "close group->fd=%d" vfio_get_device(const char * name, unsigned int flags, unsigned int num_regions, unsigned int num_irqs) "Device %s flags: %u, regions: %u, irqs: %u" vfio_put_base_device(int fd) "close vdev->fd=%d" +vfio_region_setup(const char *dev, int index, const char *name, unsigned long flags, unsigned long offset, unsigned long size) "Device %s, region %d \"%s\", flags: %lx, offset: %lx, size: %lx" +vfio_region_mmap_fault(const char *name, int index, unsigned long offset, unsigned long size, int fault) "Region %s mmaps[%d], [%lx - %lx], fault: %d" +vfio_region_mmap(const char *name, unsigned long offset, unsigned long end) "Region %s [%lx - %lx]" +vfio_region_exit(const char *name, int index) "Device %s, region %d" +vfio_region_finalize(const char *name, int index) "Device %s, region %d" +vfio_region_mmaps_set_enabled(const char *name, bool enabled) "Region %s mmaps enabled: %d" # hw/vfio/platform.c -vfio_platform_populate_regions(int region_index, unsigned long flag, unsigned long size, int fd, unsigned long offset) "- region %d flags = 0x%lx, size = 0x%lx, fd= %d, offset = 0x%lx" vfio_platform_base_device_init(char *name, int groupid) "%s belongs to group #%d" vfio_platform_realize(char *name, char *compat) "vfio device %s, compat = %s" vfio_platform_eoi(int pin, int fd) "EOI IRQ pin %d (fd=%d)" -vfio_platform_mmap_set_enabled(bool enabled) "fast path = %d" vfio_platform_intp_mmap_enable(int pin) "IRQ #%d still active, stay in slow path" vfio_platform_intp_interrupt(int pin, int fd) "Inject IRQ #%d (fd = %d)" vfio_platform_intp_inject_pending_lockheld(int pin, int fd) "Inject pending IRQ #%d (fd = %d)" diff --git a/util/oslib-win32.c b/util/oslib-win32.c index 438cfa4f6a..a3f0664763 100644 --- a/util/oslib-win32.c +++ b/util/oslib-win32.c @@ -2,7 +2,7 @@ * os-win32.c * * Copyright (c) 2003-2008 Fabrice Bellard - * Copyright (c) 2010 Red Hat, Inc. + * Copyright (c) 2010-2016 Red Hat, Inc. * * QEMU library functions for win32 which are shared between QEMU and * the QEMU tools. @@ -144,6 +144,83 @@ int socket_set_fast_reuse(int fd) return 0; } + +static int socket_error(void) +{ + switch (WSAGetLastError()) { + case 0: + return 0; + case WSAEINTR: + return EINTR; + case WSAEINVAL: + return EINVAL; + case WSA_INVALID_HANDLE: + return EBADF; + case WSA_NOT_ENOUGH_MEMORY: + return ENOMEM; + case WSA_INVALID_PARAMETER: + return EINVAL; + case WSAENAMETOOLONG: + return ENAMETOOLONG; + case WSAENOTEMPTY: + return ENOTEMPTY; + case WSAEWOULDBLOCK: + /* not using EWOULDBLOCK as we don't want code to have + * to check both EWOULDBLOCK and EAGAIN */ + return EAGAIN; + case WSAEINPROGRESS: + return EINPROGRESS; + case WSAEALREADY: + return EALREADY; + case WSAENOTSOCK: + return ENOTSOCK; + case WSAEDESTADDRREQ: + return EDESTADDRREQ; + case WSAEMSGSIZE: + return EMSGSIZE; + case WSAEPROTOTYPE: + return EPROTOTYPE; + case WSAENOPROTOOPT: + return ENOPROTOOPT; + case WSAEPROTONOSUPPORT: + return EPROTONOSUPPORT; + case WSAEOPNOTSUPP: + return EOPNOTSUPP; + case WSAEAFNOSUPPORT: + return EAFNOSUPPORT; + case WSAEADDRINUSE: + return EADDRINUSE; + case WSAEADDRNOTAVAIL: + return EADDRNOTAVAIL; + case WSAENETDOWN: + return ENETDOWN; + case WSAENETUNREACH: + return ENETUNREACH; + case WSAENETRESET: + return ENETRESET; + case WSAECONNABORTED: + return ECONNABORTED; + case WSAECONNRESET: + return ECONNRESET; + case WSAENOBUFS: + return ENOBUFS; + case WSAEISCONN: + return EISCONN; + case WSAENOTCONN: + return ENOTCONN; + case WSAETIMEDOUT: + return ETIMEDOUT; + case WSAECONNREFUSED: + return ECONNREFUSED; + case WSAELOOP: + return ELOOP; + case WSAEHOSTUNREACH: + return EHOSTUNREACH; + default: + return EIO; + } +} + int inet_aton(const char *cp, struct in_addr *ia) { uint32_t addr = inet_addr(cp); @@ -504,3 +581,204 @@ pid_t qemu_fork(Error **errp) "cannot fork child process"); return -1; } + + +#undef connect +int qemu_connect_wrap(int sockfd, const struct sockaddr *addr, + socklen_t addrlen) +{ + int ret; + ret = connect(sockfd, addr, addrlen); + if (ret < 0) { + errno = socket_error(); + } + return ret; +} + + +#undef listen +int qemu_listen_wrap(int sockfd, int backlog) +{ + int ret; + ret = listen(sockfd, backlog); + if (ret < 0) { + errno = socket_error(); + } + return ret; +} + + +#undef bind +int qemu_bind_wrap(int sockfd, const struct sockaddr *addr, + socklen_t addrlen) +{ + int ret; + ret = bind(sockfd, addr, addrlen); + if (ret < 0) { + errno = socket_error(); + } + return ret; +} + + +#undef socket +int qemu_socket_wrap(int domain, int type, int protocol) +{ + int ret; + ret = socket(domain, type, protocol); + if (ret < 0) { + errno = socket_error(); + } + return ret; +} + + +#undef accept +int qemu_accept_wrap(int sockfd, struct sockaddr *addr, + socklen_t *addrlen) +{ + int ret; + ret = accept(sockfd, addr, addrlen); + if (ret < 0) { + errno = socket_error(); + } + return ret; +} + + +#undef shutdown +int qemu_shutdown_wrap(int sockfd, int how) +{ + int ret; + ret = shutdown(sockfd, how); + if (ret < 0) { + errno = socket_error(); + } + return ret; +} + + +#undef ioctlsocket +int qemu_ioctlsocket_wrap(int fd, int req, void *val) +{ + int ret; + ret = ioctlsocket(fd, req, val); + if (ret < 0) { + errno = socket_error(); + } + return ret; +} + + +#undef closesocket +int qemu_closesocket_wrap(int fd) +{ + int ret; + ret = closesocket(fd); + if (ret < 0) { + errno = socket_error(); + } + return ret; +} + + +#undef getsockopt +int qemu_getsockopt_wrap(int sockfd, int level, int optname, + void *optval, socklen_t *optlen) +{ + int ret; + ret = getsockopt(sockfd, level, optname, optval, optlen); + if (ret < 0) { + errno = socket_error(); + } + return ret; +} + + +#undef setsockopt +int qemu_setsockopt_wrap(int sockfd, int level, int optname, + const void *optval, socklen_t optlen) +{ + int ret; + ret = setsockopt(sockfd, level, optname, optval, optlen); + if (ret < 0) { + errno = socket_error(); + } + return ret; +} + + +#undef getpeername +int qemu_getpeername_wrap(int sockfd, struct sockaddr *addr, + socklen_t *addrlen) +{ + int ret; + ret = getpeername(sockfd, addr, addrlen); + if (ret < 0) { + errno = socket_error(); + } + return ret; +} + + +#undef getsockname +int qemu_getsockname_wrap(int sockfd, struct sockaddr *addr, + socklen_t *addrlen) +{ + int ret; + ret = getsockname(sockfd, addr, addrlen); + if (ret < 0) { + errno = socket_error(); + } + return ret; +} + + +#undef send +ssize_t qemu_send_wrap(int sockfd, const void *buf, size_t len, int flags) +{ + int ret; + ret = send(sockfd, buf, len, flags); + if (ret < 0) { + errno = socket_error(); + } + return ret; +} + + +#undef sendto +ssize_t qemu_sendto_wrap(int sockfd, const void *buf, size_t len, int flags, + const struct sockaddr *addr, socklen_t addrlen) +{ + int ret; + ret = sendto(sockfd, buf, len, flags, addr, addrlen); + if (ret < 0) { + errno = socket_error(); + } + return ret; +} + + +#undef recv +ssize_t qemu_recv_wrap(int sockfd, void *buf, size_t len, int flags) +{ + int ret; + ret = recv(sockfd, buf, len, flags); + if (ret < 0) { + errno = socket_error(); + } + return ret; +} + + +#undef recvfrom +ssize_t qemu_recvfrom_wrap(int sockfd, void *buf, size_t len, int flags, + struct sockaddr *addr, socklen_t *addrlen) +{ + int ret; + ret = recvfrom(sockfd, buf, len, flags, addr, addrlen); + if (ret < 0) { + errno = socket_error(); + } + return ret; +} diff --git a/util/qemu-coroutine-io.c b/util/qemu-coroutine-io.c index 0d5041c1c3..91b9357d4a 100644 --- a/util/qemu-coroutine-io.c +++ b/util/qemu-coroutine-io.c @@ -35,18 +35,16 @@ qemu_co_sendv_recvv(int sockfd, struct iovec *iov, unsigned iov_cnt, { size_t done = 0; ssize_t ret; - int err; while (done < bytes) { ret = iov_send_recv(sockfd, iov, iov_cnt, offset + done, bytes - done, do_send); if (ret > 0) { done += ret; } else if (ret < 0) { - err = socket_error(); - if (err == EAGAIN || err == EWOULDBLOCK) { + if (errno == EAGAIN || errno == EWOULDBLOCK) { qemu_coroutine_yield(); } else if (done == 0) { - return -err; + return -errno; } else { break; } diff --git a/util/qemu-sockets.c b/util/qemu-sockets.c index ad7c00c9ad..fd37ac209b 100644 --- a/util/qemu-sockets.c +++ b/util/qemu-sockets.c @@ -268,7 +268,7 @@ static void wait_for_connect(void *opaque) do { rc = qemu_getsockopt(s->fd, SOL_SOCKET, SO_ERROR, &val, &valsize); - } while (rc == -1 && socket_error() == EINTR); + } while (rc == -1 && errno == EINTR); /* update rc to contain error */ if (!rc && val) { @@ -330,7 +330,7 @@ static int inet_connect_addr(struct addrinfo *addr, bool *in_progress, do { rc = 0; if (connect(sock, addr->ai_addr, addr->ai_addrlen) < 0) { - rc = -socket_error(); + rc = -errno; } } while (rc == -EINTR); @@ -787,7 +787,7 @@ static int unix_connect_saddr(UnixSocketAddress *saddr, Error **errp, do { rc = 0; if (connect(sock, (struct sockaddr *) &un, sizeof(un)) < 0) { - rc = -socket_error(); + rc = -errno; } } while (rc == -EINTR); @@ -1082,7 +1082,7 @@ SocketAddress *socket_local_address(int fd, Error **errp) socklen_t sslen = sizeof(ss); if (getsockname(fd, (struct sockaddr *)&ss, &sslen) < 0) { - error_setg_errno(errp, socket_error(), "%s", + error_setg_errno(errp, errno, "%s", "Unable to query local socket address"); return NULL; } @@ -1097,7 +1097,7 @@ SocketAddress *socket_remote_address(int fd, Error **errp) socklen_t sslen = sizeof(ss); if (getpeername(fd, (struct sockaddr *)&ss, &sslen) < 0) { - error_setg_errno(errp, socket_error(), "%s", + error_setg_errno(errp, errno, "%s", "Unable to query remote socket address"); return NULL; } |