diff options
| -rw-r--r-- | docs/interop/qmp-spec.txt | 8 | ||||
| -rw-r--r-- | monitor/qmp-cmds.c | 2 | ||||
| -rw-r--r-- | monitor/qmp.c | 44 | ||||
| -rw-r--r-- | monitor/trace-events | 4 | ||||
| -rw-r--r-- | qobject/json-parser.c | 3 | ||||
| -rw-r--r-- | qobject/qdict.c | 12 | ||||
| -rw-r--r-- | qobject/qjson.c | 3 | ||||
| -rw-r--r-- | tools/virtiofsd/passthrough_ll.c | 224 | ||||
| -rw-r--r-- | tools/virtiofsd/passthrough_seccomp.c | 2 |
9 files changed, 205 insertions, 97 deletions
diff --git a/docs/interop/qmp-spec.txt b/docs/interop/qmp-spec.txt index cdf5842555..b0e8351d5b 100644 --- a/docs/interop/qmp-spec.txt +++ b/docs/interop/qmp-spec.txt @@ -133,9 +133,11 @@ to pass "id" with out-of-band commands. Passing it with all commands is recommended for clients that accept capability "oob". If the client sends in-band commands faster than the server can -execute them, the server will stop reading the requests from the QMP -channel until the request queue length is reduced to an acceptable -range. +execute them, the server will stop reading requests until the request +queue length is reduced to an acceptable range. + +To ensure commands to be executed out-of-band get read and executed, +the client should have at most eight in-band commands in flight. Only a few commands support out-of-band execution. The ones that do have "allow-oob": true in output of query-qmp-schema. diff --git a/monitor/qmp-cmds.c b/monitor/qmp-cmds.c index 990936136c..c7df8c0ee2 100644 --- a/monitor/qmp-cmds.c +++ b/monitor/qmp-cmds.c @@ -23,7 +23,7 @@ #include "qemu/uuid.h" #include "chardev/char.h" #include "ui/qemu-spice.h" -#include "ui/vnc.h" +#include "ui/console.h" #include "sysemu/kvm.h" #include "sysemu/runstate.h" #include "sysemu/runstate-action.h" diff --git a/monitor/qmp.c b/monitor/qmp.c index 8f91af32be..43880fa623 100644 --- a/monitor/qmp.c +++ b/monitor/qmp.c @@ -79,7 +79,7 @@ static void monitor_qmp_cleanup_queue_and_resume(MonitorQMP *mon) qemu_mutex_lock(&mon->qmp_queue_lock); /* - * Same condition as in monitor_qmp_bh_dispatcher(), but before + * Same condition as in monitor_qmp_dispatcher_co(), but before * removing an element from the queue (hence no `- 1`). * Also, the queue should not be empty either, otherwise the * monitor hasn't been suspended yet (or was already resumed). @@ -113,6 +113,7 @@ void qmp_send_response(MonitorQMP *mon, const QDict *rsp) json = qobject_to_json_pretty(data, mon->pretty); assert(json != NULL); + trace_monitor_qmp_respond(mon, json->str); g_string_append_c(json, '\n'); monitor_puts(&mon->common, json->str); @@ -213,7 +214,7 @@ void coroutine_fn monitor_qmp_dispatcher_co(void *data) { QMPRequest *req_obj = NULL; QDict *rsp; - bool need_resume; + bool oob_enabled; MonitorQMP *mon; while (true) { @@ -251,6 +252,9 @@ void coroutine_fn monitor_qmp_dispatcher_co(void *data) } } + trace_monitor_qmp_in_band_dequeue(req_obj, + req_obj->mon->qmp_requests->length); + if (qatomic_xchg(&qmp_dispatcher_co_busy, true) == true) { /* * Someone rescheduled us (probably because a new requests @@ -269,11 +273,32 @@ void coroutine_fn monitor_qmp_dispatcher_co(void *data) aio_co_schedule(qemu_get_aio_context(), qmp_dispatcher_co); qemu_coroutine_yield(); + /* + * @req_obj has a request, we hold req_obj->mon->qmp_queue_lock + */ + mon = req_obj->mon; - /* qmp_oob_enabled() might change after "qmp_capabilities" */ - need_resume = !qmp_oob_enabled(mon) || - mon->qmp_requests->length == QMP_REQ_QUEUE_LEN_MAX - 1; + + /* + * We need to resume the monitor if handle_qmp_command() + * suspended it. Two cases: + * 1. OOB enabled: mon->qmp_requests has no more space + * Resume right away, so that OOB commands can get executed while + * this request is being processed. + * 2. OOB disabled: always + * Resume only after we're done processing the request, + * We need to save qmp_oob_enabled() for later, because + * qmp_qmp_capabilities() can change it. + */ + oob_enabled = qmp_oob_enabled(mon); + if (oob_enabled + && mon->qmp_requests->length == QMP_REQ_QUEUE_LEN_MAX - 1) { + monitor_resume(&mon->common); + } + qemu_mutex_unlock(&mon->qmp_queue_lock); + + /* Process request */ if (req_obj->req) { if (trace_event_get_state(TRACE_MONITOR_QMP_CMD_IN_BAND)) { QDict *qdict = qobject_to(QDict, req_obj->req); @@ -287,16 +312,17 @@ void coroutine_fn monitor_qmp_dispatcher_co(void *data) monitor_qmp_dispatch(mon, req_obj->req); } else { assert(req_obj->err); + trace_monitor_qmp_err_in_band(error_get_pretty(req_obj->err)); rsp = qmp_error_response(req_obj->err); req_obj->err = NULL; monitor_qmp_respond(mon, rsp); qobject_unref(rsp); } - if (need_resume) { - /* Pairs with the monitor_suspend() in handle_qmp_command() */ + if (!oob_enabled) { monitor_resume(&mon->common); } + qmp_request_free(req_obj); /* @@ -349,7 +375,7 @@ static void handle_qmp_command(void *opaque, QObject *req, Error *err) /* * Suspend the monitor when we can't queue more requests after - * this one. Dequeuing in monitor_qmp_bh_dispatcher() or + * this one. Dequeuing in monitor_qmp_dispatcher_co() or * monitor_qmp_cleanup_queue_and_resume() will resume it. * Note that when OOB is disabled, we queue at most one command, * for backward compatibility. @@ -364,6 +390,8 @@ static void handle_qmp_command(void *opaque, QObject *req, Error *err) * handled in time order. Ownership for req_obj, req, * etc. will be delivered to the handler side. */ + trace_monitor_qmp_in_band_enqueue(req_obj, mon, + mon->qmp_requests->length); assert(mon->qmp_requests->length < QMP_REQ_QUEUE_LEN_MAX); g_queue_push_tail(mon->qmp_requests, req_obj); qemu_mutex_unlock(&mon->qmp_queue_lock); diff --git a/monitor/trace-events b/monitor/trace-events index 0365ac4d99..348dcfca9b 100644 --- a/monitor/trace-events +++ b/monitor/trace-events @@ -10,6 +10,10 @@ monitor_protocol_event_queue(uint32_t event, void *qdict, uint64_t rate) "event= monitor_suspend(void *ptr, int cnt) "mon %p: %d" # qmp.c +monitor_qmp_in_band_enqueue(void *req, void *mon, unsigned len) "%p mon %p len %u" +monitor_qmp_in_band_dequeue(void *req, unsigned len) "%p len %u" monitor_qmp_cmd_in_band(const char *id) "%s" +monitor_qmp_err_in_band(const char *desc) "%s" monitor_qmp_cmd_out_of_band(const char *id) "%s" +monitor_qmp_respond(void *mon, const char *json) "mon %p resp: %s" handle_qmp_command(void *mon, const char *req) "mon %p req: %s" diff --git a/qobject/json-parser.c b/qobject/json-parser.c index d351039b10..008b326fb8 100644 --- a/qobject/json-parser.c +++ b/qobject/json-parser.c @@ -31,8 +31,7 @@ struct JSONToken { char str[]; }; -typedef struct JSONParserContext -{ +typedef struct JSONParserContext { Error *err; JSONToken *current; GQueue *buf; diff --git a/qobject/qdict.c b/qobject/qdict.c index d84443391e..0216ca7ac1 100644 --- a/qobject/qdict.c +++ b/qobject/qdict.c @@ -39,12 +39,13 @@ QDict *qdict_new(void) */ static unsigned int tdb_hash(const char *name) { - unsigned value; /* Used to compute the hash value. */ - unsigned i; /* Used to cycle through random values. */ + unsigned value; /* Used to compute the hash value. */ + unsigned i; /* Used to cycle through random values. */ /* Set the initial value from the key size. */ - for (value = 0x238F13AF * strlen(name), i=0; name[i]; i++) - value = (value + (((const unsigned char *)name)[i] << (i*5 % 24))); + for (value = 0x238F13AF * strlen(name), i = 0; name[i]; i++) { + value = (value + (((const unsigned char *)name)[i] << (i * 5 % 24))); + } return (1103515243 * value + 12345); } @@ -93,8 +94,9 @@ static QDictEntry *qdict_find(const QDict *qdict, QDictEntry *entry; QLIST_FOREACH(entry, &qdict->table[bucket], next) - if (!strcmp(entry->key, key)) + if (!strcmp(entry->key, key)) { return entry; + } return NULL; } diff --git a/qobject/qjson.c b/qobject/qjson.c index bcc376e626..167fcb429c 100644 --- a/qobject/qjson.c +++ b/qobject/qjson.c @@ -22,8 +22,7 @@ #include "qapi/qmp/qnum.h" #include "qapi/qmp/qstring.h" -typedef struct JSONParsingState -{ +typedef struct JSONParsingState { JSONMessageParser parser; QObject *result; Error *err; diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c index 5fb36d9407..147b59338a 100644 --- a/tools/virtiofsd/passthrough_ll.c +++ b/tools/virtiofsd/passthrough_ll.c @@ -459,17 +459,17 @@ static void lo_map_remove(struct lo_map *map, size_t key) } /* Assumes lo->mutex is held */ -static ssize_t lo_add_fd_mapping(fuse_req_t req, int fd) +static ssize_t lo_add_fd_mapping(struct lo_data *lo, int fd) { struct lo_map_elem *elem; - elem = lo_map_alloc_elem(&lo_data(req)->fd_map); + elem = lo_map_alloc_elem(&lo->fd_map); if (!elem) { return -1; } elem->fd = fd; - return elem - lo_data(req)->fd_map.elems; + return elem - lo->fd_map.elems; } /* Assumes lo->mutex is held */ @@ -555,6 +555,38 @@ static int lo_fd(fuse_req_t req, fuse_ino_t ino) return fd; } +/* + * Open a file descriptor for an inode. Returns -EBADF if the inode is not a + * regular file or a directory. + * + * Use this helper function instead of raw openat(2) to prevent security issues + * when a malicious client opens special files such as block device nodes. + * Symlink inodes are also rejected since symlinks must already have been + * traversed on the client side. + */ +static int lo_inode_open(struct lo_data *lo, struct lo_inode *inode, + int open_flags) +{ + g_autofree char *fd_str = g_strdup_printf("%d", inode->fd); + int fd; + + if (!S_ISREG(inode->filetype) && !S_ISDIR(inode->filetype)) { + return -EBADF; + } + + /* + * The file is a symlink so O_NOFOLLOW must be ignored. We checked earlier + * that the inode is not a special file but if an external process races + * with us then symlinks are traversed here. It is not possible to escape + * the shared directory since it is mounted as "/" though. + */ + fd = openat(lo->proc_self_fd, fd_str, open_flags & ~O_NOFOLLOW); + if (fd < 0) { + return -errno; + } + return fd; +} + static void lo_init(void *userdata, struct fuse_conn_info *conn) { struct lo_data *lo = (struct lo_data *)userdata; @@ -684,9 +716,9 @@ static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, if (fi) { truncfd = fd; } else { - sprintf(procname, "%i", ifd); - truncfd = openat(lo->proc_self_fd, procname, O_RDWR); + truncfd = lo_inode_open(lo, inode, O_RDWR); if (truncfd < 0) { + errno = -truncfd; goto out_err; } } @@ -831,11 +863,13 @@ static int do_statx(struct lo_data *lo, int dirfd, const char *pathname, } /* - * Increments nlookup and caller must release refcount using - * lo_inode_put(&parent). + * Increments nlookup on the inode on success. unref_inode_lolocked() must be + * called eventually to decrement nlookup again. If inodep is non-NULL, the + * inode pointer is stored and the caller must call lo_inode_put(). */ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, - struct fuse_entry_param *e) + struct fuse_entry_param *e, + struct lo_inode **inodep) { int newfd; int res; @@ -845,6 +879,10 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, struct lo_inode *inode = NULL; struct lo_inode *dir = lo_inode(req, parent); + if (inodep) { + *inodep = NULL; /* in case there is an error */ + } + /* * name_to_handle_at() and open_by_handle_at() can reach here with fuse * mount point in guest, but we don't have its inode info in the @@ -913,7 +951,14 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, pthread_mutex_unlock(&lo->mutex); } e->ino = inode->fuse_ino; - lo_inode_put(lo, &inode); + + /* Transfer ownership of inode pointer to caller or drop it */ + if (inodep) { + *inodep = inode; + } else { + lo_inode_put(lo, &inode); + } + lo_inode_put(lo, &dir); fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent, @@ -948,7 +993,7 @@ static void lo_lookup(fuse_req_t req, fuse_ino_t parent, const char *name) return; } - err = lo_do_lookup(req, parent, name, &e); + err = lo_do_lookup(req, parent, name, &e, NULL); if (err) { fuse_reply_err(req, err); } else { @@ -1056,7 +1101,7 @@ static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent, goto out; } - saverr = lo_do_lookup(req, parent, name, &e); + saverr = lo_do_lookup(req, parent, name, &e, NULL); if (saverr) { goto out; } @@ -1534,7 +1579,7 @@ static void lo_do_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, if (plus) { if (!is_dot_or_dotdot(name)) { - err = lo_do_lookup(req, ino, name, &e); + err = lo_do_lookup(req, ino, name, &e, NULL); if (err) { goto error; } @@ -1651,12 +1696,52 @@ static void update_open_flags(int writeback, int allow_direct_io, } } +/* + * Open a regular file, set up an fd mapping, and fill out the struct + * fuse_file_info for it. If existing_fd is not negative, use that fd instead + * opening a new one. Takes ownership of existing_fd. + * + * Returns 0 on success or a positive errno. + */ +static int lo_do_open(struct lo_data *lo, struct lo_inode *inode, + int existing_fd, struct fuse_file_info *fi) +{ + ssize_t fh; + int fd = existing_fd; + + update_open_flags(lo->writeback, lo->allow_direct_io, fi); + + if (fd < 0) { + fd = lo_inode_open(lo, inode, fi->flags); + if (fd < 0) { + return -fd; + } + } + + pthread_mutex_lock(&lo->mutex); + fh = lo_add_fd_mapping(lo, fd); + pthread_mutex_unlock(&lo->mutex); + if (fh == -1) { + close(fd); + return ENOMEM; + } + + fi->fh = fh; + if (lo->cache == CACHE_NONE) { + fi->direct_io = 1; + } else if (lo->cache == CACHE_ALWAYS) { + fi->keep_cache = 1; + } + return 0; +} + static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, mode_t mode, struct fuse_file_info *fi) { - int fd; + int fd = -1; struct lo_data *lo = lo_data(req); struct lo_inode *parent_inode; + struct lo_inode *inode = NULL; struct fuse_entry_param e; int err; struct lo_cred old = {}; @@ -1682,36 +1767,38 @@ static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, update_open_flags(lo->writeback, lo->allow_direct_io, fi); - fd = openat(parent_inode->fd, name, (fi->flags | O_CREAT) & ~O_NOFOLLOW, - mode); + /* Try to create a new file but don't open existing files */ + fd = openat(parent_inode->fd, name, fi->flags | O_CREAT | O_EXCL, mode); err = fd == -1 ? errno : 0; - lo_restore_cred(&old); - if (!err) { - ssize_t fh; + lo_restore_cred(&old); - pthread_mutex_lock(&lo->mutex); - fh = lo_add_fd_mapping(req, fd); - pthread_mutex_unlock(&lo->mutex); - if (fh == -1) { - close(fd); - err = ENOMEM; - goto out; - } + /* Ignore the error if file exists and O_EXCL was not given */ + if (err && (err != EEXIST || (fi->flags & O_EXCL))) { + goto out; + } - fi->fh = fh; - err = lo_do_lookup(req, parent, name, &e); + err = lo_do_lookup(req, parent, name, &e, &inode); + if (err) { + goto out; } - if (lo->cache == CACHE_NONE) { - fi->direct_io = 1; - } else if (lo->cache == CACHE_ALWAYS) { - fi->keep_cache = 1; + + err = lo_do_open(lo, inode, fd, fi); + fd = -1; /* lo_do_open() takes ownership of fd */ + if (err) { + /* Undo lo_do_lookup() nlookup ref */ + unref_inode_lolocked(lo, inode, 1); } out: + lo_inode_put(lo, &inode); lo_inode_put(lo, &parent_inode); if (err) { + if (fd >= 0) { + close(fd); + } + fuse_reply_err(req, err); } else { fuse_reply_create(req, &e, fi); @@ -1725,7 +1812,6 @@ static struct lo_inode_plock *lookup_create_plock_ctx(struct lo_data *lo, pid_t pid, int *err) { struct lo_inode_plock *plock; - char procname[64]; int fd; plock = @@ -1742,12 +1828,10 @@ static struct lo_inode_plock *lookup_create_plock_ctx(struct lo_data *lo, } /* Open another instance of file which can be used for ofd locks. */ - sprintf(procname, "%i", inode->fd); - /* TODO: What if file is not writable? */ - fd = openat(lo->proc_self_fd, procname, O_RDWR); - if (fd == -1) { - *err = errno; + fd = lo_inode_open(lo, inode, O_RDWR); + if (fd < 0) { + *err = -fd; free(plock); return NULL; } @@ -1892,38 +1976,25 @@ static void lo_fsyncdir(fuse_req_t req, fuse_ino_t ino, int datasync, static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) { - int fd; - ssize_t fh; - char buf[64]; struct lo_data *lo = lo_data(req); + struct lo_inode *inode = lo_inode(req, ino); + int err; fuse_log(FUSE_LOG_DEBUG, "lo_open(ino=%" PRIu64 ", flags=%d)\n", ino, fi->flags); - update_open_flags(lo->writeback, lo->allow_direct_io, fi); - - sprintf(buf, "%i", lo_fd(req, ino)); - fd = openat(lo->proc_self_fd, buf, fi->flags & ~O_NOFOLLOW); - if (fd == -1) { - return (void)fuse_reply_err(req, errno); - } - - pthread_mutex_lock(&lo->mutex); - fh = lo_add_fd_mapping(req, fd); - pthread_mutex_unlock(&lo->mutex); - if (fh == -1) { - close(fd); - fuse_reply_err(req, ENOMEM); + if (!inode) { + fuse_reply_err(req, EBADF); return; } - fi->fh = fh; - if (lo->cache == CACHE_NONE) { - fi->direct_io = 1; - } else if (lo->cache == CACHE_ALWAYS) { - fi->keep_cache = 1; + err = lo_do_open(lo, inode, -1, fi); + lo_inode_put(lo, &inode); + if (err) { + fuse_reply_err(req, err); + } else { + fuse_reply_open(req, fi); } - fuse_reply_open(req, fi); } static void lo_release(fuse_req_t req, fuse_ino_t ino, @@ -1982,39 +2053,40 @@ static void lo_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) static void lo_fsync(fuse_req_t req, fuse_ino_t ino, int datasync, struct fuse_file_info *fi) { + struct lo_inode *inode = lo_inode(req, ino); + struct lo_data *lo = lo_data(req); int res; int fd; - char *buf; fuse_log(FUSE_LOG_DEBUG, "lo_fsync(ino=%" PRIu64 ", fi=0x%p)\n", ino, (void *)fi); - if (!fi) { - struct lo_data *lo = lo_data(req); - - res = asprintf(&buf, "%i", lo_fd(req, ino)); - if (res == -1) { - return (void)fuse_reply_err(req, errno); - } + if (!inode) { + fuse_reply_err(req, EBADF); + return; + } - fd = openat(lo->proc_self_fd, buf, O_RDWR); - free(buf); - if (fd == -1) { - return (void)fuse_reply_err(req, errno); + if (!fi) { + fd = lo_inode_open(lo, inode, O_RDWR); + if (fd < 0) { + res = -fd; + goto out; } } else { fd = lo_fi_fd(req, fi); } if (datasync) { - res = fdatasync(fd); + res = fdatasync(fd) == -1 ? errno : 0; } else { - res = fsync(fd); + res = fsync(fd) == -1 ? errno : 0; } if (!fi) { close(fd); } - fuse_reply_err(req, res == -1 ? errno : 0); +out: + lo_inode_put(lo, &inode); + fuse_reply_err(req, res); } static void lo_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t offset, diff --git a/tools/virtiofsd/passthrough_seccomp.c b/tools/virtiofsd/passthrough_seccomp.c index a60d7da4b4..ea852e2e33 100644 --- a/tools/virtiofsd/passthrough_seccomp.c +++ b/tools/virtiofsd/passthrough_seccomp.c @@ -65,6 +65,7 @@ static const int syscall_whitelist[] = { SCMP_SYS(linkat), SCMP_SYS(listxattr), SCMP_SYS(lseek), + SCMP_SYS(_llseek), /* For POWER */ SCMP_SYS(madvise), SCMP_SYS(mkdirat), SCMP_SYS(mknodat), @@ -88,6 +89,7 @@ static const int syscall_whitelist[] = { SCMP_SYS(renameat), SCMP_SYS(renameat2), SCMP_SYS(removexattr), + SCMP_SYS(restart_syscall), SCMP_SYS(rt_sigaction), SCMP_SYS(rt_sigprocmask), SCMP_SYS(rt_sigreturn), |