diff options
162 files changed, 7774 insertions, 3085 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index d5ff6c2498..98eddf7ae1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -140,6 +140,7 @@ F: docs/system/target-i386* F: target/i386/*.[ch] F: target/i386/Kconfig F: target/i386/meson.build +F: tools/i386/ Guest CPU cores (TCG) --------------------- @@ -1878,6 +1879,7 @@ M: Eduardo Habkost <eduardo@habkost.net> M: Marcel Apfelbaum <marcel.apfelbaum@gmail.com> R: Philippe Mathieu-Daudé <philmd@linaro.org> R: Yanan Wang <wangyanan55@huawei.com> +R: Zhao Liu <zhao1.liu@intel.com> S: Supported F: hw/core/cpu-common.c F: hw/core/cpu-sysemu.c @@ -2009,6 +2011,7 @@ F: hw/pci-bridge/* F: qapi/pci.json F: docs/pci* F: docs/specs/*pci* +F: docs/system/sriov.rst PCIE DOE M: Huai-Cheng Kuo <hchkuo@avery-design.com.tw> @@ -2208,6 +2211,7 @@ F: docs/devel/vfio-iommufd.rst vhost M: Michael S. Tsirkin <mst@redhat.com> +R: Stefano Garzarella <sgarzare@redhat.com> S: Supported F: hw/*/*vhost* F: docs/interop/vhost-user.json @@ -3218,6 +3222,7 @@ M: Eric Blake <eblake@redhat.com> M: Markus Armbruster <armbru@redhat.com> S: Supported F: qapi/*.json +F: qga/qapi-schema.json T: git https://repo.or.cz/qemu/armbru.git qapi-next QObject @@ -3398,6 +3403,12 @@ F: tests/qtest/*tpm* F: docs/specs/tpm.rst T: git https://github.com/stefanberger/qemu-tpm.git tpm-next +SPDM +M: Alistair Francis <alistair.francis@wdc.com> +S: Maintained +F: backends/spdm-socket.c +F: include/sysemu/spdm-socket.h + Checkpatch S: Odd Fixes F: scripts/checkpatch.pl @@ -3440,6 +3451,7 @@ Detached LUKS header M: Hyman Huang <yong.huang@smartx.com> S: Maintained F: tests/qemu-iotests/tests/luks-detached-header +F: docs/devel/luks-detached-header.rst D-Bus M: Marc-André Lureau <marcandre.lureau@redhat.com> @@ -3473,7 +3485,7 @@ F: qapi/crypto.json F: tests/unit/test-crypto-* F: tests/bench/benchmark-crypto-* F: tests/unit/crypto-tls-* -F: tests/unit/pkix_asn1_tab.c +F: tests/unit/pkix_asn1_tab.c.inc F: qemu.sasl Coroutines @@ -3657,6 +3669,7 @@ F: tests/uefi-test-tools/ VT-d Emulation M: Michael S. Tsirkin <mst@redhat.com> R: Jason Wang <jasowang@redhat.com> +R: Yi Liu <yi.l.liu@intel.com> S: Supported F: hw/i386/intel_iommu.c F: hw/i386/intel_iommu_internal.h diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index 64bf47a033..67b773692f 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -340,14 +340,71 @@ err: return ret; } +void kvm_park_vcpu(CPUState *cpu) +{ + struct KVMParkedVcpu *vcpu; + + trace_kvm_park_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu)); + + vcpu = g_malloc0(sizeof(*vcpu)); + vcpu->vcpu_id = kvm_arch_vcpu_id(cpu); + vcpu->kvm_fd = cpu->kvm_fd; + QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node); +} + +int kvm_unpark_vcpu(KVMState *s, unsigned long vcpu_id) +{ + struct KVMParkedVcpu *cpu; + int kvm_fd = -ENOENT; + + QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) { + if (cpu->vcpu_id == vcpu_id) { + QLIST_REMOVE(cpu, node); + kvm_fd = cpu->kvm_fd; + g_free(cpu); + } + } + + trace_kvm_unpark_vcpu(vcpu_id, kvm_fd > 0 ? "unparked" : "!found parked"); + + return kvm_fd; +} + +int kvm_create_vcpu(CPUState *cpu) +{ + unsigned long vcpu_id = kvm_arch_vcpu_id(cpu); + KVMState *s = kvm_state; + int kvm_fd; + + /* check if the KVM vCPU already exist but is parked */ + kvm_fd = kvm_unpark_vcpu(s, vcpu_id); + if (kvm_fd < 0) { + /* vCPU not parked: create a new KVM vCPU */ + kvm_fd = kvm_vm_ioctl(s, KVM_CREATE_VCPU, vcpu_id); + if (kvm_fd < 0) { + error_report("KVM_CREATE_VCPU IOCTL failed for vCPU %lu", vcpu_id); + return kvm_fd; + } + } + + cpu->kvm_fd = kvm_fd; + cpu->kvm_state = s; + cpu->vcpu_dirty = true; + cpu->dirty_pages = 0; + cpu->throttle_us_per_full = 0; + + trace_kvm_create_vcpu(cpu->cpu_index, vcpu_id, kvm_fd); + + return 0; +} + static int do_kvm_destroy_vcpu(CPUState *cpu) { KVMState *s = kvm_state; long mmap_size; - struct KVMParkedVcpu *vcpu = NULL; int ret = 0; - trace_kvm_destroy_vcpu(); + trace_kvm_destroy_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu)); ret = kvm_arch_destroy_vcpu(cpu); if (ret < 0) { @@ -373,10 +430,7 @@ static int do_kvm_destroy_vcpu(CPUState *cpu) } } - vcpu = g_malloc0(sizeof(*vcpu)); - vcpu->vcpu_id = kvm_arch_vcpu_id(cpu); - vcpu->kvm_fd = cpu->kvm_fd; - QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node); + kvm_park_vcpu(cpu); err: return ret; } @@ -389,24 +443,6 @@ void kvm_destroy_vcpu(CPUState *cpu) } } -static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id) -{ - struct KVMParkedVcpu *cpu; - - QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) { - if (cpu->vcpu_id == vcpu_id) { - int kvm_fd; - - QLIST_REMOVE(cpu, node); - kvm_fd = cpu->kvm_fd; - g_free(cpu); - return kvm_fd; - } - } - - return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id); -} - int kvm_init_vcpu(CPUState *cpu, Error **errp) { KVMState *s = kvm_state; @@ -415,19 +451,14 @@ int kvm_init_vcpu(CPUState *cpu, Error **errp) trace_kvm_init_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu)); - ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu)); + ret = kvm_create_vcpu(cpu); if (ret < 0) { - error_setg_errno(errp, -ret, "kvm_init_vcpu: kvm_get_vcpu failed (%lu)", + error_setg_errno(errp, -ret, + "kvm_init_vcpu: kvm_create_vcpu failed (%lu)", kvm_arch_vcpu_id(cpu)); goto err; } - cpu->kvm_fd = ret; - cpu->kvm_state = s; - cpu->vcpu_dirty = true; - cpu->dirty_pages = 0; - cpu->throttle_us_per_full = 0; - mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0); if (mmap_size < 0) { ret = mmap_size; @@ -3745,6 +3776,21 @@ static void kvm_set_device(Object *obj, s->device = g_strdup(value); } +static void kvm_set_kvm_rapl(Object *obj, bool value, Error **errp) +{ + KVMState *s = KVM_STATE(obj); + s->msr_energy.enable = value; +} + +static void kvm_set_kvm_rapl_socket_path(Object *obj, + const char *str, + Error **errp) +{ + KVMState *s = KVM_STATE(obj); + g_free(s->msr_energy.socket_path); + s->msr_energy.socket_path = g_strdup(str); +} + static void kvm_accel_instance_init(Object *obj) { KVMState *s = KVM_STATE(obj); @@ -3764,6 +3810,7 @@ static void kvm_accel_instance_init(Object *obj) s->xen_gnttab_max_frames = 64; s->xen_evtchn_max_pirq = 256; s->device = NULL; + s->msr_energy.enable = false; } /** @@ -3808,6 +3855,17 @@ static void kvm_accel_class_init(ObjectClass *oc, void *data) object_class_property_set_description(oc, "device", "Path to the device node to use (default: /dev/kvm)"); + object_class_property_add_bool(oc, "rapl", + NULL, + kvm_set_kvm_rapl); + object_class_property_set_description(oc, "rapl", + "Allow energy related MSRs for RAPL interface in Guest"); + + object_class_property_add_str(oc, "rapl-helper-socket", NULL, + kvm_set_kvm_rapl_socket_path); + object_class_property_set_description(oc, "rapl-helper-socket", + "Socket Path for comminucating with the Virtual MSR helper daemon"); + kvm_arch_accel_class_init(oc); } diff --git a/accel/kvm/kvm-cpus.h b/accel/kvm/kvm-cpus.h index ca40add32c..171b22fd29 100644 --- a/accel/kvm/kvm-cpus.h +++ b/accel/kvm/kvm-cpus.h @@ -22,5 +22,4 @@ bool kvm_supports_guest_debug(void); int kvm_insert_breakpoint(CPUState *cpu, int type, vaddr addr, vaddr len); int kvm_remove_breakpoint(CPUState *cpu, int type, vaddr addr, vaddr len); void kvm_remove_all_breakpoints(CPUState *cpu); - #endif /* KVM_CPUS_H */ diff --git a/accel/kvm/trace-events b/accel/kvm/trace-events index 681ccb667d..37626c1ac5 100644 --- a/accel/kvm/trace-events +++ b/accel/kvm/trace-events @@ -9,6 +9,10 @@ kvm_device_ioctl(int fd, int type, void *arg) "dev fd %d, type 0x%x, arg %p" kvm_failed_reg_get(uint64_t id, const char *msg) "Warning: Unable to retrieve ONEREG %" PRIu64 " from KVM: %s" kvm_failed_reg_set(uint64_t id, const char *msg) "Warning: Unable to set ONEREG %" PRIu64 " to KVM: %s" kvm_init_vcpu(int cpu_index, unsigned long arch_cpu_id) "index: %d id: %lu" +kvm_create_vcpu(int cpu_index, unsigned long arch_cpu_id, int kvm_fd) "index: %d, id: %lu, kvm fd: %d" +kvm_destroy_vcpu(int cpu_index, unsigned long arch_cpu_id) "index: %d id: %lu" +kvm_park_vcpu(int cpu_index, unsigned long arch_cpu_id) "index: %d id: %lu" +kvm_unpark_vcpu(unsigned long arch_cpu_id, const char *msg) "id: %lu %s" kvm_irqchip_commit_routes(void) "" kvm_irqchip_add_msi_route(char *name, int vector, int virq) "dev %s vector %d virq %d" kvm_irqchip_update_msi_route(int virq) "Updating MSI route virq=%d" @@ -25,7 +29,6 @@ kvm_dirty_ring_reaper(const char *s) "%s" kvm_dirty_ring_reap(uint64_t count, int64_t t) "reaped %"PRIu64" pages (took %"PRIi64" us)" kvm_dirty_ring_reaper_kick(const char *reason) "%s" kvm_dirty_ring_flush(int finished) "%d" -kvm_destroy_vcpu(void) "" kvm_failed_get_vcpu_mmap_size(void) "" kvm_cpu_exec(void) "" kvm_interrupt_exit_request(void) "" diff --git a/accel/stubs/tcg-stub.c b/accel/stubs/tcg-stub.c index dd890d6cf6..7f4208fddf 100644 --- a/accel/stubs/tcg-stub.c +++ b/accel/stubs/tcg-stub.c @@ -18,20 +18,6 @@ void tb_flush(CPUState *cpu) { } -int probe_access_flags(CPUArchState *env, vaddr addr, int size, - MMUAccessType access_type, int mmu_idx, - bool nonfault, void **phost, uintptr_t retaddr) -{ - g_assert_not_reached(); -} - -void *probe_access(CPUArchState *env, vaddr addr, int size, - MMUAccessType access_type, int mmu_idx, uintptr_t retaddr) -{ - /* Handled by hardware accelerator. */ - g_assert_not_reached(); -} - G_NORETURN void cpu_loop_exit(CPUState *cpu) { g_assert_not_reached(); diff --git a/backends/Kconfig b/backends/Kconfig index 2cb23f62fa..d3dbe19868 100644 --- a/backends/Kconfig +++ b/backends/Kconfig @@ -3,3 +3,7 @@ source tpm/Kconfig config IOMMUFD bool depends on VFIO + +config SPDM_SOCKET + bool + default y diff --git a/backends/iommufd.c b/backends/iommufd.c index cabd1b5002..9bc466a89c 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -18,6 +18,7 @@ #include "qemu/error-report.h" #include "monitor/monitor.h" #include "trace.h" +#include "hw/vfio/vfio-common.h" #include <sys/ioctl.h> #include <linux/iommufd.h> @@ -207,9 +208,91 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, return ret; } +bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id, + uint32_t pt_id, uint32_t flags, + uint32_t data_type, uint32_t data_len, + void *data_ptr, uint32_t *out_hwpt, + Error **errp) +{ + int ret, fd = be->fd; + struct iommu_hwpt_alloc alloc_hwpt = { + .size = sizeof(struct iommu_hwpt_alloc), + .flags = flags, + .dev_id = dev_id, + .pt_id = pt_id, + .data_type = data_type, + .data_len = data_len, + .data_uptr = (uintptr_t)data_ptr, + }; + + ret = ioctl(fd, IOMMU_HWPT_ALLOC, &alloc_hwpt); + trace_iommufd_backend_alloc_hwpt(fd, dev_id, pt_id, flags, data_type, + data_len, (uintptr_t)data_ptr, + alloc_hwpt.out_hwpt_id, ret); + if (ret) { + error_setg_errno(errp, errno, "Failed to allocate hwpt"); + return false; + } + + *out_hwpt = alloc_hwpt.out_hwpt_id; + return true; +} + +bool iommufd_backend_set_dirty_tracking(IOMMUFDBackend *be, + uint32_t hwpt_id, bool start, + Error **errp) +{ + int ret; + struct iommu_hwpt_set_dirty_tracking set_dirty = { + .size = sizeof(set_dirty), + .hwpt_id = hwpt_id, + .flags = start ? IOMMU_HWPT_DIRTY_TRACKING_ENABLE : 0, + }; + + ret = ioctl(be->fd, IOMMU_HWPT_SET_DIRTY_TRACKING, &set_dirty); + trace_iommufd_backend_set_dirty(be->fd, hwpt_id, start, ret ? errno : 0); + if (ret) { + error_setg_errno(errp, errno, + "IOMMU_HWPT_SET_DIRTY_TRACKING(hwpt_id %u) failed", + hwpt_id); + return false; + } + + return true; +} + +bool iommufd_backend_get_dirty_bitmap(IOMMUFDBackend *be, + uint32_t hwpt_id, + uint64_t iova, ram_addr_t size, + uint64_t page_size, uint64_t *data, + Error **errp) +{ + int ret; + struct iommu_hwpt_get_dirty_bitmap get_dirty_bitmap = { + .size = sizeof(get_dirty_bitmap), + .hwpt_id = hwpt_id, + .iova = iova, + .length = size, + .page_size = page_size, + .data = (uintptr_t)data, + }; + + ret = ioctl(be->fd, IOMMU_HWPT_GET_DIRTY_BITMAP, &get_dirty_bitmap); + trace_iommufd_backend_get_dirty_bitmap(be->fd, hwpt_id, iova, size, + page_size, ret ? errno : 0); + if (ret) { + error_setg_errno(errp, errno, + "IOMMU_HWPT_GET_DIRTY_BITMAP (iova: 0x%"HWADDR_PRIx + " size: 0x"RAM_ADDR_FMT") failed", iova, size); + return false; + } + + return true; +} + bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, uint32_t *type, void *data, uint32_t len, - Error **errp) + uint64_t *caps, Error **errp) { struct iommu_hw_info info = { .size = sizeof(info), @@ -225,6 +308,8 @@ bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, g_assert(type); *type = info.out_data_type; + g_assert(caps); + *caps = info.out_capabilities; return true; } @@ -237,7 +322,7 @@ static int hiod_iommufd_get_cap(HostIOMMUDevice *hiod, int cap, Error **errp) case HOST_IOMMU_DEVICE_CAP_IOMMU_TYPE: return caps->type; case HOST_IOMMU_DEVICE_CAP_AW_BITS: - return caps->aw_bits; + return vfio_device_get_aw_bits(hiod->agent); default: error_setg(errp, "%s: unsupported capability %x", hiod->name, cap); return -EINVAL; diff --git a/backends/meson.build b/backends/meson.build index 749b491f12..da714b93d1 100644 --- a/backends/meson.build +++ b/backends/meson.build @@ -33,4 +33,6 @@ endif system_ss.add(when: gio, if_true: files('dbus-vmstate.c')) system_ss.add(when: 'CONFIG_SGX', if_true: files('hostmem-epc.c')) +system_ss.add(when: 'CONFIG_SPDM_SOCKET', if_true: files('spdm-socket.c')) + subdir('tpm') diff --git a/backends/spdm-socket.c b/backends/spdm-socket.c new file mode 100644 index 0000000000..d0663d696c --- /dev/null +++ b/backends/spdm-socket.c @@ -0,0 +1,216 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* + * QEMU SPDM socket support + * + * This is based on: + * https://github.com/DMTF/spdm-emu/blob/07c0a838bcc1c6207c656ac75885c0603e344b6f/spdm_emu/spdm_emu_common/command.c + * but has been re-written to match QEMU style + * + * Copyright (c) 2021, DMTF. All rights reserved. + * Copyright (c) 2023. Western Digital Corporation or its affiliates. + */ + +#include "qemu/osdep.h" +#include "sysemu/spdm-socket.h" +#include "qapi/error.h" + +static bool read_bytes(const int socket, uint8_t *buffer, + size_t number_of_bytes) +{ + ssize_t number_received = 0; + ssize_t result; + + while (number_received < number_of_bytes) { + result = recv(socket, buffer + number_received, + number_of_bytes - number_received, 0); + if (result <= 0) { + return false; + } + number_received += result; + } + return true; +} + +static bool read_data32(const int socket, uint32_t *data) +{ + bool result; + + result = read_bytes(socket, (uint8_t *)data, sizeof(uint32_t)); + if (!result) { + return result; + } + *data = ntohl(*data); + return true; +} + +static bool read_multiple_bytes(const int socket, uint8_t *buffer, + uint32_t *bytes_received, + uint32_t max_buffer_length) +{ + uint32_t length; + bool result; + + result = read_data32(socket, &length); + if (!result) { + return result; + } + + if (length > max_buffer_length) { + return false; + } + + if (bytes_received) { + *bytes_received = length; + } + + if (length == 0) { + return true; + } + + return read_bytes(socket, buffer, length); +} + +static bool receive_platform_data(const int socket, + uint32_t transport_type, + uint32_t *command, + uint8_t *receive_buffer, + uint32_t *bytes_to_receive) +{ + bool result; + uint32_t response; + uint32_t bytes_received; + + result = read_data32(socket, &response); + if (!result) { + return result; + } + *command = response; + + result = read_data32(socket, &transport_type); + if (!result) { + return result; + } + + bytes_received = 0; + result = read_multiple_bytes(socket, receive_buffer, &bytes_received, + *bytes_to_receive); + if (!result) { + return result; + } + *bytes_to_receive = bytes_received; + + return result; +} + +static bool write_bytes(const int socket, const uint8_t *buffer, + uint32_t number_of_bytes) +{ + ssize_t number_sent = 0; + ssize_t result; + + while (number_sent < number_of_bytes) { + result = send(socket, buffer + number_sent, + number_of_bytes - number_sent, 0); + if (result == -1) { + return false; + } + number_sent += result; + } + return true; +} + +static bool write_data32(const int socket, uint32_t data) +{ + data = htonl(data); + return write_bytes(socket, (uint8_t *)&data, sizeof(uint32_t)); +} + +static bool write_multiple_bytes(const int socket, const uint8_t *buffer, + uint32_t bytes_to_send) +{ + bool result; + + result = write_data32(socket, bytes_to_send); + if (!result) { + return result; + } + + return write_bytes(socket, buffer, bytes_to_send); +} + +static bool send_platform_data(const int socket, + uint32_t transport_type, uint32_t command, + const uint8_t *send_buffer, size_t bytes_to_send) +{ + bool result; + + result = write_data32(socket, command); + if (!result) { + return result; + } + + result = write_data32(socket, transport_type); + if (!result) { + return result; + } + + return write_multiple_bytes(socket, send_buffer, bytes_to_send); +} + +int spdm_socket_connect(uint16_t port, Error **errp) +{ + int client_socket; + struct sockaddr_in server_addr; + + client_socket = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + if (client_socket < 0) { + error_setg(errp, "cannot create socket: %s", strerror(errno)); + return -1; + } + + memset((char *)&server_addr, 0, sizeof(server_addr)); + server_addr.sin_family = AF_INET; + server_addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + server_addr.sin_port = htons(port); + + + if (connect(client_socket, (struct sockaddr *)&server_addr, + sizeof(server_addr)) < 0) { + error_setg(errp, "cannot connect: %s", strerror(errno)); + close(client_socket); + return -1; + } + + return client_socket; +} + +uint32_t spdm_socket_rsp(const int socket, uint32_t transport_type, + void *req, uint32_t req_len, + void *rsp, uint32_t rsp_len) +{ + uint32_t command; + bool result; + + result = send_platform_data(socket, transport_type, + SPDM_SOCKET_COMMAND_NORMAL, + req, req_len); + if (!result) { + return 0; + } + + result = receive_platform_data(socket, transport_type, &command, + (uint8_t *)rsp, &rsp_len); + if (!result) { + return 0; + } + + assert(command != 0); + + return rsp_len; +} + +void spdm_socket_close(const int socket, uint32_t transport_type) +{ + send_platform_data(socket, transport_type, + SPDM_SOCKET_COMMAND_SHUTDOWN, NULL, 0); +} diff --git a/backends/trace-events b/backends/trace-events index 211e6f374a..40811a3162 100644 --- a/backends/trace-events +++ b/backends/trace-events @@ -14,4 +14,7 @@ iommufd_backend_map_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size iommufd_backend_unmap_dma_non_exist(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " Unmap nonexistent mapping: iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)" iommufd_backend_unmap_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)" iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas) " iommufd=%d ioas=%d" +iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id, uint32_t pt_id, uint32_t flags, uint32_t hwpt_type, uint32_t len, uint64_t data_ptr, uint32_t out_hwpt_id, int ret) " iommufd=%d dev_id=%u pt_id=%u flags=0x%x hwpt_type=%u len=%u data_ptr=0x%"PRIx64" out_hwpt=%u (%d)" iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d (%d)" +iommufd_backend_set_dirty(int iommufd, uint32_t hwpt_id, bool start, int ret) " iommufd=%d hwpt=%u enable=%d (%d)" +iommufd_backend_get_dirty_bitmap(int iommufd, uint32_t hwpt_id, uint64_t iova, uint64_t size, uint64_t page_size, int ret) " iommufd=%d hwpt=%u iova=0x%"PRIx64" size=0x%"PRIx64" page_size=0x%"PRIx64" (%d)" diff --git a/chardev/char-socket.c b/chardev/char-socket.c index 812d7aa38a..1ca9441b1b 100644 --- a/chardev/char-socket.c +++ b/chardev/char-socket.c @@ -33,6 +33,7 @@ #include "qapi/clone-visitor.h" #include "qapi/qapi-visit-sockets.h" #include "qemu/yank.h" +#include "trace.h" #include "chardev/char-io.h" #include "chardev/char-socket.h" @@ -126,6 +127,7 @@ static int tcp_chr_write(Chardev *chr, const uint8_t *buf, int len) if (ret < 0 && errno != EAGAIN) { if (tcp_chr_read_poll(chr) <= 0) { /* Perform disconnect and return error. */ + trace_chr_socket_poll_err(chr, chr->label); tcp_chr_disconnect_locked(chr); } /* else let the read handler finish it properly */ } @@ -279,15 +281,16 @@ static ssize_t tcp_chr_recv(Chardev *chr, char *buf, size_t len) size_t i; int *msgfds = NULL; size_t msgfds_num = 0; + Error *err = NULL; if (qio_channel_has_feature(s->ioc, QIO_CHANNEL_FEATURE_FD_PASS)) { ret = qio_channel_readv_full(s->ioc, &iov, 1, &msgfds, &msgfds_num, - 0, NULL); + 0, &err); } else { ret = qio_channel_readv_full(s->ioc, &iov, 1, NULL, NULL, - 0, NULL); + 0, &err); } if (msgfds_num) { @@ -322,7 +325,11 @@ static ssize_t tcp_chr_recv(Chardev *chr, char *buf, size_t len) errno = EAGAIN; ret = -1; } else if (ret == -1) { + trace_chr_socket_recv_err(chr, chr->label, error_get_pretty(err)); + error_free(err); errno = EIO; + } else if (ret == 0) { + trace_chr_socket_recv_eof(chr, chr->label); } return ret; @@ -463,6 +470,7 @@ static void tcp_chr_disconnect_locked(Chardev *chr) SocketChardev *s = SOCKET_CHARDEV(chr); bool emit_close = s->state == TCP_CHARDEV_STATE_CONNECTED; + trace_chr_socket_disconnect(chr, chr->label); tcp_chr_free_connection(chr); if (s->listener) { @@ -521,6 +529,7 @@ static gboolean tcp_chr_hup(QIOChannel *channel, void *opaque) { Chardev *chr = CHARDEV(opaque); + trace_chr_socket_hangup(chr, chr->label); tcp_chr_disconnect(chr); return G_SOURCE_REMOVE; } @@ -672,15 +681,18 @@ static gboolean tcp_chr_telnet_init_io(QIOChannel *ioc, SocketChardev *s = user_data; Chardev *chr = CHARDEV(s); TCPChardevTelnetInit *init = s->telnet_init; + Error *err = NULL; ssize_t ret; assert(init); - ret = qio_channel_write(ioc, init->buf, init->buflen, NULL); + ret = qio_channel_write(ioc, init->buf, init->buflen, &err); if (ret < 0) { if (ret == QIO_CHANNEL_ERR_BLOCK) { ret = 0; } else { + trace_chr_socket_write_err(chr, chr->label, error_get_pretty(err)); + error_free(err); tcp_chr_disconnect(chr); goto end; } @@ -765,9 +777,9 @@ static void tcp_chr_websock_handshake(QIOTask *task, gpointer user_data) Error *err = NULL; if (qio_task_propagate_error(task, &err)) { - error_reportf_err(err, - "websock handshake of character device %s failed: ", - chr->label); + trace_chr_socket_ws_handshake_err(chr, chr->label, + error_get_pretty(err)); + error_free(err); tcp_chr_disconnect(chr); } else { if (s->do_telnetopt) { @@ -805,9 +817,9 @@ static void tcp_chr_tls_handshake(QIOTask *task, Error *err = NULL; if (qio_task_propagate_error(task, &err)) { - error_reportf_err(err, - "TLS handshake of character device %s failed: ", - chr->label); + trace_chr_socket_tls_handshake_err(chr, chr->label, + error_get_pretty(err)); + error_free(err); tcp_chr_disconnect(chr); } else { if (s->is_websock) { @@ -826,19 +838,22 @@ static void tcp_chr_tls_init(Chardev *chr) SocketChardev *s = SOCKET_CHARDEV(chr); QIOChannelTLS *tioc; gchar *name; + Error *err = NULL; if (s->is_listen) { tioc = qio_channel_tls_new_server( s->ioc, s->tls_creds, s->tls_authz, - NULL); + &err); } else { tioc = qio_channel_tls_new_client( s->ioc, s->tls_creds, s->addr->u.inet.host, - NULL); + &err); } if (tioc == NULL) { + trace_chr_socket_tls_init_err(chr, chr->label, error_get_pretty(err)); + error_free(err); tcp_chr_disconnect(chr); return; } diff --git a/chardev/msmouse.c b/chardev/msmouse.c index a774c397b4..2279694cfa 100644 --- a/chardev/msmouse.c +++ b/chardev/msmouse.c @@ -81,7 +81,7 @@ static void msmouse_chr_accept_input(Chardev *chr) const uint8_t *buf; uint32_t size; - buf = fifo8_pop_buf(&mouse->outbuf, MIN(len, avail), &size); + buf = fifo8_pop_bufptr(&mouse->outbuf, MIN(len, avail), &size); qemu_chr_be_write(chr, buf, size); len = qemu_chr_be_can_write(chr); avail -= size; diff --git a/chardev/trace-events b/chardev/trace-events index 027107b0c1..7e97b8a988 100644 --- a/chardev/trace-events +++ b/chardev/trace-events @@ -17,3 +17,13 @@ spice_vmc_register_interface(void *scd) "spice vmc registered interface %p" spice_vmc_unregister_interface(void *scd) "spice vmc unregistered interface %p" spice_vmc_event(int event) "spice vmc event %d" +# char-socket.c +chr_socket_poll_err(void *chrdev, const char *label) "chardev socket poll error %p (%s)" +chr_socket_recv_err(void *chrdev, const char *label, const char *err) "chardev socket recv error %p (%s): %s" +chr_socket_recv_eof(void *chrdev, const char *label) "chardev socket recv end-of-file %p (%s)" +chr_socket_write_err(void *chrdev, const char *label, const char *err) "chardev socket write error %p (%s): %s" +chr_socket_disconnect(void *chrdev, const char *label) "chardev socket disconnect %p (%s)" +chr_socket_hangup(void *chrdev, const char *label) "chardev socket hangup %p (%s)" +chr_socket_ws_handshake_err(void *chrdev, const char *label, const char *err) "chardev socket websock handshake error %p (%s): %s" +chr_socket_tls_handshake_err(void *chrdev, const char *label, const char *err) "chardev socket TLS handshake error %p (%s): %s" +chr_socket_tls_init_err(void *chrdev, const char *label, const char *err) "chardev socket TLS init error %p (%s): %s" diff --git a/contrib/systemd/qemu-vmsr-helper.service b/contrib/systemd/qemu-vmsr-helper.service new file mode 100644 index 0000000000..8fd397bf79 --- /dev/null +++ b/contrib/systemd/qemu-vmsr-helper.service @@ -0,0 +1,15 @@ +[Unit] +Description=Virtual RAPL MSR Daemon for QEMU + +[Service] +WorkingDirectory=/tmp +Type=simple +ExecStart=/usr/bin/qemu-vmsr-helper +PrivateTmp=yes +ProtectSystem=strict +ReadWritePaths=/var/run +RestrictAddressFamilies=AF_UNIX +Restart=always +RestartSec=0 + +[Install] diff --git a/contrib/systemd/qemu-vmsr-helper.socket b/contrib/systemd/qemu-vmsr-helper.socket new file mode 100644 index 0000000000..183e8304d6 --- /dev/null +++ b/contrib/systemd/qemu-vmsr-helper.socket @@ -0,0 +1,9 @@ +[Unit] +Description=Virtual RAPL MSR helper for QEMU + +[Socket] +ListenStream=/run/qemu-vmsr-helper.sock +SocketMode=0600 + +[Install] +WantedBy=multi-user.target diff --git a/contrib/vhost-user-blk/vhost-user-blk.c b/contrib/vhost-user-blk/vhost-user-blk.c index 9492146855..6cc18a1c04 100644 --- a/contrib/vhost-user-blk/vhost-user-blk.c +++ b/contrib/vhost-user-blk/vhost-user-blk.c @@ -196,7 +196,7 @@ vub_discard_write_zeroes(VubReq *req, struct iovec *iov, uint32_t iovcnt, VubDev *vdev_blk = req->vdev_blk; desc = buf; uint64_t range[2] = { le64_to_cpu(desc->sector) << 9, - le32_to_cpu(desc->num_sectors) << 9 }; + (uint64_t)le32_to_cpu(desc->num_sectors) << 9 }; if (type == VIRTIO_BLK_T_DISCARD) { if (ioctl(vdev_blk->blk_fd, BLKDISCARD, range) == 0) { g_free(buf); diff --git a/crypto/block-luks.c b/crypto/block-luks.c index 5b777c15d3..45347adeeb 100644 --- a/crypto/block-luks.c +++ b/crypto/block-luks.c @@ -33,6 +33,7 @@ #include "qemu/uuid.h" #include "qemu/bitmap.h" +#include "qemu/range.h" /* * Reference for the LUKS format implemented here is @@ -572,7 +573,7 @@ qcrypto_block_luks_check_header(const QCryptoBlockLUKS *luks, header_sectors, slot2->stripes); - if (start1 + len1 > start2 && start2 + len2 > start1) { + if (ranges_overlap(start1, len1, start2, len2)) { error_setg(errp, "Keyslots %zu and %zu are overlapping in the header", i, j); diff --git a/crypto/init.c b/crypto/init.c index fb7f1bff10..674d237fa9 100644 --- a/crypto/init.c +++ b/crypto/init.c @@ -34,14 +34,11 @@ #include "crypto/random.h" -/* #define DEBUG_GNUTLS */ -#ifdef DEBUG_GNUTLS -static void qcrypto_gnutls_log(int level, const char *str) -{ - fprintf(stderr, "%d: %s", level, str); -} -#endif +/* + * To debug GNUTLS see env vars listed in + * https://gnutls.org/manual/html_node/Debugging-and-auditing.html + */ int qcrypto_init(Error **errp) { #ifdef CONFIG_GNUTLS @@ -53,10 +50,6 @@ int qcrypto_init(Error **errp) gnutls_strerror(ret)); return -1; } -#ifdef DEBUG_GNUTLS - gnutls_global_set_log_level(10); - gnutls_global_set_log_function(qcrypto_gnutls_log); -#endif #endif #ifdef CONFIG_GCRYPT diff --git a/crypto/tlssession.c b/crypto/tlssession.c index 1e98f44e0d..77286e23f4 100644 --- a/crypto/tlssession.c +++ b/crypto/tlssession.c @@ -44,6 +44,13 @@ struct QCryptoTLSSession { QCryptoTLSSessionReadFunc readFunc; void *opaque; char *peername; + + /* + * Allow concurrent reads and writes, so track + * errors separately + */ + Error *rerr; + Error *werr; }; @@ -54,6 +61,9 @@ qcrypto_tls_session_free(QCryptoTLSSession *session) return; } + error_free(session->rerr); + error_free(session->werr); + gnutls_deinit(session->handle); g_free(session->hostname); g_free(session->peername); @@ -67,13 +77,26 @@ static ssize_t qcrypto_tls_session_push(void *opaque, const void *buf, size_t len) { QCryptoTLSSession *session = opaque; + ssize_t ret; if (!session->writeFunc) { errno = EIO; return -1; }; - return session->writeFunc(buf, len, session->opaque); + error_free(session->werr); + session->werr = NULL; + + ret = session->writeFunc(buf, len, session->opaque, &session->werr); + if (ret == QCRYPTO_TLS_SESSION_ERR_BLOCK) { + errno = EAGAIN; + return -1; + } else if (ret < 0) { + errno = EIO; + return -1; + } else { + return ret; + } } @@ -81,13 +104,26 @@ static ssize_t qcrypto_tls_session_pull(void *opaque, void *buf, size_t len) { QCryptoTLSSession *session = opaque; + ssize_t ret; if (!session->readFunc) { errno = EIO; return -1; }; - return session->readFunc(buf, len, session->opaque); + error_free(session->rerr); + session->rerr = NULL; + + ret = session->readFunc(buf, len, session->opaque, &session->rerr); + if (ret == QCRYPTO_TLS_SESSION_ERR_BLOCK) { + errno = EAGAIN; + return -1; + } else if (ret < 0) { + errno = EIO; + return -1; + } else { + return ret; + } } #define TLS_PRIORITY_ADDITIONAL_ANON "+ANON-DH" @@ -441,23 +477,25 @@ qcrypto_tls_session_set_callbacks(QCryptoTLSSession *session, ssize_t qcrypto_tls_session_write(QCryptoTLSSession *session, const char *buf, - size_t len) + size_t len, + Error **errp) { ssize_t ret = gnutls_record_send(session->handle, buf, len); if (ret < 0) { - switch (ret) { - case GNUTLS_E_AGAIN: - errno = EAGAIN; - break; - case GNUTLS_E_INTERRUPTED: - errno = EINTR; - break; - default: - errno = EIO; - break; + if (ret == GNUTLS_E_AGAIN) { + return QCRYPTO_TLS_SESSION_ERR_BLOCK; + } else { + if (session->werr) { + error_propagate(errp, session->werr); + session->werr = NULL; + } else { + error_setg(errp, + "Cannot write to TLS channel: %s", + gnutls_strerror(ret)); + } + return -1; } - ret = -1; } return ret; @@ -467,26 +505,29 @@ qcrypto_tls_session_write(QCryptoTLSSession *session, ssize_t qcrypto_tls_session_read(QCryptoTLSSession *session, char *buf, - size_t len) + size_t len, + bool gracefulTermination, + Error **errp) { ssize_t ret = gnutls_record_recv(session->handle, buf, len); if (ret < 0) { - switch (ret) { - case GNUTLS_E_AGAIN: - errno = EAGAIN; - break; - case GNUTLS_E_INTERRUPTED: - errno = EINTR; - break; - case GNUTLS_E_PREMATURE_TERMINATION: - errno = ECONNABORTED; - break; - default: - errno = EIO; - break; + if (ret == GNUTLS_E_AGAIN) { + return QCRYPTO_TLS_SESSION_ERR_BLOCK; + } else if ((ret == GNUTLS_E_PREMATURE_TERMINATION) && + gracefulTermination){ + return 0; + } else { + if (session->rerr) { + error_propagate(errp, session->rerr); + session->rerr = NULL; + } else { + error_setg(errp, + "Cannot read from TLS channel: %s", + gnutls_strerror(ret)); + } + return -1; } - ret = -1; } return ret; @@ -512,11 +553,21 @@ qcrypto_tls_session_handshake(QCryptoTLSSession *session, ret == GNUTLS_E_AGAIN) { ret = 1; } else { - error_setg(errp, "TLS handshake failed: %s", - gnutls_strerror(ret)); + if (session->rerr || session->werr) { + error_setg(errp, "TLS handshake failed: %s: %s", + gnutls_strerror(ret), + error_get_pretty(session->rerr ? + session->rerr : session->werr)); + } else { + error_setg(errp, "TLS handshake failed: %s", + gnutls_strerror(ret)); + } ret = -1; } } + error_free(session->rerr); + error_free(session->werr); + session->rerr = session->werr = NULL; return ret; } @@ -605,9 +656,10 @@ qcrypto_tls_session_set_callbacks( ssize_t qcrypto_tls_session_write(QCryptoTLSSession *sess, const char *buf, - size_t len) + size_t len, + Error **errp) { - errno = -EIO; + error_setg(errp, "TLS requires GNUTLS support"); return -1; } @@ -615,9 +667,11 @@ qcrypto_tls_session_write(QCryptoTLSSession *sess, ssize_t qcrypto_tls_session_read(QCryptoTLSSession *sess, char *buf, - size_t len) + size_t len, + bool gracefulTermination, + Error **errp) { - errno = -EIO; + error_setg(errp, "TLS requires GNUTLS support"); return -1; } diff --git a/docs/about/emulation.rst b/docs/about/emulation.rst index b5ff9c5f69..3bfe8cc14a 100644 --- a/docs/about/emulation.rst +++ b/docs/about/emulation.rst @@ -42,7 +42,7 @@ depending on the guest architecture. - :ref:`Yes<QEMU-PC-System-emulator>` - Yes - The ubiquitous desktop PC CPU architecture, 32 and 64 bit. - * - Loongarch + * - LoongArch - Yes - Yes - A MIPS-like 64bit RISC architecture developed in China diff --git a/docs/devel/crypto.rst b/docs/devel/crypto.rst new file mode 100644 index 0000000000..39b1c910e7 --- /dev/null +++ b/docs/devel/crypto.rst @@ -0,0 +1,10 @@ +.. _crypto-ref: + +==================== +Cryptography in QEMU +==================== + +.. toctree:: + :maxdepth: 2 + + luks-detached-header diff --git a/docs/devel/index-internals.rst b/docs/devel/index-internals.rst index 5636e9cf1d..4ac7725d72 100644 --- a/docs/devel/index-internals.rst +++ b/docs/devel/index-internals.rst @@ -20,3 +20,4 @@ Details about QEMU's various subsystems including how to add features to them. vfio-iommufd writing-monitor-commands virtio-backends + crypto diff --git a/docs/devel/luks-detached-header.rst b/docs/devel/luks-detached-header.rst new file mode 100644 index 0000000000..94ec285c27 --- /dev/null +++ b/docs/devel/luks-detached-header.rst @@ -0,0 +1,182 @@ +================================ +LUKS volume with detached header +================================ + +Introduction +============ + +This document gives an overview of the design of LUKS volume with detached +header and how to use it. + +Background +========== + +The LUKS format has ability to store the header in a separate volume from +the payload. We could extend the LUKS driver in QEMU to support this use +case. + +Normally a LUKS volume has a layout: + +:: + + +-----------------------------------------------+ + | | | | + disk | header | key material | disk payload data | + | | | | + +-----------------------------------------------+ + +With a detached LUKS header, you need 2 disks so getting: + +:: + + +--------------------------+ + disk1 | header | key material | + +--------------------------+ + +---------------------+ + disk2 | disk payload data | + +---------------------+ + +There are a variety of benefits to doing this: + + * Secrecy - the disk2 cannot be identified as containing LUKS + volume since there's no header + * Control - if access to the disk1 is restricted, then even + if someone has access to disk2 they can't unlock + it. Might be useful if you have disks on NFS but + want to restrict which host can launch a VM + instance from it, by dynamically providing access + to the header to a designated host + * Flexibility - your application data volume may be a given + size and it is inconvenient to resize it to + add encryption.You can store the LUKS header + separately and use the existing storage + volume for payload + * Recovery - corruption of a bit in the header may make the + entire payload inaccessible. It might be + convenient to take backups of the header. If + your primary disk header becomes corrupt, you + can unlock the data still by pointing to the + backup detached header + +Architecture +============ + +Take the qcow2 encryption, for example. The architecture of the +LUKS volume with detached header is shown in the diagram below. + +There are two children of the root node: a file and a header. +Data from the disk payload is stored in the file node. The +LUKS header and key material are located in the header node, +as previously mentioned. + +:: + + +-----------------------------+ + Root node | foo[luks] | + +-----------------------------+ + | | + file | header | + | | + +---------------------+ +------------------+ + Child node |payload-format[qcow2]| |header-format[raw]| + +---------------------+ +------------------+ + | | + file | file | + | | + +----------------------+ +---------------------+ + Child node |payload-protocol[file]| |header-protocol[file]| + +----------------------+ +---------------------+ + | | + | | + | | + Host storage Host storage + +Usage +===== + +Create a LUKS disk with a detached header using qemu-img +-------------------------------------------------------- + +Shell commandline:: + + # qemu-img create --object secret,id=sec0,data=abc123 -f luks \ + -o cipher-alg=aes-256,cipher-mode=xts -o key-secret=sec0 \ + -o detached-header=true test-header.img + # qemu-img create -f qcow2 test-payload.qcow2 200G + # qemu-img info 'json:{"driver":"luks","file":{"filename": \ + "test-payload.img"},"header":{"filename":"test-header.img"}}' + +Set up a VM's LUKS volume with a detached header +------------------------------------------------ + +Qemu commandline:: + + # qemu-system-x86_64 ... \ + -object '{"qom-type":"secret","id":"libvirt-3-format-secret", \ + "data":"abc123"}' \ + -blockdev '{"driver":"file","filename":"/path/to/test-header.img", \ + "node-name":"libvirt-1-storage"}' \ + -blockdev '{"node-name":"libvirt-1-format","read-only":false, \ + "driver":"raw","file":"libvirt-1-storage"}' \ + -blockdev '{"driver":"file","filename":"/path/to/test-payload.qcow2", \ + "node-name":"libvirt-2-storage"}' \ + -blockdev '{"node-name":"libvirt-2-format","read-only":false, \ + "driver":"qcow2","file":"libvirt-2-storage"}' \ + -blockdev '{"node-name":"libvirt-3-format","driver":"luks", \ + "file":"libvirt-2-format","header":"libvirt-1-format","key-secret": \ + "libvirt-3-format-secret"}' \ + -device '{"driver":"virtio-blk-pci","bus":XXX,"addr":YYY,"drive": \ + "libvirt-3-format","id":"virtio-disk1"}' + +Add LUKS volume to a VM with a detached header +---------------------------------------------- + +1. object-add the secret for decrypting the cipher stored in + LUKS header above:: + + # virsh qemu-monitor-command vm '{"execute":"object-add", \ + "arguments":{"qom-type":"secret", "id": \ + "libvirt-4-format-secret", "data":"abc123"}}' + +2. block-add the protocol node for LUKS header:: + + # virsh qemu-monitor-command vm '{"execute":"blockdev-add", \ + "arguments":{"node-name":"libvirt-1-storage", "driver":"file", \ + "filename": "/path/to/test-header.img" }}' + +3. block-add the raw-drived node for LUKS header:: + + # virsh qemu-monitor-command vm '{"execute":"blockdev-add", \ + "arguments":{"node-name":"libvirt-1-format", "driver":"raw", \ + "file":"libvirt-1-storage"}}' + +4. block-add the protocol node for disk payload image:: + + # virsh qemu-monitor-command vm '{"execute":"blockdev-add", \ + "arguments":{"node-name":"libvirt-2-storage", "driver":"file", \ + "filename":"/path/to/test-payload.qcow2"}}' + +5. block-add the qcow2-drived format node for disk payload data:: + + # virsh qemu-monitor-command vm '{"execute":"blockdev-add", \ + "arguments":{"node-name":"libvirt-2-format", "driver":"qcow2", \ + "file":"libvirt-2-storage"}}' + +6. block-add the luks-drived format node to link the qcow2 disk + with the LUKS header by specifying the field "header":: + + # virsh qemu-monitor-command vm '{"execute":"blockdev-add", \ + "arguments":{"node-name":"libvirt-3-format", "driver":"luks", \ + "file":"libvirt-2-format", "header":"libvirt-1-format", \ + "key-secret":"libvirt-2-format-secret"}}' + +7. hot-plug the virtio-blk device finally:: + + # virsh qemu-monitor-command vm '{"execute":"device_add", \ + "arguments": {"driver":"virtio-blk-pci", \ + "drive": "libvirt-3-format", "id":"virtio-disk2"}} + +TODO +==== + +1. Support the shared detached LUKS header within the VM. diff --git a/docs/interop/firmware.json b/docs/interop/firmware.json index 54a1fc6c10..57f55f6c54 100644 --- a/docs/interop/firmware.json +++ b/docs/interop/firmware.json @@ -14,8 +14,10 @@ # = Firmware ## -{ 'include' : 'machine.json' } -{ 'include' : 'block-core.json' } +{ 'pragma': { + 'member-name-exceptions': [ + 'FirmwareArchitecture' # x86_64 + ] } } ## # @FirmwareOSInterface: @@ -61,6 +63,27 @@ 'data' : [ 'flash', 'kernel', 'memory' ] } ## +# @FirmwareArchitecture: +# +# Enumeration of architectures for which Qemu uses additional +# firmware files. +# +# @aarch64: 64-bit Arm. +# +# @arm: 32-bit Arm. +# +# @i386: 32-bit x86. +# +# @loongarch64: 64-bit LoongArch. (since: 7.1) +# +# @x86_64: 64-bit x86. +# +# Since: 3.0 +## +{ 'enum' : 'FirmwareArchitecture', + 'data' : [ 'aarch64', 'arm', 'i386', 'loongarch64', 'x86_64' ] } + +## # @FirmwareTarget: # # Defines the machine types that firmware may execute on. @@ -81,7 +104,7 @@ # Since: 3.0 ## { 'struct' : 'FirmwareTarget', - 'data' : { 'architecture' : 'SysEmuTarget', + 'data' : { 'architecture' : 'FirmwareArchitecture', 'machines' : [ 'str' ] } } ## @@ -201,6 +224,20 @@ 'verbose-dynamic', 'verbose-static' ] } ## +# @FirmwareFormat: +# +# Formats that are supported for firmware images. +# +# @raw: Raw disk image format. +# +# @qcow2: The QCOW2 image format. +# +# Since: 3.0 +## +{ 'enum': 'FirmwareFormat', + 'data': [ 'raw', 'qcow2' ] } + +## # @FirmwareFlashFile: # # Defines common properties that are necessary for loading a firmware @@ -219,7 +256,7 @@ ## { 'struct' : 'FirmwareFlashFile', 'data' : { 'filename' : 'str', - 'format' : 'BlockdevDriver' } } + 'format' : 'FirmwareFormat' } } ## @@ -433,7 +470,7 @@ # # Since: 3.0 # -# Examples: +# .. qmp-example:: # # { # "description": "SeaBIOS", diff --git a/docs/interop/qemu-ga.rst b/docs/interop/qemu-ga.rst index 72fb75a6f5..9c7380896a 100644 --- a/docs/interop/qemu-ga.rst +++ b/docs/interop/qemu-ga.rst @@ -28,11 +28,30 @@ configuration options on the command line. For the same key, the last option wins, but the lists accumulate (see below for configuration file format). +If an allowed RPCs list is defined in the configuration, then all +RPCs will be blocked by default, except for the allowed list. + +If a blocked RPCs list is defined in the configuration, then all +RPCs will be allowed by default, except for the blocked list. + +If both allowed and blocked RPCs lists are defined in the configuration, +then all RPCs will be blocked by default, then the allowed list will +be applied, followed by the blocked list. + +While filesystems are frozen, all except for a designated safe set +of RPCs will blocked, regardless of what the general configuration +declares. + Options ------- .. program:: qemu-ga +.. option:: -c, --config=PATH + + Configuration file path (the default is |CONFDIR|\ ``/qemu-ga.conf``, + unless overriden by the QGA_CONF environment variable) + .. option:: -m, --method=METHOD Transport method: one of ``unix-listen``, ``virtio-serial``, or @@ -131,6 +150,7 @@ fsfreeze-hook string statedir string verbose boolean block-rpcs string list +allow-rpcs string list ============= =========== See also diff --git a/docs/specs/acpi_hw_reduced_hotplug.rst b/docs/specs/acpi_hw_reduced_hotplug.rst index 0bd3f9399f..3acd6fcd8b 100644 --- a/docs/specs/acpi_hw_reduced_hotplug.rst +++ b/docs/specs/acpi_hw_reduced_hotplug.rst @@ -64,7 +64,8 @@ GED IO interface (4 byte access) 0: Memory hotplug event 1: System power down event 2: NVDIMM hotplug event - 3-31: Reserved + 3: CPU hotplug event + 4-31: Reserved **write_access:** diff --git a/docs/specs/index.rst b/docs/specs/index.rst index 1484e3e760..be899b49c2 100644 --- a/docs/specs/index.rst +++ b/docs/specs/index.rst @@ -29,7 +29,9 @@ guest hardware that is specific to QEMU. edu ivshmem-spec pvpanic + spdm standard-vga virt-ctlr vmcoreinfo vmgenid + rapl-msr diff --git a/docs/specs/rapl-msr.rst b/docs/specs/rapl-msr.rst new file mode 100644 index 0000000000..1202ee89be --- /dev/null +++ b/docs/specs/rapl-msr.rst @@ -0,0 +1,155 @@ +================ +RAPL MSR support +================ + +The RAPL interface (Running Average Power Limit) is advertising the accumulated +energy consumption of various power domains (e.g. CPU packages, DRAM, etc.). + +The consumption is reported via MSRs (model specific registers) like +MSR_PKG_ENERGY_STATUS for the CPU package power domain. These MSRs are 64 bits +registers that represent the accumulated energy consumption in micro Joules. + +Thanks to the MSR Filtering patch [#a]_ not all MSRs are handled by KVM. Some +of them can now be handled by the userspace (QEMU). It uses a mechanism called +"MSR filtering" where a list of MSRs is given at init time of a VM to KVM so +that a callback is put in place. The design of this patch uses only this +mechanism for handling the MSRs between guest/host. + +At the moment the following MSRs are involved: + +.. code:: C + + #define MSR_RAPL_POWER_UNIT 0x00000606 + #define MSR_PKG_POWER_LIMIT 0x00000610 + #define MSR_PKG_ENERGY_STATUS 0x00000611 + #define MSR_PKG_POWER_INFO 0x00000614 + +The ``*_POWER_UNIT``, ``*_POWER_LIMIT``, ``*_POWER INFO`` are part of the RAPL +spec and specify the power limit of the package, provide range of parameter(min +power, max power,..) and also the information of the multiplier for the energy +counter to calculate the power. Those MSRs are populated once at the beginning +by reading the host CPU MSRs and are given back to the guest 1:1 when +requested. + +The MSR_PKG_ENERGY_STATUS is a counter; it represents the total amount of +energy consumed since the last time the register was cleared. If you multiply +it with the UNIT provided above you'll get the power in micro-joules. This +counter is always increasing and it increases more or less faster depending on +the consumption of the package. This counter is supposed to overflow at some +point. + +Each core belonging to the same Package reading the MSR_PKG_ENERGY_STATUS (i.e +"rdmsr 0x611") will retrieve the same value. The value represents the energy +for the whole package. Whatever Core reading it will get the same value and a +core that belongs to PKG-0 will not be able to get the value of PKG-1 and +vice-versa. + +High level implementation +------------------------- + +In order to update the value of the virtual MSR, a QEMU thread is created. +The thread is basically just an infinity loop that does: + +1. Snapshot of the time metrics of all QEMU threads (Time spent scheduled in + Userspace and System) + +2. Snapshot of the actual MSR_PKG_ENERGY_STATUS counter of all packages where + the QEMU threads are running on. + +3. Sleep for 1 second - During this pause the vcpu and other non-vcpu threads + will do what they have to do and so the energy counter will increase. + +4. Repeat 2. and 3. and calculate the delta of every metrics representing the + time spent scheduled for each QEMU thread *and* the energy spent by the + packages during the pause. + +5. Filter the vcpu threads and the non-vcpu threads. + +6. Retrieve the topology of the Virtual Machine. This helps identify which + vCPU is running on which virtual package. + +7. The total energy spent by the non-vcpu threads is divided by the number + of vcpu threads so that each vcpu thread will get an equal part of the + energy spent by the QEMU workers. + +8. Calculate the ratio of energy spent per vcpu threads. + +9. Calculate the energy for each virtual package. + +10. The virtual MSRs are updated for each virtual package. Each vCPU that + belongs to the same package will return the same value when accessing the + the MSR. + +11. Loop back to 1. + +Ratio calculation +----------------- + +In Linux, a process has an execution time associated with it. The scheduler is +dividing the time in clock ticks. The number of clock ticks per second can be +found by the sysconf system call. A typical value of clock ticks per second is +100. So a core can run a process at the maximum of 100 ticks per second. If a +package has 4 cores, 400 ticks maximum can be scheduled on all the cores +of the package for a period of 1 second. + +The /proc/[pid]/stat [#b]_ is a sysfs file that can give the executed time of a +process with the [pid] as the process ID. It gives the amount of ticks the +process has been scheduled in userspace (utime) and kernel space (stime). + +By reading those metrics for a thread, one can calculate the ratio of time the +package has spent executing the thread. + +Example: + +A 4 cores package can schedule a maximum of 400 ticks per second with 100 ticks +per second per core. If a thread was scheduled for 100 ticks between a second +on this package, that means my thread has been scheduled for 1/4 of the whole +package. With that, the calculation of the energy spent by the thread on this +package during this whole second is 1/4 of the total energy spent by the +package. + +Usage +----- + +Currently this feature is only working on an Intel CPU that has the RAPL driver +mounted and available in the sysfs. if not, QEMU fails at start-up. + +This feature is activated with -accel +kvm,rapl=true,rapl-helper-socket=/path/sock.sock + +It is important that the socket path is the same as the one +:program:`qemu-vmsr-helper` is listening to. + +qemu-vmsr-helper +---------------- + +The qemu-vmsr-helper is working very much like the qemu-pr-helper. Instead of +making persistent reservation, qemu-vmsr-helper is here to overcome the +CVE-2020-8694 which remove user access to the rapl msr attributes. + +A socket communication is established between QEMU processes that has the RAPL +MSR support activated and the qemu-vmsr-helper. A systemd service and socket +activation is provided in contrib/systemd/qemu-vmsr-helper.(service/socket). + +The systemd socket uses 600, like contrib/systemd/qemu-pr-helper.socket. The +socket can be passed via SCM_RIGHTS by libvirt, or its permissions can be +changed (e.g. 660 and root:kvm for a Debian system for example). Libvirt could +also start a separate helper if needed. All in all, the policy is left to the +user. + +See the qemu-pr-helper documentation or manpage for further details. + +Current Limitations +------------------- + +- Works only on Intel host CPUs because AMD CPUs are using different MSR + addresses. + +- Only the Package Power-Plane (MSR_PKG_ENERGY_STATUS) is reported at the + moment. + +References +---------- + +.. [#a] https://patchwork.kernel.org/project/kvm/patch/20200916202951.23760-7-graf@amazon.com/ +.. [#b] https://man7.org/linux/man-pages/man5/proc.5.html diff --git a/docs/specs/spdm.rst b/docs/specs/spdm.rst new file mode 100644 index 0000000000..f7de080ff0 --- /dev/null +++ b/docs/specs/spdm.rst @@ -0,0 +1,134 @@ +====================================================== +QEMU Security Protocols and Data Models (SPDM) Support +====================================================== + +SPDM enables authentication, attestation and key exchange to assist in +providing infrastructure security enablement. It's a standard published +by the `DMTF`_. + +QEMU supports connecting to a SPDM responder implementation. This allows an +external application to emulate the SPDM responder logic for an SPDM device. + +Setting up a SPDM server +======================== + +When using QEMU with SPDM devices QEMU will connect to a server which +implements the SPDM functionality. + +SPDM-Utils +---------- + +You can use `SPDM Utils`_ to emulate a responder. This is the simplest method. + +SPDM-Utils is a Linux applications to manage, test and develop devices +supporting DMTF Security Protocol and Data Model (SPDM). It is written in Rust +and utilises libspdm. + +To use SPDM-Utils you will need to do the following steps. Details are included +in the SPDM-Utils README. + + 1. `Build libspdm`_ + 2. `Build SPDM Utils`_ + 3. `Run it as a server`_ + +spdm-emu +-------- + +You can use `spdm emu`_ to model the +SPDM responder. + +.. code-block:: shell + + $ cd spdm-emu + $ git submodule init; git submodule update --recursive + $ mkdir build; cd build + $ cmake -DARCH=x64 -DTOOLCHAIN=GCC -DTARGET=Debug -DCRYPTO=openssl .. + $ make -j32 + $ make copy_sample_key # Build certificates, required for SPDM authentication. + +It is worth noting that the certificates should be in compliance with +PCIe r6.1 sec 6.31.3. This means you will need to add the following to +openssl.cnf + +.. code-block:: + + subjectAltName = otherName:2.23.147;UTF8:Vendor=1b36:Device=0010:CC=010802:REV=02:SSVID=1af4:SSID=1100 + 2.23.147 = ASN1:OID:2.23.147 + +and then manually regenerate some certificates with: + +.. code-block:: shell + + $ openssl req -nodes -newkey ec:param.pem -keyout end_responder.key \ + -out end_responder.req -sha384 -batch \ + -subj "/CN=DMTF libspdm ECP384 responder cert" + + $ openssl x509 -req -in end_responder.req -out end_responder.cert \ + -CA inter.cert -CAkey inter.key -sha384 -days 3650 -set_serial 3 \ + -extensions v3_end -extfile ../openssl.cnf + + $ openssl asn1parse -in end_responder.cert -out end_responder.cert.der + + $ cat ca.cert.der inter.cert.der end_responder.cert.der > bundle_responder.certchain.der + +You can use SPDM-Utils instead as it will generate the correct certificates +automatically. + +The responder can then be launched with + +.. code-block:: shell + + $ cd bin + $ ./spdm_responder_emu --trans PCI_DOE + +Connecting an SPDM NVMe device +============================== + +Once a SPDM server is running we can start QEMU and connect to the server. + +For an NVMe device first let's setup a block we can use + +.. code-block:: shell + + $ cd qemu-spdm/linux/image + $ dd if=/dev/zero of=blknvme bs=1M count=2096 # 2GB NNMe Drive + +Then you can add this to your QEMU command line: + +.. code-block:: shell + + -drive file=blknvme,if=none,id=mynvme,format=raw \ + -device nvme,drive=mynvme,serial=deadbeef,spdm_port=2323 + +At which point QEMU will try to connect to the SPDM server. + +Note that if using x64-64 you will want to use the q35 machine instead +of the default. So the entire QEMU command might look like this + +.. code-block:: shell + + qemu-system-x86_64 -M q35 \ + --kernel bzImage \ + -drive file=rootfs.ext2,if=virtio,format=raw \ + -append "root=/dev/vda console=ttyS0" \ + -net none -nographic \ + -drive file=blknvme,if=none,id=mynvme,format=raw \ + -device nvme,drive=mynvme,serial=deadbeef,spdm_port=2323 + +.. _DMTF: + https://www.dmtf.org/standards/SPDM + +.. _SPDM Utils: + https://github.com/westerndigitalcorporation/spdm-utils + +.. _spdm emu: + https://github.com/dmtf/spdm-emu + +.. _Build libspdm: + https://github.com/westerndigitalcorporation/spdm-utils?tab=readme-ov-file#build-libspdm + +.. _Build SPDM Utils: + https://github.com/westerndigitalcorporation/spdm-utils?tab=readme-ov-file#build-the-binary + +.. _Run it as a server: + https://github.com/westerndigitalcorporation/spdm-utils#qemu-spdm-device-emulation diff --git a/docs/system/index.rst b/docs/system/index.rst index c21065e519..718e9d3c56 100644 --- a/docs/system/index.rst +++ b/docs/system/index.rst @@ -39,3 +39,4 @@ or Hypervisor.Framework. multi-process confidential-guest-support vm-templating + sriov diff --git a/docs/system/sriov.rst b/docs/system/sriov.rst new file mode 100644 index 0000000000..a851a66a4b --- /dev/null +++ b/docs/system/sriov.rst @@ -0,0 +1,36 @@ +.. SPDX-License-Identifier: GPL-2.0-or-later + +Compsable SR-IOV device +======================= + +SR-IOV (Single Root I/O Virtualization) is an optional extended capability of a +PCI Express device. It allows a single physical function (PF) to appear as +multiple virtual functions (VFs) for the main purpose of eliminating software +overhead in I/O from virtual machines. + +There are devices with predefined SR-IOV configurations, but it is also possible +to compose an SR-IOV device yourself. Composing an SR-IOV device is currently +only supported by virtio-net-pci. + +Users can configure an SR-IOV-capable virtio-net device by adding +virtio-net-pci functions to a bus. Below is a command line example: + +.. code-block:: shell + + -netdev user,id=n -netdev user,id=o + -netdev user,id=p -netdev user,id=q + -device pcie-root-port,id=b + -device virtio-net-pci,bus=b,addr=0x0.0x3,netdev=q,sriov-pf=f + -device virtio-net-pci,bus=b,addr=0x0.0x2,netdev=p,sriov-pf=f + -device virtio-net-pci,bus=b,addr=0x0.0x1,netdev=o,sriov-pf=f + -device virtio-net-pci,bus=b,addr=0x0.0x0,netdev=n,id=f + +The VFs specify the paired PF with ``sriov-pf`` property. The PF must be +added after all VFs. It is the user's responsibility to ensure that VFs have +function numbers larger than one of the PF, and that the function numbers +have a consistent stride. + +You may also need to perform additional steps to activate the SR-IOV feature on +your guest. For Linux, refer to [1]_. + +.. [1] https://docs.kernel.org/PCI/pci-iov-howto.html diff --git a/docs/tools/index.rst b/docs/tools/index.rst index 8e65ce0dfc..33ad438e86 100644 --- a/docs/tools/index.rst +++ b/docs/tools/index.rst @@ -16,3 +16,4 @@ command line utilities and other standalone programs. qemu-pr-helper qemu-trace-stap virtfs-proxy-helper + qemu-vmsr-helper diff --git a/docs/tools/qemu-vmsr-helper.rst b/docs/tools/qemu-vmsr-helper.rst new file mode 100644 index 0000000000..6ec87b49d9 --- /dev/null +++ b/docs/tools/qemu-vmsr-helper.rst @@ -0,0 +1,89 @@ +================================== +QEMU virtual RAPL MSR helper +================================== + +Synopsis +-------- + +**qemu-vmsr-helper** [*OPTION*] + +Description +----------- + +Implements the virtual RAPL MSR helper for QEMU. + +Accessing the RAPL (Running Average Power Limit) MSR enables the RAPL powercap +driver to advertise and monitor the power consumption or accumulated energy +consumption of different power domains, such as CPU packages, DRAM, and other +components when available. + +However those register are accesible under priviliged access (CAP_SYS_RAWIO). +QEMU can use an external helper to access those priviliged register. + +:program:`qemu-vmsr-helper` is that external helper; it creates a listener +socket which will accept incoming connections for communication with QEMU. + +If you want to run VMs in a setup like this, this helper should be started as a +system service, and you should read the QEMU manual section on "RAPL MSR +support" to find out how to configure QEMU to connect to the socket created by +:program:`qemu-vmsr-helper`. + +After connecting to the socket, :program:`qemu-vmsr-helper` can +optionally drop root privileges, except for those capabilities that +are needed for its operation. + +:program:`qemu-vmsr-helper` can also use the systemd socket activation +protocol. In this case, the systemd socket unit should specify a +Unix stream socket, like this:: + + [Socket] + ListenStream=/var/run/qemu-vmsr-helper.sock + +Options +------- + +.. program:: qemu-vmsr-helper + +.. option:: -d, --daemon + + run in the background (and create a PID file) + +.. option:: -q, --quiet + + decrease verbosity + +.. option:: -v, --verbose + + increase verbosity + +.. option:: -f, --pidfile=PATH + + PID file when running as a daemon. By default the PID file + is created in the system runtime state directory, for example + :file:`/var/run/qemu-vmsr-helper.pid`. + +.. option:: -k, --socket=PATH + + path to the socket. By default the socket is created in + the system runtime state directory, for example + :file:`/var/run/qemu-vmsr-helper.sock`. + +.. option:: -T, --trace [[enable=]PATTERN][,events=FILE][,file=FILE] + + .. include:: ../qemu-option-trace.rst.inc + +.. option:: -u, --user=USER + + user to drop privileges to + +.. option:: -g, --group=GROUP + + group to drop privileges to + +.. option:: -h, --help + + Display a help message and exit. + +.. option:: -V, --version + + Display version information and exit. diff --git a/dump/dump.c b/dump/dump.c index 84064d890d..45e84428ae 100644 --- a/dump/dump.c +++ b/dump/dump.c @@ -30,6 +30,7 @@ #include "migration/blocker.h" #include "hw/core/cpu.h" #include "win_dump.h" +#include "qemu/range.h" #include <zlib.h> #ifdef CONFIG_LZO @@ -574,8 +575,10 @@ static void get_offset_range(hwaddr phys_addr, QTAILQ_FOREACH(block, &s->guest_phys_blocks.head, next) { if (dump_has_filter(s)) { - if (block->target_start >= s->filter_area_begin + s->filter_area_length || - block->target_end <= s->filter_area_begin) { + if (!ranges_overlap(block->target_start, + block->target_end - block->target_start, + s->filter_area_begin, + s->filter_area_length)) { /* This block is out of the range */ continue; } @@ -734,8 +737,9 @@ int64_t dump_filtered_memblock_start(GuestPhysBlock *block, { if (filter_area_length) { /* return -1 if the block is not within filter area */ - if (block->target_start >= filter_area_start + filter_area_length || - block->target_end <= filter_area_start) { + if (!ranges_overlap(block->target_start, + block->target_end - block->target_start, + filter_area_start, filter_area_length)) { return -1; } diff --git a/gdbstub/gdbstub.c b/gdbstub/gdbstub.c index b7be8e5a44..d08568cea0 100644 --- a/gdbstub/gdbstub.c +++ b/gdbstub/gdbstub.c @@ -618,6 +618,19 @@ void gdb_register_coprocessor(CPUState *cpu, } } +void gdb_unregister_coprocessor_all(CPUState *cpu) +{ + /* + * Safe to nuke everything. GDBRegisterState::xml is static const char so + * it won't be freed + */ + g_array_free(cpu->gdb_regs, true); + + cpu->gdb_regs = NULL; + cpu->gdb_num_regs = 0; + cpu->gdb_num_g_regs = 0; +} + static void gdb_process_breakpoint_remove_all(GDBProcess *p) { CPUState *cpu = gdb_get_first_cpu_in_process(p); diff --git a/hw/acpi/acpi-cpu-hotplug-stub.c b/hw/acpi/acpi-cpu-hotplug-stub.c index 3fc4b14c26..c6c61bb9cd 100644 --- a/hw/acpi/acpi-cpu-hotplug-stub.c +++ b/hw/acpi/acpi-cpu-hotplug-stub.c @@ -19,6 +19,12 @@ void legacy_acpi_cpu_hotplug_init(MemoryRegion *parent, Object *owner, return; } +void cpu_hotplug_hw_init(MemoryRegion *as, Object *owner, + CPUHotplugState *state, hwaddr base_addr) +{ + return; +} + void acpi_cpu_ospm_status(CPUHotplugState *cpu_st, ACPIOSTInfoList ***list) { return; diff --git a/hw/acpi/cpu.c b/hw/acpi/cpu.c index 2d81c1e790..5cb60ca8bc 100644 --- a/hw/acpi/cpu.c +++ b/hw/acpi/cpu.c @@ -7,7 +7,6 @@ #include "trace.h" #include "sysemu/numa.h" -#define ACPI_CPU_HOTPLUG_REG_LEN 12 #define ACPI_CPU_SELECTOR_OFFSET_WR 0 #define ACPI_CPU_FLAGS_OFFSET_RW 4 #define ACPI_CPU_CMD_OFFSET_WR 5 @@ -339,9 +338,10 @@ const VMStateDescription vmstate_cpu_hotplug = { #define CPU_FW_EJECT_EVENT "CEJF" void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts, - build_madt_cpu_fn build_madt_cpu, hwaddr io_base, + build_madt_cpu_fn build_madt_cpu, hwaddr base_addr, const char *res_root, - const char *event_handler_method) + const char *event_handler_method, + AmlRegionSpace rs) { Aml *ifctx; Aml *field; @@ -365,14 +365,22 @@ void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts, aml_name_decl("_UID", aml_string("CPU Hotplug resources"))); aml_append(cpu_ctrl_dev, aml_mutex(CPU_LOCK, 0)); + assert((rs == AML_SYSTEM_IO) || (rs == AML_SYSTEM_MEMORY)); + crs = aml_resource_template(); - aml_append(crs, aml_io(AML_DECODE16, io_base, io_base, 1, + if (rs == AML_SYSTEM_IO) { + aml_append(crs, aml_io(AML_DECODE16, base_addr, base_addr, 1, ACPI_CPU_HOTPLUG_REG_LEN)); + } else if (rs == AML_SYSTEM_MEMORY) { + aml_append(crs, aml_memory32_fixed(base_addr, + ACPI_CPU_HOTPLUG_REG_LEN, AML_READ_WRITE)); + } + aml_append(cpu_ctrl_dev, aml_name_decl("_CRS", crs)); /* declare CPU hotplug MMIO region with related access fields */ aml_append(cpu_ctrl_dev, - aml_operation_region("PRST", AML_SYSTEM_IO, aml_int(io_base), + aml_operation_region("PRST", rs, aml_int(base_addr), ACPI_CPU_HOTPLUG_REG_LEN)); field = aml_field("PRST", AML_BYTE_ACC, AML_NOLOCK, diff --git a/hw/acpi/generic_event_device.c b/hw/acpi/generic_event_device.c index 2d6e91b124..15b4c3ebbf 100644 --- a/hw/acpi/generic_event_device.c +++ b/hw/acpi/generic_event_device.c @@ -25,6 +25,7 @@ static const uint32_t ged_supported_events[] = { ACPI_GED_MEM_HOTPLUG_EVT, ACPI_GED_PWR_DOWN_EVT, ACPI_GED_NVDIMM_HOTPLUG_EVT, + ACPI_GED_CPU_HOTPLUG_EVT, }; /* @@ -107,6 +108,9 @@ void build_ged_aml(Aml *table, const char *name, HotplugHandler *hotplug_dev, aml_append(if_ctx, aml_call0(MEMORY_DEVICES_CONTAINER "." MEMORY_SLOT_SCAN_METHOD)); break; + case ACPI_GED_CPU_HOTPLUG_EVT: + aml_append(if_ctx, aml_call0(AML_GED_EVT_CPU_SCAN_METHOD)); + break; case ACPI_GED_PWR_DOWN_EVT: aml_append(if_ctx, aml_notify(aml_name(ACPI_POWER_BUTTON_DEVICE), @@ -234,6 +238,8 @@ static void acpi_ged_device_plug_cb(HotplugHandler *hotplug_dev, } else { acpi_memory_plug_cb(hotplug_dev, &s->memhp_state, dev, errp); } + } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { + acpi_cpu_plug_cb(hotplug_dev, &s->cpuhp_state, dev, errp); } else { error_setg(errp, "virt: device plug request for unsupported device" " type: %s", object_get_typename(OBJECT(dev))); @@ -248,6 +254,8 @@ static void acpi_ged_unplug_request_cb(HotplugHandler *hotplug_dev, if ((object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM) && !(object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM)))) { acpi_memory_unplug_request_cb(hotplug_dev, &s->memhp_state, dev, errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { + acpi_cpu_unplug_request_cb(hotplug_dev, &s->cpuhp_state, dev, errp); } else { error_setg(errp, "acpi: device unplug request for unsupported device" " type: %s", object_get_typename(OBJECT(dev))); @@ -261,6 +269,8 @@ static void acpi_ged_unplug_cb(HotplugHandler *hotplug_dev, if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) { acpi_memory_unplug_cb(&s->memhp_state, dev, errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { + acpi_cpu_unplug_cb(&s->cpuhp_state, dev, errp); } else { error_setg(errp, "acpi: device unplug for unsupported device" " type: %s", object_get_typename(OBJECT(dev))); @@ -272,6 +282,7 @@ static void acpi_ged_ospm_status(AcpiDeviceIf *adev, ACPIOSTInfoList ***list) AcpiGedState *s = ACPI_GED(adev); acpi_memory_ospm_status(&s->memhp_state, list); + acpi_cpu_ospm_status(&s->cpuhp_state, list); } static void acpi_ged_send_event(AcpiDeviceIf *adev, AcpiEventStatusBits ev) @@ -286,6 +297,8 @@ static void acpi_ged_send_event(AcpiDeviceIf *adev, AcpiEventStatusBits ev) sel = ACPI_GED_PWR_DOWN_EVT; } else if (ev & ACPI_NVDIMM_HOTPLUG_STATUS) { sel = ACPI_GED_NVDIMM_HOTPLUG_EVT; + } else if (ev & ACPI_CPU_HOTPLUG_STATUS) { + sel = ACPI_GED_CPU_HOTPLUG_EVT; } else { /* Unknown event. Return without generating interrupt. */ warn_report("GED: Unsupported event %d. No irq injected", ev); @@ -371,6 +384,42 @@ static const VMStateDescription vmstate_acpi_ged = { } }; +static void acpi_ged_realize(DeviceState *dev, Error **errp) +{ + SysBusDevice *sbd = SYS_BUS_DEVICE(dev); + AcpiGedState *s = ACPI_GED(dev); + uint32_t ged_events; + int i; + + ged_events = ctpop32(s->ged_event_bitmap); + + for (i = 0; i < ARRAY_SIZE(ged_supported_events) && ged_events; i++) { + uint32_t event = s->ged_event_bitmap & ged_supported_events[i]; + + if (!event) { + continue; + } + + switch (event) { + case ACPI_GED_CPU_HOTPLUG_EVT: + /* initialize CPU Hotplug related regions */ + memory_region_init(&s->container_cpuhp, OBJECT(dev), + "cpuhp container", + ACPI_CPU_HOTPLUG_REG_LEN); + sysbus_init_mmio(sbd, &s->container_cpuhp); + cpu_hotplug_hw_init(&s->container_cpuhp, OBJECT(dev), + &s->cpuhp_state, 0); + break; + } + ged_events--; + } + + if (ged_events) { + error_report("Unsupported events specified"); + abort(); + } +} + static void acpi_ged_initfn(Object *obj) { DeviceState *dev = DEVICE(obj); @@ -411,6 +460,7 @@ static void acpi_ged_class_init(ObjectClass *class, void *data) dc->desc = "ACPI Generic Event Device"; device_class_set_props(dc, acpi_ged_properties); dc->vmsd = &vmstate_acpi_ged; + dc->realize = acpi_ged_realize; hc->plug = acpi_ged_device_plug_cb; hc->unplug_request = acpi_ged_unplug_request_cb; diff --git a/hw/arm/virt.c b/hw/arm/virt.c index b0c68d66a3..719e83e6a1 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -3308,6 +3308,7 @@ DEFINE_VIRT_MACHINE_AS_LATEST(9, 1) static void virt_machine_9_0_options(MachineClass *mc) { virt_machine_9_1_options(mc); + mc->smbios_memory_device_size = 16 * GiB; compat_props_add(mc->compat_props, hw_compat_9_0, hw_compat_9_0_len); } DEFINE_VIRT_MACHINE(9, 0) diff --git a/hw/audio/virtio-snd.c b/hw/audio/virtio-snd.c index 5993f4f040..e5196aa4bb 100644 --- a/hw/audio/virtio-snd.c +++ b/hw/audio/virtio-snd.c @@ -282,11 +282,13 @@ uint32_t virtio_snd_set_pcm_params(VirtIOSound *s, error_report("Number of channels is not supported."); return cpu_to_le32(VIRTIO_SND_S_NOT_SUPP); } - if (!(supported_formats & BIT(params->format))) { + if (BIT(params->format) > sizeof(supported_formats) || + !(supported_formats & BIT(params->format))) { error_report("Stream format is not supported."); return cpu_to_le32(VIRTIO_SND_S_NOT_SUPP); } - if (!(supported_rates & BIT(params->rate))) { + if (BIT(params->rate) > sizeof(supported_rates) || + !(supported_rates & BIT(params->rate))) { error_report("Stream rate is not supported."); return cpu_to_le32(VIRTIO_SND_S_NOT_SUPP); } @@ -1261,7 +1263,7 @@ static void virtio_snd_pcm_in_cb(void *data, int available) { VirtIOSoundPCMStream *stream = data; VirtIOSoundPCMBuffer *buffer; - size_t size; + size_t size, max_size; WITH_QEMU_LOCK_GUARD(&stream->queue_mutex) { while (!QSIMPLEQ_EMPTY(&stream->queue)) { @@ -1275,7 +1277,12 @@ static void virtio_snd_pcm_in_cb(void *data, int available) continue; } + max_size = iov_size(buffer->elem->in_sg, buffer->elem->in_num); for (;;) { + if (buffer->size >= max_size) { + return_rx_buffer(stream, buffer); + break; + } size = AUD_read(stream->voice.in, buffer->data + buffer->size, MIN(available, (stream->params.period_bytes - diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c index fdbc30b9ce..5b7f46bbb0 100644 --- a/hw/block/vhost-user-blk.c +++ b/hw/block/vhost-user-blk.c @@ -51,6 +51,7 @@ static const int user_feature_bits[] = { VIRTIO_F_RING_PACKED, VIRTIO_F_IOMMU_PLATFORM, VIRTIO_F_RING_RESET, + VIRTIO_F_IN_ORDER, VIRTIO_F_NOTIFICATION_DATA, VHOST_INVALID_FEATURE_BIT }; diff --git a/hw/char/goldfish_tty.c b/hw/char/goldfish_tty.c index f8ff043c39..c2e1f6537f 100644 --- a/hw/char/goldfish_tty.c +++ b/hw/char/goldfish_tty.c @@ -16,6 +16,7 @@ #include "qemu/log.h" #include "trace.h" #include "exec/address-spaces.h" +#include "sysemu/dma.h" #include "hw/char/goldfish_tty.h" #define GOLDFISH_TTY_VERSION 1 @@ -69,7 +70,6 @@ static uint64_t goldfish_tty_read(void *opaque, hwaddr addr, static void goldfish_tty_cmd(GoldfishTTYState *s, uint32_t cmd) { uint32_t to_copy; - uint8_t *buf; uint8_t data_out[GOLFISH_TTY_BUFFER_SIZE]; int len; uint64_t ptr; @@ -97,8 +97,8 @@ static void goldfish_tty_cmd(GoldfishTTYState *s, uint32_t cmd) while (len) { to_copy = MIN(GOLFISH_TTY_BUFFER_SIZE, len); - address_space_rw(&address_space_memory, ptr, - MEMTXATTRS_UNSPECIFIED, data_out, to_copy, 0); + dma_memory_read_relaxed(&address_space_memory, ptr, + data_out, to_copy); qemu_chr_fe_write_all(&s->chr, data_out, to_copy); len -= to_copy; @@ -109,9 +109,9 @@ static void goldfish_tty_cmd(GoldfishTTYState *s, uint32_t cmd) len = s->data_len; ptr = s->data_ptr; while (len && !fifo8_is_empty(&s->rx_fifo)) { - buf = (uint8_t *)fifo8_pop_buf(&s->rx_fifo, len, &to_copy); - address_space_rw(&address_space_memory, ptr, - MEMTXATTRS_UNSPECIFIED, buf, to_copy, 1); + const uint8_t *buf = fifo8_pop_bufptr(&s->rx_fifo, len, &to_copy); + + dma_memory_write_relaxed(&address_space_memory, ptr, buf, to_copy); len -= to_copy; ptr += to_copy; diff --git a/hw/core/cpu-common.c b/hw/core/cpu-common.c index d2e3e4570a..7982ecd39a 100644 --- a/hw/core/cpu-common.c +++ b/hw/core/cpu-common.c @@ -282,7 +282,10 @@ static void cpu_common_finalize(Object *obj) } #endif free_queued_cpu_work(cpu); - g_array_free(cpu->gdb_regs, TRUE); + /* If cleanup didn't happen in context to gdb_unregister_coprocessor_all */ + if (cpu->gdb_regs) { + g_array_free(cpu->gdb_regs, TRUE); + } qemu_lockcnt_destroy(&cpu->in_ioctl_lock); qemu_mutex_destroy(&cpu->work_mutex); qemu_cond_destroy(cpu->halt_cond); diff --git a/hw/core/machine.c b/hw/core/machine.c index 8a878f84d7..27dcda0248 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -1005,6 +1005,12 @@ static void machine_class_init(ObjectClass *oc, void *data) /* Default 128 MB as guest ram size */ mc->default_ram_size = 128 * MiB; mc->rom_file_has_mr = true; + /* + * SMBIOS 3.1.0 7.18.5 Memory Device — Extended Size + * use max possible value that could be encoded into + * 'Extended Size' field (2047Tb). + */ + mc->smbios_memory_device_size = 2047 * TiB; /* numa node memory size aligned on 8MB by default. * On Linux, each node's border has to be 8MB aligned diff --git a/hw/cxl/cxl-events.c b/hw/cxl/cxl-events.c index d397718b1b..12dee2e467 100644 --- a/hw/cxl/cxl-events.c +++ b/hw/cxl/cxl-events.c @@ -139,6 +139,19 @@ bool cxl_event_insert(CXLDeviceState *cxlds, CXLEventLogType log_type, return cxl_event_count(log) == 1; } +void cxl_discard_all_event_records(CXLDeviceState *cxlds) +{ + CXLEventLogType log_type; + CXLEventLog *log; + + for (log_type = 0; log_type < CXL_EVENT_TYPE_MAX; log_type++) { + log = &cxlds->event_logs[log_type]; + while (!cxl_event_empty(log)) { + cxl_event_delete_head(cxlds, log_type, log); + } + } +} + CXLRetCode cxl_event_get_records(CXLDeviceState *cxlds, CXLGetEventPayload *pl, uint8_t log_type, int max_recs, size_t *len) diff --git a/hw/cxl/cxl-host.c b/hw/cxl/cxl-host.c index c5f5fcfd64..e9f2543c43 100644 --- a/hw/cxl/cxl-host.c +++ b/hw/cxl/cxl-host.c @@ -315,7 +315,8 @@ static void machine_set_cxl(Object *obj, Visitor *v, const char *name, static void machine_get_cfmw(Object *obj, Visitor *v, const char *name, void *opaque, Error **errp) { - CXLFixedMemoryWindowOptionsList **list = opaque; + CXLState *state = opaque; + CXLFixedMemoryWindowOptionsList **list = &state->cfmw_list; visit_type_CXLFixedMemoryWindowOptionsList(v, name, list, errp); } diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c index 74eeb6fde7..3ebbd32e10 100644 --- a/hw/cxl/cxl-mailbox-utils.c +++ b/hw/cxl/cxl-mailbox-utils.c @@ -12,6 +12,7 @@ #include "hw/pci/msix.h" #include "hw/cxl/cxl.h" #include "hw/cxl/cxl_events.h" +#include "hw/cxl/cxl_mailbox.h" #include "hw/pci/pci.h" #include "hw/pci-bridge/cxl_upstream_port.h" #include "qemu/cutils.h" @@ -62,12 +63,18 @@ enum { #define SET_INTERRUPT_POLICY 0x3 FIRMWARE_UPDATE = 0x02, #define GET_INFO 0x0 + #define TRANSFER 0x1 + #define ACTIVATE 0x2 TIMESTAMP = 0x03, #define GET 0x0 #define SET 0x1 LOGS = 0x04, #define GET_SUPPORTED 0x0 #define GET_LOG 0x1 + FEATURES = 0x05, + #define GET_SUPPORTED 0x0 + #define GET_FEATURE 0x1 + #define SET_FEATURE 0x2 IDENTIFY = 0x40, #define MEMORY_DEVICE 0x0 CCLS = 0x41, @@ -83,6 +90,9 @@ enum { #define GET_POISON_LIST 0x0 #define INJECT_POISON 0x1 #define CLEAR_POISON 0x2 + #define GET_SCAN_MEDIA_CAPABILITIES 0x3 + #define SCAN_MEDIA 0x4 + #define GET_SCAN_MEDIA_RESULTS 0x5 DCD_CONFIG = 0x48, #define GET_DC_CONFIG 0x0 #define GET_DYN_CAP_EXT_LIST 0x1 @@ -235,7 +245,6 @@ static CXLRetCode cmd_events_get_records(const struct cxl_cmd *cmd, log_type = payload_in[0]; pl = (CXLGetEventPayload *)payload_out; - memset(pl, 0, sizeof(*pl)); max_recs = (cxlds->payload_size - CXL_EVENT_PAYLOAD_HDR_SIZE) / CXL_EVENT_RECORD_SIZE; @@ -273,7 +282,6 @@ static CXLRetCode cmd_events_get_interrupt_policy(const struct cxl_cmd *cmd, CXLEventLog *log; policy = (CXLEventInterruptPolicy *)payload_out; - memset(policy, 0, sizeof(*policy)); log = &cxlds->event_logs[CXL_EVENT_TYPE_INFO]; if (log->irq_enabled) { @@ -372,7 +380,6 @@ static CXLRetCode cmd_infostat_identify(const struct cxl_cmd *cmd, QEMU_BUILD_BUG_ON(sizeof(*is_identify) != 18); is_identify = (void *)payload_out; - memset(is_identify, 0, sizeof(*is_identify)); is_identify->pcie_vid = class->vendor_id; is_identify->pcie_did = class->device_id; if (object_dynamic_cast(OBJECT(cci->d), TYPE_CXL_USP)) { @@ -606,7 +613,6 @@ static CXLRetCode cmd_infostat_bg_op_sts(const struct cxl_cmd *cmd, QEMU_BUILD_BUG_ON(sizeof(*bg_op_status) != 8); bg_op_status = (void *)payload_out; - memset(bg_op_status, 0, sizeof(*bg_op_status)); bg_op_status->status = cci->bg.complete_pct << 1; if (cci->bg.runtime > 0) { bg_op_status->status |= 1U << 0; @@ -618,6 +624,9 @@ static CXLRetCode cmd_infostat_bg_op_sts(const struct cxl_cmd *cmd, return CXL_MBOX_SUCCESS; } +#define CXL_FW_SLOTS 2 +#define CXL_FW_SIZE 0x02000000 /* 32 mb */ + /* CXL r3.1 Section 8.2.9.3.1: Get FW Info (Opcode 0200h) */ static CXLRetCode cmd_firmware_update_get_info(const struct cxl_cmd *cmd, uint8_t *payload_in, @@ -647,17 +656,193 @@ static CXLRetCode cmd_firmware_update_get_info(const struct cxl_cmd *cmd, } fw_info = (void *)payload_out; - memset(fw_info, 0, sizeof(*fw_info)); - fw_info->slots_supported = 2; - fw_info->slot_info = BIT(0) | BIT(3); - fw_info->caps = 0; - pstrcpy(fw_info->fw_rev1, sizeof(fw_info->fw_rev1), "BWFW VERSION 0"); + fw_info->slots_supported = CXL_FW_SLOTS; + fw_info->slot_info = (cci->fw.active_slot & 0x7) | + ((cci->fw.staged_slot & 0x7) << 3); + fw_info->caps = BIT(0); /* online update supported */ + + if (cci->fw.slot[0]) { + pstrcpy(fw_info->fw_rev1, sizeof(fw_info->fw_rev1), "BWFW VERSION 0"); + } + if (cci->fw.slot[1]) { + pstrcpy(fw_info->fw_rev2, sizeof(fw_info->fw_rev2), "BWFW VERSION 1"); + } *len_out = sizeof(*fw_info); return CXL_MBOX_SUCCESS; } +/* CXL r3.1 section 8.2.9.3.2: Transfer FW (Opcode 0201h) */ +#define CXL_FW_XFER_ALIGNMENT 128 + +#define CXL_FW_XFER_ACTION_FULL 0x0 +#define CXL_FW_XFER_ACTION_INIT 0x1 +#define CXL_FW_XFER_ACTION_CONTINUE 0x2 +#define CXL_FW_XFER_ACTION_END 0x3 +#define CXL_FW_XFER_ACTION_ABORT 0x4 + +static CXLRetCode cmd_firmware_update_transfer(const struct cxl_cmd *cmd, + uint8_t *payload_in, + size_t len, + uint8_t *payload_out, + size_t *len_out, + CXLCCI *cci) +{ + struct { + uint8_t action; + uint8_t slot; + uint8_t rsvd1[2]; + uint32_t offset; + uint8_t rsvd2[0x78]; + uint8_t data[]; + } QEMU_PACKED *fw_transfer = (void *)payload_in; + size_t offset, length; + + if (fw_transfer->action == CXL_FW_XFER_ACTION_ABORT) { + /* + * At this point there aren't any on-going transfers + * running in the bg - this is serialized before this + * call altogether. Just mark the state machine and + * disregard any other input. + */ + cci->fw.transferring = false; + return CXL_MBOX_SUCCESS; + } + + offset = fw_transfer->offset * CXL_FW_XFER_ALIGNMENT; + length = len - sizeof(*fw_transfer); + if (offset + length > CXL_FW_SIZE) { + return CXL_MBOX_INVALID_INPUT; + } + + if (cci->fw.transferring) { + if (fw_transfer->action == CXL_FW_XFER_ACTION_FULL || + fw_transfer->action == CXL_FW_XFER_ACTION_INIT) { + return CXL_MBOX_FW_XFER_IN_PROGRESS; + } + /* + * Abort partitioned package transfer if over 30 secs + * between parts. As opposed to the explicit ABORT action, + * semantically treat this condition as an error - as + * if a part action were passed without a previous INIT. + */ + if (difftime(time(NULL), cci->fw.last_partxfer) > 30.0) { + cci->fw.transferring = false; + return CXL_MBOX_INVALID_INPUT; + } + } else if (fw_transfer->action == CXL_FW_XFER_ACTION_CONTINUE || + fw_transfer->action == CXL_FW_XFER_ACTION_END) { + return CXL_MBOX_INVALID_INPUT; + } + + /* allow back-to-back retransmission */ + if ((offset != cci->fw.prev_offset || length != cci->fw.prev_len) && + (fw_transfer->action == CXL_FW_XFER_ACTION_CONTINUE || + fw_transfer->action == CXL_FW_XFER_ACTION_END)) { + /* verify no overlaps */ + if (offset < cci->fw.prev_offset + cci->fw.prev_len) { + return CXL_MBOX_FW_XFER_OUT_OF_ORDER; + } + } + + switch (fw_transfer->action) { + case CXL_FW_XFER_ACTION_FULL: /* ignores offset */ + case CXL_FW_XFER_ACTION_END: + if (fw_transfer->slot == 0 || + fw_transfer->slot == cci->fw.active_slot || + fw_transfer->slot > CXL_FW_SLOTS) { + return CXL_MBOX_FW_INVALID_SLOT; + } + + /* mark the slot used upon bg completion */ + break; + case CXL_FW_XFER_ACTION_INIT: + if (offset != 0) { + return CXL_MBOX_INVALID_INPUT; + } + + cci->fw.transferring = true; + cci->fw.prev_offset = offset; + cci->fw.prev_len = length; + break; + case CXL_FW_XFER_ACTION_CONTINUE: + cci->fw.prev_offset = offset; + cci->fw.prev_len = length; + break; + default: + return CXL_MBOX_INVALID_INPUT; + } + + if (fw_transfer->action == CXL_FW_XFER_ACTION_FULL) { + cci->bg.runtime = 10 * 1000UL; + } else { + cci->bg.runtime = 2 * 1000UL; + } + /* keep relevant context for bg completion */ + cci->fw.curr_action = fw_transfer->action; + cci->fw.curr_slot = fw_transfer->slot; + *len_out = 0; + + return CXL_MBOX_BG_STARTED; +} + +static void __do_firmware_xfer(CXLCCI *cci) +{ + switch (cci->fw.curr_action) { + case CXL_FW_XFER_ACTION_FULL: + case CXL_FW_XFER_ACTION_END: + cci->fw.slot[cci->fw.curr_slot - 1] = true; + cci->fw.transferring = false; + break; + case CXL_FW_XFER_ACTION_INIT: + case CXL_FW_XFER_ACTION_CONTINUE: + time(&cci->fw.last_partxfer); + break; + default: + break; + } +} + +/* CXL r3.1 section 8.2.9.3.3: Activate FW (Opcode 0202h) */ +static CXLRetCode cmd_firmware_update_activate(const struct cxl_cmd *cmd, + uint8_t *payload_in, + size_t len, + uint8_t *payload_out, + size_t *len_out, + CXLCCI *cci) +{ + struct { + uint8_t action; + uint8_t slot; + } QEMU_PACKED *fw_activate = (void *)payload_in; + QEMU_BUILD_BUG_ON(sizeof(*fw_activate) != 0x2); + + if (fw_activate->slot == 0 || + fw_activate->slot == cci->fw.active_slot || + fw_activate->slot > CXL_FW_SLOTS) { + return CXL_MBOX_FW_INVALID_SLOT; + } + + /* ensure that an actual fw package is there */ + if (!cci->fw.slot[fw_activate->slot - 1]) { + return CXL_MBOX_FW_INVALID_SLOT; + } + + switch (fw_activate->action) { + case 0: /* online */ + cci->fw.active_slot = fw_activate->slot; + break; + case 1: /* reset */ + cci->fw.staged_slot = fw_activate->slot; + break; + default: + return CXL_MBOX_INVALID_INPUT; + } + + return CXL_MBOX_SUCCESS; +} + /* CXL r3.1 Section 8.2.9.4.1: Get Timestamp (Opcode 0300h) */ static CXLRetCode cmd_timestamp_get(const struct cxl_cmd *cmd, uint8_t *payload_in, @@ -768,6 +953,388 @@ static CXLRetCode cmd_logs_get_log(const struct cxl_cmd *cmd, return CXL_MBOX_SUCCESS; } +/* CXL r3.1 section 8.2.9.6: Features */ +/* + * Get Supported Features output payload + * CXL r3.1 section 8.2.9.6.1 Table 8-96 + */ +typedef struct CXLSupportedFeatureHeader { + uint16_t entries; + uint16_t nsuppfeats_dev; + uint32_t reserved; +} QEMU_PACKED CXLSupportedFeatureHeader; + +/* + * Get Supported Features Supported Feature Entry + * CXL r3.1 section 8.2.9.6.1 Table 8-97 + */ +typedef struct CXLSupportedFeatureEntry { + QemuUUID uuid; + uint16_t feat_index; + uint16_t get_feat_size; + uint16_t set_feat_size; + uint32_t attr_flags; + uint8_t get_feat_version; + uint8_t set_feat_version; + uint16_t set_feat_effects; + uint8_t rsvd[18]; +} QEMU_PACKED CXLSupportedFeatureEntry; + +/* + * Get Supported Features Supported Feature Entry + * CXL rev 3.1 section 8.2.9.6.1 Table 8-97 + */ +/* Supported Feature Entry : attribute flags */ +#define CXL_FEAT_ENTRY_ATTR_FLAG_CHANGABLE BIT(0) +#define CXL_FEAT_ENTRY_ATTR_FLAG_DEEPEST_RESET_PERSISTENCE_MASK GENMASK(3, 1) +#define CXL_FEAT_ENTRY_ATTR_FLAG_PERSIST_ACROSS_FIRMWARE_UPDATE BIT(4) +#define CXL_FEAT_ENTRY_ATTR_FLAG_SUPPORT_DEFAULT_SELECTION BIT(5) +#define CXL_FEAT_ENTRY_ATTR_FLAG_SUPPORT_SAVED_SELECTION BIT(6) + +/* Supported Feature Entry : set feature effects */ +#define CXL_FEAT_ENTRY_SFE_CONFIG_CHANGE_COLD_RESET BIT(0) +#define CXL_FEAT_ENTRY_SFE_IMMEDIATE_CONFIG_CHANGE BIT(1) +#define CXL_FEAT_ENTRY_SFE_IMMEDIATE_DATA_CHANGE BIT(2) +#define CXL_FEAT_ENTRY_SFE_IMMEDIATE_POLICY_CHANGE BIT(3) +#define CXL_FEAT_ENTRY_SFE_IMMEDIATE_LOG_CHANGE BIT(4) +#define CXL_FEAT_ENTRY_SFE_SECURITY_STATE_CHANGE BIT(5) +#define CXL_FEAT_ENTRY_SFE_BACKGROUND_OPERATION BIT(6) +#define CXL_FEAT_ENTRY_SFE_SUPPORT_SECONDARY_MAILBOX BIT(7) +#define CXL_FEAT_ENTRY_SFE_SUPPORT_ABORT_BACKGROUND_OPERATION BIT(8) +#define CXL_FEAT_ENTRY_SFE_CEL_VALID BIT(9) +#define CXL_FEAT_ENTRY_SFE_CONFIG_CHANGE_CONV_RESET BIT(10) +#define CXL_FEAT_ENTRY_SFE_CONFIG_CHANGE_CXL_RESET BIT(11) + +enum CXL_SUPPORTED_FEATURES_LIST { + CXL_FEATURE_PATROL_SCRUB = 0, + CXL_FEATURE_ECS, + CXL_FEATURE_MAX +}; + +/* Get Feature CXL 3.1 Spec 8.2.9.6.2 */ +/* + * Get Feature input payload + * CXL r3.1 section 8.2.9.6.2 Table 8-99 + */ +/* Get Feature : Payload in selection */ +enum CXL_GET_FEATURE_SELECTION { + CXL_GET_FEATURE_SEL_CURRENT_VALUE, + CXL_GET_FEATURE_SEL_DEFAULT_VALUE, + CXL_GET_FEATURE_SEL_SAVED_VALUE, + CXL_GET_FEATURE_SEL_MAX +}; + +/* Set Feature CXL 3.1 Spec 8.2.9.6.3 */ +/* + * Set Feature input payload + * CXL r3.1 section 8.2.9.6.3 Table 8-101 + */ +typedef struct CXLSetFeatureInHeader { + QemuUUID uuid; + uint32_t flags; + uint16_t offset; + uint8_t version; + uint8_t rsvd[9]; +} QEMU_PACKED QEMU_ALIGNED(16) CXLSetFeatureInHeader; + +/* Set Feature : Payload in flags */ +#define CXL_SET_FEATURE_FLAG_DATA_TRANSFER_MASK 0x7 +enum CXL_SET_FEATURE_FLAG_DATA_TRANSFER { + CXL_SET_FEATURE_FLAG_FULL_DATA_TRANSFER, + CXL_SET_FEATURE_FLAG_INITIATE_DATA_TRANSFER, + CXL_SET_FEATURE_FLAG_CONTINUE_DATA_TRANSFER, + CXL_SET_FEATURE_FLAG_FINISH_DATA_TRANSFER, + CXL_SET_FEATURE_FLAG_ABORT_DATA_TRANSFER, + CXL_SET_FEATURE_FLAG_DATA_TRANSFER_MAX +}; +#define CXL_SET_FEAT_DATA_SAVED_ACROSS_RESET BIT(3) + +/* CXL r3.1 section 8.2.9.9.11.1: Device Patrol Scrub Control Feature */ +static const QemuUUID patrol_scrub_uuid = { + .data = UUID(0x96dad7d6, 0xfde8, 0x482b, 0xa7, 0x33, + 0x75, 0x77, 0x4e, 0x06, 0xdb, 0x8a) +}; + +typedef struct CXLMemPatrolScrubSetFeature { + CXLSetFeatureInHeader hdr; + CXLMemPatrolScrubWriteAttrs feat_data; +} QEMU_PACKED QEMU_ALIGNED(16) CXLMemPatrolScrubSetFeature; + +/* + * CXL r3.1 section 8.2.9.9.11.2: + * DDR5 Error Check Scrub (ECS) Control Feature + */ +static const QemuUUID ecs_uuid = { + .data = UUID(0xe5b13f22, 0x2328, 0x4a14, 0xb8, 0xba, + 0xb9, 0x69, 0x1e, 0x89, 0x33, 0x86) +}; + +typedef struct CXLMemECSSetFeature { + CXLSetFeatureInHeader hdr; + CXLMemECSWriteAttrs feat_data[]; +} QEMU_PACKED QEMU_ALIGNED(16) CXLMemECSSetFeature; + +/* CXL r3.1 section 8.2.9.6.1: Get Supported Features (Opcode 0500h) */ +static CXLRetCode cmd_features_get_supported(const struct cxl_cmd *cmd, + uint8_t *payload_in, + size_t len_in, + uint8_t *payload_out, + size_t *len_out, + CXLCCI *cci) +{ + struct { + uint32_t count; + uint16_t start_index; + uint16_t reserved; + } QEMU_PACKED QEMU_ALIGNED(16) * get_feats_in = (void *)payload_in; + + struct { + CXLSupportedFeatureHeader hdr; + CXLSupportedFeatureEntry feat_entries[]; + } QEMU_PACKED QEMU_ALIGNED(16) * get_feats_out = (void *)payload_out; + uint16_t index, req_entries; + uint16_t entry; + + if (!object_dynamic_cast(OBJECT(cci->d), TYPE_CXL_TYPE3)) { + return CXL_MBOX_UNSUPPORTED; + } + if (get_feats_in->count < sizeof(CXLSupportedFeatureHeader) || + get_feats_in->start_index >= CXL_FEATURE_MAX) { + return CXL_MBOX_INVALID_INPUT; + } + + req_entries = (get_feats_in->count - + sizeof(CXLSupportedFeatureHeader)) / + sizeof(CXLSupportedFeatureEntry); + req_entries = MIN(req_entries, + (CXL_FEATURE_MAX - get_feats_in->start_index)); + + for (entry = 0, index = get_feats_in->start_index; + entry < req_entries; index++) { + switch (index) { + case CXL_FEATURE_PATROL_SCRUB: + /* Fill supported feature entry for device patrol scrub control */ + get_feats_out->feat_entries[entry++] = + (struct CXLSupportedFeatureEntry) { + .uuid = patrol_scrub_uuid, + .feat_index = index, + .get_feat_size = sizeof(CXLMemPatrolScrubReadAttrs), + .set_feat_size = sizeof(CXLMemPatrolScrubWriteAttrs), + .attr_flags = CXL_FEAT_ENTRY_ATTR_FLAG_CHANGABLE, + .get_feat_version = CXL_MEMDEV_PS_GET_FEATURE_VERSION, + .set_feat_version = CXL_MEMDEV_PS_SET_FEATURE_VERSION, + .set_feat_effects = CXL_FEAT_ENTRY_SFE_IMMEDIATE_CONFIG_CHANGE | + CXL_FEAT_ENTRY_SFE_CEL_VALID, + }; + break; + case CXL_FEATURE_ECS: + /* Fill supported feature entry for device DDR5 ECS control */ + get_feats_out->feat_entries[entry++] = + (struct CXLSupportedFeatureEntry) { + .uuid = ecs_uuid, + .feat_index = index, + .get_feat_size = CXL_ECS_NUM_MEDIA_FRUS * + sizeof(CXLMemECSReadAttrs), + .set_feat_size = CXL_ECS_NUM_MEDIA_FRUS * + sizeof(CXLMemECSWriteAttrs), + .attr_flags = CXL_FEAT_ENTRY_ATTR_FLAG_CHANGABLE, + .get_feat_version = CXL_ECS_GET_FEATURE_VERSION, + .set_feat_version = CXL_ECS_SET_FEATURE_VERSION, + .set_feat_effects = CXL_FEAT_ENTRY_SFE_IMMEDIATE_CONFIG_CHANGE | + CXL_FEAT_ENTRY_SFE_CEL_VALID, + }; + break; + default: + __builtin_unreachable(); + } + } + get_feats_out->hdr.nsuppfeats_dev = CXL_FEATURE_MAX; + get_feats_out->hdr.entries = req_entries; + *len_out = sizeof(CXLSupportedFeatureHeader) + + req_entries * sizeof(CXLSupportedFeatureEntry); + + return CXL_MBOX_SUCCESS; +} + +/* CXL r3.1 section 8.2.9.6.2: Get Feature (Opcode 0501h) */ +static CXLRetCode cmd_features_get_feature(const struct cxl_cmd *cmd, + uint8_t *payload_in, + size_t len_in, + uint8_t *payload_out, + size_t *len_out, + CXLCCI *cci) +{ + struct { + QemuUUID uuid; + uint16_t offset; + uint16_t count; + uint8_t selection; + } QEMU_PACKED QEMU_ALIGNED(16) * get_feature; + uint16_t bytes_to_copy = 0; + CXLType3Dev *ct3d; + CXLSetFeatureInfo *set_feat_info; + + if (!object_dynamic_cast(OBJECT(cci->d), TYPE_CXL_TYPE3)) { + return CXL_MBOX_UNSUPPORTED; + } + + ct3d = CXL_TYPE3(cci->d); + get_feature = (void *)payload_in; + + set_feat_info = &ct3d->set_feat_info; + if (qemu_uuid_is_equal(&get_feature->uuid, &set_feat_info->uuid)) { + return CXL_MBOX_FEATURE_TRANSFER_IN_PROGRESS; + } + + if (get_feature->selection != CXL_GET_FEATURE_SEL_CURRENT_VALUE) { + return CXL_MBOX_UNSUPPORTED; + } + if (get_feature->offset + get_feature->count > cci->payload_max) { + return CXL_MBOX_INVALID_INPUT; + } + + if (qemu_uuid_is_equal(&get_feature->uuid, &patrol_scrub_uuid)) { + if (get_feature->offset >= sizeof(CXLMemPatrolScrubReadAttrs)) { + return CXL_MBOX_INVALID_INPUT; + } + bytes_to_copy = sizeof(CXLMemPatrolScrubReadAttrs) - + get_feature->offset; + bytes_to_copy = MIN(bytes_to_copy, get_feature->count); + memcpy(payload_out, + (uint8_t *)&ct3d->patrol_scrub_attrs + get_feature->offset, + bytes_to_copy); + } else if (qemu_uuid_is_equal(&get_feature->uuid, &ecs_uuid)) { + if (get_feature->offset >= CXL_ECS_NUM_MEDIA_FRUS * + sizeof(CXLMemECSReadAttrs)) { + return CXL_MBOX_INVALID_INPUT; + } + bytes_to_copy = CXL_ECS_NUM_MEDIA_FRUS * + sizeof(CXLMemECSReadAttrs) - + get_feature->offset; + bytes_to_copy = MIN(bytes_to_copy, get_feature->count); + memcpy(payload_out, + (uint8_t *)&ct3d->ecs_attrs + get_feature->offset, + bytes_to_copy); + } else { + return CXL_MBOX_UNSUPPORTED; + } + + *len_out = bytes_to_copy; + + return CXL_MBOX_SUCCESS; +} + +/* CXL r3.1 section 8.2.9.6.3: Set Feature (Opcode 0502h) */ +static CXLRetCode cmd_features_set_feature(const struct cxl_cmd *cmd, + uint8_t *payload_in, + size_t len_in, + uint8_t *payload_out, + size_t *len_out, + CXLCCI *cci) +{ + CXLSetFeatureInHeader *hdr = (void *)payload_in; + CXLMemPatrolScrubWriteAttrs *ps_write_attrs; + CXLMemPatrolScrubSetFeature *ps_set_feature; + CXLMemECSWriteAttrs *ecs_write_attrs; + CXLMemECSSetFeature *ecs_set_feature; + CXLSetFeatureInfo *set_feat_info; + uint16_t bytes_to_copy = 0; + uint8_t data_transfer_flag; + CXLType3Dev *ct3d; + uint16_t count; + + + if (!object_dynamic_cast(OBJECT(cci->d), TYPE_CXL_TYPE3)) { + return CXL_MBOX_UNSUPPORTED; + } + ct3d = CXL_TYPE3(cci->d); + set_feat_info = &ct3d->set_feat_info; + + if (!qemu_uuid_is_null(&set_feat_info->uuid) && + !qemu_uuid_is_equal(&hdr->uuid, &set_feat_info->uuid)) { + return CXL_MBOX_FEATURE_TRANSFER_IN_PROGRESS; + } + if (hdr->flags & CXL_SET_FEAT_DATA_SAVED_ACROSS_RESET) { + set_feat_info->data_saved_across_reset = true; + } else { + set_feat_info->data_saved_across_reset = false; + } + + data_transfer_flag = + hdr->flags & CXL_SET_FEATURE_FLAG_DATA_TRANSFER_MASK; + if (data_transfer_flag == CXL_SET_FEATURE_FLAG_INITIATE_DATA_TRANSFER) { + set_feat_info->uuid = hdr->uuid; + set_feat_info->data_size = 0; + } + set_feat_info->data_transfer_flag = data_transfer_flag; + set_feat_info->data_offset = hdr->offset; + bytes_to_copy = len_in - sizeof(CXLSetFeatureInHeader); + + if (qemu_uuid_is_equal(&hdr->uuid, &patrol_scrub_uuid)) { + if (hdr->version != CXL_MEMDEV_PS_SET_FEATURE_VERSION) { + return CXL_MBOX_UNSUPPORTED; + } + + ps_set_feature = (void *)payload_in; + ps_write_attrs = &ps_set_feature->feat_data; + memcpy((uint8_t *)&ct3d->patrol_scrub_wr_attrs + hdr->offset, + ps_write_attrs, + bytes_to_copy); + set_feat_info->data_size += bytes_to_copy; + + if (data_transfer_flag == CXL_SET_FEATURE_FLAG_FULL_DATA_TRANSFER || + data_transfer_flag == CXL_SET_FEATURE_FLAG_FINISH_DATA_TRANSFER) { + ct3d->patrol_scrub_attrs.scrub_cycle &= ~0xFF; + ct3d->patrol_scrub_attrs.scrub_cycle |= + ct3d->patrol_scrub_wr_attrs.scrub_cycle_hr & 0xFF; + ct3d->patrol_scrub_attrs.scrub_flags &= ~0x1; + ct3d->patrol_scrub_attrs.scrub_flags |= + ct3d->patrol_scrub_wr_attrs.scrub_flags & 0x1; + } + } else if (qemu_uuid_is_equal(&hdr->uuid, + &ecs_uuid)) { + if (hdr->version != CXL_ECS_SET_FEATURE_VERSION) { + return CXL_MBOX_UNSUPPORTED; + } + + ecs_set_feature = (void *)payload_in; + ecs_write_attrs = ecs_set_feature->feat_data; + memcpy((uint8_t *)ct3d->ecs_wr_attrs + hdr->offset, + ecs_write_attrs, + bytes_to_copy); + set_feat_info->data_size += bytes_to_copy; + + if (data_transfer_flag == CXL_SET_FEATURE_FLAG_FULL_DATA_TRANSFER || + data_transfer_flag == CXL_SET_FEATURE_FLAG_FINISH_DATA_TRANSFER) { + for (count = 0; count < CXL_ECS_NUM_MEDIA_FRUS; count++) { + ct3d->ecs_attrs[count].ecs_log_cap = + ct3d->ecs_wr_attrs[count].ecs_log_cap; + ct3d->ecs_attrs[count].ecs_config = + ct3d->ecs_wr_attrs[count].ecs_config & 0x1F; + } + } + } else { + return CXL_MBOX_UNSUPPORTED; + } + + if (data_transfer_flag == CXL_SET_FEATURE_FLAG_FULL_DATA_TRANSFER || + data_transfer_flag == CXL_SET_FEATURE_FLAG_FINISH_DATA_TRANSFER || + data_transfer_flag == CXL_SET_FEATURE_FLAG_ABORT_DATA_TRANSFER) { + memset(&set_feat_info->uuid, 0, sizeof(QemuUUID)); + if (qemu_uuid_is_equal(&hdr->uuid, &patrol_scrub_uuid)) { + memset(&ct3d->patrol_scrub_wr_attrs, 0, set_feat_info->data_size); + } else if (qemu_uuid_is_equal(&hdr->uuid, &ecs_uuid)) { + memset(ct3d->ecs_wr_attrs, 0, set_feat_info->data_size); + } + set_feat_info->data_transfer_flag = 0; + set_feat_info->data_saved_across_reset = false; + set_feat_info->data_offset = 0; + set_feat_info->data_size = 0; + } + + return CXL_MBOX_SUCCESS; +} + /* CXL r3.1 Section 8.2.9.9.1.1: Identify Memory Device (Opcode 4000h) */ static CXLRetCode cmd_identify_memory_device(const struct cxl_cmd *cmd, uint8_t *payload_in, @@ -805,7 +1372,6 @@ static CXLRetCode cmd_identify_memory_device(const struct cxl_cmd *cmd, } id = (void *)payload_out; - memset(id, 0, sizeof(*id)); snprintf(id->fw_revision, 0x10, "BWFW VERSION %02d", 0); @@ -953,6 +1519,7 @@ static void __do_sanitization(CXLType3Dev *ct3d) memset(lsa, 0, memory_region_size(mr)); } } + cxl_discard_all_event_records(&ct3d->cxl_dstate); } /* @@ -1086,8 +1653,8 @@ static CXLRetCode cmd_media_get_poison_list(const struct cxl_cmd *cmd, QLIST_FOREACH(ent, poison_list, node) { /* Check for no overlap */ - if (ent->start >= query_start + query_length || - ent->start + ent->length <= query_start) { + if (!ranges_overlap(ent->start, ent->length, + query_start, query_length)) { continue; } record_count++; @@ -1095,13 +1662,12 @@ static CXLRetCode cmd_media_get_poison_list(const struct cxl_cmd *cmd, out_pl_len = sizeof(*out) + record_count * sizeof(out->records[0]); assert(out_pl_len <= CXL_MAILBOX_MAX_PAYLOAD_SIZE); - memset(out, 0, out_pl_len); QLIST_FOREACH(ent, poison_list, node) { uint64_t start, stop; /* Check for no overlap */ - if (ent->start >= query_start + query_length || - ent->start + ent->length <= query_start) { + if (!ranges_overlap(ent->start, ent->length, + query_start, query_length)) { continue; } @@ -1117,6 +1683,10 @@ static CXLRetCode cmd_media_get_poison_list(const struct cxl_cmd *cmd, out->flags = (1 << 1); stq_le_p(&out->overflow_timestamp, ct3d->poison_list_overflow_ts); } + if (scan_media_running(cci)) { + out->flags |= (1 << 2); + } + stw_le_p(&out->count, record_count); *len_out = out_pl_len; return CXL_MBOX_SUCCESS; @@ -1146,6 +1716,16 @@ static CXLRetCode cmd_media_inject_poison(const struct cxl_cmd *cmd, return CXL_MBOX_SUCCESS; } } + /* + * Freeze the list if there is an on-going scan media operation. + */ + if (scan_media_running(cci)) { + /* + * XXX: Spec is ambiguous - is this case considered + * a successful return despite not adding to the list? + */ + goto success; + } if (ct3d->poison_list_cnt == CXL_POISON_LIST_LIMIT) { return CXL_MBOX_INJECT_POISON_LIMIT; @@ -1161,6 +1741,7 @@ static CXLRetCode cmd_media_inject_poison(const struct cxl_cmd *cmd, */ QLIST_INSERT_HEAD(poison_list, p, node); ct3d->poison_list_cnt++; +success: *len_out = 0; return CXL_MBOX_SUCCESS; @@ -1200,6 +1781,17 @@ static CXLRetCode cmd_media_clear_poison(const struct cxl_cmd *cmd, } } + /* + * Freeze the list if there is an on-going scan media operation. + */ + if (scan_media_running(cci)) { + /* + * XXX: Spec is ambiguous - is this case considered + * a successful return despite not removing from the list? + */ + goto success; + } + QLIST_FOREACH(ent, poison_list, node) { /* * Test for contained in entry. Simpler than general case @@ -1210,7 +1802,7 @@ static CXLRetCode cmd_media_clear_poison(const struct cxl_cmd *cmd, } } if (!ent) { - return CXL_MBOX_SUCCESS; + goto success; } QLIST_REMOVE(ent, node); @@ -1247,12 +1839,262 @@ static CXLRetCode cmd_media_clear_poison(const struct cxl_cmd *cmd, } /* Any fragments have been added, free original entry */ g_free(ent); +success: *len_out = 0; return CXL_MBOX_SUCCESS; } /* + * CXL r3.1 section 8.2.9.9.4.4: Get Scan Media Capabilities + */ +static CXLRetCode +cmd_media_get_scan_media_capabilities(const struct cxl_cmd *cmd, + uint8_t *payload_in, + size_t len_in, + uint8_t *payload_out, + size_t *len_out, + CXLCCI *cci) +{ + struct get_scan_media_capabilities_pl { + uint64_t pa; + uint64_t length; + } QEMU_PACKED; + + struct get_scan_media_capabilities_out_pl { + uint32_t estimated_runtime_ms; + }; + + CXLType3Dev *ct3d = CXL_TYPE3(cci->d); + CXLDeviceState *cxl_dstate = &ct3d->cxl_dstate; + struct get_scan_media_capabilities_pl *in = (void *)payload_in; + struct get_scan_media_capabilities_out_pl *out = (void *)payload_out; + uint64_t query_start; + uint64_t query_length; + + query_start = ldq_le_p(&in->pa); + /* 64 byte alignment required */ + if (query_start & 0x3f) { + return CXL_MBOX_INVALID_INPUT; + } + query_length = ldq_le_p(&in->length) * CXL_CACHE_LINE_SIZE; + + if (query_start + query_length > cxl_dstate->static_mem_size) { + return CXL_MBOX_INVALID_PA; + } + + /* + * Just use 400 nanosecond access/read latency + 100 ns for + * the cost of updating the poison list. For small enough + * chunks return at least 1 ms. + */ + stl_le_p(&out->estimated_runtime_ms, + MAX(1, query_length * (0.0005L / 64))); + + *len_out = sizeof(*out); + return CXL_MBOX_SUCCESS; +} + +static void __do_scan_media(CXLType3Dev *ct3d) +{ + CXLPoison *ent; + unsigned int results_cnt = 0; + + QLIST_FOREACH(ent, &ct3d->scan_media_results, node) { + results_cnt++; + } + + /* only scan media may clear the overflow */ + if (ct3d->poison_list_overflowed && + ct3d->poison_list_cnt == results_cnt) { + cxl_clear_poison_list_overflowed(ct3d); + } + /* scan media has run since last conventional reset */ + ct3d->scan_media_hasrun = true; +} + +/* + * CXL r3.1 section 8.2.9.9.4.5: Scan Media + */ +static CXLRetCode cmd_media_scan_media(const struct cxl_cmd *cmd, + uint8_t *payload_in, + size_t len_in, + uint8_t *payload_out, + size_t *len_out, + CXLCCI *cci) +{ + struct scan_media_pl { + uint64_t pa; + uint64_t length; + uint8_t flags; + } QEMU_PACKED; + + struct scan_media_pl *in = (void *)payload_in; + CXLType3Dev *ct3d = CXL_TYPE3(cci->d); + CXLDeviceState *cxl_dstate = &ct3d->cxl_dstate; + uint64_t query_start; + uint64_t query_length; + CXLPoison *ent, *next; + + query_start = ldq_le_p(&in->pa); + /* 64 byte alignment required */ + if (query_start & 0x3f) { + return CXL_MBOX_INVALID_INPUT; + } + query_length = ldq_le_p(&in->length) * CXL_CACHE_LINE_SIZE; + + if (query_start + query_length > cxl_dstate->static_mem_size) { + return CXL_MBOX_INVALID_PA; + } + if (ct3d->dc.num_regions && query_start + query_length >= + cxl_dstate->static_mem_size + ct3d->dc.total_capacity) { + return CXL_MBOX_INVALID_PA; + } + + if (in->flags == 0) { /* TODO */ + qemu_log_mask(LOG_UNIMP, + "Scan Media Event Log is unsupported\n"); + } + + /* any previous results are discarded upon a new Scan Media */ + QLIST_FOREACH_SAFE(ent, &ct3d->scan_media_results, node, next) { + QLIST_REMOVE(ent, node); + g_free(ent); + } + + /* kill the poison list - it will be recreated */ + if (ct3d->poison_list_overflowed) { + QLIST_FOREACH_SAFE(ent, &ct3d->poison_list, node, next) { + QLIST_REMOVE(ent, node); + g_free(ent); + ct3d->poison_list_cnt--; + } + } + + /* + * Scan the backup list and move corresponding entries + * into the results list, updating the poison list + * when possible. + */ + QLIST_FOREACH_SAFE(ent, &ct3d->poison_list_bkp, node, next) { + CXLPoison *res; + + if (ent->start >= query_start + query_length || + ent->start + ent->length <= query_start) { + continue; + } + + /* + * If a Get Poison List cmd comes in while this + * scan is being done, it will see the new complete + * list, while setting the respective flag. + */ + if (ct3d->poison_list_cnt < CXL_POISON_LIST_LIMIT) { + CXLPoison *p = g_new0(CXLPoison, 1); + + p->start = ent->start; + p->length = ent->length; + p->type = ent->type; + QLIST_INSERT_HEAD(&ct3d->poison_list, p, node); + ct3d->poison_list_cnt++; + } + + res = g_new0(CXLPoison, 1); + res->start = ent->start; + res->length = ent->length; + res->type = ent->type; + QLIST_INSERT_HEAD(&ct3d->scan_media_results, res, node); + + QLIST_REMOVE(ent, node); + g_free(ent); + } + + cci->bg.runtime = MAX(1, query_length * (0.0005L / 64)); + *len_out = 0; + + return CXL_MBOX_BG_STARTED; +} + +/* + * CXL r3.1 section 8.2.9.9.4.6: Get Scan Media Results + */ +static CXLRetCode cmd_media_get_scan_media_results(const struct cxl_cmd *cmd, + uint8_t *payload_in, + size_t len_in, + uint8_t *payload_out, + size_t *len_out, + CXLCCI *cci) +{ + struct get_scan_media_results_out_pl { + uint64_t dpa_restart; + uint64_t length; + uint8_t flags; + uint8_t rsvd1; + uint16_t count; + uint8_t rsvd2[0xc]; + struct { + uint64_t addr; + uint32_t length; + uint32_t resv; + } QEMU_PACKED records[]; + } QEMU_PACKED; + + struct get_scan_media_results_out_pl *out = (void *)payload_out; + CXLType3Dev *ct3d = CXL_TYPE3(cci->d); + CXLPoisonList *scan_media_results = &ct3d->scan_media_results; + CXLPoison *ent, *next; + uint16_t total_count = 0, record_count = 0, i = 0; + uint16_t out_pl_len; + + if (!ct3d->scan_media_hasrun) { + return CXL_MBOX_UNSUPPORTED; + } + + /* + * Calculate limits, all entries are within the same address range of the + * last scan media call. + */ + QLIST_FOREACH(ent, scan_media_results, node) { + size_t rec_size = record_count * sizeof(out->records[0]); + + if (sizeof(*out) + rec_size < CXL_MAILBOX_MAX_PAYLOAD_SIZE) { + record_count++; + } + total_count++; + } + + out_pl_len = sizeof(*out) + record_count * sizeof(out->records[0]); + assert(out_pl_len <= CXL_MAILBOX_MAX_PAYLOAD_SIZE); + + memset(out, 0, out_pl_len); + QLIST_FOREACH_SAFE(ent, scan_media_results, node, next) { + uint64_t start, stop; + + if (i == record_count) { + break; + } + + start = ROUND_DOWN(ent->start, 64ull); + stop = ROUND_DOWN(ent->start, 64ull) + ent->length; + stq_le_p(&out->records[i].addr, start | (ent->type & 0x7)); + stl_le_p(&out->records[i].length, (stop - start) / CXL_CACHE_LINE_SIZE); + i++; + + /* consume the returning entry */ + QLIST_REMOVE(ent, node); + g_free(ent); + } + + stw_le_p(&out->count, record_count); + if (total_count > record_count) { + out->flags = (1 << 0); /* More Media Error Records */ + } + + *len_out = out_pl_len; + return CXL_MBOX_SUCCESS; +} + +/* * CXL r3.1 section 8.2.9.9.9.1: Get Dynamic Capacity Configuration * (Opcode: 4800h) */ @@ -1822,40 +2664,51 @@ static CXLRetCode cmd_dcd_release_dyn_cap(const struct cxl_cmd *cmd, return CXL_MBOX_SUCCESS; } -#define IMMEDIATE_CONFIG_CHANGE (1 << 1) -#define IMMEDIATE_DATA_CHANGE (1 << 2) -#define IMMEDIATE_POLICY_CHANGE (1 << 3) -#define IMMEDIATE_LOG_CHANGE (1 << 4) -#define SECURITY_STATE_CHANGE (1 << 5) -#define BACKGROUND_OPERATION (1 << 6) - static const struct cxl_cmd cxl_cmd_set[256][256] = { [EVENTS][GET_RECORDS] = { "EVENTS_GET_RECORDS", cmd_events_get_records, 1, 0 }, [EVENTS][CLEAR_RECORDS] = { "EVENTS_CLEAR_RECORDS", - cmd_events_clear_records, ~0, IMMEDIATE_LOG_CHANGE }, + cmd_events_clear_records, ~0, CXL_MBOX_IMMEDIATE_LOG_CHANGE }, [EVENTS][GET_INTERRUPT_POLICY] = { "EVENTS_GET_INTERRUPT_POLICY", cmd_events_get_interrupt_policy, 0, 0 }, [EVENTS][SET_INTERRUPT_POLICY] = { "EVENTS_SET_INTERRUPT_POLICY", cmd_events_set_interrupt_policy, - ~0, IMMEDIATE_CONFIG_CHANGE }, + ~0, CXL_MBOX_IMMEDIATE_CONFIG_CHANGE }, [FIRMWARE_UPDATE][GET_INFO] = { "FIRMWARE_UPDATE_GET_INFO", cmd_firmware_update_get_info, 0, 0 }, + [FIRMWARE_UPDATE][TRANSFER] = { "FIRMWARE_UPDATE_TRANSFER", + cmd_firmware_update_transfer, ~0, CXL_MBOX_BACKGROUND_OPERATION }, + [FIRMWARE_UPDATE][ACTIVATE] = { "FIRMWARE_UPDATE_ACTIVATE", + cmd_firmware_update_activate, 2, CXL_MBOX_BACKGROUND_OPERATION }, [TIMESTAMP][GET] = { "TIMESTAMP_GET", cmd_timestamp_get, 0, 0 }, [TIMESTAMP][SET] = { "TIMESTAMP_SET", cmd_timestamp_set, - 8, IMMEDIATE_POLICY_CHANGE }, + 8, CXL_MBOX_IMMEDIATE_POLICY_CHANGE }, [LOGS][GET_SUPPORTED] = { "LOGS_GET_SUPPORTED", cmd_logs_get_supported, 0, 0 }, [LOGS][GET_LOG] = { "LOGS_GET_LOG", cmd_logs_get_log, 0x18, 0 }, + [FEATURES][GET_SUPPORTED] = { "FEATURES_GET_SUPPORTED", + cmd_features_get_supported, 0x8, 0 }, + [FEATURES][GET_FEATURE] = { "FEATURES_GET_FEATURE", + cmd_features_get_feature, 0x15, 0 }, + [FEATURES][SET_FEATURE] = { "FEATURES_SET_FEATURE", + cmd_features_set_feature, + ~0, + (CXL_MBOX_IMMEDIATE_CONFIG_CHANGE | + CXL_MBOX_IMMEDIATE_DATA_CHANGE | + CXL_MBOX_IMMEDIATE_POLICY_CHANGE | + CXL_MBOX_IMMEDIATE_LOG_CHANGE | + CXL_MBOX_SECURITY_STATE_CHANGE)}, [IDENTIFY][MEMORY_DEVICE] = { "IDENTIFY_MEMORY_DEVICE", cmd_identify_memory_device, 0, 0 }, [CCLS][GET_PARTITION_INFO] = { "CCLS_GET_PARTITION_INFO", cmd_ccls_get_partition_info, 0, 0 }, [CCLS][GET_LSA] = { "CCLS_GET_LSA", cmd_ccls_get_lsa, 8, 0 }, [CCLS][SET_LSA] = { "CCLS_SET_LSA", cmd_ccls_set_lsa, - ~0, IMMEDIATE_CONFIG_CHANGE | IMMEDIATE_DATA_CHANGE }, + ~0, CXL_MBOX_IMMEDIATE_CONFIG_CHANGE | CXL_MBOX_IMMEDIATE_DATA_CHANGE }, [SANITIZE][OVERWRITE] = { "SANITIZE_OVERWRITE", cmd_sanitize_overwrite, 0, - IMMEDIATE_DATA_CHANGE | SECURITY_STATE_CHANGE | BACKGROUND_OPERATION }, + (CXL_MBOX_IMMEDIATE_DATA_CHANGE | + CXL_MBOX_SECURITY_STATE_CHANGE | + CXL_MBOX_BACKGROUND_OPERATION)}, [PERSISTENT_MEM][GET_SECURITY_STATE] = { "GET_SECURITY_STATE", cmd_get_security_state, 0, 0 }, [MEDIA_AND_POISON][GET_POISON_LIST] = { "MEDIA_AND_POISON_GET_POISON_LIST", @@ -1864,6 +2717,14 @@ static const struct cxl_cmd cxl_cmd_set[256][256] = { cmd_media_inject_poison, 8, 0 }, [MEDIA_AND_POISON][CLEAR_POISON] = { "MEDIA_AND_POISON_CLEAR_POISON", cmd_media_clear_poison, 72, 0 }, + [MEDIA_AND_POISON][GET_SCAN_MEDIA_CAPABILITIES] = { + "MEDIA_AND_POISON_GET_SCAN_MEDIA_CAPABILITIES", + cmd_media_get_scan_media_capabilities, 16, 0 }, + [MEDIA_AND_POISON][SCAN_MEDIA] = { "MEDIA_AND_POISON_SCAN_MEDIA", + cmd_media_scan_media, 17, CXL_MBOX_BACKGROUND_OPERATION }, + [MEDIA_AND_POISON][GET_SCAN_MEDIA_RESULTS] = { + "MEDIA_AND_POISON_GET_SCAN_MEDIA_RESULTS", + cmd_media_get_scan_media_results, 0, 0 }, }; static const struct cxl_cmd cxl_cmd_set_dcd[256][256] = { @@ -1874,10 +2735,10 @@ static const struct cxl_cmd cxl_cmd_set_dcd[256][256] = { 8, 0 }, [DCD_CONFIG][ADD_DYN_CAP_RSP] = { "DCD_ADD_DYNAMIC_CAPACITY_RESPONSE", cmd_dcd_add_dyn_cap_rsp, - ~0, IMMEDIATE_DATA_CHANGE }, + ~0, CXL_MBOX_IMMEDIATE_DATA_CHANGE }, [DCD_CONFIG][RELEASE_DYN_CAP] = { "DCD_RELEASE_DYNAMIC_CAPACITY", cmd_dcd_release_dyn_cap, - ~0, IMMEDIATE_DATA_CHANGE }, + ~0, CXL_MBOX_IMMEDIATE_DATA_CHANGE }, }; static const struct cxl_cmd cxl_cmd_set_sw[256][256] = { @@ -1885,8 +2746,8 @@ static const struct cxl_cmd cxl_cmd_set_sw[256][256] = { [INFOSTAT][BACKGROUND_OPERATION_STATUS] = { "BACKGROUND_OPERATION_STATUS", cmd_infostat_bg_op_sts, 0, 0 }, [TIMESTAMP][GET] = { "TIMESTAMP_GET", cmd_timestamp_get, 0, 0 }, - [TIMESTAMP][SET] = { "TIMESTAMP_SET", cmd_timestamp_set, 0, - IMMEDIATE_POLICY_CHANGE }, + [TIMESTAMP][SET] = { "TIMESTAMP_SET", cmd_timestamp_set, 8, + CXL_MBOX_IMMEDIATE_POLICY_CHANGE }, [LOGS][GET_SUPPORTED] = { "LOGS_GET_SUPPORTED", cmd_logs_get_supported, 0, 0 }, [LOGS][GET_LOG] = { "LOGS_GET_LOG", cmd_logs_get_log, 0x18, 0 }, @@ -1913,6 +2774,7 @@ int cxl_process_cci_message(CXLCCI *cci, uint8_t set, uint8_t cmd, int ret; const struct cxl_cmd *cxl_cmd; opcode_handler h; + CXLDeviceState *cxl_dstate; *len_out = 0; cxl_cmd = &cci->cxl_cmd_set[set][cmd]; @@ -1928,28 +2790,34 @@ int cxl_process_cci_message(CXLCCI *cci, uint8_t set, uint8_t cmd, } /* Only one bg command at a time */ - if ((cxl_cmd->effect & BACKGROUND_OPERATION) && + if ((cxl_cmd->effect & CXL_MBOX_BACKGROUND_OPERATION) && cci->bg.runtime > 0) { return CXL_MBOX_BUSY; } - /* forbid any selected commands while overwriting */ - if (sanitize_running(cci)) { - if (h == cmd_events_get_records || - h == cmd_ccls_get_partition_info || - h == cmd_ccls_set_lsa || - h == cmd_ccls_get_lsa || - h == cmd_logs_get_log || - h == cmd_media_get_poison_list || - h == cmd_media_inject_poison || - h == cmd_media_clear_poison || - h == cmd_sanitize_overwrite) { - return CXL_MBOX_MEDIA_DISABLED; + /* forbid any selected commands while the media is disabled */ + if (object_dynamic_cast(OBJECT(cci->d), TYPE_CXL_TYPE3)) { + cxl_dstate = &CXL_TYPE3(cci->d)->cxl_dstate; + + if (cxl_dev_media_disabled(cxl_dstate)) { + if (h == cmd_events_get_records || + h == cmd_ccls_get_partition_info || + h == cmd_ccls_set_lsa || + h == cmd_ccls_get_lsa || + h == cmd_logs_get_log || + h == cmd_media_get_poison_list || + h == cmd_media_inject_poison || + h == cmd_media_clear_poison || + h == cmd_sanitize_overwrite || + h == cmd_firmware_update_transfer || + h == cmd_firmware_update_activate) { + return CXL_MBOX_MEDIA_DISABLED; + } } } ret = (*h)(cxl_cmd, pl_in, len_in, pl_out, len_out, cci); - if ((cxl_cmd->effect & BACKGROUND_OPERATION) && + if ((cxl_cmd->effect & CXL_MBOX_BACKGROUND_OPERATION) && ret == CXL_MBOX_BG_STARTED) { *bg_started = true; } else { @@ -1987,6 +2855,9 @@ static void bg_timercb(void *opaque) cci->bg.complete_pct = 100; cci->bg.ret_code = ret; switch (cci->bg.opcode) { + case 0x0201: /* fw transfer */ + __do_firmware_xfer(cci); + break; case 0x4400: /* sanitize */ { CXLType3Dev *ct3d = CXL_TYPE3(cci->d); @@ -1995,8 +2866,13 @@ static void bg_timercb(void *opaque) cxl_dev_enable_media(&ct3d->cxl_dstate); } break; - case 0x4304: /* TODO: scan media */ + case 0x4304: /* scan media */ + { + CXLType3Dev *ct3d = CXL_TYPE3(cci->d); + + __do_scan_media(ct3d); break; + } default: __builtin_unreachable(); break; @@ -2053,6 +2929,10 @@ void cxl_init_cci(CXLCCI *cci, size_t payload_max) cci->bg.runtime = 0; cci->bg.timer = timer_new_ms(QEMU_CLOCK_VIRTUAL, bg_timercb, cci); + + memset(&cci->fw, 0, sizeof(cci->fw)); + cci->fw.active_slot = 1; + cci->fw.slot[cci->fw.active_slot - 1] = true; } static void cxl_copy_cci_commands(CXLCCI *cci, const struct cxl_cmd (*cxl_cmds)[256]) diff --git a/hw/i2c/mpc_i2c.c b/hw/i2c/mpc_i2c.c index cb051a520f..06d4ce7d68 100644 --- a/hw/i2c/mpc_i2c.c +++ b/hw/i2c/mpc_i2c.c @@ -82,7 +82,7 @@ struct MPCI2CState { uint8_t cr; uint8_t sr; uint8_t dr; - uint8_t dfssr; + uint8_t dfsrr; }; static bool mpc_i2c_is_enabled(MPCI2CState *s) @@ -293,7 +293,7 @@ static void mpc_i2c_write(void *opaque, hwaddr addr, } break; case MPC_I2C_DFSRR: - s->dfssr = value; + s->dfsrr = value; break; default: DPRINTF("ERROR: Bad write addr 0x%x\n", (unsigned int)addr); @@ -319,7 +319,7 @@ static const VMStateDescription mpc_i2c_vmstate = { VMSTATE_UINT8(cr, MPCI2CState), VMSTATE_UINT8(sr, MPCI2CState), VMSTATE_UINT8(dr, MPCI2CState), - VMSTATE_UINT8(dfssr, MPCI2CState), + VMSTATE_UINT8(dfsrr, MPCI2CState), VMSTATE_END_OF_LIST() } }; @@ -329,7 +329,7 @@ static void mpc_i2c_realize(DeviceState *dev, Error **errp) MPCI2CState *i2c = MPC_I2C(dev); sysbus_init_irq(SYS_BUS_DEVICE(dev), &i2c->irq); memory_region_init_io(&i2c->iomem, OBJECT(i2c), &i2c_ops, i2c, - "mpc-i2c", 0x14); + "mpc-i2c", 0x15); sysbus_init_mmio(SYS_BUS_DEVICE(dev), &i2c->iomem); i2c->bus = i2c_init_bus(dev, "i2c"); } diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index f4e366f64f..5d4bd2b710 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -1536,7 +1536,8 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, .fw_unplugs_cpu = pm->smi_on_cpu_unplug, }; build_cpus_aml(dsdt, machine, opts, pc_madt_cpu_entry, - pm->cpu_hp_io_base, "\\_SB.PCI0", "\\_GPE._E02"); + pm->cpu_hp_io_base, "\\_SB.PCI0", "\\_GPE._E02", + AML_SYSTEM_IO); } if (pcms->memhp_io_base && nr_mem) { diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 37c21a0aec..9a768f0b44 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -358,7 +358,7 @@ static VTDIOTLBEntry *vtd_lookup_iotlb(IntelIOMMUState *s, uint16_t source_id, { struct vtd_iotlb_key key; VTDIOTLBEntry *entry; - int level; + unsigned level; for (level = VTD_SL_PT_LEVEL; level < VTD_SL_PML4_LEVEL; level++) { key.gfn = vtd_get_iotlb_gfn(addr, level); @@ -2666,13 +2666,43 @@ static bool vtd_process_inv_iec_desc(IntelIOMMUState *s, return true; } +static void do_invalidate_device_tlb(VTDAddressSpace *vtd_dev_as, + bool size, hwaddr addr) +{ + /* + * According to ATS spec table 2.4: + * S = 0, bits 15:12 = xxxx range size: 4K + * S = 1, bits 15:12 = xxx0 range size: 8K + * S = 1, bits 15:12 = xx01 range size: 16K + * S = 1, bits 15:12 = x011 range size: 32K + * S = 1, bits 15:12 = 0111 range size: 64K + * ... + */ + + IOMMUTLBEvent event; + uint64_t sz; + + if (size) { + sz = (VTD_PAGE_SIZE * 2) << cto64(addr >> VTD_PAGE_SHIFT); + addr &= ~(sz - 1); + } else { + sz = VTD_PAGE_SIZE; + } + + event.type = IOMMU_NOTIFIER_DEVIOTLB_UNMAP; + event.entry.target_as = &vtd_dev_as->as; + event.entry.addr_mask = sz - 1; + event.entry.iova = addr; + event.entry.perm = IOMMU_NONE; + event.entry.translated_addr = 0; + memory_region_notify_iommu(&vtd_dev_as->iommu, 0, event); +} + static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) { VTDAddressSpace *vtd_dev_as; - IOMMUTLBEvent event; hwaddr addr; - uint64_t sz; uint16_t sid; bool size; @@ -2697,28 +2727,7 @@ static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s, goto done; } - /* According to ATS spec table 2.4: - * S = 0, bits 15:12 = xxxx range size: 4K - * S = 1, bits 15:12 = xxx0 range size: 8K - * S = 1, bits 15:12 = xx01 range size: 16K - * S = 1, bits 15:12 = x011 range size: 32K - * S = 1, bits 15:12 = 0111 range size: 64K - * ... - */ - if (size) { - sz = (VTD_PAGE_SIZE * 2) << cto64(addr >> VTD_PAGE_SHIFT); - addr &= ~(sz - 1); - } else { - sz = VTD_PAGE_SIZE; - } - - event.type = IOMMU_NOTIFIER_DEVIOTLB_UNMAP; - event.entry.target_as = &vtd_dev_as->as; - event.entry.addr_mask = sz - 1; - event.entry.iova = addr; - event.entry.perm = IOMMU_NONE; - event.entry.translated_addr = 0; - memory_region_notify_iommu(&vtd_dev_as->iommu, 0, event); + do_invalidate_device_tlb(vtd_dev_as, size, addr); done: return true; diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h index f8cf99bddf..5f32c36943 100644 --- a/hw/i386/intel_iommu_internal.h +++ b/hw/i386/intel_iommu_internal.h @@ -264,10 +264,10 @@ #define VTD_FRCD_FR(val) (((val) & 0xffULL) << 32) #define VTD_FRCD_SID_MASK 0xffffULL #define VTD_FRCD_SID(val) ((val) & VTD_FRCD_SID_MASK) +#define VTD_FRCD_PV(val) (((val) & 0xffffULL) << 40) +#define VTD_FRCD_PP(val) (((val) & 0x1ULL) << 31) /* For the low 64-bit of 128-bit */ #define VTD_FRCD_FI(val) ((val) & ~0xfffULL) -#define VTD_FRCD_PV(val) (((val) & 0xffffULL) << 40) -#define VTD_FRCD_PP(val) (((val) & 0x1) << 31) #define VTD_FRCD_IR_IDX(val) (((val) & 0xffffULL) << 48) /* DMA Remapping Fault Conditions */ @@ -436,7 +436,7 @@ struct VTDIOTLBPageInvInfo { uint16_t domain_id; uint32_t pasid; uint64_t addr; - uint8_t mask; + uint64_t mask; }; typedef struct VTDIOTLBPageInvInfo VTDIOTLBPageInvInfo; diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index 9445b07b4f..d9e69243b4 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -495,6 +495,7 @@ static void pc_i440fx_machine_9_0_options(MachineClass *m) pc_i440fx_machine_9_1_options(m); m->alias = NULL; m->is_default = false; + m->smbios_memory_device_size = 16 * GiB; compat_props_add(m->compat_props, hw_compat_9_0, hw_compat_9_0_len); compat_props_add(m->compat_props, pc_compat_9_0, pc_compat_9_0_len); diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index 71d3c6d122..9d108b194e 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -374,6 +374,7 @@ static void pc_q35_machine_9_0_options(MachineClass *m) PCMachineClass *pcmc = PC_MACHINE_CLASS(m); pc_q35_machine_9_1_options(m); m->alias = NULL; + m->smbios_memory_device_size = 16 * GiB; compat_props_add(m->compat_props, hw_compat_9_0, hw_compat_9_0_len); compat_props_add(m->compat_props, pc_compat_9_0, pc_compat_9_0_len); pcmc->isa_bios_alias = false; diff --git a/hw/i386/sgx.c b/hw/i386/sgx.c index a14a84bc6f..849472a128 100644 --- a/hw/i386/sgx.c +++ b/hw/i386/sgx.c @@ -268,10 +268,12 @@ void hmp_info_sgx(Monitor *mon, const QDict *qdict) bool sgx_epc_get_section(int section_nr, uint64_t *addr, uint64_t *size) { - PCMachineState *pcms = PC_MACHINE(qdev_get_machine()); + PCMachineState *pcms = + (PCMachineState *)object_dynamic_cast(qdev_get_machine(), + TYPE_PC_MACHINE); SGXEPCDevice *epc; - if (pcms->sgx_epc.size == 0 || pcms->sgx_epc.nr_sections <= section_nr) { + if (!pcms || pcms->sgx_epc.size == 0 || pcms->sgx_epc.nr_sections <= section_nr) { return true; } diff --git a/hw/intc/loongson_ipi.c b/hw/intc/loongson_ipi.c index e6a7142480..682cec96f3 100644 --- a/hw/intc/loongson_ipi.c +++ b/hw/intc/loongson_ipi.c @@ -14,6 +14,7 @@ #include "qapi/error.h" #include "qemu/log.h" #include "exec/address-spaces.h" +#include "exec/memory.h" #include "migration/vmstate.h" #ifdef TARGET_LOONGARCH64 #include "target/loongarch/cpu.h" @@ -102,7 +103,7 @@ static MemTxResult send_ipi_data(CPUState *cpu, uint64_t val, hwaddr addr, * if the mask is 0, we need not to do anything. */ if ((val >> 27) & 0xf) { - data = address_space_ldl(iocsr_as, addr, attrs, NULL); + data = address_space_ldl_le(iocsr_as, addr, attrs, NULL); for (i = 0; i < 4; i++) { /* get mask for byte writing */ if (val & (0x1 << (27 + i))) { @@ -113,7 +114,7 @@ static MemTxResult send_ipi_data(CPUState *cpu, uint64_t val, hwaddr addr, data &= mask; data |= (val >> 32) & ~mask; - address_space_stl(iocsr_as, addr, data, attrs, NULL); + address_space_stl_le(iocsr_as, addr, data, attrs, NULL); return MEMTX_OK; } @@ -317,6 +318,13 @@ static void loongson_ipi_realize(DeviceState *dev, Error **errp) } } +static void loongson_ipi_unrealize(DeviceState *dev) +{ + LoongsonIPI *s = LOONGSON_IPI(dev); + + g_free(s->cpu); +} + static const VMStateDescription vmstate_ipi_core = { .name = "ipi-single", .version_id = 2, @@ -352,28 +360,18 @@ static void loongson_ipi_class_init(ObjectClass *klass, void *data) DeviceClass *dc = DEVICE_CLASS(klass); dc->realize = loongson_ipi_realize; + dc->unrealize = loongson_ipi_unrealize; device_class_set_props(dc, ipi_properties); dc->vmsd = &vmstate_loongson_ipi; } -static void loongson_ipi_finalize(Object *obj) -{ - LoongsonIPI *s = LOONGSON_IPI(obj); - - g_free(s->cpu); -} - -static const TypeInfo loongson_ipi_info = { - .name = TYPE_LOONGSON_IPI, - .parent = TYPE_SYS_BUS_DEVICE, - .instance_size = sizeof(LoongsonIPI), - .class_init = loongson_ipi_class_init, - .instance_finalize = loongson_ipi_finalize, +static const TypeInfo loongson_ipi_types[] = { + { + .name = TYPE_LOONGSON_IPI, + .parent = TYPE_SYS_BUS_DEVICE, + .instance_size = sizeof(LoongsonIPI), + .class_init = loongson_ipi_class_init, + } }; -static void loongson_ipi_register_types(void) -{ - type_register_static(&loongson_ipi_info); -} - -type_init(loongson_ipi_register_types) +DEFINE_TYPES(loongson_ipi_types) diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c index 35ac59883a..d648192ab9 100644 --- a/hw/mem/cxl_type3.c +++ b/hw/mem/cxl_type3.c @@ -737,6 +737,11 @@ static bool cxl_setup_memory(CXLType3Dev *ct3d, Error **errp) error_setg(errp, "volatile memdev must have backing device"); return false; } + if (host_memory_backend_is_mapped(ct3d->hostvmem)) { + error_setg(errp, "memory backend %s can't be used multiple times.", + object_get_canonical_path_component(OBJECT(ct3d->hostvmem))); + return false; + } memory_region_set_nonvolatile(vmr, false); memory_region_set_enabled(vmr, true); host_memory_backend_set_mapped(ct3d->hostvmem, true); @@ -760,6 +765,11 @@ static bool cxl_setup_memory(CXLType3Dev *ct3d, Error **errp) error_setg(errp, "persistent memdev must have backing device"); return false; } + if (host_memory_backend_is_mapped(ct3d->hostpmem)) { + error_setg(errp, "memory backend %s can't be used multiple times.", + object_get_canonical_path_component(OBJECT(ct3d->hostpmem))); + return false; + } memory_region_set_nonvolatile(pmr, true); memory_region_set_enabled(pmr, true); host_memory_backend_set_mapped(ct3d->hostpmem, true); @@ -790,6 +800,11 @@ static bool cxl_setup_memory(CXLType3Dev *ct3d, Error **errp) return false; } + if (host_memory_backend_is_mapped(ct3d->dc.host_dc)) { + error_setg(errp, "memory backend %s can't be used multiple times.", + object_get_canonical_path_component(OBJECT(ct3d->dc.host_dc))); + return false; + } /* * Set DC regions as volatile for now, non-volatile support can * be added in the future if needed. @@ -829,6 +844,7 @@ static void ct3_realize(PCIDevice *pci_dev, Error **errp) uint8_t *pci_conf = pci_dev->config; unsigned short msix_num = 6; int i, rc; + uint16_t count; QTAILQ_INIT(&ct3d->error_list); @@ -893,6 +909,28 @@ static void ct3_realize(PCIDevice *pci_dev, Error **errp) } cxl_event_init(&ct3d->cxl_dstate, 2); + /* Set default value for patrol scrub attributes */ + ct3d->patrol_scrub_attrs.scrub_cycle_cap = + CXL_MEMDEV_PS_SCRUB_CYCLE_CHANGE_CAP_DEFAULT | + CXL_MEMDEV_PS_SCRUB_REALTIME_REPORT_CAP_DEFAULT; + ct3d->patrol_scrub_attrs.scrub_cycle = + CXL_MEMDEV_PS_CUR_SCRUB_CYCLE_DEFAULT | + (CXL_MEMDEV_PS_MIN_SCRUB_CYCLE_DEFAULT << 8); + ct3d->patrol_scrub_attrs.scrub_flags = CXL_MEMDEV_PS_ENABLE_DEFAULT; + + /* Set default value for DDR5 ECS read attributes */ + for (count = 0; count < CXL_ECS_NUM_MEDIA_FRUS; count++) { + ct3d->ecs_attrs[count].ecs_log_cap = + CXL_ECS_LOG_ENTRY_TYPE_DEFAULT; + ct3d->ecs_attrs[count].ecs_cap = + CXL_ECS_REALTIME_REPORT_CAP_DEFAULT; + ct3d->ecs_attrs[count].ecs_config = + CXL_ECS_THRESHOLD_COUNT_DEFAULT | + (CXL_ECS_MODE_DEFAULT << 3); + /* Reserved */ + ct3d->ecs_attrs[count].ecs_flags = 0; + } + return; err_release_cdat: @@ -1127,7 +1165,7 @@ MemTxResult cxl_type3_read(PCIDevice *d, hwaddr host_addr, uint64_t *data, return MEMTX_ERROR; } - if (sanitize_running(&ct3d->cci)) { + if (cxl_dev_media_disabled(&ct3d->cxl_dstate)) { qemu_guest_getrandom_nofail(data, size); return MEMTX_OK; } @@ -1149,7 +1187,7 @@ MemTxResult cxl_type3_write(PCIDevice *d, hwaddr host_addr, uint64_t data, return MEMTX_ERROR; } - if (sanitize_running(&ct3d->cci)) { + if (cxl_dev_media_disabled(&ct3d->cxl_dstate)) { return MEMTX_OK; } @@ -1304,6 +1342,12 @@ void cxl_set_poison_list_overflowed(CXLType3Dev *ct3d) cxl_device_get_timestamp(&ct3d->cxl_dstate); } +void cxl_clear_poison_list_overflowed(CXLType3Dev *ct3d) +{ + ct3d->poison_list_overflowed = false; + ct3d->poison_list_overflow_ts = 0; +} + void qmp_cxl_inject_poison(const char *path, uint64_t start, uint64_t length, Error **errp) { @@ -1340,19 +1384,21 @@ void qmp_cxl_inject_poison(const char *path, uint64_t start, uint64_t length, } } - if (ct3d->poison_list_cnt == CXL_POISON_LIST_LIMIT) { - cxl_set_poison_list_overflowed(ct3d); - return; - } - p = g_new0(CXLPoison, 1); p->length = length; p->start = start; /* Different from injected via the mbox */ p->type = CXL_POISON_TYPE_INTERNAL; - QLIST_INSERT_HEAD(&ct3d->poison_list, p, node); - ct3d->poison_list_cnt++; + if (ct3d->poison_list_cnt < CXL_POISON_LIST_LIMIT) { + QLIST_INSERT_HEAD(&ct3d->poison_list, p, node); + ct3d->poison_list_cnt++; + } else { + if (!ct3d->poison_list_overflowed) { + cxl_set_poison_list_overflowed(ct3d); + } + QLIST_INSERT_HEAD(&ct3d->poison_list_bkp, p, node); + } } /* For uncorrectable errors include support for multiple header recording */ diff --git a/hw/mips/loongson3_virt.c b/hw/mips/loongson3_virt.c index 4ad36f0c5b..408e3d7054 100644 --- a/hw/mips/loongson3_virt.c +++ b/hw/mips/loongson3_virt.c @@ -355,8 +355,8 @@ static uint64_t load_kernel(CPUMIPSState *env) kernel_size = load_elf(loaderparams.kernel_filename, NULL, cpu_mips_kseg0_to_phys, NULL, - (uint64_t *)&kernel_entry, - (uint64_t *)&kernel_low, (uint64_t *)&kernel_high, + &kernel_entry, + &kernel_low, &kernel_high, NULL, 0, EM_MIPS, 1, 0); if (kernel_size < 0) { error_report("could not load kernel '%s': %s", diff --git a/hw/net/allwinner_emac.c b/hw/net/allwinner_emac.c index 989839784a..d40ff37e99 100644 --- a/hw/net/allwinner_emac.c +++ b/hw/net/allwinner_emac.c @@ -349,7 +349,7 @@ static void aw_emac_write(void *opaque, hwaddr offset, uint64_t value, "allwinner_emac: TX length > fifo data length\n"); } if (len > 0) { - data = fifo8_pop_buf(fifo, len, &ret); + data = fifo8_pop_bufptr(fifo, len, &ret); qemu_send_packet(nc, data, ret); aw_emac_tx_reset(s, chan); /* Raise TX interrupt */ diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c index 18898afe81..a788e6937e 100644 --- a/hw/net/vhost_net.c +++ b/hw/net/vhost_net.c @@ -48,6 +48,7 @@ static const int kernel_feature_bits[] = { VIRTIO_F_IOMMU_PLATFORM, VIRTIO_F_RING_PACKED, VIRTIO_F_RING_RESET, + VIRTIO_F_IN_ORDER, VIRTIO_F_NOTIFICATION_DATA, VIRTIO_NET_F_HASH_REPORT, VHOST_INVALID_FEATURE_BIT @@ -78,6 +79,7 @@ static const int user_feature_bits[] = { VIRTIO_F_IOMMU_PLATFORM, VIRTIO_F_RING_PACKED, VIRTIO_F_RING_RESET, + VIRTIO_F_IN_ORDER, VIRTIO_NET_F_RSS, VIRTIO_NET_F_HASH_REPORT, VIRTIO_NET_F_GUEST_USO4, diff --git a/hw/nubus/nubus-virtio-mmio.c b/hw/nubus/nubus-virtio-mmio.c index 58a63c84d0..7a98731c45 100644 --- a/hw/nubus/nubus-virtio-mmio.c +++ b/hw/nubus/nubus-virtio-mmio.c @@ -7,6 +7,7 @@ */ #include "qemu/osdep.h" +#include "qapi/error.h" #include "hw/nubus/nubus-virtio-mmio.h" @@ -23,6 +24,7 @@ static void nubus_virtio_mmio_set_input_irq(void *opaque, int n, int level) static void nubus_virtio_mmio_realize(DeviceState *dev, Error **errp) { + ERRP_GUARD(); NubusVirtioMMIODeviceClass *nvmdc = NUBUS_VIRTIO_MMIO_GET_CLASS(dev); NubusVirtioMMIO *s = NUBUS_VIRTIO_MMIO(dev); NubusDevice *nd = NUBUS_DEVICE(dev); diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 8d25174d25..e86ea2e7ce 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -203,6 +203,7 @@ #include "sysemu/hostmem.h" #include "hw/pci/msix.h" #include "hw/pci/pcie_sriov.h" +#include "sysemu/spdm-socket.h" #include "migration/vmstate.h" #include "nvme.h" @@ -8315,6 +8316,27 @@ static int nvme_add_pm_capability(PCIDevice *pci_dev, uint8_t offset) return 0; } +static bool pcie_doe_spdm_rsp(DOECap *doe_cap) +{ + void *req = pcie_doe_get_write_mbox_ptr(doe_cap); + uint32_t req_len = pcie_doe_get_obj_len(req) * 4; + void *rsp = doe_cap->read_mbox; + uint32_t rsp_len = SPDM_SOCKET_MAX_MESSAGE_BUFFER_SIZE; + + uint32_t recvd = spdm_socket_rsp(doe_cap->spdm_socket, + SPDM_SOCKET_TRANSPORT_TYPE_PCI_DOE, + req, req_len, rsp, rsp_len); + doe_cap->read_mbox_len += DIV_ROUND_UP(recvd, 4); + + return recvd != 0; +} + +static DOEProtocol doe_spdm_prot[] = { + { PCI_VENDOR_ID_PCI_SIG, PCI_SIG_DOE_CMA, pcie_doe_spdm_rsp }, + { PCI_VENDOR_ID_PCI_SIG, PCI_SIG_DOE_SECURED_CMA, pcie_doe_spdm_rsp }, + { } +}; + static bool nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) { ERRP_GUARD(); @@ -8402,6 +8424,25 @@ static bool nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) nvme_update_msixcap_ts(pci_dev, n->conf_msix_qsize); + pcie_cap_deverr_init(pci_dev); + + /* DOE Initialisation */ + if (pci_dev->spdm_port) { + uint16_t doe_offset = n->params.sriov_max_vfs ? + PCI_CONFIG_SPACE_SIZE + PCI_ARI_SIZEOF + : PCI_CONFIG_SPACE_SIZE; + + pcie_doe_init(pci_dev, &pci_dev->doe_spdm, doe_offset, + doe_spdm_prot, true, 0); + + pci_dev->doe_spdm.spdm_socket = spdm_socket_connect(pci_dev->spdm_port, + errp); + + if (pci_dev->doe_spdm.spdm_socket < 0) { + return false; + } + } + if (n->params.cmb_size_mb) { nvme_init_cmb(n, pci_dev); } @@ -8650,6 +8691,11 @@ static void nvme_exit(PCIDevice *pci_dev) g_free(n->cmb.buf); } + if (pci_dev->doe_spdm.spdm_socket > 0) { + spdm_socket_close(pci_dev->doe_spdm.spdm_socket, + SPDM_SOCKET_TRANSPORT_TYPE_PCI_DOE); + } + if (n->pmr.dev) { host_memory_backend_set_mapped(n->pmr.dev, false); } @@ -8695,6 +8741,7 @@ static Property nvme_props[] = { DEFINE_PROP_BOOL("msix-exclusive-bar", NvmeCtrl, params.msix_exclusive_bar, false), DEFINE_PROP_UINT16("mqes", NvmeCtrl, params.mqes, 0x7ff), + DEFINE_PROP_UINT16("spdm_port", PCIDevice, spdm_port, 0), DEFINE_PROP_END_OF_LIST(), }; @@ -8766,11 +8813,25 @@ static void nvme_pci_write_config(PCIDevice *dev, uint32_t address, { uint16_t old_num_vfs = pcie_sriov_num_vfs(dev); + if (pcie_find_capability(dev, PCI_EXT_CAP_ID_DOE)) { + pcie_doe_write_config(&dev->doe_spdm, address, val, len); + } pci_default_write_config(dev, address, val, len); pcie_cap_flr_write_config(dev, address, val, len); nvme_sriov_post_write_config(dev, old_num_vfs); } +static uint32_t nvme_pci_read_config(PCIDevice *dev, uint32_t address, int len) +{ + uint32_t val; + if (dev->spdm_port && pcie_find_capability(dev, PCI_EXT_CAP_ID_DOE)) { + if (pcie_doe_read_config(&dev->doe_spdm, address, len, &val)) { + return val; + } + } + return pci_default_read_config(dev, address, len); +} + static const VMStateDescription nvme_vmstate = { .name = "nvme", .unmigratable = 1, @@ -8783,6 +8844,7 @@ static void nvme_class_init(ObjectClass *oc, void *data) pc->realize = nvme_realize; pc->config_write = nvme_pci_write_config; + pc->config_read = nvme_pci_read_config; pc->exit = nvme_exit; pc->class_id = PCI_CLASS_STORAGE_EXPRESS; pc->revision = 2; diff --git a/hw/pci-host/gpex-acpi.c b/hw/pci-host/gpex-acpi.c index f69413ea2c..391fabb8a8 100644 --- a/hw/pci-host/gpex-acpi.c +++ b/hw/pci-host/gpex-acpi.c @@ -7,7 +7,8 @@ #include "hw/pci/pcie_host.h" #include "hw/acpi/cxl.h" -static void acpi_dsdt_add_pci_route_table(Aml *dev, uint32_t irq) +static void acpi_dsdt_add_pci_route_table(Aml *dev, uint32_t irq, + Aml *scope, uint8_t bus_num) { Aml *method, *crs; int i, slot_no; @@ -20,7 +21,7 @@ static void acpi_dsdt_add_pci_route_table(Aml *dev, uint32_t irq) Aml *pkg = aml_package(4); aml_append(pkg, aml_int((slot_no << 16) | 0xFFFF)); aml_append(pkg, aml_int(i)); - aml_append(pkg, aml_name("GSI%d", gsi)); + aml_append(pkg, aml_name("L%.02X%X", bus_num, gsi)); aml_append(pkg, aml_int(0)); aml_append(rt_pkg, pkg); } @@ -30,7 +31,7 @@ static void acpi_dsdt_add_pci_route_table(Aml *dev, uint32_t irq) /* Create GSI link device */ for (i = 0; i < PCI_NUM_PINS; i++) { uint32_t irqs = irq + i; - Aml *dev_gsi = aml_device("GSI%d", i); + Aml *dev_gsi = aml_device("L%.02X%X", bus_num, i); aml_append(dev_gsi, aml_name_decl("_HID", aml_string("PNP0C0F"))); aml_append(dev_gsi, aml_name_decl("_UID", aml_int(i))); crs = aml_resource_template(); @@ -45,7 +46,7 @@ static void acpi_dsdt_add_pci_route_table(Aml *dev, uint32_t irq) aml_append(dev_gsi, aml_name_decl("_CRS", crs)); method = aml_method("_SRS", 1, AML_NOTSERIALIZED); aml_append(dev_gsi, method); - aml_append(dev, dev_gsi); + aml_append(scope, dev_gsi); } } @@ -174,7 +175,7 @@ void acpi_dsdt_add_gpex(Aml *scope, struct GPEXConfig *cfg) aml_append(dev, aml_name_decl("_PXM", aml_int(numa_node))); } - acpi_dsdt_add_pci_route_table(dev, cfg->irq); + acpi_dsdt_add_pci_route_table(dev, cfg->irq, scope, bus_num); /* * Resources defined for PXBs are composed of the following parts: @@ -205,7 +206,7 @@ void acpi_dsdt_add_gpex(Aml *scope, struct GPEXConfig *cfg) aml_append(dev, aml_name_decl("_STR", aml_unicode("PCIe 0 Device"))); aml_append(dev, aml_name_decl("_CCA", aml_int(1))); - acpi_dsdt_add_pci_route_table(dev, cfg->irq); + acpi_dsdt_add_pci_route_table(dev, cfg->irq, scope, 0); method = aml_method("_CBA", 0, AML_NOTSERIALIZED); aml_append(method, aml_return(aml_int(cfg->ecam.base))); diff --git a/hw/pci/pci.c b/hw/pci/pci.c index 4c7be52951..8ad5d7e2d8 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -85,6 +85,7 @@ static Property pci_props[] = { QEMU_PCIE_ERR_UNC_MASK_BITNR, true), DEFINE_PROP_BIT("x-pcie-ari-nextfn-1", PCIDevice, cap_present, QEMU_PCIE_ARI_NEXTFN_1_BITNR, false), + DEFINE_PROP_STRING("sriov-pf", PCIDevice, sriov_pf), DEFINE_PROP_END_OF_LIST() }; @@ -959,13 +960,8 @@ static void pci_init_multifunction(PCIBus *bus, PCIDevice *dev, Error **errp) dev->config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION; } - /* - * With SR/IOV and ARI, a device at function 0 need not be a multifunction - * device, as it may just be a VF that ended up with function 0 in - * the legacy PCI interpretation. Avoid failing in such cases: - */ - if (pci_is_vf(dev) && - dev->exp.sriov_vf.pf->cap_present & QEMU_PCI_CAP_MULTIFUNCTION) { + /* SR/IOV is not handled here. */ + if (pci_is_vf(dev)) { return; } @@ -998,7 +994,8 @@ static void pci_init_multifunction(PCIBus *bus, PCIDevice *dev, Error **errp) } /* function 0 indicates single function, so function > 0 must be NULL */ for (func = 1; func < PCI_FUNC_MAX; ++func) { - if (bus->devices[PCI_DEVFN(slot, func)]) { + PCIDevice *device = bus->devices[PCI_DEVFN(slot, func)]; + if (device && !pci_is_vf(device)) { error_setg(errp, "PCI: %x.0 indicates single function, " "but %x.%x is already populated.", slot, slot, func); @@ -1283,6 +1280,7 @@ static void pci_qdev_unrealize(DeviceState *dev) pci_unregister_io_regions(pci_dev); pci_del_option_rom(pci_dev); + pcie_sriov_unregister_device(pci_dev); if (pc->exit) { pc->exit(pci_dev); @@ -1314,7 +1312,6 @@ void pci_register_bar(PCIDevice *pci_dev, int region_num, pcibus_t size = memory_region_size(memory); uint8_t hdr_type; - assert(!pci_is_vf(pci_dev)); /* VFs must use pcie_sriov_vf_register_bar */ assert(region_num >= 0); assert(region_num < PCI_NUM_REGIONS); assert(is_power_of_2(size)); @@ -1325,7 +1322,6 @@ void pci_register_bar(PCIDevice *pci_dev, int region_num, assert(hdr_type != PCI_HEADER_TYPE_BRIDGE || region_num < 2); r = &pci_dev->io_regions[region_num]; - r->addr = PCI_BAR_UNMAPPED; r->size = size; r->type = type; r->memory = memory; @@ -1333,22 +1329,35 @@ void pci_register_bar(PCIDevice *pci_dev, int region_num, ? pci_get_bus(pci_dev)->address_space_io : pci_get_bus(pci_dev)->address_space_mem; - wmask = ~(size - 1); - if (region_num == PCI_ROM_SLOT) { - /* ROM enable bit is writable */ - wmask |= PCI_ROM_ADDRESS_ENABLE; - } - - addr = pci_bar(pci_dev, region_num); - pci_set_long(pci_dev->config + addr, type); + if (pci_is_vf(pci_dev)) { + PCIDevice *pf = pci_dev->exp.sriov_vf.pf; + assert(!pf || type == pf->exp.sriov_pf.vf_bar_type[region_num]); - if (!(r->type & PCI_BASE_ADDRESS_SPACE_IO) && - r->type & PCI_BASE_ADDRESS_MEM_TYPE_64) { - pci_set_quad(pci_dev->wmask + addr, wmask); - pci_set_quad(pci_dev->cmask + addr, ~0ULL); + r->addr = pci_bar_address(pci_dev, region_num, r->type, r->size); + if (r->addr != PCI_BAR_UNMAPPED) { + memory_region_add_subregion_overlap(r->address_space, + r->addr, r->memory, 1); + } } else { - pci_set_long(pci_dev->wmask + addr, wmask & 0xffffffff); - pci_set_long(pci_dev->cmask + addr, 0xffffffff); + r->addr = PCI_BAR_UNMAPPED; + + wmask = ~(size - 1); + if (region_num == PCI_ROM_SLOT) { + /* ROM enable bit is writable */ + wmask |= PCI_ROM_ADDRESS_ENABLE; + } + + addr = pci_bar(pci_dev, region_num); + pci_set_long(pci_dev->config + addr, type); + + if (!(r->type & PCI_BASE_ADDRESS_SPACE_IO) && + r->type & PCI_BASE_ADDRESS_MEM_TYPE_64) { + pci_set_quad(pci_dev->wmask + addr, wmask); + pci_set_quad(pci_dev->cmask + addr, ~0ULL); + } else { + pci_set_long(pci_dev->wmask + addr, wmask & 0xffffffff); + pci_set_long(pci_dev->cmask + addr, 0xffffffff); + } } } @@ -1437,7 +1446,11 @@ static pcibus_t pci_config_get_bar_addr(PCIDevice *d, int reg, pci_get_word(pf->config + sriov_cap + PCI_SRIOV_VF_OFFSET); uint16_t vf_stride = pci_get_word(pf->config + sriov_cap + PCI_SRIOV_VF_STRIDE); - uint32_t vf_num = (d->devfn - (pf->devfn + vf_offset)) / vf_stride; + uint32_t vf_num = d->devfn - (pf->devfn + vf_offset); + + if (vf_num) { + vf_num /= vf_stride; + } if (type & PCI_BASE_ADDRESS_MEM_TYPE_64) { new_addr = pci_get_quad(pf->config + bar); @@ -2105,6 +2118,11 @@ static void pci_qdev_realize(DeviceState *qdev, Error **errp) } } + if (!pcie_sriov_register_device(pci_dev, errp)) { + pci_qdev_unrealize(DEVICE(pci_dev)); + return; + } + /* * A PCIe Downstream Port that do not have ARI Forwarding enabled must * associate only Device 0 with the device attached to the bus diff --git a/hw/pci/pcie_sriov.c b/hw/pci/pcie_sriov.c index 56523ab4e8..0fc9f810b9 100644 --- a/hw/pci/pcie_sriov.c +++ b/hw/pci/pcie_sriov.c @@ -20,6 +20,8 @@ #include "qapi/error.h" #include "trace.h" +static GHashTable *pfs; + static void unparent_vfs(PCIDevice *dev, uint16_t total_vfs) { for (uint16_t i = 0; i < total_vfs; i++) { @@ -31,17 +33,62 @@ static void unparent_vfs(PCIDevice *dev, uint16_t total_vfs) dev->exp.sriov_pf.vf = NULL; } -bool pcie_sriov_pf_init(PCIDevice *dev, uint16_t offset, - const char *vfname, uint16_t vf_dev_id, - uint16_t init_vfs, uint16_t total_vfs, - uint16_t vf_offset, uint16_t vf_stride, - Error **errp) +static void clear_ctrl_vfe(PCIDevice *dev) +{ + uint8_t *ctrl = dev->config + dev->exp.sriov_cap + PCI_SRIOV_CTRL; + pci_set_word(ctrl, pci_get_word(ctrl) & ~PCI_SRIOV_CTRL_VFE); +} + +static void register_vfs(PCIDevice *dev) +{ + uint16_t num_vfs; + uint16_t i; + uint16_t sriov_cap = dev->exp.sriov_cap; + + assert(sriov_cap > 0); + num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF); + if (num_vfs > pci_get_word(dev->config + sriov_cap + PCI_SRIOV_TOTAL_VF)) { + clear_ctrl_vfe(dev); + return; + } + + trace_sriov_register_vfs(dev->name, PCI_SLOT(dev->devfn), + PCI_FUNC(dev->devfn), num_vfs); + for (i = 0; i < num_vfs; i++) { + pci_set_enabled(dev->exp.sriov_pf.vf[i], true); + } +} + +static void unregister_vfs(PCIDevice *dev) +{ + uint16_t i; + uint8_t *cfg = dev->config + dev->exp.sriov_cap; + + trace_sriov_unregister_vfs(dev->name, PCI_SLOT(dev->devfn), + PCI_FUNC(dev->devfn)); + for (i = 0; i < pci_get_word(cfg + PCI_SRIOV_TOTAL_VF); i++) { + pci_set_enabled(dev->exp.sriov_pf.vf[i], false); + } +} + +static bool pcie_sriov_pf_init_common(PCIDevice *dev, uint16_t offset, + uint16_t vf_dev_id, uint16_t init_vfs, + uint16_t total_vfs, uint16_t vf_offset, + uint16_t vf_stride, Error **errp) { - BusState *bus = qdev_get_parent_bus(&dev->qdev); - int32_t devfn = dev->devfn + vf_offset; uint8_t *cfg = dev->config + offset; uint8_t *wmask; + if (!pci_is_express(dev)) { + error_setg(errp, "PCI Express is required for SR-IOV PF"); + return false; + } + + if (pci_is_vf(dev)) { + error_setg(errp, "a device cannot be both an SR-IOV PF and a VF"); + return false; + } + if (total_vfs) { uint16_t ari_cap = pcie_find_capability(dev, PCI_EXT_CAP_ID_ARI); uint16_t first_vf_devfn = dev->devfn + vf_offset; @@ -90,6 +137,28 @@ bool pcie_sriov_pf_init(PCIDevice *dev, uint16_t offset, qdev_prop_set_bit(&dev->qdev, "multifunction", true); + return true; +} + +bool pcie_sriov_pf_init(PCIDevice *dev, uint16_t offset, + const char *vfname, uint16_t vf_dev_id, + uint16_t init_vfs, uint16_t total_vfs, + uint16_t vf_offset, uint16_t vf_stride, + Error **errp) +{ + BusState *bus = qdev_get_parent_bus(&dev->qdev); + int32_t devfn = dev->devfn + vf_offset; + + if (pfs && g_hash_table_contains(pfs, dev->qdev.id)) { + error_setg(errp, "attaching user-created SR-IOV VF unsupported"); + return false; + } + + if (!pcie_sriov_pf_init_common(dev, offset, vf_dev_id, init_vfs, + total_vfs, vf_offset, vf_stride, errp)) { + return false; + } + dev->exp.sriov_pf.vf = g_new(PCIDevice *, total_vfs); for (uint16_t i = 0; i < total_vfs; i++) { @@ -119,7 +188,24 @@ void pcie_sriov_pf_exit(PCIDevice *dev) { uint8_t *cfg = dev->config + dev->exp.sriov_cap; - unparent_vfs(dev, pci_get_word(cfg + PCI_SRIOV_TOTAL_VF)); + if (dev->exp.sriov_pf.vf_user_created) { + uint16_t ven_id = pci_get_word(dev->config + PCI_VENDOR_ID); + uint16_t total_vfs = pci_get_word(dev->config + PCI_SRIOV_TOTAL_VF); + uint16_t vf_dev_id = pci_get_word(dev->config + PCI_SRIOV_VF_DID); + + unregister_vfs(dev); + + for (uint16_t i = 0; i < total_vfs; i++) { + PCIDevice *vf = dev->exp.sriov_pf.vf[i]; + + vf->exp.sriov_vf.pf = NULL; + + pci_config_set_vendor_id(vf->config, ven_id); + pci_config_set_device_id(vf->config, vf_dev_id); + } + } else { + unparent_vfs(dev, pci_get_word(cfg + PCI_SRIOV_TOTAL_VF)); + } } void pcie_sriov_pf_init_vf_bar(PCIDevice *dev, int region_num, @@ -152,74 +238,172 @@ void pcie_sriov_pf_init_vf_bar(PCIDevice *dev, int region_num, void pcie_sriov_vf_register_bar(PCIDevice *dev, int region_num, MemoryRegion *memory) { - PCIIORegion *r; - PCIBus *bus = pci_get_bus(dev); uint8_t type; - pcibus_t size = memory_region_size(memory); - assert(pci_is_vf(dev)); /* PFs must use pci_register_bar */ - assert(region_num >= 0); - assert(region_num < PCI_NUM_REGIONS); + assert(dev->exp.sriov_vf.pf); type = dev->exp.sriov_vf.pf->exp.sriov_pf.vf_bar_type[region_num]; - if (!is_power_of_2(size)) { - error_report("%s: PCI region size must be a power" - " of two - type=0x%x, size=0x%"FMT_PCIBUS, - __func__, type, size); - exit(1); - } - - r = &dev->io_regions[region_num]; - r->memory = memory; - r->address_space = - type & PCI_BASE_ADDRESS_SPACE_IO - ? bus->address_space_io - : bus->address_space_mem; - r->size = size; - r->type = type; - - r->addr = pci_bar_address(dev, region_num, r->type, r->size); - if (r->addr != PCI_BAR_UNMAPPED) { - memory_region_add_subregion_overlap(r->address_space, - r->addr, r->memory, 1); - } + return pci_register_bar(dev, region_num, type, memory); } -static void clear_ctrl_vfe(PCIDevice *dev) +static gint compare_vf_devfns(gconstpointer a, gconstpointer b) { - uint8_t *ctrl = dev->config + dev->exp.sriov_cap + PCI_SRIOV_CTRL; - pci_set_word(ctrl, pci_get_word(ctrl) & ~PCI_SRIOV_CTRL_VFE); + return (*(PCIDevice **)a)->devfn - (*(PCIDevice **)b)->devfn; } -static void register_vfs(PCIDevice *dev) +int16_t pcie_sriov_pf_init_from_user_created_vfs(PCIDevice *dev, + uint16_t offset, + Error **errp) { - uint16_t num_vfs; + GPtrArray *pf; + PCIDevice **vfs; + BusState *bus = qdev_get_parent_bus(DEVICE(dev)); + uint16_t ven_id = pci_get_word(dev->config + PCI_VENDOR_ID); + uint16_t vf_dev_id; + uint16_t vf_offset; + uint16_t vf_stride; uint16_t i; - uint16_t sriov_cap = dev->exp.sriov_cap; - assert(sriov_cap > 0); - num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF); - if (num_vfs > pci_get_word(dev->config + sriov_cap + PCI_SRIOV_TOTAL_VF)) { - clear_ctrl_vfe(dev); - return; + if (!pfs || !dev->qdev.id) { + return 0; } - trace_sriov_register_vfs(dev->name, PCI_SLOT(dev->devfn), - PCI_FUNC(dev->devfn), num_vfs); - for (i = 0; i < num_vfs; i++) { - pci_set_enabled(dev->exp.sriov_pf.vf[i], true); + pf = g_hash_table_lookup(pfs, dev->qdev.id); + if (!pf) { + return 0; } + + if (pf->len > UINT16_MAX) { + error_setg(errp, "too many VFs"); + return -1; + } + + g_ptr_array_sort(pf, compare_vf_devfns); + vfs = (void *)pf->pdata; + + if (vfs[0]->devfn <= dev->devfn) { + error_setg(errp, "a VF function number is less than the PF function number"); + return -1; + } + + vf_dev_id = pci_get_word(vfs[0]->config + PCI_DEVICE_ID); + vf_offset = vfs[0]->devfn - dev->devfn; + vf_stride = pf->len < 2 ? 0 : vfs[1]->devfn - vfs[0]->devfn; + + for (i = 0; i < pf->len; i++) { + if (bus != qdev_get_parent_bus(&vfs[i]->qdev)) { + error_setg(errp, "SR-IOV VF parent bus mismatches with PF"); + return -1; + } + + if (ven_id != pci_get_word(vfs[i]->config + PCI_VENDOR_ID)) { + error_setg(errp, "SR-IOV VF vendor ID mismatches with PF"); + return -1; + } + + if (vf_dev_id != pci_get_word(vfs[i]->config + PCI_DEVICE_ID)) { + error_setg(errp, "inconsistent SR-IOV VF device IDs"); + return -1; + } + + for (size_t j = 0; j < PCI_NUM_REGIONS; j++) { + if (vfs[i]->io_regions[j].size != vfs[0]->io_regions[j].size || + vfs[i]->io_regions[j].type != vfs[0]->io_regions[j].type) { + error_setg(errp, "inconsistent SR-IOV BARs"); + return -1; + } + } + + if (vfs[i]->devfn - vfs[0]->devfn != vf_stride * i) { + error_setg(errp, "inconsistent SR-IOV stride"); + return -1; + } + } + + if (!pcie_sriov_pf_init_common(dev, offset, vf_dev_id, pf->len, + pf->len, vf_offset, vf_stride, errp)) { + return -1; + } + + for (i = 0; i < pf->len; i++) { + vfs[i]->exp.sriov_vf.pf = dev; + vfs[i]->exp.sriov_vf.vf_number = i; + + /* set vid/did according to sr/iov spec - they are not used */ + pci_config_set_vendor_id(vfs[i]->config, 0xffff); + pci_config_set_device_id(vfs[i]->config, 0xffff); + } + + dev->exp.sriov_pf.vf = vfs; + dev->exp.sriov_pf.vf_user_created = true; + + for (i = 0; i < PCI_NUM_REGIONS; i++) { + PCIIORegion *region = &vfs[0]->io_regions[i]; + + if (region->size) { + pcie_sriov_pf_init_vf_bar(dev, i, region->type, region->size); + } + } + + return PCI_EXT_CAP_SRIOV_SIZEOF; } -static void unregister_vfs(PCIDevice *dev) +bool pcie_sriov_register_device(PCIDevice *dev, Error **errp) { - uint16_t i; - uint8_t *cfg = dev->config + dev->exp.sriov_cap; + if (!dev->exp.sriov_pf.vf && dev->qdev.id && + pfs && g_hash_table_contains(pfs, dev->qdev.id)) { + error_setg(errp, "attaching user-created SR-IOV VF unsupported"); + return false; + } - trace_sriov_unregister_vfs(dev->name, PCI_SLOT(dev->devfn), - PCI_FUNC(dev->devfn)); - for (i = 0; i < pci_get_word(cfg + PCI_SRIOV_TOTAL_VF); i++) { - pci_set_enabled(dev->exp.sriov_pf.vf[i], false); + if (dev->sriov_pf) { + PCIDevice *pci_pf; + GPtrArray *pf; + + if (!PCI_DEVICE_GET_CLASS(dev)->sriov_vf_user_creatable) { + error_setg(errp, "user cannot create SR-IOV VF with this device type"); + return false; + } + + if (!pci_is_express(dev)) { + error_setg(errp, "PCI Express is required for SR-IOV VF"); + return false; + } + + if (!pci_qdev_find_device(dev->sriov_pf, &pci_pf)) { + error_setg(errp, "PCI device specified as SR-IOV PF already exists"); + return false; + } + + if (!pfs) { + pfs = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, NULL); + } + + pf = g_hash_table_lookup(pfs, dev->sriov_pf); + if (!pf) { + pf = g_ptr_array_new(); + g_hash_table_insert(pfs, g_strdup(dev->sriov_pf), pf); + } + + g_ptr_array_add(pf, dev); + } + + return true; +} + +void pcie_sriov_unregister_device(PCIDevice *dev) +{ + if (dev->sriov_pf && pfs) { + GPtrArray *pf = g_hash_table_lookup(pfs, dev->sriov_pf); + + if (pf) { + g_ptr_array_remove_fast(pf, dev); + + if (!pf->len) { + g_hash_table_remove(pfs, dev->sriov_pf); + g_ptr_array_free(pf, FALSE); + } + } } } @@ -306,7 +490,7 @@ void pcie_sriov_pf_add_sup_pgsize(PCIDevice *dev, uint16_t opt_sup_pgsize) uint16_t pcie_sriov_vf_number(PCIDevice *dev) { - assert(pci_is_vf(dev)); + assert(dev->exp.sriov_vf.pf); return dev->exp.sriov_vf.vf_number; } diff --git a/hw/riscv/virt-acpi-build.c b/hw/riscv/virt-acpi-build.c index 0925528160..36d6a3a412 100644 --- a/hw/riscv/virt-acpi-build.c +++ b/hw/riscv/virt-acpi-build.c @@ -141,12 +141,36 @@ static void acpi_dsdt_add_cpus(Aml *scope, RISCVVirtState *s) } } +static void acpi_dsdt_add_plic_aplic(Aml *scope, uint8_t socket_count, + uint64_t mmio_base, uint64_t mmio_size, + const char *hid) +{ + uint64_t plic_aplic_addr; + uint32_t gsi_base; + uint8_t socket; + + for (socket = 0; socket < socket_count; socket++) { + plic_aplic_addr = mmio_base + mmio_size * socket; + gsi_base = VIRT_IRQCHIP_NUM_SOURCES * socket; + Aml *dev = aml_device("IC%.02X", socket); + aml_append(dev, aml_name_decl("_HID", aml_string("%s", hid))); + aml_append(dev, aml_name_decl("_UID", aml_int(socket))); + aml_append(dev, aml_name_decl("_GSB", aml_int(gsi_base))); + + Aml *crs = aml_resource_template(); + aml_append(crs, aml_memory32_fixed(plic_aplic_addr, mmio_size, + AML_READ_WRITE)); + aml_append(dev, aml_name_decl("_CRS", crs)); + aml_append(scope, dev); + } +} + static void acpi_dsdt_add_uart(Aml *scope, const MemMapEntry *uart_memmap, uint32_t uart_irq) { Aml *dev = aml_device("COM0"); - aml_append(dev, aml_name_decl("_HID", aml_string("PNP0501"))); + aml_append(dev, aml_name_decl("_HID", aml_string("RSCV0003"))); aml_append(dev, aml_name_decl("_UID", aml_int(0))); Aml *crs = aml_resource_template(); @@ -411,6 +435,14 @@ static void build_dsdt(GArray *table_data, socket_count = riscv_socket_count(ms); + if (s->aia_type == VIRT_AIA_TYPE_NONE) { + acpi_dsdt_add_plic_aplic(scope, socket_count, memmap[VIRT_PLIC].base, + memmap[VIRT_PLIC].size, "RSCV0001"); + } else { + acpi_dsdt_add_plic_aplic(scope, socket_count, memmap[VIRT_APLIC_S].base, + memmap[VIRT_APLIC_S].size, "RSCV0002"); + } + acpi_dsdt_add_uart(scope, &memmap[VIRT_UART0], UART0_IRQ); if (socket_count == 1) { diff --git a/hw/rtc/ls7a_rtc.c b/hw/rtc/ls7a_rtc.c index 052201c2cd..3226b6105e 100644 --- a/hw/rtc/ls7a_rtc.c +++ b/hw/rtc/ls7a_rtc.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ /* - * Loongarch LS7A Real Time Clock emulation + * LoongArch LS7A Real Time Clock emulation * * Copyright (C) 2021 Loongson Technology Corporation Limited */ diff --git a/hw/scsi/esp.c b/hw/scsi/esp.c index 8504dd30a0..b7af825623 100644 --- a/hw/scsi/esp.c +++ b/hw/scsi/esp.c @@ -197,39 +197,9 @@ static uint8_t esp_fifo_pop(ESPState *s) return val; } -static uint32_t esp_fifo8_pop_buf(Fifo8 *fifo, uint8_t *dest, int maxlen) -{ - const uint8_t *buf; - uint32_t n, n2; - int len; - - if (maxlen == 0) { - return 0; - } - - len = maxlen; - buf = fifo8_pop_buf(fifo, len, &n); - if (dest) { - memcpy(dest, buf, n); - } - - /* Add FIFO wraparound if needed */ - len -= n; - len = MIN(len, fifo8_num_used(fifo)); - if (len) { - buf = fifo8_pop_buf(fifo, len, &n2); - if (dest) { - memcpy(&dest[n], buf, n2); - } - n += n2; - } - - return n; -} - static uint32_t esp_fifo_pop_buf(ESPState *s, uint8_t *dest, int maxlen) { - uint32_t len = esp_fifo8_pop_buf(&s->fifo, dest, maxlen); + uint32_t len = fifo8_pop_buf(&s->fifo, dest, maxlen); esp_update_drq(s); return len; @@ -335,7 +305,7 @@ static void do_command_phase(ESPState *s) if (!cmdlen || !s->current_dev) { return; } - esp_fifo8_pop_buf(&s->cmdfifo, buf, cmdlen); + fifo8_pop_buf(&s->cmdfifo, buf, cmdlen); current_lun = scsi_device_find(&s->bus, 0, s->current_dev->id, s->lun); if (!current_lun) { @@ -381,7 +351,7 @@ static void do_message_phase(ESPState *s) /* Ignore extended messages for now */ if (s->cmdfifo_cdb_offset) { int len = MIN(s->cmdfifo_cdb_offset, fifo8_num_used(&s->cmdfifo)); - esp_fifo8_pop_buf(&s->cmdfifo, NULL, len); + fifo8_drop(&s->cmdfifo, len); s->cmdfifo_cdb_offset = 0; } } @@ -486,7 +456,7 @@ static bool esp_cdb_ready(ESPState *s) return false; } - pbuf = fifo8_peek_buf(&s->cmdfifo, len, &n); + pbuf = fifo8_peek_bufptr(&s->cmdfifo, len, &n); if (n < len) { /* * In normal use the cmdfifo should never wrap, but include this check diff --git a/hw/scsi/vhost-scsi.c b/hw/scsi/vhost-scsi.c index 3d5fe0994d..49cff2a0cb 100644 --- a/hw/scsi/vhost-scsi.c +++ b/hw/scsi/vhost-scsi.c @@ -38,6 +38,7 @@ static const int kernel_feature_bits[] = { VIRTIO_RING_F_EVENT_IDX, VIRTIO_SCSI_F_HOTPLUG, VIRTIO_F_RING_RESET, + VIRTIO_F_IN_ORDER, VIRTIO_F_NOTIFICATION_DATA, VHOST_INVALID_FEATURE_BIT }; diff --git a/hw/scsi/vhost-user-scsi.c b/hw/scsi/vhost-user-scsi.c index cc91ade525..55e4be5b34 100644 --- a/hw/scsi/vhost-user-scsi.c +++ b/hw/scsi/vhost-user-scsi.c @@ -36,6 +36,7 @@ static const int user_feature_bits[] = { VIRTIO_RING_F_EVENT_IDX, VIRTIO_SCSI_F_HOTPLUG, VIRTIO_F_RING_RESET, + VIRTIO_F_IN_ORDER, VIRTIO_F_NOTIFICATION_DATA, VHOST_INVALID_FEATURE_BIT }; diff --git a/hw/smbios/smbios.c b/hw/smbios/smbios.c index 3b7703489d..a394514264 100644 --- a/hw/smbios/smbios.c +++ b/hw/smbios/smbios.c @@ -1093,6 +1093,7 @@ static bool smbios_get_tables_ep(MachineState *ms, Error **errp) { unsigned i, dimm_cnt, offset; + MachineClass *mc = MACHINE_GET_CLASS(ms); ERRP_GUARD(); assert(ep_type == SMBIOS_ENTRY_POINT_TYPE_32 || @@ -1123,12 +1124,12 @@ static bool smbios_get_tables_ep(MachineState *ms, smbios_build_type_9_table(errp); smbios_build_type_11_table(); -#define MAX_DIMM_SZ (16 * GiB) -#define GET_DIMM_SZ ((i < dimm_cnt - 1) ? MAX_DIMM_SZ \ - : ((current_machine->ram_size - 1) % MAX_DIMM_SZ) + 1) +#define GET_DIMM_SZ ((i < dimm_cnt - 1) ? mc->smbios_memory_device_size \ + : ((current_machine->ram_size - 1) % mc->smbios_memory_device_size) + 1) - dimm_cnt = QEMU_ALIGN_UP(current_machine->ram_size, MAX_DIMM_SZ) / - MAX_DIMM_SZ; + dimm_cnt = QEMU_ALIGN_UP(current_machine->ram_size, + mc->smbios_memory_device_size) / + mc->smbios_memory_device_size; /* * The offset determines if we need to keep additional space between diff --git a/hw/timer/hpet.c b/hw/timer/hpet.c index 4cb5393c0b..471950adef 100644 --- a/hw/timer/hpet.c +++ b/hw/timer/hpet.c @@ -54,10 +54,12 @@ typedef struct HPETTimer { /* timers */ uint64_t cmp; /* comparator */ uint64_t fsb; /* FSB route */ /* Hidden register state */ + uint64_t cmp64; /* comparator (extended to counter width) */ uint64_t period; /* Last value written to comparator */ uint8_t wrap_flag; /* timer pop will indicate wrap for one-shot 32-bit * mode. Next pop will be actual timer expiration. */ + uint64_t last; /* last value armed, to avoid timer storms */ } HPETTimer; struct HPETState { @@ -116,11 +118,6 @@ static uint32_t timer_enabled(HPETTimer *t) static uint32_t hpet_time_after(uint64_t a, uint64_t b) { - return ((int32_t)(b - a) < 0); -} - -static uint32_t hpet_time_after64(uint64_t a, uint64_t b) -{ return ((int64_t)(b - a) < 0); } @@ -156,29 +153,34 @@ static uint64_t hpet_get_ticks(HPETState *s) return ns_to_ticks(qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + s->hpet_offset); } +static uint64_t hpet_get_ns(HPETState *s, uint64_t tick) +{ + return ticks_to_ns(tick) - s->hpet_offset; +} + /* - * calculate diff between comparator value and current ticks + * calculate next value of the general counter that matches the + * target (either entirely, or the low 32-bit only depending on + * the timer mode). */ -static inline uint64_t hpet_calculate_diff(HPETTimer *t, uint64_t current) +static uint64_t hpet_calculate_cmp64(HPETTimer *t, uint64_t cur_tick, uint64_t target) { - if (t->config & HPET_TN_32BIT) { - uint32_t diff, cmp; - - cmp = (uint32_t)t->cmp; - diff = cmp - (uint32_t)current; - diff = (int32_t)diff > 0 ? diff : (uint32_t)1; - return (uint64_t)diff; + uint64_t result = deposit64(cur_tick, 0, 32, target); + if (result < cur_tick) { + result += 0x100000000ULL; + } + return result; } else { - uint64_t diff, cmp; - - cmp = t->cmp; - diff = cmp - current; - diff = (int64_t)diff > 0 ? diff : (uint64_t)1; - return diff; + return target; } } +static uint64_t hpet_next_wrap(uint64_t cur_tick) +{ + return (cur_tick | 0xffffffffU) + 1; +} + static void update_irq(struct HPETTimer *timer, int set) { uint64_t mask; @@ -196,21 +198,31 @@ static void update_irq(struct HPETTimer *timer, int set) } s = timer->state; mask = 1 << timer->tn; - if (!set || !timer_enabled(timer) || !hpet_enabled(timer->state)) { + + if (set && (timer->config & HPET_TN_TYPE_LEVEL)) { + /* + * If HPET_TN_ENABLE bit is 0, "the timer will still operate and + * generate appropriate status bits, but will not cause an interrupt" + */ + s->isr |= mask; + } else { s->isr &= ~mask; + } + + if (set && timer_enabled(timer) && hpet_enabled(s)) { + if (timer_fsb_route(timer)) { + address_space_stl_le(&address_space_memory, timer->fsb >> 32, + timer->fsb & 0xffffffff, MEMTXATTRS_UNSPECIFIED, + NULL); + } else if (timer->config & HPET_TN_TYPE_LEVEL) { + qemu_irq_raise(s->irqs[route]); + } else { + qemu_irq_pulse(s->irqs[route]); + } + } else { if (!timer_fsb_route(timer)) { qemu_irq_lower(s->irqs[route]); } - } else if (timer_fsb_route(timer)) { - address_space_stl_le(&address_space_memory, timer->fsb >> 32, - timer->fsb & 0xffffffff, MEMTXATTRS_UNSPECIFIED, - NULL); - } else if (timer->config & HPET_TN_TYPE_LEVEL) { - s->isr |= mask; - qemu_irq_raise(s->irqs[route]); - } else { - s->isr &= ~mask; - qemu_irq_pulse(s->irqs[route]); } } @@ -250,7 +262,13 @@ static bool hpet_validate_num_timers(void *opaque, int version_id) static int hpet_post_load(void *opaque, int version_id) { HPETState *s = opaque; + int i; + for (i = 0; i < s->num_timers; i++) { + HPETTimer *t = &s->timer[i]; + t->cmp64 = hpet_calculate_cmp64(t, s->hpet_counter, t->cmp); + t->last = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) - NANOSECONDS_PER_SECOND; + } /* Recalculate the offset between the main counter and guest time */ if (!s->hpet_offset_saved) { s->hpet_offset = ticks_to_ns(s->hpet_counter) @@ -346,14 +364,17 @@ static const VMStateDescription vmstate_hpet = { } }; -static void hpet_arm(HPETTimer *t, uint64_t ticks) +static void hpet_arm(HPETTimer *t, uint64_t tick) { - if (ticks < ns_to_ticks(INT64_MAX / 2)) { - timer_mod(t->qemu_timer, - qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + ticks_to_ns(ticks)); - } else { - timer_del(t->qemu_timer); + uint64_t ns = hpet_get_ns(t->state, tick); + + /* Clamp period to reasonable min value (1 us) */ + if (timer_is_periodic(t) && ns - t->last < 1000) { + ns = t->last + 1000; } + + t->last = ns; + timer_mod(t->qemu_timer, ns); } /* @@ -362,72 +383,68 @@ static void hpet_arm(HPETTimer *t, uint64_t ticks) static void hpet_timer(void *opaque) { HPETTimer *t = opaque; - uint64_t diff; - uint64_t period = t->period; uint64_t cur_tick = hpet_get_ticks(t->state); if (timer_is_periodic(t) && period != 0) { + while (hpet_time_after(cur_tick, t->cmp64)) { + t->cmp64 += period; + } if (t->config & HPET_TN_32BIT) { - while (hpet_time_after(cur_tick, t->cmp)) { - t->cmp = (uint32_t)(t->cmp + t->period); - } + t->cmp = (uint32_t)t->cmp64; } else { - while (hpet_time_after64(cur_tick, t->cmp)) { - t->cmp += period; - } - } - diff = hpet_calculate_diff(t, cur_tick); - hpet_arm(t, diff); - } else if (t->config & HPET_TN_32BIT && !timer_is_periodic(t)) { - if (t->wrap_flag) { - diff = hpet_calculate_diff(t, cur_tick); - hpet_arm(t, diff); - t->wrap_flag = 0; + t->cmp = t->cmp64; } + hpet_arm(t, t->cmp64); + } else if (t->wrap_flag) { + t->wrap_flag = 0; + hpet_arm(t, t->cmp64); } update_irq(t, 1); } static void hpet_set_timer(HPETTimer *t) { - uint64_t diff; - uint32_t wrap_diff; /* how many ticks until we wrap? */ uint64_t cur_tick = hpet_get_ticks(t->state); - /* whenever new timer is being set up, make sure wrap_flag is 0 */ t->wrap_flag = 0; - diff = hpet_calculate_diff(t, cur_tick); - - /* hpet spec says in one-shot 32-bit mode, generate an interrupt when - * counter wraps in addition to an interrupt with comparator match. - */ - if (t->config & HPET_TN_32BIT && !timer_is_periodic(t)) { - wrap_diff = 0xffffffff - (uint32_t)cur_tick; - if (wrap_diff < (uint32_t)diff) { - diff = wrap_diff; + t->cmp64 = hpet_calculate_cmp64(t, cur_tick, t->cmp); + if (t->config & HPET_TN_32BIT) { + + /* hpet spec says in one-shot 32-bit mode, generate an interrupt when + * counter wraps in addition to an interrupt with comparator match. + */ + if (!timer_is_periodic(t) && t->cmp64 > hpet_next_wrap(cur_tick)) { t->wrap_flag = 1; + hpet_arm(t, hpet_next_wrap(cur_tick)); + return; } } - hpet_arm(t, diff); + hpet_arm(t, t->cmp64); } static void hpet_del_timer(HPETTimer *t) { + HPETState *s = t->state; timer_del(t->qemu_timer); - update_irq(t, 0); + + if (s->isr & (1 << t->tn)) { + /* For level-triggered interrupt, this leaves ISR set but lowers irq. */ + update_irq(t, 1); + } } static uint64_t hpet_ram_read(void *opaque, hwaddr addr, unsigned size) { HPETState *s = opaque; - uint64_t cur_tick, index; + int shift = (addr & 4) * 8; + uint64_t cur_tick; trace_hpet_ram_read(addr); - index = addr; + /*address range of all TN regs*/ - if (index >= 0x100 && index <= 0x3ff) { + if (addr >= 0x100 && addr <= 0x3ff) { uint8_t timer_id = (addr - 0x100) / 0x20; HPETTimer *timer = &s->timer[timer_id]; @@ -436,52 +453,33 @@ static uint64_t hpet_ram_read(void *opaque, hwaddr addr, return 0; } - switch ((addr - 0x100) % 0x20) { - case HPET_TN_CFG: - return timer->config; - case HPET_TN_CFG + 4: // Interrupt capabilities - return timer->config >> 32; + switch (addr & 0x18) { + case HPET_TN_CFG: // including interrupt capabilities + return timer->config >> shift; case HPET_TN_CMP: // comparator register - return timer->cmp; - case HPET_TN_CMP + 4: - return timer->cmp >> 32; + return timer->cmp >> shift; case HPET_TN_ROUTE: - return timer->fsb; - case HPET_TN_ROUTE + 4: - return timer->fsb >> 32; + return timer->fsb >> shift; default: trace_hpet_ram_read_invalid(); break; } } else { - switch (index) { - case HPET_ID: - return s->capability; - case HPET_PERIOD: - return s->capability >> 32; + switch (addr & ~4) { + case HPET_ID: // including HPET_PERIOD + return s->capability >> shift; case HPET_CFG: - return s->config; - case HPET_CFG + 4: - trace_hpet_invalid_hpet_cfg(4); - return 0; + return s->config >> shift; case HPET_COUNTER: if (hpet_enabled(s)) { cur_tick = hpet_get_ticks(s); } else { cur_tick = s->hpet_counter; } - trace_hpet_ram_read_reading_counter(0, cur_tick); - return cur_tick; - case HPET_COUNTER + 4: - if (hpet_enabled(s)) { - cur_tick = hpet_get_ticks(s); - } else { - cur_tick = s->hpet_counter; - } - trace_hpet_ram_read_reading_counter(4, cur_tick); - return cur_tick >> 32; + trace_hpet_ram_read_reading_counter(addr & 4, cur_tick); + return cur_tick >> shift; case HPET_STATUS: - return s->isr; + return s->isr >> shift; default: trace_hpet_ram_read_invalid(); break; @@ -495,15 +493,14 @@ static void hpet_ram_write(void *opaque, hwaddr addr, { int i; HPETState *s = opaque; - uint64_t old_val, new_val, val, index; + int shift = (addr & 4) * 8; + int len = MIN(size * 8, 64 - shift); + uint64_t old_val, new_val, cleared; trace_hpet_ram_write(addr, value); - index = addr; - old_val = hpet_ram_read(opaque, addr, 4); - new_val = value; /*address range of all TN regs*/ - if (index >= 0x100 && index <= 0x3ff) { + if (addr >= 0x100 && addr <= 0x3ff) { uint8_t timer_id = (addr - 0x100) / 0x20; HPETTimer *timer = &s->timer[timer_id]; @@ -512,71 +509,49 @@ static void hpet_ram_write(void *opaque, hwaddr addr, trace_hpet_timer_id_out_of_range(timer_id); return; } - switch ((addr - 0x100) % 0x20) { + switch (addr & 0x18) { case HPET_TN_CFG: - trace_hpet_ram_write_tn_cfg(); - if (activating_bit(old_val, new_val, HPET_TN_FSB_ENABLE)) { + trace_hpet_ram_write_tn_cfg(addr & 4); + old_val = timer->config; + new_val = deposit64(old_val, shift, len, value); + new_val = hpet_fixup_reg(new_val, old_val, HPET_TN_CFG_WRITE_MASK); + if (deactivating_bit(old_val, new_val, HPET_TN_TYPE_LEVEL)) { + /* + * Do this before changing timer->config; otherwise, if + * HPET_TN_FSB is set, update_irq will not lower the qemu_irq. + */ update_irq(timer, 0); } - val = hpet_fixup_reg(new_val, old_val, HPET_TN_CFG_WRITE_MASK); - timer->config = (timer->config & 0xffffffff00000000ULL) | val; + timer->config = new_val; + if (activating_bit(old_val, new_val, HPET_TN_ENABLE) + && (s->isr & (1 << timer_id))) { + update_irq(timer, 1); + } if (new_val & HPET_TN_32BIT) { timer->cmp = (uint32_t)timer->cmp; timer->period = (uint32_t)timer->period; } - if (activating_bit(old_val, new_val, HPET_TN_ENABLE) && - hpet_enabled(s)) { + if (hpet_enabled(s)) { hpet_set_timer(timer); - } else if (deactivating_bit(old_val, new_val, HPET_TN_ENABLE)) { - hpet_del_timer(timer); } break; - case HPET_TN_CFG + 4: // Interrupt capabilities - trace_hpet_ram_write_invalid_tn_cfg(4); - break; case HPET_TN_CMP: // comparator register - trace_hpet_ram_write_tn_cmp(0); if (timer->config & HPET_TN_32BIT) { - new_val = (uint32_t)new_val; - } - if (!timer_is_periodic(timer) - || (timer->config & HPET_TN_SETVAL)) { - timer->cmp = (timer->cmp & 0xffffffff00000000ULL) | new_val; - } - if (timer_is_periodic(timer)) { - /* - * FIXME: Clamp period to reasonable min value? - * Clamp period to reasonable max value - */ - if (timer->config & HPET_TN_32BIT) { - new_val = MIN(new_val, ~0u >> 1); + /* High 32-bits are zero, leave them untouched. */ + if (shift) { + trace_hpet_ram_write_invalid_tn_cmp(); + break; } - timer->period = - (timer->period & 0xffffffff00000000ULL) | new_val; - } - /* - * FIXME: on a 64-bit write, HPET_TN_SETVAL should apply to the - * high bits part as well. - */ - timer->config &= ~HPET_TN_SETVAL; - if (hpet_enabled(s)) { - hpet_set_timer(timer); + len = 64; + value = (uint32_t) value; } - break; - case HPET_TN_CMP + 4: // comparator register high order - trace_hpet_ram_write_tn_cmp(4); + trace_hpet_ram_write_tn_cmp(addr & 4); if (!timer_is_periodic(timer) || (timer->config & HPET_TN_SETVAL)) { - timer->cmp = (timer->cmp & 0xffffffffULL) | new_val << 32; + timer->cmp = deposit64(timer->cmp, shift, len, value); } if (timer_is_periodic(timer)) { - /* - * FIXME: Clamp period to reasonable min value? - * Clamp period to reasonable max value - */ - new_val = MIN(new_val, ~0u >> 1); - timer->period = - (timer->period & 0xffffffffULL) | new_val << 32; + timer->period = deposit64(timer->period, shift, len, value); } timer->config &= ~HPET_TN_SETVAL; if (hpet_enabled(s)) { @@ -584,10 +559,7 @@ static void hpet_ram_write(void *opaque, hwaddr addr, } break; case HPET_TN_ROUTE: - timer->fsb = (timer->fsb & 0xffffffff00000000ULL) | new_val; - break; - case HPET_TN_ROUTE + 4: - timer->fsb = (new_val << 32) | (timer->fsb & 0xffffffff); + timer->fsb = deposit64(timer->fsb, shift, len, value); break; default: trace_hpet_ram_write_invalid(); @@ -595,20 +567,23 @@ static void hpet_ram_write(void *opaque, hwaddr addr, } return; } else { - switch (index) { + switch (addr & ~4) { case HPET_ID: return; case HPET_CFG: - val = hpet_fixup_reg(new_val, old_val, HPET_CFG_WRITE_MASK); - s->config = (s->config & 0xffffffff00000000ULL) | val; + old_val = s->config; + new_val = deposit64(old_val, shift, len, value); + new_val = hpet_fixup_reg(new_val, old_val, HPET_CFG_WRITE_MASK); + s->config = new_val; if (activating_bit(old_val, new_val, HPET_CFG_ENABLE)) { /* Enable main counter and interrupt generation. */ s->hpet_offset = ticks_to_ns(s->hpet_counter) - qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); for (i = 0; i < s->num_timers; i++) { - if ((&s->timer[i])->cmp != ~0ULL) { - hpet_set_timer(&s->timer[i]); + if (timer_enabled(&s->timer[i]) && (s->isr & (1 << i))) { + update_irq(&s->timer[i], 1); } + hpet_set_timer(&s->timer[i]); } } else if (deactivating_bit(old_val, new_val, HPET_CFG_ENABLE)) { /* Halt main counter and disable interrupt generation. */ @@ -629,13 +604,11 @@ static void hpet_ram_write(void *opaque, hwaddr addr, qemu_set_irq(s->irqs[RTC_ISA_IRQ], s->rtc_irq_level); } break; - case HPET_CFG + 4: - trace_hpet_invalid_hpet_cfg(4); - break; case HPET_STATUS: - val = new_val & s->isr; + new_val = value << shift; + cleared = new_val & s->isr; for (i = 0; i < s->num_timers; i++) { - if (val & (1 << i)) { + if (cleared & (1 << i)) { update_irq(&s->timer[i], 0); } } @@ -644,15 +617,7 @@ static void hpet_ram_write(void *opaque, hwaddr addr, if (hpet_enabled(s)) { trace_hpet_ram_write_counter_write_while_enabled(); } - s->hpet_counter = - (s->hpet_counter & 0xffffffff00000000ULL) | value; - trace_hpet_ram_write_counter_written(0, value, s->hpet_counter); - break; - case HPET_COUNTER + 4: - trace_hpet_ram_write_counter_write_while_enabled(); - s->hpet_counter = - (s->hpet_counter & 0xffffffffULL) | (((uint64_t)value) << 32); - trace_hpet_ram_write_counter_written(4, value, s->hpet_counter); + s->hpet_counter = deposit64(s->hpet_counter, shift, len, value); break; default: trace_hpet_ram_write_invalid(); @@ -666,7 +631,11 @@ static const MemoryRegionOps hpet_ram_ops = { .write = hpet_ram_write, .valid = { .min_access_size = 4, - .max_access_size = 4, + .max_access_size = 8, + }, + .impl = { + .min_access_size = 4, + .max_access_size = 8, }, .endianness = DEVICE_NATIVE_ENDIAN, }; diff --git a/hw/timer/trace-events b/hw/timer/trace-events index de769f4b71..f48a712801 100644 --- a/hw/timer/trace-events +++ b/hw/timer/trace-events @@ -108,9 +108,9 @@ hpet_ram_read_reading_counter(uint8_t reg_off, uint64_t cur_tick) "reading count hpet_ram_read_invalid(void) "invalid hpet_ram_readl" hpet_ram_write(uint64_t addr, uint64_t value) "enter hpet_ram_writel at 0x%" PRIx64 " = 0x%" PRIx64 hpet_ram_write_timer_id(uint64_t timer_id) "hpet_ram_writel timer_id = 0x%" PRIx64 -hpet_ram_write_tn_cfg(void) "hpet_ram_writel HPET_TN_CFG" -hpet_ram_write_invalid_tn_cfg(uint8_t reg_off) "invalid HPET_TN_CFG + %" PRIu8 " write" +hpet_ram_write_tn_cfg(uint8_t reg_off) "hpet_ram_writel HPET_TN_CFG + %" PRIu8 hpet_ram_write_tn_cmp(uint8_t reg_off) "hpet_ram_writel HPET_TN_CMP + %" PRIu8 +hpet_ram_write_invalid_tn_cmp(void) "invalid HPET_TN_CMP + 4 write" hpet_ram_write_invalid(void) "invalid hpet_ram_writel" hpet_ram_write_counter_write_while_enabled(void) "Writing counter while HPET enabled!" hpet_ram_write_counter_written(uint8_t reg_off, uint64_t value, uint64_t counter) "HPET counter + %" PRIu8 "written. crt = 0x%" PRIx64 " -> 0x%" PRIx64 diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c index 0c4354e3e7..71bf32b83c 100644 --- a/hw/vfio/ap.c +++ b/hw/vfio/ap.c @@ -230,6 +230,9 @@ static void vfio_ap_instance_init(Object *obj) */ vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_AP, &vfio_ap_ops, DEVICE(vapdev), true); + + /* AP device is mdev type device */ + vbasedev->mdev = true; } #ifdef CONFIG_IOMMUFD diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c index 1f8e1272c7..115862f430 100644 --- a/hw/vfio/ccw.c +++ b/hw/vfio/ccw.c @@ -675,6 +675,9 @@ static void vfio_ccw_instance_init(Object *obj) VFIOCCWDevice *vcdev = VFIO_CCW(obj); VFIODevice *vbasedev = &vcdev->vdev; + /* CCW device is mdev type device */ + vbasedev->mdev = true; + /* * All vfio-ccw devices are believed to operate in a way compatible with * discarding of memory in RAM blocks, ie. pages pinned in the host are diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 6d15b36e0b..36d0cf6585 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -199,6 +199,9 @@ bool vfio_devices_all_device_dirty_tracking(const VFIOContainerBase *bcontainer) VFIODevice *vbasedev; QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { + if (vbasedev->device_dirty_page_tracking == ON_OFF_AUTO_OFF) { + return false; + } if (!vbasedev->dirty_pages_supported) { return false; } @@ -599,7 +602,7 @@ static void vfio_listener_region_add(MemoryListener *listener, IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr); int iommu_idx; - trace_vfio_listener_region_add_iommu(iova, end); + trace_vfio_listener_region_add_iommu(section->mr->name, iova, end); /* * FIXME: For VFIO iommu types which have KVM acceleration to * avoid bouncing all map/unmaps through qemu this way, this @@ -725,6 +728,7 @@ static void vfio_listener_region_del(MemoryListener *listener, if (memory_region_is_iommu(section->mr)) { VFIOGuestIOMMU *giommu; + trace_vfio_listener_region_del_iommu(section->mr->name); QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) { if (MEMORY_REGION(giommu->iommu_mr) == section->mr && giommu->n.start == section->offset_within_region) { @@ -1536,7 +1540,7 @@ bool vfio_attach_device(char *name, VFIODevice *vbasedev, { const VFIOIOMMUClass *ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY)); - HostIOMMUDevice *hiod; + HostIOMMUDevice *hiod = NULL; if (vbasedev->iommufd) { ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD)); @@ -1544,17 +1548,17 @@ bool vfio_attach_device(char *name, VFIODevice *vbasedev, assert(ops); - if (!ops->attach_device(name, vbasedev, as, errp)) { - return false; + + if (!vbasedev->mdev) { + hiod = HOST_IOMMU_DEVICE(object_new(ops->hiod_typename)); + vbasedev->hiod = hiod; } - hiod = HOST_IOMMU_DEVICE(object_new(ops->hiod_typename)); - if (!HOST_IOMMU_DEVICE_GET_CLASS(hiod)->realize(hiod, vbasedev, errp)) { + if (!ops->attach_device(name, vbasedev, as, errp)) { object_unref(hiod); - ops->detach_device(vbasedev); + vbasedev->hiod = NULL; return false; } - vbasedev->hiod = hiod; return true; } diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 38a9df3496..9ccdb639ac 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -656,7 +656,6 @@ static bool vfio_connect_container(VFIOGroup *group, AddressSpace *as, return true; listener_release_exit: QLIST_REMOVE(group, container_next); - QLIST_REMOVE(bcontainer, next); vfio_kvm_device_del_group(group); memory_listener_unregister(&bcontainer->listener); if (vioc->release) { @@ -915,6 +914,10 @@ static bool vfio_legacy_attach_device(const char *name, VFIODevice *vbasedev, trace_vfio_attach_device(vbasedev->name, groupid); + if (!vfio_device_hiod_realize(vbasedev, errp)) { + return false; + } + group = vfio_get_group(groupid, as, errp); if (!group) { return false; @@ -1142,7 +1145,6 @@ static bool hiod_legacy_vfio_realize(HostIOMMUDevice *hiod, void *opaque, VFIODevice *vdev = opaque; hiod->name = g_strdup(vdev->name); - hiod->caps.aw_bits = vfio_device_get_aw_bits(vdev); hiod->agent = opaque; return true; @@ -1151,11 +1153,9 @@ static bool hiod_legacy_vfio_realize(HostIOMMUDevice *hiod, void *opaque, static int hiod_legacy_vfio_get_cap(HostIOMMUDevice *hiod, int cap, Error **errp) { - HostIOMMUDeviceCaps *caps = &hiod->caps; - switch (cap) { case HOST_IOMMU_DEVICE_CAP_AW_BITS: - return caps->aw_bits; + return vfio_device_get_aw_bits(hiod->agent); default: error_setg(errp, "%s: unsupported capability %x", hiod->name, cap); return -EINVAL; diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c index b14edd46ed..ea15c79db0 100644 --- a/hw/vfio/helpers.c +++ b/hw/vfio/helpers.c @@ -675,3 +675,28 @@ int vfio_device_get_aw_bits(VFIODevice *vdev) return HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX; } + +bool vfio_device_is_mdev(VFIODevice *vbasedev) +{ + g_autofree char *subsys = NULL; + g_autofree char *tmp = NULL; + + if (!vbasedev->sysfsdev) { + return false; + } + + tmp = g_strdup_printf("%s/subsystem", vbasedev->sysfsdev); + subsys = realpath(tmp, NULL); + return subsys && (strcmp(subsys, "/sys/bus/mdev") == 0); +} + +bool vfio_device_hiod_realize(VFIODevice *vbasedev, Error **errp) +{ + HostIOMMUDevice *hiod = vbasedev->hiod; + + if (!hiod) { + return true; + } + + return HOST_IOMMU_DEVICE_GET_CLASS(hiod)->realize(hiod, vbasedev, errp); +} diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 7b5f87a148..e7bece4ea1 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -25,6 +25,7 @@ #include "qemu/cutils.h" #include "qemu/chardev_open.h" #include "pci.h" +#include "exec/ram_addr.h" static int iommufd_cdev_map(const VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, void *vaddr, bool readonly) @@ -110,6 +111,68 @@ static void iommufd_cdev_unbind_and_disconnect(VFIODevice *vbasedev) iommufd_backend_disconnect(vbasedev->iommufd); } +static bool iommufd_hwpt_dirty_tracking(VFIOIOASHwpt *hwpt) +{ + return hwpt && hwpt->hwpt_flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; +} + +static int iommufd_set_dirty_page_tracking(const VFIOContainerBase *bcontainer, + bool start, Error **errp) +{ + const VFIOIOMMUFDContainer *container = + container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); + VFIOIOASHwpt *hwpt; + + QLIST_FOREACH(hwpt, &container->hwpt_list, next) { + if (!iommufd_hwpt_dirty_tracking(hwpt)) { + continue; + } + + if (!iommufd_backend_set_dirty_tracking(container->be, + hwpt->hwpt_id, start, errp)) { + goto err; + } + } + + return 0; + +err: + QLIST_FOREACH(hwpt, &container->hwpt_list, next) { + if (!iommufd_hwpt_dirty_tracking(hwpt)) { + continue; + } + iommufd_backend_set_dirty_tracking(container->be, + hwpt->hwpt_id, !start, NULL); + } + return -EINVAL; +} + +static int iommufd_query_dirty_bitmap(const VFIOContainerBase *bcontainer, + VFIOBitmap *vbmap, hwaddr iova, + hwaddr size, Error **errp) +{ + VFIOIOMMUFDContainer *container = container_of(bcontainer, + VFIOIOMMUFDContainer, + bcontainer); + unsigned long page_size = qemu_real_host_page_size(); + VFIOIOASHwpt *hwpt; + + QLIST_FOREACH(hwpt, &container->hwpt_list, next) { + if (!iommufd_hwpt_dirty_tracking(hwpt)) { + continue; + } + + if (!iommufd_backend_get_dirty_bitmap(container->be, hwpt->hwpt_id, + iova, size, page_size, + (uint64_t *)vbmap->bitmap, + errp)) { + return -EINVAL; + } + } + + return 0; +} + static int iommufd_cdev_getfd(const char *sysfs_path, Error **errp) { ERRP_GUARD(); @@ -172,7 +235,7 @@ out: return ret; } -static bool iommufd_cdev_attach_ioas_hwpt(VFIODevice *vbasedev, uint32_t id, +static int iommufd_cdev_attach_ioas_hwpt(VFIODevice *vbasedev, uint32_t id, Error **errp) { int iommufd = vbasedev->iommufd->fd; @@ -187,12 +250,12 @@ static bool iommufd_cdev_attach_ioas_hwpt(VFIODevice *vbasedev, uint32_t id, error_setg_errno(errp, errno, "[iommufd=%d] error attach %s (%d) to id=%d", iommufd, vbasedev->name, vbasedev->fd, id); - return false; + return -errno; } trace_iommufd_cdev_attach_ioas_hwpt(iommufd, vbasedev->name, vbasedev->fd, id); - return true; + return 0; } static bool iommufd_cdev_detach_ioas_hwpt(VFIODevice *vbasedev, Error **errp) @@ -212,11 +275,111 @@ static bool iommufd_cdev_detach_ioas_hwpt(VFIODevice *vbasedev, Error **errp) return true; } +static bool iommufd_cdev_autodomains_get(VFIODevice *vbasedev, + VFIOIOMMUFDContainer *container, + Error **errp) +{ + ERRP_GUARD(); + IOMMUFDBackend *iommufd = vbasedev->iommufd; + uint32_t flags = 0; + VFIOIOASHwpt *hwpt; + uint32_t hwpt_id; + int ret; + + /* Try to find a domain */ + QLIST_FOREACH(hwpt, &container->hwpt_list, next) { + ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, errp); + if (ret) { + /* -EINVAL means the domain is incompatible with the device. */ + if (ret == -EINVAL) { + /* + * It is an expected failure and it just means we will try + * another domain, or create one if no existing compatible + * domain is found. Hence why the error is discarded below. + */ + error_free(*errp); + *errp = NULL; + continue; + } + + return false; + } else { + vbasedev->hwpt = hwpt; + QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next); + vbasedev->iommu_dirty_tracking = iommufd_hwpt_dirty_tracking(hwpt); + return true; + } + } + + /* + * This is quite early and VFIO Migration state isn't yet fully + * initialized, thus rely only on IOMMU hardware capabilities as to + * whether IOMMU dirty tracking is going to be requested. Later + * vfio_migration_realize() may decide to use VF dirty tracking + * instead. + */ + if (vbasedev->hiod->caps.hw_caps & IOMMU_HW_CAP_DIRTY_TRACKING) { + flags = IOMMU_HWPT_ALLOC_DIRTY_TRACKING; + } + + if (!iommufd_backend_alloc_hwpt(iommufd, vbasedev->devid, + container->ioas_id, flags, + IOMMU_HWPT_DATA_NONE, 0, NULL, + &hwpt_id, errp)) { + return false; + } + + hwpt = g_malloc0(sizeof(*hwpt)); + hwpt->hwpt_id = hwpt_id; + hwpt->hwpt_flags = flags; + QLIST_INIT(&hwpt->device_list); + + ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, errp); + if (ret) { + iommufd_backend_free_id(container->be, hwpt->hwpt_id); + g_free(hwpt); + return false; + } + + vbasedev->hwpt = hwpt; + vbasedev->iommu_dirty_tracking = iommufd_hwpt_dirty_tracking(hwpt); + QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next); + QLIST_INSERT_HEAD(&container->hwpt_list, hwpt, next); + container->bcontainer.dirty_pages_supported |= + vbasedev->iommu_dirty_tracking; + if (container->bcontainer.dirty_pages_supported && + !vbasedev->iommu_dirty_tracking) { + warn_report("IOMMU instance for device %s doesn't support dirty tracking", + vbasedev->name); + } + return true; +} + +static void iommufd_cdev_autodomains_put(VFIODevice *vbasedev, + VFIOIOMMUFDContainer *container) +{ + VFIOIOASHwpt *hwpt = vbasedev->hwpt; + + QLIST_REMOVE(vbasedev, hwpt_next); + vbasedev->hwpt = NULL; + + if (QLIST_EMPTY(&hwpt->device_list)) { + QLIST_REMOVE(hwpt, next); + iommufd_backend_free_id(container->be, hwpt->hwpt_id); + g_free(hwpt); + } +} + static bool iommufd_cdev_attach_container(VFIODevice *vbasedev, VFIOIOMMUFDContainer *container, Error **errp) { - return iommufd_cdev_attach_ioas_hwpt(vbasedev, container->ioas_id, errp); + /* mdevs aren't physical devices and will fail with auto domains */ + if (!vbasedev->mdev) { + return iommufd_cdev_autodomains_get(vbasedev, container, errp); + } + + return !iommufd_cdev_attach_ioas_hwpt(vbasedev, container->ioas_id, errp); } static void iommufd_cdev_detach_container(VFIODevice *vbasedev, @@ -227,6 +390,11 @@ static void iommufd_cdev_detach_container(VFIODevice *vbasedev, if (!iommufd_cdev_detach_ioas_hwpt(vbasedev, &err)) { error_report_err(err); } + + if (vbasedev->hwpt) { + iommufd_cdev_autodomains_put(vbasedev, container); + } + } static void iommufd_cdev_container_destroy(VFIOIOMMUFDContainer *container) @@ -320,6 +488,17 @@ static bool iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, space = vfio_get_address_space(as); + /* + * The HostIOMMUDevice data from legacy backend is static and doesn't need + * any information from the (type1-iommu) backend to be initialized. In + * contrast however, the IOMMUFD HostIOMMUDevice data requires the iommufd + * FD to be connected and having a devid to be able to successfully call + * iommufd_backend_get_device_info(). + */ + if (!vfio_device_hiod_realize(vbasedev, errp)) { + goto err_alloc_ioas; + } + /* try to attach to an existing container in this space */ QLIST_FOREACH(bcontainer, &space->containers, next) { container = container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); @@ -354,6 +533,7 @@ static bool iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, container = VFIO_IOMMU_IOMMUFD(object_new(TYPE_VFIO_IOMMU_IOMMUFD)); container->be = vbasedev->iommufd; container->ioas_id = ioas_id; + QLIST_INIT(&container->hwpt_list); bcontainer = &container->bcontainer; vfio_address_space_insert(space, bcontainer); @@ -617,6 +797,8 @@ static void vfio_iommu_iommufd_class_init(ObjectClass *klass, void *data) vioc->attach_device = iommufd_cdev_attach; vioc->detach_device = iommufd_cdev_detach; vioc->pci_hot_reset = iommufd_cdev_pci_hot_reset; + vioc->set_dirty_page_tracking = iommufd_set_dirty_page_tracking; + vioc->query_dirty_bitmap = iommufd_query_dirty_bitmap; }; static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, @@ -628,17 +810,19 @@ static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, union { struct iommu_hw_info_vtd vtd; } data; + uint64_t hw_caps; hiod->agent = opaque; if (!iommufd_backend_get_device_info(vdev->iommufd, vdev->devid, - &type, &data, sizeof(data), errp)) { + &type, &data, sizeof(data), + &hw_caps, errp)) { return false; } hiod->name = g_strdup(vdev->name); caps->type = type; - caps->aw_bits = vfio_device_get_aw_bits(vdev); + caps->hw_caps = hw_caps; return true; } diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index 34d4be2ce1..262d42a46e 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -1036,16 +1036,18 @@ bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp) return !vfio_block_migration(vbasedev, err, errp); } - if (!vbasedev->dirty_pages_supported) { + if ((!vbasedev->dirty_pages_supported || + vbasedev->device_dirty_page_tracking == ON_OFF_AUTO_OFF) && + !vbasedev->iommu_dirty_tracking) { if (vbasedev->enable_migration == ON_OFF_AUTO_AUTO) { error_setg(&err, - "%s: VFIO device doesn't support device dirty tracking", - vbasedev->name); + "%s: VFIO device doesn't support device and " + "IOMMU dirty tracking", vbasedev->name); goto add_blocker; } - warn_report("%s: VFIO device doesn't support device dirty tracking", - vbasedev->name); + warn_report("%s: VFIO device doesn't support device and " + "IOMMU dirty tracking", vbasedev->name); } ret = vfio_block_multiple_devices_migration(vbasedev, errp); diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index e03d9f3ba5..2407720c35 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2963,12 +2963,9 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) ERRP_GUARD(); VFIOPCIDevice *vdev = VFIO_PCI(pdev); VFIODevice *vbasedev = &vdev->vbasedev; - char *subsys; int i, ret; - bool is_mdev; char uuid[UUID_STR_LEN]; g_autofree char *name = NULL; - g_autofree char *tmp = NULL; if (vbasedev->fd < 0 && !vbasedev->sysfsdev) { if (!(~vdev->host.domain || ~vdev->host.bus || @@ -2997,14 +2994,11 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) * stays in sync with the active working set of the guest driver. Prevent * the x-balloon-allowed option unless this is minimally an mdev device. */ - tmp = g_strdup_printf("%s/subsystem", vbasedev->sysfsdev); - subsys = realpath(tmp, NULL); - is_mdev = subsys && (strcmp(subsys, "/sys/bus/mdev") == 0); - free(subsys); + vbasedev->mdev = vfio_device_is_mdev(vbasedev); - trace_vfio_mdev(vbasedev->name, is_mdev); + trace_vfio_mdev(vbasedev->name, vbasedev->mdev); - if (vbasedev->ram_block_discard_allowed && !is_mdev) { + if (vbasedev->ram_block_discard_allowed && !vbasedev->mdev) { error_setg(errp, "x-balloon-allowed only potentially compatible " "with mdev devices"); goto error; @@ -3121,7 +3115,8 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) vfio_bars_register(vdev); - if (!pci_device_set_iommu_device(pdev, vbasedev->hiod, errp)) { + if (!vbasedev->mdev && + !pci_device_set_iommu_device(pdev, vbasedev->hiod, errp)) { error_prepend(errp, "Failed to set iommu_device: "); goto out_teardown; } @@ -3244,7 +3239,9 @@ out_deregister: timer_free(vdev->intx.mmap_timer); } out_unset_idev: - pci_device_unset_iommu_device(pdev); + if (!vbasedev->mdev) { + pci_device_unset_iommu_device(pdev); + } out_teardown: vfio_teardown_msi(vdev); vfio_bars_exit(vdev); @@ -3289,7 +3286,9 @@ static void vfio_exitfn(PCIDevice *pdev) vfio_pci_disable_rp_atomics(vdev); vfio_bars_exit(vdev); vfio_migration_exit(vbasedev); - pci_device_unset_iommu_device(pdev); + if (!vbasedev->mdev) { + pci_device_unset_iommu_device(pdev); + } } static void vfio_pci_reset(DeviceState *dev) @@ -3362,6 +3361,9 @@ static Property vfio_pci_dev_properties[] = { DEFINE_PROP_ON_OFF_AUTO("x-pre-copy-dirty-page-tracking", VFIOPCIDevice, vbasedev.pre_copy_dirty_page_tracking, ON_OFF_AUTO_ON), + DEFINE_PROP_ON_OFF_AUTO("x-device-dirty-page-tracking", VFIOPCIDevice, + vbasedev.device_dirty_page_tracking, + ON_OFF_AUTO_ON), DEFINE_PROP_ON_OFF_AUTO("display", VFIOPCIDevice, display, ON_OFF_AUTO_OFF), DEFINE_PROP_UINT32("xres", VFIOPCIDevice, display_xres, 0), diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index e16179b507..98bd4dccea 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -95,7 +95,8 @@ vfio_region_read(char *name, int index, uint64_t addr, unsigned size, uint64_t d vfio_iommu_map_notify(const char *op, uint64_t iova_start, uint64_t iova_end) "iommu %s @ 0x%"PRIx64" - 0x%"PRIx64 vfio_listener_region_skip(const char *name, uint64_t start, uint64_t end) "SKIPPING %s 0x%"PRIx64" - 0x%"PRIx64 vfio_spapr_group_attach(int groupfd, int tablefd) "Attached groupfd %d to liobn fd %d" -vfio_listener_region_add_iommu(uint64_t start, uint64_t end) "region_add [iommu] 0x%"PRIx64" - 0x%"PRIx64 +vfio_listener_region_add_iommu(const char* name, uint64_t start, uint64_t end) "region_add [iommu] %s 0x%"PRIx64" - 0x%"PRIx64 +vfio_listener_region_del_iommu(const char *name) "region_del [iommu] %s" vfio_listener_region_add_ram(uint64_t iova_start, uint64_t iova_end, void *vaddr) "region_add [ram] 0x%"PRIx64" - 0x%"PRIx64" [%p]" vfio_known_safe_misalignment(const char *name, uint64_t iova, uint64_t offset_within_region, uintptr_t page_size) "Region \"%s\" iova=0x%"PRIx64" offset_within_region=0x%"PRIx64" qemu_real_host_page_size=0x%"PRIxPTR vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova, uint64_t size, uint64_t page_size) "Region \"%s\" 0x%"PRIx64" size=0x%"PRIx64" is not aligned to 0x%"PRIx64" and cannot be mapped for DMA" diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events index b7c04f0856..04e36ae047 100644 --- a/hw/virtio/trace-events +++ b/hw/virtio/trace-events @@ -116,6 +116,7 @@ virtio_iommu_get_config(uint64_t page_size_mask, uint64_t start, uint64_t end, u virtio_iommu_set_config(uint8_t bypass) "bypass=0x%x" virtio_iommu_attach(uint32_t domain_id, uint32_t ep_id) "domain=%d endpoint=%d" virtio_iommu_detach(uint32_t domain_id, uint32_t ep_id) "domain=%d endpoint=%d" +virtio_iommu_detach_endpoint_from_domain(uint32_t domain_id, uint32_t ep_id) "domain=%d endpoint=%d" virtio_iommu_map(uint32_t domain_id, uint64_t virt_start, uint64_t virt_end, uint64_t phys_start, uint32_t flags) "domain=%d virt_start=0x%"PRIx64" virt_end=0x%"PRIx64 " phys_start=0x%"PRIx64" flags=%d" virtio_iommu_unmap(uint32_t domain_id, uint64_t virt_start, uint64_t virt_end) "domain=%d virt_start=0x%"PRIx64" virt_end=0x%"PRIx64 virtio_iommu_unmap_done(uint32_t domain_id, uint64_t virt_start, uint64_t virt_end) "domain=%d virt_start=0x%"PRIx64" virt_end=0x%"PRIx64 diff --git a/hw/virtio/vhost-user-fs.c b/hw/virtio/vhost-user-fs.c index ae48cc1c96..32ee7f496d 100644 --- a/hw/virtio/vhost-user-fs.c +++ b/hw/virtio/vhost-user-fs.c @@ -33,6 +33,7 @@ static const int user_feature_bits[] = { VIRTIO_F_RING_PACKED, VIRTIO_F_IOMMU_PLATFORM, VIRTIO_F_RING_RESET, + VIRTIO_F_IN_ORDER, VIRTIO_F_NOTIFICATION_DATA, VHOST_INVALID_FEATURE_BIT }; diff --git a/hw/virtio/vhost-user-vsock.c b/hw/virtio/vhost-user-vsock.c index 802b44a07d..da3b0e0229 100644 --- a/hw/virtio/vhost-user-vsock.c +++ b/hw/virtio/vhost-user-vsock.c @@ -21,6 +21,7 @@ static const int user_feature_bits[] = { VIRTIO_RING_F_INDIRECT_DESC, VIRTIO_RING_F_EVENT_IDX, VIRTIO_F_NOTIFY_ON_EMPTY, + VIRTIO_F_IN_ORDER, VIRTIO_F_NOTIFICATION_DATA, VHOST_INVALID_FEATURE_BIT }; diff --git a/hw/virtio/virtio-crypto.c b/hw/virtio/virtio-crypto.c index bbe8aa4b99..5034768bff 100644 --- a/hw/virtio/virtio-crypto.c +++ b/hw/virtio/virtio-crypto.c @@ -205,6 +205,7 @@ virtio_crypto_create_asym_session(VirtIOCrypto *vcrypto, int queue_index; uint32_t algo, keytype, keylen; + sreq->info.op_code = opcode; algo = ldl_le_p(&sess_req->para.algo); keytype = ldl_le_p(&sess_req->para.keytype); keylen = ldl_le_p(&sess_req->para.keylen); @@ -224,7 +225,6 @@ virtio_crypto_create_asym_session(VirtIOCrypto *vcrypto, iov_discard_front(&iov, &out_num, keylen); } - sreq->info.op_code = opcode; asym_info = &sreq->info.u.asym_sess_info; asym_info->algo = algo; asym_info->keytype = keytype; diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c index 33ae61c4a6..59ef4fb217 100644 --- a/hw/virtio/virtio-iommu.c +++ b/hw/virtio/virtio-iommu.c @@ -308,6 +308,7 @@ static void virtio_iommu_detach_endpoint_from_domain(VirtIOIOMMUEndpoint *ep) if (!ep->domain) { return; } + trace_virtio_iommu_detach_endpoint_from_domain(domain->id, ep->id); g_tree_foreach(domain->mappings, virtio_iommu_notify_unmap_cb, ep->iommu_mr); QLIST_REMOVE(ep, next); @@ -467,26 +468,6 @@ static AddressSpace *virtio_iommu_find_add_as(PCIBus *bus, void *opaque, return &sdev->as; } -static void virtio_iommu_device_clear(VirtIOIOMMU *s, PCIBus *bus, int devfn) -{ - IOMMUPciBus *sbus = g_hash_table_lookup(s->as_by_busptr, bus); - IOMMUDevice *sdev; - - if (!sbus) { - return; - } - - sdev = sbus->pbdev[devfn]; - if (!sdev) { - return; - } - - g_list_free_full(sdev->resv_regions, g_free); - sdev->resv_regions = NULL; - g_free(sdev); - sbus->pbdev[devfn] = NULL; -} - static gboolean hiod_equal(gconstpointer v1, gconstpointer v2) { const struct hiod_key *key1 = v1; @@ -558,8 +539,6 @@ static int virtio_iommu_set_host_iova_ranges(VirtIOIOMMU *s, PCIBus *bus, { IOMMUPciBus *sbus = g_hash_table_lookup(s->as_by_busptr, bus); IOMMUDevice *sdev; - GList *current_ranges; - GList *l, *tmp, *new_ranges = NULL; int ret = -EINVAL; if (!sbus) { @@ -573,35 +552,10 @@ static int virtio_iommu_set_host_iova_ranges(VirtIOIOMMU *s, PCIBus *bus, return ret; } - current_ranges = sdev->host_resv_ranges; - - g_assert(!sdev->probe_done); - - /* check that each new resv region is included in an existing one */ if (sdev->host_resv_ranges) { - range_inverse_array(iova_ranges, - &new_ranges, - 0, UINT64_MAX); - - for (tmp = new_ranges; tmp; tmp = tmp->next) { - Range *newr = (Range *)tmp->data; - bool included = false; - - for (l = current_ranges; l; l = l->next) { - Range * r = (Range *)l->data; - - if (range_contains_range(r, newr)) { - included = true; - break; - } - } - if (!included) { - goto error; - } - } - /* all new reserved ranges are included in existing ones */ - ret = 0; - goto out; + error_setg(errp, "%s virtio-iommu does not support aliased BDF", + __func__); + return ret; } range_inverse_array(iova_ranges, @@ -610,14 +564,31 @@ static int virtio_iommu_set_host_iova_ranges(VirtIOIOMMU *s, PCIBus *bus, rebuild_resv_regions(sdev); return 0; -error: - error_setg(errp, "%s Conflicting host reserved ranges set!", - __func__); -out: - g_list_free_full(new_ranges, g_free); - return ret; } +static void virtio_iommu_unset_host_iova_ranges(VirtIOIOMMU *s, PCIBus *bus, + int devfn) +{ + IOMMUPciBus *sbus = g_hash_table_lookup(s->as_by_busptr, bus); + IOMMUDevice *sdev; + + if (!sbus) { + return; + } + + sdev = sbus->pbdev[devfn]; + if (!sdev) { + return; + } + + g_list_free_full(g_steal_pointer(&sdev->host_resv_ranges), g_free); + g_list_free_full(sdev->resv_regions, g_free); + sdev->host_resv_ranges = NULL; + sdev->resv_regions = NULL; + add_prop_resv_regions(sdev); +} + + static bool check_page_size_mask(VirtIOIOMMU *viommu, uint64_t new_mask, Error **errp) { @@ -726,9 +697,10 @@ virtio_iommu_unset_iommu_device(PCIBus *bus, void *opaque, int devfn) if (!hiod) { return; } + virtio_iommu_unset_host_iova_ranges(viommu, hiod->aliased_bus, + hiod->aliased_devfn); g_hash_table_remove(viommu->host_iommu_devices, &key); - virtio_iommu_device_clear(viommu, bus, devfn); } static const PCIIOMMUOps virtio_iommu_ops = { @@ -815,6 +787,7 @@ static int virtio_iommu_detach(VirtIOIOMMU *s, if (QLIST_EMPTY(&domain->endpoint_list)) { g_tree_remove(s->domains, GUINT_TO_POINTER(domain->id)); } + g_tree_remove(s->endpoints, GUINT_TO_POINTER(ep_id)); return VIRTIO_IOMMU_S_OK; } @@ -977,7 +950,6 @@ static int virtio_iommu_probe(VirtIOIOMMU *s, } buf += count; free -= count; - sdev->probe_done = true; return VIRTIO_IOMMU_S_OK; } diff --git a/hw/virtio/virtio-net-pci.c b/hw/virtio/virtio-net-pci.c index e03543a70a..dba4987d6e 100644 --- a/hw/virtio/virtio-net-pci.c +++ b/hw/virtio/virtio-net-pci.c @@ -75,6 +75,7 @@ static void virtio_net_pci_class_init(ObjectClass *klass, void *data) k->device_id = PCI_DEVICE_ID_VIRTIO_NET; k->revision = VIRTIO_PCI_ABI_VERSION; k->class_id = PCI_CLASS_NETWORK_ETHERNET; + k->sriov_vf_user_creatable = true; set_bit(DEVICE_CATEGORY_NETWORK, dc->categories); device_class_set_props(dc, virtio_net_properties); vpciklass->realize = virtio_net_pci_realize; diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index 9534730bba..0c8fcc5627 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -1955,6 +1955,7 @@ static void virtio_pci_device_plugged(DeviceState *d, Error **errp) uint8_t *config; uint32_t size; VirtIODevice *vdev = virtio_bus_get_device(bus); + int16_t res; /* * Virtio capabilities present without @@ -2100,6 +2101,14 @@ static void virtio_pci_device_plugged(DeviceState *d, Error **errp) pci_register_bar(&proxy->pci_dev, proxy->legacy_io_bar_idx, PCI_BASE_ADDRESS_SPACE_IO, &proxy->bar); } + + res = pcie_sriov_pf_init_from_user_created_vfs(&proxy->pci_dev, + proxy->last_pcie_cap_offset, + errp); + if (res > 0) { + proxy->last_pcie_cap_offset += res; + virtio_add_feature(&vdev->host_features, VIRTIO_F_SR_IOV); + } } static void virtio_pci_device_unplugged(DeviceState *d) @@ -2187,7 +2196,7 @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp) if (pcie_port && pci_is_express(pci_dev)) { int pos; - uint16_t last_pcie_cap_offset = PCI_CONFIG_SPACE_SIZE; + proxy->last_pcie_cap_offset = PCI_CONFIG_SPACE_SIZE; pos = pcie_endpoint_cap_init(pci_dev, 0); assert(pos > 0); @@ -2207,9 +2216,9 @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp) pci_set_word(pci_dev->config + pos + PCI_PM_PMC, 0x3); if (proxy->flags & VIRTIO_PCI_FLAG_AER) { - pcie_aer_init(pci_dev, PCI_ERR_VER, last_pcie_cap_offset, + pcie_aer_init(pci_dev, PCI_ERR_VER, proxy->last_pcie_cap_offset, PCI_ERR_SIZEOF, NULL); - last_pcie_cap_offset += PCI_ERR_SIZEOF; + proxy->last_pcie_cap_offset += PCI_ERR_SIZEOF; } if (proxy->flags & VIRTIO_PCI_FLAG_INIT_DEVERR) { @@ -2234,9 +2243,9 @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp) } if (proxy->flags & VIRTIO_PCI_FLAG_ATS) { - pcie_ats_init(pci_dev, last_pcie_cap_offset, + pcie_ats_init(pci_dev, proxy->last_pcie_cap_offset, proxy->flags & VIRTIO_PCI_FLAG_ATS_PAGE_ALIGNED); - last_pcie_cap_offset += PCI_EXT_CAP_ATS_SIZEOF; + proxy->last_pcie_cap_offset += PCI_EXT_CAP_ATS_SIZEOF; } if (proxy->flags & VIRTIO_PCI_FLAG_INIT_FLR) { @@ -2263,6 +2272,7 @@ static void virtio_pci_exit(PCIDevice *pci_dev) bool pcie_port = pci_bus_is_express(pci_get_bus(pci_dev)) && !pci_bus_is_root(pci_get_bus(pci_dev)); + pcie_sriov_pf_exit(&proxy->pci_dev); msix_uninit_exclusive_bar(pci_dev); if (proxy->flags & VIRTIO_PCI_FLAG_AER && pcie_port && pci_is_express(pci_dev)) { diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 583a224163..397c261c3c 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -872,6 +872,46 @@ static void virtqueue_packed_fill(VirtQueue *vq, const VirtQueueElement *elem, vq->used_elems[idx].ndescs = elem->ndescs; } +static void virtqueue_ordered_fill(VirtQueue *vq, const VirtQueueElement *elem, + unsigned int len) +{ + unsigned int i, steps, max_steps; + + i = vq->used_idx % vq->vring.num; + steps = 0; + /* + * We shouldn't need to increase 'i' by more than the distance + * between used_idx and last_avail_idx. + */ + max_steps = (vq->last_avail_idx - vq->used_idx) % vq->vring.num; + + /* Search for element in vq->used_elems */ + while (steps <= max_steps) { + /* Found element, set length and mark as filled */ + if (vq->used_elems[i].index == elem->index) { + vq->used_elems[i].len = len; + vq->used_elems[i].in_order_filled = true; + break; + } + + i += vq->used_elems[i].ndescs; + steps += vq->used_elems[i].ndescs; + + if (i >= vq->vring.num) { + i -= vq->vring.num; + } + } + + /* + * We should be able to find a matching VirtQueueElement in + * used_elems. If we don't, this is an error. + */ + if (steps >= max_steps) { + qemu_log_mask(LOG_GUEST_ERROR, "%s: %s cannot fill buffer id %u\n", + __func__, vq->vdev->name, elem->index); + } +} + static void virtqueue_packed_fill_desc(VirtQueue *vq, const VirtQueueElement *elem, unsigned int idx, @@ -922,7 +962,9 @@ void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem, return; } - if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED)) { + if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_IN_ORDER)) { + virtqueue_ordered_fill(vq, elem, len); + } else if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED)) { virtqueue_packed_fill(vq, elem, len, idx); } else { virtqueue_split_fill(vq, elem, len, idx); @@ -981,6 +1023,73 @@ static void virtqueue_packed_flush(VirtQueue *vq, unsigned int count) } } +static void virtqueue_ordered_flush(VirtQueue *vq) +{ + unsigned int i = vq->used_idx % vq->vring.num; + unsigned int ndescs = 0; + uint16_t old = vq->used_idx; + uint16_t new; + bool packed; + VRingUsedElem uelem; + + packed = virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED); + + if (packed) { + if (unlikely(!vq->vring.desc)) { + return; + } + } else if (unlikely(!vq->vring.used)) { + return; + } + + /* First expected in-order element isn't ready, nothing to do */ + if (!vq->used_elems[i].in_order_filled) { + return; + } + + /* Search for filled elements in-order */ + while (vq->used_elems[i].in_order_filled) { + /* + * First entry for packed VQs is written last so the guest + * doesn't see invalid descriptors. + */ + if (packed && i != vq->used_idx) { + virtqueue_packed_fill_desc(vq, &vq->used_elems[i], ndescs, false); + } else if (!packed) { + uelem.id = vq->used_elems[i].index; + uelem.len = vq->used_elems[i].len; + vring_used_write(vq, &uelem, i); + } + + vq->used_elems[i].in_order_filled = false; + ndescs += vq->used_elems[i].ndescs; + i += vq->used_elems[i].ndescs; + if (i >= vq->vring.num) { + i -= vq->vring.num; + } + } + + if (packed) { + virtqueue_packed_fill_desc(vq, &vq->used_elems[vq->used_idx], 0, true); + vq->used_idx += ndescs; + if (vq->used_idx >= vq->vring.num) { + vq->used_idx -= vq->vring.num; + vq->used_wrap_counter ^= 1; + vq->signalled_used_valid = false; + } + } else { + /* Make sure buffer is written before we update index. */ + smp_wmb(); + new = old + ndescs; + vring_used_idx_set(vq, new); + if (unlikely((int16_t)(new - vq->signalled_used) < + (uint16_t)(new - old))) { + vq->signalled_used_valid = false; + } + } + vq->inuse -= ndescs; +} + void virtqueue_flush(VirtQueue *vq, unsigned int count) { if (virtio_device_disabled(vq->vdev)) { @@ -988,7 +1097,9 @@ void virtqueue_flush(VirtQueue *vq, unsigned int count) return; } - if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED)) { + if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_IN_ORDER)) { + virtqueue_ordered_flush(vq); + } else if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED)) { virtqueue_packed_flush(vq, count); } else { virtqueue_split_flush(vq, count); @@ -1505,7 +1616,7 @@ static void *virtqueue_alloc_element(size_t sz, unsigned out_num, unsigned in_nu static void *virtqueue_split_pop(VirtQueue *vq, size_t sz) { - unsigned int i, head, max; + unsigned int i, head, max, idx; VRingMemoryRegionCaches *caches; MemoryRegionCache indirect_desc_cache; MemoryRegionCache *desc_cache; @@ -1629,6 +1740,13 @@ static void *virtqueue_split_pop(VirtQueue *vq, size_t sz) elem->in_sg[i] = iov[out_num + i]; } + if (virtio_vdev_has_feature(vdev, VIRTIO_F_IN_ORDER)) { + idx = (vq->last_avail_idx - 1) % vq->vring.num; + vq->used_elems[idx].index = elem->index; + vq->used_elems[idx].len = elem->len; + vq->used_elems[idx].ndescs = elem->ndescs; + } + vq->inuse++; trace_virtqueue_pop(vq, elem, elem->in_num, elem->out_num); @@ -1762,6 +1880,13 @@ static void *virtqueue_packed_pop(VirtQueue *vq, size_t sz) elem->index = id; elem->ndescs = (desc_cache == &indirect_desc_cache) ? 1 : elem_entries; + + if (virtio_vdev_has_feature(vdev, VIRTIO_F_IN_ORDER)) { + vq->used_elems[vq->last_avail_idx].index = elem->index; + vq->used_elems[vq->last_avail_idx].len = elem->len; + vq->used_elems[vq->last_avail_idx].ndescs = elem->ndescs; + } + vq->last_avail_idx += elem->ndescs; vq->inuse += elem->ndescs; diff --git a/include/chardev/char-fe.h b/include/chardev/char-fe.h index ecef182835..3310449eaf 100644 --- a/include/chardev/char-fe.h +++ b/include/chardev/char-fe.h @@ -228,6 +228,7 @@ guint qemu_chr_fe_add_watch(CharBackend *be, GIOCondition cond, * is thread-safe. * * Returns: the number of bytes consumed (0 if no associated Chardev) + * or -1 on error. */ int qemu_chr_fe_write(CharBackend *be, const uint8_t *buf, int len); @@ -242,6 +243,7 @@ int qemu_chr_fe_write(CharBackend *be, const uint8_t *buf, int len); * attempted to be written. This function is thread-safe. * * Returns: the number of bytes consumed (0 if no associated Chardev) + * or -1 on error. */ int qemu_chr_fe_write_all(CharBackend *be, const uint8_t *buf, int len); @@ -253,6 +255,7 @@ int qemu_chr_fe_write_all(CharBackend *be, const uint8_t *buf, int len); * Read data to a buffer from the back end. * * Returns: the number of bytes read (0 if no associated Chardev) + * or -1 on error. */ int qemu_chr_fe_read_all(CharBackend *be, uint8_t *buf, int len); diff --git a/include/crypto/tlssession.h b/include/crypto/tlssession.h index 571049bd0e..f694a5c3c5 100644 --- a/include/crypto/tlssession.h +++ b/include/crypto/tlssession.h @@ -107,6 +107,7 @@ typedef struct QCryptoTLSSession QCryptoTLSSession; +#define QCRYPTO_TLS_SESSION_ERR_BLOCK -2 /** * qcrypto_tls_session_new: @@ -177,12 +178,18 @@ G_DEFINE_AUTOPTR_CLEANUP_FUNC(QCryptoTLSSession, qcrypto_tls_session_free) int qcrypto_tls_session_check_credentials(QCryptoTLSSession *sess, Error **errp); +/* + * These must return QCRYPTO_TLS_SESSION_ERR_BLOCK if the I/O + * would block, but on other errors, must fill 'errp' + */ typedef ssize_t (*QCryptoTLSSessionWriteFunc)(const char *buf, size_t len, - void *opaque); + void *opaque, + Error **errp); typedef ssize_t (*QCryptoTLSSessionReadFunc)(char *buf, size_t len, - void *opaque); + void *opaque, + Error **errp); /** * qcrypto_tls_session_set_callbacks: @@ -212,6 +219,7 @@ void qcrypto_tls_session_set_callbacks(QCryptoTLSSession *sess, * @sess: the TLS session object * @buf: the plain text to send * @len: the length of @buf + * @errp: pointer to hold returned error object * * Encrypt @len bytes of the data in @buf and send * it to the remote peer using the callback previously @@ -221,32 +229,45 @@ void qcrypto_tls_session_set_callbacks(QCryptoTLSSession *sess, * qcrypto_tls_session_get_handshake_status() returns * QCRYPTO_TLS_HANDSHAKE_COMPLETE * - * Returns: the number of bytes sent, or -1 on error + * Returns: the number of bytes sent, + * or QCRYPTO_TLS_SESSION_ERR_BLOCK if the write would block, + * or -1 on error. */ ssize_t qcrypto_tls_session_write(QCryptoTLSSession *sess, const char *buf, - size_t len); + size_t len, + Error **errp); /** * qcrypto_tls_session_read: * @sess: the TLS session object * @buf: to fill with plain text received * @len: the length of @buf + * @gracefulTermination: treat premature termination as graceful EOF + * @errp: pointer to hold returned error object * * Receive up to @len bytes of data from the remote peer * using the callback previously registered with * qcrypto_tls_session_set_callbacks(), decrypt it and * store it in @buf. * + * If @gracefulTermination is true, then a premature termination + * of the TLS session will be treated as indicating EOF, as + * opposed to an error. + * * It is an error to call this before * qcrypto_tls_session_get_handshake_status() returns * QCRYPTO_TLS_HANDSHAKE_COMPLETE * - * Returns: the number of bytes received, or -1 on error + * Returns: the number of bytes received, + * or QCRYPTO_TLS_SESSION_ERR_BLOCK if the receive would block, + * or -1 on error. */ ssize_t qcrypto_tls_session_read(QCryptoTLSSession *sess, char *buf, - size_t len); + size_t len, + bool gracefulTermination, + Error **errp); /** * qcrypto_tls_session_check_pending: diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h index 815342d043..240ee04369 100644 --- a/include/exec/cpu-common.h +++ b/include/exec/cpu-common.h @@ -129,6 +129,14 @@ size_t qemu_ram_pagesize_largest(void); */ void cpu_address_space_init(CPUState *cpu, int asidx, const char *prefix, MemoryRegion *mr); +/** + * cpu_address_space_destroy: + * @cpu: CPU for which address space needs to be destroyed + * @asidx: integer index of this address space + * + * Note that with KVM only one address space is supported. + */ +void cpu_address_space_destroy(CPUState *cpu, int asidx); void cpu_physical_memory_rw(hwaddr addr, void *buf, hwaddr len, bool is_write); diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h index b6b46ad13c..72240ef426 100644 --- a/include/exec/exec-all.h +++ b/include/exec/exec-all.h @@ -301,6 +301,9 @@ static inline void tlb_flush_range_by_mmuidx_all_cpus_synced(CPUState *cpu, { } #endif + +#if defined(CONFIG_TCG) + /** * probe_access: * @env: CPUArchState @@ -357,6 +360,7 @@ int probe_access_flags(CPUArchState *env, vaddr addr, int size, bool nonfault, void **phost, uintptr_t retaddr); #ifndef CONFIG_USER_ONLY + /** * probe_access_full: * Like probe_access_flags, except also return into @pfull. @@ -392,7 +396,8 @@ int probe_access_full_mmu(CPUArchState *env, vaddr addr, int size, MMUAccessType access_type, int mmu_idx, void **phost, CPUTLBEntryFull **pfull); -#endif +#endif /* !CONFIG_USER_ONLY */ +#endif /* CONFIG_TCG */ static inline tb_page_addr_t tb_page_addr0(const TranslationBlock *tb) { diff --git a/include/exec/gdbstub.h b/include/exec/gdbstub.h index 1bd2c4ec2a..d73f424f56 100644 --- a/include/exec/gdbstub.h +++ b/include/exec/gdbstub.h @@ -41,6 +41,12 @@ void gdb_register_coprocessor(CPUState *cpu, const GDBFeature *feature, int g_pos); /** + * gdb_unregister_coprocessor_all() - unregisters supplemental set of registers + * @cpu - the CPU associated with registers + */ +void gdb_unregister_coprocessor_all(CPUState *cpu); + +/** * gdbserver_start: start the gdb server * @port_or_device: connection spec for gdb * diff --git a/include/hw/acpi/cpu.h b/include/hw/acpi/cpu.h index e6e1a9ef59..32654dc274 100644 --- a/include/hw/acpi/cpu.h +++ b/include/hw/acpi/cpu.h @@ -19,6 +19,8 @@ #include "hw/boards.h" #include "hw/hotplug.h" +#define ACPI_CPU_HOTPLUG_REG_LEN 12 + typedef struct AcpiCpuStatus { CPUState *cpu; uint64_t arch_id; @@ -61,9 +63,10 @@ typedef void (*build_madt_cpu_fn)(int uid, const CPUArchIdList *apic_ids, GArray *entry, bool force_enabled); void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts, - build_madt_cpu_fn build_madt_cpu, hwaddr io_base, + build_madt_cpu_fn build_madt_cpu, hwaddr base_addr, const char *res_root, - const char *event_handler_method); + const char *event_handler_method, + AmlRegionSpace rs); void acpi_cpu_ospm_status(CPUHotplugState *cpu_st, ACPIOSTInfoList ***list); diff --git a/include/hw/acpi/generic_event_device.h b/include/hw/acpi/generic_event_device.h index ba84ce0214..40af3550b5 100644 --- a/include/hw/acpi/generic_event_device.h +++ b/include/hw/acpi/generic_event_device.h @@ -62,6 +62,7 @@ #include "hw/sysbus.h" #include "hw/acpi/memory_hotplug.h" #include "hw/acpi/ghes.h" +#include "hw/acpi/cpu.h" #include "qom/object.h" #define ACPI_POWER_BUTTON_DEVICE "PWRB" @@ -86,6 +87,7 @@ OBJECT_DECLARE_SIMPLE_TYPE(AcpiGedState, ACPI_GED) #define GED_DEVICE "GED" #define AML_GED_EVT_REG "EREG" #define AML_GED_EVT_SEL "ESEL" +#define AML_GED_EVT_CPU_SCAN_METHOD "\\_SB.GED.CSCN" /* * Platforms need to specify the GED event bitmap @@ -95,6 +97,7 @@ OBJECT_DECLARE_SIMPLE_TYPE(AcpiGedState, ACPI_GED) #define ACPI_GED_MEM_HOTPLUG_EVT 0x1 #define ACPI_GED_PWR_DOWN_EVT 0x2 #define ACPI_GED_NVDIMM_HOTPLUG_EVT 0x4 +#define ACPI_GED_CPU_HOTPLUG_EVT 0x8 typedef struct GEDState { MemoryRegion evt; @@ -106,6 +109,8 @@ struct AcpiGedState { SysBusDevice parent_obj; MemHotplugState memhp_state; MemoryRegion container_memhp; + CPUHotplugState cpuhp_state; + MemoryRegion container_cpuhp; GEDState ged_state; uint32_t ged_event_bitmap; qemu_irq irq; diff --git a/include/hw/boards.h b/include/hw/boards.h index ef6f18f2c1..48ff6d8b93 100644 --- a/include/hw/boards.h +++ b/include/hw/boards.h @@ -237,6 +237,9 @@ typedef struct { * purposes only. * Applies only to default memory backend, i.e., explicit memory backend * wasn't used. + * @smbios_memory_device_size: + * Default size of memory device, + * SMBIOS 3.1.0 "7.18 Memory Device (Type 17)" */ struct MachineClass { /*< private >*/ @@ -304,6 +307,7 @@ struct MachineClass { const CPUArchIdList *(*possible_cpu_arch_ids)(MachineState *machine); int64_t (*get_default_cpu_node_id)(const MachineState *ms, int idx); ram_addr_t (*fixup_ram_size)(ram_addr_t size); + uint64_t smbios_memory_device_size; }; /** diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h index d946161717..1c9c775df6 100644 --- a/include/hw/core/cpu.h +++ b/include/hw/core/cpu.h @@ -496,6 +496,7 @@ struct CPUState { QSIMPLEQ_HEAD(, qemu_work_item) work_list; struct CPUAddressSpace *cpu_ases; + int cpu_ases_count; int num_ases; AddressSpace *as; MemoryRegion *memory; diff --git a/include/hw/cxl/cxl_device.h b/include/hw/cxl/cxl_device.h index 0a4fcb2800..fdd0f4e62b 100644 --- a/include/hw/cxl/cxl_device.h +++ b/include/hw/cxl/cxl_device.h @@ -181,6 +181,21 @@ typedef struct CXLCCI { uint64_t runtime; QEMUTimer *timer; } bg; + + /* firmware update */ + struct { + uint8_t active_slot; + uint8_t staged_slot; + bool slot[4]; + uint8_t curr_action; + uint8_t curr_slot; + /* handle partial transfers */ + bool transferring; + size_t prev_offset; + size_t prev_len; + time_t last_partxfer; + } fw; + size_t payload_max; /* Pointer to device hosting the CCI */ DeviceState *d; @@ -397,9 +412,14 @@ static inline void __toggle_media(CXLDeviceState *cxl_dstate, int val) #define cxl_dev_enable_media(cxlds) \ do { __toggle_media((cxlds), 0x1); } while (0) -static inline bool sanitize_running(CXLCCI *cci) +static inline bool cxl_dev_media_disabled(CXLDeviceState *cxl_dstate) +{ + uint64_t dev_status_reg = cxl_dstate->mbox_reg_state64[R_CXL_MEM_DEV_STS]; + return FIELD_EX64(dev_status_reg, CXL_MEM_DEV_STS, MEDIA_STATUS) == 0x3; +} +static inline bool scan_media_running(CXLCCI *cci) { - return !!cci->bg.runtime && cci->bg.opcode == 0x4400; + return !!cci->bg.runtime && cci->bg.opcode == 0x4304; } typedef struct CXLError { @@ -422,6 +442,47 @@ typedef struct CXLPoison { typedef QLIST_HEAD(, CXLPoison) CXLPoisonList; #define CXL_POISON_LIST_LIMIT 256 +/* CXL memory device patrol scrub control attributes */ +typedef struct CXLMemPatrolScrubReadAttrs { + uint8_t scrub_cycle_cap; + uint16_t scrub_cycle; + uint8_t scrub_flags; +} QEMU_PACKED CXLMemPatrolScrubReadAttrs; + +typedef struct CXLMemPatrolScrubWriteAttrs { + uint8_t scrub_cycle_hr; + uint8_t scrub_flags; +} QEMU_PACKED CXLMemPatrolScrubWriteAttrs; + +#define CXL_MEMDEV_PS_GET_FEATURE_VERSION 0x01 +#define CXL_MEMDEV_PS_SET_FEATURE_VERSION 0x01 +#define CXL_MEMDEV_PS_SCRUB_CYCLE_CHANGE_CAP_DEFAULT BIT(0) +#define CXL_MEMDEV_PS_SCRUB_REALTIME_REPORT_CAP_DEFAULT BIT(1) +#define CXL_MEMDEV_PS_CUR_SCRUB_CYCLE_DEFAULT 12 +#define CXL_MEMDEV_PS_MIN_SCRUB_CYCLE_DEFAULT 1 +#define CXL_MEMDEV_PS_ENABLE_DEFAULT 0 + +/* CXL memory device DDR5 ECS control attributes */ +typedef struct CXLMemECSReadAttrs { + uint8_t ecs_log_cap; + uint8_t ecs_cap; + uint16_t ecs_config; + uint8_t ecs_flags; +} QEMU_PACKED CXLMemECSReadAttrs; + +typedef struct CXLMemECSWriteAttrs { + uint8_t ecs_log_cap; + uint16_t ecs_config; +} QEMU_PACKED CXLMemECSWriteAttrs; + +#define CXL_ECS_GET_FEATURE_VERSION 0x01 +#define CXL_ECS_SET_FEATURE_VERSION 0x01 +#define CXL_ECS_LOG_ENTRY_TYPE_DEFAULT 0x01 +#define CXL_ECS_REALTIME_REPORT_CAP_DEFAULT 1 +#define CXL_ECS_THRESHOLD_COUNT_DEFAULT 3 /* 3: 256, 4: 1024, 5: 4096 */ +#define CXL_ECS_MODE_DEFAULT 0 +#define CXL_ECS_NUM_MEDIA_FRUS 3 /* Default */ + #define DCD_MAX_NUM_REGION 8 typedef struct CXLDCExtentRaw { @@ -459,6 +520,14 @@ typedef struct CXLDCRegion { unsigned long *blk_bitmap; } CXLDCRegion; +typedef struct CXLSetFeatureInfo { + QemuUUID uuid; + uint8_t data_transfer_flag; + bool data_saved_across_reset; + uint16_t data_offset; + size_t data_size; +} CXLSetFeatureInfo; + struct CXLType3Dev { /* Private */ PCIDevice parent_obj; @@ -491,6 +560,19 @@ struct CXLType3Dev { unsigned int poison_list_cnt; bool poison_list_overflowed; uint64_t poison_list_overflow_ts; + /* Poison Injection - backup */ + CXLPoisonList poison_list_bkp; + CXLPoisonList scan_media_results; + bool scan_media_hasrun; + + CXLSetFeatureInfo set_feat_info; + + /* Patrol scrub control attributes */ + CXLMemPatrolScrubReadAttrs patrol_scrub_attrs; + CXLMemPatrolScrubWriteAttrs patrol_scrub_wr_attrs; + /* ECS control attributes */ + CXLMemECSReadAttrs ecs_attrs[CXL_ECS_NUM_MEDIA_FRUS]; + CXLMemECSWriteAttrs ecs_wr_attrs[CXL_ECS_NUM_MEDIA_FRUS]; struct dynamic_capacity { HostMemoryBackend *host_dc; @@ -554,10 +636,12 @@ CXLRetCode cxl_event_get_records(CXLDeviceState *cxlds, CXLGetEventPayload *pl, size_t *len); CXLRetCode cxl_event_clear_records(CXLDeviceState *cxlds, CXLClearEventPayload *pl); +void cxl_discard_all_event_records(CXLDeviceState *cxlds); void cxl_event_irq_assert(CXLType3Dev *ct3d); void cxl_set_poison_list_overflowed(CXLType3Dev *ct3d); +void cxl_clear_poison_list_overflowed(CXLType3Dev *ct3d); CXLDCRegion *cxl_find_dc_region(CXLType3Dev *ct3d, uint64_t dpa, uint64_t len); diff --git a/include/hw/cxl/cxl_mailbox.h b/include/hw/cxl/cxl_mailbox.h new file mode 100644 index 0000000000..beb048052e --- /dev/null +++ b/include/hw/cxl/cxl_mailbox.h @@ -0,0 +1,18 @@ +/* + * QEMU CXL Mailbox + * + * This work is licensed under the terms of the GNU GPL, version 2. See the + * COPYING file in the top-level directory. + */ + +#ifndef CXL_MAILBOX_H +#define CXL_MAILBOX_H + +#define CXL_MBOX_IMMEDIATE_CONFIG_CHANGE (1 << 1) +#define CXL_MBOX_IMMEDIATE_DATA_CHANGE (1 << 2) +#define CXL_MBOX_IMMEDIATE_POLICY_CHANGE (1 << 3) +#define CXL_MBOX_IMMEDIATE_LOG_CHANGE (1 << 4) +#define CXL_MBOX_SECURITY_STATE_CHANGE (1 << 5) +#define CXL_MBOX_BACKGROUND_OPERATION (1 << 6) + +#endif diff --git a/include/hw/pci/pci_device.h b/include/hw/pci/pci_device.h index ca15132508..e7e41cb939 100644 --- a/include/hw/pci/pci_device.h +++ b/include/hw/pci/pci_device.h @@ -3,6 +3,7 @@ #include "hw/pci/pci.h" #include "hw/pci/pcie.h" +#include "hw/pci/pcie_doe.h" #define TYPE_PCI_DEVICE "pci-device" typedef struct PCIDeviceClass PCIDeviceClass; @@ -37,6 +38,8 @@ struct PCIDeviceClass { uint16_t subsystem_id; /* only for header type = 0 */ const char *romfile; /* rom bar */ + + bool sriov_vf_user_creatable; }; enum PCIReqIDType { @@ -157,9 +160,17 @@ struct PCIDevice { MSIVectorReleaseNotifier msix_vector_release_notifier; MSIVectorPollNotifier msix_vector_poll_notifier; + /* SPDM */ + uint16_t spdm_port; + + /* DOE */ + DOECap doe_spdm; + /* ID of standby device in net_failover pair */ char *failover_pair_id; uint32_t acpi_index; + + char *sriov_pf; }; static inline int pci_intx(PCIDevice *pci_dev) @@ -192,7 +203,7 @@ static inline int pci_is_express_downstream_port(const PCIDevice *d) static inline int pci_is_vf(const PCIDevice *d) { - return d->exp.sriov_vf.pf != NULL; + return d->sriov_pf || d->exp.sriov_vf.pf != NULL; } static inline uint32_t pci_config_size(const PCIDevice *d) diff --git a/include/hw/pci/pcie_doe.h b/include/hw/pci/pcie_doe.h index 87dc17dcef..9e1275db8a 100644 --- a/include/hw/pci/pcie_doe.h +++ b/include/hw/pci/pcie_doe.h @@ -46,6 +46,8 @@ REG32(PCI_DOE_CAP_STATUS, 0) /* PCI-SIG defined Data Object Types - r6.0 Table 6-32 */ #define PCI_SIG_DOE_DISCOVERY 0x00 +#define PCI_SIG_DOE_CMA 0x01 +#define PCI_SIG_DOE_SECURED_CMA 0x02 #define PCI_DOE_DW_SIZE_MAX (1 << 18) #define PCI_DOE_PROTOCOL_NUM_MAX 256 @@ -106,6 +108,9 @@ struct DOECap { /* Protocols and its callback response */ DOEProtocol *protocols; uint16_t protocol_num; + + /* Used for spdm-socket */ + int spdm_socket; }; void pcie_doe_init(PCIDevice *pdev, DOECap *doe_cap, uint16_t offset, diff --git a/include/hw/pci/pcie_sriov.h b/include/hw/pci/pcie_sriov.h index c5d2d318d3..f75b8f22ee 100644 --- a/include/hw/pci/pcie_sriov.h +++ b/include/hw/pci/pcie_sriov.h @@ -18,6 +18,7 @@ typedef struct PCIESriovPF { uint8_t vf_bar_type[PCI_NUM_REGIONS]; /* Store type for each VF bar */ PCIDevice **vf; /* Pointer to an array of num_vfs VF devices */ + bool vf_user_created; /* If VFs are created by user */ } PCIESriovPF; typedef struct PCIESriovVF { @@ -40,6 +41,23 @@ void pcie_sriov_pf_init_vf_bar(PCIDevice *dev, int region_num, void pcie_sriov_vf_register_bar(PCIDevice *dev, int region_num, MemoryRegion *memory); +/** + * pcie_sriov_pf_init_from_user_created_vfs() - Initialize PF with user-created + * VFs. + * @dev: A PCIe device being realized. + * @offset: The offset of the SR-IOV capability. + * @errp: pointer to Error*, to store an error if it happens. + * + * Return: The size of added capability. 0 if the user did not create VFs. + * -1 if failed. + */ +int16_t pcie_sriov_pf_init_from_user_created_vfs(PCIDevice *dev, + uint16_t offset, + Error **errp); + +bool pcie_sriov_register_device(PCIDevice *dev, Error **errp); +void pcie_sriov_unregister_device(PCIDevice *dev); + /* * Default (minimal) page size support values * as required by the SR/IOV standard: diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index e8ddf92bb1..fed499b199 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -95,10 +95,18 @@ typedef struct VFIOHostDMAWindow { typedef struct IOMMUFDBackend IOMMUFDBackend; +typedef struct VFIOIOASHwpt { + uint32_t hwpt_id; + uint32_t hwpt_flags; + QLIST_HEAD(, VFIODevice) device_list; + QLIST_ENTRY(VFIOIOASHwpt) next; +} VFIOIOASHwpt; + typedef struct VFIOIOMMUFDContainer { VFIOContainerBase bcontainer; IOMMUFDBackend *be; uint32_t ioas_id; + QLIST_HEAD(, VFIOIOASHwpt) hwpt_list; } VFIOIOMMUFDContainer; OBJECT_DECLARE_SIMPLE_TYPE(VFIOIOMMUFDContainer, VFIO_IOMMU_IOMMUFD); @@ -116,6 +124,7 @@ typedef struct VFIODevice { DeviceState *dev; int fd; int type; + bool mdev; bool reset_works; bool needs_reset; bool no_mmap; @@ -129,11 +138,15 @@ typedef struct VFIODevice { VFIOMigration *migration; Error *migration_blocker; OnOffAuto pre_copy_dirty_page_tracking; + OnOffAuto device_dirty_page_tracking; bool dirty_pages_supported; bool dirty_tracking; + bool iommu_dirty_tracking; HostIOMMUDevice *hiod; int devid; IOMMUFDBackend *iommufd; + VFIOIOASHwpt *hwpt; + QLIST_ENTRY(VFIODevice) hwpt_next; } VFIODevice; struct VFIODeviceOps { @@ -231,6 +244,8 @@ void vfio_region_exit(VFIORegion *region); void vfio_region_finalize(VFIORegion *region); void vfio_reset_handler(void *opaque); struct vfio_device_info *vfio_get_device_info(int fd); +bool vfio_device_is_mdev(VFIODevice *vbasedev); +bool vfio_device_hiod_realize(VFIODevice *vbasedev, Error **errp); bool vfio_attach_device(char *name, VFIODevice *vbasedev, AddressSpace *as, Error **errp); void vfio_detach_device(VFIODevice *vbasedev); diff --git a/include/hw/virtio/virtio-iommu.h b/include/hw/virtio/virtio-iommu.h index bdb3da72d0..7db4210b16 100644 --- a/include/hw/virtio/virtio-iommu.h +++ b/include/hw/virtio/virtio-iommu.h @@ -43,7 +43,6 @@ typedef struct IOMMUDevice { MemoryRegion bypass_mr; /* The alias of shared memory MR */ GList *resv_regions; GList *host_resv_ranges; - bool probe_done; } IOMMUDevice; typedef struct IOMMUPciBus { diff --git a/include/hw/virtio/virtio-pci.h b/include/hw/virtio/virtio-pci.h index 9e67ba38c7..34539f2f67 100644 --- a/include/hw/virtio/virtio-pci.h +++ b/include/hw/virtio/virtio-pci.h @@ -152,6 +152,7 @@ struct VirtIOPCIProxy { uint32_t modern_io_bar_idx; uint32_t modern_mem_bar_idx; int config_cap; + uint16_t last_pcie_cap_offset; uint32_t flags; bool disable_modern; bool ignore_backend_features; diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h index 7512afbc84..d2a1938757 100644 --- a/include/hw/virtio/virtio.h +++ b/include/hw/virtio/virtio.h @@ -69,6 +69,8 @@ typedef struct VirtQueueElement unsigned int ndescs; unsigned int out_num; unsigned int in_num; + /* Element has been processed (VIRTIO_F_IN_ORDER) */ + bool in_order_filled; hwaddr *in_addr; hwaddr *out_addr; struct iovec *in_sg; @@ -371,7 +373,9 @@ typedef struct VirtIORNGConf VirtIORNGConf; DEFINE_PROP_BIT64("packed", _state, _field, \ VIRTIO_F_RING_PACKED, false), \ DEFINE_PROP_BIT64("queue_reset", _state, _field, \ - VIRTIO_F_RING_RESET, true) + VIRTIO_F_RING_RESET, true), \ + DEFINE_PROP_BIT64("in_order", _state, _field, \ + VIRTIO_F_IN_ORDER, false) hwaddr virtio_queue_get_desc_addr(VirtIODevice *vdev, int n); bool virtio_queue_enabled_legacy(VirtIODevice *vdev, int n); diff --git a/include/io/channel.h b/include/io/channel.h index 7986c49c71..bdf0bca92a 100644 --- a/include/io/channel.h +++ b/include/io/channel.h @@ -160,6 +160,9 @@ struct QIOChannelClass { void *opaque); int (*io_flush)(QIOChannel *ioc, Error **errp); + int (*io_peerpid)(QIOChannel *ioc, + unsigned int *pid, + Error **errp); }; /* General I/O handling functions */ @@ -981,4 +984,22 @@ int coroutine_mixed_fn qio_channel_writev_full_all(QIOChannel *ioc, int qio_channel_flush(QIOChannel *ioc, Error **errp); +/** + * qio_channel_get_peercred: + * @ioc: the channel object + * @pid: pointer to pid + * @errp: pointer to a NULL-initialized error object + * + * Returns the pid of the peer process connected to this socket. + * + * The use of this function is possible only for connected + * AF_UNIX stream sockets and for AF_UNIX stream and datagram + * socket pairs on Linux. + * Return -1 on error with pid -1 for the non-Linux OS. + * + */ +int qio_channel_get_peerpid(QIOChannel *ioc, + unsigned int *pid, + Error **errp); + #endif /* QIO_CHANNEL_H */ diff --git a/include/qemu/fifo8.h b/include/qemu/fifo8.h index c6295c6ff0..d1d06754d8 100644 --- a/include/qemu/fifo8.h +++ b/include/qemu/fifo8.h @@ -15,10 +15,9 @@ typedef struct { * @fifo: struct Fifo8 to initialise with new FIFO * @capacity: capacity of the newly created FIFO * - * Create a FIFO of the specified size. Clients should call fifo8_destroy() + * Create a FIFO of the specified capacity. Clients should call fifo8_destroy() * when finished using the fifo. The FIFO is initially empty. */ - void fifo8_create(Fifo8 *fifo, uint32_t capacity); /** @@ -26,9 +25,8 @@ void fifo8_create(Fifo8 *fifo, uint32_t capacity); * @fifo: FIFO to cleanup * * Cleanup a FIFO created with fifo8_create(). Frees memory created for FIFO - *storage. The FIFO is no longer usable after this has been called. + * storage. The FIFO is no longer usable after this has been called. */ - void fifo8_destroy(Fifo8 *fifo); /** @@ -39,7 +37,6 @@ void fifo8_destroy(Fifo8 *fifo); * Push a data byte to the FIFO. Behaviour is undefined if the FIFO is full. * Clients are responsible for checking for fullness using fifo8_is_full(). */ - void fifo8_push(Fifo8 *fifo, uint8_t data); /** @@ -52,7 +49,6 @@ void fifo8_push(Fifo8 *fifo, uint8_t data); * Clients are responsible for checking the space left in the FIFO using * fifo8_num_free(). */ - void fifo8_push_all(Fifo8 *fifo, const uint8_t *data, uint32_t num); /** @@ -64,25 +60,40 @@ void fifo8_push_all(Fifo8 *fifo, const uint8_t *data, uint32_t num); * * Returns: The popped data byte. */ - uint8_t fifo8_pop(Fifo8 *fifo); /** * fifo8_pop_buf: * @fifo: FIFO to pop from + * @dest: the buffer to write the data into (can be NULL) + * @destlen: size of @dest and maximum number of bytes to pop + * + * Pop a number of elements from the FIFO up to a maximum of @destlen. + * The popped data is copied into the @dest buffer. + * Care is taken when the data wraps around in the ring buffer. + * + * Returns: number of bytes popped. + */ +uint32_t fifo8_pop_buf(Fifo8 *fifo, uint8_t *dest, uint32_t destlen); + +/** + * fifo8_pop_bufptr: + * @fifo: FIFO to pop from * @max: maximum number of bytes to pop * @numptr: pointer filled with number of bytes returned (can be NULL) * - * Pop a number of elements from the FIFO up to a maximum of max. The buffer + * New code should prefer to use fifo8_pop_buf() instead of fifo8_pop_bufptr(). + * + * Pop a number of elements from the FIFO up to a maximum of @max. The buffer * containing the popped data is returned. This buffer points directly into - * the FIFO backing store and data is invalidated once any of the fifo8_* APIs - * are called on the FIFO. + * the internal FIFO backing store and data (without checking for overflow!) + * and is invalidated once any of the fifo8_* APIs are called on the FIFO. * * The function may return fewer bytes than requested when the data wraps * around in the ring buffer; in this case only a contiguous part of the data * is returned. * - * The number of valid bytes returned is populated in *numptr; will always + * The number of valid bytes returned is populated in *@numptr; will always * return at least 1 byte. max must not be 0 or greater than the number of * bytes in the FIFO. * @@ -91,15 +102,15 @@ uint8_t fifo8_pop(Fifo8 *fifo); * * Returns: A pointer to popped data. */ -const uint8_t *fifo8_pop_buf(Fifo8 *fifo, uint32_t max, uint32_t *numptr); +const uint8_t *fifo8_pop_bufptr(Fifo8 *fifo, uint32_t max, uint32_t *numptr); /** - * fifo8_peek_buf: read upto max bytes from the fifo + * fifo8_peek_bufptr: read upto max bytes from the fifo * @fifo: FIFO to read from * @max: maximum number of bytes to peek * @numptr: pointer filled with number of bytes returned (can be NULL) * - * Peek into a number of elements from the FIFO up to a maximum of max. + * Peek into a number of elements from the FIFO up to a maximum of @max. * The buffer containing the data peeked into is returned. This buffer points * directly into the FIFO backing store. Since data is invalidated once any * of the fifo8_* APIs are called on the FIFO, it is the caller responsibility @@ -109,7 +120,7 @@ const uint8_t *fifo8_pop_buf(Fifo8 *fifo, uint32_t max, uint32_t *numptr); * around in the ring buffer; in this case only a contiguous part of the data * is returned. * - * The number of valid bytes returned is populated in *numptr; will always + * The number of valid bytes returned is populated in *@numptr; will always * return at least 1 byte. max must not be 0 or greater than the number of * bytes in the FIFO. * @@ -118,7 +129,16 @@ const uint8_t *fifo8_pop_buf(Fifo8 *fifo, uint32_t max, uint32_t *numptr); * * Returns: A pointer to peekable data. */ -const uint8_t *fifo8_peek_buf(Fifo8 *fifo, uint32_t max, uint32_t *numptr); +const uint8_t *fifo8_peek_bufptr(Fifo8 *fifo, uint32_t max, uint32_t *numptr); + +/** + * fifo8_drop: + * @fifo: FIFO to drop bytes + * @len: number of bytes to drop + * + * Drop (consume) bytes from a FIFO. + */ +void fifo8_drop(Fifo8 *fifo, uint32_t len); /** * fifo8_reset: @@ -126,7 +146,6 @@ const uint8_t *fifo8_peek_buf(Fifo8 *fifo, uint32_t max, uint32_t *numptr); * * Reset a FIFO. All data is discarded and the FIFO is emptied. */ - void fifo8_reset(Fifo8 *fifo); /** @@ -137,7 +156,6 @@ void fifo8_reset(Fifo8 *fifo); * * Returns: True if the fifo is empty, false otherwise. */ - bool fifo8_is_empty(Fifo8 *fifo); /** @@ -148,7 +166,6 @@ bool fifo8_is_empty(Fifo8 *fifo); * * Returns: True if the fifo is full, false otherwise. */ - bool fifo8_is_full(Fifo8 *fifo); /** @@ -159,7 +176,6 @@ bool fifo8_is_full(Fifo8 *fifo); * * Returns: Number of free bytes. */ - uint32_t fifo8_num_free(Fifo8 *fifo); /** @@ -170,7 +186,6 @@ uint32_t fifo8_num_free(Fifo8 *fifo); * * Returns: Number of used bytes. */ - uint32_t fifo8_num_used(Fifo8 *fifo); extern const VMStateDescription vmstate_fifo8; diff --git a/include/qemu/range.h b/include/qemu/range.h index 4ce694a398..d446ad885d 100644 --- a/include/qemu/range.h +++ b/include/qemu/range.h @@ -210,8 +210,8 @@ static inline int range_covers_byte(uint64_t offset, uint64_t len, /* Check whether 2 given ranges overlap. * Undefined if ranges that wrap around 0. */ -static inline int ranges_overlap(uint64_t first1, uint64_t len1, - uint64_t first2, uint64_t len2) +static inline bool ranges_overlap(uint64_t first1, uint64_t len1, + uint64_t first2, uint64_t len2) { uint64_t last1 = range_get_last(first1, len1); uint64_t last2 = range_get_last(first2, len2); diff --git a/include/sysemu/host_iommu_device.h b/include/sysemu/host_iommu_device.h index c1bf74ae2c..809cced4ba 100644 --- a/include/sysemu/host_iommu_device.h +++ b/include/sysemu/host_iommu_device.h @@ -20,11 +20,12 @@ * * @type: host platform IOMMU type. * - * @aw_bits: host IOMMU address width. 0xff if no limitation. + * @hw_caps: host platform IOMMU capabilities (e.g. on IOMMUFD this represents + * the @out_capabilities value returned from IOMMU_GET_HW_INFO ioctl) */ typedef struct HostIOMMUDeviceCaps { uint32_t type; - uint8_t aw_bits; + uint64_t hw_caps; } HostIOMMUDeviceCaps; #define TYPE_HOST_IOMMU_DEVICE "host-iommu-device" diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h index 9edfec6045..4c4886c778 100644 --- a/include/sysemu/iommufd.h +++ b/include/sysemu/iommufd.h @@ -49,7 +49,18 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova, ram_addr_t size); bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, uint32_t *type, void *data, uint32_t len, - Error **errp); + uint64_t *caps, Error **errp); +bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id, + uint32_t pt_id, uint32_t flags, + uint32_t data_type, uint32_t data_len, + void *data_ptr, uint32_t *out_hwpt, + Error **errp); +bool iommufd_backend_set_dirty_tracking(IOMMUFDBackend *be, uint32_t hwpt_id, + bool start, Error **errp); +bool iommufd_backend_get_dirty_bitmap(IOMMUFDBackend *be, uint32_t hwpt_id, + uint64_t iova, ram_addr_t size, + uint64_t page_size, uint64_t *data, + Error **errp); #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd" #endif diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index c31d9c7356..c4a914b3d8 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -313,6 +313,31 @@ int kvm_create_device(KVMState *s, uint64_t type, bool test); */ bool kvm_device_supported(int vmfd, uint64_t type); +/** + * kvm_create_vcpu - Gets a parked KVM vCPU or creates a KVM vCPU + * @cpu: QOM CPUState object for which KVM vCPU has to be fetched/created. + * + * @returns: 0 when success, errno (<0) when failed. + */ +int kvm_create_vcpu(CPUState *cpu); + +/** + * kvm_park_vcpu - Park QEMU KVM vCPU context + * @cpu: QOM CPUState object for which QEMU KVM vCPU context has to be parked. + * + * @returns: none + */ +void kvm_park_vcpu(CPUState *cpu); + +/** + * kvm_unpark_vcpu - unpark QEMU KVM vCPU context + * @s: KVM State + * @vcpu_id: Architecture vCPU ID of the parked vCPU + * + * @returns: KVM fd + */ +int kvm_unpark_vcpu(KVMState *s, unsigned long vcpu_id); + /* Arch specific hooks */ extern const KVMCapabilityInfo kvm_arch_required_capabilities[]; diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h index 3f3d13f816..1d8fb1473b 100644 --- a/include/sysemu/kvm_int.h +++ b/include/sysemu/kvm_int.h @@ -14,6 +14,9 @@ #include "qemu/accel.h" #include "qemu/queue.h" #include "sysemu/kvm.h" +#include "hw/boards.h" +#include "hw/i386/topology.h" +#include "io/channel-socket.h" typedef struct KVMSlot { @@ -50,6 +53,34 @@ typedef struct KVMMemoryListener { #define KVM_MSI_HASHTAB_SIZE 256 +typedef struct KVMHostTopoInfo { + /* Number of package on the Host */ + unsigned int maxpkgs; + /* Number of cpus on the Host */ + unsigned int maxcpus; + /* Number of cpus on each different package */ + unsigned int *pkg_cpu_count; + /* Each package can have different maxticks */ + unsigned int *maxticks; +} KVMHostTopoInfo; + +struct KVMMsrEnergy { + pid_t pid; + bool enable; + char *socket_path; + QIOChannelSocket *sioc; + QemuThread msr_thr; + unsigned int guest_vcpus; + unsigned int guest_vsockets; + X86CPUTopoInfo guest_topo_info; + KVMHostTopoInfo host_topo; + const CPUArchIdList *guest_cpu_list; + uint64_t *msr_value; + uint64_t msr_unit; + uint64_t msr_limit; + uint64_t msr_info; +}; + enum KVMDirtyRingReaperState { KVM_DIRTY_RING_REAPER_NONE = 0, /* The reaper is sleeping */ @@ -117,6 +148,7 @@ struct KVMState bool kvm_dirty_ring_with_bitmap; uint64_t kvm_eager_split_size; /* Eager Page Splitting chunk size */ struct KVMDirtyRingReaper reaper; + struct KVMMsrEnergy msr_energy; NotifyVmexitOption notify_vmexit; uint32_t notify_window; uint32_t xen_version; diff --git a/include/sysemu/spdm-socket.h b/include/sysemu/spdm-socket.h new file mode 100644 index 0000000000..5d8bd9aa4e --- /dev/null +++ b/include/sysemu/spdm-socket.h @@ -0,0 +1,74 @@ +/* + * QEMU SPDM socket support + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef SPDM_REQUESTER_H +#define SPDM_REQUESTER_H + +/** + * spdm_socket_connect: connect to an external SPDM socket + * @port: port to connect to + * @errp: error object handle + * + * This will connect to an external SPDM socket server. On error + * it will return -1 and errp will be set. On success this function + * will return the socket number. + */ +int spdm_socket_connect(uint16_t port, Error **errp); + +/** + * spdm_socket_rsp: send and receive a message to a SPDM server + * @socket: socket returned from spdm_socket_connect() + * @transport_type: SPDM_SOCKET_TRANSPORT_TYPE_* macro + * @req: request buffer + * @req_len: request buffer length + * @rsp: response buffer + * @rsp_len: response buffer length + * + * Send platform data to a SPDM server on socket and then receive + * a response. + */ +uint32_t spdm_socket_rsp(const int socket, uint32_t transport_type, + void *req, uint32_t req_len, + void *rsp, uint32_t rsp_len); + +/** + * spdm_socket_close: send a shutdown command to the server + * @socket: socket returned from spdm_socket_connect() + * @transport_type: SPDM_SOCKET_TRANSPORT_TYPE_* macro + * + * This will issue a shutdown command to the server. + */ +void spdm_socket_close(const int socket, uint32_t transport_type); + +#define SPDM_SOCKET_COMMAND_NORMAL 0x0001 +#define SPDM_SOCKET_COMMAND_OOB_ENCAP_KEY_UPDATE 0x8001 +#define SPDM_SOCKET_COMMAND_CONTINUE 0xFFFD +#define SPDM_SOCKET_COMMAND_SHUTDOWN 0xFFFE +#define SPDM_SOCKET_COMMAND_UNKOWN 0xFFFF +#define SPDM_SOCKET_COMMAND_TEST 0xDEAD + +#define SPDM_SOCKET_TRANSPORT_TYPE_MCTP 0x01 +#define SPDM_SOCKET_TRANSPORT_TYPE_PCI_DOE 0x02 + +#define SPDM_SOCKET_MAX_MESSAGE_BUFFER_SIZE 0x1200 + +#endif diff --git a/io/channel-socket.c b/io/channel-socket.c index 3a899b0608..608bcf066e 100644 --- a/io/channel-socket.c +++ b/io/channel-socket.c @@ -841,6 +841,33 @@ qio_channel_socket_set_cork(QIOChannel *ioc, socket_set_cork(sioc->fd, v); } +static int +qio_channel_socket_get_peerpid(QIOChannel *ioc, + unsigned int *pid, + Error **errp) +{ +#ifdef CONFIG_LINUX + QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc); + Error *err = NULL; + socklen_t len = sizeof(struct ucred); + + struct ucred cred; + if (getsockopt(sioc->fd, + SOL_SOCKET, SO_PEERCRED, + &cred, &len) == -1) { + error_setg_errno(&err, errno, "Unable to get peer credentials"); + error_propagate(errp, err); + *pid = -1; + return -1; + } + *pid = (unsigned int)cred.pid; + return 0; +#else + error_setg(errp, "Unsupported feature"); + *pid = -1; + return -1; +#endif +} static int qio_channel_socket_close(QIOChannel *ioc, @@ -938,6 +965,7 @@ static void qio_channel_socket_class_init(ObjectClass *klass, #ifdef QEMU_MSG_ZEROCOPY ioc_klass->io_flush = qio_channel_socket_flush; #endif + ioc_klass->io_peerpid = qio_channel_socket_get_peerpid; } static const TypeInfo qio_channel_socket_info = { diff --git a/io/channel-tls.c b/io/channel-tls.c index 67b9700006..aab630e5ae 100644 --- a/io/channel-tls.c +++ b/io/channel-tls.c @@ -28,17 +28,16 @@ static ssize_t qio_channel_tls_write_handler(const char *buf, size_t len, - void *opaque) + void *opaque, + Error **errp) { QIOChannelTLS *tioc = QIO_CHANNEL_TLS(opaque); ssize_t ret; - ret = qio_channel_write(tioc->master, buf, len, NULL); + ret = qio_channel_write(tioc->master, buf, len, errp); if (ret == QIO_CHANNEL_ERR_BLOCK) { - errno = EAGAIN; - return -1; + return QCRYPTO_TLS_SESSION_ERR_BLOCK; } else if (ret < 0) { - errno = EIO; return -1; } return ret; @@ -46,17 +45,16 @@ static ssize_t qio_channel_tls_write_handler(const char *buf, static ssize_t qio_channel_tls_read_handler(char *buf, size_t len, - void *opaque) + void *opaque, + Error **errp) { QIOChannelTLS *tioc = QIO_CHANNEL_TLS(opaque); ssize_t ret; - ret = qio_channel_read(tioc->master, buf, len, NULL); + ret = qio_channel_read(tioc->master, buf, len, errp); if (ret == QIO_CHANNEL_ERR_BLOCK) { - errno = EAGAIN; - return -1; + return QCRYPTO_TLS_SESSION_ERR_BLOCK; } else if (ret < 0) { - errno = EIO; return -1; } return ret; @@ -277,24 +275,19 @@ static ssize_t qio_channel_tls_readv(QIOChannel *ioc, ssize_t got = 0; for (i = 0 ; i < niov ; i++) { - ssize_t ret = qcrypto_tls_session_read(tioc->session, - iov[i].iov_base, - iov[i].iov_len); - if (ret < 0) { - if (errno == EAGAIN) { - if (got) { - return got; - } else { - return QIO_CHANNEL_ERR_BLOCK; - } - } else if (errno == ECONNABORTED && - (qatomic_load_acquire(&tioc->shutdown) & - QIO_CHANNEL_SHUTDOWN_READ)) { - return 0; + ssize_t ret = qcrypto_tls_session_read( + tioc->session, + iov[i].iov_base, + iov[i].iov_len, + qatomic_load_acquire(&tioc->shutdown) & QIO_CHANNEL_SHUTDOWN_READ, + errp); + if (ret == QCRYPTO_TLS_SESSION_ERR_BLOCK) { + if (got) { + return got; + } else { + return QIO_CHANNEL_ERR_BLOCK; } - - error_setg_errno(errp, errno, - "Cannot read from TLS channel"); + } else if (ret < 0) { return -1; } got += ret; @@ -321,18 +314,15 @@ static ssize_t qio_channel_tls_writev(QIOChannel *ioc, for (i = 0 ; i < niov ; i++) { ssize_t ret = qcrypto_tls_session_write(tioc->session, iov[i].iov_base, - iov[i].iov_len); - if (ret <= 0) { - if (errno == EAGAIN) { - if (done) { - return done; - } else { - return QIO_CHANNEL_ERR_BLOCK; - } + iov[i].iov_len, + errp); + if (ret == QCRYPTO_TLS_SESSION_ERR_BLOCK) { + if (done) { + return done; + } else { + return QIO_CHANNEL_ERR_BLOCK; } - - error_setg_errno(errp, errno, - "Cannot write to TLS channel"); + } else if (ret < 0) { return -1; } done += ret; diff --git a/io/channel.c b/io/channel.c index a1f12f8e90..e3f17c24a0 100644 --- a/io/channel.c +++ b/io/channel.c @@ -548,6 +548,19 @@ void qio_channel_set_cork(QIOChannel *ioc, } } +int qio_channel_get_peerpid(QIOChannel *ioc, + unsigned int *pid, + Error **errp) +{ + QIOChannelClass *klass = QIO_CHANNEL_GET_CLASS(ioc); + + if (!klass->io_peerpid) { + error_setg(errp, "Channel does not support peer pid"); + return -1; + } + klass->io_peerpid(ioc, pid, errp); + return 0; +} off_t qio_channel_io_seek(QIOChannel *ioc, off_t offset, diff --git a/meson.build b/meson.build index a1e51277b0..5613b62a4f 100644 --- a/meson.build +++ b/meson.build @@ -1696,7 +1696,6 @@ endif if not gnutls_crypto.found() if (not get_option('gcrypt').auto() or have_system) and not get_option('nettle').enabled() gcrypt = dependency('libgcrypt', version: '>=1.8', - method: 'config-tool', required: get_option('gcrypt')) # Debian has removed -lgpg-error from libgcrypt-config # as it "spreads unnecessary dependencies" which in @@ -1979,6 +1978,7 @@ endif tasn1 = not_found if gnutls.found() tasn1 = dependency('libtasn1', + required: false, method: 'pkg-config') endif keyutils = not_found @@ -2187,6 +2187,19 @@ have_virtfs_proxy_helper = get_option('virtfs_proxy_helper') \ .require(libcap_ng.found(), error_message: 'the virtfs proxy helper requires libcap-ng') \ .allowed() +qga_fsfreeze = false +qga_fstrim = false +if host_os == 'linux' + if cc.has_header_symbol('linux/fs.h', 'FIFREEZE') + qga_fsfreeze = true + endif + if cc.has_header_symbol('linux/fs.h', 'FITRIM') + qga_fstrim = true + endif +elif host_os == 'freebsd' and cc.has_header_symbol('ufs/ffs/fs.h', 'UFSSUSPEND') + qga_fsfreeze = true +endif + if get_option('block_drv_ro_whitelist') == '' config_host_data.set('CONFIG_BDRV_RO_WHITELIST', '') else @@ -2263,6 +2276,7 @@ config_host_data.set('CONFIG_ATTR', libattr.found()) config_host_data.set('CONFIG_BDRV_WHITELIST_TOOLS', get_option('block_drv_whitelist_in_tools')) config_host_data.set('CONFIG_BRLAPI', brlapi.found()) config_host_data.set('CONFIG_BSD', host_os in bsd_oses) +config_host_data.set('CONFIG_FREEBSD', host_os == 'freebsd') config_host_data.set('CONFIG_CAPSTONE', capstone.found()) config_host_data.set('CONFIG_COCOA', cocoa.found()) config_host_data.set('CONFIG_DARWIN', host_os == 'darwin') @@ -2423,6 +2437,8 @@ config_host_data.set('CONFIG_DEBUG_TCG', get_option('debug_tcg')) config_host_data.set('CONFIG_DEBUG_REMAP', get_option('debug_remap')) config_host_data.set('CONFIG_QOM_CAST_DEBUG', get_option('qom_cast_debug')) config_host_data.set('CONFIG_REPLICATION', get_option('replication').allowed()) +config_host_data.set('CONFIG_FSFREEZE', qga_fsfreeze) +config_host_data.set('CONFIG_FSTRIM', qga_fstrim) # has_header config_host_data.set('CONFIG_EPOLL', cc.has_header('sys/epoll.h')) @@ -3327,6 +3343,7 @@ if have_block trace_events_subdirs += [ 'authz', 'block', + 'chardev', 'io', 'nbd', 'scsi', @@ -3338,7 +3355,6 @@ if have_system 'audio', 'backends', 'backends/tpm', - 'chardev', 'ebpf', 'hw/9pfs', 'hw/acpi', @@ -4073,6 +4089,13 @@ if have_tools dependencies: [authz, crypto, io, qom, qemuutil, libcap_ng, mpathpersist], install: true) + + if cpu in ['x86', 'x86_64'] + executable('qemu-vmsr-helper', files('tools/i386/qemu-vmsr-helper.c'), + dependencies: [authz, crypto, io, qom, qemuutil, + libcap_ng, mpathpersist], + install: true) + endif endif if have_ivshmem diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c index daa38428c5..03457ead66 100644 --- a/net/vhost-vdpa.c +++ b/net/vhost-vdpa.c @@ -62,6 +62,7 @@ const int vdpa_feature_bits[] = { VIRTIO_F_RING_PACKED, VIRTIO_F_RING_RESET, VIRTIO_F_VERSION_1, + VIRTIO_F_IN_ORDER, VIRTIO_F_NOTIFICATION_DATA, VIRTIO_NET_F_CSUM, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, diff --git a/qapi/crypto.json b/qapi/crypto.json index e102be337b..f03bdab8c9 100644 --- a/qapi/crypto.json +++ b/qapi/crypto.json @@ -226,8 +226,6 @@ # @iter-time: number of milliseconds to spend in PBKDF passphrase # processing. Currently defaults to 2000. (since 2.8) # -# @detached-header: create a detached LUKS header. (since 9.0) -# # Since: 2.6 ## { 'struct': 'QCryptoBlockCreateOptionsLUKS', @@ -237,8 +235,7 @@ '*ivgen-alg': 'QCryptoIVGenAlgorithm', '*ivgen-hash-alg': 'QCryptoHashAlgorithm', '*hash-alg': 'QCryptoHashAlgorithm', - '*iter-time': 'int', - '*detached-header': 'bool'}} + '*iter-time': 'int' }} ## # @QCryptoBlockOpenOptions: diff --git a/qga/commands-bsd.c b/qga/commands-bsd.c index 17bddda1cf..9ce48af311 100644 --- a/qga/commands-bsd.c +++ b/qga/commands-bsd.c @@ -149,30 +149,6 @@ int qmp_guest_fsfreeze_do_thaw(Error **errp) } return ret; } - -GuestFilesystemInfoList *qmp_guest_get_fsinfo(Error **errp) -{ - error_setg(errp, QERR_UNSUPPORTED); - return NULL; -} - -GuestDiskInfoList *qmp_guest_get_disks(Error **errp) -{ - error_setg(errp, QERR_UNSUPPORTED); - return NULL; -} - -GuestDiskStatsInfoList *qmp_guest_get_diskstats(Error **errp) -{ - error_setg(errp, QERR_UNSUPPORTED); - return NULL; -} - -GuestCpuStatsList *qmp_guest_get_cpustats(Error **errp) -{ - error_setg(errp, QERR_UNSUPPORTED); - return NULL; -} #endif /* CONFIG_FSFREEZE */ #ifdef HAVE_GETIFADDRS diff --git a/qga/commands-common.h b/qga/commands-common.h index 8c1c56aac9..263e7c0525 100644 --- a/qga/commands-common.h +++ b/qga/commands-common.h @@ -15,19 +15,10 @@ #if defined(__linux__) #include <linux/fs.h> -#ifdef FIFREEZE -#define CONFIG_FSFREEZE -#endif -#ifdef FITRIM -#define CONFIG_FSTRIM -#endif #endif /* __linux__ */ #ifdef __FreeBSD__ #include <ufs/ffs/fs.h> -#ifdef UFSSUSPEND -#define CONFIG_FSFREEZE -#endif #endif /* __FreeBSD__ */ #if defined(CONFIG_FSFREEZE) || defined(CONFIG_FSTRIM) diff --git a/qga/commands-linux.c b/qga/commands-linux.c index 214e408fcd..51d5e3d927 100644 --- a/qga/commands-linux.c +++ b/qga/commands-linux.c @@ -13,10 +13,26 @@ #include "qemu/osdep.h" #include "qapi/error.h" +#include "qga-qapi-commands.h" +#include "qapi/error.h" +#include "qapi/qmp/qerror.h" #include "commands-common.h" #include "cutils.h" #include <mntent.h> #include <sys/ioctl.h> +#include <mntent.h> +#include <linux/nvme_ioctl.h> +#include "block/nvme.h" + +#ifdef CONFIG_LIBUDEV +#include <libudev.h> +#endif + +#ifdef HAVE_GETIFADDRS +#include <net/if.h> +#endif + +#include <sys/statvfs.h> #if defined(CONFIG_FSFREEZE) || defined(CONFIG_FSTRIM) static int dev_major_minor(const char *devpath, @@ -284,3 +300,1925 @@ int qmp_guest_fsfreeze_do_thaw(Error **errp) return i; } #endif /* CONFIG_FSFREEZE */ + +#if defined(CONFIG_FSFREEZE) + +static char *get_pci_driver(char const *syspath, int pathlen, Error **errp) +{ + char *path; + char *dpath; + char *driver = NULL; + char buf[PATH_MAX]; + ssize_t len; + + path = g_strndup(syspath, pathlen); + dpath = g_strdup_printf("%s/driver", path); + len = readlink(dpath, buf, sizeof(buf) - 1); + if (len != -1) { + buf[len] = 0; + driver = g_path_get_basename(buf); + } + g_free(dpath); + g_free(path); + return driver; +} + +static int compare_uint(const void *_a, const void *_b) +{ + unsigned int a = *(unsigned int *)_a; + unsigned int b = *(unsigned int *)_b; + + return a < b ? -1 : a > b ? 1 : 0; +} + +/* Walk the specified sysfs and build a sorted list of host or ata numbers */ +static int build_hosts(char const *syspath, char const *host, bool ata, + unsigned int *hosts, int hosts_max, Error **errp) +{ + char *path; + DIR *dir; + struct dirent *entry; + int i = 0; + + path = g_strndup(syspath, host - syspath); + dir = opendir(path); + if (!dir) { + error_setg_errno(errp, errno, "opendir(\"%s\")", path); + g_free(path); + return -1; + } + + while (i < hosts_max) { + entry = readdir(dir); + if (!entry) { + break; + } + if (ata && sscanf(entry->d_name, "ata%d", hosts + i) == 1) { + ++i; + } else if (!ata && sscanf(entry->d_name, "host%d", hosts + i) == 1) { + ++i; + } + } + + qsort(hosts, i, sizeof(hosts[0]), compare_uint); + + g_free(path); + closedir(dir); + return i; +} + +/* + * Store disk device info for devices on the PCI bus. + * Returns true if information has been stored, or false for failure. + */ +static bool build_guest_fsinfo_for_pci_dev(char const *syspath, + GuestDiskAddress *disk, + Error **errp) +{ + unsigned int pci[4], host, hosts[8], tgt[3]; + int i, nhosts = 0, pcilen; + GuestPCIAddress *pciaddr = disk->pci_controller; + bool has_ata = false, has_host = false, has_tgt = false; + char *p, *q, *driver = NULL; + bool ret = false; + + p = strstr(syspath, "/devices/pci"); + if (!p || sscanf(p + 12, "%*x:%*x/%x:%x:%x.%x%n", + pci, pci + 1, pci + 2, pci + 3, &pcilen) < 4) { + g_debug("only pci device is supported: sysfs path '%s'", syspath); + return false; + } + + p += 12 + pcilen; + while (true) { + driver = get_pci_driver(syspath, p - syspath, errp); + if (driver && (g_str_equal(driver, "ata_piix") || + g_str_equal(driver, "sym53c8xx") || + g_str_equal(driver, "virtio-pci") || + g_str_equal(driver, "ahci") || + g_str_equal(driver, "nvme") || + g_str_equal(driver, "xhci_hcd") || + g_str_equal(driver, "ehci-pci"))) { + break; + } + + g_free(driver); + if (sscanf(p, "/%x:%x:%x.%x%n", + pci, pci + 1, pci + 2, pci + 3, &pcilen) == 4) { + p += pcilen; + continue; + } + + g_debug("unsupported driver or sysfs path '%s'", syspath); + return false; + } + + p = strstr(syspath, "/target"); + if (p && sscanf(p + 7, "%*u:%*u:%*u/%*u:%u:%u:%u", + tgt, tgt + 1, tgt + 2) == 3) { + has_tgt = true; + } + + p = strstr(syspath, "/ata"); + if (p) { + q = p + 4; + has_ata = true; + } else { + p = strstr(syspath, "/host"); + q = p + 5; + } + if (p && sscanf(q, "%u", &host) == 1) { + has_host = true; + nhosts = build_hosts(syspath, p, has_ata, hosts, + ARRAY_SIZE(hosts), errp); + if (nhosts < 0) { + goto cleanup; + } + } + + pciaddr->domain = pci[0]; + pciaddr->bus = pci[1]; + pciaddr->slot = pci[2]; + pciaddr->function = pci[3]; + + if (strcmp(driver, "ata_piix") == 0) { + /* a host per ide bus, target*:0:<unit>:0 */ + if (!has_host || !has_tgt) { + g_debug("invalid sysfs path '%s' (driver '%s')", syspath, driver); + goto cleanup; + } + for (i = 0; i < nhosts; i++) { + if (host == hosts[i]) { + disk->bus_type = GUEST_DISK_BUS_TYPE_IDE; + disk->bus = i; + disk->unit = tgt[1]; + break; + } + } + if (i >= nhosts) { + g_debug("no host for '%s' (driver '%s')", syspath, driver); + goto cleanup; + } + } else if (strcmp(driver, "sym53c8xx") == 0) { + /* scsi(LSI Logic): target*:0:<unit>:0 */ + if (!has_tgt) { + g_debug("invalid sysfs path '%s' (driver '%s')", syspath, driver); + goto cleanup; + } + disk->bus_type = GUEST_DISK_BUS_TYPE_SCSI; + disk->unit = tgt[1]; + } else if (strcmp(driver, "virtio-pci") == 0) { + if (has_tgt) { + /* virtio-scsi: target*:0:0:<unit> */ + disk->bus_type = GUEST_DISK_BUS_TYPE_SCSI; + disk->unit = tgt[2]; + } else { + /* virtio-blk: 1 disk per 1 device */ + disk->bus_type = GUEST_DISK_BUS_TYPE_VIRTIO; + } + } else if (strcmp(driver, "ahci") == 0) { + /* ahci: 1 host per 1 unit */ + if (!has_host || !has_tgt) { + g_debug("invalid sysfs path '%s' (driver '%s')", syspath, driver); + goto cleanup; + } + for (i = 0; i < nhosts; i++) { + if (host == hosts[i]) { + disk->unit = i; + disk->bus_type = GUEST_DISK_BUS_TYPE_SATA; + break; + } + } + if (i >= nhosts) { + g_debug("no host for '%s' (driver '%s')", syspath, driver); + goto cleanup; + } + } else if (strcmp(driver, "nvme") == 0) { + disk->bus_type = GUEST_DISK_BUS_TYPE_NVME; + } else if (strcmp(driver, "ehci-pci") == 0 || strcmp(driver, "xhci_hcd") == 0) { + disk->bus_type = GUEST_DISK_BUS_TYPE_USB; + } else { + g_debug("unknown driver '%s' (sysfs path '%s')", driver, syspath); + goto cleanup; + } + + ret = true; + +cleanup: + g_free(driver); + return ret; +} + +/* + * Store disk device info for non-PCI virtio devices (for example s390x + * channel I/O devices). Returns true if information has been stored, or + * false for failure. + */ +static bool build_guest_fsinfo_for_nonpci_virtio(char const *syspath, + GuestDiskAddress *disk, + Error **errp) +{ + unsigned int tgt[3]; + char *p; + + if (!strstr(syspath, "/virtio") || !strstr(syspath, "/block")) { + g_debug("Unsupported virtio device '%s'", syspath); + return false; + } + + p = strstr(syspath, "/target"); + if (p && sscanf(p + 7, "%*u:%*u:%*u/%*u:%u:%u:%u", + &tgt[0], &tgt[1], &tgt[2]) == 3) { + /* virtio-scsi: target*:0:<target>:<unit> */ + disk->bus_type = GUEST_DISK_BUS_TYPE_SCSI; + disk->bus = tgt[0]; + disk->target = tgt[1]; + disk->unit = tgt[2]; + } else { + /* virtio-blk: 1 disk per 1 device */ + disk->bus_type = GUEST_DISK_BUS_TYPE_VIRTIO; + } + + return true; +} + +/* + * Store disk device info for CCW devices (s390x channel I/O devices). + * Returns true if information has been stored, or false for failure. + */ +static bool build_guest_fsinfo_for_ccw_dev(char const *syspath, + GuestDiskAddress *disk, + Error **errp) +{ + unsigned int cssid, ssid, subchno, devno; + char *p; + + p = strstr(syspath, "/devices/css"); + if (!p || sscanf(p + 12, "%*x/%x.%x.%x/%*x.%*x.%x/", + &cssid, &ssid, &subchno, &devno) < 4) { + g_debug("could not parse ccw device sysfs path: %s", syspath); + return false; + } + + disk->ccw_address = g_new0(GuestCCWAddress, 1); + disk->ccw_address->cssid = cssid; + disk->ccw_address->ssid = ssid; + disk->ccw_address->subchno = subchno; + disk->ccw_address->devno = devno; + + if (strstr(p, "/virtio")) { + build_guest_fsinfo_for_nonpci_virtio(syspath, disk, errp); + } + + return true; +} + +/* Store disk device info specified by @sysfs into @fs */ +static void build_guest_fsinfo_for_real_device(char const *syspath, + GuestFilesystemInfo *fs, + Error **errp) +{ + GuestDiskAddress *disk; + GuestPCIAddress *pciaddr; + bool has_hwinf; +#ifdef CONFIG_LIBUDEV + struct udev *udev = NULL; + struct udev_device *udevice = NULL; +#endif + + pciaddr = g_new0(GuestPCIAddress, 1); + pciaddr->domain = -1; /* -1 means field is invalid */ + pciaddr->bus = -1; + pciaddr->slot = -1; + pciaddr->function = -1; + + disk = g_new0(GuestDiskAddress, 1); + disk->pci_controller = pciaddr; + disk->bus_type = GUEST_DISK_BUS_TYPE_UNKNOWN; + +#ifdef CONFIG_LIBUDEV + udev = udev_new(); + udevice = udev_device_new_from_syspath(udev, syspath); + if (udev == NULL || udevice == NULL) { + g_debug("failed to query udev"); + } else { + const char *devnode, *serial; + devnode = udev_device_get_devnode(udevice); + if (devnode != NULL) { + disk->dev = g_strdup(devnode); + } + serial = udev_device_get_property_value(udevice, "ID_SERIAL"); + if (serial != NULL && *serial != 0) { + disk->serial = g_strdup(serial); + } + } + + udev_unref(udev); + udev_device_unref(udevice); +#endif + + if (strstr(syspath, "/devices/pci")) { + has_hwinf = build_guest_fsinfo_for_pci_dev(syspath, disk, errp); + } else if (strstr(syspath, "/devices/css")) { + has_hwinf = build_guest_fsinfo_for_ccw_dev(syspath, disk, errp); + } else if (strstr(syspath, "/virtio")) { + has_hwinf = build_guest_fsinfo_for_nonpci_virtio(syspath, disk, errp); + } else { + g_debug("Unsupported device type for '%s'", syspath); + has_hwinf = false; + } + + if (has_hwinf || disk->dev || disk->serial) { + QAPI_LIST_PREPEND(fs->disk, disk); + } else { + qapi_free_GuestDiskAddress(disk); + } +} + +static void build_guest_fsinfo_for_device(char const *devpath, + GuestFilesystemInfo *fs, + Error **errp); + +/* Store a list of slave devices of virtual volume specified by @syspath into + * @fs */ +static void build_guest_fsinfo_for_virtual_device(char const *syspath, + GuestFilesystemInfo *fs, + Error **errp) +{ + Error *err = NULL; + DIR *dir; + char *dirpath; + struct dirent *entry; + + dirpath = g_strdup_printf("%s/slaves", syspath); + dir = opendir(dirpath); + if (!dir) { + if (errno != ENOENT) { + error_setg_errno(errp, errno, "opendir(\"%s\")", dirpath); + } + g_free(dirpath); + return; + } + + for (;;) { + errno = 0; + entry = readdir(dir); + if (entry == NULL) { + if (errno) { + error_setg_errno(errp, errno, "readdir(\"%s\")", dirpath); + } + break; + } + + if (entry->d_type == DT_LNK) { + char *path; + + g_debug(" slave device '%s'", entry->d_name); + path = g_strdup_printf("%s/slaves/%s", syspath, entry->d_name); + build_guest_fsinfo_for_device(path, fs, &err); + g_free(path); + + if (err) { + error_propagate(errp, err); + break; + } + } + } + + g_free(dirpath); + closedir(dir); +} + +static bool is_disk_virtual(const char *devpath, Error **errp) +{ + g_autofree char *syspath = realpath(devpath, NULL); + + if (!syspath) { + error_setg_errno(errp, errno, "realpath(\"%s\")", devpath); + return false; + } + return strstr(syspath, "/devices/virtual/block/") != NULL; +} + +/* Dispatch to functions for virtual/real device */ +static void build_guest_fsinfo_for_device(char const *devpath, + GuestFilesystemInfo *fs, + Error **errp) +{ + ERRP_GUARD(); + g_autofree char *syspath = NULL; + bool is_virtual = false; + + syspath = realpath(devpath, NULL); + if (!syspath) { + if (errno != ENOENT) { + error_setg_errno(errp, errno, "realpath(\"%s\")", devpath); + return; + } + + /* ENOENT: This devpath may not exist because of container config */ + if (!fs->name) { + fs->name = g_path_get_basename(devpath); + } + return; + } + + if (!fs->name) { + fs->name = g_path_get_basename(syspath); + } + + g_debug(" parse sysfs path '%s'", syspath); + is_virtual = is_disk_virtual(syspath, errp); + if (*errp != NULL) { + return; + } + if (is_virtual) { + build_guest_fsinfo_for_virtual_device(syspath, fs, errp); + } else { + build_guest_fsinfo_for_real_device(syspath, fs, errp); + } +} + +#ifdef CONFIG_LIBUDEV + +/* + * Wrapper around build_guest_fsinfo_for_device() for getting just + * the disk address. + */ +static GuestDiskAddress *get_disk_address(const char *syspath, Error **errp) +{ + g_autoptr(GuestFilesystemInfo) fs = NULL; + + fs = g_new0(GuestFilesystemInfo, 1); + build_guest_fsinfo_for_device(syspath, fs, errp); + if (fs->disk != NULL) { + return g_steal_pointer(&fs->disk->value); + } + return NULL; +} + +static char *get_alias_for_syspath(const char *syspath) +{ + struct udev *udev = NULL; + struct udev_device *udevice = NULL; + char *ret = NULL; + + udev = udev_new(); + if (udev == NULL) { + g_debug("failed to query udev"); + goto out; + } + udevice = udev_device_new_from_syspath(udev, syspath); + if (udevice == NULL) { + g_debug("failed to query udev for path: %s", syspath); + goto out; + } else { + const char *alias = udev_device_get_property_value( + udevice, "DM_NAME"); + /* + * NULL means there was an error and empty string means there is no + * alias. In case of no alias we return NULL instead of empty string. + */ + if (alias == NULL) { + g_debug("failed to query udev for device alias for: %s", + syspath); + } else if (*alias != 0) { + ret = g_strdup(alias); + } + } + +out: + udev_unref(udev); + udev_device_unref(udevice); + return ret; +} + +static char *get_device_for_syspath(const char *syspath) +{ + struct udev *udev = NULL; + struct udev_device *udevice = NULL; + char *ret = NULL; + + udev = udev_new(); + if (udev == NULL) { + g_debug("failed to query udev"); + goto out; + } + udevice = udev_device_new_from_syspath(udev, syspath); + if (udevice == NULL) { + g_debug("failed to query udev for path: %s", syspath); + goto out; + } else { + ret = g_strdup(udev_device_get_devnode(udevice)); + } + +out: + udev_unref(udev); + udev_device_unref(udevice); + return ret; +} + +static void get_disk_deps(const char *disk_dir, GuestDiskInfo *disk) +{ + g_autofree char *deps_dir = NULL; + const gchar *dep; + GDir *dp_deps = NULL; + + /* List dependent disks */ + deps_dir = g_strdup_printf("%s/slaves", disk_dir); + g_debug(" listing entries in: %s", deps_dir); + dp_deps = g_dir_open(deps_dir, 0, NULL); + if (dp_deps == NULL) { + g_debug("failed to list entries in %s", deps_dir); + return; + } + disk->has_dependencies = true; + while ((dep = g_dir_read_name(dp_deps)) != NULL) { + g_autofree char *dep_dir = NULL; + char *dev_name; + + /* Add dependent disks */ + dep_dir = g_strdup_printf("%s/%s", deps_dir, dep); + dev_name = get_device_for_syspath(dep_dir); + if (dev_name != NULL) { + g_debug(" adding dependent device: %s", dev_name); + QAPI_LIST_PREPEND(disk->dependencies, dev_name); + } + } + g_dir_close(dp_deps); +} + +/* + * Detect partitions subdirectory, name is "<disk_name><number>" or + * "<disk_name>p<number>" + * + * @disk_name -- last component of /sys path (e.g. sda) + * @disk_dir -- sys path of the disk (e.g. /sys/block/sda) + * @disk_dev -- device node of the disk (e.g. /dev/sda) + */ +static GuestDiskInfoList *get_disk_partitions( + GuestDiskInfoList *list, + const char *disk_name, const char *disk_dir, + const char *disk_dev) +{ + GuestDiskInfoList *ret = list; + struct dirent *de_disk; + DIR *dp_disk = NULL; + size_t len = strlen(disk_name); + + dp_disk = opendir(disk_dir); + while ((de_disk = readdir(dp_disk)) != NULL) { + g_autofree char *partition_dir = NULL; + char *dev_name; + GuestDiskInfo *partition; + + if (!(de_disk->d_type & DT_DIR)) { + continue; + } + + if (!(strncmp(disk_name, de_disk->d_name, len) == 0 && + ((*(de_disk->d_name + len) == 'p' && + isdigit(*(de_disk->d_name + len + 1))) || + isdigit(*(de_disk->d_name + len))))) { + continue; + } + + partition_dir = g_strdup_printf("%s/%s", + disk_dir, de_disk->d_name); + dev_name = get_device_for_syspath(partition_dir); + if (dev_name == NULL) { + g_debug("Failed to get device name for syspath: %s", + disk_dir); + continue; + } + partition = g_new0(GuestDiskInfo, 1); + partition->name = dev_name; + partition->partition = true; + partition->has_dependencies = true; + /* Add parent disk as dependent for easier tracking of hierarchy */ + QAPI_LIST_PREPEND(partition->dependencies, g_strdup(disk_dev)); + + QAPI_LIST_PREPEND(ret, partition); + } + closedir(dp_disk); + + return ret; +} + +static void get_nvme_smart(GuestDiskInfo *disk) +{ + int fd; + GuestNVMeSmart *smart; + NvmeSmartLog log = {0}; + struct nvme_admin_cmd cmd = { + .opcode = NVME_ADM_CMD_GET_LOG_PAGE, + .nsid = NVME_NSID_BROADCAST, + .addr = (uintptr_t)&log, + .data_len = sizeof(log), + .cdw10 = NVME_LOG_SMART_INFO | (1 << 15) /* RAE bit */ + | (((sizeof(log) >> 2) - 1) << 16) + }; + + fd = qga_open_cloexec(disk->name, O_RDONLY, 0); + if (fd == -1) { + g_debug("Failed to open device: %s: %s", disk->name, g_strerror(errno)); + return; + } + + if (ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd)) { + g_debug("Failed to get smart: %s: %s", disk->name, g_strerror(errno)); + close(fd); + return; + } + + disk->smart = g_new0(GuestDiskSmart, 1); + disk->smart->type = GUEST_DISK_BUS_TYPE_NVME; + + smart = &disk->smart->u.nvme; + smart->critical_warning = log.critical_warning; + smart->temperature = lduw_le_p(&log.temperature); /* unaligned field */ + smart->available_spare = log.available_spare; + smart->available_spare_threshold = log.available_spare_threshold; + smart->percentage_used = log.percentage_used; + smart->data_units_read_lo = le64_to_cpu(log.data_units_read[0]); + smart->data_units_read_hi = le64_to_cpu(log.data_units_read[1]); + smart->data_units_written_lo = le64_to_cpu(log.data_units_written[0]); + smart->data_units_written_hi = le64_to_cpu(log.data_units_written[1]); + smart->host_read_commands_lo = le64_to_cpu(log.host_read_commands[0]); + smart->host_read_commands_hi = le64_to_cpu(log.host_read_commands[1]); + smart->host_write_commands_lo = le64_to_cpu(log.host_write_commands[0]); + smart->host_write_commands_hi = le64_to_cpu(log.host_write_commands[1]); + smart->controller_busy_time_lo = le64_to_cpu(log.controller_busy_time[0]); + smart->controller_busy_time_hi = le64_to_cpu(log.controller_busy_time[1]); + smart->power_cycles_lo = le64_to_cpu(log.power_cycles[0]); + smart->power_cycles_hi = le64_to_cpu(log.power_cycles[1]); + smart->power_on_hours_lo = le64_to_cpu(log.power_on_hours[0]); + smart->power_on_hours_hi = le64_to_cpu(log.power_on_hours[1]); + smart->unsafe_shutdowns_lo = le64_to_cpu(log.unsafe_shutdowns[0]); + smart->unsafe_shutdowns_hi = le64_to_cpu(log.unsafe_shutdowns[1]); + smart->media_errors_lo = le64_to_cpu(log.media_errors[0]); + smart->media_errors_hi = le64_to_cpu(log.media_errors[1]); + smart->number_of_error_log_entries_lo = + le64_to_cpu(log.number_of_error_log_entries[0]); + smart->number_of_error_log_entries_hi = + le64_to_cpu(log.number_of_error_log_entries[1]); + + close(fd); +} + +static void get_disk_smart(GuestDiskInfo *disk) +{ + if (disk->address + && (disk->address->bus_type == GUEST_DISK_BUS_TYPE_NVME)) { + get_nvme_smart(disk); + } +} + +GuestDiskInfoList *qmp_guest_get_disks(Error **errp) +{ + GuestDiskInfoList *ret = NULL; + GuestDiskInfo *disk; + DIR *dp = NULL; + struct dirent *de = NULL; + + g_debug("listing /sys/block directory"); + dp = opendir("/sys/block"); + if (dp == NULL) { + error_setg_errno(errp, errno, "Can't open directory \"/sys/block\""); + return NULL; + } + while ((de = readdir(dp)) != NULL) { + g_autofree char *disk_dir = NULL, *line = NULL, + *size_path = NULL; + char *dev_name; + Error *local_err = NULL; + if (de->d_type != DT_LNK) { + g_debug(" skipping entry: %s", de->d_name); + continue; + } + + /* Check size and skip zero-sized disks */ + g_debug(" checking disk size"); + size_path = g_strdup_printf("/sys/block/%s/size", de->d_name); + if (!g_file_get_contents(size_path, &line, NULL, NULL)) { + g_debug(" failed to read disk size"); + continue; + } + if (g_strcmp0(line, "0\n") == 0) { + g_debug(" skipping zero-sized disk"); + continue; + } + + g_debug(" adding %s", de->d_name); + disk_dir = g_strdup_printf("/sys/block/%s", de->d_name); + dev_name = get_device_for_syspath(disk_dir); + if (dev_name == NULL) { + g_debug("Failed to get device name for syspath: %s", + disk_dir); + continue; + } + disk = g_new0(GuestDiskInfo, 1); + disk->name = dev_name; + disk->partition = false; + disk->alias = get_alias_for_syspath(disk_dir); + QAPI_LIST_PREPEND(ret, disk); + + /* Get address for non-virtual devices */ + bool is_virtual = is_disk_virtual(disk_dir, &local_err); + if (local_err != NULL) { + g_debug(" failed to check disk path, ignoring error: %s", + error_get_pretty(local_err)); + error_free(local_err); + local_err = NULL; + /* Don't try to get the address */ + is_virtual = true; + } + if (!is_virtual) { + disk->address = get_disk_address(disk_dir, &local_err); + if (local_err != NULL) { + g_debug(" failed to get device info, ignoring error: %s", + error_get_pretty(local_err)); + error_free(local_err); + local_err = NULL; + } + } + + get_disk_deps(disk_dir, disk); + get_disk_smart(disk); + ret = get_disk_partitions(ret, de->d_name, disk_dir, dev_name); + } + + closedir(dp); + + return ret; +} + +#endif + +/* Return a list of the disk device(s)' info which @mount lies on */ +static GuestFilesystemInfo *build_guest_fsinfo(struct FsMount *mount, + Error **errp) +{ + GuestFilesystemInfo *fs = g_malloc0(sizeof(*fs)); + struct statvfs buf; + unsigned long used, nonroot_total, fr_size; + char *devpath = g_strdup_printf("/sys/dev/block/%u:%u", + mount->devmajor, mount->devminor); + + fs->mountpoint = g_strdup(mount->dirname); + fs->type = g_strdup(mount->devtype); + build_guest_fsinfo_for_device(devpath, fs, errp); + + if (statvfs(fs->mountpoint, &buf) == 0) { + fr_size = buf.f_frsize; + used = buf.f_blocks - buf.f_bfree; + nonroot_total = used + buf.f_bavail; + fs->used_bytes = used * fr_size; + fs->total_bytes = nonroot_total * fr_size; + fs->total_bytes_privileged = buf.f_blocks * fr_size; + + fs->has_total_bytes = true; + fs->has_total_bytes_privileged = true; + fs->has_used_bytes = true; + } + + g_free(devpath); + + return fs; +} + +GuestFilesystemInfoList *qmp_guest_get_fsinfo(Error **errp) +{ + FsMountList mounts; + struct FsMount *mount; + GuestFilesystemInfoList *ret = NULL; + Error *local_err = NULL; + + QTAILQ_INIT(&mounts); + if (!build_fs_mount_list(&mounts, &local_err)) { + error_propagate(errp, local_err); + return NULL; + } + + QTAILQ_FOREACH(mount, &mounts, next) { + g_debug("Building guest fsinfo for '%s'", mount->dirname); + + QAPI_LIST_PREPEND(ret, build_guest_fsinfo(mount, &local_err)); + if (local_err) { + error_propagate(errp, local_err); + qapi_free_GuestFilesystemInfoList(ret); + ret = NULL; + break; + } + } + + free_fs_mount_list(&mounts); + return ret; +} +#endif /* CONFIG_FSFREEZE */ + +#if defined(CONFIG_FSTRIM) +/* + * Walk list of mounted file systems in the guest, and trim them. + */ +GuestFilesystemTrimResponse * +qmp_guest_fstrim(bool has_minimum, int64_t minimum, Error **errp) +{ + GuestFilesystemTrimResponse *response; + GuestFilesystemTrimResult *result; + int ret = 0; + FsMountList mounts; + struct FsMount *mount; + int fd; + struct fstrim_range r; + + slog("guest-fstrim called"); + + QTAILQ_INIT(&mounts); + if (!build_fs_mount_list(&mounts, errp)) { + return NULL; + } + + response = g_malloc0(sizeof(*response)); + + QTAILQ_FOREACH(mount, &mounts, next) { + result = g_malloc0(sizeof(*result)); + result->path = g_strdup(mount->dirname); + + QAPI_LIST_PREPEND(response->paths, result); + + fd = qga_open_cloexec(mount->dirname, O_RDONLY, 0); + if (fd == -1) { + result->error = g_strdup_printf("failed to open: %s", + strerror(errno)); + continue; + } + + /* We try to cull filesystems we know won't work in advance, but other + * filesystems may not implement fstrim for less obvious reasons. + * These will report EOPNOTSUPP; while in some other cases ENOTTY + * will be reported (e.g. CD-ROMs). + * Any other error means an unexpected error. + */ + r.start = 0; + r.len = -1; + r.minlen = has_minimum ? minimum : 0; + ret = ioctl(fd, FITRIM, &r); + if (ret == -1) { + if (errno == ENOTTY || errno == EOPNOTSUPP) { + result->error = g_strdup("trim not supported"); + } else { + result->error = g_strdup_printf("failed to trim: %s", + strerror(errno)); + } + close(fd); + continue; + } + + result->has_minimum = true; + result->minimum = r.minlen; + result->has_trimmed = true; + result->trimmed = r.len; + close(fd); + } + + free_fs_mount_list(&mounts); + return response; +} +#endif /* CONFIG_FSTRIM */ + +#define LINUX_SYS_STATE_FILE "/sys/power/state" +#define SUSPEND_SUPPORTED 0 +#define SUSPEND_NOT_SUPPORTED 1 + +typedef enum { + SUSPEND_MODE_DISK = 0, + SUSPEND_MODE_RAM = 1, + SUSPEND_MODE_HYBRID = 2, +} SuspendMode; + +/* + * Executes a command in a child process using g_spawn_sync, + * returning an int >= 0 representing the exit status of the + * process. + * + * If the program wasn't found in path, returns -1. + * + * If a problem happened when creating the child process, + * returns -1 and errp is set. + */ +static int run_process_child(const char *command[], Error **errp) +{ + int exit_status, spawn_flag; + GError *g_err = NULL; + bool success; + + spawn_flag = G_SPAWN_SEARCH_PATH | G_SPAWN_STDOUT_TO_DEV_NULL | + G_SPAWN_STDERR_TO_DEV_NULL; + + success = g_spawn_sync(NULL, (char **)command, NULL, spawn_flag, + NULL, NULL, NULL, NULL, + &exit_status, &g_err); + + if (success) { + return WEXITSTATUS(exit_status); + } + + if (g_err && (g_err->code != G_SPAWN_ERROR_NOENT)) { + error_setg(errp, "failed to create child process, error '%s'", + g_err->message); + } + + g_error_free(g_err); + return -1; +} + +static bool systemd_supports_mode(SuspendMode mode, Error **errp) +{ + const char *systemctl_args[3] = {"systemd-hibernate", "systemd-suspend", + "systemd-hybrid-sleep"}; + const char *cmd[4] = {"systemctl", "status", systemctl_args[mode], NULL}; + int status; + + status = run_process_child(cmd, errp); + + /* + * systemctl status uses LSB return codes so we can expect + * status > 0 and be ok. To assert if the guest has support + * for the selected suspend mode, status should be < 4. 4 is + * the code for unknown service status, the return value when + * the service does not exist. A common value is status = 3 + * (program is not running). + */ + if (status > 0 && status < 4) { + return true; + } + + return false; +} + +static void systemd_suspend(SuspendMode mode, Error **errp) +{ + Error *local_err = NULL; + const char *systemctl_args[3] = {"hibernate", "suspend", "hybrid-sleep"}; + const char *cmd[3] = {"systemctl", systemctl_args[mode], NULL}; + int status; + + status = run_process_child(cmd, &local_err); + + if (status == 0) { + return; + } + + if ((status == -1) && !local_err) { + error_setg(errp, "the helper program 'systemctl %s' was not found", + systemctl_args[mode]); + return; + } + + if (local_err) { + error_propagate(errp, local_err); + } else { + error_setg(errp, "the helper program 'systemctl %s' returned an " + "unexpected exit status code (%d)", + systemctl_args[mode], status); + } +} + +static bool pmutils_supports_mode(SuspendMode mode, Error **errp) +{ + Error *local_err = NULL; + const char *pmutils_args[3] = {"--hibernate", "--suspend", + "--suspend-hybrid"}; + const char *cmd[3] = {"pm-is-supported", pmutils_args[mode], NULL}; + int status; + + status = run_process_child(cmd, &local_err); + + if (status == SUSPEND_SUPPORTED) { + return true; + } + + if ((status == -1) && !local_err) { + return false; + } + + if (local_err) { + error_propagate(errp, local_err); + } else { + error_setg(errp, + "the helper program '%s' returned an unexpected exit" + " status code (%d)", "pm-is-supported", status); + } + + return false; +} + +static void pmutils_suspend(SuspendMode mode, Error **errp) +{ + Error *local_err = NULL; + const char *pmutils_binaries[3] = {"pm-hibernate", "pm-suspend", + "pm-suspend-hybrid"}; + const char *cmd[2] = {pmutils_binaries[mode], NULL}; + int status; + + status = run_process_child(cmd, &local_err); + + if (status == 0) { + return; + } + + if ((status == -1) && !local_err) { + error_setg(errp, "the helper program '%s' was not found", + pmutils_binaries[mode]); + return; + } + + if (local_err) { + error_propagate(errp, local_err); + } else { + error_setg(errp, + "the helper program '%s' returned an unexpected exit" + " status code (%d)", pmutils_binaries[mode], status); + } +} + +static bool linux_sys_state_supports_mode(SuspendMode mode, Error **errp) +{ + const char *sysfile_strs[3] = {"disk", "mem", NULL}; + const char *sysfile_str = sysfile_strs[mode]; + char buf[32]; /* hopefully big enough */ + int fd; + ssize_t ret; + + if (!sysfile_str) { + error_setg(errp, "unknown guest suspend mode"); + return false; + } + + fd = open(LINUX_SYS_STATE_FILE, O_RDONLY); + if (fd < 0) { + return false; + } + + ret = read(fd, buf, sizeof(buf) - 1); + close(fd); + if (ret <= 0) { + return false; + } + buf[ret] = '\0'; + + if (strstr(buf, sysfile_str)) { + return true; + } + return false; +} + +static void linux_sys_state_suspend(SuspendMode mode, Error **errp) +{ + g_autoptr(GError) local_gerr = NULL; + const char *sysfile_strs[3] = {"disk", "mem", NULL}; + const char *sysfile_str = sysfile_strs[mode]; + + if (!sysfile_str) { + error_setg(errp, "unknown guest suspend mode"); + return; + } + + if (!g_file_set_contents(LINUX_SYS_STATE_FILE, sysfile_str, + -1, &local_gerr)) { + error_setg(errp, "suspend: cannot write to '%s': %s", + LINUX_SYS_STATE_FILE, local_gerr->message); + return; + } +} + +static void guest_suspend(SuspendMode mode, Error **errp) +{ + Error *local_err = NULL; + bool mode_supported = false; + + if (systemd_supports_mode(mode, &local_err)) { + mode_supported = true; + systemd_suspend(mode, &local_err); + + if (!local_err) { + return; + } + } + + error_free(local_err); + local_err = NULL; + + if (pmutils_supports_mode(mode, &local_err)) { + mode_supported = true; + pmutils_suspend(mode, &local_err); + + if (!local_err) { + return; + } + } + + error_free(local_err); + local_err = NULL; + + if (linux_sys_state_supports_mode(mode, &local_err)) { + mode_supported = true; + linux_sys_state_suspend(mode, &local_err); + } + + if (!mode_supported) { + error_free(local_err); + error_setg(errp, + "the requested suspend mode is not supported by the guest"); + } else { + error_propagate(errp, local_err); + } +} + +void qmp_guest_suspend_disk(Error **errp) +{ + guest_suspend(SUSPEND_MODE_DISK, errp); +} + +void qmp_guest_suspend_ram(Error **errp) +{ + guest_suspend(SUSPEND_MODE_RAM, errp); +} + +void qmp_guest_suspend_hybrid(Error **errp) +{ + guest_suspend(SUSPEND_MODE_HYBRID, errp); +} + +/* Transfer online/offline status between @vcpu and the guest system. + * + * On input either @errp or *@errp must be NULL. + * + * In system-to-@vcpu direction, the following @vcpu fields are accessed: + * - R: vcpu->logical_id + * - W: vcpu->online + * - W: vcpu->can_offline + * + * In @vcpu-to-system direction, the following @vcpu fields are accessed: + * - R: vcpu->logical_id + * - R: vcpu->online + * + * Written members remain unmodified on error. + */ +static void transfer_vcpu(GuestLogicalProcessor *vcpu, bool sys2vcpu, + char *dirpath, Error **errp) +{ + int fd; + int res; + int dirfd; + static const char fn[] = "online"; + + dirfd = open(dirpath, O_RDONLY | O_DIRECTORY); + if (dirfd == -1) { + error_setg_errno(errp, errno, "open(\"%s\")", dirpath); + return; + } + + fd = openat(dirfd, fn, sys2vcpu ? O_RDONLY : O_RDWR); + if (fd == -1) { + if (errno != ENOENT) { + error_setg_errno(errp, errno, "open(\"%s/%s\")", dirpath, fn); + } else if (sys2vcpu) { + vcpu->online = true; + vcpu->can_offline = false; + } else if (!vcpu->online) { + error_setg(errp, "logical processor #%" PRId64 " can't be " + "offlined", vcpu->logical_id); + } /* otherwise pretend successful re-onlining */ + } else { + unsigned char status; + + res = pread(fd, &status, 1, 0); + if (res == -1) { + error_setg_errno(errp, errno, "pread(\"%s/%s\")", dirpath, fn); + } else if (res == 0) { + error_setg(errp, "pread(\"%s/%s\"): unexpected EOF", dirpath, + fn); + } else if (sys2vcpu) { + vcpu->online = (status != '0'); + vcpu->can_offline = true; + } else if (vcpu->online != (status != '0')) { + status = '0' + vcpu->online; + if (pwrite(fd, &status, 1, 0) == -1) { + error_setg_errno(errp, errno, "pwrite(\"%s/%s\")", dirpath, + fn); + } + } /* otherwise pretend successful re-(on|off)-lining */ + + res = close(fd); + g_assert(res == 0); + } + + res = close(dirfd); + g_assert(res == 0); +} + +GuestLogicalProcessorList *qmp_guest_get_vcpus(Error **errp) +{ + GuestLogicalProcessorList *head, **tail; + const char *cpu_dir = "/sys/devices/system/cpu"; + const gchar *line; + g_autoptr(GDir) cpu_gdir = NULL; + Error *local_err = NULL; + + head = NULL; + tail = &head; + cpu_gdir = g_dir_open(cpu_dir, 0, NULL); + + if (cpu_gdir == NULL) { + error_setg_errno(errp, errno, "failed to list entries: %s", cpu_dir); + return NULL; + } + + while (local_err == NULL && (line = g_dir_read_name(cpu_gdir)) != NULL) { + GuestLogicalProcessor *vcpu; + int64_t id; + if (sscanf(line, "cpu%" PRId64, &id)) { + g_autofree char *path = g_strdup_printf("/sys/devices/system/cpu/" + "cpu%" PRId64 "/", id); + vcpu = g_malloc0(sizeof *vcpu); + vcpu->logical_id = id; + vcpu->has_can_offline = true; /* lolspeak ftw */ + transfer_vcpu(vcpu, true, path, &local_err); + QAPI_LIST_APPEND(tail, vcpu); + } + } + + if (local_err == NULL) { + /* there's no guest with zero VCPUs */ + g_assert(head != NULL); + return head; + } + + qapi_free_GuestLogicalProcessorList(head); + error_propagate(errp, local_err); + return NULL; +} + +int64_t qmp_guest_set_vcpus(GuestLogicalProcessorList *vcpus, Error **errp) +{ + int64_t processed; + Error *local_err = NULL; + + processed = 0; + while (vcpus != NULL) { + char *path = g_strdup_printf("/sys/devices/system/cpu/cpu%" PRId64 "/", + vcpus->value->logical_id); + + transfer_vcpu(vcpus->value, false, path, &local_err); + g_free(path); + if (local_err != NULL) { + break; + } + ++processed; + vcpus = vcpus->next; + } + + if (local_err != NULL) { + if (processed == 0) { + error_propagate(errp, local_err); + } else { + error_free(local_err); + } + } + + return processed; +} + + +static void ga_read_sysfs_file(int dirfd, const char *pathname, char *buf, + int size, Error **errp) +{ + int fd; + int res; + + errno = 0; + fd = openat(dirfd, pathname, O_RDONLY); + if (fd == -1) { + error_setg_errno(errp, errno, "open sysfs file \"%s\"", pathname); + return; + } + + res = pread(fd, buf, size, 0); + if (res == -1) { + error_setg_errno(errp, errno, "pread sysfs file \"%s\"", pathname); + } else if (res == 0) { + error_setg(errp, "pread sysfs file \"%s\": unexpected EOF", pathname); + } + close(fd); +} + +static void ga_write_sysfs_file(int dirfd, const char *pathname, + const char *buf, int size, Error **errp) +{ + int fd; + + errno = 0; + fd = openat(dirfd, pathname, O_WRONLY); + if (fd == -1) { + error_setg_errno(errp, errno, "open sysfs file \"%s\"", pathname); + return; + } + + if (pwrite(fd, buf, size, 0) == -1) { + error_setg_errno(errp, errno, "pwrite sysfs file \"%s\"", pathname); + } + + close(fd); +} + +/* Transfer online/offline status between @mem_blk and the guest system. + * + * On input either @errp or *@errp must be NULL. + * + * In system-to-@mem_blk direction, the following @mem_blk fields are accessed: + * - R: mem_blk->phys_index + * - W: mem_blk->online + * - W: mem_blk->can_offline + * + * In @mem_blk-to-system direction, the following @mem_blk fields are accessed: + * - R: mem_blk->phys_index + * - R: mem_blk->online + *- R: mem_blk->can_offline + * Written members remain unmodified on error. + */ +static void transfer_memory_block(GuestMemoryBlock *mem_blk, bool sys2memblk, + GuestMemoryBlockResponse *result, + Error **errp) +{ + char *dirpath; + int dirfd; + char *status; + Error *local_err = NULL; + + if (!sys2memblk) { + DIR *dp; + + if (!result) { + error_setg(errp, "Internal error, 'result' should not be NULL"); + return; + } + errno = 0; + dp = opendir("/sys/devices/system/memory/"); + /* if there is no 'memory' directory in sysfs, + * we think this VM does not support online/offline memory block, + * any other solution? + */ + if (!dp) { + if (errno == ENOENT) { + result->response = + GUEST_MEMORY_BLOCK_RESPONSE_TYPE_OPERATION_NOT_SUPPORTED; + } + goto out1; + } + closedir(dp); + } + + dirpath = g_strdup_printf("/sys/devices/system/memory/memory%" PRId64 "/", + mem_blk->phys_index); + dirfd = open(dirpath, O_RDONLY | O_DIRECTORY); + if (dirfd == -1) { + if (sys2memblk) { + error_setg_errno(errp, errno, "open(\"%s\")", dirpath); + } else { + if (errno == ENOENT) { + result->response = GUEST_MEMORY_BLOCK_RESPONSE_TYPE_NOT_FOUND; + } else { + result->response = + GUEST_MEMORY_BLOCK_RESPONSE_TYPE_OPERATION_FAILED; + } + } + g_free(dirpath); + goto out1; + } + g_free(dirpath); + + status = g_malloc0(10); + ga_read_sysfs_file(dirfd, "state", status, 10, &local_err); + if (local_err) { + /* treat with sysfs file that not exist in old kernel */ + if (errno == ENOENT) { + error_free(local_err); + if (sys2memblk) { + mem_blk->online = true; + mem_blk->can_offline = false; + } else if (!mem_blk->online) { + result->response = + GUEST_MEMORY_BLOCK_RESPONSE_TYPE_OPERATION_NOT_SUPPORTED; + } + } else { + if (sys2memblk) { + error_propagate(errp, local_err); + } else { + error_free(local_err); + result->response = + GUEST_MEMORY_BLOCK_RESPONSE_TYPE_OPERATION_FAILED; + } + } + goto out2; + } + + if (sys2memblk) { + char removable = '0'; + + mem_blk->online = (strncmp(status, "online", 6) == 0); + + ga_read_sysfs_file(dirfd, "removable", &removable, 1, &local_err); + if (local_err) { + /* if no 'removable' file, it doesn't support offline mem blk */ + if (errno == ENOENT) { + error_free(local_err); + mem_blk->can_offline = false; + } else { + error_propagate(errp, local_err); + } + } else { + mem_blk->can_offline = (removable != '0'); + } + } else { + if (mem_blk->online != (strncmp(status, "online", 6) == 0)) { + const char *new_state = mem_blk->online ? "online" : "offline"; + + ga_write_sysfs_file(dirfd, "state", new_state, strlen(new_state), + &local_err); + if (local_err) { + error_free(local_err); + result->response = + GUEST_MEMORY_BLOCK_RESPONSE_TYPE_OPERATION_FAILED; + goto out2; + } + + result->response = GUEST_MEMORY_BLOCK_RESPONSE_TYPE_SUCCESS; + result->has_error_code = false; + } /* otherwise pretend successful re-(on|off)-lining */ + } + g_free(status); + close(dirfd); + return; + +out2: + g_free(status); + close(dirfd); +out1: + if (!sys2memblk) { + result->has_error_code = true; + result->error_code = errno; + } +} + +GuestMemoryBlockList *qmp_guest_get_memory_blocks(Error **errp) +{ + GuestMemoryBlockList *head, **tail; + Error *local_err = NULL; + struct dirent *de; + DIR *dp; + + head = NULL; + tail = &head; + + dp = opendir("/sys/devices/system/memory/"); + if (!dp) { + /* it's ok if this happens to be a system that doesn't expose + * memory blocks via sysfs, but otherwise we should report + * an error + */ + if (errno != ENOENT) { + error_setg_errno(errp, errno, "Can't open directory" + "\"/sys/devices/system/memory/\""); + } + return NULL; + } + + /* Note: the phys_index of memory block may be discontinuous, + * this is because a memblk is the unit of the Sparse Memory design, which + * allows discontinuous memory ranges (ex. NUMA), so here we should + * traverse the memory block directory. + */ + while ((de = readdir(dp)) != NULL) { + GuestMemoryBlock *mem_blk; + + if ((strncmp(de->d_name, "memory", 6) != 0) || + !(de->d_type & DT_DIR)) { + continue; + } + + mem_blk = g_malloc0(sizeof *mem_blk); + /* The d_name is "memoryXXX", phys_index is block id, same as XXX */ + mem_blk->phys_index = strtoul(&de->d_name[6], NULL, 10); + mem_blk->has_can_offline = true; /* lolspeak ftw */ + transfer_memory_block(mem_blk, true, NULL, &local_err); + if (local_err) { + break; + } + + QAPI_LIST_APPEND(tail, mem_blk); + } + + closedir(dp); + if (local_err == NULL) { + /* there's no guest with zero memory blocks */ + if (head == NULL) { + error_setg(errp, "guest reported zero memory blocks!"); + } + return head; + } + + qapi_free_GuestMemoryBlockList(head); + error_propagate(errp, local_err); + return NULL; +} + +GuestMemoryBlockResponseList * +qmp_guest_set_memory_blocks(GuestMemoryBlockList *mem_blks, Error **errp) +{ + GuestMemoryBlockResponseList *head, **tail; + Error *local_err = NULL; + + head = NULL; + tail = &head; + + while (mem_blks != NULL) { + GuestMemoryBlockResponse *result; + GuestMemoryBlock *current_mem_blk = mem_blks->value; + + result = g_malloc0(sizeof(*result)); + result->phys_index = current_mem_blk->phys_index; + transfer_memory_block(current_mem_blk, false, result, &local_err); + if (local_err) { /* should never happen */ + goto err; + } + + QAPI_LIST_APPEND(tail, result); + mem_blks = mem_blks->next; + } + + return head; +err: + qapi_free_GuestMemoryBlockResponseList(head); + error_propagate(errp, local_err); + return NULL; +} + +GuestMemoryBlockInfo *qmp_guest_get_memory_block_info(Error **errp) +{ + Error *local_err = NULL; + char *dirpath; + int dirfd; + char *buf; + GuestMemoryBlockInfo *info; + + dirpath = g_strdup_printf("/sys/devices/system/memory/"); + dirfd = open(dirpath, O_RDONLY | O_DIRECTORY); + if (dirfd == -1) { + error_setg_errno(errp, errno, "open(\"%s\")", dirpath); + g_free(dirpath); + return NULL; + } + g_free(dirpath); + + buf = g_malloc0(20); + ga_read_sysfs_file(dirfd, "block_size_bytes", buf, 20, &local_err); + close(dirfd); + if (local_err) { + g_free(buf); + error_propagate(errp, local_err); + return NULL; + } + + info = g_new0(GuestMemoryBlockInfo, 1); + info->size = strtol(buf, NULL, 16); /* the unit is bytes */ + + g_free(buf); + + return info; +} + +#define MAX_NAME_LEN 128 +static GuestDiskStatsInfoList *guest_get_diskstats(Error **errp) +{ + GuestDiskStatsInfoList *head = NULL, **tail = &head; + const char *diskstats = "/proc/diskstats"; + FILE *fp; + size_t n; + char *line = NULL; + + fp = fopen(diskstats, "r"); + if (fp == NULL) { + error_setg_errno(errp, errno, "open(\"%s\")", diskstats); + return NULL; + } + + while (getline(&line, &n, fp) != -1) { + g_autofree GuestDiskStatsInfo *diskstatinfo = NULL; + g_autofree GuestDiskStats *diskstat = NULL; + char dev_name[MAX_NAME_LEN]; + unsigned int ios_pgr, tot_ticks, rq_ticks, wr_ticks, dc_ticks, fl_ticks; + unsigned long rd_ios, rd_merges_or_rd_sec, rd_ticks_or_wr_sec, wr_ios; + unsigned long wr_merges, rd_sec_or_wr_ios, wr_sec; + unsigned long dc_ios, dc_merges, dc_sec, fl_ios; + unsigned int major, minor; + int i; + + i = sscanf(line, "%u %u %s %lu %lu %lu" + "%lu %lu %lu %lu %u %u %u %u" + "%lu %lu %lu %u %lu %u", + &major, &minor, dev_name, + &rd_ios, &rd_merges_or_rd_sec, &rd_sec_or_wr_ios, + &rd_ticks_or_wr_sec, &wr_ios, &wr_merges, &wr_sec, + &wr_ticks, &ios_pgr, &tot_ticks, &rq_ticks, + &dc_ios, &dc_merges, &dc_sec, &dc_ticks, + &fl_ios, &fl_ticks); + + if (i < 7) { + continue; + } + + diskstatinfo = g_new0(GuestDiskStatsInfo, 1); + diskstatinfo->name = g_strdup(dev_name); + diskstatinfo->major = major; + diskstatinfo->minor = minor; + + diskstat = g_new0(GuestDiskStats, 1); + if (i == 7) { + diskstat->has_read_ios = true; + diskstat->read_ios = rd_ios; + diskstat->has_read_sectors = true; + diskstat->read_sectors = rd_merges_or_rd_sec; + diskstat->has_write_ios = true; + diskstat->write_ios = rd_sec_or_wr_ios; + diskstat->has_write_sectors = true; + diskstat->write_sectors = rd_ticks_or_wr_sec; + } + if (i >= 14) { + diskstat->has_read_ios = true; + diskstat->read_ios = rd_ios; + diskstat->has_read_sectors = true; + diskstat->read_sectors = rd_sec_or_wr_ios; + diskstat->has_read_merges = true; + diskstat->read_merges = rd_merges_or_rd_sec; + diskstat->has_read_ticks = true; + diskstat->read_ticks = rd_ticks_or_wr_sec; + diskstat->has_write_ios = true; + diskstat->write_ios = wr_ios; + diskstat->has_write_sectors = true; + diskstat->write_sectors = wr_sec; + diskstat->has_write_merges = true; + diskstat->write_merges = wr_merges; + diskstat->has_write_ticks = true; + diskstat->write_ticks = wr_ticks; + diskstat->has_ios_pgr = true; + diskstat->ios_pgr = ios_pgr; + diskstat->has_total_ticks = true; + diskstat->total_ticks = tot_ticks; + diskstat->has_weight_ticks = true; + diskstat->weight_ticks = rq_ticks; + } + if (i >= 18) { + diskstat->has_discard_ios = true; + diskstat->discard_ios = dc_ios; + diskstat->has_discard_merges = true; + diskstat->discard_merges = dc_merges; + diskstat->has_discard_sectors = true; + diskstat->discard_sectors = dc_sec; + diskstat->has_discard_ticks = true; + diskstat->discard_ticks = dc_ticks; + } + if (i >= 20) { + diskstat->has_flush_ios = true; + diskstat->flush_ios = fl_ios; + diskstat->has_flush_ticks = true; + diskstat->flush_ticks = fl_ticks; + } + + diskstatinfo->stats = g_steal_pointer(&diskstat); + QAPI_LIST_APPEND(tail, diskstatinfo); + diskstatinfo = NULL; + } + free(line); + fclose(fp); + return head; +} + +GuestDiskStatsInfoList *qmp_guest_get_diskstats(Error **errp) +{ + return guest_get_diskstats(errp); +} + +GuestCpuStatsList *qmp_guest_get_cpustats(Error **errp) +{ + GuestCpuStatsList *head = NULL, **tail = &head; + const char *cpustats = "/proc/stat"; + int clk_tck = sysconf(_SC_CLK_TCK); + FILE *fp; + size_t n; + char *line = NULL; + + fp = fopen(cpustats, "r"); + if (fp == NULL) { + error_setg_errno(errp, errno, "open(\"%s\")", cpustats); + return NULL; + } + + while (getline(&line, &n, fp) != -1) { + GuestCpuStats *cpustat = NULL; + GuestLinuxCpuStats *linuxcpustat; + int i; + unsigned long user, system, idle, iowait, irq, softirq, steal, guest; + unsigned long nice, guest_nice; + char name[64]; + + i = sscanf(line, "%s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu", + name, &user, &nice, &system, &idle, &iowait, &irq, &softirq, + &steal, &guest, &guest_nice); + + /* drop "cpu 1 2 3 ...", get "cpuX 1 2 3 ..." only */ + if ((i == EOF) || strncmp(name, "cpu", 3) || (name[3] == '\0')) { + continue; + } + + if (i < 5) { + slog("Parsing cpu stat from %s failed, see \"man proc\"", cpustats); + break; + } + + cpustat = g_new0(GuestCpuStats, 1); + cpustat->type = GUEST_CPU_STATS_TYPE_LINUX; + + linuxcpustat = &cpustat->u.q_linux; + linuxcpustat->cpu = atoi(&name[3]); + linuxcpustat->user = user * 1000 / clk_tck; + linuxcpustat->nice = nice * 1000 / clk_tck; + linuxcpustat->system = system * 1000 / clk_tck; + linuxcpustat->idle = idle * 1000 / clk_tck; + + if (i > 5) { + linuxcpustat->has_iowait = true; + linuxcpustat->iowait = iowait * 1000 / clk_tck; + } + + if (i > 6) { + linuxcpustat->has_irq = true; + linuxcpustat->irq = irq * 1000 / clk_tck; + linuxcpustat->has_softirq = true; + linuxcpustat->softirq = softirq * 1000 / clk_tck; + } + + if (i > 8) { + linuxcpustat->has_steal = true; + linuxcpustat->steal = steal * 1000 / clk_tck; + } + + if (i > 9) { + linuxcpustat->has_guest = true; + linuxcpustat->guest = guest * 1000 / clk_tck; + } + + if (i > 10) { + linuxcpustat->has_guest = true; + linuxcpustat->guest = guest * 1000 / clk_tck; + linuxcpustat->has_guestnice = true; + linuxcpustat->guestnice = guest_nice * 1000 / clk_tck; + } + + QAPI_LIST_APPEND(tail, cpustat); + } + + free(line); + fclose(fp); + return head; +} + +static char *hexToIPAddress(const void *hexValue, int is_ipv6) +{ + if (is_ipv6) { + char addr[INET6_ADDRSTRLEN]; + struct in6_addr in6; + const char *hexStr = (const char *)hexValue; + int i; + + for (i = 0; i < 16; i++) { + sscanf(&hexStr[i * 2], "%02hhx", &in6.s6_addr[i]); + } + inet_ntop(AF_INET6, &in6, addr, INET6_ADDRSTRLEN); + + return g_strdup(addr); + } else { + unsigned int hexInt = *(unsigned int *)hexValue; + unsigned int byte1 = (hexInt >> 24) & 0xFF; + unsigned int byte2 = (hexInt >> 16) & 0xFF; + unsigned int byte3 = (hexInt >> 8) & 0xFF; + unsigned int byte4 = hexInt & 0xFF; + + return g_strdup_printf("%u.%u.%u.%u", byte4, byte3, byte2, byte1); + } +} + +GuestNetworkRouteList *qmp_guest_network_get_route(Error **errp) +{ + GuestNetworkRouteList *head = NULL, **tail = &head; + const char *routeFiles[] = {"/proc/net/route", "/proc/net/ipv6_route"}; + FILE *fp; + size_t n; + char *line = NULL; + int firstLine; + int is_ipv6; + int i; + + for (i = 0; i < 2; i++) { + firstLine = 1; + is_ipv6 = (i == 1); + fp = fopen(routeFiles[i], "r"); + if (fp == NULL) { + error_setg_errno(errp, errno, "open(\"%s\")", routeFiles[i]); + free(line); + continue; + } + + while (getline(&line, &n, fp) != -1) { + if (firstLine && !is_ipv6) { + firstLine = 0; + continue; + } + GuestNetworkRoute *route = NULL; + GuestNetworkRoute *networkroute; + char Iface[IFNAMSIZ]; + if (is_ipv6) { + char Destination[33], Source[33], NextHop[33]; + int DesPrefixlen, SrcPrefixlen, Metric, RefCnt, Use, Flags; + + /* Parse the line and extract the values */ + if (sscanf(line, "%32s %x %32s %x %32s %x %x %x %x %s", + Destination, &DesPrefixlen, Source, + &SrcPrefixlen, NextHop, &Metric, &RefCnt, + &Use, &Flags, Iface) != 10) { + continue; + } + + route = g_new0(GuestNetworkRoute, 1); + networkroute = route; + networkroute->iface = g_strdup(Iface); + networkroute->destination = hexToIPAddress(Destination, 1); + networkroute->metric = Metric; + networkroute->source = hexToIPAddress(Source, 1); + networkroute->desprefixlen = g_strdup_printf( + "%d", DesPrefixlen + ); + networkroute->srcprefixlen = g_strdup_printf( + "%d", SrcPrefixlen + ); + networkroute->nexthop = hexToIPAddress(NextHop, 1); + networkroute->has_flags = true; + networkroute->flags = Flags; + networkroute->has_refcnt = true; + networkroute->refcnt = RefCnt; + networkroute->has_use = true; + networkroute->use = Use; + networkroute->version = 6; + } else { + unsigned int Destination, Gateway, Mask, Flags; + int RefCnt, Use, Metric, MTU, Window, IRTT; + + /* Parse the line and extract the values */ + if (sscanf(line, "%s %X %X %x %d %d %d %X %d %d %d", + Iface, &Destination, &Gateway, &Flags, &RefCnt, + &Use, &Metric, &Mask, &MTU, &Window, &IRTT) != 11) { + continue; + } + + route = g_new0(GuestNetworkRoute, 1); + networkroute = route; + networkroute->iface = g_strdup(Iface); + networkroute->destination = hexToIPAddress(&Destination, 0); + networkroute->gateway = hexToIPAddress(&Gateway, 0); + networkroute->mask = hexToIPAddress(&Mask, 0); + networkroute->metric = Metric; + networkroute->has_flags = true; + networkroute->flags = Flags; + networkroute->has_refcnt = true; + networkroute->refcnt = RefCnt; + networkroute->has_use = true; + networkroute->use = Use; + networkroute->has_mtu = true; + networkroute->mtu = MTU; + networkroute->has_window = true; + networkroute->window = Window; + networkroute->has_irtt = true; + networkroute->irtt = IRTT; + networkroute->version = 4; + } + + QAPI_LIST_APPEND(tail, route); + } + + free(line); + fclose(fp); + } + + return head; +} diff --git a/qga/commands-posix.c b/qga/commands-posix.c index 7f05996495..c2bd0b4316 100644 --- a/qga/commands-posix.c +++ b/qga/commands-posix.c @@ -24,23 +24,12 @@ #include "qemu/base64.h" #include "qemu/cutils.h" #include "commands-common.h" -#include "block/nvme.h" #include "cutils.h" #ifdef HAVE_UTMPX #include <utmpx.h> #endif -#if defined(__linux__) -#include <mntent.h> -#include <sys/statvfs.h> -#include <linux/nvme_ioctl.h> - -#ifdef CONFIG_LIBUDEV -#include <libudev.h> -#endif -#endif - #ifdef HAVE_GETIFADDRS #include <arpa/inet.h> #include <sys/socket.h> @@ -59,7 +48,7 @@ #endif #endif -static void ga_wait_child(pid_t pid, int *status, Error **errp) +static bool ga_wait_child(pid_t pid, int *status, Error **errp) { pid_t rpid; @@ -70,10 +59,11 @@ static void ga_wait_child(pid_t pid, int *status, Error **errp) if (rpid == -1) { error_setg_errno(errp, errno, "failed to wait for child (pid: %d)", pid); - return; + return false; } g_assert(rpid == pid); + return true; } static ssize_t ga_pipe_read_str(int fd[2], char **str) @@ -178,8 +168,7 @@ static int ga_run_command(const char *argv[], const char *in_str, goto out; } - ga_wait_child(pid, &status, errp); - if (*errp) { + if (!ga_wait_child(pid, &status, errp)) { goto out; } @@ -842,1308 +831,6 @@ static void guest_fsfreeze_cleanup(void) } #endif -/* linux-specific implementations. avoid this if at all possible. */ -#if defined(__linux__) -#if defined(CONFIG_FSFREEZE) - -static char *get_pci_driver(char const *syspath, int pathlen, Error **errp) -{ - char *path; - char *dpath; - char *driver = NULL; - char buf[PATH_MAX]; - ssize_t len; - - path = g_strndup(syspath, pathlen); - dpath = g_strdup_printf("%s/driver", path); - len = readlink(dpath, buf, sizeof(buf) - 1); - if (len != -1) { - buf[len] = 0; - driver = g_path_get_basename(buf); - } - g_free(dpath); - g_free(path); - return driver; -} - -static int compare_uint(const void *_a, const void *_b) -{ - unsigned int a = *(unsigned int *)_a; - unsigned int b = *(unsigned int *)_b; - - return a < b ? -1 : a > b ? 1 : 0; -} - -/* Walk the specified sysfs and build a sorted list of host or ata numbers */ -static int build_hosts(char const *syspath, char const *host, bool ata, - unsigned int *hosts, int hosts_max, Error **errp) -{ - char *path; - DIR *dir; - struct dirent *entry; - int i = 0; - - path = g_strndup(syspath, host - syspath); - dir = opendir(path); - if (!dir) { - error_setg_errno(errp, errno, "opendir(\"%s\")", path); - g_free(path); - return -1; - } - - while (i < hosts_max) { - entry = readdir(dir); - if (!entry) { - break; - } - if (ata && sscanf(entry->d_name, "ata%d", hosts + i) == 1) { - ++i; - } else if (!ata && sscanf(entry->d_name, "host%d", hosts + i) == 1) { - ++i; - } - } - - qsort(hosts, i, sizeof(hosts[0]), compare_uint); - - g_free(path); - closedir(dir); - return i; -} - -/* - * Store disk device info for devices on the PCI bus. - * Returns true if information has been stored, or false for failure. - */ -static bool build_guest_fsinfo_for_pci_dev(char const *syspath, - GuestDiskAddress *disk, - Error **errp) -{ - unsigned int pci[4], host, hosts[8], tgt[3]; - int i, nhosts = 0, pcilen; - GuestPCIAddress *pciaddr = disk->pci_controller; - bool has_ata = false, has_host = false, has_tgt = false; - char *p, *q, *driver = NULL; - bool ret = false; - - p = strstr(syspath, "/devices/pci"); - if (!p || sscanf(p + 12, "%*x:%*x/%x:%x:%x.%x%n", - pci, pci + 1, pci + 2, pci + 3, &pcilen) < 4) { - g_debug("only pci device is supported: sysfs path '%s'", syspath); - return false; - } - - p += 12 + pcilen; - while (true) { - driver = get_pci_driver(syspath, p - syspath, errp); - if (driver && (g_str_equal(driver, "ata_piix") || - g_str_equal(driver, "sym53c8xx") || - g_str_equal(driver, "virtio-pci") || - g_str_equal(driver, "ahci") || - g_str_equal(driver, "nvme") || - g_str_equal(driver, "xhci_hcd") || - g_str_equal(driver, "ehci-pci"))) { - break; - } - - g_free(driver); - if (sscanf(p, "/%x:%x:%x.%x%n", - pci, pci + 1, pci + 2, pci + 3, &pcilen) == 4) { - p += pcilen; - continue; - } - - g_debug("unsupported driver or sysfs path '%s'", syspath); - return false; - } - - p = strstr(syspath, "/target"); - if (p && sscanf(p + 7, "%*u:%*u:%*u/%*u:%u:%u:%u", - tgt, tgt + 1, tgt + 2) == 3) { - has_tgt = true; - } - - p = strstr(syspath, "/ata"); - if (p) { - q = p + 4; - has_ata = true; - } else { - p = strstr(syspath, "/host"); - q = p + 5; - } - if (p && sscanf(q, "%u", &host) == 1) { - has_host = true; - nhosts = build_hosts(syspath, p, has_ata, hosts, - ARRAY_SIZE(hosts), errp); - if (nhosts < 0) { - goto cleanup; - } - } - - pciaddr->domain = pci[0]; - pciaddr->bus = pci[1]; - pciaddr->slot = pci[2]; - pciaddr->function = pci[3]; - - if (strcmp(driver, "ata_piix") == 0) { - /* a host per ide bus, target*:0:<unit>:0 */ - if (!has_host || !has_tgt) { - g_debug("invalid sysfs path '%s' (driver '%s')", syspath, driver); - goto cleanup; - } - for (i = 0; i < nhosts; i++) { - if (host == hosts[i]) { - disk->bus_type = GUEST_DISK_BUS_TYPE_IDE; - disk->bus = i; - disk->unit = tgt[1]; - break; - } - } - if (i >= nhosts) { - g_debug("no host for '%s' (driver '%s')", syspath, driver); - goto cleanup; - } - } else if (strcmp(driver, "sym53c8xx") == 0) { - /* scsi(LSI Logic): target*:0:<unit>:0 */ - if (!has_tgt) { - g_debug("invalid sysfs path '%s' (driver '%s')", syspath, driver); - goto cleanup; - } - disk->bus_type = GUEST_DISK_BUS_TYPE_SCSI; - disk->unit = tgt[1]; - } else if (strcmp(driver, "virtio-pci") == 0) { - if (has_tgt) { - /* virtio-scsi: target*:0:0:<unit> */ - disk->bus_type = GUEST_DISK_BUS_TYPE_SCSI; - disk->unit = tgt[2]; - } else { - /* virtio-blk: 1 disk per 1 device */ - disk->bus_type = GUEST_DISK_BUS_TYPE_VIRTIO; - } - } else if (strcmp(driver, "ahci") == 0) { - /* ahci: 1 host per 1 unit */ - if (!has_host || !has_tgt) { - g_debug("invalid sysfs path '%s' (driver '%s')", syspath, driver); - goto cleanup; - } - for (i = 0; i < nhosts; i++) { - if (host == hosts[i]) { - disk->unit = i; - disk->bus_type = GUEST_DISK_BUS_TYPE_SATA; - break; - } - } - if (i >= nhosts) { - g_debug("no host for '%s' (driver '%s')", syspath, driver); - goto cleanup; - } - } else if (strcmp(driver, "nvme") == 0) { - disk->bus_type = GUEST_DISK_BUS_TYPE_NVME; - } else if (strcmp(driver, "ehci-pci") == 0 || strcmp(driver, "xhci_hcd") == 0) { - disk->bus_type = GUEST_DISK_BUS_TYPE_USB; - } else { - g_debug("unknown driver '%s' (sysfs path '%s')", driver, syspath); - goto cleanup; - } - - ret = true; - -cleanup: - g_free(driver); - return ret; -} - -/* - * Store disk device info for non-PCI virtio devices (for example s390x - * channel I/O devices). Returns true if information has been stored, or - * false for failure. - */ -static bool build_guest_fsinfo_for_nonpci_virtio(char const *syspath, - GuestDiskAddress *disk, - Error **errp) -{ - unsigned int tgt[3]; - char *p; - - if (!strstr(syspath, "/virtio") || !strstr(syspath, "/block")) { - g_debug("Unsupported virtio device '%s'", syspath); - return false; - } - - p = strstr(syspath, "/target"); - if (p && sscanf(p + 7, "%*u:%*u:%*u/%*u:%u:%u:%u", - &tgt[0], &tgt[1], &tgt[2]) == 3) { - /* virtio-scsi: target*:0:<target>:<unit> */ - disk->bus_type = GUEST_DISK_BUS_TYPE_SCSI; - disk->bus = tgt[0]; - disk->target = tgt[1]; - disk->unit = tgt[2]; - } else { - /* virtio-blk: 1 disk per 1 device */ - disk->bus_type = GUEST_DISK_BUS_TYPE_VIRTIO; - } - - return true; -} - -/* - * Store disk device info for CCW devices (s390x channel I/O devices). - * Returns true if information has been stored, or false for failure. - */ -static bool build_guest_fsinfo_for_ccw_dev(char const *syspath, - GuestDiskAddress *disk, - Error **errp) -{ - unsigned int cssid, ssid, subchno, devno; - char *p; - - p = strstr(syspath, "/devices/css"); - if (!p || sscanf(p + 12, "%*x/%x.%x.%x/%*x.%*x.%x/", - &cssid, &ssid, &subchno, &devno) < 4) { - g_debug("could not parse ccw device sysfs path: %s", syspath); - return false; - } - - disk->ccw_address = g_new0(GuestCCWAddress, 1); - disk->ccw_address->cssid = cssid; - disk->ccw_address->ssid = ssid; - disk->ccw_address->subchno = subchno; - disk->ccw_address->devno = devno; - - if (strstr(p, "/virtio")) { - build_guest_fsinfo_for_nonpci_virtio(syspath, disk, errp); - } - - return true; -} - -/* Store disk device info specified by @sysfs into @fs */ -static void build_guest_fsinfo_for_real_device(char const *syspath, - GuestFilesystemInfo *fs, - Error **errp) -{ - GuestDiskAddress *disk; - GuestPCIAddress *pciaddr; - bool has_hwinf; -#ifdef CONFIG_LIBUDEV - struct udev *udev = NULL; - struct udev_device *udevice = NULL; -#endif - - pciaddr = g_new0(GuestPCIAddress, 1); - pciaddr->domain = -1; /* -1 means field is invalid */ - pciaddr->bus = -1; - pciaddr->slot = -1; - pciaddr->function = -1; - - disk = g_new0(GuestDiskAddress, 1); - disk->pci_controller = pciaddr; - disk->bus_type = GUEST_DISK_BUS_TYPE_UNKNOWN; - -#ifdef CONFIG_LIBUDEV - udev = udev_new(); - udevice = udev_device_new_from_syspath(udev, syspath); - if (udev == NULL || udevice == NULL) { - g_debug("failed to query udev"); - } else { - const char *devnode, *serial; - devnode = udev_device_get_devnode(udevice); - if (devnode != NULL) { - disk->dev = g_strdup(devnode); - } - serial = udev_device_get_property_value(udevice, "ID_SERIAL"); - if (serial != NULL && *serial != 0) { - disk->serial = g_strdup(serial); - } - } - - udev_unref(udev); - udev_device_unref(udevice); -#endif - - if (strstr(syspath, "/devices/pci")) { - has_hwinf = build_guest_fsinfo_for_pci_dev(syspath, disk, errp); - } else if (strstr(syspath, "/devices/css")) { - has_hwinf = build_guest_fsinfo_for_ccw_dev(syspath, disk, errp); - } else if (strstr(syspath, "/virtio")) { - has_hwinf = build_guest_fsinfo_for_nonpci_virtio(syspath, disk, errp); - } else { - g_debug("Unsupported device type for '%s'", syspath); - has_hwinf = false; - } - - if (has_hwinf || disk->dev || disk->serial) { - QAPI_LIST_PREPEND(fs->disk, disk); - } else { - qapi_free_GuestDiskAddress(disk); - } -} - -static void build_guest_fsinfo_for_device(char const *devpath, - GuestFilesystemInfo *fs, - Error **errp); - -/* Store a list of slave devices of virtual volume specified by @syspath into - * @fs */ -static void build_guest_fsinfo_for_virtual_device(char const *syspath, - GuestFilesystemInfo *fs, - Error **errp) -{ - Error *err = NULL; - DIR *dir; - char *dirpath; - struct dirent *entry; - - dirpath = g_strdup_printf("%s/slaves", syspath); - dir = opendir(dirpath); - if (!dir) { - if (errno != ENOENT) { - error_setg_errno(errp, errno, "opendir(\"%s\")", dirpath); - } - g_free(dirpath); - return; - } - - for (;;) { - errno = 0; - entry = readdir(dir); - if (entry == NULL) { - if (errno) { - error_setg_errno(errp, errno, "readdir(\"%s\")", dirpath); - } - break; - } - - if (entry->d_type == DT_LNK) { - char *path; - - g_debug(" slave device '%s'", entry->d_name); - path = g_strdup_printf("%s/slaves/%s", syspath, entry->d_name); - build_guest_fsinfo_for_device(path, fs, &err); - g_free(path); - - if (err) { - error_propagate(errp, err); - break; - } - } - } - - g_free(dirpath); - closedir(dir); -} - -static bool is_disk_virtual(const char *devpath, Error **errp) -{ - g_autofree char *syspath = realpath(devpath, NULL); - - if (!syspath) { - error_setg_errno(errp, errno, "realpath(\"%s\")", devpath); - return false; - } - return strstr(syspath, "/devices/virtual/block/") != NULL; -} - -/* Dispatch to functions for virtual/real device */ -static void build_guest_fsinfo_for_device(char const *devpath, - GuestFilesystemInfo *fs, - Error **errp) -{ - ERRP_GUARD(); - g_autofree char *syspath = NULL; - bool is_virtual = false; - - syspath = realpath(devpath, NULL); - if (!syspath) { - if (errno != ENOENT) { - error_setg_errno(errp, errno, "realpath(\"%s\")", devpath); - return; - } - - /* ENOENT: This devpath may not exist because of container config */ - if (!fs->name) { - fs->name = g_path_get_basename(devpath); - } - return; - } - - if (!fs->name) { - fs->name = g_path_get_basename(syspath); - } - - g_debug(" parse sysfs path '%s'", syspath); - is_virtual = is_disk_virtual(syspath, errp); - if (*errp != NULL) { - return; - } - if (is_virtual) { - build_guest_fsinfo_for_virtual_device(syspath, fs, errp); - } else { - build_guest_fsinfo_for_real_device(syspath, fs, errp); - } -} - -#ifdef CONFIG_LIBUDEV - -/* - * Wrapper around build_guest_fsinfo_for_device() for getting just - * the disk address. - */ -static GuestDiskAddress *get_disk_address(const char *syspath, Error **errp) -{ - g_autoptr(GuestFilesystemInfo) fs = NULL; - - fs = g_new0(GuestFilesystemInfo, 1); - build_guest_fsinfo_for_device(syspath, fs, errp); - if (fs->disk != NULL) { - return g_steal_pointer(&fs->disk->value); - } - return NULL; -} - -static char *get_alias_for_syspath(const char *syspath) -{ - struct udev *udev = NULL; - struct udev_device *udevice = NULL; - char *ret = NULL; - - udev = udev_new(); - if (udev == NULL) { - g_debug("failed to query udev"); - goto out; - } - udevice = udev_device_new_from_syspath(udev, syspath); - if (udevice == NULL) { - g_debug("failed to query udev for path: %s", syspath); - goto out; - } else { - const char *alias = udev_device_get_property_value( - udevice, "DM_NAME"); - /* - * NULL means there was an error and empty string means there is no - * alias. In case of no alias we return NULL instead of empty string. - */ - if (alias == NULL) { - g_debug("failed to query udev for device alias for: %s", - syspath); - } else if (*alias != 0) { - ret = g_strdup(alias); - } - } - -out: - udev_unref(udev); - udev_device_unref(udevice); - return ret; -} - -static char *get_device_for_syspath(const char *syspath) -{ - struct udev *udev = NULL; - struct udev_device *udevice = NULL; - char *ret = NULL; - - udev = udev_new(); - if (udev == NULL) { - g_debug("failed to query udev"); - goto out; - } - udevice = udev_device_new_from_syspath(udev, syspath); - if (udevice == NULL) { - g_debug("failed to query udev for path: %s", syspath); - goto out; - } else { - ret = g_strdup(udev_device_get_devnode(udevice)); - } - -out: - udev_unref(udev); - udev_device_unref(udevice); - return ret; -} - -static void get_disk_deps(const char *disk_dir, GuestDiskInfo *disk) -{ - g_autofree char *deps_dir = NULL; - const gchar *dep; - GDir *dp_deps = NULL; - - /* List dependent disks */ - deps_dir = g_strdup_printf("%s/slaves", disk_dir); - g_debug(" listing entries in: %s", deps_dir); - dp_deps = g_dir_open(deps_dir, 0, NULL); - if (dp_deps == NULL) { - g_debug("failed to list entries in %s", deps_dir); - return; - } - disk->has_dependencies = true; - while ((dep = g_dir_read_name(dp_deps)) != NULL) { - g_autofree char *dep_dir = NULL; - char *dev_name; - - /* Add dependent disks */ - dep_dir = g_strdup_printf("%s/%s", deps_dir, dep); - dev_name = get_device_for_syspath(dep_dir); - if (dev_name != NULL) { - g_debug(" adding dependent device: %s", dev_name); - QAPI_LIST_PREPEND(disk->dependencies, dev_name); - } - } - g_dir_close(dp_deps); -} - -/* - * Detect partitions subdirectory, name is "<disk_name><number>" or - * "<disk_name>p<number>" - * - * @disk_name -- last component of /sys path (e.g. sda) - * @disk_dir -- sys path of the disk (e.g. /sys/block/sda) - * @disk_dev -- device node of the disk (e.g. /dev/sda) - */ -static GuestDiskInfoList *get_disk_partitions( - GuestDiskInfoList *list, - const char *disk_name, const char *disk_dir, - const char *disk_dev) -{ - GuestDiskInfoList *ret = list; - struct dirent *de_disk; - DIR *dp_disk = NULL; - size_t len = strlen(disk_name); - - dp_disk = opendir(disk_dir); - while ((de_disk = readdir(dp_disk)) != NULL) { - g_autofree char *partition_dir = NULL; - char *dev_name; - GuestDiskInfo *partition; - - if (!(de_disk->d_type & DT_DIR)) { - continue; - } - - if (!(strncmp(disk_name, de_disk->d_name, len) == 0 && - ((*(de_disk->d_name + len) == 'p' && - isdigit(*(de_disk->d_name + len + 1))) || - isdigit(*(de_disk->d_name + len))))) { - continue; - } - - partition_dir = g_strdup_printf("%s/%s", - disk_dir, de_disk->d_name); - dev_name = get_device_for_syspath(partition_dir); - if (dev_name == NULL) { - g_debug("Failed to get device name for syspath: %s", - disk_dir); - continue; - } - partition = g_new0(GuestDiskInfo, 1); - partition->name = dev_name; - partition->partition = true; - partition->has_dependencies = true; - /* Add parent disk as dependent for easier tracking of hierarchy */ - QAPI_LIST_PREPEND(partition->dependencies, g_strdup(disk_dev)); - - QAPI_LIST_PREPEND(ret, partition); - } - closedir(dp_disk); - - return ret; -} - -static void get_nvme_smart(GuestDiskInfo *disk) -{ - int fd; - GuestNVMeSmart *smart; - NvmeSmartLog log = {0}; - struct nvme_admin_cmd cmd = { - .opcode = NVME_ADM_CMD_GET_LOG_PAGE, - .nsid = NVME_NSID_BROADCAST, - .addr = (uintptr_t)&log, - .data_len = sizeof(log), - .cdw10 = NVME_LOG_SMART_INFO | (1 << 15) /* RAE bit */ - | (((sizeof(log) >> 2) - 1) << 16) - }; - - fd = qga_open_cloexec(disk->name, O_RDONLY, 0); - if (fd == -1) { - g_debug("Failed to open device: %s: %s", disk->name, g_strerror(errno)); - return; - } - - if (ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd)) { - g_debug("Failed to get smart: %s: %s", disk->name, g_strerror(errno)); - close(fd); - return; - } - - disk->smart = g_new0(GuestDiskSmart, 1); - disk->smart->type = GUEST_DISK_BUS_TYPE_NVME; - - smart = &disk->smart->u.nvme; - smart->critical_warning = log.critical_warning; - smart->temperature = lduw_le_p(&log.temperature); /* unaligned field */ - smart->available_spare = log.available_spare; - smart->available_spare_threshold = log.available_spare_threshold; - smart->percentage_used = log.percentage_used; - smart->data_units_read_lo = le64_to_cpu(log.data_units_read[0]); - smart->data_units_read_hi = le64_to_cpu(log.data_units_read[1]); - smart->data_units_written_lo = le64_to_cpu(log.data_units_written[0]); - smart->data_units_written_hi = le64_to_cpu(log.data_units_written[1]); - smart->host_read_commands_lo = le64_to_cpu(log.host_read_commands[0]); - smart->host_read_commands_hi = le64_to_cpu(log.host_read_commands[1]); - smart->host_write_commands_lo = le64_to_cpu(log.host_write_commands[0]); - smart->host_write_commands_hi = le64_to_cpu(log.host_write_commands[1]); - smart->controller_busy_time_lo = le64_to_cpu(log.controller_busy_time[0]); - smart->controller_busy_time_hi = le64_to_cpu(log.controller_busy_time[1]); - smart->power_cycles_lo = le64_to_cpu(log.power_cycles[0]); - smart->power_cycles_hi = le64_to_cpu(log.power_cycles[1]); - smart->power_on_hours_lo = le64_to_cpu(log.power_on_hours[0]); - smart->power_on_hours_hi = le64_to_cpu(log.power_on_hours[1]); - smart->unsafe_shutdowns_lo = le64_to_cpu(log.unsafe_shutdowns[0]); - smart->unsafe_shutdowns_hi = le64_to_cpu(log.unsafe_shutdowns[1]); - smart->media_errors_lo = le64_to_cpu(log.media_errors[0]); - smart->media_errors_hi = le64_to_cpu(log.media_errors[1]); - smart->number_of_error_log_entries_lo = - le64_to_cpu(log.number_of_error_log_entries[0]); - smart->number_of_error_log_entries_hi = - le64_to_cpu(log.number_of_error_log_entries[1]); - - close(fd); -} - -static void get_disk_smart(GuestDiskInfo *disk) -{ - if (disk->address - && (disk->address->bus_type == GUEST_DISK_BUS_TYPE_NVME)) { - get_nvme_smart(disk); - } -} - -GuestDiskInfoList *qmp_guest_get_disks(Error **errp) -{ - GuestDiskInfoList *ret = NULL; - GuestDiskInfo *disk; - DIR *dp = NULL; - struct dirent *de = NULL; - - g_debug("listing /sys/block directory"); - dp = opendir("/sys/block"); - if (dp == NULL) { - error_setg_errno(errp, errno, "Can't open directory \"/sys/block\""); - return NULL; - } - while ((de = readdir(dp)) != NULL) { - g_autofree char *disk_dir = NULL, *line = NULL, - *size_path = NULL; - char *dev_name; - Error *local_err = NULL; - if (de->d_type != DT_LNK) { - g_debug(" skipping entry: %s", de->d_name); - continue; - } - - /* Check size and skip zero-sized disks */ - g_debug(" checking disk size"); - size_path = g_strdup_printf("/sys/block/%s/size", de->d_name); - if (!g_file_get_contents(size_path, &line, NULL, NULL)) { - g_debug(" failed to read disk size"); - continue; - } - if (g_strcmp0(line, "0\n") == 0) { - g_debug(" skipping zero-sized disk"); - continue; - } - - g_debug(" adding %s", de->d_name); - disk_dir = g_strdup_printf("/sys/block/%s", de->d_name); - dev_name = get_device_for_syspath(disk_dir); - if (dev_name == NULL) { - g_debug("Failed to get device name for syspath: %s", - disk_dir); - continue; - } - disk = g_new0(GuestDiskInfo, 1); - disk->name = dev_name; - disk->partition = false; - disk->alias = get_alias_for_syspath(disk_dir); - QAPI_LIST_PREPEND(ret, disk); - - /* Get address for non-virtual devices */ - bool is_virtual = is_disk_virtual(disk_dir, &local_err); - if (local_err != NULL) { - g_debug(" failed to check disk path, ignoring error: %s", - error_get_pretty(local_err)); - error_free(local_err); - local_err = NULL; - /* Don't try to get the address */ - is_virtual = true; - } - if (!is_virtual) { - disk->address = get_disk_address(disk_dir, &local_err); - if (local_err != NULL) { - g_debug(" failed to get device info, ignoring error: %s", - error_get_pretty(local_err)); - error_free(local_err); - local_err = NULL; - } - } - - get_disk_deps(disk_dir, disk); - get_disk_smart(disk); - ret = get_disk_partitions(ret, de->d_name, disk_dir, dev_name); - } - - closedir(dp); - - return ret; -} - -#else - -GuestDiskInfoList *qmp_guest_get_disks(Error **errp) -{ - error_setg(errp, QERR_UNSUPPORTED); - return NULL; -} - -#endif - -/* Return a list of the disk device(s)' info which @mount lies on */ -static GuestFilesystemInfo *build_guest_fsinfo(struct FsMount *mount, - Error **errp) -{ - GuestFilesystemInfo *fs = g_malloc0(sizeof(*fs)); - struct statvfs buf; - unsigned long used, nonroot_total, fr_size; - char *devpath = g_strdup_printf("/sys/dev/block/%u:%u", - mount->devmajor, mount->devminor); - - fs->mountpoint = g_strdup(mount->dirname); - fs->type = g_strdup(mount->devtype); - build_guest_fsinfo_for_device(devpath, fs, errp); - - if (statvfs(fs->mountpoint, &buf) == 0) { - fr_size = buf.f_frsize; - used = buf.f_blocks - buf.f_bfree; - nonroot_total = used + buf.f_bavail; - fs->used_bytes = used * fr_size; - fs->total_bytes = nonroot_total * fr_size; - fs->total_bytes_privileged = buf.f_blocks * fr_size; - - fs->has_total_bytes = true; - fs->has_total_bytes_privileged = true; - fs->has_used_bytes = true; - } - - g_free(devpath); - - return fs; -} - -GuestFilesystemInfoList *qmp_guest_get_fsinfo(Error **errp) -{ - FsMountList mounts; - struct FsMount *mount; - GuestFilesystemInfoList *ret = NULL; - Error *local_err = NULL; - - QTAILQ_INIT(&mounts); - if (!build_fs_mount_list(&mounts, &local_err)) { - error_propagate(errp, local_err); - return NULL; - } - - QTAILQ_FOREACH(mount, &mounts, next) { - g_debug("Building guest fsinfo for '%s'", mount->dirname); - - QAPI_LIST_PREPEND(ret, build_guest_fsinfo(mount, &local_err)); - if (local_err) { - error_propagate(errp, local_err); - qapi_free_GuestFilesystemInfoList(ret); - ret = NULL; - break; - } - } - - free_fs_mount_list(&mounts); - return ret; -} -#endif /* CONFIG_FSFREEZE */ - -#if defined(CONFIG_FSTRIM) -/* - * Walk list of mounted file systems in the guest, and trim them. - */ -GuestFilesystemTrimResponse * -qmp_guest_fstrim(bool has_minimum, int64_t minimum, Error **errp) -{ - GuestFilesystemTrimResponse *response; - GuestFilesystemTrimResult *result; - int ret = 0; - FsMountList mounts; - struct FsMount *mount; - int fd; - struct fstrim_range r; - - slog("guest-fstrim called"); - - QTAILQ_INIT(&mounts); - if (!build_fs_mount_list(&mounts, errp)) { - return NULL; - } - - response = g_malloc0(sizeof(*response)); - - QTAILQ_FOREACH(mount, &mounts, next) { - result = g_malloc0(sizeof(*result)); - result->path = g_strdup(mount->dirname); - - QAPI_LIST_PREPEND(response->paths, result); - - fd = qga_open_cloexec(mount->dirname, O_RDONLY, 0); - if (fd == -1) { - result->error = g_strdup_printf("failed to open: %s", - strerror(errno)); - continue; - } - - /* We try to cull filesystems we know won't work in advance, but other - * filesystems may not implement fstrim for less obvious reasons. - * These will report EOPNOTSUPP; while in some other cases ENOTTY - * will be reported (e.g. CD-ROMs). - * Any other error means an unexpected error. - */ - r.start = 0; - r.len = -1; - r.minlen = has_minimum ? minimum : 0; - ret = ioctl(fd, FITRIM, &r); - if (ret == -1) { - if (errno == ENOTTY || errno == EOPNOTSUPP) { - result->error = g_strdup("trim not supported"); - } else { - result->error = g_strdup_printf("failed to trim: %s", - strerror(errno)); - } - close(fd); - continue; - } - - result->has_minimum = true; - result->minimum = r.minlen; - result->has_trimmed = true; - result->trimmed = r.len; - close(fd); - } - - free_fs_mount_list(&mounts); - return response; -} -#endif /* CONFIG_FSTRIM */ - - -#define LINUX_SYS_STATE_FILE "/sys/power/state" -#define SUSPEND_SUPPORTED 0 -#define SUSPEND_NOT_SUPPORTED 1 - -typedef enum { - SUSPEND_MODE_DISK = 0, - SUSPEND_MODE_RAM = 1, - SUSPEND_MODE_HYBRID = 2, -} SuspendMode; - -/* - * Executes a command in a child process using g_spawn_sync, - * returning an int >= 0 representing the exit status of the - * process. - * - * If the program wasn't found in path, returns -1. - * - * If a problem happened when creating the child process, - * returns -1 and errp is set. - */ -static int run_process_child(const char *command[], Error **errp) -{ - int exit_status, spawn_flag; - GError *g_err = NULL; - bool success; - - spawn_flag = G_SPAWN_SEARCH_PATH | G_SPAWN_STDOUT_TO_DEV_NULL | - G_SPAWN_STDERR_TO_DEV_NULL; - - success = g_spawn_sync(NULL, (char **)command, NULL, spawn_flag, - NULL, NULL, NULL, NULL, - &exit_status, &g_err); - - if (success) { - return WEXITSTATUS(exit_status); - } - - if (g_err && (g_err->code != G_SPAWN_ERROR_NOENT)) { - error_setg(errp, "failed to create child process, error '%s'", - g_err->message); - } - - g_error_free(g_err); - return -1; -} - -static bool systemd_supports_mode(SuspendMode mode, Error **errp) -{ - const char *systemctl_args[3] = {"systemd-hibernate", "systemd-suspend", - "systemd-hybrid-sleep"}; - const char *cmd[4] = {"systemctl", "status", systemctl_args[mode], NULL}; - int status; - - status = run_process_child(cmd, errp); - - /* - * systemctl status uses LSB return codes so we can expect - * status > 0 and be ok. To assert if the guest has support - * for the selected suspend mode, status should be < 4. 4 is - * the code for unknown service status, the return value when - * the service does not exist. A common value is status = 3 - * (program is not running). - */ - if (status > 0 && status < 4) { - return true; - } - - return false; -} - -static void systemd_suspend(SuspendMode mode, Error **errp) -{ - Error *local_err = NULL; - const char *systemctl_args[3] = {"hibernate", "suspend", "hybrid-sleep"}; - const char *cmd[3] = {"systemctl", systemctl_args[mode], NULL}; - int status; - - status = run_process_child(cmd, &local_err); - - if (status == 0) { - return; - } - - if ((status == -1) && !local_err) { - error_setg(errp, "the helper program 'systemctl %s' was not found", - systemctl_args[mode]); - return; - } - - if (local_err) { - error_propagate(errp, local_err); - } else { - error_setg(errp, "the helper program 'systemctl %s' returned an " - "unexpected exit status code (%d)", - systemctl_args[mode], status); - } -} - -static bool pmutils_supports_mode(SuspendMode mode, Error **errp) -{ - Error *local_err = NULL; - const char *pmutils_args[3] = {"--hibernate", "--suspend", - "--suspend-hybrid"}; - const char *cmd[3] = {"pm-is-supported", pmutils_args[mode], NULL}; - int status; - - status = run_process_child(cmd, &local_err); - - if (status == SUSPEND_SUPPORTED) { - return true; - } - - if ((status == -1) && !local_err) { - return false; - } - - if (local_err) { - error_propagate(errp, local_err); - } else { - error_setg(errp, - "the helper program '%s' returned an unexpected exit" - " status code (%d)", "pm-is-supported", status); - } - - return false; -} - -static void pmutils_suspend(SuspendMode mode, Error **errp) -{ - Error *local_err = NULL; - const char *pmutils_binaries[3] = {"pm-hibernate", "pm-suspend", - "pm-suspend-hybrid"}; - const char *cmd[2] = {pmutils_binaries[mode], NULL}; - int status; - - status = run_process_child(cmd, &local_err); - - if (status == 0) { - return; - } - - if ((status == -1) && !local_err) { - error_setg(errp, "the helper program '%s' was not found", - pmutils_binaries[mode]); - return; - } - - if (local_err) { - error_propagate(errp, local_err); - } else { - error_setg(errp, - "the helper program '%s' returned an unexpected exit" - " status code (%d)", pmutils_binaries[mode], status); - } -} - -static bool linux_sys_state_supports_mode(SuspendMode mode, Error **errp) -{ - const char *sysfile_strs[3] = {"disk", "mem", NULL}; - const char *sysfile_str = sysfile_strs[mode]; - char buf[32]; /* hopefully big enough */ - int fd; - ssize_t ret; - - if (!sysfile_str) { - error_setg(errp, "unknown guest suspend mode"); - return false; - } - - fd = open(LINUX_SYS_STATE_FILE, O_RDONLY); - if (fd < 0) { - return false; - } - - ret = read(fd, buf, sizeof(buf) - 1); - close(fd); - if (ret <= 0) { - return false; - } - buf[ret] = '\0'; - - if (strstr(buf, sysfile_str)) { - return true; - } - return false; -} - -static void linux_sys_state_suspend(SuspendMode mode, Error **errp) -{ - g_autoptr(GError) local_gerr = NULL; - const char *sysfile_strs[3] = {"disk", "mem", NULL}; - const char *sysfile_str = sysfile_strs[mode]; - - if (!sysfile_str) { - error_setg(errp, "unknown guest suspend mode"); - return; - } - - if (!g_file_set_contents(LINUX_SYS_STATE_FILE, sysfile_str, - -1, &local_gerr)) { - error_setg(errp, "suspend: cannot write to '%s': %s", - LINUX_SYS_STATE_FILE, local_gerr->message); - return; - } -} - -static void guest_suspend(SuspendMode mode, Error **errp) -{ - Error *local_err = NULL; - bool mode_supported = false; - - if (systemd_supports_mode(mode, &local_err)) { - mode_supported = true; - systemd_suspend(mode, &local_err); - - if (!local_err) { - return; - } - } - - error_free(local_err); - local_err = NULL; - - if (pmutils_supports_mode(mode, &local_err)) { - mode_supported = true; - pmutils_suspend(mode, &local_err); - - if (!local_err) { - return; - } - } - - error_free(local_err); - local_err = NULL; - - if (linux_sys_state_supports_mode(mode, &local_err)) { - mode_supported = true; - linux_sys_state_suspend(mode, &local_err); - } - - if (!mode_supported) { - error_free(local_err); - error_setg(errp, - "the requested suspend mode is not supported by the guest"); - } else { - error_propagate(errp, local_err); - } -} - -void qmp_guest_suspend_disk(Error **errp) -{ - guest_suspend(SUSPEND_MODE_DISK, errp); -} - -void qmp_guest_suspend_ram(Error **errp) -{ - guest_suspend(SUSPEND_MODE_RAM, errp); -} - -void qmp_guest_suspend_hybrid(Error **errp) -{ - guest_suspend(SUSPEND_MODE_HYBRID, errp); -} - -/* Transfer online/offline status between @vcpu and the guest system. - * - * On input either @errp or *@errp must be NULL. - * - * In system-to-@vcpu direction, the following @vcpu fields are accessed: - * - R: vcpu->logical_id - * - W: vcpu->online - * - W: vcpu->can_offline - * - * In @vcpu-to-system direction, the following @vcpu fields are accessed: - * - R: vcpu->logical_id - * - R: vcpu->online - * - * Written members remain unmodified on error. - */ -static void transfer_vcpu(GuestLogicalProcessor *vcpu, bool sys2vcpu, - char *dirpath, Error **errp) -{ - int fd; - int res; - int dirfd; - static const char fn[] = "online"; - - dirfd = open(dirpath, O_RDONLY | O_DIRECTORY); - if (dirfd == -1) { - error_setg_errno(errp, errno, "open(\"%s\")", dirpath); - return; - } - - fd = openat(dirfd, fn, sys2vcpu ? O_RDONLY : O_RDWR); - if (fd == -1) { - if (errno != ENOENT) { - error_setg_errno(errp, errno, "open(\"%s/%s\")", dirpath, fn); - } else if (sys2vcpu) { - vcpu->online = true; - vcpu->can_offline = false; - } else if (!vcpu->online) { - error_setg(errp, "logical processor #%" PRId64 " can't be " - "offlined", vcpu->logical_id); - } /* otherwise pretend successful re-onlining */ - } else { - unsigned char status; - - res = pread(fd, &status, 1, 0); - if (res == -1) { - error_setg_errno(errp, errno, "pread(\"%s/%s\")", dirpath, fn); - } else if (res == 0) { - error_setg(errp, "pread(\"%s/%s\"): unexpected EOF", dirpath, - fn); - } else if (sys2vcpu) { - vcpu->online = (status != '0'); - vcpu->can_offline = true; - } else if (vcpu->online != (status != '0')) { - status = '0' + vcpu->online; - if (pwrite(fd, &status, 1, 0) == -1) { - error_setg_errno(errp, errno, "pwrite(\"%s/%s\")", dirpath, - fn); - } - } /* otherwise pretend successful re-(on|off)-lining */ - - res = close(fd); - g_assert(res == 0); - } - - res = close(dirfd); - g_assert(res == 0); -} - -GuestLogicalProcessorList *qmp_guest_get_vcpus(Error **errp) -{ - GuestLogicalProcessorList *head, **tail; - const char *cpu_dir = "/sys/devices/system/cpu"; - const gchar *line; - g_autoptr(GDir) cpu_gdir = NULL; - Error *local_err = NULL; - - head = NULL; - tail = &head; - cpu_gdir = g_dir_open(cpu_dir, 0, NULL); - - if (cpu_gdir == NULL) { - error_setg_errno(errp, errno, "failed to list entries: %s", cpu_dir); - return NULL; - } - - while (local_err == NULL && (line = g_dir_read_name(cpu_gdir)) != NULL) { - GuestLogicalProcessor *vcpu; - int64_t id; - if (sscanf(line, "cpu%" PRId64, &id)) { - g_autofree char *path = g_strdup_printf("/sys/devices/system/cpu/" - "cpu%" PRId64 "/", id); - vcpu = g_malloc0(sizeof *vcpu); - vcpu->logical_id = id; - vcpu->has_can_offline = true; /* lolspeak ftw */ - transfer_vcpu(vcpu, true, path, &local_err); - QAPI_LIST_APPEND(tail, vcpu); - } - } - - if (local_err == NULL) { - /* there's no guest with zero VCPUs */ - g_assert(head != NULL); - return head; - } - - qapi_free_GuestLogicalProcessorList(head); - error_propagate(errp, local_err); - return NULL; -} - -int64_t qmp_guest_set_vcpus(GuestLogicalProcessorList *vcpus, Error **errp) -{ - int64_t processed; - Error *local_err = NULL; - - processed = 0; - while (vcpus != NULL) { - char *path = g_strdup_printf("/sys/devices/system/cpu/cpu%" PRId64 "/", - vcpus->value->logical_id); - - transfer_vcpu(vcpus->value, false, path, &local_err); - g_free(path); - if (local_err != NULL) { - break; - } - ++processed; - vcpus = vcpus->next; - } - - if (local_err != NULL) { - if (processed == 0) { - error_propagate(errp, local_err); - } else { - error_free(local_err); - } - } - - return processed; -} -#endif /* __linux__ */ - #if defined(__linux__) || defined(__FreeBSD__) void qmp_guest_set_user_password(const char *username, const char *password, @@ -2190,574 +877,8 @@ void qmp_guest_set_user_password(const char *username, return; } } -#else /* __linux__ || __FreeBSD__ */ -void qmp_guest_set_user_password(const char *username, - const char *password, - bool crypted, - Error **errp) -{ - error_setg(errp, QERR_UNSUPPORTED); -} #endif /* __linux__ || __FreeBSD__ */ -#ifdef __linux__ -static void ga_read_sysfs_file(int dirfd, const char *pathname, char *buf, - int size, Error **errp) -{ - int fd; - int res; - - errno = 0; - fd = openat(dirfd, pathname, O_RDONLY); - if (fd == -1) { - error_setg_errno(errp, errno, "open sysfs file \"%s\"", pathname); - return; - } - - res = pread(fd, buf, size, 0); - if (res == -1) { - error_setg_errno(errp, errno, "pread sysfs file \"%s\"", pathname); - } else if (res == 0) { - error_setg(errp, "pread sysfs file \"%s\": unexpected EOF", pathname); - } - close(fd); -} - -static void ga_write_sysfs_file(int dirfd, const char *pathname, - const char *buf, int size, Error **errp) -{ - int fd; - - errno = 0; - fd = openat(dirfd, pathname, O_WRONLY); - if (fd == -1) { - error_setg_errno(errp, errno, "open sysfs file \"%s\"", pathname); - return; - } - - if (pwrite(fd, buf, size, 0) == -1) { - error_setg_errno(errp, errno, "pwrite sysfs file \"%s\"", pathname); - } - - close(fd); -} - -/* Transfer online/offline status between @mem_blk and the guest system. - * - * On input either @errp or *@errp must be NULL. - * - * In system-to-@mem_blk direction, the following @mem_blk fields are accessed: - * - R: mem_blk->phys_index - * - W: mem_blk->online - * - W: mem_blk->can_offline - * - * In @mem_blk-to-system direction, the following @mem_blk fields are accessed: - * - R: mem_blk->phys_index - * - R: mem_blk->online - *- R: mem_blk->can_offline - * Written members remain unmodified on error. - */ -static void transfer_memory_block(GuestMemoryBlock *mem_blk, bool sys2memblk, - GuestMemoryBlockResponse *result, - Error **errp) -{ - char *dirpath; - int dirfd; - char *status; - Error *local_err = NULL; - - if (!sys2memblk) { - DIR *dp; - - if (!result) { - error_setg(errp, "Internal error, 'result' should not be NULL"); - return; - } - errno = 0; - dp = opendir("/sys/devices/system/memory/"); - /* if there is no 'memory' directory in sysfs, - * we think this VM does not support online/offline memory block, - * any other solution? - */ - if (!dp) { - if (errno == ENOENT) { - result->response = - GUEST_MEMORY_BLOCK_RESPONSE_TYPE_OPERATION_NOT_SUPPORTED; - } - goto out1; - } - closedir(dp); - } - - dirpath = g_strdup_printf("/sys/devices/system/memory/memory%" PRId64 "/", - mem_blk->phys_index); - dirfd = open(dirpath, O_RDONLY | O_DIRECTORY); - if (dirfd == -1) { - if (sys2memblk) { - error_setg_errno(errp, errno, "open(\"%s\")", dirpath); - } else { - if (errno == ENOENT) { - result->response = GUEST_MEMORY_BLOCK_RESPONSE_TYPE_NOT_FOUND; - } else { - result->response = - GUEST_MEMORY_BLOCK_RESPONSE_TYPE_OPERATION_FAILED; - } - } - g_free(dirpath); - goto out1; - } - g_free(dirpath); - - status = g_malloc0(10); - ga_read_sysfs_file(dirfd, "state", status, 10, &local_err); - if (local_err) { - /* treat with sysfs file that not exist in old kernel */ - if (errno == ENOENT) { - error_free(local_err); - if (sys2memblk) { - mem_blk->online = true; - mem_blk->can_offline = false; - } else if (!mem_blk->online) { - result->response = - GUEST_MEMORY_BLOCK_RESPONSE_TYPE_OPERATION_NOT_SUPPORTED; - } - } else { - if (sys2memblk) { - error_propagate(errp, local_err); - } else { - error_free(local_err); - result->response = - GUEST_MEMORY_BLOCK_RESPONSE_TYPE_OPERATION_FAILED; - } - } - goto out2; - } - - if (sys2memblk) { - char removable = '0'; - - mem_blk->online = (strncmp(status, "online", 6) == 0); - - ga_read_sysfs_file(dirfd, "removable", &removable, 1, &local_err); - if (local_err) { - /* if no 'removable' file, it doesn't support offline mem blk */ - if (errno == ENOENT) { - error_free(local_err); - mem_blk->can_offline = false; - } else { - error_propagate(errp, local_err); - } - } else { - mem_blk->can_offline = (removable != '0'); - } - } else { - if (mem_blk->online != (strncmp(status, "online", 6) == 0)) { - const char *new_state = mem_blk->online ? "online" : "offline"; - - ga_write_sysfs_file(dirfd, "state", new_state, strlen(new_state), - &local_err); - if (local_err) { - error_free(local_err); - result->response = - GUEST_MEMORY_BLOCK_RESPONSE_TYPE_OPERATION_FAILED; - goto out2; - } - - result->response = GUEST_MEMORY_BLOCK_RESPONSE_TYPE_SUCCESS; - result->has_error_code = false; - } /* otherwise pretend successful re-(on|off)-lining */ - } - g_free(status); - close(dirfd); - return; - -out2: - g_free(status); - close(dirfd); -out1: - if (!sys2memblk) { - result->has_error_code = true; - result->error_code = errno; - } -} - -GuestMemoryBlockList *qmp_guest_get_memory_blocks(Error **errp) -{ - GuestMemoryBlockList *head, **tail; - Error *local_err = NULL; - struct dirent *de; - DIR *dp; - - head = NULL; - tail = &head; - - dp = opendir("/sys/devices/system/memory/"); - if (!dp) { - /* it's ok if this happens to be a system that doesn't expose - * memory blocks via sysfs, but otherwise we should report - * an error - */ - if (errno != ENOENT) { - error_setg_errno(errp, errno, "Can't open directory" - "\"/sys/devices/system/memory/\""); - } - return NULL; - } - - /* Note: the phys_index of memory block may be discontinuous, - * this is because a memblk is the unit of the Sparse Memory design, which - * allows discontinuous memory ranges (ex. NUMA), so here we should - * traverse the memory block directory. - */ - while ((de = readdir(dp)) != NULL) { - GuestMemoryBlock *mem_blk; - - if ((strncmp(de->d_name, "memory", 6) != 0) || - !(de->d_type & DT_DIR)) { - continue; - } - - mem_blk = g_malloc0(sizeof *mem_blk); - /* The d_name is "memoryXXX", phys_index is block id, same as XXX */ - mem_blk->phys_index = strtoul(&de->d_name[6], NULL, 10); - mem_blk->has_can_offline = true; /* lolspeak ftw */ - transfer_memory_block(mem_blk, true, NULL, &local_err); - if (local_err) { - break; - } - - QAPI_LIST_APPEND(tail, mem_blk); - } - - closedir(dp); - if (local_err == NULL) { - /* there's no guest with zero memory blocks */ - if (head == NULL) { - error_setg(errp, "guest reported zero memory blocks!"); - } - return head; - } - - qapi_free_GuestMemoryBlockList(head); - error_propagate(errp, local_err); - return NULL; -} - -GuestMemoryBlockResponseList * -qmp_guest_set_memory_blocks(GuestMemoryBlockList *mem_blks, Error **errp) -{ - GuestMemoryBlockResponseList *head, **tail; - Error *local_err = NULL; - - head = NULL; - tail = &head; - - while (mem_blks != NULL) { - GuestMemoryBlockResponse *result; - GuestMemoryBlock *current_mem_blk = mem_blks->value; - - result = g_malloc0(sizeof(*result)); - result->phys_index = current_mem_blk->phys_index; - transfer_memory_block(current_mem_blk, false, result, &local_err); - if (local_err) { /* should never happen */ - goto err; - } - - QAPI_LIST_APPEND(tail, result); - mem_blks = mem_blks->next; - } - - return head; -err: - qapi_free_GuestMemoryBlockResponseList(head); - error_propagate(errp, local_err); - return NULL; -} - -GuestMemoryBlockInfo *qmp_guest_get_memory_block_info(Error **errp) -{ - Error *local_err = NULL; - char *dirpath; - int dirfd; - char *buf; - GuestMemoryBlockInfo *info; - - dirpath = g_strdup_printf("/sys/devices/system/memory/"); - dirfd = open(dirpath, O_RDONLY | O_DIRECTORY); - if (dirfd == -1) { - error_setg_errno(errp, errno, "open(\"%s\")", dirpath); - g_free(dirpath); - return NULL; - } - g_free(dirpath); - - buf = g_malloc0(20); - ga_read_sysfs_file(dirfd, "block_size_bytes", buf, 20, &local_err); - close(dirfd); - if (local_err) { - g_free(buf); - error_propagate(errp, local_err); - return NULL; - } - - info = g_new0(GuestMemoryBlockInfo, 1); - info->size = strtol(buf, NULL, 16); /* the unit is bytes */ - - g_free(buf); - - return info; -} - -#define MAX_NAME_LEN 128 -static GuestDiskStatsInfoList *guest_get_diskstats(Error **errp) -{ -#ifdef CONFIG_LINUX - GuestDiskStatsInfoList *head = NULL, **tail = &head; - const char *diskstats = "/proc/diskstats"; - FILE *fp; - size_t n; - char *line = NULL; - - fp = fopen(diskstats, "r"); - if (fp == NULL) { - error_setg_errno(errp, errno, "open(\"%s\")", diskstats); - return NULL; - } - - while (getline(&line, &n, fp) != -1) { - g_autofree GuestDiskStatsInfo *diskstatinfo = NULL; - g_autofree GuestDiskStats *diskstat = NULL; - char dev_name[MAX_NAME_LEN]; - unsigned int ios_pgr, tot_ticks, rq_ticks, wr_ticks, dc_ticks, fl_ticks; - unsigned long rd_ios, rd_merges_or_rd_sec, rd_ticks_or_wr_sec, wr_ios; - unsigned long wr_merges, rd_sec_or_wr_ios, wr_sec; - unsigned long dc_ios, dc_merges, dc_sec, fl_ios; - unsigned int major, minor; - int i; - - i = sscanf(line, "%u %u %s %lu %lu %lu" - "%lu %lu %lu %lu %u %u %u %u" - "%lu %lu %lu %u %lu %u", - &major, &minor, dev_name, - &rd_ios, &rd_merges_or_rd_sec, &rd_sec_or_wr_ios, - &rd_ticks_or_wr_sec, &wr_ios, &wr_merges, &wr_sec, - &wr_ticks, &ios_pgr, &tot_ticks, &rq_ticks, - &dc_ios, &dc_merges, &dc_sec, &dc_ticks, - &fl_ios, &fl_ticks); - - if (i < 7) { - continue; - } - - diskstatinfo = g_new0(GuestDiskStatsInfo, 1); - diskstatinfo->name = g_strdup(dev_name); - diskstatinfo->major = major; - diskstatinfo->minor = minor; - - diskstat = g_new0(GuestDiskStats, 1); - if (i == 7) { - diskstat->has_read_ios = true; - diskstat->read_ios = rd_ios; - diskstat->has_read_sectors = true; - diskstat->read_sectors = rd_merges_or_rd_sec; - diskstat->has_write_ios = true; - diskstat->write_ios = rd_sec_or_wr_ios; - diskstat->has_write_sectors = true; - diskstat->write_sectors = rd_ticks_or_wr_sec; - } - if (i >= 14) { - diskstat->has_read_ios = true; - diskstat->read_ios = rd_ios; - diskstat->has_read_sectors = true; - diskstat->read_sectors = rd_sec_or_wr_ios; - diskstat->has_read_merges = true; - diskstat->read_merges = rd_merges_or_rd_sec; - diskstat->has_read_ticks = true; - diskstat->read_ticks = rd_ticks_or_wr_sec; - diskstat->has_write_ios = true; - diskstat->write_ios = wr_ios; - diskstat->has_write_sectors = true; - diskstat->write_sectors = wr_sec; - diskstat->has_write_merges = true; - diskstat->write_merges = wr_merges; - diskstat->has_write_ticks = true; - diskstat->write_ticks = wr_ticks; - diskstat->has_ios_pgr = true; - diskstat->ios_pgr = ios_pgr; - diskstat->has_total_ticks = true; - diskstat->total_ticks = tot_ticks; - diskstat->has_weight_ticks = true; - diskstat->weight_ticks = rq_ticks; - } - if (i >= 18) { - diskstat->has_discard_ios = true; - diskstat->discard_ios = dc_ios; - diskstat->has_discard_merges = true; - diskstat->discard_merges = dc_merges; - diskstat->has_discard_sectors = true; - diskstat->discard_sectors = dc_sec; - diskstat->has_discard_ticks = true; - diskstat->discard_ticks = dc_ticks; - } - if (i >= 20) { - diskstat->has_flush_ios = true; - diskstat->flush_ios = fl_ios; - diskstat->has_flush_ticks = true; - diskstat->flush_ticks = fl_ticks; - } - - diskstatinfo->stats = g_steal_pointer(&diskstat); - QAPI_LIST_APPEND(tail, diskstatinfo); - diskstatinfo = NULL; - } - free(line); - fclose(fp); - return head; -#else - g_debug("disk stats reporting available only for Linux"); - return NULL; -#endif -} - -GuestDiskStatsInfoList *qmp_guest_get_diskstats(Error **errp) -{ - return guest_get_diskstats(errp); -} - -GuestCpuStatsList *qmp_guest_get_cpustats(Error **errp) -{ - GuestCpuStatsList *head = NULL, **tail = &head; - const char *cpustats = "/proc/stat"; - int clk_tck = sysconf(_SC_CLK_TCK); - FILE *fp; - size_t n; - char *line = NULL; - - fp = fopen(cpustats, "r"); - if (fp == NULL) { - error_setg_errno(errp, errno, "open(\"%s\")", cpustats); - return NULL; - } - - while (getline(&line, &n, fp) != -1) { - GuestCpuStats *cpustat = NULL; - GuestLinuxCpuStats *linuxcpustat; - int i; - unsigned long user, system, idle, iowait, irq, softirq, steal, guest; - unsigned long nice, guest_nice; - char name[64]; - - i = sscanf(line, "%s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu", - name, &user, &nice, &system, &idle, &iowait, &irq, &softirq, - &steal, &guest, &guest_nice); - - /* drop "cpu 1 2 3 ...", get "cpuX 1 2 3 ..." only */ - if ((i == EOF) || strncmp(name, "cpu", 3) || (name[3] == '\0')) { - continue; - } - - if (i < 5) { - slog("Parsing cpu stat from %s failed, see \"man proc\"", cpustats); - break; - } - - cpustat = g_new0(GuestCpuStats, 1); - cpustat->type = GUEST_CPU_STATS_TYPE_LINUX; - - linuxcpustat = &cpustat->u.q_linux; - linuxcpustat->cpu = atoi(&name[3]); - linuxcpustat->user = user * 1000 / clk_tck; - linuxcpustat->nice = nice * 1000 / clk_tck; - linuxcpustat->system = system * 1000 / clk_tck; - linuxcpustat->idle = idle * 1000 / clk_tck; - - if (i > 5) { - linuxcpustat->has_iowait = true; - linuxcpustat->iowait = iowait * 1000 / clk_tck; - } - - if (i > 6) { - linuxcpustat->has_irq = true; - linuxcpustat->irq = irq * 1000 / clk_tck; - linuxcpustat->has_softirq = true; - linuxcpustat->softirq = softirq * 1000 / clk_tck; - } - - if (i > 8) { - linuxcpustat->has_steal = true; - linuxcpustat->steal = steal * 1000 / clk_tck; - } - - if (i > 9) { - linuxcpustat->has_guest = true; - linuxcpustat->guest = guest * 1000 / clk_tck; - } - - if (i > 10) { - linuxcpustat->has_guest = true; - linuxcpustat->guest = guest * 1000 / clk_tck; - linuxcpustat->has_guestnice = true; - linuxcpustat->guestnice = guest_nice * 1000 / clk_tck; - } - - QAPI_LIST_APPEND(tail, cpustat); - } - - free(line); - fclose(fp); - return head; -} - -#else /* defined(__linux__) */ - -void qmp_guest_suspend_disk(Error **errp) -{ - error_setg(errp, QERR_UNSUPPORTED); -} - -void qmp_guest_suspend_ram(Error **errp) -{ - error_setg(errp, QERR_UNSUPPORTED); -} - -void qmp_guest_suspend_hybrid(Error **errp) -{ - error_setg(errp, QERR_UNSUPPORTED); -} - -GuestLogicalProcessorList *qmp_guest_get_vcpus(Error **errp) -{ - error_setg(errp, QERR_UNSUPPORTED); - return NULL; -} - -int64_t qmp_guest_set_vcpus(GuestLogicalProcessorList *vcpus, Error **errp) -{ - error_setg(errp, QERR_UNSUPPORTED); - return -1; -} - -GuestMemoryBlockList *qmp_guest_get_memory_blocks(Error **errp) -{ - error_setg(errp, QERR_UNSUPPORTED); - return NULL; -} - -GuestMemoryBlockResponseList * -qmp_guest_set_memory_blocks(GuestMemoryBlockList *mem_blks, Error **errp) -{ - error_setg(errp, QERR_UNSUPPORTED); - return NULL; -} - -GuestMemoryBlockInfo *qmp_guest_get_memory_block_info(Error **errp) -{ - error_setg(errp, QERR_UNSUPPORTED); - return NULL; -} - -#endif - #ifdef HAVE_GETIFADDRS static GuestNetworkInterface * guest_find_interface(GuestNetworkInterfaceList *head, @@ -3013,131 +1134,8 @@ error: return NULL; } -#else - -GuestNetworkInterfaceList *qmp_guest_network_get_interfaces(Error **errp) -{ - error_setg(errp, QERR_UNSUPPORTED); - return NULL; -} - #endif /* HAVE_GETIFADDRS */ -#if !defined(CONFIG_FSFREEZE) - -GuestFilesystemInfoList *qmp_guest_get_fsinfo(Error **errp) -{ - error_setg(errp, QERR_UNSUPPORTED); - return NULL; -} - -GuestFsfreezeStatus qmp_guest_fsfreeze_status(Error **errp) -{ - error_setg(errp, QERR_UNSUPPORTED); - - return 0; -} - -int64_t qmp_guest_fsfreeze_freeze(Error **errp) -{ - error_setg(errp, QERR_UNSUPPORTED); - - return 0; -} - -int64_t qmp_guest_fsfreeze_freeze_list(bool has_mountpoints, - strList *mountpoints, - Error **errp) -{ - error_setg(errp, QERR_UNSUPPORTED); - - return 0; -} - -int64_t qmp_guest_fsfreeze_thaw(Error **errp) -{ - error_setg(errp, QERR_UNSUPPORTED); - - return 0; -} - -GuestDiskInfoList *qmp_guest_get_disks(Error **errp) -{ - error_setg(errp, QERR_UNSUPPORTED); - return NULL; -} - -GuestDiskStatsInfoList *qmp_guest_get_diskstats(Error **errp) -{ - error_setg(errp, QERR_UNSUPPORTED); - return NULL; -} - -GuestCpuStatsList *qmp_guest_get_cpustats(Error **errp) -{ - error_setg(errp, QERR_UNSUPPORTED); - return NULL; -} - -#endif /* CONFIG_FSFREEZE */ - -#if !defined(CONFIG_FSTRIM) -GuestFilesystemTrimResponse * -qmp_guest_fstrim(bool has_minimum, int64_t minimum, Error **errp) -{ - error_setg(errp, QERR_UNSUPPORTED); - return NULL; -} -#endif - -/* add unsupported commands to the list of blocked RPCs */ -GList *ga_command_init_blockedrpcs(GList *blockedrpcs) -{ -#if !defined(__linux__) - { - const char *list[] = { - "guest-suspend-disk", "guest-suspend-ram", - "guest-suspend-hybrid", "guest-get-vcpus", "guest-set-vcpus", - "guest-get-memory-blocks", "guest-set-memory-blocks", - "guest-get-memory-block-size", "guest-get-memory-block-info", - NULL}; - char **p = (char **)list; - - while (*p) { - blockedrpcs = g_list_append(blockedrpcs, g_strdup(*p++)); - } - } -#endif - -#if !defined(HAVE_GETIFADDRS) - blockedrpcs = g_list_append(blockedrpcs, - g_strdup("guest-network-get-interfaces")); -#endif - -#if !defined(CONFIG_FSFREEZE) - { - const char *list[] = { - "guest-get-fsinfo", "guest-fsfreeze-status", - "guest-fsfreeze-freeze", "guest-fsfreeze-freeze-list", - "guest-fsfreeze-thaw", "guest-get-fsinfo", - "guest-get-disks", NULL}; - char **p = (char **)list; - - while (*p) { - blockedrpcs = g_list_append(blockedrpcs, g_strdup(*p++)); - } - } -#endif - -#if !defined(CONFIG_FSTRIM) - blockedrpcs = g_list_append(blockedrpcs, g_strdup("guest-fstrim")); -#endif - - blockedrpcs = g_list_append(blockedrpcs, g_strdup("guest-get-devices")); - - return blockedrpcs; -} - /* register init/cleanup routines for stateful command groups */ void ga_command_state_init(GAState *s, GACommandState *cs) { @@ -3200,15 +1198,7 @@ GuestUserList *qmp_guest_get_users(Error **errp) return head; } -#else - -GuestUserList *qmp_guest_get_users(Error **errp) -{ - error_setg(errp, QERR_UNSUPPORTED); - return NULL; -} - -#endif +#endif /* HAVE_UTMPX */ /* Replace escaped special characters with their real values. The replacement * is done in place -- returned value is in the original string. @@ -3345,13 +1335,6 @@ GuestOSInfo *qmp_guest_get_osinfo(Error **errp) return info; } -GuestDeviceInfoList *qmp_guest_get_devices(Error **errp) -{ - error_setg(errp, QERR_UNSUPPORTED); - - return NULL; -} - #ifndef HOST_NAME_MAX # ifdef _POSIX_HOST_NAME_MAX # define HOST_NAME_MAX _POSIX_HOST_NAME_MAX diff --git a/qga/commands-win32.c b/qga/commands-win32.c index 0d1b836e87..61b36da469 100644 --- a/qga/commands-win32.c +++ b/qga/commands-win32.c @@ -1203,7 +1203,7 @@ GuestFilesystemInfoList *qmp_guest_get_fsinfo(Error **errp) GuestFsfreezeStatus qmp_guest_fsfreeze_status(Error **errp) { if (!vss_initialized()) { - error_setg(errp, QERR_UNSUPPORTED); + error_setg(errp, "fsfreeze not possible as VSS failed to initialize"); return 0; } @@ -1231,7 +1231,7 @@ int64_t qmp_guest_fsfreeze_freeze_list(bool has_mountpoints, Error *local_err = NULL; if (!vss_initialized()) { - error_setg(errp, QERR_UNSUPPORTED); + error_setg(errp, "fsfreeze not possible as VSS failed to initialize"); return 0; } @@ -1266,7 +1266,7 @@ int64_t qmp_guest_fsfreeze_thaw(Error **errp) int i; if (!vss_initialized()) { - error_setg(errp, QERR_UNSUPPORTED); + error_setg(errp, "fsfreeze not possible as VSS failed to initialize"); return 0; } @@ -1494,11 +1494,6 @@ out: } } -void qmp_guest_suspend_hybrid(Error **errp) -{ - error_setg(errp, QERR_UNSUPPORTED); -} - static IP_ADAPTER_ADDRESSES *guest_get_adapters_addresses(Error **errp) { IP_ADAPTER_ADDRESSES *adptr_addrs = NULL; @@ -1862,12 +1857,6 @@ GuestLogicalProcessorList *qmp_guest_get_vcpus(Error **errp) return NULL; } -int64_t qmp_guest_set_vcpus(GuestLogicalProcessorList *vcpus, Error **errp) -{ - error_setg(errp, QERR_UNSUPPORTED); - return -1; -} - static gchar * get_net_error_message(gint error) { @@ -1969,55 +1958,6 @@ done: g_free(rawpasswddata); } -GuestMemoryBlockList *qmp_guest_get_memory_blocks(Error **errp) -{ - error_setg(errp, QERR_UNSUPPORTED); - return NULL; -} - -GuestMemoryBlockResponseList * -qmp_guest_set_memory_blocks(GuestMemoryBlockList *mem_blks, Error **errp) -{ - error_setg(errp, QERR_UNSUPPORTED); - return NULL; -} - -GuestMemoryBlockInfo *qmp_guest_get_memory_block_info(Error **errp) -{ - error_setg(errp, QERR_UNSUPPORTED); - return NULL; -} - -/* add unsupported commands to the list of blocked RPCs */ -GList *ga_command_init_blockedrpcs(GList *blockedrpcs) -{ - const char *list_unsupported[] = { - "guest-suspend-hybrid", - "guest-set-vcpus", - "guest-get-memory-blocks", "guest-set-memory-blocks", - "guest-get-memory-block-size", "guest-get-memory-block-info", - NULL}; - char **p = (char **)list_unsupported; - - while (*p) { - blockedrpcs = g_list_append(blockedrpcs, g_strdup(*p++)); - } - - if (!vss_init(true)) { - g_debug("vss_init failed, vss commands are going to be disabled"); - const char *list[] = { - "guest-get-fsinfo", "guest-fsfreeze-status", - "guest-fsfreeze-freeze", "guest-fsfreeze-thaw", NULL}; - p = (char **)list; - - while (*p) { - blockedrpcs = g_list_append(blockedrpcs, g_strdup(*p++)); - } - } - - return blockedrpcs; -} - /* register init/cleanup routines for stateful command groups */ void ga_command_state_init(GAState *s, GACommandState *cs) { @@ -2505,15 +2445,3 @@ char *qga_get_host_name(Error **errp) return g_utf16_to_utf8(tmp, size, NULL, NULL, NULL); } - -GuestDiskStatsInfoList *qmp_guest_get_diskstats(Error **errp) -{ - error_setg(errp, QERR_UNSUPPORTED); - return NULL; -} - -GuestCpuStatsList *qmp_guest_get_cpustats(Error **errp) -{ - error_setg(errp, QERR_UNSUPPORTED); - return NULL; -} diff --git a/qga/main.c b/qga/main.c index f4d5f15bb3..b8f7b1e4a3 100644 --- a/qga/main.c +++ b/qga/main.c @@ -70,6 +70,28 @@ typedef struct GAPersistentState { typedef struct GAConfig GAConfig; +struct GAConfig { + char *channel_path; + char *method; + char *log_filepath; + char *pid_filepath; +#ifdef CONFIG_FSFREEZE + char *fsfreeze_hook; +#endif + char *state_dir; +#ifdef _WIN32 + const char *service; +#endif + gchar *bliststr; /* blockedrpcs may point to this string */ + gchar *aliststr; /* allowedrpcs may point to this string */ + GList *blockedrpcs; + GList *allowedrpcs; + int daemonize; + GLogLevelFlags log_level; + int dumpconf; + bool retry_path; +}; + struct GAState { JSONMessageParser parser; GMainLoop *main_loop; @@ -226,12 +248,16 @@ static void usage(const char *cmd) #ifdef CONFIG_FSFREEZE g_autofree char *fsfreeze_hook = get_relocated_path(QGA_FSFREEZE_HOOK_DEFAULT); #endif + g_autofree char *conf_path = get_relocated_path(QGA_CONF_DEFAULT); printf( "Usage: %s [-m <method> -p <path>] [<options>]\n" "QEMU Guest Agent " QEMU_FULL_VERSION "\n" QEMU_COPYRIGHT "\n" "\n" +" -c, --config=PATH configuration file path (default is\n" +" %s/qemu-ga.conf\n" +" unless overriden by the QGA_CONF environment variable)\n" " -m, --method transport method: one of unix-listen, virtio-serial,\n" " isa-serial, or vsock-listen (virtio-serial is the default)\n" " -p, --path device/socket path (the default for virtio-serial is:\n" @@ -272,8 +298,8 @@ QEMU_COPYRIGHT "\n" " plug/unplug, etc.)\n" " -h, --help display this help and exit\n" "\n" -QEMU_HELP_BOTTOM "\n" - , cmd, QGA_VIRTIO_PATH_DEFAULT, QGA_SERIAL_PATH_DEFAULT, +QEMU_HELP_BOTTOM "\n", + cmd, conf_path, QGA_VIRTIO_PATH_DEFAULT, QGA_SERIAL_PATH_DEFAULT, dfl_pathnames.pidfile, #ifdef CONFIG_FSFREEZE fsfreeze_hook, @@ -397,60 +423,79 @@ static gint ga_strcmp(gconstpointer str1, gconstpointer str2) return strcmp(str1, str2); } -/* disable commands that aren't safe for fsfreeze */ -static void ga_disable_not_allowed_freeze(const QmpCommand *cmd, void *opaque) +static bool ga_command_is_allowed(const QmpCommand *cmd, GAState *state) { - bool allowed = false; int i = 0; + GAConfig *config = state->config; const char *name = qmp_command_name(cmd); + /* Fallback policy is allow everything */ + bool allowed = true; + + if (config->allowedrpcs) { + /* + * If an allow-list is given, this changes the fallback + * policy to deny everything + */ + allowed = false; - while (ga_freeze_allowlist[i] != NULL) { - if (strcmp(name, ga_freeze_allowlist[i]) == 0) { + if (g_list_find_custom(config->allowedrpcs, name, ga_strcmp) != NULL) { allowed = true; } - i++; } - if (!allowed) { - g_debug("disabling command: %s", name); - qmp_disable_command(&ga_commands, name, "the agent is in frozen state"); - } -} -/* [re-]enable all commands, except those explicitly blocked by user */ -static void ga_enable_non_blocked(const QmpCommand *cmd, void *opaque) -{ - GAState *s = opaque; - GList *blockedrpcs = s->blockedrpcs; - GList *allowedrpcs = s->allowedrpcs; - const char *name = qmp_command_name(cmd); - - if (g_list_find_custom(blockedrpcs, name, ga_strcmp) == NULL) { - if (qmp_command_is_enabled(cmd)) { - return; + /* + * If both allowedrpcs and blockedrpcs are set, the blocked + * list will take priority + */ + if (config->blockedrpcs) { + if (g_list_find_custom(config->blockedrpcs, name, ga_strcmp) != NULL) { + allowed = false; } + } - if (allowedrpcs && - g_list_find_custom(allowedrpcs, name, ga_strcmp) == NULL) { - return; - } + /* + * If frozen, this filtering must take priority over + * absolutely everything + */ + if (state->frozen) { + allowed = false; - g_debug("enabling command: %s", name); - qmp_enable_command(&ga_commands, name); + while (ga_freeze_allowlist[i] != NULL) { + if (strcmp(name, ga_freeze_allowlist[i]) == 0) { + allowed = true; + } + i++; + } } + + return allowed; } -/* disable commands that aren't allowed */ -static void ga_disable_not_allowed(const QmpCommand *cmd, void *opaque) +static void ga_apply_command_filters_iter(const QmpCommand *cmd, void *opaque) { - GList *allowedrpcs = opaque; + GAState *state = opaque; + bool want = ga_command_is_allowed(cmd, state); + bool have = qmp_command_is_enabled(cmd); const char *name = qmp_command_name(cmd); - if (g_list_find_custom(allowedrpcs, name, ga_strcmp) == NULL) { + if (want == have) { + return; + } + + if (have) { g_debug("disabling command: %s", name); qmp_disable_command(&ga_commands, name, "the command is not allowed"); + } else { + g_debug("enabling command: %s", name); + qmp_enable_command(&ga_commands, name); } } +static void ga_apply_command_filters(GAState *state) +{ + qmp_for_each_command(&ga_commands, ga_apply_command_filters_iter, state); +} + static bool ga_create_file(const char *path) { int fd = open(path, O_CREAT | O_WRONLY, S_IWUSR | S_IRUSR); @@ -483,15 +528,14 @@ void ga_set_frozen(GAState *s) if (ga_is_frozen(s)) { return; } - /* disable all forbidden (for frozen state) commands */ - qmp_for_each_command(&ga_commands, ga_disable_not_allowed_freeze, NULL); g_warning("disabling logging due to filesystem freeze"); - ga_disable_logging(s); s->frozen = true; if (!ga_create_file(s->state_filepath_isfrozen)) { g_warning("unable to create %s, fsfreeze may not function properly", s->state_filepath_isfrozen); } + ga_apply_command_filters(s); + ga_disable_logging(s); } void ga_unset_frozen(GAState *s) @@ -523,12 +567,12 @@ void ga_unset_frozen(GAState *s) } /* enable all disabled, non-blocked and allowed commands */ - qmp_for_each_command(&ga_commands, ga_enable_non_blocked, s); s->frozen = false; if (!ga_delete_file(s->state_filepath_isfrozen)) { g_warning("unable to delete %s, fsfreeze may not function properly", s->state_filepath_isfrozen); } + ga_apply_command_filters(s); } #ifdef CONFIG_FSFREEZE @@ -996,38 +1040,14 @@ static GList *split_list(const gchar *str, const gchar *delim) return list; } -struct GAConfig { - char *channel_path; - char *method; - char *log_filepath; - char *pid_filepath; -#ifdef CONFIG_FSFREEZE - char *fsfreeze_hook; -#endif - char *state_dir; -#ifdef _WIN32 - const char *service; -#endif - gchar *bliststr; /* blockedrpcs may point to this string */ - gchar *aliststr; /* allowedrpcs may point to this string */ - GList *blockedrpcs; - GList *allowedrpcs; - int daemonize; - GLogLevelFlags log_level; - int dumpconf; - bool retry_path; -}; - -static void config_load(GAConfig *config) +static void config_load(GAConfig *config, const char *confpath, bool required) { GError *gerr = NULL; GKeyFile *keyfile; - g_autofree char *conf = g_strdup(g_getenv("QGA_CONF")) ?: get_relocated_path(QGA_CONF_DEFAULT); - const gchar *blockrpcs_key = "block-rpcs"; /* read system config */ keyfile = g_key_file_new(); - if (!g_key_file_load_from_file(keyfile, conf, 0, &gerr)) { + if (!g_key_file_load_from_file(keyfile, confpath, 0, &gerr)) { goto end; } if (g_key_file_has_key(keyfile, "general", "daemon", NULL)) { @@ -1071,9 +1091,9 @@ static void config_load(GAConfig *config) g_key_file_get_boolean(keyfile, "general", "retry-path", &gerr); } - if (g_key_file_has_key(keyfile, "general", blockrpcs_key, NULL)) { + if (g_key_file_has_key(keyfile, "general", "block-rpcs", NULL)) { config->bliststr = - g_key_file_get_string(keyfile, "general", blockrpcs_key, &gerr); + g_key_file_get_string(keyfile, "general", "block-rpcs", &gerr); config->blockedrpcs = g_list_concat(config->blockedrpcs, split_list(config->bliststr, ",")); } @@ -1084,19 +1104,12 @@ static void config_load(GAConfig *config) split_list(config->aliststr, ",")); } - if (g_key_file_has_key(keyfile, "general", blockrpcs_key, NULL) && - g_key_file_has_key(keyfile, "general", "allow-rpcs", NULL)) { - g_critical("wrong config, using 'block-rpcs' and 'allow-rpcs' keys at" - " the same time is not allowed"); - exit(EXIT_FAILURE); - } - end: g_key_file_free(keyfile); - if (gerr && - !(gerr->domain == G_FILE_ERROR && gerr->code == G_FILE_ERROR_NOENT)) { + if (gerr && (required || + !(gerr->domain == G_FILE_ERROR && gerr->code == G_FILE_ERROR_NOENT))) { g_critical("error loading configuration from path: %s, %s", - conf, gerr->message); + confpath, gerr->message); exit(EXIT_FAILURE); } g_clear_error(&gerr); @@ -1168,12 +1181,12 @@ static void config_dump(GAConfig *config) static void config_parse(GAConfig *config, int argc, char **argv) { - const char *sopt = "hVvdm:p:l:f:F::b:a:s:t:Dr"; + const char *sopt = "hVvdc:m:p:l:f:F::b:a:s:t:Dr"; int opt_ind = 0, ch; - bool block_rpcs = false, allow_rpcs = false; const struct option lopt[] = { { "help", 0, NULL, 'h' }, { "version", 0, NULL, 'V' }, + { "config", 1, NULL, 'c' }, { "dump-conf", 0, NULL, 'D' }, { "logfile", 1, NULL, 'l' }, { "pidfile", 1, NULL, 'f' }, @@ -1193,6 +1206,26 @@ static void config_parse(GAConfig *config, int argc, char **argv) { "retry-path", 0, NULL, 'r' }, { NULL, 0, NULL, 0 } }; + g_autofree char *confpath = g_strdup(g_getenv("QGA_CONF")) ?: + get_relocated_path(QGA_CONF_DEFAULT); + bool confrequired = false; + + while ((ch = getopt_long(argc, argv, sopt, lopt, NULL)) != -1) { + switch (ch) { + case 'c': + g_free(confpath); + confpath = g_strdup(optarg); + confrequired = true; + break; + default: + break; + } + } + + config_load(config, confpath, confrequired); + + /* Reset for second pass */ + optind = 1; while ((ch = getopt_long(argc, argv, sopt, lopt, &opt_ind)) != -1) { switch (ch) { @@ -1245,7 +1278,6 @@ static void config_parse(GAConfig *config, int argc, char **argv) } config->blockedrpcs = g_list_concat(config->blockedrpcs, split_list(optarg, ",")); - block_rpcs = true; break; } case 'a': { @@ -1255,7 +1287,6 @@ static void config_parse(GAConfig *config, int argc, char **argv) } config->allowedrpcs = g_list_concat(config->allowedrpcs, split_list(optarg, ",")); - allow_rpcs = true; break; } #ifdef _WIN32 @@ -1296,12 +1327,6 @@ static void config_parse(GAConfig *config, int argc, char **argv) exit(EXIT_FAILURE); } } - - if (block_rpcs && allow_rpcs) { - g_critical("wrong commandline, using --block-rpcs and --allow-rpcs at the" - " same time is not allowed"); - exit(EXIT_FAILURE); - } } static void config_free(GAConfig *config) @@ -1395,6 +1420,10 @@ static GAState *initialize_agent(GAConfig *config, int socket_activation) " '%s': %s", config->state_dir, strerror(errno)); return NULL; } + + if (!vss_init(true)) { + g_debug("vss_init failed, vss commands will not function"); + } #endif if (ga_is_frozen(s)) { @@ -1408,7 +1437,6 @@ static GAState *initialize_agent(GAConfig *config, int socket_activation) s->deferred_options.log_filepath = config->log_filepath; } ga_disable_logging(s); - qmp_for_each_command(&ga_commands, ga_disable_not_allowed_freeze, NULL); } else { if (config->daemonize) { become_daemon(config->pid_filepath); @@ -1432,25 +1460,6 @@ static GAState *initialize_agent(GAConfig *config, int socket_activation) return NULL; } - if (config->allowedrpcs) { - qmp_for_each_command(&ga_commands, ga_disable_not_allowed, config->allowedrpcs); - s->allowedrpcs = config->allowedrpcs; - } - - /* - * Some commands can be blocked due to system limitation. - * Initialize blockedrpcs list even if allowedrpcs specified. - */ - config->blockedrpcs = ga_command_init_blockedrpcs(config->blockedrpcs); - if (config->blockedrpcs) { - GList *l = config->blockedrpcs; - s->blockedrpcs = config->blockedrpcs; - do { - g_debug("disabling command: %s", (char *)l->data); - qmp_disable_command(&ga_commands, l->data, NULL); - l = g_list_next(l); - } while (l); - } s->command_state = ga_command_state_new(); ga_command_state_init(s, s->command_state); ga_command_state_init_all(s->command_state); @@ -1476,6 +1485,8 @@ static GAState *initialize_agent(GAConfig *config, int socket_activation) } #endif + ga_apply_command_filters(s); + ga_state = s; return s; } @@ -1579,7 +1590,6 @@ int main(int argc, char **argv) qga_qmp_init_marshal(&ga_commands); init_dfl_pathnames(); - config_load(config); config_parse(config, argc, argv); if (config->pid_filepath == NULL) { diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json index 1273d85bb5..495706cf73 100644 --- a/qga/qapi-schema.json +++ b/qga/qapi-schema.json @@ -412,7 +412,8 @@ # Since: 0.15.0 ## { 'enum': 'GuestFsfreezeStatus', - 'data': [ 'thawed', 'frozen' ] } + 'data': [ 'thawed', 'frozen' ], + 'if': { 'any': ['CONFIG_WIN32', 'CONFIG_FSFREEZE'] } } ## # @guest-fsfreeze-status: @@ -429,7 +430,8 @@ # Since: 0.15.0 ## { 'command': 'guest-fsfreeze-status', - 'returns': 'GuestFsfreezeStatus' } + 'returns': 'GuestFsfreezeStatus', + 'if': { 'any': ['CONFIG_WIN32', 'CONFIG_FSFREEZE'] } } ## # @guest-fsfreeze-freeze: @@ -451,7 +453,8 @@ # Since: 0.15.0 ## { 'command': 'guest-fsfreeze-freeze', - 'returns': 'int' } + 'returns': 'int', + 'if': { 'any': ['CONFIG_WIN32', 'CONFIG_FSFREEZE'] } } ## # @guest-fsfreeze-freeze-list: @@ -471,7 +474,8 @@ ## { 'command': 'guest-fsfreeze-freeze-list', 'data': { '*mountpoints': ['str'] }, - 'returns': 'int' } + 'returns': 'int', + 'if': { 'any': ['CONFIG_WIN32', 'CONFIG_FSFREEZE'] } } ## # @guest-fsfreeze-thaw: @@ -488,7 +492,8 @@ # Since: 0.15.0 ## { 'command': 'guest-fsfreeze-thaw', - 'returns': 'int' } + 'returns': 'int', + 'if': { 'any': ['CONFIG_WIN32', 'CONFIG_FSFREEZE'] } } ## # @GuestFilesystemTrimResult: @@ -505,7 +510,8 @@ ## { 'struct': 'GuestFilesystemTrimResult', 'data': {'path': 'str', - '*trimmed': 'int', '*minimum': 'int', '*error': 'str'} } + '*trimmed': 'int', '*minimum': 'int', '*error': 'str'}, + 'if': { 'any': ['CONFIG_WIN32', 'CONFIG_FSTRIM'] } } ## # @GuestFilesystemTrimResponse: @@ -515,7 +521,8 @@ # Since: 2.4 ## { 'struct': 'GuestFilesystemTrimResponse', - 'data': {'paths': ['GuestFilesystemTrimResult']} } + 'data': {'paths': ['GuestFilesystemTrimResult']}, + 'if': { 'any': ['CONFIG_WIN32', 'CONFIG_FSTRIM'] } } ## # @guest-fstrim: @@ -537,7 +544,8 @@ ## { 'command': 'guest-fstrim', 'data': { '*minimum': 'int' }, - 'returns': 'GuestFilesystemTrimResponse' } + 'returns': 'GuestFilesystemTrimResponse', + 'if': { 'any': ['CONFIG_WIN32', 'CONFIG_FSTRIM'] } } ## # @guest-suspend-disk: @@ -566,7 +574,8 @@ # # Since: 1.1 ## -{ 'command': 'guest-suspend-disk', 'success-response': false } +{ 'command': 'guest-suspend-disk', 'success-response': false, + 'if': { 'any': ['CONFIG_LINUX', 'CONFIG_WIN32'] } } ## # @guest-suspend-ram: @@ -602,7 +611,8 @@ # # Since: 1.1 ## -{ 'command': 'guest-suspend-ram', 'success-response': false } +{ 'command': 'guest-suspend-ram', 'success-response': false, + 'if': { 'any': ['CONFIG_LINUX', 'CONFIG_WIN32'] } } ## # @guest-suspend-hybrid: @@ -637,7 +647,8 @@ # # Since: 1.1 ## -{ 'command': 'guest-suspend-hybrid', 'success-response': false } +{ 'command': 'guest-suspend-hybrid', 'success-response': false, + 'if': 'CONFIG_LINUX' } ## # @GuestIpAddressType: @@ -651,7 +662,8 @@ # Since: 1.1 ## { 'enum': 'GuestIpAddressType', - 'data': [ 'ipv4', 'ipv6' ] } + 'data': [ 'ipv4', 'ipv6' ], + 'if': { 'any': ['CONFIG_WIN32', 'HAVE_GETIFADDRS'] } } ## # @GuestIpAddress: @@ -667,7 +679,8 @@ { 'struct': 'GuestIpAddress', 'data': {'ip-address': 'str', 'ip-address-type': 'GuestIpAddressType', - 'prefix': 'int'} } + 'prefix': 'int'}, + 'if': { 'any': ['CONFIG_WIN32', 'HAVE_GETIFADDRS'] } } ## # @GuestNetworkInterfaceStat: @@ -699,7 +712,8 @@ 'tx-packets': 'uint64', 'tx-errs': 'uint64', 'tx-dropped': 'uint64' - } } + }, + 'if': { 'any': ['CONFIG_WIN32', 'HAVE_GETIFADDRS'] } } ## # @GuestNetworkInterface: @@ -719,7 +733,8 @@ 'data': {'name': 'str', '*hardware-address': 'str', '*ip-addresses': ['GuestIpAddress'], - '*statistics': 'GuestNetworkInterfaceStat' } } + '*statistics': 'GuestNetworkInterfaceStat' }, + 'if': { 'any': ['CONFIG_WIN32', 'HAVE_GETIFADDRS'] } } ## # @guest-network-get-interfaces: @@ -731,7 +746,8 @@ # Since: 1.1 ## { 'command': 'guest-network-get-interfaces', - 'returns': ['GuestNetworkInterface'] } + 'returns': ['GuestNetworkInterface'], + 'if': { 'any': ['CONFIG_WIN32', 'HAVE_GETIFADDRS'] } } ## # @GuestLogicalProcessor: @@ -750,7 +766,8 @@ { 'struct': 'GuestLogicalProcessor', 'data': {'logical-id': 'int', 'online': 'bool', - '*can-offline': 'bool'} } + '*can-offline': 'bool'}, + 'if': { 'any': ['CONFIG_LINUX', 'CONFIG_WIN32'] } } ## # @guest-get-vcpus: @@ -765,7 +782,8 @@ # Since: 1.5 ## { 'command': 'guest-get-vcpus', - 'returns': ['GuestLogicalProcessor'] } + 'returns': ['GuestLogicalProcessor'], + 'if': { 'any': ['CONFIG_LINUX', 'CONFIG_WIN32'] } } ## # @guest-set-vcpus: @@ -807,7 +825,8 @@ ## { 'command': 'guest-set-vcpus', 'data': {'vcpus': ['GuestLogicalProcessor'] }, - 'returns': 'int' } + 'returns': 'int', + 'if': 'CONFIG_LINUX' } ## # @GuestDiskBusType: @@ -859,7 +878,8 @@ { 'enum': 'GuestDiskBusType', 'data': [ 'ide', 'fdc', 'scsi', 'virtio', 'xen', 'usb', 'uml', 'sata', 'sd', 'unknown', 'ieee1394', 'ssa', 'fibre', 'raid', 'iscsi', - 'sas', 'mmc', 'virtual', 'file-backed-virtual', 'nvme' ] } + 'sas', 'mmc', 'virtual', 'file-backed-virtual', 'nvme' ], + 'if': { 'any': [ 'CONFIG_WIN32', 'CONFIG_LINUX' ] } } ## @@ -877,7 +897,8 @@ ## { 'struct': 'GuestPCIAddress', 'data': {'domain': 'int', 'bus': 'int', - 'slot': 'int', 'function': 'int'} } + 'slot': 'int', 'function': 'int'}, + 'if': { 'any': [ 'CONFIG_WIN32', 'CONFIG_LINUX' ] } } ## # @GuestCCWAddress: @@ -896,7 +917,8 @@ 'data': {'cssid': 'int', 'ssid': 'int', 'subchno': 'int', - 'devno': 'int'} } + 'devno': 'int'}, + 'if': { 'any': [ 'CONFIG_WIN32', 'CONFIG_LINUX' ] } } ## # @GuestDiskAddress: @@ -925,7 +947,8 @@ 'bus-type': 'GuestDiskBusType', 'bus': 'int', 'target': 'int', 'unit': 'int', '*serial': 'str', '*dev': 'str', - '*ccw-address': 'GuestCCWAddress'} } + '*ccw-address': 'GuestCCWAddress'}, + 'if': { 'any': [ 'CONFIG_WIN32', 'CONFIG_LINUX' ] } } ## # @GuestNVMeSmart: @@ -962,7 +985,8 @@ 'media-errors-lo': 'uint64', 'media-errors-hi': 'uint64', 'number-of-error-log-entries-lo': 'uint64', - 'number-of-error-log-entries-hi': 'uint64' } } + 'number-of-error-log-entries-hi': 'uint64' }, + 'if': { 'any': [ 'CONFIG_WIN32', 'CONFIG_LIBUDEV' ] } } ## # @GuestDiskSmart: @@ -976,7 +1000,8 @@ { 'union': 'GuestDiskSmart', 'base': { 'type': 'GuestDiskBusType' }, 'discriminator': 'type', - 'data': { 'nvme': 'GuestNVMeSmart' } } + 'data': { 'nvme': 'GuestNVMeSmart' }, + 'if': { 'any': [ 'CONFIG_WIN32', 'CONFIG_LIBUDEV' ] } } ## # @GuestDiskInfo: @@ -1001,7 +1026,8 @@ { 'struct': 'GuestDiskInfo', 'data': {'name': 'str', 'partition': 'bool', '*dependencies': ['str'], '*address': 'GuestDiskAddress', '*alias': 'str', - '*smart': 'GuestDiskSmart'} } + '*smart': 'GuestDiskSmart'}, + 'if': { 'any': [ 'CONFIG_WIN32', 'CONFIG_LIBUDEV' ] } } ## # @guest-get-disks: @@ -1014,7 +1040,8 @@ # Since: 5.2 ## { 'command': 'guest-get-disks', - 'returns': ['GuestDiskInfo'] } + 'returns': ['GuestDiskInfo'], + 'if': { 'any': [ 'CONFIG_WIN32', 'CONFIG_LIBUDEV' ] } } ## # @GuestFilesystemInfo: @@ -1040,7 +1067,8 @@ { 'struct': 'GuestFilesystemInfo', 'data': {'name': 'str', 'mountpoint': 'str', 'type': 'str', '*used-bytes': 'uint64', '*total-bytes': 'uint64', - '*total-bytes-privileged': 'uint64', 'disk': ['GuestDiskAddress']} } + '*total-bytes-privileged': 'uint64', 'disk': ['GuestDiskAddress']}, + 'if': { 'any': [ 'CONFIG_WIN32', 'CONFIG_LINUX' ] } } ## # @guest-get-fsinfo: @@ -1053,7 +1081,8 @@ # Since: 2.2 ## { 'command': 'guest-get-fsinfo', - 'returns': ['GuestFilesystemInfo'] } + 'returns': ['GuestFilesystemInfo'], + 'if': { 'any': [ 'CONFIG_WIN32', 'CONFIG_LINUX' ] } } ## # @guest-set-user-password: @@ -1080,7 +1109,8 @@ # Since: 2.3 ## { 'command': 'guest-set-user-password', - 'data': { 'username': 'str', 'password': 'str', 'crypted': 'bool' } } + 'data': { 'username': 'str', 'password': 'str', 'crypted': 'bool' }, + 'if': { 'any': [ 'CONFIG_WIN32', 'CONFIG_LINUX', 'CONFIG_FREEBSD'] } } ## # @GuestMemoryBlock: @@ -1100,7 +1130,8 @@ { 'struct': 'GuestMemoryBlock', 'data': {'phys-index': 'uint64', 'online': 'bool', - '*can-offline': 'bool'} } + '*can-offline': 'bool'}, + 'if': 'CONFIG_LINUX' } ## # @guest-get-memory-blocks: @@ -1116,7 +1147,8 @@ # Since: 2.3 ## { 'command': 'guest-get-memory-blocks', - 'returns': ['GuestMemoryBlock'] } + 'returns': ['GuestMemoryBlock'], + 'if': 'CONFIG_LINUX' } ## # @GuestMemoryBlockResponseType: @@ -1139,7 +1171,8 @@ ## { 'enum': 'GuestMemoryBlockResponseType', 'data': ['success', 'not-found', 'operation-not-supported', - 'operation-failed'] } + 'operation-failed'], + 'if': 'CONFIG_LINUX' } ## # @GuestMemoryBlockResponse: @@ -1157,7 +1190,8 @@ { 'struct': 'GuestMemoryBlockResponse', 'data': { 'phys-index': 'uint64', 'response': 'GuestMemoryBlockResponseType', - '*error-code': 'int' }} + '*error-code': 'int' }, + 'if': 'CONFIG_LINUX'} ## # @guest-set-memory-blocks: @@ -1188,7 +1222,8 @@ ## { 'command': 'guest-set-memory-blocks', 'data': {'mem-blks': ['GuestMemoryBlock'] }, - 'returns': ['GuestMemoryBlockResponse'] } + 'returns': ['GuestMemoryBlockResponse'], + 'if': 'CONFIG_LINUX' } ## # @GuestMemoryBlockInfo: @@ -1200,7 +1235,8 @@ # Since: 2.3 ## { 'struct': 'GuestMemoryBlockInfo', - 'data': {'size': 'uint64'} } + 'data': {'size': 'uint64'}, + 'if': 'CONFIG_LINUX' } ## # @guest-get-memory-block-info: @@ -1212,7 +1248,8 @@ # Since: 2.3 ## { 'command': 'guest-get-memory-block-info', - 'returns': 'GuestMemoryBlockInfo' } + 'returns': 'GuestMemoryBlockInfo', + 'if': 'CONFIG_LINUX' } ## # @GuestExecStatus: @@ -1378,7 +1415,8 @@ # Since: 2.10 ## { 'struct': 'GuestUser', - 'data': { 'user': 'str', 'login-time': 'number', '*domain': 'str' } } + 'data': { 'user': 'str', 'login-time': 'number', '*domain': 'str' }, + 'if': { 'any': ['CONFIG_WIN32', 'HAVE_UTMPX' ] } } ## # @guest-get-users: @@ -1390,7 +1428,8 @@ # Since: 2.10 ## { 'command': 'guest-get-users', - 'returns': ['GuestUser'] } + 'returns': ['GuestUser'], + 'if': { 'any': ['CONFIG_WIN32', 'HAVE_UTMPX' ] } } ## # @GuestTimezone: @@ -1499,7 +1538,8 @@ # @pci: PCI device ## { 'enum': 'GuestDeviceType', - 'data': [ 'pci' ] } + 'data': [ 'pci' ], + 'if': 'CONFIG_WIN32' } ## # @GuestDeviceIdPCI: @@ -1511,7 +1551,8 @@ # Since: 5.2 ## { 'struct': 'GuestDeviceIdPCI', - 'data': { 'vendor-id': 'uint16', 'device-id': 'uint16' } } + 'data': { 'vendor-id': 'uint16', 'device-id': 'uint16' }, + 'if': 'CONFIG_WIN32' } ## # @GuestDeviceId: @@ -1525,7 +1566,8 @@ { 'union': 'GuestDeviceId', 'base': { 'type': 'GuestDeviceType' }, 'discriminator': 'type', - 'data': { 'pci': 'GuestDeviceIdPCI' } } + 'data': { 'pci': 'GuestDeviceIdPCI' }, + 'if': 'CONFIG_WIN32' } ## # @GuestDeviceInfo: @@ -1546,7 +1588,8 @@ '*driver-date': 'int', '*driver-version': 'str', '*id': 'GuestDeviceId' - } } + }, + 'if': 'CONFIG_WIN32' } ## # @guest-get-devices: @@ -1558,7 +1601,8 @@ # Since: 5.2 ## { 'command': 'guest-get-devices', - 'returns': ['GuestDeviceInfo'] } + 'returns': ['GuestDeviceInfo'], + 'if': 'CONFIG_WIN32' } ## # @GuestAuthorizedKeys: @@ -1685,7 +1729,8 @@ '*ios-pgr': 'uint64', '*total-ticks': 'uint64', '*weight-ticks': 'uint64' - } } + }, + 'if': 'CONFIG_LINUX' } ## # @GuestDiskStatsInfo: @@ -1702,7 +1747,8 @@ 'data': {'name': 'str', 'major': 'uint64', 'minor': 'uint64', - 'stats': 'GuestDiskStats' } } + 'stats': 'GuestDiskStats' }, + 'if': 'CONFIG_LINUX' } ## # @guest-get-diskstats: @@ -1714,7 +1760,8 @@ # Since: 7.1 ## { 'command': 'guest-get-diskstats', - 'returns': ['GuestDiskStatsInfo'] + 'returns': ['GuestDiskStatsInfo'], + 'if': 'CONFIG_LINUX' } ## @@ -1727,7 +1774,8 @@ # Since: 7.1 ## { 'enum': 'GuestCpuStatsType', - 'data': [ 'linux' ] } + 'data': [ 'linux' ], + 'if': 'CONFIG_LINUX' } ## @@ -1772,7 +1820,8 @@ '*steal': 'uint64', '*guest': 'uint64', '*guestnice': 'uint64' - } } + }, + 'if': 'CONFIG_LINUX' } ## # @GuestCpuStats: @@ -1786,7 +1835,8 @@ { 'union': 'GuestCpuStats', 'base': { 'type': 'GuestCpuStatsType' }, 'discriminator': 'type', - 'data': { 'linux': 'GuestLinuxCpuStats' } } + 'data': { 'linux': 'GuestLinuxCpuStats' }, + 'if': 'CONFIG_LINUX' } ## # @guest-get-cpustats: @@ -1798,5 +1848,79 @@ # Since: 7.1 ## { 'command': 'guest-get-cpustats', - 'returns': ['GuestCpuStats'] + 'returns': ['GuestCpuStats'], + 'if': 'CONFIG_LINUX' +} + +## +# @GuestNetworkRoute: +# +# Route information, currently, only linux supported. +# +# @iface: The destination network or host's egress network interface in the routing table +# +# @destination: The IP address of the target network or host, The final destination of the packet +# +# @metric: Route metric +# +# @gateway: The IP address of the next hop router +# +# @mask: Subnet Mask (IPv4 only) +# +# @irtt: Initial round-trip delay (not for windows, IPv4 only) +# +# @flags: Route flags (not for windows) +# +# @refcnt: The route's reference count (not for windows) +# +# @use: Route usage count (not for windows) +# +# @window: TCP window size, used for flow control (not for windows, IPv4 only) +# +# @mtu: Data link layer maximum packet size (not for windows) +# +# @desprefixlen: Destination prefix length (for IPv6) +# +# @source: Source IP address (for IPv6) +# +# @srcprefixlen: Source prefix length (for IPv6) +# +# @nexthop: Next hop IP address (for IPv6) +# +# @version: IP version (4 or 6) +# +# Since: 9.1 + +## +{ 'struct': 'GuestNetworkRoute', + 'data': {'iface': 'str', + 'destination': 'str', + 'metric': 'int', + '*gateway': 'str', + '*mask': 'str', + '*irtt': 'int', + '*flags': 'uint64', + '*refcnt': 'int', + '*use': 'int', + '*window': 'int', + '*mtu': 'int', + '*desprefixlen': 'str', + '*source': 'str', + '*srcprefixlen': 'str', + '*nexthop': 'str', + 'version': 'int' + }, + 'if': 'CONFIG_LINUX' } + +## +# @guest-network-get-route: +# +# Retrieve information about route of network. +# Returns: List of route info of guest. +# +# Since: 9.1 +## +{ 'command': 'guest-network-get-route', + 'returns': ['GuestNetworkRoute'], + 'if': 'CONFIG_LINUX' } diff --git a/system/memory_mapping.c b/system/memory_mapping.c index 6f884c5b90..ca2390eb80 100644 --- a/system/memory_mapping.c +++ b/system/memory_mapping.c @@ -12,6 +12,7 @@ */ #include "qemu/osdep.h" +#include "qemu/range.h" #include "qapi/error.h" #include "sysemu/memory_mapping.h" @@ -353,8 +354,7 @@ void memory_mapping_filter(MemoryMappingList *list, int64_t begin, MemoryMapping *cur, *next; QTAILQ_FOREACH_SAFE(cur, &list->head, next, next) { - if (cur->phys_addr >= begin + length || - cur->phys_addr + cur->length <= begin) { + if (!ranges_overlap(cur->phys_addr, cur->length, begin, length)) { QTAILQ_REMOVE(&list->head, cur, next); g_free(cur); list->num--; diff --git a/system/physmem.c b/system/physmem.c index 9a3b3a7636..0e19186e1b 100644 --- a/system/physmem.c +++ b/system/physmem.c @@ -763,6 +763,7 @@ void cpu_address_space_init(CPUState *cpu, int asidx, if (!cpu->cpu_ases) { cpu->cpu_ases = g_new0(CPUAddressSpace, cpu->num_ases); + cpu->cpu_ases_count = cpu->num_ases; } newas = &cpu->cpu_ases[asidx]; @@ -776,6 +777,34 @@ void cpu_address_space_init(CPUState *cpu, int asidx, } } +void cpu_address_space_destroy(CPUState *cpu, int asidx) +{ + CPUAddressSpace *cpuas; + + assert(cpu->cpu_ases); + assert(asidx >= 0 && asidx < cpu->num_ases); + /* KVM cannot currently support multiple address spaces. */ + assert(asidx == 0 || !kvm_enabled()); + + cpuas = &cpu->cpu_ases[asidx]; + if (tcg_enabled()) { + memory_listener_unregister(&cpuas->tcg_as_listener); + } + + address_space_destroy(cpuas->as); + g_free_rcu(cpuas->as, rcu); + + if (asidx == 0) { + /* reset the convenience alias for address space 0 */ + cpu->as = NULL; + } + + if (--cpu->cpu_ases_count == 0) { + g_free(cpu->cpu_ases); + cpu->cpu_ases = NULL; + } +} + AddressSpace *cpu_get_address_space(CPUState *cpu, int asidx) { /* Return the AddressSpace corresponding to the specified index */ diff --git a/target/i386/cpu.h b/target/i386/cpu.h index 1e121acef5..c6cc035df3 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -414,6 +414,10 @@ typedef enum X86Seg { #define MSR_IA32_TSX_CTRL 0x122 #define MSR_IA32_TSCDEADLINE 0x6e0 #define MSR_IA32_PKRS 0x6e1 +#define MSR_RAPL_POWER_UNIT 0x00000606 +#define MSR_PKG_POWER_LIMIT 0x00000610 +#define MSR_PKG_ENERGY_STATUS 0x00000611 +#define MSR_PKG_POWER_INFO 0x00000614 #define MSR_ARCH_LBR_CTL 0x000014ce #define MSR_ARCH_LBR_DEPTH 0x000014cf #define MSR_ARCH_LBR_FROM_0 0x00001500 @@ -1880,6 +1884,10 @@ typedef struct CPUArchState { uintptr_t retaddr; + /* RAPL MSR */ + uint64_t msr_rapl_power_unit; + uint64_t msr_pkg_energy_status; + /* Fields up to this point are cleared by a CPU reset */ struct {} end_reset_fields; diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index becca2efa5..b4aab9a410 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -16,9 +16,12 @@ #include "qapi/qapi-events-run-state.h" #include "qapi/error.h" #include "qapi/visitor.h" +#include <math.h> #include <sys/ioctl.h> #include <sys/utsname.h> #include <sys/syscall.h> +#include <sys/resource.h> +#include <sys/time.h> #include <linux/kvm.h> #include <linux/kvm_para.h> @@ -27,6 +30,7 @@ #include "cpu.h" #include "host-cpu.h" +#include "vmsr_energy.h" #include "sysemu/sysemu.h" #include "sysemu/hw_accel.h" #include "sysemu/kvm_int.h" @@ -2559,7 +2563,8 @@ static int kvm_get_supported_msrs(KVMState *s) return ret; } -static bool kvm_rdmsr_core_thread_count(X86CPU *cpu, uint32_t msr, +static bool kvm_rdmsr_core_thread_count(X86CPU *cpu, + uint32_t msr, uint64_t *val) { CPUState *cs = CPU(cpu); @@ -2570,6 +2575,53 @@ static bool kvm_rdmsr_core_thread_count(X86CPU *cpu, uint32_t msr, return true; } +static bool kvm_rdmsr_rapl_power_unit(X86CPU *cpu, + uint32_t msr, + uint64_t *val) +{ + + CPUState *cs = CPU(cpu); + + *val = cs->kvm_state->msr_energy.msr_unit; + + return true; +} + +static bool kvm_rdmsr_pkg_power_limit(X86CPU *cpu, + uint32_t msr, + uint64_t *val) +{ + + CPUState *cs = CPU(cpu); + + *val = cs->kvm_state->msr_energy.msr_limit; + + return true; +} + +static bool kvm_rdmsr_pkg_power_info(X86CPU *cpu, + uint32_t msr, + uint64_t *val) +{ + + CPUState *cs = CPU(cpu); + + *val = cs->kvm_state->msr_energy.msr_info; + + return true; +} + +static bool kvm_rdmsr_pkg_energy_status(X86CPU *cpu, + uint32_t msr, + uint64_t *val) +{ + + CPUState *cs = CPU(cpu); + *val = cs->kvm_state->msr_energy.msr_value[cs->cpu_index]; + + return true; +} + static Notifier smram_machine_done; static KVMMemoryListener smram_listener; static AddressSpace smram_address_space; @@ -2604,6 +2656,340 @@ static void register_smram_listener(Notifier *n, void *unused) &smram_address_space, 1, "kvm-smram"); } +static void *kvm_msr_energy_thread(void *data) +{ + KVMState *s = data; + struct KVMMsrEnergy *vmsr = &s->msr_energy; + + g_autofree vmsr_package_energy_stat *pkg_stat = NULL; + g_autofree vmsr_thread_stat *thd_stat = NULL; + g_autofree CPUState *cpu = NULL; + g_autofree unsigned int *vpkgs_energy_stat = NULL; + unsigned int num_threads = 0; + + X86CPUTopoIDs topo_ids; + + rcu_register_thread(); + + /* Allocate memory for each package energy status */ + pkg_stat = g_new0(vmsr_package_energy_stat, vmsr->host_topo.maxpkgs); + + /* Allocate memory for thread stats */ + thd_stat = g_new0(vmsr_thread_stat, 1); + + /* Allocate memory for holding virtual package energy counter */ + vpkgs_energy_stat = g_new0(unsigned int, vmsr->guest_vsockets); + + /* Populate the max tick of each packages */ + for (int i = 0; i < vmsr->host_topo.maxpkgs; i++) { + /* + * Max numbers of ticks per package + * Time in second * Number of ticks/second * Number of cores/package + * ex: 100 ticks/second/CPU, 12 CPUs per Package gives 1200 ticks max + */ + vmsr->host_topo.maxticks[i] = (MSR_ENERGY_THREAD_SLEEP_US / 1000000) + * sysconf(_SC_CLK_TCK) + * vmsr->host_topo.pkg_cpu_count[i]; + } + + while (true) { + /* Get all qemu threads id */ + g_autofree pid_t *thread_ids = + thread_ids = vmsr_get_thread_ids(vmsr->pid, &num_threads); + + if (thread_ids == NULL) { + goto clean; + } + + thd_stat = g_renew(vmsr_thread_stat, thd_stat, num_threads); + /* Unlike g_new0, g_renew0 function doesn't exist yet... */ + memset(thd_stat, 0, num_threads * sizeof(vmsr_thread_stat)); + + /* Populate all the thread stats */ + for (int i = 0; i < num_threads; i++) { + thd_stat[i].utime = g_new0(unsigned long long, 2); + thd_stat[i].stime = g_new0(unsigned long long, 2); + thd_stat[i].thread_id = thread_ids[i]; + vmsr_read_thread_stat(vmsr->pid, + thd_stat[i].thread_id, + thd_stat[i].utime, + thd_stat[i].stime, + &thd_stat[i].cpu_id); + thd_stat[i].pkg_id = + vmsr_get_physical_package_id(thd_stat[i].cpu_id); + } + + /* Retrieve all packages power plane energy counter */ + for (int i = 0; i < vmsr->host_topo.maxpkgs; i++) { + for (int j = 0; j < num_threads; j++) { + /* + * Use the first thread we found that ran on the CPU + * of the package to read the packages energy counter + */ + if (thd_stat[j].pkg_id == i) { + pkg_stat[i].e_start = + vmsr_read_msr(MSR_PKG_ENERGY_STATUS, + thd_stat[j].cpu_id, + thd_stat[j].thread_id, + s->msr_energy.sioc); + break; + } + } + } + + /* Sleep a short period while the other threads are working */ + usleep(MSR_ENERGY_THREAD_SLEEP_US); + + /* + * Retrieve all packages power plane energy counter + * Calculate the delta of all packages + */ + for (int i = 0; i < vmsr->host_topo.maxpkgs; i++) { + for (int j = 0; j < num_threads; j++) { + /* + * Use the first thread we found that ran on the CPU + * of the package to read the packages energy counter + */ + if (thd_stat[j].pkg_id == i) { + pkg_stat[i].e_end = + vmsr_read_msr(MSR_PKG_ENERGY_STATUS, + thd_stat[j].cpu_id, + thd_stat[j].thread_id, + s->msr_energy.sioc); + /* + * Prevent the case we have migrate the VM + * during the sleep period or any other cases + * were energy counter might be lower after + * the sleep period. + */ + if (pkg_stat[i].e_end > pkg_stat[i].e_start) { + pkg_stat[i].e_delta = + pkg_stat[i].e_end - pkg_stat[i].e_start; + } else { + pkg_stat[i].e_delta = 0; + } + break; + } + } + } + + /* Delta of ticks spend by each thread between the sample */ + for (int i = 0; i < num_threads; i++) { + vmsr_read_thread_stat(vmsr->pid, + thd_stat[i].thread_id, + thd_stat[i].utime, + thd_stat[i].stime, + &thd_stat[i].cpu_id); + + if (vmsr->pid < 0) { + /* + * We don't count the dead thread + * i.e threads that existed before the sleep + * and not anymore + */ + thd_stat[i].delta_ticks = 0; + } else { + vmsr_delta_ticks(thd_stat, i); + } + } + + /* + * Identify the vcpu threads + * Calculate the number of vcpu per package + */ + CPU_FOREACH(cpu) { + for (int i = 0; i < num_threads; i++) { + if (cpu->thread_id == thd_stat[i].thread_id) { + thd_stat[i].is_vcpu = true; + thd_stat[i].vcpu_id = cpu->cpu_index; + pkg_stat[thd_stat[i].pkg_id].nb_vcpu++; + thd_stat[i].acpi_id = kvm_arch_vcpu_id(cpu); + break; + } + } + } + + /* Retrieve the virtual package number of each vCPU */ + for (int i = 0; i < vmsr->guest_cpu_list->len; i++) { + for (int j = 0; j < num_threads; j++) { + if ((thd_stat[j].acpi_id == + vmsr->guest_cpu_list->cpus[i].arch_id) + && (thd_stat[j].is_vcpu == true)) { + x86_topo_ids_from_apicid(thd_stat[j].acpi_id, + &vmsr->guest_topo_info, &topo_ids); + thd_stat[j].vpkg_id = topo_ids.pkg_id; + } + } + } + + /* Calculate the total energy of all non-vCPU thread */ + for (int i = 0; i < num_threads; i++) { + if ((thd_stat[i].is_vcpu != true) && + (thd_stat[i].delta_ticks > 0)) { + double temp; + temp = vmsr_get_ratio(pkg_stat[thd_stat[i].pkg_id].e_delta, + thd_stat[i].delta_ticks, + vmsr->host_topo.maxticks[thd_stat[i].pkg_id]); + pkg_stat[thd_stat[i].pkg_id].e_ratio + += (uint64_t)lround(temp); + } + } + + /* Calculate the ratio per non-vCPU thread of each package */ + for (int i = 0; i < vmsr->host_topo.maxpkgs; i++) { + if (pkg_stat[i].nb_vcpu > 0) { + pkg_stat[i].e_ratio = pkg_stat[i].e_ratio / pkg_stat[i].nb_vcpu; + } + } + + /* + * Calculate the energy for each Package: + * Energy Package = sum of each vCPU energy that belongs to the package + */ + for (int i = 0; i < num_threads; i++) { + if ((thd_stat[i].is_vcpu == true) && \ + (thd_stat[i].delta_ticks > 0)) { + double temp; + temp = vmsr_get_ratio(pkg_stat[thd_stat[i].pkg_id].e_delta, + thd_stat[i].delta_ticks, + vmsr->host_topo.maxticks[thd_stat[i].pkg_id]); + vpkgs_energy_stat[thd_stat[i].vpkg_id] += + (uint64_t)lround(temp); + vpkgs_energy_stat[thd_stat[i].vpkg_id] += + pkg_stat[thd_stat[i].pkg_id].e_ratio; + } + } + + /* + * Finally populate the vmsr register of each vCPU with the total + * package value to emulate the real hardware where each CPU return the + * value of the package it belongs. + */ + for (int i = 0; i < num_threads; i++) { + if ((thd_stat[i].is_vcpu == true) && \ + (thd_stat[i].delta_ticks > 0)) { + vmsr->msr_value[thd_stat[i].vcpu_id] = \ + vpkgs_energy_stat[thd_stat[i].vpkg_id]; + } + } + + /* Freeing memory before zeroing the pointer */ + for (int i = 0; i < num_threads; i++) { + g_free(thd_stat[i].utime); + g_free(thd_stat[i].stime); + } + } + +clean: + rcu_unregister_thread(); + return NULL; +} + +static int kvm_msr_energy_thread_init(KVMState *s, MachineState *ms) +{ + MachineClass *mc = MACHINE_GET_CLASS(ms); + struct KVMMsrEnergy *r = &s->msr_energy; + int ret = 0; + + /* + * Sanity check + * 1. Host cpu must be Intel cpu + * 2. RAPL must be enabled on the Host + */ + if (is_host_cpu_intel()) { + error_report("The RAPL feature can only be enabled on hosts\ + with Intel CPU models"); + ret = 1; + goto out; + } + + if (!is_rapl_enabled()) { + ret = 1; + goto out; + } + + /* Retrieve the virtual topology */ + vmsr_init_topo_info(&r->guest_topo_info, ms); + + /* Retrieve the number of vcpu */ + r->guest_vcpus = ms->smp.cpus; + + /* Retrieve the number of virtual sockets */ + r->guest_vsockets = ms->smp.sockets; + + /* Allocate register memory (MSR_PKG_STATUS) for each vcpu */ + r->msr_value = g_new0(uint64_t, r->guest_vcpus); + + /* Retrieve the CPUArchIDlist */ + r->guest_cpu_list = mc->possible_cpu_arch_ids(ms); + + /* Max number of cpus on the Host */ + r->host_topo.maxcpus = vmsr_get_maxcpus(); + if (r->host_topo.maxcpus == 0) { + error_report("host max cpus = 0"); + ret = 1; + goto out; + } + + /* Max number of packages on the host */ + r->host_topo.maxpkgs = vmsr_get_max_physical_package(r->host_topo.maxcpus); + if (r->host_topo.maxpkgs == 0) { + error_report("host max pkgs = 0"); + ret = 1; + goto out; + } + + /* Allocate memory for each package on the host */ + r->host_topo.pkg_cpu_count = g_new0(unsigned int, r->host_topo.maxpkgs); + r->host_topo.maxticks = g_new0(unsigned int, r->host_topo.maxpkgs); + + vmsr_count_cpus_per_package(r->host_topo.pkg_cpu_count, + r->host_topo.maxpkgs); + for (int i = 0; i < r->host_topo.maxpkgs; i++) { + if (r->host_topo.pkg_cpu_count[i] == 0) { + error_report("cpu per packages = 0 on package_%d", i); + ret = 1; + goto out; + } + } + + /* Get QEMU PID*/ + r->pid = getpid(); + + /* Compute the socket path if necessary */ + if (s->msr_energy.socket_path == NULL) { + s->msr_energy.socket_path = vmsr_compute_default_paths(); + } + + /* Open socket with vmsr helper */ + s->msr_energy.sioc = vmsr_open_socket(s->msr_energy.socket_path); + + if (s->msr_energy.sioc == NULL) { + error_report("vmsr socket opening failed"); + ret = 1; + goto out; + } + + /* Those MSR values should not change */ + r->msr_unit = vmsr_read_msr(MSR_RAPL_POWER_UNIT, 0, r->pid, + s->msr_energy.sioc); + r->msr_limit = vmsr_read_msr(MSR_PKG_POWER_LIMIT, 0, r->pid, + s->msr_energy.sioc); + r->msr_info = vmsr_read_msr(MSR_PKG_POWER_INFO, 0, r->pid, + s->msr_energy.sioc); + if (r->msr_unit == 0 || r->msr_limit == 0 || r->msr_info == 0) { + error_report("can't read any virtual msr"); + ret = 1; + goto out; + } + + qemu_thread_create(&r->msr_thr, "kvm-msr", + kvm_msr_energy_thread, + s, QEMU_THREAD_JOINABLE); +out: + return ret; +} + int kvm_arch_get_default_type(MachineState *ms) { return 0; @@ -2804,6 +3190,49 @@ int kvm_arch_init(MachineState *ms, KVMState *s) strerror(-ret)); exit(1); } + + if (s->msr_energy.enable == true) { + r = kvm_filter_msr(s, MSR_RAPL_POWER_UNIT, + kvm_rdmsr_rapl_power_unit, NULL); + if (!r) { + error_report("Could not install MSR_RAPL_POWER_UNIT \ + handler: %s", + strerror(-ret)); + exit(1); + } + + r = kvm_filter_msr(s, MSR_PKG_POWER_LIMIT, + kvm_rdmsr_pkg_power_limit, NULL); + if (!r) { + error_report("Could not install MSR_PKG_POWER_LIMIT \ + handler: %s", + strerror(-ret)); + exit(1); + } + + r = kvm_filter_msr(s, MSR_PKG_POWER_INFO, + kvm_rdmsr_pkg_power_info, NULL); + if (!r) { + error_report("Could not install MSR_PKG_POWER_INFO \ + handler: %s", + strerror(-ret)); + exit(1); + } + r = kvm_filter_msr(s, MSR_PKG_ENERGY_STATUS, + kvm_rdmsr_pkg_energy_status, NULL); + if (!r) { + error_report("Could not install MSR_PKG_ENERGY_STATUS \ + handler: %s", + strerror(-ret)); + exit(1); + } + r = kvm_msr_energy_thread_init(s, ms); + if (r) { + error_report("kvm : error RAPL feature requirement not meet"); + exit(1); + } + + } } return 0; diff --git a/target/i386/kvm/meson.build b/target/i386/kvm/meson.build index e7850981e6..3996cafaf2 100644 --- a/target/i386/kvm/meson.build +++ b/target/i386/kvm/meson.build @@ -3,6 +3,7 @@ i386_kvm_ss = ss.source_set() i386_kvm_ss.add(files( 'kvm.c', 'kvm-cpu.c', + 'vmsr_energy.c', )) i386_kvm_ss.add(when: 'CONFIG_XEN_EMU', if_true: files('xen-emu.c')) diff --git a/target/i386/kvm/vmsr_energy.c b/target/i386/kvm/vmsr_energy.c new file mode 100644 index 0000000000..a1d78f2f2a --- /dev/null +++ b/target/i386/kvm/vmsr_energy.c @@ -0,0 +1,345 @@ +/* + * QEMU KVM support -- x86 virtual RAPL msr + * + * Copyright 2024 Red Hat, Inc. 2024 + * + * Author: + * Anthony Harivel <aharivel@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" +#include "qemu/error-report.h" +#include "vmsr_energy.h" +#include "io/channel.h" +#include "io/channel-socket.h" +#include "hw/boards.h" +#include "cpu.h" +#include "host-cpu.h" + +char *vmsr_compute_default_paths(void) +{ + g_autofree char *state = qemu_get_local_state_dir(); + + return g_build_filename(state, "run", "qemu-vmsr-helper.sock", NULL); +} + +bool is_host_cpu_intel(void) +{ + int family, model, stepping; + char vendor[CPUID_VENDOR_SZ + 1]; + + host_cpu_vendor_fms(vendor, &family, &model, &stepping); + + return strcmp(vendor, CPUID_VENDOR_INTEL); +} + +int is_rapl_enabled(void) +{ + const char *path = "/sys/class/powercap/intel-rapl/enabled"; + FILE *file = fopen(path, "r"); + int value = 0; + + if (file != NULL) { + if (fscanf(file, "%d", &value) != 1) { + error_report("INTEL RAPL not enabled"); + } + fclose(file); + } else { + error_report("Error opening %s", path); + } + + return value; +} + +QIOChannelSocket *vmsr_open_socket(const char *path) +{ + g_autofree char *socket_path = NULL; + + socket_path = g_strdup(path); + + SocketAddress saddr = { + .type = SOCKET_ADDRESS_TYPE_UNIX, + .u.q_unix.path = socket_path + }; + + QIOChannelSocket *sioc = qio_channel_socket_new(); + Error *local_err = NULL; + + qio_channel_set_name(QIO_CHANNEL(sioc), "vmsr-helper"); + qio_channel_socket_connect_sync(sioc, + &saddr, + &local_err); + if (local_err) { + /* Close socket. */ + qio_channel_close(QIO_CHANNEL(sioc), NULL); + object_unref(OBJECT(sioc)); + sioc = NULL; + goto out; + } + + qio_channel_set_delay(QIO_CHANNEL(sioc), false); +out: + return sioc; +} + +uint64_t vmsr_read_msr(uint32_t reg, uint32_t cpu_id, uint32_t tid, + QIOChannelSocket *sioc) +{ + uint64_t data = 0; + int r = 0; + Error *local_err = NULL; + uint32_t buffer[3]; + /* + * Send the required arguments: + * 1. RAPL MSR register to read + * 2. On which CPU ID + * 3. From which vCPU (Thread ID) + */ + buffer[0] = reg; + buffer[1] = cpu_id; + buffer[2] = tid; + + r = qio_channel_write_all(QIO_CHANNEL(sioc), + (char *)buffer, sizeof(buffer), + &local_err); + if (r < 0) { + goto out_close; + } + + r = qio_channel_read(QIO_CHANNEL(sioc), + (char *)&data, sizeof(data), + &local_err); + if (r < 0) { + data = 0; + goto out_close; + } + +out_close: + return data; +} + +/* Retrieve the max number of physical package */ +unsigned int vmsr_get_max_physical_package(unsigned int max_cpus) +{ + const char *dir = "/sys/devices/system/cpu/"; + const char *topo_path = "topology/physical_package_id"; + g_autofree int *uniquePackages = g_new0(int, max_cpus); + unsigned int packageCount = 0; + FILE *file = NULL; + + for (int i = 0; i < max_cpus; i++) { + g_autofree char *filePath = NULL; + g_autofree char *cpuid = g_strdup_printf("cpu%d", i); + + filePath = g_build_filename(dir, cpuid, topo_path, NULL); + + file = fopen(filePath, "r"); + + if (file == NULL) { + error_report("Error opening physical_package_id file"); + return 0; + } + + char packageId[10]; + if (fgets(packageId, sizeof(packageId), file) == NULL) { + packageCount = 0; + } + + fclose(file); + + int currentPackageId = atoi(packageId); + + bool isUnique = true; + for (int j = 0; j < packageCount; j++) { + if (uniquePackages[j] == currentPackageId) { + isUnique = false; + break; + } + } + + if (isUnique) { + uniquePackages[packageCount] = currentPackageId; + packageCount++; + + if (packageCount >= max_cpus) { + break; + } + } + } + + return (packageCount == 0) ? 1 : packageCount; +} + +/* Retrieve the max number of physical cpu on the host */ +unsigned int vmsr_get_maxcpus(void) +{ + GDir *dir; + const gchar *entry_name; + unsigned int cpu_count = 0; + const char *path = "/sys/devices/system/cpu/"; + + dir = g_dir_open(path, 0, NULL); + if (dir == NULL) { + error_report("Unable to open cpu directory"); + return -1; + } + + while ((entry_name = g_dir_read_name(dir)) != NULL) { + if (g_ascii_strncasecmp(entry_name, "cpu", 3) == 0 && + isdigit(entry_name[3])) { + cpu_count++; + } + } + + g_dir_close(dir); + + return cpu_count; +} + +/* Count the number of physical cpu on each packages */ +unsigned int vmsr_count_cpus_per_package(unsigned int *package_count, + unsigned int max_pkgs) +{ + g_autofree char *file_contents = NULL; + g_autofree char *path = NULL; + g_autofree char *path_name = NULL; + gsize length; + + /* Iterate over cpus and count cpus in each package */ + for (int cpu_id = 0; ; cpu_id++) { + path_name = g_strdup_printf("/sys/devices/system/cpu/cpu%d/" + "topology/physical_package_id", cpu_id); + + path = g_build_filename(path_name, NULL); + + if (!g_file_get_contents(path, &file_contents, &length, NULL)) { + break; /* No more cpus */ + } + + /* Get the physical package ID for this CPU */ + int package_id = atoi(file_contents); + + /* Check if the package ID is within the known number of packages */ + if (package_id >= 0 && package_id < max_pkgs) { + /* If yes, count the cpu for this package*/ + package_count[package_id]++; + } + } + + return 0; +} + +/* Get the physical package id from a given cpu id */ +int vmsr_get_physical_package_id(int cpu_id) +{ + g_autofree char *file_contents = NULL; + g_autofree char *file_path = NULL; + int package_id = -1; + gsize length; + + file_path = g_strdup_printf("/sys/devices/system/cpu/cpu%d" + "/topology/physical_package_id", cpu_id); + + if (!g_file_get_contents(file_path, &file_contents, &length, NULL)) { + goto out; + } + + package_id = atoi(file_contents); + +out: + return package_id; +} + +/* Read the scheduled time for a given thread of a give pid */ +void vmsr_read_thread_stat(pid_t pid, + unsigned int thread_id, + unsigned long long *utime, + unsigned long long *stime, + unsigned int *cpu_id) +{ + g_autofree char *path = NULL; + g_autofree char *path_name = NULL; + + path_name = g_strdup_printf("/proc/%u/task/%d/stat", pid, thread_id); + + path = g_build_filename(path_name, NULL); + + FILE *file = fopen(path, "r"); + if (file == NULL) { + pid = -1; + return; + } + + if (fscanf(file, "%*d (%*[^)]) %*c %*d %*d %*d %*d %*d %*u %*u %*u %*u %*u" + " %llu %llu %*d %*d %*d %*d %*d %*d %*u %*u %*d %*u %*u" + " %*u %*u %*u %*u %*u %*u %*u %*u %*u %*d %*u %*u %u", + utime, stime, cpu_id) != 3) + { + pid = -1; + return; + } + + fclose(file); + return; +} + +/* Read QEMU stat task folder to retrieve all QEMU threads ID */ +pid_t *vmsr_get_thread_ids(pid_t pid, unsigned int *num_threads) +{ + g_autofree char *task_path = g_strdup_printf("%d/task", pid); + g_autofree char *path = g_build_filename("/proc", task_path, NULL); + + DIR *dir = opendir(path); + if (dir == NULL) { + error_report("Error opening /proc/qemu/task"); + return NULL; + } + + pid_t *thread_ids = NULL; + unsigned int thread_count = 0; + + g_autofree struct dirent *ent = NULL; + while ((ent = readdir(dir)) != NULL) { + if (ent->d_name[0] == '.') { + continue; + } + pid_t tid = atoi(ent->d_name); + if (pid != tid) { + thread_ids = g_renew(pid_t, thread_ids, (thread_count + 1)); + thread_ids[thread_count] = tid; + thread_count++; + } + } + + closedir(dir); + + *num_threads = thread_count; + return thread_ids; +} + +void vmsr_delta_ticks(vmsr_thread_stat *thd_stat, int i) +{ + thd_stat[i].delta_ticks = (thd_stat[i].utime[1] + thd_stat[i].stime[1]) + - (thd_stat[i].utime[0] + thd_stat[i].stime[0]); +} + +double vmsr_get_ratio(uint64_t e_delta, + unsigned long long delta_ticks, + unsigned int maxticks) +{ + return (e_delta / 100.0) * ((100.0 / maxticks) * delta_ticks); +} + +void vmsr_init_topo_info(X86CPUTopoInfo *topo_info, + const MachineState *ms) +{ + topo_info->dies_per_pkg = ms->smp.dies; + topo_info->modules_per_die = ms->smp.modules; + topo_info->cores_per_module = ms->smp.cores; + topo_info->threads_per_core = ms->smp.threads; +} + diff --git a/target/i386/kvm/vmsr_energy.h b/target/i386/kvm/vmsr_energy.h new file mode 100644 index 0000000000..16cc1f4814 --- /dev/null +++ b/target/i386/kvm/vmsr_energy.h @@ -0,0 +1,99 @@ +/* + * QEMU KVM support -- x86 virtual energy-related MSR. + * + * Copyright 2024 Red Hat, Inc. 2024 + * + * Author: + * Anthony Harivel <aharivel@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#ifndef VMSR_ENERGY_H +#define VMSR_ENERGY_H + +#include <stdint.h> +#include "qemu/osdep.h" +#include "io/channel-socket.h" +#include "hw/i386/topology.h" + +/* + * Define the interval time in micro seconds between 2 samples of + * energy related MSRs + */ +#define MSR_ENERGY_THREAD_SLEEP_US 1000000.0 + +/* + * Thread statistic + * @ thread_id: TID (thread ID) + * @ is_vcpu: true if TID is vCPU thread + * @ cpu_id: CPU number last executed on + * @ pkg_id: package number of the CPU + * @ vcpu_id: vCPU ID + * @ vpkg: virtual package number + * @ acpi_id: APIC id of the vCPU + * @ utime: amount of clock ticks the thread + * has been scheduled in User mode + * @ stime: amount of clock ticks the thread + * has been scheduled in System mode + * @ delta_ticks: delta of utime+stime between + * the two samples (before/after sleep) + */ +struct vmsr_thread_stat { + unsigned int thread_id; + bool is_vcpu; + unsigned int cpu_id; + unsigned int pkg_id; + unsigned int vpkg_id; + unsigned int vcpu_id; + unsigned long acpi_id; + unsigned long long *utime; + unsigned long long *stime; + unsigned long long delta_ticks; +}; + +/* + * Package statistic + * @ e_start: package energy counter before the sleep + * @ e_end: package energy counter after the sleep + * @ e_delta: delta of package energy counter + * @ e_ratio: store the energy ratio of non-vCPU thread + * @ nb_vcpu: number of vCPU running on this package + */ +struct vmsr_package_energy_stat { + uint64_t e_start; + uint64_t e_end; + uint64_t e_delta; + uint64_t e_ratio; + unsigned int nb_vcpu; +}; + +typedef struct vmsr_thread_stat vmsr_thread_stat; +typedef struct vmsr_package_energy_stat vmsr_package_energy_stat; + +char *vmsr_compute_default_paths(void); +void vmsr_read_thread_stat(pid_t pid, + unsigned int thread_id, + unsigned long long *utime, + unsigned long long *stime, + unsigned int *cpu_id); + +QIOChannelSocket *vmsr_open_socket(const char *path); +uint64_t vmsr_read_msr(uint32_t reg, uint32_t cpu_id, + uint32_t tid, QIOChannelSocket *sioc); +void vmsr_delta_ticks(vmsr_thread_stat *thd_stat, int i); +unsigned int vmsr_get_maxcpus(void); +unsigned int vmsr_get_max_physical_package(unsigned int max_cpus); +unsigned int vmsr_count_cpus_per_package(unsigned int *package_count, + unsigned int max_pkgs); +int vmsr_get_physical_package_id(int cpu_id); +pid_t *vmsr_get_thread_ids(pid_t pid, unsigned int *num_threads); +double vmsr_get_ratio(uint64_t e_delta, + unsigned long long delta_ticks, + unsigned int maxticks); +void vmsr_init_topo_info(X86CPUTopoInfo *topo_info, const MachineState *ms); +bool is_host_cpu_intel(void); +int is_rapl_enabled(void); +#endif /* VMSR_ENERGY_H */ diff --git a/target/loongarch/tcg/tlb_helper.c b/target/loongarch/tcg/tlb_helper.c index d6331f9b0b..97f38fc391 100644 --- a/target/loongarch/tcg/tlb_helper.c +++ b/target/loongarch/tcg/tlb_helper.c @@ -525,6 +525,7 @@ target_ulong helper_lddir(CPULoongArchState *env, target_ulong base, if (unlikely(level == 4)) { qemu_log_mask(LOG_GUEST_ERROR, "Attempted use of level 4 huge page\n"); + return base; } if (FIELD_EX64(base, TLBENTRY, LEVEL)) { diff --git a/target/sparc/ldst_helper.c b/target/sparc/ldst_helper.c index 2d48e98bf4..d92c9f1593 100644 --- a/target/sparc/ldst_helper.c +++ b/target/sparc/ldst_helper.c @@ -19,6 +19,7 @@ #include "qemu/osdep.h" #include "qemu/log.h" +#include "qemu/range.h" #include "cpu.h" #include "tcg/tcg.h" #include "exec/helper-proto.h" @@ -240,9 +241,7 @@ static void replace_tlb_1bit_lru(SparcTLBEntry *tlb, if (new_ctx == ctx) { uint64_t vaddr = tlb[i].tag & ~0x1fffULL; uint64_t size = 8192ULL << 3 * TTE_PGSIZE(tlb[i].tte); - if (new_vaddr == vaddr - || (new_vaddr < vaddr + size - && vaddr < new_vaddr + new_size)) { + if (ranges_overlap(new_vaddr, new_size, vaddr, size)) { DPRINTF_MMU("auto demap entry [%d] %lx->%lx\n", i, vaddr, new_vaddr); replace_tlb_entry(&tlb[i], tlb_tag, tlb_tte, env1); diff --git a/tests/data/acpi/aarch64/virt/DSDT b/tests/data/acpi/aarch64/virt/DSDT index c475039907..36d3e5d5a5 100644 --- a/tests/data/acpi/aarch64/virt/DSDT +++ b/tests/data/acpi/aarch64/virt/DSDT Binary files differdiff --git a/tests/data/acpi/aarch64/virt/DSDT.acpihmatvirt b/tests/data/acpi/aarch64/virt/DSDT.acpihmatvirt index aee6ba017c..e6154d0355 100644 --- a/tests/data/acpi/aarch64/virt/DSDT.acpihmatvirt +++ b/tests/data/acpi/aarch64/virt/DSDT.acpihmatvirt Binary files differdiff --git a/tests/data/acpi/aarch64/virt/DSDT.memhp b/tests/data/acpi/aarch64/virt/DSDT.memhp index bae36cdd39..33f011d6b6 100644 --- a/tests/data/acpi/aarch64/virt/DSDT.memhp +++ b/tests/data/acpi/aarch64/virt/DSDT.memhp Binary files differdiff --git a/tests/data/acpi/aarch64/virt/DSDT.pxb b/tests/data/acpi/aarch64/virt/DSDT.pxb index fbd78f44c4..c0fdc6e9c1 100644 --- a/tests/data/acpi/aarch64/virt/DSDT.pxb +++ b/tests/data/acpi/aarch64/virt/DSDT.pxb Binary files differdiff --git a/tests/data/acpi/aarch64/virt/DSDT.topology b/tests/data/acpi/aarch64/virt/DSDT.topology index 501314c91b..029d03eecc 100644 --- a/tests/data/acpi/aarch64/virt/DSDT.topology +++ b/tests/data/acpi/aarch64/virt/DSDT.topology Binary files differdiff --git a/tests/data/acpi/riscv64/virt/APIC b/tests/data/acpi/riscv64/virt/APIC new file mode 100644 index 0000000000..66a25dfd2d --- /dev/null +++ b/tests/data/acpi/riscv64/virt/APIC Binary files differdiff --git a/tests/data/acpi/riscv64/virt/DSDT b/tests/data/acpi/riscv64/virt/DSDT new file mode 100644 index 0000000000..6a33f5647d --- /dev/null +++ b/tests/data/acpi/riscv64/virt/DSDT Binary files differdiff --git a/tests/data/acpi/riscv64/virt/FACP b/tests/data/acpi/riscv64/virt/FACP new file mode 100644 index 0000000000..a5276b65ea --- /dev/null +++ b/tests/data/acpi/riscv64/virt/FACP Binary files differdiff --git a/tests/data/acpi/riscv64/virt/MCFG b/tests/data/acpi/riscv64/virt/MCFG new file mode 100644 index 0000000000..37eb923a93 --- /dev/null +++ b/tests/data/acpi/riscv64/virt/MCFG Binary files differdiff --git a/tests/data/acpi/riscv64/virt/RHCT b/tests/data/acpi/riscv64/virt/RHCT new file mode 100644 index 0000000000..4f231735ab --- /dev/null +++ b/tests/data/acpi/riscv64/virt/RHCT Binary files differdiff --git a/tests/data/acpi/riscv64/virt/SPCR b/tests/data/acpi/riscv64/virt/SPCR new file mode 100644 index 0000000000..4da9daf65f --- /dev/null +++ b/tests/data/acpi/riscv64/virt/SPCR Binary files differdiff --git a/tests/data/acpi/x86/microvm/DSDT.pcie b/tests/data/acpi/x86/microvm/DSDT.pcie index 765f14ef3d..8eacd21d6e 100644 --- a/tests/data/acpi/x86/microvm/DSDT.pcie +++ b/tests/data/acpi/x86/microvm/DSDT.pcie Binary files differdiff --git a/tests/qtest/bios-tables-test.c b/tests/qtest/bios-tables-test.c index f4c4704bab..36e5c0adde 100644 --- a/tests/qtest/bios-tables-test.c +++ b/tests/qtest/bios-tables-test.c @@ -267,15 +267,6 @@ static void dump_aml_files(test_data *data, bool rebuild) data->arch, data->machine, sdt->aml, ext); - /* - * To keep test cases not failing before the DATA files are moved to - * ${arch}/${machine} folder, add this check as well. - */ - if (!g_file_test(aml_file, G_FILE_TEST_EXISTS)) { - aml_file = g_strdup_printf("%s/%s/%.4s%s", data_dir, - data->machine, sdt->aml, ext); - } - if (!g_file_test(aml_file, G_FILE_TEST_EXISTS) && sdt->aml_len == exp_sdt->aml_len && !memcmp(sdt->aml, exp_sdt->aml, sdt->aml_len)) { @@ -412,11 +403,6 @@ static GArray *load_expected_aml(test_data *data) try_again: aml_file = g_strdup_printf("%s/%s/%s/%.4s%s", data_dir, data->arch, data->machine, sdt->aml, ext); - if (!g_file_test(aml_file, G_FILE_TEST_EXISTS)) { - aml_file = g_strdup_printf("%s/%s/%.4s%s", data_dir, data->machine, - sdt->aml, ext); - } - if (verbosity_level >= 2) { fprintf(stderr, "Looking for expected file '%s'\n", aml_file); } @@ -1977,6 +1963,28 @@ static void test_acpi_microvm_acpi_erst(void) } #endif /* CONFIG_POSIX */ +static void test_acpi_riscv64_virt_tcg(void) +{ + test_data data = { + .machine = "virt", + .arch = "riscv64", + .tcg_only = true, + .uefi_fl1 = "pc-bios/edk2-riscv-code.fd", + .uefi_fl2 = "pc-bios/edk2-riscv-vars.fd", + .cd = "tests/data/uefi-boot-images/bios-tables-test.riscv64.iso.qcow2", + .ram_start = 0x80000000ULL, + .scan_len = 128ULL * 1024 * 1024, + }; + + /* + * RHCT will have ISA string encoded. To reduce the effort + * of updating expected AML file for any new default ISA extension, + * use the profile rva22s64. + */ + test_acpi_one("-cpu rva22s64 ", &data); + free_test_data(&data); +} + static void test_acpi_aarch64_virt_tcg(void) { test_data data = { @@ -2455,6 +2463,10 @@ int main(int argc, char *argv[]) qtest_add_func("acpi/virt/viot", test_acpi_aarch64_virt_viot); } } + } else if (strcmp(arch, "riscv64") == 0) { + if (has_tcg && qtest_has_device("virtio-blk-pci")) { + qtest_add_func("acpi/virt", test_acpi_riscv64_virt_tcg); + } } ret = g_test_run(); boot_sector_cleanup(disk); diff --git a/tests/qtest/meson.build b/tests/qtest/meson.build index ff9200f882..e7ab2a4312 100644 --- a/tests/qtest/meson.build +++ b/tests/qtest/meson.build @@ -322,8 +322,7 @@ if gnutls.found() migration_files += [files('../unit/crypto-tls-psk-helpers.c'), gnutls] if tasn1.found() - migration_files += [files('../unit/crypto-tls-x509-helpers.c', - '../unit/pkix_asn1_tab.c'), tasn1] + migration_files += [files('../unit/crypto-tls-x509-helpers.c'), tasn1] endif endif diff --git a/tests/unit/crypto-tls-psk-helpers.c b/tests/unit/crypto-tls-psk-helpers.c index c6cc740772..36527fd655 100644 --- a/tests/unit/crypto-tls-psk-helpers.c +++ b/tests/unit/crypto-tls-psk-helpers.c @@ -20,7 +20,6 @@ #include "qemu/osdep.h" -#include "crypto-tls-x509-helpers.h" #include "crypto-tls-psk-helpers.h" #include "qemu/sockets.h" diff --git a/tests/unit/crypto-tls-x509-helpers.c b/tests/unit/crypto-tls-x509-helpers.c index e9937f60d8..3e74ec5b5d 100644 --- a/tests/unit/crypto-tls-x509-helpers.c +++ b/tests/unit/crypto-tls-x509-helpers.c @@ -20,15 +20,19 @@ #include "qemu/osdep.h" +#include <libtasn1.h> + #include "crypto-tls-x509-helpers.h" #include "crypto/init.h" #include "qemu/sockets.h" +#include "pkix_asn1_tab.c.inc" + /* * This stores some static data that is needed when * encoding extensions in the x509 certs */ -asn1_node pkix_asn1; +static asn1_node pkix_asn1; /* * To avoid consuming random entropy to generate keys, diff --git a/tests/unit/crypto-tls-x509-helpers.h b/tests/unit/crypto-tls-x509-helpers.h index 247e7160eb..562c160653 100644 --- a/tests/unit/crypto-tls-x509-helpers.h +++ b/tests/unit/crypto-tls-x509-helpers.h @@ -23,7 +23,6 @@ #include <gnutls/gnutls.h> #include <gnutls/x509.h> -#include <libtasn1.h> #define QCRYPTO_TLS_TEST_CLIENT_NAME "ACME QEMU Client" @@ -171,6 +170,4 @@ void test_tls_cleanup(const char *keyfile); }; \ test_tls_generate_cert(&varname, cavarname.crt) -extern const asn1_static_node pkix_asn1_tab[]; - #endif diff --git a/tests/unit/meson.build b/tests/unit/meson.build index 26c109c968..490ab8182d 100644 --- a/tests/unit/meson.build +++ b/tests/unit/meson.build @@ -99,11 +99,11 @@ if have_block tasn1.found() and \ host_os != 'windows' tests += { - 'test-crypto-tlscredsx509': ['crypto-tls-x509-helpers.c', 'pkix_asn1_tab.c', + 'test-crypto-tlscredsx509': ['crypto-tls-x509-helpers.c', tasn1, crypto, gnutls], - 'test-crypto-tlssession': ['crypto-tls-x509-helpers.c', 'pkix_asn1_tab.c', 'crypto-tls-psk-helpers.c', + 'test-crypto-tlssession': ['crypto-tls-x509-helpers.c', 'crypto-tls-psk-helpers.c', tasn1, crypto, gnutls], - 'test-io-channel-tls': ['io-channel-helpers.c', 'crypto-tls-x509-helpers.c', 'pkix_asn1_tab.c', + 'test-io-channel-tls': ['io-channel-helpers.c', 'crypto-tls-x509-helpers.c', tasn1, io, crypto, gnutls]} endif if pam.found() diff --git a/tests/unit/pkix_asn1_tab.c b/tests/unit/pkix_asn1_tab.c.inc index 89521408a1..fe29c4102a 100644 --- a/tests/unit/pkix_asn1_tab.c +++ b/tests/unit/pkix_asn1_tab.c.inc @@ -3,10 +3,7 @@ * and is under copyright of various GNUTLS contributors. */ -#include "qemu/osdep.h" -#include "crypto-tls-x509-helpers.h" - -const asn1_static_node pkix_asn1_tab[] = { +static const asn1_static_node pkix_asn1_tab[] = { {"PKIX1", 536875024, 0}, {0, 1073741836, 0}, {"id-ce", 1879048204, 0}, diff --git a/tests/unit/test-crypto-tlssession.c b/tests/unit/test-crypto-tlssession.c index b12e7b6879..3395f73560 100644 --- a/tests/unit/test-crypto-tlssession.c +++ b/tests/unit/test-crypto-tlssession.c @@ -35,18 +35,40 @@ #define PSKFILE WORKDIR "keys.psk" #define KEYFILE WORKDIR "key-ctx.pem" -static ssize_t testWrite(const char *buf, size_t len, void *opaque) +static ssize_t +testWrite(const char *buf, size_t len, void *opaque, Error **errp) { int *fd = opaque; + int ret; - return write(*fd, buf, len); + ret = write(*fd, buf, len); + if (ret < 0) { + if (errno == EAGAIN) { + return QCRYPTO_TLS_SESSION_ERR_BLOCK; + } else { + error_setg_errno(errp, errno, "unable to write"); + return -1; + } + } + return ret; } -static ssize_t testRead(char *buf, size_t len, void *opaque) +static ssize_t +testRead(char *buf, size_t len, void *opaque, Error **errp) { int *fd = opaque; + int ret; - return read(*fd, buf, len); + ret = read(*fd, buf, len); + if (ret < 0) { + if (errno == EAGAIN) { + return QCRYPTO_TLS_SESSION_ERR_BLOCK; + } else { + error_setg_errno(errp, errno, "unable to read"); + return -1; + } + } + return ret; } static QCryptoTLSCreds *test_tls_creds_psk_create( diff --git a/tools/i386/qemu-vmsr-helper.c b/tools/i386/qemu-vmsr-helper.c new file mode 100644 index 0000000000..ebf562c3ff --- /dev/null +++ b/tools/i386/qemu-vmsr-helper.c @@ -0,0 +1,530 @@ +/* + * Privileged RAPL MSR helper commands for QEMU + * + * Copyright (C) 2024 Red Hat, Inc. <aharivel@redhat.com> + * + * Author: Anthony Harivel <aharivel@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; under version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "qemu/osdep.h" +#include <getopt.h> +#include <stdbool.h> +#include <sys/ioctl.h> +#ifdef CONFIG_LIBCAP_NG +#include <cap-ng.h> +#endif +#include <pwd.h> +#include <grp.h> + +#include "qemu/help-texts.h" +#include "qapi/error.h" +#include "qemu/cutils.h" +#include "qemu/main-loop.h" +#include "qemu/module.h" +#include "qemu/error-report.h" +#include "qemu/config-file.h" +#include "qemu-version.h" +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "qemu/log.h" +#include "qemu/systemd.h" +#include "io/channel.h" +#include "io/channel-socket.h" +#include "trace/control.h" +#include "qemu-version.h" +#include "rapl-msr-index.h" + +#define MSR_PATH_TEMPLATE "/dev/cpu/%u/msr" + +static char *socket_path; +static char *pidfile; +static enum { RUNNING, TERMINATE, TERMINATING } state; +static QIOChannelSocket *server_ioc; +static int server_watch; +static int num_active_sockets = 1; + +#ifdef CONFIG_LIBCAP_NG +static int uid = -1; +static int gid = -1; +#endif + +static void compute_default_paths(void) +{ + g_autofree char *state = qemu_get_local_state_dir(); + + socket_path = g_build_filename(state, "run", "qemu-vmsr-helper.sock", NULL); + pidfile = g_build_filename(state, "run", "qemu-vmsr-helper.pid", NULL); +} + +static int is_intel_processor(void) +{ + int result; + int ebx, ecx, edx; + + /* Execute CPUID instruction with eax=0 (basic identification) */ + asm volatile ( + "cpuid" + : "=b" (ebx), "=c" (ecx), "=d" (edx) + : "a" (0) + ); + + /* + * Check if processor is "GenuineIntel" + * 0x756e6547 = "Genu" + * 0x49656e69 = "ineI" + * 0x6c65746e = "ntel" + */ + result = (ebx == 0x756e6547) && (edx == 0x49656e69) && (ecx == 0x6c65746e); + + return result; +} + +static int is_rapl_enabled(void) +{ + const char *path = "/sys/class/powercap/intel-rapl/enabled"; + FILE *file = fopen(path, "r"); + int value = 0; + + if (file != NULL) { + if (fscanf(file, "%d", &value) != 1) { + error_report("INTEL RAPL not enabled"); + } + fclose(file); + } else { + error_report("Error opening %s", path); + } + + return value; +} + +/* + * Check if the TID that request the MSR read + * belongs to the peer. It be should a TID of a vCPU. + */ +static bool is_tid_present(pid_t pid, pid_t tid) +{ + g_autofree char *tidPath = g_strdup_printf("/proc/%d/task/%d", pid, tid); + + /* Check if the TID directory exists within the PID directory */ + if (access(tidPath, F_OK) == 0) { + return true; + } + + error_report("Failed to open /proc at %s", tidPath); + return false; +} + +/* + * Only the RAPL MSR in target/i386/cpu.h are allowed + */ +static bool is_msr_allowed(uint32_t reg) +{ + switch (reg) { + case MSR_RAPL_POWER_UNIT: + case MSR_PKG_POWER_LIMIT: + case MSR_PKG_ENERGY_STATUS: + case MSR_PKG_POWER_INFO: + return true; + default: + return false; + } +} + +static uint64_t vmsr_read_msr(uint32_t msr_register, unsigned int cpu_id) +{ + int fd; + uint64_t result = 0; + + g_autofree char *path = g_strdup_printf(MSR_PATH_TEMPLATE, cpu_id); + + fd = open(path, O_RDONLY); + if (fd < 0) { + error_report("Failed to open MSR file at %s", path); + return result; + } + + if (pread(fd, &result, sizeof(result), msr_register) != sizeof(result)) { + error_report("Failed to read MSR"); + result = 0; + } + + close(fd); + return result; +} + +static void usage(const char *name) +{ + (printf) ( +"Usage: %s [OPTIONS] FILE\n" +"Virtual RAPL MSR helper program for QEMU\n" +"\n" +" -h, --help display this help and exit\n" +" -V, --version output version information and exit\n" +"\n" +" -d, --daemon run in the background\n" +" -f, --pidfile=PATH PID file when running as a daemon\n" +" (default '%s')\n" +" -k, --socket=PATH path to the unix socket\n" +" (default '%s')\n" +" -T, --trace [[enable=]<pattern>][,events=<file>][,file=<file>]\n" +" specify tracing options\n" +#ifdef CONFIG_LIBCAP_NG +" -u, --user=USER user to drop privileges to\n" +" -g, --group=GROUP group to drop privileges to\n" +#endif +"\n" +QEMU_HELP_BOTTOM "\n" + , name, pidfile, socket_path); +} + +static void version(const char *name) +{ + printf( +"%s " QEMU_FULL_VERSION "\n" +"Written by Anthony Harivel.\n" +"\n" +QEMU_COPYRIGHT "\n" +"This is free software; see the source for copying conditions. There is NO\n" +"warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n" + , name); +} + +typedef struct VMSRHelperClient { + QIOChannelSocket *ioc; + Coroutine *co; +} VMSRHelperClient; + +static void coroutine_fn vh_co_entry(void *opaque) +{ + VMSRHelperClient *client = opaque; + Error *local_err = NULL; + unsigned int peer_pid; + uint32_t request[3]; + uint64_t vmsr; + int r; + + qio_channel_set_blocking(QIO_CHANNEL(client->ioc), + false, NULL); + + qio_channel_set_follow_coroutine_ctx(QIO_CHANNEL(client->ioc), true); + + /* + * Check peer credentials + */ + r = qio_channel_get_peerpid(QIO_CHANNEL(client->ioc), + &peer_pid, + &local_err); + if (r < 0) { + error_report_err(local_err); + goto out; + } + + while (r < 0) { + /* + * Read the requested MSR + * Only RAPL MSR in rapl-msr-index.h is allowed + */ + r = qio_channel_read_all(QIO_CHANNEL(client->ioc), + (char *) &request, sizeof(request), &local_err); + if (r < 0) { + error_report_err(local_err); + break; + } + + if (!is_msr_allowed(request[0])) { + error_report("Requested unallowed msr: %d", request[0]); + break; + } + + vmsr = vmsr_read_msr(request[0], request[1]); + + if (!is_tid_present(peer_pid, request[2])) { + error_report("Requested TID not in peer PID: %d %d", + peer_pid, request[2]); + vmsr = 0; + } + + r = qio_channel_write_all(QIO_CHANNEL(client->ioc), + (char *) &vmsr, + sizeof(vmsr), + &local_err); + if (r < 0) { + error_report_err(local_err); + break; + } + } +out: + object_unref(OBJECT(client->ioc)); + g_free(client); +} + +static gboolean accept_client(QIOChannel *ioc, + GIOCondition cond, + gpointer opaque) +{ + QIOChannelSocket *cioc; + VMSRHelperClient *vmsrh; + + cioc = qio_channel_socket_accept(QIO_CHANNEL_SOCKET(ioc), + NULL); + if (!cioc) { + return TRUE; + } + + vmsrh = g_new(VMSRHelperClient, 1); + vmsrh->ioc = cioc; + vmsrh->co = qemu_coroutine_create(vh_co_entry, vmsrh); + qemu_coroutine_enter(vmsrh->co); + + return TRUE; +} + +static void termsig_handler(int signum) +{ + qatomic_cmpxchg(&state, RUNNING, TERMINATE); + qemu_notify_event(); +} + +static void close_server_socket(void) +{ + assert(server_ioc); + + g_source_remove(server_watch); + server_watch = -1; + object_unref(OBJECT(server_ioc)); + num_active_sockets--; +} + +#ifdef CONFIG_LIBCAP_NG +static int drop_privileges(void) +{ + /* clear all capabilities */ + capng_clear(CAPNG_SELECT_BOTH); + + if (capng_update(CAPNG_ADD, CAPNG_EFFECTIVE | CAPNG_PERMITTED, + CAP_SYS_RAWIO) < 0) { + return -1; + } + + return 0; +} +#endif + +int main(int argc, char **argv) +{ + const char *sopt = "hVk:f:dT:u:g:vq"; + struct option lopt[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, 'V' }, + { "socket", required_argument, NULL, 'k' }, + { "pidfile", required_argument, NULL, 'f' }, + { "daemon", no_argument, NULL, 'd' }, + { "trace", required_argument, NULL, 'T' }, + { "verbose", no_argument, NULL, 'v' }, + { NULL, 0, NULL, 0 } + }; + int opt_ind = 0; + int ch; + Error *local_err = NULL; + bool daemonize = false; + bool pidfile_specified = false; + bool socket_path_specified = false; + unsigned socket_activation; + + struct sigaction sa_sigterm; + memset(&sa_sigterm, 0, sizeof(sa_sigterm)); + sa_sigterm.sa_handler = termsig_handler; + sigaction(SIGTERM, &sa_sigterm, NULL); + sigaction(SIGINT, &sa_sigterm, NULL); + sigaction(SIGHUP, &sa_sigterm, NULL); + + signal(SIGPIPE, SIG_IGN); + + error_init(argv[0]); + module_call_init(MODULE_INIT_TRACE); + module_call_init(MODULE_INIT_QOM); + qemu_add_opts(&qemu_trace_opts); + qemu_init_exec_dir(argv[0]); + + compute_default_paths(); + + /* + * Sanity check + * 1. cpu must be Intel cpu + * 2. RAPL must be enabled + */ + if (!is_intel_processor()) { + error_report("error: CPU is not INTEL cpu"); + exit(EXIT_FAILURE); + } + + if (!is_rapl_enabled()) { + error_report("error: RAPL driver not enable"); + exit(EXIT_FAILURE); + } + + while ((ch = getopt_long(argc, argv, sopt, lopt, &opt_ind)) != -1) { + switch (ch) { + case 'k': + g_free(socket_path); + socket_path = g_strdup(optarg); + socket_path_specified = true; + if (socket_path[0] != '/') { + error_report("socket path must be absolute"); + exit(EXIT_FAILURE); + } + break; + case 'f': + g_free(pidfile); + pidfile = g_strdup(optarg); + pidfile_specified = true; + break; +#ifdef CONFIG_LIBCAP_NG + case 'u': { + unsigned long res; + struct passwd *userinfo = getpwnam(optarg); + if (userinfo) { + uid = userinfo->pw_uid; + } else if (qemu_strtoul(optarg, NULL, 10, &res) == 0 && + (uid_t)res == res) { + uid = res; + } else { + error_report("invalid user '%s'", optarg); + exit(EXIT_FAILURE); + } + break; + } + case 'g': { + unsigned long res; + struct group *groupinfo = getgrnam(optarg); + if (groupinfo) { + gid = groupinfo->gr_gid; + } else if (qemu_strtoul(optarg, NULL, 10, &res) == 0 && + (gid_t)res == res) { + gid = res; + } else { + error_report("invalid group '%s'", optarg); + exit(EXIT_FAILURE); + } + break; + } +#else + case 'u': + case 'g': + error_report("-%c not supported by this %s", ch, argv[0]); + exit(1); +#endif + case 'd': + daemonize = true; + break; + case 'T': + trace_opt_parse(optarg); + break; + case 'V': + version(argv[0]); + exit(EXIT_SUCCESS); + break; + case 'h': + usage(argv[0]); + exit(EXIT_SUCCESS); + break; + case '?': + error_report("Try `%s --help' for more information.", argv[0]); + exit(EXIT_FAILURE); + } + } + + if (!trace_init_backends()) { + exit(EXIT_FAILURE); + } + trace_init_file(); + qemu_set_log(LOG_TRACE, &error_fatal); + + socket_activation = check_socket_activation(); + if (socket_activation == 0) { + SocketAddress saddr; + saddr = (SocketAddress){ + .type = SOCKET_ADDRESS_TYPE_UNIX, + .u.q_unix.path = socket_path, + }; + server_ioc = qio_channel_socket_new(); + if (qio_channel_socket_listen_sync(server_ioc, &saddr, + 1, &local_err) < 0) { + object_unref(OBJECT(server_ioc)); + error_report_err(local_err); + return 1; + } + } else { + /* Using socket activation - check user didn't use -p etc. */ + if (socket_path_specified) { + error_report("Unix socket can't be set when" + "using socket activation"); + exit(EXIT_FAILURE); + } + + /* Can only listen on a single socket. */ + if (socket_activation > 1) { + error_report("%s does not support socket activation" + "with LISTEN_FDS > 1", + argv[0]); + exit(EXIT_FAILURE); + } + server_ioc = qio_channel_socket_new_fd(FIRST_SOCKET_ACTIVATION_FD, + &local_err); + if (server_ioc == NULL) { + error_reportf_err(local_err, + "Failed to use socket activation: "); + exit(EXIT_FAILURE); + } + } + + qemu_init_main_loop(&error_fatal); + + server_watch = qio_channel_add_watch(QIO_CHANNEL(server_ioc), + G_IO_IN, + accept_client, + NULL, NULL); + + if (daemonize) { + if (daemon(0, 0) < 0) { + error_report("Failed to daemonize: %s", strerror(errno)); + exit(EXIT_FAILURE); + } + } + + if (daemonize || pidfile_specified) { + qemu_write_pidfile(pidfile, &error_fatal); + } + +#ifdef CONFIG_LIBCAP_NG + if (drop_privileges() < 0) { + error_report("Failed to drop privileges: %s", strerror(errno)); + exit(EXIT_FAILURE); + } +#endif + + info_report("Listening on %s", socket_path); + + state = RUNNING; + do { + main_loop_wait(false); + if (state == TERMINATE) { + state = TERMINATING; + close_server_socket(); + } + } while (num_active_sockets > 0); + + exit(EXIT_SUCCESS); +} diff --git a/tools/i386/rapl-msr-index.h b/tools/i386/rapl-msr-index.h new file mode 100644 index 0000000000..9a7118639a --- /dev/null +++ b/tools/i386/rapl-msr-index.h @@ -0,0 +1,28 @@ +/* + * Allowed list of MSR for Privileged RAPL MSR helper commands for QEMU + * + * Copyright (C) 2023 Red Hat, Inc. <aharivel@redhat.com> + * + * Author: Anthony Harivel <aharivel@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; under version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +/* + * Should stay in sync with the RAPL MSR + * in target/i386/cpu.h + */ +#define MSR_RAPL_POWER_UNIT 0x00000606 +#define MSR_PKG_POWER_LIMIT 0x00000610 +#define MSR_PKG_ENERGY_STATUS 0x00000611 +#define MSR_PKG_POWER_INFO 0x00000614 diff --git a/ui/console-vc.c b/ui/console-vc.c index 899fa11c94..8393d532e7 100644 --- a/ui/console-vc.c +++ b/ui/console-vc.c @@ -287,7 +287,7 @@ static void kbd_send_chars(QemuTextConsole *s) const uint8_t *buf; uint32_t size; - buf = fifo8_pop_buf(&s->out_fifo, MIN(len, avail), &size); + buf = fifo8_pop_bufptr(&s->out_fifo, MIN(len, avail), &size); qemu_chr_be_write(s->chr, buf, size); len = qemu_chr_be_can_write(s->chr); avail -= size; diff --git a/ui/gtk.c b/ui/gtk.c index bc29f7a1b4..8e14c2ac81 100644 --- a/ui/gtk.c +++ b/ui/gtk.c @@ -1820,7 +1820,7 @@ static void gd_vc_send_chars(VirtualConsole *vc) const uint8_t *buf; uint32_t size; - buf = fifo8_pop_buf(&vc->vte.out_fifo, MIN(len, avail), &size); + buf = fifo8_pop_bufptr(&vc->vte.out_fifo, MIN(len, avail), &size); qemu_chr_be_write(vc->vte.chr, buf, size); len = qemu_chr_be_can_write(vc->vte.chr); avail -= size; diff --git a/util/fifo8.c b/util/fifo8.c index 4e01b532d9..1ffa19d900 100644 --- a/util/fifo8.c +++ b/util/fifo8.c @@ -16,12 +16,17 @@ #include "migration/vmstate.h" #include "qemu/fifo8.h" +void fifo8_reset(Fifo8 *fifo) +{ + fifo->num = 0; + fifo->head = 0; +} + void fifo8_create(Fifo8 *fifo, uint32_t capacity) { fifo->data = g_new(uint8_t, capacity); fifo->capacity = capacity; - fifo->head = 0; - fifo->num = 0; + fifo8_reset(fifo); } void fifo8_destroy(Fifo8 *fifo) @@ -87,20 +92,49 @@ static const uint8_t *fifo8_peekpop_buf(Fifo8 *fifo, uint32_t max, return ret; } -const uint8_t *fifo8_peek_buf(Fifo8 *fifo, uint32_t max, uint32_t *numptr) +const uint8_t *fifo8_peek_bufptr(Fifo8 *fifo, uint32_t max, uint32_t *numptr) { return fifo8_peekpop_buf(fifo, max, numptr, false); } -const uint8_t *fifo8_pop_buf(Fifo8 *fifo, uint32_t max, uint32_t *numptr) +const uint8_t *fifo8_pop_bufptr(Fifo8 *fifo, uint32_t max, uint32_t *numptr) { return fifo8_peekpop_buf(fifo, max, numptr, true); } -void fifo8_reset(Fifo8 *fifo) +uint32_t fifo8_pop_buf(Fifo8 *fifo, uint8_t *dest, uint32_t destlen) { - fifo->num = 0; - fifo->head = 0; + const uint8_t *buf; + uint32_t n1, n2 = 0; + uint32_t len; + + if (destlen == 0) { + return 0; + } + + len = destlen; + buf = fifo8_pop_bufptr(fifo, len, &n1); + if (dest) { + memcpy(dest, buf, n1); + } + + /* Add FIFO wraparound if needed */ + len -= n1; + len = MIN(len, fifo8_num_used(fifo)); + if (len) { + buf = fifo8_pop_bufptr(fifo, len, &n2); + if (dest) { + memcpy(&dest[n1], buf, n2); + } + } + + return n1 + n2; +} + +void fifo8_drop(Fifo8 *fifo, uint32_t len) +{ + len -= fifo8_pop_buf(fifo, NULL, len); + assert(len == 0); } bool fifo8_is_empty(Fifo8 *fifo) |