Qemu hangs during migration Source server: linux 4.19.5 qemu-3.0.0 from source, libvirt 4.9 Dest server: linux 4.18.19 qemu-3.0.0 from source, libvirt 4.9 When this VM is running on source server: /usr/bin/qemu-system-x86_64 -name guest=testvm,debug-threads=on -S -object secret,id=masterKey0,format=raw,file=/var/lib/libvirt/qemu/domain-13-testvm/master-key.aes -machine pc-q35-3.0,accel=kvm,usb=off,dump-guest-core=off -cpu Skylake-Server-IBRS,ss=on,hypervisor=on,tsc_adjust=on,clflushopt=on,umip=on,pku=on,ssbd=on,xsaves=on,topoext=on,hv_time,hv_relaxed,hv_vapic,hv_spinlocks=0x1fff,hv_vpindex,hv_runtime,hv_synic,hv_stimer,hv_reset,hv_vendor_id=KVM Hv -m 4096 -realtime mlock=off -smp 2,sockets=2,cores=1,threads=1 -object iothread,id=iothread1 -uuid 3b00b788-ee91-4e45-80a6-c7319da71225 -no-user-config -nodefaults -chardev socket,id=charmonitor,fd=23,server,nowait -mon chardev=charmonitor,id=monitor,mode=control -rtc base=localtime,driftfix=slew -global kvm-pit.lost_tick_policy=delay -no-hpet -no-shutdown -boot strict=on -device pcie-root-port,port=0x10,chassis=1,id=pci.1,bus=pcie.0,multifunction=on,addr=0x2 -device pcie-root-port,port=0x11,chassis=2,id=pci.2,bus=pcie.0,addr=0x2.0x1 -device pcie-pci-bridge,id=pci.3,bus=pci.1,addr=0x0 -device pcie-root-port,port=0x12,chassis=4,id=pci.4,bus=pcie.0,addr=0x2.0x2 -device pcie-root-port,port=0x13,chassis=5,id=pci.5,bus=pcie.0,addr=0x2.0x3 -device piix3-usb-uhci,id=usb,bus=pci.3,addr=0x1 -device virtio-scsi-pci,iothread=iothread1,id=scsi0,bus=pci.4,addr=0x0 -drive file=/dev/zvol/datastore/vm/testvm-vda,format=raw,if=none,id=drive-scsi0-0-0-0,cache=writeback,aio=threads -device scsi-hd,bus=scsi0.0,channel=0,scsi-id=0,lun=0,drive=drive-scsi0-0-0-0,id=scsi0-0-0-0,bootindex=2,write-cache=on -drive if=none,id=drive-sata0-0-4,media=cdrom,readonly=on -device ide-cd,bus=ide.4,drive=drive-sata0-0-4,id=sata0-0-4,bootindex=1 -netdev tap,fd=25,id=hostnet0,vhost=on,vhostfd=26 -device 
virtio-net-pci,netdev=hostnet0,id=net0,mac=52:54:00:a2:b7:a1,bus=pci.2,addr=0x0 -device usb-tablet,id=input0,bus=usb.0,port=1 -vnc 127.0.0.1:0 -device cirrus-vga,id=video0,bus=pcie.0,addr=0x1 -s -sandbox on,obsolete=deny,elevateprivileges=deny,spawn=deny,resourcecontrol=deny -msg timestamp=on I try to migrate it and the disks to the other side: virsh migrate --live --undefinesource --persistent --verbose --copy-storage-all testvm qemu+ssh://wasvirt1/system We get to 99% then hang with both sides in the pause state. Source server is stuck here: (gdb) bt full #0 0x00007f327994f3c1 in ppoll () at /lib64/libc.so.6 #1 0x000000000086167b in qemu_poll_ns (fds=, nfds=nfds@entry=1, timeout=) at util/qemu-timer.c:322 #2 0x0000000000863302 in aio_poll (ctx=0x21044e0, blocking=blocking@entry=true) at util/aio-posix.c:629 node = i = ret = 0 progress = timeout = start = __PRETTY_FUNCTION__ = "aio_poll" #3 0x00000000007e0d52 in nbd_client_close (bs=0x2ba2400) at block/nbd-client.c:62 waited_ = wait_ = 0x2ba563c ctx_ = 0x2109bb0 bs_ = 0x2ba2400 client = 0x31287e0 client = request = {handle = 0, from = 0, len = 0, flags = 0, type = 2} #4 0x00000000007e0d52 in nbd_client_close (bs=0x2ba2400) at block/nbd-client.c:965 client = request = {handle = 0, from = 0, len = 0, flags = 0, type = 2} #5 0x00000000007de5ca in nbd_close (bs=) at block/nbd.c:491 s = 0x31287e0 #6 0x00000000007823d6 in bdrv_unref (bs=0x2ba2400) at block.c:3352 ban = ban_next = child = next = #7 0x00000000007823d6 in bdrv_unref (bs=0x2ba2400) at block.c:3560 #8 0x00000000007823d6 in bdrv_unref (bs=0x2ba2400) at block.c:4616 #9 0x0000000000782403 in bdrv_unref (bs=0x2af96f0) at block.c:3359 ban = ban_next = child = next = #10 0x0000000000782403 in bdrv_unref (bs=0x2af96f0) at block.c:3560 #11 0x0000000000782403 in bdrv_unref (bs=0x2af96f0) at block.c:4616 #12 0x0000000000785784 in block_job_remove_all_bdrv (job=job@entry=0x2f32570) at blockjob.c:200 c = 0x23bac30 l = 0x20dd330 = {0x23bac30, 0x2b89410} #13 
0x00000000007ceb5f in mirror_exit (job=0x2f32570, opaque=0x7f326407a350) at block/mirror.c:700 s = 0x2f32570 bjob = 0x2f32570 data = 0x7f326407a350 bs_opaque = 0x30d5600 replace_aio_context = src = 0x2131080 target_bs = 0x2af96f0 mirror_top_bs = 0x210eb70 local_err = 0x0 #14 0x0000000000786452 in job_defer_to_main_loop_bh (opaque=0x7f32640786a0) at job.c:973 data = 0x7f32640786a0 job = aio_context = 0x2109bb0 #15 0x000000000085fd3f in aio_bh_poll (ctx=ctx@entry=0x21044e0) at util/async.c:118 ---Type to continue, or q to quit--- bh = bhp = next = 0x2ea86e0 ret = 1 deleted = false #16 0x00000000008631b0 in aio_dispatch (ctx=0x21044e0) at util/aio-posix.c:436 #17 0x000000000085fc1e in aio_ctx_dispatch (source=, callback=, user_data=) at util/async.c:261 ctx = #18 0x00007f327f17d797 in g_main_context_dispatch () at /usr/lib64/libglib-2.0.so.0 #19 0x00000000008622ed in main_loop_wait () at util/main-loop.c:215 context = 0x2104900 pfds = context = 0x2104900 ret = 1 ret = 1 timeout = 4294967295 timeout_ns = #20 0x00000000008622ed in main_loop_wait (timeout=) at util/main-loop.c:238 context = 0x2104900 ret = 1 ret = 1 timeout = 4294967295 timeout_ns = #21 0x00000000008622ed in main_loop_wait (nonblocking=nonblocking@entry=0) at util/main-loop.c:497 ret = 1 timeout = 4294967295 timeout_ns = #22 0x0000000000595dee in main_loop () at vl.c:1866 #23 0x000000000041f35d in main (argc=, argv=, envp=) at vl.c:4644 i = snapshot = 0 linux_boot = initrd_filename = 0x0 kernel_filename = kernel_cmdline = boot_order = 0x918f44 "cad" boot_once = 0x0 ds = opts = machine_opts = icount_opts = accel_opts = 0x0 olist = optind = 71 optarg = 0x7ffdfc94df69 "timestamp=on" loadvm = 0x0 machine_class = 0x0 cpu_model = 0x7ffdfc94d864 "Skylake-Server-IBRS,ss=on,hypervisor=on,tsc_adjust=on,clflushopt=on,umip=on,pku=on,ssbd=on,xsaves=on,topoext=on,hv_time,hv_relaxed,hv_vapic,hv_spinlocks=0x1fff,hv_vpindex,hv_runtime,hv_synic,hv_stimer"... 
vga_model = 0x0 qtest_chrdev = 0x0 qtest_log = 0x0 pid_file = incoming = 0x0 userconfig = ---Type to continue, or q to quit--- nographic = false display_remote = log_mask = log_file = trace_file = maxram_size = 4294967296 ram_slots = 0 vmstate_dump_file = 0x0 main_loop_err = 0x0 err = 0x0 list_data_dirs = false dir = dirs = bdo_queue = {sqh_first = 0x0, sqh_last = 0x7ffdfc94c170} __func__ = "main" Strace shows: ppoll([{fd=9, events=POLLIN|POLLERR|POLLHUP}], 1, NULL, NULL, 8 Which points to this: ls -al /proc/2286/fd/9 lrwx------ 1 root users 64 Dec 5 13:04 /proc/2286/fd/9 -> anon_inode:[eventfd] The dest side is stuck here: (gdb) bt full #0 0x00007f21f070d3c1 in ppoll () at /lib64/libc.so.6 #1 0x0000000000861659 in qemu_poll_ns (fds=, nfds=, timeout=timeout@entry=2999926258) at util/qemu-timer.c:334 ts = {tv_sec = 2, tv_nsec = 999926258} Python Exception That operation is not available on integers of more than 8 bytes.: #2 0x00000000008622a4 in main_loop_wait (timeout=) at util/main-loop.c:233 context = 0x2142900 ret = ret = -1295041038 timeout = 4294967295 timeout_ns = #3 0x00000000008622a4 in main_loop_wait (nonblocking=nonblocking@entry=0) at util/main-loop.c:497 ret = -1295041038 timeout = 4294967295 timeout_ns = #4 0x0000000000595dee in main_loop () at vl.c:1866 #5 0x000000000041f35d in main (argc=, argv=, envp=) at vl.c:4644 i = snapshot = 0 linux_boot = initrd_filename = 0x0 kernel_filename = kernel_cmdline = boot_order = 0x918f44 "cad" boot_once = 0x0 ds = opts = machine_opts = icount_opts = accel_opts = 0x0 olist = optind = 73 optarg = 0x7ffdd6ee8f69 "timestamp=on" loadvm = 0x0 machine_class = 0x0 cpu_model = 0x7ffdd6ee8854 "Skylake-Server-IBRS,ss=on,hypervisor=on,tsc_adjust=on,clflushopt=on,umip=on,pku=on,ssbd=on,xsaves=on,topoext=on,hv_time,hv_relaxed,hv_vapic,hv_spinlocks=0x1fff,hv_vpindex,hv_runtime,hv_synic,hv_stimer"... 
vga_model = 0x0 qtest_chrdev = 0x0 qtest_log = 0x0 pid_file = incoming = 0x7ffdd6ee8f0a "defer" userconfig = nographic = false display_remote = log_mask = log_file = trace_file = maxram_size = 4294967296 ram_slots = 0 vmstate_dump_file = 0x0 main_loop_err = 0x0 err = 0x0 list_data_dirs = false dir = dirs = bdo_queue = {sqh_first = 0x0, sqh_last = 0x7ffdd6ee6630} ---Type <return> to continue, or q <return> to quit--- __func__ = "main" Strace shows this over and over: ppoll([{fd=6, events=POLLIN}, {fd=7, events=POLLIN}, {fd=9, events=POLLIN}, {fd=10, events=POLLIN}, {fd=21, events=POLLIN}, {fd=22, events=POLLIN}, {fd=23, events=POLLIN}, {fd=24, events=POLLIN}, {fd=27, events=POLLIN}], 9, {0, 594527977}, NULL, 8) = 0 (Timeout) lrwx------ 1 root users 64 Dec 5 13:15 /proc/20170/fd/10 -> anon_inode:[eventfd] lrwx------ 1 root users 64 Dec 5 13:15 /proc/20170/fd/21 -> socket:[42631161] lrwx------ 1 root users 64 Dec 5 13:15 /proc/20170/fd/22 -> socket:[42631165] lrwx------ 1 root users 64 Dec 5 13:15 /proc/20170/fd/23 -> socket:[42631167] lrwx------ 1 root users 64 Dec 5 13:15 /proc/20170/fd/24 -> socket:[42631168] lrwx------ 1 root users 64 Dec 5 13:15 /proc/20170/fd/27 -> socket:[42690422] lrwx------ 1 root users 64 Dec 5 13:15 /proc/20170/fd/6 -> anon_inode:[eventfd] lrwx------ 1 root users 64 Dec 5 13:15 /proc/20170/fd/7 -> anon_inode:[signalfd] lrwx------ 1 root users 64 Dec 5 13:15 /proc/20170/fd/9 -> anon_inode:[eventfd]