1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
|
Qemu hangs during migration
Source server: linux 4.19.5 qemu-3.0.0 from source, libvirt 4.9
Dest server: linux 4.18.19 qemu-3.0.0 from source, libvirt 4.9
When this VM is running on source server:
/usr/bin/qemu-system-x86_64 -name guest=testvm,debug-threads=on -S -object secret,id=masterKey0,format=raw,file=/var/lib/libvirt/qemu/domain-13-testvm/master-key.aes -machine pc-q35-3.0,accel=kvm,usb=off,dump-guest-core=off -cpu Skylake-Server-IBRS,ss=on,hypervisor=on,tsc_adjust=on,clflushopt=on,umip=on,pku=on,ssbd=on,xsaves=on,topoext=on,hv_time,hv_relaxed,hv_vapic,hv_spinlocks=0x1fff,hv_vpindex,hv_runtime,hv_synic,hv_stimer,hv_reset,hv_vendor_id=KVM Hv -m 4096 -realtime mlock=off -smp 2,sockets=2,cores=1,threads=1 -object iothread,id=iothread1 -uuid 3b00b788-ee91-4e45-80a6-c7319da71225 -no-user-config -nodefaults -chardev socket,id=charmonitor,fd=23,server,nowait -mon chardev=charmonitor,id=monitor,mode=control -rtc base=localtime,driftfix=slew -global kvm-pit.lost_tick_policy=delay -no-hpet -no-shutdown -boot strict=on -device pcie-root-port,port=0x10,chassis=1,id=pci.1,bus=pcie.0,multifunction=on,addr=0x2 -device pcie-root-port,port=0x11,chassis=2,id=pci.2,bus=pcie.0,addr=0x2.0x1 -device pcie-pci-bridge,id=pci.3,bus=pci.1,addr=0x0 -device pcie-root-port,port=0x12,chassis=4,id=pci.4,bus=pcie.0,addr=0x2.0x2 -device pcie-root-port,port=0x13,chassis=5,id=pci.5,bus=pcie.0,addr=0x2.0x3 -device piix3-usb-uhci,id=usb,bus=pci.3,addr=0x1 -device virtio-scsi-pci,iothread=iothread1,id=scsi0,bus=pci.4,addr=0x0 -drive file=/dev/zvol/datastore/vm/testvm-vda,format=raw,if=none,id=drive-scsi0-0-0-0,cache=writeback,aio=threads -device scsi-hd,bus=scsi0.0,channel=0,scsi-id=0,lun=0,drive=drive-scsi0-0-0-0,id=scsi0-0-0-0,bootindex=2,write-cache=on -drive if=none,id=drive-sata0-0-4,media=cdrom,readonly=on -device ide-cd,bus=ide.4,drive=drive-sata0-0-4,id=sata0-0-4,bootindex=1 -netdev tap,fd=25,id=hostnet0,vhost=on,vhostfd=26 -device virtio-net-pci,netdev=hostnet0,id=net0,mac=52:54:00:a2:b7:a1,bus=pci.2,addr=0x0 -device usb-tablet,id=input0,bus=usb.0,port=1 -vnc 127.0.0.1:0 -device cirrus-vga,id=video0,bus=pcie.0,addr=0x1 -s -sandbox on,obsolete=deny,elevateprivileges=deny,spawn=deny,resourcecontrol=deny -msg timestamp=on
I try to migrate it and the disks to the other side:
virsh migrate --live --undefinesource --persistent --verbose --copy-storage-all testvm qemu+ssh://wasvirt1/system
We get to 99% then hang with both sides in the pause state.
Source server is stuck here:
(gdb) bt full
#0 0x00007f327994f3c1 in ppoll () at /lib64/libc.so.6
#1 0x000000000086167b in qemu_poll_ns (fds=<optimized out>, nfds=nfds@entry=1, timeout=<optimized out>) at util/qemu-timer.c:322
#2 0x0000000000863302 in aio_poll (ctx=0x21044e0, blocking=blocking@entry=true) at util/aio-posix.c:629
node = <optimized out>
i = <optimized out>
ret = 0
progress = <optimized out>
timeout = <optimized out>
start = <optimized out>
__PRETTY_FUNCTION__ = "aio_poll"
#3 0x00000000007e0d52 in nbd_client_close (bs=0x2ba2400) at block/nbd-client.c:62
waited_ = <optimized out>
wait_ = 0x2ba563c
ctx_ = 0x2109bb0
bs_ = 0x2ba2400
client = 0x31287e0
client = <optimized out>
request = {handle = 0, from = 0, len = 0, flags = 0, type = 2}
#4 0x00000000007e0d52 in nbd_client_close (bs=0x2ba2400) at block/nbd-client.c:965
client = <optimized out>
request = {handle = 0, from = 0, len = 0, flags = 0, type = 2}
#5 0x00000000007de5ca in nbd_close (bs=<optimized out>) at block/nbd.c:491
s = 0x31287e0
#6 0x00000000007823d6 in bdrv_unref (bs=0x2ba2400) at block.c:3352
ban = <optimized out>
ban_next = <optimized out>
child = <optimized out>
next = <optimized out>
#7 0x00000000007823d6 in bdrv_unref (bs=0x2ba2400) at block.c:3560
#8 0x00000000007823d6 in bdrv_unref (bs=0x2ba2400) at block.c:4616
#9 0x0000000000782403 in bdrv_unref (bs=0x2af96f0) at block.c:3359
ban = <optimized out>
ban_next = <optimized out>
child = <optimized out>
next = <optimized out>
#10 0x0000000000782403 in bdrv_unref (bs=0x2af96f0) at block.c:3560
#11 0x0000000000782403 in bdrv_unref (bs=0x2af96f0) at block.c:4616
#12 0x0000000000785784 in block_job_remove_all_bdrv (job=job@entry=0x2f32570) at blockjob.c:200
c = 0x23bac30
l = 0x20dd330 = {0x23bac30, 0x2b89410}
#13 0x00000000007ceb5f in mirror_exit (job=0x2f32570, opaque=0x7f326407a350) at block/mirror.c:700
s = 0x2f32570
bjob = 0x2f32570
data = 0x7f326407a350
bs_opaque = 0x30d5600
replace_aio_context = <optimized out>
src = 0x2131080
target_bs = 0x2af96f0
mirror_top_bs = 0x210eb70
local_err = 0x0
#14 0x0000000000786452 in job_defer_to_main_loop_bh (opaque=0x7f32640786a0) at job.c:973
data = 0x7f32640786a0
job = <optimized out>
aio_context = 0x2109bb0
#15 0x000000000085fd3f in aio_bh_poll (ctx=ctx@entry=0x21044e0) at util/async.c:118
---Type <return> to continue, or q <return> to quit---
bh = <optimized out>
bhp = <optimized out>
next = 0x2ea86e0
ret = 1
deleted = false
#16 0x00000000008631b0 in aio_dispatch (ctx=0x21044e0) at util/aio-posix.c:436
#17 0x000000000085fc1e in aio_ctx_dispatch (source=<optimized out>, callback=<optimized out>, user_data=<optimized out>) at util/async.c:261
ctx = <optimized out>
#18 0x00007f327f17d797 in g_main_context_dispatch () at /usr/lib64/libglib-2.0.so.0
#19 0x00000000008622ed in main_loop_wait () at util/main-loop.c:215
context = 0x2104900
pfds = <optimized out>
context = 0x2104900
ret = 1
ret = 1
timeout = 4294967295
timeout_ns = <optimized out>
#20 0x00000000008622ed in main_loop_wait (timeout=<optimized out>) at util/main-loop.c:238
context = 0x2104900
ret = 1
ret = 1
timeout = 4294967295
timeout_ns = <optimized out>
#21 0x00000000008622ed in main_loop_wait (nonblocking=nonblocking@entry=0) at util/main-loop.c:497
ret = 1
timeout = 4294967295
timeout_ns = <optimized out>
#22 0x0000000000595dee in main_loop () at vl.c:1866
#23 0x000000000041f35d in main (argc=<optimized out>, argv=<optimized out>, envp=<optimized out>) at vl.c:4644
i = <optimized out>
snapshot = 0
linux_boot = <optimized out>
initrd_filename = 0x0
kernel_filename = <optimized out>
kernel_cmdline = <optimized out>
boot_order = 0x918f44 "cad"
boot_once = 0x0
ds = <optimized out>
opts = <optimized out>
machine_opts = <optimized out>
icount_opts = <optimized out>
accel_opts = 0x0
olist = <optimized out>
optind = 71
optarg = 0x7ffdfc94df69 "timestamp=on"
loadvm = 0x0
machine_class = 0x0
cpu_model = 0x7ffdfc94d864 "Skylake-Server-IBRS,ss=on,hypervisor=on,tsc_adjust=on,clflushopt=on,umip=on,pku=on,ssbd=on,xsaves=on,topoext=on,hv_time,hv_relaxed,hv_vapic,hv_spinlocks=0x1fff,hv_vpindex,hv_runtime,hv_synic,hv_stimer"...
vga_model = 0x0
qtest_chrdev = 0x0
qtest_log = 0x0
pid_file = <optimized out>
incoming = 0x0
userconfig = <optimized out>
---Type <return> to continue, or q <return> to quit---
nographic = false
display_remote = <optimized out>
log_mask = <optimized out>
log_file = <optimized out>
trace_file = <optimized out>
maxram_size = 4294967296
ram_slots = 0
vmstate_dump_file = 0x0
main_loop_err = 0x0
err = 0x0
list_data_dirs = false
dir = <optimized out>
dirs = <optimized out>
bdo_queue = {sqh_first = 0x0, sqh_last = 0x7ffdfc94c170}
__func__ = "main"
Strace shows:
ppoll([{fd=9, events=POLLIN|POLLERR|POLLHUP}], 1, NULL, NULL, 8
Which points to this:
ls -al /proc/2286/fd/9
lrwx------ 1 root users 64 Dec 5 13:04 /proc/2286/fd/9 -> anon_inode:[eventfd]
The dest side is stuck here:
(gdb) bt full
#0 0x00007f21f070d3c1 in ppoll () at /lib64/libc.so.6
#1 0x0000000000861659 in qemu_poll_ns (fds=<optimized out>, nfds=<optimized out>, timeout=timeout@entry=2999926258) at util/qemu-timer.c:334
ts = {tv_sec = 2, tv_nsec = 999926258}
Python Exception <class 'gdb.error'> That operation is not available on integers of more than 8 bytes.:
#2 0x00000000008622a4 in main_loop_wait (timeout=<optimized out>) at util/main-loop.c:233
context = 0x2142900
ret = <optimized out>
ret = -1295041038
timeout = 4294967295
timeout_ns = <optimized out>
#3 0x00000000008622a4 in main_loop_wait (nonblocking=nonblocking@entry=0) at util/main-loop.c:497
ret = -1295041038
timeout = 4294967295
timeout_ns = <optimized out>
#4 0x0000000000595dee in main_loop () at vl.c:1866
#5 0x000000000041f35d in main (argc=<optimized out>, argv=<optimized out>, envp=<optimized out>) at vl.c:4644
i = <optimized out>
snapshot = 0
linux_boot = <optimized out>
initrd_filename = 0x0
kernel_filename = <optimized out>
kernel_cmdline = <optimized out>
boot_order = 0x918f44 "cad"
boot_once = 0x0
ds = <optimized out>
opts = <optimized out>
machine_opts = <optimized out>
icount_opts = <optimized out>
accel_opts = 0x0
olist = <optimized out>
optind = 73
optarg = 0x7ffdd6ee8f69 "timestamp=on"
loadvm = 0x0
machine_class = 0x0
cpu_model = 0x7ffdd6ee8854 "Skylake-Server-IBRS,ss=on,hypervisor=on,tsc_adjust=on,clflushopt=on,umip=on,pku=on,ssbd=on,xsaves=on,topoext=on,hv_time,hv_relaxed,hv_vapic,hv_spinlocks=0x1fff,hv_vpindex,hv_runtime,hv_synic,hv_stimer"...
vga_model = 0x0
qtest_chrdev = 0x0
qtest_log = 0x0
pid_file = <optimized out>
incoming = 0x7ffdd6ee8f0a "defer"
userconfig = <optimized out>
nographic = false
display_remote = <optimized out>
log_mask = <optimized out>
log_file = <optimized out>
trace_file = <optimized out>
maxram_size = 4294967296
ram_slots = 0
vmstate_dump_file = 0x0
main_loop_err = 0x0
err = 0x0
list_data_dirs = false
dir = <optimized out>
dirs = <optimized out>
bdo_queue = {sqh_first = 0x0, sqh_last = 0x7ffdd6ee6630}
---Type <return> to continue, or q <return> to quit---
__func__ = "main"
Strace show this over and over
ppoll([{fd=6, events=POLLIN}, {fd=7, events=POLLIN}, {fd=9, events=POLLIN}, {fd=10, events=POLLIN}, {fd=21, events=POLLIN}, {fd=22, events=POLLIN}, {fd=23, events=POLLIN}, {fd=24, events=POLLIN}, {fd=27, events=POLLIN}], 9, {0, 594527977}, NULL, 8) = 0 (Timeout)
lrwx------ 1 root users 64 Dec 5 13:15 /proc/20170/fd/10 -> anon_inode:[eventfd]
lrwx------ 1 root users 64 Dec 5 13:15 /proc/20170/fd/21 -> socket:[42631161]
lrwx------ 1 root users 64 Dec 5 13:15 /proc/20170/fd/22 -> socket:[42631165]
lrwx------ 1 root users 64 Dec 5 13:15 /proc/20170/fd/23 -> socket:[42631167]
lrwx------ 1 root users 64 Dec 5 13:15 /proc/20170/fd/24 -> socket:[42631168]
lrwx------ 1 root users 64 Dec 5 13:15 /proc/20170/fd/27 -> socket:[42690422]
lrwx------ 1 root users 64 Dec 5 13:15 /proc/20170/fd/6 -> anon_inode:[eventfd]
lrwx------ 1 root users 64 Dec 5 13:15 /proc/20170/fd/7 -> anon_inode:[signalfd]
lrwx------ 1 root users 64 Dec 5 13:15 /proc/20170/fd/9 -> anon_inode:[eventfd]
|