1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
|
QEMU monitor hangs after "stop" QMP command called in postcopy-paused migration state
Description of problem:
QEMU monitor hangs when I try to pause virtual CPUs using "stop" QMP command
on the destination host once migration enters postcopy-paused (after it was
paused using "migrate-pause" QMP command on the source host). QEMU just does
not send any reply to the "stop" command.
Steps to reproduce:
1. start migration
2. wait for the first iteration to finish
3. switch to post-copy using "migrate-start-postcopy"
3. break migration with "migrate-pause"
4. send "stop" to the destination monitor
Additional information:
Unfortunately I haven't been able to get a stack trace as gdb just hangs when
I try to attach it to QEMU after step 4. I can see threads getting SIGUSR1
after the "stop" command, but I cannot get to gdb prompt afterwards:
```
(gdb) c
Continuing.
[New Thread 0x7f41ec9be640 (LWP 1112)]
[New Thread 0x7f41d7fff640 (LWP 1113)]
Thread 4 "CPU 0/KVM" received signal SIGUSR1, User defined signal 1.
Thread 5 "CPU 1/KVM" received signal SIGUSR1, User defined signal 1.
Thread 4 "CPU 0/KVM" received signal SIGUSR1, User defined signal 1.
Thread 5 "CPU 1/KVM" received signal SIGUSR1, User defined signal 1.
Thread 4 "CPU 0/KVM" received signal SIGUSR1, User defined signal 1.
Thread 5 "CPU 1/KVM" received signal SIGUSR1, User defined signal 1.
Thread 4 "CPU 0/KVM" received signal SIGUSR1, User defined signal 1.
Thread 5 "CPU 1/KVM" received signal SIGUSR1, User defined signal 1.
Thread 4 "CPU 0/KVM" received signal SIGUSR1, User defined signal 1.
Thread 5 "CPU 1/KVM" received signal SIGUSR1, User defined signal 1.
Thread 4 "CPU 0/KVM" received signal SIGUSR1, User defined signal 1.
Thread 4 "CPU 0/KVM" received signal SIGUSR1, User defined signal 1.
```
I was able to attach strace to it though (in case it is at least a bit
useful). The first line corresponds to the final '}' of the
{"execute":"stop","id":"libvirt-413"} QMP comamnd:
```
[pid 72970] recvmsg(20, {msg_name=NULL, msg_namelen=0, msg_iov=[{iov_base="}", iov_len=1}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_CMSG_CLOEXEC}, MSG_CMSG_CLOEXEC) = 1
[pid 72970] write(4, "\1\0\0\0\0\0\0\0", 8) = 8
[pid 72949] <... ppoll resumed>) = 1 ([{fd=4, revents=POLLIN}], left {tv_sec=0, tv_nsec=513181335})
[pid 72970] write(19, "\1\0\0\0\0\0\0\0", 8 <unfinished ...>
[pid 72949] read(4, <unfinished ...>
[pid 72970] <... write resumed>) = 8
[pid 72949] <... read resumed>"\1\0\0\0\0\0\0\0", 512) = 8
[pid 72970] write(19, "\1\0\0\0\0\0\0\0", 8 <unfinished ...>
[pid 72949] ppoll([{fd=4, events=POLLIN}, {fd=5, events=POLLIN}, {fd=8, events=POLLIN}, {fd=9, events=POLLIN}, {fd=11, events=POLLIN}, {fd=12, events=POLLIN}, {fd=23, events=POLLIN}, {fd=24, events=POLLIN}, {fd=29, events=POLLIN}, {fd=30, events=POLLIN}, {fd=31, events=POLLIN}, {fd=32, events=POLLIN}, {fd=33, events=POLLIN}, {fd=34, events=POLLIN}, {fd=38, events=POLLIN}, {fd=40, events=POLLIN}, {fd=41, events=POLLIN}, {fd=42, events=POLLIN}, {fd=43, events=POLLIN}, {fd=44, events=POLLIN}, {fd=45, events=POLLIN}, {fd=46, events=POLLIN}, {fd=47, events=POLLIN}, {fd=48, events=POLLIN}, {fd=49, events=POLLIN}, {fd=50, events=POLLIN}, {fd=51, events=POLLIN}, {fd=52, events=POLLIN}, {fd=53, events=POLLIN}, {fd=54, events=POLLIN}, {fd=55, events=POLLIN}, {fd=56, events=POLLIN}, ...], 74, {tv_sec=0, tv_nsec=0}, NULL, 8 <unfinished ...>
[pid 72970] <... write resumed>) = 8
[pid 72949] <... ppoll resumed>) = 0 (Timeout)
[pid 72970] write(19, "\1\0\0\0\0\0\0\0", 8 <unfinished ...>
[pid 72949] write(8, "\1\0\0\0\0\0\0\0", 8 <unfinished ...>
[pid 72970] <... write resumed>) = 8
[pid 72949] <... write resumed>) = 8
[pid 72970] write(19, "\1\0\0\0\0\0\0\0", 8 <unfinished ...>
[pid 72949] ppoll([{fd=4, events=POLLIN}, {fd=5, events=POLLIN}, {fd=8, events=POLLIN}, {fd=9, events=POLLIN}, {fd=11, events=POLLIN}, {fd=12, events=POLLIN}, {fd=23, events=POLLIN}, {fd=24, events=POLLIN}, {fd=29, events=POLLIN}, {fd=30, events=POLLIN}, {fd=31, events=POLLIN}, {fd=32, events=POLLIN}, {fd=33, events=POLLIN}, {fd=34, events=POLLIN}, {fd=38, events=POLLIN}, {fd=40, events=POLLIN}, {fd=41, events=POLLIN}, {fd=42, events=POLLIN}, {fd=43, events=POLLIN}, {fd=44, events=POLLIN}, {fd=45, events=POLLIN}, {fd=46, events=POLLIN}, {fd=47, events=POLLIN}, {fd=48, events=POLLIN}, {fd=49, events=POLLIN}, {fd=50, events=POLLIN}, {fd=51, events=POLLIN}, {fd=52, events=POLLIN}, {fd=53, events=POLLIN}, {fd=54, events=POLLIN}, {fd=55, events=POLLIN}, {fd=56, events=POLLIN}, ...], 74, {tv_sec=0, tv_nsec=0}, NULL, 8 <unfinished ...>
[pid 72970] <... write resumed>) = 8
[pid 72949] <... ppoll resumed>) = 1 ([{fd=8, revents=POLLIN}], left {tv_sec=0, tv_nsec=0})
[pid 72970] poll([{fd=18, events=POLLIN}, {fd=19, events=POLLIN}, {fd=20, events=0}], 3, -1 <unfinished ...>
[pid 72949] rt_sigprocmask(SIG_BLOCK, ~[], <unfinished ...>
[pid 72970] <... poll resumed>) = 1 ([{fd=19, revents=POLLIN}])
[pid 72949] <... rt_sigprocmask resumed>[BUS USR1 ALRM IO], 8) = 0
[pid 72970] read(19, <unfinished ...>
[pid 72949] getpid() = 72949
[pid 72970] <... read resumed>"\5\0\0\0\0\0\0\0", 16) = 8
[pid 72949] tgkill(72949, 72971, SIGUSR1 <unfinished ...>
[pid 72970] poll([{fd=18, events=POLLIN}, {fd=19, events=POLLIN}, {fd=20, events=0}], 3, -1 <unfinished ...>
[pid 72949] <... tgkill resumed>) = 0
[pid 72949] rt_sigprocmask(SIG_SETMASK, [BUS USR1 ALRM IO], NULL, 8) = 0
[pid 72949] rt_sigprocmask(SIG_BLOCK, ~[], [BUS USR1 ALRM IO], 8) = 0
[pid 72949] getpid() = 72949
[pid 72949] tgkill(72949, 72972, SIGUSR1) = 0
[pid 72949] rt_sigprocmask(SIG_SETMASK, [BUS USR1 ALRM IO], NULL, 8) = 0
[pid 72949] futex(0x5606f6cb73a8, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, NULL, FUTEX_BITSET_MATCH_ANY
```
And that's it, the last futex never returns.
|