| author | Christian Krinitsin <mail@krinitsin.com> | 2025-07-03 19:39:53 +0200 |
|---|---|---|
| committer | Christian Krinitsin <mail@krinitsin.com> | 2025-07-03 19:39:53 +0200 |
| commit | dee4dcba78baf712cab403d47d9db319ab7f95d6 (patch) | |
| tree | 418478faf06786701a56268672f73d6b0b4eb239 /results/classifier/013/risc-v | |
| parent | 4d9e26c0333abd39bdbd039dcdb30ed429c475ba (diff) | |
restructure results
Diffstat (limited to 'results/classifier/013/risc-v')
| mode | path | lines deleted |
|---|---|---|
| -rw-r--r-- | results/classifier/013/risc-v/16228234 | 1872 |
| -rw-r--r-- | results/classifier/013/risc-v/25892827 | 1105 |
| -rw-r--r-- | results/classifier/013/risc-v/55367348 | 560 |
| -rw-r--r-- | results/classifier/013/risc-v/55753058 | 321 |
| -rw-r--r-- | results/classifier/013/risc-v/65781993 | 2821 |
| -rw-r--r-- | results/classifier/013/risc-v/70294255 | 1089 |
| -rw-r--r-- | results/classifier/013/risc-v/74545755 | 372 |
7 files changed, 0 insertions, 8140 deletions
diff --git a/results/classifier/013/risc-v/16228234 b/results/classifier/013/risc-v/16228234 deleted file mode 100644 index 2f6d34c48..000000000 --- a/results/classifier/013/risc-v/16228234 +++ /dev/null @@ -1,1872 +0,0 @@ -risc-v: 0.649 -user-level: 0.531 -mistranslation: 0.518 -ppc: 0.483 -operating system: 0.460 -KVM: 0.445 -VMM: 0.443 -network: 0.440 -permissions: 0.439 -device: 0.439 -register: 0.438 -TCG: 0.437 -hypervisor: 0.436 -arm: 0.435 -assembly: 0.435 -virtual: 0.421 -x86: 0.421 -vnc: 0.420 -peripherals: 0.411 -semantic: 0.411 -architecture: 0.410 -performance: 0.409 -graphic: 0.408 -i386: 0.404 -boot: 0.402 -socket: 0.401 -files: 0.394 -kernel: 0.393 -PID: 0.385 -debug: 0.384 -alpha: 0.379 -system: 0.373 - -[Qemu-devel] [Bug?] BQL about live migration - -Hello Juan & Dave, - -We hit a bug in our test: -Network error occurs when migrating a guest, libvirt then rollback the -migration, causes qemu coredump -qemu log: -2017-03-01T12:54:33.904949+08:00|info|qemu[17672]|[33614]|monitor_qapi_event_emit[479]|: - {"timestamp": {"seconds": 1488344073, "microseconds": 904914}, "event": "STOP"} -2017-03-01T12:54:37.522500+08:00|info|qemu[17672]|[17672]|handle_qmp_command[3930]|: - qmp_cmd_name: migrate_cancel -2017-03-01T12:54:37.522607+08:00|info|qemu[17672]|[17672]|monitor_qapi_event_emit[479]|: - {"timestamp": {"seconds": 1488344077, "microseconds": 522556}, "event": -"MIGRATION", "data": {"status": "cancelling"}} -2017-03-01T12:54:37.524671+08:00|info|qemu[17672]|[17672]|handle_qmp_command[3930]|: - qmp_cmd_name: cont -2017-03-01T12:54:37.524733+08:00|info|qemu[17672]|[17672]|virtio_set_status[725]|: - virtio-balloon device status is 7 that means DRIVER OK -2017-03-01T12:54:37.525434+08:00|info|qemu[17672]|[17672]|virtio_set_status[725]|: - virtio-net device status is 7 that means DRIVER OK -2017-03-01T12:54:37.525484+08:00|info|qemu[17672]|[17672]|virtio_set_status[725]|: - virtio-blk device status is 7 that means DRIVER OK -2017-03-01T12:54:37.525562+08:00|info|qemu[17672]|[17672]|virtio_set_status[725]|: - virtio-serial device status is 7 that means DRIVER OK -2017-03-01T12:54:37.527653+08:00|info|qemu[17672]|[17672]|vm_start[981]|: -vm_state-notify:3ms -2017-03-01T12:54:37.528523+08:00|info|qemu[17672]|[17672]|monitor_qapi_event_emit[479]|: - {"timestamp": {"seconds": 1488344077, "microseconds": 527699}, "event": -"RESUME"} -2017-03-01T12:54:37.530680+08:00|info|qemu[17672]|[33614]|migration_bitmap_sync[720]|: - this iteration cycle takes 3s, new dirtied data:0MB -2017-03-01T12:54:37.530909+08:00|info|qemu[17672]|[33614]|monitor_qapi_event_emit[479]|: - {"timestamp": {"seconds": 1488344077, "microseconds": 530733}, "event": -"MIGRATION_PASS", "data": {"pass": 3}} -2017-03-01T04:54:37.530997Z qemu-kvm: socket_writev_buffer: Got err=32 for -(131583/18446744073709551615) -qemu-kvm: /home/abuild/rpmbuild/BUILD/qemu-kvm-2.6.0/hw/net/virtio_net.c:1519: -virtio_net_save: Assertion `!n->vhost_started' failed. -2017-03-01 12:54:43.028: shutting down - -> -From qemu log, qemu received and processed migrate_cancel/cont qmp commands -after guest been stopped and entered the last round of migration. Then -migration thread try to save device state when guest is running(started by -cont command), causes assert and coredump. 
-This is because in last iter, we call cpu_synchronize_all_states() to -synchronize vcpu states, this call will release qemu_global_mutex and wait -for do_kvm_cpu_synchronize_state() to be executed on target vcpu: -(gdb) bt -#0 0x00007f763d1046d5 in pthread_cond_wait@@GLIBC_2.3.2 () from -/lib64/libpthread.so.0 -#1 0x00007f7643e51d7f in qemu_cond_wait (cond=0x7f764445eca0 <qemu_work_cond>, -mutex=0x7f764445eba0 <qemu_global_mutex>) at util/qemu-thread-posix.c:132 -#2 0x00007f7643a2e154 in run_on_cpu (cpu=0x7f7644e06d80, func=0x7f7643a46413 -<do_kvm_cpu_synchronize_state>, data=0x7f7644e06d80) at -/mnt/public/yanghy/qemu-kvm/cpus.c:995 -#3 0x00007f7643a46487 in kvm_cpu_synchronize_state (cpu=0x7f7644e06d80) at -/mnt/public/yanghy/qemu-kvm/kvm-all.c:1805 -#4 0x00007f7643a2c700 in cpu_synchronize_state (cpu=0x7f7644e06d80) at -/mnt/public/yanghy/qemu-kvm/include/sysemu/kvm.h:457 -#5 0x00007f7643a2db0c in cpu_synchronize_all_states () at -/mnt/public/yanghy/qemu-kvm/cpus.c:766 -#6 0x00007f7643a67b5b in qemu_savevm_state_complete_precopy (f=0x7f76462f2d30, -iterable_only=false) at /mnt/public/yanghy/qemu-kvm/migration/savevm.c:1051 -#7 0x00007f7643d121e9 in migration_completion (s=0x7f76443e78c0 -<current_migration.37571>, current_active_state=4, -old_vm_running=0x7f74343fda00, start_time=0x7f74343fda08) at -migration/migration.c:1753 -#8 0x00007f7643d126c5 in migration_thread (opaque=0x7f76443e78c0 -<current_migration.37571>) at migration/migration.c:1922 -#9 0x00007f763d100dc5 in start_thread () from /lib64/libpthread.so.0 -#10 0x00007f763ce2e71d in clone () from /lib64/libc.so.6 -(gdb) p iothread_locked -$1 = true - -and then, qemu main thread been executed, it won't block because migration -thread released the qemu_global_mutex: -(gdb) thr 1 -[Switching to thread 1 (Thread 0x7fe298e08bc0 (LWP 30767))] -#0 os_host_main_loop_wait (timeout=931565) at main-loop.c:270 -270 QEMU_LOG(LOG_INFO,"***** after qemu_pool_ns: timeout %d\n", -timeout); -(gdb) p iothread_locked -$2 = true -(gdb) l 268 -263 -264 ret = qemu_poll_ns((GPollFD *)gpollfds->data, gpollfds->len, -timeout); -265 -266 -267 if (timeout) { -268 qemu_mutex_lock_iothread(); -269 if (runstate_check(RUN_STATE_FINISH_MIGRATE)) { -270 QEMU_LOG(LOG_INFO,"***** after qemu_pool_ns: timeout %d\n", -timeout); -271 } -272 } -(gdb) - -So, although we've hold iothread_lock in stop & copy phase of migration, we -can't guarantee the iothread been locked all through the stop & copy phase, -any thoughts on how to solve this problem?
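
[Editor's illustration] A minimal, self-contained C sketch of the window described above, not QEMU source: pthread_cond_wait() atomically releases the mutex it is given while sleeping, so a run_on_cpu()-style request that holds the global lock and then waits for another thread to finish necessarily lets a third thread (here, the monitor handling 'cont') take that lock in the meantime. The worker thread is spawned on the spot only to keep the demo short; in QEMU the vcpu thread already exists and the work item is queued to it.

```c
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* stand-ins for qemu_global_mutex / qemu_work_cond */
static pthread_mutex_t global_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  work_done    = PTHREAD_COND_INITIALIZER;
static bool done;

/* "vcpu" thread: performs the requested work and signals completion */
static void *worker(void *arg)
{
    (void)arg;
    pthread_mutex_lock(&global_mutex);
    /* ... the work that had to run on the target thread ... */
    done = true;
    pthread_cond_signal(&work_done);
    pthread_mutex_unlock(&global_mutex);
    return NULL;
}

/* requester: analogous to the migration thread calling run_on_cpu() */
static void run_on_other_thread(void)
{
    pthread_t tid;

    pthread_mutex_lock(&global_mutex);
    done = false;
    pthread_create(&tid, NULL, worker, NULL);
    while (!done) {
        /*
         * The mutex is RELEASED for the whole duration of this wait:
         * this is the window in which another thread can run 'cont'
         * even though the caller "holds" the lock around this call.
         */
        pthread_cond_wait(&work_done, &global_mutex);
    }
    pthread_mutex_unlock(&global_mutex);
    pthread_join(tid, NULL);
}

int main(void)
{
    run_on_other_thread();
    printf("work completed\n");
    return 0;
}
```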
- - -Thanks, --Gonglei - -On Fri, 03/03 09:29, Gonglei (Arei) wrote: -> -Hello Juan & Dave, -> -> -We hit a bug in our test: -> -Network error occurs when migrating a guest, libvirt then rollback the -> -migration, causes qemu coredump -> -qemu log: -> -2017-03-01T12:54:33.904949+08:00|info|qemu[17672]|[33614]|monitor_qapi_event_emit[479]|: -> -{"timestamp": {"seconds": 1488344073, "microseconds": 904914}, "event": -> -"STOP"} -> -2017-03-01T12:54:37.522500+08:00|info|qemu[17672]|[17672]|handle_qmp_command[3930]|: -> -qmp_cmd_name: migrate_cancel -> -2017-03-01T12:54:37.522607+08:00|info|qemu[17672]|[17672]|monitor_qapi_event_emit[479]|: -> -{"timestamp": {"seconds": 1488344077, "microseconds": 522556}, "event": -> -"MIGRATION", "data": {"status": "cancelling"}} -> -2017-03-01T12:54:37.524671+08:00|info|qemu[17672]|[17672]|handle_qmp_command[3930]|: -> -qmp_cmd_name: cont -> -2017-03-01T12:54:37.524733+08:00|info|qemu[17672]|[17672]|virtio_set_status[725]|: -> -virtio-balloon device status is 7 that means DRIVER OK -> -2017-03-01T12:54:37.525434+08:00|info|qemu[17672]|[17672]|virtio_set_status[725]|: -> -virtio-net device status is 7 that means DRIVER OK -> -2017-03-01T12:54:37.525484+08:00|info|qemu[17672]|[17672]|virtio_set_status[725]|: -> -virtio-blk device status is 7 that means DRIVER OK -> -2017-03-01T12:54:37.525562+08:00|info|qemu[17672]|[17672]|virtio_set_status[725]|: -> -virtio-serial device status is 7 that means DRIVER OK -> -2017-03-01T12:54:37.527653+08:00|info|qemu[17672]|[17672]|vm_start[981]|: -> -vm_state-notify:3ms -> -2017-03-01T12:54:37.528523+08:00|info|qemu[17672]|[17672]|monitor_qapi_event_emit[479]|: -> -{"timestamp": {"seconds": 1488344077, "microseconds": 527699}, "event": -> -"RESUME"} -> -2017-03-01T12:54:37.530680+08:00|info|qemu[17672]|[33614]|migration_bitmap_sync[720]|: -> -this iteration cycle takes 3s, new dirtied data:0MB -> -2017-03-01T12:54:37.530909+08:00|info|qemu[17672]|[33614]|monitor_qapi_event_emit[479]|: -> -{"timestamp": {"seconds": 1488344077, "microseconds": 530733}, "event": -> -"MIGRATION_PASS", "data": {"pass": 3}} -> -2017-03-01T04:54:37.530997Z qemu-kvm: socket_writev_buffer: Got err=32 for -> -(131583/18446744073709551615) -> -qemu-kvm: -> -/home/abuild/rpmbuild/BUILD/qemu-kvm-2.6.0/hw/net/virtio_net.c:1519: -> -virtio_net_save: Assertion `!n->vhost_started' failed. -> -2017-03-01 12:54:43.028: shutting down -> -> -From qemu log, qemu received and processed migrate_cancel/cont qmp commands -> -after guest been stopped and entered the last round of migration. Then -> -migration thread try to save device state when guest is running(started by -> -cont command), causes assert and coredump. 
-> -This is because in last iter, we call cpu_synchronize_all_states() to -> -synchronize vcpu states, this call will release qemu_global_mutex and wait -> -for do_kvm_cpu_synchronize_state() to be executed on target vcpu: -> -(gdb) bt -> -#0 0x00007f763d1046d5 in pthread_cond_wait@@GLIBC_2.3.2 () from -> -/lib64/libpthread.so.0 -> -#1 0x00007f7643e51d7f in qemu_cond_wait (cond=0x7f764445eca0 -> -<qemu_work_cond>, mutex=0x7f764445eba0 <qemu_global_mutex>) at -> -util/qemu-thread-posix.c:132 -> -#2 0x00007f7643a2e154 in run_on_cpu (cpu=0x7f7644e06d80, func=0x7f7643a46413 -> -<do_kvm_cpu_synchronize_state>, data=0x7f7644e06d80) at -> -/mnt/public/yanghy/qemu-kvm/cpus.c:995 -> -#3 0x00007f7643a46487 in kvm_cpu_synchronize_state (cpu=0x7f7644e06d80) at -> -/mnt/public/yanghy/qemu-kvm/kvm-all.c:1805 -> -#4 0x00007f7643a2c700 in cpu_synchronize_state (cpu=0x7f7644e06d80) at -> -/mnt/public/yanghy/qemu-kvm/include/sysemu/kvm.h:457 -> -#5 0x00007f7643a2db0c in cpu_synchronize_all_states () at -> -/mnt/public/yanghy/qemu-kvm/cpus.c:766 -> -#6 0x00007f7643a67b5b in qemu_savevm_state_complete_precopy -> -(f=0x7f76462f2d30, iterable_only=false) at -> -/mnt/public/yanghy/qemu-kvm/migration/savevm.c:1051 -> -#7 0x00007f7643d121e9 in migration_completion (s=0x7f76443e78c0 -> -<current_migration.37571>, current_active_state=4, -> -old_vm_running=0x7f74343fda00, start_time=0x7f74343fda08) at -> -migration/migration.c:1753 -> -#8 0x00007f7643d126c5 in migration_thread (opaque=0x7f76443e78c0 -> -<current_migration.37571>) at migration/migration.c:1922 -> -#9 0x00007f763d100dc5 in start_thread () from /lib64/libpthread.so.0 -> -#10 0x00007f763ce2e71d in clone () from /lib64/libc.so.6 -> -(gdb) p iothread_locked -> -$1 = true -> -> -and then, qemu main thread been executed, it won't block because migration -> -thread released the qemu_global_mutex: -> -(gdb) thr 1 -> -[Switching to thread 1 (Thread 0x7fe298e08bc0 (LWP 30767))] -> -#0 os_host_main_loop_wait (timeout=931565) at main-loop.c:270 -> -270 QEMU_LOG(LOG_INFO,"***** after qemu_pool_ns: timeout -> -%d\n", timeout); -> -(gdb) p iothread_locked -> -$2 = true -> -(gdb) l 268 -> -263 -> -264 ret = qemu_poll_ns((GPollFD *)gpollfds->data, gpollfds->len, -> -timeout); -> -265 -> -266 -> -267 if (timeout) { -> -268 qemu_mutex_lock_iothread(); -> -269 if (runstate_check(RUN_STATE_FINISH_MIGRATE)) { -> -270 QEMU_LOG(LOG_INFO,"***** after qemu_pool_ns: timeout -> -%d\n", timeout); -> -271 } -> -272 } -> -(gdb) -> -> -So, although we've hold iothread_lock in stop© phase of migration, we -> -can't guarantee the iothread been locked all through the stop & copy phase, -> -any thoughts on how to solve this problem? -Could you post a backtrace of the assertion? 
- -Fam - -On 2017/3/3 18:42, Fam Zheng wrote: -> -On Fri, 03/03 09:29, Gonglei (Arei) wrote: -> -> Hello Juan & Dave, -> -> -> -> We hit a bug in our test: -> -> Network error occurs when migrating a guest, libvirt then rollback the -> -> migration, causes qemu coredump -> -> qemu log: -> -> 2017-03-01T12:54:33.904949+08:00|info|qemu[17672]|[33614]|monitor_qapi_event_emit[479]|: -> -> {"timestamp": {"seconds": 1488344073, "microseconds": 904914}, "event": -> -> "STOP"} -> -> 2017-03-01T12:54:37.522500+08:00|info|qemu[17672]|[17672]|handle_qmp_command[3930]|: -> -> qmp_cmd_name: migrate_cancel -> -> 2017-03-01T12:54:37.522607+08:00|info|qemu[17672]|[17672]|monitor_qapi_event_emit[479]|: -> -> {"timestamp": {"seconds": 1488344077, "microseconds": 522556}, "event": -> -> "MIGRATION", "data": {"status": "cancelling"}} -> -> 2017-03-01T12:54:37.524671+08:00|info|qemu[17672]|[17672]|handle_qmp_command[3930]|: -> -> qmp_cmd_name: cont -> -> 2017-03-01T12:54:37.524733+08:00|info|qemu[17672]|[17672]|virtio_set_status[725]|: -> -> virtio-balloon device status is 7 that means DRIVER OK -> -> 2017-03-01T12:54:37.525434+08:00|info|qemu[17672]|[17672]|virtio_set_status[725]|: -> -> virtio-net device status is 7 that means DRIVER OK -> -> 2017-03-01T12:54:37.525484+08:00|info|qemu[17672]|[17672]|virtio_set_status[725]|: -> -> virtio-blk device status is 7 that means DRIVER OK -> -> 2017-03-01T12:54:37.525562+08:00|info|qemu[17672]|[17672]|virtio_set_status[725]|: -> -> virtio-serial device status is 7 that means DRIVER OK -> -> 2017-03-01T12:54:37.527653+08:00|info|qemu[17672]|[17672]|vm_start[981]|: -> -> vm_state-notify:3ms -> -> 2017-03-01T12:54:37.528523+08:00|info|qemu[17672]|[17672]|monitor_qapi_event_emit[479]|: -> -> {"timestamp": {"seconds": 1488344077, "microseconds": 527699}, "event": -> -> "RESUME"} -> -> 2017-03-01T12:54:37.530680+08:00|info|qemu[17672]|[33614]|migration_bitmap_sync[720]|: -> -> this iteration cycle takes 3s, new dirtied data:0MB -> -> 2017-03-01T12:54:37.530909+08:00|info|qemu[17672]|[33614]|monitor_qapi_event_emit[479]|: -> -> {"timestamp": {"seconds": 1488344077, "microseconds": 530733}, "event": -> -> "MIGRATION_PASS", "data": {"pass": 3}} -> -> 2017-03-01T04:54:37.530997Z qemu-kvm: socket_writev_buffer: Got err=32 for -> -> (131583/18446744073709551615) -> -> qemu-kvm: -> -> /home/abuild/rpmbuild/BUILD/qemu-kvm-2.6.0/hw/net/virtio_net.c:1519: -> -> virtio_net_save: Assertion `!n->vhost_started' failed. -> -> 2017-03-01 12:54:43.028: shutting down -> -> -> -> From qemu log, qemu received and processed migrate_cancel/cont qmp commands -> -> after guest been stopped and entered the last round of migration. Then -> -> migration thread try to save device state when guest is running(started by -> -> cont command), causes assert and coredump. 
-> -> This is because in last iter, we call cpu_synchronize_all_states() to -> -> synchronize vcpu states, this call will release qemu_global_mutex and wait -> -> for do_kvm_cpu_synchronize_state() to be executed on target vcpu: -> -> (gdb) bt -> -> #0 0x00007f763d1046d5 in pthread_cond_wait@@GLIBC_2.3.2 () from -> -> /lib64/libpthread.so.0 -> -> #1 0x00007f7643e51d7f in qemu_cond_wait (cond=0x7f764445eca0 -> -> <qemu_work_cond>, mutex=0x7f764445eba0 <qemu_global_mutex>) at -> -> util/qemu-thread-posix.c:132 -> -> #2 0x00007f7643a2e154 in run_on_cpu (cpu=0x7f7644e06d80, -> -> func=0x7f7643a46413 <do_kvm_cpu_synchronize_state>, data=0x7f7644e06d80) at -> -> /mnt/public/yanghy/qemu-kvm/cpus.c:995 -> -> #3 0x00007f7643a46487 in kvm_cpu_synchronize_state (cpu=0x7f7644e06d80) at -> -> /mnt/public/yanghy/qemu-kvm/kvm-all.c:1805 -> -> #4 0x00007f7643a2c700 in cpu_synchronize_state (cpu=0x7f7644e06d80) at -> -> /mnt/public/yanghy/qemu-kvm/include/sysemu/kvm.h:457 -> -> #5 0x00007f7643a2db0c in cpu_synchronize_all_states () at -> -> /mnt/public/yanghy/qemu-kvm/cpus.c:766 -> -> #6 0x00007f7643a67b5b in qemu_savevm_state_complete_precopy -> -> (f=0x7f76462f2d30, iterable_only=false) at -> -> /mnt/public/yanghy/qemu-kvm/migration/savevm.c:1051 -> -> #7 0x00007f7643d121e9 in migration_completion (s=0x7f76443e78c0 -> -> <current_migration.37571>, current_active_state=4, -> -> old_vm_running=0x7f74343fda00, start_time=0x7f74343fda08) at -> -> migration/migration.c:1753 -> -> #8 0x00007f7643d126c5 in migration_thread (opaque=0x7f76443e78c0 -> -> <current_migration.37571>) at migration/migration.c:1922 -> -> #9 0x00007f763d100dc5 in start_thread () from /lib64/libpthread.so.0 -> -> #10 0x00007f763ce2e71d in clone () from /lib64/libc.so.6 -> -> (gdb) p iothread_locked -> -> $1 = true -> -> -> -> and then, qemu main thread been executed, it won't block because migration -> -> thread released the qemu_global_mutex: -> -> (gdb) thr 1 -> -> [Switching to thread 1 (Thread 0x7fe298e08bc0 (LWP 30767))] -> -> #0 os_host_main_loop_wait (timeout=931565) at main-loop.c:270 -> -> 270 QEMU_LOG(LOG_INFO,"***** after qemu_pool_ns: timeout -> -> %d\n", timeout); -> -> (gdb) p iothread_locked -> -> $2 = true -> -> (gdb) l 268 -> -> 263 -> -> 264 ret = qemu_poll_ns((GPollFD *)gpollfds->data, gpollfds->len, -> -> timeout); -> -> 265 -> -> 266 -> -> 267 if (timeout) { -> -> 268 qemu_mutex_lock_iothread(); -> -> 269 if (runstate_check(RUN_STATE_FINISH_MIGRATE)) { -> -> 270 QEMU_LOG(LOG_INFO,"***** after qemu_pool_ns: timeout -> -> %d\n", timeout); -> -> 271 } -> -> 272 } -> -> (gdb) -> -> -> -> So, although we've hold iothread_lock in stop© phase of migration, we -> -> can't guarantee the iothread been locked all through the stop & copy phase, -> -> any thoughts on how to solve this problem? -> -> -Could you post a backtrace of the assertion? 
-#0 0x00007f97b1fbe5d7 in raise () from /usr/lib64/libc.so.6 -#1 0x00007f97b1fbfcc8 in abort () from /usr/lib64/libc.so.6 -#2 0x00007f97b1fb7546 in __assert_fail_base () from /usr/lib64/libc.so.6 -#3 0x00007f97b1fb75f2 in __assert_fail () from /usr/lib64/libc.so.6 -#4 0x000000000049fd19 in virtio_net_save (f=0x7f97a8ca44d0, -opaque=0x7f97a86e9018) at /usr/src/debug/qemu-kvm-2.6.0/hw/ -#5 0x000000000047e380 in vmstate_save_old_style (address@hidden, -address@hidden, se=0x7f9 -#6 0x000000000047fb93 in vmstate_save (address@hidden, address@hidden, -address@hidden -#7 0x0000000000481ad2 in qemu_savevm_state_complete_precopy (f=0x7f97a8ca44d0, -address@hidden) -#8 0x00000000006c6b60 in migration_completion (address@hidden -<current_migration.38312>, current_active_state=curre - address@hidden) at migration/migration.c:1761 -#9 0x00000000006c71db in migration_thread (address@hidden -<current_migration.38312>) at migration/migrati - -> -> -Fam -> --- -Thanks, -Yang - -* Gonglei (Arei) (address@hidden) wrote: -> -Hello Juan & Dave, -cc'ing in pbonzini since it's magic involving cpu_synrhonize_all_states() - -> -We hit a bug in our test: -> -Network error occurs when migrating a guest, libvirt then rollback the -> -migration, causes qemu coredump -> -qemu log: -> -2017-03-01T12:54:33.904949+08:00|info|qemu[17672]|[33614]|monitor_qapi_event_emit[479]|: -> -{"timestamp": {"seconds": 1488344073, "microseconds": 904914}, "event": -> -"STOP"} -> -2017-03-01T12:54:37.522500+08:00|info|qemu[17672]|[17672]|handle_qmp_command[3930]|: -> -qmp_cmd_name: migrate_cancel -> -2017-03-01T12:54:37.522607+08:00|info|qemu[17672]|[17672]|monitor_qapi_event_emit[479]|: -> -{"timestamp": {"seconds": 1488344077, "microseconds": 522556}, "event": -> -"MIGRATION", "data": {"status": "cancelling"}} -> -2017-03-01T12:54:37.524671+08:00|info|qemu[17672]|[17672]|handle_qmp_command[3930]|: -> -qmp_cmd_name: cont -> -2017-03-01T12:54:37.524733+08:00|info|qemu[17672]|[17672]|virtio_set_status[725]|: -> -virtio-balloon device status is 7 that means DRIVER OK -> -2017-03-01T12:54:37.525434+08:00|info|qemu[17672]|[17672]|virtio_set_status[725]|: -> -virtio-net device status is 7 that means DRIVER OK -> -2017-03-01T12:54:37.525484+08:00|info|qemu[17672]|[17672]|virtio_set_status[725]|: -> -virtio-blk device status is 7 that means DRIVER OK -> -2017-03-01T12:54:37.525562+08:00|info|qemu[17672]|[17672]|virtio_set_status[725]|: -> -virtio-serial device status is 7 that means DRIVER OK -> -2017-03-01T12:54:37.527653+08:00|info|qemu[17672]|[17672]|vm_start[981]|: -> -vm_state-notify:3ms -> -2017-03-01T12:54:37.528523+08:00|info|qemu[17672]|[17672]|monitor_qapi_event_emit[479]|: -> -{"timestamp": {"seconds": 1488344077, "microseconds": 527699}, "event": -> -"RESUME"} -> -2017-03-01T12:54:37.530680+08:00|info|qemu[17672]|[33614]|migration_bitmap_sync[720]|: -> -this iteration cycle takes 3s, new dirtied data:0MB -> -2017-03-01T12:54:37.530909+08:00|info|qemu[17672]|[33614]|monitor_qapi_event_emit[479]|: -> -{"timestamp": {"seconds": 1488344077, "microseconds": 530733}, "event": -> -"MIGRATION_PASS", "data": {"pass": 3}} -> -2017-03-01T04:54:37.530997Z qemu-kvm: socket_writev_buffer: Got err=32 for -> -(131583/18446744073709551615) -> -qemu-kvm: -> -/home/abuild/rpmbuild/BUILD/qemu-kvm-2.6.0/hw/net/virtio_net.c:1519: -> -virtio_net_save: Assertion `!n->vhost_started' failed. 
-> -2017-03-01 12:54:43.028: shutting down -> -> -From qemu log, qemu received and processed migrate_cancel/cont qmp commands -> -after guest been stopped and entered the last round of migration. Then -> -migration thread try to save device state when guest is running(started by -> -cont command), causes assert and coredump. -> -This is because in last iter, we call cpu_synchronize_all_states() to -> -synchronize vcpu states, this call will release qemu_global_mutex and wait -> -for do_kvm_cpu_synchronize_state() to be executed on target vcpu: -> -(gdb) bt -> -#0 0x00007f763d1046d5 in pthread_cond_wait@@GLIBC_2.3.2 () from -> -/lib64/libpthread.so.0 -> -#1 0x00007f7643e51d7f in qemu_cond_wait (cond=0x7f764445eca0 -> -<qemu_work_cond>, mutex=0x7f764445eba0 <qemu_global_mutex>) at -> -util/qemu-thread-posix.c:132 -> -#2 0x00007f7643a2e154 in run_on_cpu (cpu=0x7f7644e06d80, func=0x7f7643a46413 -> -<do_kvm_cpu_synchronize_state>, data=0x7f7644e06d80) at -> -/mnt/public/yanghy/qemu-kvm/cpus.c:995 -> -#3 0x00007f7643a46487 in kvm_cpu_synchronize_state (cpu=0x7f7644e06d80) at -> -/mnt/public/yanghy/qemu-kvm/kvm-all.c:1805 -> -#4 0x00007f7643a2c700 in cpu_synchronize_state (cpu=0x7f7644e06d80) at -> -/mnt/public/yanghy/qemu-kvm/include/sysemu/kvm.h:457 -> -#5 0x00007f7643a2db0c in cpu_synchronize_all_states () at -> -/mnt/public/yanghy/qemu-kvm/cpus.c:766 -> -#6 0x00007f7643a67b5b in qemu_savevm_state_complete_precopy -> -(f=0x7f76462f2d30, iterable_only=false) at -> -/mnt/public/yanghy/qemu-kvm/migration/savevm.c:1051 -> -#7 0x00007f7643d121e9 in migration_completion (s=0x7f76443e78c0 -> -<current_migration.37571>, current_active_state=4, -> -old_vm_running=0x7f74343fda00, start_time=0x7f74343fda08) at -> -migration/migration.c:1753 -> -#8 0x00007f7643d126c5 in migration_thread (opaque=0x7f76443e78c0 -> -<current_migration.37571>) at migration/migration.c:1922 -> -#9 0x00007f763d100dc5 in start_thread () from /lib64/libpthread.so.0 -> -#10 0x00007f763ce2e71d in clone () from /lib64/libc.so.6 -> -(gdb) p iothread_locked -> -$1 = true -> -> -and then, qemu main thread been executed, it won't block because migration -> -thread released the qemu_global_mutex: -> -(gdb) thr 1 -> -[Switching to thread 1 (Thread 0x7fe298e08bc0 (LWP 30767))] -> -#0 os_host_main_loop_wait (timeout=931565) at main-loop.c:270 -> -270 QEMU_LOG(LOG_INFO,"***** after qemu_pool_ns: timeout -> -%d\n", timeout); -> -(gdb) p iothread_locked -> -$2 = true -> -(gdb) l 268 -> -263 -> -264 ret = qemu_poll_ns((GPollFD *)gpollfds->data, gpollfds->len, -> -timeout); -> -265 -> -266 -> -267 if (timeout) { -> -268 qemu_mutex_lock_iothread(); -> -269 if (runstate_check(RUN_STATE_FINISH_MIGRATE)) { -> -270 QEMU_LOG(LOG_INFO,"***** after qemu_pool_ns: timeout -> -%d\n", timeout); -> -271 } -> -272 } -> -(gdb) -> -> -So, although we've hold iothread_lock in stop© phase of migration, we -> -can't guarantee the iothread been locked all through the stop & copy phase, -> -any thoughts on how to solve this problem? -Ouch that's pretty nasty; I remember Paolo explaining to me a while ago that -their were times when run_on_cpu would have to drop the BQL and I worried about -it, -but this is the 1st time I've seen an error due to it. - -Do you know what the migration state was at that point? Was it -MIGRATION_STATUS_CANCELLING? -I'm thinking perhaps we should stop 'cont' from continuing while migration is in -MIGRATION_STATUS_CANCELLING. Do we send an event when we hit CANCELLED - so -that -perhaps libvirt could avoid sending the 'cont' until then? 
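
[Editor's illustration] A standalone C sketch of the guard suggested above, i.e. making 'cont' fail while the cancel is still being processed. Every type and helper below is a local stand-in invented for the example; it is not the real qmp_cont()/migration code.

```c
#include <stdbool.h>
#include <stdio.h>

/* stand-in for QEMU's internal migration status */
typedef enum {
    MIG_STATUS_ACTIVE,
    MIG_STATUS_CANCELLING,   /* cancel requested, rollback still running */
    MIG_STATUS_CANCELLED,
} MigStatus;

static MigStatus migration_status = MIG_STATUS_CANCELLING;

static bool migration_is_cancelling(void)
{
    return migration_status == MIG_STATUS_CANCELLING;
}

static void vm_start(void)
{
    printf("vcpus restarted\n");
}

/* what a guarded 'cont' handler could look like */
static int handle_cont(void)
{
    if (migration_is_cancelling()) {
        fprintf(stderr, "cont rejected: migration cancel still in progress; "
                        "wait for the MIGRATION event to report 'cancelled'\n");
        return -1;
    }
    vm_start();
    return 0;
}

int main(void)
{
    handle_cont();                          /* rejected while cancelling */
    migration_status = MIG_STATUS_CANCELLED;
    return handle_cont() == 0 ? 0 : 1;      /* allowed once cancel finished */
}
```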
- -Dave - - -> -> -Thanks, -> --Gonglei -> --- -Dr. David Alan Gilbert / address@hidden / Manchester, UK - -On 03/03/2017 13:00, Dr. David Alan Gilbert wrote: -> -Ouch that's pretty nasty; I remember Paolo explaining to me a while ago that -> -their were times when run_on_cpu would have to drop the BQL and I worried -> -about it, -> -but this is the 1st time I've seen an error due to it. -> -> -Do you know what the migration state was at that point? Was it -> -MIGRATION_STATUS_CANCELLING? -> -I'm thinking perhaps we should stop 'cont' from continuing while migration is -> -in -> -MIGRATION_STATUS_CANCELLING. Do we send an event when we hit CANCELLED - so -> -that -> -perhaps libvirt could avoid sending the 'cont' until then? -No, there's no event, though I thought libvirt would poll until -"query-migrate" returns the cancelled state. Of course that is a small -consolation, because a segfault is unacceptable. - -One possibility is to suspend the monitor in qmp_migrate_cancel and -resume it (with add_migration_state_change_notifier) when we hit the -CANCELLED state. I'm not sure what the latency would be between the end -of migrate_fd_cancel and finally reaching CANCELLED. - -Paolo - -* Paolo Bonzini (address@hidden) wrote: -> -> -> -On 03/03/2017 13:00, Dr. David Alan Gilbert wrote: -> -> Ouch that's pretty nasty; I remember Paolo explaining to me a while ago that -> -> their were times when run_on_cpu would have to drop the BQL and I worried -> -> about it, -> -> but this is the 1st time I've seen an error due to it. -> -> -> -> Do you know what the migration state was at that point? Was it -> -> MIGRATION_STATUS_CANCELLING? -> -> I'm thinking perhaps we should stop 'cont' from continuing while migration -> -> is in -> -> MIGRATION_STATUS_CANCELLING. Do we send an event when we hit CANCELLED - -> -> so that -> -> perhaps libvirt could avoid sending the 'cont' until then? -> -> -No, there's no event, though I thought libvirt would poll until -> -"query-migrate" returns the cancelled state. Of course that is a small -> -consolation, because a segfault is unacceptable. -I think you might get an event if you set the new migrate capability called -'events' on! - -void migrate_set_state(int *state, int old_state, int new_state) -{ - if (atomic_cmpxchg(state, old_state, new_state) == old_state) { - trace_migrate_set_state(new_state); - migrate_generate_event(new_state); - } -} - -static void migrate_generate_event(int new_state) -{ - if (migrate_use_events()) { - qapi_event_send_migration(new_state, &error_abort); - } -} - -That event feature went in sometime after 2.3.0. - -> -One possibility is to suspend the monitor in qmp_migrate_cancel and -> -resume it (with add_migration_state_change_notifier) when we hit the -> -CANCELLED state. I'm not sure what the latency would be between the end -> -of migrate_fd_cancel and finally reaching CANCELLED. -I don't like suspending monitors; it can potentially take quite a significant -time to do a cancel. -How about making 'cont' fail if we're in CANCELLING? - -I'd really love to see the 'run_on_cpu' being more careful about the BQL; -we really need all of the rest of the devices to stay quiesced at times. - -Dave - -> -Paolo --- -Dr. David Alan Gilbert / address@hidden / Manchester, UK - -On 03/03/2017 14:11, Dr. David Alan Gilbert wrote: -> -* Paolo Bonzini (address@hidden) wrote: -> -> -> -> -> -> On 03/03/2017 13:00, Dr. 
David Alan Gilbert wrote: -> ->> Ouch that's pretty nasty; I remember Paolo explaining to me a while ago that -> ->> their were times when run_on_cpu would have to drop the BQL and I worried -> ->> about it, -> ->> but this is the 1st time I've seen an error due to it. -> ->> -> ->> Do you know what the migration state was at that point? Was it -> ->> MIGRATION_STATUS_CANCELLING? -> ->> I'm thinking perhaps we should stop 'cont' from continuing while migration -> ->> is in -> ->> MIGRATION_STATUS_CANCELLING. Do we send an event when we hit CANCELLED - -> ->> so that -> ->> perhaps libvirt could avoid sending the 'cont' until then? -> -> -> -> No, there's no event, though I thought libvirt would poll until -> -> "query-migrate" returns the cancelled state. Of course that is a small -> -> consolation, because a segfault is unacceptable. -> -> -I think you might get an event if you set the new migrate capability called -> -'events' on! -> -> -void migrate_set_state(int *state, int old_state, int new_state) -> -{ -> -if (atomic_cmpxchg(state, old_state, new_state) == old_state) { -> -trace_migrate_set_state(new_state); -> -migrate_generate_event(new_state); -> -} -> -} -> -> -static void migrate_generate_event(int new_state) -> -{ -> -if (migrate_use_events()) { -> -qapi_event_send_migration(new_state, &error_abort); -> -} -> -} -> -> -That event feature went in sometime after 2.3.0. -> -> -> One possibility is to suspend the monitor in qmp_migrate_cancel and -> -> resume it (with add_migration_state_change_notifier) when we hit the -> -> CANCELLED state. I'm not sure what the latency would be between the end -> -> of migrate_fd_cancel and finally reaching CANCELLED. -> -> -I don't like suspending monitors; it can potentially take quite a significant -> -time to do a cancel. -> -How about making 'cont' fail if we're in CANCELLING? -Actually I thought that would be the case already (in fact CANCELLING is -internal only; the outside world sees it as "active" in query-migrate). - -Lei, what is the runstate? (That is, why did cont succeed at all)? - -Paolo - -> -I'd really love to see the 'run_on_cpu' being more careful about the BQL; -> -we really need all of the rest of the devices to stay quiesced at times. -That's not really possible, because of how condition variables work. :( - -* Paolo Bonzini (address@hidden) wrote: -> -> -> -On 03/03/2017 14:11, Dr. David Alan Gilbert wrote: -> -> * Paolo Bonzini (address@hidden) wrote: -> ->> -> ->> -> ->> On 03/03/2017 13:00, Dr. David Alan Gilbert wrote: -> ->>> Ouch that's pretty nasty; I remember Paolo explaining to me a while ago -> ->>> that -> ->>> their were times when run_on_cpu would have to drop the BQL and I worried -> ->>> about it, -> ->>> but this is the 1st time I've seen an error due to it. -> ->>> -> ->>> Do you know what the migration state was at that point? Was it -> ->>> MIGRATION_STATUS_CANCELLING? -> ->>> I'm thinking perhaps we should stop 'cont' from continuing while -> ->>> migration is in -> ->>> MIGRATION_STATUS_CANCELLING. Do we send an event when we hit CANCELLED - -> ->>> so that -> ->>> perhaps libvirt could avoid sending the 'cont' until then? -> ->> -> ->> No, there's no event, though I thought libvirt would poll until -> ->> "query-migrate" returns the cancelled state. Of course that is a small -> ->> consolation, because a segfault is unacceptable. -> -> -> -> I think you might get an event if you set the new migrate capability called -> -> 'events' on! 
-> -> -> -> void migrate_set_state(int *state, int old_state, int new_state) -> -> { -> -> if (atomic_cmpxchg(state, old_state, new_state) == old_state) { -> -> trace_migrate_set_state(new_state); -> -> migrate_generate_event(new_state); -> -> } -> -> } -> -> -> -> static void migrate_generate_event(int new_state) -> -> { -> -> if (migrate_use_events()) { -> -> qapi_event_send_migration(new_state, &error_abort); -> -> } -> -> } -> -> -> -> That event feature went in sometime after 2.3.0. -> -> -> ->> One possibility is to suspend the monitor in qmp_migrate_cancel and -> ->> resume it (with add_migration_state_change_notifier) when we hit the -> ->> CANCELLED state. I'm not sure what the latency would be between the end -> ->> of migrate_fd_cancel and finally reaching CANCELLED. -> -> -> -> I don't like suspending monitors; it can potentially take quite a -> -> significant -> -> time to do a cancel. -> -> How about making 'cont' fail if we're in CANCELLING? -> -> -Actually I thought that would be the case already (in fact CANCELLING is -> -internal only; the outside world sees it as "active" in query-migrate). -> -> -Lei, what is the runstate? (That is, why did cont succeed at all)? -I suspect it's RUN_STATE_FINISH_MIGRATE - we set that before we do the device -save, and that's what we get at the end of a migrate and it's legal to restart -from there. - -> -Paolo -> -> -> I'd really love to see the 'run_on_cpu' being more careful about the BQL; -> -> we really need all of the rest of the devices to stay quiesced at times. -> -> -That's not really possible, because of how condition variables work. :( -*Really* we need to find a solution to that - there's probably lots of -other things that can spring up in that small window other than the -'cont'. - -Dave - --- -Dr. David Alan Gilbert / address@hidden / Manchester, UK - -On 03/03/2017 14:26, Dr. David Alan Gilbert wrote: -> -* Paolo Bonzini (address@hidden) wrote: -> -> -> -> -> -> On 03/03/2017 14:11, Dr. David Alan Gilbert wrote: -> ->> * Paolo Bonzini (address@hidden) wrote: -> ->>> -> ->>> -> ->>> On 03/03/2017 13:00, Dr. David Alan Gilbert wrote: -> ->>>> Ouch that's pretty nasty; I remember Paolo explaining to me a while ago -> ->>>> that -> ->>>> their were times when run_on_cpu would have to drop the BQL and I worried -> ->>>> about it, -> ->>>> but this is the 1st time I've seen an error due to it. -> ->>>> -> ->>>> Do you know what the migration state was at that point? Was it -> ->>>> MIGRATION_STATUS_CANCELLING? -> ->>>> I'm thinking perhaps we should stop 'cont' from continuing while -> ->>>> migration is in -> ->>>> MIGRATION_STATUS_CANCELLING. Do we send an event when we hit CANCELLED - -> ->>>> so that -> ->>>> perhaps libvirt could avoid sending the 'cont' until then? -> ->>> -> ->>> No, there's no event, though I thought libvirt would poll until -> ->>> "query-migrate" returns the cancelled state. Of course that is a small -> ->>> consolation, because a segfault is unacceptable. -> ->> -> ->> I think you might get an event if you set the new migrate capability called -> ->> 'events' on! 
-> ->> -> ->> void migrate_set_state(int *state, int old_state, int new_state) -> ->> { -> ->> if (atomic_cmpxchg(state, old_state, new_state) == old_state) { -> ->> trace_migrate_set_state(new_state); -> ->> migrate_generate_event(new_state); -> ->> } -> ->> } -> ->> -> ->> static void migrate_generate_event(int new_state) -> ->> { -> ->> if (migrate_use_events()) { -> ->> qapi_event_send_migration(new_state, &error_abort); -> ->> } -> ->> } -> ->> -> ->> That event feature went in sometime after 2.3.0. -> ->> -> ->>> One possibility is to suspend the monitor in qmp_migrate_cancel and -> ->>> resume it (with add_migration_state_change_notifier) when we hit the -> ->>> CANCELLED state. I'm not sure what the latency would be between the end -> ->>> of migrate_fd_cancel and finally reaching CANCELLED. -> ->> -> ->> I don't like suspending monitors; it can potentially take quite a -> ->> significant -> ->> time to do a cancel. -> ->> How about making 'cont' fail if we're in CANCELLING? -> -> -> -> Actually I thought that would be the case already (in fact CANCELLING is -> -> internal only; the outside world sees it as "active" in query-migrate). -> -> -> -> Lei, what is the runstate? (That is, why did cont succeed at all)? -> -> -I suspect it's RUN_STATE_FINISH_MIGRATE - we set that before we do the device -> -save, and that's what we get at the end of a migrate and it's legal to restart -> -from there. -Yeah, but I think we get there at the end of a failed migrate only. So -perhaps we can introduce a new state RUN_STATE_FAILED_MIGRATE and forbid -"cont" from finish-migrate (only allow it from failed-migrate)? - -Paolo - -> -> Paolo -> -> -> ->> I'd really love to see the 'run_on_cpu' being more careful about the BQL; -> ->> we really need all of the rest of the devices to stay quiesced at times. -> -> -> -> That's not really possible, because of how condition variables work. :( -> -> -*Really* we need to find a solution to that - there's probably lots of -> -other things that can spring up in that small window other than the -> -'cont'. -> -> -Dave -> -> --- -> -Dr. David Alan Gilbert / address@hidden / Manchester, UK -> - -Hi Paolo, - -On Fri, Mar 3, 2017 at 9:33 PM, Paolo Bonzini <address@hidden> wrote: - -> -> -> -On 03/03/2017 14:26, Dr. David Alan Gilbert wrote: -> -> * Paolo Bonzini (address@hidden) wrote: -> ->> -> ->> -> ->> On 03/03/2017 14:11, Dr. David Alan Gilbert wrote: -> ->>> * Paolo Bonzini (address@hidden) wrote: -> ->>>> -> ->>>> -> ->>>> On 03/03/2017 13:00, Dr. David Alan Gilbert wrote: -> ->>>>> Ouch that's pretty nasty; I remember Paolo explaining to me a while -> -ago that -> ->>>>> their were times when run_on_cpu would have to drop the BQL and I -> -worried about it, -> ->>>>> but this is the 1st time I've seen an error due to it. -> ->>>>> -> ->>>>> Do you know what the migration state was at that point? Was it -> -MIGRATION_STATUS_CANCELLING? -> ->>>>> I'm thinking perhaps we should stop 'cont' from continuing while -> -migration is in -> ->>>>> MIGRATION_STATUS_CANCELLING. Do we send an event when we hit -> -CANCELLED - so that -> ->>>>> perhaps libvirt could avoid sending the 'cont' until then? -> ->>>> -> ->>>> No, there's no event, though I thought libvirt would poll until -> ->>>> "query-migrate" returns the cancelled state. Of course that is a -> -small -> ->>>> consolation, because a segfault is unacceptable. -> ->>> -> ->>> I think you might get an event if you set the new migrate capability -> -called -> ->>> 'events' on! 
-> ->>> -> ->>> void migrate_set_state(int *state, int old_state, int new_state) -> ->>> { -> ->>> if (atomic_cmpxchg(state, old_state, new_state) == old_state) { -> ->>> trace_migrate_set_state(new_state); -> ->>> migrate_generate_event(new_state); -> ->>> } -> ->>> } -> ->>> -> ->>> static void migrate_generate_event(int new_state) -> ->>> { -> ->>> if (migrate_use_events()) { -> ->>> qapi_event_send_migration(new_state, &error_abort); -> ->>> } -> ->>> } -> ->>> -> ->>> That event feature went in sometime after 2.3.0. -> ->>> -> ->>>> One possibility is to suspend the monitor in qmp_migrate_cancel and -> ->>>> resume it (with add_migration_state_change_notifier) when we hit the -> ->>>> CANCELLED state. I'm not sure what the latency would be between the -> -end -> ->>>> of migrate_fd_cancel and finally reaching CANCELLED. -> ->>> -> ->>> I don't like suspending monitors; it can potentially take quite a -> -significant -> ->>> time to do a cancel. -> ->>> How about making 'cont' fail if we're in CANCELLING? -> ->> -> ->> Actually I thought that would be the case already (in fact CANCELLING is -> ->> internal only; the outside world sees it as "active" in query-migrate). -> ->> -> ->> Lei, what is the runstate? (That is, why did cont succeed at all)? -> -> -> -> I suspect it's RUN_STATE_FINISH_MIGRATE - we set that before we do the -> -device -> -> save, and that's what we get at the end of a migrate and it's legal to -> -restart -> -> from there. -> -> -Yeah, but I think we get there at the end of a failed migrate only. So -> -perhaps we can introduce a new state RUN_STATE_FAILED_MIGRATE -I think we do not need to introduce a new state here. If we hit 'cont' and -the run state is RUN_STATE_FINISH_MIGRATE, we could assume that -migration failed because 'RUN_STATE_FINISH_MIGRATE' only exists on -source side, means we are finishing migration, a 'cont' at the meantime -indicates that we are rolling back, otherwise source side should be -destroyed. - - -> -and forbid -> -"cont" from finish-migrate (only allow it from failed-migrate)? -> -The problem of forbid 'cont' here is that it will result in a failed -migration and the source -side will remain paused. We actually expect a usable guest when rollback. -Is there a way to kill migration thread when we're under main thread, if -there is, we -could do the following to solve this problem: -1. 'cont' received during runstate RUN_STATE_FINISH_MIGRATE -2. kill migration thread -3. vm_start() - -But this only solves 'cont' problem. As Dave said before, other things could -happen during the small windows while we are finishing migration, that's -what I was worried about... - - -> -Paolo -> -> ->> Paolo -> ->> -> ->>> I'd really love to see the 'run_on_cpu' being more careful about the -> -BQL; -> ->>> we really need all of the rest of the devices to stay quiesced at -> -times. -> ->> -> ->> That's not really possible, because of how condition variables work. :( -> -> -> -> *Really* we need to find a solution to that - there's probably lots of -> -> other things that can spring up in that small window other than the -> -> 'cont'. -> -> -> -> Dave -> -> -> -> -- -> -> Dr. David Alan Gilbert / address@hidden / Manchester, UK -> -> -> -> - -* Paolo Bonzini (address@hidden) wrote: -> -> -> -On 03/03/2017 14:26, Dr. David Alan Gilbert wrote: -> -> * Paolo Bonzini (address@hidden) wrote: -> ->> -> ->> -> ->> On 03/03/2017 14:11, Dr. David Alan Gilbert wrote: -> ->>> * Paolo Bonzini (address@hidden) wrote: -> ->>>> -> ->>>> -> ->>>> On 03/03/2017 13:00, Dr. 
David Alan Gilbert wrote: -> ->>>>> Ouch that's pretty nasty; I remember Paolo explaining to me a while ago -> ->>>>> that -> ->>>>> their were times when run_on_cpu would have to drop the BQL and I -> ->>>>> worried about it, -> ->>>>> but this is the 1st time I've seen an error due to it. -> ->>>>> -> ->>>>> Do you know what the migration state was at that point? Was it -> ->>>>> MIGRATION_STATUS_CANCELLING? -> ->>>>> I'm thinking perhaps we should stop 'cont' from continuing while -> ->>>>> migration is in -> ->>>>> MIGRATION_STATUS_CANCELLING. Do we send an event when we hit CANCELLED -> ->>>>> - so that -> ->>>>> perhaps libvirt could avoid sending the 'cont' until then? -> ->>>> -> ->>>> No, there's no event, though I thought libvirt would poll until -> ->>>> "query-migrate" returns the cancelled state. Of course that is a small -> ->>>> consolation, because a segfault is unacceptable. -> ->>> -> ->>> I think you might get an event if you set the new migrate capability -> ->>> called -> ->>> 'events' on! -> ->>> -> ->>> void migrate_set_state(int *state, int old_state, int new_state) -> ->>> { -> ->>> if (atomic_cmpxchg(state, old_state, new_state) == old_state) { -> ->>> trace_migrate_set_state(new_state); -> ->>> migrate_generate_event(new_state); -> ->>> } -> ->>> } -> ->>> -> ->>> static void migrate_generate_event(int new_state) -> ->>> { -> ->>> if (migrate_use_events()) { -> ->>> qapi_event_send_migration(new_state, &error_abort); -> ->>> } -> ->>> } -> ->>> -> ->>> That event feature went in sometime after 2.3.0. -> ->>> -> ->>>> One possibility is to suspend the monitor in qmp_migrate_cancel and -> ->>>> resume it (with add_migration_state_change_notifier) when we hit the -> ->>>> CANCELLED state. I'm not sure what the latency would be between the end -> ->>>> of migrate_fd_cancel and finally reaching CANCELLED. -> ->>> -> ->>> I don't like suspending monitors; it can potentially take quite a -> ->>> significant -> ->>> time to do a cancel. -> ->>> How about making 'cont' fail if we're in CANCELLING? -> ->> -> ->> Actually I thought that would be the case already (in fact CANCELLING is -> ->> internal only; the outside world sees it as "active" in query-migrate). -> ->> -> ->> Lei, what is the runstate? (That is, why did cont succeed at all)? -> -> -> -> I suspect it's RUN_STATE_FINISH_MIGRATE - we set that before we do the -> -> device -> -> save, and that's what we get at the end of a migrate and it's legal to -> -> restart -> -> from there. -> -> -Yeah, but I think we get there at the end of a failed migrate only. So -> -perhaps we can introduce a new state RUN_STATE_FAILED_MIGRATE and forbid -> -"cont" from finish-migrate (only allow it from failed-migrate)? -OK, I was wrong in my previous statement; we actually go -FINISH_MIGRATE->POSTMIGRATE -so no new state is needed; you shouldn't be restarting the cpu in -FINISH_MIGRATE. - -My preference is to get libvirt to wait for the transition to POSTMIGRATE before -it issues the 'cont'. I'd rather not block the monitor with 'cont' but I'm -not sure how we'd cleanly make cont fail without breaking existing libvirts -that usually don't hit this race. (cc'ing in Jiri). - -Dave - -> -Paolo -> -> ->> Paolo -> ->> -> ->>> I'd really love to see the 'run_on_cpu' being more careful about the BQL; -> ->>> we really need all of the rest of the devices to stay quiesced at times. -> ->> -> ->> That's not really possible, because of how condition variables work. 
:( -> -> -> -> *Really* we need to find a solution to that - there's probably lots of -> -> other things that can spring up in that small window other than the -> -> 'cont'. -> -> -> -> Dave -> -> -> -> -- -> -> Dr. David Alan Gilbert / address@hidden / Manchester, UK -> -> --- -Dr. David Alan Gilbert / address@hidden / Manchester, UK - -Hi Dave, - -On Fri, Mar 3, 2017 at 9:26 PM, Dr. David Alan Gilbert <address@hidden> -wrote: - -> -* Paolo Bonzini (address@hidden) wrote: -> -> -> -> -> -> On 03/03/2017 14:11, Dr. David Alan Gilbert wrote: -> -> > * Paolo Bonzini (address@hidden) wrote: -> -> >> -> -> >> -> -> >> On 03/03/2017 13:00, Dr. David Alan Gilbert wrote: -> -... -> -> > That event feature went in sometime after 2.3.0. -> -> > -> -> >> One possibility is to suspend the monitor in qmp_migrate_cancel and -> -> >> resume it (with add_migration_state_change_notifier) when we hit the -> -> >> CANCELLED state. I'm not sure what the latency would be between the -> -end -> -> >> of migrate_fd_cancel and finally reaching CANCELLED. -> -> > -> -> > I don't like suspending monitors; it can potentially take quite a -> -significant -> -> > time to do a cancel. -> -> > How about making 'cont' fail if we're in CANCELLING? -> -> -> -> Actually I thought that would be the case already (in fact CANCELLING is -> -> internal only; the outside world sees it as "active" in query-migrate). -> -> -> -> Lei, what is the runstate? (That is, why did cont succeed at all)? -> -> -I suspect it's RUN_STATE_FINISH_MIGRATE - we set that before we do the -> -device -> -It is RUN_STATE_FINISH_MIGRATE. - - -> -save, and that's what we get at the end of a migrate and it's legal to -> -restart -> -from there. -> -> -> Paolo -> -> -> -> > I'd really love to see the 'run_on_cpu' being more careful about the -> -BQL; -> -> > we really need all of the rest of the devices to stay quiesced at -> -times. -> -> -> -> That's not really possible, because of how condition variables work. :( -> -> -*Really* we need to find a solution to that - there's probably lots of -> -other things that can spring up in that small window other than the -> -'cont'. -> -This is what I was worry about. Not only sync_cpu_state() will call -run_on_cpu() -but also vm_stop_force_state() will, both of them did hit the small windows -in our -test. - - -> -> -Dave -> -> --- -> -Dr. David Alan Gilbert / address@hidden / Manchester, UK -> -> - diff --git a/results/classifier/013/risc-v/25892827 b/results/classifier/013/risc-v/25892827 deleted file mode 100644 index 5c0ad470b..000000000 --- a/results/classifier/013/risc-v/25892827 +++ /dev/null @@ -1,1105 +0,0 @@ -risc-v: 0.908 -user-level: 0.889 -permissions: 0.881 -register: 0.876 -KVM: 0.872 -hypervisor: 0.871 -operating system: 0.871 -debug: 0.868 -x86: 0.849 -vnc: 0.846 -system: 0.846 -mistranslation: 0.842 -boot: 0.839 -network: 0.839 -VMM: 0.839 -device: 0.839 -TCG: 0.837 -virtual: 0.835 -i386: 0.835 -peripherals: 0.833 -graphic: 0.832 -assembly: 0.829 -architecture: 0.825 -semantic: 0.825 -ppc: 0.824 -socket: 0.822 -arm: 0.821 -performance: 0.819 -alpha: 0.816 -kernel: 0.810 -files: 0.804 -PID: 0.792 - -[Qemu-devel] [BUG/RFC] Two cpus are not brought up normally in SLES11 sp3 VM after reboot - -Hi, - -Recently we encountered a problem in our project: 2 CPUs in VM are not brought -up normally after reboot. - -Our host is using KVM kmod 3.6 and QEMU 2.1. -A SLES 11 sp3 VM configured with 8 vcpus, -cpu model is configured with 'host-passthrough'. 
- -After VM's first time started up, everything seems to be OK. -and then VM is paniced and rebooted. -After reboot, only 6 cpus are brought up in VM, cpu1 and cpu7 are not online. - -This is the only message we can get from VM: -VM dmesg shows: -[ 0.069867] Booting Node 0, Processors #1 -[ 5.060042] CPU1: Stuck ?? -[ 5.060499] #2 -[ 5.088322] kvm-clock: cpu 2, msr 6:3fc90901, secondary cpu clock -[ 5.088335] KVM setup async PF for cpu 2 -[ 5.092967] NMI watchdog enabled, takes one hw-pmu counter. -[ 5.094405] #3 -[ 5.108324] kvm-clock: cpu 3, msr 6:3fcd0901, secondary cpu clock -[ 5.108333] KVM setup async PF for cpu 3 -[ 5.113553] NMI watchdog enabled, takes one hw-pmu counter. -[ 5.114970] #4 -[ 5.128325] kvm-clock: cpu 4, msr 6:3fd10901, secondary cpu clock -[ 5.128336] KVM setup async PF for cpu 4 -[ 5.134576] NMI watchdog enabled, takes one hw-pmu counter. -[ 5.135998] #5 -[ 5.152324] kvm-clock: cpu 5, msr 6:3fd50901, secondary cpu clock -[ 5.152334] KVM setup async PF for cpu 5 -[ 5.154764] NMI watchdog enabled, takes one hw-pmu counter. -[ 5.156467] #6 -[ 5.172327] kvm-clock: cpu 6, msr 6:3fd90901, secondary cpu clock -[ 5.172341] KVM setup async PF for cpu 6 -[ 5.180738] NMI watchdog enabled, takes one hw-pmu counter. -[ 5.182173] #7 Ok. -[ 10.170815] CPU7: Stuck ?? -[ 10.171648] Brought up 6 CPUs -[ 10.172394] Total of 6 processors activated (28799.97 BogoMIPS). - -From host, we found that QEMU vcpu1 thread and vcpu7 thread were not consuming -any cpu (Should be in idle state), -All of VCPUs' stacks in host is like bellow: - -[<ffffffffa07089b5>] kvm_vcpu_block+0x65/0xa0 [kvm] -[<ffffffffa071c7c1>] __vcpu_run+0xd1/0x260 [kvm] -[<ffffffffa071d508>] kvm_arch_vcpu_ioctl_run+0x68/0x1a0 [kvm] -[<ffffffffa0709cee>] kvm_vcpu_ioctl+0x38e/0x580 [kvm] -[<ffffffff8116be8b>] do_vfs_ioctl+0x8b/0x3b0 -[<ffffffff8116c251>] sys_ioctl+0xa1/0xb0 -[<ffffffff81468092>] system_call_fastpath+0x16/0x1b -[<00002ab9fe1f99a7>] 0x2ab9fe1f99a7 -[<ffffffffffffffff>] 0xffffffffffffffff - -We looked into the kernel codes that could leading to the above 'Stuck' warning, -and found that the only possible is the emulation of 'cpuid' instruct in -kvm/qemu has something wrong. -But since we canât reproduce this problem, we are not quite sure. -Is there any possible that the cupid emulation in kvm/qemu has some bug ? - -Has anyone come across these problem before? Or any idea? - -Thanks, -zhanghailiang - -On 06/07/2015 09:54, zhanghailiang wrote: -> -> -From host, we found that QEMU vcpu1 thread and vcpu7 thread were not -> -consuming any cpu (Should be in idle state), -> -All of VCPUs' stacks in host is like bellow: -> -> -[<ffffffffa07089b5>] kvm_vcpu_block+0x65/0xa0 [kvm] -> -[<ffffffffa071c7c1>] __vcpu_run+0xd1/0x260 [kvm] -> -[<ffffffffa071d508>] kvm_arch_vcpu_ioctl_run+0x68/0x1a0 [kvm] -> -[<ffffffffa0709cee>] kvm_vcpu_ioctl+0x38e/0x580 [kvm] -> -[<ffffffff8116be8b>] do_vfs_ioctl+0x8b/0x3b0 -> -[<ffffffff8116c251>] sys_ioctl+0xa1/0xb0 -> -[<ffffffff81468092>] system_call_fastpath+0x16/0x1b -> -[<00002ab9fe1f99a7>] 0x2ab9fe1f99a7 -> -[<ffffffffffffffff>] 0xffffffffffffffff -> -> -We looked into the kernel codes that could leading to the above 'Stuck' -> -warning, -> -and found that the only possible is the emulation of 'cpuid' instruct in -> -kvm/qemu has something wrong. -> -But since we canât reproduce this problem, we are not quite sure. -> -Is there any possible that the cupid emulation in kvm/qemu has some bug ? -Can you explain the relationship to the cpuid emulation? 
What do the -traces say about vcpus 1 and 7? - -Paolo - -On 2015/7/6 16:45, Paolo Bonzini wrote: -On 06/07/2015 09:54, zhanghailiang wrote: -From host, we found that QEMU vcpu1 thread and vcpu7 thread were not -consuming any cpu (Should be in idle state), -All of VCPUs' stacks in host is like bellow: - -[<ffffffffa07089b5>] kvm_vcpu_block+0x65/0xa0 [kvm] -[<ffffffffa071c7c1>] __vcpu_run+0xd1/0x260 [kvm] -[<ffffffffa071d508>] kvm_arch_vcpu_ioctl_run+0x68/0x1a0 [kvm] -[<ffffffffa0709cee>] kvm_vcpu_ioctl+0x38e/0x580 [kvm] -[<ffffffff8116be8b>] do_vfs_ioctl+0x8b/0x3b0 -[<ffffffff8116c251>] sys_ioctl+0xa1/0xb0 -[<ffffffff81468092>] system_call_fastpath+0x16/0x1b -[<00002ab9fe1f99a7>] 0x2ab9fe1f99a7 -[<ffffffffffffffff>] 0xffffffffffffffff - -We looked into the kernel codes that could leading to the above 'Stuck' -warning, -and found that the only possible is the emulation of 'cpuid' instruct in -kvm/qemu has something wrong. -But since we canât reproduce this problem, we are not quite sure. -Is there any possible that the cupid emulation in kvm/qemu has some bug ? -Can you explain the relationship to the cpuid emulation? What do the -traces say about vcpus 1 and 7? -OK, we searched the VM's kernel codes with the 'Stuck' message, and it is -located in -do_boot_cpu(). It's in BSP context, the call process is: -BSP executes start_kernel() -> smp_init() -> smp_boot_cpus() -> do_boot_cpu() --> wakeup_secondary_via_INIT() to trigger APs. -It will wait 5s for APs to startup, if some AP not startup normally, it will -print 'CPU%d Stuck' or 'CPU%d: Not responding'. - -If it prints 'Stuck', it means the AP has received the SIPI interrupt and -begins to execute the code -'ENTRY(trampoline_data)' (trampoline_64.S) , but be stuck in some places before -smp_callin()(smpboot.c). -The follow is the starup process of BSP and AP. -BSP: -start_kernel() - ->smp_init() - ->smp_boot_cpus() - ->do_boot_cpu() - ->start_ip = trampoline_address(); //set the address that AP will go -to execute - ->wakeup_secondary_cpu_via_init(); // kick the secondary CPU - ->for (timeout = 0; timeout < 50000; timeout++) - if (cpumask_test_cpu(cpu, cpu_callin_mask)) break;// check if AP -startup or not - -APs: -ENTRY(trampoline_data) (trampoline_64.S) - ->ENTRY(secondary_startup_64) (head_64.S) - ->start_secondary() (smpboot.c) - ->cpu_init(); - ->smp_callin(); - ->cpumask_set_cpu(cpuid, cpu_callin_mask); ->Note: if AP comes -here, the BSP will not prints the error message. - -From above call process, we can be sure that, the AP has been stuck between -trampoline_data and the cpumask_set_cpu() in -smp_callin(), we look through these codes path carefully, and only found a -'hlt' instruct that could block the process. -It is located in trampoline_data(): - -ENTRY(trampoline_data) - ... - - call verify_cpu # Verify the cpu supports long mode - testl %eax, %eax # Check for return code - jnz no_longmode - - ... - -no_longmode: - hlt - jmp no_longmode - -For the process verify_cpu(), -we can only find the 'cpuid' sensitive instruct that could lead VM exit from -No-root mode. -This is why we doubt if cpuid emulation is wrong in KVM/QEMU that leading to -the fail in verify_cpu. - -From the message in VM, we know vcpu1 and vcpu7 is something wrong. -[ 5.060042] CPU1: Stuck ?? -[ 10.170815] CPU7: Stuck ?? -[ 10.171648] Brought up 6 CPUs - -Besides, the follow is the cpus message got from host. 
-80FF72F5-FF6D-E411-A8C8-000000821800:/home/fsp/hrg # virsh qemu-monitor-command -instance-0000000 -* CPU #0: pc=0x00007f64160c683d thread_id=68570 - CPU #1: pc=0xffffffff810301f1 (halted) thread_id=68573 - CPU #2: pc=0xffffffff810301e2 (halted) thread_id=68575 - CPU #3: pc=0xffffffff810301e2 (halted) thread_id=68576 - CPU #4: pc=0xffffffff810301e2 (halted) thread_id=68577 - CPU #5: pc=0xffffffff810301e2 (halted) thread_id=68578 - CPU #6: pc=0xffffffff810301e2 (halted) thread_id=68583 - CPU #7: pc=0xffffffff810301f1 (halted) thread_id=68584 - -Oh, i also forgot to mention in the above message that, we have bond each vCPU -to different physical CPU in -host. - -Thanks, -zhanghailiang - -On 06/07/2015 11:59, zhanghailiang wrote: -> -> -> -Besides, the follow is the cpus message got from host. -> -80FF72F5-FF6D-E411-A8C8-000000821800:/home/fsp/hrg # virsh -> -qemu-monitor-command instance-0000000 -> -* CPU #0: pc=0x00007f64160c683d thread_id=68570 -> -CPU #1: pc=0xffffffff810301f1 (halted) thread_id=68573 -> -CPU #2: pc=0xffffffff810301e2 (halted) thread_id=68575 -> -CPU #3: pc=0xffffffff810301e2 (halted) thread_id=68576 -> -CPU #4: pc=0xffffffff810301e2 (halted) thread_id=68577 -> -CPU #5: pc=0xffffffff810301e2 (halted) thread_id=68578 -> -CPU #6: pc=0xffffffff810301e2 (halted) thread_id=68583 -> -CPU #7: pc=0xffffffff810301f1 (halted) thread_id=68584 -> -> -Oh, i also forgot to mention in the above message that, we have bond -> -each vCPU to different physical CPU in -> -host. -Can you capture a trace on the host (trace-cmd record -e kvm) and send -it privately? Please note which CPUs get stuck, since I guess it's not -always 1 and 7. - -Paolo - -On Mon, 6 Jul 2015 17:59:10 +0800 -zhanghailiang <address@hidden> wrote: - -> -On 2015/7/6 16:45, Paolo Bonzini wrote: -> -> -> -> -> -> On 06/07/2015 09:54, zhanghailiang wrote: -> ->> -> ->> From host, we found that QEMU vcpu1 thread and vcpu7 thread were not -> ->> consuming any cpu (Should be in idle state), -> ->> All of VCPUs' stacks in host is like bellow: -> ->> -> ->> [<ffffffffa07089b5>] kvm_vcpu_block+0x65/0xa0 [kvm] -> ->> [<ffffffffa071c7c1>] __vcpu_run+0xd1/0x260 [kvm] -> ->> [<ffffffffa071d508>] kvm_arch_vcpu_ioctl_run+0x68/0x1a0 [kvm] -> ->> [<ffffffffa0709cee>] kvm_vcpu_ioctl+0x38e/0x580 [kvm] -> ->> [<ffffffff8116be8b>] do_vfs_ioctl+0x8b/0x3b0 -> ->> [<ffffffff8116c251>] sys_ioctl+0xa1/0xb0 -> ->> [<ffffffff81468092>] system_call_fastpath+0x16/0x1b -> ->> [<00002ab9fe1f99a7>] 0x2ab9fe1f99a7 -> ->> [<ffffffffffffffff>] 0xffffffffffffffff -> ->> -> ->> We looked into the kernel codes that could leading to the above 'Stuck' -> ->> warning, -in current upstream there isn't any printk(...Stuck...) left since that code -path -has been reworked. -I've often seen this on over-committed host during guest CPUs up/down torture -test. -Could you update guest kernel to upstream and see if issue reproduces? - -> ->> and found that the only possible is the emulation of 'cpuid' instruct in -> ->> kvm/qemu has something wrong. -> ->> But since we canât reproduce this problem, we are not quite sure. -> ->> Is there any possible that the cupid emulation in kvm/qemu has some bug ? -> -> -> -> Can you explain the relationship to the cpuid emulation? What do the -> -> traces say about vcpus 1 and 7? -> -> -OK, we searched the VM's kernel codes with the 'Stuck' message, and it is -> -located in -> -do_boot_cpu(). 
It's in BSP context, the call process is: -> -BSP executes start_kernel() -> smp_init() -> smp_boot_cpus() -> do_boot_cpu() -> --> wakeup_secondary_via_INIT() to trigger APs. -> -It will wait 5s for APs to startup, if some AP not startup normally, it will -> -print 'CPU%d Stuck' or 'CPU%d: Not responding'. -> -> -If it prints 'Stuck', it means the AP has received the SIPI interrupt and -> -begins to execute the code -> -'ENTRY(trampoline_data)' (trampoline_64.S) , but be stuck in some places -> -before smp_callin()(smpboot.c). -> -The follow is the starup process of BSP and AP. -> -BSP: -> -start_kernel() -> -->smp_init() -> -->smp_boot_cpus() -> -->do_boot_cpu() -> -->start_ip = trampoline_address(); //set the address that AP will -> -go to execute -> -->wakeup_secondary_cpu_via_init(); // kick the secondary CPU -> -->for (timeout = 0; timeout < 50000; timeout++) -> -if (cpumask_test_cpu(cpu, cpu_callin_mask)) break;// check if -> -AP startup or not -> -> -APs: -> -ENTRY(trampoline_data) (trampoline_64.S) -> -->ENTRY(secondary_startup_64) (head_64.S) -> -->start_secondary() (smpboot.c) -> -->cpu_init(); -> -->smp_callin(); -> -->cpumask_set_cpu(cpuid, cpu_callin_mask); ->Note: if AP -> -comes here, the BSP will not prints the error message. -> -> -From above call process, we can be sure that, the AP has been stuck between -> -trampoline_data and the cpumask_set_cpu() in -> -smp_callin(), we look through these codes path carefully, and only found a -> -'hlt' instruct that could block the process. -> -It is located in trampoline_data(): -> -> -ENTRY(trampoline_data) -> -... -> -> -call verify_cpu # Verify the cpu supports long mode -> -testl %eax, %eax # Check for return code -> -jnz no_longmode -> -> -... -> -> -no_longmode: -> -hlt -> -jmp no_longmode -> -> -For the process verify_cpu(), -> -we can only find the 'cpuid' sensitive instruct that could lead VM exit from -> -No-root mode. -> -This is why we doubt if cpuid emulation is wrong in KVM/QEMU that leading to -> -the fail in verify_cpu. -> -> -From the message in VM, we know vcpu1 and vcpu7 is something wrong. -> -[ 5.060042] CPU1: Stuck ?? -> -[ 10.170815] CPU7: Stuck ?? -> -[ 10.171648] Brought up 6 CPUs -> -> -Besides, the follow is the cpus message got from host. -> -80FF72F5-FF6D-E411-A8C8-000000821800:/home/fsp/hrg # virsh -> -qemu-monitor-command instance-0000000 -> -* CPU #0: pc=0x00007f64160c683d thread_id=68570 -> -CPU #1: pc=0xffffffff810301f1 (halted) thread_id=68573 -> -CPU #2: pc=0xffffffff810301e2 (halted) thread_id=68575 -> -CPU #3: pc=0xffffffff810301e2 (halted) thread_id=68576 -> -CPU #4: pc=0xffffffff810301e2 (halted) thread_id=68577 -> -CPU #5: pc=0xffffffff810301e2 (halted) thread_id=68578 -> -CPU #6: pc=0xffffffff810301e2 (halted) thread_id=68583 -> -CPU #7: pc=0xffffffff810301f1 (halted) thread_id=68584 -> -> -Oh, i also forgot to mention in the above message that, we have bond each -> -vCPU to different physical CPU in -> -host. 
-> -> -Thanks, -> -zhanghailiang -> -> -> -> -> --- -> -To unsubscribe from this list: send the line "unsubscribe kvm" in -> -the body of a message to address@hidden -> -More majordomo info at -http://vger.kernel.org/majordomo-info.html - -On 2015/7/7 19:23, Igor Mammedov wrote: -On Mon, 6 Jul 2015 17:59:10 +0800 -zhanghailiang <address@hidden> wrote: -On 2015/7/6 16:45, Paolo Bonzini wrote: -On 06/07/2015 09:54, zhanghailiang wrote: -From host, we found that QEMU vcpu1 thread and vcpu7 thread were not -consuming any cpu (Should be in idle state), -All of VCPUs' stacks in host is like bellow: - -[<ffffffffa07089b5>] kvm_vcpu_block+0x65/0xa0 [kvm] -[<ffffffffa071c7c1>] __vcpu_run+0xd1/0x260 [kvm] -[<ffffffffa071d508>] kvm_arch_vcpu_ioctl_run+0x68/0x1a0 [kvm] -[<ffffffffa0709cee>] kvm_vcpu_ioctl+0x38e/0x580 [kvm] -[<ffffffff8116be8b>] do_vfs_ioctl+0x8b/0x3b0 -[<ffffffff8116c251>] sys_ioctl+0xa1/0xb0 -[<ffffffff81468092>] system_call_fastpath+0x16/0x1b -[<00002ab9fe1f99a7>] 0x2ab9fe1f99a7 -[<ffffffffffffffff>] 0xffffffffffffffff - -We looked into the kernel codes that could leading to the above 'Stuck' -warning, -in current upstream there isn't any printk(...Stuck...) left since that code -path -has been reworked. -I've often seen this on over-committed host during guest CPUs up/down torture -test. -Could you update guest kernel to upstream and see if issue reproduces? -Hmm, Unfortunately, it is very hard to reproduce, and we are still trying to -reproduce it. - -For your test case, is it a kernel bug? -Or is there any related patch could solve your test problem been merged into -upstream ? - -Thanks, -zhanghailiang -and found that the only possible is the emulation of 'cpuid' instruct in -kvm/qemu has something wrong. -But since we canât reproduce this problem, we are not quite sure. -Is there any possible that the cupid emulation in kvm/qemu has some bug ? -Can you explain the relationship to the cpuid emulation? What do the -traces say about vcpus 1 and 7? -OK, we searched the VM's kernel codes with the 'Stuck' message, and it is -located in -do_boot_cpu(). It's in BSP context, the call process is: -BSP executes start_kernel() -> smp_init() -> smp_boot_cpus() -> do_boot_cpu() --> wakeup_secondary_via_INIT() to trigger APs. -It will wait 5s for APs to startup, if some AP not startup normally, it will -print 'CPU%d Stuck' or 'CPU%d: Not responding'. - -If it prints 'Stuck', it means the AP has received the SIPI interrupt and -begins to execute the code -'ENTRY(trampoline_data)' (trampoline_64.S) , but be stuck in some places before -smp_callin()(smpboot.c). -The follow is the starup process of BSP and AP. -BSP: -start_kernel() - ->smp_init() - ->smp_boot_cpus() - ->do_boot_cpu() - ->start_ip = trampoline_address(); //set the address that AP will -go to execute - ->wakeup_secondary_cpu_via_init(); // kick the secondary CPU - ->for (timeout = 0; timeout < 50000; timeout++) - if (cpumask_test_cpu(cpu, cpu_callin_mask)) break;// check if -AP startup or not - -APs: -ENTRY(trampoline_data) (trampoline_64.S) - ->ENTRY(secondary_startup_64) (head_64.S) - ->start_secondary() (smpboot.c) - ->cpu_init(); - ->smp_callin(); - ->cpumask_set_cpu(cpuid, cpu_callin_mask); ->Note: if AP -comes here, the BSP will not prints the error message. - - From above call process, we can be sure that, the AP has been stuck between -trampoline_data and the cpumask_set_cpu() in -smp_callin(), we look through these codes path carefully, and only found a -'hlt' instruct that could block the process. 
-It is located in trampoline_data(): - -ENTRY(trampoline_data) - ... - - call verify_cpu # Verify the cpu supports long mode - testl %eax, %eax # Check for return code - jnz no_longmode - - ... - -no_longmode: - hlt - jmp no_longmode - -For the process verify_cpu(), -we can only find the 'cpuid' sensitive instruct that could lead VM exit from -No-root mode. -This is why we doubt if cpuid emulation is wrong in KVM/QEMU that leading to -the fail in verify_cpu. - - From the message in VM, we know vcpu1 and vcpu7 is something wrong. -[ 5.060042] CPU1: Stuck ?? -[ 10.170815] CPU7: Stuck ?? -[ 10.171648] Brought up 6 CPUs - -Besides, the follow is the cpus message got from host. -80FF72F5-FF6D-E411-A8C8-000000821800:/home/fsp/hrg # virsh qemu-monitor-command -instance-0000000 -* CPU #0: pc=0x00007f64160c683d thread_id=68570 - CPU #1: pc=0xffffffff810301f1 (halted) thread_id=68573 - CPU #2: pc=0xffffffff810301e2 (halted) thread_id=68575 - CPU #3: pc=0xffffffff810301e2 (halted) thread_id=68576 - CPU #4: pc=0xffffffff810301e2 (halted) thread_id=68577 - CPU #5: pc=0xffffffff810301e2 (halted) thread_id=68578 - CPU #6: pc=0xffffffff810301e2 (halted) thread_id=68583 - CPU #7: pc=0xffffffff810301f1 (halted) thread_id=68584 - -Oh, i also forgot to mention in the above message that, we have bond each vCPU -to different physical CPU in -host. - -Thanks, -zhanghailiang - - - - --- -To unsubscribe from this list: send the line "unsubscribe kvm" in -the body of a message to address@hidden -More majordomo info at -http://vger.kernel.org/majordomo-info.html -. - -On Tue, 7 Jul 2015 19:43:35 +0800 -zhanghailiang <address@hidden> wrote: - -> -On 2015/7/7 19:23, Igor Mammedov wrote: -> -> On Mon, 6 Jul 2015 17:59:10 +0800 -> -> zhanghailiang <address@hidden> wrote: -> -> -> ->> On 2015/7/6 16:45, Paolo Bonzini wrote: -> ->>> -> ->>> -> ->>> On 06/07/2015 09:54, zhanghailiang wrote: -> ->>>> -> ->>>> From host, we found that QEMU vcpu1 thread and vcpu7 thread were not -> ->>>> consuming any cpu (Should be in idle state), -> ->>>> All of VCPUs' stacks in host is like bellow: -> ->>>> -> ->>>> [<ffffffffa07089b5>] kvm_vcpu_block+0x65/0xa0 [kvm] -> ->>>> [<ffffffffa071c7c1>] __vcpu_run+0xd1/0x260 [kvm] -> ->>>> [<ffffffffa071d508>] kvm_arch_vcpu_ioctl_run+0x68/0x1a0 [kvm] -> ->>>> [<ffffffffa0709cee>] kvm_vcpu_ioctl+0x38e/0x580 [kvm] -> ->>>> [<ffffffff8116be8b>] do_vfs_ioctl+0x8b/0x3b0 -> ->>>> [<ffffffff8116c251>] sys_ioctl+0xa1/0xb0 -> ->>>> [<ffffffff81468092>] system_call_fastpath+0x16/0x1b -> ->>>> [<00002ab9fe1f99a7>] 0x2ab9fe1f99a7 -> ->>>> [<ffffffffffffffff>] 0xffffffffffffffff -> ->>>> -> ->>>> We looked into the kernel codes that could leading to the above 'Stuck' -> ->>>> warning, -> -> in current upstream there isn't any printk(...Stuck...) left since that -> -> code path -> -> has been reworked. -> -> I've often seen this on over-committed host during guest CPUs up/down -> -> torture test. -> -> Could you update guest kernel to upstream and see if issue reproduces? -> -> -> -> -Hmm, Unfortunately, it is very hard to reproduce, and we are still trying to -> -reproduce it. -> -> -For your test case, is it a kernel bug? -> -Or is there any related patch could solve your test problem been merged into -> -upstream ? -I don't remember all prerequisite patches but you should be able to find -http://marc.info/?l=linux-kernel&m=140326703108009&w=2 -"x86/smpboot: Initialize secondary CPU only if master CPU will wait for it" -and then look for dependencies. 
- - -> -> -Thanks, -> -zhanghailiang -> -> ->>>> and found that the only possible is the emulation of 'cpuid' instruct in -> ->>>> kvm/qemu has something wrong. -> ->>>> But since we canât reproduce this problem, we are not quite sure. -> ->>>> Is there any possible that the cupid emulation in kvm/qemu has some bug ? -> ->>> -> ->>> Can you explain the relationship to the cpuid emulation? What do the -> ->>> traces say about vcpus 1 and 7? -> ->> -> ->> OK, we searched the VM's kernel codes with the 'Stuck' message, and it is -> ->> located in -> ->> do_boot_cpu(). It's in BSP context, the call process is: -> ->> BSP executes start_kernel() -> smp_init() -> smp_boot_cpus() -> -> ->> do_boot_cpu() -> wakeup_secondary_via_INIT() to trigger APs. -> ->> It will wait 5s for APs to startup, if some AP not startup normally, it -> ->> will print 'CPU%d Stuck' or 'CPU%d: Not responding'. -> ->> -> ->> If it prints 'Stuck', it means the AP has received the SIPI interrupt and -> ->> begins to execute the code -> ->> 'ENTRY(trampoline_data)' (trampoline_64.S) , but be stuck in some places -> ->> before smp_callin()(smpboot.c). -> ->> The follow is the starup process of BSP and AP. -> ->> BSP: -> ->> start_kernel() -> ->> ->smp_init() -> ->> ->smp_boot_cpus() -> ->> ->do_boot_cpu() -> ->> ->start_ip = trampoline_address(); //set the address that AP -> ->> will go to execute -> ->> ->wakeup_secondary_cpu_via_init(); // kick the secondary CPU -> ->> ->for (timeout = 0; timeout < 50000; timeout++) -> ->> if (cpumask_test_cpu(cpu, cpu_callin_mask)) break;// -> ->> check if AP startup or not -> ->> -> ->> APs: -> ->> ENTRY(trampoline_data) (trampoline_64.S) -> ->> ->ENTRY(secondary_startup_64) (head_64.S) -> ->> ->start_secondary() (smpboot.c) -> ->> ->cpu_init(); -> ->> ->smp_callin(); -> ->> ->cpumask_set_cpu(cpuid, cpu_callin_mask); ->Note: if AP -> ->> comes here, the BSP will not prints the error message. -> ->> -> ->> From above call process, we can be sure that, the AP has been stuck -> ->> between trampoline_data and the cpumask_set_cpu() in -> ->> smp_callin(), we look through these codes path carefully, and only found a -> ->> 'hlt' instruct that could block the process. -> ->> It is located in trampoline_data(): -> ->> -> ->> ENTRY(trampoline_data) -> ->> ... -> ->> -> ->> call verify_cpu # Verify the cpu supports long mode -> ->> testl %eax, %eax # Check for return code -> ->> jnz no_longmode -> ->> -> ->> ... -> ->> -> ->> no_longmode: -> ->> hlt -> ->> jmp no_longmode -> ->> -> ->> For the process verify_cpu(), -> ->> we can only find the 'cpuid' sensitive instruct that could lead VM exit -> ->> from No-root mode. -> ->> This is why we doubt if cpuid emulation is wrong in KVM/QEMU that leading -> ->> to the fail in verify_cpu. -> ->> -> ->> From the message in VM, we know vcpu1 and vcpu7 is something wrong. -> ->> [ 5.060042] CPU1: Stuck ?? -> ->> [ 10.170815] CPU7: Stuck ?? -> ->> [ 10.171648] Brought up 6 CPUs -> ->> -> ->> Besides, the follow is the cpus message got from host. 
-> ->> 80FF72F5-FF6D-E411-A8C8-000000821800:/home/fsp/hrg # virsh -> ->> qemu-monitor-command instance-0000000 -> ->> * CPU #0: pc=0x00007f64160c683d thread_id=68570 -> ->> CPU #1: pc=0xffffffff810301f1 (halted) thread_id=68573 -> ->> CPU #2: pc=0xffffffff810301e2 (halted) thread_id=68575 -> ->> CPU #3: pc=0xffffffff810301e2 (halted) thread_id=68576 -> ->> CPU #4: pc=0xffffffff810301e2 (halted) thread_id=68577 -> ->> CPU #5: pc=0xffffffff810301e2 (halted) thread_id=68578 -> ->> CPU #6: pc=0xffffffff810301e2 (halted) thread_id=68583 -> ->> CPU #7: pc=0xffffffff810301f1 (halted) thread_id=68584 -> ->> -> ->> Oh, i also forgot to mention in the above message that, we have bond each -> ->> vCPU to different physical CPU in -> ->> host. -> ->> -> ->> Thanks, -> ->> zhanghailiang -> ->> -> ->> -> ->> -> ->> -> ->> -- -> ->> To unsubscribe from this list: send the line "unsubscribe kvm" in -> ->> the body of a message to address@hidden -> ->> More majordomo info at -http://vger.kernel.org/majordomo-info.html -> -> -> -> -> -> . -> -> -> -> -> - -On 2015/7/7 20:21, Igor Mammedov wrote: -On Tue, 7 Jul 2015 19:43:35 +0800 -zhanghailiang <address@hidden> wrote: -On 2015/7/7 19:23, Igor Mammedov wrote: -On Mon, 6 Jul 2015 17:59:10 +0800 -zhanghailiang <address@hidden> wrote: -On 2015/7/6 16:45, Paolo Bonzini wrote: -On 06/07/2015 09:54, zhanghailiang wrote: -From host, we found that QEMU vcpu1 thread and vcpu7 thread were not -consuming any cpu (Should be in idle state), -All of VCPUs' stacks in host is like bellow: - -[<ffffffffa07089b5>] kvm_vcpu_block+0x65/0xa0 [kvm] -[<ffffffffa071c7c1>] __vcpu_run+0xd1/0x260 [kvm] -[<ffffffffa071d508>] kvm_arch_vcpu_ioctl_run+0x68/0x1a0 [kvm] -[<ffffffffa0709cee>] kvm_vcpu_ioctl+0x38e/0x580 [kvm] -[<ffffffff8116be8b>] do_vfs_ioctl+0x8b/0x3b0 -[<ffffffff8116c251>] sys_ioctl+0xa1/0xb0 -[<ffffffff81468092>] system_call_fastpath+0x16/0x1b -[<00002ab9fe1f99a7>] 0x2ab9fe1f99a7 -[<ffffffffffffffff>] 0xffffffffffffffff - -We looked into the kernel codes that could leading to the above 'Stuck' -warning, -in current upstream there isn't any printk(...Stuck...) left since that code -path -has been reworked. -I've often seen this on over-committed host during guest CPUs up/down torture -test. -Could you update guest kernel to upstream and see if issue reproduces? -Hmm, Unfortunately, it is very hard to reproduce, and we are still trying to -reproduce it. - -For your test case, is it a kernel bug? -Or is there any related patch could solve your test problem been merged into -upstream ? -I don't remember all prerequisite patches but you should be able to find -http://marc.info/?l=linux-kernel&m=140326703108009&w=2 -"x86/smpboot: Initialize secondary CPU only if master CPU will wait for it" -and then look for dependencies. -Er, we have investigated this patch, and it is not related to our problem, :) - -Thanks. -Thanks, -zhanghailiang -and found that the only possible is the emulation of 'cpuid' instruct in -kvm/qemu has something wrong. -But since we canât reproduce this problem, we are not quite sure. -Is there any possible that the cupid emulation in kvm/qemu has some bug ? -Can you explain the relationship to the cpuid emulation? What do the -traces say about vcpus 1 and 7? -OK, we searched the VM's kernel codes with the 'Stuck' message, and it is -located in -do_boot_cpu(). It's in BSP context, the call process is: -BSP executes start_kernel() -> smp_init() -> smp_boot_cpus() -> do_boot_cpu() --> wakeup_secondary_via_INIT() to trigger APs. 
-It will wait 5s for APs to startup, if some AP not startup normally, it will -print 'CPU%d Stuck' or 'CPU%d: Not responding'. - -If it prints 'Stuck', it means the AP has received the SIPI interrupt and -begins to execute the code -'ENTRY(trampoline_data)' (trampoline_64.S) , but be stuck in some places before -smp_callin()(smpboot.c). -The follow is the starup process of BSP and AP. -BSP: -start_kernel() - ->smp_init() - ->smp_boot_cpus() - ->do_boot_cpu() - ->start_ip = trampoline_address(); //set the address that AP will -go to execute - ->wakeup_secondary_cpu_via_init(); // kick the secondary CPU - ->for (timeout = 0; timeout < 50000; timeout++) - if (cpumask_test_cpu(cpu, cpu_callin_mask)) break;// check if -AP startup or not - -APs: -ENTRY(trampoline_data) (trampoline_64.S) - ->ENTRY(secondary_startup_64) (head_64.S) - ->start_secondary() (smpboot.c) - ->cpu_init(); - ->smp_callin(); - ->cpumask_set_cpu(cpuid, cpu_callin_mask); ->Note: if AP -comes here, the BSP will not prints the error message. - - From above call process, we can be sure that, the AP has been stuck between -trampoline_data and the cpumask_set_cpu() in -smp_callin(), we look through these codes path carefully, and only found a -'hlt' instruct that could block the process. -It is located in trampoline_data(): - -ENTRY(trampoline_data) - ... - - call verify_cpu # Verify the cpu supports long mode - testl %eax, %eax # Check for return code - jnz no_longmode - - ... - -no_longmode: - hlt - jmp no_longmode - -For the process verify_cpu(), -we can only find the 'cpuid' sensitive instruct that could lead VM exit from -No-root mode. -This is why we doubt if cpuid emulation is wrong in KVM/QEMU that leading to -the fail in verify_cpu. - - From the message in VM, we know vcpu1 and vcpu7 is something wrong. -[ 5.060042] CPU1: Stuck ?? -[ 10.170815] CPU7: Stuck ?? -[ 10.171648] Brought up 6 CPUs - -Besides, the follow is the cpus message got from host. -80FF72F5-FF6D-E411-A8C8-000000821800:/home/fsp/hrg # virsh qemu-monitor-command -instance-0000000 -* CPU #0: pc=0x00007f64160c683d thread_id=68570 - CPU #1: pc=0xffffffff810301f1 (halted) thread_id=68573 - CPU #2: pc=0xffffffff810301e2 (halted) thread_id=68575 - CPU #3: pc=0xffffffff810301e2 (halted) thread_id=68576 - CPU #4: pc=0xffffffff810301e2 (halted) thread_id=68577 - CPU #5: pc=0xffffffff810301e2 (halted) thread_id=68578 - CPU #6: pc=0xffffffff810301e2 (halted) thread_id=68583 - CPU #7: pc=0xffffffff810301f1 (halted) thread_id=68584 - -Oh, i also forgot to mention in the above message that, we have bond each vCPU -to different physical CPU in -host. - -Thanks, -zhanghailiang - - - - --- -To unsubscribe from this list: send the line "unsubscribe kvm" in -the body of a message to address@hidden -More majordomo info at -http://vger.kernel.org/majordomo-info.html -. -. 
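(For context on the verify_cpu suspicion discussed above: the bit it tests is
the long-mode capability reported by CPUID leaf 0x80000001, EDX bit 29.  The
snippet below is only an editorial illustration, not a script from this
thread; it reads the derived "lm" flag from /proc/cpuinfo inside an
already-booted Linux guest, so it is merely a rough sanity check of what the
KVM/QEMU cpuid emulation exposes to the running vCPUs, not of what the stuck
AP saw in the trampoline.)

# check_lm.py -- illustrative only, hypothetical helper, not from this thread.
# The "lm" flag in /proc/cpuinfo is derived from CPUID.80000001h:EDX bit 29,
# the same capability that verify_cpu checks before the AP enters 64-bit mode.
with open("/proc/cpuinfo") as f:
    blocks = f.read().strip().split("\n\n")

for block in blocks:
    fields = {}
    for line in block.splitlines():
        if ":" in line:
            key, _, value = line.partition(":")
            fields[key.strip()] = value.strip()
    cpu = fields.get("processor", "?")
    flags = fields.get("flags", "").split()
    print("cpu %s: lm=%s" % (cpu, "yes" if "lm" in flags else "NO"))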
- diff --git a/results/classifier/013/risc-v/55367348 b/results/classifier/013/risc-v/55367348 deleted file mode 100644 index 8b95754d8..000000000 --- a/results/classifier/013/risc-v/55367348 +++ /dev/null @@ -1,560 +0,0 @@ -risc-v: 0.675 -user-level: 0.674 -mistranslation: 0.615 -permissions: 0.595 -device: 0.586 -arm: 0.573 -PID: 0.559 -semantic: 0.555 -register: 0.553 -performance: 0.546 -operating system: 0.546 -graphic: 0.532 -assembly: 0.531 -architecture: 0.530 -ppc: 0.523 -network: 0.518 -TCG: 0.517 -debug: 0.516 -system: 0.516 -virtual: 0.512 -hypervisor: 0.502 -socket: 0.501 -files: 0.490 -boot: 0.486 -VMM: 0.473 -KVM: 0.470 -peripherals: 0.466 -vnc: 0.465 -kernel: 0.441 -x86: 0.402 -alpha: 0.375 -i386: 0.370 - -[Qemu-devel] [Bug] Docs build fails at interop.rst - -https://paste.fedoraproject.org/paste/kOPx4jhtUli---TmxSLrlw -running python3-sphinx-2.0.1-1.fc31.noarch on Fedora release 31 -(Rawhide) - -uname - a -Linux iouring 5.1.0-0.rc6.git3.1.fc31.x86_64 #1 SMP Thu Apr 25 14:25:32 -UTC 2019 x86_64 x86_64 x86_64 GNU/Linux - -Reverting commmit 90edef80a0852cf8a3d2668898ee40e8970e431 -allows for the build to occur - -Regards -Aarushi Mehta - -On 5/20/19 7:30 AM, Aarushi Mehta wrote: -> -https://paste.fedoraproject.org/paste/kOPx4jhtUli---TmxSLrlw -> -running python3-sphinx-2.0.1-1.fc31.noarch on Fedora release 31 -> -(Rawhide) -> -> -uname - a -> -Linux iouring 5.1.0-0.rc6.git3.1.fc31.x86_64 #1 SMP Thu Apr 25 14:25:32 -> -UTC 2019 x86_64 x86_64 x86_64 GNU/Linux -> -> -Reverting commmit 90edef80a0852cf8a3d2668898ee40e8970e431 -> -allows for the build to occur -> -> -Regards -> -Aarushi Mehta -> -> -Ah, dang. The blocks aren't strictly conforming json, but the version I -tested this under didn't seem to care. Your version is much newer. (I -was using 1.7 as provided by Fedora 29.) - -For now, try reverting 9e5b6cb87db66dfb606604fe6cf40e5ddf1ef0e7 instead, -which should at least turn off the "warnings as errors" option, but I -don't think that reverting -n will turn off this warning. - -I'll try to get ahold of this newer version and see if I can't fix it -more appropriately. - ---js - -On 5/20/19 12:37 PM, John Snow wrote: -> -> -> -On 5/20/19 7:30 AM, Aarushi Mehta wrote: -> -> -https://paste.fedoraproject.org/paste/kOPx4jhtUli---TmxSLrlw -> -> running python3-sphinx-2.0.1-1.fc31.noarch on Fedora release 31 -> -> (Rawhide) -> -> -> -> uname - a -> -> Linux iouring 5.1.0-0.rc6.git3.1.fc31.x86_64 #1 SMP Thu Apr 25 14:25:32 -> -> UTC 2019 x86_64 x86_64 x86_64 GNU/Linux -> -> -> -> Reverting commmit 90edef80a0852cf8a3d2668898ee40e8970e431 -> -> allows for the build to occur -> -> -> -> Regards -> -> Aarushi Mehta -> -> -> -> -> -> -Ah, dang. The blocks aren't strictly conforming json, but the version I -> -tested this under didn't seem to care. Your version is much newer. (I -> -was using 1.7 as provided by Fedora 29.) -> -> -For now, try reverting 9e5b6cb87db66dfb606604fe6cf40e5ddf1ef0e7 instead, -> -which should at least turn off the "warnings as errors" option, but I -> -don't think that reverting -n will turn off this warning. -> -> -I'll try to get ahold of this newer version and see if I can't fix it -> -more appropriately. -> -> ---js -> -...Sigh, okay. 
- -So, I am still not actually sure what changed from pygments 2.2 and -sphinx 1.7 to pygments 2.4 and sphinx 2.0.1, but it appears as if Sphinx -by default always tries to do add a filter to the pygments lexer that -raises an error on highlighting failure, instead of the default behavior -which is to just highlight those errors in the output. There is no -option to Sphinx that I am aware of to retain this lexing behavior. -(Effectively, it's strict or nothing.) - -This approach, apparently, is broken in Sphinx 1.7/Pygments 2.2, so the -build works with our malformed json. - -There are a few options: - -1. Update conf.py to ignore these warnings (and all future lexing -errors), and settle for the fact that there will be no QMP highlighting -wherever we use the directionality indicators ('->', '<-'). - -2. Update bitmaps.rst to remove the directionality indicators. - -3. Update bitmaps.rst to format the QMP blocks as raw text instead of JSON. - -4. Update bitmaps.rst to remove the "json" specification from the code -block. This will cause sphinx to "guess" the formatting, and the -pygments guesser will decide it's Python3. - -This will parse well enough, but will mis-highlight 'true' and 'false' -which are not python keywords. This approach may break in the future if -the Python3 lexer is upgraded to be stricter (because '->' and '<-' are -still invalid), and leaves us at the mercy of both the guesser and the -lexer. - -I'm not actually sure what I dislike the least; I think I dislike #1 the -most. #4 gets us most of what we want but is perhaps porcelain. - -I suspect if we attempt to move more of our documentation to ReST and -Sphinx that we will need to answer for ourselves how we intend to -document QMP code flow examples. - ---js - -On Mon, May 20, 2019 at 05:25:28PM -0400, John Snow wrote: -> -> -> -On 5/20/19 12:37 PM, John Snow wrote: -> -> -> -> -> -> On 5/20/19 7:30 AM, Aarushi Mehta wrote: -> ->> -https://paste.fedoraproject.org/paste/kOPx4jhtUli---TmxSLrlw -> ->> running python3-sphinx-2.0.1-1.fc31.noarch on Fedora release 31 -> ->> (Rawhide) -> ->> -> ->> uname - a -> ->> Linux iouring 5.1.0-0.rc6.git3.1.fc31.x86_64 #1 SMP Thu Apr 25 14:25:32 -> ->> UTC 2019 x86_64 x86_64 x86_64 GNU/Linux -> ->> -> ->> Reverting commmit 90edef80a0852cf8a3d2668898ee40e8970e431 -> ->> allows for the build to occur -> ->> -> ->> Regards -> ->> Aarushi Mehta -> ->> -> ->> -> -> -> -> Ah, dang. The blocks aren't strictly conforming json, but the version I -> -> tested this under didn't seem to care. Your version is much newer. (I -> -> was using 1.7 as provided by Fedora 29.) -> -> -> -> For now, try reverting 9e5b6cb87db66dfb606604fe6cf40e5ddf1ef0e7 instead, -> -> which should at least turn off the "warnings as errors" option, but I -> -> don't think that reverting -n will turn off this warning. -> -> -> -> I'll try to get ahold of this newer version and see if I can't fix it -> -> more appropriately. -> -> -> -> --js -> -> -> -> -...Sigh, okay. -> -> -So, I am still not actually sure what changed from pygments 2.2 and -> -sphinx 1.7 to pygments 2.4 and sphinx 2.0.1, but it appears as if Sphinx -> -by default always tries to do add a filter to the pygments lexer that -> -raises an error on highlighting failure, instead of the default behavior -> -which is to just highlight those errors in the output. There is no -> -option to Sphinx that I am aware of to retain this lexing behavior. -> -(Effectively, it's strict or nothing.) 
-> -> -This approach, apparently, is broken in Sphinx 1.7/Pygments 2.2, so the -> -build works with our malformed json. -> -> -There are a few options: -> -> -1. Update conf.py to ignore these warnings (and all future lexing -> -errors), and settle for the fact that there will be no QMP highlighting -> -wherever we use the directionality indicators ('->', '<-'). -> -> -2. Update bitmaps.rst to remove the directionality indicators. -> -> -3. Update bitmaps.rst to format the QMP blocks as raw text instead of JSON. -> -> -4. Update bitmaps.rst to remove the "json" specification from the code -> -block. This will cause sphinx to "guess" the formatting, and the -> -pygments guesser will decide it's Python3. -> -> -This will parse well enough, but will mis-highlight 'true' and 'false' -> -which are not python keywords. This approach may break in the future if -> -the Python3 lexer is upgraded to be stricter (because '->' and '<-' are -> -still invalid), and leaves us at the mercy of both the guesser and the -> -lexer. -> -> -I'm not actually sure what I dislike the least; I think I dislike #1 the -> -most. #4 gets us most of what we want but is perhaps porcelain. -> -> -I suspect if we attempt to move more of our documentation to ReST and -> -Sphinx that we will need to answer for ourselves how we intend to -> -document QMP code flow examples. -Writing a custom lexer that handles "<-" and "->" was simple (see below). - -Now, is it possible to convince Sphinx to register and use a custom lexer? - -$ cat > /tmp/lexer.py <<EOF -from pygments.lexer import RegexLexer, DelegatingLexer -from pygments.lexers.data import JsonLexer -import re -from pygments.token import * - -class QMPExampleMarkersLexer(RegexLexer): - tokens = { - 'root': [ - (r' *-> *', Generic.Prompt), - (r' *<- *', Generic.Output), - ] - } - -class QMPExampleLexer(DelegatingLexer): - def __init__(self, **options): - super(QMPExampleLexer, self).__init__(JsonLexer, -QMPExampleMarkersLexer, Error, **options) -EOF -$ pygmentize -l /tmp/lexer.py:QMPExampleLexer -x -f html <<EOF - -> { - "execute": "drive-backup", - "arguments": { - "device": "drive0", - "bitmap": "bitmap0", - "target": "drive0.inc0.qcow2", - "format": "qcow2", - "sync": "incremental", - "mode": "existing" - } - } - - <- { "return": {} } -EOF -<div class="highlight"><pre><span></span><span class="gp"> -> -</span><span class="p">{</span> - <span class="nt">"execute"</span><span class="p">:</span> -<span class="s2">"drive-backup"</span><span class="p">,</span> - <span class="nt">"arguments"</span><span class="p">:</span> -<span class="p">{</span> - <span class="nt">"device"</span><span class="p">:</span> -<span class="s2">"drive0"</span><span class="p">,</span> - <span class="nt">"bitmap"</span><span class="p">:</span> -<span class="s2">"bitmap0"</span><span class="p">,</span> - <span class="nt">"target"</span><span class="p">:</span> -<span class="s2">"drive0.inc0.qcow2"</span><span class="p">,</span> - <span class="nt">"format"</span><span class="p">:</span> -<span class="s2">"qcow2"</span><span class="p">,</span> - <span class="nt">"sync"</span><span class="p">:</span> -<span class="s2">"incremental"</span><span class="p">,</span> - <span class="nt">"mode"</span><span class="p">:</span> -<span class="s2">"existing"</span> - <span class="p">}</span> - <span class="p">}</span> - -<span class="go"> <- </span><span class="p">{</span> <span -class="nt">"return"</span><span class="p">:</span> <span -class="p">{}</span> <span class="p">}</span> -</pre></div> -$ - - --- -Eduardo - 
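(One plausible way to wire such a lexer into the build, sketched here only for
illustration and not as the patch that was actually posted: register it from
conf.py via Sphinx's add_lexer().  The module name "qmp_lexer" and the "QMP"
alias below are made-up placeholders, and whether add_lexer() wants a lexer
instance or a class depends on the Sphinx version.)

# conf.py (sketch): make the custom lexer available under a "QMP" alias.
import os
import sys
sys.path.insert(0, os.path.abspath('.'))       # assumes qmp_lexer.py sits next to conf.py
from qmp_lexer import QMPExampleLexer          # hypothetical module holding the lexer above

def setup(app):
    # Older Sphinx releases take a lexer instance here; newer ones accept the class.
    app.add_lexer('QMP', QMPExampleLexer())

The QMP examples in the rst files would then be marked up with
".. code-block:: QMP" instead of "json".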
-On 5/20/19 7:04 PM, Eduardo Habkost wrote: -> -On Mon, May 20, 2019 at 05:25:28PM -0400, John Snow wrote: -> -> -> -> -> -> On 5/20/19 12:37 PM, John Snow wrote: -> ->> -> ->> -> ->> On 5/20/19 7:30 AM, Aarushi Mehta wrote: -> ->>> -https://paste.fedoraproject.org/paste/kOPx4jhtUli---TmxSLrlw -> ->>> running python3-sphinx-2.0.1-1.fc31.noarch on Fedora release 31 -> ->>> (Rawhide) -> ->>> -> ->>> uname - a -> ->>> Linux iouring 5.1.0-0.rc6.git3.1.fc31.x86_64 #1 SMP Thu Apr 25 14:25:32 -> ->>> UTC 2019 x86_64 x86_64 x86_64 GNU/Linux -> ->>> -> ->>> Reverting commmit 90edef80a0852cf8a3d2668898ee40e8970e431 -> ->>> allows for the build to occur -> ->>> -> ->>> Regards -> ->>> Aarushi Mehta -> ->>> -> ->>> -> ->> -> ->> Ah, dang. The blocks aren't strictly conforming json, but the version I -> ->> tested this under didn't seem to care. Your version is much newer. (I -> ->> was using 1.7 as provided by Fedora 29.) -> ->> -> ->> For now, try reverting 9e5b6cb87db66dfb606604fe6cf40e5ddf1ef0e7 instead, -> ->> which should at least turn off the "warnings as errors" option, but I -> ->> don't think that reverting -n will turn off this warning. -> ->> -> ->> I'll try to get ahold of this newer version and see if I can't fix it -> ->> more appropriately. -> ->> -> ->> --js -> ->> -> -> -> -> ...Sigh, okay. -> -> -> -> So, I am still not actually sure what changed from pygments 2.2 and -> -> sphinx 1.7 to pygments 2.4 and sphinx 2.0.1, but it appears as if Sphinx -> -> by default always tries to do add a filter to the pygments lexer that -> -> raises an error on highlighting failure, instead of the default behavior -> -> which is to just highlight those errors in the output. There is no -> -> option to Sphinx that I am aware of to retain this lexing behavior. -> -> (Effectively, it's strict or nothing.) -> -> -> -> This approach, apparently, is broken in Sphinx 1.7/Pygments 2.2, so the -> -> build works with our malformed json. -> -> -> -> There are a few options: -> -> -> -> 1. Update conf.py to ignore these warnings (and all future lexing -> -> errors), and settle for the fact that there will be no QMP highlighting -> -> wherever we use the directionality indicators ('->', '<-'). -> -> -> -> 2. Update bitmaps.rst to remove the directionality indicators. -> -> -> -> 3. Update bitmaps.rst to format the QMP blocks as raw text instead of JSON. -> -> -> -> 4. Update bitmaps.rst to remove the "json" specification from the code -> -> block. This will cause sphinx to "guess" the formatting, and the -> -> pygments guesser will decide it's Python3. -> -> -> -> This will parse well enough, but will mis-highlight 'true' and 'false' -> -> which are not python keywords. This approach may break in the future if -> -> the Python3 lexer is upgraded to be stricter (because '->' and '<-' are -> -> still invalid), and leaves us at the mercy of both the guesser and the -> -> lexer. -> -> -> -> I'm not actually sure what I dislike the least; I think I dislike #1 the -> -> most. #4 gets us most of what we want but is perhaps porcelain. -> -> -> -> I suspect if we attempt to move more of our documentation to ReST and -> -> Sphinx that we will need to answer for ourselves how we intend to -> -> document QMP code flow examples. -> -> -Writing a custom lexer that handles "<-" and "->" was simple (see below). -> -> -Now, is it possible to convince Sphinx to register and use a custom lexer? -> -Spoilers, yes, and I've sent a patch to list. Thanks for your help! 
- diff --git a/results/classifier/013/risc-v/55753058 b/results/classifier/013/risc-v/55753058 deleted file mode 100644 index ba2c39dbf..000000000 --- a/results/classifier/013/risc-v/55753058 +++ /dev/null @@ -1,321 +0,0 @@ -risc-v: 0.789 -TCG: 0.743 -ppc: 0.731 -peripherals: 0.728 -hypervisor: 0.728 -x86: 0.713 -KVM: 0.713 -i386: 0.700 -operating system: 0.696 -vnc: 0.682 -VMM: 0.657 -mistranslation: 0.649 -user-level: 0.648 -graphic: 0.630 -device: 0.623 -register: 0.620 -debug: 0.611 -arm: 0.595 -performance: 0.591 -permissions: 0.580 -semantic: 0.577 -system: 0.565 -virtual: 0.539 -architecture: 0.534 -assembly: 0.529 -alpha: 0.529 -network: 0.525 -PID: 0.512 -kernel: 0.485 -boot: 0.478 -socket: 0.462 -files: 0.459 - -[RESEND][BUG FIX HELP] QEMU main thread endlessly hangs in __ppoll() - -Hi Genius, -I am a user of QEMU v4.2.0 and stuck in an interesting bug, which may still -exist in the mainline. -Thanks in advance to heroes who can take a look and share understanding. - -The qemu main thread endlessly hangs in the handle of the qmp statement: -{'execute': 'human-monitor-command', 'arguments':{ 'command-line': -'drive_del replication0' } } -and we have the call trace looks like: -#0 0x00007f3c22045bf6 in __ppoll (fds=0x555611328410, nfds=1, -timeout=<optimized out>, timeout@entry=0x7ffc56c66db0, -sigmask=sigmask@entry=0x0) at ../sysdeps/unix/sysv/linux/ppoll.c:44 -#1 0x000055561021f415 in ppoll (__ss=0x0, __timeout=0x7ffc56c66db0, -__nfds=<optimized out>, __fds=<optimized out>) -at /usr/include/x86_64-linux-gnu/bits/poll2.h:77 -#2 qemu_poll_ns (fds=<optimized out>, nfds=<optimized out>, -timeout=<optimized out>) at util/qemu-timer.c:348 -#3 0x0000555610221430 in aio_poll (ctx=ctx@entry=0x5556113010f0, -blocking=blocking@entry=true) at util/aio-posix.c:669 -#4 0x000055561019268d in bdrv_do_drained_begin (poll=true, -ignore_bds_parents=false, parent=0x0, recursive=false, -bs=0x55561138b0a0) at block/io.c:430 -#5 bdrv_do_drained_begin (bs=0x55561138b0a0, recursive=<optimized out>, -parent=0x0, ignore_bds_parents=<optimized out>, -poll=<optimized out>) at block/io.c:396 -#6 0x000055561017b60b in quorum_del_child (bs=0x55561138b0a0, -child=0x7f36dc0ce380, errp=<optimized out>) -at block/quorum.c:1063 -#7 0x000055560ff5836b in qmp_x_blockdev_change (parent=0x555612373120 -"colo-disk0", has_child=<optimized out>, -child=0x5556112df3e0 "children.1", has_node=<optimized out>, node=0x0, -errp=0x7ffc56c66f98) at blockdev.c:4494 -#8 0x00005556100f8f57 in qmp_marshal_x_blockdev_change (args=<optimized -out>, ret=<optimized out>, errp=0x7ffc56c67018) -at qapi/qapi-commands-block-core.c:1538 -#9 0x00005556101d8290 in do_qmp_dispatch (errp=0x7ffc56c67010, -allow_oob=<optimized out>, request=<optimized out>, -cmds=0x5556109c69a0 <qmp_commands>) at qapi/qmp-dispatch.c:132 -#10 qmp_dispatch (cmds=0x5556109c69a0 <qmp_commands>, request=<optimized -out>, allow_oob=<optimized out>) -at qapi/qmp-dispatch.c:175 -#11 0x00005556100d4c4d in monitor_qmp_dispatch (mon=0x5556113a6f40, -req=<optimized out>) at monitor/qmp.c:145 -#12 0x00005556100d5437 in monitor_qmp_bh_dispatcher (data=<optimized out>) -at monitor/qmp.c:234 -#13 0x000055561021dbec in aio_bh_call (bh=0x5556112164bGrateful0) at -util/async.c:117 -#14 aio_bh_poll (ctx=ctx@entry=0x5556112151b0) at util/async.c:117 -#15 0x00005556102212c4 in aio_dispatch (ctx=0x5556112151b0) at -util/aio-posix.c:459 -#16 0x000055561021dab2 in aio_ctx_dispatch (source=<optimized out>, -callback=<optimized out>, user_data=<optimized out>) -at util/async.c:260 -#17 
0x00007f3c22302fbd in g_main_context_dispatch () from -/lib/x86_64-linux-gnu/libglib-2.0.so.0 -#18 0x0000555610220358 in glib_pollfds_poll () at util/main-loop.c:219 -#19 os_host_main_loop_wait (timeout=<optimized out>) at util/main-loop.c:242 -#20 main_loop_wait (nonblocking=<optimized out>) at util/main-loop.c:518 -#21 0x000055560ff600fe in main_loop () at vl.c:1814 -#22 0x000055560fddbce9 in main (argc=<optimized out>, argv=<optimized out>, -envp=<optimized out>) at vl.c:4503 -We found that we're doing endless check in the line of -block/io.c:bdrv_do_drained_begin(): -BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, recursive, parent)); -and it turns out that the bdrv_drain_poll() always get true from: -- bdrv_parent_drained_poll(bs, ignore_parent, ignore_bds_parents) -- AND atomic_read(&bs->in_flight) - -I personally think this is a deadlock issue in the a QEMU block layer -(as we know, we have some #FIXME comments in related codes, such as block -permisson update). -Any comments are welcome and appreciated. - ---- -thx,likexu - -On 2/28/21 9:39 PM, Like Xu wrote: -Hi Genius, -I am a user of QEMU v4.2.0 and stuck in an interesting bug, which may -still exist in the mainline. -Thanks in advance to heroes who can take a look and share understanding. -Do you have a test case that reproduces on 5.2? It'd be nice to know if -it was still a problem in the latest source tree or not. ---js -The qemu main thread endlessly hangs in the handle of the qmp statement: -{'execute': 'human-monitor-command', 'arguments':{ 'command-line': -'drive_del replication0' } } -and we have the call trace looks like: -#0 0x00007f3c22045bf6 in __ppoll (fds=0x555611328410, nfds=1, -timeout=<optimized out>, timeout@entry=0x7ffc56c66db0, -sigmask=sigmask@entry=0x0) at ../sysdeps/unix/sysv/linux/ppoll.c:44 -#1 0x000055561021f415 in ppoll (__ss=0x0, __timeout=0x7ffc56c66db0, -__nfds=<optimized out>, __fds=<optimized out>) -at /usr/include/x86_64-linux-gnu/bits/poll2.h:77 -#2 qemu_poll_ns (fds=<optimized out>, nfds=<optimized out>, -timeout=<optimized out>) at util/qemu-timer.c:348 -#3 0x0000555610221430 in aio_poll (ctx=ctx@entry=0x5556113010f0, -blocking=blocking@entry=true) at util/aio-posix.c:669 -#4 0x000055561019268d in bdrv_do_drained_begin (poll=true, -ignore_bds_parents=false, parent=0x0, recursive=false, -bs=0x55561138b0a0) at block/io.c:430 -#5 bdrv_do_drained_begin (bs=0x55561138b0a0, recursive=<optimized out>, -parent=0x0, ignore_bds_parents=<optimized out>, -poll=<optimized out>) at block/io.c:396 -#6 0x000055561017b60b in quorum_del_child (bs=0x55561138b0a0, -child=0x7f36dc0ce380, errp=<optimized out>) -at block/quorum.c:1063 -#7 0x000055560ff5836b in qmp_x_blockdev_change (parent=0x555612373120 -"colo-disk0", has_child=<optimized out>, -child=0x5556112df3e0 "children.1", has_node=<optimized out>, node=0x0, -errp=0x7ffc56c66f98) at blockdev.c:4494 -#8 0x00005556100f8f57 in qmp_marshal_x_blockdev_change (args=<optimized -out>, ret=<optimized out>, errp=0x7ffc56c67018) -at qapi/qapi-commands-block-core.c:1538 -#9 0x00005556101d8290 in do_qmp_dispatch (errp=0x7ffc56c67010, -allow_oob=<optimized out>, request=<optimized out>, -cmds=0x5556109c69a0 <qmp_commands>) at qapi/qmp-dispatch.c:132 -#10 qmp_dispatch (cmds=0x5556109c69a0 <qmp_commands>, request=<optimized -out>, allow_oob=<optimized out>) -at qapi/qmp-dispatch.c:175 -#11 0x00005556100d4c4d in monitor_qmp_dispatch (mon=0x5556113a6f40, -req=<optimized out>) at monitor/qmp.c:145 -#12 0x00005556100d5437 in monitor_qmp_bh_dispatcher (data=<optimized -out>) 
at monitor/qmp.c:234 -#13 0x000055561021dbec in aio_bh_call (bh=0x5556112164bGrateful0) at -util/async.c:117 -#14 aio_bh_poll (ctx=ctx@entry=0x5556112151b0) at util/async.c:117 -#15 0x00005556102212c4 in aio_dispatch (ctx=0x5556112151b0) at -util/aio-posix.c:459 -#16 0x000055561021dab2 in aio_ctx_dispatch (source=<optimized out>, -callback=<optimized out>, user_data=<optimized out>) -at util/async.c:260 -#17 0x00007f3c22302fbd in g_main_context_dispatch () from -/lib/x86_64-linux-gnu/libglib-2.0.so.0 -#18 0x0000555610220358 in glib_pollfds_poll () at util/main-loop.c:219 -#19 os_host_main_loop_wait (timeout=<optimized out>) at -util/main-loop.c:242 -#20 main_loop_wait (nonblocking=<optimized out>) at util/main-loop.c:518 -#21 0x000055560ff600fe in main_loop () at vl.c:1814 -#22 0x000055560fddbce9 in main (argc=<optimized out>, argv=<optimized -out>, envp=<optimized out>) at vl.c:4503 -We found that we're doing endless check in the line of -block/io.c:bdrv_do_drained_begin(): -    BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, recursive, parent)); -and it turns out that the bdrv_drain_poll() always get true from: -- bdrv_parent_drained_poll(bs, ignore_parent, ignore_bds_parents) -- AND atomic_read(&bs->in_flight) - -I personally think this is a deadlock issue in the a QEMU block layer -(as we know, we have some #FIXME comments in related codes, such as -block permisson update). -Any comments are welcome and appreciated. - ---- -thx,likexu - -Hi John, - -Thanks for your comment. - -On 2021/3/5 7:53, John Snow wrote: -On 2/28/21 9:39 PM, Like Xu wrote: -Hi Genius, -I am a user of QEMU v4.2.0 and stuck in an interesting bug, which may -still exist in the mainline. -Thanks in advance to heroes who can take a look and share understanding. -Do you have a test case that reproduces on 5.2? It'd be nice to know if it -was still a problem in the latest source tree or not. -We narrowed down the source of the bug, which basically came from -the following qmp usage: -{'execute': 'human-monitor-command', 'arguments':{ 'command-line': -'drive_del replication0' } } -One of the test cases is the COLO usage (docs/colo-proxy.txt). - -This issue is sporadic,the probability may be 1/15 for a io-heavy guest. - -I believe it's reproducible on 5.2 and the latest tree. 
---js -The qemu main thread endlessly hangs in the handle of the qmp statement: -{'execute': 'human-monitor-command', 'arguments':{ 'command-line': -'drive_del replication0' } } -and we have the call trace looks like: -#0 0x00007f3c22045bf6 in __ppoll (fds=0x555611328410, nfds=1, -timeout=<optimized out>, timeout@entry=0x7ffc56c66db0, -sigmask=sigmask@entry=0x0) at ../sysdeps/unix/sysv/linux/ppoll.c:44 -#1 0x000055561021f415 in ppoll (__ss=0x0, __timeout=0x7ffc56c66db0, -__nfds=<optimized out>, __fds=<optimized out>) -at /usr/include/x86_64-linux-gnu/bits/poll2.h:77 -#2 qemu_poll_ns (fds=<optimized out>, nfds=<optimized out>, -timeout=<optimized out>) at util/qemu-timer.c:348 -#3 0x0000555610221430 in aio_poll (ctx=ctx@entry=0x5556113010f0, -blocking=blocking@entry=true) at util/aio-posix.c:669 -#4 0x000055561019268d in bdrv_do_drained_begin (poll=true, -ignore_bds_parents=false, parent=0x0, recursive=false, -bs=0x55561138b0a0) at block/io.c:430 -#5 bdrv_do_drained_begin (bs=0x55561138b0a0, recursive=<optimized out>, -parent=0x0, ignore_bds_parents=<optimized out>, -poll=<optimized out>) at block/io.c:396 -#6 0x000055561017b60b in quorum_del_child (bs=0x55561138b0a0, -child=0x7f36dc0ce380, errp=<optimized out>) -at block/quorum.c:1063 -#7 0x000055560ff5836b in qmp_x_blockdev_change (parent=0x555612373120 -"colo-disk0", has_child=<optimized out>, -child=0x5556112df3e0 "children.1", has_node=<optimized out>, node=0x0, -errp=0x7ffc56c66f98) at blockdev.c:4494 -#8 0x00005556100f8f57 in qmp_marshal_x_blockdev_change (args=<optimized -out>, ret=<optimized out>, errp=0x7ffc56c67018) -at qapi/qapi-commands-block-core.c:1538 -#9 0x00005556101d8290 in do_qmp_dispatch (errp=0x7ffc56c67010, -allow_oob=<optimized out>, request=<optimized out>, -cmds=0x5556109c69a0 <qmp_commands>) at qapi/qmp-dispatch.c:132 -#10 qmp_dispatch (cmds=0x5556109c69a0 <qmp_commands>, request=<optimized -out>, allow_oob=<optimized out>) -at qapi/qmp-dispatch.c:175 -#11 0x00005556100d4c4d in monitor_qmp_dispatch (mon=0x5556113a6f40, -req=<optimized out>) at monitor/qmp.c:145 -#12 0x00005556100d5437 in monitor_qmp_bh_dispatcher (data=<optimized -out>) at monitor/qmp.c:234 -#13 0x000055561021dbec in aio_bh_call (bh=0x5556112164bGrateful0) at -util/async.c:117 -#14 aio_bh_poll (ctx=ctx@entry=0x5556112151b0) at util/async.c:117 -#15 0x00005556102212c4 in aio_dispatch (ctx=0x5556112151b0) at -util/aio-posix.c:459 -#16 0x000055561021dab2 in aio_ctx_dispatch (source=<optimized out>, -callback=<optimized out>, user_data=<optimized out>) -at util/async.c:260 -#17 0x00007f3c22302fbd in g_main_context_dispatch () from -/lib/x86_64-linux-gnu/libglib-2.0.so.0 -#18 0x0000555610220358 in glib_pollfds_poll () at util/main-loop.c:219 -#19 os_host_main_loop_wait (timeout=<optimized out>) at util/main-loop.c:242 -#20 main_loop_wait (nonblocking=<optimized out>) at util/main-loop.c:518 -#21 0x000055560ff600fe in main_loop () at vl.c:1814 -#22 0x000055560fddbce9 in main (argc=<optimized out>, argv=<optimized -out>, envp=<optimized out>) at vl.c:4503 -We found that we're doing endless check in the line of -block/io.c:bdrv_do_drained_begin(): -     BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, recursive, parent)); -and it turns out that the bdrv_drain_poll() always get true from: -- bdrv_parent_drained_poll(bs, ignore_parent, ignore_bds_parents) -- AND atomic_read(&bs->in_flight) - -I personally think this is a deadlock issue in the a QEMU block layer -(as we know, we have some #FIXME comments in related codes, such as block -permisson update). 
-Any comments are welcome and appreciated.
-
----
-thx,likexu
-
-On 3/4/21 10:08 PM, Like Xu wrote:
-Hi John,
-
-Thanks for your comment.
-
-On 2021/3/5 7:53, John Snow wrote:
-On 2/28/21 9:39 PM, Like Xu wrote:
-Hi Genius,
-I am a user of QEMU v4.2.0 and stuck in an interesting bug, which may
-still exist in the mainline.
-Thanks in advance to heroes who can take a look and share understanding.
-Do you have a test case that reproduces on 5.2? It'd be nice to know
-if it was still a problem in the latest source tree or not.
-We narrowed down the source of the bug, which basically came from
-the following qmp usage:
-{'execute': 'human-monitor-command', 'arguments':{ 'command-line':
-'drive_del replication0' } }
-One of the test cases is the COLO usage (docs/colo-proxy.txt).
-
-This issue is sporadic; the probability may be 1/15 for an io-heavy guest.
-
-I believe it's reproducible on 5.2 and the latest tree.
-Can you please test and confirm that this is the case, and then file a
-bug report on the LP:
-https://launchpad.net/qemu
-and include:
-- The exact commit you used (current origin/master debug build would be
-the most ideal.)
-- Which QEMU binary you are using (qemu-system-x86_64?)
-- The shortest command line you are aware of that reproduces the problem
-- The host OS and kernel version
-- An updated call trace
-- Any relevant commands issued prior to the one that caused the hang; or
-detailed reproduction steps if possible.
-Thanks,
---js
-
diff --git a/results/classifier/013/risc-v/65781993 b/results/classifier/013/risc-v/65781993
deleted file mode 100644
index 2a66aa6c4..000000000
--- a/results/classifier/013/risc-v/65781993
+++ /dev/null
@@ -1,2821 +0,0 @@
-risc-v: 0.745
-user-level: 0.697
-PID: 0.673
-debug: 0.673
-arm: 0.672
-virtual: 0.670
-assembly: 0.666
-semantic: 0.665
-graphic: 0.664
-alpha: 0.662
-socket: 0.660
-operating system: 0.660
-register: 0.659
-permissions: 0.658
-architecture: 0.658
-network: 0.657
-files: 0.657
-kernel: 0.656
-mistranslation: 0.650
-device: 0.647
-performance: 0.636
-boot: 0.635
-KVM: 0.627
-system: 0.624
-peripherals: 0.624
-i386: 0.612
-VMM: 0.612
-TCG: 0.607
-vnc: 0.590
-hypervisor: 0.586
-x86: 0.579
-ppc: 0.557
-
-[Qemu-devel] Reply: Re: Reply: Re: [BUG]COLO failover hang
-
-Thank you.
-
-I have tested it already.
-
-When the Primary Node panics, the Secondary Node qemu hangs at the same place.
-
-According to
-http://wiki.qemu-project.org/Features/COLO,
-killing the Primary Node qemu will not produce the problem, but a Primary Node
-panic can.
-
-I think this is because the channel does not support
-QIO_CHANNEL_FEATURE_SHUTDOWN.
-
-When failover happens, channel_shutdown() could not shut down the channel.
-
-So colo_process_incoming_thread will hang at recvmsg().
-
-I tested a patch:
-
-diff --git a/migration/socket.c b/migration/socket.c
-index 13966f1..d65a0ea 100644
---- a/migration/socket.c
-+++ b/migration/socket.c
-@@ -147,8 +147,9 @@ static gboolean socket_accept_incoming_migration(QIOChannel *ioc,
-     }
-
-     trace_migration_socket_incoming_accepted();
-
-     qio_channel_set_name(QIO_CHANNEL(sioc), "migration-socket-incoming");
-+    qio_channel_set_feature(QIO_CHANNEL(sioc), QIO_CHANNEL_FEATURE_SHUTDOWN);
-     migration_channel_process_incoming(migrate_get_current(),
-                                        QIO_CHANNEL(sioc));
-     object_unref(OBJECT(sioc));
-
-With this patch, my test does not hang any more.
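(Illustration of the point above, added editorially rather than taken from the
original mail: a thread blocked in recv()/recvmsg() on a connected socket is
normally only woken early when another thread calls shutdown() on that same
fd, which is what setting QIO_CHANNEL_FEATURE_SHUTDOWN lets the failover
path's qemu_file_shutdown() ultimately do.  A minimal stand-alone sketch:)

# shutdown_demo.py -- illustrative only, not part of the original report.
import socket
import threading
import time

a, b = socket.socketpair()              # peer "b" never sends anything

def reader():
    data = a.recv(16)                   # blocks here, like recvmsg() in the backtrace
    print("recv returned:", repr(data)) # b'' == orderly EOF once shutdown() is called

t = threading.Thread(target=reader)
t.start()
time.sleep(1)                           # let the reader park inside recv()
a.shutdown(socket.SHUT_RDWR)            # the failover analogue: wakes the blocked reader
t.join()
a.close()
b.close()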
-
-
-
-Original Mail
-
-From: address@hidden
-To: wangguang 10165992 address@hidden
-Cc: address@hidden address@hidden
-Date: 2017-03-21 15:58
-Subject: Re: [Qemu-devel] Reply: Re: [BUG]COLO failover hang
-
-
-Hi, Wang.
-
-You can test this branch:
-https://github.com/coloft/qemu/tree/colo-v5.1-developing-COLO-frame-v21-with-shared-disk
-and please follow the wiki to ensure your own configuration is correct.
-http://wiki.qemu-project.org/Features/COLO
-Thanks
-
-Zhang Chen
-
-
-On 03/21/2017 03:27 PM, address@hidden wrote:
-> hi.
->
-> I tested the git qemu master and it has the same problem.
->
-> (gdb) bt
->
-> #0 qio_channel_socket_readv (ioc=0x7f65911b4e50, iov=0x7f64ef3fd880,
-> niov=1, fds=0x0, nfds=0x0, errp=0x0) at io/channel-socket.c:461
->
-> #1 0x00007f658e4aa0c2 in qio_channel_read
-> (address@hidden, address@hidden "",
-> address@hidden, address@hidden) at io/channel.c:114
->
-> #2 0x00007f658e3ea990 in channel_get_buffer (opaque=<optimized out>,
-> buf=0x7f65907cb838 "", pos=<optimized out>, size=32768) at
-> migration/qemu-file-channel.c:78
->
-> #3 0x00007f658e3e97fc in qemu_fill_buffer (f=0x7f65907cb800) at
-> migration/qemu-file.c:295
->
-> #4 0x00007f658e3ea2e1 in qemu_peek_byte (address@hidden,
-> address@hidden) at migration/qemu-file.c:555
->
-> #5 0x00007f658e3ea34b in qemu_get_byte (address@hidden) at
-> migration/qemu-file.c:568
->
-> #6 0x00007f658e3ea552 in qemu_get_be32 (address@hidden) at
-> migration/qemu-file.c:648
->
-> #7 0x00007f658e3e66e5 in colo_receive_message (f=0x7f65907cb800,
-> address@hidden) at migration/colo.c:244
->
-> #8 0x00007f658e3e681e in colo_receive_check_message (f=<optimized
-> out>, address@hidden,
-> address@hidden)
->
-> at migration/colo.c:264
->
-> #9 0x00007f658e3e740e in colo_process_incoming_thread
-> (opaque=0x7f658eb30360 <mis_current.31286>) at migration/colo.c:577
->
-> #10 0x00007f658be09df3 in start_thread () from /lib64/libpthread.so.0
->
-> #11 0x00007f65881983ed in clone () from /lib64/libc.so.6
->
-> (gdb) p ioc->name
->
-> $2 = 0x7f658ff7d5c0 "migration-socket-incoming"
->
-> (gdb) p ioc->features    Does not support QIO_CHANNEL_FEATURE_SHUTDOWN
->
-> $3 = 0
->
->
-> (gdb) bt
->
-> #0 socket_accept_incoming_migration (ioc=0x7fdcceeafa90,
-> condition=G_IO_IN, opaque=0x7fdcceeafa90) at migration/socket.c:137
->
-> #1 0x00007fdcc6966350 in g_main_dispatch (context=<optimized out>) at
-> gmain.c:3054
->
-> #2 g_main_context_dispatch (context=<optimized out>,
-> address@hidden) at gmain.c:3630
->
-> #3 0x00007fdccb8a6dcc in glib_pollfds_poll () at util/main-loop.c:213
->
-> #4 os_host_main_loop_wait (timeout=<optimized out>) at
-> util/main-loop.c:258
->
-> #5 main_loop_wait (address@hidden) at
-> util/main-loop.c:506
->
-> #6 0x00007fdccb526187 in main_loop () at vl.c:1898
->
-> #7 main (argc=<optimized out>, argv=<optimized out>, envp=<optimized
-> out>) at vl.c:4709
->
-> (gdb) p ioc->features
->
-> $1 = 6
->
-> (gdb) p ioc->name
->
-> $2 = 0x7fdcce1b1ab0 "migration-socket-listener"
->
->
-> Maybe socket_accept_incoming_migration should
-> call qio_channel_set_feature(ioc, QIO_CHANNEL_FEATURE_SHUTDOWN)??
->
->
-> thank you.
->
->
->
-> Original Mail
-> address@hidden
-> address@hidden
-> address@hidden@huawei.com>
-> *Date:* 2017-03-16 14:46
-> *Subject:* *Re: [Qemu-devel] COLO failover hang*
->
->
-> On 03/15/2017 05:06 PM, wangguang wrote:
-> > am testing QEMU COLO feature described here [QEMU
-> > Wiki](
-http://wiki.qemu-project.org/Features/COLO
-).
-> >
-> > When the Primary Node panic,the Secondary Node qemu hang.
-> > hang at recvmsg in qio_channel_socket_readv.
-> > And I run { 'execute': 'nbd-server-stop' } and { "execute":
-> > "x-colo-lost-heartbeat" } in Secondary VM's
-> > monitor,the Secondary Node qemu still hang at recvmsg .
-> >
-> > I found that the colo in qemu is not complete yet.
-> > Do the colo have any plan for development?
->
-> Yes, we are developing. You can see some of the patches we are pushing.
->
-> > Has anyone ever run it successfully? Any help is appreciated!
->
-> Our internal version can run it successfully;
-> for the failover details you can ask Zhanghailiang for help.
-> Next time if you have some question about COLO,
-> please cc me and zhanghailiang address@hidden
->
->
-> Thanks
-> Zhang Chen
->
->
-> >
-> >
-> > centos7.2+qemu2.7.50
-> > (gdb) bt
-> > #0 0x00007f3e00cc86ad in recvmsg () from /lib64/libpthread.so.0
-> > #1 0x00007f3e0332b738 in qio_channel_socket_readv (ioc=<optimized out>,
-> > iov=<optimized out>, niov=<optimized out>, fds=0x0, nfds=0x0, errp=0x0) at
-> > io/channel-socket.c:497
-> > #2 0x00007f3e03329472 in qio_channel_read (address@hidden,
-> > address@hidden "", address@hidden,
-> > address@hidden) at io/channel.c:97
-> > #3 0x00007f3e032750e0 in channel_get_buffer (opaque=<optimized out>,
-> > buf=0x7f3e05910f38 "", pos=<optimized out>, size=32768) at
-> > migration/qemu-file-channel.c:78
-> > #4 0x00007f3e0327412c in qemu_fill_buffer (f=0x7f3e05910f00) at
-> > migration/qemu-file.c:257
-> > #5 0x00007f3e03274a41 in qemu_peek_byte (address@hidden,
-> > address@hidden) at migration/qemu-file.c:510
-> > #6 0x00007f3e03274aab in qemu_get_byte (address@hidden) at
-> > migration/qemu-file.c:523
-> > #7 0x00007f3e03274cb2 in qemu_get_be32 (address@hidden) at
-> > migration/qemu-file.c:603
-> > #8 0x00007f3e03271735 in colo_receive_message (f=0x7f3e05910f00,
-> > address@hidden) at migration/colo.c:215
-> > #9 0x00007f3e0327250d in colo_wait_handle_message (errp=0x7f3d62bfaa48,
-> > checkpoint_request=<synthetic pointer>, f=<optimized out>) at
-> > migration/colo.c:546
-> > #10 colo_process_incoming_thread (opaque=0x7f3e067245e0) at
-> > migration/colo.c:649
-> > #11 0x00007f3e00cc1df3 in start_thread () from /lib64/libpthread.so.0
-> > #12 0x00007f3dfc9c03ed in clone () from /lib64/libc.so.6
-> >
-> >
-> > --
-> > View this message in context:
-http://qemu.11.n7.nabble.com/COLO-failover-hang-tp473250.html
-> > Sent from the Developer mailing list archive at Nabble.com.
-> >
->
-> --
-> Thanks
-> Zhang Chen
->
->
-
---
-Thanks
-Zhang Chen
-
-Hi,
-
-On 2017/3/21 16:10, address@hidden wrote:
-Thank you.
-
-I have tested it already.
-
-When the Primary Node panics, the Secondary Node qemu hangs at the same place.
-
-According to
-http://wiki.qemu-project.org/Features/COLO,
-killing the Primary Node qemu will not produce the problem, but a Primary Node
-panic can.
-
-I think this is because the channel does not support
-QIO_CHANNEL_FEATURE_SHUTDOWN.
-Yes, you are right, when we do failover for primary/secondary VM, we will -shutdown the related -fd in case it is stuck in the read/write fd. - -It seems that you didn't follow the above introduction exactly to do the test. -Could you -share your test procedures ? Especially the commands used in the test. - -Thanks, -Hailiang -when failover,channel_shutdown could not shut down the channel. - - -so the colo_process_incoming_thread will hang at recvmsg. - - -I test a patch: - - -diff --git a/migration/socket.c b/migration/socket.c - - -index 13966f1..d65a0ea 100644 - - ---- a/migration/socket.c - - -+++ b/migration/socket.c - - -@@ -147,8 +147,9 @@ static gboolean socket_accept_incoming_migration(QIOChannel -*ioc, - - - } - - - - - - trace_migration_socket_incoming_accepted() - - - - - - qio_channel_set_name(QIO_CHANNEL(sioc), "migration-socket-incoming") - - -+ qio_channel_set_feature(QIO_CHANNEL(sioc), QIO_CHANNEL_FEATURE_SHUTDOWN) - - - migration_channel_process_incoming(migrate_get_current(), - - - QIO_CHANNEL(sioc)) - - - object_unref(OBJECT(sioc)) - - - - -My test will not hang any more. - - - - - - - - - - - - - - - - - -åå§é®ä»¶ - - - -åä»¶äººï¼ address@hidden -æ¶ä»¶äººï¼ç广10165992 address@hidden -æéäººï¼ address@hidden address@hidden -æ¥ æ ï¼2017å¹´03æ21æ¥ 15:58 -主 é¢ ï¼Re: [Qemu-devel] çå¤: Re: [BUG]COLO failover hang - - - - - -Hi,Wang. - -You can test this branch: -https://github.com/coloft/qemu/tree/colo-v5.1-developing-COLO-frame-v21-with-shared-disk -and please follow wiki ensure your own configuration correctly. -http://wiki.qemu-project.org/Features/COLO -Thanks - -Zhang Chen - - -On 03/21/2017 03:27 PM, address@hidden wrote: -ï¼ -ï¼ hi. -ï¼ -ï¼ I test the git qemu master have the same problem. -ï¼ -ï¼ (gdb) bt -ï¼ -ï¼ #0 qio_channel_socket_readv (ioc=0x7f65911b4e50, iov=0x7f64ef3fd880, -ï¼ niov=1, fds=0x0, nfds=0x0, errp=0x0) at io/channel-socket.c:461 -ï¼ -ï¼ #1 0x00007f658e4aa0c2 in qio_channel_read -ï¼ (address@hidden, address@hidden "", -ï¼ address@hidden, address@hidden) at io/channel.c:114 -ï¼ -ï¼ #2 0x00007f658e3ea990 in channel_get_buffer (opaque=ï¼optimized outï¼, -ï¼ buf=0x7f65907cb838 "", pos=ï¼optimized outï¼, size=32768) at -ï¼ migration/qemu-file-channel.c:78 -ï¼ -ï¼ #3 0x00007f658e3e97fc in qemu_fill_buffer (f=0x7f65907cb800) at -ï¼ migration/qemu-file.c:295 -ï¼ -ï¼ #4 0x00007f658e3ea2e1 in qemu_peek_byte (address@hidden, -ï¼ address@hidden) at migration/qemu-file.c:555 -ï¼ -ï¼ #5 0x00007f658e3ea34b in qemu_get_byte (address@hidden) at -ï¼ migration/qemu-file.c:568 -ï¼ -ï¼ #6 0x00007f658e3ea552 in qemu_get_be32 (address@hidden) at -ï¼ migration/qemu-file.c:648 -ï¼ -ï¼ #7 0x00007f658e3e66e5 in colo_receive_message (f=0x7f65907cb800, -ï¼ address@hidden) at migration/colo.c:244 -ï¼ -ï¼ #8 0x00007f658e3e681e in colo_receive_check_message (f=ï¼optimized -ï¼ outï¼, address@hidden, -ï¼ address@hidden) -ï¼ -ï¼ at migration/colo.c:264 -ï¼ -ï¼ #9 0x00007f658e3e740e in colo_process_incoming_thread -ï¼ (opaque=0x7f658eb30360 ï¼mis_current.31286ï¼) at migration/colo.c:577 -ï¼ -ï¼ #10 0x00007f658be09df3 in start_thread () from /lib64/libpthread.so.0 -ï¼ -ï¼ #11 0x00007f65881983ed in clone () from /lib64/libc.so.6 -ï¼ -ï¼ (gdb) p ioc-ï¼name -ï¼ -ï¼ $2 = 0x7f658ff7d5c0 "migration-socket-incoming" -ï¼ -ï¼ (gdb) p ioc-ï¼features Do not support QIO_CHANNEL_FEATURE_SHUTDOWN -ï¼ -ï¼ $3 = 0 -ï¼ -ï¼ -ï¼ (gdb) bt -ï¼ -ï¼ #0 socket_accept_incoming_migration (ioc=0x7fdcceeafa90, -ï¼ condition=G_IO_IN, opaque=0x7fdcceeafa90) at migration/socket.c:137 -ï¼ -ï¼ #1 0x00007fdcc6966350 in 
g_main_dispatch (context=ï¼optimized outï¼) at -ï¼ gmain.c:3054 -ï¼ -ï¼ #2 g_main_context_dispatch (context=ï¼optimized outï¼, -ï¼ address@hidden) at gmain.c:3630 -ï¼ -ï¼ #3 0x00007fdccb8a6dcc in glib_pollfds_poll () at util/main-loop.c:213 -ï¼ -ï¼ #4 os_host_main_loop_wait (timeout=ï¼optimized outï¼) at -ï¼ util/main-loop.c:258 -ï¼ -ï¼ #5 main_loop_wait (address@hidden) at -ï¼ util/main-loop.c:506 -ï¼ -ï¼ #6 0x00007fdccb526187 in main_loop () at vl.c:1898 -ï¼ -ï¼ #7 main (argc=ï¼optimized outï¼, argv=ï¼optimized outï¼, envp=ï¼optimized -ï¼ outï¼) at vl.c:4709 -ï¼ -ï¼ (gdb) p ioc-ï¼features -ï¼ -ï¼ $1 = 6 -ï¼ -ï¼ (gdb) p ioc-ï¼name -ï¼ -ï¼ $2 = 0x7fdcce1b1ab0 "migration-socket-listener" -ï¼ -ï¼ -ï¼ May be socket_accept_incoming_migration should -ï¼ call qio_channel_set_feature(ioc, QIO_CHANNEL_FEATURE_SHUTDOWN)?? -ï¼ -ï¼ -ï¼ thank you. -ï¼ -ï¼ -ï¼ -ï¼ -ï¼ -ï¼ åå§é®ä»¶ -ï¼ address@hidden -ï¼ address@hidden -ï¼ address@hidden@huawei.comï¼ -ï¼ *æ¥ æ ï¼*2017å¹´03æ16æ¥ 14:46 -ï¼ *主 é¢ ï¼**Re: [Qemu-devel] COLO failover hang* -ï¼ -ï¼ -ï¼ -ï¼ -ï¼ On 03/15/2017 05:06 PM, wangguang wrote: -ï¼ ï¼ am testing QEMU COLO feature described here [QEMU -ï¼ ï¼ Wiki]( -http://wiki.qemu-project.org/Features/COLO -). -ï¼ ï¼ -ï¼ ï¼ When the Primary Node panic,the Secondary Node qemu hang. -ï¼ ï¼ hang at recvmsg in qio_channel_socket_readv. -ï¼ ï¼ And I run { 'execute': 'nbd-server-stop' } and { "execute": -ï¼ ï¼ "x-colo-lost-heartbeat" } in Secondary VM's -ï¼ ï¼ monitor,the Secondary Node qemu still hang at recvmsg . -ï¼ ï¼ -ï¼ ï¼ I found that the colo in qemu is not complete yet. -ï¼ ï¼ Do the colo have any plan for development? -ï¼ -ï¼ Yes, We are developing. You can see some of patch we pushing. -ï¼ -ï¼ ï¼ Has anyone ever run it successfully? Any help is appreciated! -ï¼ -ï¼ In our internal version can run it successfully, -ï¼ The failover detail you can ask Zhanghailiang for help. 
-ï¼ Next time if you have some question about COLO, -ï¼ please cc me and zhanghailiang address@hidden -ï¼ -ï¼ -ï¼ Thanks -ï¼ Zhang Chen -ï¼ -ï¼ -ï¼ ï¼ -ï¼ ï¼ -ï¼ ï¼ -ï¼ ï¼ centos7.2+qemu2.7.50 -ï¼ ï¼ (gdb) bt -ï¼ ï¼ #0 0x00007f3e00cc86ad in recvmsg () from /lib64/libpthread.so.0 -ï¼ ï¼ #1 0x00007f3e0332b738 in qio_channel_socket_readv (ioc=ï¼optimized outï¼, -ï¼ ï¼ iov=ï¼optimized outï¼, niov=ï¼optimized outï¼, fds=0x0, nfds=0x0, errp=0x0) at -ï¼ ï¼ io/channel-socket.c:497 -ï¼ ï¼ #2 0x00007f3e03329472 in qio_channel_read (address@hidden, -ï¼ ï¼ address@hidden "", address@hidden, -ï¼ ï¼ address@hidden) at io/channel.c:97 -ï¼ ï¼ #3 0x00007f3e032750e0 in channel_get_buffer (opaque=ï¼optimized outï¼, -ï¼ ï¼ buf=0x7f3e05910f38 "", pos=ï¼optimized outï¼, size=32768) at -ï¼ ï¼ migration/qemu-file-channel.c:78 -ï¼ ï¼ #4 0x00007f3e0327412c in qemu_fill_buffer (f=0x7f3e05910f00) at -ï¼ ï¼ migration/qemu-file.c:257 -ï¼ ï¼ #5 0x00007f3e03274a41 in qemu_peek_byte (address@hidden, -ï¼ ï¼ address@hidden) at migration/qemu-file.c:510 -ï¼ ï¼ #6 0x00007f3e03274aab in qemu_get_byte (address@hidden) at -ï¼ ï¼ migration/qemu-file.c:523 -ï¼ ï¼ #7 0x00007f3e03274cb2 in qemu_get_be32 (address@hidden) at -ï¼ ï¼ migration/qemu-file.c:603 -ï¼ ï¼ #8 0x00007f3e03271735 in colo_receive_message (f=0x7f3e05910f00, -ï¼ ï¼ address@hidden) at migration/colo.c:215 -ï¼ ï¼ #9 0x00007f3e0327250d in colo_wait_handle_message (errp=0x7f3d62bfaa48, -ï¼ ï¼ checkpoint_request=ï¼synthetic pointerï¼, f=ï¼optimized outï¼) at -ï¼ ï¼ migration/colo.c:546 -ï¼ ï¼ #10 colo_process_incoming_thread (opaque=0x7f3e067245e0) at -ï¼ ï¼ migration/colo.c:649 -ï¼ ï¼ #11 0x00007f3e00cc1df3 in start_thread () from /lib64/libpthread.so.0 -ï¼ ï¼ #12 0x00007f3dfc9c03ed in clone () from /lib64/libc.so.6 -ï¼ ï¼ -ï¼ ï¼ -ï¼ ï¼ -ï¼ ï¼ -ï¼ ï¼ -ï¼ ï¼ -- -ï¼ ï¼ View this message in context: -http://qemu.11.n7.nabble.com/COLO-failover-hang-tp473250.html -ï¼ ï¼ Sent from the Developer mailing list archive at Nabble.com. -ï¼ ï¼ -ï¼ ï¼ -ï¼ ï¼ -ï¼ ï¼ -ï¼ -ï¼ -- -ï¼ Thanks -ï¼ Zhang Chen -ï¼ -ï¼ -ï¼ -ï¼ -ï¼ - -Hi, - -Thanks for reporting this, and i confirmed it in my test, and it is a bug. - -Though we tried to call qemu_file_shutdown() to shutdown the related fd, in -case COLO thread/incoming thread is stuck in read/write() while do failover, -but it didn't take effect, because all the fd used by COLO (also migration) -has been wrapped by qio channel, and it will not call the shutdown API if -we didn't qio_channel_set_feature(QIO_CHANNEL(sioc), -QIO_CHANNEL_FEATURE_SHUTDOWN). - -Cc: Dr. David Alan Gilbert <address@hidden> - -I doubted migration cancel has the same problem, it may be stuck in write() -if we tried to cancel migration. - -void fd_start_outgoing_migration(MigrationState *s, const char *fdname, Error -**errp) -{ - qio_channel_set_name(QIO_CHANNEL(ioc), "migration-fd-outgoing"); - migration_channel_connect(s, ioc, NULL); - ... ... -We didn't call qio_channel_set_feature(QIO_CHANNEL(sioc), -QIO_CHANNEL_FEATURE_SHUTDOWN) above, -and the -migrate_fd_cancel() -{ - ... ... - if (s->state == MIGRATION_STATUS_CANCELLING && f) { - qemu_file_shutdown(f); --> This will not take effect. No ? 
- } -} - -Thanks, -Hailiang - -On 2017/3/21 16:10, address@hidden wrote: -Thank youã - -I have test areadyã - -When the Primary Node panic,the Secondary Node qemu hang at the same placeã - -Incorrding -http://wiki.qemu-project.org/Features/COLO -ï¼kill Primary Node qemu -will not produce the problem,but Primary Node panic canã - -I think due to the feature of channel does not support -QIO_CHANNEL_FEATURE_SHUTDOWN. - - -when failover,channel_shutdown could not shut down the channel. - - -so the colo_process_incoming_thread will hang at recvmsg. - - -I test a patch: - - -diff --git a/migration/socket.c b/migration/socket.c - - -index 13966f1..d65a0ea 100644 - - ---- a/migration/socket.c - - -+++ b/migration/socket.c - - -@@ -147,8 +147,9 @@ static gboolean socket_accept_incoming_migration(QIOChannel -*ioc, - - - } - - - - - - trace_migration_socket_incoming_accepted() - - - - - - qio_channel_set_name(QIO_CHANNEL(sioc), "migration-socket-incoming") - - -+ qio_channel_set_feature(QIO_CHANNEL(sioc), QIO_CHANNEL_FEATURE_SHUTDOWN) - - - migration_channel_process_incoming(migrate_get_current(), - - - QIO_CHANNEL(sioc)) - - - object_unref(OBJECT(sioc)) - - - - -My test will not hang any more. - - - - - - - - - - - - - - - - - -åå§é®ä»¶ - - - -åä»¶äººï¼ address@hidden -æ¶ä»¶äººï¼ç广10165992 address@hidden -æéäººï¼ address@hidden address@hidden -æ¥ æ ï¼2017å¹´03æ21æ¥ 15:58 -主 é¢ ï¼Re: [Qemu-devel] çå¤: Re: [BUG]COLO failover hang - - - - - -Hi,Wang. - -You can test this branch: -https://github.com/coloft/qemu/tree/colo-v5.1-developing-COLO-frame-v21-with-shared-disk -and please follow wiki ensure your own configuration correctly. -http://wiki.qemu-project.org/Features/COLO -Thanks - -Zhang Chen - - -On 03/21/2017 03:27 PM, address@hidden wrote: -ï¼ -ï¼ hi. -ï¼ -ï¼ I test the git qemu master have the same problem. 
-ï¼ -ï¼ (gdb) bt -ï¼ -ï¼ #0 qio_channel_socket_readv (ioc=0x7f65911b4e50, iov=0x7f64ef3fd880, -ï¼ niov=1, fds=0x0, nfds=0x0, errp=0x0) at io/channel-socket.c:461 -ï¼ -ï¼ #1 0x00007f658e4aa0c2 in qio_channel_read -ï¼ (address@hidden, address@hidden "", -ï¼ address@hidden, address@hidden) at io/channel.c:114 -ï¼ -ï¼ #2 0x00007f658e3ea990 in channel_get_buffer (opaque=ï¼optimized outï¼, -ï¼ buf=0x7f65907cb838 "", pos=ï¼optimized outï¼, size=32768) at -ï¼ migration/qemu-file-channel.c:78 -ï¼ -ï¼ #3 0x00007f658e3e97fc in qemu_fill_buffer (f=0x7f65907cb800) at -ï¼ migration/qemu-file.c:295 -ï¼ -ï¼ #4 0x00007f658e3ea2e1 in qemu_peek_byte (address@hidden, -ï¼ address@hidden) at migration/qemu-file.c:555 -ï¼ -ï¼ #5 0x00007f658e3ea34b in qemu_get_byte (address@hidden) at -ï¼ migration/qemu-file.c:568 -ï¼ -ï¼ #6 0x00007f658e3ea552 in qemu_get_be32 (address@hidden) at -ï¼ migration/qemu-file.c:648 -ï¼ -ï¼ #7 0x00007f658e3e66e5 in colo_receive_message (f=0x7f65907cb800, -ï¼ address@hidden) at migration/colo.c:244 -ï¼ -ï¼ #8 0x00007f658e3e681e in colo_receive_check_message (f=ï¼optimized -ï¼ outï¼, address@hidden, -ï¼ address@hidden) -ï¼ -ï¼ at migration/colo.c:264 -ï¼ -ï¼ #9 0x00007f658e3e740e in colo_process_incoming_thread -ï¼ (opaque=0x7f658eb30360 ï¼mis_current.31286ï¼) at migration/colo.c:577 -ï¼ -ï¼ #10 0x00007f658be09df3 in start_thread () from /lib64/libpthread.so.0 -ï¼ -ï¼ #11 0x00007f65881983ed in clone () from /lib64/libc.so.6 -ï¼ -ï¼ (gdb) p ioc-ï¼name -ï¼ -ï¼ $2 = 0x7f658ff7d5c0 "migration-socket-incoming" -ï¼ -ï¼ (gdb) p ioc-ï¼features Do not support QIO_CHANNEL_FEATURE_SHUTDOWN -ï¼ -ï¼ $3 = 0 -ï¼ -ï¼ -ï¼ (gdb) bt -ï¼ -ï¼ #0 socket_accept_incoming_migration (ioc=0x7fdcceeafa90, -ï¼ condition=G_IO_IN, opaque=0x7fdcceeafa90) at migration/socket.c:137 -ï¼ -ï¼ #1 0x00007fdcc6966350 in g_main_dispatch (context=ï¼optimized outï¼) at -ï¼ gmain.c:3054 -ï¼ -ï¼ #2 g_main_context_dispatch (context=ï¼optimized outï¼, -ï¼ address@hidden) at gmain.c:3630 -ï¼ -ï¼ #3 0x00007fdccb8a6dcc in glib_pollfds_poll () at util/main-loop.c:213 -ï¼ -ï¼ #4 os_host_main_loop_wait (timeout=ï¼optimized outï¼) at -ï¼ util/main-loop.c:258 -ï¼ -ï¼ #5 main_loop_wait (address@hidden) at -ï¼ util/main-loop.c:506 -ï¼ -ï¼ #6 0x00007fdccb526187 in main_loop () at vl.c:1898 -ï¼ -ï¼ #7 main (argc=ï¼optimized outï¼, argv=ï¼optimized outï¼, envp=ï¼optimized -ï¼ outï¼) at vl.c:4709 -ï¼ -ï¼ (gdb) p ioc-ï¼features -ï¼ -ï¼ $1 = 6 -ï¼ -ï¼ (gdb) p ioc-ï¼name -ï¼ -ï¼ $2 = 0x7fdcce1b1ab0 "migration-socket-listener" -ï¼ -ï¼ -ï¼ May be socket_accept_incoming_migration should -ï¼ call qio_channel_set_feature(ioc, QIO_CHANNEL_FEATURE_SHUTDOWN)?? -ï¼ -ï¼ -ï¼ thank you. -ï¼ -ï¼ -ï¼ -ï¼ -ï¼ -ï¼ åå§é®ä»¶ -ï¼ address@hidden -ï¼ address@hidden -ï¼ address@hidden@huawei.comï¼ -ï¼ *æ¥ æ ï¼*2017å¹´03æ16æ¥ 14:46 -ï¼ *主 é¢ ï¼**Re: [Qemu-devel] COLO failover hang* -ï¼ -ï¼ -ï¼ -ï¼ -ï¼ On 03/15/2017 05:06 PM, wangguang wrote: -ï¼ ï¼ am testing QEMU COLO feature described here [QEMU -ï¼ ï¼ Wiki]( -http://wiki.qemu-project.org/Features/COLO -). -ï¼ ï¼ -ï¼ ï¼ When the Primary Node panic,the Secondary Node qemu hang. -ï¼ ï¼ hang at recvmsg in qio_channel_socket_readv. -ï¼ ï¼ And I run { 'execute': 'nbd-server-stop' } and { "execute": -ï¼ ï¼ "x-colo-lost-heartbeat" } in Secondary VM's -ï¼ ï¼ monitor,the Secondary Node qemu still hang at recvmsg . -ï¼ ï¼ -ï¼ ï¼ I found that the colo in qemu is not complete yet. -ï¼ ï¼ Do the colo have any plan for development? -ï¼ -ï¼ Yes, We are developing. You can see some of patch we pushing. 
-ï¼ -ï¼ ï¼ Has anyone ever run it successfully? Any help is appreciated! -ï¼ -ï¼ In our internal version can run it successfully, -ï¼ The failover detail you can ask Zhanghailiang for help. -ï¼ Next time if you have some question about COLO, -ï¼ please cc me and zhanghailiang address@hidden -ï¼ -ï¼ -ï¼ Thanks -ï¼ Zhang Chen -ï¼ -ï¼ -ï¼ ï¼ -ï¼ ï¼ -ï¼ ï¼ -ï¼ ï¼ centos7.2+qemu2.7.50 -ï¼ ï¼ (gdb) bt -ï¼ ï¼ #0 0x00007f3e00cc86ad in recvmsg () from /lib64/libpthread.so.0 -ï¼ ï¼ #1 0x00007f3e0332b738 in qio_channel_socket_readv (ioc=ï¼optimized outï¼, -ï¼ ï¼ iov=ï¼optimized outï¼, niov=ï¼optimized outï¼, fds=0x0, nfds=0x0, errp=0x0) at -ï¼ ï¼ io/channel-socket.c:497 -ï¼ ï¼ #2 0x00007f3e03329472 in qio_channel_read (address@hidden, -ï¼ ï¼ address@hidden "", address@hidden, -ï¼ ï¼ address@hidden) at io/channel.c:97 -ï¼ ï¼ #3 0x00007f3e032750e0 in channel_get_buffer (opaque=ï¼optimized outï¼, -ï¼ ï¼ buf=0x7f3e05910f38 "", pos=ï¼optimized outï¼, size=32768) at -ï¼ ï¼ migration/qemu-file-channel.c:78 -ï¼ ï¼ #4 0x00007f3e0327412c in qemu_fill_buffer (f=0x7f3e05910f00) at -ï¼ ï¼ migration/qemu-file.c:257 -ï¼ ï¼ #5 0x00007f3e03274a41 in qemu_peek_byte (address@hidden, -ï¼ ï¼ address@hidden) at migration/qemu-file.c:510 -ï¼ ï¼ #6 0x00007f3e03274aab in qemu_get_byte (address@hidden) at -ï¼ ï¼ migration/qemu-file.c:523 -ï¼ ï¼ #7 0x00007f3e03274cb2 in qemu_get_be32 (address@hidden) at -ï¼ ï¼ migration/qemu-file.c:603 -ï¼ ï¼ #8 0x00007f3e03271735 in colo_receive_message (f=0x7f3e05910f00, -ï¼ ï¼ address@hidden) at migration/colo.c:215 -ï¼ ï¼ #9 0x00007f3e0327250d in colo_wait_handle_message (errp=0x7f3d62bfaa48, -ï¼ ï¼ checkpoint_request=ï¼synthetic pointerï¼, f=ï¼optimized outï¼) at -ï¼ ï¼ migration/colo.c:546 -ï¼ ï¼ #10 colo_process_incoming_thread (opaque=0x7f3e067245e0) at -ï¼ ï¼ migration/colo.c:649 -ï¼ ï¼ #11 0x00007f3e00cc1df3 in start_thread () from /lib64/libpthread.so.0 -ï¼ ï¼ #12 0x00007f3dfc9c03ed in clone () from /lib64/libc.so.6 -ï¼ ï¼ -ï¼ ï¼ -ï¼ ï¼ -ï¼ ï¼ -ï¼ ï¼ -ï¼ ï¼ -- -ï¼ ï¼ View this message in context: -http://qemu.11.n7.nabble.com/COLO-failover-hang-tp473250.html -ï¼ ï¼ Sent from the Developer mailing list archive at Nabble.com. -ï¼ ï¼ -ï¼ ï¼ -ï¼ ï¼ -ï¼ ï¼ -ï¼ -ï¼ -- -ï¼ Thanks -ï¼ Zhang Chen -ï¼ -ï¼ -ï¼ -ï¼ -ï¼ - -* Hailiang Zhang (address@hidden) wrote: -> -Hi, -> -> -Thanks for reporting this, and i confirmed it in my test, and it is a bug. -> -> -Though we tried to call qemu_file_shutdown() to shutdown the related fd, in -> -case COLO thread/incoming thread is stuck in read/write() while do failover, -> -but it didn't take effect, because all the fd used by COLO (also migration) -> -has been wrapped by qio channel, and it will not call the shutdown API if -> -we didn't qio_channel_set_feature(QIO_CHANNEL(sioc), -> -QIO_CHANNEL_FEATURE_SHUTDOWN). -> -> -Cc: Dr. David Alan Gilbert <address@hidden> -> -> -I doubted migration cancel has the same problem, it may be stuck in write() -> -if we tried to cancel migration. -> -> -void fd_start_outgoing_migration(MigrationState *s, const char *fdname, Error -> -**errp) -> -{ -> -qio_channel_set_name(QIO_CHANNEL(ioc), "migration-fd-outgoing"); -> -migration_channel_connect(s, ioc, NULL); -> -... ... -> -We didn't call qio_channel_set_feature(QIO_CHANNEL(sioc), -> -QIO_CHANNEL_FEATURE_SHUTDOWN) above, -> -and the -> -migrate_fd_cancel() -> -{ -> -... ... -> -if (s->state == MIGRATION_STATUS_CANCELLING && f) { -> -qemu_file_shutdown(f); --> This will not take effect. No ? -> -} -> -} -(cc'd in Daniel Berrange). 
-I see that we call qio_channel_set_feature(ioc, QIO_CHANNEL_FEATURE_SHUTDOWN); -at the -top of qio_channel_socket_new; so I think that's safe isn't it? - -Dave - -> -Thanks, -> -Hailiang -> -> -On 2017/3/21 16:10, address@hidden wrote: -> -> Thank youã -> -> -> -> I have test areadyã -> -> -> -> When the Primary Node panic,the Secondary Node qemu hang at the same placeã -> -> -> -> Incorrding -http://wiki.qemu-project.org/Features/COLO -ï¼kill Primary Node -> -> qemu will not produce the problem,but Primary Node panic canã -> -> -> -> I think due to the feature of channel does not support -> -> QIO_CHANNEL_FEATURE_SHUTDOWN. -> -> -> -> -> -> when failover,channel_shutdown could not shut down the channel. -> -> -> -> -> -> so the colo_process_incoming_thread will hang at recvmsg. -> -> -> -> -> -> I test a patch: -> -> -> -> -> -> diff --git a/migration/socket.c b/migration/socket.c -> -> -> -> -> -> index 13966f1..d65a0ea 100644 -> -> -> -> -> -> --- a/migration/socket.c -> -> -> -> -> -> +++ b/migration/socket.c -> -> -> -> -> -> @@ -147,8 +147,9 @@ static gboolean -> -> socket_accept_incoming_migration(QIOChannel *ioc, -> -> -> -> -> -> } -> -> -> -> -> -> -> -> -> -> -> -> trace_migration_socket_incoming_accepted() -> -> -> -> -> -> -> -> -> -> -> -> qio_channel_set_name(QIO_CHANNEL(sioc), "migration-socket-incoming") -> -> -> -> -> -> + qio_channel_set_feature(QIO_CHANNEL(sioc), -> -> QIO_CHANNEL_FEATURE_SHUTDOWN) -> -> -> -> -> -> migration_channel_process_incoming(migrate_get_current(), -> -> -> -> -> -> QIO_CHANNEL(sioc)) -> -> -> -> -> -> object_unref(OBJECT(sioc)) -> -> -> -> -> -> -> -> -> -> My test will not hang any more. -> -> -> -> -> -> -> -> -> -> -> -> -> -> -> -> -> -> -> -> -> -> -> -> -> -> -> -> -> -> -> -> -> -> -> -> åå§é®ä»¶ -> -> -> -> -> -> -> -> åä»¶äººï¼ address@hidden -> -> æ¶ä»¶äººï¼ç广10165992 address@hidden -> -> æéäººï¼ address@hidden address@hidden -> -> æ¥ æ ï¼2017å¹´03æ21æ¥ 15:58 -> -> 主 é¢ ï¼Re: [Qemu-devel] çå¤: Re: [BUG]COLO failover hang -> -> -> -> -> -> -> -> -> -> -> -> Hi,Wang. -> -> -> -> You can test this branch: -> -> -> -> -https://github.com/coloft/qemu/tree/colo-v5.1-developing-COLO-frame-v21-with-shared-disk -> -> -> -> and please follow wiki ensure your own configuration correctly. -> -> -> -> -http://wiki.qemu-project.org/Features/COLO -> -> -> -> -> -> Thanks -> -> -> -> Zhang Chen -> -> -> -> -> -> On 03/21/2017 03:27 PM, address@hidden wrote: -> -> ï¼ -> -> ï¼ hi. -> -> ï¼ -> -> ï¼ I test the git qemu master have the same problem. 
-> -> ï¼ -> -> ï¼ (gdb) bt -> -> ï¼ -> -> ï¼ #0 qio_channel_socket_readv (ioc=0x7f65911b4e50, iov=0x7f64ef3fd880, -> -> ï¼ niov=1, fds=0x0, nfds=0x0, errp=0x0) at io/channel-socket.c:461 -> -> ï¼ -> -> ï¼ #1 0x00007f658e4aa0c2 in qio_channel_read -> -> ï¼ (address@hidden, address@hidden "", -> -> ï¼ address@hidden, address@hidden) at io/channel.c:114 -> -> ï¼ -> -> ï¼ #2 0x00007f658e3ea990 in channel_get_buffer (opaque=ï¼optimized outï¼, -> -> ï¼ buf=0x7f65907cb838 "", pos=ï¼optimized outï¼, size=32768) at -> -> ï¼ migration/qemu-file-channel.c:78 -> -> ï¼ -> -> ï¼ #3 0x00007f658e3e97fc in qemu_fill_buffer (f=0x7f65907cb800) at -> -> ï¼ migration/qemu-file.c:295 -> -> ï¼ -> -> ï¼ #4 0x00007f658e3ea2e1 in qemu_peek_byte (address@hidden, -> -> ï¼ address@hidden) at migration/qemu-file.c:555 -> -> ï¼ -> -> ï¼ #5 0x00007f658e3ea34b in qemu_get_byte (address@hidden) at -> -> ï¼ migration/qemu-file.c:568 -> -> ï¼ -> -> ï¼ #6 0x00007f658e3ea552 in qemu_get_be32 (address@hidden) at -> -> ï¼ migration/qemu-file.c:648 -> -> ï¼ -> -> ï¼ #7 0x00007f658e3e66e5 in colo_receive_message (f=0x7f65907cb800, -> -> ï¼ address@hidden) at migration/colo.c:244 -> -> ï¼ -> -> ï¼ #8 0x00007f658e3e681e in colo_receive_check_message (f=ï¼optimized -> -> ï¼ outï¼, address@hidden, -> -> ï¼ address@hidden) -> -> ï¼ -> -> ï¼ at migration/colo.c:264 -> -> ï¼ -> -> ï¼ #9 0x00007f658e3e740e in colo_process_incoming_thread -> -> ï¼ (opaque=0x7f658eb30360 ï¼mis_current.31286ï¼) at migration/colo.c:577 -> -> ï¼ -> -> ï¼ #10 0x00007f658be09df3 in start_thread () from /lib64/libpthread.so.0 -> -> ï¼ -> -> ï¼ #11 0x00007f65881983ed in clone () from /lib64/libc.so.6 -> -> ï¼ -> -> ï¼ (gdb) p ioc-ï¼name -> -> ï¼ -> -> ï¼ $2 = 0x7f658ff7d5c0 "migration-socket-incoming" -> -> ï¼ -> -> ï¼ (gdb) p ioc-ï¼features Do not support QIO_CHANNEL_FEATURE_SHUTDOWN -> -> ï¼ -> -> ï¼ $3 = 0 -> -> ï¼ -> -> ï¼ -> -> ï¼ (gdb) bt -> -> ï¼ -> -> ï¼ #0 socket_accept_incoming_migration (ioc=0x7fdcceeafa90, -> -> ï¼ condition=G_IO_IN, opaque=0x7fdcceeafa90) at migration/socket.c:137 -> -> ï¼ -> -> ï¼ #1 0x00007fdcc6966350 in g_main_dispatch (context=ï¼optimized outï¼) at -> -> ï¼ gmain.c:3054 -> -> ï¼ -> -> ï¼ #2 g_main_context_dispatch (context=ï¼optimized outï¼, -> -> ï¼ address@hidden) at gmain.c:3630 -> -> ï¼ -> -> ï¼ #3 0x00007fdccb8a6dcc in glib_pollfds_poll () at util/main-loop.c:213 -> -> ï¼ -> -> ï¼ #4 os_host_main_loop_wait (timeout=ï¼optimized outï¼) at -> -> ï¼ util/main-loop.c:258 -> -> ï¼ -> -> ï¼ #5 main_loop_wait (address@hidden) at -> -> ï¼ util/main-loop.c:506 -> -> ï¼ -> -> ï¼ #6 0x00007fdccb526187 in main_loop () at vl.c:1898 -> -> ï¼ -> -> ï¼ #7 main (argc=ï¼optimized outï¼, argv=ï¼optimized outï¼, envp=ï¼optimized -> -> ï¼ outï¼) at vl.c:4709 -> -> ï¼ -> -> ï¼ (gdb) p ioc-ï¼features -> -> ï¼ -> -> ï¼ $1 = 6 -> -> ï¼ -> -> ï¼ (gdb) p ioc-ï¼name -> -> ï¼ -> -> ï¼ $2 = 0x7fdcce1b1ab0 "migration-socket-listener" -> -> ï¼ -> -> ï¼ -> -> ï¼ May be socket_accept_incoming_migration should -> -> ï¼ call qio_channel_set_feature(ioc, QIO_CHANNEL_FEATURE_SHUTDOWN)?? -> -> ï¼ -> -> ï¼ -> -> ï¼ thank you. 
-> -> ï¼ -> -> ï¼ -> -> ï¼ -> -> ï¼ -> -> ï¼ -> -> ï¼ åå§é®ä»¶ -> -> ï¼ address@hidden -> -> ï¼ address@hidden -> -> ï¼ address@hidden@huawei.comï¼ -> -> ï¼ *æ¥ æ ï¼*2017å¹´03æ16æ¥ 14:46 -> -> ï¼ *主 é¢ ï¼**Re: [Qemu-devel] COLO failover hang* -> -> ï¼ -> -> ï¼ -> -> ï¼ -> -> ï¼ -> -> ï¼ On 03/15/2017 05:06 PM, wangguang wrote: -> -> ï¼ ï¼ am testing QEMU COLO feature described here [QEMU -> -> ï¼ ï¼ Wiki]( -http://wiki.qemu-project.org/Features/COLO -). -> -> ï¼ ï¼ -> -> ï¼ ï¼ When the Primary Node panic,the Secondary Node qemu hang. -> -> ï¼ ï¼ hang at recvmsg in qio_channel_socket_readv. -> -> ï¼ ï¼ And I run { 'execute': 'nbd-server-stop' } and { "execute": -> -> ï¼ ï¼ "x-colo-lost-heartbeat" } in Secondary VM's -> -> ï¼ ï¼ monitor,the Secondary Node qemu still hang at recvmsg . -> -> ï¼ ï¼ -> -> ï¼ ï¼ I found that the colo in qemu is not complete yet. -> -> ï¼ ï¼ Do the colo have any plan for development? -> -> ï¼ -> -> ï¼ Yes, We are developing. You can see some of patch we pushing. -> -> ï¼ -> -> ï¼ ï¼ Has anyone ever run it successfully? Any help is appreciated! -> -> ï¼ -> -> ï¼ In our internal version can run it successfully, -> -> ï¼ The failover detail you can ask Zhanghailiang for help. -> -> ï¼ Next time if you have some question about COLO, -> -> ï¼ please cc me and zhanghailiang address@hidden -> -> ï¼ -> -> ï¼ -> -> ï¼ Thanks -> -> ï¼ Zhang Chen -> -> ï¼ -> -> ï¼ -> -> ï¼ ï¼ -> -> ï¼ ï¼ -> -> ï¼ ï¼ -> -> ï¼ ï¼ centos7.2+qemu2.7.50 -> -> ï¼ ï¼ (gdb) bt -> -> ï¼ ï¼ #0 0x00007f3e00cc86ad in recvmsg () from /lib64/libpthread.so.0 -> -> ï¼ ï¼ #1 0x00007f3e0332b738 in qio_channel_socket_readv (ioc=ï¼optimized outï¼, -> -> ï¼ ï¼ iov=ï¼optimized outï¼, niov=ï¼optimized outï¼, fds=0x0, nfds=0x0, errp=0x0) -> -> at -> -> ï¼ ï¼ io/channel-socket.c:497 -> -> ï¼ ï¼ #2 0x00007f3e03329472 in qio_channel_read (address@hidden, -> -> ï¼ ï¼ address@hidden "", address@hidden, -> -> ï¼ ï¼ address@hidden) at io/channel.c:97 -> -> ï¼ ï¼ #3 0x00007f3e032750e0 in channel_get_buffer (opaque=ï¼optimized outï¼, -> -> ï¼ ï¼ buf=0x7f3e05910f38 "", pos=ï¼optimized outï¼, size=32768) at -> -> ï¼ ï¼ migration/qemu-file-channel.c:78 -> -> ï¼ ï¼ #4 0x00007f3e0327412c in qemu_fill_buffer (f=0x7f3e05910f00) at -> -> ï¼ ï¼ migration/qemu-file.c:257 -> -> ï¼ ï¼ #5 0x00007f3e03274a41 in qemu_peek_byte (address@hidden, -> -> ï¼ ï¼ address@hidden) at migration/qemu-file.c:510 -> -> ï¼ ï¼ #6 0x00007f3e03274aab in qemu_get_byte (address@hidden) at -> -> ï¼ ï¼ migration/qemu-file.c:523 -> -> ï¼ ï¼ #7 0x00007f3e03274cb2 in qemu_get_be32 (address@hidden) at -> -> ï¼ ï¼ migration/qemu-file.c:603 -> -> ï¼ ï¼ #8 0x00007f3e03271735 in colo_receive_message (f=0x7f3e05910f00, -> -> ï¼ ï¼ address@hidden) at migration/colo.c:215 -> -> ï¼ ï¼ #9 0x00007f3e0327250d in colo_wait_handle_message (errp=0x7f3d62bfaa48, -> -> ï¼ ï¼ checkpoint_request=ï¼synthetic pointerï¼, f=ï¼optimized outï¼) at -> -> ï¼ ï¼ migration/colo.c:546 -> -> ï¼ ï¼ #10 colo_process_incoming_thread (opaque=0x7f3e067245e0) at -> -> ï¼ ï¼ migration/colo.c:649 -> -> ï¼ ï¼ #11 0x00007f3e00cc1df3 in start_thread () from /lib64/libpthread.so.0 -> -> ï¼ ï¼ #12 0x00007f3dfc9c03ed in clone () from /lib64/libc.so.6 -> -> ï¼ ï¼ -> -> ï¼ ï¼ -> -> ï¼ ï¼ -> -> ï¼ ï¼ -> -> ï¼ ï¼ -> -> ï¼ ï¼ -- -> -> ï¼ ï¼ View this message in context: -> -> -http://qemu.11.n7.nabble.com/COLO-failover-hang-tp473250.html -> -> ï¼ ï¼ Sent from the Developer mailing list archive at Nabble.com. 
-> -> ï¼ ï¼ -> -> ï¼ ï¼ -> -> ï¼ ï¼ -> -> ï¼ ï¼ -> -> ï¼ -> -> ï¼ -- -> -> ï¼ Thanks -> -> ï¼ Zhang Chen -> -> ï¼ -> -> ï¼ -> -> ï¼ -> -> ï¼ -> -> ï¼ -> -> -> --- -Dr. David Alan Gilbert / address@hidden / Manchester, UK - -On 2017/3/21 19:56, Dr. David Alan Gilbert wrote: -* Hailiang Zhang (address@hidden) wrote: -Hi, - -Thanks for reporting this, and i confirmed it in my test, and it is a bug. - -Though we tried to call qemu_file_shutdown() to shutdown the related fd, in -case COLO thread/incoming thread is stuck in read/write() while do failover, -but it didn't take effect, because all the fd used by COLO (also migration) -has been wrapped by qio channel, and it will not call the shutdown API if -we didn't qio_channel_set_feature(QIO_CHANNEL(sioc), -QIO_CHANNEL_FEATURE_SHUTDOWN). - -Cc: Dr. David Alan Gilbert <address@hidden> - -I doubted migration cancel has the same problem, it may be stuck in write() -if we tried to cancel migration. - -void fd_start_outgoing_migration(MigrationState *s, const char *fdname, Error -**errp) -{ - qio_channel_set_name(QIO_CHANNEL(ioc), "migration-fd-outgoing"); - migration_channel_connect(s, ioc, NULL); - ... ... -We didn't call qio_channel_set_feature(QIO_CHANNEL(sioc), -QIO_CHANNEL_FEATURE_SHUTDOWN) above, -and the -migrate_fd_cancel() -{ - ... ... - if (s->state == MIGRATION_STATUS_CANCELLING && f) { - qemu_file_shutdown(f); --> This will not take effect. No ? - } -} -(cc'd in Daniel Berrange). -I see that we call qio_channel_set_feature(ioc, QIO_CHANNEL_FEATURE_SHUTDOWN); -at the -top of qio_channel_socket_new; so I think that's safe isn't it? -Hmm, you are right, this problem is only exist for the migration incoming fd, -thanks. -Dave -Thanks, -Hailiang - -On 2017/3/21 16:10, address@hidden wrote: -Thank youã - -I have test areadyã - -When the Primary Node panic,the Secondary Node qemu hang at the same placeã - -Incorrding -http://wiki.qemu-project.org/Features/COLO -ï¼kill Primary Node qemu -will not produce the problem,but Primary Node panic canã - -I think due to the feature of channel does not support -QIO_CHANNEL_FEATURE_SHUTDOWN. - - -when failover,channel_shutdown could not shut down the channel. - - -so the colo_process_incoming_thread will hang at recvmsg. - - -I test a patch: - - -diff --git a/migration/socket.c b/migration/socket.c - - -index 13966f1..d65a0ea 100644 - - ---- a/migration/socket.c - - -+++ b/migration/socket.c - - -@@ -147,8 +147,9 @@ static gboolean socket_accept_incoming_migration(QIOChannel -*ioc, - - - } - - - - - - trace_migration_socket_incoming_accepted() - - - - - - qio_channel_set_name(QIO_CHANNEL(sioc), "migration-socket-incoming") - - -+ qio_channel_set_feature(QIO_CHANNEL(sioc), QIO_CHANNEL_FEATURE_SHUTDOWN) - - - migration_channel_process_incoming(migrate_get_current(), - - - QIO_CHANNEL(sioc)) - - - object_unref(OBJECT(sioc)) - - - - -My test will not hang any more. - - - - - - - - - - - - - - - - - -åå§é®ä»¶ - - - -åä»¶äººï¼ address@hidden -æ¶ä»¶äººï¼ç广10165992 address@hidden -æéäººï¼ address@hidden address@hidden -æ¥ æ ï¼2017å¹´03æ21æ¥ 15:58 -主 é¢ ï¼Re: [Qemu-devel] çå¤: Re: [BUG]COLO failover hang - - - - - -Hi,Wang. - -You can test this branch: -https://github.com/coloft/qemu/tree/colo-v5.1-developing-COLO-frame-v21-with-shared-disk -and please follow wiki ensure your own configuration correctly. -http://wiki.qemu-project.org/Features/COLO -Thanks - -Zhang Chen - - -On 03/21/2017 03:27 PM, address@hidden wrote: -ï¼ -ï¼ hi. -ï¼ -ï¼ I test the git qemu master have the same problem. 
-ï¼ -ï¼ (gdb) bt -ï¼ -ï¼ #0 qio_channel_socket_readv (ioc=0x7f65911b4e50, iov=0x7f64ef3fd880, -ï¼ niov=1, fds=0x0, nfds=0x0, errp=0x0) at io/channel-socket.c:461 -ï¼ -ï¼ #1 0x00007f658e4aa0c2 in qio_channel_read -ï¼ (address@hidden, address@hidden "", -ï¼ address@hidden, address@hidden) at io/channel.c:114 -ï¼ -ï¼ #2 0x00007f658e3ea990 in channel_get_buffer (opaque=ï¼optimized outï¼, -ï¼ buf=0x7f65907cb838 "", pos=ï¼optimized outï¼, size=32768) at -ï¼ migration/qemu-file-channel.c:78 -ï¼ -ï¼ #3 0x00007f658e3e97fc in qemu_fill_buffer (f=0x7f65907cb800) at -ï¼ migration/qemu-file.c:295 -ï¼ -ï¼ #4 0x00007f658e3ea2e1 in qemu_peek_byte (address@hidden, -ï¼ address@hidden) at migration/qemu-file.c:555 -ï¼ -ï¼ #5 0x00007f658e3ea34b in qemu_get_byte (address@hidden) at -ï¼ migration/qemu-file.c:568 -ï¼ -ï¼ #6 0x00007f658e3ea552 in qemu_get_be32 (address@hidden) at -ï¼ migration/qemu-file.c:648 -ï¼ -ï¼ #7 0x00007f658e3e66e5 in colo_receive_message (f=0x7f65907cb800, -ï¼ address@hidden) at migration/colo.c:244 -ï¼ -ï¼ #8 0x00007f658e3e681e in colo_receive_check_message (f=ï¼optimized -ï¼ outï¼, address@hidden, -ï¼ address@hidden) -ï¼ -ï¼ at migration/colo.c:264 -ï¼ -ï¼ #9 0x00007f658e3e740e in colo_process_incoming_thread -ï¼ (opaque=0x7f658eb30360 ï¼mis_current.31286ï¼) at migration/colo.c:577 -ï¼ -ï¼ #10 0x00007f658be09df3 in start_thread () from /lib64/libpthread.so.0 -ï¼ -ï¼ #11 0x00007f65881983ed in clone () from /lib64/libc.so.6 -ï¼ -ï¼ (gdb) p ioc-ï¼name -ï¼ -ï¼ $2 = 0x7f658ff7d5c0 "migration-socket-incoming" -ï¼ -ï¼ (gdb) p ioc-ï¼features Do not support QIO_CHANNEL_FEATURE_SHUTDOWN -ï¼ -ï¼ $3 = 0 -ï¼ -ï¼ -ï¼ (gdb) bt -ï¼ -ï¼ #0 socket_accept_incoming_migration (ioc=0x7fdcceeafa90, -ï¼ condition=G_IO_IN, opaque=0x7fdcceeafa90) at migration/socket.c:137 -ï¼ -ï¼ #1 0x00007fdcc6966350 in g_main_dispatch (context=ï¼optimized outï¼) at -ï¼ gmain.c:3054 -ï¼ -ï¼ #2 g_main_context_dispatch (context=ï¼optimized outï¼, -ï¼ address@hidden) at gmain.c:3630 -ï¼ -ï¼ #3 0x00007fdccb8a6dcc in glib_pollfds_poll () at util/main-loop.c:213 -ï¼ -ï¼ #4 os_host_main_loop_wait (timeout=ï¼optimized outï¼) at -ï¼ util/main-loop.c:258 -ï¼ -ï¼ #5 main_loop_wait (address@hidden) at -ï¼ util/main-loop.c:506 -ï¼ -ï¼ #6 0x00007fdccb526187 in main_loop () at vl.c:1898 -ï¼ -ï¼ #7 main (argc=ï¼optimized outï¼, argv=ï¼optimized outï¼, envp=ï¼optimized -ï¼ outï¼) at vl.c:4709 -ï¼ -ï¼ (gdb) p ioc-ï¼features -ï¼ -ï¼ $1 = 6 -ï¼ -ï¼ (gdb) p ioc-ï¼name -ï¼ -ï¼ $2 = 0x7fdcce1b1ab0 "migration-socket-listener" -ï¼ -ï¼ -ï¼ May be socket_accept_incoming_migration should -ï¼ call qio_channel_set_feature(ioc, QIO_CHANNEL_FEATURE_SHUTDOWN)?? -ï¼ -ï¼ -ï¼ thank you. -ï¼ -ï¼ -ï¼ -ï¼ -ï¼ -ï¼ åå§é®ä»¶ -ï¼ address@hidden -ï¼ address@hidden -ï¼ address@hidden@huawei.comï¼ -ï¼ *æ¥ æ ï¼*2017å¹´03æ16æ¥ 14:46 -ï¼ *主 é¢ ï¼**Re: [Qemu-devel] COLO failover hang* -ï¼ -ï¼ -ï¼ -ï¼ -ï¼ On 03/15/2017 05:06 PM, wangguang wrote: -ï¼ ï¼ am testing QEMU COLO feature described here [QEMU -ï¼ ï¼ Wiki]( -http://wiki.qemu-project.org/Features/COLO -). -ï¼ ï¼ -ï¼ ï¼ When the Primary Node panic,the Secondary Node qemu hang. -ï¼ ï¼ hang at recvmsg in qio_channel_socket_readv. -ï¼ ï¼ And I run { 'execute': 'nbd-server-stop' } and { "execute": -ï¼ ï¼ "x-colo-lost-heartbeat" } in Secondary VM's -ï¼ ï¼ monitor,the Secondary Node qemu still hang at recvmsg . -ï¼ ï¼ -ï¼ ï¼ I found that the colo in qemu is not complete yet. -ï¼ ï¼ Do the colo have any plan for development? -ï¼ -ï¼ Yes, We are developing. You can see some of patch we pushing. 
-ï¼ -ï¼ ï¼ Has anyone ever run it successfully? Any help is appreciated! -ï¼ -ï¼ In our internal version can run it successfully, -ï¼ The failover detail you can ask Zhanghailiang for help. -ï¼ Next time if you have some question about COLO, -ï¼ please cc me and zhanghailiang address@hidden -ï¼ -ï¼ -ï¼ Thanks -ï¼ Zhang Chen -ï¼ -ï¼ -ï¼ ï¼ -ï¼ ï¼ -ï¼ ï¼ -ï¼ ï¼ centos7.2+qemu2.7.50 -ï¼ ï¼ (gdb) bt -ï¼ ï¼ #0 0x00007f3e00cc86ad in recvmsg () from /lib64/libpthread.so.0 -ï¼ ï¼ #1 0x00007f3e0332b738 in qio_channel_socket_readv (ioc=ï¼optimized outï¼, -ï¼ ï¼ iov=ï¼optimized outï¼, niov=ï¼optimized outï¼, fds=0x0, nfds=0x0, errp=0x0) at -ï¼ ï¼ io/channel-socket.c:497 -ï¼ ï¼ #2 0x00007f3e03329472 in qio_channel_read (address@hidden, -ï¼ ï¼ address@hidden "", address@hidden, -ï¼ ï¼ address@hidden) at io/channel.c:97 -ï¼ ï¼ #3 0x00007f3e032750e0 in channel_get_buffer (opaque=ï¼optimized outï¼, -ï¼ ï¼ buf=0x7f3e05910f38 "", pos=ï¼optimized outï¼, size=32768) at -ï¼ ï¼ migration/qemu-file-channel.c:78 -ï¼ ï¼ #4 0x00007f3e0327412c in qemu_fill_buffer (f=0x7f3e05910f00) at -ï¼ ï¼ migration/qemu-file.c:257 -ï¼ ï¼ #5 0x00007f3e03274a41 in qemu_peek_byte (address@hidden, -ï¼ ï¼ address@hidden) at migration/qemu-file.c:510 -ï¼ ï¼ #6 0x00007f3e03274aab in qemu_get_byte (address@hidden) at -ï¼ ï¼ migration/qemu-file.c:523 -ï¼ ï¼ #7 0x00007f3e03274cb2 in qemu_get_be32 (address@hidden) at -ï¼ ï¼ migration/qemu-file.c:603 -ï¼ ï¼ #8 0x00007f3e03271735 in colo_receive_message (f=0x7f3e05910f00, -ï¼ ï¼ address@hidden) at migration/colo.c:215 -ï¼ ï¼ #9 0x00007f3e0327250d in colo_wait_handle_message (errp=0x7f3d62bfaa48, -ï¼ ï¼ checkpoint_request=ï¼synthetic pointerï¼, f=ï¼optimized outï¼) at -ï¼ ï¼ migration/colo.c:546 -ï¼ ï¼ #10 colo_process_incoming_thread (opaque=0x7f3e067245e0) at -ï¼ ï¼ migration/colo.c:649 -ï¼ ï¼ #11 0x00007f3e00cc1df3 in start_thread () from /lib64/libpthread.so.0 -ï¼ ï¼ #12 0x00007f3dfc9c03ed in clone () from /lib64/libc.so.6 -ï¼ ï¼ -ï¼ ï¼ -ï¼ ï¼ -ï¼ ï¼ -ï¼ ï¼ -ï¼ ï¼ -- -ï¼ ï¼ View this message in context: -http://qemu.11.n7.nabble.com/COLO-failover-hang-tp473250.html -ï¼ ï¼ Sent from the Developer mailing list archive at Nabble.com. -ï¼ ï¼ -ï¼ ï¼ -ï¼ ï¼ -ï¼ ï¼ -ï¼ -ï¼ -- -ï¼ Thanks -ï¼ Zhang Chen -ï¼ -ï¼ -ï¼ -ï¼ -ï¼ --- -Dr. David Alan Gilbert / address@hidden / Manchester, UK - -. - -* Hailiang Zhang (address@hidden) wrote: -> -On 2017/3/21 19:56, Dr. David Alan Gilbert wrote: -> -> * Hailiang Zhang (address@hidden) wrote: -> -> > Hi, -> -> > -> -> > Thanks for reporting this, and i confirmed it in my test, and it is a bug. -> -> > -> -> > Though we tried to call qemu_file_shutdown() to shutdown the related fd, -> -> > in -> -> > case COLO thread/incoming thread is stuck in read/write() while do -> -> > failover, -> -> > but it didn't take effect, because all the fd used by COLO (also -> -> > migration) -> -> > has been wrapped by qio channel, and it will not call the shutdown API if -> -> > we didn't qio_channel_set_feature(QIO_CHANNEL(sioc), -> -> > QIO_CHANNEL_FEATURE_SHUTDOWN). -> -> > -> -> > Cc: Dr. David Alan Gilbert <address@hidden> -> -> > -> -> > I doubted migration cancel has the same problem, it may be stuck in -> -> > write() -> -> > if we tried to cancel migration. -> -> > -> -> > void fd_start_outgoing_migration(MigrationState *s, const char *fdname, -> -> > Error **errp) -> -> > { -> -> > qio_channel_set_name(QIO_CHANNEL(ioc), "migration-fd-outgoing"); -> -> > migration_channel_connect(s, ioc, NULL); -> -> > ... ... 
-> -> > We didn't call qio_channel_set_feature(QIO_CHANNEL(sioc), -> -> > QIO_CHANNEL_FEATURE_SHUTDOWN) above, -> -> > and the -> -> > migrate_fd_cancel() -> -> > { -> -> > ... ... -> -> > if (s->state == MIGRATION_STATUS_CANCELLING && f) { -> -> > qemu_file_shutdown(f); --> This will not take effect. No ? -> -> > } -> -> > } -> -> -> -> (cc'd in Daniel Berrange). -> -> I see that we call qio_channel_set_feature(ioc, -> -> QIO_CHANNEL_FEATURE_SHUTDOWN); at the -> -> top of qio_channel_socket_new; so I think that's safe isn't it? -> -> -> -> -Hmm, you are right, this problem is only exist for the migration incoming fd, -> -thanks. -Yes, and I don't think we normally do a cancel on the incoming side of a -migration. - -Dave - -> -> Dave -> -> -> -> > Thanks, -> -> > Hailiang -> -> > -> -> > On 2017/3/21 16:10, address@hidden wrote: -> -> > > Thank youã -> -> > > -> -> > > I have test areadyã -> -> > > -> -> > > When the Primary Node panic,the Secondary Node qemu hang at the same -> -> > > placeã -> -> > > -> -> > > Incorrding -http://wiki.qemu-project.org/Features/COLO -ï¼kill Primary -> -> > > Node qemu will not produce the problem,but Primary Node panic canã -> -> > > -> -> > > I think due to the feature of channel does not support -> -> > > QIO_CHANNEL_FEATURE_SHUTDOWN. -> -> > > -> -> > > -> -> > > when failover,channel_shutdown could not shut down the channel. -> -> > > -> -> > > -> -> > > so the colo_process_incoming_thread will hang at recvmsg. -> -> > > -> -> > > -> -> > > I test a patch: -> -> > > -> -> > > -> -> > > diff --git a/migration/socket.c b/migration/socket.c -> -> > > -> -> > > -> -> > > index 13966f1..d65a0ea 100644 -> -> > > -> -> > > -> -> > > --- a/migration/socket.c -> -> > > -> -> > > -> -> > > +++ b/migration/socket.c -> -> > > -> -> > > -> -> > > @@ -147,8 +147,9 @@ static gboolean -> -> > > socket_accept_incoming_migration(QIOChannel *ioc, -> -> > > -> -> > > -> -> > > } -> -> > > -> -> > > -> -> > > -> -> > > -> -> > > -> -> > > trace_migration_socket_incoming_accepted() -> -> > > -> -> > > -> -> > > -> -> > > -> -> > > -> -> > > qio_channel_set_name(QIO_CHANNEL(sioc), -> -> > > "migration-socket-incoming") -> -> > > -> -> > > -> -> > > + qio_channel_set_feature(QIO_CHANNEL(sioc), -> -> > > QIO_CHANNEL_FEATURE_SHUTDOWN) -> -> > > -> -> > > -> -> > > migration_channel_process_incoming(migrate_get_current(), -> -> > > -> -> > > -> -> > > QIO_CHANNEL(sioc)) -> -> > > -> -> > > -> -> > > object_unref(OBJECT(sioc)) -> -> > > -> -> > > -> -> > > -> -> > > -> -> > > My test will not hang any more. -> -> > > -> -> > > -> -> > > -> -> > > -> -> > > -> -> > > -> -> > > -> -> > > -> -> > > -> -> > > -> -> > > -> -> > > -> -> > > -> -> > > -> -> > > -> -> > > -> -> > > -> -> > > åå§é®ä»¶ -> -> > > -> -> > > -> -> > > -> -> > > åä»¶äººï¼ address@hidden -> -> > > æ¶ä»¶äººï¼ç广10165992 address@hidden -> -> > > æéäººï¼ address@hidden address@hidden -> -> > > æ¥ æ ï¼2017å¹´03æ21æ¥ 15:58 -> -> > > 主 é¢ ï¼Re: [Qemu-devel] çå¤: Re: [BUG]COLO failover hang -> -> > > -> -> > > -> -> > > -> -> > > -> -> > > -> -> > > Hi,Wang. -> -> > > -> -> > > You can test this branch: -> -> > > -> -> > > -https://github.com/coloft/qemu/tree/colo-v5.1-developing-COLO-frame-v21-with-shared-disk -> -> > > -> -> > > and please follow wiki ensure your own configuration correctly. 
-> -> > > -> -> > > -http://wiki.qemu-project.org/Features/COLO -> -> > > -> -> > > -> -> > > Thanks -> -> > > -> -> > > Zhang Chen -> -> > > -> -> > > -> -> > > On 03/21/2017 03:27 PM, address@hidden wrote: -> -> > > ï¼ -> -> > > ï¼ hi. -> -> > > ï¼ -> -> > > ï¼ I test the git qemu master have the same problem. -> -> > > ï¼ -> -> > > ï¼ (gdb) bt -> -> > > ï¼ -> -> > > ï¼ #0 qio_channel_socket_readv (ioc=0x7f65911b4e50, iov=0x7f64ef3fd880, -> -> > > ï¼ niov=1, fds=0x0, nfds=0x0, errp=0x0) at io/channel-socket.c:461 -> -> > > ï¼ -> -> > > ï¼ #1 0x00007f658e4aa0c2 in qio_channel_read -> -> > > ï¼ (address@hidden, address@hidden "", -> -> > > ï¼ address@hidden, address@hidden) at io/channel.c:114 -> -> > > ï¼ -> -> > > ï¼ #2 0x00007f658e3ea990 in channel_get_buffer (opaque=ï¼optimized outï¼, -> -> > > ï¼ buf=0x7f65907cb838 "", pos=ï¼optimized outï¼, size=32768) at -> -> > > ï¼ migration/qemu-file-channel.c:78 -> -> > > ï¼ -> -> > > ï¼ #3 0x00007f658e3e97fc in qemu_fill_buffer (f=0x7f65907cb800) at -> -> > > ï¼ migration/qemu-file.c:295 -> -> > > ï¼ -> -> > > ï¼ #4 0x00007f658e3ea2e1 in qemu_peek_byte (address@hidden, -> -> > > ï¼ address@hidden) at migration/qemu-file.c:555 -> -> > > ï¼ -> -> > > ï¼ #5 0x00007f658e3ea34b in qemu_get_byte (address@hidden) at -> -> > > ï¼ migration/qemu-file.c:568 -> -> > > ï¼ -> -> > > ï¼ #6 0x00007f658e3ea552 in qemu_get_be32 (address@hidden) at -> -> > > ï¼ migration/qemu-file.c:648 -> -> > > ï¼ -> -> > > ï¼ #7 0x00007f658e3e66e5 in colo_receive_message (f=0x7f65907cb800, -> -> > > ï¼ address@hidden) at migration/colo.c:244 -> -> > > ï¼ -> -> > > ï¼ #8 0x00007f658e3e681e in colo_receive_check_message (f=ï¼optimized -> -> > > ï¼ outï¼, address@hidden, -> -> > > ï¼ address@hidden) -> -> > > ï¼ -> -> > > ï¼ at migration/colo.c:264 -> -> > > ï¼ -> -> > > ï¼ #9 0x00007f658e3e740e in colo_process_incoming_thread -> -> > > ï¼ (opaque=0x7f658eb30360 ï¼mis_current.31286ï¼) at migration/colo.c:577 -> -> > > ï¼ -> -> > > ï¼ #10 0x00007f658be09df3 in start_thread () from /lib64/libpthread.so.0 -> -> > > ï¼ -> -> > > ï¼ #11 0x00007f65881983ed in clone () from /lib64/libc.so.6 -> -> > > ï¼ -> -> > > ï¼ (gdb) p ioc-ï¼name -> -> > > ï¼ -> -> > > ï¼ $2 = 0x7f658ff7d5c0 "migration-socket-incoming" -> -> > > ï¼ -> -> > > ï¼ (gdb) p ioc-ï¼features Do not support -> -> > > QIO_CHANNEL_FEATURE_SHUTDOWN -> -> > > ï¼ -> -> > > ï¼ $3 = 0 -> -> > > ï¼ -> -> > > ï¼ -> -> > > ï¼ (gdb) bt -> -> > > ï¼ -> -> > > ï¼ #0 socket_accept_incoming_migration (ioc=0x7fdcceeafa90, -> -> > > ï¼ condition=G_IO_IN, opaque=0x7fdcceeafa90) at migration/socket.c:137 -> -> > > ï¼ -> -> > > ï¼ #1 0x00007fdcc6966350 in g_main_dispatch (context=ï¼optimized outï¼) at -> -> > > ï¼ gmain.c:3054 -> -> > > ï¼ -> -> > > ï¼ #2 g_main_context_dispatch (context=ï¼optimized outï¼, -> -> > > ï¼ address@hidden) at gmain.c:3630 -> -> > > ï¼ -> -> > > ï¼ #3 0x00007fdccb8a6dcc in glib_pollfds_poll () at util/main-loop.c:213 -> -> > > ï¼ -> -> > > ï¼ #4 os_host_main_loop_wait (timeout=ï¼optimized outï¼) at -> -> > > ï¼ util/main-loop.c:258 -> -> > > ï¼ -> -> > > ï¼ #5 main_loop_wait (address@hidden) at -> -> > > ï¼ util/main-loop.c:506 -> -> > > ï¼ -> -> > > ï¼ #6 0x00007fdccb526187 in main_loop () at vl.c:1898 -> -> > > ï¼ -> -> > > ï¼ #7 main (argc=ï¼optimized outï¼, argv=ï¼optimized outï¼, envp=ï¼optimized -> -> > > ï¼ outï¼) at vl.c:4709 -> -> > > ï¼ -> -> > > ï¼ (gdb) p ioc-ï¼features -> -> > > ï¼ -> -> > > ï¼ $1 = 6 -> -> > > ï¼ -> -> > > ï¼ (gdb) p ioc-ï¼name -> -> > > ï¼ -> -> > > ï¼ $2 = 0x7fdcce1b1ab0 
"migration-socket-listener" -> -> > > ï¼ -> -> > > ï¼ -> -> > > ï¼ May be socket_accept_incoming_migration should -> -> > > ï¼ call qio_channel_set_feature(ioc, QIO_CHANNEL_FEATURE_SHUTDOWN)?? -> -> > > ï¼ -> -> > > ï¼ -> -> > > ï¼ thank you. -> -> > > ï¼ -> -> > > ï¼ -> -> > > ï¼ -> -> > > ï¼ -> -> > > ï¼ -> -> > > ï¼ åå§é®ä»¶ -> -> > > ï¼ address@hidden -> -> > > ï¼ address@hidden -> -> > > ï¼ address@hidden@huawei.comï¼ -> -> > > ï¼ *æ¥ æ ï¼*2017å¹´03æ16æ¥ 14:46 -> -> > > ï¼ *主 é¢ ï¼**Re: [Qemu-devel] COLO failover hang* -> -> > > ï¼ -> -> > > ï¼ -> -> > > ï¼ -> -> > > ï¼ -> -> > > ï¼ On 03/15/2017 05:06 PM, wangguang wrote: -> -> > > ï¼ ï¼ am testing QEMU COLO feature described here [QEMU -> -> > > ï¼ ï¼ Wiki]( -http://wiki.qemu-project.org/Features/COLO -). -> -> > > ï¼ ï¼ -> -> > > ï¼ ï¼ When the Primary Node panic,the Secondary Node qemu hang. -> -> > > ï¼ ï¼ hang at recvmsg in qio_channel_socket_readv. -> -> > > ï¼ ï¼ And I run { 'execute': 'nbd-server-stop' } and { "execute": -> -> > > ï¼ ï¼ "x-colo-lost-heartbeat" } in Secondary VM's -> -> > > ï¼ ï¼ monitor,the Secondary Node qemu still hang at recvmsg . -> -> > > ï¼ ï¼ -> -> > > ï¼ ï¼ I found that the colo in qemu is not complete yet. -> -> > > ï¼ ï¼ Do the colo have any plan for development? -> -> > > ï¼ -> -> > > ï¼ Yes, We are developing. You can see some of patch we pushing. -> -> > > ï¼ -> -> > > ï¼ ï¼ Has anyone ever run it successfully? Any help is appreciated! -> -> > > ï¼ -> -> > > ï¼ In our internal version can run it successfully, -> -> > > ï¼ The failover detail you can ask Zhanghailiang for help. -> -> > > ï¼ Next time if you have some question about COLO, -> -> > > ï¼ please cc me and zhanghailiang address@hidden -> -> > > ï¼ -> -> > > ï¼ -> -> > > ï¼ Thanks -> -> > > ï¼ Zhang Chen -> -> > > ï¼ -> -> > > ï¼ -> -> > > ï¼ ï¼ -> -> > > ï¼ ï¼ -> -> > > ï¼ ï¼ -> -> > > ï¼ ï¼ centos7.2+qemu2.7.50 -> -> > > ï¼ ï¼ (gdb) bt -> -> > > ï¼ ï¼ #0 0x00007f3e00cc86ad in recvmsg () from /lib64/libpthread.so.0 -> -> > > ï¼ ï¼ #1 0x00007f3e0332b738 in qio_channel_socket_readv (ioc=ï¼optimized -> -> > > outï¼, -> -> > > ï¼ ï¼ iov=ï¼optimized outï¼, niov=ï¼optimized outï¼, fds=0x0, nfds=0x0, -> -> > > errp=0x0) at -> -> > > ï¼ ï¼ io/channel-socket.c:497 -> -> > > ï¼ ï¼ #2 0x00007f3e03329472 in qio_channel_read (address@hidden, -> -> > > ï¼ ï¼ address@hidden "", address@hidden, -> -> > > ï¼ ï¼ address@hidden) at io/channel.c:97 -> -> > > ï¼ ï¼ #3 0x00007f3e032750e0 in channel_get_buffer (opaque=ï¼optimized -> -> > > outï¼, -> -> > > ï¼ ï¼ buf=0x7f3e05910f38 "", pos=ï¼optimized outï¼, size=32768) at -> -> > > ï¼ ï¼ migration/qemu-file-channel.c:78 -> -> > > ï¼ ï¼ #4 0x00007f3e0327412c in qemu_fill_buffer (f=0x7f3e05910f00) at -> -> > > ï¼ ï¼ migration/qemu-file.c:257 -> -> > > ï¼ ï¼ #5 0x00007f3e03274a41 in qemu_peek_byte (address@hidden, -> -> > > ï¼ ï¼ address@hidden) at migration/qemu-file.c:510 -> -> > > ï¼ ï¼ #6 0x00007f3e03274aab in qemu_get_byte (address@hidden) at -> -> > > ï¼ ï¼ migration/qemu-file.c:523 -> -> > > ï¼ ï¼ #7 0x00007f3e03274cb2 in qemu_get_be32 (address@hidden) at -> -> > > ï¼ ï¼ migration/qemu-file.c:603 -> -> > > ï¼ ï¼ #8 0x00007f3e03271735 in colo_receive_message (f=0x7f3e05910f00, -> -> > > ï¼ ï¼ address@hidden) at migration/colo.c:215 -> -> > > ï¼ ï¼ #9 0x00007f3e0327250d in colo_wait_handle_message -> -> > > (errp=0x7f3d62bfaa48, -> -> > > ï¼ ï¼ checkpoint_request=ï¼synthetic pointerï¼, f=ï¼optimized outï¼) at -> -> > > ï¼ ï¼ migration/colo.c:546 -> -> > > ï¼ ï¼ #10 colo_process_incoming_thread 
(opaque=0x7f3e067245e0) at -> -> > > ï¼ ï¼ migration/colo.c:649 -> -> > > ï¼ ï¼ #11 0x00007f3e00cc1df3 in start_thread () from -> -> > > /lib64/libpthread.so.0 -> -> > > ï¼ ï¼ #12 0x00007f3dfc9c03ed in clone () from /lib64/libc.so.6 -> -> > > ï¼ ï¼ -> -> > > ï¼ ï¼ -> -> > > ï¼ ï¼ -> -> > > ï¼ ï¼ -> -> > > ï¼ ï¼ -> -> > > ï¼ ï¼ -- -> -> > > ï¼ ï¼ View this message in context: -> -> > > -http://qemu.11.n7.nabble.com/COLO-failover-hang-tp473250.html -> -> > > ï¼ ï¼ Sent from the Developer mailing list archive at Nabble.com. -> -> > > ï¼ ï¼ -> -> > > ï¼ ï¼ -> -> > > ï¼ ï¼ -> -> > > ï¼ ï¼ -> -> > > ï¼ -> -> > > ï¼ -- -> -> > > ï¼ Thanks -> -> > > ï¼ Zhang Chen -> -> > > ï¼ -> -> > > ï¼ -> -> > > ï¼ -> -> > > ï¼ -> -> > > ï¼ -> -> > > -> -> > -> -> -- -> -> Dr. David Alan Gilbert / address@hidden / Manchester, UK -> -> -> -> . -> -> -> --- -Dr. David Alan Gilbert / address@hidden / Manchester, UK - diff --git a/results/classifier/013/risc-v/70294255 b/results/classifier/013/risc-v/70294255 deleted file mode 100644 index a19c119c6..000000000 --- a/results/classifier/013/risc-v/70294255 +++ /dev/null @@ -1,1089 +0,0 @@ -risc-v: 0.863 -mistranslation: 0.862 -assembly: 0.861 -PID: 0.859 -semantic: 0.858 -socket: 0.858 -device: 0.857 -user-level: 0.857 -graphic: 0.857 -arm: 0.856 -debug: 0.854 -permissions: 0.854 -architecture: 0.851 -system: 0.851 -performance: 0.850 -kernel: 0.848 -network: 0.846 -operating system: 0.844 -register: 0.842 -vnc: 0.837 -alpha: 0.834 -files: 0.832 -virtual: 0.832 -hypervisor: 0.828 -peripherals: 0.819 -boot: 0.811 -i386: 0.811 -KVM: 0.806 -x86: 0.803 -ppc: 0.800 -TCG: 0.792 -VMM: 0.784 - -[Qemu-devel] 答复: Re: 答复: Re: 答复: Re: 答复: Re: [BUG]COLO failover hang - -hi: - -yes.it is better. - -And should we delete - - - - -#ifdef WIN32 - - QIO_CHANNEL(cioc)-ï¼event = CreateEvent(NULL, FALSE, FALSE, NULL) - -#endif - - - - -in qio_channel_socket_acceptï¼ - -qio_channel_socket_new already have it. - - - - - - - - - - - - -åå§é®ä»¶ - - - -åä»¶äººï¼ address@hidden -æ¶ä»¶äººï¼ç广10165992 -æéäººï¼ address@hidden address@hidden address@hidden address@hidden -æ¥ æ ï¼2017å¹´03æ22æ¥ 15:03 -主 é¢ ï¼Re: [Qemu-devel] çå¤: Re: çå¤: Re: çå¤: Re: [BUG]COLO failover hang - - - - - -Hi, - -On 2017/3/22 9:42, address@hidden wrote: -ï¼ diff --git a/migration/socket.c b/migration/socket.c -ï¼ -ï¼ -ï¼ index 13966f1..d65a0ea 100644 -ï¼ -ï¼ -ï¼ --- a/migration/socket.c -ï¼ -ï¼ -ï¼ +++ b/migration/socket.c -ï¼ -ï¼ -ï¼ @@ -147,8 +147,9 @@ static gboolean -socket_accept_incoming_migration(QIOChannel *ioc, -ï¼ -ï¼ -ï¼ } -ï¼ -ï¼ -ï¼ -ï¼ -ï¼ -ï¼ trace_migration_socket_incoming_accepted() -ï¼ -ï¼ -ï¼ -ï¼ -ï¼ -ï¼ qio_channel_set_name(QIO_CHANNEL(sioc), "migration-socket-incoming") -ï¼ -ï¼ -ï¼ + qio_channel_set_feature(QIO_CHANNEL(sioc), QIO_CHANNEL_FEATURE_SHUTDOWN) -ï¼ -ï¼ -ï¼ migration_channel_process_incoming(migrate_get_current(), -ï¼ -ï¼ -ï¼ QIO_CHANNEL(sioc)) -ï¼ -ï¼ -ï¼ object_unref(OBJECT(sioc)) -ï¼ -ï¼ -ï¼ -ï¼ -ï¼ Is this patch ok? 
-ï¼ - -Yes, i think this works, but a better way maybe to call -qio_channel_set_feature() -in qio_channel_socket_accept(), we didn't set the SHUTDOWN feature for the -socket accept fd, -Or fix it by this: - -diff --git a/io/channel-socket.c b/io/channel-socket.c -index f546c68..ce6894c 100644 ---- a/io/channel-socket.c -+++ b/io/channel-socket.c -@@ -330,9 +330,8 @@ qio_channel_socket_accept(QIOChannelSocket *ioc, - Error **errp) - { - QIOChannelSocket *cioc -- -- cioc = QIO_CHANNEL_SOCKET(object_new(TYPE_QIO_CHANNEL_SOCKET)) -- cioc-ï¼fd = -1 -+ -+ cioc = qio_channel_socket_new() - cioc-ï¼remoteAddrLen = sizeof(ioc-ï¼remoteAddr) - cioc-ï¼localAddrLen = sizeof(ioc-ï¼localAddr) - - -Thanks, -Hailiang - -ï¼ I have test it . The test could not hang any more. -ï¼ -ï¼ -ï¼ -ï¼ -ï¼ -ï¼ -ï¼ -ï¼ -ï¼ -ï¼ -ï¼ -ï¼ -ï¼ åå§é®ä»¶ -ï¼ -ï¼ -ï¼ -ï¼ åä»¶äººï¼ address@hidden -ï¼ æ¶ä»¶äººï¼ address@hidden address@hidden -ï¼ æéäººï¼ address@hidden address@hidden address@hidden -ï¼ æ¥ æ ï¼2017å¹´03æ22æ¥ 09:11 -ï¼ ä¸» é¢ ï¼Re: [Qemu-devel] çå¤: Re: çå¤: Re: [BUG]COLO failover hang -ï¼ -ï¼ -ï¼ -ï¼ -ï¼ -ï¼ On 2017/3/21 19:56, Dr. David Alan Gilbert wrote: -ï¼ ï¼ * Hailiang Zhang (address@hidden) wrote: -ï¼ ï¼ï¼ Hi, -ï¼ ï¼ï¼ -ï¼ ï¼ï¼ Thanks for reporting this, and i confirmed it in my test, and it is a bug. -ï¼ ï¼ï¼ -ï¼ ï¼ï¼ Though we tried to call qemu_file_shutdown() to shutdown the related fd, in -ï¼ ï¼ï¼ case COLO thread/incoming thread is stuck in read/write() while do -failover, -ï¼ ï¼ï¼ but it didn't take effect, because all the fd used by COLO (also migration) -ï¼ ï¼ï¼ has been wrapped by qio channel, and it will not call the shutdown API if -ï¼ ï¼ï¼ we didn't qio_channel_set_feature(QIO_CHANNEL(sioc), -QIO_CHANNEL_FEATURE_SHUTDOWN). -ï¼ ï¼ï¼ -ï¼ ï¼ï¼ Cc: Dr. David Alan Gilbert address@hidden -ï¼ ï¼ï¼ -ï¼ ï¼ï¼ I doubted migration cancel has the same problem, it may be stuck in write() -ï¼ ï¼ï¼ if we tried to cancel migration. -ï¼ ï¼ï¼ -ï¼ ï¼ï¼ void fd_start_outgoing_migration(MigrationState *s, const char *fdname, -Error **errp) -ï¼ ï¼ï¼ { -ï¼ ï¼ï¼ qio_channel_set_name(QIO_CHANNEL(ioc), "migration-fd-outgoing") -ï¼ ï¼ï¼ migration_channel_connect(s, ioc, NULL) -ï¼ ï¼ï¼ ... ... -ï¼ ï¼ï¼ We didn't call qio_channel_set_feature(QIO_CHANNEL(sioc), -QIO_CHANNEL_FEATURE_SHUTDOWN) above, -ï¼ ï¼ï¼ and the -ï¼ ï¼ï¼ migrate_fd_cancel() -ï¼ ï¼ï¼ { -ï¼ ï¼ï¼ ... ... -ï¼ ï¼ï¼ if (s-ï¼state == MIGRATION_STATUS_CANCELLING && f) { -ï¼ ï¼ï¼ qemu_file_shutdown(f) --ï¼ This will not take effect. No ? -ï¼ ï¼ï¼ } -ï¼ ï¼ï¼ } -ï¼ ï¼ -ï¼ ï¼ (cc'd in Daniel Berrange). -ï¼ ï¼ I see that we call qio_channel_set_feature(ioc, -QIO_CHANNEL_FEATURE_SHUTDOWN) at the -ï¼ ï¼ top of qio_channel_socket_new so I think that's safe isn't it? -ï¼ ï¼ -ï¼ -ï¼ Hmm, you are right, this problem is only exist for the migration incoming fd, -thanks. -ï¼ -ï¼ ï¼ Dave -ï¼ ï¼ -ï¼ ï¼ï¼ Thanks, -ï¼ ï¼ï¼ Hailiang -ï¼ ï¼ï¼ -ï¼ ï¼ï¼ On 2017/3/21 16:10, address@hidden wrote: -ï¼ ï¼ï¼ï¼ Thank youã -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ I have test areadyã -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ When the Primary Node panic,the Secondary Node qemu hang at the same -placeã -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ Incorrding -http://wiki.qemu-project.org/Features/COLO -ï¼kill Primary Node -qemu will not produce the problem,but Primary Node panic canã -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ I think due to the feature of channel does not support -QIO_CHANNEL_FEATURE_SHUTDOWN. -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ when failover,channel_shutdown could not shut down the channel. 
-ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ so the colo_process_incoming_thread will hang at recvmsg. -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ I test a patch: -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ diff --git a/migration/socket.c b/migration/socket.c -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ index 13966f1..d65a0ea 100644 -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ --- a/migration/socket.c -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ +++ b/migration/socket.c -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ @@ -147,8 +147,9 @@ static gboolean -socket_accept_incoming_migration(QIOChannel *ioc, -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ } -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ trace_migration_socket_incoming_accepted() -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ qio_channel_set_name(QIO_CHANNEL(sioc), -"migration-socket-incoming") -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ + qio_channel_set_feature(QIO_CHANNEL(sioc), -QIO_CHANNEL_FEATURE_SHUTDOWN) -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ migration_channel_process_incoming(migrate_get_current(), -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ QIO_CHANNEL(sioc)) -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ object_unref(OBJECT(sioc)) -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ My test will not hang any more. -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ åå§é®ä»¶ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ åä»¶äººï¼ address@hidden -ï¼ ï¼ï¼ï¼ æ¶ä»¶äººï¼ç广10165992 address@hidden -ï¼ ï¼ï¼ï¼ æéäººï¼ address@hidden address@hidden -ï¼ ï¼ï¼ï¼ æ¥ æ ï¼2017å¹´03æ21æ¥ 15:58 -ï¼ ï¼ï¼ï¼ 主 é¢ ï¼Re: [Qemu-devel] çå¤: Re: [BUG]COLO failover hang -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ Hi,Wang. -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ You can test this branch: -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -https://github.com/coloft/qemu/tree/colo-v5.1-developing-COLO-frame-v21-with-shared-disk -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ and please follow wiki ensure your own configuration correctly. -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -http://wiki.qemu-project.org/Features/COLO -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ Thanks -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ Zhang Chen -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ -ï¼ ï¼ï¼ï¼ On 03/21/2017 03:27 PM, address@hidden wrote: -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ hi. -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ I test the git qemu master have the same problem. 
-ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ (gdb) bt -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ #0 qio_channel_socket_readv (ioc=0x7f65911b4e50, iov=0x7f64ef3fd880, -ï¼ ï¼ï¼ï¼ ï¼ niov=1, fds=0x0, nfds=0x0, errp=0x0) at io/channel-socket.c:461 -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ #1 0x00007f658e4aa0c2 in qio_channel_read -ï¼ ï¼ï¼ï¼ ï¼ (address@hidden, address@hidden "", -ï¼ ï¼ï¼ï¼ ï¼ address@hidden, address@hidden) at io/channel.c:114 -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ #2 0x00007f658e3ea990 in channel_get_buffer (opaque=ï¼optimized outï¼, -ï¼ ï¼ï¼ï¼ ï¼ buf=0x7f65907cb838 "", pos=ï¼optimized outï¼, size=32768) at -ï¼ ï¼ï¼ï¼ ï¼ migration/qemu-file-channel.c:78 -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ #3 0x00007f658e3e97fc in qemu_fill_buffer (f=0x7f65907cb800) at -ï¼ ï¼ï¼ï¼ ï¼ migration/qemu-file.c:295 -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ #4 0x00007f658e3ea2e1 in qemu_peek_byte (address@hidden, -ï¼ ï¼ï¼ï¼ ï¼ address@hidden) at migration/qemu-file.c:555 -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ #5 0x00007f658e3ea34b in qemu_get_byte (address@hidden) at -ï¼ ï¼ï¼ï¼ ï¼ migration/qemu-file.c:568 -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ #6 0x00007f658e3ea552 in qemu_get_be32 (address@hidden) at -ï¼ ï¼ï¼ï¼ ï¼ migration/qemu-file.c:648 -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ #7 0x00007f658e3e66e5 in colo_receive_message (f=0x7f65907cb800, -ï¼ ï¼ï¼ï¼ ï¼ address@hidden) at migration/colo.c:244 -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ #8 0x00007f658e3e681e in colo_receive_check_message (f=ï¼optimized -ï¼ ï¼ï¼ï¼ ï¼ outï¼, address@hidden, -ï¼ ï¼ï¼ï¼ ï¼ address@hidden) -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ at migration/colo.c:264 -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ #9 0x00007f658e3e740e in colo_process_incoming_thread -ï¼ ï¼ï¼ï¼ ï¼ (opaque=0x7f658eb30360 ï¼mis_current.31286ï¼) at migration/colo.c:577 -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ #10 0x00007f658be09df3 in start_thread () from /lib64/libpthread.so.0 -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ #11 0x00007f65881983ed in clone () from /lib64/libc.so.6 -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ (gdb) p ioc-ï¼name -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ $2 = 0x7f658ff7d5c0 "migration-socket-incoming" -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ (gdb) p ioc-ï¼features Do not support QIO_CHANNEL_FEATURE_SHUTDOWN -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ $3 = 0 -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ (gdb) bt -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ #0 socket_accept_incoming_migration (ioc=0x7fdcceeafa90, -ï¼ ï¼ï¼ï¼ ï¼ condition=G_IO_IN, opaque=0x7fdcceeafa90) at migration/socket.c:137 -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ #1 0x00007fdcc6966350 in g_main_dispatch (context=ï¼optimized outï¼) at -ï¼ ï¼ï¼ï¼ ï¼ gmain.c:3054 -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ #2 g_main_context_dispatch (context=ï¼optimized outï¼, -ï¼ ï¼ï¼ï¼ ï¼ address@hidden) at gmain.c:3630 -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ #3 0x00007fdccb8a6dcc in glib_pollfds_poll () at util/main-loop.c:213 -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ #4 os_host_main_loop_wait (timeout=ï¼optimized outï¼) at -ï¼ ï¼ï¼ï¼ ï¼ util/main-loop.c:258 -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ #5 main_loop_wait (address@hidden) at -ï¼ ï¼ï¼ï¼ ï¼ util/main-loop.c:506 -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ #6 0x00007fdccb526187 in main_loop () at vl.c:1898 -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ #7 main (argc=ï¼optimized outï¼, argv=ï¼optimized outï¼, envp=ï¼optimized -ï¼ ï¼ï¼ï¼ ï¼ outï¼) at vl.c:4709 -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ (gdb) p ioc-ï¼features -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ $1 = 6 -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ (gdb) p ioc-ï¼name -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ $2 = 0x7fdcce1b1ab0 "migration-socket-listener" -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ May be socket_accept_incoming_migration should -ï¼ ï¼ï¼ï¼ ï¼ call qio_channel_set_feature(ioc, QIO_CHANNEL_FEATURE_SHUTDOWN)?? -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ thank you. 
-ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ åå§é®ä»¶ -ï¼ ï¼ï¼ï¼ ï¼ address@hidden -ï¼ ï¼ï¼ï¼ ï¼ address@hidden -ï¼ ï¼ï¼ï¼ ï¼ address@hidden@huawei.comï¼ -ï¼ ï¼ï¼ï¼ ï¼ *æ¥ æ ï¼*2017å¹´03æ16æ¥ 14:46 -ï¼ ï¼ï¼ï¼ ï¼ *主 é¢ ï¼**Re: [Qemu-devel] COLO failover hang* -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ On 03/15/2017 05:06 PM, wangguang wrote: -ï¼ ï¼ï¼ï¼ ï¼ ï¼ am testing QEMU COLO feature described here [QEMU -ï¼ ï¼ï¼ï¼ ï¼ ï¼ Wiki]( -http://wiki.qemu-project.org/Features/COLO -). -ï¼ ï¼ï¼ï¼ ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ ï¼ When the Primary Node panic,the Secondary Node qemu hang. -ï¼ ï¼ï¼ï¼ ï¼ ï¼ hang at recvmsg in qio_channel_socket_readv. -ï¼ ï¼ï¼ï¼ ï¼ ï¼ And I run { 'execute': 'nbd-server-stop' } and { "execute": -ï¼ ï¼ï¼ï¼ ï¼ ï¼ "x-colo-lost-heartbeat" } in Secondary VM's -ï¼ ï¼ï¼ï¼ ï¼ ï¼ monitor,the Secondary Node qemu still hang at recvmsg . -ï¼ ï¼ï¼ï¼ ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ ï¼ I found that the colo in qemu is not complete yet. -ï¼ ï¼ï¼ï¼ ï¼ ï¼ Do the colo have any plan for development? -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ Yes, We are developing. You can see some of patch we pushing. -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ ï¼ Has anyone ever run it successfully? Any help is appreciated! -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ In our internal version can run it successfully, -ï¼ ï¼ï¼ï¼ ï¼ The failover detail you can ask Zhanghailiang for help. -ï¼ ï¼ï¼ï¼ ï¼ Next time if you have some question about COLO, -ï¼ ï¼ï¼ï¼ ï¼ please cc me and zhanghailiang address@hidden -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ Thanks -ï¼ ï¼ï¼ï¼ ï¼ Zhang Chen -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ ï¼ -ï¼ ï¼ï¼ï¼ ï¼ ï¼ centos7.2+qemu2.7.50 -ï¼ ï¼ï¼ï¼ ï¼ ï¼ (gdb) bt -ï¼ ï¼ï¼ï¼ ï¼ ï¼ #0 0x00007f3e00cc86ad in recvmsg () from /lib64/libpthread.so.0 -ï¼ ï¼ï¼ï¼ ï¼ ï¼ #1 0x00007f3e0332b738 in qio_channel_socket_readv (ioc=ï¼optimized -outï¼, -ï¼ ï¼ï¼ï¼ ï¼ ï¼ iov=ï¼optimized outï¼, niov=ï¼optimized outï¼, fds=0x0, nfds=0x0, -errp=0x0) at -ï¼ ï¼ï¼ï¼ ï¼ ï¼ io/channel-socket.c:497 -ï¼ ï¼ï¼ï¼ ï¼ ï¼ #2 0x00007f3e03329472 in qio_channel_read (address@hidden, -ï¼ ï¼ï¼ï¼ ï¼ ï¼ address@hidden "", address@hidden, -ï¼ ï¼ï¼ï¼ ï¼ ï¼ address@hidden) at io/channel.c:97 -ï¼ ï¼ï¼ï¼ ï¼ ï¼ #3 0x00007f3e032750e0 in channel_get_buffer (opaque=ï¼optimized outï¼, -ï¼ ï¼ï¼ï¼ ï¼ ï¼ buf=0x7f3e05910f38 "", pos=ï¼optimized outï¼, size=32768) at -ï¼ ï¼ï¼ï¼ ï¼ ï¼ migration/qemu-file-channel.c:78 -ï¼ ï¼ï¼ï¼ ï¼ ï¼ #4 0x00007f3e0327412c in qemu_fill_buffer (f=0x7f3e05910f00) at -ï¼ ï¼ï¼ï¼ ï¼ ï¼ migration/qemu-file.c:257 -ï¼ ï¼ï¼ï¼ ï¼ ï¼ #5 0x00007f3e03274a41 in qemu_peek_byte (address@hidden, -ï¼ ï¼ï¼ï¼ ï¼ ï¼ address@hidden) at migration/qemu-file.c:510 -ï¼ ï¼ï¼ï¼ ï¼ ï¼ #6 0x00007f3e03274aab in qemu_get_byte (address@hidden) at -ï¼ ï¼ï¼ï¼ ï¼ ï¼ migration/qemu-file.c:523 -ï¼ ï¼ï¼ï¼ ï¼ ï¼ #7 0x00007f3e03274cb2 in qemu_get_be32 (address@hidden) at -ï¼ ï¼ï¼ï¼ ï¼ ï¼ migration/qemu-file.c:603 -ï¼ ï¼ï¼ï¼ ï¼ ï¼ #8 0x00007f3e03271735 in colo_receive_message (f=0x7f3e05910f00, -ï¼ ï¼ï¼ï¼ ï¼ ï¼ address@hidden) at migration/colo.c:215 -ï¼ ï¼ï¼ï¼ ï¼ ï¼ #9 0x00007f3e0327250d in colo_wait_handle_message -(errp=0x7f3d62bfaa48, -ï¼ ï¼ï¼ï¼ ï¼ ï¼ checkpoint_request=ï¼synthetic pointerï¼, f=ï¼optimized outï¼) at -ï¼ ï¼ï¼ï¼ ï¼ ï¼ migration/colo.c:546 -ï¼ ï¼ï¼ï¼ ï¼ ï¼ #10 colo_process_incoming_thread (opaque=0x7f3e067245e0) at -ï¼ ï¼ï¼ï¼ ï¼ ï¼ migration/colo.c:649 -ï¼ ï¼ï¼ï¼ ï¼ ï¼ #11 0x00007f3e00cc1df3 in start_thread () from /lib64/libpthread.so.0 -ï¼ ï¼ï¼ï¼ ï¼ ï¼ #12 0x00007f3dfc9c03ed in clone () from 
>>>>>>
>>>>>> --
>>>>>> View this message in context: http://qemu.11.n7.nabble.com/COLO-failover-hang-tp473250.html
>>>>>> Sent from the Developer mailing list archive at Nabble.com.
>>>>>
>>>>> --
>>>>> Thanks
>>>>> Zhang Chen
>>
>> --
>> Dr. David Alan Gilbert / address@hidden / Manchester, UK
>>
>> .

On 2017/3/22 16:09, address@hidden wrote:

hi:

yes, it is better.

And should we delete

Yes, you are right.

#ifdef WIN32
    QIO_CHANNEL(cioc)->event = CreateEvent(NULL, FALSE, FALSE, NULL);
#endif

in qio_channel_socket_accept?
qio_channel_socket_new already has it.


Original Mail
From: address@hidden
To: Wang Guang 10165992
Cc: address@hidden, address@hidden, address@hidden, address@hidden
Date: 2017-03-22 15:03
Subject: Re: [Qemu-devel] Reply: Re: Reply: Re: Reply: Re: [BUG]COLO failover hang

Hi,

On 2017/3/22 9:42, address@hidden wrote:
> diff --git a/migration/socket.c b/migration/socket.c
> index 13966f1..d65a0ea 100644
> --- a/migration/socket.c
> +++ b/migration/socket.c
> @@ -147,8 +147,9 @@ static gboolean socket_accept_incoming_migration(QIOChannel *ioc,
>      }
>
>      trace_migration_socket_incoming_accepted();
>
>      qio_channel_set_name(QIO_CHANNEL(sioc), "migration-socket-incoming");
> +    qio_channel_set_feature(QIO_CHANNEL(sioc), QIO_CHANNEL_FEATURE_SHUTDOWN);
>      migration_channel_process_incoming(migrate_get_current(), QIO_CHANNEL(sioc));
>      object_unref(OBJECT(sioc));
>
> Is this patch ok?

Yes, i think this works, but a better way may be to call qio_channel_set_feature()
in qio_channel_socket_accept(), since we didn't set the SHUTDOWN feature for the
socket accept fd. Or fix it by this:

diff --git a/io/channel-socket.c b/io/channel-socket.c
index f546c68..ce6894c 100644
--- a/io/channel-socket.c
+++ b/io/channel-socket.c
@@ -330,9 +330,8 @@ qio_channel_socket_accept(QIOChannelSocket *ioc,
                          Error **errp)
 {
     QIOChannelSocket *cioc;

-    cioc = QIO_CHANNEL_SOCKET(object_new(TYPE_QIO_CHANNEL_SOCKET));
-    cioc->fd = -1;
+
+    cioc = qio_channel_socket_new();
     cioc->remoteAddrLen = sizeof(ioc->remoteAddr);
     cioc->localAddrLen = sizeof(ioc->localAddr);

Thanks,
Hailiang

> I have tested it. The test no longer hangs.
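Both the migration/socket.c patch and the qio_channel_socket_accept() rework above rely on the same property, also pointed out later in the thread: qio_channel_socket_new() is where the SHUTDOWN feature (and, on Windows, the event handle) gets attached to a socket channel. A rough sketch of that constructor, reconstructed from the statements in this thread rather than copied from io/channel-socket.c:

    #include "qemu/osdep.h"
    #include "io/channel-socket.h"   /* QIOChannelSocket, TYPE_QIO_CHANNEL_SOCKET */

    /* Sketch of the constructor both fixes lean on; reconstructed from this
     * discussion, not a verbatim copy of the QEMU sources. */
    static QIOChannelSocket *qio_channel_socket_new_sketch(void)
    {
        QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(object_new(TYPE_QIO_CHANNEL_SOCKET));
        sioc->fd = -1;

        /* The bit the open-coded accept() path was missing: without it, later
         * qemu_file_shutdown()/qio_channel_shutdown() calls become no-ops. */
        qio_channel_set_feature(QIO_CHANNEL(sioc), QIO_CHANNEL_FEATURE_SHUTDOWN);

    #ifdef WIN32
        /* Already created here, which is why the duplicate CreateEvent() in
         * qio_channel_socket_accept() can be dropped, as suggested above. */
        QIO_CHANNEL(sioc)->event = CreateEvent(NULL, FALSE, FALSE, NULL);
    #endif

        return sioc;
    }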
Original Mail
From: address@hidden
To: address@hidden, address@hidden
Cc: address@hidden, address@hidden, address@hidden
Date: 2017-03-22 09:11
Subject: Re: [Qemu-devel] Reply: Re: Reply: Re: [BUG]COLO failover hang

On 2017/3/21 19:56, Dr. David Alan Gilbert wrote:
> * Hailiang Zhang (address@hidden) wrote:
>> Hi,
>>
>> Thanks for reporting this, and i confirmed it in my test, and it is a bug.
>>
>> Though we tried to call qemu_file_shutdown() to shut down the related fd, in
>> case the COLO thread/incoming thread is stuck in read/write() while doing failover,
>> it didn't take effect, because all the fds used by COLO (also migration)
>> have been wrapped by a qio channel, and it will not call the shutdown API if
>> we didn't qio_channel_set_feature(QIO_CHANNEL(sioc), QIO_CHANNEL_FEATURE_SHUTDOWN).
>>
>> Cc: Dr. David Alan Gilbert address@hidden
>>
>> I suspect migration cancel has the same problem; it may be stuck in write()
>> if we try to cancel migration.
>>
>> void fd_start_outgoing_migration(MigrationState *s, const char *fdname, Error **errp)
>> {
>>     qio_channel_set_name(QIO_CHANNEL(ioc), "migration-fd-outgoing");
>>     migration_channel_connect(s, ioc, NULL);
>>     ... ...
>> We didn't call qio_channel_set_feature(QIO_CHANNEL(sioc), QIO_CHANNEL_FEATURE_SHUTDOWN) above,
>> and the
>> migrate_fd_cancel()
>> {
>>     ... ...
>>     if (s->state == MIGRATION_STATUS_CANCELLING && f) {
>>         qemu_file_shutdown(f);   --> This will not take effect. No?
>>     }
>> }
>
> (cc'd in Daniel Berrange).
> I see that we call qio_channel_set_feature(ioc, QIO_CHANNEL_FEATURE_SHUTDOWN) at the
> top of qio_channel_socket_new so I think that's safe isn't it?

Hmm, you are right, this problem only exists for the migration incoming fd, thanks.

> Dave
>
>> Thanks,
>> Hailiang
>>
>> On 2017/3/21 16:10, address@hidden wrote:
>>> Thank you.
>>>
>>> I have tested already.
>>>
>>> When the Primary Node panics, the Secondary Node qemu hangs at the same place.
>>>
>>> According to http://wiki.qemu-project.org/Features/COLO, killing the Primary Node
>>> qemu does not produce the problem, but a Primary Node panic does.
>>>
>>> I think this is because the channel does not support QIO_CHANNEL_FEATURE_SHUTDOWN:
>>> when failover happens, channel_shutdown cannot shut down the channel,
>>> so colo_process_incoming_thread will hang at recvmsg.
>>>
>>> I tested a patch:
>>>
>>> diff --git a/migration/socket.c b/migration/socket.c
>>> index 13966f1..d65a0ea 100644
>>> --- a/migration/socket.c
>>> +++ b/migration/socket.c
>>> @@ -147,8 +147,9 @@ static gboolean socket_accept_incoming_migration(QIOChannel *ioc,
>>>      }
>>>
>>>      trace_migration_socket_incoming_accepted();
>>>
>>>      qio_channel_set_name(QIO_CHANNEL(sioc), "migration-socket-incoming");
>>> +    qio_channel_set_feature(QIO_CHANNEL(sioc), QIO_CHANNEL_FEATURE_SHUTDOWN);
>>>      migration_channel_process_incoming(migrate_get_current(), QIO_CHANNEL(sioc));
>>>      object_unref(OBJECT(sioc));
>>>
>>> My test will not hang any more.
>>>>
>>>> Original Mail
>>>> From: address@hidden
>>>> To: Wang Guang 10165992, address@hidden
>>>> Cc: address@hidden, address@hidden
>>>> Date: 2017-03-21 15:58
>>>> Subject: Re: [Qemu-devel] Reply: Re: [BUG]COLO failover hang
>>>>
>>>> Hi, Wang.
>>>>
>>>> You can test this branch:
>>>> https://github.com/coloft/qemu/tree/colo-v5.1-developing-COLO-frame-v21-with-shared-disk
>>>>
>>>> and please follow the wiki to ensure your configuration is correct.
>>>>
>>>> http://wiki.qemu-project.org/Features/COLO
>>>>
>>>> Thanks
>>>> Zhang Chen
>>>>
>>>> On 03/21/2017 03:27 PM, address@hidden wrote:
>>>>> hi.
>>>>>
>>>>> I tested the git qemu master and have the same problem.
>>>>>
>>>>> (gdb) bt
>>>>> #0  qio_channel_socket_readv (ioc=0x7f65911b4e50, iov=0x7f64ef3fd880, niov=1, fds=0x0, nfds=0x0, errp=0x0) at io/channel-socket.c:461
>>>>> #1  0x00007f658e4aa0c2 in qio_channel_read (address@hidden, address@hidden "", address@hidden, address@hidden) at io/channel.c:114
>>>>> #2  0x00007f658e3ea990 in channel_get_buffer (opaque=<optimized out>, buf=0x7f65907cb838 "", pos=<optimized out>, size=32768) at migration/qemu-file-channel.c:78
>>>>> #3  0x00007f658e3e97fc in qemu_fill_buffer (f=0x7f65907cb800) at migration/qemu-file.c:295
>>>>> #4  0x00007f658e3ea2e1 in qemu_peek_byte (address@hidden, address@hidden) at migration/qemu-file.c:555
>>>>> #5  0x00007f658e3ea34b in qemu_get_byte (address@hidden) at migration/qemu-file.c:568
>>>>> #6  0x00007f658e3ea552 in qemu_get_be32 (address@hidden) at migration/qemu-file.c:648
>>>>> #7  0x00007f658e3e66e5 in colo_receive_message (f=0x7f65907cb800, address@hidden) at migration/colo.c:244
>>>>> #8  0x00007f658e3e681e in colo_receive_check_message (f=<optimized out>, address@hidden, address@hidden) at migration/colo.c:264
>>>>> #9  0x00007f658e3e740e in colo_process_incoming_thread (opaque=0x7f658eb30360 <mis_current.31286>) at migration/colo.c:577
>>>>> #10 0x00007f658be09df3 in start_thread () from /lib64/libpthread.so.0
>>>>> #11 0x00007f65881983ed in clone () from /lib64/libc.so.6
>>>>> (gdb) p ioc->name
>>>>> $2 = 0x7f658ff7d5c0 "migration-socket-incoming"
>>>>> (gdb) p ioc->features        <-- does not support QIO_CHANNEL_FEATURE_SHUTDOWN
>>>>> $3 = 0
>>>>>
>>>>> (gdb) bt
>>>>> #0  socket_accept_incoming_migration (ioc=0x7fdcceeafa90, condition=G_IO_IN, opaque=0x7fdcceeafa90) at migration/socket.c:137
>>>>> #1  0x00007fdcc6966350 in g_main_dispatch (context=<optimized out>) at gmain.c:3054
>>>>> #2  g_main_context_dispatch (context=<optimized out>, address@hidden) at gmain.c:3630
>>>>> #3  0x00007fdccb8a6dcc in glib_pollfds_poll () at util/main-loop.c:213
>>>>> #4  os_host_main_loop_wait (timeout=<optimized out>) at util/main-loop.c:258
>>>>> #5  main_loop_wait (address@hidden) at util/main-loop.c:506
>>>>> #6  0x00007fdccb526187 in main_loop () at vl.c:1898
>>>>> #7  main (argc=<optimized out>, argv=<optimized out>, envp=<optimized out>) at vl.c:4709
>>>>> (gdb) p ioc->features
>>>>> $1 = 6
>>>>> (gdb) p ioc->name
>>>>> $2 = 0x7fdcce1b1ab0 "migration-socket-listener"
>>>>>
>>>>> Maybe socket_accept_incoming_migration should call qio_channel_set_feature(ioc, QIO_CHANNEL_FEATURE_SHUTDOWN)?
>>>>>
>>>>> thank you.
>>>>>
>>>>>
>>>>> Original Mail
>>>>> address@hidden
>>>>> address@hidden
>>>>> address@hidden@huawei.com
>>>>> Date: 2017-03-16 14:46
>>>>> Subject: Re: [Qemu-devel] COLO failover hang
>>>>>
>>>>> On 03/15/2017 05:06 PM, wangguang wrote:
>>>>>> I am testing the QEMU COLO feature described here: [QEMU Wiki](http://wiki.qemu-project.org/Features/COLO).
>>>>>>
>>>>>> When the Primary Node panics, the Secondary Node qemu hangs,
>>>>>> hanging at recvmsg in qio_channel_socket_readv.
>>>>>> I also ran { 'execute': 'nbd-server-stop' } and { "execute": "x-colo-lost-heartbeat" } in the Secondary VM's
>>>>>> monitor, and the Secondary Node qemu still hangs at recvmsg.
>>>>>>
>>>>>> I found that COLO in qemu is not complete yet.
>>>>>> Is there any development plan for COLO?
>>>>>
>>>>> Yes, we are developing it. You can see some of the patches we are pushing.
>>>>>
>>>>>> Has anyone ever run it successfully? Any help is appreciated!
>>>>>
>>>>> Our internal version can run it successfully.
>>>>> For the failover details you can ask Zhanghailiang for help.
>>>>> Next time if you have some question about COLO,
>>>>> please cc me and zhanghailiang address@hidden
>>>>>
>>>>> Thanks
>>>>> Zhang Chen
>>>>>
>>>>>> centos7.2 + qemu2.7.50
>>>>>> (gdb) bt
>>>>>> #0  0x00007f3e00cc86ad in recvmsg () from /lib64/libpthread.so.0
>>>>>> #1  0x00007f3e0332b738 in qio_channel_socket_readv (ioc=<optimized out>, iov=<optimized out>, niov=<optimized out>, fds=0x0, nfds=0x0, errp=0x0) at io/channel-socket.c:497
>>>>>> #2  0x00007f3e03329472 in qio_channel_read (address@hidden, address@hidden "", address@hidden, address@hidden) at io/channel.c:97
>>>>>> #3  0x00007f3e032750e0 in channel_get_buffer (opaque=<optimized out>, buf=0x7f3e05910f38 "", pos=<optimized out>, size=32768) at migration/qemu-file-channel.c:78
>>>>>> #4  0x00007f3e0327412c in qemu_fill_buffer (f=0x7f3e05910f00) at migration/qemu-file.c:257
>>>>>> #5  0x00007f3e03274a41 in qemu_peek_byte (address@hidden, address@hidden) at migration/qemu-file.c:510
>>>>>> #6  0x00007f3e03274aab in qemu_get_byte (address@hidden) at migration/qemu-file.c:523
>>>>>> #7  0x00007f3e03274cb2 in qemu_get_be32 (address@hidden) at migration/qemu-file.c:603
>>>>>> #8  0x00007f3e03271735 in colo_receive_message (f=0x7f3e05910f00, address@hidden) at migration/colo.c:215
>>>>>> #9  0x00007f3e0327250d in colo_wait_handle_message (errp=0x7f3d62bfaa48, checkpoint_request=<synthetic pointer>, f=<optimized out>) at migration/colo.c:546
>>>>>> #10 colo_process_incoming_thread (opaque=0x7f3e067245e0) at migration/colo.c:649
>>>>>> #11 0x00007f3e00cc1df3 in start_thread () from /lib64/libpthread.so.0
>>>>>> #12 0x00007f3dfc9c03ed in clone () from /lib64/libc.so.6
>>>>>>
>>>>>> --
>>>>>> View this message in context: http://qemu.11.n7.nabble.com/COLO-failover-hang-tp473250.html
>>>>>> Sent from the Developer mailing list archive at Nabble.com.
>>>>>
>>>>> --
>>>>> Thanks
>>>>> Zhang Chen
>>
>> --
>> Dr. David Alan Gilbert / address@hidden / Manchester, UK
>>
>> .
diff --git a/results/classifier/013/risc-v/74545755 b/results/classifier/013/risc-v/74545755
deleted file mode 100644
index 5b655ed17..000000000
--- a/results/classifier/013/risc-v/74545755
+++ /dev/null
@@ -1,372 +0,0 @@
risc-v: 0.845
user-level: 0.790
operating system: 0.784
register: 0.778
permissions: 0.770
mistranslation: 0.752
debug: 0.740
TCG: 0.722
performance: 0.721
device: 0.720
semantic: 0.669
virtual: 0.667
arm: 0.662
KVM: 0.661
graphic: 0.660
ppc: 0.659
vnc: 0.650
assembly: 0.648
architecture: 0.636
boot: 0.607
VMM: 0.602
files: 0.577
system: 0.574
peripherals: 0.566
hypervisor: 0.563
network: 0.550
socket: 0.549
x86: 0.545
alpha: 0.508
PID: 0.479
kernel: 0.452
i386: 0.376

[Bug Report][RFC PATCH 0/1] block: fix failing assert on paused VM migration

There's a bug (a failing assert) which is reproduced during migration of
a paused VM.  I am able to reproduce it on a test setup with 2 nodes and a
common NFS share, with the VM's disk on that share.

root@fedora40-1-vm:~# virsh domblklist alma8-vm
 Target   Source
------------------------------------------
 sda      /mnt/shared/images/alma8.qcow2

root@fedora40-1-vm:~# df -Th /mnt/shared
Filesystem           Type  Size  Used Avail Use% Mounted on
127.0.0.1:/srv/nfsd  nfs4   63G   16G   48G  25% /mnt/shared

On the 1st node:

root@fedora40-1-vm:~# virsh start alma8-vm ; virsh suspend alma8-vm
root@fedora40-1-vm:~# virsh migrate --compressed --p2p --persistent --undefinesource --live alma8-vm qemu+ssh://fedora40-2-vm/system

Then on the 2nd node:

root@fedora40-2-vm:~# virsh migrate --compressed --p2p --persistent --undefinesource --live alma8-vm qemu+ssh://fedora40-1-vm/system
error: operation failed: domain is not running

root@fedora40-2-vm:~# tail -3 /var/log/libvirt/qemu/alma8-vm.log
2024-09-19 13:53:33.336+0000: initiating migration
qemu-system-x86_64: ../block.c:6976: int bdrv_inactivate_recurse(BlockDriverState *): Assertion `!(bs->open_flags & BDRV_O_INACTIVE)' failed.
2024-09-19 13:53:42.991+0000: shutting down, reason=crashed
Backtrace:

(gdb) bt
#0  0x00007f7eaa2f1664 in __pthread_kill_implementation () at /lib64/libc.so.6
#1  0x00007f7eaa298c4e in raise () at /lib64/libc.so.6
#2  0x00007f7eaa280902 in abort () at /lib64/libc.so.6
#3  0x00007f7eaa28081e in __assert_fail_base.cold () at /lib64/libc.so.6
#4  0x00007f7eaa290d87 in __assert_fail () at /lib64/libc.so.6
#5  0x0000563c38b95eb8 in bdrv_inactivate_recurse (bs=0x563c3b6c60c0) at ../block.c:6976
#6  0x0000563c38b95aeb in bdrv_inactivate_all () at ../block.c:7038
#7  0x0000563c3884d354 in qemu_savevm_state_complete_precopy_non_iterable (f=0x563c3b700c20, in_postcopy=false, inactivate_disks=true) at ../migration/savevm.c:1571
#8  0x0000563c3884dc1a in qemu_savevm_state_complete_precopy (f=0x563c3b700c20, iterable_only=false, inactivate_disks=true) at ../migration/savevm.c:1631
#9  0x0000563c3883a340 in migration_completion_precopy (s=0x563c3b4d51f0, current_active_state=<optimized out>) at ../migration/migration.c:2780
#10 migration_completion (s=0x563c3b4d51f0) at ../migration/migration.c:2844
#11 migration_iteration_run (s=0x563c3b4d51f0) at ../migration/migration.c:3270
#12 migration_thread (opaque=0x563c3b4d51f0) at ../migration/migration.c:3536
#13 0x0000563c38dbcf14 in qemu_thread_start (args=0x563c3c2d5bf0) at ../util/qemu-thread-posix.c:541
#14 0x00007f7eaa2ef6d7 in start_thread () at /lib64/libc.so.6
#15 0x00007f7eaa373414 in clone () at /lib64/libc.so.6

What happens here is that after the 1st migration the BDS related to the HDD remains
inactive, since the VM is still paused.  Then, when we initiate the 2nd migration,
bdrv_inactivate_all() tries to set the BDRV_O_INACTIVE flag on that node, which is
already set, and the assert fails.

The attached patch, which simply skips setting the flag if it's already set, is more
of a kludge than a clean solution.  Should we use more sophisticated logic which
allows some of the nodes to be in an inactive state prior to the migration, and takes
them into account during bdrv_inactivate_all()?  Comments would be appreciated.

Andrey

Andrey Drobyshev (1):
  block: do not fail when inactivating node which is inactive

 block.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

--
2.39.3
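To make the failure mode concrete before the patch itself: a simplified sketch of the path in frames #5 and #6 above, assuming only the flag handling described in this report (this is the shape of the logic, not the actual block.c code):

    /* Sketch only: the inactivation walk that trips the assertion; needs
     * block/block_int.h for BlockDriverState in a real build. */
    static int bdrv_inactivate_recurse_sketch(BlockDriverState *bs)
    {
        /* The first migration of the paused VM left BDRV_O_INACTIVE set and
         * the guest was never resumed, so the second migration attempt
         * re-enters here with the flag already present and aborts. */
        assert(!(bs->open_flags & BDRV_O_INACTIVE));

        /* ... per-driver inactivation and recursion into children ... */
        bs->open_flags |= BDRV_O_INACTIVE;
        return 0;
    }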
Instead of throwing an assert let's just ignore that the flag is already set
and return.  We assume that it's going to be safe to ignore.  Otherwise
this assert fails when migrating a paused VM back and forth.

Ideally we'd like to have a more sophisticated solution, e.g. not even
scan the nodes which should be inactive at this point.

Signed-off-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>
---
 block.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/block.c b/block.c
index 7d90007cae..c1dcf906d1 100644
--- a/block.c
+++ b/block.c
@@ -6973,7 +6973,15 @@ static int GRAPH_RDLOCK bdrv_inactivate_recurse(BlockDriverState *bs)
         return 0;
     }

-    assert(!(bs->open_flags & BDRV_O_INACTIVE));
+    if (bs->open_flags & BDRV_O_INACTIVE) {
+        /*
+         * Return here instead of throwing assert as a workaround to
+         * prevent failure on migrating paused VM.
+         * Here we assume that if we're trying to inactivate BDS that's
+         * already inactive, it's safe to just ignore it.
+         */
+        return 0;
+    }

     /* Inactivate this node */
     if (bs->drv->bdrv_inactivate) {
--
2.39.3

[add migration maintainers]

On 24.09.24 15:56, Andrey Drobyshev wrote:
> Instead of throwing an assert let's just ignore that the flag is already set
> and return.  We assume that it's going to be safe to ignore.  Otherwise
> this assert fails when migrating a paused VM back and forth.
>
> Ideally we'd like to have a more sophisticated solution, e.g. not even
> scan the nodes which should be inactive at this point.
>
> Signed-off-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>
> ---
>  block.c | 10 +++++++++-
>  1 file changed, 9 insertions(+), 1 deletion(-)
>
> diff --git a/block.c b/block.c
> index 7d90007cae..c1dcf906d1 100644
> --- a/block.c
> +++ b/block.c
> @@ -6973,7 +6973,15 @@ static int GRAPH_RDLOCK bdrv_inactivate_recurse(BlockDriverState *bs)
>          return 0;
>      }
>
> -    assert(!(bs->open_flags & BDRV_O_INACTIVE));
> +    if (bs->open_flags & BDRV_O_INACTIVE) {
> +        /*
> +         * Return here instead of throwing assert as a workaround to
> +         * prevent failure on migrating paused VM.
> +         * Here we assume that if we're trying to inactivate BDS that's
> +         * already inactive, it's safe to just ignore it.
> +         */
> +        return 0;
> +    }
>
>      /* Inactivate this node */
>      if (bs->drv->bdrv_inactivate) {

I doubt that this is a correct way to go.

As far as I understand, "inactive" actually means "the storage does not belong to
qemu, but to someone else (another qemu process, for example), and may be changed
transparently".  In turn this means that QEMU should do nothing with inactive disks.
So the problem is that nobody called bdrv_activate_all on the target, and we
shouldn't ignore that.

Hmm, I see that in process_incoming_migration_bh() we do call bdrv_activate_all(),
but only in some scenarios.  Maybe the condition should be less strict here.

Why do we need any condition here at all?  Don't we want to activate the block layer
on the target after migration anyway?

--
Best regards,
Vladimir

On 9/30/24 12:25 PM, Vladimir Sementsov-Ogievskiy wrote:
> [add migration maintainers]
>
> On 24.09.24 15:56, Andrey Drobyshev wrote:
>
> [...]
>
> I doubt that this is a correct way to go.
>
> As far as I understand, "inactive" actually means "the storage does not belong to
> qemu, but to someone else (another qemu process, for example), and may be changed
> transparently".  In turn this means that QEMU should do nothing with inactive disks.
> So the problem is that nobody called bdrv_activate_all on the target, and we
> shouldn't ignore that.
>
> Hmm, I see that in process_incoming_migration_bh() we do call bdrv_activate_all(),
> but only in some scenarios.  Maybe the condition should be less strict here.
>
> Why do we need any condition here at all?  Don't we want to activate the block layer
> on the target after migration anyway?

Hmm, I'm not sure about the unconditional activation, since we at least
have to honor the LATE_BLOCK_ACTIVATE cap if it's set (and probably delay
the activation in such a case).  In current libvirt upstream I see such code:
> /* Migration capabilities which should always be enabled as long as they
>  * are supported by QEMU. If the capability is supposed to be enabled on both
>  * sides of migration, it won't be enabled unless both sides support it.
>  */
> static const qemuMigrationParamsAlwaysOnItem qemuMigrationParamsAlwaysOn[] = {
>     {QEMU_MIGRATION_CAP_PAUSE_BEFORE_SWITCHOVER,
>      QEMU_MIGRATION_SOURCE},
>
>     {QEMU_MIGRATION_CAP_LATE_BLOCK_ACTIVATE,
>      QEMU_MIGRATION_DESTINATION},
> };

which means that libvirt always wants LATE_BLOCK_ACTIVATE to be set.

The code from process_incoming_migration_bh() you're referring to:

>     /* If capability late_block_activate is set:
>      * Only fire up the block code now if we're going to restart the
>      * VM, else 'cont' will do it.
>      * This causes file locking to happen; so we don't want it to happen
>      * unless we really are starting the VM.
>      */
>     if (!migrate_late_block_activate() ||
>          (autostart && (!global_state_received() ||
>             runstate_is_live(global_state_get_runstate())))) {
>         /* Make sure all file formats throw away their mutable metadata.
>          * If we get an error here, just don't restart the VM yet. */
>         bdrv_activate_all(&local_err);
>         if (local_err) {
>             error_report_err(local_err);
>             local_err = NULL;
>             autostart = false;
>         }
>     }

It states explicitly that we're either going to start the VM right at this
point (if autostart == true), or we wait until the "cont" command happens.
None of this is going to happen if we start another migration while still
being in the PAUSED state.  So I think it seems reasonable to take such a
case into account.  For instance, this patch does prevent the crash:

> diff --git a/migration/migration.c b/migration/migration.c
> index ae2be31557..3222f6745b 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -733,7 +733,8 @@ static void process_incoming_migration_bh(void *opaque)
>       */
>      if (!migrate_late_block_activate() ||
>           (autostart && (!global_state_received() ||
> -            runstate_is_live(global_state_get_runstate())))) {
> +            runstate_is_live(global_state_get_runstate()))) ||
> +          (!autostart && global_state_get_runstate() == RUN_STATE_PAUSED)) {
>          /* Make sure all file formats throw away their mutable metadata.
>           * If we get an error here, just don't restart the VM yet. */
>          bdrv_activate_all(&local_err);

What are your thoughts on it?

Andrey
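For completeness, the capability being discussed is the one the management layer turns on over QMP before the incoming migration starts; with libvirt's always-on list quoted above, the destination QEMU would have been sent something along these lines (shown purely to illustrate how late-block-activate ends up set, not as the exact command sequence libvirt emits):

    { "execute": "migrate-set-capabilities",
      "arguments": {
        "capabilities": [
          { "capability": "late-block-activate", "state": true }
        ]
      }
    }

With that capability active and autostart false, process_incoming_migration_bh() never calls bdrv_activate_all(), which is why the paused destination is left with BDRV_O_INACTIVE still set when the next migration begins.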