peripherals: 0.875 hypervisor: 0.863 mistranslation: 0.861 register: 0.858 architecture: 0.856 device: 0.853 vnc: 0.851 virtual: 0.845 permissions: 0.842 assembly: 0.841 performance: 0.839 ppc: 0.838 semantic: 0.835 operating system: 0.835 TCG: 0.833 VMM: 0.833 arm: 0.828 boot: 0.827 user-level: 0.826 graphic: 0.824 network: 0.822 socket: 0.820 PID: 0.819 KVM: 0.817 kernel: 0.817 files: 0.816 x86: 0.814 alpha: 0.810 debug: 0.803 i386: 0.782 risc-v: 0.755 [Qemu-devel] 答复: Re: 答复: Re: 答复: Re: [BUG]COLO failover hang almost like wiki,but panic in Primary Node. step: 1 Primary Node. x86_64-softmmu/qemu-system-x86_64 -enable-kvm -boot c -m 2048 -smp 2 -qmp stdio -vnc :7 -name primary -cpu qemu64,+kvmclock -device piix3-usb-uhci -usb -usbdevice tablet\ -drive if=virtio,id=colo-disk0,driver=quorum,read-pattern=fifo,vote-threshold=1, children.0.file.filename=/mnt/sdd/pure_IMG/linux/redhat/rhel_6.5_64_2U_ide,children.0.driver=qcow2 -S \ -netdev tap,id=hn1,vhost=off,script=/etc/qemu-ifup2,downscript=/etc/qemu-ifdown2 \ -device e1000,id=e1,netdev=hn1,mac=52:a4:00:12:78:67 \ -netdev tap,id=hn0,vhost=off,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown \ -device e1000,id=e0,netdev=hn0,mac=52:a4:00:12:78:66 \ -chardev socket,id=mirror0,host=9.61.1.8,port=9003,server,nowait -chardev socket,id=compare1,host=9.61.1.8,port=9004,server,nowait \ -chardev socket,id=compare0,host=9.61.1.8,port=9001,server,nowait -chardev socket,id=compare0-0,host=9.61.1.8,port=9001 \ -chardev socket,id=compare_out,host=9.61.1.8,port=9005,server,nowait \ -chardev socket,id=compare_out0,host=9.61.1.8,port=9005 \ -object filter-mirror,id=m0,netdev=hn0,queue=tx,outdev=mirror0 \ -object filter-redirector,netdev=hn0,id=redire0,queue=rx,indev=compare_out -object filter-redirector,netdev=hn0,id=redire1,queue=rx,outdev=compare0 \ -object colo-compare,id=comp0,primary_in=compare0-0,secondary_in=compare1,outdev=compare_out0 2 Second node: x86_64-softmmu/qemu-system-x86_64 -boot c -m 2048 -smp 2 -qmp stdio -vnc :7 
-name secondary -enable-kvm -cpu qemu64,+kvmclock -device piix3-usb-uhci -usb -usbdevice tablet\ -drive if=none,id=colo-disk0,file.filename=/mnt/sdd/pure_IMG/linux/redhat/rhel_6.5_64_2U_ide,driver=qcow2,node-name=node0 \ -drive if=virtio,id=active-disk0,driver=replication,mode=secondary,file.driver=qcow2,top-id=active-disk0,file.file.filename=/mnt/ramfstest/active_disk.img,file.backing.driver=qcow2,file.backing.file.filename=/mnt/ramfstest/hidden_disk.img,file.backing.backing=colo-disk0 \ -netdev tap,id=hn1,vhost=off,script=/etc/qemu-ifup2,downscript=/etc/qemu-ifdown2 \ -device e1000,id=e1,netdev=hn1,mac=52:a4:00:12:78:67 \ -netdev tap,id=hn0,vhost=off,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown \ -device e1000,netdev=hn0,mac=52:a4:00:12:78:66 -chardev socket,id=red0,host=9.61.1.8,port=9003 \ -chardev socket,id=red1,host=9.61.1.8,port=9004 \ -object filter-redirector,id=f1,netdev=hn0,queue=tx,indev=red0 \ -object filter-redirector,id=f2,netdev=hn0,queue=rx,outdev=red1 \ -object filter-rewriter,id=rew0,netdev=hn0,queue=all -incoming tcp:0:8888 3 Secondary node: {'execute':'qmp_capabilities'} { 'execute': 'nbd-server-start', 'arguments': {'addr': {'type': 'inet', 'data': {'host': '9.61.1.7', 'port': '8889'} } } } {'execute': 'nbd-server-add', 'arguments': {'device': 'colo-disk0', 'writable': true } } 4:Primary Node: {'execute':'qmp_capabilities'} { 'execute': 'human-monitor-command', 'arguments': {'command-line': 'drive_add -n buddy driver=replication,mode=primary,file.driver=nbd,file.host=9.61.1.7,file.port=8889,file.export=colo-disk0,node-name=node0'}} { 'execute':'x-blockdev-change', 'arguments':{'parent': 'colo-disk0', 'node': 'node0' } } { 'execute': 'migrate-set-capabilities', 'arguments': {'capabilities': [ {'capability': 'x-colo', 'state': true } ] } } { 'execute': 'migrate', 'arguments': {'uri': 'tcp:9.61.1.7:8888' } } then can see two runing VMs, whenever you make changes to PVM, SVM will be synced. 
5:Primary Node: echo c > /proc/sysrq-trigger 6:Secondary node: { 'execute': 'nbd-server-stop' } { "execute": "x-colo-lost-heartbeat" } then can see the Secondary node hang at recvmsg recvmsg . 原始邮件 发件人: address@hidden 收件人:王广10165992 address@hidden 抄送人: address@hidden address@hidden 日 期 :2017年03月21日 16:27 主 题 :Re: [Qemu-devel] 答复: Re: 答复: Re: [BUG]COLO failover hang Hi, On 2017/3/21 16:10, address@hidden wrote: > Thank you。 > > I have test already。 > > When the Primary Node panic,the Secondary Node qemu hang at the same place。 > > According to http://wiki.qemu-project.org/Features/COLO ,kill Primary Node qemu will not produce the problem,but Primary Node panic can。 > > I think due to the feature of channel does not support QIO_CHANNEL_FEATURE_SHUTDOWN. > > Yes, you are right, when we do failover for primary/secondary VM, we will shutdown the related fd in case it is stuck in the read/write fd. It seems that you didn't follow the above introduction exactly to do the test. Could you share your test procedures ? Especially the commands used in the test. Thanks, Hailiang > when failover,channel_shutdown could not shut down the channel. > > > so the colo_process_incoming_thread will hang at recvmsg. > > > I test a patch: > > > diff --git a/migration/socket.c b/migration/socket.c > > > index 13966f1..d65a0ea 100644 > > > --- a/migration/socket.c > > > +++ b/migration/socket.c > > > @@ -147,8 +147,9 @@ static gboolean socket_accept_incoming_migration(QIOChannel *ioc, > > > } > > > > > > trace_migration_socket_incoming_accepted() > > > > > > qio_channel_set_name(QIO_CHANNEL(sioc), "migration-socket-incoming") > > > + qio_channel_set_feature(QIO_CHANNEL(sioc), QIO_CHANNEL_FEATURE_SHUTDOWN) > > > migration_channel_process_incoming(migrate_get_current(), > > > QIO_CHANNEL(sioc)) > > > object_unref(OBJECT(sioc)) > > > > > My test will not hang any more. 
> > > > > > > > > > > > > > > > > > 原始邮件 > > > > 发件人: address@hidden > 收件人:王广10165992 address@hidden > 抄送人: address@hidden address@hidden > 日 期 :2017å¹´03月21日 15:58 > 主 题 :Re: [Qemu-devel] 答复: Re: [BUG]COLO failover hang > > > > > > Hi,Wang. > > You can test this branch: > > https://github.com/coloft/qemu/tree/colo-v5.1-developing-COLO-frame-v21-with-shared-disk > > and please follow wiki ensure your own configuration correctly. > > http://wiki.qemu-project.org/Features/COLO > > > Thanks > > Zhang Chen > > > On 03/21/2017 03:27 PM, address@hidden wrote: > > > > hi. > > > > I test the git qemu master have the same problem. > > > > (gdb) bt > > > > #0 qio_channel_socket_readv (ioc=0x7f65911b4e50, iov=0x7f64ef3fd880, > > niov=1, fds=0x0, nfds=0x0, errp=0x0) at io/channel-socket.c:461 > > > > #1 0x00007f658e4aa0c2 in qio_channel_read > > (address@hidden, address@hidden "", > > address@hidden, address@hidden) at io/channel.c:114 > > > > #2 0x00007f658e3ea990 in channel_get_buffer (opaque=<optimized out>, > > buf=0x7f65907cb838 "", pos=<optimized out>, size=32768) at > > migration/qemu-file-channel.c:78 > > > > #3 0x00007f658e3e97fc in qemu_fill_buffer (f=0x7f65907cb800) at > > migration/qemu-file.c:295 > > > > #4 0x00007f658e3ea2e1 in qemu_peek_byte (address@hidden, > > address@hidden) at migration/qemu-file.c:555 > > > > #5 0x00007f658e3ea34b in qemu_get_byte (address@hidden) at > > migration/qemu-file.c:568 > > > > #6 0x00007f658e3ea552 in qemu_get_be32 (address@hidden) at > > migration/qemu-file.c:648 > > > > #7 0x00007f658e3e66e5 in colo_receive_message (f=0x7f65907cb800, > > address@hidden) at migration/colo.c:244 > > > > #8 0x00007f658e3e681e in colo_receive_check_message (f=<optimized > > out>, address@hidden, > > address@hidden) > > > > at migration/colo.c:264 > > > > #9 0x00007f658e3e740e in colo_process_incoming_thread > > (opaque=0x7f658eb30360 <mis_current.31286>) at migration/colo.c:577 > > > > #10 0x00007f658be09df3 in start_thread () from 
/lib64/libpthread.so.0 > > > > #11 0x00007f65881983ed in clone () from /lib64/libc.so.6 > > > > (gdb) p ioc->name > > > > $2 = 0x7f658ff7d5c0 "migration-socket-incoming" > > > > (gdb) p ioc->features Do not support QIO_CHANNEL_FEATURE_SHUTDOWN > > > > $3 = 0 > > > > > > (gdb) bt > > > > #0 socket_accept_incoming_migration (ioc=0x7fdcceeafa90, > > condition=G_IO_IN, opaque=0x7fdcceeafa90) at migration/socket.c:137 > > > > #1 0x00007fdcc6966350 in g_main_dispatch (context=<optimized out>) at > > gmain.c:3054 > > > > #2 g_main_context_dispatch (context=<optimized out>, > > address@hidden) at gmain.c:3630 > > > > #3 0x00007fdccb8a6dcc in glib_pollfds_poll () at util/main-loop.c:213 > > > > #4 os_host_main_loop_wait (timeout=<optimized out>) at > > util/main-loop.c:258 > > > > #5 main_loop_wait (address@hidden) at > > util/main-loop.c:506 > > > > #6 0x00007fdccb526187 in main_loop () at vl.c:1898 > > > > #7 main (argc=<optimized out>, argv=<optimized out>, envp=<optimized > > out>) at vl.c:4709 > > > > (gdb) p ioc->features > > > > $1 = 6 > > > > (gdb) p ioc->name > > > > $2 = 0x7fdcce1b1ab0 "migration-socket-listener" > > > > > > May be socket_accept_incoming_migration should > > call qio_channel_set_feature(ioc, QIO_CHANNEL_FEATURE_SHUTDOWN)?? > > > > > > thank you. > > > > > > > > > > > > 原始邮件 > > address@hidden > > address@hidden > > address@hidden@huawei.com> > > *日 期 :*2017å¹´03月16日 14:46 > > *主 题 :**Re: [Qemu-devel] COLO failover hang* > > > > > > > > > > On 03/15/2017 05:06 PM, wangguang wrote: > > > am testing QEMU COLO feature described here [QEMU > > > Wiki]( http://wiki.qemu-project.org/Features/COLO ). > > > > > > When the Primary Node panic,the Secondary Node qemu hang. > > > hang at recvmsg in qio_channel_socket_readv. > > > And I run { 'execute': 'nbd-server-stop' } and { "execute": > > > "x-colo-lost-heartbeat" } in Secondary VM's > > > monitor,the Secondary Node qemu still hang at recvmsg . > > > > > > I found that the colo in qemu is not complete yet. 
> > > Do the colo have any plan for development? > > > > Yes, We are developing. You can see some of patch we pushing. > > > > > Has anyone ever run it successfully? Any help is appreciated! > > > > In our internal version can run it successfully, > > The failover detail you can ask Zhanghailiang for help. > > Next time if you have some question about COLO, > > please cc me and zhanghailiang address@hidden > > > > > > Thanks > > Zhang Chen > > > > > > > > > > > > > > > > centos7.2+qemu2.7.50 > > > (gdb) bt > > > #0 0x00007f3e00cc86ad in recvmsg () from /lib64/libpthread.so.0 > > > #1 0x00007f3e0332b738 in qio_channel_socket_readv (ioc=<optimized out>, > > > iov=<optimized out>, niov=<optimized out>, fds=0x0, nfds=0x0, errp=0x0) at > > > io/channel-socket.c:497 > > > #2 0x00007f3e03329472 in qio_channel_read (address@hidden, > > > address@hidden "", address@hidden, > > > address@hidden) at io/channel.c:97 > > > #3 0x00007f3e032750e0 in channel_get_buffer (opaque=<optimized out>, > > > buf=0x7f3e05910f38 "", pos=<optimized out>, size=32768) at > > > migration/qemu-file-channel.c:78 > > > #4 0x00007f3e0327412c in qemu_fill_buffer (f=0x7f3e05910f00) at > > > migration/qemu-file.c:257 > > > #5 0x00007f3e03274a41 in qemu_peek_byte (address@hidden, > > > address@hidden) at migration/qemu-file.c:510 > > > #6 0x00007f3e03274aab in qemu_get_byte (address@hidden) at > > > migration/qemu-file.c:523 > > > #7 0x00007f3e03274cb2 in qemu_get_be32 (address@hidden) at > > > migration/qemu-file.c:603 > > > #8 0x00007f3e03271735 in colo_receive_message (f=0x7f3e05910f00, > > > address@hidden) at migration/colo.c:215 > > > #9 0x00007f3e0327250d in colo_wait_handle_message (errp=0x7f3d62bfaa48, > > > checkpoint_request=<synthetic pointer>, f=<optimized out>) at > > > migration/colo.c:546 > > > #10 colo_process_incoming_thread (opaque=0x7f3e067245e0) at > > > migration/colo.c:649 > > > #11 0x00007f3e00cc1df3 in start_thread () from /lib64/libpthread.so.0 > > > #12 0x00007f3dfc9c03ed in 
clone () from /lib64/libc.so.6 > > > > > > > > > > > > > > > > > > -- > > > View this message in context: http://qemu.11.n7.nabble.com/COLO-failover-hang-tp473250.html > > > Sent from the Developer mailing list archive at Nabble.com. > > > > > > > > > > > > > > > > -- > > Thanks > > Zhang Chen > > > > > > > > > > > diff --git a/migration/socket.c b/migration/socket.c index 13966f1..d65a0ea 100644 --- a/migration/socket.c +++ b/migration/socket.c @@ -147,8 +147,9 @@ static gboolean socket_accept_incoming_migration(QIOChannel *ioc, } trace_migration_socket_incoming_accepted() qio_channel_set_name(QIO_CHANNEL(sioc), "migration-socket-incoming") + qio_channel_set_feature(QIO_CHANNEL(sioc), QIO_CHANNEL_FEATURE_SHUTDOWN) migration_channel_process_incoming(migrate_get_current(), QIO_CHANNEL(sioc)) object_unref(OBJECT(sioc)) Is this patch ok? I have test it . The test could not hang any more. 原始邮件 发件人: address@hidden 收件人: address@hidden address@hidden 抄送人: address@hidden address@hidden address@hidden 日 期 :2017年03月22日 09:11 主 题 :Re: [Qemu-devel] 答复: Re: 答复: Re: [BUG]COLO failover hang On 2017/3/21 19:56, Dr. David Alan Gilbert wrote: > * Hailiang Zhang (address@hidden) wrote: >> Hi, >> >> Thanks for reporting this, and i confirmed it in my test, and it is a bug. >> >> Though we tried to call qemu_file_shutdown() to shutdown the related fd, in >> case COLO thread/incoming thread is stuck in read/write() while do failover, >> but it didn't take effect, because all the fd used by COLO (also migration) >> has been wrapped by qio channel, and it will not call the shutdown API if >> we didn't qio_channel_set_feature(QIO_CHANNEL(sioc), QIO_CHANNEL_FEATURE_SHUTDOWN). >> >> Cc: Dr. David Alan Gilbert address@hidden >> >> I doubted migration cancel has the same problem, it may be stuck in write() >> if we tried to cancel migration. 
>> >> void fd_start_outgoing_migration(MigrationState *s, const char *fdname, Error **errp) >> { >> qio_channel_set_name(QIO_CHANNEL(ioc), "migration-fd-outgoing") >> migration_channel_connect(s, ioc, NULL) >> ... ... >> We didn't call qio_channel_set_feature(QIO_CHANNEL(sioc), QIO_CHANNEL_FEATURE_SHUTDOWN) above, >> and the >> migrate_fd_cancel() >> { >> ... ... >> if (s->state == MIGRATION_STATUS_CANCELLING && f) { >> qemu_file_shutdown(f) --> This will not take effect. No ? >> } >> } > > (cc'd in Daniel Berrange). > I see that we call qio_channel_set_feature(ioc, QIO_CHANNEL_FEATURE_SHUTDOWN) at the > top of qio_channel_socket_new so I think that's safe isn't it? > Hmm, you are right, this problem is only exist for the migration incoming fd, thanks. > Dave > >> Thanks, >> Hailiang >> >> On 2017/3/21 16:10, address@hidden wrote: >>> Thank you。 >>> >>> I have test aready。 >>> >>> When the Primary Node panic,the Secondary Node qemu hang at the same place。 >>> >>> Incorrding http://wiki.qemu-project.org/Features/COLO ,kill Primary Node qemu will not produce the problem,but Primary Node panic can。 >>> >>> I think due to the feature of channel does not support QIO_CHANNEL_FEATURE_SHUTDOWN. >>> >>> >>> when failover,channel_shutdown could not shut down the channel. >>> >>> >>> so the colo_process_incoming_thread will hang at recvmsg. 
>>> >>> >>> I test a patch: >>> >>> >>> diff --git a/migration/socket.c b/migration/socket.c >>> >>> >>> index 13966f1..d65a0ea 100644 >>> >>> >>> --- a/migration/socket.c >>> >>> >>> +++ b/migration/socket.c >>> >>> >>> @@ -147,8 +147,9 @@ static gboolean socket_accept_incoming_migration(QIOChannel *ioc, >>> >>> >>> } >>> >>> >>> >>> >>> >>> trace_migration_socket_incoming_accepted() >>> >>> >>> >>> >>> >>> qio_channel_set_name(QIO_CHANNEL(sioc), "migration-socket-incoming") >>> >>> >>> + qio_channel_set_feature(QIO_CHANNEL(sioc), QIO_CHANNEL_FEATURE_SHUTDOWN) >>> >>> >>> migration_channel_process_incoming(migrate_get_current(), >>> >>> >>> QIO_CHANNEL(sioc)) >>> >>> >>> object_unref(OBJECT(sioc)) >>> >>> >>> >>> >>> My test will not hang any more. >>> >>> >>> >>> >>> >>> >>> >>> >>> >>> >>> >>> >>> >>> >>> >>> >>> >>> 原始邮件 >>> >>> >>> >>> 发件人: address@hidden >>> 收件人:王广10165992 address@hidden >>> 抄送人: address@hidden address@hidden >>> 日 期 :2017å¹´03月21日 15:58 >>> 主 题 :Re: [Qemu-devel] 答复: Re: [BUG]COLO failover hang >>> >>> >>> >>> >>> >>> Hi,Wang. >>> >>> You can test this branch: >>> >>> https://github.com/coloft/qemu/tree/colo-v5.1-developing-COLO-frame-v21-with-shared-disk >>> >>> and please follow wiki ensure your own configuration correctly. >>> >>> http://wiki.qemu-project.org/Features/COLO >>> >>> >>> Thanks >>> >>> Zhang Chen >>> >>> >>> On 03/21/2017 03:27 PM, address@hidden wrote: >>> > >>> > hi. >>> > >>> > I test the git qemu master have the same problem. 
>>> > >>> > (gdb) bt >>> > >>> > #0 qio_channel_socket_readv (ioc=0x7f65911b4e50, iov=0x7f64ef3fd880, >>> > niov=1, fds=0x0, nfds=0x0, errp=0x0) at io/channel-socket.c:461 >>> > >>> > #1 0x00007f658e4aa0c2 in qio_channel_read >>> > (address@hidden, address@hidden "", >>> > address@hidden, address@hidden) at io/channel.c:114 >>> > >>> > #2 0x00007f658e3ea990 in channel_get_buffer (opaque=<optimized out>, >>> > buf=0x7f65907cb838 "", pos=<optimized out>, size=32768) at >>> > migration/qemu-file-channel.c:78 >>> > >>> > #3 0x00007f658e3e97fc in qemu_fill_buffer (f=0x7f65907cb800) at >>> > migration/qemu-file.c:295 >>> > >>> > #4 0x00007f658e3ea2e1 in qemu_peek_byte (address@hidden, >>> > address@hidden) at migration/qemu-file.c:555 >>> > >>> > #5 0x00007f658e3ea34b in qemu_get_byte (address@hidden) at >>> > migration/qemu-file.c:568 >>> > >>> > #6 0x00007f658e3ea552 in qemu_get_be32 (address@hidden) at >>> > migration/qemu-file.c:648 >>> > >>> > #7 0x00007f658e3e66e5 in colo_receive_message (f=0x7f65907cb800, >>> > address@hidden) at migration/colo.c:244 >>> > >>> > #8 0x00007f658e3e681e in colo_receive_check_message (f=<optimized >>> > out>, address@hidden, >>> > address@hidden) >>> > >>> > at migration/colo.c:264 >>> > >>> > #9 0x00007f658e3e740e in colo_process_incoming_thread >>> > (opaque=0x7f658eb30360 <mis_current.31286>) at migration/colo.c:577 >>> > >>> > #10 0x00007f658be09df3 in start_thread () from /lib64/libpthread.so.0 >>> > >>> > #11 0x00007f65881983ed in clone () from /lib64/libc.so.6 >>> > >>> > (gdb) p ioc->name >>> > >>> > $2 = 0x7f658ff7d5c0 "migration-socket-incoming" >>> > >>> > (gdb) p ioc->features Do not support QIO_CHANNEL_FEATURE_SHUTDOWN >>> > >>> > $3 = 0 >>> > >>> > >>> > (gdb) bt >>> > >>> > #0 socket_accept_incoming_migration (ioc=0x7fdcceeafa90, >>> > condition=G_IO_IN, opaque=0x7fdcceeafa90) at migration/socket.c:137 >>> > >>> > #1 0x00007fdcc6966350 in g_main_dispatch (context=<optimized out>) at >>> > gmain.c:3054 >>> > >>> > #2 
g_main_context_dispatch (context=<optimized out>, >>> > address@hidden) at gmain.c:3630 >>> > >>> > #3 0x00007fdccb8a6dcc in glib_pollfds_poll () at util/main-loop.c:213 >>> > >>> > #4 os_host_main_loop_wait (timeout=<optimized out>) at >>> > util/main-loop.c:258 >>> > >>> > #5 main_loop_wait (address@hidden) at >>> > util/main-loop.c:506 >>> > >>> > #6 0x00007fdccb526187 in main_loop () at vl.c:1898 >>> > >>> > #7 main (argc=<optimized out>, argv=<optimized out>, envp=<optimized >>> > out>) at vl.c:4709 >>> > >>> > (gdb) p ioc->features >>> > >>> > $1 = 6 >>> > >>> > (gdb) p ioc->name >>> > >>> > $2 = 0x7fdcce1b1ab0 "migration-socket-listener" >>> > >>> > >>> > May be socket_accept_incoming_migration should >>> > call qio_channel_set_feature(ioc, QIO_CHANNEL_FEATURE_SHUTDOWN)?? >>> > >>> > >>> > thank you. >>> > >>> > >>> > >>> > >>> > >>> > 原始邮件 >>> > address@hidden >>> > address@hidden >>> > address@hidden@huawei.com> >>> > *日 期 :*2017å¹´03月16日 14:46 >>> > *主 题 :**Re: [Qemu-devel] COLO failover hang* >>> > >>> > >>> > >>> > >>> > On 03/15/2017 05:06 PM, wangguang wrote: >>> > > am testing QEMU COLO feature described here [QEMU >>> > > Wiki]( http://wiki.qemu-project.org/Features/COLO ). >>> > > >>> > > When the Primary Node panic,the Secondary Node qemu hang. >>> > > hang at recvmsg in qio_channel_socket_readv. >>> > > And I run { 'execute': 'nbd-server-stop' } and { "execute": >>> > > "x-colo-lost-heartbeat" } in Secondary VM's >>> > > monitor,the Secondary Node qemu still hang at recvmsg . >>> > > >>> > > I found that the colo in qemu is not complete yet. >>> > > Do the colo have any plan for development? >>> > >>> > Yes, We are developing. You can see some of patch we pushing. >>> > >>> > > Has anyone ever run it successfully? Any help is appreciated! >>> > >>> > In our internal version can run it successfully, >>> > The failover detail you can ask Zhanghailiang for help. 
>>> > Next time if you have some question about COLO, >>> > please cc me and zhanghailiang address@hidden >>> > >>> > >>> > Thanks >>> > Zhang Chen >>> > >>> > >>> > > >>> > > >>> > > >>> > > centos7.2+qemu2.7.50 >>> > > (gdb) bt >>> > > #0 0x00007f3e00cc86ad in recvmsg () from /lib64/libpthread.so.0 >>> > > #1 0x00007f3e0332b738 in qio_channel_socket_readv (ioc=<optimized out>, >>> > > iov=<optimized out>, niov=<optimized out>, fds=0x0, nfds=0x0, errp=0x0) at >>> > > io/channel-socket.c:497 >>> > > #2 0x00007f3e03329472 in qio_channel_read (address@hidden, >>> > > address@hidden "", address@hidden, >>> > > address@hidden) at io/channel.c:97 >>> > > #3 0x00007f3e032750e0 in channel_get_buffer (opaque=<optimized out>, >>> > > buf=0x7f3e05910f38 "", pos=<optimized out>, size=32768) at >>> > > migration/qemu-file-channel.c:78 >>> > > #4 0x00007f3e0327412c in qemu_fill_buffer (f=0x7f3e05910f00) at >>> > > migration/qemu-file.c:257 >>> > > #5 0x00007f3e03274a41 in qemu_peek_byte (address@hidden, >>> > > address@hidden) at migration/qemu-file.c:510 >>> > > #6 0x00007f3e03274aab in qemu_get_byte (address@hidden) at >>> > > migration/qemu-file.c:523 >>> > > #7 0x00007f3e03274cb2 in qemu_get_be32 (address@hidden) at >>> > > migration/qemu-file.c:603 >>> > > #8 0x00007f3e03271735 in colo_receive_message (f=0x7f3e05910f00, >>> > > address@hidden) at migration/colo.c:215 >>> > > #9 0x00007f3e0327250d in colo_wait_handle_message (errp=0x7f3d62bfaa48, >>> > > checkpoint_request=<synthetic pointer>, f=<optimized out>) at >>> > > migration/colo.c:546 >>> > > #10 colo_process_incoming_thread (opaque=0x7f3e067245e0) at >>> > > migration/colo.c:649 >>> > > #11 0x00007f3e00cc1df3 in start_thread () from /lib64/libpthread.so.0 >>> > > #12 0x00007f3dfc9c03ed in clone () from /lib64/libc..so.6 >>> > > >>> > > >>> > > >>> > > >>> > > >>> > > -- >>> > > View this message in context: http://qemu.11.n7.nabble.com/COLO-failover-hang-tp473250.html >>> > > Sent from the Developer mailing list 
archive at Nabble.com. >>> > > >>> > > >>> > > >>> > > >>> > >>> > -- >>> > Thanks >>> > Zhang Chen >>> > >>> > >>> > >>> > >>> > >>> >> > -- > Dr. David Alan Gilbert / address@hidden / Manchester, UK > > . > Hi, On 2017/3/22 9:42, address@hidden wrote: diff --git a/migration/socket.c b/migration/socket.c index 13966f1..d65a0ea 100644 --- a/migration/socket.c +++ b/migration/socket.c @@ -147,8 +147,9 @@ static gboolean socket_accept_incoming_migration(QIOChannel *ioc, } trace_migration_socket_incoming_accepted() qio_channel_set_name(QIO_CHANNEL(sioc), "migration-socket-incoming") + qio_channel_set_feature(QIO_CHANNEL(sioc), QIO_CHANNEL_FEATURE_SHUTDOWN) migration_channel_process_incoming(migrate_get_current(), QIO_CHANNEL(sioc)) object_unref(OBJECT(sioc)) Is this patch ok? Yes, i think this works, but a better way maybe to call qio_channel_set_feature() in qio_channel_socket_accept(), we didn't set the SHUTDOWN feature for the socket accept fd, Or fix it by this: diff --git a/io/channel-socket.c b/io/channel-socket.c index f546c68..ce6894c 100644 --- a/io/channel-socket.c +++ b/io/channel-socket.c @@ -330,9 +330,8 @@ qio_channel_socket_accept(QIOChannelSocket *ioc, Error **errp) { QIOChannelSocket *cioc; - - cioc = QIO_CHANNEL_SOCKET(object_new(TYPE_QIO_CHANNEL_SOCKET)); - cioc->fd = -1; + + cioc = qio_channel_socket_new(); cioc->remoteAddrLen = sizeof(ioc->remoteAddr); cioc->localAddrLen = sizeof(ioc->localAddr); Thanks, Hailiang I have test it . The test could not hang any more. 原始邮件 发件人: address@hidden 收件人: address@hidden address@hidden 抄送人: address@hidden address@hidden address@hidden 日 期 :2017å¹´03月22日 09:11 主 题 :Re: [Qemu-devel] 答复: Re: 答复: Re: [BUG]COLO failover hang On 2017/3/21 19:56, Dr. David Alan Gilbert wrote: > * Hailiang Zhang (address@hidden) wrote: >> Hi, >> >> Thanks for reporting this, and i confirmed it in my test, and it is a bug. 
>> >> Though we tried to call qemu_file_shutdown() to shutdown the related fd, in >> case COLO thread/incoming thread is stuck in read/write() while do failover, >> but it didn't take effect, because all the fd used by COLO (also migration) >> has been wrapped by qio channel, and it will not call the shutdown API if >> we didn't qio_channel_set_feature(QIO_CHANNEL(sioc), QIO_CHANNEL_FEATURE_SHUTDOWN). >> >> Cc: Dr. David Alan Gilbert address@hidden >> >> I doubted migration cancel has the same problem, it may be stuck in write() >> if we tried to cancel migration. >> >> void fd_start_outgoing_migration(MigrationState *s, const char *fdname, Error **errp) >> { >> qio_channel_set_name(QIO_CHANNEL(ioc), "migration-fd-outgoing") >> migration_channel_connect(s, ioc, NULL) >> ... ... >> We didn't call qio_channel_set_feature(QIO_CHANNEL(sioc), QIO_CHANNEL_FEATURE_SHUTDOWN) above, >> and the >> migrate_fd_cancel() >> { >> ... ... >> if (s->state == MIGRATION_STATUS_CANCELLING && f) { >> qemu_file_shutdown(f) --> This will not take effect. No ? >> } >> } > > (cc'd in Daniel Berrange). > I see that we call qio_channel_set_feature(ioc, QIO_CHANNEL_FEATURE_SHUTDOWN) at the > top of qio_channel_socket_new so I think that's safe isn't it? > Hmm, you are right, this problem is only exist for the migration incoming fd, thanks. > Dave > >> Thanks, >> Hailiang >> >> On 2017/3/21 16:10, address@hidden wrote: >>> Thank you。 >>> >>> I have test aready。 >>> >>> When the Primary Node panic,the Secondary Node qemu hang at the same place。 >>> >>> Incorrding http://wiki.qemu-project.org/Features/COLO ,kill Primary Node qemu will not produce the problem,but Primary Node panic can。 >>> >>> I think due to the feature of channel does not support QIO_CHANNEL_FEATURE_SHUTDOWN. >>> >>> >>> when failover,channel_shutdown could not shut down the channel. >>> >>> >>> so the colo_process_incoming_thread will hang at recvmsg. 
>>> >>> >>> I test a patch: >>> >>> >>> diff --git a/migration/socket.c b/migration/socket.c >>> >>> >>> index 13966f1..d65a0ea 100644 >>> >>> >>> --- a/migration/socket.c >>> >>> >>> +++ b/migration/socket.c >>> >>> >>> @@ -147,8 +147,9 @@ static gboolean socket_accept_incoming_migration(QIOChannel *ioc, >>> >>> >>> } >>> >>> >>> >>> >>> >>> trace_migration_socket_incoming_accepted() >>> >>> >>> >>> >>> >>> qio_channel_set_name(QIO_CHANNEL(sioc), "migration-socket-incoming") >>> >>> >>> + qio_channel_set_feature(QIO_CHANNEL(sioc), QIO_CHANNEL_FEATURE_SHUTDOWN) >>> >>> >>> migration_channel_process_incoming(migrate_get_current(), >>> >>> >>> QIO_CHANNEL(sioc)) >>> >>> >>> object_unref(OBJECT(sioc)) >>> >>> >>> >>> >>> My test will not hang any more. >>> >>> >>> >>> >>> >>> >>> >>> >>> >>> >>> >>> >>> >>> >>> >>> >>> >>> 原始邮件 >>> >>> >>> >>> 发件人: address@hidden >>> 收件人:王广10165992 address@hidden >>> 抄送人: address@hidden address@hidden >>> 日 期 :2017å¹´03月21日 15:58 >>> 主 题 :Re: [Qemu-devel] 答复: Re: [BUG]COLO failover hang >>> >>> >>> >>> >>> >>> Hi,Wang. >>> >>> You can test this branch: >>> >>> https://github.com/coloft/qemu/tree/colo-v5.1-developing-COLO-frame-v21-with-shared-disk >>> >>> and please follow wiki ensure your own configuration correctly. >>> >>> http://wiki.qemu-project.org/Features/COLO >>> >>> >>> Thanks >>> >>> Zhang Chen >>> >>> >>> On 03/21/2017 03:27 PM, address@hidden wrote: >>> > >>> > hi. >>> > >>> > I test the git qemu master have the same problem. 
>>> > >>> > (gdb) bt >>> > >>> > #0 qio_channel_socket_readv (ioc=0x7f65911b4e50, iov=0x7f64ef3fd880, >>> > niov=1, fds=0x0, nfds=0x0, errp=0x0) at io/channel-socket.c:461 >>> > >>> > #1 0x00007f658e4aa0c2 in qio_channel_read >>> > (address@hidden, address@hidden "", >>> > address@hidden, address@hidden) at io/channel.c:114 >>> > >>> > #2 0x00007f658e3ea990 in channel_get_buffer (opaque=<optimized out>, >>> > buf=0x7f65907cb838 "", pos=<optimized out>, size=32768) at >>> > migration/qemu-file-channel.c:78 >>> > >>> > #3 0x00007f658e3e97fc in qemu_fill_buffer (f=0x7f65907cb800) at >>> > migration/qemu-file.c:295 >>> > >>> > #4 0x00007f658e3ea2e1 in qemu_peek_byte (address@hidden, >>> > address@hidden) at migration/qemu-file.c:555 >>> > >>> > #5 0x00007f658e3ea34b in qemu_get_byte (address@hidden) at >>> > migration/qemu-file.c:568 >>> > >>> > #6 0x00007f658e3ea552 in qemu_get_be32 (address@hidden) at >>> > migration/qemu-file.c:648 >>> > >>> > #7 0x00007f658e3e66e5 in colo_receive_message (f=0x7f65907cb800, >>> > address@hidden) at migration/colo.c:244 >>> > >>> > #8 0x00007f658e3e681e in colo_receive_check_message (f=<optimized >>> > out>, address@hidden, >>> > address@hidden) >>> > >>> > at migration/colo.c:264 >>> > >>> > #9 0x00007f658e3e740e in colo_process_incoming_thread >>> > (opaque=0x7f658eb30360 <mis_current.31286>) at migration/colo.c:577 >>> > >>> > #10 0x00007f658be09df3 in start_thread () from /lib64/libpthread.so.0 >>> > >>> > #11 0x00007f65881983ed in clone () from /lib64/libc.so.6 >>> > >>> > (gdb) p ioc->name >>> > >>> > $2 = 0x7f658ff7d5c0 "migration-socket-incoming" >>> > >>> > (gdb) p ioc->features Do not support QIO_CHANNEL_FEATURE_SHUTDOWN >>> > >>> > $3 = 0 >>> > >>> > >>> > (gdb) bt >>> > >>> > #0 socket_accept_incoming_migration (ioc=0x7fdcceeafa90, >>> > condition=G_IO_IN, opaque=0x7fdcceeafa90) at migration/socket.c:137 >>> > >>> > #1 0x00007fdcc6966350 in g_main_dispatch (context=<optimized out>) at >>> > gmain.c:3054 >>> > >>> > #2 
g_main_context_dispatch (context=<optimized out>, >>> > address@hidden) at gmain.c:3630 >>> > >>> > #3 0x00007fdccb8a6dcc in glib_pollfds_poll () at util/main-loop.c:213 >>> > >>> > #4 os_host_main_loop_wait (timeout=<optimized out>) at >>> > util/main-loop.c:258 >>> > >>> > #5 main_loop_wait (address@hidden) at >>> > util/main-loop.c:506 >>> > >>> > #6 0x00007fdccb526187 in main_loop () at vl.c:1898 >>> > >>> > #7 main (argc=<optimized out>, argv=<optimized out>, envp=<optimized >>> > out>) at vl.c:4709 >>> > >>> > (gdb) p ioc->features >>> > >>> > $1 = 6 >>> > >>> > (gdb) p ioc->name >>> > >>> > $2 = 0x7fdcce1b1ab0 "migration-socket-listener" >>> > >>> > >>> > May be socket_accept_incoming_migration should >>> > call qio_channel_set_feature(ioc, QIO_CHANNEL_FEATURE_SHUTDOWN)?? >>> > >>> > >>> > thank you. >>> > >>> > >>> > >>> > >>> > >>> > 原始邮件 >>> > address@hidden >>> > address@hidden >>> > address@hidden@huawei.com> >>> > *日 期 :*2017å¹´03月16日 14:46 >>> > *主 题 :**Re: [Qemu-devel] COLO failover hang* >>> > >>> > >>> > >>> > >>> > On 03/15/2017 05:06 PM, wangguang wrote: >>> > > am testing QEMU COLO feature described here [QEMU >>> > > Wiki]( http://wiki.qemu-project.org/Features/COLO ). >>> > > >>> > > When the Primary Node panic,the Secondary Node qemu hang. >>> > > hang at recvmsg in qio_channel_socket_readv. >>> > > And I run { 'execute': 'nbd-server-stop' } and { "execute": >>> > > "x-colo-lost-heartbeat" } in Secondary VM's >>> > > monitor,the Secondary Node qemu still hang at recvmsg . >>> > > >>> > > I found that the colo in qemu is not complete yet. >>> > > Do the colo have any plan for development? >>> > >>> > Yes, We are developing. You can see some of patch we pushing. >>> > >>> > > Has anyone ever run it successfully? Any help is appreciated! >>> > >>> > In our internal version can run it successfully, >>> > The failover detail you can ask Zhanghailiang for help. 
>>> > Next time if you have some question about COLO, >>> > please cc me and zhanghailiang address@hidden >>> > >>> > >>> > Thanks >>> > Zhang Chen >>> > >>> > >>> > > >>> > > >>> > > >>> > > centos7.2+qemu2.7.50 >>> > > (gdb) bt >>> > > #0 0x00007f3e00cc86ad in recvmsg () from /lib64/libpthread.so.0 >>> > > #1 0x00007f3e0332b738 in qio_channel_socket_readv (ioc=<optimized out>, >>> > > iov=<optimized out>, niov=<optimized out>, fds=0x0, nfds=0x0, errp=0x0) at >>> > > io/channel-socket.c:497 >>> > > #2 0x00007f3e03329472 in qio_channel_read (address@hidden, >>> > > address@hidden "", address@hidden, >>> > > address@hidden) at io/channel.c:97 >>> > > #3 0x00007f3e032750e0 in channel_get_buffer (opaque=<optimized out>, >>> > > buf=0x7f3e05910f38 "", pos=<optimized out>, size=32768) at >>> > > migration/qemu-file-channel.c:78 >>> > > #4 0x00007f3e0327412c in qemu_fill_buffer (f=0x7f3e05910f00) at >>> > > migration/qemu-file.c:257 >>> > > #5 0x00007f3e03274a41 in qemu_peek_byte (address@hidden, >>> > > address@hidden) at migration/qemu-file.c:510 >>> > > #6 0x00007f3e03274aab in qemu_get_byte (address@hidden) at >>> > > migration/qemu-file.c:523 >>> > > #7 0x00007f3e03274cb2 in qemu_get_be32 (address@hidden) at >>> > > migration/qemu-file.c:603 >>> > > #8 0x00007f3e03271735 in colo_receive_message (f=0x7f3e05910f00, >>> > > address@hidden) at migration/colo.c:215 >>> > > #9 0x00007f3e0327250d in colo_wait_handle_message (errp=0x7f3d62bfaa48, >>> > > checkpoint_request=<synthetic pointer>, f=<optimized out>) at >>> > > migration/colo.c:546 >>> > > #10 colo_process_incoming_thread (opaque=0x7f3e067245e0) at >>> > > migration/colo.c:649 >>> > > #11 0x00007f3e00cc1df3 in start_thread () from /lib64/libpthread.so.0 >>> > > #12 0x00007f3dfc9c03ed in clone () from /lib64/libc..so.6 >>> > > >>> > > >>> > > >>> > > >>> > > >>> > > -- >>> > > View this message in context: http://qemu.11.n7.nabble.com/COLO-failover-hang-tp473250.html >>> > > Sent from the Developer mailing list 
archive at Nabble.com. >>> > > >>> > > >>> > > >>> > > >>> > >>> > -- >>> > Thanks >>> > Zhang Chen >>> > >>> > >>> > >>> > >>> > >>> >> > -- > Dr. David Alan Gilbert / address@hidden / Manchester, UK > > . >