QEMU crashes when virtio network cards are used together with e1000 network cards QEMU version: QEMU emulator version 2.2.50, Copyright (c) 2003-2008 Fabrice Bellard QEMU GIT version: ab0302ee764fd702465aef6d88612cdff4302809 Configure flags: ./configure --enable-kvm --prefix=/opt/qemu-devel Linux version: Ubuntu 14.04.1 LTS Kernel version: 3.13.0-43-generic #72-Ubuntu SMP Mon Dec 8 19:35:06 UTC 2014 x86_64 x86_64 x86_64 GNU/Linux Problem: QEMU crashes when using one (or more) virtio network cards together with one (or more) e1000 (and possibly others) network cards when those cards are bound to a linux bridge. When the cards are *not* bound to a bridge QEMU does not crash. Bridge configuration: iface bridge0 inet dhcp bridge_ports eth1 bridge_stp off bridge_fd 0 Start-up command (including binding the network cards to the bridge + strace logging): ./qemu-system-x86_64 -daemonize -smp 1 -m 128 -vnc 0.0.0.0:0 \ -netdev tap,id=tap_1,script=no,downscript=no,ifname=net_1_1,vhost=on \ -device virtio-net-pci,bootindex=1,id=nic_1,netdev=tap_1,mac=02:16:3F:00:00:FA \ -netdev tap,id=tap_2,script=no,downscript=no,ifname=net_1_2 \ -device e1000,bootindex=2,id=nic_2,netdev=tap_2,mac=02:16:3F:00:00:FB; \ brctl addif bridge0 net_1_1; \ brctl addif bridge0 net_1_2; \ ifconfig net_1_1 0.0.0.0 up; \ ifconfig net_1_2 0.0.0.0 up; \ sleep 2; \ strace -p `ps x |grep qemu-system-x86_64 |grep -v grep|awk '{print $1}'` -o /tmp/qemu-devel-trace.txt Kernel log: Dec 24 11:12:08 bramws kernel: [12466.885581] device net_1_1 entered promiscuous mode Dec 24 11:12:08 bramws kernel: [12466.886238] device net_1_2 entered promiscuous mode Dec 24 11:12:08 bramws kernel: [12466.887084] bridge0: port 2(net_1_1) entered forwarding state Dec 24 11:12:08 bramws kernel: [12466.887089] bridge0: port 2(net_1_1) entered forwarding state Dec 24 11:12:08 bramws kernel: [12466.888940] bridge0: port 3(net_1_2) entered forwarding state Dec 24 11:12:08 bramws kernel: [12466.888947] bridge0: port 3(net_1_2) entered forwarding state Dec 24 11:12:29 bramws kernel: [12488.026376] bridge0: port 2(net_1_1) entered disabled state Dec 24 11:12:29 bramws kernel: [12488.026820] device net_1_1 left promiscuous mode Dec 24 11:12:29 bramws kernel: [12488.026832] bridge0: port 2(net_1_1) entered disabled state Dec 24 11:12:29 bramws kernel: [12488.049636] bridge0: port 3(net_1_2) entered disabled state Dec 24 11:12:29 bramws kernel: [12488.050058] device net_1_2 left promiscuous mode Dec 24 11:12:29 bramws kernel: [12488.050074] bridge0: port 3(net_1_2) entered disabled state Strace log: (full log attached) ppoll([{fd=13, events=POLLIN|POLLERR|POLLHUP}, {fd=7, events=POLLIN|POLLERR|POLLHUP}, {fd=12, events=POLLIN|POLLERR|POLLHUP}, {fd=3, events=POLLIN|POLLERR|POLLHUP}, {fd=6, events=POLLIN}, {fd=5, events=POLLIN}], 6, {0, 28646613}, NULL, 8) = 0 (Timeout) write(5, "\1\0\0\0\0\0\0\0", 8) = 8 ppoll([{fd=13, events=POLLIN|POLLERR|POLLHUP}, {fd=7, events=POLLIN|POLLERR|POLLHUP}, {fd=12, events=POLLIN|POLLERR|POLLHUP}, {fd=3, events=POLLIN|POLLERR|POLLHUP}, {fd=6, events=POLLIN}, {fd=5, events=POLLIN}], 6, {0, 10899760}, NULL, 8) = 1 ([{fd=5, revents=POLLIN}], left {0, 10895457}) write(6, "\1\0\0\0\0\0\0\0", 8) = 8 read(5, "\1\0\0\0\0\0\0\0", 512) = 8 write(6, "\1\0\0\0\0\0\0\0", 8) = 8 ppoll([{fd=13, events=POLLIN|POLLERR|POLLHUP}, {fd=7, events=POLLIN|POLLERR|POLLHUP}, {fd=12, events=POLLIN|POLLERR|POLLHUP}, {fd=3, events=POLLIN|POLLERR|POLLHUP}, {fd=6, events=POLLIN}, {fd=5, events=POLLIN}], 6, {0, 0}, NULL, 8) = 1 ([{fd=6, revents=POLLIN}], left {0, 0}) ppoll([{fd=13, events=POLLIN|POLLERR|POLLHUP}, {fd=7, events=POLLIN|POLLERR|POLLHUP}, {fd=12, events=POLLIN|POLLERR|POLLHUP}, {fd=3, events=POLLIN|POLLERR|POLLHUP}, {fd=6, events=POLLIN}, {fd=5, events=POLLIN}], 6, {0, 0}, NULL, 8) = 1 ([{fd=6, revents=POLLIN}], left {0, 0}) read(6, "\2\0\0\0\0\0\0\0", 16) = 8 ppoll([{fd=13, events=POLLIN|POLLERR|POLLHUP}, {fd=7, events=POLLIN|POLLERR|POLLHUP}, {fd=12, events=POLLIN|POLLERR|POLLHUP}, {fd=3, events=POLLIN|POLLERR|POLLHUP}, {fd=6, events=POLLIN}, {fd=5, events=POLLIN}], 6, {0, 0}, NULL, 8) = 0 (Timeout) read(6, 0x7fff697320e0, 16) = -1 EAGAIN (Resource temporarily unavailable) ppoll([{fd=13, events=POLLIN|POLLERR|POLLHUP}, {fd=7, events=POLLIN|POLLERR|POLLHUP}, {fd=12, events=POLLIN|POLLERR|POLLHUP}, {fd=3, events=POLLIN|POLLERR|POLLHUP}, {fd=6, events=POLLIN}, {fd=5, events=POLLIN}], 6, {0, 9570429}, NULL, 8) = 0 (Timeout) futex(0x7f011c8ef094, FUTEX_CMP_REQUEUE_PRIVATE, 1, 2147483647, 0x7f011aaa0860, 224) = 1 write(5, "\1\0\0\0\0\0\0\0", 8) = 8 write(5, "\1\0\0\0\0\0\0\0", 8) = 8 futex(0x7f011aaa0860, FUTEX_WAKE_PRIVATE, 1) = 1 ppoll([{fd=13, events=POLLIN|POLLERR|POLLHUP}, {fd=7, events=POLLIN|POLLERR|POLLHUP}, {fd=12, events=POLLIN|POLLERR|POLLHUP}, {fd=3, events=POLLIN|POLLERR|POLLHUP}, {fd=6, events=POLLIN}, {fd=5, events=POLLIN}], 6, {0, 54463396}, NULL, 8) = 1 ([{fd=5, revents=POLLIN}], left {0, 54459649}) tgkill(7779, 7784, SIGUSR1) = 0 futex(0x7f011aaa0824, FUTEX_CMP_REQUEUE_PRIVATE, 1, 2147483647, 0x7f011aaa0860, 1650) = 1 write(6, "\1\0\0\0\0\0\0\0", 8) = 8 read(5, "\2\0\0\0\0\0\0\0", 512) = 8 write(6, "\1\0\0\0\0\0\0\0", 8) = 8 ppoll([{fd=13, events=POLLIN|POLLERR|POLLHUP}, {fd=7, events=POLLIN|POLLERR|POLLHUP}, {fd=12, events=POLLIN|POLLERR|POLLHUP}, {fd=3, events=POLLIN|POLLERR|POLLHUP}, {fd=6, events=POLLIN}, {fd=5, events=POLLIN}], 6, {0, 0}, NULL, 8) = 1 ([{fd=6, revents=POLLIN}], left {0, 0}) ppoll([{fd=13, events=POLLIN|POLLERR|POLLHUP}, {fd=7, events=POLLIN|POLLERR|POLLHUP}, {fd=12, events=POLLIN|POLLERR|POLLHUP}, {fd=3, events=POLLIN|POLLERR|POLLHUP}, {fd=6, events=POLLIN}, {fd=5, events=POLLIN}], 6, {0, 0}, NULL, 8) = 1 ([{fd=6, revents=POLLIN}], left {0, 0}) read(6, "\2\0\0\0\0\0\0\0", 16) = 8 ppoll([{fd=13, events=POLLIN|POLLERR|POLLHUP}, {fd=7, events=POLLIN|POLLERR|POLLHUP}, {fd=12, events=POLLIN|POLLERR|POLLHUP}, {fd=3, events=POLLIN|POLLERR|POLLHUP}, {fd=6, events=POLLIN}, {fd=5, events=POLLIN}], 6, {0, 0}, NULL, 8) = 0 (Timeout) read(6, 0x7fff697320e0, 16) = -1 EAGAIN (Resource temporarily unavailable) futex(0x7f011aaa0860, FUTEX_WAKE_PRIVATE, 1) = 1 ppoll([{fd=13, events=POLLIN|POLLERR|POLLHUP}, {fd=7, events=POLLIN|POLLERR|POLLHUP}, {fd=12, events=POLLIN|POLLERR|POLLHUP}, {fd=3, events=POLLIN|POLLERR|POLLHUP}, {fd=6, events=POLLIN}, {fd=5, events=POLLIN}], 6, {0, 53843633}, NULL, 8 +++ killed by SIGABRT +++ What does qemu say when aborting? Hm. I guess it says nothing, as else some write(2) should be seen by strace. So it is like abort() not assert(). And we have about 800 abort() calls in the code. Oh well. Indeed, it does not say anything, it simply crashes. Besides the strace log I created I can't find any other usefull information in other logfiles. a backtrace from a coredump or gdb would be better; it'll tell us the line the abort is on and the state at that point. Run it under gdb and do bt full and paste the result. I can't reproduce this, no matter how hard I try. Tried 4 virtio devices and 4 e1000 devices (8 network devices in total). Tried 2.1 and 2.2 and current git. It all Just Works (tm). What I'm doing wrong? :) I think your just not trying hard enough ;-) I have reproduced this on four different (bare metal) machines. I used the default ubuntu QEMU (2.0.0) and the latest GIT version. All machines where running ubuntu 14.04. I also tried to reproduce it on a virtualbox VM and I could only get it to crash when I put the network card of my virtual machine (the virtualbox one) in promiscuous modus. If I would set promiscuous modus to "deny all" in virtual box QEMU would not crash. I will do the gdb debug trace when I get back at work, I don't have a suitable system available at home to test this with. I have a hard time getting a full backtrace. I recompiled qemu with --enable-debug. Running QEMU with -s -S and then using GDB with debug using: attach remote localhost:1234 works but when QEMU has crashed the command "bt full" always gives back: (gdb) bt full No stack. I then ran QEMU directly from GDB using: run -nographic -smp 1 -m 128 -vnc 0.0.0.0:0 -netdev tap,id=tap_1,script=no,downscript=no,ifname=net_1_1,vhost=on -device virtio-net-pci,bootindex=1,id=nic_1,netdev=tap_1,mac=02:16:3F:00:00:FA -netdev tap,id=tap_2,script=no,downscript=no,ifname=net_1_2 -device e1000,bootindex=2,id=nic_2,netdev=tap_2,mac=02:16:3F:00:00:FB I have to press a key continuously to get QEMU "running" (otherwise it halts). Right before QEMU crashes I get this error in gdb: Program received signal SIGUSR1, User defined signal 1. spin_lock (lock=0x5a4f072640e15f00) at /home/bram/git/qemu/include/exec/spinlock.h:42 42 { (gdb) Continuing. qemu: fatal: Trying to execute code outside RAM or ROM at 0x0000000074f0a5f9 EAX=00000000 EBX=0001939c ECX=00009cf3 EDX=00001c79 ESI=f81ac248 EDI=0009bd70 EBP=0009bd20 ESP=0009bd14 EIP=6d016c49 EFL=00000083 [--S---C] CPL=0 II=0 A20=1 SMM=0 HLT=0 ES =0010 07ef39b0 ffffffff 00cf9300 DPL=0 DS [-WA] CS =0008 07ef39b0 ffffffff 00cf9f00 DPL=0 CS32 [CRA] SS =0010 07ef39b0 ffffffff 00cf9300 DPL=0 DS [-WA] DS =0010 07ef39b0 ffffffff 00cf9300 DPL=0 DS [-WA] FS =0010 07ef39b0 ffffffff 00cf9300 DPL=0 DS [-WA] GS =0010 07ef39b0 ffffffff 00cf9300 DPL=0 DS [-WA] LDT=0000 00000000 0000ffff 00008200 DPL=0 LDT TR =0000 00000000 0000ffff 00008b00 DPL=0 TSS32-busy GDT= 0009cf40 00000037 IDT= 07f8df10 00006deb CR0=00000011 CR2=00000000 CR3=00000000 CR4=00000000 DR0=0000000000000000 DR1=0000000000000000 DR2=0000000000000000 DR3=0000000000000000 DR6=00000000ffff0ff0 DR7=0000000000000400 CCS=000193bc CCD=ffffffe0 CCO=SUBL EFER=0000000000000000 FCW=037f FSW=0000 [ST=0] FTW=00 MXCSR=00001f80 FPR0=0000000000000000 0000 FPR1=0000000000000000 0000 FPR2=0000000000000000 0000 FPR3=0000000000000000 0000 FPR4=0000000000000000 0000 FPR5=0000000000000000 0000 FPR6=0000000000000000 0000 FPR7=0000000000000000 0000 XMM00=00000000000000000000000000000000 XMM01=00000000000000000000000000000000 XMM02=00000000000000000000000000000000 XMM03=00000000000000000000000000000000 XMM04=00000000000000000000000000000000 XMM05=00000000000000000000000000000000 XMM06=00000000000000000000000000000000 XMM07=00000000000000000000000000000000 Program received signal SIGABRT, Aborted. 0x00007ffff2797bb9 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56 56 ../nptl/sysdeps/unix/sysv/linux/raise.c: No such file or directory. (gdb) Continuing. [Thread 0x7fffdadff700 (LWP 27991) exited] [Thread 0x7ffff7fa8980 (LWP 27930) exited] Program terminated with signal SIGABRT, Aborted. If I run bt full right after that I get the exact same "No stack." error. I then created a core dump and ran the bt full on that, giving me this output: #0 0x00007f038ec86bb9 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56 resultvar = 0 pid = 29457 selftid = 29462 #1 0x00007f038ec89fc8 in __GI_abort () at abort.c:89 save_stage = 2 act = {__sigaction_handler = {sa_handler = 0x4c425553, sa_sigaction = 0x4c425553}, sa_mask = {__val = {139653355451349, 893353197568, 139653355450960, 139653355450970, 15179618306775321344, 1, 139653029951232, 0, 0, 139653029951936, 139653029951232, 139653029947664, 139653353918150, 1688849860263936, 139653257249376, 139653260837312}}, sa_flags = -1760754560, sa_restorer = 0x7f039708a7e0} sigs = {__val = {32, 0 }} #2 0x00007f039459377f in cpu_abort (cpu=0x7f03970d0480, fmt=0x7f03949f2830 "Trying to execute code outside RAM or ROM at 0x%016lx\n") at /home/bram/git/qemu/exec.c:805 ap = {{gp_offset = 24, fp_offset = 48, overflow_arg_area = 0x7f03813dea30, reg_save_area = 0x7f03813de970}} ap2 = {{gp_offset = 16, fp_offset = 48, overflow_arg_area = 0x7f03813dea30, reg_save_area = 0x7f03813de970}} #3 0x00007f03945f4d20 in get_page_addr_code (env1=0x7f03970d86d0, addr=1961928185) at /home/bram/git/qemu/cputlb.c:357 cc = 0x7f039708a7e0 mmu_idx = 2 page_index = 10 pd = 0 p = 0x7ef5000 mr = 0x7f0394e79620 cpu = 0x7f03970d0480 __func__ = "get_page_addr_code" #4 0x00007f039459c38e in tb_find_slow (env=0x7f03970d86d0, pc=1961928185, cs_base=133118384, flags=244) at /home/bram/git/qemu/cpu-exec.c:243 cpu = 0x7f03970d0480 tb = 0x4fb813deb00 ptb1 = 0x7f0394a03680 h = 32515 phys_pc = 139653395842176 phys_page1 = 139653029948228 virt_page2 = 139653029948232 #5 0x00007f039459c63c in tb_find_fast (env=0x7f03970d86d0) at /home/bram/git/qemu/cpu-exec.c:300 cpu = 0x7f03970d0480 tb = 0x0 cs_base = 133118384 pc = 1961928185 flags = 244 #6 0x00007f039459cade in cpu_x86_exec (env=0x7f03970d86d0) at /home/bram/git/qemu/cpu-exec.c:456 cpu = 0x7f03970d0480 cc = 0x7f039708a7e0 __func__ = "cpu_x86_exec" x86_cpu = 0x7f03970d0480 ret = 65536 interrupt_request = 2 tb = 0x7f038166b6d8 tc_ptr = 0x7f0383576ad0 "A\213n\364\205\355\017\205\245" next_tb = 0 sc = {diff_clk = 139653029948416, last_cpu_icount = 139653351987455, realtime_clock = 139653029948448} have_tb_lock = true #7 0x00007f03945ca913 in tcg_cpu_exec (env=0x7f03970d86d0) at /home/bram/git/qemu/cpus.c:1363 cpu = 0x7f03970d0480 ret = 32515 #8 0x00007f03945caa28 in tcg_exec_all () at /home/bram/git/qemu/cpus.c:1396 cpu = 0x7f03970d0480 env = 0x7f03970d86d0 r = 32515 #9 0x00007f03945c9d47 in qemu_tcg_cpu_thread_fn (arg=0x7f03970d0480) at /home/bram/git/qemu/cpus.c:1042 cpu = 0x0 #10 0x00007f038f01e182 in start_thread (arg=0x7f03813df700) at pthread_create.c:312 __res = pd = 0x7f03813df700 now = unwind_buf = {cancel_jmp_buf = {{jmp_buf = {139653029951232, 5823000673722110008, 0, 0, 139653029951936, 139653029951232, -5852297728485158856, -5852310887850110920}, mask_was_saved = 0}}, priv = {pad = {0x0, 0x0, 0x0, 0x0}, data = {prev = 0x0, cleanup = 0x0, canceltype = 0}}} not_first_call = pagesize_m1 = sp = freesize = __PRETTY_FUNCTION__ = "start_thread" #11 0x00007f038ed4aefd in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:111 No locals. Im not sure if this is all the information needed? I attached the core dump. On 29 December 2014 at 08:29, Bram Klein Gunnewiek