diff options
Diffstat (limited to 'gitlab/issues/target_i386/host_x86/accel_KVM')
4 files changed, 173 insertions, 0 deletions
diff --git a/gitlab/issues/target_i386/host_x86/accel_KVM/1151.toml b/gitlab/issues/target_i386/host_x86/accel_KVM/1151.toml new file mode 100644 index 000000000..b26fba89f --- /dev/null +++ b/gitlab/issues/target_i386/host_x86/accel_KVM/1151.toml @@ -0,0 +1,59 @@ +id = 1151 +title = "when guest unexpect shutdown,can't enter system,the terminal has a black screen" +state = "opened" +created_at = "2022-08-12T06:34:17.125Z" +closed_at = "n/a" +labels = ["accel: KVM", "host: x86", "target: i386"] +url = "https://gitlab.com/qemu-project/qemu/-/issues/1151" +host-os = "centos8" +host-arch = "x86" +qemu-version = "6.2.0" +guest-os = "Windows 7" +guest-arch = "x86" +description = """""" +reproduce = """1.guest unexpect shutdown + +2.when start again,cpu usage is high and can't enter the guest system + +3.restart guest can recovery + +**libvirt print:** + +`2022-08-11 14:39:58.080+0000: 1942: warning : qemuDomainObjTaint:6079 : Domain id=117 name='GDT99d2578e-f06e-4fbe-88dd-7d9dd56fd02d' uuid=99d2578e-f06e-4fbe-88dd-7d9dd56fd02d is tainted: high-privileges + +2022-08-11 14:39:58.080+0000: 1942: warning : qemuDomainObjTaint:6079 : Domain id=117 name='GDT99d2578e-f06e-4fbe-88dd-7d9dd56fd02d' uuid=99d2578e-f06e-4fbe-88dd-7d9dd56fd02d is tainted: custom-argv + +2022-08-11 14:40:28.792+0000: 741037: warning : qemuDomainObjBeginJobInternal:946 : Cannot start job (modify, none, none) for domain GDT99d2578e-f06e-4fbe-88dd-7d9dd56fd02d; current job is (none, none, migration in) owned by (0 <null>, 0 <null>, 0 remoteDispatchDomainMigratePrepare3Params (flags=0x203)) for (0s, 0s, 30s) + +2022-08-11 14:40:28.792+0000: 741037: error : qemuDomainObjBeginJobInternal:968 : Timed out during operation: cannot acquire state change lock (held by monitor=remoteDispatchDomainMigratePrepare3Params) +` + + +**user perf to analyse:** + +\\#top -d 3 -Hp 1311519 + + + +\\#perf record -a -g -p 1311519 sleep 20 + +\\#report -n --header --stdio + + + + +**query kvm stat:** + + \\# perf stat -e 'kvm:*' 
-a -p 1311519 sleep 20 + + + + +kvm vmexit stat: + +\\#perf kvm stat record -a -p 1311519 sleep 10 + +\\#perf kvm stat report --event=vmexit + +""" +additional = "n/a" diff --git a/gitlab/issues/target_i386/host_x86/accel_KVM/1152.toml b/gitlab/issues/target_i386/host_x86/accel_KVM/1152.toml new file mode 100644 index 000000000..a81178c00 --- /dev/null +++ b/gitlab/issues/target_i386/host_x86/accel_KVM/1152.toml @@ -0,0 +1,36 @@ +id = 1152 +title = "Windows crashes on resuming from sleep if hv-tlbflush is enabled" +state = "opened" +created_at = "2022-08-12T09:17:41.461Z" +closed_at = "n/a" +labels = ["accel: KVM", "host: x86", "target: i386"] +url = "https://gitlab.com/qemu-project/qemu/-/issues/1152" +host-os = "Arch Linux" +host-arch = "x86_64 Intel i9-12900K" +qemu-version = "7.0.0" +guest-os = "Windows 10 21H2" +guest-arch = "x86_64" +description = """The above steps cause my Windows VM to BSOD immediately upon waking up (even before restarting the display driver in my case).""" +reproduce = """1. Boot Windows +2. Tell Windows to go to sleep (observe that qemu's state switches to suspended) +3. Cause windows to wake up (e.g. using the `system_wakeup` HMP command)""" +additional = """Looking at the crash dumps always shows the "ATTEMPTED WRITE TO READONLY MEMORY" error, and always with this stack trace: + +``` +nt!KeBugCheckEx +nt!MiRaisedIrqlFault+0x1413a6 +nt!MmAccessFault+0x4ef +nt!KiPageFault+0x35e +nt!MiIncreaseUsedPtesCount+0x12 +nt!MiBuildForkPte+0xc6 +nt!MiCloneVads+0x4ab +nt!MiCloneProcessAddressSpace+0x261 +nt!MmInitializeProcessAddressSpace+0x1cb631 +nt!PspAllocateProcess+0x1d13 +nt!PspCreateProcess+0x242 +nt!NtCreateProcessEx+0x85 +nt!KiSystemServiceCopyEnd+0x25 +ntdll!NtCreateProcessEx+0x14 +``` + +However, the process that is being created here is always `WerFault.exe`, i.e. the crash reporter. The crashing process is seemingly random. Removing `hv-tlbflush` from the command line resolves the problem. 
Hence, my hypothesis is that due to improper TLB flushing during wakeup, a random application on the core will crash, which spawns `WerFault.exe` which then immediately crashes again inside the kernel (also because of bad/stale TLB contents) and causes the BSOD. Perhaps one core wakes up first, requests a TLB flush, which is then *not* propagated to sleeping cores due to hv-tlbflush. Then one of those cores wakes up without the TLB flush?""" diff --git a/gitlab/issues/target_i386/host_x86/accel_KVM/2574.toml b/gitlab/issues/target_i386/host_x86/accel_KVM/2574.toml new file mode 100644 index 000000000..7685fe45b --- /dev/null +++ b/gitlab/issues/target_i386/host_x86/accel_KVM/2574.toml @@ -0,0 +1,57 @@ +id = 2574 +title = "VM hang: 'error: kvm run failed Bad address' with some AMD GPUs since kernel 6.7" +state = "opened" +created_at = "2024-09-16T20:43:20.431Z" +closed_at = "n/a" +labels = ["accel: KVM", "host: x86", "target: i386"] +url = "https://gitlab.com/qemu-project/qemu/-/issues/2574" +host-os = "Debian unstable" +host-arch = "x86 (AMD 5600G)" +qemu-version = "9.1.0 (Debian 1:9.1.0+ds-3+b1)" +guest-os = "Debian unstable" +guest-arch = "x86" +description = """The Debian ROCm Team runs GPU-utilizing test workloads in QEMU VMs into which we pass through AMD GPUs attached to PCIe x16 slots on the host. We do this to quickly test various Debian distributions/kernels/firmwares on a single physical host per GPU, and to isolate the host as much as possible from potentially hostile code. + +Starting with kernel 6.7 in the **guest**, with Navi 31 GPUs (eg: RX 7900 XT), as soon as anything triggers access to the GPU's memory, the VM hangs with `error: kvm run failed Bad address` and dumps its state. + +I gather that [this](https://gitlab.com/qemu-project/qemu/-/blob/ea9cdbcf3a0b8d5497cddf87990f1b39d8f3bb0a/accel/kvm/kvm-all.c#L3046-L3048) is where this message originates from. 
It would seem that the preceding [ioctl](https://gitlab.com/qemu-project/qemu/-/blob/ea9cdbcf3a0b8d5497cddf87990f1b39d8f3bb0a/accel/kvm/kvm-all.c#L3025) runs into `EFAULT` which eventually leads to a break out of the surrounding loop. + +Since we can reliably reproduce this starting with 6.7, our assumption is that this is caused by a change in the kernel and/or the `amdgpu` driver. However, as the error originates from kvm *on the host*, we could not rule out that this might also be an emulation issue. In particular, it was only 9.1 [c15e568](c15e568) where the handling of the `EFAULT` and `KVM_EXIT_MEMORY_FAULT` case was added, so perhaps we ran into something that is still incomplete. + +I'd appreciate any advice you could give us for further debugging. We will bisect 6.7 to see what could have triggered this on the guest side, but is there something that we can do on the host to further track this down, in particular which `-trace`s might be helpful? + +Other notes: +- The VM boots and runs fine, GPU initializes fine according to `dmesg`. The issue is only triggered on GPU utilization +- The problematic GPU in question worked fine with kernels 6.3 - 6.6 +- All other GPU architectures that we test this way (eg: Navi 2x) do not experience this issue, they work fine with all kernels we tested +- We have checked with more than one GPU, to rule out a physical defect""" +reproduce = """Reproducing the issue requires +1. A suitable image +2. Access to a Navi 3x card. Remote access can be arranged, if necessary. + +Building a suitable image can be rather complicated and requires a Debian host. 
If needed, it would be easier for me to just share a pre-built image.""" +additional = """This is dumped just before the VM hangs: +``` +ROCk module is loaded +error: kvm run failed Bad address +RAX=00000000000035c8 RBX=00000000000006ba RCX=0003000108b08073 RDX=00000000000006b9 +RSI=ffff9994b00035c8 RDI=ffff899403c80000 RBP=ffff899408b285e0 RSP=ffff9994816ab620 +R8 =0003000000000073 R9 =ffff9994b0000000 R10=ffff899403c8fb18 R11=ffff899408b065b8 +R12=ffff899403c80000 R13=0003000000000073 R14=ffff9994b0000000 R15=00000000000006ba +RIP=ffffffffc11d8f93 RFL=00000282 [--S----] CPL=0 II=0 A20=1 SMM=0 HLT=0 +ES =0000 0000000000000000 00000000 00000000 +CS =0010 0000000000000000 ffffffff 00a09b00 DPL=0 CS64 [-RA] +SS =0018 0000000000000000 ffffffff 00c09300 DPL=0 DS [-WA] +DS =0000 0000000000000000 00000000 00000000 +FS =0000 00007faa76aea780 00000000 00000000 +GS =0000 ffff899f0dd80000 00000000 00000000 +LDT=0000 0000000000000000 00000000 00000000 +TR =0040 fffffe41c66fc000 00004087 00008b00 DPL=0 TSS64-busy +GDT= fffffe41c66fa000 0000007f +IDT= fffffe0000000000 00000fff +CR0=80050033 CR2=000055bd5a5d8598 CR3=000000010342c000 CR4=00750ef0 +DR0=0000000000000000 DR1=0000000000000000 DR2=0000000000000000 DR3=0000000000000000 +DR6=00000000ffff0ff0 DR7=0000000000000400 +EFER=0000000000000d01 +Code=ff ff 00 00 48 21 c1 8d 04 d5 00 00 00 00 4c 09 c1 48 01 c6 <48> 89 0e 31 c0 e9 6e b1 92 d2 0f 1f 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 66 +```""" diff --git a/gitlab/issues/target_i386/host_x86/accel_KVM/2609.toml b/gitlab/issues/target_i386/host_x86/accel_KVM/2609.toml new file mode 100644 index 000000000..13a33780f --- /dev/null +++ b/gitlab/issues/target_i386/host_x86/accel_KVM/2609.toml @@ -0,0 +1,21 @@ +id = 2609 +title = "Blue screen in Windows XP" +state = "closed" +created_at = "2024-10-05T09:30:01.873Z" +closed_at = "2024-10-20T12:21:16.168Z" +labels = ["accel: KVM", "guest: Windows", "host: x86", "target: i386"] +url = 
"https://gitlab.com/qemu-project/qemu/-/issues/2609" +host-os = "Ubuntu 24.04.1 LTS (GNU/Linux )" +host-arch = "x86_x64" +qemu-version = "9.1.0" +guest-os = "n/a" +guest-arch = "n/a" +description = """When starting the installation of Windows XP when using a virtioblk device you immediately get a bluescreen: `STOP: 0x000000A5 (0x00000002, 0x8A1A6008, 0xE1018808, 0x8A1B7F00)`. I think this happens even before it loads the SATA drivers that are slipstreamed in the ISO. + +After a lot of Googling about this error 0x000000A5 I found some posts suggesting that changing the machine type from `q35` to `pc-q35-2.10` solves the issue. And it worked. Anything above 2.10 (for example 2.11) and the bluescreens return. + +So I always used this solution, but in QEMU 9.1.0 it warns that `pc-q35-2.10` will be removed soon. This would mean there is no way anymore to install XP to a SATA disk unattendly.""" +reproduce = """1. Use a virtioblk disk and SATA drivers +2. Start the Windows XP installer +3. Bluescreen will appear""" +additional = "n/a" |