From e5634e2806195bee44407853c4bf8776f7abfa4f Mon Sep 17 00:00:00 2001 From: Christian Krinitsin Date: Sun, 1 Jun 2025 21:19:55 +0200 Subject: add the outputs of the first five revisions of the classifier --- classification_output/01/instruction/0966902 | 39 - classification_output/01/instruction/11357571 | 47 + classification_output/01/instruction/11933524 | 1125 ++++++ classification_output/01/instruction/24190340 | 2056 ++++++++++ classification_output/01/instruction/26095107 | 158 + classification_output/01/instruction/2609717 | 4939 ------------------------- classification_output/01/instruction/2880487 | 187 - classification_output/01/instruction/33802194 | 4939 +++++++++++++++++++++++++ classification_output/01/instruction/3457423 | 40 - classification_output/01/instruction/42226390 | 187 + classification_output/01/instruction/50773216 | 110 + classification_output/01/instruction/51610399 | 308 ++ classification_output/01/instruction/55961334 | 39 + classification_output/01/instruction/5843372 | 2056 ---------- classification_output/01/instruction/6117378 | 31 - classification_output/01/instruction/62179944 | 31 + classification_output/01/instruction/63565653 | 49 + classification_output/01/instruction/70868267 | 40 + classification_output/01/instruction/73660729 | 31 + classification_output/01/instruction/7647456 | 110 - classification_output/01/instruction/7658242 | 1125 ------ classification_output/01/instruction/7733130 | 47 - classification_output/01/instruction/7960594 | 158 - classification_output/01/instruction/8019995 | 31 - classification_output/01/instruction/8566429 | 49 - classification_output/01/instruction/9818783 | 308 -- 26 files changed, 9120 insertions(+), 9120 deletions(-) delete mode 100644 classification_output/01/instruction/0966902 create mode 100644 classification_output/01/instruction/11357571 create mode 100644 classification_output/01/instruction/11933524 create mode 100644 classification_output/01/instruction/24190340 create mode 100644 classification_output/01/instruction/26095107 delete mode 100644 classification_output/01/instruction/2609717 delete mode 100644 classification_output/01/instruction/2880487 create mode 100644 classification_output/01/instruction/33802194 delete mode 100644 classification_output/01/instruction/3457423 create mode 100644 classification_output/01/instruction/42226390 create mode 100644 classification_output/01/instruction/50773216 create mode 100644 classification_output/01/instruction/51610399 create mode 100644 classification_output/01/instruction/55961334 delete mode 100644 classification_output/01/instruction/5843372 delete mode 100644 classification_output/01/instruction/6117378 create mode 100644 classification_output/01/instruction/62179944 create mode 100644 classification_output/01/instruction/63565653 create mode 100644 classification_output/01/instruction/70868267 create mode 100644 classification_output/01/instruction/73660729 delete mode 100644 classification_output/01/instruction/7647456 delete mode 100644 classification_output/01/instruction/7658242 delete mode 100644 classification_output/01/instruction/7733130 delete mode 100644 classification_output/01/instruction/7960594 delete mode 100644 classification_output/01/instruction/8019995 delete mode 100644 classification_output/01/instruction/8566429 delete mode 100644 classification_output/01/instruction/9818783 (limited to 'classification_output/01/instruction') diff --git a/classification_output/01/instruction/0966902 
b/classification_output/01/instruction/0966902 deleted file mode 100644 index 80cdabd29..000000000 --- a/classification_output/01/instruction/0966902 +++ /dev/null @@ -1,39 +0,0 @@ -instruction: 0.803 -semantic: 0.775 -mistranslation: 0.718 -other: 0.715 - -[Bug] "-ht" flag ignored under KVM - guest still reports HT - -Hi Community, -We have observed that the 'ht' feature bit cannot be disabled when QEMU runs -with KVM acceleration. -qemu-system-x86_64 \ - --enable-kvm \ - -machine q35 \ - -cpu host,-ht \ - -smp 4 \ - -m 4G \ - -drive file=rootfs.img,format=raw \ - -nographic \ - -append 'console=ttyS0 root=/dev/sda rw' -Because '-ht' is specified, the guest should expose no HT capability -(cpuid.1.edx[28] = 0), and /proc/cpuinfo shouldn't show HT feature, but we still -saw ht in linux guest when run 'cat /proc/cpuinfo'. -XiaoYao mentioned that: - -It has been the behavior of QEMU since - - commit 400281af34e5ee6aa9f5496b53d8f82c6fef9319 - Author: Andre Przywara - Date: Wed Aug 19 15:42:42 2009 +0200 - - set CPUID bits to present cores and threads topology - -that we cannot remove HT CPUID bit from guest via "-cpu xxx,-ht" if the -VM has >= 2 vcpus. -I'd like to know whether there's a plan to address this issue, or if the current -behaviour is considered acceptable. -Best regards, -Ewan. - diff --git a/classification_output/01/instruction/11357571 b/classification_output/01/instruction/11357571 new file mode 100644 index 000000000..1c3bc483f --- /dev/null +++ b/classification_output/01/instruction/11357571 @@ -0,0 +1,47 @@ +instruction: 0.758 +semantic: 0.694 +other: 0.687 +mistranslation: 0.516 + +[Qemu-devel] [BUG] VNC: client won't send FramebufferUpdateRequest if job in flight is aborted + +Hi Gerd, Daniel. + +We noticed that if VncSharePolicy was configured with +VNC_SHARE_POLICY_FORCE_SHARED mode and +multiple vnc clients opened vnc connections, some clients could go blank screen +at high probability. +This problem can be reproduced when we regularly reboot suse12sp3 in graphic +mode both +with RealVNC and noVNC client. + +Then we dig into it and find out that some clients go blank screen because they +don't +send FramebufferUpdateRequest any more. One step further, we notice that each +time +the job in flight is aborted one client go blank screen. + +The bug is triggered in the following procedure. +Guest reboot => graphic mode switch => graphic_hw_update => vga_update_display +=> vga_draw_graphic (full_update = 1) => dpy_gfx_replace_surface => +vnc_dpy_switch => +vnc_abort_display_jobs (client may have job in flight) => job removed from the +queue +If one client has vnc job in flight, *vnc_abort_display_jobs* will wait until +its job is abandoned. +This behavior is done in vnc_worker_thread_loop when 'if (job->vs->ioc == NULL +|| job->vs->abort == true)' +branch is taken. + +As we can see, *vnc_abort_display_jobs* is intended to do some optimization to +avoid unnecessary client update. +But if client sends FramebufferUpdateRequest for some graphic area and its +FramebufferUpdate response job +is abandoned, the client may wait for the response and never send new +FramebufferUpdateRequest, which may +case the client go blank screen forever. + +So I am wondering whether we should drop the *vnc_abort_display_jobs* +optimization or do some trick here +to push the client to send new FramebufferUpdateRequest. Do you have any idea ? 
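For context, the early-return branch referred to above (in vnc_worker_thread_loop) has roughly the following shape. This is a minimal sketch only: the quoted condition comes from the report itself, but the surrounding helper names here are illustrative and not the actual QEMU source.

    /* Sketch of the abort path in the VNC worker loop (illustrative). */
    static void vnc_worker_process_job_sketch(VncJob *job)
    {
        /*
         * vnc_abort_display_jobs() marks queued jobs as aborted. A job
         * taken off the queue in that state is dropped here without ever
         * producing a FramebufferUpdate, so a client that already sent
         * its FramebufferUpdateRequest keeps waiting for a reply that
         * will never arrive.
         */
        if (job->vs->ioc == NULL || job->vs->abort == true) {
            vnc_job_drop(job);    /* illustrative helper name */
            return;
        }
        /* ... otherwise encode the dirty rectangles and send the update ... */
    }
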
+ diff --git a/classification_output/01/instruction/11933524 b/classification_output/01/instruction/11933524 new file mode 100644 index 000000000..3ff255be0 --- /dev/null +++ b/classification_output/01/instruction/11933524 @@ -0,0 +1,1125 @@ +instruction: 0.775 +other: 0.771 +mistranslation: 0.719 +semantic: 0.673 + +[BUG] hw/i386/pc.c: CXL Fixed Memory Window should not reserve e820 in bios + +Early-boot e820 records will be inserted by the bios/efi/early boot +software and be reported to the kernel via insert_resource. Later, when +CXL drivers iterate through the regions again, they will insert another +resource and make the RESERVED memory area a child. + +This RESERVED memory area causes the memory region to become unusable, +and as a result attempting to create memory regions with + + `cxl create-region ...` + +Will fail due to the RESERVED area intersecting with the CXL window. + + +During boot the following traceback is observed: + +0xffffffff81101650 in insert_resource_expand_to_fit () +0xffffffff83d964c5 in e820__reserve_resources_late () +0xffffffff83e03210 in pcibios_resource_survey () +0xffffffff83e04f4a in pcibios_init () + +Which produces a call to reserve the CFMWS area: + +(gdb) p *new +$54 = {start = 0x290000000, end = 0x2cfffffff, name = "Reserved", + flags = 0x200, desc = 0x7, parent = 0x0, sibling = 0x0, + child = 0x0} + +Later the Kernel parses ACPI tables and reserves the exact same area as +the CXL Fixed Memory Window. The use of `insert_resource_conflict` +retains the RESERVED region and makes it a child of the new region. + +0xffffffff811016a4 in insert_resource_conflict () + insert_resource () +0xffffffff81a81389 in cxl_parse_cfmws () +0xffffffff818c4a81 in call_handler () + acpi_parse_entries_array () + +(gdb) p/x *new +$59 = {start = 0x290000000, end = 0x2cfffffff, name = "CXL Window 0", + flags = 0x200, desc = 0x0, parent = 0x0, sibling = 0x0, + child = 0x0} + +This produces the following output in /proc/iomem: + +590000000-68fffffff : CXL Window 0 + 590000000-68fffffff : Reserved + +This reserved area causes `get_free_mem_region()` to fail due to a check +against `__region_intersects()`. Due to this reserved area, the +intersect check will only ever return REGION_INTERSECTS, which causes +`cxl create-region` to always fail. + +Signed-off-by: Gregory Price +--- + hw/i386/pc.c | 2 -- + 1 file changed, 2 deletions(-) + +diff --git a/hw/i386/pc.c b/hw/i386/pc.c +index 566accf7e6..5bf5465a21 100644 +--- a/hw/i386/pc.c ++++ b/hw/i386/pc.c +@@ -1061,7 +1061,6 @@ void pc_memory_init(PCMachineState *pcms, + hwaddr cxl_size = MiB; + + cxl_base = pc_get_cxl_range_start(pcms); +- e820_add_entry(cxl_base, cxl_size, E820_RESERVED); + memory_region_init(mr, OBJECT(machine), "cxl_host_reg", cxl_size); + memory_region_add_subregion(system_memory, cxl_base, mr); + cxl_resv_end = cxl_base + cxl_size; +@@ -1077,7 +1076,6 @@ void pc_memory_init(PCMachineState *pcms, + memory_region_init_io(&fw->mr, OBJECT(machine), &cfmws_ops, fw, + "cxl-fixed-memory-region", fw->size); + memory_region_add_subregion(system_memory, fw->base, &fw->mr); +- e820_add_entry(fw->base, fw->size, E820_RESERVED); + cxl_fmw_base += fw->size; + cxl_resv_end = cxl_fmw_base; + } +-- +2.37.3 + +Early-boot e820 records will be inserted by the bios/efi/early boot +software and be reported to the kernel via insert_resource. Later, when +CXL drivers iterate through the regions again, they will insert another +resource and make the RESERVED memory area a child. 
+ +This RESERVED memory area causes the memory region to become unusable, +and as a result attempting to create memory regions with + + `cxl create-region ...` + +Will fail due to the RESERVED area intersecting with the CXL window. + + +During boot the following traceback is observed: + +0xffffffff81101650 in insert_resource_expand_to_fit () +0xffffffff83d964c5 in e820__reserve_resources_late () +0xffffffff83e03210 in pcibios_resource_survey () +0xffffffff83e04f4a in pcibios_init () + +Which produces a call to reserve the CFMWS area: + +(gdb) p *new +$54 = {start = 0x290000000, end = 0x2cfffffff, name = "Reserved", + flags = 0x200, desc = 0x7, parent = 0x0, sibling = 0x0, + child = 0x0} + +Later the Kernel parses ACPI tables and reserves the exact same area as +the CXL Fixed Memory Window. The use of `insert_resource_conflict` +retains the RESERVED region and makes it a child of the new region. + +0xffffffff811016a4 in insert_resource_conflict () + insert_resource () +0xffffffff81a81389 in cxl_parse_cfmws () +0xffffffff818c4a81 in call_handler () + acpi_parse_entries_array () + +(gdb) p/x *new +$59 = {start = 0x290000000, end = 0x2cfffffff, name = "CXL Window 0", + flags = 0x200, desc = 0x0, parent = 0x0, sibling = 0x0, + child = 0x0} + +This produces the following output in /proc/iomem: + +590000000-68fffffff : CXL Window 0 + 590000000-68fffffff : Reserved + +This reserved area causes `get_free_mem_region()` to fail due to a check +against `__region_intersects()`. Due to this reserved area, the +intersect check will only ever return REGION_INTERSECTS, which causes +`cxl create-region` to always fail. + +Signed-off-by: Gregory Price +--- + hw/i386/pc.c | 2 -- + 1 file changed, 2 deletions(-) + +diff --git a/hw/i386/pc.c b/hw/i386/pc.c +index 566accf7e6..5bf5465a21 100644 +--- a/hw/i386/pc.c ++++ b/hw/i386/pc.c +@@ -1061,7 +1061,6 @@ void pc_memory_init(PCMachineState *pcms, + hwaddr cxl_size = MiB; +cxl_base = pc_get_cxl_range_start(pcms); +- e820_add_entry(cxl_base, cxl_size, E820_RESERVED); + memory_region_init(mr, OBJECT(machine), "cxl_host_reg", cxl_size); + memory_region_add_subregion(system_memory, cxl_base, mr); + cxl_resv_end = cxl_base + cxl_size; +@@ -1077,7 +1076,6 @@ void pc_memory_init(PCMachineState *pcms, + memory_region_init_io(&fw->mr, OBJECT(machine), &cfmws_ops, +fw, + "cxl-fixed-memory-region", fw->size); + memory_region_add_subregion(system_memory, fw->base, &fw->mr); +Or will this be subregion of cxl_base? + +Thanks, +Pankaj +- e820_add_entry(fw->base, fw->size, E820_RESERVED); + cxl_fmw_base += fw->size; + cxl_resv_end = cxl_fmw_base; + } + +> +> - e820_add_entry(cxl_base, cxl_size, E820_RESERVED); +> +> memory_region_init(mr, OBJECT(machine), "cxl_host_reg", cxl_size); +> +> memory_region_add_subregion(system_memory, cxl_base, mr); +> +> cxl_resv_end = cxl_base + cxl_size; +> +> @@ -1077,7 +1076,6 @@ void pc_memory_init(PCMachineState *pcms, +> +> memory_region_init_io(&fw->mr, OBJECT(machine), +> +> &cfmws_ops, fw, +> +> "cxl-fixed-memory-region", +> +> fw->size); +> +> memory_region_add_subregion(system_memory, fw->base, +> +> &fw->mr); +> +> +Or will this be subregion of cxl_base? +> +> +Thanks, +> +Pankaj +The memory region backing this memory area still has to be initialized +and added in the QEMU system, but it will now be initialized for use by +linux after PCI/ACPI setup occurs and the CXL driver discovers it via +CDAT. 
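To see why the nested "Reserved" child is fatal for region creation, a small standalone model helps. The following is illustrative C, not the kernel's resource code or __region_intersects(): it only models the behaviour described in the report, where every probe inside the CXL window collides with the same-sized Reserved child, so no free sub-range is ever found.

    #include <stdio.h>
    #include <stdbool.h>
    #include <stdint.h>

    struct range { uint64_t start, end; const char *name; };

    /* Does [start, end] overlap any of the busy ranges? */
    static bool intersects(uint64_t start, uint64_t end,
                           const struct range *busy, int n)
    {
        for (int i = 0; i < n; i++) {
            if (start <= busy[i].end && end >= busy[i].start) {
                return true;
            }
        }
        return false;
    }

    int main(void)
    {
        /* "CXL Window 0" with a same-sized "Reserved" child, as in /proc/iomem above. */
        struct range window = { 0x590000000ULL, 0x68fffffffULL, "CXL Window 0" };
        struct range busy[] = {
            { 0x590000000ULL, 0x68fffffffULL, "Reserved" },
        };

        /* Probe the window for any free 256 MiB chunk. */
        uint64_t sz = 256ULL << 20;
        for (uint64_t s = window.start; s + sz - 1 <= window.end; s += sz) {
            if (!intersects(s, s + sz - 1, busy, 1)) {
                printf("free range at %#llx\n", (unsigned long long)s);
                return 0;
            }
        }
        printf("no free range: every probe intersects the Reserved child\n");
        return 1;
    }
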
+ +It's also still possible to assign this area a static memory region at +bool by setting up the SRATs in the ACPI tables, but that patch is not +upstream yet. + +On Tue, Oct 18, 2022 at 5:14 AM Gregory Price wrote: +> +> +Early-boot e820 records will be inserted by the bios/efi/early boot +> +software and be reported to the kernel via insert_resource. Later, when +> +CXL drivers iterate through the regions again, they will insert another +> +resource and make the RESERVED memory area a child. +I have already sent a patch +https://www.mail-archive.com/qemu-devel@nongnu.org/msg882012.html +. +When the patch is applied, there would not be any reserved entries +even with passing E820_RESERVED . +So this patch needs to be evaluated in the light of the above patch I +sent. Once you apply my patch, does the issue still exist? + +> +> +This RESERVED memory area causes the memory region to become unusable, +> +and as a result attempting to create memory regions with +> +> +`cxl create-region ...` +> +> +Will fail due to the RESERVED area intersecting with the CXL window. +> +> +> +During boot the following traceback is observed: +> +> +0xffffffff81101650 in insert_resource_expand_to_fit () +> +0xffffffff83d964c5 in e820__reserve_resources_late () +> +0xffffffff83e03210 in pcibios_resource_survey () +> +0xffffffff83e04f4a in pcibios_init () +> +> +Which produces a call to reserve the CFMWS area: +> +> +(gdb) p *new +> +$54 = {start = 0x290000000, end = 0x2cfffffff, name = "Reserved", +> +flags = 0x200, desc = 0x7, parent = 0x0, sibling = 0x0, +> +child = 0x0} +> +> +Later the Kernel parses ACPI tables and reserves the exact same area as +> +the CXL Fixed Memory Window. The use of `insert_resource_conflict` +> +retains the RESERVED region and makes it a child of the new region. +> +> +0xffffffff811016a4 in insert_resource_conflict () +> +insert_resource () +> +0xffffffff81a81389 in cxl_parse_cfmws () +> +0xffffffff818c4a81 in call_handler () +> +acpi_parse_entries_array () +> +> +(gdb) p/x *new +> +$59 = {start = 0x290000000, end = 0x2cfffffff, name = "CXL Window 0", +> +flags = 0x200, desc = 0x0, parent = 0x0, sibling = 0x0, +> +child = 0x0} +> +> +This produces the following output in /proc/iomem: +> +> +590000000-68fffffff : CXL Window 0 +> +590000000-68fffffff : Reserved +> +> +This reserved area causes `get_free_mem_region()` to fail due to a check +> +against `__region_intersects()`. Due to this reserved area, the +> +intersect check will only ever return REGION_INTERSECTS, which causes +> +`cxl create-region` to always fail. 
+> +> +Signed-off-by: Gregory Price +> +--- +> +hw/i386/pc.c | 2 -- +> +1 file changed, 2 deletions(-) +> +> +diff --git a/hw/i386/pc.c b/hw/i386/pc.c +> +index 566accf7e6..5bf5465a21 100644 +> +--- a/hw/i386/pc.c +> ++++ b/hw/i386/pc.c +> +@@ -1061,7 +1061,6 @@ void pc_memory_init(PCMachineState *pcms, +> +hwaddr cxl_size = MiB; +> +> +cxl_base = pc_get_cxl_range_start(pcms); +> +- e820_add_entry(cxl_base, cxl_size, E820_RESERVED); +> +memory_region_init(mr, OBJECT(machine), "cxl_host_reg", cxl_size); +> +memory_region_add_subregion(system_memory, cxl_base, mr); +> +cxl_resv_end = cxl_base + cxl_size; +> +@@ -1077,7 +1076,6 @@ void pc_memory_init(PCMachineState *pcms, +> +memory_region_init_io(&fw->mr, OBJECT(machine), &cfmws_ops, +> +fw, +> +"cxl-fixed-memory-region", fw->size); +> +memory_region_add_subregion(system_memory, fw->base, +> +&fw->mr); +> +- e820_add_entry(fw->base, fw->size, E820_RESERVED); +> +cxl_fmw_base += fw->size; +> +cxl_resv_end = cxl_fmw_base; +> +} +> +-- +> +2.37.3 +> + +This patch does not resolve the issue, reserved entries are still created. +[    0.000000] BIOS-e820: [mem 0x0000000280000000-0x00000002800fffff] reserved +[    0.000000] BIOS-e820: [mem 0x0000000290000000-0x000000029fffffff] reserved +# cat /proc/iomem +290000000-29fffffff : CXL Window 0 +  290000000-29fffffff : Reserved +# cxl create-region -m -d decoder0.0 -w 1 -g 256 mem0 +cxl region: create_region: region0: set_size failed: Numerical result out of range +cxl region: cmd_create_region: created 0 regions +On Tue, Oct 18, 2022 at 2:05 AM Ani Sinha < +ani@anisinha.ca +> wrote: +On Tue, Oct 18, 2022 at 5:14 AM Gregory Price < +gourry.memverge@gmail.com +> wrote: +> +> Early-boot e820 records will be inserted by the bios/efi/early boot +> software and be reported to the kernel via insert_resource.  Later, when +> CXL drivers iterate through the regions again, they will insert another +> resource and make the RESERVED memory area a child. +I have already sent a patch +https://www.mail-archive.com/qemu-devel@nongnu.org/msg882012.html +. +When the patch is applied, there would not be any reserved entries +even with passing E820_RESERVED . +So this patch needs to be evaluated in the light of the above patch I +sent. Once you apply my patch, does the issue still exist? +> +> This RESERVED memory area causes the memory region to become unusable, +> and as a result attempting to create memory regions with +> +>     `cxl create-region ...` +> +> Will fail due to the RESERVED area intersecting with the CXL window. +> +> +> During boot the following traceback is observed: +> +> 0xffffffff81101650 in insert_resource_expand_to_fit () +> 0xffffffff83d964c5 in e820__reserve_resources_late () +> 0xffffffff83e03210 in pcibios_resource_survey () +> 0xffffffff83e04f4a in pcibios_init () +> +> Which produces a call to reserve the CFMWS area: +> +> (gdb) p *new +> $54 = {start = 0x290000000, end = 0x2cfffffff, name = "Reserved", +>        flags = 0x200, desc = 0x7, parent = 0x0, sibling = 0x0, +>        child = 0x0} +> +> Later the Kernel parses ACPI tables and reserves the exact same area as +> the CXL Fixed Memory Window.  The use of `insert_resource_conflict` +> retains the RESERVED region and makes it a child of the new region. 
+> +> 0xffffffff811016a4 in insert_resource_conflict () +>                       insert_resource () +> 0xffffffff81a81389 in cxl_parse_cfmws () +> 0xffffffff818c4a81 in call_handler () +>                       acpi_parse_entries_array () +> +> (gdb) p/x *new +> $59 = {start = 0x290000000, end = 0x2cfffffff, name = "CXL Window 0", +>        flags = 0x200, desc = 0x0, parent = 0x0, sibling = 0x0, +>        child = 0x0} +> +> This produces the following output in /proc/iomem: +> +> 590000000-68fffffff : CXL Window 0 +>   590000000-68fffffff : Reserved +> +> This reserved area causes `get_free_mem_region()` to fail due to a check +> against `__region_intersects()`.  Due to this reserved area, the +> intersect check will only ever return REGION_INTERSECTS, which causes +> `cxl create-region` to always fail. +> +> Signed-off-by: Gregory Price < +gregory.price@memverge.com +> +> --- +>  hw/i386/pc.c | 2 -- +>  1 file changed, 2 deletions(-) +> +> diff --git a/hw/i386/pc.c b/hw/i386/pc.c +> index 566accf7e6..5bf5465a21 100644 +> --- a/hw/i386/pc.c +> +++ b/hw/i386/pc.c +> @@ -1061,7 +1061,6 @@ void pc_memory_init(PCMachineState *pcms, +>          hwaddr cxl_size = MiB; +> +>          cxl_base = pc_get_cxl_range_start(pcms); +> -        e820_add_entry(cxl_base, cxl_size, E820_RESERVED); +>          memory_region_init(mr, OBJECT(machine), "cxl_host_reg", cxl_size); +>          memory_region_add_subregion(system_memory, cxl_base, mr); +>          cxl_resv_end = cxl_base + cxl_size; +> @@ -1077,7 +1076,6 @@ void pc_memory_init(PCMachineState *pcms, +>                  memory_region_init_io(&fw->mr, OBJECT(machine), &cfmws_ops, fw, +>                                        "cxl-fixed-memory-region", fw->size); +>                  memory_region_add_subregion(system_memory, fw->base, &fw->mr); +> -                e820_add_entry(fw->base, fw->size, E820_RESERVED); +>                  cxl_fmw_base += fw->size; +>                  cxl_resv_end = cxl_fmw_base; +>              } +> -- +> 2.37.3 +> + ++Gerd Hoffmann + +On Tue, Oct 18, 2022 at 8:16 PM Gregory Price wrote: +> +> +This patch does not resolve the issue, reserved entries are still created. +> +> +[ 0.000000] BIOS-e820: [mem 0x0000000280000000-0x00000002800fffff] reserved +> +[ 0.000000] BIOS-e820: [mem 0x0000000290000000-0x000000029fffffff] reserved +> +> +# cat /proc/iomem +> +290000000-29fffffff : CXL Window 0 +> +290000000-29fffffff : Reserved +> +> +# cxl create-region -m -d decoder0.0 -w 1 -g 256 mem0 +> +cxl region: create_region: region0: set_size failed: Numerical result out of +> +range +> +cxl region: cmd_create_region: created 0 regions +> +> +On Tue, Oct 18, 2022 at 2:05 AM Ani Sinha wrote: +> +> +> +> On Tue, Oct 18, 2022 at 5:14 AM Gregory Price +> +> wrote: +> +> > +> +> > Early-boot e820 records will be inserted by the bios/efi/early boot +> +> > software and be reported to the kernel via insert_resource. Later, when +> +> > CXL drivers iterate through the regions again, they will insert another +> +> > resource and make the RESERVED memory area a child. +> +> +> +> I have already sent a patch +> +> +https://www.mail-archive.com/qemu-devel@nongnu.org/msg882012.html +. +> +> When the patch is applied, there would not be any reserved entries +> +> even with passing E820_RESERVED . +> +> So this patch needs to be evaluated in the light of the above patch I +> +> sent. Once you apply my patch, does the issue still exist? 
+> +> +> +> > +> +> > This RESERVED memory area causes the memory region to become unusable, +> +> > and as a result attempting to create memory regions with +> +> > +> +> > `cxl create-region ...` +> +> > +> +> > Will fail due to the RESERVED area intersecting with the CXL window. +> +> > +> +> > +> +> > During boot the following traceback is observed: +> +> > +> +> > 0xffffffff81101650 in insert_resource_expand_to_fit () +> +> > 0xffffffff83d964c5 in e820__reserve_resources_late () +> +> > 0xffffffff83e03210 in pcibios_resource_survey () +> +> > 0xffffffff83e04f4a in pcibios_init () +> +> > +> +> > Which produces a call to reserve the CFMWS area: +> +> > +> +> > (gdb) p *new +> +> > $54 = {start = 0x290000000, end = 0x2cfffffff, name = "Reserved", +> +> > flags = 0x200, desc = 0x7, parent = 0x0, sibling = 0x0, +> +> > child = 0x0} +> +> > +> +> > Later the Kernel parses ACPI tables and reserves the exact same area as +> +> > the CXL Fixed Memory Window. The use of `insert_resource_conflict` +> +> > retains the RESERVED region and makes it a child of the new region. +> +> > +> +> > 0xffffffff811016a4 in insert_resource_conflict () +> +> > insert_resource () +> +> > 0xffffffff81a81389 in cxl_parse_cfmws () +> +> > 0xffffffff818c4a81 in call_handler () +> +> > acpi_parse_entries_array () +> +> > +> +> > (gdb) p/x *new +> +> > $59 = {start = 0x290000000, end = 0x2cfffffff, name = "CXL Window 0", +> +> > flags = 0x200, desc = 0x0, parent = 0x0, sibling = 0x0, +> +> > child = 0x0} +> +> > +> +> > This produces the following output in /proc/iomem: +> +> > +> +> > 590000000-68fffffff : CXL Window 0 +> +> > 590000000-68fffffff : Reserved +> +> > +> +> > This reserved area causes `get_free_mem_region()` to fail due to a check +> +> > against `__region_intersects()`. Due to this reserved area, the +> +> > intersect check will only ever return REGION_INTERSECTS, which causes +> +> > `cxl create-region` to always fail. +> +> > +> +> > Signed-off-by: Gregory Price +> +> > --- +> +> > hw/i386/pc.c | 2 -- +> +> > 1 file changed, 2 deletions(-) +> +> > +> +> > diff --git a/hw/i386/pc.c b/hw/i386/pc.c +> +> > index 566accf7e6..5bf5465a21 100644 +> +> > --- a/hw/i386/pc.c +> +> > +++ b/hw/i386/pc.c +> +> > @@ -1061,7 +1061,6 @@ void pc_memory_init(PCMachineState *pcms, +> +> > hwaddr cxl_size = MiB; +> +> > +> +> > cxl_base = pc_get_cxl_range_start(pcms); +> +> > - e820_add_entry(cxl_base, cxl_size, E820_RESERVED); +> +> > memory_region_init(mr, OBJECT(machine), "cxl_host_reg", cxl_size); +> +> > memory_region_add_subregion(system_memory, cxl_base, mr); +> +> > cxl_resv_end = cxl_base + cxl_size; +> +> > @@ -1077,7 +1076,6 @@ void pc_memory_init(PCMachineState *pcms, +> +> > memory_region_init_io(&fw->mr, OBJECT(machine), +> +> > &cfmws_ops, fw, +> +> > "cxl-fixed-memory-region", +> +> > fw->size); +> +> > memory_region_add_subregion(system_memory, fw->base, +> +> > &fw->mr); +> +> > - e820_add_entry(fw->base, fw->size, E820_RESERVED); +> +> > cxl_fmw_base += fw->size; +> +> > cxl_resv_end = cxl_fmw_base; +> +> > } +> +> > -- +> +> > 2.37.3 +> +> > + +> +>> > diff --git a/hw/i386/pc.c b/hw/i386/pc.c +> +>> > index 566accf7e6..5bf5465a21 100644 +> +>> > --- a/hw/i386/pc.c +> +>> > +++ b/hw/i386/pc.c +> +>> > @@ -1061,7 +1061,6 @@ void pc_memory_init(PCMachineState *pcms, +> +>> > hwaddr cxl_size = MiB; +> +>> > +> +>> > cxl_base = pc_get_cxl_range_start(pcms); +> +>> > - e820_add_entry(cxl_base, cxl_size, E820_RESERVED); +Just dropping it doesn't look like a good plan to me. 
+ +You can try set etc/reserved-memory-end fw_cfg file instead. Firmware +(both seabios and ovmf) read it and will make sure the 64bit pci mmio +window is placed above that address, i.e. this effectively reserves +address space. Right now used by memory hotplug code, but should work +for cxl too I think (disclaimer: don't know much about cxl ...). + +take care & HTH, + Gerd + +On Tue, 8 Nov 2022 12:21:11 +0100 +Gerd Hoffmann wrote: + +> +> >> > diff --git a/hw/i386/pc.c b/hw/i386/pc.c +> +> >> > index 566accf7e6..5bf5465a21 100644 +> +> >> > --- a/hw/i386/pc.c +> +> >> > +++ b/hw/i386/pc.c +> +> >> > @@ -1061,7 +1061,6 @@ void pc_memory_init(PCMachineState *pcms, +> +> >> > hwaddr cxl_size = MiB; +> +> >> > +> +> >> > cxl_base = pc_get_cxl_range_start(pcms); +> +> >> > - e820_add_entry(cxl_base, cxl_size, E820_RESERVED); +> +> +Just dropping it doesn't look like a good plan to me. +> +> +You can try set etc/reserved-memory-end fw_cfg file instead. Firmware +> +(both seabios and ovmf) read it and will make sure the 64bit pci mmio +> +window is placed above that address, i.e. this effectively reserves +> +address space. Right now used by memory hotplug code, but should work +> +for cxl too I think (disclaimer: don't know much about cxl ...). +As far as I know CXL impl. in QEMU isn't using etc/reserved-memory-end +at all, it' has its own mapping. + +Regardless of that, reserved E820 entries look wrong, and looking at +commit message OS is right to bailout on them (expected according +to ACPI spec). +Also spec says + +" +E820 Assumptions and Limitations + [...] + The platform boot firmware does not return a range description for the memory +mapping of + PCI devices, ISA Option ROMs, and ISA Plug and Play cards because the OS has +mechanisms + available to detect them. +" + +so dropping reserved entries looks reasonable from ACPI spec point of view. +(disclaimer: don't know much about cxl ... either) +> +> +take care & HTH, +> +Gerd +> + +On Fri, Nov 11, 2022 at 11:51:23AM +0100, Igor Mammedov wrote: +> +On Tue, 8 Nov 2022 12:21:11 +0100 +> +Gerd Hoffmann wrote: +> +> +> > >> > diff --git a/hw/i386/pc.c b/hw/i386/pc.c +> +> > >> > index 566accf7e6..5bf5465a21 100644 +> +> > >> > --- a/hw/i386/pc.c +> +> > >> > +++ b/hw/i386/pc.c +> +> > >> > @@ -1061,7 +1061,6 @@ void pc_memory_init(PCMachineState *pcms, +> +> > >> > hwaddr cxl_size = MiB; +> +> > >> > +> +> > >> > cxl_base = pc_get_cxl_range_start(pcms); +> +> > >> > - e820_add_entry(cxl_base, cxl_size, E820_RESERVED); +> +> +> +> Just dropping it doesn't look like a good plan to me. +> +> +> +> You can try set etc/reserved-memory-end fw_cfg file instead. Firmware +> +> (both seabios and ovmf) read it and will make sure the 64bit pci mmio +> +> window is placed above that address, i.e. this effectively reserves +> +> address space. Right now used by memory hotplug code, but should work +> +> for cxl too I think (disclaimer: don't know much about cxl ...). +> +> +As far as I know CXL impl. in QEMU isn't using etc/reserved-memory-end +> +at all, it' has its own mapping. +This should be changed. cxl should make sure the highest address used +is stored in etc/reserved-memory-end to avoid the firmware mapping pci +resources there. + +> +so dropping reserved entries looks reasonable from ACPI spec point of view. +Yep, I don't want dispute that. 
+ +I suspect the reason for these entries to exist in the first place is to +inform the firmware that it should not place stuff there, and if we +remove that to conform with the spec we need some alternative way for +that ... + +take care, + Gerd + +On Fri, 11 Nov 2022 12:40:59 +0100 +Gerd Hoffmann wrote: + +> +On Fri, Nov 11, 2022 at 11:51:23AM +0100, Igor Mammedov wrote: +> +> On Tue, 8 Nov 2022 12:21:11 +0100 +> +> Gerd Hoffmann wrote: +> +> +> +> > > >> > diff --git a/hw/i386/pc.c b/hw/i386/pc.c +> +> > > >> > index 566accf7e6..5bf5465a21 100644 +> +> > > >> > --- a/hw/i386/pc.c +> +> > > >> > +++ b/hw/i386/pc.c +> +> > > >> > @@ -1061,7 +1061,6 @@ void pc_memory_init(PCMachineState *pcms, +> +> > > >> > hwaddr cxl_size = MiB; +> +> > > >> > +> +> > > >> > cxl_base = pc_get_cxl_range_start(pcms); +> +> > > >> > - e820_add_entry(cxl_base, cxl_size, E820_RESERVED); +> +> > +> +> > Just dropping it doesn't look like a good plan to me. +> +> > +> +> > You can try set etc/reserved-memory-end fw_cfg file instead. Firmware +> +> > (both seabios and ovmf) read it and will make sure the 64bit pci mmio +> +> > window is placed above that address, i.e. this effectively reserves +> +> > address space. Right now used by memory hotplug code, but should work +> +> > for cxl too I think (disclaimer: don't know much about cxl ...). +> +> +> +> As far as I know CXL impl. in QEMU isn't using etc/reserved-memory-end +> +> at all, it' has its own mapping. +> +> +This should be changed. cxl should make sure the highest address used +> +is stored in etc/reserved-memory-end to avoid the firmware mapping pci +> +resources there. +if (pcmc->has_reserved_memory && machine->device_memory->base) { + +[...] + + if (pcms->cxl_devices_state.is_enabled) { + + res_mem_end = cxl_resv_end; + +that should be handled by this line + + } + + *val = cpu_to_le64(ROUND_UP(res_mem_end, 1 * GiB)); + + fw_cfg_add_file(fw_cfg, "etc/reserved-memory-end", val, sizeof(*val)); + + } + +so SeaBIOS shouldn't intrude into CXL address space +(I assume EDK2 behave similarly here) + +> +> so dropping reserved entries looks reasonable from ACPI spec point of view. +> +> +> +> +Yep, I don't want dispute that. +> +> +I suspect the reason for these entries to exist in the first place is to +> +inform the firmware that it should not place stuff there, and if we +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +just to educate me, can you point out what SeaBIOS code does with reservations. + +> +remove that to conform with the spec we need some alternative way for +> +that ... +with etc/reserved-memory-end set as above, +is E820_RESERVED really needed here? + +(my understanding was that E820_RESERVED weren't accounted for when +initializing PCI devices) + +> +> +take care, +> +Gerd +> + +> +if (pcmc->has_reserved_memory && machine->device_memory->base) { +> +> +[...] +> +> +if (pcms->cxl_devices_state.is_enabled) { +> +> +res_mem_end = cxl_resv_end; +> +> +that should be handled by this line +> +> +} +> +> +*val = cpu_to_le64(ROUND_UP(res_mem_end, 1 * GiB)); +> +> +fw_cfg_add_file(fw_cfg, "etc/reserved-memory-end", val, +> +sizeof(*val)); +> +} +> +> +so SeaBIOS shouldn't intrude into CXL address space +Yes, looks good, so with this in place already everyting should be fine. + +> +(I assume EDK2 behave similarly here) +Correct, ovmf reads that fw_cfg file too. 
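Putting the snippet quoted above back into context, the code in pc_memory_init() that publishes the end of host-reserved address space has roughly the following shape. This is a reconstruction for readability based on the quoted lines, not a verbatim copy of the tree, and it omits compat handling.

    /* Sketch: advertise the end of reserved address space to firmware. */
    if (pcmc->has_reserved_memory && machine->device_memory->base) {
        uint64_t *val = g_malloc(sizeof(*val));
        uint64_t res_mem_end = machine->device_memory->base +
                               memory_region_size(&machine->device_memory->mr);

        if (pcms->cxl_devices_state.is_enabled) {
            res_mem_end = cxl_resv_end;   /* covers the CXL fixed memory windows */
        }
        *val = cpu_to_le64(ROUND_UP(res_mem_end, 1 * GiB));
        /* SeaBIOS and OVMF read this file and place the 64-bit PCI hole above it. */
        fw_cfg_add_file(fw_cfg, "etc/reserved-memory-end", val, sizeof(*val));
    }
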
+ +> +> I suspect the reason for these entries to exist in the first place is to +> +> inform the firmware that it should not place stuff there, and if we +> +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +> +just to educate me, can you point out what SeaBIOS code does with +> +reservations. +They are added to the e820 map which gets passed on to the OS. seabios +uses (and updateas) the e820 map too, when allocating memory for +example. While thinking about it I'm not fully sure it actually looks +at reservations, maybe it only uses (and updates) ram entries when +allocating memory. + +> +> remove that to conform with the spec we need some alternative way for +> +> that ... +> +> +with etc/reserved-memory-end set as above, +> +is E820_RESERVED really needed here? +No. Setting etc/reserved-memory-end is enough. + +So for the original patch: +Acked-by: Gerd Hoffmann + +take care, + Gerd + +On Fri, Nov 11, 2022 at 02:36:02PM +0100, Gerd Hoffmann wrote: +> +> if (pcmc->has_reserved_memory && machine->device_memory->base) { +> +> +> +> [...] +> +> +> +> if (pcms->cxl_devices_state.is_enabled) { +> +> +> +> res_mem_end = cxl_resv_end; +> +> +> +> that should be handled by this line +> +> +> +> } +> +> +> +> *val = cpu_to_le64(ROUND_UP(res_mem_end, 1 * GiB)); +> +> +> +> fw_cfg_add_file(fw_cfg, "etc/reserved-memory-end", val, +> +> sizeof(*val)); +> +> } +> +> +> +> so SeaBIOS shouldn't intrude into CXL address space +> +> +Yes, looks good, so with this in place already everyting should be fine. +> +> +> (I assume EDK2 behave similarly here) +> +> +Correct, ovmf reads that fw_cfg file too. +> +> +> > I suspect the reason for these entries to exist in the first place is to +> +> > inform the firmware that it should not place stuff there, and if we +> +> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +> +> just to educate me, can you point out what SeaBIOS code does with +> +> reservations. +> +> +They are added to the e820 map which gets passed on to the OS. seabios +> +uses (and updateas) the e820 map too, when allocating memory for +> +example. While thinking about it I'm not fully sure it actually looks +> +at reservations, maybe it only uses (and updates) ram entries when +> +allocating memory. +> +> +> > remove that to conform with the spec we need some alternative way for +> +> > that ... +> +> +> +> with etc/reserved-memory-end set as above, +> +> is E820_RESERVED really needed here? +> +> +No. Setting etc/reserved-memory-end is enough. +> +> +So for the original patch: +> +Acked-by: Gerd Hoffmann +> +> +take care, +> +Gerd +It's upstream already, sorry I can't add your tag. + +-- +MST + diff --git a/classification_output/01/instruction/24190340 b/classification_output/01/instruction/24190340 new file mode 100644 index 000000000..784962c9c --- /dev/null +++ b/classification_output/01/instruction/24190340 @@ -0,0 +1,2056 @@ +instruction: 0.818 +other: 0.811 +semantic: 0.793 +mistranslation: 0.758 + +[BUG, RFC] Block graph deadlock on job-dismiss + +Hi all, + +There's a bug in block layer which leads to block graph deadlock. +Notably, it takes place when blockdev IO is processed within a separate +iothread. + +This was initially caught by our tests, and I was able to reduce it to a +relatively simple reproducer. Such deadlocks are probably supposed to +be covered in iotests/graph-changes-while-io, but this deadlock isn't. 
+ +Basically what the reproducer does is launches QEMU with a drive having +'iothread' option set, creates a chain of 2 snapshots, launches +block-commit job for a snapshot and then dismisses the job, starting +from the lower snapshot. If the guest is issuing IO at the same time, +there's a race in acquiring block graph lock and a potential deadlock. + +Here's how it can be reproduced: + +1. Run QEMU: +> +SRCDIR=/path/to/srcdir +> +> +> +> +> +$SRCDIR/build/qemu-system-x86_64 -enable-kvm \ +> +> +-machine q35 -cpu Nehalem \ +> +> +-name guest=alma8-vm,debug-threads=on \ +> +> +-m 2g -smp 2 \ +> +> +-nographic -nodefaults \ +> +> +-qmp unix:/var/run/alma8-qmp.sock,server=on,wait=off \ +> +> +-serial unix:/var/run/alma8-serial.sock,server=on,wait=off \ +> +> +-object iothread,id=iothread0 \ +> +> +-blockdev +> +node-name=disk,driver=qcow2,file.driver=file,file.filename=/path/to/img/alma8.qcow2 +> +\ +> +-device virtio-blk-pci,drive=disk,iothread=iothread0 +2. Launch IO (random reads) from within the guest: +> +nc -U /var/run/alma8-serial.sock +> +... +> +[root@alma8-vm ~]# fio --name=randread --ioengine=libaio --direct=1 --bs=4k +> +--size=1G --numjobs=1 --time_based=1 --runtime=300 --group_reporting +> +--rw=randread --iodepth=1 --filename=/testfile +3. Run snapshots creation & removal of lower snapshot operation in a +loop (script attached): +> +while /bin/true ; do ./remove_lower_snap.sh ; done +And then it occasionally hangs. + +Note: I've tried bisecting this, and looks like deadlock occurs starting +from the following commit: + +(BAD) 5bdbaebcce virtio: Re-enable notifications after drain +(GOOD) c42c3833e0 virtio-scsi: Attach event vq notifier with no_poll + +On the latest v10.0.0 it does hang as well. + + +Here's backtrace of the main thread: + +> +#0 0x00007fc547d427ce in __ppoll (fds=0x557eb79657b0, nfds=1, +> +timeout=, sigmask=0x0) at ../sysdeps/unix/sysv/linux/ppoll.c:43 +> +#1 0x0000557eb47d955c in qemu_poll_ns (fds=0x557eb79657b0, nfds=1, +> +timeout=-1) at ../util/qemu-timer.c:329 +> +#2 0x0000557eb47b2204 in fdmon_poll_wait (ctx=0x557eb76c5f20, +> +ready_list=0x7ffd94b4edd8, timeout=-1) at ../util/fdmon-poll.c:79 +> +#3 0x0000557eb47b1c45 in aio_poll (ctx=0x557eb76c5f20, blocking=true) at +> +../util/aio-posix.c:730 +> +#4 0x0000557eb4621edd in bdrv_do_drained_begin (bs=0x557eb795e950, +> +parent=0x0, poll=true) at ../block/io.c:378 +> +#5 0x0000557eb4621f7b in bdrv_drained_begin (bs=0x557eb795e950) at +> +../block/io.c:391 +> +#6 0x0000557eb45ec125 in bdrv_change_aio_context (bs=0x557eb795e950, +> +ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, +> +errp=0x0) +> +at ../block.c:7682 +> +#7 0x0000557eb45ebf2b in bdrv_child_change_aio_context (c=0x557eb7964250, +> +ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, +> +errp=0x0) +> +at ../block.c:7608 +> +#8 0x0000557eb45ec0c4 in bdrv_change_aio_context (bs=0x557eb79575e0, +> +ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, +> +errp=0x0) +> +at ../block.c:7668 +> +#9 0x0000557eb45ebf2b in bdrv_child_change_aio_context (c=0x557eb7e59110, +> +ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, +> +errp=0x0) +> +at ../block.c:7608 +> +#10 0x0000557eb45ec0c4 in bdrv_change_aio_context (bs=0x557eb7e51960, +> +ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, +> +errp=0x0) +> +at ../block.c:7668 +> +#11 0x0000557eb45ebf2b in bdrv_child_change_aio_context (c=0x557eb814ed80, +> +ctx=0x557eb76c5f20, visited=0x557eb7e06b60 
= {...}, tran=0x557eb7a87160, +> +errp=0x0) +> +at ../block.c:7608 +> +#12 0x0000557eb45ee8e4 in child_job_change_aio_ctx (c=0x557eb7c9d3f0, +> +ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, +> +errp=0x0) +> +at ../blockjob.c:157 +> +#13 0x0000557eb45ebe2d in bdrv_parent_change_aio_context (c=0x557eb7c9d3f0, +> +ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, +> +errp=0x0) +> +at ../block.c:7592 +> +#14 0x0000557eb45ec06b in bdrv_change_aio_context (bs=0x557eb7d74310, +> +ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, +> +errp=0x0) +> +at ../block.c:7661 +> +#15 0x0000557eb45dcd7e in bdrv_child_cb_change_aio_ctx +> +(child=0x557eb8565af0, ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = +> +{...}, tran=0x557eb7a87160, errp=0x0) at ../block.c:1234 +> +#16 0x0000557eb45ebe2d in bdrv_parent_change_aio_context (c=0x557eb8565af0, +> +ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, +> +errp=0x0) +> +at ../block.c:7592 +> +#17 0x0000557eb45ec06b in bdrv_change_aio_context (bs=0x557eb79575e0, +> +ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, +> +errp=0x0) +> +at ../block.c:7661 +> +#18 0x0000557eb45ec1f3 in bdrv_try_change_aio_context (bs=0x557eb79575e0, +> +ctx=0x557eb76c5f20, ignore_child=0x0, errp=0x0) at ../block.c:7715 +> +#19 0x0000557eb45e1b15 in bdrv_root_unref_child (child=0x557eb7966f30) at +> +../block.c:3317 +> +#20 0x0000557eb45eeaa8 in block_job_remove_all_bdrv (job=0x557eb7952800) at +> +../blockjob.c:209 +> +#21 0x0000557eb45ee641 in block_job_free (job=0x557eb7952800) at +> +../blockjob.c:82 +> +#22 0x0000557eb45f17af in job_unref_locked (job=0x557eb7952800) at +> +../job.c:474 +> +#23 0x0000557eb45f257d in job_do_dismiss_locked (job=0x557eb7952800) at +> +../job.c:771 +> +#24 0x0000557eb45f25fe in job_dismiss_locked (jobptr=0x7ffd94b4f400, +> +errp=0x7ffd94b4f488) at ../job.c:783 +> +--Type for more, q to quit, c to continue without paging-- +> +#25 0x0000557eb45d8e84 in qmp_job_dismiss (id=0x557eb7aa42b0 "commit-snap1", +> +errp=0x7ffd94b4f488) at ../job-qmp.c:138 +> +#26 0x0000557eb472f6a3 in qmp_marshal_job_dismiss (args=0x7fc52c00a3b0, +> +ret=0x7fc53c880da8, errp=0x7fc53c880da0) at qapi/qapi-commands-job.c:221 +> +#27 0x0000557eb47a35f3 in do_qmp_dispatch_bh (opaque=0x7fc53c880e40) at +> +../qapi/qmp-dispatch.c:128 +> +#28 0x0000557eb47d1cd2 in aio_bh_call (bh=0x557eb79568f0) at +> +../util/async.c:172 +> +#29 0x0000557eb47d1df5 in aio_bh_poll (ctx=0x557eb76c0200) at +> +../util/async.c:219 +> +#30 0x0000557eb47b12f3 in aio_dispatch (ctx=0x557eb76c0200) at +> +../util/aio-posix.c:436 +> +#31 0x0000557eb47d2266 in aio_ctx_dispatch (source=0x557eb76c0200, +> +callback=0x0, user_data=0x0) at ../util/async.c:361 +> +#32 0x00007fc549232f4f in g_main_dispatch (context=0x557eb76c6430) at +> +../glib/gmain.c:3364 +> +#33 g_main_context_dispatch (context=0x557eb76c6430) at ../glib/gmain.c:4079 +> +#34 0x0000557eb47d3ab1 in glib_pollfds_poll () at ../util/main-loop.c:287 +> +#35 0x0000557eb47d3b38 in os_host_main_loop_wait (timeout=0) at +> +../util/main-loop.c:310 +> +#36 0x0000557eb47d3c58 in main_loop_wait (nonblocking=0) at +> +../util/main-loop.c:589 +> +#37 0x0000557eb4218b01 in qemu_main_loop () at ../system/runstate.c:835 +> +#38 0x0000557eb46df166 in qemu_default_main (opaque=0x0) at +> +../system/main.c:50 +> +#39 0x0000557eb46df215 in main (argc=24, argv=0x7ffd94b4f8d8) at +> +../system/main.c:80 +And here's coroutine trying to acquire read lock: + 
+> +(gdb) qemu coroutine reader_queue->entries.sqh_first +> +#0 0x0000557eb47d7068 in qemu_coroutine_switch (from_=0x557eb7aa48b0, +> +to_=0x7fc537fff508, action=COROUTINE_YIELD) at +> +../util/coroutine-ucontext.c:321 +> +#1 0x0000557eb47d4d4a in qemu_coroutine_yield () at +> +../util/qemu-coroutine.c:339 +> +#2 0x0000557eb47d56c8 in qemu_co_queue_wait_impl (queue=0x557eb59954c0 +> +, lock=0x7fc53c57de50, flags=0) at +> +../util/qemu-coroutine-lock.c:60 +> +#3 0x0000557eb461fea7 in bdrv_graph_co_rdlock () at ../block/graph-lock.c:231 +> +#4 0x0000557eb460c81a in graph_lockable_auto_lock (x=0x7fc53c57dee3) at +> +/home/root/src/qemu/master/include/block/graph-lock.h:213 +> +#5 0x0000557eb460fa41 in blk_co_do_preadv_part +> +(blk=0x557eb84c0810, offset=6890553344, bytes=4096, qiov=0x7fc530006988, +> +qiov_offset=0, flags=BDRV_REQ_REGISTERED_BUF) at ../block/block-backend.c:1339 +> +#6 0x0000557eb46104d7 in blk_aio_read_entry (opaque=0x7fc530003240) at +> +../block/block-backend.c:1619 +> +#7 0x0000557eb47d6c40 in coroutine_trampoline (i0=-1213577040, i1=21886) at +> +../util/coroutine-ucontext.c:175 +> +#8 0x00007fc547c2a360 in __start_context () at +> +../sysdeps/unix/sysv/linux/x86_64/__start_context.S:91 +> +#9 0x00007ffd94b4ea40 in () +> +#10 0x0000000000000000 in () +So it looks like main thread is processing job-dismiss request and is +holding write lock taken in block_job_remove_all_bdrv() (frame #20 +above). At the same time iothread spawns a coroutine which performs IO +request. Before the coroutine is spawned, blk_aio_prwv() increases +'in_flight' counter for Blk. Then blk_co_do_preadv_part() (frame #5) is +trying to acquire the read lock. But main thread isn't releasing the +lock as blk_root_drained_poll() returns true since blk->in_flight > 0. +Here's the deadlock. + +Any comments and suggestions on the subject are welcomed. Thanks! + +Andrey +remove_lower_snap.sh +Description: +application/shellscript + +On 4/24/25 8:32 PM, Andrey Drobyshev wrote: +> +Hi all, +> +> +There's a bug in block layer which leads to block graph deadlock. +> +Notably, it takes place when blockdev IO is processed within a separate +> +iothread. +> +> +This was initially caught by our tests, and I was able to reduce it to a +> +relatively simple reproducer. Such deadlocks are probably supposed to +> +be covered in iotests/graph-changes-while-io, but this deadlock isn't. +> +> +Basically what the reproducer does is launches QEMU with a drive having +> +'iothread' option set, creates a chain of 2 snapshots, launches +> +block-commit job for a snapshot and then dismisses the job, starting +> +from the lower snapshot. If the guest is issuing IO at the same time, +> +there's a race in acquiring block graph lock and a potential deadlock. +> +> +Here's how it can be reproduced: +> +> +[...] +> +I took a closer look at iotests/graph-changes-while-io, and have managed +to reproduce the same deadlock in a much simpler setup, without a guest. + +1. Run QSD:> ./build/storage-daemon/qemu-storage-daemon --object +iothread,id=iothread0 \ +> +--blockdev null-co,node-name=node0,read-zeroes=true \ +> +> +--nbd-server addr.type=unix,addr.path=/var/run/qsd_nbd.sock \ +> +> +--export +> +nbd,id=exp0,node-name=node0,iothread=iothread0,fixed-iothread=true,writable=true +> +\ +> +--chardev +> +socket,id=qmp-sock,path=/var/run/qsd_qmp.sock,server=on,wait=off \ +> +--monitor chardev=qmp-sock +2. Launch IO: +> +qemu-img bench -f raw -c 2000000 +> +'nbd+unix:///node0?socket=/var/run/qsd_nbd.sock' +3. 
Add 2 snapshots and remove lower one (script attached):> while +/bin/true ; do ./rls_qsd.sh ; done + +And then it hangs. + +I'll also send a patch with corresponding test case added directly to +iotests. + +This reproduce seems to be hanging starting from Fiona's commit +67446e605dc ("blockjob: drop AioContext lock before calling +bdrv_graph_wrlock()"). AioContext locks were dropped entirely later on +in Stefan's commit b49f4755c7 ("block: remove AioContext locking"), but +the problem remains. + +Andrey +rls_qsd.sh +Description: +application/shellscript + +From: Andrey Drobyshev + +This case is catching potential deadlock which takes place when job-dismiss +is issued when I/O requests are processed in a separate iothread. + +See +https://mail.gnu.org/archive/html/qemu-devel/2025-04/msg04421.html +Signed-off-by: Andrey Drobyshev +--- + .../qemu-iotests/tests/graph-changes-while-io | 101 ++++++++++++++++-- + .../tests/graph-changes-while-io.out | 4 +- + 2 files changed, 96 insertions(+), 9 deletions(-) + +diff --git a/tests/qemu-iotests/tests/graph-changes-while-io +b/tests/qemu-iotests/tests/graph-changes-while-io +index 194fda500e..e30f823da4 100755 +--- a/tests/qemu-iotests/tests/graph-changes-while-io ++++ b/tests/qemu-iotests/tests/graph-changes-while-io +@@ -27,6 +27,8 @@ from iotests import imgfmt, qemu_img, qemu_img_create, +qemu_io, \ + + + top = os.path.join(iotests.test_dir, 'top.img') ++snap1 = os.path.join(iotests.test_dir, 'snap1.img') ++snap2 = os.path.join(iotests.test_dir, 'snap2.img') + nbd_sock = os.path.join(iotests.sock_dir, 'nbd.sock') + + +@@ -58,6 +60,15 @@ class TestGraphChangesWhileIO(QMPTestCase): + def tearDown(self) -> None: + self.qsd.stop() + ++ def _wait_for_blockjob(self, status) -> None: ++ done = False ++ while not done: ++ for event in self.qsd.get_qmp().get_events(wait=10.0): ++ if event['event'] != 'JOB_STATUS_CHANGE': ++ continue ++ if event['data']['status'] == status: ++ done = True ++ + def test_blockdev_add_while_io(self) -> None: + # Run qemu-img bench in the background + bench_thr = Thread(target=do_qemu_img_bench) +@@ -116,13 +127,89 @@ class TestGraphChangesWhileIO(QMPTestCase): + 'device': 'job0', + }) + +- cancelled = False +- while not cancelled: +- for event in self.qsd.get_qmp().get_events(wait=10.0): +- if event['event'] != 'JOB_STATUS_CHANGE': +- continue +- if event['data']['status'] == 'null': +- cancelled = True ++ self._wait_for_blockjob('null') ++ ++ bench_thr.join() ++ ++ def test_remove_lower_snapshot_while_io(self) -> None: ++ # Run qemu-img bench in the background ++ bench_thr = Thread(target=do_qemu_img_bench, args=(100000, )) ++ bench_thr.start() ++ ++ # While I/O is performed on 'node0' node, consequently add 2 snapshots ++ # on top of it, then remove (commit) them starting from lower one. 
++ while bench_thr.is_alive(): ++ # Recreate snapshot images on every iteration ++ qemu_img_create('-f', imgfmt, snap1, '1G') ++ qemu_img_create('-f', imgfmt, snap2, '1G') ++ ++ self.qsd.cmd('blockdev-add', { ++ 'driver': imgfmt, ++ 'node-name': 'snap1', ++ 'file': { ++ 'driver': 'file', ++ 'filename': snap1 ++ } ++ }) ++ ++ self.qsd.cmd('blockdev-snapshot', { ++ 'node': 'node0', ++ 'overlay': 'snap1', ++ }) ++ ++ self.qsd.cmd('blockdev-add', { ++ 'driver': imgfmt, ++ 'node-name': 'snap2', ++ 'file': { ++ 'driver': 'file', ++ 'filename': snap2 ++ } ++ }) ++ ++ self.qsd.cmd('blockdev-snapshot', { ++ 'node': 'snap1', ++ 'overlay': 'snap2', ++ }) ++ ++ self.qsd.cmd('block-commit', { ++ 'job-id': 'commit-snap1', ++ 'device': 'snap2', ++ 'top-node': 'snap1', ++ 'base-node': 'node0', ++ 'auto-finalize': True, ++ 'auto-dismiss': False, ++ }) ++ ++ self._wait_for_blockjob('concluded') ++ self.qsd.cmd('job-dismiss', { ++ 'id': 'commit-snap1', ++ }) ++ ++ self.qsd.cmd('block-commit', { ++ 'job-id': 'commit-snap2', ++ 'device': 'snap2', ++ 'top-node': 'snap2', ++ 'base-node': 'node0', ++ 'auto-finalize': True, ++ 'auto-dismiss': False, ++ }) ++ ++ self._wait_for_blockjob('ready') ++ self.qsd.cmd('job-complete', { ++ 'id': 'commit-snap2', ++ }) ++ ++ self._wait_for_blockjob('concluded') ++ self.qsd.cmd('job-dismiss', { ++ 'id': 'commit-snap2', ++ }) ++ ++ self.qsd.cmd('blockdev-del', { ++ 'node-name': 'snap1' ++ }) ++ self.qsd.cmd('blockdev-del', { ++ 'node-name': 'snap2' ++ }) + + bench_thr.join() + +diff --git a/tests/qemu-iotests/tests/graph-changes-while-io.out +b/tests/qemu-iotests/tests/graph-changes-while-io.out +index fbc63e62f8..8d7e996700 100644 +--- a/tests/qemu-iotests/tests/graph-changes-while-io.out ++++ b/tests/qemu-iotests/tests/graph-changes-while-io.out +@@ -1,5 +1,5 @@ +-.. ++... + ---------------------------------------------------------------------- +-Ran 2 tests ++Ran 3 tests + + OK +-- +2.43.5 + +Am 24.04.25 um 19:32 schrieb Andrey Drobyshev: +> +So it looks like main thread is processing job-dismiss request and is +> +holding write lock taken in block_job_remove_all_bdrv() (frame #20 +> +above). At the same time iothread spawns a coroutine which performs IO +> +request. Before the coroutine is spawned, blk_aio_prwv() increases +> +'in_flight' counter for Blk. Then blk_co_do_preadv_part() (frame #5) is +> +trying to acquire the read lock. But main thread isn't releasing the +> +lock as blk_root_drained_poll() returns true since blk->in_flight > 0. +> +Here's the deadlock. +And for the IO test you provided, it's client->nb_requests that behaves +similarly to blk->in_flight here. + +The issue also reproduces easily when issuing the following QMP command +in a loop while doing IO on a device: + +> +void qmp_block_locked_drain(const char *node_name, Error **errp) +> +{ +> +BlockDriverState *bs; +> +> +bs = bdrv_find_node(node_name); +> +if (!bs) { +> +error_setg(errp, "node not found"); +> +return; +> +} +> +> +bdrv_graph_wrlock(); +> +bdrv_drained_begin(bs); +> +bdrv_drained_end(bs); +> +bdrv_graph_wrunlock(); +> +} +It seems like either it would be necessary to require: +1. not draining inside an exclusively locked section +or +2. making sure that variables used by drained_poll routines are only set +while holding the reader lock +? 
+ +Those seem to require rather involved changes, so a third option might +be to make draining inside an exclusively locked section possible, by +embedding such locked sections in a drained section: + +> +diff --git a/blockjob.c b/blockjob.c +> +index 32007f31a9..9b2f3b3ea9 100644 +> +--- a/blockjob.c +> ++++ b/blockjob.c +> +@@ -198,6 +198,7 @@ void block_job_remove_all_bdrv(BlockJob *job) +> +* one to make sure that such a concurrent access does not attempt +> +* to process an already freed BdrvChild. +> +*/ +> ++ bdrv_drain_all_begin(); +> +bdrv_graph_wrlock(); +> +while (job->nodes) { +> +GSList *l = job->nodes; +> +@@ -211,6 +212,7 @@ void block_job_remove_all_bdrv(BlockJob *job) +> +g_slist_free_1(l); +> +} +> +bdrv_graph_wrunlock(); +> ++ bdrv_drain_all_end(); +> +} +> +> +bool block_job_has_bdrv(BlockJob *job, BlockDriverState *bs) +This seems to fix the issue at hand. I can send a patch if this is +considered an acceptable approach. + +Best Regards, +Fiona + +On 4/30/25 11:47 AM, Fiona Ebner wrote: +> +Am 24.04.25 um 19:32 schrieb Andrey Drobyshev: +> +> So it looks like main thread is processing job-dismiss request and is +> +> holding write lock taken in block_job_remove_all_bdrv() (frame #20 +> +> above). At the same time iothread spawns a coroutine which performs IO +> +> request. Before the coroutine is spawned, blk_aio_prwv() increases +> +> 'in_flight' counter for Blk. Then blk_co_do_preadv_part() (frame #5) is +> +> trying to acquire the read lock. But main thread isn't releasing the +> +> lock as blk_root_drained_poll() returns true since blk->in_flight > 0. +> +> Here's the deadlock. +> +> +And for the IO test you provided, it's client->nb_requests that behaves +> +similarly to blk->in_flight here. +> +> +The issue also reproduces easily when issuing the following QMP command +> +in a loop while doing IO on a device: +> +> +> void qmp_block_locked_drain(const char *node_name, Error **errp) +> +> { +> +> BlockDriverState *bs; +> +> +> +> bs = bdrv_find_node(node_name); +> +> if (!bs) { +> +> error_setg(errp, "node not found"); +> +> return; +> +> } +> +> +> +> bdrv_graph_wrlock(); +> +> bdrv_drained_begin(bs); +> +> bdrv_drained_end(bs); +> +> bdrv_graph_wrunlock(); +> +> } +> +> +It seems like either it would be necessary to require: +> +1. not draining inside an exclusively locked section +> +or +> +2. making sure that variables used by drained_poll routines are only set +> +while holding the reader lock +> +? +> +> +Those seem to require rather involved changes, so a third option might +> +be to make draining inside an exclusively locked section possible, by +> +embedding such locked sections in a drained section: +> +> +> diff --git a/blockjob.c b/blockjob.c +> +> index 32007f31a9..9b2f3b3ea9 100644 +> +> --- a/blockjob.c +> +> +++ b/blockjob.c +> +> @@ -198,6 +198,7 @@ void block_job_remove_all_bdrv(BlockJob *job) +> +> * one to make sure that such a concurrent access does not attempt +> +> * to process an already freed BdrvChild. +> +> */ +> +> + bdrv_drain_all_begin(); +> +> bdrv_graph_wrlock(); +> +> while (job->nodes) { +> +> GSList *l = job->nodes; +> +> @@ -211,6 +212,7 @@ void block_job_remove_all_bdrv(BlockJob *job) +> +> g_slist_free_1(l); +> +> } +> +> bdrv_graph_wrunlock(); +> +> + bdrv_drain_all_end(); +> +> } +> +> +> +> bool block_job_has_bdrv(BlockJob *job, BlockDriverState *bs) +> +> +This seems to fix the issue at hand. I can send a patch if this is +> +considered an acceptable approach. 
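For readers who want the cycle in isolation, the following standalone C program (plain pthreads and an atomic counter, an assumption of this sketch rather than QEMU code) hangs by design in the same way the report describes: the "main thread" takes the graph lock exclusively and then drains, i.e. polls an in-flight counter, while the "iothread" request has already bumped the counter and now waits for the shared lock. The drain-before-wrlock ordering above avoids exactly this cycle.

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>
    #include <unistd.h>

    static pthread_rwlock_t graph_lock = PTHREAD_RWLOCK_INITIALIZER;
    static atomic_int in_flight;

    static void *iothread_request(void *arg)
    {
        (void)arg;
        atomic_fetch_add(&in_flight, 1);      /* like blk_aio_prwv(): in_flight++ */
        pthread_rwlock_rdlock(&graph_lock);   /* like the GRAPH_RDLOCK in the read path */
        /* ... would perform the I/O here ... */
        pthread_rwlock_unlock(&graph_lock);
        atomic_fetch_sub(&in_flight, 1);
        return NULL;
    }

    int main(void)
    {
        pthread_t io;

        pthread_rwlock_wrlock(&graph_lock);   /* like job-dismiss taking the wrlock */
        pthread_create(&io, NULL, iothread_request, NULL);
        sleep(1);                             /* let the request bump the counter */

        /* Like drained_begin(): poll until nothing is in flight -- never happens,
         * because the request holds in_flight and waits for the lock we hold. */
        while (atomic_load(&in_flight) > 0) {
            /* spins forever */
        }
        pthread_rwlock_unlock(&graph_lock);
        pthread_join(io, NULL);
        printf("not reached\n");
        return 0;
    }
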
+> +> +Best Regards, +> +Fiona +> +Hello Fiona, + +Thanks for looking into it. I've tried your 3rd option above and can +confirm it does fix the deadlock, at least I can't reproduce it. Other +iotests also don't seem to be breaking. So I personally am fine with +that patch. Would be nice to hear a word from the maintainers though on +whether there're any caveats with such approach. + +Andrey + +On Wed, Apr 30, 2025 at 10:11 AM Andrey Drobyshev + wrote: +> +> +On 4/30/25 11:47 AM, Fiona Ebner wrote: +> +> Am 24.04.25 um 19:32 schrieb Andrey Drobyshev: +> +>> So it looks like main thread is processing job-dismiss request and is +> +>> holding write lock taken in block_job_remove_all_bdrv() (frame #20 +> +>> above). At the same time iothread spawns a coroutine which performs IO +> +>> request. Before the coroutine is spawned, blk_aio_prwv() increases +> +>> 'in_flight' counter for Blk. Then blk_co_do_preadv_part() (frame #5) is +> +>> trying to acquire the read lock. But main thread isn't releasing the +> +>> lock as blk_root_drained_poll() returns true since blk->in_flight > 0. +> +>> Here's the deadlock. +> +> +> +> And for the IO test you provided, it's client->nb_requests that behaves +> +> similarly to blk->in_flight here. +> +> +> +> The issue also reproduces easily when issuing the following QMP command +> +> in a loop while doing IO on a device: +> +> +> +>> void qmp_block_locked_drain(const char *node_name, Error **errp) +> +>> { +> +>> BlockDriverState *bs; +> +>> +> +>> bs = bdrv_find_node(node_name); +> +>> if (!bs) { +> +>> error_setg(errp, "node not found"); +> +>> return; +> +>> } +> +>> +> +>> bdrv_graph_wrlock(); +> +>> bdrv_drained_begin(bs); +> +>> bdrv_drained_end(bs); +> +>> bdrv_graph_wrunlock(); +> +>> } +> +> +> +> It seems like either it would be necessary to require: +> +> 1. not draining inside an exclusively locked section +> +> or +> +> 2. making sure that variables used by drained_poll routines are only set +> +> while holding the reader lock +> +> ? +> +> +> +> Those seem to require rather involved changes, so a third option might +> +> be to make draining inside an exclusively locked section possible, by +> +> embedding such locked sections in a drained section: +> +> +> +>> diff --git a/blockjob.c b/blockjob.c +> +>> index 32007f31a9..9b2f3b3ea9 100644 +> +>> --- a/blockjob.c +> +>> +++ b/blockjob.c +> +>> @@ -198,6 +198,7 @@ void block_job_remove_all_bdrv(BlockJob *job) +> +>> * one to make sure that such a concurrent access does not attempt +> +>> * to process an already freed BdrvChild. +> +>> */ +> +>> + bdrv_drain_all_begin(); +> +>> bdrv_graph_wrlock(); +> +>> while (job->nodes) { +> +>> GSList *l = job->nodes; +> +>> @@ -211,6 +212,7 @@ void block_job_remove_all_bdrv(BlockJob *job) +> +>> g_slist_free_1(l); +> +>> } +> +>> bdrv_graph_wrunlock(); +> +>> + bdrv_drain_all_end(); +> +>> } +> +>> +> +>> bool block_job_has_bdrv(BlockJob *job, BlockDriverState *bs) +> +> +> +> This seems to fix the issue at hand. I can send a patch if this is +> +> considered an acceptable approach. +Kevin is aware of this thread but it's a public holiday tomorrow so it +may be a little longer. + +Stefan + +Am 24.04.2025 um 19:32 hat Andrey Drobyshev geschrieben: +> +Hi all, +> +> +There's a bug in block layer which leads to block graph deadlock. +> +Notably, it takes place when blockdev IO is processed within a separate +> +iothread. +> +> +This was initially caught by our tests, and I was able to reduce it to a +> +relatively simple reproducer. 
Such deadlocks are probably supposed to +> +be covered in iotests/graph-changes-while-io, but this deadlock isn't. +> +> +Basically what the reproducer does is launches QEMU with a drive having +> +'iothread' option set, creates a chain of 2 snapshots, launches +> +block-commit job for a snapshot and then dismisses the job, starting +> +from the lower snapshot. If the guest is issuing IO at the same time, +> +there's a race in acquiring block graph lock and a potential deadlock. +> +> +Here's how it can be reproduced: +> +> +1. Run QEMU: +> +> SRCDIR=/path/to/srcdir +> +> +> +> +> +> +> +> +> +> $SRCDIR/build/qemu-system-x86_64 -enable-kvm \ +> +> +> +> -machine q35 -cpu Nehalem \ +> +> +> +> -name guest=alma8-vm,debug-threads=on \ +> +> +> +> -m 2g -smp 2 \ +> +> +> +> -nographic -nodefaults \ +> +> +> +> -qmp unix:/var/run/alma8-qmp.sock,server=on,wait=off \ +> +> +> +> -serial unix:/var/run/alma8-serial.sock,server=on,wait=off \ +> +> +> +> -object iothread,id=iothread0 \ +> +> +> +> -blockdev +> +> node-name=disk,driver=qcow2,file.driver=file,file.filename=/path/to/img/alma8.qcow2 +> +> \ +> +> -device virtio-blk-pci,drive=disk,iothread=iothread0 +> +> +2. Launch IO (random reads) from within the guest: +> +> nc -U /var/run/alma8-serial.sock +> +> ... +> +> [root@alma8-vm ~]# fio --name=randread --ioengine=libaio --direct=1 --bs=4k +> +> --size=1G --numjobs=1 --time_based=1 --runtime=300 --group_reporting +> +> --rw=randread --iodepth=1 --filename=/testfile +> +> +3. Run snapshots creation & removal of lower snapshot operation in a +> +loop (script attached): +> +> while /bin/true ; do ./remove_lower_snap.sh ; done +> +> +And then it occasionally hangs. +> +> +Note: I've tried bisecting this, and looks like deadlock occurs starting +> +from the following commit: +> +> +(BAD) 5bdbaebcce virtio: Re-enable notifications after drain +> +(GOOD) c42c3833e0 virtio-scsi: Attach event vq notifier with no_poll +> +> +On the latest v10.0.0 it does hang as well. 
+> +> +> +Here's backtrace of the main thread: +> +> +> #0 0x00007fc547d427ce in __ppoll (fds=0x557eb79657b0, nfds=1, +> +> timeout=, sigmask=0x0) at +> +> ../sysdeps/unix/sysv/linux/ppoll.c:43 +> +> #1 0x0000557eb47d955c in qemu_poll_ns (fds=0x557eb79657b0, nfds=1, +> +> timeout=-1) at ../util/qemu-timer.c:329 +> +> #2 0x0000557eb47b2204 in fdmon_poll_wait (ctx=0x557eb76c5f20, +> +> ready_list=0x7ffd94b4edd8, timeout=-1) at ../util/fdmon-poll.c:79 +> +> #3 0x0000557eb47b1c45 in aio_poll (ctx=0x557eb76c5f20, blocking=true) at +> +> ../util/aio-posix.c:730 +> +> #4 0x0000557eb4621edd in bdrv_do_drained_begin (bs=0x557eb795e950, +> +> parent=0x0, poll=true) at ../block/io.c:378 +> +> #5 0x0000557eb4621f7b in bdrv_drained_begin (bs=0x557eb795e950) at +> +> ../block/io.c:391 +> +> #6 0x0000557eb45ec125 in bdrv_change_aio_context (bs=0x557eb795e950, +> +> ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, +> +> errp=0x0) +> +> at ../block.c:7682 +> +> #7 0x0000557eb45ebf2b in bdrv_child_change_aio_context (c=0x557eb7964250, +> +> ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, +> +> errp=0x0) +> +> at ../block.c:7608 +> +> #8 0x0000557eb45ec0c4 in bdrv_change_aio_context (bs=0x557eb79575e0, +> +> ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, +> +> errp=0x0) +> +> at ../block.c:7668 +> +> #9 0x0000557eb45ebf2b in bdrv_child_change_aio_context (c=0x557eb7e59110, +> +> ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, +> +> errp=0x0) +> +> at ../block.c:7608 +> +> #10 0x0000557eb45ec0c4 in bdrv_change_aio_context (bs=0x557eb7e51960, +> +> ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, +> +> errp=0x0) +> +> at ../block.c:7668 +> +> #11 0x0000557eb45ebf2b in bdrv_child_change_aio_context (c=0x557eb814ed80, +> +> ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, +> +> errp=0x0) +> +> at ../block.c:7608 +> +> #12 0x0000557eb45ee8e4 in child_job_change_aio_ctx (c=0x557eb7c9d3f0, +> +> ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, +> +> errp=0x0) +> +> at ../blockjob.c:157 +> +> #13 0x0000557eb45ebe2d in bdrv_parent_change_aio_context (c=0x557eb7c9d3f0, +> +> ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, +> +> errp=0x0) +> +> at ../block.c:7592 +> +> #14 0x0000557eb45ec06b in bdrv_change_aio_context (bs=0x557eb7d74310, +> +> ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, +> +> errp=0x0) +> +> at ../block.c:7661 +> +> #15 0x0000557eb45dcd7e in bdrv_child_cb_change_aio_ctx +> +> (child=0x557eb8565af0, ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = +> +> {...}, tran=0x557eb7a87160, errp=0x0) at ../block.c:1234 +> +> #16 0x0000557eb45ebe2d in bdrv_parent_change_aio_context (c=0x557eb8565af0, +> +> ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, +> +> errp=0x0) +> +> at ../block.c:7592 +> +> #17 0x0000557eb45ec06b in bdrv_change_aio_context (bs=0x557eb79575e0, +> +> ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, +> +> errp=0x0) +> +> at ../block.c:7661 +> +> #18 0x0000557eb45ec1f3 in bdrv_try_change_aio_context (bs=0x557eb79575e0, +> +> ctx=0x557eb76c5f20, ignore_child=0x0, errp=0x0) at ../block.c:7715 +> +> #19 0x0000557eb45e1b15 in bdrv_root_unref_child (child=0x557eb7966f30) at +> +> ../block.c:3317 +> +> #20 0x0000557eb45eeaa8 in block_job_remove_all_bdrv (job=0x557eb7952800) at +> +> ../blockjob.c:209 +> +> #21 0x0000557eb45ee641 in 
block_job_free (job=0x557eb7952800) at +> +> ../blockjob.c:82 +> +> #22 0x0000557eb45f17af in job_unref_locked (job=0x557eb7952800) at +> +> ../job.c:474 +> +> #23 0x0000557eb45f257d in job_do_dismiss_locked (job=0x557eb7952800) at +> +> ../job.c:771 +> +> #24 0x0000557eb45f25fe in job_dismiss_locked (jobptr=0x7ffd94b4f400, +> +> errp=0x7ffd94b4f488) at ../job.c:783 +> +> --Type for more, q to quit, c to continue without paging-- +> +> #25 0x0000557eb45d8e84 in qmp_job_dismiss (id=0x557eb7aa42b0 +> +> "commit-snap1", errp=0x7ffd94b4f488) at ../job-qmp.c:138 +> +> #26 0x0000557eb472f6a3 in qmp_marshal_job_dismiss (args=0x7fc52c00a3b0, +> +> ret=0x7fc53c880da8, errp=0x7fc53c880da0) at qapi/qapi-commands-job.c:221 +> +> #27 0x0000557eb47a35f3 in do_qmp_dispatch_bh (opaque=0x7fc53c880e40) at +> +> ../qapi/qmp-dispatch.c:128 +> +> #28 0x0000557eb47d1cd2 in aio_bh_call (bh=0x557eb79568f0) at +> +> ../util/async.c:172 +> +> #29 0x0000557eb47d1df5 in aio_bh_poll (ctx=0x557eb76c0200) at +> +> ../util/async.c:219 +> +> #30 0x0000557eb47b12f3 in aio_dispatch (ctx=0x557eb76c0200) at +> +> ../util/aio-posix.c:436 +> +> #31 0x0000557eb47d2266 in aio_ctx_dispatch (source=0x557eb76c0200, +> +> callback=0x0, user_data=0x0) at ../util/async.c:361 +> +> #32 0x00007fc549232f4f in g_main_dispatch (context=0x557eb76c6430) at +> +> ../glib/gmain.c:3364 +> +> #33 g_main_context_dispatch (context=0x557eb76c6430) at ../glib/gmain.c:4079 +> +> #34 0x0000557eb47d3ab1 in glib_pollfds_poll () at ../util/main-loop.c:287 +> +> #35 0x0000557eb47d3b38 in os_host_main_loop_wait (timeout=0) at +> +> ../util/main-loop.c:310 +> +> #36 0x0000557eb47d3c58 in main_loop_wait (nonblocking=0) at +> +> ../util/main-loop.c:589 +> +> #37 0x0000557eb4218b01 in qemu_main_loop () at ../system/runstate.c:835 +> +> #38 0x0000557eb46df166 in qemu_default_main (opaque=0x0) at +> +> ../system/main.c:50 +> +> #39 0x0000557eb46df215 in main (argc=24, argv=0x7ffd94b4f8d8) at +> +> ../system/main.c:80 +> +> +> +And here's coroutine trying to acquire read lock: +> +> +> (gdb) qemu coroutine reader_queue->entries.sqh_first +> +> #0 0x0000557eb47d7068 in qemu_coroutine_switch (from_=0x557eb7aa48b0, +> +> to_=0x7fc537fff508, action=COROUTINE_YIELD) at +> +> ../util/coroutine-ucontext.c:321 +> +> #1 0x0000557eb47d4d4a in qemu_coroutine_yield () at +> +> ../util/qemu-coroutine.c:339 +> +> #2 0x0000557eb47d56c8 in qemu_co_queue_wait_impl (queue=0x557eb59954c0 +> +> , lock=0x7fc53c57de50, flags=0) at +> +> ../util/qemu-coroutine-lock.c:60 +> +> #3 0x0000557eb461fea7 in bdrv_graph_co_rdlock () at +> +> ../block/graph-lock.c:231 +> +> #4 0x0000557eb460c81a in graph_lockable_auto_lock (x=0x7fc53c57dee3) at +> +> /home/root/src/qemu/master/include/block/graph-lock.h:213 +> +> #5 0x0000557eb460fa41 in blk_co_do_preadv_part +> +> (blk=0x557eb84c0810, offset=6890553344, bytes=4096, +> +> qiov=0x7fc530006988, qiov_offset=0, flags=BDRV_REQ_REGISTERED_BUF) at +> +> ../block/block-backend.c:1339 +> +> #6 0x0000557eb46104d7 in blk_aio_read_entry (opaque=0x7fc530003240) at +> +> ../block/block-backend.c:1619 +> +> #7 0x0000557eb47d6c40 in coroutine_trampoline (i0=-1213577040, i1=21886) +> +> at ../util/coroutine-ucontext.c:175 +> +> #8 0x00007fc547c2a360 in __start_context () at +> +> ../sysdeps/unix/sysv/linux/x86_64/__start_context.S:91 +> +> #9 0x00007ffd94b4ea40 in () +> +> #10 0x0000000000000000 in () +> +> +> +So it looks like main thread is processing job-dismiss request and is +> +holding write lock taken in block_job_remove_all_bdrv() (frame #20 +> +above). 
At the same time iothread spawns a coroutine which performs IO +> +request. Before the coroutine is spawned, blk_aio_prwv() increases +> +'in_flight' counter for Blk. Then blk_co_do_preadv_part() (frame #5) is +> +trying to acquire the read lock. But main thread isn't releasing the +> +lock as blk_root_drained_poll() returns true since blk->in_flight > 0. +> +Here's the deadlock. +> +> +Any comments and suggestions on the subject are welcomed. Thanks! +I think this is what the blk_wait_while_drained() call was supposed to +address in blk_co_do_preadv_part(). However, with the use of multiple +I/O threads, this is racy. + +Do you think that in your case we hit the small race window between the +checks in blk_wait_while_drained() and GRAPH_RDLOCK_GUARD()? Or is there +another reason why blk_wait_while_drained() didn't do its job? + +Kevin + +On 5/2/25 19:34, Kevin Wolf wrote: +Am 24.04.2025 um 19:32 hat Andrey Drobyshev geschrieben: +Hi all, + +There's a bug in block layer which leads to block graph deadlock. +Notably, it takes place when blockdev IO is processed within a separate +iothread. + +This was initially caught by our tests, and I was able to reduce it to a +relatively simple reproducer. Such deadlocks are probably supposed to +be covered in iotests/graph-changes-while-io, but this deadlock isn't. + +Basically what the reproducer does is launches QEMU with a drive having +'iothread' option set, creates a chain of 2 snapshots, launches +block-commit job for a snapshot and then dismisses the job, starting +from the lower snapshot. If the guest is issuing IO at the same time, +there's a race in acquiring block graph lock and a potential deadlock. + +Here's how it can be reproduced: + +1. Run QEMU: +SRCDIR=/path/to/srcdir +$SRCDIR/build/qemu-system-x86_64 -enable-kvm \ +-machine q35 -cpu Nehalem \ + -name guest=alma8-vm,debug-threads=on \ + -m 2g -smp 2 \ + -nographic -nodefaults \ + -qmp unix:/var/run/alma8-qmp.sock,server=on,wait=off \ + -serial unix:/var/run/alma8-serial.sock,server=on,wait=off \ + -object iothread,id=iothread0 \ + -blockdev +node-name=disk,driver=qcow2,file.driver=file,file.filename=/path/to/img/alma8.qcow2 + \ + -device virtio-blk-pci,drive=disk,iothread=iothread0 +2. Launch IO (random reads) from within the guest: +nc -U /var/run/alma8-serial.sock +... +[root@alma8-vm ~]# fio --name=randread --ioengine=libaio --direct=1 --bs=4k +--size=1G --numjobs=1 --time_based=1 --runtime=300 --group_reporting +--rw=randread --iodepth=1 --filename=/testfile +3. Run snapshots creation & removal of lower snapshot operation in a +loop (script attached): +while /bin/true ; do ./remove_lower_snap.sh ; done +And then it occasionally hangs. + +Note: I've tried bisecting this, and looks like deadlock occurs starting +from the following commit: + +(BAD) 5bdbaebcce virtio: Re-enable notifications after drain +(GOOD) c42c3833e0 virtio-scsi: Attach event vq notifier with no_poll + +On the latest v10.0.0 it does hang as well. 
+ + +Here's backtrace of the main thread: +#0 0x00007fc547d427ce in __ppoll (fds=0x557eb79657b0, nfds=1, timeout=, sigmask=0x0) at ../sysdeps/unix/sysv/linux/ppoll.c:43 +#1 0x0000557eb47d955c in qemu_poll_ns (fds=0x557eb79657b0, nfds=1, timeout=-1) +at ../util/qemu-timer.c:329 +#2 0x0000557eb47b2204 in fdmon_poll_wait (ctx=0x557eb76c5f20, +ready_list=0x7ffd94b4edd8, timeout=-1) at ../util/fdmon-poll.c:79 +#3 0x0000557eb47b1c45 in aio_poll (ctx=0x557eb76c5f20, blocking=true) at +../util/aio-posix.c:730 +#4 0x0000557eb4621edd in bdrv_do_drained_begin (bs=0x557eb795e950, parent=0x0, +poll=true) at ../block/io.c:378 +#5 0x0000557eb4621f7b in bdrv_drained_begin (bs=0x557eb795e950) at +../block/io.c:391 +#6 0x0000557eb45ec125 in bdrv_change_aio_context (bs=0x557eb795e950, +ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, +errp=0x0) + at ../block.c:7682 +#7 0x0000557eb45ebf2b in bdrv_child_change_aio_context (c=0x557eb7964250, +ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, +errp=0x0) + at ../block.c:7608 +#8 0x0000557eb45ec0c4 in bdrv_change_aio_context (bs=0x557eb79575e0, +ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, +errp=0x0) + at ../block.c:7668 +#9 0x0000557eb45ebf2b in bdrv_child_change_aio_context (c=0x557eb7e59110, +ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, +errp=0x0) + at ../block.c:7608 +#10 0x0000557eb45ec0c4 in bdrv_change_aio_context (bs=0x557eb7e51960, +ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, +errp=0x0) + at ../block.c:7668 +#11 0x0000557eb45ebf2b in bdrv_child_change_aio_context (c=0x557eb814ed80, +ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, +errp=0x0) + at ../block.c:7608 +#12 0x0000557eb45ee8e4 in child_job_change_aio_ctx (c=0x557eb7c9d3f0, +ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, +errp=0x0) + at ../blockjob.c:157 +#13 0x0000557eb45ebe2d in bdrv_parent_change_aio_context (c=0x557eb7c9d3f0, +ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, +errp=0x0) + at ../block.c:7592 +#14 0x0000557eb45ec06b in bdrv_change_aio_context (bs=0x557eb7d74310, +ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, +errp=0x0) + at ../block.c:7661 +#15 0x0000557eb45dcd7e in bdrv_child_cb_change_aio_ctx + (child=0x557eb8565af0, ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, +tran=0x557eb7a87160, errp=0x0) at ../block.c:1234 +#16 0x0000557eb45ebe2d in bdrv_parent_change_aio_context (c=0x557eb8565af0, +ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, +errp=0x0) + at ../block.c:7592 +#17 0x0000557eb45ec06b in bdrv_change_aio_context (bs=0x557eb79575e0, +ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, +errp=0x0) + at ../block.c:7661 +#18 0x0000557eb45ec1f3 in bdrv_try_change_aio_context (bs=0x557eb79575e0, +ctx=0x557eb76c5f20, ignore_child=0x0, errp=0x0) at ../block.c:7715 +#19 0x0000557eb45e1b15 in bdrv_root_unref_child (child=0x557eb7966f30) at +../block.c:3317 +#20 0x0000557eb45eeaa8 in block_job_remove_all_bdrv (job=0x557eb7952800) at +../blockjob.c:209 +#21 0x0000557eb45ee641 in block_job_free (job=0x557eb7952800) at +../blockjob.c:82 +#22 0x0000557eb45f17af in job_unref_locked (job=0x557eb7952800) at ../job.c:474 +#23 0x0000557eb45f257d in job_do_dismiss_locked (job=0x557eb7952800) at +../job.c:771 +#24 0x0000557eb45f25fe in job_dismiss_locked (jobptr=0x7ffd94b4f400, +errp=0x7ffd94b4f488) at ../job.c:783 
+--Type for more, q to quit, c to continue without paging-- +#25 0x0000557eb45d8e84 in qmp_job_dismiss (id=0x557eb7aa42b0 "commit-snap1", +errp=0x7ffd94b4f488) at ../job-qmp.c:138 +#26 0x0000557eb472f6a3 in qmp_marshal_job_dismiss (args=0x7fc52c00a3b0, +ret=0x7fc53c880da8, errp=0x7fc53c880da0) at qapi/qapi-commands-job.c:221 +#27 0x0000557eb47a35f3 in do_qmp_dispatch_bh (opaque=0x7fc53c880e40) at +../qapi/qmp-dispatch.c:128 +#28 0x0000557eb47d1cd2 in aio_bh_call (bh=0x557eb79568f0) at ../util/async.c:172 +#29 0x0000557eb47d1df5 in aio_bh_poll (ctx=0x557eb76c0200) at +../util/async.c:219 +#30 0x0000557eb47b12f3 in aio_dispatch (ctx=0x557eb76c0200) at +../util/aio-posix.c:436 +#31 0x0000557eb47d2266 in aio_ctx_dispatch (source=0x557eb76c0200, +callback=0x0, user_data=0x0) at ../util/async.c:361 +#32 0x00007fc549232f4f in g_main_dispatch (context=0x557eb76c6430) at +../glib/gmain.c:3364 +#33 g_main_context_dispatch (context=0x557eb76c6430) at ../glib/gmain.c:4079 +#34 0x0000557eb47d3ab1 in glib_pollfds_poll () at ../util/main-loop.c:287 +#35 0x0000557eb47d3b38 in os_host_main_loop_wait (timeout=0) at +../util/main-loop.c:310 +#36 0x0000557eb47d3c58 in main_loop_wait (nonblocking=0) at +../util/main-loop.c:589 +#37 0x0000557eb4218b01 in qemu_main_loop () at ../system/runstate.c:835 +#38 0x0000557eb46df166 in qemu_default_main (opaque=0x0) at ../system/main.c:50 +#39 0x0000557eb46df215 in main (argc=24, argv=0x7ffd94b4f8d8) at +../system/main.c:80 +And here's coroutine trying to acquire read lock: +(gdb) qemu coroutine reader_queue->entries.sqh_first +#0 0x0000557eb47d7068 in qemu_coroutine_switch (from_=0x557eb7aa48b0, +to_=0x7fc537fff508, action=COROUTINE_YIELD) at ../util/coroutine-ucontext.c:321 +#1 0x0000557eb47d4d4a in qemu_coroutine_yield () at +../util/qemu-coroutine.c:339 +#2 0x0000557eb47d56c8 in qemu_co_queue_wait_impl (queue=0x557eb59954c0 +, lock=0x7fc53c57de50, flags=0) at +../util/qemu-coroutine-lock.c:60 +#3 0x0000557eb461fea7 in bdrv_graph_co_rdlock () at ../block/graph-lock.c:231 +#4 0x0000557eb460c81a in graph_lockable_auto_lock (x=0x7fc53c57dee3) at +/home/root/src/qemu/master/include/block/graph-lock.h:213 +#5 0x0000557eb460fa41 in blk_co_do_preadv_part + (blk=0x557eb84c0810, offset=6890553344, bytes=4096, qiov=0x7fc530006988, +qiov_offset=0, flags=BDRV_REQ_REGISTERED_BUF) at ../block/block-backend.c:1339 +#6 0x0000557eb46104d7 in blk_aio_read_entry (opaque=0x7fc530003240) at +../block/block-backend.c:1619 +#7 0x0000557eb47d6c40 in coroutine_trampoline (i0=-1213577040, i1=21886) at +../util/coroutine-ucontext.c:175 +#8 0x00007fc547c2a360 in __start_context () at +../sysdeps/unix/sysv/linux/x86_64/__start_context.S:91 +#9 0x00007ffd94b4ea40 in () +#10 0x0000000000000000 in () +So it looks like main thread is processing job-dismiss request and is +holding write lock taken in block_job_remove_all_bdrv() (frame #20 +above). At the same time iothread spawns a coroutine which performs IO +request. Before the coroutine is spawned, blk_aio_prwv() increases +'in_flight' counter for Blk. Then blk_co_do_preadv_part() (frame #5) is +trying to acquire the read lock. But main thread isn't releasing the +lock as blk_root_drained_poll() returns true since blk->in_flight > 0. +Here's the deadlock. + +Any comments and suggestions on the subject are welcomed. Thanks! +I think this is what the blk_wait_while_drained() call was supposed to +address in blk_co_do_preadv_part(). However, with the use of multiple +I/O threads, this is racy. 
+ +Do you think that in your case we hit the small race window between the +checks in blk_wait_while_drained() and GRAPH_RDLOCK_GUARD()? Or is there +another reason why blk_wait_while_drained() didn't do its job? + +Kevin +At my opinion there is very big race window. Main thread has +eaten graph write lock. After that another coroutine is stalled +within GRAPH_RDLOCK_GUARD() as there is no drain at the moment and only +after that main thread has started drain. That is why Fiona's idea is +looking working. Though this would mean that normally we should always +do that at the moment when we acquire write lock. May be even inside +this function. Den + +Am 02.05.2025 um 19:52 hat Denis V. Lunev geschrieben: +> +On 5/2/25 19:34, Kevin Wolf wrote: +> +> Am 24.04.2025 um 19:32 hat Andrey Drobyshev geschrieben: +> +> > Hi all, +> +> > +> +> > There's a bug in block layer which leads to block graph deadlock. +> +> > Notably, it takes place when blockdev IO is processed within a separate +> +> > iothread. +> +> > +> +> > This was initially caught by our tests, and I was able to reduce it to a +> +> > relatively simple reproducer. Such deadlocks are probably supposed to +> +> > be covered in iotests/graph-changes-while-io, but this deadlock isn't. +> +> > +> +> > Basically what the reproducer does is launches QEMU with a drive having +> +> > 'iothread' option set, creates a chain of 2 snapshots, launches +> +> > block-commit job for a snapshot and then dismisses the job, starting +> +> > from the lower snapshot. If the guest is issuing IO at the same time, +> +> > there's a race in acquiring block graph lock and a potential deadlock. +> +> > +> +> > Here's how it can be reproduced: +> +> > +> +> > 1. Run QEMU: +> +> > > SRCDIR=/path/to/srcdir +> +> > > $SRCDIR/build/qemu-system-x86_64 -enable-kvm \ +> +> > > -machine q35 -cpu Nehalem \ +> +> > > -name guest=alma8-vm,debug-threads=on \ +> +> > > -m 2g -smp 2 \ +> +> > > -nographic -nodefaults \ +> +> > > -qmp unix:/var/run/alma8-qmp.sock,server=on,wait=off \ +> +> > > -serial unix:/var/run/alma8-serial.sock,server=on,wait=off \ +> +> > > -object iothread,id=iothread0 \ +> +> > > -blockdev +> +> > > node-name=disk,driver=qcow2,file.driver=file,file.filename=/path/to/img/alma8.qcow2 +> +> > > \ +> +> > > -device virtio-blk-pci,drive=disk,iothread=iothread0 +> +> > 2. Launch IO (random reads) from within the guest: +> +> > > nc -U /var/run/alma8-serial.sock +> +> > > ... +> +> > > [root@alma8-vm ~]# fio --name=randread --ioengine=libaio --direct=1 +> +> > > --bs=4k --size=1G --numjobs=1 --time_based=1 --runtime=300 +> +> > > --group_reporting --rw=randread --iodepth=1 --filename=/testfile +> +> > 3. Run snapshots creation & removal of lower snapshot operation in a +> +> > loop (script attached): +> +> > > while /bin/true ; do ./remove_lower_snap.sh ; done +> +> > And then it occasionally hangs. +> +> > +> +> > Note: I've tried bisecting this, and looks like deadlock occurs starting +> +> > from the following commit: +> +> > +> +> > (BAD) 5bdbaebcce virtio: Re-enable notifications after drain +> +> > (GOOD) c42c3833e0 virtio-scsi: Attach event vq notifier with no_poll +> +> > +> +> > On the latest v10.0.0 it does hang as well. 
+> +> > +> +> > +> +> > Here's backtrace of the main thread: +> +> > +> +> > > #0 0x00007fc547d427ce in __ppoll (fds=0x557eb79657b0, nfds=1, +> +> > > timeout=, sigmask=0x0) at +> +> > > ../sysdeps/unix/sysv/linux/ppoll.c:43 +> +> > > #1 0x0000557eb47d955c in qemu_poll_ns (fds=0x557eb79657b0, nfds=1, +> +> > > timeout=-1) at ../util/qemu-timer.c:329 +> +> > > #2 0x0000557eb47b2204 in fdmon_poll_wait (ctx=0x557eb76c5f20, +> +> > > ready_list=0x7ffd94b4edd8, timeout=-1) at ../util/fdmon-poll.c:79 +> +> > > #3 0x0000557eb47b1c45 in aio_poll (ctx=0x557eb76c5f20, blocking=true) +> +> > > at ../util/aio-posix.c:730 +> +> > > #4 0x0000557eb4621edd in bdrv_do_drained_begin (bs=0x557eb795e950, +> +> > > parent=0x0, poll=true) at ../block/io.c:378 +> +> > > #5 0x0000557eb4621f7b in bdrv_drained_begin (bs=0x557eb795e950) at +> +> > > ../block/io.c:391 +> +> > > #6 0x0000557eb45ec125 in bdrv_change_aio_context (bs=0x557eb795e950, +> +> > > ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, +> +> > > tran=0x557eb7a87160, errp=0x0) +> +> > > at ../block.c:7682 +> +> > > #7 0x0000557eb45ebf2b in bdrv_child_change_aio_context +> +> > > (c=0x557eb7964250, ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, +> +> > > tran=0x557eb7a87160, errp=0x0) +> +> > > at ../block.c:7608 +> +> > > #8 0x0000557eb45ec0c4 in bdrv_change_aio_context (bs=0x557eb79575e0, +> +> > > ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, +> +> > > tran=0x557eb7a87160, errp=0x0) +> +> > > at ../block.c:7668 +> +> > > #9 0x0000557eb45ebf2b in bdrv_child_change_aio_context +> +> > > (c=0x557eb7e59110, ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, +> +> > > tran=0x557eb7a87160, errp=0x0) +> +> > > at ../block.c:7608 +> +> > > #10 0x0000557eb45ec0c4 in bdrv_change_aio_context (bs=0x557eb7e51960, +> +> > > ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, +> +> > > tran=0x557eb7a87160, errp=0x0) +> +> > > at ../block.c:7668 +> +> > > #11 0x0000557eb45ebf2b in bdrv_child_change_aio_context +> +> > > (c=0x557eb814ed80, ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, +> +> > > tran=0x557eb7a87160, errp=0x0) +> +> > > at ../block.c:7608 +> +> > > #12 0x0000557eb45ee8e4 in child_job_change_aio_ctx (c=0x557eb7c9d3f0, +> +> > > ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, +> +> > > tran=0x557eb7a87160, errp=0x0) +> +> > > at ../blockjob.c:157 +> +> > > #13 0x0000557eb45ebe2d in bdrv_parent_change_aio_context +> +> > > (c=0x557eb7c9d3f0, ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, +> +> > > tran=0x557eb7a87160, errp=0x0) +> +> > > at ../block.c:7592 +> +> > > #14 0x0000557eb45ec06b in bdrv_change_aio_context (bs=0x557eb7d74310, +> +> > > ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, +> +> > > tran=0x557eb7a87160, errp=0x0) +> +> > > at ../block.c:7661 +> +> > > #15 0x0000557eb45dcd7e in bdrv_child_cb_change_aio_ctx +> +> > > (child=0x557eb8565af0, ctx=0x557eb76c5f20, visited=0x557eb7e06b60 +> +> > > = {...}, tran=0x557eb7a87160, errp=0x0) at ../block.c:1234 +> +> > > #16 0x0000557eb45ebe2d in bdrv_parent_change_aio_context +> +> > > (c=0x557eb8565af0, ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, +> +> > > tran=0x557eb7a87160, errp=0x0) +> +> > > at ../block.c:7592 +> +> > > #17 0x0000557eb45ec06b in bdrv_change_aio_context (bs=0x557eb79575e0, +> +> > > ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, +> +> > > tran=0x557eb7a87160, errp=0x0) +> +> > > at ../block.c:7661 +> +> > > #18 0x0000557eb45ec1f3 in bdrv_try_change_aio_context +> +> > > (bs=0x557eb79575e0, ctx=0x557eb76c5f20, ignore_child=0x0, 
errp=0x0) at +> +> > > ../block.c:7715 +> +> > > #19 0x0000557eb45e1b15 in bdrv_root_unref_child (child=0x557eb7966f30) +> +> > > at ../block.c:3317 +> +> > > #20 0x0000557eb45eeaa8 in block_job_remove_all_bdrv +> +> > > (job=0x557eb7952800) at ../blockjob.c:209 +> +> > > #21 0x0000557eb45ee641 in block_job_free (job=0x557eb7952800) at +> +> > > ../blockjob.c:82 +> +> > > #22 0x0000557eb45f17af in job_unref_locked (job=0x557eb7952800) at +> +> > > ../job.c:474 +> +> > > #23 0x0000557eb45f257d in job_do_dismiss_locked (job=0x557eb7952800) at +> +> > > ../job.c:771 +> +> > > #24 0x0000557eb45f25fe in job_dismiss_locked (jobptr=0x7ffd94b4f400, +> +> > > errp=0x7ffd94b4f488) at ../job.c:783 +> +> > > --Type for more, q to quit, c to continue without paging-- +> +> > > #25 0x0000557eb45d8e84 in qmp_job_dismiss (id=0x557eb7aa42b0 +> +> > > "commit-snap1", errp=0x7ffd94b4f488) at ../job-qmp.c:138 +> +> > > #26 0x0000557eb472f6a3 in qmp_marshal_job_dismiss (args=0x7fc52c00a3b0, +> +> > > ret=0x7fc53c880da8, errp=0x7fc53c880da0) at qapi/qapi-commands-job.c:221 +> +> > > #27 0x0000557eb47a35f3 in do_qmp_dispatch_bh (opaque=0x7fc53c880e40) at +> +> > > ../qapi/qmp-dispatch.c:128 +> +> > > #28 0x0000557eb47d1cd2 in aio_bh_call (bh=0x557eb79568f0) at +> +> > > ../util/async.c:172 +> +> > > #29 0x0000557eb47d1df5 in aio_bh_poll (ctx=0x557eb76c0200) at +> +> > > ../util/async.c:219 +> +> > > #30 0x0000557eb47b12f3 in aio_dispatch (ctx=0x557eb76c0200) at +> +> > > ../util/aio-posix.c:436 +> +> > > #31 0x0000557eb47d2266 in aio_ctx_dispatch (source=0x557eb76c0200, +> +> > > callback=0x0, user_data=0x0) at ../util/async.c:361 +> +> > > #32 0x00007fc549232f4f in g_main_dispatch (context=0x557eb76c6430) at +> +> > > ../glib/gmain.c:3364 +> +> > > #33 g_main_context_dispatch (context=0x557eb76c6430) at +> +> > > ../glib/gmain.c:4079 +> +> > > #34 0x0000557eb47d3ab1 in glib_pollfds_poll () at +> +> > > ../util/main-loop.c:287 +> +> > > #35 0x0000557eb47d3b38 in os_host_main_loop_wait (timeout=0) at +> +> > > ../util/main-loop.c:310 +> +> > > #36 0x0000557eb47d3c58 in main_loop_wait (nonblocking=0) at +> +> > > ../util/main-loop.c:589 +> +> > > #37 0x0000557eb4218b01 in qemu_main_loop () at ../system/runstate.c:835 +> +> > > #38 0x0000557eb46df166 in qemu_default_main (opaque=0x0) at +> +> > > ../system/main.c:50 +> +> > > #39 0x0000557eb46df215 in main (argc=24, argv=0x7ffd94b4f8d8) at +> +> > > ../system/main.c:80 +> +> > +> +> > And here's coroutine trying to acquire read lock: +> +> > +> +> > > (gdb) qemu coroutine reader_queue->entries.sqh_first +> +> > > #0 0x0000557eb47d7068 in qemu_coroutine_switch (from_=0x557eb7aa48b0, +> +> > > to_=0x7fc537fff508, action=COROUTINE_YIELD) at +> +> > > ../util/coroutine-ucontext.c:321 +> +> > > #1 0x0000557eb47d4d4a in qemu_coroutine_yield () at +> +> > > ../util/qemu-coroutine.c:339 +> +> > > #2 0x0000557eb47d56c8 in qemu_co_queue_wait_impl (queue=0x557eb59954c0 +> +> > > , lock=0x7fc53c57de50, flags=0) at +> +> > > ../util/qemu-coroutine-lock.c:60 +> +> > > #3 0x0000557eb461fea7 in bdrv_graph_co_rdlock () at +> +> > > ../block/graph-lock.c:231 +> +> > > #4 0x0000557eb460c81a in graph_lockable_auto_lock (x=0x7fc53c57dee3) +> +> > > at /home/root/src/qemu/master/include/block/graph-lock.h:213 +> +> > > #5 0x0000557eb460fa41 in blk_co_do_preadv_part +> +> > > (blk=0x557eb84c0810, offset=6890553344, bytes=4096, +> +> > > qiov=0x7fc530006988, qiov_offset=0, flags=BDRV_REQ_REGISTERED_BUF) at +> +> > > ../block/block-backend.c:1339 +> +> > > #6 0x0000557eb46104d7 in 
blk_aio_read_entry (opaque=0x7fc530003240) at +> +> > > ../block/block-backend.c:1619 +> +> > > #7 0x0000557eb47d6c40 in coroutine_trampoline (i0=-1213577040, +> +> > > i1=21886) at ../util/coroutine-ucontext.c:175 +> +> > > #8 0x00007fc547c2a360 in __start_context () at +> +> > > ../sysdeps/unix/sysv/linux/x86_64/__start_context.S:91 +> +> > > #9 0x00007ffd94b4ea40 in () +> +> > > #10 0x0000000000000000 in () +> +> > +> +> > So it looks like main thread is processing job-dismiss request and is +> +> > holding write lock taken in block_job_remove_all_bdrv() (frame #20 +> +> > above). At the same time iothread spawns a coroutine which performs IO +> +> > request. Before the coroutine is spawned, blk_aio_prwv() increases +> +> > 'in_flight' counter for Blk. Then blk_co_do_preadv_part() (frame #5) is +> +> > trying to acquire the read lock. But main thread isn't releasing the +> +> > lock as blk_root_drained_poll() returns true since blk->in_flight > 0. +> +> > Here's the deadlock. +> +> > +> +> > Any comments and suggestions on the subject are welcomed. Thanks! +> +> I think this is what the blk_wait_while_drained() call was supposed to +> +> address in blk_co_do_preadv_part(). However, with the use of multiple +> +> I/O threads, this is racy. +> +> +> +> Do you think that in your case we hit the small race window between the +> +> checks in blk_wait_while_drained() and GRAPH_RDLOCK_GUARD()? Or is there +> +> another reason why blk_wait_while_drained() didn't do its job? +> +> +> +At my opinion there is very big race window. Main thread has +> +eaten graph write lock. After that another coroutine is stalled +> +within GRAPH_RDLOCK_GUARD() as there is no drain at the moment and only +> +after that main thread has started drain. +You're right, I confused taking the write lock with draining there. + +> +That is why Fiona's idea is looking working. Though this would mean +> +that normally we should always do that at the moment when we acquire +> +write lock. May be even inside this function. +I actually see now that not all of my graph locking patches were merged. +At least I did have the thought that bdrv_drained_begin() must be marked +GRAPH_UNLOCKED because it polls. That means that calling it from inside +bdrv_try_change_aio_context() is actually forbidden (and that's the part +I didn't see back then because it doesn't have TSA annotations). + +If you refactor the code to move the drain out to before the lock is +taken, I think you end up with Fiona's patch, except you'll remove the +forbidden inner drain and add more annotations for some functions and +clarify the rules around them. I don't know, but I wouldn't be surprised +if along the process we find other bugs, too. + +So Fiona's drain looks right to me, but we should probably approach it +more systematically. + +Kevin + diff --git a/classification_output/01/instruction/26095107 b/classification_output/01/instruction/26095107 new file mode 100644 index 000000000..c06d35dd8 --- /dev/null +++ b/classification_output/01/instruction/26095107 @@ -0,0 +1,158 @@ +instruction: 0.991 +other: 0.979 +semantic: 0.974 +mistranslation: 0.930 + +[Qemu-devel] [Bug Report] vm paused after succeeding to migrate + +Hi, all +I encounterd a bug when I try to migrate a windows vm. + +Enviroment information: +host A: cpu E5620(model WestmereEP without flag xsave) +host B: cpu E5-2643(model SandyBridgeEP with xsave) + +The reproduce steps is : +1. Start a windows 2008 vm with -cpu host(which means host-passthrough). +2. 
Migrate the vm to host B when cr4.OSXSAVE=0 (successfully). +3. Vm runs on host B for a while so that cr4.OSXSAVE changes to 1. +4. Then migrate the vm to host A (successfully), but vm was paused, and qemu +printed log as followed: + +KVM: entry failed, hardware error 0x80000021 + +If you're running a guest on an Intel machine without unrestricted mode +support, the failure can be most likely due to the guest entering an invalid +state for Intel VT. For example, the guest maybe running in big real mode +which is not supported on less recent Intel processors. + +EAX=019b3bb0 EBX=01a3ae80 ECX=01a61ce8 EDX=00000000 +ESI=01a62000 EDI=00000000 EBP=00000000 ESP=01718b20 +EIP=0185d982 EFL=00000286 [--S--P-] CPL=0 II=0 A20=1 SMM=0 HLT=0 +ES =0000 00000000 0000ffff 00009300 +CS =f000 ffff0000 0000ffff 00009b00 +SS =0000 00000000 0000ffff 00009300 +DS =0000 00000000 0000ffff 00009300 +FS =0000 00000000 0000ffff 00009300 +GS =0000 00000000 0000ffff 00009300 +LDT=0000 00000000 0000ffff 00008200 +TR =0000 00000000 0000ffff 00008b00 +GDT= 00000000 0000ffff +IDT= 00000000 0000ffff +CR0=60000010 CR2=00000000 CR3=00000000 CR4=00000000 +DR0=0000000000000000 DR1=0000000000000000 DR2=0000000000000000 +DR3=0000000000000000 +DR6=00000000ffff0ff0 DR7=0000000000000400 +EFER=0000000000000000 +Code=00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 <00> 00 00 00 +00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 + +I have found that problem happened when kvm_put_sregs returns err -22(called by +kvm_arch_put_registers(qemu)). +Because kvm_arch_vcpu_ioctl_set_sregs(kvm-mod) checked that guest_cpuid_has no +X86_FEATURE_XSAVE but cr4.OSXSAVE=1. +So should we cancel migration when kvm_arch_put_registers returns error? + +* linzhecheng (address@hidden) wrote: +> +Hi, all +> +I encounterd a bug when I try to migrate a windows vm. +> +> +Enviroment information: +> +host A: cpu E5620(model WestmereEP without flag xsave) +> +host B: cpu E5-2643(model SandyBridgeEP with xsave) +> +> +The reproduce steps is : +> +1. Start a windows 2008 vm with -cpu host(which means host-passthrough). +> +2. Migrate the vm to host B when cr4.OSXSAVE=0 (successfully). +> +3. Vm runs on host B for a while so that cr4.OSXSAVE changes to 1. +> +4. Then migrate the vm to host A (successfully), but vm was paused, and qemu +> +printed log as followed: +Remember that migrating using -cpu host across different CPU models is NOT +expected to work. + +> +KVM: entry failed, hardware error 0x80000021 +> +> +If you're running a guest on an Intel machine without unrestricted mode +> +support, the failure can be most likely due to the guest entering an invalid +> +state for Intel VT. For example, the guest maybe running in big real mode +> +which is not supported on less recent Intel processors. 
+> +> +EAX=019b3bb0 EBX=01a3ae80 ECX=01a61ce8 EDX=00000000 +> +ESI=01a62000 EDI=00000000 EBP=00000000 ESP=01718b20 +> +EIP=0185d982 EFL=00000286 [--S--P-] CPL=0 II=0 A20=1 SMM=0 HLT=0 +> +ES =0000 00000000 0000ffff 00009300 +> +CS =f000 ffff0000 0000ffff 00009b00 +> +SS =0000 00000000 0000ffff 00009300 +> +DS =0000 00000000 0000ffff 00009300 +> +FS =0000 00000000 0000ffff 00009300 +> +GS =0000 00000000 0000ffff 00009300 +> +LDT=0000 00000000 0000ffff 00008200 +> +TR =0000 00000000 0000ffff 00008b00 +> +GDT= 00000000 0000ffff +> +IDT= 00000000 0000ffff +> +CR0=60000010 CR2=00000000 CR3=00000000 CR4=00000000 +> +DR0=0000000000000000 DR1=0000000000000000 DR2=0000000000000000 +> +DR3=0000000000000000 +> +DR6=00000000ffff0ff0 DR7=0000000000000400 +> +EFER=0000000000000000 +> +Code=00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 <00> 00 00 +> +00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 +> +00 +> +> +I have found that problem happened when kvm_put_sregs returns err -22(called +> +by kvm_arch_put_registers(qemu)). +> +Because kvm_arch_vcpu_ioctl_set_sregs(kvm-mod) checked that guest_cpuid_has +> +no X86_FEATURE_XSAVE but cr4.OSXSAVE=1. +> +So should we cancel migration when kvm_arch_put_registers returns error? +It would seem good if we can make the migration fail there rather than +hitting that KVM error. +It looks like we need to do a bit of plumbing to convert the places that +call it to return a bool rather than void. + +Dave + +-- +Dr. David Alan Gilbert / address@hidden / Manchester, UK + diff --git a/classification_output/01/instruction/2609717 b/classification_output/01/instruction/2609717 deleted file mode 100644 index b8e563ad9..000000000 --- a/classification_output/01/instruction/2609717 +++ /dev/null @@ -1,4939 +0,0 @@ -instruction: 0.693 -mistranslation: 0.687 -semantic: 0.656 -other: 0.637 - -[BUG] cxl can not create region - -Hi list - -I want to test cxl functions in arm64, and found some problems I can't -figure out. - -My test environment: - -1. build latest bios from -https://github.com/tianocore/edk2.git -master -branch(cc2db6ebfb6d9d85ba4c7b35fba1fa37fffc0bc2) -2. build latest qemu-system-aarch64 from git://git.qemu.org/qemu.git -master branch(846dcf0ba4eff824c295f06550b8673ff3f31314). With cxl arm -support patch: -https://patchwork.kernel.org/project/cxl/cover/20220616141950.23374-1-Jonathan.Cameron@huawei.com/ -3. build Linux kernel from -https://git.kernel.org/pub/scm/linux/kernel/git/cxl/cxl.git -preview -branch(65fc1c3d26b96002a5aa1f4012fae4dc98fd5683) -4. 
build latest ndctl tools from -https://github.com/pmem/ndctl -create_region branch(8558b394e449779e3a4f3ae90fae77ede0bca159) - -And my qemu test commands: -sudo $QEMU_BIN -M virt,gic-version=3,cxl=on -m 4g,maxmem=8G,slots=8 \ - -cpu max -smp 8 -nographic -no-reboot \ - -kernel $KERNEL -bios $BIOS_BIN \ - -drive if=none,file=$ROOTFS,format=qcow2,id=hd \ - -device virtio-blk-pci,drive=hd -append 'root=/dev/vda1 -nokaslr dyndbg="module cxl* +p"' \ - -object memory-backend-ram,size=4G,id=mem0 \ - -numa node,nodeid=0,cpus=0-7,memdev=mem0 \ - -net nic -net user,hostfwd=tcp::2222-:22 -enable-kvm \ - -object -memory-backend-file,id=cxl-mem0,share=on,mem-path=/tmp/cxltest.raw,size=256M -\ - -object -memory-backend-file,id=cxl-mem1,share=on,mem-path=/tmp/cxltest1.raw,size=256M -\ - -object -memory-backend-file,id=cxl-mem2,share=on,mem-path=/tmp/cxltest2.raw,size=256M -\ - -object -memory-backend-file,id=cxl-mem3,share=on,mem-path=/tmp/cxltest3.raw,size=256M -\ - -object -memory-backend-file,id=cxl-lsa0,share=on,mem-path=/tmp/lsa0.raw,size=256M -\ - -object -memory-backend-file,id=cxl-lsa1,share=on,mem-path=/tmp/lsa1.raw,size=256M -\ - -object -memory-backend-file,id=cxl-lsa2,share=on,mem-path=/tmp/lsa2.raw,size=256M -\ - -object -memory-backend-file,id=cxl-lsa3,share=on,mem-path=/tmp/lsa3.raw,size=256M -\ - -device pxb-cxl,bus_nr=12,bus=pcie.0,id=cxl.1 \ - -device cxl-rp,port=0,bus=cxl.1,id=root_port0,chassis=0,slot=0 \ - -device cxl-upstream,bus=root_port0,id=us0 \ - -device cxl-downstream,port=0,bus=us0,id=swport0,chassis=0,slot=4 \ - -device -cxl-type3,bus=swport0,memdev=cxl-mem0,lsa=cxl-lsa0,id=cxl-pmem0 \ - -device cxl-downstream,port=1,bus=us0,id=swport1,chassis=0,slot=5 \ - -device -cxl-type3,bus=swport1,memdev=cxl-mem1,lsa=cxl-lsa1,id=cxl-pmem1 \ - -device cxl-downstream,port=2,bus=us0,id=swport2,chassis=0,slot=6 \ - -device -cxl-type3,bus=swport2,memdev=cxl-mem2,lsa=cxl-lsa2,id=cxl-pmem2 \ - -device cxl-downstream,port=3,bus=us0,id=swport3,chassis=0,slot=7 \ - -device -cxl-type3,bus=swport3,memdev=cxl-mem3,lsa=cxl-lsa3,id=cxl-pmem3 \ - -M -cxl-fmw.0.targets.0=cxl.1,cxl-fmw.0.size=4G,cxl-fmw.0.interleave-granularity=4k - -And I have got two problems. -1. When I want to create x1 region with command: "cxl create-region -d -decoder0.0 -w 1 -g 4096 mem0", kernel crashed with null pointer -reference. 
Crash log: - -[ 534.697324] cxl_region region0: config state: 0 -[ 534.697346] cxl_region region0: probe: -6 -[ 534.697368] cxl_acpi ACPI0017:00: decoder0.0: created region0 -[ 534.699115] cxl region0: mem0:endpoint3 decoder3.0 add: -mem0:decoder3.0 @ 0 next: none nr_eps: 1 nr_targets: 1 -[ 534.699149] cxl region0: 0000:0d:00.0:port2 decoder2.0 add: -mem0:decoder3.0 @ 0 next: mem0 nr_eps: 1 nr_targets: 1 -[ 534.699167] cxl region0: ACPI0016:00:port1 decoder1.0 add: -mem0:decoder3.0 @ 0 next: 0000:0d:00.0 nr_eps: 1 nr_targets: 1 -[ 534.699176] cxl region0: ACPI0016:00:port1 iw: 1 ig: 256 -[ 534.699182] cxl region0: ACPI0016:00:port1 target[0] = 0000:0c:00.0 -for mem0:decoder3.0 @ 0 -[ 534.699189] cxl region0: 0000:0d:00.0:port2 iw: 1 ig: 256 -[ 534.699193] cxl region0: 0000:0d:00.0:port2 target[0] = -0000:0e:00.0 for mem0:decoder3.0 @ 0 -[ 534.699405] Unable to handle kernel NULL pointer dereference at -virtual address 0000000000000000 -[ 534.701474] Mem abort info: -[ 534.701994] ESR = 0x0000000086000004 -[ 534.702653] EC = 0x21: IABT (current EL), IL = 32 bits -[ 534.703616] SET = 0, FnV = 0 -[ 534.704174] EA = 0, S1PTW = 0 -[ 534.704803] FSC = 0x04: level 0 translation fault -[ 534.705694] user pgtable: 4k pages, 48-bit VAs, pgdp=000000010144a000 -[ 534.706875] [0000000000000000] pgd=0000000000000000, p4d=0000000000000000 -[ 534.709855] Internal error: Oops: 86000004 [#1] PREEMPT SMP -[ 534.710301] Modules linked in: -[ 534.710546] CPU: 7 PID: 331 Comm: cxl Not tainted -5.19.0-rc3-00064-g65fc1c3d26b9-dirty #11 -[ 534.715393] Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 -[ 534.717179] pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) -[ 534.719190] pc : 0x0 -[ 534.719928] lr : commit_store+0x118/0x2cc -[ 534.721007] sp : ffff80000aec3c30 -[ 534.721793] x29: ffff80000aec3c30 x28: ffff0000da62e740 x27: ffff0000c0c06b30 -[ 534.723875] x26: 0000000000000000 x25: ffff0000c0a2a400 x24: ffff0000c0a29400 -[ 534.725440] x23: 0000000000000003 x22: 0000000000000000 x21: ffff0000c0c06800 -[ 534.727312] x20: 0000000000000000 x19: ffff0000c1559800 x18: 0000000000000000 -[ 534.729138] x17: 0000000000000000 x16: 0000000000000000 x15: 0000ffffd41fe838 -[ 534.731046] x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000 -[ 534.732402] x11: 0000000000000000 x10: 0000000000000000 x9 : 0000000000000000 -[ 534.734432] x8 : 0000000000000000 x7 : 0000000000000000 x6 : ffff0000c0906e80 -[ 534.735921] x5 : 0000000000000000 x4 : 0000000000000000 x3 : ffff80000aec3bf0 -[ 534.737437] x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffff0000c155a000 -[ 534.738878] Call trace: -[ 534.739368] 0x0 -[ 534.739713] dev_attr_store+0x1c/0x30 -[ 534.740186] sysfs_kf_write+0x48/0x58 -[ 534.740961] kernfs_fop_write_iter+0x128/0x184 -[ 534.741872] new_sync_write+0xdc/0x158 -[ 534.742706] vfs_write+0x1ac/0x2a8 -[ 534.743440] ksys_write+0x68/0xf0 -[ 534.744328] __arm64_sys_write+0x1c/0x28 -[ 534.745180] invoke_syscall+0x44/0xf0 -[ 534.745989] el0_svc_common+0x4c/0xfc -[ 534.746661] do_el0_svc+0x60/0xa8 -[ 534.747378] el0_svc+0x2c/0x78 -[ 534.748066] el0t_64_sync_handler+0xb8/0x12c -[ 534.748919] el0t_64_sync+0x18c/0x190 -[ 534.749629] Code: bad PC value -[ 534.750169] ---[ end trace 0000000000000000 ]--- - -2. When I want to create x4 region with command: "cxl create-region -d -decoder0.0 -w 4 -g 4096 -m mem0 mem1 mem2 mem3". 
I got below errors: - -cxl region: create_region: region0: failed to set target3 to mem3 -cxl region: cmd_create_region: created 0 regions - -And kernel log as below: -[ 60.536663] cxl_region region0: config state: 0 -[ 60.536675] cxl_region region0: probe: -6 -[ 60.536696] cxl_acpi ACPI0017:00: decoder0.0: created region0 -[ 60.538251] cxl region0: mem0:endpoint3 decoder3.0 add: -mem0:decoder3.0 @ 0 next: none nr_eps: 1 nr_targets: 1 -[ 60.538278] cxl region0: 0000:0d:00.0:port2 decoder2.0 add: -mem0:decoder3.0 @ 0 next: mem0 nr_eps: 1 nr_targets: 1 -[ 60.538295] cxl region0: ACPI0016:00:port1 decoder1.0 add: -mem0:decoder3.0 @ 0 next: 0000:0d:00.0 nr_eps: 1 nr_targets: 1 -[ 60.538647] cxl region0: mem1:endpoint4 decoder4.0 add: -mem1:decoder4.0 @ 1 next: none nr_eps: 1 nr_targets: 1 -[ 60.538663] cxl region0: 0000:0d:00.0:port2 decoder2.0 add: -mem1:decoder4.0 @ 1 next: mem1 nr_eps: 2 nr_targets: 2 -[ 60.538675] cxl region0: ACPI0016:00:port1 decoder1.0 add: -mem1:decoder4.0 @ 1 next: 0000:0d:00.0 nr_eps: 2 nr_targets: 1 -[ 60.539311] cxl region0: mem2:endpoint5 decoder5.0 add: -mem2:decoder5.0 @ 2 next: none nr_eps: 1 nr_targets: 1 -[ 60.539332] cxl region0: 0000:0d:00.0:port2 decoder2.0 add: -mem2:decoder5.0 @ 2 next: mem2 nr_eps: 3 nr_targets: 3 -[ 60.539343] cxl region0: ACPI0016:00:port1 decoder1.0 add: -mem2:decoder5.0 @ 2 next: 0000:0d:00.0 nr_eps: 3 nr_targets: 1 -[ 60.539711] cxl region0: mem3:endpoint6 decoder6.0 add: -mem3:decoder6.0 @ 3 next: none nr_eps: 1 nr_targets: 1 -[ 60.539723] cxl region0: 0000:0d:00.0:port2 decoder2.0 add: -mem3:decoder6.0 @ 3 next: mem3 nr_eps: 4 nr_targets: 4 -[ 60.539735] cxl region0: ACPI0016:00:port1 decoder1.0 add: -mem3:decoder6.0 @ 3 next: 0000:0d:00.0 nr_eps: 4 nr_targets: 1 -[ 60.539742] cxl region0: ACPI0016:00:port1 iw: 1 ig: 256 -[ 60.539747] cxl region0: ACPI0016:00:port1 target[0] = 0000:0c:00.0 -for mem0:decoder3.0 @ 0 -[ 60.539754] cxl region0: 0000:0d:00.0:port2 iw: 4 ig: 512 -[ 60.539758] cxl region0: 0000:0d:00.0:port2 target[0] = -0000:0e:00.0 for mem0:decoder3.0 @ 0 -[ 60.539764] cxl region0: ACPI0016:00:port1: cannot host mem1:decoder4.0 at 1 - -I have tried to write sysfs node manually, got same errors. - -Hope I can get some helps here. - -Bob - -On Fri, 5 Aug 2022 10:20:23 +0800 -Bobo WL wrote: - -> -Hi list -> -> -I want to test cxl functions in arm64, and found some problems I can't -> -figure out. -Hi Bob, - -Glad to see people testing this code. - -> -> -My test environment: -> -> -1. build latest bios from -https://github.com/tianocore/edk2.git -master -> -branch(cc2db6ebfb6d9d85ba4c7b35fba1fa37fffc0bc2) -> -2. build latest qemu-system-aarch64 from git://git.qemu.org/qemu.git -> -master branch(846dcf0ba4eff824c295f06550b8673ff3f31314). With cxl arm -> -support patch: -> -https://patchwork.kernel.org/project/cxl/cover/20220616141950.23374-1-Jonathan.Cameron@huawei.com/ -> -3. build Linux kernel from -> -https://git.kernel.org/pub/scm/linux/kernel/git/cxl/cxl.git -preview -> -branch(65fc1c3d26b96002a5aa1f4012fae4dc98fd5683) -> -4. 
build latest ndctl tools from -https://github.com/pmem/ndctl -> -create_region branch(8558b394e449779e3a4f3ae90fae77ede0bca159) -> -> -And my qemu test commands: -> -sudo $QEMU_BIN -M virt,gic-version=3,cxl=on -m 4g,maxmem=8G,slots=8 \ -> --cpu max -smp 8 -nographic -no-reboot \ -> --kernel $KERNEL -bios $BIOS_BIN \ -> --drive if=none,file=$ROOTFS,format=qcow2,id=hd \ -> --device virtio-blk-pci,drive=hd -append 'root=/dev/vda1 -> -nokaslr dyndbg="module cxl* +p"' \ -> --object memory-backend-ram,size=4G,id=mem0 \ -> --numa node,nodeid=0,cpus=0-7,memdev=mem0 \ -> --net nic -net user,hostfwd=tcp::2222-:22 -enable-kvm \ -> --object -> -memory-backend-file,id=cxl-mem0,share=on,mem-path=/tmp/cxltest.raw,size=256M -> -\ -> --object -> -memory-backend-file,id=cxl-mem1,share=on,mem-path=/tmp/cxltest1.raw,size=256M -> -\ -> --object -> -memory-backend-file,id=cxl-mem2,share=on,mem-path=/tmp/cxltest2.raw,size=256M -> -\ -> --object -> -memory-backend-file,id=cxl-mem3,share=on,mem-path=/tmp/cxltest3.raw,size=256M -> -\ -> --object -> -memory-backend-file,id=cxl-lsa0,share=on,mem-path=/tmp/lsa0.raw,size=256M -> -\ -> --object -> -memory-backend-file,id=cxl-lsa1,share=on,mem-path=/tmp/lsa1.raw,size=256M -> -\ -> --object -> -memory-backend-file,id=cxl-lsa2,share=on,mem-path=/tmp/lsa2.raw,size=256M -> -\ -> --object -> -memory-backend-file,id=cxl-lsa3,share=on,mem-path=/tmp/lsa3.raw,size=256M -> -\ -> --device pxb-cxl,bus_nr=12,bus=pcie.0,id=cxl.1 \ -> --device cxl-rp,port=0,bus=cxl.1,id=root_port0,chassis=0,slot=0 \ -Probably not related to your problem, but there is a disconnect in QEMU / -kernel assumptionsaround the presence of an HDM decoder when a HB only -has a single root port. Spec allows it to be provided or not as an -implementation choice. -Kernel assumes it isn't provide. Qemu assumes it is. - -The temporary solution is to throw in a second root port on the HB and not -connect anything to it. Longer term I may special case this so that the -particular -decoder defaults to pass through settings in QEMU if there is only one root -port. - -> --device cxl-upstream,bus=root_port0,id=us0 \ -> --device cxl-downstream,port=0,bus=us0,id=swport0,chassis=0,slot=4 \ -> --device -> -cxl-type3,bus=swport0,memdev=cxl-mem0,lsa=cxl-lsa0,id=cxl-pmem0 \ -> --device cxl-downstream,port=1,bus=us0,id=swport1,chassis=0,slot=5 \ -> --device -> -cxl-type3,bus=swport1,memdev=cxl-mem1,lsa=cxl-lsa1,id=cxl-pmem1 \ -> --device cxl-downstream,port=2,bus=us0,id=swport2,chassis=0,slot=6 \ -> --device -> -cxl-type3,bus=swport2,memdev=cxl-mem2,lsa=cxl-lsa2,id=cxl-pmem2 \ -> --device cxl-downstream,port=3,bus=us0,id=swport3,chassis=0,slot=7 \ -> --device -> -cxl-type3,bus=swport3,memdev=cxl-mem3,lsa=cxl-lsa3,id=cxl-pmem3 \ -> --M -> -cxl-fmw.0.targets.0=cxl.1,cxl-fmw.0.size=4G,cxl-fmw.0.interleave-granularity=4k -> -> -And I have got two problems. -> -1. When I want to create x1 region with command: "cxl create-region -d -> -decoder0.0 -w 1 -g 4096 mem0", kernel crashed with null pointer -> -reference. Crash log: -> -> -[ 534.697324] cxl_region region0: config state: 0 -> -[ 534.697346] cxl_region region0: probe: -6 -Seems odd this is up here. But maybe fine. 
- -> -[ 534.697368] cxl_acpi ACPI0017:00: decoder0.0: created region0 -> -[ 534.699115] cxl region0: mem0:endpoint3 decoder3.0 add: -> -mem0:decoder3.0 @ 0 next: none nr_eps: 1 nr_targets: 1 -> -[ 534.699149] cxl region0: 0000:0d:00.0:port2 decoder2.0 add: -> -mem0:decoder3.0 @ 0 next: mem0 nr_eps: 1 nr_targets: 1 -> -[ 534.699167] cxl region0: ACPI0016:00:port1 decoder1.0 add: -> -mem0:decoder3.0 @ 0 next: 0000:0d:00.0 nr_eps: 1 nr_targets: 1 -> -[ 534.699176] cxl region0: ACPI0016:00:port1 iw: 1 ig: 256 -> -[ 534.699182] cxl region0: ACPI0016:00:port1 target[0] = 0000:0c:00.0 -> -for mem0:decoder3.0 @ 0 -> -[ 534.699189] cxl region0: 0000:0d:00.0:port2 iw: 1 ig: 256 -> -[ 534.699193] cxl region0: 0000:0d:00.0:port2 target[0] = -> -0000:0e:00.0 for mem0:decoder3.0 @ 0 -> -[ 534.699405] Unable to handle kernel NULL pointer dereference at -> -virtual address 0000000000000000 -> -[ 534.701474] Mem abort info: -> -[ 534.701994] ESR = 0x0000000086000004 -> -[ 534.702653] EC = 0x21: IABT (current EL), IL = 32 bits -> -[ 534.703616] SET = 0, FnV = 0 -> -[ 534.704174] EA = 0, S1PTW = 0 -> -[ 534.704803] FSC = 0x04: level 0 translation fault -> -[ 534.705694] user pgtable: 4k pages, 48-bit VAs, pgdp=000000010144a000 -> -[ 534.706875] [0000000000000000] pgd=0000000000000000, p4d=0000000000000000 -> -[ 534.709855] Internal error: Oops: 86000004 [#1] PREEMPT SMP -> -[ 534.710301] Modules linked in: -> -[ 534.710546] CPU: 7 PID: 331 Comm: cxl Not tainted -> -5.19.0-rc3-00064-g65fc1c3d26b9-dirty #11 -> -[ 534.715393] Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 -> -[ 534.717179] pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) -> -[ 534.719190] pc : 0x0 -> -[ 534.719928] lr : commit_store+0x118/0x2cc -> -[ 534.721007] sp : ffff80000aec3c30 -> -[ 534.721793] x29: ffff80000aec3c30 x28: ffff0000da62e740 x27: -> -ffff0000c0c06b30 -> -[ 534.723875] x26: 0000000000000000 x25: ffff0000c0a2a400 x24: -> -ffff0000c0a29400 -> -[ 534.725440] x23: 0000000000000003 x22: 0000000000000000 x21: -> -ffff0000c0c06800 -> -[ 534.727312] x20: 0000000000000000 x19: ffff0000c1559800 x18: -> -0000000000000000 -> -[ 534.729138] x17: 0000000000000000 x16: 0000000000000000 x15: -> -0000ffffd41fe838 -> -[ 534.731046] x14: 0000000000000000 x13: 0000000000000000 x12: -> -0000000000000000 -> -[ 534.732402] x11: 0000000000000000 x10: 0000000000000000 x9 : -> -0000000000000000 -> -[ 534.734432] x8 : 0000000000000000 x7 : 0000000000000000 x6 : -> -ffff0000c0906e80 -> -[ 534.735921] x5 : 0000000000000000 x4 : 0000000000000000 x3 : -> -ffff80000aec3bf0 -> -[ 534.737437] x2 : 0000000000000000 x1 : 0000000000000000 x0 : -> -ffff0000c155a000 -> -[ 534.738878] Call trace: -> -[ 534.739368] 0x0 -> -[ 534.739713] dev_attr_store+0x1c/0x30 -> -[ 534.740186] sysfs_kf_write+0x48/0x58 -> -[ 534.740961] kernfs_fop_write_iter+0x128/0x184 -> -[ 534.741872] new_sync_write+0xdc/0x158 -> -[ 534.742706] vfs_write+0x1ac/0x2a8 -> -[ 534.743440] ksys_write+0x68/0xf0 -> -[ 534.744328] __arm64_sys_write+0x1c/0x28 -> -[ 534.745180] invoke_syscall+0x44/0xf0 -> -[ 534.745989] el0_svc_common+0x4c/0xfc -> -[ 534.746661] do_el0_svc+0x60/0xa8 -> -[ 534.747378] el0_svc+0x2c/0x78 -> -[ 534.748066] el0t_64_sync_handler+0xb8/0x12c -> -[ 534.748919] el0t_64_sync+0x18c/0x190 -> -[ 534.749629] Code: bad PC value -> -[ 534.750169] ---[ end trace 0000000000000000 ]--- -> -> -2. When I want to create x4 region with command: "cxl create-region -d -> -decoder0.0 -w 4 -g 4096 -m mem0 mem1 mem2 mem3". 
I got below errors: -> -> -cxl region: create_region: region0: failed to set target3 to mem3 -> -cxl region: cmd_create_region: created 0 regions -> -> -And kernel log as below: -> -[ 60.536663] cxl_region region0: config state: 0 -> -[ 60.536675] cxl_region region0: probe: -6 -> -[ 60.536696] cxl_acpi ACPI0017:00: decoder0.0: created region0 -> -[ 60.538251] cxl region0: mem0:endpoint3 decoder3.0 add: -> -mem0:decoder3.0 @ 0 next: none nr_eps: 1 nr_targets: 1 -> -[ 60.538278] cxl region0: 0000:0d:00.0:port2 decoder2.0 add: -> -mem0:decoder3.0 @ 0 next: mem0 nr_eps: 1 nr_targets: 1 -> -[ 60.538295] cxl region0: ACPI0016:00:port1 decoder1.0 add: -> -mem0:decoder3.0 @ 0 next: 0000:0d:00.0 nr_eps: 1 nr_targets: 1 -> -[ 60.538647] cxl region0: mem1:endpoint4 decoder4.0 add: -> -mem1:decoder4.0 @ 1 next: none nr_eps: 1 nr_targets: 1 -> -[ 60.538663] cxl region0: 0000:0d:00.0:port2 decoder2.0 add: -> -mem1:decoder4.0 @ 1 next: mem1 nr_eps: 2 nr_targets: 2 -> -[ 60.538675] cxl region0: ACPI0016:00:port1 decoder1.0 add: -> -mem1:decoder4.0 @ 1 next: 0000:0d:00.0 nr_eps: 2 nr_targets: 1 -> -[ 60.539311] cxl region0: mem2:endpoint5 decoder5.0 add: -> -mem2:decoder5.0 @ 2 next: none nr_eps: 1 nr_targets: 1 -> -[ 60.539332] cxl region0: 0000:0d:00.0:port2 decoder2.0 add: -> -mem2:decoder5.0 @ 2 next: mem2 nr_eps: 3 nr_targets: 3 -> -[ 60.539343] cxl region0: ACPI0016:00:port1 decoder1.0 add: -> -mem2:decoder5.0 @ 2 next: 0000:0d:00.0 nr_eps: 3 nr_targets: 1 -> -[ 60.539711] cxl region0: mem3:endpoint6 decoder6.0 add: -> -mem3:decoder6.0 @ 3 next: none nr_eps: 1 nr_targets: 1 -> -[ 60.539723] cxl region0: 0000:0d:00.0:port2 decoder2.0 add: -> -mem3:decoder6.0 @ 3 next: mem3 nr_eps: 4 nr_targets: 4 -> -[ 60.539735] cxl region0: ACPI0016:00:port1 decoder1.0 add: -> -mem3:decoder6.0 @ 3 next: 0000:0d:00.0 nr_eps: 4 nr_targets: 1 -> -[ 60.539742] cxl region0: ACPI0016:00:port1 iw: 1 ig: 256 -> -[ 60.539747] cxl region0: ACPI0016:00:port1 target[0] = 0000:0c:00.0 -> -for mem0:decoder3.0 @ 0 -> -[ 60.539754] cxl region0: 0000:0d:00.0:port2 iw: 4 ig: 512 -This looks like off by 1 that should be fixed in the below mentioned -cxl/pending branch. That ig should be 256. Note the fix was -for a test case with a fat HB and no switch, but certainly looks -like this is the same issue. - -> -[ 60.539758] cxl region0: 0000:0d:00.0:port2 target[0] = -> -0000:0e:00.0 for mem0:decoder3.0 @ 0 -> -[ 60.539764] cxl region0: ACPI0016:00:port1: cannot host mem1:decoder4.0 at -> -1 -> -> -I have tried to write sysfs node manually, got same errors. -When stepping through by hand, which sysfs write triggers the crash above? - -Not sure it's related, but I've just sent out a fix to the -target register handling in QEMU. -20220808122051.14822-1-Jonathan.Cameron@huawei.com -/T/#m47ff985412ce44559e6b04d677c302f8cd371330">https://lore.kernel.org/linux-cxl/ -20220808122051.14822-1-Jonathan.Cameron@huawei.com -/T/#m47ff985412ce44559e6b04d677c302f8cd371330 -I did have one instance last week of triggering what looked to be a race -condition but -the stack trace doesn't looks related to what you've hit. - -It will probably be a few days before I have time to take a look at replicating -what you have seen. - -If you have time, try using the kernel.org cxl/pending branch as there are -a few additional fixes on there since you sent this email. Optimistic to hope -this is covered by one of those, but at least it will mean we are trying to -replicate -on same branch. - -Jonathan - - -> -> -Hope I can get some helps here. 
-> -> -Bob - -Hi Jonathan - -Thanks for your reply! - -On Mon, Aug 8, 2022 at 8:37 PM Jonathan Cameron - wrote: -> -> -Probably not related to your problem, but there is a disconnect in QEMU / -> -kernel assumptionsaround the presence of an HDM decoder when a HB only -> -has a single root port. Spec allows it to be provided or not as an -> -implementation choice. -> -Kernel assumes it isn't provide. Qemu assumes it is. -> -> -The temporary solution is to throw in a second root port on the HB and not -> -connect anything to it. Longer term I may special case this so that the -> -particular -> -decoder defaults to pass through settings in QEMU if there is only one root -> -port. -> -You are right! After adding an extra HB in qemu, I can create a x1 -region successfully. -But have some errors in Nvdimm: - -[ 74.925838] Unknown online node for memory at 0x10000000000, assuming node 0 -[ 74.925846] Unknown target node for memory at 0x10000000000, assuming node 0 -[ 74.927470] nd_region region0: nmem0: is disabled, failing probe - -And x4 region still failed with same errors, using latest cxl/preview -branch don't work. -I have picked "Two CXL emulation fixes" patches in qemu, still not working. - -Bob - -On Tue, 9 Aug 2022 21:07:06 +0800 -Bobo WL wrote: - -> -Hi Jonathan -> -> -Thanks for your reply! -> -> -On Mon, Aug 8, 2022 at 8:37 PM Jonathan Cameron -> - wrote: -> -> -> -> Probably not related to your problem, but there is a disconnect in QEMU / -> -> kernel assumptionsaround the presence of an HDM decoder when a HB only -> -> has a single root port. Spec allows it to be provided or not as an -> -> implementation choice. -> -> Kernel assumes it isn't provide. Qemu assumes it is. -> -> -> -> The temporary solution is to throw in a second root port on the HB and not -> -> connect anything to it. Longer term I may special case this so that the -> -> particular -> -> decoder defaults to pass through settings in QEMU if there is only one root -> -> port. -> -> -> -> -You are right! After adding an extra HB in qemu, I can create a x1 -> -region successfully. -> -But have some errors in Nvdimm: -> -> -[ 74.925838] Unknown online node for memory at 0x10000000000, assuming node > 0 -> -[ 74.925846] Unknown target node for memory at 0x10000000000, assuming node > 0 -> -[ 74.927470] nd_region region0: nmem0: is disabled, failing probe -Ah. I've seen this one, but not chased it down yet. Was on my todo list to -chase -down. Once I reach this state I can verify the HDM Decode is correct which is -what -I've been using to test (Which wasn't true until earlier this week). -I'm currently testing via devmem, more for historical reasons than because it -makes -that much sense anymore. - -> -> -And x4 region still failed with same errors, using latest cxl/preview -> -branch don't work. -> -I have picked "Two CXL emulation fixes" patches in qemu, still not working. -> -> -Bob - -On Tue, 9 Aug 2022 17:08:25 +0100 -Jonathan Cameron wrote: - -> -On Tue, 9 Aug 2022 21:07:06 +0800 -> -Bobo WL wrote: -> -> -> Hi Jonathan -> -> -> -> Thanks for your reply! -> -> -> -> On Mon, Aug 8, 2022 at 8:37 PM Jonathan Cameron -> -> wrote: -> -> > -> -> > Probably not related to your problem, but there is a disconnect in QEMU / -> -> > kernel assumptionsaround the presence of an HDM decoder when a HB only -> -> > has a single root port. Spec allows it to be provided or not as an -> -> > implementation choice. -> -> > Kernel assumes it isn't provide. Qemu assumes it is. 
-> -> > -> -> > The temporary solution is to throw in a second root port on the HB and not -> -> > connect anything to it. Longer term I may special case this so that the -> -> > particular -> -> > decoder defaults to pass through settings in QEMU if there is only one -> -> > root port. -> -> > -> -> -> -> You are right! After adding an extra HB in qemu, I can create a x1 -> -> region successfully. -> -> But have some errors in Nvdimm: -> -> -> -> [ 74.925838] Unknown online node for memory at 0x10000000000, assuming -> -> node 0 -> -> [ 74.925846] Unknown target node for memory at 0x10000000000, assuming -> -> node 0 -> -> [ 74.927470] nd_region region0: nmem0: is disabled, failing probe -> -> -Ah. I've seen this one, but not chased it down yet. Was on my todo list to -> -chase -> -down. Once I reach this state I can verify the HDM Decode is correct which is -> -what -> -I've been using to test (Which wasn't true until earlier this week). -> -I'm currently testing via devmem, more for historical reasons than because it -> -makes -> -that much sense anymore. -*embarassed cough*. We haven't fully hooked the LSA up in qemu yet. -I'd forgotten that was still on the todo list. I don't think it will -be particularly hard to do and will take a look in next few days. - -Very very indirectly this error is causing a driver probe fail that means that -we hit a code path that has a rather odd looking check on NDD_LABELING. -Should not have gotten near that path though - hence the problem is actually -when we call cxl_pmem_get_config_data() and it returns an error because -we haven't fully connected up the command in QEMU. - -Jonathan - - -> -> -> -> -> And x4 region still failed with same errors, using latest cxl/preview -> -> branch don't work. -> -> I have picked "Two CXL emulation fixes" patches in qemu, still not working. -> -> -> -> Bob - -On Thu, 11 Aug 2022 18:08:57 +0100 -Jonathan Cameron via wrote: - -> -On Tue, 9 Aug 2022 17:08:25 +0100 -> -Jonathan Cameron wrote: -> -> -> On Tue, 9 Aug 2022 21:07:06 +0800 -> -> Bobo WL wrote: -> -> -> -> > Hi Jonathan -> -> > -> -> > Thanks for your reply! -> -> > -> -> > On Mon, Aug 8, 2022 at 8:37 PM Jonathan Cameron -> -> > wrote: -> -> > > -> -> > > Probably not related to your problem, but there is a disconnect in QEMU -> -> > > / -> -> > > kernel assumptionsaround the presence of an HDM decoder when a HB only -> -> > > has a single root port. Spec allows it to be provided or not as an -> -> > > implementation choice. -> -> > > Kernel assumes it isn't provide. Qemu assumes it is. -> -> > > -> -> > > The temporary solution is to throw in a second root port on the HB and -> -> > > not -> -> > > connect anything to it. Longer term I may special case this so that -> -> > > the particular -> -> > > decoder defaults to pass through settings in QEMU if there is only one -> -> > > root port. -> -> > > -> -> > -> -> > You are right! After adding an extra HB in qemu, I can create a x1 -> -> > region successfully. -> -> > But have some errors in Nvdimm: -> -> > -> -> > [ 74.925838] Unknown online node for memory at 0x10000000000, assuming -> -> > node 0 -> -> > [ 74.925846] Unknown target node for memory at 0x10000000000, assuming -> -> > node 0 -> -> > [ 74.927470] nd_region region0: nmem0: is disabled, failing probe -> -> -> -> Ah. I've seen this one, but not chased it down yet. Was on my todo list to -> -> chase -> -> down. 
Once I reach this state I can verify the HDM Decode is correct which -> -> is what -> -> I've been using to test (Which wasn't true until earlier this week). -> -> I'm currently testing via devmem, more for historical reasons than because -> -> it makes -> -> that much sense anymore. -> -> -*embarassed cough*. We haven't fully hooked the LSA up in qemu yet. -> -I'd forgotten that was still on the todo list. I don't think it will -> -be particularly hard to do and will take a look in next few days. -> -> -Very very indirectly this error is causing a driver probe fail that means that -> -we hit a code path that has a rather odd looking check on NDD_LABELING. -> -Should not have gotten near that path though - hence the problem is actually -> -when we call cxl_pmem_get_config_data() and it returns an error because -> -we haven't fully connected up the command in QEMU. -So a least one bug in QEMU. We were not supporting variable length payloads on -mailbox -inputs (but were on outputs). That hasn't mattered until we get to LSA writes. -We just need to relax condition on the supplied length. - -diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c -index c352a935c4..fdda9529fe 100644 ---- a/hw/cxl/cxl-mailbox-utils.c -+++ b/hw/cxl/cxl-mailbox-utils.c -@@ -510,7 +510,7 @@ void cxl_process_mailbox(CXLDeviceState *cxl_dstate) - cxl_cmd = &cxl_cmd_set[set][cmd]; - h = cxl_cmd->handler; - if (h) { -- if (len == cxl_cmd->in) { -+ if (len == cxl_cmd->in || !cxl_cmd->in) { - cxl_cmd->payload = cxl_dstate->mbox_reg_state + - A_CXL_DEV_CMD_PAYLOAD; - ret = (*h)(cxl_cmd, cxl_dstate, &len); - - -This lets the nvdimm/region probe fine, but I'm getting some issues with -namespace capacity so I'll look at what is causing that next. -Unfortunately I'm not that familiar with the driver/nvdimm side of things -so it's take a while to figure out what kicks off what! - -Jonathan - -> -> -Jonathan -> -> -> -> -> -> > -> -> > And x4 region still failed with same errors, using latest cxl/preview -> -> > branch don't work. -> -> > I have picked "Two CXL emulation fixes" patches in qemu, still not -> -> > working. -> -> > -> -> > Bob -> -> - -Jonathan Cameron wrote: -> -On Thu, 11 Aug 2022 18:08:57 +0100 -> -Jonathan Cameron via wrote: -> -> -> On Tue, 9 Aug 2022 17:08:25 +0100 -> -> Jonathan Cameron wrote: -> -> -> -> > On Tue, 9 Aug 2022 21:07:06 +0800 -> -> > Bobo WL wrote: -> -> > -> -> > > Hi Jonathan -> -> > > -> -> > > Thanks for your reply! -> -> > > -> -> > > On Mon, Aug 8, 2022 at 8:37 PM Jonathan Cameron -> -> > > wrote: -> -> > > > -> -> > > > Probably not related to your problem, but there is a disconnect in -> -> > > > QEMU / -> -> > > > kernel assumptionsaround the presence of an HDM decoder when a HB only -> -> > > > has a single root port. Spec allows it to be provided or not as an -> -> > > > implementation choice. -> -> > > > Kernel assumes it isn't provide. Qemu assumes it is. -> -> > > > -> -> > > > The temporary solution is to throw in a second root port on the HB -> -> > > > and not -> -> > > > connect anything to it. Longer term I may special case this so that -> -> > > > the particular -> -> > > > decoder defaults to pass through settings in QEMU if there is only -> -> > > > one root port. -> -> > > > -> -> > > -> -> > > You are right! After adding an extra HB in qemu, I can create a x1 -> -> > > region successfully. 
-> -> > > But have some errors in Nvdimm: -> -> > > -> -> > > [ 74.925838] Unknown online node for memory at 0x10000000000, -> -> > > assuming node 0 -> -> > > [ 74.925846] Unknown target node for memory at 0x10000000000, -> -> > > assuming node 0 -> -> > > [ 74.927470] nd_region region0: nmem0: is disabled, failing probe -> -> > -> -> > Ah. I've seen this one, but not chased it down yet. Was on my todo list -> -> > to chase -> -> > down. Once I reach this state I can verify the HDM Decode is correct -> -> > which is what -> -> > I've been using to test (Which wasn't true until earlier this week). -> -> > I'm currently testing via devmem, more for historical reasons than -> -> > because it makes -> -> > that much sense anymore. -> -> -> -> *embarassed cough*. We haven't fully hooked the LSA up in qemu yet. -> -> I'd forgotten that was still on the todo list. I don't think it will -> -> be particularly hard to do and will take a look in next few days. -> -> -> -> Very very indirectly this error is causing a driver probe fail that means -> -> that -> -> we hit a code path that has a rather odd looking check on NDD_LABELING. -> -> Should not have gotten near that path though - hence the problem is actually -> -> when we call cxl_pmem_get_config_data() and it returns an error because -> -> we haven't fully connected up the command in QEMU. -> -> -So a least one bug in QEMU. We were not supporting variable length payloads -> -on mailbox -> -inputs (but were on outputs). That hasn't mattered until we get to LSA -> -writes. -> -We just need to relax condition on the supplied length. -> -> -diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c -> -index c352a935c4..fdda9529fe 100644 -> ---- a/hw/cxl/cxl-mailbox-utils.c -> -+++ b/hw/cxl/cxl-mailbox-utils.c -> -@@ -510,7 +510,7 @@ void cxl_process_mailbox(CXLDeviceState *cxl_dstate) -> -cxl_cmd = &cxl_cmd_set[set][cmd]; -> -h = cxl_cmd->handler; -> -if (h) { -> -- if (len == cxl_cmd->in) { -> -+ if (len == cxl_cmd->in || !cxl_cmd->in) { -> -cxl_cmd->payload = cxl_dstate->mbox_reg_state + -> -A_CXL_DEV_CMD_PAYLOAD; -> -ret = (*h)(cxl_cmd, cxl_dstate, &len); -> -> -> -This lets the nvdimm/region probe fine, but I'm getting some issues with -> -namespace capacity so I'll look at what is causing that next. -> -Unfortunately I'm not that familiar with the driver/nvdimm side of things -> -so it's take a while to figure out what kicks off what! -The whirlwind tour is that 'struct nd_region' instances that represent a -persitent memory address range are composed of one more mappings of -'struct nvdimm' objects. The nvdimm object is driven by the dimm driver -in drivers/nvdimm/dimm.c. That driver is mainly charged with unlocking -the dimm (if locked) and interrogating the label area to look for -namespace labels. - -The label command calls are routed to the '->ndctl()' callback that was -registered when the CXL nvdimm_bus_descriptor was created. That callback -handles both 'bus' scope calls, currently none for CXL, and per nvdimm -calls. cxl_pmem_nvdimm_ctl() translates those generic LIBNVDIMM commands -to CXL commands. - -The 'struct nvdimm' objects that the CXL side registers have the -NDD_LABELING flag set which means that namespaces need to be explicitly -created / provisioned from region capacity. Otherwise, if -drivers/nvdimm/dimm.c does not find a namespace-label-index block then -the region reverts to label-less mode and a default namespace equal to -the size of the region is instantiated. 
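-As a rough sketch of the callback shape described above (purely illustrative,
-assuming the usual libnvdimm ->ndctl() prototype; this is not the actual
-cxl_pmem_nvdimm_ctl() implementation), the per-nvdimm label commands are the
-only thing such a driver has to translate into device mailbox commands:
-
-#include <linux/libnvdimm.h>
-#include <linux/ndctl.h>
-
-static int example_ndctl(struct nvdimm_bus_descriptor *nd_desc,
-                         struct nvdimm *nvdimm, unsigned int cmd,
-                         void *buf, unsigned int buf_len, int *cmd_rc)
-{
-        if (!nvdimm)
-                return -ENOTTY;       /* no bus-scope commands in this sketch */
-
-        switch (cmd) {
-        case ND_CMD_GET_CONFIG_SIZE:  /* report LSA size / max transfer */
-        case ND_CMD_GET_CONFIG_DATA:  /* would become a CXL Get LSA */
-        case ND_CMD_SET_CONFIG_DATA:  /* would become a CXL Set LSA */
-                return -ENXIO;        /* actual translation elided here */
-        default:
-                return -ENOTTY;
-        }
-}
-
-With NDD_LABELING set, namespaces then have to be provisioned explicitly via
-ndctl, which is exactly the create-namespace path being exercised below.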
- -If you are seeing small mismatches in namespace capacity then it may -just be the fact that by default 'ndctl create-namespace' results in an -'fsdax' mode namespace which just means that it is a block device where -1.5% of the capacity is reserved for 'struct page' metadata. You should -be able to see namespace capacity == region capacity by doing "ndctl -create-namespace -m raw", and disable DAX operation. - -Hope that helps. - -On Fri, 12 Aug 2022 09:03:02 -0700 -Dan Williams wrote: - -> -Jonathan Cameron wrote: -> -> On Thu, 11 Aug 2022 18:08:57 +0100 -> -> Jonathan Cameron via wrote: -> -> -> -> > On Tue, 9 Aug 2022 17:08:25 +0100 -> -> > Jonathan Cameron wrote: -> -> > -> -> > > On Tue, 9 Aug 2022 21:07:06 +0800 -> -> > > Bobo WL wrote: -> -> > > -> -> > > > Hi Jonathan -> -> > > > -> -> > > > Thanks for your reply! -> -> > > > -> -> > > > On Mon, Aug 8, 2022 at 8:37 PM Jonathan Cameron -> -> > > > wrote: -> -> > > > > -> -> > > > > Probably not related to your problem, but there is a disconnect in -> -> > > > > QEMU / -> -> > > > > kernel assumptionsaround the presence of an HDM decoder when a HB -> -> > > > > only -> -> > > > > has a single root port. Spec allows it to be provided or not as an -> -> > > > > implementation choice. -> -> > > > > Kernel assumes it isn't provide. Qemu assumes it is. -> -> > > > > -> -> > > > > The temporary solution is to throw in a second root port on the HB -> -> > > > > and not -> -> > > > > connect anything to it. Longer term I may special case this so -> -> > > > > that the particular -> -> > > > > decoder defaults to pass through settings in QEMU if there is only -> -> > > > > one root port. -> -> > > > > -> -> > > > -> -> > > > You are right! After adding an extra HB in qemu, I can create a x1 -> -> > > > region successfully. -> -> > > > But have some errors in Nvdimm: -> -> > > > -> -> > > > [ 74.925838] Unknown online node for memory at 0x10000000000, -> -> > > > assuming node 0 -> -> > > > [ 74.925846] Unknown target node for memory at 0x10000000000, -> -> > > > assuming node 0 -> -> > > > [ 74.927470] nd_region region0: nmem0: is disabled, failing probe -> -> > > > -> -> > > -> -> > > Ah. I've seen this one, but not chased it down yet. Was on my todo -> -> > > list to chase -> -> > > down. Once I reach this state I can verify the HDM Decode is correct -> -> > > which is what -> -> > > I've been using to test (Which wasn't true until earlier this week). -> -> > > I'm currently testing via devmem, more for historical reasons than -> -> > > because it makes -> -> > > that much sense anymore. -> -> > -> -> > *embarassed cough*. We haven't fully hooked the LSA up in qemu yet. -> -> > I'd forgotten that was still on the todo list. I don't think it will -> -> > be particularly hard to do and will take a look in next few days. -> -> > -> -> > Very very indirectly this error is causing a driver probe fail that means -> -> > that -> -> > we hit a code path that has a rather odd looking check on NDD_LABELING. -> -> > Should not have gotten near that path though - hence the problem is -> -> > actually -> -> > when we call cxl_pmem_get_config_data() and it returns an error because -> -> > we haven't fully connected up the command in QEMU. -> -> -> -> So a least one bug in QEMU. We were not supporting variable length payloads -> -> on mailbox -> -> inputs (but were on outputs). That hasn't mattered until we get to LSA -> -> writes. -> -> We just need to relax condition on the supplied length. 
-> -> -> -> diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c -> -> index c352a935c4..fdda9529fe 100644 -> -> --- a/hw/cxl/cxl-mailbox-utils.c -> -> +++ b/hw/cxl/cxl-mailbox-utils.c -> -> @@ -510,7 +510,7 @@ void cxl_process_mailbox(CXLDeviceState *cxl_dstate) -> -> cxl_cmd = &cxl_cmd_set[set][cmd]; -> -> h = cxl_cmd->handler; -> -> if (h) { -> -> - if (len == cxl_cmd->in) { -> -> + if (len == cxl_cmd->in || !cxl_cmd->in) { -> -> cxl_cmd->payload = cxl_dstate->mbox_reg_state + -> -> A_CXL_DEV_CMD_PAYLOAD; -> -> ret = (*h)(cxl_cmd, cxl_dstate, &len); -> -> -> -> -> -> This lets the nvdimm/region probe fine, but I'm getting some issues with -> -> namespace capacity so I'll look at what is causing that next. -> -> Unfortunately I'm not that familiar with the driver/nvdimm side of things -> -> so it's take a while to figure out what kicks off what! -> -> -The whirlwind tour is that 'struct nd_region' instances that represent a -> -persitent memory address range are composed of one more mappings of -> -'struct nvdimm' objects. The nvdimm object is driven by the dimm driver -> -in drivers/nvdimm/dimm.c. That driver is mainly charged with unlocking -> -the dimm (if locked) and interrogating the label area to look for -> -namespace labels. -> -> -The label command calls are routed to the '->ndctl()' callback that was -> -registered when the CXL nvdimm_bus_descriptor was created. That callback -> -handles both 'bus' scope calls, currently none for CXL, and per nvdimm -> -calls. cxl_pmem_nvdimm_ctl() translates those generic LIBNVDIMM commands -> -to CXL commands. -> -> -The 'struct nvdimm' objects that the CXL side registers have the -> -NDD_LABELING flag set which means that namespaces need to be explicitly -> -created / provisioned from region capacity. Otherwise, if -> -drivers/nvdimm/dimm.c does not find a namespace-label-index block then -> -the region reverts to label-less mode and a default namespace equal to -> -the size of the region is instantiated. -> -> -If you are seeing small mismatches in namespace capacity then it may -> -just be the fact that by default 'ndctl create-namespace' results in an -> -'fsdax' mode namespace which just means that it is a block device where -> -1.5% of the capacity is reserved for 'struct page' metadata. You should -> -be able to see namespace capacity == region capacity by doing "ndctl -> -create-namespace -m raw", and disable DAX operation. -Currently ndctl create-namespace crashes qemu ;) -Which isn't ideal! - -> -> -Hope that helps. -Got me looking at the right code. Thanks! - -Jonathan - -On Fri, 12 Aug 2022 17:15:09 +0100 -Jonathan Cameron wrote: - -> -On Fri, 12 Aug 2022 09:03:02 -0700 -> -Dan Williams wrote: -> -> -> Jonathan Cameron wrote: -> -> > On Thu, 11 Aug 2022 18:08:57 +0100 -> -> > Jonathan Cameron via wrote: -> -> > -> -> > > On Tue, 9 Aug 2022 17:08:25 +0100 -> -> > > Jonathan Cameron wrote: -> -> > > -> -> > > > On Tue, 9 Aug 2022 21:07:06 +0800 -> -> > > > Bobo WL wrote: -> -> > > > -> -> > > > > Hi Jonathan -> -> > > > > -> -> > > > > Thanks for your reply! -> -> > > > > -> -> > > > > On Mon, Aug 8, 2022 at 8:37 PM Jonathan Cameron -> -> > > > > wrote: -> -> > > > > > -> -> > > > > > Probably not related to your problem, but there is a disconnect -> -> > > > > > in QEMU / -> -> > > > > > kernel assumptionsaround the presence of an HDM decoder when a HB -> -> > > > > > only -> -> > > > > > has a single root port. Spec allows it to be provided or not as -> -> > > > > > an implementation choice. 
-> -> > > > > > Kernel assumes it isn't provide. Qemu assumes it is. -> -> > > > > > -> -> > > > > > The temporary solution is to throw in a second root port on the -> -> > > > > > HB and not -> -> > > > > > connect anything to it. Longer term I may special case this so -> -> > > > > > that the particular -> -> > > > > > decoder defaults to pass through settings in QEMU if there is -> -> > > > > > only one root port. -> -> > > > > > -> -> > > > > -> -> > > > > You are right! After adding an extra HB in qemu, I can create a x1 -> -> > > > > region successfully. -> -> > > > > But have some errors in Nvdimm: -> -> > > > > -> -> > > > > [ 74.925838] Unknown online node for memory at 0x10000000000, -> -> > > > > assuming node 0 -> -> > > > > [ 74.925846] Unknown target node for memory at 0x10000000000, -> -> > > > > assuming node 0 -> -> > > > > [ 74.927470] nd_region region0: nmem0: is disabled, failing probe -> -> > > > > -> -> > > > -> -> > > > Ah. I've seen this one, but not chased it down yet. Was on my todo -> -> > > > list to chase -> -> > > > down. Once I reach this state I can verify the HDM Decode is correct -> -> > > > which is what -> -> > > > I've been using to test (Which wasn't true until earlier this week). -> -> > > > I'm currently testing via devmem, more for historical reasons than -> -> > > > because it makes -> -> > > > that much sense anymore. -> -> > > -> -> > > *embarassed cough*. We haven't fully hooked the LSA up in qemu yet. -> -> > > I'd forgotten that was still on the todo list. I don't think it will -> -> > > be particularly hard to do and will take a look in next few days. -> -> > > -> -> > > Very very indirectly this error is causing a driver probe fail that -> -> > > means that -> -> > > we hit a code path that has a rather odd looking check on NDD_LABELING. -> -> > > Should not have gotten near that path though - hence the problem is -> -> > > actually -> -> > > when we call cxl_pmem_get_config_data() and it returns an error because -> -> > > we haven't fully connected up the command in QEMU. -> -> > -> -> > So a least one bug in QEMU. We were not supporting variable length -> -> > payloads on mailbox -> -> > inputs (but were on outputs). That hasn't mattered until we get to LSA -> -> > writes. -> -> > We just need to relax condition on the supplied length. -> -> > -> -> > diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c -> -> > index c352a935c4..fdda9529fe 100644 -> -> > --- a/hw/cxl/cxl-mailbox-utils.c -> -> > +++ b/hw/cxl/cxl-mailbox-utils.c -> -> > @@ -510,7 +510,7 @@ void cxl_process_mailbox(CXLDeviceState *cxl_dstate) -> -> > cxl_cmd = &cxl_cmd_set[set][cmd]; -> -> > h = cxl_cmd->handler; -> -> > if (h) { -> -> > - if (len == cxl_cmd->in) { -> -> > + if (len == cxl_cmd->in || !cxl_cmd->in) { -> -> > cxl_cmd->payload = cxl_dstate->mbox_reg_state + -> -> > A_CXL_DEV_CMD_PAYLOAD; -> -> > ret = (*h)(cxl_cmd, cxl_dstate, &len); -> -> > -> -> > -> -> > This lets the nvdimm/region probe fine, but I'm getting some issues with -> -> > namespace capacity so I'll look at what is causing that next. -> -> > Unfortunately I'm not that familiar with the driver/nvdimm side of things -> -> > so it's take a while to figure out what kicks off what! -> -> -> -> The whirlwind tour is that 'struct nd_region' instances that represent a -> -> persitent memory address range are composed of one more mappings of -> -> 'struct nvdimm' objects. The nvdimm object is driven by the dimm driver -> -> in drivers/nvdimm/dimm.c. 
That driver is mainly charged with unlocking -> -> the dimm (if locked) and interrogating the label area to look for -> -> namespace labels. -> -> -> -> The label command calls are routed to the '->ndctl()' callback that was -> -> registered when the CXL nvdimm_bus_descriptor was created. That callback -> -> handles both 'bus' scope calls, currently none for CXL, and per nvdimm -> -> calls. cxl_pmem_nvdimm_ctl() translates those generic LIBNVDIMM commands -> -> to CXL commands. -> -> -> -> The 'struct nvdimm' objects that the CXL side registers have the -> -> NDD_LABELING flag set which means that namespaces need to be explicitly -> -> created / provisioned from region capacity. Otherwise, if -> -> drivers/nvdimm/dimm.c does not find a namespace-label-index block then -> -> the region reverts to label-less mode and a default namespace equal to -> -> the size of the region is instantiated. -> -> -> -> If you are seeing small mismatches in namespace capacity then it may -> -> just be the fact that by default 'ndctl create-namespace' results in an -> -> 'fsdax' mode namespace which just means that it is a block device where -> -> 1.5% of the capacity is reserved for 'struct page' metadata. You should -> -> be able to see namespace capacity == region capacity by doing "ndctl -> -> create-namespace -m raw", and disable DAX operation. -> -> -Currently ndctl create-namespace crashes qemu ;) -> -Which isn't ideal! -> -Found a cause for this one. Mailbox payload may be as small as 256 bytes. -We have code in kernel sanity checking that output payload fits in the -mailbox, but nothing on the input payload. Symptom is that we write just -off the end whatever size the payload is. Note doing this shouldn't crash -qemu - so I need to fix a range check somewhere. - -I think this is because cxl_pmem_get_config_size() returns the mailbox -payload size as being the available LSA size, forgetting to remove the -size of the headers on the set_lsa side of things. -https://git.kernel.org/pub/scm/linux/kernel/git/cxl/cxl.git/tree/drivers/cxl/pmem.c?h=next#n110 -I've hacked the max_payload to be -8 - -Now we still don't succeed in creating the namespace, but bonus is it doesn't -crash any more. - - -Jonathan - - - -> -> -> -> Hope that helps. -> -Got me looking at the right code. Thanks! -> -> -Jonathan -> -> - -On Mon, 15 Aug 2022 15:18:09 +0100 -Jonathan Cameron via wrote: - -> -On Fri, 12 Aug 2022 17:15:09 +0100 -> -Jonathan Cameron wrote: -> -> -> On Fri, 12 Aug 2022 09:03:02 -0700 -> -> Dan Williams wrote: -> -> -> -> > Jonathan Cameron wrote: -> -> > > On Thu, 11 Aug 2022 18:08:57 +0100 -> -> > > Jonathan Cameron via wrote: -> -> > > -> -> > > > On Tue, 9 Aug 2022 17:08:25 +0100 -> -> > > > Jonathan Cameron wrote: -> -> > > > -> -> > > > > On Tue, 9 Aug 2022 21:07:06 +0800 -> -> > > > > Bobo WL wrote: -> -> > > > > -> -> > > > > > Hi Jonathan -> -> > > > > > -> -> > > > > > Thanks for your reply! -> -> > > > > > -> -> > > > > > On Mon, Aug 8, 2022 at 8:37 PM Jonathan Cameron -> -> > > > > > wrote: -> -> > > > > > > -> -> > > > > > > Probably not related to your problem, but there is a disconnect -> -> > > > > > > in QEMU / -> -> > > > > > > kernel assumptionsaround the presence of an HDM decoder when a -> -> > > > > > > HB only -> -> > > > > > > has a single root port. Spec allows it to be provided or not as -> -> > > > > > > an implementation choice. -> -> > > > > > > Kernel assumes it isn't provide. Qemu assumes it is. 
-> -> > > > > > > -> -> > > > > > > The temporary solution is to throw in a second root port on the -> -> > > > > > > HB and not -> -> > > > > > > connect anything to it. Longer term I may special case this so -> -> > > > > > > that the particular -> -> > > > > > > decoder defaults to pass through settings in QEMU if there is -> -> > > > > > > only one root port. -> -> > > > > > > -> -> > > > > > -> -> > > > > > You are right! After adding an extra HB in qemu, I can create a x1 -> -> > > > > > region successfully. -> -> > > > > > But have some errors in Nvdimm: -> -> > > > > > -> -> > > > > > [ 74.925838] Unknown online node for memory at 0x10000000000, -> -> > > > > > assuming node 0 -> -> > > > > > [ 74.925846] Unknown target node for memory at 0x10000000000, -> -> > > > > > assuming node 0 -> -> > > > > > [ 74.927470] nd_region region0: nmem0: is disabled, failing -> -> > > > > > probe -> -> > > > > -> -> > > > > Ah. I've seen this one, but not chased it down yet. Was on my todo -> -> > > > > list to chase -> -> > > > > down. Once I reach this state I can verify the HDM Decode is -> -> > > > > correct which is what -> -> > > > > I've been using to test (Which wasn't true until earlier this -> -> > > > > week). -> -> > > > > I'm currently testing via devmem, more for historical reasons than -> -> > > > > because it makes -> -> > > > > that much sense anymore. -> -> > > > -> -> > > > *embarassed cough*. We haven't fully hooked the LSA up in qemu yet. -> -> > > > I'd forgotten that was still on the todo list. I don't think it will -> -> > > > be particularly hard to do and will take a look in next few days. -> -> > > > -> -> > > > Very very indirectly this error is causing a driver probe fail that -> -> > > > means that -> -> > > > we hit a code path that has a rather odd looking check on -> -> > > > NDD_LABELING. -> -> > > > Should not have gotten near that path though - hence the problem is -> -> > > > actually -> -> > > > when we call cxl_pmem_get_config_data() and it returns an error -> -> > > > because -> -> > > > we haven't fully connected up the command in QEMU. -> -> > > -> -> > > So a least one bug in QEMU. We were not supporting variable length -> -> > > payloads on mailbox -> -> > > inputs (but were on outputs). That hasn't mattered until we get to LSA -> -> > > writes. -> -> > > We just need to relax condition on the supplied length. -> -> > > -> -> > > diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c -> -> > > index c352a935c4..fdda9529fe 100644 -> -> > > --- a/hw/cxl/cxl-mailbox-utils.c -> -> > > +++ b/hw/cxl/cxl-mailbox-utils.c -> -> > > @@ -510,7 +510,7 @@ void cxl_process_mailbox(CXLDeviceState *cxl_dstate) -> -> > > cxl_cmd = &cxl_cmd_set[set][cmd]; -> -> > > h = cxl_cmd->handler; -> -> > > if (h) { -> -> > > - if (len == cxl_cmd->in) { -> -> > > + if (len == cxl_cmd->in || !cxl_cmd->in) { -> -> > > cxl_cmd->payload = cxl_dstate->mbox_reg_state + -> -> > > A_CXL_DEV_CMD_PAYLOAD; -> -> > > ret = (*h)(cxl_cmd, cxl_dstate, &len); -> -> > > -> -> > > -> -> > > This lets the nvdimm/region probe fine, but I'm getting some issues with -> -> > > namespace capacity so I'll look at what is causing that next. -> -> > > Unfortunately I'm not that familiar with the driver/nvdimm side of -> -> > > things -> -> > > so it's take a while to figure out what kicks off what! -> -> > -> -> > The whirlwind tour is that 'struct nd_region' instances that represent a -> -> > persitent memory address range are composed of one more mappings of -> -> > 'struct nvdimm' objects. 
The nvdimm object is driven by the dimm driver -> -> > in drivers/nvdimm/dimm.c. That driver is mainly charged with unlocking -> -> > the dimm (if locked) and interrogating the label area to look for -> -> > namespace labels. -> -> > -> -> > The label command calls are routed to the '->ndctl()' callback that was -> -> > registered when the CXL nvdimm_bus_descriptor was created. That callback -> -> > handles both 'bus' scope calls, currently none for CXL, and per nvdimm -> -> > calls. cxl_pmem_nvdimm_ctl() translates those generic LIBNVDIMM commands -> -> > to CXL commands. -> -> > -> -> > The 'struct nvdimm' objects that the CXL side registers have the -> -> > NDD_LABELING flag set which means that namespaces need to be explicitly -> -> > created / provisioned from region capacity. Otherwise, if -> -> > drivers/nvdimm/dimm.c does not find a namespace-label-index block then -> -> > the region reverts to label-less mode and a default namespace equal to -> -> > the size of the region is instantiated. -> -> > -> -> > If you are seeing small mismatches in namespace capacity then it may -> -> > just be the fact that by default 'ndctl create-namespace' results in an -> -> > 'fsdax' mode namespace which just means that it is a block device where -> -> > 1.5% of the capacity is reserved for 'struct page' metadata. You should -> -> > be able to see namespace capacity == region capacity by doing "ndctl -> -> > create-namespace -m raw", and disable DAX operation. -> -> -> -> Currently ndctl create-namespace crashes qemu ;) -> -> Which isn't ideal! -> -> -> -> -Found a cause for this one. Mailbox payload may be as small as 256 bytes. -> -We have code in kernel sanity checking that output payload fits in the -> -mailbox, but nothing on the input payload. Symptom is that we write just -> -off the end whatever size the payload is. Note doing this shouldn't crash -> -qemu - so I need to fix a range check somewhere. -> -> -I think this is because cxl_pmem_get_config_size() returns the mailbox -> -payload size as being the available LSA size, forgetting to remove the -> -size of the headers on the set_lsa side of things. -> -https://git.kernel.org/pub/scm/linux/kernel/git/cxl/cxl.git/tree/drivers/cxl/pmem.c?h=next#n110 -> -> -I've hacked the max_payload to be -8 -> -> -Now we still don't succeed in creating the namespace, but bonus is it doesn't -> -crash any more. -In the interests of defensive / correct handling from QEMU I took a -look into why it was crashing. Turns out that providing a NULL write callback -for -the memory device region (that the above overlarge write was spilling into) -isn't -a safe thing to do. Needs a stub. Oops. - -On plus side we might never have noticed this was going wrong without the crash -*silver lining in every cloud* - -Fix to follow... - -Jonathan - - -> -> -> -Jonathan -> -> -> -> -> > -> -> > Hope that helps. -> -> Got me looking at the right code. Thanks! -> -> -> -> Jonathan -> -> -> -> -> -> - -On Mon, 15 Aug 2022 at 15:55, Jonathan Cameron via wrote: -> -In the interests of defensive / correct handling from QEMU I took a -> -look into why it was crashing. Turns out that providing a NULL write -> -callback for -> -the memory device region (that the above overlarge write was spilling into) -> -isn't -> -a safe thing to do. Needs a stub. Oops. -Yeah. We've talked before about adding an assert so that that kind of -"missing function" bug is caught at device creation rather than only -if the guest tries to access the device, but we never quite got around -to it... 
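-For reference, the kind of stub in question is tiny; a minimal sketch (names
-are illustrative and this is not necessarily the fix that ends up in QEMU) of
-a do-nothing write handler that could be wired into the MemoryRegionOps
-instead of leaving .write as NULL:
-
-#include "qemu/osdep.h"
-#include "qemu/log.h"
-#include "exec/memory.h"
-
-static void example_reg_write(void *opaque, hwaddr addr,
-                              uint64_t data, unsigned size)
-{
-    /* Guest writes landing here are unexpected; log them rather than
-     * jumping through a NULL ->write pointer. */
-    qemu_log_mask(LOG_GUEST_ERROR,
-                  "unexpected %u byte write of 0x%" PRIx64 " at 0x%" HWADDR_PRIx "\n",
-                  size, data, addr);
-}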
- --- PMM - -On Fri, 12 Aug 2022 16:44:03 +0100 -Jonathan Cameron wrote: - -> -On Thu, 11 Aug 2022 18:08:57 +0100 -> -Jonathan Cameron via wrote: -> -> -> On Tue, 9 Aug 2022 17:08:25 +0100 -> -> Jonathan Cameron wrote: -> -> -> -> > On Tue, 9 Aug 2022 21:07:06 +0800 -> -> > Bobo WL wrote: -> -> > -> -> > > Hi Jonathan -> -> > > -> -> > > Thanks for your reply! -> -> > > -> -> > > On Mon, Aug 8, 2022 at 8:37 PM Jonathan Cameron -> -> > > wrote: -> -> > > > -> -> > > > Probably not related to your problem, but there is a disconnect in -> -> > > > QEMU / -> -> > > > kernel assumptionsaround the presence of an HDM decoder when a HB only -> -> > > > has a single root port. Spec allows it to be provided or not as an -> -> > > > implementation choice. -> -> > > > Kernel assumes it isn't provide. Qemu assumes it is. -> -> > > > -> -> > > > The temporary solution is to throw in a second root port on the HB -> -> > > > and not -> -> > > > connect anything to it. Longer term I may special case this so that -> -> > > > the particular -> -> > > > decoder defaults to pass through settings in QEMU if there is only -> -> > > > one root port. -> -> > > > -> -> > > -> -> > > You are right! After adding an extra HB in qemu, I can create a x1 -> -> > > region successfully. -> -> > > But have some errors in Nvdimm: -> -> > > -> -> > > [ 74.925838] Unknown online node for memory at 0x10000000000, -> -> > > assuming node 0 -> -> > > [ 74.925846] Unknown target node for memory at 0x10000000000, -> -> > > assuming node 0 -> -> > > [ 74.927470] nd_region region0: nmem0: is disabled, failing probe -> -> > > -> -> > -> -> > Ah. I've seen this one, but not chased it down yet. Was on my todo list -> -> > to chase -> -> > down. Once I reach this state I can verify the HDM Decode is correct -> -> > which is what -> -> > I've been using to test (Which wasn't true until earlier this week). -> -> > I'm currently testing via devmem, more for historical reasons than -> -> > because it makes -> -> > that much sense anymore. -> -> -> -> *embarassed cough*. We haven't fully hooked the LSA up in qemu yet. -> -> I'd forgotten that was still on the todo list. I don't think it will -> -> be particularly hard to do and will take a look in next few days. -> -> -> -> Very very indirectly this error is causing a driver probe fail that means -> -> that -> -> we hit a code path that has a rather odd looking check on NDD_LABELING. -> -> Should not have gotten near that path though - hence the problem is actually -> -> when we call cxl_pmem_get_config_data() and it returns an error because -> -> we haven't fully connected up the command in QEMU. -> -> -So a least one bug in QEMU. We were not supporting variable length payloads -> -on mailbox -> -inputs (but were on outputs). That hasn't mattered until we get to LSA -> -writes. -> -We just need to relax condition on the supplied length. -> -> -diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c -> -index c352a935c4..fdda9529fe 100644 -> ---- a/hw/cxl/cxl-mailbox-utils.c -> -+++ b/hw/cxl/cxl-mailbox-utils.c -> -@@ -510,7 +510,7 @@ void cxl_process_mailbox(CXLDeviceState *cxl_dstate) -> -cxl_cmd = &cxl_cmd_set[set][cmd]; -> -h = cxl_cmd->handler; -> -if (h) { -> -- if (len == cxl_cmd->in) { -> -+ if (len == cxl_cmd->in || !cxl_cmd->in) { -Fix is wrong as we use ~0 as the placeholder for variable payload, not 0. - -With that fixed we hit new fun paths - after some errors we get the -worrying - not totally sure but looks like a failure on an error cleanup. 
-I'll chase down the error source, but even then this is probably triggerable by -hardware problem or similar. Some bonus prints in here from me chasing -error paths, but it's otherwise just cxl/next + the fix I posted earlier today. - -[ 69.919877] nd_bus ndbus0: START: nd_region.probe(region0) -[ 69.920108] nd_region_probe -[ 69.920623] ------------[ cut here ]------------ -[ 69.920675] refcount_t: addition on 0; use-after-free. -[ 69.921314] WARNING: CPU: 3 PID: 710 at lib/refcount.c:25 -refcount_warn_saturate+0xa0/0x144 -[ 69.926949] Modules linked in: cxl_pmem cxl_mem cxl_pci cxl_port cxl_acpi -cxl_core -[ 69.928830] CPU: 3 PID: 710 Comm: kworker/u8:9 Not tainted 5.19.0-rc3+ #399 -[ 69.930596] Hardware name: QEMU QEMU Virtual Machine, BIOS 0.0.0 02/06/2015 -[ 69.931482] Workqueue: events_unbound async_run_entry_fn -[ 69.932403] pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) -[ 69.934023] pc : refcount_warn_saturate+0xa0/0x144 -[ 69.935161] lr : refcount_warn_saturate+0xa0/0x144 -[ 69.936541] sp : ffff80000890b960 -[ 69.937921] x29: ffff80000890b960 x28: 0000000000000000 x27: 0000000000000000 -[ 69.940917] x26: ffffa54a90d5cb10 x25: ffffa54a90809e98 x24: 0000000000000000 -[ 69.942537] x23: ffffa54a91a3d8d8 x22: ffff0000c5254800 x21: ffff0000c5254800 -[ 69.944013] x20: ffff0000ce924180 x19: ffff0000c5254800 x18: ffffffffffffffff -[ 69.946100] x17: ffff5ab66e5ef000 x16: ffff80000801c000 x15: 0000000000000000 -[ 69.947585] x14: 0000000000000001 x13: 0a2e656572662d72 x12: 657466612d657375 -[ 69.948670] x11: 203b30206e6f206e x10: 6f69746964646120 x9 : ffffa54a8f63d288 -[ 69.950679] x8 : 206e6f206e6f6974 x7 : 69646461203a745f x6 : 00000000fffff31e -[ 69.952113] x5 : ffff0000ff61ba08 x4 : 00000000fffff31e x3 : ffff5ab66e5ef000 -root@debian:/sys/bus/cxl/devices/decoder0.0/region0# [ 69.954752] x2 : -0000000000000000 x1 : 0000000000000000 x0 : ffff0000c512e740 -[ 69.957098] Call trace: -[ 69.957959] refcount_warn_saturate+0xa0/0x144 -[ 69.958773] get_ndd+0x5c/0x80 -[ 69.959294] nd_region_register_namespaces+0xe4/0xe90 -[ 69.960253] nd_region_probe+0x100/0x290 -[ 69.960796] nvdimm_bus_probe+0xf4/0x1c0 -[ 69.962087] really_probe+0x19c/0x3f0 -[ 69.962620] __driver_probe_device+0x11c/0x190 -[ 69.963258] driver_probe_device+0x44/0xf4 -[ 69.963773] __device_attach_driver+0xa4/0x140 -[ 69.964471] bus_for_each_drv+0x84/0xe0 -[ 69.965068] __device_attach+0xb0/0x1f0 -[ 69.966101] device_initial_probe+0x20/0x30 -[ 69.967142] bus_probe_device+0xa4/0xb0 -[ 69.968104] device_add+0x3e8/0x910 -[ 69.969111] nd_async_device_register+0x24/0x74 -[ 69.969928] async_run_entry_fn+0x40/0x150 -[ 69.970725] process_one_work+0x1dc/0x450 -[ 69.971796] worker_thread+0x154/0x450 -[ 69.972700] kthread+0x118/0x120 -[ 69.974141] ret_from_fork+0x10/0x20 -[ 69.975141] ---[ end trace 0000000000000000 ]--- -[ 70.117887] Into nd_namespace_pmem_set_resource() - -> -cxl_cmd->payload = cxl_dstate->mbox_reg_state + -> -A_CXL_DEV_CMD_PAYLOAD; -> -ret = (*h)(cxl_cmd, cxl_dstate, &len); -> -> -> -This lets the nvdimm/region probe fine, but I'm getting some issues with -> -namespace capacity so I'll look at what is causing that next. -> -Unfortunately I'm not that familiar with the driver/nvdimm side of things -> -so it's take a while to figure out what kicks off what! -> -> -Jonathan -> -> -> -> -> Jonathan -> -> -> -> -> -> > -> -> > > -> -> > > And x4 region still failed with same errors, using latest cxl/preview -> -> > > branch don't work. 
-> -> > > I have picked "Two CXL emulation fixes" patches in qemu, still not -> -> > > working. -> -> > > -> -> > > Bob -> -> -> -> -> - -On Mon, 15 Aug 2022 18:04:44 +0100 -Jonathan Cameron wrote: - -> -On Fri, 12 Aug 2022 16:44:03 +0100 -> -Jonathan Cameron wrote: -> -> -> On Thu, 11 Aug 2022 18:08:57 +0100 -> -> Jonathan Cameron via wrote: -> -> -> -> > On Tue, 9 Aug 2022 17:08:25 +0100 -> -> > Jonathan Cameron wrote: -> -> > -> -> > > On Tue, 9 Aug 2022 21:07:06 +0800 -> -> > > Bobo WL wrote: -> -> > > -> -> > > > Hi Jonathan -> -> > > > -> -> > > > Thanks for your reply! -> -> > > > -> -> > > > On Mon, Aug 8, 2022 at 8:37 PM Jonathan Cameron -> -> > > > wrote: -> -> > > > > -> -> > > > > Probably not related to your problem, but there is a disconnect in -> -> > > > > QEMU / -> -> > > > > kernel assumptionsaround the presence of an HDM decoder when a HB -> -> > > > > only -> -> > > > > has a single root port. Spec allows it to be provided or not as an -> -> > > > > implementation choice. -> -> > > > > Kernel assumes it isn't provide. Qemu assumes it is. -> -> > > > > -> -> > > > > The temporary solution is to throw in a second root port on the HB -> -> > > > > and not -> -> > > > > connect anything to it. Longer term I may special case this so -> -> > > > > that the particular -> -> > > > > decoder defaults to pass through settings in QEMU if there is only -> -> > > > > one root port. -> -> > > > > -> -> > > > -> -> > > > You are right! After adding an extra HB in qemu, I can create a x1 -> -> > > > region successfully. -> -> > > > But have some errors in Nvdimm: -> -> > > > -> -> > > > [ 74.925838] Unknown online node for memory at 0x10000000000, -> -> > > > assuming node 0 -> -> > > > [ 74.925846] Unknown target node for memory at 0x10000000000, -> -> > > > assuming node 0 -> -> > > > [ 74.927470] nd_region region0: nmem0: is disabled, failing probe -> -> > > > -> -> > > -> -> > > Ah. I've seen this one, but not chased it down yet. Was on my todo -> -> > > list to chase -> -> > > down. Once I reach this state I can verify the HDM Decode is correct -> -> > > which is what -> -> > > I've been using to test (Which wasn't true until earlier this week). -> -> > > I'm currently testing via devmem, more for historical reasons than -> -> > > because it makes -> -> > > that much sense anymore. -> -> > -> -> > *embarassed cough*. We haven't fully hooked the LSA up in qemu yet. -> -> > I'd forgotten that was still on the todo list. I don't think it will -> -> > be particularly hard to do and will take a look in next few days. -> -> > -> -> > Very very indirectly this error is causing a driver probe fail that means -> -> > that -> -> > we hit a code path that has a rather odd looking check on NDD_LABELING. -> -> > Should not have gotten near that path though - hence the problem is -> -> > actually -> -> > when we call cxl_pmem_get_config_data() and it returns an error because -> -> > we haven't fully connected up the command in QEMU. -> -> -> -> So a least one bug in QEMU. We were not supporting variable length payloads -> -> on mailbox -> -> inputs (but were on outputs). That hasn't mattered until we get to LSA -> -> writes. -> -> We just need to relax condition on the supplied length. 
-> -> -> -> diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c -> -> index c352a935c4..fdda9529fe 100644 -> -> --- a/hw/cxl/cxl-mailbox-utils.c -> -> +++ b/hw/cxl/cxl-mailbox-utils.c -> -> @@ -510,7 +510,7 @@ void cxl_process_mailbox(CXLDeviceState *cxl_dstate) -> -> cxl_cmd = &cxl_cmd_set[set][cmd]; -> -> h = cxl_cmd->handler; -> -> if (h) { -> -> - if (len == cxl_cmd->in) { -> -> + if (len == cxl_cmd->in || !cxl_cmd->in) { -> -Fix is wrong as we use ~0 as the placeholder for variable payload, not 0. -Cause of the error is a failure in GET_LSA. -Reason, payload length is wrong in QEMU but was hidden previously by my wrong -fix here. Probably still a good idea to inject an error in GET_LSA and chase -down the refcount issue. - - -diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c -index fdda9529fe..e8565fbd6e 100644 ---- a/hw/cxl/cxl-mailbox-utils.c -+++ b/hw/cxl/cxl-mailbox-utils.c -@@ -489,7 +489,7 @@ static struct cxl_cmd cxl_cmd_set[256][256] = { - cmd_identify_memory_device, 0, 0 }, - [CCLS][GET_PARTITION_INFO] = { "CCLS_GET_PARTITION_INFO", - cmd_ccls_get_partition_info, 0, 0 }, -- [CCLS][GET_LSA] = { "CCLS_GET_LSA", cmd_ccls_get_lsa, 0, 0 }, -+ [CCLS][GET_LSA] = { "CCLS_GET_LSA", cmd_ccls_get_lsa, 8, 0 }, - [CCLS][SET_LSA] = { "CCLS_SET_LSA", cmd_ccls_set_lsa, - ~0, IMMEDIATE_CONFIG_CHANGE | IMMEDIATE_DATA_CHANGE }, - [MEDIA_AND_POISON][GET_POISON_LIST] = { "MEDIA_AND_POISON_GET_POISON_LIST", -@@ -510,12 +510,13 @@ void cxl_process_mailbox(CXLDeviceState *cxl_dstate) - cxl_cmd = &cxl_cmd_set[set][cmd]; - h = cxl_cmd->handler; - if (h) { -- if (len == cxl_cmd->in || !cxl_cmd->in) { -+ if (len == cxl_cmd->in || cxl_cmd->in == ~0) { - cxl_cmd->payload = cxl_dstate->mbox_reg_state + - A_CXL_DEV_CMD_PAYLOAD; - -And woot, we get a namespace in the LSA :) - -I'll post QEMU fixes in next day or two. Kernel side now seems more or less -fine be it with suspicious refcount underflow. - -> -> -With that fixed we hit new fun paths - after some errors we get the -> -worrying - not totally sure but looks like a failure on an error cleanup. -> -I'll chase down the error source, but even then this is probably triggerable -> -by -> -hardware problem or similar. Some bonus prints in here from me chasing -> -error paths, but it's otherwise just cxl/next + the fix I posted earlier -> -today. -> -> -[ 69.919877] nd_bus ndbus0: START: nd_region.probe(region0) -> -[ 69.920108] nd_region_probe -> -[ 69.920623] ------------[ cut here ]------------ -> -[ 69.920675] refcount_t: addition on 0; use-after-free. 
-> -[ 69.921314] WARNING: CPU: 3 PID: 710 at lib/refcount.c:25 -> -refcount_warn_saturate+0xa0/0x144 -> -[ 69.926949] Modules linked in: cxl_pmem cxl_mem cxl_pci cxl_port cxl_acpi -> -cxl_core -> -[ 69.928830] CPU: 3 PID: 710 Comm: kworker/u8:9 Not tainted 5.19.0-rc3+ #399 -> -[ 69.930596] Hardware name: QEMU QEMU Virtual Machine, BIOS 0.0.0 02/06/2015 -> -[ 69.931482] Workqueue: events_unbound async_run_entry_fn -> -[ 69.932403] pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) -> -[ 69.934023] pc : refcount_warn_saturate+0xa0/0x144 -> -[ 69.935161] lr : refcount_warn_saturate+0xa0/0x144 -> -[ 69.936541] sp : ffff80000890b960 -> -[ 69.937921] x29: ffff80000890b960 x28: 0000000000000000 x27: -> -0000000000000000 -> -[ 69.940917] x26: ffffa54a90d5cb10 x25: ffffa54a90809e98 x24: -> -0000000000000000 -> -[ 69.942537] x23: ffffa54a91a3d8d8 x22: ffff0000c5254800 x21: -> -ffff0000c5254800 -> -[ 69.944013] x20: ffff0000ce924180 x19: ffff0000c5254800 x18: -> -ffffffffffffffff -> -[ 69.946100] x17: ffff5ab66e5ef000 x16: ffff80000801c000 x15: -> -0000000000000000 -> -[ 69.947585] x14: 0000000000000001 x13: 0a2e656572662d72 x12: -> -657466612d657375 -> -[ 69.948670] x11: 203b30206e6f206e x10: 6f69746964646120 x9 : -> -ffffa54a8f63d288 -> -[ 69.950679] x8 : 206e6f206e6f6974 x7 : 69646461203a745f x6 : -> -00000000fffff31e -> -[ 69.952113] x5 : ffff0000ff61ba08 x4 : 00000000fffff31e x3 : -> -ffff5ab66e5ef000 -> -root@debian:/sys/bus/cxl/devices/decoder0.0/region0# [ 69.954752] x2 : -> -0000000000000000 x1 : 0000000000000000 x0 : ffff0000c512e740 -> -[ 69.957098] Call trace: -> -[ 69.957959] refcount_warn_saturate+0xa0/0x144 -> -[ 69.958773] get_ndd+0x5c/0x80 -> -[ 69.959294] nd_region_register_namespaces+0xe4/0xe90 -> -[ 69.960253] nd_region_probe+0x100/0x290 -> -[ 69.960796] nvdimm_bus_probe+0xf4/0x1c0 -> -[ 69.962087] really_probe+0x19c/0x3f0 -> -[ 69.962620] __driver_probe_device+0x11c/0x190 -> -[ 69.963258] driver_probe_device+0x44/0xf4 -> -[ 69.963773] __device_attach_driver+0xa4/0x140 -> -[ 69.964471] bus_for_each_drv+0x84/0xe0 -> -[ 69.965068] __device_attach+0xb0/0x1f0 -> -[ 69.966101] device_initial_probe+0x20/0x30 -> -[ 69.967142] bus_probe_device+0xa4/0xb0 -> -[ 69.968104] device_add+0x3e8/0x910 -> -[ 69.969111] nd_async_device_register+0x24/0x74 -> -[ 69.969928] async_run_entry_fn+0x40/0x150 -> -[ 69.970725] process_one_work+0x1dc/0x450 -> -[ 69.971796] worker_thread+0x154/0x450 -> -[ 69.972700] kthread+0x118/0x120 -> -[ 69.974141] ret_from_fork+0x10/0x20 -> -[ 69.975141] ---[ end trace 0000000000000000 ]--- -> -[ 70.117887] Into nd_namespace_pmem_set_resource() -> -> -> cxl_cmd->payload = cxl_dstate->mbox_reg_state + -> -> A_CXL_DEV_CMD_PAYLOAD; -> -> ret = (*h)(cxl_cmd, cxl_dstate, &len); -> -> -> -> -> -> This lets the nvdimm/region probe fine, but I'm getting some issues with -> -> namespace capacity so I'll look at what is causing that next. -> -> Unfortunately I'm not that familiar with the driver/nvdimm side of things -> -> so it's take a while to figure out what kicks off what! -> -> -> -> Jonathan -> -> -> -> > -> -> > Jonathan -> -> > -> -> > -> -> > > -> -> > > > -> -> > > > And x4 region still failed with same errors, using latest cxl/preview -> -> > > > branch don't work. -> -> > > > I have picked "Two CXL emulation fixes" patches in qemu, still not -> -> > > > working. 
-> -> > > > -> -> > > > Bob -> -> > -> -> > -> -> -> - -Jonathan Cameron wrote: -> -On Fri, 12 Aug 2022 16:44:03 +0100 -> -Jonathan Cameron wrote: -> -> -> On Thu, 11 Aug 2022 18:08:57 +0100 -> -> Jonathan Cameron via wrote: -> -> -> -> > On Tue, 9 Aug 2022 17:08:25 +0100 -> -> > Jonathan Cameron wrote: -> -> > -> -> > > On Tue, 9 Aug 2022 21:07:06 +0800 -> -> > > Bobo WL wrote: -> -> > > -> -> > > > Hi Jonathan -> -> > > > -> -> > > > Thanks for your reply! -> -> > > > -> -> > > > On Mon, Aug 8, 2022 at 8:37 PM Jonathan Cameron -> -> > > > wrote: -> -> > > > > -> -> > > > > Probably not related to your problem, but there is a disconnect in -> -> > > > > QEMU / -> -> > > > > kernel assumptionsaround the presence of an HDM decoder when a HB -> -> > > > > only -> -> > > > > has a single root port. Spec allows it to be provided or not as an -> -> > > > > implementation choice. -> -> > > > > Kernel assumes it isn't provide. Qemu assumes it is. -> -> > > > > -> -> > > > > The temporary solution is to throw in a second root port on the HB -> -> > > > > and not -> -> > > > > connect anything to it. Longer term I may special case this so -> -> > > > > that the particular -> -> > > > > decoder defaults to pass through settings in QEMU if there is only -> -> > > > > one root port. -> -> > > > > -> -> > > > -> -> > > > You are right! After adding an extra HB in qemu, I can create a x1 -> -> > > > region successfully. -> -> > > > But have some errors in Nvdimm: -> -> > > > -> -> > > > [ 74.925838] Unknown online node for memory at 0x10000000000, -> -> > > > assuming node 0 -> -> > > > [ 74.925846] Unknown target node for memory at 0x10000000000, -> -> > > > assuming node 0 -> -> > > > [ 74.927470] nd_region region0: nmem0: is disabled, failing probe -> -> > > > -> -> > > -> -> > > Ah. I've seen this one, but not chased it down yet. Was on my todo -> -> > > list to chase -> -> > > down. Once I reach this state I can verify the HDM Decode is correct -> -> > > which is what -> -> > > I've been using to test (Which wasn't true until earlier this week). -> -> > > I'm currently testing via devmem, more for historical reasons than -> -> > > because it makes -> -> > > that much sense anymore. -> -> > -> -> > *embarassed cough*. We haven't fully hooked the LSA up in qemu yet. -> -> > I'd forgotten that was still on the todo list. I don't think it will -> -> > be particularly hard to do and will take a look in next few days. -> -> > -> -> > Very very indirectly this error is causing a driver probe fail that means -> -> > that -> -> > we hit a code path that has a rather odd looking check on NDD_LABELING. -> -> > Should not have gotten near that path though - hence the problem is -> -> > actually -> -> > when we call cxl_pmem_get_config_data() and it returns an error because -> -> > we haven't fully connected up the command in QEMU. -> -> -> -> So a least one bug in QEMU. We were not supporting variable length payloads -> -> on mailbox -> -> inputs (but were on outputs). That hasn't mattered until we get to LSA -> -> writes. -> -> We just need to relax condition on the supplied length. 
-> -> -> -> diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c -> -> index c352a935c4..fdda9529fe 100644 -> -> --- a/hw/cxl/cxl-mailbox-utils.c -> -> +++ b/hw/cxl/cxl-mailbox-utils.c -> -> @@ -510,7 +510,7 @@ void cxl_process_mailbox(CXLDeviceState *cxl_dstate) -> -> cxl_cmd = &cxl_cmd_set[set][cmd]; -> -> h = cxl_cmd->handler; -> -> if (h) { -> -> - if (len == cxl_cmd->in) { -> -> + if (len == cxl_cmd->in || !cxl_cmd->in) { -> -Fix is wrong as we use ~0 as the placeholder for variable payload, not 0. -> -> -With that fixed we hit new fun paths - after some errors we get the -> -worrying - not totally sure but looks like a failure on an error cleanup. -> -I'll chase down the error source, but even then this is probably triggerable -> -by -> -hardware problem or similar. Some bonus prints in here from me chasing -> -error paths, but it's otherwise just cxl/next + the fix I posted earlier -> -today. -One of the scenarios that I cannot rule out is nvdimm_probe() racing -nd_region_probe(), but given all the work it takes to create a region I -suspect all the nvdimm_probe() work to have completed... - -It is at least one potentially wrong hypothesis that needs to be chased -down. - -> -> -[ 69.919877] nd_bus ndbus0: START: nd_region.probe(region0) -> -[ 69.920108] nd_region_probe -> -[ 69.920623] ------------[ cut here ]------------ -> -[ 69.920675] refcount_t: addition on 0; use-after-free. -> -[ 69.921314] WARNING: CPU: 3 PID: 710 at lib/refcount.c:25 -> -refcount_warn_saturate+0xa0/0x144 -> -[ 69.926949] Modules linked in: cxl_pmem cxl_mem cxl_pci cxl_port cxl_acpi -> -cxl_core -> -[ 69.928830] CPU: 3 PID: 710 Comm: kworker/u8:9 Not tainted 5.19.0-rc3+ #399 -> -[ 69.930596] Hardware name: QEMU QEMU Virtual Machine, BIOS 0.0.0 02/06/2015 -> -[ 69.931482] Workqueue: events_unbound async_run_entry_fn -> -[ 69.932403] pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) -> -[ 69.934023] pc : refcount_warn_saturate+0xa0/0x144 -> -[ 69.935161] lr : refcount_warn_saturate+0xa0/0x144 -> -[ 69.936541] sp : ffff80000890b960 -> -[ 69.937921] x29: ffff80000890b960 x28: 0000000000000000 x27: -> -0000000000000000 -> -[ 69.940917] x26: ffffa54a90d5cb10 x25: ffffa54a90809e98 x24: -> -0000000000000000 -> -[ 69.942537] x23: ffffa54a91a3d8d8 x22: ffff0000c5254800 x21: -> -ffff0000c5254800 -> -[ 69.944013] x20: ffff0000ce924180 x19: ffff0000c5254800 x18: -> -ffffffffffffffff -> -[ 69.946100] x17: ffff5ab66e5ef000 x16: ffff80000801c000 x15: -> -0000000000000000 -> -[ 69.947585] x14: 0000000000000001 x13: 0a2e656572662d72 x12: -> -657466612d657375 -> -[ 69.948670] x11: 203b30206e6f206e x10: 6f69746964646120 x9 : -> -ffffa54a8f63d288 -> -[ 69.950679] x8 : 206e6f206e6f6974 x7 : 69646461203a745f x6 : -> -00000000fffff31e -> -[ 69.952113] x5 : ffff0000ff61ba08 x4 : 00000000fffff31e x3 : -> -ffff5ab66e5ef000 -> -root@debian:/sys/bus/cxl/devices/decoder0.0/region0# [ 69.954752] x2 : -> -0000000000000000 x1 : 0000000000000000 x0 : ffff0000c512e740 -> -[ 69.957098] Call trace: -> -[ 69.957959] refcount_warn_saturate+0xa0/0x144 -> -[ 69.958773] get_ndd+0x5c/0x80 -> -[ 69.959294] nd_region_register_namespaces+0xe4/0xe90 -> -[ 69.960253] nd_region_probe+0x100/0x290 -> -[ 69.960796] nvdimm_bus_probe+0xf4/0x1c0 -> -[ 69.962087] really_probe+0x19c/0x3f0 -> -[ 69.962620] __driver_probe_device+0x11c/0x190 -> -[ 69.963258] driver_probe_device+0x44/0xf4 -> -[ 69.963773] __device_attach_driver+0xa4/0x140 -> -[ 69.964471] bus_for_each_drv+0x84/0xe0 -> -[ 69.965068] __device_attach+0xb0/0x1f0 -> -[ 
69.966101] device_initial_probe+0x20/0x30 -> -[ 69.967142] bus_probe_device+0xa4/0xb0 -> -[ 69.968104] device_add+0x3e8/0x910 -> -[ 69.969111] nd_async_device_register+0x24/0x74 -> -[ 69.969928] async_run_entry_fn+0x40/0x150 -> -[ 69.970725] process_one_work+0x1dc/0x450 -> -[ 69.971796] worker_thread+0x154/0x450 -> -[ 69.972700] kthread+0x118/0x120 -> -[ 69.974141] ret_from_fork+0x10/0x20 -> -[ 69.975141] ---[ end trace 0000000000000000 ]--- -> -[ 70.117887] Into nd_namespace_pmem_set_resource() - -On Mon, 15 Aug 2022 15:55:15 -0700 -Dan Williams wrote: - -> -Jonathan Cameron wrote: -> -> On Fri, 12 Aug 2022 16:44:03 +0100 -> -> Jonathan Cameron wrote: -> -> -> -> > On Thu, 11 Aug 2022 18:08:57 +0100 -> -> > Jonathan Cameron via wrote: -> -> > -> -> > > On Tue, 9 Aug 2022 17:08:25 +0100 -> -> > > Jonathan Cameron wrote: -> -> > > -> -> > > > On Tue, 9 Aug 2022 21:07:06 +0800 -> -> > > > Bobo WL wrote: -> -> > > > -> -> > > > > Hi Jonathan -> -> > > > > -> -> > > > > Thanks for your reply! -> -> > > > > -> -> > > > > On Mon, Aug 8, 2022 at 8:37 PM Jonathan Cameron -> -> > > > > wrote: -> -> > > > > > -> -> > > > > > Probably not related to your problem, but there is a disconnect -> -> > > > > > in QEMU / -> -> > > > > > kernel assumptionsaround the presence of an HDM decoder when a HB -> -> > > > > > only -> -> > > > > > has a single root port. Spec allows it to be provided or not as -> -> > > > > > an implementation choice. -> -> > > > > > Kernel assumes it isn't provide. Qemu assumes it is. -> -> > > > > > -> -> > > > > > The temporary solution is to throw in a second root port on the -> -> > > > > > HB and not -> -> > > > > > connect anything to it. Longer term I may special case this so -> -> > > > > > that the particular -> -> > > > > > decoder defaults to pass through settings in QEMU if there is -> -> > > > > > only one root port. -> -> > > > > > -> -> > > > > -> -> > > > > You are right! After adding an extra HB in qemu, I can create a x1 -> -> > > > > region successfully. -> -> > > > > But have some errors in Nvdimm: -> -> > > > > -> -> > > > > [ 74.925838] Unknown online node for memory at 0x10000000000, -> -> > > > > assuming node 0 -> -> > > > > [ 74.925846] Unknown target node for memory at 0x10000000000, -> -> > > > > assuming node 0 -> -> > > > > [ 74.927470] nd_region region0: nmem0: is disabled, failing probe -> -> > > > > -> -> > > > -> -> > > > Ah. I've seen this one, but not chased it down yet. Was on my todo -> -> > > > list to chase -> -> > > > down. Once I reach this state I can verify the HDM Decode is correct -> -> > > > which is what -> -> > > > I've been using to test (Which wasn't true until earlier this week). -> -> > > > I'm currently testing via devmem, more for historical reasons than -> -> > > > because it makes -> -> > > > that much sense anymore. -> -> > > -> -> > > *embarassed cough*. We haven't fully hooked the LSA up in qemu yet. -> -> > > I'd forgotten that was still on the todo list. I don't think it will -> -> > > be particularly hard to do and will take a look in next few days. -> -> > > -> -> > > Very very indirectly this error is causing a driver probe fail that -> -> > > means that -> -> > > we hit a code path that has a rather odd looking check on NDD_LABELING. -> -> > > Should not have gotten near that path though - hence the problem is -> -> > > actually -> -> > > when we call cxl_pmem_get_config_data() and it returns an error because -> -> > > we haven't fully connected up the command in QEMU. -> -> > -> -> > So a least one bug in QEMU. 
We were not supporting variable length -> -> > payloads on mailbox -> -> > inputs (but were on outputs). That hasn't mattered until we get to LSA -> -> > writes. -> -> > We just need to relax condition on the supplied length. -> -> > -> -> > diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c -> -> > index c352a935c4..fdda9529fe 100644 -> -> > --- a/hw/cxl/cxl-mailbox-utils.c -> -> > +++ b/hw/cxl/cxl-mailbox-utils.c -> -> > @@ -510,7 +510,7 @@ void cxl_process_mailbox(CXLDeviceState *cxl_dstate) -> -> > cxl_cmd = &cxl_cmd_set[set][cmd]; -> -> > h = cxl_cmd->handler; -> -> > if (h) { -> -> > - if (len == cxl_cmd->in) { -> -> > + if (len == cxl_cmd->in || !cxl_cmd->in) { -> -> Fix is wrong as we use ~0 as the placeholder for variable payload, not 0. -> -> -> -> With that fixed we hit new fun paths - after some errors we get the -> -> worrying - not totally sure but looks like a failure on an error cleanup. -> -> I'll chase down the error source, but even then this is probably -> -> triggerable by -> -> hardware problem or similar. Some bonus prints in here from me chasing -> -> error paths, but it's otherwise just cxl/next + the fix I posted earlier -> -> today. -> -> -One of the scenarios that I cannot rule out is nvdimm_probe() racing -> -nd_region_probe(), but given all the work it takes to create a region I -> -suspect all the nvdimm_probe() work to have completed... -> -> -It is at least one potentially wrong hypothesis that needs to be chased -> -down. -Maybe there should be a special award for the non-intuitive -ndctl create-namespace command (modifies existing namespace and might create -a different empty one...) I'm sure there is some interesting history behind -that one :) - -Upshot is I just threw a filesystem on fsdax and wrote some text files on it -to allow easy grepping. The right data ends up in the memory and a plausible -namespace description is stored in the LSA. - -So to some degree at least it's 'working' on an 8 way direct connected -set of emulated devices. - -One snag is that serial number support isn't yet upstream in QEMU. -(I have had it in my tree for a while but not posted it yet because of - QEMU feature freeze) -https://gitlab.com/jic23/qemu/-/commit/144c783ea8a5fbe169f46ea1ba92940157f42733 -That's needed for meaningful cookie generation. Otherwise you can build the -namespace once, but it won't work on next probe as the cookie is 0 and you -hit some error paths. - -Maybe sensible to add a sanity check and fail namespace creation if -cookie is 0? (Silly side question, but is there a theoretical risk of -a serial number / other data combination leading to a fletcher64() -checksum that happens to be 0 - that would give a very odd bug report!) - -So to make it work the following is needed: - -1) The kernel fix for mailbox buffer overflow. -2) Qemu fix for size of arguements for get_lsa -3) Qemu fix to allow variable size input arguements (for set_lsa) -4) Serial number patch above + command lines to qemu to set appropriate - serial numbers. - -I'll send out the QEMU fixes shortly and post the Serial number patch, -though that almost certainly won't go in until next QEMU development -cycle starts in a few weeks. - -Next up, run through same tests on some other topologies. - -Jonathan - -> -> -> -> -> [ 69.919877] nd_bus ndbus0: START: nd_region.probe(region0) -> -> [ 69.920108] nd_region_probe -> -> [ 69.920623] ------------[ cut here ]------------ -> -> [ 69.920675] refcount_t: addition on 0; use-after-free. 
-> -> [ 69.921314] WARNING: CPU: 3 PID: 710 at lib/refcount.c:25 -> -> refcount_warn_saturate+0xa0/0x144 -> -> [ 69.926949] Modules linked in: cxl_pmem cxl_mem cxl_pci cxl_port -> -> cxl_acpi cxl_core -> -> [ 69.928830] CPU: 3 PID: 710 Comm: kworker/u8:9 Not tainted 5.19.0-rc3+ -> -> #399 -> -> [ 69.930596] Hardware name: QEMU QEMU Virtual Machine, BIOS 0.0.0 -> -> 02/06/2015 -> -> [ 69.931482] Workqueue: events_unbound async_run_entry_fn -> -> [ 69.932403] pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS -> -> BTYPE=--) -> -> [ 69.934023] pc : refcount_warn_saturate+0xa0/0x144 -> -> [ 69.935161] lr : refcount_warn_saturate+0xa0/0x144 -> -> [ 69.936541] sp : ffff80000890b960 -> -> [ 69.937921] x29: ffff80000890b960 x28: 0000000000000000 x27: -> -> 0000000000000000 -> -> [ 69.940917] x26: ffffa54a90d5cb10 x25: ffffa54a90809e98 x24: -> -> 0000000000000000 -> -> [ 69.942537] x23: ffffa54a91a3d8d8 x22: ffff0000c5254800 x21: -> -> ffff0000c5254800 -> -> [ 69.944013] x20: ffff0000ce924180 x19: ffff0000c5254800 x18: -> -> ffffffffffffffff -> -> [ 69.946100] x17: ffff5ab66e5ef000 x16: ffff80000801c000 x15: -> -> 0000000000000000 -> -> [ 69.947585] x14: 0000000000000001 x13: 0a2e656572662d72 x12: -> -> 657466612d657375 -> -> [ 69.948670] x11: 203b30206e6f206e x10: 6f69746964646120 x9 : -> -> ffffa54a8f63d288 -> -> [ 69.950679] x8 : 206e6f206e6f6974 x7 : 69646461203a745f x6 : -> -> 00000000fffff31e -> -> [ 69.952113] x5 : ffff0000ff61ba08 x4 : 00000000fffff31e x3 : -> -> ffff5ab66e5ef000 -> -> root@debian:/sys/bus/cxl/devices/decoder0.0/region0# [ 69.954752] x2 : -> -> 0000000000000000 x1 : 0000000000000000 x0 : ffff0000c512e740 -> -> [ 69.957098] Call trace: -> -> [ 69.957959] refcount_warn_saturate+0xa0/0x144 -> -> [ 69.958773] get_ndd+0x5c/0x80 -> -> [ 69.959294] nd_region_register_namespaces+0xe4/0xe90 -> -> [ 69.960253] nd_region_probe+0x100/0x290 -> -> [ 69.960796] nvdimm_bus_probe+0xf4/0x1c0 -> -> [ 69.962087] really_probe+0x19c/0x3f0 -> -> [ 69.962620] __driver_probe_device+0x11c/0x190 -> -> [ 69.963258] driver_probe_device+0x44/0xf4 -> -> [ 69.963773] __device_attach_driver+0xa4/0x140 -> -> [ 69.964471] bus_for_each_drv+0x84/0xe0 -> -> [ 69.965068] __device_attach+0xb0/0x1f0 -> -> [ 69.966101] device_initial_probe+0x20/0x30 -> -> [ 69.967142] bus_probe_device+0xa4/0xb0 -> -> [ 69.968104] device_add+0x3e8/0x910 -> -> [ 69.969111] nd_async_device_register+0x24/0x74 -> -> [ 69.969928] async_run_entry_fn+0x40/0x150 -> -> [ 69.970725] process_one_work+0x1dc/0x450 -> -> [ 69.971796] worker_thread+0x154/0x450 -> -> [ 69.972700] kthread+0x118/0x120 -> -> [ 69.974141] ret_from_fork+0x10/0x20 -> -> [ 69.975141] ---[ end trace 0000000000000000 ]--- -> -> [ 70.117887] Into nd_namespace_pmem_set_resource() - -Bobo WL wrote: -> -Hi list -> -> -I want to test cxl functions in arm64, and found some problems I can't -> -figure out. -> -> -My test environment: -> -> -1. build latest bios from -https://github.com/tianocore/edk2.git -master -> -branch(cc2db6ebfb6d9d85ba4c7b35fba1fa37fffc0bc2) -> -2. build latest qemu-system-aarch64 from git://git.qemu.org/qemu.git -> -master branch(846dcf0ba4eff824c295f06550b8673ff3f31314). With cxl arm -> -support patch: -> -https://patchwork.kernel.org/project/cxl/cover/20220616141950.23374-1-Jonathan.Cameron@huawei.com/ -> -3. build Linux kernel from -> -https://git.kernel.org/pub/scm/linux/kernel/git/cxl/cxl.git -preview -> -branch(65fc1c3d26b96002a5aa1f4012fae4dc98fd5683) -> -4. 
build latest ndctl tools from -https://github.com/pmem/ndctl -> -create_region branch(8558b394e449779e3a4f3ae90fae77ede0bca159) -> -> -And my qemu test commands: -> -sudo $QEMU_BIN -M virt,gic-version=3,cxl=on -m 4g,maxmem=8G,slots=8 \ -> --cpu max -smp 8 -nographic -no-reboot \ -> --kernel $KERNEL -bios $BIOS_BIN \ -> --drive if=none,file=$ROOTFS,format=qcow2,id=hd \ -> --device virtio-blk-pci,drive=hd -append 'root=/dev/vda1 -> -nokaslr dyndbg="module cxl* +p"' \ -> --object memory-backend-ram,size=4G,id=mem0 \ -> --numa node,nodeid=0,cpus=0-7,memdev=mem0 \ -> --net nic -net user,hostfwd=tcp::2222-:22 -enable-kvm \ -> --object -> -memory-backend-file,id=cxl-mem0,share=on,mem-path=/tmp/cxltest.raw,size=256M -> -\ -> --object -> -memory-backend-file,id=cxl-mem1,share=on,mem-path=/tmp/cxltest1.raw,size=256M -> -\ -> --object -> -memory-backend-file,id=cxl-mem2,share=on,mem-path=/tmp/cxltest2.raw,size=256M -> -\ -> --object -> -memory-backend-file,id=cxl-mem3,share=on,mem-path=/tmp/cxltest3.raw,size=256M -> -\ -> --object -> -memory-backend-file,id=cxl-lsa0,share=on,mem-path=/tmp/lsa0.raw,size=256M -> -\ -> --object -> -memory-backend-file,id=cxl-lsa1,share=on,mem-path=/tmp/lsa1.raw,size=256M -> -\ -> --object -> -memory-backend-file,id=cxl-lsa2,share=on,mem-path=/tmp/lsa2.raw,size=256M -> -\ -> --object -> -memory-backend-file,id=cxl-lsa3,share=on,mem-path=/tmp/lsa3.raw,size=256M -> -\ -> --device pxb-cxl,bus_nr=12,bus=pcie.0,id=cxl.1 \ -> --device cxl-rp,port=0,bus=cxl.1,id=root_port0,chassis=0,slot=0 \ -> --device cxl-upstream,bus=root_port0,id=us0 \ -> --device cxl-downstream,port=0,bus=us0,id=swport0,chassis=0,slot=4 \ -> --device -> -cxl-type3,bus=swport0,memdev=cxl-mem0,lsa=cxl-lsa0,id=cxl-pmem0 \ -> --device cxl-downstream,port=1,bus=us0,id=swport1,chassis=0,slot=5 \ -> --device -> -cxl-type3,bus=swport1,memdev=cxl-mem1,lsa=cxl-lsa1,id=cxl-pmem1 \ -> --device cxl-downstream,port=2,bus=us0,id=swport2,chassis=0,slot=6 \ -> --device -> -cxl-type3,bus=swport2,memdev=cxl-mem2,lsa=cxl-lsa2,id=cxl-pmem2 \ -> --device cxl-downstream,port=3,bus=us0,id=swport3,chassis=0,slot=7 \ -> --device -> -cxl-type3,bus=swport3,memdev=cxl-mem3,lsa=cxl-lsa3,id=cxl-pmem3 \ -> --M -> -cxl-fmw.0.targets.0=cxl.1,cxl-fmw.0.size=4G,cxl-fmw.0.interleave-granularity=4k -> -> -And I have got two problems. -> -1. When I want to create x1 region with command: "cxl create-region -d -> -decoder0.0 -w 1 -g 4096 mem0", kernel crashed with null pointer -> -reference. 
Crash log: -> -> -[ 534.697324] cxl_region region0: config state: 0 -> -[ 534.697346] cxl_region region0: probe: -6 -> -[ 534.697368] cxl_acpi ACPI0017:00: decoder0.0: created region0 -> -[ 534.699115] cxl region0: mem0:endpoint3 decoder3.0 add: -> -mem0:decoder3.0 @ 0 next: none nr_eps: 1 nr_targets: 1 -> -[ 534.699149] cxl region0: 0000:0d:00.0:port2 decoder2.0 add: -> -mem0:decoder3.0 @ 0 next: mem0 nr_eps: 1 nr_targets: 1 -> -[ 534.699167] cxl region0: ACPI0016:00:port1 decoder1.0 add: -> -mem0:decoder3.0 @ 0 next: 0000:0d:00.0 nr_eps: 1 nr_targets: 1 -> -[ 534.699176] cxl region0: ACPI0016:00:port1 iw: 1 ig: 256 -> -[ 534.699182] cxl region0: ACPI0016:00:port1 target[0] = 0000:0c:00.0 -> -for mem0:decoder3.0 @ 0 -> -[ 534.699189] cxl region0: 0000:0d:00.0:port2 iw: 1 ig: 256 -> -[ 534.699193] cxl region0: 0000:0d:00.0:port2 target[0] = -> -0000:0e:00.0 for mem0:decoder3.0 @ 0 -> -[ 534.699405] Unable to handle kernel NULL pointer dereference at -> -virtual address 0000000000000000 -> -[ 534.701474] Mem abort info: -> -[ 534.701994] ESR = 0x0000000086000004 -> -[ 534.702653] EC = 0x21: IABT (current EL), IL = 32 bits -> -[ 534.703616] SET = 0, FnV = 0 -> -[ 534.704174] EA = 0, S1PTW = 0 -> -[ 534.704803] FSC = 0x04: level 0 translation fault -> -[ 534.705694] user pgtable: 4k pages, 48-bit VAs, pgdp=000000010144a000 -> -[ 534.706875] [0000000000000000] pgd=0000000000000000, p4d=0000000000000000 -> -[ 534.709855] Internal error: Oops: 86000004 [#1] PREEMPT SMP -> -[ 534.710301] Modules linked in: -> -[ 534.710546] CPU: 7 PID: 331 Comm: cxl Not tainted -> -5.19.0-rc3-00064-g65fc1c3d26b9-dirty #11 -> -[ 534.715393] Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 -> -[ 534.717179] pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) -> -[ 534.719190] pc : 0x0 -> -[ 534.719928] lr : commit_store+0x118/0x2cc -> -[ 534.721007] sp : ffff80000aec3c30 -> -[ 534.721793] x29: ffff80000aec3c30 x28: ffff0000da62e740 x27: -> -ffff0000c0c06b30 -> -[ 534.723875] x26: 0000000000000000 x25: ffff0000c0a2a400 x24: -> -ffff0000c0a29400 -> -[ 534.725440] x23: 0000000000000003 x22: 0000000000000000 x21: -> -ffff0000c0c06800 -> -[ 534.727312] x20: 0000000000000000 x19: ffff0000c1559800 x18: -> -0000000000000000 -> -[ 534.729138] x17: 0000000000000000 x16: 0000000000000000 x15: -> -0000ffffd41fe838 -> -[ 534.731046] x14: 0000000000000000 x13: 0000000000000000 x12: -> -0000000000000000 -> -[ 534.732402] x11: 0000000000000000 x10: 0000000000000000 x9 : -> -0000000000000000 -> -[ 534.734432] x8 : 0000000000000000 x7 : 0000000000000000 x6 : -> -ffff0000c0906e80 -> -[ 534.735921] x5 : 0000000000000000 x4 : 0000000000000000 x3 : -> -ffff80000aec3bf0 -> -[ 534.737437] x2 : 0000000000000000 x1 : 0000000000000000 x0 : -> -ffff0000c155a000 -> -[ 534.738878] Call trace: -> -[ 534.739368] 0x0 -> -[ 534.739713] dev_attr_store+0x1c/0x30 -> -[ 534.740186] sysfs_kf_write+0x48/0x58 -> -[ 534.740961] kernfs_fop_write_iter+0x128/0x184 -> -[ 534.741872] new_sync_write+0xdc/0x158 -> -[ 534.742706] vfs_write+0x1ac/0x2a8 -> -[ 534.743440] ksys_write+0x68/0xf0 -> -[ 534.744328] __arm64_sys_write+0x1c/0x28 -> -[ 534.745180] invoke_syscall+0x44/0xf0 -> -[ 534.745989] el0_svc_common+0x4c/0xfc -> -[ 534.746661] do_el0_svc+0x60/0xa8 -> -[ 534.747378] el0_svc+0x2c/0x78 -> -[ 534.748066] el0t_64_sync_handler+0xb8/0x12c -> -[ 534.748919] el0t_64_sync+0x18c/0x190 -> -[ 534.749629] Code: bad PC value -> -[ 534.750169] ---[ end trace 0000000000000000 ]--- -What was the top kernel commit when you ran this test? 
What is the line -number of "commit_store+0x118"? - -> -2. When I want to create x4 region with command: "cxl create-region -d -> -decoder0.0 -w 4 -g 4096 -m mem0 mem1 mem2 mem3". I got below errors: -> -> -cxl region: create_region: region0: failed to set target3 to mem3 -> -cxl region: cmd_create_region: created 0 regions -> -> -And kernel log as below: -> -[ 60.536663] cxl_region region0: config state: 0 -> -[ 60.536675] cxl_region region0: probe: -6 -> -[ 60.536696] cxl_acpi ACPI0017:00: decoder0.0: created region0 -> -[ 60.538251] cxl region0: mem0:endpoint3 decoder3.0 add: -> -mem0:decoder3.0 @ 0 next: none nr_eps: 1 nr_targets: 1 -> -[ 60.538278] cxl region0: 0000:0d:00.0:port2 decoder2.0 add: -> -mem0:decoder3.0 @ 0 next: mem0 nr_eps: 1 nr_targets: 1 -> -[ 60.538295] cxl region0: ACPI0016:00:port1 decoder1.0 add: -> -mem0:decoder3.0 @ 0 next: 0000:0d:00.0 nr_eps: 1 nr_targets: 1 -> -[ 60.538647] cxl region0: mem1:endpoint4 decoder4.0 add: -> -mem1:decoder4.0 @ 1 next: none nr_eps: 1 nr_targets: 1 -> -[ 60.538663] cxl region0: 0000:0d:00.0:port2 decoder2.0 add: -> -mem1:decoder4.0 @ 1 next: mem1 nr_eps: 2 nr_targets: 2 -> -[ 60.538675] cxl region0: ACPI0016:00:port1 decoder1.0 add: -> -mem1:decoder4.0 @ 1 next: 0000:0d:00.0 nr_eps: 2 nr_targets: 1 -> -[ 60.539311] cxl region0: mem2:endpoint5 decoder5.0 add: -> -mem2:decoder5.0 @ 2 next: none nr_eps: 1 nr_targets: 1 -> -[ 60.539332] cxl region0: 0000:0d:00.0:port2 decoder2.0 add: -> -mem2:decoder5.0 @ 2 next: mem2 nr_eps: 3 nr_targets: 3 -> -[ 60.539343] cxl region0: ACPI0016:00:port1 decoder1.0 add: -> -mem2:decoder5.0 @ 2 next: 0000:0d:00.0 nr_eps: 3 nr_targets: 1 -> -[ 60.539711] cxl region0: mem3:endpoint6 decoder6.0 add: -> -mem3:decoder6.0 @ 3 next: none nr_eps: 1 nr_targets: 1 -> -[ 60.539723] cxl region0: 0000:0d:00.0:port2 decoder2.0 add: -> -mem3:decoder6.0 @ 3 next: mem3 nr_eps: 4 nr_targets: 4 -> -[ 60.539735] cxl region0: ACPI0016:00:port1 decoder1.0 add: -> -mem3:decoder6.0 @ 3 next: 0000:0d:00.0 nr_eps: 4 nr_targets: 1 -> -[ 60.539742] cxl region0: ACPI0016:00:port1 iw: 1 ig: 256 -> -[ 60.539747] cxl region0: ACPI0016:00:port1 target[0] = 0000:0c:00.0 -> -for mem0:decoder3.0 @ 0 -> -[ 60.539754] cxl region0: 0000:0d:00.0:port2 iw: 4 ig: 512 -> -[ 60.539758] cxl region0: 0000:0d:00.0:port2 target[0] = -> -0000:0e:00.0 for mem0:decoder3.0 @ 0 -> -[ 60.539764] cxl region0: ACPI0016:00:port1: cannot host mem1:decoder4.0 at -> -1 -> -> -I have tried to write sysfs node manually, got same errors. -> -> -Hope I can get some helps here. -What is the output of: - - cxl list -MDTu -d decoder0.0 - -...? It might be the case that mem1 cannot be mapped by decoder0.0, or -at least not in the specified order, or that validation check is broken. - -Hi Dan, - -Thanks for your reply! - -On Mon, Aug 8, 2022 at 11:58 PM Dan Williams wrote: -> -> -What is the output of: -> -> -cxl list -MDTu -d decoder0.0 -> -> -...? It might be the case that mem1 cannot be mapped by decoder0.0, or -> -at least not in the specified order, or that validation check is broken. 
-Command "cxl list -MDTu -d decoder0.0" output: - -[ - { - "memdevs":[ - { - "memdev":"mem2", - "pmem_size":"256.00 MiB (268.44 MB)", - "ram_size":0, - "serial":"0", - "host":"0000:11:00.0" - }, - { - "memdev":"mem1", - "pmem_size":"256.00 MiB (268.44 MB)", - "ram_size":0, - "serial":"0", - "host":"0000:10:00.0" - }, - { - "memdev":"mem0", - "pmem_size":"256.00 MiB (268.44 MB)", - "ram_size":0, - "serial":"0", - "host":"0000:0f:00.0" - }, - { - "memdev":"mem3", - "pmem_size":"256.00 MiB (268.44 MB)", - "ram_size":0, - "serial":"0", - "host":"0000:12:00.0" - } - ] - }, - { - "root decoders":[ - { - "decoder":"decoder0.0", - "resource":"0x10000000000", - "size":"4.00 GiB (4.29 GB)", - "pmem_capable":true, - "volatile_capable":true, - "accelmem_capable":true, - "nr_targets":1, - "targets":[ - { - "target":"ACPI0016:01", - "alias":"pci0000:0c", - "position":0, - "id":"0xc" - } - ] - } - ] - } -] - -Bobo WL wrote: -> -Hi Dan, -> -> -Thanks for your reply! -> -> -On Mon, Aug 8, 2022 at 11:58 PM Dan Williams wrote: -> -> -> -> What is the output of: -> -> -> -> cxl list -MDTu -d decoder0.0 -> -> -> -> ...? It might be the case that mem1 cannot be mapped by decoder0.0, or -> -> at least not in the specified order, or that validation check is broken. -> -> -Command "cxl list -MDTu -d decoder0.0" output: -Thanks for this, I think I know the problem, but will try some -experiments with cxl_test first. - -Did the commit_store() crash stop reproducing with latest cxl/preview -branch? - -On Tue, Aug 9, 2022 at 11:17 PM Dan Williams wrote: -> -> -Bobo WL wrote: -> -> Hi Dan, -> -> -> -> Thanks for your reply! -> -> -> -> On Mon, Aug 8, 2022 at 11:58 PM Dan Williams -> -> wrote: -> -> > -> -> > What is the output of: -> -> > -> -> > cxl list -MDTu -d decoder0.0 -> -> > -> -> > ...? It might be the case that mem1 cannot be mapped by decoder0.0, or -> -> > at least not in the specified order, or that validation check is broken. -> -> -> -> Command "cxl list -MDTu -d decoder0.0" output: -> -> -Thanks for this, I think I know the problem, but will try some -> -experiments with cxl_test first. -> -> -Did the commit_store() crash stop reproducing with latest cxl/preview -> -branch? -No, still hitting this bug if don't add extra HB device in qemu - -Dan Williams wrote: -> -Bobo WL wrote: -> -> Hi Dan, -> -> -> -> Thanks for your reply! -> -> -> -> On Mon, Aug 8, 2022 at 11:58 PM Dan Williams -> -> wrote: -> -> > -> -> > What is the output of: -> -> > -> -> > cxl list -MDTu -d decoder0.0 -> -> > -> -> > ...? It might be the case that mem1 cannot be mapped by decoder0.0, or -> -> > at least not in the specified order, or that validation check is broken. -> -> -> -> Command "cxl list -MDTu -d decoder0.0" output: -> -> -Thanks for this, I think I know the problem, but will try some -> -experiments with cxl_test first. -Hmm, so my cxl_test experiment unfortunately passed so I'm not -reproducing the failure mode. 
This is the result of creating x4 region -with devices directly attached to a single host-bridge: - -# cxl create-region -d decoder3.5 -w 4 -m -g 256 mem{12,10,9,11} -s $((1<<30)) -{ - "region":"region8", - "resource":"0xf1f0000000", - "size":"1024.00 MiB (1073.74 MB)", - "interleave_ways":4, - "interleave_granularity":256, - "decode_state":"commit", - "mappings":[ - { - "position":3, - "memdev":"mem11", - "decoder":"decoder21.0" - }, - { - "position":2, - "memdev":"mem9", - "decoder":"decoder19.0" - }, - { - "position":1, - "memdev":"mem10", - "decoder":"decoder20.0" - }, - { - "position":0, - "memdev":"mem12", - "decoder":"decoder22.0" - } - ] -} -cxl region: cmd_create_region: created 1 region - -> -Did the commit_store() crash stop reproducing with latest cxl/preview -> -branch? -I missed the answer to this question. - -All of these changes are now in Linus' tree perhaps give that a try and -post the debug log again? - -On Thu, 11 Aug 2022 17:46:55 -0700 -Dan Williams wrote: - -> -Dan Williams wrote: -> -> Bobo WL wrote: -> -> > Hi Dan, -> -> > -> -> > Thanks for your reply! -> -> > -> -> > On Mon, Aug 8, 2022 at 11:58 PM Dan Williams -> -> > wrote: -> -> > > -> -> > > What is the output of: -> -> > > -> -> > > cxl list -MDTu -d decoder0.0 -> -> > > -> -> > > ...? It might be the case that mem1 cannot be mapped by decoder0.0, or -> -> > > at least not in the specified order, or that validation check is -> -> > > broken. -> -> > -> -> > Command "cxl list -MDTu -d decoder0.0" output: -> -> -> -> Thanks for this, I think I know the problem, but will try some -> -> experiments with cxl_test first. -> -> -Hmm, so my cxl_test experiment unfortunately passed so I'm not -> -reproducing the failure mode. This is the result of creating x4 region -> -with devices directly attached to a single host-bridge: -> -> -# cxl create-region -d decoder3.5 -w 4 -m -g 256 mem{12,10,9,11} -s $((1<<30)) -> -{ -> -"region":"region8", -> -"resource":"0xf1f0000000", -> -"size":"1024.00 MiB (1073.74 MB)", -> -"interleave_ways":4, -> -"interleave_granularity":256, -> -"decode_state":"commit", -> -"mappings":[ -> -{ -> -"position":3, -> -"memdev":"mem11", -> -"decoder":"decoder21.0" -> -}, -> -{ -> -"position":2, -> -"memdev":"mem9", -> -"decoder":"decoder19.0" -> -}, -> -{ -> -"position":1, -> -"memdev":"mem10", -> -"decoder":"decoder20.0" -> -}, -> -{ -> -"position":0, -> -"memdev":"mem12", -> -"decoder":"decoder22.0" -> -} -> -] -> -} -> -cxl region: cmd_create_region: created 1 region -> -> -> Did the commit_store() crash stop reproducing with latest cxl/preview -> -> branch? -> -> -I missed the answer to this question. -> -> -All of these changes are now in Linus' tree perhaps give that a try and -> -post the debug log again? -Hi Dan, - -I've moved onto looking at this one. -1 HB, 2RP (to make it configure the HDM decoder in the QEMU HB, I'll tidy that -up -at some stage), 1 switch, 4 downstream switch ports each with a type 3 - -I'm not getting a crash, but can't successfully setup a region. -Upon adding the final target -It's failing in check_last_peer() as pos < distance. -Seems distance is 4 which makes me think it's using the wrong level of the -heirarchy for -some reason or that distance check is wrong. -Wasn't a good idea to just skip that step though as it goes boom - though -stack trace is not useful. 
- -Jonathan - -On Wed, 17 Aug 2022 17:16:19 +0100 -Jonathan Cameron wrote: - -> -On Thu, 11 Aug 2022 17:46:55 -0700 -> -Dan Williams wrote: -> -> -> Dan Williams wrote: -> -> > Bobo WL wrote: -> -> > > Hi Dan, -> -> > > -> -> > > Thanks for your reply! -> -> > > -> -> > > On Mon, Aug 8, 2022 at 11:58 PM Dan Williams -> -> > > wrote: -> -> > > > -> -> > > > What is the output of: -> -> > > > -> -> > > > cxl list -MDTu -d decoder0.0 -> -> > > > -> -> > > > ...? It might be the case that mem1 cannot be mapped by decoder0.0, or -> -> > > > at least not in the specified order, or that validation check is -> -> > > > broken. -> -> > > -> -> > > Command "cxl list -MDTu -d decoder0.0" output: -> -> > -> -> > Thanks for this, I think I know the problem, but will try some -> -> > experiments with cxl_test first. -> -> -> -> Hmm, so my cxl_test experiment unfortunately passed so I'm not -> -> reproducing the failure mode. This is the result of creating x4 region -> -> with devices directly attached to a single host-bridge: -> -> -> -> # cxl create-region -d decoder3.5 -w 4 -m -g 256 mem{12,10,9,11} -s -> -> $((1<<30)) -> -> { -> -> "region":"region8", -> -> "resource":"0xf1f0000000", -> -> "size":"1024.00 MiB (1073.74 MB)", -> -> "interleave_ways":4, -> -> "interleave_granularity":256, -> -> "decode_state":"commit", -> -> "mappings":[ -> -> { -> -> "position":3, -> -> "memdev":"mem11", -> -> "decoder":"decoder21.0" -> -> }, -> -> { -> -> "position":2, -> -> "memdev":"mem9", -> -> "decoder":"decoder19.0" -> -> }, -> -> { -> -> "position":1, -> -> "memdev":"mem10", -> -> "decoder":"decoder20.0" -> -> }, -> -> { -> -> "position":0, -> -> "memdev":"mem12", -> -> "decoder":"decoder22.0" -> -> } -> -> ] -> -> } -> -> cxl region: cmd_create_region: created 1 region -> -> -> -> > Did the commit_store() crash stop reproducing with latest cxl/preview -> -> > branch? -> -> -> -> I missed the answer to this question. -> -> -> -> All of these changes are now in Linus' tree perhaps give that a try and -> -> post the debug log again? -> -> -Hi Dan, -> -> -I've moved onto looking at this one. -> -1 HB, 2RP (to make it configure the HDM decoder in the QEMU HB, I'll tidy -> -that up -> -at some stage), 1 switch, 4 downstream switch ports each with a type 3 -> -> -I'm not getting a crash, but can't successfully setup a region. -> -Upon adding the final target -> -It's failing in check_last_peer() as pos < distance. -> -Seems distance is 4 which makes me think it's using the wrong level of the -> -heirarchy for -> -some reason or that distance check is wrong. -> -Wasn't a good idea to just skip that step though as it goes boom - though -> -stack trace is not useful. -Turns out really weird corruption happens if you accidentally back two type3 -devices -with the same memory device. Who would have thought it :) - -That aside ignoring the check_last_peer() failure seems to make everything work -for this -topology. I'm not seeing the crash, so my guess is we fixed it somewhere along -the way. - -Now for the fun one. I've replicated the crash if we have - -1HB 1*RP 1SW, 4SW-DSP, 4Type3 - -Now, I'd expect to see it not 'work' because the QEMU HDM decoder won't be -programmed -but the null pointer dereference isn't related to that. - -The bug is straight forward. Not all decoders have commit callbacks... Will -send out -a possible fix shortly. 
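For reference, a condensed sketch of that triggering layout, reusing the -device syntax from Bobo's
command line earlier in the thread; the cxl-mem*/cxl-lsa* backend objects and the rest of the
invocation are assumed to be exactly as given there. The structural point is the single root port
under the host bridge:

  # Sketch only: 1 HB, 1 RP, 1 switch, 4 switch DSPs, 4 type-3 devices;
  # swport1..3 and cxl-pmem1..3 follow the same pattern as in Bobo's command.
  # With a single root port the QEMU host-bridge HDM decoder also goes
  # unprogrammed, but as noted above that is separate from the NULL dereference.
  -device pxb-cxl,bus_nr=12,bus=pcie.0,id=cxl.1 \
  -device cxl-rp,port=0,bus=cxl.1,id=root_port0,chassis=0,slot=0 \
  -device cxl-upstream,bus=root_port0,id=us0 \
  -device cxl-downstream,port=0,bus=us0,id=swport0,chassis=0,slot=4 \
  -device cxl-type3,bus=swport0,memdev=cxl-mem0,lsa=cxl-lsa0,id=cxl-pmem0
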
- -Jonathan - - - -> -> -Jonathan -> -> -> -> -> -> - -On Thu, 18 Aug 2022 17:37:40 +0100 -Jonathan Cameron via wrote: - -> -On Wed, 17 Aug 2022 17:16:19 +0100 -> -Jonathan Cameron wrote: -> -> -> On Thu, 11 Aug 2022 17:46:55 -0700 -> -> Dan Williams wrote: -> -> -> -> > Dan Williams wrote: -> -> > > Bobo WL wrote: -> -> > > > Hi Dan, -> -> > > > -> -> > > > Thanks for your reply! -> -> > > > -> -> > > > On Mon, Aug 8, 2022 at 11:58 PM Dan Williams -> -> > > > wrote: -> -> > > > > -> -> > > > > What is the output of: -> -> > > > > -> -> > > > > cxl list -MDTu -d decoder0.0 -> -> > > > > -> -> > > > > ...? It might be the case that mem1 cannot be mapped by decoder0.0, -> -> > > > > or -> -> > > > > at least not in the specified order, or that validation check is -> -> > > > > broken. -> -> > > > -> -> > > > Command "cxl list -MDTu -d decoder0.0" output: -> -> > > -> -> > > Thanks for this, I think I know the problem, but will try some -> -> > > experiments with cxl_test first. -> -> > -> -> > Hmm, so my cxl_test experiment unfortunately passed so I'm not -> -> > reproducing the failure mode. This is the result of creating x4 region -> -> > with devices directly attached to a single host-bridge: -> -> > -> -> > # cxl create-region -d decoder3.5 -w 4 -m -g 256 mem{12,10,9,11} -s -> -> > $((1<<30)) -> -> > { -> -> > "region":"region8", -> -> > "resource":"0xf1f0000000", -> -> > "size":"1024.00 MiB (1073.74 MB)", -> -> > "interleave_ways":4, -> -> > "interleave_granularity":256, -> -> > "decode_state":"commit", -> -> > "mappings":[ -> -> > { -> -> > "position":3, -> -> > "memdev":"mem11", -> -> > "decoder":"decoder21.0" -> -> > }, -> -> > { -> -> > "position":2, -> -> > "memdev":"mem9", -> -> > "decoder":"decoder19.0" -> -> > }, -> -> > { -> -> > "position":1, -> -> > "memdev":"mem10", -> -> > "decoder":"decoder20.0" -> -> > }, -> -> > { -> -> > "position":0, -> -> > "memdev":"mem12", -> -> > "decoder":"decoder22.0" -> -> > } -> -> > ] -> -> > } -> -> > cxl region: cmd_create_region: created 1 region -> -> > -> -> > > Did the commit_store() crash stop reproducing with latest cxl/preview -> -> > > branch? -> -> > -> -> > I missed the answer to this question. -> -> > -> -> > All of these changes are now in Linus' tree perhaps give that a try and -> -> > post the debug log again? -> -> -> -> Hi Dan, -> -> -> -> I've moved onto looking at this one. -> -> 1 HB, 2RP (to make it configure the HDM decoder in the QEMU HB, I'll tidy -> -> that up -> -> at some stage), 1 switch, 4 downstream switch ports each with a type 3 -> -> -> -> I'm not getting a crash, but can't successfully setup a region. -> -> Upon adding the final target -> -> It's failing in check_last_peer() as pos < distance. -> -> Seems distance is 4 which makes me think it's using the wrong level of the -> -> heirarchy for -> -> some reason or that distance check is wrong. -> -> Wasn't a good idea to just skip that step though as it goes boom - though -> -> stack trace is not useful. -> -> -Turns out really weird corruption happens if you accidentally back two type3 -> -devices -> -with the same memory device. Who would have thought it :) -> -> -That aside ignoring the check_last_peer() failure seems to make everything -> -work for this -> -topology. I'm not seeing the crash, so my guess is we fixed it somewhere -> -along the way. -> -> -Now for the fun one. 
I've replicated the crash if we have -> -> -1HB 1*RP 1SW, 4SW-DSP, 4Type3 -> -> -Now, I'd expect to see it not 'work' because the QEMU HDM decoder won't be -> -programmed -> -but the null pointer dereference isn't related to that. -> -> -The bug is straight forward. Not all decoders have commit callbacks... Will -> -send out -> -a possible fix shortly. -> -For completeness I'm carrying this hack because I haven't gotten my head -around the right fix for check_last_peer() failing on this test topology. - -diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c -index c49d9a5f1091..275e143bd748 100644 ---- a/drivers/cxl/core/region.c -+++ b/drivers/cxl/core/region.c -@@ -978,7 +978,7 @@ static int cxl_port_setup_targets(struct cxl_port *port, - rc = check_last_peer(cxled, ep, cxl_rr, - distance); - if (rc) -- return rc; -+ // return rc; - goto out_target_set; - } - goto add_target; --- - -I might find more bugs with more testing, but this is all the ones I've -seen so far + in Bobo's reports. Qemu fixes are now in upstream so -will be there in the release. - -As a reminder, testing on QEMU has a few corners... - -Need a patch to add serial number ECAP support. It is on list for revew, -but will have wait for after QEMU 7.1 release (which may be next week) - -QEMU still assumes HDM decoder on the host bridge will be programmed. -So if you want anything to work there should be at least -2 RP below the HB (no need to plug anything in to one of them). - -I don't want to add a commandline parameter to hide the decoder in QEMU -and detecting there is only one RP would require moving a bunch of static -stuff into runtime code (I think). - -I still think we should make the kernel check to see if there is a decoder, -but if not I might see how bad a hack it is to have QEMU ignore that decoder -if not committed in this one special case (HB HDM decoder with only one place -it can send stuff). Obviously that would be a break from specification -so less than idea! - -Thanks, - -Jonathan - -On Fri, 19 Aug 2022 09:46:55 +0100 -Jonathan Cameron wrote: - -> -On Thu, 18 Aug 2022 17:37:40 +0100 -> -Jonathan Cameron via wrote: -> -> -> On Wed, 17 Aug 2022 17:16:19 +0100 -> -> Jonathan Cameron wrote: -> -> -> -> > On Thu, 11 Aug 2022 17:46:55 -0700 -> -> > Dan Williams wrote: -> -> > -> -> > > Dan Williams wrote: -> -> > > > Bobo WL wrote: -> -> > > > > Hi Dan, -> -> > > > > -> -> > > > > Thanks for your reply! -> -> > > > > -> -> > > > > On Mon, Aug 8, 2022 at 11:58 PM Dan Williams -> -> > > > > wrote: -> -> > > > > > -> -> > > > > > What is the output of: -> -> > > > > > -> -> > > > > > cxl list -MDTu -d decoder0.0 -> -> > > > > > -> -> > > > > > ...? It might be the case that mem1 cannot be mapped by -> -> > > > > > decoder0.0, or -> -> > > > > > at least not in the specified order, or that validation check is -> -> > > > > > broken. -> -> > > > > -> -> > > > > Command "cxl list -MDTu -d decoder0.0" output: -> -> > > > -> -> > > > Thanks for this, I think I know the problem, but will try some -> -> > > > experiments with cxl_test first. -> -> > > -> -> > > Hmm, so my cxl_test experiment unfortunately passed so I'm not -> -> > > reproducing the failure mode. 
This is the result of creating x4 region -> -> > > with devices directly attached to a single host-bridge: -> -> > > -> -> > > # cxl create-region -d decoder3.5 -w 4 -m -g 256 mem{12,10,9,11} -s -> -> > > $((1<<30)) -> -> > > { -> -> > > "region":"region8", -> -> > > "resource":"0xf1f0000000", -> -> > > "size":"1024.00 MiB (1073.74 MB)", -> -> > > "interleave_ways":4, -> -> > > "interleave_granularity":256, -> -> > > "decode_state":"commit", -> -> > > "mappings":[ -> -> > > { -> -> > > "position":3, -> -> > > "memdev":"mem11", -> -> > > "decoder":"decoder21.0" -> -> > > }, -> -> > > { -> -> > > "position":2, -> -> > > "memdev":"mem9", -> -> > > "decoder":"decoder19.0" -> -> > > }, -> -> > > { -> -> > > "position":1, -> -> > > "memdev":"mem10", -> -> > > "decoder":"decoder20.0" -> -> > > }, -> -> > > { -> -> > > "position":0, -> -> > > "memdev":"mem12", -> -> > > "decoder":"decoder22.0" -> -> > > } -> -> > > ] -> -> > > } -> -> > > cxl region: cmd_create_region: created 1 region -> -> > > -> -> > > > Did the commit_store() crash stop reproducing with latest cxl/preview -> -> > > > branch? -> -> > > -> -> > > I missed the answer to this question. -> -> > > -> -> > > All of these changes are now in Linus' tree perhaps give that a try and -> -> > > post the debug log again? -> -> > -> -> > Hi Dan, -> -> > -> -> > I've moved onto looking at this one. -> -> > 1 HB, 2RP (to make it configure the HDM decoder in the QEMU HB, I'll tidy -> -> > that up -> -> > at some stage), 1 switch, 4 downstream switch ports each with a type 3 -> -> > -> -> > I'm not getting a crash, but can't successfully setup a region. -> -> > Upon adding the final target -> -> > It's failing in check_last_peer() as pos < distance. -> -> > Seems distance is 4 which makes me think it's using the wrong level of -> -> > the heirarchy for -> -> > some reason or that distance check is wrong. -> -> > Wasn't a good idea to just skip that step though as it goes boom - though -> -> > stack trace is not useful. -> -> -> -> Turns out really weird corruption happens if you accidentally back two -> -> type3 devices -> -> with the same memory device. Who would have thought it :) -> -> -> -> That aside ignoring the check_last_peer() failure seems to make everything -> -> work for this -> -> topology. I'm not seeing the crash, so my guess is we fixed it somewhere -> -> along the way. -> -> -> -> Now for the fun one. I've replicated the crash if we have -> -> -> -> 1HB 1*RP 1SW, 4SW-DSP, 4Type3 -> -> -> -> Now, I'd expect to see it not 'work' because the QEMU HDM decoder won't be -> -> programmed -> -> but the null pointer dereference isn't related to that. -> -> -> -> The bug is straight forward. Not all decoders have commit callbacks... -> -> Will send out -> -> a possible fix shortly. -> -> -> -For completeness I'm carrying this hack because I haven't gotten my head -> -around the right fix for check_last_peer() failing on this test topology. -> -> -diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c -> -index c49d9a5f1091..275e143bd748 100644 -> ---- a/drivers/cxl/core/region.c -> -+++ b/drivers/cxl/core/region.c -> -@@ -978,7 +978,7 @@ static int cxl_port_setup_targets(struct cxl_port *port, -> -rc = check_last_peer(cxled, ep, cxl_rr, -> -distance); -> -if (rc) -> -- return rc; -> -+ // return rc; -> -goto out_target_set; -> -} -> -goto add_target; -I'm still carrying this hack and still haven't worked out the right fix. - -Suggestions welcome! If not I'll hopefully get some time on this -towards the end of the week. 
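
Separately, to spell out the "at least 2 RP below the HB" workaround referenced earlier in the
thread: per Jonathan's explanation, with only one root port the kernel assumes there is no
host-bridge HDM decoder while QEMU provides (and expects) one, so adding a second, unpopulated root
port is enough to get that decoder programmed. A sketch against Bobo's command line; the
port/chassis/slot values for the extra root port are picked arbitrarily here:

  -device pxb-cxl,bus_nr=12,bus=pcie.0,id=cxl.1 \
  -device cxl-rp,port=0,bus=cxl.1,id=root_port0,chassis=0,slot=0 \
  -device cxl-rp,port=1,bus=cxl.1,id=root_port1,chassis=0,slot=1
  # nothing needs to be plugged into root_port1; the switch and the
  # type-3 devices stay attached under root_port0 as before
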
- -Jonathan - diff --git a/classification_output/01/instruction/2880487 b/classification_output/01/instruction/2880487 deleted file mode 100644 index 1d455d6fa..000000000 --- a/classification_output/01/instruction/2880487 +++ /dev/null @@ -1,187 +0,0 @@ -instruction: 0.925 -semantic: 0.924 -other: 0.894 -mistranslation: 0.826 - -[BUG] AArch64 boot hang with -icount and -smp >1 (iothread locking issue?) - -Hello, - -I am encountering one or more bugs when using -icount and -smp >1 that I am -attempting to sort out. My current theory is that it is an iothread locking -issue. - -I am using a command-line like the following where $kernel is a recent upstream -AArch64 Linux kernel Image (I can provide a binary if that would be helpful - -let me know how is best to post): - - qemu-system-aarch64 \ - -M virt -cpu cortex-a57 -m 1G \ - -nographic \ - -smp 2 \ - -icount 0 \ - -kernel $kernel - -For any/all of the symptoms described below, they seem to disappear when I -either remove `-icount 0` or change smp to `-smp 1`. In other words, it is the -combination of `-smp >1` and `-icount` which triggers what I'm seeing. - -I am seeing two different (but seemingly related) behaviors. The first (and -what I originally started debugging) shows up as a boot hang. When booting -using the above command after Peter's "icount: Take iothread lock when running -QEMU timers" patch [1], The kernel boots for a while and then hangs after: - -> -...snip... -> -[ 0.010764] Serial: AMBA PL011 UART driver -> -[ 0.016334] 9000000.pl011: ttyAMA0 at MMIO 0x9000000 (irq = 13, base_baud -> -= 0) is a PL011 rev1 -> -[ 0.016907] printk: console [ttyAMA0] enabled -> -[ 0.017624] KASLR enabled -> -[ 0.031986] HugeTLB: registered 16.0 GiB page size, pre-allocated 0 pages -> -[ 0.031986] HugeTLB: 16320 KiB vmemmap can be freed for a 16.0 GiB page -> -[ 0.031986] HugeTLB: registered 512 MiB page size, pre-allocated 0 pages -> -[ 0.031986] HugeTLB: 448 KiB vmemmap can be freed for a 512 MiB page -> -[ 0.031986] HugeTLB: registered 2.00 MiB page size, pre-allocated 0 pages -> -[ 0.031986] HugeTLB: 0 KiB vmemmap can be freed for a 2.00 MiB page -When it hangs here, I drop into QEMU's console, attach to the gdbserver, and it -always reports that it is at address 0xffff800008dc42e8 (as shown below from an -objdump of the vmlinux). 
I note this is in the middle of messing with timer -system registers - which makes me suspect we're attempting to take the iothread -lock when its already held: - -> -ffff800008dc42b8 : -> -ffff800008dc42b8: d503201f nop -> -ffff800008dc42bc: d503201f nop -> -ffff800008dc42c0: d503233f paciasp -> -ffff800008dc42c4: d53be321 mrs x1, cntv_ctl_el0 -> -ffff800008dc42c8: 32000021 orr w1, w1, #0x1 -> -ffff800008dc42cc: d5033fdf isb -> -ffff800008dc42d0: d53be042 mrs x2, cntvct_el0 -> -ffff800008dc42d4: ca020043 eor x3, x2, x2 -> -ffff800008dc42d8: 8b2363e3 add x3, sp, x3 -> -ffff800008dc42dc: f940007f ldr xzr, [x3] -> -ffff800008dc42e0: 8b020000 add x0, x0, x2 -> -ffff800008dc42e4: d51be340 msr cntv_cval_el0, x0 -> -* ffff800008dc42e8: 927ef820 and x0, x1, #0xfffffffffffffffd -> -ffff800008dc42ec: d51be320 msr cntv_ctl_el0, x0 -> -ffff800008dc42f0: d5033fdf isb -> -ffff800008dc42f4: 52800000 mov w0, #0x0 -> -// #0 -> -ffff800008dc42f8: d50323bf autiasp -> -ffff800008dc42fc: d65f03c0 ret -The second behavior is that prior to Peter's "icount: Take iothread lock when -running QEMU timers" patch [1], I observe the following message (same command -as above): - -> -ERROR:../accel/tcg/tcg-accel-ops.c:79:tcg_handle_interrupt: assertion failed: -> -(qemu_mutex_iothread_locked()) -> -Aborted (core dumped) -This is the same behavior described in Gitlab issue 1130 [0] and addressed by -[1]. I bisected the appearance of this assertion, and found it was introduced -by Pavel's "replay: rewrite async event handling" commit [2]. Commits prior to -that one boot successfully (neither assertions nor hangs) with `-icount 0 -smp -2`. - -I've looked over these two commits ([1], [2]), but it is not obvious to me -how/why they might be interacting to produce the boot hangs I'm seeing and -I welcome any help investigating further. - -Thanks! - --Aaron Lindsay - -[0] - -https://gitlab.com/qemu-project/qemu/-/issues/1130 -[1] - -https://gitlab.com/qemu-project/qemu/-/commit/c7f26ded6d5065e4116f630f6a490b55f6c5f58e -[2] - -https://gitlab.com/qemu-project/qemu/-/commit/60618e2d77691e44bb78e23b2b0cf07b5c405e56 - -On Fri, 21 Oct 2022 at 16:48, Aaron Lindsay - wrote: -> -> -Hello, -> -> -I am encountering one or more bugs when using -icount and -smp >1 that I am -> -attempting to sort out. My current theory is that it is an iothread locking -> -issue. -Weird coincidence, that is a bug that's been in the tree for months -but was only reported to me earlier this week. Try reverting -commit a82fd5a4ec24d923ff1e -- that should fix it. -CAFEAcA_i8x00hD-4XX18ySLNbCB6ds1-DSazVb4yDnF8skjd9A@mail.gmail.com -/">https://lore.kernel.org/qemu-devel/ -CAFEAcA_i8x00hD-4XX18ySLNbCB6ds1-DSazVb4yDnF8skjd9A@mail.gmail.com -/ -has the explanation. - -thanks --- PMM - -On Oct 21 17:00, Peter Maydell wrote: -> -On Fri, 21 Oct 2022 at 16:48, Aaron Lindsay -> - wrote: -> -> -> -> Hello, -> -> -> -> I am encountering one or more bugs when using -icount and -smp >1 that I am -> -> attempting to sort out. My current theory is that it is an iothread locking -> -> issue. -> -> -Weird coincidence, that is a bug that's been in the tree for months -> -but was only reported to me earlier this week. Try reverting -> -commit a82fd5a4ec24d923ff1e -- that should fix it. -I can confirm that reverting a82fd5a4ec24d923ff1e fixes it for me. -Thanks for the help and fast response! 
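
In case anyone else trips over this before a proper fix lands, the workaround boils down to
reverting that commit and re-running the same reproducer. A rough sketch, assuming a QEMU source
checkout with an out-of-tree "build" directory (adjust the rebuild step to however your tree is
built):

  # revert the suspect commit (id abbreviated as given above), rebuild, retest
  git revert a82fd5a4ec24d923ff1e
  ninja -C build qemu-system-aarch64
  ./build/qemu-system-aarch64 \
      -M virt -cpu cortex-a57 -m 1G \
      -nographic \
      -smp 2 \
      -icount 0 \
      -kernel $kernel
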
- --Aaron - diff --git a/classification_output/01/instruction/33802194 b/classification_output/01/instruction/33802194 new file mode 100644 index 000000000..b8e563ad9 --- /dev/null +++ b/classification_output/01/instruction/33802194 @@ -0,0 +1,4939 @@ +instruction: 0.693 +mistranslation: 0.687 +semantic: 0.656 +other: 0.637 + +[BUG] cxl can not create region + +Hi list + +I want to test cxl functions in arm64, and found some problems I can't +figure out. + +My test environment: + +1. build latest bios from +https://github.com/tianocore/edk2.git +master +branch(cc2db6ebfb6d9d85ba4c7b35fba1fa37fffc0bc2) +2. build latest qemu-system-aarch64 from git://git.qemu.org/qemu.git +master branch(846dcf0ba4eff824c295f06550b8673ff3f31314). With cxl arm +support patch: +https://patchwork.kernel.org/project/cxl/cover/20220616141950.23374-1-Jonathan.Cameron@huawei.com/ +3. build Linux kernel from +https://git.kernel.org/pub/scm/linux/kernel/git/cxl/cxl.git +preview +branch(65fc1c3d26b96002a5aa1f4012fae4dc98fd5683) +4. build latest ndctl tools from +https://github.com/pmem/ndctl +create_region branch(8558b394e449779e3a4f3ae90fae77ede0bca159) + +And my qemu test commands: +sudo $QEMU_BIN -M virt,gic-version=3,cxl=on -m 4g,maxmem=8G,slots=8 \ + -cpu max -smp 8 -nographic -no-reboot \ + -kernel $KERNEL -bios $BIOS_BIN \ + -drive if=none,file=$ROOTFS,format=qcow2,id=hd \ + -device virtio-blk-pci,drive=hd -append 'root=/dev/vda1 +nokaslr dyndbg="module cxl* +p"' \ + -object memory-backend-ram,size=4G,id=mem0 \ + -numa node,nodeid=0,cpus=0-7,memdev=mem0 \ + -net nic -net user,hostfwd=tcp::2222-:22 -enable-kvm \ + -object +memory-backend-file,id=cxl-mem0,share=on,mem-path=/tmp/cxltest.raw,size=256M +\ + -object +memory-backend-file,id=cxl-mem1,share=on,mem-path=/tmp/cxltest1.raw,size=256M +\ + -object +memory-backend-file,id=cxl-mem2,share=on,mem-path=/tmp/cxltest2.raw,size=256M +\ + -object +memory-backend-file,id=cxl-mem3,share=on,mem-path=/tmp/cxltest3.raw,size=256M +\ + -object +memory-backend-file,id=cxl-lsa0,share=on,mem-path=/tmp/lsa0.raw,size=256M +\ + -object +memory-backend-file,id=cxl-lsa1,share=on,mem-path=/tmp/lsa1.raw,size=256M +\ + -object +memory-backend-file,id=cxl-lsa2,share=on,mem-path=/tmp/lsa2.raw,size=256M +\ + -object +memory-backend-file,id=cxl-lsa3,share=on,mem-path=/tmp/lsa3.raw,size=256M +\ + -device pxb-cxl,bus_nr=12,bus=pcie.0,id=cxl.1 \ + -device cxl-rp,port=0,bus=cxl.1,id=root_port0,chassis=0,slot=0 \ + -device cxl-upstream,bus=root_port0,id=us0 \ + -device cxl-downstream,port=0,bus=us0,id=swport0,chassis=0,slot=4 \ + -device +cxl-type3,bus=swport0,memdev=cxl-mem0,lsa=cxl-lsa0,id=cxl-pmem0 \ + -device cxl-downstream,port=1,bus=us0,id=swport1,chassis=0,slot=5 \ + -device +cxl-type3,bus=swport1,memdev=cxl-mem1,lsa=cxl-lsa1,id=cxl-pmem1 \ + -device cxl-downstream,port=2,bus=us0,id=swport2,chassis=0,slot=6 \ + -device +cxl-type3,bus=swport2,memdev=cxl-mem2,lsa=cxl-lsa2,id=cxl-pmem2 \ + -device cxl-downstream,port=3,bus=us0,id=swport3,chassis=0,slot=7 \ + -device +cxl-type3,bus=swport3,memdev=cxl-mem3,lsa=cxl-lsa3,id=cxl-pmem3 \ + -M +cxl-fmw.0.targets.0=cxl.1,cxl-fmw.0.size=4G,cxl-fmw.0.interleave-granularity=4k + +And I have got two problems. +1. When I want to create x1 region with command: "cxl create-region -d +decoder0.0 -w 1 -g 4096 mem0", kernel crashed with null pointer +reference. 
Crash log: + +[ 534.697324] cxl_region region0: config state: 0 +[ 534.697346] cxl_region region0: probe: -6 +[ 534.697368] cxl_acpi ACPI0017:00: decoder0.0: created region0 +[ 534.699115] cxl region0: mem0:endpoint3 decoder3.0 add: +mem0:decoder3.0 @ 0 next: none nr_eps: 1 nr_targets: 1 +[ 534.699149] cxl region0: 0000:0d:00.0:port2 decoder2.0 add: +mem0:decoder3.0 @ 0 next: mem0 nr_eps: 1 nr_targets: 1 +[ 534.699167] cxl region0: ACPI0016:00:port1 decoder1.0 add: +mem0:decoder3.0 @ 0 next: 0000:0d:00.0 nr_eps: 1 nr_targets: 1 +[ 534.699176] cxl region0: ACPI0016:00:port1 iw: 1 ig: 256 +[ 534.699182] cxl region0: ACPI0016:00:port1 target[0] = 0000:0c:00.0 +for mem0:decoder3.0 @ 0 +[ 534.699189] cxl region0: 0000:0d:00.0:port2 iw: 1 ig: 256 +[ 534.699193] cxl region0: 0000:0d:00.0:port2 target[0] = +0000:0e:00.0 for mem0:decoder3.0 @ 0 +[ 534.699405] Unable to handle kernel NULL pointer dereference at +virtual address 0000000000000000 +[ 534.701474] Mem abort info: +[ 534.701994] ESR = 0x0000000086000004 +[ 534.702653] EC = 0x21: IABT (current EL), IL = 32 bits +[ 534.703616] SET = 0, FnV = 0 +[ 534.704174] EA = 0, S1PTW = 0 +[ 534.704803] FSC = 0x04: level 0 translation fault +[ 534.705694] user pgtable: 4k pages, 48-bit VAs, pgdp=000000010144a000 +[ 534.706875] [0000000000000000] pgd=0000000000000000, p4d=0000000000000000 +[ 534.709855] Internal error: Oops: 86000004 [#1] PREEMPT SMP +[ 534.710301] Modules linked in: +[ 534.710546] CPU: 7 PID: 331 Comm: cxl Not tainted +5.19.0-rc3-00064-g65fc1c3d26b9-dirty #11 +[ 534.715393] Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 +[ 534.717179] pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) +[ 534.719190] pc : 0x0 +[ 534.719928] lr : commit_store+0x118/0x2cc +[ 534.721007] sp : ffff80000aec3c30 +[ 534.721793] x29: ffff80000aec3c30 x28: ffff0000da62e740 x27: ffff0000c0c06b30 +[ 534.723875] x26: 0000000000000000 x25: ffff0000c0a2a400 x24: ffff0000c0a29400 +[ 534.725440] x23: 0000000000000003 x22: 0000000000000000 x21: ffff0000c0c06800 +[ 534.727312] x20: 0000000000000000 x19: ffff0000c1559800 x18: 0000000000000000 +[ 534.729138] x17: 0000000000000000 x16: 0000000000000000 x15: 0000ffffd41fe838 +[ 534.731046] x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000 +[ 534.732402] x11: 0000000000000000 x10: 0000000000000000 x9 : 0000000000000000 +[ 534.734432] x8 : 0000000000000000 x7 : 0000000000000000 x6 : ffff0000c0906e80 +[ 534.735921] x5 : 0000000000000000 x4 : 0000000000000000 x3 : ffff80000aec3bf0 +[ 534.737437] x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffff0000c155a000 +[ 534.738878] Call trace: +[ 534.739368] 0x0 +[ 534.739713] dev_attr_store+0x1c/0x30 +[ 534.740186] sysfs_kf_write+0x48/0x58 +[ 534.740961] kernfs_fop_write_iter+0x128/0x184 +[ 534.741872] new_sync_write+0xdc/0x158 +[ 534.742706] vfs_write+0x1ac/0x2a8 +[ 534.743440] ksys_write+0x68/0xf0 +[ 534.744328] __arm64_sys_write+0x1c/0x28 +[ 534.745180] invoke_syscall+0x44/0xf0 +[ 534.745989] el0_svc_common+0x4c/0xfc +[ 534.746661] do_el0_svc+0x60/0xa8 +[ 534.747378] el0_svc+0x2c/0x78 +[ 534.748066] el0t_64_sync_handler+0xb8/0x12c +[ 534.748919] el0t_64_sync+0x18c/0x190 +[ 534.749629] Code: bad PC value +[ 534.750169] ---[ end trace 0000000000000000 ]--- + +2. When I want to create x4 region with command: "cxl create-region -d +decoder0.0 -w 4 -g 4096 -m mem0 mem1 mem2 mem3". 
I got below errors: + +cxl region: create_region: region0: failed to set target3 to mem3 +cxl region: cmd_create_region: created 0 regions + +And kernel log as below: +[ 60.536663] cxl_region region0: config state: 0 +[ 60.536675] cxl_region region0: probe: -6 +[ 60.536696] cxl_acpi ACPI0017:00: decoder0.0: created region0 +[ 60.538251] cxl region0: mem0:endpoint3 decoder3.0 add: +mem0:decoder3.0 @ 0 next: none nr_eps: 1 nr_targets: 1 +[ 60.538278] cxl region0: 0000:0d:00.0:port2 decoder2.0 add: +mem0:decoder3.0 @ 0 next: mem0 nr_eps: 1 nr_targets: 1 +[ 60.538295] cxl region0: ACPI0016:00:port1 decoder1.0 add: +mem0:decoder3.0 @ 0 next: 0000:0d:00.0 nr_eps: 1 nr_targets: 1 +[ 60.538647] cxl region0: mem1:endpoint4 decoder4.0 add: +mem1:decoder4.0 @ 1 next: none nr_eps: 1 nr_targets: 1 +[ 60.538663] cxl region0: 0000:0d:00.0:port2 decoder2.0 add: +mem1:decoder4.0 @ 1 next: mem1 nr_eps: 2 nr_targets: 2 +[ 60.538675] cxl region0: ACPI0016:00:port1 decoder1.0 add: +mem1:decoder4.0 @ 1 next: 0000:0d:00.0 nr_eps: 2 nr_targets: 1 +[ 60.539311] cxl region0: mem2:endpoint5 decoder5.0 add: +mem2:decoder5.0 @ 2 next: none nr_eps: 1 nr_targets: 1 +[ 60.539332] cxl region0: 0000:0d:00.0:port2 decoder2.0 add: +mem2:decoder5.0 @ 2 next: mem2 nr_eps: 3 nr_targets: 3 +[ 60.539343] cxl region0: ACPI0016:00:port1 decoder1.0 add: +mem2:decoder5.0 @ 2 next: 0000:0d:00.0 nr_eps: 3 nr_targets: 1 +[ 60.539711] cxl region0: mem3:endpoint6 decoder6.0 add: +mem3:decoder6.0 @ 3 next: none nr_eps: 1 nr_targets: 1 +[ 60.539723] cxl region0: 0000:0d:00.0:port2 decoder2.0 add: +mem3:decoder6.0 @ 3 next: mem3 nr_eps: 4 nr_targets: 4 +[ 60.539735] cxl region0: ACPI0016:00:port1 decoder1.0 add: +mem3:decoder6.0 @ 3 next: 0000:0d:00.0 nr_eps: 4 nr_targets: 1 +[ 60.539742] cxl region0: ACPI0016:00:port1 iw: 1 ig: 256 +[ 60.539747] cxl region0: ACPI0016:00:port1 target[0] = 0000:0c:00.0 +for mem0:decoder3.0 @ 0 +[ 60.539754] cxl region0: 0000:0d:00.0:port2 iw: 4 ig: 512 +[ 60.539758] cxl region0: 0000:0d:00.0:port2 target[0] = +0000:0e:00.0 for mem0:decoder3.0 @ 0 +[ 60.539764] cxl region0: ACPI0016:00:port1: cannot host mem1:decoder4.0 at 1 + +I have tried to write sysfs node manually, got same errors. + +Hope I can get some helps here. + +Bob + +On Fri, 5 Aug 2022 10:20:23 +0800 +Bobo WL wrote: + +> +Hi list +> +> +I want to test cxl functions in arm64, and found some problems I can't +> +figure out. +Hi Bob, + +Glad to see people testing this code. + +> +> +My test environment: +> +> +1. build latest bios from +https://github.com/tianocore/edk2.git +master +> +branch(cc2db6ebfb6d9d85ba4c7b35fba1fa37fffc0bc2) +> +2. build latest qemu-system-aarch64 from git://git.qemu.org/qemu.git +> +master branch(846dcf0ba4eff824c295f06550b8673ff3f31314). With cxl arm +> +support patch: +> +https://patchwork.kernel.org/project/cxl/cover/20220616141950.23374-1-Jonathan.Cameron@huawei.com/ +> +3. build Linux kernel from +> +https://git.kernel.org/pub/scm/linux/kernel/git/cxl/cxl.git +preview +> +branch(65fc1c3d26b96002a5aa1f4012fae4dc98fd5683) +> +4. 
build latest ndctl tools from +https://github.com/pmem/ndctl +> +create_region branch(8558b394e449779e3a4f3ae90fae77ede0bca159) +> +> +And my qemu test commands: +> +sudo $QEMU_BIN -M virt,gic-version=3,cxl=on -m 4g,maxmem=8G,slots=8 \ +> +-cpu max -smp 8 -nographic -no-reboot \ +> +-kernel $KERNEL -bios $BIOS_BIN \ +> +-drive if=none,file=$ROOTFS,format=qcow2,id=hd \ +> +-device virtio-blk-pci,drive=hd -append 'root=/dev/vda1 +> +nokaslr dyndbg="module cxl* +p"' \ +> +-object memory-backend-ram,size=4G,id=mem0 \ +> +-numa node,nodeid=0,cpus=0-7,memdev=mem0 \ +> +-net nic -net user,hostfwd=tcp::2222-:22 -enable-kvm \ +> +-object +> +memory-backend-file,id=cxl-mem0,share=on,mem-path=/tmp/cxltest.raw,size=256M +> +\ +> +-object +> +memory-backend-file,id=cxl-mem1,share=on,mem-path=/tmp/cxltest1.raw,size=256M +> +\ +> +-object +> +memory-backend-file,id=cxl-mem2,share=on,mem-path=/tmp/cxltest2.raw,size=256M +> +\ +> +-object +> +memory-backend-file,id=cxl-mem3,share=on,mem-path=/tmp/cxltest3.raw,size=256M +> +\ +> +-object +> +memory-backend-file,id=cxl-lsa0,share=on,mem-path=/tmp/lsa0.raw,size=256M +> +\ +> +-object +> +memory-backend-file,id=cxl-lsa1,share=on,mem-path=/tmp/lsa1.raw,size=256M +> +\ +> +-object +> +memory-backend-file,id=cxl-lsa2,share=on,mem-path=/tmp/lsa2.raw,size=256M +> +\ +> +-object +> +memory-backend-file,id=cxl-lsa3,share=on,mem-path=/tmp/lsa3.raw,size=256M +> +\ +> +-device pxb-cxl,bus_nr=12,bus=pcie.0,id=cxl.1 \ +> +-device cxl-rp,port=0,bus=cxl.1,id=root_port0,chassis=0,slot=0 \ +Probably not related to your problem, but there is a disconnect in QEMU / +kernel assumptionsaround the presence of an HDM decoder when a HB only +has a single root port. Spec allows it to be provided or not as an +implementation choice. +Kernel assumes it isn't provide. Qemu assumes it is. + +The temporary solution is to throw in a second root port on the HB and not +connect anything to it. Longer term I may special case this so that the +particular +decoder defaults to pass through settings in QEMU if there is only one root +port. + +> +-device cxl-upstream,bus=root_port0,id=us0 \ +> +-device cxl-downstream,port=0,bus=us0,id=swport0,chassis=0,slot=4 \ +> +-device +> +cxl-type3,bus=swport0,memdev=cxl-mem0,lsa=cxl-lsa0,id=cxl-pmem0 \ +> +-device cxl-downstream,port=1,bus=us0,id=swport1,chassis=0,slot=5 \ +> +-device +> +cxl-type3,bus=swport1,memdev=cxl-mem1,lsa=cxl-lsa1,id=cxl-pmem1 \ +> +-device cxl-downstream,port=2,bus=us0,id=swport2,chassis=0,slot=6 \ +> +-device +> +cxl-type3,bus=swport2,memdev=cxl-mem2,lsa=cxl-lsa2,id=cxl-pmem2 \ +> +-device cxl-downstream,port=3,bus=us0,id=swport3,chassis=0,slot=7 \ +> +-device +> +cxl-type3,bus=swport3,memdev=cxl-mem3,lsa=cxl-lsa3,id=cxl-pmem3 \ +> +-M +> +cxl-fmw.0.targets.0=cxl.1,cxl-fmw.0.size=4G,cxl-fmw.0.interleave-granularity=4k +> +> +And I have got two problems. +> +1. When I want to create x1 region with command: "cxl create-region -d +> +decoder0.0 -w 1 -g 4096 mem0", kernel crashed with null pointer +> +reference. Crash log: +> +> +[ 534.697324] cxl_region region0: config state: 0 +> +[ 534.697346] cxl_region region0: probe: -6 +Seems odd this is up here. But maybe fine. 
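+(As a side note for anyone reproducing this setup: the "second root port"
+workaround described above can be expressed against the command line quoted
+above by adding one more port on the same host bridge, e.g.
+"-device cxl-rp,port=1,bus=cxl.1,id=root_port1,chassis=0,slot=1", and simply
+attaching nothing to it. The port/id/slot values here are only illustrative
+and need to be adjusted to whatever is free in the actual topology.)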
+ +> +[ 534.697368] cxl_acpi ACPI0017:00: decoder0.0: created region0 +> +[ 534.699115] cxl region0: mem0:endpoint3 decoder3.0 add: +> +mem0:decoder3.0 @ 0 next: none nr_eps: 1 nr_targets: 1 +> +[ 534.699149] cxl region0: 0000:0d:00.0:port2 decoder2.0 add: +> +mem0:decoder3.0 @ 0 next: mem0 nr_eps: 1 nr_targets: 1 +> +[ 534.699167] cxl region0: ACPI0016:00:port1 decoder1.0 add: +> +mem0:decoder3.0 @ 0 next: 0000:0d:00.0 nr_eps: 1 nr_targets: 1 +> +[ 534.699176] cxl region0: ACPI0016:00:port1 iw: 1 ig: 256 +> +[ 534.699182] cxl region0: ACPI0016:00:port1 target[0] = 0000:0c:00.0 +> +for mem0:decoder3.0 @ 0 +> +[ 534.699189] cxl region0: 0000:0d:00.0:port2 iw: 1 ig: 256 +> +[ 534.699193] cxl region0: 0000:0d:00.0:port2 target[0] = +> +0000:0e:00.0 for mem0:decoder3.0 @ 0 +> +[ 534.699405] Unable to handle kernel NULL pointer dereference at +> +virtual address 0000000000000000 +> +[ 534.701474] Mem abort info: +> +[ 534.701994] ESR = 0x0000000086000004 +> +[ 534.702653] EC = 0x21: IABT (current EL), IL = 32 bits +> +[ 534.703616] SET = 0, FnV = 0 +> +[ 534.704174] EA = 0, S1PTW = 0 +> +[ 534.704803] FSC = 0x04: level 0 translation fault +> +[ 534.705694] user pgtable: 4k pages, 48-bit VAs, pgdp=000000010144a000 +> +[ 534.706875] [0000000000000000] pgd=0000000000000000, p4d=0000000000000000 +> +[ 534.709855] Internal error: Oops: 86000004 [#1] PREEMPT SMP +> +[ 534.710301] Modules linked in: +> +[ 534.710546] CPU: 7 PID: 331 Comm: cxl Not tainted +> +5.19.0-rc3-00064-g65fc1c3d26b9-dirty #11 +> +[ 534.715393] Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 +> +[ 534.717179] pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) +> +[ 534.719190] pc : 0x0 +> +[ 534.719928] lr : commit_store+0x118/0x2cc +> +[ 534.721007] sp : ffff80000aec3c30 +> +[ 534.721793] x29: ffff80000aec3c30 x28: ffff0000da62e740 x27: +> +ffff0000c0c06b30 +> +[ 534.723875] x26: 0000000000000000 x25: ffff0000c0a2a400 x24: +> +ffff0000c0a29400 +> +[ 534.725440] x23: 0000000000000003 x22: 0000000000000000 x21: +> +ffff0000c0c06800 +> +[ 534.727312] x20: 0000000000000000 x19: ffff0000c1559800 x18: +> +0000000000000000 +> +[ 534.729138] x17: 0000000000000000 x16: 0000000000000000 x15: +> +0000ffffd41fe838 +> +[ 534.731046] x14: 0000000000000000 x13: 0000000000000000 x12: +> +0000000000000000 +> +[ 534.732402] x11: 0000000000000000 x10: 0000000000000000 x9 : +> +0000000000000000 +> +[ 534.734432] x8 : 0000000000000000 x7 : 0000000000000000 x6 : +> +ffff0000c0906e80 +> +[ 534.735921] x5 : 0000000000000000 x4 : 0000000000000000 x3 : +> +ffff80000aec3bf0 +> +[ 534.737437] x2 : 0000000000000000 x1 : 0000000000000000 x0 : +> +ffff0000c155a000 +> +[ 534.738878] Call trace: +> +[ 534.739368] 0x0 +> +[ 534.739713] dev_attr_store+0x1c/0x30 +> +[ 534.740186] sysfs_kf_write+0x48/0x58 +> +[ 534.740961] kernfs_fop_write_iter+0x128/0x184 +> +[ 534.741872] new_sync_write+0xdc/0x158 +> +[ 534.742706] vfs_write+0x1ac/0x2a8 +> +[ 534.743440] ksys_write+0x68/0xf0 +> +[ 534.744328] __arm64_sys_write+0x1c/0x28 +> +[ 534.745180] invoke_syscall+0x44/0xf0 +> +[ 534.745989] el0_svc_common+0x4c/0xfc +> +[ 534.746661] do_el0_svc+0x60/0xa8 +> +[ 534.747378] el0_svc+0x2c/0x78 +> +[ 534.748066] el0t_64_sync_handler+0xb8/0x12c +> +[ 534.748919] el0t_64_sync+0x18c/0x190 +> +[ 534.749629] Code: bad PC value +> +[ 534.750169] ---[ end trace 0000000000000000 ]--- +> +> +2. When I want to create x4 region with command: "cxl create-region -d +> +decoder0.0 -w 4 -g 4096 -m mem0 mem1 mem2 mem3". 
I got below errors: +> +> +cxl region: create_region: region0: failed to set target3 to mem3 +> +cxl region: cmd_create_region: created 0 regions +> +> +And kernel log as below: +> +[ 60.536663] cxl_region region0: config state: 0 +> +[ 60.536675] cxl_region region0: probe: -6 +> +[ 60.536696] cxl_acpi ACPI0017:00: decoder0.0: created region0 +> +[ 60.538251] cxl region0: mem0:endpoint3 decoder3.0 add: +> +mem0:decoder3.0 @ 0 next: none nr_eps: 1 nr_targets: 1 +> +[ 60.538278] cxl region0: 0000:0d:00.0:port2 decoder2.0 add: +> +mem0:decoder3.0 @ 0 next: mem0 nr_eps: 1 nr_targets: 1 +> +[ 60.538295] cxl region0: ACPI0016:00:port1 decoder1.0 add: +> +mem0:decoder3.0 @ 0 next: 0000:0d:00.0 nr_eps: 1 nr_targets: 1 +> +[ 60.538647] cxl region0: mem1:endpoint4 decoder4.0 add: +> +mem1:decoder4.0 @ 1 next: none nr_eps: 1 nr_targets: 1 +> +[ 60.538663] cxl region0: 0000:0d:00.0:port2 decoder2.0 add: +> +mem1:decoder4.0 @ 1 next: mem1 nr_eps: 2 nr_targets: 2 +> +[ 60.538675] cxl region0: ACPI0016:00:port1 decoder1.0 add: +> +mem1:decoder4.0 @ 1 next: 0000:0d:00.0 nr_eps: 2 nr_targets: 1 +> +[ 60.539311] cxl region0: mem2:endpoint5 decoder5.0 add: +> +mem2:decoder5.0 @ 2 next: none nr_eps: 1 nr_targets: 1 +> +[ 60.539332] cxl region0: 0000:0d:00.0:port2 decoder2.0 add: +> +mem2:decoder5.0 @ 2 next: mem2 nr_eps: 3 nr_targets: 3 +> +[ 60.539343] cxl region0: ACPI0016:00:port1 decoder1.0 add: +> +mem2:decoder5.0 @ 2 next: 0000:0d:00.0 nr_eps: 3 nr_targets: 1 +> +[ 60.539711] cxl region0: mem3:endpoint6 decoder6.0 add: +> +mem3:decoder6.0 @ 3 next: none nr_eps: 1 nr_targets: 1 +> +[ 60.539723] cxl region0: 0000:0d:00.0:port2 decoder2.0 add: +> +mem3:decoder6.0 @ 3 next: mem3 nr_eps: 4 nr_targets: 4 +> +[ 60.539735] cxl region0: ACPI0016:00:port1 decoder1.0 add: +> +mem3:decoder6.0 @ 3 next: 0000:0d:00.0 nr_eps: 4 nr_targets: 1 +> +[ 60.539742] cxl region0: ACPI0016:00:port1 iw: 1 ig: 256 +> +[ 60.539747] cxl region0: ACPI0016:00:port1 target[0] = 0000:0c:00.0 +> +for mem0:decoder3.0 @ 0 +> +[ 60.539754] cxl region0: 0000:0d:00.0:port2 iw: 4 ig: 512 +This looks like off by 1 that should be fixed in the below mentioned +cxl/pending branch. That ig should be 256. Note the fix was +for a test case with a fat HB and no switch, but certainly looks +like this is the same issue. + +> +[ 60.539758] cxl region0: 0000:0d:00.0:port2 target[0] = +> +0000:0e:00.0 for mem0:decoder3.0 @ 0 +> +[ 60.539764] cxl region0: ACPI0016:00:port1: cannot host mem1:decoder4.0 at +> +1 +> +> +I have tried to write sysfs node manually, got same errors. +When stepping through by hand, which sysfs write triggers the crash above? + +Not sure it's related, but I've just sent out a fix to the +target register handling in QEMU. +20220808122051.14822-1-Jonathan.Cameron@huawei.com +/T/#m47ff985412ce44559e6b04d677c302f8cd371330">https://lore.kernel.org/linux-cxl/ +20220808122051.14822-1-Jonathan.Cameron@huawei.com +/T/#m47ff985412ce44559e6b04d677c302f8cd371330 +I did have one instance last week of triggering what looked to be a race +condition but +the stack trace doesn't looks related to what you've hit. + +It will probably be a few days before I have time to take a look at replicating +what you have seen. + +If you have time, try using the kernel.org cxl/pending branch as there are +a few additional fixes on there since you sent this email. Optimistic to hope +this is covered by one of those, but at least it will mean we are trying to +replicate +on same branch. + +Jonathan + + +> +> +Hope I can get some helps here. 
+> +> +Bob + +Hi Jonathan + +Thanks for your reply! + +On Mon, Aug 8, 2022 at 8:37 PM Jonathan Cameron + wrote: +> +> +Probably not related to your problem, but there is a disconnect in QEMU / +> +kernel assumptionsaround the presence of an HDM decoder when a HB only +> +has a single root port. Spec allows it to be provided or not as an +> +implementation choice. +> +Kernel assumes it isn't provide. Qemu assumes it is. +> +> +The temporary solution is to throw in a second root port on the HB and not +> +connect anything to it. Longer term I may special case this so that the +> +particular +> +decoder defaults to pass through settings in QEMU if there is only one root +> +port. +> +You are right! After adding an extra HB in qemu, I can create a x1 +region successfully. +But have some errors in Nvdimm: + +[ 74.925838] Unknown online node for memory at 0x10000000000, assuming node 0 +[ 74.925846] Unknown target node for memory at 0x10000000000, assuming node 0 +[ 74.927470] nd_region region0: nmem0: is disabled, failing probe + +And x4 region still failed with same errors, using latest cxl/preview +branch don't work. +I have picked "Two CXL emulation fixes" patches in qemu, still not working. + +Bob + +On Tue, 9 Aug 2022 21:07:06 +0800 +Bobo WL wrote: + +> +Hi Jonathan +> +> +Thanks for your reply! +> +> +On Mon, Aug 8, 2022 at 8:37 PM Jonathan Cameron +> + wrote: +> +> +> +> Probably not related to your problem, but there is a disconnect in QEMU / +> +> kernel assumptionsaround the presence of an HDM decoder when a HB only +> +> has a single root port. Spec allows it to be provided or not as an +> +> implementation choice. +> +> Kernel assumes it isn't provide. Qemu assumes it is. +> +> +> +> The temporary solution is to throw in a second root port on the HB and not +> +> connect anything to it. Longer term I may special case this so that the +> +> particular +> +> decoder defaults to pass through settings in QEMU if there is only one root +> +> port. +> +> +> +> +You are right! After adding an extra HB in qemu, I can create a x1 +> +region successfully. +> +But have some errors in Nvdimm: +> +> +[ 74.925838] Unknown online node for memory at 0x10000000000, assuming node > 0 +> +[ 74.925846] Unknown target node for memory at 0x10000000000, assuming node > 0 +> +[ 74.927470] nd_region region0: nmem0: is disabled, failing probe +Ah. I've seen this one, but not chased it down yet. Was on my todo list to +chase +down. Once I reach this state I can verify the HDM Decode is correct which is +what +I've been using to test (Which wasn't true until earlier this week). +I'm currently testing via devmem, more for historical reasons than because it +makes +that much sense anymore. + +> +> +And x4 region still failed with same errors, using latest cxl/preview +> +branch don't work. +> +I have picked "Two CXL emulation fixes" patches in qemu, still not working. +> +> +Bob + +On Tue, 9 Aug 2022 17:08:25 +0100 +Jonathan Cameron wrote: + +> +On Tue, 9 Aug 2022 21:07:06 +0800 +> +Bobo WL wrote: +> +> +> Hi Jonathan +> +> +> +> Thanks for your reply! +> +> +> +> On Mon, Aug 8, 2022 at 8:37 PM Jonathan Cameron +> +> wrote: +> +> > +> +> > Probably not related to your problem, but there is a disconnect in QEMU / +> +> > kernel assumptionsaround the presence of an HDM decoder when a HB only +> +> > has a single root port. Spec allows it to be provided or not as an +> +> > implementation choice. +> +> > Kernel assumes it isn't provide. Qemu assumes it is. 
+> +> > +> +> > The temporary solution is to throw in a second root port on the HB and not +> +> > connect anything to it. Longer term I may special case this so that the +> +> > particular +> +> > decoder defaults to pass through settings in QEMU if there is only one +> +> > root port. +> +> > +> +> +> +> You are right! After adding an extra HB in qemu, I can create a x1 +> +> region successfully. +> +> But have some errors in Nvdimm: +> +> +> +> [ 74.925838] Unknown online node for memory at 0x10000000000, assuming +> +> node 0 +> +> [ 74.925846] Unknown target node for memory at 0x10000000000, assuming +> +> node 0 +> +> [ 74.927470] nd_region region0: nmem0: is disabled, failing probe +> +> +Ah. I've seen this one, but not chased it down yet. Was on my todo list to +> +chase +> +down. Once I reach this state I can verify the HDM Decode is correct which is +> +what +> +I've been using to test (Which wasn't true until earlier this week). +> +I'm currently testing via devmem, more for historical reasons than because it +> +makes +> +that much sense anymore. +*embarassed cough*. We haven't fully hooked the LSA up in qemu yet. +I'd forgotten that was still on the todo list. I don't think it will +be particularly hard to do and will take a look in next few days. + +Very very indirectly this error is causing a driver probe fail that means that +we hit a code path that has a rather odd looking check on NDD_LABELING. +Should not have gotten near that path though - hence the problem is actually +when we call cxl_pmem_get_config_data() and it returns an error because +we haven't fully connected up the command in QEMU. + +Jonathan + + +> +> +> +> +> And x4 region still failed with same errors, using latest cxl/preview +> +> branch don't work. +> +> I have picked "Two CXL emulation fixes" patches in qemu, still not working. +> +> +> +> Bob + +On Thu, 11 Aug 2022 18:08:57 +0100 +Jonathan Cameron via wrote: + +> +On Tue, 9 Aug 2022 17:08:25 +0100 +> +Jonathan Cameron wrote: +> +> +> On Tue, 9 Aug 2022 21:07:06 +0800 +> +> Bobo WL wrote: +> +> +> +> > Hi Jonathan +> +> > +> +> > Thanks for your reply! +> +> > +> +> > On Mon, Aug 8, 2022 at 8:37 PM Jonathan Cameron +> +> > wrote: +> +> > > +> +> > > Probably not related to your problem, but there is a disconnect in QEMU +> +> > > / +> +> > > kernel assumptionsaround the presence of an HDM decoder when a HB only +> +> > > has a single root port. Spec allows it to be provided or not as an +> +> > > implementation choice. +> +> > > Kernel assumes it isn't provide. Qemu assumes it is. +> +> > > +> +> > > The temporary solution is to throw in a second root port on the HB and +> +> > > not +> +> > > connect anything to it. Longer term I may special case this so that +> +> > > the particular +> +> > > decoder defaults to pass through settings in QEMU if there is only one +> +> > > root port. +> +> > > +> +> > +> +> > You are right! After adding an extra HB in qemu, I can create a x1 +> +> > region successfully. +> +> > But have some errors in Nvdimm: +> +> > +> +> > [ 74.925838] Unknown online node for memory at 0x10000000000, assuming +> +> > node 0 +> +> > [ 74.925846] Unknown target node for memory at 0x10000000000, assuming +> +> > node 0 +> +> > [ 74.927470] nd_region region0: nmem0: is disabled, failing probe +> +> +> +> Ah. I've seen this one, but not chased it down yet. Was on my todo list to +> +> chase +> +> down. 
Once I reach this state I can verify the HDM Decode is correct which +> +> is what +> +> I've been using to test (Which wasn't true until earlier this week). +> +> I'm currently testing via devmem, more for historical reasons than because +> +> it makes +> +> that much sense anymore. +> +> +*embarassed cough*. We haven't fully hooked the LSA up in qemu yet. +> +I'd forgotten that was still on the todo list. I don't think it will +> +be particularly hard to do and will take a look in next few days. +> +> +Very very indirectly this error is causing a driver probe fail that means that +> +we hit a code path that has a rather odd looking check on NDD_LABELING. +> +Should not have gotten near that path though - hence the problem is actually +> +when we call cxl_pmem_get_config_data() and it returns an error because +> +we haven't fully connected up the command in QEMU. +So a least one bug in QEMU. We were not supporting variable length payloads on +mailbox +inputs (but were on outputs). That hasn't mattered until we get to LSA writes. +We just need to relax condition on the supplied length. + +diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c +index c352a935c4..fdda9529fe 100644 +--- a/hw/cxl/cxl-mailbox-utils.c ++++ b/hw/cxl/cxl-mailbox-utils.c +@@ -510,7 +510,7 @@ void cxl_process_mailbox(CXLDeviceState *cxl_dstate) + cxl_cmd = &cxl_cmd_set[set][cmd]; + h = cxl_cmd->handler; + if (h) { +- if (len == cxl_cmd->in) { ++ if (len == cxl_cmd->in || !cxl_cmd->in) { + cxl_cmd->payload = cxl_dstate->mbox_reg_state + + A_CXL_DEV_CMD_PAYLOAD; + ret = (*h)(cxl_cmd, cxl_dstate, &len); + + +This lets the nvdimm/region probe fine, but I'm getting some issues with +namespace capacity so I'll look at what is causing that next. +Unfortunately I'm not that familiar with the driver/nvdimm side of things +so it's take a while to figure out what kicks off what! + +Jonathan + +> +> +Jonathan +> +> +> +> +> +> > +> +> > And x4 region still failed with same errors, using latest cxl/preview +> +> > branch don't work. +> +> > I have picked "Two CXL emulation fixes" patches in qemu, still not +> +> > working. +> +> > +> +> > Bob +> +> + +Jonathan Cameron wrote: +> +On Thu, 11 Aug 2022 18:08:57 +0100 +> +Jonathan Cameron via wrote: +> +> +> On Tue, 9 Aug 2022 17:08:25 +0100 +> +> Jonathan Cameron wrote: +> +> +> +> > On Tue, 9 Aug 2022 21:07:06 +0800 +> +> > Bobo WL wrote: +> +> > +> +> > > Hi Jonathan +> +> > > +> +> > > Thanks for your reply! +> +> > > +> +> > > On Mon, Aug 8, 2022 at 8:37 PM Jonathan Cameron +> +> > > wrote: +> +> > > > +> +> > > > Probably not related to your problem, but there is a disconnect in +> +> > > > QEMU / +> +> > > > kernel assumptionsaround the presence of an HDM decoder when a HB only +> +> > > > has a single root port. Spec allows it to be provided or not as an +> +> > > > implementation choice. +> +> > > > Kernel assumes it isn't provide. Qemu assumes it is. +> +> > > > +> +> > > > The temporary solution is to throw in a second root port on the HB +> +> > > > and not +> +> > > > connect anything to it. Longer term I may special case this so that +> +> > > > the particular +> +> > > > decoder defaults to pass through settings in QEMU if there is only +> +> > > > one root port. +> +> > > > +> +> > > +> +> > > You are right! After adding an extra HB in qemu, I can create a x1 +> +> > > region successfully. 
+> +> > > But have some errors in Nvdimm: +> +> > > +> +> > > [ 74.925838] Unknown online node for memory at 0x10000000000, +> +> > > assuming node 0 +> +> > > [ 74.925846] Unknown target node for memory at 0x10000000000, +> +> > > assuming node 0 +> +> > > [ 74.927470] nd_region region0: nmem0: is disabled, failing probe +> +> > +> +> > Ah. I've seen this one, but not chased it down yet. Was on my todo list +> +> > to chase +> +> > down. Once I reach this state I can verify the HDM Decode is correct +> +> > which is what +> +> > I've been using to test (Which wasn't true until earlier this week). +> +> > I'm currently testing via devmem, more for historical reasons than +> +> > because it makes +> +> > that much sense anymore. +> +> +> +> *embarassed cough*. We haven't fully hooked the LSA up in qemu yet. +> +> I'd forgotten that was still on the todo list. I don't think it will +> +> be particularly hard to do and will take a look in next few days. +> +> +> +> Very very indirectly this error is causing a driver probe fail that means +> +> that +> +> we hit a code path that has a rather odd looking check on NDD_LABELING. +> +> Should not have gotten near that path though - hence the problem is actually +> +> when we call cxl_pmem_get_config_data() and it returns an error because +> +> we haven't fully connected up the command in QEMU. +> +> +So a least one bug in QEMU. We were not supporting variable length payloads +> +on mailbox +> +inputs (but were on outputs). That hasn't mattered until we get to LSA +> +writes. +> +We just need to relax condition on the supplied length. +> +> +diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c +> +index c352a935c4..fdda9529fe 100644 +> +--- a/hw/cxl/cxl-mailbox-utils.c +> ++++ b/hw/cxl/cxl-mailbox-utils.c +> +@@ -510,7 +510,7 @@ void cxl_process_mailbox(CXLDeviceState *cxl_dstate) +> +cxl_cmd = &cxl_cmd_set[set][cmd]; +> +h = cxl_cmd->handler; +> +if (h) { +> +- if (len == cxl_cmd->in) { +> ++ if (len == cxl_cmd->in || !cxl_cmd->in) { +> +cxl_cmd->payload = cxl_dstate->mbox_reg_state + +> +A_CXL_DEV_CMD_PAYLOAD; +> +ret = (*h)(cxl_cmd, cxl_dstate, &len); +> +> +> +This lets the nvdimm/region probe fine, but I'm getting some issues with +> +namespace capacity so I'll look at what is causing that next. +> +Unfortunately I'm not that familiar with the driver/nvdimm side of things +> +so it's take a while to figure out what kicks off what! +The whirlwind tour is that 'struct nd_region' instances that represent a +persitent memory address range are composed of one more mappings of +'struct nvdimm' objects. The nvdimm object is driven by the dimm driver +in drivers/nvdimm/dimm.c. That driver is mainly charged with unlocking +the dimm (if locked) and interrogating the label area to look for +namespace labels. + +The label command calls are routed to the '->ndctl()' callback that was +registered when the CXL nvdimm_bus_descriptor was created. That callback +handles both 'bus' scope calls, currently none for CXL, and per nvdimm +calls. cxl_pmem_nvdimm_ctl() translates those generic LIBNVDIMM commands +to CXL commands. + +The 'struct nvdimm' objects that the CXL side registers have the +NDD_LABELING flag set which means that namespaces need to be explicitly +created / provisioned from region capacity. Otherwise, if +drivers/nvdimm/dimm.c does not find a namespace-label-index block then +the region reverts to label-less mode and a default namespace equal to +the size of the region is instantiated. 
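+To make the routing described above a little more concrete, the sketch below
+mirrors -- roughly, and from memory of the drivers/cxl/pmem.c code of this era,
+so treat it as illustrative rather than authoritative -- how the CXL side wires
+up the LIBNVDIMM bus descriptor's ->ndctl() hook and registers the dimm with
+NDD_LABELING set:
+
+/* Illustrative sketch only: structure follows drivers/cxl/pmem.c but is not
+ * guaranteed to match the exact tree under test. */
+static int cxl_pmem_ctl(struct nvdimm_bus_descriptor *nd_desc,
+                        struct nvdimm *nvdimm, unsigned int cmd,
+                        void *buf, unsigned int buf_len, int *cmd_rc)
+{
+        /* No bus-scope commands for CXL yet; per-nvdimm label commands are
+         * translated into CXL mailbox (LSA) commands. */
+        if (!nvdimm)
+                return -ENOTTY;
+        return cxl_pmem_nvdimm_ctl(nvdimm, cmd, buf, buf_len);
+}
+
+/* Bridge probe: register the LIBNVDIMM bus with the CXL translation hook. */
+        cxl_nvb->nd_desc.provider_name = "CXL";
+        cxl_nvb->nd_desc.module = THIS_MODULE;
+        cxl_nvb->nd_desc.ndctl = cxl_pmem_ctl;
+        cxl_nvb->nvdimm_bus = nvdimm_bus_register(&cxl_nvb->dev,
+                                                  &cxl_nvb->nd_desc);
+
+/* Dimm probe: NDD_LABELING forces explicit namespace provisioning instead of
+ * the label-less fallback described above. */
+        set_bit(NDD_LABELING, &flags);
+        set_bit(ND_CMD_GET_CONFIG_SIZE, &cmd_mask);
+        set_bit(ND_CMD_GET_CONFIG_DATA, &cmd_mask);
+        set_bit(ND_CMD_SET_CONFIG_DATA, &cmd_mask);
+        nvdimm = nvdimm_create(cxl_nvb->nvdimm_bus, cxl_nvd, NULL, flags,
+                               cmd_mask, 0, NULL);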
+ +If you are seeing small mismatches in namespace capacity then it may +just be the fact that by default 'ndctl create-namespace' results in an +'fsdax' mode namespace which just means that it is a block device where +1.5% of the capacity is reserved for 'struct page' metadata. You should +be able to see namespace capacity == region capacity by doing "ndctl +create-namespace -m raw", and disable DAX operation. + +Hope that helps. + +On Fri, 12 Aug 2022 09:03:02 -0700 +Dan Williams wrote: + +> +Jonathan Cameron wrote: +> +> On Thu, 11 Aug 2022 18:08:57 +0100 +> +> Jonathan Cameron via wrote: +> +> +> +> > On Tue, 9 Aug 2022 17:08:25 +0100 +> +> > Jonathan Cameron wrote: +> +> > +> +> > > On Tue, 9 Aug 2022 21:07:06 +0800 +> +> > > Bobo WL wrote: +> +> > > +> +> > > > Hi Jonathan +> +> > > > +> +> > > > Thanks for your reply! +> +> > > > +> +> > > > On Mon, Aug 8, 2022 at 8:37 PM Jonathan Cameron +> +> > > > wrote: +> +> > > > > +> +> > > > > Probably not related to your problem, but there is a disconnect in +> +> > > > > QEMU / +> +> > > > > kernel assumptionsaround the presence of an HDM decoder when a HB +> +> > > > > only +> +> > > > > has a single root port. Spec allows it to be provided or not as an +> +> > > > > implementation choice. +> +> > > > > Kernel assumes it isn't provide. Qemu assumes it is. +> +> > > > > +> +> > > > > The temporary solution is to throw in a second root port on the HB +> +> > > > > and not +> +> > > > > connect anything to it. Longer term I may special case this so +> +> > > > > that the particular +> +> > > > > decoder defaults to pass through settings in QEMU if there is only +> +> > > > > one root port. +> +> > > > > +> +> > > > +> +> > > > You are right! After adding an extra HB in qemu, I can create a x1 +> +> > > > region successfully. +> +> > > > But have some errors in Nvdimm: +> +> > > > +> +> > > > [ 74.925838] Unknown online node for memory at 0x10000000000, +> +> > > > assuming node 0 +> +> > > > [ 74.925846] Unknown target node for memory at 0x10000000000, +> +> > > > assuming node 0 +> +> > > > [ 74.927470] nd_region region0: nmem0: is disabled, failing probe +> +> > > > +> +> > > +> +> > > Ah. I've seen this one, but not chased it down yet. Was on my todo +> +> > > list to chase +> +> > > down. Once I reach this state I can verify the HDM Decode is correct +> +> > > which is what +> +> > > I've been using to test (Which wasn't true until earlier this week). +> +> > > I'm currently testing via devmem, more for historical reasons than +> +> > > because it makes +> +> > > that much sense anymore. +> +> > +> +> > *embarassed cough*. We haven't fully hooked the LSA up in qemu yet. +> +> > I'd forgotten that was still on the todo list. I don't think it will +> +> > be particularly hard to do and will take a look in next few days. +> +> > +> +> > Very very indirectly this error is causing a driver probe fail that means +> +> > that +> +> > we hit a code path that has a rather odd looking check on NDD_LABELING. +> +> > Should not have gotten near that path though - hence the problem is +> +> > actually +> +> > when we call cxl_pmem_get_config_data() and it returns an error because +> +> > we haven't fully connected up the command in QEMU. +> +> +> +> So a least one bug in QEMU. We were not supporting variable length payloads +> +> on mailbox +> +> inputs (but were on outputs). That hasn't mattered until we get to LSA +> +> writes. +> +> We just need to relax condition on the supplied length. 
+> +> +> +> diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c +> +> index c352a935c4..fdda9529fe 100644 +> +> --- a/hw/cxl/cxl-mailbox-utils.c +> +> +++ b/hw/cxl/cxl-mailbox-utils.c +> +> @@ -510,7 +510,7 @@ void cxl_process_mailbox(CXLDeviceState *cxl_dstate) +> +> cxl_cmd = &cxl_cmd_set[set][cmd]; +> +> h = cxl_cmd->handler; +> +> if (h) { +> +> - if (len == cxl_cmd->in) { +> +> + if (len == cxl_cmd->in || !cxl_cmd->in) { +> +> cxl_cmd->payload = cxl_dstate->mbox_reg_state + +> +> A_CXL_DEV_CMD_PAYLOAD; +> +> ret = (*h)(cxl_cmd, cxl_dstate, &len); +> +> +> +> +> +> This lets the nvdimm/region probe fine, but I'm getting some issues with +> +> namespace capacity so I'll look at what is causing that next. +> +> Unfortunately I'm not that familiar with the driver/nvdimm side of things +> +> so it's take a while to figure out what kicks off what! +> +> +The whirlwind tour is that 'struct nd_region' instances that represent a +> +persitent memory address range are composed of one more mappings of +> +'struct nvdimm' objects. The nvdimm object is driven by the dimm driver +> +in drivers/nvdimm/dimm.c. That driver is mainly charged with unlocking +> +the dimm (if locked) and interrogating the label area to look for +> +namespace labels. +> +> +The label command calls are routed to the '->ndctl()' callback that was +> +registered when the CXL nvdimm_bus_descriptor was created. That callback +> +handles both 'bus' scope calls, currently none for CXL, and per nvdimm +> +calls. cxl_pmem_nvdimm_ctl() translates those generic LIBNVDIMM commands +> +to CXL commands. +> +> +The 'struct nvdimm' objects that the CXL side registers have the +> +NDD_LABELING flag set which means that namespaces need to be explicitly +> +created / provisioned from region capacity. Otherwise, if +> +drivers/nvdimm/dimm.c does not find a namespace-label-index block then +> +the region reverts to label-less mode and a default namespace equal to +> +the size of the region is instantiated. +> +> +If you are seeing small mismatches in namespace capacity then it may +> +just be the fact that by default 'ndctl create-namespace' results in an +> +'fsdax' mode namespace which just means that it is a block device where +> +1.5% of the capacity is reserved for 'struct page' metadata. You should +> +be able to see namespace capacity == region capacity by doing "ndctl +> +create-namespace -m raw", and disable DAX operation. +Currently ndctl create-namespace crashes qemu ;) +Which isn't ideal! + +> +> +Hope that helps. +Got me looking at the right code. Thanks! + +Jonathan + +On Fri, 12 Aug 2022 17:15:09 +0100 +Jonathan Cameron wrote: + +> +On Fri, 12 Aug 2022 09:03:02 -0700 +> +Dan Williams wrote: +> +> +> Jonathan Cameron wrote: +> +> > On Thu, 11 Aug 2022 18:08:57 +0100 +> +> > Jonathan Cameron via wrote: +> +> > +> +> > > On Tue, 9 Aug 2022 17:08:25 +0100 +> +> > > Jonathan Cameron wrote: +> +> > > +> +> > > > On Tue, 9 Aug 2022 21:07:06 +0800 +> +> > > > Bobo WL wrote: +> +> > > > +> +> > > > > Hi Jonathan +> +> > > > > +> +> > > > > Thanks for your reply! +> +> > > > > +> +> > > > > On Mon, Aug 8, 2022 at 8:37 PM Jonathan Cameron +> +> > > > > wrote: +> +> > > > > > +> +> > > > > > Probably not related to your problem, but there is a disconnect +> +> > > > > > in QEMU / +> +> > > > > > kernel assumptionsaround the presence of an HDM decoder when a HB +> +> > > > > > only +> +> > > > > > has a single root port. Spec allows it to be provided or not as +> +> > > > > > an implementation choice. 
+> +> > > > > > Kernel assumes it isn't provide. Qemu assumes it is. +> +> > > > > > +> +> > > > > > The temporary solution is to throw in a second root port on the +> +> > > > > > HB and not +> +> > > > > > connect anything to it. Longer term I may special case this so +> +> > > > > > that the particular +> +> > > > > > decoder defaults to pass through settings in QEMU if there is +> +> > > > > > only one root port. +> +> > > > > > +> +> > > > > +> +> > > > > You are right! After adding an extra HB in qemu, I can create a x1 +> +> > > > > region successfully. +> +> > > > > But have some errors in Nvdimm: +> +> > > > > +> +> > > > > [ 74.925838] Unknown online node for memory at 0x10000000000, +> +> > > > > assuming node 0 +> +> > > > > [ 74.925846] Unknown target node for memory at 0x10000000000, +> +> > > > > assuming node 0 +> +> > > > > [ 74.927470] nd_region region0: nmem0: is disabled, failing probe +> +> > > > > +> +> > > > +> +> > > > Ah. I've seen this one, but not chased it down yet. Was on my todo +> +> > > > list to chase +> +> > > > down. Once I reach this state I can verify the HDM Decode is correct +> +> > > > which is what +> +> > > > I've been using to test (Which wasn't true until earlier this week). +> +> > > > I'm currently testing via devmem, more for historical reasons than +> +> > > > because it makes +> +> > > > that much sense anymore. +> +> > > +> +> > > *embarassed cough*. We haven't fully hooked the LSA up in qemu yet. +> +> > > I'd forgotten that was still on the todo list. I don't think it will +> +> > > be particularly hard to do and will take a look in next few days. +> +> > > +> +> > > Very very indirectly this error is causing a driver probe fail that +> +> > > means that +> +> > > we hit a code path that has a rather odd looking check on NDD_LABELING. +> +> > > Should not have gotten near that path though - hence the problem is +> +> > > actually +> +> > > when we call cxl_pmem_get_config_data() and it returns an error because +> +> > > we haven't fully connected up the command in QEMU. +> +> > +> +> > So a least one bug in QEMU. We were not supporting variable length +> +> > payloads on mailbox +> +> > inputs (but were on outputs). That hasn't mattered until we get to LSA +> +> > writes. +> +> > We just need to relax condition on the supplied length. +> +> > +> +> > diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c +> +> > index c352a935c4..fdda9529fe 100644 +> +> > --- a/hw/cxl/cxl-mailbox-utils.c +> +> > +++ b/hw/cxl/cxl-mailbox-utils.c +> +> > @@ -510,7 +510,7 @@ void cxl_process_mailbox(CXLDeviceState *cxl_dstate) +> +> > cxl_cmd = &cxl_cmd_set[set][cmd]; +> +> > h = cxl_cmd->handler; +> +> > if (h) { +> +> > - if (len == cxl_cmd->in) { +> +> > + if (len == cxl_cmd->in || !cxl_cmd->in) { +> +> > cxl_cmd->payload = cxl_dstate->mbox_reg_state + +> +> > A_CXL_DEV_CMD_PAYLOAD; +> +> > ret = (*h)(cxl_cmd, cxl_dstate, &len); +> +> > +> +> > +> +> > This lets the nvdimm/region probe fine, but I'm getting some issues with +> +> > namespace capacity so I'll look at what is causing that next. +> +> > Unfortunately I'm not that familiar with the driver/nvdimm side of things +> +> > so it's take a while to figure out what kicks off what! +> +> +> +> The whirlwind tour is that 'struct nd_region' instances that represent a +> +> persitent memory address range are composed of one more mappings of +> +> 'struct nvdimm' objects. The nvdimm object is driven by the dimm driver +> +> in drivers/nvdimm/dimm.c. 
That driver is mainly charged with unlocking +> +> the dimm (if locked) and interrogating the label area to look for +> +> namespace labels. +> +> +> +> The label command calls are routed to the '->ndctl()' callback that was +> +> registered when the CXL nvdimm_bus_descriptor was created. That callback +> +> handles both 'bus' scope calls, currently none for CXL, and per nvdimm +> +> calls. cxl_pmem_nvdimm_ctl() translates those generic LIBNVDIMM commands +> +> to CXL commands. +> +> +> +> The 'struct nvdimm' objects that the CXL side registers have the +> +> NDD_LABELING flag set which means that namespaces need to be explicitly +> +> created / provisioned from region capacity. Otherwise, if +> +> drivers/nvdimm/dimm.c does not find a namespace-label-index block then +> +> the region reverts to label-less mode and a default namespace equal to +> +> the size of the region is instantiated. +> +> +> +> If you are seeing small mismatches in namespace capacity then it may +> +> just be the fact that by default 'ndctl create-namespace' results in an +> +> 'fsdax' mode namespace which just means that it is a block device where +> +> 1.5% of the capacity is reserved for 'struct page' metadata. You should +> +> be able to see namespace capacity == region capacity by doing "ndctl +> +> create-namespace -m raw", and disable DAX operation. +> +> +Currently ndctl create-namespace crashes qemu ;) +> +Which isn't ideal! +> +Found a cause for this one. Mailbox payload may be as small as 256 bytes. +We have code in kernel sanity checking that output payload fits in the +mailbox, but nothing on the input payload. Symptom is that we write just +off the end whatever size the payload is. Note doing this shouldn't crash +qemu - so I need to fix a range check somewhere. + +I think this is because cxl_pmem_get_config_size() returns the mailbox +payload size as being the available LSA size, forgetting to remove the +size of the headers on the set_lsa side of things. +https://git.kernel.org/pub/scm/linux/kernel/git/cxl/cxl.git/tree/drivers/cxl/pmem.c?h=next#n110 +I've hacked the max_payload to be -8 + +Now we still don't succeed in creating the namespace, but bonus is it doesn't +crash any more. + + +Jonathan + + + +> +> +> +> Hope that helps. +> +Got me looking at the right code. Thanks! +> +> +Jonathan +> +> + +On Mon, 15 Aug 2022 15:18:09 +0100 +Jonathan Cameron via wrote: + +> +On Fri, 12 Aug 2022 17:15:09 +0100 +> +Jonathan Cameron wrote: +> +> +> On Fri, 12 Aug 2022 09:03:02 -0700 +> +> Dan Williams wrote: +> +> +> +> > Jonathan Cameron wrote: +> +> > > On Thu, 11 Aug 2022 18:08:57 +0100 +> +> > > Jonathan Cameron via wrote: +> +> > > +> +> > > > On Tue, 9 Aug 2022 17:08:25 +0100 +> +> > > > Jonathan Cameron wrote: +> +> > > > +> +> > > > > On Tue, 9 Aug 2022 21:07:06 +0800 +> +> > > > > Bobo WL wrote: +> +> > > > > +> +> > > > > > Hi Jonathan +> +> > > > > > +> +> > > > > > Thanks for your reply! +> +> > > > > > +> +> > > > > > On Mon, Aug 8, 2022 at 8:37 PM Jonathan Cameron +> +> > > > > > wrote: +> +> > > > > > > +> +> > > > > > > Probably not related to your problem, but there is a disconnect +> +> > > > > > > in QEMU / +> +> > > > > > > kernel assumptionsaround the presence of an HDM decoder when a +> +> > > > > > > HB only +> +> > > > > > > has a single root port. Spec allows it to be provided or not as +> +> > > > > > > an implementation choice. +> +> > > > > > > Kernel assumes it isn't provide. Qemu assumes it is. 
+> +> > > > > > > +> +> > > > > > > The temporary solution is to throw in a second root port on the +> +> > > > > > > HB and not +> +> > > > > > > connect anything to it. Longer term I may special case this so +> +> > > > > > > that the particular +> +> > > > > > > decoder defaults to pass through settings in QEMU if there is +> +> > > > > > > only one root port. +> +> > > > > > > +> +> > > > > > +> +> > > > > > You are right! After adding an extra HB in qemu, I can create a x1 +> +> > > > > > region successfully. +> +> > > > > > But have some errors in Nvdimm: +> +> > > > > > +> +> > > > > > [ 74.925838] Unknown online node for memory at 0x10000000000, +> +> > > > > > assuming node 0 +> +> > > > > > [ 74.925846] Unknown target node for memory at 0x10000000000, +> +> > > > > > assuming node 0 +> +> > > > > > [ 74.927470] nd_region region0: nmem0: is disabled, failing +> +> > > > > > probe +> +> > > > > +> +> > > > > Ah. I've seen this one, but not chased it down yet. Was on my todo +> +> > > > > list to chase +> +> > > > > down. Once I reach this state I can verify the HDM Decode is +> +> > > > > correct which is what +> +> > > > > I've been using to test (Which wasn't true until earlier this +> +> > > > > week). +> +> > > > > I'm currently testing via devmem, more for historical reasons than +> +> > > > > because it makes +> +> > > > > that much sense anymore. +> +> > > > +> +> > > > *embarassed cough*. We haven't fully hooked the LSA up in qemu yet. +> +> > > > I'd forgotten that was still on the todo list. I don't think it will +> +> > > > be particularly hard to do and will take a look in next few days. +> +> > > > +> +> > > > Very very indirectly this error is causing a driver probe fail that +> +> > > > means that +> +> > > > we hit a code path that has a rather odd looking check on +> +> > > > NDD_LABELING. +> +> > > > Should not have gotten near that path though - hence the problem is +> +> > > > actually +> +> > > > when we call cxl_pmem_get_config_data() and it returns an error +> +> > > > because +> +> > > > we haven't fully connected up the command in QEMU. +> +> > > +> +> > > So a least one bug in QEMU. We were not supporting variable length +> +> > > payloads on mailbox +> +> > > inputs (but were on outputs). That hasn't mattered until we get to LSA +> +> > > writes. +> +> > > We just need to relax condition on the supplied length. +> +> > > +> +> > > diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c +> +> > > index c352a935c4..fdda9529fe 100644 +> +> > > --- a/hw/cxl/cxl-mailbox-utils.c +> +> > > +++ b/hw/cxl/cxl-mailbox-utils.c +> +> > > @@ -510,7 +510,7 @@ void cxl_process_mailbox(CXLDeviceState *cxl_dstate) +> +> > > cxl_cmd = &cxl_cmd_set[set][cmd]; +> +> > > h = cxl_cmd->handler; +> +> > > if (h) { +> +> > > - if (len == cxl_cmd->in) { +> +> > > + if (len == cxl_cmd->in || !cxl_cmd->in) { +> +> > > cxl_cmd->payload = cxl_dstate->mbox_reg_state + +> +> > > A_CXL_DEV_CMD_PAYLOAD; +> +> > > ret = (*h)(cxl_cmd, cxl_dstate, &len); +> +> > > +> +> > > +> +> > > This lets the nvdimm/region probe fine, but I'm getting some issues with +> +> > > namespace capacity so I'll look at what is causing that next. +> +> > > Unfortunately I'm not that familiar with the driver/nvdimm side of +> +> > > things +> +> > > so it's take a while to figure out what kicks off what! +> +> > +> +> > The whirlwind tour is that 'struct nd_region' instances that represent a +> +> > persitent memory address range are composed of one more mappings of +> +> > 'struct nvdimm' objects. 
The nvdimm object is driven by the dimm driver +> +> > in drivers/nvdimm/dimm.c. That driver is mainly charged with unlocking +> +> > the dimm (if locked) and interrogating the label area to look for +> +> > namespace labels. +> +> > +> +> > The label command calls are routed to the '->ndctl()' callback that was +> +> > registered when the CXL nvdimm_bus_descriptor was created. That callback +> +> > handles both 'bus' scope calls, currently none for CXL, and per nvdimm +> +> > calls. cxl_pmem_nvdimm_ctl() translates those generic LIBNVDIMM commands +> +> > to CXL commands. +> +> > +> +> > The 'struct nvdimm' objects that the CXL side registers have the +> +> > NDD_LABELING flag set which means that namespaces need to be explicitly +> +> > created / provisioned from region capacity. Otherwise, if +> +> > drivers/nvdimm/dimm.c does not find a namespace-label-index block then +> +> > the region reverts to label-less mode and a default namespace equal to +> +> > the size of the region is instantiated. +> +> > +> +> > If you are seeing small mismatches in namespace capacity then it may +> +> > just be the fact that by default 'ndctl create-namespace' results in an +> +> > 'fsdax' mode namespace which just means that it is a block device where +> +> > 1.5% of the capacity is reserved for 'struct page' metadata. You should +> +> > be able to see namespace capacity == region capacity by doing "ndctl +> +> > create-namespace -m raw", and disable DAX operation. +> +> +> +> Currently ndctl create-namespace crashes qemu ;) +> +> Which isn't ideal! +> +> +> +> +Found a cause for this one. Mailbox payload may be as small as 256 bytes. +> +We have code in kernel sanity checking that output payload fits in the +> +mailbox, but nothing on the input payload. Symptom is that we write just +> +off the end whatever size the payload is. Note doing this shouldn't crash +> +qemu - so I need to fix a range check somewhere. +> +> +I think this is because cxl_pmem_get_config_size() returns the mailbox +> +payload size as being the available LSA size, forgetting to remove the +> +size of the headers on the set_lsa side of things. +> +https://git.kernel.org/pub/scm/linux/kernel/git/cxl/cxl.git/tree/drivers/cxl/pmem.c?h=next#n110 +> +> +I've hacked the max_payload to be -8 +> +> +Now we still don't succeed in creating the namespace, but bonus is it doesn't +> +crash any more. +In the interests of defensive / correct handling from QEMU I took a +look into why it was crashing. Turns out that providing a NULL write callback +for +the memory device region (that the above overlarge write was spilling into) +isn't +a safe thing to do. Needs a stub. Oops. + +On plus side we might never have noticed this was going wrong without the crash +*silver lining in every cloud* + +Fix to follow... + +Jonathan + + +> +> +> +Jonathan +> +> +> +> +> > +> +> > Hope that helps. +> +> Got me looking at the right code. Thanks! +> +> +> +> Jonathan +> +> +> +> +> +> + +On Mon, 15 Aug 2022 at 15:55, Jonathan Cameron via wrote: +> +In the interests of defensive / correct handling from QEMU I took a +> +look into why it was crashing. Turns out that providing a NULL write +> +callback for +> +the memory device region (that the above overlarge write was spilling into) +> +isn't +> +a safe thing to do. Needs a stub. Oops. +Yeah. We've talked before about adding an assert so that that kind of +"missing function" bug is caught at device creation rather than only +if the guest tries to access the device, but we never quite got around +to it... 
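+For reference, the "needs a stub" point above boils down to the usual QEMU
+pattern below. This is a generic illustration with made-up names, not the
+actual cxl_type3 fix:
+
+/* Hypothetical example: give an MMIO region both callbacks so a stray guest
+ * (or overlarge mailbox payload) write cannot call through a NULL pointer. */
+static uint64_t dummy_dev_read(void *opaque, hwaddr offset, unsigned size)
+{
+    return 0;
+}
+
+static void dummy_dev_write(void *opaque, hwaddr offset, uint64_t value,
+                            unsigned size)
+{
+    /* Discard the write instead of leaving .write == NULL. */
+}
+
+static const MemoryRegionOps dummy_dev_ops = {
+    .read = dummy_dev_read,
+    .write = dummy_dev_write,
+    .endianness = DEVICE_LITTLE_ENDIAN,
+    .valid = {
+        .min_access_size = 1,
+        .max_access_size = 8,
+    },
+};
+
+The region would then be created with memory_region_init_io(&mr, OBJECT(dev),
+&dummy_dev_ops, dev, "dummy", size); the hardening suggested above amounts to
+asserting at that point that both ops->read and ops->write are non-NULL.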
+ +-- PMM + +On Fri, 12 Aug 2022 16:44:03 +0100 +Jonathan Cameron wrote: + +> +On Thu, 11 Aug 2022 18:08:57 +0100 +> +Jonathan Cameron via wrote: +> +> +> On Tue, 9 Aug 2022 17:08:25 +0100 +> +> Jonathan Cameron wrote: +> +> +> +> > On Tue, 9 Aug 2022 21:07:06 +0800 +> +> > Bobo WL wrote: +> +> > +> +> > > Hi Jonathan +> +> > > +> +> > > Thanks for your reply! +> +> > > +> +> > > On Mon, Aug 8, 2022 at 8:37 PM Jonathan Cameron +> +> > > wrote: +> +> > > > +> +> > > > Probably not related to your problem, but there is a disconnect in +> +> > > > QEMU / +> +> > > > kernel assumptionsaround the presence of an HDM decoder when a HB only +> +> > > > has a single root port. Spec allows it to be provided or not as an +> +> > > > implementation choice. +> +> > > > Kernel assumes it isn't provide. Qemu assumes it is. +> +> > > > +> +> > > > The temporary solution is to throw in a second root port on the HB +> +> > > > and not +> +> > > > connect anything to it. Longer term I may special case this so that +> +> > > > the particular +> +> > > > decoder defaults to pass through settings in QEMU if there is only +> +> > > > one root port. +> +> > > > +> +> > > +> +> > > You are right! After adding an extra HB in qemu, I can create a x1 +> +> > > region successfully. +> +> > > But have some errors in Nvdimm: +> +> > > +> +> > > [ 74.925838] Unknown online node for memory at 0x10000000000, +> +> > > assuming node 0 +> +> > > [ 74.925846] Unknown target node for memory at 0x10000000000, +> +> > > assuming node 0 +> +> > > [ 74.927470] nd_region region0: nmem0: is disabled, failing probe +> +> > > +> +> > +> +> > Ah. I've seen this one, but not chased it down yet. Was on my todo list +> +> > to chase +> +> > down. Once I reach this state I can verify the HDM Decode is correct +> +> > which is what +> +> > I've been using to test (Which wasn't true until earlier this week). +> +> > I'm currently testing via devmem, more for historical reasons than +> +> > because it makes +> +> > that much sense anymore. +> +> +> +> *embarassed cough*. We haven't fully hooked the LSA up in qemu yet. +> +> I'd forgotten that was still on the todo list. I don't think it will +> +> be particularly hard to do and will take a look in next few days. +> +> +> +> Very very indirectly this error is causing a driver probe fail that means +> +> that +> +> we hit a code path that has a rather odd looking check on NDD_LABELING. +> +> Should not have gotten near that path though - hence the problem is actually +> +> when we call cxl_pmem_get_config_data() and it returns an error because +> +> we haven't fully connected up the command in QEMU. +> +> +So a least one bug in QEMU. We were not supporting variable length payloads +> +on mailbox +> +inputs (but were on outputs). That hasn't mattered until we get to LSA +> +writes. +> +We just need to relax condition on the supplied length. +> +> +diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c +> +index c352a935c4..fdda9529fe 100644 +> +--- a/hw/cxl/cxl-mailbox-utils.c +> ++++ b/hw/cxl/cxl-mailbox-utils.c +> +@@ -510,7 +510,7 @@ void cxl_process_mailbox(CXLDeviceState *cxl_dstate) +> +cxl_cmd = &cxl_cmd_set[set][cmd]; +> +h = cxl_cmd->handler; +> +if (h) { +> +- if (len == cxl_cmd->in) { +> ++ if (len == cxl_cmd->in || !cxl_cmd->in) { +Fix is wrong as we use ~0 as the placeholder for variable payload, not 0. + +With that fixed we hit new fun paths - after some errors we get the +worrying - not totally sure but looks like a failure on an error cleanup. 
+I'll chase down the error source, but even then this is probably triggerable by +hardware problem or similar. Some bonus prints in here from me chasing +error paths, but it's otherwise just cxl/next + the fix I posted earlier today. + +[ 69.919877] nd_bus ndbus0: START: nd_region.probe(region0) +[ 69.920108] nd_region_probe +[ 69.920623] ------------[ cut here ]------------ +[ 69.920675] refcount_t: addition on 0; use-after-free. +[ 69.921314] WARNING: CPU: 3 PID: 710 at lib/refcount.c:25 +refcount_warn_saturate+0xa0/0x144 +[ 69.926949] Modules linked in: cxl_pmem cxl_mem cxl_pci cxl_port cxl_acpi +cxl_core +[ 69.928830] CPU: 3 PID: 710 Comm: kworker/u8:9 Not tainted 5.19.0-rc3+ #399 +[ 69.930596] Hardware name: QEMU QEMU Virtual Machine, BIOS 0.0.0 02/06/2015 +[ 69.931482] Workqueue: events_unbound async_run_entry_fn +[ 69.932403] pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) +[ 69.934023] pc : refcount_warn_saturate+0xa0/0x144 +[ 69.935161] lr : refcount_warn_saturate+0xa0/0x144 +[ 69.936541] sp : ffff80000890b960 +[ 69.937921] x29: ffff80000890b960 x28: 0000000000000000 x27: 0000000000000000 +[ 69.940917] x26: ffffa54a90d5cb10 x25: ffffa54a90809e98 x24: 0000000000000000 +[ 69.942537] x23: ffffa54a91a3d8d8 x22: ffff0000c5254800 x21: ffff0000c5254800 +[ 69.944013] x20: ffff0000ce924180 x19: ffff0000c5254800 x18: ffffffffffffffff +[ 69.946100] x17: ffff5ab66e5ef000 x16: ffff80000801c000 x15: 0000000000000000 +[ 69.947585] x14: 0000000000000001 x13: 0a2e656572662d72 x12: 657466612d657375 +[ 69.948670] x11: 203b30206e6f206e x10: 6f69746964646120 x9 : ffffa54a8f63d288 +[ 69.950679] x8 : 206e6f206e6f6974 x7 : 69646461203a745f x6 : 00000000fffff31e +[ 69.952113] x5 : ffff0000ff61ba08 x4 : 00000000fffff31e x3 : ffff5ab66e5ef000 +root@debian:/sys/bus/cxl/devices/decoder0.0/region0# [ 69.954752] x2 : +0000000000000000 x1 : 0000000000000000 x0 : ffff0000c512e740 +[ 69.957098] Call trace: +[ 69.957959] refcount_warn_saturate+0xa0/0x144 +[ 69.958773] get_ndd+0x5c/0x80 +[ 69.959294] nd_region_register_namespaces+0xe4/0xe90 +[ 69.960253] nd_region_probe+0x100/0x290 +[ 69.960796] nvdimm_bus_probe+0xf4/0x1c0 +[ 69.962087] really_probe+0x19c/0x3f0 +[ 69.962620] __driver_probe_device+0x11c/0x190 +[ 69.963258] driver_probe_device+0x44/0xf4 +[ 69.963773] __device_attach_driver+0xa4/0x140 +[ 69.964471] bus_for_each_drv+0x84/0xe0 +[ 69.965068] __device_attach+0xb0/0x1f0 +[ 69.966101] device_initial_probe+0x20/0x30 +[ 69.967142] bus_probe_device+0xa4/0xb0 +[ 69.968104] device_add+0x3e8/0x910 +[ 69.969111] nd_async_device_register+0x24/0x74 +[ 69.969928] async_run_entry_fn+0x40/0x150 +[ 69.970725] process_one_work+0x1dc/0x450 +[ 69.971796] worker_thread+0x154/0x450 +[ 69.972700] kthread+0x118/0x120 +[ 69.974141] ret_from_fork+0x10/0x20 +[ 69.975141] ---[ end trace 0000000000000000 ]--- +[ 70.117887] Into nd_namespace_pmem_set_resource() + +> +cxl_cmd->payload = cxl_dstate->mbox_reg_state + +> +A_CXL_DEV_CMD_PAYLOAD; +> +ret = (*h)(cxl_cmd, cxl_dstate, &len); +> +> +> +This lets the nvdimm/region probe fine, but I'm getting some issues with +> +namespace capacity so I'll look at what is causing that next. +> +Unfortunately I'm not that familiar with the driver/nvdimm side of things +> +so it's take a while to figure out what kicks off what! +> +> +Jonathan +> +> +> +> +> Jonathan +> +> +> +> +> +> > +> +> > > +> +> > > And x4 region still failed with same errors, using latest cxl/preview +> +> > > branch don't work. 
+> +> > > I have picked "Two CXL emulation fixes" patches in qemu, still not +> +> > > working. +> +> > > +> +> > > Bob +> +> +> +> +> + +On Mon, 15 Aug 2022 18:04:44 +0100 +Jonathan Cameron wrote: + +> +On Fri, 12 Aug 2022 16:44:03 +0100 +> +Jonathan Cameron wrote: +> +> +> On Thu, 11 Aug 2022 18:08:57 +0100 +> +> Jonathan Cameron via wrote: +> +> +> +> > On Tue, 9 Aug 2022 17:08:25 +0100 +> +> > Jonathan Cameron wrote: +> +> > +> +> > > On Tue, 9 Aug 2022 21:07:06 +0800 +> +> > > Bobo WL wrote: +> +> > > +> +> > > > Hi Jonathan +> +> > > > +> +> > > > Thanks for your reply! +> +> > > > +> +> > > > On Mon, Aug 8, 2022 at 8:37 PM Jonathan Cameron +> +> > > > wrote: +> +> > > > > +> +> > > > > Probably not related to your problem, but there is a disconnect in +> +> > > > > QEMU / +> +> > > > > kernel assumptionsaround the presence of an HDM decoder when a HB +> +> > > > > only +> +> > > > > has a single root port. Spec allows it to be provided or not as an +> +> > > > > implementation choice. +> +> > > > > Kernel assumes it isn't provide. Qemu assumes it is. +> +> > > > > +> +> > > > > The temporary solution is to throw in a second root port on the HB +> +> > > > > and not +> +> > > > > connect anything to it. Longer term I may special case this so +> +> > > > > that the particular +> +> > > > > decoder defaults to pass through settings in QEMU if there is only +> +> > > > > one root port. +> +> > > > > +> +> > > > +> +> > > > You are right! After adding an extra HB in qemu, I can create a x1 +> +> > > > region successfully. +> +> > > > But have some errors in Nvdimm: +> +> > > > +> +> > > > [ 74.925838] Unknown online node for memory at 0x10000000000, +> +> > > > assuming node 0 +> +> > > > [ 74.925846] Unknown target node for memory at 0x10000000000, +> +> > > > assuming node 0 +> +> > > > [ 74.927470] nd_region region0: nmem0: is disabled, failing probe +> +> > > > +> +> > > +> +> > > Ah. I've seen this one, but not chased it down yet. Was on my todo +> +> > > list to chase +> +> > > down. Once I reach this state I can verify the HDM Decode is correct +> +> > > which is what +> +> > > I've been using to test (Which wasn't true until earlier this week). +> +> > > I'm currently testing via devmem, more for historical reasons than +> +> > > because it makes +> +> > > that much sense anymore. +> +> > +> +> > *embarassed cough*. We haven't fully hooked the LSA up in qemu yet. +> +> > I'd forgotten that was still on the todo list. I don't think it will +> +> > be particularly hard to do and will take a look in next few days. +> +> > +> +> > Very very indirectly this error is causing a driver probe fail that means +> +> > that +> +> > we hit a code path that has a rather odd looking check on NDD_LABELING. +> +> > Should not have gotten near that path though - hence the problem is +> +> > actually +> +> > when we call cxl_pmem_get_config_data() and it returns an error because +> +> > we haven't fully connected up the command in QEMU. +> +> +> +> So a least one bug in QEMU. We were not supporting variable length payloads +> +> on mailbox +> +> inputs (but were on outputs). That hasn't mattered until we get to LSA +> +> writes. +> +> We just need to relax condition on the supplied length. 
+> +> +> +> diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c +> +> index c352a935c4..fdda9529fe 100644 +> +> --- a/hw/cxl/cxl-mailbox-utils.c +> +> +++ b/hw/cxl/cxl-mailbox-utils.c +> +> @@ -510,7 +510,7 @@ void cxl_process_mailbox(CXLDeviceState *cxl_dstate) +> +> cxl_cmd = &cxl_cmd_set[set][cmd]; +> +> h = cxl_cmd->handler; +> +> if (h) { +> +> - if (len == cxl_cmd->in) { +> +> + if (len == cxl_cmd->in || !cxl_cmd->in) { +> +Fix is wrong as we use ~0 as the placeholder for variable payload, not 0. +Cause of the error is a failure in GET_LSA. +Reason, payload length is wrong in QEMU but was hidden previously by my wrong +fix here. Probably still a good idea to inject an error in GET_LSA and chase +down the refcount issue. + + +diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c +index fdda9529fe..e8565fbd6e 100644 +--- a/hw/cxl/cxl-mailbox-utils.c ++++ b/hw/cxl/cxl-mailbox-utils.c +@@ -489,7 +489,7 @@ static struct cxl_cmd cxl_cmd_set[256][256] = { + cmd_identify_memory_device, 0, 0 }, + [CCLS][GET_PARTITION_INFO] = { "CCLS_GET_PARTITION_INFO", + cmd_ccls_get_partition_info, 0, 0 }, +- [CCLS][GET_LSA] = { "CCLS_GET_LSA", cmd_ccls_get_lsa, 0, 0 }, ++ [CCLS][GET_LSA] = { "CCLS_GET_LSA", cmd_ccls_get_lsa, 8, 0 }, + [CCLS][SET_LSA] = { "CCLS_SET_LSA", cmd_ccls_set_lsa, + ~0, IMMEDIATE_CONFIG_CHANGE | IMMEDIATE_DATA_CHANGE }, + [MEDIA_AND_POISON][GET_POISON_LIST] = { "MEDIA_AND_POISON_GET_POISON_LIST", +@@ -510,12 +510,13 @@ void cxl_process_mailbox(CXLDeviceState *cxl_dstate) + cxl_cmd = &cxl_cmd_set[set][cmd]; + h = cxl_cmd->handler; + if (h) { +- if (len == cxl_cmd->in || !cxl_cmd->in) { ++ if (len == cxl_cmd->in || cxl_cmd->in == ~0) { + cxl_cmd->payload = cxl_dstate->mbox_reg_state + + A_CXL_DEV_CMD_PAYLOAD; + +And woot, we get a namespace in the LSA :) + +I'll post QEMU fixes in next day or two. Kernel side now seems more or less +fine be it with suspicious refcount underflow. + +> +> +With that fixed we hit new fun paths - after some errors we get the +> +worrying - not totally sure but looks like a failure on an error cleanup. +> +I'll chase down the error source, but even then this is probably triggerable +> +by +> +hardware problem or similar. Some bonus prints in here from me chasing +> +error paths, but it's otherwise just cxl/next + the fix I posted earlier +> +today. +> +> +[ 69.919877] nd_bus ndbus0: START: nd_region.probe(region0) +> +[ 69.920108] nd_region_probe +> +[ 69.920623] ------------[ cut here ]------------ +> +[ 69.920675] refcount_t: addition on 0; use-after-free. 
+> +[ 69.921314] WARNING: CPU: 3 PID: 710 at lib/refcount.c:25 +> +refcount_warn_saturate+0xa0/0x144 +> +[ 69.926949] Modules linked in: cxl_pmem cxl_mem cxl_pci cxl_port cxl_acpi +> +cxl_core +> +[ 69.928830] CPU: 3 PID: 710 Comm: kworker/u8:9 Not tainted 5.19.0-rc3+ #399 +> +[ 69.930596] Hardware name: QEMU QEMU Virtual Machine, BIOS 0.0.0 02/06/2015 +> +[ 69.931482] Workqueue: events_unbound async_run_entry_fn +> +[ 69.932403] pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) +> +[ 69.934023] pc : refcount_warn_saturate+0xa0/0x144 +> +[ 69.935161] lr : refcount_warn_saturate+0xa0/0x144 +> +[ 69.936541] sp : ffff80000890b960 +> +[ 69.937921] x29: ffff80000890b960 x28: 0000000000000000 x27: +> +0000000000000000 +> +[ 69.940917] x26: ffffa54a90d5cb10 x25: ffffa54a90809e98 x24: +> +0000000000000000 +> +[ 69.942537] x23: ffffa54a91a3d8d8 x22: ffff0000c5254800 x21: +> +ffff0000c5254800 +> +[ 69.944013] x20: ffff0000ce924180 x19: ffff0000c5254800 x18: +> +ffffffffffffffff +> +[ 69.946100] x17: ffff5ab66e5ef000 x16: ffff80000801c000 x15: +> +0000000000000000 +> +[ 69.947585] x14: 0000000000000001 x13: 0a2e656572662d72 x12: +> +657466612d657375 +> +[ 69.948670] x11: 203b30206e6f206e x10: 6f69746964646120 x9 : +> +ffffa54a8f63d288 +> +[ 69.950679] x8 : 206e6f206e6f6974 x7 : 69646461203a745f x6 : +> +00000000fffff31e +> +[ 69.952113] x5 : ffff0000ff61ba08 x4 : 00000000fffff31e x3 : +> +ffff5ab66e5ef000 +> +root@debian:/sys/bus/cxl/devices/decoder0.0/region0# [ 69.954752] x2 : +> +0000000000000000 x1 : 0000000000000000 x0 : ffff0000c512e740 +> +[ 69.957098] Call trace: +> +[ 69.957959] refcount_warn_saturate+0xa0/0x144 +> +[ 69.958773] get_ndd+0x5c/0x80 +> +[ 69.959294] nd_region_register_namespaces+0xe4/0xe90 +> +[ 69.960253] nd_region_probe+0x100/0x290 +> +[ 69.960796] nvdimm_bus_probe+0xf4/0x1c0 +> +[ 69.962087] really_probe+0x19c/0x3f0 +> +[ 69.962620] __driver_probe_device+0x11c/0x190 +> +[ 69.963258] driver_probe_device+0x44/0xf4 +> +[ 69.963773] __device_attach_driver+0xa4/0x140 +> +[ 69.964471] bus_for_each_drv+0x84/0xe0 +> +[ 69.965068] __device_attach+0xb0/0x1f0 +> +[ 69.966101] device_initial_probe+0x20/0x30 +> +[ 69.967142] bus_probe_device+0xa4/0xb0 +> +[ 69.968104] device_add+0x3e8/0x910 +> +[ 69.969111] nd_async_device_register+0x24/0x74 +> +[ 69.969928] async_run_entry_fn+0x40/0x150 +> +[ 69.970725] process_one_work+0x1dc/0x450 +> +[ 69.971796] worker_thread+0x154/0x450 +> +[ 69.972700] kthread+0x118/0x120 +> +[ 69.974141] ret_from_fork+0x10/0x20 +> +[ 69.975141] ---[ end trace 0000000000000000 ]--- +> +[ 70.117887] Into nd_namespace_pmem_set_resource() +> +> +> cxl_cmd->payload = cxl_dstate->mbox_reg_state + +> +> A_CXL_DEV_CMD_PAYLOAD; +> +> ret = (*h)(cxl_cmd, cxl_dstate, &len); +> +> +> +> +> +> This lets the nvdimm/region probe fine, but I'm getting some issues with +> +> namespace capacity so I'll look at what is causing that next. +> +> Unfortunately I'm not that familiar with the driver/nvdimm side of things +> +> so it's take a while to figure out what kicks off what! +> +> +> +> Jonathan +> +> +> +> > +> +> > Jonathan +> +> > +> +> > +> +> > > +> +> > > > +> +> > > > And x4 region still failed with same errors, using latest cxl/preview +> +> > > > branch don't work. +> +> > > > I have picked "Two CXL emulation fixes" patches in qemu, still not +> +> > > > working. 
+> +> > > > +> +> > > > Bob +> +> > +> +> > +> +> +> + +Jonathan Cameron wrote: +> +On Fri, 12 Aug 2022 16:44:03 +0100 +> +Jonathan Cameron wrote: +> +> +> On Thu, 11 Aug 2022 18:08:57 +0100 +> +> Jonathan Cameron via wrote: +> +> +> +> > On Tue, 9 Aug 2022 17:08:25 +0100 +> +> > Jonathan Cameron wrote: +> +> > +> +> > > On Tue, 9 Aug 2022 21:07:06 +0800 +> +> > > Bobo WL wrote: +> +> > > +> +> > > > Hi Jonathan +> +> > > > +> +> > > > Thanks for your reply! +> +> > > > +> +> > > > On Mon, Aug 8, 2022 at 8:37 PM Jonathan Cameron +> +> > > > wrote: +> +> > > > > +> +> > > > > Probably not related to your problem, but there is a disconnect in +> +> > > > > QEMU / +> +> > > > > kernel assumptionsaround the presence of an HDM decoder when a HB +> +> > > > > only +> +> > > > > has a single root port. Spec allows it to be provided or not as an +> +> > > > > implementation choice. +> +> > > > > Kernel assumes it isn't provide. Qemu assumes it is. +> +> > > > > +> +> > > > > The temporary solution is to throw in a second root port on the HB +> +> > > > > and not +> +> > > > > connect anything to it. Longer term I may special case this so +> +> > > > > that the particular +> +> > > > > decoder defaults to pass through settings in QEMU if there is only +> +> > > > > one root port. +> +> > > > > +> +> > > > +> +> > > > You are right! After adding an extra HB in qemu, I can create a x1 +> +> > > > region successfully. +> +> > > > But have some errors in Nvdimm: +> +> > > > +> +> > > > [ 74.925838] Unknown online node for memory at 0x10000000000, +> +> > > > assuming node 0 +> +> > > > [ 74.925846] Unknown target node for memory at 0x10000000000, +> +> > > > assuming node 0 +> +> > > > [ 74.927470] nd_region region0: nmem0: is disabled, failing probe +> +> > > > +> +> > > +> +> > > Ah. I've seen this one, but not chased it down yet. Was on my todo +> +> > > list to chase +> +> > > down. Once I reach this state I can verify the HDM Decode is correct +> +> > > which is what +> +> > > I've been using to test (Which wasn't true until earlier this week). +> +> > > I'm currently testing via devmem, more for historical reasons than +> +> > > because it makes +> +> > > that much sense anymore. +> +> > +> +> > *embarassed cough*. We haven't fully hooked the LSA up in qemu yet. +> +> > I'd forgotten that was still on the todo list. I don't think it will +> +> > be particularly hard to do and will take a look in next few days. +> +> > +> +> > Very very indirectly this error is causing a driver probe fail that means +> +> > that +> +> > we hit a code path that has a rather odd looking check on NDD_LABELING. +> +> > Should not have gotten near that path though - hence the problem is +> +> > actually +> +> > when we call cxl_pmem_get_config_data() and it returns an error because +> +> > we haven't fully connected up the command in QEMU. +> +> +> +> So a least one bug in QEMU. We were not supporting variable length payloads +> +> on mailbox +> +> inputs (but were on outputs). That hasn't mattered until we get to LSA +> +> writes. +> +> We just need to relax condition on the supplied length. 
+> +> +> +> diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c +> +> index c352a935c4..fdda9529fe 100644 +> +> --- a/hw/cxl/cxl-mailbox-utils.c +> +> +++ b/hw/cxl/cxl-mailbox-utils.c +> +> @@ -510,7 +510,7 @@ void cxl_process_mailbox(CXLDeviceState *cxl_dstate) +> +> cxl_cmd = &cxl_cmd_set[set][cmd]; +> +> h = cxl_cmd->handler; +> +> if (h) { +> +> - if (len == cxl_cmd->in) { +> +> + if (len == cxl_cmd->in || !cxl_cmd->in) { +> +Fix is wrong as we use ~0 as the placeholder for variable payload, not 0. +> +> +With that fixed we hit new fun paths - after some errors we get the +> +worrying - not totally sure but looks like a failure on an error cleanup. +> +I'll chase down the error source, but even then this is probably triggerable +> +by +> +hardware problem or similar. Some bonus prints in here from me chasing +> +error paths, but it's otherwise just cxl/next + the fix I posted earlier +> +today. +One of the scenarios that I cannot rule out is nvdimm_probe() racing +nd_region_probe(), but given all the work it takes to create a region I +suspect all the nvdimm_probe() work to have completed... + +It is at least one potentially wrong hypothesis that needs to be chased +down. + +> +> +[ 69.919877] nd_bus ndbus0: START: nd_region.probe(region0) +> +[ 69.920108] nd_region_probe +> +[ 69.920623] ------------[ cut here ]------------ +> +[ 69.920675] refcount_t: addition on 0; use-after-free. +> +[ 69.921314] WARNING: CPU: 3 PID: 710 at lib/refcount.c:25 +> +refcount_warn_saturate+0xa0/0x144 +> +[ 69.926949] Modules linked in: cxl_pmem cxl_mem cxl_pci cxl_port cxl_acpi +> +cxl_core +> +[ 69.928830] CPU: 3 PID: 710 Comm: kworker/u8:9 Not tainted 5.19.0-rc3+ #399 +> +[ 69.930596] Hardware name: QEMU QEMU Virtual Machine, BIOS 0.0.0 02/06/2015 +> +[ 69.931482] Workqueue: events_unbound async_run_entry_fn +> +[ 69.932403] pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) +> +[ 69.934023] pc : refcount_warn_saturate+0xa0/0x144 +> +[ 69.935161] lr : refcount_warn_saturate+0xa0/0x144 +> +[ 69.936541] sp : ffff80000890b960 +> +[ 69.937921] x29: ffff80000890b960 x28: 0000000000000000 x27: +> +0000000000000000 +> +[ 69.940917] x26: ffffa54a90d5cb10 x25: ffffa54a90809e98 x24: +> +0000000000000000 +> +[ 69.942537] x23: ffffa54a91a3d8d8 x22: ffff0000c5254800 x21: +> +ffff0000c5254800 +> +[ 69.944013] x20: ffff0000ce924180 x19: ffff0000c5254800 x18: +> +ffffffffffffffff +> +[ 69.946100] x17: ffff5ab66e5ef000 x16: ffff80000801c000 x15: +> +0000000000000000 +> +[ 69.947585] x14: 0000000000000001 x13: 0a2e656572662d72 x12: +> +657466612d657375 +> +[ 69.948670] x11: 203b30206e6f206e x10: 6f69746964646120 x9 : +> +ffffa54a8f63d288 +> +[ 69.950679] x8 : 206e6f206e6f6974 x7 : 69646461203a745f x6 : +> +00000000fffff31e +> +[ 69.952113] x5 : ffff0000ff61ba08 x4 : 00000000fffff31e x3 : +> +ffff5ab66e5ef000 +> +root@debian:/sys/bus/cxl/devices/decoder0.0/region0# [ 69.954752] x2 : +> +0000000000000000 x1 : 0000000000000000 x0 : ffff0000c512e740 +> +[ 69.957098] Call trace: +> +[ 69.957959] refcount_warn_saturate+0xa0/0x144 +> +[ 69.958773] get_ndd+0x5c/0x80 +> +[ 69.959294] nd_region_register_namespaces+0xe4/0xe90 +> +[ 69.960253] nd_region_probe+0x100/0x290 +> +[ 69.960796] nvdimm_bus_probe+0xf4/0x1c0 +> +[ 69.962087] really_probe+0x19c/0x3f0 +> +[ 69.962620] __driver_probe_device+0x11c/0x190 +> +[ 69.963258] driver_probe_device+0x44/0xf4 +> +[ 69.963773] __device_attach_driver+0xa4/0x140 +> +[ 69.964471] bus_for_each_drv+0x84/0xe0 +> +[ 69.965068] __device_attach+0xb0/0x1f0 +> +[ 
69.966101] device_initial_probe+0x20/0x30 +> +[ 69.967142] bus_probe_device+0xa4/0xb0 +> +[ 69.968104] device_add+0x3e8/0x910 +> +[ 69.969111] nd_async_device_register+0x24/0x74 +> +[ 69.969928] async_run_entry_fn+0x40/0x150 +> +[ 69.970725] process_one_work+0x1dc/0x450 +> +[ 69.971796] worker_thread+0x154/0x450 +> +[ 69.972700] kthread+0x118/0x120 +> +[ 69.974141] ret_from_fork+0x10/0x20 +> +[ 69.975141] ---[ end trace 0000000000000000 ]--- +> +[ 70.117887] Into nd_namespace_pmem_set_resource() + +On Mon, 15 Aug 2022 15:55:15 -0700 +Dan Williams wrote: + +> +Jonathan Cameron wrote: +> +> On Fri, 12 Aug 2022 16:44:03 +0100 +> +> Jonathan Cameron wrote: +> +> +> +> > On Thu, 11 Aug 2022 18:08:57 +0100 +> +> > Jonathan Cameron via wrote: +> +> > +> +> > > On Tue, 9 Aug 2022 17:08:25 +0100 +> +> > > Jonathan Cameron wrote: +> +> > > +> +> > > > On Tue, 9 Aug 2022 21:07:06 +0800 +> +> > > > Bobo WL wrote: +> +> > > > +> +> > > > > Hi Jonathan +> +> > > > > +> +> > > > > Thanks for your reply! +> +> > > > > +> +> > > > > On Mon, Aug 8, 2022 at 8:37 PM Jonathan Cameron +> +> > > > > wrote: +> +> > > > > > +> +> > > > > > Probably not related to your problem, but there is a disconnect +> +> > > > > > in QEMU / +> +> > > > > > kernel assumptionsaround the presence of an HDM decoder when a HB +> +> > > > > > only +> +> > > > > > has a single root port. Spec allows it to be provided or not as +> +> > > > > > an implementation choice. +> +> > > > > > Kernel assumes it isn't provide. Qemu assumes it is. +> +> > > > > > +> +> > > > > > The temporary solution is to throw in a second root port on the +> +> > > > > > HB and not +> +> > > > > > connect anything to it. Longer term I may special case this so +> +> > > > > > that the particular +> +> > > > > > decoder defaults to pass through settings in QEMU if there is +> +> > > > > > only one root port. +> +> > > > > > +> +> > > > > +> +> > > > > You are right! After adding an extra HB in qemu, I can create a x1 +> +> > > > > region successfully. +> +> > > > > But have some errors in Nvdimm: +> +> > > > > +> +> > > > > [ 74.925838] Unknown online node for memory at 0x10000000000, +> +> > > > > assuming node 0 +> +> > > > > [ 74.925846] Unknown target node for memory at 0x10000000000, +> +> > > > > assuming node 0 +> +> > > > > [ 74.927470] nd_region region0: nmem0: is disabled, failing probe +> +> > > > > +> +> > > > +> +> > > > Ah. I've seen this one, but not chased it down yet. Was on my todo +> +> > > > list to chase +> +> > > > down. Once I reach this state I can verify the HDM Decode is correct +> +> > > > which is what +> +> > > > I've been using to test (Which wasn't true until earlier this week). +> +> > > > I'm currently testing via devmem, more for historical reasons than +> +> > > > because it makes +> +> > > > that much sense anymore. +> +> > > +> +> > > *embarassed cough*. We haven't fully hooked the LSA up in qemu yet. +> +> > > I'd forgotten that was still on the todo list. I don't think it will +> +> > > be particularly hard to do and will take a look in next few days. +> +> > > +> +> > > Very very indirectly this error is causing a driver probe fail that +> +> > > means that +> +> > > we hit a code path that has a rather odd looking check on NDD_LABELING. +> +> > > Should not have gotten near that path though - hence the problem is +> +> > > actually +> +> > > when we call cxl_pmem_get_config_data() and it returns an error because +> +> > > we haven't fully connected up the command in QEMU. +> +> > +> +> > So a least one bug in QEMU. 
We were not supporting variable length +> +> > payloads on mailbox +> +> > inputs (but were on outputs). That hasn't mattered until we get to LSA +> +> > writes. +> +> > We just need to relax condition on the supplied length. +> +> > +> +> > diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c +> +> > index c352a935c4..fdda9529fe 100644 +> +> > --- a/hw/cxl/cxl-mailbox-utils.c +> +> > +++ b/hw/cxl/cxl-mailbox-utils.c +> +> > @@ -510,7 +510,7 @@ void cxl_process_mailbox(CXLDeviceState *cxl_dstate) +> +> > cxl_cmd = &cxl_cmd_set[set][cmd]; +> +> > h = cxl_cmd->handler; +> +> > if (h) { +> +> > - if (len == cxl_cmd->in) { +> +> > + if (len == cxl_cmd->in || !cxl_cmd->in) { +> +> Fix is wrong as we use ~0 as the placeholder for variable payload, not 0. +> +> +> +> With that fixed we hit new fun paths - after some errors we get the +> +> worrying - not totally sure but looks like a failure on an error cleanup. +> +> I'll chase down the error source, but even then this is probably +> +> triggerable by +> +> hardware problem or similar. Some bonus prints in here from me chasing +> +> error paths, but it's otherwise just cxl/next + the fix I posted earlier +> +> today. +> +> +One of the scenarios that I cannot rule out is nvdimm_probe() racing +> +nd_region_probe(), but given all the work it takes to create a region I +> +suspect all the nvdimm_probe() work to have completed... +> +> +It is at least one potentially wrong hypothesis that needs to be chased +> +down. +Maybe there should be a special award for the non-intuitive +ndctl create-namespace command (modifies existing namespace and might create +a different empty one...) I'm sure there is some interesting history behind +that one :) + +Upshot is I just threw a filesystem on fsdax and wrote some text files on it +to allow easy grepping. The right data ends up in the memory and a plausible +namespace description is stored in the LSA. + +So to some degree at least it's 'working' on an 8 way direct connected +set of emulated devices. + +One snag is that serial number support isn't yet upstream in QEMU. +(I have had it in my tree for a while but not posted it yet because of + QEMU feature freeze) +https://gitlab.com/jic23/qemu/-/commit/144c783ea8a5fbe169f46ea1ba92940157f42733 +That's needed for meaningful cookie generation. Otherwise you can build the +namespace once, but it won't work on next probe as the cookie is 0 and you +hit some error paths. + +Maybe sensible to add a sanity check and fail namespace creation if +cookie is 0? (Silly side question, but is there a theoretical risk of +a serial number / other data combination leading to a fletcher64() +checksum that happens to be 0 - that would give a very odd bug report!) + +So to make it work the following is needed: + +1) The kernel fix for mailbox buffer overflow. +2) Qemu fix for size of arguements for get_lsa +3) Qemu fix to allow variable size input arguements (for set_lsa) +4) Serial number patch above + command lines to qemu to set appropriate + serial numbers. + +I'll send out the QEMU fixes shortly and post the Serial number patch, +though that almost certainly won't go in until next QEMU development +cycle starts in a few weeks. + +Next up, run through same tests on some other topologies. + +Jonathan + +> +> +> +> +> [ 69.919877] nd_bus ndbus0: START: nd_region.probe(region0) +> +> [ 69.920108] nd_region_probe +> +> [ 69.920623] ------------[ cut here ]------------ +> +> [ 69.920675] refcount_t: addition on 0; use-after-free. 
+> +> [ 69.921314] WARNING: CPU: 3 PID: 710 at lib/refcount.c:25 +> +> refcount_warn_saturate+0xa0/0x144 +> +> [ 69.926949] Modules linked in: cxl_pmem cxl_mem cxl_pci cxl_port +> +> cxl_acpi cxl_core +> +> [ 69.928830] CPU: 3 PID: 710 Comm: kworker/u8:9 Not tainted 5.19.0-rc3+ +> +> #399 +> +> [ 69.930596] Hardware name: QEMU QEMU Virtual Machine, BIOS 0.0.0 +> +> 02/06/2015 +> +> [ 69.931482] Workqueue: events_unbound async_run_entry_fn +> +> [ 69.932403] pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS +> +> BTYPE=--) +> +> [ 69.934023] pc : refcount_warn_saturate+0xa0/0x144 +> +> [ 69.935161] lr : refcount_warn_saturate+0xa0/0x144 +> +> [ 69.936541] sp : ffff80000890b960 +> +> [ 69.937921] x29: ffff80000890b960 x28: 0000000000000000 x27: +> +> 0000000000000000 +> +> [ 69.940917] x26: ffffa54a90d5cb10 x25: ffffa54a90809e98 x24: +> +> 0000000000000000 +> +> [ 69.942537] x23: ffffa54a91a3d8d8 x22: ffff0000c5254800 x21: +> +> ffff0000c5254800 +> +> [ 69.944013] x20: ffff0000ce924180 x19: ffff0000c5254800 x18: +> +> ffffffffffffffff +> +> [ 69.946100] x17: ffff5ab66e5ef000 x16: ffff80000801c000 x15: +> +> 0000000000000000 +> +> [ 69.947585] x14: 0000000000000001 x13: 0a2e656572662d72 x12: +> +> 657466612d657375 +> +> [ 69.948670] x11: 203b30206e6f206e x10: 6f69746964646120 x9 : +> +> ffffa54a8f63d288 +> +> [ 69.950679] x8 : 206e6f206e6f6974 x7 : 69646461203a745f x6 : +> +> 00000000fffff31e +> +> [ 69.952113] x5 : ffff0000ff61ba08 x4 : 00000000fffff31e x3 : +> +> ffff5ab66e5ef000 +> +> root@debian:/sys/bus/cxl/devices/decoder0.0/region0# [ 69.954752] x2 : +> +> 0000000000000000 x1 : 0000000000000000 x0 : ffff0000c512e740 +> +> [ 69.957098] Call trace: +> +> [ 69.957959] refcount_warn_saturate+0xa0/0x144 +> +> [ 69.958773] get_ndd+0x5c/0x80 +> +> [ 69.959294] nd_region_register_namespaces+0xe4/0xe90 +> +> [ 69.960253] nd_region_probe+0x100/0x290 +> +> [ 69.960796] nvdimm_bus_probe+0xf4/0x1c0 +> +> [ 69.962087] really_probe+0x19c/0x3f0 +> +> [ 69.962620] __driver_probe_device+0x11c/0x190 +> +> [ 69.963258] driver_probe_device+0x44/0xf4 +> +> [ 69.963773] __device_attach_driver+0xa4/0x140 +> +> [ 69.964471] bus_for_each_drv+0x84/0xe0 +> +> [ 69.965068] __device_attach+0xb0/0x1f0 +> +> [ 69.966101] device_initial_probe+0x20/0x30 +> +> [ 69.967142] bus_probe_device+0xa4/0xb0 +> +> [ 69.968104] device_add+0x3e8/0x910 +> +> [ 69.969111] nd_async_device_register+0x24/0x74 +> +> [ 69.969928] async_run_entry_fn+0x40/0x150 +> +> [ 69.970725] process_one_work+0x1dc/0x450 +> +> [ 69.971796] worker_thread+0x154/0x450 +> +> [ 69.972700] kthread+0x118/0x120 +> +> [ 69.974141] ret_from_fork+0x10/0x20 +> +> [ 69.975141] ---[ end trace 0000000000000000 ]--- +> +> [ 70.117887] Into nd_namespace_pmem_set_resource() + +Bobo WL wrote: +> +Hi list +> +> +I want to test cxl functions in arm64, and found some problems I can't +> +figure out. +> +> +My test environment: +> +> +1. build latest bios from +https://github.com/tianocore/edk2.git +master +> +branch(cc2db6ebfb6d9d85ba4c7b35fba1fa37fffc0bc2) +> +2. build latest qemu-system-aarch64 from git://git.qemu.org/qemu.git +> +master branch(846dcf0ba4eff824c295f06550b8673ff3f31314). With cxl arm +> +support patch: +> +https://patchwork.kernel.org/project/cxl/cover/20220616141950.23374-1-Jonathan.Cameron@huawei.com/ +> +3. build Linux kernel from +> +https://git.kernel.org/pub/scm/linux/kernel/git/cxl/cxl.git +preview +> +branch(65fc1c3d26b96002a5aa1f4012fae4dc98fd5683) +> +4. 
build latest ndctl tools from +https://github.com/pmem/ndctl +> +create_region branch(8558b394e449779e3a4f3ae90fae77ede0bca159) +> +> +And my qemu test commands: +> +sudo $QEMU_BIN -M virt,gic-version=3,cxl=on -m 4g,maxmem=8G,slots=8 \ +> +-cpu max -smp 8 -nographic -no-reboot \ +> +-kernel $KERNEL -bios $BIOS_BIN \ +> +-drive if=none,file=$ROOTFS,format=qcow2,id=hd \ +> +-device virtio-blk-pci,drive=hd -append 'root=/dev/vda1 +> +nokaslr dyndbg="module cxl* +p"' \ +> +-object memory-backend-ram,size=4G,id=mem0 \ +> +-numa node,nodeid=0,cpus=0-7,memdev=mem0 \ +> +-net nic -net user,hostfwd=tcp::2222-:22 -enable-kvm \ +> +-object +> +memory-backend-file,id=cxl-mem0,share=on,mem-path=/tmp/cxltest.raw,size=256M +> +\ +> +-object +> +memory-backend-file,id=cxl-mem1,share=on,mem-path=/tmp/cxltest1.raw,size=256M +> +\ +> +-object +> +memory-backend-file,id=cxl-mem2,share=on,mem-path=/tmp/cxltest2.raw,size=256M +> +\ +> +-object +> +memory-backend-file,id=cxl-mem3,share=on,mem-path=/tmp/cxltest3.raw,size=256M +> +\ +> +-object +> +memory-backend-file,id=cxl-lsa0,share=on,mem-path=/tmp/lsa0.raw,size=256M +> +\ +> +-object +> +memory-backend-file,id=cxl-lsa1,share=on,mem-path=/tmp/lsa1.raw,size=256M +> +\ +> +-object +> +memory-backend-file,id=cxl-lsa2,share=on,mem-path=/tmp/lsa2.raw,size=256M +> +\ +> +-object +> +memory-backend-file,id=cxl-lsa3,share=on,mem-path=/tmp/lsa3.raw,size=256M +> +\ +> +-device pxb-cxl,bus_nr=12,bus=pcie.0,id=cxl.1 \ +> +-device cxl-rp,port=0,bus=cxl.1,id=root_port0,chassis=0,slot=0 \ +> +-device cxl-upstream,bus=root_port0,id=us0 \ +> +-device cxl-downstream,port=0,bus=us0,id=swport0,chassis=0,slot=4 \ +> +-device +> +cxl-type3,bus=swport0,memdev=cxl-mem0,lsa=cxl-lsa0,id=cxl-pmem0 \ +> +-device cxl-downstream,port=1,bus=us0,id=swport1,chassis=0,slot=5 \ +> +-device +> +cxl-type3,bus=swport1,memdev=cxl-mem1,lsa=cxl-lsa1,id=cxl-pmem1 \ +> +-device cxl-downstream,port=2,bus=us0,id=swport2,chassis=0,slot=6 \ +> +-device +> +cxl-type3,bus=swport2,memdev=cxl-mem2,lsa=cxl-lsa2,id=cxl-pmem2 \ +> +-device cxl-downstream,port=3,bus=us0,id=swport3,chassis=0,slot=7 \ +> +-device +> +cxl-type3,bus=swport3,memdev=cxl-mem3,lsa=cxl-lsa3,id=cxl-pmem3 \ +> +-M +> +cxl-fmw.0.targets.0=cxl.1,cxl-fmw.0.size=4G,cxl-fmw.0.interleave-granularity=4k +> +> +And I have got two problems. +> +1. When I want to create x1 region with command: "cxl create-region -d +> +decoder0.0 -w 1 -g 4096 mem0", kernel crashed with null pointer +> +reference. 
Crash log: +> +> +[ 534.697324] cxl_region region0: config state: 0 +> +[ 534.697346] cxl_region region0: probe: -6 +> +[ 534.697368] cxl_acpi ACPI0017:00: decoder0.0: created region0 +> +[ 534.699115] cxl region0: mem0:endpoint3 decoder3.0 add: +> +mem0:decoder3.0 @ 0 next: none nr_eps: 1 nr_targets: 1 +> +[ 534.699149] cxl region0: 0000:0d:00.0:port2 decoder2.0 add: +> +mem0:decoder3.0 @ 0 next: mem0 nr_eps: 1 nr_targets: 1 +> +[ 534.699167] cxl region0: ACPI0016:00:port1 decoder1.0 add: +> +mem0:decoder3.0 @ 0 next: 0000:0d:00.0 nr_eps: 1 nr_targets: 1 +> +[ 534.699176] cxl region0: ACPI0016:00:port1 iw: 1 ig: 256 +> +[ 534.699182] cxl region0: ACPI0016:00:port1 target[0] = 0000:0c:00.0 +> +for mem0:decoder3.0 @ 0 +> +[ 534.699189] cxl region0: 0000:0d:00.0:port2 iw: 1 ig: 256 +> +[ 534.699193] cxl region0: 0000:0d:00.0:port2 target[0] = +> +0000:0e:00.0 for mem0:decoder3.0 @ 0 +> +[ 534.699405] Unable to handle kernel NULL pointer dereference at +> +virtual address 0000000000000000 +> +[ 534.701474] Mem abort info: +> +[ 534.701994] ESR = 0x0000000086000004 +> +[ 534.702653] EC = 0x21: IABT (current EL), IL = 32 bits +> +[ 534.703616] SET = 0, FnV = 0 +> +[ 534.704174] EA = 0, S1PTW = 0 +> +[ 534.704803] FSC = 0x04: level 0 translation fault +> +[ 534.705694] user pgtable: 4k pages, 48-bit VAs, pgdp=000000010144a000 +> +[ 534.706875] [0000000000000000] pgd=0000000000000000, p4d=0000000000000000 +> +[ 534.709855] Internal error: Oops: 86000004 [#1] PREEMPT SMP +> +[ 534.710301] Modules linked in: +> +[ 534.710546] CPU: 7 PID: 331 Comm: cxl Not tainted +> +5.19.0-rc3-00064-g65fc1c3d26b9-dirty #11 +> +[ 534.715393] Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 +> +[ 534.717179] pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) +> +[ 534.719190] pc : 0x0 +> +[ 534.719928] lr : commit_store+0x118/0x2cc +> +[ 534.721007] sp : ffff80000aec3c30 +> +[ 534.721793] x29: ffff80000aec3c30 x28: ffff0000da62e740 x27: +> +ffff0000c0c06b30 +> +[ 534.723875] x26: 0000000000000000 x25: ffff0000c0a2a400 x24: +> +ffff0000c0a29400 +> +[ 534.725440] x23: 0000000000000003 x22: 0000000000000000 x21: +> +ffff0000c0c06800 +> +[ 534.727312] x20: 0000000000000000 x19: ffff0000c1559800 x18: +> +0000000000000000 +> +[ 534.729138] x17: 0000000000000000 x16: 0000000000000000 x15: +> +0000ffffd41fe838 +> +[ 534.731046] x14: 0000000000000000 x13: 0000000000000000 x12: +> +0000000000000000 +> +[ 534.732402] x11: 0000000000000000 x10: 0000000000000000 x9 : +> +0000000000000000 +> +[ 534.734432] x8 : 0000000000000000 x7 : 0000000000000000 x6 : +> +ffff0000c0906e80 +> +[ 534.735921] x5 : 0000000000000000 x4 : 0000000000000000 x3 : +> +ffff80000aec3bf0 +> +[ 534.737437] x2 : 0000000000000000 x1 : 0000000000000000 x0 : +> +ffff0000c155a000 +> +[ 534.738878] Call trace: +> +[ 534.739368] 0x0 +> +[ 534.739713] dev_attr_store+0x1c/0x30 +> +[ 534.740186] sysfs_kf_write+0x48/0x58 +> +[ 534.740961] kernfs_fop_write_iter+0x128/0x184 +> +[ 534.741872] new_sync_write+0xdc/0x158 +> +[ 534.742706] vfs_write+0x1ac/0x2a8 +> +[ 534.743440] ksys_write+0x68/0xf0 +> +[ 534.744328] __arm64_sys_write+0x1c/0x28 +> +[ 534.745180] invoke_syscall+0x44/0xf0 +> +[ 534.745989] el0_svc_common+0x4c/0xfc +> +[ 534.746661] do_el0_svc+0x60/0xa8 +> +[ 534.747378] el0_svc+0x2c/0x78 +> +[ 534.748066] el0t_64_sync_handler+0xb8/0x12c +> +[ 534.748919] el0t_64_sync+0x18c/0x190 +> +[ 534.749629] Code: bad PC value +> +[ 534.750169] ---[ end trace 0000000000000000 ]--- +What was the top kernel commit when you ran this test? 
What is the line +number of "commit_store+0x118"? + +> +2. When I want to create x4 region with command: "cxl create-region -d +> +decoder0.0 -w 4 -g 4096 -m mem0 mem1 mem2 mem3". I got below errors: +> +> +cxl region: create_region: region0: failed to set target3 to mem3 +> +cxl region: cmd_create_region: created 0 regions +> +> +And kernel log as below: +> +[ 60.536663] cxl_region region0: config state: 0 +> +[ 60.536675] cxl_region region0: probe: -6 +> +[ 60.536696] cxl_acpi ACPI0017:00: decoder0.0: created region0 +> +[ 60.538251] cxl region0: mem0:endpoint3 decoder3.0 add: +> +mem0:decoder3.0 @ 0 next: none nr_eps: 1 nr_targets: 1 +> +[ 60.538278] cxl region0: 0000:0d:00.0:port2 decoder2.0 add: +> +mem0:decoder3.0 @ 0 next: mem0 nr_eps: 1 nr_targets: 1 +> +[ 60.538295] cxl region0: ACPI0016:00:port1 decoder1.0 add: +> +mem0:decoder3.0 @ 0 next: 0000:0d:00.0 nr_eps: 1 nr_targets: 1 +> +[ 60.538647] cxl region0: mem1:endpoint4 decoder4.0 add: +> +mem1:decoder4.0 @ 1 next: none nr_eps: 1 nr_targets: 1 +> +[ 60.538663] cxl region0: 0000:0d:00.0:port2 decoder2.0 add: +> +mem1:decoder4.0 @ 1 next: mem1 nr_eps: 2 nr_targets: 2 +> +[ 60.538675] cxl region0: ACPI0016:00:port1 decoder1.0 add: +> +mem1:decoder4.0 @ 1 next: 0000:0d:00.0 nr_eps: 2 nr_targets: 1 +> +[ 60.539311] cxl region0: mem2:endpoint5 decoder5.0 add: +> +mem2:decoder5.0 @ 2 next: none nr_eps: 1 nr_targets: 1 +> +[ 60.539332] cxl region0: 0000:0d:00.0:port2 decoder2.0 add: +> +mem2:decoder5.0 @ 2 next: mem2 nr_eps: 3 nr_targets: 3 +> +[ 60.539343] cxl region0: ACPI0016:00:port1 decoder1.0 add: +> +mem2:decoder5.0 @ 2 next: 0000:0d:00.0 nr_eps: 3 nr_targets: 1 +> +[ 60.539711] cxl region0: mem3:endpoint6 decoder6.0 add: +> +mem3:decoder6.0 @ 3 next: none nr_eps: 1 nr_targets: 1 +> +[ 60.539723] cxl region0: 0000:0d:00.0:port2 decoder2.0 add: +> +mem3:decoder6.0 @ 3 next: mem3 nr_eps: 4 nr_targets: 4 +> +[ 60.539735] cxl region0: ACPI0016:00:port1 decoder1.0 add: +> +mem3:decoder6.0 @ 3 next: 0000:0d:00.0 nr_eps: 4 nr_targets: 1 +> +[ 60.539742] cxl region0: ACPI0016:00:port1 iw: 1 ig: 256 +> +[ 60.539747] cxl region0: ACPI0016:00:port1 target[0] = 0000:0c:00.0 +> +for mem0:decoder3.0 @ 0 +> +[ 60.539754] cxl region0: 0000:0d:00.0:port2 iw: 4 ig: 512 +> +[ 60.539758] cxl region0: 0000:0d:00.0:port2 target[0] = +> +0000:0e:00.0 for mem0:decoder3.0 @ 0 +> +[ 60.539764] cxl region0: ACPI0016:00:port1: cannot host mem1:decoder4.0 at +> +1 +> +> +I have tried to write sysfs node manually, got same errors. +> +> +Hope I can get some helps here. +What is the output of: + + cxl list -MDTu -d decoder0.0 + +...? It might be the case that mem1 cannot be mapped by decoder0.0, or +at least not in the specified order, or that validation check is broken. + +Hi Dan, + +Thanks for your reply! + +On Mon, Aug 8, 2022 at 11:58 PM Dan Williams wrote: +> +> +What is the output of: +> +> +cxl list -MDTu -d decoder0.0 +> +> +...? It might be the case that mem1 cannot be mapped by decoder0.0, or +> +at least not in the specified order, or that validation check is broken. 
+Command "cxl list -MDTu -d decoder0.0" output: + +[ + { + "memdevs":[ + { + "memdev":"mem2", + "pmem_size":"256.00 MiB (268.44 MB)", + "ram_size":0, + "serial":"0", + "host":"0000:11:00.0" + }, + { + "memdev":"mem1", + "pmem_size":"256.00 MiB (268.44 MB)", + "ram_size":0, + "serial":"0", + "host":"0000:10:00.0" + }, + { + "memdev":"mem0", + "pmem_size":"256.00 MiB (268.44 MB)", + "ram_size":0, + "serial":"0", + "host":"0000:0f:00.0" + }, + { + "memdev":"mem3", + "pmem_size":"256.00 MiB (268.44 MB)", + "ram_size":0, + "serial":"0", + "host":"0000:12:00.0" + } + ] + }, + { + "root decoders":[ + { + "decoder":"decoder0.0", + "resource":"0x10000000000", + "size":"4.00 GiB (4.29 GB)", + "pmem_capable":true, + "volatile_capable":true, + "accelmem_capable":true, + "nr_targets":1, + "targets":[ + { + "target":"ACPI0016:01", + "alias":"pci0000:0c", + "position":0, + "id":"0xc" + } + ] + } + ] + } +] + +Bobo WL wrote: +> +Hi Dan, +> +> +Thanks for your reply! +> +> +On Mon, Aug 8, 2022 at 11:58 PM Dan Williams wrote: +> +> +> +> What is the output of: +> +> +> +> cxl list -MDTu -d decoder0.0 +> +> +> +> ...? It might be the case that mem1 cannot be mapped by decoder0.0, or +> +> at least not in the specified order, or that validation check is broken. +> +> +Command "cxl list -MDTu -d decoder0.0" output: +Thanks for this, I think I know the problem, but will try some +experiments with cxl_test first. + +Did the commit_store() crash stop reproducing with latest cxl/preview +branch? + +On Tue, Aug 9, 2022 at 11:17 PM Dan Williams wrote: +> +> +Bobo WL wrote: +> +> Hi Dan, +> +> +> +> Thanks for your reply! +> +> +> +> On Mon, Aug 8, 2022 at 11:58 PM Dan Williams +> +> wrote: +> +> > +> +> > What is the output of: +> +> > +> +> > cxl list -MDTu -d decoder0.0 +> +> > +> +> > ...? It might be the case that mem1 cannot be mapped by decoder0.0, or +> +> > at least not in the specified order, or that validation check is broken. +> +> +> +> Command "cxl list -MDTu -d decoder0.0" output: +> +> +Thanks for this, I think I know the problem, but will try some +> +experiments with cxl_test first. +> +> +Did the commit_store() crash stop reproducing with latest cxl/preview +> +branch? +No, still hitting this bug if don't add extra HB device in qemu + +Dan Williams wrote: +> +Bobo WL wrote: +> +> Hi Dan, +> +> +> +> Thanks for your reply! +> +> +> +> On Mon, Aug 8, 2022 at 11:58 PM Dan Williams +> +> wrote: +> +> > +> +> > What is the output of: +> +> > +> +> > cxl list -MDTu -d decoder0.0 +> +> > +> +> > ...? It might be the case that mem1 cannot be mapped by decoder0.0, or +> +> > at least not in the specified order, or that validation check is broken. +> +> +> +> Command "cxl list -MDTu -d decoder0.0" output: +> +> +Thanks for this, I think I know the problem, but will try some +> +experiments with cxl_test first. +Hmm, so my cxl_test experiment unfortunately passed so I'm not +reproducing the failure mode. 
This is the result of creating x4 region +with devices directly attached to a single host-bridge: + +# cxl create-region -d decoder3.5 -w 4 -m -g 256 mem{12,10,9,11} -s $((1<<30)) +{ + "region":"region8", + "resource":"0xf1f0000000", + "size":"1024.00 MiB (1073.74 MB)", + "interleave_ways":4, + "interleave_granularity":256, + "decode_state":"commit", + "mappings":[ + { + "position":3, + "memdev":"mem11", + "decoder":"decoder21.0" + }, + { + "position":2, + "memdev":"mem9", + "decoder":"decoder19.0" + }, + { + "position":1, + "memdev":"mem10", + "decoder":"decoder20.0" + }, + { + "position":0, + "memdev":"mem12", + "decoder":"decoder22.0" + } + ] +} +cxl region: cmd_create_region: created 1 region + +> +Did the commit_store() crash stop reproducing with latest cxl/preview +> +branch? +I missed the answer to this question. + +All of these changes are now in Linus' tree perhaps give that a try and +post the debug log again? + +On Thu, 11 Aug 2022 17:46:55 -0700 +Dan Williams wrote: + +> +Dan Williams wrote: +> +> Bobo WL wrote: +> +> > Hi Dan, +> +> > +> +> > Thanks for your reply! +> +> > +> +> > On Mon, Aug 8, 2022 at 11:58 PM Dan Williams +> +> > wrote: +> +> > > +> +> > > What is the output of: +> +> > > +> +> > > cxl list -MDTu -d decoder0.0 +> +> > > +> +> > > ...? It might be the case that mem1 cannot be mapped by decoder0.0, or +> +> > > at least not in the specified order, or that validation check is +> +> > > broken. +> +> > +> +> > Command "cxl list -MDTu -d decoder0.0" output: +> +> +> +> Thanks for this, I think I know the problem, but will try some +> +> experiments with cxl_test first. +> +> +Hmm, so my cxl_test experiment unfortunately passed so I'm not +> +reproducing the failure mode. This is the result of creating x4 region +> +with devices directly attached to a single host-bridge: +> +> +# cxl create-region -d decoder3.5 -w 4 -m -g 256 mem{12,10,9,11} -s $((1<<30)) +> +{ +> +"region":"region8", +> +"resource":"0xf1f0000000", +> +"size":"1024.00 MiB (1073.74 MB)", +> +"interleave_ways":4, +> +"interleave_granularity":256, +> +"decode_state":"commit", +> +"mappings":[ +> +{ +> +"position":3, +> +"memdev":"mem11", +> +"decoder":"decoder21.0" +> +}, +> +{ +> +"position":2, +> +"memdev":"mem9", +> +"decoder":"decoder19.0" +> +}, +> +{ +> +"position":1, +> +"memdev":"mem10", +> +"decoder":"decoder20.0" +> +}, +> +{ +> +"position":0, +> +"memdev":"mem12", +> +"decoder":"decoder22.0" +> +} +> +] +> +} +> +cxl region: cmd_create_region: created 1 region +> +> +> Did the commit_store() crash stop reproducing with latest cxl/preview +> +> branch? +> +> +I missed the answer to this question. +> +> +All of these changes are now in Linus' tree perhaps give that a try and +> +post the debug log again? +Hi Dan, + +I've moved onto looking at this one. +1 HB, 2RP (to make it configure the HDM decoder in the QEMU HB, I'll tidy that +up +at some stage), 1 switch, 4 downstream switch ports each with a type 3 + +I'm not getting a crash, but can't successfully setup a region. +Upon adding the final target +It's failing in check_last_peer() as pos < distance. +Seems distance is 4 which makes me think it's using the wrong level of the +heirarchy for +some reason or that distance check is wrong. +Wasn't a good idea to just skip that step though as it goes boom - though +stack trace is not useful. 
+ +Jonathan + +On Wed, 17 Aug 2022 17:16:19 +0100 +Jonathan Cameron wrote: + +> +On Thu, 11 Aug 2022 17:46:55 -0700 +> +Dan Williams wrote: +> +> +> Dan Williams wrote: +> +> > Bobo WL wrote: +> +> > > Hi Dan, +> +> > > +> +> > > Thanks for your reply! +> +> > > +> +> > > On Mon, Aug 8, 2022 at 11:58 PM Dan Williams +> +> > > wrote: +> +> > > > +> +> > > > What is the output of: +> +> > > > +> +> > > > cxl list -MDTu -d decoder0.0 +> +> > > > +> +> > > > ...? It might be the case that mem1 cannot be mapped by decoder0.0, or +> +> > > > at least not in the specified order, or that validation check is +> +> > > > broken. +> +> > > +> +> > > Command "cxl list -MDTu -d decoder0.0" output: +> +> > +> +> > Thanks for this, I think I know the problem, but will try some +> +> > experiments with cxl_test first. +> +> +> +> Hmm, so my cxl_test experiment unfortunately passed so I'm not +> +> reproducing the failure mode. This is the result of creating x4 region +> +> with devices directly attached to a single host-bridge: +> +> +> +> # cxl create-region -d decoder3.5 -w 4 -m -g 256 mem{12,10,9,11} -s +> +> $((1<<30)) +> +> { +> +> "region":"region8", +> +> "resource":"0xf1f0000000", +> +> "size":"1024.00 MiB (1073.74 MB)", +> +> "interleave_ways":4, +> +> "interleave_granularity":256, +> +> "decode_state":"commit", +> +> "mappings":[ +> +> { +> +> "position":3, +> +> "memdev":"mem11", +> +> "decoder":"decoder21.0" +> +> }, +> +> { +> +> "position":2, +> +> "memdev":"mem9", +> +> "decoder":"decoder19.0" +> +> }, +> +> { +> +> "position":1, +> +> "memdev":"mem10", +> +> "decoder":"decoder20.0" +> +> }, +> +> { +> +> "position":0, +> +> "memdev":"mem12", +> +> "decoder":"decoder22.0" +> +> } +> +> ] +> +> } +> +> cxl region: cmd_create_region: created 1 region +> +> +> +> > Did the commit_store() crash stop reproducing with latest cxl/preview +> +> > branch? +> +> +> +> I missed the answer to this question. +> +> +> +> All of these changes are now in Linus' tree perhaps give that a try and +> +> post the debug log again? +> +> +Hi Dan, +> +> +I've moved onto looking at this one. +> +1 HB, 2RP (to make it configure the HDM decoder in the QEMU HB, I'll tidy +> +that up +> +at some stage), 1 switch, 4 downstream switch ports each with a type 3 +> +> +I'm not getting a crash, but can't successfully setup a region. +> +Upon adding the final target +> +It's failing in check_last_peer() as pos < distance. +> +Seems distance is 4 which makes me think it's using the wrong level of the +> +heirarchy for +> +some reason or that distance check is wrong. +> +Wasn't a good idea to just skip that step though as it goes boom - though +> +stack trace is not useful. +Turns out really weird corruption happens if you accidentally back two type3 +devices +with the same memory device. Who would have thought it :) + +That aside ignoring the check_last_peer() failure seems to make everything work +for this +topology. I'm not seeing the crash, so my guess is we fixed it somewhere along +the way. + +Now for the fun one. I've replicated the crash if we have + +1HB 1*RP 1SW, 4SW-DSP, 4Type3 + +Now, I'd expect to see it not 'work' because the QEMU HDM decoder won't be +programmed +but the null pointer dereference isn't related to that. + +The bug is straight forward. Not all decoders have commit callbacks... Will +send out +a possible fix shortly. 
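+
+The fix implied above is to guard the call rather than assume every decoder
+registered a commit hook. A minimal sketch, assuming the region commit path
+invokes cxld->commit() directly; the helper name is illustrative only and not
+the exact upstream patch:
+
+        /* Illustrative sketch: pass-through decoders (e.g. a host bridge
+         * with a single root port) may leave the commit/reset hooks NULL,
+         * so check before calling instead of dereferencing a NULL
+         * function pointer. */
+        static int commit_decoder(struct cxl_decoder *cxld)
+        {
+                if (cxld->commit)
+                        return cxld->commit(cxld);
+                return 0;
+        }
+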
+ +Jonathan + + + +> +> +Jonathan +> +> +> +> +> +> + +On Thu, 18 Aug 2022 17:37:40 +0100 +Jonathan Cameron via wrote: + +> +On Wed, 17 Aug 2022 17:16:19 +0100 +> +Jonathan Cameron wrote: +> +> +> On Thu, 11 Aug 2022 17:46:55 -0700 +> +> Dan Williams wrote: +> +> +> +> > Dan Williams wrote: +> +> > > Bobo WL wrote: +> +> > > > Hi Dan, +> +> > > > +> +> > > > Thanks for your reply! +> +> > > > +> +> > > > On Mon, Aug 8, 2022 at 11:58 PM Dan Williams +> +> > > > wrote: +> +> > > > > +> +> > > > > What is the output of: +> +> > > > > +> +> > > > > cxl list -MDTu -d decoder0.0 +> +> > > > > +> +> > > > > ...? It might be the case that mem1 cannot be mapped by decoder0.0, +> +> > > > > or +> +> > > > > at least not in the specified order, or that validation check is +> +> > > > > broken. +> +> > > > +> +> > > > Command "cxl list -MDTu -d decoder0.0" output: +> +> > > +> +> > > Thanks for this, I think I know the problem, but will try some +> +> > > experiments with cxl_test first. +> +> > +> +> > Hmm, so my cxl_test experiment unfortunately passed so I'm not +> +> > reproducing the failure mode. This is the result of creating x4 region +> +> > with devices directly attached to a single host-bridge: +> +> > +> +> > # cxl create-region -d decoder3.5 -w 4 -m -g 256 mem{12,10,9,11} -s +> +> > $((1<<30)) +> +> > { +> +> > "region":"region8", +> +> > "resource":"0xf1f0000000", +> +> > "size":"1024.00 MiB (1073.74 MB)", +> +> > "interleave_ways":4, +> +> > "interleave_granularity":256, +> +> > "decode_state":"commit", +> +> > "mappings":[ +> +> > { +> +> > "position":3, +> +> > "memdev":"mem11", +> +> > "decoder":"decoder21.0" +> +> > }, +> +> > { +> +> > "position":2, +> +> > "memdev":"mem9", +> +> > "decoder":"decoder19.0" +> +> > }, +> +> > { +> +> > "position":1, +> +> > "memdev":"mem10", +> +> > "decoder":"decoder20.0" +> +> > }, +> +> > { +> +> > "position":0, +> +> > "memdev":"mem12", +> +> > "decoder":"decoder22.0" +> +> > } +> +> > ] +> +> > } +> +> > cxl region: cmd_create_region: created 1 region +> +> > +> +> > > Did the commit_store() crash stop reproducing with latest cxl/preview +> +> > > branch? +> +> > +> +> > I missed the answer to this question. +> +> > +> +> > All of these changes are now in Linus' tree perhaps give that a try and +> +> > post the debug log again? +> +> +> +> Hi Dan, +> +> +> +> I've moved onto looking at this one. +> +> 1 HB, 2RP (to make it configure the HDM decoder in the QEMU HB, I'll tidy +> +> that up +> +> at some stage), 1 switch, 4 downstream switch ports each with a type 3 +> +> +> +> I'm not getting a crash, but can't successfully setup a region. +> +> Upon adding the final target +> +> It's failing in check_last_peer() as pos < distance. +> +> Seems distance is 4 which makes me think it's using the wrong level of the +> +> heirarchy for +> +> some reason or that distance check is wrong. +> +> Wasn't a good idea to just skip that step though as it goes boom - though +> +> stack trace is not useful. +> +> +Turns out really weird corruption happens if you accidentally back two type3 +> +devices +> +with the same memory device. Who would have thought it :) +> +> +That aside ignoring the check_last_peer() failure seems to make everything +> +work for this +> +topology. I'm not seeing the crash, so my guess is we fixed it somewhere +> +along the way. +> +> +Now for the fun one. 
I've replicated the crash if we have +> +> +1HB 1*RP 1SW, 4SW-DSP, 4Type3 +> +> +Now, I'd expect to see it not 'work' because the QEMU HDM decoder won't be +> +programmed +> +but the null pointer dereference isn't related to that. +> +> +The bug is straight forward. Not all decoders have commit callbacks... Will +> +send out +> +a possible fix shortly. +> +For completeness I'm carrying this hack because I haven't gotten my head +around the right fix for check_last_peer() failing on this test topology. + +diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c +index c49d9a5f1091..275e143bd748 100644 +--- a/drivers/cxl/core/region.c ++++ b/drivers/cxl/core/region.c +@@ -978,7 +978,7 @@ static int cxl_port_setup_targets(struct cxl_port *port, + rc = check_last_peer(cxled, ep, cxl_rr, + distance); + if (rc) +- return rc; ++ // return rc; + goto out_target_set; + } + goto add_target; +-- + +I might find more bugs with more testing, but this is all the ones I've +seen so far + in Bobo's reports. Qemu fixes are now in upstream so +will be there in the release. + +As a reminder, testing on QEMU has a few corners... + +Need a patch to add serial number ECAP support. It is on list for revew, +but will have wait for after QEMU 7.1 release (which may be next week) + +QEMU still assumes HDM decoder on the host bridge will be programmed. +So if you want anything to work there should be at least +2 RP below the HB (no need to plug anything in to one of them). + +I don't want to add a commandline parameter to hide the decoder in QEMU +and detecting there is only one RP would require moving a bunch of static +stuff into runtime code (I think). + +I still think we should make the kernel check to see if there is a decoder, +but if not I might see how bad a hack it is to have QEMU ignore that decoder +if not committed in this one special case (HB HDM decoder with only one place +it can send stuff). Obviously that would be a break from specification +so less than idea! + +Thanks, + +Jonathan + +On Fri, 19 Aug 2022 09:46:55 +0100 +Jonathan Cameron wrote: + +> +On Thu, 18 Aug 2022 17:37:40 +0100 +> +Jonathan Cameron via wrote: +> +> +> On Wed, 17 Aug 2022 17:16:19 +0100 +> +> Jonathan Cameron wrote: +> +> +> +> > On Thu, 11 Aug 2022 17:46:55 -0700 +> +> > Dan Williams wrote: +> +> > +> +> > > Dan Williams wrote: +> +> > > > Bobo WL wrote: +> +> > > > > Hi Dan, +> +> > > > > +> +> > > > > Thanks for your reply! +> +> > > > > +> +> > > > > On Mon, Aug 8, 2022 at 11:58 PM Dan Williams +> +> > > > > wrote: +> +> > > > > > +> +> > > > > > What is the output of: +> +> > > > > > +> +> > > > > > cxl list -MDTu -d decoder0.0 +> +> > > > > > +> +> > > > > > ...? It might be the case that mem1 cannot be mapped by +> +> > > > > > decoder0.0, or +> +> > > > > > at least not in the specified order, or that validation check is +> +> > > > > > broken. +> +> > > > > +> +> > > > > Command "cxl list -MDTu -d decoder0.0" output: +> +> > > > +> +> > > > Thanks for this, I think I know the problem, but will try some +> +> > > > experiments with cxl_test first. +> +> > > +> +> > > Hmm, so my cxl_test experiment unfortunately passed so I'm not +> +> > > reproducing the failure mode. 
This is the result of creating x4 region +> +> > > with devices directly attached to a single host-bridge: +> +> > > +> +> > > # cxl create-region -d decoder3.5 -w 4 -m -g 256 mem{12,10,9,11} -s +> +> > > $((1<<30)) +> +> > > { +> +> > > "region":"region8", +> +> > > "resource":"0xf1f0000000", +> +> > > "size":"1024.00 MiB (1073.74 MB)", +> +> > > "interleave_ways":4, +> +> > > "interleave_granularity":256, +> +> > > "decode_state":"commit", +> +> > > "mappings":[ +> +> > > { +> +> > > "position":3, +> +> > > "memdev":"mem11", +> +> > > "decoder":"decoder21.0" +> +> > > }, +> +> > > { +> +> > > "position":2, +> +> > > "memdev":"mem9", +> +> > > "decoder":"decoder19.0" +> +> > > }, +> +> > > { +> +> > > "position":1, +> +> > > "memdev":"mem10", +> +> > > "decoder":"decoder20.0" +> +> > > }, +> +> > > { +> +> > > "position":0, +> +> > > "memdev":"mem12", +> +> > > "decoder":"decoder22.0" +> +> > > } +> +> > > ] +> +> > > } +> +> > > cxl region: cmd_create_region: created 1 region +> +> > > +> +> > > > Did the commit_store() crash stop reproducing with latest cxl/preview +> +> > > > branch? +> +> > > +> +> > > I missed the answer to this question. +> +> > > +> +> > > All of these changes are now in Linus' tree perhaps give that a try and +> +> > > post the debug log again? +> +> > +> +> > Hi Dan, +> +> > +> +> > I've moved onto looking at this one. +> +> > 1 HB, 2RP (to make it configure the HDM decoder in the QEMU HB, I'll tidy +> +> > that up +> +> > at some stage), 1 switch, 4 downstream switch ports each with a type 3 +> +> > +> +> > I'm not getting a crash, but can't successfully setup a region. +> +> > Upon adding the final target +> +> > It's failing in check_last_peer() as pos < distance. +> +> > Seems distance is 4 which makes me think it's using the wrong level of +> +> > the heirarchy for +> +> > some reason or that distance check is wrong. +> +> > Wasn't a good idea to just skip that step though as it goes boom - though +> +> > stack trace is not useful. +> +> +> +> Turns out really weird corruption happens if you accidentally back two +> +> type3 devices +> +> with the same memory device. Who would have thought it :) +> +> +> +> That aside ignoring the check_last_peer() failure seems to make everything +> +> work for this +> +> topology. I'm not seeing the crash, so my guess is we fixed it somewhere +> +> along the way. +> +> +> +> Now for the fun one. I've replicated the crash if we have +> +> +> +> 1HB 1*RP 1SW, 4SW-DSP, 4Type3 +> +> +> +> Now, I'd expect to see it not 'work' because the QEMU HDM decoder won't be +> +> programmed +> +> but the null pointer dereference isn't related to that. +> +> +> +> The bug is straight forward. Not all decoders have commit callbacks... +> +> Will send out +> +> a possible fix shortly. +> +> +> +For completeness I'm carrying this hack because I haven't gotten my head +> +around the right fix for check_last_peer() failing on this test topology. +> +> +diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c +> +index c49d9a5f1091..275e143bd748 100644 +> +--- a/drivers/cxl/core/region.c +> ++++ b/drivers/cxl/core/region.c +> +@@ -978,7 +978,7 @@ static int cxl_port_setup_targets(struct cxl_port *port, +> +rc = check_last_peer(cxled, ep, cxl_rr, +> +distance); +> +if (rc) +> +- return rc; +> ++ // return rc; +> +goto out_target_set; +> +} +> +goto add_target; +I'm still carrying this hack and still haven't worked out the right fix. + +Suggestions welcome! If not I'll hopefully get some time on this +towards the end of the week. 
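+
+For anyone reproducing these topologies, the "at least 2 RP below the HB"
+workaround mentioned earlier in the thread amounts to adding one extra,
+unpopulated root port to the command line Bobo posted; the ids and slot
+number below are illustrative:
+
+    -device pxb-cxl,bus_nr=12,bus=pcie.0,id=cxl.1 \
+    -device cxl-rp,port=0,bus=cxl.1,id=root_port0,chassis=0,slot=0 \
+    -device cxl-rp,port=1,bus=cxl.1,id=root_port1,chassis=0,slot=1 \
+    -device cxl-upstream,bus=root_port0,id=us0 \
+    ...
+
+Nothing needs to be attached to root_port1; with two root ports present the
+kernel no longer treats the host bridge as pass-through and programs its HDM
+decoder, which matches what QEMU currently assumes.
+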
+ +Jonathan + diff --git a/classification_output/01/instruction/3457423 b/classification_output/01/instruction/3457423 deleted file mode 100644 index ffcf905b4..000000000 --- a/classification_output/01/instruction/3457423 +++ /dev/null @@ -1,40 +0,0 @@ -instruction: 0.778 -semantic: 0.635 -mistranslation: 0.537 -other: 0.236 - -[Qemu-devel] [BUG] Failed to compile using gcc7.1 - -Hi all, - -After upgrading gcc from 6.3.1 to 7.1.1, qemu can't be compiled with gcc. - -The error is: - ------- - CC block/blkdebug.o -block/blkdebug.c: In function 'blkdebug_refresh_filename': -block/blkdebug.c:693:31: error: '%s' directive output may be truncated -writing up to 4095 bytes into a region of size 4086 -[-Werror=format-truncation=] -"blkdebug:%s:%s", s->config_file ?: "", - ^~ -In file included from /usr/include/stdio.h:939:0, - from /home/adam/qemu/include/qemu/osdep.h:68, - from block/blkdebug.c:25: -/usr/include/bits/stdio2.h:64:10: note: '__builtin___snprintf_chk' -output 11 or more bytes (assuming 4106) into a destination of size 4096 -return __builtin___snprintf_chk (__s, __n, __USE_FORTIFY_LEVEL - 1, - ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - __bos (__s), __fmt, __va_arg_pack ()); - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -cc1: all warnings being treated as errors -make: *** [/home/adam/qemu/rules.mak:69: block/blkdebug.o] Error 1 ------- - -It seems that gcc 7 is introducing more restrict check for printf. -If using clang, although there are some extra warning, it can at least -pass the compile. -Thanks, -Qu - diff --git a/classification_output/01/instruction/42226390 b/classification_output/01/instruction/42226390 new file mode 100644 index 000000000..1d455d6fa --- /dev/null +++ b/classification_output/01/instruction/42226390 @@ -0,0 +1,187 @@ +instruction: 0.925 +semantic: 0.924 +other: 0.894 +mistranslation: 0.826 + +[BUG] AArch64 boot hang with -icount and -smp >1 (iothread locking issue?) + +Hello, + +I am encountering one or more bugs when using -icount and -smp >1 that I am +attempting to sort out. My current theory is that it is an iothread locking +issue. + +I am using a command-line like the following where $kernel is a recent upstream +AArch64 Linux kernel Image (I can provide a binary if that would be helpful - +let me know how is best to post): + + qemu-system-aarch64 \ + -M virt -cpu cortex-a57 -m 1G \ + -nographic \ + -smp 2 \ + -icount 0 \ + -kernel $kernel + +For any/all of the symptoms described below, they seem to disappear when I +either remove `-icount 0` or change smp to `-smp 1`. In other words, it is the +combination of `-smp >1` and `-icount` which triggers what I'm seeing. + +I am seeing two different (but seemingly related) behaviors. The first (and +what I originally started debugging) shows up as a boot hang. When booting +using the above command after Peter's "icount: Take iothread lock when running +QEMU timers" patch [1], The kernel boots for a while and then hangs after: + +> +...snip... 
+> +[ 0.010764] Serial: AMBA PL011 UART driver +> +[ 0.016334] 9000000.pl011: ttyAMA0 at MMIO 0x9000000 (irq = 13, base_baud +> += 0) is a PL011 rev1 +> +[ 0.016907] printk: console [ttyAMA0] enabled +> +[ 0.017624] KASLR enabled +> +[ 0.031986] HugeTLB: registered 16.0 GiB page size, pre-allocated 0 pages +> +[ 0.031986] HugeTLB: 16320 KiB vmemmap can be freed for a 16.0 GiB page +> +[ 0.031986] HugeTLB: registered 512 MiB page size, pre-allocated 0 pages +> +[ 0.031986] HugeTLB: 448 KiB vmemmap can be freed for a 512 MiB page +> +[ 0.031986] HugeTLB: registered 2.00 MiB page size, pre-allocated 0 pages +> +[ 0.031986] HugeTLB: 0 KiB vmemmap can be freed for a 2.00 MiB page +When it hangs here, I drop into QEMU's console, attach to the gdbserver, and it +always reports that it is at address 0xffff800008dc42e8 (as shown below from an +objdump of the vmlinux). I note this is in the middle of messing with timer +system registers - which makes me suspect we're attempting to take the iothread +lock when its already held: + +> +ffff800008dc42b8 : +> +ffff800008dc42b8: d503201f nop +> +ffff800008dc42bc: d503201f nop +> +ffff800008dc42c0: d503233f paciasp +> +ffff800008dc42c4: d53be321 mrs x1, cntv_ctl_el0 +> +ffff800008dc42c8: 32000021 orr w1, w1, #0x1 +> +ffff800008dc42cc: d5033fdf isb +> +ffff800008dc42d0: d53be042 mrs x2, cntvct_el0 +> +ffff800008dc42d4: ca020043 eor x3, x2, x2 +> +ffff800008dc42d8: 8b2363e3 add x3, sp, x3 +> +ffff800008dc42dc: f940007f ldr xzr, [x3] +> +ffff800008dc42e0: 8b020000 add x0, x0, x2 +> +ffff800008dc42e4: d51be340 msr cntv_cval_el0, x0 +> +* ffff800008dc42e8: 927ef820 and x0, x1, #0xfffffffffffffffd +> +ffff800008dc42ec: d51be320 msr cntv_ctl_el0, x0 +> +ffff800008dc42f0: d5033fdf isb +> +ffff800008dc42f4: 52800000 mov w0, #0x0 +> +// #0 +> +ffff800008dc42f8: d50323bf autiasp +> +ffff800008dc42fc: d65f03c0 ret +The second behavior is that prior to Peter's "icount: Take iothread lock when +running QEMU timers" patch [1], I observe the following message (same command +as above): + +> +ERROR:../accel/tcg/tcg-accel-ops.c:79:tcg_handle_interrupt: assertion failed: +> +(qemu_mutex_iothread_locked()) +> +Aborted (core dumped) +This is the same behavior described in Gitlab issue 1130 [0] and addressed by +[1]. I bisected the appearance of this assertion, and found it was introduced +by Pavel's "replay: rewrite async event handling" commit [2]. Commits prior to +that one boot successfully (neither assertions nor hangs) with `-icount 0 -smp +2`. + +I've looked over these two commits ([1], [2]), but it is not obvious to me +how/why they might be interacting to produce the boot hangs I'm seeing and +I welcome any help investigating further. + +Thanks! + +-Aaron Lindsay + +[0] - +https://gitlab.com/qemu-project/qemu/-/issues/1130 +[1] - +https://gitlab.com/qemu-project/qemu/-/commit/c7f26ded6d5065e4116f630f6a490b55f6c5f58e +[2] - +https://gitlab.com/qemu-project/qemu/-/commit/60618e2d77691e44bb78e23b2b0cf07b5c405e56 + +On Fri, 21 Oct 2022 at 16:48, Aaron Lindsay + wrote: +> +> +Hello, +> +> +I am encountering one or more bugs when using -icount and -smp >1 that I am +> +attempting to sort out. My current theory is that it is an iothread locking +> +issue. +Weird coincidence, that is a bug that's been in the tree for months +but was only reported to me earlier this week. Try reverting +commit a82fd5a4ec24d923ff1e -- that should fix it. 
+CAFEAcA_i8x00hD-4XX18ySLNbCB6ds1-DSazVb4yDnF8skjd9A@mail.gmail.com +/">https://lore.kernel.org/qemu-devel/ +CAFEAcA_i8x00hD-4XX18ySLNbCB6ds1-DSazVb4yDnF8skjd9A@mail.gmail.com +/ +has the explanation. + +thanks +-- PMM + +On Oct 21 17:00, Peter Maydell wrote: +> +On Fri, 21 Oct 2022 at 16:48, Aaron Lindsay +> + wrote: +> +> +> +> Hello, +> +> +> +> I am encountering one or more bugs when using -icount and -smp >1 that I am +> +> attempting to sort out. My current theory is that it is an iothread locking +> +> issue. +> +> +Weird coincidence, that is a bug that's been in the tree for months +> +but was only reported to me earlier this week. Try reverting +> +commit a82fd5a4ec24d923ff1e -- that should fix it. +I can confirm that reverting a82fd5a4ec24d923ff1e fixes it for me. +Thanks for the help and fast response! + +-Aaron + diff --git a/classification_output/01/instruction/50773216 b/classification_output/01/instruction/50773216 new file mode 100644 index 000000000..d887fe7b5 --- /dev/null +++ b/classification_output/01/instruction/50773216 @@ -0,0 +1,110 @@ +instruction: 0.768 +other: 0.737 +semantic: 0.669 +mistranslation: 0.652 + +[Qemu-devel] Can I have someone's feedback on [bug 1809075] Concurrency bug on keyboard events: capslock LED messing up keycode streams causes character misses at guest kernel + +Hi everyone. +Can I please have someone's feedback on this bug? +https://bugs.launchpad.net/qemu/+bug/1809075 +Briefly, guest OS loses characters sent to it via vnc. And I spot the +bug in relation to ps2 driver. +I'm thinking of possible fixes and I might want to use a memory barrier. +But I would really like to have some suggestion from a qemu developer +first. For example, can we brutally drop capslock LED key events in ps2 +queue? +It is actually relevant to openQA, an automated QA tool for openSUSE. +And this bug blocks a few test cases for us. +Thank you in advance! + +Kind regards, +Gao Zhiyuan + +Cc'ing Marc-André & Gerd. + +On 12/19/18 10:31 AM, Gao Zhiyuan wrote: +> +Hi everyone. +> +> +Can I please have someone's feedback on this bug? +> +https://bugs.launchpad.net/qemu/+bug/1809075 +> +Briefly, guest OS loses characters sent to it via vnc. And I spot the +> +bug in relation to ps2 driver. +> +> +I'm thinking of possible fixes and I might want to use a memory barrier. +> +But I would really like to have some suggestion from a qemu developer +> +first. For example, can we brutally drop capslock LED key events in ps2 +> +queue? +> +> +It is actually relevant to openQA, an automated QA tool for openSUSE. +> +And this bug blocks a few test cases for us. +> +> +Thank you in advance! +> +> +Kind regards, +> +Gao Zhiyuan +> + +On Thu, Jan 03, 2019 at 12:05:54PM +0100, Philippe Mathieu-Daudé wrote: +> +Cc'ing Marc-André & Gerd. +> +> +On 12/19/18 10:31 AM, Gao Zhiyuan wrote: +> +> Hi everyone. +> +> +> +> Can I please have someone's feedback on this bug? +> +> +https://bugs.launchpad.net/qemu/+bug/1809075 +> +> Briefly, guest OS loses characters sent to it via vnc. And I spot the +> +> bug in relation to ps2 driver. +> +> +> +> I'm thinking of possible fixes and I might want to use a memory barrier. +> +> But I would really like to have some suggestion from a qemu developer +> +> first. For example, can we brutally drop capslock LED key events in ps2 +> +> queue? +There is no "capslock LED key event". 0xfa is KBD_REPLY_ACK, and the +device queues it in response to guest port writes. Yes, the ack can +race with actual key events. But IMO that isn't a bug in qemu. 
+ +Probably the linux kernel just throws away everything until it got the +ack for the port write, and that way the key event gets lost. On +physical hardware you will not notice because it is next to impossible +to type fast enough to hit the race window. + +So, go fix the kernel. + +Alternatively fix vncdotool to send uppercase letters properly with +shift key pressed. Then qemu wouldn't generate capslock key events +(that happens because qemu thinks guest and host capslock state is out +of sync) and the guests's capslock led update request wouldn't get into +the way. + +cheers, + Gerd + diff --git a/classification_output/01/instruction/51610399 b/classification_output/01/instruction/51610399 new file mode 100644 index 000000000..a78585284 --- /dev/null +++ b/classification_output/01/instruction/51610399 @@ -0,0 +1,308 @@ +instruction: 0.985 +other: 0.985 +semantic: 0.984 +mistranslation: 0.983 + +[BUG][powerpc] KVM Guest Boot Failure – Hangs at "Booting Linux via __start()” + +Bug Description: +Encountering a boot failure when launching a KVM guest with +qemu-system-ppc64. The guest hangs at boot, and the QEMU monitor +crashes. +Reproduction Steps: +# qemu-system-ppc64 --version +QEMU emulator version 9.2.50 (v9.2.0-2799-g0462a32b4f) +Copyright (c) 2003-2025 Fabrice Bellard and the QEMU Project developers +# /usr/bin/qemu-system-ppc64 -name avocado-vt-vm1 -machine +pseries,accel=kvm \ +-m 32768 -smp 32,sockets=1,cores=32,threads=1 -nographic \ + -device virtio-scsi-pci,id=scsi \ +-drive +file=/home/kvmci/tests/data/avocado-vt/images/rhel8.0devel-ppc64le.qcow2,if=none,id=drive0,format=qcow2 +\ +-device scsi-hd,drive=drive0,bus=scsi.0 \ + -netdev bridge,id=net0,br=virbr0 \ + -device virtio-net-pci,netdev=net0 \ + -serial pty \ + -device virtio-balloon-pci \ + -cpu host +QEMU 9.2.50 monitor - type 'help' for more information +char device redirected to /dev/pts/2 (label serial0) +(qemu) +(qemu) qemu-system-ppc64: warning: kernel_irqchip allowed but +unavailable: IRQ_XIVE capability must be present for KVM +Falling back to kernel-irqchip=off +** Qemu Hang + +(In another ssh session) +# screen /dev/pts/2 +Preparing to boot Linux version 6.10.4-200.fc40.ppc64le +(mockbuild@c23cc4e677614c34bb22d54eeea4dc1f) (gcc (GCC) 14.2.1 20240801 +(Red Hat 14.2.1-1), GNU ld version 2.41-37.fc40) #1 SMP Sun Aug 11 +15:20:17 UTC 2024 +Detected machine type: 0000000000000101 +command line: +BOOT_IMAGE=(ieee1275/disk,msdos2)/vmlinuz-6.10.4-200.fc40.ppc64le +root=/dev/mapper/fedora-root ro rd.lvm.lv=fedora/root crashkernel=1024M +Max number of cores passed to firmware: 2048 (NR_CPUS = 2048) +Calling ibm,client-architecture-support... done +memory layout at init: + memory_limit : 0000000000000000 (16 MB aligned) + alloc_bottom : 0000000008200000 + alloc_top : 0000000030000000 + alloc_top_hi : 0000000800000000 + rmo_top : 0000000030000000 + ram_top : 0000000800000000 +instantiating rtas at 0x000000002fff0000... done +prom_hold_cpus: skipped +copying OF device tree... +Building dt strings... +Building dt structure... +Device tree strings 0x0000000008210000 -> 0x0000000008210bd0 +Device tree struct 0x0000000008220000 -> 0x0000000008230000 +Quiescing Open Firmware ... +Booting Linux via __start() @ 0x0000000000440000 ... 
+** Guest Console Hang + + +Git Bisect: +Performing git bisect points to the following patch: +# git bisect bad +e8291ec16da80566c121c68d9112be458954d90b is the first bad commit +commit e8291ec16da80566c121c68d9112be458954d90b (HEAD) +Author: Nicholas Piggin +Date: Thu Dec 19 13:40:31 2024 +1000 + + target/ppc: fix timebase register reset state +(H)DEC and PURR get reset before icount does, which causes them to +be +skewed and not match the init state. This can cause replay to not +match the recorded trace exactly. For DEC and HDEC this is usually +not +noticable since they tend to get programmed before affecting the + target machine. PURR has been observed to cause replay bugs when + running Linux. + + Fix this by resetting using a time of 0. + + Message-ID: <20241219034035.1826173-2-npiggin@gmail.com> + Signed-off-by: Nicholas Piggin + + hw/ppc/ppc.c | 11 ++++++++--- + 1 file changed, 8 insertions(+), 3 deletions(-) + + +Reverting the patch helps boot the guest. +Thanks, +Misbah Anjum N + +Thanks for the report. + +Tricky problem. A secondary CPU is hanging before it is started by the +primary via rtas call. + +That secondary keeps calling kvm_cpu_exec(), which keeps exiting out +early with EXCP_HLT because kvm_arch_process_async_events() returns +true because that cpu has ->halted=1. That just goes around he run +loop because there is an interrupt pending (DEC). + +So it never runs. It also never releases the BQL, and another CPU, +the primary which is actually supposed to be running, is stuck in +spapr_set_all_lpcrs() in run_on_cpu() waiting for the BQL. + +This patch just exposes the bug I think, by causing the interrupt. +although I'm not quite sure why it's okay previously (-ve decrementer +values should be causing a timer exception too). The timer exception +should not be taken as an interrupt by those secondary CPUs, and it +doesn't because it is masked, until set_all_lpcrs sets an LPCR value +that enables powersave wakeup on decrementer interrupt. + +The start_powered_off sate just sets ->halted, which makes it look +like a powersaving state. Logically I think it's not the same thing +as far as spapr goes. I don't know why start_powered_off only sets +->halted, and not ->stop/stopped as well. + +Not sure how best to solve it cleanly. I'll send a revert if I can't +get something working soon. + +Thanks, +Nick + +On Tue Mar 18, 2025 at 7:09 AM AEST, misanjum wrote: +> +Bug Description: +> +Encountering a boot failure when launching a KVM guest with +> +qemu-system-ppc64. The guest hangs at boot, and the QEMU monitor +> +crashes. 
+> +> +> +Reproduction Steps: +> +# qemu-system-ppc64 --version +> +QEMU emulator version 9.2.50 (v9.2.0-2799-g0462a32b4f) +> +Copyright (c) 2003-2025 Fabrice Bellard and the QEMU Project developers +> +> +# /usr/bin/qemu-system-ppc64 -name avocado-vt-vm1 -machine +> +pseries,accel=kvm \ +> +-m 32768 -smp 32,sockets=1,cores=32,threads=1 -nographic \ +> +-device virtio-scsi-pci,id=scsi \ +> +-drive +> +file=/home/kvmci/tests/data/avocado-vt/images/rhel8.0devel-ppc64le.qcow2,if=none,id=drive0,format=qcow2 +> +> +\ +> +-device scsi-hd,drive=drive0,bus=scsi.0 \ +> +-netdev bridge,id=net0,br=virbr0 \ +> +-device virtio-net-pci,netdev=net0 \ +> +-serial pty \ +> +-device virtio-balloon-pci \ +> +-cpu host +> +QEMU 9.2.50 monitor - type 'help' for more information +> +char device redirected to /dev/pts/2 (label serial0) +> +(qemu) +> +(qemu) qemu-system-ppc64: warning: kernel_irqchip allowed but +> +unavailable: IRQ_XIVE capability must be present for KVM +> +Falling back to kernel-irqchip=off +> +** Qemu Hang +> +> +(In another ssh session) +> +# screen /dev/pts/2 +> +Preparing to boot Linux version 6.10.4-200.fc40.ppc64le +> +(mockbuild@c23cc4e677614c34bb22d54eeea4dc1f) (gcc (GCC) 14.2.1 20240801 +> +(Red Hat 14.2.1-1), GNU ld version 2.41-37.fc40) #1 SMP Sun Aug 11 +> +15:20:17 UTC 2024 +> +Detected machine type: 0000000000000101 +> +command line: +> +BOOT_IMAGE=(ieee1275/disk,msdos2)/vmlinuz-6.10.4-200.fc40.ppc64le +> +root=/dev/mapper/fedora-root ro rd.lvm.lv=fedora/root crashkernel=1024M +> +Max number of cores passed to firmware: 2048 (NR_CPUS = 2048) +> +Calling ibm,client-architecture-support... done +> +memory layout at init: +> +memory_limit : 0000000000000000 (16 MB aligned) +> +alloc_bottom : 0000000008200000 +> +alloc_top : 0000000030000000 +> +alloc_top_hi : 0000000800000000 +> +rmo_top : 0000000030000000 +> +ram_top : 0000000800000000 +> +instantiating rtas at 0x000000002fff0000... done +> +prom_hold_cpus: skipped +> +copying OF device tree... +> +Building dt strings... +> +Building dt structure... +> +Device tree strings 0x0000000008210000 -> 0x0000000008210bd0 +> +Device tree struct 0x0000000008220000 -> 0x0000000008230000 +> +Quiescing Open Firmware ... +> +Booting Linux via __start() @ 0x0000000000440000 ... +> +** Guest Console Hang +> +> +> +Git Bisect: +> +Performing git bisect points to the following patch: +> +# git bisect bad +> +e8291ec16da80566c121c68d9112be458954d90b is the first bad commit +> +commit e8291ec16da80566c121c68d9112be458954d90b (HEAD) +> +Author: Nicholas Piggin +> +Date: Thu Dec 19 13:40:31 2024 +1000 +> +> +target/ppc: fix timebase register reset state +> +> +(H)DEC and PURR get reset before icount does, which causes them to +> +be +> +skewed and not match the init state. This can cause replay to not +> +match the recorded trace exactly. For DEC and HDEC this is usually +> +not +> +noticable since they tend to get programmed before affecting the +> +target machine. PURR has been observed to cause replay bugs when +> +running Linux. +> +> +Fix this by resetting using a time of 0. +> +> +Message-ID: <20241219034035.1826173-2-npiggin@gmail.com> +> +Signed-off-by: Nicholas Piggin +> +> +hw/ppc/ppc.c | 11 ++++++++--- +> +1 file changed, 8 insertions(+), 3 deletions(-) +> +> +> +Reverting the patch helps boot the guest. 
+> +Thanks, +> +Misbah Anjum N + diff --git a/classification_output/01/instruction/55961334 b/classification_output/01/instruction/55961334 new file mode 100644 index 000000000..80cdabd29 --- /dev/null +++ b/classification_output/01/instruction/55961334 @@ -0,0 +1,39 @@ +instruction: 0.803 +semantic: 0.775 +mistranslation: 0.718 +other: 0.715 + +[Bug] "-ht" flag ignored under KVM - guest still reports HT + +Hi Community, +We have observed that the 'ht' feature bit cannot be disabled when QEMU runs +with KVM acceleration. +qemu-system-x86_64 \ + --enable-kvm \ + -machine q35 \ + -cpu host,-ht \ + -smp 4 \ + -m 4G \ + -drive file=rootfs.img,format=raw \ + -nographic \ + -append 'console=ttyS0 root=/dev/sda rw' +Because '-ht' is specified, the guest should expose no HT capability +(cpuid.1.edx[28] = 0), and /proc/cpuinfo shouldn't show HT feature, but we still +saw ht in linux guest when run 'cat /proc/cpuinfo'. +XiaoYao mentioned that: + +It has been the behavior of QEMU since + + commit 400281af34e5ee6aa9f5496b53d8f82c6fef9319 + Author: Andre Przywara + Date: Wed Aug 19 15:42:42 2009 +0200 + + set CPUID bits to present cores and threads topology + +that we cannot remove HT CPUID bit from guest via "-cpu xxx,-ht" if the +VM has >= 2 vcpus. +I'd like to know whether there's a plan to address this issue, or if the current +behaviour is considered acceptable. +Best regards, +Ewan. + diff --git a/classification_output/01/instruction/5843372 b/classification_output/01/instruction/5843372 deleted file mode 100644 index 784962c9c..000000000 --- a/classification_output/01/instruction/5843372 +++ /dev/null @@ -1,2056 +0,0 @@ -instruction: 0.818 -other: 0.811 -semantic: 0.793 -mistranslation: 0.758 - -[BUG, RFC] Block graph deadlock on job-dismiss - -Hi all, - -There's a bug in block layer which leads to block graph deadlock. -Notably, it takes place when blockdev IO is processed within a separate -iothread. - -This was initially caught by our tests, and I was able to reduce it to a -relatively simple reproducer. Such deadlocks are probably supposed to -be covered in iotests/graph-changes-while-io, but this deadlock isn't. - -Basically what the reproducer does is launches QEMU with a drive having -'iothread' option set, creates a chain of 2 snapshots, launches -block-commit job for a snapshot and then dismisses the job, starting -from the lower snapshot. If the guest is issuing IO at the same time, -there's a race in acquiring block graph lock and a potential deadlock. - -Here's how it can be reproduced: - -1. Run QEMU: -> -SRCDIR=/path/to/srcdir -> -> -> -> -> -$SRCDIR/build/qemu-system-x86_64 -enable-kvm \ -> -> --machine q35 -cpu Nehalem \ -> -> --name guest=alma8-vm,debug-threads=on \ -> -> --m 2g -smp 2 \ -> -> --nographic -nodefaults \ -> -> --qmp unix:/var/run/alma8-qmp.sock,server=on,wait=off \ -> -> --serial unix:/var/run/alma8-serial.sock,server=on,wait=off \ -> -> --object iothread,id=iothread0 \ -> -> --blockdev -> -node-name=disk,driver=qcow2,file.driver=file,file.filename=/path/to/img/alma8.qcow2 -> -\ -> --device virtio-blk-pci,drive=disk,iothread=iothread0 -2. Launch IO (random reads) from within the guest: -> -nc -U /var/run/alma8-serial.sock -> -... -> -[root@alma8-vm ~]# fio --name=randread --ioengine=libaio --direct=1 --bs=4k -> ---size=1G --numjobs=1 --time_based=1 --runtime=300 --group_reporting -> ---rw=randread --iodepth=1 --filename=/testfile -3. 
Run snapshots creation & removal of lower snapshot operation in a -loop (script attached): -> -while /bin/true ; do ./remove_lower_snap.sh ; done -And then it occasionally hangs. - -Note: I've tried bisecting this, and looks like deadlock occurs starting -from the following commit: - -(BAD) 5bdbaebcce virtio: Re-enable notifications after drain -(GOOD) c42c3833e0 virtio-scsi: Attach event vq notifier with no_poll - -On the latest v10.0.0 it does hang as well. - - -Here's backtrace of the main thread: - -> -#0 0x00007fc547d427ce in __ppoll (fds=0x557eb79657b0, nfds=1, -> -timeout=, sigmask=0x0) at ../sysdeps/unix/sysv/linux/ppoll.c:43 -> -#1 0x0000557eb47d955c in qemu_poll_ns (fds=0x557eb79657b0, nfds=1, -> -timeout=-1) at ../util/qemu-timer.c:329 -> -#2 0x0000557eb47b2204 in fdmon_poll_wait (ctx=0x557eb76c5f20, -> -ready_list=0x7ffd94b4edd8, timeout=-1) at ../util/fdmon-poll.c:79 -> -#3 0x0000557eb47b1c45 in aio_poll (ctx=0x557eb76c5f20, blocking=true) at -> -../util/aio-posix.c:730 -> -#4 0x0000557eb4621edd in bdrv_do_drained_begin (bs=0x557eb795e950, -> -parent=0x0, poll=true) at ../block/io.c:378 -> -#5 0x0000557eb4621f7b in bdrv_drained_begin (bs=0x557eb795e950) at -> -../block/io.c:391 -> -#6 0x0000557eb45ec125 in bdrv_change_aio_context (bs=0x557eb795e950, -> -ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, -> -errp=0x0) -> -at ../block.c:7682 -> -#7 0x0000557eb45ebf2b in bdrv_child_change_aio_context (c=0x557eb7964250, -> -ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, -> -errp=0x0) -> -at ../block.c:7608 -> -#8 0x0000557eb45ec0c4 in bdrv_change_aio_context (bs=0x557eb79575e0, -> -ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, -> -errp=0x0) -> -at ../block.c:7668 -> -#9 0x0000557eb45ebf2b in bdrv_child_change_aio_context (c=0x557eb7e59110, -> -ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, -> -errp=0x0) -> -at ../block.c:7608 -> -#10 0x0000557eb45ec0c4 in bdrv_change_aio_context (bs=0x557eb7e51960, -> -ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, -> -errp=0x0) -> -at ../block.c:7668 -> -#11 0x0000557eb45ebf2b in bdrv_child_change_aio_context (c=0x557eb814ed80, -> -ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, -> -errp=0x0) -> -at ../block.c:7608 -> -#12 0x0000557eb45ee8e4 in child_job_change_aio_ctx (c=0x557eb7c9d3f0, -> -ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, -> -errp=0x0) -> -at ../blockjob.c:157 -> -#13 0x0000557eb45ebe2d in bdrv_parent_change_aio_context (c=0x557eb7c9d3f0, -> -ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, -> -errp=0x0) -> -at ../block.c:7592 -> -#14 0x0000557eb45ec06b in bdrv_change_aio_context (bs=0x557eb7d74310, -> -ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, -> -errp=0x0) -> -at ../block.c:7661 -> -#15 0x0000557eb45dcd7e in bdrv_child_cb_change_aio_ctx -> -(child=0x557eb8565af0, ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = -> -{...}, tran=0x557eb7a87160, errp=0x0) at ../block.c:1234 -> -#16 0x0000557eb45ebe2d in bdrv_parent_change_aio_context (c=0x557eb8565af0, -> -ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, -> -errp=0x0) -> -at ../block.c:7592 -> -#17 0x0000557eb45ec06b in bdrv_change_aio_context (bs=0x557eb79575e0, -> -ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, -> -errp=0x0) -> -at ../block.c:7661 -> -#18 0x0000557eb45ec1f3 in bdrv_try_change_aio_context 
(bs=0x557eb79575e0, -> -ctx=0x557eb76c5f20, ignore_child=0x0, errp=0x0) at ../block.c:7715 -> -#19 0x0000557eb45e1b15 in bdrv_root_unref_child (child=0x557eb7966f30) at -> -../block.c:3317 -> -#20 0x0000557eb45eeaa8 in block_job_remove_all_bdrv (job=0x557eb7952800) at -> -../blockjob.c:209 -> -#21 0x0000557eb45ee641 in block_job_free (job=0x557eb7952800) at -> -../blockjob.c:82 -> -#22 0x0000557eb45f17af in job_unref_locked (job=0x557eb7952800) at -> -../job.c:474 -> -#23 0x0000557eb45f257d in job_do_dismiss_locked (job=0x557eb7952800) at -> -../job.c:771 -> -#24 0x0000557eb45f25fe in job_dismiss_locked (jobptr=0x7ffd94b4f400, -> -errp=0x7ffd94b4f488) at ../job.c:783 -> ---Type for more, q to quit, c to continue without paging-- -> -#25 0x0000557eb45d8e84 in qmp_job_dismiss (id=0x557eb7aa42b0 "commit-snap1", -> -errp=0x7ffd94b4f488) at ../job-qmp.c:138 -> -#26 0x0000557eb472f6a3 in qmp_marshal_job_dismiss (args=0x7fc52c00a3b0, -> -ret=0x7fc53c880da8, errp=0x7fc53c880da0) at qapi/qapi-commands-job.c:221 -> -#27 0x0000557eb47a35f3 in do_qmp_dispatch_bh (opaque=0x7fc53c880e40) at -> -../qapi/qmp-dispatch.c:128 -> -#28 0x0000557eb47d1cd2 in aio_bh_call (bh=0x557eb79568f0) at -> -../util/async.c:172 -> -#29 0x0000557eb47d1df5 in aio_bh_poll (ctx=0x557eb76c0200) at -> -../util/async.c:219 -> -#30 0x0000557eb47b12f3 in aio_dispatch (ctx=0x557eb76c0200) at -> -../util/aio-posix.c:436 -> -#31 0x0000557eb47d2266 in aio_ctx_dispatch (source=0x557eb76c0200, -> -callback=0x0, user_data=0x0) at ../util/async.c:361 -> -#32 0x00007fc549232f4f in g_main_dispatch (context=0x557eb76c6430) at -> -../glib/gmain.c:3364 -> -#33 g_main_context_dispatch (context=0x557eb76c6430) at ../glib/gmain.c:4079 -> -#34 0x0000557eb47d3ab1 in glib_pollfds_poll () at ../util/main-loop.c:287 -> -#35 0x0000557eb47d3b38 in os_host_main_loop_wait (timeout=0) at -> -../util/main-loop.c:310 -> -#36 0x0000557eb47d3c58 in main_loop_wait (nonblocking=0) at -> -../util/main-loop.c:589 -> -#37 0x0000557eb4218b01 in qemu_main_loop () at ../system/runstate.c:835 -> -#38 0x0000557eb46df166 in qemu_default_main (opaque=0x0) at -> -../system/main.c:50 -> -#39 0x0000557eb46df215 in main (argc=24, argv=0x7ffd94b4f8d8) at -> -../system/main.c:80 -And here's coroutine trying to acquire read lock: - -> -(gdb) qemu coroutine reader_queue->entries.sqh_first -> -#0 0x0000557eb47d7068 in qemu_coroutine_switch (from_=0x557eb7aa48b0, -> -to_=0x7fc537fff508, action=COROUTINE_YIELD) at -> -../util/coroutine-ucontext.c:321 -> -#1 0x0000557eb47d4d4a in qemu_coroutine_yield () at -> -../util/qemu-coroutine.c:339 -> -#2 0x0000557eb47d56c8 in qemu_co_queue_wait_impl (queue=0x557eb59954c0 -> -, lock=0x7fc53c57de50, flags=0) at -> -../util/qemu-coroutine-lock.c:60 -> -#3 0x0000557eb461fea7 in bdrv_graph_co_rdlock () at ../block/graph-lock.c:231 -> -#4 0x0000557eb460c81a in graph_lockable_auto_lock (x=0x7fc53c57dee3) at -> -/home/root/src/qemu/master/include/block/graph-lock.h:213 -> -#5 0x0000557eb460fa41 in blk_co_do_preadv_part -> -(blk=0x557eb84c0810, offset=6890553344, bytes=4096, qiov=0x7fc530006988, -> -qiov_offset=0, flags=BDRV_REQ_REGISTERED_BUF) at ../block/block-backend.c:1339 -> -#6 0x0000557eb46104d7 in blk_aio_read_entry (opaque=0x7fc530003240) at -> -../block/block-backend.c:1619 -> -#7 0x0000557eb47d6c40 in coroutine_trampoline (i0=-1213577040, i1=21886) at -> -../util/coroutine-ucontext.c:175 -> -#8 0x00007fc547c2a360 in __start_context () at -> -../sysdeps/unix/sysv/linux/x86_64/__start_context.S:91 -> -#9 0x00007ffd94b4ea40 in () -> -#10 
0x0000000000000000 in () -So it looks like main thread is processing job-dismiss request and is -holding write lock taken in block_job_remove_all_bdrv() (frame #20 -above). At the same time iothread spawns a coroutine which performs IO -request. Before the coroutine is spawned, blk_aio_prwv() increases -'in_flight' counter for Blk. Then blk_co_do_preadv_part() (frame #5) is -trying to acquire the read lock. But main thread isn't releasing the -lock as blk_root_drained_poll() returns true since blk->in_flight > 0. -Here's the deadlock. - -Any comments and suggestions on the subject are welcomed. Thanks! - -Andrey -remove_lower_snap.sh -Description: -application/shellscript - -On 4/24/25 8:32 PM, Andrey Drobyshev wrote: -> -Hi all, -> -> -There's a bug in block layer which leads to block graph deadlock. -> -Notably, it takes place when blockdev IO is processed within a separate -> -iothread. -> -> -This was initially caught by our tests, and I was able to reduce it to a -> -relatively simple reproducer. Such deadlocks are probably supposed to -> -be covered in iotests/graph-changes-while-io, but this deadlock isn't. -> -> -Basically what the reproducer does is launches QEMU with a drive having -> -'iothread' option set, creates a chain of 2 snapshots, launches -> -block-commit job for a snapshot and then dismisses the job, starting -> -from the lower snapshot. If the guest is issuing IO at the same time, -> -there's a race in acquiring block graph lock and a potential deadlock. -> -> -Here's how it can be reproduced: -> -> -[...] -> -I took a closer look at iotests/graph-changes-while-io, and have managed -to reproduce the same deadlock in a much simpler setup, without a guest. - -1. Run QSD:> ./build/storage-daemon/qemu-storage-daemon --object -iothread,id=iothread0 \ -> ---blockdev null-co,node-name=node0,read-zeroes=true \ -> -> ---nbd-server addr.type=unix,addr.path=/var/run/qsd_nbd.sock \ -> -> ---export -> -nbd,id=exp0,node-name=node0,iothread=iothread0,fixed-iothread=true,writable=true -> -\ -> ---chardev -> -socket,id=qmp-sock,path=/var/run/qsd_qmp.sock,server=on,wait=off \ -> ---monitor chardev=qmp-sock -2. Launch IO: -> -qemu-img bench -f raw -c 2000000 -> -'nbd+unix:///node0?socket=/var/run/qsd_nbd.sock' -3. Add 2 snapshots and remove lower one (script attached):> while -/bin/true ; do ./rls_qsd.sh ; done - -And then it hangs. - -I'll also send a patch with corresponding test case added directly to -iotests. - -This reproduce seems to be hanging starting from Fiona's commit -67446e605dc ("blockjob: drop AioContext lock before calling -bdrv_graph_wrlock()"). AioContext locks were dropped entirely later on -in Stefan's commit b49f4755c7 ("block: remove AioContext locking"), but -the problem remains. - -Andrey -rls_qsd.sh -Description: -application/shellscript - -From: Andrey Drobyshev - -This case is catching potential deadlock which takes place when job-dismiss -is issued when I/O requests are processed in a separate iothread. 
- -See -https://mail.gnu.org/archive/html/qemu-devel/2025-04/msg04421.html -Signed-off-by: Andrey Drobyshev ---- - .../qemu-iotests/tests/graph-changes-while-io | 101 ++++++++++++++++-- - .../tests/graph-changes-while-io.out | 4 +- - 2 files changed, 96 insertions(+), 9 deletions(-) - -diff --git a/tests/qemu-iotests/tests/graph-changes-while-io -b/tests/qemu-iotests/tests/graph-changes-while-io -index 194fda500e..e30f823da4 100755 ---- a/tests/qemu-iotests/tests/graph-changes-while-io -+++ b/tests/qemu-iotests/tests/graph-changes-while-io -@@ -27,6 +27,8 @@ from iotests import imgfmt, qemu_img, qemu_img_create, -qemu_io, \ - - - top = os.path.join(iotests.test_dir, 'top.img') -+snap1 = os.path.join(iotests.test_dir, 'snap1.img') -+snap2 = os.path.join(iotests.test_dir, 'snap2.img') - nbd_sock = os.path.join(iotests.sock_dir, 'nbd.sock') - - -@@ -58,6 +60,15 @@ class TestGraphChangesWhileIO(QMPTestCase): - def tearDown(self) -> None: - self.qsd.stop() - -+ def _wait_for_blockjob(self, status) -> None: -+ done = False -+ while not done: -+ for event in self.qsd.get_qmp().get_events(wait=10.0): -+ if event['event'] != 'JOB_STATUS_CHANGE': -+ continue -+ if event['data']['status'] == status: -+ done = True -+ - def test_blockdev_add_while_io(self) -> None: - # Run qemu-img bench in the background - bench_thr = Thread(target=do_qemu_img_bench) -@@ -116,13 +127,89 @@ class TestGraphChangesWhileIO(QMPTestCase): - 'device': 'job0', - }) - -- cancelled = False -- while not cancelled: -- for event in self.qsd.get_qmp().get_events(wait=10.0): -- if event['event'] != 'JOB_STATUS_CHANGE': -- continue -- if event['data']['status'] == 'null': -- cancelled = True -+ self._wait_for_blockjob('null') -+ -+ bench_thr.join() -+ -+ def test_remove_lower_snapshot_while_io(self) -> None: -+ # Run qemu-img bench in the background -+ bench_thr = Thread(target=do_qemu_img_bench, args=(100000, )) -+ bench_thr.start() -+ -+ # While I/O is performed on 'node0' node, consequently add 2 snapshots -+ # on top of it, then remove (commit) them starting from lower one. 
-+ while bench_thr.is_alive(): -+ # Recreate snapshot images on every iteration -+ qemu_img_create('-f', imgfmt, snap1, '1G') -+ qemu_img_create('-f', imgfmt, snap2, '1G') -+ -+ self.qsd.cmd('blockdev-add', { -+ 'driver': imgfmt, -+ 'node-name': 'snap1', -+ 'file': { -+ 'driver': 'file', -+ 'filename': snap1 -+ } -+ }) -+ -+ self.qsd.cmd('blockdev-snapshot', { -+ 'node': 'node0', -+ 'overlay': 'snap1', -+ }) -+ -+ self.qsd.cmd('blockdev-add', { -+ 'driver': imgfmt, -+ 'node-name': 'snap2', -+ 'file': { -+ 'driver': 'file', -+ 'filename': snap2 -+ } -+ }) -+ -+ self.qsd.cmd('blockdev-snapshot', { -+ 'node': 'snap1', -+ 'overlay': 'snap2', -+ }) -+ -+ self.qsd.cmd('block-commit', { -+ 'job-id': 'commit-snap1', -+ 'device': 'snap2', -+ 'top-node': 'snap1', -+ 'base-node': 'node0', -+ 'auto-finalize': True, -+ 'auto-dismiss': False, -+ }) -+ -+ self._wait_for_blockjob('concluded') -+ self.qsd.cmd('job-dismiss', { -+ 'id': 'commit-snap1', -+ }) -+ -+ self.qsd.cmd('block-commit', { -+ 'job-id': 'commit-snap2', -+ 'device': 'snap2', -+ 'top-node': 'snap2', -+ 'base-node': 'node0', -+ 'auto-finalize': True, -+ 'auto-dismiss': False, -+ }) -+ -+ self._wait_for_blockjob('ready') -+ self.qsd.cmd('job-complete', { -+ 'id': 'commit-snap2', -+ }) -+ -+ self._wait_for_blockjob('concluded') -+ self.qsd.cmd('job-dismiss', { -+ 'id': 'commit-snap2', -+ }) -+ -+ self.qsd.cmd('blockdev-del', { -+ 'node-name': 'snap1' -+ }) -+ self.qsd.cmd('blockdev-del', { -+ 'node-name': 'snap2' -+ }) - - bench_thr.join() - -diff --git a/tests/qemu-iotests/tests/graph-changes-while-io.out -b/tests/qemu-iotests/tests/graph-changes-while-io.out -index fbc63e62f8..8d7e996700 100644 ---- a/tests/qemu-iotests/tests/graph-changes-while-io.out -+++ b/tests/qemu-iotests/tests/graph-changes-while-io.out -@@ -1,5 +1,5 @@ --.. -+... - ---------------------------------------------------------------------- --Ran 2 tests -+Ran 3 tests - - OK --- -2.43.5 - -Am 24.04.25 um 19:32 schrieb Andrey Drobyshev: -> -So it looks like main thread is processing job-dismiss request and is -> -holding write lock taken in block_job_remove_all_bdrv() (frame #20 -> -above). At the same time iothread spawns a coroutine which performs IO -> -request. Before the coroutine is spawned, blk_aio_prwv() increases -> -'in_flight' counter for Blk. Then blk_co_do_preadv_part() (frame #5) is -> -trying to acquire the read lock. But main thread isn't releasing the -> -lock as blk_root_drained_poll() returns true since blk->in_flight > 0. -> -Here's the deadlock. -And for the IO test you provided, it's client->nb_requests that behaves -similarly to blk->in_flight here. - -The issue also reproduces easily when issuing the following QMP command -in a loop while doing IO on a device: - -> -void qmp_block_locked_drain(const char *node_name, Error **errp) -> -{ -> -BlockDriverState *bs; -> -> -bs = bdrv_find_node(node_name); -> -if (!bs) { -> -error_setg(errp, "node not found"); -> -return; -> -} -> -> -bdrv_graph_wrlock(); -> -bdrv_drained_begin(bs); -> -bdrv_drained_end(bs); -> -bdrv_graph_wrunlock(); -> -} -It seems like either it would be necessary to require: -1. not draining inside an exclusively locked section -or -2. making sure that variables used by drained_poll routines are only set -while holding the reader lock -? 
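For illustration, the following self-contained C sketch (plain pthreads, not QEMU's primitives) models the ordering problem the two requirements above are aimed at: a writer that waits for in-flight requests while already holding the exclusive lock cannot make progress once a request has bumped its in-flight counter and blocked on the shared lock. The sketch uses the safe order (quiesce first, then take the write lock); swapping those two steps in writer() reproduces the hang discussed in this thread. All names (graph, in_flight, quiesced) are illustrative assumptions.

/* Model of the drain-vs-graph-lock ordering. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_rwlock_t graph = PTHREAD_RWLOCK_INITIALIZER; /* models the graph lock   */
static atomic_int in_flight;                                /* models blk->in_flight   */
static atomic_bool quiesced;                                /* models a drained section */

static void *io_worker(void *arg)
{
    (void)arg;
    for (int i = 0; i < 100000; i++) {
        if (atomic_load(&quiesced)) {       /* models waiting while drained          */
            continue;
        }
        atomic_fetch_add(&in_flight, 1);    /* request now counted as in flight      */
        pthread_rwlock_rdlock(&graph);      /* models taking the reader (graph) lock */
        /* ... do I/O ... */
        pthread_rwlock_unlock(&graph);
        atomic_fetch_sub(&in_flight, 1);
    }
    return NULL;
}

static void writer(void)
{
    /* Safe order: quiesce and wait for in-flight requests first,
     * then take the exclusive lock.  Taking the write lock first and
     * polling in_flight afterwards is the deadlock shape above. */
    atomic_store(&quiesced, true);
    while (atomic_load(&in_flight) > 0) {
        usleep(100);
    }
    pthread_rwlock_wrlock(&graph);
    /* ... modify the graph ... */
    pthread_rwlock_unlock(&graph);
    atomic_store(&quiesced, false);
}

int main(void)
{
    pthread_t t;
    pthread_create(&t, NULL, io_worker, NULL);
    writer();
    pthread_join(t, NULL);
    printf("done\n");
    return 0;
}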
- -Those seem to require rather involved changes, so a third option might -be to make draining inside an exclusively locked section possible, by -embedding such locked sections in a drained section: - -> -diff --git a/blockjob.c b/blockjob.c -> -index 32007f31a9..9b2f3b3ea9 100644 -> ---- a/blockjob.c -> -+++ b/blockjob.c -> -@@ -198,6 +198,7 @@ void block_job_remove_all_bdrv(BlockJob *job) -> -* one to make sure that such a concurrent access does not attempt -> -* to process an already freed BdrvChild. -> -*/ -> -+ bdrv_drain_all_begin(); -> -bdrv_graph_wrlock(); -> -while (job->nodes) { -> -GSList *l = job->nodes; -> -@@ -211,6 +212,7 @@ void block_job_remove_all_bdrv(BlockJob *job) -> -g_slist_free_1(l); -> -} -> -bdrv_graph_wrunlock(); -> -+ bdrv_drain_all_end(); -> -} -> -> -bool block_job_has_bdrv(BlockJob *job, BlockDriverState *bs) -This seems to fix the issue at hand. I can send a patch if this is -considered an acceptable approach. - -Best Regards, -Fiona - -On 4/30/25 11:47 AM, Fiona Ebner wrote: -> -Am 24.04.25 um 19:32 schrieb Andrey Drobyshev: -> -> So it looks like main thread is processing job-dismiss request and is -> -> holding write lock taken in block_job_remove_all_bdrv() (frame #20 -> -> above). At the same time iothread spawns a coroutine which performs IO -> -> request. Before the coroutine is spawned, blk_aio_prwv() increases -> -> 'in_flight' counter for Blk. Then blk_co_do_preadv_part() (frame #5) is -> -> trying to acquire the read lock. But main thread isn't releasing the -> -> lock as blk_root_drained_poll() returns true since blk->in_flight > 0. -> -> Here's the deadlock. -> -> -And for the IO test you provided, it's client->nb_requests that behaves -> -similarly to blk->in_flight here. -> -> -The issue also reproduces easily when issuing the following QMP command -> -in a loop while doing IO on a device: -> -> -> void qmp_block_locked_drain(const char *node_name, Error **errp) -> -> { -> -> BlockDriverState *bs; -> -> -> -> bs = bdrv_find_node(node_name); -> -> if (!bs) { -> -> error_setg(errp, "node not found"); -> -> return; -> -> } -> -> -> -> bdrv_graph_wrlock(); -> -> bdrv_drained_begin(bs); -> -> bdrv_drained_end(bs); -> -> bdrv_graph_wrunlock(); -> -> } -> -> -It seems like either it would be necessary to require: -> -1. not draining inside an exclusively locked section -> -or -> -2. making sure that variables used by drained_poll routines are only set -> -while holding the reader lock -> -? -> -> -Those seem to require rather involved changes, so a third option might -> -be to make draining inside an exclusively locked section possible, by -> -embedding such locked sections in a drained section: -> -> -> diff --git a/blockjob.c b/blockjob.c -> -> index 32007f31a9..9b2f3b3ea9 100644 -> -> --- a/blockjob.c -> -> +++ b/blockjob.c -> -> @@ -198,6 +198,7 @@ void block_job_remove_all_bdrv(BlockJob *job) -> -> * one to make sure that such a concurrent access does not attempt -> -> * to process an already freed BdrvChild. -> -> */ -> -> + bdrv_drain_all_begin(); -> -> bdrv_graph_wrlock(); -> -> while (job->nodes) { -> -> GSList *l = job->nodes; -> -> @@ -211,6 +212,7 @@ void block_job_remove_all_bdrv(BlockJob *job) -> -> g_slist_free_1(l); -> -> } -> -> bdrv_graph_wrunlock(); -> -> + bdrv_drain_all_end(); -> -> } -> -> -> -> bool block_job_has_bdrv(BlockJob *job, BlockDriverState *bs) -> -> -This seems to fix the issue at hand. I can send a patch if this is -> -considered an acceptable approach. 
-> -> -Best Regards, -> -Fiona -> -Hello Fiona, - -Thanks for looking into it. I've tried your 3rd option above and can -confirm it does fix the deadlock, at least I can't reproduce it. Other -iotests also don't seem to be breaking. So I personally am fine with -that patch. Would be nice to hear a word from the maintainers though on -whether there're any caveats with such approach. - -Andrey - -On Wed, Apr 30, 2025 at 10:11 AM Andrey Drobyshev - wrote: -> -> -On 4/30/25 11:47 AM, Fiona Ebner wrote: -> -> Am 24.04.25 um 19:32 schrieb Andrey Drobyshev: -> ->> So it looks like main thread is processing job-dismiss request and is -> ->> holding write lock taken in block_job_remove_all_bdrv() (frame #20 -> ->> above). At the same time iothread spawns a coroutine which performs IO -> ->> request. Before the coroutine is spawned, blk_aio_prwv() increases -> ->> 'in_flight' counter for Blk. Then blk_co_do_preadv_part() (frame #5) is -> ->> trying to acquire the read lock. But main thread isn't releasing the -> ->> lock as blk_root_drained_poll() returns true since blk->in_flight > 0. -> ->> Here's the deadlock. -> -> -> -> And for the IO test you provided, it's client->nb_requests that behaves -> -> similarly to blk->in_flight here. -> -> -> -> The issue also reproduces easily when issuing the following QMP command -> -> in a loop while doing IO on a device: -> -> -> ->> void qmp_block_locked_drain(const char *node_name, Error **errp) -> ->> { -> ->> BlockDriverState *bs; -> ->> -> ->> bs = bdrv_find_node(node_name); -> ->> if (!bs) { -> ->> error_setg(errp, "node not found"); -> ->> return; -> ->> } -> ->> -> ->> bdrv_graph_wrlock(); -> ->> bdrv_drained_begin(bs); -> ->> bdrv_drained_end(bs); -> ->> bdrv_graph_wrunlock(); -> ->> } -> -> -> -> It seems like either it would be necessary to require: -> -> 1. not draining inside an exclusively locked section -> -> or -> -> 2. making sure that variables used by drained_poll routines are only set -> -> while holding the reader lock -> -> ? -> -> -> -> Those seem to require rather involved changes, so a third option might -> -> be to make draining inside an exclusively locked section possible, by -> -> embedding such locked sections in a drained section: -> -> -> ->> diff --git a/blockjob.c b/blockjob.c -> ->> index 32007f31a9..9b2f3b3ea9 100644 -> ->> --- a/blockjob.c -> ->> +++ b/blockjob.c -> ->> @@ -198,6 +198,7 @@ void block_job_remove_all_bdrv(BlockJob *job) -> ->> * one to make sure that such a concurrent access does not attempt -> ->> * to process an already freed BdrvChild. -> ->> */ -> ->> + bdrv_drain_all_begin(); -> ->> bdrv_graph_wrlock(); -> ->> while (job->nodes) { -> ->> GSList *l = job->nodes; -> ->> @@ -211,6 +212,7 @@ void block_job_remove_all_bdrv(BlockJob *job) -> ->> g_slist_free_1(l); -> ->> } -> ->> bdrv_graph_wrunlock(); -> ->> + bdrv_drain_all_end(); -> ->> } -> ->> -> ->> bool block_job_has_bdrv(BlockJob *job, BlockDriverState *bs) -> -> -> -> This seems to fix the issue at hand. I can send a patch if this is -> -> considered an acceptable approach. -Kevin is aware of this thread but it's a public holiday tomorrow so it -may be a little longer. - -Stefan - -Am 24.04.2025 um 19:32 hat Andrey Drobyshev geschrieben: -> -Hi all, -> -> -There's a bug in block layer which leads to block graph deadlock. -> -Notably, it takes place when blockdev IO is processed within a separate -> -iothread. -> -> -This was initially caught by our tests, and I was able to reduce it to a -> -relatively simple reproducer. 
Such deadlocks are probably supposed to -> -be covered in iotests/graph-changes-while-io, but this deadlock isn't. -> -> -Basically what the reproducer does is launches QEMU with a drive having -> -'iothread' option set, creates a chain of 2 snapshots, launches -> -block-commit job for a snapshot and then dismisses the job, starting -> -from the lower snapshot. If the guest is issuing IO at the same time, -> -there's a race in acquiring block graph lock and a potential deadlock. -> -> -Here's how it can be reproduced: -> -> -1. Run QEMU: -> -> SRCDIR=/path/to/srcdir -> -> -> -> -> -> -> -> -> -> $SRCDIR/build/qemu-system-x86_64 -enable-kvm \ -> -> -> -> -machine q35 -cpu Nehalem \ -> -> -> -> -name guest=alma8-vm,debug-threads=on \ -> -> -> -> -m 2g -smp 2 \ -> -> -> -> -nographic -nodefaults \ -> -> -> -> -qmp unix:/var/run/alma8-qmp.sock,server=on,wait=off \ -> -> -> -> -serial unix:/var/run/alma8-serial.sock,server=on,wait=off \ -> -> -> -> -object iothread,id=iothread0 \ -> -> -> -> -blockdev -> -> node-name=disk,driver=qcow2,file.driver=file,file.filename=/path/to/img/alma8.qcow2 -> -> \ -> -> -device virtio-blk-pci,drive=disk,iothread=iothread0 -> -> -2. Launch IO (random reads) from within the guest: -> -> nc -U /var/run/alma8-serial.sock -> -> ... -> -> [root@alma8-vm ~]# fio --name=randread --ioengine=libaio --direct=1 --bs=4k -> -> --size=1G --numjobs=1 --time_based=1 --runtime=300 --group_reporting -> -> --rw=randread --iodepth=1 --filename=/testfile -> -> -3. Run snapshots creation & removal of lower snapshot operation in a -> -loop (script attached): -> -> while /bin/true ; do ./remove_lower_snap.sh ; done -> -> -And then it occasionally hangs. -> -> -Note: I've tried bisecting this, and looks like deadlock occurs starting -> -from the following commit: -> -> -(BAD) 5bdbaebcce virtio: Re-enable notifications after drain -> -(GOOD) c42c3833e0 virtio-scsi: Attach event vq notifier with no_poll -> -> -On the latest v10.0.0 it does hang as well. 
-> -> -> -Here's backtrace of the main thread: -> -> -> #0 0x00007fc547d427ce in __ppoll (fds=0x557eb79657b0, nfds=1, -> -> timeout=, sigmask=0x0) at -> -> ../sysdeps/unix/sysv/linux/ppoll.c:43 -> -> #1 0x0000557eb47d955c in qemu_poll_ns (fds=0x557eb79657b0, nfds=1, -> -> timeout=-1) at ../util/qemu-timer.c:329 -> -> #2 0x0000557eb47b2204 in fdmon_poll_wait (ctx=0x557eb76c5f20, -> -> ready_list=0x7ffd94b4edd8, timeout=-1) at ../util/fdmon-poll.c:79 -> -> #3 0x0000557eb47b1c45 in aio_poll (ctx=0x557eb76c5f20, blocking=true) at -> -> ../util/aio-posix.c:730 -> -> #4 0x0000557eb4621edd in bdrv_do_drained_begin (bs=0x557eb795e950, -> -> parent=0x0, poll=true) at ../block/io.c:378 -> -> #5 0x0000557eb4621f7b in bdrv_drained_begin (bs=0x557eb795e950) at -> -> ../block/io.c:391 -> -> #6 0x0000557eb45ec125 in bdrv_change_aio_context (bs=0x557eb795e950, -> -> ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, -> -> errp=0x0) -> -> at ../block.c:7682 -> -> #7 0x0000557eb45ebf2b in bdrv_child_change_aio_context (c=0x557eb7964250, -> -> ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, -> -> errp=0x0) -> -> at ../block.c:7608 -> -> #8 0x0000557eb45ec0c4 in bdrv_change_aio_context (bs=0x557eb79575e0, -> -> ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, -> -> errp=0x0) -> -> at ../block.c:7668 -> -> #9 0x0000557eb45ebf2b in bdrv_child_change_aio_context (c=0x557eb7e59110, -> -> ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, -> -> errp=0x0) -> -> at ../block.c:7608 -> -> #10 0x0000557eb45ec0c4 in bdrv_change_aio_context (bs=0x557eb7e51960, -> -> ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, -> -> errp=0x0) -> -> at ../block.c:7668 -> -> #11 0x0000557eb45ebf2b in bdrv_child_change_aio_context (c=0x557eb814ed80, -> -> ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, -> -> errp=0x0) -> -> at ../block.c:7608 -> -> #12 0x0000557eb45ee8e4 in child_job_change_aio_ctx (c=0x557eb7c9d3f0, -> -> ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, -> -> errp=0x0) -> -> at ../blockjob.c:157 -> -> #13 0x0000557eb45ebe2d in bdrv_parent_change_aio_context (c=0x557eb7c9d3f0, -> -> ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, -> -> errp=0x0) -> -> at ../block.c:7592 -> -> #14 0x0000557eb45ec06b in bdrv_change_aio_context (bs=0x557eb7d74310, -> -> ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, -> -> errp=0x0) -> -> at ../block.c:7661 -> -> #15 0x0000557eb45dcd7e in bdrv_child_cb_change_aio_ctx -> -> (child=0x557eb8565af0, ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = -> -> {...}, tran=0x557eb7a87160, errp=0x0) at ../block.c:1234 -> -> #16 0x0000557eb45ebe2d in bdrv_parent_change_aio_context (c=0x557eb8565af0, -> -> ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, -> -> errp=0x0) -> -> at ../block.c:7592 -> -> #17 0x0000557eb45ec06b in bdrv_change_aio_context (bs=0x557eb79575e0, -> -> ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, -> -> errp=0x0) -> -> at ../block.c:7661 -> -> #18 0x0000557eb45ec1f3 in bdrv_try_change_aio_context (bs=0x557eb79575e0, -> -> ctx=0x557eb76c5f20, ignore_child=0x0, errp=0x0) at ../block.c:7715 -> -> #19 0x0000557eb45e1b15 in bdrv_root_unref_child (child=0x557eb7966f30) at -> -> ../block.c:3317 -> -> #20 0x0000557eb45eeaa8 in block_job_remove_all_bdrv (job=0x557eb7952800) at -> -> ../blockjob.c:209 -> -> #21 0x0000557eb45ee641 in 
block_job_free (job=0x557eb7952800) at -> -> ../blockjob.c:82 -> -> #22 0x0000557eb45f17af in job_unref_locked (job=0x557eb7952800) at -> -> ../job.c:474 -> -> #23 0x0000557eb45f257d in job_do_dismiss_locked (job=0x557eb7952800) at -> -> ../job.c:771 -> -> #24 0x0000557eb45f25fe in job_dismiss_locked (jobptr=0x7ffd94b4f400, -> -> errp=0x7ffd94b4f488) at ../job.c:783 -> -> --Type for more, q to quit, c to continue without paging-- -> -> #25 0x0000557eb45d8e84 in qmp_job_dismiss (id=0x557eb7aa42b0 -> -> "commit-snap1", errp=0x7ffd94b4f488) at ../job-qmp.c:138 -> -> #26 0x0000557eb472f6a3 in qmp_marshal_job_dismiss (args=0x7fc52c00a3b0, -> -> ret=0x7fc53c880da8, errp=0x7fc53c880da0) at qapi/qapi-commands-job.c:221 -> -> #27 0x0000557eb47a35f3 in do_qmp_dispatch_bh (opaque=0x7fc53c880e40) at -> -> ../qapi/qmp-dispatch.c:128 -> -> #28 0x0000557eb47d1cd2 in aio_bh_call (bh=0x557eb79568f0) at -> -> ../util/async.c:172 -> -> #29 0x0000557eb47d1df5 in aio_bh_poll (ctx=0x557eb76c0200) at -> -> ../util/async.c:219 -> -> #30 0x0000557eb47b12f3 in aio_dispatch (ctx=0x557eb76c0200) at -> -> ../util/aio-posix.c:436 -> -> #31 0x0000557eb47d2266 in aio_ctx_dispatch (source=0x557eb76c0200, -> -> callback=0x0, user_data=0x0) at ../util/async.c:361 -> -> #32 0x00007fc549232f4f in g_main_dispatch (context=0x557eb76c6430) at -> -> ../glib/gmain.c:3364 -> -> #33 g_main_context_dispatch (context=0x557eb76c6430) at ../glib/gmain.c:4079 -> -> #34 0x0000557eb47d3ab1 in glib_pollfds_poll () at ../util/main-loop.c:287 -> -> #35 0x0000557eb47d3b38 in os_host_main_loop_wait (timeout=0) at -> -> ../util/main-loop.c:310 -> -> #36 0x0000557eb47d3c58 in main_loop_wait (nonblocking=0) at -> -> ../util/main-loop.c:589 -> -> #37 0x0000557eb4218b01 in qemu_main_loop () at ../system/runstate.c:835 -> -> #38 0x0000557eb46df166 in qemu_default_main (opaque=0x0) at -> -> ../system/main.c:50 -> -> #39 0x0000557eb46df215 in main (argc=24, argv=0x7ffd94b4f8d8) at -> -> ../system/main.c:80 -> -> -> -And here's coroutine trying to acquire read lock: -> -> -> (gdb) qemu coroutine reader_queue->entries.sqh_first -> -> #0 0x0000557eb47d7068 in qemu_coroutine_switch (from_=0x557eb7aa48b0, -> -> to_=0x7fc537fff508, action=COROUTINE_YIELD) at -> -> ../util/coroutine-ucontext.c:321 -> -> #1 0x0000557eb47d4d4a in qemu_coroutine_yield () at -> -> ../util/qemu-coroutine.c:339 -> -> #2 0x0000557eb47d56c8 in qemu_co_queue_wait_impl (queue=0x557eb59954c0 -> -> , lock=0x7fc53c57de50, flags=0) at -> -> ../util/qemu-coroutine-lock.c:60 -> -> #3 0x0000557eb461fea7 in bdrv_graph_co_rdlock () at -> -> ../block/graph-lock.c:231 -> -> #4 0x0000557eb460c81a in graph_lockable_auto_lock (x=0x7fc53c57dee3) at -> -> /home/root/src/qemu/master/include/block/graph-lock.h:213 -> -> #5 0x0000557eb460fa41 in blk_co_do_preadv_part -> -> (blk=0x557eb84c0810, offset=6890553344, bytes=4096, -> -> qiov=0x7fc530006988, qiov_offset=0, flags=BDRV_REQ_REGISTERED_BUF) at -> -> ../block/block-backend.c:1339 -> -> #6 0x0000557eb46104d7 in blk_aio_read_entry (opaque=0x7fc530003240) at -> -> ../block/block-backend.c:1619 -> -> #7 0x0000557eb47d6c40 in coroutine_trampoline (i0=-1213577040, i1=21886) -> -> at ../util/coroutine-ucontext.c:175 -> -> #8 0x00007fc547c2a360 in __start_context () at -> -> ../sysdeps/unix/sysv/linux/x86_64/__start_context.S:91 -> -> #9 0x00007ffd94b4ea40 in () -> -> #10 0x0000000000000000 in () -> -> -> -So it looks like main thread is processing job-dismiss request and is -> -holding write lock taken in block_job_remove_all_bdrv() (frame #20 -> -above). 
At the same time iothread spawns a coroutine which performs IO -> -request. Before the coroutine is spawned, blk_aio_prwv() increases -> -'in_flight' counter for Blk. Then blk_co_do_preadv_part() (frame #5) is -> -trying to acquire the read lock. But main thread isn't releasing the -> -lock as blk_root_drained_poll() returns true since blk->in_flight > 0. -> -Here's the deadlock. -> -> -Any comments and suggestions on the subject are welcomed. Thanks! -I think this is what the blk_wait_while_drained() call was supposed to -address in blk_co_do_preadv_part(). However, with the use of multiple -I/O threads, this is racy. - -Do you think that in your case we hit the small race window between the -checks in blk_wait_while_drained() and GRAPH_RDLOCK_GUARD()? Or is there -another reason why blk_wait_while_drained() didn't do its job? - -Kevin - -On 5/2/25 19:34, Kevin Wolf wrote: -Am 24.04.2025 um 19:32 hat Andrey Drobyshev geschrieben: -Hi all, - -There's a bug in block layer which leads to block graph deadlock. -Notably, it takes place when blockdev IO is processed within a separate -iothread. - -This was initially caught by our tests, and I was able to reduce it to a -relatively simple reproducer. Such deadlocks are probably supposed to -be covered in iotests/graph-changes-while-io, but this deadlock isn't. - -Basically what the reproducer does is launches QEMU with a drive having -'iothread' option set, creates a chain of 2 snapshots, launches -block-commit job for a snapshot and then dismisses the job, starting -from the lower snapshot. If the guest is issuing IO at the same time, -there's a race in acquiring block graph lock and a potential deadlock. - -Here's how it can be reproduced: - -1. Run QEMU: -SRCDIR=/path/to/srcdir -$SRCDIR/build/qemu-system-x86_64 -enable-kvm \ --machine q35 -cpu Nehalem \ - -name guest=alma8-vm,debug-threads=on \ - -m 2g -smp 2 \ - -nographic -nodefaults \ - -qmp unix:/var/run/alma8-qmp.sock,server=on,wait=off \ - -serial unix:/var/run/alma8-serial.sock,server=on,wait=off \ - -object iothread,id=iothread0 \ - -blockdev -node-name=disk,driver=qcow2,file.driver=file,file.filename=/path/to/img/alma8.qcow2 - \ - -device virtio-blk-pci,drive=disk,iothread=iothread0 -2. Launch IO (random reads) from within the guest: -nc -U /var/run/alma8-serial.sock -... -[root@alma8-vm ~]# fio --name=randread --ioengine=libaio --direct=1 --bs=4k ---size=1G --numjobs=1 --time_based=1 --runtime=300 --group_reporting ---rw=randread --iodepth=1 --filename=/testfile -3. Run snapshots creation & removal of lower snapshot operation in a -loop (script attached): -while /bin/true ; do ./remove_lower_snap.sh ; done -And then it occasionally hangs. - -Note: I've tried bisecting this, and looks like deadlock occurs starting -from the following commit: - -(BAD) 5bdbaebcce virtio: Re-enable notifications after drain -(GOOD) c42c3833e0 virtio-scsi: Attach event vq notifier with no_poll - -On the latest v10.0.0 it does hang as well. 
- - -Here's backtrace of the main thread: -#0 0x00007fc547d427ce in __ppoll (fds=0x557eb79657b0, nfds=1, timeout=, sigmask=0x0) at ../sysdeps/unix/sysv/linux/ppoll.c:43 -#1 0x0000557eb47d955c in qemu_poll_ns (fds=0x557eb79657b0, nfds=1, timeout=-1) -at ../util/qemu-timer.c:329 -#2 0x0000557eb47b2204 in fdmon_poll_wait (ctx=0x557eb76c5f20, -ready_list=0x7ffd94b4edd8, timeout=-1) at ../util/fdmon-poll.c:79 -#3 0x0000557eb47b1c45 in aio_poll (ctx=0x557eb76c5f20, blocking=true) at -../util/aio-posix.c:730 -#4 0x0000557eb4621edd in bdrv_do_drained_begin (bs=0x557eb795e950, parent=0x0, -poll=true) at ../block/io.c:378 -#5 0x0000557eb4621f7b in bdrv_drained_begin (bs=0x557eb795e950) at -../block/io.c:391 -#6 0x0000557eb45ec125 in bdrv_change_aio_context (bs=0x557eb795e950, -ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, -errp=0x0) - at ../block.c:7682 -#7 0x0000557eb45ebf2b in bdrv_child_change_aio_context (c=0x557eb7964250, -ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, -errp=0x0) - at ../block.c:7608 -#8 0x0000557eb45ec0c4 in bdrv_change_aio_context (bs=0x557eb79575e0, -ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, -errp=0x0) - at ../block.c:7668 -#9 0x0000557eb45ebf2b in bdrv_child_change_aio_context (c=0x557eb7e59110, -ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, -errp=0x0) - at ../block.c:7608 -#10 0x0000557eb45ec0c4 in bdrv_change_aio_context (bs=0x557eb7e51960, -ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, -errp=0x0) - at ../block.c:7668 -#11 0x0000557eb45ebf2b in bdrv_child_change_aio_context (c=0x557eb814ed80, -ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, -errp=0x0) - at ../block.c:7608 -#12 0x0000557eb45ee8e4 in child_job_change_aio_ctx (c=0x557eb7c9d3f0, -ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, -errp=0x0) - at ../blockjob.c:157 -#13 0x0000557eb45ebe2d in bdrv_parent_change_aio_context (c=0x557eb7c9d3f0, -ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, -errp=0x0) - at ../block.c:7592 -#14 0x0000557eb45ec06b in bdrv_change_aio_context (bs=0x557eb7d74310, -ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, -errp=0x0) - at ../block.c:7661 -#15 0x0000557eb45dcd7e in bdrv_child_cb_change_aio_ctx - (child=0x557eb8565af0, ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, -tran=0x557eb7a87160, errp=0x0) at ../block.c:1234 -#16 0x0000557eb45ebe2d in bdrv_parent_change_aio_context (c=0x557eb8565af0, -ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, -errp=0x0) - at ../block.c:7592 -#17 0x0000557eb45ec06b in bdrv_change_aio_context (bs=0x557eb79575e0, -ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, tran=0x557eb7a87160, -errp=0x0) - at ../block.c:7661 -#18 0x0000557eb45ec1f3 in bdrv_try_change_aio_context (bs=0x557eb79575e0, -ctx=0x557eb76c5f20, ignore_child=0x0, errp=0x0) at ../block.c:7715 -#19 0x0000557eb45e1b15 in bdrv_root_unref_child (child=0x557eb7966f30) at -../block.c:3317 -#20 0x0000557eb45eeaa8 in block_job_remove_all_bdrv (job=0x557eb7952800) at -../blockjob.c:209 -#21 0x0000557eb45ee641 in block_job_free (job=0x557eb7952800) at -../blockjob.c:82 -#22 0x0000557eb45f17af in job_unref_locked (job=0x557eb7952800) at ../job.c:474 -#23 0x0000557eb45f257d in job_do_dismiss_locked (job=0x557eb7952800) at -../job.c:771 -#24 0x0000557eb45f25fe in job_dismiss_locked (jobptr=0x7ffd94b4f400, -errp=0x7ffd94b4f488) at ../job.c:783 
---Type for more, q to quit, c to continue without paging-- -#25 0x0000557eb45d8e84 in qmp_job_dismiss (id=0x557eb7aa42b0 "commit-snap1", -errp=0x7ffd94b4f488) at ../job-qmp.c:138 -#26 0x0000557eb472f6a3 in qmp_marshal_job_dismiss (args=0x7fc52c00a3b0, -ret=0x7fc53c880da8, errp=0x7fc53c880da0) at qapi/qapi-commands-job.c:221 -#27 0x0000557eb47a35f3 in do_qmp_dispatch_bh (opaque=0x7fc53c880e40) at -../qapi/qmp-dispatch.c:128 -#28 0x0000557eb47d1cd2 in aio_bh_call (bh=0x557eb79568f0) at ../util/async.c:172 -#29 0x0000557eb47d1df5 in aio_bh_poll (ctx=0x557eb76c0200) at -../util/async.c:219 -#30 0x0000557eb47b12f3 in aio_dispatch (ctx=0x557eb76c0200) at -../util/aio-posix.c:436 -#31 0x0000557eb47d2266 in aio_ctx_dispatch (source=0x557eb76c0200, -callback=0x0, user_data=0x0) at ../util/async.c:361 -#32 0x00007fc549232f4f in g_main_dispatch (context=0x557eb76c6430) at -../glib/gmain.c:3364 -#33 g_main_context_dispatch (context=0x557eb76c6430) at ../glib/gmain.c:4079 -#34 0x0000557eb47d3ab1 in glib_pollfds_poll () at ../util/main-loop.c:287 -#35 0x0000557eb47d3b38 in os_host_main_loop_wait (timeout=0) at -../util/main-loop.c:310 -#36 0x0000557eb47d3c58 in main_loop_wait (nonblocking=0) at -../util/main-loop.c:589 -#37 0x0000557eb4218b01 in qemu_main_loop () at ../system/runstate.c:835 -#38 0x0000557eb46df166 in qemu_default_main (opaque=0x0) at ../system/main.c:50 -#39 0x0000557eb46df215 in main (argc=24, argv=0x7ffd94b4f8d8) at -../system/main.c:80 -And here's coroutine trying to acquire read lock: -(gdb) qemu coroutine reader_queue->entries.sqh_first -#0 0x0000557eb47d7068 in qemu_coroutine_switch (from_=0x557eb7aa48b0, -to_=0x7fc537fff508, action=COROUTINE_YIELD) at ../util/coroutine-ucontext.c:321 -#1 0x0000557eb47d4d4a in qemu_coroutine_yield () at -../util/qemu-coroutine.c:339 -#2 0x0000557eb47d56c8 in qemu_co_queue_wait_impl (queue=0x557eb59954c0 -, lock=0x7fc53c57de50, flags=0) at -../util/qemu-coroutine-lock.c:60 -#3 0x0000557eb461fea7 in bdrv_graph_co_rdlock () at ../block/graph-lock.c:231 -#4 0x0000557eb460c81a in graph_lockable_auto_lock (x=0x7fc53c57dee3) at -/home/root/src/qemu/master/include/block/graph-lock.h:213 -#5 0x0000557eb460fa41 in blk_co_do_preadv_part - (blk=0x557eb84c0810, offset=6890553344, bytes=4096, qiov=0x7fc530006988, -qiov_offset=0, flags=BDRV_REQ_REGISTERED_BUF) at ../block/block-backend.c:1339 -#6 0x0000557eb46104d7 in blk_aio_read_entry (opaque=0x7fc530003240) at -../block/block-backend.c:1619 -#7 0x0000557eb47d6c40 in coroutine_trampoline (i0=-1213577040, i1=21886) at -../util/coroutine-ucontext.c:175 -#8 0x00007fc547c2a360 in __start_context () at -../sysdeps/unix/sysv/linux/x86_64/__start_context.S:91 -#9 0x00007ffd94b4ea40 in () -#10 0x0000000000000000 in () -So it looks like main thread is processing job-dismiss request and is -holding write lock taken in block_job_remove_all_bdrv() (frame #20 -above). At the same time iothread spawns a coroutine which performs IO -request. Before the coroutine is spawned, blk_aio_prwv() increases -'in_flight' counter for Blk. Then blk_co_do_preadv_part() (frame #5) is -trying to acquire the read lock. But main thread isn't releasing the -lock as blk_root_drained_poll() returns true since blk->in_flight > 0. -Here's the deadlock. - -Any comments and suggestions on the subject are welcomed. Thanks! -I think this is what the blk_wait_while_drained() call was supposed to -address in blk_co_do_preadv_part(). However, with the use of multiple -I/O threads, this is racy. 
- -Do you think that in your case we hit the small race window between the -checks in blk_wait_while_drained() and GRAPH_RDLOCK_GUARD()? Or is there -another reason why blk_wait_while_drained() didn't do its job? - -Kevin -At my opinion there is very big race window. Main thread has -eaten graph write lock. After that another coroutine is stalled -within GRAPH_RDLOCK_GUARD() as there is no drain at the moment and only -after that main thread has started drain. That is why Fiona's idea is -looking working. Though this would mean that normally we should always -do that at the moment when we acquire write lock. May be even inside -this function. Den - -Am 02.05.2025 um 19:52 hat Denis V. Lunev geschrieben: -> -On 5/2/25 19:34, Kevin Wolf wrote: -> -> Am 24.04.2025 um 19:32 hat Andrey Drobyshev geschrieben: -> -> > Hi all, -> -> > -> -> > There's a bug in block layer which leads to block graph deadlock. -> -> > Notably, it takes place when blockdev IO is processed within a separate -> -> > iothread. -> -> > -> -> > This was initially caught by our tests, and I was able to reduce it to a -> -> > relatively simple reproducer. Such deadlocks are probably supposed to -> -> > be covered in iotests/graph-changes-while-io, but this deadlock isn't. -> -> > -> -> > Basically what the reproducer does is launches QEMU with a drive having -> -> > 'iothread' option set, creates a chain of 2 snapshots, launches -> -> > block-commit job for a snapshot and then dismisses the job, starting -> -> > from the lower snapshot. If the guest is issuing IO at the same time, -> -> > there's a race in acquiring block graph lock and a potential deadlock. -> -> > -> -> > Here's how it can be reproduced: -> -> > -> -> > 1. Run QEMU: -> -> > > SRCDIR=/path/to/srcdir -> -> > > $SRCDIR/build/qemu-system-x86_64 -enable-kvm \ -> -> > > -machine q35 -cpu Nehalem \ -> -> > > -name guest=alma8-vm,debug-threads=on \ -> -> > > -m 2g -smp 2 \ -> -> > > -nographic -nodefaults \ -> -> > > -qmp unix:/var/run/alma8-qmp.sock,server=on,wait=off \ -> -> > > -serial unix:/var/run/alma8-serial.sock,server=on,wait=off \ -> -> > > -object iothread,id=iothread0 \ -> -> > > -blockdev -> -> > > node-name=disk,driver=qcow2,file.driver=file,file.filename=/path/to/img/alma8.qcow2 -> -> > > \ -> -> > > -device virtio-blk-pci,drive=disk,iothread=iothread0 -> -> > 2. Launch IO (random reads) from within the guest: -> -> > > nc -U /var/run/alma8-serial.sock -> -> > > ... -> -> > > [root@alma8-vm ~]# fio --name=randread --ioengine=libaio --direct=1 -> -> > > --bs=4k --size=1G --numjobs=1 --time_based=1 --runtime=300 -> -> > > --group_reporting --rw=randread --iodepth=1 --filename=/testfile -> -> > 3. Run snapshots creation & removal of lower snapshot operation in a -> -> > loop (script attached): -> -> > > while /bin/true ; do ./remove_lower_snap.sh ; done -> -> > And then it occasionally hangs. -> -> > -> -> > Note: I've tried bisecting this, and looks like deadlock occurs starting -> -> > from the following commit: -> -> > -> -> > (BAD) 5bdbaebcce virtio: Re-enable notifications after drain -> -> > (GOOD) c42c3833e0 virtio-scsi: Attach event vq notifier with no_poll -> -> > -> -> > On the latest v10.0.0 it does hang as well. 
-> -> > -> -> > -> -> > Here's backtrace of the main thread: -> -> > -> -> > > #0 0x00007fc547d427ce in __ppoll (fds=0x557eb79657b0, nfds=1, -> -> > > timeout=, sigmask=0x0) at -> -> > > ../sysdeps/unix/sysv/linux/ppoll.c:43 -> -> > > #1 0x0000557eb47d955c in qemu_poll_ns (fds=0x557eb79657b0, nfds=1, -> -> > > timeout=-1) at ../util/qemu-timer.c:329 -> -> > > #2 0x0000557eb47b2204 in fdmon_poll_wait (ctx=0x557eb76c5f20, -> -> > > ready_list=0x7ffd94b4edd8, timeout=-1) at ../util/fdmon-poll.c:79 -> -> > > #3 0x0000557eb47b1c45 in aio_poll (ctx=0x557eb76c5f20, blocking=true) -> -> > > at ../util/aio-posix.c:730 -> -> > > #4 0x0000557eb4621edd in bdrv_do_drained_begin (bs=0x557eb795e950, -> -> > > parent=0x0, poll=true) at ../block/io.c:378 -> -> > > #5 0x0000557eb4621f7b in bdrv_drained_begin (bs=0x557eb795e950) at -> -> > > ../block/io.c:391 -> -> > > #6 0x0000557eb45ec125 in bdrv_change_aio_context (bs=0x557eb795e950, -> -> > > ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, -> -> > > tran=0x557eb7a87160, errp=0x0) -> -> > > at ../block.c:7682 -> -> > > #7 0x0000557eb45ebf2b in bdrv_child_change_aio_context -> -> > > (c=0x557eb7964250, ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, -> -> > > tran=0x557eb7a87160, errp=0x0) -> -> > > at ../block.c:7608 -> -> > > #8 0x0000557eb45ec0c4 in bdrv_change_aio_context (bs=0x557eb79575e0, -> -> > > ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, -> -> > > tran=0x557eb7a87160, errp=0x0) -> -> > > at ../block.c:7668 -> -> > > #9 0x0000557eb45ebf2b in bdrv_child_change_aio_context -> -> > > (c=0x557eb7e59110, ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, -> -> > > tran=0x557eb7a87160, errp=0x0) -> -> > > at ../block.c:7608 -> -> > > #10 0x0000557eb45ec0c4 in bdrv_change_aio_context (bs=0x557eb7e51960, -> -> > > ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, -> -> > > tran=0x557eb7a87160, errp=0x0) -> -> > > at ../block.c:7668 -> -> > > #11 0x0000557eb45ebf2b in bdrv_child_change_aio_context -> -> > > (c=0x557eb814ed80, ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, -> -> > > tran=0x557eb7a87160, errp=0x0) -> -> > > at ../block.c:7608 -> -> > > #12 0x0000557eb45ee8e4 in child_job_change_aio_ctx (c=0x557eb7c9d3f0, -> -> > > ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, -> -> > > tran=0x557eb7a87160, errp=0x0) -> -> > > at ../blockjob.c:157 -> -> > > #13 0x0000557eb45ebe2d in bdrv_parent_change_aio_context -> -> > > (c=0x557eb7c9d3f0, ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, -> -> > > tran=0x557eb7a87160, errp=0x0) -> -> > > at ../block.c:7592 -> -> > > #14 0x0000557eb45ec06b in bdrv_change_aio_context (bs=0x557eb7d74310, -> -> > > ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, -> -> > > tran=0x557eb7a87160, errp=0x0) -> -> > > at ../block.c:7661 -> -> > > #15 0x0000557eb45dcd7e in bdrv_child_cb_change_aio_ctx -> -> > > (child=0x557eb8565af0, ctx=0x557eb76c5f20, visited=0x557eb7e06b60 -> -> > > = {...}, tran=0x557eb7a87160, errp=0x0) at ../block.c:1234 -> -> > > #16 0x0000557eb45ebe2d in bdrv_parent_change_aio_context -> -> > > (c=0x557eb8565af0, ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, -> -> > > tran=0x557eb7a87160, errp=0x0) -> -> > > at ../block.c:7592 -> -> > > #17 0x0000557eb45ec06b in bdrv_change_aio_context (bs=0x557eb79575e0, -> -> > > ctx=0x557eb76c5f20, visited=0x557eb7e06b60 = {...}, -> -> > > tran=0x557eb7a87160, errp=0x0) -> -> > > at ../block.c:7661 -> -> > > #18 0x0000557eb45ec1f3 in bdrv_try_change_aio_context -> -> > > (bs=0x557eb79575e0, ctx=0x557eb76c5f20, ignore_child=0x0, 
errp=0x0) at -> -> > > ../block.c:7715 -> -> > > #19 0x0000557eb45e1b15 in bdrv_root_unref_child (child=0x557eb7966f30) -> -> > > at ../block.c:3317 -> -> > > #20 0x0000557eb45eeaa8 in block_job_remove_all_bdrv -> -> > > (job=0x557eb7952800) at ../blockjob.c:209 -> -> > > #21 0x0000557eb45ee641 in block_job_free (job=0x557eb7952800) at -> -> > > ../blockjob.c:82 -> -> > > #22 0x0000557eb45f17af in job_unref_locked (job=0x557eb7952800) at -> -> > > ../job.c:474 -> -> > > #23 0x0000557eb45f257d in job_do_dismiss_locked (job=0x557eb7952800) at -> -> > > ../job.c:771 -> -> > > #24 0x0000557eb45f25fe in job_dismiss_locked (jobptr=0x7ffd94b4f400, -> -> > > errp=0x7ffd94b4f488) at ../job.c:783 -> -> > > --Type for more, q to quit, c to continue without paging-- -> -> > > #25 0x0000557eb45d8e84 in qmp_job_dismiss (id=0x557eb7aa42b0 -> -> > > "commit-snap1", errp=0x7ffd94b4f488) at ../job-qmp.c:138 -> -> > > #26 0x0000557eb472f6a3 in qmp_marshal_job_dismiss (args=0x7fc52c00a3b0, -> -> > > ret=0x7fc53c880da8, errp=0x7fc53c880da0) at qapi/qapi-commands-job.c:221 -> -> > > #27 0x0000557eb47a35f3 in do_qmp_dispatch_bh (opaque=0x7fc53c880e40) at -> -> > > ../qapi/qmp-dispatch.c:128 -> -> > > #28 0x0000557eb47d1cd2 in aio_bh_call (bh=0x557eb79568f0) at -> -> > > ../util/async.c:172 -> -> > > #29 0x0000557eb47d1df5 in aio_bh_poll (ctx=0x557eb76c0200) at -> -> > > ../util/async.c:219 -> -> > > #30 0x0000557eb47b12f3 in aio_dispatch (ctx=0x557eb76c0200) at -> -> > > ../util/aio-posix.c:436 -> -> > > #31 0x0000557eb47d2266 in aio_ctx_dispatch (source=0x557eb76c0200, -> -> > > callback=0x0, user_data=0x0) at ../util/async.c:361 -> -> > > #32 0x00007fc549232f4f in g_main_dispatch (context=0x557eb76c6430) at -> -> > > ../glib/gmain.c:3364 -> -> > > #33 g_main_context_dispatch (context=0x557eb76c6430) at -> -> > > ../glib/gmain.c:4079 -> -> > > #34 0x0000557eb47d3ab1 in glib_pollfds_poll () at -> -> > > ../util/main-loop.c:287 -> -> > > #35 0x0000557eb47d3b38 in os_host_main_loop_wait (timeout=0) at -> -> > > ../util/main-loop.c:310 -> -> > > #36 0x0000557eb47d3c58 in main_loop_wait (nonblocking=0) at -> -> > > ../util/main-loop.c:589 -> -> > > #37 0x0000557eb4218b01 in qemu_main_loop () at ../system/runstate.c:835 -> -> > > #38 0x0000557eb46df166 in qemu_default_main (opaque=0x0) at -> -> > > ../system/main.c:50 -> -> > > #39 0x0000557eb46df215 in main (argc=24, argv=0x7ffd94b4f8d8) at -> -> > > ../system/main.c:80 -> -> > -> -> > And here's coroutine trying to acquire read lock: -> -> > -> -> > > (gdb) qemu coroutine reader_queue->entries.sqh_first -> -> > > #0 0x0000557eb47d7068 in qemu_coroutine_switch (from_=0x557eb7aa48b0, -> -> > > to_=0x7fc537fff508, action=COROUTINE_YIELD) at -> -> > > ../util/coroutine-ucontext.c:321 -> -> > > #1 0x0000557eb47d4d4a in qemu_coroutine_yield () at -> -> > > ../util/qemu-coroutine.c:339 -> -> > > #2 0x0000557eb47d56c8 in qemu_co_queue_wait_impl (queue=0x557eb59954c0 -> -> > > , lock=0x7fc53c57de50, flags=0) at -> -> > > ../util/qemu-coroutine-lock.c:60 -> -> > > #3 0x0000557eb461fea7 in bdrv_graph_co_rdlock () at -> -> > > ../block/graph-lock.c:231 -> -> > > #4 0x0000557eb460c81a in graph_lockable_auto_lock (x=0x7fc53c57dee3) -> -> > > at /home/root/src/qemu/master/include/block/graph-lock.h:213 -> -> > > #5 0x0000557eb460fa41 in blk_co_do_preadv_part -> -> > > (blk=0x557eb84c0810, offset=6890553344, bytes=4096, -> -> > > qiov=0x7fc530006988, qiov_offset=0, flags=BDRV_REQ_REGISTERED_BUF) at -> -> > > ../block/block-backend.c:1339 -> -> > > #6 0x0000557eb46104d7 in 
blk_aio_read_entry (opaque=0x7fc530003240) at -> -> > > ../block/block-backend.c:1619 -> -> > > #7 0x0000557eb47d6c40 in coroutine_trampoline (i0=-1213577040, -> -> > > i1=21886) at ../util/coroutine-ucontext.c:175 -> -> > > #8 0x00007fc547c2a360 in __start_context () at -> -> > > ../sysdeps/unix/sysv/linux/x86_64/__start_context.S:91 -> -> > > #9 0x00007ffd94b4ea40 in () -> -> > > #10 0x0000000000000000 in () -> -> > -> -> > So it looks like main thread is processing job-dismiss request and is -> -> > holding write lock taken in block_job_remove_all_bdrv() (frame #20 -> -> > above). At the same time iothread spawns a coroutine which performs IO -> -> > request. Before the coroutine is spawned, blk_aio_prwv() increases -> -> > 'in_flight' counter for Blk. Then blk_co_do_preadv_part() (frame #5) is -> -> > trying to acquire the read lock. But main thread isn't releasing the -> -> > lock as blk_root_drained_poll() returns true since blk->in_flight > 0. -> -> > Here's the deadlock. -> -> > -> -> > Any comments and suggestions on the subject are welcomed. Thanks! -> -> I think this is what the blk_wait_while_drained() call was supposed to -> -> address in blk_co_do_preadv_part(). However, with the use of multiple -> -> I/O threads, this is racy. -> -> -> -> Do you think that in your case we hit the small race window between the -> -> checks in blk_wait_while_drained() and GRAPH_RDLOCK_GUARD()? Or is there -> -> another reason why blk_wait_while_drained() didn't do its job? -> -> -> -At my opinion there is very big race window. Main thread has -> -eaten graph write lock. After that another coroutine is stalled -> -within GRAPH_RDLOCK_GUARD() as there is no drain at the moment and only -> -after that main thread has started drain. -You're right, I confused taking the write lock with draining there. - -> -That is why Fiona's idea is looking working. Though this would mean -> -that normally we should always do that at the moment when we acquire -> -write lock. May be even inside this function. -I actually see now that not all of my graph locking patches were merged. -At least I did have the thought that bdrv_drained_begin() must be marked -GRAPH_UNLOCKED because it polls. That means that calling it from inside -bdrv_try_change_aio_context() is actually forbidden (and that's the part -I didn't see back then because it doesn't have TSA annotations). - -If you refactor the code to move the drain out to before the lock is -taken, I think you end up with Fiona's patch, except you'll remove the -forbidden inner drain and add more annotations for some functions and -clarify the rules around them. I don't know, but I wouldn't be surprised -if along the process we find other bugs, too. - -So Fiona's drain looks right to me, but we should probably approach it -more systematically. 
- -Kevin - diff --git a/classification_output/01/instruction/6117378 b/classification_output/01/instruction/6117378 deleted file mode 100644 index 5dad058d5..000000000 --- a/classification_output/01/instruction/6117378 +++ /dev/null @@ -1,31 +0,0 @@ -instruction: 0.693 -mistranslation: 0.533 -other: 0.519 -semantic: 0.454 - -[Qemu-devel] [BUG] network : windows os lost ip address of the network card  in some cases - -we found this problem for a long time 。For example, if we has three network -card in virtual xml file ,such as "network connection 1" / "network connection -2"/"network connection 3" 。 - -Echo network card has own ip address ,such as 192.168.1.1 / 2.1 /3.1 , when -delete the first card ,reboot the windows virtual os, then this problem -happened ! - - - - -we found that the sencond network card will replace the first one , then the -ip address of "network connection 2 " become 192.168.1.1 。 - - -Our third party users began to complain about this bug 。All the business of the -second ip lost !!! - -I mean both of windows and linux has this bug , we solve this bug in linux -throught bonding netcrad pci and mac address 。 - -There is no good solution on windows os . thera are ? we implemented a plan to -resumption of IP by QGA. Is there a better way ? - diff --git a/classification_output/01/instruction/62179944 b/classification_output/01/instruction/62179944 new file mode 100644 index 000000000..5dad058d5 --- /dev/null +++ b/classification_output/01/instruction/62179944 @@ -0,0 +1,31 @@ +instruction: 0.693 +mistranslation: 0.533 +other: 0.519 +semantic: 0.454 + +[Qemu-devel] [BUG] network : windows os lost ip address of the network card  in some cases + +we found this problem for a long time 。For example, if we has three network +card in virtual xml file ,such as "network connection 1" / "network connection +2"/"network connection 3" 。 + +Echo network card has own ip address ,such as 192.168.1.1 / 2.1 /3.1 , when +delete the first card ,reboot the windows virtual os, then this problem +happened ! + + + + +we found that the sencond network card will replace the first one , then the +ip address of "network connection 2 " become 192.168.1.1 。 + + +Our third party users began to complain about this bug 。All the business of the +second ip lost !!! + +I mean both of windows and linux has this bug , we solve this bug in linux +throught bonding netcrad pci and mac address 。 + +There is no good solution on windows os . thera are ? we implemented a plan to +resumption of IP by QGA. Is there a better way ? + diff --git a/classification_output/01/instruction/63565653 b/classification_output/01/instruction/63565653 new file mode 100644 index 000000000..dfac92bf4 --- /dev/null +++ b/classification_output/01/instruction/63565653 @@ -0,0 +1,49 @@ +instruction: 0.905 +other: 0.898 +semantic: 0.825 +mistranslation: 0.462 + +[Qemu-devel] [BUG]pcibus_reset assertion failure on guest reboot + +Qemu-2.6.2 + +Start a vm with vhost-net , do reboot and hot-unplug viritio-net nic in short +time, we touch +pcibus_reset assertion failure. + +Here is qemu log: +22:29:46.359386+08:00 acpi_pm1_cnt_write -> guest do soft power off +22:29:46.785310+08:00 qemu_devices_reset +22:29:46.788093+08:00 virtio_pci_device_unplugged -> virtio net unpluged +22:29:46.803427+08:00 pcibus_reset: Assertion `bus->irq_count[i] == 0' failed. 
+ +Here is stack info: +(gdb) bt +#0 0x00007f9a336795d7 in raise () from /usr/lib64/libc.so.6 +#1 0x00007f9a3367acc8 in abort () from /usr/lib64/libc.so.6 +#2 0x00007f9a33672546 in __assert_fail_base () from /usr/lib64/libc.so.6 +#3 0x00007f9a336725f2 in __assert_fail () from /usr/lib64/libc.so.6 +#4 0x0000000000641884 in pcibus_reset (qbus=0x29eee60) at hw/pci/pci.c:283 +#5 0x00000000005bfc30 in qbus_reset_one (bus=0x29eee60, opaque=) at hw/core/qdev.c:319 +#6 0x00000000005c1b19 in qdev_walk_children (dev=0x29ed2b0, pre_devfn=0x0, +pre_busfn=0x0, post_devfn=0x5c2440 ... +#7 0x00000000005c1c59 in qbus_walk_children (bus=0x2736f80, pre_devfn=0x0, +pre_busfn=0x0, post_devfn=0x5c2440 ... +#8 0x00000000005513f5 in qemu_devices_reset () at vl.c:1998 +#9 0x00000000004cab9d in pc_machine_reset () at +/home/abuild/rpmbuild/BUILD/qemu-kvm-2.6.0/hw/i386/pc.c:1976 +#10 0x000000000055148b in qemu_system_reset (address@hidden) at vl.c:2011 +#11 0x000000000055164f in main_loop_should_exit () at vl.c:2169 +#12 0x0000000000551719 in main_loop () at vl.c:2212 +#13 0x000000000041c9a8 in main (argc=, argv=, +envp=) at vl.c:5130 +(gdb) f 4 +... +(gdb) p bus->irq_count[0] +$6 = 1 + +Seems pci_update_irq_disabled doesn't work well + +can anyone help? + diff --git a/classification_output/01/instruction/70868267 b/classification_output/01/instruction/70868267 new file mode 100644 index 000000000..ffcf905b4 --- /dev/null +++ b/classification_output/01/instruction/70868267 @@ -0,0 +1,40 @@ +instruction: 0.778 +semantic: 0.635 +mistranslation: 0.537 +other: 0.236 + +[Qemu-devel] [BUG] Failed to compile using gcc7.1 + +Hi all, + +After upgrading gcc from 6.3.1 to 7.1.1, qemu can't be compiled with gcc. + +The error is: + +------ + CC block/blkdebug.o +block/blkdebug.c: In function 'blkdebug_refresh_filename': +block/blkdebug.c:693:31: error: '%s' directive output may be truncated +writing up to 4095 bytes into a region of size 4086 +[-Werror=format-truncation=] +"blkdebug:%s:%s", s->config_file ?: "", + ^~ +In file included from /usr/include/stdio.h:939:0, + from /home/adam/qemu/include/qemu/osdep.h:68, + from block/blkdebug.c:25: +/usr/include/bits/stdio2.h:64:10: note: '__builtin___snprintf_chk' +output 11 or more bytes (assuming 4106) into a destination of size 4096 +return __builtin___snprintf_chk (__s, __n, __USE_FORTIFY_LEVEL - 1, + ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + __bos (__s), __fmt, __va_arg_pack ()); + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +cc1: all warnings being treated as errors +make: *** [/home/adam/qemu/rules.mak:69: block/blkdebug.o] Error 1 +------ + +It seems that gcc 7 is introducing more restrict check for printf. +If using clang, although there are some extra warning, it can at least +pass the compile. +Thanks, +Qu + diff --git a/classification_output/01/instruction/73660729 b/classification_output/01/instruction/73660729 new file mode 100644 index 000000000..92d85cc82 --- /dev/null +++ b/classification_output/01/instruction/73660729 @@ -0,0 +1,31 @@ +instruction: 0.753 +semantic: 0.698 +mistranslation: 0.633 +other: 0.620 + +[BUG]The latest qemu crashed when I tested cxl + +I test cxl with the patch:[v11,0/2] arm/virt: + CXL support via pxb_cxl. +https://patchwork.kernel.org/project/cxl/cover/20220616141950.23374-1-Jonathan.Cameron@huawei.com/ +But the qemu crashed,and showing an error: +qemu-system-aarch64: ../hw/arm/virt.c:1735: virt_get_high_memmap_enabled: + Assertion `ARRAY_SIZE(extended_memmap) - VIRT_LOWMEMMAP_LAST == ARRAY_SIZE(enabled_array)' failed. 
+Then I modify the patch to fix the bug: +diff --git a/hw/arm/virt.c b/hw/arm/virt.c +index ea2413a0ba..3d4cee3491 100644 +--- a/hw/arm/virt.c ++++ b/hw/arm/virt.c +@@ -1710,6 +1730,7 @@ static inline bool *virt_get_high_memmap_enabled(VirtMachineState + *vms, +&vms->highmem_redists, +&vms->highmem_ecam, +&vms->highmem_mmio, ++ &vms->cxl_devices_state.is_enabled, +}; +Now qemu works good. +Could you tell me when the patch( +arm/virt: + CXL support via pxb_cxl +) will be merged into upstream? + diff --git a/classification_output/01/instruction/7647456 b/classification_output/01/instruction/7647456 deleted file mode 100644 index d887fe7b5..000000000 --- a/classification_output/01/instruction/7647456 +++ /dev/null @@ -1,110 +0,0 @@ -instruction: 0.768 -other: 0.737 -semantic: 0.669 -mistranslation: 0.652 - -[Qemu-devel] Can I have someone's feedback on [bug 1809075] Concurrency bug on keyboard events: capslock LED messing up keycode streams causes character misses at guest kernel - -Hi everyone. -Can I please have someone's feedback on this bug? -https://bugs.launchpad.net/qemu/+bug/1809075 -Briefly, guest OS loses characters sent to it via vnc. And I spot the -bug in relation to ps2 driver. -I'm thinking of possible fixes and I might want to use a memory barrier. -But I would really like to have some suggestion from a qemu developer -first. For example, can we brutally drop capslock LED key events in ps2 -queue? -It is actually relevant to openQA, an automated QA tool for openSUSE. -And this bug blocks a few test cases for us. -Thank you in advance! - -Kind regards, -Gao Zhiyuan - -Cc'ing Marc-André & Gerd. - -On 12/19/18 10:31 AM, Gao Zhiyuan wrote: -> -Hi everyone. -> -> -Can I please have someone's feedback on this bug? -> -https://bugs.launchpad.net/qemu/+bug/1809075 -> -Briefly, guest OS loses characters sent to it via vnc. And I spot the -> -bug in relation to ps2 driver. -> -> -I'm thinking of possible fixes and I might want to use a memory barrier. -> -But I would really like to have some suggestion from a qemu developer -> -first. For example, can we brutally drop capslock LED key events in ps2 -> -queue? -> -> -It is actually relevant to openQA, an automated QA tool for openSUSE. -> -And this bug blocks a few test cases for us. -> -> -Thank you in advance! -> -> -Kind regards, -> -Gao Zhiyuan -> - -On Thu, Jan 03, 2019 at 12:05:54PM +0100, Philippe Mathieu-Daudé wrote: -> -Cc'ing Marc-André & Gerd. -> -> -On 12/19/18 10:31 AM, Gao Zhiyuan wrote: -> -> Hi everyone. -> -> -> -> Can I please have someone's feedback on this bug? -> -> -https://bugs.launchpad.net/qemu/+bug/1809075 -> -> Briefly, guest OS loses characters sent to it via vnc. And I spot the -> -> bug in relation to ps2 driver. -> -> -> -> I'm thinking of possible fixes and I might want to use a memory barrier. -> -> But I would really like to have some suggestion from a qemu developer -> -> first. For example, can we brutally drop capslock LED key events in ps2 -> -> queue? -There is no "capslock LED key event". 0xfa is KBD_REPLY_ACK, and the -device queues it in response to guest port writes. Yes, the ack can -race with actual key events. But IMO that isn't a bug in qemu. - -Probably the linux kernel just throws away everything until it got the -ack for the port write, and that way the key event gets lost. On -physical hardware you will not notice because it is next to impossible -to type fast enough to hit the race window. - -So, go fix the kernel. 
- -Alternatively fix vncdotool to send uppercase letters properly with -shift key pressed. Then qemu wouldn't generate capslock key events -(that happens because qemu thinks guest and host capslock state is out -of sync) and the guests's capslock led update request wouldn't get into -the way. - -cheers, - Gerd - diff --git a/classification_output/01/instruction/7658242 b/classification_output/01/instruction/7658242 deleted file mode 100644 index 3ff255be0..000000000 --- a/classification_output/01/instruction/7658242 +++ /dev/null @@ -1,1125 +0,0 @@ -instruction: 0.775 -other: 0.771 -mistranslation: 0.719 -semantic: 0.673 - -[BUG] hw/i386/pc.c: CXL Fixed Memory Window should not reserve e820 in bios - -Early-boot e820 records will be inserted by the bios/efi/early boot -software and be reported to the kernel via insert_resource. Later, when -CXL drivers iterate through the regions again, they will insert another -resource and make the RESERVED memory area a child. - -This RESERVED memory area causes the memory region to become unusable, -and as a result attempting to create memory regions with - - `cxl create-region ...` - -Will fail due to the RESERVED area intersecting with the CXL window. - - -During boot the following traceback is observed: - -0xffffffff81101650 in insert_resource_expand_to_fit () -0xffffffff83d964c5 in e820__reserve_resources_late () -0xffffffff83e03210 in pcibios_resource_survey () -0xffffffff83e04f4a in pcibios_init () - -Which produces a call to reserve the CFMWS area: - -(gdb) p *new -$54 = {start = 0x290000000, end = 0x2cfffffff, name = "Reserved", - flags = 0x200, desc = 0x7, parent = 0x0, sibling = 0x0, - child = 0x0} - -Later the Kernel parses ACPI tables and reserves the exact same area as -the CXL Fixed Memory Window. The use of `insert_resource_conflict` -retains the RESERVED region and makes it a child of the new region. - -0xffffffff811016a4 in insert_resource_conflict () - insert_resource () -0xffffffff81a81389 in cxl_parse_cfmws () -0xffffffff818c4a81 in call_handler () - acpi_parse_entries_array () - -(gdb) p/x *new -$59 = {start = 0x290000000, end = 0x2cfffffff, name = "CXL Window 0", - flags = 0x200, desc = 0x0, parent = 0x0, sibling = 0x0, - child = 0x0} - -This produces the following output in /proc/iomem: - -590000000-68fffffff : CXL Window 0 - 590000000-68fffffff : Reserved - -This reserved area causes `get_free_mem_region()` to fail due to a check -against `__region_intersects()`. Due to this reserved area, the -intersect check will only ever return REGION_INTERSECTS, which causes -`cxl create-region` to always fail. 
- -Signed-off-by: Gregory Price ---- - hw/i386/pc.c | 2 -- - 1 file changed, 2 deletions(-) - -diff --git a/hw/i386/pc.c b/hw/i386/pc.c -index 566accf7e6..5bf5465a21 100644 ---- a/hw/i386/pc.c -+++ b/hw/i386/pc.c -@@ -1061,7 +1061,6 @@ void pc_memory_init(PCMachineState *pcms, - hwaddr cxl_size = MiB; - - cxl_base = pc_get_cxl_range_start(pcms); -- e820_add_entry(cxl_base, cxl_size, E820_RESERVED); - memory_region_init(mr, OBJECT(machine), "cxl_host_reg", cxl_size); - memory_region_add_subregion(system_memory, cxl_base, mr); - cxl_resv_end = cxl_base + cxl_size; -@@ -1077,7 +1076,6 @@ void pc_memory_init(PCMachineState *pcms, - memory_region_init_io(&fw->mr, OBJECT(machine), &cfmws_ops, fw, - "cxl-fixed-memory-region", fw->size); - memory_region_add_subregion(system_memory, fw->base, &fw->mr); -- e820_add_entry(fw->base, fw->size, E820_RESERVED); - cxl_fmw_base += fw->size; - cxl_resv_end = cxl_fmw_base; - } --- -2.37.3 - -Early-boot e820 records will be inserted by the bios/efi/early boot -software and be reported to the kernel via insert_resource. Later, when -CXL drivers iterate through the regions again, they will insert another -resource and make the RESERVED memory area a child. - -This RESERVED memory area causes the memory region to become unusable, -and as a result attempting to create memory regions with - - `cxl create-region ...` - -Will fail due to the RESERVED area intersecting with the CXL window. - - -During boot the following traceback is observed: - -0xffffffff81101650 in insert_resource_expand_to_fit () -0xffffffff83d964c5 in e820__reserve_resources_late () -0xffffffff83e03210 in pcibios_resource_survey () -0xffffffff83e04f4a in pcibios_init () - -Which produces a call to reserve the CFMWS area: - -(gdb) p *new -$54 = {start = 0x290000000, end = 0x2cfffffff, name = "Reserved", - flags = 0x200, desc = 0x7, parent = 0x0, sibling = 0x0, - child = 0x0} - -Later the Kernel parses ACPI tables and reserves the exact same area as -the CXL Fixed Memory Window. The use of `insert_resource_conflict` -retains the RESERVED region and makes it a child of the new region. - -0xffffffff811016a4 in insert_resource_conflict () - insert_resource () -0xffffffff81a81389 in cxl_parse_cfmws () -0xffffffff818c4a81 in call_handler () - acpi_parse_entries_array () - -(gdb) p/x *new -$59 = {start = 0x290000000, end = 0x2cfffffff, name = "CXL Window 0", - flags = 0x200, desc = 0x0, parent = 0x0, sibling = 0x0, - child = 0x0} - -This produces the following output in /proc/iomem: - -590000000-68fffffff : CXL Window 0 - 590000000-68fffffff : Reserved - -This reserved area causes `get_free_mem_region()` to fail due to a check -against `__region_intersects()`. Due to this reserved area, the -intersect check will only ever return REGION_INTERSECTS, which causes -`cxl create-region` to always fail. 
- -Signed-off-by: Gregory Price ---- - hw/i386/pc.c | 2 -- - 1 file changed, 2 deletions(-) - -diff --git a/hw/i386/pc.c b/hw/i386/pc.c -index 566accf7e6..5bf5465a21 100644 ---- a/hw/i386/pc.c -+++ b/hw/i386/pc.c -@@ -1061,7 +1061,6 @@ void pc_memory_init(PCMachineState *pcms, - hwaddr cxl_size = MiB; -cxl_base = pc_get_cxl_range_start(pcms); -- e820_add_entry(cxl_base, cxl_size, E820_RESERVED); - memory_region_init(mr, OBJECT(machine), "cxl_host_reg", cxl_size); - memory_region_add_subregion(system_memory, cxl_base, mr); - cxl_resv_end = cxl_base + cxl_size; -@@ -1077,7 +1076,6 @@ void pc_memory_init(PCMachineState *pcms, - memory_region_init_io(&fw->mr, OBJECT(machine), &cfmws_ops, -fw, - "cxl-fixed-memory-region", fw->size); - memory_region_add_subregion(system_memory, fw->base, &fw->mr); -Or will this be subregion of cxl_base? - -Thanks, -Pankaj -- e820_add_entry(fw->base, fw->size, E820_RESERVED); - cxl_fmw_base += fw->size; - cxl_resv_end = cxl_fmw_base; - } - -> -> - e820_add_entry(cxl_base, cxl_size, E820_RESERVED); -> -> memory_region_init(mr, OBJECT(machine), "cxl_host_reg", cxl_size); -> -> memory_region_add_subregion(system_memory, cxl_base, mr); -> -> cxl_resv_end = cxl_base + cxl_size; -> -> @@ -1077,7 +1076,6 @@ void pc_memory_init(PCMachineState *pcms, -> -> memory_region_init_io(&fw->mr, OBJECT(machine), -> -> &cfmws_ops, fw, -> -> "cxl-fixed-memory-region", -> -> fw->size); -> -> memory_region_add_subregion(system_memory, fw->base, -> -> &fw->mr); -> -> -Or will this be subregion of cxl_base? -> -> -Thanks, -> -Pankaj -The memory region backing this memory area still has to be initialized -and added in the QEMU system, but it will now be initialized for use by -linux after PCI/ACPI setup occurs and the CXL driver discovers it via -CDAT. - -It's also still possible to assign this area a static memory region at -bool by setting up the SRATs in the ACPI tables, but that patch is not -upstream yet. - -On Tue, Oct 18, 2022 at 5:14 AM Gregory Price wrote: -> -> -Early-boot e820 records will be inserted by the bios/efi/early boot -> -software and be reported to the kernel via insert_resource. Later, when -> -CXL drivers iterate through the regions again, they will insert another -> -resource and make the RESERVED memory area a child. -I have already sent a patch -https://www.mail-archive.com/qemu-devel@nongnu.org/msg882012.html -. -When the patch is applied, there would not be any reserved entries -even with passing E820_RESERVED . -So this patch needs to be evaluated in the light of the above patch I -sent. Once you apply my patch, does the issue still exist? - -> -> -This RESERVED memory area causes the memory region to become unusable, -> -and as a result attempting to create memory regions with -> -> -`cxl create-region ...` -> -> -Will fail due to the RESERVED area intersecting with the CXL window. -> -> -> -During boot the following traceback is observed: -> -> -0xffffffff81101650 in insert_resource_expand_to_fit () -> -0xffffffff83d964c5 in e820__reserve_resources_late () -> -0xffffffff83e03210 in pcibios_resource_survey () -> -0xffffffff83e04f4a in pcibios_init () -> -> -Which produces a call to reserve the CFMWS area: -> -> -(gdb) p *new -> -$54 = {start = 0x290000000, end = 0x2cfffffff, name = "Reserved", -> -flags = 0x200, desc = 0x7, parent = 0x0, sibling = 0x0, -> -child = 0x0} -> -> -Later the Kernel parses ACPI tables and reserves the exact same area as -> -the CXL Fixed Memory Window. 
The use of `insert_resource_conflict` -> -retains the RESERVED region and makes it a child of the new region. -> -> -0xffffffff811016a4 in insert_resource_conflict () -> -insert_resource () -> -0xffffffff81a81389 in cxl_parse_cfmws () -> -0xffffffff818c4a81 in call_handler () -> -acpi_parse_entries_array () -> -> -(gdb) p/x *new -> -$59 = {start = 0x290000000, end = 0x2cfffffff, name = "CXL Window 0", -> -flags = 0x200, desc = 0x0, parent = 0x0, sibling = 0x0, -> -child = 0x0} -> -> -This produces the following output in /proc/iomem: -> -> -590000000-68fffffff : CXL Window 0 -> -590000000-68fffffff : Reserved -> -> -This reserved area causes `get_free_mem_region()` to fail due to a check -> -against `__region_intersects()`. Due to this reserved area, the -> -intersect check will only ever return REGION_INTERSECTS, which causes -> -`cxl create-region` to always fail. -> -> -Signed-off-by: Gregory Price -> ---- -> -hw/i386/pc.c | 2 -- -> -1 file changed, 2 deletions(-) -> -> -diff --git a/hw/i386/pc.c b/hw/i386/pc.c -> -index 566accf7e6..5bf5465a21 100644 -> ---- a/hw/i386/pc.c -> -+++ b/hw/i386/pc.c -> -@@ -1061,7 +1061,6 @@ void pc_memory_init(PCMachineState *pcms, -> -hwaddr cxl_size = MiB; -> -> -cxl_base = pc_get_cxl_range_start(pcms); -> -- e820_add_entry(cxl_base, cxl_size, E820_RESERVED); -> -memory_region_init(mr, OBJECT(machine), "cxl_host_reg", cxl_size); -> -memory_region_add_subregion(system_memory, cxl_base, mr); -> -cxl_resv_end = cxl_base + cxl_size; -> -@@ -1077,7 +1076,6 @@ void pc_memory_init(PCMachineState *pcms, -> -memory_region_init_io(&fw->mr, OBJECT(machine), &cfmws_ops, -> -fw, -> -"cxl-fixed-memory-region", fw->size); -> -memory_region_add_subregion(system_memory, fw->base, -> -&fw->mr); -> -- e820_add_entry(fw->base, fw->size, E820_RESERVED); -> -cxl_fmw_base += fw->size; -> -cxl_resv_end = cxl_fmw_base; -> -} -> --- -> -2.37.3 -> - -This patch does not resolve the issue, reserved entries are still created. -[    0.000000] BIOS-e820: [mem 0x0000000280000000-0x00000002800fffff] reserved -[    0.000000] BIOS-e820: [mem 0x0000000290000000-0x000000029fffffff] reserved -# cat /proc/iomem -290000000-29fffffff : CXL Window 0 -  290000000-29fffffff : Reserved -# cxl create-region -m -d decoder0.0 -w 1 -g 256 mem0 -cxl region: create_region: region0: set_size failed: Numerical result out of range -cxl region: cmd_create_region: created 0 regions -On Tue, Oct 18, 2022 at 2:05 AM Ani Sinha < -ani@anisinha.ca -> wrote: -On Tue, Oct 18, 2022 at 5:14 AM Gregory Price < -gourry.memverge@gmail.com -> wrote: -> -> Early-boot e820 records will be inserted by the bios/efi/early boot -> software and be reported to the kernel via insert_resource.  Later, when -> CXL drivers iterate through the regions again, they will insert another -> resource and make the RESERVED memory area a child. -I have already sent a patch -https://www.mail-archive.com/qemu-devel@nongnu.org/msg882012.html -. -When the patch is applied, there would not be any reserved entries -even with passing E820_RESERVED . -So this patch needs to be evaluated in the light of the above patch I -sent. Once you apply my patch, does the issue still exist? -> -> This RESERVED memory area causes the memory region to become unusable, -> and as a result attempting to create memory regions with -> ->     `cxl create-region ...` -> -> Will fail due to the RESERVED area intersecting with the CXL window. 
-> -> -> During boot the following traceback is observed: -> -> 0xffffffff81101650 in insert_resource_expand_to_fit () -> 0xffffffff83d964c5 in e820__reserve_resources_late () -> 0xffffffff83e03210 in pcibios_resource_survey () -> 0xffffffff83e04f4a in pcibios_init () -> -> Which produces a call to reserve the CFMWS area: -> -> (gdb) p *new -> $54 = {start = 0x290000000, end = 0x2cfffffff, name = "Reserved", ->        flags = 0x200, desc = 0x7, parent = 0x0, sibling = 0x0, ->        child = 0x0} -> -> Later the Kernel parses ACPI tables and reserves the exact same area as -> the CXL Fixed Memory Window.  The use of `insert_resource_conflict` -> retains the RESERVED region and makes it a child of the new region. -> -> 0xffffffff811016a4 in insert_resource_conflict () ->                       insert_resource () -> 0xffffffff81a81389 in cxl_parse_cfmws () -> 0xffffffff818c4a81 in call_handler () ->                       acpi_parse_entries_array () -> -> (gdb) p/x *new -> $59 = {start = 0x290000000, end = 0x2cfffffff, name = "CXL Window 0", ->        flags = 0x200, desc = 0x0, parent = 0x0, sibling = 0x0, ->        child = 0x0} -> -> This produces the following output in /proc/iomem: -> -> 590000000-68fffffff : CXL Window 0 ->   590000000-68fffffff : Reserved -> -> This reserved area causes `get_free_mem_region()` to fail due to a check -> against `__region_intersects()`.  Due to this reserved area, the -> intersect check will only ever return REGION_INTERSECTS, which causes -> `cxl create-region` to always fail. -> -> Signed-off-by: Gregory Price < -gregory.price@memverge.com -> -> --- ->  hw/i386/pc.c | 2 -- ->  1 file changed, 2 deletions(-) -> -> diff --git a/hw/i386/pc.c b/hw/i386/pc.c -> index 566accf7e6..5bf5465a21 100644 -> --- a/hw/i386/pc.c -> +++ b/hw/i386/pc.c -> @@ -1061,7 +1061,6 @@ void pc_memory_init(PCMachineState *pcms, ->          hwaddr cxl_size = MiB; -> ->          cxl_base = pc_get_cxl_range_start(pcms); -> -        e820_add_entry(cxl_base, cxl_size, E820_RESERVED); ->          memory_region_init(mr, OBJECT(machine), "cxl_host_reg", cxl_size); ->          memory_region_add_subregion(system_memory, cxl_base, mr); ->          cxl_resv_end = cxl_base + cxl_size; -> @@ -1077,7 +1076,6 @@ void pc_memory_init(PCMachineState *pcms, ->                  memory_region_init_io(&fw->mr, OBJECT(machine), &cfmws_ops, fw, ->                                        "cxl-fixed-memory-region", fw->size); ->                  memory_region_add_subregion(system_memory, fw->base, &fw->mr); -> -                e820_add_entry(fw->base, fw->size, E820_RESERVED); ->                  cxl_fmw_base += fw->size; ->                  cxl_resv_end = cxl_fmw_base; ->              } -> -- -> 2.37.3 -> - -+Gerd Hoffmann - -On Tue, Oct 18, 2022 at 8:16 PM Gregory Price wrote: -> -> -This patch does not resolve the issue, reserved entries are still created. 
-> -> -[ 0.000000] BIOS-e820: [mem 0x0000000280000000-0x00000002800fffff] reserved -> -[ 0.000000] BIOS-e820: [mem 0x0000000290000000-0x000000029fffffff] reserved -> -> -# cat /proc/iomem -> -290000000-29fffffff : CXL Window 0 -> -290000000-29fffffff : Reserved -> -> -# cxl create-region -m -d decoder0.0 -w 1 -g 256 mem0 -> -cxl region: create_region: region0: set_size failed: Numerical result out of -> -range -> -cxl region: cmd_create_region: created 0 regions -> -> -On Tue, Oct 18, 2022 at 2:05 AM Ani Sinha wrote: -> -> -> -> On Tue, Oct 18, 2022 at 5:14 AM Gregory Price -> -> wrote: -> -> > -> -> > Early-boot e820 records will be inserted by the bios/efi/early boot -> -> > software and be reported to the kernel via insert_resource. Later, when -> -> > CXL drivers iterate through the regions again, they will insert another -> -> > resource and make the RESERVED memory area a child. -> -> -> -> I have already sent a patch -> -> -https://www.mail-archive.com/qemu-devel@nongnu.org/msg882012.html -. -> -> When the patch is applied, there would not be any reserved entries -> -> even with passing E820_RESERVED . -> -> So this patch needs to be evaluated in the light of the above patch I -> -> sent. Once you apply my patch, does the issue still exist? -> -> -> -> > -> -> > This RESERVED memory area causes the memory region to become unusable, -> -> > and as a result attempting to create memory regions with -> -> > -> -> > `cxl create-region ...` -> -> > -> -> > Will fail due to the RESERVED area intersecting with the CXL window. -> -> > -> -> > -> -> > During boot the following traceback is observed: -> -> > -> -> > 0xffffffff81101650 in insert_resource_expand_to_fit () -> -> > 0xffffffff83d964c5 in e820__reserve_resources_late () -> -> > 0xffffffff83e03210 in pcibios_resource_survey () -> -> > 0xffffffff83e04f4a in pcibios_init () -> -> > -> -> > Which produces a call to reserve the CFMWS area: -> -> > -> -> > (gdb) p *new -> -> > $54 = {start = 0x290000000, end = 0x2cfffffff, name = "Reserved", -> -> > flags = 0x200, desc = 0x7, parent = 0x0, sibling = 0x0, -> -> > child = 0x0} -> -> > -> -> > Later the Kernel parses ACPI tables and reserves the exact same area as -> -> > the CXL Fixed Memory Window. The use of `insert_resource_conflict` -> -> > retains the RESERVED region and makes it a child of the new region. -> -> > -> -> > 0xffffffff811016a4 in insert_resource_conflict () -> -> > insert_resource () -> -> > 0xffffffff81a81389 in cxl_parse_cfmws () -> -> > 0xffffffff818c4a81 in call_handler () -> -> > acpi_parse_entries_array () -> -> > -> -> > (gdb) p/x *new -> -> > $59 = {start = 0x290000000, end = 0x2cfffffff, name = "CXL Window 0", -> -> > flags = 0x200, desc = 0x0, parent = 0x0, sibling = 0x0, -> -> > child = 0x0} -> -> > -> -> > This produces the following output in /proc/iomem: -> -> > -> -> > 590000000-68fffffff : CXL Window 0 -> -> > 590000000-68fffffff : Reserved -> -> > -> -> > This reserved area causes `get_free_mem_region()` to fail due to a check -> -> > against `__region_intersects()`. Due to this reserved area, the -> -> > intersect check will only ever return REGION_INTERSECTS, which causes -> -> > `cxl create-region` to always fail. 
-> -> > -> -> > Signed-off-by: Gregory Price -> -> > --- -> -> > hw/i386/pc.c | 2 -- -> -> > 1 file changed, 2 deletions(-) -> -> > -> -> > diff --git a/hw/i386/pc.c b/hw/i386/pc.c -> -> > index 566accf7e6..5bf5465a21 100644 -> -> > --- a/hw/i386/pc.c -> -> > +++ b/hw/i386/pc.c -> -> > @@ -1061,7 +1061,6 @@ void pc_memory_init(PCMachineState *pcms, -> -> > hwaddr cxl_size = MiB; -> -> > -> -> > cxl_base = pc_get_cxl_range_start(pcms); -> -> > - e820_add_entry(cxl_base, cxl_size, E820_RESERVED); -> -> > memory_region_init(mr, OBJECT(machine), "cxl_host_reg", cxl_size); -> -> > memory_region_add_subregion(system_memory, cxl_base, mr); -> -> > cxl_resv_end = cxl_base + cxl_size; -> -> > @@ -1077,7 +1076,6 @@ void pc_memory_init(PCMachineState *pcms, -> -> > memory_region_init_io(&fw->mr, OBJECT(machine), -> -> > &cfmws_ops, fw, -> -> > "cxl-fixed-memory-region", -> -> > fw->size); -> -> > memory_region_add_subregion(system_memory, fw->base, -> -> > &fw->mr); -> -> > - e820_add_entry(fw->base, fw->size, E820_RESERVED); -> -> > cxl_fmw_base += fw->size; -> -> > cxl_resv_end = cxl_fmw_base; -> -> > } -> -> > -- -> -> > 2.37.3 -> -> > - -> ->> > diff --git a/hw/i386/pc.c b/hw/i386/pc.c -> ->> > index 566accf7e6..5bf5465a21 100644 -> ->> > --- a/hw/i386/pc.c -> ->> > +++ b/hw/i386/pc.c -> ->> > @@ -1061,7 +1061,6 @@ void pc_memory_init(PCMachineState *pcms, -> ->> > hwaddr cxl_size = MiB; -> ->> > -> ->> > cxl_base = pc_get_cxl_range_start(pcms); -> ->> > - e820_add_entry(cxl_base, cxl_size, E820_RESERVED); -Just dropping it doesn't look like a good plan to me. - -You can try set etc/reserved-memory-end fw_cfg file instead. Firmware -(both seabios and ovmf) read it and will make sure the 64bit pci mmio -window is placed above that address, i.e. this effectively reserves -address space. Right now used by memory hotplug code, but should work -for cxl too I think (disclaimer: don't know much about cxl ...). - -take care & HTH, - Gerd - -On Tue, 8 Nov 2022 12:21:11 +0100 -Gerd Hoffmann wrote: - -> -> >> > diff --git a/hw/i386/pc.c b/hw/i386/pc.c -> -> >> > index 566accf7e6..5bf5465a21 100644 -> -> >> > --- a/hw/i386/pc.c -> -> >> > +++ b/hw/i386/pc.c -> -> >> > @@ -1061,7 +1061,6 @@ void pc_memory_init(PCMachineState *pcms, -> -> >> > hwaddr cxl_size = MiB; -> -> >> > -> -> >> > cxl_base = pc_get_cxl_range_start(pcms); -> -> >> > - e820_add_entry(cxl_base, cxl_size, E820_RESERVED); -> -> -Just dropping it doesn't look like a good plan to me. -> -> -You can try set etc/reserved-memory-end fw_cfg file instead. Firmware -> -(both seabios and ovmf) read it and will make sure the 64bit pci mmio -> -window is placed above that address, i.e. this effectively reserves -> -address space. Right now used by memory hotplug code, but should work -> -for cxl too I think (disclaimer: don't know much about cxl ...). -As far as I know CXL impl. in QEMU isn't using etc/reserved-memory-end -at all, it' has its own mapping. - -Regardless of that, reserved E820 entries look wrong, and looking at -commit message OS is right to bailout on them (expected according -to ACPI spec). -Also spec says - -" -E820 Assumptions and Limitations - [...] - The platform boot firmware does not return a range description for the memory -mapping of - PCI devices, ISA Option ROMs, and ISA Plug and Play cards because the OS has -mechanisms - available to detect them. -" - -so dropping reserved entries looks reasonable from ACPI spec point of view. -(disclaimer: don't know much about cxl ... 
either) -> -> -take care & HTH, -> -Gerd -> - -On Fri, Nov 11, 2022 at 11:51:23AM +0100, Igor Mammedov wrote: -> -On Tue, 8 Nov 2022 12:21:11 +0100 -> -Gerd Hoffmann wrote: -> -> -> > >> > diff --git a/hw/i386/pc.c b/hw/i386/pc.c -> -> > >> > index 566accf7e6..5bf5465a21 100644 -> -> > >> > --- a/hw/i386/pc.c -> -> > >> > +++ b/hw/i386/pc.c -> -> > >> > @@ -1061,7 +1061,6 @@ void pc_memory_init(PCMachineState *pcms, -> -> > >> > hwaddr cxl_size = MiB; -> -> > >> > -> -> > >> > cxl_base = pc_get_cxl_range_start(pcms); -> -> > >> > - e820_add_entry(cxl_base, cxl_size, E820_RESERVED); -> -> -> -> Just dropping it doesn't look like a good plan to me. -> -> -> -> You can try set etc/reserved-memory-end fw_cfg file instead. Firmware -> -> (both seabios and ovmf) read it and will make sure the 64bit pci mmio -> -> window is placed above that address, i.e. this effectively reserves -> -> address space. Right now used by memory hotplug code, but should work -> -> for cxl too I think (disclaimer: don't know much about cxl ...). -> -> -As far as I know CXL impl. in QEMU isn't using etc/reserved-memory-end -> -at all, it' has its own mapping. -This should be changed. cxl should make sure the highest address used -is stored in etc/reserved-memory-end to avoid the firmware mapping pci -resources there. - -> -so dropping reserved entries looks reasonable from ACPI spec point of view. -Yep, I don't want dispute that. - -I suspect the reason for these entries to exist in the first place is to -inform the firmware that it should not place stuff there, and if we -remove that to conform with the spec we need some alternative way for -that ... - -take care, - Gerd - -On Fri, 11 Nov 2022 12:40:59 +0100 -Gerd Hoffmann wrote: - -> -On Fri, Nov 11, 2022 at 11:51:23AM +0100, Igor Mammedov wrote: -> -> On Tue, 8 Nov 2022 12:21:11 +0100 -> -> Gerd Hoffmann wrote: -> -> -> -> > > >> > diff --git a/hw/i386/pc.c b/hw/i386/pc.c -> -> > > >> > index 566accf7e6..5bf5465a21 100644 -> -> > > >> > --- a/hw/i386/pc.c -> -> > > >> > +++ b/hw/i386/pc.c -> -> > > >> > @@ -1061,7 +1061,6 @@ void pc_memory_init(PCMachineState *pcms, -> -> > > >> > hwaddr cxl_size = MiB; -> -> > > >> > -> -> > > >> > cxl_base = pc_get_cxl_range_start(pcms); -> -> > > >> > - e820_add_entry(cxl_base, cxl_size, E820_RESERVED); -> -> > -> -> > Just dropping it doesn't look like a good plan to me. -> -> > -> -> > You can try set etc/reserved-memory-end fw_cfg file instead. Firmware -> -> > (both seabios and ovmf) read it and will make sure the 64bit pci mmio -> -> > window is placed above that address, i.e. this effectively reserves -> -> > address space. Right now used by memory hotplug code, but should work -> -> > for cxl too I think (disclaimer: don't know much about cxl ...). -> -> -> -> As far as I know CXL impl. in QEMU isn't using etc/reserved-memory-end -> -> at all, it' has its own mapping. -> -> -This should be changed. cxl should make sure the highest address used -> -is stored in etc/reserved-memory-end to avoid the firmware mapping pci -> -resources there. -if (pcmc->has_reserved_memory && machine->device_memory->base) { - -[...] 
- - if (pcms->cxl_devices_state.is_enabled) { - - res_mem_end = cxl_resv_end; - -that should be handled by this line - - } - - *val = cpu_to_le64(ROUND_UP(res_mem_end, 1 * GiB)); - - fw_cfg_add_file(fw_cfg, "etc/reserved-memory-end", val, sizeof(*val)); - - } - -so SeaBIOS shouldn't intrude into CXL address space -(I assume EDK2 behave similarly here) - -> -> so dropping reserved entries looks reasonable from ACPI spec point of view. -> -> -> -> -Yep, I don't want dispute that. -> -> -I suspect the reason for these entries to exist in the first place is to -> -inform the firmware that it should not place stuff there, and if we -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -just to educate me, can you point out what SeaBIOS code does with reservations. - -> -remove that to conform with the spec we need some alternative way for -> -that ... -with etc/reserved-memory-end set as above, -is E820_RESERVED really needed here? - -(my understanding was that E820_RESERVED weren't accounted for when -initializing PCI devices) - -> -> -take care, -> -Gerd -> - -> -if (pcmc->has_reserved_memory && machine->device_memory->base) { -> -> -[...] -> -> -if (pcms->cxl_devices_state.is_enabled) { -> -> -res_mem_end = cxl_resv_end; -> -> -that should be handled by this line -> -> -} -> -> -*val = cpu_to_le64(ROUND_UP(res_mem_end, 1 * GiB)); -> -> -fw_cfg_add_file(fw_cfg, "etc/reserved-memory-end", val, -> -sizeof(*val)); -> -} -> -> -so SeaBIOS shouldn't intrude into CXL address space -Yes, looks good, so with this in place already everyting should be fine. - -> -(I assume EDK2 behave similarly here) -Correct, ovmf reads that fw_cfg file too. - -> -> I suspect the reason for these entries to exist in the first place is to -> -> inform the firmware that it should not place stuff there, and if we -> -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -> -just to educate me, can you point out what SeaBIOS code does with -> -reservations. -They are added to the e820 map which gets passed on to the OS. seabios -uses (and updateas) the e820 map too, when allocating memory for -example. While thinking about it I'm not fully sure it actually looks -at reservations, maybe it only uses (and updates) ram entries when -allocating memory. - -> -> remove that to conform with the spec we need some alternative way for -> -> that ... -> -> -with etc/reserved-memory-end set as above, -> -is E820_RESERVED really needed here? -No. Setting etc/reserved-memory-end is enough. - -So for the original patch: -Acked-by: Gerd Hoffmann - -take care, - Gerd - -On Fri, Nov 11, 2022 at 02:36:02PM +0100, Gerd Hoffmann wrote: -> -> if (pcmc->has_reserved_memory && machine->device_memory->base) { -> -> -> -> [...] -> -> -> -> if (pcms->cxl_devices_state.is_enabled) { -> -> -> -> res_mem_end = cxl_resv_end; -> -> -> -> that should be handled by this line -> -> -> -> } -> -> -> -> *val = cpu_to_le64(ROUND_UP(res_mem_end, 1 * GiB)); -> -> -> -> fw_cfg_add_file(fw_cfg, "etc/reserved-memory-end", val, -> -> sizeof(*val)); -> -> } -> -> -> -> so SeaBIOS shouldn't intrude into CXL address space -> -> -Yes, looks good, so with this in place already everyting should be fine. -> -> -> (I assume EDK2 behave similarly here) -> -> -Correct, ovmf reads that fw_cfg file too. 
-> -> -> > I suspect the reason for these entries to exist in the first place is to -> -> > inform the firmware that it should not place stuff there, and if we -> -> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -> -> just to educate me, can you point out what SeaBIOS code does with -> -> reservations. -> -> -They are added to the e820 map which gets passed on to the OS. seabios -> -uses (and updateas) the e820 map too, when allocating memory for -> -example. While thinking about it I'm not fully sure it actually looks -> -at reservations, maybe it only uses (and updates) ram entries when -> -allocating memory. -> -> -> > remove that to conform with the spec we need some alternative way for -> -> > that ... -> -> -> -> with etc/reserved-memory-end set as above, -> -> is E820_RESERVED really needed here? -> -> -No. Setting etc/reserved-memory-end is enough. -> -> -So for the original patch: -> -Acked-by: Gerd Hoffmann -> -> -take care, -> -Gerd -It's upstream already, sorry I can't add your tag. - --- -MST - diff --git a/classification_output/01/instruction/7733130 b/classification_output/01/instruction/7733130 deleted file mode 100644 index 1c3bc483f..000000000 --- a/classification_output/01/instruction/7733130 +++ /dev/null @@ -1,47 +0,0 @@ -instruction: 0.758 -semantic: 0.694 -other: 0.687 -mistranslation: 0.516 - -[Qemu-devel] [BUG] VNC: client won't send FramebufferUpdateRequest if job in flight is aborted - -Hi Gerd, Daniel. - -We noticed that if VncSharePolicy was configured with -VNC_SHARE_POLICY_FORCE_SHARED mode and -multiple vnc clients opened vnc connections, some clients could go blank screen -at high probability. -This problem can be reproduced when we regularly reboot suse12sp3 in graphic -mode both -with RealVNC and noVNC client. - -Then we dig into it and find out that some clients go blank screen because they -don't -send FramebufferUpdateRequest any more. One step further, we notice that each -time -the job in flight is aborted one client go blank screen. - -The bug is triggered in the following procedure. -Guest reboot => graphic mode switch => graphic_hw_update => vga_update_display -=> vga_draw_graphic (full_update = 1) => dpy_gfx_replace_surface => -vnc_dpy_switch => -vnc_abort_display_jobs (client may have job in flight) => job removed from the -queue -If one client has vnc job in flight, *vnc_abort_display_jobs* will wait until -its job is abandoned. -This behavior is done in vnc_worker_thread_loop when 'if (job->vs->ioc == NULL -|| job->vs->abort == true)' -branch is taken. - -As we can see, *vnc_abort_display_jobs* is intended to do some optimization to -avoid unnecessary client update. -But if client sends FramebufferUpdateRequest for some graphic area and its -FramebufferUpdate response job -is abandoned, the client may wait for the response and never send new -FramebufferUpdateRequest, which may -case the client go blank screen forever. - -So I am wondering whether we should drop the *vnc_abort_display_jobs* -optimization or do some trick here -to push the client to send new FramebufferUpdateRequest. Do you have any idea ? 
- diff --git a/classification_output/01/instruction/7960594 b/classification_output/01/instruction/7960594 deleted file mode 100644 index c06d35dd8..000000000 --- a/classification_output/01/instruction/7960594 +++ /dev/null @@ -1,158 +0,0 @@ -instruction: 0.991 -other: 0.979 -semantic: 0.974 -mistranslation: 0.930 - -[Qemu-devel] [Bug Report] vm paused after succeeding to migrate - -Hi, all -I encounterd a bug when I try to migrate a windows vm. - -Enviroment information: -host A: cpu E5620(model WestmereEP without flag xsave) -host B: cpu E5-2643(model SandyBridgeEP with xsave) - -The reproduce steps is : -1. Start a windows 2008 vm with -cpu host(which means host-passthrough). -2. Migrate the vm to host B when cr4.OSXSAVE=0 (successfully). -3. Vm runs on host B for a while so that cr4.OSXSAVE changes to 1. -4. Then migrate the vm to host A (successfully), but vm was paused, and qemu -printed log as followed: - -KVM: entry failed, hardware error 0x80000021 - -If you're running a guest on an Intel machine without unrestricted mode -support, the failure can be most likely due to the guest entering an invalid -state for Intel VT. For example, the guest maybe running in big real mode -which is not supported on less recent Intel processors. - -EAX=019b3bb0 EBX=01a3ae80 ECX=01a61ce8 EDX=00000000 -ESI=01a62000 EDI=00000000 EBP=00000000 ESP=01718b20 -EIP=0185d982 EFL=00000286 [--S--P-] CPL=0 II=0 A20=1 SMM=0 HLT=0 -ES =0000 00000000 0000ffff 00009300 -CS =f000 ffff0000 0000ffff 00009b00 -SS =0000 00000000 0000ffff 00009300 -DS =0000 00000000 0000ffff 00009300 -FS =0000 00000000 0000ffff 00009300 -GS =0000 00000000 0000ffff 00009300 -LDT=0000 00000000 0000ffff 00008200 -TR =0000 00000000 0000ffff 00008b00 -GDT= 00000000 0000ffff -IDT= 00000000 0000ffff -CR0=60000010 CR2=00000000 CR3=00000000 CR4=00000000 -DR0=0000000000000000 DR1=0000000000000000 DR2=0000000000000000 -DR3=0000000000000000 -DR6=00000000ffff0ff0 DR7=0000000000000400 -EFER=0000000000000000 -Code=00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 <00> 00 00 00 -00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 - -I have found that problem happened when kvm_put_sregs returns err -22(called by -kvm_arch_put_registers(qemu)). -Because kvm_arch_vcpu_ioctl_set_sregs(kvm-mod) checked that guest_cpuid_has no -X86_FEATURE_XSAVE but cr4.OSXSAVE=1. -So should we cancel migration when kvm_arch_put_registers returns error? - -* linzhecheng (address@hidden) wrote: -> -Hi, all -> -I encounterd a bug when I try to migrate a windows vm. -> -> -Enviroment information: -> -host A: cpu E5620(model WestmereEP without flag xsave) -> -host B: cpu E5-2643(model SandyBridgeEP with xsave) -> -> -The reproduce steps is : -> -1. Start a windows 2008 vm with -cpu host(which means host-passthrough). -> -2. Migrate the vm to host B when cr4.OSXSAVE=0 (successfully). -> -3. Vm runs on host B for a while so that cr4.OSXSAVE changes to 1. -> -4. Then migrate the vm to host A (successfully), but vm was paused, and qemu -> -printed log as followed: -Remember that migrating using -cpu host across different CPU models is NOT -expected to work. - -> -KVM: entry failed, hardware error 0x80000021 -> -> -If you're running a guest on an Intel machine without unrestricted mode -> -support, the failure can be most likely due to the guest entering an invalid -> -state for Intel VT. For example, the guest maybe running in big real mode -> -which is not supported on less recent Intel processors. 
-> -> -EAX=019b3bb0 EBX=01a3ae80 ECX=01a61ce8 EDX=00000000 -> -ESI=01a62000 EDI=00000000 EBP=00000000 ESP=01718b20 -> -EIP=0185d982 EFL=00000286 [--S--P-] CPL=0 II=0 A20=1 SMM=0 HLT=0 -> -ES =0000 00000000 0000ffff 00009300 -> -CS =f000 ffff0000 0000ffff 00009b00 -> -SS =0000 00000000 0000ffff 00009300 -> -DS =0000 00000000 0000ffff 00009300 -> -FS =0000 00000000 0000ffff 00009300 -> -GS =0000 00000000 0000ffff 00009300 -> -LDT=0000 00000000 0000ffff 00008200 -> -TR =0000 00000000 0000ffff 00008b00 -> -GDT= 00000000 0000ffff -> -IDT= 00000000 0000ffff -> -CR0=60000010 CR2=00000000 CR3=00000000 CR4=00000000 -> -DR0=0000000000000000 DR1=0000000000000000 DR2=0000000000000000 -> -DR3=0000000000000000 -> -DR6=00000000ffff0ff0 DR7=0000000000000400 -> -EFER=0000000000000000 -> -Code=00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 <00> 00 00 -> -00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 -> -00 -> -> -I have found that problem happened when kvm_put_sregs returns err -22(called -> -by kvm_arch_put_registers(qemu)). -> -Because kvm_arch_vcpu_ioctl_set_sregs(kvm-mod) checked that guest_cpuid_has -> -no X86_FEATURE_XSAVE but cr4.OSXSAVE=1. -> -So should we cancel migration when kvm_arch_put_registers returns error? -It would seem good if we can make the migration fail there rather than -hitting that KVM error. -It looks like we need to do a bit of plumbing to convert the places that -call it to return a bool rather than void. - -Dave - --- -Dr. David Alan Gilbert / address@hidden / Manchester, UK - diff --git a/classification_output/01/instruction/8019995 b/classification_output/01/instruction/8019995 deleted file mode 100644 index 92d85cc82..000000000 --- a/classification_output/01/instruction/8019995 +++ /dev/null @@ -1,31 +0,0 @@ -instruction: 0.753 -semantic: 0.698 -mistranslation: 0.633 -other: 0.620 - -[BUG]The latest qemu crashed when I tested cxl - -I test cxl with the patch:[v11,0/2] arm/virt: - CXL support via pxb_cxl. -https://patchwork.kernel.org/project/cxl/cover/20220616141950.23374-1-Jonathan.Cameron@huawei.com/ -But the qemu crashed,and showing an error: -qemu-system-aarch64: ../hw/arm/virt.c:1735: virt_get_high_memmap_enabled: - Assertion `ARRAY_SIZE(extended_memmap) - VIRT_LOWMEMMAP_LAST == ARRAY_SIZE(enabled_array)' failed. -Then I modify the patch to fix the bug: -diff --git a/hw/arm/virt.c b/hw/arm/virt.c -index ea2413a0ba..3d4cee3491 100644 ---- a/hw/arm/virt.c -+++ b/hw/arm/virt.c -@@ -1710,6 +1730,7 @@ static inline bool *virt_get_high_memmap_enabled(VirtMachineState - *vms, -&vms->highmem_redists, -&vms->highmem_ecam, -&vms->highmem_mmio, -+ &vms->cxl_devices_state.is_enabled, -}; -Now qemu works good. -Could you tell me when the patch( -arm/virt: - CXL support via pxb_cxl -) will be merged into upstream? - diff --git a/classification_output/01/instruction/8566429 b/classification_output/01/instruction/8566429 deleted file mode 100644 index dfac92bf4..000000000 --- a/classification_output/01/instruction/8566429 +++ /dev/null @@ -1,49 +0,0 @@ -instruction: 0.905 -other: 0.898 -semantic: 0.825 -mistranslation: 0.462 - -[Qemu-devel] [BUG]pcibus_reset assertion failure on guest reboot - -Qemu-2.6.2 - -Start a vm with vhost-net , do reboot and hot-unplug viritio-net nic in short -time, we touch -pcibus_reset assertion failure. 
- -Here is qemu log: -22:29:46.359386+08:00 acpi_pm1_cnt_write -> guest do soft power off -22:29:46.785310+08:00 qemu_devices_reset -22:29:46.788093+08:00 virtio_pci_device_unplugged -> virtio net unpluged -22:29:46.803427+08:00 pcibus_reset: Assertion `bus->irq_count[i] == 0' failed. - -Here is stack info: -(gdb) bt -#0 0x00007f9a336795d7 in raise () from /usr/lib64/libc.so.6 -#1 0x00007f9a3367acc8 in abort () from /usr/lib64/libc.so.6 -#2 0x00007f9a33672546 in __assert_fail_base () from /usr/lib64/libc.so.6 -#3 0x00007f9a336725f2 in __assert_fail () from /usr/lib64/libc.so.6 -#4 0x0000000000641884 in pcibus_reset (qbus=0x29eee60) at hw/pci/pci.c:283 -#5 0x00000000005bfc30 in qbus_reset_one (bus=0x29eee60, opaque=) at hw/core/qdev.c:319 -#6 0x00000000005c1b19 in qdev_walk_children (dev=0x29ed2b0, pre_devfn=0x0, -pre_busfn=0x0, post_devfn=0x5c2440 ... -#7 0x00000000005c1c59 in qbus_walk_children (bus=0x2736f80, pre_devfn=0x0, -pre_busfn=0x0, post_devfn=0x5c2440 ... -#8 0x00000000005513f5 in qemu_devices_reset () at vl.c:1998 -#9 0x00000000004cab9d in pc_machine_reset () at -/home/abuild/rpmbuild/BUILD/qemu-kvm-2.6.0/hw/i386/pc.c:1976 -#10 0x000000000055148b in qemu_system_reset (address@hidden) at vl.c:2011 -#11 0x000000000055164f in main_loop_should_exit () at vl.c:2169 -#12 0x0000000000551719 in main_loop () at vl.c:2212 -#13 0x000000000041c9a8 in main (argc=, argv=, -envp=) at vl.c:5130 -(gdb) f 4 -... -(gdb) p bus->irq_count[0] -$6 = 1 - -Seems pci_update_irq_disabled doesn't work well - -can anyone help? - diff --git a/classification_output/01/instruction/9818783 b/classification_output/01/instruction/9818783 deleted file mode 100644 index a78585284..000000000 --- a/classification_output/01/instruction/9818783 +++ /dev/null @@ -1,308 +0,0 @@ -instruction: 0.985 -other: 0.985 -semantic: 0.984 -mistranslation: 0.983 - -[BUG][powerpc] KVM Guest Boot Failure – Hangs at "Booting Linux via __start()” - -Bug Description: -Encountering a boot failure when launching a KVM guest with -qemu-system-ppc64. The guest hangs at boot, and the QEMU monitor -crashes. 
-Reproduction Steps: -# qemu-system-ppc64 --version -QEMU emulator version 9.2.50 (v9.2.0-2799-g0462a32b4f) -Copyright (c) 2003-2025 Fabrice Bellard and the QEMU Project developers -# /usr/bin/qemu-system-ppc64 -name avocado-vt-vm1 -machine -pseries,accel=kvm \ --m 32768 -smp 32,sockets=1,cores=32,threads=1 -nographic \ - -device virtio-scsi-pci,id=scsi \ --drive -file=/home/kvmci/tests/data/avocado-vt/images/rhel8.0devel-ppc64le.qcow2,if=none,id=drive0,format=qcow2 -\ --device scsi-hd,drive=drive0,bus=scsi.0 \ - -netdev bridge,id=net0,br=virbr0 \ - -device virtio-net-pci,netdev=net0 \ - -serial pty \ - -device virtio-balloon-pci \ - -cpu host -QEMU 9.2.50 monitor - type 'help' for more information -char device redirected to /dev/pts/2 (label serial0) -(qemu) -(qemu) qemu-system-ppc64: warning: kernel_irqchip allowed but -unavailable: IRQ_XIVE capability must be present for KVM -Falling back to kernel-irqchip=off -** Qemu Hang - -(In another ssh session) -# screen /dev/pts/2 -Preparing to boot Linux version 6.10.4-200.fc40.ppc64le -(mockbuild@c23cc4e677614c34bb22d54eeea4dc1f) (gcc (GCC) 14.2.1 20240801 -(Red Hat 14.2.1-1), GNU ld version 2.41-37.fc40) #1 SMP Sun Aug 11 -15:20:17 UTC 2024 -Detected machine type: 0000000000000101 -command line: -BOOT_IMAGE=(ieee1275/disk,msdos2)/vmlinuz-6.10.4-200.fc40.ppc64le -root=/dev/mapper/fedora-root ro rd.lvm.lv=fedora/root crashkernel=1024M -Max number of cores passed to firmware: 2048 (NR_CPUS = 2048) -Calling ibm,client-architecture-support... done -memory layout at init: - memory_limit : 0000000000000000 (16 MB aligned) - alloc_bottom : 0000000008200000 - alloc_top : 0000000030000000 - alloc_top_hi : 0000000800000000 - rmo_top : 0000000030000000 - ram_top : 0000000800000000 -instantiating rtas at 0x000000002fff0000... done -prom_hold_cpus: skipped -copying OF device tree... -Building dt strings... -Building dt structure... -Device tree strings 0x0000000008210000 -> 0x0000000008210bd0 -Device tree struct 0x0000000008220000 -> 0x0000000008230000 -Quiescing Open Firmware ... -Booting Linux via __start() @ 0x0000000000440000 ... -** Guest Console Hang - - -Git Bisect: -Performing git bisect points to the following patch: -# git bisect bad -e8291ec16da80566c121c68d9112be458954d90b is the first bad commit -commit e8291ec16da80566c121c68d9112be458954d90b (HEAD) -Author: Nicholas Piggin -Date: Thu Dec 19 13:40:31 2024 +1000 - - target/ppc: fix timebase register reset state -(H)DEC and PURR get reset before icount does, which causes them to -be -skewed and not match the init state. This can cause replay to not -match the recorded trace exactly. For DEC and HDEC this is usually -not -noticable since they tend to get programmed before affecting the - target machine. PURR has been observed to cause replay bugs when - running Linux. - - Fix this by resetting using a time of 0. - - Message-ID: <20241219034035.1826173-2-npiggin@gmail.com> - Signed-off-by: Nicholas Piggin - - hw/ppc/ppc.c | 11 ++++++++--- - 1 file changed, 8 insertions(+), 3 deletions(-) - - -Reverting the patch helps boot the guest. -Thanks, -Misbah Anjum N - -Thanks for the report. - -Tricky problem. A secondary CPU is hanging before it is started by the -primary via rtas call. - -That secondary keeps calling kvm_cpu_exec(), which keeps exiting out -early with EXCP_HLT because kvm_arch_process_async_events() returns -true because that cpu has ->halted=1. That just goes around he run -loop because there is an interrupt pending (DEC). - -So it never runs. 
It also never releases the BQL, and another CPU, -the primary which is actually supposed to be running, is stuck in -spapr_set_all_lpcrs() in run_on_cpu() waiting for the BQL. - -This patch just exposes the bug I think, by causing the interrupt. -although I'm not quite sure why it's okay previously (-ve decrementer -values should be causing a timer exception too). The timer exception -should not be taken as an interrupt by those secondary CPUs, and it -doesn't because it is masked, until set_all_lpcrs sets an LPCR value -that enables powersave wakeup on decrementer interrupt. - -The start_powered_off sate just sets ->halted, which makes it look -like a powersaving state. Logically I think it's not the same thing -as far as spapr goes. I don't know why start_powered_off only sets -->halted, and not ->stop/stopped as well. - -Not sure how best to solve it cleanly. I'll send a revert if I can't -get something working soon. - -Thanks, -Nick - -On Tue Mar 18, 2025 at 7:09 AM AEST, misanjum wrote: -> -Bug Description: -> -Encountering a boot failure when launching a KVM guest with -> -qemu-system-ppc64. The guest hangs at boot, and the QEMU monitor -> -crashes. -> -> -> -Reproduction Steps: -> -# qemu-system-ppc64 --version -> -QEMU emulator version 9.2.50 (v9.2.0-2799-g0462a32b4f) -> -Copyright (c) 2003-2025 Fabrice Bellard and the QEMU Project developers -> -> -# /usr/bin/qemu-system-ppc64 -name avocado-vt-vm1 -machine -> -pseries,accel=kvm \ -> --m 32768 -smp 32,sockets=1,cores=32,threads=1 -nographic \ -> --device virtio-scsi-pci,id=scsi \ -> --drive -> -file=/home/kvmci/tests/data/avocado-vt/images/rhel8.0devel-ppc64le.qcow2,if=none,id=drive0,format=qcow2 -> -> -\ -> --device scsi-hd,drive=drive0,bus=scsi.0 \ -> --netdev bridge,id=net0,br=virbr0 \ -> --device virtio-net-pci,netdev=net0 \ -> --serial pty \ -> --device virtio-balloon-pci \ -> --cpu host -> -QEMU 9.2.50 monitor - type 'help' for more information -> -char device redirected to /dev/pts/2 (label serial0) -> -(qemu) -> -(qemu) qemu-system-ppc64: warning: kernel_irqchip allowed but -> -unavailable: IRQ_XIVE capability must be present for KVM -> -Falling back to kernel-irqchip=off -> -** Qemu Hang -> -> -(In another ssh session) -> -# screen /dev/pts/2 -> -Preparing to boot Linux version 6.10.4-200.fc40.ppc64le -> -(mockbuild@c23cc4e677614c34bb22d54eeea4dc1f) (gcc (GCC) 14.2.1 20240801 -> -(Red Hat 14.2.1-1), GNU ld version 2.41-37.fc40) #1 SMP Sun Aug 11 -> -15:20:17 UTC 2024 -> -Detected machine type: 0000000000000101 -> -command line: -> -BOOT_IMAGE=(ieee1275/disk,msdos2)/vmlinuz-6.10.4-200.fc40.ppc64le -> -root=/dev/mapper/fedora-root ro rd.lvm.lv=fedora/root crashkernel=1024M -> -Max number of cores passed to firmware: 2048 (NR_CPUS = 2048) -> -Calling ibm,client-architecture-support... done -> -memory layout at init: -> -memory_limit : 0000000000000000 (16 MB aligned) -> -alloc_bottom : 0000000008200000 -> -alloc_top : 0000000030000000 -> -alloc_top_hi : 0000000800000000 -> -rmo_top : 0000000030000000 -> -ram_top : 0000000800000000 -> -instantiating rtas at 0x000000002fff0000... done -> -prom_hold_cpus: skipped -> -copying OF device tree... -> -Building dt strings... -> -Building dt structure... -> -Device tree strings 0x0000000008210000 -> 0x0000000008210bd0 -> -Device tree struct 0x0000000008220000 -> 0x0000000008230000 -> -Quiescing Open Firmware ... -> -Booting Linux via __start() @ 0x0000000000440000 ... 
-> -** Guest Console Hang -> -> -> -Git Bisect: -> -Performing git bisect points to the following patch: -> -# git bisect bad -> -e8291ec16da80566c121c68d9112be458954d90b is the first bad commit -> -commit e8291ec16da80566c121c68d9112be458954d90b (HEAD) -> -Author: Nicholas Piggin -> -Date: Thu Dec 19 13:40:31 2024 +1000 -> -> -target/ppc: fix timebase register reset state -> -> -(H)DEC and PURR get reset before icount does, which causes them to -> -be -> -skewed and not match the init state. This can cause replay to not -> -match the recorded trace exactly. For DEC and HDEC this is usually -> -not -> -noticable since they tend to get programmed before affecting the -> -target machine. PURR has been observed to cause replay bugs when -> -running Linux. -> -> -Fix this by resetting using a time of 0. -> -> -Message-ID: <20241219034035.1826173-2-npiggin@gmail.com> -> -Signed-off-by: Nicholas Piggin -> -> -hw/ppc/ppc.c | 11 ++++++++--- -> -1 file changed, 8 insertions(+), 3 deletions(-) -> -> -> -Reverting the patch helps boot the guest. -> -Thanks, -> -Misbah Anjum N - -- cgit 1.4.1
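Editorial aside (not part of the archived patch above): to make Nicholas Piggin's analysis of the boot hang concrete, here is a minimal, self-contained C sketch of the control flow he describes, in which a secondary vCPU that is only marked halted still appears to have work once a timer interrupt is pending, so the outer loop spins without ever running the guest, whereas a vCPU that is also marked stopped would simply be skipped. All names in the sketch (mock_cpu, process_async_events, run_loop_iteration) are invented for illustration and are not QEMU's actual API.

```c
/*
 * Hypothetical mock, not QEMU code: illustrates why a CPU that is only
 * "halted" (as start_powered_off does, per the discussion above) can
 * spin in the run loop once an interrupt becomes pending, while a CPU
 * that is also "stopped" is never entered at all.
 */
#include <stdbool.h>
#include <stdio.h>

struct mock_cpu {
    bool halted;            /* set by start_powered_off-like logic      */
    bool stopped;           /* set when the CPU must not run at all     */
    bool interrupt_pending; /* e.g. a decrementer/timer interrupt       */
};

/* Mirrors "return true and exit the run loop early" for a halted CPU. */
static bool process_async_events(const struct mock_cpu *cpu)
{
    return cpu->halted;
}

static void run_loop_iteration(struct mock_cpu *cpu)
{
    if (cpu->stopped) {
        printf("cpu stopped: loop does not attempt to run it\n");
        return;
    }
    if (process_async_events(cpu)) {
        if (cpu->interrupt_pending) {
            /* Exits with a halt indication but immediately re-enters. */
            printf("halted + pending interrupt: spin, guest never runs\n");
        } else {
            printf("halted, no interrupt: loop can genuinely idle\n");
        }
        return;
    }
    printf("guest code would execute here\n");
}

int main(void)
{
    struct mock_cpu secondary = { .halted = true, .stopped = false,
                                  .interrupt_pending = true };
    run_loop_iteration(&secondary);   /* spins, as in the report        */

    secondary.stopped = true;
    run_loop_iteration(&secondary);   /* skipped outright               */
    return 0;
}
```

This only restates the reported control flow in runnable form; it also suggests why Nicholas questions start_powered_off setting only ->halted rather than ->stop/stopped, since a stopped-like state would keep the secondary CPU out of the loop entirely. The actual upstream resolution (fix or revert) is left open in the thread above.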