1 files changed, 27 insertions, 0 deletions
diff --git a/results/scraper/fex/3791 b/results/scraper/fex/3791
new file mode 100644
index 000000000..7eed39dba
--- /dev/null
+++ b/results/scraper/fex/3791
@@ -0,0 +1,27 @@
+AVX128: Optimize 256-bit contiguous masked loadstore with immediate offset
+Currently the SVE contiguous masked load splits the instruction in to two with an add in the middle for address generation.
+We can remove that add inbetween the two halves because the `ld1w` and `ld1d` instructions support a short immediate offset multiplied by VL.
+
+```json
+    "vmaskmovps ymm0, ymm1, [rax]": {
+      "ExpectedInstructionCount": 9,
+      "Comment": [
+        "Map 2 0b01 0x2c 256-bit"
+      ],
+      "ExpectedArm64ASM": [
+        "ldr q2, [x28, #32]",
+        "mrs x20, nzcv",
+        "cmplt p0.s, p6/z, z17.s, #0",
+        "ld1w {z16.s}, p0/z, [x4]",
+        "add x21, x4, #0x10 (16)", <------------- This add
+        "cmplt p0.s, p6/z, z2.s, #0",
+        "ld1w {z2.s}, p0/z, [x21]", <-------------- Can be rolled in to this instruction
+        "msr nzcv, x20",
+        "str q2, [x28, #16]"
+      ]
+    },
+```
+
+Only saves a single instruction for both 256-bit `vmaskmovps` and `vmaskmovpd` but is fairly trivial do do.
+
+Same thing can be improved on the store side as well.
\ No newline at end of file