about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
author ptitSeb <sebastien.chev@gmail.com> 2025-04-23 12:47:56 +0200
committer ptitSeb <sebastien.chev@gmail.com> 2025-04-23 12:47:56 +0200
commit 815836d28551983e45f3ef167cc8780f90df24a1 (patch)
tree 8675cb22e9c1146a1931bb41ffabf2f481eface8 /src
parent 468a3c2165a737029ed01862361d6a62c511501d (diff)
download box64-815836d28551983e45f3ef167cc8780f90df24a1.tar.gz
box64-815836d28551983e45f3ef167cc8780f90df24a1.zip
[ARM64_DYNAREC] Optimized REP STOSB
Diffstat (limited to 'src')
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_00.c 34
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_helper.h 4
2 files changed, 36 insertions, 2 deletions
diff --git a/src/dynarec/arm64/dynarec_arm64_00.c b/src/dynarec/arm64/dynarec_arm64_00.c
index 059d9fcc..8b06925a 100644
--- a/src/dynarec/arm64/dynarec_arm64_00.c
+++ b/src/dynarec/arm64/dynarec_arm64_00.c
@@ -1893,16 +1893,46 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 INST_NAME("REP STOSB");
                 CBZx_NEXT(xRCX);
                 TBNZ_MARK2(xFlags, F_DF);
+                IF_UNALIGNED(ip) {
+                    MESSAGE(LOG_DEBUG, "\tUnaligned path");
+                    // special optim for large RCX value on forward case only
+                    // but because this is the unaligned path, fall back to byte-by-byte stores when RDI is not 4-byte aligned, and otherwise store only 4 bytes at a time
+                    ANDw_mask(x1, xRDI, 0, 1);    //mask = 3
+                    CBNZw_MARK(x1);
+                    UXTBw(x3, xRAX);    // prepare x3
+                    ORRw_REG_LSL(x3, x3, x3, 8);
+                    ORRw_REG_LSL(x3, x3, x3, 16);   // 4bytes ready
+                    MARK3;
+                    ANDx_mask(x1, xRCX, 1, 0b111110, 0b111101); // mask=0xfffffffffffffffc, so ~3LL
+                    CBZx_MARK(x1);  // xRCX<4
+                    STRw_S9_postindex(x3, xRDI, 4);
+                    SUBx_U12(xRCX, xRCX, 4);
+                    CBNZx_MARK3(xRCX);
+                    CBZx_MARKLOCK(xRCX);
+                } else {
+                    // special optim for large RCX value on forward case only
+                    UXTBw(x3, xRAX);    // prepare x3
+                    ORRw_REG_LSL(x3, x3, x3, 8);
+                    ORRw_REG_LSL(x3, x3, x3, 16);
+                    ORRx_REG_LSL(x3, x3, x3, 32);   // 8 bytes...
+                    MARK3;
+                    ANDx_mask(x1, xRCX, 1, 0b111101, 0b111100); // mask=0xfffffffffffffff8, so ~7LL
+                    CBZx_MARK(x1);  // xRCX<8
+                    STRx_S9_postindex(x3, xRDI, 8);
+                    SUBx_U12(xRCX, xRCX, 8);
+                    CBNZx_MARK3(xRCX);
+                    CBZx_MARKLOCK(xRCX);
+                }
                 MARK;   // Part with DF==0
                 STRB_S9_postindex(xRAX, xRDI, 1);
                 SUBx_U12(xRCX, xRCX, 1);
                 CBNZx_MARK(xRCX);
-                B_MARK3_nocond;
+                B_MARKLOCK_nocond;
                 MARK2;  // Part with DF==1
                 STRB_S9_postindex(xRAX, xRDI, -1);
                 SUBx_U12(xRCX, xRCX, 1);
                 CBNZx_MARK2(xRCX);
-                MARK3;
+                MARKLOCK;
                 // done
             } else {
                 INST_NAME("STOSB");
diff --git a/src/dynarec/arm64/dynarec_arm64_helper.h b/src/dynarec/arm64/dynarec_arm64_helper.h
index ded76a69..af806d38 100644
--- a/src/dynarec/arm64/dynarec_arm64_helper.h
+++ b/src/dynarec/arm64/dynarec_arm64_helper.h
@@ -760,6 +760,10 @@
     j64 = GETMARK-(dyn->native_size);   \
     CBZw(reg, j64)
 // Branch to MARK if reg is 0 (use j64)
+#define CBZx_MARK(reg)                  \
+    j64 = GETMARK-(dyn->native_size);   \
+    CBZx(reg, j64)
+// Branch to MARK if reg is 0 (use j64)
 #define CBZxw_MARK(reg)                 \
     j64 = GETMARK-(dyn->native_size);   \
     CBZxw(reg, j64)