about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
author Yang Liu <liuyang22@iscas.ac.cn> 2025-08-19 18:26:42 +0800
committer GitHub <noreply@github.com> 2025-08-19 12:26:42 +0200
commit 7435006b7c65d0d2dc58446e3a84fc8d96f4cf0f (patch)
tree b1c42db334bc8a32dd894acb0d3f7113c90ba436 /src
parent 1c2e763ffbff668851ab0845dee3d4f2072a0e36 (diff)
download box64-7435006b7c65d0d2dc58446e3a84fc8d96f4cf0f.tar.gz
box64-7435006b7c65d0d2dc58446e3a84fc8d96f4cf0f.zip
[RV64_DYNAREC] Added more avx scalar 66 0F38 opcodes (#2950)
* [RV64_DYNAREC] Added more avx scalar 66 0F38 opcodes

* more
Diffstat (limited to 'src')
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_660f38.c 55
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_avx_66_0f38.c 65
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_helper.h 28
3 files changed, 83 insertions, 65 deletions
diff --git a/src/dynarec/rv64/dynarec_rv64_660f38.c b/src/dynarec/rv64/dynarec_rv64_660f38.c
index b3088fd2..4e33ca33 100644
--- a/src/dynarec/rv64/dynarec_rv64_660f38.c
+++ b/src/dynarec/rv64/dynarec_rv64_660f38.c
@@ -141,23 +141,13 @@ uintptr_t dynarec64_660F38(dynarec_rv64_t* dyn, uintptr_t addr, uint8_t opcode,
                     INST_NAME("PHADDSW Gx, Ex");
                     nextop = F8;
                     GETGX();
-                    MOV64x(x5, 32767);
-                    MOV64x(x6, -32768);
                     for (int i = 0; i < 4; ++i) {
                         // tmp32s = GX->sw[i*2+0]+GX->sw[i*2+1];
                         // GX->sw[i] = sat(tmp32s);
                         LH(x3, gback, gdoffset + 2 * (i * 2 + 0));
                         LH(x4, gback, gdoffset + 2 * (i * 2 + 1));
                         ADDW(x3, x3, x4);
-                        if (cpuext.zbb) {
-                            MIN(x3, x3, x5);
-                            MAX(x3, x3, x6);
-                        } else {
-                            BLT(x3, x5, 4 + 4);
-                            MV(x3, x5);
-                            BLT(x6, x3, 4 + 4);
-                            MV(x3, x6);
-                        }
+                        SAT16(x3, x6);
                         SH(x3, gback, gdoffset + i * 2);
                     }
                     if (MODREG && gd == (nextop & 7) + (rex.b << 3)) {
@@ -172,15 +162,7 @@ uintptr_t dynarec64_660F38(dynarec_rv64_t* dyn, uintptr_t addr, uint8_t opcode,
                             LH(x3, wback, fixedaddress + 2 * (i * 2 + 0));
                             LH(x4, wback, fixedaddress + 2 * (i * 2 + 1));
                             ADDW(x3, x3, x4);
-                            if (cpuext.zbb) {
-                                MIN(x3, x3, x5);
-                                MAX(x3, x3, x6);
-                            } else {
-                                BLT(x3, x5, 4 + 4);
-                                MV(x3, x5);
-                                BLT(x6, x3, 4 + 4);
-                                MV(x3, x6);
-                            }
+                            SAT16(x3, x6);
                             SH(x3, gback, gdoffset + 2 * (4 + i));
                         }
                     }
@@ -190,8 +172,6 @@ uintptr_t dynarec64_660F38(dynarec_rv64_t* dyn, uintptr_t addr, uint8_t opcode,
                     nextop = F8;
                     GETGX();
                     GETEX(x2, 0, 15);
-                    MOV64x(x5, 32767);
-                    MOV64x(x6, -32768);
                     for (int i = 0; i < 8; ++i) {
                         LBU(x3, gback, gdoffset + i * 2);
                         LB(x4, wback, fixedaddress + i * 2);
@@ -200,15 +180,7 @@ uintptr_t dynarec64_660F38(dynarec_rv64_t* dyn, uintptr_t addr, uint8_t opcode,
                         LB(x4, wback, fixedaddress + i * 2 + 1);
                         MUL(x3, x3, x4);
                         ADD(x3, x3, x7);
-                        if (cpuext.zbb) {
-                            MIN(x3, x3, x5);
-                            MAX(x3, x3, x6);
-                        } else {
-                            BLT(x3, x5, 4 + 4);
-                            MV(x3, x5);
-                            BLT(x6, x3, 4 + 4);
-                            MV(x3, x6);
-                        }
+                        SAT16(x3, x6);
                         SH(x3, gback, gdoffset + i * 2);
                     }
                     break;
@@ -497,18 +469,9 @@ uintptr_t dynarec64_660F38(dynarec_rv64_t* dyn, uintptr_t addr, uint8_t opcode,
                     nextop = F8;
                     GETGX();
                     GETEX(x2, 0, 12);
-                    MOV64x(x5, 65535);
                     for (int i = 0; i < 4; ++i) {
                         LW(x3, gback, gdoffset + i * 4);
-                        if (cpuext.zbb) {
-                            MIN(x3, x3, x5);
-                            MAX(x3, x3, xZR);
-                        } else {
-                            BGE(x3, xZR, 4 + 4);
-                            MV(x3, xZR);
-                            BLT(x3, x5, 4 + 4);
-                            MV(x3, x5);
-                        }
+                        SATU16(x3, x5);
                         SH(x3, gback, gdoffset + i * 2);
                     }
                     if (MODREG && gd == ed) {
@@ -517,15 +480,7 @@ uintptr_t dynarec64_660F38(dynarec_rv64_t* dyn, uintptr_t addr, uint8_t opcode,
                     } else
                         for (int i = 0; i < 4; ++i) {
                             LW(x3, wback, fixedaddress + i * 4);
-                            if (cpuext.zbb) {
-                                MIN(x3, x3, x5);
-                                MAX(x3, x3, xZR);
-                            } else {
-                                BGE(x3, xZR, 4 + 4);
-                                MV(x3, xZR);
-                                BLT(x3, x5, 4 + 4);
-                                MV(x3, x5);
-                            }
+                            SATU16(x3, x5);
                             SH(x3, gback, gdoffset + 8 + i * 2);
                         }
                     break;
diff --git a/src/dynarec/rv64/dynarec_rv64_avx_66_0f38.c b/src/dynarec/rv64/dynarec_rv64_avx_66_0f38.c
index bb41fbdf..4e70bc8c 100644
--- a/src/dynarec/rv64/dynarec_rv64_avx_66_0f38.c
+++ b/src/dynarec/rv64/dynarec_rv64_avx_66_0f38.c
@@ -104,9 +104,13 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             }
             break;
         case 0x01:
-            INST_NAME("VPHADDW Gx, Vx, Ex");
+        case 0x03:
+            if (opcode == 0x01)
+                INST_NAME("VPHADDW Gx, Vx, Ex");
+            else
+                INST_NAME("VPHADDSW Gx, Vx, Ex");
             nextop = F8;
-            GETEX(x1, 0, vex.l ? 46 : 14);
+            GETEX(x1, 0, vex.l ? 30 : 14);
             GETGX();
             GETVX();
             GETGY();
@@ -125,6 +129,7 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 LH(x3, vback, vxoffset + 2 * (i * 2 + 0));
                 LH(x4, vback, vxoffset + 2 * (i * 2 + 1));
                 ADDW(x3, x3, x4);
+                if (opcode == 0x03) SAT16(x3, x6);
                 SH(x3, gback, gdoffset + 2 * i);
             }
             if (MODREG && ed == vex.v) {
@@ -137,6 +142,7 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     LH(x3, wback, fixedaddress + 2 * (i * 2 + 0));
                     LH(x4, wback, fixedaddress + 2 * (i * 2 + 1));
                     ADDW(x3, x3, x4);
+                    if (opcode == 0x03) SAT16(x3, x6);
                     SH(x3, gback, gdoffset + 2 * (4 + i));
                 }
             }
@@ -156,6 +162,7 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     LH(x3, vback, vyoffset + 2 * (i * 2 + 0));
                     LH(x4, vback, vyoffset + 2 * (i * 2 + 1));
                     ADDW(x3, x3, x4);
+                    if (opcode == 0x03) SAT16(x3, x6);
                     SH(x3, gback, gyoffset + 2 * i);
                 }
                 if (MODREG && ed == vex.v) {
@@ -168,6 +175,7 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                         LH(x3, wback, fixedaddress + 2 * (i * 2 + 0));
                         LH(x4, wback, fixedaddress + 2 * (i * 2 + 1));
                         ADDW(x3, x3, x4);
+                        if (opcode == 0x03) SAT16(x3, x6);
                         SH(x3, gback, gyoffset + 2 * (4 + i));
                     }
                 }
@@ -179,7 +187,7 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
         case 0x02:
             INST_NAME("VPHADDD Gx, Vx, Ex");
             nextop = F8;
-            GETEX(x1, 0, vex.l ? 44 : 12);
+            GETEX(x1, 0, vex.l ? 28 : 12);
             GETGX();
             GETVX();
             GETGY();
@@ -249,10 +257,51 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 SD(xZR, gback, gyoffset + 8);
             }
             break;
+        case 0x04:
+            INST_NAME("VPMADDUBSW Gx, Vx, Ex");
+            nextop = F8;
+            GETEX(x1, 0, vex.l ? 31 : 15);
+            GETGX();
+            GETVX();
+            GETGY();
+            GETVY();
+            for (int i = 0; i < 8; ++i) {
+                LBU(x3, vback, vxoffset + i * 2);
+                LB(x4, wback, fixedaddress + i * 2);
+                MUL(x7, x3, x4);
+                LBU(x3, vback, vxoffset + i * 2 + 1);
+                LB(x4, wback, fixedaddress + i * 2 + 1);
+                MUL(x3, x3, x4);
+                ADD(x3, x3, x7);
+                SAT16(x3, x6);
+                SH(x3, gback, gdoffset + i * 2);
+            }
+            if (vex.l) {
+                GETEY();
+                for (int i = 0; i < 8; ++i) {
+                    LBU(x3, vback, vyoffset + i * 2);
+                    LB(x4, wback, fixedaddress + i * 2);
+                    MUL(x7, x3, x4);
+                    LBU(x3, vback, vyoffset + i * 2 + 1);
+                    LB(x4, wback, fixedaddress + i * 2 + 1);
+                    MUL(x3, x3, x4);
+                    ADD(x3, x3, x7);
+                    SAT16(x3, x6);
+                    SH(x3, gback, gyoffset + i * 2);
+                }
+            } else {
+                SD(xZR, gback, gyoffset + 0);
+                SD(xZR, gback, gyoffset + 8);
+            }
+            break;
         case 0x05:
-            INST_NAME("VPHSUBW Gx, Vx, Ex");
+        case 0x07:
+            if (opcode == 0x05)
+                INST_NAME("VPHSUBW Gx, Vx, Ex");
+            else
+                INST_NAME("VPHSUBSW Gx, Vx, Ex");
             nextop = F8;
-            GETEX(x1, 0, vex.l ? 46 : 14);
+            GETEX(x1, 0, vex.l ? 30 : 14);
             GETGX();
             GETVX();
             GETGY();
@@ -271,6 +320,7 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 LH(x3, vback, vxoffset + 2 * (i * 2 + 0));
                 LH(x4, vback, vxoffset + 2 * (i * 2 + 1));
                 SUBW(x3, x3, x4);
+                if (opcode == 0x07) SAT16(x3, x6);
                 SH(x3, gback, gdoffset + 2 * i);
             }
             if (MODREG && ed == vex.v) {
@@ -283,6 +333,7 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     LH(x3, wback, fixedaddress + 2 * (i * 2 + 0));
                     LH(x4, wback, fixedaddress + 2 * (i * 2 + 1));
                     SUBW(x3, x3, x4);
+                    if (opcode == 0x07) SAT16(x3, x6);
                     SH(x3, gback, gdoffset + 2 * (4 + i));
                 }
             }
@@ -302,6 +353,7 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     LH(x3, vback, vyoffset + 2 * (i * 2 + 0));
                     LH(x4, vback, vyoffset + 2 * (i * 2 + 1));
                     SUBW(x3, x3, x4);
+                    if (opcode == 0x07) SAT16(x3, x6);
                     SH(x3, gback, gyoffset + 2 * i);
                 }
                 if (MODREG && ed == vex.v) {
@@ -314,6 +366,7 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                         LH(x3, wback, fixedaddress + 2 * (i * 2 + 0));
                         LH(x4, wback, fixedaddress + 2 * (i * 2 + 1));
                         SUBW(x3, x3, x4);
+                        if (opcode == 0x07) SAT16(x3, x6);
                         SH(x3, gback, gyoffset + 2 * (4 + i));
                     }
                 }
@@ -325,7 +378,7 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
         case 0x06:
             INST_NAME("VPHSUBD Gx, Vx, Ex");
             nextop = F8;
-            GETEX(x1, 0, vex.l ? 44 : 12);
+            GETEX(x1, 0, vex.l ? 28 : 12);
             GETGX();
             GETVX();
             GETGY();
diff --git a/src/dynarec/rv64/dynarec_rv64_helper.h b/src/dynarec/rv64/dynarec_rv64_helper.h
index 973f21fa..8f01750d 100644
--- a/src/dynarec/rv64/dynarec_rv64_helper.h
+++ b/src/dynarec/rv64/dynarec_rv64_helper.h
@@ -1950,15 +1950,25 @@ uintptr_t dynarec64_AVX_F3_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
 
 #define PURGE_YMM()
 
-// reg = (reg < -32768) ? -32768 : ((reg > 32767) ? 32767 : reg)
-#define SAT16(reg, s)             \
-    LUI(s, 0xFFFF8); /* -32768 */ \
-    BGE(reg, s, 4 + 2 * 4);       \
-    MV(reg, s);                   \
-    J(4 + 4 * 3);                 \
-    LUI(s, 8); /* 32768 */        \
-    BLT(reg, s, 4 + 4);           \
-    ADDIW(reg, s, -1);
+// TODO: zbb?
+#define SAT16(reg, s)                 \
+    do {                              \
+        LUI(s, 0xFFFF8); /* -32768 */ \
+        BGE(reg, s, 4 + 4);           \
+        MV(reg, s);                   \
+        LUI(s, 0x8); /* 32768 */      \
+        BLT(reg, s, 4 + 4);           \
+        ADDIW(reg, s, -1);            \
+    } while (0)
+
+#define SATU16(reg, s)            \
+    do {                          \
+        LUI(s, 0x10); /* 65536 */ \
+        BGE(reg, xZR, 4 + 4);     \
+        MV(reg, xZR);             \
+        BLT(reg, s, 4 + 4);       \
+        ADDIW(reg, s, -1);        \
+    } while (0)
 
 #define FAST_8BIT_OPERATION(dst, src, s1, OP)                                        \
     if (MODREG && (cpuext.zbb || cpuext.xtheadbb) && !dyn->insts[ninst].x64.gen_flags) { \