about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
author: xctan <xctan@cirno.icu>, 2024-05-31 20:09:10 +0800
committer: GitHub <noreply@github.com>, 2024-05-31 14:09:10 +0200
commit: 6749c2d1a6d1762b3a2ad87c0bc70a65359d1c08 (patch)
tree: ebfbb4181819a0fe32e8540b23b17f0015d5958d /src
parent: 7eb24ff51407562fe29fc1259cd3a8ce47f143e8 (diff)
download: box64-6749c2d1a6d1762b3a2ad87c0bc70a65359d1c08.tar.gz
download: box64-6749c2d1a6d1762b3a2ad87c0bc70a65359d1c08.zip
[RV64_DYNAREC] Added more MMX opcodes and some optimizations too (#1542)
* [RV64_DYNAREC] Added 0F 38 06 PHSUBD opcode

* [RV64_DYNAREC] Added 0F 38 07 PHSUBSW opcode

* [RV64_DYNAREC] Added 0F 38 05 PHSUBW opcode

* [RV64_DYNAREC] Added 0F C4 PINSRW opcode

* [RV64_DYNAREC] Added 0F 38 04 PMADDUBSW opcode

* [RV64_DYNAREC] Added 0F EE PMAXSW opcode

* [RV64_DYNAREC] Optimized SSE packed min/max

* [RV64_DYNAREC] Added 0F DE PMAXUB opcode

* [RV64_DYNAREC] Added 0F EA PMINSW opcode

* [RV64_DYNAREC] Added 0F DA PMINUB opcode

* [RV64_DYNAREC] Optimized 0F D9 PSUBUSW opcode

* [RV64_DYNAREC] Added 0F D7 PMOVMSKB opcode

* [RV64_DYNAREC] Optimized (66) 0F D7 PMOVMSKB opcode

* [RV64_DYNAREC] Switched to the simpler implementation for PMOVMSKB
Diffstat (limited to 'src')
-rw-r--r-- src/dynarec/arm64/dynarec_arm64_0f.c | 2
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_0f.c | 240
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_660f.c | 78
3 files changed, 305 insertions, 15 deletions
diff --git a/src/dynarec/arm64/dynarec_arm64_0f.c b/src/dynarec/arm64/dynarec_arm64_0f.c
index 8773b054..596c9fd1 100644
--- a/src/dynarec/arm64/dynarec_arm64_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_0f.c
@@ -2642,7 +2642,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             SQADD_16(d0, d0, d1);

             break;

         case 0xEE:

-            INST_NAME("PMAXSW Gx,Ex");

+            INST_NAME("PMAXSW Gm,Em");

             nextop = F8;

             GETGM(v0);

             GETEM(q0, 0);

diff --git a/src/dynarec/rv64/dynarec_rv64_0f.c b/src/dynarec/rv64/dynarec_rv64_0f.c
index a7f96a32..0b63c070 100644
--- a/src/dynarec/rv64/dynarec_rv64_0f.c
+++ b/src/dynarec/rv64/dynarec_rv64_0f.c
@@ -541,6 +541,130 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                         }
                     }
                     break;
+                case 0x04:
+                    INST_NAME("PMADDUBSW Gm,Em");
+                    nextop = F8;
+                    GETGM();
+                    GETEM(x2, 0);
+                    MOV64x(x5, 32767);
+                    MOV64x(x6, -32768);
+                    for (int i = 0; i < 4; ++i) {
+                        LBU(x3, gback, gdoffset + i * 2);
+                        LB(x4, wback, fixedaddress + i * 2);
+                        MUL(x9, x3, x4);
+                        LBU(x3, gback, gdoffset + i * 2 + 1);
+                        LB(x4, wback, fixedaddress + i * 2 + 1);
+                        MUL(x3, x3, x4);
+                        ADD(x3, x3, x9);
+                        if (rv64_zbb) {
+                            MIN(x3, x3, x5);
+                            MAX(x3, x3, x6);
+                        } else {
+                            BLT(x3, x5, 4 + 4);
+                            MV(x3, x5);
+                            BLT(x6, x3, 4 + 4);
+                            MV(x3, x6);
+                        }
+                        SH(x3, gback, gdoffset + i * 2);
+                    }
+                    break;
+                case 0x05:
+                    INST_NAME("PHSUBW Gm,Em");
+                    nextop = F8;
+                    GETGM();
+                    for (int i = 0; i < 2; ++i) {
+                        // horizontal subtract, no saturation:
+                        // GM->sw[i] = GM->sw[i*2+0] - GM->sw[i*2+1];
+                        LH(x3, gback, gdoffset + 2 * (i * 2 + 0));
+                        LH(x4, gback, gdoffset + 2 * (i * 2 + 1));
+                        SUBW(x3, x3, x4);
+                        SH(x3, gback, gdoffset + i * 2);
+                    }
+                    if (MODREG && gd == (nextop & 7) + (rex.b << 3)) {
+                        // GM->d[1] = GM->d[0];
+                        LW(x3, gback, gdoffset + 0);
+                        SW(x3, gback, gdoffset + 4);
+                    } else {
+                        GETEM(x2, 0);
+                        for (int i = 0; i < 2; ++i) {
+                            // horizontal subtract, no saturation:
+                            // GM->sw[2+i] = EM->sw[i*2+0] - EM->sw[i*2+1];
+                            LH(x3, wback, fixedaddress + 2 * (i * 2 + 0));
+                            LH(x4, wback, fixedaddress + 2 * (i * 2 + 1));
+                            SUBW(x3, x3, x4);
+                            SH(x3, gback, gdoffset + 2 * (2 + i));
+                        }
+                    }
+                    break;
+                case 0x06:
+                    INST_NAME("PHSUBD Gm,Em");
+                    nextop = F8;
+                    GETGM();
+                    // GM->sd[0] -= GM->sd[1];
+                    LW(x3, gback, gdoffset + 0 * 4);
+                    LW(x4, gback, gdoffset + 1 * 4);
+                    SUBW(x3, x3, x4);
+                    SW(x3, gback, gdoffset + 0 * 4);
+                    if (MODREG && gd == (nextop & 7) + (rex.b << 3)) {
+                        // GM->sd[1] = GM->sd[0];
+                        SW(x3, gback, gdoffset + 1 * 4);
+                    } else {
+                        GETEM(x2, 0);
+                        // GM->sd[1] = EM->sd[0] - EM->sd[1];
+                        LW(x3, wback, fixedaddress + 0 * 4);
+                        LW(x4, wback, fixedaddress + 1 * 4);
+                        SUBW(x3, x3, x4);
+                        SW(x3, gback, gdoffset + 1 * 4);
+                    }
+                    break;
+                case 0x07:
+                    INST_NAME("PHSUBSW Gm,Em");
+                    nextop = F8;
+                    GETGM();
+                    MOV64x(x5, 32767);
+                    MOV64x(x6, -32768);
+                    for (int i = 0; i < 2; ++i) {
+                        // tmp32s = GM->sw[i*2+0] - GM->sw[i*2+1];
+                        // GM->sw[i] = sat(tmp32s);
+                        LH(x3, gback, gdoffset + 2 * (i * 2 + 0));
+                        LH(x4, gback, gdoffset + 2 * (i * 2 + 1));
+                        SUBW(x3, x3, x4);
+                        if (rv64_zbb) {
+                            MIN(x3, x3, x5);
+                            MAX(x3, x3, x6);
+                        } else {
+                            BLT(x3, x5, 4 + 4);
+                            MV(x3, x5);
+                            BLT(x6, x3, 4 + 4);
+                            MV(x3, x6);
+                        }
+                        SH(x3, gback, gdoffset + i * 2);
+                    }
+                    if (MODREG && gd == (nextop & 7) + (rex.b << 3)) {
+                        // GM->d[1] = GM->d[0];
+                        LW(x3, gback, gdoffset + 0);
+                        SW(x3, gback, gdoffset + 4);
+                    } else {
+                        GETEM(x2, 0);
+                        for (int i = 0; i < 2; ++i) {
+                            // tmp32s = EM->sw[i*2+0] - EM->sw[i*2+1];
+                            // GM->sw[2+i] = sat(tmp32s);
+                            LH(x3, wback, fixedaddress + 2 * (i * 2 + 0));
+                            LH(x4, wback, fixedaddress + 2 * (i * 2 + 1));
+                            SUBW(x3, x3, x4);
+                            if (rv64_zbb) {
+                                MIN(x3, x3, x5);
+                                MAX(x3, x3, x6);
+                            } else {
+                                BLT(x3, x5, 4 + 4);
+                                MV(x3, x5);
+                                BLT(x6, x3, 4 + 4);
+                                MV(x3, x6);
+                            }
+                            SH(x3, gback, gdoffset + 2 * (2 + i));
+                        }
+                    }
+                    break;
                 case 0x1C:
                     INST_NAME("PABSB Gm,Em");
                     nextop = F8;
@@ -2122,6 +2246,14 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                 SDxw(gd, ed, fixedaddress);
             }
             break;
+        case 0xC4:
+            INST_NAME("PINSRW Gm,Ed,Ib");
+            nextop = F8;
+            GETED(1);
+            GETGM();
+            u8 = (F8)&3;
+            SH(ed, gback, gdoffset + u8 * 2);
+            break;
         case 0xC5:
             INST_NAME("PEXTRW Gd,Em,Ib");
             nextop = F8;
@@ -2182,12 +2314,70 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             GETEM(x2, 0);
             MMX_LOOP_WS(x3, x4, MULW(x3, x3, x4));
             break;
+        case 0xD7:
+            INST_NAME("PMOVMSKB Gd, Em");
+            nextop = F8;
+            GETGD;
+            GETEM(x2, 0);
+            LD(x1, wback, fixedaddress + 0);
+            for (int i = 0; i < 8; i++) {
+                if (rv64_zbs) {
+                    if (i == 0) {
+                        BEXTI(gd, x1, 63);
+                    } else {
+                        BEXTI(x6, x1, 63 - i * 8);
+                    }
+                } else {
+                    if (i == 0) {
+                        SRLI(gd, x1, 63);
+                    } else {
+                        SRLI(x6, x1, 63 - i * 8);
+                        ANDI(x6, x6, 1);
+                    }
+                }
+                if (i != 0) {
+                    if (rv64_zba) {
+                        SH1ADD(gd, gd, x6);
+                    } else {
+                        SLLI(gd, gd, 1);
+                        OR(gd, gd, x6);
+                    }
+                }
+            }
+            break;
         case 0xD9:
             INST_NAME("PSUBUSW Gm, Em");
             nextop = F8;
             GETGM();
             GETEM(x2, 0);
-            MMX_LOOP_W(x3, x4, SUB(x3, x3, x4); SLT(x4, xZR, x3); NEG(x4, x4); AND(x3, x3, x4));
+            MMX_LOOP_W(x3, x4,
+                SUB(x3, x3, x4);
+                if (rv64_zbb) {
+                    MAX(x3, x3, xZR);
+                } else {
+                    NOT(x4, x3);
+                    SRAI(x4, x4, 63);
+                    AND(x3, x3, x4);
+                }
+                SH(x3, gback, gdoffset + i * 2);
+            );
+            break;
+        case 0xDA:
+            INST_NAME("PMINUB Gm, Em");
+            nextop = F8;
+            GETGM();
+            GETEM(x2, 0);
+            for (int i = 0; i < 8; ++i) {
+                LBU(x3, gback, gdoffset + i);
+                LBU(x4, wback, fixedaddress + i);
+                if (rv64_zbb) {
+                    MINU(x3, x3, x4);
+                } else {
+                    BLTU(x3, x4, 8);
+                    MV(x3, x4);
+                }
+                SB(x3, gback, gdoffset + i);
+            }
             break;
         case 0xDB:
             INST_NAME("PAND Gm, Em");
@@ -2239,6 +2429,23 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                 SH(x3, gback, gdoffset + i * 2);
             }
             break;
+        case 0xDE:
+            INST_NAME("PMAXUB Gm, Em");
+            nextop = F8;
+            GETGM();
+            GETEM(x2, 0);
+            for (int i = 0; i < 8; ++i) {
+                LBU(x3, gback, gdoffset + i);
+                LBU(x4, wback, fixedaddress + i);
+                if (rv64_zbb) {
+                    MAXU(x3, x3, x4);
+                } else {
+                    BLTU(x4, x3, 8);
+                    MV(x3, x4);
+                }
+                SB(x3, gback, gdoffset + i);
+            }
+            break;
         case 0xDF:
             INST_NAME("PANDN Gm, Em");
             nextop = F8;
@@ -2347,6 +2554,23 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                 SH(x3, gback, gdoffset + 2 * i);
             }
             break;
+        case 0xEA:
+            INST_NAME("PMINSW Gx,Ex");
+            nextop = F8;
+            GETGM();
+            GETEM(x2, 0);
+            for (int i = 0; i < 4; ++i) {
+                LH(x3, gback, gdoffset + 2 * i);
+                LH(x4, wback, fixedaddress + 2 * i);
+                if (rv64_zbb) {
+                    MIN(x3, x3, x4);
+                } else {
+                    BLT(x3, x4, 8);
+                    MV(x3, x4);
+                }
+                SH(x3, gback, gdoffset + 2 * i);
+            }
+            break;
         case 0xEB:
             INST_NAME("POR Gm, Em");
             nextop = F8;
@@ -2407,6 +2631,20 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                 SH(x3, gback, gdoffset + 2 * i);
             }
             break;
+        case 0xEE:
+            INST_NAME("PMAXSW Gm,Em");
+            nextop = F8;
+            GETGM();
+            GETEM(x1, 0);
+            MMX_LOOP_WS(x3, x4,
+                if (rv64_zbb) {
+                    MAX(x3, x3, x4);
+                } else {
+                    BGE(x3, x4, 8);
+                    MV(x3, x4);
+                }
+            );
+            break;
         case 0xEF:
             INST_NAME("PXOR Gm,Em");
             nextop = F8;
diff --git a/src/dynarec/rv64/dynarec_rv64_660f.c b/src/dynarec/rv64/dynarec_rv64_660f.c
index cb6831ee..c6d3c62c 100644
--- a/src/dynarec/rv64/dynarec_rv64_660f.c
+++ b/src/dynarec/rv64/dynarec_rv64_660f.c
@@ -2755,12 +2755,45 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             nextop = F8;
             GETEX(x2, 0);
             GETGD;
-            MV(gd, xZR);
-            for (int i = 0; i < 16; ++i) {
-                LB(x1, wback, fixedaddress + i);
-                SLT(x3, x1, xZR);
-                if (i > 0) SLLI(x3, x3, i);
-                OR(gd, gd, x3);
+            LD(x1, wback, fixedaddress + 8); // high part
+            LD(x2, wback, fixedaddress + 0); // low part, also destroyed wback(x2)
+            for (int i = 0; i < 8; i++) {
+                if (rv64_zbs) {
+                    if (i == 0) {
+                        BEXTI(gd, x1, 63);
+                    } else {
+                        BEXTI(x6, x1, 63 - i * 8);
+                    }
+                } else {
+                    if (i == 0) {
+                        SRLI(gd, x1, 63);
+                    } else {
+                        SRLI(x6, x1, 63 - i * 8);
+                        ANDI(x6, x6, 1);
+                    }
+                }
+                if (i != 0) {
+                    if (rv64_zba) {
+                        SH1ADD(gd, gd, x6);
+                    } else {
+                        SLLI(gd, gd, 1);
+                        OR(gd, gd, x6);
+                    }
+                }
+            }
+            for (int i = 0; i < 8; i++) {
+                if (rv64_zbs) {
+                    BEXTI(x6, x2, 63 - i * 8);
+                } else {
+                    SRLI(x6, x2, 63 - i * 8);
+                    ANDI(x6, x6, 1);
+                }
+                if (rv64_zba) {
+                    SH1ADD(gd, gd, x6);
+                } else {
+                    SLLI(gd, gd, 1);
+                    OR(gd, gd, x6);
+                }
             }
             break;
         case 0xD8:
@@ -2807,8 +2840,12 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             for (int i = 0; i < 16; ++i) {
                 LBU(x3, gback, gdoffset + i);
                 LBU(x4, wback, fixedaddress + i);
-                BLTU(x3, x4, 8);
-                MV(x3, x4);
+                if (rv64_zbb) {
+                    MINU(x3, x3, x4);
+                } else {
+                    BLTU(x3, x4, 8);
+                    MV(x3, x4);
+                }
                 SB(x3, gback, gdoffset + i);
             }
             break;
@@ -2867,8 +2904,12 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             for (int i = 0; i < 16; ++i) {
                 LBU(x3, gback, gdoffset + i);
                 LBU(x4, wback, fixedaddress + i);
-                BLTU(x4, x3, 8);
-                MV(x3, x4);
+                if (rv64_zbb) {
+                    MAXU(x3, x3, x4);
+                } else {
+                    BLTU(x4, x3, 8);
+                    MV(x3, x4);
+                }
                 SB(x3, gback, gdoffset + i);
             }
             break;
@@ -3061,8 +3102,12 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             for (int i = 0; i < 8; ++i) {
                 LH(x3, gback, gdoffset + 2 * i);
                 LH(x4, wback, fixedaddress + 2 * i);
-                BLT(x3, x4, 8);
-                MV(x3, x4);
+                if (rv64_zbb) {
+                    MIN(x3, x3, x4);
+                } else {
+                    BLT(x3, x4, 8);
+                    MV(x3, x4);
+                }
                 SH(x3, gback, gdoffset + 2 * i);
             }
             break;
@@ -3128,7 +3173,14 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             nextop = F8;
             GETGX();
             GETEX(x2, 0);
-            SSE_LOOP_WS(x3, x4, BGE(x3, x4, 8); MV(x3, x4));
+            SSE_LOOP_WS(x3, x4,
+                if (rv64_zbb) {
+                    MAX(x3, x3, x4);
+                } else {
+                    BGE(x3, x4, 8);
+                    MV(x3, x4);
+                }
+            );
             break;
         case 0xEF:
             INST_NAME("PXOR Gx, Ex");