about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
author Yang Liu <liuyang22@iscas.ac.cn> 2023-04-20 14:52:40 +0800
committer GitHub <noreply@github.com> 2023-04-20 08:52:40 +0200
commit f280c6498056dc747089d07725f7f6edd03efd24 (patch)
tree 9f3f4d742590ff544e5e83d460f2446ae5581f12 /src
parent d0ae6a9a7da7d77f17b97b41c14951a4af0f9c70 (diff)
download box64-f280c6498056dc747089d07725f7f6edd03efd24.tar.gz
box64-f280c6498056dc747089d07725f7f6edd03efd24.zip
[RV64_DYNAREC] Added more opcodes and some fixes (#716)
* Fixed various bugs

* Added 66 0F 38 01 PHADDW opcode

* Added 66 0F 38 02 PHADDD opcode

* Added 66 0F EC PADDSB opcode

* Some small optimizations
Diffstat (limited to 'src')
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_660f.c 142
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_f20f.c 2
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_f30f.c 49
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_helper.c 4
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_helper.h 4
5 files changed, 148 insertions, 53 deletions
diff --git a/src/dynarec/rv64/dynarec_rv64_660f.c b/src/dynarec/rv64/dynarec_rv64_660f.c
index 20a5b441..962ddce3 100644
--- a/src/dynarec/rv64/dynarec_rv64_660f.c
+++ b/src/dynarec/rv64/dynarec_rv64_660f.c
@@ -194,6 +194,64 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                         SB(x4, gback, i);
                     }
                     break;
+                case 0x01:
+                    INST_NAME("PHADDW Gx, Ex");
+                    nextop = F8;
+                    GETGX(x1);
+                    for (int i=0; i<4; ++i) {
+                        // GX->sw[i] = GX->sw[i*2+0]+GX->sw[i*2+1];
+                        LH(x3, gback, 2*(i*2+0));
+                        LH(x4, gback, 2*(i*2+1));
+                        ADDW(x3, x3, x4);
+                        SH(x3, gback, 2*i);
+                    }
+                    if (MODREG && gd==(nextop&7)+(rex.b<<3)) {
+                        // GX->q[1] = GX->q[0];
+                        LD(x3, gback, 0);
+                        SD(x3, gback, 8);
+                    } else {
+                        GETEX(x2, 0);
+                        for (int i=0; i<4; ++i) {
+                            // GX->sw[4+i] = EX->sw[i*2+0] + EX->sw[i*2+1];
+                            LH(x3, wback, fixedaddress+2*(i*2+0));
+                            LH(x4, wback, fixedaddress+2*(i*2+1));
+                            ADDW(x3, x3, x4);
+                            SH(x3, gback, 2*(4+i));
+                        }
+                    }
+                    break;
+                case 0x02:
+                    INST_NAME("PHADDD Gx, Ex");
+                    nextop = F8;
+                    GETGX(x1);
+                    // GX->sd[0] += GX->sd[1];
+                    LW(x3, gback, 0*4);
+                    LW(x4, gback, 1*4);
+                    ADDW(x3, x3, x4);
+                    SW(x3, gback, 0*4);
+                    // GX->sd[1] = GX->sd[2] + GX->sd[3];
+                    LW(x3, gback, 2*4);
+                    LW(x4, gback, 3*4);
+                    ADDW(x3, x3, x4);
+                    SW(x3, gback, 1*4);
+                    if (MODREG && gd==(nextop&7)+(rex.b<<3)) {
+                        // GX->q[1] = GX->q[0];
+                        LD(x3, gback, 0);
+                        SD(x3, gback, 8);
+                    } else {
+                        GETEX(x2, 0);
+                        // GX->sd[2] = EX->sd[0] + EX->sd[1];
+                        LW(x3, wback, fixedaddress+0*4);
+                        LW(x4, wback, fixedaddress+1*4);
+                        ADDW(x3, x3, x4);
+                        SW(x3, gback, 2*4);
+                        // GX->sd[3] = EX->sd[2] + EX->sd[3];
+                        LW(x3, wback, fixedaddress+2*4);
+                        LW(x4, wback, fixedaddress+3*4);
+                        ADDW(x3, x3, x4);
+                        SW(x3, gback, 3*4);
+                    }
+                    break;
                 case 0x17:
                     INST_NAME("PTEST Gx, Ex");
                     nextop = F8;
@@ -245,29 +303,43 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     DEFAULT;
             }
             break;
-        // case 0x3A:  // these are some more SSSE3+ opcodes
-        //     opcode = F8;
-        //     switch(opcode) {
-        //         case 0x0B:
-        //             INST_NAME("ROUNDSD Gx, Ex, Ib");
-        //             nextop = F8;
-        //             GETEXSD(d0, 0);
-        //             GETGXSD_empty(v0);
-        //             u8 = F8;
-        //             if(u8&4) {
-        //                 u8 = sse_setround(dyn, ninst, x4, x2);
-        //                 FCVTLD(x5, d0, RD_DYN);
-        //                 FCVTDL(v0, x5, RD_DYN);
-        //                 x87_restoreround(dyn, ninst, u8);
-        //             } else {
-        //                 FCVTLD(x5, d0, round_round[u8&3]);
-        //                 FCVTDL(v0, x5, round_round[u8&3]);
-        //             }
-        //             break;
-        //         default:
-        //             DEFAULT;
-        //     }
-        //     break;
+        case 0x3A:  // these are some more SSSE3+ opcodes
+            opcode = F8;
+            switch(opcode) {
+                case 0x0B:
+                    INST_NAME("ROUNDSD Gx, Ex, Ib");
+                    nextop = F8;
+                    GETEXSD(d0, 0);
+                    GETGXSD_empty(v0);
+                    d1 = fpu_get_scratch(dyn);
+                    u8 = F8;
+                    FEQD(x2, d0, d0);
+                    BNEZ_MARK(x2);
+                    FADDD(v0, d0, d0);
+                    B_NEXT_nocond;
+                    MARK; // d0 is not nan
+                    FABSD(v0, d0);
+                    MOV64x(x3, 1ULL << __DBL_MANT_DIG__);
+                    FCVTDL(d1, x3, RD_RTZ);
+                    FLTD(x3, v0, d1);
+                    BNEZ_MARK2(x3);
+                    if (v0!=d0) FMVD(v0, d0);
+                    B_NEXT_nocond;
+                    MARK2;
+                    if(u8&4) {
+                        u8 = sse_setround(dyn, ninst, x4, x2);
+                        FCVTLD(x5, d0, RD_DYN);
+                        FCVTDL(v0, x5, RD_DYN);
+                        x87_restoreround(dyn, ninst, u8);
+                    } else {
+                        FCVTLD(x5, d0, round_round[u8&3]);
+                        FCVTDL(v0, x5, round_round[u8&3]);
+                    }
+                    break;
+                default:
+                    DEFAULT;
+            }
+            break;
 
         case 0x54:
             INST_NAME("ANDPD Gx, Ex");
@@ -1028,6 +1100,30 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETEX(x2, 0);
             SSE_LOOP_Q(x3, x4, OR(x3, x3, x4));
             break;
+        case 0xEC:
+            INST_NAME("PADDSB Gx,Ex");
+            nextop = F8;
+            GETGX(x1);
+            GETEX(x2, 0);
+            for(int i=0; i<16; ++i) {
+                // tmp16s = (int16_t)GX->sb[i] + EX->sb[i];
+                // GX->sb[i] = (tmp16s>127)?127:((tmp16s<-128)?-128:tmp16s);
+                LB(x3, gback, i);
+                LB(x4, wback, fixedaddress+i);
+                ADDW(x3, x3, x4);
+                SLLIW(x3, x3, 16);
+                SRAIW(x3, x3, 16);
+                ADDI(x4, xZR, 0x7f);
+                BLT(x3, x4, 12);     // tmp16s>127?
+                SB(x4, gback, i);
+                J(24);               // continue
+                ADDI(x4, xZR, 0xf80);
+                BLT(x4, x3, 12);     // tmp16s<-128?
+                SB(x4, gback, i);
+                J(8);                // continue
+                SB(x3, gback, i);
+            }
+            break;
         case 0xEE:
             INST_NAME("PMAXSW Gx,Ex");
             nextop = F8;
diff --git a/src/dynarec/rv64/dynarec_rv64_f20f.c b/src/dynarec/rv64/dynarec_rv64_f20f.c
index fe902ac1..c8976aff 100644
--- a/src/dynarec/rv64/dynarec_rv64_f20f.c
+++ b/src/dynarec/rv64/dynarec_rv64_f20f.c
@@ -130,7 +130,7 @@ uintptr_t dynarec64_F20F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
                 FSFLAGSI(xZR);  // // reset all bits
             }
             u8 = sse_setround(dyn, ninst, x2, x3);
-            FCVTLDxw(gd, v0, RD_RM);
+            FCVTLDxw(gd, v0, RD_DYN);
             x87_restoreround(dyn, ninst, u8);
             if(!box64_dynarec_fastround) {
                 FRFLAGS(x5);   // get back FPSR to check the IOC bit
diff --git a/src/dynarec/rv64/dynarec_rv64_f30f.c b/src/dynarec/rv64/dynarec_rv64_f30f.c
index e942e9b5..c3d066c8 100644
--- a/src/dynarec/rv64/dynarec_rv64_f30f.c
+++ b/src/dynarec/rv64/dynarec_rv64_f30f.c
@@ -228,24 +228,23 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0x5B:
             INST_NAME("CVTTPS2DQ Gx, Ex");
             nextop = F8;
-            GETEX(x5, 0) ;
-            GETGX(x6);
+            GETGX(x1);
+            GETEX(x2, 0);
             v0 = fpu_get_scratch(dyn);
-            v1 = fpu_get_scratch(dyn);
-            q0 = fpu_get_scratch(dyn);
-            q1 = fpu_get_scratch(dyn);
-            FLW(v0, x5, 0);
-            FLW(v1, x5, 4);
-            FLW(q0, x5, 8);
-            FLW(q1, x5, 12);
-            FCVTWS(x1, v0, RD_RTZ);
-            FCVTWS(x2, v1, RD_RTZ);
-            FCVTWS(x3, q0, RD_RTZ);
-            FCVTWS(x4, q1, RD_RTZ);
-            SW(x1, x6, 0);
-            SW(x2, x6, 4);
-            SW(x3, x6, 8);
-            SW(x4, x6, 12);
+            for(int i=0; i<4; ++i) {
+                if(!box64_dynarec_fastround) {
+                    FSFLAGSI(xZR); // reset all bits
+                }
+                FLW(v0, wback, fixedaddress+i*4);
+                FCVTWS(x3, v0, RD_RTZ);
+                if(!box64_dynarec_fastround) {
+                    FRFLAGS(x5);   // get back FPSR to check the IOC bit
+                    ANDI(x5, x5, (1<<FR_NV)|(1<<FR_OF));
+                    BEQZ(x5, 8);
+                    MOV32w(x3, 0x80000000);
+                }
+                SW(x3, gback, i*4);
+            }
             break;
         case 0xBC:
             INST_NAME("TZCNT Gd, Ed");
@@ -379,16 +378,16 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0xE6:
             INST_NAME("CVTDQ2PD Gx, Ex");
             nextop = F8;
-            GETEX(x1, 0);
-            GETGX(x2);
+            GETGX(x1);
+            GETEX(x2, 0);
             q0 = fpu_get_scratch(dyn);
             q1 = fpu_get_scratch(dyn);
-            LW(x3, x1, 0);
-            LW(x4, x1, 4);
-            FCVTDW(q0, x3, RD_DYN);
-            FCVTDW(q1, x4, RD_DYN);
-            FSD(q0, x2, 0);
-            FSD(q1, x2, 8);
+            LW(x3, wback, fixedaddress+0);
+            LW(x4, wback, fixedaddress+4);
+            FCVTDW(q0, x3, RD_RTZ);
+            FCVTDW(q1, x4, RD_RTZ);
+            FSD(q0, gback, 0);
+            FSD(q1, gback, 8);
             break;
 
         default:
diff --git a/src/dynarec/rv64/dynarec_rv64_helper.c b/src/dynarec/rv64/dynarec_rv64_helper.c
index a395871d..a0f88502 100644
--- a/src/dynarec/rv64/dynarec_rv64_helper.c
+++ b/src/dynarec/rv64/dynarec_rv64_helper.c
@@ -998,7 +998,7 @@ int x87_setround(dynarec_rv64_t* dyn, int ninst, int s1, int s2)
     ADDI(s2, xZR, 3);
     BEQ(s1, s2, 12);
     ADDI(s1, s1, 1);
-    BEQ(xZR, xZR, 8);
+    J(8);
     ADDI(s1, xZR, 1);
     // transform done (is there a faster way?)
     FSRM(s1, s1);               // exange RM with current
@@ -1020,7 +1020,7 @@ int sse_setround(dynarec_rv64_t* dyn, int ninst, int s1, int s2)
     ADDI(s2, xZR, 3);
     BEQ(s1, s2, 12);
     ADDI(s1, s1, 1);
-    BEQ(xZR, xZR, 8);
+    J(8);
     ADDI(s1, xZR, 1);
     // transform done (is there a faster way?)
     FSRM(s1, s1);               // exange RM with current
diff --git a/src/dynarec/rv64/dynarec_rv64_helper.h b/src/dynarec/rv64/dynarec_rv64_helper.h
index 95fc3f87..76792594 100644
--- a/src/dynarec/rv64/dynarec_rv64_helper.h
+++ b/src/dynarec/rv64/dynarec_rv64_helper.h
@@ -281,10 +281,10 @@
     gd = ((nextop&0x38)>>3)+(rex.r<<3); \
     a = sse_get_reg(dyn, ninst, x1, gd, 0)
 
-// Get GX as a Double (might use x1), no fetching old value
+// Get GX as a Double (might use x2), no fetching old value
 #define GETGXSD_empty(a)                \
     gd = ((nextop&0x38)>>3)+(rex.r<<3); \
-    a = sse_get_reg_empty(dyn, ninst, x1, gd, 0)
+    a = sse_get_reg_empty(dyn, ninst, x2, gd, 0)
 
 // Get Ex as a single, not a quad (warning, x1 get used, x2 might too)
 #define GETEXSS(a, D)                                                                                   \