about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
author: Leslie Zhai <zhaixiang@loongson.cn>, 2024-12-10 17:48:24 +0800
committer: GitHub <noreply@github.com>, 2024-12-10 10:48:24 +0100
commit: f1addb8b28ac133d8f15c3db3102200fb32e4152 (patch)
tree: c46c3e75a6ef479c72df32296cc2df52e72f84b7 /src
parent: 96f5108a37ac9f0e4e9d4fa9c4763386c43dfe2b (diff)
download: box64-f1addb8b28ac133d8f15c3db3102200fb32e4152.tar.gz
          box64-f1addb8b28ac133d8f15c3db3102200fb32e4152.zip
[LA64_DYNAREC] Added more 660F opcodes (#2127)
* [LA64_DYNAREC] Added more 660F opcodes

* [LA64_DYNAREC] Change VREPLGR2VR_D to VXOR_V

* [LA64_DYNAREC] Optimize PMULHRSW

Co-authored-by: Yang Liu <liuyang22@iscas.ac.cn>

---------

Co-authored-by: Yang Liu <liuyang22@iscas.ac.cn>
Diffstat (limited to 'src')
-rw-r--r--  src/dynarec/la64/dynarec_la64_660f.c  241
-rw-r--r--  src/dynarec/la64/dynarec_la64_helper.h  2
-rw-r--r--  src/dynarec/la64/la64_emitter.h  6
3 files changed, 248 insertions, 1 deletions
diff --git a/src/dynarec/la64/dynarec_la64_660f.c b/src/dynarec/la64/dynarec_la64_660f.c
index 4daba8bf..6dc339d9 100644
--- a/src/dynarec/la64/dynarec_la64_660f.c
+++ b/src/dynarec/la64/dynarec_la64_660f.c
@@ -36,7 +36,7 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
     uint8_t eb1, eb2;
     int64_t j64;
     uint64_t tmp64u, tmp64u2;
-    int v0, v1;
+    int v0, v1, v2;
     int q0, q1;
     int d0, d1, d2;
     int64_t fixedaddress, gdoffset;
@@ -316,6 +316,79 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     GETEX(q1, 0, 0);
                     VSIGNCOV_W(q0, q1, q0);
                     break;
+                case 0x0B:
+                    INST_NAME("PMULHRSW Gx,Ex");
+                    nextop = F8;
+                    GETGX(q0, 1);
+                    GETEX(q1, 0, 0);
+                    v0 = fpu_get_scratch(dyn);
+                    v1 = fpu_get_scratch(dyn);
+                    VEXT2XV_W_H(v0, q0);
+                    VEXT2XV_W_H(v1, q1);
+                    XVMUL_W(v0, v0, v1);
+                    XVSRLI_W(v0, v0, 14);
+                    XVADDI_WU(v0, v0, 1);
+                    XVSRLNI_H_W(v0, v0, 1);
+                    XVPERMI_D(q0, v0, 0b1000);
+                    break;
+                case 0x1C:
+                    INST_NAME("PABSB Gx,Ex");
+                    nextop = F8;
+                    GETEX(q1, 0, 0);
+                    GETGX_empty(q0);
+                    v0 = fpu_get_scratch(dyn);
+                    VXOR_V(v0, v0, v0);
+                    VABSD_B(q0, q1, v0);
+                    break;
+                case 0x1D:
+                    INST_NAME("PABSW Gx,Ex");
+                    nextop = F8;
+                    GETEX(q1, 0, 0);
+                    GETGX_empty(q0);
+                    v0 = fpu_get_scratch(dyn);
+                    VXOR_V(v0, v0, v0);
+                    VABSD_H(q0, q1, v0);
+                    break;
+                case 0x2B:
+                    INST_NAME("PACKUSDW Gx, Ex"); // SSE4 opcode!
+                    nextop = F8;
+                    GETEX(q1, 0, 0);
+                    GETGX(q0, 1);
+                    v0 = fpu_get_scratch(dyn);
+                    v1 = fpu_get_scratch(dyn);
+                    VSLTI_W(v0, q0, 0);
+                    VANDN_V(q0, v0, q0);
+                    VSSRANI_HU_W(q0, q0, 0);
+                    if (q0 == q1) {
+                        VEXTRINS_D(q0, q0, VEXTRINS_IMM_4_0(1, 0));
+                    } else {
+                        VSLTI_W(v1, q1, 0);
+                        VANDN_V(v1, v1, q1);
+                        VSSRANI_HU_W(v1, v1, 0);
+                        VEXTRINS_D(q0, v1, VEXTRINS_IMM_4_0(1, 0));
+                    }
+                    break;
+                case 0x3A:
+                    INST_NAME("PMINUW Gx, Ex");  // SSE4 opcode!
+                    nextop = F8;
+                    GETEX(q1, 0, 0);
+                    GETGX(q0, 1);
+                    VMIN_HU(q0, q0, q1);
+                    break;
+                case 0x3D:
+                    INST_NAME("PMAXSD Gx, Ex");  // SSE4 opcode!
+                    nextop = F8;
+                    GETEX(q1, 0, 0);
+                    GETGX(q0, 1);
+                    VMAX_W(q0, q0, q1);
+                    break;
+                case 0x40:
+                    INST_NAME("PMULLD Gx, Ex");  // SSE4 opcode!
+                    nextop = F8;
+                    GETEX(q1, 0, 0);
+                    GETGX(q0, 1);
+                    VMUL_W(q0, q0, q1);
+                    break;
                 case 0xDB:
                     INST_NAME("AESIMC Gx, Ex"); // AES-NI
                     nextop = F8;
@@ -418,6 +491,63 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
                         VOR_V(q0, q0, d0);
                     }
                     break;
+                case 0x0E:
+                    INST_NAME("PBLENDW Gx, Ex, Ib");
+                    nextop = F8;
+                    GETGX(q0, 1);
+                    GETEX(q1, 0, 1);
+                    u8 = F8;
+                    i32 = 0;
+                    if (q0 != q1) {
+                        if (u8 == 0xff) {
+                            VAND_V(q0, q1, q1);
+                        } else {
+                            /* 64bits */
+                            if ((u8 & 0xf) == 0xf) {
+                                VEXTRINS_D(q0, q1, VEXTRINS_IMM_4_0(0, 0));
+                                u8 &= ~0xf;
+                            }
+                            if ((u8 & 0xf0) == 0xf0) {
+                                VEXTRINS_D(q0, q1, VEXTRINS_IMM_4_0(1, 1));
+                                u8 &= ~0xf0;
+                            }
+                            /* 32bits */
+                            if ((u8 & 0x3) == 0x3) {
+                                VEXTRINS_W(q0, q1, VEXTRINS_IMM_4_0(0, 0));
+                                u8 &= ~0x3;
+                            }
+                            if ((u8 & 0xc) == 0xc) {
+                                VEXTRINS_W(q0, q1, VEXTRINS_IMM_4_0(1, 1));
+                                u8 &= ~0xc;
+                            }
+                            if ((u8 & 0x30) == 0x30) {
+                                VEXTRINS_W(q0, q1, VEXTRINS_IMM_4_0(2, 2));
+                                u8 &= ~0x30;
+                            }
+                            if ((u8 & 0xc0) == 0xc0) {
+                                VEXTRINS_W(q0, q1, VEXTRINS_IMM_4_0(3, 3));
+                                u8 &= ~0xc0;
+                            }
+                            /* 16bits */
+                            if (u8 & 0x1)
+                                VEXTRINS_H(q0, q1, VEXTRINS_IMM_4_0(0, 0));
+                            if (u8 & 0x2)
+                                VEXTRINS_H(q0, q1, VEXTRINS_IMM_4_0(1, 1));
+                            if (u8 & 0x4)
+                                VEXTRINS_H(q0, q1, VEXTRINS_IMM_4_0(2, 2));
+                            if (u8 & 0x8)
+                                VEXTRINS_H(q0, q1, VEXTRINS_IMM_4_0(3, 3));
+                            if (u8 & 0x10)
+                                VEXTRINS_H(q0, q1, VEXTRINS_IMM_4_0(4, 4));
+                            if (u8 & 0x20)
+                                VEXTRINS_H(q0, q1, VEXTRINS_IMM_4_0(5, 5));
+                            if (u8 & 0x40)
+                                VEXTRINS_H(q0, q1, VEXTRINS_IMM_4_0(6, 6));
+                            if (u8 & 0x80)
+                                VEXTRINS_H(q0, q1, VEXTRINS_IMM_4_0(7, 7));
+                        }
+                    }
+                    break;
                 case 0x16:
                     if (rex.w) {
                         INST_NAME("PEXTRQ Ed, Gx, Ib");
@@ -1166,6 +1296,19 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
             u8 = F8;
             VSHUF4I_D(v0, v1, 0x8 | (u8 & 1) | ((u8 & 2) << 1));
             break;
+        case 0xD1:
+            INST_NAME("PSRLW Gx,Ex");
+            nextop = F8;
+            GETGX(q0, 1);
+            GETEX(q1, 0, 0);
+            v0 = fpu_get_scratch(dyn);
+            v1 = fpu_get_scratch(dyn);
+            VREPLVEI_D(v0, q1, 0);
+            VSLEI_DU(v1, v0, 15);
+            VREPLVEI_H(v0, q1, 0);
+            VSRL_H(q0, q0, v0);
+            VAND_V(q0, q0, v1);
+            break;
         case 0xD2:
             INST_NAME("PSRLD Gx, Ex");
             nextop = F8;
@@ -1242,6 +1385,13 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
             MOVFR2GR_D(x1, v0);
             BSTRPICK_D(gd, x1, 15, 0);
             break;
+        case 0xD8:
+            INST_NAME("PSUBUSB Gx, Ex");
+            nextop = F8;
+            GETGX(q0, 1);
+            GETEX(q1, 0, 0);
+            VSSUB_BU(q0, q0, q1);
+            break;
         case 0xD9:
             INST_NAME("PSUBUSW Gx, Ex");
             nextop = F8;
@@ -1249,6 +1399,13 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETEX(q1, 0, 0);
             VSSUB_HU(q0, q0, q1);
             break;
+        case 0xDA:
+            INST_NAME("PMINUB Gx, Ex");
+            nextop = F8;
+            GETGX(q0, 1);
+            GETEX(q1, 0, 0);
+            VMIN_BU(q0, q0, q1);
+            break;
         case 0xDB:
             INST_NAME("PAND Gx,Ex");
             nextop = F8;
@@ -1263,6 +1420,20 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETEX(q1, 0, 0);
             VSADD_BU(q0, q0, q1);
             break;
+        case 0xDD:
+            INST_NAME("PADDUSW Gx,Ex");
+            nextop = F8;
+            GETGX(q0, 1);
+            GETEX(q1, 0, 0);
+            VSADD_HU(q0, q0, q1);
+            break;
+        case 0xDE:
+            INST_NAME("PMAXUB Gx, Ex");
+            nextop = F8;
+            GETGX(q0, 1);
+            GETEX(q1, 0, 0);
+            VMAX_BU(q0, q0, q1);
+            break;
         case 0xDF:
             INST_NAME("PANDN Gx,Ex");
             nextop = F8;
@@ -1287,6 +1458,21 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
             VREPLVEI_H(v0, v0, 0);
             VSRA_H(q0, q0, v0);
             break;
+        case 0xE2:
+            INST_NAME("PSRAD Gx,Ex");
+            nextop = F8;
+            GETGX(q0, 1);
+            GETEX(q1, 0, 0);
+            v0 = fpu_get_scratch(dyn);
+            v1 = fpu_get_scratch(dyn);
+            v2 = fpu_get_scratch(dyn);
+            VREPLVEI_D(v0, q1, 0);
+            VSLEI_DU(v1, v0, 31);
+            VREPLVEI_W(v0, q1, 0);
+            VSRAI_W(v2, q0, 31);
+            VSRA_W(q0, q0, v0);
+            VBITSEL_V(q0, v2, q0, v1);
+            break;
         case 0xE3:
             INST_NAME("PAVGW Gx,Ex");
             nextop = F8;
@@ -1328,6 +1514,27 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
                 VST(v0, ed, fixedaddress);
             }
             break;
+        case 0xE8:
+            INST_NAME("PSUBSB Gx,Ex");
+            nextop = F8;
+            GETGX(v0, 1);
+            GETEX(q0, 0, 0);
+            VSSUB_B(v0, v0, q0);
+            break;
+        case 0xE9:
+            INST_NAME("PSUBSW Gx,Ex");
+            nextop = F8;
+            GETGX(v0, 1);
+            GETEX(q0, 0, 0);
+            VSSUB_H(v0, v0, q0);
+            break;
+        case 0xEA:
+            INST_NAME("PMINSW Gx,Ex");
+            nextop = F8;
+            GETGX(v0, 1);
+            GETEX(q0, 0, 0);
+            VMIN_H(v0, v0, q0);
+            break;
         case 0xEB:
             INST_NAME("POR Gx,Ex");
             nextop = F8;
@@ -1335,6 +1542,27 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETEX(q0, 0, 0);
             VOR_V(v0, v0, q0);
             break;
+        case 0xEC:
+            INST_NAME("PADDSB Gx,Ex");
+            nextop = F8;
+            GETGX(v0, 1);
+            GETEX(q0, 0, 0);
+            VSADD_B(v0, v0, q0);
+            break;
+        case 0xED:
+            INST_NAME("PADDSW Gx,Ex");
+            nextop = F8;
+            GETGX(v0, 1);
+            GETEX(q0, 0, 0);
+            VSADD_H(v0, v0, q0);
+            break;
+        case 0xEE:
+            INST_NAME("PMAXSW Gx,Ex");
+            nextop = F8;
+            GETGX(v0, 1);
+            GETEX(q0, 0, 0);
+            VMAX_H(v0, v0, q0);
+            break;
         case 0xEF:
             INST_NAME("PXOR Gx,Ex");
             nextop = F8;
@@ -1356,6 +1584,17 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETEX(v1, 0, 0);
             VMULWEV_D_WU(v0, v0, v1);
             break;
+        case 0xF5:
+            INST_NAME("PMADDWD Gx, Ex");
+            nextop = F8;
+            GETGX(v0, 1);
+            GETEX(v1, 0, 0);
+            q0 = fpu_get_scratch(dyn);
+            q1 = fpu_get_scratch(dyn);
+            VMULWEV_W_H(q0, v0, v1);
+            VMULWOD_W_H(q1, v0, v1);
+            VADD_W(v0, q0, q1);
+            break;
         case 0xF6:
             INST_NAME("PSADBW Gx, Ex");
             nextop = F8;
diff --git a/src/dynarec/la64/dynarec_la64_helper.h b/src/dynarec/la64/dynarec_la64_helper.h
index cbfdc4ef..b4b33443 100644
--- a/src/dynarec/la64/dynarec_la64_helper.h
+++ b/src/dynarec/la64/dynarec_la64_helper.h
@@ -314,6 +314,8 @@
         ed = i;                                                                                     \
     }
 
+#define VEXTRINS_IMM_4_0(n, m) ((((n) & 0xf) << 4) | ((m) & 0xf)) // packs two 4-bit fields into one byte: n in bits 7..4, m in bits 3..0; args parenthesized so expression arguments are masked as a whole
+
 // Get GX as a quad (might use x1)
 #define GETGX(a, w)                             \
     gd = ((nextop & 0x38) >> 3) + (rex.r << 3); \
diff --git a/src/dynarec/la64/la64_emitter.h b/src/dynarec/la64/la64_emitter.h
index eeaab03a..6e98806f 100644
--- a/src/dynarec/la64/la64_emitter.h
+++ b/src/dynarec/la64/la64_emitter.h
@@ -1287,6 +1287,7 @@ LSX instruction starts with V, LASX instruction starts with XV.
 #define VBITSET_H(vd, vj, vk)        EMIT(type_3R(0b01110001000011101, vk, vj, vd))
 #define VBITSET_W(vd, vj, vk)        EMIT(type_3R(0b01110001000011110, vk, vj, vd))
 #define VBITSET_D(vd, vj, vk)        EMIT(type_3R(0b01110001000011111, vk, vj, vd))
+#define VBITSEL_V(vd, vj, vk, va)    EMIT(type_4R(0b000011010001, va, vk, vj, vd))
 #define VBITREV_B(vd, vj, vk)        EMIT(type_3R(0b01110001000100000, vk, vj, vd))
 #define VBITREV_H(vd, vj, vk)        EMIT(type_3R(0b01110001000100001, vk, vj, vd))
 #define VBITREV_W(vd, vj, vk)        EMIT(type_3R(0b01110001000100010, vk, vj, vd))
@@ -1369,9 +1370,11 @@ LSX instruction starts with V, LASX instruction starts with XV.
 #define VSLE_HU(vd, vj, vk)          EMIT(type_3R(0b01110000000001001, vk, vj, vd))
 #define VSLE_WU(vd, vj, vk)          EMIT(type_3R(0b01110000000001010, vk, vj, vd))
 #define VSLE_DU(vd, vj, vk)          EMIT(type_3R(0b01110000000001011, vk, vj, vd))
+#define VSLEI_DU(vd, vj, imm5)       EMIT(type_2RI5(0b01110010100001011, imm5, vj, vd))
 #define VSLT_B(vd, vj, vk)           EMIT(type_3R(0b01110000000001100, vk, vj, vd))
 #define VSLT_H(vd, vj, vk)           EMIT(type_3R(0b01110000000001101, vk, vj, vd))
 #define VSLT_W(vd, vj, vk)           EMIT(type_3R(0b01110000000001110, vk, vj, vd))
+#define VSLTI_W(vd, vj, imm5)        EMIT(type_2RI5(0b01110010100001110, imm5, vj, vd))
 #define VSLT_D(vd, vj, vk)           EMIT(type_3R(0b01110000000001111, vk, vj, vd))
 #define VSLT_BU(vd, vj, vk)          EMIT(type_3R(0b01110000000010000, vk, vj, vd))
 #define VSLT_HU(vd, vj, vk)          EMIT(type_3R(0b01110000000010001, vk, vj, vd))
@@ -1818,6 +1821,9 @@ LSX instruction starts with V, LASX instruction starts with XV.
 #define VEXT2XV_WU_HU(vd, vj)        EMIT(type_2R(0b0111011010011111001101, vj, vd))
 #define VEXT2XV_DU_HU(vd, vj)        EMIT(type_2R(0b0111011010011111001110, vj, vd))
 #define VEXT2XV_DU_WU(vd, vj)        EMIT(type_2R(0b0111011010011111001111, vj, vd))
+#define XVADDI_WU(vd, vj, imm5)      EMIT(type_2RI5(0b01110110100010110, imm5, vj, vd))
+#define XVSRLNI_H_W(vd, vj, imm5)    EMIT(type_2RI5(0b01110111010000001, imm5, vj, vd))
+#define XVSRLI_W(vd, vj, imm5)       EMIT(type_2RI5(0b01110111001100001, imm5, vj, vd))
 
 ////////////////////////////////////////////////////////////////////////////////
 // (undocumented) LBT extension instructions