about summary refs log tree commit diff stats
path: root/src/dynarec
diff options
context:
space:
mode:
Diffstat (limited to 'src/dynarec')
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_660f.c         |  2
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c  | 85
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_helper.c       | 22
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_helper.h       |  4
-rw-r--r--  src/dynarec/dynarec_native_functions.c         | 34
-rw-r--r--  src/dynarec/dynarec_native_functions.h         |  2
6 files changed, 145 insertions, 4 deletions
diff --git a/src/dynarec/arm64/dynarec_arm64_660f.c b/src/dynarec/arm64/dynarec_arm64_660f.c
index 903c427c..79173acf 100644
--- a/src/dynarec/arm64/dynarec_arm64_660f.c
+++ b/src/dynarec/arm64/dynarec_arm64_660f.c
@@ -2894,7 +2894,6 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             GETEX(q0, 0, 0);

             VBICQ(v0, q0, v0);

             break;

-

          case 0xE0:

             INST_NAME("PAVGB Gx, Ex");

             nextop = F8;

@@ -2902,7 +2901,6 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             GETEX(v1, 0, 0);

             URHADDQ_8(v0, v0, v1);

             break;

-

         case 0xE1:

             INST_NAME("PSRAW Gx,Ex");

             nextop = F8;

diff --git a/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c b/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
index ee1fa401..e5894ce1 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
@@ -107,6 +107,29 @@ uintptr_t dynarec64_AVX_66_0F3A(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             } else YMM0(gd);
             break;
 
+        case 0x0F:
+            INST_NAME("VPALIGNR Gx, Vx, Ex, Ib");
+            nextop = F8;
+            d0 = fpu_get_scratch(dyn, ninst);
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) {
+                    GETGX_empty_VXEX(v0, v2, v1, 1);
+                    u8 = F8;
+                } else {
+                    GETGY_empty_VYEY(v0, v2, v1);
+                }
+                if(u8>31) {
+                    VEORQ(v0, v0, v0);
+                } else if(u8>15) {
+                    if(!l) VEORQ(d0, d0, d0);
+                    VEXTQ_8(v0, v2, d0, u8-16);
+                } else {
+                    VEXTQ_8(v0, v1, v2, u8);
+                }
+            }
+            if(!vex.l) YMM0(gd);
+            break;
+
         case 0x15:
             INST_NAME("VPEXTRW Ed, Gx, imm8");
             nextop = F8;
@@ -198,6 +221,68 @@ uintptr_t dynarec64_AVX_66_0F3A(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             F8; // read u8, but it's been already handled
             break;
 
+        case 0x44:
+            INST_NAME("PCLMULQDQ Gx, Vx, Ex, Ib");
+            nextop = F8;
+            if(arm64_pmull) {
+                d0 = fpu_get_scratch(dyn, ninst);
+                for(int l=0; l<1+vex.l; ++l) {
+                    if(!l) {
+                        GETGX_empty_VXEX(v0, v2, v1, 1);
+                        u8 = F8;
+                    } else {
+                        GETGY_empty_VYEY(v0, v2, v1);
+                    }
+                    switch (u8&0b00010001) {
+                        case 0b00000000:
+                            PMULL_128(v0, v2, v1);
+                            break;
+                        case 0b00010001:
+                            PMULL2_128(v0, v2, v1);
+                            break;
+                        case 0b00000001:
+                            VEXTQ_8(d0, v2, v2, 8); // Swap Up/Lower 64bits parts
+                            PMULL_128(v0, d0, v1);
+                            break;
+                        case 0b00010000:
+                            VEXTQ_8(d0, v2, v2, 8); // Swap Up/Lower 64bits parts
+                            PMULL2_128(v0, d0, v1);
+                            break;
+                    }
+                }
+            } else {
+                for(int l=0; l<1+vex.l; ++l) {
+                    if(!l) {
+                        GETG;
+                        sse_forget_reg(dyn, ninst, gd);
+                        sse_reflect_reg(dyn, ninst, vex.v);
+                    }
+                    MOV32w(x1, gd); // gx
+                    MOV32w(x2, vex.v); // vx
+                    if(MODREG) {
+                        if(!l) {
+                            ed = (nextop&7)+(rex.b<<3);
+                            sse_forget_reg(dyn, ninst, ed);
+                        }
+                        MOV32w(x3, ed);
+                    } else {
+                        if(!l) {
+                            addr = geted(dyn, addr, ninst, nextop, &ed, x3, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 1);
+                            if(ed!=x3) {
+                                MOVx_REG(x3, ed);
+                            }
+                        } else {
+                            ADDx_U12(x3, ed, 16);
+                        }
+                    }
+                    if(!l) u8 = F8;
+                    MOV32w(x4, u8);
+                    CALL_(l?native_pclmul_y:native_pclmul_x, -1, x3);
+                }
+            }
+            if(!vex.l) YMM0(gd);
+            break;
+
         default:
             DEFAULT;
     }
diff --git a/src/dynarec/arm64/dynarec_arm64_helper.c b/src/dynarec/arm64/dynarec_arm64_helper.c
index 2d08510e..908b757d 100644
--- a/src/dynarec/arm64/dynarec_arm64_helper.c
+++ b/src/dynarec/arm64/dynarec_arm64_helper.c
@@ -1630,6 +1630,22 @@ void sse_forget_reg(dynarec_arm_t* dyn, int ninst, int a)
     if(dyn->n.neoncache[dyn->n.ssecache[a].reg].t == NEON_CACHE_XMMW) {
         VSTR128_U12(dyn->n.ssecache[a].reg, xEmu, offsetof(x64emu_t, xmm[a]));
     }
+    // YMM part too
+    if(is_avx_zero_unset(dyn, ninst, a)) {
+        //only  ymm[0] can be accessed with STP :(
+        if(!a)
+            STPx_S7_offset(xZR, xZR, xEmu, offsetof(x64emu_t, ymm[a]));
+        else {
+            STRx_U12(xZR, xEmu, offsetof(x64emu_t, ymm[a]));
+            STRx_U12(xZR, xEmu, offsetof(x64emu_t, ymm[a])+8);
+        }
+    } else for(int i=0; i<32; ++i)
+        if((dyn->n.neoncache[i].t == NEON_CACHE_YMMW) || (dyn->n.neoncache[i].t == NEON_CACHE_YMMR)) {
+            if(dyn->n.neoncache[i].t == NEON_CACHE_YMMW)
+                VSTR128_U12(i, xEmu, offsetof(x64emu_t, ymm[dyn->n.neoncache[i].n]));
+            fpu_free_reg(dyn, i);
+
+    }
     fpu_free_reg(dyn, dyn->n.ssecache[a].reg);
     dyn->n.ssecache[a].v = -1;
     return;
@@ -1725,6 +1741,9 @@ static void sse_reflectcache(dynarec_arm_t* dyn, int ninst, int s1)
                 }
                 STPx_S7_offset(xZR, xZR, s1, i*16);
             }
+        for(int i=0; i<32; ++i)
+                if(dyn->n.neoncache[i].t == NEON_CACHE_YMMW)
+                    VSTR128_U12(i, xEmu, offsetof(x64emu_t, ymm[dyn->n.neoncache[i].n]));
     }
 }
 
@@ -1738,6 +1757,9 @@ void sse_reflect_reg(dynarec_arm_t* dyn, int ninst, int a)
             STRx_U12(xZR, xEmu, offsetof(x64emu_t, ymm[a]));
             STRx_U12(xZR, xEmu, offsetof(x64emu_t, ymm[a])+8);
         }
+    } else for(int i=0; i<32; ++i)
+        if((dyn->n.neoncache[i].t == NEON_CACHE_YMMW) && (dyn->n.neoncache[i].n == a)) {
+            VSTR128_U12(i, xEmu, offsetof(x64emu_t, ymm[a]));
     }
     if(dyn->n.ssecache[a].v==-1)
         return;
diff --git a/src/dynarec/arm64/dynarec_arm64_helper.h b/src/dynarec/arm64/dynarec_arm64_helper.h
index e328d255..872a86fb 100644
--- a/src/dynarec/arm64/dynarec_arm64_helper.h
+++ b/src/dynarec/arm64/dynarec_arm64_helper.h
@@ -1476,11 +1476,11 @@ int mmx_get_reg_empty(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3, int
 int sse_get_reg(dynarec_arm_t* dyn, int ninst, int s1, int a, int forwrite);
 // get neon register for a SSE reg, but don't try to synch it if it needed to be created
 int sse_get_reg_empty(dynarec_arm_t* dyn, int ninst, int s1, int a);
-// forget neon register for a SSE reg, create the entry if needed
+// forget neon register for a SSE reg, YMM high part too
 void sse_forget_reg(dynarec_arm_t* dyn, int ninst, int a);
 // purge the XMM0..XMM7 cache (before function call)
 void sse_purge07cache(dynarec_arm_t* dyn, int ninst, int s1);
-// Push current value to the cache
+// Push current value to the cache (ymm too)
 void sse_reflect_reg(dynarec_arm_t* dyn, int ninst, int a);
 // common coproc helpers
 // reset the cache with n
diff --git a/src/dynarec/dynarec_native_functions.c b/src/dynarec/dynarec_native_functions.c
index 10d6f333..e0e0f3ff 100644
--- a/src/dynarec/dynarec_native_functions.c
+++ b/src/dynarec/dynarec_native_functions.c
@@ -416,6 +416,40 @@ void native_pclmul(x64emu_t* emu, int gx, int ex, void* p, uint32_t u8)
     GX->q[0] = result&0xffffffffffffffffLL;
     GX->q[1] = (result>>64)&0xffffffffffffffffLL;
 }
+void native_pclmul_x(x64emu_t* emu, int gx, int vx, void* p, uint32_t u8)
+{
+
+    sse_regs_t *EX = ((uintptr_t)p<16)?((sse_regs_t*)p):&emu->xmm[(uintptr_t)p];
+    sse_regs_t *GX = &emu->xmm[gx];
+    sse_regs_t *VX = &emu->xmm[vx];
+    int g = (u8&1)?1:0;
+    int e = (u8&0b10000)?1:0;
+    __int128 result = 0;
+    __int128 op2 = EX->q[e];
+    for (int i=0; i<64; ++i)
+        if(VX->q[g]&(1LL<<i))
+            result ^= (op2<<i);
+
+    GX->q[0] = result&0xffffffffffffffffLL;
+    GX->q[1] = (result>>64)&0xffffffffffffffffLL;
+}
+void native_pclmul_y(x64emu_t* emu, int gy, int vy, void* p, uint32_t u8)
+{
+
+    sse_regs_t *EY = ((uintptr_t)p<16)?((sse_regs_t*)p):&emu->ymm[(uintptr_t)p];
+    sse_regs_t *GY = &emu->ymm[gy];
+    sse_regs_t *VY = &emu->ymm[vy];
+    int g = (u8&1)?1:0;
+    int e = (u8&0b10000)?1:0;
+    __int128 result = 0;
+    __int128 op2 = EY->q[e];
+    for (int i=0; i<64; ++i)
+        if(VY->q[g]&(1LL<<i))
+            result ^= (op2<<i);
+
+    GY->q[0] = result&0xffffffffffffffffLL;
+    GY->q[1] = (result>>64)&0xffffffffffffffffLL;
+}
 
 void native_clflush(x64emu_t* emu, void* p)
 {
diff --git a/src/dynarec/dynarec_native_functions.h b/src/dynarec/dynarec_native_functions.h
index 3e81081b..5085e9e1 100644
--- a/src/dynarec/dynarec_native_functions.h
+++ b/src/dynarec/dynarec_native_functions.h
@@ -42,6 +42,8 @@ void native_aeselast(x64emu_t* emu, int xmm);
 void native_aesimc(x64emu_t* emu, int xmm);
 void native_aeskeygenassist(x64emu_t* emu, int gx, int ex, void* p, uint32_t u8);
 void native_pclmul(x64emu_t* emu, int gx, int ex, void* p, uint32_t u8);
+void native_pclmul_x(x64emu_t* emu, int gx, int vx, void* p, uint32_t u8);
+void native_pclmul_y(x64emu_t* emu, int gy, int vy, void* p, uint32_t u8);
 
 void native_clflush(x64emu_t* emu, void* p);