author     ptitSeb <sebastien.chev@gmail.com>   2024-05-31 22:33:12 +0200
committer  ptitSeb <sebastien.chev@gmail.com>   2024-05-31 22:33:12 +0200
commit     bfebcba43198f81cfa014a4fd1884a46c878f659 (patch)
tree       07e399a3d7eaa544968fc9e8f30b69029d8e72d7 /src/dynarec
parent     dde85b761b43c5fecb89177b09d6e596e6338a0c (diff)
[ARM64_DYNAREC] Added a bunch of AVX/BMI2/ADX opcodes
Diffstat (limited to 'src/dynarec')
 -rw-r--r--  src/dynarec/arm64/dynarec_arm64_660f.c        |  20
 -rw-r--r--  src/dynarec/arm64/dynarec_arm64_67.c          |   9
 -rw-r--r--  src/dynarec/arm64/dynarec_arm64_avx.c         |   4
 -rw-r--r--  src/dynarec/arm64/dynarec_arm64_avx_66_0f.c   | 213
 -rw-r--r--  src/dynarec/arm64/dynarec_arm64_avx_f2_0f38.c |  87
 -rw-r--r--  src/dynarec/arm64/dynarec_arm64_avx_f3_0f.c   |  93
 -rw-r--r--  src/dynarec/arm64/dynarec_arm64_f30f.c        |  30
 -rw-r--r--  src/dynarec/arm64/dynarec_arm64_helper.h      |  20

 8 files changed, 456 insertions, 20 deletions
diff --git a/src/dynarec/arm64/dynarec_arm64_660f.c b/src/dynarec/arm64/dynarec_arm64_660f.c
index fa006bcd..903c427c 100644
--- a/src/dynarec/arm64/dynarec_arm64_660f.c
+++ b/src/dynarec/arm64/dynarec_arm64_660f.c
@@ -900,6 +900,26 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                         STH(x1, ed, fixedaddress);
                     }
                     break;
+
+                case 0xF6:
+                    INST_NAME("ADCX Gd, Ed");
+                    nextop = F8;
+                    READFLAGS(X_CF);
+                    SETFLAGS(X_CF, SF_SUBSET);
+                    GETED(0);
+                    GETGD;
+                    MRS_nzvc(x3);
+                    BFIx(x3, xFlags, 29, 1);    // copy x86 CF into ARM C (NZCV bit 29)
+                    MSR_nzvc(x3);               // write NZCV back, seeding the ARM carry
+                    IFX(X_CF) {
+                        ADCSxw_REG(gd, gd, ed);
+                        CSETw(x3, cCS);
+                        BFIw(xFlags, x3, F_CF, 1);
+                    } else {
+                        ADCxw_REG(gd, gd, ed);
+                    }
+                    break;
+
                 default:
                     DEFAULT;
             }
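ADCX (from the ADX extension) is an add-with-carry that reads and writes only CF, leaving every other arithmetic flag untouched, which is why the flags are tracked as a SF_SUBSET above. The emitted code seeds the ARM carry from the emulated x86 CF, does the whole add with one ADCS, then stores the resulting carry back into xFlags. A minimal C model of the intended semantics, for reference (adcx64 is an illustrative name, not a box64 function):

    #include <stdint.h>

    // Returns dst + src + *cf; *cf receives the carry-out and nothing
    // else is affected, matching ADCX's CF-only behavior.
    static uint64_t adcx64(uint64_t dst, uint64_t src, unsigned *cf)
    {
        uint64_t sum = dst + src + *cf;
        // Carry out: the addition wrapped past 2^64.
        *cf = (sum < dst) || (*cf && sum == dst);
        return sum;
    }
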
diff --git a/src/dynarec/arm64/dynarec_arm64_67.c b/src/dynarec/arm64/dynarec_arm64_67.c
index 7a59de5d..6d8c336e 100644
--- a/src/dynarec/arm64/dynarec_arm64_67.c
+++ b/src/dynarec/arm64/dynarec_arm64_67.c
@@ -57,9 +57,14 @@ uintptr_t dynarec64_67(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
 
     GETREX();
 
+    while(opcode==0x67) opcode = F8;
+
     rep = 0;
-    while((opcode==0xF2) || (opcode==0xF3)) {
-        rep = opcode-0xF1;
+    while((opcode==0xF2) || (opcode==0xF3) || (opcode>=0x40 && opcode<=0x4F)) {
+        if((opcode==0xF2) || (opcode==0xF3))
+            rep = opcode-0xF1;
+        if(opcode>=0x40 && opcode<=0x4F)
+            rex.rex = opcode;
         opcode = F8;
     }
 
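The 67h handler now swallows redundant address-size prefixes and also accepts REX bytes (0x40-0x4F) interleaved with the REP prefixes, keeping whichever REX byte it saw last. A plain-C sketch of the scanning loop, assuming next_byte() stands in for box64's F8 fetch macro:

    #include <stdint.h>

    extern uint8_t next_byte(void);   // stand-in for the F8 macro

    // Skip prefixes, recording REP and the latest REX byte seen;
    // returns the first non-prefix byte (the opcode).
    static uint8_t scan_prefixes(int *rep, uint8_t *rex)
    {
        uint8_t opcode = next_byte();
        while (opcode == 0x67)            // collapse repeated 67h prefixes
            opcode = next_byte();
        *rep = 0;
        while (opcode == 0xF2 || opcode == 0xF3 ||
               (opcode >= 0x40 && opcode <= 0x4F)) {
            if (opcode == 0xF2 || opcode == 0xF3)
                *rep = opcode - 0xF1;     // 1 = REPNZ, 2 = REPZ
            else
                *rex = opcode;            // remember the latest REX byte
            opcode = next_byte();
        }
        return opcode;
    }
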
diff --git a/src/dynarec/arm64/dynarec_arm64_avx.c b/src/dynarec/arm64/dynarec_arm64_avx.c
index 14d79ad4..ad17147d 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx.c
@@ -53,10 +53,14 @@ uintptr_t dynarec64_AVX(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ni
         addr = dynarec64_AVX_0F(dyn, addr, ip, ninst, vex, ok, need_epilog);
     else if( (vex.m==VEX_M_0F) && (vex.p==VEX_P_66))
         addr = dynarec64_AVX_66_0F(dyn, addr, ip, ninst, vex, ok, need_epilog);
+    else if( (vex.m==VEX_M_0F) && (vex.p==VEX_P_F3))
+        addr = dynarec64_AVX_F3_0F(dyn, addr, ip, ninst, vex, ok, need_epilog);
     else if( (vex.m==VEX_M_0F38) && (vex.p==VEX_P_66))
         addr = dynarec64_AVX_66_0F38(dyn, addr, ip, ninst, vex, ok, need_epilog);
     else if( (vex.m==VEX_M_0F3A) && (vex.p==VEX_P_66))
         addr = dynarec64_AVX_66_0F3A(dyn, addr, ip, ninst, vex, ok, need_epilog);
+    else if( (vex.m==VEX_M_0F38) && (vex.p==VEX_P_F2))
+        addr = dynarec64_AVX_F2_0F38(dyn, addr, ip, ninst, vex, ok, need_epilog);
     else {DEFAULT;}
 
     if((*ok==-1) && (box64_dynarec_log>=LOG_INFO || box64_dynarec_dump || box64_dynarec_missing)) {
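
A VEX prefix encodes both an opcode map (0F, 0F38, 0F3A) and an implied legacy SIMD prefix (none, 66, F3, F2), and the dispatcher above routes each (map, prefix) pair to its own translation unit; this commit wires in the F3 0F and F2 0F38 tables. For reference, the two selectors come straight out of the 3-byte VEX payload (a sketch; byte1 and byte2 are the two bytes following C4h):

    #include <stdint.h>

    // Extract the map (mm) and implied-prefix (pp) selectors from the
    // payload bytes of a 3-byte (C4h) VEX prefix.
    static void vex_selectors(uint8_t byte1, uint8_t byte2,
                              unsigned *mm, unsigned *pp)
    {
        *mm = byte1 & 0x1F;   // 1 = 0F, 2 = 0F38, 3 = 0F3A
        *pp = byte2 & 0x03;   // 0 = none, 1 = 66, 2 = F3, 3 = F2
    }
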
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c b/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c
index 06044df1..649cf797 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_66_0f.c
@@ -39,7 +39,7 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
     int q0, q1, q2;
     int d0, d1, d2;
     int s0;
-    uint64_t tmp64u;
+    uint64_t tmp64u, tmp64u2;
     int64_t j64;
     int64_t fixedaddress;
     int unscaled;
@@ -57,6 +57,8 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
     MAYUSE(j64);
     MAYUSE(cacheupd);
 
+    /* Do not call fpu_get_scratch after some GY/VY/EY has been created: the Y half can live in the scratch area and might overlap (and the scratch would win) */
+
     rex_t rex = vex.rex;
 
     switch(opcode) {
@@ -153,20 +155,14 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
         case 0x6B:
             INST_NAME("PACKSSDW Gx,Ex");
             nextop = F8;
-            GETGX_empty_VXEX(v0, v2, v1, 0);
-            if(v0==v1) {
-                q0 = fpu_get_scratch(dyn, ninst);
-                VMOVQ(q0, v0);
-            }
-            SQXTN_16(v0, v2);
-            if(v2==v1) {
-                VMOVeD(v0, 1, v0, 0);
-            } else {
-                SQXTN2_16(v0, (v0==v1)?q0:v1);
-            }
-            if(vex.l) {
-                GETGY_empty_VYEY(v0, v2, v1);
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) {
+                    GETGX_empty_VXEX(v0, v2, v1, 0);
+                } else {
+                    GETGY_empty_VYEY(v0, v2, v1);
+                }
                 if(v0==v1) {
+                    q0 = fpu_get_scratch(dyn, ninst);
                     VMOVQ(q0, v0);
                 }
                 SQXTN_16(v0, v2);
@@ -175,7 +171,8 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
                 } else {
                     SQXTN2_16(v0, (v0==v1)?q0:v1);
                 }
-            } else YMM0(gd);
+            }
+            if(!vex.l) YMM0(gd);
             break;
 
         case 0x6F:
@@ -201,6 +198,192 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip,
             }
             if(!vex.l) YMM0(gd);
             break;
+        case 0x70:
+            INST_NAME("VPSHUFD Gx,Ex,Ib");
+            nextop = F8;
+            if(MODREG) {
+                u8 = F8;
+                d0 = fpu_get_scratch(dyn, ninst);
+                for(int l=0; l<1+vex.l; ++l) {
+                    if(!l) {
+                        v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0);
+                        GETGX_empty(v0);
+                    } else {
+                        GETGY_empty_EY(v0, v1);
+                    }
+                    if(u8==0x4E) {
+                        if(v0==v1) {
+                            VEXTQ_8(v0, v0, v0, 8);
+                        } else {
+                            VMOVeD(v0, 0, v1, 1);
+                            VMOVeD(v0, 1, v1, 0);
+                        }
+                    } else if(u8==0x00) {
+                        VDUPQ_32(v0, v1, 0);
+                    } else if(u8==0x55) {
+                        VDUPQ_32(v0, v1, 1);
+                    } else if(u8==0xAA) {
+                        VDUPQ_32(v0, v1, 2);
+                    } else if(u8==0xFF) {
+                        VDUPQ_32(v0, v1, 3);
+                    } else if(u8==0x44) {
+                        VDUPQ_64(v0, v1, 0);
+                    } else if(u8==0xEE) {
+                        VDUPQ_64(v0, v1, 1);
+                    } else if(u8==0xB1) {
+                        VREV64Q_32(v0, v1);
+                    } else if(v0!=v1) {
+                        VMOVeS(v0, 0, v1, (u8>>(0*2))&3);
+                        VMOVeS(v0, 1, v1, (u8>>(1*2))&3);
+                        VMOVeS(v0, 2, v1, (u8>>(2*2))&3);
+                        VMOVeS(v0, 3, v1, (u8>>(3*2))&3);
+                    } else {
+                        if(!l) {
+                            uint64_t swp[4] = {
+                                (0)|(1<<8)|(2<<16)|(3<<24),
+                                (4)|(5<<8)|(6<<16)|(7<<24),
+                                (8)|(9<<8)|(10<<16)|(11<<24),
+                                (12)|(13<<8)|(14<<16)|(15<<24)
+                            };
+                            tmp64u = swp[(u8>>(0*2))&3] | (swp[(u8>>(1*2))&3]<<32);
+                            MOV64x(x2, tmp64u);
+                            VMOVQDfrom(d0, 0, x2);
+                            tmp64u2 = swp[(u8>>(2*2))&3] | (swp[(u8>>(3*2))&3]<<32);
+                            if(tmp64u2==tmp64u) {
+                                VMOVQDfrom(d0, 1, x2);
+                            } else {
+                                MOV64x(x3, tmp64u2);
+                                VMOVQDfrom(d0, 1, x3);
+                            }
+                        }
+                        VTBLQ1_8(v0, v1, d0);
+                    }
+                }
+            } else {
+                SMREAD();
+                for(int l=0; l<1+vex.l; ++l) {
+                    i32 = -1;
+                    if(!l) {
+                        GETGX_empty(v0);
+                        addr = geted(dyn, addr, ninst, nextop, &ed, x3, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 1);
+                        u8 = F8;
+                    } else {
+                        GETGY_empty(v0, -1, -1, -1);
+                        ADDx_U12(x3, ed, 16);
+                        ed = x3;
+                    }
+                    if (u8) {
+                        for (int i=0; i<4; ++i) {
+                            int32_t idx = (u8>>(i*2))&3;
+                            if(idx!=i32) {
+                                ADDx_U12(x2, ed, idx*4);
+                                i32 = idx;
+                            }
+                            VLD1_32(v0, i, x2);
+                        }
+                    } else {
+                        VLDQ1R_32(v0, ed);
+                    }
+                }
+            }
+            if(!vex.l) YMM0(gd);
+            break;
+
+        case 0x73:
+            nextop = F8;
+            switch((nextop>>3)&7) {
+                case 2:
+                    INST_NAME("VPSRLQ Vx, Ex, Ib");
+                    for(int l=0; l<1+vex.l; ++l) {
+                        if(!l) {
+                            GETEX_Y(v1, 0, 1);
+                            GETVX_empty(v0);
+                            u8 = F8;
+                        } else {
+                            GETVY_empty_EY(v0, v1);
+                        }
+                        if(u8) {
+                            if (u8>63) {
+                                VEORQ(v0, v0, v0);
+                            } else if(u8) {
+                                VSHRQ_64(v0, v1, u8);
+                            }
+                        } else if(v0!=v1)
+                            VMOVQ(v0, v1);
+                    }
+                    if(!vex.l) YMM0(vex.v);
+                    break;
+                case 3:
+                    INST_NAME("VPSRLDQ Vx, Ex, Ib");
+                    q1 = fpu_get_scratch(dyn, ninst);
+                    for(int l=0; l<1+vex.l; ++l) {
+                        if(!l) {
+                            GETEX_Y(v1, 0, 1);
+                            GETVX_empty(v0);
+                            u8 = F8;
+                        } else {
+                            GETVY_empty_EY(v0, v1);
+                        }
+                        if(u8) {
+                            if(u8>15) {
+                                VEORQ(v0, v0, v0);
+                            } else {
+                                if(!l) VEORQ(q1, q1, q1);
+                                VEXTQ_8(v0, v1, q1, u8);
+                            }
+                        } else if(v0!=v1)
+                            VMOVQ(v0, v1);
+                    }
+                    if(!vex.l) YMM0(vex.v);
+                    break;
+                case 6:
+                    INST_NAME("VPSLLQ Vx, Ex, Ib");
+                    for(int l=0; l<1+vex.l; ++l) {
+                        if(!l) {
+                            GETEX_Y(v1, 0, 1);
+                            GETVX_empty(v0);
+                            u8 = F8;
+                        } else {
+                            GETVY_empty_EY(v0, v1);
+                        }
+                        if(u8) {
+                            if (u8>63) {
+                                VEORQ(v0, v0, v0);
+                            } else {
+                                VSHLQ_64(v0, v1, u8);
+                            }
+                        } else if(v0!=v1)
+                            VMOVQ(v0, v1);
+                    }
+                    if(!vex.l) YMM0(vex.v);
+                    break;
+                case 7:
+                    INST_NAME("VPSLLDQ Vx, Ex, Ib");
+                    q1 = fpu_get_scratch(dyn, ninst);
+                    for(int l=0; l<1+vex.l; ++l) {
+                        if(!l) {
+                            GETEX_Y(v1, 0, 1);
+                            GETVX_empty(v0);
+                            u8 = F8;
+                        } else {
+                            GETVY_empty_EY(v0, v1);
+                        }
+                        if(u8) {
+                            if(u8>15) {
+                                VEORQ(v0, v0, v0);
+                            } else if(u8>0) {
+                                if(!l) VEORQ(q1, q1, q1);
+                                VEXTQ_8(v0, q1, v1, 16-u8);
+                            }
+                        } else if(v0!=v1)
+                            VMOVQ(v0, v1);
+                    }
+                    if(!vex.l) YMM0(vex.v);
+                    break;
+                default:
+                    DEFAULT;
+            }
+            break;
 
         default:
             DEFAULT;
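
Most of the new VPSHUFD path is pattern matching: well-known immediates map to single NEON instructions (0x4E is a 64-bit halves swap via EXT, 0x00/0x55/0xAA/0xFF are dword DUPs, 0x44/0xEE are qword DUPs, 0xB1 is REV64), and only the general in-place case builds a byte-index table for TBL, which is why the scratch d0 is allocated before the lane loop, in line with the file-top warning about fpu_get_scratch. A C model of what one 128-bit lane of VPSHUFD computes:

    #include <stdint.h>

    // Destination dword i takes source dword ((imm >> 2*i) & 3).
    static void pshufd128(uint32_t dst[4], const uint32_t src[4], uint8_t imm)
    {
        for (int i = 0; i < 4; ++i)
            dst[i] = src[(imm >> (2 * i)) & 3];
    }
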
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_f2_0f38.c b/src/dynarec/arm64/dynarec_arm64_avx_f2_0f38.c
new file mode 100644
index 00000000..294a7797
--- /dev/null
+++ b/src/dynarec/arm64/dynarec_arm64_avx_f2_0f38.c
@@ -0,0 +1,87 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <errno.h>
+
+#include "debug.h"
+#include "box64context.h"
+#include "dynarec.h"
+#include "emu/x64emu_private.h"
+#include "emu/x64run_private.h"
+#include "x64run.h"
+#include "x64emu.h"
+#include "box64stack.h"
+#include "callback.h"
+#include "emu/x64run_private.h"
+#include "x64trace.h"
+#include "dynarec_native.h"
+#include "my_cpuid.h"
+#include "emu/x87emu_private.h"
+#include "emu/x64shaext.h"
+
+#include "arm64_printer.h"
+#include "dynarec_arm64_private.h"
+#include "dynarec_arm64_functions.h"
+#include "dynarec_arm64_helper.h"
+
+uintptr_t dynarec64_AVX_F2_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog)
+{
+    (void)ip; (void)need_epilog;
+
+    uint8_t opcode = F8;
+    uint8_t nextop, u8;
+    uint8_t gd, ed, vd;
+    uint8_t wback, wb1, wb2;
+    uint8_t eb1, eb2, gb1, gb2;
+    int32_t i32, i32_;
+    int cacheupd = 0;
+    int v0, v1, v2;
+    int q0, q1, q2;
+    int d0, d1, d2;
+    int s0;
+    uint64_t tmp64u;
+    int64_t j64;
+    int64_t fixedaddress;
+    int unscaled;
+    MAYUSE(wb1);
+    MAYUSE(wb2);
+    MAYUSE(eb1);
+    MAYUSE(eb2);
+    MAYUSE(gb1);
+    MAYUSE(gb2);
+    MAYUSE(q0);
+    MAYUSE(q1);
+    MAYUSE(d0);
+    MAYUSE(d1);
+    MAYUSE(s0);
+    MAYUSE(j64);
+    MAYUSE(cacheupd);
+
+    rex_t rex = vex.rex;
+
+    switch(opcode) {
+
+        case 0xF6:
+            INST_NAME("MULX Gd, Vd, Ed (,RDX)");
+            nextop = F8;
+            GETGD;
+            GETED(0);
+            GETVD;
+            if(rex.w) {
+                // 64bits mul
+                UMULH(x3, xRDX, ed);
+                MULx(vd, xRDX, ed);
+                MOVx_REG(gd, x3);
+            } else {
+                // 32bits mul
+                UMULL(x3, xRDX, ed);
+                MOVw_REG(vd, x3);
+                LSRx(gd, x3, 32);
+            }
+            break;
+
+        default:
+            DEFAULT;
+    }
+    return addr;
+}
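
MULX (BMI2) is an unsigned multiply that reads RDX/EDX implicitly, writes the high half to its first destination and the low half to its second, and leaves all flags alone, hence no SETFLAGS in this case. The 64-bit form needs two ARM instructions (UMULH for the high half, MUL for the low), while the 32-bit form gets both halves from a single UMULL. A C model of the 64-bit case, assuming a compiler with __int128:

    #include <stdint.h>

    // hi:lo = rdx * src; no flags are touched.
    static void mulx64(uint64_t rdx, uint64_t src,
                       uint64_t *hi, uint64_t *lo)
    {
        unsigned __int128 p = (unsigned __int128)rdx * src;
        *lo = (uint64_t)p;
        *hi = (uint64_t)(p >> 64);
    }
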
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_f3_0f.c b/src/dynarec/arm64/dynarec_arm64_avx_f3_0f.c
new file mode 100644
index 00000000..7ff19269
--- /dev/null
+++ b/src/dynarec/arm64/dynarec_arm64_avx_f3_0f.c
@@ -0,0 +1,93 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <errno.h>
+
+#include "debug.h"
+#include "box64context.h"
+#include "dynarec.h"
+#include "emu/x64emu_private.h"
+#include "emu/x64run_private.h"
+#include "x64run.h"
+#include "x64emu.h"
+#include "box64stack.h"
+#include "callback.h"
+#include "emu/x64run_private.h"
+#include "x64trace.h"
+#include "dynarec_native.h"
+#include "my_cpuid.h"
+#include "emu/x87emu_private.h"
+#include "emu/x64shaext.h"
+
+#include "arm64_printer.h"
+#include "dynarec_arm64_private.h"
+#include "dynarec_arm64_functions.h"
+#include "dynarec_arm64_helper.h"
+
+uintptr_t dynarec64_AVX_F3_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog)
+{
+    (void)ip; (void)need_epilog;
+
+    uint8_t opcode = F8;
+    uint8_t nextop, u8;
+    uint8_t gd, ed, vd;
+    uint8_t wback, wb1, wb2;
+    uint8_t eb1, eb2, gb1, gb2;
+    int32_t i32, i32_;
+    int cacheupd = 0;
+    int v0, v1, v2;
+    int q0, q1, q2;
+    int d0, d1, d2;
+    int s0;
+    uint64_t tmp64u;
+    int64_t j64;
+    int64_t fixedaddress;
+    int unscaled;
+    MAYUSE(wb1);
+    MAYUSE(wb2);
+    MAYUSE(eb1);
+    MAYUSE(eb2);
+    MAYUSE(gb1);
+    MAYUSE(gb2);
+    MAYUSE(q0);
+    MAYUSE(q1);
+    MAYUSE(d0);
+    MAYUSE(d1);
+    MAYUSE(s0);
+    MAYUSE(j64);
+    MAYUSE(cacheupd);
+
+    rex_t rex = vex.rex;
+
+    switch(opcode) {
+
+        case 0x6F:
+            INST_NAME("VMOVDQU Gx,Ex");// no alignment constraint on NEON here, so same as MOVDQA
+            nextop = F8;
+            if(MODREG) {
+                v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0);
+                GETGX_empty(v0);
+                VMOVQ(v0, v1);
+                if(vex.l) {
+                    v1 = ymm_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0, gd, -1, -1);
+                    GETGY_empty(v0, (nextop&7)+(rex.b<<3), -1, -1);
+                    VMOVQ(v0, v1);
+                }
+            } else {
+                GETGX_empty(v0);
+                SMREAD();
+                addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0xffe<<4, 15, rex, NULL, 0, 0);
+                VLDR128_U12(v0, ed, fixedaddress);
+                if(vex.l) {
+                    GETGY_empty(v0, -1, -1, -1);
+                    VLDR128_U12(v0, ed, fixedaddress+16);
+                }
+            }
+            if(!vex.l) YMM0(gd);
+            break;
+
+        default:
+            DEFAULT;
+    }
+    return addr;
+}
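
On NEON, 128-bit loads have no alignment requirement, so VMOVDQU translates exactly like VMOVDQA. The trailing YMM0(gd) implements the AVX rule that a VEX.128 operation zeroes bits 255:128 of the destination register. In C terms (the ymm_t layout is illustrative, not box64's actual representation):

    #include <stdint.h>
    #include <string.h>

    typedef struct { uint8_t lane[2][16]; } ymm_t;  // illustrative layout

    // A VEX.128 write: low lane gets the 128-bit result, high lane is zeroed.
    static void write_xmm_zero_upper(ymm_t *r, const uint8_t res[16])
    {
        memcpy(r->lane[0], res, 16);
        memset(r->lane[1], 0, 16);
    }
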
diff --git a/src/dynarec/arm64/dynarec_arm64_f30f.c b/src/dynarec/arm64/dynarec_arm64_f30f.c
index 079cd0bc..96fe7543 100644
--- a/src/dynarec/arm64/dynarec_arm64_f30f.c
+++ b/src/dynarec/arm64/dynarec_arm64_f30f.c
@@ -172,6 +172,36 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                }
             }
             break;
+
+        case 0x38:  /* MAP */
+            opcode = F8;
+            switch(opcode) {
+
+                case 0xF6:
+                    INST_NAME("ADOX Gd, Ed");
+                    nextop = F8;
+                    READFLAGS(X_OF);
+                    SETFLAGS(X_OF, SF_SUBSET);
+                    GETED(0);
+                    GETGD;
+                    MRS_nzvc(x3);
+                    LSRw(x4, xFlags, F_OF);
+                    BFIx(x3, x4, 29, 1);    // copy x86 OF into ARM C (NZCV bit 29)
+                    MSR_nzvc(x3);           // write NZCV back, seeding the ARM carry
+                    IFX(X_OF) {
+                        ADCSxw_REG(gd, gd, ed);
+                        CSETw(x3, cCS);
+                        BFIw(xFlags, x3, F_OF, 1);
+                    } else {
+                        ADCxw_REG(gd, gd, ed);
+                    }
+                    break;
+
+                default:
+                    DEFAULT;
+            }
+            break;
+
         case 0x51:
             INST_NAME("SQRTSS Gx, Ex");
             nextop = F8;
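ADOX is the twin of the ADCX case added in dynarec_arm64_660f.c: the same add-with-carry, but chained through OF instead of CF, so two independent carry chains can run interleaved in one loop, which is the point of the ADX extension (e.g. in bignum multiplication). Since AArch64 has a single carry flag, the code first shifts OF down to bit 0 with LSRw before seeding NZCV. A C model mirroring the adcx64 sketch above (adox64 is an illustrative name):

    #include <stdint.h>

    // Returns dst + src + *of; *of receives the carry-out and no other
    // flag changes, matching ADOX's OF-only behavior.
    static uint64_t adox64(uint64_t dst, uint64_t src, unsigned *of)
    {
        uint64_t sum = dst + src + *of;
        *of = (sum < dst) || (*of && sum == dst);
        return sum;
    }
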
diff --git a/src/dynarec/arm64/dynarec_arm64_helper.h b/src/dynarec/arm64/dynarec_arm64_helper.h
index b72b4e99..cc4ef30a 100644
--- a/src/dynarec/arm64/dynarec_arm64_helper.h
+++ b/src/dynarec/arm64/dynarec_arm64_helper.h
@@ -90,6 +90,8 @@
 
 // GETGD    get x64 register in gd
 #define GETGD   gd = xRAX+((nextop&0x38)>>3)+(rex.r<<3)
+// GETVD    get x64 register in vd
+#define GETVD   vd = xRAX+vex.v
 //GETED can use r1 for ed, and r2 for wback. wback is 0 if ed is xEAX..xEDI
 #define GETED(D)  if(MODREG) {                          \
                     ed = xRAX+(nextop&7)+(rex.b<<3);    \
@@ -510,13 +512,21 @@
         VLD128(ey, ed, fixedaddress+16);                                            \
     gy = ymm_get_reg_empty(dyn, ninst, x1, gd, -1, (MODREG)?((nextop&7)+(rex.b<<3)):-1, -1)
 
-// Get EX as a quad, (x1 is used)
+// Get empty VY, and non-written EY
+#define GETVY_empty_EY(vy, ey)                                                      \
+    if(MODREG)                                                                      \
+        ey = ymm_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0, vex.v, -1, -1);     \
+    else                                                                            \
+        VLD128(ey, ed, fixedaddress+16);                                            \
+    vy = ymm_get_reg_empty(dyn, ninst, x1, vex.v, -1, (MODREG)?((nextop&7)+(rex.b<<3)):-1, -1)
+
+// Get EX as a quad (x3 is used)
 #define GETEX_Y(a, w, D)                                                                                \
     if(MODREG) {                                                                                        \
-        a = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), w);                                      \
+        a = sse_get_reg(dyn, ninst, x3, (nextop&7)+(rex.b<<3), w);                                      \
     } else {                                                                                            \
         if(w) {WILLWRITE2();} else {SMREAD();}                                                          \
-        addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0xffe<<4, 15, rex, NULL, 0, D);  \
+        addr = geted(dyn, addr, ninst, nextop, &ed, x3, &fixedaddress, NULL, 0xffe<<4, 15, rex, NULL, 0, D);  \
         unscaled = 0;                                                                                   \
         a = fpu_get_scratch(dyn, ninst);                                                                \
         VLD128(a, ed, fixedaddress);                                                                    \
@@ -1093,8 +1103,10 @@ void* arm64_next(x64emu_t* emu, uintptr_t addr);
 #define dynarec64_AVX      STEPNAME(dynarec64_AVX)
 #define dynarec64_AVX_0F   STEPNAME(dynarec64_AVX_0F)
 #define dynarec64_AVX_66_0F     STEPNAME(dynarec64_AVX_66_0F)
+#define dynarec64_AVX_F3_0F     STEPNAME(dynarec64_AVX_F3_0F)
 #define dynarec64_AVX_66_0F38   STEPNAME(dynarec64_AVX_66_0F38)
 #define dynarec64_AVX_66_0F3A   STEPNAME(dynarec64_AVX_66_0F3A)
+#define dynarec64_AVX_F2_0F38   STEPNAME(dynarec64_AVX_F2_0F38)
 
 #define geted           STEPNAME(geted)
 #define geted32         STEPNAME(geted32)
@@ -1516,8 +1528,10 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
 uintptr_t dynarec64_AVX(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog);
 uintptr_t dynarec64_AVX_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog);
 uintptr_t dynarec64_AVX_66_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog);
+uintptr_t dynarec64_AVX_F3_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog);
 uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog);
 uintptr_t dynarec64_AVX_66_0F3A(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog);
+uintptr_t dynarec64_AVX_F2_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog);
 
 #if STEP < 2
 #define PASS2(A)
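
The STEPNAME defines exist because the dynarec compiles the same translation source once per pass, so each new entry point (here the F3 0F and F2 0F38 handlers) must be registered to give every pass its own copy; note that each define has to expand STEPNAME of its own name, otherwise two handlers would collapse onto the same per-pass symbol. A simplified sketch of the token-pasting idiom, assuming STEP expands to the pass identifier (the exact suffix scheme here is illustrative):

    // Sketch of per-pass renaming; box64 defines STEP per compilation pass.
    #define STEPNAME3(name, suffix)  name##_##suffix
    #define STEPNAME2(name, suffix)  STEPNAME3(name, suffix)
    #define STEPNAME(name)           STEPNAME2(name, STEP)
    // e.g. STEPNAME(dynarec64_AVX_F2_0F38) -> dynarec64_AVX_F2_0F38_0 when STEP is 0.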