Diffstat (limited to 'src')
-rw-r--r--  src/dynarec/arm64/arm64_emitter.h              |  26
-rw-r--r--  src/dynarec/arm64/arm64_printer.c              |  30
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c  | 300
-rw-r--r--  src/dynarec/arm64/dynarec_arm64_helper.h       |  15
4 files changed, 366 insertions, 5 deletions
diff --git a/src/dynarec/arm64/arm64_emitter.h b/src/dynarec/arm64/arm64_emitter.h
index 050eece5..4817e130 100644
--- a/src/dynarec/arm64/arm64_emitter.h
+++ b/src/dynarec/arm64/arm64_emitter.h
@@ -1080,6 +1080,12 @@ int convert_bitmask(uint64_t bitmask);
 #define VMOVHto(Wd, Vn, index)      EMIT(UMOV_gen(0, ((index)<<2) | 2, Vn, Wd))
 #define VMOVSto(Wd, Vn, index)      EMIT(UMOV_gen(0, ((index)<<3) | 4, Vn, Wd))
 
+#define SMOV_gen(Q, imm5, Rn, Rd)   ((Q)<<30 | 0b01110000<<21 | (imm5)<<16 | 0b01<<13 | 0<<12 | 1<<11 | 1<<10 | (Rn)<<5 | (Rd))
+#define SMOVQDto(Xd, Vn, index)     EMIT(SMOV_gen(1, ((index)<<4) | 8, Vn, Xd))
+#define SMOVQBto(Xd, Vn, index)     EMIT(SMOV_gen(1, ((index)<<1) | 1, Vn, Xd))
+#define SMOVQHto(Xd, Vn, index)     EMIT(SMOV_gen(1, ((index)<<2) | 2, Vn, Xd))
+#define SMOVQSto(Xd, Vn, index)     EMIT(SMOV_gen(1, ((index)<<3) | 4, Vn, Xd))
+
 #define MVN_vector(Q, Rn, Rd)       ((Q)<<30 | 1<<29 | 0b01110<<24 | 0b10000<<17 | 0b00101<<12 | 0b10<<10 | (Rn)<<5 | (Rd))
 #define VMVNQ(Rd, Rn)               EMIT(MVN_vector(1, Rn, Rd))
 
@@ -1161,11 +1167,6 @@ int convert_bitmask(uint64_t bitmask);
 #define FMULS(Sd, Sn, Sm)           EMIT(FMUL_scalar(0b00, Sm, Sn, Sd))
 #define FMULD(Dd, Dn, Dm)           EMIT(FMUL_scalar(0b01, Dm, Dn, Dd))
 
-#define FMLA_vector(Q, op, sz, Rm, Rn, Rd)	((Q)<<30 | 0b01110<<24 | (op)<<23 | (sz)<<22 | 1<<21 | (Rm)<<16 | 0b11001<<11 | 1<<10 | (Rn)<<5 | (Rd))
-#define VFMLAS(Sd, Sn, Sm)	        EMIT(FMLA_vector(0, 0, 0, Sm, Sn, Sd))
-#define VFMLAQS(Sd, Sn, Sm)	        EMIT(FMLA_vector(1, 0, 0, Sm, Sn, Sd))
-#define VFMLAQD(Dd, Dn, Dm)	        EMIT(FMLA_vector(1, 0, 1, Dm, Dn, Dd))
-
 // DIV
 #define FDIV_vector(Q, sz, Rm, Rn, Rd)  ((Q)<<30 | 1<<29 | 0b01110<<24 | (sz)<<22 | 1<<21 | (Rm)<<16 | 0b11111<<11 | 1<<10 | (Rn)<<5 | (Rd))
 #define VFDIVS(Sd, Sn, Sm)          EMIT(FDIV_vector(0, 0, Sm, Sn, Sd))
@@ -1450,6 +1451,21 @@ int convert_bitmask(uint64_t bitmask);
 // FMAXNM NaN vs Number: number is picked
 #define FMAXNMD(Dd, Dn, Dm)         EMIT(FMINMAX_scalar(0b01, Dm, 0b10, Dn, Dd))
 
+// Fused Multiply-Add
+#define FMADD_gen(type, o1, Rm, o0, Ra, Rn, Rd) (0b11111<<24 | (type)<<22 | (o1)<<21 | (Rm)<<16 | (o0)<<15 | (Ra)<<10 | (Rn)<<5 | (Rd))
+// scalar Rd = Ra + Rn*Rm
+#define FMADD_32(Sd, Sa, Sn, Sm)    EMIT(FMADD_gen(0b00, 0, Sm, 0, Sa, Sn, Sd))
+// scalar Rd = Ra + Rn*Rm
+#define FMADD_64(Dd, Da, Dn, Dm)    EMIT(FMADD_gen(0b01, 0, Dm, 0, Da, Dn, Dd))
+
+#define FMLA_vector(Q, op, sz, Rm, Rn, Rd)  ((Q)<<30 | 0b01110<<24 | (op)<<23 | (sz)<<22 | 1<<21 | (Rm)<<16 | 0b11001<<11 | 1<<10 | (Rn)<<5 | (Rd))
+// Vd += Vn*Vm
+#define VFMLAS(Vd, Vn, Vm)          EMIT(FMLA_vector(0, 0, 0, Vm, Vn, Vd))
+// Vd += Vn*Vm
+#define VFMLAQS(Vd, Vn, Vm)         EMIT(FMLA_vector(1, 0, 0, Vm, Vn, Vd))
+// Vd += Vn*Vm
+#define VFMLAQD(Vd, Vn, Vm)         EMIT(FMLA_vector(1, 0, 1, Vm, Vn, Vd))
+
 // ZIP / UZP
 #define ZIP_gen(Q, size, Rm, op, Rn, Rd)    ((Q)<<30 | 0b001110<<24 | (size)<<22 | (Rm)<<16 | (op)<<14 | 0b11<<12 | 0b10<<10 | (Rn)<<5 | (Rd))
 #define VZIP1Q_8(Rt, Rn, Rm)        EMIT(ZIP_gen(1, 0b00, Rm, 0, Rn, Rt))
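As a sanity check on the new encodings, the bitfields above can be rebuilt host-side and the resulting words fed to a disassembler. The sketch below is illustrative only (smov_gen and fmadd_gen are local helpers, not box64 code), and the mnemonics in the comments are what the ARMv8 encoding tables say those words should decode to:

    /* Minimal host-side sketch: rebuild the SMOV_gen and FMADD_gen bitfields
       from the macros above and print the words for cross-checking with a
       disassembler. Helper names are illustrative, not part of box64. */
    #include <stdint.h>
    #include <stdio.h>

    static uint32_t smov_gen(int Q, int imm5, int Rn, int Rd) {
        return (Q<<30) | (0b01110000<<21) | (imm5<<16) | (0b01<<13) | (1<<11) | (1<<10) | (Rn<<5) | Rd;
    }
    static uint32_t fmadd_gen(int type, int o1, int Rm, int o0, int Ra, int Rn, int Rd) {
        return (0b11111<<24) | (type<<22) | (o1<<21) | (Rm<<16) | (o0<<15) | (Ra<<10) | (Rn<<5) | Rd;
    }

    int main(void) {
        // SMOVQSto(x0, v2, 1): should read back as "smov x0, v2.s[1]"
        printf("%08x\n", smov_gen(1, (1<<3) | 4, 2, 0));
        // FMADD_64(d0, d1, d2, d3), i.e. d0 = d1 + d2*d3: "fmadd d0, d2, d3, d1"
        printf("%08x\n", fmadd_gen(0b01, 0, 3, 0, 1, 2, 0));
        return 0;
    }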
diff --git a/src/dynarec/arm64/arm64_printer.c b/src/dynarec/arm64/arm64_printer.c
index 98bbcc98..9e8d10ab 100644
--- a/src/dynarec/arm64/arm64_printer.c
+++ b/src/dynarec/arm64/arm64_printer.c
@@ -930,6 +930,22 @@ const char* arm64_print(uint32_t opcode, uintptr_t addr)
             snprintf(buff, sizeof(buff), "UMOV %s, %c%d.%c[%d]", a.Q?Xt[Rd]:Wt[Rd], q, Rn, s, index);
         return buff;
     }
+    // SMOV
+    if(isMask(opcode, "0Q001110000rrrrr001011nnnnnddddd", &a)) {
+        char q = a.Q?'Q':'D';
+        char s = '?';
+        int sz=0;
+        if(a.Q==0 && immr&1) {s='B'; sz=0; }
+        else if(/*a.Q==0 &&*/ (immr&3)==2) {s='H'; sz=1; }
+        else if(/*a.Q==0 &&*/ (immr&7)==4) {s='S'; sz=2; }
+        else if(a.Q==1 && (immr&15)==8) {s='D'; sz=3; }
+        int index = (immr)>>(sz+1);
+        if(sz>2)
+            snprintf(buff, sizeof(buff), "MOV %s, %c%d.%c[%d]", a.Q?Xt[Rd]:Wt[Rd], q, Rn, s, index);
+        else
+            snprintf(buff, sizeof(buff), "SMOV %s, %c%d.%c[%d]", a.Q?Xt[Rd]:Wt[Rd], q, Rn, s, index);
+        return buff;
+    }
     // VEOR
     if(isMask(opcode, "0Q101110001mmmmm000111nnnnnddddd", &a)) {
         char q = a.Q?'Q':'D';
@@ -1303,6 +1319,20 @@ const char* arm64_print(uint32_t opcode, uintptr_t addr)
         snprintf(buff, sizeof(buff), "F%s%s V%d.%d%c, V%d.%d%c, V%d.%d%c", option?"MIN":"MAX", a.Q?"Q":"", Rd, n, s, Rn, n, s, Rm, n, s);
         return buff;
     }
+    // FMADD
+    if(isMask(opcode, "00011111tt0mmmmmoaaaaannnnnddddd", &a)) {
+        char s = (a.t==0b00)?'S':((a.t==0b01)?'D':'?');
+        int n = (a.t==0)?1:2;
+        snprintf(buff, sizeof(buff), "FM%s V%d.%d%c, V%d.%d%c, V%d.%d%c, V%d.%d%c", option?"SUB":"ADD", Rd, n, s, Ra, n, s, Rn, n, s, Rm, n, s);
+    }
+    // FMLA
+    if(isMask(opcode, "0Q001110of1mmmmm110011nnnnnddddd", &a)) {
+        char s = (sf==0)?'S':((sf==1)?'D':'?');
+        int n = (sf==0)?2:1;
+        n *= a.Q?2:1;
+        snprintf(buff, sizeof(buff), "FML%s%s V%d.%d%c, V%d.%d%c, V%d.%d%c", option?"S":"A", a.Q?"Q":"", Rd, n, s, Rn, n, s, Rm, n, s);
+        return buff;
+    }
     // NEG
     if(isMask(opcode, "0Q101110ff100000101110nnnnnddddd", &a)) {
         const char* Y[] = {"8B", "16B", "4H", "8H", "2S", "4S", "??", "2D"};
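The index computation in the SMOV printer (int index = (immr)>>(sz+1);) relies on the imm5 convention used by the emitter macros: the lowest set bit marks the element size and the bits above it hold the lane index. A small sketch of that decode, with a hypothetical helper name (not part of arm64_printer.c):

    /* Sketch of the imm5 decode used by the SMOV/UMOV printers: the lowest
       set bit gives the element size, the remaining high bits the lane index.
       decode_imm5() is a hypothetical helper; imm5 != 0 is assumed. */
    #include <stdio.h>

    static void decode_imm5(int imm5, int* sz, int* index) {
        *sz = 0;
        while (!((imm5 >> *sz) & 1)) (*sz)++;   // position of lowest set bit = log2(element bytes)
        *index = imm5 >> (*sz + 1);             // lane index sits above the size marker
    }

    int main(void) {
        int sz, index;
        decode_imm5((1<<3) | 4, &sz, &index);   // imm5 as built by SMOVQSto(..., index=1)
        printf("sz=%d index=%d\n", sz, index);  // expected: sz=2 (32-bit), index=1
        return 0;
    }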
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c b/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c
index be983b14..12c6e3da 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c
@@ -77,6 +77,52 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             if(!vex.l) YMM0(gd);
             break;
 
+        case 0x08:
+            INST_NAME("VPSIGNB Gx, Vx, Ex");
+            nextop = F8;
+            v1 = fpu_get_scratch(dyn, ninst);
+            v0 = fpu_get_scratch(dyn, ninst);
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETGX_empty_VXEX(q0, q2, q1, 0); } else { GETGY_empty_VYEY(q0, q2, q1); }
+                NEGQ_8(v0, q2);     // get NEG
+                CMLTQ_0_8(v1, q1);  // calculate mask
+                VBIFQ(v0, q2, v1);  // put back positive values
+                CMEQQ_0_8(v1, q1);  // handle case where Ex is 0
+                VBICQ(q0, v0, v1);
+            }
+            if(!vex.l) YMM0(gd);
+            break;
+        case 0x09:
+            INST_NAME("VPSIGNW Gx, Vx, Ex");
+            nextop = F8;
+            v1 = fpu_get_scratch(dyn, ninst);
+            v0 = fpu_get_scratch(dyn, ninst);
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETGX_empty_VXEX(q0, q2, q1, 0); } else { GETGY_empty_VYEY(q0, q2, q1); }
+                NEGQ_16(v0, q2);    // get NEG
+                CMLTQ_0_16(v1, q1); // calculate mask
+                VBIFQ(v0, q2, v1);  // put back positive values
+                CMEQQ_0_16(v1, q1); // handle case where Ex is 0
+                VBICQ(q0, v0, v1);
+            }
+            if(!vex.l) YMM0(gd);
+            break;
+        case 0x0A:
+            INST_NAME("VPSIGND Gx, Vx, Ex");
+            nextop = F8;
+            v1 = fpu_get_scratch(dyn, ninst);
+            v0 = fpu_get_scratch(dyn, ninst);
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETGX_empty_VXEX(q0, q2, q1, 0); } else { GETGY_empty_VYEY(q0, q2, q1); }
+                NEGQ_32(v0, q2);    // get NEG
+                CMLTQ_0_32(v1, q1); // calculate mask
+                VBIFQ(v0, q2, v1);  // put back positive values
+                CMEQQ_0_32(v1, q1); // handle case where Ex is 0
+                VBICQ(q0, v0, v1);
+            }
+            if(!vex.l) YMM0(gd);
+            break;
+
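For reference, the NEG/CMLT/BIF/CMEQ/BIC sequence in the three cases above reproduces the PSIGN semantics: lanes with a negative selector get the negated value, zero lanes are cleared, positive lanes pass through. A scalar sketch of the byte variant (illustrative only, not box64 code):

    /* Scalar reference for VPSIGNB: apply the sign of ex[] to vx[], per byte.
       Widths of 16/32 bits give the W/D variants. Illustrative sketch only. */
    #include <stdint.h>

    static void psignb_ref(int8_t dst[16], const int8_t vx[16], const int8_t ex[16]) {
        for (int i = 0; i < 16; ++i) {
            if (ex[i] < 0)       dst[i] = (int8_t)-vx[i]; // CMLT mask keeps the NEG result
            else if (ex[i] == 0) dst[i] = 0;              // CMEQ mask, cleared by BIC
            else                 dst[i] = vx[i];          // BIF puts the positive lanes back
        }
    }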
         case 0x18:
             INST_NAME("VBROADCASTSS Gx, Ex");
             nextop = F8;
@@ -95,6 +141,34 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             } else YMM0(gd);
             break;
 
+        case 0x1C:
+            INST_NAME("VPABSB Gx, Ex");
+            nextop = F8;
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETGX_empty_EX(v0, v1, 0); } else { GETGY_empty_EY(v0, v1); }
+                ABSQ_8(v0, v1);
+            }
+            if(!vex.l) YMM0(gd);
+            break;
+        case 0x1D:
+            INST_NAME("VPABSW Gx, Ex");
+            nextop = F8;
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETGX_empty_EX(v0, v1, 0); } else { GETGY_empty_EY(v0, v1); }
+                ABSQ_16(v0, v1);
+            }
+            if(!vex.l) YMM0(gd);
+            break;
+        case 0x1E:
+            INST_NAME("VPABSD Gx, Ex");
+            nextop = F8;
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETGX_empty_EX(v0, v1, 0); } else { GETGY_empty_EY(v0, v1); }
+                ABSQ_32(v0, v1);
+            }
+            if(!vex.l) YMM0(gd);
+            break;
+
         case 0x2C:
             INST_NAME("VMASKMOVPS Gx, Vx, Ex");
             nextop = F8;
@@ -162,6 +236,232 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
                 }
             }
             break;
+        case 0x30:
+            INST_NAME("VPMOVZXBW Gx, Ex");
+            nextop = F8;
+            GETG;
+            if(vex.l) { GETEX(q1, 0, 0); } else { GETEX64(q1, 0, 0); YMM0(gd); }
+            GETGX_empty(q0);
+            if(vex.l) {
+                GETGY_empty(v0, -1, -1, -1);
+                UXTL2_8(v0, q1);
+            }
+            UXTL_8(q0, q1);
+            break;
+        case 0x31:
+            INST_NAME("VPMOVZXBD Gx, Ex");
+            nextop = F8;
+            GETG;
+            if(vex.l) { GETEX64(q1, 0, 0); } else { GETEX32(q1, 0, 0); YMM0(gd); }
+            GETGX_empty(q0);
+            if(vex.l) {
+                GETGY_empty(v0, -1, -1, -1);
+                UXTL_8(v0, q1);
+                UXTL2_16(v0, v0);
+            }
+            UXTL_8(q0, q1); 
+            UXTL_16(q0, q0);
+            break;
+        case 0x32:
+            INST_NAME("VPMOVZXBQ Gx, Ex");
+            nextop = F8;
+            GETG;
+            if(vex.l) { GETEX32(q1, 0, 0); } else { GETEX16(q1, 0, 0); YMM0(gd); }
+            GETGX_empty(q0);
+            if(vex.l) {
+                GETGY_empty(v0, -1, -1, -1);
+                UXTL_8(v0, q1);
+                UXTL_16(v0, v0);
+                UXTL2_32(v0, v0);
+            }
+            UXTL_8(q0, q1);
+            UXTL_16(q0, q0);
+            UXTL_32(q0, q0);
+            break;
+        case 0x33:
+            INST_NAME("VPMOVZXWD Gx, Ex");
+            nextop = F8;
+            GETG;
+            if(vex.l) { GETEX(q1, 0, 0); } else { GETEX64(q1, 0, 0); YMM0(gd); }
+            GETGX_empty(q0);
+            if(vex.l) {
+                GETGY_empty(v0, -1, -1, -1);
+                UXTL2_16(v0, q1);
+            }
+            UXTL_16(q0, q1);
+            break;
+        case 0x34:
+            INST_NAME("VPMOVZXWQ Gx, Ex");
+            nextop = F8;
+            GETG;
+            if(vex.l) { GETEX64(q1, 0, 0); } else { GETEX32(q1, 0, 0); YMM0(gd); }
+            GETGX_empty(q0);
+            if(vex.l) {
+                GETGY_empty(v0, -1, -1, -1);
+                UXTL_16(v0, q1);
+                UXTL2_32(v0, v0);
+            }
+            UXTL_16(q0, q1);
+            UXTL_32(q0, q0);
+            break;
+        case 0x35:
+            INST_NAME("VPMOVZXDQ Gx, Ex");
+            nextop = F8;
+            GETG;
+            if(vex.l) { GETEX(q1, 0, 0); } else { GETEX64(q1, 0, 0); YMM0(gd); }
+            GETGX_empty(q0);
+            if(vex.l) {
+                GETGY_empty(v0, -1, -1, -1);
+                UXTL2_32(v0, q1);
+            }
+            UXTL_32(q0, q1);
+            break;
+
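The UXTL/UXTL2 chains above perform the zero-extension in stages (8->16, 16->32, 32->64), with UXTL2 taking the upper half of the source for the ymm lane. A scalar sketch of the BD form (illustrative only):

    /* Scalar reference for VPMOVZXBD (xmm form): 4 bytes widened to 4 dwords.
       The ymm form widens 8 bytes, the upper 4 going through UXTL2 as above. */
    #include <stdint.h>

    static void vpmovzxbd_ref(uint32_t dst[4], const uint8_t src[4]) {
        for (int i = 0; i < 4; ++i)
            dst[i] = (uint32_t)src[i];   // zero-extend 8 -> 32 bits
    }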
+        case 0x58:
+            INST_NAME("VPBROADCASTD Gx, Ex");
+            nextop = F8;
+            if(MODREG) {
+                GETGX_empty_EX(v0, v1, 0);
+                VDUPQ_32(v0, v1, 0);
+            } else {
+                GETGX_empty(v0);
+                addr = geted(dyn, addr, ninst, nextop, &ed, x3, &fixedaddress, 0, 0, 0, rex, NULL, 0, 0);
+                VLDQ1R_32(v0, ed);
+            }
+            if(vex.l) {
+                GETGY_empty(q0, -1, -1, -1);
+                VMOVQ(q0, v0);
+            } else YMM0(gd);
+            break;
+        case 0x59:
+            INST_NAME("VPBROADCASTQ Gx, Ex");
+            nextop = F8;
+            if(MODREG) {
+                GETGX_empty_EX(v0, v1, 0);
+                VDUPQ_64(v0, v1, 0);
+            } else {
+                GETGX_empty(v0);
+                addr = geted(dyn, addr, ninst, nextop, &ed, x3, &fixedaddress, 0, 0, 0, rex, NULL, 0, 0);
+                VLDQ1R_64(v0, ed);
+            }
+            if(vex.l) {
+                GETGY_empty(q0, -1, -1, -1);
+                VMOVQ(q0, v0);
+            } else YMM0(gd);
+            break;
+
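Both broadcast cases produce the same result by different means: DUP from lane 0 when the source is a register, LD1R (load one element and replicate) when it is memory, which saves a separate load plus duplicate. A scalar view of VPBROADCASTD (illustrative only):

    /* Scalar reference for VPBROADCASTD (xmm form): replicate one dword. */
    #include <stdint.h>

    static void vpbroadcastd_ref(uint32_t dst[4], uint32_t src) {
        for (int i = 0; i < 4; ++i)
            dst[i] = src;   // same element in every lane; the ymm form copies the high half too
    }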
+        case 0x90:
+        case 0x92:
+            if(opcode==0x90) {INST_NAME("VPGATHERDD Gx, VSIB, Vx");} else {INST_NAME("VGATHERDPD/VGATHERDPS Gx, VSIB, Vx");}
+            nextop = F8;
+            if(((nextop&7)!=4) || MODREG) {UDF(0);}
+            GETG;
+            u8 = F8; //SIB
+            eb1 = xRAX + (u8&0x7)+(rex.b<<3); // base
+            eb2 = ((u8>>3)&7)+(rex.x<<3); // index
+            if(nextop&0x40)
+                i32 = F8S;
+            else if(nextop&0x80)
+                i32 = F32S;
+            else
+                i32 = 0;
+            if(!i32) ed = eb1;
+            else {
+                ed = x3;
+                if(i32>0 && i32<4096) ADDx_U12(ed, eb1, i32);
+                else if(i32<0 && i32>-4096) SUBx_U12(ed, eb1, -i32);
+                else {MOV64x(ed, i32); ADDx_REG(ed, ed, eb1);}
+            }
+            // ed is base
+            wb1 = u8>>6;    // scale
+            if(wb1) q1 = fpu_get_scratch(dyn, ninst);
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) {
+                    v0 = sse_get_reg(dyn, ninst, x1, gd, 1);
+                    v2 = sse_get_reg(dyn, ninst, x1, vex.v, 1);
+                    v1 = sse_get_reg(dyn, ninst, x1, eb2, 0);
+                } else {
+                    v0 = ymm_get_reg(dyn, ninst, x1, gd, 1, vex.v, eb2, -1);
+                    v2 = ymm_get_reg(dyn, ninst, x1, vex.v, 1, gd, eb2, -1);
+                    v1 = ymm_get_reg(dyn, ninst, x1, eb2, 0, gd, vex.v, -1);
+                }
+                // prepare mask
+                if(rex.w) VSSHRQ_64(v2, v2, 63); else VSSHRQ_32(v2, v2, 31);    // prescale the values
+                if(wb1) VSHLQ_32(q1, v1, wb1); else q1 = v1;
+                // slow gather, not much choice here...
+                if(rex.w) for(int i=0; i<2; ++i) {
+                    VMOVQDto(x4, v2, i);
+                    TBZ(x4, 0, 4+4*4);
+                    SMOVQSto(x4, q1, i);
+                    ADDx_REG(x4, x4, ed);
+                    VLD1_64(v0, i, x4);
+                    VMOVQDfrom(v2, i, xZR);
+                } else for(int i=0; i<4; ++i) {
+                    VMOVSto(x4, v2, i);
+                    TBZ(x4, 0, 4+4*4);
+                    SMOVQSto(x4, q1, i);
+                    ADDx_REG(x4, x4, ed);
+                    VLD1_32(v0, i, x4);
+                    VMOVQSfrom(v2, i, xZR);
+                }
+            }
+            if(!vex.l) {YMM0(gd); YMM0(vex.v);}
+            break;
+
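The per-element loop above follows the x86 gather contract: a lane is loaded from base plus the sign-extended, scaled index only when the sign bit of its mask element is set, and the mask register reads as zero once the instruction completes (the VSSHR prescale already zeroed the non-selected lanes). A scalar sketch of the 32-bit form with illustrative names:

    /* Scalar reference for VPGATHERDD (xmm form): masked per-lane gather.
       base/idx/scale_shift mirror the decoded VSIB fields; sketch only. */
    #include <stdint.h>
    #include <string.h>

    static void vpgatherdd_ref(uint32_t dst[4], int32_t mask[4],
                               const int32_t idx[4], const uint8_t* base, int scale_shift) {
        for (int i = 0; i < 4; ++i) {
            if (mask[i] < 0) {                                 // sign bit set: gather this lane
                int64_t off = (int64_t)idx[i] << scale_shift;  // sign-extended, scaled index
                memcpy(&dst[i], base + off, sizeof(uint32_t)); // unaligned-safe load
            }
            mask[i] = 0;                                       // whole mask ends up cleared
        }
    }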
+        case 0xA8:
+            INST_NAME("VFMADD213PS/D Gx, Vx, Ex");
+            nextop = F8;
+            if(MODREG) q0 = fpu_get_scratch(dyn, ninst);
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETGX_VXEX(v0, v2, v1, 0); } else { GETGY_VYEY(v0, v2, v1); }
+                if(MODREG) VMOVQ(q0, v1); else q0 = v1;
+                if(rex.w) VFMLAQD(q0, v0, v2); else VFMLAQS(q0, v0, v2);
+                VMOVQ(v0, q0);
+            }
+            if(!vex.l) YMM0(gd);
+            break;
+        case 0xA9:
+            INST_NAME("VFMADD213SS/D Gx, Vx, Ex");
+            nextop = F8;
+            GETGX(v0, 1);
+            GETVX(v2, 0);
+            if(rex.w) {GETEXSD(v1, 0, 0);} else {GETEXSS(v1, 0, 0);}
+            q0 = fpu_get_scratch(dyn, ninst);
+            if(rex.w) {
+                FMADD_64(q0, v1, v0, v2);
+                VMOVeD(v0, 0, q0, 0);
+            } else {
+                FMADD_32(q0, v1, v0, v2);
+                VMOVeS(v0, 0, q0, 0);
+            }
+            YMM0(gd);
+            break;
+
+        case 0xB8:
+            INST_NAME("VFMADD231PS/D Gx, Vx, Ex");
+            nextop = F8;
+            for(int l=0; l<1+vex.l; ++l) {
+                if(!l) { GETGX_VXEX(v0, v2, v1, 0); } else { GETGY_VYEY(v0, v2, v1); }
+                if(rex.w) VFMLAQD(v0, v1, v2); else VFMLAQS(v0, v1, v2);
+            }
+            if(!vex.l) YMM0(gd);
+            break;
+        case 0xB9:
+            INST_NAME("VFMADD231SS/D Gx, Vx, Ex");
+            nextop = F8;
+            GETGX(v0, 1);
+            GETVX(v2, 0);
+            if(rex.w) {GETEXSD(v1, 0, 0);} else {GETEXSS(v1, 0, 0);}
+            q0 = fpu_get_scratch(dyn, ninst);
+            if(rex.w) {
+                FMADD_64(q0, v0, v1, v2);
+                VMOVeD(v0, 0, q0, 0);
+            } else {
+                FMADD_32(q0, v0, v1, v2);
+                VMOVeS(v0, 0, q0, 0);
+            }
+            YMM0(gd);
+            break;
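The 213 and 231 suffixes only change the operand order: 213 computes Gx = Vx*Gx + Ex, which is why case 0xA8 first copies Ex into a scratch accumulator, while 231 computes Gx = Gx + Vx*Ex and the FMLA can accumulate into Gx directly. A scalar view (illustrative only):

    /* Scalar view of the two FMA orderings handled above (illustrative only). */
    static inline double fmadd213(double dst, double vsrc, double esrc) { return vsrc*dst + esrc; }
    static inline double fmadd231(double dst, double vsrc, double esrc) { return dst + vsrc*esrc; }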
 
         case 0xDC:
             INST_NAME("VAESENC Gx, Vx, Ex");  // AES-NI
diff --git a/src/dynarec/arm64/dynarec_arm64_helper.h b/src/dynarec/arm64/dynarec_arm64_helper.h
index 7471ba55..ed593ca7 100644
--- a/src/dynarec/arm64/dynarec_arm64_helper.h
+++ b/src/dynarec/arm64/dynarec_arm64_helper.h
@@ -506,6 +506,12 @@
     GETEX_Y(ex, 1, D);                      \
     GETGX(gx, 0)
 
+// Get GX and non-written VX and EX
+#define GETGX_VXEX(gx, vx, ex, D)            \
+    GETVX(vx, 0);                           \
+    GETEX_Y(ex, 0, D);                      \
+    GETGX(gx, 1)
+
 #define GETGXVXEX_empty(gx, vx, ex, D)      \
     GETVX(vx, 0);                           \
     GETGX(gx, 0);                           \
@@ -529,6 +535,15 @@
         VLDR128_U12(ey, ed, fixedaddress+16);                                                   \
     gy = ymm_get_reg(dyn, ninst, x1, gd, 0, vex.v, (MODREG)?((nextop&7)+(rex.b<<3)):-1, -1)
 
+// Get GY and non-written VY and EY
+#define GETGY_VYEY(gy, vy, ey)                                                                  \
+    vy = ymm_get_reg(dyn, ninst, x1, vex.v, 0, gd, (MODREG)?((nextop&7)+(rex.b<<3)):-1, -1);    \
+    if(MODREG)                                                                                  \
+        ey = ymm_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0, gd, vex.v, -1);              \
+    else                                                                                        \
+        VLDR128_U12(ey, ed, fixedaddress+16);                                                   \
+    gy = ymm_get_reg(dyn, ninst, x1, gd, 1, vex.v, (MODREG)?((nextop&7)+(rex.b<<3)):-1, -1)
+
 // Get empty EY and non-writen VY and GY
 #define GETGYVYEY_empty(gy, vy, ey)                                                             \
     vy = ymm_get_reg(dyn, ninst, x1, vex.v, 0, gd, (MODREG)?((nextop&7)+(rex.b<<3)):-1, -1);    \