about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
authorptitSeb <sebastien.chev@gmail.com>2024-05-30 15:17:13 +0200
committerptitSeb <sebastien.chev@gmail.com>2024-05-30 15:17:13 +0200
commitf0ea459c57b401718d5024c6907a67dce38457c4 (patch)
treeda83ce71c00e3a2166e71a3b15a34c05c9f896c6 /src
parentf3d733c3ff27c7127366c65f6ee898a2aeb50fbd (diff)
downloadbox64-f0ea459c57b401718d5024c6907a67dce38457c4.tar.gz
box64-f0ea459c57b401718d5024c6907a67dce38457c4.zip
[ARM64_DYNAREC] That first avx opcode now is 256bits enabled
Diffstat (limited to 'src')
-rw-r--r--src/dynarec/arm64/dynarec_arm64_0f.c134
-rw-r--r--src/dynarec/arm64/dynarec_arm64_660f.c192
-rw-r--r--src/dynarec/arm64/dynarec_arm64_6664.c2
-rw-r--r--src/dynarec/arm64/dynarec_arm64_67.c2
-rw-r--r--src/dynarec/arm64/dynarec_arm64_avx_0f.c39
-rw-r--r--src/dynarec/arm64/dynarec_arm64_d8.c16
-rw-r--r--src/dynarec/arm64/dynarec_arm64_d9.c6
-rw-r--r--src/dynarec/arm64/dynarec_arm64_da.c16
-rw-r--r--src/dynarec/arm64/dynarec_arm64_db.c6
-rw-r--r--src/dynarec/arm64/dynarec_arm64_dc.c16
-rw-r--r--src/dynarec/arm64/dynarec_arm64_dd.c2
-rw-r--r--src/dynarec/arm64/dynarec_arm64_de.c16
-rw-r--r--src/dynarec/arm64/dynarec_arm64_df.c8
-rw-r--r--src/dynarec/arm64/dynarec_arm64_f20f.c52
-rw-r--r--src/dynarec/arm64/dynarec_arm64_f30f.c38
-rw-r--r--src/dynarec/arm64/dynarec_arm64_functions.c122
-rw-r--r--src/dynarec/arm64/dynarec_arm64_functions.h8
-rw-r--r--src/dynarec/arm64/dynarec_arm64_helper.c166
-rw-r--r--src/dynarec/arm64/dynarec_arm64_helper.h30
-rw-r--r--src/dynarec/arm64/dynarec_arm64_private.h2
-rw-r--r--src/dynarec/dynarec_native.c2
-rw-r--r--src/dynarec/dynarec_native_pass.c4
-rw-r--r--src/dynarec/la64/dynarec_la64_helper.h2
-rw-r--r--src/dynarec/la64/dynarec_la64_private.h2
-rw-r--r--src/dynarec/rv64/dynarec_rv64_helper.h10
-rw-r--r--src/dynarec/rv64/dynarec_rv64_private.h2
26 files changed, 582 insertions, 313 deletions
diff --git a/src/dynarec/arm64/dynarec_arm64_0f.c b/src/dynarec/arm64/dynarec_arm64_0f.c
index 616085a8..8773b054 100644
--- a/src/dynarec/arm64/dynarec_arm64_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_0f.c
@@ -387,7 +387,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             nextop = F8;

             GETGX(v0, 1);

             GETEM(q1, 0);

-            d0 = fpu_get_scratch(dyn);

+            d0 = fpu_get_scratch(dyn, ninst);

             u8 = sse_setround(dyn, ninst, x1, x2, x3);

             SCVTFS(d0, q1);

             x87_restoreround(dyn, ninst, u8);

@@ -423,7 +423,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     BFCw(x5, FPSR_IOC, 1);   // reset IOC bit

                     MSR_fpsr(x5);

                     ORRw_mask(x2, xZR, 1, 0);    //0x80000000

-                    d0 = fpu_get_scratch(dyn);

+                    d0 = fpu_get_scratch(dyn, ninst);

                     for (int i=0; i<2; ++i) {

                         BFCw(x5, FPSR_IOC, 1);   // reset IOC bit

                         if (i) {

@@ -461,7 +461,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     BFCw(x5, FPSR_IOC, 1);   // reset IOC bit

                     MSR_fpsr(x5);

                     ORRw_mask(x2, xZR, 1, 0);    //0x80000000

-                    d0 = fpu_get_scratch(dyn);

+                    d0 = fpu_get_scratch(dyn, ninst);

                     for (int i=0; i<2; ++i) {

                         BFCw(x5, FPSR_IOC, 1);   // reset IOC bit

                         if (i) {

@@ -516,7 +516,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     nextop = F8;

                     GETGM(q0);

                     GETEM(q1, 0);

-                    d0 = fpu_get_scratch(dyn);

+                    d0 = fpu_get_scratch(dyn, ninst);

                     MOVI_8(d0, 0b10000111);

                     VAND(d0, d0, q1);  // mask the index

                     VTBL1_8(q0, q0, d0);

@@ -540,7 +540,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     nextop = F8;

                     GETGM(q0);

                     GETEM(q1, 0);

-                    v0 = fpu_get_scratch(dyn);

+                    v0 = fpu_get_scratch(dyn, ninst);

                     VUZP1_16(v0, q0, q1);

                     VUZP2_16(q0, q0, q1);

                     SQADD_16(q0, q0, v0);

@@ -550,8 +550,8 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     nextop = F8;

                     GETGM(q0);

                     GETEM(q1, 0);

-                    v0 = fpu_get_scratch(dyn);

-                    v1 = fpu_get_scratch(dyn);

+                    v0 = fpu_get_scratch(dyn, ninst);

+                    v1 = fpu_get_scratch(dyn, ninst);

                     UXTL_8(v0, q0);   // this is unsigned, so 0 extended

                     SXTL_8(v1, q1);   // this is signed

                     VMULQ_16(v0, v0, v1);

@@ -563,7 +563,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     nextop = F8;

                     GETGM(q0);

                     GETEM(q1, 0);

-                    v0 = fpu_get_scratch(dyn);

+                    v0 = fpu_get_scratch(dyn, ninst);

                     VUZP1_16(v0, q0, q1);

                     VUZP2_16(q0, q0, q1);

                     VSUB_16(q0, v0, q0);

@@ -573,7 +573,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     nextop = F8;

                     GETGM(q0);

                     GETEM(q1, 0);

-                    v0 = fpu_get_scratch(dyn);

+                    v0 = fpu_get_scratch(dyn, ninst);

                     VUZP1_32(v0, q0, q1);

                     VUZP2_32(q0, q0, q1);

                     VSUB_32(q0, v0, q0);

@@ -583,7 +583,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     nextop = F8;

                     GETGM(q0);

                     GETEM(q1, 0);

-                    v0 = fpu_get_scratch(dyn);

+                    v0 = fpu_get_scratch(dyn, ninst);

                     VUZP1_16(v0, q0, q1);

                     VUZP2_16(q0, q0, q1);

                     SQSUB_16(q0, v0, q0);

@@ -593,8 +593,8 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     nextop = F8;

                     GETGM(q0);

                     GETEM(q1, 0);

-                    v0 = fpu_get_scratch(dyn);

-                    v1 = fpu_get_scratch(dyn);

+                    v0 = fpu_get_scratch(dyn, ninst);

+                    v1 = fpu_get_scratch(dyn, ninst);

                     CMGT_0_8(v0, q1);

                     VAND(v0, v0, q0);

                     CMLT_0_8(v1, q1);

@@ -606,8 +606,8 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     nextop = F8;

                     GETGM(q0);

                     GETEM(q1, 0);

-                    v0 = fpu_get_scratch(dyn);

-                    v1 = fpu_get_scratch(dyn);

+                    v0 = fpu_get_scratch(dyn, ninst);

+                    v1 = fpu_get_scratch(dyn, ninst);

                     CMGT_0_16(v0, q1);

                     VAND(v0, v0, q0);

                     CMLT_0_16(v1, q1);

@@ -619,8 +619,8 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     nextop = F8;

                     GETGM(q0);

                     GETEM(q1, 0);

-                    v0 = fpu_get_scratch(dyn);

-                    v1 = fpu_get_scratch(dyn);

+                    v0 = fpu_get_scratch(dyn, ninst);

+                    v1 = fpu_get_scratch(dyn, ninst);

                     CMGT_0_32(v0, q1);

                     VAND(v0, v0, q0);

                     CMLT_0_32(v1, q1);

@@ -661,10 +661,10 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     nextop = F8;

                     GETGX(q0, 1);

                     GETEX(q1, 0, 0);

-                    v0 = fpu_get_scratch(dyn);

+                    v0 = fpu_get_scratch(dyn, ninst);

                     VEORQ(v0, v0, v0);

                     if(arm64_sha1) {

-                        v1 = fpu_get_scratch(dyn);

+                        v1 = fpu_get_scratch(dyn, ninst);

                         VMOVeS(v1, 0, q0, 3);

                         SHA1H(v1, v1);

                         VMOVeS(v0, 3, v1, 0);

@@ -680,7 +680,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     nextop = F8;

                     GETGX(q0, 1);

                     GETEX(q1, 0, 0);

-                    v0 = fpu_get_scratch(dyn);

+                    v0 = fpu_get_scratch(dyn, ninst);

                     VEXTQ_8(v0, q1, q0, 8);

                     VEORQ(q0, q0, v0);

                     break;

@@ -696,7 +696,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                             if(q0==q1)

                                 v0 = q0;

                             else {

-                                v0 = fpu_get_scratch(dyn);

+                                v0 = fpu_get_scratch(dyn, ninst);

                                 VEXTQ_8(v0, q1, q1, 8);

                                 VREV64Q_32(v0, v0);

                             }

@@ -733,10 +733,10 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                         GETGX(q0, 1);

                         GETEX(q1, 0, 0);

                         d0 = sse_get_reg(dyn, ninst, x1, 0, 0);

-                        v0 = fpu_get_scratch(dyn);

-                        d1 = fpu_get_scratch(dyn);

+                        v0 = fpu_get_scratch(dyn, ninst);

+                        d1 = fpu_get_scratch(dyn, ninst);

                         if(MODREG) {

-                            v1 = fpu_get_scratch(dyn);

+                            v1 = fpu_get_scratch(dyn, ninst);

                         } else

                             v1 = q1;

                         VREV64Q_32(q0, q0);

@@ -803,9 +803,9 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     if(arm64_sha2) {

                         GETGX(q0, 1);

                         GETEX(q1, 0, 0);

-                        v0 = fpu_get_scratch(dyn);

-                        v1 = fpu_get_scratch(dyn);

-                        d0 = fpu_get_scratch(dyn);

+                        v0 = fpu_get_scratch(dyn, ninst);

+                        v1 = fpu_get_scratch(dyn, ninst);

+                        d0 = fpu_get_scratch(dyn, ninst);

                         VEORQ(v1, v1, v1);

                         VMOVQ(v0, q0);

                         SHA256SU1(v0, v1, q1);  // low v0 are ok and also need to be feed again SHA256SU1 to get the high part

@@ -876,7 +876,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                     if(u8>15) {

                         VEOR(q0, q0, q0);

                     } else if(u8>7) {

-                        d0 = fpu_get_scratch(dyn);

+                        d0 = fpu_get_scratch(dyn, ninst);

                         VEOR(d0, d0, d0);

                         VEXT_8(q0, q0, d0, u8-8);

                     } else {

@@ -891,14 +891,14 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                         GETGX(q0, 1);

                         GETEX(q1, 0, 1);

                         u8 = F8&3;

-                        d0 = fpu_get_scratch(dyn);

-                        d1 = fpu_get_scratch(dyn);

-                        v0 = fpu_get_scratch(dyn);

+                        d0 = fpu_get_scratch(dyn, ninst);

+                        d1 = fpu_get_scratch(dyn, ninst);

+                        v0 = fpu_get_scratch(dyn, ninst);

                         VEXTQ_8(v0, q0, q0, 8);

                         VREV64Q_32(v0, v0);

                         VEORQ(d1, d1, d1);

                         if(MODREG) {

-                            v1 = fpu_get_scratch(dyn);

+                            v1 = fpu_get_scratch(dyn, ninst);

                         } else

                             v1 = q1;

                         if(v1!=v0) {

@@ -1024,10 +1024,10 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             SKIPTEST(x1);

             GETEX(q0, 0, 0);

             GETGX_empty(q1);

-            v0 = fpu_get_scratch(dyn);

+            v0 = fpu_get_scratch(dyn, ninst);

             // more precise

             if(q1==q0)

-                v1 = fpu_get_scratch(dyn);

+                v1 = fpu_get_scratch(dyn, ninst);

             else

                 v1 = q1;

             VFRSQRTEQS(v0, q0);

@@ -1042,10 +1042,10 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             GETEX(q0, 0, 0);

             GETGX_empty(q1);

             if(q0 == q1)

-                v1 = fpu_get_scratch(dyn);

+                v1 = fpu_get_scratch(dyn, ninst);

             else

                 v1 = q1;

-            v0 = fpu_get_scratch(dyn);

+            v0 = fpu_get_scratch(dyn, ninst);

             VFRECPEQS(v0, q0);

             VFRECPSQS(v1, v0, q0);

             VFMULQS(q1, v0, v1);

@@ -1128,7 +1128,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             // FMIN/FMAX wll not copy the value if v0[x] is NaN

             // but x86 will copy if either v0[x] or v1[x] is NaN, so lets force a copy if source is NaN

             if(!box64_dynarec_fastnan && v0!=v1) {

-                q0 = fpu_get_scratch(dyn);

+                q0 = fpu_get_scratch(dyn, ninst);

                 VFCMEQQS(q0, v0, v0);   // 0 is NaN, 1 is not NaN, so MASK for NaN

                 VANDQ(v0, v0, q0);

                 VBICQ(q0, v1, q0);

@@ -1151,7 +1151,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             // FMIN/FMAX wll not copy the value if v0[x] is NaN

             // but x86 will copy if either v0[x] or v1[x] is NaN, so lets force a copy if source is NaN

             if(!box64_dynarec_fastnan && v0!=v1) {

-                q0 = fpu_get_scratch(dyn);

+                q0 = fpu_get_scratch(dyn, ninst);

                 VFCMEQQS(q0, v0, v0);   // 0 is NaN, 1 is not NaN, so MASK for NaN

                 VANDQ(v0, v0, q0);

                 VBICQ(q0, v1, q0);

@@ -1185,7 +1185,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             nextop = F8;

             GETGM(d0);

             GETEM(d1, 0);

-            q0 = fpu_get_scratch(dyn);

+            q0 = fpu_get_scratch(dyn, ninst);

             VMOVeD(q0, 0, d0, 0);

             VMOVeD(q0, 1, d1, 0);

             SQXTN_8(d0, q0);

@@ -1215,7 +1215,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             INST_NAME("PACKUSWB Gm, Em");

             nextop = F8;

             GETGM(v0);

-            q0 = fpu_get_scratch(dyn);

+            q0 = fpu_get_scratch(dyn, ninst);

             VMOVeD(q0, 0, v0, 0);

             if(MODREG) {

                 v1 = mmx_get_reg(dyn, ninst, x1, x2, x3, (nextop&7));

@@ -1251,7 +1251,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             INST_NAME("PACKSSDW Gm,Em");

             nextop = F8;

             GETGM(v0);

-            q0 = fpu_get_scratch(dyn);

+            q0 = fpu_get_scratch(dyn, ninst);

             VMOVeD(q0, 0, v0, 0);

             if(MODREG) {

                 GETEM(v1, 0);

@@ -1338,7 +1338,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                         (4)|(5<<8),

                         (6)|(7<<8)

                     };

-                    d0 = fpu_get_scratch(dyn);

+                    d0 = fpu_get_scratch(dyn, ninst);

                     tmp64u = swp[(u8>>(0*2))&3] | (swp[(u8>>(1*2))&3]<<16);

                     tmp64u |= (swp[(u8>>(2*2))&3]<<32) | (swp[(u8>>(3*2))&3]<<48);

                     MOV64x(x2, tmp64u);

@@ -2239,7 +2239,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 case 2: VFCMGEQS(v0, v1, v0); break;   // Less or equal

                 case 3: VFCMEQQS(v0, v0, v0);

                         if(v0!=v1) {

-                            q0 = fpu_get_scratch(dyn);

+                            q0 = fpu_get_scratch(dyn, ninst);

                             VFCMEQQS(q0, v1, v1);

                             VANDQ(v0, v0, q0);

                         }

@@ -2250,7 +2250,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 case 6: VFCMGEQS(v0, v1, v0); VMVNQ(v0, v0); break;   // Greater or unordered

                 case 7: VFCMEQQS(v0, v0, v0);

                         if(v0!=v1) {

-                            q0 = fpu_get_scratch(dyn);

+                            q0 = fpu_get_scratch(dyn, ninst);

                             VFCMEQQS(q0, v1, v1);

                             VANDQ(v0, v0, q0);

                         }

@@ -2313,7 +2313,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             } else if(v0==v1 && (u8==0xe5)) {   // easy special case

                 VMOVeS(v0, 0, v0, 1);

             } else {

-                d0 = fpu_get_scratch(dyn);

+                d0 = fpu_get_scratch(dyn, ninst);

                 // first two elements from Gx

                 for(int i=0; i<2; ++i) {

                     VMOVeS(d0, i, v0, (u8>>(i*2))&3);

@@ -2390,12 +2390,12 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             GETGM(d0);

             GETEM(d1, 0);

             if(MODREG) {

-                q0 = fpu_get_scratch(dyn);

+                q0 = fpu_get_scratch(dyn, ninst);

             }

             else {

                 q0 = d1;

             }

-            q1 = fpu_get_scratch(dyn);

+            q1 = fpu_get_scratch(dyn, ninst);

             VMOVBto(x1, d1, 0);

             MOVZw(x2, 16);

             SUBSw_REG(x2, x2, x1);

@@ -2415,12 +2415,12 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             GETGM(d0);

             GETEM(d1, 0);

             if(MODREG) {

-                q0 = fpu_get_scratch(dyn);

+                q0 = fpu_get_scratch(dyn, ninst);

             }

             else {

                 q0 = d1;

             }

-            q1 = fpu_get_scratch(dyn);

+            q1 = fpu_get_scratch(dyn, ninst);

             VMOVBto(x1, d1, 0);

             MOVZw(x2, 32);

             SUBSw_REG(x2, x2, x1);

@@ -2439,7 +2439,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             nextop = F8;

             GETGM(d0);

             GETEM(d1, 0);

-            v0 = fpu_get_scratch(dyn);

+            v0 = fpu_get_scratch(dyn, ninst);

             //MOVI_64(v0, 64);  not 64!

             MOV32w(x1, 64);

             VMOVQDfrom(v0, 0, x1);

@@ -2465,9 +2465,9 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
         case 0xD7:

             nextop = F8;

             INST_NAME("PMOVMSKB Gd, Em");

-            v0 = fpu_get_scratch(dyn);

-            v1 = fpu_get_scratch(dyn);

-            q1 = fpu_get_scratch(dyn);

+            v0 = fpu_get_scratch(dyn, ninst);

+            v1 = fpu_get_scratch(dyn, ninst);

+            q1 = fpu_get_scratch(dyn, ninst);

             GETEM(q0, 0);

             GETGD;

             TABLE64(x1, (uintptr_t)&mask_shift8);

@@ -2546,8 +2546,8 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             nextop = F8;

             GETGM(d0);

             GETEM(d1, 0);

-            v0 = fpu_get_scratch(dyn);

-            v1 = fpu_get_scratch(dyn);

+            v0 = fpu_get_scratch(dyn, ninst);

+            v1 = fpu_get_scratch(dyn, ninst);

             UQXTN_32(v0, d1);

             MOVI_32(v1, 15);

             UMIN_32(v0, v0, v1);    // limit to 0 .. +15 values

@@ -2560,8 +2560,8 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             nextop = F8;

             GETGM(d0);

             GETEM(d1, 0);

-            v0 = fpu_get_scratch(dyn);

-            v1 = fpu_get_scratch(dyn);

+            v0 = fpu_get_scratch(dyn, ninst);

+            v1 = fpu_get_scratch(dyn, ninst);

             UQXTN_32(v0, d1);

             MOVI_32(v1, 31);

             UMIN_32(v0, v0, v1);        // limit to 0 .. +31 values

@@ -2582,7 +2582,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             nextop = F8;

             GETGM(v0);

             GETEM(v1, 0);

-            q0 = fpu_get_scratch(dyn);

+            q0 = fpu_get_scratch(dyn, ninst);

             VSMULL_16(q0, v0, v1);

             SQSHRN_16(v0, q0, 16);

             break;

@@ -2667,7 +2667,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             nextop = F8;

             GETGM(d0);

             GETEM(d1, 0);

-            v0 = fpu_get_scratch(dyn);

+            v0 = fpu_get_scratch(dyn, ninst);

             VMOVHto(x1, d1, 0);

             VDUPH(v0, x1);

             USHL_16(d0, d0, v0);

@@ -2677,8 +2677,8 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             nextop = F8;

             GETGM(d0);

             GETEM(d1, 0);

-            v0 = fpu_get_scratch(dyn);

-            v1 = fpu_get_scratch(dyn);

+            v0 = fpu_get_scratch(dyn, ninst);

+            v1 = fpu_get_scratch(dyn, ninst);

             UQXTN_32(v0, d1);

             MOVI_32(v1, 32);

             UMIN_32(v0, v0, v1); // limit to 0 .. +32 values

@@ -2690,8 +2690,8 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             nextop = F8;

             GETGM(d0);

             GETEM(d1, 0);

-            v0 = fpu_get_scratch(dyn);

-            v1 = fpu_get_scratch(dyn);

+            v0 = fpu_get_scratch(dyn, ninst);

+            v1 = fpu_get_scratch(dyn, ninst);

             UQXTN_32(v0, d1);

             MOVI_32(v1, 64);

             UMIN_32(v0, v0, v1); // limit to 0 .. +64 values

@@ -2709,7 +2709,7 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             nextop = F8;

             GETGM(v0);

             GETEM(v1, 0);

-            q0 = fpu_get_scratch(dyn);

+            q0 = fpu_get_scratch(dyn, ninst);

             VSMULL_16(q0, v0, v1);

             VADDPQ_32(q0, q0, q0); //ADDP from Q to non-Q?

             VMOVQ(v0, q0);

@@ -2719,8 +2719,8 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             nextop = F8;

             GETGM(q0);

             GETEM(q1, 0);

-            d0 = fpu_get_scratch(dyn);

-            d1 = fpu_get_scratch(dyn);

+            d0 = fpu_get_scratch(dyn, ninst);

+            d1 = fpu_get_scratch(dyn, ninst);

             VEOR(d1, d1, d1);   // is it necessary?

             UABDL_8(d0, q0, q1);

             UADDLVQ_16(d1, d0);

@@ -2731,8 +2731,8 @@ uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             nextop = F8;

             GETGM(q0);

             GETEM(q1, 0);

-            d0 = fpu_get_scratch(dyn);

-            d1 = fpu_get_scratch(dyn);

+            d0 = fpu_get_scratch(dyn, ninst);

+            d1 = fpu_get_scratch(dyn, ninst);

             VSSHR_8(d1, q1, 7); // d1 = byte slection mask

             VLDR64_U12(d0, xRDI, 0);

             VBIC(d0, d0, d1);   // d0 = clear masked byte

diff --git a/src/dynarec/arm64/dynarec_arm64_660f.c b/src/dynarec/arm64/dynarec_arm64_660f.c
index 2afbd088..fa006bcd 100644
--- a/src/dynarec/arm64/dynarec_arm64_660f.c
+++ b/src/dynarec/arm64/dynarec_arm64_660f.c
@@ -135,7 +135,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             } else {

                 SMREAD();

                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 0);

-                v1 = fpu_get_scratch(dyn);

+                v1 = fpu_get_scratch(dyn, ninst);

                 ADDx_U12(ed, ed, 8);

                 VLD1_64(v0, 1, ed);

             }

@@ -243,7 +243,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     BFCw(x5, FPSR_IOC, 1);   // reset IOC bit

                     MSR_fpsr(x5);

                     ORRw_mask(x2, xZR, 1, 0);    //0x80000000

-                    d0 = fpu_get_scratch(dyn);

+                    d0 = fpu_get_scratch(dyn, ninst);

                     for (int i=0; i<2; ++i) {

                         BFCw(x5, FPSR_IOC, 1);   // reset IOC bit

                         if (i) {

@@ -283,7 +283,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     BFCw(x5, FPSR_IOC, 1);   // reset IOC bit

                     MSR_fpsr(x5);

                     ORRw_mask(x2, xZR, 1, 0);    //0x80000000

-                    d0 = fpu_get_scratch(dyn);

+                    d0 = fpu_get_scratch(dyn, ninst);

                     for (int i=0; i<2; ++i) {

                         BFCw(x5, FPSR_IOC, 1);   // reset IOC bit

                         if (i) {

@@ -322,7 +322,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     nextop = F8;

                     GETGX(q0, 1);

                     GETEX(q1, 0, 0);

-                    d0 = fpu_get_scratch(dyn);

+                    d0 = fpu_get_scratch(dyn, ninst);

                     MOVIQ_8(d0, 0b10001111);

                     VANDQ(d0, d0, q1);  // mask the index

                     VTBLQ1_8(q0, q0, d0);

@@ -346,7 +346,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     nextop = F8;

                     GETGX(q0, 1);

                     GETEX(q1, 0, 0);

-                    v0 = fpu_get_scratch(dyn);

+                    v0 = fpu_get_scratch(dyn, ninst);

                     VUZP1Q_16(v0, q0, q1);

                     VUZP2Q_16(q0, q0, q1);

                     SQADDQ_16(q0, q0, v0);

@@ -356,10 +356,10 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     nextop = F8;

                     GETGX(q0, 1);

                     GETEX(q1, 0, 0);

-                    v0 = fpu_get_scratch(dyn);

-                    v1 = fpu_get_scratch(dyn);

+                    v0 = fpu_get_scratch(dyn, ninst);

+                    v1 = fpu_get_scratch(dyn, ninst);

                     if(q0==q1)

-                        d0 = fpu_get_scratch(dyn);

+                        d0 = fpu_get_scratch(dyn, ninst);

                     else

                         d0 = q0;

                     UXTL_8(v0, q0);   // this is unsigned, so 0 extended

@@ -381,7 +381,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     nextop = F8;

                     GETGX(q0, 1);

                     GETEX(q1, 0, 0);

-                    v0 = fpu_get_scratch(dyn);

+                    v0 = fpu_get_scratch(dyn, ninst);

                     VUZP2Q_16(v0, q0, q1);

                     VUZP1Q_16(q0, q0, q1);

                     VSUBQ_16(q0, q0, v0);

@@ -391,7 +391,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     nextop = F8;

                     GETGX(q0, 1);

                     GETEX(q1, 0, 0);

-                    v0 = fpu_get_scratch(dyn);

+                    v0 = fpu_get_scratch(dyn, ninst);

                     VUZP2Q_32(v0, q0, q1);

                     VUZP1Q_32(q0, q0, q1);

                     VSUBQ_32(q0, q0, v0);

@@ -401,7 +401,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     nextop = F8;

                     GETGX(q0, 1);

                     GETEX(q1, 0, 0);

-                    v0 = fpu_get_scratch(dyn);

+                    v0 = fpu_get_scratch(dyn, ninst);

                     VUZP2Q_16(v0, q0, q1);

                     VUZP1Q_16(q0, q0, q1);

                     SQSUBQ_16(q0, q0, v0);

@@ -411,8 +411,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     nextop = F8;

                     GETGX(q0, 1);

                     GETEX(q1, 0, 0);

-                    v1 = fpu_get_scratch(dyn);

-                    v0 = fpu_get_scratch(dyn);

+                    v1 = fpu_get_scratch(dyn, ninst);

+                    v0 = fpu_get_scratch(dyn, ninst);

                     NEGQ_8(v0, q0);     // get NEG

                     CMLTQ_0_8(v1, q1);  // calculate mask

                     VBICQ(q0, q0, v1);  // apply not mask on dest

@@ -426,8 +426,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     nextop = F8;

                     GETGX(q0, 1);

                     GETEX(q1, 0, 0);

-                    v1 = fpu_get_scratch(dyn);

-                    v0 = fpu_get_scratch(dyn);

+                    v1 = fpu_get_scratch(dyn, ninst);

+                    v0 = fpu_get_scratch(dyn, ninst);

                     NEGQ_16(v0, q0);    // get NEG

                     CMLTQ_0_16(v1, q1); // calculate mask

                     VBICQ(q0, q0, v1);  // apply not mask on dest

@@ -441,8 +441,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     nextop = F8;

                     GETGX(q0, 1);

                     GETEX(q1, 0, 0);

-                    v1 = fpu_get_scratch(dyn);

-                    v0 = fpu_get_scratch(dyn);

+                    v1 = fpu_get_scratch(dyn, ninst);

+                    v0 = fpu_get_scratch(dyn, ninst);

                     NEGQ_32(v0, q0);    // get NEG

                     CMLTQ_0_32(v1, q1); // calculate mask

                     VBICQ(q0, q0, v1);  // apply not mask on dest

@@ -465,7 +465,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     GETGX(q0, 1);

                     GETEX(q1, 0, 0);

                     v0 = sse_get_reg(dyn, ninst, x1, 0, 0);

-                    v1 = fpu_get_scratch(dyn);

+                    v1 = fpu_get_scratch(dyn, ninst);

                     if(q0!=q1) {

                         VSSHRQ_8(v1, v0, 7);    // bit[7]-> bit[7..0]

                         VBICQ(q0, q0, v1);

@@ -480,7 +480,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     GETGX(q0, 1);

                     GETEX(q1, 0, 0);

                     v0 = sse_get_reg(dyn, ninst, x1, 0, 0);

-                    v1 = fpu_get_scratch(dyn);

+                    v1 = fpu_get_scratch(dyn, ninst);

                     if(q0!=q1) {

                         VSSHRQ_32(v1, v0, 31);    // bit[31]-> bit[31..0]

                         VBICQ(q0, q0, v1);

@@ -494,7 +494,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     GETGX(q0, 1);

                     GETEX(q1, 0, 0);

                     v0 = sse_get_reg(dyn, ninst, x1, 0, 0);

-                    v1 = fpu_get_scratch(dyn);

+                    v1 = fpu_get_scratch(dyn, ninst);

                     if(q0!=q1) {

                         VSSHRQ_64(v1, v0, 63);    // bit[63]-> bit[63..0]

                         VBICQ(q0, q0, v1);

@@ -509,7 +509,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     SETFLAGS(X_ALL, SF_SET);

                     GETGX(q0, 0);

                     GETEX(q1, 0, 0);

-                    v1 = fpu_get_scratch(dyn);

+                    v1 = fpu_get_scratch(dyn, ninst);

                     IFX(X_ZF) {

                         VANDQ(v1, q1, q0);

                         CMEQQ_0_64(v1, v1);

@@ -612,7 +612,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                         v0 = q0;

                     } else {

                         if(MODREG)

-                            v0 = fpu_get_scratch(dyn);

+                            v0 = fpu_get_scratch(dyn, ninst);

                         else

                             v0 = q1;

                         VUZP1Q_32(v0, q1, q1);

@@ -638,8 +638,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     nextop = F8;

                     GETEX(q1, 0, 0);

                     GETGX(q0, 1);

-                    v0 = fpu_get_scratch(dyn);

-                    v1 = fpu_get_scratch(dyn);

+                    v0 = fpu_get_scratch(dyn, ninst);

+                    v1 = fpu_get_scratch(dyn, ninst);

                     VEORQ(v0, v0, v0);

                     SMAXQ_32(v1, v0, q0);    // values < 0 => 0

                     UQXTN_16(q0, v1);

@@ -793,7 +793,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     if(arm64_aes) {

                         GETEX(q1, 0, 0);

                         GETGX(q0, 1);

-                        v0 = fpu_get_scratch(dyn);  // ARM64 internal operation differs a bit from x86_64

+                        v0 = fpu_get_scratch(dyn, ninst);  // ARM64 internal operation differs a bit from x86_64

                         VEORQ(v0, q0, q1);

                         AESE(v0, q1);

                         AESMC(v0, v0);

@@ -814,7 +814,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     if(arm64_aes) {

                         GETEX(q1, 0, 0);

                         GETGX(q0, 1);

-                        v0 = fpu_get_scratch(dyn);  // ARM64 internal operation differs a bit from x86_64

+                        v0 = fpu_get_scratch(dyn, ninst);  // ARM64 internal operation differs a bit from x86_64

                         VEORQ(v0, q0, q1);

                         AESE(v0, q1);

                         VEORQ(q0, v0, q1);

@@ -834,7 +834,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     if(arm64_aes) {

                         GETEX(q1, 0, 0);

                         GETGX(q0, 1);

-                        v0 = fpu_get_scratch(dyn);  // ARM64 internal operation differs a bit from x86_64

+                        v0 = fpu_get_scratch(dyn, ninst);  // ARM64 internal operation differs a bit from x86_64

                         VEORQ(v0, q0, q1);

                         AESD(v0, q1);

                         AESIMC(v0, v0);

@@ -855,7 +855,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     if(arm64_aes) {

                         GETEX(q1, 0, 0);

                         GETGX(q0, 1);

-                        v0 = fpu_get_scratch(dyn);  // ARM64 internal operation differs a bit from x86_64

+                        v0 = fpu_get_scratch(dyn, ninst);  // ARM64 internal operation differs a bit from x86_64

                         VEORQ(v0, q0, q1);

                         AESD(v0, q1);

                         VEORQ(q0, v0, q1);

@@ -914,7 +914,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     GETEX(q1, 0, 1);

                     GETGX_empty(q0);

                     u8 = F8;

-                    v1 = fpu_get_scratch(dyn);

+                    v1 = fpu_get_scratch(dyn, ninst);

                     if(u8&4) {

                         u8 = sse_setround(dyn, ninst, x1, x2, x3);

                         VFRINTISQ(q0, q1);

@@ -929,7 +929,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     GETEX(q1, 0, 1);

                     GETGX_empty(q0);

                     u8 = F8;

-                    v1 = fpu_get_scratch(dyn);

+                    v1 = fpu_get_scratch(dyn, ninst);

                     if(u8&4) {

                         u8 = sse_setround(dyn, ninst, x1, x2, x3);

                         VFRINTIDQ(q0, q1);

@@ -944,7 +944,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     GETGX(q0, 1);

                     GETEXSS(q1, 0, 1);

                     u8 = F8;

-                    v1 = fpu_get_scratch(dyn);

+                    v1 = fpu_get_scratch(dyn, ninst);

                     if(u8&4) {

                         u8 = sse_setround(dyn, ninst, x1, x2, x3);

                         FRINTXS(v1, q1);

@@ -960,7 +960,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     GETGX(q0, 1);

                     GETEXSD(q1, 0, 1);

                     u8 = F8;

-                    v1 = fpu_get_scratch(dyn);

+                    v1 = fpu_get_scratch(dyn, ninst);

                     if(u8&4) {

                         u8 = sse_setround(dyn, ninst, x1, x2, x3);

                         FRINTXD(v1, q1);

@@ -1044,7 +1044,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     if(u8>31) {

                         VEORQ(q0, q0, q0);

                     } else if(u8>15) {

-                        d0 = fpu_get_scratch(dyn);

+                        d0 = fpu_get_scratch(dyn, ninst);

                         VEORQ(d0, d0, d0);

                         VEXTQ_8(q0, q0, d0, u8-16);

                     } else {

@@ -1133,7 +1133,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     INST_NAME("INSERTPS Gx, Ex, Ib");

                     nextop = F8;

                     GETGX(q0, 1);

-                    d0 = fpu_get_scratch(dyn);

+                    d0 = fpu_get_scratch(dyn, ninst);

                     VMOVQ(d0, q0);

                     if (MODREG) {

                         q1 = sse_get_reg(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), 0);

@@ -1172,7 +1172,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     GETGX(q0, 1);

                     GETEX(q1, 0, 1);

                     u8 = F8;

-                    v0 = fpu_get_scratch(dyn);

+                    v0 = fpu_get_scratch(dyn, ninst);

                     VFMULQS(v0, q0, q1);

                     // mask some, duplicate all, mask some

                     for(int i=0; i<4; ++i)

@@ -1258,7 +1258,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     CALL(sse42_compare_string_explicit_len, x1);

                     q0 = sse_get_reg_empty(dyn, ninst, x2, 0);

                     if(u8&0b1000000) {

-                        q1 = fpu_get_scratch(dyn);

+                        q1 = fpu_get_scratch(dyn, ninst);

                         switch(u8&1) {

                             case 0b00:

                                 VDUPQB(q0, x1); // load the low 8bits of the mask

@@ -1299,13 +1299,13 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                         GETGX(v0, 0);

                         GETEX(v1, 0, 1);

                         u8 = F8;

-                        q0 = fpu_get_scratch(dyn);

+                        q0 = fpu_get_scratch(dyn, ninst);

                         if(u8&1) {

                             //16bits

                             VCMEQQ_16(q0, v0, v1);   // equal => mask regs

                             XTN_8(q0, q0);          // 8 bits mask, in lower 64bits

                             // transform that a mask in x1

-                            q1 = fpu_get_scratch(dyn);

+                            q1 = fpu_get_scratch(dyn, ninst);

                             VSHL_8(q0, q0, 7);  // keep only bit 0x80

                             TABLE64(x1, (uintptr_t)&mask_shift8);

                             VLDR64_U12(q1, x1, 0);     // load shift

@@ -1316,8 +1316,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                             //8 bits

                             VCMEQQ_8(q0, v0, v1);   // equal => mask regs

                             // transform that a mask in x1

-                            q1 = fpu_get_scratch(dyn);

-                            d0 = fpu_get_scratch(dyn);

+                            q1 = fpu_get_scratch(dyn, ninst);

+                            d0 = fpu_get_scratch(dyn, ninst);

                             VSHL_8(d0, q0, 7);  // keep only bit 0x80

                             TABLE64(x1, (uintptr_t)&mask_shift8);

                             VLDR64_U12(q1, x1, 0);     // load shift

@@ -1462,7 +1462,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     CALL(sse42_compare_string_implicit_len, x1);

                     q0 = sse_get_reg_empty(dyn, ninst, x2, 0);

                     if(u8&0b1000000) {

-                        q1 = fpu_get_scratch(dyn);

+                        q1 = fpu_get_scratch(dyn, ninst);

                         switch(u8&1) {

                             case 0b00:

                                 VDUPQB(q0, x1); // load the low 8bits of the mask

@@ -1592,8 +1592,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             GETEX(q0, 0, 0);

             GETGX_empty(q1);

             if(!box64_dynarec_fastnan) {

-                v0 = fpu_get_scratch(dyn);

-                v1 = fpu_get_scratch(dyn);

+                v0 = fpu_get_scratch(dyn, ninst);

+                v1 = fpu_get_scratch(dyn, ninst);

                 // check if any input value was NAN

                 VFCMEQQD(v0, q0, q0);    // 0 if NAN, 1 if not NAN

                 VFSQRTQD(q1, q0);

@@ -1647,8 +1647,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             GETEX(q0, 0, 0);

             GETGX(q1, 1);

             if(!box64_dynarec_fastnan) {

-                v0 = fpu_get_scratch(dyn);

-                v1 = fpu_get_scratch(dyn);

+                v0 = fpu_get_scratch(dyn, ninst);

+                v1 = fpu_get_scratch(dyn, ninst);

                 // check if any input value was NAN

                 VFMAXQD(v0, q0, q1);    // propagate NAN

                 VFCMEQQD(v0, v0, v0);    // 0 if NAN, 1 if not NAN

@@ -1667,8 +1667,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             GETEX(q0, 0, 0);

             GETGX(q1, 1);

             if(!box64_dynarec_fastnan) {

-                v0 = fpu_get_scratch(dyn);

-                v1 = fpu_get_scratch(dyn);

+                v0 = fpu_get_scratch(dyn, ninst);

+                v1 = fpu_get_scratch(dyn, ninst);

                 // check if any input value was NAN

                 VFMAXQD(v0, q0, q1);    // propagate NAN

                 VFCMEQQD(v0, v0, v0);    // 0 if NAN, 1 if not NAN

@@ -1710,7 +1710,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                 MSR_fpsr(x5);

                 u8 = sse_setround(dyn, ninst, x1, x2, x3);

                 MOV32w(x4, 0x80000000);

-                d0 = fpu_get_scratch(dyn);

+                d0 = fpu_get_scratch(dyn, ninst);

                 for(int i=0; i<4; ++i) {

                     BFCw(x5, FPSR_IOC, 1);   // reset IOC bit

                     MSR_fpsr(x5);

@@ -1731,8 +1731,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             GETEX(q0, 0, 0);

             GETGX(q1, 1);

             if(!box64_dynarec_fastnan) {

-                v0 = fpu_get_scratch(dyn);

-                v1 = fpu_get_scratch(dyn);

+                v0 = fpu_get_scratch(dyn, ninst);

+                v1 = fpu_get_scratch(dyn, ninst);

                 // check if any input value was NAN

                 VFMAXQD(v0, q0, q1);    // propagate NAN

                 VFCMEQQD(v0, v0, v0);    // 0 if NAN, 1 if not NAN

@@ -1753,7 +1753,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             // FMIN/FMAX wll not copy the value if v0[x] is NaN

             // but x86 will copy if either v0[x] or v1[x] is NaN, so lets force a copy if source is NaN

             if(!box64_dynarec_fastnan && v0!=v1) {

-                q0 = fpu_get_scratch(dyn);

+                q0 = fpu_get_scratch(dyn, ninst);

                 VFCMEQQD(q0, v0, v0);   // 0 is NaN, 1 is not NaN, so MASK for NaN

                 VANDQ(v0, v0, q0);

                 VBICQ(q0, v1, q0);

@@ -1767,8 +1767,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             GETEX(q0, 0, 0);

             GETGX(q1, 1);

             if(!box64_dynarec_fastnan) {

-                v0 = fpu_get_scratch(dyn);

-                v1 = fpu_get_scratch(dyn);

+                v0 = fpu_get_scratch(dyn, ninst);

+                v1 = fpu_get_scratch(dyn, ninst);

                 // check if any input value was NAN

                 VFMAXQD(v0, q0, q1);    // propagate NAN

                 VFCMEQQD(v0, v0, v0);    // 0 if NAN, 1 if not NAN

@@ -1789,7 +1789,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             // FMIN/FMAX wll not copy the value if v0[x] is NaN

             // but x86 will copy if either v0[x] or v1[x] is NaN, so lets force a copy if source is NaN

             if(!box64_dynarec_fastnan && v0!=v1) {

-                q0 = fpu_get_scratch(dyn);

+                q0 = fpu_get_scratch(dyn, ninst);

                 VFCMEQQD(q0, v0, v0);   // 0 is NaN, 1 is not NaN, so MASK for NaN

                 VANDQ(v0, v0, q0);

                 VBICQ(q0, v1, q0);

@@ -1998,7 +1998,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                         (8)|(9<<8)|(10<<16)|(11<<24),

                         (12)|(13<<8)|(14<<16)|(15<<24)

                     };

-                    d0 = fpu_get_scratch(dyn);

+                    d0 = fpu_get_scratch(dyn, ninst);

                     tmp64u = swp[(u8>>(0*2))&3] | (swp[(u8>>(1*2))&3]<<32);

                     MOV64x(x2, tmp64u);

                     VMOVQDfrom(d0, 0, x2);

@@ -2141,7 +2141,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                         if(u8>15) {

                             VEORQ(q0, q0, q0);

                         } else {

-                            q1 = fpu_get_scratch(dyn);

+                            q1 = fpu_get_scratch(dyn, ninst);

                             VEORQ(q1, q1, q1);

                             VEXTQ_8(q0, q0, q1, u8);

                         }

@@ -2169,7 +2169,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                         if(u8>15) {

                             VEORQ(q0, q0, q0);

                         } else if(u8>0) {

-                            q1 = fpu_get_scratch(dyn);

+                            q1 = fpu_get_scratch(dyn, ninst);

                             VEORQ(q1, q1, q1);

                             VEXTQ_8(q0, q1, q0, 16-u8);

                         }

@@ -2209,8 +2209,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             GETGX(q1, 1);

             GETEX(q0, 0, 0);

             if(!box64_dynarec_fastnan) {

-                v0 = fpu_get_scratch(dyn);

-                v1 = fpu_get_scratch(dyn);

+                v0 = fpu_get_scratch(dyn, ninst);

+                v1 = fpu_get_scratch(dyn, ninst);

                 // check if any input value was NAN

                 // but need to mix low/high part

                 VTRNQ1_64(v0, q1, q0);

@@ -2231,7 +2231,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             nextop = F8;

             GETEX(q1, 0, 0);

             GETGX(q0, 1);

-            v0 = fpu_get_scratch(dyn);

+            v0 = fpu_get_scratch(dyn, ninst);

             VUZP1Q_64(v0, q0, q1);

             VUZP2Q_64(q0, q0, q1);

             VFSUBQD(q0, v0, q0);

@@ -2627,7 +2627,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                 case 2: VFCMGEQD(v0, v1, v0); break;   // Less or equal

                 case 3: VFCMEQQD(v0, v0, v0);

                         if(v0!=v1) {

-                            q0 = fpu_get_scratch(dyn);

+                            q0 = fpu_get_scratch(dyn, ninst);

                             VFCMEQQD(q0, v1, v1);

                             VANDQ(v0, v0, q0);

                         }

@@ -2638,7 +2638,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                 case 6: VFCMGEQD(v0, v1, v0); VMVNQ(v0, v0); break;   // Greater or unordered

                 case 7: VFCMEQQD(v0, v0, v0);

                         if(v0!=v1) {

-                            q0 = fpu_get_scratch(dyn);

+                            q0 = fpu_get_scratch(dyn, ninst);

                             VFCMEQQD(q0, v1, v1);

                             VANDQ(v0, v0, q0);

                         }

@@ -2686,7 +2686,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                 VMOVeD(v0, 1, v0, 0);

             } else {

                 if(v0==v1)

-                    q0 = fpu_get_scratch(dyn);

+                    q0 = fpu_get_scratch(dyn, ninst);

                 else

                     q0 = v0;

                 VMOVeD(q0, 0, v0, (u8&1));

@@ -2720,8 +2720,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             nextop = F8;

             GETGX(q0, 1);

             GETEX(q1, 0, 0);

-            v0 = fpu_get_scratch(dyn);

-            v1 = fpu_get_scratch(dyn);

+            v0 = fpu_get_scratch(dyn, ninst);

+            v1 = fpu_get_scratch(dyn, ninst);

             UQXTN_32(v0, q1);

             MOVI_32(v1, 16);

             UMIN_32(v0, v0, v1);    // limit to 0 .. +16 values

@@ -2734,7 +2734,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             nextop = F8;

             GETGX(q0, 1);

             GETEX(q1, 0, 0);

-            v0 = fpu_get_scratch(dyn);

+            v0 = fpu_get_scratch(dyn, ninst);

             VFSUBQD(v0, q0, q1);

             VFADDQD(q0, q0, q1);

             VMOVeD(q0, 0, v0, 0);

@@ -2744,8 +2744,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             nextop = F8;

             GETGX(q0, 1);

             GETEX(q1, 0, 0);

-            v0 = fpu_get_scratch(dyn);

-            v1 = fpu_get_scratch(dyn);

+            v0 = fpu_get_scratch(dyn, ninst);

+            v1 = fpu_get_scratch(dyn, ninst);

             UQXTN_32(v0, q1);

             MOVI_32(v1, 32);

             UMIN_32(v0, v0, v1);    // limit to 0 .. +32 values

@@ -2758,8 +2758,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             nextop = F8;

             GETGX(q0, 1);

             GETEX(q1, 0, 0);

-            v0 = fpu_get_scratch(dyn);

-            v1 = fpu_get_scratch(dyn);

+            v0 = fpu_get_scratch(dyn, ninst);

+            v1 = fpu_get_scratch(dyn, ninst);

             UQXTN_32(v0, q1);

             MOVI_32(v1, 64);

             UMIN_32(v0, v0, v1);    // limit to 0 .. +64 values

@@ -2798,9 +2798,9 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
         case 0xD7:

             nextop = F8;

             INST_NAME("PMOVMSKB Gd, Ex");

-            v0 = fpu_get_scratch(dyn);

-            v1 = fpu_get_scratch(dyn);

-            q1 = fpu_get_scratch(dyn);

+            v0 = fpu_get_scratch(dyn, ninst);

+            v1 = fpu_get_scratch(dyn, ninst);

+            q1 = fpu_get_scratch(dyn, ninst);

             GETEX(q0, 0, 0);

             GETGD;

             TABLE64(x1, (uintptr_t)&mask_shift8);

@@ -2888,8 +2888,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             nextop = F8;

             GETGX(q0, 1);

             GETEX(q1, 0, 0);

-            v0 = fpu_get_scratch(dyn);

-            v1 = fpu_get_scratch(dyn);

+            v0 = fpu_get_scratch(dyn, ninst);

+            v1 = fpu_get_scratch(dyn, ninst);

             UQXTN_32(v0, q1);

             MOVI_32(v1, 15);

             UMIN_32(v0, v0, v1);    // limit to -15 .. +15 values

@@ -2902,8 +2902,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             nextop = F8;

             GETGX(q0, 1);

             GETEX(q1, 0, 0);

-            v0 = fpu_get_scratch(dyn);

-            v1 = fpu_get_scratch(dyn);

+            v0 = fpu_get_scratch(dyn, ninst);

+            v1 = fpu_get_scratch(dyn, ninst);

             UQXTN_32(v0, q1);

             MOVI_32(v1, 31);

             UMIN_32(v0, v0, v1);        // limit to 0 .. +31 values

@@ -2923,8 +2923,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             nextop = F8;

             GETGX(v0, 1);

             GETEX(v1, 0, 0);

-            q0 = fpu_get_scratch(dyn);

-            q1 = fpu_get_scratch(dyn);

+            q0 = fpu_get_scratch(dyn, ninst);

+            q1 = fpu_get_scratch(dyn, ninst);

             VUMULL_16(q0, v0, v1);

             VUMULL2_16(q1, v0, v1);

             UQSHRN_16(v0, q0, 16);

@@ -2935,8 +2935,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             nextop = F8;

             GETGX(v0, 1);

             GETEX(v1, 0, 0);

-            q0 = fpu_get_scratch(dyn);

-            q1 = fpu_get_scratch(dyn);

+            q0 = fpu_get_scratch(dyn, ninst);

+            q1 = fpu_get_scratch(dyn, ninst);

             VSMULL_16(q0, v0, v1);

             VSMULL2_16(q1, v0, v1);

             SQSHRN_16(v0, q0, 16);

@@ -2955,7 +2955,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                 BFCw(x5, FPSR_IOC, 1);   // reset IOC bit

                 MSR_fpsr(x5);

                 ORRw_mask(x4, xZR, 1, 0);    //0x80000000

-                d0 = fpu_get_scratch(dyn);

+                d0 = fpu_get_scratch(dyn, ninst);

                 for(int i=0; i<2; ++i) {

                     BFCw(x5, FPSR_IOC, 1);   // reset IOC bit

                     MSR_fpsr(x5);

@@ -3054,8 +3054,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             nextop = F8;

             GETGX(q0, 1);

             GETEX(q1, 0, 0);

-            v0 = fpu_get_scratch(dyn);

-            v1 = fpu_get_scratch(dyn);

+            v0 = fpu_get_scratch(dyn, ninst);

+            v1 = fpu_get_scratch(dyn, ninst);

             UQXTN_32(v0, q1);

             MOVI_32(v1, 16);

             UMIN_32(v0, v0, v1);    // limit to 0 .. +16 values

@@ -3067,8 +3067,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             nextop = F8;

             GETGX(q0, 1);

             GETEX(q1, 0, 0);

-            v0 = fpu_get_scratch(dyn);

-            v1 = fpu_get_scratch(dyn);

+            v0 = fpu_get_scratch(dyn, ninst);

+            v1 = fpu_get_scratch(dyn, ninst);

             UQXTN_32(v0, q1);

             MOVI_32(v1, 32);

             UMIN_32(v0, v0, v1);    // limit to 0 .. +32 values

@@ -3080,8 +3080,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             nextop = F8;

             GETGX(q0, 1);

             GETEX(q1, 0, 0);

-            v0 = fpu_get_scratch(dyn);

-            v1 = fpu_get_scratch(dyn);

+            v0 = fpu_get_scratch(dyn, ninst);

+            v1 = fpu_get_scratch(dyn, ninst);

             UQXTN_32(v0, q1);

             MOVI_32(v1, 64);

             UMIN_32(v0, v0, v1);    // limit to 0 .. +64 values

@@ -3093,10 +3093,10 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             nextop = F8;

             GETGX(v0, 1);

             GETEX(v1, 0, 0);

-            q0 = fpu_get_scratch(dyn);

+            q0 = fpu_get_scratch(dyn, ninst);

             VUZP1Q_32(q0, v0, v0);  //A3 A2 A1 A0 -> A3 A1 A2 A0

             if(MODREG) {

-                q1 = fpu_get_scratch(dyn);

+                q1 = fpu_get_scratch(dyn, ninst);

             } else {

                 q1 = v1;

             }

@@ -3108,8 +3108,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             nextop = F8;

             GETGX(v0, 1);

             GETEX(v1, 0, 0);

-            q0 = fpu_get_scratch(dyn);

-            q1 = fpu_get_scratch(dyn);

+            q0 = fpu_get_scratch(dyn, ninst);

+            q1 = fpu_get_scratch(dyn, ninst);

             VSMULL_16(q0, v0, v1);

             VSMULL2_16(q1, v0, v1);

             VADDPQ_32(v0, q0, q1);

@@ -3119,8 +3119,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             nextop = F8;

             GETGX(q0, 1);

             GETEX(q1, 0, 0);

-            d0 = fpu_get_scratch(dyn);

-            d1 = fpu_get_scratch(dyn);

+            d0 = fpu_get_scratch(dyn, ninst);

+            d1 = fpu_get_scratch(dyn, ninst);

             VEOR(d1, d1, d1);   // is it necessary?

             UABDL_8(d0, q0, q1);

             UADDLVQ_16(d1, d0);

@@ -3134,10 +3134,10 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             nextop = F8;

             GETGX(q0, 1);

             GETEX(q1, 0, 0);

-            v0 = fpu_get_scratch(dyn);

+            v0 = fpu_get_scratch(dyn, ninst);

             VLDR128_U12(v0, xRDI, 0);

             if(MODREG)

-                v1 = fpu_get_scratch(dyn); // need to preserve the register

+                v1 = fpu_get_scratch(dyn, ninst); // need to preserve the register

             else

                 v1 = q1;

             VSSHRQ_8(v1, q1, 7);  // get the mask

diff --git a/src/dynarec/arm64/dynarec_arm64_6664.c b/src/dynarec/arm64/dynarec_arm64_6664.c
index d5b14771..4b33fab6 100644
--- a/src/dynarec/arm64/dynarec_arm64_6664.c
+++ b/src/dynarec/arm64/dynarec_arm64_6664.c
@@ -61,7 +61,7 @@ uintptr_t dynarec64_6664(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                         grab_segdata(dyn, addr, ninst, x4, seg);
                         SMREAD();
                         addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 0);
-                        v1 = fpu_get_scratch(dyn);
+                        v1 = fpu_get_scratch(dyn, ninst);
                         VLDR64_REG(v1, ed, x4);
                     }
                     FCMPD(v0, v1);
diff --git a/src/dynarec/arm64/dynarec_arm64_67.c b/src/dynarec/arm64/dynarec_arm64_67.c
index b22c7f92..7a59de5d 100644
--- a/src/dynarec/arm64/dynarec_arm64_67.c
+++ b/src/dynarec/arm64/dynarec_arm64_67.c
@@ -202,7 +202,7 @@ uintptr_t dynarec64_67(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                             if(MODREG) {

                                 s0 = sse_get_reg(dyn, ninst, x1, (nextop&7) + (rex.b<<3), 0);

                             } else {

-                                s0 = fpu_get_scratch(dyn);

+                                s0 = fpu_get_scratch(dyn, ninst);

                                 SMREAD();

                                 addr = geted32(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0);

                                 VLD32(s0, ed, fixedaddress);

diff --git a/src/dynarec/arm64/dynarec_arm64_avx_0f.c b/src/dynarec/arm64/dynarec_arm64_avx_0f.c
index 8addb9b1..856fb0c5 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_0f.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_0f.c
@@ -68,24 +68,24 @@ uintptr_t dynarec64_AVX_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int
             INST_NAME("VSHUFPS Gx, Vx, Ex, Ib");
             nextop = F8;
             GETVX(v2, 0);
-            GETGX_empty(v0);
             if(!MODREG) {
-                addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 1);
+                addr = geted(dyn, addr, ninst, nextop, &ed, x3, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 1);
                 v1 = -1; // to avoid a warning
             } else
                 v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0);
+            GETGX_empty(v0);
             u8 = F8;
             if(v2==v1 && (u8&0x3)==((u8>>2)&3) && (u8&0xf)==((u8>>4)&0xf)) {
                 VDUPQ_32(v0, v2, u8&3);
             } else if(v2==v1 && (u8==0xe0)) {   // easy special case
                 VMOVQ(v0, v2);
                 VMOVeS(v0, 1, v0, 0);
-            } else if(v0==v1 && (u8==0xe5)) {   // easy special case
+            } else if(v2==v1 && (u8==0xe5)) {   // easy special case
                 VMOVQ(v0, v2);
                 VMOVeS(v0, 0, v0, 1);
             } else {
-                d0 = fpu_get_scratch(dyn);
-                // first two elements from Gx
+                d0 = fpu_get_scratch(dyn, ninst);
+                // first two elements from Vx
                 for(int i=0; i<2; ++i) {
                     VMOVeS(d0, i, v2, (u8>>(i*2))&3);
                 }
@@ -104,7 +104,34 @@ uintptr_t dynarec64_AVX_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int
                 VMOVQ(v0, d0);
             }
             if(vex.l) {
-                DEFAULT;    /* TDOD! */
+                if(MODREG)
+                    v1 = ymm_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0, gd, vex.v, -1);
+                GETGY_empty_VY(v0, v2, 0, (MODREG)?((nextop&7)+(rex.b<<3)):-1, -1);
+                if(v2==v1 && (u8&0x3)==((u8>>2)&3) && (u8&0xf)==((u8>>4)&0xf)) {
+                    VDUPQ_32(v0, v2, u8&3);
+                } else if(v2==v1 && (u8==0xe0)) {
+                    VMOVQ(v0, v2);
+                    VMOVeS(v0, 1, v0, 0);
+                } else if(v2==v1 && (u8==0xe5)) {
+                    VMOVQ(v0, v2);
+                    VMOVeS(v0, 0, v0, 1);
+                } else {
+                    for(int i=0; i<2; ++i) {
+                        VMOVeS(d0, i, v2, (u8>>(i*2))&3);
+                    }
+                    if(MODREG) {
+                        for(int i=2; i<4; ++i) {
+                            VMOVeS(d0, i, v1, (u8>>(i*2))&3);
+                        }
+                    } else {
+                        SMREAD();
+                        for(int i=2; i<4; ++i) {
+                            ADDx_U12(x2, ed, 16+((u8>>(i*2))&3)*4);
+                            VLD1_32(d0, i, x2);
+                        }
+                    }
+                    VMOVQ(v0, d0);
+                }
             } else YMM0(gd);
             break;
 
diff --git a/src/dynarec/arm64/dynarec_arm64_d8.c b/src/dynarec/arm64/dynarec_arm64_d8.c
index 113519a9..fc481aea 100644
--- a/src/dynarec/arm64/dynarec_arm64_d8.c
+++ b/src/dynarec/arm64/dynarec_arm64_d8.c
@@ -187,7 +187,7 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             case 0:
                 INST_NAME("FADD ST0, float[ED]");
                 v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0);
-                s0 = fpu_get_scratch(dyn);
+                s0 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0);
                 VLD32(s0, ed, fixedaddress);
                 if(ST_IS_F(0)) {
@@ -200,7 +200,7 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             case 1:
                 INST_NAME("FMUL ST0, float[ED]");
                 v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0);
-                s0 = fpu_get_scratch(dyn);
+                s0 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0);
                 VLD32(s0, ed, fixedaddress);
                 if(ST_IS_F(0)) {
@@ -213,7 +213,7 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             case 2:
                 INST_NAME("FCOM ST0, float[ED]");
                 v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0);
-                s0 = fpu_get_scratch(dyn);
+                s0 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0);
                 VLD32(s0, ed, fixedaddress);
                 if(ST_IS_F(0)) {
@@ -227,7 +227,7 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             case 3:
                 INST_NAME("FCOMP ST0, float[ED]");
                 v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0);
-                s0 = fpu_get_scratch(dyn);
+                s0 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0);
                 VLD32(s0, ed, fixedaddress);
                 if(ST_IS_F(0)) {
@@ -242,7 +242,7 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             case 4:
                 INST_NAME("FSUB ST0, float[ED]");
                 v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0);
-                s0 = fpu_get_scratch(dyn);
+                s0 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0);
                 VLD32(s0, ed, fixedaddress);
                 if(ST_IS_F(0)) {
@@ -255,7 +255,7 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             case 5:
                 INST_NAME("FSUBR ST0, float[ED]");
                 v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0);
-                s0 = fpu_get_scratch(dyn);
+                s0 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0);
                 VLD32(s0, ed, fixedaddress);
                 if(ST_IS_F(0)) {
@@ -268,7 +268,7 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             case 6:
                 INST_NAME("FDIV ST0, float[ED]");
                 v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0);
-                s0 = fpu_get_scratch(dyn);
+                s0 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0);
                 VLD32(s0, ed, fixedaddress);
                 if(ST_IS_F(0)) {
@@ -281,7 +281,7 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             case 7:
                 INST_NAME("FDIVR ST0, float[ED]");
                 v1 = x87_get_st(dyn, ninst, x1, x2, 0, X87_ST0);
-                s0 = fpu_get_scratch(dyn);
+                s0 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0);
                 VLD32(s0, ed, fixedaddress);
                 if(ST_IS_F(0)) {
diff --git a/src/dynarec/arm64/dynarec_arm64_d9.c b/src/dynarec/arm64/dynarec_arm64_d9.c
index 6c99871d..53c3ad6e 100644
--- a/src/dynarec/arm64/dynarec_arm64_d9.c
+++ b/src/dynarec/arm64/dynarec_arm64_d9.c
@@ -337,7 +337,7 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             #else
             v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
             v2 = x87_get_st(dyn, ninst, x1, x2, 1, NEON_CACHE_ST_D);
-            s0 = fpu_get_scratch(dyn);
+            s0 = fpu_get_scratch(dyn, ninst);
             FDIVD(s0, v1, v2);
             FRINTRRD(s0, s0, 0b00); // Nearest == TieToEven?
             FCVTZSxD(x4, s0);
@@ -385,7 +385,7 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             #else
             v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
             v2 = x87_get_st(dyn, ninst, x1, x2, 1, NEON_CACHE_ST_D);
-            s0 = fpu_get_scratch(dyn);
+            s0 = fpu_get_scratch(dyn, ninst);
             FDIVD(s0, v1, v2);
             FRINTZD(s0, s0);
             FCVTZSxD(x4, s0);
@@ -496,7 +496,7 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 if(ST_IS_F(0))
                     s0 = v1;
                 else {
-                    s0 = fpu_get_scratch(dyn);
+                    s0 = fpu_get_scratch(dyn, ninst);
                     FCVT_S_D(s0, v1);
                 }
                 addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0);
diff --git a/src/dynarec/arm64/dynarec_arm64_da.c b/src/dynarec/arm64/dynarec_arm64_da.c
index b278ef02..afb5130d 100644
--- a/src/dynarec/arm64/dynarec_arm64_da.c
+++ b/src/dynarec/arm64/dynarec_arm64_da.c
@@ -143,7 +143,7 @@ uintptr_t dynarec64_DA(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             case 0:
                 INST_NAME("FIADD ST0, Ed");
                 v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
-                v2 = fpu_get_scratch(dyn);
+                v2 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0);
                 VLD32(v2, ed, fixedaddress);
                 SXTL_32(v2, v2);    // i32 -> i64
@@ -153,7 +153,7 @@ uintptr_t dynarec64_DA(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             case 1:
                 INST_NAME("FIMUL ST0, Ed");
                 v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
-                v2 = fpu_get_scratch(dyn);
+                v2 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0);
                 VLD32(v2, ed, fixedaddress);
                 SXTL_32(v2, v2);    // i32 -> i64
@@ -163,7 +163,7 @@ uintptr_t dynarec64_DA(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             case 2:
                 INST_NAME("FICOM ST0, Ed");
                 v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
-                v2 = fpu_get_scratch(dyn);
+                v2 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0);
                 VLD32(v2, ed, fixedaddress);
                 SXTL_32(v2, v2);    // i32 -> i64
@@ -174,7 +174,7 @@ uintptr_t dynarec64_DA(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             case 3:
                 INST_NAME("FICOMP ST0, Ed");
                 v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
-                v2 = fpu_get_scratch(dyn);
+                v2 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0);
                 VLD32(v2, ed, fixedaddress);
                 SXTL_32(v2, v2);    // i32 -> i64
@@ -186,7 +186,7 @@ uintptr_t dynarec64_DA(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             case 4:
                 INST_NAME("FISUB ST0, Ed");
                 v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
-                v2 = fpu_get_scratch(dyn);
+                v2 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0);
                 VLD32(v2, ed, fixedaddress);
                 SXTL_32(v2, v2);    // i32 -> i64
@@ -196,7 +196,7 @@ uintptr_t dynarec64_DA(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             case 5:
                 INST_NAME("FISUBR ST0, Ed");
                 v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
-                v2 = fpu_get_scratch(dyn);
+                v2 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0);
                 VLD32(v2, ed, fixedaddress);
                 SXTL_32(v2, v2);    // i32 -> i64
@@ -206,7 +206,7 @@ uintptr_t dynarec64_DA(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             case 6:
                 INST_NAME("FIDIV ST0, Ed");
                 v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
-                v2 = fpu_get_scratch(dyn);
+                v2 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0);
                 VLD32(v2, ed, fixedaddress);
                 SXTL_32(v2, v2);    // i32 -> i64
@@ -216,7 +216,7 @@ uintptr_t dynarec64_DA(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             case 7:
                 INST_NAME("FIDIVR ST0, Ed");
                 v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
-                v2 = fpu_get_scratch(dyn);
+                v2 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0);
                 VLD32(v2, ed, fixedaddress);
                 SXTL_32(v2, v2);    // i32 -> i64
diff --git a/src/dynarec/arm64/dynarec_arm64_db.c b/src/dynarec/arm64/dynarec_arm64_db.c
index 488d18d4..9dc1d673 100644
--- a/src/dynarec/arm64/dynarec_arm64_db.c
+++ b/src/dynarec/arm64/dynarec_arm64_db.c
@@ -194,7 +194,7 @@ uintptr_t dynarec64_DB(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 INST_NAME("FISTTP Ed, ST0");
                 v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
                 addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0);
-                s0 = fpu_get_scratch(dyn);
+                s0 = fpu_get_scratch(dyn, ninst);
                 if(arm64_frintts) {
                     FRINT32ZD(s0, v1);
                     FCVTZSwD(x5, s0);
@@ -220,7 +220,7 @@ uintptr_t dynarec64_DB(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
                 u8 = x87_setround(dyn, ninst, x1, x2, x4); // x1 have the modified RPSCR reg
                 addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0);
-                s0 = fpu_get_scratch(dyn);
+                s0 = fpu_get_scratch(dyn, ninst);
                 if(arm64_frintts) {
                     FRINT32XD(s0, v1);
                     FCVTZSwD(x5, s0);
@@ -246,7 +246,7 @@ uintptr_t dynarec64_DB(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
                 u8 = x87_setround(dyn, ninst, x1, x2, x4); // x1 have the modified RPSCR reg
                 addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0);
-                s0 = fpu_get_scratch(dyn);
+                s0 = fpu_get_scratch(dyn, ninst);
                 if(arm64_frintts) {
                     FRINT32XD(s0, v1);
                     FCVTZSwD(x5, s0);
diff --git a/src/dynarec/arm64/dynarec_arm64_dc.c b/src/dynarec/arm64/dynarec_arm64_dc.c
index a06e765b..76f43bc5 100644
--- a/src/dynarec/arm64/dynarec_arm64_dc.c
+++ b/src/dynarec/arm64/dynarec_arm64_dc.c
@@ -185,7 +185,7 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             case 0:
                 INST_NAME("FADD ST0, double[ED]");
                 v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
-                v2 = fpu_get_scratch(dyn);
+                v2 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, &unscaled, 0xfff<<3, 7, rex, NULL, 0, 0);
                 VLD64(v2, wback, fixedaddress);
                 FADDD(v1, v1, v2);
@@ -193,7 +193,7 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             case 1:
                 INST_NAME("FMUL ST0, double[ED]");
                 v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
-                v2 = fpu_get_scratch(dyn);
+                v2 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, &unscaled, 0xfff<<3, 7, rex, NULL, 0, 0);
                 VLD64(v2, wback, fixedaddress);
                 FMULD(v1, v1, v2);
@@ -201,7 +201,7 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             case 2:
                 INST_NAME("FCOM ST0, double[ED]");
                 v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
-                v2 = fpu_get_scratch(dyn);
+                v2 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, &unscaled, 0xfff<<3, 7, rex, NULL, 0, 0);
                 VLD64(v2, wback, fixedaddress);
                 FCMPD(v1, v2);
@@ -210,7 +210,7 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             case 3:
                 INST_NAME("FCOMP ST0, double[ED]");
                 v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
-                v2 = fpu_get_scratch(dyn);
+                v2 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, &unscaled, 0xfff<<3, 7, rex, NULL, 0, 0);
                 VLD64(v2, wback, fixedaddress);
                 FCMPD(v1, v2);
@@ -220,7 +220,7 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             case 4:
                 INST_NAME("FSUB ST0, double[ED]");
                 v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
-                v2 = fpu_get_scratch(dyn);
+                v2 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, &unscaled, 0xfff<<3, 7, rex, NULL, 0, 0);
                 VLD64(v2, wback, fixedaddress);
                 FSUBD(v1, v1, v2);
@@ -228,7 +228,7 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             case 5:
                 INST_NAME("FSUBR ST0, double[ED]");
                 v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
-                v2 = fpu_get_scratch(dyn);
+                v2 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, &unscaled, 0xfff<<3, 7, rex, NULL, 0, 0);
                 VLD64(v2, wback, fixedaddress);
                 FSUBD(v1, v2, v1);
@@ -236,7 +236,7 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             case 6:
                 INST_NAME("FDIV ST0, double[ED]");
                 v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
-                v2 = fpu_get_scratch(dyn);
+                v2 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, &unscaled, 0xfff<<3, 7, rex, NULL, 0, 0);
                 VLD64(v2, wback, fixedaddress);
                 FDIVD(v1, v1, v2);
@@ -244,7 +244,7 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             case 7:
                 INST_NAME("FDIVR ST0, double[ED]");
                 v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
-                v2 = fpu_get_scratch(dyn);
+                v2 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, &unscaled, 0xfff<<3, 7, rex, NULL, 0, 0);
                 VLD64(v2, wback, fixedaddress);
                 FDIVD(v1, v2, v1);
diff --git a/src/dynarec/arm64/dynarec_arm64_dd.c b/src/dynarec/arm64/dynarec_arm64_dd.c
index eabfe5fd..afef3358 100644
--- a/src/dynarec/arm64/dynarec_arm64_dd.c
+++ b/src/dynarec/arm64/dynarec_arm64_dd.c
@@ -158,7 +158,7 @@ uintptr_t dynarec64_DD(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 if(ST_IS_I64(0)) {
                     VST64(v1, ed, fixedaddress);
                 } else {
-                    s0 = fpu_get_scratch(dyn);
+                    s0 = fpu_get_scratch(dyn, ninst);
                     if(arm64_frintts) {
                         FRINT64ZD(s0, v1);
                         FCVTZSxD(x2, s0);
diff --git a/src/dynarec/arm64/dynarec_arm64_de.c b/src/dynarec/arm64/dynarec_arm64_de.c
index 660d667c..7ea3b357 100644
--- a/src/dynarec/arm64/dynarec_arm64_de.c
+++ b/src/dynarec/arm64/dynarec_arm64_de.c
@@ -186,7 +186,7 @@ uintptr_t dynarec64_DE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             case 0:
                 INST_NAME("FIADD ST0, word[ED]");
                 v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
-                v2 = fpu_get_scratch(dyn);
+                v2 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, &unscaled, 0xfff<<1, 1, rex, NULL, 0, 0);
                 VLD16(v2, wback, fixedaddress);
                 SXTL_16(v2, v2);
@@ -197,7 +197,7 @@ uintptr_t dynarec64_DE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             case 1:
                 INST_NAME("FIMUL ST0, word[ED]");
                 v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
-                v2 = fpu_get_scratch(dyn);
+                v2 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, &unscaled, 0xfff<<1, 1, rex, NULL, 0, 0);
                 VLD16(v2, wback, fixedaddress);
                 SXTL_16(v2, v2);
@@ -208,7 +208,7 @@ uintptr_t dynarec64_DE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             case 2:
                 INST_NAME("FICOM ST0, word[ED]");
                 v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
-                v2 = fpu_get_scratch(dyn);
+                v2 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, &unscaled, 0xfff<<1, 1, rex, NULL, 0, 0);
                 VLD16(v2, wback, fixedaddress);
                 SXTL_16(v2, v2);
@@ -220,7 +220,7 @@ uintptr_t dynarec64_DE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             case 3:
                 INST_NAME("FICOMP ST0, word[ED]");
                 v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
-                v2 = fpu_get_scratch(dyn);
+                v2 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, &unscaled, 0xfff<<1, 1, rex, NULL, 0, 0);
                 VLD16(v2, wback, fixedaddress);
                 SXTL_16(v2, v2);
@@ -233,7 +233,7 @@ uintptr_t dynarec64_DE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             case 4:
                 INST_NAME("FISUB ST0, word[ED]");
                 v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
-                v2 = fpu_get_scratch(dyn);
+                v2 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, &unscaled, 0xfff<<1, 1, rex, NULL, 0, 0);
                 VLD16(v2, wback, fixedaddress);
                 SXTL_16(v2, v2);
@@ -244,7 +244,7 @@ uintptr_t dynarec64_DE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             case 5:
                 INST_NAME("FISUBR ST0, word[ED]");
                 v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
-                v2 = fpu_get_scratch(dyn);
+                v2 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, &unscaled, 0xfff<<1, 1, rex, NULL, 0, 0);
                 VLD16(v2, wback, fixedaddress);
                 SXTL_16(v2, v2);
@@ -255,7 +255,7 @@ uintptr_t dynarec64_DE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             case 6:
                 INST_NAME("FIDIV ST0, word[ED]");
                 v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
-                v2 = fpu_get_scratch(dyn);
+                v2 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, &unscaled, 0xfff<<1, 1, rex, NULL, 0, 0);
                 VLD16(v2, wback, fixedaddress);
                 SXTL_16(v2, v2);
@@ -266,7 +266,7 @@ uintptr_t dynarec64_DE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
             case 7:
                 INST_NAME("FIDIVR ST0, word[ED]");
                 v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
-                v2 = fpu_get_scratch(dyn);
+                v2 = fpu_get_scratch(dyn, ninst);
                 addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, &unscaled, 0xfff<<1, 1, rex, NULL, 0, 0);
                 VLD16(v2, wback, fixedaddress);
                 SXTL_16(v2, v2);
diff --git a/src/dynarec/arm64/dynarec_arm64_df.c b/src/dynarec/arm64/dynarec_arm64_df.c
index 21828508..79c59bfd 100644
--- a/src/dynarec/arm64/dynarec_arm64_df.c
+++ b/src/dynarec/arm64/dynarec_arm64_df.c
@@ -155,7 +155,7 @@ uintptr_t dynarec64_DF(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_F);
                 addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, &unscaled, 0xfff<<1, 1, rex, NULL, 0, 0);
                 ed = x1;
-                s0 = fpu_get_scratch(dyn);
+                s0 = fpu_get_scratch(dyn, ninst);
                 #if 0
                 // this version needs ARM v8.5, and doesn't handle saturation for 32bits integer not fitting 16bits
                 FRINT32ZD(s0, v1);
@@ -194,7 +194,7 @@ uintptr_t dynarec64_DF(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 u8 = x87_setround(dyn, ninst, x1, x2, x4);
                 addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, &unscaled, 0xfff<<1, 1, rex, NULL, 0, 0);
                 ed = x1;
-                s0 = fpu_get_scratch(dyn);
+                s0 = fpu_get_scratch(dyn, ninst);
                 #if 0
                 FRINT32XD(s0, v1);
                 // no saturation instruction on Arm, so using NEON
@@ -234,7 +234,7 @@ uintptr_t dynarec64_DF(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 u8 = x87_setround(dyn, ninst, x1, x2, x4);
                 addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, &unscaled, 0xfff<<1, 1, rex, NULL, 0, 0);
                 ed = x1;
-                s0 = fpu_get_scratch(dyn);
+                s0 = fpu_get_scratch(dyn, ninst);
                 #if 0
                 FRINT32XD(s0, v1);
                 // no saturation instruction on Arm, so using NEON
@@ -323,7 +323,7 @@ uintptr_t dynarec64_DF(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
                 }
                 addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, &unscaled, 0xfff<<3, 7, rex, NULL, 0, 0);
                 ed = x1;
-                s0 = fpu_get_scratch(dyn);
+                s0 = fpu_get_scratch(dyn, ninst);
                 if(ST_IS_I64(0)) {
                     VST64(v1, wback, fixedaddress);
                 } else {
diff --git a/src/dynarec/arm64/dynarec_arm64_f20f.c b/src/dynarec/arm64/dynarec_arm64_f20f.c
index 65607b52..8f8e549b 100644
--- a/src/dynarec/arm64/dynarec_arm64_f20f.c
+++ b/src/dynarec/arm64/dynarec_arm64_f20f.c
@@ -99,7 +99,7 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             nextop = F8;

             GETGX(v0, 1);

             GETED(0);

-            d1 = fpu_get_scratch(dyn);

+            d1 = fpu_get_scratch(dyn, ninst);

             if(rex.w) {

                 SCVTFDx(d1, ed);

             } else {

@@ -140,7 +140,7 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                 MSR_fpsr(x5);

             }

             u8 = sse_setround(dyn, ninst, x1, x2, x3);

-            d1 = fpu_get_scratch(dyn);

+            d1 = fpu_get_scratch(dyn, ninst);

             FRINTID(d1, q0);

             x87_restoreround(dyn, ninst, u8);

             FCVTZSxwD(gd, d1);

@@ -207,10 +207,10 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             INST_NAME("SQRTSD Gx, Ex");

             nextop = F8;

             GETGX(v0, 1);

-            d1 = fpu_get_scratch(dyn);

+            d1 = fpu_get_scratch(dyn, ninst);

             GETEXSD(d0, 0, 0);

             if(!box64_dynarec_fastnan) {

-                v1 = fpu_get_scratch(dyn);

+                v1 = fpu_get_scratch(dyn, ninst);

                 FCMLTD_0(v1, d0);

                 SHL_64(v1, v1, 63);

                 FSQRTD(d1, d0);

@@ -225,11 +225,11 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             INST_NAME("ADDSD Gx, Ex");

             nextop = F8;

             GETGX(d1, 1);

-            v1 = fpu_get_scratch(dyn);

+            v1 = fpu_get_scratch(dyn, ninst);

             GETEXSD(d0, 0, 0);

             if(!box64_dynarec_fastnan) {

-                v0 = fpu_get_scratch(dyn);

-                q0 = fpu_get_scratch(dyn);

+                v0 = fpu_get_scratch(dyn, ninst);

+                q0 = fpu_get_scratch(dyn, ninst);

                 // check if any input value was NAN

                 FMAXD(v0, d0, d1);    // propagate NAN

                 FCMEQD(v0, v0, v0);    // 0 if NAN, 1 if not NAN

@@ -247,11 +247,11 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             INST_NAME("MULSD Gx, Ex");

             nextop = F8;

             GETGX(d1, 1);

-            v1 = fpu_get_scratch(dyn);

+            v1 = fpu_get_scratch(dyn, ninst);

             GETEXSD(d0, 0, 0);

             if(!box64_dynarec_fastnan) {

-                v0 = fpu_get_scratch(dyn);

-                q0 = fpu_get_scratch(dyn);

+                v0 = fpu_get_scratch(dyn, ninst);

+                q0 = fpu_get_scratch(dyn, ninst);

                 // check if any input value was NAN

                 FMAXD(v0, d0, d1);    // propagate NAN

                 FCMEQD(v0, v0, v0);    // 0 if NAN, 1 if not NAN

@@ -270,7 +270,7 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             nextop = F8;

             GETGX(v0, 1);

             GETEXSD(d0, 0, 0);

-            d1 = fpu_get_scratch(dyn);

+            d1 = fpu_get_scratch(dyn, ninst);

             FCVT_S_D(d1, d0);

             VMOVeS(v0, 0, d1, 0);

             break;

@@ -279,11 +279,11 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             INST_NAME("SUBSD Gx, Ex");

             nextop = F8;

             GETGX(d1, 1);

-            v1 = fpu_get_scratch(dyn);

+            v1 = fpu_get_scratch(dyn, ninst);

             GETEXSD(d0, 0, 0);

             if(!box64_dynarec_fastnan) {

-                v0 = fpu_get_scratch(dyn);

-                q0 = fpu_get_scratch(dyn);

+                v0 = fpu_get_scratch(dyn, ninst);

+                q0 = fpu_get_scratch(dyn, ninst);

                 // check if any input value was NAN

                 FMAXD(v0, d0, d1);    // propagate NAN

                 FCMEQD(v0, v0, v0);    // 0 if NAN, 1 if not NAN

@@ -304,7 +304,7 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             GETEXSD(v1, 0, 0);

             // MINSD: if any input is NaN, or Ex[0]<Gx[0], copy Ex[0] -> Gx[0]

             #if 0

-            d0 = fpu_get_scratch(dyn);

+            d0 = fpu_get_scratch(dyn, ninst);

             FMINNMD(d0, v0, v1);    // NaN handling may be slightly different, is that a problem?

             VMOVeD(v0, 0, d0, 0);   // to not erase uper part

             #else

@@ -317,11 +317,11 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             INST_NAME("DIVSD Gx, Ex");

             nextop = F8;

             GETGX(v0, 1);

-            d1 = fpu_get_scratch(dyn);

+            d1 = fpu_get_scratch(dyn, ninst);

             GETEXSD(v1, 0, 0);

             if(!box64_dynarec_fastnan) {

-                d0 = fpu_get_scratch(dyn);

-                q0 = fpu_get_scratch(dyn);

+                d0 = fpu_get_scratch(dyn, ninst);

+                q0 = fpu_get_scratch(dyn, ninst);

                 // check if any input value was NAN

                 FMAXD(d0, v0, v1);      // propagate NAN

                 FCMEQD(d0, d0, d0);     // 0 if NAN, 1 if not NAN

@@ -342,7 +342,7 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             GETEXSD(v1, 0, 0);

             // MAXSD: if any input is NaN, or Ex[0]>Gx[0], copy Ex[0] -> Gx[0]

             #if 0

-            d0 = fpu_get_scratch(dyn);

+            d0 = fpu_get_scratch(dyn, ninst);

             FMAXNMD(d0, v0, v1);    // NaN handling may be slightly different, is that a problem?

             VMOVeD(v0, 0, d0, 0);   // to not erase uper part

             #else

@@ -361,7 +361,7 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             u8 = F8;

             if(u8==0b00000000 || u8==0b01010101 || u8==0b10101010 || u8==0b11111111) {

                 if(v0==v1) {

-                    d0 = fpu_get_scratch(dyn);

+                    d0 = fpu_get_scratch(dyn, ninst);

                     VMOVQ(d0, v1);

                 }

                 VDUP_16(v0, v1, u8&3);

@@ -375,7 +375,7 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                     u64 |= ((uint64_t)((u8>>(i*2))&3)*2+1)<<(i*16+8);

                 }

                 MOV64x(x2, u64);

-                d0 = fpu_get_scratch(dyn);

+                d0 = fpu_get_scratch(dyn, ninst);

                 VMOVQDfrom(d0, 0, x2);

                 VTBL1_8(d0, v1, d0);

                 VMOVeD(v0, 0, d0, 0);

@@ -393,7 +393,7 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                 v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0);

             } else {

                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, &unscaled, 0xfff<<4, 15, rex, NULL, 0, 0);

-                v1 = fpu_get_scratch(dyn);

+                v1 = fpu_get_scratch(dyn, ninst);

                 VLD128(v1, ed, fixedaddress);

             }

             VFADDPQS(v0, v0, v1);

@@ -406,10 +406,10 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                 v1 = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0);

             } else {

                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, &unscaled, 0xfff<<4, 15, rex, NULL, 0, 0);

-                v1 = fpu_get_scratch(dyn);

+                v1 = fpu_get_scratch(dyn, ninst);

                 VLD128(v1, ed, fixedaddress);

             }

-            d0 = fpu_get_scratch(dyn);

+            d0 = fpu_get_scratch(dyn, ninst);

             VUZP1Q_32(d0, v0, v1);

             VUZP2Q_32(v0, v0, v1);

             VFSUBQS(v0, d0, v0);

@@ -439,7 +439,7 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             nextop = F8;

             GETGX(v0, 1);

             GETEX(v1, 0, 0);

-            q0 = fpu_get_scratch(dyn);

+            q0 = fpu_get_scratch(dyn, ninst);

             static float addsubps[4] = {-1.f, 1.f, -1.f, 1.f};

             MAYUSE(addsubps);

             TABLE64(x2, (uintptr_t)&addsubps);

@@ -472,7 +472,7 @@ uintptr_t dynarec64_F20F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                 BFCw(x5, FPSR_IOC, 1);   // reset IOC bit

                 MSR_fpsr(x5);

                 ORRw_mask(x4, xZR, 1, 0);    //0x80000000

-                d0 = fpu_get_scratch(dyn);

+                d0 = fpu_get_scratch(dyn, ninst);

                 for(int i=0; i<2; ++i) {

                     BFCw(x5, FPSR_IOC, 1);   // reset IOC bit

                     MSR_fpsr(x5);

diff --git a/src/dynarec/arm64/dynarec_arm64_f30f.c b/src/dynarec/arm64/dynarec_arm64_f30f.c
index c4ccbff4..079cd0bc 100644
--- a/src/dynarec/arm64/dynarec_arm64_f30f.c
+++ b/src/dynarec/arm64/dynarec_arm64_f30f.c
@@ -84,7 +84,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             } else {

                 SMREAD();

                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, &unscaled, 0xfff<<4, 15, rex, NULL, 0, 0);

-                q1 = fpu_get_scratch(dyn);

+                q1 = fpu_get_scratch(dyn, ninst);

                 VLD128(q1, ed, fixedaddress);

             }

             GETGX_empty(q0);

@@ -99,7 +99,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             } else {

                 SMREAD();

                 addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, &unscaled, 0xfff<<4, 15, rex, NULL, 0, 0);

-                q1 = fpu_get_scratch(dyn);

+                q1 = fpu_get_scratch(dyn, ninst);

                 VLD128(q1, ed, fixedaddress);

             }

             GETGX_empty(q0);

@@ -117,7 +117,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             nextop = F8;

             GETGX(v0, 1);

             GETED(0);

-            d1 = fpu_get_scratch(dyn);

+            d1 = fpu_get_scratch(dyn, ninst);

             if(rex.w) {

                 SCVTFSx(d1, ed);

             } else {

@@ -158,7 +158,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                 MSR_fpsr(x5);

             }

             u8 = sse_setround(dyn, ninst, x1, x2, x3);

-            d1 = fpu_get_scratch(dyn);

+            d1 = fpu_get_scratch(dyn, ninst);

             FRINTIS(d1, q0);

             x87_restoreround(dyn, ninst, u8);

             FCVTZSxwS(gd, d1);

@@ -176,7 +176,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             INST_NAME("SQRTSS Gx, Ex");

             nextop = F8;

             GETGX(v0, 1);

-            d1 = fpu_get_scratch(dyn);

+            d1 = fpu_get_scratch(dyn, ninst);

             GETEXSS(d0, 0, 0);

             FSQRTS(d1, d0);

             VMOVeS(v0, 0, d1, 0);

@@ -186,8 +186,8 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             nextop = F8;

             GETGX(v0, 1);

             GETEXSS(v1, 0, 0);

-            d0 = fpu_get_scratch(dyn);

-            d1 = fpu_get_scratch(dyn);

+            d0 = fpu_get_scratch(dyn, ninst);

+            d1 = fpu_get_scratch(dyn, ninst);

             // so here: F32: Imm8 = abcd efgh that gives => aBbbbbbc defgh000 00000000 00000000

             // and want 1.0f = 0x3f800000

             // so 00111111 10000000 00000000 00000000

@@ -203,7 +203,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             nextop = F8;

             GETGX(v0, 1);

             GETEXSS(v1, 0, 0);

-            d0 = fpu_get_scratch(dyn);

+            d0 = fpu_get_scratch(dyn, ninst);

             FMOVS_8(d0, 0b01110000);    //1.0f

             FDIVS(d0, d0, v1);

             VMOVeS(v0, 0, d0, 0);

@@ -213,7 +213,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             INST_NAME("ADDSS Gx, Ex");

             nextop = F8;

             GETGX(v0, 1);

-            d1 = fpu_get_scratch(dyn);

+            d1 = fpu_get_scratch(dyn, ninst);

             GETEXSS(d0, 0, 0);

             FADDS(d1, v0, d0);  // the high part of the vector is erased...

             VMOVeS(v0, 0, d1, 0);

@@ -222,7 +222,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             INST_NAME("MULSS Gx, Ex");

             nextop = F8;

             GETGX(v0, 1);

-            d1 = fpu_get_scratch(dyn);

+            d1 = fpu_get_scratch(dyn, ninst);

             GETEXSS(d0, 0, 0);

             FMULS(d1, v0, d0);

             VMOVeS(v0, 0, d1, 0);

@@ -232,7 +232,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             nextop = F8;

             GETGX(v0, 1);

             GETEXSS(v1, 0, 0);

-            d0 = fpu_get_scratch(dyn);

+            d0 = fpu_get_scratch(dyn, ninst);

             FCVT_D_S(d0, v1);

             VMOVeD(v0, 0, d0, 0);

             break;

@@ -248,7 +248,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
                 BFCw(x5, FPSR_IOC, 1);   // reset IOC bit

                 MSR_fpsr(x5);

                 ORRw_mask(x4, xZR, 1, 0);    //0x80000000

-                d0 = fpu_get_scratch(dyn);

+                d0 = fpu_get_scratch(dyn, ninst);

                 for(int i=0; i<4; ++i) {

                     BFCw(x5, FPSR_IOC, 1);   // reset IOC bit

                     MSR_fpsr(x5);

@@ -266,7 +266,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             INST_NAME("SUBSS Gx, Ex");

             nextop = F8;

             GETGX(v0, 1);

-            d1 = fpu_get_scratch(dyn);

+            d1 = fpu_get_scratch(dyn, ninst);

             GETEXSS(d0, 0, 0);

             FSUBS(d1, v0, d0);

             VMOVeS(v0, 0, d1, 0);

@@ -278,7 +278,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             GETEXSS(v1, 0, 0);

             // MINSS: if any input is NaN, or Ex[0]<Gx[0], copy Ex[0] -> Gx[0]

             #if 0

-            d0 = fpu_get_scratch(dyn);

+            d0 = fpu_get_scratch(dyn, ninst);

             FMINNMS(d0, v0, v1);    // NaN handling may be slightly different, is that a problem?

             VMOVeS(v0, 0, d0, 0);   // to not erase uper part

             #else

@@ -291,7 +291,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             INST_NAME("DIVSS Gx, Ex");

             nextop = F8;

             GETGX(v0, 1);

-            d1 = fpu_get_scratch(dyn);

+            d1 = fpu_get_scratch(dyn, ninst);

             GETEXSS(d0, 0, 0);

             FDIVS(d1, v0, d0);

             VMOVeS(v0, 0, d1, 0);

@@ -303,7 +303,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             GETEXSS(v1, 0, 0);

             // MAXSS: if any input is NaN, or Ex[0]>Gx[0], copy Ex[0] -> Gx[0]

             #if 0

-            d0 = fpu_get_scratch(dyn);

+            d0 = fpu_get_scratch(dyn, ninst);

             FMAXNMS(d0, v0, v1);    // NaN handling may be slightly different, is that a problem?

             VMOVeS(v0, 0, d0, 0);   // to not erase uper part

             #else

@@ -333,7 +333,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             GETEX(v1, 0, 1) ;

             GETGX(v0, 1);

             u8 = F8;

-            d0 = fpu_get_scratch(dyn);

+            d0 = fpu_get_scratch(dyn, ninst);

             if (u8 == 0b00000000 || u8 == 0b01010101 || u8 == 0b10101010 || u8 == 0b11111111) {

                 VDUPQ_16(d0, v1, (u8 & 3) + 4);

             } else {

@@ -386,7 +386,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             SETFLAGS(X_ALL, SF_SET);

             SET_DFNONE(x1);

             nextop = F8;

-            v1 = fpu_get_scratch(dyn);

+            v1 = fpu_get_scratch(dyn, ninst);

             GETGD;

             if(MODREG) {

                 GETED(0);

@@ -489,7 +489,7 @@ uintptr_t dynarec64_F30F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
             nextop = F8;

             GETEXSD(v1, 0, 0);

             GETGX_empty(v0);

-            d0 = fpu_get_scratch(dyn);

+            d0 = fpu_get_scratch(dyn, ninst);

             SXTL_32(v0, v1);

             SCVTQFD(v0, v0);    // there is only I64 -> Double vector conversion, not from i32

             break;

diff --git a/src/dynarec/arm64/dynarec_arm64_functions.c b/src/dynarec/arm64/dynarec_arm64_functions.c
index 7c4bac22..3d23bd00 100644
--- a/src/dynarec/arm64/dynarec_arm64_functions.c
+++ b/src/dynarec/arm64/dynarec_arm64_functions.c
@@ -32,9 +32,15 @@
 #define EMM0    8
 
 // Get a FPU scratch reg
-int fpu_get_scratch(dynarec_arm_t* dyn)
+int fpu_get_scratch(dynarec_arm_t* dyn, int ninst)
 {
-    return SCRATCH0 + dyn->n.fpu_scratch++;  // return an Sx
+    int ret = SCRATCH0 + dyn->n.fpu_scratch++;
+    if(dyn->n.neoncache[ret].t==NEON_CACHE_YMMR || dyn->n.neoncache[ret].t==NEON_CACHE_YMMW) {
+        // should only happen in step 0...
+        dyn->insts[ninst].purge_ymm |= (1<<dyn->n.neoncache[ret].n); // mark as purged
+        dyn->n.neoncache[ret].v = 0; // reset it
+    }
+    return ret;
 }
 // Reset scratch regs counter
 void fpu_reset_scratch(dynarec_arm_t* dyn)
@@ -42,10 +48,15 @@ void fpu_reset_scratch(dynarec_arm_t* dyn)
     dyn->n.fpu_scratch = 0;
 }
 // Get a x87 double reg
-int fpu_get_reg_x87(dynarec_arm_t* dyn, int t, int n)
+int fpu_get_reg_x87(dynarec_arm_t* dyn, int ninst, int t, int n)
 {
     int i=X870;
     while (dyn->n.fpuused[i]) ++i;
+    if(dyn->n.neoncache[i].t==NEON_CACHE_YMMR || dyn->n.neoncache[i].t==NEON_CACHE_YMMW) {
+        // should only happens in step 0...
+        dyn->insts[ninst].purge_ymm |= (1<<dyn->n.neoncache[i].n); // mark as purged
+        dyn->n.neoncache[i].v = 0; // reset it
+    }
     dyn->n.fpuused[i] = 1;
     dyn->n.neoncache[i].n = n;
     dyn->n.neoncache[i].t = t;
@@ -61,13 +72,19 @@ void fpu_free_reg(dynarec_arm_t* dyn, int reg)
         dyn->n.neoncache[reg].v = 0;
 }
 // Get an MMX double reg
-int fpu_get_reg_emm(dynarec_arm_t* dyn, int emm)
+int fpu_get_reg_emm(dynarec_arm_t* dyn, int ninst, int emm)
 {
-    dyn->n.fpuused[EMM0 + emm] = 1;
-    dyn->n.neoncache[EMM0 + emm].t = NEON_CACHE_MM;
-    dyn->n.neoncache[EMM0 + emm].n = emm;
-    dyn->n.news |= (1<<(EMM0 + emm));
-    return EMM0 + emm;
+    int ret = EMM0 + emm;
+    if(dyn->n.neoncache[ret].t==NEON_CACHE_YMMR || dyn->n.neoncache[ret].t==NEON_CACHE_YMMW) {
+        // should only happen in step 0...
+        dyn->insts[ninst].purge_ymm |= (1<<dyn->n.neoncache[ret].n); // mark as purged
+        dyn->n.neoncache[ret].v = 0; // reset it
+    }
+    dyn->n.fpuused[ret] = 1;
+    dyn->n.neoncache[ret].t = NEON_CACHE_MM;
+    dyn->n.neoncache[ret].n = emm;
+    dyn->n.news |= (1<<(ret));
+    return ret;
 }
 // Get an XMM quad reg
 int fpu_get_reg_xmm(dynarec_arm_t* dyn, int t, int xmm)
@@ -84,6 +101,77 @@ int fpu_get_reg_xmm(dynarec_arm_t* dyn, int t, int xmm)
     dyn->n.news |= (1<<i);
     return i;
 }
+static int internal_mark_ymm(dynarec_arm_t* dyn, int t, int ymm, int reg)
+{
+    if(dyn->n.neoncache[reg].t==NEON_CACHE_YMMR || dyn->n.neoncache[reg].t==NEON_CACHE_YMMW) {
+        if(dyn->n.neoncache[reg].n == ymm) {
+            // already there!
+            if(t==NEON_CACHE_YMMW)
+                dyn->n.neoncache[reg].t=t;
+            return reg;
+        }
+        return -1;
+    } else {
+        // found a slot!
+        dyn->n.neoncache[reg].t=t;
+        dyn->n.neoncache[reg].n=ymm;
+        return reg;
+    }
+    return -1;
+}
+static int is_ymm_to_keep(dynarec_arm_t* dyn, int reg, int k1, int k2, int k3)
+{
+    if(k1!=-1 && dyn->n.neoncache[reg].n==k1)
+        return 1;
+    if(k2!=-1 && dyn->n.neoncache[reg].n==k2)
+        return 1;
+    if(k3!=-1 && dyn->n.neoncache[reg].n==k3)
+        return 1;
+    return 0;
+}
+// Get an YMM quad reg, while preserving up to 3 other YMM regs
+int fpu_get_reg_ymm(dynarec_arm_t* dyn, int ninst, int t, int ymm, int k1, int k2, int k3)
+{
+    int i = EMM0;
+    // first pass see if a slot is free in EMM/x87 slots
+    for(int j=0; j<8; ++j) {
+        if(!dyn->n.fpuused[i+j]) {
+            int ret = internal_mark_ymm(dyn, t, ymm, i+j);
+            if(ret>=0) return ret;
+        }
+    }
+    // no slot in the emm space, look for scratch space in reverse
+    i = SCRATCH0;
+    for(int j=7; j>=dyn->n.fpu_scratch; --j) {
+        int ret = internal_mark_ymm(dyn, t, ymm, i+j);
+        if(ret>=0) return ret;
+    }
+    // no free slot, needs to purge a value... First loop on the YMMR, they are easier to purge
+    i = EMM0;
+    int keep = 0;
+    for(int j=0; j<8; ++j) {
+        if(!dyn->n.fpuused[i+j]) {
+            // should a test be done to check if ymm is already in the purge list?
+            if(!is_ymm_to_keep(dyn, i+j, k1, k2, k3) && dyn->n.neoncache[i+j].t==NEON_CACHE_YMMR) {
+                dyn->insts[ninst].purge_ymm |= 1<<dyn->n.neoncache[i+j].n;
+                dyn->n.neoncache[i+j].v = 0;
+                return internal_mark_ymm(dyn, t, ymm, i+j);
+            }
+        }
+    }
+    // make space in the scratch area
+    i = SCRATCH0;
+    for(int j=dyn->n.fpu_scratch; j<8; ++j) {
+            // should a test be done to check if ymm is already in the purge list?
+            if(!is_ymm_to_keep(dyn, i+j, k1, k2, k3)) {
+                dyn->insts[ninst].purge_ymm |= 1<<dyn->n.neoncache[i+j].n;
+                dyn->n.neoncache[i+j].v = 0;
+                return internal_mark_ymm(dyn, t, ymm, i+j);
+            }
+    }
+    printf_log(LOG_NONE, "BOX64 Dynarec: Error, unable to free a reg for YMM %d at inst=%d\n", ymm, ninst);
+    return i;
+}
 // Reset fpu regs counter
 static void fpu_reset_reg_neoncache(neoncache_t* n)
 {
@@ -343,10 +431,11 @@ int fpuCacheNeedsTransform(dynarec_arm_t* dyn, int ninst) {
         return ((dyn->insts[ninst].x64.barrier&BARRIER_FLOAT))?0:(isCacheEmpty(dyn, ninst)?0:1);
     int ret = 0;
     if(!i2) { // just purge
-        if(dyn->insts[ninst].n.stack_next) {
+        if(dyn->insts[ninst].n.stack_next)
             return 1;
-        }
-        for(int i=0; i<24 && !ret; ++i)
+        if(dyn->insts[ninst].ymm_zero)
+            return 1;
+        for(int i=0; i<32 && !ret; ++i)
             if(dyn->insts[ninst].n.neoncache[i].v) {       // there is something at ninst for i
                 if(!(
                 (dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_F
@@ -361,10 +450,12 @@ int fpuCacheNeedsTransform(dynarec_arm_t* dyn, int ninst) {
     if(dyn->insts[ninst].n.stack_next != dyn->insts[i2].n.stack-dyn->insts[i2].n.stack_push) {
         return 1;
     }
+    if(dyn->insts[ninst].ymm_zero && (dyn->insts[ninst].ymm_zero&~dyn->insts[i2].ymm_zero))
+        return 1;
     neoncache_t cache_i2 = dyn->insts[i2].n;
     neoncacheUnwind(&cache_i2);
 
-    for(int i=0; i<24; ++i) {
+    for(int i=0; i<32; ++i) {
         if(dyn->insts[ninst].n.neoncache[i].v) {       // there is something at ninst for i
             if(!cache_i2.neoncache[i].v) {    // but there is nothing at i2 for i
                 ret = 1;
@@ -374,6 +465,8 @@ int fpuCacheNeedsTransform(dynarec_arm_t* dyn, int ninst) {
                 }
                 else if(dyn->insts[ninst].n.neoncache[i].t == NEON_CACHE_XMMR && cache_i2.neoncache[i].t == NEON_CACHE_XMMW)
                     {/* nothing */ }
+                else if(dyn->insts[ninst].n.neoncache[i].t == NEON_CACHE_YMMR && cache_i2.neoncache[i].t == NEON_CACHE_YMMW)
+                    {/* nothing */ }
                 else
                     ret = 1;
             }
@@ -648,6 +741,9 @@ static void sse_reset(neoncache_t* n)
 {
     for (int i=0; i<16; ++i)
         n->ssecache[i].v = -1;
+    for (int i=0; i<32; ++i)
+        if(n->neoncache[i].t==NEON_CACHE_YMMR || n->neoncache[i].t==NEON_CACHE_YMMW)
+            n->neoncache[i].v = 0;
 }
 
 void fpu_reset(dynarec_arm_t* dyn)
diff --git a/src/dynarec/arm64/dynarec_arm64_functions.h b/src/dynarec/arm64/dynarec_arm64_functions.h
index abe827bb..7da65897 100644
--- a/src/dynarec/arm64/dynarec_arm64_functions.h
+++ b/src/dynarec/arm64/dynarec_arm64_functions.h
@@ -6,15 +6,17 @@
 #define SCRATCH0    24
 
 // Get an FPU scratch reg
-int fpu_get_scratch(dynarec_arm_t* dyn);
+int fpu_get_scratch(dynarec_arm_t* dyn, int ninst);
 // Reset scratch regs counter
 void fpu_reset_scratch(dynarec_arm_t* dyn);
 // Get an x87 double reg
-int fpu_get_reg_x87(dynarec_arm_t* dyn, int t, int n);
+int fpu_get_reg_x87(dynarec_arm_t* dyn, int ninst, int t, int n);
 // Get an MMX double reg
-int fpu_get_reg_emm(dynarec_arm_t* dyn, int emm);
+int fpu_get_reg_emm(dynarec_arm_t* dyn, int ninst, int emm);
 // Get an XMM quad reg
 int fpu_get_reg_xmm(dynarec_arm_t* dyn, int t, int xmm);
+// Get a YMM upper quad reg, while keeping up to 3 other YMM regs (-1 for none to keep)
+int fpu_get_reg_ymm(dynarec_arm_t* dyn, int ninst, int t, int ymm, int k1, int k2, int k3);
 // Free a FPU/MMX/XMM reg
 void fpu_free_reg(dynarec_arm_t* dyn, int reg);
 // Reset fpu regs counter
diff --git a/src/dynarec/arm64/dynarec_arm64_helper.c b/src/dynarec/arm64/dynarec_arm64_helper.c
index 5e406588..cdf0eeb6 100644
--- a/src/dynarec/arm64/dynarec_arm64_helper.c
+++ b/src/dynarec/arm64/dynarec_arm64_helper.c
@@ -958,7 +958,7 @@ int x87_do_push(dynarec_arm_t* dyn, int ninst, int s1, int t)
             ++dyn->n.x87cache[i];
         else if(ret==-1) {
             dyn->n.x87cache[i] = 0;
-            ret=dyn->n.x87reg[i]=fpu_get_reg_x87(dyn, t, 0);
+            ret=dyn->n.x87reg[i]=fpu_get_reg_x87(dyn, ninst, t, 0);
             dyn->n.neoncache[ret].t = X87_ST0;
         }
     }
@@ -1274,7 +1274,7 @@ int x87_get_cache(dynarec_arm_t* dyn, int ninst, int populate, int s1, int s2, i
             ret = i;
     // found, setup and grab the value
     dyn->n.x87cache[ret] = st;
-    dyn->n.x87reg[ret] = fpu_get_reg_x87(dyn, NEON_CACHE_ST_D, st);
+    dyn->n.x87reg[ret] = fpu_get_reg_x87(dyn, ninst, NEON_CACHE_ST_D, st);
     if(populate) {
         ADDx_U12(s1, xEmu, offsetof(x64emu_t, x87));
         LDRw_U12(s2, xEmu, offsetof(x64emu_t, top));
@@ -1394,7 +1394,7 @@ void x87_reget_st(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st)
             ret = i;
     // found, setup and grab the value
     dyn->n.x87cache[ret] = st;
-    dyn->n.x87reg[ret] = fpu_get_reg_x87(dyn, NEON_CACHE_ST_D, st);
+    dyn->n.x87reg[ret] = fpu_get_reg_x87(dyn, ninst, NEON_CACHE_ST_D, st);
     ADDx_U12(s1, xEmu, offsetof(x64emu_t, x87));
     LDRw_U12(s2, xEmu, offsetof(x64emu_t, top));
     int a = st - dyn->n.x87stack;
@@ -1544,7 +1544,7 @@ int mmx_get_reg(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3, int a)
     if(dyn->n.mmxcache[a]!=-1)
         return dyn->n.mmxcache[a];
     ++dyn->n.mmxcount;
-    int ret = dyn->n.mmxcache[a] = fpu_get_reg_emm(dyn, a);
+    int ret = dyn->n.mmxcache[a] = fpu_get_reg_emm(dyn, ninst, a);
     VLDR64_U12(ret, xEmu, offsetof(x64emu_t, mmx[a]));
     return ret;
 }
@@ -1556,7 +1556,7 @@ int mmx_get_reg_empty(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3, int
     if(dyn->n.mmxcache[a]!=-1)
         return dyn->n.mmxcache[a];
     ++dyn->n.mmxcount;
-    int ret = dyn->n.mmxcache[a] = fpu_get_reg_emm(dyn, a);
+    int ret = dyn->n.mmxcache[a] = fpu_get_reg_emm(dyn, ninst, a);
     return ret;
 }
 // purge the MMX cache only(needs 3 scratch registers)
@@ -1679,12 +1679,30 @@ static void sse_purgecache(dynarec_arm_t* dyn, int ninst, int next, int s1)
             MESSAGE(LOG_DUMP, "\tPurge %sSSE Cache ------\n", next?"locally ":"");
             ++old;
         }
+        int s1_set = 0;
         for(int i=0; i<16; ++i)
-            if(is_avx_zero(dyn, ninst, i))
-                STPx_S7_offset(xZR, xZR, xEmu, offsetof(x64emu_t, ymm[i]));
+            if(is_avx_zero(dyn, ninst, i)) {
+                if(!s1_set) {
+                    ADDx_U12(s1, xEmu, offsetof(x64emu_t, ymm[0]));
+                    s1_set = 1;
+                }
+                STPx_S7_offset(xZR, xZR, s1, i*16);
+            }
         if(!next)
             avx_mark_zero_reset(dyn, ninst);
     }
+    for(int i=0; i<32; ++i) {
+        if(dyn->n.neoncache[i].t==NEON_CACHE_YMMW) {
+            if (old==-1) {
+                MESSAGE(LOG_DUMP, "\tPurge %sSSE Cache ------\n", next?"locally ":"");
+                ++old;
+            }
+            VSTR128_U12(i, xEmu, offsetof(x64emu_t, ymm[dyn->n.neoncache[i].n]));
+        }
+        if(!next && (dyn->n.neoncache[i].t==NEON_CACHE_YMMW || dyn->n.neoncache[i].t==NEON_CACHE_YMMR))
+            dyn->n.neoncache[i].v = 0;
+    }
+    // All done
     if(old!=-1) {
         MESSAGE(LOG_DUMP, "\t------ Purge SSE Cache\n");
     }
@@ -1697,16 +1715,30 @@ static void sse_reflectcache(dynarec_arm_t* dyn, int ninst, int s1)
             VSTR128_U12(dyn->n.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
         }
     //AVX
-    if(dyn->ymm_zero)
+    if(dyn->ymm_zero) {
+        int s1_set = 0;
         for(int i=0; i<16; ++i)
-            if(is_avx_zero(dyn, ninst, i))
-                STPx_S7_offset(xZR, xZR, xEmu, offsetof(x64emu_t, ymm[i]));
+            if(is_avx_zero(dyn, ninst, i)) {
+                if(!s1_set) {
+                    ADDx_U12(s1, xEmu, offsetof(x64emu_t, ymm[0]));
+                    s1_set = 1;
+                }
+                STPx_S7_offset(xZR, xZR, s1, i*16);
+            }
+    }
 }
 
 void sse_reflect_reg(dynarec_arm_t* dyn, int ninst, int a)
 {
-    if(is_avx_zero(dyn, ninst, a))
-        STPx_S7_offset(xZR, xZR, xEmu, offsetof(x64emu_t, ymm[a]));
+    if(is_avx_zero(dyn, ninst, a)) {
+        // only ymm[0]'s offset fits within STP's limited immediate range :(
+        if(!a)
+            STPx_S7_offset(xZR, xZR, xEmu, offsetof(x64emu_t, ymm[a]));
+        else {
+            STRx_U12(xZR, xEmu, offsetof(x64emu_t, ymm[a]));
+            STRx_U12(xZR, xEmu, offsetof(x64emu_t, ymm[a])+8);
+        }
+    }
     if(dyn->n.ssecache[a].v==-1)
         return;
     if(dyn->n.neoncache[dyn->n.ssecache[a].reg].t == NEON_CACHE_XMMW) {
@@ -1716,24 +1748,82 @@ void sse_reflect_reg(dynarec_arm_t* dyn, int ninst, int a)
     }
 }
 
+// AVX Helpers
+// get neon register for a YMM upper reg, create the entry if needed
+int ymm_get_reg(dynarec_arm_t* dyn, int ninst, int s1, int a, int forwrite, int k1, int k2, int k3)
+{
+    // look if already exist
+    for(int i=0; i<32; ++i)
+        if((dyn->n.neoncache[i].t==NEON_CACHE_YMMR || dyn->n.neoncache[i].t==NEON_CACHE_YMMW) && dyn->n.neoncache[i].n==a) {
+            if(forwrite) {
+                dyn->n.neoncache[i].t = NEON_CACHE_YMMW;
+                dyn->ymm_zero&=~(1<<a);
+            }
+            return i;
+        }
+    // nope, grab a new one
+    int ret =  fpu_get_reg_ymm(dyn, ninst, forwrite?NEON_CACHE_YMMW:NEON_CACHE_YMMR, a, k1, k2, k3);
+    if(dyn->ymm_zero&(1<<a)) {
+        VEORQ(ret, ret, ret);
+        if(forwrite)
+            dyn->ymm_zero&=~(1<<a);
+    } else {
+        VLDR128_U12(ret, xEmu, offsetof(x64emu_t, ymm[a]));
+    }
+    return ret;
+}
+// get neon register for a YMM reg, but don't try to synch it if it needed to be created
+int ymm_get_reg_empty(dynarec_arm_t* dyn, int ninst, int s1, int a, int k1, int k2, int k3)
+{
+    // look if already exist
+    for(int i=0; i<32; ++i)
+        if((dyn->n.neoncache[i].t==NEON_CACHE_YMMR || dyn->n.neoncache[i].t==NEON_CACHE_YMMW) && dyn->n.neoncache[i].n==a) {
+            dyn->n.neoncache[i].t = NEON_CACHE_YMMW;
+            dyn->ymm_zero&=~(1<<a);
+            return i;
+        }
+    // nope, grab a new one
+    int ret =  fpu_get_reg_ymm(dyn, ninst, NEON_CACHE_YMMW, a, k1, k2, k3);
+    if(dyn->ymm_zero&(1<<a))
+        dyn->ymm_zero&=~(1<<a);
+    return ret;
+}
+
+
 void fpu_pushcache(dynarec_arm_t* dyn, int ninst, int s1, int not07)
 {
     int start = not07?8:0;
     // only SSE regs needs to be push back to xEmu (needs to be "write")
     int n=0;
-    for (int i=start; i<16; i++)
+    for (int i=start; i<16; i++) {
         if((dyn->n.ssecache[i].v!=-1) && (dyn->n.ssecache[i].write))
             ++n;
+        if(is_avx_zero(dyn, ninst, i))
+            ++n;
+    }
+    for(int i=0; i<32; ++i)
+        if(dyn->n.neoncache[i].t==NEON_CACHE_YMMW)
+            ++n;
     if(!n)
         return;
     MESSAGE(LOG_DUMP, "\tPush XMM Cache (%d)------\n", n);
+    int s1_set = 0;
     for (int i=start; i<16; ++i) {
         if((dyn->n.ssecache[i].v!=-1) && (dyn->n.ssecache[i].write)) {
             VSTR128_U12(dyn->n.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
         }
-        if(is_avx_zero(dyn, ninst, i))
-            STPx_S7_offset(xZR, xZR, xEmu, offsetof(x64emu_t, ymm[i]));
+        if(is_avx_zero(dyn, ninst, i)) {
+            if(!s1_set) {
+                ADDx_U12(s1, xEmu, offsetof(x64emu_t, ymm[0]));
+                s1_set = 1;
+            }
+            STPx_S7_offset(xZR, xZR, s1, i*16);
+        }
     }
+    // purge the YMM values
+    for(int i=0; i<32; ++i)
+        if(dyn->n.neoncache[i].t==NEON_CACHE_YMMW)
+            VSTR128_U12(i, xEmu, offsetof(x64emu_t, ymm[dyn->n.neoncache[i].n]));
     MESSAGE(LOG_DUMP, "\t------- Push XMM Cache (%d)\n", n);
 }
 
@@ -1754,6 +1844,9 @@ void fpu_popcache(dynarec_arm_t* dyn, int ninst, int s1, int not07)
             /*dyn->n.ssecache[i].write = 0;   // OPTIM: it's sync, so not write anymore
             dyn->n.neoncache[dyn->n.ssecache[i].reg].t = NEON_CACHE_XMMR;*/
         }
+    for(int i=0; i<32; ++i)
+        if(dyn->n.neoncache[i].t==NEON_CACHE_YMMW)
+            VLDR128_U12(i, xEmu, offsetof(x64emu_t, ymm[dyn->n.neoncache[i].n]));
     MESSAGE(LOG_DUMP, "\t------- Pop XMM Cache (%d)\n", n);
 }
 
@@ -1988,7 +2081,7 @@ static void unloadCache(dynarec_arm_t* dyn, int ninst, int stack_cnt, int s1, in
 
 static void fpuCacheTransform(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3)
 {
-#if STEP > 1
+#if 1//STEP > 1
     int i2 = dyn->insts[ninst].x64.jmp_insts;
     if(i2<0)
         return;
@@ -2101,6 +2194,19 @@ static void fpuCacheTransform(dynarec_arm_t* dyn, int ninst, int s1, int s2, int
             }
         }
     }
+    // ymm0
+    s3_top = 1;
+    if(dyn->ymm_zero && (dyn->ymm_zero&~dyn->insts[i2].ymm_zero)) {
+        for(int i=0; i<16; ++i)
+            if(dyn->insts[i2].purge_ymm&(1<<i))
+                if(is_avx_zero(dyn, ninst, i)) {
+                    if(s3_top) {
+                        ADDx_U12(s3, xEmu,offsetof(x64emu_t, ymm[0]));
+                        s3_top = 0;
+                    }
+                    STPx_S7_offset(xZR, xZR, s3, i*16);
+                }
+    }
     if(stack_cnt != cache_i2.stack) {
         MESSAGE(LOG_DUMP, "\t    - adjust stack count %d -> %d -\n", stack_cnt, cache_i2.stack);
         int a = stack_cnt - cache_i2.stack;
@@ -2355,13 +2461,29 @@ void fpu_propagate_stack(dynarec_arm_t* dyn, int ninst)
     dyn->n.swapped = 0;
 }
 
-void avx_purge_ymm0(dynarec_arm_t* dyn, int ninst)
+void avx_purge_ymm(dynarec_arm_t* dyn, int ninst, int s1)
 {
-    if(box64_dynarec_dump) dynarec_log(LOG_NONE, "Purge YMM Zero mask=%04x --------\n", dyn->insts[ninst].purge_ymm0);
+    if(box64_dynarec_dump) dynarec_log(LOG_NONE, "Purge YMM mask=%04x --------\n", dyn->insts[ninst].purge_ymm);
+    int s1_set = 0;
     for(int i=0; i<16; ++i)
-        if(dyn->insts[ninst].purge_ymm0&(1<<i) && is_avx_zero(dyn, ninst, i)) {
-            STPx_S7_offset(xZR, xZR, xEmu, offsetof(x64emu_t, ymm[i]));
-            avx_unmark_zero(dyn, ninst, i);
+        if(dyn->insts[ninst].purge_ymm&(1<<i)) {
+            if(is_avx_zero(dyn, ninst, i)) {
+                if(!s1_set) {
+                    ADDx_U12(s1, xEmu, offsetof(x64emu_t, ymm[0]));
+                    s1_set = 1;
+                }
+                STPx_S7_offset(xZR, xZR, s1, i*16);
+                avx_unmark_zero(dyn, ninst, i);
+            }
+            int reg = -1;
+            for(int j=0; j<32; ++j)
+                if(dyn->n.neoncache[j].t==NEON_CACHE_YMMR && dyn->n.neoncache[j].n==i) {
+                    // just forget the reg....
+                    dyn->n.neoncache[j].v = 0;
+                } else if(dyn->n.neoncache[j].t==NEON_CACHE_YMMW && dyn->n.neoncache[j].n==i) {
+                    VSTR128_U12(j, xEmu, offsetof(x64emu_t, ymm[i]));
+                    dyn->n.neoncache[j].v = 0;
+                }
         }
-    if(box64_dynarec_dump) dynarec_log(LOG_NONE, "---------- Purge YMM Zero\n");
+    if(box64_dynarec_dump) dynarec_log(LOG_NONE, "---------- Purge YMM\n");
 }
\ No newline at end of file
diff --git a/src/dynarec/arm64/dynarec_arm64_helper.h b/src/dynarec/arm64/dynarec_arm64_helper.h
index 36950291..fc06d358 100644
--- a/src/dynarec/arm64/dynarec_arm64_helper.h
+++ b/src/dynarec/arm64/dynarec_arm64_helper.h
@@ -457,6 +457,10 @@
 #define GETVX_empty(a)                  \
     a = sse_get_reg_empty(dyn, ninst, x1, vex.v)
 
+#define GETGY_empty_VY(a, b, w2, k1, k2)                    \
+    b = ymm_get_reg(dyn, ninst, x1, vex.v, w2, gd, k1, k2); \
+    a = ymm_get_reg_empty(dyn, ninst, x1, gd, vex.v, k1, k2)
+
 // Get EX as a quad, (x1 is used)
 #define GETEX(a, w, D)                                                                                  \
     if(MODREG) {                                                                                        \
@@ -464,7 +468,7 @@
     } else {                                                                                            \
         if(w) {WILLWRITE2();} else {SMREAD();}                                                          \
         addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, &unscaled, 0xfff<<4, 15, rex, NULL, 0, D);  \
-        a = fpu_get_scratch(dyn);                                                                       \
+        a = fpu_get_scratch(dyn, ninst);                                                                \
         VLD128(a, ed, fixedaddress);                                                                    \
     }
 
@@ -482,7 +486,7 @@
         a = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), w);                                      \
     } else {                                                                                            \
         if(w) {WILLWRITE2();} else {SMREAD();}                                                          \
-        a = fpu_get_scratch(dyn);                                                                       \
+        a = fpu_get_scratch(dyn, ninst);                                                                \
         addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, &unscaled, 0xfff<<3, 7, rex, NULL, 0, D);   \
         VLD64(a, ed, fixedaddress);                                                                     \
     }
@@ -496,7 +500,7 @@
         a = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), w);                                      \
     } else {                                                                                            \
         if(w) {WILLWRITE2();} else {SMREAD();}                                                          \
-        a = fpu_get_scratch(dyn);                                                                       \
+        a = fpu_get_scratch(dyn, ninst);                                                                \
         addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, D);   \
         VLD32(a, ed, fixedaddress);                                                                     \
     }
@@ -510,7 +514,7 @@
         a = sse_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), w);                                      \
     } else {                                                                                            \
         if(w) {WILLWRITE2();} else {SMREAD();}                                                          \
-        a = fpu_get_scratch(dyn);                                                                       \
+        a = fpu_get_scratch(dyn, ninst);                                                                \
         addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, &unscaled, 0xfff<<1, 1, rex, NULL, 0, D);   \
         VLD16(a, ed, fixedaddress);                                                                     \
     }
@@ -527,7 +531,7 @@
     } else {                                                    \
         SMREAD();                                               \
         addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, &unscaled, 0xfff<<3, 7, rex, NULL, 0, D); \
-        a = fpu_get_scratch(dyn);                               \
+        a = fpu_get_scratch(dyn, ninst);                        \
         VLD64(a, ed, fixedaddress);                             \
     }
 
@@ -1163,6 +1167,8 @@ void* arm64_next(x64emu_t* emu, uintptr_t addr);
 #define sse_forget_reg   STEPNAME(sse_forget_reg)
 #define sse_purge07cache STEPNAME(sse_purge07cache)
 #define sse_reflect_reg  STEPNAME(sse_reflect_reg)
+#define ymm_get_reg     STEPNAME(ymm_get_reg)
+#define ymm_get_reg_empty STEPNAME(ymm_get_reg_empty)
 
 #define fpu_pushcache   STEPNAME(fpu_pushcache)
 #define fpu_popcache    STEPNAME(fpu_popcache)
@@ -1173,7 +1179,7 @@ void* arm64_next(x64emu_t* emu, uintptr_t addr);
 #define x87_purgecache  STEPNAME(x87_purgecache)
 #define fpu_reflectcache STEPNAME(fpu_reflectcache)
 #define fpu_unreflectcache STEPNAME(fpu_unreflectcache)
-#define avx_purge_ymm0  STEPNAME(avx_purge_ymm0)
+#define avx_purge_ymm   STEPNAME(avx_purge_ymm)
 
 #define CacheTransform       STEPNAME(CacheTransform)
 
@@ -1334,8 +1340,8 @@ int x87_setround(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3);
 void x87_restoreround(dynarec_arm_t* dyn, int ninst, int s1);
 // Set rounding according to mxcsr flags, return reg to restore flags
 int sse_setround(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3);
-// purge ymm_zero mask according to purge_ymm0
-void avx_purge_ymm0(dynarec_arm_t* dyn, int ninst);
+// purge ymm_zero mask according to purge_ymm
+void avx_purge_ymm(dynarec_arm_t* dyn, int ninst, int s1);
 
 void CacheTransform(dynarec_arm_t* dyn, int ninst, int cacheupd, int s1, int s2, int s3);
 
@@ -1409,6 +1415,12 @@ void fpu_reflectcache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3);
 void fpu_unreflectcache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3);
 void fpu_pushcache(dynarec_arm_t* dyn, int ninst, int s1, int not07);
 void fpu_popcache(dynarec_arm_t* dyn, int ninst, int s1, int not07);
+// avx helpers
+// get neon register for a YMM reg, create the entry if needed
+int ymm_get_reg(dynarec_arm_t* dyn, int ninst, int s1, int a, int forwrite, int k1, int k2, int k3);
+// get neon register for a YMM reg, but don't try to sync it if it needed to be created
+int ymm_get_reg_empty(dynarec_arm_t* dyn, int ninst, int s1, int a, int k1, int k2, int k3);
+
 
 uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
 uintptr_t dynarec64_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int rep, int* ok, int* need_epilog);
@@ -1593,6 +1605,6 @@ uintptr_t dynarec64_AVX_0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int
         }                                       \
     }
 
-#define PURGE_YMM0()    avx_purge_ymm0(dyn, ninst)
+#define PURGE_YMM()    avx_purge_ymm(dyn, ninst, x1)
 
 #endif //__DYNAREC_ARM64_HELPER_H__
diff --git a/src/dynarec/arm64/dynarec_arm64_private.h b/src/dynarec/arm64/dynarec_arm64_private.h
index 2788ddc4..6a6647df 100644
--- a/src/dynarec/arm64/dynarec_arm64_private.h
+++ b/src/dynarec/arm64/dynarec_arm64_private.h
@@ -82,7 +82,7 @@ typedef struct instruction_arm64_s {
     uintptr_t           natcall;
     uint16_t            retn;
     uint16_t            ymm_zero;   // bitmap of ymm to zero at purge
-    uint16_t            purge_ymm0; // need to purge some ymm0 because of a loop
+    uint16_t            purge_ymm;  // need to purge some ymm
     uint8_t             barrier_maybe;
     uint8_t             will_write;
     uint8_t             last_write;
diff --git a/src/dynarec/dynarec_native.c b/src/dynarec/dynarec_native.c
index 4f509ac1..649ba1a1 100644
--- a/src/dynarec/dynarec_native.c
+++ b/src/dynarec/dynarec_native.c
@@ -559,7 +559,7 @@ void* FillBlock64(dynablock_t* block, uintptr_t addr, int alternate, int is32bit
                 if(helper.insts[i].ymm_zero || helper.insts[k].ymm_zero) {
                    // move to purge the regs that are present in k (jump to) but not in i (jump from)
                     uint16_t to_purge = helper.insts[k].ymm_zero & ~helper.insts[i].ymm_zero;
-                    helper.insts[k].purge_ymm0 |= to_purge;
+                    helper.insts[k].purge_ymm |= to_purge;
                     helper.insts[k].ymm_zero &= ~to_purge;
                 }
             }
diff --git a/src/dynarec/dynarec_native_pass.c b/src/dynarec/dynarec_native_pass.c
index cab03222..60c37b4b 100644
--- a/src/dynarec/dynarec_native_pass.c
+++ b/src/dynarec/dynarec_native_pass.c
@@ -80,8 +80,8 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr, int alternate, int
         }
         #endif
         fpu_propagate_stack(dyn, ninst);
-        if(dyn->insts[ninst].purge_ymm0)
-            PURGE_YMM0();
+        if(dyn->insts[ninst].purge_ymm)
+            PURGE_YMM();
         ip = addr;
         if (reset_n!=-1) {
             dyn->last_ip = 0;
diff --git a/src/dynarec/la64/dynarec_la64_helper.h b/src/dynarec/la64/dynarec_la64_helper.h
index def767b4..484c1324 100644
--- a/src/dynarec/la64/dynarec_la64_helper.h
+++ b/src/dynarec/la64/dynarec_la64_helper.h
@@ -1075,6 +1075,6 @@ uintptr_t dynarec64_F20F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
         }                                  \
     } while (0)
 
-#define PURGE_YMM0()    /* TODO */
+#define PURGE_YMM()    /* TODO */
 
 #endif //__DYNAREC_LA64_HELPER_H__
\ No newline at end of file
diff --git a/src/dynarec/la64/dynarec_la64_private.h b/src/dynarec/la64/dynarec_la64_private.h
index b31d3f2e..09b6698e 100644
--- a/src/dynarec/la64/dynarec_la64_private.h
+++ b/src/dynarec/la64/dynarec_la64_private.h
@@ -81,7 +81,7 @@ typedef struct instruction_la64_s {
     uintptr_t           natcall;
     uint16_t            retn;
     uint16_t            ymm_zero;   // bitmap of ymm to zero at purge
-    uint16_t            purge_ymm0; // need to purge some ymm0 because of a loop
+    uint16_t            purge_ymm;  // need to purge some ymm
     uint8_t             barrier_maybe;
     uint8_t             will_write;
     uint8_t             last_write;
diff --git a/src/dynarec/rv64/dynarec_rv64_helper.h b/src/dynarec/rv64/dynarec_rv64_helper.h
index 2471c71c..3935ee1c 100644
--- a/src/dynarec/rv64/dynarec_rv64_helper.h
+++ b/src/dynarec/rv64/dynarec_rv64_helper.h
@@ -1671,4 +1671,14 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
 
-#define PURGE_YMM0()    /* TODO */
+#define PURGE_YMM()    /* TODO */
 
+// reg = (reg < -32768) ? -32768 : ((reg > 32767) ? 32767 : reg)
+#define SAT16(reg, s)             \
+    LUI(s, 0xFFFF8); /* -32768 */ \
+    BGE(reg, s, 4 + 2 * 4);       \
+    MV(reg, s);                   \
+    J(4 + 4 * 3);                 \
+    LUI(s, 8); /* 32768 */        \
+    BLT(reg, s, 4 + 4);           \
+    ADDIW(reg, s, -1);
+
 #endif //__DYNAREC_RV64_HELPER_H__
diff --git a/src/dynarec/rv64/dynarec_rv64_private.h b/src/dynarec/rv64/dynarec_rv64_private.h
index dff6f84e..1ba830d5 100644
--- a/src/dynarec/rv64/dynarec_rv64_private.h
+++ b/src/dynarec/rv64/dynarec_rv64_private.h
@@ -91,7 +91,7 @@ typedef struct instruction_rv64_s {
     uintptr_t           natcall;
     uint16_t            retn;
     uint16_t            ymm_zero;   // bitmap of ymm to zero at purge
-    uint16_t            purge_ymm0; // need to purge some ymm0 because of a loop
+    uint16_t            purge_ymm;  // need to purge some ymm
     int                 barrier_maybe;
    flagcache_t         f_exit;     // flags status at end of instruction
    extcache_t          e;          // extcache at end of instruction (but before popping)