about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
author ptitSeb <sebastien.chev@gmail.com> 2021-03-21 09:29:31 +0100
committer ptitSeb <sebastien.chev@gmail.com> 2021-03-21 09:29:31 +0100
commit cc2597674307901efc8773e2ce1c757eb5ccd79c (patch)
tree ea6ed1ca80fc34c59c533edc9f0facaa14569fae /src
parent 6fea5e5854646f891fe601bc58aa9e33c596f8f5 (diff)
download box64-cc2597674307901efc8773e2ce1c757eb5ccd79c.tar.gz
box64-cc2597674307901efc8773e2ce1c757eb5ccd79c.zip
[DYNAREC] NEON infrastructure
Diffstat (limited to 'src')
-rwxr-xr-x src/dynarec/arm64_epilog.S 12
-rwxr-xr-x src/dynarec/arm64_prolog.S 6
-rwxr-xr-x src/dynarec/dynarec_arm64_functions.c 56
-rwxr-xr-x src/dynarec/dynarec_arm64_functions.h 24
-rwxr-xr-x src/dynarec/dynarec_arm64_helper.c 64
-rwxr-xr-x src/dynarec/dynarec_arm64_private.h 3
6 files changed, 83 insertions, 82 deletions
diff --git a/src/dynarec/arm64_epilog.S b/src/dynarec/arm64_epilog.S
index eef0d906..4b73803a 100755
--- a/src/dynarec/arm64_epilog.S
+++ b/src/dynarec/arm64_epilog.S
@@ -25,7 +25,11 @@ arm64_epilog:
     ldp     x23, x24, [sp, (8 * 4)]
     ldp     x25, x26, [sp, (8 * 6)]
     ldr     x27, [sp, (8 * 8)]
-    add     sp,  sp, (8 * 10)
+    ldp     d8, d9,   [sp, (8 *10)]
+    ldp     d10, d11, [sp, (8 *12)]
+    ldp     d12, d13, [sp, (8 *14)]
+    ldp     d14, d15, [sp, (8 *16)]
+    add     sp,  sp, (8 * 18)
     ldp     lr, fp, [sp], 16  // saved lr
     //end, return...
     ret
@@ -40,7 +44,11 @@ arm64_epilog_fast:
     ldp     x23, x24, [sp, (8 * 4)]
     ldp     x25, x26, [sp, (8 * 6)]
     ldr     x27, [sp, (8 * 8)]
-    add     sp,  sp, (8 * 10)
+    ldp     d8, d9,   [sp, (8 *10)]
+    ldp     d10, d11, [sp, (8 *12)]
+    ldp     d12, d13, [sp, (8 *14)]
+    ldp     d14, d15, [sp, (8 *16)]
+    add     sp,  sp, (8 * 18)
     ldp     lr, fp, [sp], 16  // saved lr
     //end, return...
     ret
diff --git a/src/dynarec/arm64_prolog.S b/src/dynarec/arm64_prolog.S
index a583c081..21961e5b 100755
--- a/src/dynarec/arm64_prolog.S
+++ b/src/dynarec/arm64_prolog.S
@@ -10,12 +10,16 @@
 arm64_prolog:
     //save all 18 used register
     stp     lr, fp, [sp, -16]!  // save lr
-    sub     sp,  sp, (8 * 10)
+    sub     sp,  sp, (8 * 18)
     stp     x19, x20, [sp, (8 * 0)]
     stp     x21, x22, [sp, (8 * 2)]
     stp     x23, x24, [sp, (8 * 4)]
     stp     x25, x26, [sp, (8 * 6)]
     str     x27, [sp, (8 * 8)]
+    stp     d8, d9,   [sp, (8 *10)]
+    stp     d10, d11, [sp, (8 *12)]
+    stp     d12, d13, [sp, (8 *14)]
+    stp     d14, d15, [sp, (8 *16)]
     //vpush     {d8-d15}    // save NEON regs?
     //setup emu -> register
     ldp     x10, x11, [x0, (8 *  0)]
diff --git a/src/dynarec/dynarec_arm64_functions.c b/src/dynarec/dynarec_arm64_functions.c
index 9ad74131..b6bb5e3c 100755
--- a/src/dynarec/dynarec_arm64_functions.c
+++ b/src/dynarec/dynarec_arm64_functions.c
@@ -183,68 +183,68 @@ void arm_fprem1(x64emu_t* emu)
     emu->sw.f.F87_C3 = ((tmp32s>>1)&1);
     emu->sw.f.F87_C1 = ((tmp32s>>2)&1);
 }
-
+#define XMM0    0
+#define XMM8    16
+#define X870    8
+#define EMM0    8
+#define SCRATCH0    24
 
 // Get a FPU single scratch reg
 int fpu_get_scratch_single(dynarec_arm_t* dyn)
 {
-    return dyn->fpu_scratch++;  // return an Sx
+    return SCRATCH0 + dyn->fpu_scratch++;  // return an Sx
 }
 // Get a FPU double scratch reg
 int fpu_get_scratch_double(dynarec_arm_t* dyn)
 {
-    return dyn->fpu_scratch++;  // return an Dx (same as Sx)
+    return SCRATCH0 + dyn->fpu_scratch++;  // return an Dx (same as Sx)
 }
 // Get a FPU quad scratch reg
 int fpu_get_scratch_quad(dynarec_arm_t* dyn)
 {
-    return dyn->fpu_scratch++;  // return an Qx (same as Dx or Sx)
+    return SCRATCH0 + dyn->fpu_scratch++;  // return an Qx (same as Dx or Sx)
 }
 // Reset scratch regs counter
 void fpu_reset_scratch(dynarec_arm_t* dyn)
 {
     dyn->fpu_scratch = 0;
-    if(dyn->fpu_extra_qscratch) {
-        fpu_free_reg_quad(dyn, dyn->fpu_extra_qscratch);
-        dyn->fpu_extra_qscratch = 0;
-    }
 }
-#define FPUFIRST    8
-// Get a FPU double reg
-int fpu_get_reg_double(dynarec_arm_t* dyn)
+// Get a x87 double reg
+int fpu_get_reg_x87(dynarec_arm_t* dyn)
 {
-    // TODO: check upper limit?
-    int i=0;
+    int i=X870;
     while (dyn->fpuused[i]) ++i;
     dyn->fpuused[i] = 1;
-    return i+FPUFIRST; // return a Dx
+    return i; // return a Dx
 }
 // Free a FPU double reg
-void fpu_free_reg_double(dynarec_arm_t* dyn, int reg)
+void fpu_free_reg(dynarec_arm_t* dyn, int reg)
 {
     // TODO: check upper limit?
-    int i=reg-FPUFIRST;
-    dyn->fpuused[i] = 0;
+    dyn->fpuused[reg] = 0;
 }
-// Get a FPU quad reg
-int fpu_get_reg_quad(dynarec_arm_t* dyn)
+// Get an MMX double reg
+int fpu_get_reg_emm(dynarec_arm_t* dyn, int emm)
 {
-    int i=0;
-    while (dyn->fpuused[i]) ++i;
-    dyn->fpuused[i] = 1;
-    return i+FPUFIRST; // return a Qx, it's the same as Dx on aarch64
+    dyn->fpuused[EMM0 + emm] = 1;
+    return EMM0 + emm;
 }
-// Free a FPU quad reg
-void fpu_free_reg_quad(dynarec_arm_t* dyn, int reg)
+// Get an XMM quad reg
+int fpu_get_reg_xmm(dynarec_arm_t* dyn, int xmm)
 {
-    int i=reg-FPUFIRST;
-    dyn->fpuused[i] = 0;
+    if(xmm>7) {
+        dyn->fpuused[XMM8 + xmm - 8] = 1;
+        return XMM8 + xmm - 8;
+    } else {
+        dyn->fpuused[XMM0 + xmm] = 1;
+        return XMM0 + xmm;
+    }
 }
 // Reset fpu regs counter
 void fpu_reset_reg(dynarec_arm_t* dyn)
 {
     dyn->fpu_reg = 0;
-    for (int i=0; i<24; ++i)
+    for (int i=0; i<32; ++i)
         dyn->fpuused[i]=0;
 }
 
diff --git a/src/dynarec/dynarec_arm64_functions.h b/src/dynarec/dynarec_arm64_functions.h
index d932aa4d..0d6a02a5 100755
--- a/src/dynarec/dynarec_arm64_functions.h
+++ b/src/dynarec/dynarec_arm64_functions.h
@@ -31,22 +31,18 @@ void arm_fprem1(x64emu_t* emu);
 
 void arm_ud(x64emu_t* emu);
 
-// Get an FPU single scratch reg
-int fpu_get_scratch_single(dynarec_arm_t* dyn);
-// Get an FPU double scratch reg
-int fpu_get_scratch_double(dynarec_arm_t* dyn);
-// Get an FPU quad scratch reg
-int fpu_get_scratch_quad(dynarec_arm_t* dyn);
+// Get an FPU scratch reg
+int fpu_get_scratch(dynarec_arm_t* dyn);
 // Reset scratch regs counter
 void fpu_reset_scratch(dynarec_arm_t* dyn);
-// Get an FPU double reg
-int fpu_get_reg_double(dynarec_arm_t* dyn);
-// Free a FPU double reg
-void fpu_free_reg_double(dynarec_arm_t* dyn, int reg);
-// Get an FPU quad reg
-int fpu_get_reg_quad(dynarec_arm_t* dyn);
-// Free a FPU quad reg
-void fpu_free_reg_quad(dynarec_arm_t* dyn, int reg);
+// Get an x87 double reg
+int fpu_get_reg_x87(dynarec_arm_t* dyn);
+// Get an MMX double reg
+int fpu_get_reg_emm(dynarec_arm_t* dyn, int emm);
+// Get an XMM quad reg
+int fpu_get_reg_xmm(dynarec_arm_t* dyn, int xmm);
+// Free a FPU/MMX/XMM reg
+void fpu_free_reg(dynarec_arm_t* dyn, int reg);
 // Reset fpu regs counter
 void fpu_reset_reg(dynarec_arm_t* dyn);
 
diff --git a/src/dynarec/dynarec_arm64_helper.c b/src/dynarec/dynarec_arm64_helper.c
index efcc1bd4..ae106539 100755
--- a/src/dynarec/dynarec_arm64_helper.c
+++ b/src/dynarec/dynarec_arm64_helper.c
@@ -461,7 +461,7 @@ int x87_do_push(dynarec_arm_t* dyn, int ninst)
             ++dyn->x87cache[i];
         else if(ret==-1) {
             dyn->x87cache[i] = 0;
-            ret=dyn->x87reg[i]=fpu_get_reg_double(dyn);
+            ret=dyn->x87reg[i]=fpu_get_reg_x87(dyn);
         }
     return ret;
 #else
@@ -489,7 +489,7 @@ void x87_do_pop(dynarec_arm_t* dyn, int ninst)
         if(dyn->x87cache[i]!=-1) {
             --dyn->x87cache[i];
             if(dyn->x87cache[i]==-1) {
-                fpu_free_reg_double(dyn, dyn->x87reg[i]);
+                fpu_free_reg(dyn, dyn->x87reg[i]);
                 dyn->x87reg[i] = -1;
             }
         }
@@ -555,7 +555,7 @@ static void x87_purgecache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3
                 ADDw_U12(s3, s2, dyn->x87cache[i]);
                 ANDw_mask(s3, s3, 0b011111, 1);    // (emu->top + st)&7
                 VSTR64_REG_LSL3(dyn->x87reg[i], s1, s3);
-                fpu_free_reg_double(dyn, dyn->x87reg[i]);
+                fpu_free_reg(dyn, dyn->x87reg[i]);
                 dyn->x87reg[i] = -1;
                 dyn->x87cache[i] = -1;
             }
@@ -604,7 +604,7 @@ int x87_get_cache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st)
             ret = i;
     // found, setup and grab the value
     dyn->x87cache[ret] = st;
-    dyn->x87reg[ret] = fpu_get_reg_double(dyn);
+    dyn->x87reg[ret] = fpu_get_reg_x87(dyn);
     ADDx_U12(s1, xEmu, offsetof(x64emu_t, mmx87));
     LDRw_U12(s2, xEmu, offsetof(x64emu_t, top));
     int a = st - dyn->x87stack;
@@ -683,7 +683,7 @@ void x87_forget(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st)
     VLDR64_REG_LSL3(dyn->x87reg[ret], s1, s2);
     MESSAGE(LOG_DUMP, "\t--------x87 Cache for ST%d\n", st);
     // and forget that cache
-    fpu_free_reg_double(dyn, dyn->x87reg[ret]);
+    fpu_free_reg(dyn, dyn->x87reg[ret]);
     dyn->x87cache[ret] = -1;
     dyn->x87reg[ret] = -1;
 #endif
@@ -720,7 +720,7 @@ void x87_reget_st(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st)
             ret = i;
     // found, setup and grab the value
     dyn->x87cache[ret] = st;
-    dyn->x87reg[ret] = fpu_get_reg_double(dyn);
+    dyn->x87reg[ret] = fpu_get_reg_x87(dyn);
     ADDx_U12(s1, xEmu, offsetof(x64emu_t, mmx87));
     LDRw_U12(s2, xEmu, offsetof(x64emu_t, top));
     int a = st - dyn->x87stack;
@@ -785,7 +785,7 @@ int mmx_get_reg(dynarec_arm_t* dyn, int ninst, int s1, int a)
 #if STEP > 1
     if(dyn->mmxcache[a]!=-1)
         return dyn->mmxcache[a];
-    int ret = dyn->mmxcache[a] = fpu_get_reg_double(dyn);
+    int ret = dyn->mmxcache[a] = fpu_get_reg_emm(dyn, a);
     VLDR64_U12(ret, xEmu, offsetof(x64emu_t, mmx87[a]));
     return ret;
 #else
@@ -798,7 +798,7 @@ int mmx_get_reg_empty(dynarec_arm_t* dyn, int ninst, int s1, int a)
 #if STEP > 1
     if(dyn->mmxcache[a]!=-1)
         return dyn->mmxcache[a];
-    int ret = dyn->mmxcache[a] = fpu_get_reg_double(dyn);
+    int ret = dyn->mmxcache[a] = fpu_get_reg_emm(dyn, a);
     return ret;
 #else
     return 0;
@@ -816,7 +816,7 @@ static void mmx_purgecache(dynarec_arm_t* dyn, int ninst, int s1)
                 ++old;
             }
             VSTR64_U12(dyn->mmxcache[i], xEmu, offsetof(x64emu_t, mmx87[i]));
-            fpu_free_reg_double(dyn, dyn->mmxcache[i]);
+            fpu_free_reg(dyn, dyn->mmxcache[i]);
             dyn->mmxcache[i] = -1;
         }
     if(old!=-1) {
@@ -851,7 +851,7 @@ int sse_get_reg(dynarec_arm_t* dyn, int ninst, int s1, int a)
 #if STEP > 1
     if(dyn->ssecache[a]!=-1)
         return dyn->ssecache[a];
-    int ret = dyn->ssecache[a] = fpu_get_reg_quad(dyn);
+    int ret = dyn->ssecache[a] = fpu_get_reg_xmm(dyn, a);
     VLDR128_U12(ret, xEmu, offsetof(x64emu_t, xmm[a]));
     return ret;
 #else
@@ -864,7 +864,7 @@ int sse_get_reg_empty(dynarec_arm_t* dyn, int ninst, int s1, int a)
 #if STEP > 1
     if(dyn->ssecache[a]!=-1)
         return dyn->ssecache[a];
-    int ret = dyn->ssecache[a] = fpu_get_reg_quad(dyn);
+    int ret = dyn->ssecache[a] = fpu_get_reg_xmm(dyn, a);
     return ret;
 #else
     return 0;
@@ -882,7 +882,7 @@ void sse_purge07cache(dynarec_arm_t* dyn, int ninst, int s1)
                 ++old;
             }
             VSTR128_U12(dyn->ssecache[i], xEmu, offsetof(x64emu_t, xmm[i]));
-            fpu_free_reg_quad(dyn, dyn->ssecache[i]);
+            fpu_free_reg(dyn, dyn->ssecache[i]);
             dyn->ssecache[i] = -1;
         }
     if(old!=-1) {
@@ -903,7 +903,7 @@ static void sse_purgecache(dynarec_arm_t* dyn, int ninst, int s1)
                 ++old;
             }
             VSTR128_U12(dyn->ssecache[i], xEmu, offsetof(x64emu_t, xmm[i]));
-            fpu_free_reg_quad(dyn, dyn->ssecache[i]);
+            fpu_free_reg(dyn, dyn->ssecache[i]);
             dyn->ssecache[i] = -1;
         }
     if(old!=-1) {
@@ -926,44 +926,38 @@ static void sse_reflectcache(dynarec_arm_t* dyn, int ninst, int s1)
 void fpu_pushcache(dynarec_arm_t* dyn, int ninst, int s1)
 {
 #if STEP > 1
-    // only need to push 16-31...
+    // only SSE regs need to be pushed back to xEmu
     int n=0;
-    for (int i=8; i<32; i++)
-        if(dyn->fpuused[i-8])
+    for (int i=0; i<16; i++)
+        if(dyn->ssecache[i]!=-1)
             ++n;
     if(!n)
         return;
-    MESSAGE(LOG_DUMP, "\tPush FPU Cache (%d)------\n", n);
-    SUBx_U12(xSP, xSP, n*16);
-    MOV_frmSP(s1);
-    for (int i=8; i<32; ++i) {
-        if(dyn->fpuused[i-8]) {
-            VSTR128_S9_postindex(i, s1, 16);
+    MESSAGE(LOG_DUMP, "\tPush XMM Cache (%d)------\n", n);
+    for (int i=0; i<16; ++i)
+        if(dyn->ssecache[i]!=-1) {
+            VSTR128_U12(dyn->ssecache[i], xEmu, offsetof(x64emu_t, xmm[i]));
         }
-    }
-    MESSAGE(LOG_DUMP, "\t------- Push FPU Cache (%d)\n", n);
+    MESSAGE(LOG_DUMP, "\t------- Push XMM Cache (%d)\n", n);
 #endif
 }
 
 void fpu_popcache(dynarec_arm_t* dyn, int ninst, int s1)
 {
 #if STEP > 1
-    // we need to push 8-31 (because on 8..15 only low part is preserved)
+    // only SSE regs need to be popped back from xEmu
     int n=0;
-    for (int i=8; i<32; i++)
-        if(dyn->fpuused[i-8])
+    for (int i=16; i<32; i++)
+        if(dyn->ssecache[i]!=-1)
             ++n;
     if(!n)
         return;
-    MESSAGE(LOG_DUMP, "\tPop FPU Cache (%d)------\n", n);
-    MOV_frmSP(s1);
-    for (int i=8; i<32; ++i) {
-        if(dyn->fpuused[i-8]) {
-            VLDR128_S9_postindex(i, s1, 16);
+    MESSAGE(LOG_DUMP, "\tPop XMM Cache (%d)------\n", n);
+    for (int i=0; i<16; ++i)
+        if(dyn->ssecache[i]!=-1) {
+            VLDR128_U12(dyn->ssecache[i], xEmu, offsetof(x64emu_t, xmm[i]));
         }
-    }
-    ADDx_U12(xSP, xSP, n*16);
-    MESSAGE(LOG_DUMP, "\t------- Pop FPU Cache (%d)\n", n);
+    MESSAGE(LOG_DUMP, "\t------- Pop XMM Cache (%d)\n", n);
 #endif
 }
 
diff --git a/src/dynarec/dynarec_arm64_private.h b/src/dynarec/dynarec_arm64_private.h
index 413e7edf..9cadbc63 100755
--- a/src/dynarec/dynarec_arm64_private.h
+++ b/src/dynarec/dynarec_arm64_private.h
@@ -35,10 +35,9 @@ typedef struct dynarec_arm_s {
     int8_t              x87reg[8];  // reg used for x87cache entry
     int8_t              mmxcache[8];// cache status for the 8 MMX registers
     int8_t              ssecache[16];// cache status for the 16 SSE(2) registers
-    int8_t              fpuused[24];// all 8..31 Q reg from fpu, used by x87, sse and mmx
+    int8_t              fpuused[32];// all 8..31 Q reg from fpu, used by x87, sse and mmx
     int                 x87stack;   // cache stack counter
     int                 fpu_scratch;// scratch counter
-    int                 fpu_extra_qscratch; // some opcode need an extra quad scratch register
     int                 fpu_reg;    // x87/sse/mmx reg counter
     int                 dfnone;     // if defered flags is already set to df_none
     uint64_t            *table64;   // table of 64bits value