diff options
| author | ptitSeb <sebastien.chev@gmail.com> | 2023-06-25 16:58:43 +0200 |
|---|---|---|
| committer | ptitSeb <sebastien.chev@gmail.com> | 2023-06-25 16:58:43 +0200 |
| commit | 735d7ab1b4bead627d2a380913864be49c214705 (patch) | |
| tree | 002cacbb04db542e58c867aa79c67808c8063100 | |
| parent | 3c9b556ca80c86938b9a042ccfc48f7e06e80e95 (diff) | |
| download | box64-735d7ab1b4bead627d2a380913864be49c214705.tar.gz box64-735d7ab1b4bead627d2a380913864be49c214705.zip | |
[ARM64_DYNAREC] Improved handling of FILD/FISTP i64 sequence, important for 32-bit processes (for #860)
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_df.c | 54 | ||||
| -rwxr-xr-x | src/dynarec/arm64/dynarec_arm64_functions.c | 61 | ||||
| -rwxr-xr-x | src/dynarec/arm64/dynarec_arm64_functions.h | 8 | ||||
| -rwxr-xr-x | src/dynarec/arm64/dynarec_arm64_helper.c | 69 | ||||
| -rwxr-xr-x | src/dynarec/arm64/dynarec_arm64_helper.h | 3 | ||||
| -rwxr-xr-x | src/dynarec/arm64/dynarec_arm64_private.h | 15 |
6 files changed, 153 insertions, 57 deletions
diff --git a/src/dynarec/arm64/dynarec_arm64_df.c b/src/dynarec/arm64/dynarec_arm64_df.c index 522fe8f3..3e99ceae 100644 --- a/src/dynarec/arm64/dynarec_arm64_df.c +++ b/src/dynarec/arm64/dynarec_arm64_df.c @@ -287,10 +287,12 @@ uintptr_t dynarec64_DF(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin break; case 5: INST_NAME("FILD ST0, i64"); - v1 = x87_do_push(dyn, ninst, x1, NEON_CACHE_ST_D); + v1 = x87_do_push(dyn, ninst, x1, NEON_CACHE_ST_I64); addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, &unscaled, 0xfff<<3, 7, rex, NULL, 0, 0); - LDx(x1, wback, fixedaddress); - SCVTFDx(v1, x1); + VLD64(v1, wback, fixedaddress); + if(!ST_IS_I64(0)) { + SCVTFDD(v1, v1); + } break; case 6: INST_NAME("FBSTP tbytes, ST0"); @@ -302,29 +304,35 @@ uintptr_t dynarec64_DF(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin break; case 7: INST_NAME("FISTP i64, ST0"); - v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D); - u8 = x87_setround(dyn, ninst, x1, x2, x4); + v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_I64); + if(!ST_IS_I64(0)) { + u8 = x87_setround(dyn, ninst, x1, x2, x4); + } addr = geted(dyn, addr, ninst, nextop, &wback, x2, &fixedaddress, &unscaled, 0xfff<<3, 7, rex, NULL, 0, 0); ed = x1; s0 = fpu_get_scratch(dyn); - #if 0 - FRINT64XD(s0, v1); - VFCVTZSd(s0, s0); - VSTR64_U12(s0, wback, fixedaddress); - #else - MRS_fpsr(x5); - BFCw(x5, FPSR_IOC, 1); // reset IOC bit - MSR_fpsr(x5); - FRINTXD(s0, v1); - VFCVTZSd(s0, s0); - VST64(s0, wback, fixedaddress); - MRS_fpsr(x5); // get back FPSR to check the IOC bit - TBZ_MARK3(x5, FPSR_IOC); - ORRx_mask(x5, xZR, 1, 1, 0); //0x8000000000000000 - STx(x5, wback, fixedaddress); - MARK3; - #endif - x87_restoreround(dyn, ninst, u8); + if(ST_IS_I64(0)) { + VST64(v1, wback, fixedaddress); + } else { + #if 0 + FRINT64XD(s0, v1); + VFCVTZSd(s0, s0); + VSTR64_U12(s0, wback, fixedaddress); + #else + MRS_fpsr(x5); + BFCw(x5, FPSR_IOC, 1); // reset IOC bit + MSR_fpsr(x5); + FRINTXD(s0, v1); + 
VFCVTZSd(s0, s0); + VST64(s0, wback, fixedaddress); + MRS_fpsr(x5); // get back FPSR to check the IOC bit + TBZ_MARK3(x5, FPSR_IOC); + ORRx_mask(x5, xZR, 1, 1, 0); //0x8000000000000000 + STx(x5, wback, fixedaddress); + MARK3; + #endif + x87_restoreround(dyn, ninst, u8); + } x87_do_pop(dyn, ninst, x3); break; default: diff --git a/src/dynarec/arm64/dynarec_arm64_functions.c b/src/dynarec/arm64/dynarec_arm64_functions.c index 42dfd09e..d2963b55 100755 --- a/src/dynarec/arm64/dynarec_arm64_functions.c +++ b/src/dynarec/arm64/dynarec_arm64_functions.c @@ -58,7 +58,7 @@ void fpu_free_reg(dynarec_arm_t* dyn, int reg) { // TODO: check upper limit? dyn->n.fpuused[reg] = 0; - if(dyn->n.neoncache[reg].t!=NEON_CACHE_ST_F && dyn->n.neoncache[reg].t!=NEON_CACHE_ST_D) + if(dyn->n.neoncache[reg].t!=NEON_CACHE_ST_F && dyn->n.neoncache[reg].t!=NEON_CACHE_ST_D && dyn->n.neoncache[reg].t!=NEON_CACHE_ST_I64) dyn->n.neoncache[reg].v = 0; } // Get an MMX double reg @@ -106,7 +106,8 @@ int neoncache_get_st(dynarec_arm_t* dyn, int ninst, int a) } for(int i=0; i<24; ++i) if((dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_F - || dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_D) + || dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_D + || dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_I64) && dyn->insts[ninst].n.neoncache[i].n==a) return dyn->insts[ninst].n.neoncache[i].t; // not in the cache yet, so will be fetched... @@ -120,7 +121,8 @@ int neoncache_get_current_st(dynarec_arm_t* dyn, int ninst, int a) return NEON_CACHE_ST_D; for(int i=0; i<24; ++i) if((dyn->n.neoncache[i].t==NEON_CACHE_ST_F - || dyn->n.neoncache[i].t==NEON_CACHE_ST_D) + || dyn->n.neoncache[i].t==NEON_CACHE_ST_D + || dyn->n.neoncache[i].t==NEON_CACHE_ST_I64) && dyn->n.neoncache[i].n==a) return dyn->n.neoncache[i].t; // not in the cache yet, so will be fetched... 
@@ -138,6 +140,17 @@ int neoncache_get_st_f(dynarec_arm_t* dyn, int ninst, int a) return i; return -1; } +int neoncache_get_st_f_i64(dynarec_arm_t* dyn, int ninst, int a) +{ + /*if(a+dyn->insts[ninst].n.stack_next-st<0) + // The STx has been pushed at the end of instructon, so stop going back + return -1;*/ + for(int i=0; i<24; ++i) + if((dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_I64 || dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_F) + && dyn->insts[ninst].n.neoncache[i].n==a) + return i; + return -1; +} int neoncache_get_st_f_noback(dynarec_arm_t* dyn, int ninst, int a) { for(int i=0; i<24; ++i) @@ -146,6 +159,14 @@ int neoncache_get_st_f_noback(dynarec_arm_t* dyn, int ninst, int a) return i; return -1; } +int neoncache_get_st_f_i64_noback(dynarec_arm_t* dyn, int ninst, int a) +{ + for(int i=0; i<24; ++i) + if((dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_I64 || dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_F) + && dyn->insts[ninst].n.neoncache[i].n==a) + return i; + return -1; +} int neoncache_get_current_st_f(dynarec_arm_t* dyn, int a) { for(int i=0; i<24; ++i) @@ -154,6 +175,14 @@ int neoncache_get_current_st_f(dynarec_arm_t* dyn, int a) return i; return -1; } +int neoncache_get_current_st_f_i64(dynarec_arm_t* dyn, int a) +{ + for(int i=0; i<24; ++i) + if((dyn->n.neoncache[i].t==NEON_CACHE_ST_I64 || dyn->n.neoncache[i].t==NEON_CACHE_ST_F) + && dyn->n.neoncache[i].n==a) + return i; + return -1; +} static void neoncache_promote_double_forward(dynarec_arm_t* dyn, int ninst, int maxinst, int a); static void neoncache_promote_double_internal(dynarec_arm_t* dyn, int ninst, int maxinst, int a); static void neoncache_promote_double_combined(dynarec_arm_t* dyn, int ninst, int maxinst, int a) @@ -163,7 +192,7 @@ static void neoncache_promote_double_combined(dynarec_arm_t* dyn, int ninst, int a = dyn->insts[ninst].n.combined2; } else a = dyn->insts[ninst].n.combined1; - int i = neoncache_get_st_f_noback(dyn, ninst, a); + int i = 
neoncache_get_st_f_i64_noback(dyn, ninst, a); //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "neoncache_promote_double_combined, ninst=%d combined%c %d i=%d (stack:%d/%d)\n", ninst, (a == dyn->insts[ninst].n.combined2)?'2':'1', a ,i, dyn->insts[ninst].n.stack_push, -dyn->insts[ninst].n.stack_pop); if(i>=0) { dyn->insts[ninst].n.neoncache[i].t = NEON_CACHE_ST_D; @@ -182,7 +211,7 @@ static void neoncache_promote_double_internal(dynarec_arm_t* dyn, int ninst, int return; while(ninst>=0) { a+=dyn->insts[ninst].n.stack_pop; // adjust Stack depth: add pop'd ST (going backward) - int i = neoncache_get_st_f(dyn, ninst, a); + int i = neoncache_get_st_f_i64(dyn, ninst, a); //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "neoncache_promote_double_internal, ninst=%d, a=%d st=%d:%d, i=%d\n", ninst, a, dyn->insts[ninst].n.stack, dyn->insts[ninst].n.stack_next, i); if(i<0) return; dyn->insts[ninst].n.neoncache[i].t = NEON_CACHE_ST_D; @@ -217,7 +246,7 @@ static void neoncache_promote_double_forward(dynarec_arm_t* dyn, int ninst, int else if (a==dyn->insts[ninst].n.combined2) a = dyn->insts[ninst].n.combined1; } - int i = neoncache_get_st_f_noback(dyn, ninst, a); + int i = neoncache_get_st_f_i64_noback(dyn, ninst, a); //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "neoncache_promote_double_forward, ninst=%d, a=%d st=%d:%d(%d/%d), i=%d\n", ninst, a, dyn->insts[ninst].n.stack, dyn->insts[ninst].n.stack_next, dyn->insts[ninst].n.stack_push, -dyn->insts[ninst].n.stack_pop, i); if(i<0) return; dyn->insts[ninst].n.neoncache[i].t = NEON_CACHE_ST_D; @@ -238,7 +267,7 @@ static void neoncache_promote_double_forward(dynarec_arm_t* dyn, int ninst, int void neoncache_promote_double(dynarec_arm_t* dyn, int ninst, int a) { - int i = neoncache_get_current_st_f(dyn, a); + int i = neoncache_get_current_st_f_i64(dyn, a); //if(box64_dynarec_dump) dynarec_log(LOG_NONE, "neoncache_promote_double, ninst=%d a=%d st=%d i=%d\n", ninst, a, dyn->n.stack, i); if(i<0) return; dyn->n.neoncache[i].t = 
NEON_CACHE_ST_D; @@ -271,6 +300,9 @@ int neoncache_combine_st(dynarec_arm_t* dyn, int ninst, int a, int b) if( neoncache_get_current_st(dyn, ninst, a)==NEON_CACHE_ST_F && neoncache_get_current_st(dyn, ninst, b)==NEON_CACHE_ST_F ) return NEON_CACHE_ST_F; + if( neoncache_get_current_st(dyn, ninst, a)==NEON_CACHE_ST_I64 + && neoncache_get_current_st(dyn, ninst, b)==NEON_CACHE_ST_I64 ) + return NEON_CACHE_ST_I64; return NEON_CACHE_ST_D; } @@ -281,7 +313,9 @@ static int isCacheEmpty(dynarec_native_t* dyn, int ninst) { for(int i=0; i<24; ++i) if(dyn->insts[ninst].n.neoncache[i].v) { // there is something at ninst for i if(!( - (dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_F || dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_D) + (dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_F + || dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_D + || dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_I64) && dyn->insts[ninst].n.neoncache[i].n<dyn->insts[ninst].n.stack_pop)) return 0; } @@ -304,7 +338,9 @@ int fpuCacheNeedsTransform(dynarec_arm_t* dyn, int ninst) { for(int i=0; i<24 && !ret; ++i) if(dyn->insts[ninst].n.neoncache[i].v) { // there is something at ninst for i if(!( - (dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_F || dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_D) + (dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_F + || dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_D + || dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_I64) && dyn->insts[ninst].n.neoncache[i].n<dyn->insts[ninst].n.stack_pop)) ret = 1; } @@ -343,7 +379,7 @@ void neoncacheUnwind(neoncache_t* cache) int a = -1; int b = -1; for(int j=0; j<24 && ((a==-1) || (b==-1)); ++j) - if((cache->neoncache[j].t == NEON_CACHE_ST_D || cache->neoncache[j].t == NEON_CACHE_ST_F)) { + if((cache->neoncache[j].t == NEON_CACHE_ST_D || cache->neoncache[j].t == NEON_CACHE_ST_F || cache->neoncache[j].t == NEON_CACHE_ST_I64)) { if(cache->neoncache[j].n == cache->combined1) a = j; else 
if(cache->neoncache[j].n == cache->combined2) @@ -367,7 +403,7 @@ void neoncacheUnwind(neoncache_t* cache) if(cache->stack_push) { // unpush for(int j=0; j<24; ++j) { - if((cache->neoncache[j].t == NEON_CACHE_ST_D || cache->neoncache[j].t == NEON_CACHE_ST_F)) { + if((cache->neoncache[j].t == NEON_CACHE_ST_D || cache->neoncache[j].t == NEON_CACHE_ST_F || cache->neoncache[j].t == NEON_CACHE_ST_I64)) { if(cache->neoncache[j].n<cache->stack_push) cache->neoncache[j].v = 0; else @@ -412,6 +448,7 @@ void neoncacheUnwind(neoncache_t* cache) break; case NEON_CACHE_ST_F: case NEON_CACHE_ST_D: + case NEON_CACHE_ST_I64: cache->x87cache[x87reg] = cache->neoncache[i].n; cache->x87reg[x87reg] = i; ++x87reg; @@ -477,6 +514,7 @@ const char* getCacheName(int t, int n) switch(t) { case NEON_CACHE_ST_D: sprintf(buff, "ST%d", n); break; case NEON_CACHE_ST_F: sprintf(buff, "st%d", n); break; + case NEON_CACHE_ST_I64: sprintf(buff, "STi%d", n); break; case NEON_CACHE_MM: sprintf(buff, "MM%d", n); break; case NEON_CACHE_XMMW: sprintf(buff, "XMM%d", n); break; case NEON_CACHE_XMMR: sprintf(buff, "xmm%d", n); break; @@ -521,6 +559,7 @@ void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name, rex_t r switch(dyn->insts[ninst].n.neoncache[ii].t) { case NEON_CACHE_ST_D: dynarec_log(LOG_NONE, " D%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break; case NEON_CACHE_ST_F: dynarec_log(LOG_NONE, " S%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break; + case NEON_CACHE_ST_I64: dynarec_log(LOG_NONE, " D%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break; case NEON_CACHE_MM: dynarec_log(LOG_NONE, " D%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break; case NEON_CACHE_XMMW: dynarec_log(LOG_NONE, " Q%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, 
dyn->insts[ninst].n.neoncache[ii].n)); break; case NEON_CACHE_XMMR: dynarec_log(LOG_NONE, " Q%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break; diff --git a/src/dynarec/arm64/dynarec_arm64_functions.h b/src/dynarec/arm64/dynarec_arm64_functions.h index 950345fa..201cfcea 100755 --- a/src/dynarec/arm64/dynarec_arm64_functions.h +++ b/src/dynarec/arm64/dynarec_arm64_functions.h @@ -21,12 +21,16 @@ void fpu_reset_reg(dynarec_arm_t* dyn); // ---- Neon cache functions // Get type for STx int neoncache_get_st(dynarec_arm_t* dyn, int ninst, int a); -// Get if STx is FLOAT or DOUBLE +// Get if STx is FLOAT int neoncache_get_st_f(dynarec_arm_t* dyn, int ninst, int a); +// Get if STx is FLOAT or I64 +int neoncache_get_st_f_i64(dynarec_arm_t* dyn, int ninst, int a); // Get actual type for STx int neoncache_get_current_st(dynarec_arm_t* dyn, int ninst, int a); -// Get actual STx is FLOAT or DOUBLE +// Get actual STx is FLOAT int neoncache_get_current_st_f(dynarec_arm_t* dyn, int a); +// Get actual STx is FLOAT or I64 +int neoncache_get_current_st_f_i64(dynarec_arm_t* dyn, int a); // Back-propagate a change float->double void neoncache_promote_double(dynarec_arm_t* dyn, int ninst, int a); // Combine and propagate if needed (pass 1 only) diff --git a/src/dynarec/arm64/dynarec_arm64_helper.c b/src/dynarec/arm64/dynarec_arm64_helper.c index e1b93026..153f82a4 100755 --- a/src/dynarec/arm64/dynarec_arm64_helper.c +++ b/src/dynarec/arm64/dynarec_arm64_helper.c @@ -846,7 +846,9 @@ static void x87_reset(dynarec_arm_t* dyn) dyn->n.swapped = 0; dyn->n.barrier = 0; for(int i=0; i<24; ++i) - if(dyn->n.neoncache[i].t == NEON_CACHE_ST_F || dyn->n.neoncache[i].t == NEON_CACHE_ST_D) + if(dyn->n.neoncache[i].t == NEON_CACHE_ST_F + || dyn->n.neoncache[i].t == NEON_CACHE_ST_D + || dyn->n.neoncache[i].t == NEON_CACHE_ST_I64) dyn->n.neoncache[i].v = 0; } @@ -907,7 +909,9 @@ int x87_do_push(dynarec_arm_t* dyn, int ninst, int s1, int t) 
dyn->n.stack_push+=1; // move all regs in cache, and find a free one for(int j=0; j<24; ++j) - if((dyn->n.neoncache[j].t == NEON_CACHE_ST_D) || (dyn->n.neoncache[j].t == NEON_CACHE_ST_F)) + if((dyn->n.neoncache[j].t == NEON_CACHE_ST_D) + ||(dyn->n.neoncache[j].t == NEON_CACHE_ST_F) + ||(dyn->n.neoncache[j].t == NEON_CACHE_ST_I64)) ++dyn->n.neoncache[j].n; int ret = -1; for(int i=0; i<8; ++i) @@ -916,13 +920,7 @@ int x87_do_push(dynarec_arm_t* dyn, int ninst, int s1, int t) else if(ret==-1) { dyn->n.x87cache[i] = 0; ret=dyn->n.x87reg[i]=fpu_get_reg_x87(dyn, t, 0); - #if STEP == 1 - // need to check if reg is compatible with float - if((ret>15) && (t == NEON_CACHE_ST_F)) - dyn->n.neoncache[ret].t = NEON_CACHE_ST_D; - #else dyn->n.neoncache[ret].t = X87_ST0; - #endif } return ret; } @@ -936,7 +934,9 @@ void x87_do_push_empty(dynarec_arm_t* dyn, int ninst, int s1) dyn->n.stack_push+=1; // move all regs in cache for(int j=0; j<24; ++j) - if((dyn->n.neoncache[j].t == NEON_CACHE_ST_D) || (dyn->n.neoncache[j].t == NEON_CACHE_ST_F)) + if((dyn->n.neoncache[j].t == NEON_CACHE_ST_D) + ||(dyn->n.neoncache[j].t == NEON_CACHE_ST_F) + ||(dyn->n.neoncache[j].t == NEON_CACHE_ST_I64)) ++dyn->n.neoncache[j].n; for(int i=0; i<8; ++i) if(dyn->n.x87cache[i]!=-1) @@ -1133,7 +1133,7 @@ int x87_get_current_cache(dynarec_arm_t* dyn, int ninst, int st, int t) for (int i=0; i<8; ++i) { if(dyn->n.x87cache[i]==st) { #if STEP == 1 - if(t==NEON_CACHE_ST_D && (dyn->n.neoncache[dyn->n.x87reg[i]].t==NEON_CACHE_ST_F)) + if(t==NEON_CACHE_ST_D && (dyn->n.neoncache[dyn->n.x87reg[i]].t==NEON_CACHE_ST_F || dyn->n.neoncache[dyn->n.x87reg[i]].t==NEON_CACHE_ST_I64)) neoncache_promote_double(dyn, ninst, st); #endif return i; @@ -1179,7 +1179,9 @@ int x87_get_cache(dynarec_arm_t* dyn, int ninst, int populate, int s1, int s2, i int x87_get_neoncache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st) { for(int ii=0; ii<24; ++ii) - if((dyn->n.neoncache[ii].t == NEON_CACHE_ST_F || dyn->n.neoncache[ii].t == 
NEON_CACHE_ST_D) + if((dyn->n.neoncache[ii].t == NEON_CACHE_ST_F + || dyn->n.neoncache[ii].t == NEON_CACHE_ST_D + || dyn->n.neoncache[ii].t == NEON_CACHE_ST_I64) && dyn->n.neoncache[ii].n==st) return ii; assert(0); @@ -1217,6 +1219,9 @@ void x87_refresh(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st) if(dyn->n.neoncache[dyn->n.x87reg[ret]].t==NEON_CACHE_ST_F) { FCVT_D_S(31, dyn->n.x87reg[ret]); VSTR64_REG_LSL3(31, s1, s2); + } else if(dyn->n.neoncache[dyn->n.x87reg[ret]].t==NEON_CACHE_ST_I64) { + SCVTFDD(31, dyn->n.x87reg[ret]); + VSTR64_REG_LSL3(31, s1, s2); } else { VSTR64_REG_LSL3(dyn->n.x87reg[ret], s1, s2); } @@ -1234,7 +1239,7 @@ void x87_forget(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st) return; MESSAGE(LOG_DUMP, "\tForget x87 Cache for ST%d\n", st); #if STEP == 1 - if(dyn->n.neoncache[dyn->n.x87reg[ret]].t==NEON_CACHE_ST_F) + if(dyn->n.neoncache[dyn->n.x87reg[ret]].t==NEON_CACHE_ST_F || dyn->n.neoncache[dyn->n.x87reg[ret]].t==NEON_CACHE_ST_I64) neoncache_promote_double(dyn, ninst, st); #endif // prepare offset to fpu => s1 @@ -1265,7 +1270,7 @@ void x87_reget_st(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st) // refresh the value MESSAGE(LOG_DUMP, "\tRefresh x87 Cache for ST%d\n", st); #if STEP == 1 - if(dyn->n.neoncache[dyn->n.x87reg[i]].t==NEON_CACHE_ST_F) + if(dyn->n.neoncache[dyn->n.x87reg[i]].t==NEON_CACHE_ST_F || dyn->n.neoncache[dyn->n.x87reg[i]].t==NEON_CACHE_ST_I64) neoncache_promote_double(dyn, ninst, st); #endif ADDx_U12(s1, xEmu, offsetof(x64emu_t, x87)); @@ -1591,10 +1596,20 @@ static int findCacheSlot(dynarec_arm_t* dyn, int ninst, int t, int n, neoncache_ case NEON_CACHE_ST_F: if (t==NEON_CACHE_ST_D) return i; + if (t==NEON_CACHE_ST_I64) + return i; break; case NEON_CACHE_ST_D: if (t==NEON_CACHE_ST_F) return i; + if (t==NEON_CACHE_ST_I64) + return i; + break; + case NEON_CACHE_ST_I64: + if (t==NEON_CACHE_ST_F) + return i; + if (t==NEON_CACHE_ST_D) + return i; break; case NEON_CACHE_XMMR: if(t==NEON_CACHE_XMMW) @@ 
-1684,6 +1699,7 @@ static void loadCache(dynarec_arm_t* dyn, int ninst, int stack_cnt, int s1, int break; case NEON_CACHE_ST_D: case NEON_CACHE_ST_F: + case NEON_CACHE_ST_I64: MESSAGE(LOG_DUMP, "\t - Loading %s\n", getCacheName(t, n)); if((*s3_top) == 0xffff) { LDRw_U12(s3, xEmu, offsetof(x64emu_t, top)); @@ -1705,6 +1721,9 @@ static void loadCache(dynarec_arm_t* dyn, int ninst, int stack_cnt, int s1, int if(t==NEON_CACHE_ST_F) { FCVT_S_D(i, i); } + if(t==NEON_CACHE_ST_I64) { + VFCVTZSQD(i, i); + } break; case NEON_CACHE_NONE: case NEON_CACHE_SCR: @@ -1732,6 +1751,7 @@ static void unloadCache(dynarec_arm_t* dyn, int ninst, int stack_cnt, int s1, in break; case NEON_CACHE_ST_D: case NEON_CACHE_ST_F: + case NEON_CACHE_ST_I64: MESSAGE(LOG_DUMP, "\t - Unloading %s\n", getCacheName(t, n)); if((*s3_top)==0xffff) { LDRw_U12(s3, xEmu, offsetof(x64emu_t, top)); @@ -1751,6 +1771,8 @@ static void unloadCache(dynarec_arm_t* dyn, int ninst, int stack_cnt, int s1, in *s2_val = 0; if(t==NEON_CACHE_ST_F) { FCVT_D_S(i, i); + } else if (t==NEON_CACHE_ST_I64) { + SCVTFDD(i, i); } VSTR64_U12(i, s2, offsetof(x64emu_t, x87)); break; @@ -1880,6 +1902,23 @@ static void fpuCacheTransform(dynarec_arm_t* dyn, int ninst, int s1, int s2, int MESSAGE(LOG_DUMP, "\t - Convert %s\n", getCacheName(cache.neoncache[i].t, cache.neoncache[i].n)); FCVT_D_S(i, i); cache.neoncache[i].t = NEON_CACHE_ST_D; + } else if(cache.neoncache[i].t == NEON_CACHE_ST_D && cache_i2.neoncache[i].t == NEON_CACHE_ST_I64) { + MESSAGE(LOG_DUMP, "\t - Convert %s\n", getCacheName(cache.neoncache[i].t, cache.neoncache[i].n)); + VFCVTZSQD(i, i); + cache.neoncache[i].t = NEON_CACHE_ST_I64; + } else if(cache.neoncache[i].t == NEON_CACHE_ST_F && cache_i2.neoncache[i].t == NEON_CACHE_ST_I64) { + MESSAGE(LOG_DUMP, "\t - Convert %s\n", getCacheName(cache.neoncache[i].t, cache.neoncache[i].n)); + VFCVTZSQS(i, i); + cache.neoncache[i].t = NEON_CACHE_ST_D; + } else if(cache.neoncache[i].t == NEON_CACHE_ST_I64 && cache_i2.neoncache[i].t 
== NEON_CACHE_ST_F) { + MESSAGE(LOG_DUMP, "\t - Convert %s\n", getCacheName(cache.neoncache[i].t, cache.neoncache[i].n)); + SCVTFDD(i, i); + FCVT_S_D(i, i); + cache.neoncache[i].t = NEON_CACHE_ST_F; + } else if(cache.neoncache[i].t == NEON_CACHE_ST_I64 && cache_i2.neoncache[i].t == NEON_CACHE_ST_D) { + MESSAGE(LOG_DUMP, "\t - Convert %s\n", getCacheName(cache.neoncache[i].t, cache.neoncache[i].n)); + SCVTFDD(i, i); + cache.neoncache[i].t = NEON_CACHE_ST_D; } else if(cache.neoncache[i].t == NEON_CACHE_XMMR && cache_i2.neoncache[i].t == NEON_CACHE_XMMW) { cache.neoncache[i].t = NEON_CACHE_XMMW; } else if(cache.neoncache[i].t == NEON_CACHE_XMMW && cache_i2.neoncache[i].t == NEON_CACHE_XMMR) { @@ -2031,7 +2070,9 @@ void fpu_propagate_stack(dynarec_arm_t* dyn, int ninst) { if(dyn->n.stack_pop) { for(int j=0; j<24; ++j) - if((dyn->n.neoncache[j].t == NEON_CACHE_ST_D || dyn->n.neoncache[j].t == NEON_CACHE_ST_F)) { + if((dyn->n.neoncache[j].t == NEON_CACHE_ST_D + || dyn->n.neoncache[j].t == NEON_CACHE_ST_F + || dyn->n.neoncache[j].t == NEON_CACHE_ST_I64)) { if(dyn->n.neoncache[j].n<dyn->n.stack_pop) dyn->n.neoncache[j].v = 0; else diff --git a/src/dynarec/arm64/dynarec_arm64_helper.h b/src/dynarec/arm64/dynarec_arm64_helper.h index ec3dbf33..64866147 100755 --- a/src/dynarec/arm64/dynarec_arm64_helper.h +++ b/src/dynarec/arm64/dynarec_arm64_helper.h @@ -1142,16 +1142,19 @@ int neoncache_st_coherency(dynarec_arm_t* dyn, int ninst, int a, int b); #if STEP == 0 #define ST_IS_F(A) 0 +#define ST_IS_I64(A) 0 #define X87_COMBINE(A, B) NEON_CACHE_ST_D #define X87_ST0 NEON_CACHE_ST_D #define X87_ST(A) NEON_CACHE_ST_D #elif STEP == 1 #define ST_IS_F(A) (neoncache_get_current_st(dyn, ninst, A)==NEON_CACHE_ST_F) +#define ST_IS_I64(A) (neoncache_get_current_st(dyn, ninst, A)==NEON_CACHE_ST_I64) #define X87_COMBINE(A, B) neoncache_combine_st(dyn, ninst, A, B) #define X87_ST0 neoncache_get_current_st(dyn, ninst, 0) #define X87_ST(A) neoncache_get_current_st(dyn, ninst, A) #else #define 
ST_IS_F(A) (neoncache_get_st(dyn, ninst, A)==NEON_CACHE_ST_F) +#define ST_IS_I64(A) (neoncache_get_st(dyn, ninst, A)==NEON_CACHE_ST_I64) #if STEP == 3 #define X87_COMBINE(A, B) neoncache_st_coherency(dyn, ninst, A, B) #else diff --git a/src/dynarec/arm64/dynarec_arm64_private.h b/src/dynarec/arm64/dynarec_arm64_private.h index 52ad44b4..ba802217 100755 --- a/src/dynarec/arm64/dynarec_arm64_private.h +++ b/src/dynarec/arm64/dynarec_arm64_private.h @@ -9,13 +9,14 @@ typedef struct instsize_s instsize_t; #define BARRIER_MAYBE 8 -#define NEON_CACHE_NONE 0 -#define NEON_CACHE_ST_D 1 -#define NEON_CACHE_ST_F 2 -#define NEON_CACHE_MM 3 -#define NEON_CACHE_XMMW 4 -#define NEON_CACHE_XMMR 5 -#define NEON_CACHE_SCR 6 +#define NEON_CACHE_NONE 0 +#define NEON_CACHE_ST_D 1 +#define NEON_CACHE_ST_F 2 +#define NEON_CACHE_ST_I64 3 +#define NEON_CACHE_MM 4 +#define NEON_CACHE_XMMW 5 +#define NEON_CACHE_XMMR 6 +#define NEON_CACHE_SCR 7 typedef union neon_cache_s { int8_t v; struct { |