diff options
| author | ptitSeb <sebastien.chev@gmail.com> | 2024-04-24 15:34:54 +0200 |
|---|---|---|
| committer | ptitSeb <sebastien.chev@gmail.com> | 2024-04-24 15:34:54 +0200 |
| commit | 2a79b604546769e600600f3d85a684641b0bca28 (patch) | |
| tree | e061157fd647be5248117361a21db702d5026791 /src | |
| parent | db32e498790a13f3dc0a78748d47530cf8404015 (diff) | |
| download | box64-2a79b604546769e600600f3d85a684641b0bca28.tar.gz box64-2a79b604546769e600600f3d85a684641b0bca28.zip | |
Changed x87 way of handling FFREE opcode ([DYNAREC] too, improving x87 robustness overall)
Diffstat (limited to 'src')
27 files changed, 428 insertions, 245 deletions
diff --git a/src/dynarec/arm64/dynarec_arm64_00.c b/src/dynarec/arm64/dynarec_arm64_00.c index 1b5915b8..616f953c 100644 --- a/src/dynarec/arm64/dynarec_arm64_00.c +++ b/src/dynarec/arm64/dynarec_arm64_00.c @@ -2161,7 +2161,7 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin if(box64_dynarec_safeflags) { READFLAGS(X_PEND); // lets play safe here too } - BARRIER(BARRIER_FLOAT); + fpu_purgecache(dyn, ninst, 1, x1, x2, x3); // using next, even if there no next i32 = F16; retn_to_epilog(dyn, ninst, rex, i32); *need_epilog = 0; @@ -2173,7 +2173,7 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin if(box64_dynarec_safeflags) { READFLAGS(X_PEND); // so instead, force the deferred flags, so it's not too slow, and flags are not lost } - BARRIER(BARRIER_FLOAT); + fpu_purgecache(dyn, ninst, 1, x1, x2, x3); // using next, even if there no next ret_to_epilog(dyn, ninst, rex); *need_epilog = 0; *ok = 0; @@ -3041,19 +3041,20 @@ uintptr_t dynarec64_00(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin SETFLAGS(X_ALL, SF_SET_NODF); // Hack to set flags to "dont'care" state } // regular call - if(box64_dynarec_callret && box64_dynarec_bigblock>1) { + /*if(box64_dynarec_callret && box64_dynarec_bigblock>1) { BARRIER(BARRIER_FULL); BARRIER_NEXT(BARRIER_FULL); } else { BARRIER(BARRIER_FLOAT); *need_epilog = 0; *ok = 0; - } + }*/ if(rex.is32bits) { MOV32w(x2, addr); } else { TABLE64(x2, addr); } + fpu_purgecache(dyn, ninst, 1, x1, x3, x4); PUSH1z(x2); if(box64_dynarec_callret) { SET_HASCALLRET(); diff --git a/src/dynarec/arm64/dynarec_arm64_d9.c b/src/dynarec/arm64/dynarec_arm64_d9.c index e12ecf72..5690e6be 100644 --- a/src/dynarec/arm64/dynarec_arm64_d9.c +++ b/src/dynarec/arm64/dynarec_arm64_d9.c @@ -136,40 +136,44 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin i1 = x87_get_current_cache(dyn, ninst, 0, NEON_CACHE_ST_D); // value put in x14 if(i1==-1) { - // not in cache, so check Empty status and load it - i2 = -dyn->n.x87stack; - LDRw_U12(x3, xEmu, offsetof(x64emu_t, fpu_stack)); - if(i2) { - if(i2<0) { - ADDw_U12(x3, x3, -i2); - } else { - SUBw_U12(x3, x3, i2); + if(fpu_is_st_freed(dyn, ninst, 0)) { + MOV32w(x4, 0b100000100000000); + B_MARK3_nocond; + } else { + // not in cache, so check Empty status and load it + i2 = -dyn->n.x87stack; + LDRw_U12(x3, xEmu, offsetof(x64emu_t, fpu_stack)); + if(i2) { + if(i2<0) { + ADDw_U12(x3, x3, -i2); + } else { + SUBw_U12(x3, x3, i2); + } } - } - CMPSw_U12(x3, 0); - MOV32w(x3, 0b100000100000000); - CSELx(x4, x3, x4, cLE); // empty: C3,C2,C0 = 101 - B_MARK3(cLE); - // x4 will be the actual top - LDRw_U12(x4, xEmu, offsetof(x64emu_t, top)); - if(i2) { - if(i2<0) { - SUBw_U12(x4, x4, -i2); - } else { - ADDw_U12(x4, x4, i2); + CMPSw_U12(x3, 0); + MOV32w(x3, 0b100000100000000); + CSELx(x4, x3, x4, cLE); // empty: C3,C2,C0 = 101 + B_MARK3(cLE); + // x4 will be the actual top + LDRw_U12(x4, xEmu, offsetof(x64emu_t, top)); + if(i2) { + if(i2<0) { + SUBw_U12(x4, x4, -i2); + } else { + ADDw_U12(x4, x4, i2); + } + ANDw_mask(x4, x4, 0, 3); // (emu->top + i)&7 } - ANDw_mask(x4, x4, 0, 3); // (emu->top + i)&7 + // load tag + LDRH_U12(x3, xEmu, offsetof(x64emu_t, fpu_tags)); + TSTw_mask(x3, 0, 1); // 0b11 + MOV32w(x3, 0b100000100000000); + CSELx(x4, x3, x4, cNE); // empty: C3,C2,C0 = 101 + B_MARK3(cNE); + // load x2 with ST0 anyway, for sign extraction + ADDx_REG_LSL(x1, xEmu, x4, 3); + LDRx_U12(x2, x1, offsetof(x64emu_t, x87)); } - // load tag - ADDx_U12(x1, xEmu, offsetof(x64emu_t, p_regs)); - LDRw_REG_LSL2(x3, x1, x4); - CMPSw_U12(x3, 0b11); // empty - MOV32w(x3, 0b100000100000000); - CSELx(x4, x3, x4, cEQ); // empty: C3,C2,C0 = 101 - B_MARK3(cEQ); - // load x2 with ST0 anyway, for sign extraction - ADDx_REG_LSL(x1, xEmu, x4, 3); - LDRx_U12(x2, x1, offsetof(x64emu_t, x87)); } else { // simply move from cache reg to x2 v1 = dyn->n.x87reg[i1]; diff --git a/src/dynarec/arm64/dynarec_arm64_functions.c b/src/dynarec/arm64/dynarec_arm64_functions.c index 73946f88..f62ade0d 100644 --- a/src/dynarec/arm64/dynarec_arm64_functions.c +++ b/src/dynarec/arm64/dynarec_arm64_functions.c @@ -389,6 +389,7 @@ void neoncacheUnwind(neoncache_t* cache) // unswap int a = -1; int b = -1; + // in neoncache for(int j=0; j<24 && ((a==-1) || (b==-1)); ++j) if((cache->neoncache[j].t == NEON_CACHE_ST_D || cache->neoncache[j].t == NEON_CACHE_ST_F || cache->neoncache[j].t == NEON_CACHE_ST_I64)) { if(cache->neoncache[j].n == cache->combined1) @@ -401,11 +402,12 @@ void neoncacheUnwind(neoncache_t* cache) cache->neoncache[a].n = cache->neoncache[b].n; cache->neoncache[b].n = tmp; } + // done cache->swapped = 0; cache->combined1 = cache->combined2 = 0; } if(cache->news) { - // reove the newly created neoncache + // remove the newly created neoncache for(int i=0; i<24; ++i) if(cache->news&(1<<i)) cache->neoncache[i].v = 0; @@ -422,11 +424,23 @@ void neoncacheUnwind(neoncache_t* cache) } } cache->x87stack-=cache->stack_push; + cache->tags>>=(cache->stack_push*2); cache->stack-=cache->stack_push; + if(cache->pushed>=cache->stack_push) + cache->pushed-=cache->stack_push; + else + cache->pushed = 0; cache->stack_push = 0; } cache->x87stack+=cache->stack_pop; cache->stack_next = cache->stack; + if(cache->stack_pop) { + if(cache->poped>=cache->stack_pop) + cache->poped-=cache->stack_pop; + else + cache->poped = 0; + cache->tags<<=(cache->stack_pop*2); + } cache->stack_pop = 0; cache->barrier = 0; // And now, rebuild the x87cache info with neoncache @@ -594,10 +608,9 @@ void print_opcode(dynarec_native_t* dyn, int ninst, uint32_t opcode) static void x87_reset(neoncache_t* n) { - for (int i=0; i<8; ++i) { + for (int i=0; i<8; ++i) n->x87cache[i] = -1; - n->freed[i] = -1; - } + n->tags = 0; n->x87stack = 0; n->stack = 0; n->stack_next = 0; @@ -606,6 +619,9 @@ static void x87_reset(neoncache_t* n) n->combined1 = n->combined2 = 0; n->swapped = 0; n->barrier = 0; + n->pushed = 0; + n->poped = 0; + for(int i=0; i<24; ++i) if(n->neoncache[i].t == NEON_CACHE_ST_F || n->neoncache[i].t == NEON_CACHE_ST_D @@ -641,3 +657,8 @@ void fpu_reset_ninst(dynarec_arm_t* dyn, int ninst) sse_reset(&dyn->insts[ninst].n); fpu_reset_reg_neoncache(&dyn->insts[ninst].n); } + +int fpu_is_st_freed(dynarec_native_t* dyn, int ninst, int st) +{ + return (dyn->n.tags&(0b11<<(st*2)))?1:0; +} \ No newline at end of file diff --git a/src/dynarec/arm64/dynarec_arm64_functions.h b/src/dynarec/arm64/dynarec_arm64_functions.h index 56039889..abe827bb 100644 --- a/src/dynarec/arm64/dynarec_arm64_functions.h +++ b/src/dynarec/arm64/dynarec_arm64_functions.h @@ -58,4 +58,7 @@ void print_opcode(dynarec_native_t* dyn, int ninst, uint32_t opcode); // reset the cache void fpu_reset(dynarec_native_t* dyn); void fpu_reset_ninst(dynarec_native_t* dyn, int ninst); + +// is st freed +int fpu_is_st_freed(dynarec_native_t* dyn, int ninst, int st); #endif //__DYNAREC_ARM_FUNCTIONS_H__ diff --git a/src/dynarec/arm64/dynarec_arm64_helper.c b/src/dynarec/arm64/dynarec_arm64_helper.c index b8eb06f2..e3c2104f 100644 --- a/src/dynarec/arm64/dynarec_arm64_helper.c +++ b/src/dynarec/arm64/dynarec_arm64_helper.c @@ -941,6 +941,9 @@ int x87_do_push(dynarec_arm_t* dyn, int ninst, int s1, int t) dyn->n.stack+=1; dyn->n.stack_next+=1; dyn->n.stack_push+=1; + ++dyn->n.pushed; + if(dyn->n.poped) + --dyn->n.poped; // move all regs in cache, and find a free one for(int j=0; j<24; ++j) if((dyn->n.neoncache[j].t == NEON_CACHE_ST_D) @@ -948,9 +951,8 @@ int x87_do_push(dynarec_arm_t* dyn, int ninst, int s1, int t) ||(dyn->n.neoncache[j].t == NEON_CACHE_ST_I64)) ++dyn->n.neoncache[j].n; int ret = -1; + dyn->n.tags<<=2; for(int i=0; i<8; ++i) { - if(dyn->n.freed[i]!=-1) - ++dyn->n.freed[i]; if(dyn->n.x87cache[i]!=-1) ++dyn->n.x87cache[i]; else if(ret==-1) { @@ -973,6 +975,9 @@ void x87_do_push_empty(dynarec_arm_t* dyn, int ninst, int s1) dyn->n.stack+=1; dyn->n.stack_next+=1; dyn->n.stack_push+=1; + ++dyn->n.pushed; + if(dyn->n.poped) + --dyn->n.poped; // move all regs in cache for(int j=0; j<24; ++j) if((dyn->n.neoncache[j].t == NEON_CACHE_ST_D) @@ -980,9 +985,8 @@ void x87_do_push_empty(dynarec_arm_t* dyn, int ninst, int s1) ||(dyn->n.neoncache[j].t == NEON_CACHE_ST_I64)) ++dyn->n.neoncache[j].n; int ret = -1; + dyn->n.tags<<=2; for(int i=0; i<8; ++i) { - if(dyn->n.freed[i]!=-1) - ++dyn->n.freed[i]; if(dyn->n.x87cache[i]!=-1) ++dyn->n.x87cache[i]; else if(ret==-1) @@ -1008,17 +1012,11 @@ void static internal_x87_dopop(dynarec_arm_t* dyn) } int static internal_x87_dofree(dynarec_arm_t* dyn) { - int ret = 0; - for(int i=0; i<8; ++i) - if(dyn->n.freed[i]!=-1) { - --dyn->n.freed[i]; - if(dyn->n.freed[i]<=0) { - MESSAGE(LOG_DUMP, "\t--------x87 FREED ST0, poping 1 more\n"); - dyn->n.freed[i] = -1; - ret = 1; - } - } - return ret; + if(dyn->n.tags&0b11) { + MESSAGE(LOG_DUMP, "\t--------x87 FREED ST0, poping 1 more\n"); + return 1; + } + return 0; } void x87_do_pop(dynarec_arm_t* dyn, int ninst, int s1) { @@ -1028,6 +1026,10 @@ void x87_do_pop(dynarec_arm_t* dyn, int ninst, int s1) dyn->n.x87stack-=1; dyn->n.stack_next-=1; dyn->n.stack_pop+=1; + dyn->n.tags>>=2; + ++dyn->n.poped; + if(dyn->n.pushed) + --dyn->n.pushed; // move all regs in cache, poping ST0 internal_x87_dopop(dyn); } while(internal_x87_dofree(dyn)); @@ -1051,8 +1053,9 @@ void x87_purgecache(dynarec_arm_t* dyn, int ninst, int next, int s1, int s2, int int a = dyn->n.x87stack; if(a!=0) { // reset x87stack - if(!next) + if(!next) { dyn->n.x87stack = 0; + } // Add x87stack to emu fpu_stack LDRw_U12(s2, xEmu, offsetof(x64emu_t, fpu_stack)); if(a>0) { @@ -1063,31 +1066,33 @@ void x87_purgecache(dynarec_arm_t* dyn, int ninst, int next, int s1, int s2, int STRw_U12(s2, xEmu, offsetof(x64emu_t, fpu_stack)); // Sub x87stack to top, with and 7 LDRw_U12(s2, xEmu, offsetof(x64emu_t, top)); - // update tags (and top at the same time) if(a>0) { - // new tag to fulls - MOVZw(s3, 0); - ADDx_U12(s1, xEmu, offsetof(x64emu_t, p_regs)); - for (int i=0; i<a; ++i) { - SUBw_U12(s2, s2, 1); - ANDw_mask(s2, s2, 0, 2); //mask=7 // (emu->top + st)&7 - if(x87_is_stcached(dyn, i)) // to handle ffree - STRw_REG_LSL2(s3, s1, s2); - } + SUBw_U12(s2, s2, a); } else { - // empty tags - MOVZw(s3, 0b11); - ADDx_U12(s1, xEmu, offsetof(x64emu_t, p_regs)); - for (int i=0; i<-a; ++i) { - STRw_REG_LSL2(s3, s1, s2); - ADDw_U12(s2, s2, 1); - ANDw_mask(s2, s2, 0, 2); //mask=7 // (emu->top + st)&7 - } + ADDw_U12(s2, s2, -a); } + ANDw_mask(s2, s2, 0, 2); STRw_U12(s2, xEmu, offsetof(x64emu_t, top)); + // update tags (and top at the same time) + LDRH_U12(s1, xEmu, offsetof(x64emu_t, fpu_tags)); + if(a>0) { + LSLw_IMM(s1, s1, a*2); + } else { + ORRw_mask(s1, s1, 0b010000, 0b001111); // 0xffff0000 + LSRw_IMM(s1, s1, -a*2); + } + STRH_U12(s1, xEmu, offsetof(x64emu_t, fpu_tags)); } else { LDRw_U12(s2, xEmu, offsetof(x64emu_t, top)); } + // check if free is used + if(dyn->n.tags) { + LDRH_U12(s1, xEmu, offsetof(x64emu_t, fpu_tags)); + MOV32w(s3, dyn->n.tags); + ORRw_REG(s1, s1, s3); + STRH_U12(s1, xEmu, offsetof(x64emu_t, fpu_tags)); + } + if(ret!=0) { // --- set values // prepare offset to fpu => s1 @@ -1133,11 +1138,13 @@ void x87_purgecache(dynarec_arm_t* dyn, int ninst, int next, int s1, int s2, int } if(!next) { dyn->n.stack_next = 0; - for(int i=0; i<8; ++i) - dyn->n.freed[i] = -1; + dyn->n.tags = 0; #if STEP < 2 // refresh the cached valued, in case it's a purge outside a instruction dyn->insts[ninst].n.barrier = 1; + dyn->n.pushed = 0; + dyn->n.poped = 0; + #endif } MESSAGE(LOG_DUMP, "\t---Purge x87 Cache and Synch Stackcount\n"); @@ -1165,6 +1172,15 @@ static void x87_reflectcache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int } ANDw_mask(s2, s2, 0, 2); //mask=7 STRw_U12(s2, xEmu, offsetof(x64emu_t, top)); + // update tags + LDRH_U12(s1, xEmu, offsetof(x64emu_t, fpu_tags)); + if(a>0) { + LSLw_IMM(s1, s1, a*2); + } else { + ORRw_mask(s1, s1, 0b010000, 0b001111); // 0xffff0000 + LSRw_IMM(s1, s1, -a*2); + } + STRH_U12(s1, xEmu, offsetof(x64emu_t, fpu_tags)); } int ret = 0; for (int i=0; (i<8) && (!ret); ++i) @@ -1213,6 +1229,15 @@ static void x87_unreflectcache(dynarec_arm_t* dyn, int ninst, int s1, int s2, in } ANDw_mask(s2, s2, 0, 2); //mask=7 STRw_U12(s2, xEmu, offsetof(x64emu_t, top)); + // update tags + LDRH_U12(s1, xEmu, offsetof(x64emu_t, fpu_tags)); + if(a>0) { + ORRw_mask(s1, s1, 0b010000, 0b001111); // 0xffff0000 + LSRw_IMM(s1, s1, a*2); + } else { + LSLw_IMM(s1, s1, -a*2); + } + STRH_U12(s1, xEmu, offsetof(x64emu_t, fpu_tags)); } } @@ -1439,17 +1464,8 @@ void x87_free(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3, int st) ANDw_mask(s2, s2, 0, 2); //mask=7 // (emu->top + i)&7 } } - // mark as free - ADDx_U12(s1, xEmu, offsetof(x64emu_t, p_regs)); - MOVZw(s3, 0b11); - STRw_REG_LSL2(s3, s1, s2); // add mark in the freed array - for(int i=0; i<8; ++i) - if(dyn->n.freed[i]==-1) { - dyn->n.freed[i]=st; - MESSAGE(LOG_DUMP, "\t--------x87 Marked ST%d as Freed\n", st); - break; - } + dyn->n.tags |= 0b11<<(st*2); MESSAGE(LOG_DUMP, "\t--------x87 FFREE for ST%d\n", st); } @@ -1984,26 +2000,15 @@ static void fpuCacheTransform(dynarec_arm_t* dyn, int ninst, int s1, int s2, int STRw_U12(s3, xEmu, offsetof(x64emu_t, fpu_stack)); // Sub x87stack to top, with and 7 LDRw_U12(s3, xEmu, offsetof(x64emu_t, top)); - // update tags (and top at the same time) + // update tags + LDRH_U12(s2, xEmu, offsetof(x64emu_t, fpu_tags)); if(a>0) { - // new tag to fulls - MOVZw(s2, 0); - ADDx_U12(s1, xEmu, offsetof(x64emu_t, p_regs)); - for (int i=0; i<a; ++i) { - SUBw_U12(s3, s3, 1); - ANDw_mask(s3, s3, 0, 2); // (emu->top + st)&7 - STRw_REG_LSL2(s2, s1, s3); // that slot is full - } + LSLw_IMM(s2, s2, a*2); } else { - // empty tags - MOVZw(s2, 0b11); - ADDx_U12(s1, xEmu, offsetof(x64emu_t, p_regs)); - for (int i=0; i<-a; ++i) { - STRw_REG_LSL2(s2, s1, s3); // empty slot before leaving it - ADDw_U12(s3, s3, 1); - ANDw_mask(s3, s3, 0, 2); // (emu->top + st)&7 - } + ORRw_mask(s2, s2, 0b010000, 0b001111); // 0xffff0000 + LSRw_IMM(s2, s2, -a*2); } + STRH_U12(s2, xEmu, offsetof(x64emu_t, fpu_tags)); STRw_U12(s3, xEmu, offsetof(x64emu_t, top)); s3_top = 0; stack_cnt = cache_i2.stack; @@ -2243,8 +2248,14 @@ void fpu_reset_cache(dynarec_arm_t* dyn, int ninst, int reset_n) #if STEP > 1 // for STEP 2 & 3, just need to refrest with current, and undo the changes (push & swap) dyn->n = dyn->insts[ninst].n; + #else + dyn->n = dyn->insts[reset_n].n; + #endif neoncacheUnwind(&dyn->n); - #ifdef HAVE_TRACE + #if STEP == 0 + if(box64_dynarec_dump) dynarec_log(LOG_NONE, "New x87stack=%d\n", dyn->n.x87stack); + #endif + #if defined(HAVE_TRACE) && (STEP>2) if(box64_dynarec_dump) if(memcmp(&dyn->n, &dyn->insts[reset_n].n, sizeof(neon_cache_t))) { MESSAGE(LOG_DEBUG, "Warning, difference in neoncache: reset="); @@ -2274,9 +2285,6 @@ void fpu_reset_cache(dynarec_arm_t* dyn, int ninst, int reset_n) MESSAGE(LOG_DEBUG, "\n"); } #endif //HAVE_TRACE - #else - dyn->n = dyn->insts[reset_n].n; - #endif } // propagate ST stack state, especial stack pop that are deferred diff --git a/src/dynarec/arm64/dynarec_arm64_helper.h b/src/dynarec/arm64/dynarec_arm64_helper.h index 617aecab..d62c571c 100644 --- a/src/dynarec/arm64/dynarec_arm64_helper.h +++ b/src/dynarec/arm64/dynarec_arm64_helper.h @@ -812,28 +812,28 @@ #define X87_PUSH_EMPTY_OR_FAIL(dyn, ninst, scratch) x87_do_push_empty(dyn, ninst, scratch) #define X87_POP_OR_FAIL(dyn, ninst, scratch) x87_do_pop(dyn, ninst, scratch) #else -#define X87_PUSH_OR_FAIL(var, dyn, ninst, scratch, t) \ - if (dyn->n.x87stack == +8) { \ - if(box64_dynarec_dump) dynarec_log(LOG_INFO, " Warning, suspicious x87 Push, stack=%d on inst %d\n", dyn->n.x87stack, ninst); \ - dyn->abort = 1; \ - return addr; \ - } \ +#define X87_PUSH_OR_FAIL(var, dyn, ninst, scratch, t) \ + if ((dyn->n.x87stack==8) || (dyn->n.pushed==8)) { \ + if(box64_dynarec_dump) dynarec_log(LOG_NONE, " Warning, suspicious x87 Push, stack=%d/%d on inst %d\n", dyn->n.x87stack, dyn->n.pushed, ninst); \ + dyn->abort = 1; \ + return addr; \ + } \ var = x87_do_push(dyn, ninst, scratch, t) -#define X87_PUSH_EMPTY_OR_FAIL(dyn, ninst, scratch) \ - if (dyn->n.x87stack == +8) { \ - if(box64_dynarec_dump) dynarec_log(LOG_INFO, " Warning, suspicious x87 Push, stack=%d on inst %d\n", dyn->n.x87stack, ninst); \ - dyn->abort = 1; \ - return addr; \ - } \ +#define X87_PUSH_EMPTY_OR_FAIL(dyn, ninst, scratch) \ + if ((dyn->n.x87stack==8) || (dyn->n.pushed==8)) { \ + if(box64_dynarec_dump) dynarec_log(LOG_NONE, " Warning, suspicious x87 Push, stack=%d/%d on inst %d\n", dyn->n.x87stack, dyn->n.pushed, ninst); \ + dyn->abort = 1; \ + return addr; \ + } \ x87_do_push_empty(dyn, ninst, scratch) -#define X87_POP_OR_FAIL(dyn, ninst, scratch) \ - if (dyn->n.x87stack == -8) { \ - if(box64_dynarec_dump) dynarec_log(LOG_INFO, " Warning, suspicious x87 Pop, stack=%d on inst %d\n", dyn->n.x87stack, ninst); \ - dyn->abort = 1; \ - return addr; \ - } \ +#define X87_POP_OR_FAIL(dyn, ninst, scratch) \ + if ((dyn->n.x87stack==-8) || (dyn->n.poped==8)) { \ + if(box64_dynarec_dump) dynarec_log(LOG_NONE, " Warning, suspicious x87 Pop, stack=%d/%d on inst %d\n", dyn->n.x87stack, dyn->n.poped, ninst); \ + dyn->abort = 1; \ + return addr; \ + } \ x87_do_pop(dyn, ninst, scratch) #endif diff --git a/src/dynarec/arm64/dynarec_arm64_pass0.h b/src/dynarec/arm64/dynarec_arm64_pass0.h index e8a4b0a8..7d4c0c2d 100644 --- a/src/dynarec/arm64/dynarec_arm64_pass0.h +++ b/src/dynarec/arm64/dynarec_arm64_pass0.h @@ -16,7 +16,7 @@ dyn->f.pending=(B)&SF_SET_PENDING; \ dyn->f.dfnone=((B)&SF_SET)?(((B)==SF_SET_NODF)?0:1):0; #define EMIT(A) dyn->native_size+=4 -#define JUMP(A, C) add_jump(dyn, ninst); add_next(dyn, (uintptr_t)A); SMEND(); dyn->insts[ninst].x64.jmp = A; dyn->insts[ninst].x64.jmp_cond = C +#define JUMP(A, C) add_jump(dyn, ninst); add_next(dyn, (uintptr_t)A); SMEND(); dyn->insts[ninst].x64.jmp = A; dyn->insts[ninst].x64.jmp_cond = C; dyn->insts[ninst].x64.jmp_insts = 0 #define BARRIER(A) if(A!=BARRIER_MAYBE) {fpu_purgecache(dyn, ninst, 0, x1, x2, x3); dyn->insts[ninst].x64.barrier = A;} else dyn->insts[ninst].barrier_maybe = 1 #define BARRIER_NEXT(A) dyn->insts[ninst].x64.barrier_next = A #define SET_HASCALLRET() dyn->insts[ninst].x64.has_callret = 1 diff --git a/src/dynarec/arm64/dynarec_arm64_private.h b/src/dynarec/arm64/dynarec_arm64_private.h index 6db73b1c..b26d522d 100644 --- a/src/dynarec/arm64/dynarec_arm64_private.h +++ b/src/dynarec/arm64/dynarec_arm64_private.h @@ -42,11 +42,13 @@ typedef struct neoncache_s { uint8_t combined2; uint8_t swapped; // the combined reg were swapped uint8_t barrier; // is there a barrier at instruction epilog? + uint8_t pushed; // positive pushed value (to check for overflow) + uint8_t poped; // positive poped value (to check for underflow) uint32_t news; // bitmask, wich neoncache are new for this opcode // fpu cache int8_t x87cache[8]; // cache status for the 8 x87 register behind the fpu stack int8_t x87reg[8]; // reg used for x87cache entry - int8_t freed[8]; // set when FFREE is used, -1 else + int16_t tags; // similar to fpu_tags int8_t mmxcache[8]; // cache status for the 8 MMX registers sse_cache_t ssecache[16]; // cache status for the 16 SSE(2) registers int8_t fpuused[24]; // all 0..24 double reg from fpu, used by x87, sse and mmx diff --git a/src/dynarec/dynarec_native.c b/src/dynarec/dynarec_native.c index d53861ec..8eb69080 100644 --- a/src/dynarec/dynarec_native.c +++ b/src/dynarec/dynarec_native.c @@ -103,7 +103,7 @@ void add_jump(dynarec_native_t *dyn, int ninst) { int get_first_jump(dynarec_native_t *dyn, int next) { for(int i=0; i<dyn->jmp_sz; ++i) if(dyn->insts[dyn->jmps[i]].x64.jmp == next) - return i; + return dyn->jmps[i]; return -2; } @@ -544,9 +544,11 @@ void* FillBlock64(dynablock_t* block, uintptr_t addr, int alternate, int is32bit if(helper.insts[i2].x64.addr==j) k=i2; }*/ - if(k!=-1 && !helper.insts[i].barrier_maybe) - helper.insts[k].x64.barrier |= BARRIER_FULL; - helper.insts[i].x64.jmp_insts = k; + if(k!=-1) { + if(k!=-1 && !helper.insts[i].barrier_maybe) + helper.insts[k].x64.barrier |= BARRIER_FULL; + helper.insts[i].x64.jmp_insts = k; + } } } // no need for next and jmps anymore diff --git a/src/dynarec/dynarec_native_pass.c b/src/dynarec/dynarec_native_pass.c index 11689307..0288cbeb 100644 --- a/src/dynarec/dynarec_native_pass.c +++ b/src/dynarec/dynarec_native_pass.c @@ -171,7 +171,6 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr, int alternate, int #if STEP > 0 if(!dyn->insts[ninst].x64.has_next && dyn->insts[ninst].x64.jmp && dyn->insts[ninst].x64.jmp_insts!=-1) next = dyn->insts[ninst].x64.jmp_insts; - #endif if(dyn->insts[ninst].x64.has_next && dyn->insts[next].x64.barrier) { if(dyn->insts[next].x64.barrier&BARRIER_FLOAT) { fpu_purgecache(dyn, ninst, 0, x1, x2, x3); @@ -182,6 +181,7 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr, int alternate, int dyn->last_ip = 0; } } + #endif #ifndef PROT_READ #define PROT_READ 1 #endif @@ -216,7 +216,8 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr, int alternate, int if(dyn->forward) { if(dyn->forward_to == addr && !need_epilog) { // we made it! - if(box64_dynarec_dump) dynarec_log(LOG_NONE, "Forward extend block for %d bytes %s%p -> %p\n", dyn->forward_to-dyn->forward, dyn->insts[dyn->forward_ninst].x64.has_callret?"(opt. call) ":"", (void*)dyn->forward, (void*)dyn->forward_to); + reset_n = get_first_jump(dyn, addr); + if(box64_dynarec_dump) dynarec_log(LOG_NONE, "Forward extend block for %d bytes %s%p -> %p (ninst %d - %d)\n", dyn->forward_to-dyn->forward, dyn->insts[dyn->forward_ninst].x64.has_callret?"(opt. call) ":"", (void*)dyn->forward, (void*)dyn->forward_to, reset_n, ninst); if(dyn->insts[dyn->forward_ninst].x64.has_callret && !dyn->insts[dyn->forward_ninst].x64.has_next) dyn->insts[dyn->forward_ninst].x64.has_next = 1; // this block actually continue dyn->forward = 0; @@ -252,7 +253,7 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr, int alternate, int // and pred table is not ready yet reset_n = get_first_jump(dyn, next); } - if(box64_dynarec_dump) dynarec_log(LOG_NONE, "Extend block %p, %s%p -> %p (ninst=%d, jump from %d)\n", dyn, dyn->insts[ninst].x64.has_callret?"(opt. call) ":"", (void*)addr, (void*)next, ninst, dyn->insts[ninst].x64.has_callret?ninst:reset_n); + if(box64_dynarec_dump) dynarec_log(LOG_NONE, "Extend block %p, %s%p -> %p (ninst=%d, jump from %d)\n", dyn, dyn->insts[ninst].x64.has_callret?"(opt. call) ":"", (void*)addr, (void*)next, ninst+1, dyn->insts[ninst].x64.has_callret?ninst:reset_n); } else if(next && (next-addr)<box64_dynarec_forward && (getProtection(next)&PROT_READ)/*box64_dynarec_bigblock>=stopblock*/) { if(!((box64_dynarec_bigblock<stopblock) && !isJumpTableDefault64((void*)next))) { if(dyn->forward) { @@ -295,6 +296,8 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr, int alternate, int } #endif } + if(ok && dyn->insts[ninst].x64.has_callret) + reset_n = -2; ++ninst; #if STEP == 0 memset(&dyn->insts[ninst], 0, sizeof(instruction_native_t)); diff --git a/src/dynarec/la64/dynarec_la64_00.c b/src/dynarec/la64/dynarec_la64_00.c index ebb6f4f8..706094db 100644 --- a/src/dynarec/la64/dynarec_la64_00.c +++ b/src/dynarec/la64/dynarec_la64_00.c @@ -1207,7 +1207,7 @@ uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni if (box64_dynarec_safeflags) { READFLAGS(X_PEND); // lets play safe here too } - BARRIER(BARRIER_FLOAT); + fpu_purgecache(dyn, ninst, 1, x1, x2, x3); // using next, even if there no next i32 = F16; retn_to_epilog(dyn, ninst, rex, i32); *need_epilog = 0; @@ -1219,7 +1219,7 @@ uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni if (box64_dynarec_safeflags) { READFLAGS(X_PEND); // so instead, force the deferred flags, so it's not too slow, and flags are not lost } - BARRIER(BARRIER_FLOAT); + fpu_purgecache(dyn, ninst, 1, x1, x2, x3); // using next, even if there no next ret_to_epilog(dyn, ninst, rex); *need_epilog = 0; *ok = 0; @@ -1534,19 +1534,20 @@ uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni SETFLAGS(X_ALL, SF_SET); // Hack to set flags to "dont'care" state } // regular call - if (box64_dynarec_callret && box64_dynarec_bigblock > 1) { + /*if (box64_dynarec_callret && box64_dynarec_bigblock > 1) { BARRIER(BARRIER_FULL); } else { BARRIER(BARRIER_FLOAT); *need_epilog = 0; *ok = 0; - } + }*/ if (rex.is32bits) { MOV32w(x2, addr); } else { TABLE64(x2, addr); } + fpu_purgecache(dyn, ninst, 1, x1, x3, x4); PUSH1z(x2); if (box64_dynarec_callret) { SET_HASCALLRET(); diff --git a/src/dynarec/la64/dynarec_la64_pass0.h b/src/dynarec/la64/dynarec_la64_pass0.h index 999b48e1..1eef76bf 100644 --- a/src/dynarec/la64/dynarec_la64_pass0.h +++ b/src/dynarec/la64/dynarec_la64_pass0.h @@ -16,7 +16,7 @@ dyn->f.pending = (B) & SF_SET_PENDING; \ dyn->f.dfnone = ((B) & SF_SET) ? 1 : 0; #define EMIT(A) dyn->native_size += 4 -#define JUMP(A, C) add_jump(dyn, ninst); add_next(dyn, (uintptr_t)A); dyn->insts[ninst].x64.jmp = A; dyn->insts[ninst].x64.jmp_cond = C +#define JUMP(A, C) add_jump(dyn, ninst); add_next(dyn, (uintptr_t)A); SMEND(); dyn->insts[ninst].x64.jmp = A; dyn->insts[ninst].x64.jmp_cond = C; dyn->insts[ninst].x64.jmp_insts = 0 #define BARRIER(A) \ if (A != BARRIER_MAYBE) { \ fpu_purgecache(dyn, ninst, 0, x1, x2, x3); \ diff --git a/src/dynarec/rv64/dynarec_rv64_00_3.c b/src/dynarec/rv64/dynarec_rv64_00_3.c index df22e054..93bf1690 100644 --- a/src/dynarec/rv64/dynarec_rv64_00_3.c +++ b/src/dynarec/rv64/dynarec_rv64_00_3.c @@ -279,7 +279,7 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int if(box64_dynarec_safeflags) { READFLAGS(X_PEND); // lets play safe here too } - BARRIER(BARRIER_FLOAT); + fpu_purgecache(dyn, ninst, 1, x1, x2, x3); // using next, even if there no next i32 = F16; retn_to_epilog(dyn, ninst, rex, i32); *need_epilog = 0; @@ -291,7 +291,7 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int if(box64_dynarec_safeflags) { READFLAGS(X_PEND); // so instead, force the deferred flags, so it's not too slow, and flags are not lost } - BARRIER(BARRIER_FLOAT); + fpu_purgecache(dyn, ninst, 1, x1, x2, x3); // using next, even if there no next ret_to_epilog(dyn, ninst, rex); *need_epilog = 0; *ok = 0; @@ -903,19 +903,20 @@ uintptr_t dynarec64_00_3(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int SETFLAGS(X_ALL, SF_SET); // Hack to set flags to "dont'care" state } // regular call - if(box64_dynarec_callret && box64_dynarec_bigblock>1) { + /*if(box64_dynarec_callret && box64_dynarec_bigblock>1) { BARRIER(BARRIER_FULL); } else { BARRIER(BARRIER_FLOAT); *need_epilog = 0; *ok = 0; - } + }*/ if(rex.is32bits) { MOV32w(x2, addr); } else { TABLE64(x2, addr); } + fpu_purgecache(dyn, ninst, 1, x1, x3, x4); PUSH1z(x2); if(box64_dynarec_callret) { SET_HASCALLRET(); diff --git a/src/dynarec/rv64/dynarec_rv64_functions.c b/src/dynarec/rv64/dynarec_rv64_functions.c index 3b95c06e..b016e45b 100644 --- a/src/dynarec/rv64/dynarec_rv64_functions.c +++ b/src/dynarec/rv64/dynarec_rv64_functions.c @@ -422,11 +422,23 @@ void extcacheUnwind(extcache_t* cache) } } cache->x87stack-=cache->stack_push; + cache->tags>>=(cache->stack_push*2); cache->stack-=cache->stack_push; + if(cache->pushed>=cache->stack_push) + cache->pushed-=cache->stack_push; + else + cache->pushed = 0; cache->stack_push = 0; } cache->x87stack+=cache->stack_pop; cache->stack_next = cache->stack; + if(cache->stack_pop) { + if(cache->poped>=cache->stack_pop) + cache->poped-=cache->stack_pop; + else + cache->poped = 0; + cache->tags<<=(cache->stack_pop*2); + } cache->stack_pop = 0; cache->barrier = 0; // And now, rebuild the x87cache info with extcache @@ -631,6 +643,9 @@ static void x87_reset(extcache_t* e) e->combined1 = e->combined2 = 0; e->swapped = 0; e->barrier = 0; + e->pushed = 0; + e->poped = 0; + for(int i=0; i<24; ++i) if (e->extcache[i].t == EXT_CACHE_ST_F || e->extcache[i].t == EXT_CACHE_ST_D @@ -666,3 +681,8 @@ void fpu_reset_ninst(dynarec_rv64_t* dyn, int ninst) sse_reset(&dyn->insts[ninst].e); fpu_reset_reg_extcache(&dyn->insts[ninst].e); } + +int fpu_is_st_freed(dynarec_rv64_t* dyn, int ninst, int st) +{ + return (dyn->e.tags&(0b11<<(st*2)))?1:0; +} \ No newline at end of file diff --git a/src/dynarec/rv64/dynarec_rv64_functions.h b/src/dynarec/rv64/dynarec_rv64_functions.h index 01b5e9a4..e3a5171d 100644 --- a/src/dynarec/rv64/dynarec_rv64_functions.h +++ b/src/dynarec/rv64/dynarec_rv64_functions.h @@ -58,4 +58,7 @@ void print_newinst(dynarec_native_t* dyn, int ninst); // reset the cache void fpu_reset(dynarec_native_t* dyn); void fpu_reset_ninst(dynarec_native_t* dyn, int ninst); + +// is st freed +int fpu_is_st_freed(dynarec_native_t* dyn, int ninst, int st); #endif //__DYNAREC_RV64_FUNCTIONS_H__ diff --git a/src/dynarec/rv64/dynarec_rv64_helper.c b/src/dynarec/rv64/dynarec_rv64_helper.c index 6c5d25ee..e5b9ac51 100644 --- a/src/dynarec/rv64/dynarec_rv64_helper.c +++ b/src/dynarec/rv64/dynarec_rv64_helper.c @@ -875,6 +875,9 @@ int x87_do_push(dynarec_rv64_t* dyn, int ninst, int s1, int t) dyn->e.stack+=1; dyn->e.stack_next+=1; dyn->e.stack_push+=1; + ++dyn->e.pushed; + if(dyn->e.poped) + --dyn->e.poped; // move all regs in cache, and find a free one for(int j=0; j<24; ++j) if ((dyn->e.extcache[j].t == EXT_CACHE_ST_D) @@ -882,6 +885,7 @@ int x87_do_push(dynarec_rv64_t* dyn, int ninst, int s1, int t) || (dyn->e.extcache[j].t == EXT_CACHE_ST_I64)) ++dyn->e.extcache[j].n; int ret = -1; + dyn->e.tags<<=2; for(int i=0; i<8; ++i) if(dyn->e.x87cache[i]!=-1) ++dyn->e.x87cache[i]; @@ -900,26 +904,31 @@ void x87_do_push_empty(dynarec_rv64_t* dyn, int ninst, int s1) dyn->e.stack+=1; dyn->e.stack_next+=1; dyn->e.stack_push+=1; + ++dyn->e.pushed; + if(dyn->e.poped) + --dyn->e.poped; // move all regs in cache for(int j=0; j<24; ++j) if ((dyn->e.extcache[j].t == EXT_CACHE_ST_D) || (dyn->e.extcache[j].t == EXT_CACHE_ST_F) || (dyn->e.extcache[j].t == EXT_CACHE_ST_I64)) ++dyn->e.extcache[j].n; + int ret = -1; + dyn->e.tags<<=2; for(int i=0; i<8; ++i) if(dyn->e.x87cache[i]!=-1) ++dyn->e.x87cache[i]; + else if(ret==-1) + ret = i; + if(ret==-1) { + MESSAGE(LOG_DUMP, "Incoherent x87 stack cache, aborting\n"); + dyn->abort = 1; + } if(s1) x87_stackcount(dyn, ninst, s1); } -void x87_do_pop(dynarec_rv64_t* dyn, int ninst, int s1) +void static internal_x87_dopop(dynarec_rv64_t* dyn) { - if(dyn->e.mmxcount) - mmx_purgecache(dyn, ninst, 0, s1); - dyn->e.x87stack-=1; - dyn->e.stack_next-=1; - dyn->e.stack_pop+=1; - // move all regs in cache, poping ST0 for(int i=0; i<8; ++i) if(dyn->e.x87cache[i]!=-1) { --dyn->e.x87cache[i]; @@ -929,6 +938,30 @@ void x87_do_pop(dynarec_rv64_t* dyn, int ninst, int s1) } } } +int static internal_x87_dofree(dynarec_rv64_t* dyn) +{ + if(dyn->e.tags&0b11) { + MESSAGE(LOG_DUMP, "\t--------x87 FREED ST0, poping 1 more\n"); + return 1; + } + return 0; +} +void x87_do_pop(dynarec_rv64_t* dyn, int ninst, int s1) +{ + if(dyn->e.mmxcount) + mmx_purgecache(dyn, ninst, 0, s1); + do { + dyn->e.x87stack-=1; + dyn->e.stack_next-=1; + dyn->e.stack_pop+=1; + dyn->e.tags>>=2; + ++dyn->e.poped; + if(dyn->e.pushed) + --dyn->e.pushed; + // move all regs in cache, poping ST0 + internal_x87_dopop(dyn); + } while(internal_x87_dofree(dyn)); +} void x87_purgecache(dynarec_rv64_t* dyn, int ninst, int next, int s1, int s2, int s3) { @@ -952,27 +985,32 @@ void x87_purgecache(dynarec_rv64_t* dyn, int ninst, int next, int s1, int s2, in LW(s2, xEmu, offsetof(x64emu_t, top)); // update tags (and top at the same time) if(a>0) { - // new tag to fulls - for (int i=0; i<a; ++i) { - SUBI(s2, s2, 1); - ANDI(s2, s2, 7); // (emu->top + st)&7 - if(rv64_zba) SH2ADD(s1, s2, xEmu); else {SLLI(s1, s2, 2); ADD(s1, xEmu, s1);} - SW(xZR, s1, offsetof(x64emu_t, p_regs)); - } + SUBI(s2, s2, a); } else { - // empty tags - ADDI(s3, xZR, 0b11); - for (int i=0; i<-a; ++i) { - if(rv64_zba) SH2ADD(s1, s2, xEmu); else {SLLI(s1, s2, 2); ADD(s1, xEmu, s1);} - SW(s3, s1, offsetof(x64emu_t, p_regs)); - ADDI(s2, s2, 1); - ANDI(s2, s2, 7); // (emu->top + st)&7 - } + ADDI(s2, s2, -a); } + ANDI(s2, s2, 7); SW(s2, xEmu, offsetof(x64emu_t, top)); + // update tags (and top at the same time) + LHU(s1, xEmu, offsetof(x64emu_t, fpu_tags)); + if(a>0) { + SLLI(s1, s1, a*2); + } else { + SLLI(s3, xMASK, 16); // 0xffff0000 (plus some unused hipart) + OR(s1, s1, s3); + SRLI(s1, s1, -a*2); + } + SH(s1, xEmu, offsetof(x64emu_t, fpu_tags)); } else { LW(s2, xEmu, offsetof(x64emu_t, top)); } + // check if free is used + if(dyn->e.tags) { + LH(s1, xEmu, offsetof(x64emu_t, fpu_tags)); + MOV32w(s3, dyn->e.tags); + OR(s1, s1, s3); + SH(s1, xEmu, offsetof(x64emu_t, fpu_tags)); + } if(ret!=0) { // --- set values // Get top @@ -1018,9 +1056,13 @@ void x87_purgecache(dynarec_rv64_t* dyn, int ninst, int next, int s1, int s2, in } if(!next) { dyn->e.stack_next = 0; + dyn->e.tags = 0; #if STEP < 2 // refresh the cached valued, in case it's a purge outside a instruction dyn->insts[ninst].e.barrier = 1; + dyn->e.pushed = 0; + dyn->e.poped = 0; + #endif } MESSAGE(LOG_DUMP, "\t---Purge x87 Cache and Synch Stackcount\n"); @@ -1040,6 +1082,16 @@ static void x87_reflectcache(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int SUBI(s2, s2, a); ANDI(s2, s2, 7); SW(s2, xEmu, offsetof(x64emu_t, top)); + // update tags (and top at the same time) + LH(s1, xEmu, offsetof(x64emu_t, fpu_tags)); + if(a>0) { + SLLI(s1, s1, a*2); + } else { + SLLI(s3, xMASK, 16); // 0xffff0000 + OR(s1, s1, s3); + SRLI(s1, s1, -a*2); + } + SH(s1, xEmu, offsetof(x64emu_t, fpu_tags)); } int ret = 0; for (int i=0; (i<8) && (!ret); ++i) @@ -1080,6 +1132,13 @@ static void x87_unreflectcache(dynarec_rv64_t* dyn, int ninst, int s1, int s2, i ADDI(s2, s2, a); ANDI(s2, s2, 7); SW(s2, xEmu, offsetof(x64emu_t, top)); + if(a>0) { + SLLI(s3, xMASK, 16); // 0xffff0000 + OR(s1, s1, s3); + SRLI(s1, s1, a*2); + } else { + SLLI(s1, s1, -a*2); + } } } @@ -1275,6 +1334,66 @@ void x87_reget_st(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int st) MESSAGE(LOG_DUMP, "\t-------x87 Cache for ST%d\n", st); } +void x87_free(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int st) +{ + int ret = -1; + for (int i=0; (i<8) && (ret==-1); ++i) + if(dyn->e.x87cache[i] == st) + ret = i; + MESSAGE(LOG_DUMP, "\tFFREE%s x87 Cache for ST%d\n", (ret!=-1)?" (and Forget)":"", st); + if(ret!=-1) { + const int reg = dyn->e.x87reg[ret]; + #if STEP == 1 + if(dyn->e.extcache[reg].t==EXT_CACHE_ST_F || dyn->e.extcache[reg].t==EXT_CACHE_ST_I64) + extcache_promote_double(dyn, ninst, st); + #endif + // Get top + LW(s2, xEmu, offsetof(x64emu_t, top)); + // Update + int ast = st - dyn->e.x87stack; + if(ast) { + if(ast>0) { + ADDI(s2, s2, ast); + } else { + SUBI(s2, s2, -ast); + } + ANDI(s2, s2, 7); // (emu->top + i)&7 + } + if(rv64_zba) SH3ADD(s1, s2, xEmu); else {SLLI(s2, s2, 3); ADD(s1, xEmu, s2);} + if (dyn->e.extcache[EXTIDX(reg)].t == EXT_CACHE_ST_F) { + FCVTDS(SCRATCH0, reg); + FSD(SCRATCH0, s1, offsetof(x64emu_t, x87)); + } else if (dyn->e.extcache[EXTIDX(reg)].t == EXT_CACHE_ST_I64) { + FMVXD(s2, reg); + FCVTDL(SCRATCH0, s2, RD_RTZ); + FSD(SCRATCH0, s1, offsetof(x64emu_t, x87)); + } else { + FSD(reg, s1, offsetof(x64emu_t, x87)); + } + // and forget that cache + fpu_free_reg(dyn, reg); + dyn->e.extcache[reg].v = 0; + dyn->e.x87cache[ret] = -1; + dyn->e.x87reg[ret] = -1; + } else { + // Get top + LW(s2, xEmu, offsetof(x64emu_t, top)); + // Update + int ast = st - dyn->e.x87stack; + if(ast) { + if(ast>0) { + ADDI(s2, s2, ast); + } else { + SUBI(s2, s2, -ast); + } + ANDI(s2, s2, 7); // (emu->top + i)&7 + } + } + // add mark in the freed array + dyn->e.tags |= 0b11<<(st*2); + MESSAGE(LOG_DUMP, "\t--------x87 FFREE for ST%d\n", st); +} + void x87_swapreg(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int a, int b) { int i1, i2, i3; @@ -1905,34 +2024,16 @@ static void fpuCacheTransform(dynarec_rv64_t* dyn, int ninst, int s1, int s2, in SW(s3, xEmu, offsetof(x64emu_t, fpu_stack)); // Sub x87stack to top, with and 7 LWU(s3, xEmu, offsetof(x64emu_t, top)); - // update tags (and top at the same time) + // update tags + LH(s2, xEmu, offsetof(x64emu_t, fpu_tags)); if(a>0) { - // new tag to fulls - ADDI(s2, xZR, 0); - ADDI(s1, xEmu, offsetof(x64emu_t, p_regs)); - SLLI(s3, s3, 2); - for (int i=0; i<a; ++i) { - SUBI(s3, s3, 1<<2); - ANDI(s3, s3, 7<<2); - ADD(s3, s1, s3); - SW(s2, s3, 0); // that slot is full - SUB(s3, s3, s1); - } - SRLI(s3, s3, 2); + SLLI(s2, s2, a*2); } else { - // empty tags - ADDI(s2, xZR, 0b11); - ADDI(s1, xEmu, offsetof(x64emu_t, p_regs)); - SLLI(s3, s3, 2); - for (int i=0; i<-a; ++i) { - ADD(s3, s1, s3); - SW(s2, s3, 0); // empty slot before leaving it - SUB(s3, s3, s1); - ADDI(s3, s3, 1<<2); - ANDI(s3, s3, 7<<2); // (emu->top + st)&7 - } - SRLI(s3, s3, 2); + SLLI(s3, xMASK, 16); // 0xffff0000 + OR(s2, s2, s3); + SRLI(s2, s2, -a*2); } + SH(s2, xEmu, offsetof(x64emu_t, fpu_tags)); SW(s3, xEmu, offsetof(x64emu_t, top)); s3_top = 0; stack_cnt = cache_i2.stack; @@ -2142,8 +2243,14 @@ void fpu_reset_cache(dynarec_rv64_t* dyn, int ninst, int reset_n) #if STEP > 1 // for STEP 2 & 3, just need to refrest with current, and undo the changes (push & swap) dyn->e = dyn->insts[ninst].e; + #else + dyn->e = dyn->insts[reset_n].e; + #endif extcacheUnwind(&dyn->e); - #ifdef HAVE_TRACE + #if STEP == 0 + if(box64_dynarec_dump) dynarec_log(LOG_NONE, "New x87stack=%d\n", dyn->e.x87stack); + #endif + #if defined(HAVE_TRACE) && (STEP>2) if(box64_dynarec_dump) if(memcmp(&dyn->e, &dyn->insts[reset_n].e, sizeof(ext_cache_t))) { MESSAGE(LOG_DEBUG, "Warning, difference in extcache: reset="); @@ -2173,9 +2280,6 @@ void fpu_reset_cache(dynarec_rv64_t* dyn, int ninst, int reset_n) MESSAGE(LOG_DEBUG, "\n"); } #endif //HAVE_TRACE - #else - dyn->e = dyn->insts[reset_n].e; - #endif } // propagate ST stack state, especial stack pop that are deferred diff --git a/src/dynarec/rv64/dynarec_rv64_helper.h b/src/dynarec/rv64/dynarec_rv64_helper.h index 3292ea2f..1ed23785 100644 --- a/src/dynarec/rv64/dynarec_rv64_helper.h +++ b/src/dynarec/rv64/dynarec_rv64_helper.h @@ -896,27 +896,27 @@ #define X87_POP_OR_FAIL(dyn, ninst, scratch) x87_do_pop(dyn, ninst, scratch) #else #define X87_PUSH_OR_FAIL(var, dyn, ninst, scratch, t) \ - if (dyn->e.stack == +8) { \ - if(box64_dynarec_dump) dynarec_log(LOG_INFO, " Warning, suspicious x87 Push, stack=%d on inst %d\n", dyn->e.x87stack, ninst); \ - dyn->abort = 1; \ - return addr; \ - } \ + if ((dyn->e.x87stack==8) || (dyn->e.pushed==8)) { \ + if(box64_dynarec_dump) dynarec_log(LOG_NONE, " Warning, suspicious x87 Push, stack=%d/%d on inst %d\n", dyn->e.x87stack, dyn->e.pushed, ninst); \ + dyn->abort = 1; \ + return addr; \ + } \ var = x87_do_push(dyn, ninst, scratch, t); #define X87_PUSH_EMPTY_OR_FAIL(dyn, ninst, scratch) \ - if (dyn->e.stack == +8) { \ - if(box64_dynarec_dump) dynarec_log(LOG_INFO, " Warning, suspicious x87 Push, stack=%d on inst %d\n", dyn->e.x87stack, ninst); \ - dyn->abort = 1; \ - return addr; \ - } \ + if ((dyn->e.x87stack==8) || (dyn->e.pushed==8)) { \ + if(box64_dynarec_dump) dynarec_log(LOG_NONE, " Warning, suspicious x87 Push, stack=%d/%d on inst %d\n", dyn->e.x87stack, dyn->e.pushed, ninst); \ + dyn->abort = 1; \ + return addr; \ + } \ x87_do_push_empty(dyn, ninst, scratch); #define X87_POP_OR_FAIL(dyn, ninst, scratch) \ - if (dyn->e.stack == -8) { \ - if(box64_dynarec_dump) dynarec_log(LOG_INFO, " Warning, suspicious x87 Pop, stack=%d on inst %d\n", dyn->e.x87stack, ninst); \ - dyn->abort = 1; \ - return addr; \ - } \ + if ((dyn->e.x87stack==-8) || (dyn->e.poped==8)) { \ + if(box64_dynarec_dump) dynarec_log(LOG_NONE, " Warning, suspicious x87 Pop, stack=%d/%d on inst %d\n", dyn->e.x87stack, dyn->e.poped, ninst); \ + dyn->abort = 1; \ + return addr; \ + } \ x87_do_pop(dyn, ninst, scratch); #endif @@ -1196,6 +1196,7 @@ void* rv64_next(x64emu_t* emu, uintptr_t addr); #define x87_get_extcache STEPNAME(x87_get_extcache) #define x87_get_st STEPNAME(x87_get_st) #define x87_get_st_empty STEPNAME(x87_get_st) +#define x87_free STEPNAME(x87_free) #define x87_refresh STEPNAME(x87_refresh) #define x87_forget STEPNAME(x87_forget) #define x87_reget_st STEPNAME(x87_reget_st) @@ -1358,6 +1359,8 @@ int x87_get_extcache(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int a); int x87_get_st(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int a, int t); // get vfpu register for a x87 reg, create the entry if needed. Do not fetch the Stx if not already in cache int x87_get_st_empty(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int a, int t); +// Free st, using the FFREE opcode (so it's freed but stack is not moved) +void x87_free(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int s3, int st); // refresh a value from the cache ->emu (nothing done if value is not cached) void x87_refresh(dynarec_rv64_t* dyn, int ninst, int s1, int s2, int st); // refresh a value from the cache ->emu and then forget the cache (nothing done if value is not cached) diff --git a/src/dynarec/rv64/dynarec_rv64_pass0.h b/src/dynarec/rv64/dynarec_rv64_pass0.h index 0def2bed..d5040d9b 100644 --- a/src/dynarec/rv64/dynarec_rv64_pass0.h +++ b/src/dynarec/rv64/dynarec_rv64_pass0.h @@ -16,7 +16,7 @@ dyn->f.pending=(B)&SF_SET_PENDING; \ dyn->f.dfnone=((B)&SF_SET)?1:0; #define EMIT(A) dyn->native_size+=4 -#define JUMP(A, C) add_jump(dyn, ninst); add_next(dyn, (uintptr_t)A); dyn->insts[ninst].x64.jmp = A; dyn->insts[ninst].x64.jmp_cond = C +#define JUMP(A, C) add_jump(dyn, ninst); add_next(dyn, (uintptr_t)A); SMEND(); dyn->insts[ninst].x64.jmp = A; dyn->insts[ninst].x64.jmp_cond = C; dyn->insts[ninst].x64.jmp_insts = 0 #define BARRIER(A) if(A!=BARRIER_MAYBE) {fpu_purgecache(dyn, ninst, 0, x1, x2, x3); dyn->insts[ninst].x64.barrier = A;} else dyn->insts[ninst].barrier_maybe = 1 #define BARRIER_NEXT(A) dyn->insts[ninst].x64.barrier_next = A #define SET_HASCALLRET() dyn->insts[ninst].x64.has_callret = 1 diff --git a/src/dynarec/rv64/dynarec_rv64_private.h b/src/dynarec/rv64/dynarec_rv64_private.h index 6a870bf7..3acbdfb6 100644 --- a/src/dynarec/rv64/dynarec_rv64_private.h +++ b/src/dynarec/rv64/dynarec_rv64_private.h @@ -52,11 +52,14 @@ typedef struct extcache_s { uint8_t combined2; uint8_t swapped; // the combined reg were swapped uint8_t barrier; // is there a barrier at instruction epilog? + uint8_t pushed; // positive pushed value (to check for overflow) + uint8_t poped; // positive poped value (to check for underflow) uint32_t news; // bitmask, wich neoncache are new for this opcode sse_old_t olds[16]; // SSE regs has changed or has been removed // fpu cache int8_t x87cache[8]; // cache status for the 8 x87 register behind the fpu stack int8_t x87reg[8]; // reg used for x87cache entry + int16_t tags; // similar to fpu_tags int8_t mmxcache[8]; // cache status for the 8 MMX registers sse_cache_t ssecache[16]; // cache status for the 16 SSE(2) registers int8_t fpuused[24]; // all 10..31 & 0..1 double reg from fpu, used by x87, sse and mmx diff --git a/src/emu/x64emu.c b/src/emu/x64emu.c index 69aed8c1..7d54d651 100644 --- a/src/emu/x64emu.c +++ b/src/emu/x64emu.c @@ -234,7 +234,7 @@ void CloneEmu(x64emu_t *newemu, const x64emu_t* emu) memcpy(newemu->mmx, emu->mmx, sizeof(emu->mmx)); memcpy(newemu->fpu_ld, emu->fpu_ld, sizeof(emu->fpu_ld)); memcpy(newemu->fpu_ll, emu->fpu_ll, sizeof(emu->fpu_ll)); - memcpy(newemu->p_regs, emu->p_regs, sizeof(emu->p_regs)); + newemu->fpu_tags = emu->fpu_tags; newemu->cw = emu->cw; newemu->sw = emu->sw; newemu->top = emu->top; @@ -270,7 +270,7 @@ void CopyEmu(x64emu_t *newemu, const x64emu_t* emu) memcpy(newemu->xmm, emu->xmm, sizeof(emu->xmm)); memcpy(newemu->fpu_ld, emu->fpu_ld, sizeof(emu->fpu_ld)); memcpy(newemu->fpu_ll, emu->fpu_ll, sizeof(emu->fpu_ll)); - memcpy(newemu->p_regs, emu->p_regs, sizeof(emu->p_regs)); + newemu->fpu_tags = emu->fpu_tags; newemu->cw = emu->cw; newemu->sw = emu->sw; newemu->top = emu->top; diff --git a/src/emu/x64emu_private.h b/src/emu/x64emu_private.h index 90c9b7b0..7ae9e1bd 100644 --- a/src/emu/x64emu_private.h +++ b/src/emu/x64emu_private.h @@ -76,7 +76,7 @@ typedef struct x64emu_s { #endif fpu_ld_t fpu_ld[8]; // for long double emulation / 80bits fld fst fpu_ll_t fpu_ll[8]; // for 64bits fild / fist sequence - fpu_p_reg_t p_regs[8]; + uint64_t fpu_tags; // tags for the x87 regs, stacked, only on a 16bits anyway // old ip uintptr_t old_ip; // deferred flags diff --git a/src/emu/x64rund9.c b/src/emu/x64rund9.c index 2cc8cdce..015dfa07 100644 --- a/src/emu/x64rund9.c +++ b/src/emu/x64rund9.c @@ -192,7 +192,10 @@ uintptr_t RunD9(x64emu_t *emu, rex_t rex, uintptr_t addr) emu->top=(emu->top-1)&7; // this will probably break a few things break; case 0xF7: /* FINCSTP */ - emu->top=(emu->top+1)&7; // this will probably break a few things + if(emu->fpu_tags&0b11) + fpu_do_pop(emu); + else + emu->top=(emu->top+1)&7; // this will probably break a few things break; case 0xF9: /* FYL2XP1 */ ST(1).d *= log2(ST0.d + 1.0); diff --git a/src/emu/x64rundd.c b/src/emu/x64rundd.c index 35b439fe..a62c9254 100644 --- a/src/emu/x64rundd.c +++ b/src/emu/x64rundd.c @@ -45,7 +45,7 @@ uintptr_t RunDD(x64emu_t *emu, rex_t rex, uintptr_t addr) case 0xC5: case 0xC6: case 0xC7: - fpu_do_free(emu, nextop-0xC0); + fpu_do_free(emu, nextop&7); break; case 0xD0: /* FST ST0, STx */ diff --git a/src/emu/x64test.c b/src/emu/x64test.c index f1294466..447cd384 100644 --- a/src/emu/x64test.c +++ b/src/emu/x64test.c @@ -82,10 +82,10 @@ void x64test_check(x64emu_t* ref, uintptr_t ip) } //memcpy(ref->fpu_ld, emu->fpu_ld, sizeof(emu->fpu_ld)); //memcpy(ref->fpu_ll, emu->fpu_ll, sizeof(emu->fpu_ll)); - /*if(ref->p_regs != emu->p_regs) { + if(ref->fpu_tags != emu->fpu_tags) { BANNER; - printf_log(LOG_NONE, "X87 PREG: %x | %x\n", ref->p_regs, emu->p_regs); - }*/ + printf_log(LOG_NONE, "X87 TAGS: %x | %x\n", ref->fpu_tags, emu->fpu_tags); + } if(ref->cw.x16 != emu->cw.x16) { BANNER; printf_log(LOG_NONE, "X87 CW: %x | %x\n", ref->cw.x16, emu->cw.x16); diff --git a/src/emu/x87emu_private.c b/src/emu/x87emu_private.c index 67573c37..0ad3db8c 100644 --- a/src/emu/x87emu_private.c +++ b/src/emu/x87emu_private.c @@ -11,10 +11,10 @@ void fpu_do_free(x64emu_t* emu, int i) { - emu->p_regs[(emu->top+i)&7].tag = 0b11; // empty + emu->fpu_tags |= 0b11 << (i); // empty // check if all empty for(int j=0; j<8; ++j) - if(emu->p_regs[j].tag != 0b11) + if(emu->fpu_tags != TAGS_EMPTY) return; emu->fpu_stack = 0; } @@ -27,8 +27,7 @@ void reset_fpu(x64emu_t* emu) emu->sw.x16 = 0x0000; emu->top = 0; emu->fpu_stack = 0; - for(int i=0; i<8; ++i) - emu->p_regs[i].tag = 0b11; // STx is empty + emu->fpu_tags = TAGS_EMPTY; } void fpu_fbst(x64emu_t* emu, uint8_t* d) { @@ -258,9 +257,7 @@ void fpu_loadenv(x64emu_t* emu, char* p, int b16) p+=(b16)?2:4; // tagword: 2bits*8 // tags... (only full = 0b11 / free = 0b00) - uint16_t tags = *(uint16_t*)p; - for(int i=0; i<8; ++i) - emu->p_regs[i].tag = (tags>>(i*2))&0b11; + emu->fpu_tags = *(uint16_t*)p; // intruction pointer: 16bits // data (operand) pointer: 16bits // last opcode: 11bits save: 16bits restaured (1st and 2nd opcode only) @@ -277,10 +274,7 @@ void fpu_savenv(x64emu_t* emu, char* p, int b16) if(!b16) {*(uint16_t*)p = 0; p+=2;} // tagword: 2bits*8 // tags... - uint16_t tags = 0; - for (int i=0; i<8; ++i) - tags |= (emu->p_regs[i].tag)<<(i*2); - *(uint16_t*)p = tags; + *(uint16_t*)p = emu->fpu_tags; // other stuff are not pushed.... } @@ -325,14 +319,14 @@ void fpu_fxsave32(x64emu_t* emu, void* ed) int top = emu->top&7; int stack = 8-top; if(top==0) // check if stack is full or empty, based on tag[0] - stack = (emu->p_regs[0].tag)?8:0; + stack = (emu->fpu_tags&0b11)?8:0; emu->sw.f.F87_TOP = top; p->ControlWord = emu->cw.x16; p->StatusWord = emu->sw.x16; p->MxCsr = emu->mxcsr.x32; uint8_t tags = 0; for (int i=0; i<8; ++i) - tags |= ((emu->p_regs[i].tag)<<(i*2)==0b11)?0:1; + tags |= ((emu->fpu_tags>>(i*2))&0b11)?0:1; p->TagWord = tags; p->ErrorOpcode = 0; p->ErrorOffset = 0; @@ -353,15 +347,15 @@ void fpu_fxsave64(x64emu_t* emu, void* ed) int top = emu->top&7; int stack = 8-top; if(top==0) // check if stack is full or empty, based on tag[0] - stack = (emu->p_regs[0].tag)?8:0; + stack = (emu->fpu_tags&0b11)?8:0; emu->sw.f.F87_TOP = top; p->ControlWord = emu->cw.x16; p->StatusWord = emu->sw.x16; p->MxCsr = emu->mxcsr.x32; uint8_t tags = 0; for (int i=0; i<8; ++i) - tags |= ((emu->p_regs[i].tag)<<(i*2)==0b11)?0:1; - p->TagWord = tags; + tags |= ((emu->fpu_tags>>(i*2))&0b11)?0:1; + p->TagWord = emu->fpu_tags; p->ErrorOpcode = 0; p->ErrorOffset = 0; p->DataOffset = 0; @@ -382,12 +376,12 @@ void fpu_fxrstor32(x64emu_t* emu, void* ed) applyFlushTo0(emu); emu->top = emu->sw.f.F87_TOP; uint8_t tags = p->TagWord; - for(int i=0; i<8; ++i) - emu->p_regs[i].tag = (tags>>(i*2))?0:0b11; + for (int i=0; i<8; ++i) + tags |= ((emu->fpu_tags>>(i*2))&0b11)?0:1; int top = emu->top&7; int stack = 8-top; if(top==0) // check if stack is full or empty, based on tag[0] - stack = (emu->p_regs[0].tag)?8:0; + stack = (emu->fpu_tags&0b11)?8:0; // copy back MMX regs... for(int i=0; i<8; ++i) memcpy((i<stack)?&ST(i):&emu->mmx[i], &p->FloatRegisters[i].q[0], sizeof(mmx87_regs_t)); @@ -406,11 +400,11 @@ void fpu_fxrstor64(x64emu_t* emu, void* ed) emu->top = emu->sw.f.F87_TOP; uint8_t tags = p->TagWord; for(int i=0; i<8; ++i) - emu->p_regs[i].tag = (tags>>(i*2))?0:0b11; + emu->fpu_tags |= ((tags>>i)?0:0b11)<<(i*2); int top = emu->top&7; int stack = 8-top; if(top==0) // check if stack is full or empty, based on tag[0] - stack = (emu->p_regs[0].tag)?8:0; + stack = (emu->fpu_tags&0b11)?8:0; // copy back MMX regs... for(int i=0; i<8; ++i) memcpy((i<stack)?&ST(i):&emu->mmx[i], &p->FloatRegisters[i].q[0], sizeof(mmx87_regs_t)); diff --git a/src/emu/x87emu_private.h b/src/emu/x87emu_private.h index ae977133..a3c589df 100644 --- a/src/emu/x87emu_private.h +++ b/src/emu/x87emu_private.h @@ -15,6 +15,8 @@ typedef struct x64emu_s x64emu_t; #define LN2 0.69314718055994531 #define LG2 0.3010299956639812 +#define TAGS_EMPTY 0b1111111111111111 + #define ST0 emu->x87[emu->top] #define ST1 emu->x87[(emu->top+1)&7] #define ST(a) emu->x87[(emu->top+(a))&7] @@ -32,7 +34,8 @@ static inline void fpu_do_push(x64emu_t* emu) }*/ if(emu->fpu_stack<8) ++emu->fpu_stack; - emu->p_regs[newtop].tag = 0; // full + emu->fpu_tags<<=2; // st0 full + emu->fpu_tags &= TAGS_EMPTY; emu->top = newtop; } @@ -47,8 +50,16 @@ static inline void fpu_do_pop(x64emu_t* emu) if(emu->fpu_stack>0) --emu->fpu_stack; - emu->p_regs[curtop].tag = 0b11; // empty + emu->fpu_tags>>=2; + emu->fpu_tags |= 0b1100000000000000; // top empty emu->top = (emu->top+1)&7; + // check tags + /*while((emu->fpu_tags&0b11) && emu->fpu_stack) { + --emu->fpu_stack; + emu->top = (emu->top+1)&7; + emu->fpu_tags>>=2; + emu->fpu_tags |= 0b1100000000000000; // top empty + }*/ } void fpu_do_free(x64emu_t* emu, int i); @@ -128,7 +139,7 @@ static inline double fpu_round(x64emu_t* emu, double d) { static inline void fpu_fxam(x64emu_t* emu) { emu->sw.f.F87_C1 = (ST0.ud[1]&0x80000000)?1:0; - if((emu->fpu_stack<=0) || (emu->p_regs[(emu->top)&7].tag == 0b11)) { + if((emu->fpu_stack<=0) || (emu->fpu_tags&0b11)) { //Empty emu->sw.f.F87_C3 = 1; emu->sw.f.F87_C2 = 0; diff --git a/src/include/regs.h b/src/include/regs.h index dc72a648..721f155a 100644 --- a/src/include/regs.h +++ b/src/include/regs.h @@ -32,10 +32,6 @@ typedef union { uint8_t byte[8]; } reg64_t; -typedef struct { - uint32_t tag; -} fpu_p_reg_t; - typedef enum { ROUND_Nearest = 0, ROUND_Down = 1, |