| author | ptitSeb <sebastien.chev@gmail.com> | 2024-06-06 16:48:05 +0200 |
|---|---|---|
| committer | ptitSeb <sebastien.chev@gmail.com> | 2024-06-06 16:48:05 +0200 |
| commit | 0dc5761ca6743d5a5f0f6d3086828f6d6ed5c720 | |
| tree | 2ab46d1455b5cd96049b2f74a1f058842d2115d6 /src | |
| parent | df47fb5be83f3226b11703f23722c7c0b6a0b271 | |
[INTERPRETER] Added support for F16C extension (linked to AVX flag) ([ARM64_DYNAREC] too)
Diffstat (limited to 'src')
| -rw-r--r-- | src/core.c | 5 |
| -rw-r--r-- | src/dynarec/arm64/arm64_emitter.h | 12 |
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c | 17 |
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c | 48 |
| -rw-r--r-- | src/emu/x64runavx660f38.c | 13 |
| -rw-r--r-- | src/emu/x64runavx660f3a.c | 24 |
| -rw-r--r-- | src/emu/x87emu_private.c | 99 |
| -rw-r--r-- | src/emu/x87emu_private.h | 3 |
| -rw-r--r-- | src/include/debug.h | 1 |
| -rw-r--r-- | src/tools/my_cpuid.c | 2 |
10 files changed, 216 insertions, 8 deletions
```diff
diff --git a/src/core.c b/src/core.c
index 6391b302..01e40fd9 100644
--- a/src/core.c
+++ b/src/core.c
@@ -92,7 +92,6 @@ int arm64_aes = 0;
 int arm64_pmull = 0;
 int arm64_crc32 = 0;
 int arm64_atomics = 0;
-int arm64_asimdhp = 0;
 int arm64_sha1 = 0;
 int arm64_sha2 = 0;
 int arm64_uscat = 0;
@@ -405,8 +404,6 @@ HWCAP2_ECV
     // ATOMIC use are disable for now. They crashes Batman Arkham Knight, bossibly other (also seems to make steamwebhelper unstable)
     if(hwcap&HWCAP_ATOMICS)
         arm64_atomics = 1;
-    if(hwcap&HWCAP_ASIMDHP)
-        arm64_asimdhp = 1;
 #ifdef HWCAP_SHA1
     if(hwcap&HWCAP_SHA1)
         arm64_sha1 = 1;
@@ -445,8 +442,6 @@ HWCAP2_ECV
         printf_log(LOG_INFO, " PMULL");
     if(arm64_atomics)
         printf_log(LOG_INFO, " ATOMICS");
-    if(arm64_asimdhp)
-        printf_log(LOG_INFO, " ASIMDHP");
     if(arm64_sha1)
         printf_log(LOG_INFO, " SHA1");
     if(arm64_sha2)
diff --git a/src/dynarec/arm64/arm64_emitter.h b/src/dynarec/arm64/arm64_emitter.h
index c7bb614d..844d29dd 100644
--- a/src/dynarec/arm64/arm64_emitter.h
+++ b/src/dynarec/arm64/arm64_emitter.h
@@ -1375,18 +1375,30 @@ int convert_bitmask(uint64_t bitmask);
 #define FCVTN(Vd, Vn)   EMIT(FCVTN_vector(0, 1, Vn, Vd))
 // Convert Vn from 2*Double to higher Vd as 2*float, use FPCR rounding
 #define FCVTN2(Vd, Vn)  EMIT(FCVTN_vector(1, 1, Vn, Vd))
+// Convert Vn from 2*Float to lower Vd as 2*float16 and clears the upper half, use FPCR rounding
+#define FCVTN16(Vd, Vn)     EMIT(FCVTN_vector(0, 0, Vn, Vd))
+// Convert Vn from 2*Float to higher Vd as 2*float16, use FPCR rounding
+#define FCVTN162(Vd, Vn)    EMIT(FCVTN_vector(1, 0, Vn, Vd))
 
 #define FCVTXN_vector(Q, sz, Rn, Rd)    ((Q)<<30 | 1<<29 | 0b01110<<24 | (sz)<<22 | 0b10000<<17 | 0b10110<<12 | 0b10<<10 | (Rn)<<5 | (Rd))
 // Convert Vn from 2*Double to lower Vd as 2*float and clears the upper half
 #define FCVTXN(Vd, Vn)  EMIT(FCVTXN_vector(0, 1, Vn, Vd))
 // Convert Vn from 2*Double to higher Vd as 2*float
 #define FCVTXN2(Vd, Vn) EMIT(FCVTXN_vector(1, 1, Vn, Vd))
+// Convert Vn from 2*Float to lower Vd as 2*float16 and clears the upper half
+#define FCVTXN16(Vd, Vn)    EMIT(FCVTXN_vector(0, 0, Vn, Vd))
+// Convert Vn from 2*Float to higher Vd as 2*float16
+#define FCVTXN162(Vd, Vn)   EMIT(FCVTXN_vector(1, 0, Vn, Vd))
 
 #define FCVTL_vector(Q, sz, Rn, Rd) ((Q)<<30 | 0<<29 | 0b01110<<24 | (sz)<<22 | 0b10000<<17 | 0b10111<<12 | 0b10<<10 | (Rn)<<5 | (Rd))
 // Convert lower Vn from 2*float to Vd as 2*double
 #define FCVTL(Vd, Vn)   EMIT(FCVTL_vector(0, 1, Vn, Vd))
 // Convert higher Vn from 2*float to Vd as 2*double
 #define FCVTL2(Vd, Vn)  EMIT(FCVTL_vector(1, 1, Vn, Vd))
+// Convert lower Vn from 2*float16 to Vd as 2*float
+#define FCVTL16(Vd, Vn)     EMIT(FCVTL_vector(0, 0, Vn, Vd))
+// Convert higher Vn from 2*float16 to Vd as 2*float
+#define FCVTL162(Vd, Vn)    EMIT(FCVTL_vector(1, 0, Vn, Vd))
 
 #define SCVTF_scalar(sf, type, rmode, opcode, Rn, Rd)   ((sf)<<31 | 0b11110<<24 | (type)<<22 | 1<<21 | (rmode)<<19 | (opcode)<<16 | (Rn)<<5 | (Rd))
 #define SCVTFSw(Sd, Wn) EMIT(SCVTF_scalar(0, 0b00, 0b00, 0b010, Wn, Sd))
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c b/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c
index a2a45435..e4ebd5b2 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c
@@ -158,6 +158,23 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             if(!vex.l) YMM0(gd);
             break;
+        case 0x13:
+            INST_NAME("VCVTPH2PS Gx, Ex");
+            nextop = F8;
+            GETEX_Y(v1, 0, 0);
+            GETGX_empty(v0);
+            if(vex.l && v0==v1) {
+                q1 = fpu_get_scratch(dyn, ninst);
+                VMOVQ(q1, v1);
+                v1 = q1;
+            }
+            FCVTL16(v0, v1);
+            if(vex.l) {
+                GETGY_empty(v0, -1, -1, -1);
+                FCVTL162(v0, v1);
+            } else YMM0(gd);
+            break;
+
         case 0x17:
             INST_NAME("VPTEST GX, EX");
             SETFLAGS(X_ALL, SF_SET);
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c b/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
index cdbe93f6..e667f562 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_66_0f3a.c
@@ -402,6 +402,54 @@ uintptr_t dynarec64_AVX_66_0F3A(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             F8; // read u8, but it's been already handled
             break;
+        case 0x1D:
+            INST_NAME("VCVTPS2PH Ex, Gx");
+            nextop = F8;
+            GETGX(v0, 0);
+            if(MODREG) {
+                v1 = sse_get_reg_empty(dyn, ninst, x3, (nextop&7)+(rex.b<<3));
+            } else {
+                WILLWRITE2();
+                v1 = fpu_get_scratch(dyn, ninst);
+                addr = geted(dyn, addr, ninst, nextop, &ed, x3, &fixedaddress, &unscaled, 0xfff<<(3+vex.l), vex.l?15:7, rex, NULL, 0, 1);
+            }
+            if(vex.l && v0==v1) {
+                q0 = fpu_get_scratch(dyn, ninst);
+                VMOVQ(q0, v0);
+                v0 = q0;
+            }
+            u8 = F8;
+            if(u8&4) {
+                s0 = sse_setround(dyn, ninst, x1, x2, x6);
+            } else {
+                u8&=3;
+                if(u8==1) u8=2;
+                else if(u8==2) u8=1;
+                MRS_fpcr(x1);           // get fpscr
+                MOV32w(x2, u8);
+                MOVx_REG(x6, x1);
+                BFIx(x1, x2, 22, 2);    // inject new round
+                MSR_fpcr(x1);           // put new fpscr
+                s0 = x6;
+            }
+            FCVTN16(v1, v0);
+            if(vex.l) {
+                GETGY(v0, 0, MODREG?((nextop&7)+(rex.b<<3)):-1, -1, -1);
+                FCVTN162(v1, v0);
+            }
+            x87_restoreround(dyn, ninst, s0);
+            if(MODREG) {
+                YMM0((nextop&7)+(rex.b<<3));
+            } else {
+                if(vex.l) {
+                    VST128(v1, ed, fixedaddress);
+                } else {
+                    VST64(v1, ed, fixedaddress);
+                }
+                SMWRITE2();
+            }
+            break;
+
         case 0x20:
             INST_NAME("VINSERTD Gx, Vx, Ex, Ib");
             nextop = F8;
diff --git a/src/emu/x64runavx660f38.c b/src/emu/x64runavx660f38.c
index e1302e1e..8bd9c051 100644
--- a/src/emu/x64runavx660f38.c
+++ b/src/emu/x64runavx660f38.c
@@ -521,6 +521,19 @@ uintptr_t RunAVX_660F38(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
             CLEAR_FLAG(F_PF);
             break;
+        case 0x13:  /* VCVTPH2PS Gx, Ex */
+            nextop = F8;
+            GETEX(0);
+            GETGX;
+            GETGY;
+            if(vex.l) {
+                for(int i=3; i>=0; --i)
+                    GY->ud[i] = cvtf16_32(EX->uw[4+i]);
+            } else GY->u128 = 0;
+            for(int i=3; i>=0; --i)
+                GX->ud[i] = cvtf16_32(EX->uw[i]);
+            break;
+
         case 0x16:  /* VPERMPS Gx, Vx, Ex */
             // same code as 0x36
             nextop = F8;
diff --git a/src/emu/x64runavx660f3a.c b/src/emu/x64runavx660f3a.c
index 2110dff3..a8f213d6 100644
--- a/src/emu/x64runavx660f3a.c
+++ b/src/emu/x64runavx660f3a.c
@@ -548,6 +548,30 @@ uintptr_t RunAVX_660F3A(x64emu_t *emu, vex_t vex, uintptr_t addr, int *step)
             }
             break;
+        case 0x1D:  /* VCVTPS2PH Ex, Gx, u8 */
+            nextop = F8;
+            GETEX(1);
+            GETGX;
+            u8 = F8;
+            if(u8&4)
+                u8 = emu->mxcsr.f.MXCSR_RC;
+            else
+                u8 = u8&3;
+            for(int i=0; i<4; ++i)
+                EX->uw[i] = cvtf32_16(GX->ud[i], u8);
+            if(vex.l) {
+                GETGY;
+                for(int i=0; i<4; ++i)
+                    EX->uw[4+i] = cvtf32_16(GY->ud[i], u8);
+            }
+            if(MODREG) {
+                if(!vex.l) EX->q[1] = 0;
+                GETEY;
+                EY->u128 = 0;
+            }
+            break;
+
+
         case 0x20:  // VPINSRB GX, Vx, ED, u8
             nextop = F8;
             GETED(1);   // It's ED, and not EB
diff --git a/src/emu/x87emu_private.c b/src/emu/x87emu_private.c
index da480e6a..9164ea30 100644
--- a/src/emu/x87emu_private.c
+++ b/src/emu/x87emu_private.c
@@ -540,4 +540,101 @@ void fpu_xrstor(x64emu_t* emu, void* ed, int is32bits)
         for(int i=0; i<(is32bits?8:16); ++i)
             memset(&emu->ymm[i], 0, 16);
     }
-}
\ No newline at end of file
+}
+
+typedef union f16_s {
+    uint16_t u16;
+    struct {
+        uint16_t fraction:10;
+        uint16_t exponant:5;
+        uint16_t sign:1;
+    };
+} f16_t;
+
+typedef union f32_s {
+    uint32_t u32;
+    struct {
+        uint32_t fraction:23;
+        uint32_t exponant:8;
+        uint32_t sign:1;
+    };
+} f32_t;
+
+uint32_t cvtf16_32(uint16_t v)
+{
+    f16_t in = (f16_t)v;
+    f32_t ret = {0};
+    ret.sign = in.sign;
+    ret.fraction = in.fraction<<13;
+    if(!in.exponant)
+        ret.exponant = 0;
+    else if(in.exponant==0b11111)
+        ret.exponant = 0b11111111;
+    else {
+        int e = in.exponant - 15;
+        ret.exponant = e + 127;
+    }
+    return ret.u32;
+}
+uint16_t cvtf32_16(uint32_t v, uint8_t rounding)
+{
+    f32_t in = (f32_t)v;
+    f16_t ret = {0};
+    ret.sign = in.sign;
+    rounding&=3;
+    if(!in.exponant) {
+        // zero and denormals
+        ret.exponant = 0;
+        ret.fraction = in.fraction>>13;
+        return ret.u16;
+    } else if(in.exponant==0b11111111) {
+        // nan and infinites
+        ret.exponant = 0b11111;
+        ret.fraction = in.fraction;
+        return ret.u16;
+    } else {
+        // regular numbers
+        int e = in.exponant - 127;
+        uint16_t f = (in.fraction>>13);
+        uint16_t r = in.fraction&0b1111111111111;
+        switch(rounding) {
+            case 0: // nearest even
+                if(r>=0b1000000000000)
+                    ++f;
+                break;
+            case 1: // round down
+                f += r?ret.sign:0;
+                break;
+            case 2: // round up
+                f += r?(1-ret.sign):0;
+                break;
+            case 3: // truncate
+                break;
+        }
+        if(f>0b1111111111) {
+            ++e;
+            f>>=1;
+        }
+        // remove msb, it's implicit
+        if(!f) e = -15;
+        else if(e<-14) {
+            // flush to zero
+            e = -15; f = 0;
+        }
+        else if(e>15) {
+            if((rounding==1 && !in.sign) || (rounding==2 && in.sign) || (rounding==3)) {
+                // Clamp to max
+                f=0b1111111111;
+                e = 15;
+            } else {
+                // overflow to inifity
+                f=0;
+                e = 16;
+            }
+        }
+        ret.fraction = f;
+        ret.exponant = e+15;
+    }
+
+    return ret.u16;
+}
diff --git a/src/emu/x87emu_private.h b/src/emu/x87emu_private.h
index b389028a..b77e416b 100644
--- a/src/emu/x87emu_private.h
+++ b/src/emu/x87emu_private.h
@@ -220,4 +220,7 @@ void fpu_xsave(x64emu_t* emu, void* ed, int is32bits);
 void fpu_xsave_mask(x64emu_t* emu, void* ed, int is32bits, uint64_t mask);
 void fpu_xrstor(x64emu_t* emu, void* ed, int is32bits);
+uint32_t cvtf16_32(uint16_t v);
+uint16_t cvtf32_16(uint32_t v, uint8_t rounding);
+
 #endif //__X87RUN_PRIVATE_H_
diff --git a/src/include/debug.h b/src/include/debug.h
index 52429f47..be48b59e 100644
--- a/src/include/debug.h
+++ b/src/include/debug.h
@@ -40,7 +40,6 @@ extern int arm64_aes;
 extern int arm64_pmull;
 extern int arm64_crc32;
 extern int arm64_atomics;
-extern int arm64_asimdhp;
 extern int arm64_sha1;
 extern int arm64_sha2;
 extern int arm64_uscat;
diff --git a/src/tools/my_cpuid.c b/src/tools/my_cpuid.c
index 119a5654..0b71e4f7 100644
--- a/src/tools/my_cpuid.c
+++ b/src/tools/my_cpuid.c
@@ -264,7 +264,7 @@ void my_cpuid(x64emu_t* emu, uint32_t tmp32u)
                 | 1<<26         // xsave
                 | 1<<27         // osxsave
                 | box64_avx<<28 // AVX
-                //| box64_avx<<29 // F16C
+                | box64_avx<<29 // F16C
                 ;
             break;
         case 0x2:   // TLB and Cache info. Sending 1st gen P4 info...
```
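As a quick sanity check on the interpreter path, here is a hypothetical standalone harness (not part of the commit) for the two helpers this change adds in src/emu/x87emu_private.c. It round-trips a float through the half-precision format the way the new VCVTPS2PH/VCVTPH2PS cases do, using the x86 MXCSR.RC rounding encoding. Build it together with x87emu_private.c (or paste the helpers above main) to run it.

```c
// Hypothetical test harness (not part of the commit) for the float16
// conversion helpers added in src/emu/x87emu_private.c.
#include <stdint.h>
#include <stdio.h>
#include <string.h>

uint32_t cvtf16_32(uint16_t v);                   // float16 bits -> float32 bits
uint16_t cvtf32_16(uint32_t v, uint8_t rounding); // float32 bits -> float16 bits

int main(void)
{
    float f = 1.5f;
    uint32_t bits;
    memcpy(&bits, &f, sizeof(bits));    // type-pun through memcpy, not a cast

    // rounding uses the x86 MXCSR.RC encoding:
    // 0 = nearest-even, 1 = toward -inf, 2 = toward +inf, 3 = truncate
    uint16_t h = cvtf32_16(bits, 0);
    uint32_t back = cvtf16_32(h);

    float g;
    memcpy(&g, &back, sizeof(g));
    // 1.5 is exactly representable in float16, so this prints 1.5 twice
    printf("%f -> 0x%04x -> %f\n", f, (unsigned)h, g);
    return 0;
}
```

The ARM64 dynarec takes a different route for VCVTPS2PH: FCVTN16 rounds according to FPCR, and FPCR.RMode encodes round-up as 1 and round-down as 2, the opposite of MXCSR.RC. That is why the case 0x1D code above swaps u8 values 1 and 2 before BFIx injects them into FPCR bits 23:22.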